/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License, Version 1.0 only
* (the "License"). You may not use this file except in compliance
* with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1990 Mentat Inc. */
#pragma ident "%Z%%M% %I% %E% SMI"
const char tcp_version[] = "%Z%%M% %I% %E% SMI";
#include <sys/types.h>
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/strsubr.h>
#include <sys/stropts.h>
#include <sys/strlog.h>
#include <sys/strsun.h>
#define _SUN_TPI_VERSION 2
#include <sys/tihdr.h>
#include <sys/timod.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/suntpi.h>
#include <sys/xti_inet.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/vtrace.h>
#include <sys/kmem.h>
#include <sys/ethernet.h>
#include <sys/cpuvar.h>
#include <sys/dlpi.h>
#include <sys/multidata.h>
#include <sys/multidata_impl.h>
#include <sys/pattr.h>
#include <sys/policy.h>
#include <sys/zone.h>
#include <sys/errno.h>
#include <sys/signal.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/isa_defs.h>
#include <sys/md5.h>
#include <sys/random.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <net/if.h>
#include <net/route.h>
#include <inet/ipsec_impl.h>
#include <inet/common.h>
#include <inet/ip.h>
#include <inet/ip6.h>
#include <inet/ip_ndp.h>
#include <inet/mi.h>
#include <inet/mib2.h>
#include <inet/nd.h>
#include <inet/optcom.h>
#include <inet/snmpcom.h>
#include <inet/kstatcom.h>
#include <inet/tcp.h>
#include <net/pfkeyv2.h>
#include <inet/ipsec_info.h>
#include <inet/ipdrop.h>
#include <inet/tcp_trace.h>
#include <inet/ipclassifier.h>
#include <inet/ip_ire.h>
#include <inet/ip_if.h>
#include <inet/ipp_common.h>
#include <sys/squeue.h>
/*
* TCP Notes: aka FireEngine Phase I (PSARC 2002/433)
*
* (Read the detailed design doc in PSARC case directory)
*
 * The entire tcp state is contained in the tcp_t and conn_t structures,
 * which are allocated in tandem using ipcl_conn_create() and passing
 * IPCL_CONNTCP as a flag. We use 'conn_ref' and 'conn_lock' to protect
 * the references on the tcp_t. The tcp_t structure is never compressed
 * and packets always land on the correct TCP perimeter from the time
 * the eager is created until the tcp_t dies (as such the old mentat
 * TCP global queue is not used for the detached state and no IPSEC
 * checking is required). The global queue is still allocated to send out
 * resets for connections which have no listeners, and IP directly calls
 * tcp_xmit_listeners_reset(), which does any policy check.
*
* Protection and Synchronisation mechanism:
*
 * The tcp data structure does not use any kind of lock for protecting
 * its state but instead uses 'squeues' for mutual exclusion from various
 * read and write side threads. To access a tcp member, the thread should
 * always be behind the squeue (via squeue_enter, squeue_enter_nodrain, or
 * squeue_fill). Since the squeues allow a direct function call, the caller
 * can pass any tcp function having the edesc_t prototype as an argument
 * (different from the traditional STREAMS model where packets come in only
 * through designated entry points). The list of functions that can be
 * directly called via squeue is given before the usual function prototypes.
*
* Referencing:
*
 * TCP is MT-Hot and we use a reference based scheme to make sure that the
 * tcp structure doesn't disappear while it is needed. When the application
 * creates an outgoing connection or accepts an incoming connection, we
 * start out with 2 references on 'conn_ref'. One for TCP and one for IP.
 * The IP reference is just a symbolic reference since ip_tcpclose()
 * looks at the tcp structure after tcp_close_output() returns, which could
 * have dropped the last TCP reference. So as long as the connection is
 * in the attached state, i.e. !TCP_IS_DETACHED, we have 2 references on the
 * conn_t. The classifier puts its own reference when the connection is
 * inserted in the listen or connected hash. Any time a thread needs to enter
 * the tcp connection perimeter, it retrieves the conn/tcp from q->ptr
 * on the write side or by doing a classify on the read side and then puts a
 * reference on the conn before doing squeue_enter/tryenter/fill (a sketch
 * of this pattern follows this block comment). On the read side, the
 * classifier itself puts the reference under the fanout lock to make sure
 * that the tcp can't disappear before it gets processed. The squeue will
 * drop this reference automatically so the called function doesn't have
 * to do a DEC_REF.
*
* Opening a new connection:
*
 * The outgoing connection open is pretty simple. ip_tcpopen() does the
 * work in creating the conn/tcp structure and initializing it. The
 * squeue assignment is done based on the CPU the application
 * is running on. So for outbound connections, processing is always done
 * on the application's CPU, which might be different from the CPU that
 * the NIC interrupts for incoming packets. An optimal way would be to
 * figure out the NIC <-> CPU binding at listen time, and assign the
 * outgoing connection to the squeue attached to the CPU that will be
 * interrupted for incoming packets (we know the NIC based on the bind
 * IP address). This might seem like a problem if more data is going out,
 * but in most cases the transmit is ACK driven, and the outgoing data
 * normally sits on TCP's xmit queue waiting to be transmitted.
*
* Accepting a connection:
*
 * This is a more interesting case because of various races involved in
 * establishing an eager in its own perimeter. Read the meta comment on
 * top of tcp_conn_request(). But briefly, the squeue is picked by
 * ip_tcp_input()/ip_fanout_tcp_v6() based on the interrupted CPU.
*
* Closing a connection:
*
 * The close is fairly straightforward. tcp_close() calls tcp_close_output()
 * via squeue to do the close and mark the tcp as detached if the connection
 * was in state TCPS_ESTABLISHED or greater. In the latter case, TCP keeps its
 * reference but tcp_close() always drops IP's reference. So if the tcp was
 * not killed, it is sitting in the time_wait list with 2 references - 1 for
 * TCP and 1 because it is in the classifier's connected hash. This is the
 * condition we use to determine that it is OK to clean up the tcp outside
 * of the squeue when the time wait expires (check the ref under the fanout
 * and conn_lock and if it is 2, remove it from the fanout hash and kill it).
 *
 * Although close just drops the necessary references and marks the
 * tcp_detached state, tcp_close needs to know that tcp_detached has been
 * set (under squeue) before letting the STREAM go away (because an
 * inbound packet might attempt to go up the STREAM while the close
 * has happened and tcp_detached is not set). So a special lock and
 * flag are used along with a condition variable (tcp_closelock, tcp_closed,
 * and tcp_closecv) to signal tcp_close that tcp_close_output() has marked
 * tcp_detached.
*
* Special provisions and fast paths:
*
 * We make special provision for (AF_INET, SOCK_STREAM) sockets which
 * can't have 'ipv6_recvpktinfo' set, and for these types of sockets IP
 * will never send an M_CTL to TCP. As such, ip_tcp_input(), which handles
 * all TCP packets from the wire, makes an IPCL_IS_TCP4_CONNECTED_NO_POLICY
 * check to send packets directly to tcp_rput_data via squeue. Everyone
 * else comes through tcp_input() on the read side.
 *
 * We also make special provisions for sockfs by marking tcp_issocket
 * whenever we have only sockfs on top of TCP. This allows us to skip
 * putting the tcp in the acceptor hash, since a sockfs listener can never
 * become an acceptor, and also to avoid allocating a tcp_t for the acceptor
 * STREAM since the eager has already been allocated and the accept now
 * happens on the acceptor STREAM. There is a big blob of comment on top of
 * tcp_conn_request explaining the new accept. When the socket is POPed,
 * sockfs sends us an ioctl to mark the fact and we go back to the old
 * behaviour. Once tcp_issocket is unset, it is never set again for the
 * life of that connection.
*
 * IPsec notes:
 *
 * Since a packet is always processed on the correct TCP perimeter,
 * all IPsec processing is deferred to IP, including checking new
 * connections and setting IPsec policies for new connections. The
 * only exception is tcp_xmit_listeners_reset(), which is called
 * directly from IP and needs to do a policy check to see if a TH_RST
 * can be sent out.
*/
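/*
 * A minimal sketch of the write-side referencing pattern described in the
 * "Referencing" section above. This is illustrative only; the argument
 * list is abbreviated from the real tcp_wput() and the squeue tag constant
 * shown is an assumption:
 *
 *	conn_t	*connp = Q_TO_CONN(q);
 *
 *	CONN_INC_REF(connp);
 *	(*tcp_squeue_wput_proc)(connp->conn_sqp, mp,
 *	    tcp_wput_nondata, connp, SQTAG_TCP_WPUT_OTHER);
 *
 * The squeue drops the reference once tcp_wput_nondata() returns, so no
 * explicit DEC_REF is needed by the caller.
 */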
extern major_t TCP6_MAJ;
/*
* Values for squeue switch:
* 1: squeue_enter_nodrain
* 2: squeue_enter
* 3: squeue_fill
*/
int tcp_squeue_close = 2;
int tcp_squeue_wput = 2;
squeue_func_t tcp_squeue_close_proc;
squeue_func_t tcp_squeue_wput_proc;
extern vmem_t *ip_minor_arena;
/*
* This controls how tiny a write must be before we try to copy it
 * into the mblk on the tail of the transmit queue. Not much
* speedup is observed for values larger than sixteen. Zero will
* disable the optimisation.
*/
int tcp_tx_pull_len = 16;
/*
* TCP Statistics.
*
* How TCP statistics work.
*
* There are two types of statistics invoked by two macros.
*
 * TCP_STAT(name) does a non-atomic increment of a named stat counter. It is
 * supposed to be used in non-MT-hot paths of the code.
*
 * TCP_DBGSTAT(name) does an atomic increment of a named stat counter. It is
 * supposed to be used for DEBUG purposes and may be used on a hot path.
*
* Both TCP_STAT and TCP_DBGSTAT counters are available using kstat
* (use "kstat tcp" to get them).
*
 * There is also an additional debugging facility that marks tcp_clean_death()
 * instances and saves them in the tcp_t structure. It is triggered by
 * the TCP_TAG_CLEAN_DEATH define. Also, there is a global array of counters
 * for tcp_clean_death() calls that counts the number of times each tag was
 * hit. It is triggered by the TCP_CLD_COUNTERS define.
*
 * How to add new counters.
 *
 * 1) Add a field in the tcp_stat structure describing your counter.
 * 2) Add a line in tcp_statistics with the name of the counter.
 *    IMPORTANT!! - make sure that both are in sync !!
 * 3) Use either TCP_STAT or TCP_DBGSTAT with the name.
 *    (an illustrative example follows this comment block)
*
* Please avoid using private counters which are not kstat-exported.
*
* TCP_TAG_CLEAN_DEATH set to 1 enables tagging of tcp_clean_death() instances
* in tcp_t structure.
*
* TCP_MAX_CLEAN_DEATH_TAG is the maximum number of possible clean death tags.
*/
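/*
 * Illustrative example of the procedure above, using a hypothetical
 * counter name (tcp_example_cnt is not a real counter in this file):
 *
 *	1) in tcp_stat_t:		kstat_named_t	tcp_example_cnt;
 *	2) in tcp_statistics:		{ "tcp_example_cnt", KSTAT_DATA_UINT64 },
 *	3) at the point of interest:	TCP_STAT(tcp_example_cnt);
 */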
#define TCP_COUNTERS 1
#define TCP_CLD_COUNTERS 0
#ifndef TCP_DEBUG_COUNTER
#ifdef DEBUG
#define TCP_DEBUG_COUNTER 1
#else
#define TCP_DEBUG_COUNTER 0
#endif
#endif
#define TCP_TAG_CLEAN_DEATH 1
#define TCP_MAX_CLEAN_DEATH_TAG 32
#ifdef lint
static int _lint_dummy_;
#endif
#if TCP_COUNTERS
#define TCP_STAT(x) (tcp_statistics.x.value.ui64++)
#define TCP_STAT_UPDATE(x, n) (tcp_statistics.x.value.ui64 += (n))
#define TCP_STAT_SET(x, n) (tcp_statistics.x.value.ui64 = (n))
#elif defined(lint)
#define TCP_STAT(x) ASSERT(_lint_dummy_ == 0);
#define TCP_STAT_UPDATE(x, n) ASSERT(_lint_dummy_ == 0);
#define TCP_STAT_SET(x, n) ASSERT(_lint_dummy_ == 0);
#else
#define TCP_STAT(x)
#define TCP_STAT_UPDATE(x, n)
#define TCP_STAT_SET(x, n)
#endif
#if TCP_CLD_COUNTERS
static uint_t tcp_clean_death_stat[TCP_MAX_CLEAN_DEATH_TAG];
#define TCP_CLD_STAT(x) tcp_clean_death_stat[x]++
#elif defined(lint)
#define TCP_CLD_STAT(x) ASSERT(_lint_dummy_ == 0);
#else
#define TCP_CLD_STAT(x)
#endif
#if TCP_DEBUG_COUNTER
#define TCP_DBGSTAT(x) atomic_add_64(&(tcp_statistics.x.value.ui64), 1)
#elif defined(lint)
#define TCP_DBGSTAT(x) ASSERT(_lint_dummy_ == 0);
#else
#define TCP_DBGSTAT(x)
#endif
typedef struct tcp_stat {
kstat_named_t tcp_time_wait;
kstat_named_t tcp_time_wait_syn;
kstat_named_t tcp_time_wait_syn_success;
kstat_named_t tcp_time_wait_syn_fail;
kstat_named_t tcp_reinput_syn;
kstat_named_t tcp_ip_output;
kstat_named_t tcp_detach_non_time_wait;
kstat_named_t tcp_detach_time_wait;
kstat_named_t tcp_time_wait_reap;
kstat_named_t tcp_clean_death_nondetached;
kstat_named_t tcp_reinit_calls;
kstat_named_t tcp_eager_err1;
kstat_named_t tcp_eager_err2;
kstat_named_t tcp_eager_blowoff_calls;
kstat_named_t tcp_eager_blowoff_q;
kstat_named_t tcp_eager_blowoff_q0;
kstat_named_t tcp_not_hard_bound;
kstat_named_t tcp_no_listener;
kstat_named_t tcp_found_eager;
kstat_named_t tcp_wrong_queue;
kstat_named_t tcp_found_eager_binding1;
kstat_named_t tcp_found_eager_bound1;
kstat_named_t tcp_eager_has_listener1;
kstat_named_t tcp_open_alloc;
kstat_named_t tcp_open_detached_alloc;
kstat_named_t tcp_rput_time_wait;
kstat_named_t tcp_listendrop;
kstat_named_t tcp_listendropq0;
kstat_named_t tcp_wrong_rq;
kstat_named_t tcp_rsrv_calls;
kstat_named_t tcp_eagerfree2;
kstat_named_t tcp_eagerfree3;
kstat_named_t tcp_eagerfree4;
kstat_named_t tcp_eagerfree5;
kstat_named_t tcp_timewait_syn_fail;
kstat_named_t tcp_listen_badflags;
kstat_named_t tcp_timeout_calls;
kstat_named_t tcp_timeout_cached_alloc;
kstat_named_t tcp_timeout_cancel_reqs;
kstat_named_t tcp_timeout_canceled;
kstat_named_t tcp_timermp_alloced;
kstat_named_t tcp_timermp_freed;
kstat_named_t tcp_timermp_allocfail;
kstat_named_t tcp_timermp_allocdblfail;
kstat_named_t tcp_push_timer_cnt;
kstat_named_t tcp_ack_timer_cnt;
kstat_named_t tcp_ire_null1;
kstat_named_t tcp_ire_null;
kstat_named_t tcp_ip_send;
kstat_named_t tcp_ip_ire_send;
kstat_named_t tcp_wsrv_called;
kstat_named_t tcp_flwctl_on;
kstat_named_t tcp_timer_fire_early;
kstat_named_t tcp_timer_fire_miss;
kstat_named_t tcp_freelist_cleanup;
kstat_named_t tcp_rput_v6_error;
kstat_named_t tcp_out_sw_cksum;
kstat_named_t tcp_zcopy_on;
kstat_named_t tcp_zcopy_off;
kstat_named_t tcp_zcopy_backoff;
kstat_named_t tcp_zcopy_disable;
kstat_named_t tcp_mdt_pkt_out;
kstat_named_t tcp_mdt_pkt_out_v4;
kstat_named_t tcp_mdt_pkt_out_v6;
kstat_named_t tcp_mdt_discarded;
kstat_named_t tcp_mdt_conn_halted1;
kstat_named_t tcp_mdt_conn_halted2;
kstat_named_t tcp_mdt_conn_halted3;
kstat_named_t tcp_mdt_conn_resumed1;
kstat_named_t tcp_mdt_conn_resumed2;
kstat_named_t tcp_mdt_legacy_small;
kstat_named_t tcp_mdt_legacy_all;
kstat_named_t tcp_mdt_legacy_ret;
kstat_named_t tcp_mdt_allocfail;
kstat_named_t tcp_mdt_addpdescfail;
kstat_named_t tcp_mdt_allocd;
kstat_named_t tcp_mdt_linked;
kstat_named_t tcp_fusion_flowctl;
kstat_named_t tcp_fusion_backenabled;
kstat_named_t tcp_fusion_urg;
kstat_named_t tcp_fusion_putnext;
kstat_named_t tcp_fusion_unfusable;
kstat_named_t tcp_fusion_aborted;
kstat_named_t tcp_fusion_unqualified;
kstat_named_t tcp_in_ack_unsent_drop;
} tcp_stat_t;
#if (TCP_COUNTERS || TCP_DEBUG_COUNTER)
static tcp_stat_t tcp_statistics = {
{ "tcp_time_wait", KSTAT_DATA_UINT64 },
{ "tcp_time_wait_syn", KSTAT_DATA_UINT64 },
{ "tcp_time_wait_success", KSTAT_DATA_UINT64 },
{ "tcp_time_wait_fail", KSTAT_DATA_UINT64 },
{ "tcp_reinput_syn", KSTAT_DATA_UINT64 },
{ "tcp_ip_output", KSTAT_DATA_UINT64 },
{ "tcp_detach_non_time_wait", KSTAT_DATA_UINT64 },
{ "tcp_detach_time_wait", KSTAT_DATA_UINT64 },
{ "tcp_time_wait_reap", KSTAT_DATA_UINT64 },
{ "tcp_clean_death_nondetached", KSTAT_DATA_UINT64 },
{ "tcp_reinit_calls", KSTAT_DATA_UINT64 },
{ "tcp_eager_err1", KSTAT_DATA_UINT64 },
{ "tcp_eager_err2", KSTAT_DATA_UINT64 },
{ "tcp_eager_blowoff_calls", KSTAT_DATA_UINT64 },
{ "tcp_eager_blowoff_q", KSTAT_DATA_UINT64 },
{ "tcp_eager_blowoff_q0", KSTAT_DATA_UINT64 },
{ "tcp_not_hard_bound", KSTAT_DATA_UINT64 },
{ "tcp_no_listener", KSTAT_DATA_UINT64 },
{ "tcp_found_eager", KSTAT_DATA_UINT64 },
{ "tcp_wrong_queue", KSTAT_DATA_UINT64 },
{ "tcp_found_eager_binding1", KSTAT_DATA_UINT64 },
{ "tcp_found_eager_bound1", KSTAT_DATA_UINT64 },
{ "tcp_eager_has_listener1", KSTAT_DATA_UINT64 },
{ "tcp_open_alloc", KSTAT_DATA_UINT64 },
{ "tcp_open_detached_alloc", KSTAT_DATA_UINT64 },
{ "tcp_rput_time_wait", KSTAT_DATA_UINT64 },
{ "tcp_listendrop", KSTAT_DATA_UINT64 },
{ "tcp_listendropq0", KSTAT_DATA_UINT64 },
{ "tcp_wrong_rq", KSTAT_DATA_UINT64 },
{ "tcp_rsrv_calls", KSTAT_DATA_UINT64 },
{ "tcp_eagerfree2", KSTAT_DATA_UINT64 },
{ "tcp_eagerfree3", KSTAT_DATA_UINT64 },
{ "tcp_eagerfree4", KSTAT_DATA_UINT64 },
{ "tcp_eagerfree5", KSTAT_DATA_UINT64 },
{ "tcp_timewait_syn_fail", KSTAT_DATA_UINT64 },
{ "tcp_listen_badflags", KSTAT_DATA_UINT64 },
{ "tcp_timeout_calls", KSTAT_DATA_UINT64 },
{ "tcp_timeout_cached_alloc", KSTAT_DATA_UINT64 },
{ "tcp_timeout_cancel_reqs", KSTAT_DATA_UINT64 },
{ "tcp_timeout_canceled", KSTAT_DATA_UINT64 },
{ "tcp_timermp_alloced", KSTAT_DATA_UINT64 },
{ "tcp_timermp_freed", KSTAT_DATA_UINT64 },
{ "tcp_timermp_allocfail", KSTAT_DATA_UINT64 },
{ "tcp_timermp_allocdblfail", KSTAT_DATA_UINT64 },
{ "tcp_push_timer_cnt", KSTAT_DATA_UINT64 },
{ "tcp_ack_timer_cnt", KSTAT_DATA_UINT64 },
{ "tcp_ire_null1", KSTAT_DATA_UINT64 },
{ "tcp_ire_null", KSTAT_DATA_UINT64 },
{ "tcp_ip_send", KSTAT_DATA_UINT64 },
{ "tcp_ip_ire_send", KSTAT_DATA_UINT64 },
{ "tcp_wsrv_called", KSTAT_DATA_UINT64 },
{ "tcp_flwctl_on", KSTAT_DATA_UINT64 },
{ "tcp_timer_fire_early", KSTAT_DATA_UINT64 },
{ "tcp_timer_fire_miss", KSTAT_DATA_UINT64 },
{ "tcp_freelist_cleanup", KSTAT_DATA_UINT64 },
{ "tcp_rput_v6_error", KSTAT_DATA_UINT64 },
{ "tcp_out_sw_cksum", KSTAT_DATA_UINT64 },
{ "tcp_zcopy_on", KSTAT_DATA_UINT64 },
{ "tcp_zcopy_off", KSTAT_DATA_UINT64 },
{ "tcp_zcopy_backoff", KSTAT_DATA_UINT64 },
{ "tcp_zcopy_disable", KSTAT_DATA_UINT64 },
{ "tcp_mdt_pkt_out", KSTAT_DATA_UINT64 },
{ "tcp_mdt_pkt_out_v4", KSTAT_DATA_UINT64 },
{ "tcp_mdt_pkt_out_v6", KSTAT_DATA_UINT64 },
{ "tcp_mdt_discarded", KSTAT_DATA_UINT64 },
{ "tcp_mdt_conn_halted1", KSTAT_DATA_UINT64 },
{ "tcp_mdt_conn_halted2", KSTAT_DATA_UINT64 },
{ "tcp_mdt_conn_halted3", KSTAT_DATA_UINT64 },
{ "tcp_mdt_conn_resumed1", KSTAT_DATA_UINT64 },
{ "tcp_mdt_conn_resumed2", KSTAT_DATA_UINT64 },
{ "tcp_mdt_legacy_small", KSTAT_DATA_UINT64 },
{ "tcp_mdt_legacy_all", KSTAT_DATA_UINT64 },
{ "tcp_mdt_legacy_ret", KSTAT_DATA_UINT64 },
{ "tcp_mdt_allocfail", KSTAT_DATA_UINT64 },
{ "tcp_mdt_addpdescfail", KSTAT_DATA_UINT64 },
{ "tcp_mdt_allocd", KSTAT_DATA_UINT64 },
{ "tcp_mdt_linked", KSTAT_DATA_UINT64 },
{ "tcp_fusion_flowctl", KSTAT_DATA_UINT64 },
{ "tcp_fusion_backenabled", KSTAT_DATA_UINT64 },
{ "tcp_fusion_urg", KSTAT_DATA_UINT64 },
{ "tcp_fusion_putnext", KSTAT_DATA_UINT64 },
{ "tcp_fusion_unfusable", KSTAT_DATA_UINT64 },
{ "tcp_fusion_aborted", KSTAT_DATA_UINT64 },
{ "tcp_fusion_unqualified", KSTAT_DATA_UINT64 },
{ "tcp_in_ack_unsent_drop", KSTAT_DATA_UINT64 },
};
static kstat_t *tcp_kstat;
#endif
/*
* Call either ip_output or ip_output_v6. This replaces putnext() calls on the
* tcp write side.
*/
#define CALL_IP_WPUT(connp, q, mp) { \
ASSERT(((q)->q_flag & QREADR) == 0); \
TCP_DBGSTAT(tcp_ip_output); \
connp->conn_send(connp, (mp), (q), IP_WPUT); \
}
/*
* Was this tcp created via socket() interface?
*/
#define TCP_IS_SOCKET(tcp) ((tcp)->tcp_issocket)
/* Macros for timestamp comparisons */
#define TSTMP_GEQ(a, b) ((int32_t)((a)-(b)) >= 0)
#define TSTMP_LT(a, b) ((int32_t)((a)-(b)) < 0)
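/*
 * The signed 32-bit subtraction above makes the comparison robust across
 * timestamp wraparound. For example, with a = 0x00000002 and b = 0xfffffffe,
 * (int32_t)(a - b) == 4 > 0, so TSTMP_GEQ(a, b) holds even though a < b
 * when compared as unsigned values.
 */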
/*
 * Parameters for TCP Initial Send Sequence number (ISS) generation. When
 * tcp_strong_iss is set to 1, which is the default, the ISS is calculated
 * by adding three components: a time component which grows by 1 every 4096
 * nanoseconds (versus every 4 microseconds suggested by RFC 793, page 27);
 * a per-connection component which grows by 125000 for every new connection;
 * and an "extra" component that grows by a random amount centered
 * approximately on 64000. This causes the ISS generator to cycle every
 * 4.89 hours if no TCP connections are made, and faster if connections are
 * made.
 *
 * When tcp_strong_iss is set to 0, the ISS is calculated by adding two
 * components: a time component which grows by 250000 every second; and
 * a per-connection component which grows by 125000 for every new connection.
 *
 * A third method for generating the ISS, used when tcp_strong_iss is set
 * to 2, is prescribed by Steve Bellovin. This involves adding time, the
 * 125000 per connection, and a one-way hash (MD5) of the connection ID
 * <sport, dport, src, dst>, a "truly" random (per RFC 1750) number, and
 * a console-entered password.
*/
#define ISS_INCR 250000
#define ISS_NSEC_SHT 12
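/*
 * Worked numbers for the strong-ISS case above, assuming the time component
 * is the high resolution clock shifted right by ISS_NSEC_SHT (12) bits: it
 * grows by roughly 10^9 / 4096 ~= 244140 per second, so a 32-bit ISS wraps
 * in about 2^32 / 244140 ~= 17592 seconds, i.e. the ~4.89 hours quoted
 * above when no per-connection increments are added.
 */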
static uint32_t tcp_iss_incr_extra; /* Incremented for each connection */
static kmutex_t tcp_iss_key_lock;
static MD5_CTX tcp_iss_key;
static sin_t sin_null; /* Zero address for quick clears */
static sin6_t sin6_null; /* Zero address for quick clears */
/* Packet dropper for TCP IPsec policy drops. */
static ipdropper_t tcp_dropper;
/*
* This implementation follows the 4.3BSD interpretation of the urgent
* pointer and not RFC 1122. Switching to RFC 1122 behavior would cause
* incompatible changes in protocols like telnet and rlogin.
*/
#define TCP_OLD_URP_INTERPRETATION 1
#define TCP_IS_DETACHED(tcp) ((tcp)->tcp_detached)
#define TCP_IS_DETACHED_NONEAGER(tcp) \
(TCP_IS_DETACHED(tcp) && \
(!(tcp)->tcp_hard_binding))
/*
* TCP reassembly macros. We hide starting and ending sequence numbers in
* b_next and b_prev of messages on the reassembly queue. The messages are
* chained using b_cont. These macros are used in tcp_reass() so we don't
* have to see the ugly casts and assignments.
*/
#define TCP_REASS_SEQ(mp) ((uint32_t)(uintptr_t)((mp)->b_next))
#define TCP_REASS_SET_SEQ(mp, u) ((mp)->b_next = \
(mblk_t *)(uintptr_t)(u))
#define TCP_REASS_END(mp) ((uint32_t)(uintptr_t)((mp)->b_prev))
#define TCP_REASS_SET_END(mp, u) ((mp)->b_prev = \
(mblk_t *)(uintptr_t)(u))
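/*
 * Example (illustrative) of stashing the sequence range of a segment on the
 * reassembly queue and reading it back later, where seg_seq and seg_len are
 * taken from the TCP header of the segment held in 'mp':
 *
 *	TCP_REASS_SET_SEQ(mp, seg_seq);
 *	TCP_REASS_SET_END(mp, seg_seq + seg_len);
 *	...
 *	uint32_t start = TCP_REASS_SEQ(mp);
 *	uint32_t end = TCP_REASS_END(mp);
 */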
/*
* Implementation of TCP Timers.
* =============================
*
* INTERFACE:
*
* There are two basic functions dealing with tcp timers:
*
* timeout_id_t tcp_timeout(connp, func, time)
* clock_t tcp_timeout_cancel(connp, timeout_id)
* TCP_TIMER_RESTART(tcp, intvl)
*
 * tcp_timeout() starts a timer for the 'tcp' instance, arranging to call
 * 'func' after 'time' ticks have passed. The function called by timeout()
 * must adhere to the same restrictions as a driver soft interrupt handler
 * - it must not sleep or call other functions that might sleep. The value
 * returned is an opaque non-zero timeout identifier that can be passed to
 * tcp_timeout_cancel() to cancel the request. The call to tcp_timeout()
 * may fail, in which case it returns zero. This is different from the
 * timeout(9F) function which never fails.
*
* The call-back function 'func' always receives 'connp' as its single
* argument. It is always executed in the squeue corresponding to the tcp
* structure. The tcp structure is guaranteed to be present at the time the
* call-back is called.
*
* NOTE: The call-back function 'func' is never called if tcp is in
* the TCPS_CLOSED state.
*
 * tcp_timeout_cancel() attempts to cancel a pending tcp_timeout()
 * request. Locks acquired by the call-back routine should not be held across
 * the call to tcp_timeout_cancel() or a deadlock may result.
 *
 * tcp_timeout_cancel() returns -1 if it cannot cancel the timeout request.
 * Otherwise, it returns an integer value greater than or equal to 0. In
 * particular, if the call-back function has already been placed on the
 * squeue, it cannot be canceled.
*
 * NOTE: both tcp_timeout() and tcp_timeout_cancel() should always be called
 * within the squeue context corresponding to the tcp instance. Since the
 * call-back is also called via the same squeue, none of the race
 * conditions described in the untimeout(9F) manual page apply, since all
 * calls are strictly serialized.
*
 * TCP_TIMER_RESTART() is a macro that attempts to cancel a pending timeout
 * stored in tcp_timer_tid and starts a new one using
 * MSEC_TO_TICK(intvl). It always uses the tcp_timer() function as the
 * call-back and stores the return value of tcp_timeout() in the
 * tcp->tcp_timer_tid field.
*
* NOTE: since the timeout cancellation is not guaranteed, the cancelled
* call-back may still be called, so it is possible tcp_timer() will be
* called several times. This should not be a problem since tcp_timer()
* should always check the tcp instance state.
*
*
* IMPLEMENTATION:
*
 * TCP timers are implemented using a three-stage process. The call to
 * tcp_timeout() uses the timeout(9F) function to call tcp_timer_callback()
 * when the timer expires. The tcp_timer_callback() arranges the call of the
 * tcp_timer_handler() function via the squeue corresponding to the tcp
 * instance. The tcp_timer_handler() calls the actual requested timeout
 * call-back and passes the tcp instance as an argument to it. Information
 * is passed between stages using the tcp_timer_t structure which contains
 * the connp pointer, the tcp call-back to call and the timeout id returned
 * by timeout(9F).
 *
 * The tcp_timer_t structure is not used directly; it is embedded in an
 * mblk_t-like structure that is used to enter a squeue. The mp->b_rptr of
 * this pseudo mblk points to the beginning of the tcp_timer_t structure.
 * tcp_timeout() returns a pointer to this mblk.
 *
 * The pseudo mblk is allocated from a special tcp_timer_cache kmem cache. It
 * looks like a normal mblk without an actual dblk attached to it.
 *
 * To optimize performance each tcp instance holds a small cache of timer
 * mblocks. In the current implementation it caches up to two timer mblocks
 * per tcp instance. The cache is preserved over tcp frees and is only freed
 * when the whole tcp structure is destroyed by its kmem destructor. Since
 * all tcp timer processing happens on the corresponding squeue, the cache
 * manipulation does not require any locks. Experiments show that the
 * majority of timer mblock allocations are satisfied from the tcp cache
 * and do not involve kmem calls.
 *
 * tcp_timeout() places a refhold on the connp instance which guarantees
 * that it will be present at the time the call-back function fires. The
 * tcp_timer_handler() drops the reference after calling the call-back, so
 * the call-back function does not need to manipulate the references
 * explicitly.
*/
typedef struct tcp_timer_s {
conn_t *connp;
void (*tcpt_proc)(void *);
timeout_id_t tcpt_tid;
} tcp_timer_t;
static kmem_cache_t *tcp_timercache;
kmem_cache_t *tcp_sack_info_cache;
kmem_cache_t *tcp_iphc_cache;
#define TCP_TIMER(tcp, f, tim) tcp_timeout(tcp->tcp_connp, f, tim)
#define TCP_TIMER_CANCEL(tcp, id) tcp_timeout_cancel(tcp->tcp_connp, id)
/*
* To restart the TCP retransmission timer.
*/
#define TCP_TIMER_RESTART(tcp, intvl) \
{ \
if ((tcp)->tcp_timer_tid != 0) { \
(void) TCP_TIMER_CANCEL((tcp), \
(tcp)->tcp_timer_tid); \
} \
(tcp)->tcp_timer_tid = TCP_TIMER((tcp), tcp_timer, \
MSEC_TO_TICK(intvl)); \
}
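/*
 * A minimal usage sketch of the timer interface above, assumed to run in
 * the squeue context of this tcp and assuming tcp->tcp_rto holds the RTO
 * in milliseconds (illustrative only): arm the retransmit timer, and later
 * cancel it if it has not fired yet.
 *
 *	TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
 *	...
 *	if (tcp->tcp_timer_tid != 0 &&
 *	    TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid) >= 0)
 *		tcp->tcp_timer_tid = 0;
 */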
/*
* For scalability, we must not run a timer for every TCP connection
 * in TIME_WAIT state. To see why, consider (for a time wait interval of
* 4 minutes):
* 1000 connections/sec * 240 seconds/time wait = 240,000 active conn's
*
* This list is ordered by time, so you need only delete from the head
* until you get to entries which aren't old enough to delete yet.
* The list consists of only the detached TIME_WAIT connections.
*
* Note that the timer (tcp_time_wait_expire) is started when the tcp_t
* becomes detached TIME_WAIT (either by changing the state and already
* being detached or the other way around). This means that the TIME_WAIT
* state can be extended (up to doubled) if the connection doesn't become
* detached for a long time.
*
* The list manipulations (including tcp_time_wait_next/prev)
* are protected by the tcp_time_wait_lock. The content of the
* detached TIME_WAIT connections is protected by the normal perimeters.
*/
typedef struct tcp_squeue_priv_s {
kmutex_t tcp_time_wait_lock;
	/* Protects the next 3 fields */
timeout_id_t tcp_time_wait_tid;
tcp_t *tcp_time_wait_head;
tcp_t *tcp_time_wait_tail;
tcp_t *tcp_free_list;
} tcp_squeue_priv_t;
/*
* TCP_TIME_WAIT_DELAY governs how often the time_wait_collector runs.
* Running it every 5 seconds seems to give the best results.
*/
#define TCP_TIME_WAIT_DELAY drv_usectohz(5000000)
#define TCP_XMIT_LOWATER 4096
#define TCP_XMIT_HIWATER 49152
#define TCP_RECV_LOWATER 2048
#define TCP_RECV_HIWATER 49152
/*
* PAWS needs a timer for 24 days. This is the number of ticks in 24 days
*/
#define PAWS_TIMEOUT ((clock_t)(24*24*60*60*hz))
#define TIDUSZ 4096 /* transport interface data unit size */
/*
 * Bind hash list size and hash function. The size has to be a power of 2
 * for hashing.
*/
#define TCP_BIND_FANOUT_SIZE 512
#define TCP_BIND_HASH(lport) (ntohs(lport) & (TCP_BIND_FANOUT_SIZE - 1))
/*
* Size of listen and acceptor hash list. It has to be a power of 2 for
* hashing.
*/
#define TCP_FANOUT_SIZE 256
#ifdef _ILP32
#define TCP_ACCEPTOR_HASH(accid) \
(((uint_t)(accid) >> 8) & (TCP_FANOUT_SIZE - 1))
#else
#define TCP_ACCEPTOR_HASH(accid) \
((uint_t)(accid) & (TCP_FANOUT_SIZE - 1))
#endif /* _ILP32 */
#define IP_ADDR_CACHE_SIZE 2048
#define IP_ADDR_CACHE_HASH(faddr) \
(ntohl(faddr) & (IP_ADDR_CACHE_SIZE -1))
/* Hash for HSPs uses all 32 bits, since both networks and hosts are in the table */
#define TCP_HSP_HASH_SIZE 256
#define TCP_HSP_HASH(addr) \
(((addr>>24) ^ (addr >>16) ^ \
(addr>>8) ^ (addr)) % TCP_HSP_HASH_SIZE)
/*
* TCP options struct returned from tcp_parse_options.
*/
typedef struct tcp_opt_s {
uint32_t tcp_opt_mss;
uint32_t tcp_opt_wscale;
uint32_t tcp_opt_ts_val;
uint32_t tcp_opt_ts_ecr;
tcp_t *tcp;
} tcp_opt_t;
/*
 * RFC 1323-recommended phrasing of the TSTAMP option, for easier parsing
*/
#ifdef _BIG_ENDIAN
#define TCPOPT_NOP_NOP_TSTAMP ((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | \
(TCPOPT_TSTAMP << 8) | 10)
#else
#define TCPOPT_NOP_NOP_TSTAMP ((10 << 24) | (TCPOPT_TSTAMP << 16) | \
(TCPOPT_NOP << 8) | TCPOPT_NOP)
#endif
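/*
 * Illustrative use of the constant above (a sketch, not the actual
 * tcp_parse_options() fast path): when the options area consists of exactly
 * two NOPs followed by one timestamp option, as RFC 1323 Appendix A
 * suggests, a parser can recognize it with a single aligned 32-bit compare
 * instead of walking the option list. 'tcph' points at the fixed 20-byte
 * TCP header:
 *
 *	uint32_t *opt = (uint32_t *)((uchar_t *)tcph + 20);
 *	uint32_t ts_val, ts_ecr;
 *
 *	if (*opt == TCPOPT_NOP_NOP_TSTAMP) {
 *		ts_val = ntohl(opt[1]);
 *		ts_ecr = ntohl(opt[2]);
 *	}
 */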
/*
* Flags returned from tcp_parse_options.
*/
#define TCP_OPT_MSS_PRESENT 1
#define TCP_OPT_WSCALE_PRESENT 2
#define TCP_OPT_TSTAMP_PRESENT 4
#define TCP_OPT_SACK_OK_PRESENT 8
#define TCP_OPT_SACK_PRESENT 16
/* TCP option length */
#define TCPOPT_NOP_LEN 1
#define TCPOPT_MAXSEG_LEN 4
#define TCPOPT_WS_LEN 3
#define TCPOPT_REAL_WS_LEN (TCPOPT_WS_LEN+1)
#define TCPOPT_TSTAMP_LEN 10
#define TCPOPT_REAL_TS_LEN (TCPOPT_TSTAMP_LEN+2)
#define TCPOPT_SACK_OK_LEN 2
#define TCPOPT_REAL_SACK_OK_LEN (TCPOPT_SACK_OK_LEN+2)
#define TCPOPT_REAL_SACK_LEN 4
#define TCPOPT_MAX_SACK_LEN 36
#define TCPOPT_HEADER_LEN 2
/* TCP cwnd burst factor. */
#define TCP_CWND_INFINITE 65535
#define TCP_CWND_SS 3
#define TCP_CWND_NORMAL 5
/* Maximum TCP initial cwin (start/restart). */
#define TCP_MAX_INIT_CWND 8
/*
 * Initialize cwnd according to RFC 3390. def_max_init_cwnd is
 * either tcp_slow_start_initial or tcp_slow_start_after_idle
 * depending on the caller. If the upper layer has not used the
 * TCP_INIT_CWND option to change the initial cwnd, tcp_init_cwnd
 * should be 0 and we use the formula in RFC 3390 to set tcp_cwnd.
 * If the upper layer has set tcp_init_cwnd, just use it to
 * calculate tcp_cwnd.
*/
#define SET_TCP_INIT_CWND(tcp, mss, def_max_init_cwnd) \
{ \
if ((tcp)->tcp_init_cwnd == 0) { \
(tcp)->tcp_cwnd = MIN(def_max_init_cwnd * (mss), \
MIN(4 * (mss), MAX(2 * (mss), 4380 / (mss) * (mss)))); \
} else { \
(tcp)->tcp_cwnd = (tcp)->tcp_init_cwnd * (mss); \
} \
tcp->tcp_cwnd_cnt = 0; \
}
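/*
 * Worked example of the RFC 3390 branch above: with mss = 1460 and
 * def_max_init_cwnd = 4 (the default tcp_slow_start_initial),
 *
 *	MIN(4 * 1460, MIN(4 * 1460, MAX(2 * 1460, 4380 / 1460 * 1460)))
 *	    = MIN(5840, MIN(5840, MAX(2920, 4380))) = 4380
 *
 * i.e. an initial cwnd of 3 segments.
 */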
/* TCP Timer control structure */
typedef struct tcpt_s {
pfv_t tcpt_pfv; /* The routine we are to call */
tcp_t *tcpt_tcp; /* The parameter we are to pass in */
} tcpt_t;
/* Host Specific Parameter structure */
typedef struct tcp_hsp {
struct tcp_hsp *tcp_hsp_next;
in6_addr_t tcp_hsp_addr_v6;
in6_addr_t tcp_hsp_subnet_v6;
uint_t tcp_hsp_vers; /* IPV4_VERSION | IPV6_VERSION */
int32_t tcp_hsp_sendspace;
int32_t tcp_hsp_recvspace;
int32_t tcp_hsp_tstamp;
} tcp_hsp_t;
#define tcp_hsp_addr V4_PART_OF_V6(tcp_hsp_addr_v6)
#define tcp_hsp_subnet V4_PART_OF_V6(tcp_hsp_subnet_v6)
/*
* Functions called directly via squeue having a prototype of edesc_t.
*/
void tcp_conn_request(void *arg, mblk_t *mp, void *arg2);
static void tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2);
void tcp_accept_finish(void *arg, mblk_t *mp, void *arg2);
static void tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2);
static void tcp_wput_proto(void *arg, mblk_t *mp, void *arg2);
void tcp_input(void *arg, mblk_t *mp, void *arg2);
void tcp_rput_data(void *arg, mblk_t *mp, void *arg2);
static void tcp_close_output(void *arg, mblk_t *mp, void *arg2);
static void tcp_output(void *arg, mblk_t *mp, void *arg2);
static void tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2);
static void tcp_timer_handler(void *arg, mblk_t *mp, void *arg2);
/* Prototype for TCP functions */
static void tcp_random_init(void);
int tcp_random(void);
static void tcp_accept(tcp_t *tcp, mblk_t *mp);
static void tcp_accept_swap(tcp_t *listener, tcp_t *acceptor,
tcp_t *eager);
static int tcp_adapt_ire(tcp_t *tcp, mblk_t *ire_mp);
static in_port_t tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
int reuseaddr, boolean_t quick_connect, boolean_t bind_to_req_port_only,
boolean_t user_specified);
static void tcp_closei_local(tcp_t *tcp);
static void tcp_close_detached(tcp_t *tcp);
static boolean_t tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph,
mblk_t *idmp, mblk_t **defermp);
static void tcp_connect(tcp_t *tcp, mblk_t *mp);
static void tcp_connect_ipv4(tcp_t *tcp, mblk_t *mp, ipaddr_t *dstaddrp,
in_port_t dstport, uint_t srcid);
static void tcp_connect_ipv6(tcp_t *tcp, mblk_t *mp, in6_addr_t *dstaddrp,
in_port_t dstport, uint32_t flowinfo, uint_t srcid,
uint32_t scope_id);
static int tcp_clean_death(tcp_t *tcp, int err, uint8_t tag);
static void tcp_def_q_set(tcp_t *tcp, mblk_t *mp);
static void tcp_disconnect(tcp_t *tcp, mblk_t *mp);
static char *tcp_display(tcp_t *tcp, char *, char);
static boolean_t tcp_eager_blowoff(tcp_t *listener, t_scalar_t seqnum);
static void tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only);
static void tcp_eager_unlink(tcp_t *tcp);
static void tcp_err_ack(tcp_t *tcp, mblk_t *mp, int tlierr,
int unixerr);
static void tcp_err_ack_prim(tcp_t *tcp, mblk_t *mp, int primitive,
int tlierr, int unixerr);
static int tcp_extra_priv_ports_get(queue_t *q, mblk_t *mp, caddr_t cp,
cred_t *cr);
static int tcp_extra_priv_ports_add(queue_t *q, mblk_t *mp,
char *value, caddr_t cp, cred_t *cr);
static int tcp_extra_priv_ports_del(queue_t *q, mblk_t *mp,
char *value, caddr_t cp, cred_t *cr);
static int tcp_tpistate(tcp_t *tcp);
static void tcp_bind_hash_insert(tf_t *tf, tcp_t *tcp,
int caller_holds_lock);
static void tcp_bind_hash_remove(tcp_t *tcp);
static tcp_t *tcp_acceptor_hash_lookup(t_uscalar_t id);
void tcp_acceptor_hash_insert(t_uscalar_t id, tcp_t *tcp);
static void tcp_acceptor_hash_remove(tcp_t *tcp);
static void tcp_capability_req(tcp_t *tcp, mblk_t *mp);
static void tcp_info_req(tcp_t *tcp, mblk_t *mp);
static void tcp_addr_req(tcp_t *tcp, mblk_t *mp);
static void tcp_addr_req_ipv6(tcp_t *tcp, mblk_t *mp);
static int tcp_header_init_ipv4(tcp_t *tcp);
static int tcp_header_init_ipv6(tcp_t *tcp);
int tcp_init(tcp_t *tcp, queue_t *q);
static int tcp_init_values(tcp_t *tcp);
static mblk_t *tcp_ip_advise_mblk(void *addr, int addr_len, ipic_t **ipic);
static mblk_t *tcp_ip_bind_mp(tcp_t *tcp, t_scalar_t bind_prim,
t_scalar_t addr_length);
static void tcp_ip_ire_mark_advice(tcp_t *tcp);
static void tcp_ip_notify(tcp_t *tcp);
static mblk_t *tcp_ire_mp(mblk_t *mp);
static void tcp_iss_init(tcp_t *tcp);
static void tcp_keepalive_killer(void *arg);
static int tcp_maxpsz_set(tcp_t *tcp, boolean_t set_maxblk);
static int tcp_parse_options(tcph_t *tcph, tcp_opt_t *tcpopt);
static void tcp_mss_set(tcp_t *tcp, uint32_t size);
static int tcp_conprim_opt_process(tcp_t *tcp, mblk_t *mp,
int *do_disconnectp, int *t_errorp, int *sys_errorp);
static boolean_t tcp_allow_connopt_set(int level, int name);
int tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr);
int tcp_opt_get(queue_t *q, int level, int name, uchar_t *ptr);
static int tcp_opt_get_user(ipha_t *ipha, uchar_t *ptr);
int tcp_opt_set(queue_t *q, uint_t optset_context, int level,
int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp,
uchar_t *outvalp, void *thisdg_attrs, cred_t *cr,
mblk_t *mblk);
static void tcp_opt_reverse(tcp_t *tcp, ipha_t *ipha);
static int tcp_opt_set_header(tcp_t *tcp, boolean_t checkonly,
uchar_t *ptr, uint_t len);
static int tcp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr);
static boolean_t tcp_param_register(tcpparam_t *tcppa, int cnt);
static int tcp_param_set(queue_t *q, mblk_t *mp, char *value,
caddr_t cp, cred_t *cr);
static int tcp_param_set_aligned(queue_t *q, mblk_t *mp, char *value,
caddr_t cp, cred_t *cr);
static void tcp_iss_key_init(uint8_t *phrase, int len);
static int tcp_1948_phrase_set(queue_t *q, mblk_t *mp, char *value,
caddr_t cp, cred_t *cr);
static void tcp_process_shrunk_swnd(tcp_t *tcp, uint32_t shrunk_cnt);
static mblk_t *tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start);
static void tcp_reass_elim_overlap(tcp_t *tcp, mblk_t *mp);
static void tcp_reinit(tcp_t *tcp);
static void tcp_reinit_values(tcp_t *tcp);
static void tcp_report_item(mblk_t *mp, tcp_t *tcp, int hashval,
tcp_t *thisstream, cred_t *cr);
static uint_t tcp_rcv_drain(queue_t *q, tcp_t *tcp);
static void tcp_rcv_enqueue(tcp_t *tcp, mblk_t *mp, uint_t seg_len);
static void tcp_sack_rxmit(tcp_t *tcp, uint_t *flags);
static boolean_t tcp_send_rst_chk(void);
static void tcp_ss_rexmit(tcp_t *tcp);
static mblk_t *tcp_rput_add_ancillary(tcp_t *tcp, mblk_t *mp, ip6_pkt_t *ipp);
static void tcp_process_options(tcp_t *, tcph_t *);
static void tcp_rput_common(tcp_t *tcp, mblk_t *mp);
static void tcp_rsrv(queue_t *q);
static int tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd);
static int tcp_snmp_get(queue_t *q, mblk_t *mpctl);
static int tcp_snmp_set(queue_t *q, int level, int name, uchar_t *ptr,
int len);
static int tcp_snmp_state(tcp_t *tcp);
static int tcp_status_report(queue_t *q, mblk_t *mp, caddr_t cp,
cred_t *cr);
static int tcp_bind_hash_report(queue_t *q, mblk_t *mp, caddr_t cp,
cred_t *cr);
static int tcp_listen_hash_report(queue_t *q, mblk_t *mp, caddr_t cp,
cred_t *cr);
static int tcp_conn_hash_report(queue_t *q, mblk_t *mp, caddr_t cp,
cred_t *cr);
static int tcp_acceptor_hash_report(queue_t *q, mblk_t *mp, caddr_t cp,
cred_t *cr);
static int tcp_host_param_set(queue_t *q, mblk_t *mp, char *value,
caddr_t cp, cred_t *cr);
static int tcp_host_param_set_ipv6(queue_t *q, mblk_t *mp, char *value,
caddr_t cp, cred_t *cr);
static int tcp_host_param_report(queue_t *q, mblk_t *mp, caddr_t cp,
cred_t *cr);
static void tcp_timer(void *arg);
static void tcp_timer_callback(void *);
static in_port_t tcp_update_next_port(in_port_t port, boolean_t random);
static in_port_t tcp_get_next_priv_port(void);
static void tcp_wput(queue_t *q, mblk_t *mp);
static void tcp_wput_sock(queue_t *q, mblk_t *mp);
void tcp_wput_accept(queue_t *q, mblk_t *mp);
static void tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent);
static void tcp_wput_flush(tcp_t *tcp, mblk_t *mp);
static void tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp);
static int tcp_send(queue_t *q, tcp_t *tcp, const int mss,
const int tcp_hdr_len, const int tcp_tcp_hdr_len,
const int num_sack_blk, int *usable, uint_t *snxt,
int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time,
const int mdt_thres);
static int tcp_multisend(queue_t *q, tcp_t *tcp, const int mss,
const int tcp_hdr_len, const int tcp_tcp_hdr_len,
const int num_sack_blk, int *usable, uint_t *snxt,
int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time,
const int mdt_thres);
static void tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now,
int num_sack_blk);
static void tcp_wsrv(queue_t *q);
static int tcp_xmit_end(tcp_t *tcp);
void tcp_xmit_listeners_reset(mblk_t *mp, uint_t ip_hdr_len);
static mblk_t *tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send,
int32_t *offset, mblk_t **end_mp, uint32_t seq,
boolean_t sendall, uint32_t *seg_len, boolean_t rexmit);
static void tcp_ack_timer(void *arg);
static mblk_t *tcp_ack_mp(tcp_t *tcp);
static void tcp_push_timer(void *arg);
static void tcp_xmit_early_reset(char *str, mblk_t *mp,
uint32_t seq, uint32_t ack, int ctl, uint_t ip_hdr_len);
static void tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq,
uint32_t ack, int ctl);
static tcp_hsp_t *tcp_hsp_lookup(ipaddr_t addr);
static tcp_hsp_t *tcp_hsp_lookup_ipv6(in6_addr_t *addr);
static int setmaxps(queue_t *q, int maxpsz);
static void tcp_set_rto(tcp_t *, time_t);
static boolean_t tcp_check_policy(tcp_t *, mblk_t *, ipha_t *, ip6_t *,
boolean_t, boolean_t);
static void tcp_icmp_error_ipv6(tcp_t *tcp, mblk_t *mp,
boolean_t ipsec_mctl);
static boolean_t tcp_cmpbuf(void *a, uint_t alen,
boolean_t b_valid, void *b, uint_t blen);
static boolean_t tcp_allocbuf(void **dstp, uint_t *dstlenp,
boolean_t src_valid, void *src, uint_t srclen);
static void tcp_savebuf(void **dstp, uint_t *dstlenp,
boolean_t src_valid, void *src, uint_t srclen);
static mblk_t *tcp_setsockopt_mp(int level, int cmd,
char *opt, int optlen);
static int tcp_pkt_set(uchar_t *, uint_t, uchar_t **, uint_t *);
static int tcp_build_hdrs(queue_t *, tcp_t *);
static void tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp,
uint32_t seg_seq, uint32_t seg_ack, int seg_len,
tcph_t *tcph);
boolean_t tcp_paws_check(tcp_t *tcp, tcph_t *tcph, tcp_opt_t *tcpoptp);
boolean_t tcp_reserved_port_add(int, in_port_t *, in_port_t *);
boolean_t tcp_reserved_port_del(in_port_t, in_port_t);
boolean_t tcp_reserved_port_check(in_port_t);
static tcp_t *tcp_alloc_temp_tcp(in_port_t);
static int tcp_reserved_port_list(queue_t *, mblk_t *, caddr_t, cred_t *);
static void tcp_timers_stop(tcp_t *);
static timeout_id_t tcp_timeout(conn_t *, void (*)(void *), clock_t);
static clock_t tcp_timeout_cancel(conn_t *, timeout_id_t);
static mblk_t *tcp_mdt_info_mp(mblk_t *);
static void tcp_mdt_update(tcp_t *, ill_mdt_capab_t *, boolean_t);
static int tcp_mdt_add_attrs(multidata_t *, const mblk_t *,
const boolean_t, const uint32_t, const uint32_t,
const uint32_t, const uint32_t);
static void tcp_multisend_data(tcp_t *, ire_t *, const ill_t *, mblk_t *,
const uint_t, const uint_t, boolean_t *);
static void tcp_send_data(tcp_t *, queue_t *, mblk_t *);
extern mblk_t *tcp_timermp_alloc(int);
extern void tcp_timermp_free(tcp_t *);
static void tcp_timer_free(tcp_t *tcp, mblk_t *mp);
static void tcp_stop_lingering(tcp_t *tcp);
static void tcp_close_linger_timeout(void *arg);
void tcp_ddi_init(void);
void tcp_ddi_destroy(void);
static void tcp_kstat_init(void);
static void tcp_kstat_fini(void);
static int tcp_kstat_update(kstat_t *kp, int rw);
void tcp_reinput(conn_t *connp, mblk_t *mp, squeue_t *sqp);
conn_t *tcp_get_next_conn(connf_t *, conn_t *);
static int tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp,
tcph_t *tcph, uint_t ipvers, mblk_t *idmp);
static int tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, ipha_t *ipha,
tcph_t *tcph, mblk_t *idmp);
static squeue_func_t tcp_squeue_switch(int);
static int tcp_open(queue_t *, dev_t *, int, int, cred_t *);
static int tcp_close(queue_t *, int);
static int tcpclose_accept(queue_t *);
static int tcp_modclose(queue_t *);
static void tcp_wput_mod(queue_t *, mblk_t *);
static void tcp_squeue_add(squeue_t *);
static boolean_t tcp_zcopy_check(tcp_t *);
static void tcp_zcopy_notify(tcp_t *);
static mblk_t *tcp_zcopy_disable(tcp_t *, mblk_t *);
static mblk_t *tcp_zcopy_backoff(tcp_t *, mblk_t *, int);
static void tcp_ire_ill_check(tcp_t *, ire_t *, ill_t *, boolean_t);
static void tcp_fuse(tcp_t *, uchar_t *, tcph_t *);
static void tcp_unfuse(tcp_t *);
static boolean_t tcp_fuse_output(tcp_t *, mblk_t *);
static void tcp_fuse_output_urg(tcp_t *, mblk_t *);
static boolean_t tcp_fuse_rcv_drain(queue_t *, tcp_t *, mblk_t **);
extern mblk_t *allocb_tryhard(size_t);
/*
* Routines related to the TCP_IOC_ABORT_CONN ioctl command.
*
* TCP_IOC_ABORT_CONN is a non-transparent ioctl command used for aborting
* TCP connections. To invoke this ioctl, a tcp_ioc_abort_conn_t structure
* (defined in tcp.h) needs to be filled in and passed into the kernel
* via an I_STR ioctl command (see streamio(7I)). The tcp_ioc_abort_conn_t
* structure contains the four-tuple of a TCP connection and a range of TCP
* states (specified by ac_start and ac_end). The use of wildcard addresses
* and ports is allowed. Connections with a matching four tuple and a state
* within the specified range will be aborted. The valid states for the
* ac_start and ac_end fields are in the range TCPS_SYN_SENT to TCPS_TIME_WAIT,
* inclusive.
*
* An application which has its connection aborted by this ioctl will receive
* an error that is dependent on the connection state at the time of the abort.
* If the connection state is < TCPS_TIME_WAIT, an application should behave as
 * though an RST packet has been received. If the connection state is equal to
* TCPS_TIME_WAIT, the 2MSL timeout will immediately be canceled by the kernel
* and all resources associated with the connection will be freed.
*/
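/*
 * A userland sketch (illustrative only) of invoking this ioctl; 'fd' is
 * assumed to be an open TCP STREAM descriptor, and the tcp_ioc_abort_conn_t
 * member names other than ac_start/ac_end are assumptions to be checked
 * against tcp.h:
 *
 *	tcp_ioc_abort_conn_t conn;
 *	struct strioctl ioc;
 *
 *	(void) memset(&conn, 0, sizeof (conn));
 *	(fill in conn.ac_local and conn.ac_remote with the four-tuple,
 *	possibly using wildcard addresses and ports)
 *	conn.ac_start = TCPS_SYN_SENT;
 *	conn.ac_end = TCPS_TIME_WAIT;
 *
 *	ioc.ic_cmd = TCP_IOC_ABORT_CONN;
 *	ioc.ic_timout = -1;
 *	ioc.ic_len = sizeof (conn);
 *	ioc.ic_dp = (char *)&conn;
 *	if (ioctl(fd, I_STR, &ioc) < 0)
 *		perror("TCP_IOC_ABORT_CONN");
 */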
static mblk_t *tcp_ioctl_abort_build_msg(tcp_ioc_abort_conn_t *, tcp_t *);
static void tcp_ioctl_abort_dump(tcp_ioc_abort_conn_t *);
static void tcp_ioctl_abort_handler(tcp_t *, mblk_t *);
static int tcp_ioctl_abort(tcp_ioc_abort_conn_t *);
static void tcp_ioctl_abort_conn(queue_t *, mblk_t *);
static int tcp_ioctl_abort_bucket(tcp_ioc_abort_conn_t *, int, int *,
boolean_t);
static void tcp_clrqfull(tcp_t *);
static void tcp_setqfull(tcp_t *);
static struct module_info tcp_rinfo = {
#define TCP_MODULE_ID 5105
TCP_MODULE_ID, "tcp", 0, INFPSZ, TCP_RECV_HIWATER, TCP_RECV_LOWATER
};
static struct module_info tcp_winfo = {
TCP_MODULE_ID, "tcp", 0, INFPSZ, 127, 16
};
/*
* Entry points for TCP as a module. It only allows SNMP requests
* to pass through.
*/
struct qinit tcp_mod_rinit = {
(pfi_t)putnext, NULL, tcp_open, tcp_modclose, NULL, &tcp_rinfo
};
struct qinit tcp_mod_winit = {
(pfi_t)tcp_wput_mod, NULL, tcp_open, tcp_modclose, NULL, &tcp_rinfo
};
/*
 * Entry points for TCP as a device. This is the normal case, which
 * supports the TCP functionality.
*/
struct qinit tcp_rinit = {
NULL, (pfi_t)tcp_rsrv, tcp_open, tcp_close, NULL, &tcp_rinfo
};
struct qinit tcp_winit = {
(pfi_t)tcp_wput, (pfi_t)tcp_wsrv, NULL, NULL, NULL, &tcp_winfo
};
/* Initial entry point for TCP in socket mode. */
struct qinit tcp_sock_winit = {
(pfi_t)tcp_wput_sock, (pfi_t)tcp_wsrv, NULL, NULL, NULL, &tcp_winfo
};
/*
 * Entry points for TCP as an acceptor STREAM opened by sockfs when doing
 * an accept. Avoid allocating data structures since the eager has already
 * been created.
*/
struct qinit tcp_acceptor_rinit = {
NULL, (pfi_t)tcp_rsrv, NULL, tcpclose_accept, NULL, &tcp_winfo
};
struct qinit tcp_acceptor_winit = {
(pfi_t)tcp_wput_accept, NULL, NULL, NULL, NULL, &tcp_winfo
};
struct streamtab tcpinfo = {
&tcp_rinit, &tcp_winit
};
extern squeue_func_t tcp_squeue_wput_proc;
extern squeue_func_t tcp_squeue_timer_proc;
/* Protected by tcp_g_q_lock */
static queue_t *tcp_g_q; /* Default queue used during detached closes */
kmutex_t tcp_g_q_lock;
/* Protected by tcp_hsp_lock */
/*
* XXX The host param mechanism should go away and instead we should use
* the metrics associated with the routes to determine the default sndspace
* and rcvspace.
*/
static tcp_hsp_t **tcp_hsp_hash; /* Hash table for HSPs */
krwlock_t tcp_hsp_lock;
/*
* Extra privileged ports. In host byte order.
* Protected by tcp_epriv_port_lock.
*/
#define TCP_NUM_EPRIV_PORTS 64
static int tcp_g_num_epriv_ports = TCP_NUM_EPRIV_PORTS;
static uint16_t tcp_g_epriv_ports[TCP_NUM_EPRIV_PORTS] = { 2049, 4045 };
kmutex_t tcp_epriv_port_lock;
/*
 * The smallest anonymous port in the privileged port range in which TCP
 * looks for a free port. Used with the TCP_ANONPRIVBIND option.
*/
static in_port_t tcp_min_anonpriv_port = 512;
/* Only modified during _init and _fini thus no locking is needed. */
static caddr_t tcp_g_nd; /* Head of 'named dispatch' variable list */
/* Hint not protected by any lock */
static uint_t tcp_next_port_to_try;
/* TCP bind hash list - all tcp_t with state >= BOUND. */
static tf_t tcp_bind_fanout[TCP_BIND_FANOUT_SIZE];
/* TCP queue hash list - all tcp_t in case they will be an acceptor. */
static tf_t tcp_acceptor_fanout[TCP_FANOUT_SIZE];
/*
 * TCP has a private interface for other kernel modules to reserve a
 * port range for their use. Once reserved, TCP will not use any ports
 * in the range. This interface relies on the TCP_EXCLBIND feature. If
 * the semantics of TCP_EXCLBIND are changed, the implementation of this
 * interface has to be re-verified.
 *
 * There can be TCP_RESERVED_PORTS_ARRAY_MAX_SIZE port ranges. Each port
 * range can cover at most TCP_RESERVED_PORTS_RANGE_MAX ports. A port
 * range is [port a, port b] inclusive. And each port range is between
 * TCP_SMALLEST_RESERVED_PORT and TCP_LARGEST_RESERVED_PORT inclusive.
 *
 * Note that the default anonymous port range starts from 32768. There is
 * no port "collision" between that and the reserved port range. If there
 * is a port collision (because the default smallest anonymous port is
 * lowered or some apps specifically bind to ports in the reserved port
 * range), the system may not be able to reserve a port range even if there
 * are enough unbound ports, as a reserved port range contains consecutive
 * ports.
*/
#define TCP_RESERVED_PORTS_ARRAY_MAX_SIZE 5
#define TCP_RESERVED_PORTS_RANGE_MAX 1000
#define TCP_SMALLEST_RESERVED_PORT 10240
#define TCP_LARGEST_RESERVED_PORT 20480
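/*
 * A minimal sketch of how another kernel module might use this interface;
 * illustrative only, and the meaning of the first argument (the requested
 * range size) is an assumption based on the prototype of
 * tcp_reserved_port_add() above:
 *
 *	in_port_t lo_port, hi_port;
 *
 *	if (tcp_reserved_port_add(100, &lo_port, &hi_port)) {
 *		(ports lo_port through hi_port are now reserved; TCP will
 *		not hand them out until tcp_reserved_port_del(lo_port,
 *		hi_port) is called)
 *	}
 */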
/* Structure to represent those reserved port ranges. */
typedef struct tcp_rport_s {
in_port_t lo_port;
in_port_t hi_port;
tcp_t **temp_tcp_array;
} tcp_rport_t;
/* The reserved port array. */
static tcp_rport_t tcp_reserved_port[TCP_RESERVED_PORTS_ARRAY_MAX_SIZE];
/* Lock to protect the tcp_reserved_port array. */
static krwlock_t tcp_reserved_port_lock;
/* The number of ranges in the array. */
uint32_t tcp_reserved_port_array_size = 0;
/*
* MIB-2 stuff for SNMP
* Note: tcpInErrs {tcp 15} is accumulated in ip.c
*/
mib2_tcp_t tcp_mib; /* SNMP fixed size info */
kstat_t *tcp_mibkp; /* kstat exporting tcp_mib data */
/*
 * Object to represent the database of options to search, passed to
 * the {sock,tpi}optcom_req() interface routine to take care of option
 * management and associated methods.
 * XXX These and other externs should ideally move to a TCP header
*/
extern optdb_obj_t tcp_opt_obj;
extern uint_t tcp_max_optsize;
boolean_t tcp_icmp_source_quench = B_FALSE;
/*
 * The following assumes TPI alignment requirements stay on 32-bit
 * boundaries.
*/
#define ROUNDUP32(x) \
(((x) + (sizeof (int32_t) - 1)) & ~(sizeof (int32_t) - 1))
/* Template for response to info request. */
static struct T_info_ack tcp_g_t_info_ack = {
T_INFO_ACK, /* PRIM_type */
0, /* TSDU_size */
T_INFINITE, /* ETSDU_size */
T_INVALID, /* CDATA_size */
T_INVALID, /* DDATA_size */
sizeof (sin_t), /* ADDR_size */
0, /* OPT_size - not initialized here */
TIDUSZ, /* TIDU_size */
T_COTS_ORD, /* SERV_type */
TCPS_IDLE, /* CURRENT_state */
(XPG4_1|EXPINLINE) /* PROVIDER_flag */
};
static struct T_info_ack tcp_g_t_info_ack_v6 = {
T_INFO_ACK, /* PRIM_type */
0, /* TSDU_size */
T_INFINITE, /* ETSDU_size */
T_INVALID, /* CDATA_size */
T_INVALID, /* DDATA_size */
sizeof (sin6_t), /* ADDR_size */
0, /* OPT_size - not initialized here */
TIDUSZ, /* TIDU_size */
T_COTS_ORD, /* SERV_type */
TCPS_IDLE, /* CURRENT_state */
(XPG4_1|EXPINLINE) /* PROVIDER_flag */
};
#define MS 1L
#define SECONDS (1000 * MS)
#define MINUTES (60 * SECONDS)
#define HOURS (60 * MINUTES)
#define DAYS (24 * HOURS)
#define PARAM_MAX (~(uint32_t)0)
/* Max size IP datagram is 64k - 1 */
#define TCP_MSS_MAX_IPV4 (IP_MAXPACKET - (sizeof (ipha_t) + sizeof (tcph_t)))
#define TCP_MSS_MAX_IPV6 (IP_MAXPACKET - (sizeof (ip6_t) + sizeof (tcph_t)))
/* Max of the above */
#define TCP_MSS_MAX TCP_MSS_MAX_IPV4
/* Largest TCP port number */
#define TCP_MAX_PORT (64 * 1024 - 1)
/*
 * tcp_wroff_xtra is the extra space in front of the TCP/IP header for the
 * link layer header. It has to be a multiple of 4.
*/
static tcpparam_t tcp_wroff_xtra_param = { 0, 256, 32, "tcp_wroff_xtra" };
#define tcp_wroff_xtra tcp_wroff_xtra_param.tcp_param_val
/*
* All of these are alterable, within the min/max values given, at run time.
* Note that the default value of "tcp_time_wait_interval" is four minutes,
* per the TCP spec.
*/
/* BEGIN CSTYLED */
tcpparam_t tcp_param_arr[] = {
/*min max value name */
{ 1*SECONDS, 10*MINUTES, 1*MINUTES, "tcp_time_wait_interval"},
{ 1, PARAM_MAX, 128, "tcp_conn_req_max_q" },
{ 0, PARAM_MAX, 1024, "tcp_conn_req_max_q0" },
{ 1, 1024, 1, "tcp_conn_req_min" },
{ 0*MS, 20*SECONDS, 0*MS, "tcp_conn_grace_period" },
{ 128, (1<<30), 1024*1024, "tcp_cwnd_max" },
{ 0, 10, 0, "tcp_debug" },
{ 1024, (32*1024), 1024, "tcp_smallest_nonpriv_port"},
{ 1*SECONDS, PARAM_MAX, 3*MINUTES, "tcp_ip_abort_cinterval"},
{ 1*SECONDS, PARAM_MAX, 3*MINUTES, "tcp_ip_abort_linterval"},
{ 500*MS, PARAM_MAX, 8*MINUTES, "tcp_ip_abort_interval"},
{ 1*SECONDS, PARAM_MAX, 10*SECONDS, "tcp_ip_notify_cinterval"},
{ 500*MS, PARAM_MAX, 10*SECONDS, "tcp_ip_notify_interval"},
{ 1, 255, 64, "tcp_ipv4_ttl"},
{ 10*SECONDS, 10*DAYS, 2*HOURS, "tcp_keepalive_interval"},
{ 0, 100, 10, "tcp_maxpsz_multiplier" },
{ 1, TCP_MSS_MAX_IPV4, 536, "tcp_mss_def_ipv4"},
{ 1, TCP_MSS_MAX_IPV4, TCP_MSS_MAX_IPV4, "tcp_mss_max_ipv4"},
{ 1, TCP_MSS_MAX, 108, "tcp_mss_min"},
{ 1, (64*1024)-1, (4*1024)-1, "tcp_naglim_def"},
{ 1*MS, 20*SECONDS, 3*SECONDS, "tcp_rexmit_interval_initial"},
{ 1*MS, 2*HOURS, 60*SECONDS, "tcp_rexmit_interval_max"},
{ 1*MS, 2*HOURS, 400*MS, "tcp_rexmit_interval_min"},
{ 1*MS, 1*MINUTES, 100*MS, "tcp_deferred_ack_interval" },
{ 0, 16, 0, "tcp_snd_lowat_fraction" },
{ 0, 128000, 0, "tcp_sth_rcv_hiwat" },
{ 0, 128000, 0, "tcp_sth_rcv_lowat" },
{ 1, 10000, 3, "tcp_dupack_fast_retransmit" },
{ 0, 1, 0, "tcp_ignore_path_mtu" },
{ 1024, TCP_MAX_PORT, 32*1024, "tcp_smallest_anon_port"},
{ 1024, TCP_MAX_PORT, TCP_MAX_PORT, "tcp_largest_anon_port"},
{ TCP_XMIT_LOWATER, (1<<30), TCP_XMIT_HIWATER,"tcp_xmit_hiwat"},
{ TCP_XMIT_LOWATER, (1<<30), TCP_XMIT_LOWATER,"tcp_xmit_lowat"},
{ TCP_RECV_LOWATER, (1<<30), TCP_RECV_HIWATER,"tcp_recv_hiwat"},
{ 1, 65536, 4, "tcp_recv_hiwat_minmss"},
{ 1*SECONDS, PARAM_MAX, 675*SECONDS, "tcp_fin_wait_2_flush_interval"},
{ 0, TCP_MSS_MAX, 64, "tcp_co_min"},
{ 8192, (1<<30), 1024*1024, "tcp_max_buf"},
/*
* Question: What default value should I set for tcp_strong_iss?
*/
{ 0, 2, 1, "tcp_strong_iss"},
{ 0, 65536, 20, "tcp_rtt_updates"},
{ 0, 1, 1, "tcp_wscale_always"},
{ 0, 1, 0, "tcp_tstamp_always"},
{ 0, 1, 1, "tcp_tstamp_if_wscale"},
{ 0*MS, 2*HOURS, 0*MS, "tcp_rexmit_interval_extra"},
{ 0, 16, 2, "tcp_deferred_acks_max"},
{ 1, 16384, 4, "tcp_slow_start_after_idle"},
{ 1, 4, 4, "tcp_slow_start_initial"},
{ 10*MS, 50*MS, 20*MS, "tcp_co_timer_interval"},
{ 0, 2, 2, "tcp_sack_permitted"},
{ 0, 1, 0, "tcp_trace"},
{ 0, 1, 1, "tcp_compression_enabled"},
{ 0, IPV6_MAX_HOPS, IPV6_DEFAULT_HOPS, "tcp_ipv6_hoplimit"},
{ 1, TCP_MSS_MAX_IPV6, 1220, "tcp_mss_def_ipv6"},
{ 1, TCP_MSS_MAX_IPV6, TCP_MSS_MAX_IPV6, "tcp_mss_max_ipv6"},
{ 0, 1, 0, "tcp_rev_src_routes"},
{ 10*MS, 500*MS, 50*MS, "tcp_local_dack_interval"},
{ 100*MS, 60*SECONDS, 1*SECONDS, "tcp_ndd_get_info_interval"},
{ 0, 16, 8, "tcp_local_dacks_max"},
{ 0, 2, 1, "tcp_ecn_permitted"},
{ 0, 1, 1, "tcp_rst_sent_rate_enabled"},
{ 0, PARAM_MAX, 40, "tcp_rst_sent_rate"},
{ 0, 100*MS, 50*MS, "tcp_push_timer_interval"},
{ 0, 1, 0, "tcp_use_smss_as_mss_opt"},
{ 0, PARAM_MAX, 8*MINUTES, "tcp_keepalive_abort_interval"},
};
/* END CSTYLED */
#define tcp_time_wait_interval tcp_param_arr[0].tcp_param_val
#define tcp_conn_req_max_q tcp_param_arr[1].tcp_param_val
#define tcp_conn_req_max_q0 tcp_param_arr[2].tcp_param_val
#define tcp_conn_req_min tcp_param_arr[3].tcp_param_val
#define tcp_conn_grace_period tcp_param_arr[4].tcp_param_val
#define tcp_cwnd_max_ tcp_param_arr[5].tcp_param_val
#define tcp_dbg tcp_param_arr[6].tcp_param_val
#define tcp_smallest_nonpriv_port tcp_param_arr[7].tcp_param_val
#define tcp_ip_abort_cinterval tcp_param_arr[8].tcp_param_val
#define tcp_ip_abort_linterval tcp_param_arr[9].tcp_param_val
#define tcp_ip_abort_interval tcp_param_arr[10].tcp_param_val
#define tcp_ip_notify_cinterval tcp_param_arr[11].tcp_param_val
#define tcp_ip_notify_interval tcp_param_arr[12].tcp_param_val
#define tcp_ipv4_ttl tcp_param_arr[13].tcp_param_val
#define tcp_keepalive_interval_high tcp_param_arr[14].tcp_param_max
#define tcp_keepalive_interval tcp_param_arr[14].tcp_param_val
#define tcp_keepalive_interval_low tcp_param_arr[14].tcp_param_min
#define tcp_maxpsz_multiplier tcp_param_arr[15].tcp_param_val
#define tcp_mss_def_ipv4 tcp_param_arr[16].tcp_param_val
#define tcp_mss_max_ipv4 tcp_param_arr[17].tcp_param_val
#define tcp_mss_min tcp_param_arr[18].tcp_param_val
#define tcp_naglim_def tcp_param_arr[19].tcp_param_val
#define tcp_rexmit_interval_initial tcp_param_arr[20].tcp_param_val
#define tcp_rexmit_interval_max tcp_param_arr[21].tcp_param_val
#define tcp_rexmit_interval_min tcp_param_arr[22].tcp_param_val
#define tcp_deferred_ack_interval tcp_param_arr[23].tcp_param_val
#define tcp_snd_lowat_fraction tcp_param_arr[24].tcp_param_val
#define tcp_sth_rcv_hiwat tcp_param_arr[25].tcp_param_val
#define tcp_sth_rcv_lowat tcp_param_arr[26].tcp_param_val
#define tcp_dupack_fast_retransmit tcp_param_arr[27].tcp_param_val
#define tcp_ignore_path_mtu tcp_param_arr[28].tcp_param_val
#define tcp_smallest_anon_port tcp_param_arr[29].tcp_param_val
#define tcp_largest_anon_port tcp_param_arr[30].tcp_param_val
#define tcp_xmit_hiwat tcp_param_arr[31].tcp_param_val
#define tcp_xmit_lowat tcp_param_arr[32].tcp_param_val
#define tcp_recv_hiwat tcp_param_arr[33].tcp_param_val
#define tcp_recv_hiwat_minmss tcp_param_arr[34].tcp_param_val
#define tcp_fin_wait_2_flush_interval tcp_param_arr[35].tcp_param_val
#define tcp_co_min tcp_param_arr[36].tcp_param_val
#define tcp_max_buf tcp_param_arr[37].tcp_param_val
#define tcp_strong_iss tcp_param_arr[38].tcp_param_val
#define tcp_rtt_updates tcp_param_arr[39].tcp_param_val
#define tcp_wscale_always tcp_param_arr[40].tcp_param_val
#define tcp_tstamp_always tcp_param_arr[41].tcp_param_val
#define tcp_tstamp_if_wscale tcp_param_arr[42].tcp_param_val
#define tcp_rexmit_interval_extra tcp_param_arr[43].tcp_param_val
#define tcp_deferred_acks_max tcp_param_arr[44].tcp_param_val
#define tcp_slow_start_after_idle tcp_param_arr[45].tcp_param_val
#define tcp_slow_start_initial tcp_param_arr[46].tcp_param_val
#define tcp_co_timer_interval tcp_param_arr[47].tcp_param_val
#define tcp_sack_permitted tcp_param_arr[48].tcp_param_val
#define tcp_trace tcp_param_arr[49].tcp_param_val
#define tcp_compression_enabled tcp_param_arr[50].tcp_param_val
#define tcp_ipv6_hoplimit tcp_param_arr[51].tcp_param_val
#define tcp_mss_def_ipv6 tcp_param_arr[52].tcp_param_val
#define tcp_mss_max_ipv6 tcp_param_arr[53].tcp_param_val
#define tcp_rev_src_routes tcp_param_arr[54].tcp_param_val
#define tcp_local_dack_interval tcp_param_arr[55].tcp_param_val
#define tcp_ndd_get_info_interval tcp_param_arr[56].tcp_param_val
#define tcp_local_dacks_max tcp_param_arr[57].tcp_param_val
#define tcp_ecn_permitted tcp_param_arr[58].tcp_param_val
#define tcp_rst_sent_rate_enabled tcp_param_arr[59].tcp_param_val
#define tcp_rst_sent_rate tcp_param_arr[60].tcp_param_val
#define tcp_push_timer_interval tcp_param_arr[61].tcp_param_val
#define tcp_use_smss_as_mss_opt tcp_param_arr[62].tcp_param_val
#define tcp_keepalive_abort_interval_high tcp_param_arr[63].tcp_param_max
#define tcp_keepalive_abort_interval tcp_param_arr[63].tcp_param_val
#define tcp_keepalive_abort_interval_low tcp_param_arr[63].tcp_param_min
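/*
 * Note that the array index used in each #define above must match the
 * position of the corresponding entry in tcp_param_arr[]; the two must be
 * kept in sync whenever a tunable is added or removed.
 */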
/*
* tcp_mdt_hdr_{head,tail}_min are the leading and trailing spaces of
* each header fragment in the header buffer. Each parameter value has
* to be a multiple of 4 (32-bit aligned).
*/
static tcpparam_t tcp_mdt_head_param = { 32, 256, 32, "tcp_mdt_hdr_head_min" };
static tcpparam_t tcp_mdt_tail_param = { 0, 256, 32, "tcp_mdt_hdr_tail_min" };
#define tcp_mdt_hdr_head_min tcp_mdt_head_param.tcp_param_val
#define tcp_mdt_hdr_tail_min tcp_mdt_tail_param.tcp_param_val
/*
* tcp_mdt_max_pbufs is the upper limit value that tcp uses to figure out
* the maximum number of payload buffers associated with a Multidata.
*/
static tcpparam_t tcp_mdt_max_pbufs_param =
{ 1, MULTIDATA_MAX_PBUFS, MULTIDATA_MAX_PBUFS, "tcp_mdt_max_pbufs" };
#define tcp_mdt_max_pbufs tcp_mdt_max_pbufs_param.tcp_param_val
/* Round up the value to the nearest mss. */
#define MSS_ROUNDUP(value, mss) ((((value) - 1) / (mss) + 1) * (mss))
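/* e.g. MSS_ROUNDUP(3000, 1460) == 4380, i.e. three full 1460-byte segments. */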
/*
* Set ECN capable transport (ECT) code point in IP header.
*
* Note that there are 2 ECT code points '01' and '10', which are called
* ECT(1) and ECT(0) respectively. Here we follow the original ECT code
* point ECT(0) for TCP as described in RFC 2481.
*/
#define SET_ECT(tcp, iph) \
if ((tcp)->tcp_ipversion == IPV4_VERSION) { \
/* We need to clear the code point first. */ \
((ipha_t *)(iph))->ipha_type_of_service &= 0xFC; \
((ipha_t *)(iph))->ipha_type_of_service |= IPH_ECN_ECT0; \
} else { \
((ip6_t *)(iph))->ip6_vcf &= htonl(0xFFCFFFFF); \
((ip6_t *)(iph))->ip6_vcf |= htonl(IPH_ECN_ECT0 << 20); \
}
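/*
 * In the IPv4 case the two low-order ECN bits of the TOS byte are cleared
 * (& 0xFC) before ECT(0) is set; in the IPv6 case the same two bits live
 * at bits 20-21 of the version/class/flow word, hence the htonl(0xFFCFFFFF)
 * mask and the IPH_ECN_ECT0 << 20 shift.
 */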
/*
* The format argument to pass to tcp_display().
* DISP_PORT_ONLY means that the returned string has only port info.
* DISP_ADDR_AND_PORT means that the returned string also contains the
* remote and local IP address.
*/
#define DISP_PORT_ONLY 1
#define DISP_ADDR_AND_PORT 2
/*
* This controls the rate at which certain ndd info report functions can
* be used by non-privileged users. It stores the last time such info was
* requested. When those report functions are called again, the stored
* value is compared against the current time using the ndd parameter
* tcp_ndd_get_info_interval.
*/
static clock_t tcp_last_ndd_get_info_time = 0;
#define NDD_TOO_QUICK_MSG \
"ndd get info rate too high for non-privileged users, try again " \
"later.\n"
#define NDD_OUT_OF_BUF_MSG "<< Out of buffer >>\n"
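/*
 * IS_VMLOANED_MBLK identifies an mblk whose data buffer is on loan for
 * zero-copy (STRUIO_ZC set in db_struioflag) rather than owned outright
 * by the stack.
 */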
#define IS_VMLOANED_MBLK(mp) \
(((mp)->b_datap->db_struioflag & STRUIO_ZC) != 0)
/*
* These two variables control the rate at which TCP generates RSTs in
* response to segments not belonging to any connection. We limit
* TCP to sending out at most tcp_rst_sent_rate (ndd param) RSTs in
* each 1 second interval. This is to protect TCP against DoS attacks.
*/
static clock_t tcp_last_rst_intrvl;
static uint32_t tcp_rst_cnt;
/* The number of RSTs not sent because of the rate limit. */
static uint32_t tcp_rst_unsent;
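/*
 * Conceptually the limiter works as follows (illustrative sketch; the
 * actual check is done where RSTs are generated): if more than one second
 * has elapsed since tcp_last_rst_intrvl, restart the interval and reset
 * tcp_rst_cnt; otherwise, once ++tcp_rst_cnt exceeds tcp_rst_sent_rate,
 * the RST is suppressed and tcp_rst_unsent is incremented.
 */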
/* Enable or disable b_cont M_MULTIDATA chaining for MDT. */
boolean_t tcp_mdt_chain = B_TRUE;
/*
* MDT threshold in the form of effective send MSS multiplier; we take
* the MDT path if the amount of unsent data exceeds the threshold value
* (default threshold is 1*SMSS).
*/
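/*
 * For example, with the default multiplier of 1 and an effective SMSS of
 * 1460 bytes, the MDT path is taken once more than 1460 bytes of data are
 * unsent.
 */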
uint_t tcp_mdt_smss_threshold = 1;
uint32_t do_tcpzcopy = 1; /* 0: disable, 1: enable, 2: force */
/*
* When set, forces all connections to obey the value of the
* tcp_maxpsz_multiplier tunable (settable via NDD). Otherwise (the
* default), the per-connection behavior is determined dynamically
* during tcp_adapt_ire().
*/
boolean_t tcp_static_maxpsz = B_FALSE;
/* If set to 0, pick ephemeral port sequentially; otherwise randomly. */
uint32_t tcp_random_anon_port = 1;
/*
* If tcp_drop_ack_unsent_cnt is greater than 0, TCP stops responding
* with an ACK once it has received more than tcp_drop_ack_unsent_cnt
* ACKs acknowledging unsent data. RFC 793 requires that TCP respond
* with an ACK to such a bogus ACK. By not following the RFC, we
* prevent TCP from getting into an ACK storm if somehow an attacker
* successfully spoofs an acceptable segment to our peer, or when our
* peer is "confused."
*/
uint32_t tcp_drop_ack_unsent_cnt = 10;
/*
* Hook functions to enable cluster networking
* On non-clustered systems these vectors must always be NULL.
*/
void (*cl_inet_listen)(uint8_t protocol, sa_family_t addr_family,
uint8_t *laddrp, in_port_t lport) = NULL;
void (*cl_inet_unlisten)(uint8_t protocol, sa_family_t addr_family,
uint8_t *laddrp, in_port_t lport) = NULL;
void (*cl_inet_connect)(uint8_t protocol, sa_family_t addr_family,
uint8_t *laddrp, in_port_t lport,
uint8_t *faddrp, in_port_t fport) = NULL;
void (*cl_inet_disconnect)(uint8_t protocol, sa_family_t addr_family,
uint8_t *laddrp, in_port_t lport,
uint8_t *faddrp, in_port_t fport) = NULL;
/*
* The following are defined in ip.c
*/
extern int (*cl_inet_isclusterwide)(uint8_t protocol, sa_family_t addr_family,
uint8_t *laddrp);
extern uint32_t (*cl_inet_ipident)(uint8_t protocol, sa_family_t addr_family,
uint8_t *laddrp, uint8_t *faddrp);
#define CL_INET_CONNECT(tcp) { \
if (cl_inet_connect != NULL) { \
/* \
* Running in cluster mode - register active connection \
* information \
*/ \
if ((tcp)->tcp_ipversion == IPV4_VERSION) { \
if ((tcp)->tcp_ipha->ipha_src != 0) { \
(*cl_inet_connect)(IPPROTO_TCP, AF_INET,\
(uint8_t *)(&((tcp)->tcp_ipha->ipha_src)),\
(in_port_t)(tcp)->tcp_lport, \
(uint8_t *)(&((tcp)->tcp_ipha->ipha_dst)),\
(in_port_t)(tcp)->tcp_fport); \
} \
} else { \
if (!IN6_IS_ADDR_UNSPECIFIED( \
&(tcp)->tcp_ip6h->ip6_src)) {\
(*cl_inet_connect)(IPPROTO_TCP, AF_INET6,\
(uint8_t *)(&((tcp)->tcp_ip6h->ip6_src)),\
(in_port_t)(tcp)->tcp_lport, \
(uint8_t *)(&((tcp)->tcp_ip6h->ip6_dst)),\
(in_port_t)(tcp)->tcp_fport); \
} \
} \
} \
}
#define CL_INET_DISCONNECT(tcp) { \
if (cl_inet_disconnect != NULL) { \
/* \
* Running in cluster mode - deregister active \
* connection information \
*/ \
if ((tcp)->tcp_ipversion == IPV4_VERSION) { \
if ((tcp)->tcp_ip_src != 0) { \
(*cl_inet_disconnect)(IPPROTO_TCP, \
AF_INET, \
(uint8_t *)(&((tcp)->tcp_ip_src)),\
(in_port_t)(tcp)->tcp_lport, \
(uint8_t *) \
(&((tcp)->tcp_ipha->ipha_dst)),\
(in_port_t)(tcp)->tcp_fport); \
} \
} else { \
if (!IN6_IS_ADDR_UNSPECIFIED( \
&(tcp)->tcp_ip_src_v6)) { \
(*cl_inet_disconnect)(IPPROTO_TCP, AF_INET6,\
(uint8_t *)(&((tcp)->tcp_ip_src_v6)),\
(in_port_t)(tcp)->tcp_lport, \
(uint8_t *) \
(&((tcp)->tcp_ip6h->ip6_dst)),\
(in_port_t)(tcp)->tcp_fport); \
} \
} \
} \
}
/*
* Cluster networking hook for traversing current connection list.
* This routine is used to extract the current list of live connections
* which must continue to be dispatched to this node.
*/
int cl_tcp_walk_list(int (*callback)(cl_tcp_info_t *, void *), void *arg);
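/*
 * IPH_TCPH_CHECKSUMP points at the TCP checksum field of a packet: 'hlen'
 * bytes of IP header followed by the 16-byte offset of the checksum field
 * within the TCP header.
 */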
#define IPH_TCPH_CHECKSUMP(ipha, hlen) \
((uint16_t *)(((uchar_t *)(ipha)) + ((hlen) + 16)))
#ifdef _BIG_ENDIAN
#define IP_TCP_CSUM_COMP IPPROTO_TCP
#else
#define IP_TCP_CSUM_COMP (IPPROTO_TCP << 8)
#endif
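/*
 * IP_HDR_CKSUM finishes an IPv4 header checksum: 'sum' is expected to
 * already hold a partial one's-complement sum (presumably covering the
 * address fields); the remaining header words are added in, the carries
 * are folded, and the complemented result is stored into
 * ipha_hdr_checksum.
 */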
#define IP_HDR_CKSUM(ipha, sum, v_hlen_tos_len, ttl_protocol) { \
(sum) += (ttl_protocol) + (ipha)->ipha_ident + \
((v_hlen_tos_len) >> 16) + \
((v_hlen_tos_len) & 0xFFFF) + \
(ipha)->ipha_fragment_offset_and_flags; \
(sum) = (((sum) & 0xFFFF) + ((sum) >> 16)); \
(sum) = ~((sum) + ((sum) >> 16)); \
(ipha)->ipha_hdr_checksum = (uint16_t)(sum); \
}
/*
* Macros that determine whether or not IP processing is needed for TCP.
*/
#define TCP_IPOPT_POLICY_V4(tcp) \
((tcp)->tcp_ipversion == IPV4_VERSION && \
((tcp)->tcp_ip_hdr_len != IP_SIMPLE_HDR_LENGTH || \
CONN_OUTBOUND_POLICY_PRESENT((tcp)->tcp_connp) || \
CONN_INBOUND_POLICY_PRESENT((tcp)->tcp_connp)))
#define TCP_IPOPT_POLICY_V6(tcp) \
((tcp)->tcp_ipversion == IPV6_VERSION && \
((tcp)->tcp_ip_hdr_len != IPV6_HDR_LEN || \
CONN_OUTBOUND_POLICY_PRESENT_V6((tcp)->tcp_connp) || \
CONN_INBOUND_POLICY_PRESENT_V6((tcp)->tcp_connp)))
#define TCP_LOOPBACK_IP(tcp) \
(TCP_IPOPT_POLICY_V4(tcp) || TCP_IPOPT_POLICY_V6(tcp) || \
!CONN_IS_MD_FASTPATH((tcp)->tcp_connp))
boolean_t do_tcp_fusion = B_TRUE;
/*
* This routine gets called by the eager tcp upon changing state from
* SYN_RCVD to ESTABLISHED. It fuses a direct path between itself
* and the active connect tcp such that the regular tcp processing
* may be bypassed under allowable circumstances. Because the fusion
* requires both endpoints to be in the same squeue, it does not work
* for simultaneous active connects because there is no easy way to
* switch from one squeue to another once the connection is created.
* This is different from the eager tcp case where we assign it the
* same squeue as the one given to the active connect tcp during open.
*/
static void
tcp_fuse(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph)
{
conn_t *peer_connp, *connp = tcp->tcp_connp;
tcp_t *peer_tcp;
ASSERT(!tcp->tcp_fused);
ASSERT(tcp->tcp_loopback);
ASSERT(tcp->tcp_loopback_peer == NULL);
/*
* We need to check the listener tcp to make sure it's a socket
* endpoint, but we can't really use tcp_listener since we get
* here after sending up T_CONN_IND and tcp_wput_accept() may be
* called independently, at which point tcp_listener is cleared;
* this is why we use tcp_saved_listener. The listener itself
* is guaranteed to be around until tcp_accept_finish() is called
* on this eager -- this won't happen until we're done since
* we're inside the eager's perimeter now.
*/
ASSERT(tcp->tcp_saved_listener != NULL);
/*
* Lookup peer endpoint; search for the remote endpoint having
* the reversed address-port quadruplet in ESTABLISHED state,
* which is guaranteed to be unique in the system. Zone check
* is applied accordingly for loopback address, but not for
* local address since we want fusion to happen across Zones.
*/
if (tcp->tcp_ipversion == IPV4_VERSION) {
peer_connp = ipcl_conn_tcp_lookup_reversed_ipv4(connp,
(ipha_t *)iphdr, tcph);
} else {
peer_connp = ipcl_conn_tcp_lookup_reversed_ipv6(connp,
(ip6_t *)iphdr, tcph);
}
/*
* We can only proceed if peer exists, resides in the same squeue
* as our conn and is not raw-socket. The squeue assignment of
* this eager tcp was done earlier at the time of SYN processing
* in ip_fanout_tcp{_v6}. Note that sharing the same squeue by itself
* doesn't guarantee a safe condition to fuse, hence we perform
* additional tests below.
*/
ASSERT(peer_connp == NULL || peer_connp != connp);
if (peer_connp == NULL || peer_connp->conn_sqp != connp->conn_sqp ||
!IPCL_IS_TCP(peer_connp)) {
if (peer_connp != NULL) {
TCP_STAT(tcp_fusion_unqualified);
CONN_DEC_REF(peer_connp);
}
return;
}
peer_tcp = peer_connp->conn_tcp; /* active connect tcp */
ASSERT(peer_tcp != NULL && peer_tcp != tcp && !peer_tcp->tcp_fused);
ASSERT(peer_tcp->tcp_loopback && peer_tcp->tcp_loopback_peer == NULL);
ASSERT(peer_connp->conn_sqp == connp->conn_sqp);
/*
* Fuse the endpoints; we perform further checks against both
* tcp endpoints to ensure that a fusion is allowed to happen.
* In particular we bail out for TPI, non-simple TCP/IP or if
* IPsec/IPQoS policy exists. We could actually do it for the
* XTI/TLI/TPI case but this requires more testing, so for now
* we handle only the socket case.
*/
if (!tcp->tcp_unfusable && !peer_tcp->tcp_unfusable &&
TCP_IS_SOCKET(tcp->tcp_saved_listener) && TCP_IS_SOCKET(peer_tcp) &&
!TCP_LOOPBACK_IP(tcp) && !TCP_LOOPBACK_IP(peer_tcp) &&
!IPP_ENABLED(IPP_LOCAL_OUT|IPP_LOCAL_IN)) {
mblk_t *mp;
struct stroptions *stropt;
queue_t *peer_rq = peer_tcp->tcp_rq;
size_t sth_hiwat;
ASSERT(!TCP_IS_DETACHED(peer_tcp) && peer_rq != NULL);
/*
* We need to drain data on both endpoints during unfuse.
* If we need to send up SIGURG at the time of draining,
* we want to be sure that an mblk is readily available.
* This is why we pre-allocate the M_PCSIG mblks for both
* endpoints which will only be used during/after unfuse.
*/
if ((mp = allocb(1, BPRI_HI)) == NULL) {
CONN_DEC_REF(peer_connp);
return;
}
ASSERT(tcp->tcp_fused_sigurg_mp == NULL);
tcp->tcp_fused_sigurg_mp = mp;
if ((mp = allocb(1, BPRI_HI)) == NULL) {
freeb(tcp->tcp_fused_sigurg_mp);
tcp->tcp_fused_sigurg_mp = NULL;
CONN_DEC_REF(peer_connp);
return;
}
ASSERT(peer_tcp->tcp_fused_sigurg_mp == NULL);
peer_tcp->tcp_fused_sigurg_mp = mp;
/* Allocate M_SETOPTS mblk */
mp = allocb(sizeof (*stropt), BPRI_HI);
if (mp == NULL) {
freeb(tcp->tcp_fused_sigurg_mp);
tcp->tcp_fused_sigurg_mp = NULL;
freeb(peer_tcp->tcp_fused_sigurg_mp);
peer_tcp->tcp_fused_sigurg_mp = NULL;
CONN_DEC_REF(peer_connp);
return;
}
/* Fuse both endpoints */
peer_tcp->tcp_loopback_peer = tcp;
tcp->tcp_loopback_peer = peer_tcp;
peer_tcp->tcp_fused = tcp->tcp_fused = B_TRUE;
/*
* We never use regular tcp paths in fusion and should
* therefore clear tcp_unsent on both endpoints. Having
* them set to non-zero values means asking for trouble
* especially after unfuse, where we may end up sending
* through regular tcp paths which expect xmit_list and
* friends to be correctly setup.
*/
peer_tcp->tcp_unsent = tcp->tcp_unsent = 0;
tcp_timers_stop(tcp);
tcp_timers_stop(peer_tcp);
/*
* Set the stream head's write offset value to zero, since we
* won't be needing any room for TCP/IP headers, and tell it
* to not break up the writes. This would reduce the amount
* of work done by kmem. In addition, we set the receive
* buffer to twice that of q_hiwat in order to simulate the
* non-fusion case. Note that we can only do this for the
* active connect tcp since our eager is still detached;
* it will be dealt with later in tcp_accept_finish().
*/
DB_TYPE(mp) = M_SETOPTS;
mp->b_wptr += sizeof (*stropt);
sth_hiwat = peer_rq->q_hiwat << 1;
if (sth_hiwat > tcp_max_buf)
sth_hiwat = tcp_max_buf;
stropt = (struct stroptions *)mp->b_rptr;
stropt->so_flags = SO_MAXBLK | SO_WROFF | SO_HIWAT;
stropt->so_maxblk = tcp_maxpsz_set(peer_tcp, B_FALSE);
stropt->so_wroff = 0;
stropt->so_hiwat = MAX(sth_hiwat, tcp_sth_rcv_hiwat);
/* Send the options up */
putnext(peer_rq, mp);
} else {
TCP_STAT(tcp_fusion_unqualified);
}
CONN_DEC_REF(peer_connp);
}
/*
* Unfuse a previously-fused pair of tcp loopback endpoints.
*/
static void
tcp_unfuse(tcp_t *tcp)
{
tcp_t *peer_tcp = tcp->tcp_loopback_peer;
ASSERT(tcp->tcp_fused && peer_tcp != NULL);
ASSERT(peer_tcp->tcp_fused && peer_tcp->tcp_loopback_peer == tcp);
ASSERT(tcp->tcp_connp->conn_sqp == peer_tcp->tcp_connp->conn_sqp);
ASSERT(tcp->tcp_unsent == 0 && peer_tcp->tcp_unsent == 0);
ASSERT(tcp->tcp_fused_sigurg_mp != NULL);
ASSERT(peer_tcp->tcp_fused_sigurg_mp != NULL);
/*
* Drain any pending data; the detached check is needed because
* we may be called from tcp_fuse_output(). Note that in case of
* a detached tcp, the draining will happen later after the tcp
* is unfused. For non-urgent data, this can be handled by the
* regular tcp_rcv_drain(). If we have urgent data sitting in
* the receive list, we will need to send up a SIGURG signal first
* before draining the data. All of these will be handled by the
* code in tcp_fuse_rcv_drain() when called from tcp_rcv_drain().
*/
if (!TCP_IS_DETACHED(tcp)) {
(void) tcp_fuse_rcv_drain(tcp->tcp_rq, tcp,
&tcp->tcp_fused_sigurg_mp);
}
if (!TCP_IS_DETACHED(peer_tcp)) {
(void) tcp_fuse_rcv_drain(peer_tcp->tcp_rq, peer_tcp,
&peer_tcp->tcp_fused_sigurg_mp);
}
/* Lift up any flow-control conditions */
if (tcp->tcp_flow_stopped) {
tcp_clrqfull(tcp);
tcp->tcp_flow_stopped = B_FALSE;
TCP_STAT(tcp_fusion_backenabled);
}
if (peer_tcp->tcp_flow_stopped) {
tcp_clrqfull(peer_tcp);
peer_tcp->tcp_flow_stopped = B_FALSE;
TCP_STAT(tcp_fusion_backenabled);
}
/* Free up M_PCSIG mblk(s) if not needed */
if (!tcp->tcp_fused_sigurg && tcp->tcp_fused_sigurg_mp != NULL) {
freeb(tcp->tcp_fused_sigurg_mp);
tcp->tcp_fused_sigurg_mp = NULL;
}
if (!peer_tcp->tcp_fused_sigurg &&
peer_tcp->tcp_fused_sigurg_mp != NULL) {
freeb(peer_tcp->tcp_fused_sigurg_mp);
peer_tcp->tcp_fused_sigurg_mp = NULL;
}
/*
* Update th_seq and th_ack in the header template
*/
U32_TO_ABE32(tcp->tcp_snxt, tcp->tcp_tcph->th_seq);
U32_TO_ABE32(tcp->tcp_rnxt, tcp->tcp_tcph->th_ack);
U32_TO_ABE32(peer_tcp->tcp_snxt, peer_tcp->tcp_tcph->th_seq);
U32_TO_ABE32(peer_tcp->tcp_rnxt, peer_tcp->tcp_tcph->th_ack);
/* Unfuse the endpoints */
peer_tcp->tcp_fused = tcp->tcp_fused = B_FALSE;
peer_tcp->tcp_loopback_peer = tcp->tcp_loopback_peer = NULL;
}
/*
* Fusion output routine for urgent data. This routine is called by
* tcp_fuse_output() for handling non-M_DATA mblks.
*/
static void
tcp_fuse_output_urg(tcp_t *tcp, mblk_t *mp)
{
mblk_t *mp1;
struct T_exdata_ind *tei;
tcp_t *peer_tcp = tcp->tcp_loopback_peer;
mblk_t *head, *prev_head = NULL;
ASSERT(tcp->tcp_fused);
ASSERT(peer_tcp != NULL && peer_tcp->tcp_loopback_peer == tcp);
ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);
ASSERT(mp->b_cont != NULL && DB_TYPE(mp->b_cont) == M_DATA);
ASSERT(MBLKL(mp) >= sizeof (*tei) && MBLKL(mp->b_cont) > 0);
/*
* Urgent data arrives in the form of T_EXDATA_REQ from above.
* Each occurrence denotes a new urgent pointer. For each new
* urgent pointer we signal (SIGURG) the receiving app to indicate
* that it needs to go into urgent mode. This is similar to the
* urgent data handling in the regular tcp. We don't need to keep
* track of where the urgent pointer is, because each T_EXDATA_REQ
* "advances" the urgent pointer for us.
*
* The actual urgent data carried by T_EXDATA_REQ is then prepended
* by a T_EXDATA_IND before being enqueued behind any existing data
* destined for the receiving app. There is only a single urgent
* pointer (out-of-band mark) for a given tcp. If the new urgent
* data arrives before the receiving app reads some existing urgent
* data, the previous marker is lost. This behavior is emulated
* accordingly below, by removing any existing T_EXDATA_IND messages
* and essentially converting old urgent data into non-urgent.
*/
ASSERT(tcp->tcp_valid_bits & TCP_URG_VALID);
/* Let sender get out of urgent mode */
tcp->tcp_valid_bits &= ~TCP_URG_VALID;
/*
* Send up SIGURG to the receiving peer; if the peer is detached
* or if we can't allocate the M_PCSIG, indicate that we need to
* signal upon draining to the peer by marking tcp_fused_sigurg.
* This flag will only get cleared once SIGURG is delivered and
* is not affected by the tcp_fused flag -- delivery will still
* happen even after an endpoint is unfused, to handle the case
* where the sending endpoint immediately closes/unfuses after
* sending urgent data and the accept is not yet finished.
*/
if (!TCP_IS_DETACHED(peer_tcp) &&
((mp1 = allocb(1, BPRI_HI)) != NULL ||
(mp1 = allocb_tryhard(1)) != NULL)) {
peer_tcp->tcp_fused_sigurg = B_FALSE;
/* Send up the signal */
DB_TYPE(mp1) = M_PCSIG;
*mp1->b_wptr++ = (uchar_t)SIGURG;
putnext(peer_tcp->tcp_rq, mp1);
} else {
peer_tcp->tcp_fused_sigurg = B_TRUE;
}
/* Reuse T_EXDATA_REQ mblk for T_EXDATA_IND */
DB_TYPE(mp) = M_PROTO;
tei = (struct T_exdata_ind *)mp->b_rptr;
tei->PRIM_type = T_EXDATA_IND;
tei->MORE_flag = 0;
mp->b_wptr = (uchar_t *)&tei[1];
TCP_STAT(tcp_fusion_urg);
BUMP_MIB(&tcp_mib, tcpOutUrg);
head = peer_tcp->tcp_rcv_list;
while (head != NULL) {
/*
* Remove existing T_EXDATA_IND, keep the data which follows
* it and relink our list. Note that we don't modify the
* tcp_rcv_last_tail since it never points to T_EXDATA_IND.
*/
if (DB_TYPE(head) != M_DATA) {
mp1 = head;
ASSERT(DB_TYPE(mp1->b_cont) == M_DATA);
head = mp1->b_cont;
mp1->b_cont = NULL;
head->b_next = mp1->b_next;
mp1->b_next = NULL;
if (prev_head != NULL)
prev_head->b_next = head;
if (peer_tcp->tcp_rcv_list == mp1)
peer_tcp->tcp_rcv_list = head;
if (peer_tcp->tcp_rcv_last_head == mp1)
peer_tcp->tcp_rcv_last_head = head;
freeb(mp1);
}
prev_head = head;
head = head->b_next;
}
}
/*
* Fusion output routine, called by tcp_output() and tcp_wput_proto().
*/
static boolean_t
tcp_fuse_output(tcp_t *tcp, mblk_t *mp)
{
tcp_t *peer_tcp = tcp->tcp_loopback_peer;
queue_t *peer_rq;
mblk_t *mp_tail = mp;
uint32_t send_size = 0;
ASSERT(tcp->tcp_fused);
ASSERT(peer_tcp != NULL && peer_tcp->tcp_loopback_peer == tcp);
ASSERT(tcp->tcp_connp->conn_sqp == peer_tcp->tcp_connp->conn_sqp);
ASSERT(DB_TYPE(mp) == M_DATA || DB_TYPE(mp) == M_PROTO ||
DB_TYPE(mp) == M_PCPROTO);
peer_rq = peer_tcp->tcp_rq;
/* If this connection requires IP, unfuse and use regular path */
if (TCP_LOOPBACK_IP(tcp) || TCP_LOOPBACK_IP(peer_tcp) ||
IPP_ENABLED(IPP_LOCAL_OUT|IPP_LOCAL_IN)) {
TCP_STAT(tcp_fusion_aborted);
tcp_unfuse(tcp);
return (B_FALSE);
}
for (;;) {
if (DB_TYPE(mp_tail) == M_DATA)
send_size += MBLKL(mp_tail);
if (mp_tail->b_cont == NULL)
break;
mp_tail = mp_tail->b_cont;
}
if (send_size == 0) {
freemsg(mp);
return (B_TRUE);
}
/*
* Handle urgent data; we either send up SIGURG to the peer now
* or do it later when we drain, in case the peer is detached
* or if we're short of memory for M_PCSIG mblk.
*/
if (DB_TYPE(mp) != M_DATA)
tcp_fuse_output_urg(tcp, mp);
/*
* Enqueue data into the peer's receive list; we may or may not
* drain the contents depending on the conditions below.
*/
tcp_rcv_enqueue(peer_tcp, mp, send_size);
/*
 * Grow the peer's rwnd by what was enqueued, both to keep it effectively
 * constant and in case it wrapped around.
 */
peer_tcp->tcp_rwnd += send_size;
/*
* If peer is detached, exercise flow-control when needed; we will
* get back-enabled either in tcp_accept_finish() or tcp_unfuse().
*/
if (TCP_IS_DETACHED(peer_tcp) &&
peer_tcp->tcp_rcv_cnt > peer_rq->q_hiwat) {
tcp_setqfull(tcp);
tcp->tcp_flow_stopped = B_TRUE;
TCP_STAT(tcp_fusion_flowctl);
}
loopback_packets++;
tcp->tcp_last_sent_len = send_size;
/* Need to adjust the following SNMP MIB-related variables */
tcp->tcp_snxt += send_size;
tcp->tcp_suna = tcp->tcp_snxt;
peer_tcp->tcp_rnxt += send_size;
peer_tcp->tcp_rack = peer_tcp->tcp_rnxt;
BUMP_MIB(&tcp_mib, tcpOutDataSegs);
UPDATE_MIB(&tcp_mib, tcpOutDataBytes, send_size);
BUMP_MIB(&tcp_mib, tcpInSegs);
BUMP_MIB(&tcp_mib, tcpInDataInorderSegs);
UPDATE_MIB(&tcp_mib, tcpInDataInorderBytes, send_size);
BUMP_LOCAL(tcp->tcp_obsegs);
BUMP_LOCAL(peer_tcp->tcp_ibsegs);
if (!TCP_IS_DETACHED(peer_tcp)) {
/*
* If we can't send SIGURG above due to lack of memory,
* schedule push timer and try again. Otherwise drain
* the data if we're not flow-controlled.
*/
if (peer_tcp->tcp_fused_sigurg) {
if (peer_tcp->tcp_push_tid == 0) {
peer_tcp->tcp_push_tid =
TCP_TIMER(peer_tcp, tcp_push_timer,
MSEC_TO_TICK(tcp_push_timer_interval));
}
} else if (!tcp->tcp_flow_stopped) {
if (!canputnext(peer_rq)) {
tcp_setqfull(tcp);
tcp->tcp_flow_stopped = B_TRUE;
TCP_STAT(tcp_fusion_flowctl);
} else {
ASSERT(peer_tcp->tcp_rcv_list != NULL);
(void) tcp_fuse_rcv_drain(peer_rq,
peer_tcp, NULL);
TCP_STAT(tcp_fusion_putnext);
}
}
}
return (B_TRUE);
}
/*
* This routine gets called to deliver data upstream on a fused or
* previously fused tcp loopback endpoint; the latter happens only
* when there is a pending SIGURG signal plus urgent data that could
* not be sent upstream earlier.
*/
static boolean_t
tcp_fuse_rcv_drain(queue_t *q, tcp_t *tcp, mblk_t **sigurg_mpp)
{
mblk_t *mp;
#ifdef DEBUG
uint_t cnt = 0;
#endif
ASSERT(tcp->tcp_loopback);
ASSERT(tcp->tcp_fused || tcp->tcp_fused_sigurg);
ASSERT(!tcp->tcp_fused || tcp->tcp_loopback_peer != NULL);
ASSERT(sigurg_mpp != NULL || tcp->tcp_fused);
/* No need for the push timer now, in case it was scheduled */
if (tcp->tcp_push_tid != 0) {
(void) TCP_TIMER_CANCEL(tcp, tcp->tcp_push_tid);
tcp->tcp_push_tid = 0;
}
/*
* If there's urgent data sitting in receive list and we didn't
* get a chance to send up a SIGURG signal, make sure we send
* it first before draining in order to ensure that SIOCATMARK
* works properly.
*/
if (tcp->tcp_fused_sigurg) {
/*
* sigurg_mpp is normally NULL, i.e. when we're still
* fused and didn't get here because of tcp_unfuse().
* In this case try hard to allocate the M_PCSIG mblk.
*/
if (sigurg_mpp == NULL &&
(mp = allocb(1, BPRI_HI)) == NULL &&
(mp = allocb_tryhard(1)) == NULL) {
/* Alloc failed; try again next time */
tcp->tcp_push_tid = TCP_TIMER(tcp, tcp_push_timer,
MSEC_TO_TICK(tcp_push_timer_interval));
return (B_TRUE);
} else if (sigurg_mpp != NULL) {
/*
* Use the supplied M_PCSIG mblk; it means we're
* either unfused or in the process of unfusing,
* and the drain must happen now.
*/
mp = *sigurg_mpp;
*sigurg_mpp = NULL;
}
ASSERT(mp != NULL);
tcp->tcp_fused_sigurg = B_FALSE;
/* Send up the signal */
DB_TYPE(mp) = M_PCSIG;
*mp->b_wptr++ = (uchar_t)SIGURG;
putnext(q, mp);
/*
* Let the regular tcp_rcv_drain() path handle
* draining the data if we're no longer fused.
*/
if (!tcp->tcp_fused)
return (B_FALSE);
}
/* Drain the data */
while ((mp = tcp->tcp_rcv_list) != NULL) {
tcp->tcp_rcv_list = mp->b_next;
mp->b_next = NULL;
#ifdef DEBUG
cnt += msgdsize(mp);
#endif
putnext(q, mp);
}
ASSERT(cnt == tcp->tcp_rcv_cnt);
tcp->tcp_rcv_last_head = NULL;
tcp->tcp_rcv_last_tail = NULL;
tcp->tcp_rcv_cnt = 0;
tcp->tcp_rwnd = q->q_hiwat;
return (B_TRUE);
}
/*
* This is the walker function, which is TCP specific.
* It walks through the conn_hash bucket searching for the
* next valid connp/tcp in the list, selecting connp/tcp
* which have not been closed or condemned. It also REFHOLDS the
* reference for the tcp, ensuring that the tcp exists
* when the caller uses the tcp.
*
* tcp_get_next_conn
* get the next entry in the conn global list
* and put a reference on the next_conn.
* decrement the reference on the current conn.
*/
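/*
 * Typical usage (illustrative): starting with a NULL connp walks the whole
 * bucket, and each subsequent call releases the hold on the previous conn:
 *
 *	for (connp = tcp_get_next_conn(connfp, NULL); connp != NULL;
 *	    connp = tcp_get_next_conn(connfp, connp)) {
 *		... use connp->conn_tcp ...
 *	}
 */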
conn_t *
tcp_get_next_conn(connf_t *connfp, conn_t *connp)
{
conn_t *next_connp;
if (connfp == NULL)
return (NULL);
mutex_enter(&connfp->connf_lock);
next_connp = (connp == NULL) ?
connfp->connf_head : connp->conn_g_next;
while (next_connp != NULL) {
mutex_enter(&next_connp->conn_lock);
if ((next_connp->conn_state_flags &
(CONN_CONDEMNED | CONN_INCIPIENT)) ||
!IPCL_IS_TCP(next_connp)) {
/*
* This conn has been condemned or is
* closing, or it is not a TCP conn.
*/
mutex_exit(&next_connp->conn_lock);
next_connp = next_connp->conn_g_next;
continue;
}
ASSERT(next_connp->conn_tcp != NULL);
CONN_INC_REF_LOCKED(next_connp);
mutex_exit(&next_connp->conn_lock);
break;
}
mutex_exit(&connfp->connf_lock);
if (connp != NULL) {
CONN_DEC_REF(connp);
}
return (next_connp);
}
/*
* Figure out the value of the window scale option. Note that the rwnd is
* ASSUMED to be rounded up to the nearest MSS before the calculation.
* We cannot find the scale value and then do a round up of tcp_rwnd
* because the scale value may not be correct after that.
*
* Set the compiler flag to make this function inline.
*/
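/*
 * For example, with tcp_rwnd set to 1 MB (1048576 bytes) the loop below
 * yields tcp_rcv_ws = 5, since 1048576 >> 5 == 32768 <= TCP_MAXWIN.
 */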
static void
tcp_set_ws_value(tcp_t *tcp)
{
int i;
uint32_t rwnd = tcp->tcp_rwnd;
for (i = 0; rwnd > TCP_MAXWIN && i < TCP_MAX_WINSHIFT;
i++, rwnd >>= 1)
;
tcp->tcp_rcv_ws = i;
}
/*
* Remove a connection from the list of detached TIME_WAIT connections.
*/
static void
tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tcp_time_wait)
{
boolean_t locked = B_FALSE;
if (tcp_time_wait == NULL) {
tcp_time_wait = *((tcp_squeue_priv_t **)
squeue_getprivate(tcp->tcp_connp->conn_sqp, SQPRIVATE_TCP));
mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
locked = B_TRUE;
}
if (tcp->tcp_time_wait_expire == 0) {
ASSERT(tcp->tcp_time_wait_next == NULL);
ASSERT(tcp->tcp_time_wait_prev == NULL);
if (locked)
mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
return;
}
ASSERT(TCP_IS_DETACHED(tcp));
ASSERT(tcp->tcp_state == TCPS_TIME_WAIT);
if (tcp == tcp_time_wait->tcp_time_wait_head) {
ASSERT(tcp->tcp_time_wait_prev == NULL);
tcp_time_wait->tcp_time_wait_head = tcp->tcp_time_wait_next;
if (tcp_time_wait->tcp_time_wait_head != NULL) {
tcp_time_wait->tcp_time_wait_head->tcp_time_wait_prev =
NULL;
} else {
tcp_time_wait->tcp_time_wait_tail = NULL;
}
} else if (tcp == tcp_time_wait->tcp_time_wait_tail) {
ASSERT(tcp != tcp_time_wait->tcp_time_wait_head);
ASSERT(tcp->tcp_time_wait_next == NULL);
tcp_time_wait->tcp_time_wait_tail = tcp->tcp_time_wait_prev;
ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL);
tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next = NULL;
} else {
ASSERT(tcp->tcp_time_wait_prev->tcp_time_wait_next == tcp);
ASSERT(tcp->tcp_time_wait_next->tcp_time_wait_prev == tcp);
tcp->tcp_time_wait_prev->tcp_time_wait_next =
tcp->tcp_time_wait_next;
tcp->tcp_time_wait_next->tcp_time_wait_prev =
tcp->tcp_time_wait_prev;
}
tcp->tcp_time_wait_next = NULL;
tcp->tcp_time_wait_prev = NULL;
tcp->tcp_time_wait_expire = 0;
if (locked)
mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
}
/*
* Add a connection to the list of detached TIME_WAIT connections
* and set its time to expire.
*/
static void
tcp_time_wait_append(tcp_t *tcp)
{
tcp_squeue_priv_t *tcp_time_wait =
*((tcp_squeue_priv_t **)squeue_getprivate(tcp->tcp_connp->conn_sqp,
SQPRIVATE_TCP));
tcp_timers_stop(tcp);
/* Cancelled by tcp_timers_stop() above */
ASSERT(tcp->tcp_timer_tid == 0);
ASSERT(tcp->tcp_ack_tid == 0);
/* must have happened at the time of detaching the tcp */
ASSERT(tcp->tcp_ptpahn == NULL);
ASSERT(tcp->tcp_flow_stopped == 0);
ASSERT(tcp->tcp_time_wait_next == NULL);
ASSERT(tcp->tcp_time_wait_prev == NULL);
ASSERT(tcp->tcp_time_wait_expire == 0);
ASSERT(tcp->tcp_listener == NULL);
tcp->tcp_time_wait_expire = ddi_get_lbolt();
/*
* The value computed below in tcp->tcp_time_wait_expire may
* appear negative or wrap around. That is ok since our
* interest is only in the difference between the current lbolt
* value and tcp->tcp_time_wait_expire. But the value should not
* be zero, since it means the tcp is not in the TIME_WAIT list.
* The corresponding comparison in tcp_time_wait_collector() uses
* modular arithmetic.
*/
tcp->tcp_time_wait_expire +=
drv_usectohz(tcp_time_wait_interval * 1000);
if (tcp->tcp_time_wait_expire == 0)
tcp->tcp_time_wait_expire = 1;
ASSERT(TCP_IS_DETACHED(tcp));
ASSERT(tcp->tcp_state == TCPS_TIME_WAIT);
ASSERT(tcp->tcp_time_wait_next == NULL);
ASSERT(tcp->tcp_time_wait_prev == NULL);
TCP_DBGSTAT(tcp_time_wait);
mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
if (tcp_time_wait->tcp_time_wait_head == NULL) {
ASSERT(tcp_time_wait->tcp_time_wait_tail == NULL);
tcp_time_wait->tcp_time_wait_head = tcp;
} else {
ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL);
ASSERT(tcp_time_wait->tcp_time_wait_tail->tcp_state ==
TCPS_TIME_WAIT);
tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next = tcp;
tcp->tcp_time_wait_prev = tcp_time_wait->tcp_time_wait_tail;
}
tcp_time_wait->tcp_time_wait_tail = tcp;
mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
}
/* ARGSUSED */
void
tcp_timewait_output(void *arg, mblk_t *mp, void *arg2)
{
conn_t *connp = (conn_t *)arg;
tcp_t *tcp = connp->conn_tcp;
ASSERT(tcp != NULL);
if (tcp->tcp_state == TCPS_CLOSED) {
return;
}
ASSERT((tcp->tcp_family == AF_INET &&
tcp->tcp_ipversion == IPV4_VERSION) ||
(tcp->tcp_family == AF_INET6 &&
(tcp->tcp_ipversion == IPV4_VERSION ||
tcp->tcp_ipversion == IPV6_VERSION)));
ASSERT(!tcp->tcp_listener);
TCP_STAT(tcp_time_wait_reap);
ASSERT(TCP_IS_DETACHED(tcp));
/*
* Because they have no upstream client to rebind or tcp_close()
* them later, we axe the connection here and now.
*/
tcp_close_detached(tcp);
}
void
tcp_cleanup(tcp_t *tcp)
{
mblk_t *mp;
char *tcp_iphc;
int tcp_iphc_len;
int tcp_hdr_grown;
tcp_sack_info_t *tcp_sack_info;
conn_t *connp = tcp->tcp_connp;
tcp_bind_hash_remove(tcp);
tcp_free(tcp);
conn_delete_ire(connp, NULL);
if (connp->conn_flags & IPCL_TCPCONN) {
if (connp->conn_latch != NULL)
IPLATCH_REFRELE(connp->conn_latch);
if (connp->conn_policy != NULL)
IPPH_REFRELE(connp->conn_policy);
}
/*
* Since we will bzero the entire structure, we need to
* remove it and reinsert it in the global hash list. We
* know the walkers can't get to this conn because we
* set the CONDEMNED flag earlier and checked the reference
* count under conn_lock, so no walker will pick it up; by the
* time we call ipcl_globalhash_remove() below, no walker
* can get to it.
*/
ipcl_globalhash_remove(connp);
/* Save some state */
mp = tcp->tcp_timercache;
tcp_sack_info = tcp->tcp_sack_info;
tcp_iphc = tcp->tcp_iphc;
tcp_iphc_len = tcp->tcp_iphc_len;
tcp_hdr_grown = tcp->tcp_hdr_grown;
bzero(connp, sizeof (conn_t));
bzero(tcp, sizeof (tcp_t));
/* restore the state */
tcp->tcp_timercache = mp;
tcp->tcp_sack_info = tcp_sack_info;
tcp->tcp_iphc = tcp_iphc;
tcp->tcp_iphc_len = tcp_iphc_len;
tcp->tcp_hdr_grown = tcp_hdr_grown;
tcp->tcp_connp = connp;
connp->conn_tcp = tcp;
connp->conn_flags = IPCL_TCPCONN;
connp->conn_state_flags = CONN_INCIPIENT;
connp->conn_ulp = IPPROTO_TCP;
connp->conn_ref = 1;
ipcl_globalhash_insert(connp);
}
/*
* Blows away all tcps whose TIME_WAIT has expired. List traversal
* is done forwards from the head.
*/
/* ARGSUSED */
void
tcp_time_wait_collector(void *arg)
{
tcp_t *tcp;
clock_t now;
mblk_t *mp;
conn_t *connp;
kmutex_t *lock;
squeue_t *sqp = (squeue_t *)arg;
tcp_squeue_priv_t *tcp_time_wait =
*((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP));
mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
tcp_time_wait->tcp_time_wait_tid = 0;
if (tcp_time_wait->tcp_free_list != NULL &&
tcp_time_wait->tcp_free_list->tcp_in_free_list == B_TRUE) {
TCP_STAT(tcp_freelist_cleanup);
while ((tcp = tcp_time_wait->tcp_free_list) != NULL) {
tcp_time_wait->tcp_free_list = tcp->tcp_time_wait_next;
CONN_DEC_REF(tcp->tcp_connp);
}
}
/*
* In order to reap time waits reliably, we should use a
* source of time that is not adjustable by the user -- hence
* the call to ddi_get_lbolt().
*/
now = ddi_get_lbolt();
while ((tcp = tcp_time_wait->tcp_time_wait_head) != NULL) {
/*
* Compare times using modular arithmetic, since
* lbolt can wrap around.
*/
if ((now - tcp->tcp_time_wait_expire) < 0) {
break;
}
tcp_time_wait_remove(tcp, tcp_time_wait);
connp = tcp->tcp_connp;
ASSERT(connp->conn_fanout != NULL);
lock = &connp->conn_fanout->connf_lock;
/*
* This is essentially a TW reclaim fast path optimization for
* performance where the timewait collector checks under the
* fanout lock (so that no one else can get access to the
* conn_t) that the refcnt is 2 i.e. one for TCP and one for
* the classifier hash list. If ref count is indeed 2, we can
* just remove the conn under the fanout lock and avoid
* cleaning up the conn under the squeue, provided that
* clustering callbacks are not enabled. If clustering is
* enabled, we need to make the clustering callback before
* setting the CONDEMNED flag and after dropping all locks and
* so we forego this optimization and fall back to the slow
* path. Also please see the comments in tcp_closei_local
* regarding the refcnt logic.
*
* Since we are holding the tcp_time_wait_lock, it's better
* not to block on the fanout_lock because other connections
* can't add themselves to the time_wait list. So we do a
* tryenter instead of mutex_enter.
*/
if (mutex_tryenter(lock)) {
mutex_enter(&connp->conn_lock);
if ((connp->conn_ref == 2) &&
(cl_inet_disconnect == NULL)) {
ipcl_hash_remove_locked(connp,
connp->conn_fanout);
/*
* Set the CONDEMNED flag now itself so that
* the refcnt cannot increase due to any
* walker. But we have still not cleaned up
* conn_ire_cache. This is still ok since
* we are going to clean it up in tcp_cleanup
* immediately and any interface unplumb
* thread will wait till the ire is blown away
*/