| /* |
| * CDDL HEADER START |
| * |
| * The contents of this file are subject to the terms of the |
| * Common Development and Distribution License, Version 1.0 only |
| * (the "License"). You may not use this file except in compliance |
| * with the License. |
| * |
| * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE |
| * or http://www.opensolaris.org/os/licensing. |
| * See the License for the specific language governing permissions |
| * and limitations under the License. |
| * |
| * When distributing Covered Code, include this CDDL HEADER in each |
| * file and include the License file at usr/src/OPENSOLARIS.LICENSE. |
| * If applicable, add the following below this CDDL HEADER, with the |
| * fields enclosed by brackets "[]" replaced with your own identifying |
| * information: Portions Copyright [yyyy] [name of copyright owner] |
| * |
| * CDDL HEADER END |
| */ |
| /* |
| * Copyright 2005 Sun Microsystems, Inc. All rights reserved. |
| * Use is subject to license terms. |
| */ |
| /* Copyright (c) 1990 Mentat Inc. */ |
| |
| #pragma ident "%Z%%M% %I% %E% SMI" |
| |
| const char tcp_version[] = "%Z%%M% %I% %E% SMI"; |
| |
| #include <sys/types.h> |
| #include <sys/stream.h> |
| #include <sys/strsun.h> |
| #include <sys/strsubr.h> |
| #include <sys/stropts.h> |
| #include <sys/strlog.h> |
| #include <sys/strsun.h> |
| #define _SUN_TPI_VERSION 2 |
| #include <sys/tihdr.h> |
| #include <sys/timod.h> |
| #include <sys/ddi.h> |
| #include <sys/sunddi.h> |
| #include <sys/suntpi.h> |
| #include <sys/xti_inet.h> |
| #include <sys/cmn_err.h> |
| #include <sys/debug.h> |
| #include <sys/vtrace.h> |
| #include <sys/kmem.h> |
| #include <sys/ethernet.h> |
| #include <sys/cpuvar.h> |
| #include <sys/dlpi.h> |
| #include <sys/multidata.h> |
| #include <sys/multidata_impl.h> |
| #include <sys/pattr.h> |
| #include <sys/policy.h> |
| #include <sys/zone.h> |
| |
| #include <sys/errno.h> |
| #include <sys/signal.h> |
| #include <sys/socket.h> |
| #include <sys/sockio.h> |
| #include <sys/isa_defs.h> |
| #include <sys/md5.h> |
| #include <sys/random.h> |
| #include <netinet/in.h> |
| #include <netinet/tcp.h> |
| #include <netinet/ip6.h> |
| #include <netinet/icmp6.h> |
| #include <net/if.h> |
| #include <net/route.h> |
| #include <inet/ipsec_impl.h> |
| |
| #include <inet/common.h> |
| #include <inet/ip.h> |
| #include <inet/ip6.h> |
| #include <inet/ip_ndp.h> |
| #include <inet/mi.h> |
| #include <inet/mib2.h> |
| #include <inet/nd.h> |
| #include <inet/optcom.h> |
| #include <inet/snmpcom.h> |
| #include <inet/kstatcom.h> |
| #include <inet/tcp.h> |
| #include <net/pfkeyv2.h> |
| #include <inet/ipsec_info.h> |
| #include <inet/ipdrop.h> |
| #include <inet/tcp_trace.h> |
| |
| #include <inet/ipclassifier.h> |
| #include <inet/ip_ire.h> |
| #include <inet/ip_if.h> |
| #include <inet/ipp_common.h> |
| #include <sys/squeue.h> |
| |
| /* |
| * TCP Notes: aka FireEngine Phase I (PSARC 2002/433) |
| * |
| * (Read the detailed design doc in PSARC case directory) |
| * |
| * The entire tcp state is contained in tcp_t and conn_t structures |
| * which are allocated in tandem using ipcl_conn_create() and passing |
| * IPCL_CONNTCP as a flag. We use 'conn_ref' and 'conn_lock' to protect |
| * the references on the tcp_t. The tcp_t structure is never compressed |
| * and packets always land on the correct TCP perimeter from the time |
| * the eager is created until the tcp_t dies (as such the old mentat |
| * TCP global queue is not used for the detached state and no IPSEC |
| * checking is required). The global queue is still allocated to send out |
| * resets for connections which have no listeners and IP directly calls |
| * tcp_xmit_listeners_reset() which does any policy check. |
| * |
| * Protection and Synchronisation mechanism: |
| * |
| * The tcp data structure does not use any kind of lock for protecting |
| * its state but instead uses 'squeues' for mutual exclusion from various |
| * read and write side threads. To access a tcp member, the thread should |
| * always be behind the squeue (via squeue_enter, squeue_enter_nodrain, or |
| * squeue_fill). Since the squeues allow a direct function call, the caller |
| * can pass any tcp function having a prototype of edesc_t as an argument |
| * (different from the traditional STREAMS model where packets come in only |
| * at designated entry points). The list of functions that can be directly |
| * called via squeue appears before the usual function prototypes. |
| * |
| * Referencing: |
| * |
| * TCP is MT-Hot and we use a reference based scheme to make sure that the |
| * tcp structure doesn't disappear when it is needed. When the application |
| * creates an outgoing connection or accepts an incoming connection, we |
| * start out with 2 references on 'conn_ref'. One for TCP and one for IP. |
| * The IP reference is just a symbolic reference since ip_tcpclose() |
| * looks at tcp structure after tcp_close_output() returns which could |
| * have dropped the last TCP reference. So as long as the connection is |
| * in the attached state, i.e. !TCP_IS_DETACHED, we have 2 references on the |
| * conn_t. The classifier puts its own reference when the connection is |
| * inserted in the listen or connected hash. Anytime a thread needs to enter |
| * the tcp connection perimeter, it retrieves the conn/tcp from q->ptr on the |
| * write side or by doing a classify on the read side and then puts a |
| * reference on the conn before doing squeue_enter/tryenter/fill. For |
| * read side, the classifier itself puts the reference under fanout lock |
| * to make sure that tcp can't disappear before it gets processed. The |
| * squeue will drop this reference automatically so the called function |
| * doesn't have to do a DEC_REF. |
| * |
| * Opening a new connection: |
| * |
| * The outgoing connection open is pretty simple. ip_tcpopen() does the |
| * work in creating the conn/tcp structure and initializing it. The |
| * squeue assignment is done based on the CPU the application |
| * is running on. So for outbound connections, processing is always done |
| * on the application CPU, which might be different from the CPU the NIC |
| * interrupts for incoming packets. An optimal way would be to figure out |
| * the NIC <-> CPU binding at listen time, and assign the outgoing |
| * connection to the squeue attached to the CPU that will be interrupted |
| * for incoming packets (we know the NIC based on the bind IP address). |
| * This might seem like a problem if more data is going out, but the |
| * fact is that in most cases the transmit is ACK-driven, where |
| * the outgoing data normally sits on TCP's xmit queue waiting to be |
| * transmitted. |
| * |
| * Accepting a connection: |
| * |
| * This is a more interesting case because of the various races involved in |
| * establishing an eager in its own perimeter. Read the meta comment on |
| * top of tcp_conn_request(). But briefly, the squeue is picked by |
| * ip_tcp_input()/ip_fanout_tcp_v6() based on the interrupted CPU. |
| * |
| * Closing a connection: |
| * |
| * The close is fairly straightforward. tcp_close() calls tcp_close_output() |
| * via squeue to do the close and mark the tcp as detached if the connection |
| * was in state TCPS_ESTABLISHED or greater. In the latter case, TCP keeps its |
| * reference but tcp_close() always drops IP's reference. So if the tcp was |
| * not killed, it is sitting in the time_wait list with 2 references - 1 for |
| * TCP and 1 because it is in the classifier's connected hash. This is the |
| * condition we use to determine that it is OK to clean up the tcp outside of |
| * the squeue when time wait expires (check the ref under fanout and conn_lock |
| * and if it is 2, remove it from the fanout hash and kill it). |
| * |
| * Although close just drops the necessary references and marks the |
| * tcp_detached state, tcp_close needs to know that tcp_detached has been |
| * set (under squeue) before letting the STREAM go away (because an |
| * inbound packet might attempt to go up the STREAM while the close |
| * has happened and tcp_detached is not set). So a special lock and |
| * flag are used along with a condition variable (tcp_closelock, tcp_closed, |
| * and tcp_closecv) to signal tcp_close that tcp_close_output() has marked |
| * tcp_detached. |
| * |
| * Special provisions and fast paths: |
| * |
| * We make special provision for (AF_INET, SOCK_STREAM) sockets which |
| * can't have 'ipv6_recvpktinfo' set, and for these types of sockets, IP |
| * will never send an M_CTL to TCP. As such, ip_tcp_input(), which handles |
| * all TCP packets from the wire, makes an IPCL_IS_TCP4_CONNECTED_NO_POLICY |
| * check to send packets directly to tcp_rput_data via squeue. Everyone |
| * else comes through tcp_input() on the read side. |
| * |
| * We also make special provisions for sockfs by marking tcp_issocket |
| * whenever we have only sockfs on top of TCP. This allows us to skip |
| * putting the tcp in the acceptor hash since a sockfs listener can never |
| * become an acceptor, and also to avoid allocating a tcp_t for the acceptor |
| * STREAM since the eager has already been allocated and the accept now |
| * happens on the acceptor STREAM. There is a big blob of comment on top of |
| * tcp_conn_request explaining the new accept. When the socket is POP'd, |
| * sockfs sends us an ioctl to mark the fact and we go back to the old |
| * behaviour. Once tcp_issocket is unset, it is never set again for the |
| * life of that connection. |
| * |
| * IPsec notes : |
| * |
| * Since a packet is always executed on the correct TCP perimeter, |
| * all IPsec processing is deferred to IP, including checking new |
| * connections and setting IPsec policies for new connections. The |
| * only exception is tcp_xmit_listeners_reset(), which is called |
| * directly from IP and needs to do a policy check to see if TH_RST |
| * can be sent out. |
| */ |
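| |
| /* |
| * To illustrate the referencing and squeue discipline described above, |
| * a rough sketch of a read side delivery follows. This is illustration |
| * only; the conn_sqp field name and the squeue_enter() argument order |
| * are assumptions here (the real calls live in IP's fanout code): |
| * |
| * connp = <classify the packet; refheld under the fanout lock> |
| * squeue_enter(connp->conn_sqp, mp, tcp_input, connp, <tag>); |
| * |
| * The squeue then runs tcp_input() and drops the reference itself, so |
| * the called function never needs to do a DEC_REF. Any function passed |
| * this way (tcp_input, tcp_rput_data, etc.) must have the edesc_t |
| * prototype: void func(void *arg, mblk_t *mp, void *arg2). |
| */ |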
| |
| |
| extern major_t TCP6_MAJ; |
| |
| /* |
| * Values for squeue switch: |
| * 1: squeue_enter_nodrain |
| * 2: squeue_enter |
| * 3: squeue_fill |
| */ |
| int tcp_squeue_close = 2; |
| int tcp_squeue_wput = 2; |
| |
| squeue_func_t tcp_squeue_close_proc; |
| squeue_func_t tcp_squeue_wput_proc; |
| |
| extern vmem_t *ip_minor_arena; |
| |
| /* |
| * This controls how tiny a write must be before we try to copy it |
| * into the mblk on the tail of the transmit queue. Not much |
| * speedup is observed for values larger than sixteen. Zero will |
| * disable the optimisation. |
| */ |
| int tcp_tx_pull_len = 16; |
| |
| /* |
| * TCP Statistics. |
| * |
| * How TCP statistics work. |
| * |
| * There are two types of statistics invoked by two macros. |
| * |
| * TCP_STAT(name) does non-atomic increment of a named stat counter. It is |
| * supposed to be used in non MT-hot paths of the code. |
| * |
| * TCP_DBGSTAT(name) does atomic increment of a named stat counter. It is |
| * supposed to be used for DEBUG purposes and may be used on a hot path. |
| * |
| * Both TCP_STAT and TCP_DBGSTAT counters are available using kstat |
| * (use "kstat tcp" to get them). |
| * |
| * There is also an additional debugging facility that marks tcp_clean_death() |
| * instances and saves them in the tcp_t structure. It is triggered by |
| * TCP_TAG_CLEAN_DEATH define. Also, there is a global array of counters for |
| * tcp_clean_death() calls that counts the number of times each tag was hit. It |
| * is triggered by TCP_CLD_COUNTERS define. |
| * |
| * How to add new counters. |
| * |
| * 1) Add a field in the tcp_stat structure describing your counter. |
| * 2) Add a line in tcp_statistics with the name of the counter. |
| * |
| * IMPORTANT!! - make sure that both are in sync !! |
| * 3) Use either TCP_STAT or TCP_DBGSTAT with the name. |
| * |
| * Please avoid using private counters which are not kstat-exported. |
| * |
| * TCP_TAG_CLEAN_DEATH set to 1 enables tagging of tcp_clean_death() instances |
| * in tcp_t structure. |
| * |
| * TCP_MAX_CLEAN_DEATH_TAG is the maximum number of possible clean death tags. |
| */ |
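| |
| /* |
| * As a sketch of the procedure above, adding a hypothetical counter named |
| * tcp_example_cnt (name invented for illustration) would look like: |
| * |
| * 1) in tcp_stat_t: kstat_named_t tcp_example_cnt; |
| * 2) in tcp_statistics: { "tcp_example_cnt", KSTAT_DATA_UINT64 }, |
| * 3) at the point of interest: TCP_STAT(tcp_example_cnt); |
| * or, for a DEBUG-only hot path: TCP_DBGSTAT(tcp_example_cnt); |
| * |
| * The counter then shows up in "kstat tcp" output automatically. |
| */ |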
| |
| #define TCP_COUNTERS 1 |
| #define TCP_CLD_COUNTERS 0 |
| |
| #ifndef TCP_DEBUG_COUNTER |
| #ifdef DEBUG |
| #define TCP_DEBUG_COUNTER 1 |
| #else |
| #define TCP_DEBUG_COUNTER 0 |
| #endif |
| #endif |
| |
| |
| #define TCP_TAG_CLEAN_DEATH 1 |
| #define TCP_MAX_CLEAN_DEATH_TAG 32 |
| |
| #ifdef lint |
| static int _lint_dummy_; |
| #endif |
| |
| #if TCP_COUNTERS |
| #define TCP_STAT(x) (tcp_statistics.x.value.ui64++) |
| #define TCP_STAT_UPDATE(x, n) (tcp_statistics.x.value.ui64 += (n)) |
| #define TCP_STAT_SET(x, n) (tcp_statistics.x.value.ui64 = (n)) |
| #elif defined(lint) |
| #define TCP_STAT(x) ASSERT(_lint_dummy_ == 0); |
| #define TCP_STAT_UPDATE(x, n) ASSERT(_lint_dummy_ == 0); |
| #define TCP_STAT_SET(x, n) ASSERT(_lint_dummy_ == 0); |
| #else |
| #define TCP_STAT(x) |
| #define TCP_STAT_UPDATE(x, n) |
| #define TCP_STAT_SET(x, n) |
| #endif |
| |
| #if TCP_CLD_COUNTERS |
| static uint_t tcp_clean_death_stat[TCP_MAX_CLEAN_DEATH_TAG]; |
| #define TCP_CLD_STAT(x) tcp_clean_death_stat[x]++ |
| #elif defined(lint) |
| #define TCP_CLD_STAT(x) ASSERT(_lint_dummy_ == 0); |
| #else |
| #define TCP_CLD_STAT(x) |
| #endif |
| |
| #if TCP_DEBUG_COUNTER |
| #define TCP_DBGSTAT(x) atomic_add_64(&(tcp_statistics.x.value.ui64), 1) |
| #elif defined(lint) |
| #define TCP_DBGSTAT(x) ASSERT(_lint_dummy_ == 0); |
| #else |
| #define TCP_DBGSTAT(x) |
| #endif |
| |
| typedef struct tcp_stat { |
| kstat_named_t tcp_time_wait; |
| kstat_named_t tcp_time_wait_syn; |
| kstat_named_t tcp_time_wait_syn_success; |
| kstat_named_t tcp_time_wait_syn_fail; |
| kstat_named_t tcp_reinput_syn; |
| kstat_named_t tcp_ip_output; |
| kstat_named_t tcp_detach_non_time_wait; |
| kstat_named_t tcp_detach_time_wait; |
| kstat_named_t tcp_time_wait_reap; |
| kstat_named_t tcp_clean_death_nondetached; |
| kstat_named_t tcp_reinit_calls; |
| kstat_named_t tcp_eager_err1; |
| kstat_named_t tcp_eager_err2; |
| kstat_named_t tcp_eager_blowoff_calls; |
| kstat_named_t tcp_eager_blowoff_q; |
| kstat_named_t tcp_eager_blowoff_q0; |
| kstat_named_t tcp_not_hard_bound; |
| kstat_named_t tcp_no_listener; |
| kstat_named_t tcp_found_eager; |
| kstat_named_t tcp_wrong_queue; |
| kstat_named_t tcp_found_eager_binding1; |
| kstat_named_t tcp_found_eager_bound1; |
| kstat_named_t tcp_eager_has_listener1; |
| kstat_named_t tcp_open_alloc; |
| kstat_named_t tcp_open_detached_alloc; |
| kstat_named_t tcp_rput_time_wait; |
| kstat_named_t tcp_listendrop; |
| kstat_named_t tcp_listendropq0; |
| kstat_named_t tcp_wrong_rq; |
| kstat_named_t tcp_rsrv_calls; |
| kstat_named_t tcp_eagerfree2; |
| kstat_named_t tcp_eagerfree3; |
| kstat_named_t tcp_eagerfree4; |
| kstat_named_t tcp_eagerfree5; |
| kstat_named_t tcp_timewait_syn_fail; |
| kstat_named_t tcp_listen_badflags; |
| kstat_named_t tcp_timeout_calls; |
| kstat_named_t tcp_timeout_cached_alloc; |
| kstat_named_t tcp_timeout_cancel_reqs; |
| kstat_named_t tcp_timeout_canceled; |
| kstat_named_t tcp_timermp_alloced; |
| kstat_named_t tcp_timermp_freed; |
| kstat_named_t tcp_timermp_allocfail; |
| kstat_named_t tcp_timermp_allocdblfail; |
| kstat_named_t tcp_push_timer_cnt; |
| kstat_named_t tcp_ack_timer_cnt; |
| kstat_named_t tcp_ire_null1; |
| kstat_named_t tcp_ire_null; |
| kstat_named_t tcp_ip_send; |
| kstat_named_t tcp_ip_ire_send; |
| kstat_named_t tcp_wsrv_called; |
| kstat_named_t tcp_flwctl_on; |
| kstat_named_t tcp_timer_fire_early; |
| kstat_named_t tcp_timer_fire_miss; |
| kstat_named_t tcp_freelist_cleanup; |
| kstat_named_t tcp_rput_v6_error; |
| kstat_named_t tcp_out_sw_cksum; |
| kstat_named_t tcp_zcopy_on; |
| kstat_named_t tcp_zcopy_off; |
| kstat_named_t tcp_zcopy_backoff; |
| kstat_named_t tcp_zcopy_disable; |
| kstat_named_t tcp_mdt_pkt_out; |
| kstat_named_t tcp_mdt_pkt_out_v4; |
| kstat_named_t tcp_mdt_pkt_out_v6; |
| kstat_named_t tcp_mdt_discarded; |
| kstat_named_t tcp_mdt_conn_halted1; |
| kstat_named_t tcp_mdt_conn_halted2; |
| kstat_named_t tcp_mdt_conn_halted3; |
| kstat_named_t tcp_mdt_conn_resumed1; |
| kstat_named_t tcp_mdt_conn_resumed2; |
| kstat_named_t tcp_mdt_legacy_small; |
| kstat_named_t tcp_mdt_legacy_all; |
| kstat_named_t tcp_mdt_legacy_ret; |
| kstat_named_t tcp_mdt_allocfail; |
| kstat_named_t tcp_mdt_addpdescfail; |
| kstat_named_t tcp_mdt_allocd; |
| kstat_named_t tcp_mdt_linked; |
| kstat_named_t tcp_fusion_flowctl; |
| kstat_named_t tcp_fusion_backenabled; |
| kstat_named_t tcp_fusion_urg; |
| kstat_named_t tcp_fusion_putnext; |
| kstat_named_t tcp_fusion_unfusable; |
| kstat_named_t tcp_fusion_aborted; |
| kstat_named_t tcp_fusion_unqualified; |
| kstat_named_t tcp_in_ack_unsent_drop; |
| } tcp_stat_t; |
| |
| #if (TCP_COUNTERS || TCP_DEBUG_COUNTER) |
| static tcp_stat_t tcp_statistics = { |
| { "tcp_time_wait", KSTAT_DATA_UINT64 }, |
| { "tcp_time_wait_syn", KSTAT_DATA_UINT64 }, |
| { "tcp_time_wait_success", KSTAT_DATA_UINT64 }, |
| { "tcp_time_wait_fail", KSTAT_DATA_UINT64 }, |
| { "tcp_reinput_syn", KSTAT_DATA_UINT64 }, |
| { "tcp_ip_output", KSTAT_DATA_UINT64 }, |
| { "tcp_detach_non_time_wait", KSTAT_DATA_UINT64 }, |
| { "tcp_detach_time_wait", KSTAT_DATA_UINT64 }, |
| { "tcp_time_wait_reap", KSTAT_DATA_UINT64 }, |
| { "tcp_clean_death_nondetached", KSTAT_DATA_UINT64 }, |
| { "tcp_reinit_calls", KSTAT_DATA_UINT64 }, |
| { "tcp_eager_err1", KSTAT_DATA_UINT64 }, |
| { "tcp_eager_err2", KSTAT_DATA_UINT64 }, |
| { "tcp_eager_blowoff_calls", KSTAT_DATA_UINT64 }, |
| { "tcp_eager_blowoff_q", KSTAT_DATA_UINT64 }, |
| { "tcp_eager_blowoff_q0", KSTAT_DATA_UINT64 }, |
| { "tcp_not_hard_bound", KSTAT_DATA_UINT64 }, |
| { "tcp_no_listener", KSTAT_DATA_UINT64 }, |
| { "tcp_found_eager", KSTAT_DATA_UINT64 }, |
| { "tcp_wrong_queue", KSTAT_DATA_UINT64 }, |
| { "tcp_found_eager_binding1", KSTAT_DATA_UINT64 }, |
| { "tcp_found_eager_bound1", KSTAT_DATA_UINT64 }, |
| { "tcp_eager_has_listener1", KSTAT_DATA_UINT64 }, |
| { "tcp_open_alloc", KSTAT_DATA_UINT64 }, |
| { "tcp_open_detached_alloc", KSTAT_DATA_UINT64 }, |
| { "tcp_rput_time_wait", KSTAT_DATA_UINT64 }, |
| { "tcp_listendrop", KSTAT_DATA_UINT64 }, |
| { "tcp_listendropq0", KSTAT_DATA_UINT64 }, |
| { "tcp_wrong_rq", KSTAT_DATA_UINT64 }, |
| { "tcp_rsrv_calls", KSTAT_DATA_UINT64 }, |
| { "tcp_eagerfree2", KSTAT_DATA_UINT64 }, |
| { "tcp_eagerfree3", KSTAT_DATA_UINT64 }, |
| { "tcp_eagerfree4", KSTAT_DATA_UINT64 }, |
| { "tcp_eagerfree5", KSTAT_DATA_UINT64 }, |
| { "tcp_timewait_syn_fail", KSTAT_DATA_UINT64 }, |
| { "tcp_listen_badflags", KSTAT_DATA_UINT64 }, |
| { "tcp_timeout_calls", KSTAT_DATA_UINT64 }, |
| { "tcp_timeout_cached_alloc", KSTAT_DATA_UINT64 }, |
| { "tcp_timeout_cancel_reqs", KSTAT_DATA_UINT64 }, |
| { "tcp_timeout_canceled", KSTAT_DATA_UINT64 }, |
| { "tcp_timermp_alloced", KSTAT_DATA_UINT64 }, |
| { "tcp_timermp_freed", KSTAT_DATA_UINT64 }, |
| { "tcp_timermp_allocfail", KSTAT_DATA_UINT64 }, |
| { "tcp_timermp_allocdblfail", KSTAT_DATA_UINT64 }, |
| { "tcp_push_timer_cnt", KSTAT_DATA_UINT64 }, |
| { "tcp_ack_timer_cnt", KSTAT_DATA_UINT64 }, |
| { "tcp_ire_null1", KSTAT_DATA_UINT64 }, |
| { "tcp_ire_null", KSTAT_DATA_UINT64 }, |
| { "tcp_ip_send", KSTAT_DATA_UINT64 }, |
| { "tcp_ip_ire_send", KSTAT_DATA_UINT64 }, |
| { "tcp_wsrv_called", KSTAT_DATA_UINT64 }, |
| { "tcp_flwctl_on", KSTAT_DATA_UINT64 }, |
| { "tcp_timer_fire_early", KSTAT_DATA_UINT64 }, |
| { "tcp_timer_fire_miss", KSTAT_DATA_UINT64 }, |
| { "tcp_freelist_cleanup", KSTAT_DATA_UINT64 }, |
| { "tcp_rput_v6_error", KSTAT_DATA_UINT64 }, |
| { "tcp_out_sw_cksum", KSTAT_DATA_UINT64 }, |
| { "tcp_zcopy_on", KSTAT_DATA_UINT64 }, |
| { "tcp_zcopy_off", KSTAT_DATA_UINT64 }, |
| { "tcp_zcopy_backoff", KSTAT_DATA_UINT64 }, |
| { "tcp_zcopy_disable", KSTAT_DATA_UINT64 }, |
| { "tcp_mdt_pkt_out", KSTAT_DATA_UINT64 }, |
| { "tcp_mdt_pkt_out_v4", KSTAT_DATA_UINT64 }, |
| { "tcp_mdt_pkt_out_v6", KSTAT_DATA_UINT64 }, |
| { "tcp_mdt_discarded", KSTAT_DATA_UINT64 }, |
| { "tcp_mdt_conn_halted1", KSTAT_DATA_UINT64 }, |
| { "tcp_mdt_conn_halted2", KSTAT_DATA_UINT64 }, |
| { "tcp_mdt_conn_halted3", KSTAT_DATA_UINT64 }, |
| { "tcp_mdt_conn_resumed1", KSTAT_DATA_UINT64 }, |
| { "tcp_mdt_conn_resumed2", KSTAT_DATA_UINT64 }, |
| { "tcp_mdt_legacy_small", KSTAT_DATA_UINT64 }, |
| { "tcp_mdt_legacy_all", KSTAT_DATA_UINT64 }, |
| { "tcp_mdt_legacy_ret", KSTAT_DATA_UINT64 }, |
| { "tcp_mdt_allocfail", KSTAT_DATA_UINT64 }, |
| { "tcp_mdt_addpdescfail", KSTAT_DATA_UINT64 }, |
| { "tcp_mdt_allocd", KSTAT_DATA_UINT64 }, |
| { "tcp_mdt_linked", KSTAT_DATA_UINT64 }, |
| { "tcp_fusion_flowctl", KSTAT_DATA_UINT64 }, |
| { "tcp_fusion_backenabled", KSTAT_DATA_UINT64 }, |
| { "tcp_fusion_urg", KSTAT_DATA_UINT64 }, |
| { "tcp_fusion_putnext", KSTAT_DATA_UINT64 }, |
| { "tcp_fusion_unfusable", KSTAT_DATA_UINT64 }, |
| { "tcp_fusion_aborted", KSTAT_DATA_UINT64 }, |
| { "tcp_fusion_unqualified", KSTAT_DATA_UINT64 }, |
| { "tcp_in_ack_unsent_drop", KSTAT_DATA_UINT64 }, |
| }; |
| |
| static kstat_t *tcp_kstat; |
| |
| #endif |
| |
| /* |
| * Call either ip_output or ip_output_v6. This replaces putnext() calls on the |
| * tcp write side. |
| */ |
| #define CALL_IP_WPUT(connp, q, mp) { \ |
| ASSERT(((q)->q_flag & QREADR) == 0); \ |
| TCP_DBGSTAT(tcp_ip_output); \ |
| connp->conn_send(connp, (mp), (q), IP_WPUT); \ |
| } |
| |
| /* |
| * Was this tcp created via socket() interface? |
| */ |
| #define TCP_IS_SOCKET(tcp) ((tcp)->tcp_issocket) |
| |
| |
| /* Macros for timestamp comparisons */ |
| #define TSTMP_GEQ(a, b) ((int32_t)((a)-(b)) >= 0) |
| #define TSTMP_LT(a, b) ((int32_t)((a)-(b)) < 0) |
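| |
| /* |
| * The signed 32-bit subtraction makes these comparisons safe across |
| * timestamp wraparound. For example, with a = 5 and b = 0xfffffff0, |
| * (int32_t)(a - b) is 21, so TSTMP_GEQ(a, b) correctly treats the wrapped |
| * value 'a' as the newer timestamp. |
| */ |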
| |
| /* |
| * Parameters for TCP Initial Send Sequence number (ISS) generation. When |
| * tcp_strong_iss is set to 1, which is the default, the ISS is calculated |
| * by adding three components: a time component which grows by 1 every 4096 |
| * nanoseconds (versus every 4 microseconds suggested by RFC 793, page 27); |
| * a per-connection component which grows by 125000 for every new connection; |
| * and an "extra" component that grows by a random amount centered |
| * approximately on 64000. This causes the ISS generator to cycle every |
| * 4.89 hours if no TCP connections are made, and faster if connections are |
| * made. |
| * |
| * When tcp_strong_iss is set to 0, ISS is calculated by adding two |
| * components: a time component which grows by 250000 every second; and |
| * a per-connection component which grows by 125000 for every new connection. |
| * |
| * A third method for generating the ISS, used when tcp_strong_iss is set to |
| * 2, is prescribed by Steve Bellovin. This involves adding time, the 125000 per |
| * connection, and a one-way hash (MD5) of the connection ID <sport, dport, |
| * src, dst>, a "truly" random (per RFC 1750) number, and a console-entered |
| * password. |
| */ |
| #define ISS_INCR 250000 |
| #define ISS_NSEC_SHT 12 |
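| |
| /* |
| * The 4.89 hour figure above follows from the time component alone: |
| * ISS_NSEC_SHT of 12 gives one increment every 2^12 = 4096 ns, so the |
| * 32-bit sequence space wraps after 2^32 * 4096 ns, roughly 1.76 * 10^13 ns, |
| * i.e. about 17,592 seconds or 4.89 hours. |
| */ |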
| |
| static uint32_t tcp_iss_incr_extra; /* Incremented for each connection */ |
| static kmutex_t tcp_iss_key_lock; |
| static MD5_CTX tcp_iss_key; |
| static sin_t sin_null; /* Zero address for quick clears */ |
| static sin6_t sin6_null; /* Zero address for quick clears */ |
| |
| /* Packet dropper for TCP IPsec policy drops. */ |
| static ipdropper_t tcp_dropper; |
| |
| /* |
| * This implementation follows the 4.3BSD interpretation of the urgent |
| * pointer and not RFC 1122. Switching to RFC 1122 behavior would cause |
| * incompatible changes in protocols like telnet and rlogin. |
| */ |
| #define TCP_OLD_URP_INTERPRETATION 1 |
| |
| #define TCP_IS_DETACHED(tcp) ((tcp)->tcp_detached) |
| |
| #define TCP_IS_DETACHED_NONEAGER(tcp) \ |
| (TCP_IS_DETACHED(tcp) && \ |
| (!(tcp)->tcp_hard_binding)) |
| |
| /* |
| * TCP reassembly macros. We hide starting and ending sequence numbers in |
| * b_next and b_prev of messages on the reassembly queue. The messages are |
| * chained using b_cont. These macros are used in tcp_reass() so we don't |
| * have to see the ugly casts and assignments. |
| */ |
| #define TCP_REASS_SEQ(mp) ((uint32_t)(uintptr_t)((mp)->b_next)) |
| #define TCP_REASS_SET_SEQ(mp, u) ((mp)->b_next = \ |
| (mblk_t *)(uintptr_t)(u)) |
| #define TCP_REASS_END(mp) ((uint32_t)(uintptr_t)((mp)->b_prev)) |
| #define TCP_REASS_SET_END(mp, u) ((mp)->b_prev = \ |
| (mblk_t *)(uintptr_t)(u)) |
| |
| /* |
| * Implementation of TCP Timers. |
| * ============================= |
| * |
| * INTERFACE: |
| * |
| * There are two basic functions and one macro dealing with tcp timers: |
| * |
| * timeout_id_t tcp_timeout(connp, func, time) |
| * clock_t tcp_timeout_cancel(connp, timeout_id) |
| * TCP_TIMER_RESTART(tcp, intvl) |
| * |
| * tcp_timeout() starts a timer for the 'tcp' instance arranging to call 'func' |
| * after 'time' ticks have passed. The function called by timeout() must adhere to |
| * the same restrictions as a driver soft interrupt handler - it must not sleep |
| * or call other functions that might sleep. The value returned is the opaque |
| * non-zero timeout identifier that can be passed to tcp_timeout_cancel() to |
| * cancel the request. The call to tcp_timeout() may fail in which case it |
| * returns zero. This is different from the timeout(9F) function which never |
| * fails. |
| * |
| * The call-back function 'func' always receives 'connp' as its single |
| * argument. It is always executed in the squeue corresponding to the tcp |
| * structure. The tcp structure is guaranteed to be present at the time the |
| * call-back is called. |
| * |
| * NOTE: The call-back function 'func' is never called if tcp is in |
| * the TCPS_CLOSED state. |
| * |
| * tcp_timeout_cancel() attempts to cancel a pending tcp_timeout() |
| * request. Locks acquired by the call-back routine should not be held across |
| * the call to tcp_timeout_cancel() or a deadlock may result. |
| * |
| * tcp_timeout_cancel() returns -1 if it cannot cancel the timeout request. |
| * Otherwise, it returns an integer value greater than or equal to 0. In |
| * particular, if the call-back function has already been placed on the squeue, |
| * it cannot be canceled. |
| * |
| * NOTE: both tcp_timeout() and tcp_timeout_cancel() should always be called |
| * within squeue context corresponding to the tcp instance. Since the |
| * call-back is also called via the same squeue, none of the race |
| * conditions described in the untimeout(9F) manual page can occur since all |
| * calls are strictly serialized. |
| * |
| * TCP_TIMER_RESTART() is a macro that attempts to cancel a pending timeout |
| * stored in tcp_timer_tid and starts a new one using |
| * MSEC_TO_TICK(intvl). It always uses tcp_timer() function as a call-back |
| * and stores the return value of tcp_timeout() in the tcp->tcp_timer_tid |
| * field. |
| * |
| * NOTE: since the timeout cancellation is not guaranteed, the cancelled |
| * call-back may still be called, so it is possible tcp_timer() will be |
| * called several times. This should not be a problem since tcp_timer() |
| * should always check the tcp instance state. |
| * |
| * |
| * IMPLEMENTATION: |
| * |
| * TCP timers are implemented using a three-stage process. The call to |
| * tcp_timeout() uses the timeout(9F) function to call tcp_timer_callback() |
| * when the timer expires. The tcp_timer_callback() arranges the call of the |
| * tcp_timer_handler() function via squeue corresponding to the tcp |
| * instance. The tcp_timer_handler() calls the actual requested timeout |
| * call-back and passes the tcp instance as an argument to it. Information is |
| * passed between stages using the tcp_timer_t structure which contains the |
| * connp pointer, the tcp call-back to call and the timeout id returned by |
| * timeout(9F). |
| * |
| * The tcp_timer_t structure is not used directly; it is embedded in an |
| * mblk_t-like structure that is used to enter a squeue. The mp->b_rptr of |
| * this pseudo mblk points to the beginning of the tcp_timer_t structure. |
| * tcp_timeout() returns the pointer to this mblk. |
| * |
| * The pseudo mblk is allocated from the special tcp_timercache kmem cache. It |
| * looks like a normal mblk without an actual dblk attached to it. |
| * |
| * To optimize performance each tcp instance holds a small cache of timer |
| * mblocks. In the current implementation it caches up to two timer mblocks per |
| * tcp instance. The cache is preserved over tcp frees and is only freed when |
| * the whole tcp structure is destroyed by its kmem destructor. Since all tcp |
| * timer processing happens on a corresponding squeue, the cache manipulation |
| * does not require any locks. Experiments show that the majority of timer |
| * mblock allocations are satisfied from the tcp cache and do not involve |
| * kmem calls. |
| * |
| * The tcp_timeout() places a refhold on the connp instance which guarantees |
| * that it will be present at the time the call-back function fires. The |
| * tcp_timer_handler() drops the reference after calling the call-back, so the |
| * call-back function does not need to manipulate the references explicitly. |
| */ |
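| |
| /* |
| * A sketch of typical usage (illustration only; the retransmit timer |
| * actually uses the TCP_TIMER_RESTART() wrapper below, and 'intvl' here |
| * stands for whatever interval in milliseconds the caller wants): |
| * |
| * tcp->tcp_timer_tid = tcp_timeout(tcp->tcp_connp, tcp_timer, |
| * MSEC_TO_TICK(intvl)); |
| * if (tcp->tcp_timer_tid == 0) |
| * <allocation failed, no timer is pending> |
| * ... |
| * (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid); |
| */ |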
| |
| typedef struct tcp_timer_s { |
| conn_t *connp; |
| void (*tcpt_proc)(void *); |
| timeout_id_t tcpt_tid; |
| } tcp_timer_t; |
| |
| static kmem_cache_t *tcp_timercache; |
| kmem_cache_t *tcp_sack_info_cache; |
| kmem_cache_t *tcp_iphc_cache; |
| |
| #define TCP_TIMER(tcp, f, tim) tcp_timeout(tcp->tcp_connp, f, tim) |
| #define TCP_TIMER_CANCEL(tcp, id) tcp_timeout_cancel(tcp->tcp_connp, id) |
| |
| /* |
| * To restart the TCP retransmission timer. |
| */ |
| #define TCP_TIMER_RESTART(tcp, intvl) \ |
| { \ |
| if ((tcp)->tcp_timer_tid != 0) { \ |
| (void) TCP_TIMER_CANCEL((tcp), \ |
| (tcp)->tcp_timer_tid); \ |
| } \ |
| (tcp)->tcp_timer_tid = TCP_TIMER((tcp), tcp_timer, \ |
| MSEC_TO_TICK(intvl)); \ |
| } |
| |
| /* |
| * For scalability, we must not run a timer for every TCP connection |
| * in TIME_WAIT state. To see why, consider (for a time wait interval of |
| * 4 minutes): |
| * 1000 connections/sec * 240 seconds/time wait = 240,000 active conn's |
| * |
| * This list is ordered by time, so you need only delete from the head |
| * until you get to entries which aren't old enough to delete yet. |
| * The list consists of only the detached TIME_WAIT connections. |
| * |
| * Note that the timer (tcp_time_wait_expire) is started when the tcp_t |
| * becomes detached TIME_WAIT (either by changing the state and already |
| * being detached or the other way around). This means that the TIME_WAIT |
| * state can be extended (up to doubled) if the connection doesn't become |
| * detached for a long time. |
| * |
| * The list manipulations (including tcp_time_wait_next/prev) |
| * are protected by the tcp_time_wait_lock. The content of the |
| * detached TIME_WAIT connections is protected by the normal perimeters. |
| */ |
| |
| typedef struct tcp_squeue_priv_s { |
| kmutex_t tcp_time_wait_lock; |
| /* Protects the next 3 globals */ |
| timeout_id_t tcp_time_wait_tid; |
| tcp_t *tcp_time_wait_head; |
| tcp_t *tcp_time_wait_tail; |
| tcp_t *tcp_free_list; |
| } tcp_squeue_priv_t; |
| |
| /* |
| * TCP_TIME_WAIT_DELAY governs how often the time_wait_collector runs. |
| * Running it every 5 seconds seems to give the best results. |
| */ |
| #define TCP_TIME_WAIT_DELAY drv_usectohz(5000000) |
| |
| |
| #define TCP_XMIT_LOWATER 4096 |
| #define TCP_XMIT_HIWATER 49152 |
| #define TCP_RECV_LOWATER 2048 |
| #define TCP_RECV_HIWATER 49152 |
| |
| /* |
| * PAWS needs a timer for 24 days. This is the number of ticks in 24 days |
| */ |
| #define PAWS_TIMEOUT ((clock_t)(24*24*60*60*hz)) |
| |
| #define TIDUSZ 4096 /* transport interface data unit size */ |
| |
| /* |
| * Bind hash list size and hash function. It has to be a power of 2 for |
| * hashing. |
| */ |
| #define TCP_BIND_FANOUT_SIZE 512 |
| #define TCP_BIND_HASH(lport) (ntohs(lport) & (TCP_BIND_FANOUT_SIZE - 1)) |
| /* |
| * Size of listen and acceptor hash list. It has to be a power of 2 for |
| * hashing. |
| */ |
| #define TCP_FANOUT_SIZE 256 |
| |
| #ifdef _ILP32 |
| #define TCP_ACCEPTOR_HASH(accid) \ |
| (((uint_t)(accid) >> 8) & (TCP_FANOUT_SIZE - 1)) |
| #else |
| #define TCP_ACCEPTOR_HASH(accid) \ |
| ((uint_t)(accid) & (TCP_FANOUT_SIZE - 1)) |
| #endif /* _ILP32 */ |
| |
| #define IP_ADDR_CACHE_SIZE 2048 |
| #define IP_ADDR_CACHE_HASH(faddr) \ |
| (ntohl(faddr) & (IP_ADDR_CACHE_SIZE -1)) |
| |
| /* Hash for HSPs uses all 32 bits, since both networks and hosts are in the table */ |
| #define TCP_HSP_HASH_SIZE 256 |
| |
| #define TCP_HSP_HASH(addr) \ |
| (((addr>>24) ^ (addr >>16) ^ \ |
| (addr>>8) ^ (addr)) % TCP_HSP_HASH_SIZE) |
| |
| /* |
| * TCP options struct returned from tcp_parse_options. |
| */ |
| typedef struct tcp_opt_s { |
| uint32_t tcp_opt_mss; |
| uint32_t tcp_opt_wscale; |
| uint32_t tcp_opt_ts_val; |
| uint32_t tcp_opt_ts_ecr; |
| tcp_t *tcp; |
| } tcp_opt_t; |
| |
| /* |
| * RFC1323-recommended phrasing of TSTAMP option, for easier parsing |
| */ |
| |
| #ifdef _BIG_ENDIAN |
| #define TCPOPT_NOP_NOP_TSTAMP ((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | \ |
| (TCPOPT_TSTAMP << 8) | 10) |
| #else |
| #define TCPOPT_NOP_NOP_TSTAMP ((10 << 24) | (TCPOPT_TSTAMP << 16) | \ |
| (TCPOPT_NOP << 8) | TCPOPT_NOP) |
| #endif |
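| |
| /* |
| * Either definition encodes the same four wire bytes <NOP, NOP, TSTAMP, 10>; |
| * the two variants simply account for the byte order seen when those four |
| * option bytes are loaded as a single 32-bit word for comparison. |
| */ |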
| |
| /* |
| * Flags returned from tcp_parse_options. |
| */ |
| #define TCP_OPT_MSS_PRESENT 1 |
| #define TCP_OPT_WSCALE_PRESENT 2 |
| #define TCP_OPT_TSTAMP_PRESENT 4 |
| #define TCP_OPT_SACK_OK_PRESENT 8 |
| #define TCP_OPT_SACK_PRESENT 16 |
| |
| /* TCP option length */ |
| #define TCPOPT_NOP_LEN 1 |
| #define TCPOPT_MAXSEG_LEN 4 |
| #define TCPOPT_WS_LEN 3 |
| #define TCPOPT_REAL_WS_LEN (TCPOPT_WS_LEN+1) |
| #define TCPOPT_TSTAMP_LEN 10 |
| #define TCPOPT_REAL_TS_LEN (TCPOPT_TSTAMP_LEN+2) |
| #define TCPOPT_SACK_OK_LEN 2 |
| #define TCPOPT_REAL_SACK_OK_LEN (TCPOPT_SACK_OK_LEN+2) |
| #define TCPOPT_REAL_SACK_LEN 4 |
| #define TCPOPT_MAX_SACK_LEN 36 |
| #define TCPOPT_HEADER_LEN 2 |
| |
| /* TCP cwnd burst factor. */ |
| #define TCP_CWND_INFINITE 65535 |
| #define TCP_CWND_SS 3 |
| #define TCP_CWND_NORMAL 5 |
| |
| /* Maximum TCP initial cwin (start/restart). */ |
| #define TCP_MAX_INIT_CWND 8 |
| |
| /* |
| * Initialize cwnd according to RFC 3390. def_max_init_cwnd is |
| * either tcp_slow_start_initial or tcp_slow_start_after_idle |
| * depending on the caller. If the upper layer has not used the |
| * TCP_INIT_CWND option to change the initial cwnd, tcp_init_cwnd |
| * should be 0 and we use the formula in RFC 3390 to set tcp_cwnd. |
| * If the upper layer has set tcp_init_cwnd, just use it to |
| * calculate tcp_cwnd. |
| */ |
| #define SET_TCP_INIT_CWND(tcp, mss, def_max_init_cwnd) \ |
| { \ |
| if ((tcp)->tcp_init_cwnd == 0) { \ |
| (tcp)->tcp_cwnd = MIN(def_max_init_cwnd * (mss), \ |
| MIN(4 * (mss), MAX(2 * (mss), 4380 / (mss) * (mss)))); \ |
| } else { \ |
| (tcp)->tcp_cwnd = (tcp)->tcp_init_cwnd * (mss); \ |
| } \ |
| tcp->tcp_cwnd_cnt = 0; \ |
| } |
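| |
| /* |
| * For example, with mss = 1460, def_max_init_cwnd = 4 and no TCP_INIT_CWND |
| * override (tcp_init_cwnd == 0), the inner clause gives |
| * MIN(4 * 1460, MAX(2 * 1460, 4380 / 1460 * 1460)) = MIN(5840, 4380) = 4380, |
| * and the outer MIN against def_max_init_cwnd * mss = 5840 leaves |
| * tcp_cwnd = 4380, i.e. an initial window of three segments. |
| */ |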
| |
| /* TCP Timer control structure */ |
| typedef struct tcpt_s { |
| pfv_t tcpt_pfv; /* The routine we are to call */ |
| tcp_t *tcpt_tcp; /* The parameter we are to pass in */ |
| } tcpt_t; |
| |
| /* Host Specific Parameter structure */ |
| typedef struct tcp_hsp { |
| struct tcp_hsp *tcp_hsp_next; |
| in6_addr_t tcp_hsp_addr_v6; |
| in6_addr_t tcp_hsp_subnet_v6; |
| uint_t tcp_hsp_vers; /* IPV4_VERSION | IPV6_VERSION */ |
| int32_t tcp_hsp_sendspace; |
| int32_t tcp_hsp_recvspace; |
| int32_t tcp_hsp_tstamp; |
| } tcp_hsp_t; |
| #define tcp_hsp_addr V4_PART_OF_V6(tcp_hsp_addr_v6) |
| #define tcp_hsp_subnet V4_PART_OF_V6(tcp_hsp_subnet_v6) |
| |
| /* |
| * Functions called directly via squeue having a prototype of edesc_t. |
| */ |
| void tcp_conn_request(void *arg, mblk_t *mp, void *arg2); |
| static void tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2); |
| void tcp_accept_finish(void *arg, mblk_t *mp, void *arg2); |
| static void tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2); |
| static void tcp_wput_proto(void *arg, mblk_t *mp, void *arg2); |
| void tcp_input(void *arg, mblk_t *mp, void *arg2); |
| void tcp_rput_data(void *arg, mblk_t *mp, void *arg2); |
| static void tcp_close_output(void *arg, mblk_t *mp, void *arg2); |
| static void tcp_output(void *arg, mblk_t *mp, void *arg2); |
| static void tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2); |
| static void tcp_timer_handler(void *arg, mblk_t *mp, void *arg2); |
| |
| |
| /* Prototype for TCP functions */ |
| static void tcp_random_init(void); |
| int tcp_random(void); |
| static void tcp_accept(tcp_t *tcp, mblk_t *mp); |
| static void tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, |
| tcp_t *eager); |
| static int tcp_adapt_ire(tcp_t *tcp, mblk_t *ire_mp); |
| static in_port_t tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, |
| int reuseaddr, boolean_t quick_connect, boolean_t bind_to_req_port_only, |
| boolean_t user_specified); |
| static void tcp_closei_local(tcp_t *tcp); |
| static void tcp_close_detached(tcp_t *tcp); |
| static boolean_t tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph, |
| mblk_t *idmp, mblk_t **defermp); |
| static void tcp_connect(tcp_t *tcp, mblk_t *mp); |
| static void tcp_connect_ipv4(tcp_t *tcp, mblk_t *mp, ipaddr_t *dstaddrp, |
| in_port_t dstport, uint_t srcid); |
| static void tcp_connect_ipv6(tcp_t *tcp, mblk_t *mp, in6_addr_t *dstaddrp, |
| in_port_t dstport, uint32_t flowinfo, uint_t srcid, |
| uint32_t scope_id); |
| static int tcp_clean_death(tcp_t *tcp, int err, uint8_t tag); |
| static void tcp_def_q_set(tcp_t *tcp, mblk_t *mp); |
| static void tcp_disconnect(tcp_t *tcp, mblk_t *mp); |
| static char *tcp_display(tcp_t *tcp, char *, char); |
| static boolean_t tcp_eager_blowoff(tcp_t *listener, t_scalar_t seqnum); |
| static void tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only); |
| static void tcp_eager_unlink(tcp_t *tcp); |
| static void tcp_err_ack(tcp_t *tcp, mblk_t *mp, int tlierr, |
| int unixerr); |
| static void tcp_err_ack_prim(tcp_t *tcp, mblk_t *mp, int primitive, |
| int tlierr, int unixerr); |
| static int tcp_extra_priv_ports_get(queue_t *q, mblk_t *mp, caddr_t cp, |
| cred_t *cr); |
| static int tcp_extra_priv_ports_add(queue_t *q, mblk_t *mp, |
| char *value, caddr_t cp, cred_t *cr); |
| static int tcp_extra_priv_ports_del(queue_t *q, mblk_t *mp, |
| char *value, caddr_t cp, cred_t *cr); |
| static int tcp_tpistate(tcp_t *tcp); |
| static void tcp_bind_hash_insert(tf_t *tf, tcp_t *tcp, |
| int caller_holds_lock); |
| static void tcp_bind_hash_remove(tcp_t *tcp); |
| static tcp_t *tcp_acceptor_hash_lookup(t_uscalar_t id); |
| void tcp_acceptor_hash_insert(t_uscalar_t id, tcp_t *tcp); |
| static void tcp_acceptor_hash_remove(tcp_t *tcp); |
| static void tcp_capability_req(tcp_t *tcp, mblk_t *mp); |
| static void tcp_info_req(tcp_t *tcp, mblk_t *mp); |
| static void tcp_addr_req(tcp_t *tcp, mblk_t *mp); |
| static void tcp_addr_req_ipv6(tcp_t *tcp, mblk_t *mp); |
| static int tcp_header_init_ipv4(tcp_t *tcp); |
| static int tcp_header_init_ipv6(tcp_t *tcp); |
| int tcp_init(tcp_t *tcp, queue_t *q); |
| static int tcp_init_values(tcp_t *tcp); |
| static mblk_t *tcp_ip_advise_mblk(void *addr, int addr_len, ipic_t **ipic); |
| static mblk_t *tcp_ip_bind_mp(tcp_t *tcp, t_scalar_t bind_prim, |
| t_scalar_t addr_length); |
| static void tcp_ip_ire_mark_advice(tcp_t *tcp); |
| static void tcp_ip_notify(tcp_t *tcp); |
| static mblk_t *tcp_ire_mp(mblk_t *mp); |
| static void tcp_iss_init(tcp_t *tcp); |
| static void tcp_keepalive_killer(void *arg); |
| static int tcp_maxpsz_set(tcp_t *tcp, boolean_t set_maxblk); |
| static int tcp_parse_options(tcph_t *tcph, tcp_opt_t *tcpopt); |
| static void tcp_mss_set(tcp_t *tcp, uint32_t size); |
| static int tcp_conprim_opt_process(tcp_t *tcp, mblk_t *mp, |
| int *do_disconnectp, int *t_errorp, int *sys_errorp); |
| static boolean_t tcp_allow_connopt_set(int level, int name); |
| int tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr); |
| int tcp_opt_get(queue_t *q, int level, int name, uchar_t *ptr); |
| static int tcp_opt_get_user(ipha_t *ipha, uchar_t *ptr); |
| int tcp_opt_set(queue_t *q, uint_t optset_context, int level, |
| int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp, |
| uchar_t *outvalp, void *thisdg_attrs, cred_t *cr, |
| mblk_t *mblk); |
| static void tcp_opt_reverse(tcp_t *tcp, ipha_t *ipha); |
| static int tcp_opt_set_header(tcp_t *tcp, boolean_t checkonly, |
| uchar_t *ptr, uint_t len); |
| static int tcp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr); |
| static boolean_t tcp_param_register(tcpparam_t *tcppa, int cnt); |
| static int tcp_param_set(queue_t *q, mblk_t *mp, char *value, |
| caddr_t cp, cred_t *cr); |
| static int tcp_param_set_aligned(queue_t *q, mblk_t *mp, char *value, |
| caddr_t cp, cred_t *cr); |
| static void tcp_iss_key_init(uint8_t *phrase, int len); |
| static int tcp_1948_phrase_set(queue_t *q, mblk_t *mp, char *value, |
| caddr_t cp, cred_t *cr); |
| static void tcp_process_shrunk_swnd(tcp_t *tcp, uint32_t shrunk_cnt); |
| static mblk_t *tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start); |
| static void tcp_reass_elim_overlap(tcp_t *tcp, mblk_t *mp); |
| static void tcp_reinit(tcp_t *tcp); |
| static void tcp_reinit_values(tcp_t *tcp); |
| static void tcp_report_item(mblk_t *mp, tcp_t *tcp, int hashval, |
| tcp_t *thisstream, cred_t *cr); |
| |
| static uint_t tcp_rcv_drain(queue_t *q, tcp_t *tcp); |
| static void tcp_rcv_enqueue(tcp_t *tcp, mblk_t *mp, uint_t seg_len); |
| static void tcp_sack_rxmit(tcp_t *tcp, uint_t *flags); |
| static boolean_t tcp_send_rst_chk(void); |
| static void tcp_ss_rexmit(tcp_t *tcp); |
| static mblk_t *tcp_rput_add_ancillary(tcp_t *tcp, mblk_t *mp, ip6_pkt_t *ipp); |
| static void tcp_process_options(tcp_t *, tcph_t *); |
| static void tcp_rput_common(tcp_t *tcp, mblk_t *mp); |
| static void tcp_rsrv(queue_t *q); |
| static int tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd); |
| static int tcp_snmp_get(queue_t *q, mblk_t *mpctl); |
| static int tcp_snmp_set(queue_t *q, int level, int name, uchar_t *ptr, |
| int len); |
| static int tcp_snmp_state(tcp_t *tcp); |
| static int tcp_status_report(queue_t *q, mblk_t *mp, caddr_t cp, |
| cred_t *cr); |
| static int tcp_bind_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, |
| cred_t *cr); |
| static int tcp_listen_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, |
| cred_t *cr); |
| static int tcp_conn_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, |
| cred_t *cr); |
| static int tcp_acceptor_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, |
| cred_t *cr); |
| static int tcp_host_param_set(queue_t *q, mblk_t *mp, char *value, |
| caddr_t cp, cred_t *cr); |
| static int tcp_host_param_set_ipv6(queue_t *q, mblk_t *mp, char *value, |
| caddr_t cp, cred_t *cr); |
| static int tcp_host_param_report(queue_t *q, mblk_t *mp, caddr_t cp, |
| cred_t *cr); |
| static void tcp_timer(void *arg); |
| static void tcp_timer_callback(void *); |
| static in_port_t tcp_update_next_port(in_port_t port, boolean_t random); |
| static in_port_t tcp_get_next_priv_port(void); |
| static void tcp_wput(queue_t *q, mblk_t *mp); |
| static void tcp_wput_sock(queue_t *q, mblk_t *mp); |
| void tcp_wput_accept(queue_t *q, mblk_t *mp); |
| static void tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent); |
| static void tcp_wput_flush(tcp_t *tcp, mblk_t *mp); |
| static void tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp); |
| static int tcp_send(queue_t *q, tcp_t *tcp, const int mss, |
| const int tcp_hdr_len, const int tcp_tcp_hdr_len, |
| const int num_sack_blk, int *usable, uint_t *snxt, |
| int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time, |
| const int mdt_thres); |
| static int tcp_multisend(queue_t *q, tcp_t *tcp, const int mss, |
| const int tcp_hdr_len, const int tcp_tcp_hdr_len, |
| const int num_sack_blk, int *usable, uint_t *snxt, |
| int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time, |
| const int mdt_thres); |
| static void tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now, |
| int num_sack_blk); |
| static void tcp_wsrv(queue_t *q); |
| static int tcp_xmit_end(tcp_t *tcp); |
| void tcp_xmit_listeners_reset(mblk_t *mp, uint_t ip_hdr_len); |
| static mblk_t *tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, |
| int32_t *offset, mblk_t **end_mp, uint32_t seq, |
| boolean_t sendall, uint32_t *seg_len, boolean_t rexmit); |
| static void tcp_ack_timer(void *arg); |
| static mblk_t *tcp_ack_mp(tcp_t *tcp); |
| static void tcp_push_timer(void *arg); |
| static void tcp_xmit_early_reset(char *str, mblk_t *mp, |
| uint32_t seq, uint32_t ack, int ctl, uint_t ip_hdr_len); |
| static void tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq, |
| uint32_t ack, int ctl); |
| static tcp_hsp_t *tcp_hsp_lookup(ipaddr_t addr); |
| static tcp_hsp_t *tcp_hsp_lookup_ipv6(in6_addr_t *addr); |
| static int setmaxps(queue_t *q, int maxpsz); |
| static void tcp_set_rto(tcp_t *, time_t); |
| static boolean_t tcp_check_policy(tcp_t *, mblk_t *, ipha_t *, ip6_t *, |
| boolean_t, boolean_t); |
| static void tcp_icmp_error_ipv6(tcp_t *tcp, mblk_t *mp, |
| boolean_t ipsec_mctl); |
| static boolean_t tcp_cmpbuf(void *a, uint_t alen, |
| boolean_t b_valid, void *b, uint_t blen); |
| static boolean_t tcp_allocbuf(void **dstp, uint_t *dstlenp, |
| boolean_t src_valid, void *src, uint_t srclen); |
| static void tcp_savebuf(void **dstp, uint_t *dstlenp, |
| boolean_t src_valid, void *src, uint_t srclen); |
| static mblk_t *tcp_setsockopt_mp(int level, int cmd, |
| char *opt, int optlen); |
| static int tcp_pkt_set(uchar_t *, uint_t, uchar_t **, uint_t *); |
| static int tcp_build_hdrs(queue_t *, tcp_t *); |
| static void tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, |
| uint32_t seg_seq, uint32_t seg_ack, int seg_len, |
| tcph_t *tcph); |
| boolean_t tcp_paws_check(tcp_t *tcp, tcph_t *tcph, tcp_opt_t *tcpoptp); |
| boolean_t tcp_reserved_port_add(int, in_port_t *, in_port_t *); |
| boolean_t tcp_reserved_port_del(in_port_t, in_port_t); |
| boolean_t tcp_reserved_port_check(in_port_t); |
| static tcp_t *tcp_alloc_temp_tcp(in_port_t); |
| static int tcp_reserved_port_list(queue_t *, mblk_t *, caddr_t, cred_t *); |
| static void tcp_timers_stop(tcp_t *); |
| static timeout_id_t tcp_timeout(conn_t *, void (*)(void *), clock_t); |
| static clock_t tcp_timeout_cancel(conn_t *, timeout_id_t); |
| static mblk_t *tcp_mdt_info_mp(mblk_t *); |
| static void tcp_mdt_update(tcp_t *, ill_mdt_capab_t *, boolean_t); |
| static int tcp_mdt_add_attrs(multidata_t *, const mblk_t *, |
| const boolean_t, const uint32_t, const uint32_t, |
| const uint32_t, const uint32_t); |
| static void tcp_multisend_data(tcp_t *, ire_t *, const ill_t *, mblk_t *, |
| const uint_t, const uint_t, boolean_t *); |
| static void tcp_send_data(tcp_t *, queue_t *, mblk_t *); |
| extern mblk_t *tcp_timermp_alloc(int); |
| extern void tcp_timermp_free(tcp_t *); |
| static void tcp_timer_free(tcp_t *tcp, mblk_t *mp); |
| static void tcp_stop_lingering(tcp_t *tcp); |
| static void tcp_close_linger_timeout(void *arg); |
| void tcp_ddi_init(void); |
| void tcp_ddi_destroy(void); |
| static void tcp_kstat_init(void); |
| static void tcp_kstat_fini(void); |
| static int tcp_kstat_update(kstat_t *kp, int rw); |
| void tcp_reinput(conn_t *connp, mblk_t *mp, squeue_t *sqp); |
| conn_t *tcp_get_next_conn(connf_t *, conn_t *); |
| static int tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp, |
| tcph_t *tcph, uint_t ipvers, mblk_t *idmp); |
| static int tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, ipha_t *ipha, |
| tcph_t *tcph, mblk_t *idmp); |
| static squeue_func_t tcp_squeue_switch(int); |
| |
| static int tcp_open(queue_t *, dev_t *, int, int, cred_t *); |
| static int tcp_close(queue_t *, int); |
| static int tcpclose_accept(queue_t *); |
| static int tcp_modclose(queue_t *); |
| static void tcp_wput_mod(queue_t *, mblk_t *); |
| |
| static void tcp_squeue_add(squeue_t *); |
| static boolean_t tcp_zcopy_check(tcp_t *); |
| static void tcp_zcopy_notify(tcp_t *); |
| static mblk_t *tcp_zcopy_disable(tcp_t *, mblk_t *); |
| static mblk_t *tcp_zcopy_backoff(tcp_t *, mblk_t *, int); |
| static void tcp_ire_ill_check(tcp_t *, ire_t *, ill_t *, boolean_t); |
| |
| static void tcp_fuse(tcp_t *, uchar_t *, tcph_t *); |
| static void tcp_unfuse(tcp_t *); |
| static boolean_t tcp_fuse_output(tcp_t *, mblk_t *); |
| static void tcp_fuse_output_urg(tcp_t *, mblk_t *); |
| static boolean_t tcp_fuse_rcv_drain(queue_t *, tcp_t *, mblk_t **); |
| |
| extern mblk_t *allocb_tryhard(size_t); |
| |
| /* |
| * Routines related to the TCP_IOC_ABORT_CONN ioctl command. |
| * |
| * TCP_IOC_ABORT_CONN is a non-transparent ioctl command used for aborting |
| * TCP connections. To invoke this ioctl, a tcp_ioc_abort_conn_t structure |
| * (defined in tcp.h) needs to be filled in and passed into the kernel |
| * via an I_STR ioctl command (see streamio(7I)). The tcp_ioc_abort_conn_t |
| * structure contains the four-tuple of a TCP connection and a range of TCP |
| * states (specified by ac_start and ac_end). The use of wildcard addresses |
| * and ports is allowed. Connections with a matching four-tuple and a state |
| * within the specified range will be aborted. The valid states for the |
| * ac_start and ac_end fields are in the range TCPS_SYN_SENT to TCPS_TIME_WAIT, |
| * inclusive. |
| * |
| * An application which has its connection aborted by this ioctl will receive |
| * an error that is dependent on the connection state at the time of the abort. |
| * If the connection state is < TCPS_TIME_WAIT, an application should behave as |
| * though a RST packet has been received. If the connection state is equal to |
| * TCPS_TIME_WAIT, the 2MSL timeout will immediately be canceled by the kernel |
| * and all resources associated with the connection will be freed. |
| */ |
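| |
| /* |
| * A sketch of issuing the ioctl from a privileged application follows. It |
| * is illustrative only: apart from ac_start and ac_end, the structure's |
| * field names are not spelled out here, so the four-tuple setup is elided. |
| * |
| * tcp_ioc_abort_conn_t ac; |
| * struct strioctl si; |
| * |
| * (void) memset(&ac, 0, sizeof (ac)); |
| * <fill in the four-tuple; wildcard addresses/ports are allowed> |
| * ac.ac_start = TCPS_SYN_SENT; |
| * ac.ac_end = TCPS_TIME_WAIT; |
| * |
| * si.ic_cmd = TCP_IOC_ABORT_CONN; |
| * si.ic_timout = -1; |
| * si.ic_len = sizeof (ac); |
| * si.ic_dp = (char *)&ac; |
| * if (ioctl(fd, I_STR, &si) < 0) |
| * <no matching connections, or the request failed> |
| */ |
| |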
| static mblk_t *tcp_ioctl_abort_build_msg(tcp_ioc_abort_conn_t *, tcp_t *); |
| static void tcp_ioctl_abort_dump(tcp_ioc_abort_conn_t *); |
| static void tcp_ioctl_abort_handler(tcp_t *, mblk_t *); |
| static int tcp_ioctl_abort(tcp_ioc_abort_conn_t *); |
| static void tcp_ioctl_abort_conn(queue_t *, mblk_t *); |
| static int tcp_ioctl_abort_bucket(tcp_ioc_abort_conn_t *, int, int *, |
| boolean_t); |
| |
| |
| static void tcp_clrqfull(tcp_t *); |
| static void tcp_setqfull(tcp_t *); |
| |
| static struct module_info tcp_rinfo = { |
| #define TCP_MODULE_ID 5105 |
| TCP_MODULE_ID, "tcp", 0, INFPSZ, TCP_RECV_HIWATER, TCP_RECV_LOWATER |
| }; |
| |
| static struct module_info tcp_winfo = { |
| TCP_MODULE_ID, "tcp", 0, INFPSZ, 127, 16 |
| }; |
| |
| /* |
| * Entry points for TCP as a module. It only allows SNMP requests |
| * to pass through. |
| */ |
| struct qinit tcp_mod_rinit = { |
| (pfi_t)putnext, NULL, tcp_open, tcp_modclose, NULL, &tcp_rinfo |
| }; |
| |
| struct qinit tcp_mod_winit = { |
| (pfi_t)tcp_wput_mod, NULL, tcp_open, tcp_modclose, NULL, &tcp_rinfo |
| }; |
| |
| /* |
| * Entry points for TCP as a device. This is the normal case, which supports |
| * the full TCP functionality. |
| */ |
| struct qinit tcp_rinit = { |
| NULL, (pfi_t)tcp_rsrv, tcp_open, tcp_close, NULL, &tcp_rinfo |
| }; |
| |
| struct qinit tcp_winit = { |
| (pfi_t)tcp_wput, (pfi_t)tcp_wsrv, NULL, NULL, NULL, &tcp_winfo |
| }; |
| |
| /* Initial entry point for TCP in socket mode. */ |
| struct qinit tcp_sock_winit = { |
| (pfi_t)tcp_wput_sock, (pfi_t)tcp_wsrv, NULL, NULL, NULL, &tcp_winfo |
| }; |
| |
| /* |
| * Entry points for TCP as an acceptor STREAM opened by sockfs when doing |
| * an accept. Avoid allocating data structures since the eager has already |
| * been created. |
| */ |
| struct qinit tcp_acceptor_rinit = { |
| NULL, (pfi_t)tcp_rsrv, NULL, tcpclose_accept, NULL, &tcp_winfo |
| }; |
| |
| struct qinit tcp_acceptor_winit = { |
| (pfi_t)tcp_wput_accept, NULL, NULL, NULL, NULL, &tcp_winfo |
| }; |
| |
| struct streamtab tcpinfo = { |
| &tcp_rinit, &tcp_winit |
| }; |
| |
| |
| extern squeue_func_t tcp_squeue_wput_proc; |
| extern squeue_func_t tcp_squeue_timer_proc; |
| |
| /* Protected by tcp_g_q_lock */ |
| static queue_t *tcp_g_q; /* Default queue used during detached closes */ |
| kmutex_t tcp_g_q_lock; |
| |
| /* Protected by tcp_hsp_lock */ |
| /* |
| * XXX The host param mechanism should go away and instead we should use |
| * the metrics associated with the routes to determine the default sndspace |
| * and rcvspace. |
| */ |
| static tcp_hsp_t **tcp_hsp_hash; /* Hash table for HSPs */ |
| krwlock_t tcp_hsp_lock; |
| |
| /* |
| * Extra privileged ports. In host byte order. |
| * Protected by tcp_epriv_port_lock. |
| */ |
| #define TCP_NUM_EPRIV_PORTS 64 |
| static int tcp_g_num_epriv_ports = TCP_NUM_EPRIV_PORTS; |
| static uint16_t tcp_g_epriv_ports[TCP_NUM_EPRIV_PORTS] = { 2049, 4045 }; |
| kmutex_t tcp_epriv_port_lock; |
| |
| /* |
| * The smallest anonymous port in the privileged port range in which TCP |
| * looks for a free port. Used by the option TCP_ANONPRIVBIND. |
| */ |
| static in_port_t tcp_min_anonpriv_port = 512; |
| |
| /* Only modified during _init and _fini thus no locking is needed. */ |
| static caddr_t tcp_g_nd; /* Head of 'named dispatch' variable list */ |
| |
| /* Hint not protected by any lock */ |
| static uint_t tcp_next_port_to_try; |
| |
| |
| /* TCP bind hash list - all tcp_t with state >= BOUND. */ |
| static tf_t tcp_bind_fanout[TCP_BIND_FANOUT_SIZE]; |
| |
| /* TCP queue hash list - all tcp_t in case they will be an acceptor. */ |
| static tf_t tcp_acceptor_fanout[TCP_FANOUT_SIZE]; |
| |
| /* |
| * TCP has a private interface for other kernel modules to reserve a |
| * port range for them to use. Once reserved, TCP will not use any ports |
| * in the range. This interface relies on the TCP_EXCLBIND feature. If |
| * the semantics of TCP_EXCLBIND is changed, implementation of this interface |
| * has to be verified. |
| * |
| * There can be TCP_RESERVED_PORTS_ARRAY_MAX_SIZE port ranges. Each port |
| * range can cover at most TCP_RESERVED_PORTS_RANGE_MAX ports. A port |
| * range is [port a, port b] inclusive. And each port range is between |
| * TCP_SMALLEST_RESERVED_PORT and TCP_LARGEST_RESERVED_PORT inclusive. |
| * |
| * Note that the default anonymous port range starts from 32768. There is |
| * no port "collision" between that and the reserved port range. If there |
| * is port collision (because the default smallest anonymous port is lowered |
| * or some apps specifically bind to ports in the reserved port range), the |
| * system may not be able to reserve a port range even if there are enough |
| * unbound ports, as a reserved port range contains consecutive ports. |
| */ |
| #define TCP_RESERVED_PORTS_ARRAY_MAX_SIZE 5 |
| #define TCP_RESERVED_PORTS_RANGE_MAX 1000 |
| #define TCP_SMALLEST_RESERVED_PORT 10240 |
| #define TCP_LARGEST_RESERVED_PORT 20480 |
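| |
| /* |
| * A sketch of how another kernel module might use this interface, based on |
| * the prototypes declared later in this file (the exact meaning of the size |
| * argument and returned bounds is an assumption here; treat as illustrative): |
| * |
| * in_port_t lo, hi; |
| * |
| * if (tcp_reserved_port_add(32, &lo, &hi)) { |
| * <ports [lo, hi] are now off limits to TCP's own port selection> |
| * } |
| * ... |
| * (void) tcp_reserved_port_del(lo, hi); |
| */ |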
| |
| /* Structure to represent those reserved port ranges. */ |
| typedef struct tcp_rport_s { |
| in_port_t lo_port; |
| in_port_t hi_port; |
| tcp_t **temp_tcp_array; |
| } tcp_rport_t; |
| |
| /* The reserved port array. */ |
| static tcp_rport_t tcp_reserved_port[TCP_RESERVED_PORTS_ARRAY_MAX_SIZE]; |
| |
| /* Lock to protect the tcp_reserved_port array. */ |
| static krwlock_t tcp_reserved_port_lock; |
| |
| /* The number of ranges in the array. */ |
| uint32_t tcp_reserved_port_array_size = 0; |
| |
| /* |
| * MIB-2 stuff for SNMP |
| * Note: tcpInErrs {tcp 15} is accumulated in ip.c |
| */ |
| mib2_tcp_t tcp_mib; /* SNMP fixed size info */ |
| kstat_t *tcp_mibkp; /* kstat exporting tcp_mib data */ |
| |
| /* |
| * Object to represent the database of options to search, passed to the |
| * {sock,tpi}optcom_req() interface routine to take care of option |
| * management and associated methods. |
| * XXX These and other externs should ideally move to a TCP header. |
| */ |
| extern optdb_obj_t tcp_opt_obj; |
| extern uint_t tcp_max_optsize; |
| |
| boolean_t tcp_icmp_source_quench = B_FALSE; |
| /* |
| * The following assumes TPI alignment requirements stay along 32-bit |
| * boundaries. |
| */ |
| #define ROUNDUP32(x) \ |
| (((x) + (sizeof (int32_t) - 1)) & ~(sizeof (int32_t) - 1)) |
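| /* e.g. ROUNDUP32(5) == 8 and ROUNDUP32(8) == 8 */ |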
| |
| /* Template for response to info request. */ |
| static struct T_info_ack tcp_g_t_info_ack = { |
| T_INFO_ACK, /* PRIM_type */ |
| 0, /* TSDU_size */ |
| T_INFINITE, /* ETSDU_size */ |
| T_INVALID, /* CDATA_size */ |
| T_INVALID, /* DDATA_size */ |
| sizeof (sin_t), /* ADDR_size */ |
| 0, /* OPT_size - not initialized here */ |
| TIDUSZ, /* TIDU_size */ |
| T_COTS_ORD, /* SERV_type */ |
| TCPS_IDLE, /* CURRENT_state */ |
| (XPG4_1|EXPINLINE) /* PROVIDER_flag */ |
| }; |
| |
| static struct T_info_ack tcp_g_t_info_ack_v6 = { |
| T_INFO_ACK, /* PRIM_type */ |
| 0, /* TSDU_size */ |
| T_INFINITE, /* ETSDU_size */ |
| T_INVALID, /* CDATA_size */ |
| T_INVALID, /* DDATA_size */ |
| sizeof (sin6_t), /* ADDR_size */ |
| 0, /* OPT_size - not initialized here */ |
| TIDUSZ, /* TIDU_size */ |
| T_COTS_ORD, /* SERV_type */ |
| TCPS_IDLE, /* CURRENT_state */ |
| (XPG4_1|EXPINLINE) /* PROVIDER_flag */ |
| }; |
| |
| #define MS 1L |
| #define SECONDS (1000 * MS) |
| #define MINUTES (60 * SECONDS) |
| #define HOURS (60 * MINUTES) |
| #define DAYS (24 * HOURS) |
| |
| #define PARAM_MAX (~(uint32_t)0) |
| |
| /* Max size IP datagram is 64k - 1 */ |
| #define TCP_MSS_MAX_IPV4 (IP_MAXPACKET - (sizeof (ipha_t) + sizeof (tcph_t))) |
| #define TCP_MSS_MAX_IPV6 (IP_MAXPACKET - (sizeof (ip6_t) + sizeof (tcph_t))) |
| /* Max of the above */ |
| #define TCP_MSS_MAX TCP_MSS_MAX_IPV4 |
| |
| /* Largest TCP port number */ |
| #define TCP_MAX_PORT (64 * 1024 - 1) |
| |
| /* |
| * tcp_wroff_xtra is the extra space in front of the TCP/IP header reserved |
| * for the link layer header. It has to be a multiple of 4. |
| */ |
| static tcpparam_t tcp_wroff_xtra_param = { 0, 256, 32, "tcp_wroff_xtra" }; |
| #define tcp_wroff_xtra tcp_wroff_xtra_param.tcp_param_val |
| |
| /* |
| * All of these are alterable, within the min/max values given, at run time. |
| * Note that while the TCP spec suggests a TIME_WAIT of 2*MSL (four |
| * minutes), the default value of "tcp_time_wait_interval" below is one |
| * minute. |
| */ |
| /* BEGIN CSTYLED */ |
| tcpparam_t tcp_param_arr[] = { |
| /*min max value name */ |
| { 1*SECONDS, 10*MINUTES, 1*MINUTES, "tcp_time_wait_interval"}, |
| { 1, PARAM_MAX, 128, "tcp_conn_req_max_q" }, |
| { 0, PARAM_MAX, 1024, "tcp_conn_req_max_q0" }, |
| { 1, 1024, 1, "tcp_conn_req_min" }, |
| { 0*MS, 20*SECONDS, 0*MS, "tcp_conn_grace_period" }, |
| { 128, (1<<30), 1024*1024, "tcp_cwnd_max" }, |
| { 0, 10, 0, "tcp_debug" }, |
| { 1024, (32*1024), 1024, "tcp_smallest_nonpriv_port"}, |
| { 1*SECONDS, PARAM_MAX, 3*MINUTES, "tcp_ip_abort_cinterval"}, |
| { 1*SECONDS, PARAM_MAX, 3*MINUTES, "tcp_ip_abort_linterval"}, |
| { 500*MS, PARAM_MAX, 8*MINUTES, "tcp_ip_abort_interval"}, |
| { 1*SECONDS, PARAM_MAX, 10*SECONDS, "tcp_ip_notify_cinterval"}, |
| { 500*MS, PARAM_MAX, 10*SECONDS, "tcp_ip_notify_interval"}, |
| { 1, 255, 64, "tcp_ipv4_ttl"}, |
| { 10*SECONDS, 10*DAYS, 2*HOURS, "tcp_keepalive_interval"}, |
| { 0, 100, 10, "tcp_maxpsz_multiplier" }, |
| { 1, TCP_MSS_MAX_IPV4, 536, "tcp_mss_def_ipv4"}, |
| { 1, TCP_MSS_MAX_IPV4, TCP_MSS_MAX_IPV4, "tcp_mss_max_ipv4"}, |
| { 1, TCP_MSS_MAX, 108, "tcp_mss_min"}, |
| { 1, (64*1024)-1, (4*1024)-1, "tcp_naglim_def"}, |
| { 1*MS, 20*SECONDS, 3*SECONDS, "tcp_rexmit_interval_initial"}, |
| { 1*MS, 2*HOURS, 60*SECONDS, "tcp_rexmit_interval_max"}, |
| { 1*MS, 2*HOURS, 400*MS, "tcp_rexmit_interval_min"}, |
| { 1*MS, 1*MINUTES, 100*MS, "tcp_deferred_ack_interval" }, |
| { 0, 16, 0, "tcp_snd_lowat_fraction" }, |
| { 0, 128000, 0, "tcp_sth_rcv_hiwat" }, |
| { 0, 128000, 0, "tcp_sth_rcv_lowat" }, |
| { 1, 10000, 3, "tcp_dupack_fast_retransmit" }, |
| { 0, 1, 0, "tcp_ignore_path_mtu" }, |
| { 1024, TCP_MAX_PORT, 32*1024, "tcp_smallest_anon_port"}, |
| { 1024, TCP_MAX_PORT, TCP_MAX_PORT, "tcp_largest_anon_port"}, |
| { TCP_XMIT_LOWATER, (1<<30), TCP_XMIT_HIWATER,"tcp_xmit_hiwat"}, |
| { TCP_XMIT_LOWATER, (1<<30), TCP_XMIT_LOWATER,"tcp_xmit_lowat"}, |
| { TCP_RECV_LOWATER, (1<<30), TCP_RECV_HIWATER,"tcp_recv_hiwat"}, |
| { 1, 65536, 4, "tcp_recv_hiwat_minmss"}, |
| { 1*SECONDS, PARAM_MAX, 675*SECONDS, "tcp_fin_wait_2_flush_interval"}, |
| { 0, TCP_MSS_MAX, 64, "tcp_co_min"}, |
| { 8192, (1<<30), 1024*1024, "tcp_max_buf"}, |
| /* |
| * Question: What default value should I set for tcp_strong_iss? |
| */ |
| { 0, 2, 1, "tcp_strong_iss"}, |
| { 0, 65536, 20, "tcp_rtt_updates"}, |
| { 0, 1, 1, "tcp_wscale_always"}, |
| { 0, 1, 0, "tcp_tstamp_always"}, |
| { 0, 1, 1, "tcp_tstamp_if_wscale"}, |
| { 0*MS, 2*HOURS, 0*MS, "tcp_rexmit_interval_extra"}, |
| { 0, 16, 2, "tcp_deferred_acks_max"}, |
| { 1, 16384, 4, "tcp_slow_start_after_idle"}, |
| { 1, 4, 4, "tcp_slow_start_initial"}, |
| { 10*MS, 50*MS, 20*MS, "tcp_co_timer_interval"}, |
| { 0, 2, 2, "tcp_sack_permitted"}, |
| { 0, 1, 0, "tcp_trace"}, |
| { 0, 1, 1, "tcp_compression_enabled"}, |
| { 0, IPV6_MAX_HOPS, IPV6_DEFAULT_HOPS, "tcp_ipv6_hoplimit"}, |
| { 1, TCP_MSS_MAX_IPV6, 1220, "tcp_mss_def_ipv6"}, |
| { 1, TCP_MSS_MAX_IPV6, TCP_MSS_MAX_IPV6, "tcp_mss_max_ipv6"}, |
| { 0, 1, 0, "tcp_rev_src_routes"}, |
| { 10*MS, 500*MS, 50*MS, "tcp_local_dack_interval"}, |
| { 100*MS, 60*SECONDS, 1*SECONDS, "tcp_ndd_get_info_interval"}, |
| { 0, 16, 8, "tcp_local_dacks_max"}, |
| { 0, 2, 1, "tcp_ecn_permitted"}, |
| { 0, 1, 1, "tcp_rst_sent_rate_enabled"}, |
| { 0, PARAM_MAX, 40, "tcp_rst_sent_rate"}, |
| { 0, 100*MS, 50*MS, "tcp_push_timer_interval"}, |
| { 0, 1, 0, "tcp_use_smss_as_mss_opt"}, |
| { 0, PARAM_MAX, 8*MINUTES, "tcp_keepalive_abort_interval"}, |
| }; |
| /* END CSTYLED */ |
| |
| |
| #define tcp_time_wait_interval tcp_param_arr[0].tcp_param_val |
| #define tcp_conn_req_max_q tcp_param_arr[1].tcp_param_val |
| #define tcp_conn_req_max_q0 tcp_param_arr[2].tcp_param_val |
| #define tcp_conn_req_min tcp_param_arr[3].tcp_param_val |
| #define tcp_conn_grace_period tcp_param_arr[4].tcp_param_val |
| #define tcp_cwnd_max_ tcp_param_arr[5].tcp_param_val |
| #define tcp_dbg tcp_param_arr[6].tcp_param_val |
| #define tcp_smallest_nonpriv_port tcp_param_arr[7].tcp_param_val |
| #define tcp_ip_abort_cinterval tcp_param_arr[8].tcp_param_val |
| #define tcp_ip_abort_linterval tcp_param_arr[9].tcp_param_val |
| #define tcp_ip_abort_interval tcp_param_arr[10].tcp_param_val |
| #define tcp_ip_notify_cinterval tcp_param_arr[11].tcp_param_val |
| #define tcp_ip_notify_interval tcp_param_arr[12].tcp_param_val |
| #define tcp_ipv4_ttl tcp_param_arr[13].tcp_param_val |
| #define tcp_keepalive_interval_high tcp_param_arr[14].tcp_param_max |
| #define tcp_keepalive_interval tcp_param_arr[14].tcp_param_val |
| #define tcp_keepalive_interval_low tcp_param_arr[14].tcp_param_min |
| #define tcp_maxpsz_multiplier tcp_param_arr[15].tcp_param_val |
| #define tcp_mss_def_ipv4 tcp_param_arr[16].tcp_param_val |
| #define tcp_mss_max_ipv4 tcp_param_arr[17].tcp_param_val |
| #define tcp_mss_min tcp_param_arr[18].tcp_param_val |
| #define tcp_naglim_def tcp_param_arr[19].tcp_param_val |
| #define tcp_rexmit_interval_initial tcp_param_arr[20].tcp_param_val |
| #define tcp_rexmit_interval_max tcp_param_arr[21].tcp_param_val |
| #define tcp_rexmit_interval_min tcp_param_arr[22].tcp_param_val |
| #define tcp_deferred_ack_interval tcp_param_arr[23].tcp_param_val |
| #define tcp_snd_lowat_fraction tcp_param_arr[24].tcp_param_val |
| #define tcp_sth_rcv_hiwat tcp_param_arr[25].tcp_param_val |
| #define tcp_sth_rcv_lowat tcp_param_arr[26].tcp_param_val |
| #define tcp_dupack_fast_retransmit tcp_param_arr[27].tcp_param_val |
| #define tcp_ignore_path_mtu tcp_param_arr[28].tcp_param_val |
| #define tcp_smallest_anon_port tcp_param_arr[29].tcp_param_val |
| #define tcp_largest_anon_port tcp_param_arr[30].tcp_param_val |
| #define tcp_xmit_hiwat tcp_param_arr[31].tcp_param_val |
| #define tcp_xmit_lowat tcp_param_arr[32].tcp_param_val |
| #define tcp_recv_hiwat tcp_param_arr[33].tcp_param_val |
| #define tcp_recv_hiwat_minmss tcp_param_arr[34].tcp_param_val |
| #define tcp_fin_wait_2_flush_interval tcp_param_arr[35].tcp_param_val |
| #define tcp_co_min tcp_param_arr[36].tcp_param_val |
| #define tcp_max_buf tcp_param_arr[37].tcp_param_val |
| #define tcp_strong_iss tcp_param_arr[38].tcp_param_val |
| #define tcp_rtt_updates tcp_param_arr[39].tcp_param_val |
| #define tcp_wscale_always tcp_param_arr[40].tcp_param_val |
| #define tcp_tstamp_always tcp_param_arr[41].tcp_param_val |
| #define tcp_tstamp_if_wscale tcp_param_arr[42].tcp_param_val |
| #define tcp_rexmit_interval_extra tcp_param_arr[43].tcp_param_val |
| #define tcp_deferred_acks_max tcp_param_arr[44].tcp_param_val |
| #define tcp_slow_start_after_idle tcp_param_arr[45].tcp_param_val |
| #define tcp_slow_start_initial tcp_param_arr[46].tcp_param_val |
| #define tcp_co_timer_interval tcp_param_arr[47].tcp_param_val |
| #define tcp_sack_permitted tcp_param_arr[48].tcp_param_val |
| #define tcp_trace tcp_param_arr[49].tcp_param_val |
| #define tcp_compression_enabled tcp_param_arr[50].tcp_param_val |
| #define tcp_ipv6_hoplimit tcp_param_arr[51].tcp_param_val |
| #define tcp_mss_def_ipv6 tcp_param_arr[52].tcp_param_val |
| #define tcp_mss_max_ipv6 tcp_param_arr[53].tcp_param_val |
| #define tcp_rev_src_routes tcp_param_arr[54].tcp_param_val |
| #define tcp_local_dack_interval tcp_param_arr[55].tcp_param_val |
| #define tcp_ndd_get_info_interval tcp_param_arr[56].tcp_param_val |
| #define tcp_local_dacks_max tcp_param_arr[57].tcp_param_val |
| #define tcp_ecn_permitted tcp_param_arr[58].tcp_param_val |
| #define tcp_rst_sent_rate_enabled tcp_param_arr[59].tcp_param_val |
| #define tcp_rst_sent_rate tcp_param_arr[60].tcp_param_val |
| #define tcp_push_timer_interval tcp_param_arr[61].tcp_param_val |
| #define tcp_use_smss_as_mss_opt tcp_param_arr[62].tcp_param_val |
| #define tcp_keepalive_abort_interval_high tcp_param_arr[63].tcp_param_max |
| #define tcp_keepalive_abort_interval tcp_param_arr[63].tcp_param_val |
| #define tcp_keepalive_abort_interval_low tcp_param_arr[63].tcp_param_min |
| |
| /* |
| * tcp_mdt_hdr_{head,tail}_min are the leading and trailing spaces of |
| * each header fragment in the header buffer. Each parameter value has |
| * to be a multiple of 4 (32-bit aligned). |
| */ |
| static tcpparam_t tcp_mdt_head_param = { 32, 256, 32, "tcp_mdt_hdr_head_min" }; |
| static tcpparam_t tcp_mdt_tail_param = { 0, 256, 32, "tcp_mdt_hdr_tail_min" }; |
| #define tcp_mdt_hdr_head_min tcp_mdt_head_param.tcp_param_val |
| #define tcp_mdt_hdr_tail_min tcp_mdt_tail_param.tcp_param_val |
| |
| /* |
| * tcp_mdt_max_pbufs is the upper limit value that tcp uses to figure out |
| * the maximum number of payload buffers associated with each Multidata. |
| */ |
| static tcpparam_t tcp_mdt_max_pbufs_param = |
| { 1, MULTIDATA_MAX_PBUFS, MULTIDATA_MAX_PBUFS, "tcp_mdt_max_pbufs" }; |
| #define tcp_mdt_max_pbufs tcp_mdt_max_pbufs_param.tcp_param_val |
| |
| /* Round up the value to the nearest mss. */ |
| #define MSS_ROUNDUP(value, mss) ((((value) - 1) / (mss) + 1) * (mss)) |
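| /* |
| * For example, with an mss of 1460, MSS_ROUNDUP(2920, 1460) stays at 2920 |
| * while MSS_ROUNDUP(3000, 1460) rounds up to 4380 (3 * 1460). |
| */ |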
| |
| /* |
| * Set ECN capable transport (ECT) code point in IP header. |
| * |
| * Note that there are 2 ECT code points '01' and '10', which are called |
| * ECT(1) and ECT(0) respectively. Here we follow the original ECT code |
| * point ECT(0) for TCP as described in RFC 2481. |
| */ |
| #define SET_ECT(tcp, iph) \ |
| if ((tcp)->tcp_ipversion == IPV4_VERSION) { \ |
| /* We need to clear the code point first. */ \ |
| ((ipha_t *)(iph))->ipha_type_of_service &= 0xFC; \ |
| ((ipha_t *)(iph))->ipha_type_of_service |= IPH_ECN_ECT0; \ |
| } else { \ |
| ((ip6_t *)(iph))->ip6_vcf &= htonl(0xFFCFFFFF); \ |
| ((ip6_t *)(iph))->ip6_vcf |= htonl(IPH_ECN_ECT0 << 20); \ |
| } |
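| /* |
| * Illustration (assuming IPH_ECN_ECT0 is 0x2): for IPv4, the low two TOS |
| * bits (the ECN field) are cleared and then set to ECT(0), so a TOS of |
| * 0x10 becomes 0x12 while the DSCP bits are preserved. For IPv6 the same |
| * two bits live at bits 20-21 of ip6_vcf, hence the 0xFFCFFFFF mask and |
| * the (IPH_ECN_ECT0 << 20) shift. |
| */ |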
| |
| /* |
| * The format argument to pass to tcp_display(). |
| * DISP_PORT_ONLY means that the returned string has only port info. |
| * DISP_ADDR_AND_PORT means that the returned string also contains the |
| * remote and local IP address. |
| */ |
| #define DISP_PORT_ONLY 1 |
| #define DISP_ADDR_AND_PORT 2 |
| |
| /* |
| * This controls the rate at which certain ndd info report functions can be |
| * used by non-privileged users. It stores the last time such info was |
| * requested. When one of those report functions is called again, the |
| * elapsed time since this value is compared against the ndd param |
| * tcp_ndd_get_info_interval. |
| */ |
| static clock_t tcp_last_ndd_get_info_time = 0; |
| #define NDD_TOO_QUICK_MSG \ |
| "ndd get info rate too high for non-priviledged users, try again " \ |
| "later.\n" |
| #define NDD_OUT_OF_BUF_MSG "<< Out of buffer >>\n" |
| |
| #define IS_VMLOANED_MBLK(mp) \ |
| (((mp)->b_datap->db_struioflag & STRUIO_ZC) != 0) |
| |
| /* |
| * These two variables control the rate at which TCP generates RSTs in |
| * response to segments not belonging to any connection. We limit |
| * TCP to sending out at most tcp_rst_sent_rate (ndd param) RSTs in |
| * each 1 second interval. This is to protect TCP against DoS attacks. |
| */ |
| static clock_t tcp_last_rst_intrvl; |
| static uint32_t tcp_rst_cnt; |
| |
| /* The number of RSTs not sent because of the rate limit. */ |
| static uint32_t tcp_rst_unsent; |
| |
| /* Enable or disable b_cont M_MULTIDATA chaining for MDT. */ |
| boolean_t tcp_mdt_chain = B_TRUE; |
| |
| /* |
| * MDT threshold in the form of effective send MSS multiplier; we take |
| * the MDT path if the amount of unsent data exceeds the threshold value |
| * (default threshold is 1*SMSS). |
| */ |
| uint_t tcp_mdt_smss_threshold = 1; |
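| /* |
| * For example, with an effective send MSS of 1460 and the default |
| * multiplier of 1, the MDT path is taken once more than 1460 bytes of |
| * data are unsent (the other MDT eligibility checks permitting). |
| */ |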
| |
| uint32_t do_tcpzcopy = 1; /* 0: disable, 1: enable, 2: force */ |
| |
| /* |
| * If set, forces all connections to obey the value of the |
| * tcp_maxpsz_multiplier tunable, settable via NDD. Otherwise, the |
| * per-connection behavior is determined dynamically during |
| * tcp_adapt_ire(), which is the default. |
| */ |
| boolean_t tcp_static_maxpsz = B_FALSE; |
| |
| /* If set to 0, pick ephemeral port sequentially; otherwise randomly. */ |
| uint32_t tcp_random_anon_port = 1; |
| |
| /* |
| * If tcp_drop_ack_unsent_cnt is greater than 0 and TCP receives more |
| * than tcp_drop_ack_unsent_cnt ACKs which acknowledge unsent data, |
| * TCP will stop responding with an ACK. RFC 793 requires that TCP |
| * respond with an ACK to such a bogus ACK. By not following the RFC, |
| * we prevent TCP from getting into an ACK storm if somehow an |
| * attacker successfully spoofs an acceptable segment to our peer, |
| * or when our peer is "confused." |
| */ |
| uint32_t tcp_drop_ack_unsent_cnt = 10; |
| |
| /* |
| * Hook functions to enable cluster networking |
| * On non-clustered systems these vectors must always be NULL. |
| */ |
| |
| void (*cl_inet_listen)(uint8_t protocol, sa_family_t addr_family, |
| uint8_t *laddrp, in_port_t lport) = NULL; |
| void (*cl_inet_unlisten)(uint8_t protocol, sa_family_t addr_family, |
| uint8_t *laddrp, in_port_t lport) = NULL; |
| void (*cl_inet_connect)(uint8_t protocol, sa_family_t addr_family, |
| uint8_t *laddrp, in_port_t lport, |
| uint8_t *faddrp, in_port_t fport) = NULL; |
| void (*cl_inet_disconnect)(uint8_t protocol, sa_family_t addr_family, |
| uint8_t *laddrp, in_port_t lport, |
| uint8_t *faddrp, in_port_t fport) = NULL; |
| |
| /* |
| * The following are defined in ip.c |
| */ |
| extern int (*cl_inet_isclusterwide)(uint8_t protocol, sa_family_t addr_family, |
| uint8_t *laddrp); |
| extern uint32_t (*cl_inet_ipident)(uint8_t protocol, sa_family_t addr_family, |
| uint8_t *laddrp, uint8_t *faddrp); |
| |
| #define CL_INET_CONNECT(tcp) { \ |
| if (cl_inet_connect != NULL) { \ |
| /* \ |
| * Running in cluster mode - register active connection \ |
| * information \ |
| */ \ |
| if ((tcp)->tcp_ipversion == IPV4_VERSION) { \ |
| if ((tcp)->tcp_ipha->ipha_src != 0) { \ |
| (*cl_inet_connect)(IPPROTO_TCP, AF_INET,\ |
| (uint8_t *)(&((tcp)->tcp_ipha->ipha_src)),\ |
| (in_port_t)(tcp)->tcp_lport, \ |
| (uint8_t *)(&((tcp)->tcp_ipha->ipha_dst)),\ |
| (in_port_t)(tcp)->tcp_fport); \ |
| } \ |
| } else { \ |
| if (!IN6_IS_ADDR_UNSPECIFIED( \ |
| &(tcp)->tcp_ip6h->ip6_src)) {\ |
| (*cl_inet_connect)(IPPROTO_TCP, AF_INET6,\ |
| (uint8_t *)(&((tcp)->tcp_ip6h->ip6_src)),\ |
| (in_port_t)(tcp)->tcp_lport, \ |
| (uint8_t *)(&((tcp)->tcp_ip6h->ip6_dst)),\ |
| (in_port_t)(tcp)->tcp_fport); \ |
| } \ |
| } \ |
| } \ |
| } |
| |
| #define CL_INET_DISCONNECT(tcp) { \ |
| if (cl_inet_disconnect != NULL) { \ |
| /* \ |
| * Running in cluster mode - deregister active \ |
| * connection information \ |
| */ \ |
| if ((tcp)->tcp_ipversion == IPV4_VERSION) { \ |
| if ((tcp)->tcp_ip_src != 0) { \ |
| (*cl_inet_disconnect)(IPPROTO_TCP, \ |
| AF_INET, \ |
| (uint8_t *)(&((tcp)->tcp_ip_src)),\ |
| (in_port_t)(tcp)->tcp_lport, \ |
| (uint8_t *) \ |
| (&((tcp)->tcp_ipha->ipha_dst)),\ |
| (in_port_t)(tcp)->tcp_fport); \ |
| } \ |
| } else { \ |
| if (!IN6_IS_ADDR_UNSPECIFIED( \ |
| &(tcp)->tcp_ip_src_v6)) { \ |
| (*cl_inet_disconnect)(IPPROTO_TCP, AF_INET6,\ |
| (uint8_t *)(&((tcp)->tcp_ip_src_v6)),\ |
| (in_port_t)(tcp)->tcp_lport, \ |
| (uint8_t *) \ |
| (&((tcp)->tcp_ip6h->ip6_dst)),\ |
| (in_port_t)(tcp)->tcp_fport); \ |
| } \ |
| } \ |
| } \ |
| } |
| |
| /* |
| * Cluster networking hook for traversing current connection list. |
| * This routine is used to extract the current list of live connections |
| * which must continue to be dispatched to this node. |
| */ |
| int cl_tcp_walk_list(int (*callback)(cl_tcp_info_t *, void *), void *arg); |
| |
| #define IPH_TCPH_CHECKSUMP(ipha, hlen) \ |
| ((uint16_t *)(((uchar_t *)(ipha)) + ((hlen) + 16))) |
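| /* |
| * The TCP checksum field sits at offset 16 within the TCP header, so for |
| * a simple 20-byte IPv4 header (hlen of IP_SIMPLE_HDR_LENGTH) this points |
| * at byte offset 36 from the start of the IP header. |
| */ |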
| |
| #ifdef _BIG_ENDIAN |
| #define IP_TCP_CSUM_COMP IPPROTO_TCP |
| #else |
| #define IP_TCP_CSUM_COMP (IPPROTO_TCP << 8) |
| #endif |
| |
| #define IP_HDR_CKSUM(ipha, sum, v_hlen_tos_len, ttl_protocol) { \ |
| (sum) += (ttl_protocol) + (ipha)->ipha_ident + \ |
| ((v_hlen_tos_len) >> 16) + \ |
| ((v_hlen_tos_len) & 0xFFFF) + \ |
| (ipha)->ipha_fragment_offset_and_flags; \ |
| (sum) = (((sum) & 0xFFFF) + ((sum) >> 16)); \ |
| (sum) = ~((sum) + ((sum) >> 16)); \ |
| (ipha)->ipha_hdr_checksum = (uint16_t)(sum); \ |
| } |
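| /* |
| * IP_HDR_CKSUM finishes a 16-bit one's complement header checksum: it adds |
| * ttl_protocol, the ident, the two halves of v_hlen_tos_len and the |
| * fragment offset/flags field to the sum passed in (which the caller is |
| * presumably expected to pre-seed with the remaining header words, e.g. |
| * the addresses), folds the carries back into the low 16 bits twice, |
| * complements the result and stores it as ipha_hdr_checksum. |
| */ |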
| |
| /* |
| * Macros that determine whether or not IP processing is needed for TCP. |
| */ |
| #define TCP_IPOPT_POLICY_V4(tcp) \ |
| ((tcp)->tcp_ipversion == IPV4_VERSION && \ |
| ((tcp)->tcp_ip_hdr_len != IP_SIMPLE_HDR_LENGTH || \ |
| CONN_OUTBOUND_POLICY_PRESENT((tcp)->tcp_connp) || \ |
| CONN_INBOUND_POLICY_PRESENT((tcp)->tcp_connp))) |
| |
| #define TCP_IPOPT_POLICY_V6(tcp) \ |
| ((tcp)->tcp_ipversion == IPV6_VERSION && \ |
| ((tcp)->tcp_ip_hdr_len != IPV6_HDR_LEN || \ |
| CONN_OUTBOUND_POLICY_PRESENT_V6((tcp)->tcp_connp) || \ |
| CONN_INBOUND_POLICY_PRESENT_V6((tcp)->tcp_connp))) |
| |
| #define TCP_LOOPBACK_IP(tcp) \ |
| (TCP_IPOPT_POLICY_V4(tcp) || TCP_IPOPT_POLICY_V6(tcp) || \ |
| !CONN_IS_MD_FASTPATH((tcp)->tcp_connp)) |
| |
| boolean_t do_tcp_fusion = B_TRUE; |
| |
| /* |
| * This routine gets called by the eager tcp upon changing state from |
| * SYN_RCVD to ESTABLISHED. It fuses a direct path between itself |
| * and the active connect tcp such that the regular tcp processing |
| * may be bypassed under allowable circumstances. Because the fusion |
| * requires both endpoints to be in the same squeue, it does not work |
| * for simultaneous active connects because there is no easy way to |
| * switch from one squeue to another once the connection is created. |
| * This is different from the eager tcp case where we assign it the |
| * same squeue as the one given to the active connect tcp during open. |
| */ |
| static void |
| tcp_fuse(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph) |
| { |
| conn_t *peer_connp, *connp = tcp->tcp_connp; |
| tcp_t *peer_tcp; |
| |
| ASSERT(!tcp->tcp_fused); |
| ASSERT(tcp->tcp_loopback); |
| ASSERT(tcp->tcp_loopback_peer == NULL); |
| /* |
| * We need to check the listener tcp to make sure it's a socket |
| * endpoint, but we can't really use tcp_listener since we get |
| * here after sending up T_CONN_IND and tcp_wput_accept() may be |
| * called independently, at which point tcp_listener is cleared; |
| * this is why we use tcp_saved_listener. The listener itself |
| * is guaranteed to be around until tcp_accept_finish() is called |
| * on this eager -- this won't happen until we're done since |
| * we're inside the eager's perimeter now. |
| */ |
| ASSERT(tcp->tcp_saved_listener != NULL); |
| |
| /* |
| * Lookup peer endpoint; search for the remote endpoint having |
| * the reversed address-port quadruplet in ESTABLISHED state, |
| * which is guaranteed to be unique in the system. Zone check |
| * is applied accordingly for loopback address, but not for |
| * local address since we want fusion to happen across Zones. |
| */ |
| if (tcp->tcp_ipversion == IPV4_VERSION) { |
| peer_connp = ipcl_conn_tcp_lookup_reversed_ipv4(connp, |
| (ipha_t *)iphdr, tcph); |
| } else { |
| peer_connp = ipcl_conn_tcp_lookup_reversed_ipv6(connp, |
| (ip6_t *)iphdr, tcph); |
| } |
| |
| /* |
| * We can only proceed if peer exists, resides in the same squeue |
| * as our conn and is not raw-socket. The squeue assignment of |
| * this eager tcp was done earlier at the time of SYN processing |
| * in ip_fanout_tcp{_v6}. Note that being on the same squeue by itself |
| * doesn't guarantee it is safe to fuse, hence we perform |
| * additional tests below. |
| */ |
| ASSERT(peer_connp == NULL || peer_connp != connp); |
| if (peer_connp == NULL || peer_connp->conn_sqp != connp->conn_sqp || |
| !IPCL_IS_TCP(peer_connp)) { |
| if (peer_connp != NULL) { |
| TCP_STAT(tcp_fusion_unqualified); |
| CONN_DEC_REF(peer_connp); |
| } |
| return; |
| } |
| peer_tcp = peer_connp->conn_tcp; /* active connect tcp */ |
| |
| ASSERT(peer_tcp != NULL && peer_tcp != tcp && !peer_tcp->tcp_fused); |
| ASSERT(peer_tcp->tcp_loopback && peer_tcp->tcp_loopback_peer == NULL); |
| ASSERT(peer_connp->conn_sqp == connp->conn_sqp); |
| |
| /* |
| * Fuse the endpoints; we perform further checks against both |
| * tcp endpoints to ensure that a fusion is allowed to happen. |
| * In particular we bail out for TPI, non-simple TCP/IP or if |
| * IPsec/IPQoS policy exists. We could actually do it for the |
| * XTI/TLI/TPI case but this requires more testing, so for now |
| * we handle only the socket case. |
| */ |
| if (!tcp->tcp_unfusable && !peer_tcp->tcp_unfusable && |
| TCP_IS_SOCKET(tcp->tcp_saved_listener) && TCP_IS_SOCKET(peer_tcp) && |
| !TCP_LOOPBACK_IP(tcp) && !TCP_LOOPBACK_IP(peer_tcp) && |
| !IPP_ENABLED(IPP_LOCAL_OUT|IPP_LOCAL_IN)) { |
| mblk_t *mp; |
| struct stroptions *stropt; |
| queue_t *peer_rq = peer_tcp->tcp_rq; |
| size_t sth_hiwat; |
| |
| ASSERT(!TCP_IS_DETACHED(peer_tcp) && peer_rq != NULL); |
| |
| /* |
| * We need to drain data on both endpoints during unfuse. |
| * If we need to send up SIGURG at the time of draining, |
| * we want to be sure that an mblk is readily available. |
| * This is why we pre-allocate the M_PCSIG mblks for both |
| * endpoints which will only be used during/after unfuse. |
| */ |
| if ((mp = allocb(1, BPRI_HI)) == NULL) { |
| CONN_DEC_REF(peer_connp); |
| return; |
| } |
| ASSERT(tcp->tcp_fused_sigurg_mp == NULL); |
| tcp->tcp_fused_sigurg_mp = mp; |
| |
| if ((mp = allocb(1, BPRI_HI)) == NULL) { |
| freeb(tcp->tcp_fused_sigurg_mp); |
| tcp->tcp_fused_sigurg_mp = NULL; |
| CONN_DEC_REF(peer_connp); |
| return; |
| } |
| ASSERT(peer_tcp->tcp_fused_sigurg_mp == NULL); |
| peer_tcp->tcp_fused_sigurg_mp = mp; |
| |
| /* Allocate M_SETOPTS mblk */ |
| mp = allocb(sizeof (*stropt), BPRI_HI); |
| if (mp == NULL) { |
| freeb(tcp->tcp_fused_sigurg_mp); |
| tcp->tcp_fused_sigurg_mp = NULL; |
| freeb(peer_tcp->tcp_fused_sigurg_mp); |
| peer_tcp->tcp_fused_sigurg_mp = NULL; |
| CONN_DEC_REF(peer_connp); |
| return; |
| } |
| |
| /* Fuse both endpoints */ |
| peer_tcp->tcp_loopback_peer = tcp; |
| tcp->tcp_loopback_peer = peer_tcp; |
| peer_tcp->tcp_fused = tcp->tcp_fused = B_TRUE; |
| |
| /* |
| * We never use regular tcp paths in fusion and should |
| * therefore clear tcp_unsent on both endpoints. Having |
| * them set to non-zero values means asking for trouble |
| * especially after unfuse, where we may end up sending |
| * through regular tcp paths which expect xmit_list and |
| * friends to be correctly setup. |
| */ |
| peer_tcp->tcp_unsent = tcp->tcp_unsent = 0; |
| |
| tcp_timers_stop(tcp); |
| tcp_timers_stop(peer_tcp); |
| |
| /* |
| * Set the stream head's write offset value to zero, since we |
| * won't be needing any room for TCP/IP headers, and tell it |
| * to not break up the writes. This would reduce the amount |
| * of work done by kmem. In addition, we set the receive |
| * buffer to twice that of q_hiwat in order to simulate the |
| * non-fusion case. Note that we can only do this for the |
| * active connect tcp since our eager is still detached; |
| * it will be dealt with later in tcp_accept_finish(). |
| */ |
| DB_TYPE(mp) = M_SETOPTS; |
| mp->b_wptr += sizeof (*stropt); |
| |
| sth_hiwat = peer_rq->q_hiwat << 1; |
| if (sth_hiwat > tcp_max_buf) |
| sth_hiwat = tcp_max_buf; |
| |
| stropt = (struct stroptions *)mp->b_rptr; |
| stropt->so_flags = SO_MAXBLK | SO_WROFF | SO_HIWAT; |
| stropt->so_maxblk = tcp_maxpsz_set(peer_tcp, B_FALSE); |
| stropt->so_wroff = 0; |
| stropt->so_hiwat = MAX(sth_hiwat, tcp_sth_rcv_hiwat); |
| |
| /* Send the options up */ |
| putnext(peer_rq, mp); |
| } else { |
| TCP_STAT(tcp_fusion_unqualified); |
| } |
| CONN_DEC_REF(peer_connp); |
| } |
| |
| /* |
| * Unfuse a previously-fused pair of tcp loopback endpoints. |
| */ |
| static void |
| tcp_unfuse(tcp_t *tcp) |
| { |
| tcp_t *peer_tcp = tcp->tcp_loopback_peer; |
| |
| ASSERT(tcp->tcp_fused && peer_tcp != NULL); |
| ASSERT(peer_tcp->tcp_fused && peer_tcp->tcp_loopback_peer == tcp); |
| ASSERT(tcp->tcp_connp->conn_sqp == peer_tcp->tcp_connp->conn_sqp); |
| ASSERT(tcp->tcp_unsent == 0 && peer_tcp->tcp_unsent == 0); |
| ASSERT(tcp->tcp_fused_sigurg_mp != NULL); |
| ASSERT(peer_tcp->tcp_fused_sigurg_mp != NULL); |
| |
| /* |
| * Drain any pending data; the detached check is needed because |
| * we may be called from tcp_fuse_output(). Note that in case of |
| * a detached tcp, the draining will happen later after the tcp |
| * is unfused. For non-urgent data, this can be handled by the |
| * regular tcp_rcv_drain(). If we have urgent data sitting in |
| * the receive list, we will need to send up a SIGURG signal first |
| * before draining the data. All of these will be handled by the |
| * code in tcp_fuse_rcv_drain() when called from tcp_rcv_drain(). |
| */ |
| if (!TCP_IS_DETACHED(tcp)) { |
| (void) tcp_fuse_rcv_drain(tcp->tcp_rq, tcp, |
| &tcp->tcp_fused_sigurg_mp); |
| } |
| if (!TCP_IS_DETACHED(peer_tcp)) { |
| (void) tcp_fuse_rcv_drain(peer_tcp->tcp_rq, peer_tcp, |
| &peer_tcp->tcp_fused_sigurg_mp); |
| } |
| /* Lift up any flow-control conditions */ |
| if (tcp->tcp_flow_stopped) { |
| tcp_clrqfull(tcp); |
| tcp->tcp_flow_stopped = B_FALSE; |
| TCP_STAT(tcp_fusion_backenabled); |
| } |
| if (peer_tcp->tcp_flow_stopped) { |
| tcp_clrqfull(peer_tcp); |
| peer_tcp->tcp_flow_stopped = B_FALSE; |
| TCP_STAT(tcp_fusion_backenabled); |
| } |
| |
| /* Free up M_PCSIG mblk(s) if not needed */ |
| if (!tcp->tcp_fused_sigurg && tcp->tcp_fused_sigurg_mp != NULL) { |
| freeb(tcp->tcp_fused_sigurg_mp); |
| tcp->tcp_fused_sigurg_mp = NULL; |
| } |
| if (!peer_tcp->tcp_fused_sigurg && |
| peer_tcp->tcp_fused_sigurg_mp != NULL) { |
| freeb(peer_tcp->tcp_fused_sigurg_mp); |
| peer_tcp->tcp_fused_sigurg_mp = NULL; |
| } |
| |
| /* |
| * Update th_seq and th_ack in the header template |
| */ |
| U32_TO_ABE32(tcp->tcp_snxt, tcp->tcp_tcph->th_seq); |
| U32_TO_ABE32(tcp->tcp_rnxt, tcp->tcp_tcph->th_ack); |
| U32_TO_ABE32(peer_tcp->tcp_snxt, peer_tcp->tcp_tcph->th_seq); |
| U32_TO_ABE32(peer_tcp->tcp_rnxt, peer_tcp->tcp_tcph->th_ack); |
| |
| /* Unfuse the endpoints */ |
| peer_tcp->tcp_fused = tcp->tcp_fused = B_FALSE; |
| peer_tcp->tcp_loopback_peer = tcp->tcp_loopback_peer = NULL; |
| } |
| |
| /* |
| * Fusion output routine for urgent data. This routine is called by |
| * tcp_fuse_output() for handling non-M_DATA mblks. |
| */ |
| static void |
| tcp_fuse_output_urg(tcp_t *tcp, mblk_t *mp) |
| { |
| mblk_t *mp1; |
| struct T_exdata_ind *tei; |
| tcp_t *peer_tcp = tcp->tcp_loopback_peer; |
| mblk_t *head, *prev_head = NULL; |
| |
| ASSERT(tcp->tcp_fused); |
| ASSERT(peer_tcp != NULL && peer_tcp->tcp_loopback_peer == tcp); |
| ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO); |
| ASSERT(mp->b_cont != NULL && DB_TYPE(mp->b_cont) == M_DATA); |
| ASSERT(MBLKL(mp) >= sizeof (*tei) && MBLKL(mp->b_cont) > 0); |
| |
| /* |
| * Urgent data arrives in the form of T_EXDATA_REQ from above. |
| * Each occurrence denotes a new urgent pointer. For each new |
| * urgent pointer we signal (SIGURG) the receiving app to indicate |
| * that it needs to go into urgent mode. This is similar to the |
| * urgent data handling in the regular tcp. We don't need to keep |
| * track of where the urgent pointer is, because each T_EXDATA_REQ |
| * "advances" the urgent pointer for us. |
| * |
| * The actual urgent data carried by T_EXDATA_REQ is then prepended |
| * by a T_EXDATA_IND before being enqueued behind any existing data |
| * destined for the receiving app. There is only a single urgent |
| * pointer (out-of-band mark) for a given tcp. If the new urgent |
| * data arrives before the receiving app reads some existing urgent |
| * data, the previous marker is lost. This behavior is emulated |
| * accordingly below, by removing any existing T_EXDATA_IND messages |
| * and essentially converting old urgent data into non-urgent. |
| */ |
| ASSERT(tcp->tcp_valid_bits & TCP_URG_VALID); |
| /* Let sender get out of urgent mode */ |
| tcp->tcp_valid_bits &= ~TCP_URG_VALID; |
| |
| /* |
| * Send up SIGURG to the receiving peer; if the peer is detached |
| * or if we can't allocate the M_PCSIG, indicate that we need to |
| * signal upon draining to the peer by marking tcp_fused_sigurg. |
| * This flag will only get cleared once SIGURG is delivered and |
| * is not affected by the tcp_fused flag -- delivery will still |
| * happen even after an endpoint is unfused, to handle the case |
| * where the sending endpoint immediately closes/unfuses after |
| * sending urgent data and the accept is not yet finished. |
| */ |
| if (!TCP_IS_DETACHED(peer_tcp) && |
| ((mp1 = allocb(1, BPRI_HI)) != NULL || |
| (mp1 = allocb_tryhard(1)) != NULL)) { |
| peer_tcp->tcp_fused_sigurg = B_FALSE; |
| /* Send up the signal */ |
| DB_TYPE(mp1) = M_PCSIG; |
| *mp1->b_wptr++ = (uchar_t)SIGURG; |
| putnext(peer_tcp->tcp_rq, mp1); |
| } else { |
| peer_tcp->tcp_fused_sigurg = B_TRUE; |
| } |
| |
| /* Reuse T_EXDATA_REQ mblk for T_EXDATA_IND */ |
| DB_TYPE(mp) = M_PROTO; |
| tei = (struct T_exdata_ind *)mp->b_rptr; |
| tei->PRIM_type = T_EXDATA_IND; |
| tei->MORE_flag = 0; |
| mp->b_wptr = (uchar_t *)&tei[1]; |
| |
| TCP_STAT(tcp_fusion_urg); |
| BUMP_MIB(&tcp_mib, tcpOutUrg); |
| |
| head = peer_tcp->tcp_rcv_list; |
| while (head != NULL) { |
| /* |
| * Remove existing T_EXDATA_IND, keep the data which follows |
| * it and relink our list. Note that we don't modify the |
| * tcp_rcv_last_tail since it never points to T_EXDATA_IND. |
| */ |
| if (DB_TYPE(head) != M_DATA) { |
| mp1 = head; |
| |
| ASSERT(DB_TYPE(mp1->b_cont) == M_DATA); |
| head = mp1->b_cont; |
| mp1->b_cont = NULL; |
| head->b_next = mp1->b_next; |
| mp1->b_next = NULL; |
| if (prev_head != NULL) |
| prev_head->b_next = head; |
| if (peer_tcp->tcp_rcv_list == mp1) |
| peer_tcp->tcp_rcv_list = head; |
| if (peer_tcp->tcp_rcv_last_head == mp1) |
| peer_tcp->tcp_rcv_last_head = head; |
| freeb(mp1); |
| } |
| prev_head = head; |
| head = head->b_next; |
| } |
| } |
| |
| /* |
| * Fusion output routine, called by tcp_output() and tcp_wput_proto(). |
| */ |
| static boolean_t |
| tcp_fuse_output(tcp_t *tcp, mblk_t *mp) |
| { |
| tcp_t *peer_tcp = tcp->tcp_loopback_peer; |
| queue_t *peer_rq; |
| mblk_t *mp_tail = mp; |
| uint32_t send_size = 0; |
| |
| ASSERT(tcp->tcp_fused); |
| ASSERT(peer_tcp != NULL && peer_tcp->tcp_loopback_peer == tcp); |
| ASSERT(tcp->tcp_connp->conn_sqp == peer_tcp->tcp_connp->conn_sqp); |
| ASSERT(DB_TYPE(mp) == M_DATA || DB_TYPE(mp) == M_PROTO || |
| DB_TYPE(mp) == M_PCPROTO); |
| |
| peer_rq = peer_tcp->tcp_rq; |
| |
| /* If this connection requires IP, unfuse and use regular path */ |
| if (TCP_LOOPBACK_IP(tcp) || TCP_LOOPBACK_IP(peer_tcp) || |
| IPP_ENABLED(IPP_LOCAL_OUT|IPP_LOCAL_IN)) { |
| TCP_STAT(tcp_fusion_aborted); |
| tcp_unfuse(tcp); |
| return (B_FALSE); |
| } |
| |
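| /* Tally up the M_DATA bytes in this message. */ |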
| for (;;) { |
| if (DB_TYPE(mp_tail) == M_DATA) |
| send_size += MBLKL(mp_tail); |
| if (mp_tail->b_cont == NULL) |
| break; |
| mp_tail = mp_tail->b_cont; |
| } |
| |
| if (send_size == 0) { |
| freemsg(mp); |
| return (B_TRUE); |
| } |
| |
| /* |
| * Handle urgent data; we either send up SIGURG to the peer now |
| * or do it later when we drain, in case the peer is detached |
| * or if we're short of memory for M_PCSIG mblk. |
| */ |
| if (DB_TYPE(mp) != M_DATA) |
| tcp_fuse_output_urg(tcp, mp); |
| |
| /* |
| * Enqueue data into the peer's receive list; we may or may not |
| * drain the contents depending on the conditions below. |
| */ |
| tcp_rcv_enqueue(peer_tcp, mp, send_size); |
| |
| /* In case it wrapped around and also to keep it constant */ |
| peer_tcp->tcp_rwnd += send_size; |
| |
| /* |
| * If peer is detached, exercise flow-control when needed; we will |
| * get back-enabled either in tcp_accept_finish() or tcp_unfuse(). |
| */ |
| if (TCP_IS_DETACHED(peer_tcp) && |
| peer_tcp->tcp_rcv_cnt > peer_rq->q_hiwat) { |
| tcp_setqfull(tcp); |
| tcp->tcp_flow_stopped = B_TRUE; |
| TCP_STAT(tcp_fusion_flowctl); |
| } |
| |
| loopback_packets++; |
| tcp->tcp_last_sent_len = send_size; |
| |
| /* Need to adjust the following SNMP MIB-related variables */ |
| tcp->tcp_snxt += send_size; |
| tcp->tcp_suna = tcp->tcp_snxt; |
| peer_tcp->tcp_rnxt += send_size; |
| peer_tcp->tcp_rack = peer_tcp->tcp_rnxt; |
| |
| BUMP_MIB(&tcp_mib, tcpOutDataSegs); |
| UPDATE_MIB(&tcp_mib, tcpOutDataBytes, send_size); |
| |
| BUMP_MIB(&tcp_mib, tcpInSegs); |
| BUMP_MIB(&tcp_mib, tcpInDataInorderSegs); |
| UPDATE_MIB(&tcp_mib, tcpInDataInorderBytes, send_size); |
| |
| BUMP_LOCAL(tcp->tcp_obsegs); |
| BUMP_LOCAL(peer_tcp->tcp_ibsegs); |
| |
| if (!TCP_IS_DETACHED(peer_tcp)) { |
| /* |
| * If we can't send SIGURG above due to lack of memory, |
| * schedule push timer and try again. Otherwise drain |
| * the data if we're not flow-controlled. |
| */ |
| if (peer_tcp->tcp_fused_sigurg) { |
| if (peer_tcp->tcp_push_tid == 0) { |
| peer_tcp->tcp_push_tid = |
| TCP_TIMER(peer_tcp, tcp_push_timer, |
| MSEC_TO_TICK(tcp_push_timer_interval)); |
| } |
| } else if (!tcp->tcp_flow_stopped) { |
| if (!canputnext(peer_rq)) { |
| tcp_setqfull(tcp); |
| tcp->tcp_flow_stopped = B_TRUE; |
| TCP_STAT(tcp_fusion_flowctl); |
| } else { |
| ASSERT(peer_tcp->tcp_rcv_list != NULL); |
| (void) tcp_fuse_rcv_drain(peer_rq, |
| peer_tcp, NULL); |
| TCP_STAT(tcp_fusion_putnext); |
| } |
| } |
| } |
| return (B_TRUE); |
| } |
| |
| /* |
| * This routine gets called to deliver data upstream on a fused or |
| * previously fused tcp loopback endpoint; the latter happens only |
| * when there is a pending SIGURG signal plus urgent data that could |
| * not be sent upstream earlier. |
| */ |
| static boolean_t |
| tcp_fuse_rcv_drain(queue_t *q, tcp_t *tcp, mblk_t **sigurg_mpp) |
| { |
| mblk_t *mp; |
| #ifdef DEBUG |
| uint_t cnt = 0; |
| #endif |
| |
| ASSERT(tcp->tcp_loopback); |
| ASSERT(tcp->tcp_fused || tcp->tcp_fused_sigurg); |
| ASSERT(!tcp->tcp_fused || tcp->tcp_loopback_peer != NULL); |
| ASSERT(sigurg_mpp != NULL || tcp->tcp_fused); |
| |
| /* No need for the push timer now, in case it was scheduled */ |
| if (tcp->tcp_push_tid != 0) { |
| (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_push_tid); |
| tcp->tcp_push_tid = 0; |
| } |
| /* |
| * If there's urgent data sitting in receive list and we didn't |
| * get a chance to send up a SIGURG signal, make sure we send |
| * it first before draining in order to ensure that SIOCATMARK |
| * works properly. |
| */ |
| if (tcp->tcp_fused_sigurg) { |
| /* |
| * sigurg_mpp is normally NULL, i.e. when we're still |
| * fused and didn't get here because of tcp_unfuse(). |
| * In this case try hard to allocate the M_PCSIG mblk. |
| */ |
| if (sigurg_mpp == NULL && |
| (mp = allocb(1, BPRI_HI)) == NULL && |
| (mp = allocb_tryhard(1)) == NULL) { |
| /* Alloc failed; try again next time */ |
| tcp->tcp_push_tid = TCP_TIMER(tcp, tcp_push_timer, |
| MSEC_TO_TICK(tcp_push_timer_interval)); |
| return (B_TRUE); |
| } else if (sigurg_mpp != NULL) { |
| /* |
| * Use the supplied M_PCSIG mblk; it means we're |
| * either unfused or in the process of unfusing, |
| * and the drain must happen now. |
| */ |
| mp = *sigurg_mpp; |
| *sigurg_mpp = NULL; |
| } |
| ASSERT(mp != NULL); |
| |
| tcp->tcp_fused_sigurg = B_FALSE; |
| /* Send up the signal */ |
| DB_TYPE(mp) = M_PCSIG; |
| *mp->b_wptr++ = (uchar_t)SIGURG; |
| putnext(q, mp); |
| /* |
| * Let the regular tcp_rcv_drain() path handle |
| * draining the data if we're no longer fused. |
| */ |
| if (!tcp->tcp_fused) |
| return (B_FALSE); |
| } |
| |
| /* Drain the data */ |
| while ((mp = tcp->tcp_rcv_list) != NULL) { |
| tcp->tcp_rcv_list = mp->b_next; |
| mp->b_next = NULL; |
| #ifdef DEBUG |
| cnt += msgdsize(mp); |
| #endif |
| putnext(q, mp); |
| } |
| |
| ASSERT(cnt == tcp->tcp_rcv_cnt); |
| tcp->tcp_rcv_last_head = NULL; |
| tcp->tcp_rcv_last_tail = NULL; |
| tcp->tcp_rcv_cnt = 0; |
| tcp->tcp_rwnd = q->q_hiwat; |
| |
| return (B_TRUE); |
| } |
| |
| /* |
| * This is the walker function, which is TCP specific. |
| * It walks through the conn_hash bucket searching for the |
| * next valid connp/tcp in the list, selecting a connp/tcp |
| * which hasn't been closed or condemned. It also REFHOLDS the |
| * reference for the tcp, ensuring that the tcp exists |
| * when the caller uses the tcp. |
| * |
| * tcp_get_next_conn |
| * gets the next entry in the conn global list, |
| * puts a reference on next_conn, and |
| * decrements the reference on the current conn. |
| */ |
| conn_t * |
| tcp_get_next_conn(connf_t *connfp, conn_t *connp) |
| { |
| conn_t *next_connp; |
| |
| if (connfp == NULL) |
| return (NULL); |
| |
| mutex_enter(&connfp->connf_lock); |
| |
| next_connp = (connp == NULL) ? |
| connfp->connf_head : connp->conn_g_next; |
| |
| while (next_connp != NULL) { |
| mutex_enter(&next_connp->conn_lock); |
| if ((next_connp->conn_state_flags & |
| (CONN_CONDEMNED | CONN_INCIPIENT)) || |
| !IPCL_IS_TCP(next_connp)) { |
| /* |
| * This conn has been condemned or |
| * is closing. |
| */ |
| mutex_exit(&next_connp->conn_lock); |
| next_connp = next_connp->conn_g_next; |
| continue; |
| } |
| ASSERT(next_connp->conn_tcp != NULL); |
| CONN_INC_REF_LOCKED(next_connp); |
| mutex_exit(&next_connp->conn_lock); |
| break; |
| } |
| |
| mutex_exit(&connfp->connf_lock); |
| |
| if (connp != NULL) { |
| CONN_DEC_REF(connp); |
| } |
| |
| return (next_connp); |
| } |
| |
| /* |
| * Figure out the value of the window scale option. Note that the rwnd is |
| * ASSUMED to be rounded up to the nearest MSS before the calculation. |
| * We cannot find the scale value and then do a round up of tcp_rwnd |
| * because the scale value may not be correct after that. |
| * |
| * Set the compiler flag to make this function inline. |
| */ |
| static void |
| tcp_set_ws_value(tcp_t *tcp) |
| { |
| int i; |
| uint32_t rwnd = tcp->tcp_rwnd; |
| |
| for (i = 0; rwnd > TCP_MAXWIN && i < TCP_MAX_WINSHIFT; |
| i++, rwnd >>= 1) |
| ; |
| tcp->tcp_rcv_ws = i; |
| } |
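| /* |
| * For example (with the usual TCP_MAXWIN of 65535), an rwnd of 1 MB yields |
| * tcp_rcv_ws = 5, since 1048576 >> 5 == 32768 fits in the 16-bit window |
| * field while 1048576 >> 4 does not. |
| */ |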
| |
| /* |
| * Remove a connection from the list of detached TIME_WAIT connections. |
| */ |
| static void |
| tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tcp_time_wait) |
| { |
| boolean_t locked = B_FALSE; |
| |
| if (tcp_time_wait == NULL) { |
| tcp_time_wait = *((tcp_squeue_priv_t **) |
| squeue_getprivate(tcp->tcp_connp->conn_sqp, SQPRIVATE_TCP)); |
| mutex_enter(&tcp_time_wait->tcp_time_wait_lock); |
| locked = B_TRUE; |
| } |
| |
| if (tcp->tcp_time_wait_expire == 0) { |
| ASSERT(tcp->tcp_time_wait_next == NULL); |
| ASSERT(tcp->tcp_time_wait_prev == NULL); |
| if (locked) |
| mutex_exit(&tcp_time_wait->tcp_time_wait_lock); |
| return; |
| } |
| ASSERT(TCP_IS_DETACHED(tcp)); |
| ASSERT(tcp->tcp_state == TCPS_TIME_WAIT); |
| |
| if (tcp == tcp_time_wait->tcp_time_wait_head) { |
| ASSERT(tcp->tcp_time_wait_prev == NULL); |
| tcp_time_wait->tcp_time_wait_head = tcp->tcp_time_wait_next; |
| if (tcp_time_wait->tcp_time_wait_head != NULL) { |
| tcp_time_wait->tcp_time_wait_head->tcp_time_wait_prev = |
| NULL; |
| } else { |
| tcp_time_wait->tcp_time_wait_tail = NULL; |
| } |
| } else if (tcp == tcp_time_wait->tcp_time_wait_tail) { |
| ASSERT(tcp != tcp_time_wait->tcp_time_wait_head); |
| ASSERT(tcp->tcp_time_wait_next == NULL); |
| tcp_time_wait->tcp_time_wait_tail = tcp->tcp_time_wait_prev; |
| ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL); |
| tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next = NULL; |
| } else { |
| ASSERT(tcp->tcp_time_wait_prev->tcp_time_wait_next == tcp); |
| ASSERT(tcp->tcp_time_wait_next->tcp_time_wait_prev == tcp); |
| tcp->tcp_time_wait_prev->tcp_time_wait_next = |
| tcp->tcp_time_wait_next; |
| tcp->tcp_time_wait_next->tcp_time_wait_prev = |
| tcp->tcp_time_wait_prev; |
| } |
| tcp->tcp_time_wait_next = NULL; |
| tcp->tcp_time_wait_prev = NULL; |
| tcp->tcp_time_wait_expire = 0; |
| |
| if (locked) |
| mutex_exit(&tcp_time_wait->tcp_time_wait_lock); |
| } |
| |
| /* |
| * Add a connection to the list of detached TIME_WAIT connections |
| * and set its time to expire. |
| */ |
| static void |
| tcp_time_wait_append(tcp_t *tcp) |
| { |
| tcp_squeue_priv_t *tcp_time_wait = |
| *((tcp_squeue_priv_t **)squeue_getprivate(tcp->tcp_connp->conn_sqp, |
| SQPRIVATE_TCP)); |
| |
| tcp_timers_stop(tcp); |
| |
| /* Freed above */ |
| ASSERT(tcp->tcp_timer_tid == 0); |
| ASSERT(tcp->tcp_ack_tid == 0); |
| |
| /* must have happened at the time of detaching the tcp */ |
| ASSERT(tcp->tcp_ptpahn == NULL); |
| ASSERT(tcp->tcp_flow_stopped == 0); |
| ASSERT(tcp->tcp_time_wait_next == NULL); |
| ASSERT(tcp->tcp_time_wait_prev == NULL); |
| ASSERT(tcp->tcp_time_wait_expire == 0); |
| ASSERT(tcp->tcp_listener == NULL); |
| |
| tcp->tcp_time_wait_expire = ddi_get_lbolt(); |
| /* |
| * The value computed below in tcp->tcp_time_wait_expire may |
| * appear negative or wrap around. That is ok since our |
| * interest is only in the difference between the current lbolt |
| * value and tcp->tcp_time_wait_expire. But the value should not |
| * be zero, since it means the tcp is not in the TIME_WAIT list. |
| * The corresponding comparison in tcp_time_wait_collector() uses |
| * modular arithmetic. |
| */ |
| tcp->tcp_time_wait_expire += |
| drv_usectohz(tcp_time_wait_interval * 1000); |
| if (tcp->tcp_time_wait_expire == 0) |
| tcp->tcp_time_wait_expire = 1; |
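| /* |
| * For example, with the default one-minute tcp_time_wait_interval and a |
| * clock tick rate (hz) of 100, drv_usectohz() yields 6000 ticks, so the |
| * connection becomes reapable 6000 lbolt ticks from now. |
| */ |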
| |
| ASSERT(TCP_IS_DETACHED(tcp)); |
| ASSERT(tcp->tcp_state == TCPS_TIME_WAIT); |
| ASSERT(tcp->tcp_time_wait_next == NULL); |
| ASSERT(tcp->tcp_time_wait_prev == NULL); |
| TCP_DBGSTAT(tcp_time_wait); |
| mutex_enter(&tcp_time_wait->tcp_time_wait_lock); |
| if (tcp_time_wait->tcp_time_wait_head == NULL) { |
| ASSERT(tcp_time_wait->tcp_time_wait_tail == NULL); |
| tcp_time_wait->tcp_time_wait_head = tcp; |
| } else { |
| ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL); |
| ASSERT(tcp_time_wait->tcp_time_wait_tail->tcp_state == |
| TCPS_TIME_WAIT); |
| tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next = tcp; |
| tcp->tcp_time_wait_prev = tcp_time_wait->tcp_time_wait_tail; |
| } |
| tcp_time_wait->tcp_time_wait_tail = tcp; |
| mutex_exit(&tcp_time_wait->tcp_time_wait_lock); |
| } |
| |
| /* ARGSUSED */ |
| void |
| tcp_timewait_output(void *arg, mblk_t *mp, void *arg2) |
| { |
| conn_t *connp = (conn_t *)arg; |
| tcp_t *tcp = connp->conn_tcp; |
| |
| ASSERT(tcp != NULL); |
| if (tcp->tcp_state == TCPS_CLOSED) { |
| return; |
| } |
| |
| ASSERT((tcp->tcp_family == AF_INET && |
| tcp->tcp_ipversion == IPV4_VERSION) || |
| (tcp->tcp_family == AF_INET6 && |
| (tcp->tcp_ipversion == IPV4_VERSION || |
| tcp->tcp_ipversion == IPV6_VERSION))); |
| ASSERT(!tcp->tcp_listener); |
| |
| TCP_STAT(tcp_time_wait_reap); |
| ASSERT(TCP_IS_DETACHED(tcp)); |
| |
| /* |
| * Because they have no upstream client to rebind or tcp_close() |
| * them later, we axe the connection here and now. |
| */ |
| tcp_close_detached(tcp); |
| } |
| |
| void |
| tcp_cleanup(tcp_t *tcp) |
| { |
| mblk_t *mp; |
| char *tcp_iphc; |
| int tcp_iphc_len; |
| int tcp_hdr_grown; |
| tcp_sack_info_t *tcp_sack_info; |
| conn_t *connp = tcp->tcp_connp; |
| |
| tcp_bind_hash_remove(tcp); |
| tcp_free(tcp); |
| |
| conn_delete_ire(connp, NULL); |
| if (connp->conn_flags & IPCL_TCPCONN) { |
| if (connp->conn_latch != NULL) |
| IPLATCH_REFRELE(connp->conn_latch); |
| if (connp->conn_policy != NULL) |
| IPPH_REFRELE(connp->conn_policy); |
| } |
| |
| /* |
| * Since we will bzero the entire structure, we need to |
| * remove it and reinsert it in the global hash list. We |
| * know the walkers can't get to this conn because we |
| * set the CONDEMNED flag earlier and checked the reference |
| * count under conn_lock, so no walker will pick it up; and |
| * once we do the ipcl_globalhash_remove() below, no walker |
| * can get to it at all. |
| */ |
| ipcl_globalhash_remove(connp); |
| |
| /* Save some state */ |
| mp = tcp->tcp_timercache; |
| |
| tcp_sack_info = tcp->tcp_sack_info; |
| tcp_iphc = tcp->tcp_iphc; |
| tcp_iphc_len = tcp->tcp_iphc_len; |
| tcp_hdr_grown = tcp->tcp_hdr_grown; |
| |
| bzero(connp, sizeof (conn_t)); |
| bzero(tcp, sizeof (tcp_t)); |
| |
| /* restore the state */ |
| tcp->tcp_timercache = mp; |
| |
| tcp->tcp_sack_info = tcp_sack_info; |
| tcp->tcp_iphc = tcp_iphc; |
| tcp->tcp_iphc_len = tcp_iphc_len; |
| tcp->tcp_hdr_grown = tcp_hdr_grown; |
| |
| |
| tcp->tcp_connp = connp; |
| |
| connp->conn_tcp = tcp; |
| connp->conn_flags = IPCL_TCPCONN; |
| connp->conn_state_flags = CONN_INCIPIENT; |
| connp->conn_ulp = IPPROTO_TCP; |
| connp->conn_ref = 1; |
| |
| ipcl_globalhash_insert(connp); |
| } |
| |
| /* |
| * Blows away all tcps whose TIME_WAIT has expired. List traversal |
| * is done forwards from the head. |
| */ |
| /* ARGSUSED */ |
| void |
| tcp_time_wait_collector(void *arg) |
| { |
| tcp_t *tcp; |
| clock_t now; |
| mblk_t *mp; |
| conn_t *connp; |
| kmutex_t *lock; |
| |
| squeue_t *sqp = (squeue_t *)arg; |
| tcp_squeue_priv_t *tcp_time_wait = |
| *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP)); |
| |
| mutex_enter(&tcp_time_wait->tcp_time_wait_lock); |
| tcp_time_wait->tcp_time_wait_tid = 0; |
| |
| if (tcp_time_wait->tcp_free_list != NULL && |
| tcp_time_wait->tcp_free_list->tcp_in_free_list == B_TRUE) { |
| TCP_STAT(tcp_freelist_cleanup); |
| while ((tcp = tcp_time_wait->tcp_free_list) != NULL) { |
| tcp_time_wait->tcp_free_list = tcp->tcp_time_wait_next; |
| CONN_DEC_REF(tcp->tcp_connp); |
| } |
| } |
| |
| /* |
| * In order to reap time waits reliably, we should use a |
| * source of time that is not adjustable by the user -- hence |
| * the call to ddi_get_lbolt(). |
| */ |
| now = ddi_get_lbolt(); |
| while ((tcp = tcp_time_wait->tcp_time_wait_head) != NULL) { |
| /* |
| * Compare times using modular arithmetic, since |
| * lbolt can wrapover. |
| */ |
| if ((now - tcp->tcp_time_wait_expire) < 0) { |
| break; |
| } |
| |
| tcp_time_wait_remove(tcp, tcp_time_wait); |
| |
| connp = tcp->tcp_connp; |
| ASSERT(connp->conn_fanout != NULL); |
| lock = &connp->conn_fanout->connf_lock; |
| /* |
| * This is essentially a TW reclaim fast path optimization for |
| * performance where the timewait collector checks under the |
| * fanout lock (so that no one else can get access to the |
| * conn_t) that the refcnt is 2 i.e. one for TCP and one for |
| * the classifier hash list. If ref count is indeed 2, we can |
| * just remove the conn under the fanout lock and avoid |
| * cleaning up the conn under the squeue, provided that |
| * clustering callbacks are not enabled. If clustering is |
| * enabled, we need to make the clustering callback before |
| * setting the CONDEMNED flag and after dropping all locks and |
| * so we forego this optimization and fall back to the slow |
| * path. Also please see the comments in tcp_closei_local |
| * regarding the refcnt logic. |
| * |
| * Since we are holding the tcp_time_wait_lock, it's better |
| * not to block on the fanout_lock; while we hold it, other |
| * connections can't add themselves to the time_wait list. |
| * So we do a tryenter instead of mutex_enter. |
| */ |
| if (mutex_tryenter(lock)) { |
| mutex_enter(&connp->conn_lock); |
| if ((connp->conn_ref == 2) && |
| (cl_inet_disconnect == NULL)) { |
| ipcl_hash_remove_locked(connp, |
| connp->conn_fanout); |
| /* |
| * Set the CONDEMNED flag now itself so that |
| * the refcnt cannot increase due to any |
| * walker. But we have still not cleaned up |
| * conn_ire_cache. This is still ok since |
| * we are going to clean it up in tcp_cleanup |
| * immediately and any interface unplumb |
| * thread will wait till the ire is blown away |
| */ |
|