| /* |
| * CDDL HEADER START |
| * |
| * The contents of this file are subject to the terms of the |
| * Common Development and Distribution License (the "License"). |
| * You may not use this file except in compliance with the License. |
| * |
| * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE |
| * or http://www.opensolaris.org/os/licensing. |
| * See the License for the specific language governing permissions |
| * and limitations under the License. |
| * |
| * When distributing Covered Code, include this CDDL HEADER in each |
| * file and include the License file at usr/src/OPENSOLARIS.LICENSE. |
| * If applicable, add the following below this CDDL HEADER, with the |
| * fields enclosed by brackets "[]" replaced with your own identifying |
| * information: Portions Copyright [yyyy] [name of copyright owner] |
| * |
| * CDDL HEADER END |
| */ |
| |
| /* |
| * Copyright 2009 Sun Microsystems, Inc. All rights reserved. |
| * Use is subject to license terms. |
| */ |
| /* Copyright (c) 1990 Mentat Inc. */ |
| |
| #include <sys/types.h> |
| #include <sys/stream.h> |
| #include <sys/strsun.h> |
| #include <sys/strsubr.h> |
| #include <sys/stropts.h> |
| #include <sys/strlog.h> |
| #define _SUN_TPI_VERSION 2 |
| #include <sys/tihdr.h> |
| #include <sys/timod.h> |
| #include <sys/ddi.h> |
| #include <sys/sunddi.h> |
| #include <sys/suntpi.h> |
| #include <sys/xti_inet.h> |
| #include <sys/cmn_err.h> |
| #include <sys/debug.h> |
| #include <sys/sdt.h> |
| #include <sys/vtrace.h> |
| #include <sys/kmem.h> |
| #include <sys/ethernet.h> |
| #include <sys/cpuvar.h> |
| #include <sys/dlpi.h> |
| #include <sys/multidata.h> |
| #include <sys/multidata_impl.h> |
| #include <sys/pattr.h> |
| #include <sys/policy.h> |
| #include <sys/priv.h> |
| #include <sys/zone.h> |
| #include <sys/sunldi.h> |
| |
| #include <sys/errno.h> |
| #include <sys/signal.h> |
| #include <sys/socket.h> |
| #include <sys/socketvar.h> |
| #include <sys/sockio.h> |
| #include <sys/isa_defs.h> |
| #include <sys/md5.h> |
| #include <sys/random.h> |
| #include <sys/sodirect.h> |
| #include <sys/uio.h> |
| #include <sys/systm.h> |
| #include <netinet/in.h> |
| #include <netinet/tcp.h> |
| #include <netinet/ip6.h> |
| #include <netinet/icmp6.h> |
| #include <net/if.h> |
| #include <net/route.h> |
| #include <inet/ipsec_impl.h> |
| |
| #include <inet/common.h> |
| #include <inet/ip.h> |
| #include <inet/ip_impl.h> |
| #include <inet/ip6.h> |
| #include <inet/ip_ndp.h> |
| #include <inet/proto_set.h> |
| #include <inet/mib2.h> |
| #include <inet/nd.h> |
| #include <inet/optcom.h> |
| #include <inet/snmpcom.h> |
| #include <inet/kstatcom.h> |
| #include <inet/tcp.h> |
| #include <inet/tcp_impl.h> |
| #include <net/pfkeyv2.h> |
| #include <inet/ipsec_info.h> |
| #include <inet/ipdrop.h> |
| |
| #include <inet/ipclassifier.h> |
| #include <inet/ip_ire.h> |
| #include <inet/ip_ftable.h> |
| #include <inet/ip_if.h> |
| #include <inet/ipp_common.h> |
| #include <inet/ip_netinfo.h> |
| #include <sys/squeue_impl.h> |
| #include <sys/squeue.h> |
| #include <inet/kssl/ksslapi.h> |
| #include <sys/tsol/label.h> |
| #include <sys/tsol/tnet.h> |
| #include <rpc/pmap_prot.h> |
| #include <sys/callo.h> |
| |
| /* |
| * TCP Notes: aka FireEngine Phase I (PSARC 2002/433) |
| * |
| * (Read the detailed design doc in PSARC case directory) |
| * |
| * The entire tcp state is contained in tcp_t and conn_t structures |
| * which are allocated in tandem using ipcl_conn_create() and passing |
| * IPCL_CONNTCP as a flag. We use 'conn_ref' and 'conn_lock' to protect |
| * the references on the tcp_t. The tcp_t structure is never compressed |
| * and packets always land on the correct TCP perimeter from the time |
| * the eager is created till the time the tcp_t dies (as such the old mentat |
| * TCP global queue is not used for detached state and no IPSEC checking |
| * is required). The global queue is still allocated to send out resets |
| * for connections which have no listeners and IP directly calls |
| * tcp_xmit_listeners_reset() which does any policy check. |
| * |
| * Protection and Synchronisation mechanism: |
| * |
| * The tcp data structure does not use any kind of lock for protecting |
| * its state but instead uses 'squeues' for mutual exclusion from various |
| * read and write side threads. To access a tcp member, the thread should |
| * always be behind squeue (via squeue_enter with flags as SQ_FILL, SQ_PROCESS, |
| * or SQ_NODRAIN). Since the squeues allow a direct function call, the |
| * caller can pass any tcp function having the prototype of edesc_t as an |
| * argument (different from the traditional STREAMS model where packets |
| * come in only at designated entry points). The list of functions that |
| * can be directly called via squeue is given before the usual function |
| * prototypes. |
| * |
| * Referencing: |
| * |
| * TCP is MT-Hot and we use a reference based scheme to make sure that the |
| * tcp structure doesn't disappear when it's needed. When the application |
| * creates an outgoing connection or accepts an incoming connection, we |
| * start out with 2 references on 'conn_ref'. One for TCP and one for IP. |
| * The IP reference is just a symbolic reference since ip_tcpclose() |
| * looks at the tcp structure after tcp_close_output() returns, which could |
| * have dropped the last TCP reference. So as long as the connection is |
| * in the attached state, i.e. !TCP_IS_DETACHED, we have 2 references on the |
| * conn_t. The classifier puts its own reference when the connection is |
| * inserted in listen or connected hash. Anytime a thread needs to enter |
| * the tcp connection perimeter, it retrieves the conn/tcp from q->ptr |
| * on write side or by doing a classify on read side and then puts a |
| * reference on the conn before doing squeue_enter/tryenter/fill. For |
| * read side, the classifier itself puts the reference under fanout lock |
| * to make sure that tcp can't disappear before it gets processed. The |
| * squeue will drop this reference automatically so the called function |
| * doesn't have to do a DEC_REF. |
| * |
| * Opening a new connection: |
| * |
| * The outgoing connection open is pretty simple. tcp_open() does the |
| * work in creating the conn/tcp structure and initializing it. The |
| * squeue assignment is done based on the CPU the application |
| * is running on. So for outbound connections, processing is always done |
| * on the application CPU, which might be different from the CPU that is |
| * being interrupted by the NIC. An optimal way would be to figure out |
| * the NIC <-> CPU binding at listen time, and assign the outgoing |
| * connection to the squeue attached to the CPU that will be interrupted |
| * for incoming packets (we know the NIC based on the bind IP address). |
| * This might seem like a problem if more data is going out, but in |
| * most cases the transmit is ACK-driven, with the outgoing data |
| * normally sitting on TCP's xmit queue waiting to be transmitted. |
| * |
| * Accepting a connection: |
| * |
| * This is a more interesting case because of various races involved in |
| * establishing an eager in its own perimeter. Read the meta comment on |
| * top of tcp_conn_request(). But briefly, the squeue is picked by |
| * ip_tcp_input()/ip_fanout_tcp_v6() based on the interrupted CPU. |
| * |
| * Closing a connection: |
| * |
| * The close is fairly straightforward. tcp_close() calls tcp_close_output() |
| * via squeue to do the close and mark the tcp as detached if the connection |
| * was in state TCPS_ESTABLISHED or greater. In the latter case, TCP keeps |
| * its reference but tcp_close() always drops IP's reference. So if the tcp |
| * was not killed, it is sitting in the time_wait list with 2 references - 1 |
| * for TCP and 1 because it is in the classifier's connected hash. This is |
| * the condition we use to determine that it's OK to clean up the tcp outside |
| * of the squeue when time wait expires (check the ref under fanout and |
| * conn_lock and if it is 2, remove it from fanout hash and kill it). |
| * |
| * Although close just drops the necessary references and marks the |
| * tcp_detached state, tcp_close needs to know that tcp_detached has been |
| * set (under squeue) before letting the STREAM go away (because an |
| * inbound packet might attempt to go up the STREAM while the close |
| * has happened and tcp_detached is not set). So a special lock and |
| * flag are used along with a condition variable (tcp_closelock, tcp_closed, |
| * and tcp_closecv) to signal tcp_close that tcp_close_output() has marked |
| * tcp_detached. |
| * |
| * Special provisions and fast paths: |
| * |
| * We make special provision for (AF_INET, SOCK_STREAM) sockets which |
| * can't have 'ipv6_recvpktinfo' set, and for these types of sockets, IP |
| * will never send an M_CTL to TCP. As such, ip_tcp_input() which handles |
| * all TCP packets from the wire makes an IPCL_IS_TCP4_CONNECTED_NO_POLICY |
| * check to send packets directly to tcp_rput_data via squeue. Everyone |
| * else comes through tcp_input() on the read side. |
| * |
| * We also make special provisions for sockfs by marking tcp_issocket |
| * whenever we have only sockfs on top of TCP. This allows us to skip |
| * putting the tcp in the acceptor hash, since a sockfs listener can never |
| * become an acceptor, and also to avoid allocating a tcp_t for the acceptor |
| * STREAM, since the eager has already been allocated and the accept now |
| * happens on the acceptor STREAM. There is a big block of comment on top of |
| * tcp_conn_request explaining the new accept. When the socket is POP'd, |
| * sockfs sends us an ioctl to mark the fact and we go back to the old |
| * behaviour. Once tcp_issocket is unset, it's never set again for the |
| * life of that connection. |
| * |
| * In support of on-board asynchronous DMA hardware (e.g. Intel I/OAT) |
| * two consolidation-private KAPIs are used to enqueue M_DATA mblk_t's |
| * directly to the socket (sodirect) and start an asynchronous copyout |
| * to a user-land receive-side buffer (uioa) when a blocking socket read |
| * (e.g. read, recv, ...) is pending. |
| * |
| * This is accomplished when tcp_issocket is set and tcp_sodirect is not |
| * NULL (i.e. points to an sodirect_t); if it is marked enabled, we enqueue |
| * all mblk_t's directly to the socket. |
| * |
| * Further, if the sodirect_t's sod_uioa is marked enabled (due to a |
| * blocking socket read, e.g. a user-land read, recv, ...), then an |
| * asynchronous copyout will be started directly to the user-land uio |
| * buffer. Also, as we have a pending read, TCP's push logic can take into |
| * account the number of bytes to be received and only wake the blocked |
| * read()er when the uioa_t byte count has been satisfied. |
| * |
| * IPsec notes : |
| * |
| * Since a packet is always processed on the correct TCP perimeter, |
| * all IPsec processing is deferred to IP, including checking new |
| * connections and setting IPsec policies for new connections. The |
| * only exception is tcp_xmit_listeners_reset(), which is called |
| * directly from IP and needs to do a policy check to see if a TH_RST |
| * can be sent out. |
| * |
| * PFHooks notes : |
| * |
| * For the MDT case, one meta buffer contains multiple packets. Mblks for |
| * every packet are assembled and passed to the hooks. When packets are |
| * blocked, or the boundary of any packet is changed, MDT processing is |
| * stopped, and the packets of the meta buffer are sent to the IP path one |
| * by one. |
| */ |
| |
| /* |
| * Values for squeue switch: |
| * 1: SQ_NODRAIN |
| * 2: SQ_PROCESS |
| * 3: SQ_FILL |
| */ |
| int tcp_squeue_wput = 2; /* /etc/system */ |
| int tcp_squeue_flag; |
| |
| /* |
| * Macros for sodirect: |
| * |
| * SOD_PTR_ENTER(tcp, sodp) - for the tcp_t pointer "tcp" set the |
| * sodirect_t pointer "sodp" to the socket/tcp shared sodirect_t |
| * if it exists and is enabled, else to NULL. Note, in the current |
| * sodirect implementation the sod_lockp must not be held across any |
| * STREAMS call (e.g. putnext) else a "recursive mutex_enter" PANIC |
| * will result as sod_lockp is the streamhead stdata.sd_lock. |
| * |
| * SOD_NOT_ENABLED(tcp) - return true if not a sodirect tcp_t or the |
| * sodirect_t isn't enabled; useful for ASSERT()ing that a receive |
| * side tcp code path dealing with a tcp_rcv_list or putnext() isn't |
| * being used when sodirect code paths should be. |
| */ |
| |
| #define SOD_PTR_ENTER(tcp, sodp) \ |
| (sodp) = (tcp)->tcp_sodirect; \ |
| \ |
| if ((sodp) != NULL) { \ |
| mutex_enter((sodp)->sod_lockp); \ |
| if (!((sodp)->sod_state & SOD_ENABLED)) { \ |
| mutex_exit((sodp)->sod_lockp); \ |
| (sodp) = NULL; \ |
| } \ |
| } |
| |
| #define SOD_NOT_ENABLED(tcp) \ |
| ((tcp)->tcp_sodirect == NULL || \ |
| !((tcp)->tcp_sodirect->sod_state & SOD_ENABLED)) |
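| |
| /* |
| * Illustrative usage sketch of SOD_PTR_ENTER() (hedged, not copied from |
| * any call site; the surrounding logic is hypothetical). It shows the |
| * lock discipline described above, where sod_lockp is dropped before any |
| * STREAMS call such as putnext(): |
| * |
| *     sodirect_t *sodp; |
| * |
| *     SOD_PTR_ENTER(tcp, sodp); |
| *     if (sodp != NULL) { |
| *             ... enqueue mp directly to the socket ... |
| *             mutex_exit(sodp->sod_lockp); |
| *     } else { |
| *             putnext(tcp->tcp_rq, mp); |
| *     } |
| */ |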
| |
| /* |
| * This controls how tiny a write must be before we try to copy it |
| * into the mblk on the tail of the transmit queue. Not much |
| * speedup is observed for values larger than sixteen. Zero will |
| * disable the optimisation. |
| */ |
| int tcp_tx_pull_len = 16; |
| |
| /* |
| * TCP Statistics. |
| * |
| * How TCP statistics work. |
| * |
| * There are two types of statistics invoked by two macros. |
| * |
| * TCP_STAT(name) does non-atomic increment of a named stat counter. It is |
| * supposed to be used in non MT-hot paths of the code. |
| * |
| * TCP_DBGSTAT(name) does atomic increment of a named stat counter. It is |
| * supposed to be used for DEBUG purposes and may be used on a hot path. |
| * |
| * Both TCP_STAT and TCP_DBGSTAT counters are available using kstat |
| * (use "kstat tcp" to get them). |
| * |
| * There is also an additional debugging facility that marks tcp_clean_death() |
| * instances and saves them in the tcp_t structure. It is triggered by |
| * TCP_TAG_CLEAN_DEATH define. Also, there is a global array of counters for |
| * tcp_clean_death() calls that counts the number of times each tag was hit. It |
| * is triggered by TCP_CLD_COUNTERS define. |
| * |
| * How to add new counters. |
| * |
| * 1) Add a field in the tcp_stat structure describing your counter. |
| * 2) Add a line in the template in tcp_kstat2_init() with the name |
| * of the counter. |
| * |
| * IMPORTANT!! - make sure that both are in sync !! |
| * 3) Use either TCP_STAT or TCP_DBGSTAT with the name. |
| * |
| * Please avoid using private counters which are not kstat-exported. |
| * |
| * TCP_TAG_CLEAN_DEATH set to 1 enables tagging of tcp_clean_death() instances |
| * in tcp_t structure. |
| * |
| * TCP_MAX_CLEAN_DEATH_TAG is the maximum number of possible clean death tags. |
| */ |
| |
| #ifndef TCP_DEBUG_COUNTER |
| #ifdef DEBUG |
| #define TCP_DEBUG_COUNTER 1 |
| #else |
| #define TCP_DEBUG_COUNTER 0 |
| #endif |
| #endif |
| |
| #define TCP_CLD_COUNTERS 0 |
| |
| #define TCP_TAG_CLEAN_DEATH 1 |
| #define TCP_MAX_CLEAN_DEATH_TAG 32 |
| |
| #ifdef lint |
| static int _lint_dummy_; |
| #endif |
| |
| #if TCP_CLD_COUNTERS |
| static uint_t tcp_clean_death_stat[TCP_MAX_CLEAN_DEATH_TAG]; |
| #define TCP_CLD_STAT(x) tcp_clean_death_stat[x]++ |
| #elif defined(lint) |
| #define TCP_CLD_STAT(x) ASSERT(_lint_dummy_ == 0); |
| #else |
| #define TCP_CLD_STAT(x) |
| #endif |
| |
| #if TCP_DEBUG_COUNTER |
| #define TCP_DBGSTAT(tcps, x) \ |
| atomic_add_64(&((tcps)->tcps_statistics.x.value.ui64), 1) |
| #define TCP_G_DBGSTAT(x) \ |
| atomic_add_64(&(tcp_g_statistics.x.value.ui64), 1) |
| #elif defined(lint) |
| #define TCP_DBGSTAT(tcps, x) ASSERT(_lint_dummy_ == 0); |
| #define TCP_G_DBGSTAT(x) ASSERT(_lint_dummy_ == 0); |
| #else |
| #define TCP_DBGSTAT(tcps, x) |
| #define TCP_G_DBGSTAT(x) |
| #endif |
| |
| #define TCP_G_STAT(x) (tcp_g_statistics.x.value.ui64++) |
| |
| tcp_g_stat_t tcp_g_statistics; |
| kstat_t *tcp_g_kstat; |
| |
| /* |
| * Call either ip_output or ip_output_v6. This replaces putnext() calls on the |
| * tcp write side. |
| */ |
| #define CALL_IP_WPUT(connp, q, mp) { \ |
| ASSERT(((q)->q_flag & QREADR) == 0); \ |
| TCP_DBGSTAT(connp->conn_netstack->netstack_tcp, tcp_ip_output); \ |
| connp->conn_send(connp, (mp), (q), IP_WPUT); \ |
| } |
| |
| /* Macros for timestamp comparisons */ |
| #define TSTMP_GEQ(a, b) ((int32_t)((a)-(b)) >= 0) |
| #define TSTMP_LT(a, b) ((int32_t)((a)-(b)) < 0) |
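| |
| /* |
| * Worked example of the modular comparison above: TSTMP_LT(0xFFFFFFF0, 0x10) |
| * evaluates (int32_t)(0xFFFFFFF0 - 0x10) == (int32_t)0xFFFFFFE0 == -32 < 0, |
| * i.e. a timestamp taken just before the 32-bit counter wraps still compares |
| * as "less than" one taken just after the wrap. |
| */ |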
| |
| /* |
| * Parameters for TCP Initial Send Sequence number (ISS) generation. When |
| * tcp_strong_iss is set to 1, which is the default, the ISS is calculated |
| * by adding three components: a time component which grows by 1 every 4096 |
| * nanoseconds (versus every 4 microseconds suggested by RFC 793, page 27); |
| * a per-connection component which grows by 125000 for every new connection; |
| * and an "extra" component that grows by a random amount centered |
| * approximately on 64000. This causes the ISS generator to cycle every |
| * 4.89 hours if no TCP connections are made, and faster if connections are |
| * made. |
| * |
| * When tcp_strong_iss is set to 0, ISS is calculated by adding two |
| * components: a time component which grows by 250000 every second; and |
| * a per-connection component which grows by 125000 for every new connection. |
| * |
| * A third method, used when tcp_strong_iss is set to 2, generates the ISS |
| * as prescribed by Steve Bellovin. This involves adding time, the 125000 per |
| * connection, and a one-way hash (MD5) of the connection ID <sport, dport, |
| * src, dst>, a "truly" random (per RFC 1750) number, and a console-entered |
| * password. |
| */ |
| #define ISS_INCR 250000 |
| #define ISS_NSEC_SHT 12 |
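| |
| /* |
| * Worked check of the 4.89 hour figure above: with ISS_NSEC_SHT == 12, the |
| * time component advances by one every 2^12 == 4096 ns, so wrapping the |
| * 32-bit sequence space takes 2^32 * 4096 ns == 2^44 ns, roughly 17592 |
| * seconds, i.e. about 4.89 hours, when no connections are made. |
| */ |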
| |
| static sin_t sin_null; /* Zero address for quick clears */ |
| static sin6_t sin6_null; /* Zero address for quick clears */ |
| |
| /* |
| * This implementation follows the 4.3BSD interpretation of the urgent |
| * pointer and not RFC 1122. Switching to RFC 1122 behavior would cause |
| * incompatible changes in protocols like telnet and rlogin. |
| */ |
| #define TCP_OLD_URP_INTERPRETATION 1 |
| |
| #define TCP_IS_DETACHED_NONEAGER(tcp) \ |
| (TCP_IS_DETACHED(tcp) && \ |
| (!(tcp)->tcp_hard_binding)) |
| |
| /* |
| * TCP reassembly macros. We hide starting and ending sequence numbers in |
| * b_next and b_prev of messages on the reassembly queue. The messages are |
| * chained using b_cont. These macros are used in tcp_reass() so we don't |
| * have to see the ugly casts and assignments. |
| */ |
| #define TCP_REASS_SEQ(mp) ((uint32_t)(uintptr_t)((mp)->b_next)) |
| #define TCP_REASS_SET_SEQ(mp, u) ((mp)->b_next = \ |
| (mblk_t *)(uintptr_t)(u)) |
| #define TCP_REASS_END(mp) ((uint32_t)(uintptr_t)((mp)->b_prev)) |
| #define TCP_REASS_SET_END(mp, u) ((mp)->b_prev = \ |
| (mblk_t *)(uintptr_t)(u)) |
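| |
| /* |
| * Illustrative sketch (not the actual tcp_reass() code) of how the macros |
| * above are used: when an out-of-order segment of 'len' bytes starting at |
| * sequence number 'start' is queued in mblk 'mp', |
| * |
| *     TCP_REASS_SET_SEQ(mp, start); |
| *     TCP_REASS_SET_END(mp, start + len); |
| * |
| * and the hidden values are later read back with TCP_REASS_SEQ(mp) and |
| * TCP_REASS_END(mp) when trimming overlap or draining the queue. |
| */ |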
| |
| /* |
| * Implementation of TCP Timers. |
| * ============================= |
| * |
| * INTERFACE: |
| * |
| * There are two basic functions and one macro dealing with tcp timers: |
| * |
| * timeout_id_t tcp_timeout(connp, func, time) |
| * clock_t tcp_timeout_cancel(connp, timeout_id) |
| * TCP_TIMER_RESTART(tcp, intvl) |
| * |
| * tcp_timeout() starts a timer for the 'tcp' instance arranging to call 'func' |
| * after 'time' ticks have passed. The function called by timeout() must adhere |
| * the same restrictions as a driver soft interrupt handler - it must not sleep |
| * or call other functions that might sleep. The value returned is the opaque |
| * non-zero timeout identifier that can be passed to tcp_timeout_cancel() to |
| * cancel the request. The call to tcp_timeout() may fail in which case it |
| * returns zero. This is different from the timeout(9F) function which never |
| * fails. |
| * |
| * The call-back function 'func' always receives 'connp' as its single |
| * argument. It is always executed in the squeue corresponding to the tcp |
| * structure. The tcp structure is guaranteed to be present at the time the |
| * call-back is called. |
| * |
| * NOTE: The call-back function 'func' is never called if tcp is in |
| * the TCPS_CLOSED state. |
| * |
| * tcp_timeout_cancel() attempts to cancel a pending tcp_timeout() |
| * request. Locks acquired by the call-back routine should not be held across |
| * the call to tcp_timeout_cancel() or a deadlock may result. |
| * |
| * tcp_timeout_cancel() returns -1 if it cannot cancel the timeout request. |
| * Otherwise, it returns an integer value greater than or equal to 0. In |
| * particular, if the call-back function has already been placed on the |
| * squeue, it cannot be canceled. |
| * |
| * NOTE: both tcp_timeout() and tcp_timeout_cancel() should always be called |
| * within the squeue context corresponding to the tcp instance. Since the |
| * call-back is also called via the same squeue, there are none of the race |
| * conditions described in the untimeout(9F) manual page, as all calls are |
| * strictly serialized. |
| * |
| * TCP_TIMER_RESTART() is a macro that attempts to cancel a pending timeout |
| * stored in tcp_timer_tid and starts a new one using |
| * MSEC_TO_TICK(intvl). It always uses the tcp_timer() function as the call-back |
| * and stores the return value of tcp_timeout() in the tcp->tcp_timer_tid |
| * field. |
| * |
| * NOTE: since the timeout cancellation is not guaranteed, the cancelled |
| * call-back may still be called, so it is possible tcp_timer() will be |
| * called several times. This should not be a problem since tcp_timer() |
| * should always check the tcp instance state. |
| * |
| * |
| * IMPLEMENTATION: |
| * |
| * TCP timers are implemented using a three-stage process. The call to |
| * tcp_timeout() uses the timeout(9F) function to call the tcp_timer_callback() |
| * function when the timer expires. The tcp_timer_callback() arranges the call |
| * of the tcp_timer_handler() function via the squeue corresponding to the tcp |
| * instance. The tcp_timer_handler() calls the actual requested timeout |
| * call-back and passes the tcp instance as an argument to it. Information is |
| * passed between stages using the tcp_timer_t structure which contains the |
| * connp pointer, the tcp call-back to call and the timeout id returned by |
| * timeout(9F). |
| * |
| * The tcp_timer_t structure is not used directly; it is embedded in an |
| * mblk_t-like structure that is used to enter a squeue. The mp->b_rptr of |
| * this pseudo mblk points to the beginning of the tcp_timer_t structure. |
| * tcp_timeout() returns a pointer to this mblk. |
| * |
| * The pseudo mblk is allocated from a special tcp_timer_cache kmem cache. It |
| * looks like a normal mblk without an actual dblk attached to it. |
| * |
| * To optimize performance, each tcp instance holds a small cache of timer |
| * mblocks. In the current implementation it caches up to two timer mblocks per |
| * tcp instance. The cache is preserved over tcp frees and is only freed when |
| * the whole tcp structure is destroyed by its kmem destructor. Since all tcp |
| * timer processing happens on a corresponding squeue, the cache manipulation |
| * does not require any locks. Experiments show that the majority of timer |
| * mblock allocations are satisfied from the tcp cache and do not involve |
| * kmem calls. |
| * |
| * tcp_timeout() places a refhold on the connp instance, which guarantees |
| * that it will be present at the time the call-back function fires. The |
| * tcp_timer_handler() drops the reference after calling the call-back, so the |
| * call-back function does not need to manipulate the references explicitly. |
| */ |
| |
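| /* |
| * Illustrative expansion of TCP_TIMER_RESTART(tcp, intvl), based purely on |
| * the description above (the real macro is defined elsewhere and may differ |
| * in detail): |
| * |
| *     if ((tcp)->tcp_timer_tid != 0) |
| *             (void) tcp_timeout_cancel((tcp)->tcp_connp, |
| *                 (tcp)->tcp_timer_tid); |
| *     (tcp)->tcp_timer_tid = tcp_timeout((tcp)->tcp_connp, tcp_timer, |
| *         MSEC_TO_TICK(intvl)); |
| */ |
| |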
| typedef struct tcp_timer_s { |
| conn_t *connp; |
| void (*tcpt_proc)(void *); |
| callout_id_t tcpt_tid; |
| } tcp_timer_t; |
| |
| static kmem_cache_t *tcp_timercache; |
| kmem_cache_t *tcp_sack_info_cache; |
| kmem_cache_t *tcp_iphc_cache; |
| |
| /* |
| * For scalability, we must not run a timer for every TCP connection |
| * in TIME_WAIT state. To see why, consider (for a time wait interval of |
| * 4 minutes): |
| * 1000 connections/sec * 240 seconds/time wait = 240,000 active conn's |
| * |
| * This list is ordered by time, so you need only delete from the head |
| * until you get to entries which aren't old enough to delete yet. |
| * The list consists of only the detached TIME_WAIT connections. |
| * |
| * Note that the timer (tcp_time_wait_expire) is started when the tcp_t |
| * becomes detached TIME_WAIT (either by changing the state and already |
| * being detached or the other way around). This means that the TIME_WAIT |
| * state can be extended (up to doubled) if the connection doesn't become |
| * detached for a long time. |
| * |
| * The list manipulations (including tcp_time_wait_next/prev) |
| * are protected by the tcp_time_wait_lock. The content of the |
| * detached TIME_WAIT connections is protected by the normal perimeters. |
| * |
| * This list is per squeue and squeues are shared across the tcp_stack_t's. |
| * Things on tcp_time_wait_head remain associated with the tcp_stack_t |
| * and conn_netstack. |
| * The tcp_t's that are added to tcp_free_list are disassociated and |
| * have NULL tcp_tcps and conn_netstack pointers. |
| */ |
| typedef struct tcp_squeue_priv_s { |
| kmutex_t tcp_time_wait_lock; |
| callout_id_t tcp_time_wait_tid; |
| tcp_t *tcp_time_wait_head; |
| tcp_t *tcp_time_wait_tail; |
| tcp_t *tcp_free_list; |
| uint_t tcp_free_list_cnt; |
| } tcp_squeue_priv_t; |
| |
| /* |
| * TCP_TIME_WAIT_DELAY governs how often the time_wait_collector runs. |
| * Running it every 5 seconds seems to give the best results. |
| */ |
| #define TCP_TIME_WAIT_DELAY drv_usectohz(5000000) |
| |
| /* |
| * To prevent hogging memory, limit the number of entries in tcp_free_list |
| * to 1% of available memory / number of CPUs. |
| */ |
| uint_t tcp_free_list_max_cnt = 0; |
| |
| #define TCP_XMIT_LOWATER 4096 |
| #define TCP_XMIT_HIWATER 49152 |
| #define TCP_RECV_LOWATER 2048 |
| #define TCP_RECV_HIWATER 49152 |
| |
| /* |
| * PAWS needs a timer for 24 days. This is the number of ticks in 24 days |
| */ |
| #define PAWS_TIMEOUT ((clock_t)(24*24*60*60*hz)) |
| |
| #define TIDUSZ 4096 /* transport interface data unit size */ |
| |
| /* |
| * Bind hash list size and hash function. The size has to be a power of 2 for |
| * hashing. |
| */ |
| #define TCP_BIND_FANOUT_SIZE 512 |
| #define TCP_BIND_HASH(lport) (ntohs(lport) & (TCP_BIND_FANOUT_SIZE - 1)) |
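| /* E.g. for local port 8080, TCP_BIND_HASH() yields 8080 & 511 == 400. */ |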
| /* |
| * Size of listen and acceptor hash list. It has to be a power of 2 for |
| * hashing. |
| */ |
| #define TCP_FANOUT_SIZE 256 |
| |
| #ifdef _ILP32 |
| #define TCP_ACCEPTOR_HASH(accid) \ |
| (((uint_t)(accid) >> 8) & (TCP_FANOUT_SIZE - 1)) |
| #else |
| #define TCP_ACCEPTOR_HASH(accid) \ |
| ((uint_t)(accid) & (TCP_FANOUT_SIZE - 1)) |
| #endif /* _ILP32 */ |
| |
| #define IP_ADDR_CACHE_SIZE 2048 |
| #define IP_ADDR_CACHE_HASH(faddr) \ |
| (ntohl(faddr) & (IP_ADDR_CACHE_SIZE -1)) |
| |
| /* Hash for HSPs uses all 32 bits, since both networks and hosts are in the table */ |
| #define TCP_HSP_HASH_SIZE 256 |
| |
| #define TCP_HSP_HASH(addr) \ |
| (((addr>>24) ^ (addr >>16) ^ \ |
| (addr>>8) ^ (addr)) % TCP_HSP_HASH_SIZE) |
| |
| /* |
| * TCP options struct returned from tcp_parse_options. |
| */ |
| typedef struct tcp_opt_s { |
| uint32_t tcp_opt_mss; |
| uint32_t tcp_opt_wscale; |
| uint32_t tcp_opt_ts_val; |
| uint32_t tcp_opt_ts_ecr; |
| tcp_t *tcp; |
| } tcp_opt_t; |
| |
| /* |
| * TCP options struct passing information between listener and eager. |
| */ |
| struct tcp_options { |
| uint_t to_flags; |
| ssize_t to_boundif; /* IPV6_BOUND_IF */ |
| }; |
| |
| #define TCPOPT_BOUNDIF 0x00000001 /* set IPV6_BOUND_IF */ |
| #define TCPOPT_RECVPKTINFO 0x00000002 /* set IPV6_RECVPKTINFO */ |
| |
| /* |
| * RFC1323-recommended phrasing of TSTAMP option, for easier parsing |
| */ |
| |
| #ifdef _BIG_ENDIAN |
| #define TCPOPT_NOP_NOP_TSTAMP ((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | \ |
| (TCPOPT_TSTAMP << 8) | 10) |
| #else |
| #define TCPOPT_NOP_NOP_TSTAMP ((10 << 24) | (TCPOPT_TSTAMP << 16) | \ |
| (TCPOPT_NOP << 8) | TCPOPT_NOP) |
| #endif |
| |
| /* |
| * Flags returned from tcp_parse_options. |
| */ |
| #define TCP_OPT_MSS_PRESENT 1 |
| #define TCP_OPT_WSCALE_PRESENT 2 |
| #define TCP_OPT_TSTAMP_PRESENT 4 |
| #define TCP_OPT_SACK_OK_PRESENT 8 |
| #define TCP_OPT_SACK_PRESENT 16 |
| |
| /* TCP option length */ |
| #define TCPOPT_NOP_LEN 1 |
| #define TCPOPT_MAXSEG_LEN 4 |
| #define TCPOPT_WS_LEN 3 |
| #define TCPOPT_REAL_WS_LEN (TCPOPT_WS_LEN+1) |
| #define TCPOPT_TSTAMP_LEN 10 |
| #define TCPOPT_REAL_TS_LEN (TCPOPT_TSTAMP_LEN+2) |
| #define TCPOPT_SACK_OK_LEN 2 |
| #define TCPOPT_REAL_SACK_OK_LEN (TCPOPT_SACK_OK_LEN+2) |
| #define TCPOPT_REAL_SACK_LEN 4 |
| #define TCPOPT_MAX_SACK_LEN 36 |
| #define TCPOPT_HEADER_LEN 2 |
| |
| /* TCP cwnd burst factor. */ |
| #define TCP_CWND_INFINITE 65535 |
| #define TCP_CWND_SS 3 |
| #define TCP_CWND_NORMAL 5 |
| |
| /* Maximum TCP initial cwin (start/restart). */ |
| #define TCP_MAX_INIT_CWND 8 |
| |
| /* |
| * Initialize cwnd according to RFC 3390. def_max_init_cwnd is |
| * either tcp_slow_start_initial or tcp_slow_start_after_idle |
| * depending on the caller. If the upper layer has not used the |
| * TCP_INIT_CWND option to change the initial cwnd, tcp_init_cwnd |
| * should be 0 and we use the formula in RFC 3390 to set tcp_cwnd. |
| * If the upper layer has set tcp_init_cwnd, just use it to |
| * calculate the tcp_cwnd. |
| */ |
| #define SET_TCP_INIT_CWND(tcp, mss, def_max_init_cwnd) \ |
| { \ |
| if ((tcp)->tcp_init_cwnd == 0) { \ |
| (tcp)->tcp_cwnd = MIN(def_max_init_cwnd * (mss), \ |
| MIN(4 * (mss), MAX(2 * (mss), 4380 / (mss) * (mss)))); \ |
| } else { \ |
| (tcp)->tcp_cwnd = (tcp)->tcp_init_cwnd * (mss); \ |
| } \ |
| tcp->tcp_cwnd_cnt = 0; \ |
| } |
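| |
| /* |
| * Worked example of the RFC 3390 branch above (tcp_init_cwnd == 0): with |
| * mss == 1460 and def_max_init_cwnd == 4 (the default for both |
| * tcp_slow_start_initial and tcp_slow_start_after_idle below), the macro |
| * yields MIN(4 * 1460, MIN(5840, MAX(2920, 4380))) == 4380 bytes, i.e. an |
| * initial window of three segments; with mss == 536 it yields 2144 bytes, |
| * i.e. four segments. |
| */ |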
| |
| /* TCP Timer control structure */ |
| typedef struct tcpt_s { |
| pfv_t tcpt_pfv; /* The routine we are to call */ |
| tcp_t *tcpt_tcp; /* The parameter we are to pass in */ |
| } tcpt_t; |
| |
| /* Host Specific Parameter structure */ |
| typedef struct tcp_hsp { |
| struct tcp_hsp *tcp_hsp_next; |
| in6_addr_t tcp_hsp_addr_v6; |
| in6_addr_t tcp_hsp_subnet_v6; |
| uint_t tcp_hsp_vers; /* IPV4_VERSION | IPV6_VERSION */ |
| int32_t tcp_hsp_sendspace; |
| int32_t tcp_hsp_recvspace; |
| int32_t tcp_hsp_tstamp; |
| } tcp_hsp_t; |
| #define tcp_hsp_addr V4_PART_OF_V6(tcp_hsp_addr_v6) |
| #define tcp_hsp_subnet V4_PART_OF_V6(tcp_hsp_subnet_v6) |
| |
| /* |
| * Functions called directly via squeue having a prototype of edesc_t. |
| */ |
| void tcp_conn_request(void *arg, mblk_t *mp, void *arg2); |
| static void tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2); |
| void tcp_accept_finish(void *arg, mblk_t *mp, void *arg2); |
| static void tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2); |
| static void tcp_wput_proto(void *arg, mblk_t *mp, void *arg2); |
| void tcp_input(void *arg, mblk_t *mp, void *arg2); |
| void tcp_rput_data(void *arg, mblk_t *mp, void *arg2); |
| static void tcp_close_output(void *arg, mblk_t *mp, void *arg2); |
| void tcp_output(void *arg, mblk_t *mp, void *arg2); |
| void tcp_output_urgent(void *arg, mblk_t *mp, void *arg2); |
| static void tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2); |
| static void tcp_timer_handler(void *arg, mblk_t *mp, void *arg2); |
| static void tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2); |
| |
| |
| /* Prototypes for TCP functions */ |
| static void tcp_random_init(void); |
| int tcp_random(void); |
| static void tcp_tli_accept(tcp_t *tcp, mblk_t *mp); |
| static void tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, |
| tcp_t *eager); |
| static int tcp_adapt_ire(tcp_t *tcp, mblk_t *ire_mp); |
| static in_port_t tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, |
| int reuseaddr, boolean_t quick_connect, boolean_t bind_to_req_port_only, |
| boolean_t user_specified); |
| static void tcp_closei_local(tcp_t *tcp); |
| static void tcp_close_detached(tcp_t *tcp); |
| static boolean_t tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph, |
| mblk_t *idmp, mblk_t **defermp); |
| static void tcp_tpi_connect(tcp_t *tcp, mblk_t *mp); |
| static int tcp_connect_ipv4(tcp_t *tcp, ipaddr_t *dstaddrp, |
| in_port_t dstport, uint_t srcid, cred_t *cr, pid_t pid); |
| static int tcp_connect_ipv6(tcp_t *tcp, in6_addr_t *dstaddrp, |
| in_port_t dstport, uint32_t flowinfo, uint_t srcid, |
| uint32_t scope_id, cred_t *cr, pid_t pid); |
| static int tcp_clean_death(tcp_t *tcp, int err, uint8_t tag); |
| static void tcp_def_q_set(tcp_t *tcp, mblk_t *mp); |
| static void tcp_disconnect(tcp_t *tcp, mblk_t *mp); |
| static char *tcp_display(tcp_t *tcp, char *, char); |
| static boolean_t tcp_eager_blowoff(tcp_t *listener, t_scalar_t seqnum); |
| static void tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only); |
| static void tcp_eager_unlink(tcp_t *tcp); |
| static void tcp_err_ack(tcp_t *tcp, mblk_t *mp, int tlierr, |
| int unixerr); |
| static void tcp_err_ack_prim(tcp_t *tcp, mblk_t *mp, int primitive, |
| int tlierr, int unixerr); |
| static int tcp_extra_priv_ports_get(queue_t *q, mblk_t *mp, caddr_t cp, |
| cred_t *cr); |
| static int tcp_extra_priv_ports_add(queue_t *q, mblk_t *mp, |
| char *value, caddr_t cp, cred_t *cr); |
| static int tcp_extra_priv_ports_del(queue_t *q, mblk_t *mp, |
| char *value, caddr_t cp, cred_t *cr); |
| static int tcp_tpistate(tcp_t *tcp); |
| static void tcp_bind_hash_insert(tf_t *tf, tcp_t *tcp, |
| int caller_holds_lock); |
| static void tcp_bind_hash_remove(tcp_t *tcp); |
| static tcp_t *tcp_acceptor_hash_lookup(t_uscalar_t id, tcp_stack_t *); |
| void tcp_acceptor_hash_insert(t_uscalar_t id, tcp_t *tcp); |
| static void tcp_acceptor_hash_remove(tcp_t *tcp); |
| static void tcp_capability_req(tcp_t *tcp, mblk_t *mp); |
| static void tcp_info_req(tcp_t *tcp, mblk_t *mp); |
| static void tcp_addr_req(tcp_t *tcp, mblk_t *mp); |
| static void tcp_addr_req_ipv6(tcp_t *tcp, mblk_t *mp); |
| void tcp_g_q_setup(tcp_stack_t *); |
| void tcp_g_q_create(tcp_stack_t *); |
| void tcp_g_q_destroy(tcp_stack_t *); |
| static int tcp_header_init_ipv4(tcp_t *tcp); |
| static int tcp_header_init_ipv6(tcp_t *tcp); |
| int tcp_init(tcp_t *tcp, queue_t *q); |
| static int tcp_init_values(tcp_t *tcp); |
| static mblk_t *tcp_ip_advise_mblk(void *addr, int addr_len, ipic_t **ipic); |
| static void tcp_ip_ire_mark_advice(tcp_t *tcp); |
| static void tcp_ip_notify(tcp_t *tcp); |
| static mblk_t *tcp_ire_mp(mblk_t **mpp); |
| static void tcp_iss_init(tcp_t *tcp); |
| static void tcp_keepalive_killer(void *arg); |
| static int tcp_parse_options(tcph_t *tcph, tcp_opt_t *tcpopt); |
| static void tcp_mss_set(tcp_t *tcp, uint32_t size, boolean_t do_ss); |
| static int tcp_conprim_opt_process(tcp_t *tcp, mblk_t *mp, |
| int *do_disconnectp, int *t_errorp, int *sys_errorp); |
| static boolean_t tcp_allow_connopt_set(int level, int name); |
| int tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr); |
| int tcp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr); |
| int tcp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, |
| int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp, |
| uchar_t *outvalp, void *thisdg_attrs, cred_t *cr, |
| mblk_t *mblk); |
| static void tcp_opt_reverse(tcp_t *tcp, ipha_t *ipha); |
| static int tcp_opt_set_header(tcp_t *tcp, boolean_t checkonly, |
| uchar_t *ptr, uint_t len); |
| static int tcp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr); |
| static boolean_t tcp_param_register(IDP *ndp, tcpparam_t *tcppa, int cnt, |
| tcp_stack_t *); |
| static int tcp_param_set(queue_t *q, mblk_t *mp, char *value, |
| caddr_t cp, cred_t *cr); |
| static int tcp_param_set_aligned(queue_t *q, mblk_t *mp, char *value, |
| caddr_t cp, cred_t *cr); |
| static void tcp_iss_key_init(uint8_t *phrase, int len, tcp_stack_t *); |
| static int tcp_1948_phrase_set(queue_t *q, mblk_t *mp, char *value, |
| caddr_t cp, cred_t *cr); |
| static void tcp_process_shrunk_swnd(tcp_t *tcp, uint32_t shrunk_cnt); |
| static mblk_t *tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start); |
| static void tcp_reass_elim_overlap(tcp_t *tcp, mblk_t *mp); |
| static void tcp_reinit(tcp_t *tcp); |
| static void tcp_reinit_values(tcp_t *tcp); |
| static void tcp_report_item(mblk_t *mp, tcp_t *tcp, int hashval, |
| tcp_t *thisstream, cred_t *cr); |
| |
| static uint_t tcp_rwnd_reopen(tcp_t *tcp); |
| static uint_t tcp_rcv_drain(tcp_t *tcp); |
| static void tcp_sack_rxmit(tcp_t *tcp, uint_t *flags); |
| static boolean_t tcp_send_rst_chk(tcp_stack_t *); |
| static void tcp_ss_rexmit(tcp_t *tcp); |
| static mblk_t *tcp_rput_add_ancillary(tcp_t *tcp, mblk_t *mp, ip6_pkt_t *ipp); |
| static void tcp_process_options(tcp_t *, tcph_t *); |
| static void tcp_rput_common(tcp_t *tcp, mblk_t *mp); |
| static void tcp_rsrv(queue_t *q); |
| static int tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd); |
| static int tcp_snmp_state(tcp_t *tcp); |
| static int tcp_status_report(queue_t *q, mblk_t *mp, caddr_t cp, |
| cred_t *cr); |
| static int tcp_bind_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, |
| cred_t *cr); |
| static int tcp_listen_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, |
| cred_t *cr); |
| static int tcp_conn_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, |
| cred_t *cr); |
| static int tcp_acceptor_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, |
| cred_t *cr); |
| static void tcp_timer(void *arg); |
| static void tcp_timer_callback(void *); |
| static in_port_t tcp_update_next_port(in_port_t port, const tcp_t *tcp, |
| boolean_t random); |
| static in_port_t tcp_get_next_priv_port(const tcp_t *); |
| static void tcp_wput_sock(queue_t *q, mblk_t *mp); |
| static void tcp_wput_fallback(queue_t *q, mblk_t *mp); |
| void tcp_tpi_accept(queue_t *q, mblk_t *mp); |
| static void tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent); |
| static void tcp_wput_flush(tcp_t *tcp, mblk_t *mp); |
| static void tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp); |
| static int tcp_send(queue_t *q, tcp_t *tcp, const int mss, |
| const int tcp_hdr_len, const int tcp_tcp_hdr_len, |
| const int num_sack_blk, int *usable, uint_t *snxt, |
| int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time, |
| const int mdt_thres); |
| static int tcp_multisend(queue_t *q, tcp_t *tcp, const int mss, |
| const int tcp_hdr_len, const int tcp_tcp_hdr_len, |
| const int num_sack_blk, int *usable, uint_t *snxt, |
| int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time, |
| const int mdt_thres); |
| static void tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now, |
| int num_sack_blk); |
| static void tcp_wsrv(queue_t *q); |
| static int tcp_xmit_end(tcp_t *tcp); |
| static void tcp_ack_timer(void *arg); |
| static mblk_t *tcp_ack_mp(tcp_t *tcp); |
| static void tcp_xmit_early_reset(char *str, mblk_t *mp, |
| uint32_t seq, uint32_t ack, int ctl, uint_t ip_hdr_len, |
| zoneid_t zoneid, tcp_stack_t *, conn_t *connp); |
| static void tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq, |
| uint32_t ack, int ctl); |
| static tcp_hsp_t *tcp_hsp_lookup(ipaddr_t addr, tcp_stack_t *); |
| static tcp_hsp_t *tcp_hsp_lookup_ipv6(in6_addr_t *addr, tcp_stack_t *); |
| static int setmaxps(queue_t *q, int maxpsz); |
| static void tcp_set_rto(tcp_t *, time_t); |
| static boolean_t tcp_check_policy(tcp_t *, mblk_t *, ipha_t *, ip6_t *, |
| boolean_t, boolean_t); |
| static void tcp_icmp_error_ipv6(tcp_t *tcp, mblk_t *mp, |
| boolean_t ipsec_mctl); |
| static int tcp_build_hdrs(tcp_t *); |
| static void tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, |
| uint32_t seg_seq, uint32_t seg_ack, int seg_len, |
| tcph_t *tcph); |
| boolean_t tcp_paws_check(tcp_t *tcp, tcph_t *tcph, tcp_opt_t *tcpoptp); |
| static mblk_t *tcp_mdt_info_mp(mblk_t *); |
| static void tcp_mdt_update(tcp_t *, ill_mdt_capab_t *, boolean_t); |
| static int tcp_mdt_add_attrs(multidata_t *, const mblk_t *, |
| const boolean_t, const uint32_t, const uint32_t, |
| const uint32_t, const uint32_t, tcp_stack_t *); |
| static void tcp_multisend_data(tcp_t *, ire_t *, const ill_t *, mblk_t *, |
| const uint_t, const uint_t, boolean_t *); |
| static mblk_t *tcp_lso_info_mp(mblk_t *); |
| static void tcp_lso_update(tcp_t *, ill_lso_capab_t *); |
| static void tcp_send_data(tcp_t *, queue_t *, mblk_t *); |
| extern mblk_t *tcp_timermp_alloc(int); |
| extern void tcp_timermp_free(tcp_t *); |
| static void tcp_timer_free(tcp_t *tcp, mblk_t *mp); |
| static void tcp_stop_lingering(tcp_t *tcp); |
| static void tcp_close_linger_timeout(void *arg); |
| static void *tcp_stack_init(netstackid_t stackid, netstack_t *ns); |
| static void tcp_stack_shutdown(netstackid_t stackid, void *arg); |
| static void tcp_stack_fini(netstackid_t stackid, void *arg); |
| static void *tcp_g_kstat_init(tcp_g_stat_t *); |
| static void tcp_g_kstat_fini(kstat_t *); |
| static void *tcp_kstat_init(netstackid_t, tcp_stack_t *); |
| static void tcp_kstat_fini(netstackid_t, kstat_t *); |
| static void *tcp_kstat2_init(netstackid_t, tcp_stat_t *); |
| static void tcp_kstat2_fini(netstackid_t, kstat_t *); |
| static int tcp_kstat_update(kstat_t *kp, int rw); |
| void tcp_reinput(conn_t *connp, mblk_t *mp, squeue_t *sqp); |
| static int tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp, |
| tcph_t *tcph, uint_t ipvers, mblk_t *idmp); |
| static int tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, ipha_t *ipha, |
| tcph_t *tcph, mblk_t *idmp); |
| static int tcp_squeue_switch(int); |
| |
| static int tcp_open(queue_t *, dev_t *, int, int, cred_t *, boolean_t); |
| static int tcp_openv4(queue_t *, dev_t *, int, int, cred_t *); |
| static int tcp_openv6(queue_t *, dev_t *, int, int, cred_t *); |
| static int tcp_tpi_close(queue_t *, int); |
| static int tcpclose_accept(queue_t *); |
| |
| static void tcp_squeue_add(squeue_t *); |
| static boolean_t tcp_zcopy_check(tcp_t *); |
| static void tcp_zcopy_notify(tcp_t *); |
| static mblk_t *tcp_zcopy_disable(tcp_t *, mblk_t *); |
| static mblk_t *tcp_zcopy_backoff(tcp_t *, mblk_t *, int); |
| static void tcp_ire_ill_check(tcp_t *, ire_t *, ill_t *, boolean_t); |
| |
| extern void tcp_kssl_input(tcp_t *, mblk_t *); |
| |
| void tcp_eager_kill(void *arg, mblk_t *mp, void *arg2); |
| void tcp_clean_death_wrapper(void *arg, mblk_t *mp, void *arg2); |
| |
| static int tcp_accept(sock_lower_handle_t, sock_lower_handle_t, |
| sock_upper_handle_t, cred_t *); |
| static int tcp_listen(sock_lower_handle_t, int, cred_t *); |
| static int tcp_post_ip_bind(tcp_t *, mblk_t *, int, cred_t *, pid_t); |
| static int tcp_do_listen(conn_t *, int, cred_t *); |
| static int tcp_do_connect(conn_t *, const struct sockaddr *, socklen_t, |
| cred_t *, pid_t); |
| static int tcp_do_bind(conn_t *, struct sockaddr *, socklen_t, cred_t *, |
| boolean_t); |
| static int tcp_do_unbind(conn_t *); |
| static int tcp_bind_check(conn_t *, struct sockaddr *, socklen_t, cred_t *, |
| boolean_t); |
| |
| static void tcp_ulp_newconn(conn_t *, conn_t *, mblk_t *); |
| |
| /* |
| * Routines related to the TCP_IOC_ABORT_CONN ioctl command. |
| * |
| * TCP_IOC_ABORT_CONN is a non-transparent ioctl command used for aborting |
| * TCP connections. To invoke this ioctl, a tcp_ioc_abort_conn_t structure |
| * (defined in tcp.h) needs to be filled in and passed into the kernel |
| * via an I_STR ioctl command (see streamio(7I)). The tcp_ioc_abort_conn_t |
| * structure contains the four-tuple of a TCP connection and a range of TCP |
| * states (specified by ac_start and ac_end). The use of wildcard addresses |
| * and ports is allowed. Connections with a matching four tuple and a state |
| * within the specified range will be aborted. The valid states for the |
| * ac_start and ac_end fields are in the range TCPS_SYN_SENT to TCPS_TIME_WAIT, |
| * inclusive. |
| * |
| * An application which has its connection aborted by this ioctl will receive |
| * an error that is dependent on the connection state at the time of the abort. |
| * If the connection state is < TCPS_TIME_WAIT, an application should behave as |
| * though a RST packet has been received. If the connection state is equal to |
| * TCPS_TIME_WAIT, the 2MSL timeout will immediately be canceled by the kernel |
| * and all resources associated with the connection will be freed. |
| */ |
| static mblk_t *tcp_ioctl_abort_build_msg(tcp_ioc_abort_conn_t *, tcp_t *); |
| static void tcp_ioctl_abort_dump(tcp_ioc_abort_conn_t *); |
| static void tcp_ioctl_abort_handler(tcp_t *, mblk_t *); |
| static int tcp_ioctl_abort(tcp_ioc_abort_conn_t *, tcp_stack_t *tcps); |
| static void tcp_ioctl_abort_conn(queue_t *, mblk_t *); |
| static int tcp_ioctl_abort_bucket(tcp_ioc_abort_conn_t *, int, int *, |
| boolean_t, tcp_stack_t *); |
| |
| static struct module_info tcp_rinfo = { |
| TCP_MOD_ID, TCP_MOD_NAME, 0, INFPSZ, TCP_RECV_HIWATER, TCP_RECV_LOWATER |
| }; |
| |
| static struct module_info tcp_winfo = { |
| TCP_MOD_ID, TCP_MOD_NAME, 0, INFPSZ, 127, 16 |
| }; |
| |
| /* |
| * Entry points for TCP as a device. This is the normal case, which |
| * supports the TCP functionality. |
| * We have separate open functions for the /dev/tcp and /dev/tcp6 devices. |
| */ |
| struct qinit tcp_rinitv4 = { |
| NULL, (pfi_t)tcp_rsrv, tcp_openv4, tcp_tpi_close, NULL, &tcp_rinfo |
| }; |
| |
| struct qinit tcp_rinitv6 = { |
| NULL, (pfi_t)tcp_rsrv, tcp_openv6, tcp_tpi_close, NULL, &tcp_rinfo |
| }; |
| |
| struct qinit tcp_winit = { |
| (pfi_t)tcp_wput, (pfi_t)tcp_wsrv, NULL, NULL, NULL, &tcp_winfo |
| }; |
| |
| /* Initial entry point for TCP in socket mode. */ |
| struct qinit tcp_sock_winit = { |
| (pfi_t)tcp_wput_sock, (pfi_t)tcp_wsrv, NULL, NULL, NULL, &tcp_winfo |
| }; |
| |
| /* TCP entry point during fallback */ |
| struct qinit tcp_fallback_sock_winit = { |
| (pfi_t)tcp_wput_fallback, NULL, NULL, NULL, NULL, &tcp_winfo |
| }; |
| |
| /* |
| * Entry points for TCP as an acceptor STREAM opened by sockfs when doing |
| * an accept. Avoid allocating data structures since eager has already |
| * been created. |
| */ |
| struct qinit tcp_acceptor_rinit = { |
| NULL, (pfi_t)tcp_rsrv, NULL, tcpclose_accept, NULL, &tcp_winfo |
| }; |
| |
| struct qinit tcp_acceptor_winit = { |
| (pfi_t)tcp_tpi_accept, NULL, NULL, NULL, NULL, &tcp_winfo |
| }; |
| |
| /* |
| * Entry points for TCP loopback (read side only) |
| * The open routine is only used for reopens, thus no need to |
| * have a separate one for tcp_openv6. |
| */ |
| struct qinit tcp_loopback_rinit = { |
| (pfi_t)0, (pfi_t)tcp_rsrv, tcp_openv4, tcp_tpi_close, (pfi_t)0, |
| &tcp_rinfo, NULL, tcp_fuse_rrw, tcp_fuse_rinfop, STRUIOT_STANDARD |
| }; |
| |
| /* For AF_INET aka /dev/tcp */ |
| struct streamtab tcpinfov4 = { |
| &tcp_rinitv4, &tcp_winit |
| }; |
| |
| /* For AF_INET6 aka /dev/tcp6 */ |
| struct streamtab tcpinfov6 = { |
| &tcp_rinitv6, &tcp_winit |
| }; |
| |
| sock_downcalls_t sock_tcp_downcalls; |
| |
| /* |
| * Have to ensure that tcp_g_q_close is not done by an |
| * interrupt thread. |
| */ |
| static taskq_t *tcp_taskq; |
| |
| /* Settable only in /etc/system. Move to ndd? */ |
| boolean_t tcp_icmp_source_quench = B_FALSE; |
| |
| /* |
| * The following assumes TPI alignment requirements stay along 32-bit |
| * boundaries. |
| */ |
| #define ROUNDUP32(x) \ |
| (((x) + (sizeof (int32_t) - 1)) & ~(sizeof (int32_t) - 1)) |
| |
| /* Template for response to info request. */ |
| static struct T_info_ack tcp_g_t_info_ack = { |
| T_INFO_ACK, /* PRIM_type */ |
| 0, /* TSDU_size */ |
| T_INFINITE, /* ETSDU_size */ |
| T_INVALID, /* CDATA_size */ |
| T_INVALID, /* DDATA_size */ |
| sizeof (sin_t), /* ADDR_size */ |
| 0, /* OPT_size - not initialized here */ |
| TIDUSZ, /* TIDU_size */ |
| T_COTS_ORD, /* SERV_type */ |
| TCPS_IDLE, /* CURRENT_state */ |
| (XPG4_1|EXPINLINE) /* PROVIDER_flag */ |
| }; |
| |
| static struct T_info_ack tcp_g_t_info_ack_v6 = { |
| T_INFO_ACK, /* PRIM_type */ |
| 0, /* TSDU_size */ |
| T_INFINITE, /* ETSDU_size */ |
| T_INVALID, /* CDATA_size */ |
| T_INVALID, /* DDATA_size */ |
| sizeof (sin6_t), /* ADDR_size */ |
| 0, /* OPT_size - not initialized here */ |
| TIDUSZ, /* TIDU_size */ |
| T_COTS_ORD, /* SERV_type */ |
| TCPS_IDLE, /* CURRENT_state */ |
| (XPG4_1|EXPINLINE) /* PROVIDER_flag */ |
| }; |
| |
| #define MS 1L |
| #define SECONDS (1000 * MS) |
| #define MINUTES (60 * SECONDS) |
| #define HOURS (60 * MINUTES) |
| #define DAYS (24 * HOURS) |
| |
| #define PARAM_MAX (~(uint32_t)0) |
| |
| /* Max size IP datagram is 64k - 1 */ |
| #define TCP_MSS_MAX_IPV4 (IP_MAXPACKET - (sizeof (ipha_t) + sizeof (tcph_t))) |
| #define TCP_MSS_MAX_IPV6 (IP_MAXPACKET - (sizeof (ip6_t) + sizeof (tcph_t))) |
| /* Max of the above */ |
| #define TCP_MSS_MAX TCP_MSS_MAX_IPV4 |
| |
| /* Largest TCP port number */ |
| #define TCP_MAX_PORT (64 * 1024 - 1) |
| |
| /* |
| * tcp_wroff_xtra is the extra space in front of the TCP/IP header for the |
| * link layer header. It has to be a multiple of 4. |
| */ |
| static tcpparam_t lcl_tcp_wroff_xtra_param = { 0, 256, 32, "tcp_wroff_xtra" }; |
| #define tcps_wroff_xtra tcps_wroff_xtra_param->tcp_param_val |
| |
| /* |
| * All of these are alterable, within the min/max values given, at run time. |
| * Note that "tcp_time_wait_interval" defaults to one minute below, although |
| * the TCP spec calls for four minutes (2*MSL). |
| */ |
| /* BEGIN CSTYLED */ |
| static tcpparam_t lcl_tcp_param_arr[] = { |
| /*min max value name */ |
| { 1*SECONDS, 10*MINUTES, 1*MINUTES, "tcp_time_wait_interval"}, |
| { 1, PARAM_MAX, 128, "tcp_conn_req_max_q" }, |
| { 0, PARAM_MAX, 1024, "tcp_conn_req_max_q0" }, |
| { 1, 1024, 1, "tcp_conn_req_min" }, |
| { 0*MS, 20*SECONDS, 0*MS, "tcp_conn_grace_period" }, |
| { 128, (1<<30), 1024*1024, "tcp_cwnd_max" }, |
| { 0, 10, 0, "tcp_debug" }, |
| { 1024, (32*1024), 1024, "tcp_smallest_nonpriv_port"}, |
| { 1*SECONDS, PARAM_MAX, 3*MINUTES, "tcp_ip_abort_cinterval"}, |
| { 1*SECONDS, PARAM_MAX, 3*MINUTES, "tcp_ip_abort_linterval"}, |
| { 500*MS, PARAM_MAX, 8*MINUTES, "tcp_ip_abort_interval"}, |
| { 1*SECONDS, PARAM_MAX, 10*SECONDS, "tcp_ip_notify_cinterval"}, |
| { 500*MS, PARAM_MAX, 10*SECONDS, "tcp_ip_notify_interval"}, |
| { 1, 255, 64, "tcp_ipv4_ttl"}, |
| { 10*SECONDS, 10*DAYS, 2*HOURS, "tcp_keepalive_interval"}, |
| { 0, 100, 10, "tcp_maxpsz_multiplier" }, |
| { 1, TCP_MSS_MAX_IPV4, 536, "tcp_mss_def_ipv4"}, |
| { 1, TCP_MSS_MAX_IPV4, TCP_MSS_MAX_IPV4, "tcp_mss_max_ipv4"}, |
| { 1, TCP_MSS_MAX, 108, "tcp_mss_min"}, |
| { 1, (64*1024)-1, (4*1024)-1, "tcp_naglim_def"}, |
| { 1*MS, 20*SECONDS, 3*SECONDS, "tcp_rexmit_interval_initial"}, |
| { 1*MS, 2*HOURS, 60*SECONDS, "tcp_rexmit_interval_max"}, |
| { 1*MS, 2*HOURS, 400*MS, "tcp_rexmit_interval_min"}, |
| { 1*MS, 1*MINUTES, 100*MS, "tcp_deferred_ack_interval" }, |
| { 0, 16, 0, "tcp_snd_lowat_fraction" }, |
| { 0, 128000, 0, "tcp_sth_rcv_hiwat" }, |
| { 0, 128000, 0, "tcp_sth_rcv_lowat" }, |
| { 1, 10000, 3, "tcp_dupack_fast_retransmit" }, |
| { 0, 1, 0, "tcp_ignore_path_mtu" }, |
| { 1024, TCP_MAX_PORT, 32*1024, "tcp_smallest_anon_port"}, |
| { 1024, TCP_MAX_PORT, TCP_MAX_PORT, "tcp_largest_anon_port"}, |
| { TCP_XMIT_LOWATER, (1<<30), TCP_XMIT_HIWATER,"tcp_xmit_hiwat"}, |
| { TCP_XMIT_LOWATER, (1<<30), TCP_XMIT_LOWATER,"tcp_xmit_lowat"}, |
| { TCP_RECV_LOWATER, (1<<30), TCP_RECV_HIWATER,"tcp_recv_hiwat"}, |
| { 1, 65536, 4, "tcp_recv_hiwat_minmss"}, |
| { 1*SECONDS, PARAM_MAX, 675*SECONDS, "tcp_fin_wait_2_flush_interval"}, |
| { 8192, (1<<30), 1024*1024, "tcp_max_buf"}, |
| /* |
| * Question: What default value should I set for tcp_strong_iss? |
| */ |
| { 0, 2, 1, "tcp_strong_iss"}, |
| { 0, 65536, 20, "tcp_rtt_updates"}, |
| { 0, 1, 1, "tcp_wscale_always"}, |
| { 0, 1, 0, "tcp_tstamp_always"}, |
| { 0, 1, 1, "tcp_tstamp_if_wscale"}, |
| { 0*MS, 2*HOURS, 0*MS, "tcp_rexmit_interval_extra"}, |
| { 0, 16, 2, "tcp_deferred_acks_max"}, |
| { 1, 16384, 4, "tcp_slow_start_after_idle"}, |
| { 1, 4, 4, "tcp_slow_start_initial"}, |
| { 0, 2, 2, "tcp_sack_permitted"}, |
| { 0, 1, 1, "tcp_compression_enabled"}, |
| { 0, IPV6_MAX_HOPS, IPV6_DEFAULT_HOPS, "tcp_ipv6_hoplimit"}, |
| { 1, TCP_MSS_MAX_IPV6, 1220, "tcp_mss_def_ipv6"}, |
| { 1, TCP_MSS_MAX_IPV6, TCP_MSS_MAX_IPV6, "tcp_mss_max_ipv6"}, |
| { 0, 1, 0, "tcp_rev_src_routes"}, |
| { 10*MS, 500*MS, 50*MS, "tcp_local_dack_interval"}, |
| { 100*MS, 60*SECONDS, 1*SECONDS, "tcp_ndd_get_info_interval"}, |
| { 0, 16, 8, "tcp_local_dacks_max"}, |
| { 0, 2, 1, "tcp_ecn_permitted"}, |
| { 0, 1, 1, "tcp_rst_sent_rate_enabled"}, |
| { 0, PARAM_MAX, 40, "tcp_rst_sent_rate"}, |
| { 0, 100*MS, 50*MS, "tcp_push_timer_interval"}, |
| { 0, 1, 0, "tcp_use_smss_as_mss_opt"}, |
| { 0, PARAM_MAX, 8*MINUTES, "tcp_keepalive_abort_interval"}, |
| }; |
| /* END CSTYLED */ |
| |
| /* |
| * tcp_mdt_hdr_{head,tail}_min are the leading and trailing spaces of |
| * each header fragment in the header buffer. Each parameter value has |
| * to be a multiple of 4 (32-bit aligned). |
| */ |
| static tcpparam_t lcl_tcp_mdt_head_param = |
| { 32, 256, 32, "tcp_mdt_hdr_head_min" }; |
| static tcpparam_t lcl_tcp_mdt_tail_param = |
| { 0, 256, 32, "tcp_mdt_hdr_tail_min" }; |
| #define tcps_mdt_hdr_head_min tcps_mdt_head_param->tcp_param_val |
| #define tcps_mdt_hdr_tail_min tcps_mdt_tail_param->tcp_param_val |
| |
| /* |
| * tcp_mdt_max_pbufs is the upper limit value that tcp uses to figure out |
| * the maximum number of payload buffers associated with each Multidata. |
| */ |
| static tcpparam_t lcl_tcp_mdt_max_pbufs_param = |
| { 1, MULTIDATA_MAX_PBUFS, MULTIDATA_MAX_PBUFS, "tcp_mdt_max_pbufs" }; |
| #define tcps_mdt_max_pbufs tcps_mdt_max_pbufs_param->tcp_param_val |
| |
| /* Round up the value to the nearest mss. */ |
| #define MSS_ROUNDUP(value, mss) ((((value) - 1) / (mss) + 1) * (mss)) |
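| /* E.g. MSS_ROUNDUP(4000, 1460) == 4380; exact multiples are unchanged. */ |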
| |
| /* |
| * Set ECN capable transport (ECT) code point in IP header. |
| * |
| * Note that there are 2 ECT code points '01' and '10', which are called |
| * ECT(1) and ECT(0) respectively. Here we follow the original ECT code |
| * point ECT(0) for TCP as described in RFC 2481. |
| */ |
| #define SET_ECT(tcp, iph) \ |
| if ((tcp)->tcp_ipversion == IPV4_VERSION) { \ |
| /* We need to clear the code point first. */ \ |
| ((ipha_t *)(iph))->ipha_type_of_service &= 0xFC; \ |
| ((ipha_t *)(iph))->ipha_type_of_service |= IPH_ECN_ECT0; \ |
| } else { \ |
| ((ip6_t *)(iph))->ip6_vcf &= htonl(0xFFCFFFFF); \ |
| ((ip6_t *)(iph))->ip6_vcf |= htonl(IPH_ECN_ECT0 << 20); \ |
| } |
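| |
| /* |
| * Note on the bit manipulation above: the two ECN bits are the low-order |
| * bits of the IPv4 TOS byte (hence the 0xFC mask), while in IPv6 they are |
| * bits 20-21 of ip6_vcf, where the traffic class occupies bits 20-27 (hence |
| * the htonl(0xFFCFFFFF) mask and the << 20 shift of the ECT(0) code point). |
| */ |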
| |
| /* |
| * The format argument to pass to tcp_display(). |
| * DISP_PORT_ONLY means that the returned string has only port info. |
| * DISP_ADDR_AND_PORT means that the returned string also contains the |
| * remote and local IP address. |
| */ |
| #define DISP_PORT_ONLY 1 |
| #define DISP_ADDR_AND_PORT 2 |
| |
| #define NDD_TOO_QUICK_MSG \ |
| "ndd get info rate too high for non-privileged users, try again " \ |
| "later.\n" |
| #define NDD_OUT_OF_BUF_MSG "<< Out of buffer >>\n" |
| |
| #define IS_VMLOANED_MBLK(mp) \ |
| (((mp)->b_datap->db_struioflag & STRUIO_ZC) != 0) |
| |
| |
| /* Enable or disable b_cont M_MULTIDATA chaining for MDT. */ |
| boolean_t tcp_mdt_chain = B_TRUE; |
| |
| /* |
| * MDT threshold in the form of effective send MSS multiplier; we take |
| * the MDT path if the amount of unsent data exceeds the threshold value |
| * (default threshold is 1*SMSS). |
| */ |
| uint_t tcp_mdt_smss_threshold = 1; |
| |
| uint32_t do_tcpzcopy = 1; /* 0: disable, 1: enable, 2: force */ |
| |
| /* |
| * If set, forces all connections to obey the value of the |
| * tcps_maxpsz_multiplier tunable settable via NDD. Otherwise, the |
| * per-connection behavior is determined dynamically during |
| * tcp_adapt_ire(), which is the default. |
| */ |
| boolean_t tcp_static_maxpsz = B_FALSE; |
| |
| /* Settable in /etc/system */ |
| /* If set to 0, pick ephemeral port sequentially; otherwise randomly. */ |
| uint32_t tcp_random_anon_port = 1; |
| |
| /* |
| * To reach an eager in Q0 which can be dropped due to an incoming |
| * new SYN request when Q0 is full, a new doubly linked list is |
| * introduced. This list allows selecting an eager from Q0 in O(1) time. |
| * This is needed to avoid spending too much time walking through the |
| * long list of eagers in Q0 when tcp_drop_q0() is called. Each member of |
| * this new list has to be a member of Q0. |
| * This list is headed by listener's tcp_t. When the list is empty, |
| * both the pointers - tcp_eager_next_drop_q0 and tcp_eager_prev_drop_q0, |
| * of listener's tcp_t point to listener's tcp_t itself. |
| * |
| * Given an eager in Q0 and a listener, MAKE_DROPPABLE() puts the eager |
| * in the list. MAKE_UNDROPPABLE() takes the eager out of the list. |
| * These macros do not affect the eager's membership to Q0. |
| */ |
| |
| |
| #define MAKE_DROPPABLE(listener, eager) \ |
| if ((eager)->tcp_eager_next_drop_q0 == NULL) { \ |
| (listener)->tcp_eager_next_drop_q0->tcp_eager_prev_drop_q0\ |
| = (eager); \ |
| (eager)->tcp_eager_prev_drop_q0 = (listener); \ |
| (eager)->tcp_eager_next_drop_q0 = \ |
| (listener)->tcp_eager_next_drop_q0; \ |
| (listener)->tcp_eager_next_drop_q0 = (eager); \ |
| } |
| |
| #define MAKE_UNDROPPABLE(eager) \ |
| if ((eager)->tcp_eager_next_drop_q0 != NULL) { \ |
| (eager)->tcp_eager_next_drop_q0->tcp_eager_prev_drop_q0 \ |
| = (eager)->tcp_eager_prev_drop_q0; \ |
| (eager)->tcp_eager_prev_drop_q0->tcp_eager_next_drop_q0 \ |
| = (eager)->tcp_eager_next_drop_q0; \ |
| (eager)->tcp_eager_prev_drop_q0 = NULL; \ |
| (eager)->tcp_eager_next_drop_q0 = NULL; \ |
| } |
| |
| /* |
| * If tcp_drop_ack_unsent_cnt is greater than 0, when TCP receives more |
| * than tcp_drop_ack_unsent_cnt number of ACKs which acknowledge unsent |
 * data, TCP will not respond with an ACK. RFC 793 requires that
 * TCP respond with an ACK for such a bogus ACK. By not following
| * the RFC, we prevent TCP from getting into an ACK storm if somehow |
| * an attacker successfully spoofs an acceptable segment to our |
| * peer; or when our peer is "confused." |
| */ |
| uint32_t tcp_drop_ack_unsent_cnt = 10; |
| |
| /* |
| * Hook functions to enable cluster networking |
| * On non-clustered systems these vectors must always be NULL. |
| */ |
| |
| void (*cl_inet_listen)(netstackid_t stack_id, uint8_t protocol, |
| sa_family_t addr_family, uint8_t *laddrp, |
| in_port_t lport, void *args) = NULL; |
| void (*cl_inet_unlisten)(netstackid_t stack_id, uint8_t protocol, |
| sa_family_t addr_family, uint8_t *laddrp, |
| in_port_t lport, void *args) = NULL; |
| |
| int (*cl_inet_connect2)(netstackid_t stack_id, uint8_t protocol, |
| boolean_t is_outgoing, |
| sa_family_t addr_family, |
| uint8_t *laddrp, in_port_t lport, |
| uint8_t *faddrp, in_port_t fport, |
| void *args) = NULL; |
| |
| void (*cl_inet_disconnect)(netstackid_t stack_id, uint8_t protocol, |
| sa_family_t addr_family, uint8_t *laddrp, |
| in_port_t lport, uint8_t *faddrp, |
| in_port_t fport, void *args) = NULL; |
| |
| /* |
| * The following are defined in ip.c |
| */ |
| extern int (*cl_inet_isclusterwide)(netstackid_t stack_id, uint8_t protocol, |
| sa_family_t addr_family, uint8_t *laddrp, |
| void *args); |
| extern uint32_t (*cl_inet_ipident)(netstackid_t stack_id, uint8_t protocol, |
| sa_family_t addr_family, uint8_t *laddrp, |
| uint8_t *faddrp, void *args); |
| |
| |
| /* |
| * int CL_INET_CONNECT(conn_t *cp, tcp_t *tcp, boolean_t is_outgoing, int err) |
| */ |
| #define CL_INET_CONNECT(connp, tcp, is_outgoing, err) { \ |
| (err) = 0; \ |
| if (cl_inet_connect2 != NULL) { \ |
| /* \ |
| * Running in cluster mode - register active connection \ |
| * information \ |
| */ \ |
| if ((tcp)->tcp_ipversion == IPV4_VERSION) { \ |
| if ((tcp)->tcp_ipha->ipha_src != 0) { \ |
| (err) = (*cl_inet_connect2)( \ |
| (connp)->conn_netstack->netstack_stackid,\ |
| IPPROTO_TCP, is_outgoing, AF_INET, \ |
| (uint8_t *)(&((tcp)->tcp_ipha->ipha_src)),\ |
| (in_port_t)(tcp)->tcp_lport, \ |
| (uint8_t *)(&((tcp)->tcp_ipha->ipha_dst)),\ |
| (in_port_t)(tcp)->tcp_fport, NULL); \ |
| } \ |
| } else { \ |
| if (!IN6_IS_ADDR_UNSPECIFIED( \ |
| &(tcp)->tcp_ip6h->ip6_src)) { \ |
| (err) = (*cl_inet_connect2)( \ |
| (connp)->conn_netstack->netstack_stackid,\ |
| IPPROTO_TCP, is_outgoing, AF_INET6, \ |
| (uint8_t *)(&((tcp)->tcp_ip6h->ip6_src)),\ |
| (in_port_t)(tcp)->tcp_lport, \ |
| (uint8_t *)(&((tcp)->tcp_ip6h->ip6_dst)),\ |
| (in_port_t)(tcp)->tcp_fport, NULL); \ |
| } \ |
| } \ |
| } \ |
| } |
| |
| #define CL_INET_DISCONNECT(connp, tcp) { \ |
| if (cl_inet_disconnect != NULL) { \ |
| /* \ |
| * Running in cluster mode - deregister active \ |
| * connection information \ |
| */ \ |
| if ((tcp)->tcp_ipversion == IPV4_VERSION) { \ |
| if ((tcp)->tcp_ip_src != 0) { \ |
| (*cl_inet_disconnect)( \ |
| (connp)->conn_netstack->netstack_stackid,\ |
| IPPROTO_TCP, AF_INET, \ |
| (uint8_t *)(&((tcp)->tcp_ip_src)), \ |
| (in_port_t)(tcp)->tcp_lport, \ |
| (uint8_t *)(&((tcp)->tcp_ipha->ipha_dst)),\ |
| (in_port_t)(tcp)->tcp_fport, NULL); \ |
| } \ |
| } else { \ |
| if (!IN6_IS_ADDR_UNSPECIFIED( \ |
| &(tcp)->tcp_ip_src_v6)) { \ |
| (*cl_inet_disconnect)( \ |
| (connp)->conn_netstack->netstack_stackid,\ |
| IPPROTO_TCP, AF_INET6, \ |
| (uint8_t *)(&((tcp)->tcp_ip_src_v6)),\ |
| (in_port_t)(tcp)->tcp_lport, \ |
| (uint8_t *)(&((tcp)->tcp_ip6h->ip6_dst)),\ |
| (in_port_t)(tcp)->tcp_fport, NULL); \ |
| } \ |
| } \ |
| } \ |
| } |
| |
| /* |
| * Cluster networking hook for traversing current connection list. |
| * This routine is used to extract the current list of live connections |
 * which must continue to be dispatched to this node.
| */ |
| int cl_tcp_walk_list(netstackid_t stack_id, |
| int (*callback)(cl_tcp_info_t *, void *), void *arg); |
| |
| static int cl_tcp_walk_list_stack(int (*callback)(cl_tcp_info_t *, void *), |
| void *arg, tcp_stack_t *tcps); |
| |
| #define DTRACE_IP_FASTPATH(mp, iph, ill, ipha, ip6h) \ |
| DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *, \ |
| iph, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, \ |
| ip6_t *, ip6h, int, 0); |
| |
| /* |
 * Figure out the value of window scale option. Note that the rwnd is
| * ASSUMED to be rounded up to the nearest MSS before the calculation. |
| * We cannot find the scale value and then do a round up of tcp_rwnd |
| * because the scale value may not be correct after that. |
| * |
| * Set the compiler flag to make this function inline. |
| */ |
| static void |
| tcp_set_ws_value(tcp_t *tcp) |
| { |
| int i; |
| uint32_t rwnd = tcp->tcp_rwnd; |
| |
| for (i = 0; rwnd > TCP_MAXWIN && i < TCP_MAX_WINSHIFT; |
| i++, rwnd >>= 1) |
| ; |
| tcp->tcp_rcv_ws = i; |
| } |
| |
| /* |
| * Remove a connection from the list of detached TIME_WAIT connections. |
| * It returns B_FALSE if it can't remove the connection from the list |
| * as the connection has already been removed from the list due to an |
| * earlier call to tcp_time_wait_remove(); otherwise it returns B_TRUE. |
| */ |
| static boolean_t |
| tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tcp_time_wait) |
| { |
| boolean_t locked = B_FALSE; |
| |
| if (tcp_time_wait == NULL) { |
| tcp_time_wait = *((tcp_squeue_priv_t **) |
| squeue_getprivate(tcp->tcp_connp->conn_sqp, SQPRIVATE_TCP)); |
| mutex_enter(&tcp_time_wait->tcp_time_wait_lock); |
| locked = B_TRUE; |
| } else { |
| ASSERT(MUTEX_HELD(&tcp_time_wait->tcp_time_wait_lock)); |
| } |
| |
| if (tcp->tcp_time_wait_expire == 0) { |
| ASSERT(tcp->tcp_time_wait_next == NULL); |
| ASSERT(tcp->tcp_time_wait_prev == NULL); |
| if (locked) |
| mutex_exit(&tcp_time_wait->tcp_time_wait_lock); |
| return (B_FALSE); |
| } |
| ASSERT(TCP_IS_DETACHED(tcp)); |
| ASSERT(tcp->tcp_state == TCPS_TIME_WAIT); |
| |
| if (tcp == tcp_time_wait->tcp_time_wait_head) { |
| ASSERT(tcp->tcp_time_wait_prev == NULL); |
| tcp_time_wait->tcp_time_wait_head = tcp->tcp_time_wait_next; |
| if (tcp_time_wait->tcp_time_wait_head != NULL) { |
| tcp_time_wait->tcp_time_wait_head->tcp_time_wait_prev = |
| NULL; |
| } else { |
| tcp_time_wait->tcp_time_wait_tail = NULL; |
| } |
| } else if (tcp == tcp_time_wait->tcp_time_wait_tail) { |
| ASSERT(tcp != tcp_time_wait->tcp_time_wait_head); |
| ASSERT(tcp->tcp_time_wait_next == NULL); |
| tcp_time_wait->tcp_time_wait_tail = tcp->tcp_time_wait_prev; |
| ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL); |
| tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next = NULL; |
| } else { |
| ASSERT(tcp->tcp_time_wait_prev->tcp_time_wait_next == tcp); |
| ASSERT(tcp->tcp_time_wait_next->tcp_time_wait_prev == tcp); |
| tcp->tcp_time_wait_prev->tcp_time_wait_next = |
| tcp->tcp_time_wait_next; |
| tcp->tcp_time_wait_next->tcp_time_wait_prev = |
| tcp->tcp_time_wait_prev; |
| } |
| tcp->tcp_time_wait_next = NULL; |
| tcp->tcp_time_wait_prev = NULL; |
| tcp->tcp_time_wait_expire = 0; |
| |
| if (locked) |
| mutex_exit(&tcp_time_wait->tcp_time_wait_lock); |
| return (B_TRUE); |
| } |
| |
| /* |
| * Add a connection to the list of detached TIME_WAIT connections |
| * and set its time to expire. |
| */ |
| static void |
| tcp_time_wait_append(tcp_t *tcp) |
| { |
| tcp_stack_t *tcps = tcp->tcp_tcps; |
| tcp_squeue_priv_t *tcp_time_wait = |
| *((tcp_squeue_priv_t **)squeue_getprivate(tcp->tcp_connp->conn_sqp, |
| SQPRIVATE_TCP)); |
| |
| tcp_timers_stop(tcp); |
| |
| /* Freed above */ |
| ASSERT(tcp->tcp_timer_tid == 0); |
| ASSERT(tcp->tcp_ack_tid == 0); |
| |
| /* must have happened at the time of detaching the tcp */ |
| ASSERT(tcp->tcp_ptpahn == NULL); |
| ASSERT(tcp->tcp_flow_stopped == 0); |
| ASSERT(tcp->tcp_time_wait_next == NULL); |
| ASSERT(tcp->tcp_time_wait_prev == NULL); |
	ASSERT(tcp->tcp_time_wait_expire == 0);
| ASSERT(tcp->tcp_listener == NULL); |
| |
| tcp->tcp_time_wait_expire = ddi_get_lbolt(); |
| /* |
| * The value computed below in tcp->tcp_time_wait_expire may |
| * appear negative or wrap around. That is ok since our |
| * interest is only in the difference between the current lbolt |
| * value and tcp->tcp_time_wait_expire. But the value should not |
| * be zero, since it means the tcp is not in the TIME_WAIT list. |
| * The corresponding comparison in tcp_time_wait_collector() uses |
| * modular arithmetic. |
| */ |
| tcp->tcp_time_wait_expire += |
| drv_usectohz(tcps->tcps_time_wait_interval * 1000); |
| if (tcp->tcp_time_wait_expire == 0) |
| tcp->tcp_time_wait_expire = 1; |
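	/*
	 * For example, assuming hz is 100 and the default
	 * tcps_time_wait_interval of 60,000 ms, drv_usectohz() converts the
	 * interval into 6,000 ticks, i.e. the entry expires about a minute
	 * of lbolt time from now.
	 */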
| |
| ASSERT(TCP_IS_DETACHED(tcp)); |
| ASSERT(tcp->tcp_state == TCPS_TIME_WAIT); |
| ASSERT(tcp->tcp_time_wait_next == NULL); |
| ASSERT(tcp->tcp_time_wait_prev == NULL); |
| TCP_DBGSTAT(tcps, tcp_time_wait); |
| |
| mutex_enter(&tcp_time_wait->tcp_time_wait_lock); |
| if (tcp_time_wait->tcp_time_wait_head == NULL) { |
| ASSERT(tcp_time_wait->tcp_time_wait_tail == NULL); |
| tcp_time_wait->tcp_time_wait_head = tcp; |
| } else { |
| ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL); |
| ASSERT(tcp_time_wait->tcp_time_wait_tail->tcp_state == |
| TCPS_TIME_WAIT); |
| tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next = tcp; |
| tcp->tcp_time_wait_prev = tcp_time_wait->tcp_time_wait_tail; |
| } |
| tcp_time_wait->tcp_time_wait_tail = tcp; |
| mutex_exit(&tcp_time_wait->tcp_time_wait_lock); |
| } |
| |
| /* ARGSUSED */ |
| void |
| tcp_timewait_output(void *arg, mblk_t *mp, void *arg2) |
| { |
| conn_t *connp = (conn_t *)arg; |
| tcp_t *tcp = connp->conn_tcp; |
| tcp_stack_t *tcps = tcp->tcp_tcps; |
| |
| ASSERT(tcp != NULL); |
| if (tcp->tcp_state == TCPS_CLOSED) { |
| return; |
| } |
| |
| ASSERT((tcp->tcp_family == AF_INET && |
| tcp->tcp_ipversion == IPV4_VERSION) || |
| (tcp->tcp_family == AF_INET6 && |
| (tcp->tcp_ipversion == IPV4_VERSION || |
| tcp->tcp_ipversion == IPV6_VERSION))); |
| ASSERT(!tcp->tcp_listener); |
| |
| TCP_STAT(tcps, tcp_time_wait_reap); |
| ASSERT(TCP_IS_DETACHED(tcp)); |
| |
| /* |
| * Because they have no upstream client to rebind or tcp_close() |
| * them later, we axe the connection here and now. |
| */ |
| tcp_close_detached(tcp); |
| } |
| |
| /* |
| * Remove cached/latched IPsec references. |
| */ |
| void |
| tcp_ipsec_cleanup(tcp_t *tcp) |
| { |
| conn_t *connp = tcp->tcp_connp; |
| |
| ASSERT(connp->conn_flags & IPCL_TCPCONN); |
| |
| if (connp->conn_latch != NULL) { |
| IPLATCH_REFRELE(connp->conn_latch, |
| connp->conn_netstack); |
| connp->conn_latch = NULL; |
| } |
| if (connp->conn_policy != NULL) { |
| IPPH_REFRELE(connp->conn_policy, connp->conn_netstack); |
| connp->conn_policy = NULL; |
| } |
| } |
| |
| /* |
 * Cleanup before placing on the free list.
| * Disassociate from the netstack/tcp_stack_t since the freelist |
| * is per squeue and not per netstack. |
| */ |
| void |
| tcp_cleanup(tcp_t *tcp) |
| { |
| mblk_t *mp; |
| char *tcp_iphc; |
| int tcp_iphc_len; |
| int tcp_hdr_grown; |
| tcp_sack_info_t *tcp_sack_info; |
| conn_t *connp = tcp->tcp_connp; |
| tcp_stack_t *tcps = tcp->tcp_tcps; |
| netstack_t *ns = tcps->tcps_netstack; |
| mblk_t *tcp_rsrv_mp; |
| |
| tcp_bind_hash_remove(tcp); |
| |
| /* Cleanup that which needs the netstack first */ |
| tcp_ipsec_cleanup(tcp); |
| |
| tcp_free(tcp); |
| |
| /* Release any SSL context */ |
| if (tcp->tcp_kssl_ent != NULL) { |
| kssl_release_ent(tcp->tcp_kssl_ent, NULL, KSSL_NO_PROXY); |
| tcp->tcp_kssl_ent = NULL; |
| } |
| |
| if (tcp->tcp_kssl_ctx != NULL) { |
| kssl_release_ctx(tcp->tcp_kssl_ctx); |
| tcp->tcp_kssl_ctx = NULL; |
| } |
| tcp->tcp_kssl_pending = B_FALSE; |
| |
| conn_delete_ire(connp, NULL); |
| |
| /* |
	 * Since we will bzero the entire structure, we need to
	 * remove it and reinsert it in the global hash list. We
	 * know the walkers can't get to this conn because we
	 * set the CONDEMNED flag earlier and checked the reference
	 * count under conn_lock, so a walker won't pick it up, and
	 * once we do the ipcl_globalhash_remove() below, no walker
	 * can get to it.
| */ |
| ipcl_globalhash_remove(connp); |
| |
| /* |
| * Now it is safe to decrement the reference counts. |
| * This might be the last reference on the netstack and TCPS |
| * in which case it will cause the tcp_g_q_close and |
| * the freeing of the IP Instance. |
| */ |
| connp->conn_netstack = NULL; |
| netstack_rele(ns); |
| ASSERT(tcps != NULL); |
| tcp->tcp_tcps = NULL; |
| TCPS_REFRELE(tcps); |
| |
| /* Save some state */ |
| mp = tcp->tcp_timercache; |
| |
| tcp_sack_info = tcp->tcp_sack_info; |
| tcp_iphc = tcp->tcp_iphc; |
| tcp_iphc_len = tcp->tcp_iphc_len; |
| tcp_hdr_grown = tcp->tcp_hdr_grown; |
| tcp_rsrv_mp = tcp->tcp_rsrv_mp; |
| |
| if (connp->conn_cred != NULL) { |
| crfree(connp->conn_cred); |
| connp->conn_cred = NULL; |
| } |
| if (connp->conn_peercred != NULL) { |
| crfree(connp->conn_peercred); |
| connp->conn_peercred = NULL; |
| } |
| ipcl_conn_cleanup(connp); |
| connp->conn_flags = IPCL_TCPCONN; |
| bzero(tcp, sizeof (tcp_t)); |
| |
| /* restore the state */ |
| tcp->tcp_timercache = mp; |
| |
| tcp->tcp_sack_info = tcp_sack_info; |
| tcp->tcp_iphc = tcp_iphc; |
| tcp->tcp_iphc_len = tcp_iphc_len; |
| tcp->tcp_hdr_grown = tcp_hdr_grown; |
| tcp->tcp_rsrv_mp = tcp_rsrv_mp; |
| |
| tcp->tcp_connp = connp; |
| |
| ASSERT(connp->conn_tcp == tcp); |
| ASSERT(connp->conn_flags & IPCL_TCPCONN); |
| connp->conn_state_flags = CONN_INCIPIENT; |
| ASSERT(connp->conn_ulp == IPPROTO_TCP); |
| ASSERT(connp->conn_ref == 1); |
| } |
| |
| /* |
| * Blows away all tcps whose TIME_WAIT has expired. List traversal |
| * is done forwards from the head. |
| * This walks all stack instances since |
| * tcp_time_wait remains global across all stacks. |
| */ |
| /* ARGSUSED */ |
| void |
| tcp_time_wait_collector(void *arg) |
| { |
| tcp_t *tcp; |
| clock_t now; |
| mblk_t *mp; |
| conn_t *connp; |
| kmutex_t *lock; |
| boolean_t removed; |
| |
| squeue_t *sqp = (squeue_t *)arg; |
| tcp_squeue_priv_t *tcp_time_wait = |
| *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP)); |
| |
| mutex_enter(&tcp_time_wait->tcp_time_wait_lock); |
| tcp_time_wait->tcp_time_wait_tid = 0; |
| |
| if (tcp_time_wait->tcp_free_list != NULL && |
| tcp_time_wait->tcp_free_list->tcp_in_free_list == B_TRUE) { |
| TCP_G_STAT(tcp_freelist_cleanup); |
| while ((tcp = tcp_time_wait->tcp_free_list) != NULL) { |
| tcp_time_wait->tcp_free_list = tcp->tcp_time_wait_next; |
| tcp->tcp_time_wait_next = NULL; |
| tcp_time_wait->tcp_free_list_cnt--; |
| ASSERT(tcp->tcp_tcps == NULL); |
| CONN_DEC_REF(tcp->tcp_connp); |
| } |
| ASSERT(tcp_time_wait->tcp_free_list_cnt == 0); |
| } |
| |
| /* |
| * In order to reap time waits reliably, we should use a |
| * source of time that is not adjustable by the user -- hence |
| * the call to ddi_get_lbolt(). |
| */ |
| now = ddi_get_lbolt(); |
| while ((tcp = tcp_time_wait->tcp_time_wait_head) != NULL) { |
| /* |
| * Compare times using modular arithmetic, since |
		 * lbolt can wrap around.
| */ |
| if ((now - tcp->tcp_time_wait_expire) < 0) { |
| break; |
| } |
| |
| removed = tcp_time_wait_remove(tcp, tcp_time_wait); |
| ASSERT(removed); |
| |
| connp = tcp->tcp_connp; |
| ASSERT(connp->conn_fanout != NULL); |
| lock = &connp->conn_fanout->connf_lock; |
| /* |
| * This is essentially a TW reclaim fast path optimization for |
| * performance where the timewait collector checks under the |
| * fanout lock (so that no one else can get access to the |
| * conn_t) that the refcnt is 2 i.e. one for TCP and one for |
| * the classifier hash list. If ref count is indeed 2, we can |
| * just remove the conn under the fanout lock and avoid |
| * cleaning up the conn under the squeue, provided that |
| * clustering callbacks are not enabled. If clustering is |
| * enabled, we need to make the clustering callback before |
| * setting the CONDEMNED flag and after dropping all locks and |
| * so we forego this optimization and fall back to the slow |
| * path. Also please see the comments in tcp_closei_local |
| * regarding the refcnt logic. |
| * |
		 * Since we are holding the tcp_time_wait_lock, it's better
		 * not to block on the fanout_lock, because other connections
		 * can't add themselves to the time_wait list while we hold
		 * it. So we do a tryenter instead of mutex_enter.
| */ |
| if (mutex_tryenter(lock)) { |
| mutex_enter(&connp->conn_lock); |
| if ((connp->conn_ref == 2) && |
| (cl_inet_disconnect == NULL)) { |
| ipcl_hash_remove_locked(connp, |
| connp->conn_fanout); |
| /* |
				 * Set the CONDEMNED flag right away so that
				 * the refcnt cannot increase due to any
				 * walker. But we have still not cleaned up
				 * conn_ire_cache. This is still ok since
				 * we are going to clean it up in tcp_cleanup
				 * immediately and any interface unplumb
				 * thread will wait till the ire is blown away.
| */ |
| connp->conn_state_flags |= CONN_CONDEMNED; |
| mutex_exit(lock); |
| mutex_exit(&connp->conn_lock); |
| if (tcp_time_wait->tcp_free_list_cnt < |
| tcp_free_list_max_cnt) { |
| /* Add to head of tcp_free_list */ |
| mutex_exit( |
| &tcp_time_wait->tcp_time_wait_lock); |
| tcp_cleanup(tcp); |
| ASSERT(connp->conn_latch == NULL); |
| ASSERT(connp->conn_policy == NULL); |
| ASSERT(tcp->tcp_tcps == NULL); |
| ASSERT(connp->conn_netstack == NULL); |
| |
| mutex_enter( |
| &tcp_time_wait->tcp_time_wait_lock); |
| tcp->tcp_time_wait_next = |
| tcp_time_wait->tcp_free_list; |
| tcp_time_wait->tcp_free_list = tcp; |
| tcp_time_wait->tcp_free_list_cnt++; |
| continue; |
| } else { |
| /* Do not add to tcp_free_list */ |
| mutex_exit( |
| &tcp_time_wait->tcp_time_wait_lock); |
| tcp_bind_hash_remove(tcp); |
| conn_delete_ire(tcp->tcp_connp, NULL); |
| tcp_ipsec_cleanup(tcp); |
| CONN_DEC_REF(tcp->tcp_connp); |
| } |
| } else { |
| CONN_INC_REF_LOCKED(connp); |
| mutex_exit(lock); |
| mutex_exit(&tcp_time_wait->tcp_time_wait_lock); |
| mutex_exit(&connp->conn_lock); |
| /* |
| * We can reuse the closemp here since conn has |
| * detached (otherwise we wouldn't even be in |
| * time_wait list). tcp_closemp_used can safely |
| * be changed without taking a lock as no other |
| * thread can concurrently access it at this |
| * point in the connection lifecycle. |
| */ |
| |
| if (tcp->tcp_closemp.b_prev == NULL) |
| tcp->tcp_closemp_used = B_TRUE; |
| else |
| cmn_err(CE_PANIC, |
| "tcp_timewait_collector: " |
| "concurrent use of tcp_closemp: " |
| "connp %p tcp %p\n", (void *)connp, |
| (void *)tcp); |
| |
| TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15); |
| mp = &tcp->tcp_closemp; |
| SQUEUE_ENTER_ONE(connp->conn_sqp, mp, |
| tcp_timewait_output, connp, |
| SQ_FILL, SQTAG_TCP_TIMEWAIT); |
| } |
| } else { |
| mutex_enter(&connp->conn_lock); |
| CONN_INC_REF_LOCKED(connp); |
| mutex_exit(&tcp_time_wait->tcp_time_wait_lock); |
| mutex_exit(&connp->conn_lock); |
| /* |
| * We can reuse the closemp here since conn has |
| * detached (otherwise we wouldn't even be in |
| * time_wait list). tcp_closemp_used can safely |
| * be changed without taking a lock as no other |
| * thread can concurrently access it at this |
| * point in the connection lifecycle. |
| */ |
| |
| if (tcp->tcp_closemp.b_prev == NULL) |
| tcp->tcp_closemp_used = B_TRUE; |
| else |
| cmn_err(CE_PANIC, "tcp_timewait_collector: " |
| "concurrent use of tcp_closemp: " |
| "connp %p tcp %p\n", (void *)connp, |
| (void *)tcp); |
| |
| TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15); |
| mp = &tcp->tcp_closemp; |
| SQUEUE_ENTER_ONE(connp->conn_sqp, mp, |
| tcp_timewait_output, connp, |
| SQ_FILL, SQTAG_TCP_TIMEWAIT); |
| } |
| mutex_enter(&tcp_time_wait->tcp_time_wait_lock); |
| } |
| |
| if (tcp_time_wait->tcp_free_list != NULL) |
| tcp_time_wait->tcp_free_list->tcp_in_free_list = B_TRUE; |
| |
| tcp_time_wait->tcp_time_wait_tid = |
| timeout_generic(CALLOUT_NORMAL, tcp_time_wait_collector, sqp, |
| TICK_TO_NSEC(TCP_TIME_WAIT_DELAY), CALLOUT_TCP_RESOLUTION, |
| CALLOUT_FLAG_ROUNDUP); |
| mutex_exit(&tcp_time_wait->tcp_time_wait_lock); |
| } |
| |
| /* |
 * Reply to a client's T_CONN_RES TPI message. This function
 * is used only for TLI/XTI listeners. Sockfs sends T_CONN_RES
 * on the acceptor STREAM, where it is processed in tcp_wput_accept().
| * Read the block comment on top of tcp_conn_request(). |
| */ |
| static void |
| tcp_tli_accept(tcp_t *listener, mblk_t *mp) |
| { |
| tcp_t *acceptor; |
| tcp_t *eager; |
| tcp_t *tcp; |
| struct T_conn_res *tcr; |
| t_uscalar_t acceptor_id; |
| t_scalar_t seqnum; |
| mblk_t *opt_mp = NULL; /* T_OPTMGMT_REQ messages */ |
| struct tcp_options *tcpopt; |
| mblk_t *ok_mp; |
| mblk_t *mp1; |
| tcp_stack_t *tcps = listener->tcp_tcps; |
| |
| if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) { |
| tcp_err_ack(listener, mp, TPROTO, 0); |
| return; |
| } |
| tcr = (struct T_conn_res *)mp->b_rptr; |
| |
| /* |
| * Under ILP32 the stream head points tcr->ACCEPTOR_id at the |
| * read side queue of the streams device underneath us i.e. the |
	 * read side queue of 'ip'. Since we can't dereference QUEUE_ptr we
| * look it up in the queue_hash. Under LP64 it sends down the |
| * minor_t of the accepting endpoint. |
| * |
| * Once the acceptor/eager are modified (in tcp_accept_swap) the |
| * fanout hash lock is held. |
| * This prevents any thread from entering the acceptor queue from |
| * below (since it has not been hard bound yet i.e. any inbound |
| * packets will arrive on the listener or default tcp queue and |
| * go through tcp_lookup). |
| * The CONN_INC_REF will prevent the acceptor from closing. |
| * |
| * XXX It is still possible for a tli application to send down data |
| * on the accepting stream while another thread calls t_accept. |
| * This should not be a problem for well-behaved applications since |
| * the T_OK_ACK is sent after the queue swapping is completed. |
| * |
| * If the accepting fd is the same as the listening fd, avoid |
	 * queue hash lookup since that will return an eager listener in an
	 * already established state.
| */ |
| acceptor_id = tcr->ACCEPTOR_id; |
| mutex_enter(&listener->tcp_eager_lock); |
| if (listener->tcp_acceptor_id == acceptor_id) { |
| eager = listener->tcp_eager_next_q; |
| /* only count how many T_CONN_INDs so don't count q0 */ |
| if ((listener->tcp_conn_req_cnt_q != 1) || |
| (eager->tcp_conn_req_seqnum != tcr->SEQ_number)) { |
| mutex_exit(&listener->tcp_eager_lock); |
| tcp_err_ack(listener, mp, TBADF, 0); |
| return; |
| } |
| if (listener->tcp_conn_req_cnt_q0 != 0) { |
| /* Throw away all the eagers on q0. */ |
| tcp_eager_cleanup(listener, 1); |
| } |
| if (listener->tcp_syn_defense) { |
| listener->tcp_syn_defense = B_FALSE; |
| if (listener->tcp_ip_addr_cache != NULL) { |
| kmem_free(listener->tcp_ip_addr_cache, |
| IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t)); |
| listener->tcp_ip_addr_cache = NULL; |
| } |
| } |
| /* |
| * Transfer tcp_conn_req_max to the eager so that when |
| * a disconnect occurs we can revert the endpoint to the |
| * listen state. |
| */ |
| eager->tcp_conn_req_max = listener->tcp_conn_req_max; |
| ASSERT(listener->tcp_conn_req_cnt_q0 == 0); |
| /* |
| * Get a reference on the acceptor just like the |
| * tcp_acceptor_hash_lookup below. |
| */ |
| acceptor = listener; |
| CONN_INC_REF(acceptor->tcp_connp); |
| } else { |
| acceptor = tcp_acceptor_hash_lookup(acceptor_id, tcps); |
| if (acceptor == NULL) { |
| if (listener->tcp_debug) { |
| (void) strlog(TCP_MOD_ID, 0, 1, |
| SL_ERROR|SL_TRACE, |
| "tcp_accept: did not find acceptor 0x%x\n", |
| acceptor_id); |
| } |
| mutex_exit(&listener->tcp_eager_lock); |
| tcp_err_ack(listener, mp, TPROVMISMATCH, 0); |
| return; |
| } |
| /* |
| * Verify acceptor state. The acceptable states for an acceptor |
| * include TCPS_IDLE and TCPS_BOUND. |
| */ |
| switch (acceptor->tcp_state) { |
| case TCPS_IDLE: |
| /* FALLTHRU */ |
| case TCPS_BOUND: |
| break; |
| default: |
| CONN_DEC_REF(acceptor->tcp_connp); |
| mutex_exit(&listener->tcp_eager_lock); |
| tcp_err_ack(listener, mp, TOUTSTATE, 0); |
| return; |
| } |
| } |
| |
| /* The listener must be in TCPS_LISTEN */ |
| if (listener->tcp_state != TCPS_LISTEN) { |
| CONN_DEC_REF(acceptor->tcp_connp); |
| mutex_exit(&listener->tcp_eager_lock); |
| tcp_err_ack(listener, mp, TOUTSTATE, 0); |
| return; |
| } |
| |
| /* |
| * Rendezvous with an eager connection request packet hanging off |
| * 'tcp' that has the 'seqnum' tag. We tagged the detached open |
| * tcp structure when the connection packet arrived in |
| * tcp_conn_request(). |
| */ |
| seqnum = tcr->SEQ_number; |
| eager = listener; |
| do { |
| eager = eager->tcp_eager_next_q; |
| if (eager == NULL) { |
| CONN_DEC_REF(acceptor->tcp_connp); |
| mutex_exit(&listener->tcp_eager_lock); |
| tcp_err_ack(listener, mp, TBADSEQ, 0); |
| return; |
| } |
| } while (eager->tcp_conn_req_seqnum != seqnum); |
| mutex_exit(&listener->tcp_eager_lock); |
| |
| /* |
	 * At this point, both acceptor and listener have the 2 refs
	 * that they begin with. Acceptor has one additional ref
	 * we placed in lookup while listener has 3 additional
	 * refs for being behind the squeue (tcp_accept() is
| * done on listener's squeue); being in classifier hash; |
| * and eager's ref on listener. |
| */ |
| ASSERT(listener->tcp_connp->conn_ref >= 5); |
| ASSERT(acceptor->tcp_connp->conn_ref >= 3); |
| |
| /* |
| * The eager at this point is set in its own squeue and |
| * could easily have been killed (tcp_accept_finish will |
| * deal with that) because of a TH_RST so we can only |
| * ASSERT for a single ref. |
| */ |
| ASSERT(eager->tcp_connp->conn_ref >= 1); |
| |
	/* Preallocate the stroptions mblk also */
| opt_mp = allocb(MAX(sizeof (struct tcp_options), |
| sizeof (struct T_conn_res)), BPRI_HI); |
| if (opt_mp == NULL) { |
| CONN_DEC_REF(acceptor->tcp_connp); |
| CONN_DEC_REF(eager->tcp_connp); |
| tcp_err_ack(listener, mp, TSYSERR, ENOMEM); |
| return; |
| } |
| DB_TYPE(opt_mp) = M_SETOPTS; |
| opt_mp->b_wptr += sizeof (struct tcp_options); |
| tcpopt = (struct tcp_options *)opt_mp->b_rptr; |
| tcpopt->to_flags = 0; |
| |
| /* |
| * Prepare for inheriting IPV6_BOUND_IF and IPV6_RECVPKTINFO |
| * from listener to acceptor. |
| */ |
| if (listener->tcp_bound_if != 0) { |
| tcpopt->to_flags |= TCPOPT_BOUNDIF; |
| tcpopt->to_boundif = listener->tcp_bound_if; |
| } |
| if (listener->tcp_ipv6_recvancillary & TCP_IPV6_RECVPKTINFO) { |
| tcpopt->to_flags |= TCPOPT_RECVPKTINFO; |
| } |
| |
| /* Re-use mp1 to hold a copy of mp, in case reallocb fails */ |
| if ((mp1 = copymsg(mp)) == NULL) { |
| CONN_DEC_REF(acceptor->tcp_connp); |
| CONN_DEC_REF(eager->tcp_connp); |
| freemsg(opt_mp); |
| tcp_err_ack(listener, mp, TSYSERR, ENOMEM); |
| return; |
| } |
| |
| tcr = (struct T_conn_res *)mp1->b_rptr; |
| |
| /* |
| * This is an expanded version of mi_tpi_ok_ack_alloc() |
| * which allocates a larger mblk and appends the new |
| * local address to the ok_ack. The address is copied by |
| * soaccept() for getsockname(). |
| */ |
| { |
| int extra; |
| |
| extra = (eager->tcp_family == AF_INET) ? |
| sizeof (sin_t) : sizeof (sin6_t); |
| |
| /* |
| * Try to re-use mp, if possible. Otherwise, allocate |
| * an mblk and return it as ok_mp. In any case, mp |
| * is no longer usable upon return. |
| */ |
| if ((ok_mp = mi_tpi_ok_ack_alloc_extra(mp, extra)) == NULL) { |
| CONN_DEC_REF(acceptor->tcp_connp); |
| CONN_DEC_REF(eager->tcp_connp); |
| freemsg(opt_mp); |
| /* Original mp has been freed by now, so use mp1 */ |
| tcp_err_ack(listener, mp1, TSYSERR, ENOMEM); |
| return; |
| } |
| |
| mp = NULL; /* We should never use mp after this point */ |
| |
| switch (extra) { |
| case sizeof (sin_t): { |
| sin_t *sin = (sin_t *)ok_mp->b_wptr; |
| |
| ok_mp->b_wptr += extra; |
| sin->sin_family = AF_INET; |
| sin->sin_port = eager->tcp_lport; |
| sin->sin_addr.s_addr = |
| eager->tcp_ipha->ipha_src; |
| break; |
| } |
| case sizeof (sin6_t): { |
| sin6_t *sin6 = (sin6_t *)ok_mp->b_wptr; |
| |
| ok_mp->b_wptr += extra; |
| sin6->sin6_family = AF_INET6; |
| sin6->sin6_port = eager->tcp_lport; |
| if (eager->tcp_ipversion == IPV4_VERSION) { |
| sin6->sin6_flowinfo = 0; |
| IN6_IPADDR_TO_V4MAPPED( |
| eager->tcp_ipha->ipha_src, |
| &sin6->sin6_addr); |
| } else { |
| ASSERT(eager->tcp_ip6h != NULL); |
| sin6->sin6_flowinfo = |
| eager->tcp_ip6h->ip6_vcf & |
| ~IPV6_VERS_AND_FLOW_MASK; |
| sin6->sin6_addr = |
| eager->tcp_ip6h->ip6_src; |
| } |
| sin6->sin6_scope_id = 0; |
| sin6->__sin6_src_id = 0; |
| break; |
| } |
| default: |
| break; |
| } |
| ASSERT(ok_mp->b_wptr <= ok_mp->b_datap->db_lim); |
| } |
| |
| /* |
| * If there are no options we know that the T_CONN_RES will |
| * succeed. However, we can't send the T_OK_ACK upstream until |
| * the tcp_accept_swap is done since it would be dangerous to |
| * let the application start using the new fd prior to the swap. |
| */ |
| tcp_accept_swap(listener, acceptor, eager); |
| |
| /* |
| * tcp_accept_swap unlinks eager from listener but does not drop |
| * the eager's reference on the listener. |
| */ |
| ASSERT(eager->tcp_listener == NULL); |
| ASSERT(listener->tcp_connp->conn_ref >= 5); |
| |
| /* |
| * The eager is now associated with its own queue. Insert in |
| * the hash so that the connection can be reused for a future |
| * T_CONN_RES. |
| */ |
| tcp_acceptor_hash_insert(acceptor_id, eager); |
| |
| /* |
| * We now do the processing of options with T_CONN_RES. |
	 * We delay till now since we wanted to have a queue to pass to
	 * the option processing routines that points back to the right
	 * instance structure, which does not happen until after
	 * tcp_accept_swap().
	 *
	 * Note:
	 * The sanity of the logic here assumes that whatever options
	 * are appropriate to inherit from listener=>eager are done
	 * before this point, and whatever were to be overridden (or not)
	 * are handled in the transfer logic from eager=>acceptor in
	 * tcp_accept_swap().
| * [ Warning: acceptor endpoint can have T_OPTMGMT_REQ done to it |
| * before its ACCEPTOR_id comes down in T_CONN_RES ] |
| * This may not be true at this point in time but can be fixed |
| * independently. This option processing code starts with |
| * the instantiated acceptor instance and the final queue at |
| * this point. |
| */ |
| |
| if (tcr->OPT_length != 0) { |
| /* Options to process */ |
| int t_error = 0; |
| int sys_error = 0; |
| int do_disconnect = 0; |
| |
| if (tcp_conprim_opt_process(eager, mp1, |
| &do_disconnect, &t_error, &sys_error) < 0) { |
| eager->tcp_accept_error = 1; |
| if (do_disconnect) { |
| /* |
| * An option failed which does not allow |
| * connection to be accepted. |
| * |
| * We allow T_CONN_RES to succeed and |
| * put a T_DISCON_IND on the eager queue. |
| */ |
| ASSERT(t_error == 0 && sys_error == 0); |
| eager->tcp_send_discon_ind = 1; |
| } else { |
| ASSERT(t_error != 0); |
| freemsg(ok_mp); |
| /* |
| * Original mp was either freed or set |
| * to ok_mp above, so use mp1 instead. |
| */ |
| tcp_err_ack(listener, mp1, t_error, sys_error); |
| goto finish; |
| } |
| } |
| /* |
| * Most likely success in setting options (except if |
| * eager->tcp_send_discon_ind set). |
		 * The mp1 option buffer, described by OPT_length/offset,
		 * has potentially been modified and contains the results
		 * of the option setting at this point.
| */ |
| } |
| |
| /* We no longer need mp1, since all options processing has passed */ |
| freemsg(mp1); |
| |
| putnext(listener->tcp_rq, ok_mp); |
| |
| mutex_enter(&listener->tcp_eager_lock); |
| if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) { |
| tcp_t *tail; |
| mblk_t *conn_ind; |
| |
| /* |
| * This path should not be executed if listener and |
| * acceptor streams are the same. |
| */ |
| ASSERT(listener != acceptor); |
| |
| tcp = listener->tcp_eager_prev_q0; |
| /* |
| * listener->tcp_eager_prev_q0 points to the TAIL of the |
| * deferred T_conn_ind queue. We need to get to the head of |
		 * the queue in order to send up T_conn_ind in the same
		 * order in which the 3WHS was completed.
| */ |
| while (tcp != listener) { |
| if (!tcp->tcp_eager_prev_q0->tcp_conn_def_q0) |
| break; |
| else |
| tcp = tcp->tcp_eager_prev_q0; |
| } |
| ASSERT(tcp != listener); |
| conn_ind = tcp->tcp_conn.tcp_eager_conn_ind; |
| ASSERT(conn_ind != NULL); |
| tcp->tcp_conn.tcp_eager_conn_ind = NULL; |
| |
| /* Move from q0 to q */ |
| ASSERT(listener->tcp_conn_req_cnt_q0 > 0); |
| listener->tcp_conn_req_cnt_q0--; |
| listener->tcp_conn_req_cnt_q++; |
| tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = |
| tcp->tcp_eager_prev_q0; |
| tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = |
| tcp->tcp_eager_next_q0; |
| tcp->tcp_eager_prev_q0 = NULL; |
| tcp->tcp_eager_next_q0 = NULL; |
| tcp->tcp_conn_def_q0 = B_FALSE; |
| |
| /* Make sure the tcp isn't in the list of droppables */ |
| ASSERT(tcp->tcp_eager_next_drop_q0 == NULL && |
| tcp->tcp_eager_prev_drop_q0 == NULL); |
| |
| /* |
| * Insert at end of the queue because sockfs sends |
| * down T_CONN_RES in chronological order. Leaving |
		 * the older conn indications at the front of the queue
		 * helps reduce the search time.
| */ |
| tail = listener->tcp_eager_last_q; |
| if (tail != NULL) |
| tail->tcp_eager_next_q = tcp; |
| else |
| listener->tcp_eager_next_q = tcp; |
| listener->tcp_eager_last_q = tcp; |
| tcp->tcp_eager_next_q = NULL; |
| mutex_exit(&listener->tcp_eager_lock); |
| putnext(tcp->tcp_rq, conn_ind); |
| } else { |
| mutex_exit(&listener->tcp_eager_lock); |
| } |
| |
| /* |
| * Done with the acceptor - free it |
| * |
| * Note: from this point on, no access to listener should be made |
| * as listener can be equal to acceptor. |
| */ |
| finish: |
| ASSERT(acceptor->tcp_detached); |
| ASSERT(tcps->tcps_g_q != NULL); |
| ASSERT(!IPCL_IS_NONSTR(acceptor->tcp_connp)); |
| acceptor->tcp_rq = tcps->tcps_g_q; |
| acceptor->tcp_wq = WR(tcps->tcps_g_q); |
| (void) tcp_clean_death(acceptor, 0, 2); |
| CONN_DEC_REF(acceptor->tcp_connp); |
| |
| /* |
| * In case we already received a FIN we have to make tcp_rput send |
| * the ordrel_ind. This will also send up a window update if the window |
| * has opened up. |
| * |
| * In the normal case of a successful connection acceptance |
| * we give the O_T_BIND_REQ to the read side put procedure as an |
| * indication that this was just accepted. This tells tcp_rput to |
| * pass up any data queued in tcp_rcv_list. |
| * |
	 * In the fringe case where options sent with T_CONN_RES failed and
	 * a disconnect is required, we would be indicating a T_DISCON_IND
	 * to blow away this connection.
| */ |
| |
| /* |
	 * XXX: we currently have a problem if an XTI application closes the
	 * acceptor stream in between. This problem exists in on10-gate also
	 * and is well known, but nothing can be done short of a major
	 * rewrite to fix it. Now it is possible to take care of it by
	 * assigning the TLI/XTI eager the same squeue as the listener (we
	 * can distinguish non-socket listeners at the time of handling a
	 * SYN in tcp_conn_request), doing most of the work that
	 * tcp_accept_finish does here itself, and then getting behind the
	 * acceptor squeue to access the acceptor queue.
| */ |
| /* |
| * We already have a ref on tcp so no need to do one before squeue_enter |
| */ |
| SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, opt_mp, tcp_accept_finish, |
| eager->tcp_connp, SQ_FILL, SQTAG_TCP_ACCEPT_FINISH); |
| } |
| |
| /* |
| * Swap information between the eager and acceptor for a TLI/XTI client. |
| * The sockfs accept is done on the acceptor stream and control goes |
| * through tcp_wput_accept() and tcp_accept()/tcp_accept_swap() is not |
| * called. In either case, both the eager and listener are in their own |
| * perimeter (squeue) and the code has to deal with potential race. |
| * |
| * See the block comment on top of tcp_accept() and tcp_wput_accept(). |
| */ |
| static void |
| tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, tcp_t *eager) |
| { |
| conn_t *econnp, *aconnp; |
| |
| ASSERT(eager->tcp_rq == listener->tcp_rq); |
| ASSERT(eager->tcp_detached && !acceptor->tcp_detached); |
| ASSERT(!eager->tcp_hard_bound); |
| ASSERT(!TCP_IS_SOCKET(acceptor)); |
| ASSERT(!TCP_IS_SOCKET(eager)); |
| ASSERT(!TCP_IS_SOCKET(listener)); |
| |
| acceptor->tcp_detached = B_TRUE; |
| /* |
| * To permit stream re-use by TLI/XTI, the eager needs a copy of |
| * the acceptor id. |
| */ |
| eager->tcp_acceptor_id = acceptor->tcp_acceptor_id; |
| |
| /* remove eager from listen list... */ |
| mutex_enter(&listener->tcp_eager_lock); |
| tcp_eager_unlink(eager); |
| ASSERT(eager->tcp_eager_next_q == NULL && |
| eager->tcp_eager_last_q == NULL); |
| ASSERT(eager->tcp_eager_next_q0 == NULL && |
| eager->tcp_eager_prev_q0 == NULL); |
| mutex_exit(&listener->tcp_eager_lock); |
| eager->tcp_rq = acceptor->tcp_rq; |
| eager->tcp_wq = acceptor->tcp_wq; |
| |
| econnp = eager->tcp_connp; |
| aconnp = acceptor->tcp_connp; |
| |
| eager->tcp_rq->q_ptr = econnp; |
| eager->tcp_wq->q_ptr = econnp; |
| |
| /* |
| * In the TLI/XTI loopback case, we are inside the listener's squeue, |
| * which might be a different squeue from our peer TCP instance. |
| * For TCP Fusion, the peer expects that whenever tcp_detached is |
| * clear, our TCP queues point to the acceptor's queues. Thus, use |
| * membar_producer() to ensure that the assignments of tcp_rq/tcp_wq |
| * above reach global visibility prior to the clearing of tcp_detached. |
| */ |
| membar_producer(); |
| eager->tcp_detached = B_FALSE; |
| |
| ASSERT(eager->tcp_ack_tid == 0); |
| |
| econnp->conn_dev = aconnp->conn_dev; |
| econnp->conn_minor_arena = aconnp->conn_minor_arena; |
| ASSERT(econnp->conn_minor_arena != NULL); |
| if (eager->tcp_cred != NULL) |
| crfree(eager->tcp_cred); |
| eager->tcp_cred = econnp->conn_cred = aconnp->conn_cred; |
| ASSERT(econnp->conn_netstack == aconnp->conn_netstack); |
| ASSERT(eager->tcp_tcps == acceptor->tcp_tcps); |
| |
| aconnp->conn_cred = NULL; |
| |
| econnp->conn_zoneid = aconnp->conn_zoneid; |
| econnp->conn_allzones = aconnp->conn_allzones; |
| |
| econnp->conn_mac_exempt = aconnp->conn_mac_exempt; |
| aconnp->conn_mac_exempt = B_FALSE; |
| |
| ASSERT(aconnp->conn_peercred == NULL); |
| |
| /* Do the IPC initialization */ |
| CONN_INC_REF(econnp); |
| |
| econnp->conn_multicast_loop = aconnp->conn_multicast_loop; |
| econnp->conn_af_isv6 = aconnp->conn_af_isv6; |
| econnp->conn_pkt_isv6 = aconnp->conn_pkt_isv6; |
| |
| /* Done with old IPC. Drop its ref on its connp */ |
| CONN_DEC_REF(aconnp); |
| } |
| |
| |
| /* |
 * Adapt to the information, such as rtt and rtt_sd, provided from the
 * ire cached in conn_cache_ire. If no ire is cached, do an ire lookup.
| * |
| * Checks for multicast and broadcast destination address. |
| * Returns zero on failure; non-zero if ok. |
| * |
| * Note that the MSS calculation here is based on the info given in |
| * the IRE. We do not do any calculation based on TCP options. They |
| * will be handled in tcp_rput_other() and tcp_rput_data() when TCP |
| * knows which options to use. |
| * |
| * Note on how TCP gets its parameters for a connection. |
| * |
| * When a tcp_t structure is allocated, it gets all the default parameters. |
| * In tcp_adapt_ire(), it gets those metric parameters, like rtt, rtt_sd, |
| * spipe, rpipe, ... from the route metrics. Route metric overrides the |
| * default. |
| * |
 * An incoming SYN with a multicast or broadcast destination address is dropped
| * in 1 of 2 places. |
| * |
| * 1. If the packet was received over the wire it is dropped in |
| * ip_rput_process_broadcast() |
| * |
| * 2. If the packet was received through internal IP loopback, i.e. the packet |
| * was generated and received on the same machine, it is dropped in |
| * ip_wput_local() |
| * |
| * An incoming SYN with a multicast or broadcast source address is always |
| * dropped in tcp_adapt_ire. The same logic in tcp_adapt_ire also serves to |
| * reject an attempt to connect to a broadcast or multicast (destination) |
| * address. |
| */ |
| static int |
| tcp_adapt_ire(tcp_t *tcp, mblk_t *ire_mp) |
| { |
| tcp_hsp_t *hsp; |
| ire_t *ire; |
| ire_t *sire = NULL; |
| iulp_t *ire_uinfo = NULL; |
| uint32_t mss_max; |
| uint32_t mss; |
| boolean_t tcp_detached = TCP_IS_DETACHED(tcp); |
| conn_t *connp = tcp->tcp_connp; |
| boolean_t ire_cacheable = B_FALSE; |
| zoneid_t zoneid = connp->conn_zoneid; |
| int match_flags = MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | |
| MATCH_IRE_SECATTR; |
| ts_label_t *tsl = crgetlabel(CONN_CRED(connp)); |
| ill_t *ill = NULL; |
| boolean_t incoming = (ire_mp == NULL); |
| tcp_stack_t *tcps = tcp->tcp_tcps; |
| ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; |
| |
| ASSERT(connp->conn_ire_cache == NULL); |
| |
| if (tcp->tcp_ipversion == IPV4_VERSION) { |
| |
| if (CLASSD(tcp->tcp_connp->conn_rem)) { |
| BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards); |
| return (0); |
| } |
| /* |
| * If IP_NEXTHOP is set, then look for an IRE_CACHE |
| * for the destination with the nexthop as gateway. |
| * ire_ctable_lookup() is used because this particular |
| * ire, if it exists, will be marked private. |
| * If that is not available, use the interface ire |
| * for the nexthop. |
| * |
| * TSol: tcp_update_label will detect label mismatches based |
| * only on the destination's label, but that would not |
| * detect label mismatches based on the security attributes |
| * of routes or next hop gateway. Hence we need to pass the |
| * label to ire_ftable_lookup below in order to locate the |
		 * right prefix (and/or) ire cache. Similarly we also need to
		 * pass the label to the ire_cache_lookup below to locate
| * the right ire that also matches on the label. |
| */ |
| if (tcp->tcp_connp->conn_nexthop_set) { |
| ire = ire_ctable_lookup(tcp->tcp_connp->conn_rem, |
| tcp->tcp_connp->conn_nexthop_v4, 0, NULL, zoneid, |
| tsl, MATCH_IRE_MARK_PRIVATE_ADDR | MATCH_IRE_GW, |
| ipst); |
| if (ire == NULL) { |
| ire = ire_ftable_lookup( |
| tcp->tcp_connp->conn_nexthop_v4, |
| 0, 0, IRE_INTERFACE, NULL, NULL, zoneid, 0, |
| tsl, match_flags, ipst); |
| if (ire == NULL) |
| return (0); |
| } else { |
| ire_uinfo = &ire->ire_uinfo; |
| } |
| } else { |
| ire = ire_cache_lookup(tcp->tcp_connp->conn_rem, |
| zoneid, tsl, ipst); |
| if (ire != NULL) { |
| ire_cacheable = B_TRUE; |
| ire_uinfo = (ire_mp != NULL) ? |
| &((ire_t *)ire_mp->b_rptr)->ire_uinfo: |
| &ire->ire_uinfo; |
| |
| } else { |
| if (ire_mp == NULL) { |
| ire = ire_ftable_lookup( |
| tcp->tcp_connp->conn_rem, |
| 0, 0, 0, NULL, &sire, zoneid, 0, |
| tsl, (MATCH_IRE_RECURSIVE | |
| MATCH_IRE_DEFAULT), ipst); |
| if (ire == NULL) |
| return (0); |
| ire_uinfo = (sire != NULL) ? |
| &sire->ire_uinfo : |
| &ire->ire_uinfo; |
| } else { |
| ire = (ire_t *)ire_mp->b_rptr; |
| ire_uinfo = |
| &((ire_t *) |
| ire_mp->b_rptr)->ire_uinfo; |
| } |
| } |
| } |
| ASSERT(ire != NULL); |
| |
| if ((ire->ire_src_addr == INADDR_ANY) || |
| (ire->ire_type & IRE_BROADCAST)) { |
| /* |
| * ire->ire_mp is non null when ire_mp passed in is used |
| * ire->ire_mp is set in ip_bind_insert_ire[_v6](). |
| */ |
| if (ire->ire_mp == NULL) |
| ire_refrele(ire); |
| if (sire != NULL) |
| ire_refrele(sire); |
| return (0); |
| } |
| |
| if (tcp->tcp_ipha->ipha_src == INADDR_ANY) { |
| ipaddr_t src_addr; |
| |
| /* |
| * ip_bind_connected() has stored the correct source |
| * address in conn_src. |
| */ |
| src_addr = tcp->tcp_connp->conn_src; |
| tcp->tcp_ipha->ipha_src = src_addr; |
| /* |
			 * A copy of the src addr in tcp_t is needed
			 * for the lookup funcs.
| */ |
| IN6_IPADDR_TO_V4MAPPED(src_addr, &tcp->tcp_ip_src_v6); |
| } |
| /* |
| * Set the fragment bit so that IP will tell us if the MTU |
| * should change. IP tells us the latest setting of |
| * ip_path_mtu_discovery through ire_frag_flag. |
| */ |
| if (ipst->ips_ip_path_mtu_discovery) { |
| tcp->tcp_ipha->ipha_fragment_offset_and_flags = |
| htons(IPH_DF); |
| } |
| /* |
| * If ire_uinfo is NULL, this is the IRE_INTERFACE case |
| * for IP_NEXTHOP. No cache ire has been found for the |
| * destination and we are working with the nexthop's |
| * interface ire. Since we need to forward all packets |
| * to the nexthop first, we "blindly" set tcp_localnet |
		 * to false, even though the destination may also be
| * onlink. |
| */ |
| if (ire_uinfo == NULL) |
| tcp->tcp_localnet = 0; |
| else |
| tcp->tcp_localnet = (ire->ire_gateway_addr == 0); |
| } else { |
| /* |
| * For incoming connection ire_mp = NULL |
| * For outgoing connection ire_mp != NULL |
| * Technically we should check conn_incoming_ill |
| * when ire_mp is NULL and conn_outgoing_ill when |
| * ire_mp is non-NULL. But this is performance |
| * critical path and for IPV*_BOUND_IF, outgoing |
| * and incoming ill are always set to the same value. |
| */ |
| ill_t *dst_ill = NULL; |
| ipif_t *dst_ipif = NULL; |
| |
| ASSERT(connp->conn_outgoing_ill == connp->conn_incoming_ill); |
| |
| if (connp->conn_outgoing_ill != NULL) { |
| /* Outgoing or incoming path */ |
| int err; |
| |
| dst_ill = conn_get_held_ill(connp, |
| &connp->conn_outgoing_ill, &err); |
| if (err == ILL_LOOKUP_FAILED || dst_ill == NULL) { |
| ip1dbg(("tcp_adapt_ire: ill_lookup failed\n")); |
| return (0); |
| } |
| match_flags |= MATCH_IRE_ILL; |
| dst_ipif = dst_ill->ill_ipif; |
| } |
| ire = ire_ctable_lookup_v6(&tcp->tcp_connp->conn_remv6, |
| 0, 0, dst_ipif, zoneid, tsl, match_flags |