| /* |
| * CDDL HEADER START |
| * |
| * The contents of this file are subject to the terms of the |
| * Common Development and Distribution License (the "License"). |
| * You may not use this file except in compliance with the License. |
| * |
| * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE |
| * or http://www.opensolaris.org/os/licensing. |
| * See the License for the specific language governing permissions |
| * and limitations under the License. |
| * |
| * When distributing Covered Code, include this CDDL HEADER in each |
| * file and include the License file at usr/src/OPENSOLARIS.LICENSE. |
| * If applicable, add the following below this CDDL HEADER, with the |
| * fields enclosed by brackets "[]" replaced with your own identifying |
| * information: Portions Copyright [yyyy] [name of copyright owner] |
| * |
| * CDDL HEADER END |
| */ |
| |
| /* |
| * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. |
| * Copyright 2011 Nexenta Systems, Inc. All rights reserved. |
| * Copyright 2019 Joyent, Inc. |
| * Copyright (c) 2014, 2016 by Delphix. All rights reserved. |
| */ |
| |
| /* This file contains all TCP input processing functions. */ |
| |
| #include <sys/types.h> |
| #include <sys/stream.h> |
| #include <sys/strsun.h> |
| #include <sys/strsubr.h> |
| #include <sys/stropts.h> |
| #include <sys/strlog.h> |
| #define _SUN_TPI_VERSION 2 |
| #include <sys/tihdr.h> |
| #include <sys/suntpi.h> |
| #include <sys/xti_inet.h> |
| #include <sys/squeue_impl.h> |
| #include <sys/squeue.h> |
| #include <sys/tsol/tnet.h> |
| |
| #include <inet/common.h> |
| #include <inet/ip.h> |
| #include <inet/tcp.h> |
| #include <inet/tcp_impl.h> |
| #include <inet/tcp_cluster.h> |
| #include <inet/proto_set.h> |
| #include <inet/ipsec_impl.h> |
| |
| /* |
| * RFC7323-recommended phrasing of TSTAMP option, for easier parsing |
| */ |
| |
| #ifdef _BIG_ENDIAN |
| #define TCPOPT_NOP_NOP_TSTAMP ((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | \ |
| (TCPOPT_TSTAMP << 8) | 10) |
| #else |
| #define TCPOPT_NOP_NOP_TSTAMP ((10 << 24) | (TCPOPT_TSTAMP << 16) | \ |
| (TCPOPT_NOP << 8) | TCPOPT_NOP) |
| #endif |
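/*
 * On the wire this option prefix is the byte sequence 0x01 0x01 0x08 0x0a:
 * NOP, NOP, kind 8 (timestamp), length 10, followed by the two 32-bit
 * timestamp fields (TSval and TSecr).
 */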
| |
| /* |
 * PAWS needs a timer for 24 days.  This is the number of ticks in 24 days
 * (24 days x 24 hours x 60 minutes x 60 seconds, multiplied by hz ticks/sec).
| */ |
| #define PAWS_TIMEOUT ((clock_t)(24*24*60*60*hz)) |
| |
| /* |
| * Since tcp_listener is not cleared atomically with tcp_detached |
| * being cleared we need this extra bit to tell a detached connection |
| * apart from one that is in the process of being accepted. |
| */ |
| #define TCP_IS_DETACHED_NONEAGER(tcp) \ |
| (TCP_IS_DETACHED(tcp) && \ |
| (!(tcp)->tcp_hard_binding)) |
| |
| /* |
| * Steps to do when a tcp_t moves to TIME-WAIT state. |
| * |
 * This connection is done; we no longer need to account for it.  Decrement
| * the listener connection counter if needed. |
| * |
| * Decrement the connection counter of the stack. Note that this counter |
| * is per CPU. So the total number of connections in a stack is the sum of all |
| * of them. Since there is no lock for handling all of them exclusively, the |
| * resulting sum is only an approximation. |
| * |
| * Unconditionally clear the exclusive binding bit so this TIME-WAIT |
| * connection won't interfere with new ones. |
| * |
| * Start the TIME-WAIT timer. If upper layer has not closed the connection, |
| * the timer is handled within the context of this tcp_t. When the timer |
| * fires, tcp_clean_death() is called. If upper layer closes the connection |
| * during this period, tcp_time_wait_append() will be called to add this |
| * tcp_t to the global TIME-WAIT list. Note that this means that the |
| * actual wait time in TIME-WAIT state will be longer than the |
| * tcps_time_wait_interval since the period before upper layer closes the |
| * connection is not accounted for when tcp_time_wait_append() is called. |
| * |
| * If upper layer has closed the connection, call tcp_time_wait_append() |
| * directly. |
| * |
| */ |
| #define SET_TIME_WAIT(tcps, tcp, connp) \ |
| { \ |
| (tcp)->tcp_state = TCPS_TIME_WAIT; \ |
| if ((tcp)->tcp_listen_cnt != NULL) \ |
| TCP_DECR_LISTEN_CNT(tcp); \ |
| atomic_dec_64( \ |
| (uint64_t *)&(tcps)->tcps_sc[CPU->cpu_seqid]->tcp_sc_conn_cnt); \ |
| (connp)->conn_exclbind = 0; \ |
| if (!TCP_IS_DETACHED(tcp)) { \ |
| TCP_TIMER_RESTART(tcp, (tcps)->tcps_time_wait_interval); \ |
| } else { \ |
| tcp_time_wait_append(tcp); \ |
| TCP_DBGSTAT(tcps, tcp_rput_time_wait); \ |
| } \ |
| } |
| |
| /* |
 * If tcp_drop_ack_unsent_cnt is greater than 0, when TCP receives more
 * than tcp_drop_ack_unsent_cnt ACKs which acknowledge unsent data, TCP
 * will not respond with an ACK.  RFC 793 requires that TCP respond with
 * an ACK for such a bogus ACK.  By not following the RFC, we prevent TCP
 * from getting into an ACK storm if somehow an attacker successfully
 * spoofs an acceptable segment to our peer, or when our peer is "confused."
| */ |
| static uint32_t tcp_drop_ack_unsent_cnt = 10; |
| |
| /* |
 * To protect TCP against an attacker using a small window and requesting
 * a large amount of data (DoS attack by consuming memory), TCP checks the
 * window advertised in the last ACK of the 3-way handshake.  TCP uses
 * the tcp_mss (the size of one packet) value for comparison.  The window
 * should be larger than tcp_mss.  But while a sane TCP should advertise
 * a receive window larger than or equal to 4*MSS to avoid stop-and-go
 * traffic, not all TCP stacks do that.  This is especially true when
 * tcp_mss is a big value.
| * |
| * To work around this issue, an additional fixed value for comparison |
| * is also used. If the advertised window is smaller than both tcp_mss |
| * and tcp_init_wnd_chk, the ACK is considered as invalid. So for large |
| * tcp_mss value (say, 8K), a window larger than tcp_init_wnd_chk but |
| * smaller than 8K is considered to be OK. |
| */ |
| static uint32_t tcp_init_wnd_chk = 4096; |
| |
| /* Process ICMP source quench message or not. */ |
| static boolean_t tcp_icmp_source_quench = B_FALSE; |
| |
| static boolean_t tcp_outbound_squeue_switch = B_FALSE; |
| |
| static mblk_t *tcp_conn_create_v4(conn_t *, conn_t *, mblk_t *, |
| ip_recv_attr_t *); |
| static mblk_t *tcp_conn_create_v6(conn_t *, conn_t *, mblk_t *, |
| ip_recv_attr_t *); |
| static boolean_t tcp_drop_q0(tcp_t *); |
| static void tcp_icmp_error_ipv6(tcp_t *, mblk_t *, ip_recv_attr_t *); |
| static mblk_t *tcp_input_add_ancillary(tcp_t *, mblk_t *, ip_pkt_t *, |
| ip_recv_attr_t *); |
| static void tcp_input_listener(void *, mblk_t *, void *, ip_recv_attr_t *); |
| static void tcp_process_options(tcp_t *, tcpha_t *); |
| static mblk_t *tcp_reass(tcp_t *, mblk_t *, uint32_t); |
| static void tcp_reass_elim_overlap(tcp_t *, mblk_t *); |
| static void tcp_rsrv_input(void *, mblk_t *, void *, ip_recv_attr_t *); |
| static void tcp_set_rto(tcp_t *, hrtime_t); |
| static void tcp_setcred_data(mblk_t *, ip_recv_attr_t *); |
| |
| /* |
| * CC wrapper hook functions |
| */ |
| static void |
| cc_ack_received(tcp_t *tcp, uint32_t seg_ack, int32_t bytes_acked, |
| uint16_t type) |
| { |
| uint32_t old_cwnd = tcp->tcp_cwnd; |
| |
| tcp->tcp_ccv.bytes_this_ack = bytes_acked; |
| if (tcp->tcp_cwnd <= tcp->tcp_swnd) |
| tcp->tcp_ccv.flags |= CCF_CWND_LIMITED; |
| else |
| tcp->tcp_ccv.flags &= ~CCF_CWND_LIMITED; |
| |
| if (type == CC_ACK) { |
| if (tcp->tcp_cwnd > tcp->tcp_cwnd_ssthresh) { |
| if (tcp->tcp_ccv.flags & CCF_RTO) |
| tcp->tcp_ccv.flags &= ~CCF_RTO; |
| |
| tcp->tcp_ccv.t_bytes_acked += |
| min(tcp->tcp_ccv.bytes_this_ack, |
| tcp->tcp_tcps->tcps_abc_l_var * tcp->tcp_mss); |
| if (tcp->tcp_ccv.t_bytes_acked >= tcp->tcp_cwnd) { |
| tcp->tcp_ccv.t_bytes_acked -= tcp->tcp_cwnd; |
| tcp->tcp_ccv.flags |= CCF_ABC_SENTAWND; |
| } |
| } else { |
| tcp->tcp_ccv.flags &= ~CCF_ABC_SENTAWND; |
| tcp->tcp_ccv.t_bytes_acked = 0; |
| } |
| } |
| |
| if (CC_ALGO(tcp)->ack_received != NULL) { |
| /* |
| * The FreeBSD code where this originated had a comment "Find |
| * a way to live without this" in several places where curack |
| * got set. If they eventually dump curack from the cc |
| * variables, we'll need to adapt our code. |
| */ |
| tcp->tcp_ccv.curack = seg_ack; |
| CC_ALGO(tcp)->ack_received(&tcp->tcp_ccv, type); |
| } |
| |
| DTRACE_PROBE3(cwnd__cc__ack__received, tcp_t *, tcp, uint32_t, old_cwnd, |
| uint32_t, tcp->tcp_cwnd); |
| } |
| |
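/*
 * Congestion signal wrapper.  Records the state needed for the given
 * congestion event (duplicate-ACK threshold, ECN echo or RTO), applies the
 * RFC 5681 ssthresh/cwnd reduction on RTO when the algorithm provides no
 * cong_signal() hook, and then notifies the congestion control algorithm.
 */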
| void |
| cc_cong_signal(tcp_t *tcp, uint32_t seg_ack, uint32_t type) |
| { |
| uint32_t old_cwnd = tcp->tcp_cwnd; |
| uint32_t old_cwnd_ssthresh = tcp->tcp_cwnd_ssthresh; |
| switch (type) { |
| case CC_NDUPACK: |
| if (!IN_FASTRECOVERY(tcp->tcp_ccv.flags)) { |
| tcp->tcp_rexmit_max = tcp->tcp_snxt; |
| if (tcp->tcp_ecn_ok) { |
| tcp->tcp_cwr_snd_max = tcp->tcp_snxt; |
| tcp->tcp_cwr = B_TRUE; |
| tcp->tcp_ecn_cwr_sent = B_FALSE; |
| } |
| } |
| break; |
| case CC_ECN: |
| if (!IN_CONGRECOVERY(tcp->tcp_ccv.flags)) { |
| tcp->tcp_rexmit_max = tcp->tcp_snxt; |
| if (tcp->tcp_ecn_ok) { |
| tcp->tcp_cwr_snd_max = tcp->tcp_snxt; |
| tcp->tcp_cwr = B_TRUE; |
| tcp->tcp_ecn_cwr_sent = B_FALSE; |
| } |
| } |
| break; |
| case CC_RTO: |
| tcp->tcp_ccv.flags |= CCF_RTO; |
| tcp->tcp_dupack_cnt = 0; |
| tcp->tcp_ccv.t_bytes_acked = 0; |
| /* |
| * Give up on fast recovery and congestion recovery if we were |
| * attempting either. |
| */ |
| EXIT_RECOVERY(tcp->tcp_ccv.flags); |
| if (CC_ALGO(tcp)->cong_signal == NULL) { |
| /* |
| * RFC5681 Section 3.1 |
| * ssthresh = max (FlightSize / 2, 2*SMSS) eq (4) |
| */ |
| tcp->tcp_cwnd_ssthresh = max( |
| (tcp->tcp_snxt - tcp->tcp_suna) / 2 / tcp->tcp_mss, |
| 2) * tcp->tcp_mss; |
| tcp->tcp_cwnd = tcp->tcp_mss; |
| } |
| |
| if (tcp->tcp_ecn_ok) { |
| tcp->tcp_cwr = B_TRUE; |
| tcp->tcp_cwr_snd_max = tcp->tcp_snxt; |
| tcp->tcp_ecn_cwr_sent = B_FALSE; |
| } |
| break; |
| } |
| |
| if (CC_ALGO(tcp)->cong_signal != NULL) { |
| tcp->tcp_ccv.curack = seg_ack; |
| CC_ALGO(tcp)->cong_signal(&tcp->tcp_ccv, type); |
| } |
| |
| DTRACE_PROBE6(cwnd__cc__cong__signal, tcp_t *, tcp, uint32_t, old_cwnd, |
| uint32_t, tcp->tcp_cwnd, uint32_t, old_cwnd_ssthresh, |
| uint32_t, tcp->tcp_cwnd_ssthresh, uint32_t, type); |
| } |
| |
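/*
 * Called when leaving loss recovery.  Gives the congestion control
 * algorithm a chance to adjust cwnd via its post_recovery() hook and
 * resets the ABC byte counter.
 */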
| static void |
| cc_post_recovery(tcp_t *tcp, uint32_t seg_ack) |
| { |
| uint32_t old_cwnd = tcp->tcp_cwnd; |
| |
| if (CC_ALGO(tcp)->post_recovery != NULL) { |
| tcp->tcp_ccv.curack = seg_ack; |
| CC_ALGO(tcp)->post_recovery(&tcp->tcp_ccv); |
| } |
| tcp->tcp_ccv.t_bytes_acked = 0; |
| |
| DTRACE_PROBE3(cwnd__cc__post__recovery, tcp_t *, tcp, |
| uint32_t, old_cwnd, uint32_t, tcp->tcp_cwnd); |
| } |
| |
| /* |
| * Set the MSS associated with a particular tcp based on its current value, |
| * and a new one passed in. Observe minimums and maximums, and reset other |
| * state variables that we want to view as multiples of MSS. |
| * |
 * The value of MSS could be either increased or decreased.
| */ |
| void |
| tcp_mss_set(tcp_t *tcp, uint32_t mss) |
| { |
| uint32_t mss_max; |
| tcp_stack_t *tcps = tcp->tcp_tcps; |
| conn_t *connp = tcp->tcp_connp; |
| |
| if (connp->conn_ipversion == IPV4_VERSION) |
| mss_max = tcps->tcps_mss_max_ipv4; |
| else |
| mss_max = tcps->tcps_mss_max_ipv6; |
| |
| if (mss < tcps->tcps_mss_min) |
| mss = tcps->tcps_mss_min; |
| if (mss > mss_max) |
| mss = mss_max; |
| /* |
| * Unless naglim has been set by our client to |
| * a non-mss value, force naglim to track mss. |
| * This can help to aggregate small writes. |
| */ |
| if (mss < tcp->tcp_naglim || tcp->tcp_mss == tcp->tcp_naglim) |
| tcp->tcp_naglim = mss; |
| /* |
 * TCP should be able to buffer at least 4 MSS worth of data for
 * obvious performance reasons.
| */ |
| if ((mss << 2) > connp->conn_sndbuf) |
| connp->conn_sndbuf = mss << 2; |
| |
| /* |
 * Set the send lowater to at least twice the MSS.
| */ |
| if ((mss << 1) > connp->conn_sndlowat) |
| connp->conn_sndlowat = mss << 1; |
| |
| /* |
| * Update tcp_cwnd according to the new value of MSS. Keep the |
| * previous ratio to preserve the transmit rate. |
| */ |
| tcp->tcp_cwnd = (tcp->tcp_cwnd / tcp->tcp_mss) * mss; |
| tcp->tcp_cwnd_cnt = 0; |
| |
| tcp->tcp_mss = mss; |
| (void) tcp_maxpsz_set(tcp, B_TRUE); |
| } |
| |
| /* |
| * Extract option values from a tcp header. We put any found values into the |
| * tcpopt struct and return a bitmask saying which options were found. |
| */ |
| int |
| tcp_parse_options(tcpha_t *tcpha, tcp_opt_t *tcpopt) |
| { |
| uchar_t *endp; |
| int len; |
| uint32_t mss; |
| uchar_t *up = (uchar_t *)tcpha; |
| int found = 0; |
| int32_t sack_len; |
| tcp_seq sack_begin, sack_end; |
| tcp_t *tcp; |
| |
| endp = up + TCP_HDR_LENGTH(tcpha); |
| up += TCP_MIN_HEADER_LENGTH; |
| /* |
| * If timestamp option is aligned as recommended in RFC 7323 Appendix |
| * A, and is the only option, return quickly. |
| */ |
| if (TCP_HDR_LENGTH(tcpha) == (uint32_t)TCP_MIN_HEADER_LENGTH + |
| TCPOPT_REAL_TS_LEN && |
| OK_32PTR(up) && |
| *(uint32_t *)up == TCPOPT_NOP_NOP_TSTAMP) { |
| tcpopt->tcp_opt_ts_val = ABE32_TO_U32((up+4)); |
| tcpopt->tcp_opt_ts_ecr = ABE32_TO_U32((up+8)); |
| |
| return (TCP_OPT_TSTAMP_PRESENT); |
| } |
| while (up < endp) { |
| len = endp - up; |
| switch (*up) { |
| case TCPOPT_EOL: |
| break; |
| |
| case TCPOPT_NOP: |
| up++; |
| continue; |
| |
| case TCPOPT_MAXSEG: |
| if (len < TCPOPT_MAXSEG_LEN || |
| up[1] != TCPOPT_MAXSEG_LEN) |
| break; |
| |
| mss = BE16_TO_U16(up+2); |
| /* Caller must handle tcp_mss_min and tcp_mss_max_* */ |
| tcpopt->tcp_opt_mss = mss; |
| found |= TCP_OPT_MSS_PRESENT; |
| |
| up += TCPOPT_MAXSEG_LEN; |
| continue; |
| |
| case TCPOPT_WSCALE: |
| if (len < TCPOPT_WS_LEN || up[1] != TCPOPT_WS_LEN) |
| break; |
| |
| if (up[2] > TCP_MAX_WINSHIFT) |
| tcpopt->tcp_opt_wscale = TCP_MAX_WINSHIFT; |
| else |
| tcpopt->tcp_opt_wscale = up[2]; |
| found |= TCP_OPT_WSCALE_PRESENT; |
| |
| up += TCPOPT_WS_LEN; |
| continue; |
| |
| case TCPOPT_SACK_PERMITTED: |
| if (len < TCPOPT_SACK_OK_LEN || |
| up[1] != TCPOPT_SACK_OK_LEN) |
| break; |
| found |= TCP_OPT_SACK_OK_PRESENT; |
| up += TCPOPT_SACK_OK_LEN; |
| continue; |
| |
| case TCPOPT_SACK: |
| if (len <= 2 || up[1] <= 2 || len < up[1]) |
| break; |
| |
| /* If TCP is not interested in SACK blks... */ |
| if ((tcp = tcpopt->tcp) == NULL) { |
| up += up[1]; |
| continue; |
| } |
| sack_len = up[1] - TCPOPT_HEADER_LEN; |
| up += TCPOPT_HEADER_LEN; |
| |
| /* |
| * If the list is empty, allocate one and assume |
| * nothing is sack'ed. |
| */ |
| if (tcp->tcp_notsack_list == NULL) { |
| tcp_notsack_update(&(tcp->tcp_notsack_list), |
| tcp->tcp_suna, tcp->tcp_snxt, |
| &(tcp->tcp_num_notsack_blk), |
| &(tcp->tcp_cnt_notsack_list)); |
| |
| /* |
| * Make sure tcp_notsack_list is not NULL. |
| * This happens when kmem_alloc(KM_NOSLEEP) |
| * returns NULL. |
| */ |
| if (tcp->tcp_notsack_list == NULL) { |
| up += sack_len; |
| continue; |
| } |
| tcp->tcp_fack = tcp->tcp_suna; |
| } |
| |
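			/*
			 * Walk the SACK blocks in the option; each block is
			 * a pair of 32-bit begin/end sequence numbers.
			 */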
| while (sack_len > 0) { |
| if (up + 8 > endp) { |
| up = endp; |
| break; |
| } |
| sack_begin = BE32_TO_U32(up); |
| up += 4; |
| sack_end = BE32_TO_U32(up); |
| up += 4; |
| sack_len -= 8; |
| /* |
| * Bounds checking. Make sure the SACK |
| * info is within tcp_suna and tcp_snxt. |
| * If this SACK blk is out of bound, ignore |
| * it but continue to parse the following |
| * blks. |
| */ |
| if (SEQ_LEQ(sack_end, sack_begin) || |
| SEQ_LT(sack_begin, tcp->tcp_suna) || |
| SEQ_GT(sack_end, tcp->tcp_snxt)) { |
| continue; |
| } |
| tcp_notsack_insert(&(tcp->tcp_notsack_list), |
| sack_begin, sack_end, |
| &(tcp->tcp_num_notsack_blk), |
| &(tcp->tcp_cnt_notsack_list)); |
| if (SEQ_GT(sack_end, tcp->tcp_fack)) { |
| tcp->tcp_fack = sack_end; |
| } |
| } |
| found |= TCP_OPT_SACK_PRESENT; |
| continue; |
| |
| case TCPOPT_TSTAMP: |
| if (len < TCPOPT_TSTAMP_LEN || |
| up[1] != TCPOPT_TSTAMP_LEN) |
| break; |
| |
| tcpopt->tcp_opt_ts_val = BE32_TO_U32(up+2); |
| tcpopt->tcp_opt_ts_ecr = BE32_TO_U32(up+6); |
| |
| found |= TCP_OPT_TSTAMP_PRESENT; |
| |
| up += TCPOPT_TSTAMP_LEN; |
| continue; |
| |
| default: |
| if (len <= 1 || len < (int)up[1] || up[1] == 0) |
| break; |
| up += up[1]; |
| continue; |
| } |
| break; |
| } |
| return (found); |
| } |
| |
| /* |
 * Process all TCP options in the SYN segment.  Note that this function should
 * be called after tcp_set_destination() is called so that the necessary info
 * from IRE is already set in the tcp structure.
 *
 * This function sets up the correct tcp_mss value according to the
 * MSS option value and our header size.  It also sets up the window scale
 * and timestamp values, and initializes SACK info blocks.  But it does not
 * change the receive window size after setting the tcp_mss value.  The
 * caller should make the appropriate change.
| */ |
| static void |
| tcp_process_options(tcp_t *tcp, tcpha_t *tcpha) |
| { |
| int options; |
| tcp_opt_t tcpopt; |
| uint32_t mss_max; |
| char *tmp_tcph; |
| tcp_stack_t *tcps = tcp->tcp_tcps; |
| conn_t *connp = tcp->tcp_connp; |
| |
| tcpopt.tcp = NULL; |
| options = tcp_parse_options(tcpha, &tcpopt); |
| |
| /* |
| * Process MSS option. Note that MSS option value does not account |
| * for IP or TCP options. This means that it is equal to MTU - minimum |
| * IP+TCP header size, which is 40 bytes for IPv4 and 60 bytes for |
| * IPv6. |
| */ |
| if (!(options & TCP_OPT_MSS_PRESENT)) { |
| if (connp->conn_ipversion == IPV4_VERSION) |
| tcpopt.tcp_opt_mss = tcps->tcps_mss_def_ipv4; |
| else |
| tcpopt.tcp_opt_mss = tcps->tcps_mss_def_ipv6; |
| } else { |
| if (connp->conn_ipversion == IPV4_VERSION) |
| mss_max = tcps->tcps_mss_max_ipv4; |
| else |
| mss_max = tcps->tcps_mss_max_ipv6; |
| if (tcpopt.tcp_opt_mss < tcps->tcps_mss_min) |
| tcpopt.tcp_opt_mss = tcps->tcps_mss_min; |
| else if (tcpopt.tcp_opt_mss > mss_max) |
| tcpopt.tcp_opt_mss = mss_max; |
| } |
| |
| /* Process Window Scale option. */ |
| if (options & TCP_OPT_WSCALE_PRESENT) { |
| tcp->tcp_snd_ws = tcpopt.tcp_opt_wscale; |
| tcp->tcp_snd_ws_ok = B_TRUE; |
| } else { |
| tcp->tcp_snd_ws = B_FALSE; |
| tcp->tcp_snd_ws_ok = B_FALSE; |
| tcp->tcp_rcv_ws = B_FALSE; |
| } |
| |
| /* Process Timestamp option. */ |
| if ((options & TCP_OPT_TSTAMP_PRESENT) && |
| (tcp->tcp_snd_ts_ok || TCP_IS_DETACHED(tcp))) { |
| tmp_tcph = (char *)tcp->tcp_tcpha; |
| |
| tcp->tcp_snd_ts_ok = B_TRUE; |
| tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val; |
| tcp->tcp_last_rcv_lbolt = ddi_get_lbolt64(); |
| ASSERT(OK_32PTR(tmp_tcph)); |
| ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH); |
| |
| /* Fill in our template header with basic timestamp option. */ |
| tmp_tcph += connp->conn_ht_ulp_len; |
| tmp_tcph[0] = TCPOPT_NOP; |
| tmp_tcph[1] = TCPOPT_NOP; |
| tmp_tcph[2] = TCPOPT_TSTAMP; |
| tmp_tcph[3] = TCPOPT_TSTAMP_LEN; |
| connp->conn_ht_iphc_len += TCPOPT_REAL_TS_LEN; |
| connp->conn_ht_ulp_len += TCPOPT_REAL_TS_LEN; |
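		/*
		 * The timestamp option adds 12 bytes (3 32-bit words) to the
		 * TCP header, so bump the data offset field (the upper 4
		 * bits) by 3.
		 */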
| tcp->tcp_tcpha->tha_offset_and_reserved += (3 << 4); |
| } else { |
| tcp->tcp_snd_ts_ok = B_FALSE; |
| } |
| |
| /* |
 * Process SACK options.  If SACK is enabled for this connection,
 * then allocate the SACK info structure.  Note the places where
 * tcp_snd_sack_ok is set to B_TRUE:
 *
 * For an active connection: in tcp_set_destination() called in
 * tcp_connect().
 *
 * For a passive connection: in tcp_set_destination() called in
 * tcp_input_listener().
 *
 * That's the reason why the extra TCP_IS_DETACHED() check is there.
 * That check makes sure that if we did not send a SACK OK option,
 * we will not enable SACK for this connection even though the other
 * side sends us the SACK OK option.  For an active connection, the SACK
 * info structure has already been allocated.  So we need to free
 * it if SACK is disabled.
| */ |
| if ((options & TCP_OPT_SACK_OK_PRESENT) && |
| (tcp->tcp_snd_sack_ok || |
| (tcps->tcps_sack_permitted != 0 && TCP_IS_DETACHED(tcp)))) { |
| ASSERT(tcp->tcp_num_sack_blk == 0); |
| ASSERT(tcp->tcp_notsack_list == NULL); |
| |
| tcp->tcp_snd_sack_ok = B_TRUE; |
| if (tcp->tcp_snd_ts_ok) { |
| tcp->tcp_max_sack_blk = 3; |
| } else { |
| tcp->tcp_max_sack_blk = 4; |
| } |
| } else if (tcp->tcp_snd_sack_ok) { |
| /* |
| * Resetting tcp_snd_sack_ok to B_FALSE so that |
| * no SACK info will be used for this |
| * connection. This assumes that SACK usage |
| * permission is negotiated. This may need |
| * to be changed once this is clarified. |
| */ |
| ASSERT(tcp->tcp_num_sack_blk == 0); |
| ASSERT(tcp->tcp_notsack_list == NULL); |
| tcp->tcp_snd_sack_ok = B_FALSE; |
| } |
| |
| /* |
| * Now we know the exact TCP/IP header length, subtract |
| * that from tcp_mss to get our side's MSS. |
| */ |
| tcp->tcp_mss -= connp->conn_ht_iphc_len; |
| |
| /* |
| * Here we assume that the other side's header size will be equal to |
 * our header size.  We calculate the real MSS accordingly.  We also
 * need to account for the additional overhead that IPsec adds.
| * |
| * Real MSS = Opt.MSS - (our TCP/IP header - min TCP/IP header) |
| */ |
| tcpopt.tcp_opt_mss -= connp->conn_ht_iphc_len + |
| tcp->tcp_ipsec_overhead - |
| ((connp->conn_ipversion == IPV4_VERSION ? |
| IP_SIMPLE_HDR_LENGTH : IPV6_HDR_LEN) + TCP_MIN_HEADER_LENGTH); |
| |
| /* |
| * Set MSS to the smaller one of both ends of the connection. |
| * We should not have called tcp_mss_set() before, but our |
| * side of the MSS should have been set to a proper value |
| * by tcp_set_destination(). tcp_mss_set() will also set up the |
| * STREAM head parameters properly. |
| * |
| * If we have a larger-than-16-bit window but the other side |
| * didn't want to do window scale, tcp_rwnd_set() will take |
| * care of that. |
| */ |
| tcp_mss_set(tcp, MIN(tcpopt.tcp_opt_mss, tcp->tcp_mss)); |
| |
| /* |
| * Initialize tcp_cwnd value. After tcp_mss_set(), tcp_mss has been |
| * updated properly. |
| */ |
| TCP_SET_INIT_CWND(tcp, tcp->tcp_mss, tcps->tcps_slow_start_initial); |
| |
| if (tcp->tcp_cc_algo->conn_init != NULL) |
| tcp->tcp_cc_algo->conn_init(&tcp->tcp_ccv); |
| } |
| |
| /* |
| * Add a new piece to the tcp reassembly queue. If the gap at the beginning |
| * is filled, return as much as we can. The message passed in may be |
| * multi-part, chained using b_cont. "start" is the starting sequence |
| * number for this piece. |
| */ |
| static mblk_t * |
| tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start) |
| { |
| uint32_t end, bytes; |
| mblk_t *mp1; |
| mblk_t *mp2; |
| mblk_t *next_mp; |
| uint32_t u1; |
| tcp_stack_t *tcps = tcp->tcp_tcps; |
| |
| |
| /* Walk through all the new pieces. */ |
| do { |
| ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= |
| (uintptr_t)INT_MAX); |
| end = start + (int)(mp->b_wptr - mp->b_rptr); |
| next_mp = mp->b_cont; |
| if (start == end) { |
| /* Empty. Blast it. */ |
| freeb(mp); |
| continue; |
| } |
| bytes = end - start; |
| mp->b_cont = NULL; |
| TCP_REASS_SET_SEQ(mp, start); |
| TCP_REASS_SET_END(mp, end); |
| mp1 = tcp->tcp_reass_tail; |
| if (mp1 == NULL || SEQ_GEQ(start, TCP_REASS_END(mp1))) { |
| if (mp1 != NULL) { |
| /* |
| * New stuff is beyond the tail; link it on the |
| * end. |
| */ |
| mp1->b_cont = mp; |
| } else { |
| tcp->tcp_reass_head = mp; |
| } |
| tcp->tcp_reass_tail = mp; |
| TCPS_BUMP_MIB(tcps, tcpInDataUnorderSegs); |
| TCPS_UPDATE_MIB(tcps, tcpInDataUnorderBytes, bytes); |
| tcp->tcp_cs.tcp_in_data_unorder_segs++; |
| tcp->tcp_cs.tcp_in_data_unorder_bytes += bytes; |
| continue; |
| } |
| mp1 = tcp->tcp_reass_head; |
| u1 = TCP_REASS_SEQ(mp1); |
| /* New stuff at the front? */ |
| if (SEQ_LT(start, u1)) { |
| /* Yes... Check for overlap. */ |
| mp->b_cont = mp1; |
| tcp->tcp_reass_head = mp; |
| tcp_reass_elim_overlap(tcp, mp); |
| continue; |
| } |
| /* |
| * The new piece fits somewhere between the head and tail. |
| * We find our slot, where mp1 precedes us and mp2 trails. |
| */ |
| for (; (mp2 = mp1->b_cont) != NULL; mp1 = mp2) { |
| u1 = TCP_REASS_SEQ(mp2); |
| if (SEQ_LEQ(start, u1)) |
| break; |
| } |
| /* Link ourselves in */ |
| mp->b_cont = mp2; |
| mp1->b_cont = mp; |
| |
| /* Trim overlap with following mblk(s) first */ |
| tcp_reass_elim_overlap(tcp, mp); |
| |
| /* Trim overlap with preceding mblk */ |
| tcp_reass_elim_overlap(tcp, mp1); |
| |
| } while (start = end, mp = next_mp); |
| mp1 = tcp->tcp_reass_head; |
| /* Anything ready to go? */ |
| if (TCP_REASS_SEQ(mp1) != tcp->tcp_rnxt) |
| return (NULL); |
| /* Eat what we can off the queue */ |
| for (;;) { |
| mp = mp1->b_cont; |
| end = TCP_REASS_END(mp1); |
| TCP_REASS_SET_SEQ(mp1, 0); |
| TCP_REASS_SET_END(mp1, 0); |
| if (!mp) { |
| tcp->tcp_reass_tail = NULL; |
| break; |
| } |
| if (end != TCP_REASS_SEQ(mp)) { |
| mp1->b_cont = NULL; |
| break; |
| } |
| mp1 = mp; |
| } |
| mp1 = tcp->tcp_reass_head; |
| tcp->tcp_reass_head = mp; |
| return (mp1); |
| } |
| |
| /* Eliminate any overlap that mp may have over later mblks */ |
| static void |
| tcp_reass_elim_overlap(tcp_t *tcp, mblk_t *mp) |
| { |
| uint32_t end; |
| mblk_t *mp1; |
| uint32_t u1; |
| tcp_stack_t *tcps = tcp->tcp_tcps; |
| |
| end = TCP_REASS_END(mp); |
| while ((mp1 = mp->b_cont) != NULL) { |
| u1 = TCP_REASS_SEQ(mp1); |
| if (!SEQ_GT(end, u1)) |
| break; |
| if (!SEQ_GEQ(end, TCP_REASS_END(mp1))) { |
| mp->b_wptr -= end - u1; |
| TCP_REASS_SET_END(mp, u1); |
| TCPS_BUMP_MIB(tcps, tcpInDataPartDupSegs); |
| TCPS_UPDATE_MIB(tcps, tcpInDataPartDupBytes, |
| end - u1); |
| break; |
| } |
| mp->b_cont = mp1->b_cont; |
| TCP_REASS_SET_SEQ(mp1, 0); |
| TCP_REASS_SET_END(mp1, 0); |
| freeb(mp1); |
| TCPS_BUMP_MIB(tcps, tcpInDataDupSegs); |
| TCPS_UPDATE_MIB(tcps, tcpInDataDupBytes, end - u1); |
| } |
| if (!mp1) |
| tcp->tcp_reass_tail = mp; |
| } |
| |
| /* |
| * This function does PAWS protection check, per RFC 7323 section 5. Requires |
| * that timestamp options are already processed into tcpoptp. Returns B_TRUE if |
| * the segment passes the PAWS test, else returns B_FALSE. |
| */ |
| boolean_t |
| tcp_paws_check(tcp_t *tcp, const tcp_opt_t *tcpoptp) |
| { |
| if (TSTMP_LT(tcpoptp->tcp_opt_ts_val, |
| tcp->tcp_ts_recent)) { |
| if (LBOLT_FASTPATH64 < |
| (tcp->tcp_last_rcv_lbolt + PAWS_TIMEOUT)) { |
| /* This segment is not acceptable. */ |
| return (B_FALSE); |
| } else { |
| /* |
 * Connection has been idle for too long.
 * Reset the timestamp.
| */ |
| tcp->tcp_ts_recent = |
| tcpoptp->tcp_opt_ts_val; |
| } |
| } |
| return (B_TRUE); |
| } |
| |
| /* |
 * Defense against the SYN attack -
 * 1. When q0 is full, drop from the tail (tcp_eager_prev_drop_q0) the oldest
 * one from the list of droppable eagers.  This list is a subset of q0.
 * See the comments before the definition of MAKE_DROPPABLE().
 * 2. Don't drop a SYN request before its first timeout.  This gives every
 * request at least until the first timeout to complete its 3-way handshake.
 * 3. Maintain tcp_syn_rcvd_timeout as an accurate count of how many
 * requests currently on the queue have timed out.  This will be used
 * as an indicator of whether an attack is under way, so that appropriate
 * actions can be taken.  (It's incremented in tcp_timer() and decremented
 * either when the eager goes into ESTABLISHED, or gets freed up.)
 * 4. The current thresholds are: # of timeouts > q0len/4 turns the SYN
 * alert on; # of timeouts dropping back to <= q0len/32 turns it off.
| */ |
| static boolean_t |
| tcp_drop_q0(tcp_t *tcp) |
| { |
| tcp_t *eager; |
| mblk_t *mp; |
| tcp_stack_t *tcps = tcp->tcp_tcps; |
| |
| ASSERT(MUTEX_HELD(&tcp->tcp_eager_lock)); |
| ASSERT(tcp->tcp_eager_next_q0 != tcp->tcp_eager_prev_q0); |
| |
| /* Pick oldest eager from the list of droppable eagers */ |
| eager = tcp->tcp_eager_prev_drop_q0; |
| |
/* If the list is empty, return B_FALSE */
| if (eager == tcp) { |
| return (B_FALSE); |
| } |
| |
| /* If allocated, the mp will be freed in tcp_clean_death_wrapper() */ |
| if ((mp = allocb(0, BPRI_HI)) == NULL) |
| return (B_FALSE); |
| |
| /* |
| * Take this eager out from the list of droppable eagers since we are |
| * going to drop it. |
| */ |
| MAKE_UNDROPPABLE(eager); |
| |
| if (tcp->tcp_connp->conn_debug) { |
| (void) strlog(TCP_MOD_ID, 0, 3, SL_TRACE, |
| "tcp_drop_q0: listen half-open queue (max=%d) overflow" |
| " (%d pending) on %s, drop one", tcps->tcps_conn_req_max_q0, |
| tcp->tcp_conn_req_cnt_q0, |
| tcp_display(tcp, NULL, DISP_PORT_ONLY)); |
| } |
| |
| TCPS_BUMP_MIB(tcps, tcpHalfOpenDrop); |
| |
/* Put a reference on the conn as we are enqueueing it in the squeue */
| CONN_INC_REF(eager->tcp_connp); |
| |
| SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp, |
| tcp_clean_death_wrapper, eager->tcp_connp, NULL, |
| SQ_FILL, SQTAG_TCP_DROP_Q0); |
| |
| return (B_TRUE); |
| } |
| |
| /* |
| * Handle a SYN on an AF_INET6 socket; can be either IPv4 or IPv6 |
| */ |
| static mblk_t * |
| tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp, |
| ip_recv_attr_t *ira) |
| { |
| tcp_t *ltcp = lconnp->conn_tcp; |
| tcp_t *tcp = connp->conn_tcp; |
| mblk_t *tpi_mp; |
| ipha_t *ipha; |
| ip6_t *ip6h; |
| sin6_t sin6; |
| uint_t ifindex = ira->ira_ruifindex; |
| tcp_stack_t *tcps = tcp->tcp_tcps; |
| |
| if (ira->ira_flags & IRAF_IS_IPV4) { |
| ipha = (ipha_t *)mp->b_rptr; |
| |
| connp->conn_ipversion = IPV4_VERSION; |
| IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &connp->conn_laddr_v6); |
| IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &connp->conn_faddr_v6); |
| connp->conn_saddr_v6 = connp->conn_laddr_v6; |
| |
| sin6 = sin6_null; |
| sin6.sin6_addr = connp->conn_faddr_v6; |
| sin6.sin6_port = connp->conn_fport; |
| sin6.sin6_family = AF_INET6; |
| sin6.__sin6_src_id = ip_srcid_find_addr(&connp->conn_laddr_v6, |
| IPCL_ZONEID(lconnp), tcps->tcps_netstack); |
| |
| if (connp->conn_recv_ancillary.crb_recvdstaddr) { |
| sin6_t sin6d; |
| |
| sin6d = sin6_null; |
| sin6d.sin6_addr = connp->conn_laddr_v6; |
| sin6d.sin6_port = connp->conn_lport; |
| sin6d.sin6_family = AF_INET; |
| tpi_mp = mi_tpi_extconn_ind(NULL, |
| (char *)&sin6d, sizeof (sin6_t), |
| (char *)&tcp, |
| (t_scalar_t)sizeof (intptr_t), |
| (char *)&sin6d, sizeof (sin6_t), |
| (t_scalar_t)ltcp->tcp_conn_req_seqnum); |
| } else { |
| tpi_mp = mi_tpi_conn_ind(NULL, |
| (char *)&sin6, sizeof (sin6_t), |
| (char *)&tcp, (t_scalar_t)sizeof (intptr_t), |
| (t_scalar_t)ltcp->tcp_conn_req_seqnum); |
| } |
| } else { |
| ip6h = (ip6_t *)mp->b_rptr; |
| |
| connp->conn_ipversion = IPV6_VERSION; |
| connp->conn_laddr_v6 = ip6h->ip6_dst; |
| connp->conn_faddr_v6 = ip6h->ip6_src; |
| connp->conn_saddr_v6 = connp->conn_laddr_v6; |
| |
| sin6 = sin6_null; |
| sin6.sin6_addr = connp->conn_faddr_v6; |
| sin6.sin6_port = connp->conn_fport; |
| sin6.sin6_family = AF_INET6; |
| sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK; |
| sin6.__sin6_src_id = ip_srcid_find_addr(&connp->conn_laddr_v6, |
| IPCL_ZONEID(lconnp), tcps->tcps_netstack); |
| |
| if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) { |
| /* Pass up the scope_id of remote addr */ |
| sin6.sin6_scope_id = ifindex; |
| } else { |
| sin6.sin6_scope_id = 0; |
| } |
| if (connp->conn_recv_ancillary.crb_recvdstaddr) { |
| sin6_t sin6d; |
| |
| sin6d = sin6_null; |
| sin6.sin6_addr = connp->conn_laddr_v6; |
| sin6d.sin6_port = connp->conn_lport; |
| sin6d.sin6_family = AF_INET6; |
| if (IN6_IS_ADDR_LINKSCOPE(&connp->conn_laddr_v6)) |
| sin6d.sin6_scope_id = ifindex; |
| |
| tpi_mp = mi_tpi_extconn_ind(NULL, |
| (char *)&sin6d, sizeof (sin6_t), |
| (char *)&tcp, (t_scalar_t)sizeof (intptr_t), |
| (char *)&sin6d, sizeof (sin6_t), |
| (t_scalar_t)ltcp->tcp_conn_req_seqnum); |
| } else { |
| tpi_mp = mi_tpi_conn_ind(NULL, |
| (char *)&sin6, sizeof (sin6_t), |
| (char *)&tcp, (t_scalar_t)sizeof (intptr_t), |
| (t_scalar_t)ltcp->tcp_conn_req_seqnum); |
| } |
| } |
| |
| tcp->tcp_mss = tcps->tcps_mss_def_ipv6; |
| return (tpi_mp); |
| } |
| |
| /* Handle a SYN on an AF_INET socket */ |
| static mblk_t * |
| tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, mblk_t *mp, |
| ip_recv_attr_t *ira) |
| { |
| tcp_t *ltcp = lconnp->conn_tcp; |
| tcp_t *tcp = connp->conn_tcp; |
| sin_t sin; |
| mblk_t *tpi_mp = NULL; |
| tcp_stack_t *tcps = tcp->tcp_tcps; |
| ipha_t *ipha; |
| |
| ASSERT(ira->ira_flags & IRAF_IS_IPV4); |
| ipha = (ipha_t *)mp->b_rptr; |
| |
| connp->conn_ipversion = IPV4_VERSION; |
| IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &connp->conn_laddr_v6); |
| IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &connp->conn_faddr_v6); |
| connp->conn_saddr_v6 = connp->conn_laddr_v6; |
| |
| sin = sin_null; |
| sin.sin_addr.s_addr = connp->conn_faddr_v4; |
| sin.sin_port = connp->conn_fport; |
| sin.sin_family = AF_INET; |
| if (lconnp->conn_recv_ancillary.crb_recvdstaddr) { |
| sin_t sind; |
| |
| sind = sin_null; |
| sind.sin_addr.s_addr = connp->conn_laddr_v4; |
| sind.sin_port = connp->conn_lport; |
| sind.sin_family = AF_INET; |
| tpi_mp = mi_tpi_extconn_ind(NULL, |
| (char *)&sind, sizeof (sin_t), (char *)&tcp, |
| (t_scalar_t)sizeof (intptr_t), (char *)&sind, |
| sizeof (sin_t), (t_scalar_t)ltcp->tcp_conn_req_seqnum); |
| } else { |
| tpi_mp = mi_tpi_conn_ind(NULL, |
| (char *)&sin, sizeof (sin_t), |
| (char *)&tcp, (t_scalar_t)sizeof (intptr_t), |
| (t_scalar_t)ltcp->tcp_conn_req_seqnum); |
| } |
| |
| tcp->tcp_mss = tcps->tcps_mss_def_ipv4; |
| return (tpi_mp); |
| } |
| |
| /* |
| * Called via squeue to get on to eager's perimeter. It sends a |
| * TH_RST if eager is in the fanout table. The listener wants the |
| * eager to disappear either by means of tcp_eager_blowoff() or |
| * tcp_eager_cleanup() being called. tcp_eager_kill() can also be |
| * called (via squeue) if the eager cannot be inserted in the |
| * fanout table in tcp_input_listener(). |
| */ |
| /* ARGSUSED */ |
| void |
| tcp_eager_kill(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) |
| { |
| conn_t *econnp = (conn_t *)arg; |
| tcp_t *eager = econnp->conn_tcp; |
| tcp_t *listener = eager->tcp_listener; |
| |
| /* |
 * We could be called because the listener is closing.  Since
 * the eager was using the listener's queues, we avoid
 * using the listener's queues from now on.
| */ |
| ASSERT(eager->tcp_detached); |
| econnp->conn_rq = NULL; |
| econnp->conn_wq = NULL; |
| |
| /* |
| * An eager's conn_fanout will be NULL if it's a duplicate |
 * of an existing 4-tuple in the conn fanout table.
 * We don't want to send an RST out in such a case.
| */ |
| if (econnp->conn_fanout != NULL && eager->tcp_state > TCPS_LISTEN) { |
| tcp_xmit_ctl("tcp_eager_kill, can't wait", |
| eager, eager->tcp_snxt, 0, TH_RST); |
| } |
| |
| /* We are here because listener wants this eager gone */ |
| if (listener != NULL) { |
| mutex_enter(&listener->tcp_eager_lock); |
| tcp_eager_unlink(eager); |
| if (eager->tcp_tconnind_started) { |
| /* |
 * The eager has sent a conn_ind up to the
 * listener, but the listener decides to close
| * instead. We need to drop the extra ref |
| * placed on eager in tcp_input_data() before |
| * sending the conn_ind to listener. |
| */ |
| CONN_DEC_REF(econnp); |
| } |
| mutex_exit(&listener->tcp_eager_lock); |
| CONN_DEC_REF(listener->tcp_connp); |
| } |
| |
| if (eager->tcp_state != TCPS_CLOSED) |
| tcp_close_detached(eager); |
| } |
| |
| /* |
| * Reset any eager connection hanging off this listener marked |
 * with 'seqnum' and then reclaim its resources.
| */ |
| boolean_t |
| tcp_eager_blowoff(tcp_t *listener, t_scalar_t seqnum) |
| { |
| tcp_t *eager; |
| mblk_t *mp; |
| |
| eager = listener; |
| mutex_enter(&listener->tcp_eager_lock); |
| do { |
| eager = eager->tcp_eager_next_q; |
| if (eager == NULL) { |
| mutex_exit(&listener->tcp_eager_lock); |
| return (B_FALSE); |
| } |
| } while (eager->tcp_conn_req_seqnum != seqnum); |
| |
| if (eager->tcp_closemp_used) { |
| mutex_exit(&listener->tcp_eager_lock); |
| return (B_TRUE); |
| } |
| eager->tcp_closemp_used = B_TRUE; |
| TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15); |
| CONN_INC_REF(eager->tcp_connp); |
| mutex_exit(&listener->tcp_eager_lock); |
| mp = &eager->tcp_closemp; |
| SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp, tcp_eager_kill, |
| eager->tcp_connp, NULL, SQ_FILL, SQTAG_TCP_EAGER_BLOWOFF); |
| return (B_TRUE); |
| } |
| |
| /* |
| * Reset any eager connection hanging off this listener |
 * and then reclaim its resources.
| */ |
| void |
| tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only) |
| { |
| tcp_t *eager; |
| mblk_t *mp; |
| tcp_stack_t *tcps = listener->tcp_tcps; |
| |
| ASSERT(MUTEX_HELD(&listener->tcp_eager_lock)); |
| |
| if (!q0_only) { |
| /* First cleanup q */ |
| TCP_STAT(tcps, tcp_eager_blowoff_q); |
| eager = listener->tcp_eager_next_q; |
| while (eager != NULL) { |
| if (!eager->tcp_closemp_used) { |
| eager->tcp_closemp_used = B_TRUE; |
| TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15); |
| CONN_INC_REF(eager->tcp_connp); |
| mp = &eager->tcp_closemp; |
| SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp, |
| tcp_eager_kill, eager->tcp_connp, NULL, |
| SQ_FILL, SQTAG_TCP_EAGER_CLEANUP); |
| } |
| eager = eager->tcp_eager_next_q; |
| } |
| } |
| /* Then cleanup q0 */ |
| TCP_STAT(tcps, tcp_eager_blowoff_q0); |
| eager = listener->tcp_eager_next_q0; |
| while (eager != listener) { |
| if (!eager->tcp_closemp_used) { |
| eager->tcp_closemp_used = B_TRUE; |
| TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15); |
| CONN_INC_REF(eager->tcp_connp); |
| mp = &eager->tcp_closemp; |
| SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp, |
| tcp_eager_kill, eager->tcp_connp, NULL, SQ_FILL, |
| SQTAG_TCP_EAGER_CLEANUP_Q0); |
| } |
| eager = eager->tcp_eager_next_q0; |
| } |
| } |
| |
| /* |
| * If we are an eager connection hanging off a listener that hasn't |
| * formally accepted the connection yet, get off its list and blow off |
| * any data that we have accumulated. |
| */ |
| void |
| tcp_eager_unlink(tcp_t *tcp) |
| { |
| tcp_t *listener = tcp->tcp_listener; |
| |
| ASSERT(listener != NULL); |
| ASSERT(MUTEX_HELD(&listener->tcp_eager_lock)); |
| if (tcp->tcp_eager_next_q0 != NULL) { |
| ASSERT(tcp->tcp_eager_prev_q0 != NULL); |
| |
| /* Remove the eager tcp from q0 */ |
| tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = |
| tcp->tcp_eager_prev_q0; |
| tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = |
| tcp->tcp_eager_next_q0; |
| ASSERT(listener->tcp_conn_req_cnt_q0 > 0); |
| listener->tcp_conn_req_cnt_q0--; |
| |
| tcp->tcp_eager_next_q0 = NULL; |
| tcp->tcp_eager_prev_q0 = NULL; |
| |
| /* |
| * Take the eager out, if it is in the list of droppable |
| * eagers. |
| */ |
| MAKE_UNDROPPABLE(tcp); |
| |
| if (tcp->tcp_syn_rcvd_timeout != 0) { |
| /* we have timed out before */ |
| ASSERT(listener->tcp_syn_rcvd_timeout > 0); |
| listener->tcp_syn_rcvd_timeout--; |
| } |
| } else { |
| tcp_t **tcpp = &listener->tcp_eager_next_q; |
| tcp_t *prev = NULL; |
| |
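		/*
		 * The eager is not on q0, so it should be on the established
		 * (pending accept) queue; search tcp_eager_next_q for it and
		 * unlink it, adjusting the tail pointer if necessary.
		 */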
| for (; tcpp[0]; tcpp = &tcpp[0]->tcp_eager_next_q) { |
| if (tcpp[0] == tcp) { |
| if (listener->tcp_eager_last_q == tcp) { |
| /* |
| * If we are unlinking the last |
| * element on the list, adjust |
| * tail pointer. Set tail pointer |
| * to nil when list is empty. |
| */ |
| ASSERT(tcp->tcp_eager_next_q == NULL); |
| if (listener->tcp_eager_last_q == |
| listener->tcp_eager_next_q) { |
| listener->tcp_eager_last_q = |
| NULL; |
| } else { |
| /* |
| * We won't get here if there |
| * is only one eager in the |
| * list. |
| */ |
| ASSERT(prev != NULL); |
| listener->tcp_eager_last_q = |
| prev; |
| } |
| } |
| tcpp[0] = tcp->tcp_eager_next_q; |
| tcp->tcp_eager_next_q = NULL; |
| tcp->tcp_eager_last_q = NULL; |
| ASSERT(listener->tcp_conn_req_cnt_q > 0); |
| listener->tcp_conn_req_cnt_q--; |
| break; |
| } |
| prev = tcpp[0]; |
| } |
| } |
| tcp->tcp_listener = NULL; |
| } |
| |
| /* BEGIN CSTYLED */ |
| /* |
| * |
| * The sockfs ACCEPT path: |
| * ======================= |
| * |
| * The eager is now established in its own perimeter as soon as SYN is |
| * received in tcp_input_listener(). When sockfs receives conn_ind, it |
| * completes the accept processing on the acceptor STREAM. The sending |
 * of the conn_ind is common for both a sockfs listener and a TLI/XTI
 * listener, but a TLI/XTI listener completes the accept processing
| * on the listener perimeter. |
| * |
| * Common control flow for 3 way handshake: |
| * ---------------------------------------- |
| * |
| * incoming SYN (listener perimeter) -> tcp_input_listener() |
| * |
| * incoming SYN-ACK-ACK (eager perim) -> tcp_input_data() |
| * send T_CONN_IND (listener perim) -> tcp_send_conn_ind() |
| * |
| * Sockfs ACCEPT Path: |
| * ------------------- |
| * |
| * open acceptor stream (tcp_open allocates tcp_tli_accept() |
| * as STREAM entry point) |
| * |
| * soaccept() sends T_CONN_RES on the acceptor STREAM to tcp_tli_accept() |
| * |
 * tcp_tli_accept() extracts the eager and makes the q->q_ptr <-> eager
 * association (we are not behind the eager's squeue, but sockfs is protecting
 * us and no one knows about this stream yet).  The STREAMS entry point
 * q->q_info is changed to point at tcp_wput().
| * |
| * tcp_accept_common() sends any deferred eagers via tcp_send_pending() to |
| * listener (done on listener's perimeter). |
| * |
| * tcp_tli_accept() calls tcp_accept_finish() on eagers perimeter to finish |
| * accept. |
| * |
| * TLI/XTI client ACCEPT path: |
| * --------------------------- |
| * |
| * soaccept() sends T_CONN_RES on the listener STREAM. |
| * |
| * tcp_tli_accept() -> tcp_accept_swap() complete the processing and send |
| * a M_SETOPS mblk to eager perimeter to finish accept (tcp_accept_finish()). |
| * |
| * Locks: |
| * ====== |
| * |
 * listener->tcp_eager_lock protects listener->tcp_eager_next_q0 and
 * listener->tcp_eager_next_q.
| * |
| * Referencing: |
| * ============ |
| * |
 * 1) We start out in tcp_input_listener by the eager placing a ref on the
 * listener and the listener adding the eager to listener->tcp_eager_next_q0.
| * |
| * 2) When a SYN-ACK-ACK arrives, we send the conn_ind to listener. Before |
| * doing so we place a ref on the eager. This ref is finally dropped at the |
| * end of tcp_accept_finish() while unwinding from the squeue, i.e. the |
| * reference is dropped by the squeue framework. |
| * |
 * 3) The ref on the listener placed in 1 above is dropped in
 * tcp_accept_finish.
 *
 * The reference must be released by the same entity that added the reference.
 * In the above scheme, the eager is the entity that adds and releases the
| * references. Note that tcp_accept_finish executes in the squeue of the eager |
| * (albeit after it is attached to the acceptor stream). Though 1. executes |
| * in the listener's squeue, the eager is nascent at this point and the |
| * reference can be considered to have been added on behalf of the eager. |
| * |
| * Eager getting a Reset or listener closing: |
| * ========================================== |
| * |
| * Once the listener and eager are linked, the listener never does the unlink. |
 * If the listener needs to close, tcp_eager_cleanup() is called, which queues
 * a message on each eager's perimeter.  The eager then does the unlink, clears
| * any pointers to the listener's queue and drops the reference to the |
| * listener. The listener waits in tcp_close outside the squeue until its |
| * refcount has dropped to 1. This ensures that the listener has waited for |
| * all eagers to clear their association with the listener. |
| * |
 * Similarly, if the eager decides to go away, it can unlink itself and close.
 * When the T_CONN_RES comes down, we check if the eager has closed.  Note that
| * the reference to eager is still valid because of the extra ref we put |
| * in tcp_send_conn_ind. |
| * |
| * Listener can always locate the eager under the protection |
| * of the listener->tcp_eager_lock, and then do a refhold |
| * on the eager during the accept processing. |
| * |
| * The acceptor stream accesses the eager in the accept processing |
| * based on the ref placed on eager before sending T_conn_ind. |
| * The only entity that can negate this refhold is a listener close |
| * which is mutually exclusive with an active acceptor stream. |
| * |
| * Eager's reference on the listener |
| * =================================== |
| * |
| * If the accept happens (even on a closed eager) the eager drops its |
| * reference on the listener at the start of tcp_accept_finish. If the |
| * eager is killed due to an incoming RST before the T_conn_ind is sent up, |
| * the reference is dropped in tcp_closei_local. If the listener closes, |
| * the reference is dropped in tcp_eager_kill. In all cases the reference |
| * is dropped while executing in the eager's context (squeue). |
| */ |
| /* END CSTYLED */ |
| |
| /* Process the SYN packet, mp, directed at the listener 'tcp' */ |
| |
| /* |
| * THIS FUNCTION IS DIRECTLY CALLED BY IP VIA SQUEUE FOR SYN. |
| * tcp_input_data will not see any packets for listeners since the listener |
| * has conn_recv set to tcp_input_listener. |
| */ |
| /* ARGSUSED */ |
| static void |
| tcp_input_listener(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) |
| { |
| tcpha_t *tcpha; |
| uint32_t seg_seq; |
| tcp_t *eager; |
| int err; |
| conn_t *econnp = NULL; |
| squeue_t *new_sqp; |
| mblk_t *mp1; |
| uint_t ip_hdr_len; |
| conn_t *lconnp = (conn_t *)arg; |
| tcp_t *listener = lconnp->conn_tcp; |
| tcp_stack_t *tcps = listener->tcp_tcps; |
| ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; |
| uint_t flags; |
| mblk_t *tpi_mp; |
| uint_t ifindex = ira->ira_ruifindex; |
| boolean_t tlc_set = B_FALSE; |
| |
| ip_hdr_len = ira->ira_ip_hdr_length; |
| tcpha = (tcpha_t *)&mp->b_rptr[ip_hdr_len]; |
| flags = (unsigned int)tcpha->tha_flags & 0xFF; |
| |
| DTRACE_TCP5(receive, mblk_t *, NULL, ip_xmit_attr_t *, lconnp->conn_ixa, |
| __dtrace_tcp_void_ip_t *, mp->b_rptr, tcp_t *, listener, |
| __dtrace_tcp_tcph_t *, tcpha); |
| |
| if (!(flags & TH_SYN)) { |
| if ((flags & TH_RST) || (flags & TH_URG)) { |
| freemsg(mp); |
| return; |
| } |
| if (flags & TH_ACK) { |
| /* Note this executes in listener's squeue */ |
| tcp_xmit_listeners_reset(mp, ira, ipst, lconnp); |
| return; |
| } |
| |
| freemsg(mp); |
| return; |
| } |
| |
| if (listener->tcp_state != TCPS_LISTEN) |
| goto error2; |
| |
| ASSERT(IPCL_IS_BOUND(lconnp)); |
| |
| mutex_enter(&listener->tcp_eager_lock); |
| |
| /* |
 * If the system is under memory pressure, we need to do our part
 * to relieve the pressure.  So we only accept a new request if there
 * is nothing waiting to be accepted or waiting to complete the 3-way
 * handshake.  This means that a busy listener will not get too many
 * new requests which it cannot handle in time, while a non-busy
 * listener can still function properly.
| */ |
| if (tcps->tcps_reclaim && (listener->tcp_conn_req_cnt_q > 0 || |
| listener->tcp_conn_req_cnt_q0 > 0)) { |
| mutex_exit(&listener->tcp_eager_lock); |
| TCP_STAT(tcps, tcp_listen_mem_drop); |
| goto error2; |
| } |
| |
| if (listener->tcp_conn_req_cnt_q >= listener->tcp_conn_req_max) { |
| mutex_exit(&listener->tcp_eager_lock); |
| TCP_STAT(tcps, tcp_listendrop); |
| TCPS_BUMP_MIB(tcps, tcpListenDrop); |
| if (lconnp->conn_debug) { |
| (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR, |
| "tcp_input_listener: listen backlog (max=%d) " |
| "overflow (%d pending) on %s", |
| listener->tcp_conn_req_max, |
| listener->tcp_conn_req_cnt_q, |
| tcp_display(listener, NULL, DISP_PORT_ONLY)); |
| } |
| goto error2; |
| } |
| |
| if (listener->tcp_conn_req_cnt_q0 >= |
| listener->tcp_conn_req_max + tcps->tcps_conn_req_max_q0) { |
| /* |
| * Q0 is full. Drop a pending half-open req from the queue |
| * to make room for the new SYN req. Also mark the time we |
| * drop a SYN. |
| * |
| * A more aggressive defense against SYN attack will |
| * be to set the "tcp_syn_defense" flag now. |
| */ |
| TCP_STAT(tcps, tcp_listendropq0); |
| listener->tcp_last_rcv_lbolt = ddi_get_lbolt64(); |
| if (!tcp_drop_q0(listener)) { |
| mutex_exit(&listener->tcp_eager_lock); |
| TCPS_BUMP_MIB(tcps, tcpListenDropQ0); |
| if (lconnp->conn_debug) { |
| (void) strlog(TCP_MOD_ID, 0, 3, SL_TRACE, |
| "tcp_input_listener: listen half-open " |
| "queue (max=%d) full (%d pending) on %s", |
| tcps->tcps_conn_req_max_q0, |
| listener->tcp_conn_req_cnt_q0, |
| tcp_display(listener, NULL, |
| DISP_PORT_ONLY)); |
| } |
| goto error2; |
| } |
| } |
| |
| /* |
| * Enforce the limit set on the number of connections per listener. |
 * Note that tlc_cnt starts at 1, so we need to add 1 to tlc_max
 * for the comparison.
| */ |
| if (listener->tcp_listen_cnt != NULL) { |
| tcp_listen_cnt_t *tlc = listener->tcp_listen_cnt; |
| int64_t now; |
| |
| if (atomic_inc_32_nv(&tlc->tlc_cnt) > tlc->tlc_max + 1) { |
| mutex_exit(&listener->tcp_eager_lock); |
| now = ddi_get_lbolt64(); |
| atomic_dec_32(&tlc->tlc_cnt); |
| TCP_STAT(tcps, tcp_listen_cnt_drop); |
| tlc->tlc_drop++; |
| if (now - tlc->tlc_report_time > |
| MSEC_TO_TICK(TCP_TLC_REPORT_INTERVAL)) { |
| zcmn_err(lconnp->conn_zoneid, CE_WARN, |
| "Listener (port %d) connection max (%u) " |
| "reached: %u attempts dropped total\n", |
| ntohs(listener->tcp_connp->conn_lport), |
| tlc->tlc_max, tlc->tlc_drop); |
| tlc->tlc_report_time = now; |
| } |
| goto error2; |
| } |
| tlc_set = B_TRUE; |
| } |
| |
| mutex_exit(&listener->tcp_eager_lock); |
| |
| /* |
 * IP sets ira_sqp to either the sender's conn_sqp (for loopback)
 * or based on the ring (for packets from GLD).  Otherwise it is
 * set based on lbolt, i.e., a somewhat random number.
| */ |
| ASSERT(ira->ira_sqp != NULL); |
| new_sqp = ira->ira_sqp; |
| |
| econnp = tcp_get_conn(arg2, tcps); |
| if (econnp == NULL) |
| goto error2; |
| |
| ASSERT(econnp->conn_netstack == lconnp->conn_netstack); |
| econnp->conn_sqp = new_sqp; |
| econnp->conn_initial_sqp = new_sqp; |
| econnp->conn_ixa->ixa_sqp = new_sqp; |
| |
| econnp->conn_fport = tcpha->tha_lport; |
| econnp->conn_lport = tcpha->tha_fport; |
| |
| err = conn_inherit_parent(lconnp, econnp); |
| if (err != 0) |
| goto error3; |
| |
| /* We already know the laddr of the new connection is ours */ |
| econnp->conn_ixa->ixa_src_generation = ipst->ips_src_generation; |
| |
| ASSERT(OK_32PTR(mp->b_rptr)); |
| ASSERT(IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION || |
| IPH_HDR_VERSION(mp->b_rptr) == IPV6_VERSION); |
| |
| if (lconnp->conn_family == AF_INET) { |
| ASSERT(IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION); |
| tpi_mp = tcp_conn_create_v4(lconnp, econnp, mp, ira); |
| } else { |
| tpi_mp = tcp_conn_create_v6(lconnp, econnp, mp, ira); |
| } |
| |
| if (tpi_mp == NULL) |
| goto error3; |
| |
| eager = econnp->conn_tcp; |
| eager->tcp_detached = B_TRUE; |
| SOCK_CONNID_INIT(eager->tcp_connid); |
| |
| /* |
| * Initialize the eager's tcp_t and inherit some parameters from |
| * the listener. |
| */ |
| tcp_init_values(eager, listener); |
| |
| ASSERT((econnp->conn_ixa->ixa_flags & |
| (IXAF_SET_ULP_CKSUM | IXAF_VERIFY_SOURCE | |
| IXAF_VERIFY_PMTU | IXAF_VERIFY_LSO)) == |
| (IXAF_SET_ULP_CKSUM | IXAF_VERIFY_SOURCE | |
| IXAF_VERIFY_PMTU | IXAF_VERIFY_LSO)); |
| |
| if (!tcps->tcps_dev_flow_ctl) |
| econnp->conn_ixa->ixa_flags |= IXAF_NO_DEV_FLOW_CTL; |
| |
| /* Prepare for diffing against previous packets */ |
| eager->tcp_recvifindex = 0; |
| eager->tcp_recvhops = 0xffffffffU; |
| |
| if (!(ira->ira_flags & IRAF_IS_IPV4) && econnp->conn_bound_if == 0) { |
| if (IN6_IS_ADDR_LINKSCOPE(&econnp->conn_faddr_v6) || |
| IN6_IS_ADDR_LINKSCOPE(&econnp->conn_laddr_v6)) { |
| econnp->conn_incoming_ifindex = ifindex; |
| econnp->conn_ixa->ixa_flags |= IXAF_SCOPEID_SET; |
| econnp->conn_ixa->ixa_scopeid = ifindex; |
| } |
| } |
| |
| if ((ira->ira_flags & (IRAF_IS_IPV4|IRAF_IPV4_OPTIONS)) == |
| (IRAF_IS_IPV4|IRAF_IPV4_OPTIONS) && |
| tcps->tcps_rev_src_routes) { |
| ipha_t *ipha = (ipha_t *)mp->b_rptr; |
| ip_pkt_t *ipp = &econnp->conn_xmit_ipp; |
| |
| /* Source routing option copyover (reverse it) */ |
| err = ip_find_hdr_v4(ipha, ipp, B_TRUE); |
| if (err != 0) { |
| freemsg(tpi_mp); |
| goto error3; |
| } |
| ip_pkt_source_route_reverse_v4(ipp); |
| } |
| |
| ASSERT(eager->tcp_conn.tcp_eager_conn_ind == NULL); |
| ASSERT(!eager->tcp_tconnind_started); |
| /* |
| * If the SYN came with a credential, it's a loopback packet or a |
| * labeled packet; attach the credential to the TPI message. |
| */ |
| if (ira->ira_cred != NULL) |
| mblk_setcred(tpi_mp, ira->ira_cred, ira->ira_cpid); |
| |
| eager->tcp_conn.tcp_eager_conn_ind = tpi_mp; |
| ASSERT(eager->tcp_ordrel_mp == NULL); |
| |
| /* Inherit the listener's non-STREAMS flag */ |
| if (IPCL_IS_NONSTR(lconnp)) { |
| econnp->conn_flags |= IPCL_NONSTR; |
| /* All non-STREAMS tcp_ts are sockets */ |
| eager->tcp_issocket = B_TRUE; |
| } else { |
| /* |
| * Pre-allocate the T_ordrel_ind mblk for TPI socket so that |
| * at close time, we will always have that to send up. |
| * Otherwise, we need to do special handling in case the |
| * allocation fails at that time. |
| */ |
| if ((eager->tcp_ordrel_mp = mi_tpi_ordrel_ind()) == NULL) |
| goto error3; |
| } |
| /* |
 * Now that the IP addresses and ports are set up in econnp, we
| * can do the IPsec policy work. |
| */ |
| if (ira->ira_flags & IRAF_IPSEC_SECURE) { |
| if (lconnp->conn_policy != NULL) { |
| /* |
| * Inherit the policy from the listener; use |
| * actions from ira |
| */ |
| if (!ip_ipsec_policy_inherit(econnp, lconnp, ira)) { |
| CONN_DEC_REF(econnp); |
| freemsg(mp); |
| goto error3; |
| } |
| } |
| } |
| |
| /* |
| * tcp_set_destination() may set tcp_rwnd according to the route |
| * metrics. If it does not, the eager's receive window will be set |
| * to the listener's receive window later in this function. |
| */ |
| eager->tcp_rwnd = 0; |
| |
| if (is_system_labeled()) { |
| ip_xmit_attr_t *ixa = econnp->conn_ixa; |
| |
| ASSERT(ira->ira_tsl != NULL); |
| /* Discard any old label */ |
| if (ixa->ixa_free_flags & IXA_FREE_TSL) { |
| ASSERT(ixa->ixa_tsl != NULL); |
| label_rele(ixa->ixa_tsl); |
| ixa->ixa_free_flags &= ~IXA_FREE_TSL; |
| ixa->ixa_tsl = NULL; |
| } |
| if ((lconnp->conn_mlp_type != mlptSingle || |
| lconnp->conn_mac_mode != CONN_MAC_DEFAULT) && |
| ira->ira_tsl != NULL) { |
| /* |
| * If this is an MLP connection or a MAC-Exempt |
| * connection with an unlabeled node, packets are to be |
| * exchanged using the security label of the received |
| * SYN packet instead of the server application's label. |
| * tsol_check_dest called from ip_set_destination |
| * might later update TSF_UNLABELED by replacing |
| * ixa_tsl with a new label. |
| */ |
| label_hold(ira->ira_tsl); |
| ip_xmit_attr_replace_tsl(ixa, ira->ira_tsl); |
| DTRACE_PROBE2(mlp_syn_accept, conn_t *, |
| econnp, ts_label_t *, ixa->ixa_tsl) |
| } else { |
| ixa->ixa_tsl = crgetlabel(econnp->conn_cred); |
| DTRACE_PROBE2(syn_accept, conn_t *, |
| econnp, ts_label_t *, ixa->ixa_tsl) |
| } |
| /* |
| * conn_connect() called from tcp_set_destination will verify |
| * the destination is allowed to receive packets at the |
| * security label of the SYN-ACK we are generating. As part of |
| * that, tsol_check_dest() may create a new effective label for |
| * this connection. |
| * Finally conn_connect() will call conn_update_label. |
| * All that remains for TCP to do is to call |
| * conn_build_hdr_template which is done as part of |
| * tcp_set_destination. |
| */ |
| } |
| |
| /* |
 * Since we will clear tcp_listener before we clear tcp_detached
 * in the accept code, we need tcp_hard_binding (aka tcp_accept_inprogress)
 * so that TCP_IS_DETACHED_NONEAGER can tell this eager apart from a truly
 * detached connection.
| */ |
| eager->tcp_hard_binding = B_TRUE; |
| |
| tcp_bind_hash_insert(&tcps->tcps_bind_fanout[ |
| TCP_BIND_HASH(econnp->conn_lport)], eager, 0); |
| |
| CL_INET_CONNECT(econnp, B_FALSE, err); |
| if (err != 0) { |
| tcp_bind_hash_remove(eager); |
| goto error3; |
| } |
| |
| SOCK_CONNID_BUMP(eager->tcp_connid); |
| |
| /* |
| * Adapt our mss, ttl, ... based on the remote address. |
| */ |
| |
| if (tcp_set_destination(eager) != 0) { |
| TCPS_BUMP_MIB(tcps, tcpAttemptFails); |
| /* Undo the bind_hash_insert */ |
| tcp_bind_hash_remove(eager); |
| goto error3; |
| } |
| |
| /* Process all TCP options. */ |
| tcp_process_options(eager, tcpha); |
| |
| /* Is the other end ECN capable? */ |
| if (tcps->tcps_ecn_permitted >= 1 && |
| (tcpha->tha_flags & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR)) { |
| eager->tcp_ecn_ok = B_TRUE; |
| } |
| |
| /* |
| * The listener's conn_rcvbuf should be the default window size or a |
| * window size changed via SO_RCVBUF option. First round up the |
| * eager's tcp_rwnd to the nearest MSS. Then find out the window |
| * scale option value if needed. Call tcp_rwnd_set() to finish the |
| * setting. |
| * |
 * Note that if there is an rpipe metric associated with the remote host,
 * we should not inherit the receive window size from the listener.
| */ |
| eager->tcp_rwnd = MSS_ROUNDUP( |
| (eager->tcp_rwnd == 0 ? econnp->conn_rcvbuf : |
| eager->tcp_rwnd), eager->tcp_mss); |
| if (eager->tcp_snd_ws_ok) |
| tcp_set_ws_value(eager); |
| /* |
| * Note that this is the only place tcp_rwnd_set() is called for |
| * accepting a connection. We need to call it here instead of |
| * after the 3-way handshake because we need to tell the other |
| * side our rwnd in the SYN-ACK segment. |
| */ |
| (void) tcp_rwnd_set(eager, eager->tcp_rwnd); |
| |
| ASSERT(eager->tcp_connp->conn_rcvbuf != 0 && |
| eager->tcp_connp->conn_rcvbuf == eager->tcp_rwnd); |
| |
| ASSERT(econnp->conn_rcvbuf != 0 && |
| econnp->conn_rcvbuf == eager->tcp_rwnd); |
| |
| /* Put a ref on the listener for the eager. */ |
| CONN_INC_REF(lconnp); |
| mutex_enter(&listener->tcp_eager_lock); |
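| /* Insert the eager at the head of the listener's pending (q0) list. */ |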
| listener->tcp_eager_next_q0->tcp_eager_prev_q0 = eager; |
| eager->tcp_eager_next_q0 = listener->tcp_eager_next_q0; |
| listener->tcp_eager_next_q0 = eager; |
| eager->tcp_eager_prev_q0 = listener; |
| |
| /* Set tcp_listener before adding it to tcp_conn_fanout */ |
| eager->tcp_listener = listener; |
| eager->tcp_saved_listener = listener; |
| |
| /* |
| * Set tcp_listen_cnt so that when the connection is done, the counter |
| * is decremented. |
| */ |
| eager->tcp_listen_cnt = listener->tcp_listen_cnt; |
| |
| /* |
| * Tag this detached tcp vector for later retrieval |
| * by our listener client in tcp_accept(). |
| */ |
| eager->tcp_conn_req_seqnum = listener->tcp_conn_req_seqnum; |
| listener->tcp_conn_req_cnt_q0++; |
| if (++listener->tcp_conn_req_seqnum == -1) { |
| /* |
| * -1 is "special" and defined in TPI as something |
| * that should never be used in T_CONN_IND |
| */ |
| ++listener->tcp_conn_req_seqnum; |
| } |
| mutex_exit(&listener->tcp_eager_lock); |
| |
| if (listener->tcp_syn_defense) { |
| /* Don't drop the SYN that comes from a good IP source */ |
| ipaddr_t *addr_cache; |
| |
| addr_cache = (ipaddr_t *)(listener->tcp_ip_addr_cache); |
| if (addr_cache != NULL && econnp->conn_faddr_v4 == |
| addr_cache[IP_ADDR_CACHE_HASH(econnp->conn_faddr_v4)]) { |
| eager->tcp_dontdrop = B_TRUE; |
| } |
| } |
| |
| /* |
| * We need to insert the eager in its own perimeter but as soon |
| * as we do that, we expose the eager to the classifier and |
| * should not touch any field outside the eager's perimeter. |
| * So do all the work necessary before inserting the eager |
| * in its own perimeter. Be optimistic that conn_connect() |
| * will succeed but undo everything if it fails. |
| */ |
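| /* Initialize the receive sequence state from the ISS in the peer's SYN. */ |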
| seg_seq = ntohl(tcpha->tha_seq); |
| eager->tcp_irs = seg_seq; |
| eager->tcp_rack = seg_seq; |
| eager->tcp_rnxt = seg_seq + 1; |
| eager->tcp_tcpha->tha_ack = htonl(eager->tcp_rnxt); |
| TCPS_BUMP_MIB(tcps, tcpPassiveOpens); |
| eager->tcp_state = TCPS_SYN_RCVD; |
| DTRACE_TCP6(state__change, void, NULL, ip_xmit_attr_t *, |
| econnp->conn_ixa, void, NULL, tcp_t *, eager, void, NULL, |
| int32_t, TCPS_LISTEN); |
| |
| mp1 = tcp_xmit_mp(eager, eager->tcp_xmit_head, eager->tcp_mss, |
| NULL, NULL, eager->tcp_iss, B_FALSE, NULL, B_FALSE); |
| if (mp1 == NULL) { |
| /* |
| * Increment the ref count as we are going to |
| * enqueue an mp in the squeue. |
| */ |
| CONN_INC_REF(econnp); |
| goto error; |
| } |
| |
| /* |
| * We need to start the rto timer. In the normal case, we start |
| * the timer after sending the packet on the wire (or at |
| * least believing that the packet was sent by waiting for |
| * conn_ip_output() to return). Since this is the first packet |
| * being sent on the wire for the eager, our initial tcp_rto |
| * is at least tcp_rexmit_interval_min, which is a fairly |
| * large value to allow the algorithm to adjust slowly to large |
| * fluctuations of RTT during the first few transmissions. |
| * |
| * Starting the timer first and then sending the packet in this |
| * case shouldn't make much difference since tcp_rexmit_interval_min |
| * is on the order of several hundred milliseconds; starting the timer |
| * first and then sending the packet will result in a difference |
| * of only a few microseconds. |
| * |
| * Without this optimization, we are forced to hold the fanout |
| * lock across the ipcl_bind_insert() and sending the packet |
| * so that we don't race against an incoming packet (maybe RST) |
| * for this eager. |
| * |
| * It is necessary to acquire an extra reference on the eager |
| * at this point and hold it until after tcp_send_data() to |
| * ensure against an eager close race. |
| */ |
| |
| CONN_INC_REF(econnp); |
| |
| TCP_TIMER_RESTART(eager, eager->tcp_rto); |
| |
| /* |
| * Insert the eager in its own perimeter now. We are ready to deal |
| * with any packets on eager. |
| */ |
| if (ipcl_conn_insert(econnp) != 0) |
| goto error; |
| |
| ASSERT(econnp->conn_ixa->ixa_notify_cookie == econnp->conn_tcp); |
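| /* The original SYN mblk is no longer needed; the SYN-ACK to send is in mp1. */ |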
| freemsg(mp); |
| /* |
| * Send the SYN-ACK. Use the right squeue so that conn_ixa is |
| * only used by one thread at a time. |
| */ |
| if (econnp->conn_sqp == lconnp->conn_sqp) { |
| DTRACE_TCP5(send, mblk_t *, NULL, ip_xmit_attr_t *, |
| econnp->conn_ixa, __dtrace_tcp_void_ip_t *, mp1->b_rptr, |
| tcp_t *, eager, __dtrace_tcp_tcph_t *, |
| &mp1->b_rptr[econnp->conn_ixa->ixa_ip_hdr_length]); |
| (void) conn_ip_output(mp1, econnp->conn_ixa); |
| CONN_DEC_REF(econnp); |
| } else { |
| SQUEUE_ENTER_ONE(econnp->conn_sqp, mp1, tcp_send_synack, |
| econnp, NULL, SQ_PROCESS, SQTAG_TCP_SEND_SYNACK); |
| } |
| return; |
| error: |
| freemsg(mp1); |
| eager->tcp_closemp_used = B_TRUE; |
| TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15); |
| mp1 = &eager->tcp_closemp; |
| SQUEUE_ENTER_ONE(econnp->conn_sqp, mp1, tcp_eager_kill, |
| econnp, NULL, SQ_FILL, SQTAG_TCP_CONN_REQ_2); |
| |
| /* |
| * If a connection already exists, send the mp to that connection so |
| * that it can be appropriately dealt with. |
| */ |
| ipst = tcps->tcps_netstack->netstack_ip; |
| |
| if ((econnp = ipcl_classify(mp, ira, ipst)) != NULL) { |
| if (!IPCL_IS_CONNECTED(econnp)) { |
| /* |
| * Something bad happened. ipcl_conn_insert() |
| * failed because a connection already existed |
| * in connected hash but we can't find it |
| * anymore (someone blew it away). Just |
| * free this message and hopefully the remote |
| * will retransmit, at which time the SYN can be |
| * treated as a new connection or dealt with by |
| * sending a TH_RST if a connection already exists. |
| */ |
| CONN_DEC_REF(econnp); |
| freemsg(mp); |
| } else { |
| SQUEUE_ENTER_ONE(econnp->conn_sqp, mp, tcp_input_data, |
| econnp, ira, SQ_FILL, SQTAG_TCP_CONN_REQ_1); |
| } |
| } else { |
| /* Nobody wants this packet */ |
| freemsg(mp); |
| } |
| return; |
| error3: |
| CONN_DEC_REF(econnp); |
| error2: |
| freemsg(mp); |
| if (tlc_set) |
| atomic_dec_32(&listener->tcp_listen_cnt->tlc_cnt); |
| } |
| |
| /* |
| * In the ideal case of vertical partitioning on a NUMA architecture, it |
| * is beneficial to have the listener and all the incoming connections |
| * tied to the same squeue. The other constraint is that incoming |
| * connections should be tied to the squeue attached to the interrupted |
| * CPU for obvious locality reasons, so this leaves the listener to |
| * be tied to the same squeue. Our only problem is that when the listener |
| * is binding, the CPU that will get interrupted by the NIC whose |
| * IP address the listener is binding to is not even known. So |
| * the code below allows us to change that binding at the time the |
| * CPU is interrupted, by virtue of the incoming connection's squeue. |
| * |
| * This is useful only in the case of a listener bound to a specific IP |
| * address. Other kinds of listeners are bound the very first time and |
| * there is no attempt to rebind them. |
| */ |
| void |
| tcp_input_listener_unbound(void *arg, mblk_t *mp, void *arg2, |
| ip_recv_attr_t *ira) |
| { |
| conn_t *connp = (conn_t *)arg; |
| squeue_t *sqp = (squeue_t *)arg2; |
| squeue_t *new_sqp; |
| uint32_t conn_flags; |
| |
| /* |
| * IP sets ira_sqp to either the sender's conn_sqp (for loopback) |
| * or based on the ring (for packets from GLD). Otherwise it is |
| * set based on lbolt i.e., a somewhat random number. |
| */ |
| ASSERT(ira->ira_sqp != NULL); |
| new_sqp = ira->ira_sqp; |
| |
| if (connp->conn_fanout == NULL) |
| goto done; |
| |
| if (!(connp->conn_flags & IPCL_FULLY_BOUND)) { |
| mutex_enter(&connp->conn_fanout->connf_lock); |
| mutex_enter(&connp->conn_lock); |
| /* |
| * No one from read or write side can access us now |
| * except for already queued packets on this squeue. |
| * But since we haven't changed the squeue yet, they |
| * can't execute. If they are processed after we have |
| * changed the squeue, they are sent back to the |
| * correct squeue down below. |
| * But a listener close can race with processing of |
| * incoming SYN. If incoming SYN processing changes |
| * the squeue then the listener close which is waiting |
| * to enter the squeue would operate on the wrong |
| * squeue. Hence we don't change the squeue here unless |
| * the refcount is exactly the minimum refcount. The |
| * minimum refcount of 4 is made up of 1 each for |
| * TCP and IP, 1 for being in the classifier hash, and |
| * 1 for the mblk being processed. |
| */ |
| |
| if (connp->conn_ref != 4 || |
| connp->conn_tcp->tcp_state != TCPS_LISTEN) { |
| mutex_exit(&connp->conn_lock); |
| mutex_exit(&connp->conn_fanout->connf_lock); |
| goto done; |
| } |
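| /* Move the connection to the squeue of the CPU that took the interrupt. */ |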
| if (connp->conn_sqp != new_sqp) { |
| while (connp->conn_sqp != new_sqp) |
| (void) atomic_cas_ptr(&connp->conn_sqp, sqp, |
| new_sqp); |
| /* No special MT issues for outbound ixa_sqp hint */ |
| connp->conn_ixa->ixa_sqp = new_sqp; |
| } |
| |
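| /* Atomically set IPCL_FULLY_BOUND, retrying the CAS until the flag is seen. */ |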
| do { |
| conn_flags = connp->conn_flags; |
| conn_flags |= IPCL_FULLY_BOUND; |
| (void) atomic_cas_32(&connp->conn_flags, |
| connp->conn_flags, conn_flags); |
| } while (!(connp->conn_flags & IPCL_FULLY_BOUND)); |
| |
| mutex_exit(&connp->conn_fanout->connf_lock); |
| mutex_exit(&connp->conn_lock); |
| |
| /* |
| * Assume we have picked a good squeue for the listener. Make |
| * subsequent SYNs not try to change the squeue. |
| */ |
| connp->conn_recv = tcp_input_listener; |
| } |
| |
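| /* |
| * If the connection's squeue differs from the one we are running on, |
| * re-dispatch the packet to the connection's squeue; otherwise handle |
| * it inline via tcp_input_listener(). |
| */ |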
| done: |
| if (connp->conn_sqp != sqp) { |
| CONN_INC_REF(connp); |
| SQUEUE_ENTER_ONE(connp->conn_sqp, mp, connp->conn_recv, connp, |
| ira, SQ_FILL, SQTAG_TCP_CONN_REQ_UNBOUND); |
| } else { |
| tcp_input_listener(connp, mp, sqp, ira); |
| } |
| } |
| |
| /* |
| * Send up all messages queued on tcp_rcv_list. |
| */ |
| uint_t |
| tcp_rcv_drain(tcp_t *tcp) |
| { |
| mblk_t *mp; |
| uint_t ret = 0; |
| #ifdef DEBUG |
| uint_t cnt = 0; |
| #endif |
| queue_t *q = tcp->tcp_connp->conn_rq; |
| |
| /* Can't drain on an eager connection */ |
| if (tcp->tcp_listener != NULL) |
| return (ret); |
| |
| /* Can't be a non-STREAMS connection */ |
| ASSERT(!IPCL_IS_NONSTR(tcp->tcp_connp)); |
| |
| /* No need for the push timer now. */ |
| if (tcp->tcp_push_tid != 0) { |
| (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_push_tid); |
| tcp->tcp_push_tid = 0; |
| } |
| |
| /* |
| * Handle two cases here: we are currently fused or we were |
| * previously fused and have some urgent data to be delivered |
| * upstream. The latter happens because we either ran out of |
| * memory or were detached and therefore sending the SIGURG was |
| * deferred until this point. In either case we pass control |
| * over to tcp_fuse_rcv_drain() since it may need to complete |
| * some work. |
| */ |
| if (tcp->tcp_fused || tcp->tcp_fused_sigurg) { |
| if (tcp_fuse_rcv_drain(q, tcp, tcp->tcp_fused ? NULL : |
| &tcp->tcp_fused_sigurg_mp)) |
| return (ret); |
| } |
| |
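| /* Send each queued b_next chain upstream. */ |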
| while ((mp = tcp->tcp_rcv_list) != NULL) { |
| tcp->tcp_rcv_list = mp->b_next; |
| mp->b_next = NULL; |
| #ifdef DEBUG |
| cnt += msgdsize(mp); |
| #endif |
| putnext(q, mp); |
| } |
| #ifdef DEBUG |
| ASSERT(cnt == tcp->tcp_rcv_cnt); |
| #endif |
| tcp->tcp_rcv_last_head = NULL; |
| tcp->tcp_rcv_last_tail = NULL; |
| tcp->tcp_rcv_cnt = 0; |
| |
| if (canputnext(q)) |
| return (tcp_rwnd_reopen(tcp)); |
| |
| return (ret); |
| } |
| |
| /* |
| * Queue data on tcp_rcv_list which is a b_next chain. |
| * tcp_rcv_last_head/tail is the last element of this chain. |
| * Each element of the chain is a b_cont chain. |
| * |
| * M_DATA messages are added to the current element. |
| * Other messages are added as new (b_next) elements. |
| */ |
| void |
| tcp_rcv_enqueue(tcp_t *tcp, mblk_t *mp, uint_t seg_len, cred_t *cr) |
| { |
| ASSERT(seg_len == msgdsize(mp)); |
| ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_rcv_last_head != NULL); |
| |
| if (is_system_labeled()) { |
| ASSERT(cr != NULL || msg_getcred(mp, NULL) != NULL); |
| /* |
| * Provide for protocols above TCP such as RPC. NOPID leaves |
| * db_cpid unchanged. |
| * The cred could have already been set. |
| */ |
| if (cr != NULL) |
| mblk_setcred(mp, cr, NOPID); |
| } |
| |
| if (tcp->tcp_rcv_list == NULL) { |
| ASSERT(tcp->tcp_rcv_last_head == NULL); |
| tcp->tcp_rcv_list = mp; |
| tcp->tcp_rcv_last_head = mp; |
| } else if (DB_TYPE(mp) == DB_TYPE(tcp->tcp_rcv_last_head)) { |
| tcp->tcp_rcv_last_tail->b_cont = mp; |
| } else { |
| tcp->tcp_rcv_last_head->b_next = mp; |
| tcp->tcp_rcv_last_head = mp; |
| } |
| |
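| /* Walk to the last mblk of this chain so later M_DATA can be appended to it. */ |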
| while (mp->b_cont) |
| mp = mp->b_cont; |
| |
| tcp->tcp_rcv_last_tail = mp; |
| tcp->tcp_rcv_cnt += seg_len; |
| tcp->tcp_rwnd -= seg_len; |
| } |
| |
| /* Generate an ACK-only (no data) segment for a TCP endpoint */ |
| mblk_t * |
| tcp_ack_mp(tcp_t *tcp) |
| { |
| uint32_t seq_no; |
| tcp_stack_t *tcps = tcp->tcp_tcps; |
| conn_t *connp = tcp->tcp_connp; |
| |
| /* |
| * There are a few cases to consider when setting the sequence number. |
| * Essentially, we can come here while processing an unacceptable pkt |
| * in the TCPS_SYN_RCVD state, in which case we set the sequence number |
| * to snxt (per RFC 793); note that swnd wouldn't have been set yet. |
| * If we are here for a zero window probe, stick with suna. In all |
| * other cases, we check if suna + swnd encompasses snxt and set |
| * the sequence number to snxt, if so. If snxt falls outside the |
| * window (the receiver probably shrank its window), we go with |
| * suna + swnd; otherwise the sequence number would be unacceptable to |
| * the receiver. |
| */ |
| if (tcp->tcp_zero_win_probe) { |
| seq_no = tcp->tcp_suna; |
| } else if (tcp->tcp_state == TCPS_SYN_RCVD) { |
| ASSERT(tcp->tcp_swnd == 0); |
| seq_no = tcp->tcp_snxt; |
| } else { |
| seq_no = SEQ_GT(tcp->tcp_snxt, |
| (tcp->tcp_suna + tcp->tcp_swnd)) ? |
| (tcp->tcp_suna + tcp->tcp_swnd) : tcp->tcp_snxt; |
| } |
| |
| if (tcp->tcp_valid_bits) { |
| /* |
| * For the complex case where we have to send some |
| * controls (FIN or SYN), let tcp_xmit_mp do it. |
| */ |
| return (tcp_xmit_mp(tcp, NULL, 0, NULL, NULL, seq_no, B_FALSE, |
| NULL, B_FALSE)); |
| } else { |
| /* Generate a simple ACK */ |
| int data_length; |
| uchar_t *rptr; |
| tcpha_t *tcpha; |
| mblk_t *mp1; |
| int32_t total_hdr_len; |
| int32_t tcp_hdr_len; |
| int32_t num_sack_blk = 0; |
| int32_t sack_opt_len; |
| ip_xmit_attr_t *ixa = connp->conn_ixa; |
| |
| /* |
| * Allocate space for TCP + IP headers |
| * and link-level header |
| */ |
| if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) { |
| num_sack_blk = MIN(tcp->tcp_max_sack_blk, |
| tcp->tcp_num_sack_blk); |
| sack_opt_len = num_sack_blk * sizeof (sack_blk_t) + |
| TCPOPT_NOP_LEN * 2 + TCPOPT_HEADER_LEN; |
| total_hdr_len = connp->conn_ht_iphc_len + sack_opt_len; |
| tcp_hdr_len = connp->conn_ht_ulp_len + sack_opt_len; |
| } else { |
| total_hdr_len = connp->conn_ht_iphc_len; |
| tcp_hdr_len = connp->conn_ht_ulp_len; |
| } |
| mp1 = allocb(total_hdr_len + tcps->tcps_wroff_xtra, BPRI_MED); |
| if (!mp1) |
| return (NULL); |
| |
| /* Update the latest receive window size in TCP header. */ |
| tcp->tcp_tcpha->tha_win = |
| htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws); |
| /* copy in prototype TCP + IP header */ |
| rptr = mp1->b_rptr + tcps->tcps_wroff_xtra; |
| mp1->b_rptr = rptr; |
| mp1->b_wptr = rptr + total_hdr_len; |
| bcopy(connp->conn_ht_iphc, rptr, connp->conn_ht_iphc_len); |
| |
| tcpha = (tcpha_t *)&rptr[ixa->ixa_ip_hdr_length]; |
| |
| /* Set the TCP sequence number. */ |
| tcpha->tha_seq = htonl(seq_no); |
| |
| /* Set up the TCP flag field. */ |
| tcpha->tha_flags = (uchar_t)TH_ACK; |
| if (tcp->tcp_ecn_echo_on) |
| tcpha->tha_flags |= TH_ECE; |
| |
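| /* This ACK acknowledges all data received so far; reset the delayed-ACK state. */ |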
| tcp->tcp_rack = tcp->tcp_rnxt; |
| tcp->tcp_rack_cnt = 0; |
| |
| /* fill in timestamp option if in use */ |
| if (tcp->tcp_snd_ts_ok) { |
| uint32_t llbolt = (uint32_t)LBOLT_FASTPATH; |
| |
| U32_TO_BE32(llbolt, |
| (char *)tcpha + TCP_MIN_HEADER_LENGTH+4); |
| U32_TO_BE32(tcp->tcp_ts_recent, |
| (char *)tcpha + TCP_MIN_HEADER_LENGTH+8); |
| } |
| |
| /* Fill in SACK options */ |
| if (num_sack_blk > 0) { |
| uchar_t *wptr = (uchar_t *)tcpha + |
| connp->conn_ht_ulp_len; |
| sack_blk_t *tmp; |
| int32_t i; |
| |
| wptr[0] = TCPOPT_NOP; |
| wptr[1] = TCPOPT_NOP; |
| wptr[2] = TCPOPT_SACK; |
| wptr[3] = TCPOPT_HEADER_LEN + num_sack_blk * |
| sizeof (sack_blk_t); |
| wptr += TCPOPT_REAL_SACK_LEN; |
| |
| tmp = tcp->tcp_sack_list; |
| for (i = 0; i < num_sack_blk; i++) { |
| U32_TO_BE32(tmp[i].begin, wptr); |
| wptr += sizeof (tcp_seq); |
| U32_TO_BE32(tmp[i].end, wptr); |
| wptr += sizeof (tcp_seq); |
| } |
| tcpha->tha_offset_and_reserved += |
| ((num_sack_blk * 2 + 1) << 4); |
| } |
| |
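| /* Record the total packet length in the xmit attributes and the IP header. */ |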
| ixa->ixa_pktlen = total_hdr_len; |
| |
| if (ixa->ixa_flags & IXAF_IS_IPV4) { |
| ((ipha_t *)rptr)->ipha_length = htons(total_hdr_len); |
| } else { |
| ip6_t *ip6 = (ip6_t *)rptr; |
| |
| ip6->ip6_plen = htons(total_hdr_len - IPV6_HDR_LEN); |
| } |
| |
| /* |
| * Prime pump for checksum calculation in IP. Include the |
| * adjustment for a source route if any. |
| */ |
| data_length = tcp_hdr_len + connp->conn_sum; |
| data_length = (data_length >> 16) + (data_length & 0xFFFF); |
| tcpha->tha_sum = htons(data_length); |
| |
| if (tcp->tcp_ip_forward_progress) { |
| tcp->tcp_ip_forward_progress = B_FALSE; |
| connp->conn_ixa->ixa_flags |= IXAF_REACH_CONF; |
| } else { |
| connp->conn_ixa->ixa_flags &= ~IXAF_REACH_CONF; |
| } |
| return (mp1); |
| } |
| } |
| |
| /* |
| * Dummy socket upcalls for if/when the conn_t gets detached from a |
| * direct-callback sonode via a user-driven close(). Easy to catch with |
| * DTrace FBT, and should be mostly harmless. |
| */ |
| |
| /* ARGSUSED */ |
| static sock_upper_handle_t |
| tcp_dummy_newconn(sock_upper_handle_t x, sock_lower_handle_t y, |
| sock_downcalls_t *z, cred_t *cr, pid_t pid, sock_upcalls_t **ignored) |
| { |
| ASSERT(0); /* Panic in debug, otherwise ignore. */ |
| return (NULL); |
| } |
| |
| /* ARGSUSED */ |
| static void |
| tcp_dummy_connected(sock_upper_handle_t x, sock_connid_t y, cred_t *cr, |
| pid_t pid) |
| { |
| ASSERT(x == NULL); |
| /* Normally we'd crhold(cr) and attach it to socket state. */ |
| /* LINTED */ |
| } |
| |
| /* ARGSUSED */ |
| static int |
| tcp_dummy_disconnected(sock_upper_handle_t x, sock_connid_t y, int blah) |
| { |
| ASSERT(0); /* Panic in debug, otherwise ignore. */ |
| return (-1); |
| } |
| |
| /* ARGSUSED */ |
| static void |
| tcp_dummy_opctl(sock_upper_handle_t x, sock_opctl_action_t y, uintptr_t blah) |
| { |
| ASSERT(x == NULL); |
| /* We really want this one to be a harmless NOP for now. */ |
| /* LINTED */ |
| } |
| |
| /* ARGSUSED */ |
| static ssize_t |
| tcp_dummy_recv(sock_upper_handle_t x, mblk_t *mp, size_t len, int flags, |
| int *error, boolean_t *push) |
| { |
| ASSERT(x == NULL); |
| |
| /* |
| * Consume the message, set ESHUTDOWN, and return an error. |
| * Nobody's home! |
| */ |
| freemsg(mp); |
| *error = ESHUTDOWN; |
| return (-1); |
| } |
| |
| /* ARGSUSED */ |
| static void |
| tcp_dummy_set_proto_props(sock_upper_handle_t x, struct sock_proto_props *y) |
| { |
| ASSERT(0); /* Panic in debug, otherwise ignore. */ |
| } |
| |
| /* ARGSUSED */ |
| static void |
| tcp_dummy_txq_full(sock_upper_handle_t x, boolean_t y) |
| { |
| ASSERT(0); /* Panic in debug, otherwise ignore. */ |
| } |
| |
| /* ARGSUSED */ |
| static void |
| tcp_dummy_signal_oob(sock_upper_handle_t x, ssize_t len) |
| { |
| ASSERT(x == NULL); |
| /* Otherwise, this would signal socket state about OOB data. */ |
| } |
| |
| /* ARGSUSED */ |
| static void |
| tcp_dummy_set_error(sock_upper_handle_t x, int err) |
| { |
| ASSERT(0); /* Panic in debug, otherwise ignore. */ |
| } |
| |
| /* ARGSUSED */ |
| static void |
| tcp_dummy_onearg(sock_upper_handle_t x) |
| { |
| ASSERT(0); /* Panic in debug, otherwise ignore. */ |
| } |
| |
| static sock_upcalls_t tcp_dummy_upcalls = { |
| tcp_dummy_newconn, |
| tcp_dummy_connected, |
| tcp_dummy_disconnected, |
| tcp_dummy_opctl, |
| tcp_dummy_recv, |
| tcp_dummy_set_proto_props, |
| tcp_dummy_txq_full, |
| tcp_dummy_signal_oob, |
| tcp_dummy_onearg, |
| tcp_dummy_set_error, |
| tcp_dummy_onearg |
| }; |
| |
| /* |
| * Handle M_DATA messages from IP. It is called directly from IP via |
| * squeue for received IP packets. |
| * |
| * The first argument is always the connp/tcp to which the mp belongs. |
| * There are no exceptions to this rule. The caller has already put |
| * a reference on this connp/tcp and once tcp_input_data() returns, |
| * the squeue will do the refrele. |
| * |
| * TH_SYN segments for the listener go directly to tcp_input_listener via |
| * the squeue. ICMP errors go directly to tcp_icmp_input(). |
| * |
| * sqp: NULL = recursive, sqp != NULL means called from squeue |
| */ |
| void |
| tcp_input_data(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) |
| { |
| int32_t bytes_acked; |
| int32_t gap; |
| mblk_t *mp1; |
| uint_t flags; |
| uint32_t new_swnd = 0; |
| uchar_t *iphdr; |
| uchar_t *rptr; |
| int32_t rgap; |
| uint32_t seg_ack; |
| int seg_len; |
| uint_t ip_hdr_len; |
| uint32_t seg_seq; |
| tcpha_t *tcpha; |
| int urp; |
| tcp_opt_t tcpopt; |
| ip_pkt_t ipp; |
| boolean_t ofo_seg = B_FALSE; /* Out of order segment */ |
| uint32_t cwnd; |
| int mss; |
| conn_t *connp = (conn_t *)arg; |
| squeue_t *sqp = (squeue_t *)arg2; |
| tcp_t *tcp = connp->conn_tcp; |
| tcp_stack_t *tcps = tcp->tcp_tcps; |
| sock_upcalls_t *sockupcalls; |
| |
| /* |
| * RST from fused tcp loopback peer should trigger an unfuse. |
| */ |
| if (tcp->tcp_fused) { |
| TCP_STAT(tcps, tcp_fusion_aborted); |
| tcp_unfuse(tcp); |
| } |
| |
| mss = 0; |
| iphdr = mp->b_rptr; |
| rptr = mp->b_rptr; |
| ASSERT(OK_32PTR(rptr)); |
| |
| ip_hdr_len = ira->ira_ip_hdr_length; |
| if (connp->conn_recv_ancillary.crb_all != 0) { |
| /* |
| * Record packet information in the ip_pkt_t |
| */ |
| ipp.ipp_fields = 0; |
| if (ira->ira_flags & IRAF_IS_IPV4) { |
| (void) ip_find_hdr_v4((ipha_t *)rptr, &ipp, |
| B_FALSE); |
| } else { |
| uint8_t nexthdrp; |
| |
| /* |
| * IPv6 packets can only be received by applications |
| * that are prepared to receive IPv6 addresses. |
| * The IP fanout must ensure this. |
| */ |
| ASSERT(connp->conn_family == AF_INET6); |
| |
| (void) ip_find_hdr_v6(mp, (ip6_t *)rptr, B_TRUE, &ipp, |
| &nexthdrp); |
| ASSERT(nexthdrp == IPPROTO_TCP); |
| |
| /* Could have caused a pullup? */ |
| iphdr = mp->b_rptr; |
| rptr = mp->b_rptr; |
| } |
| } |
| ASSERT(DB_TYPE(mp) == M_DATA); |
| ASSERT(mp->b_next == NULL); |
| |
| tcpha = (tcpha_t *)&rptr[ip_hdr_len]; |
| seg_seq = ntohl(tcpha->tha_seq); |
| seg_ack = ntohl(tcpha->tha_ack); |
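| /* Compute the TCP payload length, including any additional M_DATA mblks. */ |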
| ASSERT((uintptr_t)(mp->b_wptr - rptr) <= (uintptr_t)INT_MAX); |
| seg_len = (int)(mp->b_wptr - rptr) - |
| (ip_hdr_len + TCP_HDR_LENGTH(tcpha)); |
| if ((mp1 = mp->b_cont) != NULL && mp1->b_datap->db_type == M_DATA) { |
| do { |
| ASSERT((uintptr_t)(mp1->b_wptr - mp1->b_rptr) <= |
| (uintptr_t)INT_MAX); |
| seg_len += (int)(mp1->b_wptr - mp1->b_rptr); |
| } while ((mp1 = mp1->b_cont) != NULL && |
| mp1->b_datap->db_type == M_DATA); |
| } |
| |
| DTRACE_TCP5(receive, mblk_t *, NULL, ip_xmit_attr_t *, connp->conn_ixa, |
| __dtrace_tcp_void_ip_t *, iphdr, tcp_t *, tcp, |
| __dtrace_tcp_tcph_t *, tcpha); |
| |
| if (tcp->tcp_state == TCPS_TIME_WAIT) { |
| tcp_time_wait_processing(tcp, mp, seg_seq, seg_ack, |
| seg_len, tcpha, ira); |
| return; |
| } |
| |
| if (sqp != NULL) { |
| /* |
| * This is the correct place to update tcp_last_recv_time. Note |
| * that it is also updated for tcp structures that belong to the |
| * global and listener queues, which do not really need updating. |
| * But that should not cause any harm. And it is updated for |
| * all kinds of incoming segments, not only for data segments. |
| */ |
| tcp->tcp_last_recv_time = LBOLT_FASTPATH; |
| } |
| |
| flags = (unsigned int)tcpha->tha_flags & 0xFF; |
| |
| TCPS_BUMP_MIB(tcps, tcpHCInSegs); |
| DTRACE_PROBE2(tcp__trace__recv, mblk_t *, mp, tcp_t *, tcp); |
| |
| if ((flags & TH_URG) && sqp != NULL) { |
| /* |
| * TCP can't handle urgent pointers that arrive before |
| * the connection has been accept()ed since it can't |
| * buffer OOB data. Discard segment if this happens. |
| * |
| * We can't just rely on a non-null tcp_listener to indicate |
| * that the accept() has completed since unlinking of the |
| * eager and completion of the accept are not atomic. |
| * tcp_detached, when it is not set (B_FALSE), indicates |
| * that the accept() has completed. |
| * |
| * Nor can it reassemble urgent pointers, so discard |
| * if it's not the next segment expected. |
| * |
| * Otherwise, collapse chain into one mblk (discard if |
| * that fails). This makes sure the headers, retransmitted |
| * data, and new data all are in the same mblk. |
| */ |
| ASSERT(mp != NULL); |
| if (tcp->tcp_detached || !pullupmsg(mp, -1)) { |
| freemsg(mp); |
| return; |
| } |
| /* Update pointers into message */ |
| iphdr = rptr = mp->b_rptr; |
| tcpha = (tcpha_t *)&rptr[ip_hdr_len]; |
| if (SEQ_GT(seg_seq, tcp->tcp_rnxt)) { |
| /* |
| * Since we can't handle any data with this urgent |
| * pointer that is out of sequence, we expunge |
| * the data. This allows us to still register |
| * the urgent mark and generate the M_PCSIG, |
| * which we can do. |
| */ |
| mp->b_wptr = (uchar_t *)tcpha + TCP_HDR_LENGTH(tcpha); |
| seg_len = 0; |
| } |
| } |
| |
| sockupcalls = connp->conn_upcalls; |
| /* A conn_t may have belonged to a now-closed socket. Be careful. */ |
| if (sockupcalls == NULL) |
| sockupcalls = &tcp_dummy_upcalls; |
| |
| switch (tcp->tcp_state) { |
| case TCPS_SYN_SENT: |
| if (connp->conn_final_sqp == NULL && |
| tcp_outbound_squeue_switch && sqp != NULL) { |
| ASSERT(connp->conn_initial_sqp == connp->conn_sqp); |
| connp->conn_final_sqp = sqp; |
| if (connp->conn_final_sqp != connp->conn_sqp) { |
| DTRACE_PROBE1(conn__final__sqp__switch, |
| conn_t *, connp); |
| CONN_INC_REF(connp); |
| SQUEUE_SWITCH(connp, connp->conn_final_sqp); |
| SQUEUE_ENTER_ONE(connp->conn_sqp, mp, |
| tcp_input_data, connp, ira, ip_squeue_flag, |
| SQTAG_CONNECT_FINISH); |
| return; |
| } |
| DTRACE_PROBE1(conn__final__sqp__same, conn_t *, connp); |
| } |
| if (flags & TH_ACK) { |
| /* |
| * Note that our stack cannot send data before a |
| * connection is established, therefore the |
| * following check is valid. Otherwise, it has |
| * to be changed. |
| */ |
| if (SEQ_LEQ(seg_ack, tcp->tcp_iss) || |
| SEQ_GT(seg_ack, tcp->tcp_snxt)) { |
| freemsg(mp); |
| if (flags & TH_RST) |
| return; |
| tcp_xmit_ctl("TCPS_SYN_SENT-Bad_seq", |
| tcp, seg_ack, 0, TH_RST); |
| return; |
| } |
| ASSERT(tcp->tcp_suna + 1 == seg_ack); |
| } |
| if (flags & TH_RST) { |
| if (flags & TH_ACK) { |
| DTRACE_TCP5(connect__refused, mblk_t *, NULL, |
| ip_xmit_attr_t *, connp->conn_ixa, |
| void_ip_t *, iphdr, tcp_t *, tcp, |
| tcph_t *, tcpha); |
| (void) tcp_clean_death(tcp, ECONNREFUSED); |
| } |
| freemsg(mp); |
| return; |
| } |
| if (!(flags & TH_SYN)) { |
| freemsg(mp); |
| return; |
| } |
| |
| /* Process all TCP options. */ |
| tcp_process_options(tcp, tcpha); |
| /* |
| * The following changes our rwnd to be a multiple of the |
| * MIN(peer MSS, our MSS) for performance reason. |
| */ |
| (void) tcp_rwnd_set(tcp, MSS_ROUNDUP(connp->conn_rcvbuf, |
| tcp->tcp_mss)); |
| |
| /* Is the other end ECN capable? */ |
| if (tcp->tcp_ecn_ok) { |
| if ((flags & (TH_ECE|TH_CWR)) != TH_ECE) { |
| tcp->tcp_ecn_ok = B_FALSE; |
| } |
| } |
| /* |
| * Clear the ECN flags because they may interfere with later |
| * processing. |
| */ |
| flags &= ~(TH_ECE|TH_CWR); |
| |
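| /* Record the peer's ISS and initialize our receive sequence state. */ |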
| tcp->tcp_irs = seg_seq; |
| tcp->tcp_rack = seg_seq; |
| tcp->tcp_rnxt = seg_seq + 1; |
| tcp->tcp_tcpha->tha_ack = htonl(tcp->tcp_rnxt); |
| if (!TCP_IS_DETACHED(tcp)) { |
| /* Allocate room for SACK options if needed. */ |
| connp->conn_wroff = connp->conn_ht_iphc_len; |
| if (tcp->tcp_snd_sack_ok) |
| connp->conn_wroff += TCPOPT_MAX_SACK_LEN; |
| if (!tcp->tcp_loopback) |
| connp->conn_wroff += tcps->tcps_wroff_xtra; |
| |
| (void) proto_set_tx_wroff(connp->conn_rq, connp, |
| connp->conn_wroff); |
| } |
| if (flags & TH_ACK) { |
| /* |
| * If we can't get the confirmation upstream, pretend |
| * we didn't even see this one. |
| * |
| * XXX: how can we pretend we didn't see it if we |
| * have updated rnxt et al.? |
| * |
| * For loopback we defer sending up the T_CONN_CON |
| * until after some checks below. |
| */ |
| mp1 = NULL; |
| /* |
| * tcp_sendmsg() checks tcp_state without entering |
| * the squeue so tcp_state should be updated before |
| * sending up connection confirmation. Probe the |
| * state change below when we are sure the connection |
| * confirmation has been sent. |
| */ |
| tcp->tcp_state = TCPS_ESTABLISHED; |
| if (!tcp_conn_con(tcp, iphdr, mp, |
| tcp->tcp_loopback ? &mp1 : NULL, ira)) { |
| tcp->tcp_state = TCPS_SYN_SENT; |
| freemsg(mp); |
| return; |
| } |
| TCPS_CONN_INC(tcps); |
| /* SYN was acked - making progress */ |
| tcp->tcp_ip_forward_progress = B_TRUE; |
| |
| /* One for the SYN */ |
| tcp->tcp_suna = tcp->tcp_iss + 1; |
| tcp->tcp_valid_bits &= ~TCP_ISS_VALID; |
| |
| /* |
| * If SYN was retransmitted, need to reset all |
| * retransmission info. This is because this |
| * segment will be treated as a dup ACK. |
| */ |
| if (tcp->tcp_rexmit) { |
| tcp->tcp_rexmit = B_FALSE; |
| tcp->tcp_rexmit_nxt = tcp->tcp_snxt; |
| tcp->tcp_rexmit_max = tcp->tcp_snxt; |
| tcp->tcp_ms_we_have_waited = 0; |
| |
| /* |
| * Set tcp_cwnd back to 1 MSS, per |
| * recommendation from |
| * draft-floyd-incr-init-win-01.txt, |
| * Increasing TCP's Initial Window. |
| */ |
| DTRACE_PROBE3(cwnd__retransmitted__syn, |
| tcp_t *, tcp, uint32_t, tcp->tcp_cwnd, |
| uint32_t, tcp->tcp_mss); |
| tcp->tcp_cwnd = tcp->tcp_mss; |
| } |
| |
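| /* |
| * Remember the segment that updated the send window (SND.WL1/SND.WL2 |
| * in RFC 793 terms) and record the peer's advertised window. |
| */ |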
| tcp->tcp_swl1 = seg_seq; |
| tcp->tcp_swl2 = seg_ack; |
| |
| new_swnd = ntohs(tcpha->tha_win); |
| tcp->tcp_swnd = new_swnd; |
| if (new_swnd > tcp->tcp_max_swnd) |
| tcp->tcp_max_swnd = new_swnd; |
| |
| /* |
| * Always send the three-way handshake ack immediately |
| * in order to make the connection complete as soon as |
| * possible on the accepting host. |
| */ |
| flags |= TH_ACK_NEEDED; |
| |
| /* |
| * Trace connect-established here. |
| */ |
| DTRACE_TCP5(connect__established, mblk_t *, NULL, |
| ip_xmit_attr_t *, tcp->tcp_connp->conn_ixa, |
| void_ip_t *, iphdr, tcp_t *, tcp, tcph_t *, tcpha); |
| |
| /* Trace change from SYN_SENT -> ESTABLISHED here */ |
| DTRACE_TCP6(state__change, void, NULL, ip_xmit_attr_t *, |
| connp->conn_ixa, void, NULL, tcp_t *, tcp, |
| void, NULL, int32_t, TCPS_SYN_SENT); |
| |
| /* |
| * Special case for loopback. At this point we have |
| * received SYN-ACK from the remote endpoint. In |
| * order to ensure that both endpoints reach the |
| * fused state prior to any data exchange, the final |
| * ACK needs to be sent before we indicate T_CONN_CON |
| * to the module upstream. |
| */ |
| if (tcp->tcp_loopback) { |
| mblk_t *ack_mp; |
| |
| ASSERT(!tcp->tcp_unfusable); |
| ASSERT(mp1 != NULL); |
| /* |
| * For loopback, we always get a pure SYN-ACK |
| * and only need to send back the final ACK |
| * with no data (this is because the other |
| * tcp is ours and we don't do T/TCP). This |
| * final ACK triggers the passive side to |
| * perform fusion in ESTABLISHED state. |
| */ |
| if ((ack_mp = tcp_ack_mp(tcp)) != NULL) { |
| if (tcp->tcp_ack_tid != 0) { |
| (void) TCP_TIMER_CANCEL(tcp, |
| tcp->tcp_ack_tid); |
| tcp->tcp_ack_tid = 0; |
| } |
| tcp_send_data(tcp, ack_mp); |
| TCPS_BUMP_MIB(tcps, tcpHCOutSegs); |
| TCPS_BUMP_MIB(tcps, tcpOutAck); |
| |
| if (!IPCL_IS_NONSTR(connp)) { |
| /* Send up T_CONN_CON */ |
| if (ira->ira_cred != NULL) { |
| mblk_setcred(mp1, |
| ira->ira_cred, |
| ira->ira_cpid); |
| } |
| putnext(connp->conn_rq, mp1); |
| } else { |
| (*sockupcalls->su_connected) |
| (connp->conn_upper_handle, |
| tcp->tcp_connid, |
| ira->ira_cred, |
| ira->ira_cpid); |
| freemsg(mp1); |
| } |
| |
| freemsg(mp); |
| return; |
| } |
| /* |
| * Forget fusion; we need to handle more |
| * complex cases below. Send the deferred |
| * T_CONN_CON message upstream and proceed |
| * as usual. Mark this tcp as not capable |
| * of fusion. |
| */ |
| TCP_STAT(tcps, tcp_fusion_unfusable); |
| tcp->tcp_unfusable = B_TRUE; |
| if (!IPCL_IS_NONSTR(connp)) { |
| if (ira->ira_cred != NULL) { |
| mblk_setcred(mp1, ira->ira_cred, |
| ira->ira_cpid); |
| } |
| putnext(connp->conn_rq, mp1); |
| } else { |
| (*sockupcalls->su_connected) |
| (connp->conn_upper_handle, |
| tcp->tcp_connid, ira->ira_cred, |
| ira->ira_cpid); |
| |