| /* |
| * CDDL HEADER START |
| * |
| * The contents of this file are subject to the terms of the |
| * Common Development and Distribution License (the "License"). |
| * You may not use this file except in compliance with the License. |
| * |
| * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE |
| * or http://www.opensolaris.org/os/licensing. |
| * See the License for the specific language governing permissions |
| * and limitations under the License. |
| * |
| * When distributing Covered Code, include this CDDL HEADER in each |
| * file and include the License file at usr/src/OPENSOLARIS.LICENSE. |
| * If applicable, add the following below this CDDL HEADER, with the |
| * fields enclosed by brackets "[]" replaced with your own identifying |
| * information: Portions Copyright [yyyy] [name of copyright owner] |
| * |
| * CDDL HEADER END |
| */ |
| |
| /* |
| * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. |
| */ |
| |
| #include <sys/types.h> |
| #include <sys/strlog.h> |
| #include <sys/policy.h> |
| #include <sys/strsun.h> |
| #include <sys/squeue_impl.h> |
| #include <sys/squeue.h> |
| |
| #include <inet/common.h> |
| #include <inet/ip.h> |
| #include <inet/tcp.h> |
| #include <inet/tcp_impl.h> |
| |
| /* Control whether TCP can enter defensive mode when under memory pressure. */ |
| static boolean_t tcp_do_reclaim = B_TRUE; |
| |
| /* |
| * Routines related to the TCP_IOC_ABORT_CONN ioctl command. |
| * |
| * TCP_IOC_ABORT_CONN is a non-transparent ioctl command used for aborting |
| * TCP connections. To invoke this ioctl, a tcp_ioc_abort_conn_t structure |
| * (defined in tcp.h) needs to be filled in and passed into the kernel |
| * via an I_STR ioctl command (see streamio(7I)). The tcp_ioc_abort_conn_t |
| * structure contains the four-tuple of a TCP connection and a range of TCP |
| * states (specified by ac_start and ac_end). The use of wildcard addresses |
| * and ports is allowed. Connections with a matching four tuple and a state |
| * within the specified range will be aborted. The valid states for the |
| * ac_start and ac_end fields are in the range TCPS_SYN_SENT to TCPS_TIME_WAIT, |
| * inclusive. |
| * |
| * An application which has its connection aborted by this ioctl will receive |
| * an error that is dependent on the connection state at the time of the abort. |
| * If the connection state is < TCPS_TIME_WAIT, an application should behave as |
| * though a RST packet has been received. If the connection state is equal to |
| * TCPS_TIME_WAIT, the 2MSL timeout will immediately be canceled by the kernel |
| * and all resources associated with the connection will be freed. |
| */ |
| static mblk_t *tcp_ioctl_abort_build_msg(tcp_ioc_abort_conn_t *, tcp_t *); |
| static void tcp_ioctl_abort_dump(tcp_ioc_abort_conn_t *); |
| static void tcp_ioctl_abort_handler(void *arg, mblk_t *mp, void *arg2, |
| ip_recv_attr_t *dummy); |
| static int tcp_ioctl_abort(tcp_ioc_abort_conn_t *, tcp_stack_t *tcps); |
| void tcp_ioctl_abort_conn(queue_t *, mblk_t *); |
| static int tcp_ioctl_abort_bucket(tcp_ioc_abort_conn_t *, int, int *, |
| boolean_t, tcp_stack_t *); |
| |
| /* |
| * Macros used for accessing the different types of sockaddr |
| * structures inside a tcp_ioc_abort_conn_t. |
| */ |
| #define TCP_AC_V4LADDR(acp) ((sin_t *)&(acp)->ac_local) |
| #define TCP_AC_V4RADDR(acp) ((sin_t *)&(acp)->ac_remote) |
| #define TCP_AC_V4LOCAL(acp) (TCP_AC_V4LADDR(acp)->sin_addr.s_addr) |
| #define TCP_AC_V4REMOTE(acp) (TCP_AC_V4RADDR(acp)->sin_addr.s_addr) |
| #define TCP_AC_V4LPORT(acp) (TCP_AC_V4LADDR(acp)->sin_port) |
| #define TCP_AC_V4RPORT(acp) (TCP_AC_V4RADDR(acp)->sin_port) |
| #define TCP_AC_V6LADDR(acp) ((sin6_t *)&(acp)->ac_local) |
| #define TCP_AC_V6RADDR(acp) ((sin6_t *)&(acp)->ac_remote) |
| #define TCP_AC_V6LOCAL(acp) (TCP_AC_V6LADDR(acp)->sin6_addr) |
| #define TCP_AC_V6REMOTE(acp) (TCP_AC_V6RADDR(acp)->sin6_addr) |
| #define TCP_AC_V6LPORT(acp) (TCP_AC_V6LADDR(acp)->sin6_port) |
| #define TCP_AC_V6RPORT(acp) (TCP_AC_V6RADDR(acp)->sin6_port) |
| |
| /* |
| * Return the correct error code to mimic the behavior |
| * of a connection reset. |
| */ |
| #define TCP_AC_GET_ERRCODE(state, err) { \ |
| switch ((state)) { \ |
| case TCPS_SYN_SENT: \ |
| case TCPS_SYN_RCVD: \ |
| (err) = ECONNREFUSED; \ |
| break; \ |
| case TCPS_ESTABLISHED: \ |
| case TCPS_FIN_WAIT_1: \ |
| case TCPS_FIN_WAIT_2: \ |
| case TCPS_CLOSE_WAIT: \ |
| (err) = ECONNRESET; \ |
| break; \ |
| case TCPS_CLOSING: \ |
| case TCPS_LAST_ACK: \ |
| case TCPS_TIME_WAIT: \ |
| (err) = 0; \ |
| break; \ |
| default: \ |
| (err) = ENXIO; \ |
| } \ |
| } |
| |
| /* |
| * Check if a tcp structure matches the info in acp. |
| */ |
| #define TCP_AC_ADDR_MATCH(acp, connp, tcp) \ |
| (((acp)->ac_local.ss_family == AF_INET) ? \ |
| ((TCP_AC_V4LOCAL((acp)) == INADDR_ANY || \ |
| TCP_AC_V4LOCAL((acp)) == (connp)->conn_laddr_v4) && \ |
| (TCP_AC_V4REMOTE((acp)) == INADDR_ANY || \ |
| TCP_AC_V4REMOTE((acp)) == (connp)->conn_faddr_v4) && \ |
| (TCP_AC_V4LPORT((acp)) == 0 || \ |
| TCP_AC_V4LPORT((acp)) == (connp)->conn_lport) && \ |
| (TCP_AC_V4RPORT((acp)) == 0 || \ |
| TCP_AC_V4RPORT((acp)) == (connp)->conn_fport) && \ |
| (acp)->ac_start <= (tcp)->tcp_state && \ |
| (acp)->ac_end >= (tcp)->tcp_state) : \ |
| ((IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6LOCAL((acp))) || \ |
| IN6_ARE_ADDR_EQUAL(&TCP_AC_V6LOCAL((acp)), \ |
| &(connp)->conn_laddr_v6)) && \ |
| (IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6REMOTE((acp))) || \ |
| IN6_ARE_ADDR_EQUAL(&TCP_AC_V6REMOTE((acp)), \ |
| &(connp)->conn_faddr_v6)) && \ |
| (TCP_AC_V6LPORT((acp)) == 0 || \ |
| TCP_AC_V6LPORT((acp)) == (connp)->conn_lport) && \ |
| (TCP_AC_V6RPORT((acp)) == 0 || \ |
| TCP_AC_V6RPORT((acp)) == (connp)->conn_fport) && \ |
| (acp)->ac_start <= (tcp)->tcp_state && \ |
| (acp)->ac_end >= (tcp)->tcp_state)) |
| |
| #define TCP_AC_MATCH(acp, connp, tcp) \ |
| (((acp)->ac_zoneid == ALL_ZONES || \ |
| (acp)->ac_zoneid == (connp)->conn_zoneid) ? \ |
| TCP_AC_ADDR_MATCH(acp, connp, tcp) : 0) |
| |
| /* |
| * Build a message containing a tcp_ioc_abort_conn_t structure |
| * which is filled in with information from acp and tp. |
| */ |
| static mblk_t * |
| tcp_ioctl_abort_build_msg(tcp_ioc_abort_conn_t *acp, tcp_t *tp) |
| { |
| mblk_t *mp; |
| tcp_ioc_abort_conn_t *tacp; |
| |
| mp = allocb(sizeof (uint32_t) + sizeof (*acp), BPRI_LO); |
| if (mp == NULL) |
| return (NULL); |
| |
| *((uint32_t *)mp->b_rptr) = TCP_IOC_ABORT_CONN; |
| tacp = (tcp_ioc_abort_conn_t *)((uchar_t *)mp->b_rptr + |
| sizeof (uint32_t)); |
| |
| tacp->ac_start = acp->ac_start; |
| tacp->ac_end = acp->ac_end; |
| tacp->ac_zoneid = acp->ac_zoneid; |
| |
| if (acp->ac_local.ss_family == AF_INET) { |
| tacp->ac_local.ss_family = AF_INET; |
| tacp->ac_remote.ss_family = AF_INET; |
| TCP_AC_V4LOCAL(tacp) = tp->tcp_connp->conn_laddr_v4; |
| TCP_AC_V4REMOTE(tacp) = tp->tcp_connp->conn_faddr_v4; |
| TCP_AC_V4LPORT(tacp) = tp->tcp_connp->conn_lport; |
| TCP_AC_V4RPORT(tacp) = tp->tcp_connp->conn_fport; |
| } else { |
| tacp->ac_local.ss_family = AF_INET6; |
| tacp->ac_remote.ss_family = AF_INET6; |
| TCP_AC_V6LOCAL(tacp) = tp->tcp_connp->conn_laddr_v6; |
| TCP_AC_V6REMOTE(tacp) = tp->tcp_connp->conn_faddr_v6; |
| TCP_AC_V6LPORT(tacp) = tp->tcp_connp->conn_lport; |
| TCP_AC_V6RPORT(tacp) = tp->tcp_connp->conn_fport; |
| } |
| mp->b_wptr = (uchar_t *)mp->b_rptr + sizeof (uint32_t) + sizeof (*acp); |
| return (mp); |
| } |
| |
| /* |
| * Print a tcp_ioc_abort_conn_t structure. |
| */ |
| static void |
| tcp_ioctl_abort_dump(tcp_ioc_abort_conn_t *acp) |
| { |
| char lbuf[128]; |
| char rbuf[128]; |
| sa_family_t af; |
| in_port_t lport, rport; |
| ushort_t logflags; |
| |
| af = acp->ac_local.ss_family; |
| |
| if (af == AF_INET) { |
| (void) inet_ntop(af, (const void *)&TCP_AC_V4LOCAL(acp), |
| lbuf, 128); |
| (void) inet_ntop(af, (const void *)&TCP_AC_V4REMOTE(acp), |
| rbuf, 128); |
| lport = ntohs(TCP_AC_V4LPORT(acp)); |
| rport = ntohs(TCP_AC_V4RPORT(acp)); |
| } else { |
| (void) inet_ntop(af, (const void *)&TCP_AC_V6LOCAL(acp), |
| lbuf, 128); |
| (void) inet_ntop(af, (const void *)&TCP_AC_V6REMOTE(acp), |
| rbuf, 128); |
| lport = ntohs(TCP_AC_V6LPORT(acp)); |
| rport = ntohs(TCP_AC_V6RPORT(acp)); |
| } |
| |
| logflags = SL_TRACE | SL_NOTE; |
| /* |
| * Don't print this message to the console if the operation was done |
| * to a non-global zone. |
| */ |
| if (acp->ac_zoneid == GLOBAL_ZONEID || acp->ac_zoneid == ALL_ZONES) |
| logflags |= SL_CONSOLE; |
| (void) strlog(TCP_MOD_ID, 0, 1, logflags, |
| "TCP_IOC_ABORT_CONN: local = %s:%d, remote = %s:%d, " |
| "start = %d, end = %d\n", lbuf, lport, rbuf, rport, |
| acp->ac_start, acp->ac_end); |
| } |
| |
| /* |
| * Called using SQ_FILL when a message built using |
| * tcp_ioctl_abort_build_msg is put into a queue. |
| * Note that when we get here there is no wildcard in acp any more. |
| */ |
| /* ARGSUSED2 */ |
| static void |
| tcp_ioctl_abort_handler(void *arg, mblk_t *mp, void *arg2, |
| ip_recv_attr_t *dummy) |
| { |
| conn_t *connp = (conn_t *)arg; |
| tcp_t *tcp = connp->conn_tcp; |
| tcp_ioc_abort_conn_t *acp; |
| |
| /* |
| * Don't accept any input on a closed tcp as this TCP logically does |
| * not exist on the system. Don't proceed further with this TCP. |
| * For eg. this packet could trigger another close of this tcp |
| * which would be disastrous for tcp_refcnt. tcp_close_detached / |
| * tcp_clean_death / tcp_closei_local must be called at most once |
| * on a TCP. |
| */ |
| if (tcp->tcp_state == TCPS_CLOSED || |
| tcp->tcp_state == TCPS_BOUND) { |
| freemsg(mp); |
| return; |
| } |
| |
| acp = (tcp_ioc_abort_conn_t *)(mp->b_rptr + sizeof (uint32_t)); |
| if (tcp->tcp_state <= acp->ac_end) { |
| /* |
| * If we get here, we are already on the correct |
| * squeue. This ioctl follows the following path |
| * tcp_wput -> tcp_wput_ioctl -> tcp_ioctl_abort_conn |
| * ->tcp_ioctl_abort->squeue_enter (if on a |
| * different squeue) |
| */ |
| int errcode; |
| |
| TCP_AC_GET_ERRCODE(tcp->tcp_state, errcode); |
| (void) tcp_clean_death(tcp, errcode); |
| } |
| freemsg(mp); |
| } |
| |
| /* |
| * Abort all matching connections on a hash chain. |
| */ |
| static int |
| tcp_ioctl_abort_bucket(tcp_ioc_abort_conn_t *acp, int index, int *count, |
| boolean_t exact, tcp_stack_t *tcps) |
| { |
| int nmatch, err = 0; |
| tcp_t *tcp; |
| MBLKP mp, last, listhead = NULL; |
| conn_t *tconnp; |
| connf_t *connfp; |
| ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; |
| |
| connfp = &ipst->ips_ipcl_conn_fanout[index]; |
| |
| startover: |
| nmatch = 0; |
| last = NULL; |
| |
| mutex_enter(&connfp->connf_lock); |
| for (tconnp = connfp->connf_head; tconnp != NULL; |
| tconnp = tconnp->conn_next) { |
| tcp = tconnp->conn_tcp; |
| /* |
| * We are missing a check on sin6_scope_id for linklocals here, |
| * but current usage is just for aborting based on zoneid |
| * for shared-IP zones. |
| */ |
| if (TCP_AC_MATCH(acp, tconnp, tcp)) { |
| CONN_INC_REF(tconnp); |
| mp = tcp_ioctl_abort_build_msg(acp, tcp); |
| if (mp == NULL) { |
| err = ENOMEM; |
| CONN_DEC_REF(tconnp); |
| break; |
| } |
| mp->b_prev = (mblk_t *)tcp; |
| |
| if (listhead == NULL) { |
| listhead = mp; |
| last = mp; |
| } else { |
| last->b_next = mp; |
| last = mp; |
| } |
| nmatch++; |
| if (exact) |
| break; |
| } |
| |
| /* Avoid holding lock for too long. */ |
| if (nmatch >= 500) |
| break; |
| } |
| mutex_exit(&connfp->connf_lock); |
| |
| /* Pass mp into the correct tcp */ |
| while ((mp = listhead) != NULL) { |
| listhead = listhead->b_next; |
| tcp = (tcp_t *)mp->b_prev; |
| mp->b_next = mp->b_prev = NULL; |
| SQUEUE_ENTER_ONE(tcp->tcp_connp->conn_sqp, mp, |
| tcp_ioctl_abort_handler, tcp->tcp_connp, NULL, |
| SQ_FILL, SQTAG_TCP_ABORT_BUCKET); |
| } |
| |
| *count += nmatch; |
| if (nmatch >= 500 && err == 0) |
| goto startover; |
| return (err); |
| } |
| |
| /* |
| * Abort all connections that matches the attributes specified in acp. |
| */ |
| static int |
| tcp_ioctl_abort(tcp_ioc_abort_conn_t *acp, tcp_stack_t *tcps) |
| { |
| sa_family_t af; |
| uint32_t ports; |
| uint16_t *pports; |
| int err = 0, count = 0; |
| boolean_t exact = B_FALSE; /* set when there is no wildcard */ |
| int index = -1; |
| ushort_t logflags; |
| ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; |
| |
| af = acp->ac_local.ss_family; |
| |
| if (af == AF_INET) { |
| if (TCP_AC_V4REMOTE(acp) != INADDR_ANY && |
| TCP_AC_V4LPORT(acp) != 0 && TCP_AC_V4RPORT(acp) != 0) { |
| pports = (uint16_t *)&ports; |
| pports[1] = TCP_AC_V4LPORT(acp); |
| pports[0] = TCP_AC_V4RPORT(acp); |
| exact = (TCP_AC_V4LOCAL(acp) != INADDR_ANY); |
| } |
| } else { |
| if (!IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6REMOTE(acp)) && |
| TCP_AC_V6LPORT(acp) != 0 && TCP_AC_V6RPORT(acp) != 0) { |
| pports = (uint16_t *)&ports; |
| pports[1] = TCP_AC_V6LPORT(acp); |
| pports[0] = TCP_AC_V6RPORT(acp); |
| exact = !IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6LOCAL(acp)); |
| } |
| } |
| |
| /* |
| * For cases where remote addr, local port, and remote port are non- |
| * wildcards, tcp_ioctl_abort_bucket will only be called once. |
| */ |
| if (index != -1) { |
| err = tcp_ioctl_abort_bucket(acp, index, |
| &count, exact, tcps); |
| } else { |
| /* |
| * loop through all entries for wildcard case |
| */ |
| for (index = 0; |
| index < ipst->ips_ipcl_conn_fanout_size; |
| index++) { |
| err = tcp_ioctl_abort_bucket(acp, index, |
| &count, exact, tcps); |
| if (err != 0) |
| break; |
| } |
| } |
| |
| logflags = SL_TRACE | SL_NOTE; |
| /* |
| * Don't print this message to the console if the operation was done |
| * to a non-global zone. |
| */ |
| if (acp->ac_zoneid == GLOBAL_ZONEID || acp->ac_zoneid == ALL_ZONES) |
| logflags |= SL_CONSOLE; |
| (void) strlog(TCP_MOD_ID, 0, 1, logflags, "TCP_IOC_ABORT_CONN: " |
| "aborted %d connection%c\n", count, ((count > 1) ? 's' : ' ')); |
| if (err == 0 && count == 0) |
| err = ENOENT; |
| return (err); |
| } |
| |
| /* |
| * Process the TCP_IOC_ABORT_CONN ioctl request. |
| */ |
| void |
| tcp_ioctl_abort_conn(queue_t *q, mblk_t *mp) |
| { |
| int err; |
| IOCP iocp; |
| MBLKP mp1; |
| sa_family_t laf, raf; |
| tcp_ioc_abort_conn_t *acp; |
| zone_t *zptr; |
| conn_t *connp = Q_TO_CONN(q); |
| zoneid_t zoneid = connp->conn_zoneid; |
| tcp_t *tcp = connp->conn_tcp; |
| tcp_stack_t *tcps = tcp->tcp_tcps; |
| |
| iocp = (IOCP)mp->b_rptr; |
| |
| if ((mp1 = mp->b_cont) == NULL || |
| iocp->ioc_count != sizeof (tcp_ioc_abort_conn_t)) { |
| err = EINVAL; |
| goto out; |
| } |
| |
| /* check permissions */ |
| if (secpolicy_ip_config(iocp->ioc_cr, B_FALSE) != 0) { |
| err = EPERM; |
| goto out; |
| } |
| |
| if (mp1->b_cont != NULL) { |
| freemsg(mp1->b_cont); |
| mp1->b_cont = NULL; |
| } |
| |
| acp = (tcp_ioc_abort_conn_t *)mp1->b_rptr; |
| laf = acp->ac_local.ss_family; |
| raf = acp->ac_remote.ss_family; |
| |
| /* check that a zone with the supplied zoneid exists */ |
| if (acp->ac_zoneid != GLOBAL_ZONEID && acp->ac_zoneid != ALL_ZONES) { |
| zptr = zone_find_by_id(zoneid); |
| if (zptr != NULL) { |
| zone_rele(zptr); |
| } else { |
| err = EINVAL; |
| goto out; |
| } |
| } |
| |
| /* |
| * For exclusive stacks we set the zoneid to zero |
| * to make TCP operate as if in the global zone. |
| */ |
| if (tcps->tcps_netstack->netstack_stackid != GLOBAL_NETSTACKID) |
| acp->ac_zoneid = GLOBAL_ZONEID; |
| |
| if (acp->ac_start < TCPS_SYN_SENT || acp->ac_end > TCPS_TIME_WAIT || |
| acp->ac_start > acp->ac_end || laf != raf || |
| (laf != AF_INET && laf != AF_INET6)) { |
| err = EINVAL; |
| goto out; |
| } |
| |
| tcp_ioctl_abort_dump(acp); |
| err = tcp_ioctl_abort(acp, tcps); |
| |
| out: |
| if (mp1 != NULL) { |
| freemsg(mp1); |
| mp->b_cont = NULL; |
| } |
| |
| if (err != 0) |
| miocnak(q, mp, 0, err); |
| else |
| miocack(q, mp, 0, 0); |
| } |
| |
| /* |
| * Timeout function to reset the TCP stack variable tcps_reclaim to false. |
| */ |
| void |
| tcp_reclaim_timer(void *arg) |
| { |
| tcp_stack_t *tcps = (tcp_stack_t *)arg; |
| int64_t tot_conn = 0; |
| int i; |
| extern pgcnt_t lotsfree, needfree; |
| |
| for (i = 0; i < tcps->tcps_sc_cnt; i++) |
| tot_conn += tcps->tcps_sc[i]->tcp_sc_conn_cnt; |
| |
| /* |
| * This happens only when a stack is going away. tcps_reclaim_tid |
| * should not be reset to 0 when returning in this case. |
| */ |
| mutex_enter(&tcps->tcps_reclaim_lock); |
| if (!tcps->tcps_reclaim) { |
| mutex_exit(&tcps->tcps_reclaim_lock); |
| return; |
| } |
| |
| if ((freemem >= lotsfree + needfree) || tot_conn < maxusers) { |
| tcps->tcps_reclaim = B_FALSE; |
| tcps->tcps_reclaim_tid = 0; |
| } else { |
| /* Stay in defensive mode and restart the timer */ |
| tcps->tcps_reclaim_tid = timeout(tcp_reclaim_timer, |
| tcps, MSEC_TO_TICK(tcps->tcps_reclaim_period)); |
| } |
| mutex_exit(&tcps->tcps_reclaim_lock); |
| } |
| |
| /* |
| * Kmem reclaim call back function. When the system is under memory |
| * pressure, we set the TCP stack variable tcps_reclaim to true. This |
| * variable is reset to false after tcps_reclaim_period msecs. During this |
| * period, TCP will be more aggressive in aborting connections not making |
| * progress, meaning retransmitting for some time (tcp_early_abort seconds). |
| * TCP will also not accept new connection request for those listeners whose |
| * q or q0 is not empty. |
| */ |
| /* ARGSUSED */ |
| void |
| tcp_conn_reclaim(void *arg) |
| { |
| netstack_handle_t nh; |
| netstack_t *ns; |
| tcp_stack_t *tcps; |
| extern pgcnt_t lotsfree, needfree; |
| |
| if (!tcp_do_reclaim) |
| return; |
| |
| /* |
| * The reclaim function may be called even when the system is not |
| * really under memory pressure. |
| */ |
| if (freemem >= lotsfree + needfree) |
| return; |
| |
| netstack_next_init(&nh); |
| while ((ns = netstack_next(&nh)) != NULL) { |
| int i; |
| int64_t tot_conn = 0; |
| |
| /* |
| * During boot time, the first netstack_t is created and |
| * initialized before TCP has registered with the netstack |
| * framework. If this reclaim function is called before TCP |
| * has finished its initialization, netstack_next() will |
| * return the first netstack_t (since its netstack_flags is |
| * not NSF_UNINIT). And its netstack_tcp will be NULL. We |
| * need to catch it. |
| * |
| * All subsequent netstack_t creation will not have this |
| * problem since the initialization is not finished until TCP |
| * has finished its own tcp_stack_t initialization. Hence |
| * netstack_next() will not return one with NULL netstack_tcp. |
| */ |
| if ((tcps = ns->netstack_tcp) == NULL) { |
| netstack_rele(ns); |
| continue; |
| } |
| |
| /* |
| * Even if the system is under memory pressure, the reason may |
| * not be because of TCP activity. Check the number of |
| * connections in each stack. If the number exceeds the |
| * threshold (maxusers), turn on defensive mode. |
| */ |
| for (i = 0; i < tcps->tcps_sc_cnt; i++) |
| tot_conn += tcps->tcps_sc[i]->tcp_sc_conn_cnt; |
| if (tot_conn < maxusers) { |
| netstack_rele(ns); |
| continue; |
| } |
| |
| mutex_enter(&tcps->tcps_reclaim_lock); |
| if (!tcps->tcps_reclaim) { |
| tcps->tcps_reclaim = B_TRUE; |
| tcps->tcps_reclaim_tid = timeout(tcp_reclaim_timer, |
| tcps, MSEC_TO_TICK(tcps->tcps_reclaim_period)); |
| TCP_STAT(tcps, tcp_reclaim_cnt); |
| } |
| mutex_exit(&tcps->tcps_reclaim_lock); |
| netstack_rele(ns); |
| } |
| netstack_next_fini(&nh); |
| } |
| |
| /* |
| * Given a tcp_stack_t and a port (in host byte order), find a listener |
| * configuration for that port and return the ratio. |
| */ |
| uint32_t |
| tcp_find_listener_conf(tcp_stack_t *tcps, in_port_t port) |
| { |
| tcp_listener_t *tl; |
| uint32_t ratio = 0; |
| |
| mutex_enter(&tcps->tcps_listener_conf_lock); |
| for (tl = list_head(&tcps->tcps_listener_conf); tl != NULL; |
| tl = list_next(&tcps->tcps_listener_conf, tl)) { |
| if (tl->tl_port == port) { |
| ratio = tl->tl_ratio; |
| break; |
| } |
| } |
| mutex_exit(&tcps->tcps_listener_conf_lock); |
| return (ratio); |
| } |
| |
| /* |
| * To remove all listener limit configuration in a tcp_stack_t. |
| */ |
| void |
| tcp_listener_conf_cleanup(tcp_stack_t *tcps) |
| { |
| tcp_listener_t *tl; |
| |
| mutex_enter(&tcps->tcps_listener_conf_lock); |
| while ((tl = list_head(&tcps->tcps_listener_conf)) != NULL) { |
| list_remove(&tcps->tcps_listener_conf, tl); |
| kmem_free(tl, sizeof (tcp_listener_t)); |
| } |
| mutex_destroy(&tcps->tcps_listener_conf_lock); |
| list_destroy(&tcps->tcps_listener_conf); |
| } |
| |
| /* |
| * When a CPU is added, we need to allocate the per CPU stats struct. |
| */ |
| void |
| tcp_stack_cpu_add(tcp_stack_t *tcps, processorid_t cpu_seqid) |
| { |
| int i; |
| |
| if (cpu_seqid < tcps->tcps_sc_cnt) |
| return; |
| for (i = tcps->tcps_sc_cnt; i <= cpu_seqid; i++) { |
| ASSERT(tcps->tcps_sc[i] == NULL); |
| tcps->tcps_sc[i] = kmem_zalloc(sizeof (tcp_stats_cpu_t), |
| KM_SLEEP); |
| } |
| membar_producer(); |
| tcps->tcps_sc_cnt = cpu_seqid + 1; |
| } |
| |
| /* |
| * Diagnostic routine used to return a string associated with the tcp state. |
| * Note that if the caller does not supply a buffer, it will use an internal |
| * static string. This means that if multiple threads call this function at |
| * the same time, output can be corrupted... Note also that this function |
| * does not check the size of the supplied buffer. The caller has to make |
| * sure that it is big enough. |
| */ |
| char * |
| tcp_display(tcp_t *tcp, char *sup_buf, char format) |
| { |
| char buf1[30]; |
| static char priv_buf[INET6_ADDRSTRLEN * 2 + 80]; |
| char *buf; |
| char *cp; |
| in6_addr_t local, remote; |
| char local_addrbuf[INET6_ADDRSTRLEN]; |
| char remote_addrbuf[INET6_ADDRSTRLEN]; |
| conn_t *connp; |
| |
| if (sup_buf != NULL) |
| buf = sup_buf; |
| else |
| buf = priv_buf; |
| |
| if (tcp == NULL) |
| return ("NULL_TCP"); |
| |
| connp = tcp->tcp_connp; |
| switch (tcp->tcp_state) { |
| case TCPS_CLOSED: |
| cp = "TCP_CLOSED"; |
| break; |
| case TCPS_IDLE: |
| cp = "TCP_IDLE"; |
| break; |
| case TCPS_BOUND: |
| cp = "TCP_BOUND"; |
| break; |
| case TCPS_LISTEN: |
| cp = "TCP_LISTEN"; |
| break; |
| case TCPS_SYN_SENT: |
| cp = "TCP_SYN_SENT"; |
| break; |
| case TCPS_SYN_RCVD: |
| cp = "TCP_SYN_RCVD"; |
| break; |
| case TCPS_ESTABLISHED: |
| cp = "TCP_ESTABLISHED"; |
| break; |
| case TCPS_CLOSE_WAIT: |
| cp = "TCP_CLOSE_WAIT"; |
| break; |
| case TCPS_FIN_WAIT_1: |
| cp = "TCP_FIN_WAIT_1"; |
| break; |
| case TCPS_CLOSING: |
| cp = "TCP_CLOSING"; |
| break; |
| case TCPS_LAST_ACK: |
| cp = "TCP_LAST_ACK"; |
| break; |
| case TCPS_FIN_WAIT_2: |
| cp = "TCP_FIN_WAIT_2"; |
| break; |
| case TCPS_TIME_WAIT: |
| cp = "TCP_TIME_WAIT"; |
| break; |
| default: |
| (void) mi_sprintf(buf1, "TCPUnkState(%d)", tcp->tcp_state); |
| cp = buf1; |
| break; |
| } |
| switch (format) { |
| case DISP_ADDR_AND_PORT: |
| if (connp->conn_ipversion == IPV4_VERSION) { |
| /* |
| * Note that we use the remote address in the tcp_b |
| * structure. This means that it will print out |
| * the real destination address, not the next hop's |
| * address if source routing is used. |
| */ |
| IN6_IPADDR_TO_V4MAPPED(connp->conn_laddr_v4, &local); |
| IN6_IPADDR_TO_V4MAPPED(connp->conn_faddr_v4, &remote); |
| |
| } else { |
| local = connp->conn_laddr_v6; |
| remote = connp->conn_faddr_v6; |
| } |
| (void) inet_ntop(AF_INET6, &local, local_addrbuf, |
| sizeof (local_addrbuf)); |
| (void) inet_ntop(AF_INET6, &remote, remote_addrbuf, |
| sizeof (remote_addrbuf)); |
| (void) mi_sprintf(buf, "[%s.%u, %s.%u] %s", |
| local_addrbuf, ntohs(connp->conn_lport), remote_addrbuf, |
| ntohs(connp->conn_fport), cp); |
| break; |
| case DISP_PORT_ONLY: |
| default: |
| (void) mi_sprintf(buf, "[%u, %u] %s", |
| ntohs(connp->conn_lport), ntohs(connp->conn_fport), cp); |
| break; |
| } |
| |
| return (buf); |
| } |