| /* |
| * CDDL HEADER START |
| * |
| * The contents of this file are subject to the terms of the |
| * Common Development and Distribution License (the "License"). |
| * You may not use this file except in compliance with the License. |
| * |
| * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE |
| * or http://www.opensolaris.org/os/licensing. |
| * See the License for the specific language governing permissions |
| * and limitations under the License. |
| * |
| * When distributing Covered Code, include this CDDL HEADER in each |
| * file and include the License file at usr/src/OPENSOLARIS.LICENSE. |
| * If applicable, add the following below this CDDL HEADER, with the |
| * fields enclosed by brackets "[]" replaced with your own identifying |
| * information: Portions Copyright [yyyy] [name of copyright owner] |
| * |
| * CDDL HEADER END |
| */ |
| |
| /* |
| * Copyright 2009 Sun Microsystems, Inc. All rights reserved. |
| * Use is subject to license terms. |
| * Copyright 2014 Joyent, Inc. All rights reserved. |
| */ |
| |
| #include <sys/sysmacros.h> |
| #include <sys/types.h> |
| #include <sys/conf.h> |
| #include <sys/time.h> |
| #include <sys/taskq.h> |
| #include <sys/cmn_err.h> |
| #include <sys/sdt.h> |
| #include <sys/atomic.h> |
| #include <netinet/in.h> |
| #include <inet/ip.h> |
| #include <inet/ip6.h> |
| #include <inet/tcp.h> |
| #include <inet/udp_impl.h> |
| #include <inet/ilb.h> |
| |
| #include "ilb_stack.h" |
| #include "ilb_impl.h" |
| #include "ilb_conn.h" |
| #include "ilb_nat.h" |
| |
| /* |
| * Timer struct for ilb_conn_t and ilb_sticky_t garbage collection |
| * |
| * start: starting index into the hash table to do gc |
| * end: ending index into the hash table to do gc |
| * ilbs: pointer to the ilb_stack_t of the IP stack |
| * tid_lock: mutex to protect the timer id. |
| * tid: timer id of the timer |
| */ |
| typedef struct ilb_timer_s { |
| uint32_t start; |
| uint32_t end; |
| ilb_stack_t *ilbs; |
| kmutex_t tid_lock; |
| timeout_id_t tid; |
| } ilb_timer_t; |
| |
| /* Hash macro for finding the index into the conn hash table */ |
| #define ILB_CONN_HASH(saddr, sport, daddr, dport, hash_size) \ |
| (((*((saddr) + 3) ^ *((daddr) + 3)) * 50653 + \ |
| (*((saddr) + 2) ^ *((daddr) + 2)) * 1369 + \ |
| (*((saddr) + 1) ^ *((daddr) + 1)) * 37 + \ |
| (*(saddr) ^ *(daddr)) + (sport) * 37 + (dport)) & \ |
| ((hash_size) - 1)) |
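| /* |
| * The multipliers above are powers of 37 (37, 37^2 = 1369, 37^3 = 50653), |
| * mixing the four low-order address bytes (the IPv4 address for v4-mapped |
| * addresses) with the ports. The final mask assumes hash_size is a power |
| * of 2, which ilb_conn_hash_init() guarantees. |
| */ |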
| |
| /* Kmem cache for the conn hash entry */ |
| static struct kmem_cache *ilb_conn_cache = NULL; |
| |
| /* |
| * There are 60 timers running to do conn cache garbage collection. Each |
| * gc thread is responsible for 1/60 of the conn hash table. |
| */ |
| static int ilb_conn_timer_size = 60; |
| |
| /* Each of the above gc timers wakes up every 15s to do the gc. */ |
| static int ilb_conn_cache_timeout = 15; |
| |
| #define ILB_STICKY_HASH(saddr, rule, hash_size) \ |
| (((*((saddr) + 3) ^ ((rule) >> 24)) * 29791 + \ |
| (*((saddr) + 2) ^ ((rule) >> 16)) * 961 + \ |
| (*((saddr) + 1) ^ ((rule) >> 8)) * 31 + \ |
| (*(saddr) ^ (rule))) & ((hash_size) - 1)) |
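| /* |
| * Same scheme as ILB_CONN_HASH above, but with powers of 31 and with the |
| * rule (passed in as its pointer value) as the second mixing key. |
| */ |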
| |
| static struct kmem_cache *ilb_sticky_cache = NULL; |
| |
| /* |
| * There are 60 timers running to do sticky cache garbage collection. Each |
| * gc thread is responsible for 1/60 of the sticky hash table. |
| */ |
| static int ilb_sticky_timer_size = 60; |
| |
| /* Each of the above gc timers wakes up every 15s to do the gc. */ |
| static int ilb_sticky_timeout = 15; |
| |
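| /* |
| * Dropping a reference on a sticky entry only decrements its refcnt and |
| * records the access time; the entry itself is freed later by |
| * ilb_sticky_cleanup() once the refcnt is zero and the entry has not been |
| * accessed for its expiry period. |
| */ |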
| #define ILB_STICKY_REFRELE(s) \ |
| { \ |
| mutex_enter(&(s)->hash->sticky_lock); \ |
| (s)->refcnt--; \ |
| (s)->atime = ddi_get_lbolt64(); \ |
| mutex_exit(&(s)->hash->sticky_lock); \ |
| } |
| |
| |
| static void |
| ilb_conn_cache_init(void) |
| { |
| ilb_conn_cache = kmem_cache_create("ilb_conn_cache", |
| sizeof (ilb_conn_t), 0, NULL, NULL, NULL, NULL, NULL, |
| ilb_kmem_flags); |
| } |
| |
| void |
| ilb_conn_cache_fini(void) |
| { |
| if (ilb_conn_cache != NULL) { |
| kmem_cache_destroy(ilb_conn_cache); |
| ilb_conn_cache = NULL; |
| } |
| } |
| |
| static void |
| ilb_conn_remove_common(ilb_conn_t *connp, boolean_t c2s) |
| { |
| ilb_conn_hash_t *hash; |
| ilb_conn_t **next, **prev; |
| ilb_conn_t **next_prev, **prev_next; |
| |
| next_prev = NULL; |
| prev_next = NULL; |
| |
| if (c2s) { |
| hash = connp->conn_c2s_hash; |
| ASSERT(MUTEX_HELD(&hash->ilb_conn_hash_lock)); |
| next = &connp->conn_c2s_next; |
| prev = &connp->conn_c2s_prev; |
| if (*next != NULL) |
| next_prev = &(*next)->conn_c2s_prev; |
| if (*prev != NULL) |
| prev_next = &(*prev)->conn_c2s_next; |
| } else { |
| hash = connp->conn_s2c_hash; |
| ASSERT(MUTEX_HELD(&hash->ilb_conn_hash_lock)); |
| next = &connp->conn_s2c_next; |
| prev = &connp->conn_s2c_prev; |
| if (*next != NULL) |
| next_prev = &(*next)->conn_s2c_prev; |
| if (*prev != NULL) |
| prev_next = &(*prev)->conn_s2c_next; |
| } |
| |
| if (hash->ilb_connp == connp) { |
| hash->ilb_connp = *next; |
| if (*next != NULL) |
| *next_prev = NULL; |
| } else { |
| if (*prev != NULL) |
| *prev_next = *next; |
| if (*next != NULL) |
| *next_prev = *prev; |
| } |
| ASSERT(hash->ilb_conn_cnt > 0); |
| hash->ilb_conn_cnt--; |
| |
| *next = NULL; |
| *prev = NULL; |
| } |
| |
| static void |
| ilb_conn_remove(ilb_conn_t *connp) |
| { |
| ASSERT(MUTEX_HELD(&connp->conn_c2s_hash->ilb_conn_hash_lock)); |
| ilb_conn_remove_common(connp, B_TRUE); |
| ASSERT(MUTEX_HELD(&connp->conn_s2c_hash->ilb_conn_hash_lock)); |
| ilb_conn_remove_common(connp, B_FALSE); |
| |
| if (connp->conn_rule_cache.topo == ILB_TOPO_IMPL_NAT) { |
| in_port_t port; |
| |
| port = ntohs(connp->conn_rule_cache.info.nat_sport); |
| vmem_free(connp->conn_rule_cache.info.src_ent->nse_port_arena, |
| (void *)(uintptr_t)port, 1); |
| } |
| |
| if (connp->conn_sticky != NULL) |
| ILB_STICKY_REFRELE(connp->conn_sticky); |
| ILB_SERVER_REFRELE(connp->conn_server); |
| kmem_cache_free(ilb_conn_cache, connp); |
| } |
| |
| /* |
| * Routine to do periodic garbage collection of conn hash entries. When |
| * a conn hash timer fires, it dispatches a taskq job to call this function |
| * to do the gc. Note that each taskq job is responsible for a portion of |
| * the table. The portion is stored in timer->start, timer->end. |
| */ |
| static void |
| ilb_conn_cleanup(void *arg) |
| { |
| ilb_timer_t *timer = (ilb_timer_t *)arg; |
| uint32_t i; |
| ilb_stack_t *ilbs; |
| ilb_conn_hash_t *c2s_hash, *s2c_hash; |
| ilb_conn_t *connp, *nxt_connp; |
| int64_t now; |
| int64_t expiry; |
| boolean_t die_now; |
| |
| ilbs = timer->ilbs; |
| c2s_hash = ilbs->ilbs_c2s_conn_hash; |
| ASSERT(c2s_hash != NULL); |
| |
| now = ddi_get_lbolt64(); |
| for (i = timer->start; i < timer->end; i++) { |
| mutex_enter(&c2s_hash[i].ilb_conn_hash_lock); |
| if ((connp = c2s_hash[i].ilb_connp) == NULL) { |
| ASSERT(c2s_hash[i].ilb_conn_cnt == 0); |
| mutex_exit(&c2s_hash[i].ilb_conn_hash_lock); |
| continue; |
| } |
| do { |
| ASSERT(c2s_hash[i].ilb_conn_cnt > 0); |
| ASSERT(connp->conn_c2s_hash == &c2s_hash[i]); |
| nxt_connp = connp->conn_c2s_next; |
| expiry = now - SEC_TO_TICK(connp->conn_expiry); |
| if (connp->conn_server->iser_die_time != 0 && |
| connp->conn_server->iser_die_time < now) |
| die_now = B_TRUE; |
| else |
| die_now = B_FALSE; |
| s2c_hash = connp->conn_s2c_hash; |
| mutex_enter(&s2c_hash->ilb_conn_hash_lock); |
| |
| if (connp->conn_gc || die_now || |
| (connp->conn_c2s_atime < expiry && |
| connp->conn_s2c_atime < expiry)) { |
| /* Need to update the NAT list cursor ilbs_conn_list_connp */ |
| if (connp == ilbs->ilbs_conn_list_connp) { |
| ilbs->ilbs_conn_list_connp = |
| connp->conn_c2s_next; |
| } |
| ilb_conn_remove(connp); |
| goto nxt_connp; |
| } |
| |
| if (connp->conn_l4 != IPPROTO_TCP) |
| goto nxt_connp; |
| |
| /* Update and check TCP related conn info */ |
| if (connp->conn_c2s_tcp_fin_sent && |
| SEQ_GT(connp->conn_s2c_tcp_ack, |
| connp->conn_c2s_tcp_fss)) { |
| connp->conn_c2s_tcp_fin_acked = B_TRUE; |
| } |
| if (connp->conn_s2c_tcp_fin_sent && |
| SEQ_GT(connp->conn_c2s_tcp_ack, |
| connp->conn_s2c_tcp_fss)) { |
| connp->conn_s2c_tcp_fin_acked = B_TRUE; |
| } |
| if (connp->conn_c2s_tcp_fin_acked && |
| connp->conn_s2c_tcp_fin_acked) { |
| ilb_conn_remove(connp); |
| } |
| nxt_connp: |
| mutex_exit(&s2c_hash->ilb_conn_hash_lock); |
| connp = nxt_connp; |
| } while (connp != NULL); |
| mutex_exit(&c2s_hash[i].ilb_conn_hash_lock); |
| } |
| } |
| |
| /* Conn hash timer routine. It dispatches a taskq job and restarts the timer */ |
| static void |
| ilb_conn_timer(void *arg) |
| { |
| ilb_timer_t *timer = (ilb_timer_t *)arg; |
| |
| (void) taskq_dispatch(timer->ilbs->ilbs_conn_taskq, ilb_conn_cleanup, |
| arg, TQ_SLEEP); |
| mutex_enter(&timer->tid_lock); |
| if (timer->tid == 0) { |
| mutex_exit(&timer->tid_lock); |
| } else { |
| timer->tid = timeout(ilb_conn_timer, arg, |
| SEC_TO_TICK(ilb_conn_cache_timeout)); |
| mutex_exit(&timer->tid_lock); |
| } |
| } |
| |
| void |
| ilb_conn_hash_init(ilb_stack_t *ilbs) |
| { |
| extern pri_t minclsyspri; |
| int i, part; |
| ilb_timer_t *tm; |
| char tq_name[TASKQ_NAMELEN]; |
| |
| /* |
| * If ilbs->ilbs_conn_hash_size is not a power of 2, bump it up to |
| * the next power of 2. |
| */ |
| if (!ISP2(ilbs->ilbs_conn_hash_size)) { |
| for (i = 0; i < 31; i++) { |
| if (ilbs->ilbs_conn_hash_size < (1 << i)) |
| break; |
| } |
| ilbs->ilbs_conn_hash_size = 1 << i; |
| } |
| |
| /* |
| * We can sleep here since this should be called when a rule is being |
| * added, hence we are not in interrupt context. |
| */ |
| ilbs->ilbs_c2s_conn_hash = kmem_zalloc(sizeof (ilb_conn_hash_t) * |
| ilbs->ilbs_conn_hash_size, KM_SLEEP); |
| ilbs->ilbs_s2c_conn_hash = kmem_zalloc(sizeof (ilb_conn_hash_t) * |
| ilbs->ilbs_conn_hash_size, KM_SLEEP); |
| |
| for (i = 0; i < ilbs->ilbs_conn_hash_size; i++) { |
| mutex_init(&ilbs->ilbs_c2s_conn_hash[i].ilb_conn_hash_lock, |
| NULL, MUTEX_DEFAULT, NULL); |
| } |
| for (i = 0; i < ilbs->ilbs_conn_hash_size; i++) { |
| mutex_init(&ilbs->ilbs_s2c_conn_hash[i].ilb_conn_hash_lock, |
| NULL, MUTEX_DEFAULT, NULL); |
| } |
| |
| if (ilb_conn_cache == NULL) |
| ilb_conn_cache_init(); |
| |
| (void) snprintf(tq_name, sizeof (tq_name), "ilb_conn_taskq_%p", |
| (void *)ilbs->ilbs_netstack); |
| ASSERT(ilbs->ilbs_conn_taskq == NULL); |
| ilbs->ilbs_conn_taskq = taskq_create(tq_name, |
| ilb_conn_timer_size * 2, minclsyspri, ilb_conn_timer_size, |
| ilb_conn_timer_size * 2, TASKQ_PREPOPULATE|TASKQ_DYNAMIC); |
| |
| ASSERT(ilbs->ilbs_conn_timer_list == NULL); |
| ilbs->ilbs_conn_timer_list = kmem_zalloc(sizeof (ilb_timer_t) * |
| ilb_conn_timer_size, KM_SLEEP); |
| |
| /* |
| * The hash table is divided into equal partitions among those timers |
| * to do garbage collection. |
| */ |
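| /* |
| * The "+ 1" below rounds the partition size up so that the whole table is |
| * covered; tm->end is clamped to the table size, so the trailing timers |
| * may end up with a shorter (possibly empty) range. |
| */ |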
| part = ilbs->ilbs_conn_hash_size / ilb_conn_timer_size + 1; |
| for (i = 0; i < ilb_conn_timer_size; i++) { |
| tm = ilbs->ilbs_conn_timer_list + i; |
| tm->start = i * part; |
| tm->end = i * part + part; |
| if (tm->end > ilbs->ilbs_conn_hash_size) |
| tm->end = ilbs->ilbs_conn_hash_size; |
| tm->ilbs = ilbs; |
| mutex_init(&tm->tid_lock, NULL, MUTEX_DEFAULT, NULL); |
| /* Spread out the starting execution time of all the timers. */ |
| tm->tid = timeout(ilb_conn_timer, tm, |
| SEC_TO_TICK(ilb_conn_cache_timeout + i)); |
| } |
| } |
| |
| void |
| ilb_conn_hash_fini(ilb_stack_t *ilbs) |
| { |
| uint32_t i; |
| ilb_conn_t *connp; |
| ilb_conn_hash_t *hash; |
| |
| if (ilbs->ilbs_c2s_conn_hash == NULL) { |
| ASSERT(ilbs->ilbs_s2c_conn_hash == NULL); |
| return; |
| } |
| |
| /* Stop all the timers first. */ |
| for (i = 0; i < ilb_conn_timer_size; i++) { |
| timeout_id_t tid; |
| |
| /* Setting tid to 0 tells the timer handler not to restart. */ |
| mutex_enter(&ilbs->ilbs_conn_timer_list[i].tid_lock); |
| tid = ilbs->ilbs_conn_timer_list[i].tid; |
| ilbs->ilbs_conn_timer_list[i].tid = 0; |
| mutex_exit(&ilbs->ilbs_conn_timer_list[i].tid_lock); |
| (void) untimeout(tid); |
| } |
| kmem_free(ilbs->ilbs_conn_timer_list, sizeof (ilb_timer_t) * |
| ilb_conn_timer_size); |
| taskq_destroy(ilbs->ilbs_conn_taskq); |
| ilbs->ilbs_conn_taskq = NULL; |
| |
| /* Then remove all the conns. */ |
| hash = ilbs->ilbs_s2c_conn_hash; |
| for (i = 0; i < ilbs->ilbs_conn_hash_size; i++) { |
| while ((connp = hash[i].ilb_connp) != NULL) { |
| hash[i].ilb_connp = connp->conn_s2c_next; |
| ILB_SERVER_REFRELE(connp->conn_server); |
| if (connp->conn_rule_cache.topo == ILB_TOPO_IMPL_NAT) { |
| ilb_nat_src_entry_t *ent; |
| in_port_t port; |
| |
| /* |
| * src_ent will be freed in ilb_nat_src_fini(). |
| */ |
| port = ntohs( |
| connp->conn_rule_cache.info.nat_sport); |
| ent = connp->conn_rule_cache.info.src_ent; |
| vmem_free(ent->nse_port_arena, |
| (void *)(uintptr_t)port, 1); |
| } |
| kmem_cache_free(ilb_conn_cache, connp); |
| } |
| } |
| kmem_free(ilbs->ilbs_c2s_conn_hash, sizeof (ilb_conn_hash_t) * |
| ilbs->ilbs_conn_hash_size); |
| kmem_free(ilbs->ilbs_s2c_conn_hash, sizeof (ilb_conn_hash_t) * |
| ilbs->ilbs_conn_hash_size); |
| } |
| |
| /* |
| * Internet checksum adjustment calculation routines. We pre-calculate the |
| * checksum adjustment so that we don't need to recompute the checksum over |
| * the whole packet when we change an address/port in the packet. |
| */ |
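| /* |
| * The adjustment computed below is ~(folded sum of the old address/port |
| * fields) + (sum of the new fields), left unfolded in a uint32_t. |
| * ilb_full_nat()/ilb_half_nat(), which are later called with adj_ip_sum |
| * and adj_tp_sum, are then expected to apply it as an incremental checksum |
| * update (in the style of RFC 1624) instead of recomputing the checksum |
| * over the whole packet. |
| */ |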
| |
| static void |
| hnat_cksum_v4(uint16_t *oaddr, uint16_t *naddr, in_port_t old_port, |
| in_port_t new_port, uint32_t *adj_sum) |
| { |
| uint32_t sum; |
| |
| sum = *oaddr + *(oaddr + 1) + old_port; |
| while ((sum >> 16) != 0) |
| sum = (sum & 0xffff) + (sum >> 16); |
| *adj_sum = (uint16_t)~sum + *naddr + *(naddr + 1) + new_port; |
| } |
| |
| static void |
| hnat_cksum_v6(uint16_t *oaddr, uint16_t *naddr, in_port_t old_port, |
| in_port_t new_port, uint32_t *adj_sum) |
| { |
| uint32_t sum = 0; |
| |
| sum = *oaddr + *(oaddr + 1) + *(oaddr + 2) + *(oaddr + 3) + |
| *(oaddr + 4) + *(oaddr + 5) + *(oaddr + 6) + *(oaddr + 7) + |
| old_port; |
| while ((sum >> 16) != 0) |
| sum = (sum & 0xffff) + (sum >> 16); |
| *adj_sum = (uint16_t)~sum + *naddr + *(naddr + 1) + |
| *(naddr + 2) + *(naddr + 3) + *(naddr + 4) + *(naddr + 5) + |
| *(naddr + 6) + *(naddr + 7) + new_port; |
| } |
| |
| static void |
| fnat_cksum_v4(uint16_t *oaddr1, uint16_t *oaddr2, uint16_t *naddr1, |
| uint16_t *naddr2, in_port_t old_port1, in_port_t old_port2, |
| in_port_t new_port1, in_port_t new_port2, uint32_t *adj_sum) |
| { |
| uint32_t sum; |
| |
| sum = *oaddr1 + *(oaddr1 + 1) + old_port1 + *oaddr2 + *(oaddr2 + 1) + |
| old_port2; |
| while ((sum >> 16) != 0) |
| sum = (sum & 0xffff) + (sum >> 16); |
| *adj_sum = (uint16_t)~sum + *naddr1 + *(naddr1 + 1) + new_port1 + |
| *naddr2 + *(naddr2 + 1) + new_port2; |
| } |
| |
| static void |
| fnat_cksum_v6(uint16_t *oaddr1, uint16_t *oaddr2, uint16_t *naddr1, |
| uint16_t *naddr2, in_port_t old_port1, in_port_t old_port2, |
| in_port_t new_port1, in_port_t new_port2, uint32_t *adj_sum) |
| { |
| uint32_t sum = 0; |
| |
| sum = *oaddr1 + *(oaddr1 + 1) + *(oaddr1 + 2) + *(oaddr1 + 3) + |
| *(oaddr1 + 4) + *(oaddr1 + 5) + *(oaddr1 + 6) + *(oaddr1 + 7) + |
| old_port1; |
| sum += *oaddr2 + *(oaddr2 + 1) + *(oaddr2 + 2) + *(oaddr2 + 3) + |
| *(oaddr2 + 4) + *(oaddr2 + 5) + *(oaddr2 + 6) + *(oaddr2 + 7) + |
| old_port2; |
| while ((sum >> 16) != 0) |
| sum = (sum & 0xffff) + (sum >> 16); |
| sum = (uint16_t)~sum + *naddr1 + *(naddr1 + 1) + *(naddr1 + 2) + |
| *(naddr1 + 3) + *(naddr1 + 4) + *(naddr1 + 5) + *(naddr1 + 6) + |
| *(naddr1 + 7) + new_port1; |
| *adj_sum = sum + *naddr2 + *(naddr2 + 1) + *(naddr2 + 2) + |
| *(naddr2 + 3) + *(naddr2 + 4) + *(naddr2 + 5) + *(naddr2 + 6) + |
| *(naddr2 + 7) + new_port2; |
| } |
| |
| /* |
| * Add a conn hash entry to the tables. Note that a conn hash entry |
| * (ilb_conn_t) contains info on both directions. And there are two hash |
| * tables, one for client to server and the other for server to client. |
| * So the same entry is added to both tables and can be accessed by two |
| * threads simultaneously. But each thread will only access data for one |
| * direction, so there is no conflict. |
| */ |
| int |
| ilb_conn_add(ilb_stack_t *ilbs, ilb_rule_t *rule, ilb_server_t *server, |
| in6_addr_t *src, in_port_t sport, in6_addr_t *dst, in_port_t dport, |
| ilb_nat_info_t *info, uint32_t *ip_sum, uint32_t *tp_sum, ilb_sticky_t *s) |
| { |
| ilb_conn_t *connp; |
| ilb_conn_hash_t *hash; |
| int i; |
| |
| connp = kmem_cache_alloc(ilb_conn_cache, KM_NOSLEEP); |
| if (connp == NULL) { |
| if (s != NULL) { |
| if (rule->ir_topo == ILB_TOPO_IMPL_NAT) { |
| ilb_nat_src_entry_t **entry; |
| |
| entry = s->server->iser_nat_src->src_list; |
| vmem_free(entry[s->nat_src_idx]->nse_port_arena, |
| (void *)(uintptr_t)ntohs(info->nat_sport), |
| 1); |
| } |
| ILB_STICKY_REFRELE(s); |
| } |
| return (ENOMEM); |
| } |
| |
| connp->conn_l4 = rule->ir_proto; |
| |
| connp->conn_server = server; |
| ILB_SERVER_REFHOLD(server); |
| connp->conn_sticky = s; |
| |
| connp->conn_rule_cache.topo = rule->ir_topo; |
| connp->conn_rule_cache.info = *info; |
| |
| connp->conn_gc = B_FALSE; |
| |
| connp->conn_expiry = rule->ir_nat_expiry; |
| connp->conn_cr_time = ddi_get_lbolt64(); |
| |
| /* Client to server info. */ |
| connp->conn_c2s_saddr = *src; |
| connp->conn_c2s_sport = sport; |
| connp->conn_c2s_daddr = *dst; |
| connp->conn_c2s_dport = dport; |
| |
| connp->conn_c2s_atime = ddi_get_lbolt64(); |
| /* The packet that triggers this creation should be counted */ |
| connp->conn_c2s_pkt_cnt = 1; |
| connp->conn_c2s_tcp_fin_sent = B_FALSE; |
| connp->conn_c2s_tcp_fin_acked = B_FALSE; |
| |
| /* Server to client info, before NAT */ |
| switch (rule->ir_topo) { |
| case ILB_TOPO_IMPL_HALF_NAT: |
| connp->conn_s2c_saddr = info->nat_dst; |
| connp->conn_s2c_sport = info->nat_dport; |
| connp->conn_s2c_daddr = *src; |
| connp->conn_s2c_dport = sport; |
| |
| /* Pre-calculate checksum changes for both directions */ |
| if (rule->ir_ipver == IPPROTO_IP) { |
| hnat_cksum_v4((uint16_t *)&dst->s6_addr32[3], |
| (uint16_t *)&info->nat_dst.s6_addr32[3], 0, 0, |
| &connp->conn_c2s_ip_sum); |
| hnat_cksum_v4((uint16_t *)&dst->s6_addr32[3], |
| (uint16_t *)&info->nat_dst.s6_addr32[3], dport, |
| info->nat_dport, &connp->conn_c2s_tp_sum); |
| *ip_sum = connp->conn_c2s_ip_sum; |
| *tp_sum = connp->conn_c2s_tp_sum; |
| |
| hnat_cksum_v4( |
| (uint16_t *)&info->nat_dst.s6_addr32[3], |
| (uint16_t *)&dst->s6_addr32[3], 0, 0, |
| &connp->conn_s2c_ip_sum); |
| hnat_cksum_v4( |
| (uint16_t *)&info->nat_dst.s6_addr32[3], |
| (uint16_t *)&dst->s6_addr32[3], |
| info->nat_dport, dport, |
| &connp->conn_s2c_tp_sum); |
| } else { |
| connp->conn_c2s_ip_sum = 0; |
| hnat_cksum_v6((uint16_t *)dst, |
| (uint16_t *)&info->nat_dst, dport, |
| info->nat_dport, &connp->conn_c2s_tp_sum); |
| *ip_sum = 0; |
| *tp_sum = connp->conn_c2s_tp_sum; |
| |
| connp->conn_s2c_ip_sum = 0; |
| hnat_cksum_v6((uint16_t *)&info->nat_dst, |
| (uint16_t *)dst, info->nat_dport, dport, |
| &connp->conn_s2c_tp_sum); |
| } |
| break; |
| case ILB_TOPO_IMPL_NAT: |
| connp->conn_s2c_saddr = info->nat_dst; |
| connp->conn_s2c_sport = info->nat_dport; |
| connp->conn_s2c_daddr = info->nat_src; |
| connp->conn_s2c_dport = info->nat_sport; |
| |
| if (rule->ir_ipver == IPPROTO_IP) { |
| fnat_cksum_v4((uint16_t *)&src->s6_addr32[3], |
| (uint16_t *)&dst->s6_addr32[3], |
| (uint16_t *)&info->nat_src.s6_addr32[3], |
| (uint16_t *)&info->nat_dst.s6_addr32[3], |
| 0, 0, 0, 0, &connp->conn_c2s_ip_sum); |
| fnat_cksum_v4((uint16_t *)&src->s6_addr32[3], |
| (uint16_t *)&dst->s6_addr32[3], |
| (uint16_t *)&info->nat_src.s6_addr32[3], |
| (uint16_t *)&info->nat_dst.s6_addr32[3], |
| sport, dport, info->nat_sport, |
| info->nat_dport, &connp->conn_c2s_tp_sum); |
| *ip_sum = connp->conn_c2s_ip_sum; |
| *tp_sum = connp->conn_c2s_tp_sum; |
| |
| fnat_cksum_v4( |
| (uint16_t *)&info->nat_src.s6_addr32[3], |
| (uint16_t *)&info->nat_dst.s6_addr32[3], |
| (uint16_t *)&src->s6_addr32[3], |
| (uint16_t *)&dst->s6_addr32[3], |
| 0, 0, 0, 0, &connp->conn_s2c_ip_sum); |
| fnat_cksum_v4( |
| (uint16_t *)&info->nat_src.s6_addr32[3], |
| (uint16_t *)&info->nat_dst.s6_addr32[3], |
| (uint16_t *)&src->s6_addr32[3], |
| (uint16_t *)&dst->s6_addr32[3], |
| info->nat_sport, info->nat_dport, |
| sport, dport, &connp->conn_s2c_tp_sum); |
| } else { |
| fnat_cksum_v6((uint16_t *)src, (uint16_t *)dst, |
| (uint16_t *)&info->nat_src, |
| (uint16_t *)&info->nat_dst, |
| sport, dport, info->nat_sport, |
| info->nat_dport, &connp->conn_c2s_tp_sum); |
| connp->conn_c2s_ip_sum = 0; |
| *ip_sum = 0; |
| *tp_sum = connp->conn_c2s_tp_sum; |
| |
| fnat_cksum_v6((uint16_t *)&info->nat_src, |
| (uint16_t *)&info->nat_dst, (uint16_t *)src, |
| (uint16_t *)dst, info->nat_sport, |
| info->nat_dport, sport, dport, |
| &connp->conn_s2c_tp_sum); |
| connp->conn_s2c_ip_sum = 0; |
| } |
| break; |
| } |
| |
| connp->conn_s2c_atime = ddi_get_lbolt64(); |
| connp->conn_s2c_pkt_cnt = 1; |
| connp->conn_s2c_tcp_fin_sent = B_FALSE; |
| connp->conn_s2c_tcp_fin_acked = B_FALSE; |
| |
| /* Add it to the s2c hash table. */ |
| hash = ilbs->ilbs_s2c_conn_hash; |
| i = ILB_CONN_HASH((uint8_t *)&connp->conn_s2c_saddr.s6_addr32[3], |
| ntohs(connp->conn_s2c_sport), |
| (uint8_t *)&connp->conn_s2c_daddr.s6_addr32[3], |
| ntohs(connp->conn_s2c_dport), ilbs->ilbs_conn_hash_size); |
| connp->conn_s2c_hash = &hash[i]; |
| DTRACE_PROBE2(ilb__conn__hash__add__s2c, ilb_conn_t *, connp, int, i); |
| |
| mutex_enter(&hash[i].ilb_conn_hash_lock); |
| hash[i].ilb_conn_cnt++; |
| connp->conn_s2c_next = hash[i].ilb_connp; |
| if (hash[i].ilb_connp != NULL) |
| hash[i].ilb_connp->conn_s2c_prev = connp; |
| connp->conn_s2c_prev = NULL; |
| hash[i].ilb_connp = connp; |
| mutex_exit(&hash[i].ilb_conn_hash_lock); |
| |
| /* Add it to the c2s hash table. */ |
| hash = ilbs->ilbs_c2s_conn_hash; |
| i = ILB_CONN_HASH((uint8_t *)&src->s6_addr32[3], ntohs(sport), |
| (uint8_t *)&dst->s6_addr32[3], ntohs(dport), |
| ilbs->ilbs_conn_hash_size); |
| connp->conn_c2s_hash = &hash[i]; |
| DTRACE_PROBE2(ilb__conn__hash__add__c2s, ilb_conn_t *, connp, int, i); |
| |
| mutex_enter(&hash[i].ilb_conn_hash_lock); |
| hash[i].ilb_conn_cnt++; |
| connp->conn_c2s_next = hash[i].ilb_connp; |
| if (hash[i].ilb_connp != NULL) |
| hash[i].ilb_connp->conn_c2s_prev = connp; |
| connp->conn_c2s_prev = NULL; |
| hash[i].ilb_connp = connp; |
| mutex_exit(&hash[i].ilb_conn_hash_lock); |
| |
| return (0); |
| } |
| |
| /* |
| * If a connection is using TCP, we keep track of simple TCP state transitions |
| * so that we know when to clean up an entry. |
| */ |
| static boolean_t |
| update_conn_tcp(ilb_conn_t *connp, void *iph, tcpha_t *tcpha, int32_t pkt_len, |
| boolean_t c2s) |
| { |
| uint32_t ack, seq; |
| int32_t seg_len; |
| |
| ack = 0; |
| if (tcpha->tha_flags & TH_RST) |
| return (B_FALSE); |
| |
| seg_len = pkt_len - ((uint8_t *)tcpha - (uint8_t *)iph) - |
| TCP_HDR_LENGTH((tcph_t *)tcpha); |
| |
| if (tcpha->tha_flags & TH_ACK) |
| ack = ntohl(tcpha->tha_ack); |
| seq = ntohl(tcpha->tha_seq); |
| if (c2s) { |
| ASSERT(MUTEX_HELD(&connp->conn_c2s_hash->ilb_conn_hash_lock)); |
| if (tcpha->tha_flags & TH_FIN) { |
| connp->conn_c2s_tcp_fss = seq + seg_len; |
| connp->conn_c2s_tcp_fin_sent = B_TRUE; |
| } |
| connp->conn_c2s_tcp_ack = ack; |
| |
| /* Port reuse by the client, restart the conn. */ |
| if (connp->conn_c2s_tcp_fin_sent && |
| SEQ_GT(seq, connp->conn_c2s_tcp_fss + 1)) { |
| connp->conn_c2s_tcp_fin_sent = B_FALSE; |
| connp->conn_c2s_tcp_fin_acked = B_FALSE; |
| } |
| } else { |
| ASSERT(MUTEX_HELD(&connp->conn_s2c_hash->ilb_conn_hash_lock)); |
| if (tcpha->tha_flags & TH_FIN) { |
| connp->conn_s2c_tcp_fss = seq + seg_len; |
| connp->conn_s2c_tcp_fin_sent = B_TRUE; |
| } |
| connp->conn_s2c_tcp_ack = ack; |
| |
| /* Port reuse by the client, restart the conn. */ |
| if (connp->conn_s2c_tcp_fin_sent && |
| SEQ_GT(seq, connp->conn_s2c_tcp_fss + 1)) { |
| connp->conn_s2c_tcp_fin_sent = B_FALSE; |
| connp->conn_s2c_tcp_fin_acked = B_FALSE; |
| } |
| } |
| |
| return (B_TRUE); |
| } |
| |
| /* |
| * Helper routine to find a conn hash entry given some packet information and |
| * the traffic direction (c2s: is the packet going from client to server?). |
| */ |
| static boolean_t |
| ilb_find_conn(ilb_stack_t *ilbs, void *iph, void *tph, int l4, in6_addr_t *src, |
| in_port_t sport, in6_addr_t *dst, in_port_t dport, |
| ilb_rule_info_t *rule_cache, uint32_t *ip_sum, uint32_t *tp_sum, |
| int32_t pkt_len, boolean_t c2s) |
| { |
| ilb_conn_hash_t *hash; |
| uint_t i; |
| ilb_conn_t *connp; |
| boolean_t tcp_alive; |
| boolean_t ret = B_FALSE; |
| |
| i = ILB_CONN_HASH((uint8_t *)&src->s6_addr32[3], ntohs(sport), |
| (uint8_t *)&dst->s6_addr32[3], ntohs(dport), |
| ilbs->ilbs_conn_hash_size); |
| if (c2s) { |
| hash = ilbs->ilbs_c2s_conn_hash; |
| mutex_enter(&hash[i].ilb_conn_hash_lock); |
| for (connp = hash[i].ilb_connp; connp != NULL; |
| connp = connp->conn_c2s_next) { |
| if (connp->conn_l4 == l4 && |
| connp->conn_c2s_dport == dport && |
| connp->conn_c2s_sport == sport && |
| IN6_ARE_ADDR_EQUAL(src, &connp->conn_c2s_saddr) && |
| IN6_ARE_ADDR_EQUAL(dst, &connp->conn_c2s_daddr)) { |
| connp->conn_c2s_atime = ddi_get_lbolt64(); |
| connp->conn_c2s_pkt_cnt++; |
| *rule_cache = connp->conn_rule_cache; |
| *ip_sum = connp->conn_c2s_ip_sum; |
| *tp_sum = connp->conn_c2s_tp_sum; |
| ret = B_TRUE; |
| break; |
| } |
| } |
| } else { |
| hash = ilbs->ilbs_s2c_conn_hash; |
| mutex_enter(&hash[i].ilb_conn_hash_lock); |
| for (connp = hash[i].ilb_connp; connp != NULL; |
| connp = connp->conn_s2c_next) { |
| if (connp->conn_l4 == l4 && |
| connp->conn_s2c_dport == dport && |
| connp->conn_s2c_sport == sport && |
| IN6_ARE_ADDR_EQUAL(src, &connp->conn_s2c_saddr) && |
| IN6_ARE_ADDR_EQUAL(dst, &connp->conn_s2c_daddr)) { |
| connp->conn_s2c_atime = ddi_get_lbolt64(); |
| connp->conn_s2c_pkt_cnt++; |
| *rule_cache = connp->conn_rule_cache; |
| *ip_sum = connp->conn_s2c_ip_sum; |
| *tp_sum = connp->conn_s2c_tp_sum; |
| ret = B_TRUE; |
| break; |
| } |
| } |
| } |
| if (ret) { |
| ILB_S_KSTAT(connp->conn_server, pkt_processed); |
| ILB_S_KSTAT_UPDATE(connp->conn_server, bytes_processed, |
| pkt_len); |
| |
| switch (l4) { |
| case (IPPROTO_TCP): |
| tcp_alive = update_conn_tcp(connp, iph, tph, pkt_len, |
| c2s); |
| if (!tcp_alive) { |
| connp->conn_gc = B_TRUE; |
| } |
| break; |
| default: |
| break; |
| } |
| } |
| mutex_exit(&hash[i].ilb_conn_hash_lock); |
| |
| return (ret); |
| } |
| |
| /* |
| * To check if a given packet matches an existing conn hash entry. If it |
| * does, return the information about this entry so that the caller can |
| * do the proper NAT. |
| */ |
| boolean_t |
| ilb_check_conn(ilb_stack_t *ilbs, int l3, void *iph, int l4, void *tph, |
| in6_addr_t *src, in6_addr_t *dst, in_port_t sport, in_port_t dport, |
| uint32_t pkt_len, in6_addr_t *lb_dst) |
| { |
| ilb_rule_info_t rule_cache; |
| uint32_t adj_ip_sum, adj_tp_sum; |
| boolean_t ret; |
| |
| /* Check the incoming hash table. */ |
| if (ilb_find_conn(ilbs, iph, tph, l4, src, sport, dst, dport, |
| &rule_cache, &adj_ip_sum, &adj_tp_sum, pkt_len, B_TRUE)) { |
| switch (rule_cache.topo) { |
| case ILB_TOPO_IMPL_NAT: |
| *lb_dst = rule_cache.info.nat_dst; |
| ilb_full_nat(l3, iph, l4, tph, &rule_cache.info, |
| adj_ip_sum, adj_tp_sum, B_TRUE); |
| ret = B_TRUE; |
| break; |
| case ILB_TOPO_IMPL_HALF_NAT: |
| *lb_dst = rule_cache.info.nat_dst; |
| ilb_half_nat(l3, iph, l4, tph, &rule_cache.info, |
| adj_ip_sum, adj_tp_sum, B_TRUE); |
| ret = B_TRUE; |
| break; |
| default: |
| ret = B_FALSE; |
| break; |
| } |
| return (ret); |
| } |
| if (ilb_find_conn(ilbs, iph, tph, l4, src, sport, dst, dport, |
| &rule_cache, &adj_ip_sum, &adj_tp_sum, pkt_len, B_FALSE)) { |
| switch (rule_cache.topo) { |
| case ILB_TOPO_IMPL_NAT: |
| *lb_dst = rule_cache.info.src; |
| ilb_full_nat(l3, iph, l4, tph, &rule_cache.info, |
| adj_ip_sum, adj_tp_sum, B_FALSE); |
| ret = B_TRUE; |
| break; |
| case ILB_TOPO_IMPL_HALF_NAT: |
| *lb_dst = *dst; |
| ilb_half_nat(l3, iph, l4, tph, &rule_cache.info, |
| adj_ip_sum, adj_tp_sum, B_FALSE); |
| ret = B_TRUE; |
| break; |
| default: |
| ret = B_FALSE; |
| break; |
| } |
| return (ret); |
| } |
| |
| return (B_FALSE); |
| } |
| |
| /* |
| * To check if an ICMP packet belongs to a connection tracked in the conn |
| * hash table. |
| */ |
| boolean_t |
| ilb_check_icmp_conn(ilb_stack_t *ilbs, mblk_t *mp, int l3, void *out_iph, |
| void *icmph, in6_addr_t *lb_dst) |
| { |
| ilb_conn_hash_t *hash; |
| ipha_t *in_iph4; |
| ip6_t *in_iph6; |
| icmph_t *icmph4; |
| icmp6_t *icmph6; |
| in6_addr_t *in_src_p, *in_dst_p; |
| in_port_t *sport, *dport; |
| int l4; |
| uint_t i; |
| ilb_conn_t *connp; |
| ilb_rule_info_t rule_cache; |
| uint32_t adj_ip_sum; |
| boolean_t full_nat; |
| |
| in_iph4 = NULL; |
| in_iph6 = NULL; |
| icmph4 = NULL; |
| icmph6 = NULL; |
| |
| if (l3 == IPPROTO_IP) { |
| in6_addr_t in_src, in_dst; |
| |
| icmph4 = (icmph_t *)icmph; |
| in_iph4 = (ipha_t *)&icmph4[1]; |
| |
| if ((uint8_t *)in_iph4 + IPH_HDR_LENGTH(in_iph4) + |
| ICMP_MIN_TP_HDR_LEN > mp->b_wptr) { |
| return (B_FALSE); |
| } |
| |
| IN6_IPADDR_TO_V4MAPPED(in_iph4->ipha_src, &in_src); |
| in_src_p = &in_src; |
| IN6_IPADDR_TO_V4MAPPED(in_iph4->ipha_dst, &in_dst); |
| in_dst_p = &in_dst; |
| |
| l4 = in_iph4->ipha_protocol; |
| if (l4 != IPPROTO_TCP && l4 != IPPROTO_UDP) |
| return (B_FALSE); |
| |
| sport = (in_port_t *)((char *)in_iph4 + |
| IPH_HDR_LENGTH(in_iph4)); |
| dport = sport + 1; |
| |
| DTRACE_PROBE4(ilb__chk__icmp__conn__v4, uint32_t, |
| in_iph4->ipha_src, uint32_t, in_iph4->ipha_dst, uint16_t, |
| ntohs(*sport), uint16_t, ntohs(*dport)); |
| } else { |
| ASSERT(l3 == IPPROTO_IPV6); |
| |
| icmph6 = (icmp6_t *)icmph; |
| in_iph6 = (ip6_t *)&icmph6[1]; |
| in_src_p = &in_iph6->ip6_src; |
| in_dst_p = &in_iph6->ip6_dst; |
| |
| if ((uint8_t *)in_iph6 + sizeof (ip6_t) + |
| ICMP_MIN_TP_HDR_LEN > mp->b_wptr) { |
| return (B_FALSE); |
| } |
| |
| l4 = in_iph6->ip6_nxt; |
| /* We don't go deep inside an IPv6 packet yet. */ |
| if (l4 != IPPROTO_TCP && l4 != IPPROTO_UDP) |
| return (B_FALSE); |
| |
| sport = (in_port_t *)&in_iph6[1]; |
| dport = sport + 1; |
| |
| DTRACE_PROBE4(ilb__chk__icmp__conn__v6, in6_addr_t *, |
| &in_iph6->ip6_src, in6_addr_t *, &in_iph6->ip6_dst, |
| uint16_t, ntohs(*sport), uint16_t, ntohs(*dport)); |
| } |
| |
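| /* |
| * The packet quoted in the ICMP error is one going in the server-to-client |
| * direction (ICMP errors are returned to that packet's source), so its |
| * inner source/destination are matched against the c2s entry's |
| * destination/source below. |
| */ |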
| i = ILB_CONN_HASH((uint8_t *)&in_dst_p->s6_addr32[3], ntohs(*dport), |
| (uint8_t *)&in_src_p->s6_addr32[3], ntohs(*sport), |
| ilbs->ilbs_conn_hash_size); |
| hash = ilbs->ilbs_c2s_conn_hash; |
| |
| mutex_enter(&hash[i].ilb_conn_hash_lock); |
| for (connp = hash[i].ilb_connp; connp != NULL; |
| connp = connp->conn_c2s_next) { |
| if (connp->conn_l4 == l4 && |
| connp->conn_c2s_dport == *sport && |
| connp->conn_c2s_sport == *dport && |
| IN6_ARE_ADDR_EQUAL(in_dst_p, &connp->conn_c2s_saddr) && |
| IN6_ARE_ADDR_EQUAL(in_src_p, &connp->conn_c2s_daddr)) { |
| connp->conn_c2s_atime = ddi_get_lbolt64(); |
| connp->conn_c2s_pkt_cnt++; |
| rule_cache = connp->conn_rule_cache; |
| adj_ip_sum = connp->conn_c2s_ip_sum; |
| break; |
| } |
| } |
| mutex_exit(&hash[i].ilb_conn_hash_lock); |
| |
| if (connp == NULL) { |
| DTRACE_PROBE(ilb__chk__icmp__conn__failed); |
| return (B_FALSE); |
| } |
| |
| switch (rule_cache.topo) { |
| case ILB_TOPO_IMPL_NAT: |
| full_nat = B_TRUE; |
| break; |
| case ILB_TOPO_IMPL_HALF_NAT: |
| full_nat = B_FALSE; |
| break; |
| default: |
| return (B_FALSE); |
| } |
| |
| *lb_dst = rule_cache.info.nat_dst; |
| if (l3 == IPPROTO_IP) { |
| ilb_nat_icmpv4(mp, out_iph, icmph4, in_iph4, sport, dport, |
| &rule_cache.info, adj_ip_sum, full_nat); |
| } else { |
| ilb_nat_icmpv6(mp, out_iph, icmph6, in_iph6, sport, dport, |
| &rule_cache.info, full_nat); |
| } |
| return (B_TRUE); |
| } |
| |
| /* |
| * This routine sends up the conn hash table to user land. Note that the |
| * request is an ioctl, hence we cannot really differentiate requests |
| * from different clients. There is no context shared between different |
| * ioctls. Here we make the assumption that the user land ilbd will |
| * only allow one client to show the conn hash table at any time. |
| * Otherwise, the results will be "very" inconsistent. |
| * |
| * In each ioctl, a flag (ILB_LIST_BEGIN) indicates whether the client wants |
| * to read from the beginning of the table. After a certain number of |
| * entries are reported, the kernel remembers the position of the last |
| * returned entry. When the next ioctl comes in with the ILB_LIST_CONT |
| * flag, it will return entries starting from where it left off. When |
| * the end of the table is reached, a flag (ILB_LIST_END) is set to tell |
| * the client that there are no more entries. |
| * |
| * It is assumed that the caller has checked the size of nat so that it |
| * can hold num entries. |
| */ |
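| /* |
| * A sketch of the expected iteration over this interface (the flag names |
| * are the ones tested below; the actual requests come from ilbd via ioctl): |
| * |
| *     flags = ILB_LIST_BEGIN; |
| *     for (;;) { |
| *             num = <capacity of the nat array>; |
| *             if (ilb_list_nat(ilbs, zoneid, nat, &num, &flags) != 0) |
| *                     break; |
| *             <consume num entries> |
| *             if (flags & ILB_LIST_END) |
| *                     break; |
| *             flags = ILB_LIST_CONT; |
| *     } |
| */ |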
| /* ARGSUSED */ |
| int |
| ilb_list_nat(ilb_stack_t *ilbs, zoneid_t zoneid, ilb_nat_entry_t *nat, |
| uint32_t *num, uint32_t *flags) |
| { |
| ilb_conn_hash_t *hash; |
| ilb_conn_t *cur_connp; |
| uint32_t i, j; |
| int ret = 0; |
| |
| mutex_enter(&ilbs->ilbs_conn_list_lock); |
| while (ilbs->ilbs_conn_list_busy) { |
| if (cv_wait_sig(&ilbs->ilbs_conn_list_cv, |
| &ilbs->ilbs_conn_list_lock) == 0) { |
| mutex_exit(&ilbs->ilbs_conn_list_lock); |
| return (EINTR); |
| } |
| } |
| if ((hash = ilbs->ilbs_c2s_conn_hash) == NULL) { |
| ASSERT(ilbs->ilbs_s2c_conn_hash == NULL); |
| mutex_exit(&ilbs->ilbs_conn_list_lock); |
| *num = 0; |
| *flags |= ILB_LIST_END; |
| return (0); |
| } |
| ilbs->ilbs_conn_list_busy = B_TRUE; |
| mutex_exit(&ilbs->ilbs_conn_list_lock); |
| |
| if (*flags & ILB_LIST_BEGIN) { |
| i = 0; |
| mutex_enter(&hash[0].ilb_conn_hash_lock); |
| cur_connp = hash[0].ilb_connp; |
| } else if (*flags & ILB_LIST_CONT) { |
| if (ilbs->ilbs_conn_list_cur == ilbs->ilbs_conn_hash_size) { |
| *num = 0; |
| *flags |= ILB_LIST_END; |
| goto done; |
| } |
| i = ilbs->ilbs_conn_list_cur; |
| mutex_enter(&hash[i].ilb_conn_hash_lock); |
| cur_connp = ilbs->ilbs_conn_list_connp; |
| } else { |
| ret = EINVAL; |
| goto done; |
| } |
| |
| j = 0; |
| while (j < *num) { |
| if (cur_connp == NULL) { |
| mutex_exit(&hash[i].ilb_conn_hash_lock); |
| if (++i == ilbs->ilbs_conn_hash_size) { |
| *flags |= ILB_LIST_END; |
| break; |
| } |
| mutex_enter(&hash[i].ilb_conn_hash_lock); |
| cur_connp = hash[i].ilb_connp; |
| continue; |
| } |
| nat[j].proto = cur_connp->conn_l4; |
| |
| nat[j].in_global = cur_connp->conn_c2s_daddr; |
| nat[j].in_global_port = cur_connp->conn_c2s_dport; |
| nat[j].out_global = cur_connp->conn_c2s_saddr; |
| nat[j].out_global_port = cur_connp->conn_c2s_sport; |
| |
| nat[j].in_local = cur_connp->conn_s2c_saddr; |
| nat[j].in_local_port = cur_connp->conn_s2c_sport; |
| nat[j].out_local = cur_connp->conn_s2c_daddr; |
| nat[j].out_local_port = cur_connp->conn_s2c_dport; |
| |
| nat[j].create_time = TICK_TO_MSEC(cur_connp->conn_cr_time); |
| nat[j].last_access_time = |
| TICK_TO_MSEC(cur_connp->conn_c2s_atime); |
| |
| /* |
| * The conn_s2c_pkt_cnt may not be accurate since we are not |
| * holding the s2c hash lock. |
| */ |
| nat[j].pkt_cnt = cur_connp->conn_c2s_pkt_cnt + |
| cur_connp->conn_s2c_pkt_cnt; |
| j++; |
| |
| cur_connp = cur_connp->conn_c2s_next; |
| } |
| ilbs->ilbs_conn_list_connp = cur_connp; |
| if (j == *num) |
| mutex_exit(&hash[i].ilb_conn_hash_lock); |
| |
| ilbs->ilbs_conn_list_cur = i; |
| |
| *num = j; |
| done: |
| mutex_enter(&ilbs->ilbs_conn_list_lock); |
| ilbs->ilbs_conn_list_busy = B_FALSE; |
| cv_signal(&ilbs->ilbs_conn_list_cv); |
| mutex_exit(&ilbs->ilbs_conn_list_lock); |
| |
| return (ret); |
| } |
| |
| |
| /* |
| * Stickiness (persistence) handling routines. |
| */ |
| |
| |
| static void |
| ilb_sticky_cache_init(void) |
| { |
| ilb_sticky_cache = kmem_cache_create("ilb_sticky_cache", |
| sizeof (ilb_sticky_t), 0, NULL, NULL, NULL, NULL, NULL, |
| ilb_kmem_flags); |
| } |
| |
| void |
| ilb_sticky_cache_fini(void) |
| { |
| if (ilb_sticky_cache != NULL) { |
| kmem_cache_destroy(ilb_sticky_cache); |
| ilb_sticky_cache = NULL; |
| } |
| } |
| |
| void |
| ilb_sticky_refrele(ilb_sticky_t *s) |
| { |
| ILB_STICKY_REFRELE(s); |
| } |
| |
| static ilb_sticky_t * |
| ilb_sticky_lookup(ilb_sticky_hash_t *hash, ilb_rule_t *rule, in6_addr_t *src) |
| { |
| ilb_sticky_t *s; |
| |
| ASSERT(mutex_owned(&hash->sticky_lock)); |
| |
| for (s = list_head(&hash->sticky_head); s != NULL; |
| s = list_next(&hash->sticky_head, s)) { |
| if (s->rule_instance == rule->ir_ks_instance) { |
| if (IN6_ARE_ADDR_EQUAL(src, &s->src)) |
| return (s); |
| } |
| } |
| return (NULL); |
| } |
| |
| static ilb_sticky_t * |
| ilb_sticky_add(ilb_sticky_hash_t *hash, ilb_rule_t *rule, ilb_server_t *server, |
| in6_addr_t *src) |
| { |
| ilb_sticky_t *s; |
| |
| ASSERT(mutex_owned(&hash->sticky_lock)); |
| |
| if ((s = kmem_cache_alloc(ilb_sticky_cache, KM_NOSLEEP)) == NULL) |
| return (NULL); |
| |
| /* |
| * The rule instance is for handling the scenario when the same |
| * client talks to different rules at the same time. Stickiness |
| * is per rule so we can use the rule instance to differentiate |
| * the client's request. |
| */ |
| s->rule_instance = rule->ir_ks_instance; |
| /* |
| * Copy the rule name for listing all sticky cache entries. ir_name |
| * is guaranteed to be NULL terminated. |
| */ |
| (void) strcpy(s->rule_name, rule->ir_name); |
| s->server = server; |
| |
| /* |
| * Grab a ref cnt on the server so that it won't go away while |
| * it is still in the sticky table. |
| */ |
| ILB_SERVER_REFHOLD(server); |
| s->src = *src; |
| s->expiry = rule->ir_sticky_expiry; |
| s->refcnt = 1; |
| s->hash = hash; |
| |
| /* |
| * There is no need to set atime here since the refcnt is not |
| * zero. A sticky entry is removed only when the refcnt is |
| * zero. But just set it here for debugging purposes. The |
| * atime is set when a refrele is done on a sticky entry. |
| */ |
| s->atime = ddi_get_lbolt64(); |
| |
| list_insert_head(&hash->sticky_head, s); |
| hash->sticky_cnt++; |
| return (s); |
| } |
| |
| /* |
| * This routine checks if there is an existing sticky entry which matches |
| * a given packet. If there is one, return it. If there is not, create |
| * a sticky entry using the packet's info. |
| */ |
| ilb_server_t * |
| ilb_sticky_find_add(ilb_stack_t *ilbs, ilb_rule_t *rule, in6_addr_t *src, |
| ilb_server_t *server, ilb_sticky_t **res, uint16_t *src_ent_idx) |
| { |
| int i; |
| ilb_sticky_hash_t *hash; |
| ilb_sticky_t *s; |
| |
| ASSERT(server != NULL); |
| |
| *res = NULL; |
| |
| i = ILB_STICKY_HASH((uint8_t *)&src->s6_addr32[3], |
| (uint32_t)(uintptr_t)rule, ilbs->ilbs_sticky_hash_size); |
| hash = &ilbs->ilbs_sticky_hash[i]; |
| |
| /* First check if there is already an entry. */ |
| mutex_enter(&hash->sticky_lock); |
| s = ilb_sticky_lookup(hash, rule, src); |
| |
| /* No sticky entry, add one. */ |
| if (s == NULL) { |
| add_new_entry: |
| s = ilb_sticky_add(hash, rule, server, src); |
| if (s == NULL) { |
| mutex_exit(&hash->sticky_lock); |
| return (NULL); |
| } |
| /* |
| * Find a source for this server. All subsequent requests from |
| * the same client matching this sticky entry will use this |
| * source address in doing NAT. The current algorithm is |
| * simple: rotate among the source addresses. Note that the |
| * source address array does not change after it's created, so |
| * it is OK to just increment the cur index. |
| */ |
| if (server->iser_nat_src != NULL) { |
| /* It is a hint, does not need to be atomic. */ |
| *src_ent_idx = (server->iser_nat_src->cur++ % |
| server->iser_nat_src->num_src); |
| s->nat_src_idx = *src_ent_idx; |
| } |
| mutex_exit(&hash->sticky_lock); |
| *res = s; |
| return (server); |
| } |
| |
| /* |
| * We don't hold any lock when accessing iser_enabled. Refer to the |
| * comment in ilb_server_add() about iser_lock. |
| */ |
| if (!s->server->iser_enabled) { |
| /* |
| * s->server == server can only happen if there is a race in |
| * toggling the iser_enabled flag (we don't hold a lock doing |
| * that) so that the load balance algorithm still returns a |
| * disabled server. In this case, just drop the packet... |
| */ |
| if (s->server == server) { |
| mutex_exit(&hash->sticky_lock); |
| return (NULL); |
| } |
| |
| /* |
| * The old server is disabled and there is a new server, use |
| * the new one to create a sticky entry. Since we will |
| * add the entry at the beginning, subsequent lookup will |
| * find this new entry instead of the old one. |
| */ |
| goto add_new_entry; |
| } |
| |
| s->refcnt++; |
| *res = s; |
| mutex_exit(&hash->sticky_lock); |
| if (server->iser_nat_src != NULL) |
| *src_ent_idx = s->nat_src_idx; |
| return (s->server); |
| } |
| |
| static void |
| ilb_sticky_cleanup(void *arg) |
| { |
| ilb_timer_t *timer = (ilb_timer_t *)arg; |
| uint32_t i; |
| ilb_stack_t *ilbs; |
| ilb_sticky_hash_t *hash; |
| ilb_sticky_t *s, *nxt_s; |
| int64_t now, expiry; |
| |
| ilbs = timer->ilbs; |
| hash = ilbs->ilbs_sticky_hash; |
| ASSERT(hash != NULL); |
| |
| now = ddi_get_lbolt64(); |
| for (i = timer->start; i < timer->end; i++) { |
| mutex_enter(&hash[i].sticky_lock); |
| for (s = list_head(&hash[i].sticky_head); s != NULL; |
| s = nxt_s) { |
| nxt_s = list_next(&hash[i].sticky_head, s); |
| if (s->refcnt != 0) |
| continue; |
| expiry = now - SEC_TO_TICK(s->expiry); |
| if (s->atime < expiry) { |
| ILB_SERVER_REFRELE(s->server); |
| list_remove(&hash[i].sticky_head, s); |
| kmem_cache_free(ilb_sticky_cache, s); |
| hash[i].sticky_cnt--; |
| } |
| } |
| mutex_exit(&hash[i].sticky_lock); |
| } |
| } |
| |
| static void |
| ilb_sticky_timer(void *arg) |
| { |
| ilb_timer_t *timer = (ilb_timer_t *)arg; |
| |
| (void) taskq_dispatch(timer->ilbs->ilbs_sticky_taskq, |
| ilb_sticky_cleanup, arg, TQ_SLEEP); |
| mutex_enter(&timer->tid_lock); |
| if (timer->tid == 0) { |
| mutex_exit(&timer->tid_lock); |
| } else { |
| timer->tid = timeout(ilb_sticky_timer, arg, |
| SEC_TO_TICK(ilb_sticky_timeout)); |
| mutex_exit(&timer->tid_lock); |
| } |
| } |
| |
| void |
| ilb_sticky_hash_init(ilb_stack_t *ilbs) |
| { |
| extern pri_t minclsyspri; |
| int i, part; |
| char tq_name[TASKQ_NAMELEN]; |
| ilb_timer_t *tm; |
| |
| if (!ISP2(ilbs->ilbs_sticky_hash_size)) { |
| for (i = 0; i < 31; i++) { |
| if (ilbs->ilbs_sticky_hash_size < (1 << i)) |
| break; |
| } |
| ilbs->ilbs_sticky_hash_size = 1 << i; |
| } |
| |
| ilbs->ilbs_sticky_hash = kmem_zalloc(sizeof (ilb_sticky_hash_t) * |
| ilbs->ilbs_sticky_hash_size, KM_SLEEP); |
| for (i = 0; i < ilbs->ilbs_sticky_hash_size; i++) { |
| mutex_init(&ilbs->ilbs_sticky_hash[i].sticky_lock, NULL, |
| MUTEX_DEFAULT, NULL); |
| list_create(&ilbs->ilbs_sticky_hash[i].sticky_head, |
| sizeof (ilb_sticky_t), |
| offsetof(ilb_sticky_t, list)); |
| } |
| |
| if (ilb_sticky_cache == NULL) |
| ilb_sticky_cache_init(); |
| |
| (void) snprintf(tq_name, sizeof (tq_name), "ilb_sticky_taskq_%p", |
| (void *)ilbs->ilbs_netstack); |
| ASSERT(ilbs->ilbs_sticky_taskq == NULL); |
| ilbs->ilbs_sticky_taskq = taskq_create(tq_name, |
| ilb_sticky_timer_size * 2, minclsyspri, ilb_sticky_timer_size, |
| ilb_sticky_timer_size * 2, TASKQ_PREPOPULATE|TASKQ_DYNAMIC); |
| |
| ASSERT(ilbs->ilbs_sticky_timer_list == NULL); |
| ilbs->ilbs_sticky_timer_list = kmem_zalloc(sizeof (ilb_timer_t) * |
| ilb_sticky_timer_size, KM_SLEEP); |
| part = ilbs->ilbs_sticky_hash_size / ilb_sticky_timer_size + 1; |
| for (i = 0; i < ilb_sticky_timer_size; i++) { |
| tm = ilbs->ilbs_sticky_timer_list + i; |
| tm->start = i * part; |
| tm->end = i * part + part; |
| if (tm->end > ilbs->ilbs_sticky_hash_size) |
| tm->end = ilbs->ilbs_sticky_hash_size; |
| tm->ilbs = ilbs; |
| mutex_init(&tm->tid_lock, NULL, MUTEX_DEFAULT, NULL); |
| /* Spread out the starting execution time of all the timers. */ |
| tm->tid = timeout(ilb_sticky_timer, tm, |
| SEC_TO_TICK(ilb_sticky_timeout + i)); |
| } |
| } |
| |
| void |
| ilb_sticky_hash_fini(ilb_stack_t *ilbs) |
| { |
| int i; |
| ilb_sticky_t *s; |
| |
| if (ilbs->ilbs_sticky_hash == NULL) |
| return; |
| |
| /* Stop all the timers first. */ |
| for (i = 0; i < ilb_sticky_timer_size; i++) { |
| timeout_id_t tid; |
| |
| /* Setting tid to 0 tells the timer handler not to restart. */ |
| mutex_enter(&ilbs->ilbs_sticky_timer_list[i].tid_lock); |
| tid = ilbs->ilbs_sticky_timer_list[i].tid; |
| ilbs->ilbs_sticky_timer_list[i].tid = 0; |
| mutex_exit(&ilbs->ilbs_sticky_timer_list[i].tid_lock); |
| (void) untimeout(tid); |
| } |
| kmem_free(ilbs->ilbs_sticky_timer_list, sizeof (ilb_timer_t) * |
| ilb_sticky_timer_size); |
| taskq_destroy(ilbs->ilbs_sticky_taskq); |
| ilbs->ilbs_sticky_taskq = NULL; |
| |
| for (i = 0; i < ilbs->ilbs_sticky_hash_size; i++) { |
| while ((s = list_head(&ilbs->ilbs_sticky_hash[i].sticky_head)) |
| != NULL) { |
| list_remove(&ilbs->ilbs_sticky_hash[i].sticky_head, s); |
| ILB_SERVER_REFRELE(s->server); |
| kmem_cache_free(ilb_sticky_cache, s); |
| } |
| } |
| kmem_free(ilbs->ilbs_sticky_hash, ilbs->ilbs_sticky_hash_size * |
| sizeof (ilb_sticky_hash_t)); |
| } |
| |
| /* |
| * This routine sends up the sticky hash table to user land. Refer to |
| * the comments before ilb_list_nat(). Both routines assume similar |
| * conditions. |
| * |
| * It is assumed that the caller has checked the size of st so that it |
| * can hold num entries. |
| */ |
| /* ARGSUSED */ |
| int |
| ilb_list_sticky(ilb_stack_t *ilbs, zoneid_t zoneid, ilb_sticky_entry_t *st, |
| uint32_t *num, uint32_t *flags) |
| { |
| ilb_sticky_hash_t *hash; |
| ilb_sticky_t *curp; |
| uint32_t i, j; |
| int ret = 0; |
| |
| mutex_enter(&ilbs->ilbs_sticky_list_lock); |
| while (ilbs->ilbs_sticky_list_busy) { |
| if (cv_wait_sig(&ilbs->ilbs_sticky_list_cv, |
| &ilbs->ilbs_sticky_list_lock) == 0) { |
| mutex_exit(&ilbs->ilbs_sticky_list_lock); |
| return (EINTR); |
| } |
| } |
| if ((hash = ilbs->ilbs_sticky_hash) == NULL) { |
| mutex_exit(&ilbs->ilbs_sticky_list_lock); |
| *num = 0; |
| *flags |= ILB_LIST_END; |
| return (0); |
| } |
| ilbs->ilbs_sticky_list_busy = B_TRUE; |
| mutex_exit(&ilbs->ilbs_sticky_list_lock); |
| |
| if (*flags & ILB_LIST_BEGIN) { |
| i = 0; |
| mutex_enter(&hash[0].sticky_lock); |
| curp = list_head(&hash[0].sticky_head); |
| } else if (*flags & ILB_LIST_CONT) { |
| if (ilbs->ilbs_sticky_list_cur == ilbs->ilbs_sticky_hash_size) { |
| *num = 0; |
| *flags |= ILB_LIST_END; |
| goto done; |
| } |
| i = ilbs->ilbs_sticky_list_cur; |
| mutex_enter(&hash[i].sticky_lock); |
| curp = ilbs->ilbs_sticky_list_curp; |
| } else { |
| ret = EINVAL; |
| goto done; |
| } |
| |
| j = 0; |
| while (j < *num) { |
| if (curp == NULL) { |
| mutex_exit(&hash[i].sticky_lock); |
| if (++i == ilbs->ilbs_sticky_hash_size) { |
| *flags |= ILB_LIST_END; |
| break; |
| } |
| mutex_enter(&hash[i].sticky_lock); |
| curp = list_head(&hash[i].sticky_head); |
| continue; |
| } |
| (void) strcpy(st[j].rule_name, curp->rule_name); |
| st[j].req_addr = curp->src; |
| st[j].srv_addr = curp->server->iser_addr_v6; |
| st[j].expiry_time = TICK_TO_MSEC(curp->expiry); |
| j++; |
| curp = list_next(&hash[i].sticky_head, curp); |
| } |
| ilbs->ilbs_sticky_list_curp = curp; |
| if (j == *num) |
| mutex_exit(&hash[i].sticky_lock); |
| |
| ilbs->ilbs_sticky_list_cur = i; |
| |
| *num = j; |
| done: |
| mutex_enter(&ilbs->ilbs_sticky_list_lock); |
| ilbs->ilbs_sticky_list_busy = B_FALSE; |
| cv_signal(&ilbs->ilbs_sticky_list_cv); |
| mutex_exit(&ilbs->ilbs_sticky_list_lock); |
| |
| return (ret); |
| } |