| /* |
| * CDDL HEADER START |
| * |
| * The contents of this file are subject to the terms of the |
| * Common Development and Distribution License (the "License"). |
| * You may not use this file except in compliance with the License. |
| * |
| * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE |
| * or http://www.opensolaris.org/os/licensing. |
| * See the License for the specific language governing permissions |
| * and limitations under the License. |
| * |
| * When distributing Covered Code, include this CDDL HEADER in each |
| * file and include the License file at usr/src/OPENSOLARIS.LICENSE. |
| * If applicable, add the following below this CDDL HEADER, with the |
| * fields enclosed by brackets "[]" replaced with your own identifying |
| * information: Portions Copyright [yyyy] [name of copyright owner] |
| * |
| * CDDL HEADER END |
| */ |
| |
| /* |
| * Copyright 2009 Sun Microsystems, Inc. All rights reserved. |
| * Use is subject to license terms. |
| */ |
| |
| #include <sys/sysmacros.h> |
| #include <sys/kmem.h> |
| #include <sys/ksynch.h> |
| #include <sys/systm.h> |
| #include <sys/socket.h> |
| #include <sys/disp.h> |
| #include <sys/taskq.h> |
| #include <sys/cmn_err.h> |
| #include <sys/strsun.h> |
| #include <sys/sdt.h> |
| #include <sys/atomic.h> |
| #include <netinet/in.h> |
| #include <inet/ip.h> |
| #include <inet/ip6.h> |
| #include <inet/tcp.h> |
| #include <inet/udp_impl.h> |
| #include <inet/kstatcom.h> |
| |
| #include <inet/ilb_ip.h> |
| #include "ilb_alg.h" |
| #include "ilb_nat.h" |
| #include "ilb_conn.h" |
| |
| /* ILB kmem cache flag */ |
| int ilb_kmem_flags = 0; |
| |
| /* |
 * The default sizes for the different hash tables. These defaults are global
 * for all stacks, but each stack allocates its own tables of these sizes.
| */ |
| static size_t ilb_rule_hash_size = 2048; |
| |
| static size_t ilb_conn_hash_size = 262144; |
| |
| static size_t ilb_sticky_hash_size = 262144; |
| |
| /* This should be a prime number. */ |
| static size_t ilb_nat_src_hash_size = 97; |
| |
| /* Default NAT cache entry expiry time. */ |
| static uint32_t ilb_conn_tcp_expiry = 120; |
| static uint32_t ilb_conn_udp_expiry = 60; |
| |
| /* Default sticky entry expiry time. */ |
| static uint32_t ilb_sticky_expiry = 60; |
| |
| /* addr is assumed to be a uint8_t * to an ipaddr_t. */ |
| #define ILB_RULE_HASH(addr, hash_size) \ |
| ((*((addr) + 3) * 29791 + *((addr) + 2) * 961 + *((addr) + 1) * 31 + \ |
| *(addr)) & ((hash_size) - 1)) |
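
/*
 * The multipliers above are powers of 31 (29791 == 31 * 31 * 31 and
 * 961 == 31 * 31), so the macro is a simple base-31 polynomial hash over
 * the four bytes of the IPv4 address. The mask relies on hash_size being
 * a power of 2, which ilb_rule_hash_init() guarantees.
 */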
| |
| /* |
| * Note on ILB delayed processing |
| * |
 * To avoid in-line removal of some of the data structures, such as rules,
 * servers and ilb_conn_hash entries, ILB delays such processing to a taskq.
 * There are three types of ILB taskq:
 *
 * 1. rule handling: created at stack initialization time, ilb_stack_init()
| * 2. conn hash handling: created at conn hash initialization time, |
| * ilb_conn_hash_init() |
| * 3. sticky hash handling: created at sticky hash initialization time, |
| * ilb_sticky_hash_init() |
| * |
 * The rule taskq is for processing rule and server removal. When a user
 * land rule/server removal request comes in, a task is dispatched to this
 * taskq after the rule/server has been removed from all related hashes.
 * The task waits until all references to the rule/server are gone before
 * freeing it, so the user land thread requesting the removal does not need
 * to wait for the removal to complete.
| * |
 * The conn hash/sticky hash taskq is for processing ilb_conn_hash and
 * ilb_sticky_hash table entry removal. There are ilb_conn_timer_size timers
 * and ilb_sticky_timer_size timers running for ilb_conn_hash and
 * ilb_sticky_hash cleanup respectively. Each timer is responsible for one
 * portion (of equal size) of the hash table. When a timer fires, it
 * dispatches a task to the conn hash taskq to clean up its portion of the
 * table. This avoids in-line processing of the removal.
 *
 * There is one more piece of delayed processing, the cleanup of the NAT
 * source address table. Since that table is small, the timer handles it
 * directly instead of dispatching a taskq.
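 *
 * As an example of this pattern, rule removal works as follows:
 * ilb_rule_del() removes the rule from the hash table and the global list
 * and then dispatches ilb_rule_del_tq() to the rule taskq; that routine
 * waits for the rule's reference count to drop to 1 before calling
 * ilb_rule_del_common() to free the rule.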
| */ |
| |
| /* ILB rule taskq constants. */ |
| #define ILB_RULE_TASKQ_NUM_THR 20 |
| |
| /* Argument passed to ILB rule taskq routines. */ |
| typedef struct { |
| ilb_stack_t *ilbs; |
| ilb_rule_t *rule; |
| } ilb_rule_tq_t; |
| |
| /* kstat handling routines. */ |
| static kstat_t *ilb_kstat_g_init(netstackid_t, ilb_stack_t *); |
| static void ilb_kstat_g_fini(netstackid_t, ilb_stack_t *); |
| static kstat_t *ilb_rule_kstat_init(netstackid_t, ilb_rule_t *); |
| static kstat_t *ilb_server_kstat_init(netstackid_t, ilb_rule_t *, |
| ilb_server_t *); |
| |
| /* Rule hash handling routines. */ |
| static void ilb_rule_hash_init(ilb_stack_t *); |
| static void ilb_rule_hash_fini(ilb_stack_t *); |
| static void ilb_rule_hash_add(ilb_stack_t *, ilb_rule_t *, const in6_addr_t *); |
| static void ilb_rule_hash_del(ilb_rule_t *); |
| static ilb_rule_t *ilb_rule_hash(ilb_stack_t *, int, int, in6_addr_t *, |
| in_port_t, zoneid_t, uint32_t, boolean_t *); |
| |
| static void ilb_rule_g_add(ilb_stack_t *, ilb_rule_t *); |
| static void ilb_rule_g_del(ilb_stack_t *, ilb_rule_t *); |
| static void ilb_del_rule_common(ilb_stack_t *, ilb_rule_t *); |
| static ilb_rule_t *ilb_find_rule_locked(ilb_stack_t *, zoneid_t, const char *, |
| int *); |
| static boolean_t ilb_match_rule(ilb_stack_t *, zoneid_t, const char *, int, |
| int, in_port_t, in_port_t, const in6_addr_t *); |
| |
| /* Back end server handling routines. */ |
| static void ilb_server_free(ilb_server_t *); |
| |
| /* Network stack handling routines. */ |
| static void *ilb_stack_init(netstackid_t, netstack_t *); |
| static void ilb_stack_shutdown(netstackid_t, void *); |
| static void ilb_stack_fini(netstackid_t, void *); |
| |
| /* Sticky connection handling routines. */ |
| static void ilb_rule_sticky_init(ilb_rule_t *); |
| static void ilb_rule_sticky_fini(ilb_rule_t *); |
| |
| /* Handy macro to check for unspecified address. */ |
| #define IS_ADDR_UNSPEC(addr) \ |
| (IN6_IS_ADDR_V4MAPPED(addr) ? IN6_IS_ADDR_V4MAPPED_ANY(addr) : \ |
| IN6_IS_ADDR_UNSPECIFIED(addr)) |
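
/*
 * For example, both the unspecified address :: and the V4 mapped address
 * ::ffff:0.0.0.0 are treated as unspecified by this macro.
 */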
| |
| /* |
| * Global kstat instance counter. When a rule is created, its kstat instance |
 * number is taken from ilb_kstat_instance, which is then atomically
 * incremented.
| */ |
| static uint_t ilb_kstat_instance = 0; |
| |
| /* |
| * The ILB global kstat has name ILB_G_KS_NAME and class name ILB_G_KS_CNAME. |
| * A rule's kstat has ILB_RULE_KS_CNAME class name. |
| */ |
| #define ILB_G_KS_NAME "global" |
| #define ILB_G_KS_CNAME "kstat" |
| #define ILB_RULE_KS_CNAME "rulestat" |
| |
| static kstat_t * |
| ilb_kstat_g_init(netstackid_t stackid, ilb_stack_t *ilbs) |
| { |
| kstat_t *ksp; |
| ilb_g_kstat_t template = { |
| { "num_rules", KSTAT_DATA_UINT64, 0 }, |
| { "ip_frag_in", KSTAT_DATA_UINT64, 0 }, |
| { "ip_frag_dropped", KSTAT_DATA_UINT64, 0 } |
| }; |
| |
| ksp = kstat_create_netstack(ILB_KSTAT_MOD_NAME, 0, ILB_G_KS_NAME, |
| ILB_G_KS_CNAME, KSTAT_TYPE_NAMED, NUM_OF_FIELDS(ilb_g_kstat_t), |
| KSTAT_FLAG_VIRTUAL, stackid); |
| if (ksp == NULL) |
| return (NULL); |
| bcopy(&template, ilbs->ilbs_kstat, sizeof (template)); |
| ksp->ks_data = ilbs->ilbs_kstat; |
| ksp->ks_private = (void *)(uintptr_t)stackid; |
| |
| kstat_install(ksp); |
| return (ksp); |
| } |
| |
| static void |
| ilb_kstat_g_fini(netstackid_t stackid, ilb_stack_t *ilbs) |
| { |
| if (ilbs->ilbs_ksp != NULL) { |
| ASSERT(stackid == (netstackid_t)(uintptr_t) |
| ilbs->ilbs_ksp->ks_private); |
| kstat_delete_netstack(ilbs->ilbs_ksp, stackid); |
| ilbs->ilbs_ksp = NULL; |
| } |
| } |
| |
| static kstat_t * |
| ilb_rule_kstat_init(netstackid_t stackid, ilb_rule_t *rule) |
| { |
| kstat_t *ksp; |
| ilb_rule_kstat_t template = { |
| { "num_servers", KSTAT_DATA_UINT64, 0 }, |
| { "bytes_not_processed", KSTAT_DATA_UINT64, 0 }, |
| { "pkt_not_processed", KSTAT_DATA_UINT64, 0 }, |
| { "bytes_dropped", KSTAT_DATA_UINT64, 0 }, |
| { "pkt_dropped", KSTAT_DATA_UINT64, 0 }, |
| { "nomem_bytes_dropped", KSTAT_DATA_UINT64, 0 }, |
| { "nomem_pkt_dropped", KSTAT_DATA_UINT64, 0 }, |
| { "noport_bytes_dropped", KSTAT_DATA_UINT64, 0 }, |
| { "noport_pkt_dropped", KSTAT_DATA_UINT64, 0 }, |
| { "icmp_echo_processed", KSTAT_DATA_UINT64, 0 }, |
| { "icmp_dropped", KSTAT_DATA_UINT64, 0 }, |
| { "icmp_too_big_processed", KSTAT_DATA_UINT64, 0 }, |
| { "icmp_too_big_dropped", KSTAT_DATA_UINT64, 0 } |
| }; |
| |
| ksp = kstat_create_netstack(ILB_KSTAT_MOD_NAME, rule->ir_ks_instance, |
| rule->ir_name, ILB_RULE_KS_CNAME, KSTAT_TYPE_NAMED, |
| NUM_OF_FIELDS(ilb_rule_kstat_t), KSTAT_FLAG_VIRTUAL, stackid); |
| if (ksp == NULL) |
| return (NULL); |
| |
| bcopy(&template, &rule->ir_kstat, sizeof (template)); |
| ksp->ks_data = &rule->ir_kstat; |
| ksp->ks_private = (void *)(uintptr_t)stackid; |
| |
| kstat_install(ksp); |
| return (ksp); |
| } |
| |
| static kstat_t * |
| ilb_server_kstat_init(netstackid_t stackid, ilb_rule_t *rule, |
| ilb_server_t *server) |
| { |
| kstat_t *ksp; |
| ilb_server_kstat_t template = { |
| { "bytes_processed", KSTAT_DATA_UINT64, 0 }, |
| { "pkt_processed", KSTAT_DATA_UINT64, 0 }, |
| { "ip_address", KSTAT_DATA_STRING, 0 } |
| }; |
| char cname_buf[KSTAT_STRLEN]; |
| |
	/* The "-sstat" suffix plus the terminating NUL is 7 bytes. */
| ASSERT(strlen(rule->ir_name) + 7 < KSTAT_STRLEN); |
| (void) sprintf(cname_buf, "%s-sstat", rule->ir_name); |
| ksp = kstat_create_netstack(ILB_KSTAT_MOD_NAME, rule->ir_ks_instance, |
| server->iser_name, cname_buf, KSTAT_TYPE_NAMED, |
| NUM_OF_FIELDS(ilb_server_kstat_t), KSTAT_FLAG_VIRTUAL, stackid); |
| if (ksp == NULL) |
| return (NULL); |
| |
| bcopy(&template, &server->iser_kstat, sizeof (template)); |
| ksp->ks_data = &server->iser_kstat; |
| ksp->ks_private = (void *)(uintptr_t)stackid; |
| |
| kstat_named_setstr(&server->iser_kstat.ip_address, |
| server->iser_ip_addr); |
| /* We never change the IP address */ |
| ksp->ks_data_size += strlen(server->iser_ip_addr) + 1; |
| |
| kstat_install(ksp); |
| return (ksp); |
| } |
| |
| /* Initialize the rule hash table. */ |
| static void |
| ilb_rule_hash_init(ilb_stack_t *ilbs) |
| { |
| int i; |
| |
| /* |
| * If ilbs->ilbs_rule_hash_size is not a power of 2, bump it up to |
| * the next power of 2. |
| */ |
| if (!ISP2(ilbs->ilbs_rule_hash_size)) { |
| for (i = 0; i < 31; i++) { |
| if (ilbs->ilbs_rule_hash_size < (1 << i)) |
| break; |
| } |
| ilbs->ilbs_rule_hash_size = 1 << i; |
| } |
| ilbs->ilbs_g_hash = kmem_zalloc(sizeof (ilb_hash_t) * |
| ilbs->ilbs_rule_hash_size, KM_SLEEP); |
| for (i = 0; i < ilbs->ilbs_rule_hash_size; i++) { |
| mutex_init(&ilbs->ilbs_g_hash[i].ilb_hash_lock, NULL, |
| MUTEX_DEFAULT, NULL); |
| } |
| } |
| |
| /* Clean up the rule hash table. */ |
| static void |
| ilb_rule_hash_fini(ilb_stack_t *ilbs) |
| { |
| if (ilbs->ilbs_g_hash == NULL) |
| return; |
| kmem_free(ilbs->ilbs_g_hash, sizeof (ilb_hash_t) * |
| ilbs->ilbs_rule_hash_size); |
| } |
| |
| /* Add a rule to the rule hash table. */ |
| static void |
| ilb_rule_hash_add(ilb_stack_t *ilbs, ilb_rule_t *rule, const in6_addr_t *addr) |
| { |
| int i; |
| |
| i = ILB_RULE_HASH((uint8_t *)&addr->s6_addr32[3], |
| ilbs->ilbs_rule_hash_size); |
| DTRACE_PROBE2(ilb__rule__hash__add, ilb_rule_t *, rule, int, i); |
| mutex_enter(&ilbs->ilbs_g_hash[i].ilb_hash_lock); |
| rule->ir_hash_next = ilbs->ilbs_g_hash[i].ilb_hash_rule; |
| if (ilbs->ilbs_g_hash[i].ilb_hash_rule != NULL) |
| ilbs->ilbs_g_hash[i].ilb_hash_rule->ir_hash_prev = rule; |
| rule->ir_hash_prev = NULL; |
| ilbs->ilbs_g_hash[i].ilb_hash_rule = rule; |
| |
| rule->ir_hash = &ilbs->ilbs_g_hash[i]; |
| mutex_exit(&ilbs->ilbs_g_hash[i].ilb_hash_lock); |
| } |
| |
| /* |
| * Remove a rule from the rule hash table. Note that the rule is not freed |
| * in this routine. |
| */ |
| static void |
| ilb_rule_hash_del(ilb_rule_t *rule) |
| { |
| mutex_enter(&rule->ir_hash->ilb_hash_lock); |
| if (rule->ir_hash->ilb_hash_rule == rule) { |
| rule->ir_hash->ilb_hash_rule = rule->ir_hash_next; |
| if (rule->ir_hash_next != NULL) |
| rule->ir_hash_next->ir_hash_prev = NULL; |
| } else { |
| if (rule->ir_hash_prev != NULL) |
| rule->ir_hash_prev->ir_hash_next = |
| rule->ir_hash_next; |
| if (rule->ir_hash_next != NULL) { |
| rule->ir_hash_next->ir_hash_prev = |
| rule->ir_hash_prev; |
| } |
| } |
| mutex_exit(&rule->ir_hash->ilb_hash_lock); |
| |
| rule->ir_hash_next = NULL; |
| rule->ir_hash_prev = NULL; |
| rule->ir_hash = NULL; |
| } |
| |
| /* |
| * Given the info of a packet, look for a match in the rule hash table. |
| */ |
| static ilb_rule_t * |
| ilb_rule_hash(ilb_stack_t *ilbs, int l3, int l4, in6_addr_t *addr, |
| in_port_t port, zoneid_t zoneid, uint32_t len, boolean_t *busy) |
| { |
| int i; |
| ilb_rule_t *rule; |
| ipaddr_t v4_addr; |
| |
| *busy = B_FALSE; |
| IN6_V4MAPPED_TO_IPADDR(addr, v4_addr); |
| i = ILB_RULE_HASH((uint8_t *)&v4_addr, ilbs->ilbs_rule_hash_size); |
| port = ntohs(port); |
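	/*
	 * Note that ir_min_port and ir_max_port are kept in host byte order
	 * (see ilb_rule_add()), hence the ntohs() conversion above.
	 */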
| |
| mutex_enter(&ilbs->ilbs_g_hash[i].ilb_hash_lock); |
| for (rule = ilbs->ilbs_g_hash[i].ilb_hash_rule; rule != NULL; |
| rule = rule->ir_hash_next) { |
| if (!rule->ir_port_range) { |
| if (rule->ir_min_port != port) |
| continue; |
| } else { |
| if (port < rule->ir_min_port || |
| port > rule->ir_max_port) { |
| continue; |
| } |
| } |
| if (rule->ir_ipver != l3 || rule->ir_proto != l4 || |
| rule->ir_zoneid != zoneid) { |
| continue; |
| } |
| |
| if (l3 == IPPROTO_IP) { |
| if (rule->ir_target_v4 != INADDR_ANY && |
| rule->ir_target_v4 != v4_addr) { |
| continue; |
| } |
| } else { |
| if (!IN6_IS_ADDR_UNSPECIFIED(&rule->ir_target_v6) && |
| !IN6_ARE_ADDR_EQUAL(addr, &rule->ir_target_v6)) { |
| continue; |
| } |
| } |
| |
| /* |
| * Just update the stats if the rule is disabled. |
| */ |
| mutex_enter(&rule->ir_lock); |
| if (!(rule->ir_flags & ILB_RULE_ENABLED)) { |
| ILB_R_KSTAT(rule, pkt_not_processed); |
| ILB_R_KSTAT_UPDATE(rule, bytes_not_processed, len); |
| mutex_exit(&rule->ir_lock); |
| rule = NULL; |
| break; |
| } else if (rule->ir_flags & ILB_RULE_BUSY) { |
| /* |
| * If we are busy... |
| * |
| * XXX we should have a queue to postpone the |
| * packet processing. But this requires a |
| * mechanism in IP to re-start the packet |
| * processing. So for now, just drop the packet. |
| */ |
| ILB_R_KSTAT(rule, pkt_dropped); |
| ILB_R_KSTAT_UPDATE(rule, bytes_dropped, len); |
| mutex_exit(&rule->ir_lock); |
| *busy = B_TRUE; |
| rule = NULL; |
| break; |
| } else { |
| rule->ir_refcnt++; |
| ASSERT(rule->ir_refcnt != 1); |
| mutex_exit(&rule->ir_lock); |
| break; |
| } |
| } |
| mutex_exit(&ilbs->ilbs_g_hash[i].ilb_hash_lock); |
| return (rule); |
| } |
| |
| /* |
| * Add a rule to the global rule list. This list is for finding all rules |
| * in an IP stack. The caller is assumed to hold the ilbs_g_lock. |
| */ |
| static void |
| ilb_rule_g_add(ilb_stack_t *ilbs, ilb_rule_t *rule) |
| { |
| ASSERT(mutex_owned(&ilbs->ilbs_g_lock)); |
| rule->ir_next = ilbs->ilbs_rule_head; |
| ilbs->ilbs_rule_head = rule; |
| ILB_KSTAT_UPDATE(ilbs, num_rules, 1); |
| } |
| |
/* The caller is assumed to hold the ilbs_g_lock. */
| static void |
| ilb_rule_g_del(ilb_stack_t *ilbs, ilb_rule_t *rule) |
| { |
| ilb_rule_t *tmp_rule; |
| ilb_rule_t *prev_rule; |
| |
| ASSERT(mutex_owned(&ilbs->ilbs_g_lock)); |
| prev_rule = NULL; |
| for (tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL; |
| prev_rule = tmp_rule, tmp_rule = tmp_rule->ir_next) { |
| if (tmp_rule == rule) |
| break; |
| } |
| if (tmp_rule == NULL) { |
| mutex_exit(&ilbs->ilbs_g_lock); |
| return; |
| } |
| if (prev_rule == NULL) |
| ilbs->ilbs_rule_head = tmp_rule->ir_next; |
| else |
| prev_rule->ir_next = tmp_rule->ir_next; |
| ILB_KSTAT_UPDATE(ilbs, num_rules, -1); |
| } |
| |
| /* |
| * Helper routine to calculate how many source addresses are in a given |
| * range. |
| */ |
| static int64_t |
| num_nat_src_v6(const in6_addr_t *a1, const in6_addr_t *a2) |
| { |
| int64_t ret; |
| uint32_t addr1, addr2; |
| |
| /* |
	 * Here we assume that the number of NAT source addresses cannot be
	 * so large that the two most significant s6_addr32 words of the
	 * start and end addresses could differ; they must be equal.
| */ |
| addr1 = ntohl(a1->s6_addr32[3]); |
| addr2 = ntohl(a2->s6_addr32[3]); |
| if (a1->s6_addr32[0] != a2->s6_addr32[0] || |
| a1->s6_addr32[1] != a2->s6_addr32[1] || |
| a1->s6_addr32[2] > a2->s6_addr32[2] || |
| (a1->s6_addr32[2] == a2->s6_addr32[2] && addr1 > addr2)) { |
| return (-1); |
| } |
| if (a1->s6_addr32[2] == a2->s6_addr32[2]) { |
| return (addr2 - addr1 + 1); |
| } else { |
| ret = (ntohl(a2->s6_addr32[2]) - ntohl(a1->s6_addr32[2])); |
| ret <<= 32; |
		ret = ret + addr2 - addr1;
| return (ret + 1); |
| } |
| } |
| |
| /* |
| * Add an ILB rule. |
| */ |
| int |
| ilb_rule_add(ilb_stack_t *ilbs, zoneid_t zoneid, const ilb_rule_cmd_t *cmd) |
| { |
| ilb_rule_t *rule; |
| netstackid_t stackid; |
| int ret; |
| in_port_t min_port, max_port; |
| int64_t num_src; |
| |
| /* Sanity checks. */ |
| if (cmd->ip_ver != IPPROTO_IP && cmd->ip_ver != IPPROTO_IPV6) |
| return (EINVAL); |
| |
| /* Need to support SCTP... */ |
| if (cmd->proto != IPPROTO_TCP && cmd->proto != IPPROTO_UDP) |
| return (EINVAL); |
| |
| /* For full NAT, the NAT source must be supplied. */ |
| if (cmd->topo == ILB_TOPO_IMPL_NAT) { |
| if (IS_ADDR_UNSPEC(&cmd->nat_src_start) || |
| IS_ADDR_UNSPEC(&cmd->nat_src_end)) { |
| return (EINVAL); |
| } |
| } |
| |
	/* If stickiness is requested, the sticky mask must be specified. */
| if ((cmd->flags & ILB_RULE_STICKY) && |
| IS_ADDR_UNSPEC(&cmd->sticky_mask)) { |
| return (EINVAL); |
| } |
| |
| /* Port is passed in network byte order. */ |
| min_port = ntohs(cmd->min_port); |
| max_port = ntohs(cmd->max_port); |
| if (min_port > max_port) |
| return (EINVAL); |
| |
| /* min_port == 0 means "all ports". Make it so */ |
| if (min_port == 0) { |
| min_port = 1; |
| max_port = 65535; |
| } |
| |
| /* Funny address checking. */ |
| if (cmd->ip_ver == IPPROTO_IP) { |
| in_addr_t v4_addr1, v4_addr2; |
| |
| v4_addr1 = cmd->vip.s6_addr32[3]; |
| if ((*(uchar_t *)&v4_addr1) == IN_LOOPBACKNET || |
| CLASSD(v4_addr1) || v4_addr1 == INADDR_BROADCAST || |
| v4_addr1 == INADDR_ANY || |
| !IN6_IS_ADDR_V4MAPPED(&cmd->vip)) { |
| return (EINVAL); |
| } |
| |
| if (cmd->topo == ILB_TOPO_IMPL_NAT) { |
| v4_addr1 = ntohl(cmd->nat_src_start.s6_addr32[3]); |
| v4_addr2 = ntohl(cmd->nat_src_end.s6_addr32[3]); |
| if ((*(uchar_t *)&v4_addr1) == IN_LOOPBACKNET || |
| (*(uchar_t *)&v4_addr2) == IN_LOOPBACKNET || |
| v4_addr1 == INADDR_BROADCAST || |
| v4_addr2 == INADDR_BROADCAST || |
| v4_addr1 == INADDR_ANY || v4_addr2 == INADDR_ANY || |
| CLASSD(v4_addr1) || CLASSD(v4_addr2) || |
| !IN6_IS_ADDR_V4MAPPED(&cmd->nat_src_start) || |
| !IN6_IS_ADDR_V4MAPPED(&cmd->nat_src_end)) { |
| return (EINVAL); |
| } |
| |
| num_src = v4_addr2 - v4_addr1 + 1; |
| if (v4_addr1 > v4_addr2 || num_src > ILB_MAX_NAT_SRC) |
| return (EINVAL); |
| } |
| } else { |
| if (IN6_IS_ADDR_LOOPBACK(&cmd->vip) || |
| IN6_IS_ADDR_MULTICAST(&cmd->vip) || |
| IN6_IS_ADDR_UNSPECIFIED(&cmd->vip) || |
| IN6_IS_ADDR_V4MAPPED(&cmd->vip)) { |
| return (EINVAL); |
| } |
| |
| if (cmd->topo == ILB_TOPO_IMPL_NAT) { |
| if (IN6_IS_ADDR_LOOPBACK(&cmd->nat_src_start) || |
| IN6_IS_ADDR_LOOPBACK(&cmd->nat_src_end) || |
| IN6_IS_ADDR_MULTICAST(&cmd->nat_src_start) || |
| IN6_IS_ADDR_MULTICAST(&cmd->nat_src_end) || |
| IN6_IS_ADDR_UNSPECIFIED(&cmd->nat_src_start) || |
| IN6_IS_ADDR_UNSPECIFIED(&cmd->nat_src_end) || |
| IN6_IS_ADDR_V4MAPPED(&cmd->nat_src_start) || |
| IN6_IS_ADDR_V4MAPPED(&cmd->nat_src_end)) { |
| return (EINVAL); |
| } |
| |
| if ((num_src = num_nat_src_v6(&cmd->nat_src_start, |
| &cmd->nat_src_end)) < 0 || |
| num_src > ILB_MAX_NAT_SRC) { |
| return (EINVAL); |
| } |
| } |
| } |
| |
| mutex_enter(&ilbs->ilbs_g_lock); |
| if (ilbs->ilbs_g_hash == NULL) |
| ilb_rule_hash_init(ilbs); |
| if (ilbs->ilbs_c2s_conn_hash == NULL) { |
| ASSERT(ilbs->ilbs_s2c_conn_hash == NULL); |
| ilb_conn_hash_init(ilbs); |
| ilb_nat_src_init(ilbs); |
| } |
| |
| /* Make sure that the new rule does not duplicate an existing one. */ |
| if (ilb_match_rule(ilbs, zoneid, cmd->name, cmd->ip_ver, cmd->proto, |
| min_port, max_port, &cmd->vip)) { |
| mutex_exit(&ilbs->ilbs_g_lock); |
| return (EEXIST); |
| } |
| |
| rule = kmem_zalloc(sizeof (ilb_rule_t), KM_NOSLEEP); |
| if (rule == NULL) { |
| mutex_exit(&ilbs->ilbs_g_lock); |
| return (ENOMEM); |
| } |
| |
| /* ir_name is all 0 to begin with */ |
| (void) memcpy(rule->ir_name, cmd->name, ILB_RULE_NAMESZ - 1); |
| |
| rule->ir_ks_instance = atomic_inc_uint_nv(&ilb_kstat_instance); |
| stackid = (netstackid_t)(uintptr_t)ilbs->ilbs_ksp->ks_private; |
| if ((rule->ir_ksp = ilb_rule_kstat_init(stackid, rule)) == NULL) { |
| ret = ENOMEM; |
| goto error; |
| } |
| |
| if (cmd->topo == ILB_TOPO_IMPL_NAT) { |
| rule->ir_nat_src_start = cmd->nat_src_start; |
| rule->ir_nat_src_end = cmd->nat_src_end; |
| } |
| |
| rule->ir_ipver = cmd->ip_ver; |
| rule->ir_proto = cmd->proto; |
| rule->ir_topo = cmd->topo; |
| |
| rule->ir_min_port = min_port; |
| rule->ir_max_port = max_port; |
| if (rule->ir_min_port != rule->ir_max_port) |
| rule->ir_port_range = B_TRUE; |
| else |
| rule->ir_port_range = B_FALSE; |
| |
| rule->ir_zoneid = zoneid; |
| |
| rule->ir_target_v6 = cmd->vip; |
| rule->ir_servers = NULL; |
| |
| /* |
| * The default connection drain timeout is indefinite (value 0), |
| * meaning we will wait for all connections to finish. So we |
| * can assign cmd->conn_drain_timeout to it directly. |
| */ |
| rule->ir_conn_drain_timeout = cmd->conn_drain_timeout; |
| if (cmd->nat_expiry != 0) { |
| rule->ir_nat_expiry = cmd->nat_expiry; |
| } else { |
| switch (rule->ir_proto) { |
| case IPPROTO_TCP: |
| rule->ir_nat_expiry = ilb_conn_tcp_expiry; |
| break; |
| case IPPROTO_UDP: |
| rule->ir_nat_expiry = ilb_conn_udp_expiry; |
| break; |
| default: |
| cmn_err(CE_PANIC, "data corruption: wrong ir_proto: %p", |
| (void *)rule); |
| break; |
| } |
| } |
| if (cmd->sticky_expiry != 0) |
| rule->ir_sticky_expiry = cmd->sticky_expiry; |
| else |
| rule->ir_sticky_expiry = ilb_sticky_expiry; |
| |
| if (cmd->flags & ILB_RULE_STICKY) { |
| rule->ir_flags |= ILB_RULE_STICKY; |
| rule->ir_sticky_mask = cmd->sticky_mask; |
| if (ilbs->ilbs_sticky_hash == NULL) |
| ilb_sticky_hash_init(ilbs); |
| } |
| if (cmd->flags & ILB_RULE_ENABLED) |
| rule->ir_flags |= ILB_RULE_ENABLED; |
| |
| mutex_init(&rule->ir_lock, NULL, MUTEX_DEFAULT, NULL); |
| cv_init(&rule->ir_cv, NULL, CV_DEFAULT, NULL); |
| |
| rule->ir_refcnt = 1; |
| |
| switch (cmd->algo) { |
| case ILB_ALG_IMPL_ROUNDROBIN: |
| if ((rule->ir_alg = ilb_alg_rr_init(rule, NULL)) == NULL) { |
| ret = ENOMEM; |
| goto error; |
| } |
| rule->ir_alg_type = ILB_ALG_IMPL_ROUNDROBIN; |
| break; |
| case ILB_ALG_IMPL_HASH_IP: |
| case ILB_ALG_IMPL_HASH_IP_SPORT: |
| case ILB_ALG_IMPL_HASH_IP_VIP: |
| if ((rule->ir_alg = ilb_alg_hash_init(rule, |
| &cmd->algo)) == NULL) { |
| ret = ENOMEM; |
| goto error; |
| } |
| rule->ir_alg_type = cmd->algo; |
| break; |
| default: |
| ret = EINVAL; |
| goto error; |
| } |
| |
| /* Add it to the global list and hash array at the end. */ |
| ilb_rule_g_add(ilbs, rule); |
| ilb_rule_hash_add(ilbs, rule, &cmd->vip); |
| |
| mutex_exit(&ilbs->ilbs_g_lock); |
| |
| return (0); |
| |
| error: |
| mutex_exit(&ilbs->ilbs_g_lock); |
| if (rule->ir_ksp != NULL) { |
| /* stackid must be initialized if ir_ksp != NULL */ |
| kstat_delete_netstack(rule->ir_ksp, stackid); |
| } |
| kmem_free(rule, sizeof (ilb_rule_t)); |
| return (ret); |
| } |
| |
| /* |
 * The final part of deleting a rule. It is either called directly or from
 * the dispatched rule taskq routine.
| */ |
| static void |
| ilb_rule_del_common(ilb_stack_t *ilbs, ilb_rule_t *tmp_rule) |
| { |
| netstackid_t stackid; |
| ilb_server_t *server; |
| |
| stackid = (netstackid_t)(uintptr_t)ilbs->ilbs_ksp->ks_private; |
| |
| /* |
| * Let the algorithm know that the rule is going away. The |
| * algorithm fini routine will free all its resources with this |
| * rule. |
| */ |
| tmp_rule->ir_alg->ilb_alg_fini(&tmp_rule->ir_alg); |
| |
| while ((server = tmp_rule->ir_servers) != NULL) { |
| mutex_enter(&server->iser_lock); |
| ilb_destroy_nat_src(&server->iser_nat_src); |
| if (tmp_rule->ir_conn_drain_timeout != 0) { |
| /* |
| * The garbage collection thread checks this value |
			 * without grabbing a lock. So we need to use
| * atomic_swap_64() to make sure that the value seen |
| * by gc thread is intact. |
| */ |
| (void) atomic_swap_64( |
| (uint64_t *)&server->iser_die_time, |
| ddi_get_lbolt64() + |
| SEC_TO_TICK(tmp_rule->ir_conn_drain_timeout)); |
| } |
| while (server->iser_refcnt > 1) |
| cv_wait(&server->iser_cv, &server->iser_lock); |
| tmp_rule->ir_servers = server->iser_next; |
| kstat_delete_netstack(server->iser_ksp, stackid); |
| kmem_free(server, sizeof (ilb_server_t)); |
| } |
| |
| ASSERT(tmp_rule->ir_ksp != NULL); |
| kstat_delete_netstack(tmp_rule->ir_ksp, stackid); |
| |
| kmem_free(tmp_rule, sizeof (ilb_rule_t)); |
| } |
| |
| /* The routine executed by the delayed rule taskq. */ |
| static void |
| ilb_rule_del_tq(void *arg) |
| { |
| ilb_stack_t *ilbs = ((ilb_rule_tq_t *)arg)->ilbs; |
| ilb_rule_t *rule = ((ilb_rule_tq_t *)arg)->rule; |
| |
| mutex_enter(&rule->ir_lock); |
| while (rule->ir_refcnt > 1) |
| cv_wait(&rule->ir_cv, &rule->ir_lock); |
| ilb_rule_del_common(ilbs, rule); |
| kmem_free(arg, sizeof (ilb_rule_tq_t)); |
| } |
| |
| /* Routine to delete a rule. */ |
| int |
| ilb_rule_del(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name) |
| { |
| ilb_rule_t *tmp_rule; |
| ilb_rule_tq_t *arg; |
| int err; |
| |
| mutex_enter(&ilbs->ilbs_g_lock); |
| if ((tmp_rule = ilb_find_rule_locked(ilbs, zoneid, name, |
| &err)) == NULL) { |
| mutex_exit(&ilbs->ilbs_g_lock); |
| return (err); |
| } |
| |
| /* |
| * First remove the rule from the hash array and the global list so |
| * that no one can find this rule any more. |
| */ |
| ilb_rule_hash_del(tmp_rule); |
| ilb_rule_g_del(ilbs, tmp_rule); |
| mutex_exit(&ilbs->ilbs_g_lock); |
| ILB_RULE_REFRELE(tmp_rule); |
| |
| /* |
| * Now no one can find this rule, we can remove it once all |
| * references to it are dropped and all references to the list |
| * of servers are dropped. So dispatch a task to finish the deletion. |
| * We do this instead of letting the last one referencing the |
| * rule do it. The reason is that the last one may be the |
| * interrupt thread. We want to minimize the work it needs to |
| * do. Rule deletion is not a critical task so it can be delayed. |
| */ |
| arg = kmem_alloc(sizeof (ilb_rule_tq_t), KM_SLEEP); |
| arg->ilbs = ilbs; |
| arg->rule = tmp_rule; |
| (void) taskq_dispatch(ilbs->ilbs_rule_taskq, ilb_rule_del_tq, arg, |
| TQ_SLEEP); |
| |
| return (0); |
| } |
| |
| /* |
| * Given an IP address, check to see if there is a rule using this |
| * as the VIP. It can be used to check if we need to drop a fragment. |
| */ |
| boolean_t |
| ilb_rule_match_vip_v6(ilb_stack_t *ilbs, in6_addr_t *vip, ilb_rule_t **ret_rule) |
| { |
| int i; |
| ilb_rule_t *rule; |
| boolean_t ret = B_FALSE; |
| |
| i = ILB_RULE_HASH((uint8_t *)&vip->s6_addr32[3], |
| ilbs->ilbs_rule_hash_size); |
| mutex_enter(&ilbs->ilbs_g_hash[i].ilb_hash_lock); |
| for (rule = ilbs->ilbs_g_hash[i].ilb_hash_rule; rule != NULL; |
| rule = rule->ir_hash_next) { |
| if (IN6_ARE_ADDR_EQUAL(vip, &rule->ir_target_v6)) { |
| mutex_enter(&rule->ir_lock); |
| if (rule->ir_flags & ILB_RULE_BUSY) { |
| mutex_exit(&rule->ir_lock); |
| break; |
| } |
| if (ret_rule != NULL) { |
| rule->ir_refcnt++; |
| mutex_exit(&rule->ir_lock); |
| *ret_rule = rule; |
| } else { |
| mutex_exit(&rule->ir_lock); |
| } |
| ret = B_TRUE; |
| break; |
| } |
| } |
| mutex_exit(&ilbs->ilbs_g_hash[i].ilb_hash_lock); |
| return (ret); |
| } |
| |
| boolean_t |
| ilb_rule_match_vip_v4(ilb_stack_t *ilbs, ipaddr_t addr, ilb_rule_t **ret_rule) |
| { |
| int i; |
| ilb_rule_t *rule; |
| boolean_t ret = B_FALSE; |
| |
| i = ILB_RULE_HASH((uint8_t *)&addr, ilbs->ilbs_rule_hash_size); |
| mutex_enter(&ilbs->ilbs_g_hash[i].ilb_hash_lock); |
| for (rule = ilbs->ilbs_g_hash[i].ilb_hash_rule; rule != NULL; |
| rule = rule->ir_hash_next) { |
| if (rule->ir_target_v6.s6_addr32[3] == addr) { |
| mutex_enter(&rule->ir_lock); |
| if (rule->ir_flags & ILB_RULE_BUSY) { |
| mutex_exit(&rule->ir_lock); |
| break; |
| } |
| if (ret_rule != NULL) { |
| rule->ir_refcnt++; |
| mutex_exit(&rule->ir_lock); |
| *ret_rule = rule; |
| } else { |
| mutex_exit(&rule->ir_lock); |
| } |
| ret = B_TRUE; |
| break; |
| } |
| } |
| mutex_exit(&ilbs->ilbs_g_hash[i].ilb_hash_lock); |
| return (ret); |
| } |
| |
| static ilb_rule_t * |
| ilb_find_rule_locked(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name, |
| int *err) |
| { |
| ilb_rule_t *tmp_rule; |
| |
| ASSERT(mutex_owned(&ilbs->ilbs_g_lock)); |
| |
| for (tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL; |
| tmp_rule = tmp_rule->ir_next) { |
| if (tmp_rule->ir_zoneid != zoneid) |
| continue; |
| if (strcasecmp(tmp_rule->ir_name, name) == 0) { |
| mutex_enter(&tmp_rule->ir_lock); |
| if (tmp_rule->ir_flags & ILB_RULE_BUSY) { |
| mutex_exit(&tmp_rule->ir_lock); |
| *err = EINPROGRESS; |
| return (NULL); |
| } |
| tmp_rule->ir_refcnt++; |
| mutex_exit(&tmp_rule->ir_lock); |
| *err = 0; |
| return (tmp_rule); |
| } |
| } |
| *err = ENOENT; |
| return (NULL); |
| } |
| |
/* Find a rule with a given name and zone in the global rule list. */
| ilb_rule_t * |
| ilb_find_rule(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name, |
| int *err) |
| { |
| ilb_rule_t *tmp_rule; |
| |
| mutex_enter(&ilbs->ilbs_g_lock); |
| tmp_rule = ilb_find_rule_locked(ilbs, zoneid, name, err); |
| mutex_exit(&ilbs->ilbs_g_lock); |
| return (tmp_rule); |
| } |
| |
/*
 * Check if the given rule info and zone ID conflict with an existing rule.
 * Used by ilb_rule_add() to detect duplicates.
 */
| static boolean_t |
| ilb_match_rule(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name, int l3, |
| int l4, in_port_t min_port, in_port_t max_port, const in6_addr_t *addr) |
| { |
| ilb_rule_t *tmp_rule; |
| |
| ASSERT(mutex_owned(&ilbs->ilbs_g_lock)); |
| |
| for (tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL; |
| tmp_rule = tmp_rule->ir_next) { |
| if (tmp_rule->ir_zoneid != zoneid) |
| continue; |
| |
| /* |
| * We don't allow the same name in different rules even if all |
| * the other rule components are different. |
| */ |
| if (strcasecmp(tmp_rule->ir_name, name) == 0) |
| return (B_TRUE); |
| |
| if (tmp_rule->ir_ipver != l3 || tmp_rule->ir_proto != l4) |
| continue; |
| |
| /* |
		 * If ir_port_range is B_FALSE, ir_min_port and ir_max_port
		 * are the same, so the checks below simply test whether that
		 * single port falls within the given range. Otherwise they
		 * skip the rule when the two port ranges do not overlap.
| */ |
| if (tmp_rule->ir_max_port < min_port || |
| tmp_rule->ir_min_port > max_port) { |
| continue; |
| } |
| |
| /* |
		 * If l3 is IPv4, the addr passed in is assumed to be a
		 * mapped address.
| */ |
| if (V6_OR_V4_INADDR_ANY(*addr) || |
| V6_OR_V4_INADDR_ANY(tmp_rule->ir_target_v6) || |
| IN6_ARE_ADDR_EQUAL(addr, &tmp_rule->ir_target_v6)) { |
| return (B_TRUE); |
| } |
| } |
| return (B_FALSE); |
| } |
| |
| int |
| ilb_rule_enable(ilb_stack_t *ilbs, zoneid_t zoneid, |
| const char *rule_name, ilb_rule_t *in_rule) |
| { |
| ilb_rule_t *rule; |
| int err; |
| |
| ASSERT((in_rule == NULL && rule_name != NULL) || |
| (in_rule != NULL && rule_name == NULL)); |
| if ((rule = in_rule) == NULL) { |
| if ((rule = ilb_find_rule(ilbs, zoneid, rule_name, |
| &err)) == NULL) { |
| return (err); |
| } |
| } |
| mutex_enter(&rule->ir_lock); |
| rule->ir_flags |= ILB_RULE_ENABLED; |
| mutex_exit(&rule->ir_lock); |
| |
	/* Only refrele if we looked up the rule (i.e., it was not passed in). */
| if (in_rule == NULL) |
| ILB_RULE_REFRELE(rule); |
| return (0); |
| } |
| |
| int |
| ilb_rule_disable(ilb_stack_t *ilbs, zoneid_t zoneid, |
| const char *rule_name, ilb_rule_t *in_rule) |
| { |
| ilb_rule_t *rule; |
| int err; |
| |
| ASSERT((in_rule == NULL && rule_name != NULL) || |
| (in_rule != NULL && rule_name == NULL)); |
| if ((rule = in_rule) == NULL) { |
| if ((rule = ilb_find_rule(ilbs, zoneid, rule_name, |
| &err)) == NULL) { |
| return (err); |
| } |
| } |
| mutex_enter(&rule->ir_lock); |
| rule->ir_flags &= ~ILB_RULE_ENABLED; |
| mutex_exit(&rule->ir_lock); |
| |
	/* Only refrele if we looked up the rule (i.e., it was not passed in). */
| if (in_rule == NULL) |
| ILB_RULE_REFRELE(rule); |
| return (0); |
| } |
| |
| /* |
| * XXX We should probably have a walker function to walk all rules. For |
| * now, just add a simple loop for enable/disable/del. |
| */ |
| void |
| ilb_rule_enable_all(ilb_stack_t *ilbs, zoneid_t zoneid) |
| { |
| ilb_rule_t *rule; |
| |
| mutex_enter(&ilbs->ilbs_g_lock); |
| for (rule = ilbs->ilbs_rule_head; rule != NULL; rule = rule->ir_next) { |
| if (rule->ir_zoneid != zoneid) |
| continue; |
| /* |
| * No need to hold the rule as we are holding the global |
| * lock so it won't go away. Ignore the return value here |
| * as the rule is provided so the call cannot fail. |
| */ |
| (void) ilb_rule_enable(ilbs, zoneid, NULL, rule); |
| } |
| mutex_exit(&ilbs->ilbs_g_lock); |
| } |
| |
| void |
| ilb_rule_disable_all(ilb_stack_t *ilbs, zoneid_t zoneid) |
| { |
| ilb_rule_t *rule; |
| |
| mutex_enter(&ilbs->ilbs_g_lock); |
| for (rule = ilbs->ilbs_rule_head; rule != NULL; |
| rule = rule->ir_next) { |
| if (rule->ir_zoneid != zoneid) |
| continue; |
| (void) ilb_rule_disable(ilbs, zoneid, NULL, rule); |
| } |
| mutex_exit(&ilbs->ilbs_g_lock); |
| } |
| |
| void |
| ilb_rule_del_all(ilb_stack_t *ilbs, zoneid_t zoneid) |
| { |
| ilb_rule_t *rule; |
| ilb_rule_tq_t *arg; |
| |
| mutex_enter(&ilbs->ilbs_g_lock); |
| while ((rule = ilbs->ilbs_rule_head) != NULL) { |
| if (rule->ir_zoneid != zoneid) |
| continue; |
| ilb_rule_hash_del(rule); |
| ilb_rule_g_del(ilbs, rule); |
| mutex_exit(&ilbs->ilbs_g_lock); |
| |
| arg = kmem_alloc(sizeof (ilb_rule_tq_t), KM_SLEEP); |
| arg->ilbs = ilbs; |
| arg->rule = rule; |
| (void) taskq_dispatch(ilbs->ilbs_rule_taskq, ilb_rule_del_tq, |
| arg, TQ_SLEEP); |
| |
| mutex_enter(&ilbs->ilbs_g_lock); |
| } |
| mutex_exit(&ilbs->ilbs_g_lock); |
| } |
| |
| /* |
| * This is just an optimization, so don't grab the global lock. The |
 * worst case is that we miss a couple of packets.
| */ |
| boolean_t |
| ilb_has_rules(ilb_stack_t *ilbs) |
| { |
| return (ilbs->ilbs_rule_head != NULL); |
| } |
| |
| |
| static int |
| ilb_server_toggle(ilb_stack_t *ilbs, zoneid_t zoneid, const char *rule_name, |
| ilb_rule_t *rule, in6_addr_t *addr, boolean_t enable) |
| { |
| ilb_server_t *tmp_server; |
| int ret; |
| |
| ASSERT((rule == NULL && rule_name != NULL) || |
| (rule != NULL && rule_name == NULL)); |
| |
| if (rule == NULL) { |
| if ((rule = ilb_find_rule(ilbs, zoneid, rule_name, |
| &ret)) == NULL) { |
| return (ret); |
| } |
| } |
| |
| /* Once we get a hold on the rule, no server can be added/deleted. */ |
| for (tmp_server = rule->ir_servers; tmp_server != NULL; |
| tmp_server = tmp_server->iser_next) { |
| if (IN6_ARE_ADDR_EQUAL(&tmp_server->iser_addr_v6, addr)) |
| break; |
| } |
| if (tmp_server == NULL) { |
| ret = ENOENT; |
| goto done; |
| } |
| |
| if (enable) { |
| ret = rule->ir_alg->ilb_alg_server_enable(tmp_server, |
| rule->ir_alg->ilb_alg_data); |
| if (ret == 0) { |
| tmp_server->iser_enabled = B_TRUE; |
| tmp_server->iser_die_time = 0; |
| } |
| } else { |
| ret = rule->ir_alg->ilb_alg_server_disable(tmp_server, |
| rule->ir_alg->ilb_alg_data); |
| if (ret == 0) { |
| tmp_server->iser_enabled = B_FALSE; |
| if (rule->ir_conn_drain_timeout != 0) { |
| (void) atomic_swap_64( |
| (uint64_t *)&tmp_server->iser_die_time, |
| ddi_get_lbolt64() + SEC_TO_TICK( |
| rule->ir_conn_drain_timeout)); |
| } |
| } |
| } |
| |
| done: |
| if (rule_name != NULL) |
| ILB_RULE_REFRELE(rule); |
| return (ret); |
| } |
| int |
| ilb_server_enable(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name, |
| ilb_rule_t *rule, in6_addr_t *addr) |
| { |
| return (ilb_server_toggle(ilbs, zoneid, name, rule, addr, B_TRUE)); |
| } |
| |
| int |
| ilb_server_disable(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name, |
| ilb_rule_t *rule, in6_addr_t *addr) |
| { |
| return (ilb_server_toggle(ilbs, zoneid, name, rule, addr, B_FALSE)); |
| } |
| |
| /* |
| * Add a back end server to a rule. If the address is IPv4, it is assumed |
| * to be passed in as a mapped address. |
| */ |
| int |
| ilb_server_add(ilb_stack_t *ilbs, ilb_rule_t *rule, ilb_server_info_t *info) |
| { |
| ilb_server_t *server; |
| netstackid_t stackid; |
| int ret = 0; |
| in_port_t min_port, max_port; |
| in_port_t range; |
| |
| /* Port is passed in network byte order. */ |
| min_port = ntohs(info->min_port); |
| max_port = ntohs(info->max_port); |
| if (min_port > max_port) |
| return (EINVAL); |
| |
| /* min_port == 0 means "all ports". Make it so */ |
| if (min_port == 0) { |
| min_port = 1; |
| max_port = 65535; |
| } |
| range = max_port - min_port; |
| |
| mutex_enter(&rule->ir_lock); |
	/* If someone is already doing server add/del, sleep and wait. */
| while (rule->ir_flags & ILB_RULE_BUSY) { |
| if (cv_wait_sig(&rule->ir_cv, &rule->ir_lock) == 0) { |
| mutex_exit(&rule->ir_lock); |
| return (EINTR); |
| } |
| } |
| |
| /* |
| * Set the rule to be busy to make sure that no new packet can |
| * use this rule. |
| */ |
| rule->ir_flags |= ILB_RULE_BUSY; |
| |
| /* Now wait for all other guys to finish their work. */ |
| while (rule->ir_refcnt > 2) { |
| if (cv_wait_sig(&rule->ir_cv, &rule->ir_lock) == 0) { |
| mutex_exit(&rule->ir_lock); |
| ret = EINTR; |
| goto end; |
| } |
| } |
| mutex_exit(&rule->ir_lock); |
| |
| /* Sanity checks... */ |
| if ((IN6_IS_ADDR_V4MAPPED(&info->addr) && |
| rule->ir_ipver != IPPROTO_IP) || |
| (!IN6_IS_ADDR_V4MAPPED(&info->addr) && |
| rule->ir_ipver != IPPROTO_IPV6)) { |
| ret = EINVAL; |
| goto end; |
| } |
| |
| /* |
| * Check for valid port range. |
| * |
| * For DSR, there can be no port shifting. Hence the server |
| * specification must be the same as the rule's. |
| * |
| * For half-NAT/NAT, the range must either be 0 (port collapsing) or |
	 * it must be equal to the rule's port range.
| */ |
| if (rule->ir_topo == ILB_TOPO_IMPL_DSR) { |
| if (rule->ir_max_port != max_port || |
| rule->ir_min_port != min_port) { |
| ret = EINVAL; |
| goto end; |
| } |
| } else { |
| if ((range != rule->ir_max_port - rule->ir_min_port) && |
| range != 0) { |
| ret = EINVAL; |
| goto end; |
| } |
| } |
| |
| /* Check for duplicate. */ |
| for (server = rule->ir_servers; server != NULL; |
| server = server->iser_next) { |
| if (IN6_ARE_ADDR_EQUAL(&server->iser_addr_v6, &info->addr) || |
| strcasecmp(server->iser_name, info->name) == 0) { |
| break; |
| } |
| } |
| if (server != NULL) { |
| ret = EEXIST; |
| goto end; |
| } |
| |
| if ((server = kmem_zalloc(sizeof (ilb_server_t), KM_NOSLEEP)) == NULL) { |
| ret = ENOMEM; |
| goto end; |
| } |
| |
| (void) memcpy(server->iser_name, info->name, ILB_SERVER_NAMESZ - 1); |
| (void) inet_ntop(AF_INET6, &info->addr, server->iser_ip_addr, |
| sizeof (server->iser_ip_addr)); |
| stackid = (netstackid_t)(uintptr_t)ilbs->ilbs_ksp->ks_private; |
| server->iser_ksp = ilb_server_kstat_init(stackid, rule, server); |
| if (server->iser_ksp == NULL) { |
| kmem_free(server, sizeof (ilb_server_t)); |
| ret = EINVAL; |
| goto end; |
| } |
| |
| server->iser_stackid = stackid; |
| server->iser_addr_v6 = info->addr; |
| server->iser_min_port = min_port; |
| server->iser_max_port = max_port; |
| if (min_port != max_port) |
| server->iser_port_range = B_TRUE; |
| else |
| server->iser_port_range = B_FALSE; |
| |
| /* |
| * If the rule uses NAT, find/create the NAT source entry to use |
| * for this server. |
| */ |
| if (rule->ir_topo == ILB_TOPO_IMPL_NAT) { |
| in_port_t port; |
| |
| /* |
| * If the server uses a port range, our port allocation |
| * scheme needs to treat it as a wildcard. Refer to the |
| * comments in ilb_nat.c about the scheme. |
| */ |
| if (server->iser_port_range) |
| port = 0; |
| else |
| port = server->iser_min_port; |
| |
| if ((ret = ilb_create_nat_src(ilbs, &server->iser_nat_src, |
| &server->iser_addr_v6, port, &rule->ir_nat_src_start, |
| num_nat_src_v6(&rule->ir_nat_src_start, |
| &rule->ir_nat_src_end))) != 0) { |
| kstat_delete_netstack(server->iser_ksp, stackid); |
| kmem_free(server, sizeof (ilb_server_t)); |
| goto end; |
| } |
| } |
| |
| /* |
| * The iser_lock is only used to protect iser_refcnt. All the other |
| * fields in ilb_server_t should not change, except for iser_enabled. |
| * The worst thing that can happen if iser_enabled is messed up is |
| * that one or two packets may not be load balanced to a server |
| * correctly. |
| */ |
| server->iser_refcnt = 1; |
| server->iser_enabled = info->flags & ILB_SERVER_ENABLED ? B_TRUE : |
| B_FALSE; |
| mutex_init(&server->iser_lock, NULL, MUTEX_DEFAULT, NULL); |
| cv_init(&server->iser_cv, NULL, CV_DEFAULT, NULL); |
| |
| /* Let the load balancing algorithm know about the addition. */ |
| ASSERT(rule->ir_alg != NULL); |
| if ((ret = rule->ir_alg->ilb_alg_server_add(server, |
| rule->ir_alg->ilb_alg_data)) != 0) { |
| kstat_delete_netstack(server->iser_ksp, stackid); |
| kmem_free(server, sizeof (ilb_server_t)); |
| goto end; |
| } |
| |
| /* |
| * No need to hold ir_lock since no other thread should manipulate |
| * the following fields until ILB_RULE_BUSY is cleared. |
| */ |
| if (rule->ir_servers == NULL) { |
| server->iser_next = NULL; |
| } else { |
| server->iser_next = rule->ir_servers; |
| } |
| rule->ir_servers = server; |
| ILB_R_KSTAT(rule, num_servers); |
| |
| end: |
| mutex_enter(&rule->ir_lock); |
| rule->ir_flags &= ~ILB_RULE_BUSY; |
| cv_signal(&rule->ir_cv); |
| mutex_exit(&rule->ir_lock); |
| return (ret); |
| } |
| |
/* The routine executed by the rule taskq to finish deleting a server. */
| static void |
| ilb_server_del_tq(void *arg) |
| { |
| ilb_server_t *server = (ilb_server_t *)arg; |
| |
| mutex_enter(&server->iser_lock); |
| while (server->iser_refcnt > 1) |
| cv_wait(&server->iser_cv, &server->iser_lock); |
| kstat_delete_netstack(server->iser_ksp, server->iser_stackid); |
| kmem_free(server, sizeof (ilb_server_t)); |
| } |
| |
| /* |
| * Delete a back end server from a rule. If the address is IPv4, it is assumed |
| * to be passed in as a mapped address. |
| */ |
| int |
| ilb_server_del(ilb_stack_t *ilbs, zoneid_t zoneid, const char *rule_name, |
| ilb_rule_t *rule, in6_addr_t *addr) |
| { |
| ilb_server_t *server; |
| ilb_server_t *prev_server; |
| int ret = 0; |
| |
| ASSERT((rule == NULL && rule_name != NULL) || |
| (rule != NULL && rule_name == NULL)); |
| if (rule == NULL) { |
| if ((rule = ilb_find_rule(ilbs, zoneid, rule_name, |
| &ret)) == NULL) { |
| return (ret); |
| } |
| } |
| |
| mutex_enter(&rule->ir_lock); |
	/* If someone is already doing server add/del, sleep and wait. */
| while (rule->ir_flags & ILB_RULE_BUSY) { |
| if (cv_wait_sig(&rule->ir_cv, &rule->ir_lock) == 0) { |
| if (rule_name != NULL) { |
| if (--rule->ir_refcnt <= 2) |
| cv_signal(&rule->ir_cv); |
| } |
| mutex_exit(&rule->ir_lock); |
| return (EINTR); |
| } |
| } |
| /* |
| * Set the rule to be busy to make sure that no new packet can |
| * use this rule. |
| */ |
| rule->ir_flags |= ILB_RULE_BUSY; |
| |
| /* Now wait for all other guys to finish their work. */ |
| while (rule->ir_refcnt > 2) { |
| if (cv_wait_sig(&rule->ir_cv, &rule->ir_lock) == 0) { |
| mutex_exit(&rule->ir_lock); |
| ret = EINTR; |
| goto end; |
| } |
| } |
| mutex_exit(&rule->ir_lock); |
| |
| prev_server = NULL; |
| for (server = rule->ir_servers; server != NULL; |
| prev_server = server, server = server->iser_next) { |
| if (IN6_ARE_ADDR_EQUAL(&server->iser_addr_v6, addr)) |
| break; |
| } |
| if (server == NULL) { |
| ret = ENOENT; |
| goto end; |
| } |
| |
| /* |
| * Let the load balancing algorithm know about the removal. |
| * The algorithm may disallow the removal... |
| */ |
| if ((ret = rule->ir_alg->ilb_alg_server_del(server, |
| rule->ir_alg->ilb_alg_data)) != 0) { |
| goto end; |
| } |
| |
| if (prev_server == NULL) |
| rule->ir_servers = server->iser_next; |
| else |
| prev_server->iser_next = server->iser_next; |
| |
| ILB_R_KSTAT_UPDATE(rule, num_servers, -1); |
| |
| /* |
| * Mark the server as disabled so that if there is any sticky cache |
| * using this server around, it won't be used. |
| */ |
| server->iser_enabled = B_FALSE; |
| |
| mutex_enter(&server->iser_lock); |
| |
| /* |
	 * De-allocate the NAT source array. The individual ilb_nat_src_entry_t
| * may not go away if there is still a conn using it. The NAT source |
| * timer will do the garbage collection. |
| */ |
| ilb_destroy_nat_src(&server->iser_nat_src); |
| |
| /* If there is a hard limit on when a server should die, set it. */ |
| if (rule->ir_conn_drain_timeout != 0) { |
| (void) atomic_swap_64((uint64_t *)&server->iser_die_time, |
| ddi_get_lbolt64() + |
| SEC_TO_TICK(rule->ir_conn_drain_timeout)); |
| } |
| |
| if (server->iser_refcnt > 1) { |
| (void) taskq_dispatch(ilbs->ilbs_rule_taskq, ilb_server_del_tq, |
| server, TQ_SLEEP); |
| mutex_exit(&server->iser_lock); |
| } else { |
| kstat_delete_netstack(server->iser_ksp, server->iser_stackid); |
| kmem_free(server, sizeof (ilb_server_t)); |
| } |
| |
| end: |
| mutex_enter(&rule->ir_lock); |
| rule->ir_flags &= ~ILB_RULE_BUSY; |
| if (rule_name != NULL) |
| rule->ir_refcnt--; |
| cv_signal(&rule->ir_cv); |
| mutex_exit(&rule->ir_lock); |
| return (ret); |
| } |
| |
| /* |
| * First check if the destination of the ICMP message matches a VIP of |
| * a rule. If it does not, just return ILB_PASSED. |
| * |
| * If the destination matches a VIP: |
| * |
| * For ICMP_ECHO_REQUEST, generate a response on behalf of the back end |
| * server. |
| * |
| * For ICMP_DEST_UNREACHABLE fragmentation needed, check inside the payload |
| * and see which back end server we should send this message to. And we |
| * need to do NAT on both the payload message and the outside IP packet. |
| * |
| * For other ICMP messages, drop them. |
| */ |
| /* ARGSUSED */ |
| static int |
| ilb_icmp_v4(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, ipha_t *ipha, |
| icmph_t *icmph, ipaddr_t *lb_dst) |
| { |
| ipaddr_t vip; |
| ilb_rule_t *rule; |
| in6_addr_t addr6; |
| |
| if (!ilb_rule_match_vip_v4(ilbs, ipha->ipha_dst, &rule)) |
| return (ILB_PASSED); |
| |
| |
| if ((uint8_t *)icmph + sizeof (icmph_t) > mp->b_wptr) { |
| ILB_R_KSTAT(rule, icmp_dropped); |
| ILB_RULE_REFRELE(rule); |
| return (ILB_DROPPED); |
| } |
| |
| switch (icmph->icmph_type) { |
| case ICMP_ECHO_REQUEST: |
| ILB_R_KSTAT(rule, icmp_echo_processed); |
| ILB_RULE_REFRELE(rule); |
| |
| icmph->icmph_type = ICMP_ECHO_REPLY; |
| icmph->icmph_checksum = 0; |
| icmph->icmph_checksum = IP_CSUM(mp, IPH_HDR_LENGTH(ipha), 0); |
| ipha->ipha_ttl = |
| ilbs->ilbs_netstack->netstack_ip->ips_ip_def_ttl; |
| *lb_dst = ipha->ipha_src; |
| vip = ipha->ipha_dst; |
| ipha->ipha_dst = ipha->ipha_src; |
| ipha->ipha_src = vip; |
| return (ILB_BALANCED); |
| case ICMP_DEST_UNREACHABLE: { |
| int ret; |
| |
| if (icmph->icmph_code != ICMP_FRAGMENTATION_NEEDED) { |
| ILB_R_KSTAT(rule, icmp_dropped); |
| ILB_RULE_REFRELE(rule); |
| return (ILB_DROPPED); |
| } |
| if (ilb_check_icmp_conn(ilbs, mp, IPPROTO_IP, ipha, icmph, |
| &addr6)) { |
| ILB_R_KSTAT(rule, icmp_2big_processed); |
| ret = ILB_BALANCED; |
| } else { |
| ILB_R_KSTAT(rule, icmp_2big_dropped); |
| ret = ILB_DROPPED; |
| } |
| ILB_RULE_REFRELE(rule); |
| IN6_V4MAPPED_TO_IPADDR(&addr6, *lb_dst); |
| return (ret); |
| } |
| default: |
| ILB_R_KSTAT(rule, icmp_dropped); |
| ILB_RULE_REFRELE(rule); |
| return (ILB_DROPPED); |
| } |
| } |
| |
| /* ARGSUSED */ |
| static int |
| ilb_icmp_v6(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, ip6_t *ip6h, |
| icmp6_t *icmp6, in6_addr_t *lb_dst) |
| { |
| ilb_rule_t *rule; |
| |
| if (!ilb_rule_match_vip_v6(ilbs, &ip6h->ip6_dst, &rule)) |
| return (ILB_PASSED); |
| |
| if ((uint8_t *)icmp6 + sizeof (icmp6_t) > mp->b_wptr) { |
| ILB_R_KSTAT(rule, icmp_dropped); |
| ILB_RULE_REFRELE(rule); |
| return (ILB_DROPPED); |
| } |
| |
| switch (icmp6->icmp6_type) { |
| case ICMP6_ECHO_REQUEST: { |
| int hdr_len; |
| |
| ILB_R_KSTAT(rule, icmp_echo_processed); |
| ILB_RULE_REFRELE(rule); |
| |
| icmp6->icmp6_type = ICMP6_ECHO_REPLY; |
| icmp6->icmp6_cksum = ip6h->ip6_plen; |
| hdr_len = (char *)icmp6 - (char *)ip6h; |
| icmp6->icmp6_cksum = IP_CSUM(mp, hdr_len, |
| ilb_pseudo_sum_v6(ip6h, IPPROTO_ICMPV6)); |
| ip6h->ip6_vcf &= ~IPV6_FLOWINFO_FLOWLABEL; |
| ip6h->ip6_hops = |
| ilbs->ilbs_netstack->netstack_ip->ips_ipv6_def_hops; |
| *lb_dst = ip6h->ip6_src; |
| ip6h->ip6_src = ip6h->ip6_dst; |
| ip6h->ip6_dst = *lb_dst; |
| return (ILB_BALANCED); |
| } |
| case ICMP6_PACKET_TOO_BIG: { |
| int ret; |
| |
| if (ilb_check_icmp_conn(ilbs, mp, IPPROTO_IPV6, ip6h, icmp6, |
| lb_dst)) { |
| ILB_R_KSTAT(rule, icmp_2big_processed); |
| ret = ILB_BALANCED; |
| } else { |
| ILB_R_KSTAT(rule, icmp_2big_dropped); |
| ret = ILB_DROPPED; |
| } |
| ILB_RULE_REFRELE(rule); |
| return (ret); |
| } |
| default: |
| ILB_R_KSTAT(rule, icmp_dropped); |
| ILB_RULE_REFRELE(rule); |
| return (ILB_DROPPED); |
| } |
| } |
| |
| /* |
| * Common routine to check an incoming packet and decide what to do with it. |
 * Called by ilb_check_v4|v6().
| */ |
| static int |
| ilb_check(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, in6_addr_t *src, |
| in6_addr_t *dst, int l3, int l4, void *iph, uint8_t *tph, uint32_t pkt_len, |
| in6_addr_t *lb_dst) |
| { |
| in_port_t sport, dport; |
| tcpha_t *tcph; |
| udpha_t *udph; |
| ilb_rule_t *rule; |
| ilb_server_t *server; |
| boolean_t balanced; |
| struct ilb_sticky_s *s = NULL; |
| int ret; |
| uint32_t ip_sum, tp_sum; |
| ilb_nat_info_t info; |
| uint16_t nat_src_idx; |
| boolean_t busy; |
| |
| ret = 0; |
| |
| /* |
	 * We don't really need to switch here since both protocols'
| * ports are at the same offset. Just prepare for future protocol |
| * specific processing. |
| */ |
| switch (l4) { |
| case IPPROTO_TCP: |
| if (tph + TCP_MIN_HEADER_LENGTH > mp->b_wptr) |
| return (ILB_DROPPED); |
| tcph = (tcpha_t *)tph; |
| sport = tcph->tha_lport; |
| dport = tcph->tha_fport; |
| break; |
| case IPPROTO_UDP: |
| if (tph + sizeof (udpha_t) > mp->b_wptr) |
| return (ILB_DROPPED); |
| udph = (udpha_t *)tph; |
| sport = udph->uha_src_port; |
| dport = udph->uha_dst_port; |
| break; |
| default: |
| return (ILB_PASSED); |
| } |
| |
| /* Fast path, there is an existing conn. */ |
| if (ilb_check_conn(ilbs, l3, iph, l4, tph, src, dst, sport, dport, |
| pkt_len, lb_dst)) { |
| return (ILB_BALANCED); |
| } |
| |
| /* |
| * If there is no existing connection for the incoming packet, check |
| * to see if the packet matches a rule. If not, just let IP decide |
| * what to do with it. |
| * |
	 * Note: a reply from a back end server should not match a rule. A
	 * reply should match an existing conn.
| */ |
| rule = ilb_rule_hash(ilbs, l3, l4, dst, dport, ill->ill_zoneid, |
| pkt_len, &busy); |
| if (rule == NULL) { |
| /* If the rule is busy, just drop the packet. */ |
| if (busy) |
| return (ILB_DROPPED); |
| else |
| return (ILB_PASSED); |
| } |
| |
| /* |
	 * The packet matches a rule; use the rule's load balancing algorithm
| * to find a server. |
| */ |
| balanced = rule->ir_alg->ilb_alg_lb(src, sport, dst, dport, |
| rule->ir_alg->ilb_alg_data, &server); |
| /* |
| * This can only happen if there is no server in a rule or all |
| * the servers are currently disabled. |
| */ |
| if (!balanced) |
| goto no_server; |
| |
| /* |
| * If the rule is sticky enabled, we need to check the sticky table. |
| * If there is a sticky entry for the client, use the previous server |
| * instead of the one found above (note that both can be the same). |
| * If there is no entry for that client, add an entry to the sticky |
	 * table. Both the find and the add are done in ilb_sticky_find_add()
	 * to avoid a separate duplicate check when adding an entry.
| */ |
| if (rule->ir_flags & ILB_RULE_STICKY) { |
| in6_addr_t addr; |
| |
| V6_MASK_COPY(*src, rule->ir_sticky_mask, addr); |
| if ((server = ilb_sticky_find_add(ilbs, rule, &addr, server, |
| &s, &nat_src_idx)) == NULL) { |
| ILB_R_KSTAT(rule, nomem_pkt_dropped); |
| ILB_R_KSTAT_UPDATE(rule, nomem_bytes_dropped, pkt_len); |
| goto no_server; |
| } |
| } |
| |
| /* |
| * We are holding a reference on the rule, so the server |
| * cannot go away. |
| */ |
| *lb_dst = server->iser_addr_v6; |
| ILB_S_KSTAT(server, pkt_processed); |
| ILB_S_KSTAT_UPDATE(server, bytes_processed, pkt_len); |
| |
| switch (rule->ir_topo) { |
| case ILB_TOPO_IMPL_NAT: { |
| ilb_nat_src_entry_t *src_ent; |
| uint16_t *src_idx; |
| |
| /* |
| * We create a cache even if it is not a SYN segment. |
| * The server should return a RST. When we see the |
| * RST, we will destroy this cache. But by having |
| * a cache, we know how to NAT the returned RST. |
| */ |
| info.vip = *dst; |
| info.dport = dport; |
| info.src = *src; |
| info.sport = sport; |
| |
| /* If stickiness is enabled, use the same source address */ |
| if (s != NULL) |
| src_idx = &nat_src_idx; |
| else |
| src_idx = NULL; |
| |
| if ((src_ent = ilb_alloc_nat_addr(server->iser_nat_src, |
| &info.nat_src, &info.nat_sport, src_idx)) == NULL) { |
| if (s != NULL) |
| ilb_sticky_refrele(s); |
| ILB_R_KSTAT(rule, pkt_dropped); |
| ILB_R_KSTAT_UPDATE(rule, bytes_dropped, pkt_len); |
| ILB_R_KSTAT(rule, noport_pkt_dropped); |
| ILB_R_KSTAT_UPDATE(rule, noport_bytes_dropped, pkt_len); |
| ret = ILB_DROPPED; |
| break; |
| } |
| info.src_ent = src_ent; |
| info.nat_dst = server->iser_addr_v6; |
| if (rule->ir_port_range && server->iser_port_range) { |
| info.nat_dport = htons(ntohs(dport) - |
| rule->ir_min_port + server->iser_min_port); |
| } else { |
| info.nat_dport = htons(server->iser_min_port); |
| } |
| |
| /* |
| * If ilb_conn_add() fails, it will release the reference on |
| * sticky info and de-allocate the NAT source port allocated |
| * above. |
| */ |
| if (ilb_conn_add(ilbs, rule, server, src, sport, dst, |
| dport, &info, &ip_sum, &tp_sum, s) != 0) { |
| ILB_R_KSTAT(rule, pkt_dropped); |
| ILB_R_KSTAT_UPDATE(rule, bytes_dropped, pkt_len); |
| ILB_R_KSTAT(rule, nomem_pkt_dropped); |
| ILB_R_KSTAT_UPDATE(rule, nomem_bytes_dropped, pkt_len); |
| ret = ILB_DROPPED; |
| break; |
| } |
| ilb_full_nat(l3, iph, l4, tph, &info, ip_sum, tp_sum, B_TRUE); |
| ret = ILB_BALANCED; |
| break; |
| } |
| case ILB_TOPO_IMPL_HALF_NAT: |
| info.vip = *dst; |
| info.nat_dst = server->iser_addr_v6; |
| info.dport = dport; |
| if (rule->ir_port_range && server->iser_port_range) { |
| info.nat_dport = htons(ntohs(dport) - |
| rule->ir_min_port + server->iser_min_port); |
| } else { |
| info.nat_dport = htons(server->iser_min_port); |
| } |
| |
| if (ilb_conn_add(ilbs, rule, server, src, sport, dst, |
| dport, &info, &ip_sum, &tp_sum, s) != 0) { |
| ILB_R_KSTAT(rule, pkt_dropped); |
| ILB_R_KSTAT_UPDATE(rule, bytes_dropped, pkt_len); |
| ILB_R_KSTAT(rule, nomem_pkt_dropped); |
| ILB_R_KSTAT_UPDATE(rule, nomem_bytes_dropped, pkt_len); |
| ret = ILB_DROPPED; |
| break; |
| } |
| ilb_half_nat(l3, iph, l4, tph, &info, ip_sum, tp_sum, B_TRUE); |
| |
| ret = ILB_BALANCED; |
| break; |
| case ILB_TOPO_IMPL_DSR: |
| /* |
| * By decrementing the sticky refcnt, the period of |
| * stickiness (life time of ilb_sticky_t) will be |
| * from now to (now + default expiry time). |
| */ |
| if (s != NULL) |
| ilb_sticky_refrele(s); |
| ret = ILB_BALANCED; |
| break; |
| default: |
		cmn_err(CE_PANIC, "data corruption: unknown topology: %p",
		    (void *)rule);
| break; |
| } |
| ILB_RULE_REFRELE(rule); |
| return (ret); |
| |
| no_server: |
| /* This can only happen if there is no server available. */ |
| ILB_R_KSTAT(rule, pkt_dropped); |
| ILB_R_KSTAT_UPDATE(rule, bytes_dropped, pkt_len); |
| ILB_RULE_REFRELE(rule); |
| return (ILB_DROPPED); |
| } |
| |
| int |
| ilb_check_v4(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, ipha_t *ipha, int l4, |
| uint8_t *tph, ipaddr_t *lb_dst) |
| { |
| in6_addr_t v6_src, v6_dst, v6_lb_dst; |
| int ret; |
| |
| ASSERT(DB_REF(mp) == 1); |
| |
| if (l4 == IPPROTO_ICMP) { |
| return (ilb_icmp_v4(ilbs, ill, mp, ipha, (icmph_t *)tph, |
| lb_dst)); |
| } |
| |
| IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6_src); |
| IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6_dst); |
| ret = ilb_check(ilbs, ill, mp, &v6_src, &v6_dst, IPPROTO_IP, l4, ipha, |
| tph, ntohs(ipha->ipha_length), &v6_lb_dst); |
| if (ret == ILB_BALANCED) |
| IN6_V4MAPPED_TO_IPADDR(&v6_lb_dst, *lb_dst); |
| return (ret); |
| } |
| |
| int |
| ilb_check_v6(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, ip6_t *ip6h, int l4, |
| uint8_t *tph, in6_addr_t *lb_dst) |
| { |
| uint32_t pkt_len; |
| |
| ASSERT(DB_REF(mp) == 1); |
| |
| if (l4 == IPPROTO_ICMPV6) { |
| return (ilb_icmp_v6(ilbs, ill, mp, ip6h, (icmp6_t *)tph, |
| lb_dst)); |
| } |
| |
| pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN; |
| return (ilb_check(ilbs, ill, mp, &ip6h->ip6_src, &ip6h->ip6_dst, |
| IPPROTO_IPV6, l4, ip6h, tph, pkt_len, lb_dst)); |
| } |
| |
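| /* |
| * Return in *num_rules the number of rules belonging to the given zone. |
| */ |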
| void |
| ilb_get_num_rules(ilb_stack_t *ilbs, zoneid_t zoneid, uint32_t *num_rules) |
| { |
| ilb_rule_t *tmp_rule; |
| |
| mutex_enter(&ilbs->ilbs_g_lock); |
| *num_rules = 0; |
| for (tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL; |
| tmp_rule = tmp_rule->ir_next) { |
| if (tmp_rule->ir_zoneid == zoneid) |
| *num_rules += 1; |
| } |
| mutex_exit(&ilbs->ilbs_g_lock); |
| } |
| |
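| /* |
| * Return in *num_servers the number of servers of the named rule, taken |
| * from the rule's kstat. Returns 0 on success or the error reported by |
| * ilb_find_rule(). |
| */ |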
| int |
| ilb_get_num_servers(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name, |
| uint32_t *num_servers) |
| { |
| ilb_rule_t *rule; |
| int err; |
| |
| if ((rule = ilb_find_rule(ilbs, zoneid, name, &err)) == NULL) |
| return (err); |
| *num_servers = rule->ir_kstat.num_servers.value.ui64; |
| ILB_RULE_REFRELE(rule); |
| return (0); |
| } |
| |
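| /* |
| * Copy out up to *num_servers entries of the named rule's server list |
| * into the caller supplied array. On return, *num_servers is set to |
| * the number of entries actually filled in. |
| */ |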
| int |
| ilb_get_servers(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name, |
| ilb_server_info_t *servers, uint32_t *num_servers) |
| { |
| ilb_rule_t *rule; |
| ilb_server_t *server; |
| size_t cnt; |
| int err; |
| |
| if ((rule = ilb_find_rule(ilbs, zoneid, name, &err)) == NULL) |
| return (err); |
| for (server = rule->ir_servers, cnt = *num_servers; |
| server != NULL && cnt > 0; |
| server = server->iser_next, cnt--, servers++) { |
| (void) memcpy(servers->name, server->iser_name, |
| ILB_SERVER_NAMESZ); |
| servers->addr = server->iser_addr_v6; |
| servers->min_port = htons(server->iser_min_port); |
| servers->max_port = htons(server->iser_max_port); |
| servers->flags = server->iser_enabled ? ILB_SERVER_ENABLED : 0; |
| servers->err = 0; |
| } |
| ILB_RULE_REFRELE(rule); |
| *num_servers -= cnt; |
| |
| return (0); |
| } |
| |
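| /* |
| * Copy out up to *num_names rule names of the given zone into buf, each |
| * name occupying ILB_RULE_NAMESZ bytes. On return, *num_names is set |
| * to the number of names copied. |
| */ |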
| void |
| ilb_get_rulenames(ilb_stack_t *ilbs, zoneid_t zoneid, uint32_t *num_names, |
| char *buf) |
| { |
| ilb_rule_t *tmp_rule; |
| int cnt; |
| |
| if (*num_names == 0) |
| return; |
| |
| mutex_enter(&ilbs->ilbs_g_lock); |
| for (cnt = 0, tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL; |
| tmp_rule = tmp_rule->ir_next) { |
| if (tmp_rule->ir_zoneid != zoneid) |
| continue; |
| |
| (void) memcpy(buf, tmp_rule->ir_name, ILB_RULE_NAMESZ); |
| buf += ILB_RULE_NAMESZ; |
| if (++cnt == *num_names) |
| break; |
| } |
| mutex_exit(&ilbs->ilbs_g_lock); |
| *num_names = cnt; |
| } |
| |
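| /* |
| * Fill in the given ilb_rule_cmd_t with the attributes of the named |
| * rule for reporting back to user land. |
| */ |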
| int |
| ilb_rule_list(ilb_stack_t *ilbs, zoneid_t zoneid, ilb_rule_cmd_t *cmd) |
| { |
| ilb_rule_t *rule; |
| int err; |
| |
| if ((rule = ilb_find_rule(ilbs, zoneid, cmd->name, &err)) == NULL) { |
| return (err); |
| } |
| |
| /* |
| * Except for the enabled flag, none of the following will change |
| * during the lifetime of a rule, so we don't hold the mutex when |
| * reading them. The worst case is to report a stale enabled flag. |
| */ |
| cmd->ip_ver = rule->ir_ipver; |
| cmd->proto = rule->ir_proto; |
| cmd->min_port = htons(rule->ir_min_port); |
| cmd->max_port = htons(rule->ir_max_port); |
| |
| cmd->vip = rule->ir_target_v6; |
| cmd->algo = rule->ir_alg_type; |
| cmd->topo = rule->ir_topo; |
| |
| cmd->nat_src_start = rule->ir_nat_src_start; |
| cmd->nat_src_end = rule->ir_nat_src_end; |
| |
| cmd->conn_drain_timeout = rule->ir_conn_drain_timeout; |
| cmd->nat_expiry = rule->ir_nat_expiry; |
| cmd->sticky_expiry = rule->ir_sticky_expiry; |
| |
| cmd->flags = 0; |
| if (rule->ir_flags & ILB_RULE_ENABLED) |
| cmd->flags |= ILB_RULE_ENABLED; |
| if (rule->ir_flags & ILB_RULE_STICKY) { |
| cmd->flags |= ILB_RULE_STICKY; |
| cmd->sticky_mask = rule->ir_sticky_mask; |
| } |
| |
| ILB_RULE_REFRELE(rule); |
| return (0); |
| } |
| |
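| /* |
| * Netstack init routine. Allocates the per stack ilb_stack_t, its |
| * global kstats and the rule taskq. The conn hash, sticky hash and |
| * NAT source related structures are only allocated later, when they |
| * are first needed. |
| */ |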
| static void * |
| ilb_stack_init(netstackid_t stackid, netstack_t *ns) |
| { |
| ilb_stack_t *ilbs; |
| char tq_name[TASKQ_NAMELEN]; |
| |
| ilbs = kmem_alloc(sizeof (ilb_stack_t), KM_SLEEP); |
| ilbs->ilbs_netstack = ns; |
| |
| ilbs->ilbs_rule_head = NULL; |
| ilbs->ilbs_g_hash = NULL; |
| mutex_init(&ilbs->ilbs_g_lock, NULL, MUTEX_DEFAULT, NULL); |
| |
| ilbs->ilbs_kstat = kmem_alloc(sizeof (ilb_g_kstat_t), KM_SLEEP); |
| if ((ilbs->ilbs_ksp = ilb_kstat_g_init(stackid, ilbs)) == NULL) { |
| kmem_free(ilbs->ilbs_kstat, sizeof (ilb_g_kstat_t)); |
| kmem_free(ilbs, sizeof (ilb_stack_t)); |
| return (NULL); |
| } |
| |
| /* |
| * The conn hash and sticky hash related fields are initialized in |
| * ilb_conn_hash_init() and ilb_sticky_hash_init() respectively. |
| */ |
| ilbs->ilbs_conn_taskq = NULL; |
| ilbs->ilbs_rule_hash_size = ilb_rule_hash_size; |
| ilbs->ilbs_conn_hash_size = ilb_conn_hash_size; |
| ilbs->ilbs_c2s_conn_hash = NULL; |
| ilbs->ilbs_s2c_conn_hash = NULL; |
| ilbs->ilbs_conn_timer_list = NULL; |
| |
| ilbs->ilbs_sticky_hash = NULL; |
| ilbs->ilbs_sticky_hash_size = ilb_sticky_hash_size; |
| ilbs->ilbs_sticky_timer_list = NULL; |
| ilbs->ilbs_sticky_taskq = NULL; |
| |
| /* The allocation is done later when there is a rule using NAT mode. */ |
| ilbs->ilbs_nat_src = NULL; |
| ilbs->ilbs_nat_src_hash_size = ilb_nat_src_hash_size; |
| mutex_init(&ilbs->ilbs_nat_src_lock, NULL, MUTEX_DEFAULT, NULL); |
| ilbs->ilbs_nat_src_tid = 0; |
| |
| /* For listing the conn hash table */ |
| mutex_init(&ilbs->ilbs_conn_list_lock, NULL, MUTEX_DEFAULT, NULL); |
| cv_init(&ilbs->ilbs_conn_list_cv, NULL, CV_DEFAULT, NULL); |
| ilbs->ilbs_conn_list_busy = B_FALSE; |
| ilbs->ilbs_conn_list_cur = 0; |
| ilbs->ilbs_conn_list_connp = NULL; |
| |
| /* For listing the sticky hash table */ |
| mutex_init(&ilbs->ilbs_sticky_list_lock, NULL, MUTEX_DEFAULT, NULL); |
| cv_init(&ilbs->ilbs_sticky_list_cv, NULL, CV_DEFAULT, NULL); |
| ilbs->ilbs_sticky_list_busy = B_FALSE; |
| ilbs->ilbs_sticky_list_cur = 0; |
| ilbs->ilbs_sticky_list_curp = NULL; |
| |
| (void) snprintf(tq_name, sizeof (tq_name), "ilb_rule_taskq_%p", |
| (void *)ns); |
| ilbs->ilbs_rule_taskq = taskq_create(tq_name, ILB_RULE_TASKQ_NUM_THR, |
| minclsyspri, 1, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC); |
| |
| return (ilbs); |
| } |
| |
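| /* |
| * Netstack shutdown routine. Tears down the sticky and conn hashes, |
| * removes all remaining rules and, if it was allocated, the NAT source |
| * table. |
| */ |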
| /* ARGSUSED */ |
| static void |
| ilb_stack_shutdown(netstackid_t stackid, void *arg) |
| { |
| ilb_stack_t *ilbs = (ilb_stack_t *)arg; |
| ilb_rule_t *tmp_rule; |
| |
| ilb_sticky_hash_fini(ilbs); |
| ilb_conn_hash_fini(ilbs); |
| mutex_enter(&ilbs->ilbs_g_lock); |
| while ((tmp_rule = ilbs->ilbs_rule_head) != NULL) { |
| ilb_rule_hash_del(tmp_rule); |
| ilb_rule_g_del(ilbs, tmp_rule); |
| mutex_exit(&ilbs->ilbs_g_lock); |
| ilb_rule_del_common(ilbs, tmp_rule); |
| mutex_enter(&ilbs->ilbs_g_lock); |
| } |
| mutex_exit(&ilbs->ilbs_g_lock); |
| if (ilbs->ilbs_nat_src != NULL) |
| ilb_nat_src_fini(ilbs); |
| } |
| |
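| /* |
| * Netstack fini routine. Frees the rule hash, the rule taskq, the |
| * global kstats and the ilb_stack_t itself. |
| */ |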
| static void |
| ilb_stack_fini(netstackid_t stackid, void *arg) |
| { |
| ilb_stack_t *ilbs = (ilb_stack_t *)arg; |
| |
| ilb_rule_hash_fini(ilbs); |
| taskq_destroy(ilbs->ilbs_rule_taskq); |
| ilb_kstat_g_fini(stackid, ilbs); |
| kmem_free(ilbs->ilbs_kstat, sizeof (ilb_g_kstat_t)); |
| kmem_free(ilbs, sizeof (ilb_stack_t)); |
| } |
| |
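| /* |
| * Module initialization: register the per netstack init, shutdown and |
| * fini routines with the netstack framework. |
| */ |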
| void |
| ilb_ddi_g_init(void) |
| { |
| netstack_register(NS_ILB, ilb_stack_init, ilb_stack_shutdown, |
| ilb_stack_fini); |
| } |
| |
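| /* |
| * Module teardown: unregister from the netstack framework and destroy |
| * the global conn and sticky kmem caches. |
| */ |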
| void |
| ilb_ddi_g_destroy(void) |
| { |
| netstack_unregister(NS_ILB); |
| ilb_conn_cache_fini(); |
| ilb_sticky_cache_fini(); |
| } |