| /* |
| * CDDL HEADER START |
| * |
| * The contents of this file are subject to the terms of the |
| * Common Development and Distribution License (the "License"). |
| * You may not use this file except in compliance with the License. |
| * |
| * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE |
| * or http://www.opensolaris.org/os/licensing. |
| * See the License for the specific language governing permissions |
| * and limitations under the License. |
| * |
| * When distributing Covered Code, include this CDDL HEADER in each |
| * file and include the License file at usr/src/OPENSOLARIS.LICENSE. |
| * If applicable, add the following below this CDDL HEADER, with the |
| * fields enclosed by brackets "[]" replaced with your own identifying |
| * information: Portions Copyright [yyyy] [name of copyright owner] |
| * |
| * CDDL HEADER END |
| * |
| * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. |
| */ |
| |
| #include <inet/ip.h> |
| #include <inet/ip6.h> |
| #include <inet/ip_if.h> |
| #include <inet/ip_ire.h> |
| #include <inet/ip_multi.h> |
| #include <inet/ip_ndp.h> |
| #include <inet/ip_rts.h> |
| #include <inet/mi.h> |
| #include <net/if_types.h> |
| #include <sys/dlpi.h> |
| #include <sys/kmem.h> |
| #include <sys/modhash.h> |
| #include <sys/sdt.h> |
| #include <sys/strsun.h> |
| #include <sys/sunddi.h> |
| #include <sys/types.h> |
| |
/*
 * Shorthand for reaching the ip_stack_t behind an ipmp_grp_t (through the
 * group's cached phyint) or behind an ipmp_illgrp_t (through its IPMP
 * meta-interface ill).
 */
#define	IPMP_GRP_TO_IPST(grp)		PHYINT_TO_IPST((grp)->gr_phyint)
#define	IPMP_ILLGRP_TO_IPST(illg)	((illg)->ig_ipmp_ill->ill_ipst)
| |
/*
 * Fixed constants; none of these is important enough to be a tunable.
 */
#define	IPMP_GRP_HASH_SIZE		64	/* passed to the group hash */
#define	IPMP_ILL_REFRESH_TIMEOUT	120	/* seconds */
| |
| /* |
| * IPMP meta-interface kstats (based on those in PSARC/1997/198). |
| */ |
| static const kstat_named_t ipmp_kstats[IPMP_KSTAT_MAX] = { |
| { "obytes", KSTAT_DATA_UINT32 }, |
| { "obytes64", KSTAT_DATA_UINT64 }, |
| { "rbytes", KSTAT_DATA_UINT32 }, |
| { "rbytes64", KSTAT_DATA_UINT64 }, |
| { "opackets", KSTAT_DATA_UINT32 }, |
| { "opackets64", KSTAT_DATA_UINT64 }, |
| { "oerrors", KSTAT_DATA_UINT32 }, |
| { "ipackets", KSTAT_DATA_UINT32 }, |
| { "ipackets64", KSTAT_DATA_UINT64 }, |
| { "ierrors", KSTAT_DATA_UINT32 }, |
| { "multircv", KSTAT_DATA_UINT32 }, |
| { "multixmt", KSTAT_DATA_UINT32 }, |
| { "brdcstrcv", KSTAT_DATA_UINT32 }, |
| { "brdcstxmt", KSTAT_DATA_UINT32 }, |
| { "link_up", KSTAT_DATA_UINT32 } |
| }; |
| |
| static void ipmp_grp_insert(ipmp_grp_t *, mod_hash_hndl_t); |
| static int ipmp_grp_create_kstats(ipmp_grp_t *); |
| static int ipmp_grp_update_kstats(kstat_t *, int); |
| static void ipmp_grp_destroy_kstats(ipmp_grp_t *); |
| static ill_t *ipmp_illgrp_min_ill(ipmp_illgrp_t *); |
| static ill_t *ipmp_illgrp_max_ill(ipmp_illgrp_t *); |
| static void ipmp_illgrp_set_cast(ipmp_illgrp_t *, ill_t *); |
| static void ipmp_illgrp_set_mtu(ipmp_illgrp_t *, uint_t, uint_t); |
| static boolean_t ipmp_ill_activate(ill_t *); |
| static void ipmp_ill_deactivate(ill_t *); |
| static void ipmp_ill_ire_mark_testhidden(ire_t *, char *); |
| static void ipmp_ill_ire_clear_testhidden(ire_t *, char *); |
| static void ipmp_ill_refresh_active_timer_start(ill_t *); |
| static void ipmp_ill_rtsaddrmsg(ill_t *, int); |
| static void ipmp_ill_bind_ipif(ill_t *, ipif_t *, enum ip_resolver_action); |
| static ipif_t *ipmp_ill_unbind_ipif(ill_t *, ipif_t *, boolean_t); |
| static void ipmp_phyint_get_kstats(phyint_t *, uint64_t *); |
| static boolean_t ipmp_ipif_is_up_dataaddr(const ipif_t *); |
| static void ipmp_ncec_delete_nonlocal(ncec_t *, void *); |
| |
| /* |
| * Initialize IPMP state for IP stack `ipst'; called from ip_stack_init(). |
| */ |
| void |
| ipmp_init(ip_stack_t *ipst) |
| { |
| ipst->ips_ipmp_grp_hash = mod_hash_create_extended("ipmp_grp_hash", |
| IPMP_GRP_HASH_SIZE, mod_hash_null_keydtor, mod_hash_null_valdtor, |
| mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP); |
| rw_init(&ipst->ips_ipmp_lock, NULL, RW_DEFAULT, 0); |
| } |
| |
| /* |
| * Destroy IPMP state for IP stack `ipst'; called from ip_stack_fini(). |
| */ |
| void |
| ipmp_destroy(ip_stack_t *ipst) |
| { |
| mod_hash_destroy_hash(ipst->ips_ipmp_grp_hash); |
| rw_destroy(&ipst->ips_ipmp_lock); |
| } |
| |
| /* |
| * Create an IPMP group named `grname', associate it with IPMP phyint `phyi', |
| * and add it to the hash. On success, return a pointer to the created group. |
| * Caller must ensure `grname' is not yet in the hash. Assumes that the IPMP |
| * meta-interface associated with the group also has the same name (but they |
| * may differ later via ipmp_grp_rename()). |
| */ |
| ipmp_grp_t * |
| ipmp_grp_create(const char *grname, phyint_t *phyi) |
| { |
| ipmp_grp_t *grp; |
| ip_stack_t *ipst = PHYINT_TO_IPST(phyi); |
| mod_hash_hndl_t mh; |
| |
| ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock)); |
| |
| if ((grp = kmem_zalloc(sizeof (ipmp_grp_t), KM_NOSLEEP)) == NULL) |
| return (NULL); |
| |
| (void) strlcpy(grp->gr_name, grname, sizeof (grp->gr_name)); |
| (void) strlcpy(grp->gr_ifname, grname, sizeof (grp->gr_ifname)); |
| |
| /* |
| * Cache the group's phyint. This is safe since a phyint_t will |
| * outlive its ipmp_grp_t. |
| */ |
| grp->gr_phyint = phyi; |
| |
| /* |
| * Create IPMP group kstats. |
| */ |
| if (ipmp_grp_create_kstats(grp) != 0) { |
| kmem_free(grp, sizeof (ipmp_grp_t)); |
| return (NULL); |
| } |
| |
| /* |
| * Insert the group into the hash. |
| */ |
| if (mod_hash_reserve_nosleep(ipst->ips_ipmp_grp_hash, &mh) != 0) { |
| ipmp_grp_destroy_kstats(grp); |
| kmem_free(grp, sizeof (ipmp_grp_t)); |
| return (NULL); |
| } |
| ipmp_grp_insert(grp, mh); |
| |
| return (grp); |
| } |
| |
| /* |
| * Create IPMP kstat structures for `grp'. Return an errno upon failure. |
| */ |
| static int |
| ipmp_grp_create_kstats(ipmp_grp_t *grp) |
| { |
| kstat_t *ksp; |
| netstackid_t id = IPMP_GRP_TO_IPST(grp)->ips_netstack->netstack_stackid; |
| |
| ksp = kstat_create_netstack("ipmp", 0, grp->gr_ifname, "net", |
| KSTAT_TYPE_NAMED, IPMP_KSTAT_MAX, 0, id); |
| if (ksp == NULL) |
| return (ENOMEM); |
| |
| ksp->ks_update = ipmp_grp_update_kstats; |
| ksp->ks_private = grp; |
| bcopy(ipmp_kstats, ksp->ks_data, sizeof (ipmp_kstats)); |
| |
| kstat_install(ksp); |
| grp->gr_ksp = ksp; |
| return (0); |
| } |
| |
| /* |
| * Update the IPMP kstats tracked by `ksp'; called by the kstats framework. |
| */ |
| static int |
| ipmp_grp_update_kstats(kstat_t *ksp, int rw) |
| { |
| uint_t i; |
| kstat_named_t *kn = KSTAT_NAMED_PTR(ksp); |
| ipmp_grp_t *grp = ksp->ks_private; |
| ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); |
| ipsq_t *ipsq, *grp_ipsq = grp->gr_phyint->phyint_ipsq; |
| phyint_t *phyi; |
| uint64_t phyi_kstats[IPMP_KSTAT_MAX]; |
| |
| if (rw == KSTAT_WRITE) |
| return (EACCES); |
| |
| /* |
| * Start with the group's baseline values. |
| */ |
| for (i = 0; i < IPMP_KSTAT_MAX; i++) { |
| if (kn[i].data_type == KSTAT_DATA_UINT32) { |
| kn[i].value.ui32 = grp->gr_kstats0[i]; |
| } else { |
| ASSERT(kn[i].data_type == KSTAT_DATA_UINT64); |
| kn[i].value.ui64 = grp->gr_kstats0[i]; |
| } |
| } |
| |
| /* |
| * Add in the stats of each phyint currently in the group. Since we |
| * don't directly track the phyints in a group, we cheat by walking |
| * the IPSQ set under ill_g_lock. (The IPSQ list cannot change while |
| * ill_g_lock is held.) |
| */ |
| rw_enter(&ipst->ips_ill_g_lock, RW_READER); |
| ipsq = grp_ipsq->ipsq_next; |
| for (; ipsq != grp_ipsq; ipsq = ipsq->ipsq_next) { |
| phyi = ipsq->ipsq_phyint; |
| |
| /* |
| * If a phyint in a group is being unplumbed, it's possible |
| * that ill_glist_delete() -> phyint_free() already freed the |
| * phyint (and set ipsq_phyint to NULL), but the unplumb |
| * operation has yet to complete (and thus ipsq_dq() has yet |
| * to remove the phyint's IPSQ from the group IPSQ's phyint |
| * list). We skip those phyints here (note that their kstats |
| * have already been added to gr_kstats0[]). |
| */ |
| if (phyi == NULL) |
| continue; |
| |
| ipmp_phyint_get_kstats(phyi, phyi_kstats); |
| |
| for (i = 0; i < IPMP_KSTAT_MAX; i++) { |
| phyi_kstats[i] -= phyi->phyint_kstats0[i]; |
| if (kn[i].data_type == KSTAT_DATA_UINT32) |
| kn[i].value.ui32 += phyi_kstats[i]; |
| else |
| kn[i].value.ui64 += phyi_kstats[i]; |
| } |
| } |
| |
| kn[IPMP_KSTAT_LINK_UP].value.ui32 = |
| (grp->gr_phyint->phyint_flags & PHYI_RUNNING) != 0; |
| |
| rw_exit(&ipst->ips_ill_g_lock); |
| return (0); |
| } |
| |
| /* |
| * Destroy IPMP kstat structures for `grp'. |
| */ |
| static void |
| ipmp_grp_destroy_kstats(ipmp_grp_t *grp) |
| { |
| netstackid_t id = IPMP_GRP_TO_IPST(grp)->ips_netstack->netstack_stackid; |
| |
| kstat_delete_netstack(grp->gr_ksp, id); |
| bzero(grp->gr_kstats0, sizeof (grp->gr_kstats0)); |
| grp->gr_ksp = NULL; |
| } |
| |
| /* |
| * Look up an IPMP group named `grname' on IP stack `ipst'. Return NULL if it |
| * does not exist. |
| */ |
| ipmp_grp_t * |
| ipmp_grp_lookup(const char *grname, ip_stack_t *ipst) |
| { |
| ipmp_grp_t *grp; |
| |
| ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock)); |
| |
| if (mod_hash_find(ipst->ips_ipmp_grp_hash, (mod_hash_key_t)grname, |
| (mod_hash_val_t *)&grp) == 0) |
| return (grp); |
| |
| return (NULL); |
| } |
| |
| /* |
| * Place information about group `grp' into `lifgr'. |
| */ |
| void |
| ipmp_grp_info(const ipmp_grp_t *grp, lifgroupinfo_t *lifgr) |
| { |
| ill_t *ill; |
| ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); |
| |
| ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock)); |
| |
| lifgr->gi_v4 = (grp->gr_v4 != NULL); |
| lifgr->gi_v6 = (grp->gr_v6 != NULL); |
| lifgr->gi_nv4 = grp->gr_nv4 + grp->gr_pendv4; |
| lifgr->gi_nv6 = grp->gr_nv6 + grp->gr_pendv6; |
| lifgr->gi_mactype = grp->gr_nif > 0 ? grp->gr_mactype : SUNW_DL_IPMP; |
| (void) strlcpy(lifgr->gi_grifname, grp->gr_ifname, LIFNAMSIZ); |
| lifgr->gi_m4ifname[0] = '\0'; |
| lifgr->gi_m6ifname[0] = '\0'; |
| lifgr->gi_bcifname[0] = '\0'; |
| |
| if (grp->gr_v4 != NULL && (ill = grp->gr_v4->ig_cast_ill) != NULL) { |
| (void) strlcpy(lifgr->gi_m4ifname, ill->ill_name, LIFNAMSIZ); |
| (void) strlcpy(lifgr->gi_bcifname, ill->ill_name, LIFNAMSIZ); |
| } |
| |
| if (grp->gr_v6 != NULL && (ill = grp->gr_v6->ig_cast_ill) != NULL) |
| (void) strlcpy(lifgr->gi_m6ifname, ill->ill_name, LIFNAMSIZ); |
| } |
| |
| /* |
| * Insert `grp' into the hash using the reserved hash entry `mh'. |
| * Caller must ensure `grp' is not yet in the hash. |
| */ |
| static void |
| ipmp_grp_insert(ipmp_grp_t *grp, mod_hash_hndl_t mh) |
| { |
| int err; |
| ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); |
| |
| ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock)); |
| |
| /* |
| * Since grp->gr_name will exist at least as long as `grp' is in the |
| * hash, we use it directly as the key. |
| */ |
| err = mod_hash_insert_reserve(ipst->ips_ipmp_grp_hash, |
| (mod_hash_key_t)grp->gr_name, (mod_hash_val_t)grp, mh); |
| if (err != 0) { |
| /* |
| * This should never happen since `mh' was preallocated. |
| */ |
| panic("cannot insert IPMP group \"%s\" (err %d)", |
| grp->gr_name, err); |
| } |
| } |
| |
| /* |
| * Remove `grp' from the hash. Caller must ensure `grp' is in it. |
| */ |
| static void |
| ipmp_grp_remove(ipmp_grp_t *grp) |
| { |
| int err; |
| mod_hash_val_t val; |
| mod_hash_key_t key = (mod_hash_key_t)grp->gr_name; |
| ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); |
| |
| ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock)); |
| |
| err = mod_hash_remove(ipst->ips_ipmp_grp_hash, key, &val); |
| if (err != 0 || val != grp) { |
| panic("cannot remove IPMP group \"%s\" (err %d)", |
| grp->gr_name, err); |
| } |
| } |
| |
| /* |
| * Attempt to rename `grp' to new name `grname'. Return an errno if the new |
| * group name already exists or is invalid, or if there isn't enough memory. |
| */ |
| int |
| ipmp_grp_rename(ipmp_grp_t *grp, const char *grname) |
| { |
| mod_hash_hndl_t mh; |
| ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); |
| |
| ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock)); |
| |
| if (grname[0] == '\0') |
| return (EINVAL); |
| |
| if (mod_hash_find(ipst->ips_ipmp_grp_hash, (mod_hash_key_t)grname, |
| (mod_hash_val_t *)&grp) != MH_ERR_NOTFOUND) |
| return (EEXIST); |
| |
| /* |
| * Before we remove the group from the hash, ensure we'll be able to |
| * re-insert it by reserving space. |
| */ |
| if (mod_hash_reserve_nosleep(ipst->ips_ipmp_grp_hash, &mh) != 0) |
| return (ENOMEM); |
| |
| ipmp_grp_remove(grp); |
| (void) strlcpy(grp->gr_name, grname, sizeof (grp->gr_name)); |
| ipmp_grp_insert(grp, mh); |
| |
| return (0); |
| } |
| |
| /* |
| * Destroy `grp' and remove it from the hash. Caller must ensure `grp' is in |
| * the hash, and that there are no interfaces on it. |
| */ |
| void |
| ipmp_grp_destroy(ipmp_grp_t *grp) |
| { |
| ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); |
| |
| ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock)); |
| |
| /* |
| * If there are still interfaces using this group, panic before things |
| * go really off the rails. |
| */ |
| if (grp->gr_nif != 0) |
| panic("cannot destroy IPMP group \"%s\": in use", grp->gr_name); |
| |
| ipmp_grp_remove(grp); |
| ipmp_grp_destroy_kstats(grp); |
| |
| ASSERT(grp->gr_v4 == NULL); |
| ASSERT(grp->gr_v6 == NULL); |
| ASSERT(grp->gr_nv4 == 0); |
| ASSERT(grp->gr_nv6 == 0); |
| ASSERT(grp->gr_nactif == 0); |
| ASSERT(grp->gr_linkdownmp == NULL); |
| grp->gr_phyint = NULL; |
| |
| kmem_free(grp, sizeof (ipmp_grp_t)); |
| } |
| |
| /* |
| * Check whether `ill' is suitable for inclusion into `grp', and return an |
| * errno describing the problem (if any). NOTE: many of these errno values |
| * are interpreted by ifconfig, which will take corrective action and retry |
| * the SIOCSLIFGROUPNAME, so please exercise care when changing them. |
| */ |
| static int |
| ipmp_grp_vet_ill(ipmp_grp_t *grp, ill_t *ill) |
| { |
| ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); |
| |
| ASSERT(IAM_WRITER_ILL(ill)); |
| ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock)); |
| |
| /* |
| * To sidestep complicated address migration logic in the kernel and |
| * to force the kernel's all-hosts multicast memberships to be blown |
| * away, all addresses that had been brought up must be brought back |
| * down prior to adding an interface to a group. (This includes |
| * addresses currently down due to DAD.) Once the interface has been |
| * added to the group, its addresses can then be brought back up, at |
| * which point they will be moved to the IPMP meta-interface. |
| * NOTE: we do this before ill_appaddr_cnt() since bringing down the |
| * link-local causes in.ndpd to remove its ADDRCONF'd addresses. |
| */ |
| if (ill->ill_ipif_up_count + ill->ill_ipif_dup_count > 0) |
| return (EADDRINUSE); |
| |
| /* |
| * To avoid confusing applications by changing addresses that are |
| * under their control, all such control must be removed prior to |
| * adding an interface into a group. |
| */ |
| if (ill_appaddr_cnt(ill) != 0) |
| return (EADDRNOTAVAIL); |
| |
| /* |
| * Since PTP addresses do not share the same broadcast domain, they |
| * are not allowed to be in an IPMP group. |
| */ |
| if (ill_ptpaddr_cnt(ill) != 0) |
| return (EINVAL); |
| |
| /* |
| * An ill must support multicast to be allowed into a group. |
| */ |
| if (!(ill->ill_flags & ILLF_MULTICAST)) |
| return (ENOTSUP); |
| |
| /* |
| * An ill must strictly be using ARP and/or ND for address |
| * resolution for it to be allowed into a group. |
| */ |
| if (ill->ill_flags & (ILLF_NONUD | ILLF_NOARP)) |
| return (ENOTSUP); |
| |
| /* |
| * An ill cannot also be using usesrc groups. (Although usesrc uses |
| * ill_g_usesrc_lock, we don't need to grab it since usesrc also does |
| * all its modifications as writer.) |
| */ |
| if (IS_USESRC_ILL(ill) || IS_USESRC_CLI_ILL(ill)) |
| return (ENOTSUP); |
| |
| /* |
| * All ills in a group must be the same mactype. |
| */ |
| if (grp->gr_nif > 0 && grp->gr_mactype != ill->ill_mactype) |
| return (EINVAL); |
| |
| return (0); |
| } |
| |
| /* |
| * Check whether `phyi' is suitable for inclusion into `grp', and return an |
| * errno describing the problem (if any). See comment above ipmp_grp_vet_ill() |
| * regarding errno values. |
| */ |
| int |
| ipmp_grp_vet_phyint(ipmp_grp_t *grp, phyint_t *phyi) |
| { |
| int err = 0; |
| ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); |
| |
| ASSERT(IAM_WRITER_IPSQ(phyi->phyint_ipsq)); |
| ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock)); |
| |
| /* |
| * An interface cannot have address families plumbed that are not |
| * configured in the group. |
| */ |
| if (phyi->phyint_illv4 != NULL && grp->gr_v4 == NULL || |
| phyi->phyint_illv6 != NULL && grp->gr_v6 == NULL) |
| return (EAFNOSUPPORT); |
| |
| if (phyi->phyint_illv4 != NULL) |
| err = ipmp_grp_vet_ill(grp, phyi->phyint_illv4); |
| if (err == 0 && phyi->phyint_illv6 != NULL) |
| err = ipmp_grp_vet_ill(grp, phyi->phyint_illv6); |
| |
| return (err); |
| } |
| |
| /* |
| * Create a new illgrp on IPMP meta-interface `ill'. |
| */ |
| ipmp_illgrp_t * |
| ipmp_illgrp_create(ill_t *ill) |
| { |
| uint_t mtu = ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU; |
| ipmp_illgrp_t *illg; |
| |
| ASSERT(IAM_WRITER_ILL(ill)); |
| ASSERT(IS_IPMP(ill)); |
| ASSERT(ill->ill_grp == NULL); |
| |
| if ((illg = kmem_zalloc(sizeof (ipmp_illgrp_t), KM_NOSLEEP)) == NULL) |
| return (NULL); |
| |
| list_create(&illg->ig_if, sizeof (ill_t), offsetof(ill_t, ill_grpnode)); |
| list_create(&illg->ig_actif, sizeof (ill_t), |
| offsetof(ill_t, ill_actnode)); |
| list_create(&illg->ig_arpent, sizeof (ipmp_arpent_t), |
| offsetof(ipmp_arpent_t, ia_node)); |
| |
| illg->ig_ipmp_ill = ill; |
| ill->ill_grp = illg; |
| ipmp_illgrp_set_mtu(illg, mtu, mtu); |
| |
| return (illg); |
| } |
| |
| /* |
| * Destroy illgrp `illg', and disconnect it from its IPMP meta-interface. |
| */ |
| void |
| ipmp_illgrp_destroy(ipmp_illgrp_t *illg) |
| { |
| ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); |
| ASSERT(IS_IPMP(illg->ig_ipmp_ill)); |
| |
| /* |
| * Verify `illg' is empty. |
| */ |
| ASSERT(illg->ig_next_ill == NULL); |
| ASSERT(illg->ig_cast_ill == NULL); |
| ASSERT(list_is_empty(&illg->ig_arpent)); |
| ASSERT(list_is_empty(&illg->ig_if)); |
| ASSERT(list_is_empty(&illg->ig_actif)); |
| ASSERT(illg->ig_nactif == 0); |
| |
| /* |
| * Destroy `illg'. |
| */ |
| illg->ig_ipmp_ill->ill_grp = NULL; |
| illg->ig_ipmp_ill = NULL; |
| list_destroy(&illg->ig_if); |
| list_destroy(&illg->ig_actif); |
| list_destroy(&illg->ig_arpent); |
| kmem_free(illg, sizeof (ipmp_illgrp_t)); |
| } |
| |
| /* |
| * Add `ipif' to the pool of usable data addresses on `illg' and attempt to |
| * bind it to an underlying ill, while keeping an even address distribution. |
| * If the bind is successful, return a pointer to the bound ill. |
| */ |
| ill_t * |
| ipmp_illgrp_add_ipif(ipmp_illgrp_t *illg, ipif_t *ipif) |
| { |
| ill_t *minill; |
| ipmp_arpent_t *entp; |
| |
| ASSERT(IAM_WRITER_IPIF(ipif)); |
| ASSERT(ipmp_ipif_is_dataaddr(ipif)); |
| |
| /* |
| * IPMP data address mappings are internally managed by IP itself, so |
| * delete any existing ARP entries associated with the address. |
| */ |
| if (!ipif->ipif_isv6) { |
| entp = ipmp_illgrp_lookup_arpent(illg, &ipif->ipif_lcl_addr); |
| if (entp != NULL) |
| ipmp_illgrp_destroy_arpent(illg, entp); |
| } |
| |
| if ((minill = ipmp_illgrp_min_ill(illg)) != NULL) |
| ipmp_ill_bind_ipif(minill, ipif, Res_act_none); |
| |
| return (ipif->ipif_bound ? ipif->ipif_bound_ill : NULL); |
| } |
| |
| /* |
| * Delete `ipif' from the pool of usable data addresses on `illg'. If it's |
| * bound, unbind it from the underlying ill while keeping an even address |
| * distribution. |
| */ |
| void |
| ipmp_illgrp_del_ipif(ipmp_illgrp_t *illg, ipif_t *ipif) |
| { |
| ill_t *maxill, *boundill = ipif->ipif_bound_ill; |
| |
| ASSERT(IAM_WRITER_IPIF(ipif)); |
| |
| if (boundill != NULL) { |
| (void) ipmp_ill_unbind_ipif(boundill, ipif, B_FALSE); |
| |
| maxill = ipmp_illgrp_max_ill(illg); |
| if (maxill->ill_bound_cnt > boundill->ill_bound_cnt + 1) { |
| ipif = ipmp_ill_unbind_ipif(maxill, NULL, B_TRUE); |
| ipmp_ill_bind_ipif(boundill, ipif, Res_act_rebind); |
| } |
| } |
| } |
| |
| /* |
| * Return the active ill with the greatest number of data addresses in `illg'. |
| */ |
| static ill_t * |
| ipmp_illgrp_max_ill(ipmp_illgrp_t *illg) |
| { |
| ill_t *ill, *bestill = NULL; |
| |
| ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); |
| |
| ill = list_head(&illg->ig_actif); |
| for (; ill != NULL; ill = list_next(&illg->ig_actif, ill)) { |
| if (bestill == NULL || |
| ill->ill_bound_cnt > bestill->ill_bound_cnt) { |
| bestill = ill; |
| } |
| } |
| return (bestill); |
| } |
| |
| /* |
| * Return the active ill with the fewest number of data addresses in `illg'. |
| */ |
| static ill_t * |
| ipmp_illgrp_min_ill(ipmp_illgrp_t *illg) |
| { |
| ill_t *ill, *bestill = NULL; |
| |
| ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); |
| |
| ill = list_head(&illg->ig_actif); |
| for (; ill != NULL; ill = list_next(&illg->ig_actif, ill)) { |
| if (bestill == NULL || |
| ill->ill_bound_cnt < bestill->ill_bound_cnt) { |
| if (ill->ill_bound_cnt == 0) |
| return (ill); /* can't get better */ |
| bestill = ill; |
| } |
| } |
| return (bestill); |
| } |
| |
| /* |
| * Return a pointer to IPMP meta-interface for `illg' (which must exist). |
| * Since ig_ipmp_ill never changes for a given illg, no locks are needed. |
| */ |
| ill_t * |
| ipmp_illgrp_ipmp_ill(ipmp_illgrp_t *illg) |
| { |
| return (illg->ig_ipmp_ill); |
| } |
| |
| /* |
| * Return a pointer to the next available underlying ill in `illg', or NULL if |
| * one doesn't exist. Caller must be inside the IPSQ. |
| */ |
| ill_t * |
| ipmp_illgrp_next_ill(ipmp_illgrp_t *illg) |
| { |
| ill_t *ill; |
| ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); |
| |
| ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); |
| |
| rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); |
| if ((ill = illg->ig_next_ill) != NULL) { |
| illg->ig_next_ill = list_next(&illg->ig_actif, ill); |
| if (illg->ig_next_ill == NULL) |
| illg->ig_next_ill = list_head(&illg->ig_actif); |
| } |
| rw_exit(&ipst->ips_ipmp_lock); |
| |
| return (ill); |
| } |
| |
| /* |
| * Return a held pointer to the next available underlying ill in `illg', or |
| * NULL if one doesn't exist. Caller need not be inside the IPSQ. |
| */ |
| ill_t * |
| ipmp_illgrp_hold_next_ill(ipmp_illgrp_t *illg) |
| { |
| ill_t *ill; |
| uint_t i; |
| ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); |
| |
| rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); |
| for (i = 0; i < illg->ig_nactif; i++) { |
| ill = illg->ig_next_ill; |
| illg->ig_next_ill = list_next(&illg->ig_actif, ill); |
| if (illg->ig_next_ill == NULL) |
| illg->ig_next_ill = list_head(&illg->ig_actif); |
| |
| if (ill_check_and_refhold(ill)) { |
| rw_exit(&ipst->ips_ipmp_lock); |
| return (ill); |
| } |
| } |
| rw_exit(&ipst->ips_ipmp_lock); |
| |
| return (NULL); |
| } |
| |
| /* |
| * Return a held pointer to the nominated multicast ill in `illg', or NULL if |
| * one doesn't exist. Caller need not be inside the IPSQ. |
| */ |
| ill_t * |
| ipmp_illgrp_hold_cast_ill(ipmp_illgrp_t *illg) |
| { |
| ill_t *castill; |
| ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); |
| |
| rw_enter(&ipst->ips_ipmp_lock, RW_READER); |
| castill = illg->ig_cast_ill; |
| if (castill != NULL && ill_check_and_refhold(castill)) { |
| rw_exit(&ipst->ips_ipmp_lock); |
| return (castill); |
| } |
| rw_exit(&ipst->ips_ipmp_lock); |
| return (NULL); |
| } |
| |
| /* |
| * Set the nominated cast ill on `illg' to `castill'. If `castill' is NULL, |
| * any existing nomination is removed. Caller must be inside the IPSQ. |
| */ |
| static void |
| ipmp_illgrp_set_cast(ipmp_illgrp_t *illg, ill_t *castill) |
| { |
| ill_t *ocastill = illg->ig_cast_ill; |
| ill_t *ipmp_ill = illg->ig_ipmp_ill; |
| ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); |
| |
| ASSERT(IAM_WRITER_ILL(ipmp_ill)); |
| |
| /* |
| * Disable old nominated ill (if any). |
| */ |
| if (ocastill != NULL) { |
| DTRACE_PROBE2(ipmp__illgrp__cast__disable, ipmp_illgrp_t *, |
| illg, ill_t *, ocastill); |
| ASSERT(ocastill->ill_nom_cast); |
| ocastill->ill_nom_cast = B_FALSE; |
| /* |
| * If the IPMP meta-interface is down, we never did the join, |
| * so we must not try to leave. |
| */ |
| if (ipmp_ill->ill_dl_up) |
| ill_leave_multicast(ipmp_ill); |
| |
| /* |
| * Delete any NCEs tied to the old nomination. We must do this |
| * last since ill_leave_multicast() may trigger IREs to be |
| * built using ig_cast_ill. |
| */ |
| ncec_walk(ocastill, ipmp_ncec_delete_nonlocal, ocastill, |
| ocastill->ill_ipst); |
| } |
| |
| /* |
| * Set new nomination. |
| */ |
| rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); |
| illg->ig_cast_ill = castill; |
| rw_exit(&ipst->ips_ipmp_lock); |
| |
| /* |
| * Enable new nominated ill (if any). |
| */ |
| if (castill != NULL) { |
| DTRACE_PROBE2(ipmp__illgrp__cast__enable, ipmp_illgrp_t *, |
| illg, ill_t *, castill); |
| ASSERT(!castill->ill_nom_cast); |
| castill->ill_nom_cast = B_TRUE; |
| /* |
| * If the IPMP meta-interface is down, the attempt to recover |
| * will silently fail but ill_need_recover_multicast will be |
| * erroneously cleared -- so check first. |
| */ |
| if (ipmp_ill->ill_dl_up) |
| ill_recover_multicast(ipmp_ill); |
| } |
| } |
| |
| /* |
| * Create an IPMP ARP entry and add it to the set tracked on `illg'. If an |
| * entry for the same IP address already exists, destroy it first. Return the |
| * created IPMP ARP entry, or NULL on failure. |
| */ |
| ipmp_arpent_t * |
| ipmp_illgrp_create_arpent(ipmp_illgrp_t *illg, boolean_t proxyarp, |
| ipaddr_t ipaddr, uchar_t *lladdr, size_t lladdr_len, uint16_t flags) |
| { |
| ipmp_arpent_t *entp, *oentp; |
| |
| ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); |
| |
| if ((entp = kmem_alloc(sizeof (ipmp_arpent_t) + lladdr_len, |
| KM_NOSLEEP)) == NULL) |
| return (NULL); |
| |
| /* |
| * Delete any existing ARP entry for this address. |
| */ |
| if ((oentp = ipmp_illgrp_lookup_arpent(illg, &entp->ia_ipaddr)) != NULL) |
| ipmp_illgrp_destroy_arpent(illg, oentp); |
| |
| /* |
| * Prepend the new entry. |
| */ |
| entp->ia_ipaddr = ipaddr; |
| entp->ia_flags = flags; |
| entp->ia_lladdr_len = lladdr_len; |
| entp->ia_lladdr = (uchar_t *)&entp[1]; |
| bcopy(lladdr, entp->ia_lladdr, lladdr_len); |
| entp->ia_proxyarp = proxyarp; |
| entp->ia_notified = B_TRUE; |
| list_insert_head(&illg->ig_arpent, entp); |
| return (entp); |
| } |
| |
| /* |
| * Remove IPMP ARP entry `entp' from the set tracked on `illg' and destroy it. |
| */ |
| void |
| ipmp_illgrp_destroy_arpent(ipmp_illgrp_t *illg, ipmp_arpent_t *entp) |
| { |
| ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); |
| |
| list_remove(&illg->ig_arpent, entp); |
| kmem_free(entp, sizeof (ipmp_arpent_t) + entp->ia_lladdr_len); |
| } |
| |
| /* |
| * Mark that ARP has been notified about the IP address on `entp'; `illg' is |
| * taken as a debugging aid for DTrace FBT probes. |
| */ |
| /* ARGSUSED */ |
| void |
| ipmp_illgrp_mark_arpent(ipmp_illgrp_t *illg, ipmp_arpent_t *entp) |
| { |
| entp->ia_notified = B_TRUE; |
| } |
| |
| /* |
| * Look up the IPMP ARP entry for IP address `addrp' on `illg'; if `addrp' is |
| * NULL, any IPMP ARP entry is requested. Return NULL if it does not exist. |
| */ |
| ipmp_arpent_t * |
| ipmp_illgrp_lookup_arpent(ipmp_illgrp_t *illg, ipaddr_t *addrp) |
| { |
| ipmp_arpent_t *entp = list_head(&illg->ig_arpent); |
| |
| ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); |
| |
| if (addrp == NULL) |
| return (entp); |
| |
| for (; entp != NULL; entp = list_next(&illg->ig_arpent, entp)) |
| if (entp->ia_ipaddr == *addrp) |
| break; |
| return (entp); |
| } |
| |
| /* |
| * Refresh ARP entries on `illg' to be distributed across its active |
| * interfaces. Entries that cannot be refreshed (e.g., because there are no |
| * active interfaces) are marked so that subsequent calls can try again. |
| */ |
| void |
| ipmp_illgrp_refresh_arpent(ipmp_illgrp_t *illg) |
| { |
| ill_t *ill, *ipmp_ill = illg->ig_ipmp_ill; |
| uint_t paddrlen = ipmp_ill->ill_phys_addr_length; |
| ipmp_arpent_t *entp; |
| ncec_t *ncec; |
| nce_t *nce; |
| |
| ASSERT(IAM_WRITER_ILL(ipmp_ill)); |
| ASSERT(!ipmp_ill->ill_isv6); |
| |
| ill = list_head(&illg->ig_actif); |
| entp = list_head(&illg->ig_arpent); |
| for (; entp != NULL; entp = list_next(&illg->ig_arpent, entp)) { |
| if (ill == NULL || ipmp_ill->ill_ipif_up_count == 0) { |
| entp->ia_notified = B_FALSE; |
| continue; |
| } |
| |
| ASSERT(paddrlen == ill->ill_phys_addr_length); |
| |
| /* |
| * If this is a proxy ARP entry, we can skip notifying ARP if |
| * the entry is already up-to-date. If it has changed, we |
| * update the entry's hardware address before notifying ARP. |
| */ |
| if (entp->ia_proxyarp) { |
| if (bcmp(ill->ill_phys_addr, entp->ia_lladdr, |
| paddrlen) == 0 && entp->ia_notified) |
| continue; |
| bcopy(ill->ill_phys_addr, entp->ia_lladdr, paddrlen); |
| } |
| |
| (void) nce_lookup_then_add_v4(ipmp_ill, entp->ia_lladdr, |
| paddrlen, &entp->ia_ipaddr, entp->ia_flags, ND_UNCHANGED, |
| &nce); |
| if (nce == NULL || !entp->ia_proxyarp) { |
| if (nce != NULL) |
| nce_refrele(nce); |
| continue; |
| } |
| ncec = nce->nce_common; |
| mutex_enter(&ncec->ncec_lock); |
| nce_update(ncec, ND_UNCHANGED, ill->ill_phys_addr); |
| mutex_exit(&ncec->ncec_lock); |
| nce_refrele(nce); |
| ipmp_illgrp_mark_arpent(illg, entp); |
| |
| if ((ill = list_next(&illg->ig_actif, ill)) == NULL) |
| ill = list_head(&illg->ig_actif); |
| } |
| } |
| |
| /* |
| * Return an interface in `illg' with the specified `physaddr', or NULL if one |
| * doesn't exist. Caller must hold ill_g_lock if it's not inside the IPSQ. |
| */ |
| ill_t * |
| ipmp_illgrp_find_ill(ipmp_illgrp_t *illg, uchar_t *physaddr, uint_t paddrlen) |
| { |
| ill_t *ill; |
| ill_t *ipmp_ill = illg->ig_ipmp_ill; |
| ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); |
| |
| ASSERT(IAM_WRITER_ILL(ipmp_ill) || RW_LOCK_HELD(&ipst->ips_ill_g_lock)); |
| |
| ill = list_head(&illg->ig_if); |
| for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) { |
| if (ill->ill_phys_addr_length == paddrlen && |
| bcmp(ill->ill_phys_addr, physaddr, paddrlen) == 0) |
| return (ill); |
| } |
| return (NULL); |
| } |
| |
| /* |
| * Asynchronously update the MTU for an IPMP ill by injecting a DL_NOTIFY_IND. |
| * Caller must be inside the IPSQ unless this is initialization. |
| */ |
| static void |
| ipmp_illgrp_set_mtu(ipmp_illgrp_t *illg, uint_t mtu, uint_t mc_mtu) |
| { |
| ill_t *ill = illg->ig_ipmp_ill; |
| mblk_t *mp; |
| |
| ASSERT(illg->ig_mtu == 0 || IAM_WRITER_ILL(ill)); |
| |
| /* |
| * If allocation fails, we have bigger problems than MTU. |
| */ |
| if ((mp = ip_dlnotify_alloc2(DL_NOTE_SDU_SIZE2, mtu, mc_mtu)) != NULL) { |
| illg->ig_mtu = mtu; |
| illg->ig_mc_mtu = mc_mtu; |
| put(ill->ill_rq, mp); |
| } |
| } |
| |
| /* |
| * Recalculate the IPMP group MTU for `illg', and update its associated IPMP |
| * ill MTU if necessary. |
| */ |
| void |
| ipmp_illgrp_refresh_mtu(ipmp_illgrp_t *illg) |
| { |
| ill_t *ill; |
| ill_t *ipmp_ill = illg->ig_ipmp_ill; |
| uint_t mtu = 0; |
| uint_t mc_mtu = 0; |
| |
| ASSERT(IAM_WRITER_ILL(ipmp_ill)); |
| |
| /* |
| * Since ill_mtu can only change under ill_lock, we hold ill_lock |
| * for each ill as we iterate through the list. Any changes to the |
| * ill_mtu will also trigger an update, so even if we missed it |
| * this time around, the update will catch it. |
| */ |
| ill = list_head(&illg->ig_if); |
| for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) { |
| mutex_enter(&ill->ill_lock); |
| if (mtu == 0 || ill->ill_mtu < mtu) |
| mtu = ill->ill_mtu; |
| if (mc_mtu == 0 || ill->ill_mc_mtu < mc_mtu) |
| mc_mtu = ill->ill_mc_mtu; |
| mutex_exit(&ill->ill_lock); |
| } |
| |
| /* |
| * MTU must be at least the minimum MTU. |
| */ |
| mtu = MAX(mtu, ipmp_ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU); |
| mc_mtu = MAX(mc_mtu, ipmp_ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU); |
| if (illg->ig_mtu != mtu || illg->ig_mc_mtu != mc_mtu) |
| ipmp_illgrp_set_mtu(illg, mtu, mc_mtu); |
| } |
| |
| /* |
| * Link illgrp `illg' to IPMP group `grp'. To simplify the caller, silently |
| * allow the same link to be established more than once. |
| */ |
| void |
| ipmp_illgrp_link_grp(ipmp_illgrp_t *illg, ipmp_grp_t *grp) |
| { |
| ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); |
| |
| ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock)); |
| |
| if (illg->ig_ipmp_ill->ill_isv6) { |
| ASSERT(grp->gr_v6 == NULL || grp->gr_v6 == illg); |
| grp->gr_v6 = illg; |
| } else { |
| ASSERT(grp->gr_v4 == NULL || grp->gr_v4 == illg); |
| grp->gr_v4 = illg; |
| } |
| } |
| |
| /* |
| * Unlink illgrp `illg' from its IPMP group. Return an errno if the illgrp |
| * cannot be unlinked (e.g., because there are still interfaces using it). |
| */ |
| int |
| ipmp_illgrp_unlink_grp(ipmp_illgrp_t *illg) |
| { |
| ipmp_grp_t *grp = illg->ig_ipmp_ill->ill_phyint->phyint_grp; |
| ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); |
| |
| ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock)); |
| |
| if (illg->ig_ipmp_ill->ill_isv6) { |
| if (grp->gr_nv6 + grp->gr_pendv6 != 0) |
| return (EBUSY); |
| grp->gr_v6 = NULL; |
| } else { |
| if (grp->gr_nv4 + grp->gr_pendv4 != 0) |
| return (EBUSY); |
| grp->gr_v4 = NULL; |
| } |
| return (0); |
| } |
| |
| /* |
| * Place `ill' into `illg', and rebalance the data addresses on `illg' |
| * to be spread evenly across the ills now in it. Also, adjust the IPMP |
| * ill as necessary to account for `ill' (e.g., MTU). |
| */ |
| void |
| ipmp_ill_join_illgrp(ill_t *ill, ipmp_illgrp_t *illg) |
| { |
| ill_t *ipmp_ill; |
| ipif_t *ipif; |
| ip_stack_t *ipst = ill->ill_ipst; |
| |
| /* IS_UNDER_IPMP() requires ill_grp to be non-NULL */ |
| ASSERT(!IS_IPMP(ill) && ill->ill_phyint->phyint_grp != NULL); |
| ASSERT(IAM_WRITER_ILL(ill)); |
| ASSERT(ill->ill_grp == NULL); |
| |
| ipmp_ill = illg->ig_ipmp_ill; |
| |
| /* |
| * Account for `ill' joining the illgrp. |
| */ |
| rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); |
| if (ill->ill_isv6) |
| ill->ill_phyint->phyint_grp->gr_nv6++; |
| else |
| ill->ill_phyint->phyint_grp->gr_nv4++; |
| rw_exit(&ipst->ips_ipmp_lock); |
| |
| /* |
| * Ensure the ILLF_ROUTER flag remains consistent across the group. |
| */ |
| mutex_enter(&ill->ill_lock); |
| if (ipmp_ill->ill_flags & ILLF_ROUTER) |
| ill->ill_flags |= ILLF_ROUTER; |
| else |
| ill->ill_flags &= ~ILLF_ROUTER; |
| mutex_exit(&ill->ill_lock); |
| |
| /* |
| * Blow away all multicast memberships that currently exist on `ill'. |
| * This may seem odd, but it's consistent with the application view |
| * that `ill' no longer exists (e.g., due to ipmp_ill_rtsaddrmsg()). |
| * The ill_grp_pending bit prevents multicast group joins after |
| * update_conn_ill() and before ill_grp assignment. |
| */ |
| mutex_enter(&ill->ill_mcast_serializer); |
| ill->ill_grp_pending = 1; |
| mutex_exit(&ill->ill_mcast_serializer); |
| update_conn_ill(ill, ill->ill_ipst); |
| if (ill->ill_isv6) { |
| reset_mrt_ill(ill); |
| } else { |
| ipif = ill->ill_ipif; |
| for (; ipif != NULL; ipif = ipif->ipif_next) { |
| reset_mrt_vif_ipif(ipif); |
| } |
| } |
| ip_purge_allmulti(ill); |
| |
| /* |
| * Borrow the first ill's ill_phys_addr_length value for the illgrp's |
| * physical address length. All other ills must have the same value, |
| * since they are required to all be the same mactype. Also update |
| * the IPMP ill's MTU and CoS marking, if necessary. |
| */ |
| if (list_is_empty(&illg->ig_if)) { |
| ASSERT(ipmp_ill->ill_phys_addr_length == 0); |
| /* |
| * NOTE: we leave ill_phys_addr NULL since the IPMP group |
| * doesn't have a physical address. This means that code must |
| * not assume that ill_phys_addr is non-NULL just because |
| * ill_phys_addr_length is non-zero. Likewise for ill_nd_lla. |
| */ |
| ipmp_ill->ill_phys_addr_length = ill->ill_phys_addr_length; |
| ipmp_ill->ill_nd_lla_len = ill->ill_phys_addr_length; |
| ipmp_ill->ill_type = ill->ill_type; |
| |
| if (ill->ill_flags & ILLF_COS_ENABLED) { |
| mutex_enter(&ipmp_ill->ill_lock); |
| ipmp_ill->ill_flags |= ILLF_COS_ENABLED; |
| mutex_exit(&ipmp_ill->ill_lock); |
| } |
| ipmp_illgrp_set_mtu(illg, ill->ill_mtu, ill->ill_mc_mtu); |
| } else { |
| ASSERT(ipmp_ill->ill_phys_addr_length == |
| ill->ill_phys_addr_length); |
| ASSERT(ipmp_ill->ill_type == ill->ill_type); |
| |
| if (!(ill->ill_flags & ILLF_COS_ENABLED)) { |
| mutex_enter(&ipmp_ill->ill_lock); |
| ipmp_ill->ill_flags &= ~ILLF_COS_ENABLED; |
| mutex_exit(&ipmp_ill->ill_lock); |
| } |
| if (illg->ig_mtu > ill->ill_mtu || |
| illg->ig_mc_mtu > ill->ill_mc_mtu) { |
| ipmp_illgrp_set_mtu(illg, ill->ill_mtu, |
| ill->ill_mc_mtu); |
| } |
| } |
| |
| rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); |
| list_insert_tail(&illg->ig_if, ill); |
| ill->ill_grp = illg; |
| rw_exit(&ipst->ips_ill_g_lock); |
| |
| mutex_enter(&ill->ill_mcast_serializer); |
| ill->ill_grp_pending = 0; |
| mutex_exit(&ill->ill_mcast_serializer); |
| |
| /* |
| * Hide the IREs on `ill' so that we don't accidentally find them when |
| * sending data traffic. |
| */ |
| ire_walk_ill(MATCH_IRE_ILL, 0, ipmp_ill_ire_mark_testhidden, ill, ill); |
| |
| ipmp_ill_refresh_active(ill); |
| } |
| |
| /* |
| * Remove `ill' from its illgrp, and rebalance the data addresses in that |
| * illgrp to be spread evenly across the remaining ills. Also, adjust the |
| * IPMP ill as necessary now that `ill' is removed (e.g., MTU). |
| */ |
| void |
| ipmp_ill_leave_illgrp(ill_t *ill) |
| { |
| ill_t *ipmp_ill; |
| ipif_t *ipif; |
| ipmp_arpent_t *entp; |
| ipmp_illgrp_t *illg = ill->ill_grp; |
| ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); |
| |
| ASSERT(IS_UNDER_IPMP(ill)); |
| ASSERT(IAM_WRITER_ILL(ill)); |
| ASSERT(illg != NULL); |
| |
| ipmp_ill = illg->ig_ipmp_ill; |
| |
| /* |
| * Cancel IPMP-specific ill timeouts. |
| */ |
| (void) untimeout(ill->ill_refresh_tid); |
| |
| /* |
| * Expose any previously-hidden IREs on `ill'. |
| */ |
| ire_walk_ill(MATCH_IRE_ILL, 0, ipmp_ill_ire_clear_testhidden, ill, ill); |
| |
| /* |
| * Ensure the multicast state for each ipif on `ill' is down so that |
| * our ipif_multicast_up() (once `ill' leaves the group) will rejoin |
| * all eligible groups. |
| */ |
| for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) |
| if (ipif->ipif_flags & IPIF_UP) |
| ipif_multicast_down(ipif); |
| |
| /* |
| * Account for `ill' leaving the illgrp. |
| */ |
| rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); |
| if (ill->ill_isv6) |
| ill->ill_phyint->phyint_grp->gr_nv6--; |
| else |
| ill->ill_phyint->phyint_grp->gr_nv4--; |
| rw_exit(&ipst->ips_ipmp_lock); |
| |
| /* |
| * Pull `ill' out of the interface lists. |
| */ |
| if (list_link_active(&ill->ill_actnode)) |
| ipmp_ill_deactivate(ill); |
| rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); |
| list_remove(&illg->ig_if, ill); |
| ill->ill_grp = NULL; |
| rw_exit(&ipst->ips_ill_g_lock); |
| |
| /* |
| * Re-establish multicast memberships that were previously being |
| * handled by the IPMP meta-interface. |
| */ |
| for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) |
| if (ipif->ipif_flags & IPIF_UP) |
| ipif_multicast_up(ipif); |
| |
| /* |
| * Refresh the group MTU based on the new interface list. |
| */ |
| ipmp_illgrp_refresh_mtu(illg); |
| |
| if (list_is_empty(&illg->ig_if)) { |
| /* |
| * No ills left in the illgrp; we no longer have a physical |
| * address length, nor can we support ARP, CoS, or anything |
| * else that depends on knowing the link layer type. |
| */ |
| while ((entp = ipmp_illgrp_lookup_arpent(illg, NULL)) != NULL) |
| ipmp_illgrp_destroy_arpent(illg, entp); |
| |
| ipmp_ill->ill_phys_addr_length = 0; |
| ipmp_ill->ill_nd_lla_len = 0; |
| ipmp_ill->ill_type = IFT_OTHER; |
| mutex_enter(&ipmp_ill->ill_lock); |
| ipmp_ill->ill_flags &= ~ILLF_COS_ENABLED; |
| mutex_exit(&ipmp_ill->ill_lock); |
| } else { |
| /* |
| * If `ill' didn't support CoS, see if it can now be enabled. |
| */ |
| if (!(ill->ill_flags & ILLF_COS_ENABLED)) { |
| ASSERT(!(ipmp_ill->ill_flags & ILLF_COS_ENABLED)); |
| |
| ill = list_head(&illg->ig_if); |
| do { |
| if (!(ill->ill_flags & ILLF_COS_ENABLED)) |
| break; |
| } while ((ill = list_next(&illg->ig_if, ill)) != NULL); |
| |
| if (ill == NULL) { |
| mutex_enter(&ipmp_ill->ill_lock); |
| ipmp_ill->ill_flags |= ILLF_COS_ENABLED; |
| mutex_exit(&ipmp_ill->ill_lock); |
| } |
| } |
| } |
| } |
| |
| /* |
| * Check if `ill' should be active, and activate or deactivate if need be. |
| * Return B_FALSE if a refresh was necessary but could not be performed. |
| */ |
| static boolean_t |
| ipmp_ill_try_refresh_active(ill_t *ill) |
| { |
| boolean_t refreshed = B_TRUE; |
| |
| ASSERT(IAM_WRITER_ILL(ill)); |
| ASSERT(IS_UNDER_IPMP(ill)); |
| |
| if (ipmp_ill_is_active(ill)) { |
| if (!list_link_active(&ill->ill_actnode)) |
| refreshed = ipmp_ill_activate(ill); |
| } else { |
| if (list_link_active(&ill->ill_actnode)) |
| ipmp_ill_deactivate(ill); |
| } |
| |
| return (refreshed); |
| } |
| |
| /* |
| * Check if `ill' should be active, and activate or deactivate if need be. |
| * If the refresh fails, schedule a timer to try again later. |
| */ |
| void |
| ipmp_ill_refresh_active(ill_t *ill) |
| { |
| if (!ipmp_ill_try_refresh_active(ill)) |
| ipmp_ill_refresh_active_timer_start(ill); |
| } |
| |
| /* |
| * Retry ipmp_ill_try_refresh_active() on the ill named by `ill_arg'. |
| */ |
| static void |
| ipmp_ill_refresh_active_timer(void *ill_arg) |
| { |
| ill_t *ill = ill_arg; |
| boolean_t refreshed = B_FALSE; |
| |
| /* |
| * Clear ill_refresh_tid to indicate that no timeout is pending |
| * (another thread could schedule a new timeout while we're still |
| * running, but that's harmless). If the ill is going away, bail. |
| */ |
| mutex_enter(&ill->ill_lock); |
| ill->ill_refresh_tid = 0; |
| if (ill->ill_state_flags & ILL_CONDEMNED) { |
| mutex_exit(&ill->ill_lock); |
| return; |
| } |
| mutex_exit(&ill->ill_lock); |
| |
| if (ipsq_try_enter(NULL, ill, NULL, NULL, NULL, NEW_OP, B_FALSE)) { |
| refreshed = ipmp_ill_try_refresh_active(ill); |
| ipsq_exit(ill->ill_phyint->phyint_ipsq); |
| } |
| |
| /* |
| * If the refresh failed, schedule another attempt. |
| */ |
| if (!refreshed) |
| ipmp_ill_refresh_active_timer_start(ill); |
| } |
| |
| /* |
| * Retry an ipmp_ill_try_refresh_active() on the ill named by `arg'. |
| */ |
| static void |
| ipmp_ill_refresh_active_timer_start(ill_t *ill) |
| { |
| mutex_enter(&ill->ill_lock); |
| |
| /* |
| * If the ill is going away or a refresh is already scheduled, bail. |
| */ |
| if (ill->ill_refresh_tid != 0 || |
| (ill->ill_state_flags & ILL_CONDEMNED)) { |
| mutex_exit(&ill->ill_lock); |
| return; |
| } |
| |
| ill->ill_refresh_tid = timeout(ipmp_ill_refresh_active_timer, ill, |
| SEC_TO_TICK(IPMP_ILL_REFRESH_TIMEOUT)); |
| |
| mutex_exit(&ill->ill_lock); |
| } |
| |
| /* |
| * Activate `ill' so it will be used to send and receive data traffic. Return |
| * B_FALSE if `ill' cannot be activated. Note that we allocate any messages |
| * needed to deactivate `ill' here as well so that deactivation cannot fail. |
| */ |
| static boolean_t |
| ipmp_ill_activate(ill_t *ill) |
| { |
| ipif_t *ipif; |
| mblk_t *linkupmp = NULL, *linkdownmp = NULL; |
| ipmp_grp_t *grp = ill->ill_phyint->phyint_grp; |
| ipmp_illgrp_t *illg = ill->ill_grp; |
| ill_t *maxill; |
| ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); |
| |
| ASSERT(IAM_WRITER_ILL(ill)); |
| ASSERT(IS_UNDER_IPMP(ill)); |
| |
| /* |
| * If this will be the first active interface in the group, allocate |
| * the link-up and link-down messages. |
| */ |
| if (grp->gr_nactif == 0) { |
| linkupmp = ip_dlnotify_alloc(DL_NOTE_LINK_UP, 0); |
| linkdownmp = ip_dlnotify_alloc(DL_NOTE_LINK_DOWN, 0); |
| if (linkupmp == NULL || linkdownmp == NULL) |
| goto fail; |
| } |
| |
| if (list_is_empty(&illg->ig_actif)) { |
| /* |
| * Now that we have an active ill, nominate it for multicast |
| * and broadcast duties. Do this before ipmp_ill_bind_ipif() |
| * since that may need to send multicast packets (e.g., IPv6 |
| * neighbor discovery probes). |
| */ |
| ipmp_illgrp_set_cast(illg, ill); |
| |
| /* |
| * This is the first active ill in the illgrp -- add 'em all. |
| * We can access/walk ig_ipmp_ill's ipif list since we're |
| * writer on its IPSQ as well. |
| */ |
| ipif = illg->ig_ipmp_ill->ill_ipif; |
| for (; ipif != NULL; ipif = ipif->ipif_next) |
| if (ipmp_ipif_is_up_dataaddr(ipif)) |
| ipmp_ill_bind_ipif(ill, ipif, Res_act_initial); |
| } else { |
| /* |
| * Redistribute the addresses by moving them from the ill with |
| * the most addresses until the ill being activated is at the |
| * same level as the rest of the ills. |
| */ |
| for (;;) { |
| maxill = ipmp_illgrp_max_ill(illg); |
| ASSERT(maxill != NULL); |
| if (ill->ill_bound_cnt + 1 >= maxill->ill_bound_cnt) |
| break; |
| ipif = ipmp_ill_unbind_ipif(maxill, NULL, B_TRUE); |
| ipmp_ill_bind_ipif(ill, ipif, Res_act_rebind); |
| } |
| } |
| |
| /* |
| * Put the interface in the active list. |
| */ |
| rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); |
| list_insert_tail(&illg->ig_actif, ill); |
| illg->ig_nactif++; |
| illg->ig_next_ill = ill; |
| rw_exit(&ipst->ips_ipmp_lock); |
| |
| /* |
| * Refresh static/proxy ARP entries to use `ill', if need be. |
| */ |
| if (!ill->ill_isv6) |
| ipmp_illgrp_refresh_arpent(illg); |
| |
| /* |
| * Finally, mark the group link up, if necessary. |
| */ |
| if (grp->gr_nactif++ == 0) { |
| ASSERT(grp->gr_linkdownmp == NULL); |
| grp->gr_linkdownmp = linkdownmp; |
| put(illg->ig_ipmp_ill->ill_rq, linkupmp); |
| } |
| return (B_TRUE); |
| fail: |
| freemsg(linkupmp); |
| freemsg(linkdownmp); |
| return (B_FALSE); |
| } |
| |
| /* |
| * Deactivate `ill' so it will not be used to send or receive data traffic. |
| */ |
| static void |
| ipmp_ill_deactivate(ill_t *ill) |
| { |
| ill_t *minill, *ipmp_ill; |
| ipif_t *ipif, *ubnextipif, *ubheadipif = NULL; |
| mblk_t *mp; |
| ipmp_grp_t *grp = ill->ill_phyint->phyint_grp; |
| ipmp_illgrp_t *illg = ill->ill_grp; |
| ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); |
| |
| ASSERT(IAM_WRITER_ILL(ill)); |
| ASSERT(IS_UNDER_IPMP(ill)); |
| |
| ipmp_ill = illg->ig_ipmp_ill; |
| |
| /* |
| * Pull the interface out of the active list. |
| */ |
| rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); |
| list_remove(&illg->ig_actif, ill); |
| illg->ig_nactif--; |
| illg->ig_next_ill = list_head(&illg->ig_actif); |
| rw_exit(&ipst->ips_ipmp_lock); |
| |
| /* |
| * If the ill that's being deactivated had been nominated for |
| * multicast/broadcast, nominate a new one. |
| */ |
| if (ill == illg->ig_cast_ill) |
| ipmp_illgrp_set_cast(illg, list_head(&illg->ig_actif)); |
| |
| /* |
| * Delete all nce_t entries using this ill, so that the next attempt |
| * to send data traffic will revalidate cached nce's. |
| */ |
| nce_flush(ill, B_TRUE); |
| |
| /* |
| * Unbind all of the ipifs bound to this ill, and save 'em in a list; |
| * we'll rebind them after we tell the resolver the ill is no longer |
| * active. We must do things in this order or the resolver could |
| * accidentally rebind to the ill we're trying to remove if multiple |
| * ills in the group have the same hardware address (which is |
| * unsupported, but shouldn't lead to a wedged machine). |
| */ |
| while ((ipif = ipmp_ill_unbind_ipif(ill, NULL, B_TRUE)) != NULL) { |
| ipif->ipif_bound_next = ubheadipif; |
| ubheadipif = ipif; |
| } |
| |
| if (!ill->ill_isv6) { |
| /* |
| * Refresh static/proxy ARP entries that had been using `ill'. |
| */ |
| ipmp_illgrp_refresh_arpent(illg); |
| } |
| |
| /* |
| * Rebind each ipif from the deactivated ill to the active ill with |
| * the fewest ipifs. If there are no active ills, the ipifs will |
| * remain unbound. |
| */ |
| for (ipif = ubheadipif; ipif != NULL; ipif = ubnextipif) { |
| ubnextipif = ipif->ipif_bound_next; |
| ipif->ipif_bound_next = NULL; |
| |
| if ((minill = ipmp_illgrp_min_ill(illg)) != NULL) |
| ipmp_ill_bind_ipif(minill, ipif, Res_act_rebind); |
| } |
| |
| /* |
| * Remove any IRE_IF_CLONEs for this ill since they might have an |
| * ire_nce_cache/nce_common which refers to another ill in the group. |
| */ |
| ire_walk_ill(MATCH_IRE_TYPE, IRE_IF_CLONE, ill_downi_if_clone, ill, |
| ill); |
| |
| /* |
| * Finally, if there are no longer any active interfaces, then delete |
| * any NCECs associated with the group and mark the group link down. |
| */ |
| if (--grp->gr_nactif == 0) { |
| ncec_walk(ipmp_ill, ncec_delete_per_ill, ipmp_ill, ipst); |
| mp = grp->gr_linkdownmp; |
| grp->gr_linkdownmp = NULL; |
| ASSERT(mp != NULL); |
| put(ipmp_ill->ill_rq, mp); |
| } |
| } |
| |
| /* |
| * Send the routing socket messages needed to make `ill' "appear" (RTM_ADD) |
| * or "disappear" (RTM_DELETE) to non-IPMP-aware routing socket listeners. |
| */ |
| static void |
| ipmp_ill_rtsaddrmsg(ill_t *ill, int cmd) |
| { |
| ipif_t *ipif; |
| |
| ASSERT(IAM_WRITER_ILL(ill)); |
| ASSERT(cmd == RTM_ADD || cmd == RTM_DELETE); |
| |
| /* |
| * If `ill' is truly down, there are no messages to generate since: |
| * |
| * 1. If cmd == RTM_DELETE, then we're supposed to hide the interface |
| * and its addresses by bringing them down. But that's already |
| * true, so there's nothing to hide. |
| * |
| * 2. If cmd == RTM_ADD, then we're supposed to generate messages |
| * indicating that any previously-hidden up addresses are again |
| * back up (along with the interface). But they aren't, so |
| * there's nothing to expose. |
| */ |
| if (ill->ill_ipif_up_count == 0) |
| return; |
| |
| if (cmd == RTM_ADD) |
| ip_rts_xifmsg(ill->ill_ipif, IPIF_UP, 0, RTSQ_NORMAL); |
| |
| for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) |
| if (ipif->ipif_flags & IPIF_UP) |
| ip_rts_newaddrmsg(cmd, 0, ipif, RTSQ_NORMAL); |
| |
| if (cmd == RTM_DELETE) |
| ip_rts_xifmsg(ill->ill_ipif, 0, IPIF_UP, RTSQ_NORMAL); |
| } |
| |
| /* |
| * Bind the address named by `ipif' to the underlying ill named by `ill'. |
| * If `act' is Res_act_none, don't notify the resolver. Otherwise, `act' |
| * will indicate to the resolver whether this is an initial bringup of |
| * `ipif', or just a rebind to another ill. |
| */ |
| static void |
| ipmp_ill_bind_ipif(ill_t *ill, ipif_t *ipif, enum ip_resolver_action act) |
| { |
| int err = 0; |
| ip_stack_t *ipst = ill->ill_ipst; |
| |
| ASSERT(IAM_WRITER_ILL(ill) && IAM_WRITER_IPIF(ipif)); |
| ASSERT(IS_UNDER_IPMP(ill) && IS_IPMP(ipif->ipif_ill)); |
| ASSERT(act == Res_act_none || ipmp_ipif_is_up_dataaddr(ipif)); |
| ASSERT(ipif->ipif_bound_ill == NULL); |
| ASSERT(ipif->ipif_bound_next == NULL); |
| |
| ipif->ipif_bound_next = ill->ill_bound_ipif; |
| ill->ill_bound_ipif = ipif; |
| ill->ill_bound_cnt++; |
| rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); |
| ipif->ipif_bound_ill = ill; |
| rw_exit(&ipst->ips_ipmp_lock); |
| |
| /* |
| * If necessary, tell ARP/NDP about the new mapping. Note that |
| * ipif_resolver_up() cannot fail for IPv6 ills. |
| */ |
| if (act != Res_act_none) { |
| if (ill->ill_isv6) { |
| VERIFY(ipif_resolver_up(ipif, act) == 0); |
| err = ipif_ndp_up(ipif, act == Res_act_initial); |
| } else { |
| err = ipif_resolver_up(ipif, act); |
| } |
| |
| /* |
| * Since ipif_ndp_up() never returns EINPROGRESS and |
| * ipif_resolver_up() only returns EINPROGRESS when the |
| * associated ill is not up, we should never be here with |
| * EINPROGRESS. We rely on this to simplify the design. |
| */ |
| ASSERT(err != EINPROGRESS); |
| } |
| /* TODO: retry binding on failure? when? */ |
| ipif->ipif_bound = (err == 0); |
| } |
| |
| /* |
| * Unbind the address named by `ipif' from the underlying ill named by `ill'. |
| * If `ipif' is NULL, then an arbitrary ipif on `ill' is unbound and returned. |
| * If no ipifs are bound to `ill', NULL is returned. If `notifyres' is |
| * B_TRUE, notify the resolver about the change. |
| */ |
| static ipif_t * |
| ipmp_ill_unbind_ipif(ill_t *ill, ipif_t *ipif, boolean_t notifyres) |
| { |
| ipif_t *previpif; |
| ip_stack_t *ipst = ill->ill_ipst; |
| |
| ASSERT(IAM_WRITER_ILL(ill)); |
| ASSERT(IS_UNDER_IPMP(ill)); |
| |
| /* |
| * If necessary, find an ipif to unbind. |
| */ |
| if (ipif == NULL) { |
| if ((ipif = ill->ill_bound_ipif) == NULL) { |
| ASSERT(ill->ill_bound_cnt == 0); |
| return (NULL); |
| } |
| } |
| |
| ASSERT(IAM_WRITER_IPIF(ipif)); |
| ASSERT(IS_IPMP(ipif->ipif_ill)); |
| ASSERT(ipif->ipif_bound_ill == ill); |
| ASSERT(ill->ill_bound_cnt > 0); |
| |
| /* |
| * Unbind it. |
| */ |
| rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); |
| ipif->ipif_bound_ill = NULL; |
| rw_exit(&ipst->ips_ipmp_lock); |
| ill->ill_bound_cnt--; |
| |
| if (ill->ill_bound_ipif == ipif) { |
| ill->ill_bound_ipif = ipif->ipif_bound_next; |
| } else { |
| previpif = ill->ill_bound_ipif; |
| while (previpif->ipif_bound_next != ipif) |
| previpif = previpif->ipif_bound_next; |
| |
| previpif->ipif_bound_next = ipif->ipif_bound_next; |
| } |
| ipif->ipif_bound_next = NULL; |
| |
| /* |
| * If requested, notify the resolvers (provided we're bound). |
| */ |
| if (notifyres && ipif->ipif_bound) { |
| if (ill->ill_isv6) |
| ipif_ndp_down(ipif); |
| else |
| (void) ipif_arp_down(ipif); |
| } |
| ipif->ipif_bound = B_FALSE; |
| |
| return (ipif); |
| } |
| |
| /* |
| * Check if `ill' is active. Caller must hold ill_lock and phyint_lock if |
| * it's not inside the IPSQ. Since ipmp_ill_try_refresh_active() calls this |
| * to determine whether an ill should be considered active, other consumers |
| * may race and learn about an ill that should be deactivated/activated before |
| * IPMP has performed the activation/deactivation. This should be safe though |
| * since at worst e.g. ire_atomic_start() will prematurely delete an IRE that |
| * would've been cleaned up by ipmp_ill_deactivate(). |
| */ |
| boolean_t |
| ipmp_ill_is_active(ill_t *ill) |
| { |
| phyint_t *phyi = ill->ill_phyint; |
| |
| ASSERT(IS_UNDER_IPMP(ill)); |
| ASSERT(IAM_WRITER_ILL(ill) || |
| (MUTEX_HELD(&ill->ill_lock) && MUTEX_HELD(&phyi->phyint_lock))); |
| |
| /* |
| * Note that PHYI_RUNNING isn't checked since we rely on in.mpathd to |
| * set PHYI_FAILED whenever PHYI_RUNNING is cleared. This allows the |
| * link flapping logic to be just in in.mpathd and allows us to ignore |
| * changes to PHYI_RUNNING. |
| */ |
| return (!(ill->ill_ipif_up_count == 0 || |
| (phyi->phyint_flags & (PHYI_OFFLINE|PHYI_INACTIVE|PHYI_FAILED)))); |
| } |
| |
| /* |
| * IRE walker callback: set ire_testhidden on IRE_HIDDEN_TYPE IREs associated |
| * with `ill_arg'. |
| */ |
| static void |
| ipmp_ill_ire_mark_testhidden(ire_t *ire, char *ill_arg) |
| { |
| ill_t *ill = (ill_t *)ill_arg; |
| |
| ASSERT(IAM_WRITER_ILL(ill)); |
| ASSERT(!IS_IPMP(ill)); |
| |
| if (ire->ire_ill != ill) |
| return; |
| |
| if (IRE_HIDDEN_TYPE(ire->ire_type)) { |
| DTRACE_PROBE1(ipmp__mark__testhidden, ire_t *, ire); |
| ire->ire_testhidden = B_TRUE; |
| } |
| } |
| |
| /* |
| * IRE walker callback: clear ire_testhidden if the IRE has a source address |
| * on `ill_arg'. |
| */ |
| static void |
| ipmp_ill_ire_clear_testhidden(ire_t *ire, char *ill_arg) |
| { |
| ill_t *ill = (ill_t *)ill_arg; |
| |
| ASSERT(IAM_WRITER_ILL(ill)); |
| ASSERT(!IS_IPMP(ill)); |
| |
| if (ire->ire_ill == ill) { |
| DTRACE_PROBE1(ipmp__clear__testhidden, ire_t *, ire); |
| ire->ire_testhidden = B_FALSE; |
| } |
| } |
| |
| /* |
| * Return a held pointer to the IPMP ill for underlying interface `ill', or |
| * NULL if one doesn't exist. (Unfortunately, this function needs to take an |
| * underlying ill rather than an ipmp_illgrp_t because an underlying ill's |
| * ill_grp pointer may become stale when not inside an IPSQ and not holding |
| * ipmp_lock.) Caller need not be inside the IPSQ. |
| */ |
| ill_t * |
| ipmp_ill_hold_ipmp_ill(ill_t *ill) |
| { |
| ip_stack_t *ipst = ill->ill_ipst; |
| ipmp_illgrp_t *illg; |
| |
| ASSERT(!IS_IPMP(ill)); |
| |
| rw_enter(&ipst->ips_ipmp_lock, RW_READER); |
| illg = ill->ill_grp; |
| if (illg != NULL && ill_check_and_refhold(illg->ig_ipmp_ill)) { |
| rw_exit(&ipst->ips_ipmp_lock); |
| return (illg->ig_ipmp_ill); |
| } |
| /* |
| * Assume `ill' was removed from the illgrp in the meantime. |
| */ |
| rw_exit(&ill->ill_ipst->ips_ipmp_lock); |
| return (NULL); |
| } |
| |
| /* |
| * Return a held pointer to the appropriate underlying ill for sending the |
| * specified type of packet. (Unfortunately, this function needs to take an |
| * underlying ill rather than an ipmp_illgrp_t because an underlying ill's |
| * ill_grp pointer may become stale when not inside an IPSQ and not holding |
| * ipmp_lock.) Caller need not be inside the IPSQ. |
| */ |
| ill_t * |
| ipmp_ill_hold_xmit_ill(ill_t *ill, boolean_t is_unicast) |
| { |
| ill_t *xmit_ill; |
| ip_stack_t *ipst = ill->ill_ipst; |
| |
| rw_enter(&ipst->ips_ill_g_lock, RW_READER); |
| if (ill->ill_grp == NULL) { |
| /* |
| * The ill was taken out of the group, so just send on it. |
| */ |
| rw_exit(&ipst->ips_ill_g_lock); |
| ill_refhold(ill); |
| return (ill); |
| } |
| if (is_unicast) |
| xmit_ill = ipmp_illgrp_hold_next_ill(ill->ill_grp); |
| else |
| xmit_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp); |
| rw_exit(&ipst->ips_ill_g_lock); |
| |
| return (xmit_ill); |
| } |
| |
| /* |
| * Return the interface index for the IPMP ill tied to underlying interface |
| * `ill', or zero if one doesn't exist. Caller need not be inside the IPSQ. |
| */ |
| uint_t |
| ipmp_ill_get_ipmp_ifindex(const ill_t *ill) |
| { |
| uint_t ifindex = 0; |
| ip_stack_t *ipst = ill->ill_ipst; |
| ipmp_grp_t *grp; |
| |
| ASSERT(!IS_IPMP(ill)); |
| |
| rw_enter(&ipst->ips_ipmp_lock, RW_READER); |
| if ((grp = ill->ill_phyint->phyint_grp) != NULL) |
| ifindex = grp->gr_phyint->phyint_ifindex; |
| rw_exit(&ipst->ips_ipmp_lock); |
| return (ifindex); |
| } |
| |
| /* |
| * Place phyint `phyi' into IPMP group `grp'. |
| */ |
| void |
| ipmp_phyint_join_grp(phyint_t *phyi, ipmp_grp_t *grp) |
| { |
| ill_t *ill; |
| ipsq_t *ipsq = phyi->phyint_ipsq; |
| ipsq_t *grp_ipsq = grp->gr_phyint->phyint_ipsq; |
| ip_stack_t *ipst = PHYINT_TO_IPST(phyi); |
| |
| ASSERT(IAM_WRITER_IPSQ(ipsq)); |
| ASSERT(phyi->phyint_illv4 != NULL || phyi->phyint_illv6 != NULL); |
| ill = NULL; |
| |
| /* |
| * Send routing socket messages indicating that the phyint's ills |
| * and ipifs vanished. |
| */ |
| if (phyi->phyint_illv4 != NULL) { |
| ill = phyi->phyint_illv4; |
| ipmp_ill_rtsaddrmsg(ill, RTM_DELETE); |
| } |
| |
| if (phyi->phyint_illv6 != NULL) { |
| ill = phyi->phyint_illv6; |
| ipmp_ill_rtsaddrmsg(ill, RTM_DELETE); |
| } |
| |
| /* |
| * Snapshot the phyint's initial kstats as a baseline. |
| */ |
| ipmp_phyint_get_kstats(phyi, phyi->phyint_kstats0); |
| |
| rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); |
| |
| phyi->phyint_grp = grp; |
| if (++grp->gr_nif == 1) |
| grp->gr_mactype = ill->ill_mactype; |
| else |
| ASSERT(grp->gr_mactype == ill->ill_mactype); |
| |
| /* |
| * Now that we're in the group, request a switch to the group's xop |
| * when we ipsq_exit(). All future operations will be exclusive on |
| * the group xop until ipmp_phyint_leave_grp() is called. |
| */ |
| ASSERT(ipsq->ipsq_swxop == NULL); |
| ASSERT(grp_ipsq->ipsq_xop == &grp_ipsq->ipsq_ownxop); |
| ipsq->ipsq_swxop = &grp_ipsq->ipsq_ownxop; |
| |
| rw_exit(&ipst->ips_ipmp_lock); |
| } |
| |
| /* |
| * Remove phyint `phyi' from its current IPMP group. |
| */ |
| void |
| ipmp_phyint_leave_grp(phyint_t *phyi) |
| { |
| uint_t i; |
| ipsq_t *ipsq = phyi->phyint_ipsq; |
| ip_stack_t *ipst = PHYINT_TO_IPST(phyi); |
| uint64_t phyi_kstats[IPMP_KSTAT_MAX]; |
| |
| ASSERT(IAM_WRITER_IPSQ(ipsq)); |
| |
| /* |
| * If any of the phyint's ills are still in an illgrp, kick 'em out. |
| */ |
| if (phyi->phyint_illv4 != NULL && IS_UNDER_IPMP(phyi->phyint_illv4)) |
| ipmp_ill_leave_illgrp(phyi->phyint_illv4); |
| if (phyi->phyint_illv6 != NULL && IS_UNDER_IPMP(phyi->phyint_illv6)) |
| ipmp_ill_leave_illgrp(phyi->phyint_illv6); |
| |
| /* |
| * Send routing socket messages indicating that the phyint's ills |
| * and ipifs have reappeared. |
| */ |
| if (phyi->phyint_illv4 != NULL) |
| ipmp_ill_rtsaddrmsg(phyi->phyint_illv4, RTM_ADD); |
| if (phyi->phyint_illv6 != NULL) |
| ipmp_ill_rtsaddrmsg(phyi->phyint_illv6, RTM_ADD); |
| |
| /* |
| * Calculate the phyint's cumulative kstats while it was in the group, |
| * and add that to the group's baseline. |
| */ |
| ipmp_phyint_get_kstats(phyi, phyi_kstats); |
| for (i = 0; i < IPMP_KSTAT_MAX; i++) { |
| phyi_kstats[i] -= phyi->phyint_kstats0[i]; |
| atomic_add_64(&phyi->phyint_grp->gr_kstats0[i], phyi_kstats[i]); |
| } |
| |
| rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); |
| |
| phyi->phyint_grp->gr_nif--; |
| phyi->phyint_grp = NULL; |
| |
| /* |
| * As our final act in leaving the group, request a switch back to our |
| * IPSQ's own xop when we ipsq_exit(). |
| */ |
| ASSERT(ipsq->ipsq_swxop == NULL); |
| ipsq->ipsq_swxop = &ipsq->ipsq_ownxop; |
| |
| rw_exit(&ipst->ips_ipmp_lock); |
| } |
| |
| /* |
| * Store the IPMP-related kstats for `phyi' into the array named by `kstats'. |
| * Assumes that `kstats' has at least IPMP_KSTAT_MAX elements. |
| */ |
| static void |
| ipmp_phyint_get_kstats(phyint_t *phyi, uint64_t kstats[]) |
| { |
| uint_t i, j; |
| const char *name; |
| kstat_t *ksp; |
| kstat_named_t *kn; |
| ip_stack_t *ipst = PHYINT_TO_IPST(phyi); |
| zoneid_t zoneid; |
| |
| bzero(kstats, sizeof (kstats[0]) * IPMP_KSTAT_MAX); |
| zoneid = netstackid_to_zoneid(ipst->ips_netstack->netstack_stackid); |
| ksp = kstat_hold_byname("link", 0, phyi->phyint_name, zoneid); |
| if (ksp == NULL) |
| return; |
| |
| KSTAT_ENTER(ksp); |
| |
| if (ksp->ks_data != NULL && ksp->ks_type == KSTAT_TYPE_NAMED) { |
| /* |
| * Bring kstats up-to-date before recording. |
| */ |
| (void) KSTAT_UPDATE(ksp, KSTAT_READ); |
| |
| kn = KSTAT_NAMED_PTR(ksp); |
| for (i = 0; i < IPMP_KSTAT_MAX; i++) { |
| name = ipmp_kstats[i].name; |
| kstats[i] = 0; |
| for (j = 0; j < ksp->ks_ndata; j++) { |
| if (strcmp(kn[j].name, name) != 0) |
| continue; |
| |
| switch (kn[j].data_type) { |
| case KSTAT_DATA_INT32: |
| case KSTAT_DATA_UINT32: |
| kstats[i] = kn[j].value.ui32; |
| break; |
| #ifdef _LP64 |
| case KSTAT_DATA_LONG: |
| case KSTAT_DATA_ULONG: |
| kstats[i] = kn[j].value.ul; |
| break; |
| #endif |
| case KSTAT_DATA_INT64: |
| case KSTAT_DATA_UINT64: |
| kstats[i] = kn[j].value.ui64; |
| break; |
| } |
| break; |
| } |
| } |
| } |
| |
| KSTAT_EXIT(ksp); |
| kstat_rele(ksp); |
| } |
| |
| /* |
| * Refresh the active state of all ills on `phyi'. |
| */ |
| void |
| ipmp_phyint_refresh_active(phyint_t *phyi) |
| { |
| if (phyi->phyint_illv4 != NULL) |
| ipmp_ill_refresh_active(phyi->phyint_illv4); |
| if (phyi->phyint_illv6 != NULL) |
| ipmp_ill_refresh_active(phyi->phyint_illv6); |
| } |
| |
| /* |
| * Return a held pointer to the underlying ill bound to `ipif', or NULL if one |
| * doesn't exist. Caller need not be inside the IPSQ. |
| */ |
| ill_t * |
| ipmp_ipif_hold_bound_ill(const ipif_t *ipif) |
| { |
| ill_t *boundill; |
| ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; |
| |
| ASSERT(IS_IPMP(ipif->ipif_ill)); |
| |
| rw_enter(&ipst->ips_ipmp_lock, RW_READER); |
| boundill = ipif->ipif_bound_ill; |
| if (boundill != NULL && ill_check_and_refhold(boundill)) { |
| rw_exit(&ipst->ips_ipmp_lock); |
| return (boundill); |
| } |
| rw_exit(&ipst->ips_ipmp_lock); |
| return (NULL); |
| } |
| |
| /* |
| * Return a pointer to the underlying ill bound to `ipif', or NULL if one |
| * doesn't exist. Caller must be inside the IPSQ. |
| */ |
| ill_t * |
| ipmp_ipif_bound_ill(const ipif_t *ipif) |
| { |
| ASSERT(IAM_WRITER_ILL(ipif->ipif_ill)); |
| ASSERT(IS_IPMP(ipif->ipif_ill)); |
| |
| return (ipif->ipif_bound_ill); |
| } |
| |
| /* |
| * Check if `ipif' is a "stub" (placeholder address not being used). |
| */ |
| boolean_t |
| ipmp_ipif_is_stubaddr(const ipif_t *ipif) |
| { |
| if (ipif->ipif_flags & IPIF_UP) |
| return (B_FALSE); |
| if (ipif->ipif_ill->ill_isv6) |
| return (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)); |
| else |
| return (ipif->ipif_lcl_addr == INADDR_ANY); |
| } |
| |
| /* |
| * Check if `ipif' is an IPMP data address. |
| */ |
| boolean_t |
| ipmp_ipif_is_dataaddr(const ipif_t *ipif) |
| { |
| if (ipif->ipif_flags & IPIF_NOFAILOVER) |
| return (B_FALSE); |
| if (ipif->ipif_ill->ill_isv6) |
| return (!IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)); |
| else |
| return (ipif->ipif_lcl_addr != INADDR_ANY); |
| } |
| |
| /* |
| * Check if `ipif' is an IPIF_UP IPMP data address. |
| */ |
| static boolean_t |
| ipmp_ipif_is_up_dataaddr(const ipif_t *ipif) |
| { |
| return (ipmp_ipif_is_dataaddr(ipif) && (ipif->ipif_flags & IPIF_UP)); |
| } |
| |
| /* |
| * Check if `mp' contains a probe packet by checking if the IP source address |
| * is a test address on underlying interface `ill'. Caller need not be inside |
| * the IPSQ. |
| */ |
| boolean_t |
| ipmp_packet_is_probe(mblk_t *mp, ill_t *ill) |
| { |
| ip6_t *ip6h = (ip6_t *)mp->b_rptr; |
| ipha_t *ipha = (ipha_t *)mp->b_rptr; |
| |
| ASSERT(DB_TYPE(mp) != M_CTL); |
| |
| if (!IS_UNDER_IPMP(ill)) |
| return (B_FALSE); |
| |
| if (ill->ill_isv6) { |
| if (!IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src) && |
| ipif_lookup_testaddr_v6(ill, &ip6h->ip6_src, NULL)) |
| return (B_TRUE); |
| } else { |
| if (ipha->ipha_src != INADDR_ANY && |
| ipif_lookup_testaddr_v4(ill, &ipha->ipha_src, NULL)) |
| return (B_TRUE); |
| } |
| return (B_FALSE); |
| } |
| |
| /* |
| * NCEC walker callback: delete `ncec' if it is associated with `ill_arg' and |
| * is not one of our local addresses. Caller must be inside the IPSQ. |
| */ |
| static void |
| ipmp_ncec_delete_nonlocal(ncec_t *ncec, void *ill_arg) |
| { |
| if (!NCE_MYADDR(ncec) && ncec->ncec_ill == (ill_t *)ill_arg) |
| ncec_delete(ncec); |
| } |
| |
| /* |
| * Delete any NCEs tied to the illgrp associated with `ncec'. Caller need not |
| * be inside the IPSQ. |
| */ |
| void |
| ipmp_ncec_delete_nce(ncec_t *ncec) |
| { |
| ipmp_illgrp_t *illg = ncec->ncec_ill->ill_grp; |
| ip_stack_t *ipst = ncec->ncec_ipst; |
| ill_t *ill; |
| nce_t *nce; |
| list_t dead; |
| |
| ASSERT(IS_IPMP(ncec->ncec_ill)); |
| |
| /* |
| * For each underlying interface, delete `ncec' from its ill_nce list |
| * via nce_fastpath_list_delete(). Defer the actual nce_refrele() |
| * until we've dropped ill_g_lock. |
| */ |
| list_create(&dead, sizeof (nce_t), offsetof(nce_t, nce_node)); |
| |
| rw_enter(&ipst->ips_ill_g_lock, RW_READER); |
| ill = list_head(&illg->ig_if); |
| for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) |
| nce_fastpath_list_delete(ill, ncec, &dead); |
| rw_exit(&ipst->ips_ill_g_lock); |
| |
| while ((nce = list_remove_head(&dead)) != NULL) |
| nce_refrele(nce); |
| |
| list_destroy(&dead); |
| } |
| |
| /* |
| * Refresh any NCE entries tied to the illgrp associated with `ncec' to |
| * use the information in `ncec'. Caller need not be inside the IPSQ. |
| */ |
| void |
| ipmp_ncec_refresh_nce(ncec_t *ncec) |
| { |
| ipmp_illgrp_t *illg = ncec->ncec_ill->ill_grp; |
| ip_stack_t *ipst = ncec->ncec_ipst; |
| ill_t *ill; |
| nce_t *nce, *nce_next; |
| list_t replace; |
| |
| ASSERT(IS_IPMP(ncec->ncec_ill)); |
| |
| /* |
| * If `ncec' is not reachable, there is no use in refreshing NCEs. |
| */ |
| if (!NCE_ISREACHABLE(ncec)) |
| return; |
| |
| /* |
| * Find all the NCEs matching ncec->ncec_addr. We cannot update them |
| * in-situ because we're holding ipmp_lock to prevent changes to IPMP |
| * group membership and updating indirectly calls nce_fastpath_probe() |
| * -> putnext() which cannot hold locks. Thus, move the NCEs to a |
| * separate list and process that list after dropping ipmp_lock. |
| */ |
| list_create(&replace, sizeof (nce_t), offsetof(nce_t, nce_node)); |
| rw_enter(&ipst->ips_ipmp_lock, RW_READER); |
| ill = list_head(&illg->ig_actif); |
| for (; ill != NULL; ill = list_next(&illg->ig_actif, ill)) { |
| mutex_enter(&ill->ill_lock); |
| nce = list_head(&ill->ill_nce); |
| for (; nce != NULL; nce = nce_next) { |
| nce_next = list_next(&ill->ill_nce, nce); |
| if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, |
| &ncec->ncec_addr)) { |
| nce_refhold(nce); |
| nce_delete(nce); |
| list_insert_tail(&replace, nce); |
| } |
| } |
| mutex_exit(&ill->ill_lock); |
| } |
| rw_exit(&ipst->ips_ipmp_lock); |
| |
| /* |
| * Process the list; nce_lookup_then_add_v* ensures that nce->nce_ill |
| * is still in the group for ncec->ncec_ill. |
| */ |
| while ((nce = list_remove_head(&replace)) != NULL) { |
| if (ncec->ncec_ill->ill_isv6) { |
| (void) nce_lookup_then_add_v6(nce->nce_ill, |
| ncec->ncec_lladdr, ncec->ncec_lladdr_length, |
| &nce->nce_addr, ncec->ncec_flags, ND_UNCHANGED, |
| NULL); |
| } else { |
| ipaddr_t ipaddr; |
| |
| IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ipaddr); |
| (void) nce_lookup_then_add_v4(nce->nce_ill, |
| ncec->ncec_lladdr, ncec->ncec_lladdr_length, |
| &ipaddr, ncec->ncec_flags, ND_UNCHANGED, NULL); |
| } |
| nce_refrele(nce); |
| } |
| |
| list_destroy(&replace); |
| } |