4173841 Packet goes out with source IP address of another interface
6921533 ioctls could be executed when the thread is not a WRITER on the ipif
6921451 ira_pktlen not computed correctly for ipsec packets
6921615 IPMP need ~5 seconds for traffic passing through when its state transfers from failed
diff --git a/usr/src/uts/common/inet/ip.h b/usr/src/uts/common/inet/ip.h
index 26a909b..0cc4b52 100644
--- a/usr/src/uts/common/inet/ip.h
+++ b/usr/src/uts/common/inet/ip.h
@@ -2242,8 +2242,9 @@
/*
* The normal flags for sending packets e.g., icmp errors
*/
-#define IXAF_BASIC_SIMPLE_V4 (IXAF_SET_ULP_CKSUM | IXAF_IS_IPV4)
-#define IXAF_BASIC_SIMPLE_V6 (IXAF_SET_ULP_CKSUM)
+#define IXAF_BASIC_SIMPLE_V4 \
+ (IXAF_SET_ULP_CKSUM | IXAF_IS_IPV4 | IXAF_VERIFY_SOURCE)
+#define IXAF_BASIC_SIMPLE_V6 (IXAF_SET_ULP_CKSUM | IXAF_VERIFY_SOURCE)
/*
* Normally these fields do not have a hold. But in some cases they do, for
@@ -2677,6 +2678,12 @@
boolean_t ire_trace_disable; /* True when alloc fails */
ip_stack_t *ire_ipst; /* Does not have a netstack_hold */
iulp_t ire_metrics;
+ /*
+ * default and prefix routes that are added without explicitly
+ * specifying the interface are termed "unbound" routes, and will
+ * have ire_unbound set to true.
+ */
+ boolean_t ire_unbound;
};
/* IPv4 compatibility macros */
@@ -3005,6 +3012,8 @@
#define ips_ipv6_icmp_return_pmtu ips_param_arr[73].ip_param_value
#define ips_ip_arp_publish_count ips_param_arr[74].ip_param_value
#define ips_ip_arp_publish_interval ips_param_arr[75].ip_param_value
+#define ips_ip_strict_src_multihoming ips_param_arr[76].ip_param_value
+#define ips_ipv6_strict_src_multihoming ips_param_arr[77].ip_param_value
extern int dohwcksum; /* use h/w cksum if supported by the h/w */
#ifdef ZC_TEST
diff --git a/usr/src/uts/common/inet/ip/icmp.c b/usr/src/uts/common/inet/ip/icmp.c
index 0df82f2..57a5f86 100644
--- a/usr/src/uts/common/inet/ip/icmp.c
+++ b/usr/src/uts/common/inet/ip/icmp.c
@@ -3103,6 +3103,16 @@
/* Even for multicast and broadcast we honor the apps ttl */
ixa->ixa_flags |= IXAF_NO_TTL_CHANGE;
+ /*
+ * No source verification for non-local addresses
+ */
+ if (ipha->ipha_src != INADDR_ANY &&
+ ip_laddr_verify_v4(ipha->ipha_src, ixa->ixa_zoneid,
+ is->is_netstack->netstack_ip, B_FALSE)
+ != IPVL_UNICAST_UP) {
+ ixa->ixa_flags &= ~IXAF_VERIFY_SOURCE;
+ }
+
if (ipha->ipha_dst == INADDR_ANY)
ipha->ipha_dst = htonl(INADDR_LOOPBACK);
@@ -3468,6 +3478,26 @@
v6src = ipp->ipp_addr;
}
}
+ /*
+ * Allow source not assigned to the system
+ * only if it is not a local address
+ */
+ if (!V6_OR_V4_INADDR_ANY(v6src)) {
+ ip_laddr_t laddr_type;
+
+ if (ixa->ixa_flags & IXAF_IS_IPV4) {
+ ipaddr_t v4src;
+
+ IN6_V4MAPPED_TO_IPADDR(&v6src, v4src);
+ laddr_type = ip_laddr_verify_v4(v4src, ixa->ixa_zoneid,
+ is->is_netstack->netstack_ip, B_FALSE);
+ } else {
+ laddr_type = ip_laddr_verify_v6(&v6src, ixa->ixa_zoneid,
+ is->is_netstack->netstack_ip, B_FALSE, B_FALSE);
+ }
+ if (laddr_type != IPVL_UNICAST_UP)
+ ixa->ixa_flags &= ~IXAF_VERIFY_SOURCE;
+ }
ip_attr_nexthop(ipp, ixa, &v6dst, &v6nexthop);
error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, dstport,
@@ -3562,8 +3592,6 @@
/* We're done. Pass the packet to ip. */
BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
- /* Allow source not assigned to the system? */
- ixa->ixa_flags &= ~IXAF_VERIFY_SOURCE;
error = conn_ip_output(mp, ixa);
if (!connp->conn_unspec_src)
ixa->ixa_flags |= IXAF_VERIFY_SOURCE;
diff --git a/usr/src/uts/common/inet/ip/ip.c b/usr/src/uts/common/inet/ip/ip.c
index 26547e6..8cf5e7e 100644
--- a/usr/src/uts/common/inet/ip/ip.c
+++ b/usr/src/uts/common/inet/ip/ip.c
@@ -826,6 +826,10 @@
{ 0, 99999, 100, "ip_icmp_err_interval" },
{ 1, 99999, 10, "ip_icmp_err_burst" },
{ 0, 999999999, 1000000, "ip_reass_queue_bytes" },
+ /*
+ * See comments for ip_strict_src_multihoming for an explanation
+ * of the semantics of ip_strict_dst_multihoming
+ */
{ 0, 1, 0, "ip_strict_dst_multihoming" },
{ 1, MAX_ADDRS_PER_IF, 256, "ip_addrs_per_if"},
{ 0, 1, 0, "ipsec_override_persocket_policy" },
@@ -841,6 +845,10 @@
{ 0, 1, 1, "ip6_respond_to_echo_multicast"},
{ 0, 1, 1, "ip6_send_redirects"},
{ 0, 1, 0, "ip6_ignore_redirect" },
+ /*
+ * See comments for ip6_strict_src_multihoming for an explanation
+ * of the semantics of ip6_strict_dst_multihoming
+ */
{ 0, 1, 0, "ip6_strict_dst_multihoming" },
{ 0, 2, 2, "ip_src_check" },
@@ -907,7 +915,48 @@
* for IPv4, IPv6.
*/
{ 1, 20, 5, "ip_arp_publish_count" },
- { 1000, 20000, 2000, "ip_arp_publish_interval" },
+ { 1000, 20000, 2000, "ip_arp_publish_interval" },
+ /*
+ * The ip*strict_src_multihoming and ip*strict_dst_multihoming provide
+ * a range of choices for setting strong/weak/preferred end-system
+ * behavior. The semantics for setting these are:
+ *
+ * ip*_strict_dst_multihoming = 0
+ * weak end system model for managing ip destination addresses.
+ * A packet with IP dst D1 that's received on interface I1 will be
+ * accepted as long as D1 is one of the local addresses on
+ * the machine, even if D1 is not configured on I1.
+ * ip*strict_dst_multihoming = 1
+ * strong end system model for managing ip destination addresses.
+ * A packet with IP dst D1 that's received on interface I1 will be
+ * accepted if, and only if, D1 is configured on I1.
+ *
+ * ip*strict_src_multihoming = 0
+ * Source agnostic route selection for outgoing packets: the
+ * outgoing interface for a packet will be computed using
+ * default algorithms for route selection, where the route
+ * with the longest matching prefix is chosen for the output
+ * unless other route selection constraints are explicitly
+ * specified during routing table lookup. This may result
+ * in packet being sent out on interface I2 with source
+ * address S1, even though S1 is not a configured address on I2.
+ * ip*strict_src_multihoming = 1
+ * Preferred source aware route selection for outgoing packets: for
+ * a packet with source S2, destination D2, the route selection
+ * algorithm will first attempt to find a route for the destination
+ * that goes out through an interface where S2 is
+ * configured. If such a route cannot be found, then the
+ * best-matching route for D2 will be selected.
+ * ip*strict_src_multihoming = 2
+ * Source aware route selection for outgoing packets: a packet will
+ * be sent out on an interface I2 only if the src address S2 of the
+ * packet is a configured address on I2. In conjunction with
+ * the setting 'ip_strict_dst_multihoming == 1', this will result in
+ * the implementation of Strong ES as defined in Section 3.3.4.2 of
+ * RFC 1122
+ */
+ { 0, 2, 0, "ip_strict_src_multihoming" },
+ { 0, 2, 0, "ip6_strict_src_multihoming" }
};
/*
@@ -3562,8 +3611,8 @@
* a "hidden" route (i.e., going through a specific under_ill)
* if ixa_ifindex has been specified.
*/
- ire = ip_select_route_v4(firsthop, ixa, &generation, &setsrc, &error,
- &multirt);
+ ire = ip_select_route_v4(firsthop, *src_addrp, ixa,
+ &generation, &setsrc, &error, &multirt);
ASSERT(ire != NULL); /* IRE_NOROUTE if none found */
if (error != 0)
goto bad_addr;
@@ -6773,6 +6822,85 @@
return (B_TRUE);
}
+/*
+ * When the src multihoming is changed from weak to [strong, preferred]
+ * ip_ire_rebind_walker is called to walk the list of all ire_t entries
+ * and identify routes that were created by user-applications in the
+ * unbound state (i.e., without RTA_IFP), and for which an ire_ill is not
+ * currently defined. These routes are then 'rebound', i.e., their ire_ill
+ * is selected by finding an interface route for the gateway.
+ */
+/* ARGSUSED */
+static void
+ip_ire_rebind_walker(ire_t *ire, void *notused)
+{
+ if (!ire->ire_unbound || ire->ire_ill != NULL)
+ return; /* not an unbound route, or it already has an ire_ill */
+ ire_rebind(ire);
+ ire_delete(ire); /* the original ire goes away after the rebind call */
+}
+
+/*
+ * When the src multihoming is changed from [strong, preferred] to weak,
+ * ip_ire_unbind_walker is called to walk the list of all ire_t entries, and
+ * set any entries that were created by user-applications in the unbound state
+ * (i.e., without RTA_IFP) back to having a NULL ire_ill.
+ */
+/* ARGSUSED */
+static void
+ip_ire_unbind_walker(ire_t *ire, void *notused)
+{
+ ire_t *new_ire;
+
+ if (!ire->ire_unbound || ire->ire_ill == NULL)
+ return; /* not an unbound route, or it has no ire_ill to drop */
+ if (ire->ire_ipversion == IPV6_VERSION) {
+ new_ire = ire_create_v6(&ire->ire_addr_v6, &ire->ire_mask_v6,
+ &ire->ire_gateway_addr_v6, ire->ire_type, NULL,
+ ire->ire_zoneid, ire->ire_flags, NULL, ire->ire_ipst);
+ } else {
+ new_ire = ire_create((uchar_t *)&ire->ire_addr,
+ (uchar_t *)&ire->ire_mask,
+ (uchar_t *)&ire->ire_gateway_addr, ire->ire_type, NULL,
+ ire->ire_zoneid, ire->ire_flags, NULL, ire->ire_ipst);
+ }
+ if (new_ire == NULL)
+ return; /* no replacement was created; the bound ire stays */
+ new_ire->ire_unbound = B_TRUE;
+ /*
+ * The bound ire must first be deleted so that we don't return
+ * the existing one on the attempt to add the unbound new_ire.
+ */
+ ire_delete(ire);
+ new_ire = ire_add(new_ire);
+ if (new_ire != NULL)
+ ire_refrele(new_ire); /* presumably the hold from ire_add() */
+}
+
+/*
+ * When the settings of ip*_strict_src_multihoming tunables are changed,
+ * all cached routes need to be recomputed. This recomputation needs to be
+ * done when going from weaker to stronger modes so that the cached ire
+ * for the connection does not violate the current ip*_strict_src_multihoming
+ * setting. It also needs to be done when going from stronger to weaker modes,
+ * so that we fall back to matching on the longest-matching-route (as opposed
+ * to a shorter match that may have been selected in the strong mode
+ * to satisfy src_multihoming settings).
+ *
+ * The cached ixa_ire entries for all conn_t entries are marked as
+ * "verify" so that they will be recomputed for the next packet.
+ */
+static void
+conn_ire_revalidate(conn_t *connp, void *arg)
+{
+ boolean_t isv6 = (boolean_t)arg; /* B_TRUE: touch IPv6 conns only */
+
+ if ((isv6 && connp->conn_ipversion != IPV6_VERSION) ||
+ (!isv6 && connp->conn_ipversion != IPV4_VERSION))
+ return; /* conn is not of the requested IP version */
+ connp->conn_ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
+}
+
/* Named Dispatch routine to negotiate a new value for one of our parameters. */
/* ARGSUSED */
static int
@@ -6780,12 +6908,44 @@
{
long new_value;
ipparam_t *ippa = (ipparam_t *)cp;
+ ip_stack_t *ipst = CONNQ_TO_IPST(q);
+ int strict_src4, strict_src6;
+ strict_src4 = ipst->ips_ip_strict_src_multihoming;
+ strict_src6 = ipst->ips_ipv6_strict_src_multihoming;
if (ddi_strtol(value, NULL, 10, &new_value) != 0 ||
new_value < ippa->ip_param_min || new_value > ippa->ip_param_max) {
return (EINVAL);
}
ippa->ip_param_value = new_value;
+ /*
+ * strict_src4 and strict_src6 were sampled before the update, so a
+ * mismatch means this call changed that family's tunable. Walk
+ * the IREs of the family that changed (rebind if the old value
+ * was 0, otherwise unbind) and mark its conns for revalidation.
+ * NOTE(review): the unbind walk also runs on 1 <-> 2 changes,
+ * though ip_ire_unbind_walker is described as the step back to
+ * weak (0); confirm whether that is intended.
+ */
+ if (ipst->ips_ip_strict_src_multihoming != strict_src4) {
+ if (strict_src4 == 0) {
+ ire_walk_v4(ip_ire_rebind_walker, NULL, ALL_ZONES,
+ ipst);
+ } else {
+ ire_walk_v4(ip_ire_unbind_walker, NULL, ALL_ZONES,
+ ipst);
+ }
+ ipcl_walk(conn_ire_revalidate, (void *)B_FALSE, ipst);
+ } else if (ipst->ips_ipv6_strict_src_multihoming != strict_src6) {
+ if (strict_src6 == 0) {
+ ire_walk_v6(ip_ire_rebind_walker, NULL, ALL_ZONES,
+ ipst);
+ } else {
+ ire_walk_v6(ip_ire_unbind_walker, NULL, ALL_ZONES,
+ ipst);
+ }
+ ipcl_walk(conn_ire_revalidate, (void *)B_TRUE, ipst);
+ }
return (0);
}
diff --git a/usr/src/uts/common/inet/ip/ip6.c b/usr/src/uts/common/inet/ip/ip6.c
index 44d777f..2cc88ff 100644
--- a/usr/src/uts/common/inet/ip/ip6.c
+++ b/usr/src/uts/common/inet/ip/ip6.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
@@ -2004,8 +2004,8 @@
* a "hidden" route (i.e., going through a specific under_ill)
* if ixa_ifindex has been specified.
*/
- ire = ip_select_route_v6(firsthop, ixa, &generation, &setsrc, &error,
- &multirt);
+ ire = ip_select_route_v6(firsthop, *src_addrp, ixa, &generation,
+ &setsrc, &error, &multirt);
ASSERT(ire != NULL); /* IRE_NOROUTE if none found */
if (error != 0)
goto bad_addr;
diff --git a/usr/src/uts/common/inet/ip/ip6_if.c b/usr/src/uts/common/inet/ip/ip6_if.c
index c6982ef..1676d29 100644
--- a/usr/src/uts/common/inet/ip/ip6_if.c
+++ b/usr/src/uts/common/inet/ip/ip6_if.c
@@ -405,6 +405,7 @@
tsol_gc_t *gc = NULL;
tsol_gcgrp_t *gcgrp = NULL;
boolean_t gcgrp_xtraref = B_FALSE;
+ boolean_t unbound = B_FALSE;
if (ire_arg != NULL)
*ire_arg = NULL;
@@ -724,6 +725,11 @@
ipif_refrele(ipif);
return (ENETUNREACH);
}
+ if (ill == NULL && !(flags & RTF_INDIRECT)) {
+ unbound = B_TRUE;
+ if (ipst->ips_ipv6_strict_src_multihoming > 0)
+ ill = gw_ire->ire_ill;
+ }
/*
* We create one of three types of IREs as a result of this request
@@ -819,6 +825,8 @@
if ((flags & RTF_SETSRC) && !IN6_IS_ADDR_UNSPECIFIED(src_addr))
ire->ire_setsrc_addr_v6 = *src_addr;
+ ire->ire_unbound = unbound;
+
/*
* POLICY: should we allow an RTF_HOST with address INADDR_ANY?
* SUN/OS socket stuff does but do we really want to allow ::0 ?
diff --git a/usr/src/uts/common/inet/ip/ip6_input.c b/usr/src/uts/common/inet/ip/ip6_input.c
index 70955fa..d596c31 100644
--- a/usr/src/uts/common/inet/ip/ip6_input.c
+++ b/usr/src/uts/common/inet/ip/ip6_input.c
@@ -1539,6 +1539,8 @@
zoneid_t zoneid;
mblk_t *mp1;
ip6_t *ip6h1;
+ uint_t ira_pktlen = ira->ira_pktlen;
+ uint16_t ira_ip_hdr_length = ira->ira_ip_hdr_length;
/* ire_recv_multicast has switched to the upper ill for IPMP */
ASSERT(!IS_UNDER_IPMP(ill));
@@ -1598,6 +1600,12 @@
}
ip6h1 = (ip6_t *)mp1->b_rptr;
ip_fanout_v6(mp1, ip6h1, ira);
+ /*
+ * IPsec might have modified ira_pktlen and ira_ip_hdr_length
+ * so we restore them for a potential next iteration
+ */
+ ira->ira_pktlen = ira_pktlen;
+ ira->ira_ip_hdr_length = ira_ip_hdr_length;
}
/* Do the main ire */
diff --git a/usr/src/uts/common/inet/ip/ip6_ire.c b/usr/src/uts/common/inet/ip/ip6_ire.c
index b31a110..0b84c0a 100644
--- a/usr/src/uts/common/inet/ip/ip6_ire.c
+++ b/usr/src/uts/common/inet/ip/ip6_ire.c
@@ -690,7 +690,7 @@
ASSERT(addr != NULL);
ASSERT(mask != NULL);
ASSERT((!(match_flags & MATCH_IRE_GW)) || gateway != NULL);
- ASSERT((!(match_flags & MATCH_IRE_ILL)) ||
+ ASSERT((!(match_flags & (MATCH_IRE_ILL|MATCH_IRE_SRC_ILL))) ||
(ill != NULL && ill->ill_isv6));
/*
@@ -771,7 +771,7 @@
}
}
/*
- * For exampe, with
+ * For example, with
* route add 11.0.0.0 gw1 -ifp bge0
* route add 11.0.0.0 gw2 -ifp bge1
* this code would differentiate based on
@@ -799,13 +799,13 @@
}
matchit:
+ ire_ill = ire->ire_ill;
if (match_flags & MATCH_IRE_GW) {
mutex_enter(&ire->ire_lock);
gw_addr_v6 = ire->ire_gateway_addr_v6;
mutex_exit(&ire->ire_lock);
}
if (match_flags & MATCH_IRE_ILL) {
- ire_ill = ire->ire_ill;
/*
* If asked to match an ill, we *must* match
@@ -830,6 +830,17 @@
return (B_FALSE);
}
}
+ if (match_flags & MATCH_IRE_SRC_ILL) {
+ if (ire_ill == NULL)
+ return (B_FALSE);
+ if (!IS_ON_SAME_LAN(ill, ire_ill)) {
+ if (ire_ill->ill_usesrc_ifindex == 0 ||
+ (ire_ill->ill_usesrc_ifindex !=
+ ill->ill_phyint->phyint_ifindex))
+ return (B_FALSE);
+ }
+ }
+
/* No ire_addr_v6 bits set past the mask */
ASSERT(V6_MASK_EQ(ire->ire_addr_v6, ire->ire_mask_v6,
ire->ire_addr_v6));
@@ -910,9 +921,9 @@
/*
* ire_match_args_v6() will dereference ill if MATCH_IRE_ILL
- * is set.
+ * or MATCH_IRE_SRC_ILL is set.
*/
- if ((flags & (MATCH_IRE_ILL)) && (ill == NULL))
+ if ((flags & (MATCH_IRE_ILL|MATCH_IRE_SRC_ILL)) && (ill == NULL))
return (NULL);
rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER);
@@ -1113,12 +1124,13 @@
}
ire_t *
-ip_select_route_v6(const in6_addr_t *dst, ip_xmit_attr_t *ixa,
- uint_t *generationp, in6_addr_t *setsrcp, int *errorp, boolean_t *multirtp)
+ip_select_route_v6(const in6_addr_t *dst, const in6_addr_t src,
+ ip_xmit_attr_t *ixa, uint_t *generationp, in6_addr_t *setsrcp,
+ int *errorp, boolean_t *multirtp)
{
ASSERT(!(ixa->ixa_flags & IXAF_IS_IPV4));
- return (ip_select_route(dst, ixa, generationp, setsrcp, errorp,
+ return (ip_select_route(dst, src, ixa, generationp, setsrcp, errorp,
multirtp));
}
@@ -1127,8 +1139,6 @@
* the zoneid, ill, and label. Used for the data paths. See also
* ire_route_recursive_dstonly.
*
- * If ill is set this means we will match it by adding MATCH_IRE_ILL.
- *
* If IRR_ALLOCATE is not set then we will only inspect the existing IREs; never
* create an IRE_IF_CLONE. This is used on the receive side when we are not
* forwarding.
@@ -1164,9 +1174,6 @@
if (gwattrp != NULL)
ASSERT(*gwattrp == NULL);
- if (ill_arg != NULL)
- match_args |= MATCH_IRE_ILL;
-
/*
* We iterate up to three times to resolve a route, even though
* we have four slots in the array. The extra slot is for an
@@ -1177,7 +1184,7 @@
/* ire_ftable_lookup handles round-robin/ECMP */
if (ire == NULL) {
ire = ire_ftable_lookup_v6(&v6nexthop, 0, 0, ire_type,
- (ill_arg != NULL ? ill_arg : ill), zoneid, tsl,
+ (ill != NULL ? ill : ill_arg), zoneid, tsl,
match_args, xmit_hint, ipst, &generation);
} else {
/* Caller passed it; extra hold since we will rele */
@@ -1322,6 +1329,10 @@
* recursing. The type match is used by some callers
* to exclude certain types (such as IRE_IF_CLONE or
* IRE_LOCAL|IRE_LOOPBACK).
+ *
+ * In the MATCH_IRE_SRC_ILL case, ill_arg may be the 'srcof'
+ * ire->ire_ill, and we want to find the IRE_INTERFACE for
+ * ire_ill, so we set ill to the ire_ill
*/
match_args &= MATCH_IRE_TYPE;
v6nexthop = ire->ire_gateway_addr_v6;
diff --git a/usr/src/uts/common/inet/ip/ip6_output.c b/usr/src/uts/common/inet/ip/ip6_output.c
index 6fdcfbc..9d28d3f 100644
--- a/usr/src/uts/common/inet/ip/ip6_output.c
+++ b/usr/src/uts/common/inet/ip/ip6_output.c
@@ -150,8 +150,8 @@
repeat_ire:
error = 0;
setsrc = ipv6_all_zeros;
- ire = ip_select_route_v6(&firsthop, ixa, NULL, &setsrc, &error,
- &multirt);
+ ire = ip_select_route_v6(&firsthop, ip6h->ip6_src, ixa, NULL, &setsrc,
+ &error, &multirt);
ASSERT(ire != NULL); /* IRE_NOROUTE if none found */
if (error != 0) {
BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
@@ -1228,10 +1228,13 @@
* starting at ire1.
*/
ire_t *ire2;
+ uint_t match_flags = MATCH_IRE_DSTONLY;
+ if (ire1->ire_ill != NULL)
+ match_flags |= MATCH_IRE_ILL;
ire2 = ire_route_recursive_impl_v6(ire1,
&ire1->ire_addr_v6, ire1->ire_type, ire1->ire_ill,
- ire1->ire_zoneid, NULL, MATCH_IRE_DSTONLY,
+ ire1->ire_zoneid, NULL, match_flags,
IRR_ALLOCATE, 0, ipst, NULL, NULL, NULL);
if (ire2 != NULL)
ire_refrele(ire2);
diff --git a/usr/src/uts/common/inet/ip/ip_ftable.c b/usr/src/uts/common/inet/ip/ip_ftable.c
index 0ee44e2..8a9889a 100644
--- a/usr/src/uts/common/inet/ip/ip_ftable.c
+++ b/usr/src/uts/common/inet/ip/ip_ftable.c
@@ -77,6 +77,10 @@
(((ire)->ire_type & IRE_DEFAULT) || \
(((ire)->ire_type & IRE_INTERFACE) && ((ire)->ire_addr == 0)))
+#define IP_SRC_MULTIHOMING(isv6, ipst) /* per-family src tunable */ \
+ ((isv6) ? (ipst)->ips_ipv6_strict_src_multihoming : \
+ (ipst)->ips_ip_strict_src_multihoming)
+
static ire_t *route_to_dst(const struct sockaddr *, zoneid_t, ip_stack_t *);
static void ire_del_host_redir(ire_t *, char *);
static boolean_t ire_find_best_route(struct radix_node *, void *);
@@ -104,7 +108,7 @@
* ire_match_args() will dereference ill if MATCH_IRE_ILL
* is set.
*/
- if ((flags & MATCH_IRE_ILL) && (ill == NULL))
+ if ((flags & (MATCH_IRE_ILL|MATCH_IRE_SRC_ILL)) && (ill == NULL))
return (NULL);
bzero(&rdst, sizeof (rdst));
@@ -673,7 +677,8 @@
for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) {
if (IRE_IS_CONDEMNED(ire))
continue;
- if (margs->ift_flags & (MATCH_IRE_MASK|MATCH_IRE_SHORTERMASK))
+ ASSERT((margs->ift_flags & MATCH_IRE_SHORTERMASK) == 0);
+ if (margs->ift_flags & MATCH_IRE_MASK)
match_mask = margs->ift_mask;
else
match_mask = ire->ire_mask;
@@ -968,24 +973,112 @@
irb_refrele_ftable(&((rt_t *)(rn))->rt_irb);
}
+
+/*
+ * ip_select_src_ill() is used by ip_select_route() to find the src_ill
+ * to be used for source-aware routing table lookup. This function will
+ * ignore IPIF_UNNUMBERED interface addresses, and will only return a
+ * numbered interface (ipif_lookup_addr_nondup() will ignore UNNUMBERED
+ * interfaces).
+ */
+static ill_t *
+ip_select_src_ill(const in6_addr_t *v6src, zoneid_t zoneid, ip_stack_t *ipst)
+{
+ ipif_t *ipif;
+ ill_t *ill;
+ boolean_t isv6 = !IN6_IS_ADDR_V4MAPPED(v6src); /* v4 comes in mapped */
+ ipaddr_t v4src;
+
+ if (isv6) {
+ ipif = ipif_lookup_addr_nondup_v6(v6src, NULL, zoneid, ipst);
+ } else {
+ IN6_V4MAPPED_TO_IPADDR(v6src, v4src);
+ ipif = ipif_lookup_addr_nondup(v4src, NULL, zoneid, ipst);
+ }
+ if (ipif == NULL)
+ return (NULL); /* the lookup found no ipif for the source */
+ ill = ipif->ipif_ill;
+ ill_refhold(ill); /* the returned ill carries this hold */
+ ipif_refrele(ipif);
+ return (ill);
+}
+
+/*
+ * Verify that v6src is configured on ill; B_FALSE if not or if ill is NULL
+ */
+static boolean_t
+ip_verify_src_on_ill(const in6_addr_t v6src, ill_t *ill, zoneid_t zoneid)
+{
+ ipif_t *ipif;
+ ip_stack_t *ipst;
+ ipaddr_t v4src;
+
+ if (ill == NULL)
+ return (B_FALSE);
+ ipst = ill->ill_ipst;
+
+ if (ill->ill_isv6) {
+ ipif = ipif_lookup_addr_nondup_v6(&v6src, ill, zoneid, ipst);
+ } else {
+ IN6_V4MAPPED_TO_IPADDR(&v6src, v4src); /* v4 ill: unmap source */
+ ipif = ipif_lookup_addr_nondup(v4src, ill, zoneid, ipst);
+ }
+
+ if (ipif != NULL) {
+ ipif_refrele(ipif); /* only the lookup result matters here */
+ return (B_TRUE);
+ } else {
+ return (B_FALSE);
+ }
+}
+
/*
* Select a route for IPv4 and IPv6. Except for multicast, loopback and reject
* routes this routine sets up a ire_nce_cache as well. The caller needs to
* lookup an nce for the multicast case.
+ *
+ * When src_multihoming is set to 2 (strict src multihoming) we use the source
+ * address to select the interface and route. If IP_BOUND_IF etc are
+ * specified, we require that they specify an interface on which the
+ * source address is assigned.
+ *
+ * When src_multihoming is set to 1 (preferred src aware route
+ * selection) the unicast lookup prefers a matching source
+ * (i.e., that the route points out an ill on which the source is assigned), but
+ * if no such route is found we fallback to not considering the source in the
+ * route lookup.
+ *
+ * We skip the src_multihoming check when the source isn't (yet) set, and
+ * when IXAF_VERIFY_SOURCE is not set. The latter allows RAW sockets to send
+ * with bogus source addresses as allowed by IP_HDRINCL and IPV6_PKTINFO
+ * when secpolicy_net_rawaccess().
*/
ire_t *
-ip_select_route(const in6_addr_t *v6dst, ip_xmit_attr_t *ixa,
- uint_t *generationp, in6_addr_t *setsrcp, int *errorp, boolean_t *multirtp)
+ip_select_route(const in6_addr_t *v6dst, const in6_addr_t v6src,
+ ip_xmit_attr_t *ixa, uint_t *generationp, in6_addr_t *setsrcp,
+ int *errorp, boolean_t *multirtp)
{
uint_t match_args;
uint_t ire_type;
- ill_t *ill;
+ ill_t *ill = NULL;
ire_t *ire;
ip_stack_t *ipst = ixa->ixa_ipst;
ipaddr_t v4dst;
in6_addr_t v6nexthop;
iaflags_t ixaflags = ixa->ixa_flags;
nce_t *nce;
+ boolean_t preferred_src_aware = B_FALSE;
+ boolean_t verify_src;
+ boolean_t isv6 = !(ixa->ixa_flags & IXAF_IS_IPV4);
+ int src_multihoming = IP_SRC_MULTIHOMING(isv6, ipst);
+
+ /*
+ * We only verify that the src has been configured on a selected
+ * interface if the src is not :: or INADDR_ANY, and if the
+ * IXAF_VERIFY_SOURCE flag is set.
+ */
+ verify_src = (!V6_OR_V4_INADDR_ANY(v6src) &&
+ (ixa->ixa_flags & IXAF_VERIFY_SOURCE));
match_args = MATCH_IRE_SECATTR;
IN6_V4MAPPED_TO_IPADDR(v6dst, v4dst);
@@ -999,17 +1092,16 @@
* SO_DONTROUTE, IP_BOUND_IF, IP_PKTINFO etc are set
*/
- if ((ixaflags & IXAF_IS_IPV4) ? CLASSD(v4dst) :
- IN6_IS_ADDR_MULTICAST(v6dst)) {
+ if (isv6 ? IN6_IS_ADDR_MULTICAST(v6dst) : CLASSD(v4dst)) {
/* Pick up the IRE_MULTICAST for the ill */
if (ixa->ixa_multicast_ifindex != 0) {
ill = ill_lookup_on_ifindex(ixa->ixa_multicast_ifindex,
- !(ixaflags & IXAF_IS_IPV4), ipst);
+ isv6, ipst);
} else if (ixaflags & IXAF_SCOPEID_SET) {
/* sin6_scope_id takes precedence over ixa_ifindex */
ASSERT(ixa->ixa_scopeid != 0);
ill = ill_lookup_on_ifindex(ixa->ixa_scopeid,
- !(ixaflags & IXAF_IS_IPV4), ipst);
+ isv6, ipst);
} else if (ixa->ixa_ifindex != 0) {
/*
* In the ipmp case, the ixa_ifindex is set to
@@ -1017,17 +1109,32 @@
* ire_multicast() corresponding to that under_ill.
*/
ill = ill_lookup_on_ifindex(ixa->ixa_ifindex,
- !(ixaflags & IXAF_IS_IPV4), ipst);
- } else if (ixaflags & IXAF_IS_IPV4) {
+ isv6, ipst);
+ } else if (src_multihoming != 0 && verify_src) {
+ /* Look up the ill based on the source address */
+ ill = ip_select_src_ill(&v6src, ixa->ixa_zoneid, ipst);
+ /*
+ * Since we looked up the ill from the source there
+ * is no need to verify that the source is on the ill
+ * below.
+ */
+ verify_src = B_FALSE;
+ if (ill != NULL && IS_VNI(ill)) {
+ ill_t *usesrc = ill;
+
+ ill = ill_lookup_usesrc(usesrc);
+ ill_refrele(usesrc);
+ }
+ } else if (!isv6) {
ipaddr_t v4setsrc = INADDR_ANY;
- ill = ill_lookup_group_v4(v4dst, ixa->ixa_zoneid, ipst,
- multirtp, &v4setsrc);
+ ill = ill_lookup_group_v4(v4dst, ixa->ixa_zoneid,
+ ipst, multirtp, &v4setsrc);
if (setsrcp != NULL)
IN6_IPADDR_TO_V4MAPPED(v4setsrc, setsrcp);
} else {
- ill = ill_lookup_group_v6(v6dst, ixa->ixa_zoneid, ipst,
- multirtp, setsrcp);
+ ill = ill_lookup_group_v6(v6dst, ixa->ixa_zoneid,
+ ipst, multirtp, setsrcp);
}
if (ill != NULL && IS_VNI(ill)) {
ill_refrele(ill);
@@ -1037,7 +1144,7 @@
if (errorp != NULL)
*errorp = ENXIO;
/* Get a hold on the IRE_NOROUTE */
- ire = ire_reject(ipst, !(ixaflags & IXAF_IS_IPV4));
+ ire = ire_reject(ipst, isv6);
return (ire);
}
if (!(ill->ill_flags & ILLF_MULTICAST)) {
@@ -1045,7 +1152,21 @@
if (errorp != NULL)
*errorp = EHOSTUNREACH;
/* Get a hold on the IRE_NOROUTE */
- ire = ire_reject(ipst, !(ixaflags & IXAF_IS_IPV4));
+ ire = ire_reject(ipst, isv6);
+ return (ire);
+ }
+ /*
+ * If we are doing the strictest src_multihoming, then
+ * we check that IP_MULTICAST_IF, IP_BOUND_IF, etc specify
+ * an interface that is consistent with the source address.
+ */
+ if (verify_src && src_multihoming == 2 &&
+ !ip_verify_src_on_ill(v6src, ill, ixa->ixa_zoneid)) {
+ if (errorp != NULL)
+ *errorp = EADDRNOTAVAIL;
+ ill_refrele(ill);
+ /* Get a hold on the IRE_NOROUTE */
+ ire = ire_reject(ipst, isv6);
return (ire);
}
/* Get a refcnt on the single IRE_MULTICAST per ill */
@@ -1060,16 +1181,17 @@
return (ire);
}
+ /* Now for unicast */
if (ixa->ixa_ifindex != 0 || (ixaflags & IXAF_SCOPEID_SET)) {
if (ixaflags & IXAF_SCOPEID_SET) {
/* sin6_scope_id takes precedence over ixa_ifindex */
ASSERT(ixa->ixa_scopeid != 0);
ill = ill_lookup_on_ifindex(ixa->ixa_scopeid,
- !(ixaflags & IXAF_IS_IPV4), ipst);
+ isv6, ipst);
} else {
ASSERT(ixa->ixa_ifindex != 0);
ill = ill_lookup_on_ifindex(ixa->ixa_ifindex,
- !(ixaflags & IXAF_IS_IPV4), ipst);
+ isv6, ipst);
}
if (ill != NULL && IS_VNI(ill)) {
ill_refrele(ill);
@@ -1079,9 +1201,12 @@
if (errorp != NULL)
*errorp = ENXIO;
/* Get a hold on the IRE_NOROUTE */
- ire = ire_reject(ipst, !(ixaflags & IXAF_IS_IPV4));
+ ire = ire_reject(ipst, isv6);
return (ire);
}
+
+ match_args |= MATCH_IRE_ILL;
+
/*
* icmp_send_reply_v6 uses scopeid, and mpathd sets IP*_BOUND_IF
* so for both of them we need to be able look for an under
@@ -1089,8 +1214,38 @@
*/
if (IS_UNDER_IPMP(ill))
match_args |= MATCH_IRE_TESTHIDDEN;
- } else {
- ill = NULL;
+
+ /*
+ * If we are doing the strictest src_multihoming and the source
+ * is to be verified (verify_src), check that IP_BOUND_IF,
+ * IP_PKTINFO, etc specify an interface on which it is assigned.
+ */
+ if (verify_src && src_multihoming == 2 &&
+ !ip_verify_src_on_ill(v6src, ill, ixa->ixa_zoneid)) {
+ if (errorp != NULL)
+ *errorp = EADDRNOTAVAIL;
+ ill_refrele(ill);
+ /* Get a hold on the IRE_NOROUTE */
+ ire = ire_reject(ipst, isv6);
+ return (ire);
+ }
+ } else if (src_multihoming != 0 && verify_src) {
+ /* Look up the ill based on the source address */
+ ill = ip_select_src_ill(&v6src, ixa->ixa_zoneid, ipst);
+ if (ill == NULL) {
+ char addrbuf[INET6_ADDRSTRLEN];
+
+ ip3dbg(("%s not a valid src for unicast",
+ inet_ntop(AF_INET6, &v6src, addrbuf,
+ sizeof (addrbuf))));
+ if (errorp != NULL)
+ *errorp = EADDRNOTAVAIL;
+ /* Get a hold on the IRE_NOROUTE */
+ ire = ire_reject(ipst, isv6);
+ return (ire);
+ }
+ match_args |= MATCH_IRE_SRC_ILL;
+ preferred_src_aware = (src_multihoming == 1);
}
if (ixaflags & IXAF_NEXTHOP_SET) {
if (ixaflags & IXAF_NEXTHOP_SET) {
@@ -1101,7 +1256,6 @@
}
ire_type = 0;
- /* If ill is null then ire_route_recursive will set MATCH_IRE_ILL */
/*
* If SO_DONTROUTE is set or if IP_NEXTHOP is set, then
@@ -1112,7 +1266,8 @@
ire_type = IRE_ONLINK;
}
- if (ixaflags & IXAF_IS_IPV4) {
+retry:
+ if (!isv6) {
ipaddr_t v4nexthop;
ipaddr_t v4setsrc = INADDR_ANY;
@@ -1134,12 +1289,24 @@
v4dst, (void *)ire));
}
#endif
-
- if (ill != NULL)
+ if (ill != NULL) {
ill_refrele(ill);
-
+ ill = NULL;
+ }
if ((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
(ire->ire_type & IRE_MULTICAST)) {
+ if (preferred_src_aware) {
+ /*
+ * "Preferred Source Aware" send mode. If we cannot
+ * find an ire whose ire_ill had the desired source
+ * address retry after relaxing the ill matching
+ * constraint.
+ */
+ ire_refrele(ire);
+ preferred_src_aware = B_FALSE;
+ match_args &= ~MATCH_IRE_SRC_ILL;
+ goto retry;
+ }
/* No ire_nce_cache */
return (ire);
}
@@ -1169,34 +1336,36 @@
{
if (ixa->ixa_flags & IXAF_IS_IPV4) {
ipha_t *ipha = (ipha_t *)mp->b_rptr;
- in6_addr_t v6dst;
+ in6_addr_t v6dst, v6src;
IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst);
+ IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6src);
- return (ip_select_route(&v6dst, ixa, generationp,
+ return (ip_select_route(&v6dst, v6src, ixa, generationp,
NULL, errorp, multirtp));
} else {
ip6_t *ip6h = (ip6_t *)mp->b_rptr;
- return (ip_select_route(&ip6h->ip6_dst, ixa, generationp,
- NULL, errorp, multirtp));
+ return (ip_select_route(&ip6h->ip6_dst, ip6h->ip6_src,
+ ixa, generationp, NULL, errorp, multirtp));
}
}
ire_t *
-ip_select_route_v4(ipaddr_t dst, ip_xmit_attr_t *ixa, uint_t *generationp,
- ipaddr_t *v4setsrcp, int *errorp, boolean_t *multirtp)
+ip_select_route_v4(ipaddr_t dst, ipaddr_t src, ip_xmit_attr_t *ixa,
+ uint_t *generationp, ipaddr_t *v4setsrcp, int *errorp, boolean_t *multirtp)
{
- in6_addr_t v6dst;
+ in6_addr_t v6dst, v6src;
ire_t *ire;
in6_addr_t setsrc;
ASSERT(ixa->ixa_flags & IXAF_IS_IPV4);
IN6_IPADDR_TO_V4MAPPED(dst, &v6dst);
+ IN6_IPADDR_TO_V4MAPPED(src, &v6src);
setsrc = ipv6_all_zeros;
- ire = ip_select_route(&v6dst, ixa, generationp, &setsrc, errorp,
+ ire = ip_select_route(&v6dst, v6src, ixa, generationp, &setsrc, errorp,
multirtp);
if (v4setsrcp != NULL)
IN6_V4MAPPED_TO_IPADDR(&setsrc, *v4setsrcp);
@@ -1208,8 +1377,6 @@
* the zoneid, ill, and label. Used for the data paths. See also
* ire_route_recursive.
*
- * If ill is set this means we will match it by adding MATCH_IRE_ILL.
- *
* If IRR_ALLOCATE is not set then we will only inspect the existing IREs; never
* create an IRE_IF_CLONE. This is used on the receive side when we are not
* forwarding.
@@ -1244,9 +1411,6 @@
if (gwattrp != NULL)
ASSERT(*gwattrp == NULL);
- if (ill_arg != NULL)
- match_args |= MATCH_IRE_ILL;
-
/*
* We iterate up to three times to resolve a route, even though
* we have four slots in the array. The extra slot is for an
@@ -1257,7 +1421,7 @@
/* ire_ftable_lookup handles round-robin/ECMP */
if (ire == NULL) {
ire = ire_ftable_lookup_v4(nexthop, 0, 0, ire_type,
- (ill_arg != NULL ? ill_arg : ill), zoneid, tsl,
+ (ill != NULL? ill : ill_arg), zoneid, tsl,
match_args, xmit_hint, ipst, &generation);
} else {
/* Caller passed it; extra hold since we will rele */
@@ -1403,6 +1567,10 @@
* recursing. The type match is used by some callers
* to exclude certain types (such as IRE_IF_CLONE or
* IRE_LOCAL|IRE_LOOPBACK).
+ *
+ * In the MATCH_IRE_SRC_ILL case, ill_arg may be the 'srcof'
+ * ire->ire_ill, and we want to find the IRE_INTERFACE for
+ * ire_ill, so we set ill to the ire_ill
*/
match_args &= MATCH_IRE_TYPE;
nexthop = ire->ire_gateway_addr;
diff --git a/usr/src/uts/common/inet/ip/ip_if.c b/usr/src/uts/common/inet/ip/ip_if.c
index 19b7c85..debf6bb 100644
--- a/usr/src/uts/common/inet/ip/ip_if.c
+++ b/usr/src/uts/common/inet/ip/ip_if.c
@@ -934,17 +934,15 @@
/*
* Cleanup the ioctl mp's queued in ipsq_xopq_pending_mp if any.
* In the case of ioctl from a conn, there can be only 1 mp
- * queued on the ipsq. If an ill is being unplumbed, only messages
- * related to this ill are flushed, like M_ERROR or M_HANGUP message.
- * ioctls meant for this ill form conn's are not flushed. They will
- * be processed during ipsq_exit and will not find the ill and will
- * return error.
+ * queued on the ipsq. If an ill is being unplumbed, flush all
+ * the messages.
*/
mutex_enter(&ipsq->ipsq_lock);
for (prev = NULL, curr = ipsq->ipsq_xopq_mphead; curr != NULL;
curr = next) {
next = curr->b_next;
- if (curr->b_queue == wq || curr->b_queue == rq) {
+ if (connp == NULL ||
+ (curr->b_queue == wq || curr->b_queue == rq)) {
/* Unlink the mblk from the pending mp list */
if (prev != NULL) {
prev->b_next = curr->b_next;
@@ -1201,7 +1199,7 @@
/*
* ire_walk routine used to delete every IRE that depends on
- * 'ill'. (Always called as writer.)
+ * 'ill'. (Always called as writer, and may only be called from ire_walk.)
*
* Note: since the routes added by the kernel are deleted separately,
* this will only be 1) IRE_IF_CLONE and 2) manually added IRE_INTERFACE.
@@ -1223,8 +1221,23 @@
mutex_exit(&ire->ire_lock);
if (nce != NULL)
nce_refrele(nce);
- if (ire->ire_ill == ill)
+ if (ire->ire_ill == ill) {
+ /*
+ * The existing interface binding for ire must be
+ * deleted before trying to bind the route to another
+ * interface. However, since we are using the contents of the
+ * ire after ire_delete, the caller has to ensure that
+ * CONDEMNED (deleted) ire's are not removed from the list
+ * when ire_delete() returns. Currently ill_downi() is
+ * only called as part of ire_walk*() routines, so that
+ * the irb_refhold() done by ire_walk*() will ensure that
+ * ire_delete() does not lead to ire_inactive().
+ */
+ ASSERT(ire->ire_bucket->irb_refcnt > 0);
ire_delete(ire);
+ if (ire->ire_unbound)
+ ire_rebind(ire);
+ }
}
/* Remove IRE_IF_CLONE on this ill */
@@ -5441,6 +5454,7 @@
tsol_gcgrp_t *gcgrp = NULL;
boolean_t gcgrp_xtraref = B_FALSE;
boolean_t cgtp_broadcast;
+ boolean_t unbound = B_FALSE;
ip1dbg(("ip_rt_add:"));
@@ -5765,6 +5779,12 @@
return (ENETUNREACH);
}
+ if (ill == NULL && !(flags & RTF_INDIRECT)) {
+ unbound = B_TRUE;
+ if (ipst->ips_ip_strict_src_multihoming > 0)
+ ill = gw_ire->ire_ill;
+ }
+
/*
* We create one of three types of IREs as a result of this request
* based on the netmask. A netmask of all ones (which is automatically
@@ -5863,6 +5883,8 @@
if ((src_addr != INADDR_ANY) && (flags & RTF_SETSRC))
ire->ire_setsrc_addr = src_addr;
+ ire->ire_unbound = unbound;
+
/*
* POLICY: should we allow an RTF_HOST with address INADDR_ANY?
* SUN/OS socket stuff does but do we really want to allow 0.0.0.0?
@@ -7601,8 +7623,8 @@
}
lifr++;
}
- rw_exit(&ipst->ips_ill_g_usesrc_lock);
rw_exit(&ipst->ips_ill_g_lock);
+ rw_exit(&ipst->ips_ill_g_usesrc_lock);
ipif_refrele(orig_ipif);
mp1->b_wptr = (uchar_t *)lifr;
STRUCT_FSET(lifs, lifs_len, (int)((uchar_t *)lifr - mp1->b_rptr));
@@ -18478,3 +18500,44 @@
nce_flush(ill, B_TRUE);
}
}
+
+/*
+ * Find the first interface that uses usill for its source address.
+ *
+ * Walks the usesrc group list starting after usill and returns the first
+ * member that is not under IPMP, has ILLF_MULTICAST set and is not
+ * condemned.  The returned ill has been ill_refhold()ed and the caller
+ * must release it.  Returns NULL when usill has no usesrc group or when no
+ * member qualifies, so a non-NULL return value always carries a hold.
+ */
+ill_t *
+ill_lookup_usesrc(ill_t *usill)
+{
+	ip_stack_t	*ipst;
+	ill_t		*ill;
+
+	/* Check before the first dereference of usill. */
+	ASSERT(usill != NULL);
+	ipst = usill->ill_ipst;
+
+	/* ill_g_usesrc_lock protects ill_usesrc_grp_next */
+	rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER);
+	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
+	for (ill = usill->ill_usesrc_grp_next; ill != NULL && ill != usill;
+	    ill = ill->ill_usesrc_grp_next) {
+		if (!IS_UNDER_IPMP(ill) && (ill->ill_flags & ILLF_MULTICAST) &&
+		    !ILL_IS_CONDEMNED(ill)) {
+			ill_refhold(ill);
+			break;
+		}
+	}
+	/*
+	 * If the walk wrapped around to usill, nothing matched and no hold
+	 * was taken; do not hand back the unheld usill.
+	 */
+	if (ill == usill)
+		ill = NULL;
+	rw_exit(&ipst->ips_ill_g_lock);
+	rw_exit(&ipst->ips_ill_g_usesrc_lock);
+	return (ill);
+}
diff --git a/usr/src/uts/common/inet/ip/ip_input.c b/usr/src/uts/common/inet/ip/ip_input.c
index e7cd69b..a54b2e8 100644
--- a/usr/src/uts/common/inet/ip/ip_input.c
+++ b/usr/src/uts/common/inet/ip/ip_input.c
@@ -1841,6 +1841,8 @@
ire_t *ire1;
mblk_t *mp1;
ipha_t *ipha1;
+ uint_t ira_pktlen = ira->ira_pktlen;
+ uint16_t ira_ip_hdr_length = ira->ira_ip_hdr_length;
irb = ire->ire_bucket;
@@ -1883,6 +1885,12 @@
ira->ira_zoneid = ire1->ire_zoneid;
ipha1 = (ipha_t *)mp1->b_rptr;
ip_fanout_v4(mp1, ipha1, ira);
+ /*
+ * IPsec might have modified ira_pktlen and ira_ip_hdr_length
+ * so we restore them for a potential next iteration
+ */
+ ira->ira_pktlen = ira_pktlen;
+ ira->ira_ip_hdr_length = ira_ip_hdr_length;
}
irb_refrele(irb);
/* Do the main ire */
@@ -1913,6 +1921,8 @@
zoneid_t zoneid;
mblk_t *mp1;
ipha_t *ipha1;
+ uint_t ira_pktlen = ira->ira_pktlen;
+ uint16_t ira_ip_hdr_length = ira->ira_ip_hdr_length;
/* ire_recv_multicast has switched to the upper ill for IPMP */
ASSERT(!IS_UNDER_IPMP(ill));
@@ -1972,6 +1982,12 @@
}
ipha1 = (ipha_t *)mp1->b_rptr;
ip_fanout_v4(mp1, ipha1, ira);
+ /*
+ * IPsec might have modified ira_pktlen and ira_ip_hdr_length
+ * so we restore them for a potential next iteration
+ */
+ ira->ira_pktlen = ira_pktlen;
+ ira->ira_ip_hdr_length = ira_ip_hdr_length;
}
/* Do the main ire */
diff --git a/usr/src/uts/common/inet/ip/ip_ire.c b/usr/src/uts/common/inet/ip/ip_ire.c
index 8701687..8e42353 100644
--- a/usr/src/uts/common/inet/ip/ip_ire.c
+++ b/usr/src/uts/common/inet/ip/ip_ire.c
@@ -1856,7 +1856,7 @@
ASSERT(ire->ire_ipversion == IPV4_VERSION);
ASSERT((ire->ire_addr & ~ire->ire_mask) == 0);
- ASSERT((!(match_flags & MATCH_IRE_ILL)) ||
+ ASSERT((!(match_flags & (MATCH_IRE_ILL|MATCH_IRE_SRC_ILL))) ||
(ill != NULL && !ill->ill_isv6));
/*
@@ -1936,7 +1936,7 @@
}
}
/*
- * For exampe, with
+ * For example, with
* route add 11.0.0.0 gw1 -ifp bge0
* route add 11.0.0.0 gw2 -ifp bge1
* this code would differentiate based on
@@ -1965,8 +1965,8 @@
}
matchit:
+ ire_ill = ire->ire_ill;
if (match_flags & MATCH_IRE_ILL) {
- ire_ill = ire->ire_ill;
/*
* If asked to match an ill, we *must* match
@@ -1991,6 +1991,16 @@
return (B_FALSE);
}
}
+ if (match_flags & MATCH_IRE_SRC_ILL) {
+ if (ire_ill == NULL)
+ return (B_FALSE);
+ if (!IS_ON_SAME_LAN(ill, ire_ill)) {
+ if (ire_ill->ill_usesrc_ifindex == 0 ||
+ (ire_ill->ill_usesrc_ifindex !=
+ ill->ill_phyint->phyint_ifindex))
+ return (B_FALSE);
+ }
+ }
if ((ire->ire_addr == (addr & mask)) &&
((!(match_flags & MATCH_IRE_GW)) ||
@@ -3563,3 +3573,60 @@
return (5);
return (-1); /* unknown ire_type */
}
+
+/*
+ * Re-bind an unbound route (ire_unbound set) whose interface went down.
+ * In the preferred/strict src multihoming modes such routes are bound to the
+ * first interface that has an interface route for the ire_gateway; when that
+ * interface is brought down, ill_downi() calls ire_rebind() so that a copy of
+ * the route can be added on another matching interface, preserving the
+ * reachability information of the original unbound route.  Nothing is added
+ * if no IRE_INTERFACE for the gateway is found or ire_create[_v6]() fails.
+ */
+void
+ire_rebind(ire_t *ire)
+{
+ ire_t *gw_ire, *new_ire;
+ int match_flags = MATCH_IRE_TYPE;
+ ill_t *gw_ill;
+ boolean_t isv6 = (ire->ire_ipversion == IPV6_VERSION);
+ ip_stack_t *ipst = ire->ire_ipst;
+
+ ASSERT(ire->ire_unbound);
+again:
+ if (isv6) {
+ gw_ire = ire_ftable_lookup_v6(&ire->ire_gateway_addr_v6, 0, 0,
+ IRE_INTERFACE, NULL, ALL_ZONES, NULL, match_flags, 0,
+ ipst, NULL);
+ } else {
+ gw_ire = ire_ftable_lookup_v4(ire->ire_gateway_addr, 0, 0,
+ IRE_INTERFACE, NULL, ALL_ZONES, NULL, match_flags, 0,
+ ipst, NULL);
+ }
+ if (gw_ire == NULL) {
+ /* see comments in ip_rt_add[_v6]() for IPMP; retry at most once */
+ if (match_flags & MATCH_IRE_TESTHIDDEN)
+ return; /* second lookup (with TESTHIDDEN) also failed */
+
+ match_flags |= MATCH_IRE_TESTHIDDEN;
+ goto again;
+ }
+ gw_ill = gw_ire->ire_ill; /* ill the copy will be created on */
+ if (isv6) {
+ new_ire = ire_create_v6(&ire->ire_addr_v6, &ire->ire_mask_v6,
+ &ire->ire_gateway_addr_v6, ire->ire_type, gw_ill,
+ ire->ire_zoneid, ire->ire_flags, NULL, ipst);
+ } else {
+ new_ire = ire_create((uchar_t *)&ire->ire_addr,
+ (uchar_t *)&ire->ire_mask,
+ (uchar_t *)&ire->ire_gateway_addr, ire->ire_type, gw_ill,
+ ire->ire_zoneid, ire->ire_flags, NULL, ipst);
+ }
+ ire_refrele(gw_ire); /* gw_ire and gw_ill are not used below */
+ if (new_ire == NULL)
+ return;
+ new_ire->ire_unbound = B_TRUE; /* the copy is itself an unbound route */
+ new_ire = ire_add(new_ire); /* result may be NULL, checked below */
+ if (new_ire != NULL)
+ ire_refrele(new_ire);
+}
diff --git a/usr/src/uts/common/inet/ip/ip_mroute.c b/usr/src/uts/common/inet/ip/ip_mroute.c
index 53df3be..0ad211d 100644
--- a/usr/src/uts/common/inet/ip/ip_mroute.c
+++ b/usr/src/uts/common/inet/ip/ip_mroute.c
@@ -3113,7 +3113,8 @@
(ptrdiff_t)(vifp - ipst->ips_vifs));
}
bzero(&ixas, sizeof (ixas));
- ixas.ixa_flags = IXAF_IS_IPV4 | IXAF_NO_TTL_CHANGE;
+ ixas.ixa_flags =
+ IXAF_IS_IPV4 | IXAF_NO_TTL_CHANGE | IXAF_VERIFY_SOURCE;
ixas.ixa_ipst = ipst;
ixas.ixa_ifindex = 0;
ixas.ixa_cred = kcred;
diff --git a/usr/src/uts/common/inet/ip/ip_ndp.c b/usr/src/uts/common/inet/ip/ip_ndp.c
index fa630fd..d2ab0b2 100644
--- a/usr/src/uts/common/inet/ip/ip_ndp.c
+++ b/usr/src/uts/common/inet/ip/ip_ndp.c
@@ -165,8 +165,8 @@
* the probe is sent on the ncec_ill (in the non-IPMP case) or the
* IPMP cast_ill (in the IPMP case).
*
- * Note that the probe interval is based on ncec->ncec_ill which
- * may be the ipmp_ill.
+ * Note that the probe interval is based on the src_ill for IPv6, and
+ * the ncec_xmit_interval for IPv4.
*/
static void
nce_dad(ncec_t *ncec, ill_t *src_ill, boolean_t send_probe)
@@ -180,7 +180,7 @@
dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT,
ncec->ncec_lladdr, ncec->ncec_lladdr_length,
&ipv6_all_zeros, &ncec->ncec_addr, NDP_PROBE);
- probe_interval = ILL_PROBE_INTERVAL(ncec->ncec_ill);
+ probe_interval = ILL_PROBE_INTERVAL(src_ill);
} else {
/* IPv4 DAD delay the initial probe. */
if (send_probe)
@@ -4464,8 +4464,17 @@
*/
ASSERT(IN6_IS_ADDR_V4MAPPED(addr));
IN6_V4MAPPED_TO_IPADDR(addr, addr4);
- if (ill->ill_note_link && !IS_IPV4_LL_SPACE(&addr4))
+ if (ill->ill_note_link && !IS_IPV4_LL_SPACE(&addr4)) {
fastprobe = B_TRUE;
+ } else if (IS_IPMP(ill) && NCE_PUBLISH(ncec) &&
+ !IS_IPV4_LL_SPACE(&addr4)) {
+ ill_t *hwaddr_ill;
+
+ hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp, hw_addr,
+ hw_addr_len);
+ if (hwaddr_ill != NULL && hwaddr_ill->ill_note_link)
+ fastprobe = B_TRUE;
+ }
if (fastprobe) {
ncec->ncec_xmit_interval =
ipst->ips_arp_fastprobe_interval;
diff --git a/usr/src/uts/common/inet/ip/ip_output.c b/usr/src/uts/common/inet/ip/ip_output.c
index 59d95a4..77d24e2 100644
--- a/usr/src/uts/common/inet/ip/ip_output.c
+++ b/usr/src/uts/common/inet/ip/ip_output.c
@@ -847,8 +847,8 @@
repeat_ire:
error = 0;
setsrc = INADDR_ANY;
- ire = ip_select_route_v4(firsthop, ixa, NULL, &setsrc, &error,
- &multirt);
+ ire = ip_select_route_v4(firsthop, ipha->ipha_src, ixa, NULL,
+ &setsrc, &error, &multirt);
ASSERT(ire != NULL); /* IRE_NOROUTE if none found */
if (error != 0) {
BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
@@ -2295,10 +2295,13 @@
* starting at ire1.
*/
ire_t *ire2;
+ uint_t match_flags = MATCH_IRE_DSTONLY;
+ if (ire1->ire_ill != NULL)
+ match_flags |= MATCH_IRE_ILL;
ire2 = ire_route_recursive_impl_v4(ire1,
ire1->ire_addr, ire1->ire_type, ire1->ire_ill,
- ire1->ire_zoneid, NULL, MATCH_IRE_DSTONLY,
+ ire1->ire_zoneid, NULL, match_flags,
IRR_ALLOCATE, 0, ipst, NULL, NULL, NULL);
if (ire2 != NULL)
ire_refrele(ire2);
diff --git a/usr/src/uts/common/inet/ip/ipsecesp.c b/usr/src/uts/common/inet/ip/ipsecesp.c
index 539eaef..47972a8 100644
--- a/usr/src/uts/common/inet/ip/ipsecesp.c
+++ b/usr/src/uts/common/inet/ip/ipsecesp.c
@@ -2320,7 +2320,7 @@
ixas.ixa_tsl = NULL;
ixas.ixa_ipst = ns->netstack_ip;
/* No ULP checksum; done by esp_prepare_udp */
- ixas.ixa_flags = IXAF_IS_IPV4 | IXAF_NO_IPSEC;
+ ixas.ixa_flags = (IXAF_IS_IPV4 | IXAF_NO_IPSEC | IXAF_VERIFY_SOURCE);
(void) ip_output_simple(mp, &ixas);
ixa_cleanup(&ixas);
diff --git a/usr/src/uts/common/inet/ip_if.h b/usr/src/uts/common/inet/ip_if.h
index e47bad5..1ded817 100644
--- a/usr/src/uts/common/inet/ip_if.h
+++ b/usr/src/uts/common/inet/ip_if.h
@@ -291,6 +291,7 @@
extern int ipif_arp_down(ipif_t *ipif);
extern void ipif_mask_reply(ipif_t *);
extern int ipif_up(ipif_t *, queue_t *, mblk_t *);
+extern ill_t *ill_lookup_usesrc(ill_t *);
extern void ipsq_current_start(ipsq_t *, ipif_t *, int);
extern void ipsq_current_finish(ipsq_t *);
diff --git a/usr/src/uts/common/inet/ip_ire.h b/usr/src/uts/common/inet/ip_ire.h
index b187f4e..6fd304a 100644
--- a/usr/src/uts/common/inet/ip_ire.h
+++ b/usr/src/uts/common/inet/ip_ire.h
@@ -93,6 +93,7 @@
/* zones or shared IREs */
#define MATCH_IRE_SECATTR 0x0040 /* Match gateway security attributes */
#define MATCH_IRE_TESTHIDDEN 0x0080 /* Match ire_testhidden IREs */
+#define MATCH_IRE_SRC_ILL 0x0100 /* ire_ill uses a src address on ill */
#define MAX_IRE_RECURSION 4 /* Max IREs in ire_route_recursive */
@@ -321,12 +322,12 @@
extern ire_t *ip_select_route_pkt(mblk_t *, ip_xmit_attr_t *,
uint_t *, int *, boolean_t *);
-extern ire_t *ip_select_route(const in6_addr_t *, ip_xmit_attr_t *,
- uint_t *, in6_addr_t *, int *, boolean_t *);
-extern ire_t *ip_select_route_v4(ipaddr_t, ip_xmit_attr_t *,
+extern ire_t *ip_select_route(const in6_addr_t *, const in6_addr_t,
+ ip_xmit_attr_t *, uint_t *, in6_addr_t *, int *, boolean_t *);
+extern ire_t *ip_select_route_v4(ipaddr_t, ipaddr_t, ip_xmit_attr_t *,
uint_t *, ipaddr_t *, int *, boolean_t *);
-extern ire_t *ip_select_route_v6(const in6_addr_t *, ip_xmit_attr_t *,
- uint_t *, in6_addr_t *, int *, boolean_t *);
+extern ire_t *ip_select_route_v6(const in6_addr_t *, const in6_addr_t,
+ ip_xmit_attr_t *, uint_t *, in6_addr_t *, int *, boolean_t *);
extern void ire_walk(pfv_t, void *, ip_stack_t *);
extern void ire_walk_ill(uint_t, uint_t, pfv_t, void *, ill_t *);
@@ -348,6 +349,7 @@
zoneid_t, ip_stack_t *);
extern void ire_increment_generation(ire_t *);
extern void ire_increment_multicast_generation(ip_stack_t *, boolean_t);
+extern void ire_rebind(ire_t *);
#endif /* _KERNEL */