| /* |
| * CDDL HEADER START |
| * |
| * The contents of this file are subject to the terms of the |
| * Common Development and Distribution License (the "License"). |
| * You may not use this file except in compliance with the License. |
| * |
| * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE |
| * or http://www.opensolaris.org/os/licensing. |
| * See the License for the specific language governing permissions |
| * and limitations under the License. |
| * |
| * When distributing Covered Code, include this CDDL HEADER in each |
| * file and include the License file at usr/src/OPENSOLARIS.LICENSE. |
| * If applicable, add the following below this CDDL HEADER, with the |
| * fields enclosed by brackets "[]" replaced with your own identifying |
| * information: Portions Copyright [yyyy] [name of copyright owner] |
| * |
| * CDDL HEADER END |
| */ |
| /* |
| * Copyright 2010 Sun Microsystems, Inc. All rights reserved. |
| * Use is subject to license terms. |
| */ |
| |
| /* |
| * This file contains consumer routines of the IPv4 forwarding engine |
| */ |
| |
| #include <sys/types.h> |
| #include <sys/stream.h> |
| #include <sys/stropts.h> |
| #include <sys/strlog.h> |
| #include <sys/dlpi.h> |
| #include <sys/ddi.h> |
| #include <sys/cmn_err.h> |
| #include <sys/policy.h> |
| |
| #include <sys/systm.h> |
| #include <sys/strsun.h> |
| #include <sys/kmem.h> |
| #include <sys/param.h> |
| #include <sys/socket.h> |
| #include <sys/strsubr.h> |
| #include <net/if.h> |
| #include <net/route.h> |
| #include <netinet/in.h> |
| #include <net/if_dl.h> |
| #include <netinet/ip6.h> |
| #include <netinet/icmp6.h> |
| |
| #include <inet/ipsec_impl.h> |
| #include <inet/common.h> |
| #include <inet/mi.h> |
| #include <inet/mib2.h> |
| #include <inet/ip.h> |
| #include <inet/ip_impl.h> |
| #include <inet/ip6.h> |
| #include <inet/ip_ndp.h> |
| #include <inet/arp.h> |
| #include <inet/ip_if.h> |
| #include <inet/ip_ire.h> |
| #include <inet/ip_ftable.h> |
| #include <inet/ip_rts.h> |
| #include <inet/nd.h> |
| |
| #include <net/pfkeyv2.h> |
| #include <inet/sadb.h> |
| #include <inet/tcp.h> |
| #include <inet/ipclassifier.h> |
| #include <sys/zone.h> |
| #include <net/radix.h> |
| #include <sys/tsol/label.h> |
| #include <sys/tsol/tnet.h> |
| |
/*
 * True when the ire is a default route: either an IRE_DEFAULT, or an
 * IRE_INTERFACE whose destination address is 0.
 */
#define	IS_DEFAULT_ROUTE(ire)	\
	(((ire)->ire_type & IRE_DEFAULT) || \
	(((ire)->ire_type & IRE_INTERFACE) && ((ire)->ire_addr == 0)))

/*
 * Pick the strict source multihoming setting for the address family:
 * the IPv6 one when isv6 is non-zero, otherwise the IPv4 one.
 * Both arguments are parenthesized so that arbitrary expressions
 * (e.g. "&stack" or "a ? b : c") expand with the intended precedence.
 */
#define	IP_SRC_MULTIHOMING(isv6, ipst)	\
	((isv6) ? (ipst)->ips_ipv6_strict_src_multihoming : \
	(ipst)->ips_ip_strict_src_multihoming)
| |
| static ire_t *route_to_dst(const struct sockaddr *, zoneid_t, ip_stack_t *); |
| static void ire_del_host_redir(ire_t *, char *); |
| static boolean_t ire_find_best_route(struct radix_node *, void *); |
| |
| /* |
| * Lookup a route in forwarding table. A specific lookup is indicated by |
| * passing the required parameters and indicating the match required in the |
| * flag field. |
| * |
| * Supports IP_BOUND_IF by following the ipif/ill when recursing. |
| */ |
| ire_t * |
| ire_ftable_lookup_v4(ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway, |
| int type, const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl, |
| int flags, uint32_t xmit_hint, ip_stack_t *ipst, uint_t *generationp) |
| { |
| ire_t *ire; |
| struct rt_sockaddr rdst, rmask; |
| struct rt_entry *rt; |
| ire_ftable_args_t margs; |
| |
| ASSERT(ill == NULL || !ill->ill_isv6); |
| |
| /* |
| * ire_match_args() will dereference ill if MATCH_IRE_ILL |
| * is set. |
| */ |
| if ((flags & (MATCH_IRE_ILL|MATCH_IRE_SRC_ILL)) && (ill == NULL)) |
| return (NULL); |
| |
| bzero(&rdst, sizeof (rdst)); |
| rdst.rt_sin_len = sizeof (rdst); |
| rdst.rt_sin_family = AF_INET; |
| rdst.rt_sin_addr.s_addr = addr; |
| |
| bzero(&rmask, sizeof (rmask)); |
| rmask.rt_sin_len = sizeof (rmask); |
| rmask.rt_sin_family = AF_INET; |
| rmask.rt_sin_addr.s_addr = mask; |
| |
| bzero(&margs, sizeof (margs)); |
| margs.ift_addr = addr; |
| margs.ift_mask = mask; |
| margs.ift_gateway = gateway; |
| margs.ift_type = type; |
| margs.ift_ill = ill; |
| margs.ift_zoneid = zoneid; |
| margs.ift_tsl = tsl; |
| margs.ift_flags = flags; |
| |
| /* |
| * The flags argument passed to ire_ftable_lookup may cause the |
| * search to return, not the longest matching prefix, but the |
| * "best matching prefix", i.e., the longest prefix that also |
| * satisfies constraints imposed via the permutation of flags |
| * passed in. To achieve this, we invoke ire_match_args() on |
| * each matching leaf in the radix tree. ire_match_args is |
| * invoked by the callback function ire_find_best_route() |
| * We hold the global tree lock in read mode when calling |
| * rn_match_args. Before dropping the global tree lock, ensure |
| * that the radix node can't be deleted by incrementing ire_refcnt. |
| */ |
| RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); |
| rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst, |
| ipst->ips_ip_ftable, ire_find_best_route, &margs); |
| ire = margs.ift_best_ire; |
| if (rt == NULL) { |
| RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); |
| return (NULL); |
| } |
| ASSERT(ire != NULL); |
| |
| DTRACE_PROBE2(ire__found, ire_ftable_args_t *, &margs, ire_t *, ire); |
| |
| /* |
| * round-robin only if we have more than one route in the bucket. |
| * ips_ip_ecmp_behavior controls when we do ECMP |
| * 2: always |
| * 1: for IRE_DEFAULT and /0 IRE_INTERFACE |
| * 0: never |
| */ |
| if (ire->ire_bucket->irb_ire_cnt > 1 && !(flags & MATCH_IRE_GW)) { |
| if (ipst->ips_ip_ecmp_behavior == 2 || |
| (ipst->ips_ip_ecmp_behavior == 1 && |
| IS_DEFAULT_ROUTE(ire))) { |
| ire_t *next_ire; |
| |
| margs.ift_best_ire = NULL; |
| next_ire = ire_round_robin(ire->ire_bucket, &margs, |
| xmit_hint, ire, ipst); |
| if (next_ire == NULL) { |
| /* keep ire if next_ire is null */ |
| goto done; |
| } |
| ire_refrele(ire); |
| ire = next_ire; |
| } |
| } |
| |
| done: |
| /* Return generation before dropping lock */ |
| if (generationp != NULL) |
| *generationp = ire->ire_generation; |
| |
| RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); |
| |
| /* |
| * For shared-IP zones we need additional checks to what was |
| * done in ire_match_args to make sure IRE_LOCALs are handled. |
| * |
| * When ip_restrict_interzone_loopback is set, then |
| * we ensure that IRE_LOCAL are only used for loopback |
| * between zones when the logical "Ethernet" would |
| * have looped them back. That is, if in the absense of |
| * the IRE_LOCAL we would have sent to packet out the |
| * same ill. |
| */ |
| if ((ire->ire_type & IRE_LOCAL) && zoneid != ALL_ZONES && |
| ire->ire_zoneid != zoneid && ire->ire_zoneid != ALL_ZONES && |
| ipst->ips_ip_restrict_interzone_loopback) { |
| ire = ire_alt_local(ire, zoneid, tsl, ill, generationp); |
| ASSERT(ire != NULL); |
| } |
| return (ire); |
| } |
| |
| /* |
| * This function is called by |
| * ip_input/ire_route_recursive when doing a route lookup on only the |
| * destination address. |
| * |
| * The optimizations of this function over ire_ftable_lookup are: |
| * o removing unnecessary flag matching |
| * o doing longest prefix match instead of overloading it further |
| * with the unnecessary "best_prefix_match" |
| * |
| * If no route is found we return IRE_NOROUTE. |
| */ |
| ire_t * |
| ire_ftable_lookup_simple_v4(ipaddr_t addr, uint32_t xmit_hint, ip_stack_t *ipst, |
| uint_t *generationp) |
| { |
| ire_t *ire; |
| struct rt_sockaddr rdst; |
| struct rt_entry *rt; |
| irb_t *irb; |
| |
| rdst.rt_sin_len = sizeof (rdst); |
| rdst.rt_sin_family = AF_INET; |
| rdst.rt_sin_addr.s_addr = addr; |
| |
| /* |
| * This is basically inlining a simpler version of ire_match_args |
| */ |
| RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); |
| |
| rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst, |
| ipst->ips_ip_ftable, NULL, NULL); |
| |
| if (rt == NULL) |
| goto bad; |
| |
| irb = &rt->rt_irb; |
| if (irb->irb_ire_cnt == 0) |
| goto bad; |
| |
| rw_enter(&irb->irb_lock, RW_READER); |
| ire = irb->irb_ire; |
| if (ire == NULL) { |
| rw_exit(&irb->irb_lock); |
| goto bad; |
| } |
| while (IRE_IS_CONDEMNED(ire)) { |
| ire = ire->ire_next; |
| if (ire == NULL) { |
| rw_exit(&irb->irb_lock); |
| goto bad; |
| } |
| } |
| |
| /* we have a ire that matches */ |
| ire_refhold(ire); |
| rw_exit(&irb->irb_lock); |
| |
| /* |
| * round-robin only if we have more than one route in the bucket. |
| * ips_ip_ecmp_behavior controls when we do ECMP |
| * 2: always |
| * 1: for IRE_DEFAULT and /0 IRE_INTERFACE |
| * 0: never |
| * |
| * Note: if we found an IRE_IF_CLONE we won't look at the bucket with |
| * other ECMP IRE_INTERFACEs since the IRE_IF_CLONE is a /128 match |
| * and the IRE_INTERFACESs are likely to be shorter matches. |
| */ |
| if (ire->ire_bucket->irb_ire_cnt > 1) { |
| if (ipst->ips_ip_ecmp_behavior == 2 || |
| (ipst->ips_ip_ecmp_behavior == 1 && |
| IS_DEFAULT_ROUTE(ire))) { |
| ire_t *next_ire; |
| ire_ftable_args_t margs; |
| |
| bzero(&margs, sizeof (margs)); |
| margs.ift_addr = addr; |
| margs.ift_zoneid = ALL_ZONES; |
| |
| next_ire = ire_round_robin(ire->ire_bucket, &margs, |
| xmit_hint, ire, ipst); |
| if (next_ire == NULL) { |
| /* keep ire if next_ire is null */ |
| if (generationp != NULL) |
| *generationp = ire->ire_generation; |
| RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); |
| return (ire); |
| } |
| ire_refrele(ire); |
| ire = next_ire; |
| } |
| } |
| /* Return generation before dropping lock */ |
| if (generationp != NULL) |
| *generationp = ire->ire_generation; |
| |
| RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); |
| |
| /* |
| * Since we only did ALL_ZONES matches there is no special handling |
| * of IRE_LOCALs needed here. ire_ftable_lookup_v4 has to handle that. |
| */ |
| return (ire); |
| |
| bad: |
| if (generationp != NULL) |
| *generationp = IRE_GENERATION_VERIFY; |
| |
| RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); |
| return (ire_reject(ipst, B_FALSE)); |
| } |
| |
| /* |
| * Find the ill matching a multicast group. |
| * Allows different routes for multicast addresses |
| * in the unicast routing table (akin to 224.0.0.0 but could be more specific) |
| * which point at different interfaces. This is used when IP_MULTICAST_IF |
| * isn't specified (when sending) and when IP_ADD_MEMBERSHIP doesn't |
| * specify the interface to join on. |
| * |
| * Supports link-local addresses by using ire_route_recursive which follows |
| * the ill when recursing. |
| * |
| * To handle CGTP, since we don't have a separate IRE_MULTICAST for each group |
| * and the MULTIRT property can be different for different groups, we |
| * extract RTF_MULTIRT from the special unicast route added for a group |
| * with CGTP and pass that back in the multirtp argument. |
| * This is used in ip_set_destination etc to set ixa_postfragfn for multicast. |
| * We have a setsrcp argument for the same reason. |
| */ |
| ill_t * |
| ire_lookup_multi_ill_v4(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst, |
| boolean_t *multirtp, ipaddr_t *setsrcp) |
| { |
| ire_t *ire; |
| ill_t *ill; |
| |
| ire = ire_route_recursive_v4(group, 0, NULL, zoneid, NULL, |
| MATCH_IRE_DSTONLY, IRR_NONE, 0, ipst, setsrcp, NULL, NULL); |
| ASSERT(ire != NULL); |
| if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { |
| ire_refrele(ire); |
| return (NULL); |
| } |
| |
| if (multirtp != NULL) |
| *multirtp = (ire->ire_flags & RTF_MULTIRT) != 0; |
| |
| ill = ire_nexthop_ill(ire); |
| ire_refrele(ire); |
| return (ill); |
| } |
| |
| /* |
| * Delete the passed in ire if the gateway addr matches |
| */ |
| void |
| ire_del_host_redir(ire_t *ire, char *gateway) |
| { |
| if ((ire->ire_flags & RTF_DYNAMIC) && |
| (ire->ire_gateway_addr == *(ipaddr_t *)gateway)) |
| ire_delete(ire); |
| } |
| |
| /* |
| * Search for all IRE_HOST RTF_DYNAMIC (aka redirect) routes that are |
| * pointing at the specified gateway and |
| * delete them. This routine is called only |
| * when a default gateway is going away. |
| */ |
| void |
| ire_delete_host_redirects(ipaddr_t gateway, ip_stack_t *ipst) |
| { |
| struct rtfuncarg rtfarg; |
| |
| bzero(&rtfarg, sizeof (rtfarg)); |
| rtfarg.rt_func = ire_del_host_redir; |
| rtfarg.rt_arg = (void *)&gateway; |
| rtfarg.rt_zoneid = ALL_ZONES; |
| rtfarg.rt_ipst = ipst; |
| (void) ipst->ips_ip_ftable->rnh_walktree_mt(ipst->ips_ip_ftable, |
| rtfunc, &rtfarg, irb_refhold_rn, irb_refrele_rn); |
| } |
| |
| /* |
| * Obtain the rt_entry and rt_irb for the route to be added to |
| * the ips_ip_ftable. |
| * First attempt to add a node to the radix tree via rn_addroute. If the |
| * route already exists, return the bucket for the existing route. |
| * |
| * Locking notes: Need to hold the global radix tree lock in write mode to |
| * add a radix node. To prevent the node from being deleted, ire_get_bucket() |
| * returns with a ref'ed irb_t. The ire itself is added in ire_add_v4() |
| * while holding the irb_lock, but not the radix tree lock. |
| */ |
| irb_t * |
| ire_get_bucket(ire_t *ire) |
| { |
| struct radix_node *rn; |
| struct rt_entry *rt; |
| struct rt_sockaddr rmask, rdst; |
| irb_t *irb = NULL; |
| ip_stack_t *ipst = ire->ire_ipst; |
| |
| ASSERT(ipst->ips_ip_ftable != NULL); |
| |
| /* first try to see if route exists (based on rtalloc1) */ |
| bzero(&rdst, sizeof (rdst)); |
| rdst.rt_sin_len = sizeof (rdst); |
| rdst.rt_sin_family = AF_INET; |
| rdst.rt_sin_addr.s_addr = ire->ire_addr; |
| |
| bzero(&rmask, sizeof (rmask)); |
| rmask.rt_sin_len = sizeof (rmask); |
| rmask.rt_sin_family = AF_INET; |
| rmask.rt_sin_addr.s_addr = ire->ire_mask; |
| |
| /* |
| * add the route. based on BSD's rtrequest1(RTM_ADD) |
| */ |
| R_Malloc(rt, rt_entry_cache, sizeof (*rt)); |
| /* kmem_alloc failed */ |
| if (rt == NULL) |
| return (NULL); |
| |
| bzero(rt, sizeof (*rt)); |
| rt->rt_nodes->rn_key = (char *)&rt->rt_dst; |
| rt->rt_dst = rdst; |
| irb = &rt->rt_irb; |
| irb->irb_marks |= IRB_MARK_DYNAMIC; /* dynamically allocated/freed */ |
| irb->irb_ipst = ipst; |
| rw_init(&irb->irb_lock, NULL, RW_DEFAULT, NULL); |
| RADIX_NODE_HEAD_WLOCK(ipst->ips_ip_ftable); |
| rn = ipst->ips_ip_ftable->rnh_addaddr(&rt->rt_dst, &rmask, |
| ipst->ips_ip_ftable, (struct radix_node *)rt); |
| if (rn == NULL) { |
| RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); |
| Free(rt, rt_entry_cache); |
| rt = NULL; |
| irb = NULL; |
| RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); |
| rn = ipst->ips_ip_ftable->rnh_lookup(&rdst, &rmask, |
| ipst->ips_ip_ftable); |
| if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { |
| /* found a non-root match */ |
| rt = (struct rt_entry *)rn; |
| } |
| } |
| if (rt != NULL) { |
| irb = &rt->rt_irb; |
| irb_refhold(irb); |
| } |
| RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); |
| return (irb); |
| } |
| |
| /* |
| * This function is used when the caller wants to know the outbound |
| * interface for a packet given only the address. |
| * If this is a offlink IP address and there are multiple |
| * routes to this destination, this routine will utilise the |
| * first route it finds to IP address |
| * Return values: |
| * 0 - FAILURE |
| * nonzero - ifindex |
| */ |
| uint_t |
| ifindex_lookup(const struct sockaddr *ipaddr, zoneid_t zoneid) |
| { |
| uint_t ifindex = 0; |
| ire_t *ire; |
| ill_t *ill; |
| netstack_t *ns; |
| ip_stack_t *ipst; |
| |
| if (zoneid == ALL_ZONES) |
| ns = netstack_find_by_zoneid(GLOBAL_ZONEID); |
| else |
| ns = netstack_find_by_zoneid(zoneid); |
| ASSERT(ns != NULL); |
| |
| /* |
| * For exclusive stacks we set the zoneid to zero |
| * since IP uses the global zoneid in the exclusive stacks. |
| */ |
| if (ns->netstack_stackid != GLOBAL_NETSTACKID) |
| zoneid = GLOBAL_ZONEID; |
| ipst = ns->netstack_ip; |
| |
| ASSERT(ipaddr->sa_family == AF_INET || ipaddr->sa_family == AF_INET6); |
| |
| if ((ire = route_to_dst(ipaddr, zoneid, ipst)) != NULL) { |
| ill = ire_nexthop_ill(ire); |
| if (ill != NULL) { |
| ifindex = ill->ill_phyint->phyint_ifindex; |
| ill_refrele(ill); |
| } |
| ire_refrele(ire); |
| } |
| netstack_rele(ns); |
| return (ifindex); |
| } |
| |
| /* |
| * Routine to find the route to a destination. If a ifindex is supplied |
| * it tries to match the route to the corresponding ipif for the ifindex |
| */ |
| static ire_t * |
| route_to_dst(const struct sockaddr *dst_addr, zoneid_t zoneid, ip_stack_t *ipst) |
| { |
| ire_t *ire = NULL; |
| int match_flags; |
| |
| match_flags = MATCH_IRE_DSTONLY; |
| |
| /* XXX pass NULL tsl for now */ |
| |
| if (dst_addr->sa_family == AF_INET) { |
| ire = ire_route_recursive_v4( |
| ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr, 0, NULL, |
| zoneid, NULL, match_flags, IRR_ALLOCATE, 0, ipst, NULL, |
| NULL, NULL); |
| } else { |
| ire = ire_route_recursive_v6( |
| &((struct sockaddr_in6 *)dst_addr)->sin6_addr, 0, NULL, |
| zoneid, NULL, match_flags, IRR_ALLOCATE, 0, ipst, NULL, |
| NULL, NULL); |
| } |
| ASSERT(ire != NULL); |
| if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { |
| ire_refrele(ire); |
| return (NULL); |
| } |
| return (ire); |
| } |
| |
| /* |
| * This routine is called by IP Filter to send a packet out on the wire |
| * to a specified dstination (which may be onlink or offlink). The ifindex may |
| * or may not be 0. A non-null ifindex indicates IP Filter has stipulated |
| * an outgoing interface and requires the nexthop to be on that interface. |
| * IP WILL NOT DO the following to the data packet before sending it out: |
| * a. manipulate ttl |
| * b. ipsec work |
| * c. fragmentation |
| * |
| * If the packet has been prepared for hardware checksum then it will be |
| * passed off to ip_send_align_cksum() to check that the flags set on the |
| * packet are in alignment with the capabilities of the new outgoing NIC. |
| * |
| * Return values: |
| * 0: IP was able to send of the data pkt |
| * ECOMM: Could not send packet |
| * ENONET No route to dst. It is up to the caller |
| * to send icmp unreachable error message, |
| * EINPROGRESS The macaddr of the onlink dst or that |
| * of the offlink dst's nexthop needs to get |
| * resolved before packet can be sent to dst. |
| * Thus transmission is not guaranteed. |
| * Note: No longer have visibility to the ARP queue |
| * hence no EINPROGRESS. |
| */ |
| int |
| ipfil_sendpkt(const struct sockaddr *dst_addr, mblk_t *mp, uint_t ifindex, |
| zoneid_t zoneid) |
| { |
| ipaddr_t nexthop; |
| netstack_t *ns; |
| ip_stack_t *ipst; |
| ip_xmit_attr_t ixas; |
| int error; |
| |
| ASSERT(mp != NULL); |
| |
| if (zoneid == ALL_ZONES) |
| ns = netstack_find_by_zoneid(GLOBAL_ZONEID); |
| else |
| ns = netstack_find_by_zoneid(zoneid); |
| ASSERT(ns != NULL); |
| |
| /* |
| * For exclusive stacks we set the zoneid to zero |
| * since IP uses the global zoneid in the exclusive stacks. |
| */ |
| if (ns->netstack_stackid != GLOBAL_NETSTACKID) |
| zoneid = GLOBAL_ZONEID; |
| ipst = ns->netstack_ip; |
| |
| ASSERT(dst_addr->sa_family == AF_INET || |
| dst_addr->sa_family == AF_INET6); |
| |
| bzero(&ixas, sizeof (ixas)); |
| /* |
| * No IPsec, no fragmentation, and don't let any hooks see |
| * the packet. |
| */ |
| ixas.ixa_flags = IXAF_NO_IPSEC | IXAF_DONTFRAG | IXAF_NO_PFHOOK; |
| ixas.ixa_cred = kcred; |
| ixas.ixa_cpid = NOPID; |
| ixas.ixa_tsl = NULL; |
| ixas.ixa_ipst = ipst; |
| ixas.ixa_ifindex = ifindex; |
| |
| if (dst_addr->sa_family == AF_INET) { |
| ipha_t *ipha = (ipha_t *)mp->b_rptr; |
| |
| ixas.ixa_flags |= IXAF_IS_IPV4; |
| nexthop = ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr; |
| if (nexthop != ipha->ipha_dst) { |
| ixas.ixa_flags |= IXAF_NEXTHOP_SET; |
| ixas.ixa_nexthop_v4 = nexthop; |
| } |
| ixas.ixa_multicast_ttl = ipha->ipha_ttl; |
| } else { |
| ip6_t *ip6h = (ip6_t *)mp->b_rptr; |
| in6_addr_t *nexthop6; |
| |
| nexthop6 = &((struct sockaddr_in6 *)dst_addr)->sin6_addr; |
| if (!IN6_ARE_ADDR_EQUAL(nexthop6, &ip6h->ip6_dst)) { |
| ixas.ixa_flags |= IXAF_NEXTHOP_SET; |
| ixas.ixa_nexthop_v6 = *nexthop6; |
| } |
| ixas.ixa_multicast_ttl = ip6h->ip6_hops; |
| } |
| error = ip_output_simple(mp, &ixas); |
| ixa_cleanup(&ixas); |
| |
| netstack_rele(ns); |
| switch (error) { |
| case 0: |
| break; |
| |
| case EHOSTUNREACH: |
| case ENETUNREACH: |
| error = ENONET; |
| break; |
| |
| default: |
| error = ECOMM; |
| break; |
| } |
| return (error); |
| } |
| |
| /* |
| * callback function provided by ire_ftable_lookup when calling |
| * rn_match_args(). Invoke ire_match_args on each matching leaf node in |
| * the radix tree. |
| */ |
| boolean_t |
| ire_find_best_route(struct radix_node *rn, void *arg) |
| { |
| struct rt_entry *rt = (struct rt_entry *)rn; |
| irb_t *irb_ptr; |
| ire_t *ire; |
| ire_ftable_args_t *margs = arg; |
| ipaddr_t match_mask; |
| |
| ASSERT(rt != NULL); |
| |
| irb_ptr = &rt->rt_irb; |
| |
| if (irb_ptr->irb_ire_cnt == 0) |
| return (B_FALSE); |
| |
| rw_enter(&irb_ptr->irb_lock, RW_READER); |
| for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) { |
| if (IRE_IS_CONDEMNED(ire)) |
| continue; |
| ASSERT((margs->ift_flags & MATCH_IRE_SHORTERMASK) == 0); |
| if (margs->ift_flags & MATCH_IRE_MASK) |
| match_mask = margs->ift_mask; |
| else |
| match_mask = ire->ire_mask; |
| |
| if (ire_match_args(ire, margs->ift_addr, match_mask, |
| margs->ift_gateway, margs->ift_type, margs->ift_ill, |
| margs->ift_zoneid, margs->ift_tsl, |
| margs->ift_flags)) { |
| ire_refhold(ire); |
| rw_exit(&irb_ptr->irb_lock); |
| margs->ift_best_ire = ire; |
| return (B_TRUE); |
| } |
| } |
| rw_exit(&irb_ptr->irb_lock); |
| return (B_FALSE); |
| } |
| |
| /* |
| * ftable irb_t structures are dynamically allocated, and we need to |
| * check if the irb_t (and associated ftable tree attachment) needs to |
| * be cleaned up when the irb_refcnt goes to 0. The conditions that need |
| * be verified are: |
| * - no other walkers of the irebucket, i.e., quiescent irb_refcnt, |
| * - no other threads holding references to ire's in the bucket, |
| * i.e., irb_nire == 0 |
| * - no active ire's in the bucket, i.e., irb_ire_cnt == 0 |
| * - need to hold the global tree lock and irb_lock in write mode. |
| */ |
| void |
| irb_refrele_ftable(irb_t *irb) |
| { |
| for (;;) { |
| rw_enter(&irb->irb_lock, RW_WRITER); |
| ASSERT(irb->irb_refcnt != 0); |
| if (irb->irb_refcnt != 1) { |
| /* |
| * Someone has a reference to this radix node |
| * or there is some bucket walker. |
| */ |
| irb->irb_refcnt--; |
| rw_exit(&irb->irb_lock); |
| return; |
| } else { |
| /* |
| * There is no other walker, nor is there any |
| * other thread that holds a direct ref to this |
| * radix node. Do the clean up if needed. Call |
| * to ire_unlink will clear the IRB_MARK_CONDEMNED flag |
| */ |
| if (irb->irb_marks & IRB_MARK_CONDEMNED) { |
| ire_t *ire_list; |
| |
| ire_list = ire_unlink(irb); |
| rw_exit(&irb->irb_lock); |
| |
| if (ire_list != NULL) |
| ire_cleanup(ire_list); |
| /* |
| * more CONDEMNED entries could have |
| * been added while we dropped the lock, |
| * so we have to re-check. |
| */ |
| continue; |
| } |
| |
| /* |
| * Now check if there are still any ires |
| * associated with this radix node. |
| */ |
| if (irb->irb_nire != 0) { |
| /* |
| * someone is still holding on |
| * to ires in this bucket |
| */ |
| irb->irb_refcnt--; |
| rw_exit(&irb->irb_lock); |
| return; |
| } else { |
| /* |
| * Everything is clear. Zero walkers, |
| * Zero threads with a ref to this |
| * radix node, Zero ires associated with |
| * this radix node. Due to lock order, |
| * check the above conditions again |
| * after grabbing all locks in the right order |
| */ |
| rw_exit(&irb->irb_lock); |
| if (irb_inactive(irb)) |
| return; |
| /* |
| * irb_inactive could not free the irb. |
| * See if there are any walkers, if not |
| * try to clean up again. |
| */ |
| } |
| } |
| } |
| } |
| |
| /* |
| * IRE iterator used by ire_ftable_lookup to process multiple equal |
| * routes. Given a starting point in the hash list (hash), walk the IREs |
| * in the bucket skipping deleted entries. We treat the bucket as a circular |
| * list for the purposes of walking it. |
| * Returns the IRE (held) that corresponds to the hash value. If that IRE is |
| * not applicable (ire_match_args failed) then it returns a subsequent one. |
| * If we fail to find an IRE we return NULL. |
| * |
| * Assumes that the caller holds a reference on the IRE bucket and a read lock |
| * on the radix_node_head (for IPv4) or the ip6_ire_head (for IPv6). |
| * |
| * Applies to IPv4 and IPv6. |
| * |
| * For CGTP, where an IRE_BROADCAST and IRE_HOST can exist for the same |
| * address and bucket, we compare against ire_type for the orig_ire. We also |
| * have IRE_BROADCASTs with and without RTF_MULTIRT, with the former being |
| * first in the bucket. Thus we compare that RTF_MULTIRT match the orig_ire. |
| * |
| * Due to shared-IP zones we check that an IRE_OFFLINK has a gateway that is |
| * reachable from the zone i.e., that the ire_gateway_addr is in a subnet |
| * in which the zone has an IP address. We check this for the global zone |
| * even if no shared-IP zones are configured. |
| */ |
| ire_t * |
| ire_round_robin(irb_t *irb_ptr, ire_ftable_args_t *margs, uint_t hash, |
| ire_t *orig_ire, ip_stack_t *ipst) |
| { |
| ire_t *ire, *maybe_ire = NULL; |
| uint_t maybe_badcnt; |
| uint_t maxwalk; |
| |
| /* Fold in more bits from the hint/hash */ |
| hash = hash ^ (hash >> 8) ^ (hash >> 16); |
| |
| rw_enter(&irb_ptr->irb_lock, RW_WRITER); |
| maxwalk = irb_ptr->irb_ire_cnt; /* Excludes condemned */ |
| hash %= maxwalk; |
| irb_refhold_locked(irb_ptr); |
| rw_exit(&irb_ptr->irb_lock); |
| |
| /* |
| * Round-robin the routers list looking for a route that |
| * matches the passed in parameters. |
| * First we skip "hash" number of non-condemned IREs. |
| * Then we match the IRE. |
| * If we find an ire which has a non-zero ire_badcnt then we remember |
| * it and keep on looking for a lower ire_badcnt. |
| * If we come to the end of the list we continue (treat the |
| * bucket list as a circular list) but we match less than "max" |
| * entries. |
| */ |
| ire = irb_ptr->irb_ire; |
| while (maxwalk > 0) { |
| if (IRE_IS_CONDEMNED(ire)) |
| goto next_ire_skip; |
| |
| /* Skip the first "hash" entries to do ECMP */ |
| if (hash != 0) { |
| hash--; |
| goto next_ire_skip; |
| } |
| |
| /* See CGTP comment above */ |
| if (ire->ire_type != orig_ire->ire_type || |
| ((ire->ire_flags ^ orig_ire->ire_flags) & RTF_MULTIRT) != 0) |
| goto next_ire; |
| |
| /* |
| * Note: Since IPv6 has hash buckets instead of radix |
| * buckers we need to explicitly compare the addresses. |
| * That makes this less efficient since we will be called |
| * even if there is no alternatives just because the |
| * bucket has multiple IREs for different addresses. |
| */ |
| if (ire->ire_ipversion == IPV6_VERSION) { |
| if (!IN6_ARE_ADDR_EQUAL(&orig_ire->ire_addr_v6, |
| &ire->ire_addr_v6)) |
| goto next_ire; |
| } |
| |
| /* |
| * For some reason find_best_route uses ire_mask. We do |
| * the same. |
| */ |
| if (ire->ire_ipversion == IPV4_VERSION ? |
| !ire_match_args(ire, margs->ift_addr, |
| ire->ire_mask, margs->ift_gateway, |
| margs->ift_type, margs->ift_ill, margs->ift_zoneid, |
| margs->ift_tsl, margs->ift_flags) : |
| !ire_match_args_v6(ire, &margs->ift_addr_v6, |
| &ire->ire_mask_v6, &margs->ift_gateway_v6, |
| margs->ift_type, margs->ift_ill, margs->ift_zoneid, |
| margs->ift_tsl, margs->ift_flags)) |
| goto next_ire; |
| |
| if (margs->ift_zoneid != ALL_ZONES && |
| (ire->ire_type & IRE_OFFLINK)) { |
| /* |
| * When we're in a zone, we're only |
| * interested in routers that are |
| * reachable through ipifs within our zone. |
| */ |
| if (ire->ire_ipversion == IPV4_VERSION) { |
| if (!ire_gateway_ok_zone_v4( |
| ire->ire_gateway_addr, margs->ift_zoneid, |
| ire->ire_ill, margs->ift_tsl, ipst, |
| B_TRUE)) |
| goto next_ire; |
| } else { |
| if (!ire_gateway_ok_zone_v6( |
| &ire->ire_gateway_addr_v6, |
| margs->ift_zoneid, ire->ire_ill, |
| margs->ift_tsl, ipst, B_TRUE)) |
| goto next_ire; |
| } |
| } |
| mutex_enter(&ire->ire_lock); |
| /* Look for stale ire_badcnt and clear */ |
| if (ire->ire_badcnt != 0 && |
| (TICK_TO_SEC(ddi_get_lbolt64()) - ire->ire_last_badcnt > |
| ipst->ips_ip_ire_badcnt_lifetime)) |
| ire->ire_badcnt = 0; |
| mutex_exit(&ire->ire_lock); |
| |
| if (ire->ire_badcnt == 0) { |
| /* We found one with a zero badcnt; done */ |
| ire_refhold(ire); |
| /* |
| * Care needed since irb_refrele grabs WLOCK to free |
| * the irb_t. |
| */ |
| if (ire->ire_ipversion == IPV4_VERSION) { |
| RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); |
| irb_refrele(irb_ptr); |
| RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); |
| } else { |
| rw_exit(&ipst->ips_ip6_ire_head_lock); |
| irb_refrele(irb_ptr); |
| rw_enter(&ipst->ips_ip6_ire_head_lock, |
| RW_READER); |
| } |
| return (ire); |
| } |
| /* |
| * keep looking to see if there is a better (lower |
| * badcnt) matching IRE, but save this one as a last resort. |
| * If we find a lower badcnt pick that one as the last* resort. |
| */ |
| if (maybe_ire == NULL) { |
| maybe_ire = ire; |
| maybe_badcnt = ire->ire_badcnt; |
| } else if (ire->ire_badcnt < maybe_badcnt) { |
| maybe_ire = ire; |
| maybe_badcnt = ire->ire_badcnt; |
| } |
| |
| next_ire: |
| maxwalk--; |
| next_ire_skip: |
| ire = ire->ire_next; |
| if (ire == NULL) |
| ire = irb_ptr->irb_ire; |
| } |
| if (maybe_ire != NULL) |
| ire_refhold(maybe_ire); |
| |
| /* Care needed since irb_refrele grabs WLOCK to free the irb_t. */ |
| if (ire->ire_ipversion == IPV4_VERSION) { |
| RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); |
| irb_refrele(irb_ptr); |
| RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); |
| } else { |
| rw_exit(&ipst->ips_ip6_ire_head_lock); |
| irb_refrele(irb_ptr); |
| rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER); |
| } |
| return (maybe_ire); |
| } |
| |
| void |
| irb_refhold_rn(struct radix_node *rn) |
| { |
| if ((rn->rn_flags & RNF_ROOT) == 0) |
| irb_refhold(&((rt_t *)(rn))->rt_irb); |
| } |
| |
| void |
| irb_refrele_rn(struct radix_node *rn) |
| { |
| if ((rn->rn_flags & RNF_ROOT) == 0) |
| irb_refrele_ftable(&((rt_t *)(rn))->rt_irb); |
| } |
| |
| |
| /* |
| * ip_select_src_ill() is used by ip_select_route() to find the src_ill |
| * to be used for source-aware routing table lookup. This function will |
| * ignore IPIF_UNNUMBERED interface addresses, and will only return a |
| * numbered interface (ipif_lookup_addr_nondup() will ignore UNNUMBERED |
| * interfaces). |
| */ |
| static ill_t * |
| ip_select_src_ill(const in6_addr_t *v6src, zoneid_t zoneid, ip_stack_t *ipst) |
| { |
| ipif_t *ipif; |
| ill_t *ill; |
| boolean_t isv6 = !IN6_IS_ADDR_V4MAPPED(v6src); |
| ipaddr_t v4src; |
| |
| if (isv6) { |
| ipif = ipif_lookup_addr_nondup_v6(v6src, NULL, zoneid, ipst); |
| } else { |
| IN6_V4MAPPED_TO_IPADDR(v6src, v4src); |
| ipif = ipif_lookup_addr_nondup(v4src, NULL, zoneid, ipst); |
| } |
| if (ipif == NULL) |
| return (NULL); |
| ill = ipif->ipif_ill; |
| ill_refhold(ill); |
| ipif_refrele(ipif); |
| return (ill); |
| } |
| |
| /* |
| * verify that v6src is configured on ill |
| */ |
| static boolean_t |
| ip_verify_src_on_ill(const in6_addr_t v6src, ill_t *ill, zoneid_t zoneid) |
| { |
| ipif_t *ipif; |
| ip_stack_t *ipst; |
| ipaddr_t v4src; |
| |
| if (ill == NULL) |
| return (B_FALSE); |
| ipst = ill->ill_ipst; |
| |
| if (ill->ill_isv6) { |
| ipif = ipif_lookup_addr_nondup_v6(&v6src, ill, zoneid, ipst); |
| } else { |
| IN6_V4MAPPED_TO_IPADDR(&v6src, v4src); |
| ipif = ipif_lookup_addr_nondup(v4src, ill, zoneid, ipst); |
| } |
| |
| if (ipif != NULL) { |
| ipif_refrele(ipif); |
| return (B_TRUE); |
| } else { |
| return (B_FALSE); |
| } |
| } |
| |
| /* |
| * Select a route for IPv4 and IPv6. Except for multicast, loopback and reject |
| * routes this routine sets up a ire_nce_cache as well. The caller needs to |
| * lookup an nce for the multicast case. |
| * |
| * When src_multihoming is set to 2 (strict src multihoming) we use the source |
| * address to select the interface and route. If IP_BOUND_IF etc are |
| * specified, we require that they specify an interface on which the |
| * source address is assigned. |
| * |
| * When src_multihoming is set to 1 (preferred src aware route |
| * selection) the unicast lookup prefers a matching source |
| * (i.e., that the route points out an ill on which the source is assigned), but |
| * if no such route is found we fallback to not considering the source in the |
| * route lookup. |
| * |
| * We skip the src_multihoming check when the source isn't (yet) set, and |
| * when IXAF_VERIFY_SOURCE is not set. The latter allows RAW sockets to send |
| * with bogus source addresses as allowed by IP_HDRINCL and IPV6_PKTINFO |
| * when secpolicy_net_rawaccess(). |
| */ |
| ire_t * |
| ip_select_route(const in6_addr_t *v6dst, const in6_addr_t v6src, |
| ip_xmit_attr_t *ixa, uint_t *generationp, in6_addr_t *setsrcp, |
| int *errorp, boolean_t *multirtp) |
| { |
| uint_t match_args; |
| uint_t ire_type; |
| ill_t *ill = NULL; |
| ire_t *ire; |
| ip_stack_t *ipst = ixa->ixa_ipst; |
| ipaddr_t v4dst; |
| in6_addr_t v6nexthop; |
| iaflags_t ixaflags = ixa->ixa_flags; |
| nce_t *nce; |
| boolean_t preferred_src_aware = B_FALSE; |
| boolean_t verify_src; |
| boolean_t isv6 = !(ixa->ixa_flags & IXAF_IS_IPV4); |
| int src_multihoming = IP_SRC_MULTIHOMING(isv6, ipst); |
| |
| /* |
| * We only verify that the src has been configured on a selected |
| * interface if the src is not :: or INADDR_ANY, and if the |
| * IXAF_VERIFY_SOURCE flag is set. |
| */ |
| verify_src = (!V6_OR_V4_INADDR_ANY(v6src) && |
| (ixa->ixa_flags & IXAF_VERIFY_SOURCE)); |
| |
| match_args = MATCH_IRE_SECATTR; |
| IN6_V4MAPPED_TO_IPADDR(v6dst, v4dst); |
| if (setsrcp != NULL) |
| ASSERT(IN6_IS_ADDR_UNSPECIFIED(setsrcp)); |
| if (errorp != NULL) |
| ASSERT(*errorp == 0); |
| |
| /* |
| * The content of the ixa will be different if IP_NEXTHOP, |
| * SO_DONTROUTE, IP_BOUND_IF, IP_PKTINFO etc are set |
| */ |
| |
| if (isv6 ? IN6_IS_ADDR_MULTICAST(v6dst) : CLASSD(v4dst)) { |
| /* Pick up the IRE_MULTICAST for the ill */ |
| if (ixa->ixa_multicast_ifindex != 0) { |
| ill = ill_lookup_on_ifindex(ixa->ixa_multicast_ifindex, |
| isv6, ipst); |
| } else if (ixaflags & IXAF_SCOPEID_SET) { |
| /* sin6_scope_id takes precedence over ixa_ifindex */ |
| ASSERT(ixa->ixa_scopeid != 0); |
| ill = ill_lookup_on_ifindex(ixa->ixa_scopeid, |
| isv6, ipst); |
| } else if (ixa->ixa_ifindex != 0) { |
| /* |
| * In the ipmp case, the ixa_ifindex is set to |
| * point at an under_ill and we would return the |
| * ire_multicast() corresponding to that under_ill. |
| */ |
| ill = ill_lookup_on_ifindex(ixa->ixa_ifindex, |
| isv6, ipst); |
| } else if (src_multihoming != 0 && verify_src) { |
| /* Look up the ill based on the source address */ |
| ill = ip_select_src_ill(&v6src, ixa->ixa_zoneid, ipst); |
| /* |
| * Since we looked up the ill from the source there |
| * is no need to verify that the source is on the ill |
| * below. |
| */ |
| verify_src = B_FALSE; |
| if (ill != NULL && IS_VNI(ill)) { |
| ill_t *usesrc = ill; |
| |
| ill = ill_lookup_usesrc(usesrc); |
| ill_refrele(usesrc); |
| } |
| } else if (!isv6) { |
| ipaddr_t v4setsrc = INADDR_ANY; |
| |
| ill = ill_lookup_group_v4(v4dst, ixa->ixa_zoneid, |
| ipst, multirtp, &v4setsrc); |
| if (setsrcp != NULL) |
| IN6_IPADDR_TO_V4MAPPED(v4setsrc, setsrcp); |
| } else { |
| ill = ill_lookup_group_v6(v6dst, ixa->ixa_zoneid, |
| ipst, multirtp, setsrcp); |
| } |
| if (ill != NULL && IS_VNI(ill)) { |
| ill_refrele(ill); |
| ill = NULL; |
| } |
| if (ill == NULL) { |
| if (errorp != NULL) |
| *errorp = ENXIO; |
| /* Get a hold on the IRE_NOROUTE */ |
| ire = ire_reject(ipst, isv6); |
| return (ire); |
| } |
| if (!(ill->ill_flags & ILLF_MULTICAST)) { |
| ill_refrele(ill); |
| if (errorp != NULL) |
| *errorp = EHOSTUNREACH; |
| /* Get a hold on the IRE_NOROUTE */ |
| ire = ire_reject(ipst, isv6); |
| return (ire); |
| } |
| /* |
| * If we are doing the strictest src_multihoming, then |
| * we check that IP_MULTICAST_IF, IP_BOUND_IF, etc specify |
| * an interface that is consistent with the source address. |
| */ |
| if (verify_src && src_multihoming == 2 && |
| !ip_verify_src_on_ill(v6src, ill, ixa->ixa_zoneid)) { |
| if (errorp != NULL) |
| *errorp = EADDRNOTAVAIL; |
| ill_refrele(ill); |
| /* Get a hold on the IRE_NOROUTE */ |
| ire = ire_reject(ipst, isv6); |
| return (ire); |
| } |
| /* Get a refcnt on the single IRE_MULTICAST per ill */ |
| ire = ire_multicast(ill); |
| ill_refrele(ill); |
| if (generationp != NULL) |
| *generationp = ire->ire_generation; |
| if (errorp != NULL && |
| (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) { |
| *errorp = EHOSTUNREACH; |
| } |
| return (ire); |
| } |
| |
| /* Now for unicast */ |
| if (ixa->ixa_ifindex != 0 || (ixaflags & IXAF_SCOPEID_SET)) { |
| if (ixaflags & IXAF_SCOPEID_SET) { |
| /* sin6_scope_id takes precedence over ixa_ifindex */ |
| ASSERT(ixa->ixa_scopeid != 0); |
| ill = ill_lookup_on_ifindex(ixa->ixa_scopeid, |
| isv6, ipst); |
| } else { |
| ASSERT(ixa->ixa_ifindex != 0); |
| ill = ill_lookup_on_ifindex(ixa->ixa_ifindex, |
| isv6, ipst); |
| } |
| if (ill != NULL && IS_VNI(ill)) { |
| ill_refrele(ill); |
| ill = NULL; |
| } |
| if (ill == NULL) { |
| if (errorp != NULL) |
| *errorp = ENXIO; |
| /* Get a hold on the IRE_NOROUTE */ |
| ire = ire_reject(ipst, isv6); |
| return (ire); |
| } |
| |
| match_args |= MATCH_IRE_ILL; |
| |
| /* |
| * icmp_send_reply_v6 uses scopeid, and mpathd sets IP*_BOUND_IF |
| * so for both of them we need to be able look for an under |
| * interface. |
| */ |
| if (IS_UNDER_IPMP(ill)) |
| match_args |= MATCH_IRE_TESTHIDDEN; |
| |
| /* |
| * If we are doing the strictest src_multihoming, then |
| * we check that IP_BOUND_IF, IP_PKTINFO, etc specify |
| * an interface that is consistent with the source address. |
| */ |
| if (src_multihoming == 2 && |
| !ip_verify_src_on_ill(v6src, ill, ixa->ixa_zoneid)) { |
| if (errorp != NULL) |
| *errorp = EADDRNOTAVAIL; |
| ill_refrele(ill); |
| /* Get a hold on the IRE_NOROUTE */ |
| ire = ire_reject(ipst, isv6); |
| return (ire); |
| } |
| } else if (src_multihoming != 0 && verify_src) { |
| /* Look up the ill based on the source address */ |
| ill = ip_select_src_ill(&v6src, ixa->ixa_zoneid, ipst); |
| if (ill == NULL) { |
| char addrbuf[INET6_ADDRSTRLEN]; |
| |
| ip3dbg(("%s not a valid src for unicast", |
| inet_ntop(AF_INET6, &v6src, addrbuf, |
| sizeof (addrbuf)))); |
| if (errorp != NULL) |
| *errorp = EADDRNOTAVAIL; |
| /* Get a hold on the IRE_NOROUTE */ |
| ire = ire_reject(ipst, isv6); |
| return (ire); |
| } |
| match_args |= MATCH_IRE_SRC_ILL; |
| preferred_src_aware = (src_multihoming == 1); |
| } |
| |
| if (ixaflags & IXAF_NEXTHOP_SET) { |
| /* IP_NEXTHOP was set */ |
| v6nexthop = ixa->ixa_nexthop_v6; |
| } else { |
| v6nexthop = *v6dst; |
| } |
| |
| ire_type = 0; |
| |
| /* |
| * If SO_DONTROUTE is set or if IP_NEXTHOP is set, then |
| * we only look for an onlink IRE. |
| */ |
| if (ixaflags & (IXAF_DONTROUTE|IXAF_NEXTHOP_SET)) { |
| match_args |= MATCH_IRE_TYPE; |
| ire_type = IRE_ONLINK; |
| } |
| |
| retry: |
| if (!isv6) { |
| ipaddr_t v4nexthop; |
| ipaddr_t v4setsrc = INADDR_ANY; |
| |
| IN6_V4MAPPED_TO_IPADDR(&v6nexthop, v4nexthop); |
| ire = ire_route_recursive_v4(v4nexthop, ire_type, ill, |
| ixa->ixa_zoneid, ixa->ixa_tsl, match_args, IRR_ALLOCATE, |
| ixa->ixa_xmit_hint, ipst, &v4setsrc, NULL, generationp); |
| if (setsrcp != NULL) |
| IN6_IPADDR_TO_V4MAPPED(v4setsrc, setsrcp); |
| } else { |
| ire = ire_route_recursive_v6(&v6nexthop, ire_type, ill, |
| ixa->ixa_zoneid, ixa->ixa_tsl, match_args, IRR_ALLOCATE, |
| ixa->ixa_xmit_hint, ipst, setsrcp, NULL, generationp); |
| } |
| |
| #ifdef DEBUG |
| if (match_args & MATCH_IRE_TESTHIDDEN) { |
| ip3dbg(("looking for hidden; dst %x ire %p\n", |
| v4dst, (void *)ire)); |
| } |
| #endif |
| if (ill != NULL) { |
| ill_refrele(ill); |
| ill = NULL; |
| } |
| if ((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) || |
| (ire->ire_type & IRE_MULTICAST)) { |
| if (preferred_src_aware) { |
| /* |
| * "Preferred Source Aware" send mode. If we cannot |
| * find an ire whose ire_ill had the desired source |
| * address retry after relaxing the ill matching |
| * constraint. |
| */ |
| ire_refrele(ire); |
| preferred_src_aware = B_FALSE; |
| match_args &= ~MATCH_IRE_SRC_ILL; |
| goto retry; |
| } |
| /* No ire_nce_cache */ |
| return (ire); |
| } |
| |
| /* Setup ire_nce_cache if it doesn't exist or is condemned. */ |
| mutex_enter(&ire->ire_lock); |
| nce = ire->ire_nce_cache; |
| if (nce == NULL || nce->nce_is_condemned) { |
| mutex_exit(&ire->ire_lock); |
| (void) ire_revalidate_nce(ire); |
| } else { |
| mutex_exit(&ire->ire_lock); |
| } |
| return (ire); |
| } |
| |
| /* |
| * Find a route given some xmit attributes and a packet. |
| * Generic for IPv4 and IPv6 |
| * |
| * This never returns NULL. But when it returns the IRE_NOROUTE |
| * it might set errorp. |
| */ |
| ire_t * |
| ip_select_route_pkt(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp, |
| int *errorp, boolean_t *multirtp) |
| { |
| if (ixa->ixa_flags & IXAF_IS_IPV4) { |
| ipha_t *ipha = (ipha_t *)mp->b_rptr; |
| in6_addr_t v6dst, v6src; |
| |
| IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst); |
| IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6src); |
| |
| return (ip_select_route(&v6dst, v6src, ixa, generationp, |
| NULL, errorp, multirtp)); |
| } else { |
| ip6_t *ip6h = (ip6_t *)mp->b_rptr; |
| |
| return (ip_select_route(&ip6h->ip6_dst, ip6h->ip6_src, |
| ixa, generationp, NULL, errorp, multirtp)); |
| } |
| } |
| |
| ire_t * |
| ip_select_route_v4(ipaddr_t dst, ipaddr_t src, ip_xmit_attr_t *ixa, |
| uint_t *generationp, ipaddr_t *v4setsrcp, int *errorp, boolean_t *multirtp) |
| { |
| in6_addr_t v6dst, v6src; |
| ire_t *ire; |
| in6_addr_t setsrc; |
| |
| ASSERT(ixa->ixa_flags & IXAF_IS_IPV4); |
| |
| IN6_IPADDR_TO_V4MAPPED(dst, &v6dst); |
| IN6_IPADDR_TO_V4MAPPED(src, &v6src); |
| |
| setsrc = ipv6_all_zeros; |
| ire = ip_select_route(&v6dst, v6src, ixa, generationp, &setsrc, errorp, |
| multirtp); |
| if (v4setsrcp != NULL) |
| IN6_V4MAPPED_TO_IPADDR(&setsrc, *v4setsrcp); |
| return (ire); |
| } |
| |
| /* |
| * Recursively look for a route to the destination. Can also match on |
| * the zoneid, ill, and label. Used for the data paths. See also |
| * ire_route_recursive. |
| * |
| * If IRR_ALLOCATE is not set then we will only inspect the existing IREs; never |
| * create an IRE_IF_CLONE. This is used on the receive side when we are not |
| * forwarding. |
| * If IRR_INCOMPLETE is set then we return the IRE even if we can't correctly |
| * resolve the gateway. |
| * |
| * Note that this function never returns NULL. It returns an IRE_NOROUTE |
| * instead. |
| * |
| * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it |
| * is an error. |
| * Allow at most one RTF_INDIRECT. |
| */ |
| ire_t * |
| ire_route_recursive_impl_v4(ire_t *ire, |
| ipaddr_t nexthop, uint_t ire_type, const ill_t *ill_arg, |
| zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args, |
| uint_t irr_flags, uint32_t xmit_hint, ip_stack_t *ipst, ipaddr_t *setsrcp, |
| tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp) |
| { |
| int i, j; |
| ire_t *ires[MAX_IRE_RECURSION]; |
| uint_t generation; |
| uint_t generations[MAX_IRE_RECURSION]; |
| boolean_t need_refrele = B_FALSE; |
| boolean_t invalidate = B_FALSE; |
| int prefs[MAX_IRE_RECURSION]; |
| ill_t *ill = NULL; |
| |
| if (setsrcp != NULL) |
| ASSERT(*setsrcp == INADDR_ANY); |
| if (gwattrp != NULL) |
| ASSERT(*gwattrp == NULL); |
| |
| /* |
| * We iterate up to three times to resolve a route, even though |
| * we have four slots in the array. The extra slot is for an |
| * IRE_IF_CLONE we might need to create. |
| */ |
| i = 0; |
| while (i < MAX_IRE_RECURSION - 1) { |
| /* ire_ftable_lookup handles round-robin/ECMP */ |
| if (ire == NULL) { |
| ire = ire_ftable_lookup_v4(nexthop, 0, 0, ire_type, |
| (ill != NULL? ill : ill_arg), zoneid, tsl, |
| match_args, xmit_hint, ipst, &generation); |
| } else { |
| /* Caller passed it; extra hold since we will rele */ |
| ire_refhold(ire); |
| if (generationp != NULL) |
| generation = *generationp; |
| else |
| generation = IRE_GENERATION_VERIFY; |
| } |
| if (ire == NULL) |
| ire = ire_reject(ipst, B_FALSE); |
| |
| /* Need to return the ire with RTF_REJECT|BLACKHOLE */ |
| if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) |
| goto error; |
| |
| ASSERT(!(ire->ire_type & IRE_MULTICAST)); /* Not in ftable */ |
| |
| if (i != 0) { |
| prefs[i] = ire_pref(ire); |
| /* |
| * Don't allow anything unusual past the first |
| * iteration. |
| */ |
| if ((ire->ire_type & |
| (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST)) || |
| prefs[i] <= prefs[i-1]) { |
| ire_refrele(ire); |
| if (irr_flags & IRR_INCOMPLETE) { |
| ire = ires[0]; |
| ire_refhold(ire); |
| } else { |
| ire = ire_reject(ipst, B_FALSE); |
| } |
| goto error; |
| } |
| } |
| /* We have a usable IRE */ |
| ires[i] = ire; |
| generations[i] = generation; |
| i++; |
| |
| /* The first RTF_SETSRC address is passed back if setsrcp */ |
| if ((ire->ire_flags & RTF_SETSRC) && |
| setsrcp != NULL && *setsrcp == INADDR_ANY) { |
| ASSERT(ire->ire_setsrc_addr != INADDR_ANY); |
| *setsrcp = ire->ire_setsrc_addr; |
| } |
| |
| /* The first ire_gw_secattr is passed back if gwattrp */ |
| if (ire->ire_gw_secattr != NULL && |
| gwattrp != NULL && *gwattrp == NULL) |
| *gwattrp = ire->ire_gw_secattr; |
| |
| /* |
| * Check if we have a short-cut pointer to an IRE for this |
| * destination, and that the cached dependency isn't stale. |
| * In that case we've rejoined an existing tree towards a |
| * parent, thus we don't need to continue the loop to |
| * discover the rest of the tree. |
| */ |
| mutex_enter(&ire->ire_lock); |
| if (ire->ire_dep_parent != NULL && |
| ire->ire_dep_parent->ire_generation == |
| ire->ire_dep_parent_generation) { |
| mutex_exit(&ire->ire_lock); |
| ire = NULL; |
| goto done; |
| } |
| mutex_exit(&ire->ire_lock); |
| |
| /* |
| * If this type should have an ire_nce_cache (even if it |
| * doesn't yet have one) then we are done. Includes |
| * IRE_INTERFACE with a full 32 bit mask. |
| */ |
| if (ire->ire_nce_capable) { |
| ire = NULL; |
| goto done; |
| } |
| ASSERT(!(ire->ire_type & IRE_IF_CLONE)); |
| /* |
| * For an IRE_INTERFACE we create an IRE_IF_CLONE for this |
| * particular destination |
| */ |
| if (ire->ire_type & IRE_INTERFACE) { |
| in6_addr_t v6nexthop; |
| ire_t *clone; |
| |
| ASSERT(ire->ire_masklen != IPV4_ABITS); |
| |
| /* |
| * In the case of ip_input and ILLF_FORWARDING not |
| * being set, and in the case of RTM_GET, there is |
| * no point in allocating an IRE_IF_CLONE. We return |
| * the IRE_INTERFACE. Note that !IRR_ALLOCATE can |
| * result in a ire_dep_parent which is IRE_IF_* |
| * without an IRE_IF_CLONE. |
| * We recover from that when we need to send packets |
| * by ensuring that the generations become |
| * IRE_GENERATION_VERIFY in this case. |
| */ |
| if (!(irr_flags & IRR_ALLOCATE)) { |
| invalidate = B_TRUE; |
| ire = NULL; |
| goto done; |
| } |
| |
| IN6_IPADDR_TO_V4MAPPED(nexthop, &v6nexthop); |
| |
| clone = ire_create_if_clone(ire, &v6nexthop, |
| &generation); |
| if (clone == NULL) { |
| /* |
| * Temporary failure - no memory. |
| * Don't want caller to cache IRE_NOROUTE. |
| */ |
| invalidate = B_TRUE; |
| ire = ire_blackhole(ipst, B_FALSE); |
| goto error; |
| } |
| /* |
| * Make clone next to last entry and the |
| * IRE_INTERFACE the last in the dependency |
| * chain since the clone depends on the |
| * IRE_INTERFACE. |
| */ |
| ASSERT(i >= 1); |
| ASSERT(i < MAX_IRE_RECURSION); |
| |
| ires[i] = ires[i-1]; |
| generations[i] = generations[i-1]; |
| ires[i-1] = clone; |
| generations[i-1] = generation; |
| i++; |
| |
| ire = NULL; |
| goto done; |
| } |
| |
| /* |
| * We only match on the type and optionally ILL when |
| * recursing. The type match is used by some callers |
| * to exclude certain types (such as IRE_IF_CLONE or |
| * IRE_LOCAL|IRE_LOOPBACK). |
| * |
| * In the MATCH_IRE_SRC_ILL case, ill_arg may be the 'srcof' |
| * ire->ire_ill, and we want to find the IRE_INTERFACE for |
| * ire_ill, so we set ill to the ire_ill; |
| */ |
| match_args &= MATCH_IRE_TYPE; |
| nexthop = ire->ire_gateway_addr; |
| if (ill == NULL && ire->ire_ill != NULL) { |
| ill = ire->ire_ill; |
| need_refrele = B_TRUE; |
| ill_refhold(ill); |
| match_args |= MATCH_IRE_ILL; |
| } |
| /* |
| * We set the prefs[i] value above if i > 0. We've already |
| * done i++ so i is one in the case of the first time around. |
| */ |
| if (i == 1) |
| prefs[0] = ire_pref(ire); |
| ire = NULL; |
| } |
| ASSERT(ire == NULL); |
| ire = ire_reject(ipst, B_FALSE); |
| |
| error: |
| ASSERT(ire != NULL); |
| if (need_refrele) |
| ill_refrele(ill); |
| |
| /* |
| * In the case of MULTIRT we want to try a different IRE the next |
| * time. We let the next packet retry in that case. |
| */ |
| if (i > 0 && (ires[0]->ire_flags & RTF_MULTIRT)) |
| (void) ire_no_good(ires[0]); |
| |
| cleanup: |
| /* cleanup ires[i] */ |
| ire_dep_unbuild(ires, i); |
| for (j = 0; j < i; j++) |
| ire_refrele(ires[j]); |
| |
| ASSERT((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) || |
| (irr_flags & IRR_INCOMPLETE)); |
| /* |
| * Use IRE_GENERATION_VERIFY to ensure that ip_output will redo the |
| * ip_select_route since the reject or lack of memory might be gone. |
| */ |
| if (generationp != NULL) |
| *generationp = IRE_GENERATION_VERIFY; |
| return (ire); |
| |
| done: |
| ASSERT(ire == NULL); |
| if (need_refrele) { |
| ill_refrele(ill); |
| ill = NULL; |
| } |
| |
| /* Build dependencies */ |
| if (i > 1 && !ire_dep_build(ires, generations, i)) { |
| /* Something in chain was condemned; tear it apart */ |
| ire = ire_reject(ipst, B_FALSE); |
| goto cleanup; |
| } |
| |
| /* |
| * Release all refholds except the one for ires[0] that we |
| * will return to the caller. |
| */ |
| for (j = 1; j < i; j++) |
| ire_refrele(ires[j]); |
| |
| if (invalidate) { |
| /* |
| * Since we needed to allocate but couldn't we need to make |
| * sure that the dependency chain is rebuilt the next time. |
| */ |
| ire_dep_invalidate_generations(ires[0]); |
| generation = IRE_GENERATION_VERIFY; |
| } else { |
| /* |
| * IREs can have been added or deleted while we did the |
| * recursive lookup and we can't catch those until we've built |
| * the dependencies. We verify the stored |
| * ire_dep_parent_generation to catch any such changes and |
| * return IRE_GENERATION_VERIFY (which will cause |
| * ip_select_route to be called again so we can redo the |
| * recursive lookup next time we send a packet. |
| */ |
| if (ires[0]->ire_dep_parent == NULL) |
| generation = ires[0]->ire_generation; |
| else |
| generation = ire_dep_validate_generations(ires[0]); |
| if (generations[0] != ires[0]->ire_generation) { |
| /* Something changed at the top */ |
| generation = IRE_GENERATION_VERIFY; |
| } |
| } |
| if (generationp != NULL) |
| *generationp = generation; |
| |
| return (ires[0]); |
| } |
| |
| ire_t * |
| ire_route_recursive_v4(ipaddr_t nexthop, uint_t ire_type, const ill_t *ill, |
| zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args, |
| uint_t irr_flags, uint32_t xmit_hint, ip_stack_t *ipst, ipaddr_t *setsrcp, |
| tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp) |
| { |
| return (ire_route_recursive_impl_v4(NULL, nexthop, ire_type, ill, |
| zoneid, tsl, match_args, irr_flags, xmit_hint, ipst, setsrcp, |
| gwattrp, generationp)); |
| } |
| |
| /* |
| * Recursively look for a route to the destination. |
| * We only handle a destination match here, yet we have the same arguments |
| * as the full match to allow function pointers to select between the two. |
| * |
| * Note that this function never returns NULL. It returns an IRE_NOROUTE |
| * instead. |
| * |
| * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it |
| * is an error. |
| * Allow at most one RTF_INDIRECT. |
| */ |
| ire_t * |
| ire_route_recursive_dstonly_v4(ipaddr_t nexthop, uint_t irr_flags, |
| uint32_t xmit_hint, ip_stack_t *ipst) |
| { |
| ire_t *ire; |
| ire_t *ire1; |
| uint_t generation; |
| |
| /* ire_ftable_lookup handles round-robin/ECMP */ |
| ire = ire_ftable_lookup_simple_v4(nexthop, xmit_hint, ipst, |
| &generation); |
| ASSERT(ire != NULL); |
| |
| /* |
| * If this type should have an ire_nce_cache (even if it |
| * doesn't yet have one) then we are done. Includes |
| * IRE_INTERFACE with a full 32 bit mask. |
| */ |
| if (ire->ire_nce_capable) |
| return (ire); |
| |
| /* |
| * If the IRE has a current cached parent we know that the whole |
| * parent chain is current, hence we don't need to discover and |
| * build any dependencies by doing a recursive lookup. |
| */ |
| mutex_enter(&ire->ire_lock); |
| if (ire->ire_dep_parent != NULL && |
| ire->ire_dep_parent->ire_generation == |
| ire->ire_dep_parent_generation) { |
| mutex_exit(&ire->ire_lock); |
| return (ire); |
| } |
| mutex_exit(&ire->ire_lock); |
| |
| /* |
| * Fallback to loop in the normal code starting with the ire |
| * we found. Normally this would return the same ire. |
| */ |
| ire1 = ire_route_recursive_impl_v4(ire, nexthop, 0, NULL, ALL_ZONES, |
| NULL, MATCH_IRE_DSTONLY, irr_flags, xmit_hint, ipst, NULL, NULL, |
| &generation); |
| ire_refrele(ire); |
| return (ire1); |
| } |