blob: 06c499ced9c52bfe12705138eb898ddb8f227d8c [file] [log] [blame]
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1990 Mentat Inc. */
/* AR - Address Resolution Protocol */
#include <sys/types.h>
#include <sys/stream.h>
#include <sys/stropts.h>
#include <sys/errno.h>
#include <sys/strlog.h>
#include <sys/dlpi.h>
#include <sys/sockio.h>
#define _SUN_TPI_VERSION 2
#include <sys/tihdr.h>
#include <sys/socket.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/cmn_err.h>
#include <sys/sdt.h>
#include <sys/vtrace.h>
#include <sys/strsun.h>
#include <sys/policy.h>
#include <sys/zone.h>
#include <sys/ethernet.h>
#include <sys/zone.h>
#include <sys/random.h>
#include <sys/sdt.h>
#include <sys/hook_event.h>
#include <inet/common.h>
#include <inet/optcom.h>
#include <inet/mi.h>
#include <inet/nd.h>
#include <inet/snmpcom.h>
#include <net/if.h>
#include <inet/arp.h>
#include <netinet/ip6.h>
#include <netinet/arp.h>
#include <inet/ip.h>
#include <inet/ip_ire.h>
#include <inet/ip_ndp.h>
#include <inet/mib2.h>
#include <inet/arp_impl.h>
/*
* ARP entry life time and design notes
* ------------------------------------
*
* ARP entries (ACEs) must last at least as long as IP knows about a given
* MAC-IP translation (i.e., as long as the IRE cache entry exists). It's ok
* if the ARP entry lasts longer, but not ok if it is removed before the IP
* entry. The reason for this is that if ARP doesn't have an entry, we will be
* unable to detect the difference between an ARP broadcast that represents no
* change (same, known address of sender) and one that represents a change (new
* address for existing entry). In the former case, we must not notify IP, or
* we can suffer hurricane attack. In the latter case, we must notify IP, or
* IP will drift out of sync with the network.
*
* Note that IP controls the lifetime of entries, not ARP.
*
* We don't attempt to reconfirm aging entries. If the system is no longer
* talking to a given peer, then it doesn't matter if we have the right mapping
* for that peer. It would be possible to send queries on aging entries that
* are active, but this isn't done.
*
* IPMP Notes
* ----------
*
* ARP is aware of IPMP. In particular, IP notifies ARP about all "active"
* (able to transmit data packets) interfaces in a given group via
* AR_IPMP_ACTIVATE and AR_IPMP_DEACTIVATE messages. These messages, combined
* with the "IPMP arl_t" that ARP creates over the IPMP DLPI stub driver,
* enable ARP to track all the arl_t's that are in the same group and thus
* ensure that ACEs are shared across each group and the arl_t that ARP
* chooses to transmit on for a given ACE is optimal.
*
* ARP relies on IP for hardware address updates. In particular, if the
* hardware address of an interface changes (DL_NOTE_PHYS_ADDR), then IP will
* bring the interface down and back up -- and as part of bringing it back
* up, will send messages to ARP that allow it to update the affected arl's
* with new hardware addresses.
*
* N.B.: One side-effect of this approach is that when an interface fails and
* then starts to repair, it will temporarily populate the ARP cache with
* addresses that are owned by it rather than the group's arl_t. To address
* this, we could add more messages (e.g., AR_IPMP_JOIN and AR_IPMP_LEAVE),
* but as the issue appears to be only cosmetic (redundant entries in the ARP
* cache during interace repair), we've kept things simple for now.
*/
/*
 * This is used when scanning for "old" (least recently broadcast) ACEs. We
 * don't want to have to walk the list for every single one, so we gather up
 * batches at a time.
 */
#define	ACE_RESCHED_LIST_LEN	8

/*
 * Scratch state for one batch of the reschedule scan: the arl whose ACEs
 * are being collected and up to ACE_RESCHED_LIST_LEN candidates from it.
 */
typedef struct {
	arl_t	*art_arl;			/* interface being scanned */
	uint_t	art_naces;			/* valid entries in art_aces[] */
	ace_t	*art_aces[ACE_RESCHED_LIST_LEN]; /* gathered candidates */
} ace_resched_t;
#define ACE_RESOLVED(ace) ((ace)->ace_flags & ACE_F_RESOLVED)
#define ACE_NONPERM(ace) \
(((ace)->ace_flags & (ACE_F_RESOLVED | ACE_F_PERMANENT)) == \
ACE_F_RESOLVED)
#define AR_DEF_XMIT_INTERVAL 500 /* time in milliseconds */
#define AR_LL_HDR_SLACK 32 /* Leave the lower layer some room */
#define AR_SNMP_MSG T_OPTMGMT_ACK
#define AR_DRAINING (void *)0x11
/*
* The IPv4 Link Local address space is special; we do extra duplicate checking
* there, as the entire assignment mechanism rests on random numbers.
*/
#define IS_IPV4_LL_SPACE(ptr) (((uchar_t *)ptr)[0] == 169 && \
((uchar_t *)ptr)[1] == 254)
/*
* Check if the command needs to be enqueued by seeing if there are other
* commands ahead of us or if some DLPI response is being awaited. Usually
* there would be an enqueued command in the latter case, however if the
* stream that originated the command has closed, the close would have
* cleaned up the enqueued command. AR_DRAINING signifies that the command
* at the head of the arl_queue has been internally dequeued on completion
* of the previous command and is being called from ar_dlpi_done
*/
#define CMD_NEEDS_QUEUEING(mp, arl) \
(mp->b_prev != AR_DRAINING && (arl->arl_queue != NULL || \
arl->arl_dlpi_pending != DL_PRIM_INVAL))
#define ARH_FIXED_LEN 8
/*
* Macro used when creating ACEs to determine the arl that should own it.
*/
#define OWNING_ARL(arl) \
((arl)->arl_ipmp_arl != NULL ? (arl)->arl_ipmp_arl : arl)
/*
 * MAC-specific intelligence.  Shouldn't be needed, but the DL_INFO_ACK
 * doesn't quite do it for us.
 *
 * One row per DLPI mac type; see ar_m_tbl[] below for the instances.
 */
typedef struct ar_m_s {
	t_uscalar_t	ar_mac_type;		/* DLPI mac type (DL_*) */
	uint32_t	ar_mac_arp_hw_type;	/* ARP hardware type (ARPHRD_*) */
	t_scalar_t	ar_mac_sap_length;	/* DLPI SAP length */
	uint32_t	ar_mac_hw_addr_length;	/* hardware address length */
} ar_m_t;
/*
 * Argument block passed to ar_snmp_msg2() via the ace walk; presumably
 * accumulates the SNMP reply data chain (head/tail) — confirm against
 * ar_snmp_msg2()'s body, which is not visible here.
 */
typedef struct msg2_args {
	mblk_t	*m2a_mpdata;	/* head of the data mblk chain */
	mblk_t	*m2a_mptail;	/* tail of the chain, for appends */
} msg2_args_t;
static mblk_t *ar_alloc(uint32_t cmd, int);
static int ar_ce_create(arl_t *arl, uint32_t proto, uchar_t *hw_addr,
uint32_t hw_addr_len, uchar_t *proto_addr,
uint32_t proto_addr_len, uchar_t *proto_mask,
uchar_t *proto_extract_mask, uint32_t hw_extract_start,
uchar_t *sender_addr, uint32_t flags);
static void ar_ce_delete(ace_t *ace);
static void ar_ce_delete_per_arl(ace_t *ace, void *arg);
static ace_t **ar_ce_hash(arp_stack_t *as, uint32_t proto,
const uchar_t *proto_addr, uint32_t proto_addr_length);
static ace_t *ar_ce_lookup(arl_t *arl, uint32_t proto,
const uchar_t *proto_addr, uint32_t proto_addr_length);
static ace_t *ar_ce_lookup_entry(arl_t *arl, uint32_t proto,
const uchar_t *proto_addr, uint32_t proto_addr_length);
static ace_t *ar_ce_lookup_from_area(arp_stack_t *as, mblk_t *mp,
ace_t *matchfn());
static ace_t *ar_ce_lookup_mapping(arl_t *arl, uint32_t proto,
const uchar_t *proto_addr, uint32_t proto_addr_length);
static ace_t *ar_ce_lookup_permanent(arp_stack_t *as, uint32_t proto,
uchar_t *proto_addr, uint32_t proto_addr_length);
static boolean_t ar_ce_resolve(ace_t *ace, const uchar_t *hw_addr,
uint32_t hw_addr_length);
static void ar_ce_walk(arp_stack_t *as, void (*pfi)(ace_t *, void *),
void *arg1);
static void ar_client_notify(const arl_t *arl, mblk_t *mp, int code);
static int ar_close(queue_t *q);
static int ar_cmd_dispatch(queue_t *q, mblk_t *mp, boolean_t from_wput);
static void ar_cmd_done(arl_t *arl);
static mblk_t *ar_dlpi_comm(t_uscalar_t prim, size_t size);
static void ar_dlpi_send(arl_t *, mblk_t *);
static void ar_dlpi_done(arl_t *, t_uscalar_t);
static int ar_entry_add(queue_t *q, mblk_t *mp);
static int ar_entry_delete(queue_t *q, mblk_t *mp);
static int ar_entry_query(queue_t *q, mblk_t *mp);
static int ar_entry_squery(queue_t *q, mblk_t *mp);
static int ar_interface_up(queue_t *q, mblk_t *mp);
static int ar_interface_down(queue_t *q, mblk_t *mp);
static int ar_interface_on(queue_t *q, mblk_t *mp);
static int ar_interface_off(queue_t *q, mblk_t *mp);
static int ar_ipmp_activate(queue_t *q, mblk_t *mp);
static int ar_ipmp_deactivate(queue_t *q, mblk_t *mp);
static void ar_ll_cleanup_arl_queue(queue_t *q);
static void ar_ll_down(arl_t *arl);
static arl_t *ar_ll_lookup_by_name(arp_stack_t *as, const char *name);
static arl_t *ar_ll_lookup_from_mp(arp_stack_t *as, mblk_t *mp);
static void ar_ll_init(arp_stack_t *, ar_t *, mblk_t *mp);
static void ar_ll_set_defaults(arl_t *, mblk_t *mp);
static void ar_ll_clear_defaults(arl_t *);
static int ar_ll_up(arl_t *arl);
static int ar_mapping_add(queue_t *q, mblk_t *mp);
static boolean_t ar_mask_all_ones(uchar_t *mask, uint32_t mask_len);
static ar_m_t *ar_m_lookup(t_uscalar_t mac_type);
static int ar_nd_ioctl(queue_t *q, mblk_t *mp);
static int ar_open(queue_t *q, dev_t *devp, int flag, int sflag,
cred_t *credp);
static int ar_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr);
static boolean_t ar_param_register(IDP *ndp, arpparam_t *arppa, int cnt);
static int ar_param_set(queue_t *q, mblk_t *mp, char *value,
caddr_t cp, cred_t *cr);
static void ar_query_delete(ace_t *ace, void *ar);
static void ar_query_reply(ace_t *ace, int ret_val,
uchar_t *proto_addr, uint32_t proto_addr_len);
static clock_t ar_query_xmit(arp_stack_t *as, ace_t *ace);
static void ar_rput(queue_t *q, mblk_t *mp_orig);
static void ar_rput_dlpi(queue_t *q, mblk_t *mp);
static void ar_set_address(ace_t *ace, uchar_t *addrpos,
uchar_t *proto_addr, uint32_t proto_addr_len);
static int ar_slifname(queue_t *q, mblk_t *mp);
static int ar_set_ppa(queue_t *q, mblk_t *mp);
static int ar_snmp_msg(queue_t *q, mblk_t *mp_orig);
static void ar_snmp_msg2(ace_t *, void *);
static void ar_wput(queue_t *q, mblk_t *mp);
static void ar_wsrv(queue_t *q);
static void ar_xmit(arl_t *arl, uint32_t operation, uint32_t proto,
uint32_t plen, const uchar_t *haddr1, const uchar_t *paddr1,
const uchar_t *haddr2, const uchar_t *paddr2, const uchar_t *dstaddr,
arp_stack_t *as);
static void ar_cmd_enqueue(arl_t *arl, mblk_t *mp, queue_t *q,
ushort_t cmd, boolean_t);
static mblk_t *ar_cmd_dequeue(arl_t *arl);
static void *arp_stack_init(netstackid_t stackid, netstack_t *ns);
static void arp_stack_fini(netstackid_t stackid, void *arg);
static void arp_stack_shutdown(netstackid_t stackid, void *arg);
/*
 * All of these are alterable, within the min/max values given,
 * at run time. arp_publish_interval and arp_publish_count are
 * set by default to 2 seconds and 5 respectively. This is
 * useful during FAILOVER/FAILBACK to make sure that the ARP
 * packets are not lost. Assumed that it does not affect the
 * normal operations.
 *
 * NOTE: the as_* accessor macros below index this array by position,
 * so entries must not be reordered without updating those macros.
 */
static arpparam_t	arp_param_arr[] = {
	/* min		max		value	name */
	{ 30000,	3600000,	300000,	"arp_cleanup_interval"},
	{ 1000,		20000,		2000,	"arp_publish_interval"},
	{ 1,		20,		5,	"arp_publish_count"},
	{ 0,		20000,		1000,	"arp_probe_delay"},
	{ 10,		20000,		1500,	"arp_probe_interval"},
	{ 0,		20,		3,	"arp_probe_count"},
	{ 0,		20000,		100,	"arp_fastprobe_delay"},
	{ 10,		20000,		150,	"arp_fastprobe_interval"},
	{ 0,		20,		3,	"arp_fastprobe_count"},
	{ 0,		3600000,	300000,	"arp_defend_interval"},
	{ 0,		20000,		100,	"arp_defend_rate"},
	{ 0,		3600000,	15000,	"arp_broadcast_interval"},
	{ 5,		86400,		3600,	"arp_defend_period"}
};

/* Positional accessors into arp_param_arr via the per-stack as_param_arr. */
#define	as_cleanup_interval	as_param_arr[0].arp_param_value
#define	as_publish_interval	as_param_arr[1].arp_param_value
#define	as_publish_count	as_param_arr[2].arp_param_value
#define	as_probe_delay		as_param_arr[3].arp_param_value
#define	as_probe_interval	as_param_arr[4].arp_param_value
#define	as_probe_count		as_param_arr[5].arp_param_value
#define	as_fastprobe_delay	as_param_arr[6].arp_param_value
#define	as_fastprobe_interval	as_param_arr[7].arp_param_value
#define	as_fastprobe_count	as_param_arr[8].arp_param_value
#define	as_defend_interval	as_param_arr[9].arp_param_value
#define	as_defend_rate		as_param_arr[10].arp_param_value
#define	as_broadcast_interval	as_param_arr[11].arp_param_value
#define	as_defend_period	as_param_arr[12].arp_param_value
/* STREAMS module info: id 0, name "arp", min/max psz, hi/lo water marks. */
static struct module_info arp_mod_info = {
	0, "arp", 0, INFPSZ, 512, 128
};

/* Read side: ar_rput for inbound messages; no read service procedure. */
static struct qinit arprinit = {
	(pfi_t)ar_rput, NULL, ar_open, ar_close, NULL, &arp_mod_info
};

/* Write side: ar_wput for puts, ar_wsrv drains deferred messages. */
static struct qinit arpwinit = {
	(pfi_t)ar_wput, (pfi_t)ar_wsrv, ar_open, ar_close, NULL, &arp_mod_info
};

/* External streamtab — presumably referenced by the module linkage. */
struct streamtab arpinfo = {
	&arprinit, &arpwinit
};
/*
 * TODO: we need a better mechanism to set the ARP hardware type since
 * the DLPI mac type does not include enough predefined values.
 *
 * Searched linearly by ar_m_lookup(); DL_OTHER is the catch-all and must
 * stay last.  Note that FDDI is deliberately mapped to ARPHRD_ETHER.
 */
static ar_m_t	ar_m_tbl[] = {
	{ DL_CSMACD,	ARPHRD_ETHER,	-2,	6},	/* 802.3 */
	{ DL_TPB,	ARPHRD_IEEE802,	-2,	6},	/* 802.4 */
	{ DL_TPR,	ARPHRD_IEEE802,	-2,	6},	/* 802.5 */
	{ DL_METRO,	ARPHRD_IEEE802,	-2,	6},	/* 802.6 */
	{ DL_ETHER,	ARPHRD_ETHER,	-2,	6},	/* Ethernet */
	{ DL_FDDI,	ARPHRD_ETHER,	-2,	6},	/* FDDI */
	{ DL_IB,	ARPHRD_IB,	-2,	20},	/* Infiniband */
	{ DL_OTHER,	ARPHRD_ETHER,	-2,	6},	/* unknown */
};
/*
 * Note that all routines which need to queue the message for later
 * processing have to be ioctl_aware to be able to queue the complete message.
 * Following are command entry flags in arct_flags
 */
#define	ARF_IOCTL_AWARE	0x1	/* Arp command can come down as M_IOCTL */
#define	ARF_ONLY_CMD	0x2	/* Command is exclusive to ARP */
#define	ARF_WPUT_OK	0x4	/* Command is allowed from ar_wput */

/* ARP Cmd Table entry */
typedef struct arct_s {
	int	(*arct_pfi)(queue_t *, mblk_t *);	/* command handler */
	uint32_t	arct_cmd;		/* command code to match */
	int	arct_min_len;		/* minimum acceptable msg length */
	uint32_t	arct_flags;		/* ARF_* flags above */
	int	arct_priv_req;	/* Privilege required for this cmd */
	const char	*arct_txt;	/* command name, for diagnostics */
} arct_t;
/*
 * AR_ENTRY_ADD, QUERY and SQUERY are used by sdp, hence they need to
 * have ARF_WPUT_OK set.
 *
 * Dispatch table consulted by ar_cmd_dispatch(): handler, command code,
 * minimum message length, ARF_* flags, required privilege, and name.
 */
static arct_t	ar_cmd_tbl[] = {
	{ ar_entry_add,		AR_ENTRY_ADD,		sizeof (area_t),
	    ARF_IOCTL_AWARE | ARF_ONLY_CMD | ARF_WPUT_OK, OP_CONFIG,
	    "AR_ENTRY_ADD" },
	{ ar_entry_delete,	AR_ENTRY_DELETE,	sizeof (ared_t),
	    ARF_IOCTL_AWARE | ARF_ONLY_CMD, OP_CONFIG, "AR_ENTRY_DELETE" },
	{ ar_entry_query,	AR_ENTRY_QUERY,		sizeof (areq_t),
	    ARF_IOCTL_AWARE | ARF_ONLY_CMD | ARF_WPUT_OK, OP_NP,
	    "AR_ENTRY_QUERY" },
	{ ar_entry_squery,	AR_ENTRY_SQUERY,	sizeof (area_t),
	    ARF_IOCTL_AWARE | ARF_ONLY_CMD | ARF_WPUT_OK, OP_NP,
	    "AR_ENTRY_SQUERY" },
	{ ar_mapping_add,	AR_MAPPING_ADD,		sizeof (arma_t),
	    ARF_IOCTL_AWARE | ARF_ONLY_CMD, OP_CONFIG, "AR_MAPPING_ADD" },
	{ ar_interface_up,	AR_INTERFACE_UP,	sizeof (arc_t),
	    ARF_ONLY_CMD, OP_CONFIG, "AR_INTERFACE_UP" },
	{ ar_interface_down,	AR_INTERFACE_DOWN,	sizeof (arc_t),
	    ARF_ONLY_CMD, OP_CONFIG, "AR_INTERFACE_DOWN" },
	{ ar_interface_on,	AR_INTERFACE_ON,	sizeof (arc_t),
	    ARF_ONLY_CMD, OP_CONFIG, "AR_INTERFACE_ON" },
	{ ar_interface_off,	AR_INTERFACE_OFF,	sizeof (arc_t),
	    ARF_ONLY_CMD, OP_CONFIG, "AR_INTERFACE_OFF" },
	{ ar_ipmp_activate,	AR_IPMP_ACTIVATE,	sizeof (arie_t),
	    ARF_ONLY_CMD, OP_CONFIG, "AR_IPMP_ACTIVATE" },
	{ ar_ipmp_deactivate,	AR_IPMP_DEACTIVATE,	sizeof (arie_t),
	    ARF_ONLY_CMD, OP_CONFIG, "AR_IPMP_DEACTIVATE" },
	{ ar_set_ppa,		(uint32_t)IF_UNITSEL,	sizeof (int),
	    ARF_IOCTL_AWARE | ARF_WPUT_OK, OP_CONFIG, "IF_UNITSEL" },
	{ ar_nd_ioctl,		ND_GET,			1,
	    ARF_IOCTL_AWARE | ARF_WPUT_OK, OP_NP, "ND_GET" },
	{ ar_nd_ioctl,		ND_SET,			1,
	    ARF_IOCTL_AWARE | ARF_WPUT_OK, OP_CONFIG, "ND_SET" },
	{ ar_snmp_msg,		AR_SNMP_MSG, sizeof (struct T_optmgmt_ack),
	    ARF_IOCTL_AWARE | ARF_WPUT_OK | ARF_ONLY_CMD, OP_NP,
	    "AR_SNMP_MSG" },
	{ ar_slifname,		(uint32_t)SIOCSLIFNAME, sizeof (struct lifreq),
	    ARF_IOCTL_AWARE | ARF_WPUT_OK, OP_CONFIG, "SIOCSLIFNAME" }
};
/*
 * Lookup and return an arl appropriate for sending packets with either source
 * hardware address `hw_addr' or source protocol address `ip_addr', in that
 * order.  If neither was specified or neither match, return any arl in the
 * same group as `arl'.  May return NULL if the group has no usable member.
 */
static arl_t *
ar_ipmp_lookup_xmit_arl(arl_t *arl, uchar_t *hw_addr, uint_t hw_addrlen,
    uchar_t *ip_addr)
{
	arlphy_t *ap;
	ace_t	*src_ace;
	arl_t	*xmit_arl = NULL;
	arp_stack_t *as = ARL_TO_ARPSTACK(arl);

	/* Only meaningful for the IPMP meta-interface's arl. */
	ASSERT(arl->arl_flags & ARL_F_IPMP);

	/* First preference: a group member with a matching hardware address. */
	if (hw_addr != NULL && hw_addrlen != 0) {
		xmit_arl = as->as_arl_head;
		for (; xmit_arl != NULL; xmit_arl = xmit_arl->arl_next) {
			/*
			 * There may be arls with the same HW address that are
			 * not in our IPMP group; we don't want those.
			 */
			if (xmit_arl->arl_ipmp_arl != arl)
				continue;

			ap = xmit_arl->arl_phy;
			if (ap != NULL && ap->ap_hw_addrlen == hw_addrlen &&
			    bcmp(ap->ap_hw_addr, hw_addr, hw_addrlen) == 0)
				break;
		}

		DTRACE_PROBE4(xmit_arl_hwsrc, arl_t *, arl, arl_t *,
		    xmit_arl, uchar_t *, hw_addr, uint_t, hw_addrlen);
	}

	/*
	 * Second preference: the arl that owns a permanent ACE for the
	 * source protocol address.
	 */
	if (xmit_arl == NULL && ip_addr != NULL) {
		src_ace = ar_ce_lookup_permanent(as, IP_ARP_PROTO_TYPE, ip_addr,
		    IP_ADDR_LEN);
		if (src_ace != NULL)
			xmit_arl = src_ace->ace_xmit_arl;

		DTRACE_PROBE4(xmit_arl_ipsrc, arl_t *, arl, arl_t *,
		    xmit_arl, uchar_t *, ip_addr, uint_t, IP_ADDR_LEN);
	}

	/* Last resort: any group member other than the IPMP arl itself. */
	if (xmit_arl == NULL) {
		xmit_arl = as->as_arl_head;
		for (; xmit_arl != NULL; xmit_arl = xmit_arl->arl_next)
			if (xmit_arl->arl_ipmp_arl == arl && xmit_arl != arl)
				break;

		DTRACE_PROBE2(xmit_arl_any, arl_t *, arl, arl_t *, xmit_arl);
	}

	return (xmit_arl);
}
/*
 * ARP Cache Entry creation routine.
 * Cache entries are allocated within timer messages and inserted into
 * the global hash list based on protocol and protocol address.
 *
 * All variable-length data (protocol address, mask, extract mask, hardware
 * address) is packed into the single mi_timer mblk immediately after the
 * ace_t itself, so the entry is freed as one unit by mi_timer_free().
 *
 * Returns 0 on success, EINVAL on bad arguments, ENOMEM on allocation
 * failure.
 */
static int
ar_ce_create(arl_t *arl, uint_t proto, uchar_t *hw_addr, uint_t hw_addr_len,
    uchar_t *proto_addr, uint_t proto_addr_len, uchar_t *proto_mask,
    uchar_t *proto_extract_mask, uint_t hw_extract_start, uchar_t *sender_addr,
    uint_t flags)
{
	static ace_t	ace_null;	/* template for zero-initialization */
	ace_t	*ace;
	ace_t	**acep;
	uchar_t	*dst;
	mblk_t	*mp;
	arp_stack_t *as = ARL_TO_ARPSTACK(arl);
	arl_t	*xmit_arl;
	arlphy_t *ap;

	/* Reject flags a caller isn't allowed to set, and missing arl. */
	if ((flags & ~ACE_EXTERNAL_FLAGS_MASK) || arl == NULL)
		return (EINVAL);

	/* IPv4 entries must carry exactly an IPv4-sized protocol address. */
	if (proto_addr == NULL || proto_addr_len == 0 ||
	    (proto == IP_ARP_PROTO_TYPE && proto_addr_len != IP_ADDR_LEN))
		return (EINVAL);

	/* Our own addresses are always published and defended. */
	if (flags & ACE_F_MYADDR)
		flags |= ACE_F_PUBLISH | ACE_F_AUTHORITY;

	/*
	 * Latch a transmit arl for this ace.
	 */
	if (arl->arl_flags & ARL_F_IPMP) {
		ASSERT(proto == IP_ARP_PROTO_TYPE);
		xmit_arl = ar_ipmp_lookup_xmit_arl(arl, hw_addr, hw_addr_len,
		    sender_addr);
	} else {
		xmit_arl = arl;
	}

	if (xmit_arl == NULL || xmit_arl->arl_phy == NULL)
		return (EINVAL);

	ap = xmit_arl->arl_phy;

	if (!hw_addr && hw_addr_len == 0) {
		/*
		 * NOTE(review): exact compare — any additional flag bit
		 * routes to the else branch; confirm this is intended.
		 */
		if (flags == ACE_F_PERMANENT) { /* Not publish */
			/* 224.0.0.0 to zero length address */
			flags |= ACE_F_RESOLVED;
		} else {	/* local address and unresolved case */
			/* Default to our own hardware address. */
			hw_addr = ap->ap_hw_addr;
			hw_addr_len = ap->ap_hw_addrlen;
			if (flags & ACE_F_PUBLISH)
				flags |= ACE_F_RESOLVED;
		}
	} else {
		flags |= ACE_F_RESOLVED;
	}

	/* Handle hw_addr_len == 0 for DL_ENABMULTI_REQ etc. */
	if (hw_addr_len != 0 && hw_addr == NULL)
		return (EINVAL);

	/* A non-empty hardware address must be at least link-sized. */
	if (hw_addr_len < ap->ap_hw_addrlen && hw_addr_len != 0)
		return (EINVAL);

	/* Mapping entries require an extract mask. */
	if (!proto_extract_mask && (flags & ACE_F_MAPPING))
		return (EINVAL);

	/*
	 * If the underlying link doesn't have reliable up/down notification or
	 * if we're working with the IPv4 169.254.0.0/16 Link Local Address
	 * space, then don't use the fast timers.  Otherwise, use them.
	 */
	if (ap->ap_notifies &&
	    !(proto == IP_ARP_PROTO_TYPE && IS_IPV4_LL_SPACE(proto_addr))) {
		flags |= ACE_F_FAST;
	}

	/*
	 * Allocate the timer block to hold the ace.
	 * (ace + proto_addr + proto_addr_mask + proto_extract_mask + hw_addr)
	 */
	mp = mi_timer_alloc(sizeof (ace_t) + proto_addr_len + proto_addr_len +
	    proto_addr_len + hw_addr_len);
	if (!mp)
		return (ENOMEM);
	ace = (ace_t *)mp->b_rptr;
	*ace = ace_null;
	ace->ace_proto = proto;
	ace->ace_mp = mp;
	ace->ace_arl = arl;
	ace->ace_xmit_arl = xmit_arl;

	/* Pack the variable-length fields right after the ace_t. */
	dst = (uchar_t *)&ace[1];

	ace->ace_proto_addr = dst;
	ace->ace_proto_addr_length = proto_addr_len;
	bcopy(proto_addr, dst, proto_addr_len);
	dst += proto_addr_len;
	/*
	 * The proto_mask allows us to add entries which will let us respond
	 * to requests for a group of addresses.  This makes it easy to provide
	 * proxy ARP service for machines that don't understand about the local
	 * subnet structure, if, for example, there are BSD4.2 systems lurking.
	 */
	ace->ace_proto_mask = dst;
	if (proto_mask != NULL) {
		bcopy(proto_mask, dst, proto_addr_len);
		dst += proto_addr_len;
	} else {
		/* No mask supplied: all-ones, i.e. an exact-match entry. */
		while (proto_addr_len-- > 0)
			*dst++ = (uchar_t)~0;
	}

	if (proto_extract_mask != NULL) {
		ace->ace_proto_extract_mask = dst;
		bcopy(proto_extract_mask, dst, ace->ace_proto_addr_length);
		dst += ace->ace_proto_addr_length;
	} else {
		ace->ace_proto_extract_mask = NULL;
	}
	ace->ace_hw_extract_start = hw_extract_start;
	ace->ace_hw_addr_length = hw_addr_len;
	ace->ace_hw_addr = dst;
	if (hw_addr != NULL) {
		bcopy(hw_addr, dst, hw_addr_len);
		dst += hw_addr_len;
	}

	ace->ace_flags = flags;

	/*
	 * Exact-match entries go into the hash table; masked entries live on
	 * the single as_ce_mask_entries list.
	 */
	if (ar_mask_all_ones(ace->ace_proto_mask,
	    ace->ace_proto_addr_length)) {
		acep = ar_ce_hash(as, ace->ace_proto, ace->ace_proto_addr,
		    ace->ace_proto_addr_length);
	} else {
		acep = &as->as_ce_mask_entries;
	}

	/* Doubly-linked insert at the head of the chosen list. */
	if ((ace->ace_next = *acep) != NULL)
		ace->ace_next->ace_ptpn = &ace->ace_next;
	*acep = ace;
	ace->ace_ptpn = acep;
	return (0);
}
/*
 * Delete a cache entry: unlink it from its hash/mask list, fail any
 * outstanding queries with ENXIO, and release (or schedule release of)
 * its timer mblk, which also holds the ace itself.
 */
static void
ar_ce_delete(ace_t *ace)
{
	ace_t	**acep;

	/* Get out of the hash list. */
	acep = ace->ace_ptpn;
	if (ace->ace_next)
		ace->ace_next->ace_ptpn = acep;
	acep[0] = ace->ace_next;
	/* Mark it dying in case we have a timer about to fire. */
	ace->ace_flags |= ACE_F_DYING;
	/* Complete any outstanding queries immediately. */
	ar_query_reply(ace, ENXIO, NULL, (uint32_t)0);
	/* Free the timer, immediately, or when it fires. */
	mi_timer_free(ace->ace_mp);
}
/*
 * ar_ce_walk callback: delete any ace owned by, or transmitting on, the
 * arl that is going away.  ACE_F_PERMANENT is stripped first so that the
 * deletion is unconditional.
 */
static void
ar_ce_delete_per_arl(ace_t *ace, void *arl)
{
	if (ace->ace_arl != arl && ace->ace_xmit_arl != arl)
		return;

	ace->ace_flags &= ~ACE_F_PERMANENT;
	ar_ce_delete(ace);
}
/*
 * ar_ce_walk routine used when deactivating an `arl' in a group.  Deletes
 * `ace' if it was using `arl_arg' as its output interface.
 */
static void
ar_ce_ipmp_deactivate(ace_t *ace, void *arl_arg)
{
	arl_t *arl = arl_arg;

	/* Caller deactivates an underlying interface, never the IPMP arl. */
	ASSERT(!(arl->arl_flags & ARL_F_IPMP));

	if (ace->ace_arl == arl) {
		ASSERT(ace->ace_xmit_arl == arl);
		/*
		 * This ACE is tied to the arl leaving the group (e.g., an
		 * ACE_F_PERMANENT for a test address) and is not used by the
		 * group, so we can leave it be.
		 */
		return;
	}

	/* Entry doesn't transmit on the departing arl: nothing to do. */
	if (ace->ace_xmit_arl != arl)
		return;

	/* Group-owned entry transmitting on the departing arl. */
	ASSERT(ace->ace_arl == arl->arl_ipmp_arl);

	/*
	 * IP should've already sent us messages asking us to move any
	 * ACE_F_MYADDR entries to another arl, but there are two exceptions:
	 *
	 * 1. The group was misconfigured with interfaces that have duplicate
	 *    hardware addresses, but in.mpathd was unable to offline those
	 *    duplicate interfaces.
	 *
	 * 2. The messages from IP were lost or never created (e.g. due to
	 *    memory pressure).
	 *
	 * We handle the first case by just quietly deleting the ACE.  Since
	 * the second case cannot be distinguished from a more serious bug in
	 * the IPMP framework, we ASSERT() that this can't happen on DEBUG
	 * systems, but quietly delete the ACE on production systems (the
	 * deleted ACE will render the IP address unreachable).
	 */
	if (ace->ace_flags & ACE_F_MYADDR) {
		arlphy_t *ap = arl->arl_phy;
		uint_t hw_addrlen = ap->ap_hw_addrlen;

		/* DEBUG-only check for exception 2 described above. */
		ASSERT(hw_addrlen == ace->ace_hw_addr_length &&
		    bcmp(ap->ap_hw_addr, ace->ace_hw_addr, hw_addrlen) == 0);
	}

	/*
	 * NOTE: it's possible this arl got selected as the ace_xmit_arl when
	 * creating an ACE_F_PERMANENT ACE on behalf of an SIOCS*ARP ioctl for
	 * an IPMP IP interface.  But it's still OK for us to delete such an
	 * ACE since ipmp_illgrp_refresh_arpent() will ask us to recreate it
	 * and we'll pick another arl then.
	 */
	ar_ce_delete(ace);
}
/*
 * Cache entry hash routine.  XOR-folds the protocol type and every byte of
 * the protocol address, then returns the address of the matching hash
 * bucket in the per-stack table.
 */
static ace_t **
ar_ce_hash(arp_stack_t *as, uint32_t proto, const uchar_t *proto_addr,
    uint32_t proto_addr_length)
{
	unsigned int hash = proto;
	uint32_t i;

	for (i = 0; i < proto_addr_length; i++)
		hash ^= proto_addr[i];
	return (&as->as_ce_hash_tbl[hash % ARP_HASH_SIZE]);
}
/*
 * Cache entry lookup.  Prefer an exact (non-mapping) match; if none is
 * found, fall back to the mapping entries.  Returns NULL if neither
 * lookup succeeds.
 */
ace_t *
ar_ce_lookup(arl_t *arl, uint32_t proto, const uchar_t *proto_addr,
    uint32_t proto_addr_length)
{
	ace_t	*ace;

	ace = ar_ce_lookup_entry(arl, proto, proto_addr, proto_addr_length);
	if (ace == NULL) {
		ace = ar_ce_lookup_mapping(arl, proto, proto_addr,
		    proto_addr_length);
	}
	return (ace);
}
/*
 * Cache entry lookup.  Try to find an ace matching the parameters passed.
 * Look only for exact entries (no mappings).  An entry matches if it is
 * owned by `arl' (or by arl's IPMP group arl) and the masked protocol
 * address compares equal byte-for-byte.
 */
static ace_t *
ar_ce_lookup_entry(arl_t *arl, uint32_t proto, const uchar_t *proto_addr,
    uint32_t proto_addr_length)
{
	ace_t	*ace;
	arp_stack_t *as = ARL_TO_ARPSTACK(arl);

	if (!proto_addr)
		return (NULL);
	ace = *ar_ce_hash(as, proto, proto_addr, proto_addr_length);
	for (; ace; ace = ace->ace_next) {
		if ((ace->ace_arl == arl ||
		    ace->ace_arl == arl->arl_ipmp_arl) &&
		    ace->ace_proto_addr_length == proto_addr_length &&
		    ace->ace_proto == proto) {
			int	i1 = proto_addr_length;
			uchar_t	*ace_addr = ace->ace_proto_addr;
			uchar_t	*mask = ace->ace_proto_mask;
			/*
			 * Note that the ace_proto_mask is applied to the
			 * proto_addr before comparing to the ace_addr.
			 * The loop returns only after every byte matched
			 * (i.e., i1 underflows).
			 */
			do {
				if (--i1 < 0)
					return (ace);
			} while ((proto_addr[i1] & mask[i1]) == ace_addr[i1]);
		}
	}
	/* Fell off the chain: ace is NULL here. */
	return (ace);
}
/*
 * Extract cache entry lookup parameters from an external command message
 * (an area_t), then call the supplied match function.  Returns NULL if the
 * protocol address can't be pulled out of the message.
 */
static ace_t *
ar_ce_lookup_from_area(arp_stack_t *as, mblk_t *mp, ace_t *matchfn())
{
	uchar_t	*proto_addr;
	area_t	*area = (area_t *)mp->b_rptr;

	/* Validate offset/length and get a pointer into the message. */
	proto_addr = mi_offset_paramc(mp, area->area_proto_addr_offset,
	    area->area_proto_addr_length);
	if (!proto_addr)
		return (NULL);
	return ((*matchfn)(ar_ll_lookup_from_mp(as, mp), area->area_proto,
	    proto_addr, area->area_proto_addr_length));
}
/*
 * Cache entry lookup.  Try to find an ace matching the parameters passed.
 * Look only for mappings, i.e. entries on the as_ce_mask_entries list
 * whose (masked) protocol address matches.
 */
static ace_t *
ar_ce_lookup_mapping(arl_t *arl, uint32_t proto, const uchar_t *proto_addr,
    uint32_t proto_addr_length)
{
	ace_t	*ace;
	arp_stack_t *as = ARL_TO_ARPSTACK(arl);

	if (!proto_addr)
		return (NULL);
	ace = as->as_ce_mask_entries;
	for (; ace; ace = ace->ace_next) {
		/* Unlike exact lookup, mappings must be owned by arl itself. */
		if (ace->ace_arl == arl &&
		    ace->ace_proto_addr_length == proto_addr_length &&
		    ace->ace_proto == proto) {
			int	i1 = proto_addr_length;
			uchar_t	*ace_addr = ace->ace_proto_addr;
			uchar_t	*mask = ace->ace_proto_mask;
			/*
			 * Note that the ace_proto_mask is applied to the
			 * proto_addr before comparing to the ace_addr.
			 * Returns only after every byte matched.
			 */
			do {
				if (--i1 < 0)
					return (ace);
			} while ((proto_addr[i1] & mask[i1]) == ace_addr[i1]);
		}
	}
	/* Fell off the list: ace is NULL here. */
	return (ace);
}
/*
 * Look for a permanent entry for proto_addr across all interfaces, by
 * walking the hash bucket and skipping anything without ACE_F_PERMANENT.
 * Returns NULL if no permanent entry matches.
 */
static ace_t *
ar_ce_lookup_permanent(arp_stack_t *as, uint32_t proto, uchar_t *proto_addr,
    uint32_t proto_addr_length)
{
	ace_t	*ace;

	ace = *ar_ce_hash(as, proto, proto_addr, proto_addr_length);
	for (; ace != NULL; ace = ace->ace_next) {
		if (!(ace->ace_flags & ACE_F_PERMANENT))
			continue;
		if (ace->ace_proto_addr_length == proto_addr_length &&
		    ace->ace_proto == proto) {
			int	i1 = proto_addr_length;
			uchar_t	*ace_addr = ace->ace_proto_addr;
			uchar_t	*mask = ace->ace_proto_mask;

			/*
			 * Note that the ace_proto_mask is applied to the
			 * proto_addr before comparing to the ace_addr.
			 * Returns only after every byte matched.
			 */
			do {
				if (--i1 < 0)
					return (ace);
			} while ((proto_addr[i1] & mask[i1]) == ace_addr[i1]);
		}
	}
	/* Fell off the chain: ace is NULL here. */
	return (ace);
}
/*
 * ar_ce_resolve is called when a response comes in to an outstanding request.
 * Returns 'true' if the address has changed and we need to tell the client.
 * (We don't need to tell the client if there's still an outstanding query.)
 *
 * Note: if hw_addr_length doesn't match the ace's stored length, the entry
 * is left untouched (not even marked resolved) and B_FALSE is returned.
 */
static boolean_t
ar_ce_resolve(ace_t *ace, const uchar_t *hw_addr, uint32_t hw_addr_length)
{
	boolean_t hwchanged;

	if (hw_addr_length == ace->ace_hw_addr_length) {
		ASSERT(ace->ace_hw_addr != NULL);
		hwchanged = bcmp(hw_addr, ace->ace_hw_addr,
		    hw_addr_length) != 0;
		if (hwchanged)
			bcopy(hw_addr, ace->ace_hw_addr, hw_addr_length);
		/*
		 * No need to bother with ar_query_reply if no queries are
		 * waiting.
		 */
		ace->ace_flags |= ACE_F_RESOLVED;
		if (ace->ace_query_mp != NULL)
			ar_query_reply(ace, 0, NULL, (uint32_t)0);
		if (hwchanged)
			return (B_TRUE);
	}
	return (B_FALSE);
}
/*
 * There are 2 functions performed by this function.
 * 1. Resolution of unresolved entries and update of resolved entries.
 * 2. Detection of nodes with our own IP address (duplicates).
 *
 * If the resolving ARL is in the same group as a matching ACE's ARL, then
 * update the ACE.  Otherwise, make no updates.
 *
 * For all entries, we first check to see if this is a duplicate (probable
 * loopback) message.  If so, then just ignore it.
 *
 * Next, check to see if the entry has completed DAD.  If not, then we've
 * failed, because someone is already using the address.  Notify IP of the DAD
 * failure and remove the broken ace.
 *
 * Next, we check if we're the authority for this address.  If so, then it's
 * time to defend it, because the other node is a duplicate.  Report it as a
 * 'bogon' and let IP decide how to defend.
 *
 * Finally, if it's unresolved or if the arls match, we just update the MAC
 * address.  This allows a published 'static' entry to be updated by an ARP
 * request from the node for which we're a proxy ARP server.
 *
 * Note that this logic does not update published ARP entries for mismatched
 * arls, as for example when we proxy arp across 2 subnets with differing
 * subnet masks.
 *
 * Return Values below
 */
#define	AR_NOTFOUND	1	/* No matching ace found in cache */
#define	AR_MERGED	2	/* Matching ace updated (RFC 826 Merge_flag) */
#define	AR_LOOPBACK	3	/* Our own arp packet was received */
#define	AR_BOGON	4	/* Another host has our IP addr. */
#define	AR_FAILED	5	/* Duplicate Address Detection has failed */
#define	AR_CHANGED	6	/* Address has changed; tell IP (and merged) */

static int
ar_ce_resolve_all(arl_t *arl, uint32_t proto, const uchar_t *src_haddr,
    uint32_t hlen, const uchar_t *src_paddr, uint32_t plen, arl_t **ace_arlp)
{
	ace_t	*ace;
	ace_t	*ace_next;
	int	i1;
	const uchar_t *paddr;
	uchar_t	*ace_addr;
	uchar_t	*mask;
	int	retv = AR_NOTFOUND;
	arp_stack_t *as = ARL_TO_ARPSTACK(arl);

	ace = *ar_ce_hash(as, proto, src_paddr, plen);
	for (; ace != NULL; ace = ace_next) {

		/* ar_ce_resolve may delete the ace; fetch next pointer now */
		ace_next = ace->ace_next;

		if (ace->ace_proto_addr_length != plen ||
		    ace->ace_proto != proto) {
			continue;
		}

		/*
		 * Note that the ace_proto_mask is applied to the proto_addr
		 * before comparing to the ace_addr.
		 */
		paddr = src_paddr;
		i1 = plen;
		ace_addr = ace->ace_proto_addr;
		mask = ace->ace_proto_mask;
		while (--i1 >= 0) {
			if ((*paddr++ & *mask++) != *ace_addr++)
				break;
		}
		/* i1 >= 0 means the byte compare broke out early: no match */
		if (i1 >= 0)
			continue;

		/* Report the owning arl of the (last) matched entry. */
		*ace_arlp = ace->ace_arl;

		/*
		 * If the IP address is ours, and the hardware address matches
		 * one of our own arls, then this is a broadcast packet
		 * emitted by one of our interfaces, reflected by the switch
		 * and received on another interface.  We return AR_LOOPBACK.
		 */
		if (ace->ace_flags & ACE_F_MYADDR) {
			arl_t	*hw_arl = as->as_arl_head;
			arlphy_t *ap;

			for (; hw_arl != NULL; hw_arl = hw_arl->arl_next) {
				ap = hw_arl->arl_phy;
				if (ap != NULL && ap->ap_hw_addrlen == hlen &&
				    bcmp(ap->ap_hw_addr, src_haddr, hlen) == 0)
					return (AR_LOOPBACK);
			}
		}

		/*
		 * If the entry is unverified, then we've just verified that
		 * someone else already owns this address, because this is a
		 * message with the same protocol address but different
		 * hardware address.  NOTE: the ace_xmit_arl check ensures we
		 * don't send duplicate AR_FAILEDs if arl is in an IPMP group.
		 */
		if ((ace->ace_flags & ACE_F_UNVERIFIED) &&
		    arl == ace->ace_xmit_arl) {
			ar_ce_delete(ace);
			return (AR_FAILED);
		}

		/*
		 * If the IP address matches ours and we're authoritative for
		 * this entry, then some other node is using our IP addr, so
		 * return AR_BOGON.  Also reset the transmit count to zero so
		 * that, if we're currently in initial announcement mode, we
		 * switch back to the lazier defense mode.  Knowing that
		 * there's at least one duplicate out there, we ought not
		 * blindly announce.  NOTE: the ace_xmit_arl check ensures we
		 * don't send duplicate AR_BOGONs if arl is in an IPMP group.
		 */
		if ((ace->ace_flags & ACE_F_AUTHORITY) &&
		    arl == ace->ace_xmit_arl) {
			ace->ace_xmit_count = 0;
			return (AR_BOGON);
		}

		/*
		 * Only update this ACE if it's on the same network -- i.e.,
		 * it's for our ARL or another ARL in the same IPMP group.
		 */
		if (ace->ace_arl == arl || ace->ace_arl == arl->arl_ipmp_arl) {
			/* AR_CHANGED sticks even if later entries only merge */
			if (ar_ce_resolve(ace, src_haddr, hlen))
				retv = AR_CHANGED;
			else if (retv == AR_NOTFOUND)
				retv = AR_MERGED;
		}
	}

	if (retv == AR_NOTFOUND)
		*ace_arlp = NULL;
	return (retv);
}
/*
 * Invoke `pfi' with `arg1' on every ace in existence: all hash buckets
 * first, then the masked (mapping) entries.  The successor pointer is
 * saved before each callback so the callee is free to delete the current
 * ace out from under us.
 */
static void
ar_ce_walk(arp_stack_t *as, void (*pfi)(ace_t *, void *), void *arg1)
{
	ace_t	*cur;
	ace_t	*next;
	int	i;

	for (i = 0; i < ARP_HASH_SIZE; i++) {
		for (cur = as->as_ce_hash_tbl[i]; cur != NULL; cur = next) {
			next = cur->ace_next;
			(*pfi)(cur, arg1);
		}
	}
	for (cur = as->as_ce_mask_entries; cur != NULL; cur = next) {
		next = cur->ace_next;
		(*pfi)(cur, arg1);
	}
}
/*
 * Send a copy of interesting packets to the corresponding IP instance.
 * The corresponding IP instance is the ARP-IP-DEV instance for this
 * DEV (i.e. ARL).  The packet `mp' is wrapped in an M_CTL message
 * carrying an arcn_t header plus the interface name, and sent up the
 * associated IP stream.  Consumes `mp' on all paths.
 */
static void
ar_client_notify(const arl_t *arl, mblk_t *mp, int code)
{
	ar_t	*ar = ((ar_t *)arl->arl_rq->q_ptr)->ar_arl_ip_assoc;
	arcn_t	*arcn;
	mblk_t	*mp1;
	int	arl_namelen = strlen(arl->arl_name) + 1; /* include NUL */

	/* Looks like the association disappeared */
	if (ar == NULL) {
		freemsg(mp);
		return;
	}

	/* ar is the corresponding ARP-IP instance for this ARL */
	ASSERT(ar->ar_arl == NULL && ar->ar_wq->q_next != NULL);

	/* arcn_t header immediately followed by the interface name */
	mp1 = allocb(sizeof (arcn_t) + arl_namelen, BPRI_MED);
	if (mp1 == NULL) {
		freemsg(mp);
		return;
	}
	DB_TYPE(mp1) = M_CTL;
	mp1->b_cont = mp;	/* original packet chained behind */
	arcn = (arcn_t *)mp1->b_rptr;
	mp1->b_wptr = (uchar_t *)&arcn[1] + arl_namelen;
	arcn->arcn_cmd = AR_CLIENT_NOTIFY;
	arcn->arcn_name_offset = sizeof (arcn_t);
	arcn->arcn_name_length = arl_namelen;
	arcn->arcn_code = code;
	bcopy(arl->arl_name, &arcn[1], arl_namelen);
	putnext(ar->ar_wq, mp1);
}
/*
 * Send a delete-notify message down to IP. We've determined that IP doesn't
 * have a cache entry for the IP address itself, but it may have other cache
 * entries with the same hardware address, and we don't want to see those grow
 * stale. (The alternative is sending down updates for every ARP message we
 * get that doesn't match an existing ace. That's much more expensive than an
 * occasional delete and reload.)
 *
 * The notification is a fabricated ARP response with arh_hlen == 0 (no
 * hardware addresses present) whose sender and target protocol addresses
 * are both the ace's protocol address; it is sent via ar_client_notify
 * with code AR_CN_ANNOUNCE.  Silently does nothing on allocation failure.
 */
static void
ar_delete_notify(const ace_t *ace)
{
	const arl_t	*arl = ace->ace_arl;
	const arlphy_t	*ap = ace->ace_xmit_arl->arl_phy;
	mblk_t		*mp;
	size_t		len;
	arh_t		*arh;

	/* Fixed header plus sender and target protocol addresses */
	len = sizeof (*arh) + 2 * ace->ace_proto_addr_length;
	mp = allocb(len, BPRI_MED);
	if (mp == NULL)
		return;
	arh = (arh_t *)mp->b_rptr;
	mp->b_wptr = (uchar_t *)arh + len;
	U16_TO_BE16(ap->ap_arp_hw_type, arh->arh_hardware);
	U16_TO_BE16(ace->ace_proto, arh->arh_proto);
	arh->arh_hlen = 0;	/* no hardware addresses in this message */
	arh->arh_plen = ace->ace_proto_addr_length;
	U16_TO_BE16(ARP_RESPONSE, arh->arh_operation);
	/* Sender proto addr, then target proto addr: both the ace's */
	bcopy(ace->ace_proto_addr, arh + 1, ace->ace_proto_addr_length);
	bcopy(ace->ace_proto_addr, (uchar_t *)(arh + 1) +
	    ace->ace_proto_addr_length, ace->ace_proto_addr_length);
	ar_client_notify(arl, mp, AR_CN_ANNOUNCE);
}
/*
 * ARP module close routine.  Handles both flavors of stream:
 *  - a plain client / ARP-IP-Driver stream (ar->ar_arl == NULL), where we
 *    notify IP, reap pending queries, and clean up queued commands; and
 *  - an arl control stream, where we drain pending commands, take the
 *    interface down, detach it from any IPMP group, delete its aces,
 *    and unlink and free the arl itself.
 * In both cases the ARL<->IP association is severed, instance data is
 * freed, and an NE_UNPLUMB hook event is fired if an interface went away.
 */
static int
ar_close(queue_t *q)
{
	ar_t	*ar = (ar_t *)q->q_ptr;
	char	name[LIFNAMSIZ];
	arl_t	*arl, *xarl;
	arl_t	**arlp;
	cred_t	*cr;
	arc_t	*arc;
	mblk_t	*mp1;
	int	index;
	arp_stack_t *as = ar->ar_as;

	TRACE_1(TR_FAC_ARP, TR_ARP_CLOSE,
	    "arp_close: q %p", q);
	arl = ar->ar_arl;
	if (arl == NULL) {
		index = 0;
		/*
		 * If this is the <ARP-IP-Driver> stream send down
		 * a closing message to IP and wait for IP to send
		 * an ack. This helps to make sure that messages
		 * that are currently being sent up by IP are not lost.
		 */
		if (ar->ar_on_ill_stream) {
			mp1 = allocb(sizeof (arc_t), BPRI_MED);
			if (mp1 != NULL) {
				DB_TYPE(mp1) = M_CTL;
				arc = (arc_t *)mp1->b_rptr;
				mp1->b_wptr = mp1->b_rptr + sizeof (arc_t);
				arc->arc_cmd = AR_ARP_CLOSING;
				putnext(WR(q), mp1);
				/* Block until IP acks the close */
				while (!ar->ar_ip_acked_close)
					/* If we are interrupted break out */
					if (qwait_sig(q) == 0)
						break;
			}
		}
		/* Delete all our pending queries, 'arl' is not dereferenced */
		ar_ce_walk(as, ar_query_delete, ar);
		/*
		 * The request could be pending on some arl_queue also. This
		 * happens if the arl is not yet bound, and bind is pending.
		 */
		ar_ll_cleanup_arl_queue(q);
	} else {
		/* Save for the NE_UNPLUMB hook event after the arl is gone */
		index = arl->arl_index;
		(void) strcpy(name, arl->arl_name);
		arl->arl_closing = 1;
		/* Drain queued commands, then take the interface down */
		while (arl->arl_queue != NULL)
			qwait(arl->arl_rq);
		if (arl->arl_state == ARL_S_UP)
			ar_ll_down(arl);
		while (arl->arl_state != ARL_S_DOWN)
			qwait(arl->arl_rq);
		if (arl->arl_flags & ARL_F_IPMP) {
			/*
			 * Though rude, someone could force the IPMP arl
			 * closed without removing the underlying interfaces.
			 * In that case, force the ARLs out of the group.
			 */
			xarl = as->as_arl_head;
			for (; xarl != NULL; xarl = xarl->arl_next) {
				if (xarl->arl_ipmp_arl != arl || xarl == arl)
					continue;
				ar_ce_walk(as, ar_ce_ipmp_deactivate, xarl);
				xarl->arl_ipmp_arl = NULL;
			}
		}
		ar_ll_clear_defaults(arl);
		/*
		 * If this is the control stream for an arl, delete anything
		 * hanging off our arl.
		 */
		ar_ce_walk(as, ar_ce_delete_per_arl, arl);
		/* Free any messages waiting for a bind_ack */
		/* Get the arl out of the chain. */
		rw_enter(&as->as_arl_lock, RW_WRITER);
		for (arlp = &as->as_arl_head; *arlp;
		    arlp = &(*arlp)->arl_next) {
			if (*arlp == arl) {
				*arlp = arl->arl_next;
				break;
			}
		}
		ASSERT(arl->arl_dlpi_deferred == NULL);
		ar->ar_arl = NULL;
		rw_exit(&as->as_arl_lock);
		mi_free((char *)arl);
	}
	/* Let's break the association between an ARL and IP instance */
	if (ar->ar_arl_ip_assoc != NULL) {
		ASSERT(ar->ar_arl_ip_assoc->ar_arl_ip_assoc != NULL &&
		    ar->ar_arl_ip_assoc->ar_arl_ip_assoc == ar);
		ar->ar_arl_ip_assoc->ar_arl_ip_assoc = NULL;
		ar->ar_arl_ip_assoc = NULL;
	}
	cr = ar->ar_credp;
	/* mi_close_comm frees the instance data. */
	(void) mi_close_comm(&as->as_head, q);
	qprocsoff(q);
	crfree(cr);
	/* Announce the unplumb to the hook framework */
	if (index != 0) {
		hook_nic_event_t info;

		info.hne_nic = index;
		info.hne_lif = 0;
		info.hne_event = NE_UNPLUMB;
		info.hne_data = name;
		info.hne_datalen = strlen(name);
		(void) hook_run(as->as_net_data->netd_hooks,
		    as->as_arpnicevents, (hook_data_t)&info);
	}
	netstack_rele(as->as_netstack);
	return (0);
}
/*
 * Dispatch routine for ARP commands. This routine can be called out of
 * either ar_wput or ar_rput, in response to IOCTLs or M_PROTO messages.
 * Looks the command up in ar_cmd_tbl, performs length and privilege
 * checks, and invokes the handler.  Returns ENOENT for anything that
 * should be passed downstream rather than handled here.
 */
/* TODO: error reporting for M_PROTO case */
static int
ar_cmd_dispatch(queue_t *q, mblk_t *mp_orig, boolean_t from_wput)
{
	arct_t	*arct;
	uint32_t	cmd;
	ssize_t	len;
	mblk_t	*mp = mp_orig;
	cred_t	*cr = NULL;

	if (!mp)
		return (ENOENT);
	/* We get both M_PROTO and M_IOCTL messages, so watch out! */
	if (DB_TYPE(mp) == M_IOCTL) {
		struct iocblk *ioc;

		/* Command and credentials come from the iocblk */
		ioc = (struct iocblk *)mp->b_rptr;
		cmd = ioc->ioc_cmd;
		cr = ioc->ioc_cr;
		mp = mp->b_cont;	/* payload is the continuation */
		if (!mp)
			return (ENOENT);
	}
	len = MBLKL(mp);
	if (len < sizeof (uint32_t) || !OK_32PTR(mp->b_rptr))
		return (ENOENT);
	/* M_PROTO case: the first word of the payload is the command */
	if (mp_orig == mp)
		cmd = *(uint32_t *)mp->b_rptr;
	/* Linear search of the command table */
	for (arct = ar_cmd_tbl; ; arct++) {
		if (arct >= A_END(ar_cmd_tbl))
			return (ENOENT);
		if (arct->arct_cmd == cmd)
			break;
	}
	if (len < arct->arct_min_len) {
		/*
		 * If the command is exclusive to ARP, we return EINVAL,
		 * else we need to pass the command downstream, so return
		 * ENOENT
		 */
		return ((arct->arct_flags & ARF_ONLY_CMD) ? EINVAL : ENOENT);
	}
	/* Privilege check if the command requires it */
	if (arct->arct_priv_req != OP_NP) {
		int error;

		if (cr == NULL)
			cr = DB_CREDDEF(mp_orig, ((ar_t *)q->q_ptr)->ar_credp);
		if ((error = secpolicy_ip(cr, arct->arct_priv_req,
		    B_FALSE)) != 0)
			return (error);
	}
	/* Disallow many commands except if from rput i.e. from IP */
	if (from_wput && !(arct->arct_flags & ARF_WPUT_OK)) {
		return (EINVAL);
	}
	/* IOCTL-aware handlers get the whole M_IOCTL, not just payload */
	if (arct->arct_flags & ARF_IOCTL_AWARE)
		mp = mp_orig;
	DTRACE_PROBE3(cmd_dispatch, queue_t *, q, mblk_t *, mp,
	    arct_t *, arct);
	return (*arct->arct_pfi)(q, mp);
}
/*
 * Allocate a DLPI request of `size' bytes, zero it, stamp the primitive,
 * and pick the right STREAMS message type.  Returns NULL on allocation
 * failure.
 */
static mblk_t *
ar_dlpi_comm(t_uscalar_t prim, size_t size)
{
	mblk_t	*mp = allocb(size, BPRI_HI);

	if (mp == NULL)
		return (NULL);

	/*
	 * DLPIv2 says that DL_INFO_REQ and DL_TOKEN_REQ (the latter
	 * of which we don't seem to use) are sent with M_PCPROTO, and
	 * that other DLPI are M_PROTO.
	 */
	DB_TYPE(mp) = (prim == DL_INFO_REQ) ? M_PCPROTO : M_PROTO;
	bzero(mp->b_rptr, size);
	mp->b_wptr = mp->b_rptr + size;
	((union DL_primitives *)mp->b_rptr)->dl_primitive = prim;
	return (mp);
}
/*
 * Serialize DLPI messages to the driver, much along the lines of
 * ill_dlpi_send and ill_dlpi_done in IP: only one DLPI exchange is
 * outstanding at a time.  If one is already in flight, append `mp'
 * to the deferred list; otherwise record it as pending and send it
 * downstream immediately.
 */
static void
ar_dlpi_send(arl_t *arl, mblk_t *mp)
{
	mblk_t	**tailp;

	ASSERT(arl != NULL);
	ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);

	if (arl->arl_dlpi_pending == DL_PRIM_INVAL) {
		/* Nothing outstanding: this message goes right down. */
		arl->arl_dlpi_pending =
		    ((union DL_primitives *)mp->b_rptr)->dl_primitive;
		DTRACE_PROBE2(dlpi_send, arl_t *, arl, mblk_t *, mp);
		putnext(arl->arl_wq, mp);
		return;
	}

	/* An exchange is in flight; tail-insert onto the deferred list. */
	tailp = &arl->arl_dlpi_deferred;
	while (*tailp != NULL)
		tailp = &(*tailp)->b_next;
	*tailp = mp;
	DTRACE_PROBE2(dlpi_defer, arl_t *, arl, mblk_t *, mp);
}
/*
 * Called when an DLPI control message has been acked; send down the next
 * queued message (if any).
 * The DLPI messages of interest being bind, attach, unbind and detach since
 * these are the only ones sent by ARP via ar_dlpi_send.
 * If nothing more is deferred, mark the channel idle and restart any
 * queued ARP command via ar_cmd_done().
 */
static void
ar_dlpi_done(arl_t *arl, t_uscalar_t prim)
{
	mblk_t	*mp;

	/* Ignore acks that don't match the primitive we're waiting on */
	if (arl->arl_dlpi_pending != prim) {
		DTRACE_PROBE2(dlpi_done_unexpected, arl_t *, arl,
		    t_uscalar_t, prim);
		return;
	}

	if ((mp = arl->arl_dlpi_deferred) == NULL) {
		/* Nothing deferred: channel is idle; resume commands */
		DTRACE_PROBE2(dlpi_done_idle, arl_t *, arl, t_uscalar_t, prim);
		arl->arl_dlpi_pending = DL_PRIM_INVAL;
		ar_cmd_done(arl);
		return;
	}

	/* Unlink the next deferred message and send it down */
	arl->arl_dlpi_deferred = mp->b_next;
	mp->b_next = NULL;
	ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);
	arl->arl_dlpi_pending =
	    ((union DL_primitives *)mp->b_rptr)->dl_primitive;
	DTRACE_PROBE2(dlpi_done_next, arl_t *, arl, mblk_t *, mp);
	putnext(arl->arl_wq, mp);
}
/*
 * Complete the command at the head of arl_queue (if one is marked
 * CMD_IN_PROGRESS) and then drain any commands that were enqueued while
 * it was waiting on the driver.  For AR_INTERFACE_UP, this is also where
 * the ARL<->IP instance association is established and the AR_DLPIOP_DONE
 * response is prepared; the response is deliberately sent only after the
 * drain loop (see comment below).
 */
static void
ar_cmd_done(arl_t *arl)
{
	mblk_t	*mp;
	int	cmd;
	int	err;
	mblk_t	*mp1;
	mblk_t	*dlpi_op_done_mp = NULL;
	queue_t	*dlpi_op_done_q;
	ar_t	*ar_arl;
	ar_t	*ar_ip;
	queue_t	*q;

	ASSERT(arl->arl_state == ARL_S_UP || arl->arl_state == ARL_S_DOWN);

	/*
	 * If the current operation was initiated by IP there must be
	 * an op enqueued in arl_queue. But if ar_close has sent down
	 * a detach/unbind, there is no command enqueued. Also if the IP-ARP
	 * stream has closed the cleanup would be done and there won't be any mp
	 */
	if ((mp = arl->arl_queue) == NULL)
		return;

	if ((cmd = (uintptr_t)mp->b_prev) & CMD_IN_PROGRESS) {
		mp1 = ar_cmd_dequeue(arl);
		ASSERT(mp == mp1);

		cmd &= ~CMD_IN_PROGRESS;
		if (cmd == AR_INTERFACE_UP) {
			/*
			 * There is an ioctl waiting for us...
			 */
			if (arl->arl_state == ARL_S_UP)
				err = 0;
			else
				err = EINVAL;

			dlpi_op_done_mp = ar_alloc(AR_DLPIOP_DONE, err);
			if (dlpi_op_done_mp != NULL) {
				/*
				 * Better performance if we send the response
				 * after the potential MAPPING_ADDs command
				 * that are likely to follow. (Do it below the
				 * while loop, instead of putnext right now)
				 */
				dlpi_op_done_q = WR(mp->b_queue);
			}

			if (err == 0) {
				/*
				 * Now that we have the ARL instance
				 * corresponding to the IP instance let's make
				 * the association here.
				 */
				ar_ip = (ar_t *)mp->b_queue->q_ptr;
				ar_arl = (ar_t *)arl->arl_rq->q_ptr;
				ar_arl->ar_arl_ip_assoc = ar_ip;
				ar_ip->ar_arl_ip_assoc = ar_arl;
			}
		}
		inet_freemsg(mp);
	}

	/*
	 * Run the commands that have been enqueued while we were waiting
	 * for the last command (AR_INTERFACE_UP or AR_INTERFACE_DOWN)
	 * to complete.
	 */
	while ((mp = ar_cmd_dequeue(arl)) != NULL) {
		/* Mark as a drain-initiated restart for the handlers */
		mp->b_prev = AR_DRAINING;
		q = mp->b_queue;
		mp->b_queue = NULL;

		/*
		 * Don't call put(q, mp) since it can lead to reorder of
		 * messages by sending the current messages to the end of
		 * arp's syncq
		 */
		if (q->q_flag & QREADR)
			ar_rput(q, mp);
		else
			ar_wput(q, mp);

		if ((mp = arl->arl_queue) == NULL)
			goto done;	/* no work to do */

		if ((cmd = (uintptr_t)mp->b_prev) & CMD_IN_PROGRESS) {
			/*
			 * The current command is an AR_INTERFACE_UP or
			 * AR_INTERFACE_DOWN and is waiting for a DLPI ack
			 * from the driver. Return. We can't make progress now.
			 */
			goto done;
		}
	}

done:
	/* Deferred AR_DLPIOP_DONE response (see above) */
	if (dlpi_op_done_mp != NULL) {
		DTRACE_PROBE3(cmd_done_next, arl_t *, arl,
		    queue_t *, dlpi_op_done_q, mblk_t *, dlpi_op_done_mp);
		putnext(dlpi_op_done_q, dlpi_op_done_mp);
	}
}
/*
 * Queue all arp commands coming from clients (typically IP).  Commands
 * are serviced FIFO.  Some commands (AR_INTERFACE_UP/DOWN) must wait for
 * the driver's DLPI response and are restarted from ar_dlpi_done, which
 * then drains arl_queue via ar_rput/ar_wput.  AR_DRAINING in b_prev
 * signals such a drain-initiated restart to the individual handlers.
 *
 * mblk field usage for enqueued commands:
 *	b_next	- threads the queued command mblks
 *	b_queue	- the originating (client) queue
 *	b_prev	- the command code itself, for easy parsing
 *
 * The head entry always carries CMD_IN_PROGRESS; a head insert is used
 * to restart a command, a tail insert for newly arrived ones.
 */
static void
ar_cmd_enqueue(arl_t *arl, mblk_t *mp, queue_t *q, ushort_t cmd,
    boolean_t tail_insert)
{
	uintptr_t	tag = cmd;

	mp->b_queue = q;

	if (arl->arl_queue == NULL) {
		/* Empty queue: this command starts right away. */
		ASSERT(arl->arl_queue_tail == NULL);
		mp->b_prev = (void *)(tag | CMD_IN_PROGRESS);
		mp->b_next = NULL;
		arl->arl_queue = mp;
		arl->arl_queue_tail = mp;
	} else if (tail_insert) {
		/* Newly received command waits its turn. */
		mp->b_prev = (void *)tag;
		mp->b_next = NULL;
		arl->arl_queue_tail->b_next = mp;
		arl->arl_queue_tail = mp;
	} else {
		/* Head insert: restarting the in-progress command. */
		mp->b_prev = (void *)(tag | CMD_IN_PROGRESS);
		mp->b_next = arl->arl_queue;
		arl->arl_queue = mp;
	}
}
/*
 * Remove and return the command at the head of arl_queue, or NULL if
 * the queue is empty.  Keeps arl_queue_tail consistent.
 */
static mblk_t *
ar_cmd_dequeue(arl_t *arl)
{
	mblk_t	*head = arl->arl_queue;

	if (head == NULL) {
		ASSERT(arl->arl_queue_tail == NULL);
		return (NULL);
	}

	arl->arl_queue = head->b_next;
	if (arl->arl_queue == NULL)
		arl->arl_queue_tail = NULL;
	head->b_next = NULL;
	return (head);
}
/*
 * Standard ACE timer handling: compute 'fuzz' around a central value or from 0
 * up to a value, and then set the timer. The randomization is necessary to
 * prevent groups of systems from falling into synchronization on the network
 * and producing ARP packet storms.
 *
 * initial_time == B_TRUE picks a uniformly random delay in [1 .. interval];
 * otherwise the delay is jittered by +/-20% around the configured interval.
 */
static void
ace_set_timer(ace_t *ace, boolean_t initial_time)
{
	clock_t	intv, rnd, frac;

	(void) random_get_pseudo_bytes((uint8_t *)&rnd, sizeof (rnd));
	/* Note that clock_t is signed; must chop off bits */
	rnd &= (1ul << (NBBY * sizeof (rnd) - 1)) - 1;
	intv = ace->ace_xmit_interval;
	if (initial_time) {
		/* Set intv to be anywhere in the [1 .. intv] range */
		if (intv <= 0)
			intv = 1;
		else
			intv = (rnd % intv) + 1;
	} else {
		/* Compute 'frac' as 20% of the configured interval */
		if ((frac = intv / 5) <= 1)
			frac = 2;
		/* Set intv randomly in the range [intv-frac .. intv+frac] */
		if ((intv = intv - frac + rnd % (2 * frac + 1)) <= 0)
			intv = 1;
	}
	mi_timer(ace->ace_arl->arl_wq, ace->ace_mp, intv);
}
/*
 * Process entry add requests from external messages.
 * It is also called by ip_rput_dlpi_writer() through
 * ipif_resolver_up() to change hardware address when
 * an asynchronous hardware address change notification
 * arrives from the driver.
 *
 * Any existing entry for the address is replaced.  If the new entry is
 * published, either duplicate address detection (DAD) is kicked off or a
 * gratuitous ARP announcement is sent, depending on whether the address
 * still needs verification.
 */
static int
ar_entry_add(queue_t *q, mblk_t *mp_orig)
{
	area_t	*area;
	ace_t	*ace;
	uchar_t	*hw_addr;
	uint32_t	hw_addr_len;
	uchar_t	*proto_addr;
	uint32_t	proto_addr_len;
	uchar_t	*proto_mask;
	arl_t	*arl;
	mblk_t	*mp = mp_orig;
	int	err;
	uint_t	aflags;
	boolean_t	unverified;
	arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as;

	/* We handle both M_IOCTL and M_PROTO messages. */
	if (DB_TYPE(mp) == M_IOCTL)
		mp = mp->b_cont;
	arl = ar_ll_lookup_from_mp(as, mp);
	if (arl == NULL)
		return (EINVAL);
	/*
	 * Newly received commands from clients go to the tail of the queue.
	 */
	if (CMD_NEEDS_QUEUEING(mp_orig, arl)) {
		DTRACE_PROBE3(eadd_enqueued, queue_t *, q, mblk_t *, mp_orig,
		    arl_t *, arl);
		ar_cmd_enqueue(arl, mp_orig, q, AR_ENTRY_ADD, B_TRUE);
		return (EINPROGRESS);
	}
	mp_orig->b_prev = NULL;	/* clear any AR_DRAINING marker */

	area = (area_t *)mp->b_rptr;
	aflags = area->area_flags;
	/*
	 * If the previous entry wasn't published and we are now going
	 * to publish, then we need to do address verification. The previous
	 * entry may have been a local unpublished address or even an external
	 * address. If the entry we find was in an unverified state we retain
	 * this.
	 * If it's a new published entry, then we're obligated to do
	 * duplicate address detection now.
	 */
	ace = ar_ce_lookup_from_area(as, mp, ar_ce_lookup_entry);
	if (ace != NULL) {
		unverified = !(ace->ace_flags & ACE_F_PUBLISH) &&
		    (aflags & ACE_F_PUBLISH);
		if (ace->ace_flags & ACE_F_UNVERIFIED)
			unverified = B_TRUE;
		ar_ce_delete(ace);	/* old entry is always replaced */
	} else {
		unverified = (aflags & ACE_F_PUBLISH) != 0;
	}

	/* Allow client to request DAD restart */
	if (aflags & ACE_F_UNVERIFIED)
		unverified = B_TRUE;

	/* Extract parameters from the message. */
	hw_addr_len = area->area_hw_addr_length;
	hw_addr = mi_offset_paramc(mp, area->area_hw_addr_offset, hw_addr_len);
	proto_addr_len = area->area_proto_addr_length;
	proto_addr = mi_offset_paramc(mp, area->area_proto_addr_offset,
	    proto_addr_len);
	proto_mask = mi_offset_paramc(mp, area->area_proto_mask_offset,
	    proto_addr_len);
	if (proto_mask == NULL) {
		DTRACE_PROBE2(eadd_bad_mask, arl_t *, arl, area_t *, area);
		return (EINVAL);
	}
	/* MAPPING/UNVERIFIED/DEFEND flags are managed internally, not set here */
	err = ar_ce_create(
	    arl,
	    area->area_proto,
	    hw_addr,
	    hw_addr_len,
	    proto_addr,
	    proto_addr_len,
	    proto_mask,
	    NULL,
	    (uint32_t)0,
	    NULL,
	    aflags & ~ACE_F_MAPPING & ~ACE_F_UNVERIFIED & ~ACE_F_DEFEND);
	if (err != 0) {
		DTRACE_PROBE3(eadd_create_failed, arl_t *, arl, area_t *, area,
		    int, err);
		return (err);
	}
	if (aflags & ACE_F_PUBLISH) {
		arlphy_t *ap;

		ace = ar_ce_lookup(arl, area->area_proto, proto_addr,
		    proto_addr_len);
		ASSERT(ace != NULL);
		ap = ace->ace_xmit_arl->arl_phy;

		if (hw_addr == NULL || hw_addr_len == 0) {
			/* Default to the interface's own hardware address */
			hw_addr = ap->ap_hw_addr;
		} else if (aflags & ACE_F_MYADDR) {
			/*
			 * If hardware address changes, then make sure
			 * that the hardware address and hardware
			 * address length fields in arlphy_t get updated
			 * too. Otherwise, they will continue carrying
			 * the old hardware address information.
			 */
			ASSERT((hw_addr != NULL) && (hw_addr_len != 0));
			bcopy(hw_addr, ap->ap_hw_addr, hw_addr_len);
			ap->ap_hw_addrlen = hw_addr_len;
		}

		/* Probe cadence depends on fast vs. slow interface class */
		if (ace->ace_flags & ACE_F_FAST) {
			ace->ace_xmit_count = as->as_fastprobe_count;
			ace->ace_xmit_interval = as->as_fastprobe_delay;
		} else {
			ace->ace_xmit_count = as->as_probe_count;
			ace->ace_xmit_interval = as->as_probe_delay;
		}

		/*
		 * If the user has disabled duplicate address detection for
		 * this kind of interface (fast or slow) by setting the probe
		 * count to zero, then pretend as if we've verified the
		 * address, and go right to address defense mode.
		 */
		if (ace->ace_xmit_count == 0)
			unverified = B_FALSE;

		/*
		 * If we need to do duplicate address detection, then kick that
		 * off. Otherwise, send out a gratuitous ARP message in order
		 * to update everyone's caches with the new hardware address.
		 */
		if (unverified) {
			ace->ace_flags |= ACE_F_UNVERIFIED;
			if (ace->ace_xmit_interval == 0) {
				/*
				 * User has configured us to send the first
				 * probe right away. Do so, and set up for
				 * the subsequent probes.
				 */
				DTRACE_PROBE2(eadd_probe, ace_t *, ace,
				    area_t *, area);
				ar_xmit(ace->ace_xmit_arl, ARP_REQUEST,
				    area->area_proto, proto_addr_len,
				    hw_addr, NULL, NULL, proto_addr, NULL, as);
				ace->ace_xmit_count--;
				ace->ace_xmit_interval =
				    (ace->ace_flags & ACE_F_FAST) ?
				    as->as_fastprobe_interval :
				    as->as_probe_interval;
				ace_set_timer(ace, B_FALSE);
			} else {
				DTRACE_PROBE2(eadd_delay, ace_t *, ace,
				    area_t *, area);
				/* Regular delay before initial probe */
				ace_set_timer(ace, B_TRUE);
			}
		} else {
			DTRACE_PROBE2(eadd_announce, ace_t *, ace,
			    area_t *, area);
			/* Gratuitous ARP: sender == target == our address */
			ar_xmit(ace->ace_xmit_arl, ARP_REQUEST,
			    area->area_proto, proto_addr_len, hw_addr,
			    proto_addr, ap->ap_arp_addr, proto_addr, NULL, as);
			ace->ace_last_bcast = ddi_get_lbolt();

			/*
			 * If AUTHORITY is set, it is not just a proxy arp
			 * entry; we believe we're the authority for this
			 * entry. In that case, and if we're not just doing
			 * one-off defense of the address, we send more than
			 * one copy, so we'll still have a good chance of
			 * updating everyone even when there's a packet loss
			 * or two.
			 */
			if ((aflags & ACE_F_AUTHORITY) &&
			    !(aflags & ACE_F_DEFEND) &&
			    as->as_publish_count > 0) {
				/* Account for the xmit we just did */
				ace->ace_xmit_count = as->as_publish_count - 1;
				ace->ace_xmit_interval =
				    as->as_publish_interval;
				if (ace->ace_xmit_count > 0)
					ace_set_timer(ace, B_FALSE);
			}
		}
	}
	return (0);
}
/*
 * Process entry delete requests from external messages.  Looks up the
 * exact-match ace, notifies IP (unless the entry is permanent, in which
 * case the client asked for the delete itself) and removes the entry,
 * honoring ARED_F_PRESERVE_PERM for permanent entries.
 */
static int
ar_entry_delete(queue_t *q, mblk_t *mp_orig)
{
	ace_t	*ace;
	arl_t	*arl;
	mblk_t	*mp = mp_orig;
	arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as;

	/* We handle both M_IOCTL and M_PROTO messages. */
	if (DB_TYPE(mp) == M_IOCTL)
		mp = mp->b_cont;
	arl = ar_ll_lookup_from_mp(as, mp);
	if (arl == NULL)
		return (EINVAL);
	/*
	 * Newly received commands from clients go to the tail of the queue.
	 */
	if (CMD_NEEDS_QUEUEING(mp_orig, arl)) {
		DTRACE_PROBE3(edel_enqueued, queue_t *, q, mblk_t *, mp_orig,
		    arl_t *, arl);
		ar_cmd_enqueue(arl, mp_orig, q, AR_ENTRY_DELETE, B_TRUE);
		return (EINPROGRESS);
	}
	mp_orig->b_prev = NULL;	/* clear any AR_DRAINING marker */

	/*
	 * Need to know if it is a mapping or an exact match. Check exact
	 * match first.
	 */
	ace = ar_ce_lookup_from_area(as, mp, ar_ce_lookup);
	if (ace != NULL) {
		ared_t *ared = (ared_t *)mp->b_rptr;

		/*
		 * If it's a permanent entry, then the client is the one who
		 * told us to delete it, so there's no reason to notify.
		 */
		if (ACE_NONPERM(ace))
			ar_delete_notify(ace);
		/*
		 * Only delete the ARP entry if it is non-permanent, or
		 * ARED_F_PRESERVE_PERM flags is not set.
		 */
		if (ACE_NONPERM(ace) ||
		    !(ared->ared_flags & ARED_F_PRESERVE_PERM)) {
			ar_ce_delete(ace);
		}
		return (0);
	}
	return (ENXIO);
}
/*
 * Process entry query requests from external messages.
 * Bump up the ire_stats_freed for all errors except
 * EINPROGRESS - which means the packet has been queued.
 * For all other errors the packet is going to be freed
 * and hence we account for ire being freed if it
 * is a M_PROTO message.
 *
 * If an ace already exists: join its list of waiting queries (bounded by
 * areq_max_buffered) and answer immediately if the ace is resolved.
 * Otherwise create a fresh unresolved ace and start transmitting probes.
 */
static int
ar_entry_query(queue_t *q, mblk_t *mp_orig)
{
	ace_t	*ace;
	areq_t	*areq;
	arl_t	*arl;
	int	err;
	mblk_t	*mp = mp_orig;
	uchar_t	*proto_addr;
	uchar_t	*sender_addr;
	uint32_t	proto_addr_len;
	clock_t	ms;
	boolean_t	is_mproto = B_TRUE;
	arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as;

	/* We handle both M_IOCTL and M_PROTO messages. */
	if (DB_TYPE(mp) == M_IOCTL) {
		is_mproto = B_FALSE;
		mp = mp->b_cont;
	}
	arl = ar_ll_lookup_from_mp(as, mp);
	if (arl == NULL) {
		DTRACE_PROBE2(query_no_arl, queue_t *, q, mblk_t *, mp);
		err = EINVAL;
		goto err_ret;
	}
	/*
	 * Newly received commands from clients go to the tail of the queue.
	 */
	if (CMD_NEEDS_QUEUEING(mp_orig, arl)) {
		DTRACE_PROBE3(query_enqueued, queue_t *, q, mblk_t *, mp_orig,
		    arl_t *, arl);
		ar_cmd_enqueue(arl, mp_orig, q, AR_ENTRY_QUERY, B_TRUE);
		return (EINPROGRESS);
	}
	mp_orig->b_prev = NULL;	/* clear any AR_DRAINING marker */

	areq = (areq_t *)mp->b_rptr;
	proto_addr_len = areq->areq_target_addr_length;
	proto_addr = mi_offset_paramc(mp, areq->areq_target_addr_offset,
	    proto_addr_len);
	if (proto_addr == NULL) {
		DTRACE_PROBE1(query_illegal_address, areq_t *, areq);
		err = EINVAL;
		goto err_ret;
	}
	/* Stash the reply queue pointer for later use. */
	mp->b_prev = (mblk_t *)OTHERQ(q);
	mp->b_next = NULL;
	if (areq->areq_xmit_interval == 0)
		areq->areq_xmit_interval = AR_DEF_XMIT_INTERVAL;
	ace = ar_ce_lookup(arl, areq->areq_proto, proto_addr, proto_addr_len);
	if (ace != NULL && (ace->ace_flags & ACE_F_OLD)) {
		/*
		 * This is a potentially stale entry that IP's asking about.
		 * Since IP is asking, it must not have an answer anymore,
		 * either due to periodic ARP flush or due to SO_DONTROUTE.
		 * Rather than go forward with what we've got, restart
		 * resolution.
		 */
		DTRACE_PROBE2(query_stale_ace, ace_t *, ace, areq_t *, areq);
		ar_ce_delete(ace);
		ace = NULL;
	}
	if (ace != NULL) {
		mblk_t	**mpp;
		uint32_t	count = 0;

		/*
		 * There is already a cache entry. This means there is either
		 * a permanent entry, or address resolution is in progress.
		 * If the latter, there should be one or more queries queued
		 * up. We link the current one in at the end, if there aren't
		 * too many outstanding.
		 */
		for (mpp = &ace->ace_query_mp; mpp[0]; mpp = &mpp[0]->b_next) {
			if (++count > areq->areq_max_buffered) {
				DTRACE_PROBE2(query_overflow, ace_t *, ace,
				    areq_t *, areq);
				mp->b_prev = NULL;
				err = EALREADY;
				goto err_ret;
			}
		}
		/* Put us on the list. */
		mpp[0] = mp;
		if (count != 0) {
			/*
			 * If a query was already queued up, then we must not
			 * have an answer yet.
			 */
			DTRACE_PROBE2(query_in_progress, ace_t *, ace,
			    areq_t *, areq);
			return (EINPROGRESS);
		}
		if (ACE_RESOLVED(ace)) {
			/*
			 * We have an answer already.
			 * Keep a dup of mp since proto_addr points to it
			 * and mp has been placed on the ace_query_mp list.
			 *
			 * NOTE(review): if dupmsg() fails here, proto_addr
			 * may reference freed data inside ar_query_reply —
			 * confirm whether the shared dblk refcount makes
			 * this safe on the failure path.
			 */
			mblk_t *mp1;

			DTRACE_PROBE2(query_resolved, ace_t *, ace,
			    areq_t *, areq);
			mp1 = dupmsg(mp);
			ar_query_reply(ace, 0, proto_addr, proto_addr_len);
			freemsg(mp1);
			return (EINPROGRESS);
		}
		if (ace->ace_flags & ACE_F_MAPPING) {
			/* Should never happen */
			DTRACE_PROBE2(query_unresolved_mapping, ace_t *, ace,
			    areq_t *, areq);
			mpp[0] = mp->b_next;	/* take us back off the list */
			err = ENXIO;
			goto err_ret;
		}
		DTRACE_PROBE2(query_unresolved, ace_t, ace, areq_t *, areq);
	} else {
		/* No ace yet. Make one now. (This is the common case.) */
		if (areq->areq_xmit_count == 0) {
			DTRACE_PROBE2(query_template, arl_t *, arl,
			    areq_t *, areq);
			mp->b_prev = NULL;
			err = ENXIO;
			goto err_ret;
		}
		/*
		 * Check for sender addr being NULL or not before
		 * we create the ace. It is easy to cleanup later.
		 */
		sender_addr = mi_offset_paramc(mp,
		    areq->areq_sender_addr_offset,
		    areq->areq_sender_addr_length);
		if (sender_addr == NULL) {
			DTRACE_PROBE2(query_no_sender, arl_t *, arl,
			    areq_t *, areq);
			mp->b_prev = NULL;
			err = EINVAL;
			goto err_ret;
		}
		err = ar_ce_create(OWNING_ARL(arl), areq->areq_proto, NULL, 0,
		    proto_addr, proto_addr_len, NULL,
		    NULL, (uint32_t)0, sender_addr,
		    areq->areq_flags);
		if (err != 0) {
			DTRACE_PROBE3(query_create_failed, arl_t *, arl,
			    areq_t *, areq, int, err);
			mp->b_prev = NULL;
			goto err_ret;
		}
		ace = ar_ce_lookup(arl, areq->areq_proto, proto_addr,
		    proto_addr_len);
		if (ace == NULL || ace->ace_query_mp != NULL) {
			/* Shouldn't happen! */
			DTRACE_PROBE3(query_lookup_failed, arl_t *, arl,
			    areq_t *, areq, ace_t *, ace);
			mp->b_prev = NULL;
			err = ENXIO;
			goto err_ret;
		}
		ace->ace_query_mp = mp;
	}
	/* Transmit the first probe; ms == 0 means no retry is possible */
	ms = ar_query_xmit(as, ace);
	if (ms == 0) {
		/* Immediate reply requested. */
		ar_query_reply(ace, ENXIO, NULL, (uint32_t)0);
	} else {
		mi_timer(ace->ace_arl->arl_wq, ace->ace_mp, ms);
	}
	return (EINPROGRESS);
err_ret:
	if (is_mproto) {
		ip_stack_t *ipst = as->as_netstack->netstack_ip;

		BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_freed);
	}
	return (err);
}
/*
 * Handle simple query requests: a non-blocking lookup that fills in the
 * hardware address (if resolved) and flags of an existing ace.  Never
 * initiates resolution.  For M_PROTO requests the message itself is
 * turned into an M_CTL reply and sent back up.
 */
static int
ar_entry_squery(queue_t *q, mblk_t *mp_orig)
{
	ace_t	*ace;
	area_t	*area;
	arl_t	*arl;
	uchar_t	*hw_addr;
	uint32_t	hw_addr_len;
	mblk_t	*mp = mp_orig;
	uchar_t	*proto_addr;
	int	proto_addr_len;
	arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as;

	if (DB_TYPE(mp) == M_IOCTL)
		mp = mp->b_cont;
	arl = ar_ll_lookup_from_mp(as, mp);
	if (arl == NULL)
		return (EINVAL);
	/*
	 * Newly received commands from clients go to the tail of the queue.
	 */
	if (CMD_NEEDS_QUEUEING(mp_orig, arl)) {
		DTRACE_PROBE3(squery_enqueued, queue_t *, q, mblk_t *, mp_orig,
		    arl_t *, arl);
		ar_cmd_enqueue(arl, mp_orig, q, AR_ENTRY_SQUERY, B_TRUE);
		return (EINPROGRESS);
	}
	mp_orig->b_prev = NULL;	/* clear any AR_DRAINING marker */

	/* Extract parameters from the request message. */
	area = (area_t *)mp->b_rptr;
	proto_addr_len = area->area_proto_addr_length;
	proto_addr = mi_offset_paramc(mp, area->area_proto_addr_offset,
	    proto_addr_len);
	hw_addr_len = area->area_hw_addr_length;
	hw_addr = mi_offset_paramc(mp, area->area_hw_addr_offset, hw_addr_len);
	if (proto_addr == NULL || hw_addr == NULL) {
		DTRACE_PROBE1(squery_illegal_address, area_t *, area);
		return (EINVAL);
	}
	ace = ar_ce_lookup(arl, area->area_proto, proto_addr, proto_addr_len);
	if (ace == NULL) {
		return (ENXIO);
	}
	/* Caller's buffer must be big enough to hold the hardware address */
	if (hw_addr_len < ace->ace_hw_addr_length) {
		return (EINVAL);
	}
	if (ACE_RESOLVED(ace)) {
		/* Got it, prepare the response. */
		/*
		 * NOTE(review): the check above only guarantees
		 * hw_addr_len >= ace_hw_addr_length, but this ASSERT
		 * demands equality — confirm callers always size the
		 * buffer exactly.
		 */
		ASSERT(area->area_hw_addr_length == ace->ace_hw_addr_length);
		ar_set_address(ace, hw_addr, proto_addr, proto_addr_len);
	} else {
		/*
		 * We have an incomplete entry. Set the length to zero and
		 * just return out the flags.
		 */
		area->area_hw_addr_length = 0;
	}
	area->area_flags = ace->ace_flags;
	if (mp == mp_orig) {
		/* Non-ioctl case */
		/* TODO: change message type? */
		DB_TYPE(mp) = M_CTL; /* Caught by ip_wput */
		DTRACE_PROBE3(squery_reply, queue_t *, q, mblk_t *, mp,
		    arl_t *, arl);
		qreply(q, mp);
		return (EINPROGRESS);
	}
	return (0);
}
/*
 * Process an interface down causing us to detach and unbind.  The command
 * cannot finish synchronously: it is re-enqueued at the head of arl_queue
 * and completed from ar_dlpi_done once the driver acks the unbind/detach.
 */
/* ARGSUSED */
static int
ar_interface_down(queue_t *q, mblk_t *mp)
{
	arl_t	*arl;
	arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as;

	arl = ar_ll_lookup_from_mp(as, mp);
	if (arl == NULL || arl->arl_closing) {
		DTRACE_PROBE2(down_no_arl, queue_t *, q, mblk_t *, mp);
		return (EINVAL);
	}
	/*
	 * Newly received commands from clients go to the tail of the queue.
	 */
	if (CMD_NEEDS_QUEUEING(mp, arl)) {
		DTRACE_PROBE3(down_enqueued, queue_t *, q, mblk_t *, mp,
		    arl_t *, arl);
		ar_cmd_enqueue(arl, mp, q, AR_INTERFACE_DOWN, B_TRUE);
		return (EINPROGRESS);
	}
	mp->b_prev = NULL;	/* clear any AR_DRAINING marker */
	/*
	 * The arl is already down, no work to do.
	 */
	if (arl->arl_state == ARL_S_DOWN) {
		/* ar_rput frees the mp */
		return (0);
	}
	/*
	 * This command cannot complete in a single shot now itself.
	 * It has to be restarted after the receipt of the ack from
	 * the driver. So we need to enqueue the command (at the head).
	 */
	ar_cmd_enqueue(arl, mp, q, AR_INTERFACE_DOWN, B_FALSE);
	ASSERT(arl->arl_state == ARL_S_UP);
	/* Free all arp entries for this interface */
	ar_ce_walk(as, ar_ce_delete_per_arl, arl);
	ar_ll_down(arl);
	/* Return EINPROGRESS so that ar_rput does not free the 'mp' */
	return (EINPROGRESS);
}
/*
 * Process an interface up causing the info req sequence to start.  Like
 * ar_interface_down, the command is re-enqueued at the head of arl_queue
 * and completes from ar_cmd_done after the driver's DLPI acks arrive.
 * On synchronous failure an AR_DLPIOP_DONE carrying the error is sent
 * up so the waiting ioctl is not left hanging.
 */
/* ARGSUSED */
static int
ar_interface_up(queue_t *q, mblk_t *mp)
{
	arl_t	*arl;
	int	err;
	mblk_t	*mp1;
	arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as;

	arl = ar_ll_lookup_from_mp(as, mp);
	if (arl == NULL || arl->arl_closing) {
		DTRACE_PROBE2(up_no_arl, queue_t *, q, mblk_t *, mp);
		err = EINVAL;
		goto done;
	}
	/*
	 * Newly received commands from clients go to the tail of the queue.
	 */
	if (CMD_NEEDS_QUEUEING(mp, arl)) {
		DTRACE_PROBE3(up_enqueued, queue_t *, q, mblk_t *, mp,
		    arl_t *, arl);
		ar_cmd_enqueue(arl, mp, q, AR_INTERFACE_UP, B_TRUE);
		return (EINPROGRESS);
	}
	mp->b_prev = NULL;	/* clear any AR_DRAINING marker */
	/*
	 * The arl is already up. No work to do.
	 */
	if (arl->arl_state == ARL_S_UP) {
		err = 0;
		goto done;
	}
	/*
	 * This command cannot complete in a single shot now itself.
	 * It has to be restarted after the receipt of the ack from
	 * the driver. So we need to enqueue the command (at the head).
	 */
	ar_cmd_enqueue(arl, mp, q, AR_INTERFACE_UP, B_FALSE);
	/*
	 * NOTE(review): err from ar_ll_up() is not inspected here —
	 * presumably failures surface via the DLPI ack path and
	 * ar_cmd_done; confirm ar_ll_up cannot fail synchronously in a
	 * way that would strand the enqueued command.
	 */
	err = ar_ll_up(arl);
	/* Return EINPROGRESS so that ar_rput does not free the 'mp' */
	return (EINPROGRESS);
done:
	/* caller frees 'mp' */
	mp1 = ar_alloc(AR_DLPIOP_DONE, err);
	if (mp1 != NULL) {
		q = WR(q);
		DTRACE_PROBE3(up_send_err, queue_t *, q, mblk_t *, mp1,
		    int, err);
		putnext(q, mp1);
	}
	return (err);
}
/*
 * Given an arie_t `mp', resolve the two arl_t's it names: the interface
 * itself into `*arlp' and its IPMP group into `*ipmp_arlp'.  Returns
 * B_FALSE if either lookup fails.
 */
static boolean_t
ar_ipmp_lookup(arp_stack_t *as, mblk_t *mp, arl_t **arlp, arl_t **ipmp_arlp)
{
	arie_t	*arie = (arie_t *)mp->b_rptr;

	if ((*arlp = ar_ll_lookup_from_mp(as, mp)) == NULL) {
		DTRACE_PROBE1(ipmp_lookup_no_arl, mblk_t *, mp);
		return (B_FALSE);
	}

	/* Make sure the group name is NUL-terminated before using it. */
	arie->arie_grifname[LIFNAMSIZ - 1] = '\0';
	*ipmp_arlp = ar_ll_lookup_by_name(as, arie->arie_grifname);
	if (*ipmp_arlp == NULL) {
		DTRACE_PROBE1(ipmp_lookup_no_ipmp_arl, mblk_t *, mp);
		return (B_FALSE);
	}

	DTRACE_PROBE2(ipmp_lookup, arl_t *, *arlp, arl_t *, *ipmp_arlp);
	return (B_TRUE);
}
/*
 * Bind an arl_t to an IPMP group arl_t.  Fails with EALREADY if the arl
 * is already in a group.
 */
static int
ar_ipmp_activate(queue_t *q, mblk_t *mp)
{
	arl_t	*arl, *ipmp_arl;
	arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as;

	if (!ar_ipmp_lookup(as, mp, &arl, &ipmp_arl))
		return (EINVAL);

	if (arl->arl_ipmp_arl != NULL) {
		/* Already grouped; refuse to rebind. */
		DTRACE_PROBE1(ipmp_activated_already, arl_t *, arl);
		return (EALREADY);
	}

	DTRACE_PROBE2(ipmp_activate, arl_t *, arl, arl_t *, ipmp_arl);
	arl->arl_ipmp_arl = ipmp_arl;
	return (0);
}
/*
 * Unbind an arl_t from an IPMP group arl_t and update the ace_t's so
 * that it is no longer part of the group.  Fails with EINVAL if the arl
 * is not actually bound to the named group.
 */
static int
ar_ipmp_deactivate(queue_t *q, mblk_t *mp)
{
	arl_t	*arl, *ipmp_arl;
	arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as;

	if (!ar_ipmp_lookup(as, mp, &arl, &ipmp_arl))
		return (EINVAL);

	if (arl->arl_ipmp_arl != ipmp_arl) {
		DTRACE_PROBE2(ipmp_deactivate_notactive, arl_t *, arl, arl_t *,
		    ipmp_arl);
		return (EINVAL);
	}

	DTRACE_PROBE2(ipmp_deactivate, arl_t *, arl, arl_t *,
	    arl->arl_ipmp_arl);
	/* Detach every ace that was using this arl through the group. */
	ar_ce_walk(as, ar_ce_ipmp_deactivate, arl);
	arl->arl_ipmp_arl = NULL;
	return (0);
}
/*
 * Enable an interface to process ARP_REQUEST and ARP_RESPONSE messages.
 */
/* ARGSUSED */
static int
ar_interface_on(queue_t *q, mblk_t *mp)
{
	arp_stack_t	*as = ((ar_t *)q->q_ptr)->ar_as;
	arl_t		*arl = ar_ll_lookup_from_mp(as, mp);

	if (arl == NULL) {
		DTRACE_PROBE2(on_no_arl, queue_t *, q, mblk_t *, mp);
		return (EINVAL);
	}

	DTRACE_PROBE3(on_intf, queue_t *, q, mblk_t *, mp, arl_t *, arl);
	arl->arl_flags &= ~ARL_F_NOARP;
	return (0);
}
/*
 * Disable an interface from processing
 * ARP_REQUEST and ARP_RESPONSE messages
 */
/* ARGSUSED */
static int
ar_interface_off(queue_t *q, mblk_t *mp)
{
	arp_stack_t	*as = ((ar_t *)q->q_ptr)->ar_as;
	arl_t		*arl = ar_ll_lookup_from_mp(as, mp);

	if (arl == NULL) {
		DTRACE_PROBE2(off_no_arl, queue_t *, q, mblk_t *, mp);
		return (EINVAL);
	}

	DTRACE_PROBE3(off_intf, queue_t *, q, mblk_t *, mp, arl_t *, arl);
	arl->arl_flags |= ARL_F_NOARP;
	return (0);
}
/*
 * The queue 'q' is closing. Walk all the arl's and free any message
 * pending in the arl_queue if it originated from the closing q.
 * Also cleanup the ip_pending_queue, if the arp-IP stream is closing.
 */
static void
ar_ll_cleanup_arl_queue(queue_t *q)
{
	arl_t	*arl;
	mblk_t	*mp;
	mblk_t	*mpnext;
	mblk_t	*prev;
	arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as;
	ip_stack_t *ipst = as->as_netstack->netstack_ip;

	for (arl = as->as_arl_head; arl != NULL; arl = arl->arl_next) {
		/*
		 * arl_queue is singly linked via b_next; walk it with a
		 * trailing `prev' pointer so matching messages can be
		 * unlinked in place. `mpnext' is saved up front because
		 * inet_freemsg() destroys mp's links.
		 */
		for (prev = NULL, mp = arl->arl_queue; mp != NULL;
		    mp = mpnext) {
			mpnext = mp->b_next;
			/*
			 * b_queue records the queue the command arrived
			 * on; match either side (read or write) of the
			 * closing stream.
			 */
			if ((void *)mp->b_queue == (void *)q ||
			    (void *)mp->b_queue == (void *)OTHERQ(q)) {
				/* Unlink `mp' from arl_queue. */
				if (prev == NULL)
					arl->arl_queue = mp->b_next;
				else
					prev->b_next = mp->b_next;
				/* Keep the tail pointer accurate. */
				if (arl->arl_queue_tail == mp)
					arl->arl_queue_tail = prev;
				/*
				 * Dropping a pending AR_ENTRY_QUERY frees
				 * the IRE that IP attached to it, so
				 * account for it in the v4 IRE stats.
				 */
				if (DB_TYPE(mp) == M_PROTO &&
				    *(uint32_t *)mp->b_rptr == AR_ENTRY_QUERY) {
					BUMP_IRE_STATS(ipst->ips_ire_stats_v4,
					    ire_stats_freed);
				}
				inet_freemsg(mp);
			} else {
				prev = mp;
			}
		}
	}
}
/*
 * Look up a lower level tap by name.
 */
static arl_t *
ar_ll_lookup_by_name(arp_stack_t *as, const char *name)
{
	arl_t	*arl = as->as_arl_head;

	/* Linear scan of the per-stack arl list for an exact name match. */
	while (arl != NULL) {
		if (strcmp(arl->arl_name, name) == 0)
			return (arl);
		arl = arl->arl_next;
	}
	return (NULL);
}
/*
 * Look up a lower level tap using parameters extracted from the common
 * portion of the ARP command.
 */
static arl_t *
ar_ll_lookup_from_mp(arp_stack_t *as, mblk_t *mp)
{
	arc_t	*arc = (arc_t *)mp->b_rptr;
	uint8_t	*name;
	size_t	namelen = arc->arc_name_length;

	/*
	 * arc_name_length comes in from the command message; reject a
	 * zero length explicitly, since the NUL-termination check below
	 * would otherwise index name[namelen - 1] == name[-1].
	 */
	if (namelen == 0)
		return (NULL);
	name = mi_offset_param(mp, arc->arc_name_offset, namelen);
	/* Name must lie within the message and be NUL-terminated. */
	if (name == NULL || name[namelen - 1] != '\0')
		return (NULL);
	return (ar_ll_lookup_by_name(as, (char *)name));
}
/*
 * Allocate and initialize the arl_t for this stream, and assign it a
 * unique interface index for the pfhooks interface.
 */
static void
ar_ll_init(arp_stack_t *as, ar_t *ar, mblk_t *mp)
{
	arl_t		*arl;
	dl_info_ack_t	*dlia = (dl_info_ack_t *)mp->b_rptr;

	ASSERT(ar->ar_arl == NULL);

	if ((arl = (arl_t *)mi_zalloc(sizeof (arl_t))) == NULL)
		return;

	/* An IPMP meta-interface arl is its own group arl. */
	if (dlia->dl_mac_type == SUNW_DL_IPMP) {
		arl->arl_flags |= ARL_F_IPMP;
		arl->arl_ipmp_arl = arl;
	}

	arl->arl_provider_style = dlia->dl_provider_style;
	arl->arl_rq = ar->ar_rq;
	arl->arl_wq = ar->ar_wq;
	arl->arl_dlpi_pending = DL_PRIM_INVAL;
	ar->ar_arl = arl;

	/*
	 * If/when ARP gets pushed into the IP module then this code to make
	 * a number uniquely identify an ARP instance can be removed and the
	 * ifindex from IP used. Rather than try and reinvent or copy the
	 * code used by IP for the purpose of allocating an index number
	 * (and trying to keep the number small), just allocate it in an
	 * ever increasing manner. This index number isn't ever exposed to
	 * users directly, its only use is for providing the pfhooks interface
	 * with a number it can use to uniquely identify an interface in time.
	 *
	 * Using a 32bit counter, over 136 plumbs would need to be done every
	 * second of every day (non-leap year) for it to wrap around and the
	 * for() loop below to kick in as a performance concern.
	 */
	if (as->as_arp_counter_wrapped) {
		arl_t *arl1;

		/*
		 * The counter has wrapped at least once, so an index may
		 * already be in use; advance the counter until no existing
		 * arl holds it (0 is never a valid index).
		 */
		do {
			for (arl1 = as->as_arl_head; arl1 != NULL;
			    arl1 = arl1->arl_next)
				if (arl1->arl_index ==
				    as->as_arp_index_counter) {
					as->as_arp_index_counter++;
					if (as->as_arp_index_counter == 0) {
						as->as_arp_counter_wrapped++;
						as->as_arp_index_counter = 1;
					}
					break;
				}
		} while (arl1 != NULL);
	}
	/*
	 * BUG FIX: the index assignment must be unconditional. It was
	 * previously in an `else' branch, so once the counter wrapped the
	 * uniqueness scan above ran but arl_index was never set, leaving
	 * every subsequently plumbed interface with index 0.
	 */
	arl->arl_index = as->as_arp_index_counter;
	as->as_arp_index_counter++;
	if (as->as_arp_index_counter == 0) {
		as->as_arp_counter_wrapped++;
		as->as_arp_index_counter = 1;
	}
}
/*
 * This routine is called during module initialization when the DL_INFO_ACK
 * comes back from the device. We set up defaults for all the device dependent
 * doo-dads we are going to need. This will leave us ready to roll if we are
 * attempting auto-configuration. Alternatively, these defaults can be
 * overridden by initialization procedures possessing higher intelligence.
 */
static void
ar_ll_set_defaults(arl_t *arl, mblk_t *mp)
{
	ar_m_t	*arm;
	dl_info_ack_t *dlia = (dl_info_ack_t *)mp->b_rptr;
	dl_unitdata_req_t *dlur;
	uchar_t	*up;
	arlphy_t *ap;

	ASSERT(arl != NULL);

	/*
	 * Clear any stale defaults that might exist.
	 */
	ar_ll_clear_defaults(arl);

	if (arl->arl_flags & ARL_F_IPMP) {
		/*
		 * If this is an IPMP arl_t, we have nothing to do,
		 * since we will never transmit or receive.
		 */
		return;
	}

	ap = kmem_zalloc(sizeof (arlphy_t), KM_NOSLEEP);
	if (ap == NULL)
		goto bad;
	arl->arl_phy = ap;

	/* Fall back to the DL_OTHER entry for unknown media types. */
	if ((arm = ar_m_lookup(dlia->dl_mac_type)) == NULL)
		arm = ar_m_lookup(DL_OTHER);
	ASSERT(arm != NULL);

	/*
	 * We initialize based on parameters in the (currently) not too
	 * exhaustive ar_m_tbl.
	 */
	if (dlia->dl_version == DL_VERSION_2) {
		/* XXX DLPI spec allows dl_sap_length of 0 before binding. */
		ap->ap_saplen = dlia->dl_sap_length;
		ap->ap_hw_addrlen = dlia->dl_brdcst_addr_length;
	} else {
		/* Pre-V2 providers: take lengths from the media table. */
		ap->ap_saplen = arm->ar_mac_sap_length;
		ap->ap_hw_addrlen = arm->ar_mac_hw_addr_length;
	}
	ap->ap_arp_hw_type = arm->ar_mac_arp_hw_type;

	/*
	 * Allocate the hardware and ARP addresses; note that the hardware
	 * address cannot be filled in until we see the DL_BIND_ACK.
	 */
	ap->ap_hw_addr = kmem_zalloc(ap->ap_hw_addrlen, KM_NOSLEEP);
	ap->ap_arp_addr = kmem_alloc(ap->ap_hw_addrlen, KM_NOSLEEP);
	if (ap->ap_hw_addr == NULL || ap->ap_arp_addr == NULL)
		goto bad;

	if (dlia->dl_version == DL_VERSION_2) {
		/* Copy the provider's broadcast address out of the ack. */
		if ((up = mi_offset_param(mp, dlia->dl_brdcst_addr_offset,
		    ap->ap_hw_addrlen)) == NULL)
			goto bad;
		bcopy(up, ap->ap_arp_addr, ap->ap_hw_addrlen);
	} else {
		/*
		 * No choice but to assume a broadcast address of all ones,
		 * known to work on some popular networks.
		 */
		(void) memset(ap->ap_arp_addr, ~0, ap->ap_hw_addrlen);
	}

	/*
	 * Make us a template DL_UNITDATA_REQ message which we will use for
	 * broadcasting resolution requests, and which we will clone to hand
	 * back as responses to the protocols.
	 */
	ap->ap_xmit_mp = ar_dlpi_comm(DL_UNITDATA_REQ, ap->ap_hw_addrlen +
	    ABS(ap->ap_saplen) + sizeof (dl_unitdata_req_t));
	if (ap->ap_xmit_mp == NULL)
		goto bad;

	dlur = (dl_unitdata_req_t *)ap->ap_xmit_mp->b_rptr;
	dlur->dl_priority.dl_min = 0;
	dlur->dl_priority.dl_max = 0;
	/* The DLPI "address" is the hardware address plus the sap. */
	dlur->dl_dest_addr_length = ap->ap_hw_addrlen + ABS(ap->ap_saplen);
	dlur->dl_dest_addr_offset = sizeof (dl_unitdata_req_t);

	/* NOTE: the destination address and sap offsets are permanently set */
	ap->ap_xmit_sapoff = dlur->dl_dest_addr_offset;
	ap->ap_xmit_addroff = dlur->dl_dest_addr_offset;
	/*
	 * A negative sap length means the sap follows the address in the
	 * DLPI address; a positive one means the sap comes first.
	 */
	if (ap->ap_saplen < 0)
		ap->ap_xmit_sapoff += ap->ap_hw_addrlen;	/* sap last */
	else
		ap->ap_xmit_addroff += ap->ap_saplen;	/* addr last */

	*(uint16_t *)((caddr_t)dlur + ap->ap_xmit_sapoff) = ETHERTYPE_ARP;
	return;
bad:
	/* Partial allocations are torn down here. */
	ar_ll_clear_defaults(arl);
}
/*
 * Free the arlphy_t (and everything hanging off it) for `arl', if any.
 */
static void
ar_ll_clear_defaults(arl_t *arl)
{
	arlphy_t	*ap = arl->arl_phy;

	if (ap == NULL)
		return;

	/* Detach first so nothing else can see a half-freed arlphy_t. */
	arl->arl_phy = NULL;
	if (ap->ap_hw_addr != NULL)
		kmem_free(ap->ap_hw_addr, ap->ap_hw_addrlen);
	if (ap->ap_arp_addr != NULL)
		kmem_free(ap->ap_arp_addr, ap->ap_hw_addrlen);
	freemsg(ap->ap_xmit_mp);
	kmem_free(ap, sizeof (arlphy_t));
}
/*
 * Take the interface down: sever the ARP/IP stream association and send
 * the preallocated DL_UNBIND_REQ (and, for style-2 providers,
 * DL_DETACH_REQ) down to the driver.
 */
static void
ar_ll_down(arl_t *arl)
{
	mblk_t	*mp;
	ar_t	*ar;

	ASSERT(arl->arl_state == ARL_S_UP);

	/* Let's break the association between an ARL and IP instance */
	ar = (ar_t *)arl->arl_rq->q_ptr;
	if (ar->ar_arl_ip_assoc != NULL) {
		/* The association is symmetric; clear both directions. */
		ASSERT(ar->ar_arl_ip_assoc->ar_arl_ip_assoc != NULL &&
		    ar->ar_arl_ip_assoc->ar_arl_ip_assoc == ar);
		ar->ar_arl_ip_assoc->ar_arl_ip_assoc = NULL;
		ar->ar_arl_ip_assoc = NULL;
	}

	/* Stays ARL_S_PENDING until the driver acks the teardown. */
	arl->arl_state = ARL_S_PENDING;

	/*
	 * arl_unbind_mp was preallocated so teardown cannot fail here;
	 * ar_dlpi_send consumes the message, so drop our reference.
	 */
	mp = arl->arl_unbind_mp;
	ASSERT(mp != NULL);
	ar_dlpi_send(arl, mp);
	arl->arl_unbind_mp = NULL;

	/* Style-2 providers were attached at up time; detach them too. */
	if (arl->arl_provider_style == DL_STYLE2) {
		mp = arl->arl_detach_mp;
		ASSERT(mp != NULL);
		ar_dlpi_send(arl, mp);
		arl->arl_detach_mp = NULL;
	}
}
static int
ar_ll_up(arl_t *arl)
{
mblk_t *attach_mp = NULL;
mblk_t *bind_mp = NULL;
mblk_t *detach_mp = NULL;
mblk_t *unbind_mp = NULL;
mblk_t *info_mp = NULL;
mblk_t *notify_mp = NULL;
ASSERT(arl->arl_state == ARL_S_DOWN);
if (arl->arl_provider_style == DL_STYLE2) {
attach_mp =
ar_dlpi_comm(DL_ATTACH_REQ, sizeof (dl_attach_req_t));
if (attach_mp == NULL)
goto bad;
((dl_attach_req_t *)attach_mp->b_rptr)->dl_ppa =
arl->arl_ppa;
detach_mp =
ar_dlpi_comm(DL_DETACH_REQ, sizeof (dl_detach_req_t));
if (detach_mp == NULL)
goto bad;
}
info_mp = ar_dlpi_comm(DL_INFO_REQ, sizeof (dl_info_req_t));
if (info_mp == NULL)
goto bad;
/* Allocate and initialize a bind message. */
bind_mp = ar_dlpi_comm(DL_BIND_REQ, sizeof (dl_bind_req_t));
if (bind_mp == NULL)
goto bad;
((dl_bind_req_t *)bind_mp->b_rptr)->dl_sap = ETHERTYPE_ARP;
((dl_bind_req_t *)bind_mp->b_rptr)->dl_service_mode = DL_CLDLS;
unbind_mp = ar_dlpi_comm(DL_UNBIND_REQ, sizeof (dl_unbind_req_t));
if (unbind_mp == NULL)
goto bad;
notify_mp = ar_dlpi_comm(DL_NOTIFY_REQ, sizeof (dl_notify_req_t));
if (notify_mp == NULL)
goto bad;
((dl_notify_req_t *)notify_mp->b_rptr)->dl_notifications =
DL_NOTE_LINK_UP | DL_NOTE_LINK_DOWN;
arl->arl_state = ARL_S_PENDING;
if (arl->arl_provider_style == DL_STYLE2) {
ar_dlpi_send(arl, attach_mp);
ASSERT(detach_mp