/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1990 Mentat Inc.
* Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright (c) 2016, Joyent, Inc. All rights reserved.
* Copyright (c) 2014, OmniTI Computer Consulting, Inc. All rights reserved.
*/
/*
* This file contains the interface control functions for IP.
*/
#include <sys/types.h>
#include <sys/stream.h>
#include <sys/dlpi.h>
#include <sys/stropts.h>
#include <sys/strsun.h>
#include <sys/sysmacros.h>
#include <sys/strsubr.h>
#include <sys/strlog.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/cmn_err.h>
#include <sys/kstat.h>
#include <sys/debug.h>
#include <sys/zone.h>
#include <sys/sunldi.h>
#include <sys/file.h>
#include <sys/bitmap.h>
#include <sys/cpuvar.h>
#include <sys/time.h>
#include <sys/ctype.h>
#include <sys/kmem.h>
#include <sys/systm.h>
#include <sys/param.h>
#include <sys/socket.h>
#include <sys/isa_defs.h>
#include <net/if.h>
#include <net/if_arp.h>
#include <net/if_types.h>
#include <net/if_dl.h>
#include <net/route.h>
#include <sys/sockio.h>
#include <netinet/in.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet/igmp_var.h>
#include <sys/policy.h>
#include <sys/ethernet.h>
#include <sys/callb.h>
#include <sys/md5.h>
#include <inet/common.h> /* for various inet/mi.h and inet/nd.h needs */
#include <inet/mi.h>
#include <inet/nd.h>
#include <inet/tunables.h>
#include <inet/arp.h>
#include <inet/ip_arp.h>
#include <inet/mib2.h>
#include <inet/ip.h>
#include <inet/ip6.h>
#include <inet/ip6_asp.h>
#include <inet/tcp.h>
#include <inet/ip_multi.h>
#include <inet/ip_ire.h>
#include <inet/ip_ftable.h>
#include <inet/ip_rts.h>
#include <inet/ip_ndp.h>
#include <inet/ip_if.h>
#include <inet/ip_impl.h>
#include <inet/sctp_ip.h>
#include <inet/ip_netinfo.h>
#include <inet/ilb_ip.h>
#include <netinet/igmp.h>
#include <inet/ip_listutils.h>
#include <inet/ipclassifier.h>
#include <sys/mac_client.h>
#include <sys/dld.h>
#include <sys/mac_flow.h>
#include <sys/systeminfo.h>
#include <sys/bootconf.h>
#include <sys/tsol/tndb.h>
#include <sys/tsol/tnet.h>
#include <inet/rawip_impl.h> /* needed for icmp_stack_t */
#include <inet/udp_impl.h> /* needed for udp_stack_t */
/* The character which tells where the ill_name ends */
#define IPIF_SEPARATOR_CHAR ':'
/* IP ioctl function table entry */
typedef struct ipft_s {
int ipft_cmd;
pfi_t ipft_pfi;
int ipft_min_size;
int ipft_flags;
} ipft_t;
#define IPFT_F_NO_REPLY 0x1 /* IP ioctl does not expect any reply */
#define IPFT_F_SELF_REPLY 0x2 /* ioctl callee does the ioctl reply */
static int nd_ill_forward_get(queue_t *, mblk_t *, caddr_t, cred_t *);
static int nd_ill_forward_set(queue_t *q, mblk_t *mp,
char *value, caddr_t cp, cred_t *ioc_cr);
static boolean_t ill_is_quiescent(ill_t *);
static boolean_t ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask);
static ip_m_t *ip_m_lookup(t_uscalar_t mac_type);
static int ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
mblk_t *mp, boolean_t need_up);
static int ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
mblk_t *mp, boolean_t need_up);
static int ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid,
queue_t *q, mblk_t *mp, boolean_t need_up);
static int ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q,
mblk_t *mp);
static int ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
mblk_t *mp);
static int ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t, in6_addr_t,
queue_t *q, mblk_t *mp, boolean_t need_up);
static int ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp,
int ioccmd, struct linkblk *li);
static ipaddr_t ip_subnet_mask(ipaddr_t addr, ipif_t **, ip_stack_t *);
static void ip_wput_ioctl(queue_t *q, mblk_t *mp);
static void ipsq_flush(ill_t *ill);
static int ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen,
queue_t *q, mblk_t *mp, boolean_t need_up);
static void ipsq_delete(ipsq_t *);
static ipif_t *ipif_allocate(ill_t *ill, int id, uint_t ire_type,
boolean_t initialize, boolean_t insert, int *errorp);
static ire_t **ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep);
static void ipif_delete_bcast_ires(ipif_t *ipif);
static int ipif_add_ires_v4(ipif_t *, boolean_t);
static boolean_t ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif,
boolean_t isv6);
static int ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp);
static void ipif_free(ipif_t *ipif);
static void ipif_free_tail(ipif_t *ipif);
static void ipif_set_default(ipif_t *ipif);
static int ipif_set_values(queue_t *q, mblk_t *mp,
char *interf_name, uint_t *ppa);
static int ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp,
queue_t *q);
static ipif_t *ipif_lookup_on_name(char *name, size_t namelen,
boolean_t do_alloc, boolean_t *exists, boolean_t isv6, zoneid_t zoneid,
ip_stack_t *);
static ipif_t *ipif_lookup_on_name_async(char *name, size_t namelen,
boolean_t isv6, zoneid_t zoneid, queue_t *q, mblk_t *mp, ipsq_func_t func,
int *error, ip_stack_t *);
static int ill_alloc_ppa(ill_if_t *, ill_t *);
static void ill_delete_interface_type(ill_if_t *);
static int ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q);
static void ill_dl_down(ill_t *ill);
static void ill_down(ill_t *ill);
static void ill_down_ipifs(ill_t *, boolean_t);
static void ill_free_mib(ill_t *ill);
static void ill_glist_delete(ill_t *);
static void ill_phyint_reinit(ill_t *ill);
static void ill_set_nce_router_flags(ill_t *, boolean_t);
static void ill_set_phys_addr_tail(ipsq_t *, queue_t *, mblk_t *, void *);
static void ill_replumb_tail(ipsq_t *, queue_t *, mblk_t *, void *);
static ip_v6intfid_func_t ip_ether_v6intfid, ip_ib_v6intfid;
static ip_v6intfid_func_t ip_ipv4_v6intfid, ip_ipv6_v6intfid;
static ip_v6intfid_func_t ip_ipmp_v6intfid, ip_nodef_v6intfid;
static ip_v6intfid_func_t ip_ipv4_v6destintfid, ip_ipv6_v6destintfid;
static ip_v4mapinfo_func_t ip_ether_v4_mapping;
static ip_v6mapinfo_func_t ip_ether_v6_mapping;
static ip_v4mapinfo_func_t ip_ib_v4_mapping;
static ip_v6mapinfo_func_t ip_ib_v6_mapping;
static ip_v4mapinfo_func_t ip_mbcast_mapping;
static void ip_cgtp_bcast_add(ire_t *, ip_stack_t *);
static void ip_cgtp_bcast_delete(ire_t *, ip_stack_t *);
static void phyint_free(phyint_t *);
static void ill_capability_dispatch(ill_t *, mblk_t *, dl_capability_sub_t *);
static void ill_capability_id_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
static void ill_capability_vrrp_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
static void ill_capability_hcksum_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
static void ill_capability_hcksum_reset_fill(ill_t *, mblk_t *);
static void ill_capability_zerocopy_ack(ill_t *, mblk_t *,
dl_capability_sub_t *);
static void ill_capability_zerocopy_reset_fill(ill_t *, mblk_t *);
static void ill_capability_dld_reset_fill(ill_t *, mblk_t *);
static void ill_capability_dld_ack(ill_t *, mblk_t *,
dl_capability_sub_t *);
static void ill_capability_dld_enable(ill_t *);
static void ill_capability_ack_thr(void *);
static void ill_capability_lso_enable(ill_t *);
static ill_t *ill_prev_usesrc(ill_t *);
static int ill_relink_usesrc_ills(ill_t *, ill_t *, uint_t);
static void ill_disband_usesrc_group(ill_t *);
static void ip_sioctl_garp_reply(mblk_t *, ill_t *, void *, int);
#ifdef DEBUG
static void ill_trace_cleanup(const ill_t *);
static void ipif_trace_cleanup(const ipif_t *);
#endif
static void ill_dlpi_clear_deferred(ill_t *ill);
static void phyint_flags_init(phyint_t *, t_uscalar_t);
/*
* If we go over the memory footprint limit more than once in this msec
* interval, we'll start pruning aggressively.
*/
int ip_min_frag_prune_time = 0;
static ipft_t ip_ioctl_ftbl[] = {
{ IP_IOC_IRE_DELETE, ip_ire_delete, sizeof (ipid_t), 0 },
{ IP_IOC_IRE_DELETE_NO_REPLY, ip_ire_delete, sizeof (ipid_t),
IPFT_F_NO_REPLY },
{ IP_IOC_RTS_REQUEST, ip_rts_request, 0, IPFT_F_SELF_REPLY },
{ 0 }
};
/* Simple ICMP IP Header Template */
static ipha_t icmp_ipha = {
IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP
};
static uchar_t ip_six_byte_all_ones[] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
static ip_m_t ip_m_tbl[] = {
{ DL_ETHER, IFT_ETHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
ip_ether_v4_mapping, ip_ether_v6_mapping, ip_ether_v6intfid,
ip_nodef_v6intfid },
{ DL_CSMACD, IFT_ISO88023, ETHERTYPE_IP, ETHERTYPE_IPV6,
ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
ip_nodef_v6intfid },
{ DL_TPB, IFT_ISO88024, ETHERTYPE_IP, ETHERTYPE_IPV6,
ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
ip_nodef_v6intfid },
{ DL_TPR, IFT_ISO88025, ETHERTYPE_IP, ETHERTYPE_IPV6,
ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
ip_nodef_v6intfid },
{ DL_FDDI, IFT_FDDI, ETHERTYPE_IP, ETHERTYPE_IPV6,
ip_ether_v4_mapping, ip_ether_v6_mapping, ip_ether_v6intfid,
ip_nodef_v6intfid },
{ DL_IB, IFT_IB, ETHERTYPE_IP, ETHERTYPE_IPV6,
ip_ib_v4_mapping, ip_ib_v6_mapping, ip_ib_v6intfid,
ip_nodef_v6intfid },
{ DL_IPV4, IFT_IPV4, IPPROTO_ENCAP, IPPROTO_IPV6,
ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv4_v6intfid,
ip_ipv4_v6destintfid },
{ DL_IPV6, IFT_IPV6, IPPROTO_ENCAP, IPPROTO_IPV6,
ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv6_v6intfid,
ip_ipv6_v6destintfid },
{ DL_6TO4, IFT_6TO4, IPPROTO_ENCAP, IPPROTO_IPV6,
ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv4_v6intfid,
ip_nodef_v6intfid },
{ SUNW_DL_VNI, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
NULL, NULL, ip_nodef_v6intfid, ip_nodef_v6intfid },
{ SUNW_DL_IPMP, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
NULL, NULL, ip_ipmp_v6intfid, ip_nodef_v6intfid },
{ DL_OTHER, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
ip_nodef_v6intfid }
};
char ipif_loopback_name[] = "lo0";
/* These are used by all IP network modules. */
sin6_t sin6_null; /* Zero address for quick clears */
sin_t sin_null; /* Zero address for quick clears */
/* When set search for unused ipif_seqid */
static ipif_t ipif_zero;
/*
* The ppa arena is created after this many
* interfaces have been plumbed.
*/
uint_t ill_no_arena = 12; /* Settable in /etc/system */
/*
* Allocate per-interface mibs.
* Returns B_TRUE if ok, B_FALSE otherwise.
* The ipsq may not yet be allocated (loopback case).
*/
static boolean_t
ill_allocate_mibs(ill_t *ill)
{
/* Already allocated? */
if (ill->ill_ip_mib != NULL) {
if (ill->ill_isv6)
ASSERT(ill->ill_icmp6_mib != NULL);
return (B_TRUE);
}
ill->ill_ip_mib = kmem_zalloc(sizeof (*ill->ill_ip_mib),
KM_NOSLEEP);
if (ill->ill_ip_mib == NULL) {
return (B_FALSE);
}
/* Setup static information */
SET_MIB(ill->ill_ip_mib->ipIfStatsEntrySize,
sizeof (mib2_ipIfStatsEntry_t));
if (ill->ill_isv6) {
ill->ill_ip_mib->ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv6;
SET_MIB(ill->ill_ip_mib->ipIfStatsAddrEntrySize,
sizeof (mib2_ipv6AddrEntry_t));
SET_MIB(ill->ill_ip_mib->ipIfStatsRouteEntrySize,
sizeof (mib2_ipv6RouteEntry_t));
SET_MIB(ill->ill_ip_mib->ipIfStatsNetToMediaEntrySize,
sizeof (mib2_ipv6NetToMediaEntry_t));
SET_MIB(ill->ill_ip_mib->ipIfStatsMemberEntrySize,
sizeof (ipv6_member_t));
SET_MIB(ill->ill_ip_mib->ipIfStatsGroupSourceEntrySize,
sizeof (ipv6_grpsrc_t));
} else {
ill->ill_ip_mib->ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv4;
SET_MIB(ill->ill_ip_mib->ipIfStatsAddrEntrySize,
sizeof (mib2_ipAddrEntry_t));
SET_MIB(ill->ill_ip_mib->ipIfStatsRouteEntrySize,
sizeof (mib2_ipRouteEntry_t));
SET_MIB(ill->ill_ip_mib->ipIfStatsNetToMediaEntrySize,
sizeof (mib2_ipNetToMediaEntry_t));
SET_MIB(ill->ill_ip_mib->ipIfStatsMemberEntrySize,
sizeof (ip_member_t));
SET_MIB(ill->ill_ip_mib->ipIfStatsGroupSourceEntrySize,
sizeof (ip_grpsrc_t));
/*
* For a v4 ill, we are done at this point, because per ill
* icmp mibs are only used for v6.
*/
return (B_TRUE);
}
ill->ill_icmp6_mib = kmem_zalloc(sizeof (*ill->ill_icmp6_mib),
KM_NOSLEEP);
if (ill->ill_icmp6_mib == NULL) {
kmem_free(ill->ill_ip_mib, sizeof (*ill->ill_ip_mib));
ill->ill_ip_mib = NULL;
return (B_FALSE);
}
/* static icmp info */
ill->ill_icmp6_mib->ipv6IfIcmpEntrySize =
sizeof (mib2_ipv6IfIcmpEntry_t);
/*
* The ipIfStatsIfindex and ipv6IfIcmpIndex will be assigned later
* after the phyint merge occurs in ipif_set_values -> ill_glist_insert
* -> ill_phyint_reinit
*/
return (B_TRUE);
}
/*
* Completely vaporize a lower level tap and all associated interfaces.
* ill_delete is called only out of ip_close when the device control
* stream is being closed.
*/
void
ill_delete(ill_t *ill)
{
ipif_t *ipif;
ill_t *prev_ill;
ip_stack_t *ipst = ill->ill_ipst;
/*
* ill_delete may be forcibly entering the ipsq. The previous
* ioctl may not have completed and may need to be aborted.
* ipsq_flush takes care of it. If we don't need to enter the
* ipsq forcibly, the 2nd invocation of ipsq_flush in
* ill_delete_tail is sufficient.
*/
ipsq_flush(ill);
/*
* Nuke all interfaces. ipif_free will take down the interface,
* remove it from the list, and free the data structure.
* Walk down the ipif list and remove the logical interfaces
* first before removing the main ipif. We can't unplumb
* zeroth interface first in the case of IPv6 as update_conn_ill
* -> ip_ll_multireq de-references ill_ipif for checking
* POINTOPOINT.
*
* If ill_ipif was not properly initialized (i.e., low on memory),
* then there are no interfaces to clean up. In this case just clean up the
* ill.
*/
for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
ipif_free(ipif);
/*
* clean out all the nce_t entries that depend on this
* ill for the ill_phys_addr.
*/
nce_flush(ill, B_TRUE);
/* Clean up msgs on pending upcalls for mrouted */
reset_mrt_ill(ill);
update_conn_ill(ill, ipst);
/*
* Remove multicast references added as a result of calls to
* ip_join_allmulti().
*/
ip_purge_allmulti(ill);
/*
* If the ill being deleted is under IPMP, boot it out of the illgrp.
*/
if (IS_UNDER_IPMP(ill))
ipmp_ill_leave_illgrp(ill);
/*
* ill_down will arrange to blow off any IRE's dependent on this
* ILL, and shut down fragmentation reassembly.
*/
ill_down(ill);
/* Let SCTP know, so that it can remove this from its list. */
sctp_update_ill(ill, SCTP_ILL_REMOVE);
/*
* Walk all CONNs that can have a reference on an ire or nce for this
* ill (we actually walk all that now have stale references).
*/
ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ipst);
/* With IPv6 we have dce_ifindex. Cleanup for neatness */
if (ill->ill_isv6)
dce_cleanup(ill->ill_phyint->phyint_ifindex, ipst);
/*
* If an address on this ILL is being used as a source address then
* clear out the pointers in other ILLs that point to this ILL.
*/
rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER);
if (ill->ill_usesrc_grp_next != NULL) {
if (ill->ill_usesrc_ifindex == 0) { /* usesrc ILL? */
ill_disband_usesrc_group(ill);
} else { /* consumer of the usesrc ILL */
prev_ill = ill_prev_usesrc(ill);
prev_ill->ill_usesrc_grp_next =
ill->ill_usesrc_grp_next;
}
}
rw_exit(&ipst->ips_ill_g_usesrc_lock);
}
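/*
* Clear the IPIF_DUPLICATE flag on the ipif, if it is set, and
* decrement the ill's count of duplicate ipifs accordingly.
*/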
static void
ipif_non_duplicate(ipif_t *ipif)
{
ill_t *ill = ipif->ipif_ill;
mutex_enter(&ill->ill_lock);
if (ipif->ipif_flags & IPIF_DUPLICATE) {
ipif->ipif_flags &= ~IPIF_DUPLICATE;
ASSERT(ill->ill_ipif_dup_count > 0);
ill->ill_ipif_dup_count--;
}
mutex_exit(&ill->ill_lock);
}
/*
* ill_delete_tail is called from ip_modclose after all references
* to the closing ill are gone. The wait is done in ip_modclose.
*/
void
ill_delete_tail(ill_t *ill)
{
mblk_t **mpp;
ipif_t *ipif;
ip_stack_t *ipst = ill->ill_ipst;
for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
ipif_non_duplicate(ipif);
(void) ipif_down_tail(ipif);
}
ASSERT(ill->ill_ipif_dup_count == 0);
/*
* If polling capability is enabled (which signifies direct
* upcall into IP and driver has ill saved as a handle),
* we need to make sure that unbind has completed before we
* let the ill disappear, so that the driver no longer holds any
* reference to this ill.
*/
mutex_enter(&ill->ill_lock);
while (ill->ill_state_flags & ILL_DL_UNBIND_IN_PROGRESS)
cv_wait(&ill->ill_cv, &ill->ill_lock);
mutex_exit(&ill->ill_lock);
ASSERT(!(ill->ill_capabilities &
(ILL_CAPAB_DLD | ILL_CAPAB_DLD_POLL | ILL_CAPAB_DLD_DIRECT)));
if (ill->ill_net_type != IRE_LOOPBACK)
qprocsoff(ill->ill_rq);
/*
* We do an ipsq_flush once again now. New messages could have
* arrived from below (M_ERROR or M_HANGUP). Similarly, ioctls
* could also have arrived if an ioctl thread had looked up
* the ill before we set the ILL_CONDEMNED flag, but had not yet
* enqueued the ioctl when we did the ipsq_flush last time.
*/
ipsq_flush(ill);
/*
* Free capabilities.
*/
if (ill->ill_hcksum_capab != NULL) {
kmem_free(ill->ill_hcksum_capab, sizeof (ill_hcksum_capab_t));
ill->ill_hcksum_capab = NULL;
}
if (ill->ill_zerocopy_capab != NULL) {
kmem_free(ill->ill_zerocopy_capab,
sizeof (ill_zerocopy_capab_t));
ill->ill_zerocopy_capab = NULL;
}
if (ill->ill_lso_capab != NULL) {
kmem_free(ill->ill_lso_capab, sizeof (ill_lso_capab_t));
ill->ill_lso_capab = NULL;
}
if (ill->ill_dld_capab != NULL) {
kmem_free(ill->ill_dld_capab, sizeof (ill_dld_capab_t));
ill->ill_dld_capab = NULL;
}
/* Clean up ill_allowed_ips* related state */
if (ill->ill_allowed_ips != NULL) {
ASSERT(ill->ill_allowed_ips_cnt > 0);
kmem_free(ill->ill_allowed_ips,
ill->ill_allowed_ips_cnt * sizeof (in6_addr_t));
ill->ill_allowed_ips = NULL;
ill->ill_allowed_ips_cnt = 0;
}
while (ill->ill_ipif != NULL)
ipif_free_tail(ill->ill_ipif);
/*
* We have removed all references to ilm from conn and the ones joined
* within the kernel.
*
* We don't walk conns, mrts and ires because
*
* 1) update_conn_ill and reset_mrt_ill clean up conns and mrts.
* 2) ill_down -> ill_downi walks all the ires and cleans up
* ill references.
*/
/*
* If this ill is an IPMP meta-interface, blow away the illgrp. This
* is safe to do because the illgrp has already been unlinked from the
* group by I_PUNLINK, and thus SIOCSLIFGROUPNAME cannot find it.
*/
if (IS_IPMP(ill)) {
ipmp_illgrp_destroy(ill->ill_grp);
ill->ill_grp = NULL;
}
if (ill->ill_mphysaddr_list != NULL) {
multiphysaddr_t *mpa, *tmpa;
mpa = ill->ill_mphysaddr_list;
ill->ill_mphysaddr_list = NULL;
while (mpa) {
tmpa = mpa->mpa_next;
kmem_free(mpa, sizeof (*mpa));
mpa = tmpa;
}
}
/*
* Take us out of the list of ILLs. ill_glist_delete -> phyint_free
* could free the phyint. No more reference to the phyint after this
* point.
*/
(void) ill_glist_delete(ill);
if (ill->ill_frag_ptr != NULL) {
uint_t count;
for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) {
mutex_destroy(&ill->ill_frag_hash_tbl[count].ipfb_lock);
}
mi_free(ill->ill_frag_ptr);
ill->ill_frag_ptr = NULL;
ill->ill_frag_hash_tbl = NULL;
}
freemsg(ill->ill_nd_lla_mp);
/* Free all retained control messages. */
mpp = &ill->ill_first_mp_to_free;
do {
while (mpp[0]) {
mblk_t *mp;
mblk_t *mp1;
mp = mpp[0];
mpp[0] = mp->b_next;
for (mp1 = mp; mp1 != NULL; mp1 = mp1->b_cont) {
mp1->b_next = NULL;
mp1->b_prev = NULL;
}
freemsg(mp);
}
} while (mpp++ != &ill->ill_last_mp_to_free);
ill_free_mib(ill);
#ifdef DEBUG
ill_trace_cleanup(ill);
#endif
/* The default multicast interface might have changed */
ire_increment_multicast_generation(ipst, ill->ill_isv6);
/* Drop refcnt here */
netstack_rele(ill->ill_ipst->ips_netstack);
ill->ill_ipst = NULL;
}
static void
ill_free_mib(ill_t *ill)
{
ip_stack_t *ipst = ill->ill_ipst;
/*
* MIB statistics must not be lost, so when an interface
* goes away the counter values will be added to the global
* MIBs.
*/
if (ill->ill_ip_mib != NULL) {
if (ill->ill_isv6) {
ip_mib2_add_ip_stats(&ipst->ips_ip6_mib,
ill->ill_ip_mib);
} else {
ip_mib2_add_ip_stats(&ipst->ips_ip_mib,
ill->ill_ip_mib);
}
kmem_free(ill->ill_ip_mib, sizeof (*ill->ill_ip_mib));
ill->ill_ip_mib = NULL;
}
if (ill->ill_icmp6_mib != NULL) {
ip_mib2_add_icmp6_stats(&ipst->ips_icmp6_mib,
ill->ill_icmp6_mib);
kmem_free(ill->ill_icmp6_mib, sizeof (*ill->ill_icmp6_mib));
ill->ill_icmp6_mib = NULL;
}
}
/*
* Concatenate together a physical address and a sap.
*
* Sap_lengths are interpreted as follows:
* sap_length == 0 ==> no sap
* sap_length > 0 ==> sap is at the head of the dlpi address
* sap_length < 0 ==> sap is at the tail of the dlpi address
*/
static void
ill_dlur_copy_address(uchar_t *phys_src, uint_t phys_length,
t_scalar_t sap_src, t_scalar_t sap_length, uchar_t *dst)
{
uint16_t sap_addr = (uint16_t)sap_src;
if (sap_length == 0) {
if (phys_src == NULL)
bzero(dst, phys_length);
else
bcopy(phys_src, dst, phys_length);
} else if (sap_length < 0) {
if (phys_src == NULL)
bzero(dst, phys_length);
else
bcopy(phys_src, dst, phys_length);
bcopy(&sap_addr, (char *)dst + phys_length, sizeof (sap_addr));
} else {
bcopy(&sap_addr, dst, sizeof (sap_addr));
if (phys_src == NULL)
bzero((char *)dst + sap_length, phys_length);
else
bcopy(phys_src, (char *)dst + sap_length, phys_length);
}
}
/*
* Generate a dl_unitdata_req mblk for the device and address given.
* addr_length is the length of the physical portion of the address.
* If addr is NULL, include an all-zero address of the specified length.
* In any case, addr_length is taken to be the entire length of the
* dlpi address, including the absolute value of sap_length.
*/
mblk_t *
ill_dlur_gen(uchar_t *addr, uint_t addr_length, t_uscalar_t sap,
t_scalar_t sap_length)
{
dl_unitdata_req_t *dlur;
mblk_t *mp;
t_scalar_t abs_sap_length; /* absolute value */
abs_sap_length = ABS(sap_length);
mp = ip_dlpi_alloc(sizeof (*dlur) + addr_length + abs_sap_length,
DL_UNITDATA_REQ);
if (mp == NULL)
return (NULL);
dlur = (dl_unitdata_req_t *)mp->b_rptr;
/* HACK: accommodate incompatible DLPI drivers */
if (addr_length == 8)
addr_length = 6;
dlur->dl_dest_addr_length = addr_length + abs_sap_length;
dlur->dl_dest_addr_offset = sizeof (*dlur);
dlur->dl_priority.dl_min = 0;
dlur->dl_priority.dl_max = 0;
ill_dlur_copy_address(addr, addr_length, sap, sap_length,
(uchar_t *)&dlur[1]);
return (mp);
}
/*
* Add the pending mp to the list. There can be only 1 pending mp
* in the list. Any exclusive ioctl that needs to wait for a response
* from another module or driver needs to use this function to set
* the ipx_pending_mp to the ioctl mblk and wait for the response from
* the other module/driver. This is also used while waiting for the
* ipif/ill/ire refcnts to drop to zero in bringing down an ipif.
*/
boolean_t
ipsq_pending_mp_add(conn_t *connp, ipif_t *ipif, queue_t *q, mblk_t *add_mp,
int waitfor)
{
ipxop_t *ipx = ipif->ipif_ill->ill_phyint->phyint_ipsq->ipsq_xop;
ASSERT(IAM_WRITER_IPIF(ipif));
ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL));
ASSERT(ipx->ipx_pending_mp == NULL);
/*
* The caller may be using a different ipif than the one passed into
* ipsq_current_start() (e.g., suppose an ioctl that came in on the V4
* ill needs to wait for the V6 ill to quiesce). So we can't ASSERT
* that `ipx_current_ipif == ipif'.
*/
ASSERT(ipx->ipx_current_ipif != NULL);
/*
* M_IOCDATA from ioctls, M_ERROR/M_HANGUP/M_PROTO/M_PCPROTO from the
* driver.
*/
ASSERT((DB_TYPE(add_mp) == M_IOCDATA) || (DB_TYPE(add_mp) == M_ERROR) ||
(DB_TYPE(add_mp) == M_HANGUP) || (DB_TYPE(add_mp) == M_PROTO) ||
(DB_TYPE(add_mp) == M_PCPROTO));
if (connp != NULL) {
ASSERT(MUTEX_HELD(&connp->conn_lock));
/*
* Return error if the conn has started closing. The conn
* could have finished cleaning up the pending mp list.
* If so, we should not add another mp to the list, negating
* the cleanup.
*/
if (connp->conn_state_flags & CONN_CLOSING)
return (B_FALSE);
}
mutex_enter(&ipx->ipx_lock);
ipx->ipx_pending_ipif = ipif;
/*
* Note down the queue in b_queue. This will be returned by
* ipsq_pending_mp_get. The caller will then use these values to restart
* the processing.
*/
add_mp->b_next = NULL;
add_mp->b_queue = q;
ipx->ipx_pending_mp = add_mp;
ipx->ipx_waitfor = waitfor;
mutex_exit(&ipx->ipx_lock);
if (connp != NULL)
connp->conn_oper_pending_ill = ipif->ipif_ill;
return (B_TRUE);
}
/*
* Retrieve the ipx_pending_mp and return it. There can be only 1 mp
* queued in the list.
*/
mblk_t *
ipsq_pending_mp_get(ipsq_t *ipsq, conn_t **connpp)
{
mblk_t *curr = NULL;
ipxop_t *ipx = ipsq->ipsq_xop;
*connpp = NULL;
mutex_enter(&ipx->ipx_lock);
if (ipx->ipx_pending_mp == NULL) {
mutex_exit(&ipx->ipx_lock);
return (NULL);
}
/* There can be only 1 such excl message */
curr = ipx->ipx_pending_mp;
ASSERT(curr->b_next == NULL);
ipx->ipx_pending_ipif = NULL;
ipx->ipx_pending_mp = NULL;
ipx->ipx_waitfor = 0;
mutex_exit(&ipx->ipx_lock);
if (CONN_Q(curr->b_queue)) {
/*
* This mp did a refhold on the conn, at the start of the ioctl.
* So we can safely return a pointer to the conn to the caller.
*/
*connpp = Q_TO_CONN(curr->b_queue);
} else {
*connpp = NULL;
}
curr->b_next = NULL;
curr->b_prev = NULL;
return (curr);
}
/*
* Cleanup the ioctl mp queued in ipx_pending_mp
* - Called in the ill_delete path
* - Called in the M_ERROR or M_HANGUP path on the ill.
* - Called in the conn close path.
*
* Returns success on finding the pending mblk associated with the ioctl or
* exclusive operation in progress, failure otherwise.
*/
boolean_t
ipsq_pending_mp_cleanup(ill_t *ill, conn_t *connp)
{
mblk_t *mp;
ipxop_t *ipx;
queue_t *q;
ipif_t *ipif;
int cmd;
ASSERT(IAM_WRITER_ILL(ill));
ipx = ill->ill_phyint->phyint_ipsq->ipsq_xop;
mutex_enter(&ipx->ipx_lock);
mp = ipx->ipx_pending_mp;
if (connp != NULL) {
if (mp == NULL || mp->b_queue != CONNP_TO_WQ(connp)) {
/*
* Nothing to clean since the conn that is closing
* does not have a matching pending mblk in
* ipx_pending_mp.
*/
mutex_exit(&ipx->ipx_lock);
return (B_FALSE);
}
} else {
/*
* A non-zero ill_error signifies we are called in the
* M_ERROR or M_HANGUP path and we need to unconditionally
* abort any current ioctl and do the corresponding cleanup.
* A zero ill_error means we are in the ill_delete path and
* we do the cleanup only if there is a pending mp.
*/
if (mp == NULL && ill->ill_error == 0) {
mutex_exit(&ipx->ipx_lock);
return (B_FALSE);
}
}
/* Now remove from the ipx_pending_mp */
ipx->ipx_pending_mp = NULL;
ipif = ipx->ipx_pending_ipif;
ipx->ipx_pending_ipif = NULL;
ipx->ipx_waitfor = 0;
ipx->ipx_current_ipif = NULL;
cmd = ipx->ipx_current_ioctl;
ipx->ipx_current_ioctl = 0;
ipx->ipx_current_done = B_TRUE;
mutex_exit(&ipx->ipx_lock);
if (mp == NULL)
return (B_FALSE);
q = mp->b_queue;
mp->b_next = NULL;
mp->b_prev = NULL;
mp->b_queue = NULL;
if (DB_TYPE(mp) == M_IOCTL || DB_TYPE(mp) == M_IOCDATA) {
DTRACE_PROBE4(ipif__ioctl,
char *, "ipsq_pending_mp_cleanup",
int, cmd, ill_t *, ipif == NULL ? NULL : ipif->ipif_ill,
ipif_t *, ipif);
if (connp == NULL) {
ip_ioctl_finish(q, mp, ENXIO, NO_COPYOUT, NULL);
} else {
ip_ioctl_finish(q, mp, ENXIO, CONN_CLOSE, NULL);
mutex_enter(&ipif->ipif_ill->ill_lock);
ipif->ipif_state_flags &= ~IPIF_CHANGING;
mutex_exit(&ipif->ipif_ill->ill_lock);
}
} else {
inet_freemsg(mp);
}
return (B_TRUE);
}
/*
* Called in the conn close path and ill delete path
*/
static void
ipsq_xopq_mp_cleanup(ill_t *ill, conn_t *connp)
{
ipsq_t *ipsq;
mblk_t *prev;
mblk_t *curr;
mblk_t *next;
queue_t *wq, *rq = NULL;
mblk_t *tmp_list = NULL;
ASSERT(IAM_WRITER_ILL(ill));
if (connp != NULL)
wq = CONNP_TO_WQ(connp);
else
wq = ill->ill_wq;
/*
* In the case of lo0 being unplumbed, ill_wq will be NULL. Guard
* against this here.
*/
if (wq != NULL)
rq = RD(wq);
ipsq = ill->ill_phyint->phyint_ipsq;
/*
* Cleanup the ioctl mp's queued in ipsq_xopq_pending_mp if any.
* In the case of ioctl from a conn, there can be only 1 mp
* queued on the ipsq. If an ill is being unplumbed flush all
* the messages.
*/
mutex_enter(&ipsq->ipsq_lock);
for (prev = NULL, curr = ipsq->ipsq_xopq_mphead; curr != NULL;
curr = next) {
next = curr->b_next;
if (connp == NULL ||
(curr->b_queue == wq || curr->b_queue == rq)) {
/* Unlink the mblk from the pending mp list */
if (prev != NULL) {
prev->b_next = curr->b_next;
} else {
ASSERT(ipsq->ipsq_xopq_mphead == curr);
ipsq->ipsq_xopq_mphead = curr->b_next;
}
if (ipsq->ipsq_xopq_mptail == curr)
ipsq->ipsq_xopq_mptail = prev;
/*
* Create a temporary list and release the ipsq lock
* New elements are added to the head of the tmp_list
*/
curr->b_next = tmp_list;
tmp_list = curr;
} else {
prev = curr;
}
}
mutex_exit(&ipsq->ipsq_lock);
while (tmp_list != NULL) {
curr = tmp_list;
tmp_list = curr->b_next;
curr->b_next = NULL;
curr->b_prev = NULL;
wq = curr->b_queue;
curr->b_queue = NULL;
if (DB_TYPE(curr) == M_IOCTL || DB_TYPE(curr) == M_IOCDATA) {
DTRACE_PROBE4(ipif__ioctl,
char *, "ipsq_xopq_mp_cleanup",
int, 0, ill_t *, NULL, ipif_t *, NULL);
ip_ioctl_finish(wq, curr, ENXIO, connp != NULL ?
CONN_CLOSE : NO_COPYOUT, NULL);
} else {
/*
* IP-MT XXX In the case of TLI/XTI bind / optmgmt
* this can't be just inet_freemsg. We have to
* restart it, otherwise the thread will be stuck.
*/
inet_freemsg(curr);
}
}
}
/*
* This conn has started closing. Cleanup any pending ioctl from this conn.
* STREAMS ensures that there can be at most 1 active ioctl on a stream.
*/
void
conn_ioctl_cleanup(conn_t *connp)
{
ipsq_t *ipsq;
ill_t *ill;
boolean_t refheld;
/*
* Check for a queued ioctl. If the ioctl has not yet started, the mp
* is pending in the list headed by ipsq_xopq_head. If the ioctl has
* started the mp could be present in ipx_pending_mp. Note that if
* conn_oper_pending_ill is NULL, the ioctl may still be in flight and
* not yet queued anywhere. In this case, the conn close code will wait
* until the conn_ref is dropped. If the stream was a tcp stream, then
* tcp_close will wait first until all ioctls have completed for this
* conn.
*/
mutex_enter(&connp->conn_lock);
ill = connp->conn_oper_pending_ill;
if (ill == NULL) {
mutex_exit(&connp->conn_lock);
return;
}
/*
* We may not be able to refhold the ill if the ill/ipif
* is changing. But we need to make sure that the ill will
* not vanish. So we just bump up the ill_waiter count.
*/
refheld = ill_waiter_inc(ill);
mutex_exit(&connp->conn_lock);
if (refheld) {
if (ipsq_enter(ill, B_TRUE, NEW_OP)) {
ill_waiter_dcr(ill);
/*
* Check whether this ioctl has started and is
* pending. If it is not found there then check
* whether this ioctl has not even started and is in
* the ipsq_xopq list.
*/
if (!ipsq_pending_mp_cleanup(ill, connp))
ipsq_xopq_mp_cleanup(ill, connp);
ipsq = ill->ill_phyint->phyint_ipsq;
ipsq_exit(ipsq);
return;
}
}
/*
* The ill is also closing and we could not bump up the
* ill_waiter_count or we could not enter the ipsq. Leave
* the cleanup to ill_delete
*/
mutex_enter(&connp->conn_lock);
while (connp->conn_oper_pending_ill != NULL)
cv_wait(&connp->conn_refcv, &connp->conn_lock);
mutex_exit(&connp->conn_lock);
if (refheld)
ill_waiter_dcr(ill);
}
/*
* ipcl_walk function for cleaning up conn_*_ill fields.
* Note that we leave ixa_multicast_ifindex, conn_incoming_ifindex, and
* conn_bound_if in place. We prefer dropping
* packets instead of sending them out the wrong interface, or accepting
* packets from the wrong ifindex.
*/
static void
conn_cleanup_ill(conn_t *connp, caddr_t arg)
{
ill_t *ill = (ill_t *)arg;
mutex_enter(&connp->conn_lock);
if (connp->conn_dhcpinit_ill == ill) {
connp->conn_dhcpinit_ill = NULL;
ASSERT(ill->ill_dhcpinit != 0);
atomic_dec_32(&ill->ill_dhcpinit);
ill_set_inputfn(ill);
}
mutex_exit(&connp->conn_lock);
}
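/*
* Finish bringing down every ipif on the ill: clear any duplicate
* state and call ipif_down_tail() on each, stopping at the first
* error.
*/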
static int
ill_down_ipifs_tail(ill_t *ill)
{
ipif_t *ipif;
int err;
ASSERT(IAM_WRITER_ILL(ill));
for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
ipif_non_duplicate(ipif);
/*
* ipif_down_tail will call arp_ll_down on the last ipif
* and typically return EINPROGRESS when the DL_UNBIND is sent.
*/
if ((err = ipif_down_tail(ipif)) != 0)
return (err);
}
return (0);
}
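/*
* ipsq completion routine: finish taking down all ipifs on the ill
* (q->q_ptr), free the message and finish the current ipsq operation.
*/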
/* ARGSUSED */
void
ipif_all_down_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
{
ASSERT(IAM_WRITER_IPSQ(ipsq));
(void) ill_down_ipifs_tail(q->q_ptr);
freemsg(mp);
ipsq_current_finish(ipsq);
}
/*
* ill_down_start is called when we want to down this ill and bring it up again.
* It is called when we receive an M_ERROR / M_HANGUP. In this case we shut down
* all interfaces, but don't tear down any plumbing.
*/
boolean_t
ill_down_start(queue_t *q, mblk_t *mp)
{
ill_t *ill = q->q_ptr;
ipif_t *ipif;
ASSERT(IAM_WRITER_ILL(ill));
/*
* It is possible that some ioctl is already in progress while we
* received the M_ERROR / M_HANGUP in which case, we need to abort
* the ioctl. ill_down_start() is being processed as CUR_OP rather
* than as NEW_OP since the cause of the M_ERROR / M_HANGUP may prevent
* the in progress ioctl from ever completing.
*
* The thread that started the ioctl (if any) must have returned,
* since we are now executing as writer. After the 2 calls below,
* the state of the ipsq and the ill would reflect no trace of any
* pending operation. Subsequently if there is any response to the
* original ioctl from the driver, it would be discarded as an
* unsolicited message from the driver.
*/
(void) ipsq_pending_mp_cleanup(ill, NULL);
ill_dlpi_clear_deferred(ill);
for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
(void) ipif_down(ipif, NULL, NULL);
ill_down(ill);
/*
* Walk all CONNs that can have a reference on an ire or nce for this
* ill (we actually walk all that now have stale references).
*/
ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ill->ill_ipst);
/* With IPv6 we have dce_ifindex. Cleanup for neatness */
if (ill->ill_isv6)
dce_cleanup(ill->ill_phyint->phyint_ifindex, ill->ill_ipst);
ipsq_current_start(ill->ill_phyint->phyint_ipsq, ill->ill_ipif, 0);
/*
* Atomically test and add the pending mp if references are active.
*/
mutex_enter(&ill->ill_lock);
if (!ill_is_quiescent(ill)) {
/* call cannot fail since `conn_t *' argument is NULL */
(void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq,
mp, ILL_DOWN);
mutex_exit(&ill->ill_lock);
return (B_FALSE);
}
mutex_exit(&ill->ill_lock);
return (B_TRUE);
}
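/*
* Tear down the IP state that depends on this ill: delete the IREs
* that reference it, clean up conn_*_ill pointers, and free the
* ill's saved IRE list.
*/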
static void
ill_down(ill_t *ill)
{
mblk_t *mp;
ip_stack_t *ipst = ill->ill_ipst;
/*
* Blow off any IREs dependent on this ILL.
* The caller needs to handle conn_ixa_cleanup
*/
ill_delete_ires(ill);
ire_walk_ill(0, 0, ill_downi, ill, ill);
/* Remove any conn_*_ill depending on this ill */
ipcl_walk(conn_cleanup_ill, (caddr_t)ill, ipst);
/*
* Free state for additional IREs.
*/
mutex_enter(&ill->ill_saved_ire_lock);
mp = ill->ill_saved_ire_mp;
ill->ill_saved_ire_mp = NULL;
ill->ill_saved_ire_cnt = 0;
mutex_exit(&ill->ill_saved_ire_lock);
freemsg(mp);
}
/*
* ire_walk routine used to delete every IRE that depends on
* 'ill'. (Always called as writer, and may only be called from ire_walk.)
*
* Note: since the routes added by the kernel are deleted separately,
* this will only be 1) IRE_IF_CLONE and 2) manually added IRE_INTERFACE.
*
* We also remove references on ire_nce_cache entries that refer to the ill.
*/
void
ill_downi(ire_t *ire, char *ill_arg)
{
ill_t *ill = (ill_t *)ill_arg;
nce_t *nce;
mutex_enter(&ire->ire_lock);
nce = ire->ire_nce_cache;
if (nce != NULL && nce->nce_ill == ill)
ire->ire_nce_cache = NULL;
else
nce = NULL;
mutex_exit(&ire->ire_lock);
if (nce != NULL)
nce_refrele(nce);
if (ire->ire_ill == ill) {
/*
* The existing interface binding for ire must be
* deleted before trying to bind the route to another
* interface. However, since we are using the contents of the
* ire after ire_delete, the caller has to ensure that
* CONDEMNED (deleted) ire's are not removed from the list
* when ire_delete() returns. Currently ill_downi() is
* only called as part of ire_walk*() routines, so that
* the irb_refhold() done by ire_walk*() will ensure that
* ire_delete() does not lead to ire_inactive().
*/
ASSERT(ire->ire_bucket->irb_refcnt > 0);
ire_delete(ire);
if (ire->ire_unbound)
ire_rebind(ire);
}
}
/* Remove IRE_IF_CLONE on this ill */
void
ill_downi_if_clone(ire_t *ire, char *ill_arg)
{
ill_t *ill = (ill_t *)ill_arg;
ASSERT(ire->ire_type & IRE_IF_CLONE);
if (ire->ire_ill == ill)
ire_delete(ire);
}
/* Consume an M_IOCACK of the fastpath probe. */
void
ill_fastpath_ack(ill_t *ill, mblk_t *mp)
{
mblk_t *mp1 = mp;
/*
* If this was the first attempt, turn on fastpath probing.
*/
mutex_enter(&ill->ill_lock);
if (ill->ill_dlpi_fastpath_state == IDS_INPROGRESS)
ill->ill_dlpi_fastpath_state = IDS_OK;
mutex_exit(&ill->ill_lock);
/* Free the M_IOCACK mblk, hold on to the data */
mp = mp->b_cont;
freeb(mp1);
if (mp == NULL)
return;
if (mp->b_cont != NULL)
nce_fastpath_update(ill, mp);
else
ip0dbg(("ill_fastpath_ack: no b_cont\n"));
freemsg(mp);
}
/*
* Throw an M_IOCTL message downstream asking "do you know fastpath?"
* The data portion of the request is a dl_unitdata_req_t template for
* what we would send downstream in the absence of a fastpath confirmation.
*/
int
ill_fastpath_probe(ill_t *ill, mblk_t *dlur_mp)
{
struct iocblk *ioc;
mblk_t *mp;
if (dlur_mp == NULL)
return (EINVAL);
mutex_enter(&ill->ill_lock);
switch (ill->ill_dlpi_fastpath_state) {
case IDS_FAILED:
/*
* Driver NAKed the first fastpath ioctl - assume it doesn't
* support it.
*/
mutex_exit(&ill->ill_lock);
return (ENOTSUP);
case IDS_UNKNOWN:
/* This is the first probe */
ill->ill_dlpi_fastpath_state = IDS_INPROGRESS;
break;
default:
break;
}
mutex_exit(&ill->ill_lock);
if ((mp = mkiocb(DL_IOC_HDR_INFO)) == NULL)
return (EAGAIN);
mp->b_cont = copyb(dlur_mp);
if (mp->b_cont == NULL) {
freeb(mp);
return (EAGAIN);
}
ioc = (struct iocblk *)mp->b_rptr;
ioc->ioc_count = msgdsize(mp->b_cont);
DTRACE_PROBE3(ill__dlpi, char *, "ill_fastpath_probe",
char *, "DL_IOC_HDR_INFO", ill_t *, ill);
putnext(ill->ill_wq, mp);
return (0);
}
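/*
* Start capability negotiation with the driver by sending a
* DL_CAPABILITY_REQ probe. This is only done when no negotiation is
* already underway (i.e. the state is IDCS_UNKNOWN or IDCS_FAILED).
*/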
void
ill_capability_probe(ill_t *ill)
{
mblk_t *mp;
ASSERT(IAM_WRITER_ILL(ill));
if (ill->ill_dlpi_capab_state != IDCS_UNKNOWN &&
ill->ill_dlpi_capab_state != IDCS_FAILED)
return;
/*
* We are starting a new cycle of capability negotiation.
* Free up the capab reset messages of any previous incarnation.
* We will do a fresh allocation when we get the response to our probe.
*/
if (ill->ill_capab_reset_mp != NULL) {
freemsg(ill->ill_capab_reset_mp);
ill->ill_capab_reset_mp = NULL;
}
ip1dbg(("ill_capability_probe: starting capability negotiation\n"));
mp = ip_dlpi_alloc(sizeof (dl_capability_req_t), DL_CAPABILITY_REQ);
if (mp == NULL)
return;
ill_capability_send(ill, mp);
ill->ill_dlpi_capab_state = IDCS_PROBE_SENT;
}
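/*
* Reset the capabilities previously negotiated with the driver by
* sending the pre-allocated reset message downstream. `reneg'
* records whether a renegotiation (IDCS_RENEG) rather than a plain
* reset (IDCS_RESET_SENT) is in progress.
*/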
void
ill_capability_reset(ill_t *ill, boolean_t reneg)
{
ASSERT(IAM_WRITER_ILL(ill));
if (ill->ill_dlpi_capab_state != IDCS_OK)
return;
ill->ill_dlpi_capab_state = reneg ? IDCS_RENEG : IDCS_RESET_SENT;
ill_capability_send(ill, ill->ill_capab_reset_mp);
ill->ill_capab_reset_mp = NULL;
/*
* We turn off all capabilities except the direct function call
* capabilities (ILL_CAPAB_DLD*), which will be turned off by the
* corresponding reset functions.
*/
ill->ill_capabilities &= ~(ILL_CAPAB_HCKSUM | ILL_CAPAB_ZEROCOPY);
}
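/*
* Pre-allocate the DL_CAPABILITY_REQ mblk used to reset the currently
* enabled capabilities (hcksum, zerocopy and DLD). The per-capability
* reset_fill routines append their sub-capabilities, and the result is
* saved in ill_capab_reset_mp for ill_capability_reset() to send.
*/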
static void
ill_capability_reset_alloc(ill_t *ill)
{
mblk_t *mp;
size_t size = 0;
int err;
dl_capability_req_t *capb;
ASSERT(IAM_WRITER_ILL(ill));
ASSERT(ill->ill_capab_reset_mp == NULL);
if (ILL_HCKSUM_CAPABLE(ill)) {
size += sizeof (dl_capability_sub_t) +
sizeof (dl_capab_hcksum_t);
}
if (ill->ill_capabilities & ILL_CAPAB_ZEROCOPY) {
size += sizeof (dl_capability_sub_t) +
sizeof (dl_capab_zerocopy_t);
}
if (ill->ill_capabilities & ILL_CAPAB_DLD) {
size += sizeof (dl_capability_sub_t) +
sizeof (dl_capab_dld_t);
}
mp = allocb_wait(size + sizeof (dl_capability_req_t), BPRI_MED,
STR_NOSIG, &err);
mp->b_datap->db_type = M_PROTO;
bzero(mp->b_rptr, size + sizeof (dl_capability_req_t));
capb = (dl_capability_req_t *)mp->b_rptr;
capb->dl_primitive = DL_CAPABILITY_REQ;
capb->dl_sub_offset = sizeof (dl_capability_req_t);
capb->dl_sub_length = size;
mp->b_wptr += sizeof (dl_capability_req_t);
/*
* Each handler fills in the corresponding dl_capability_sub_t
* inside the mblk.
*/
ill_capability_hcksum_reset_fill(ill, mp);
ill_capability_zerocopy_reset_fill(ill, mp);
ill_capability_dld_reset_fill(ill, mp);
ill->ill_capab_reset_mp = mp;
}
static void
ill_capability_id_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *outers)
{
dl_capab_id_t *id_ic;
uint_t sub_dl_cap = outers->dl_cap;
dl_capability_sub_t *inners;
uint8_t *capend;
ASSERT(sub_dl_cap == DL_CAPAB_ID_WRAPPER);
/*
* Note: range checks here are not absolutely sufficient to
* make us robust against malformed messages sent by drivers;
* this is in keeping with the rest of IP's dlpi handling.
* (Remember, it's coming from something else in the kernel
* address space)
*/
capend = (uint8_t *)(outers + 1) + outers->dl_length;
if (capend > mp->b_wptr) {
cmn_err(CE_WARN, "ill_capability_id_ack: "
"malformed sub-capability too long for mblk");
return;
}
id_ic = (dl_capab_id_t *)(outers + 1);
if (outers->dl_length < sizeof (*id_ic) ||
(inners = &id_ic->id_subcap,
inners->dl_length > (outers->dl_length - sizeof (*inners)))) {
cmn_err(CE_WARN, "ill_capability_id_ack: malformed "
"encapsulated capab type %d too long for mblk",
inners->dl_cap);
return;
}
if (!dlcapabcheckqid(&id_ic->id_mid, ill->ill_lmod_rq)) {
ip1dbg(("ill_capability_id_ack: mid token for capab type %d "
"isn't as expected; pass-thru module(s) detected, "
"discarding capability\n", inners->dl_cap));
return;
}
/* Process the encapsulated sub-capability */
ill_capability_dispatch(ill, mp, inners);
}
static void
ill_capability_dld_reset_fill(ill_t *ill, mblk_t *mp)
{
dl_capability_sub_t *dl_subcap;
if (!(ill->ill_capabilities & ILL_CAPAB_DLD))
return;
/*
* The dl_capab_dld_t that follows the dl_capability_sub_t is not
* initialized below since it is not used by DLD.
*/
dl_subcap = (dl_capability_sub_t *)mp->b_wptr;
dl_subcap->dl_cap = DL_CAPAB_DLD;
dl_subcap->dl_length = sizeof (dl_capab_dld_t);
mp->b_wptr += sizeof (dl_capability_sub_t) + sizeof (dl_capab_dld_t);
}
static void
ill_capability_dispatch(ill_t *ill, mblk_t *mp, dl_capability_sub_t *subp)
{
/*
* If no ipif was brought up over this ill, this DL_CAPABILITY_REQ/ACK
* is only to get the VRRP capability.
*
* Note that we cannot check ill_ipif_up_count here since
* ill_ipif_up_count is only incremented when the resolver is setup.
* That is done asynchronously, and can race with this function.
*/
if (!ill->ill_dl_up) {
if (subp->dl_cap == DL_CAPAB_VRRP)
ill_capability_vrrp_ack(ill, mp, subp);
return;
}
switch (subp->dl_cap) {
case DL_CAPAB_HCKSUM:
ill_capability_hcksum_ack(ill, mp, subp);
break;
case DL_CAPAB_ZEROCOPY:
ill_capability_zerocopy_ack(ill, mp, subp);
break;
case DL_CAPAB_DLD:
ill_capability_dld_ack(ill, mp, subp);
break;
case DL_CAPAB_VRRP:
break;
default:
ip1dbg(("ill_capability_dispatch: unknown capab type %d\n",
subp->dl_cap));
}
}
/*
* Process the vrrp capability received from a DLS Provider. isub must point
* to the sub-capability (DL_CAPAB_VRRP) of a DL_CAPABILITY_ACK message.
*/
static void
ill_capability_vrrp_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
{
dl_capab_vrrp_t *vrrp;
uint_t sub_dl_cap = isub->dl_cap;
uint8_t *capend;
ASSERT(IAM_WRITER_ILL(ill));
ASSERT(sub_dl_cap == DL_CAPAB_VRRP);
/*
* Note: range checks here are not absolutely sufficient to
* make us robust against malformed messages sent by drivers;
* this is in keeping with the rest of IP's dlpi handling.
* (Remember, it's coming from something else in the kernel
* address space)
*/
capend = (uint8_t *)(isub + 1) + isub->dl_length;
if (capend > mp->b_wptr) {
cmn_err(CE_WARN, "ill_capability_vrrp_ack: "
"malformed sub-capability too long for mblk");
return;
}
vrrp = (dl_capab_vrrp_t *)(isub + 1);
/*
* Compare the IP address family and set ILLF_VRRP for the right ill.
*/
if ((vrrp->vrrp_af == AF_INET6 && ill->ill_isv6) ||
(vrrp->vrrp_af == AF_INET && !ill->ill_isv6)) {
ill->ill_flags |= ILLF_VRRP;
}
}
/*
* Process a hardware checksum offload capability negotiation ack received
* from a DLS Provider. isub must point to the sub-capability (DL_CAPAB_HCKSUM)
* of a DL_CAPABILITY_ACK message.
*/
static void
ill_capability_hcksum_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
{
dl_capability_req_t *ocap;
dl_capab_hcksum_t *ihck, *ohck;
ill_hcksum_capab_t **ill_hcksum;
mblk_t *nmp = NULL;
uint_t sub_dl_cap = isub->dl_cap;
uint8_t *capend;
ASSERT(sub_dl_cap == DL_CAPAB_HCKSUM);
ill_hcksum = (ill_hcksum_capab_t **)&ill->ill_hcksum_capab;
/*
* Note: range checks here are not absolutely sufficient to
* make us robust against malformed messages sent by drivers;
* this is in keeping with the rest of IP's dlpi handling.
* (Remember, it's coming from something else in the kernel
* address space)
*/
capend = (uint8_t *)(isub + 1) + isub->dl_length;
if (capend > mp->b_wptr) {
cmn_err(CE_WARN, "ill_capability_hcksum_ack: "
"malformed sub-capability too long for mblk");
return;
}
/*
* There are two types of acks we process here:
* 1. acks in reply to a (first form) generic capability req
* (no ENABLE flag set)
* 2. acks in reply to an ENABLE capability req.
* (ENABLE flag set)
*/
ihck = (dl_capab_hcksum_t *)(isub + 1);
if (ihck->hcksum_version != HCKSUM_VERSION_1) {
cmn_err(CE_CONT, "ill_capability_hcksum_ack: "
"unsupported hardware checksum "
"sub-capability (version %d, expected %d)",
ihck->hcksum_version, HCKSUM_VERSION_1);
return;
}
if (!dlcapabcheckqid(&ihck->hcksum_mid, ill->ill_lmod_rq)) {
ip1dbg(("ill_capability_hcksum_ack: mid token for hardware "
"checksum capability isn't as expected; pass-thru "
"module(s) detected, discarding capability\n"));
return;
}
#define CURR_HCKSUM_CAPAB \
(HCKSUM_INET_PARTIAL | HCKSUM_INET_FULL_V4 | \
HCKSUM_INET_FULL_V6 | HCKSUM_IPHDRCKSUM)
if ((ihck->hcksum_txflags & HCKSUM_ENABLE) &&
(ihck->hcksum_txflags & CURR_HCKSUM_CAPAB)) {
/* do ENABLE processing */
if (*ill_hcksum == NULL) {
*ill_hcksum = kmem_zalloc(sizeof (ill_hcksum_capab_t),
KM_NOSLEEP);
if (*ill_hcksum == NULL) {
cmn_err(CE_WARN, "ill_capability_hcksum_ack: "
"could not enable hcksum version %d "
"for %s (ENOMEM)\n", HCKSUM_CURRENT_VERSION,
ill->ill_name);
return;
}
}
(*ill_hcksum)->ill_hcksum_version = ihck->hcksum_version;
(*ill_hcksum)->ill_hcksum_txflags = ihck->hcksum_txflags;
ill->ill_capabilities |= ILL_CAPAB_HCKSUM;
ip1dbg(("ill_capability_hcksum_ack: interface %s "
"has enabled hardware checksumming\n ",
ill->ill_name));
} else if (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB) {
/*
* Enabling hardware checksum offload
* Currently IP supports {TCP,UDP}/IPv4
* partial and full cksum offload and
* IPv4 header checksum offload.
* Allocate new mblk which will
* contain a new capability request
* to enable hardware checksum offload.
*/
uint_t size;
uchar_t *rptr;
size = sizeof (dl_capability_req_t) +
sizeof (dl_capability_sub_t) + isub->dl_length;
if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) {
cmn_err(CE_WARN, "ill_capability_hcksum_ack: "
"could not enable hardware cksum for %s (ENOMEM)\n",
ill->ill_name);
return;
}
rptr = nmp->b_rptr;
/* initialize dl_capability_req_t */
ocap = (dl_capability_req_t *)nmp->b_rptr;
ocap->dl_sub_offset =
sizeof (dl_capability_req_t);
ocap->dl_sub_length =
sizeof (dl_capability_sub_t) +
isub->dl_length;
nmp->b_rptr += sizeof (dl_capability_req_t);
/* initialize dl_capability_sub_t */
bcopy(isub, nmp->b_rptr, sizeof (*isub));
nmp->b_rptr += sizeof (*isub);
/* initialize dl_capab_hcksum_t */
ohck = (dl_capab_hcksum_t *)nmp->b_rptr;
bcopy(ihck, ohck, sizeof (*ihck));
nmp->b_rptr = rptr;
ASSERT(nmp->b_wptr == (nmp->b_rptr + size));
/* Set ENABLE flag */
ohck->hcksum_txflags &= CURR_HCKSUM_CAPAB;
ohck->hcksum_txflags |= HCKSUM_ENABLE;
/*
* nmp points to a DL_CAPABILITY_REQ message to enable
* hardware checksum acceleration.
*/
ill_capability_send(ill, nmp);
} else {
ip1dbg(("ill_capability_hcksum_ack: interface %s has "
"advertised %x hardware checksum capability flags\n",
ill->ill_name, ihck->hcksum_txflags));
}
}
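/*
* If hardware checksum offload is enabled, append a DL_CAPAB_HCKSUM
* sub-capability with zeroed txflags to the capability reset message.
*/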
static void
ill_capability_hcksum_reset_fill(ill_t *ill, mblk_t *mp)
{
dl_capab_hcksum_t *hck_subcap;
dl_capability_sub_t *dl_subcap;
if (!ILL_HCKSUM_CAPABLE(ill))
return;
ASSERT(ill->ill_hcksum_capab != NULL);
dl_subcap = (dl_capability_sub_t *)mp->b_wptr;
dl_subcap->dl_cap = DL_CAPAB_HCKSUM;
dl_subcap->dl_length = sizeof (*hck_subcap);
hck_subcap = (dl_capab_hcksum_t *)(dl_subcap + 1);
hck_subcap->hcksum_version = ill->ill_hcksum_capab->ill_hcksum_version;
hck_subcap->hcksum_txflags = 0;
mp->b_wptr += sizeof (*dl_subcap) + sizeof (*hck_subcap);
}
static void
ill_capability_zerocopy_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
{
mblk_t *nmp = NULL;
dl_capability_req_t *oc;
dl_capab_zerocopy_t *zc_ic, *zc_oc;
ill_zerocopy_capab_t **ill_zerocopy_capab;
uint_t sub_dl_cap = isub->dl_cap;
uint8_t *capend;
ASSERT(sub_dl_cap == DL_CAPAB_ZEROCOPY);
ill_zerocopy_capab = (ill_zerocopy_capab_t **)&ill->ill_zerocopy_capab;
/*
* Note: range checks here are not absolutely sufficient to
* make us robust against malformed messages sent by drivers;
* this is in keeping with the rest of IP's dlpi handling.
* (Remember, it's coming from something else in the kernel
* address space)
*/
capend = (uint8_t *)(isub + 1) + isub->dl_length;
if (capend > mp->b_wptr) {
cmn_err(CE_WARN, "ill_capability_zerocopy_ack: "
"malformed sub-capability too long for mblk");
return;
}
zc_ic = (dl_capab_zerocopy_t *)(isub + 1);
if (zc_ic->zerocopy_version != ZEROCOPY_VERSION_1) {
cmn_err(CE_CONT, "ill_capability_zerocopy_ack: "
"unsupported ZEROCOPY sub-capability (version %d, "
"expected %d)", zc_ic->zerocopy_version,
ZEROCOPY_VERSION_1);
return;
}
if (!dlcapabcheckqid(&zc_ic->zerocopy_mid, ill->ill_lmod_rq)) {
ip1dbg(("ill_capability_zerocopy_ack: mid token for zerocopy "
"capability isn't as expected; pass-thru module(s) "
"detected, discarding capability\n"));
return;
}
if ((zc_ic->zerocopy_flags & DL_CAPAB_VMSAFE_MEM) != 0) {
if (*ill_zerocopy_capab == NULL) {
*ill_zerocopy_capab =
kmem_zalloc(sizeof (ill_zerocopy_capab_t),
KM_NOSLEEP);
if (*ill_zerocopy_capab == NULL) {
cmn_err(CE_WARN, "ill_capability_zerocopy_ack: "
"could not enable Zero-copy version %d "
"for %s (ENOMEM)\n", ZEROCOPY_VERSION_1,
ill->ill_name);
return;
}
}
ip1dbg(("ill_capability_zerocopy_ack: interface %s "
"supports Zero-copy version %d\n", ill->ill_name,
ZEROCOPY_VERSION_1));
(*ill_zerocopy_capab)->ill_zerocopy_version =
zc_ic->zerocopy_version;
(*ill_zerocopy_capab)->ill_zerocopy_flags =
zc_ic->zerocopy_flags;
ill->ill_capabilities |= ILL_CAPAB_ZEROCOPY;
} else {
uint_t size;
uchar_t *rptr;
size = sizeof (dl_capability_req_t) +
sizeof (dl_capability_sub_t) +
sizeof (dl_capab_zerocopy_t);
if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) {
cmn_err(CE_WARN, "ill_capability_zerocopy_ack: "
"could not enable zerocopy for %s (ENOMEM)\n",
ill->ill_name);
return;
}
rptr = nmp->b_rptr;
/* initialize dl_capability_req_t */
oc = (dl_capability_req_t *)rptr;
oc->dl_sub_offset = sizeof (dl_capability_req_t);
oc->dl_sub_length = sizeof (dl_capability_sub_t) +
sizeof (dl_capab_zerocopy_t);
rptr += sizeof (dl_capability_req_t);
/* initialize dl_capability_sub_t */
bcopy(isub, rptr, sizeof (*isub));
rptr += sizeof (*isub);
/* initialize dl_capab_zerocopy_t */
zc_oc = (dl_capab_zerocopy_t *)rptr;
*zc_oc = *zc_ic;
ip1dbg(("ill_capability_zerocopy_ack: asking interface %s "
"to enable zero-copy version %d\n", ill->ill_name,
ZEROCOPY_VERSION_1));
/* set VMSAFE_MEM flag */
zc_oc->zerocopy_flags |= DL_CAPAB_VMSAFE_MEM;
/* nmp points to a DL_CAPABILITY_REQ message to enable zcopy */
ill_capability_send(ill, nmp);
}
}
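/*
* If zero-copy is enabled, append a DL_CAPAB_ZEROCOPY sub-capability
* with zeroed flags to the capability reset message.
*/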
static void
ill_capability_zerocopy_reset_fill(ill_t *ill, mblk_t *mp)
{
dl_capab_zerocopy_t *zerocopy_subcap;
dl_capability_sub_t *dl_subcap;
if (!(ill->ill_capabilities & ILL_CAPAB_ZEROCOPY))
return;
ASSERT(ill->ill_zerocopy_capab != NULL);
dl_subcap = (dl_capability_sub_t *)mp->b_wptr;
dl_subcap->dl_cap = DL_CAPAB_ZEROCOPY;
dl_subcap->dl_length = sizeof (*zerocopy_subcap);
zerocopy_subcap = (dl_capab_zerocopy_t *)(dl_subcap + 1);
zerocopy_subcap->zerocopy_version =
ill->ill_zerocopy_capab->ill_zerocopy_version;
zerocopy_subcap->zerocopy_flags = 0;
mp->b_wptr += sizeof (*dl_subcap) + sizeof (*zerocopy_subcap);
}
/*
* DLD capability
* Refer to dld.h for more information regarding the purpose and usage
* of this capability.
*/
static void
ill_capability_dld_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
{
dl_capab_dld_t *dld_ic, dld;
uint_t sub_dl_cap = isub->dl_cap;
uint8_t *capend;
ill_dld_capab_t *idc;
ASSERT(IAM_WRITER_ILL(ill));
ASSERT(sub_dl_cap == DL_CAPAB_DLD);
/*
* Note: range checks here are not absolutely sufficient to
* make us robust against malformed messages sent by drivers;
* this is in keeping with the rest of IP's dlpi handling.
* (Remember, it's coming from something else in the kernel
* address space)
*/
capend = (uint8_t *)(isub + 1) + isub->dl_length;
if (capend > mp->b_wptr) {
cmn_err(CE_WARN, "ill_capability_dld_ack: "
"malformed sub-capability too long for mblk");
return;
}
dld_ic = (dl_capab_dld_t *)(isub + 1);
if (dld_ic->dld_version != DLD_CURRENT_VERSION) {
cmn_err(CE_CONT, "ill_capability_dld_ack: "
"unsupported DLD sub-capability (version %d, "
"expected %d)", dld_ic->dld_version,
DLD_CURRENT_VERSION);
return;
}
if (!dlcapabcheckqid(&dld_ic->dld_mid, ill->ill_lmod_rq)) {
ip1dbg(("ill_capability_dld_ack: mid token for dld "
"capability isn't as expected; pass-thru module(s) "
"detected, discarding capability\n"));
return;
}
/*
* Copy locally to ensure alignment.
*/
bcopy(dld_ic, &dld, sizeof (dl_capab_dld_t));
if ((idc = ill->ill_dld_capab) == NULL) {
idc = kmem_zalloc(sizeof (ill_dld_capab_t), KM_NOSLEEP);
if (idc == NULL) {
cmn_err(CE_WARN, "ill_capability_dld_ack: "
"could not enable DLD version %d "
"for %s (ENOMEM)\n", DLD_CURRENT_VERSION,
ill->ill_name);
return;
}
ill->ill_dld_capab = idc;
}
idc->idc_capab_df = (ip_capab_func_t)dld.dld_capab;
idc->idc_capab_dh = (void *)dld.dld_capab_handle;
ip1dbg(("ill_capability_dld_ack: interface %s "
"supports DLD version %d\n", ill->ill_name, DLD_CURRENT_VERSION));
ill_capability_dld_enable(ill);
}
/*
* Typically capability negotiation between IP and the driver happens via
* DLPI message exchange. However, GLD also offers a direct function call
* mechanism to exchange the DLD_DIRECT_CAPAB and DLD_POLL_CAPAB capabilities.
* But arbitrary function calls into IP or GLD are not permitted, since both
* of them are protected by their own perimeter mechanism. The perimeter can
* be viewed as a coarse lock or serialization mechanism. The hierarchy of
* these perimeters is IP -> MAC. Thus for example to enable the squeue
* polling, IP needs to enter its perimeter, then call ill_mac_perim_enter
* to enter the mac perimeter and then do the direct function calls into
* GLD to enable squeue polling. The ring related callbacks from the mac into
* the stack to add, bind, quiesce, restart or cleanup a ring are all
* protected by the mac perimeter.
*/
static void
ill_mac_perim_enter(ill_t *ill, mac_perim_handle_t *mphp)
{
ill_dld_capab_t *idc = ill->ill_dld_capab;
int err;
err = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, mphp,
DLD_ENABLE);
ASSERT(err == 0);
}
static void
ill_mac_perim_exit(ill_t *ill, mac_perim_handle_t mph)
{
ill_dld_capab_t *idc = ill->ill_dld_capab;
int err;
err = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, mph,
DLD_DISABLE);
ASSERT(err == 0);
}
boolean_t
ill_mac_perim_held(ill_t *ill)
{
ill_dld_capab_t *idc = ill->ill_dld_capab;
return (idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, NULL,
DLD_QUERY));
}
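/*
* Enable the DLD_CAPAB_DIRECT capability: register ip_input() as the
* receive entry point and, on success, record the driver's direct
* transmit, tx-callback and flow-control entry points and register
* the flow enable callback.
*/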
static void
ill_capability_direct_enable(ill_t *ill)
{
ill_dld_capab_t *idc = ill->ill_dld_capab;
ill_dld_direct_t *idd = &idc->idc_direct;
dld_capab_direct_t direct;
int rc;
ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill));
bzero(&direct, sizeof (direct));
direct.di_rx_cf = (uintptr_t)ip_input;
direct.di_rx_ch = ill;
rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_DIRECT, &direct,
DLD_ENABLE);
if (rc == 0) {
idd->idd_tx_df = (ip_dld_tx_t)direct.di_tx_df;
idd->idd_tx_dh = direct.di_tx_dh;
idd->idd_tx_cb_df = (ip_dld_callb_t)direct.di_tx_cb_df;
idd->idd_tx_cb_dh = direct.di_tx_cb_dh;
idd->idd_tx_fctl_df = (ip_dld_fctl_t)direct.di_tx_fctl_df;
idd->idd_tx_fctl_dh = direct.di_tx_fctl_dh;
ASSERT(idd->idd_tx_cb_df != NULL);
ASSERT(idd->idd_tx_fctl_df != NULL);
ASSERT(idd->idd_tx_df != NULL);
/*
* One time registration of flow enable callback function
*/
ill->ill_flownotify_mh = idd->idd_tx_cb_df(idd->idd_tx_cb_dh,
ill_flow_enable, ill);
ill->ill_capabilities |= ILL_CAPAB_DLD_DIRECT;
DTRACE_PROBE1(direct_on, (ill_t *), ill);
} else {
cmn_err(CE_WARN, "warning: could not enable DIRECT "
"capability, rc = %d\n", rc);
DTRACE_PROBE2(direct_off, (ill_t *), ill, (int), rc);
}
}
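/*
* Enable the DLD_CAPAB_POLL capability by passing the squeue ring callbacks
* (add, remove, quiesce, restart and bind) to the driver.
*/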
static void
ill_capability_poll_enable(ill_t *ill)
{
ill_dld_capab_t *idc = ill->ill_dld_capab;
dld_capab_poll_t poll;
int rc;
ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill));
bzero(&poll, sizeof (poll));
poll.poll_ring_add_cf = (uintptr_t)ip_squeue_add_ring;
poll.poll_ring_remove_cf = (uintptr_t)ip_squeue_clean_ring;
poll.poll_ring_quiesce_cf = (uintptr_t)ip_squeue_quiesce_ring;
poll.poll_ring_restart_cf = (uintptr_t)ip_squeue_restart_ring;
poll.poll_ring_bind_cf = (uintptr_t)ip_squeue_bind_ring;
poll.poll_ring_ch = ill;
rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_POLL, &poll,
DLD_ENABLE);
if (rc == 0) {
ill->ill_capabilities |= ILL_CAPAB_DLD_POLL;
DTRACE_PROBE1(poll_on, (ill_t *), ill);
} else {
ip1dbg(("warning: could not enable POLL "
"capability, rc = %d\n", rc));
DTRACE_PROBE2(poll_off, (ill_t *), ill, (int), rc);
}
}
/*
* Enable the LSO capability.
*/
static void
ill_capability_lso_enable(ill_t *ill)
{
ill_dld_capab_t *idc = ill->ill_dld_capab;
dld_capab_lso_t lso;
int rc;
ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill));
if (ill->ill_lso_capab == NULL) {
ill->ill_lso_capab = kmem_zalloc(sizeof (ill_lso_capab_t),
KM_NOSLEEP);
if (ill->ill_lso_capab == NULL) {
cmn_err(CE_WARN, "ill_capability_lso_enable: "
"could not enable LSO for %s (ENOMEM)\n",
ill->ill_name);
return;
}
}
bzero(&lso, sizeof (lso));
if ((rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_LSO, &lso,
DLD_ENABLE)) == 0) {
ill->ill_lso_capab->ill_lso_flags = lso.lso_flags;
ill->ill_lso_capab->ill_lso_max = lso.lso_max;
ill->ill_capabilities |= ILL_CAPAB_LSO;
ip1dbg(("ill_capability_lso_enable: interface %s "
"has enabled LSO\n ", ill->ill_name));
} else {
kmem_free(ill->ill_lso_capab, sizeof (ill_lso_capab_t));
ill->ill_lso_capab = NULL;
DTRACE_PROBE2(lso_off, (ill_t *), ill, (int), rc);
}
}
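/*
* Enable the function call based DLD capabilities (direct, poll and LSO)
* under the mac perimeter. These capabilities are only negotiated for
* IPv4 ills.
*/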
static void
ill_capability_dld_enable(ill_t *ill)
{
mac_perim_handle_t mph;
ASSERT(IAM_WRITER_ILL(ill));
if (ill->ill_isv6)
return;
ill_mac_perim_enter(ill, &mph);
if (!ill->ill_isv6) {
ill_capability_direct_enable(ill);
ill_capability_poll_enable(ill);
ill_capability_lso_enable(ill);
}
ill->ill_capabilities |= ILL_CAPAB_DLD;
ill_mac_perim_exit(ill, mph);
}
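/*
* Disable the function call based DLD capabilities under the mac perimeter,
* undoing what ill_capability_dld_enable() set up.
*/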
static void
ill_capability_dld_disable(ill_t *ill)
{
ill_dld_capab_t *idc;
ill_dld_direct_t *idd;
mac_perim_handle_t mph;
ASSERT(IAM_WRITER_ILL(ill));
if (!(ill->ill_capabilities & ILL_CAPAB_DLD))
return;
ill_mac_perim_enter(ill, &mph);
idc = ill->ill_dld_capab;
if ((ill->ill_capabilities & ILL_CAPAB_DLD_DIRECT) != 0) {
/*
* For performance we avoid locks in the transmit data path
* and don't maintain a count of the number of threads using
* direct calls. Thus some threads could be using direct
* transmit calls to GLD, even after the capability mechanism
* turns it off. This is still safe since the handles used in
* the direct calls continue to be valid until the unplumb is
* completed. Remove the callback that was added (1-time) at
* capab enable time.
*/
mutex_enter(&ill->ill_lock);
ill->ill_capabilities &= ~ILL_CAPAB_DLD_DIRECT;
mutex_exit(&ill->ill_lock);
if (ill->ill_flownotify_mh != NULL) {
idd = &idc->idc_direct;
idd->idd_tx_cb_df(idd->idd_tx_cb_dh, NULL,
ill->ill_flownotify_mh);
ill->ill_flownotify_mh = NULL;
}
(void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_DIRECT,
NULL, DLD_DISABLE);
}
if ((ill->ill_capabilities & ILL_CAPAB_DLD_POLL) != 0) {
ill->ill_capabilities &= ~ILL_CAPAB_DLD_POLL;
ip_squeue_clean_all(ill);
(void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_POLL,
NULL, DLD_DISABLE);
}
if ((ill->ill_capabilities & ILL_CAPAB_LSO) != 0) {
ASSERT(ill->ill_lso_capab != NULL);
/*
* Clear the capability flag for LSO but retain the
* ill_lso_capab structure since it's possible that another
* thread is still referring to it. The structure only gets
* deallocated when we destroy the ill.
*/
ill->ill_capabilities &= ~ILL_CAPAB_LSO;
(void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_LSO,
NULL, DLD_DISABLE);
}
ill->ill_capabilities &= ~ILL_CAPAB_DLD;
ill_mac_perim_exit(ill, mph);
}
/*
* Capability Negotiation protocol
*
* We don't wait for DLPI capability operations to finish during interface
* bringup or teardown. Doing so would introduce more asynchrony and the
* interface up/down operations would need multiple returns and restarts.
* Instead, the 'ipsq_current_ipif' of the ipsq is not cleared as long as
* the 'ill_dlpi_deferred' chain is non-empty. This ensures that the next
* exclusive operation won't start until the DLPI operations of the previous
* exclusive operation complete.
*
* The capability state machine is shown below.
*
* state next state event, action
*
* IDCS_UNKNOWN IDCS_PROBE_SENT ill_capability_probe
* IDCS_PROBE_SENT IDCS_OK ill_capability_ack
* IDCS_PROBE_SENT IDCS_FAILED ip_rput_dlpi_writer (nack)
* IDCS_OK IDCS_RENEG Receipt of DL_NOTE_CAPAB_RENEG
* IDCS_OK IDCS_RESET_SENT ill_capability_reset
* IDCS_RESET_SENT IDCS_UNKNOWN ill_capability_ack_thr
* IDCS_RENEG IDCS_PROBE_SENT ill_capability_ack_thr ->
* ill_capability_probe.
*/
/*
* Dedicated thread started from ip_stack_init that handles capability
* disable. This thread ensures the taskq dispatch does not fail by waiting
* for resources using TQ_SLEEP. The taskq mechanism is used to ensure
* that direct calls to DLD are done in a cv_waitable context.
*/
void
ill_taskq_dispatch(ip_stack_t *ipst)
{
callb_cpr_t cprinfo;
char name[64];
mblk_t *mp;
(void) snprintf(name, sizeof (name), "ill_taskq_dispatch_%d",
ipst->ips_netstack->netstack_stackid);
CALLB_CPR_INIT(&cprinfo, &ipst->ips_capab_taskq_lock, callb_generic_cpr,
name);
mutex_enter(&ipst->ips_capab_taskq_lock);
for (;;) {
mp = ipst->ips_capab_taskq_head;
while (mp != NULL) {
ipst->ips_capab_taskq_head = mp->b_next;
if (ipst->ips_capab_taskq_head == NULL)
ipst->ips_capab_taskq_tail = NULL;
mutex_exit(&ipst->ips_capab_taskq_lock);
mp->b_next = NULL;
VERIFY(taskq_dispatch(system_taskq,
ill_capability_ack_thr, mp, TQ_SLEEP) != 0);
mutex_enter(&ipst->ips_capab_taskq_lock);
mp = ipst->ips_capab_taskq_head;
}
if (ipst->ips_capab_taskq_quit)
break;
CALLB_CPR_SAFE_BEGIN(&cprinfo);
cv_wait(&ipst->ips_capab_taskq_cv, &ipst->ips_capab_taskq_lock);
CALLB_CPR_SAFE_END(&cprinfo, &ipst->ips_capab_taskq_lock);
}
VERIFY(ipst->ips_capab_taskq_head == NULL);
VERIFY(ipst->ips_capab_taskq_tail == NULL);
CALLB_CPR_EXIT(&cprinfo);
thread_exit();
}
/*
* Consume a new-style hardware capabilities negotiation ack.
* Called via taskq on receipt of DL_CAPABILITY_ACK.
*/
static void
ill_capability_ack_thr(void *arg)
{
mblk_t *mp = arg;
dl_capability_ack_t *capp;
dl_capability_sub_t *subp, *endp;
ill_t *ill;
boolean_t reneg;
ill = (ill_t *)mp->b_prev;
mp->b_prev = NULL;
VERIFY(ipsq_enter(ill, B_FALSE, CUR_OP) == B_TRUE);
if (ill->ill_dlpi_capab_state == IDCS_RESET_SENT ||
ill->ill_dlpi_capab_state == IDCS_RENEG) {
/*
* We have received the ack for our DL_CAPAB reset request.
* There isn't anything in the message that needs processing.
* All message-based capabilities have been disabled; now
* do the function-call based capability disable.
*/
reneg = ill->ill_dlpi_capab_state == IDCS_RENEG;
ill_capability_dld_disable(ill);
ill->ill_dlpi_capab_state = IDCS_UNKNOWN;
if (reneg)
ill_capability_probe(ill);
goto done;
}
if (ill->ill_dlpi_capab_state == IDCS_PROBE_SENT)
ill->ill_dlpi_capab_state = IDCS_OK;
capp = (dl_capability_ack_t *)mp->b_rptr;
if (capp->dl_sub_length == 0) {
/* no new-style capabilities */
goto done;
}
/* make sure the driver supplied correct dl_sub_length */
if ((sizeof (*capp) + capp->dl_sub_length) > MBLKL(mp)) {
ip0dbg(("ill_capability_ack: bad DL_CAPABILITY_ACK, "
"invalid dl_sub_length (%d)\n", capp->dl_sub_length));
goto done;
}
#define SC(base, offset) (dl_capability_sub_t *)(((uchar_t *)(base))+(offset))
/*
* There are sub-capabilities. Process the ones we know about.
* Loop until we don't have room for another sub-cap header.
*/
for (subp = SC(capp, capp->dl_sub_offset),
endp = SC(subp, capp->dl_sub_length - sizeof (*subp));
subp <= endp;
subp = SC(subp, sizeof (dl_capability_sub_t) + subp->dl_length)) {
switch (subp->dl_cap) {
case DL_CAPAB_ID_WRAPPER:
ill_capability_id_ack(ill, mp, subp);
break;
default:
ill_capability_dispatch(ill, mp, subp);
break;
}
}
#undef SC
done:
inet_freemsg(mp);
ill_capability_done(ill);
ipsq_exit(ill->ill_phyint->phyint_ipsq);
}
/*
* This needs to be started in a taskq thread to provide a cv_waitable
* context.
*/
void
ill_capability_ack(ill_t *ill, mblk_t *mp)
{
ip_stack_t *ipst = ill->ill_ipst;
mp->b_prev = (mblk_t *)ill;
ASSERT(mp->b_next == NULL);
if (taskq_dispatch(system_taskq, ill_capability_ack_thr, mp,
TQ_NOSLEEP) != 0)
return;
/*
* The taskq dispatch failed. Signal the ill_taskq_dispatch thread
* which will do the dispatch using TQ_SLEEP to guarantee success.
*/
mutex_enter(&ipst->ips_capab_taskq_lock);
if (ipst->ips_capab_taskq_head == NULL) {
ASSERT(ipst->ips_capab_taskq_tail == NULL);
ipst->ips_capab_taskq_head = mp;
} else {
ipst->ips_capab_taskq_tail->b_next = mp;
}
ipst->ips_capab_taskq_tail = mp;
cv_signal(&ipst->ips_capab_taskq_cv);
mutex_exit(&ipst->ips_capab_taskq_lock);
}
/*
* This routine is called to scan the fragmentation reassembly table for
* the specified ILL for any packets that are starting to smell.
* dead_interval is the maximum time in seconds that will be tolerated. It
* will either be the value specified in ip_g_frag_timeout, or zero if the
* ILL is shutting down and it is time to blow everything off.
*
* It returns the number of seconds (as a time_t) that the next frag timer
* should be scheduled for, 0 meaning that the timer doesn't need to be
* re-started. Note that the method of calculating next_timeout isn't
* entirely accurate since time will flow between the time we grab
* current_time and the time we schedule the next timeout. This isn't a
* big problem since this is the timer for sending ICMP reassembly time
* exceeded messages, and it doesn't have to be exactly accurate.
*
* This function is
* sometimes called as writer, although this is not required.
*/
time_t
ill_frag_timeout(ill_t *ill, time_t dead_interval)
{
ipfb_t *ipfb;
ipfb_t *endp;
ipf_t *ipf;
ipf_t *ipfnext;
mblk_t *mp;
time_t current_time = gethrestime_sec();
time_t next_timeout = 0;
uint32_t hdr_length;
mblk_t *send_icmp_head;
mblk_t *send_icmp_head_v6;
ip_stack_t *ipst = ill->ill_ipst;
ip_recv_attr_t iras;
bzero(&iras, sizeof (iras));
iras.ira_flags = 0;
iras.ira_ill = iras.ira_rill = ill;
iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
iras.ira_rifindex = iras.ira_ruifindex;
ipfb = ill->ill_frag_hash_tbl;
if (ipfb == NULL)
return (0);
endp = &ipfb[ILL_FRAG_HASH_TBL_COUNT];
/* Walk the frag hash table. */
for (; ipfb < endp; ipfb++) {
send_icmp_head = NULL;
send_icmp_head_v6 = NULL;
mutex_enter(&ipfb->ipfb_lock);
while ((ipf = ipfb->ipfb_ipf) != NULL) {
time_t frag_time = current_time - ipf->ipf_timestamp;
time_t frag_timeout;
if (frag_time < dead_interval) {
/*
* There are some outstanding fragments
* that will timeout later. Make note of
* the time so that we can reschedule the
* next timeout appropriately.
*/
frag_timeout = dead_interval - frag_time;
if (next_timeout == 0 ||
frag_timeout < next_timeout) {
next_timeout = frag_timeout;
}
break;
}
/* Time's up. Get it out of here. */
hdr_length = ipf->ipf_nf_hdr_len;
ipfnext = ipf->ipf_hash_next;
if (ipfnext)
ipfnext->ipf_ptphn = ipf->ipf_ptphn;
*ipf->ipf_ptphn = ipfnext;
mp = ipf->ipf_mp->b_cont;
for (; mp; mp = mp->b_cont) {
/* Extra points for neatness. */
IP_REASS_SET_START(mp, 0);
IP_REASS_SET_END(mp, 0);
}
mp = ipf->ipf_mp->b_cont;
atomic_add_32(&ill->ill_frag_count, -ipf->ipf_count);
ASSERT(ipfb->ipfb_count >= ipf->ipf_count);
ipfb->ipfb_count -= ipf->ipf_count;
ASSERT(ipfb->ipfb_frag_pkts > 0);
ipfb->ipfb_frag_pkts--;
/*
* We do not send any icmp message from here because
* we currently are holding the ipfb_lock for this
* hash chain. If we try and send any icmp messages
* from here we may end up via a put back into ip
* trying to get the same lock, causing a recursive
* mutex panic. Instead we build a list and send all
* the icmp messages after we have dropped the lock.
*/
if (ill->ill_isv6) {
if (hdr_length != 0) {
mp->b_next = send_icmp_head_v6;
send_icmp_head_v6 = mp;
} else {
freemsg(mp);
}
} else {
if (hdr_length != 0) {
mp->b_next = send_icmp_head;
send_icmp_head = mp;
} else {
freemsg(mp);
}
}
BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmFails);
ip_drop_input("ipIfStatsReasmFails", ipf->ipf_mp, ill);
freeb(ipf->ipf_mp);
}
mutex_exit(&ipfb->ipfb_lock);
/*
* Now need to send any icmp messages that we delayed from
* above.
*/
while (send_icmp_head_v6 != NULL) {
ip6_t *ip6h;
mp = send_icmp_head_v6;
send_icmp_head_v6 = send_icmp_head_v6->b_next;
mp->b_next = NULL;
ip6h = (ip6_t *)mp->b_rptr;
iras.ira_flags = 0;
/*
* This will result in an incorrect ALL_ZONES zoneid
* for multicast packets, but we
* don't send ICMP errors for those in any case.
*/
iras.ira_zoneid =
ipif_lookup_addr_zoneid_v6(&ip6h->ip6_dst,
ill, ipst);
ip_drop_input("ICMP_TIME_EXCEEDED reass", mp, ill);
icmp_time_exceeded_v6(mp,
ICMP_REASSEMBLY_TIME_EXCEEDED, B_FALSE,
&iras);
ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
}
while (send_icmp_head != NULL) {
ipaddr_t dst;
mp = send_icmp_head;
send_icmp_head = send_icmp_head->b_next;
mp->b_next = NULL;
dst = ((ipha_t *)mp->b_rptr)->ipha_dst;
iras.ira_flags = IRAF_IS_IPV4;
/*
* This will result in an incorrect ALL_ZONES zoneid
* for broadcast and multicast packets, but we
* don't send ICMP errors for those in any case.
*/
iras.ira_zoneid = ipif_lookup_addr_zoneid(dst,
ill, ipst);
ip_drop_input("ICMP_TIME_EXCEEDED reass", mp, ill);
icmp_time_exceeded(mp,
ICMP_REASSEMBLY_TIME_EXCEEDED, &iras);
ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
}
}
/*
* A non-dying ILL will use the return value to decide whether to
* restart the frag timer, and for how long.
*/
return (next_timeout);
}
/*
* This routine is called when the approximate count of mblk memory used
* for the specified ILL has exceeded max_count.
*/
void
ill_frag_prune(ill_t *ill, uint_t max_count)
{
ipfb_t *ipfb;
ipf_t *ipf;
size_t count;
clock_t now;
/*
* If we are called again within ip_min_frag_prune_time msecs of the last
* prune, increment ill_frag_free_num_pkts, the number of oldest packets
* to remove from each bucket below; otherwise reset it to zero.
*/
mutex_enter(&ill->ill_lock);
now = ddi_get_lbolt();
if (TICK_TO_MSEC(now - ill->ill_last_frag_clean_time) <=
(ip_min_frag_prune_time != 0 ?
ip_min_frag_prune_time : msec_per_tick)) {
ill->ill_frag_free_num_pkts++;
} else {
ill->ill_frag_free_num_pkts = 0;
}
ill->ill_last_frag_clean_time = now;
mutex_exit(&ill->ill_lock);
/*
* free ill_frag_free_num_pkts oldest packets from each bucket.
*/
if (ill->ill_frag_free_num_pkts != 0) {
int ix;
for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) {
ipfb = &ill->ill_frag_hash_tbl[ix];
mutex_enter(&ipfb->ipfb_lock);
if (ipfb->ipfb_ipf != NULL) {
ill_frag_free_pkts(ill, ipfb, ipfb->ipfb_ipf,
ill->ill_frag_free_num_pkts);
}
mutex_exit(&ipfb->ipfb_lock);
}
}
/*
* While the reassembly list for this ILL is too big, prune a fragment
* queue by age, oldest first.
*/
while (ill->ill_frag_count > max_count) {
int ix;
ipfb_t *oipfb = NULL;
uint_t oldest = UINT_MAX;
count = 0;
for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) {
ipfb = &ill->ill_frag_hash_tbl[ix];
mutex_enter(&ipfb->ipfb_lock);
ipf = ipfb->ipfb_ipf;
if (ipf != NULL && ipf->ipf_gen < oldest) {
oldest = ipf->ipf_gen;
oipfb = ipfb;
}
count += ipfb->ipfb_count;
mutex_exit(&ipfb->ipfb_lock);
}
if (oipfb == NULL)
break;
if (count <= max_count)
return; /* Somebody beat us to it, nothing to do */
mutex_enter(&oipfb->ipfb_lock);
ipf = oipfb->ipfb_ipf;
if (ipf != NULL) {
ill_frag_free_pkts(ill, oipfb, ipf, 1);
}
mutex_exit(&oipfb->ipfb_lock);
}
}
/*
* free 'free_cnt' fragmented packets starting at ipf.
*/
void
ill_frag_free_pkts(ill_t *ill, ipfb_t *ipfb, ipf_t *ipf, int free_cnt)
{
size_t count;
mblk_t *mp;
mblk_t *tmp;
ipf_t **ipfp = ipf->ipf_ptphn;
ASSERT(MUTEX_HELD(&ipfb->ipfb_lock));
ASSERT(ipfp != NULL);
ASSERT(ipf != NULL);
while (ipf != NULL && free_cnt-- > 0) {
count = ipf->ipf_count;
mp = ipf->ipf_mp;
ipf = ipf->ipf_hash_next;
for (tmp = mp; tmp; tmp = tmp->b_cont) {
IP_REASS_SET_START(tmp, 0);
IP_REASS_SET_END(tmp, 0);
}
atomic_add_32(&ill->ill_frag_count, -count);
ASSERT(ipfb->ipfb_count >= count);
ipfb->ipfb_count -= count;
ASSERT(ipfb->ipfb_frag_pkts > 0);
ipfb->ipfb_frag_pkts--;
BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmFails);
ip_drop_input("ipIfStatsReasmFails", mp, ill);
freemsg(mp);
}
if (ipf)
ipf->ipf_ptphn = ipfp;
ipfp[0] = ipf;
}
/*
* Helper function for ill_forward_set().
*/
static void
ill_forward_set_on_ill(ill_t *ill, boolean_t enable)
{
ip_stack_t *ipst = ill->ill_ipst;
ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ipst->ips_ill_g_lock));
ip1dbg(("ill_forward_set: %s %s forwarding on %s",
(enable ? "Enabling" : "Disabling"),
(ill->ill_isv6 ? "IPv6" : "IPv4"), ill->ill_name));
mutex_enter(&ill->ill_lock);
if (enable)
ill->ill_flags |= ILLF_ROUTER;
else
ill->ill_flags &= ~ILLF_ROUTER;
mutex_exit(&ill->ill_lock);
if (ill->ill_isv6)
ill_set_nce_router_flags(ill, enable);
/* Notify routing socket listeners of this change. */
if (ill->ill_ipif != NULL)
ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT);
}
/*
* Set an ill's ILLF_ROUTER flag appropriately. Send up RTS_IFINFO routing
* socket messages for each interface whose flags we change.
*/
int
ill_forward_set(ill_t *ill, boolean_t enable)
{
ipmp_illgrp_t *illg;
ip_stack_t *ipst = ill->ill_ipst;
ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ipst->ips_ill_g_lock));
if ((enable && (ill->ill_flags & ILLF_ROUTER)) ||
(!enable && !(ill->ill_flags & ILLF_ROUTER)))
return (0);
if (IS_LOOPBACK(ill))
return (EINVAL);
if (enable && ill->ill_allowed_ips_cnt > 0)
return (EPERM);
if (IS_IPMP(ill) || IS_UNDER_IPMP(ill)) {
/*
* Update all of the interfaces in the group.
*/
illg = ill->ill_grp;
ill = list_head(&illg->ig_if);
for (; ill != NULL; ill = list_next(&illg->ig_if, ill))
ill_forward_set_on_ill(ill, enable);
/*
* Update the IPMP meta-interface.
*/
ill_forward_set_on_ill(ipmp_illgrp_ipmp_ill(illg), enable);
return (0);
}
ill_forward_set_on_ill(ill, enable);
return (0);
}
/*
* Based on the ILLF_ROUTER flag of an ill, make sure all local nce's for
* addresses assigned to the ill have the NCE_F_ISROUTER flag appropriately
* set or clear.
*/
static void
ill_set_nce_router_flags(ill_t *ill, boolean_t enable)
{
ipif_t *ipif;
ncec_t *ncec;
nce_t *nce;
for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
/*
* NOTE: we match across the illgrp because nce's for
* addresses on IPMP interfaces have an nce_ill that points to
* the bound underlying ill.
*/
nce = nce_lookup_v6(ill, &ipif->ipif_v6lcl_addr);
if (nce != NULL) {
ncec = nce->nce_common;
mutex_enter(&ncec->ncec_lock);
if (enable)
ncec->ncec_flags |= NCE_F_ISROUTER;
else
ncec->ncec_flags &= ~NCE_F_ISROUTER;
mutex_exit(&ncec->ncec_lock);
nce_refrele(nce);
}
}
}
/*
* Initializes the context structure and returns the first ill in the list.
* Currently start_list and end_list can have the following values:
* MAX_G_HEADS Traverse both IPV4 and IPV6 lists.
* IP_V4_G_HEAD Traverse IPV4 list only.
* IP_V6_G_HEAD Traverse IPV6 list only.
*/
/*
* We don't check for CONDEMNED ills here. Caller must do that if
* necessary under the ill lock.
*/
ill_t *
ill_first(int start_list, int end_list, ill_walk_context_t *ctx,
ip_stack_t *ipst)
{
ill_if_t *ifp;
ill_t *ill;
avl_tree_t *avl_tree;
ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock));
ASSERT(end_list <= MAX_G_HEADS && start_list >= 0);
/*
* setup the lists to search
*/
if (end_list != MAX_G_HEADS) {
ctx->ctx_current_list = start_list;
ctx->ctx_last_list = end_list;
} else {
ctx->ctx_last_list = MAX_G_HEADS - 1;
ctx->ctx_current_list = 0;
}
while (ctx->ctx_current_list <= ctx->ctx_last_list) {
ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list,