/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1990 Mentat Inc. */
/*
* This file contains the interface control functions for IP.
*/
#include <sys/types.h>
#include <sys/stream.h>
#include <sys/dlpi.h>
#include <sys/stropts.h>
#include <sys/strsun.h>
#include <sys/sysmacros.h>
#include <sys/strlog.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/cmn_err.h>
#include <sys/kstat.h>
#include <sys/debug.h>
#include <sys/zone.h>
#include <sys/sunldi.h>
#include <sys/file.h>
#include <sys/bitmap.h>
#include <sys/cpuvar.h>
#include <sys/time.h>
#include <sys/ctype.h>
#include <sys/kmem.h>
#include <sys/systm.h>
#include <sys/param.h>
#include <sys/socket.h>
#include <sys/isa_defs.h>
#include <net/if.h>
#include <net/if_arp.h>
#include <net/if_types.h>
#include <net/if_dl.h>
#include <net/route.h>
#include <sys/sockio.h>
#include <netinet/in.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet/igmp_var.h>
#include <sys/policy.h>
#include <sys/ethernet.h>
#include <sys/callb.h>
#include <sys/md5.h>
#include <inet/common.h> /* for various inet/mi.h and inet/nd.h needs */
#include <inet/mi.h>
#include <inet/nd.h>
#include <inet/arp.h>
#include <inet/mib2.h>
#include <inet/ip.h>
#include <inet/ip6.h>
#include <inet/ip6_asp.h>
#include <inet/tcp.h>
#include <inet/ip_multi.h>
#include <inet/ip_ire.h>
#include <inet/ip_ftable.h>
#include <inet/ip_rts.h>
#include <inet/ip_ndp.h>
#include <inet/ip_if.h>
#include <inet/ip_impl.h>
#include <inet/tun.h>
#include <inet/sctp_ip.h>
#include <inet/ip_netinfo.h>
#include <net/pfkeyv2.h>
#include <inet/ipsec_info.h>
#include <inet/sadb.h>
#include <inet/ipsec_impl.h>
#include <sys/iphada.h>
#include <netinet/igmp.h>
#include <inet/ip_listutils.h>
#include <inet/ipclassifier.h>
#include <sys/mac_client.h>
#include <sys/dld.h>
#include <sys/systeminfo.h>
#include <sys/bootconf.h>
#include <sys/tsol/tndb.h>
#include <sys/tsol/tnet.h>
/* The character which tells where the ill_name ends */
#define IPIF_SEPARATOR_CHAR ':'
/* IP ioctl function table entry */
typedef struct ipft_s {
int ipft_cmd;
pfi_t ipft_pfi;
int ipft_min_size;
int ipft_flags;
} ipft_t;
#define IPFT_F_NO_REPLY 0x1 /* IP ioctl does not expect any reply */
#define IPFT_F_SELF_REPLY 0x2 /* ioctl callee does the ioctl reply */
typedef struct ip_sock_ar_s {
union {
area_t ip_sock_area;
ared_t ip_sock_ared;
areq_t ip_sock_areq;
} ip_sock_ar_u;
queue_t *ip_sock_ar_q;
} ip_sock_ar_t;
static int nd_ill_forward_get(queue_t *, mblk_t *, caddr_t, cred_t *);
static int nd_ill_forward_set(queue_t *q, mblk_t *mp,
char *value, caddr_t cp, cred_t *ioc_cr);
static boolean_t ill_is_quiescent(ill_t *);
static boolean_t ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask);
static ip_m_t *ip_m_lookup(t_uscalar_t mac_type);
static int ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
mblk_t *mp, boolean_t need_up);
static int ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
mblk_t *mp, boolean_t need_up);
static int ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid,
queue_t *q, mblk_t *mp, boolean_t need_up);
static int ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q,
mblk_t *mp);
static int ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
mblk_t *mp);
static int ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t, in6_addr_t,
queue_t *q, mblk_t *mp, boolean_t need_up);
static int ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp,
int ioccmd, struct linkblk *li, boolean_t doconsist);
static ipaddr_t ip_subnet_mask(ipaddr_t addr, ipif_t **, ip_stack_t *);
static void ip_wput_ioctl(queue_t *q, mblk_t *mp);
static void ipsq_flush(ill_t *ill);
static int ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen,
queue_t *q, mblk_t *mp, boolean_t need_up);
static void ipsq_delete(ipsq_t *);
static ipif_t *ipif_allocate(ill_t *ill, int id, uint_t ire_type,
boolean_t initialize, boolean_t insert);
static void ipif_check_bcast_ires(ipif_t *test_ipif);
static ire_t **ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep);
static boolean_t ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif,
boolean_t isv6);
static void ipif_down_delete_ire(ire_t *ire, char *ipif);
static void ipif_delete_cache_ire(ire_t *, char *);
static int ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp);
static void ipif_free(ipif_t *ipif);
static void ipif_free_tail(ipif_t *ipif);
static void ipif_mtu_change(ire_t *ire, char *ipif_arg);
static void ipif_recreate_interface_routes(ipif_t *old_ipif, ipif_t *ipif);
static void ipif_set_default(ipif_t *ipif);
static int ipif_set_values(queue_t *q, mblk_t *mp,
char *interf_name, uint_t *ppa);
static int ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp,
queue_t *q);
static ipif_t *ipif_lookup_on_name(char *name, size_t namelen,
boolean_t do_alloc, boolean_t *exists, boolean_t isv6, zoneid_t zoneid,
queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *);
static void ipif_update_other_ipifs(ipif_t *old_ipif);
static int ill_alloc_ppa(ill_if_t *, ill_t *);
static int ill_arp_off(ill_t *ill);
static int ill_arp_on(ill_t *ill);
static void ill_delete_interface_type(ill_if_t *);
static int ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q);
static void ill_dl_down(ill_t *ill);
static void ill_down(ill_t *ill);
static void ill_downi(ire_t *ire, char *ill_arg);
static void ill_free_mib(ill_t *ill);
static void ill_glist_delete(ill_t *);
static void ill_phyint_reinit(ill_t *ill);
static void ill_set_nce_router_flags(ill_t *, boolean_t);
static void ill_set_phys_addr_tail(ipsq_t *, queue_t *, mblk_t *, void *);
static ip_v6intfid_func_t ip_ether_v6intfid, ip_ib_v6intfid;
static ip_v6intfid_func_t ip_ipmp_v6intfid, ip_nodef_v6intfid;
static ip_v6mapinfo_func_t ip_ether_v6mapinfo, ip_ib_v6mapinfo;
static ip_v4mapinfo_func_t ip_ether_v4mapinfo, ip_ib_v4mapinfo;
static void ipif_save_ire(ipif_t *, ire_t *);
static void ipif_remove_ire(ipif_t *, ire_t *);
static void ip_cgtp_bcast_add(ire_t *, ire_t *, ip_stack_t *);
static void ip_cgtp_bcast_delete(ire_t *, ip_stack_t *);
static void phyint_free(phyint_t *);
/*
* Per-ill IPsec capabilities management.
*/
static ill_ipsec_capab_t *ill_ipsec_capab_alloc(void);
static void ill_ipsec_capab_free(ill_ipsec_capab_t *);
static void ill_ipsec_capab_add(ill_t *, uint_t, boolean_t);
static void ill_ipsec_capab_delete(ill_t *, uint_t);
static boolean_t ill_ipsec_capab_resize_algparm(ill_ipsec_capab_t *, int);
static void ill_capability_dispatch(ill_t *, mblk_t *, dl_capability_sub_t *,
boolean_t);
static void ill_capability_id_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
static void ill_capability_mdt_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
static void ill_capability_mdt_reset_fill(ill_t *, mblk_t *);
static void ill_capability_ipsec_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
static void ill_capability_ipsec_reset_fill(ill_t *, mblk_t *);
static void ill_capability_hcksum_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
static void ill_capability_hcksum_reset_fill(ill_t *, mblk_t *);
static void ill_capability_zerocopy_ack(ill_t *, mblk_t *,
dl_capability_sub_t *);
static void ill_capability_zerocopy_reset_fill(ill_t *, mblk_t *);
static int ill_capability_ipsec_reset_size(ill_t *, int *, int *, int *,
int *);
static void ill_capability_dld_reset_fill(ill_t *, mblk_t *);
static void ill_capability_dld_ack(ill_t *, mblk_t *,
dl_capability_sub_t *);
static void ill_capability_dld_enable(ill_t *);
static void ill_capability_ack_thr(void *);
static void ill_capability_lso_enable(ill_t *);
static void ill_capability_send(ill_t *, mblk_t *);
static ill_t *ill_prev_usesrc(ill_t *);
static int ill_relink_usesrc_ills(ill_t *, ill_t *, uint_t);
static void ill_disband_usesrc_group(ill_t *);
static void conn_cleanup_stale_ire(conn_t *, caddr_t);
#ifdef DEBUG
static void ill_trace_cleanup(const ill_t *);
static void ipif_trace_cleanup(const ipif_t *);
#endif
/*
* If we go over the memory footprint limit more than once in this msec
* interval, we'll start pruning aggressively.
*/
int ip_min_frag_prune_time = 0;
/*
* max # of IPsec algorithms supported. Limited to 1 byte by PF_KEY
* and the IPsec DOI
*/
#define MAX_IPSEC_ALGS 256
#define BITSPERBYTE 8
#define BITS(type) (BITSPERBYTE * (long)sizeof (type))
#define IPSEC_ALG_ENABLE(algs, algid) \
((algs)[(algid) / BITS(ipsec_capab_elem_t)] |= \
(1 << ((algid) % BITS(ipsec_capab_elem_t))))
#define IPSEC_ALG_IS_ENABLED(algid, algs) \
((algs)[(algid) / BITS(ipsec_capab_elem_t)] & \
(1 << ((algid) % BITS(ipsec_capab_elem_t))))
typedef uint8_t ipsec_capab_elem_t;
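/*
 * Illustrative sketch (not part of the build): with ipsec_capab_elem_t
 * being uint8_t, BITS(ipsec_capab_elem_t) is 8, so enabling a
 * hypothetical algid of 10 sets bit 2 of element 1 of the bitmap:
 *
 *	ipsec_capab_elem_t algs[MAX_IPSEC_ALGS / BITSPERBYTE];
 *
 *	IPSEC_ALG_ENABLE(algs, 10);		(algs[1] |= 0x04)
 *	if (IPSEC_ALG_IS_ENABLED(10, algs))
 *		... algorithm 10 is hardware-accelerated ...
 */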
/*
* Per-algorithm parameters. Note that at present, only encryption
* algorithms have variable keysize (IKE does not provide a way to negotiate
* auth algorithm keysize).
*
* All sizes here are in bits.
*/
typedef struct
{
uint16_t minkeylen;
uint16_t maxkeylen;
} ipsec_capab_algparm_t;
/*
* Per-ill capabilities.
*/
struct ill_ipsec_capab_s {
ipsec_capab_elem_t *encr_hw_algs;
ipsec_capab_elem_t *auth_hw_algs;
uint32_t algs_size; /* size of _hw_algs in bytes */
/* algorithm key lengths */
ipsec_capab_algparm_t *encr_algparm;
uint32_t encr_algparm_size;
uint32_t encr_algparm_end;
};
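/*
 * Sketch of the intended use of these fields (an assumption based on
 * the structure layout, not code from this file): a caller asking
 * whether ESP hardware acceleration applies for a given algid and key
 * length would consult both the bitmap and the key-length bounds:
 *
 *	ill_ipsec_capab_t *cap = ill->ill_ipsec_capab_esp;
 *
 *	if (IPSEC_ALG_IS_ENABLED(algid, cap->encr_hw_algs) &&
 *	    keylen >= cap->encr_algparm[algid].minkeylen &&
 *	    keylen <= cap->encr_algparm[algid].maxkeylen)
 *		... offload the packet ...
 */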
/*
* The field values are larger than strictly necessary for simple
* AR_ENTRY_ADDs, but the padding lets us accommodate the socket ioctls.
*/
static area_t ip_area_template = {
AR_ENTRY_ADD, /* area_cmd */
sizeof (ip_sock_ar_t) + (IP_ADDR_LEN*2) + sizeof (struct sockaddr_dl),
/* area_name_offset */
/* area_name_length temporarily holds this structure length */
sizeof (area_t), /* area_name_length */
IP_ARP_PROTO_TYPE, /* area_proto */
sizeof (ip_sock_ar_t), /* area_proto_addr_offset */
IP_ADDR_LEN, /* area_proto_addr_length */
sizeof (ip_sock_ar_t) + IP_ADDR_LEN,
/* area_proto_mask_offset */
0, /* area_flags */
sizeof (ip_sock_ar_t) + IP_ADDR_LEN + IP_ADDR_LEN,
/* area_hw_addr_offset */
/* Zero length hw_addr_length means 'use your idea of the address' */
0 /* area_hw_addr_length */
};
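/*
 * For reference, the buffer layout implied by the offsets above
 * (ill_arp_alloc() appends the ill name at area_name_offset and
 * rewrites area_name_length with the real name length):
 *
 *	offset 0			ip_sock_ar_t (area_t + ioctl room)
 *	+ sizeof (ip_sock_ar_t)		protocol address (IP_ADDR_LEN)
 *	+ IP_ADDR_LEN			protocol mask (IP_ADDR_LEN)
 *	+ IP_ADDR_LEN			hw addr (sizeof (struct sockaddr_dl))
 *	area_name_offset		ill name
 */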
/*
* AR_ENTRY_ADD/DELETE templates have been added for IPv6 external resolver
* support
*/
static area_t ip6_area_template = {
AR_ENTRY_ADD, /* area_cmd */
sizeof (ip_sock_ar_t) + (IPV6_ADDR_LEN*2) + sizeof (sin6_t),
/* area_name_offset */
/* area_name_length temporarily holds this structure length */
sizeof (area_t), /* area_name_length */
IP_ARP_PROTO_TYPE, /* area_proto */
sizeof (ip_sock_ar_t), /* area_proto_addr_offset */
IPV6_ADDR_LEN, /* area_proto_addr_length */
sizeof (ip_sock_ar_t) + IPV6_ADDR_LEN,
/* area_proto_mask_offset */
0, /* area_flags */
sizeof (ip_sock_ar_t) + IPV6_ADDR_LEN + IPV6_ADDR_LEN,
/* area_hw_addr_offset */
/* Zero length hw_addr_length means 'use your idea of the address' */
0 /* area_hw_addr_length */
};
static ared_t ip_ared_template = {
AR_ENTRY_DELETE,
sizeof (ared_t) + IP_ADDR_LEN,
sizeof (ared_t),
IP_ARP_PROTO_TYPE,
sizeof (ared_t),
IP_ADDR_LEN,
0
};
static ared_t ip6_ared_template = {
AR_ENTRY_DELETE,
sizeof (ared_t) + IPV6_ADDR_LEN,
sizeof (ared_t),
IP_ARP_PROTO_TYPE,
sizeof (ared_t),
IPV6_ADDR_LEN,
0
};
/*
* An IPv6 AR_ENTRY_QUERY template has not been created, as the areq
* doesn't include an IP address in ill_dl_up() (the only place an
* areq is used).
*/
static areq_t ip_areq_template = {
AR_ENTRY_QUERY, /* cmd */
sizeof (areq_t)+(2*IP_ADDR_LEN), /* name offset */
sizeof (areq_t), /* name len (filled by ill_arp_alloc) */
IP_ARP_PROTO_TYPE, /* protocol, from ARP's perspective */
sizeof (areq_t), /* target addr offset */
IP_ADDR_LEN, /* target addr_length */
0, /* flags */
sizeof (areq_t) + IP_ADDR_LEN, /* sender addr offset */
IP_ADDR_LEN, /* sender addr length */
AR_EQ_DEFAULT_XMIT_COUNT, /* xmit_count */
AR_EQ_DEFAULT_XMIT_INTERVAL, /* (re)xmit_interval in milliseconds */
AR_EQ_DEFAULT_MAX_BUFFERED /* max # of requests to buffer */
/* anything else filled in by the code */
};
static arc_t ip_aru_template = {
AR_INTERFACE_UP,
sizeof (arc_t), /* Name offset */
sizeof (arc_t) /* Name length (set by ill_arp_alloc) */
};
static arc_t ip_ard_template = {
AR_INTERFACE_DOWN,
sizeof (arc_t), /* Name offset */
sizeof (arc_t) /* Name length (set by ill_arp_alloc) */
};
static arc_t ip_aron_template = {
AR_INTERFACE_ON,
sizeof (arc_t), /* Name offset */
sizeof (arc_t) /* Name length (set by ill_arp_alloc) */
};
static arc_t ip_aroff_template = {
AR_INTERFACE_OFF,
sizeof (arc_t), /* Name offset */
sizeof (arc_t) /* Name length (set by ill_arp_alloc) */
};
static arma_t ip_arma_multi_template = {
AR_MAPPING_ADD,
sizeof (arma_t) + 3*IP_ADDR_LEN + IP_MAX_HW_LEN,
/* Name offset */
sizeof (arma_t), /* Name length (set by ill_arp_alloc) */
IP_ARP_PROTO_TYPE,
sizeof (arma_t), /* proto_addr_offset */
IP_ADDR_LEN, /* proto_addr_length */
sizeof (arma_t) + IP_ADDR_LEN, /* proto_mask_offset */
sizeof (arma_t) + 2*IP_ADDR_LEN, /* proto_extract_mask_offset */
ACE_F_PERMANENT | ACE_F_MAPPING, /* flags */
sizeof (arma_t) + 3*IP_ADDR_LEN, /* hw_addr_offset */
IP_MAX_HW_LEN, /* hw_addr_length */
0, /* hw_mapping_start */
};
static ipft_t ip_ioctl_ftbl[] = {
{ IP_IOC_IRE_DELETE, ip_ire_delete, sizeof (ipid_t), 0 },
{ IP_IOC_IRE_DELETE_NO_REPLY, ip_ire_delete, sizeof (ipid_t),
IPFT_F_NO_REPLY },
{ IP_IOC_IRE_ADVISE_NO_REPLY, ip_ire_advise, sizeof (ipic_t),
IPFT_F_NO_REPLY },
{ IP_IOC_RTS_REQUEST, ip_rts_request, 0, IPFT_F_SELF_REPLY },
{ 0 }
};
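/*
 * A minimal sketch of how this table is consumed (assumption: the
 * dispatch in ip_wput_ioctl() below scans it roughly like this, with
 * the zeroed entry acting as the sentinel):
 *
 *	ipft_t *ipft;
 *
 *	for (ipft = ip_ioctl_ftbl; ipft->ipft_pfi != NULL; ipft++) {
 *		if (ipft->ipft_cmd == iocp->ioc_cmd)
 *			break;
 *	}
 *	if (ipft->ipft_pfi == NULL)
 *		... unknown ioctl, reject ...
 */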
/* Simple ICMP IP Header Template */
static ipha_t icmp_ipha = {
IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP
};
/* Flag descriptors for ip_ipif_report */
static nv_t ipif_nv_tbl[] = {
{ IPIF_UP, "UP" },
{ IPIF_BROADCAST, "BROADCAST" },
{ ILLF_DEBUG, "DEBUG" },
{ PHYI_LOOPBACK, "LOOPBACK" },
{ IPIF_POINTOPOINT, "POINTOPOINT" },
{ ILLF_NOTRAILERS, "NOTRAILERS" },
{ PHYI_RUNNING, "RUNNING" },
{ ILLF_NOARP, "NOARP" },
{ PHYI_PROMISC, "PROMISC" },
{ PHYI_ALLMULTI, "ALLMULTI" },
{ PHYI_INTELLIGENT, "INTELLIGENT" },
{ ILLF_MULTICAST, "MULTICAST" },
{ PHYI_MULTI_BCAST, "MULTI_BCAST" },
{ IPIF_UNNUMBERED, "UNNUMBERED" },
{ IPIF_DHCPRUNNING, "DHCP" },
{ IPIF_PRIVATE, "PRIVATE" },
{ IPIF_NOXMIT, "NOXMIT" },
{ IPIF_NOLOCAL, "NOLOCAL" },
{ IPIF_DEPRECATED, "DEPRECATED" },
{ IPIF_PREFERRED, "PREFERRED" },
{ IPIF_TEMPORARY, "TEMPORARY" },
{ IPIF_ADDRCONF, "ADDRCONF" },
{ PHYI_VIRTUAL, "VIRTUAL" },
{ ILLF_ROUTER, "ROUTER" },
{ ILLF_NONUD, "NONUD" },
{ IPIF_ANYCAST, "ANYCAST" },
{ ILLF_NORTEXCH, "NORTEXCH" },
{ ILLF_IPV4, "IPV4" },
{ ILLF_IPV6, "IPV6" },
{ IPIF_NOFAILOVER, "NOFAILOVER" },
{ PHYI_FAILED, "FAILED" },
{ PHYI_STANDBY, "STANDBY" },
{ PHYI_INACTIVE, "INACTIVE" },
{ PHYI_OFFLINE, "OFFLINE" },
{ PHYI_IPMP, "IPMP" }
};
static uchar_t ip_six_byte_all_ones[] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
static ip_m_t ip_m_tbl[] = {
{ DL_ETHER, IFT_ETHER, ip_ether_v4mapinfo, ip_ether_v6mapinfo,
ip_ether_v6intfid },
{ DL_CSMACD, IFT_ISO88023, ip_ether_v4mapinfo, ip_ether_v6mapinfo,
ip_nodef_v6intfid },
{ DL_TPB, IFT_ISO88024, ip_ether_v4mapinfo, ip_ether_v6mapinfo,
ip_nodef_v6intfid },
{ DL_TPR, IFT_ISO88025, ip_ether_v4mapinfo, ip_ether_v6mapinfo,
ip_nodef_v6intfid },
{ DL_FDDI, IFT_FDDI, ip_ether_v4mapinfo, ip_ether_v6mapinfo,
ip_ether_v6intfid },
{ DL_IB, IFT_IB, ip_ib_v4mapinfo, ip_ib_v6mapinfo,
ip_ib_v6intfid },
{ SUNW_DL_VNI, IFT_OTHER, NULL, NULL, NULL },
{ SUNW_DL_IPMP, IFT_OTHER, NULL, NULL, ip_ipmp_v6intfid },
{ DL_OTHER, IFT_OTHER, ip_ether_v4mapinfo, ip_ether_v6mapinfo,
ip_nodef_v6intfid }
};
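/*
 * ip_m_lookup() (declared above, defined later in this file) is
 * expected to scan this table for a matching DLPI mac type, e.g.
 * (a sketch; the field name ip_m_mac_type is assumed from ip.h):
 *
 *	ip_m_t *ipm;
 *
 *	for (ipm = ip_m_tbl; ipm < A_END(ip_m_tbl); ipm++)
 *		if (ipm->ip_m_mac_type == mac_type)
 *			return (ipm);
 *	return (NULL);
 */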
static ill_t ill_null; /* Empty ILL for init. */
char ipif_loopback_name[] = "lo0";
static char *ipv4_forward_suffix = ":ip_forwarding";
static char *ipv6_forward_suffix = ":ip6_forwarding";
static sin6_t sin6_null; /* Zero address for quick clears */
static sin_t sin_null; /* Zero address for quick clears */
/* When set search for unused ipif_seqid */
static ipif_t ipif_zero;
/*
* The ppa arena is created after this many
* interfaces have been plumbed.
*/
uint_t ill_no_arena = 12; /* Settable in /etc/system */
/*
* Allocate per-interface mibs.
* Returns B_TRUE if ok, B_FALSE otherwise.
* The ipsq may not yet be allocated (loopback case).
*/
static boolean_t
ill_allocate_mibs(ill_t *ill)
{
/* Already allocated? */
if (ill->ill_ip_mib != NULL) {
if (ill->ill_isv6)
ASSERT(ill->ill_icmp6_mib != NULL);
return (B_TRUE);
}
ill->ill_ip_mib = kmem_zalloc(sizeof (*ill->ill_ip_mib),
KM_NOSLEEP);
if (ill->ill_ip_mib == NULL) {
return (B_FALSE);
}
/* Setup static information */
SET_MIB(ill->ill_ip_mib->ipIfStatsEntrySize,
sizeof (mib2_ipIfStatsEntry_t));
if (ill->ill_isv6) {
ill->ill_ip_mib->ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv6;
SET_MIB(ill->ill_ip_mib->ipIfStatsAddrEntrySize,
sizeof (mib2_ipv6AddrEntry_t));
SET_MIB(ill->ill_ip_mib->ipIfStatsRouteEntrySize,
sizeof (mib2_ipv6RouteEntry_t));
SET_MIB(ill->ill_ip_mib->ipIfStatsNetToMediaEntrySize,
sizeof (mib2_ipv6NetToMediaEntry_t));
SET_MIB(ill->ill_ip_mib->ipIfStatsMemberEntrySize,
sizeof (ipv6_member_t));
SET_MIB(ill->ill_ip_mib->ipIfStatsGroupSourceEntrySize,
sizeof (ipv6_grpsrc_t));
} else {
ill->ill_ip_mib->ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv4;
SET_MIB(ill->ill_ip_mib->ipIfStatsAddrEntrySize,
sizeof (mib2_ipAddrEntry_t));
SET_MIB(ill->ill_ip_mib->ipIfStatsRouteEntrySize,
sizeof (mib2_ipRouteEntry_t));
SET_MIB(ill->ill_ip_mib->ipIfStatsNetToMediaEntrySize,
sizeof (mib2_ipNetToMediaEntry_t));
SET_MIB(ill->ill_ip_mib->ipIfStatsMemberEntrySize,
sizeof (ip_member_t));
SET_MIB(ill->ill_ip_mib->ipIfStatsGroupSourceEntrySize,
sizeof (ip_grpsrc_t));
/*
* For a v4 ill, we are done at this point, because per ill
* icmp mibs are only used for v6.
*/
return (B_TRUE);
}
ill->ill_icmp6_mib = kmem_zalloc(sizeof (*ill->ill_icmp6_mib),
KM_NOSLEEP);
if (ill->ill_icmp6_mib == NULL) {
kmem_free(ill->ill_ip_mib, sizeof (*ill->ill_ip_mib));
ill->ill_ip_mib = NULL;
return (B_FALSE);
}
/* static icmp info */
ill->ill_icmp6_mib->ipv6IfIcmpEntrySize =
sizeof (mib2_ipv6IfIcmpEntry_t);
/*
* The ipIfStatsIfindex and ipv6IfIcmpIndex will be assigned later
* after the phyint merge occurs in ipif_set_values -> ill_glist_insert
* -> ill_phyint_reinit
*/
return (B_TRUE);
}
/*
* Common code for preparation of ARP commands. Two points to remember:
* 1) The ill_name is tacked on at the end of the allocated space, so
* the template's name_offset field must contain the total space
* to allocate less the name length.
*
* 2) The template's name_length field should contain the *template*
* length. We use it as a parameter to bcopy() and then write
* the real ill_name_length into the name_length field of the copy.
* (Always called as writer.)
*/
mblk_t *
ill_arp_alloc(ill_t *ill, const uchar_t *template, caddr_t addr)
{
arc_t *arc = (arc_t *)template;
char *cp;
int len;
mblk_t *mp;
uint_t name_length = ill->ill_name_length;
uint_t template_len = arc->arc_name_length;
len = arc->arc_name_offset + name_length;
mp = allocb(len, BPRI_HI);
if (mp == NULL)
return (NULL);
cp = (char *)mp->b_rptr;
mp->b_wptr = (uchar_t *)&cp[len];
if (template_len)
bcopy(template, cp, template_len);
if (len > template_len)
bzero(&cp[template_len], len - template_len);
mp->b_datap->db_type = M_PROTO;
arc = (arc_t *)cp;
arc->arc_name_length = name_length;
cp = (char *)arc + arc->arc_name_offset;
bcopy(ill->ill_name, cp, name_length);
if (addr) {
area_t *area = (area_t *)mp->b_rptr;
cp = (char *)area + area->area_proto_addr_offset;
bcopy(addr, cp, area->area_proto_addr_length);
if (area->area_cmd == AR_ENTRY_ADD) {
cp = (char *)area;
len = area->area_proto_addr_length;
if (area->area_proto_mask_offset)
cp += area->area_proto_mask_offset;
else
cp += area->area_proto_addr_offset + len;
while (len-- > 0)
*cp++ = (char)~0;
}
}
return (mp);
}
mblk_t *
ipif_area_alloc(ipif_t *ipif, uint_t optflags)
{
caddr_t addr;
mblk_t *mp;
area_t *area;
uchar_t *areap;
ill_t *ill = ipif->ipif_ill;
if (ill->ill_isv6) {
ASSERT(ill->ill_flags & ILLF_XRESOLV);
addr = (caddr_t)&ipif->ipif_v6lcl_addr;
areap = (uchar_t *)&ip6_area_template;
} else {
addr = (caddr_t)&ipif->ipif_lcl_addr;
areap = (uchar_t *)&ip_area_template;
}
if ((mp = ill_arp_alloc(ill, areap, addr)) == NULL)
return (NULL);
/*
* IPMP requires that the hardware address be included in all
* AR_ENTRY_ADD requests so that ARP can deduce the arl to send on.
* If there are no active underlying ills in the group (and thus no
* hardware address), DAD will be deferred until an underlying ill
* becomes active.
*/
if (IS_IPMP(ill)) {
if ((ill = ipmp_ipif_hold_bound_ill(ipif)) == NULL) {
freemsg(mp);
return (NULL);
}
} else {
ill_refhold(ill);
}
area = (area_t *)mp->b_rptr;
area->area_flags = ACE_F_PERMANENT | ACE_F_PUBLISH | ACE_F_MYADDR;
area->area_flags |= optflags;
area->area_hw_addr_length = ill->ill_phys_addr_length;
bcopy(ill->ill_phys_addr, mp->b_rptr + area->area_hw_addr_offset,
area->area_hw_addr_length);
ill_refrele(ill);
return (mp);
}
mblk_t *
ipif_ared_alloc(ipif_t *ipif)
{
caddr_t addr;
uchar_t *aredp;
if (ipif->ipif_ill->ill_isv6) {
ASSERT(ipif->ipif_ill->ill_flags & ILLF_XRESOLV);
addr = (caddr_t)&ipif->ipif_v6lcl_addr;
aredp = (uchar_t *)&ip6_ared_template;
} else {
addr = (caddr_t)&ipif->ipif_lcl_addr;
aredp = (uchar_t *)&ip_ared_template;
}
return (ill_arp_alloc(ipif->ipif_ill, aredp, addr));
}
mblk_t *
ill_ared_alloc(ill_t *ill, ipaddr_t addr)
{
return (ill_arp_alloc(ill, (uchar_t *)&ip_ared_template,
(char *)&addr));
}
mblk_t *
ill_arie_alloc(ill_t *ill, const char *grifname, const void *template)
{
mblk_t *mp = ill_arp_alloc(ill, template, 0);
arie_t *arie;
if (mp != NULL) {
arie = (arie_t *)mp->b_rptr;
(void) strlcpy(arie->arie_grifname, grifname, LIFNAMSIZ);
}
return (mp);
}
/*
* Completely vaporize a lower level tap and all associated interfaces.
* ill_delete is called only out of ip_close when the device control
* stream is being closed.
*/
void
ill_delete(ill_t *ill)
{
ipif_t *ipif;
ill_t *prev_ill;
ip_stack_t *ipst = ill->ill_ipst;
/*
* ill_delete may be forcibly entering the ipsq. The previous
* ioctl may not have completed and may need to be aborted.
* ipsq_flush takes care of it. If we don't need to enter
* the ipsq forcibly, the 2nd invocation of ipsq_flush in
* ill_delete_tail is sufficient.
*/
ipsq_flush(ill);
/*
* Nuke all interfaces. ipif_free will take down the interface,
* remove it from the list, and free the data structure.
* Walk down the ipif list and remove the logical interfaces
* first before removing the main ipif. We can't unplumb
* zeroth interface first in the case of IPv6 as reset_conn_ill
* -> ip_ll_delmulti_v6 de-references ill_ipif for checking
* POINTOPOINT.
*
* If ill_ipif was not properly initialized (i.e., low on memory),
* then there are no interfaces to clean up. In this case just clean up the
* ill.
*/
for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
ipif_free(ipif);
/*
* Used only by ill_arp_on and ill_arp_off, which are writers.
* So nobody can be using this mp now. Free the mp allocated for
* honoring ILLF_NOARP
*/
freemsg(ill->ill_arp_on_mp);
ill->ill_arp_on_mp = NULL;
/* Clean up msgs on pending upcalls for mrouted */
reset_mrt_ill(ill);
/*
* ipif_free -> reset_conn_ipif will remove all multicast
* references for IPv4. For IPv6, we need to do it here as
* it points only at ills.
*/
reset_conn_ill(ill);
/*
* Remove multicast references added as a result of calls to
* ip_join_allmulti().
*/
ip_purge_allmulti(ill);
/*
* If the ill being deleted is under IPMP, boot it out of the illgrp.
*/
if (IS_UNDER_IPMP(ill))
ipmp_ill_leave_illgrp(ill);
/*
* ill_down will arrange to blow off any IRE's dependent on this
* ILL, and shut down fragmentation reassembly.
*/
ill_down(ill);
/* Let SCTP know, so that it can remove this from its list. */
sctp_update_ill(ill, SCTP_ILL_REMOVE);
/*
* If an address on this ILL is being used as a source address then
* clear out the pointers in other ILLs that point to this ILL.
*/
rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER);
if (ill->ill_usesrc_grp_next != NULL) {
if (ill->ill_usesrc_ifindex == 0) { /* usesrc ILL ? */
ill_disband_usesrc_group(ill);
} else { /* consumer of the usesrc ILL */
prev_ill = ill_prev_usesrc(ill);
prev_ill->ill_usesrc_grp_next =
ill->ill_usesrc_grp_next;
}
}
rw_exit(&ipst->ips_ill_g_usesrc_lock);
}
static void
ipif_non_duplicate(ipif_t *ipif)
{
ill_t *ill = ipif->ipif_ill;
mutex_enter(&ill->ill_lock);
if (ipif->ipif_flags & IPIF_DUPLICATE) {
ipif->ipif_flags &= ~IPIF_DUPLICATE;
ASSERT(ill->ill_ipif_dup_count > 0);
ill->ill_ipif_dup_count--;
}
mutex_exit(&ill->ill_lock);
}
/*
* ill_delete_tail is called from ip_modclose after all references
* to the closing ill are gone. The wait is done in ip_modclose
*/
void
ill_delete_tail(ill_t *ill)
{
mblk_t **mpp;
ipif_t *ipif;
ip_stack_t *ipst = ill->ill_ipst;
for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
ipif_non_duplicate(ipif);
ipif_down_tail(ipif);
}
ASSERT(ill->ill_ipif_dup_count == 0 &&
ill->ill_arp_down_mp == NULL &&
ill->ill_arp_del_mapping_mp == NULL);
/*
* If polling capability is enabled (which signifies a direct
* upcall into IP, with the driver holding the ill as a handle),
* we need to make sure that the unbind has completed before we
* let the ill disappear, so that the driver no longer has any
* reference to this ill.
*/
mutex_enter(&ill->ill_lock);
while (ill->ill_state_flags & ILL_DL_UNBIND_IN_PROGRESS)
cv_wait(&ill->ill_cv, &ill->ill_lock);
mutex_exit(&ill->ill_lock);
ASSERT(!(ill->ill_capabilities &
(ILL_CAPAB_DLD | ILL_CAPAB_DLD_POLL | ILL_CAPAB_DLD_DIRECT)));
if (ill->ill_net_type != IRE_LOOPBACK)
qprocsoff(ill->ill_rq);
/*
* We do an ipsq_flush once again now. New messages could have
* arrived from below (M_ERROR or M_HANGUP). Similarly, ioctls could
* also have arrived if an ioctl thread had looked up the ill before
* we set the ILL_CONDEMNED flag, but had not yet enqueued the ioctl
* when we did the ipsq_flush last time.
*/
ipsq_flush(ill);
/*
* Free capabilities.
*/
if (ill->ill_ipsec_capab_ah != NULL) {
ill_ipsec_capab_delete(ill, DL_CAPAB_IPSEC_AH);
ill_ipsec_capab_free(ill->ill_ipsec_capab_ah);
ill->ill_ipsec_capab_ah = NULL;
}
if (ill->ill_ipsec_capab_esp != NULL) {
ill_ipsec_capab_delete(ill, DL_CAPAB_IPSEC_ESP);
ill_ipsec_capab_free(ill->ill_ipsec_capab_esp);
ill->ill_ipsec_capab_esp = NULL;
}
if (ill->ill_mdt_capab != NULL) {
kmem_free(ill->ill_mdt_capab, sizeof (ill_mdt_capab_t));
ill->ill_mdt_capab = NULL;
}
if (ill->ill_hcksum_capab != NULL) {
kmem_free(ill->ill_hcksum_capab, sizeof (ill_hcksum_capab_t));
ill->ill_hcksum_capab = NULL;
}
if (ill->ill_zerocopy_capab != NULL) {
kmem_free(ill->ill_zerocopy_capab,
sizeof (ill_zerocopy_capab_t));
ill->ill_zerocopy_capab = NULL;
}
if (ill->ill_lso_capab != NULL) {
kmem_free(ill->ill_lso_capab, sizeof (ill_lso_capab_t));
ill->ill_lso_capab = NULL;
}
if (ill->ill_dld_capab != NULL) {
kmem_free(ill->ill_dld_capab, sizeof (ill_dld_capab_t));
ill->ill_dld_capab = NULL;
}
while (ill->ill_ipif != NULL)
ipif_free_tail(ill->ill_ipif);
/*
* We have removed all references to ilm from conn and the ones joined
* within the kernel.
*
* We don't walk conns, mrts and ires because
*
* 1) reset_conn_ill and reset_mrt_ill clean up conns and mrts.
* 2) ill_down -> ill_downi walks all the ires and cleans up
* ill references.
*/
ASSERT(ilm_walk_ill(ill) == 0);
/*
* If this ill is an IPMP meta-interface, blow away the illgrp. This
* is safe to do because the illgrp has already been unlinked from the
* group by I_PUNLINK, and thus SIOCSLIFGROUPNAME cannot find it.
*/
if (IS_IPMP(ill)) {
ipmp_illgrp_destroy(ill->ill_grp);
ill->ill_grp = NULL;
}
/*
* Take us out of the list of ILLs. ill_glist_delete -> phyint_free
* could free the phyint. No more reference to the phyint after this
* point.
*/
(void) ill_glist_delete(ill);
rw_enter(&ipst->ips_ip_g_nd_lock, RW_WRITER);
if (ill->ill_ndd_name != NULL)
nd_unload(&ipst->ips_ip_g_nd, ill->ill_ndd_name);
rw_exit(&ipst->ips_ip_g_nd_lock);
if (ill->ill_frag_ptr != NULL) {
uint_t count;
for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) {
mutex_destroy(&ill->ill_frag_hash_tbl[count].ipfb_lock);
}
mi_free(ill->ill_frag_ptr);
ill->ill_frag_ptr = NULL;
ill->ill_frag_hash_tbl = NULL;
}
freemsg(ill->ill_nd_lla_mp);
/* Free all retained control messages. */
mpp = &ill->ill_first_mp_to_free;
do {
while (mpp[0]) {
mblk_t *mp;
mblk_t *mp1;
mp = mpp[0];
mpp[0] = mp->b_next;
for (mp1 = mp; mp1 != NULL; mp1 = mp1->b_cont) {
mp1->b_next = NULL;
mp1->b_prev = NULL;
}
freemsg(mp);
}
} while (mpp++ != &ill->ill_last_mp_to_free);
ill_free_mib(ill);
#ifdef DEBUG
ill_trace_cleanup(ill);
#endif
/* Drop refcnt here */
netstack_rele(ill->ill_ipst->ips_netstack);
ill->ill_ipst = NULL;
}
static void
ill_free_mib(ill_t *ill)
{
ip_stack_t *ipst = ill->ill_ipst;
/*
* MIB statistics must not be lost, so when an interface
* goes away the counter values will be added to the global
* MIBs.
*/
if (ill->ill_ip_mib != NULL) {
if (ill->ill_isv6) {
ip_mib2_add_ip_stats(&ipst->ips_ip6_mib,
ill->ill_ip_mib);
} else {
ip_mib2_add_ip_stats(&ipst->ips_ip_mib,
ill->ill_ip_mib);
}
kmem_free(ill->ill_ip_mib, sizeof (*ill->ill_ip_mib));
ill->ill_ip_mib = NULL;
}
if (ill->ill_icmp6_mib != NULL) {
ip_mib2_add_icmp6_stats(&ipst->ips_icmp6_mib,
ill->ill_icmp6_mib);
kmem_free(ill->ill_icmp6_mib, sizeof (*ill->ill_icmp6_mib));
ill->ill_icmp6_mib = NULL;
}
}
/*
* Concatenate together a physical address and a sap.
*
* Sap_lengths are interpreted as follows:
* sap_length == 0 ==> no sap
* sap_length > 0 ==> sap is at the head of the dlpi address
* sap_length < 0 ==> sap is at the tail of the dlpi address
*/
static void
ill_dlur_copy_address(uchar_t *phys_src, uint_t phys_length,
t_scalar_t sap_src, t_scalar_t sap_length, uchar_t *dst)
{
uint16_t sap_addr = (uint16_t)sap_src;
if (sap_length == 0) {
if (phys_src == NULL)
bzero(dst, phys_length);
else
bcopy(phys_src, dst, phys_length);
} else if (sap_length < 0) {
if (phys_src == NULL)
bzero(dst, phys_length);
else
bcopy(phys_src, dst, phys_length);
bcopy(&sap_addr, (char *)dst + phys_length, sizeof (sap_addr));
} else {
bcopy(&sap_addr, dst, sizeof (sap_addr));
if (phys_src == NULL)
bzero((char *)dst + sap_length, phys_length);
else
bcopy(phys_src, (char *)dst + sap_length, phys_length);
}
}
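/*
 * Worked example (illustrative): Ethernet providers report a 6-byte
 * physical address with the 2-byte sap at the tail, i.e. a sap_length
 * of -2, so a call such as
 *
 *	ill_dlur_copy_address(mac, 6, ETHERTYPE_IP, -2, dst);
 *
 * takes the sap_length < 0 branch above and lays out
 * dst = [ 6-byte MAC ][ 2-byte sap ], 8 bytes in all. (ETHERTYPE_IP
 * is just a plausible sap value for illustration.)
 */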
/*
* Generate a dl_unitdata_req mblk for the device and address given.
* addr_length is the length of the physical portion of the address.
* If addr is NULL, include an all-zero address of the specified length.
* The resulting dlpi address length is addr_length plus the absolute
* value of sap_length.
*/
mblk_t *
ill_dlur_gen(uchar_t *addr, uint_t addr_length, t_uscalar_t sap,
t_scalar_t sap_length)
{
dl_unitdata_req_t *dlur;
mblk_t *mp;
t_scalar_t abs_sap_length; /* absolute value */
abs_sap_length = ABS(sap_length);
mp = ip_dlpi_alloc(sizeof (*dlur) + addr_length + abs_sap_length,
DL_UNITDATA_REQ);
if (mp == NULL)
return (NULL);
dlur = (dl_unitdata_req_t *)mp->b_rptr;
/* HACK: accommodate incompatible DLPI drivers */
if (addr_length == 8)
addr_length = 6;
dlur->dl_dest_addr_length = addr_length + abs_sap_length;
dlur->dl_dest_addr_offset = sizeof (*dlur);
dlur->dl_priority.dl_min = 0;
dlur->dl_priority.dl_max = 0;
ill_dlur_copy_address(addr, addr_length, sap, sap_length,
(uchar_t *)&dlur[1]);
return (mp);
}
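/*
 * Typical usage sketch (hedged; this mirrors how IP builds its cached
 * unitdata template from the values saved on the ill, but is not a
 * quote of a specific call site):
 *
 *	mp = ill_dlur_gen(ill->ill_phys_addr, ill->ill_phys_addr_length,
 *	    ill->ill_sap, ill->ill_sap_length);
 *	if (mp == NULL)
 *		... allocation failed ...
 */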
/*
* Add the 'mp' to the list of pending mp's headed by ill_pending_mp
* Return an error if we already have 1 or more ioctls in progress.
* This is used only for non-exclusive ioctls. Currently this is used
* for SIOC*ARP and SIOCGTUNPARAM ioctls. Most set ioctls are exclusive
* and thus need to use ipsq_pending_mp_add.
*/
boolean_t
ill_pending_mp_add(ill_t *ill, conn_t *connp, mblk_t *add_mp)
{
ASSERT(MUTEX_HELD(&ill->ill_lock));
ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL));
/*
* M_IOCDATA from ioctls, M_IOCTL from tunnel ioctls.
*/
ASSERT((add_mp->b_datap->db_type == M_IOCDATA) ||
(add_mp->b_datap->db_type == M_IOCTL));
ASSERT(MUTEX_HELD(&connp->conn_lock));
/*
* Return error if the conn has started closing. The conn
* could have finished cleaning up the pending mp list.
* If so, we should not add another mp to the list, negating
* the cleanup.
*/
if (connp->conn_state_flags & CONN_CLOSING)
return (B_FALSE);
/*
* Add the pending mp to the head of the list, chained by b_next.
* Note down the conn's write queue, on which the ioctl came, in b_queue.
* This will be used to later get the conn, when we get a response
* on the ill queue, from some other module (typically arp).
*/
add_mp->b_next = (void *)ill->ill_pending_mp;
add_mp->b_queue = CONNP_TO_WQ(connp);
ill->ill_pending_mp = add_mp;
if (connp != NULL)
connp->conn_oper_pending_ill = ill;
return (B_TRUE);
}
/*
* Retrieve the ill_pending_mp and return it. We have to walk the list
* of mblks starting at ill_pending_mp, and match based on the ioc_id.
*/
mblk_t *
ill_pending_mp_get(ill_t *ill, conn_t **connpp, uint_t ioc_id)
{
mblk_t *prev = NULL;
mblk_t *curr = NULL;
uint_t id;
conn_t *connp;
/*
* When the conn closes, conn_ioctl_cleanup needs to clean
* up the pending mp, but it does not know the ioc_id and
* passes in a zero for it.
*/
mutex_enter(&ill->ill_lock);
if (ioc_id != 0)
*connpp = NULL;
/* Search the list for the appropriate ioctl based on ioc_id */
for (prev = NULL, curr = ill->ill_pending_mp; curr != NULL;
prev = curr, curr = curr->b_next) {
id = ((struct iocblk *)curr->b_rptr)->ioc_id;
connp = Q_TO_CONN(curr->b_queue);
/* Match based on the ioc_id or based on the conn */
if ((id == ioc_id) || (ioc_id == 0 && connp == *connpp))
break;
}
if (curr != NULL) {
/* Unlink the mblk from the pending mp list */
if (prev != NULL) {
prev->b_next = curr->b_next;
} else {
ASSERT(ill->ill_pending_mp == curr);
ill->ill_pending_mp = curr->b_next;
}
/*
* conn refcnt must have been bumped up at the start of
* the ioctl. So we can safely access the conn.
*/
ASSERT(CONN_Q(curr->b_queue));
*connpp = Q_TO_CONN(curr->b_queue);
curr->b_next = NULL;
curr->b_queue = NULL;
}
mutex_exit(&ill->ill_lock);
return (curr);
}
/*
* Add the pending mp to the list. There can be only 1 pending mp
* in the list. Any exclusive ioctl that needs to wait for a response
* from another module or driver needs to use this function to set
* the ipx_pending_mp to the ioctl mblk and wait for the response from
* the other module/driver. This is also used while waiting for the
* ipif/ill/ire refcnts to drop to zero in bringing down an ipif.
*/
boolean_t
ipsq_pending_mp_add(conn_t *connp, ipif_t *ipif, queue_t *q, mblk_t *add_mp,
int waitfor)
{
ipxop_t *ipx = ipif->ipif_ill->ill_phyint->phyint_ipsq->ipsq_xop;
ASSERT(IAM_WRITER_IPIF(ipif));
ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL));
ASSERT(ipx->ipx_pending_mp == NULL);
/*
* The caller may be using a different ipif than the one passed into
* ipsq_current_start() (e.g., suppose an ioctl that came in on the V4
* ill needs to wait for the V6 ill to quiesce). So we can't ASSERT
* that `ipx_current_ipif == ipif'.
*/
ASSERT(ipx->ipx_current_ipif != NULL);
/*
* M_IOCDATA from ioctls, M_IOCTL from tunnel ioctls,
* M_ERROR/M_HANGUP/M_PROTO/M_PCPROTO from the driver.
*/
ASSERT((DB_TYPE(add_mp) == M_IOCDATA) || (DB_TYPE(add_mp) == M_IOCTL) ||
(DB_TYPE(add_mp) == M_ERROR) || (DB_TYPE(add_mp) == M_HANGUP) ||
(DB_TYPE(add_mp) == M_PROTO) || (DB_TYPE(add_mp) == M_PCPROTO));
if (connp != NULL) {
ASSERT(MUTEX_HELD(&connp->conn_lock));
/*
* Return error if the conn has started closing. The conn
* could have finished cleaning up the pending mp list.
* If so, we should not add another mp to the list, negating
* the cleanup.
*/
if (connp->conn_state_flags & CONN_CLOSING)
return (B_FALSE);
}
mutex_enter(&ipx->ipx_lock);
ipx->ipx_pending_ipif = ipif;
/*
* Note down the queue in b_queue. This will be returned by
* ipsq_pending_mp_get. Caller will then use these values to restart
* the processing
*/
add_mp->b_next = NULL;
add_mp->b_queue = q;
ipx->ipx_pending_mp = add_mp;
ipx->ipx_waitfor = waitfor;
mutex_exit(&ipx->ipx_lock);
if (connp != NULL)
connp->conn_oper_pending_ill = ipif->ipif_ill;
return (B_TRUE);
}
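/*
 * Usage sketch for the ipsq_pending_mp_add()/ipsq_pending_mp_get()
 * pair (an assumption based on the contract above, not a specific
 * call site): an exclusive ioctl that must wait for a response parks
 * its mblk, and the response path recovers it:
 *
 *	mutex_enter(&ill->ill_lock);
 *	mutex_enter(&connp->conn_lock);
 *	if (!ipsq_pending_mp_add(connp, ipif, q, mp, 0))
 *		... conn is closing, abort the ioctl ...
 *	mutex_exit(&connp->conn_lock);
 *	mutex_exit(&ill->ill_lock);
 *
 *	... later, when the driver/module response arrives ...
 *	mp = ipsq_pending_mp_get(ipsq, &connp);
 */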
/*
* Retrieve the ipx_pending_mp and return it. There can be only 1 mp
* queued in the list.
*/
mblk_t *
ipsq_pending_mp_get(ipsq_t *ipsq, conn_t **connpp)
{
mblk_t *curr = NULL;
ipxop_t *ipx = ipsq->ipsq_xop;
*connpp = NULL;
mutex_enter(&ipx->ipx_lock);
if (ipx->ipx_pending_mp == NULL) {
mutex_exit(&ipx->ipx_lock);
return (NULL);
}
/* There can be only 1 such excl message */
curr = ipx->ipx_pending_mp;
ASSERT(curr->b_next == NULL);
ipx->ipx_pending_ipif = NULL;
ipx->ipx_pending_mp = NULL;
ipx->ipx_waitfor = 0;
mutex_exit(&ipx->ipx_lock);
if (CONN_Q(curr->b_queue)) {
/*
* This mp did a refhold on the conn, at the start of the ioctl.
* So we can safely return a pointer to the conn to the caller.
*/
*connpp = Q_TO_CONN(curr->b_queue);
} else {
*connpp = NULL;
}
curr->b_next = NULL;
curr->b_prev = NULL;
return (curr);
}
/*
* Cleanup the ioctl mp queued in ipx_pending_mp
* - Called in the ill_delete path
* - Called in the M_ERROR or M_HANGUP path on the ill.
* - Called in the conn close path.
*/
boolean_t
ipsq_pending_mp_cleanup(ill_t *ill, conn_t *connp)
{
mblk_t *mp;
ipxop_t *ipx;
queue_t *q;
ipif_t *ipif;
ASSERT(IAM_WRITER_ILL(ill));
ipx = ill->ill_phyint->phyint_ipsq->ipsq_xop;
/*
* If connp is null, unconditionally clean up the ipx_pending_mp.
* This happens in M_ERROR/M_HANGUP. We need to abort the current ioctl
* even if it is meant for another ill, since we have to enqueue
* a new mp now in ipx_pending_mp to complete the ipif_down.
* If connp is non-null we are called from the conn close path.
*/
mutex_enter(&ipx->ipx_lock);
mp = ipx->ipx_pending_mp;
if (mp == NULL || (connp != NULL &&
mp->b_queue != CONNP_TO_WQ(connp))) {
mutex_exit(&ipx->ipx_lock);
return (B_FALSE);
}
/* Now remove from the ipx_pending_mp */
ipx->ipx_pending_mp = NULL;
q = mp->b_queue;
mp->b_next = NULL;
mp->b_prev = NULL;
mp->b_queue = NULL;
ipif = ipx->ipx_pending_ipif;
ipx->ipx_pending_ipif = NULL;
ipx->ipx_waitfor = 0;
ipx->ipx_current_ipif = NULL;
ipx->ipx_current_ioctl = 0;
ipx->ipx_current_done = B_TRUE;
mutex_exit(&ipx->ipx_lock);
if (DB_TYPE(mp) == M_IOCTL || DB_TYPE(mp) == M_IOCDATA) {
if (connp == NULL) {
ip_ioctl_finish(q, mp, ENXIO, NO_COPYOUT, NULL);
} else {
ip_ioctl_finish(q, mp, ENXIO, CONN_CLOSE, NULL);
mutex_enter(&ipif->ipif_ill->ill_lock);
ipif->ipif_state_flags &= ~IPIF_CHANGING;
mutex_exit(&ipif->ipif_ill->ill_lock);
}
} else {
/*
* IP-MT XXX In the case of TLI/XTI bind / optmgmt this can't
* be just inet_freemsg. We have to restart it;
* otherwise the thread will be stuck.
*/
inet_freemsg(mp);
}
return (B_TRUE);
}
/*
* The ill is closing. Clean up all the pending mps. Called exclusively
* towards the end of ill_delete. The refcount has gone to 0, so nobody
* knows this ill, and hence nobody can add an mp to this list.
*/
static void
ill_pending_mp_cleanup(ill_t *ill)
{
mblk_t *mp;
queue_t *q;
ASSERT(IAM_WRITER_ILL(ill));
mutex_enter(&ill->ill_lock);
/*
* Every mp on the pending mp list originating from an ioctl
* added 1 to the conn refcnt, at the start of the ioctl.
* So bump it down now. See comments in ip_wput_nondata()
*/
while (ill->ill_pending_mp != NULL) {
mp = ill->ill_pending_mp;
ill->ill_pending_mp = mp->b_next;
mutex_exit(&ill->ill_lock);
q = mp->b_queue;
ASSERT(CONN_Q(q));
mp->b_next = NULL;
mp->b_prev = NULL;
mp->b_queue = NULL;
ip_ioctl_finish(q, mp, ENXIO, NO_COPYOUT, NULL);
mutex_enter(&ill->ill_lock);
}
ill->ill_pending_ipif = NULL;
mutex_exit(&ill->ill_lock);
}
/*
* Called in the conn close path and ill delete path
*/
static void
ipsq_xopq_mp_cleanup(ill_t *ill, conn_t *connp)
{
ipsq_t *ipsq;
mblk_t *prev;
mblk_t *curr;
mblk_t *next;
queue_t *q;
mblk_t *tmp_list = NULL;
ASSERT(IAM_WRITER_ILL(ill));
if (connp != NULL)
q = CONNP_TO_WQ(connp);
else
q = ill->ill_wq;
ipsq = ill->ill_phyint->phyint_ipsq;
/*
* Clean up the ioctl mp's queued in ipsq_xopq_pending_mp, if any.
* In the case of an ioctl from a conn, there can be only 1 mp
* queued on the ipsq. If an ill is being unplumbed, only messages
* related to this ill are flushed, like M_ERROR or M_HANGUP messages.
* ioctls meant for this ill from conns are not flushed. They will
* be processed during ipsq_exit, will not find the ill, and will
* return an error.
*/
mutex_enter(&ipsq->ipsq_lock);
for (prev = NULL, curr = ipsq->ipsq_xopq_mphead; curr != NULL;
curr = next) {
next = curr->b_next;
if (curr->b_queue == q || curr->b_queue == RD(q)) {
/* Unlink the mblk from the pending mp list */
if (prev != NULL) {
prev->b_next = curr->b_next;
} else {
ASSERT(ipsq->ipsq_xopq_mphead == curr);
ipsq->ipsq_xopq_mphead = curr->b_next;
}
if (ipsq->ipsq_xopq_mptail == curr)
ipsq->ipsq_xopq_mptail = prev;
/*
* Create a temporary list and release the ipsq lock.
* New elements are added to the head of the tmp_list.
*/
curr->b_next = tmp_list;
tmp_list = curr;
} else {
prev = curr;
}
}
mutex_exit(&ipsq->ipsq_lock);
while (tmp_list != NULL) {
curr = tmp_list;
tmp_list = curr->b_next;
curr->b_next = NULL;
curr->b_prev = NULL;
curr->b_queue = NULL;
if (DB_TYPE(curr) == M_IOCTL || DB_TYPE(curr) == M_IOCDATA) {
ip_ioctl_finish(q, curr, ENXIO, connp != NULL ?
CONN_CLOSE : NO_COPYOUT, NULL);
} else {
/*
* IP-MT XXX In the case of TLI/XTI bind / optmgmt
* this can't be just inet_freemsg. We have to
* restart it; otherwise the thread will be stuck.
*/
inet_freemsg(curr);
}
}
}
/*
* This conn has started closing. Cleanup any pending ioctl from this conn.
* STREAMS ensures that there can be at most 1 ioctl pending on a stream.
*/
void
conn_ioctl_cleanup(conn_t *connp)
{
mblk_t *curr;
ipsq_t *ipsq;
ill_t *ill;
boolean_t refheld;
/*
* Is any exclusive ioctl pending? If so, clean it up. If the
* ioctl has not yet started, the mp is pending in the list headed by
* ipsq_xopq_head. If the ioctl has started, the mp could be present in
* ipx_pending_mp. If the ioctl timed out in the streamhead but
* is currently executing, the mp is not queued anywhere and
* conn_oper_pending_ill is null. The conn close will wait
* till the conn_ref drops to zero.
*/
mutex_enter(&connp->conn_lock);
ill = connp->conn_oper_pending_ill;
if (ill == NULL) {
mutex_exit(&connp->conn_lock);
return;
}
curr = ill_pending_mp_get(ill, &connp, 0);
if (curr != NULL) {
mutex_exit(&connp->conn_lock);
CONN_DEC_REF(connp);
inet_freemsg(curr);
return;
}
/*
* We may not be able to refhold the ill if the ill/ipif
* is changing. But we need to make sure that the ill will
* not vanish. So we just bump up the ill_waiter count.
*/
refheld = ill_waiter_inc(ill);
mutex_exit(&connp->conn_lock);
if (refheld) {
if (ipsq_enter(ill, B_TRUE, NEW_OP)) {
ill_waiter_dcr(ill);
/*
* Check whether this ioctl has started and is
* pending. If it is not found there then check
* whether this ioctl has not even started and is in
* the ipsq_xopq list.
*/
if (!ipsq_pending_mp_cleanup(ill, connp))
ipsq_xopq_mp_cleanup(ill, connp);
ipsq = ill->ill_phyint->phyint_ipsq;
ipsq_exit(ipsq);
return;
}
}
/*
* The ill is also closing and we could not bump up the
* ill_waiter_count or we could not enter the ipsq. Leave
* the cleanup to ill_delete
*/
mutex_enter(&connp->conn_lock);
while (connp->conn_oper_pending_ill != NULL)
cv_wait(&connp->conn_refcv, &connp->conn_lock);
mutex_exit(&connp->conn_lock);
if (refheld)
ill_waiter_dcr(ill);
}
/*
* ipcl_walk function for cleaning up conn_*_ill fields.
*/
static void
conn_cleanup_ill(conn_t *connp, caddr_t arg)
{
ill_t *ill = (ill_t *)arg;
ire_t *ire;
mutex_enter(&connp->conn_lock);
if (connp->conn_multicast_ill == ill) {
/* Revert to late binding */
connp->conn_multicast_ill = NULL;
}
if (connp->conn_incoming_ill == ill)
connp->conn_incoming_ill = NULL;
if (connp->conn_outgoing_ill == ill)
connp->conn_outgoing_ill = NULL;
if (connp->conn_dhcpinit_ill == ill) {
connp->conn_dhcpinit_ill = NULL;
ASSERT(ill->ill_dhcpinit != 0);
atomic_dec_32(&ill->ill_dhcpinit);
}
if (connp->conn_ire_cache != NULL) {
ire = connp->conn_ire_cache;
/*
* Source address selection makes it possible for IRE_CACHE
* entries to be created with ire_stq coming from interface X
* and ipif coming from interface Y. Thus whenever interface
* X goes down, remove all references to it by checking both
* on ire_ipif and ire_stq.
*/
if ((ire->ire_ipif != NULL && ire->ire_ipif->ipif_ill == ill) ||
(ire->ire_type == IRE_CACHE &&
ire->ire_stq == ill->ill_wq)) {
connp->conn_ire_cache = NULL;
mutex_exit(&connp->conn_lock);
ire_refrele_notr(ire);
return;
}
}
mutex_exit(&connp->conn_lock);
}
/* ARGSUSED */
void
ipif_all_down_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
{
ill_t *ill = q->q_ptr;
ipif_t *ipif;
ASSERT(IAM_WRITER_IPSQ(ipsq));
for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
ipif_non_duplicate(ipif);
ipif_down_tail(ipif);
}
freemsg(mp);
ipsq_current_finish(ipsq);
}
/*
* ill_down_start is called when we want to down this ill and bring it up again.
* It is called when we receive an M_ERROR / M_HANGUP. In this case we shut down
* all interfaces, but don't tear down any plumbing.
*/
boolean_t
ill_down_start(queue_t *q, mblk_t *mp)
{
ill_t *ill = q->q_ptr;
ipif_t *ipif;
ASSERT(IAM_WRITER_ILL(ill));
for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
(void) ipif_down(ipif, NULL, NULL);
ill_down(ill);
(void) ipsq_pending_mp_cleanup(ill, NULL);
ipsq_current_start(ill->ill_phyint->phyint_ipsq, ill->ill_ipif, 0);
/*
* Atomically test and add the pending mp if references are active.
*/
mutex_enter(&ill->ill_lock);
if (!ill_is_quiescent(ill)) {
/* call cannot fail since `conn_t *' argument is NULL */
(void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq,
mp, ILL_DOWN);
mutex_exit(&ill->ill_lock);
return (B_FALSE);
}
mutex_exit(&ill->ill_lock);
return (B_TRUE);
}
static void
ill_down(ill_t *ill)
{
ip_stack_t *ipst = ill->ill_ipst;
/* Blow off any IREs dependent on this ILL. */
ire_walk(ill_downi, ill, ipst);
/* Remove any conn_*_ill depending on this ill */
ipcl_walk(conn_cleanup_ill, (caddr_t)ill, ipst);
}
/*
* ire_walk routine used to delete every IRE that depends on queues
* associated with 'ill'. (Always called as writer.)
*/
static void
ill_downi(ire_t *ire, char *ill_arg)
{
ill_t *ill = (ill_t *)ill_arg;
/*
* Source address selection makes it possible for IRE_CACHE
* entries to be created with ire_stq coming from interface X
* and ipif coming from interface Y. Thus whenever interface
* X goes down, remove all references to it by checking both
* on ire_ipif and ire_stq.
*/
if ((ire->ire_ipif != NULL && ire->ire_ipif->ipif_ill == ill) ||
(ire->ire_type == IRE_CACHE && ire->ire_stq == ill->ill_wq)) {
ire_delete(ire);
}
}
/*
* Remove ire/nce from the fastpath list.
*/
void
ill_fastpath_nack(ill_t *ill)
{
nce_fastpath_list_dispatch(ill, NULL, NULL);
}
/* Consume an M_IOCACK of the fastpath probe. */
void
ill_fastpath_ack(ill_t *ill, mblk_t *mp)
{
mblk_t *mp1 = mp;
/*
* If this was the first attempt, turn on fastpath probing.
*/
mutex_enter(&ill->ill_lock);
if (ill->ill_dlpi_fastpath_state == IDS_INPROGRESS)
ill->ill_dlpi_fastpath_state = IDS_OK;
mutex_exit(&ill->ill_lock);
/* Free the M_IOCACK mblk, hold on to the data */
mp = mp->b_cont;
freeb(mp1);
if (mp == NULL)
return;
if (mp->b_cont != NULL) {
/*
* Update all IRE's or NCE's that are waiting for
* fastpath update.
*/
nce_fastpath_list_dispatch(ill, ndp_fastpath_update, mp);
mp1 = mp->b_cont;
freeb(mp);
mp = mp1;
} else {
ip0dbg(("ill_fastpath_ack: no b_cont\n"));
}
freeb(mp);
}
/*
* Throw an M_IOCTL message downstream asking "do you know fastpath?"
* The data portion of the request is a dl_unitdata_req_t template for
* what we would send downstream in the absence of a fastpath confirmation.
*/
int
ill_fastpath_probe(ill_t *ill, mblk_t *dlur_mp)
{
struct iocblk *ioc;
mblk_t *mp;
if (dlur_mp == NULL)
return (EINVAL);
mutex_enter(&ill->ill_lock);
switch (ill->ill_dlpi_fastpath_state) {
case IDS_FAILED:
/*
* Driver NAKed the first fastpath ioctl - assume it doesn't
* support it.
*/
mutex_exit(&ill->ill_lock);
return (ENOTSUP);
case IDS_UNKNOWN:
/* This is the first probe */
ill->ill_dlpi_fastpath_state = IDS_INPROGRESS;
break;
default:
break;
}
mutex_exit(&ill->ill_lock);
if ((mp = mkiocb(DL_IOC_HDR_INFO)) == NULL)
return (EAGAIN);
mp->b_cont = copyb(dlur_mp);
if (mp->b_cont == NULL) {
freeb(mp);
return (EAGAIN);
}
ioc = (struct iocblk *)mp->b_rptr;
ioc->ioc_count = msgdsize(mp->b_cont);
putnext(ill->ill_wq, mp);
return (0);
}
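/*
 * The probe built above is a two-mblk chain:
 *
 *	[ M_IOCTL: iocblk, ioc_cmd = DL_IOC_HDR_INFO ]
 *		-> b_cont: [ copy of the dl_unitdata_req template ]
 *
 * A driver that supports fastpath M_IOCACKs this with an additional
 * mblk carrying the prebuilt link-layer header; ill_fastpath_ack()
 * above consumes that ack.
 */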
void
ill_capability_probe(ill_t *ill)
{
mblk_t *mp;
ASSERT(IAM_WRITER_ILL(ill));
if (ill->ill_dlpi_capab_state != IDCS_UNKNOWN &&
ill->ill_dlpi_capab_state != IDCS_FAILED)
return;
/*
* We are starting a new cycle of capability negotiation.
* Free up the capab reset messages of any previous incarnation.
* We will do a fresh allocation when we get the response to our probe
*/
if (ill->ill_capab_reset_mp != NULL) {
freemsg(ill->ill_capab_reset_mp);
ill->ill_capab_reset_mp = NULL;
}
ip1dbg(("ill_capability_probe: starting capability negotiation\n"));
mp = ip_dlpi_alloc(sizeof (dl_capability_req_t), DL_CAPABILITY_REQ);
if (mp == NULL)
return;
ill_capability_send(ill, mp);
ill->ill_dlpi_capab_state = IDCS_PROBE_SENT;
}
void
ill_capability_reset(ill_t *ill, boolean_t reneg)
{
ASSERT(IAM_WRITER_ILL(ill));
if (ill->ill_dlpi_capab_state != IDCS_OK)
return;
ill->ill_dlpi_capab_state = reneg ? IDCS_RENEG : IDCS_RESET_SENT;
ill_capability_send(ill, ill->ill_capab_reset_mp);
ill->ill_capab_reset_mp = NULL;
/*
* We turn off all capabilities except the direct function call
* capabilities (viz. ILL_CAPAB_DLD*), which will be turned off by
* the corresponding reset functions.
*/
ill->ill_capabilities &= ~(ILL_CAPAB_MDT | ILL_CAPAB_HCKSUM |
ILL_CAPAB_ZEROCOPY | ILL_CAPAB_AH | ILL_CAPAB_ESP);
}
static void
ill_capability_reset_alloc(ill_t *ill)
{
mblk_t *mp;
size_t size = 0;
int err;
dl_capability_req_t *capb;
ASSERT(IAM_WRITER_ILL(ill));
ASSERT(ill->ill_capab_reset_mp == NULL);
if (ILL_MDT_CAPABLE(ill))
size += sizeof (dl_capability_sub_t) + sizeof (dl_capab_mdt_t);
if (ILL_HCKSUM_CAPABLE(ill)) {
size += sizeof (dl_capability_sub_t) +
sizeof (dl_capab_hcksum_t);
}
if (ill->ill_capabilities & ILL_CAPAB_ZEROCOPY) {
size += sizeof (dl_capability_sub_t) +
sizeof (dl_capab_zerocopy_t);
}
if (ill->ill_capabilities & (ILL_CAPAB_AH | ILL_CAPAB_ESP)) {
size += sizeof (dl_capability_sub_t);
size += ill_capability_ipsec_reset_size(ill, NULL, NULL,
NULL, NULL);
}
if (ill->ill_capabilities & ILL_CAPAB_DLD) {
size += sizeof (dl_capability_sub_t) +
sizeof (dl_capab_dld_t);
}
mp = allocb_wait(size + sizeof (dl_capability_req_t), BPRI_MED,
STR_NOSIG, &err);
mp->b_datap->db_type = M_PROTO;
bzero(mp->b_rptr, size + sizeof (dl_capability_req_t));
capb = (dl_capability_req_t *)mp->b_rptr;
capb->dl_primitive = DL_CAPABILITY_REQ;
capb->dl_sub_offset = sizeof (dl_capability_req_t);
capb->dl_sub_length = size;
mp->b_wptr += sizeof (dl_capability_req_t);
/*
* Each handler fills in the corresponding dl_capability_sub_t
* inside the mblk.
*/
ill_capability_mdt_reset_fill(ill, mp);
ill_capability_hcksum_reset_fill(ill, mp);
ill_capability_zerocopy_reset_fill(ill, mp);
ill_capability_ipsec_reset_fill(ill, mp);
ill_capability_dld_reset_fill(ill, mp);
ill->ill_capab_reset_mp = mp;
}
static void
ill_capability_id_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *outers)
{
dl_capab_id_t *id_ic;
uint_t sub_dl_cap = outers->dl_cap;
dl_capability_sub_t *inners;
uint8_t *capend;
ASSERT(sub_dl_cap == DL_CAPAB_ID_WRAPPER);
/*
* Note: range checks here are not absolutely sufficient to
* make us robust against malformed messages sent by drivers;
* this is in keeping with the rest of IP's dlpi handling.
* (Remember, it's coming from something else in the kernel
* address space)
*/
capend = (uint8_t *)(outers + 1) + outers->dl_length;
if (capend > mp->b_wptr) {
cmn_err(CE_WARN, "ill_capability_id_ack: "
"malformed sub-capability too long for mblk");
return;
}
id_ic = (dl_capab_id_t *)(outers + 1);
if (outers->dl_length < sizeof (*id_ic) ||
(inners = &id_ic->id_subcap,
inners->dl_length > (outers->dl_length - sizeof (*inners)))) {
cmn_err(CE_WARN, "ill_capability_id_ack: malformed "
"encapsulated capab type %d too long for mblk",
inners->dl_cap);
return;
}
if (!dlcapabcheckqid(&id_ic->id_mid, ill->ill_lmod_rq)) {
ip1dbg(("ill_capability_id_ack: mid token for capab type %d "
"isn't as expected; pass-thru module(s) detected, "
"discarding capability\n", inners->dl_cap));
return;
}
/* Process the encapsulated sub-capability */
ill_capability_dispatch(ill, mp, inners, B_TRUE);
}
/*
* Process Multidata Transmit capability negotiation ack received from a
* DLS Provider. isub must point to the sub-capability (DL_CAPAB_MDT) of a
* DL_CAPABILITY_ACK message.
*/
static void
ill_capability_mdt_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
{
mblk_t *nmp = NULL;
dl_capability_req_t *oc;
dl_capab_mdt_t *mdt_ic, *mdt_oc;
ill_mdt_capab_t **ill_mdt_capab;
uint_t sub_dl_cap = isub->dl_cap;
uint8_t *capend;
ASSERT(sub_dl_cap == DL_CAPAB_MDT);
ill_mdt_capab = (ill_mdt_capab_t **)&ill->ill_mdt_capab;
/*
* Note: range checks here are not absolutely sufficient to
* make us robust against malformed messages sent by drivers;
* this is in keeping with the rest of IP's dlpi handling.
* (Remember, it's coming from something else in the kernel
* address space)
*/
capend = (uint8_t *)(isub + 1) + isub->dl_length;
if (capend > mp->b_wptr) {
cmn_err(CE_WARN, "ill_capability_mdt_ack: "
"malformed sub-capability too long for mblk");
return;
}
mdt_ic = (dl_capab_mdt_t *)(isub + 1);
if (mdt_ic->mdt_version != MDT_VERSION_2) {
cmn_err(CE_CONT, "ill_capability_mdt_ack: "
"unsupported MDT sub-capability (version %d, expected %d)",
mdt_ic->mdt_version, MDT_VERSION_2);
return;
}
if (!dlcapabcheckqid(&mdt_ic->mdt_mid, ill->ill_lmod_rq)) {
ip1dbg(("ill_capability_mdt_ack: mid token for MDT "
"capability isn't as expected; pass-thru module(s) "
"detected, discarding capability\n"));
return;
}
if (mdt_ic->mdt_flags & DL_CAPAB_MDT_ENABLE) {
if (*ill_mdt_capab == NULL) {
*ill_mdt_capab = kmem_zalloc(sizeof (ill_mdt_capab_t),
KM_NOSLEEP);
if (*ill_mdt_capab == NULL) {
cmn_err(CE_WARN, "ill_capability_mdt_ack: "
"could not enable MDT version %d "
"for %s (ENOMEM)\n", MDT_VERSION_2,
ill->ill_name);
return;
}
}
ip1dbg(("ill_capability_mdt_ack: interface %s supports "
"MDT version %d (%d bytes leading, %d bytes trailing "
"header spaces, %d max pld bufs, %d span limit)\n",
ill->ill_name, MDT_VERSION_2,
mdt_ic->mdt_hdr_head, mdt_ic->mdt_hdr_tail,
mdt_ic->mdt_max_pld, mdt_ic->mdt_span_limit));
(*ill_mdt_capab)->ill_mdt_version = MDT_VERSION_2;
(*ill_mdt_capab)->ill_mdt_on = 1;
/*
* Round the following values to the nearest 32-bit boundary; the ULP
* may further adjust them to accommodate additional
* protocol headers. We pass these values to the ULP during
* bind time.
*/
(*ill_mdt_capab)->ill_mdt_hdr_head =
roundup(mdt_ic->mdt_hdr_head, 4);
(*ill_mdt_capab)->ill_mdt_hdr_tail =
roundup(mdt_ic->mdt_hdr_tail, 4);
(*ill_mdt_capab)->ill_mdt_max_pld = mdt_ic->mdt_max_pld;
(*ill_mdt_capab)->ill_mdt_span_limit = mdt_ic->mdt_span_limit;
ill->ill_capabilities |= ILL_CAPAB_MDT;
} else {
uint_t size;
uchar_t *rptr;
size = sizeof (dl_capability_req_t) +
sizeof (dl_capability_sub_t) + sizeof (dl_capab_mdt_t);
if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) {
cmn_err(CE_WARN, "ill_capability_mdt_ack: "
"could not enable MDT for %s (ENOMEM)\n",
ill->ill_name);
return;
}
rptr = nmp->b_rptr;
/* initialize dl_capability_req_t */
oc = (dl_capability_req_t *)nmp->b_rptr;
oc->dl_sub_offset = sizeof (dl_capability_req_t);
oc->dl_sub_length = sizeof (dl_capability_sub_t) +
sizeof (dl_capab_mdt_t);
nmp->b_rptr += sizeof (dl_capability_req_t);
/* initialize dl_capability_sub_t */
bcopy(isub, nmp->b_rptr, sizeof (*isub));
nmp->b_rptr += sizeof (*isub);
/* initialize dl_capab_mdt_t */
mdt_oc = (dl_capab_mdt_t *)nmp->b_rptr;
bcopy(mdt_ic, mdt_oc, sizeof (*mdt_ic));
nmp->b_rptr = rptr;
ip1dbg(("ill_capability_mdt_ack: asking interface %s "
"to enable MDT version %d\n", ill->ill_name,
MDT_VERSION_2));
/* set ENABLE flag */
mdt_oc->mdt_flags |= DL_CAPAB_MDT_ENABLE;
/* nmp points to a DL_CAPABILITY_REQ message to enable MDT */
ill_capability_send(ill, nmp);
}
}
static void
ill_capability_mdt_reset_fill(ill_t *ill, mblk_t *mp)
{
dl_capab_mdt_t *mdt_subcap;
dl_capability_sub_t *dl_subcap;
if (!ILL_MDT_CAPABLE(ill))
return;
ASSERT(ill->ill_mdt_capab != NULL);
dl_subcap = (dl_capability_sub_t *)mp->b_wptr;
dl_subcap->dl_cap = DL_CAPAB_MDT;
dl_subcap->dl_length = sizeof (*mdt_subcap);
mdt_subcap = (dl_capab_mdt_t *)(dl_subcap + 1);
mdt_subcap->mdt_version = ill->ill_mdt_capab->ill_mdt_version;
mdt_subcap->mdt_flags = 0;
mdt_subcap->mdt_hdr_head = 0;
mdt_subcap->mdt_hdr_tail = 0;
mp->b_wptr += sizeof (*dl_subcap) + sizeof (*mdt_subcap);
}
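/*
* Fill in a DLD sub-capability entry at mp's write pointer, as part
* of building a capability reset DL_CAPABILITY_REQ.
*/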
static void
ill_capability_dld_reset_fill(ill_t *ill, mblk_t *mp)
{
dl_capability_sub_t *dl_subcap;
if (!(ill->ill_capabilities & ILL_CAPAB_DLD))
return;
/*
* The dl_capab_dld_t that follows the dl_capability_sub_t is not
* initialized below since it is not used by DLD.
*/
dl_subcap = (dl_capability_sub_t *)mp->b_wptr;
dl_subcap->dl_cap = DL_CAPAB_DLD;
dl_subcap->dl_length = sizeof (dl_capab_dld_t);
mp->b_wptr += sizeof (dl_capability_sub_t) + sizeof (dl_capab_dld_t);
}
/*
* Send a DL_NOTIFY_REQ to the specified ill to enable
* DL_NOTE_PROMISC_ON/OFF_PHYS notifications.
* Invoked by ill_capability_ipsec_ack() before enabling IPsec hardware
* acceleration.
* Returns B_TRUE on success, B_FALSE if the message could not be sent.
*/
static boolean_t
ill_enable_promisc_notify(ill_t *ill)
{
mblk_t *mp;
dl_notify_req_t *req;
IPSECHW_DEBUG(IPSECHW_PKT, ("ill_enable_promisc_notify:\n"));
mp = ip_dlpi_alloc(sizeof (dl_notify_req_t), DL_NOTIFY_REQ);
if (mp == NULL)
return (B_FALSE);
req = (dl_notify_req_t *)mp->b_rptr;
req->dl_notifications = DL_NOTE_PROMISC_ON_PHYS |
DL_NOTE_PROMISC_OFF_PHYS;
ill_dlpi_send(ill, mp);
return (B_TRUE);
}
/*
* Allocate an IPsec capability request which will be filled by our
* caller to turn on support for one or more algorithms.
*/
static mblk_t *
ill_alloc_ipsec_cap_req(ill_t *ill, dl_capability_sub_t *isub)
{
mblk_t *nmp;
dl_capability_req_t *ocap;
dl_capab_ipsec_t *ocip;
dl_capab_ipsec_t *icip;
uint8_t *ptr;
icip = (dl_capab_ipsec_t *)(isub + 1);
/*
* The first time around, we send a DL_NOTIFY_REQ to enable
* PROMISC_ON/OFF notification from the provider. We need to
* do this before enabling the algorithms to avoid leakage of
* cleartext packets.
*/
if (!ill_enable_promisc_notify(ill))
return (NULL);
/*
* Allocate new mblk which will contain a new capability
* request to enable the capabilities.
*/
nmp = ip_dlpi_alloc(sizeof (dl_capability_req_t) +
sizeof (dl_capability_sub_t) + isub->dl_length, DL_CAPABILITY_REQ);
if (nmp == NULL)
return (NULL);
ptr = nmp->b_rptr;
/* initialize dl_capability_req_t */
ocap = (dl_capability_req_t *)ptr;
ocap->dl_sub_offset = sizeof (dl_capability_req_t);
ocap->dl_sub_length = sizeof (dl_capability_sub_t) + isub->dl_length;
ptr += sizeof (dl_capability_req_t);
/* initialize dl_capability_sub_t */
bcopy(isub, ptr, sizeof (*isub));
ptr += sizeof (*isub);
/* initialize dl_capab_ipsec_t */
ocip = (dl_capab_ipsec_t *)ptr;
bcopy(icip, ocip, sizeof (*icip));
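/*
* Leave b_wptr at the start of the cipher array (cip_data[0]); the
* caller appends one dl_capab_ipsec_alg_t per algorithm it enables
* and advances b_wptr past each entry.
*/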
nmp->b_wptr = (uchar_t *)(&ocip->cip_data[0]);
return (nmp);
}
/*
* Process an IPsec capability negotiation ack received from a DLS Provider.
* isub must point to the sub-capability (DL_CAPAB_IPSEC_AH or
* DL_CAPAB_IPSEC_ESP) of a DL_CAPABILITY_ACK message.
*/
static void
ill_capability_ipsec_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
{
dl_capab_ipsec_t *icip;
dl_capab_ipsec_alg_t *ialg; /* ptr to input alg spec. */
dl_capab_ipsec_alg_t *oalg; /* ptr to output alg spec. */
uint_t cipher, nciphers;
mblk_t *nmp;
uint_t alg_len;
boolean_t need_sadb_dump;
uint_t sub_dl_cap = isub->dl_cap;
ill_ipsec_capab_t **ill_capab;
uint64_t ill_capab_flag;
uint8_t *capend, *ciphend;
boolean_t sadb_resync;
ASSERT(sub_dl_cap == DL_CAPAB_IPSEC_AH ||
sub_dl_cap == DL_CAPAB_IPSEC_ESP);
if (sub_dl_cap == DL_CAPAB_IPSEC_AH) {
ill_capab = (ill_ipsec_capab_t **)&ill->ill_ipsec_capab_ah;
ill_capab_flag = ILL_CAPAB_AH;
} else {
ill_capab = (ill_ipsec_capab_t **)&ill->ill_ipsec_capab_esp;
ill_capab_flag = ILL_CAPAB_ESP;
}
/*
* If the ill capability structure exists, then this incoming
* DL_CAPABILITY_ACK is a response to a "renegotiation" cycle.
* If this is so, then we'd need to resynchronize the SADB
* after re-enabling the offloaded ciphers.
*/
sadb_resync = (*ill_capab != NULL);
/*
* Note: range checks here are not absolutely sufficient to
* make us robust against malformed messages sent by drivers;
* this is in keeping with the rest of IP's dlpi handling.
* (Remember, it's coming from something else in the kernel
* address space)
*/
capend = (uint8_t *)(isub + 1) + isub->dl_length;
if (capend > mp->b_wptr) {
cmn_err(CE_WARN, "ill_capability_ipsec_ack: "
"malformed sub-capability too long for mblk");
return;
}
/*
* There are two types of acks we process here:
* 1. acks in reply to a (first form) generic capability req
* (no ENABLE flag set)
* 2. acks in reply to an ENABLE capability req.
* (ENABLE flag set)
*
* We process the subcapability passed as argument as follows:
* 1 do initializations
* 1.1 initialize nmp = NULL
* 1.2 set need_sadb_dump to B_FALSE
* 2 for each cipher in subcapability:
* 2.1 if ENABLE flag is set:
* 2.1.1 update per-ill ipsec capabilities info
* 2.1.2 set need_sadb_dump to B_TRUE
* 2.2 if ENABLE flag is not set:
* 2.2.1 if nmp is NULL:
* 2.2.1.1 allocate and initialize nmp
* 2.2.1.2 init current pos in nmp
* 2.2.2 copy current cipher to current pos in nmp
* 2.2.3 set ENABLE flag in nmp
* 2.2.4 update current pos
* 3 if nmp is not equal to NULL, send enable request
* 3.1 send capability request
* 4 if need_sadb_dump is B_TRUE
* 4.1 enable promiscuous on/off notifications
* 4.2 call ill_dlpi_send(isub->dlcap) to send all
* AH or ESP SAs to the interface.
*/
nmp = NULL;
oalg = NULL;
need_sadb_dump = B_FALSE;
icip = (dl_capab_ipsec_t *)(isub + 1);
ialg = (dl_capab_ipsec_alg_t *)(&icip->cip_data[0]);
nciphers = icip->cip_nciphers;
ciphend = (uint8_t *)(ialg + icip->cip_nciphers);
if (ciphend > capend) {
cmn_err(CE_WARN, "ill_capability_ipsec_ack: "
"too many ciphers for sub-capability len");
return;
}
for (cipher = 0; cipher < nciphers; cipher++) {
alg_len = sizeof (dl_capab_ipsec_alg_t);
if (ialg->alg_flag & DL_CAPAB_ALG_ENABLE) {
/*
* TBD: when we provide a way to disable capabilities
* from above, need to manage the request-pending state
* and fail if we were not expecting this ACK.
*/
IPSECHW_DEBUG(IPSECHW_CAPAB,
("ill_capability_ipsec_ack: got ENABLE ACK\n"));
/*
* Update IPsec capabilities for this ill
*/
if (*ill_capab == NULL) {
IPSECHW_DEBUG(IPSECHW_CAPAB,
("ill_capability_ipsec_ack: "
"allocating ipsec_capab for ill\n"));
*ill_capab = ill_ipsec_capab_alloc();
if (*ill_capab == NULL) {
cmn_err(CE_WARN,
"ill_capability_ipsec_ack: "
"could not enable IPsec Hardware "
"acceleration for %s (ENOMEM)\n",
ill->ill_name);
return;
}
}
ASSERT(ialg->alg_type == DL_CAPAB_IPSEC_ALG_AUTH ||
ialg->alg_type == DL_CAPAB_IPSEC_ALG_ENCR);
if (ialg->alg_prim >= MAX_IPSEC_ALGS) {
cmn_err(CE_WARN,
"ill_capability_ipsec_ack: "
"malformed IPsec algorithm id %d",
ialg->alg_prim);
continue;
}
if (ialg->alg_type == DL_CAPAB_IPSEC_ALG_AUTH) {
IPSEC_ALG_ENABLE((*ill_capab)->auth_hw_algs,
ialg->alg_prim);
} else {
ipsec_capab_algparm_t *alp;
IPSEC_ALG_ENABLE((*ill_capab)->encr_hw_algs,
ialg->alg_prim);
if (!ill_ipsec_capab_resize_algparm(*ill_capab,
ialg->alg_prim)) {
cmn_err(CE_WARN,
"ill_capability_ipsec_ack: "
"no space for IPsec alg id %d",
ialg->alg_prim);
continue;
}
alp = &((*ill_capab)->encr_algparm[
ialg->alg_prim]);
alp->minkeylen = ialg->alg_minbits;
alp->maxkeylen = ialg->alg_maxbits;
}
ill->ill_capabilities |= ill_capab_flag;
/*
* indicate that a capability was enabled, which
* will be used below to kick off a SADB dump
* to the ill.
*/
need_sadb_dump = B_TRUE;
} else {
IPSECHW_DEBUG(IPSECHW_CAPAB,
("ill_capability_ipsec_ack: enabling alg 0x%x\n",
ialg->alg_prim));
if (nmp == NULL) {
nmp = ill_alloc_ipsec_cap_req(ill, isub);
if (nmp == NULL) {
/*
* ill_alloc_ipsec_cap_req() failed: either the
* request mblk could not be allocated, or the
* PROMISC_ON/OFF notification request could
* not be sent. We cannot enable the algorithms,
* since without those notifications the
* Provider's promiscuous mode changes could
* lead to leakage of cleartext packets.
*/
cmn_err(CE_WARN,
"ill_capability_ipsec_ack: "
"could not enable IPsec Hardware "
"acceleration for %s (ENOMEM)\n",
ill->ill_name);
return;
}
/* ptr to current output alg specifier */
oalg = (dl_capab_ipsec_alg_t *)nmp->b_wptr;
}
/*
* Copy current alg specifier, set ENABLE
* flag, and advance to next output alg.
* For now we enable all IPsec capabilities.
*/
ASSERT(oalg != NULL);
bcopy(ialg, oalg, alg_len);
oalg->alg_flag |= DL_CAPAB_ALG_ENABLE;
nmp->b_wptr += alg_len;
oalg = (dl_capab_ipsec_alg_t *)nmp->b_wptr;
}
/* move to next input algorithm specifier */
ialg = (dl_capab_ipsec_alg_t *)
((char *)ialg + alg_len);
}
if (nmp != NULL)
/*
* nmp points to a DL_CAPABILITY_REQ message to enable
* IPsec hardware acceleration.
*/
ill_capability_send(ill, nmp);
if (need_sadb_dump)
/*
* An acknowledgement corresponding to a request to
* enable acceleration was received, notify SADB.
*/
ill_ipsec_capab_add(ill, sub_dl_cap, sadb_resync);
}
/*
* Given an mblk with enough space in it, create sub-capability entries for
* DL_CAPAB_IPSEC_{AH,ESP} types which consist of previously-advertised
* offloaded ciphers (both AUTH and ENCR) with their enable flags cleared,
* in preparation for the DL_CAPABILITY_REQ reset message.
*/
static void
ill_fill_ipsec_reset(uint_t nciphers, int stype, uint_t slen,
ill_ipsec_capab_t *ill_cap, mblk_t *mp)
{
dl_capab_ipsec_t *oipsec;
dl_capab_ipsec_alg_t *oalg;
dl_capability_sub_t *dl_subcap;
int i, k;
ASSERT(nciphers > 0);
ASSERT(ill_cap != NULL);
ASSERT(mp != NULL);
ASSERT(MBLKTAIL(mp) >= sizeof (*dl_subcap) + sizeof (*oipsec) + slen);
/* dl_capability_sub_t for "stype" */
dl_subcap = (dl_capability_sub_t *)mp->b_wptr;
dl_subcap->dl_cap = stype;
dl_subcap->dl_length = sizeof (dl_capab_ipsec_t) + slen;
mp->b_wptr += sizeof (dl_capability_sub_t);
/* dl_capab_ipsec_t for "stype" */
oipsec = (dl_capab_ipsec_t *)mp->b_wptr;
oipsec->cip_version = 1;
oipsec->cip_nciphers = nciphers;
mp->b_wptr = (uchar_t *)&oipsec->cip_data[0];
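/*
* {auth,encr}_hw_algs are bitmasks indexed by algorithm id: bit k
* of byte i corresponds to algorithm id (i * BITSPERBYTE + k).
* Walk both bitmasks and emit one cleared alg entry per advertised
* algorithm.
*/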
/* create entries for "stype" AUTH ciphers */
for (i = 0; i < ill_cap->algs_size; i++) {
for (k = 0; k < BITSPERBYTE; k++) {
if ((ill_cap->auth_hw_algs[i] & (1 << k)) == 0)
continue;
oalg = (dl_capab_ipsec_alg_t *)mp->b_wptr;
bzero((void *)oalg, sizeof (*oalg));
oalg->alg_type = DL_CAPAB_IPSEC_ALG_AUTH;
oalg->alg_prim = k + (BITSPERBYTE * i);
mp->b_wptr += sizeof (dl_capab_ipsec_alg_t);
}
}
/* create entries for "stype" ENCR ciphers */
for (i = 0; i < ill_cap->algs_size; i++) {
for (k = 0; k < BITSPERBYTE; k++) {
if ((ill_cap->encr_hw_algs[i] & (1 << k)) == 0)
continue;
oalg = (dl_capab_ipsec_alg_t *)mp->b_wptr;
bzero((void *)oalg, sizeof (*oalg));
oalg->alg_type = DL_CAPAB_IPSEC_ALG_ENCR;
oalg->alg_prim = k + (BITSPERBYTE * i);
mp->b_wptr += sizeof (dl_capab_ipsec_alg_t);
}
}
}
/*
* Macro to count the number of 1s in a byte (8-bit word). The total count
* is accumulated into the passed-in argument (sum). We could use SPARCv9's
* POPC instruction, but our macro is more flexible for an arbitrary length
* of bytes, such as {auth,encr}_hw_algs. These variables are currently
* 256 bits long (MAX_IPSEC_ALGS), so if we knew for sure that the length
* would stay that way, we could reduce the number of iterations required.
*/
#define COUNT_1S(val, sum) { \
uint8_t x = val & 0xff; \
x = (x & 0x55) + ((x >> 1) & 0x55); \
x = (x & 0x33) + ((x >> 2) & 0x33); \
sum += (x & 0xf) + ((x >> 4) & 0xf); \
}
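/*
* For example, COUNT_1S(0xb5, sum) above adds 5 to sum:
*
*	x = 10110101
*	(x & 0x55) + ((x >> 1) & 0x55) = 00010101 + 01010000 = 01100101
*					 (per-pair bit counts: 1,2,1,1)
*	(x & 0x33) + ((x >> 2) & 0x33) = 00100001 + 00010001 = 00110010
*					 (per-nibble counts: 3,2)
*	(x & 0x0f) + ((x >> 4) & 0x0f) = 2 + 3 = 5
*/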
/* ARGSUSED */
static int
ill_capability_ipsec_reset_size(ill_t *ill, int *ah_cntp, int *ah_lenp,
int *esp_cntp, int *esp_lenp)
{
ill_ipsec_capab_t *cap_ah = ill->ill_ipsec_capab_ah;
ill_ipsec_capab_t *cap_esp = ill->ill_ipsec_capab_esp;
uint64_t ill_capabilities = ill->ill_capabilities;
int ah_cnt = 0, esp_cnt = 0;
int ah_len = 0, esp_len = 0;
int i, size = 0;
if (!(ill_capabilities & (ILL_CAPAB_AH | ILL_CAPAB_ESP)))
return (0);
ASSERT(cap_ah != NULL || !(ill_capabilities & ILL_CAPAB_AH));
ASSERT(cap_esp != NULL || !(ill_capabilities & ILL_CAPAB_ESP));
/* Find out the number of ciphers for AH */
if (cap_ah != NULL) {
for (i = 0; i < cap_ah->algs_size; i++) {
COUNT_1S(cap_ah->auth_hw_algs[i], ah_cnt);
COUNT_1S(cap_ah->encr_hw_algs[i], ah_cnt);
}
if (ah_cnt > 0) {
size += sizeof (dl_capability_sub_t) +
sizeof (dl_capab_ipsec_t);
/* dl_capab_ipsec_t already embeds one alg entry; count the rest */
ah_len = (ah_cnt - 1) * sizeof (dl_capab_ipsec_alg_t);
size += ah_len;
}
}
/* Find out the number of ciphers for ESP */
if (cap_esp != NULL) {
for (i = 0; i < cap_esp->algs_size; i++) {
COUNT_1S(cap_esp->auth_hw_algs[i], esp_cnt);
COUNT_1S(cap_esp->encr_hw_algs[i], esp_cnt);
}
if (esp_cnt > 0) {
size += sizeof (dl_capability_sub_t) +
sizeof (dl_capab_ipsec_t);
/* dl_capab_ipsec_t already embeds one alg entry; count the rest */
esp_len = (esp_cnt - 1) * sizeof (dl_capab_ipsec_alg_t);
size += esp_len;
}
}
if (ah_cntp != NULL)
*ah_cntp = ah_cnt;
if (ah_lenp != NULL)
*ah_lenp = ah_len;
if (esp_cntp != NULL)
*esp_cntp = esp_cnt;
if (esp_lenp != NULL)
*esp_lenp = esp_len;
return (size);
}
/* ARGSUSED */
static void
ill_capability_ipsec_reset_fill(ill_t *ill, mblk_t *mp)
{
ill_ipsec_capab_t *cap_ah = ill->ill_ipsec_capab_ah;
ill_ipsec_capab_t *cap_esp = ill->ill_ipsec_capab_esp;
int ah_cnt = 0, esp_cnt = 0;
int ah_len = 0, esp_len = 0;
int size;
size = ill_capability_ipsec_reset_size(ill, &ah_cnt, &ah_len,
&esp_cnt, &esp_len);
if (size == 0)
return;
/*
* Clear the capability flags for IPsec hardware acceleration, but
* retain the ill capability structures since it's possible that
* another thread is still referring to them. The structures only
* get deallocated when we destroy the ill.
*
* Various places check the flags to see if the ill is capable of
* hardware acceleration, and by clearing them we ensure that new
* outbound IPsec packets are sent down encrypted.
*/
/* Fill in DL_CAPAB_IPSEC_AH sub-capability entries */
if (ah_cnt > 0) {
ill_fill_ipsec_reset(ah_cnt, DL_CAPAB_IPSEC_AH, ah_len,
cap_ah, mp);
}
/* Fill in DL_CAPAB_IPSEC_ESP sub-capability entries */
if (esp_cnt > 0) {
ill_fill_ipsec_reset(esp_cnt, DL_CAPAB_IPSEC_ESP, esp_len,
cap_esp, mp);
}
/*
* At this point we've composed a bunch of sub-capabilities to be
* encapsulated in a DL_CAPABILITY_REQ and later sent downstream
* by the caller. Upon receiving this reset message, the driver
* must stop inbound decryption (by destroying all inbound SAs)
* and let the corresponding packets come in encrypted.
*/
}
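/*
* Dispatch a DL_CAPABILITY_ACK sub-capability to its type-specific
* handler. "encapsulated" is B_TRUE when the sub-capability was
* received inside a wrapper sub-capability, in which case the module
* ID check has already been performed by the caller.
*/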
static void
ill_capability_dispatch(ill_t *ill, mblk_t *mp, dl_capability_sub_t *subp,
boolean_t encapsulated)
{
boolean_t legacy = B_FALSE;
/*
* Note that only the following two sub-capabilities may be
* considered as "legacy", since their original definitions
* do not incorporate the dl_mid_t module ID token, and hence
* may require the use of the wrapper sub-capability.
*/
switch (subp->dl_cap) {
case DL_CAPAB_IPSEC_AH:
case DL_CAPAB_IPSEC_ESP:
legacy = B_TRUE;
break;
}
/*
* For legacy sub-capabilities which don't incorporate a queue_t
* pointer in their structures, discard them if we detect that
* there are intermediate modules between IP and the driver.
*/
if (!encapsulated && legacy && ill->ill_lmod_cnt > 1) {
ip1dbg(("ill_capability_dispatch: unencapsulated capab type "
"%d discarded; %d module(s) present below IP\n",
subp->dl_cap, ill->ill_lmod_cnt));
return;
}
switch (subp->dl_cap) {
case DL_CAPAB_IPSEC_AH:
case DL_CAPAB_IPSEC_ESP:
ill_capability_ipsec_ack(ill, mp, subp);
break;
case DL_CAPAB_MDT:
ill_capability_mdt_ack(ill, mp, subp);
break;
case DL_CAPAB_HCKSUM:
ill_capability_hcksum_ack(ill, mp, subp);
break;
case DL_CAPAB_ZEROCOPY:
ill_capability_zerocopy_ack(ill, mp, subp);
break;
case DL_CAPAB_DLD:
ill_capability_dld_ack(ill, mp, subp);
break;
default:
ip1dbg(("ill_capability_dispatch: unknown capab type %d\n",
subp->dl_cap));
}
}
/*
* Process a hardware checksum offload capability negotiation ack received
* from a DLS Provider. isub must point to the sub-capability (DL_CAPAB_HCKSUM)
* of a DL_CAPABILITY_ACK message.
*/
static void
ill_capability_hcksum_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
{
dl_capability_req_t *ocap;
dl_capab_hcksum_t *ihck, *ohck;
ill_hcksum_capab_t **ill_hcksum;
mblk_t *nmp = NULL;
uint_t sub_dl_cap = isub->dl_cap;
uint8_t *capend;
ASSERT(sub_dl_cap == DL_CAPAB_HCKSUM);
ill_hcksum = (ill_hcksum_capab_t **)&ill->ill_hcksum_capab;
/*
* Note: range checks here are not absolutely sufficient to
* make us robust against malformed messages sent by drivers;
* this is in keeping with the rest of IP's dlpi handling.
* (Remember, it's coming from something else in the kernel
* address space)
*/
capend = (uint8_t *)(isub + 1) + isub->dl_length;
if (capend > mp->b_wptr) {
cmn_err(CE_WARN, "ill_capability_hcksum_ack: "
"malformed sub-capability too long for mblk");
return;
}
/*
* There are two types of acks we process here:
* 1. acks in reply to a (first form) generic capability req
* (no ENABLE flag set)
* 2. acks in reply to an ENABLE capability req.
* (ENABLE flag set)
*/
ihck = (dl_capab_hcksum_t *)(isub + 1);
if (ihck->hcksum_version != HCKSUM_VERSION_1) {
cmn_err(CE_CONT, "ill_capability_hcksum_ack: "
"unsupported hardware checksum "
"sub-capability (version %d, expected %d)",
ihck->hcksum_version, HCKSUM_VERSION_1);
return;
}
if (!dlcapabcheckqid(&ihck->hcksum_mid, ill->ill_lmod_rq)) {
ip1dbg(("ill_capability_hcksum_ack: mid token for hardware "
"checksum capability isn't as expected; pass-thru "
"module(s) detected, discarding capability\n"));
return;
}
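/*
* The set of transmit checksum offload flavors that IP can currently
* take advantage of.
*/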
#define CURR_HCKSUM_CAPAB \
(HCKSUM_INET_PARTIAL | HCKSUM_INET_FULL_V4 | \
HCKSUM_INET_FULL_V6 | HCKSUM_IPHDRCKSUM)
if ((ihck->hcksum_txflags & HCKSUM_ENABLE) &&
(ihck->hcksum_txflags & CURR_HCKSUM_CAPAB)) {
/* do ENABLE processing */
if (*ill_hcksum == NULL) {
*ill_hcksum = kmem_zalloc(sizeof (ill_hcksum_capab_t),
KM_NOSLEEP);
if (*ill_hcksum == NULL) {
cmn_err(CE_WARN, "ill_capability_hcksum_ack: "
"could not enable hcksum version %d "
"for %s (ENOMEM)\n", HCKSUM_CURRENT_VERSION,
ill->ill_name);
return;
}
}
(*ill_hcksum)->ill_hcksum_version = ihck->hcksum_version;
(*ill_hcksum)->ill_hcksum_txflags = ihck->hcksum_txflags;
ill->ill_capabilities |= ILL_CAPAB_HCKSUM;
ip1dbg(("ill_capability_hcksum_ack: interface %s "
"has enabled hardware checksumming\n ",
ill->ill_name));
} else if (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB) {
/*
* Enable hardware checksum offload. Currently IP
* supports {TCP,UDP}/IPv4 partial and full cksum
* offload and IPv4 header checksum offload.
* Allocate a new mblk which will contain a new
* capability request to enable hardware checksum
* offload.
*/
uint_t size;
uchar_t *rptr;
size = sizeof (dl_capability_req_t) +
sizeof (dl_capability_sub_t) + isub->dl_length;
if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) {
cmn_err(CE_WARN, "ill_capability_hcksum_ack: "
"could not enable hardware cksum for %s (ENOMEM)\n",
ill->ill_name);
return;
}
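/*
* Same layout as the MDT enable request built above: capability
* request header, sub-capability header, then the echoed
* dl_capab_hcksum_t.
*/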
rptr = nmp->b_rptr;
/* initialize dl_capability_req_t */
ocap = (dl_capability_req_t *)nmp->b_rptr;
ocap->dl_sub_offset =
sizeof (dl_capability_req_t);
ocap->dl_sub_length =
sizeof (dl_capability_sub_t) +
isub->dl_length;
nmp->b_rptr += sizeof (dl_capability_req_t);
/* initialize dl_capability_sub_t */
bcopy(isub, nmp->b_rptr, sizeof (*isub));
nmp->b_rptr += sizeof (*isub);
/* initialize dl_capab_hcksum_t */
ohck = (dl_capab_hcksum_t *)nmp->b_rptr;
bcopy(ihck, ohck, sizeof (*ihck));
nmp->b_rptr = rptr;
ASSERT(nmp->b_wptr == (nmp->b_rptr + size));
/* Set ENABLE flag */