blob: c0bb78caaa8c6c8ff83206b0a0ca01c1c4c1edd1 [file] [log] [blame]
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1990 Mentat Inc. */
#pragma ident "%Z%%M% %I% %E% SMI"
/*
* This file contains the interface control functions for IP.
*/
#include <sys/types.h>
#include <sys/stream.h>
#include <sys/dlpi.h>
#include <sys/stropts.h>
#include <sys/strsun.h>
#include <sys/sysmacros.h>
#include <sys/strlog.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/cmn_err.h>
#include <sys/kstat.h>
#include <sys/debug.h>
#include <sys/zone.h>
#include <sys/kmem.h>
#include <sys/systm.h>
#include <sys/param.h>
#include <sys/socket.h>
#include <sys/isa_defs.h>
#include <net/if.h>
#include <net/if_arp.h>
#include <net/if_types.h>
#include <net/if_dl.h>
#include <net/route.h>
#include <sys/sockio.h>
#include <netinet/in.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet/igmp_var.h>
#include <sys/strsun.h>
#include <sys/policy.h>
#include <sys/ethernet.h>
#include <inet/common.h> /* for various inet/mi.h and inet/nd.h needs */
#include <inet/mi.h>
#include <inet/nd.h>
#include <inet/arp.h>
#include <inet/mib2.h>
#include <inet/ip.h>
#include <inet/ip6.h>
#include <inet/ip6_asp.h>
#include <inet/tcp.h>
#include <inet/ip_multi.h>
#include <inet/ip_ire.h>
#include <inet/ip_rts.h>
#include <inet/ip_ndp.h>
#include <inet/ip_if.h>
#include <inet/ip_impl.h>
#include <inet/tun.h>
#include <inet/sctp_ip.h>
#include <net/pfkeyv2.h>
#include <inet/ipsec_info.h>
#include <inet/sadb.h>
#include <inet/ipsec_impl.h>
#include <sys/iphada.h>
#include <netinet/igmp.h>
#include <inet/ip_listutils.h>
#include <inet/ipclassifier.h>
#include <sys/mac.h>
#include <sys/systeminfo.h>
#include <sys/bootconf.h>
#include <sys/tsol/tndb.h>
#include <sys/tsol/tnet.h>
/* The character which tells where the ill_name ends */
#define IPIF_SEPARATOR_CHAR ':'
/*
 * IP ioctl function table entry. ip_ioctl_ftbl[] later in this file is an
 * array of these, ended by an entry whose ipft_cmd is 0.
 */
typedef struct ipft_s {
int ipft_cmd; /* ioctl command code, e.g. IP_IOC_IRE_DELETE */
pfi_t ipft_pfi; /* function that implements the command */
int ipft_min_size; /* presumably minimum request size -- confirm */
int ipft_flags; /* IPFT_F_* bits defined below */
} ipft_t;
#define IPFT_F_NO_REPLY 0x1 /* IP ioctl does not expect any reply */
#define IPFT_F_SELF_REPLY 0x2 /* ioctl callee does the ioctl reply */
/*
 * Header used when building ARP requests on behalf of socket ioctls: one of
 * the three ARP command structures overlaid in a union, followed by a queue
 * pointer. The area templates below place their address data at offsets
 * starting at sizeof (ip_sock_ar_t), i.e. right after this header.
 */
typedef struct ip_sock_ar_s {
union {
area_t ip_sock_area;
ared_t ip_sock_ared;
areq_t ip_sock_areq;
} ip_sock_ar_u;
/* NOTE(review): presumably the queue the ioctl arrived on -- confirm */
queue_t *ip_sock_ar_q;
} ip_sock_ar_t;
static int nd_ill_forward_get(queue_t *, mblk_t *, caddr_t, cred_t *);
static int nd_ill_forward_set(queue_t *q, mblk_t *mp,
char *value, caddr_t cp, cred_t *ioc_cr);
static boolean_t ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask);
static ip_m_t *ip_m_lookup(t_uscalar_t mac_type);
static int ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
mblk_t *mp, boolean_t need_up);
static int ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
mblk_t *mp, boolean_t need_up);
static int ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid,
queue_t *q, mblk_t *mp, boolean_t need_up);
static int ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q,
mblk_t *mp, boolean_t need_up);
static int ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
mblk_t *mp);
static int ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t, in6_addr_t,
queue_t *q, mblk_t *mp, boolean_t need_up);
static int ip_sioctl_arp_common(ill_t *ill, queue_t *q, mblk_t *mp,
sin_t *sin, boolean_t x_arp_ioctl, boolean_t if_arp_ioctl);
static ipaddr_t ip_subnet_mask(ipaddr_t addr, ipif_t **);
static void ip_wput_ioctl(queue_t *q, mblk_t *mp);
static void ipsq_flush(ill_t *ill);
static void ipsq_clean_all(ill_t *ill);
static void ipsq_clean_ring(ill_t *ill, ill_rx_ring_t *rx_ring);
static int ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen,
queue_t *q, mblk_t *mp, boolean_t need_up);
static void ipsq_delete(ipsq_t *);
static ipif_t *ipif_allocate(ill_t *ill, int id, uint_t ire_type,
boolean_t initialize);
static void ipif_check_bcast_ires(ipif_t *test_ipif);
static void ipif_down_delete_ire(ire_t *ire, char *ipif);
static void ipif_delete_cache_ire(ire_t *, char *);
static int ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp);
static void ipif_down_tail(ipif_t *ipif);
static void ipif_free(ipif_t *ipif);
static void ipif_free_tail(ipif_t *ipif);
static void ipif_mask_reply(ipif_t *);
static void ipif_mtu_change(ire_t *ire, char *ipif_arg);
static void ipif_multicast_down(ipif_t *ipif);
static void ipif_recreate_interface_routes(ipif_t *old_ipif, ipif_t *ipif);
static void ipif_set_default(ipif_t *ipif);
static int ipif_set_values(queue_t *q, mblk_t *mp,
char *interf_name, uint_t *ppa);
static int ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp,
queue_t *q);
static ipif_t *ipif_lookup_on_name(char *name, size_t namelen,
boolean_t do_alloc, boolean_t *exists, boolean_t isv6, zoneid_t zoneid,
queue_t *q, mblk_t *mp, ipsq_func_t func, int *error);
static int ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp);
static void ipif_update_other_ipifs(ipif_t *old_ipif, ill_group_t *illgrp);
static int ill_alloc_ppa(ill_if_t *, ill_t *);
static int ill_arp_off(ill_t *ill);
static int ill_arp_on(ill_t *ill);
static void ill_delete_interface_type(ill_if_t *);
static int ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q);
static void ill_down(ill_t *ill);
static void ill_downi(ire_t *ire, char *ill_arg);
static void ill_downi_mrtun_srcif(ire_t *ire, char *ill_arg);
static void ill_down_tail(ill_t *ill);
static void ill_free_mib(ill_t *ill);
static void ill_glist_delete(ill_t *);
static boolean_t ill_has_usable_ipif(ill_t *);
static int ill_lock_ipsq_ills(ipsq_t *sq, ill_t **list, int);
static void ill_nominate_bcast_rcv(ill_group_t *illgrp);
static void ill_phyint_free(ill_t *ill);
static void ill_phyint_reinit(ill_t *ill);
static void ill_set_nce_router_flags(ill_t *, boolean_t);
static void ill_signal_ipsq_ills(ipsq_t *, boolean_t);
static boolean_t ill_split_ipsq(ipsq_t *cur_sq);
static void ill_stq_cache_delete(ire_t *, char *);
static boolean_t ip_ether_v6intfid(uint_t, uint8_t *, in6_addr_t *);
static boolean_t ip_nodef_v6intfid(uint_t, uint8_t *, in6_addr_t *);
static boolean_t ip_ether_v6mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *,
in6_addr_t *);
static boolean_t ip_ether_v4mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *,
ipaddr_t *);
static boolean_t ip_ib_v6intfid(uint_t, uint8_t *, in6_addr_t *);
static boolean_t ip_ib_v6mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *,
in6_addr_t *);
static boolean_t ip_ib_v4mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *,
ipaddr_t *);
static void ipif_save_ire(ipif_t *, ire_t *);
static void ipif_remove_ire(ipif_t *, ire_t *);
static void ip_cgtp_bcast_add(ire_t *, ire_t *);
static void ip_cgtp_bcast_delete(ire_t *);
/*
* Per-ill IPsec capabilities management.
*/
static ill_ipsec_capab_t *ill_ipsec_capab_alloc(void);
static void ill_ipsec_capab_free(ill_ipsec_capab_t *);
static void ill_ipsec_capab_add(ill_t *, uint_t, boolean_t);
static void ill_ipsec_capab_delete(ill_t *, uint_t);
static boolean_t ill_ipsec_capab_resize_algparm(ill_ipsec_capab_t *, int);
static void ill_capability_proto(ill_t *, int, mblk_t *);
static void ill_capability_dispatch(ill_t *, mblk_t *, dl_capability_sub_t *,
boolean_t);
static void ill_capability_id_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
static void ill_capability_mdt_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
static void ill_capability_mdt_reset(ill_t *, mblk_t **);
static void ill_capability_ipsec_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
static void ill_capability_ipsec_reset(ill_t *, mblk_t **);
static void ill_capability_hcksum_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
static void ill_capability_hcksum_reset(ill_t *, mblk_t **);
static void ill_capability_zerocopy_ack(ill_t *, mblk_t *,
dl_capability_sub_t *);
static void ill_capability_zerocopy_reset(ill_t *, mblk_t **);
static void ill_capability_dls_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
static mac_resource_handle_t ill_ring_add(void *, mac_resource_t *);
static void ill_capability_dls_reset(ill_t *, mblk_t **);
static void ill_capability_dls_disable(ill_t *);
static void illgrp_cache_delete(ire_t *, char *);
static void illgrp_delete(ill_t *ill);
static void illgrp_reset_schednext(ill_t *ill);
static ill_t *ill_prev_usesrc(ill_t *);
static int ill_relink_usesrc_ills(ill_t *, ill_t *, uint_t);
static void ill_disband_usesrc_group(ill_t *);
static void conn_cleanup_stale_ire(conn_t *, caddr_t);
/*
 * If we go over the memory footprint limit more than once in this msec
 * interval, we'll start pruning aggressively. Initialized to 0; the
 * variable has external linkage, so it is visible outside this file.
 */
int ip_min_frag_prune_time = 0;
/*
 * max # of IPsec algorithms supported. Limited to 1 byte by PF_KEY
 * and the IPsec DOI
 */
#define MAX_IPSEC_ALGS 256
#define BITSPERBYTE 8
/* Number of bits in an object of the given type. */
#define BITS(type) (BITSPERBYTE * (long)sizeof (type))
/*
 * Bitmap helpers: 'algs' is an array of ipsec_capab_elem_t and 'algid'
 * selects one bit in it. IPSEC_ALG_ENABLE sets the bit;
 * IPSEC_ALG_IS_ENABLED evaluates to non-zero when it is set.
 * Note that the two macros take their arguments in opposite order.
 */
#define IPSEC_ALG_ENABLE(algs, algid) \
((algs)[(algid) / BITS(ipsec_capab_elem_t)] |= \
(1 << ((algid) % BITS(ipsec_capab_elem_t))))
#define IPSEC_ALG_IS_ENABLED(algid, algs) \
((algs)[(algid) / BITS(ipsec_capab_elem_t)] & \
(1 << ((algid) % BITS(ipsec_capab_elem_t))))
/* Element type of the algorithm bitmaps used by the macros above. */
typedef uint8_t ipsec_capab_elem_t;
/*
 * Per-algorithm parameters. Note that at present, only encryption
 * algorithms have variable keysize (IKE does not provide a way to negotiate
 * auth algorithm keysize).
 *
 * All sizes here are in bits.
 */
typedef struct
{
uint16_t minkeylen; /* smallest key length, in bits */
uint16_t maxkeylen; /* largest key length, in bits */
} ipsec_capab_algparm_t;
/*
 * Per-ill capabilities: one algorithm bitmap each for encryption and
 * authentication, plus an array of key length parameters for the
 * encryption algorithms.
 */
struct ill_ipsec_capab_s {
ipsec_capab_elem_t *encr_hw_algs; /* encryption algorithm bitmap */
ipsec_capab_elem_t *auth_hw_algs; /* authentication algorithm bitmap */
uint32_t algs_size; /* size of _hw_algs in bytes */
/* algorithm key lengths */
ipsec_capab_algparm_t *encr_algparm;
/* NOTE(review): units of the two fields below not shown here -- confirm */
uint32_t encr_algparm_size;
uint32_t encr_algparm_end;
};
/*
 * List of AH and ESP IPsec acceleration capable ills.
 * Each node identifies an ill by interface index and address family;
 * nodes are singly linked through 'next'.
 */
typedef struct ipsec_capab_ill_s {
uint_t ill_index;
boolean_t ill_isv6;
struct ipsec_capab_ill_s *next;
} ipsec_capab_ill_t;
/* Heads of the AH and ESP lists. */
static ipsec_capab_ill_t *ipsec_capab_ills_ah;
static ipsec_capab_ill_t *ipsec_capab_ills_esp;
/* NOTE(review): presumably protects the two lists above -- confirm */
krwlock_t ipsec_capab_ills_lock;
/*
 * IPv4 AR_ENTRY_ADD template, consumed by ill_arp_alloc().
 *
 * The field values are larger than strictly necessary for simple
 * AR_ENTRY_ADDs but the padding lets us accommodate the socket ioctls.
 * All offsets are relative to the start of the message; the address data
 * starts after an ip_sock_ar_t sized header.
 */
static area_t ip_area_template = {
AR_ENTRY_ADD, /* area_cmd */
sizeof (ip_sock_ar_t) + (IP_ADDR_LEN*2) + sizeof (struct sockaddr_dl),
/* area_name_offset */
/* area_name_length temporarily holds this structure length */
sizeof (area_t), /* area_name_length */
IP_ARP_PROTO_TYPE, /* area_proto */
sizeof (ip_sock_ar_t), /* area_proto_addr_offset */
IP_ADDR_LEN, /* area_proto_addr_length */
sizeof (ip_sock_ar_t) + IP_ADDR_LEN,
/* area_proto_mask_offset */
0, /* area_flags */
sizeof (ip_sock_ar_t) + IP_ADDR_LEN + IP_ADDR_LEN,
/* area_hw_addr_offset */
/* Zero length hw_addr_length means 'use your idea of the address' */
0 /* area_hw_addr_length */
};
/*
 * AR_ENTRY_ADD/DELETE templates have been added for IPv6 external resolver
 * support.
 *
 * This is the IPv6 counterpart of ip_area_template: same layout, with
 * IPV6_ADDR_LEN sized address and mask slots and a sin6_t sized area
 * ahead of the name.
 */
static area_t ip6_area_template = {
AR_ENTRY_ADD, /* area_cmd */
sizeof (ip_sock_ar_t) + (IPV6_ADDR_LEN*2) + sizeof (sin6_t),
/* area_name_offset */
/* area_name_length temporarily holds this structure length */
sizeof (area_t), /* area_name_length */
IP_ARP_PROTO_TYPE, /* area_proto */
sizeof (ip_sock_ar_t), /* area_proto_addr_offset */
IPV6_ADDR_LEN, /* area_proto_addr_length */
sizeof (ip_sock_ar_t) + IPV6_ADDR_LEN,
/* area_proto_mask_offset */
0, /* area_flags */
sizeof (ip_sock_ar_t) + IPV6_ADDR_LEN + IPV6_ADDR_LEN,
/* area_hw_addr_offset */
/* Zero length hw_addr_length means 'use your idea of the address' */
0 /* area_hw_addr_length */
};
/*
 * AR_ENTRY_DELETE templates for IPv4 and IPv6. As with every template
 * handed to ill_arp_alloc(), the second field is the name offset (total
 * space less the name) and the third temporarily holds the template length.
 * NOTE(review): the remaining positional fields presumably are protocol,
 * protocol address offset and protocol address length, mirroring area_t --
 * confirm against the ared_t definition.
 */
static ared_t ip_ared_template = {
AR_ENTRY_DELETE,
sizeof (ared_t) + IP_ADDR_LEN,
sizeof (ared_t),
IP_ARP_PROTO_TYPE,
sizeof (ared_t),
IP_ADDR_LEN
};
static ared_t ip6_ared_template = {
AR_ENTRY_DELETE,
sizeof (ared_t) + IPV6_ADDR_LEN,
sizeof (ared_t),
IP_ARP_PROTO_TYPE,
sizeof (ared_t),
IPV6_ADDR_LEN
};
/*
 * An IPv6 AR_ENTRY_QUERY template has not been created, as the areq
 * doesn't include an IP address in ill_dl_up() (the only place an
 * areq is used).
 *
 * Layout: areq_t header, target address, sender address, then the name.
 */
static areq_t ip_areq_template = {
AR_ENTRY_QUERY, /* cmd */
sizeof (areq_t)+(2*IP_ADDR_LEN), /* name offset */
sizeof (areq_t), /* name len (filled by ill_arp_alloc) */
IP_ARP_PROTO_TYPE, /* protocol, from arps perspective */
sizeof (areq_t), /* target addr offset */
IP_ADDR_LEN, /* target addr_length */
0, /* flags */
sizeof (areq_t) + IP_ADDR_LEN, /* sender addr offset */
IP_ADDR_LEN, /* sender addr length */
6, /* xmit_count */
1000, /* (re)xmit_interval in milliseconds */
4 /* max # of requests to buffer */
/* anything else filled in by the code */
};
/*
 * Templates for the four interface control commands (up, down, on, off).
 * They differ only in the command code: the name is placed immediately
 * after the arc_t header, and the name length slot holds the template
 * length until ill_arp_alloc() overwrites it.
 */
static arc_t ip_aru_template = {
AR_INTERFACE_UP,
sizeof (arc_t), /* Name offset */
sizeof (arc_t) /* Name length (set by ill_arp_alloc) */
};
static arc_t ip_ard_template = {
AR_INTERFACE_DOWN,
sizeof (arc_t), /* Name offset */
sizeof (arc_t) /* Name length (set by ill_arp_alloc) */
};
static arc_t ip_aron_template = {
AR_INTERFACE_ON,
sizeof (arc_t), /* Name offset */
sizeof (arc_t) /* Name length (set by ill_arp_alloc) */
};
static arc_t ip_aroff_template = {
AR_INTERFACE_OFF,
sizeof (arc_t), /* Name offset */
sizeof (arc_t) /* Name length (set by ill_arp_alloc) */
};
/*
 * AR_MAPPING_ADD template.
 * Layout: arma_t header, protocol address, protocol mask, extract mask
 * (each IP_ADDR_LEN bytes), an IP_MAX_HW_LEN hardware address slot, then
 * the name.
 * NOTE(review): the "multi" in the name suggests this is the multicast
 * mapping -- confirm at the point of use.
 */
static arma_t ip_arma_multi_template = {
AR_MAPPING_ADD,
sizeof (arma_t) + 3*IP_ADDR_LEN + IP_MAX_HW_LEN,
/* Name offset */
sizeof (arma_t), /* Name length (set by ill_arp_alloc) */
IP_ARP_PROTO_TYPE,
sizeof (arma_t), /* proto_addr_offset */
IP_ADDR_LEN, /* proto_addr_length */
sizeof (arma_t) + IP_ADDR_LEN, /* proto_mask_offset */
sizeof (arma_t) + 2*IP_ADDR_LEN, /* proto_extract_mask_offset */
ACE_F_PERMANENT | ACE_F_MAPPING, /* flags */
sizeof (arma_t) + 3*IP_ADDR_LEN, /* hw_addr_offset */
IP_MAX_HW_LEN, /* hw_addr_length */
0, /* hw_mapping_start */
};
/*
 * Table of IP ioctls: { command, handler, ipft_min_size, IPFT_F_* flags }.
 * The final all-zero entry marks the end of the table.
 */
static ipft_t ip_ioctl_ftbl[] = {
{ IP_IOC_IRE_DELETE, ip_ire_delete, sizeof (ipid_t), 0 },
{ IP_IOC_IRE_DELETE_NO_REPLY, ip_ire_delete, sizeof (ipid_t),
IPFT_F_NO_REPLY },
{ IP_IOC_IRE_ADVISE_NO_REPLY, ip_ire_advise, sizeof (ipic_t),
IPFT_F_NO_REPLY },
{ IP_IOC_RTS_REQUEST, ip_rts_request, 0, IPFT_F_SELF_REPLY },
{ 0 }
};
/*
 * Simple ICMP IP Header Template.
 * Positional initializer: the first member is IP_SIMPLE_HDR_VERSION, the
 * seventh is IPPROTO_ICMP, and all other members are zero.
 * NOTE(review): presumably these are the version/header-length and
 * protocol members of ipha_t -- confirm against its definition.
 */
static ipha_t icmp_ipha = {
IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP
};
/*
 * Flag descriptors for ip_ipif_report.
 * Each entry pairs a flag constant with its printable name. The table
 * mixes constants with IPIF_, ILLF_ and PHYI_ prefixes.
 */
static nv_t ipif_nv_tbl[] = {
{ IPIF_UP, "UP" },
{ IPIF_BROADCAST, "BROADCAST" },
{ ILLF_DEBUG, "DEBUG" },
{ PHYI_LOOPBACK, "LOOPBACK" },
{ IPIF_POINTOPOINT, "POINTOPOINT" },
{ ILLF_NOTRAILERS, "NOTRAILERS" },
{ PHYI_RUNNING, "RUNNING" },
{ ILLF_NOARP, "NOARP" },
{ PHYI_PROMISC, "PROMISC" },
{ PHYI_ALLMULTI, "ALLMULTI" },
{ PHYI_INTELLIGENT, "INTELLIGENT" },
{ ILLF_MULTICAST, "MULTICAST" },
{ PHYI_MULTI_BCAST, "MULTI_BCAST" },
{ IPIF_UNNUMBERED, "UNNUMBERED" },
{ IPIF_DHCPRUNNING, "DHCP" },
{ IPIF_PRIVATE, "PRIVATE" },
{ IPIF_NOXMIT, "NOXMIT" },
{ IPIF_NOLOCAL, "NOLOCAL" },
{ IPIF_DEPRECATED, "DEPRECATED" },
{ IPIF_PREFERRED, "PREFERRED" },
{ IPIF_TEMPORARY, "TEMPORARY" },
{ IPIF_ADDRCONF, "ADDRCONF" },
{ PHYI_VIRTUAL, "VIRTUAL" },
{ ILLF_ROUTER, "ROUTER" },
{ ILLF_NONUD, "NONUD" },
{ IPIF_ANYCAST, "ANYCAST" },
{ ILLF_NORTEXCH, "NORTEXCH" },
{ ILLF_IPV4, "IPV4" },
{ ILLF_IPV6, "IPV6" },
{ IPIF_MIPRUNNING, "MIP" },
{ IPIF_NOFAILOVER, "NOFAILOVER" },
{ PHYI_FAILED, "FAILED" },
{ PHYI_STANDBY, "STANDBY" },
{ PHYI_INACTIVE, "INACTIVE" },
{ PHYI_OFFLINE, "OFFLINE" },
};
/* Six bytes of 0xFF. */
static uchar_t ip_six_byte_all_ones[] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
/*
 * Per-media table. Each entry lists, in order: the DLPI mac type, the
 * IFT_* interface type, the IPv4 mapinfo function, the IPv6 mapinfo
 * function and the IPv6 interface-id function. SUNW_DL_VNI has no
 * functions at all (NULL).
 * NOTE(review): presumably searched by ip_m_lookup() with DL_OTHER, the
 * last entry, as the fallback -- confirm.
 */
static ip_m_t ip_m_tbl[] = {
{ DL_ETHER, IFT_ETHER, ip_ether_v4mapinfo, ip_ether_v6mapinfo,
ip_ether_v6intfid },
{ DL_CSMACD, IFT_ISO88023, ip_ether_v4mapinfo, ip_ether_v6mapinfo,
ip_nodef_v6intfid },
{ DL_TPB, IFT_ISO88024, ip_ether_v4mapinfo, ip_ether_v6mapinfo,
ip_nodef_v6intfid },
{ DL_TPR, IFT_ISO88025, ip_ether_v4mapinfo, ip_ether_v6mapinfo,
ip_nodef_v6intfid },
{ DL_FDDI, IFT_FDDI, ip_ether_v4mapinfo, ip_ether_v6mapinfo,
ip_ether_v6intfid },
{ DL_IB, IFT_IB, ip_ib_v4mapinfo, ip_ib_v6mapinfo,
ip_ib_v6intfid },
{ SUNW_DL_VNI, IFT_OTHER, NULL, NULL, NULL},
{ DL_OTHER, IFT_OTHER, ip_ether_v4mapinfo, ip_ether_v6mapinfo,
ip_nodef_v6intfid }
};
static ill_t ill_null; /* Empty ILL for init. */
char ipif_loopback_name[] = "lo0";
/* Suffixes for the per-interface forwarding variable names. */
static char *ipv4_forward_suffix = ":ip_forwarding";
static char *ipv6_forward_suffix = ":ip6_forwarding";
static kstat_t *loopback_ksp = NULL;
static sin6_t sin6_null; /* Zero address for quick clears */
static sin_t sin_null; /* Zero address for quick clears */
static uint_t ill_index = 1; /* Used to assign interface indices */
/* When set search for unused index */
static boolean_t ill_index_wrap = B_FALSE;
/*
 * NOTE(review): the original comment here read "When set search for unused
 * ipif_seqid", which does not describe ipif_zero; it looks orphaned.
 * ipif_zero is an ipif_t with static storage, hence all zero.
 */
static ipif_t ipif_zero;
/* State of the generator advanced by ipif_rand(). */
uint_t ipif_src_random;
/*
 * For details on the protection offered by these locks please refer
 * to the notes under the Synchronization section at the start of ip.c
 */
krwlock_t ill_g_lock; /* The global ill_g_lock */
kmutex_t ip_addr_avail_lock; /* Address availability check lock */
ipsq_t *ipsq_g_head; /* List of all ipsq's on the system */
krwlock_t ill_g_usesrc_lock; /* Protects usesrc related fields */
/*
 * illgrp_head/ifgrp_head is protected by IP's perimeter.
 * Note that only the IPv4 head is file-local (static).
 */
static ill_group_t *illgrp_head_v4; /* Head of IPv4 ill groups */
ill_group_t *illgrp_head_v6; /* Head of IPv6 ill groups */
ill_g_head_t ill_g_heads[MAX_G_HEADS]; /* ILL List Head */
/*
 * The ppa arena is created after this many
 * interfaces have been plumbed.
 */
uint_t ill_no_arena = 12;
#pragma align CACHE_ALIGN_SIZE(phyint_g_list)
static phyint_list_t phyint_g_list; /* start of phyint list */
/*
 * Reflects value of FAILBACK variable in IPMP config file
 * /etc/default/mpathd. Default value is B_TRUE.
 * Set to B_FALSE if user disabled failback by configuring "FAILBACK=no".
 * in.mpathd uses the SIOCSIPMPFAILBACK ioctl to pass this information to
 * the kernel.
 */
static boolean_t ipmp_enable_failback = B_TRUE;
/*
 * Enable soft rings if ip_squeue_soft_ring or ip_squeue_fanout
 * is set and ip_soft_rings_cnt > 0. ip_squeue_soft_ring is
 * set through platform specific code (Niagara/Ontario).
 */
#define SOFT_RINGS_ENABLED() (ip_soft_rings_cnt ? \
(ip_squeue_soft_ring || ip_squeue_fanout) : B_FALSE)
/* Both capability bits: soft rings and polling. */
#define ILL_CAPAB_DLS (ILL_CAPAB_SOFT_RING | ILL_CAPAB_POLL)
/*
 * Advance the ipif_src_random linear congruential generator by one step
 * and return a 15-bit value (0 .. 0x7fff) taken from bits 16-30 of the
 * new state. The multiplication wraps in unsigned arithmetic.
 */
static uint_t
ipif_rand(void)
{
	uint_t	next_state;

	next_state = (ipif_src_random * 1103515245) + 12345;
	ipif_src_random = next_state;

	return ((next_state >> 16) & 0x7fff);
}
/*
 * Make sure the per-interface IPv6 and ICMPv6 MIBs of an IPv6 ill exist.
 * Only used for ipv6; the ipsq may not yet be allocated (loopback case).
 *
 * Both MIBs are allocated zeroed with KM_NOSLEEP. If the second
 * allocation fails the first one is released again, so the ill ends up
 * with either both MIBs or neither.
 *
 * Returns B_TRUE when both MIBs are present on return, B_FALSE on
 * allocation failure.
 */
static boolean_t
ill_allocate_mibs(ill_t *ill)
{
	ASSERT(ill->ill_isv6);

	/* An earlier call already set both MIBs up. */
	if (ill->ill_ip6_mib != NULL) {
		ASSERT(ill->ill_icmp6_mib != NULL);
		return (B_TRUE);
	}

	ill->ill_ip6_mib = kmem_zalloc(sizeof (*ill->ill_ip6_mib),
	    KM_NOSLEEP);
	if (ill->ill_ip6_mib == NULL)
		return (B_FALSE);

	ill->ill_icmp6_mib = kmem_zalloc(sizeof (*ill->ill_icmp6_mib),
	    KM_NOSLEEP);
	if (ill->ill_icmp6_mib != NULL) {
		/*
		 * The ipv6Ifindex and ipv6IfIcmpIndex will be assigned later
		 * after the phyint merge occurs in ipif_set_values ->
		 * ill_glist_insert -> ill_phyint_reinit
		 */
		return (B_TRUE);
	}

	/* Second allocation failed: undo the first. */
	kmem_free(ill->ill_ip6_mib, sizeof (*ill->ill_ip6_mib));
	ill->ill_ip6_mib = NULL;
	return (B_FALSE);
}
/*
 * Common code for preparation of ARP commands. Two points to remember:
 * 1) The ill_name is tacked on at the end of the allocated space so
 *    the template's name_offset field must contain the total space
 *    to allocate less the name length.
 *
 * 2) The template's name_length field should contain the *template*
 *    length. We use it as a parameter to bcopy() and then write
 *    the real ill_name_length into the name_length field of the copy.
 * (Always called as writer.)
 *
 * ill      - interface whose name is appended to the command
 * template - one of the ARP templates; it is read through an arc_t
 *            overlay, so it must begin with the cmd / name offset /
 *            name length fields
 * addr     - optional protocol address; when non-NULL the message is
 *            treated as an area_t and the address is copied in
 *
 * Returns an M_PROTO mblk holding the command, or NULL if allocb() fails.
 */
mblk_t *
ill_arp_alloc(ill_t *ill, uchar_t *template, caddr_t addr)
{
arc_t *arc = (arc_t *)template;
char *cp;
int len;
mblk_t *mp;
uint_t name_length = ill->ill_name_length;
/* In a template, the name length slot carries the template's own size. */
uint_t template_len = arc->arc_name_length;
/* Total message size: everything up to the name, plus the name. */
len = arc->arc_name_offset + name_length;
mp = allocb(len, BPRI_HI);
if (mp == NULL)
return (NULL);
cp = (char *)mp->b_rptr;
mp->b_wptr = (uchar_t *)&cp[len];
if (template_len)
bcopy(template, cp, template_len);
/* Zero whatever lies beyond the template (int vs. uint_t comparison). */
if (len > template_len)
bzero(&cp[template_len], len - template_len);
mp->b_datap->db_type = M_PROTO;
/* From here on 'arc' refers to the copy, not to the template. */
arc = (arc_t *)cp;
arc->arc_name_length = name_length;
cp = (char *)arc + arc->arc_name_offset;
bcopy(ill->ill_name, cp, name_length);
if (addr) {
area_t *area = (area_t *)mp->b_rptr;
cp = (char *)area + area->area_proto_addr_offset;
bcopy(addr, cp, area->area_proto_addr_length);
if (area->area_cmd == AR_ENTRY_ADD) {
/*
 * Fill the mask with all ones. It lives at the mask offset when
 * one is given, otherwise directly after the protocol address.
 */
cp = (char *)area;
len = area->area_proto_addr_length;
if (area->area_proto_mask_offset)
cp += area->area_proto_mask_offset;
else
cp += area->area_proto_addr_offset + len;
while (len-- > 0)
*cp++ = (char)~0;
}
}
return (mp);
}
/*
 * Completely vaporize a lower level tap and all associated interfaces.
 * ill_delete is called only out of ip_close when the device control
 * stream is being closed.
 *
 * This is the first stage of the teardown: it flushes the ipsq, calls
 * ipif_free() on every ipif, drops the ILLF_NOARP mblk, resets mrouted
 * and conn state, brings the ill down, tells SCTP, and unlinks the ill
 * from its usesrc group. The remainder is done by ill_delete_tail().
 */
void
ill_delete(ill_t *ill)
{
ipif_t *ipif;
ill_t *prev_ill;
/*
 * ill_delete may be forcibly entering the ipsq. The previous
 * ioctl may not have completed and may need to be aborted.
 * ipsq_flush takes care of it. If we don't need to enter the
 * ipsq forcibly, the 2nd invocation of ipsq_flush in
 * ill_delete_tail is sufficient.
 */
ipsq_flush(ill);
/*
 * Nuke all interfaces. ipif_free will take down the interface,
 * remove it from the list, and free the data structure.
 * Walk down the ipif list and remove the logical interfaces
 * first before removing the main ipif. We can't unplumb
 * zeroth interface first in the case of IPv6 as reset_conn_ill
 * -> ip_ll_delmulti_v6 de-references ill_ipif for checking
 * POINTOPOINT.
 *
 * If ill_ipif was not properly initialized (i.e low on memory),
 * then no interfaces to clean up. In this case just clean up the
 * ill.
 *
 * NOTE(review): the loop reads ipif->ipif_next after ipif_free(ipif),
 * which is only safe if ipif_free() leaves the ipif allocated and
 * linked. ill_delete_tail() still walks ill_ipif and calls
 * ipif_free_tail() on each entry, which suggests the paragraph above
 * is stale on that point -- confirm against ipif_free().
 */
for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
ipif_free(ipif);
/*
 * Used only by ill_arp_on and ill_arp_off, which are writers.
 * So nobody can be using this mp now. Free the mp allocated for
 * honoring ILLF_NOARP
 */
freemsg(ill->ill_arp_on_mp);
ill->ill_arp_on_mp = NULL;
/* Clean up msgs on pending upcalls for mrouted */
reset_mrt_ill(ill);
/*
 * ipif_free -> reset_conn_ipif will remove all multicast
 * references for IPv4. For IPv6, we need to do it here as
 * it points only at ills.
 */
reset_conn_ill(ill);
/*
 * ill_down will arrange to blow off any IRE's dependent on this
 * ILL, and shut down fragmentation reassembly.
 */
ill_down(ill);
/* Let SCTP know, so that it can remove this from its list. */
sctp_update_ill(ill, SCTP_ILL_REMOVE);
/*
 * If an address on this ILL is being used as a source address then
 * clear out the pointers in other ILLs that point to this ILL.
 */
rw_enter(&ill_g_usesrc_lock, RW_WRITER);
if (ill->ill_usesrc_grp_next != NULL) {
if (ill->ill_usesrc_ifindex == 0) { /* usesrc ILL ? */
ill_disband_usesrc_group(ill);
} else { /* consumer of the usesrc ILL */
/*
 * Unlink this ill from the group's list.
 * NOTE(review): assumes ill_prev_usesrc() never returns NULL
 * for a member of a group -- confirm.
 */
prev_ill = ill_prev_usesrc(ill);
prev_ill->ill_usesrc_grp_next =
ill->ill_usesrc_grp_next;
}
}
rw_exit(&ill_g_usesrc_lock);
}
/*
 * ill_delete_tail is called from ip_modclose after all references
 * to the closing ill are gone. The wait is done in ip_modclose.
 *
 * Second stage of the ill teardown: finishes bringing the ipifs down,
 * waits for any DLPI unbind still in progress, disables the DLS
 * capabilities, sends the pending detach, releases the per-ill
 * capability structures, frees the ipifs, removes the ill from the
 * global list, and frees the fragment table, the retained control
 * mblks and the MIBs.
 *
 * Every pointer released here is reset to NULL afterwards, including
 * ill_nd_lla_mp, so that no stale pointer is left behind in the ill.
 */
void
ill_delete_tail(ill_t *ill)
{
	mblk_t	**mpp;
	ipif_t	*ipif;

	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
		ipif_down_tail(ipif);

	/*
	 * If polling capability is enabled (which signifies direct
	 * upcall into IP and driver has ill saved as a handle),
	 * we need to make sure that unbind has completed before we
	 * let the ill disappear and driver no longer has any reference
	 * to this ill.
	 */
	mutex_enter(&ill->ill_lock);
	while (ill->ill_state_flags & ILL_DL_UNBIND_IN_PROGRESS)
		cv_wait(&ill->ill_cv, &ill->ill_lock);
	mutex_exit(&ill->ill_lock);

	/*
	 * Clean up polling and soft ring capabilities
	 */
	if (ill->ill_capabilities & (ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING))
		ill_capability_dls_disable(ill);

	/*
	 * Send the detach if there's one to send (i.e., if we're above a
	 * style 2 DLPI driver).
	 */
	if (ill->ill_detach_mp != NULL) {
		ill_dlpi_send(ill, ill->ill_detach_mp);
		ill->ill_detach_mp = NULL;
	}

	if (ill->ill_net_type != IRE_LOOPBACK)
		qprocsoff(ill->ill_rq);

	/*
	 * We do an ipsq_flush once again now. New messages could have
	 * landed up from below (M_ERROR or M_HANGUP). Similarly ioctls
	 * could also have landed up if an ioctl thread had looked up
	 * the ill before we set the ILL_CONDEMNED flag, but not yet
	 * enqueued the ioctl when we did the ipsq_flush last time.
	 */
	ipsq_flush(ill);

	/*
	 * Free capabilities.
	 */
	if (ill->ill_ipsec_capab_ah != NULL) {
		ill_ipsec_capab_delete(ill, DL_CAPAB_IPSEC_AH);
		ill_ipsec_capab_free(ill->ill_ipsec_capab_ah);
		ill->ill_ipsec_capab_ah = NULL;
	}

	if (ill->ill_ipsec_capab_esp != NULL) {
		ill_ipsec_capab_delete(ill, DL_CAPAB_IPSEC_ESP);
		ill_ipsec_capab_free(ill->ill_ipsec_capab_esp);
		ill->ill_ipsec_capab_esp = NULL;
	}

	if (ill->ill_mdt_capab != NULL) {
		kmem_free(ill->ill_mdt_capab, sizeof (ill_mdt_capab_t));
		ill->ill_mdt_capab = NULL;
	}

	if (ill->ill_hcksum_capab != NULL) {
		kmem_free(ill->ill_hcksum_capab, sizeof (ill_hcksum_capab_t));
		ill->ill_hcksum_capab = NULL;
	}

	if (ill->ill_zerocopy_capab != NULL) {
		kmem_free(ill->ill_zerocopy_capab,
		    sizeof (ill_zerocopy_capab_t));
		ill->ill_zerocopy_capab = NULL;
	}

	if (ill->ill_dls_capab != NULL) {
		CONN_DEC_REF(ill->ill_dls_capab->ill_unbind_conn);
		ill->ill_dls_capab->ill_unbind_conn = NULL;
		/* The capability is followed by ILL_MAX_RINGS rx rings. */
		kmem_free(ill->ill_dls_capab,
		    sizeof (ill_dls_capab_t) +
		    (sizeof (ill_rx_ring_t) * ILL_MAX_RINGS));
		ill->ill_dls_capab = NULL;
	}

	ASSERT(!(ill->ill_capabilities & ILL_CAPAB_POLL));

	/* Keep freeing the head ipif until the list is empty. */
	while (ill->ill_ipif != NULL)
		ipif_free_tail(ill->ill_ipif);

	ill_down_tail(ill);

	/*
	 * We have removed all references to ilm from conn and the ones joined
	 * within the kernel.
	 *
	 * We don't walk conns, mrts and ires because
	 *
	 * 1) reset_conn_ill and reset_mrt_ill cleans up conns and mrts.
	 * 2) ill_down ->ill_downi walks all the ires and cleans up
	 *    ill references.
	 */
	ASSERT(ilm_walk_ill(ill) == 0);

	/*
	 * Take us out of the list of ILLs. ill_glist_delete -> ill_phyint_free
	 * could free the phyint. No more reference to the phyint after this
	 * point.
	 */
	(void) ill_glist_delete(ill);

	rw_enter(&ip_g_nd_lock, RW_WRITER);
	if (ill->ill_ndd_name != NULL)
		nd_unload(&ip_g_nd, ill->ill_ndd_name);
	rw_exit(&ip_g_nd_lock);

	if (ill->ill_frag_ptr != NULL) {
		uint_t count;

		for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) {
			mutex_destroy(&ill->ill_frag_hash_tbl[count].ipfb_lock);
		}
		mi_free(ill->ill_frag_ptr);
		ill->ill_frag_ptr = NULL;
		ill->ill_frag_hash_tbl = NULL;
	}

	/* Reset the pointer after freeing, as for every other free here. */
	if (ill->ill_nd_lla_mp != NULL) {
		freemsg(ill->ill_nd_lla_mp);
		ill->ill_nd_lla_mp = NULL;
	}

	/*
	 * Free all retained control messages: walk the mblk pointers from
	 * ill_first_mp_to_free through ill_last_mp_to_free inclusive. Each
	 * one heads a b_next chain; b_next/b_prev are cleared on every
	 * block of a message before it is handed to freemsg().
	 */
	mpp = &ill->ill_first_mp_to_free;
	do {
		while (mpp[0]) {
			mblk_t	*mp;
			mblk_t	*mp1;

			mp = mpp[0];
			mpp[0] = mp->b_next;
			for (mp1 = mp; mp1 != NULL; mp1 = mp1->b_cont) {
				mp1->b_next = NULL;
				mp1->b_prev = NULL;
			}
			freemsg(mp);
		}
	} while (mpp++ != &ill->ill_last_mp_to_free);

	ill_free_mib(ill);
	ILL_TRACE_CLEANUP(ill);
}
/*
 * Release the per-interface IPv6 and ICMPv6 MIBs of an ill, if present,
 * and leave both pointers NULL.
 */
static void
ill_free_mib(ill_t *ill)
{
	if (ill->ill_ip6_mib != NULL)
		kmem_free(ill->ill_ip6_mib, sizeof (*ill->ill_ip6_mib));
	ill->ill_ip6_mib = NULL;

	if (ill->ill_icmp6_mib != NULL)
		kmem_free(ill->ill_icmp6_mib, sizeof (*ill->ill_icmp6_mib));
	ill->ill_icmp6_mib = NULL;
}
/*
 * Build a DLPI address in 'dst' out of a physical address and a sap.
 *
 * The sign of sap_length selects the layout:
 *	sap_length == 0	==> no sap, physical address only
 *	sap_length > 0	==> sap at the head, physical address at
 *			    dst + sap_length
 *	sap_length < 0	==> physical address at the head, sap at
 *			    dst + phys_length
 *
 * A NULL phys_src produces phys_length zero bytes in place of the
 * physical address. The sap is always written as the low 16 bits of
 * sap_src, i.e. sizeof (uint16_t) bytes, whatever the magnitude of
 * sap_length is.
 */
static void
ill_dlur_copy_address(uchar_t *phys_src, uint_t phys_length,
    t_scalar_t sap_src, t_scalar_t sap_length, uchar_t *dst)
{
	uint16_t	sap_addr = (uint16_t)sap_src;
	char		*phys_dst = (char *)dst;

	if (sap_length > 0) {
		/* Leading sap: it goes in first, the address follows it. */
		bcopy(&sap_addr, dst, sizeof (sap_addr));
		phys_dst += sap_length;
	}

	if (phys_src != NULL)
		bcopy(phys_src, phys_dst, phys_length);
	else
		bzero(phys_dst, phys_length);

	if (sap_length < 0) {
		/* Trailing sap: appended right after the address. */
		bcopy(&sap_addr, (char *)dst + phys_length, sizeof (sap_addr));
	}
}
/*
 * Generate a dl_unitdata_req mblk for the device and address given.
 *
 * addr		- physical address, or NULL for an all zero address
 * addr_length	- length of the physical portion of the address
 * sap		- sap value to place in the DLPI address
 * sap_length	- sap length; its sign selects where the sap goes (see
 *		  ill_dlur_copy_address) and its absolute value is added
 *		  to the destination address length
 *
 * The message is sized from the addr_length passed in. After that, an
 * addr_length of 8 is treated as 6 to accommodate incompatible DLPI
 * drivers, so in that case the destination address is shorter than the
 * space reserved for it. The destination address immediately follows
 * the dl_unitdata_req_t header and both priority fields are zero.
 *
 * Returns the new mblk, or NULL if ip_dlpi_alloc() fails.
 */
mblk_t *
ill_dlur_gen(uchar_t *addr, uint_t addr_length, t_uscalar_t sap,
    t_scalar_t sap_length)
{
	t_scalar_t		abs_sap_length = ABS(sap_length);
	dl_unitdata_req_t	*dlur;
	mblk_t			*mp;

	mp = ip_dlpi_alloc(sizeof (*dlur) + addr_length + abs_sap_length,
	    DL_UNITDATA_REQ);
	if (mp == NULL)
		return (NULL);

	/* HACK: accommodate incompatible DLPI drivers */
	if (addr_length == 8)
		addr_length = 6;

	dlur = (dl_unitdata_req_t *)mp->b_rptr;
	dlur->dl_dest_addr_offset = sizeof (*dlur);
	dlur->dl_dest_addr_length = addr_length + abs_sap_length;
	dlur->dl_priority.dl_min = 0;
	dlur->dl_priority.dl_max = 0;

	/* The DLPI address is laid out right behind the request header. */
	ill_dlur_copy_address(addr, addr_length, sap, sap_length,
	    (uchar_t *)(dlur + 1));
	return (mp);
}
/*
 * Add 'add_mp' to the list of pending mp's headed by ill_pending_mp.
 * This is used only for non-exclusive ioctls. Currently this is used
 * for SIOC*ARP and SIOCGTUNPARAM ioctls. Most set ioctls are exclusive
 * and thus need to use ipsq_pending_mp_add.
 *
 * ill    - ill on which the request is pending; caller holds ill_lock
 * connp  - conn the request came from; must not be NULL and the caller
 *          holds conn_lock
 * add_mp - M_IOCDATA or M_IOCTL mblk with b_next and b_prev clear
 *
 * Returns B_FALSE, without queueing anything, if the conn has started
 * closing; otherwise queues the mblk and returns B_TRUE. Any number of
 * requests may be pending on the list at the same time.
 */
boolean_t
ill_pending_mp_add(ill_t *ill, conn_t *connp, mblk_t *add_mp)
{
	/*
	 * connp is dereferenced unconditionally below, so a NULL conn is
	 * a caller bug rather than a case to be handled.
	 */
	ASSERT(connp != NULL);
	ASSERT(MUTEX_HELD(&ill->ill_lock));
	ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL));
	/*
	 * M_IOCDATA from ioctls, M_IOCTL from tunnel ioctls.
	 */
	ASSERT((add_mp->b_datap->db_type == M_IOCDATA) ||
	    (add_mp->b_datap->db_type == M_IOCTL));

	ASSERT(MUTEX_HELD(&connp->conn_lock));
	/*
	 * Return error if the conn has started closing. The conn
	 * could have finished cleaning up the pending mp list,
	 * If so we should not add another mp to the list negating
	 * the cleanup.
	 */
	if (connp->conn_state_flags & CONN_CLOSING)
		return (B_FALSE);

	/*
	 * Add the pending mp to the head of the list, chained by b_next.
	 * Note down the write queue of the conn on which the ioctl request
	 * came, in b_queue. This will be used to later get the conn, when
	 * we get a response on the ill queue, from some other module
	 * (typically arp)
	 */
	add_mp->b_next = (void *)ill->ill_pending_mp;
	add_mp->b_queue = CONNP_TO_WQ(connp);
	ill->ill_pending_mp = add_mp;
	connp->conn_oper_pending_ill = ill;
	return (B_TRUE);
}
/*
 * Find a pending mblk on the ill's ill_pending_mp list, unlink it and
 * return it; returns NULL if nothing matches.
 *
 * With a non-zero ioc_id the match is on the ioc_id stored in the
 * mblk's iocblk, and *connpp is purely an output. With ioc_id == 0
 * (used by conn_ioctl_cleanup when a conn closes, since it does not
 * know the ioc_id) *connpp is an input and the match is on the conn
 * recorded in the mblk's b_queue.
 *
 * On a match *connpp is set to the conn the request came from and the
 * mblk's b_next and b_queue are cleared. Takes and drops ill_lock.
 */
mblk_t *
ill_pending_mp_get(ill_t *ill, conn_t **connpp, uint_t ioc_id)
{
	mblk_t	*prev = NULL;
	mblk_t	*curr;

	mutex_enter(&ill->ill_lock);
	if (ioc_id != 0)
		*connpp = NULL;

	/* Walk the list, remembering the predecessor for the unlink. */
	curr = ill->ill_pending_mp;
	while (curr != NULL) {
		uint_t	id = ((struct iocblk *)curr->b_rptr)->ioc_id;

		if (id == ioc_id)
			break;
		if (ioc_id == 0 && Q_TO_CONN(curr->b_queue) == *connpp)
			break;
		prev = curr;
		curr = curr->b_next;
	}

	if (curr == NULL) {
		mutex_exit(&ill->ill_lock);
		return (NULL);
	}

	/* Unlink the mblk from the pending mp list */
	if (prev == NULL) {
		ASSERT(ill->ill_pending_mp == curr);
		ill->ill_pending_mp = curr->b_next;
	} else {
		prev->b_next = curr->b_next;
	}

	/*
	 * conn refcnt must have been bumped up at the start of
	 * the ioctl. So we can safely access the conn.
	 */
	ASSERT(CONN_Q(curr->b_queue));
	*connpp = Q_TO_CONN(curr->b_queue);
	curr->b_next = NULL;
	curr->b_queue = NULL;

	mutex_exit(&ill->ill_lock);
	return (curr);
}
/*
 * Record 'add_mp' as the single pending mp of the ipsq that serializes
 * ipif's ill. Any exclusive ioctl that needs to wait for a response
 * from another module or driver uses this to park the ioctl mblk until
 * the response arrives. It is also used while waiting for the
 * ipif/ill/ire refcnts to drop to zero when bringing down an ipif.
 *
 * connp   - conn the request came from, or NULL; when non-NULL the
 *           caller holds conn_lock
 * ipif    - ipif being operated on; caller is writer on it and holds
 *           the ill_lock of its ill
 * q       - queue saved in the mblk's b_queue
 * add_mp  - M_IOCDATA/M_IOCTL from ioctls, or M_ERROR/M_HANGUP from
 *           the driver
 * waitfor - stored in ipsq_waitfor
 *
 * Returns B_FALSE, recording nothing, if connp has started closing;
 * B_TRUE otherwise.
 */
boolean_t
ipsq_pending_mp_add(conn_t *connp, ipif_t *ipif, queue_t *q, mblk_t *add_mp,
    int waitfor)
{
	ill_t	*pending_ill;
	ipsq_t	*ipsq;

	ASSERT(IAM_WRITER_IPIF(ipif));
	ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
	ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL));
	/*
	 * M_IOCDATA from ioctls, M_IOCTL from tunnel ioctls,
	 * M_ERROR/M_HANGUP from driver
	 */
	ASSERT((DB_TYPE(add_mp) == M_IOCDATA) || (DB_TYPE(add_mp) == M_IOCTL) ||
	    (DB_TYPE(add_mp) == M_ERROR) || (DB_TYPE(add_mp) == M_HANGUP));

	pending_ill = ipif->ipif_ill;
	ipsq = pending_ill->ill_phyint->phyint_ipsq;

	if (connp != NULL) {
		ASSERT(MUTEX_HELD(&connp->conn_lock));
		/*
		 * A closing conn may already have cleaned up its pending
		 * mp; recording another one now would negate that cleanup.
		 */
		if (connp->conn_state_flags & CONN_CLOSING)
			return (B_FALSE);
	}

	mutex_enter(&ipsq->ipsq_lock);

	/*
	 * The queue is noted down in b_queue and handed back by
	 * ipsq_pending_mp_get, so that the caller can restart the
	 * processing.
	 */
	add_mp->b_next = NULL;
	add_mp->b_queue = q;
	ipsq->ipsq_pending_mp = add_mp;
	ipsq->ipsq_pending_ipif = ipif;
	ipsq->ipsq_waitfor = waitfor;

	/*
	 * ipsq_current_ipif is needed to restart the operation from
	 * ipif_ill_refrele_tail when the last reference to the ipif/ill
	 * is gone. Driver messages are not ioctls, so it has not been
	 * set until now.
	 */
	switch (DB_TYPE(add_mp)) {
	case M_ERROR:
	case M_HANGUP:
		ASSERT(ipsq->ipsq_current_ipif == NULL);
		ipsq->ipsq_current_ipif = ipif;
		ipsq->ipsq_last_cmd = DB_TYPE(add_mp);
		break;
	default:
		break;
	}

	if (connp != NULL)
		connp->conn_oper_pending_ill = pending_ill;

	mutex_exit(&ipsq->ipsq_lock);
	return (B_TRUE);
}
/*
 * Detach and return the ipsq's pending mp, or NULL if there is none.
 * At most one mp is ever parked in ipsq_pending_mp.
 *
 * *connpp is set to the conn owning the mp when the mp's b_queue is a
 * conn queue, and to NULL otherwise.  b_queue itself is left untouched
 * on the returned mblk.
 */
mblk_t *
ipsq_pending_mp_get(ipsq_t *ipsq, conn_t **connpp)
{
	mblk_t	*pending;

	mutex_enter(&ipsq->ipsq_lock);
	*connpp = NULL;
	pending = ipsq->ipsq_pending_mp;
	if (pending != NULL) {
		/* There can be only 1 such excl message */
		ASSERT(pending->b_next == NULL);
		ipsq->ipsq_pending_ipif = NULL;
		ipsq->ipsq_pending_mp = NULL;
		ipsq->ipsq_waitfor = 0;
	}
	mutex_exit(&ipsq->ipsq_lock);

	if (pending == NULL)
		return (NULL);

	/*
	 * This mp did a refhold on the conn, at the start of the ioctl.
	 * So we can safely return a pointer to the conn to the caller.
	 */
	if (CONN_Q(pending->b_queue))
		*connpp = Q_TO_CONN(pending->b_queue);

	pending->b_next = NULL;
	pending->b_prev = NULL;
	return (pending);
}
/*
 * Cleanup the ioctl mp queued in ipsq_pending_mp
 * - Called in the ill_delete path
 * - Called in the M_ERROR or M_HANGUP path on the ill.
 * - Called in the conn close path.
 *
 * The caller must be writer on the ill.
 *
 * Returns B_TRUE if a pending mp was removed and disposed of, B_FALSE if
 * there was no pending mp or (for a non-NULL connp) the pending mp was not
 * queued on behalf of connp's write queue.
 */
boolean_t
ipsq_pending_mp_cleanup(ill_t *ill, conn_t *connp)
{
mblk_t *mp;
ipsq_t *ipsq;
queue_t *q;
ipif_t *ipif;
ASSERT(IAM_WRITER_ILL(ill));
ipsq = ill->ill_phyint->phyint_ipsq;
mutex_enter(&ipsq->ipsq_lock);
/*
 * If connp is null, unconditionally clean up the ipsq_pending_mp.
 * This happens in M_ERROR/M_HANGUP. We need to abort the current ioctl
 * even if it is meant for another ill, since we have to enqueue
 * a new mp now in ipsq_pending_mp to complete the ipif_down.
 * If connp is non-null we are called from the conn close path.
 */
mp = ipsq->ipsq_pending_mp;
if (mp == NULL || (connp != NULL &&
mp->b_queue != CONNP_TO_WQ(connp))) {
mutex_exit(&ipsq->ipsq_lock);
return (B_FALSE);
}
/* Now remove from the ipsq_pending_mp */
ipsq->ipsq_pending_mp = NULL;
/* Save the originating queue before scrubbing the mblk's link fields. */
q = mp->b_queue;
mp->b_next = NULL;
mp->b_prev = NULL;
mp->b_queue = NULL;
/*
 * If MOVE was in progress, clear the move_in_progress fields also.
 * Note that from here on 'ill' is the ill of the pending ipif, which
 * need not be the ill that was passed in.
 */
ill = ipsq->ipsq_pending_ipif->ipif_ill;
if (ill->ill_move_in_progress) {
ILL_CLEAR_MOVE(ill);
} else if (ill->ill_up_ipifs) {
ill_group_cleanup(ill);
}
/* Remember the pending ipif for ip_ioctl_finish, then reset the ipsq. */
ipif = ipsq->ipsq_pending_ipif;
ipsq->ipsq_pending_ipif = NULL;
ipsq->ipsq_waitfor = 0;
ipsq->ipsq_current_ipif = NULL;
mutex_exit(&ipsq->ipsq_lock);
if (DB_TYPE(mp) == M_IOCTL || DB_TYPE(mp) == M_IOCDATA) {
/*
 * Fail the ioctl with ENXIO.  In the conn close path the mode is
 * CONN_CLOSE and the pending ipif is passed along; otherwise the
 * mode is NO_COPYOUT and no ipif is passed.
 */
ip_ioctl_finish(q, mp, ENXIO, connp != NULL ? CONN_CLOSE :
NO_COPYOUT, connp != NULL ? ipif : NULL, NULL);
} else {
/*
 * IP-MT XXX In the case of TLI/XTI bind / optmgmt this can't
 * be just inet_freemsg. we have to restart it
 * otherwise the thread will be stuck.
 */
inet_freemsg(mp);
}
return (B_TRUE);
}
/*
 * The ill is closing: fail every ioctl still parked on ill_pending_mp.
 * Called exclusively towards the end of ill_delete.  The refcount has
 * gone to 0, so nobody knows this ill and hence nobody can add an mp to
 * this list.
 */
static void
ill_pending_mp_cleanup(ill_t *ill)
{
	mblk_t	*head;
	queue_t	*ioctl_q;

	ASSERT(IAM_WRITER_ILL(ill));

	/*
	 * Every mp on the pending mp list originating from an ioctl
	 * added 1 to the conn refcnt, at the start of the ioctl.
	 * So bump it down now. See comments in ip_wput_nondata()
	 */
	mutex_enter(&ill->ill_lock);
	while ((head = ill->ill_pending_mp) != NULL) {
		ill->ill_pending_mp = head->b_next;

		/* ill_lock is dropped around the call to ip_ioctl_finish. */
		mutex_exit(&ill->ill_lock);

		ioctl_q = head->b_queue;
		ASSERT(CONN_Q(ioctl_q));
		head->b_next = NULL;
		head->b_prev = NULL;
		head->b_queue = NULL;
		ip_ioctl_finish(ioctl_q, head, ENXIO, NO_COPYOUT, NULL, NULL);

		mutex_enter(&ill->ill_lock);
	}
	ill->ill_pending_ipif = NULL;
	mutex_exit(&ill->ill_lock);
}
/*
 * Remove from the ipsq's exclusive-operation queue (ipsq_xopq_mphead /
 * ipsq_xopq_mptail) every mblk that was queued on behalf of one
 * particular queue pair, and dispose of those mblks.
 *
 * Called in the conn close path and ill delete path.  With a non-NULL
 * connp the queue of interest is the conn's write queue; otherwise it is
 * the ill's write queue.  The caller must be writer on the ill.
 */
static void
ipsq_xopq_mp_cleanup(ill_t *ill, conn_t *connp)
{
ipsq_t *ipsq;
mblk_t *prev;
mblk_t *curr;
mblk_t *next;
queue_t *q;
mblk_t *tmp_list = NULL;
ASSERT(IAM_WRITER_ILL(ill));
if (connp != NULL)
q = CONNP_TO_WQ(connp);
else
q = ill->ill_wq;
ipsq = ill->ill_phyint->phyint_ipsq;
/*
 * Cleanup the ioctl mp's queued on the ipsq_xopq_mphead list if any.
 * In the case of ioctl from a conn, there can be only 1 mp
 * queued on the ipsq. If an ill is being unplumbed, only messages
 * related to this ill are flushed, like M_ERROR or M_HANGUP message.
 * ioctls meant for this ill from conns are not flushed. They will
 * be processed during ipsq_exit and will not find the ill and will
 * return error.
 */
mutex_enter(&ipsq->ipsq_lock);
/*
 * 'next' is captured before curr may be moved to tmp_list, and 'prev'
 * only advances past entries that stay on the xopq list.
 */
for (prev = NULL, curr = ipsq->ipsq_xopq_mphead; curr != NULL;
curr = next) {
next = curr->b_next;
/* An mblk matches if it was queued on either side of q's pair. */
if (curr->b_queue == q || curr->b_queue == RD(q)) {
/* Unlink the mblk from the pending mp list */
if (prev != NULL) {
prev->b_next = curr->b_next;
} else {
ASSERT(ipsq->ipsq_xopq_mphead == curr);
ipsq->ipsq_xopq_mphead = curr->b_next;
}
/* Removing the last entry moves the tail back (NULL if now empty). */
if (ipsq->ipsq_xopq_mptail == curr)
ipsq->ipsq_xopq_mptail = prev;
/*
 * Create a temporary list and release the ipsq lock
 * New elements are added to the head of the tmp_list,
 * so the list ends up in reverse queue order.
 */
curr->b_next = tmp_list;
tmp_list = curr;
} else {
prev = curr;
}
}
mutex_exit(&ipsq->ipsq_lock);
/* Dispose of the collected mblks with the ipsq lock dropped. */
while (tmp_list != NULL) {
curr = tmp_list;
tmp_list = curr->b_next;
curr->b_next = NULL;
curr->b_prev = NULL;
curr->b_queue = NULL;
if (DB_TYPE(curr) == M_IOCTL || DB_TYPE(curr) == M_IOCDATA) {
/* Fail the ioctl with ENXIO; CONN_CLOSE mode in the conn close path. */
ip_ioctl_finish(q, curr, ENXIO, connp != NULL ?
CONN_CLOSE : NO_COPYOUT, NULL, NULL);
} else {
/*
 * IP-MT XXX In the case of TLI/XTI bind / optmgmt
 * this can't be just inet_freemsg. we have to
 * restart it otherwise the thread will be stuck.
 */
inet_freemsg(curr);
}
}
}
/*
 * This conn has started closing. Cleanup any pending ioctl from this conn.
 * STREAMS ensures that there can be at most 1 ioctl pending on a stream.
 *
 * The pending ioctl is looked for, in order, on the ill's pending mp list,
 * in ipsq_pending_mp, and on the ipsq's xopq list.  If the ipsq cannot be
 * entered, this function blocks until conn_oper_pending_ill becomes NULL.
 */
void
conn_ioctl_cleanup(conn_t *connp)
{
mblk_t *curr;
ipsq_t *ipsq;
ill_t *ill;
boolean_t refheld;
/*
 * Is any exclusive ioctl pending ? If so clean it up. If the
 * ioctl has not yet started, the mp is pending in the list headed by
 * ipsq_xopq_head. If the ioctl has started the mp could be present in
 * ipsq_pending_mp. If the ioctl timed out in the streamhead but
 * is currently executing now the mp is not queued anywhere but
 * conn_oper_pending_ill is null. The conn close will wait
 * till the conn_ref drops to zero.
 */
mutex_enter(&connp->conn_lock);
ill = connp->conn_oper_pending_ill;
if (ill == NULL) {
/* No operation is pending on any ill for this conn. */
mutex_exit(&connp->conn_lock);
return;
}
/*
 * An ioc_id of 0 asks ill_pending_mp_get to match on the conn instead
 * of on an ioctl id; on a hit it stores the owning conn back into connp.
 */
curr = ill_pending_mp_get(ill, &connp, 0);
if (curr != NULL) {
/* Found on the ill's list: drop a conn reference and free the mp. */
mutex_exit(&connp->conn_lock);
CONN_DEC_REF(connp);
inet_freemsg(curr);
return;
}
/*
 * We may not be able to refhold the ill if the ill/ipif
 * is changing. But we need to make sure that the ill will
 * not vanish. So we just bump up the ill_waiter count.
 */
refheld = ill_waiter_inc(ill);
mutex_exit(&connp->conn_lock);
if (refheld) {
if (ipsq_enter(ill, B_TRUE)) {
/* Now inside the ipsq; the waiter count is released right away. */
ill_waiter_dcr(ill);
/*
 * Check whether this ioctl has started and is
 * pending now in ipsq_pending_mp. If it is not
 * found there then check whether this ioctl has
 * not even started and is in the ipsq_xopq list.
 */
if (!ipsq_pending_mp_cleanup(ill, connp))
ipsq_xopq_mp_cleanup(ill, connp);
ipsq = ill->ill_phyint->phyint_ipsq;
ipsq_exit(ipsq, B_TRUE, B_TRUE);
return;
}
}
/*
 * The ill is also closing and we could not bump up the
 * ill_waiter_count or we could not enter the ipsq. Leave
 * the cleanup to ill_delete
 */
mutex_enter(&connp->conn_lock);
/* Sleep on conn_refcv until the pending-ill pointer has been cleared. */
while (connp->conn_oper_pending_ill != NULL)
cv_wait(&connp->conn_refcv, &connp->conn_lock);
mutex_exit(&connp->conn_lock);
/* The waiter count is still held if ipsq_enter failed; release it. */
if (refheld)
ill_waiter_dcr(ill);
}
/*
 * ipcl_walk callback: drop every reference that connp keeps to the ill
 * passed in arg.  All of the conn_*_ill pointers that name this ill are
 * cleared, and a cached IRE that depends on the ill is released.
 */
static void
conn_cleanup_ill(conn_t *connp, caddr_t arg)
{
	ill_t	*ill = (ill_t *)arg;
	ire_t	*cached;
	ire_t	*stale = NULL;

	mutex_enter(&connp->conn_lock);

	if (connp->conn_multicast_ill == ill) {
		/* Revert to late binding */
		connp->conn_multicast_ill = NULL;
		connp->conn_orig_multicast_ifindex = 0;
	}
	if (connp->conn_incoming_ill == ill)
		connp->conn_incoming_ill = NULL;
	if (connp->conn_outgoing_ill == ill)
		connp->conn_outgoing_ill = NULL;
	if (connp->conn_outgoing_pill == ill)
		connp->conn_outgoing_pill = NULL;
	if (connp->conn_nofailover_ill == ill)
		connp->conn_nofailover_ill = NULL;
	if (connp->conn_xmit_if_ill == ill)
		connp->conn_xmit_if_ill = NULL;

	/*
	 * ip_newroute creates IRE_CACHE with ire_stq coming from
	 * interface X and ipif coming from interface Y, if interface
	 * X and Y are part of the same IPMP group. Thus whenever
	 * interface X goes down, remove all references to it by
	 * checking both on ire_ipif and ire_stq.
	 */
	cached = connp->conn_ire_cache;
	if (cached != NULL &&
	    ((cached->ire_ipif != NULL && cached->ire_ipif->ipif_ill == ill) ||
	    (cached->ire_type == IRE_CACHE &&
	    cached->ire_stq == ill->ill_wq))) {
		connp->conn_ire_cache = NULL;
		stale = cached;
	}

	mutex_exit(&connp->conn_lock);

	/* The IRE reference is released after conn_lock has been dropped. */
	if (stale != NULL)
		ire_refrele_notr(stale);
}
/*
 * Finish bringing down every ipif on the ill behind q, then the ill
 * itself.  Frees mp and clears ipsq_current_ipif.  Must run as writer
 * on the ipsq.
 */
/* ARGSUSED */
void
ipif_all_down_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
{
	ill_t	*ill = q->q_ptr;
	ipif_t	*ipif;

	ASSERT(IAM_WRITER_IPSQ(ipsq));

	ipif = ill->ill_ipif;
	while (ipif != NULL) {
		ipif_down_tail(ipif);
		ipif = ipif->ipif_next;
	}
	ill_down_tail(ill);

	freemsg(mp);
	ipsq->ipsq_current_ipif = NULL;
}
/*
 * ill_down_start is called when we want to down this ill and bring it up
 * again.  It is called when we receive an M_ERROR / M_HANGUP. In this case
 * we shut down all interfaces, but don't tear down any plumbing.
 *
 * Returns B_TRUE if the ill is already quiescent.  Otherwise mp is parked
 * as the ipsq's pending mp (waiting for ILL_DOWN) and B_FALSE is returned.
 */
boolean_t
ill_down_start(queue_t *q, mblk_t *mp)
{
	ill_t		*ill = q->q_ptr;
	ipif_t		*ipif;
	boolean_t	quiesced = B_TRUE;

	ASSERT(IAM_WRITER_ILL(ill));

	ipif = ill->ill_ipif;
	while (ipif != NULL) {
		(void) ipif_down(ipif, NULL, NULL);
		ipif = ipif->ipif_next;
	}
	ill_down(ill);

	/* Get rid of any pending mp so that ours can take its place. */
	(void) ipsq_pending_mp_cleanup(ill, NULL);

	/*
	 * Atomically test and add the pending mp if references are
	 * still active.
	 */
	mutex_enter(&ill->ill_lock);
	if (!ill_is_quiescent(ill)) {
		/*
		 * The add cannot fail: it only refuses a closing conn,
		 * and we are passing a null connp.
		 */
		(void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq,
		    mp, ILL_DOWN);
		quiesced = B_FALSE;
	}
	mutex_exit(&ill->ill_lock);

	return (quiesced);
}
/*
 * Remove everything that depends on this ill: IREs in the main tables,
 * in the reverse tunnel table and in the interface based forwarding
 * tables, the conn_*_ill references, and the ill's group membership.
 */
static void
ill_down(ill_t *ill)
{
	boolean_t	have_mrtun;
	boolean_t	have_srcif;

	/* Blow off any IREs dependent on this ILL. */
	ire_walk(ill_downi, (char *)ill);

	/* The counter is sampled under its lock; the walk runs unlocked. */
	mutex_enter(&ire_mrtun_lock);
	have_mrtun = (ire_mrtun_count != 0);
	mutex_exit(&ire_mrtun_lock);
	if (have_mrtun) {
		ire_walk_ill_mrtun(0, 0, ill_downi_mrtun_srcif,
		    (char *)ill, NULL);
	}

	/*
	 * If any interface based forwarding table exists
	 * Blow off the ires there dependent on this ill
	 */
	mutex_enter(&ire_srcif_table_lock);
	have_srcif = (ire_srcif_table_count > 0);
	mutex_exit(&ire_srcif_table_lock);
	if (have_srcif)
		ire_walk_srcif_table_v4(ill_downi_mrtun_srcif, (char *)ill);

	/* Remove any conn_*_ill depending on this ill */
	ipcl_walk(conn_cleanup_ill, (caddr_t)ill);

	if (ill->ill_group != NULL)
		illgrp_delete(ill);
}
/*
 * Destroy the ill's ill_srcif_table, if it has one, and reset the srcif
 * and mrtun reference counts that go with it.
 */
static void
ill_down_tail(ill_t *ill)
{
	int	bucket;

	/* Lock not reqd really because nobody should be able to access */
	mutex_enter(&ill->ill_lock);
	if (ill->ill_srcif_table == NULL) {
		mutex_exit(&ill->ill_lock);
		return;
	}

	ill->ill_srcif_refcnt = 0;
	for (bucket = 0; bucket < IP_SRCIF_TABLE_SIZE; bucket++)
		rw_destroy(&ill->ill_srcif_table[bucket].irb_lock);
	kmem_free(ill->ill_srcif_table, IP_SRCIF_TABLE_SIZE * sizeof (irb_t));
	ill->ill_srcif_table = NULL;
	ill->ill_mrtun_refcnt = 0;

	mutex_exit(&ill->ill_lock);
}
/*
 * ire_walk routine used to delete every IRE that depends on queues
 * associated with 'ill'. (Always called as writer.)
 *
 * ip_newroute creates IRE_CACHE with ire_stq coming from interface X and
 * ipif coming from interface Y, if interface X and Y are part of the same
 * IPMP group. Thus whenever interface X goes down, all references to it
 * are removed by checking both ire_ipif and ire_stq.
 */
static void
ill_downi(ire_t *ire, char *ill_arg)
{
	ill_t	*ill = (ill_t *)ill_arg;

	/* The IRE's ipif lives on this ill. */
	if (ire->ire_ipif != NULL && ire->ire_ipif->ipif_ill == ill) {
		ire_delete(ire);
		return;
	}

	/* An IRE_CACHE that transmits through this ill's write queue. */
	if (ire->ire_type == IRE_CACHE && ire->ire_stq == ill->ill_wq)
		ire_delete(ire);
}
/*
 * A separate routine for deleting revtun and srcif based routes is
 * needed because these ires are only deleted when the interface is
 * unplumbed. These ires also have ire_in_ill non-null, and we want to
 * keep mobile IP specific code separate.
 *
 * The ire is deleted when its incoming ill is 'ill', or when its ire_stq
 * is either of the ill's queues.
 */
static void
ill_downi_mrtun_srcif(ire_t *ire, char *ill_arg)
{
	ill_t	*ill = (ill_t *)ill_arg;

	ASSERT(ire->ire_in_ill != NULL);

	/* Nothing to do unless the ire refers to this ill in some way. */
	if (!(ire->ire_in_ill != NULL && ire->ire_in_ill == ill) &&
	    ire->ire_stq != ill->ill_wq && ire->ire_stq != ill->ill_rq)
		return;

	ire_delete(ire);
}
/*
 * Remove ire/nce from the fastpath list: dispatch over the ill's
 * fastpath list (NCEs for an IPv6 ill, IREs otherwise) with no update
 * function and no argument.
 */
void
ill_fastpath_nack(ill_t *ill)
{
	if (!ill->ill_isv6) {
		ire_fastpath_list_dispatch(ill, NULL, NULL);
		return;
	}
	nce_fastpath_list_dispatch(ill, NULL, NULL);
}
/*
 * Consume an M_IOCACK of the fastpath probe.
 *
 * mp is the M_IOCACK; the blocks linked from its b_cont are the data
 * handed to the fastpath update routines.  The whole message is consumed
 * on every path.
 */
void
ill_fastpath_ack(ill_t *ill, mblk_t *mp)
{
	mblk_t	*ack_mp = mp;

	/*
	 * If this was the first attempt turn on the fastpath probing.
	 */
	mutex_enter(&ill->ill_lock);
	if (ill->ill_dlpi_fastpath_state == IDMS_INPROGRESS)
		ill->ill_dlpi_fastpath_state = IDMS_OK;
	mutex_exit(&ill->ill_lock);

	/* Free the M_IOCACK mblk, hold on to the data */
	mp = mp->b_cont;
	freeb(ack_mp);
	if (mp == NULL)
		return;

	if (mp->b_cont == NULL) {
		ip0dbg(("ill_fastpath_ack: no b_cont\n"));
		freeb(mp);
		return;
	}

	/*
	 * Update all IRE's or NCE's that are waiting for
	 * fastpath update.
	 */
	if (ill->ill_isv6) {
		/*
		 * update nce's in the fastpath list.
		 */
		nce_fastpath_list_dispatch(ill, ndp_fastpath_update, mp);
	} else {
		/*
		 * update ire's in the fastpath list.
		 */
		ire_fastpath_list_dispatch(ill, ire_fastpath_update, mp);
		/*
		 * Check if we need to traverse reverse tunnel table.
		 * Since there is only single ire_type (IRE_MIPRTUN)
		 * in the table, we don't need to match on ire_type.
		 * We have to check ire_mrtun_count and not the
		 * ill_mrtun_refcnt since ill_mrtun_refcnt is set
		 * on the incoming ill and here we are dealing with
		 * outgoing ill.
		 */
		mutex_enter(&ire_mrtun_lock);
		if (ire_mrtun_count != 0) {
			mutex_exit(&ire_mrtun_lock);
			ire_walk_ill_mrtun(MATCH_IRE_WQ, IRE_MIPRTUN,
			    (void (*)(ire_t *, void *))
			    ire_fastpath_update, mp, ill);
		} else {
			mutex_exit(&ire_mrtun_lock);
		}
	}

	/*
	 * Release the entire data message.  Freeing just the first two
	 * blocks one at a time would leak any block chained after them.
	 */
	freemsg(mp);
}
/*
 * Throw an M_IOCTL message downstream asking "do you know fastpath?"
 * The data portion of the request is a copy of dlur_mp, the
 * dl_unitdata_req_t template for what we would send downstream in the
 * absence of a fastpath confirmation.
 *
 * Returns 0 once the probe has been sent, EINVAL for a NULL template,
 * ENOTSUP if the fastpath state is IDMS_FAILED, and EAGAIN if an mblk
 * could not be allocated.
 */
int
ill_fastpath_probe(ill_t *ill, mblk_t *dlur_mp)
{
	mblk_t		*ioc_mp;
	mblk_t		*tmpl_mp;
	struct iocblk	*iocp;

	if (dlur_mp == NULL)
		return (EINVAL);

	mutex_enter(&ill->ill_lock);
	if (ill->ill_dlpi_fastpath_state == IDMS_FAILED) {
		/*
		 * Driver NAKed the first fastpath ioctl - assume it doesn't
		 * support it.
		 */
		mutex_exit(&ill->ill_lock);
		return (ENOTSUP);
	}
	if (ill->ill_dlpi_fastpath_state == IDMS_UNKNOWN) {
		/* This is the first probe */
		ill->ill_dlpi_fastpath_state = IDMS_INPROGRESS;
	}
	mutex_exit(&ill->ill_lock);

	ioc_mp = mkiocb(DL_IOC_HDR_INFO);
	if (ioc_mp == NULL)
		return (EAGAIN);

	tmpl_mp = copyb(dlur_mp);
	if (tmpl_mp == NULL) {
		freeb(ioc_mp);
		return (EAGAIN);
	}
	ioc_mp->b_cont = tmpl_mp;

	iocp = (struct iocblk *)ioc_mp->b_rptr;
	iocp->ioc_count = msgdsize(ioc_mp->b_cont);

	putnext(ill->ill_wq, ioc_mp);
	return (0);
}
/*
 * Start capability negotiation with the driver by sending an empty
 * DL_CAPABILITY_REQ.  This is done only while the capability state is
 * IDMS_UNKNOWN or IDMS_RENEG; in any other state the call is a no-op.
 */
void
ill_capability_probe(ill_t *ill)
{
	if (ill->ill_capab_state == IDMS_UNKNOWN ||
	    ill->ill_capab_state == IDMS_RENEG) {
		ill->ill_capab_state = IDMS_INPROGRESS;
		ip1dbg(("ill_capability_probe: starting capability negotiation\n"));
		ill_capability_proto(ill, DL_CAPABILITY_REQ, NULL);
	}
}
/*
 * Turn off every negotiated capability on the ill and tell the driver
 * about it with a single DL_CAPABILITY_REQ.
 */
void
ill_capability_reset(ill_t *ill)
{
	mblk_t	*subcaps = NULL;
	mblk_t	*flat;

	/*
	 * Note here that we reset the state to UNKNOWN, and later send
	 * down the DL_CAPABILITY_REQ without first setting the state to
	 * INPROGRESS.  We do this in order to distinguish the
	 * DL_CAPABILITY_ACK response which may come back in response to
	 * a "reset" apart from the "probe" DL_CAPABILITY_REQ.  This would
	 * also handle the case where the driver doesn't send us back
	 * a DL_CAPABILITY_ACK in response, since the "probe" routine
	 * requires the state to be in UNKNOWN anyway.  In any case, all
	 * features are turned off until the state reaches IDMS_OK.
	 */
	ill->ill_capab_state = IDMS_UNKNOWN;

	/*
	 * Disable sub-capabilities and request a list of sub-capability
	 * messages which will be sent down to the driver.  Each handler
	 * allocates the corresponding dl_capability_sub_t inside an
	 * mblk, and links it to the existing mblk chain, or returns it
	 * as the head of the chain if it's the first sub-capability (the
	 * passed in pointer is NULL).  Upon returning from all capability
	 * handlers, the chain is pulled up before passing it downstream.
	 */
	ill_capability_mdt_reset(ill, &subcaps);
	ill_capability_hcksum_reset(ill, &subcaps);
	ill_capability_zerocopy_reset(ill, &subcaps);
	ill_capability_ipsec_reset(ill, &subcaps);
	ill_capability_dls_reset(ill, &subcaps);

	/* Nothing to send down in order to disable the capabilities? */
	if (subcaps == NULL)
		return;

	/* Flatten the chain; the original chain is freed either way. */
	flat = msgpullup(subcaps, -1);
	freemsg(subcaps);
	if (flat == NULL) {
		cmn_err(CE_WARN, "ill_capability_reset: unable to send down "
		    "DL_CAPABILITY_REQ (ENOMEM)\n");
		return;
	}

	ip1dbg(("ill_capability_reset: resetting negotiated capabilities\n"));
	ill_capability_proto(ill, DL_CAPABILITY_REQ, flat);
}
/*
 * Request or set new-style hardware capabilities supported by DLS provider.
 *
 * Builds a DLPI message of the given type whose dl_capability_req_t
 * header is followed by a copy of the first block of reqp (if any), and
 * sends it to the ill.  reqp is consumed: its first block is freed after
 * the copy and any continuation blocks are linked onto the new message.
 * If the allocation fails, reqp is freed and nothing is sent.
 */
static void
ill_capability_proto(ill_t *ill, int type, mblk_t *reqp)
{
	mblk_t			*mp;
	dl_capability_req_t	*capb;
	size_t			size;
	uint8_t			*ptr;

	size = (reqp != NULL) ? MBLKL(reqp) : 0;

	mp = ip_dlpi_alloc(sizeof (dl_capability_req_t) + size, type);
	if (mp == NULL) {
		freemsg(reqp);
		return;
	}

	capb = (dl_capability_req_t *)mp->b_rptr;
	ptr = mp->b_rptr + sizeof (dl_capability_req_t);

	if (reqp != NULL) {
		/* The sub-capabilities go right after the fixed header. */
		capb->dl_sub_offset = sizeof (dl_capability_req_t);
		capb->dl_sub_length = size;
		bcopy(reqp->b_rptr, ptr, size);
		ptr += size;
		mp->b_cont = reqp->b_cont;
		freeb(reqp);
	}
	ASSERT(ptr == mp->b_wptr);

	ill_dlpi_send(ill, mp);
}
/*
 * Process a DL_CAPAB_ID_WRAPPER sub-capability of a DL_CAPABILITY_ACK.
 * 'outers' points to the wrapper inside mp.  After the wrapper has been
 * validated and its module id token checked against ill_lmod_rq, the
 * encapsulated sub-capability is passed to ill_capability_dispatch.
 * Malformed or unexpected wrappers are discarded.
 */
static void
ill_capability_id_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *outers)
{
	dl_capab_id_t		*id_ic;
	uint_t			sub_dl_cap = outers->dl_cap;
	dl_capability_sub_t	*inners;
	uint8_t			*capend;

	ASSERT(sub_dl_cap == DL_CAPAB_ID_WRAPPER);

	/*
	 * Note: range checks here are not absolutely sufficient to
	 * make us robust against malformed messages sent by drivers;
	 * this is in keeping with the rest of IP's dlpi handling.
	 * (Remember, it's coming from something else in the kernel
	 * address space)
	 */
	capend = (uint8_t *)(outers + 1) + outers->dl_length;
	if (capend > mp->b_wptr) {
		cmn_err(CE_WARN, "ill_capability_id_ack: "
		    "malformed sub-capability too long for mblk");
		return;
	}

	id_ic = (dl_capab_id_t *)(outers + 1);

	/*
	 * The wrapper must be large enough to hold a dl_capab_id_t before
	 * the encapsulated header may be looked at.  This is checked on
	 * its own so that the warning for this case never touches the
	 * inner sub-capability.
	 */
	if (outers->dl_length < sizeof (*id_ic)) {
		cmn_err(CE_WARN, "ill_capability_id_ack: malformed "
		    "sub-capability too short for id wrapper");
		return;
	}

	inners = &id_ic->id_subcap;
	if (inners->dl_length > (outers->dl_length - sizeof (*inners))) {
		cmn_err(CE_WARN, "ill_capability_id_ack: malformed "
		    "encapsulated capab type %d too long for mblk",
		    inners->dl_cap);
		return;
	}

	if (!dlcapabcheckqid(&id_ic->id_mid, ill->ill_lmod_rq)) {
		ip1dbg(("ill_capability_id_ack: mid token for capab type %d "
		    "isn't as expected; pass-thru module(s) detected, "
		    "discarding capability\n", inners->dl_cap));
		return;
	}

	/* Process the encapsulated sub-capability */
	ill_capability_dispatch(ill, mp, inners, B_TRUE);
}
/*
 * Process Multidata Transmit capability negotiation ack received from a
 * DLS Provider. isub must point to the sub-capability (DL_CAPAB_MDT) of a
 * DL_CAPABILITY_ACK message.
 *
 * An ack without the ENABLE flag is answered with a DL_CAPABILITY_REQ
 * that echoes the sub-capability with ENABLE set.  An ack with the ENABLE
 * flag records the MDT parameters in the ill and turns ILL_CAPAB_MDT on.
 */
static void
ill_capability_mdt_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
{
	ill_mdt_capab_t	**capabp = (ill_mdt_capab_t **)&ill->ill_mdt_capab;
	ill_mdt_capab_t	*capab;
	dl_capab_mdt_t	*in_mdt = (dl_capab_mdt_t *)(isub + 1);
	uint_t		sub_dl_cap = isub->dl_cap;
	uint8_t		*sub_end;

	ASSERT(sub_dl_cap == DL_CAPAB_MDT);

	/*
	 * Note: range checks here are not absolutely sufficient to
	 * make us robust against malformed messages sent by drivers;
	 * this is in keeping with the rest of IP's dlpi handling.
	 * (Remember, it's coming from something else in the kernel
	 * address space)
	 */
	sub_end = (uint8_t *)(isub + 1) + isub->dl_length;
	if (sub_end > mp->b_wptr) {
		cmn_err(CE_WARN, "ill_capability_mdt_ack: "
		    "malformed sub-capability too long for mblk");
		return;
	}

	if (in_mdt->mdt_version != MDT_VERSION_2) {
		cmn_err(CE_CONT, "ill_capability_mdt_ack: "
		    "unsupported MDT sub-capability (version %d, expected %d)",
		    in_mdt->mdt_version, MDT_VERSION_2);
		return;
	}

	if (!dlcapabcheckqid(&in_mdt->mdt_mid, ill->ill_lmod_rq)) {
		ip1dbg(("ill_capability_mdt_ack: mid token for MDT "
		    "capability isn't as expected; pass-thru module(s) "
		    "detected, discarding capability\n"));
		return;
	}

	if (!(in_mdt->mdt_flags & DL_CAPAB_MDT_ENABLE)) {
		/*
		 * MDT is on offer but not enabled yet: build a request
		 * laid out as dl_capability_req_t, dl_capability_sub_t,
		 * dl_capab_mdt_t and ask for it to be enabled.
		 */
		mblk_t			*req_mp;
		dl_capability_req_t	*req;
		dl_capab_mdt_t		*out_mdt;
		uchar_t			*cursor;
		uint_t			req_len;

		req_len = sizeof (dl_capability_req_t) +
		    sizeof (dl_capability_sub_t) + sizeof (dl_capab_mdt_t);

		req_mp = ip_dlpi_alloc(req_len, DL_CAPABILITY_REQ);
		if (req_mp == NULL) {
			cmn_err(CE_WARN, "ill_capability_mdt_ack: "
			    "could not enable MDT for %s (ENOMEM)\n",
			    ill->ill_name);
			return;
		}

		cursor = req_mp->b_rptr;

		/* initialize dl_capability_req_t */
		req = (dl_capability_req_t *)cursor;
		req->dl_sub_offset = sizeof (dl_capability_req_t);
		req->dl_sub_length = sizeof (dl_capability_sub_t) +
		    sizeof (dl_capab_mdt_t);
		cursor += sizeof (dl_capability_req_t);

		/* initialize dl_capability_sub_t */
		bcopy(isub, cursor, sizeof (*isub));
		cursor += sizeof (*isub);

		/* initialize dl_capab_mdt_t */
		out_mdt = (dl_capab_mdt_t *)cursor;
		bcopy(in_mdt, out_mdt, sizeof (*in_mdt));

		ip1dbg(("ill_capability_mdt_ack: asking interface %s "
		    "to enable MDT version %d\n", ill->ill_name,
		    MDT_VERSION_2));

		/* set ENABLE flag */
		out_mdt->mdt_flags |= DL_CAPAB_MDT_ENABLE;

		/* req_mp is a DL_CAPABILITY_REQ message to enable MDT */
		ill_dlpi_send(ill, req_mp);
		return;
	}

	/* The provider has confirmed that MDT is enabled. */
	if (*capabp == NULL) {
		*capabp = kmem_zalloc(sizeof (ill_mdt_capab_t), KM_NOSLEEP);
		if (*capabp == NULL) {
			cmn_err(CE_WARN, "ill_capability_mdt_ack: "
			    "could not enable MDT version %d "
			    "for %s (ENOMEM)\n", MDT_VERSION_2,
			    ill->ill_name);
			return;
		}
	}
	capab = *capabp;

	ip1dbg(("ill_capability_mdt_ack: interface %s supports "
	    "MDT version %d (%d bytes leading, %d bytes trailing "
	    "header spaces, %d max pld bufs, %d span limit)\n",
	    ill->ill_name, MDT_VERSION_2,
	    in_mdt->mdt_hdr_head, in_mdt->mdt_hdr_tail,
	    in_mdt->mdt_max_pld, in_mdt->mdt_span_limit));

	capab->ill_mdt_version = MDT_VERSION_2;
	capab->ill_mdt_on = 1;
	/*
	 * Round the header spaces up to a multiple of 4 bytes; ULP
	 * may further adjust them to accommodate additional
	 * protocol headers.  We pass these values to ULP during
	 * bind time.
	 */
	capab->ill_mdt_hdr_head = roundup(in_mdt->mdt_hdr_head, 4);
	capab->ill_mdt_hdr_tail = roundup(in_mdt->mdt_hdr_tail, 4);
	capab->ill_mdt_max_pld = in_mdt->mdt_max_pld;
	capab->ill_mdt_span_limit = in_mdt->mdt_span_limit;

	ill->ill_capabilities |= ILL_CAPAB_MDT;
}
/*
 * Disable MDT on the ill and append a DL_CAPAB_MDT sub-capability with
 * its flags cleared to the chain at *sc_mp (or start the chain if *sc_mp
 * is NULL).  Does nothing if the ill is not MDT capable.  If the mblk
 * cannot be allocated the capability is still turned off locally, but no
 * sub-capability is added to the chain.
 */
static void
ill_capability_mdt_reset(ill_t *ill, mblk_t **sc_mp)
{
	mblk_t			*mp;
	dl_capab_mdt_t		*mdt_subcap;
	dl_capability_sub_t	*dl_subcap;
	int			size;

	if (!ILL_MDT_CAPABLE(ill))
		return;

	ASSERT(ill->ill_mdt_capab != NULL);
	/*
	 * Clear the capability flag for MDT but retain the ill_mdt_capab
	 * structure since it's possible that another thread is still
	 * referring to it.  The structure only gets deallocated when
	 * we destroy the ill.
	 */
	ill->ill_capabilities &= ~ILL_CAPAB_MDT;

	size = sizeof (*dl_subcap) + sizeof (*mdt_subcap);

	mp = allocb(size, BPRI_HI);
	if (mp == NULL) {
		ip1dbg(("ill_capability_mdt_reset: unable to allocate "
		    "request to disable MDT\n"));
		return;
	}

	mp->b_wptr = mp->b_rptr + size;

	dl_subcap = (dl_capability_sub_t *)mp->b_rptr;
	dl_subcap->dl_cap = DL_CAPAB_MDT;
	dl_subcap->dl_length = sizeof (*mdt_subcap);

	/*
	 * allocb does not clear the buffer.  Zero the whole structure so
	 * that the fields not set explicitly below (mdt_max_pld,
	 * mdt_span_limit, mdt_mid) do not carry stale memory downstream.
	 */
	mdt_subcap = (dl_capab_mdt_t *)(dl_subcap + 1);
	bzero(mdt_subcap, sizeof (*mdt_subcap));
	mdt_subcap->mdt_version = ill->ill_mdt_capab->ill_mdt_version;
	mdt_subcap->mdt_flags = 0;
	mdt_subcap->mdt_hdr_head = 0;
	mdt_subcap->mdt_hdr_tail = 0;

	if (*sc_mp != NULL)
		linkb(*sc_mp, mp);
	else
		*sc_mp = mp;
}
/*
 * Send a DL_NOTIFY_REQ to the specified ill to enable
 * DL_NOTE_PROMISC_ON/OFF_PHYS notifications.
 * Invoked (through ill_alloc_ipsec_cap_req) before IPsec hardware
 * acceleration is enabled.
 * Returns B_TRUE on success, B_FALSE if the message could not be
 * allocated.
 */
static boolean_t
ill_enable_promisc_notify(ill_t *ill)
{
	mblk_t		*notify_mp;
	dl_notify_req_t	*notify;

	IPSECHW_DEBUG(IPSECHW_PKT, ("ill_enable_promisc_notify:\n"));

	notify_mp = ip_dlpi_alloc(sizeof (dl_notify_req_t), DL_NOTIFY_REQ);
	if (notify_mp != NULL) {
		notify = (dl_notify_req_t *)notify_mp->b_rptr;
		notify->dl_notifications = DL_NOTE_PROMISC_ON_PHYS |
		    DL_NOTE_PROMISC_OFF_PHYS;
		ill_dlpi_send(ill, notify_mp);
		return (B_TRUE);
	}
	return (B_FALSE);
}
/*
 * Allocate an IPsec capability request which will be filled by our
 * caller to turn on support for one or more algorithms.
 *
 * The returned message holds a dl_capability_req_t, a copy of *isub and a
 * copy of the dl_capab_ipsec_t that follows isub; b_wptr is left at the
 * start of cip_data so that the caller can append algorithm entries.
 * Returns NULL if either the promiscuous-notify request or the capability
 * request could not be allocated.
 */
static mblk_t *
ill_alloc_ipsec_cap_req(ill_t *ill, dl_capability_sub_t *isub)
{
	dl_capab_ipsec_t	*in_ipsec = (dl_capab_ipsec_t *)(isub + 1);
	dl_capab_ipsec_t	*out_ipsec;
	dl_capability_req_t	*req;
	mblk_t			*req_mp;
	uint8_t			*cursor;

	/*
	 * The first time around, we send a DL_NOTIFY_REQ to enable
	 * PROMISC_ON/OFF notification from the provider. We need to
	 * do this before enabling the algorithms to avoid leakage of
	 * cleartext packets.
	 */
	if (!ill_enable_promisc_notify(ill))
		return (NULL);

	/*
	 * Allocate new mblk which will contain a new capability
	 * request to enable the capabilities.
	 */
	req_mp = ip_dlpi_alloc(sizeof (dl_capability_req_t) +
	    sizeof (dl_capability_sub_t) + isub->dl_length, DL_CAPABILITY_REQ);
	if (req_mp == NULL)
		return (NULL);

	cursor = req_mp->b_rptr;

	/* initialize dl_capability_req_t */
	req = (dl_capability_req_t *)cursor;
	req->dl_sub_offset = sizeof (dl_capability_req_t);
	req->dl_sub_length = sizeof (dl_capability_sub_t) + isub->dl_length;
	cursor += sizeof (dl_capability_req_t);

	/* initialize dl_capability_sub_t */
	bcopy(isub, cursor, sizeof (*isub));
	cursor += sizeof (*isub);

	/* initialize dl_capab_ipsec_t */
	out_ipsec = (dl_capab_ipsec_t *)cursor;
	bcopy(in_ipsec, out_ipsec, sizeof (*in_ipsec));

	/* Algorithm entries are appended by the caller from here on. */
	req_mp->b_wptr = (uchar_t *)(&out_ipsec->cip_data[0]);

	return (req_mp);
}
/*
 * Process an IPsec capability negotiation ack received from a DLS Provider.
 * isub must point to the sub-capability (DL_CAPAB_IPSEC_AH or
 * DL_CAPAB_IPSEC_ESP) of a DL_CAPABILITY_ACK message.
 *
 * Algorithms acked with the ENABLE flag are recorded in the ill's IPsec
 * capability structure; algorithms offered without it are collected into
 * a single DL_CAPABILITY_REQ that asks for them to be enabled.  A
 * malformed algorithm entry is skipped and processing continues with the
 * next entry.
 */
static void
ill_capability_ipsec_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
{
	dl_capab_ipsec_t	*icip;
	dl_capab_ipsec_alg_t	*ialg;	/* ptr to input alg spec. */
	dl_capab_ipsec_alg_t	*oalg;	/* ptr to output alg spec. */
	uint_t			cipher, nciphers;
	mblk_t			*nmp;
	boolean_t		need_sadb_dump;
	uint_t			sub_dl_cap = isub->dl_cap;
	ill_ipsec_capab_t	**ill_capab;
	uint64_t		ill_capab_flag;
	uint8_t			*capend, *ciphend;
	boolean_t		sadb_resync;

	ASSERT(sub_dl_cap == DL_CAPAB_IPSEC_AH ||
	    sub_dl_cap == DL_CAPAB_IPSEC_ESP);

	if (sub_dl_cap == DL_CAPAB_IPSEC_AH) {
		ill_capab = (ill_ipsec_capab_t **)&ill->ill_ipsec_capab_ah;
		ill_capab_flag = ILL_CAPAB_AH;
	} else {
		ill_capab = (ill_ipsec_capab_t **)&ill->ill_ipsec_capab_esp;
		ill_capab_flag = ILL_CAPAB_ESP;
	}

	/*
	 * If the ill capability structure exists, then this incoming
	 * DL_CAPABILITY_ACK is a response to a "renegotiation" cycle.
	 * If this is so, then we'd need to resynchronize the SADB
	 * after re-enabling the offloaded ciphers.
	 */
	sadb_resync = (*ill_capab != NULL);

	/*
	 * Note: range checks here are not absolutely sufficient to
	 * make us robust against malformed messages sent by drivers;
	 * this is in keeping with the rest of IP's dlpi handling.
	 * (Remember, it's coming from something else in the kernel
	 * address space)
	 */
	capend = (uint8_t *)(isub + 1) + isub->dl_length;
	if (capend > mp->b_wptr) {
		cmn_err(CE_WARN, "ill_capability_ipsec_ack: "
		    "malformed sub-capability too long for mblk");
		return;
	}

	/*
	 * There are two types of acks we process here:
	 * 1. acks in reply to a (first form) generic capability req
	 *    (no ENABLE flag set)
	 * 2. acks in reply to a ENABLE capability req.
	 *    (ENABLE flag set)
	 *
	 * We process the subcapability passed as argument as follows:
	 * 1 do initializations
	 *   1.1 initialize nmp = NULL
	 *   1.2 set need_sadb_dump to B_FALSE
	 * 2 for each cipher in subcapability:
	 *   2.1 if ENABLE flag is set:
	 *	2.1.1 update per-ill ipsec capabilities info
	 *	2.1.2 set need_sadb_dump to B_TRUE
	 *   2.2 if ENABLE flag is not set:
	 *	2.2.1 if nmp is NULL:
	 *		2.2.1.1 allocate and initialize nmp
	 *		2.2.1.2 init current pos in nmp
	 *	2.2.2 copy current cipher to current pos in nmp
	 *	2.2.3 set ENABLE flag in nmp
	 *	2.2.4 update current pos
	 * 3 if nmp is not equal to NULL, send enable request
	 *   3.1 send capability request
	 * 4 if need_sadb_dump is B_TRUE
	 *   4.1 enable promiscuous on/off notifications
	 *   4.2 call ill_dlpi_send(isub->dlcap) to send all
	 *	AH or ESP SA's to interface.
	 */

	nmp = NULL;
	oalg = NULL;
	need_sadb_dump = B_FALSE;
	icip = (dl_capab_ipsec_t *)(isub + 1);
	ialg = (dl_capab_ipsec_alg_t *)(&icip->cip_data[0]);

	nciphers = icip->cip_nciphers;
	ciphend = (uint8_t *)(ialg + icip->cip_nciphers);

	if (ciphend > capend) {
		cmn_err(CE_WARN, "ill_capability_ipsec_ack: "
		    "too many ciphers for sub-capability len");
		return;
	}

	/*
	 * The input specifier is advanced in the loop header so that the
	 * "continue" statements below move on to the next algorithm
	 * instead of looking at the same entry again.
	 */
	for (cipher = 0; cipher < nciphers; cipher++, ialg++) {
		if (ialg->alg_flag & DL_CAPAB_ALG_ENABLE) {
			/*
			 * TBD: when we provide a way to disable capabilities
			 * from above, need to manage the request-pending state
			 * and fail if we were not expecting this ACK.
			 */
			IPSECHW_DEBUG(IPSECHW_CAPAB,
			    ("ill_capability_ipsec_ack: got ENABLE ACK\n"));

			/*
			 * Update IPsec capabilities for this ill
			 */
			if (*ill_capab == NULL) {
				IPSECHW_DEBUG(IPSECHW_CAPAB,
				    ("ill_capability_ipsec_ack: "
				    "allocating ipsec_capab for ill\n"));
				*ill_capab = ill_ipsec_capab_alloc();

				if (*ill_capab == NULL) {
					cmn_err(CE_WARN,
					    "ill_capability_ipsec_ack: "
					    "could not enable IPsec Hardware "
					    "acceleration for %s (ENOMEM)\n",
					    ill->ill_name);
					/*
					 * Do not leak an enable request
					 * that was under construction.
					 */
					if (nmp != NULL)
						freemsg(nmp);
					return;
				}
			}

			ASSERT(ialg->alg_type == DL_CAPAB_IPSEC_ALG_AUTH ||
			    ialg->alg_type == DL_CAPAB_IPSEC_ALG_ENCR);

			if (ialg->alg_prim >= MAX_IPSEC_ALGS) {
				cmn_err(CE_WARN,
				    "ill_capability_ipsec_ack: "
				    "malformed IPsec algorithm id %d",
				    ialg->alg_prim);
				continue;
			}

			if (ialg->alg_type == DL_CAPAB_IPSEC_ALG_AUTH) {
				IPSEC_ALG_ENABLE((*ill_capab)->auth_hw_algs,
				    ialg->alg_prim);
			} else {
				ipsec_capab_algparm_t *alp;

				IPSEC_ALG_ENABLE((*ill_capab)->encr_hw_algs,
				    ialg->alg_prim);
				if (!ill_ipsec_capab_resize_algparm(*ill_capab,
				    ialg->alg_prim)) {
					cmn_err(CE_WARN,
					    "ill_capability_ipsec_ack: "
					    "no space for IPsec alg id %d",
					    ialg->alg_prim);
					continue;
				}
				alp = &((*ill_capab)->encr_algparm[
				    ialg->alg_prim]);
				alp->minkeylen = ialg->alg_minbits;
				alp->maxkeylen = ialg->alg_maxbits;
			}
			ill->ill_capabilities |= ill_capab_flag;
			/*
			 * indicate that a capability was enabled, which
			 * will be used below to kick off a SADB dump
			 * to the ill.
			 */
			need_sadb_dump = B_TRUE;
		} else {
			IPSECHW_DEBUG(IPSECHW_CAPAB,
			    ("ill_capability_ipsec_ack: enabling alg 0x%x\n",
			    ialg->alg_prim));

			if (nmp == NULL) {
				nmp = ill_alloc_ipsec_cap_req(ill, isub);
				if (nmp == NULL) {
					/*
					 * Sending the PROMISC_ON/OFF
					 * notification request failed.
					 * We cannot enable the algorithms
					 * since the Provider will not
					 * notify IP of promiscuous mode
					 * changes, which could lead
					 * to leakage of packets.
					 */
					cmn_err(CE_WARN,
					    "ill_capability_ipsec_ack: "
					    "could not enable IPsec Hardware "
					    "acceleration for %s (ENOMEM)\n",
					    ill->ill_name);
					return;
				}
				/* ptr to current output alg specifier */
				oalg = (dl_capab_ipsec_alg_t *)nmp->b_wptr;
			}

			/*
			 * Copy current alg specifier, set ENABLE
			 * flag, and advance to next output alg.
			 * For now we enable all IPsec capabilities.
			 */
			ASSERT(oalg != NULL);
			bcopy(ialg, oalg, sizeof (dl_capab_ipsec_alg_t));
			oalg->alg_flag |= DL_CAPAB_ALG_ENABLE;
			nmp->b_wptr += sizeof (dl_capab_ipsec_alg_t);
			oalg = (dl_capab_ipsec_alg_t *)nmp->b_wptr;
		}
	}

	if (nmp != NULL) {
		/*
		 * nmp points to a DL_CAPABILITY_REQ message to enable
		 * IPsec hardware acceleration.
		 */
		ill_dlpi_send(ill, nmp);
	}

	if (need_sadb_dump) {
		/*
		 * An acknowledgement corresponding to a request to
		 * enable acceleration was received, notify SADB.
		 */
		ill_ipsec_capab_add(ill, sub_dl_cap, sadb_resync);
	}
}
/*
 * Given an mblk with enough space in it, create sub-capability entries for
 * DL_CAPAB_IPSEC_{AH,ESP} types which consist of previously-advertised
 * offloaded ciphers (both AUTH and ENCR) with their enable flags cleared,
 * in preparation for the reset DL_CAPABILITY_REQ message.
 *
 * The entries are appended at mp->b_wptr, which is advanced past them:
 * a dl_capability_sub_t for "stype", a dl_capab_ipsec_t announcing
 * nciphers, then one zeroed dl_capab_ipsec_alg_t for every bit set in
 * auth_hw_algs followed by one for every bit set in encr_hw_algs.
 */
static void
ill_fill_ipsec_reset(uint_t nciphers, int stype, uint_t slen,
    ill_ipsec_capab_t *ill_cap, mblk_t *mp)
{
	dl_capability_sub_t	*sub;
	dl_capab_ipsec_t	*ipsec_hdr;
	dl_capab_ipsec_alg_t	*alg;
	int			bit, nbits;

	ASSERT(nciphers > 0);
	ASSERT(ill_cap != NULL);
	ASSERT(mp != NULL);
	ASSERT(MBLKTAIL(mp) >= sizeof (*sub) + sizeof (*ipsec_hdr) + slen);

	/* dl_capability_sub_t for "stype" */
	sub = (dl_capability_sub_t *)mp->b_wptr;
	sub->dl_cap = stype;
	sub->dl_length = sizeof (dl_capab_ipsec_t) + slen;
	mp->b_wptr += sizeof (dl_capability_sub_t);

	/* dl_capab_ipsec_t for "stype" */
	ipsec_hdr = (dl_capab_ipsec_t *)mp->b_wptr;
	ipsec_hdr->cip_version = 1;
	ipsec_hdr->cip_nciphers = nciphers;
	mp->b_wptr = (uchar_t *)&ipsec_hdr->cip_data[0];

	/* Each bitmap is scanned by absolute bit number, lowest first. */
	nbits = ill_cap->algs_size * BITSPERBYTE;

	/* create entries for "stype" AUTH ciphers */
	for (bit = 0; bit < nbits; bit++) {
		if ((ill_cap->auth_hw_algs[bit / BITSPERBYTE] &
		    (1 << (bit % BITSPERBYTE))) == 0)
			continue;
		alg = (dl_capab_ipsec_alg_t *)mp->b_wptr;
		bzero((void *)alg, sizeof (*alg));
		alg->alg_type = DL_CAPAB_IPSEC_ALG_AUTH;
		alg->alg_prim = bit;
		mp->b_wptr += sizeof (dl_capab_ipsec_alg_t);
	}

	/* create entries for "stype" ENCR ciphers */
	for (bit = 0; bit < nbits; bit++) {
		if ((ill_cap->encr_hw_algs[bit / BITSPERBYTE] &
		    (1 << (bit % BITSPERBYTE))) == 0)
			continue;
		alg = (dl_capab_ipsec_alg_t *)mp->b_wptr;
		bzero((void *)alg, sizeof (*alg));
		alg->alg_type = DL_CAPAB_IPSEC_ALG_ENCR;
		alg->alg_prim = bit;
		mp->b_wptr += sizeof (dl_capab_ipsec_alg_t);
	}
}
/*
 * Macro to count the number of 1 bits in the low byte of "val" and add
 * that count to "sum" (which must be a modifiable integer lvalue).
 *
 * We could use SPARCv9's POPC instruction, but this macro is more flexible
 * for an arbitrary length of bytes, such as {auth,encr}_hw_algs.  These
 * variables are currently 256-bits long (MAX_IPSEC_ALGS), so if we know for
 * sure that the length stays that way, we can reduce the number of
 * iterations required.
 *
 * The count is computed by pairwise addition: adjacent bits are summed
 * into 2-bit fields, those into 4-bit fields, and finally the two nibbles
 * are added together.
 *
 * The body is wrapped in do { } while (0) so that "COUNT_1S(v, s);" is a
 * single statement and may be used as the unbraced body of an if/else.
 * Both arguments are parenthesized, "val" is evaluated exactly once, and
 * the temporary has a name unlikely to appear in a caller's expression.
 */
#define	COUNT_1S(val, sum) do {						\
	uint8_t count_1s_x = (uint8_t)((val) & 0xff);			\
	count_1s_x = (count_1s_x & 0x55) + ((count_1s_x >> 1) & 0x55);	\
	count_1s_x = (count_1s_x & 0x33) + ((count_1s_x >> 2) & 0x33);	\
	(sum) += (count_1s_x & 0xf) + ((count_1s_x >> 4) & 0xf);	\
} while (0)
/* ARGSUSED */
static void
ill_capability_ipsec_reset(ill_t *ill, mblk_t **sc_mp)
{
mblk_t *mp;
ill_ipsec_capab_t *cap_ah = ill->ill_ipsec_capab_ah;
ill_ipsec_capab_t *cap_esp = ill->ill_ipsec_capab_esp;
uint64_t ill_capabilities = ill->ill_capabilities;
int ah_cnt = 0, esp_cnt = 0;
int ah_len = 0, esp_len = 0;
int i, size = 0;
if (!(ill_capabilities & (ILL_CAPAB_AH | ILL_CAPAB_ESP)))
return;
ASSERT(cap_ah != NULL || !(ill_capabilities & ILL_CAPAB_AH));
ASSERT(cap_esp != NULL || !(ill_capabilities & ILL_CAPAB_ESP));
/* Find out the number of ciphers for AH */
if (cap_ah != NULL) {
for (i = 0; i < cap_ah->algs_size; i++) {
COUNT_1S(cap_ah->auth_hw_algs[i], ah_cnt);
COUNT_1S(cap_ah->encr_hw_algs[i], ah_cnt);
}
if (ah_cnt > 0) {
size += sizeof (dl_capability_sub_t) +
sizeof (dl_capab_ipsec_t);
/* dl_capab_ipsec_t contains one dl_capab_ipsec_alg_t */
ah_len = (ah_cnt - 1) * sizeof (dl_capab_ipsec_alg_t);
size += ah_len;
}
}
/* Find out the number of ciphers for ESP */
if (cap_esp != NULL) {
for (i = 0; i < cap_esp->algs_size; i++) {
COUNT_1S(cap_esp->auth_hw_algs[i], esp_cnt);
COUNT_1S(cap_esp->encr_hw_algs[i], esp_cnt);
}
if (esp_cnt > 0) {
size += sizeof (dl_capability_sub_t) +
sizeof (dl_capab_ipsec_t);
/* dl_capab_ipsec_t contains one dl_capab_ipsec_alg_t */
esp_len = (esp_cnt - 1) * sizeof (dl_capab_ipsec_alg_t);
size += esp_len;
}
}
if (size == 0) {
ip1dbg(("ill_capability_ipsec_reset: capabilities exist but "
"there's nothing to reset\n"));
return;
}
mp = allocb(size, BPRI_HI);
if (mp == NULL) {
ip1dbg(("ill_capability_ipsec_reset: unable to allocate "
"request to disable IPSEC Hardware Acceleration\n"));
return;
}
/*
* Clear the capability flags for IPSec HA but retain the ill
* capability structures since it's possible that another thread
* is still referring to them. The structures only get deallocated
* when we destroy the ill.
*
* Various places check the flags to see if the ill is capable of
* hardware acceleration, and by clearing them we ensure that new
* outbound IPSec packets are sent down encrypted.
*/
ill->ill_capabilities &= ~(ILL_CAPAB_AH | ILL_CAPAB_ESP);
/* Fill in DL_CAPAB_IPSEC_AH sub-capability entries */
if (ah_cnt > 0) {
ill_fill_ipsec_reset(ah_cnt, DL_CAPAB_IPSEC_AH, ah_len,
cap_ah, mp);
ASSERT(mp->b_rptr + size >= mp->b_wptr);
}
/* Fill in DL_CAPAB_IPSEC_ESP sub-capability entries */
if (esp_cnt > 0) {
ill_fill_ipsec_reset(esp_cnt, DL_CAPAB_IPSEC_ESP, esp_len,
cap_esp, mp);
ASSERT(mp->b_rptr + size >= mp->b_wptr);
}
/*
* At this point we've composed a bunch of sub-capabilities to be
* encapsulated in a DL_CAPABILITY_REQ and later sent downstream
* by the caller. Upon receiving this reset message, the driver
* must stop inbound decryption (by destroying all inbound SAs)
* and let the corresponding packets come in encrypted.
*/
if (*sc_mp != NULL)
linkb(*sc_mp, mp);
else
*sc_mp = mp;
}
/*
 * Hand one sub-capability from a DL_CAPABILITY_ACK to the handler for its
 * type.
 *
 *	ill		interface the acknowledgement arrived on
 *	mp		the message, passed through to the handler
 *	subp		the sub-capability being processed
 *	encapsulated	B_TRUE when subp was carried inside the wrapper
 *			sub-capability
 *
 * Nothing is dispatched while ill_capab_state is IDMS_UNKNOWN, nor for an
 * unencapsulated IPsec sub-capability when more than one module sits
 * below IP.  Unrecognized types are only logged.
 */
static void
ill_capability_dispatch(ill_t *ill, mblk_t *mp, dl_capability_sub_t *subp,
    boolean_t encapsulated)
{
	boolean_t	is_legacy;

	/*
	 * A DL_CAPABILITY_ACK arriving in response to our own "reset"
	 * DL_CAPABILITY_REQ is ignored for this cycle: we have just told
	 * the driver to disable its advertised capabilities, so there is
	 * no point in accepting any response right now.
	 */
	if (ill->ill_capab_state == IDMS_UNKNOWN)
		return;

	/*
	 * Only the two IPsec sub-capabilities count as "legacy": their
	 * original definitions do not incorporate the dl_mid_t module ID
	 * token, and hence may require the wrapper sub-capability.
	 */
	is_legacy = (subp->dl_cap == DL_CAPAB_IPSEC_AH ||
	    subp->dl_cap == DL_CAPAB_IPSEC_ESP) ? B_TRUE : B_FALSE;

	if (is_legacy) {
		/*
		 * Legacy sub-capabilities carry no queue_t pointer, so an
		 * unencapsulated one is discarded when there are
		 * intermediate modules between IP and the driver.
		 */
		if (!encapsulated && ill->ill_lmod_cnt > 1) {
			ip1dbg(("ill_capability_dispatch: unencapsulated capab type "
			    "%d discarded; %d module(s) present below IP\n",
			    subp->dl_cap, ill->ill_lmod_cnt));
			return;
		}
		ill_capability_ipsec_ack(ill, mp, subp);
		return;
	}

	switch (subp->dl_cap) {
	case DL_CAPAB_MDT:
		ill_capability_mdt_ack(ill, mp, subp);
		break;
	case DL_CAPAB_HCKSUM:
		ill_capability_hcksum_ack(ill, mp, subp);
		break;
	case DL_CAPAB_ZEROCOPY:
		ill_capability_zerocopy_ack(ill, mp, subp);
		break;
	case DL_CAPAB_POLL:
		/* Polling is only negotiated when soft rings are off. */
		if (!SOFT_RINGS_ENABLED())
			ill_capability_dls_ack(ill, mp, subp);
		break;
	case DL_CAPAB_SOFT_RING:
		/* Soft rings are only negotiated when they are enabled. */
		if (SOFT_RINGS_ENABLED())
			ill_capability_dls_ack(ill, mp, subp);
		break;
	default:
		ip1dbg(("ill_capability_dispatch: unknown capab type %d\n",
		    subp->dl_cap));
	}
}
/*
 * As part of negotiating polling capability, the driver tells us
 * the default (or normal) blanking interval and packet threshold
 * (the receive timer fires if blanking interval is reached or
 * the packet threshold is reached).
 *
 * As part of manipulating the polling interval, we always use our
 * estimated interval (avg service time * number of packets queued
 * on the squeue) but we try to blank for a minimum of
 * rr_normal_blank_time * rr_max_blank_ratio. We disable the
 * packet threshold during this time. When we are not in polling mode
 * we set the blank interval typically lower, rr_normal_blank_time *
 * rr_min_blank_ratio, but up the packet cnt by a ratio of
 * rr_min_pkt_cnt_ratio so that we are still getting chains if
 * possible although for a shorter interval.
 *
 * The values below are the compiled-in defaults for those ratios.
 * ill_ring_add() multiplies the driver-supplied normal blanking time by
 * the two blank ratios, and the normal packet count by the two packet
 * count ratios, to derive each ring's min/max limits.
 */
#define RR_MAX_BLANK_RATIO 20
#define RR_MIN_BLANK_RATIO 10
#define RR_MAX_PKT_CNT_RATIO 3
#define RR_MIN_PKT_CNT_RATIO 3
/*
 * These can be tuned via /etc/system.  They are read each time a ring is
 * set up in ill_ring_add().
 */
int rr_max_blank_ratio = RR_MAX_BLANK_RATIO;
int rr_min_blank_ratio = RR_MIN_BLANK_RATIO;
int rr_max_pkt_cnt_ratio = RR_MAX_PKT_CNT_RATIO;
int rr_min_pkt_cnt_ratio = RR_MIN_PKT_CNT_RATIO;
/*
 * Claim a free entry in the ill's receive ring table for the MAC resource
 * described by mrp.
 *
 *	arg	the ill_t the ring belongs to
 *	mrp	resource description; only MAC_RX_FIFO resources are
 *		accepted
 *
 * Under ill_lock, the first entry in state ILL_RING_FREE is zeroed, filled
 * in from the mac_rx_fifo_t (blanking callback, handle, and the normal
 * blanking time and packet count scaled by the rr_*_ratio tunables) and
 * marked ILL_RING_INUSE.
 *
 * Returns the ring entry as a mac_resource_handle_t, or NULL when mrp is
 * not a MAC_RX_FIFO or every one of the ILL_MAX_RINGS entries is in use.
 */
static mac_resource_handle_t
ill_ring_add(void *arg, mac_resource_t *mrp)
{
	ill_t		*ill = (ill_t *)arg;
	mac_rx_fifo_t	*fifo = (mac_rx_fifo_t *)mrp;
	ill_rx_ring_t	*slot = NULL;
	time_t		blank_time;
	uint_t		pkt_cnt;
	int		idx;

	ASSERT(mrp != NULL);
	if (mrp->mr_type != MAC_RX_FIFO)
		return (NULL);
	ASSERT(ill != NULL);
	ASSERT(ill->ill_dls_capab != NULL);

	mutex_enter(&ill->ill_lock);

	/* Look for the first unused entry in the ring table. */
	for (idx = 0; idx < ILL_MAX_RINGS; idx++) {
		slot = &ill->ill_dls_capab->ill_ring_tbl[idx];
		ASSERT(slot != NULL);
		if (slot->rr_ring_state == ILL_RING_FREE)
			break;
	}

	if (idx >= ILL_MAX_RINGS) {
		/*
		 * All ILL_MAX_RINGS rx_ring structures are taken.  If
		 * devices can exceed this limit, ILL_MAX_RINGS should be
		 * made configurable.  Meanwhile this causes no panic: the
		 * driver will pass ip_input a NULL handle, which makes IP
		 * allocate the default squeue, and polling mode will not
		 * be used for this ring.
		 */
		cmn_err(CE_NOTE, "Reached maximum number of receiving rings (%d) "
		    "for %s\n", ILL_MAX_RINGS, ill->ill_name);
		mutex_exit(&ill->ill_lock);
		return (NULL);
	}

	/* Driver-supplied defaults that the limits below are scaled from. */
	blank_time = fifo->mrf_normal_blank_time;
	pkt_cnt = fifo->mrf_normal_pkt_count;

	bzero(slot, sizeof (ill_rx_ring_t));

	slot->rr_blank = fifo->mrf_blank;
	slot->rr_handle = fifo->mrf_arg;
	slot->rr_ill = ill;
	slot->rr_normal_blank_time = blank_time;
	slot->rr_normal_pkt_cnt = pkt_cnt;

	slot->rr_max_blank_time = blank_time * rr_max_blank_ratio;
	slot->rr_min_blank_time = blank_time * rr_min_blank_ratio;
	slot->rr_max_pkt_cnt = pkt_cnt * rr_max_pkt_cnt_ratio;
	slot->rr_min_pkt_cnt = pkt_cnt * rr_min_pkt_cnt_ratio;

	slot->rr_ring_state = ILL_RING_INUSE;
	mutex_exit(&ill->ill_lock);

	DTRACE_PROBE2(ill__ring__add, (void *), ill,
	    (int), idx);
	return ((mac_resource_handle_t)slot);
}
static boolean_t
ill_capability_dls_init(ill_t *ill)
{
ill_dls_capab_t *ill_dls = ill->ill_dls_capab;
conn_t *connp;
size_t sz;
if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) {
if (ill_dls == NULL) {
cmn_err(CE_PANIC, "ill_capability_dls_init: "
"soft_ring enabled for ill=%s (%p) but data "
"structs uninitialized\n", ill->ill_name,
(void *)ill);
}
return (B_TRUE);
} else if (ill->ill_capabilities & ILL_CAPAB_POLL) {
if (ill_dls == NULL) {
cmn_err(CE_PANIC, "ill_capability_dls_init: "
"polling enabled for ill=%s (%p) but data "
"structs uninitialized\n", ill->ill_name,
(void *)ill);
}
return (B_TRUE);
}
if (ill_dls != NULL) {
ill_rx_ring_t *rx_ring = ill_dls->ill_ring_tbl;
/* Soft_Ring or polling is being re-enabled */
connp = ill_dls->ill_unbind_conn;
ASSERT(rx_ring != NULL);
bzero((void *)ill_dls, sizeof (ill_dls_capab_t));
bzero((void *)rx_ring,
sizeof (ill_rx_ring_t) * ILL_MAX_RINGS);
ill_dls->ill_ring_tbl = rx_ring;
ill_dls->ill_unbind_conn = connp;
return (B_TRUE);
}
if ((connp = ipcl_conn_create(IPCL_TCPCONN, KM_NOSLEEP)) == NULL)
return (B_FALSE);
sz = sizeof (ill_dls_capab_t);
sz += sizeof (ill_rx_ring_t) * ILL_MAX_RINGS;
ill_dls = kmem_zalloc(sz, KM_NOSLEEP);
if (ill_dls == NULL) {