| /* |
| * CDDL HEADER START |
| * |
| * The contents of this file are subject to the terms of the |
| * Common Development and Distribution License (the "License"). |
| * You may not use this file except in compliance with the License. |
| * |
| * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE |
| * or http://www.opensolaris.org/os/licensing. |
| * See the License for the specific language governing permissions |
| * and limitations under the License. |
| * |
| * When distributing Covered Code, include this CDDL HEADER in each |
| * file and include the License file at usr/src/OPENSOLARIS.LICENSE. |
| * If applicable, add the following below this CDDL HEADER, with the |
| * fields enclosed by brackets "[]" replaced with your own identifying |
| * information: Portions Copyright [yyyy] [name of copyright owner] |
| * |
| * CDDL HEADER END |
| */ |
| /* |
| * Copyright 2009 Sun Microsystems, Inc. All rights reserved. |
| * Use is subject to license terms. |
| */ |
| /* Copyright (c) 1990 Mentat Inc. */ |
| |
| /* |
| * This file contains the interface control functions for IP. |
| */ |
| |
| #include <sys/types.h> |
| #include <sys/stream.h> |
| #include <sys/dlpi.h> |
| #include <sys/stropts.h> |
| #include <sys/strsun.h> |
| #include <sys/sysmacros.h> |
| #include <sys/strlog.h> |
| #include <sys/ddi.h> |
| #include <sys/sunddi.h> |
| #include <sys/cmn_err.h> |
| #include <sys/kstat.h> |
| #include <sys/debug.h> |
| #include <sys/zone.h> |
| #include <sys/sunldi.h> |
| #include <sys/file.h> |
| #include <sys/bitmap.h> |
| #include <sys/cpuvar.h> |
| #include <sys/time.h> |
| #include <sys/ctype.h> |
| #include <sys/kmem.h> |
| #include <sys/systm.h> |
| #include <sys/param.h> |
| #include <sys/socket.h> |
| #include <sys/isa_defs.h> |
| #include <net/if.h> |
| #include <net/if_arp.h> |
| #include <net/if_types.h> |
| #include <net/if_dl.h> |
| #include <net/route.h> |
| #include <sys/sockio.h> |
| #include <netinet/in.h> |
| #include <netinet/ip6.h> |
| #include <netinet/icmp6.h> |
| #include <netinet/igmp_var.h> |
| #include <sys/policy.h> |
| #include <sys/ethernet.h> |
| #include <sys/callb.h> |
| #include <sys/md5.h> |
| |
| #include <inet/common.h> /* for various inet/mi.h and inet/nd.h needs */ |
| #include <inet/mi.h> |
| #include <inet/nd.h> |
| #include <inet/arp.h> |
| #include <inet/mib2.h> |
| #include <inet/ip.h> |
| #include <inet/ip6.h> |
| #include <inet/ip6_asp.h> |
| #include <inet/tcp.h> |
| #include <inet/ip_multi.h> |
| #include <inet/ip_ire.h> |
| #include <inet/ip_ftable.h> |
| #include <inet/ip_rts.h> |
| #include <inet/ip_ndp.h> |
| #include <inet/ip_if.h> |
| #include <inet/ip_impl.h> |
| #include <inet/tun.h> |
| #include <inet/sctp_ip.h> |
| #include <inet/ip_netinfo.h> |
| |
| #include <net/pfkeyv2.h> |
| #include <inet/ipsec_info.h> |
| #include <inet/sadb.h> |
| #include <inet/ipsec_impl.h> |
| #include <sys/iphada.h> |
| |
| #include <netinet/igmp.h> |
| #include <inet/ip_listutils.h> |
| #include <inet/ipclassifier.h> |
| #include <sys/mac_client.h> |
| #include <sys/dld.h> |
| |
| #include <sys/systeminfo.h> |
| #include <sys/bootconf.h> |
| |
| #include <sys/tsol/tndb.h> |
| #include <sys/tsol/tnet.h> |
| |
| /* The character which tells where the ill_name ends */ |
| #define IPIF_SEPARATOR_CHAR ':' |
| |
| /* IP ioctl function table entry */ |
| typedef struct ipft_s { |
| int ipft_cmd; |
| pfi_t ipft_pfi; |
| int ipft_min_size; |
| int ipft_flags; |
| } ipft_t; |
| #define IPFT_F_NO_REPLY 0x1 /* IP ioctl does not expect any reply */ |
| #define IPFT_F_SELF_REPLY 0x2 /* ioctl callee does the ioctl reply */ |
| |
| typedef struct ip_sock_ar_s { |
| union { |
| area_t ip_sock_area; |
| ared_t ip_sock_ared; |
| areq_t ip_sock_areq; |
| } ip_sock_ar_u; |
| queue_t *ip_sock_ar_q; |
| } ip_sock_ar_t; |
| |
| static int nd_ill_forward_get(queue_t *, mblk_t *, caddr_t, cred_t *); |
| static int nd_ill_forward_set(queue_t *q, mblk_t *mp, |
| char *value, caddr_t cp, cred_t *ioc_cr); |
| |
| static boolean_t ill_is_quiescent(ill_t *); |
| static boolean_t ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask); |
| static ip_m_t *ip_m_lookup(t_uscalar_t mac_type); |
| static int ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, |
| mblk_t *mp, boolean_t need_up); |
| static int ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, |
| mblk_t *mp, boolean_t need_up); |
| static int ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid, |
| queue_t *q, mblk_t *mp, boolean_t need_up); |
| static int ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, |
| mblk_t *mp); |
| static int ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q, |
| mblk_t *mp); |
| static int ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t, in6_addr_t, |
| queue_t *q, mblk_t *mp, boolean_t need_up); |
| static int ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, |
| int ioccmd, struct linkblk *li, boolean_t doconsist); |
| static ipaddr_t ip_subnet_mask(ipaddr_t addr, ipif_t **, ip_stack_t *); |
| static void ip_wput_ioctl(queue_t *q, mblk_t *mp); |
| static void ipsq_flush(ill_t *ill); |
| |
| static int ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen, |
| queue_t *q, mblk_t *mp, boolean_t need_up); |
| static void ipsq_delete(ipsq_t *); |
| |
| static ipif_t *ipif_allocate(ill_t *ill, int id, uint_t ire_type, |
| boolean_t initialize, boolean_t insert); |
| static void ipif_check_bcast_ires(ipif_t *test_ipif); |
| static ire_t **ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep); |
| static boolean_t ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif, |
| boolean_t isv6); |
| static void ipif_down_delete_ire(ire_t *ire, char *ipif); |
| static void ipif_delete_cache_ire(ire_t *, char *); |
| static int ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp); |
| static void ipif_free(ipif_t *ipif); |
| static void ipif_free_tail(ipif_t *ipif); |
| static void ipif_mtu_change(ire_t *ire, char *ipif_arg); |
| static void ipif_recreate_interface_routes(ipif_t *old_ipif, ipif_t *ipif); |
| static void ipif_set_default(ipif_t *ipif); |
| static int ipif_set_values(queue_t *q, mblk_t *mp, |
| char *interf_name, uint_t *ppa); |
| static int ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp, |
| queue_t *q); |
| static ipif_t *ipif_lookup_on_name(char *name, size_t namelen, |
| boolean_t do_alloc, boolean_t *exists, boolean_t isv6, zoneid_t zoneid, |
| queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *); |
| static void ipif_update_other_ipifs(ipif_t *old_ipif); |
| |
| static int ill_alloc_ppa(ill_if_t *, ill_t *); |
| static int ill_arp_off(ill_t *ill); |
| static int ill_arp_on(ill_t *ill); |
| static void ill_delete_interface_type(ill_if_t *); |
| static int ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q); |
| static void ill_dl_down(ill_t *ill); |
| static void ill_down(ill_t *ill); |
| static void ill_downi(ire_t *ire, char *ill_arg); |
| static void ill_free_mib(ill_t *ill); |
| static void ill_glist_delete(ill_t *); |
| static void ill_phyint_reinit(ill_t *ill); |
| static void ill_set_nce_router_flags(ill_t *, boolean_t); |
| static void ill_set_phys_addr_tail(ipsq_t *, queue_t *, mblk_t *, void *); |
| static ip_v6intfid_func_t ip_ether_v6intfid, ip_ib_v6intfid; |
| static ip_v6intfid_func_t ip_ipmp_v6intfid, ip_nodef_v6intfid; |
| static ip_v6mapinfo_func_t ip_ether_v6mapinfo, ip_ib_v6mapinfo; |
| static ip_v4mapinfo_func_t ip_ether_v4mapinfo, ip_ib_v4mapinfo; |
| static void ipif_save_ire(ipif_t *, ire_t *); |
| static void ipif_remove_ire(ipif_t *, ire_t *); |
| static void ip_cgtp_bcast_add(ire_t *, ire_t *, ip_stack_t *); |
| static void ip_cgtp_bcast_delete(ire_t *, ip_stack_t *); |
| static void phyint_free(phyint_t *); |
| |
| /* |
| * Per-ill IPsec capabilities management. |
| */ |
| static ill_ipsec_capab_t *ill_ipsec_capab_alloc(void); |
| static void ill_ipsec_capab_free(ill_ipsec_capab_t *); |
| static void ill_ipsec_capab_add(ill_t *, uint_t, boolean_t); |
| static void ill_ipsec_capab_delete(ill_t *, uint_t); |
| static boolean_t ill_ipsec_capab_resize_algparm(ill_ipsec_capab_t *, int); |
| static void ill_capability_dispatch(ill_t *, mblk_t *, dl_capability_sub_t *, |
| boolean_t); |
| static void ill_capability_id_ack(ill_t *, mblk_t *, dl_capability_sub_t *); |
| static void ill_capability_mdt_ack(ill_t *, mblk_t *, dl_capability_sub_t *); |
| static void ill_capability_mdt_reset_fill(ill_t *, mblk_t *); |
| static void ill_capability_ipsec_ack(ill_t *, mblk_t *, dl_capability_sub_t *); |
| static void ill_capability_ipsec_reset_fill(ill_t *, mblk_t *); |
| static void ill_capability_hcksum_ack(ill_t *, mblk_t *, dl_capability_sub_t *); |
| static void ill_capability_hcksum_reset_fill(ill_t *, mblk_t *); |
| static void ill_capability_zerocopy_ack(ill_t *, mblk_t *, |
| dl_capability_sub_t *); |
| static void ill_capability_zerocopy_reset_fill(ill_t *, mblk_t *); |
| static int ill_capability_ipsec_reset_size(ill_t *, int *, int *, int *, |
| int *); |
| static void ill_capability_dld_reset_fill(ill_t *, mblk_t *); |
| static void ill_capability_dld_ack(ill_t *, mblk_t *, |
| dl_capability_sub_t *); |
| static void ill_capability_dld_enable(ill_t *); |
| static void ill_capability_ack_thr(void *); |
| static void ill_capability_lso_enable(ill_t *); |
| static void ill_capability_send(ill_t *, mblk_t *); |
| |
| static ill_t *ill_prev_usesrc(ill_t *); |
| static int ill_relink_usesrc_ills(ill_t *, ill_t *, uint_t); |
| static void ill_disband_usesrc_group(ill_t *); |
| static void conn_cleanup_stale_ire(conn_t *, caddr_t); |
| |
| #ifdef DEBUG |
| static void ill_trace_cleanup(const ill_t *); |
| static void ipif_trace_cleanup(const ipif_t *); |
| #endif |
| |
| /* |
 * If we go over the memory footprint limit more than once in this msec
| * interval, we'll start pruning aggressively. |
| */ |
| int ip_min_frag_prune_time = 0; |
| |
| /* |
 * Maximum number of IPsec algorithms supported. Limited to 1 byte by
 * PF_KEY and the IPsec DOI.
| */ |
| #define MAX_IPSEC_ALGS 256 |
| |
| #define BITSPERBYTE 8 |
| #define BITS(type) (BITSPERBYTE * (long)sizeof (type)) |
| |
| #define IPSEC_ALG_ENABLE(algs, algid) \ |
| ((algs)[(algid) / BITS(ipsec_capab_elem_t)] |= \ |
| (1 << ((algid) % BITS(ipsec_capab_elem_t)))) |
| |
| #define IPSEC_ALG_IS_ENABLED(algid, algs) \ |
| ((algs)[(algid) / BITS(ipsec_capab_elem_t)] & \ |
| (1 << ((algid) % BITS(ipsec_capab_elem_t)))) |
| |
| typedef uint8_t ipsec_capab_elem_t; |
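
/*
 * Illustrative use of the bitmap macros above (a sketch, not code from
 * this file): algorithm ids index into an array of ipsec_capab_elem_t
 * at one bit per id, so MAX_IPSEC_ALGS ids need 32 uint8_t elements.
 *
 *	ipsec_capab_elem_t algs[MAX_IPSEC_ALGS / BITSPERBYTE];
 *
 *	bzero(algs, sizeof (algs));
 *	IPSEC_ALG_ENABLE(algs, SADB_EALG_3DESCBC);
 *	if (IPSEC_ALG_IS_ENABLED(SADB_EALG_3DESCBC, algs))
 *		... the hardware can offload 3DES ...
 */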
| |
| /* |
| * Per-algorithm parameters. Note that at present, only encryption |
| * algorithms have variable keysize (IKE does not provide a way to negotiate |
| * auth algorithm keysize). |
| * |
| * All sizes here are in bits. |
| */ |
| typedef struct |
| { |
| uint16_t minkeylen; |
| uint16_t maxkeylen; |
| } ipsec_capab_algparm_t; |
| |
| /* |
| * Per-ill capabilities. |
| */ |
| struct ill_ipsec_capab_s { |
| ipsec_capab_elem_t *encr_hw_algs; |
| ipsec_capab_elem_t *auth_hw_algs; |
| uint32_t algs_size; /* size of _hw_algs in bytes */ |
| /* algorithm key lengths */ |
| ipsec_capab_algparm_t *encr_algparm; |
| uint32_t encr_algparm_size; |
| uint32_t encr_algparm_end; |
| }; |
| |
| /* |
 * The field values are larger than strictly necessary for simple
 * AR_ENTRY_ADDs, but the padding lets us accommodate the socket ioctls.
| */ |
| static area_t ip_area_template = { |
| AR_ENTRY_ADD, /* area_cmd */ |
| sizeof (ip_sock_ar_t) + (IP_ADDR_LEN*2) + sizeof (struct sockaddr_dl), |
| /* area_name_offset */ |
| /* area_name_length temporarily holds this structure length */ |
| sizeof (area_t), /* area_name_length */ |
| IP_ARP_PROTO_TYPE, /* area_proto */ |
| sizeof (ip_sock_ar_t), /* area_proto_addr_offset */ |
| IP_ADDR_LEN, /* area_proto_addr_length */ |
| sizeof (ip_sock_ar_t) + IP_ADDR_LEN, |
| /* area_proto_mask_offset */ |
| 0, /* area_flags */ |
| sizeof (ip_sock_ar_t) + IP_ADDR_LEN + IP_ADDR_LEN, |
| /* area_hw_addr_offset */ |
| /* Zero length hw_addr_length means 'use your idea of the address' */ |
| 0 /* area_hw_addr_length */ |
| }; |
| |
| /* |
 * AR_ENTRY_ADD/DELETE templates have been added for IPv6 external resolver
 * support.
| */ |
| static area_t ip6_area_template = { |
| AR_ENTRY_ADD, /* area_cmd */ |
| sizeof (ip_sock_ar_t) + (IPV6_ADDR_LEN*2) + sizeof (sin6_t), |
| /* area_name_offset */ |
| /* area_name_length temporarily holds this structure length */ |
| sizeof (area_t), /* area_name_length */ |
| IP_ARP_PROTO_TYPE, /* area_proto */ |
| sizeof (ip_sock_ar_t), /* area_proto_addr_offset */ |
| IPV6_ADDR_LEN, /* area_proto_addr_length */ |
| sizeof (ip_sock_ar_t) + IPV6_ADDR_LEN, |
| /* area_proto_mask_offset */ |
| 0, /* area_flags */ |
| sizeof (ip_sock_ar_t) + IPV6_ADDR_LEN + IPV6_ADDR_LEN, |
| /* area_hw_addr_offset */ |
| /* Zero length hw_addr_length means 'use your idea of the address' */ |
| 0 /* area_hw_addr_length */ |
| }; |
| |
| static ared_t ip_ared_template = { |
| AR_ENTRY_DELETE, |
| sizeof (ared_t) + IP_ADDR_LEN, |
| sizeof (ared_t), |
| IP_ARP_PROTO_TYPE, |
| sizeof (ared_t), |
| IP_ADDR_LEN, |
| 0 |
| }; |
| |
| static ared_t ip6_ared_template = { |
| AR_ENTRY_DELETE, |
| sizeof (ared_t) + IPV6_ADDR_LEN, |
| sizeof (ared_t), |
| IP_ARP_PROTO_TYPE, |
| sizeof (ared_t), |
| IPV6_ADDR_LEN, |
| 0 |
| }; |
| |
| /* |
 * A template for an IPv6 AR_ENTRY_QUERY has not been created, as the
 * areq doesn't include an IP address in ill_dl_up() (the only place an
 * areq is used).
| */ |
| static areq_t ip_areq_template = { |
| AR_ENTRY_QUERY, /* cmd */ |
| sizeof (areq_t)+(2*IP_ADDR_LEN), /* name offset */ |
| sizeof (areq_t), /* name len (filled by ill_arp_alloc) */ |
	IP_ARP_PROTO_TYPE,		/* protocol, from arp's perspective */
| sizeof (areq_t), /* target addr offset */ |
| IP_ADDR_LEN, /* target addr_length */ |
| 0, /* flags */ |
| sizeof (areq_t) + IP_ADDR_LEN, /* sender addr offset */ |
| IP_ADDR_LEN, /* sender addr length */ |
| AR_EQ_DEFAULT_XMIT_COUNT, /* xmit_count */ |
| AR_EQ_DEFAULT_XMIT_INTERVAL, /* (re)xmit_interval in milliseconds */ |
| AR_EQ_DEFAULT_MAX_BUFFERED /* max # of requests to buffer */ |
| /* anything else filled in by the code */ |
| }; |
| |
| static arc_t ip_aru_template = { |
| AR_INTERFACE_UP, |
| sizeof (arc_t), /* Name offset */ |
| sizeof (arc_t) /* Name length (set by ill_arp_alloc) */ |
| }; |
| |
| static arc_t ip_ard_template = { |
| AR_INTERFACE_DOWN, |
| sizeof (arc_t), /* Name offset */ |
| sizeof (arc_t) /* Name length (set by ill_arp_alloc) */ |
| }; |
| |
| static arc_t ip_aron_template = { |
| AR_INTERFACE_ON, |
| sizeof (arc_t), /* Name offset */ |
| sizeof (arc_t) /* Name length (set by ill_arp_alloc) */ |
| }; |
| |
| static arc_t ip_aroff_template = { |
| AR_INTERFACE_OFF, |
| sizeof (arc_t), /* Name offset */ |
| sizeof (arc_t) /* Name length (set by ill_arp_alloc) */ |
| }; |
| |
| static arma_t ip_arma_multi_template = { |
| AR_MAPPING_ADD, |
| sizeof (arma_t) + 3*IP_ADDR_LEN + IP_MAX_HW_LEN, |
| /* Name offset */ |
| sizeof (arma_t), /* Name length (set by ill_arp_alloc) */ |
| IP_ARP_PROTO_TYPE, |
| sizeof (arma_t), /* proto_addr_offset */ |
| IP_ADDR_LEN, /* proto_addr_length */ |
| sizeof (arma_t) + IP_ADDR_LEN, /* proto_mask_offset */ |
| sizeof (arma_t) + 2*IP_ADDR_LEN, /* proto_extract_mask_offset */ |
| ACE_F_PERMANENT | ACE_F_MAPPING, /* flags */ |
| sizeof (arma_t) + 3*IP_ADDR_LEN, /* hw_addr_offset */ |
| IP_MAX_HW_LEN, /* hw_addr_length */ |
| 0, /* hw_mapping_start */ |
| }; |
| |
| static ipft_t ip_ioctl_ftbl[] = { |
| { IP_IOC_IRE_DELETE, ip_ire_delete, sizeof (ipid_t), 0 }, |
| { IP_IOC_IRE_DELETE_NO_REPLY, ip_ire_delete, sizeof (ipid_t), |
| IPFT_F_NO_REPLY }, |
| { IP_IOC_IRE_ADVISE_NO_REPLY, ip_ire_advise, sizeof (ipic_t), |
| IPFT_F_NO_REPLY }, |
| { IP_IOC_RTS_REQUEST, ip_rts_request, 0, IPFT_F_SELF_REPLY }, |
| { 0 } |
| }; |
| |
| /* Simple ICMP IP Header Template */ |
| static ipha_t icmp_ipha = { |
| IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP |
| }; |
| |
| /* Flag descriptors for ip_ipif_report */ |
| static nv_t ipif_nv_tbl[] = { |
| { IPIF_UP, "UP" }, |
| { IPIF_BROADCAST, "BROADCAST" }, |
| { ILLF_DEBUG, "DEBUG" }, |
| { PHYI_LOOPBACK, "LOOPBACK" }, |
| { IPIF_POINTOPOINT, "POINTOPOINT" }, |
| { ILLF_NOTRAILERS, "NOTRAILERS" }, |
| { PHYI_RUNNING, "RUNNING" }, |
| { ILLF_NOARP, "NOARP" }, |
| { PHYI_PROMISC, "PROMISC" }, |
| { PHYI_ALLMULTI, "ALLMULTI" }, |
| { PHYI_INTELLIGENT, "INTELLIGENT" }, |
| { ILLF_MULTICAST, "MULTICAST" }, |
| { PHYI_MULTI_BCAST, "MULTI_BCAST" }, |
| { IPIF_UNNUMBERED, "UNNUMBERED" }, |
| { IPIF_DHCPRUNNING, "DHCP" }, |
| { IPIF_PRIVATE, "PRIVATE" }, |
| { IPIF_NOXMIT, "NOXMIT" }, |
| { IPIF_NOLOCAL, "NOLOCAL" }, |
| { IPIF_DEPRECATED, "DEPRECATED" }, |
| { IPIF_PREFERRED, "PREFERRED" }, |
| { IPIF_TEMPORARY, "TEMPORARY" }, |
| { IPIF_ADDRCONF, "ADDRCONF" }, |
| { PHYI_VIRTUAL, "VIRTUAL" }, |
| { ILLF_ROUTER, "ROUTER" }, |
| { ILLF_NONUD, "NONUD" }, |
| { IPIF_ANYCAST, "ANYCAST" }, |
| { ILLF_NORTEXCH, "NORTEXCH" }, |
| { ILLF_IPV4, "IPV4" }, |
| { ILLF_IPV6, "IPV6" }, |
| { IPIF_NOFAILOVER, "NOFAILOVER" }, |
| { PHYI_FAILED, "FAILED" }, |
| { PHYI_STANDBY, "STANDBY" }, |
| { PHYI_INACTIVE, "INACTIVE" }, |
| { PHYI_OFFLINE, "OFFLINE" }, |
| { PHYI_IPMP, "IPMP" } |
| }; |
| |
| static uchar_t ip_six_byte_all_ones[] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF }; |
| |
| static ip_m_t ip_m_tbl[] = { |
| { DL_ETHER, IFT_ETHER, ip_ether_v4mapinfo, ip_ether_v6mapinfo, |
| ip_ether_v6intfid }, |
| { DL_CSMACD, IFT_ISO88023, ip_ether_v4mapinfo, ip_ether_v6mapinfo, |
| ip_nodef_v6intfid }, |
| { DL_TPB, IFT_ISO88024, ip_ether_v4mapinfo, ip_ether_v6mapinfo, |
| ip_nodef_v6intfid }, |
| { DL_TPR, IFT_ISO88025, ip_ether_v4mapinfo, ip_ether_v6mapinfo, |
| ip_nodef_v6intfid }, |
| { DL_FDDI, IFT_FDDI, ip_ether_v4mapinfo, ip_ether_v6mapinfo, |
| ip_ether_v6intfid }, |
| { DL_IB, IFT_IB, ip_ib_v4mapinfo, ip_ib_v6mapinfo, |
| ip_ib_v6intfid }, |
| { SUNW_DL_VNI, IFT_OTHER, NULL, NULL, NULL }, |
| { SUNW_DL_IPMP, IFT_OTHER, NULL, NULL, ip_ipmp_v6intfid }, |
| { DL_OTHER, IFT_OTHER, ip_ether_v4mapinfo, ip_ether_v6mapinfo, |
| ip_nodef_v6intfid } |
| }; |
| |
| static ill_t ill_null; /* Empty ILL for init. */ |
| char ipif_loopback_name[] = "lo0"; |
| static char *ipv4_forward_suffix = ":ip_forwarding"; |
| static char *ipv6_forward_suffix = ":ip6_forwarding"; |
| static sin6_t sin6_null; /* Zero address for quick clears */ |
| static sin_t sin_null; /* Zero address for quick clears */ |
| |
/* When set, search for an unused ipif_seqid */
| static ipif_t ipif_zero; |
| |
| /* |
 * The ppa arena is created after this many
 * interfaces have been plumbed.
 */
uint_t	ill_no_arena = 12;	/* Settable in /etc/system */
| |
| /* |
| * Allocate per-interface mibs. |
 * Returns B_TRUE if ok, B_FALSE otherwise.
 * ipsq may not yet be allocated (loopback case).
| */ |
| static boolean_t |
| ill_allocate_mibs(ill_t *ill) |
| { |
| /* Already allocated? */ |
| if (ill->ill_ip_mib != NULL) { |
| if (ill->ill_isv6) |
| ASSERT(ill->ill_icmp6_mib != NULL); |
| return (B_TRUE); |
| } |
| |
| ill->ill_ip_mib = kmem_zalloc(sizeof (*ill->ill_ip_mib), |
| KM_NOSLEEP); |
| if (ill->ill_ip_mib == NULL) { |
| return (B_FALSE); |
| } |
| |
| /* Setup static information */ |
| SET_MIB(ill->ill_ip_mib->ipIfStatsEntrySize, |
| sizeof (mib2_ipIfStatsEntry_t)); |
| if (ill->ill_isv6) { |
| ill->ill_ip_mib->ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv6; |
| SET_MIB(ill->ill_ip_mib->ipIfStatsAddrEntrySize, |
| sizeof (mib2_ipv6AddrEntry_t)); |
| SET_MIB(ill->ill_ip_mib->ipIfStatsRouteEntrySize, |
| sizeof (mib2_ipv6RouteEntry_t)); |
| SET_MIB(ill->ill_ip_mib->ipIfStatsNetToMediaEntrySize, |
| sizeof (mib2_ipv6NetToMediaEntry_t)); |
| SET_MIB(ill->ill_ip_mib->ipIfStatsMemberEntrySize, |
| sizeof (ipv6_member_t)); |
| SET_MIB(ill->ill_ip_mib->ipIfStatsGroupSourceEntrySize, |
| sizeof (ipv6_grpsrc_t)); |
| } else { |
| ill->ill_ip_mib->ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv4; |
| SET_MIB(ill->ill_ip_mib->ipIfStatsAddrEntrySize, |
| sizeof (mib2_ipAddrEntry_t)); |
| SET_MIB(ill->ill_ip_mib->ipIfStatsRouteEntrySize, |
| sizeof (mib2_ipRouteEntry_t)); |
| SET_MIB(ill->ill_ip_mib->ipIfStatsNetToMediaEntrySize, |
| sizeof (mib2_ipNetToMediaEntry_t)); |
| SET_MIB(ill->ill_ip_mib->ipIfStatsMemberEntrySize, |
| sizeof (ip_member_t)); |
| SET_MIB(ill->ill_ip_mib->ipIfStatsGroupSourceEntrySize, |
| sizeof (ip_grpsrc_t)); |
| |
| /* |
| * For a v4 ill, we are done at this point, because per ill |
| * icmp mibs are only used for v6. |
| */ |
| return (B_TRUE); |
| } |
| |
| ill->ill_icmp6_mib = kmem_zalloc(sizeof (*ill->ill_icmp6_mib), |
| KM_NOSLEEP); |
| if (ill->ill_icmp6_mib == NULL) { |
| kmem_free(ill->ill_ip_mib, sizeof (*ill->ill_ip_mib)); |
| ill->ill_ip_mib = NULL; |
| return (B_FALSE); |
| } |
| /* static icmp info */ |
| ill->ill_icmp6_mib->ipv6IfIcmpEntrySize = |
| sizeof (mib2_ipv6IfIcmpEntry_t); |
| /* |
| * The ipIfStatsIfindex and ipv6IfIcmpIndex will be assigned later |
| * after the phyint merge occurs in ipif_set_values -> ill_glist_insert |
 * -> ill_phyint_reinit.
| */ |
| return (B_TRUE); |
| } |
| |
| /* |
| * Common code for preparation of ARP commands. Two points to remember: |
 * 1) The ill_name is tacked on at the end of the allocated space, so
 *    the template's name_offset field must contain the total space
 *    to allocate less the name length.
 *
 * 2) The template's name_length field should contain the *template*
| * length. We use it as a parameter to bcopy() and then write |
| * the real ill_name_length into the name_length field of the copy. |
| * (Always called as writer.) |
| */ |
| mblk_t * |
| ill_arp_alloc(ill_t *ill, const uchar_t *template, caddr_t addr) |
| { |
| arc_t *arc = (arc_t *)template; |
| char *cp; |
| int len; |
| mblk_t *mp; |
| uint_t name_length = ill->ill_name_length; |
| uint_t template_len = arc->arc_name_length; |
| |
| len = arc->arc_name_offset + name_length; |
| mp = allocb(len, BPRI_HI); |
| if (mp == NULL) |
| return (NULL); |
| cp = (char *)mp->b_rptr; |
| mp->b_wptr = (uchar_t *)&cp[len]; |
| if (template_len) |
| bcopy(template, cp, template_len); |
| if (len > template_len) |
| bzero(&cp[template_len], len - template_len); |
| mp->b_datap->db_type = M_PROTO; |
| |
| arc = (arc_t *)cp; |
| arc->arc_name_length = name_length; |
| cp = (char *)arc + arc->arc_name_offset; |
| bcopy(ill->ill_name, cp, name_length); |
| |
| if (addr) { |
| area_t *area = (area_t *)mp->b_rptr; |
| |
| cp = (char *)area + area->area_proto_addr_offset; |
| bcopy(addr, cp, area->area_proto_addr_length); |
| if (area->area_cmd == AR_ENTRY_ADD) { |
| cp = (char *)area; |
| len = area->area_proto_addr_length; |
| if (area->area_proto_mask_offset) |
| cp += area->area_proto_mask_offset; |
| else |
| cp += area->area_proto_addr_offset + len; |
| while (len-- > 0) |
| *cp++ = (char)~0; |
| } |
| } |
| return (mp); |
| } |
| |
| mblk_t * |
| ipif_area_alloc(ipif_t *ipif, uint_t optflags) |
| { |
| caddr_t addr; |
| mblk_t *mp; |
| area_t *area; |
| uchar_t *areap; |
| ill_t *ill = ipif->ipif_ill; |
| |
| if (ill->ill_isv6) { |
| ASSERT(ill->ill_flags & ILLF_XRESOLV); |
| addr = (caddr_t)&ipif->ipif_v6lcl_addr; |
| areap = (uchar_t *)&ip6_area_template; |
| } else { |
| addr = (caddr_t)&ipif->ipif_lcl_addr; |
| areap = (uchar_t *)&ip_area_template; |
| } |
| |
| if ((mp = ill_arp_alloc(ill, areap, addr)) == NULL) |
| return (NULL); |
| |
| /* |
| * IPMP requires that the hardware address be included in all |
| * AR_ENTRY_ADD requests so that ARP can deduce the arl to send on. |
| * If there are no active underlying ills in the group (and thus no |
 * hardware address), DAD will be deferred until an underlying ill
| * becomes active. |
| */ |
| if (IS_IPMP(ill)) { |
| if ((ill = ipmp_ipif_hold_bound_ill(ipif)) == NULL) { |
| freemsg(mp); |
| return (NULL); |
| } |
| } else { |
| ill_refhold(ill); |
| } |
| |
| area = (area_t *)mp->b_rptr; |
| area->area_flags = ACE_F_PERMANENT | ACE_F_PUBLISH | ACE_F_MYADDR; |
| area->area_flags |= optflags; |
| area->area_hw_addr_length = ill->ill_phys_addr_length; |
| bcopy(ill->ill_phys_addr, mp->b_rptr + area->area_hw_addr_offset, |
| area->area_hw_addr_length); |
| |
| ill_refrele(ill); |
| return (mp); |
| } |
| |
| mblk_t * |
| ipif_ared_alloc(ipif_t *ipif) |
| { |
| caddr_t addr; |
| uchar_t *aredp; |
| |
| if (ipif->ipif_ill->ill_isv6) { |
| ASSERT(ipif->ipif_ill->ill_flags & ILLF_XRESOLV); |
| addr = (caddr_t)&ipif->ipif_v6lcl_addr; |
| aredp = (uchar_t *)&ip6_ared_template; |
| } else { |
| addr = (caddr_t)&ipif->ipif_lcl_addr; |
| aredp = (uchar_t *)&ip_ared_template; |
| } |
| |
| return (ill_arp_alloc(ipif->ipif_ill, aredp, addr)); |
| } |
| |
| mblk_t * |
| ill_ared_alloc(ill_t *ill, ipaddr_t addr) |
| { |
| return (ill_arp_alloc(ill, (uchar_t *)&ip_ared_template, |
| (char *)&addr)); |
| } |
| |
| mblk_t * |
| ill_arie_alloc(ill_t *ill, const char *grifname, const void *template) |
| { |
| mblk_t *mp = ill_arp_alloc(ill, template, 0); |
| arie_t *arie; |
| |
| if (mp != NULL) { |
| arie = (arie_t *)mp->b_rptr; |
| (void) strlcpy(arie->arie_grifname, grifname, LIFNAMSIZ); |
| } |
| return (mp); |
| } |
| |
| /* |
| * Completely vaporize a lower level tap and all associated interfaces. |
| * ill_delete is called only out of ip_close when the device control |
| * stream is being closed. |
| */ |
| void |
| ill_delete(ill_t *ill) |
| { |
| ipif_t *ipif; |
| ill_t *prev_ill; |
| ip_stack_t *ipst = ill->ill_ipst; |
| |
| /* |
| * ill_delete may be forcibly entering the ipsq. The previous |
| * ioctl may not have completed and may need to be aborted. |
| * ipsq_flush takes care of it. If we don't need to enter the |
 * ipsq forcibly, the 2nd invocation of ipsq_flush in
| * ill_delete_tail is sufficient. |
| */ |
| ipsq_flush(ill); |
| |
| /* |
| * Nuke all interfaces. ipif_free will take down the interface, |
| * remove it from the list, and free the data structure. |
| * Walk down the ipif list and remove the logical interfaces |
 * first before removing the main ipif. We can't unplumb the
 * zeroth interface first in the case of IPv6, as reset_conn_ill
| * -> ip_ll_delmulti_v6 de-references ill_ipif for checking |
| * POINTOPOINT. |
| * |
 * If ill_ipif was not properly initialized (i.e., low on memory),
 * then there are no interfaces to clean up. In this case just clean up the
| * ill. |
| */ |
| for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) |
| ipif_free(ipif); |
| |
| /* |
| * Used only by ill_arp_on and ill_arp_off, which are writers. |
 * So nobody can be using this mp now. Free the mp allocated for
 * honoring ILLF_NOARP.
| */ |
| freemsg(ill->ill_arp_on_mp); |
| ill->ill_arp_on_mp = NULL; |
| |
| /* Clean up msgs on pending upcalls for mrouted */ |
| reset_mrt_ill(ill); |
| |
| /* |
| * ipif_free -> reset_conn_ipif will remove all multicast |
| * references for IPv4. For IPv6, we need to do it here as |
| * it points only at ills. |
| */ |
| reset_conn_ill(ill); |
| |
| /* |
| * Remove multicast references added as a result of calls to |
| * ip_join_allmulti(). |
| */ |
| ip_purge_allmulti(ill); |
| |
| /* |
| * If the ill being deleted is under IPMP, boot it out of the illgrp. |
| */ |
| if (IS_UNDER_IPMP(ill)) |
| ipmp_ill_leave_illgrp(ill); |
| |
| /* |
| * ill_down will arrange to blow off any IRE's dependent on this |
| * ILL, and shut down fragmentation reassembly. |
| */ |
| ill_down(ill); |
| |
| /* Let SCTP know, so that it can remove this from its list. */ |
| sctp_update_ill(ill, SCTP_ILL_REMOVE); |
| |
| /* |
| * If an address on this ILL is being used as a source address then |
| * clear out the pointers in other ILLs that point to this ILL. |
| */ |
| rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER); |
| if (ill->ill_usesrc_grp_next != NULL) { |
| if (ill->ill_usesrc_ifindex == 0) { /* usesrc ILL ? */ |
| ill_disband_usesrc_group(ill); |
| } else { /* consumer of the usesrc ILL */ |
| prev_ill = ill_prev_usesrc(ill); |
| prev_ill->ill_usesrc_grp_next = |
| ill->ill_usesrc_grp_next; |
| } |
| } |
| rw_exit(&ipst->ips_ill_g_usesrc_lock); |
| } |
| |
| static void |
| ipif_non_duplicate(ipif_t *ipif) |
| { |
| ill_t *ill = ipif->ipif_ill; |
| mutex_enter(&ill->ill_lock); |
| if (ipif->ipif_flags & IPIF_DUPLICATE) { |
| ipif->ipif_flags &= ~IPIF_DUPLICATE; |
| ASSERT(ill->ill_ipif_dup_count > 0); |
| ill->ill_ipif_dup_count--; |
| } |
| mutex_exit(&ill->ill_lock); |
| } |
| |
| /* |
| * ill_delete_tail is called from ip_modclose after all references |
 * to the closing ill are gone. The wait is done in ip_modclose.
| */ |
| void |
| ill_delete_tail(ill_t *ill) |
| { |
| mblk_t **mpp; |
| ipif_t *ipif; |
| ip_stack_t *ipst = ill->ill_ipst; |
| |
| for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { |
| ipif_non_duplicate(ipif); |
| ipif_down_tail(ipif); |
| } |
| |
| ASSERT(ill->ill_ipif_dup_count == 0 && |
| ill->ill_arp_down_mp == NULL && |
| ill->ill_arp_del_mapping_mp == NULL); |
| |
| /* |
 * If polling capability is enabled (which signifies a direct
 * upcall into IP and that the driver has the ill saved as a handle),
 * we need to make sure that unbind has completed before we
 * let the ill disappear and the driver no longer has any reference
 * to this ill.
| */ |
| mutex_enter(&ill->ill_lock); |
| while (ill->ill_state_flags & ILL_DL_UNBIND_IN_PROGRESS) |
| cv_wait(&ill->ill_cv, &ill->ill_lock); |
| mutex_exit(&ill->ill_lock); |
| ASSERT(!(ill->ill_capabilities & |
| (ILL_CAPAB_DLD | ILL_CAPAB_DLD_POLL | ILL_CAPAB_DLD_DIRECT))); |
| |
| if (ill->ill_net_type != IRE_LOOPBACK) |
| qprocsoff(ill->ill_rq); |
| |
| /* |
| * We do an ipsq_flush once again now. New messages could have |
 * arrived from below (M_ERROR or M_HANGUP). Similarly, ioctls
 * could also have arrived if an ioctl thread had looked up
 * the ill before we set the ILL_CONDEMNED flag, but had not yet
| * enqueued the ioctl when we did the ipsq_flush last time. |
| */ |
| ipsq_flush(ill); |
| |
| /* |
| * Free capabilities. |
| */ |
| if (ill->ill_ipsec_capab_ah != NULL) { |
| ill_ipsec_capab_delete(ill, DL_CAPAB_IPSEC_AH); |
| ill_ipsec_capab_free(ill->ill_ipsec_capab_ah); |
| ill->ill_ipsec_capab_ah = NULL; |
| } |
| |
| if (ill->ill_ipsec_capab_esp != NULL) { |
| ill_ipsec_capab_delete(ill, DL_CAPAB_IPSEC_ESP); |
| ill_ipsec_capab_free(ill->ill_ipsec_capab_esp); |
| ill->ill_ipsec_capab_esp = NULL; |
| } |
| |
| if (ill->ill_mdt_capab != NULL) { |
| kmem_free(ill->ill_mdt_capab, sizeof (ill_mdt_capab_t)); |
| ill->ill_mdt_capab = NULL; |
| } |
| |
| if (ill->ill_hcksum_capab != NULL) { |
| kmem_free(ill->ill_hcksum_capab, sizeof (ill_hcksum_capab_t)); |
| ill->ill_hcksum_capab = NULL; |
| } |
| |
| if (ill->ill_zerocopy_capab != NULL) { |
| kmem_free(ill->ill_zerocopy_capab, |
| sizeof (ill_zerocopy_capab_t)); |
| ill->ill_zerocopy_capab = NULL; |
| } |
| |
| if (ill->ill_lso_capab != NULL) { |
| kmem_free(ill->ill_lso_capab, sizeof (ill_lso_capab_t)); |
| ill->ill_lso_capab = NULL; |
| } |
| |
| if (ill->ill_dld_capab != NULL) { |
| kmem_free(ill->ill_dld_capab, sizeof (ill_dld_capab_t)); |
| ill->ill_dld_capab = NULL; |
| } |
| |
| while (ill->ill_ipif != NULL) |
| ipif_free_tail(ill->ill_ipif); |
| |
| /* |
| * We have removed all references to ilm from conn and the ones joined |
| * within the kernel. |
| * |
| * We don't walk conns, mrts and ires because |
| * |
 * 1) reset_conn_ill and reset_mrt_ill clean up conns and mrts.
 * 2) ill_down -> ill_downi walks all the ires and cleans up
| * ill references. |
| */ |
| ASSERT(ilm_walk_ill(ill) == 0); |
| |
| /* |
| * If this ill is an IPMP meta-interface, blow away the illgrp. This |
| * is safe to do because the illgrp has already been unlinked from the |
| * group by I_PUNLINK, and thus SIOCSLIFGROUPNAME cannot find it. |
| */ |
| if (IS_IPMP(ill)) { |
| ipmp_illgrp_destroy(ill->ill_grp); |
| ill->ill_grp = NULL; |
| } |
| |
| /* |
| * Take us out of the list of ILLs. ill_glist_delete -> phyint_free |
| * could free the phyint. No more reference to the phyint after this |
| * point. |
| */ |
| (void) ill_glist_delete(ill); |
| |
| rw_enter(&ipst->ips_ip_g_nd_lock, RW_WRITER); |
| if (ill->ill_ndd_name != NULL) |
| nd_unload(&ipst->ips_ip_g_nd, ill->ill_ndd_name); |
| rw_exit(&ipst->ips_ip_g_nd_lock); |
| |
| if (ill->ill_frag_ptr != NULL) { |
| uint_t count; |
| |
| for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) { |
| mutex_destroy(&ill->ill_frag_hash_tbl[count].ipfb_lock); |
| } |
| mi_free(ill->ill_frag_ptr); |
| ill->ill_frag_ptr = NULL; |
| ill->ill_frag_hash_tbl = NULL; |
| } |
| |
| freemsg(ill->ill_nd_lla_mp); |
| /* Free all retained control messages. */ |
| mpp = &ill->ill_first_mp_to_free; |
| do { |
| while (mpp[0]) { |
| mblk_t *mp; |
| mblk_t *mp1; |
| |
| mp = mpp[0]; |
| mpp[0] = mp->b_next; |
| for (mp1 = mp; mp1 != NULL; mp1 = mp1->b_cont) { |
| mp1->b_next = NULL; |
| mp1->b_prev = NULL; |
| } |
| freemsg(mp); |
| } |
| } while (mpp++ != &ill->ill_last_mp_to_free); |
| |
| ill_free_mib(ill); |
| |
| #ifdef DEBUG |
| ill_trace_cleanup(ill); |
| #endif |
| |
| /* Drop refcnt here */ |
| netstack_rele(ill->ill_ipst->ips_netstack); |
| ill->ill_ipst = NULL; |
| } |
| |
| static void |
| ill_free_mib(ill_t *ill) |
| { |
| ip_stack_t *ipst = ill->ill_ipst; |
| |
| /* |
| * MIB statistics must not be lost, so when an interface |
| * goes away the counter values will be added to the global |
| * MIBs. |
| */ |
| if (ill->ill_ip_mib != NULL) { |
| if (ill->ill_isv6) { |
| ip_mib2_add_ip_stats(&ipst->ips_ip6_mib, |
| ill->ill_ip_mib); |
| } else { |
| ip_mib2_add_ip_stats(&ipst->ips_ip_mib, |
| ill->ill_ip_mib); |
| } |
| |
| kmem_free(ill->ill_ip_mib, sizeof (*ill->ill_ip_mib)); |
| ill->ill_ip_mib = NULL; |
| } |
| if (ill->ill_icmp6_mib != NULL) { |
| ip_mib2_add_icmp6_stats(&ipst->ips_icmp6_mib, |
| ill->ill_icmp6_mib); |
| kmem_free(ill->ill_icmp6_mib, sizeof (*ill->ill_icmp6_mib)); |
| ill->ill_icmp6_mib = NULL; |
| } |
| } |
| |
| /* |
| * Concatenate together a physical address and a sap. |
| * |
| * Sap_lengths are interpreted as follows: |
| * sap_length == 0 ==> no sap |
| * sap_length > 0 ==> sap is at the head of the dlpi address |
| * sap_length < 0 ==> sap is at the tail of the dlpi address |
| */ |
| static void |
| ill_dlur_copy_address(uchar_t *phys_src, uint_t phys_length, |
| t_scalar_t sap_src, t_scalar_t sap_length, uchar_t *dst) |
| { |
| uint16_t sap_addr = (uint16_t)sap_src; |
| |
| if (sap_length == 0) { |
| if (phys_src == NULL) |
| bzero(dst, phys_length); |
| else |
| bcopy(phys_src, dst, phys_length); |
| } else if (sap_length < 0) { |
| if (phys_src == NULL) |
| bzero(dst, phys_length); |
| else |
| bcopy(phys_src, dst, phys_length); |
| bcopy(&sap_addr, (char *)dst + phys_length, sizeof (sap_addr)); |
| } else { |
| bcopy(&sap_addr, dst, sizeof (sap_addr)); |
| if (phys_src == NULL) |
| bzero((char *)dst + sap_length, phys_length); |
| else |
| bcopy(phys_src, (char *)dst + sap_length, phys_length); |
| } |
| } |
| |
| /* |
| * Generate a dl_unitdata_req mblk for the device and address given. |
| * addr_length is the length of the physical portion of the address. |
 * If addr is NULL, include an all-zero address of the specified length.
 * In any case, addr_length is taken to be the entire length of the
| * dlpi address, including the absolute value of sap_length. |
| */ |
| mblk_t * |
| ill_dlur_gen(uchar_t *addr, uint_t addr_length, t_uscalar_t sap, |
| t_scalar_t sap_length) |
| { |
| dl_unitdata_req_t *dlur; |
| mblk_t *mp; |
| t_scalar_t abs_sap_length; /* absolute value */ |
| |
| abs_sap_length = ABS(sap_length); |
| mp = ip_dlpi_alloc(sizeof (*dlur) + addr_length + abs_sap_length, |
| DL_UNITDATA_REQ); |
| if (mp == NULL) |
| return (NULL); |
| dlur = (dl_unitdata_req_t *)mp->b_rptr; |
	/* HACK: accommodate incompatible DLPI drivers */
| if (addr_length == 8) |
| addr_length = 6; |
| dlur->dl_dest_addr_length = addr_length + abs_sap_length; |
| dlur->dl_dest_addr_offset = sizeof (*dlur); |
| dlur->dl_priority.dl_min = 0; |
| dlur->dl_priority.dl_max = 0; |
| ill_dlur_copy_address(addr, addr_length, sap, sap_length, |
| (uchar_t *)&dlur[1]); |
| return (mp); |
| } |
| |
| /* |
| * Add the 'mp' to the list of pending mp's headed by ill_pending_mp |
| * Return an error if we already have 1 or more ioctls in progress. |
| * This is used only for non-exclusive ioctls. Currently this is used |
| * for SIOC*ARP and SIOCGTUNPARAM ioctls. Most set ioctls are exclusive |
| * and thus need to use ipsq_pending_mp_add. |
| */ |
| boolean_t |
| ill_pending_mp_add(ill_t *ill, conn_t *connp, mblk_t *add_mp) |
| { |
| ASSERT(MUTEX_HELD(&ill->ill_lock)); |
| ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL)); |
| /* |
| * M_IOCDATA from ioctls, M_IOCTL from tunnel ioctls. |
| */ |
| ASSERT((add_mp->b_datap->db_type == M_IOCDATA) || |
| (add_mp->b_datap->db_type == M_IOCTL)); |
| |
| ASSERT(MUTEX_HELD(&connp->conn_lock)); |
| /* |
 * Return an error if the conn has started closing. The conn
 * could have finished cleaning up the pending mp list.
 * If so, we should not add another mp to the list, negating
| * the cleanup. |
| */ |
| if (connp->conn_state_flags & CONN_CLOSING) |
| return (B_FALSE); |
| /* |
| * Add the pending mp to the head of the list, chained by b_next. |
| * Note down the conn on which the ioctl request came, in b_prev. |
 * This will be used later to get the conn, when we get a response
 * on the ill queue, from some other module (typically arp).
| */ |
| add_mp->b_next = (void *)ill->ill_pending_mp; |
| add_mp->b_queue = CONNP_TO_WQ(connp); |
| ill->ill_pending_mp = add_mp; |
| if (connp != NULL) |
| connp->conn_oper_pending_ill = ill; |
| return (B_TRUE); |
| } |
| |
| /* |
| * Retrieve the ill_pending_mp and return it. We have to walk the list |
| * of mblks starting at ill_pending_mp, and match based on the ioc_id. |
| */ |
| mblk_t * |
| ill_pending_mp_get(ill_t *ill, conn_t **connpp, uint_t ioc_id) |
| { |
| mblk_t *prev = NULL; |
| mblk_t *curr = NULL; |
| uint_t id; |
| conn_t *connp; |
| |
| /* |
| * When the conn closes, conn_ioctl_cleanup needs to clean |
| * up the pending mp, but it does not know the ioc_id and |
| * passes in a zero for it. |
| */ |
| mutex_enter(&ill->ill_lock); |
| if (ioc_id != 0) |
| *connpp = NULL; |
| |
| /* Search the list for the appropriate ioctl based on ioc_id */ |
| for (prev = NULL, curr = ill->ill_pending_mp; curr != NULL; |
| prev = curr, curr = curr->b_next) { |
| id = ((struct iocblk *)curr->b_rptr)->ioc_id; |
| connp = Q_TO_CONN(curr->b_queue); |
| /* Match based on the ioc_id or based on the conn */ |
| if ((id == ioc_id) || (ioc_id == 0 && connp == *connpp)) |
| break; |
| } |
| |
| if (curr != NULL) { |
| /* Unlink the mblk from the pending mp list */ |
| if (prev != NULL) { |
| prev->b_next = curr->b_next; |
| } else { |
| ASSERT(ill->ill_pending_mp == curr); |
| ill->ill_pending_mp = curr->b_next; |
| } |
| |
| /* |
| * conn refcnt must have been bumped up at the start of |
| * the ioctl. So we can safely access the conn. |
| */ |
| ASSERT(CONN_Q(curr->b_queue)); |
| *connpp = Q_TO_CONN(curr->b_queue); |
| curr->b_next = NULL; |
| curr->b_queue = NULL; |
| } |
| |
| mutex_exit(&ill->ill_lock); |
| |
| return (curr); |
| } |
| |
| /* |
| * Add the pending mp to the list. There can be only 1 pending mp |
| * in the list. Any exclusive ioctl that needs to wait for a response |
| * from another module or driver needs to use this function to set |
| * the ipx_pending_mp to the ioctl mblk and wait for the response from |
| * the other module/driver. This is also used while waiting for the |
| * ipif/ill/ire refcnts to drop to zero in bringing down an ipif. |
| */ |
| boolean_t |
| ipsq_pending_mp_add(conn_t *connp, ipif_t *ipif, queue_t *q, mblk_t *add_mp, |
| int waitfor) |
| { |
| ipxop_t *ipx = ipif->ipif_ill->ill_phyint->phyint_ipsq->ipsq_xop; |
| |
| ASSERT(IAM_WRITER_IPIF(ipif)); |
| ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); |
| ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL)); |
| ASSERT(ipx->ipx_pending_mp == NULL); |
| /* |
| * The caller may be using a different ipif than the one passed into |
| * ipsq_current_start() (e.g., suppose an ioctl that came in on the V4 |
| * ill needs to wait for the V6 ill to quiesce). So we can't ASSERT |
| * that `ipx_current_ipif == ipif'. |
| */ |
| ASSERT(ipx->ipx_current_ipif != NULL); |
| |
| /* |
| * M_IOCDATA from ioctls, M_IOCTL from tunnel ioctls, |
| * M_ERROR/M_HANGUP/M_PROTO/M_PCPROTO from the driver. |
| */ |
| ASSERT((DB_TYPE(add_mp) == M_IOCDATA) || (DB_TYPE(add_mp) == M_IOCTL) || |
| (DB_TYPE(add_mp) == M_ERROR) || (DB_TYPE(add_mp) == M_HANGUP) || |
| (DB_TYPE(add_mp) == M_PROTO) || (DB_TYPE(add_mp) == M_PCPROTO)); |
| |
| if (connp != NULL) { |
| ASSERT(MUTEX_HELD(&connp->conn_lock)); |
| /* |
 * Return an error if the conn has started closing. The conn
 * could have finished cleaning up the pending mp list.
 * If so, we should not add another mp to the list, negating
| * the cleanup. |
| */ |
| if (connp->conn_state_flags & CONN_CLOSING) |
| return (B_FALSE); |
| } |
| mutex_enter(&ipx->ipx_lock); |
| ipx->ipx_pending_ipif = ipif; |
| /* |
| * Note down the queue in b_queue. This will be returned by |
 * ipsq_pending_mp_get. The caller will then use these values to restart
 * the processing.
| */ |
| add_mp->b_next = NULL; |
| add_mp->b_queue = q; |
| ipx->ipx_pending_mp = add_mp; |
| ipx->ipx_waitfor = waitfor; |
| mutex_exit(&ipx->ipx_lock); |
| |
| if (connp != NULL) |
| connp->conn_oper_pending_ill = ipif->ipif_ill; |
| |
| return (B_TRUE); |
| } |
| |
| /* |
| * Retrieve the ipx_pending_mp and return it. There can be only 1 mp |
| * queued in the list. |
| */ |
| mblk_t * |
| ipsq_pending_mp_get(ipsq_t *ipsq, conn_t **connpp) |
| { |
| mblk_t *curr = NULL; |
| ipxop_t *ipx = ipsq->ipsq_xop; |
| |
| *connpp = NULL; |
| mutex_enter(&ipx->ipx_lock); |
| if (ipx->ipx_pending_mp == NULL) { |
| mutex_exit(&ipx->ipx_lock); |
| return (NULL); |
| } |
| |
| /* There can be only 1 such excl message */ |
| curr = ipx->ipx_pending_mp; |
| ASSERT(curr->b_next == NULL); |
| ipx->ipx_pending_ipif = NULL; |
| ipx->ipx_pending_mp = NULL; |
| ipx->ipx_waitfor = 0; |
| mutex_exit(&ipx->ipx_lock); |
| |
| if (CONN_Q(curr->b_queue)) { |
| /* |
| * This mp did a refhold on the conn, at the start of the ioctl. |
| * So we can safely return a pointer to the conn to the caller. |
| */ |
| *connpp = Q_TO_CONN(curr->b_queue); |
| } else { |
| *connpp = NULL; |
| } |
| curr->b_next = NULL; |
| curr->b_prev = NULL; |
| return (curr); |
| } |
| |
| /* |
| * Cleanup the ioctl mp queued in ipx_pending_mp |
| * - Called in the ill_delete path |
| * - Called in the M_ERROR or M_HANGUP path on the ill. |
| * - Called in the conn close path. |
| */ |
| boolean_t |
| ipsq_pending_mp_cleanup(ill_t *ill, conn_t *connp) |
| { |
| mblk_t *mp; |
| ipxop_t *ipx; |
| queue_t *q; |
| ipif_t *ipif; |
| |
| ASSERT(IAM_WRITER_ILL(ill)); |
| ipx = ill->ill_phyint->phyint_ipsq->ipsq_xop; |
| |
| /* |
| * If connp is null, unconditionally clean up the ipx_pending_mp. |
| * This happens in M_ERROR/M_HANGUP. We need to abort the current ioctl |
| * even if it is meant for another ill, since we have to enqueue |
| * a new mp now in ipx_pending_mp to complete the ipif_down. |
 * If connp is non-null, we are called from the conn close path.
| */ |
| mutex_enter(&ipx->ipx_lock); |
| mp = ipx->ipx_pending_mp; |
| if (mp == NULL || (connp != NULL && |
| mp->b_queue != CONNP_TO_WQ(connp))) { |
| mutex_exit(&ipx->ipx_lock); |
| return (B_FALSE); |
| } |
| /* Now remove from the ipx_pending_mp */ |
| ipx->ipx_pending_mp = NULL; |
| q = mp->b_queue; |
| mp->b_next = NULL; |
| mp->b_prev = NULL; |
| mp->b_queue = NULL; |
| |
| ipif = ipx->ipx_pending_ipif; |
| ipx->ipx_pending_ipif = NULL; |
| ipx->ipx_waitfor = 0; |
| ipx->ipx_current_ipif = NULL; |
| ipx->ipx_current_ioctl = 0; |
| ipx->ipx_current_done = B_TRUE; |
| mutex_exit(&ipx->ipx_lock); |
| |
| if (DB_TYPE(mp) == M_IOCTL || DB_TYPE(mp) == M_IOCDATA) { |
| if (connp == NULL) { |
| ip_ioctl_finish(q, mp, ENXIO, NO_COPYOUT, NULL); |
| } else { |
| ip_ioctl_finish(q, mp, ENXIO, CONN_CLOSE, NULL); |
| mutex_enter(&ipif->ipif_ill->ill_lock); |
| ipif->ipif_state_flags &= ~IPIF_CHANGING; |
| mutex_exit(&ipif->ipif_ill->ill_lock); |
| } |
| } else { |
| /* |
 * IP-MT XXX In the case of TLI/XTI bind / optmgmt this can't
 * be just inet_freemsg. We have to restart it;
 * otherwise the thread will be stuck.
| */ |
| inet_freemsg(mp); |
| } |
| return (B_TRUE); |
| } |
| |
| /* |
| * The ill is closing. Cleanup all the pending mps. Called exclusively |
| * towards the end of ill_delete. The refcount has gone to 0. So nobody |
 * knows this ill, and hence nobody can add an mp to this list.
| */ |
| static void |
| ill_pending_mp_cleanup(ill_t *ill) |
| { |
| mblk_t *mp; |
| queue_t *q; |
| |
| ASSERT(IAM_WRITER_ILL(ill)); |
| |
| mutex_enter(&ill->ill_lock); |
| /* |
| * Every mp on the pending mp list originating from an ioctl |
| * added 1 to the conn refcnt, at the start of the ioctl. |
 * So bump it down now. See comments in ip_wput_nondata().
| */ |
| while (ill->ill_pending_mp != NULL) { |
| mp = ill->ill_pending_mp; |
| ill->ill_pending_mp = mp->b_next; |
| mutex_exit(&ill->ill_lock); |
| |
| q = mp->b_queue; |
| ASSERT(CONN_Q(q)); |
| mp->b_next = NULL; |
| mp->b_prev = NULL; |
| mp->b_queue = NULL; |
| ip_ioctl_finish(q, mp, ENXIO, NO_COPYOUT, NULL); |
| mutex_enter(&ill->ill_lock); |
| } |
| ill->ill_pending_ipif = NULL; |
| |
| mutex_exit(&ill->ill_lock); |
| } |
| |
| /* |
| * Called in the conn close path and ill delete path |
| */ |
| static void |
| ipsq_xopq_mp_cleanup(ill_t *ill, conn_t *connp) |
| { |
| ipsq_t *ipsq; |
| mblk_t *prev; |
| mblk_t *curr; |
| mblk_t *next; |
| queue_t *q; |
| mblk_t *tmp_list = NULL; |
| |
| ASSERT(IAM_WRITER_ILL(ill)); |
| if (connp != NULL) |
| q = CONNP_TO_WQ(connp); |
| else |
| q = ill->ill_wq; |
| |
| ipsq = ill->ill_phyint->phyint_ipsq; |
| /* |
| * Cleanup the ioctl mp's queued in ipsq_xopq_pending_mp if any. |
| * In the case of ioctl from a conn, there can be only 1 mp |
| * queued on the ipsq. If an ill is being unplumbed, only messages |
 * related to this ill are flushed, like M_ERROR or M_HANGUP messages.
 * ioctls meant for this ill from conns are not flushed. They will
 * be processed during ipsq_exit, will not find the ill, and will
 * return an error.
| */ |
| mutex_enter(&ipsq->ipsq_lock); |
| for (prev = NULL, curr = ipsq->ipsq_xopq_mphead; curr != NULL; |
| curr = next) { |
| next = curr->b_next; |
| if (curr->b_queue == q || curr->b_queue == RD(q)) { |
| /* Unlink the mblk from the pending mp list */ |
| if (prev != NULL) { |
| prev->b_next = curr->b_next; |
| } else { |
| ASSERT(ipsq->ipsq_xopq_mphead == curr); |
| ipsq->ipsq_xopq_mphead = curr->b_next; |
| } |
| if (ipsq->ipsq_xopq_mptail == curr) |
| ipsq->ipsq_xopq_mptail = prev; |
| /* |
 * Create a temporary list and release the ipsq lock.
 * New elements are added to the head of the tmp_list.
| */ |
| curr->b_next = tmp_list; |
| tmp_list = curr; |
| } else { |
| prev = curr; |
| } |
| } |
| mutex_exit(&ipsq->ipsq_lock); |
| |
| while (tmp_list != NULL) { |
| curr = tmp_list; |
| tmp_list = curr->b_next; |
| curr->b_next = NULL; |
| curr->b_prev = NULL; |
| curr->b_queue = NULL; |
| if (DB_TYPE(curr) == M_IOCTL || DB_TYPE(curr) == M_IOCDATA) { |
| ip_ioctl_finish(q, curr, ENXIO, connp != NULL ? |
| CONN_CLOSE : NO_COPYOUT, NULL); |
| } else { |
| /* |
| * IP-MT XXX In the case of TLI/XTI bind / optmgmt |
 * this can't be just inet_freemsg. We have to
 * restart it; otherwise the thread will be stuck.
| */ |
| inet_freemsg(curr); |
| } |
| } |
| } |
| |
| /* |
 * This conn has started closing. Clean up any pending ioctl from this conn.
| * STREAMS ensures that there can be at most 1 ioctl pending on a stream. |
| */ |
| void |
| conn_ioctl_cleanup(conn_t *connp) |
| { |
| mblk_t *curr; |
| ipsq_t *ipsq; |
| ill_t *ill; |
| boolean_t refheld; |
| |
| /* |
 * Is any exclusive ioctl pending? If so, clean it up. If the
 * ioctl has not yet started, the mp is pending in the list headed by
 * ipsq_xopq_head. If the ioctl has started, the mp could be present in
 * ipx_pending_mp. If the ioctl timed out in the streamhead but
 * is currently executing, the mp is not queued anywhere but
| * conn_oper_pending_ill is null. The conn close will wait |
| * till the conn_ref drops to zero. |
| */ |
| mutex_enter(&connp->conn_lock); |
| ill = connp->conn_oper_pending_ill; |
| if (ill == NULL) { |
| mutex_exit(&connp->conn_lock); |
| return; |
| } |
| |
| curr = ill_pending_mp_get(ill, &connp, 0); |
| if (curr != NULL) { |
| mutex_exit(&connp->conn_lock); |
| CONN_DEC_REF(connp); |
| inet_freemsg(curr); |
| return; |
| } |
| /* |
| * We may not be able to refhold the ill if the ill/ipif |
| * is changing. But we need to make sure that the ill will |
| * not vanish. So we just bump up the ill_waiter count. |
| */ |
| refheld = ill_waiter_inc(ill); |
| mutex_exit(&connp->conn_lock); |
| if (refheld) { |
| if (ipsq_enter(ill, B_TRUE, NEW_OP)) { |
| ill_waiter_dcr(ill); |
| /* |
| * Check whether this ioctl has started and is |
 * pending. If it is not found there, then check
| * whether this ioctl has not even started and is in |
| * the ipsq_xopq list. |
| */ |
| if (!ipsq_pending_mp_cleanup(ill, connp)) |
| ipsq_xopq_mp_cleanup(ill, connp); |
| ipsq = ill->ill_phyint->phyint_ipsq; |
| ipsq_exit(ipsq); |
| return; |
| } |
| } |
| |
| /* |
| * The ill is also closing and we could not bump up the |
| * ill_waiter_count or we could not enter the ipsq. Leave |
 * the cleanup to ill_delete.
| */ |
| mutex_enter(&connp->conn_lock); |
| while (connp->conn_oper_pending_ill != NULL) |
| cv_wait(&connp->conn_refcv, &connp->conn_lock); |
| mutex_exit(&connp->conn_lock); |
| if (refheld) |
| ill_waiter_dcr(ill); |
| } |
| |
| /* |
| * ipcl_walk function for cleaning up conn_*_ill fields. |
| */ |
| static void |
| conn_cleanup_ill(conn_t *connp, caddr_t arg) |
| { |
| ill_t *ill = (ill_t *)arg; |
| ire_t *ire; |
| |
| mutex_enter(&connp->conn_lock); |
| if (connp->conn_multicast_ill == ill) { |
| /* Revert to late binding */ |
| connp->conn_multicast_ill = NULL; |
| } |
| if (connp->conn_incoming_ill == ill) |
| connp->conn_incoming_ill = NULL; |
| if (connp->conn_outgoing_ill == ill) |
| connp->conn_outgoing_ill = NULL; |
| if (connp->conn_dhcpinit_ill == ill) { |
| connp->conn_dhcpinit_ill = NULL; |
| ASSERT(ill->ill_dhcpinit != 0); |
| atomic_dec_32(&ill->ill_dhcpinit); |
| } |
| if (connp->conn_ire_cache != NULL) { |
| ire = connp->conn_ire_cache; |
| /* |
| * Source address selection makes it possible for IRE_CACHE |
| * entries to be created with ire_stq coming from interface X |
| * and ipif coming from interface Y. Thus whenever interface |
| * X goes down, remove all references to it by checking both |
| * on ire_ipif and ire_stq. |
| */ |
| if ((ire->ire_ipif != NULL && ire->ire_ipif->ipif_ill == ill) || |
| (ire->ire_type == IRE_CACHE && |
| ire->ire_stq == ill->ill_wq)) { |
| connp->conn_ire_cache = NULL; |
| mutex_exit(&connp->conn_lock); |
| ire_refrele_notr(ire); |
| return; |
| } |
| } |
| mutex_exit(&connp->conn_lock); |
| } |
| |
| /* ARGSUSED */ |
| void |
| ipif_all_down_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) |
| { |
| ill_t *ill = q->q_ptr; |
| ipif_t *ipif; |
| |
| ASSERT(IAM_WRITER_IPSQ(ipsq)); |
| for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { |
| ipif_non_duplicate(ipif); |
| ipif_down_tail(ipif); |
| } |
| freemsg(mp); |
| ipsq_current_finish(ipsq); |
| } |
| |
| /* |
 * ill_down_start is called when we want to down this ill and bring it up
 * again. It is called when we receive an M_ERROR / M_HANGUP. In this case
 * we shut down all interfaces, but don't tear down any plumbing.
| */ |
| boolean_t |
| ill_down_start(queue_t *q, mblk_t *mp) |
| { |
| ill_t *ill = q->q_ptr; |
| ipif_t *ipif; |
| |
| ASSERT(IAM_WRITER_ILL(ill)); |
| |
| for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) |
| (void) ipif_down(ipif, NULL, NULL); |
| |
| ill_down(ill); |
| |
| (void) ipsq_pending_mp_cleanup(ill, NULL); |
| |
| ipsq_current_start(ill->ill_phyint->phyint_ipsq, ill->ill_ipif, 0); |
| |
| /* |
| * Atomically test and add the pending mp if references are active. |
| */ |
| mutex_enter(&ill->ill_lock); |
| if (!ill_is_quiescent(ill)) { |
| /* call cannot fail since `conn_t *' argument is NULL */ |
| (void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq, |
| mp, ILL_DOWN); |
| mutex_exit(&ill->ill_lock); |
| return (B_FALSE); |
| } |
| mutex_exit(&ill->ill_lock); |
| return (B_TRUE); |
| } |
| |
| static void |
| ill_down(ill_t *ill) |
| { |
| ip_stack_t *ipst = ill->ill_ipst; |
| |
| /* Blow off any IREs dependent on this ILL. */ |
| ire_walk(ill_downi, ill, ipst); |
| |
| /* Remove any conn_*_ill depending on this ill */ |
| ipcl_walk(conn_cleanup_ill, (caddr_t)ill, ipst); |
| } |
| |
| /* |
| * ire_walk routine used to delete every IRE that depends on queues |
| * associated with 'ill'. (Always called as writer.) |
| */ |
| static void |
| ill_downi(ire_t *ire, char *ill_arg) |
| { |
| ill_t *ill = (ill_t *)ill_arg; |
| |
| /* |
| * Source address selection makes it possible for IRE_CACHE |
| * entries to be created with ire_stq coming from interface X |
| * and ipif coming from interface Y. Thus whenever interface |
| * X goes down, remove all references to it by checking both |
| * on ire_ipif and ire_stq. |
| */ |
| if ((ire->ire_ipif != NULL && ire->ire_ipif->ipif_ill == ill) || |
| (ire->ire_type == IRE_CACHE && ire->ire_stq == ill->ill_wq)) { |
| ire_delete(ire); |
| } |
| } |
| |
| /* |
| * Remove ire/nce from the fastpath list. |
| */ |
| void |
| ill_fastpath_nack(ill_t *ill) |
| { |
| nce_fastpath_list_dispatch(ill, NULL, NULL); |
| } |
| |
| /* Consume an M_IOCACK of the fastpath probe. */ |
| void |
| ill_fastpath_ack(ill_t *ill, mblk_t *mp) |
| { |
| mblk_t *mp1 = mp; |
| |
| /* |
 * If this was the first attempt, turn on fastpath probing.
| */ |
| mutex_enter(&ill->ill_lock); |
| if (ill->ill_dlpi_fastpath_state == IDS_INPROGRESS) |
| ill->ill_dlpi_fastpath_state = IDS_OK; |
| mutex_exit(&ill->ill_lock); |
| |
| /* Free the M_IOCACK mblk, hold on to the data */ |
| mp = mp->b_cont; |
| freeb(mp1); |
| if (mp == NULL) |
| return; |
| if (mp->b_cont != NULL) { |
| /* |
| * Update all IRE's or NCE's that are waiting for |
| * fastpath update. |
| */ |
| nce_fastpath_list_dispatch(ill, ndp_fastpath_update, mp); |
| mp1 = mp->b_cont; |
| freeb(mp); |
| mp = mp1; |
| } else { |
| ip0dbg(("ill_fastpath_ack: no b_cont\n")); |
| } |
| |
| freeb(mp); |
| } |
| |
| /* |
| * Throw an M_IOCTL message downstream asking "do you know fastpath?" |
| * The data portion of the request is a dl_unitdata_req_t template for |
| * what we would send downstream in the absence of a fastpath confirmation. |
| */ |
| int |
| ill_fastpath_probe(ill_t *ill, mblk_t *dlur_mp) |
| { |
| struct iocblk *ioc; |
| mblk_t *mp; |
| |
| if (dlur_mp == NULL) |
| return (EINVAL); |
| |
| mutex_enter(&ill->ill_lock); |
| switch (ill->ill_dlpi_fastpath_state) { |
| case IDS_FAILED: |
| /* |
| * Driver NAKed the first fastpath ioctl - assume it doesn't |
| * support it. |
| */ |
| mutex_exit(&ill->ill_lock); |
| return (ENOTSUP); |
| case IDS_UNKNOWN: |
| /* This is the first probe */ |
| ill->ill_dlpi_fastpath_state = IDS_INPROGRESS; |
| break; |
| default: |
| break; |
| } |
| mutex_exit(&ill->ill_lock); |
| |
| if ((mp = mkiocb(DL_IOC_HDR_INFO)) == NULL) |
| return (EAGAIN); |
| |
| mp->b_cont = copyb(dlur_mp); |
| if (mp->b_cont == NULL) { |
| freeb(mp); |
| return (EAGAIN); |
| } |
| |
| ioc = (struct iocblk *)mp->b_rptr; |
| ioc->ioc_count = msgdsize(mp->b_cont); |
| |
| putnext(ill->ill_wq, mp); |
| return (0); |
| } |
| |
| void |
| ill_capability_probe(ill_t *ill) |
| { |
| mblk_t *mp; |
| |
| ASSERT(IAM_WRITER_ILL(ill)); |
| |
| if (ill->ill_dlpi_capab_state != IDCS_UNKNOWN && |
| ill->ill_dlpi_capab_state != IDCS_FAILED) |
| return; |
| |
| /* |
| * We are starting a new cycle of capability negotiation. |
| * Free up the capab reset messages of any previous incarnation. |
 * We will do a fresh allocation when we get the response to our probe.
| */ |
| if (ill->ill_capab_reset_mp != NULL) { |
| freemsg(ill->ill_capab_reset_mp); |
| ill->ill_capab_reset_mp = NULL; |
| } |
| |
| ip1dbg(("ill_capability_probe: starting capability negotiation\n")); |
| |
| mp = ip_dlpi_alloc(sizeof (dl_capability_req_t), DL_CAPABILITY_REQ); |
| if (mp == NULL) |
| return; |
| |
| ill_capability_send(ill, mp); |
| ill->ill_dlpi_capab_state = IDCS_PROBE_SENT; |
| } |
| |
| void |
| ill_capability_reset(ill_t *ill, boolean_t reneg) |
| { |
| ASSERT(IAM_WRITER_ILL(ill)); |
| |
| if (ill->ill_dlpi_capab_state != IDCS_OK) |
| return; |
| |
| ill->ill_dlpi_capab_state = reneg ? IDCS_RENEG : IDCS_RESET_SENT; |
| |
| ill_capability_send(ill, ill->ill_capab_reset_mp); |
| ill->ill_capab_reset_mp = NULL; |
| /* |
	 * We turn off all capabilities except the direct function call
	 * capabilities (viz. ILL_CAPAB_DLD*), which are turned off by
	 * their corresponding reset functions.
| */ |
| ill->ill_capabilities &= ~(ILL_CAPAB_MDT | ILL_CAPAB_HCKSUM | |
| ILL_CAPAB_ZEROCOPY | ILL_CAPAB_AH | ILL_CAPAB_ESP); |
| } |
| |
| static void |
| ill_capability_reset_alloc(ill_t *ill) |
| { |
| mblk_t *mp; |
| size_t size = 0; |
| int err; |
| dl_capability_req_t *capb; |
| |
| ASSERT(IAM_WRITER_ILL(ill)); |
| ASSERT(ill->ill_capab_reset_mp == NULL); |
| |
| if (ILL_MDT_CAPABLE(ill)) |
| size += sizeof (dl_capability_sub_t) + sizeof (dl_capab_mdt_t); |
| |
| if (ILL_HCKSUM_CAPABLE(ill)) { |
| size += sizeof (dl_capability_sub_t) + |
| sizeof (dl_capab_hcksum_t); |
| } |
| |
| if (ill->ill_capabilities & ILL_CAPAB_ZEROCOPY) { |
| size += sizeof (dl_capability_sub_t) + |
| sizeof (dl_capab_zerocopy_t); |
| } |
| |
| if (ill->ill_capabilities & (ILL_CAPAB_AH | ILL_CAPAB_ESP)) { |
| size += sizeof (dl_capability_sub_t); |
| size += ill_capability_ipsec_reset_size(ill, NULL, NULL, |
| NULL, NULL); |
| } |
| |
| if (ill->ill_capabilities & ILL_CAPAB_DLD) { |
| size += sizeof (dl_capability_sub_t) + |
| sizeof (dl_capab_dld_t); |
| } |
| |
| mp = allocb_wait(size + sizeof (dl_capability_req_t), BPRI_MED, |
| STR_NOSIG, &err); |
| |
| mp->b_datap->db_type = M_PROTO; |
| bzero(mp->b_rptr, size + sizeof (dl_capability_req_t)); |
| |
| capb = (dl_capability_req_t *)mp->b_rptr; |
| capb->dl_primitive = DL_CAPABILITY_REQ; |
| capb->dl_sub_offset = sizeof (dl_capability_req_t); |
| capb->dl_sub_length = size; |
| |
| mp->b_wptr += sizeof (dl_capability_req_t); |
| |
| /* |
| * Each handler fills in the corresponding dl_capability_sub_t |
	 * inside the mblk.
| */ |
| ill_capability_mdt_reset_fill(ill, mp); |
| ill_capability_hcksum_reset_fill(ill, mp); |
| ill_capability_zerocopy_reset_fill(ill, mp); |
| ill_capability_ipsec_reset_fill(ill, mp); |
| ill_capability_dld_reset_fill(ill, mp); |
| |
| ill->ill_capab_reset_mp = mp; |
| } |
| |
| static void |
| ill_capability_id_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *outers) |
| { |
| dl_capab_id_t *id_ic; |
| uint_t sub_dl_cap = outers->dl_cap; |
| dl_capability_sub_t *inners; |
| uint8_t *capend; |
| |
| ASSERT(sub_dl_cap == DL_CAPAB_ID_WRAPPER); |
| |
| /* |
| * Note: range checks here are not absolutely sufficient to |
| * make us robust against malformed messages sent by drivers; |
| * this is in keeping with the rest of IP's dlpi handling. |
| * (Remember, it's coming from something else in the kernel |
| * address space) |
| */ |
| |
| capend = (uint8_t *)(outers + 1) + outers->dl_length; |
| if (capend > mp->b_wptr) { |
| cmn_err(CE_WARN, "ill_capability_id_ack: " |
| "malformed sub-capability too long for mblk"); |
| return; |
| } |
| |
| id_ic = (dl_capab_id_t *)(outers + 1); |
| |
| if (outers->dl_length < sizeof (*id_ic) || |
| (inners = &id_ic->id_subcap, |
| inners->dl_length > (outers->dl_length - sizeof (*inners)))) { |
| cmn_err(CE_WARN, "ill_capability_id_ack: malformed " |
| "encapsulated capab type %d too long for mblk", |
| inners->dl_cap); |
| return; |
| } |
| |
| if (!dlcapabcheckqid(&id_ic->id_mid, ill->ill_lmod_rq)) { |
| ip1dbg(("ill_capability_id_ack: mid token for capab type %d " |
| "isn't as expected; pass-thru module(s) detected, " |
| "discarding capability\n", inners->dl_cap)); |
| return; |
| } |
| |
| /* Process the encapsulated sub-capability */ |
| ill_capability_dispatch(ill, mp, inners, B_TRUE); |
| } |
| |
| /* |
| * Process Multidata Transmit capability negotiation ack received from a |
| * DLS Provider. isub must point to the sub-capability (DL_CAPAB_MDT) of a |
| * DL_CAPABILITY_ACK message. |
| */ |
| static void |
| ill_capability_mdt_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) |
| { |
| mblk_t *nmp = NULL; |
| dl_capability_req_t *oc; |
| dl_capab_mdt_t *mdt_ic, *mdt_oc; |
| ill_mdt_capab_t **ill_mdt_capab; |
| uint_t sub_dl_cap = isub->dl_cap; |
| uint8_t *capend; |
| |
| ASSERT(sub_dl_cap == DL_CAPAB_MDT); |
| |
| ill_mdt_capab = (ill_mdt_capab_t **)&ill->ill_mdt_capab; |
| |
| /* |
| * Note: range checks here are not absolutely sufficient to |
| * make us robust against malformed messages sent by drivers; |
| * this is in keeping with the rest of IP's dlpi handling. |
| * (Remember, it's coming from something else in the kernel |
| * address space) |
| */ |
| |
| capend = (uint8_t *)(isub + 1) + isub->dl_length; |
| if (capend > mp->b_wptr) { |
| cmn_err(CE_WARN, "ill_capability_mdt_ack: " |
| "malformed sub-capability too long for mblk"); |
| return; |
| } |
| |
| mdt_ic = (dl_capab_mdt_t *)(isub + 1); |
| |
| if (mdt_ic->mdt_version != MDT_VERSION_2) { |
| cmn_err(CE_CONT, "ill_capability_mdt_ack: " |
| "unsupported MDT sub-capability (version %d, expected %d)", |
| mdt_ic->mdt_version, MDT_VERSION_2); |
| return; |
| } |
| |
| if (!dlcapabcheckqid(&mdt_ic->mdt_mid, ill->ill_lmod_rq)) { |
| ip1dbg(("ill_capability_mdt_ack: mid token for MDT " |
| "capability isn't as expected; pass-thru module(s) " |
| "detected, discarding capability\n")); |
| return; |
| } |
| |
| if (mdt_ic->mdt_flags & DL_CAPAB_MDT_ENABLE) { |
| |
| if (*ill_mdt_capab == NULL) { |
| *ill_mdt_capab = kmem_zalloc(sizeof (ill_mdt_capab_t), |
| KM_NOSLEEP); |
| if (*ill_mdt_capab == NULL) { |
| cmn_err(CE_WARN, "ill_capability_mdt_ack: " |
| "could not enable MDT version %d " |
| "for %s (ENOMEM)\n", MDT_VERSION_2, |
| ill->ill_name); |
| return; |
| } |
| } |
| |
| ip1dbg(("ill_capability_mdt_ack: interface %s supports " |
| "MDT version %d (%d bytes leading, %d bytes trailing " |
| "header spaces, %d max pld bufs, %d span limit)\n", |
| ill->ill_name, MDT_VERSION_2, |
| mdt_ic->mdt_hdr_head, mdt_ic->mdt_hdr_tail, |
| mdt_ic->mdt_max_pld, mdt_ic->mdt_span_limit)); |
| |
| (*ill_mdt_capab)->ill_mdt_version = MDT_VERSION_2; |
| (*ill_mdt_capab)->ill_mdt_on = 1; |
| /* |
		 * Round the following values up to a multiple of 32 bits;
		 * the ULP may further adjust them to accommodate additional
		 * protocol headers. We pass these values to the ULP at bind
		 * time.
| */ |
| (*ill_mdt_capab)->ill_mdt_hdr_head = |
| roundup(mdt_ic->mdt_hdr_head, 4); |
| (*ill_mdt_capab)->ill_mdt_hdr_tail = |
| roundup(mdt_ic->mdt_hdr_tail, 4); |
| (*ill_mdt_capab)->ill_mdt_max_pld = mdt_ic->mdt_max_pld; |
| (*ill_mdt_capab)->ill_mdt_span_limit = mdt_ic->mdt_span_limit; |
| |
| ill->ill_capabilities |= ILL_CAPAB_MDT; |
| } else { |
| uint_t size; |
| uchar_t *rptr; |
| |
| size = sizeof (dl_capability_req_t) + |
| sizeof (dl_capability_sub_t) + sizeof (dl_capab_mdt_t); |
| |
| if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { |
| cmn_err(CE_WARN, "ill_capability_mdt_ack: " |
| "could not enable MDT for %s (ENOMEM)\n", |
| ill->ill_name); |
| return; |
| } |
| |
| rptr = nmp->b_rptr; |
| /* initialize dl_capability_req_t */ |
| oc = (dl_capability_req_t *)nmp->b_rptr; |
| oc->dl_sub_offset = sizeof (dl_capability_req_t); |
| oc->dl_sub_length = sizeof (dl_capability_sub_t) + |
| sizeof (dl_capab_mdt_t); |
| nmp->b_rptr += sizeof (dl_capability_req_t); |
| |
| /* initialize dl_capability_sub_t */ |
| bcopy(isub, nmp->b_rptr, sizeof (*isub)); |
| nmp->b_rptr += sizeof (*isub); |
| |
| /* initialize dl_capab_mdt_t */ |
| mdt_oc = (dl_capab_mdt_t *)nmp->b_rptr; |
| bcopy(mdt_ic, mdt_oc, sizeof (*mdt_ic)); |
| |
| nmp->b_rptr = rptr; |
| |
| ip1dbg(("ill_capability_mdt_ack: asking interface %s " |
| "to enable MDT version %d\n", ill->ill_name, |
| MDT_VERSION_2)); |
| |
| /* set ENABLE flag */ |
| mdt_oc->mdt_flags |= DL_CAPAB_MDT_ENABLE; |
| |
| /* nmp points to a DL_CAPABILITY_REQ message to enable MDT */ |
| ill_capability_send(ill, nmp); |
| } |
| } |
| |
| static void |
| ill_capability_mdt_reset_fill(ill_t *ill, mblk_t *mp) |
| { |
| dl_capab_mdt_t *mdt_subcap; |
| dl_capability_sub_t *dl_subcap; |
| |
| if (!ILL_MDT_CAPABLE(ill)) |
| return; |
| |
| ASSERT(ill->ill_mdt_capab != NULL); |
| |
| dl_subcap = (dl_capability_sub_t *)mp->b_wptr; |
| dl_subcap->dl_cap = DL_CAPAB_MDT; |
| dl_subcap->dl_length = sizeof (*mdt_subcap); |
| |
| mdt_subcap = (dl_capab_mdt_t *)(dl_subcap + 1); |
| mdt_subcap->mdt_version = ill->ill_mdt_capab->ill_mdt_version; |
| mdt_subcap->mdt_flags = 0; |
| mdt_subcap->mdt_hdr_head = 0; |
| mdt_subcap->mdt_hdr_tail = 0; |
| |
| mp->b_wptr += sizeof (*dl_subcap) + sizeof (*mdt_subcap); |
| } |
| |
| static void |
| ill_capability_dld_reset_fill(ill_t *ill, mblk_t *mp) |
| { |
| dl_capability_sub_t *dl_subcap; |
| |
| if (!(ill->ill_capabilities & ILL_CAPAB_DLD)) |
| return; |
| |
| /* |
| * The dl_capab_dld_t that follows the dl_capability_sub_t is not |
| * initialized below since it is not used by DLD. |
| */ |
| dl_subcap = (dl_capability_sub_t *)mp->b_wptr; |
| dl_subcap->dl_cap = DL_CAPAB_DLD; |
| dl_subcap->dl_length = sizeof (dl_capab_dld_t); |
| |
| mp->b_wptr += sizeof (dl_capability_sub_t) + sizeof (dl_capab_dld_t); |
| } |
| |
| /* |
| * Send a DL_NOTIFY_REQ to the specified ill to enable |
| * DL_NOTE_PROMISC_ON/OFF_PHYS notifications. |
| * Invoked by ill_capability_ipsec_ack() before enabling IPsec hardware |
| * acceleration. |
| * Returns B_TRUE on success, B_FALSE if the message could not be sent. |
| */ |
| static boolean_t |
| ill_enable_promisc_notify(ill_t *ill) |
| { |
| mblk_t *mp; |
| dl_notify_req_t *req; |
| |
| IPSECHW_DEBUG(IPSECHW_PKT, ("ill_enable_promisc_notify:\n")); |
| |
| mp = ip_dlpi_alloc(sizeof (dl_notify_req_t), DL_NOTIFY_REQ); |
| if (mp == NULL) |
| return (B_FALSE); |
| |
| req = (dl_notify_req_t *)mp->b_rptr; |
| req->dl_notifications = DL_NOTE_PROMISC_ON_PHYS | |
| DL_NOTE_PROMISC_OFF_PHYS; |
| |
| ill_dlpi_send(ill, mp); |
| |
| return (B_TRUE); |
| } |
| |
| /* |
| * Allocate an IPsec capability request which will be filled by our |
| * caller to turn on support for one or more algorithms. |
| */ |
| static mblk_t * |
| ill_alloc_ipsec_cap_req(ill_t *ill, dl_capability_sub_t *isub) |
| { |
| mblk_t *nmp; |
| dl_capability_req_t *ocap; |
| dl_capab_ipsec_t *ocip; |
| dl_capab_ipsec_t *icip; |
| uint8_t *ptr; |

	icip = (dl_capab_ipsec_t *)(isub + 1);
| |
| /* |
| * The first time around, we send a DL_NOTIFY_REQ to enable |
| * PROMISC_ON/OFF notification from the provider. We need to |
| * do this before enabling the algorithms to avoid leakage of |
| * cleartext packets. |
| */ |
| |
| if (!ill_enable_promisc_notify(ill)) |
| return (NULL); |
| |
| /* |
| * Allocate new mblk which will contain a new capability |
| * request to enable the capabilities. |
| */ |
| |
| nmp = ip_dlpi_alloc(sizeof (dl_capability_req_t) + |
| sizeof (dl_capability_sub_t) + isub->dl_length, DL_CAPABILITY_REQ); |
| if (nmp == NULL) |
| return (NULL); |
| |
| ptr = nmp->b_rptr; |
| |
| /* initialize dl_capability_req_t */ |
| ocap = (dl_capability_req_t *)ptr; |
| ocap->dl_sub_offset = sizeof (dl_capability_req_t); |
| ocap->dl_sub_length = sizeof (dl_capability_sub_t) + isub->dl_length; |
| ptr += sizeof (dl_capability_req_t); |
| |
| /* initialize dl_capability_sub_t */ |
| bcopy(isub, ptr, sizeof (*isub)); |
| ptr += sizeof (*isub); |
| |
| /* initialize dl_capab_ipsec_t */ |
| ocip = (dl_capab_ipsec_t *)ptr; |
| bcopy(icip, ocip, sizeof (*icip)); |
| |
| nmp->b_wptr = (uchar_t *)(&ocip->cip_data[0]); |
| return (nmp); |
| } |
| |
| /* |
| * Process an IPsec capability negotiation ack received from a DLS Provider. |
| * isub must point to the sub-capability (DL_CAPAB_IPSEC_AH or |
| * DL_CAPAB_IPSEC_ESP) of a DL_CAPABILITY_ACK message. |
| */ |
| static void |
| ill_capability_ipsec_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) |
| { |
| dl_capab_ipsec_t *icip; |
| dl_capab_ipsec_alg_t *ialg; /* ptr to input alg spec. */ |
| dl_capab_ipsec_alg_t *oalg; /* ptr to output alg spec. */ |
| uint_t cipher, nciphers; |
| mblk_t *nmp; |
| uint_t alg_len; |
| boolean_t need_sadb_dump; |
| uint_t sub_dl_cap = isub->dl_cap; |
| ill_ipsec_capab_t **ill_capab; |
| uint64_t ill_capab_flag; |
| uint8_t *capend, *ciphend; |
| boolean_t sadb_resync; |
| |
| ASSERT(sub_dl_cap == DL_CAPAB_IPSEC_AH || |
| sub_dl_cap == DL_CAPAB_IPSEC_ESP); |
| |
| if (sub_dl_cap == DL_CAPAB_IPSEC_AH) { |
| ill_capab = (ill_ipsec_capab_t **)&ill->ill_ipsec_capab_ah; |
| ill_capab_flag = ILL_CAPAB_AH; |
| } else { |
| ill_capab = (ill_ipsec_capab_t **)&ill->ill_ipsec_capab_esp; |
| ill_capab_flag = ILL_CAPAB_ESP; |
| } |
| |
| /* |
| * If the ill capability structure exists, then this incoming |
| * DL_CAPABILITY_ACK is a response to a "renegotiation" cycle. |
| * If this is so, then we'd need to resynchronize the SADB |
| * after re-enabling the offloaded ciphers. |
| */ |
| sadb_resync = (*ill_capab != NULL); |
| |
| /* |
| * Note: range checks here are not absolutely sufficient to |
| * make us robust against malformed messages sent by drivers; |
| * this is in keeping with the rest of IP's dlpi handling. |
| * (Remember, it's coming from something else in the kernel |
| * address space) |
| */ |
| |
| capend = (uint8_t *)(isub + 1) + isub->dl_length; |
| if (capend > mp->b_wptr) { |
| cmn_err(CE_WARN, "ill_capability_ipsec_ack: " |
| "malformed sub-capability too long for mblk"); |
| return; |
| } |
| |
| /* |
| * There are two types of acks we process here: |
| * 1. acks in reply to a (first form) generic capability req |
| * (no ENABLE flag set) |
	 *	2. acks in reply to an ENABLE capability req.
| * (ENABLE flag set) |
| * |
| * We process the subcapability passed as argument as follows: |
| * 1 do initializations |
| * 1.1 initialize nmp = NULL |
| * 1.2 set need_sadb_dump to B_FALSE |
| * 2 for each cipher in subcapability: |
| * 2.1 if ENABLE flag is set: |
| * 2.1.1 update per-ill ipsec capabilities info |
| * 2.1.2 set need_sadb_dump to B_TRUE |
| * 2.2 if ENABLE flag is not set: |
| * 2.2.1 if nmp is NULL: |
| * 2.2.1.1 allocate and initialize nmp |
| * 2.2.1.2 init current pos in nmp |
| * 2.2.2 copy current cipher to current pos in nmp |
| * 2.2.3 set ENABLE flag in nmp |
| * 2.2.4 update current pos |
| * 3 if nmp is not equal to NULL, send enable request |
| * 3.1 send capability request |
| * 4 if need_sadb_dump is B_TRUE |
| * 4.1 enable promiscuous on/off notifications |
	 *	4.2 call ill_ipsec_capab_add() to send all
	 *	    AH or ESP SAs to the interface.
| */ |
| |
| nmp = NULL; |
| oalg = NULL; |
| need_sadb_dump = B_FALSE; |
| icip = (dl_capab_ipsec_t *)(isub + 1); |
| ialg = (dl_capab_ipsec_alg_t *)(&icip->cip_data[0]); |
| |
| nciphers = icip->cip_nciphers; |
| ciphend = (uint8_t *)(ialg + icip->cip_nciphers); |
| |
| if (ciphend > capend) { |
| cmn_err(CE_WARN, "ill_capability_ipsec_ack: " |
| "too many ciphers for sub-capability len"); |
| return; |
| } |
| |
| for (cipher = 0; cipher < nciphers; cipher++) { |
| alg_len = sizeof (dl_capab_ipsec_alg_t); |
| |
| if (ialg->alg_flag & DL_CAPAB_ALG_ENABLE) { |
| /* |
| * TBD: when we provide a way to disable capabilities |
| * from above, need to manage the request-pending state |
| * and fail if we were not expecting this ACK. |
| */ |
| IPSECHW_DEBUG(IPSECHW_CAPAB, |
| ("ill_capability_ipsec_ack: got ENABLE ACK\n")); |
| |
| /* |
| * Update IPsec capabilities for this ill |
| */ |
| |
| if (*ill_capab == NULL) { |
| IPSECHW_DEBUG(IPSECHW_CAPAB, |
| ("ill_capability_ipsec_ack: " |
| "allocating ipsec_capab for ill\n")); |
| *ill_capab = ill_ipsec_capab_alloc(); |
| |
| if (*ill_capab == NULL) { |
| cmn_err(CE_WARN, |
| "ill_capability_ipsec_ack: " |
| "could not enable IPsec Hardware " |
| "acceleration for %s (ENOMEM)\n", |
| ill->ill_name); |
| return; |
| } |
| } |
| |
| ASSERT(ialg->alg_type == DL_CAPAB_IPSEC_ALG_AUTH || |
| ialg->alg_type == DL_CAPAB_IPSEC_ALG_ENCR); |
| |
| if (ialg->alg_prim >= MAX_IPSEC_ALGS) { |
| cmn_err(CE_WARN, |
| "ill_capability_ipsec_ack: " |
| "malformed IPsec algorithm id %d", |
| ialg->alg_prim); |
| continue; |
| } |
| |
| if (ialg->alg_type == DL_CAPAB_IPSEC_ALG_AUTH) { |
| IPSEC_ALG_ENABLE((*ill_capab)->auth_hw_algs, |
| ialg->alg_prim); |
| } else { |
| ipsec_capab_algparm_t *alp; |
| |
| IPSEC_ALG_ENABLE((*ill_capab)->encr_hw_algs, |
| ialg->alg_prim); |
| if (!ill_ipsec_capab_resize_algparm(*ill_capab, |
| ialg->alg_prim)) { |
| cmn_err(CE_WARN, |
| "ill_capability_ipsec_ack: " |
| "no space for IPsec alg id %d", |
| ialg->alg_prim); |
| continue; |
| } |
| alp = &((*ill_capab)->encr_algparm[ |
| ialg->alg_prim]); |
| alp->minkeylen = ialg->alg_minbits; |
| alp->maxkeylen = ialg->alg_maxbits; |
| } |
| ill->ill_capabilities |= ill_capab_flag; |
| /* |
| * indicate that a capability was enabled, which |
| * will be used below to kick off a SADB dump |
| * to the ill. |
| */ |
| need_sadb_dump = B_TRUE; |
| } else { |
| IPSECHW_DEBUG(IPSECHW_CAPAB, |
| ("ill_capability_ipsec_ack: enabling alg 0x%x\n", |
| ialg->alg_prim)); |
| |
| if (nmp == NULL) { |
| nmp = ill_alloc_ipsec_cap_req(ill, isub); |
| if (nmp == NULL) { |
| /* |
| * Sending the PROMISC_ON/OFF |
| * notification request failed. |
| * We cannot enable the algorithms |
| * since the Provider will not |
				 * notify IP of promiscuous mode
| * changes, which could lead |
| * to leakage of packets. |
| */ |
| cmn_err(CE_WARN, |
| "ill_capability_ipsec_ack: " |
| "could not enable IPsec Hardware " |
| "acceleration for %s (ENOMEM)\n", |
| ill->ill_name); |
| return; |
| } |
| /* ptr to current output alg specifier */ |
| oalg = (dl_capab_ipsec_alg_t *)nmp->b_wptr; |
| } |
| |
| /* |
| * Copy current alg specifier, set ENABLE |
| * flag, and advance to next output alg. |
| * For now we enable all IPsec capabilities. |
| */ |
| ASSERT(oalg != NULL); |
| bcopy(ialg, oalg, alg_len); |
| oalg->alg_flag |= DL_CAPAB_ALG_ENABLE; |
| nmp->b_wptr += alg_len; |
| oalg = (dl_capab_ipsec_alg_t *)nmp->b_wptr; |
| } |
| |
| /* move to next input algorithm specifier */ |
| ialg = (dl_capab_ipsec_alg_t *) |
| ((char *)ialg + alg_len); |
| } |
| |
| if (nmp != NULL) |
| /* |
| * nmp points to a DL_CAPABILITY_REQ message to enable |
| * IPsec hardware acceleration. |
| */ |
| ill_capability_send(ill, nmp); |
| |
| if (need_sadb_dump) |
| /* |
| * An acknowledgement corresponding to a request to |
| * enable acceleration was received, notify SADB. |
| */ |
| ill_ipsec_capab_add(ill, sub_dl_cap, sadb_resync); |
| } |
| |
| /* |
| * Given an mblk with enough space in it, create sub-capability entries for |
| * DL_CAPAB_IPSEC_{AH,ESP} types which consist of previously-advertised |
| * offloaded ciphers (both AUTH and ENCR) with their enable flags cleared, |
 * in preparation for the reset DL_CAPABILITY_REQ message.
| */ |
| static void |
| ill_fill_ipsec_reset(uint_t nciphers, int stype, uint_t slen, |
| ill_ipsec_capab_t *ill_cap, mblk_t *mp) |
| { |
| dl_capab_ipsec_t *oipsec; |
| dl_capab_ipsec_alg_t *oalg; |
| dl_capability_sub_t *dl_subcap; |
| int i, k; |
| |
| ASSERT(nciphers > 0); |
| ASSERT(ill_cap != NULL); |
| ASSERT(mp != NULL); |
| ASSERT(MBLKTAIL(mp) >= sizeof (*dl_subcap) + sizeof (*oipsec) + slen); |
| |
| /* dl_capability_sub_t for "stype" */ |
| dl_subcap = (dl_capability_sub_t *)mp->b_wptr; |
| dl_subcap->dl_cap = stype; |
| dl_subcap->dl_length = sizeof (dl_capab_ipsec_t) + slen; |
| mp->b_wptr += sizeof (dl_capability_sub_t); |
| |
| /* dl_capab_ipsec_t for "stype" */ |
| oipsec = (dl_capab_ipsec_t *)mp->b_wptr; |
| oipsec->cip_version = 1; |
| oipsec->cip_nciphers = nciphers; |
| mp->b_wptr = (uchar_t *)&oipsec->cip_data[0]; |
| |
| /* create entries for "stype" AUTH ciphers */ |
| for (i = 0; i < ill_cap->algs_size; i++) { |
| for (k = 0; k < BITSPERBYTE; k++) { |
| if ((ill_cap->auth_hw_algs[i] & (1 << k)) == 0) |
| continue; |
| |
| oalg = (dl_capab_ipsec_alg_t *)mp->b_wptr; |
| bzero((void *)oalg, sizeof (*oalg)); |
| oalg->alg_type = DL_CAPAB_IPSEC_ALG_AUTH; |
| oalg->alg_prim = k + (BITSPERBYTE * i); |
| mp->b_wptr += sizeof (dl_capab_ipsec_alg_t); |
| } |
| } |
| /* create entries for "stype" ENCR ciphers */ |
| for (i = 0; i < ill_cap->algs_size; i++) { |
| for (k = 0; k < BITSPERBYTE; k++) { |
| if ((ill_cap->encr_hw_algs[i] & (1 << k)) == 0) |
| continue; |
| |
| oalg = (dl_capab_ipsec_alg_t *)mp->b_wptr; |
| bzero((void *)oalg, sizeof (*oalg)); |
| oalg->alg_type = DL_CAPAB_IPSEC_ALG_ENCR; |
| oalg->alg_prim = k + (BITSPERBYTE * i); |
| mp->b_wptr += sizeof (dl_capab_ipsec_alg_t); |
| } |
| } |
| } |
| |
| /* |
| * Macro to count number of 1s in a byte (8-bit word). The total count is |
| * accumulated into the passed-in argument (sum). We could use SPARCv9's |
| * POPC instruction, but our macro is more flexible for an arbitrary length |
| * of bytes, such as {auth,encr}_hw_algs. These variables are currently |
 * 256 bits long (MAX_IPSEC_ALGS), so if we know for sure that the length
| * stays that way, we can reduce the number of iterations required. |
| */ |
| #define COUNT_1S(val, sum) { \ |
| uint8_t x = val & 0xff; \ |
| x = (x & 0x55) + ((x >> 1) & 0x55); \ |
| x = (x & 0x33) + ((x >> 2) & 0x33); \ |
| sum += (x & 0xf) + ((x >> 4) & 0xf); \ |
| } |
| |
| /* ARGSUSED */ |
| static int |
| ill_capability_ipsec_reset_size(ill_t *ill, int *ah_cntp, int *ah_lenp, |
| int *esp_cntp, int *esp_lenp) |
| { |
| ill_ipsec_capab_t *cap_ah = ill->ill_ipsec_capab_ah; |
| ill_ipsec_capab_t *cap_esp = ill->ill_ipsec_capab_esp; |
| uint64_t ill_capabilities = ill->ill_capabilities; |
| int ah_cnt = 0, esp_cnt = 0; |
| int ah_len = 0, esp_len = 0; |
| int i, size = 0; |
| |
| if (!(ill_capabilities & (ILL_CAPAB_AH | ILL_CAPAB_ESP))) |
| return (0); |
| |
| ASSERT(cap_ah != NULL || !(ill_capabilities & ILL_CAPAB_AH)); |
| ASSERT(cap_esp != NULL || !(ill_capabilities & ILL_CAPAB_ESP)); |
| |
| /* Find out the number of ciphers for AH */ |
| if (cap_ah != NULL) { |
| for (i = 0; i < cap_ah->algs_size; i++) { |
| COUNT_1S(cap_ah->auth_hw_algs[i], ah_cnt); |
| COUNT_1S(cap_ah->encr_hw_algs[i], ah_cnt); |
| } |
| if (ah_cnt > 0) { |
| size += sizeof (dl_capability_sub_t) + |
| sizeof (dl_capab_ipsec_t); |
| /* dl_capab_ipsec_t contains one dl_capab_ipsec_alg_t */ |
| ah_len = (ah_cnt - 1) * sizeof (dl_capab_ipsec_alg_t); |
| size += ah_len; |
| } |
| } |
| |
| /* Find out the number of ciphers for ESP */ |
| if (cap_esp != NULL) { |
| for (i = 0; i < cap_esp->algs_size; i++) { |
| COUNT_1S(cap_esp->auth_hw_algs[i], esp_cnt); |
| COUNT_1S(cap_esp->encr_hw_algs[i], esp_cnt); |
| } |
| if (esp_cnt > 0) { |
| size += sizeof (dl_capability_sub_t) + |
| sizeof (dl_capab_ipsec_t); |
| /* dl_capab_ipsec_t contains one dl_capab_ipsec_alg_t */ |
| esp_len = (esp_cnt - 1) * sizeof (dl_capab_ipsec_alg_t); |
| size += esp_len; |
| } |
| } |
| |
| if (ah_cntp != NULL) |
| *ah_cntp = ah_cnt; |
| if (ah_lenp != NULL) |
| *ah_lenp = ah_len; |
| if (esp_cntp != NULL) |
| *esp_cntp = esp_cnt; |
| if (esp_lenp != NULL) |
| *esp_lenp = esp_len; |
| |
| return (size); |
| } |
| |
| /* ARGSUSED */ |
| static void |
| ill_capability_ipsec_reset_fill(ill_t *ill, mblk_t *mp) |
| { |
| ill_ipsec_capab_t *cap_ah = ill->ill_ipsec_capab_ah; |
| ill_ipsec_capab_t *cap_esp = ill->ill_ipsec_capab_esp; |
| int ah_cnt = 0, esp_cnt = 0; |
| int ah_len = 0, esp_len = 0; |
| int size; |
| |
| size = ill_capability_ipsec_reset_size(ill, &ah_cnt, &ah_len, |
| &esp_cnt, &esp_len); |
| if (size == 0) |
| return; |
| |
| /* |
	 * Clear the capability flags for IPsec hardware acceleration, but
	 * retain the ill capability structures since it's possible that
	 * another thread
| * is still referring to them. The structures only get deallocated |
| * when we destroy the ill. |
| * |
| * Various places check the flags to see if the ill is capable of |
| * hardware acceleration, and by clearing them we ensure that new |
| * outbound IPsec packets are sent down encrypted. |
| */ |
| |
| /* Fill in DL_CAPAB_IPSEC_AH sub-capability entries */ |
| if (ah_cnt > 0) { |
| ill_fill_ipsec_reset(ah_cnt, DL_CAPAB_IPSEC_AH, ah_len, |
| cap_ah, mp); |
| } |
| |
| /* Fill in DL_CAPAB_IPSEC_ESP sub-capability entries */ |
| if (esp_cnt > 0) { |
| ill_fill_ipsec_reset(esp_cnt, DL_CAPAB_IPSEC_ESP, esp_len, |
| cap_esp, mp); |
| } |
| |
| /* |
| * At this point we've composed a bunch of sub-capabilities to be |
| * encapsulated in a DL_CAPABILITY_REQ and later sent downstream |
| * by the caller. Upon receiving this reset message, the driver |
| * must stop inbound decryption (by destroying all inbound SAs) |
| * and let the corresponding packets come in encrypted. |
| */ |
| } |
| |
| static void |
| ill_capability_dispatch(ill_t *ill, mblk_t *mp, dl_capability_sub_t *subp, |
| boolean_t encapsulated) |
| { |
| boolean_t legacy = B_FALSE; |
| |
| /* |
| * Note that only the following two sub-capabilities may be |
| * considered as "legacy", since their original definitions |
| * do not incorporate the dl_mid_t module ID token, and hence |
| * may require the use of the wrapper sub-capability. |
| */ |
| switch (subp->dl_cap) { |
| case DL_CAPAB_IPSEC_AH: |
| case DL_CAPAB_IPSEC_ESP: |
| legacy = B_TRUE; |
| break; |
| } |
| |
| /* |
| * For legacy sub-capabilities which don't incorporate a queue_t |
| * pointer in their structures, discard them if we detect that |
| * there are intermediate modules in between IP and the driver. |
| */ |
| if (!encapsulated && legacy && ill->ill_lmod_cnt > 1) { |
| ip1dbg(("ill_capability_dispatch: unencapsulated capab type " |
| "%d discarded; %d module(s) present below IP\n", |
| subp->dl_cap, ill->ill_lmod_cnt)); |
| return; |
| } |
| |
| switch (subp->dl_cap) { |
| case DL_CAPAB_IPSEC_AH: |
| case DL_CAPAB_IPSEC_ESP: |
| ill_capability_ipsec_ack(ill, mp, subp); |
| break; |
| case DL_CAPAB_MDT: |
| ill_capability_mdt_ack(ill, mp, subp); |
| break; |
| case DL_CAPAB_HCKSUM: |
| ill_capability_hcksum_ack(ill, mp, subp); |
| break; |
| case DL_CAPAB_ZEROCOPY: |
| ill_capability_zerocopy_ack(ill, mp, subp); |
| break; |
| case DL_CAPAB_DLD: |
| ill_capability_dld_ack(ill, mp, subp); |
| break; |
| default: |
| ip1dbg(("ill_capability_dispatch: unknown capab type %d\n", |
| subp->dl_cap)); |
| } |
| } |
| |
| /* |
| * Process a hardware checksum offload capability negotiation ack received |
 * from a DLS Provider. isub must point to the sub-capability
 * (DL_CAPAB_HCKSUM)
| * of a DL_CAPABILITY_ACK message. |
| */ |
| static void |
| ill_capability_hcksum_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) |
| { |
| dl_capability_req_t *ocap; |
| dl_capab_hcksum_t *ihck, *ohck; |
| ill_hcksum_capab_t **ill_hcksum; |
| mblk_t *nmp = NULL; |
| uint_t sub_dl_cap = isub->dl_cap; |
| uint8_t *capend; |
| |
| ASSERT(sub_dl_cap == DL_CAPAB_HCKSUM); |
| |
| ill_hcksum = (ill_hcksum_capab_t **)&ill->ill_hcksum_capab; |
| |
| /* |
| * Note: range checks here are not absolutely sufficient to |
| * make us robust against malformed messages sent by drivers; |
| * this is in keeping with the rest of IP's dlpi handling. |
| * (Remember, it's coming from something else in the kernel |
| * address space) |
| */ |
| capend = (uint8_t *)(isub + 1) + isub->dl_length; |
| if (capend > mp->b_wptr) { |
| cmn_err(CE_WARN, "ill_capability_hcksum_ack: " |
| "malformed sub-capability too long for mblk"); |
| return; |
| } |
| |
| /* |
| * There are two types of acks we process here: |
| * 1. acks in reply to a (first form) generic capability req |
| * (no ENABLE flag set) |
	 *	2. acks in reply to an ENABLE capability req.
| * (ENABLE flag set) |
| */ |
| ihck = (dl_capab_hcksum_t *)(isub + 1); |
| |
| if (ihck->hcksum_version != HCKSUM_VERSION_1) { |
| cmn_err(CE_CONT, "ill_capability_hcksum_ack: " |
| "unsupported hardware checksum " |
| "sub-capability (version %d, expected %d)", |
| ihck->hcksum_version, HCKSUM_VERSION_1); |
| return; |
| } |
| |
| if (!dlcapabcheckqid(&ihck->hcksum_mid, ill->ill_lmod_rq)) { |
| ip1dbg(("ill_capability_hcksum_ack: mid token for hardware " |
| "checksum capability isn't as expected; pass-thru " |
| "module(s) detected, discarding capability\n")); |
| return; |
| } |
| |
| #define CURR_HCKSUM_CAPAB \ |
| (HCKSUM_INET_PARTIAL | HCKSUM_INET_FULL_V4 | \ |
| HCKSUM_INET_FULL_V6 | HCKSUM_IPHDRCKSUM) |
| |
| if ((ihck->hcksum_txflags & HCKSUM_ENABLE) && |
| (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB)) { |
| /* do ENABLE processing */ |
| if (*ill_hcksum == NULL) { |
| *ill_hcksum = kmem_zalloc(sizeof (ill_hcksum_capab_t), |
| KM_NOSLEEP); |
| |
| if (*ill_hcksum == NULL) { |
| cmn_err(CE_WARN, "ill_capability_hcksum_ack: " |
| "could not enable hcksum version %d " |
| "for %s (ENOMEM)\n", HCKSUM_CURRENT_VERSION, |
| ill->ill_name); |
| return; |
| } |
| } |
| |
| (*ill_hcksum)->ill_hcksum_version = ihck->hcksum_version; |
| (*ill_hcksum)->ill_hcksum_txflags = ihck->hcksum_txflags; |
| ill->ill_capabilities |= ILL_CAPAB_HCKSUM; |
		ip1dbg(("ill_capability_hcksum_ack: interface %s "
		    "has enabled hardware checksumming\n", ill->ill_name));
| } else if (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB) { |
| /* |
		 * Enable hardware checksum offload. Currently IP supports
		 * {TCP,UDP}/IPv4 partial and full checksum offload and
		 * IPv4 header checksum offload. Allocate a new mblk which
		 * will contain a new capability request to enable the
		 * offload.
| */ |
| uint_t size; |
| uchar_t *rptr; |
| |
| size = sizeof (dl_capability_req_t) + |
| sizeof (dl_capability_sub_t) + isub->dl_length; |
| |
| if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { |
| cmn_err(CE_WARN, "ill_capability_hcksum_ack: " |
| "could not enable hardware cksum for %s (ENOMEM)\n", |
| ill->ill_name); |
| return; |
| } |
| |
| rptr = nmp->b_rptr; |
| /* initialize dl_capability_req_t */ |
| ocap = (dl_capability_req_t *)nmp->b_rptr; |
		ocap->dl_sub_offset = sizeof (dl_capability_req_t);
		ocap->dl_sub_length = sizeof (dl_capability_sub_t) +
		    isub->dl_length;
| nmp->b_rptr += sizeof (dl_capability_req_t); |
| |
| /* initialize dl_capability_sub_t */ |
| bcopy(isub, nmp->b_rptr, sizeof (*isub)); |
| nmp->b_rptr += sizeof (*isub); |
| |
| /* initialize dl_capab_hcksum_t */ |
| ohck = (dl_capab_hcksum_t *)nmp->b_rptr; |
| bcopy(ihck, ohck, sizeof (*ihck)); |
| |
| nmp->b_rptr = rptr; |
| ASSERT(nmp->b_wptr == (nmp->b_rptr + size)); |
| |
		/* Set ENABLE flag */
		ohck->hcksum_txflags |= HCKSUM_ENABLE;
|