| /* |
| * CDDL HEADER START |
| * |
| * The contents of this file are subject to the terms of the |
| * Common Development and Distribution License (the "License"). |
| * You may not use this file except in compliance with the License. |
| * |
| * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE |
| * or http://www.opensolaris.org/os/licensing. |
| * See the License for the specific language governing permissions |
| * and limitations under the License. |
| * |
| * When distributing Covered Code, include this CDDL HEADER in each |
| * file and include the License file at usr/src/OPENSOLARIS.LICENSE. |
| * If applicable, add the following below this CDDL HEADER, with the |
| * fields enclosed by brackets "[]" replaced with your own identifying |
| * information: Portions Copyright [yyyy] [name of copyright owner] |
| * |
| * CDDL HEADER END |
| */ |
| /* |
| * Copyright 2006 Sun Microsystems, Inc. All rights reserved. |
| * Use is subject to license terms. |
| */ |
| /* Copyright (c) 1990 Mentat Inc. */ |
| |
| #pragma ident "%Z%%M% %I% %E% SMI" |
| |
| /* |
| * This file contains the interface control functions for IP. |
| */ |
| |
| #include <sys/types.h> |
| #include <sys/stream.h> |
| #include <sys/dlpi.h> |
| #include <sys/stropts.h> |
| #include <sys/strsun.h> |
| #include <sys/sysmacros.h> |
| #include <sys/strlog.h> |
| #include <sys/ddi.h> |
| #include <sys/sunddi.h> |
| #include <sys/cmn_err.h> |
| #include <sys/kstat.h> |
| #include <sys/debug.h> |
| #include <sys/zone.h> |
| |
| #include <sys/kmem.h> |
| #include <sys/systm.h> |
| #include <sys/param.h> |
| #include <sys/socket.h> |
| #include <sys/isa_defs.h> |
| #include <net/if.h> |
| #include <net/if_arp.h> |
| #include <net/if_types.h> |
| #include <net/if_dl.h> |
| #include <net/route.h> |
| #include <sys/sockio.h> |
| #include <netinet/in.h> |
| #include <netinet/ip6.h> |
| #include <netinet/icmp6.h> |
| #include <netinet/igmp_var.h> |
| #include <sys/strsun.h> |
| #include <sys/policy.h> |
| #include <sys/ethernet.h> |
| |
| #include <inet/common.h> /* for various inet/mi.h and inet/nd.h needs */ |
| #include <inet/mi.h> |
| #include <inet/nd.h> |
| #include <inet/arp.h> |
| #include <inet/mib2.h> |
| #include <inet/ip.h> |
| #include <inet/ip6.h> |
| #include <inet/ip6_asp.h> |
| #include <inet/tcp.h> |
| #include <inet/ip_multi.h> |
| #include <inet/ip_ire.h> |
| #include <inet/ip_rts.h> |
| #include <inet/ip_ndp.h> |
| #include <inet/ip_if.h> |
| #include <inet/ip_impl.h> |
| #include <inet/tun.h> |
| #include <inet/sctp_ip.h> |
| |
| #include <net/pfkeyv2.h> |
| #include <inet/ipsec_info.h> |
| #include <inet/sadb.h> |
| #include <inet/ipsec_impl.h> |
| #include <sys/iphada.h> |
| #include <netinet/igmp.h> |
| #include <inet/ip_listutils.h> |
| #include <inet/ipclassifier.h> |
| #include <sys/mac.h> |
| |
| #include <sys/systeminfo.h> |
| #include <sys/bootconf.h> |
| |
| #include <sys/tsol/tndb.h> |
| #include <sys/tsol/tnet.h> |
| |
| /* The character which tells where the ill_name ends */ |
| #define IPIF_SEPARATOR_CHAR ':' |
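/* e.g., "hme0:1" names ipif 1 on the ill "hme0" */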
| |
| /* IP ioctl function table entry */ |
| typedef struct ipft_s { |
| int ipft_cmd; |
| pfi_t ipft_pfi; |
| int ipft_min_size; |
| int ipft_flags; |
| } ipft_t; |
| #define IPFT_F_NO_REPLY 0x1 /* IP ioctl does not expect any reply */ |
| #define IPFT_F_SELF_REPLY 0x2 /* ioctl callee does the ioctl reply */ |
| |
| typedef struct ip_sock_ar_s { |
| union { |
| area_t ip_sock_area; |
| ared_t ip_sock_ared; |
| areq_t ip_sock_areq; |
| } ip_sock_ar_u; |
| queue_t *ip_sock_ar_q; |
| } ip_sock_ar_t; |
| |
| static int nd_ill_forward_get(queue_t *, mblk_t *, caddr_t, cred_t *); |
| static int nd_ill_forward_set(queue_t *q, mblk_t *mp, |
| char *value, caddr_t cp, cred_t *ioc_cr); |
| |
| static boolean_t ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask); |
| static ip_m_t *ip_m_lookup(t_uscalar_t mac_type); |
| static int ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, |
| mblk_t *mp, boolean_t need_up); |
| static int ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, |
| mblk_t *mp, boolean_t need_up); |
| static int ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid, |
| queue_t *q, mblk_t *mp, boolean_t need_up); |
| static int ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, |
| mblk_t *mp, boolean_t need_up); |
| static int ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q, |
| mblk_t *mp); |
| static int ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t, in6_addr_t, |
| queue_t *q, mblk_t *mp, boolean_t need_up); |
| static int ip_sioctl_arp_common(ill_t *ill, queue_t *q, mblk_t *mp, |
| sin_t *sin, boolean_t x_arp_ioctl, boolean_t if_arp_ioctl); |
| static ipaddr_t ip_subnet_mask(ipaddr_t addr, ipif_t **); |
| static void ip_wput_ioctl(queue_t *q, mblk_t *mp); |
| static void ipsq_flush(ill_t *ill); |
| static void ipsq_clean_all(ill_t *ill); |
| static void ipsq_clean_ring(ill_t *ill, ill_rx_ring_t *rx_ring); |
| static int ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen, |
| queue_t *q, mblk_t *mp, boolean_t need_up); |
| static void ipsq_delete(ipsq_t *); |
| |
| static ipif_t *ipif_allocate(ill_t *ill, int id, uint_t ire_type, |
| boolean_t initialize); |
| static void ipif_check_bcast_ires(ipif_t *test_ipif); |
| static void ipif_down_delete_ire(ire_t *ire, char *ipif); |
| static void ipif_delete_cache_ire(ire_t *, char *); |
| static int ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp); |
| static void ipif_down_tail(ipif_t *ipif); |
| static void ipif_free(ipif_t *ipif); |
| static void ipif_free_tail(ipif_t *ipif); |
| static void ipif_mask_reply(ipif_t *); |
| static void ipif_mtu_change(ire_t *ire, char *ipif_arg); |
| static void ipif_multicast_down(ipif_t *ipif); |
| static void ipif_recreate_interface_routes(ipif_t *old_ipif, ipif_t *ipif); |
| static void ipif_set_default(ipif_t *ipif); |
| static int ipif_set_values(queue_t *q, mblk_t *mp, |
| char *interf_name, uint_t *ppa); |
| static int ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp, |
| queue_t *q); |
| static ipif_t *ipif_lookup_on_name(char *name, size_t namelen, |
| boolean_t do_alloc, boolean_t *exists, boolean_t isv6, zoneid_t zoneid, |
| queue_t *q, mblk_t *mp, ipsq_func_t func, int *error); |
| static int ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp); |
| static void ipif_update_other_ipifs(ipif_t *old_ipif, ill_group_t *illgrp); |
| |
| static int ill_alloc_ppa(ill_if_t *, ill_t *); |
| static int ill_arp_off(ill_t *ill); |
| static int ill_arp_on(ill_t *ill); |
| static void ill_delete_interface_type(ill_if_t *); |
| static int ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q); |
| static void ill_down(ill_t *ill); |
| static void ill_downi(ire_t *ire, char *ill_arg); |
| static void ill_downi_mrtun_srcif(ire_t *ire, char *ill_arg); |
| static void ill_down_tail(ill_t *ill); |
| static void ill_free_mib(ill_t *ill); |
| static void ill_glist_delete(ill_t *); |
| static boolean_t ill_has_usable_ipif(ill_t *); |
| static int ill_lock_ipsq_ills(ipsq_t *sq, ill_t **list, int); |
| static void ill_nominate_bcast_rcv(ill_group_t *illgrp); |
| static void ill_phyint_free(ill_t *ill); |
| static void ill_phyint_reinit(ill_t *ill); |
| static void ill_set_nce_router_flags(ill_t *, boolean_t); |
| static void ill_signal_ipsq_ills(ipsq_t *, boolean_t); |
| static boolean_t ill_split_ipsq(ipsq_t *cur_sq); |
| static void ill_stq_cache_delete(ire_t *, char *); |
| |
| static boolean_t ip_ether_v6intfid(uint_t, uint8_t *, in6_addr_t *); |
| static boolean_t ip_nodef_v6intfid(uint_t, uint8_t *, in6_addr_t *); |
| static boolean_t ip_ether_v6mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *, |
| in6_addr_t *); |
| static boolean_t ip_ether_v4mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *, |
| ipaddr_t *); |
| static boolean_t ip_ib_v6intfid(uint_t, uint8_t *, in6_addr_t *); |
| static boolean_t ip_ib_v6mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *, |
| in6_addr_t *); |
| static boolean_t ip_ib_v4mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *, |
| ipaddr_t *); |
| |
| static void ipif_save_ire(ipif_t *, ire_t *); |
| static void ipif_remove_ire(ipif_t *, ire_t *); |
| static void ip_cgtp_bcast_add(ire_t *, ire_t *); |
| static void ip_cgtp_bcast_delete(ire_t *); |
| |
| /* |
| * Per-ill IPsec capabilities management. |
| */ |
| static ill_ipsec_capab_t *ill_ipsec_capab_alloc(void); |
| static void ill_ipsec_capab_free(ill_ipsec_capab_t *); |
| static void ill_ipsec_capab_add(ill_t *, uint_t, boolean_t); |
| static void ill_ipsec_capab_delete(ill_t *, uint_t); |
| static boolean_t ill_ipsec_capab_resize_algparm(ill_ipsec_capab_t *, int); |
| static void ill_capability_proto(ill_t *, int, mblk_t *); |
| static void ill_capability_dispatch(ill_t *, mblk_t *, dl_capability_sub_t *, |
| boolean_t); |
| static void ill_capability_id_ack(ill_t *, mblk_t *, dl_capability_sub_t *); |
| static void ill_capability_mdt_ack(ill_t *, mblk_t *, dl_capability_sub_t *); |
| static void ill_capability_mdt_reset(ill_t *, mblk_t **); |
| static void ill_capability_ipsec_ack(ill_t *, mblk_t *, dl_capability_sub_t *); |
| static void ill_capability_ipsec_reset(ill_t *, mblk_t **); |
| static void ill_capability_hcksum_ack(ill_t *, mblk_t *, dl_capability_sub_t *); |
| static void ill_capability_hcksum_reset(ill_t *, mblk_t **); |
| static void ill_capability_zerocopy_ack(ill_t *, mblk_t *, |
| dl_capability_sub_t *); |
| static void ill_capability_zerocopy_reset(ill_t *, mblk_t **); |
| |
| static void ill_capability_dls_ack(ill_t *, mblk_t *, dl_capability_sub_t *); |
| static mac_resource_handle_t ill_ring_add(void *, mac_resource_t *); |
| static void ill_capability_dls_reset(ill_t *, mblk_t **); |
| static void ill_capability_dls_disable(ill_t *); |
| |
| static void illgrp_cache_delete(ire_t *, char *); |
| static void illgrp_delete(ill_t *ill); |
| static void illgrp_reset_schednext(ill_t *ill); |
| |
| static ill_t *ill_prev_usesrc(ill_t *); |
| static int ill_relink_usesrc_ills(ill_t *, ill_t *, uint_t); |
| static void ill_disband_usesrc_group(ill_t *); |
| |
| static void conn_cleanup_stale_ire(conn_t *, caddr_t); |
| |
| /* |
 * If we go over the memory footprint limit more than once in this msec
| * interval, we'll start pruning aggressively. |
| */ |
| int ip_min_frag_prune_time = 0; |
| |
| /* |
 * Max # of IPsec algorithms supported. Limited to 1 byte by PF_KEY
 * and the IPsec DOI.
| */ |
| #define MAX_IPSEC_ALGS 256 |
| |
| #define BITSPERBYTE 8 |
| #define BITS(type) (BITSPERBYTE * (long)sizeof (type)) |
| |
| #define IPSEC_ALG_ENABLE(algs, algid) \ |
| ((algs)[(algid) / BITS(ipsec_capab_elem_t)] |= \ |
| (1 << ((algid) % BITS(ipsec_capab_elem_t)))) |
| |
| #define IPSEC_ALG_IS_ENABLED(algid, algs) \ |
| ((algs)[(algid) / BITS(ipsec_capab_elem_t)] & \ |
| (1 << ((algid) % BITS(ipsec_capab_elem_t)))) |
| |
| typedef uint8_t ipsec_capab_elem_t; |
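
/*
 * To illustrate the bitmap arithmetic above (a worked example, not code
 * used by IP): with ipsec_capab_elem_t being uint8_t,
 * BITS(ipsec_capab_elem_t) is 8, so for algid 10:
 *
 *	IPSEC_ALG_ENABLE(algs, 10);		sets algs[1] |= (1 << 2)
 *	IPSEC_ALG_IS_ENABLED(10, algs);		tests algs[1] & 0x04
 */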
| |
| /* |
| * Per-algorithm parameters. Note that at present, only encryption |
| * algorithms have variable keysize (IKE does not provide a way to negotiate |
| * auth algorithm keysize). |
| * |
| * All sizes here are in bits. |
| */ |
| typedef struct |
| { |
| uint16_t minkeylen; |
| uint16_t maxkeylen; |
| } ipsec_capab_algparm_t; |
| |
| /* |
| * Per-ill capabilities. |
| */ |
| struct ill_ipsec_capab_s { |
| ipsec_capab_elem_t *encr_hw_algs; |
| ipsec_capab_elem_t *auth_hw_algs; |
| uint32_t algs_size; /* size of _hw_algs in bytes */ |
| /* algorithm key lengths */ |
| ipsec_capab_algparm_t *encr_algparm; |
| uint32_t encr_algparm_size; |
| uint32_t encr_algparm_end; |
| }; |
| |
| /* |
| * List of AH and ESP IPsec acceleration capable ills |
| */ |
| typedef struct ipsec_capab_ill_s { |
| uint_t ill_index; |
| boolean_t ill_isv6; |
| struct ipsec_capab_ill_s *next; |
| } ipsec_capab_ill_t; |
| |
| static ipsec_capab_ill_t *ipsec_capab_ills_ah; |
| static ipsec_capab_ill_t *ipsec_capab_ills_esp; |
| krwlock_t ipsec_capab_ills_lock; |
| |
| /* |
| * The field values are larger than strictly necessary for simple |
 * AR_ENTRY_ADDs, but the padding lets us accommodate the socket ioctls.
| */ |
| static area_t ip_area_template = { |
| AR_ENTRY_ADD, /* area_cmd */ |
| sizeof (ip_sock_ar_t) + (IP_ADDR_LEN*2) + sizeof (struct sockaddr_dl), |
| /* area_name_offset */ |
| /* area_name_length temporarily holds this structure length */ |
| sizeof (area_t), /* area_name_length */ |
| IP_ARP_PROTO_TYPE, /* area_proto */ |
| sizeof (ip_sock_ar_t), /* area_proto_addr_offset */ |
| IP_ADDR_LEN, /* area_proto_addr_length */ |
| sizeof (ip_sock_ar_t) + IP_ADDR_LEN, |
| /* area_proto_mask_offset */ |
| 0, /* area_flags */ |
| sizeof (ip_sock_ar_t) + IP_ADDR_LEN + IP_ADDR_LEN, |
| /* area_hw_addr_offset */ |
| /* Zero length hw_addr_length means 'use your idea of the address' */ |
| 0 /* area_hw_addr_length */ |
| }; |
| |
| /* |
| * AR_ENTRY_ADD/DELETE templates have been added for IPv6 external resolver |
 * support.
| */ |
| static area_t ip6_area_template = { |
| AR_ENTRY_ADD, /* area_cmd */ |
| sizeof (ip_sock_ar_t) + (IPV6_ADDR_LEN*2) + sizeof (sin6_t), |
| /* area_name_offset */ |
| /* area_name_length temporarily holds this structure length */ |
| sizeof (area_t), /* area_name_length */ |
| IP_ARP_PROTO_TYPE, /* area_proto */ |
| sizeof (ip_sock_ar_t), /* area_proto_addr_offset */ |
| IPV6_ADDR_LEN, /* area_proto_addr_length */ |
| sizeof (ip_sock_ar_t) + IPV6_ADDR_LEN, |
| /* area_proto_mask_offset */ |
| 0, /* area_flags */ |
| sizeof (ip_sock_ar_t) + IPV6_ADDR_LEN + IPV6_ADDR_LEN, |
| /* area_hw_addr_offset */ |
| /* Zero length hw_addr_length means 'use your idea of the address' */ |
| 0 /* area_hw_addr_length */ |
| }; |
| |
| static ared_t ip_ared_template = { |
| AR_ENTRY_DELETE, |
| sizeof (ared_t) + IP_ADDR_LEN, |
| sizeof (ared_t), |
| IP_ARP_PROTO_TYPE, |
| sizeof (ared_t), |
| IP_ADDR_LEN |
| }; |
| |
| static ared_t ip6_ared_template = { |
| AR_ENTRY_DELETE, |
| sizeof (ared_t) + IPV6_ADDR_LEN, |
| sizeof (ared_t), |
| IP_ARP_PROTO_TYPE, |
| sizeof (ared_t), |
| IPV6_ADDR_LEN |
| }; |
| |
| /* |
 * An IPv6 AR_ENTRY_QUERY template has not been created, as the areq
 * doesn't include an IP address in ill_dl_up() (the only place an
 * areq is used).
| */ |
| static areq_t ip_areq_template = { |
| AR_ENTRY_QUERY, /* cmd */ |
| sizeof (areq_t)+(2*IP_ADDR_LEN), /* name offset */ |
| sizeof (areq_t), /* name len (filled by ill_arp_alloc) */ |
	IP_ARP_PROTO_TYPE,		/* protocol, from arp's perspective */
| sizeof (areq_t), /* target addr offset */ |
| IP_ADDR_LEN, /* target addr_length */ |
| 0, /* flags */ |
| sizeof (areq_t) + IP_ADDR_LEN, /* sender addr offset */ |
| IP_ADDR_LEN, /* sender addr length */ |
| 6, /* xmit_count */ |
| 1000, /* (re)xmit_interval in milliseconds */ |
| 4 /* max # of requests to buffer */ |
| /* anything else filled in by the code */ |
| }; |
| |
| static arc_t ip_aru_template = { |
| AR_INTERFACE_UP, |
| sizeof (arc_t), /* Name offset */ |
| sizeof (arc_t) /* Name length (set by ill_arp_alloc) */ |
| }; |
| |
| static arc_t ip_ard_template = { |
| AR_INTERFACE_DOWN, |
| sizeof (arc_t), /* Name offset */ |
| sizeof (arc_t) /* Name length (set by ill_arp_alloc) */ |
| }; |
| |
| static arc_t ip_aron_template = { |
| AR_INTERFACE_ON, |
| sizeof (arc_t), /* Name offset */ |
| sizeof (arc_t) /* Name length (set by ill_arp_alloc) */ |
| }; |
| |
| static arc_t ip_aroff_template = { |
| AR_INTERFACE_OFF, |
| sizeof (arc_t), /* Name offset */ |
| sizeof (arc_t) /* Name length (set by ill_arp_alloc) */ |
| }; |
| static arma_t ip_arma_multi_template = { |
| AR_MAPPING_ADD, |
| sizeof (arma_t) + 3*IP_ADDR_LEN + IP_MAX_HW_LEN, |
| /* Name offset */ |
| sizeof (arma_t), /* Name length (set by ill_arp_alloc) */ |
| IP_ARP_PROTO_TYPE, |
| sizeof (arma_t), /* proto_addr_offset */ |
| IP_ADDR_LEN, /* proto_addr_length */ |
| sizeof (arma_t) + IP_ADDR_LEN, /* proto_mask_offset */ |
| sizeof (arma_t) + 2*IP_ADDR_LEN, /* proto_extract_mask_offset */ |
| ACE_F_PERMANENT | ACE_F_MAPPING, /* flags */ |
| sizeof (arma_t) + 3*IP_ADDR_LEN, /* hw_addr_offset */ |
| IP_MAX_HW_LEN, /* hw_addr_length */ |
| 0, /* hw_mapping_start */ |
| }; |
| |
| static ipft_t ip_ioctl_ftbl[] = { |
| { IP_IOC_IRE_DELETE, ip_ire_delete, sizeof (ipid_t), 0 }, |
| { IP_IOC_IRE_DELETE_NO_REPLY, ip_ire_delete, sizeof (ipid_t), |
| IPFT_F_NO_REPLY }, |
| { IP_IOC_IRE_ADVISE_NO_REPLY, ip_ire_advise, sizeof (ipic_t), |
| IPFT_F_NO_REPLY }, |
| { IP_IOC_RTS_REQUEST, ip_rts_request, 0, IPFT_F_SELF_REPLY }, |
| { 0 } |
| }; |
| |
| /* Simple ICMP IP Header Template */ |
| static ipha_t icmp_ipha = { |
| IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP |
| }; |
| |
| /* Flag descriptors for ip_ipif_report */ |
| static nv_t ipif_nv_tbl[] = { |
| { IPIF_UP, "UP" }, |
| { IPIF_BROADCAST, "BROADCAST" }, |
| { ILLF_DEBUG, "DEBUG" }, |
| { PHYI_LOOPBACK, "LOOPBACK" }, |
| { IPIF_POINTOPOINT, "POINTOPOINT" }, |
| { ILLF_NOTRAILERS, "NOTRAILERS" }, |
| { PHYI_RUNNING, "RUNNING" }, |
| { ILLF_NOARP, "NOARP" }, |
| { PHYI_PROMISC, "PROMISC" }, |
| { PHYI_ALLMULTI, "ALLMULTI" }, |
| { PHYI_INTELLIGENT, "INTELLIGENT" }, |
| { ILLF_MULTICAST, "MULTICAST" }, |
| { PHYI_MULTI_BCAST, "MULTI_BCAST" }, |
| { IPIF_UNNUMBERED, "UNNUMBERED" }, |
| { IPIF_DHCPRUNNING, "DHCP" }, |
| { IPIF_PRIVATE, "PRIVATE" }, |
| { IPIF_NOXMIT, "NOXMIT" }, |
| { IPIF_NOLOCAL, "NOLOCAL" }, |
| { IPIF_DEPRECATED, "DEPRECATED" }, |
| { IPIF_PREFERRED, "PREFERRED" }, |
| { IPIF_TEMPORARY, "TEMPORARY" }, |
| { IPIF_ADDRCONF, "ADDRCONF" }, |
| { PHYI_VIRTUAL, "VIRTUAL" }, |
| { ILLF_ROUTER, "ROUTER" }, |
| { ILLF_NONUD, "NONUD" }, |
| { IPIF_ANYCAST, "ANYCAST" }, |
| { ILLF_NORTEXCH, "NORTEXCH" }, |
| { ILLF_IPV4, "IPV4" }, |
| { ILLF_IPV6, "IPV6" }, |
| { IPIF_MIPRUNNING, "MIP" }, |
| { IPIF_NOFAILOVER, "NOFAILOVER" }, |
| { PHYI_FAILED, "FAILED" }, |
| { PHYI_STANDBY, "STANDBY" }, |
| { PHYI_INACTIVE, "INACTIVE" }, |
| { PHYI_OFFLINE, "OFFLINE" }, |
| }; |
| |
| static uchar_t ip_six_byte_all_ones[] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF }; |
| |
| static ip_m_t ip_m_tbl[] = { |
| { DL_ETHER, IFT_ETHER, ip_ether_v4mapinfo, ip_ether_v6mapinfo, |
| ip_ether_v6intfid }, |
| { DL_CSMACD, IFT_ISO88023, ip_ether_v4mapinfo, ip_ether_v6mapinfo, |
| ip_nodef_v6intfid }, |
| { DL_TPB, IFT_ISO88024, ip_ether_v4mapinfo, ip_ether_v6mapinfo, |
| ip_nodef_v6intfid }, |
| { DL_TPR, IFT_ISO88025, ip_ether_v4mapinfo, ip_ether_v6mapinfo, |
| ip_nodef_v6intfid }, |
| { DL_FDDI, IFT_FDDI, ip_ether_v4mapinfo, ip_ether_v6mapinfo, |
| ip_ether_v6intfid }, |
| { DL_IB, IFT_IB, ip_ib_v4mapinfo, ip_ib_v6mapinfo, |
| ip_ib_v6intfid }, |
| { SUNW_DL_VNI, IFT_OTHER, NULL, NULL, NULL}, |
| { DL_OTHER, IFT_OTHER, ip_ether_v4mapinfo, ip_ether_v6mapinfo, |
| ip_nodef_v6intfid } |
| }; |
| |
| static ill_t ill_null; /* Empty ILL for init. */ |
| char ipif_loopback_name[] = "lo0"; |
| static char *ipv4_forward_suffix = ":ip_forwarding"; |
| static char *ipv6_forward_suffix = ":ip6_forwarding"; |
| static kstat_t *loopback_ksp = NULL; |
| static sin6_t sin6_null; /* Zero address for quick clears */ |
| static sin_t sin_null; /* Zero address for quick clears */ |
static uint_t	ill_index = 1;		/* Used to assign interface indices */
| /* When set search for unused index */ |
| static boolean_t ill_index_wrap = B_FALSE; |
| /* When set search for unused ipif_seqid */ |
| static ipif_t ipif_zero; |
| uint_t ipif_src_random; |
| |
| /* |
 * For details on the protection offered by these locks, please refer
 * to the notes under the Synchronization section at the start of ip.c.
| */ |
| krwlock_t ill_g_lock; /* The global ill_g_lock */ |
| kmutex_t ip_addr_avail_lock; /* Address availability check lock */ |
| ipsq_t *ipsq_g_head; /* List of all ipsq's on the system */ |
| |
| krwlock_t ill_g_usesrc_lock; /* Protects usesrc related fields */ |
| |
| /* |
| * illgrp_head/ifgrp_head is protected by IP's perimeter. |
| */ |
| static ill_group_t *illgrp_head_v4; /* Head of IPv4 ill groups */ |
| ill_group_t *illgrp_head_v6; /* Head of IPv6 ill groups */ |
| |
| ill_g_head_t ill_g_heads[MAX_G_HEADS]; /* ILL List Head */ |
| |
| /* |
 * The ppa arena is created after this many
 * interfaces have been plumbed.
| */ |
| uint_t ill_no_arena = 12; |
| |
| #pragma align CACHE_ALIGN_SIZE(phyint_g_list) |
| static phyint_list_t phyint_g_list; /* start of phyint list */ |
| |
| /* |
 * Reflects the value of the FAILBACK variable in the IPMP config file
 * /etc/default/mpathd. The default value is B_TRUE.
 * Set to B_FALSE if the user disabled failback by configuring "FAILBACK=no".
 * in.mpathd uses the SIOCSIPMPFAILBACK ioctl to pass this information to
 * the kernel.
| */ |
| static boolean_t ipmp_enable_failback = B_TRUE; |
| |
| /* |
| * Enable soft rings if ip_squeue_soft_ring or ip_squeue_fanout |
| * is set and ip_soft_rings_cnt > 0. ip_squeue_soft_ring is |
 * set through platform-specific code (Niagara/Ontario).
| */ |
| #define SOFT_RINGS_ENABLED() (ip_soft_rings_cnt ? \ |
| (ip_squeue_soft_ring || ip_squeue_fanout) : B_FALSE) |
| |
| #define ILL_CAPAB_DLS (ILL_CAPAB_SOFT_RING | ILL_CAPAB_POLL) |
| |
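/*
 * ipif_rand() below is a linear congruential generator using the same
 * multiplier/increment as the historical C library rand(); it yields a
 * pseudo-random value in [0, 32767]. For illustration, if ipif_src_random
 * were 1, the first value returned would be 16838 (the familiar first
 * output of rand() seeded with 1).
 */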
| static uint_t |
| ipif_rand(void) |
| { |
| ipif_src_random = ipif_src_random * 1103515245 + 12345; |
| return ((ipif_src_random >> 16) & 0x7fff); |
| } |
| |
| /* |
 * Allocate per-interface mibs. Only used for IPv6.
 * Returns B_TRUE if ok, B_FALSE otherwise.
 * The ipsq may not yet be allocated (loopback case).
| */ |
| static boolean_t |
| ill_allocate_mibs(ill_t *ill) |
| { |
| ASSERT(ill->ill_isv6); |
| |
| /* Already allocated? */ |
| if (ill->ill_ip6_mib != NULL) { |
| ASSERT(ill->ill_icmp6_mib != NULL); |
| return (B_TRUE); |
| } |
| |
| ill->ill_ip6_mib = kmem_zalloc(sizeof (*ill->ill_ip6_mib), |
| KM_NOSLEEP); |
| if (ill->ill_ip6_mib == NULL) { |
| return (B_FALSE); |
| } |
| ill->ill_icmp6_mib = kmem_zalloc(sizeof (*ill->ill_icmp6_mib), |
| KM_NOSLEEP); |
| if (ill->ill_icmp6_mib == NULL) { |
| kmem_free(ill->ill_ip6_mib, sizeof (*ill->ill_ip6_mib)); |
| ill->ill_ip6_mib = NULL; |
| return (B_FALSE); |
| } |
| /* |
| * The ipv6Ifindex and ipv6IfIcmpIndex will be assigned later |
| * after the phyint merge occurs in ipif_set_values -> ill_glist_insert |
	 * -> ill_phyint_reinit.
| */ |
| return (B_TRUE); |
| } |
| |
| /* |
| * Common code for preparation of ARP commands. Two points to remember: |
| * 1) The ill_name is tacked on at the end of the allocated space so |
 *    the template's name_offset field must contain the total space
 *    to allocate, less the name length.
 *
 * 2) The template's name_length field should contain the *template*
| * length. We use it as a parameter to bcopy() and then write |
| * the real ill_name_length into the name_length field of the copy. |
| * (Always called as writer.) |
| */ |
| mblk_t * |
| ill_arp_alloc(ill_t *ill, uchar_t *template, caddr_t addr) |
| { |
| arc_t *arc = (arc_t *)template; |
| char *cp; |
| int len; |
| mblk_t *mp; |
| uint_t name_length = ill->ill_name_length; |
| uint_t template_len = arc->arc_name_length; |
| |
| len = arc->arc_name_offset + name_length; |
| mp = allocb(len, BPRI_HI); |
| if (mp == NULL) |
| return (NULL); |
| cp = (char *)mp->b_rptr; |
| mp->b_wptr = (uchar_t *)&cp[len]; |
| if (template_len) |
| bcopy(template, cp, template_len); |
| if (len > template_len) |
| bzero(&cp[template_len], len - template_len); |
| mp->b_datap->db_type = M_PROTO; |
| |
| arc = (arc_t *)cp; |
| arc->arc_name_length = name_length; |
| cp = (char *)arc + arc->arc_name_offset; |
| bcopy(ill->ill_name, cp, name_length); |
| |
| if (addr) { |
| area_t *area = (area_t *)mp->b_rptr; |
| |
| cp = (char *)area + area->area_proto_addr_offset; |
| bcopy(addr, cp, area->area_proto_addr_length); |
| if (area->area_cmd == AR_ENTRY_ADD) { |
| cp = (char *)area; |
| len = area->area_proto_addr_length; |
| if (area->area_proto_mask_offset) |
| cp += area->area_proto_mask_offset; |
| else |
| cp += area->area_proto_addr_offset + len; |
| while (len-- > 0) |
| *cp++ = (char)~0; |
| } |
| } |
| return (mp); |
| } |
| |
| /* |
| * Completely vaporize a lower level tap and all associated interfaces. |
| * ill_delete is called only out of ip_close when the device control |
| * stream is being closed. |
| */ |
| void |
| ill_delete(ill_t *ill) |
| { |
| ipif_t *ipif; |
| ill_t *prev_ill; |
| |
| /* |
| * ill_delete may be forcibly entering the ipsq. The previous |
| * ioctl may not have completed and may need to be aborted. |
	 * ipsq_flush takes care of it. If we don't need to enter
	 * the ipsq forcibly, the 2nd invocation of ipsq_flush in
| * ill_delete_tail is sufficient. |
| */ |
| ipsq_flush(ill); |
| |
| /* |
| * Nuke all interfaces. ipif_free will take down the interface, |
| * remove it from the list, and free the data structure. |
| * Walk down the ipif list and remove the logical interfaces |
| * first before removing the main ipif. We can't unplumb |
| * zeroth interface first in the case of IPv6 as reset_conn_ill |
| * -> ip_ll_delmulti_v6 de-references ill_ipif for checking |
| * POINTOPOINT. |
| * |
	 * If ill_ipif was not properly initialized (i.e., low on memory),
	 * then there are no interfaces to clean up. In this case just
	 * clean up the ill.
| */ |
| for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) |
| ipif_free(ipif); |
| |
| /* |
| * Used only by ill_arp_on and ill_arp_off, which are writers. |
	 * So nobody can be using this mp now. Free the mp allocated for
	 * honoring ILLF_NOARP.
| */ |
| freemsg(ill->ill_arp_on_mp); |
| ill->ill_arp_on_mp = NULL; |
| |
| /* Clean up msgs on pending upcalls for mrouted */ |
| reset_mrt_ill(ill); |
| |
| /* |
| * ipif_free -> reset_conn_ipif will remove all multicast |
| * references for IPv4. For IPv6, we need to do it here as |
| * it points only at ills. |
| */ |
| reset_conn_ill(ill); |
| |
| /* |
| * ill_down will arrange to blow off any IRE's dependent on this |
| * ILL, and shut down fragmentation reassembly. |
| */ |
| ill_down(ill); |
| |
| /* Let SCTP know, so that it can remove this from its list. */ |
| sctp_update_ill(ill, SCTP_ILL_REMOVE); |
| |
| /* |
| * If an address on this ILL is being used as a source address then |
| * clear out the pointers in other ILLs that point to this ILL. |
| */ |
| rw_enter(&ill_g_usesrc_lock, RW_WRITER); |
| if (ill->ill_usesrc_grp_next != NULL) { |
| if (ill->ill_usesrc_ifindex == 0) { /* usesrc ILL ? */ |
| ill_disband_usesrc_group(ill); |
| } else { /* consumer of the usesrc ILL */ |
| prev_ill = ill_prev_usesrc(ill); |
| prev_ill->ill_usesrc_grp_next = |
| ill->ill_usesrc_grp_next; |
| } |
| } |
| rw_exit(&ill_g_usesrc_lock); |
| } |
| |
| /* |
| * ill_delete_tail is called from ip_modclose after all references |
 * to the closing ill are gone. The wait is done in ip_modclose.
| */ |
| void |
| ill_delete_tail(ill_t *ill) |
| { |
| mblk_t **mpp; |
| ipif_t *ipif; |
| |
| for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) |
| ipif_down_tail(ipif); |
| |
| /* |
	 * If polling capability is enabled (which signifies a direct
	 * upcall into IP and that the driver has the ill saved as a
	 * handle), we need to make sure that the unbind has completed
	 * before we let the ill disappear and the driver no longer has
	 * any reference to this ill.
| */ |
| mutex_enter(&ill->ill_lock); |
| while (ill->ill_state_flags & ILL_DL_UNBIND_IN_PROGRESS) |
| cv_wait(&ill->ill_cv, &ill->ill_lock); |
| mutex_exit(&ill->ill_lock); |
| |
| /* |
| * Clean up polling and soft ring capabilities |
| */ |
| if (ill->ill_capabilities & (ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING)) |
| ill_capability_dls_disable(ill); |
| |
| /* |
| * Send the detach if there's one to send (i.e., if we're above a |
| * style 2 DLPI driver). |
| */ |
| if (ill->ill_detach_mp != NULL) { |
| ill_dlpi_send(ill, ill->ill_detach_mp); |
| ill->ill_detach_mp = NULL; |
| } |
| |
| if (ill->ill_net_type != IRE_LOOPBACK) |
| qprocsoff(ill->ill_rq); |
| |
| /* |
	 * We do an ipsq_flush once again now. New messages could have
	 * arrived from below (M_ERROR or M_HANGUP). Similarly, ioctls
	 * could also have arrived if an ioctl thread had looked up
	 * the ill before we set the ILL_CONDEMNED flag, but had not yet
	 * enqueued the ioctl when we did the ipsq_flush last time.
| */ |
| ipsq_flush(ill); |
| |
| /* |
| * Free capabilities. |
| */ |
| if (ill->ill_ipsec_capab_ah != NULL) { |
| ill_ipsec_capab_delete(ill, DL_CAPAB_IPSEC_AH); |
| ill_ipsec_capab_free(ill->ill_ipsec_capab_ah); |
| ill->ill_ipsec_capab_ah = NULL; |
| } |
| |
| if (ill->ill_ipsec_capab_esp != NULL) { |
| ill_ipsec_capab_delete(ill, DL_CAPAB_IPSEC_ESP); |
| ill_ipsec_capab_free(ill->ill_ipsec_capab_esp); |
| ill->ill_ipsec_capab_esp = NULL; |
| } |
| |
| if (ill->ill_mdt_capab != NULL) { |
| kmem_free(ill->ill_mdt_capab, sizeof (ill_mdt_capab_t)); |
| ill->ill_mdt_capab = NULL; |
| } |
| |
| if (ill->ill_hcksum_capab != NULL) { |
| kmem_free(ill->ill_hcksum_capab, sizeof (ill_hcksum_capab_t)); |
| ill->ill_hcksum_capab = NULL; |
| } |
| |
| if (ill->ill_zerocopy_capab != NULL) { |
| kmem_free(ill->ill_zerocopy_capab, |
| sizeof (ill_zerocopy_capab_t)); |
| ill->ill_zerocopy_capab = NULL; |
| } |
| |
| if (ill->ill_dls_capab != NULL) { |
| CONN_DEC_REF(ill->ill_dls_capab->ill_unbind_conn); |
| ill->ill_dls_capab->ill_unbind_conn = NULL; |
| kmem_free(ill->ill_dls_capab, |
| sizeof (ill_dls_capab_t) + |
| (sizeof (ill_rx_ring_t) * ILL_MAX_RINGS)); |
| ill->ill_dls_capab = NULL; |
| } |
| |
| ASSERT(!(ill->ill_capabilities & ILL_CAPAB_POLL)); |
| |
| while (ill->ill_ipif != NULL) |
| ipif_free_tail(ill->ill_ipif); |
| |
| ill_down_tail(ill); |
| |
| /* |
| * We have removed all references to ilm from conn and the ones joined |
| * within the kernel. |
| * |
| * We don't walk conns, mrts and ires because |
| * |
	 * 1) reset_conn_ill and reset_mrt_ill clean up conns and mrts.
	 * 2) ill_down -> ill_downi walks all the ires and cleans up
	 *    ill references.
| */ |
| ASSERT(ilm_walk_ill(ill) == 0); |
| /* |
| * Take us out of the list of ILLs. ill_glist_delete -> ill_phyint_free |
| * could free the phyint. No more reference to the phyint after this |
| * point. |
| */ |
| (void) ill_glist_delete(ill); |
| |
| rw_enter(&ip_g_nd_lock, RW_WRITER); |
| if (ill->ill_ndd_name != NULL) |
| nd_unload(&ip_g_nd, ill->ill_ndd_name); |
| rw_exit(&ip_g_nd_lock); |
| if (ill->ill_frag_ptr != NULL) { |
| uint_t count; |
| |
| for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) { |
| mutex_destroy(&ill->ill_frag_hash_tbl[count].ipfb_lock); |
| } |
| mi_free(ill->ill_frag_ptr); |
| ill->ill_frag_ptr = NULL; |
| ill->ill_frag_hash_tbl = NULL; |
| } |
| if (ill->ill_nd_lla_mp != NULL) |
| freemsg(ill->ill_nd_lla_mp); |
| /* Free all retained control messages. */ |
| mpp = &ill->ill_first_mp_to_free; |
| do { |
| while (mpp[0]) { |
| mblk_t *mp; |
| mblk_t *mp1; |
| |
| mp = mpp[0]; |
| mpp[0] = mp->b_next; |
| for (mp1 = mp; mp1 != NULL; mp1 = mp1->b_cont) { |
| mp1->b_next = NULL; |
| mp1->b_prev = NULL; |
| } |
| freemsg(mp); |
| } |
| } while (mpp++ != &ill->ill_last_mp_to_free); |
| |
| ill_free_mib(ill); |
| ILL_TRACE_CLEANUP(ill); |
| } |
| |
| static void |
| ill_free_mib(ill_t *ill) |
| { |
| if (ill->ill_ip6_mib != NULL) { |
| kmem_free(ill->ill_ip6_mib, sizeof (*ill->ill_ip6_mib)); |
| ill->ill_ip6_mib = NULL; |
| } |
| if (ill->ill_icmp6_mib != NULL) { |
| kmem_free(ill->ill_icmp6_mib, sizeof (*ill->ill_icmp6_mib)); |
| ill->ill_icmp6_mib = NULL; |
| } |
| } |
| |
| /* |
| * Concatenate together a physical address and a sap. |
| * |
| * Sap_lengths are interpreted as follows: |
| * sap_length == 0 ==> no sap |
| * sap_length > 0 ==> sap is at the head of the dlpi address |
| * sap_length < 0 ==> sap is at the tail of the dlpi address |
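 *
 * For example, Ethernet-style DLPI providers typically report
 * sap_length == -2, so the 2-byte SAP (the ethertype) follows the
 * 6-byte physical address in the DLPI address.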
| */ |
| static void |
| ill_dlur_copy_address(uchar_t *phys_src, uint_t phys_length, |
| t_scalar_t sap_src, t_scalar_t sap_length, uchar_t *dst) |
| { |
| uint16_t sap_addr = (uint16_t)sap_src; |
| |
| if (sap_length == 0) { |
| if (phys_src == NULL) |
| bzero(dst, phys_length); |
| else |
| bcopy(phys_src, dst, phys_length); |
| } else if (sap_length < 0) { |
| if (phys_src == NULL) |
| bzero(dst, phys_length); |
| else |
| bcopy(phys_src, dst, phys_length); |
| bcopy(&sap_addr, (char *)dst + phys_length, sizeof (sap_addr)); |
| } else { |
| bcopy(&sap_addr, dst, sizeof (sap_addr)); |
| if (phys_src == NULL) |
| bzero((char *)dst + sap_length, phys_length); |
| else |
| bcopy(phys_src, (char *)dst + sap_length, phys_length); |
| } |
| } |
| |
| /* |
| * Generate a dl_unitdata_req mblk for the device and address given. |
| * addr_length is the length of the physical portion of the address. |
 * If addr is NULL, include an all-zero address of the specified length.
 * The total length of the dlpi address, dl_dest_addr_length, is
 * addr_length plus the absolute value of sap_length.
| */ |
| mblk_t * |
| ill_dlur_gen(uchar_t *addr, uint_t addr_length, t_uscalar_t sap, |
| t_scalar_t sap_length) |
| { |
| dl_unitdata_req_t *dlur; |
| mblk_t *mp; |
| t_scalar_t abs_sap_length; /* absolute value */ |
| |
| abs_sap_length = ABS(sap_length); |
| mp = ip_dlpi_alloc(sizeof (*dlur) + addr_length + abs_sap_length, |
| DL_UNITDATA_REQ); |
| if (mp == NULL) |
| return (NULL); |
| dlur = (dl_unitdata_req_t *)mp->b_rptr; |
	/* HACK: accommodate incompatible DLPI drivers */
| if (addr_length == 8) |
| addr_length = 6; |
| dlur->dl_dest_addr_length = addr_length + abs_sap_length; |
| dlur->dl_dest_addr_offset = sizeof (*dlur); |
| dlur->dl_priority.dl_min = 0; |
| dlur->dl_priority.dl_max = 0; |
| ill_dlur_copy_address(addr, addr_length, sap, sap_length, |
| (uchar_t *)&dlur[1]); |
| return (mp); |
| } |
| |
| /* |
 * Add the 'mp' to the list of pending mp's headed by ill_pending_mp.
| * Return an error if we already have 1 or more ioctls in progress. |
| * This is used only for non-exclusive ioctls. Currently this is used |
| * for SIOC*ARP and SIOCGTUNPARAM ioctls. Most set ioctls are exclusive |
| * and thus need to use ipsq_pending_mp_add. |
| */ |
| boolean_t |
| ill_pending_mp_add(ill_t *ill, conn_t *connp, mblk_t *add_mp) |
| { |
| ASSERT(MUTEX_HELD(&ill->ill_lock)); |
| ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL)); |
| /* |
| * M_IOCDATA from ioctls, M_IOCTL from tunnel ioctls. |
| */ |
| ASSERT((add_mp->b_datap->db_type == M_IOCDATA) || |
| (add_mp->b_datap->db_type == M_IOCTL)); |
| |
| ASSERT(MUTEX_HELD(&connp->conn_lock)); |
| /* |
	 * Return an error if the conn has started closing. The conn
	 * could have finished cleaning up the pending mp list.
	 * If so, we should not add another mp to the list, negating
	 * the cleanup.
| */ |
| if (connp->conn_state_flags & CONN_CLOSING) |
| return (B_FALSE); |
| /* |
	 * Add the pending mp to the head of the list, chained by b_next.
	 * Note down the conn's write queue in b_queue. This will be used
	 * to later get the conn, when we get a response on the ill queue,
	 * from some other module (typically arp).
| */ |
| add_mp->b_next = (void *)ill->ill_pending_mp; |
| add_mp->b_queue = CONNP_TO_WQ(connp); |
| ill->ill_pending_mp = add_mp; |
| if (connp != NULL) |
| connp->conn_oper_pending_ill = ill; |
| return (B_TRUE); |
| } |
| |
| /* |
| * Retrieve the ill_pending_mp and return it. We have to walk the list |
| * of mblks starting at ill_pending_mp, and match based on the ioc_id. |
| */ |
| mblk_t * |
| ill_pending_mp_get(ill_t *ill, conn_t **connpp, uint_t ioc_id) |
| { |
| mblk_t *prev = NULL; |
| mblk_t *curr = NULL; |
| uint_t id; |
| conn_t *connp; |
| |
| /* |
| * When the conn closes, conn_ioctl_cleanup needs to clean |
| * up the pending mp, but it does not know the ioc_id and |
| * passes in a zero for it. |
| */ |
| mutex_enter(&ill->ill_lock); |
| if (ioc_id != 0) |
| *connpp = NULL; |
| |
| /* Search the list for the appropriate ioctl based on ioc_id */ |
| for (prev = NULL, curr = ill->ill_pending_mp; curr != NULL; |
| prev = curr, curr = curr->b_next) { |
| id = ((struct iocblk *)curr->b_rptr)->ioc_id; |
| connp = Q_TO_CONN(curr->b_queue); |
| /* Match based on the ioc_id or based on the conn */ |
| if ((id == ioc_id) || (ioc_id == 0 && connp == *connpp)) |
| break; |
| } |
| |
| if (curr != NULL) { |
| /* Unlink the mblk from the pending mp list */ |
| if (prev != NULL) { |
| prev->b_next = curr->b_next; |
| } else { |
| ASSERT(ill->ill_pending_mp == curr); |
| ill->ill_pending_mp = curr->b_next; |
| } |
| |
| /* |
| * conn refcnt must have been bumped up at the start of |
| * the ioctl. So we can safely access the conn. |
| */ |
| ASSERT(CONN_Q(curr->b_queue)); |
| *connpp = Q_TO_CONN(curr->b_queue); |
| curr->b_next = NULL; |
| curr->b_queue = NULL; |
| } |
| |
| mutex_exit(&ill->ill_lock); |
| |
| return (curr); |
| } |
| |
| /* |
| * Add the pending mp to the list. There can be only 1 pending mp |
| * in the list. Any exclusive ioctl that needs to wait for a response |
| * from another module or driver needs to use this function to set |
| * the ipsq_pending_mp to the ioctl mblk and wait for the response from |
| * the other module/driver. This is also used while waiting for the |
| * ipif/ill/ire refcnts to drop to zero in bringing down an ipif. |
| */ |
| boolean_t |
| ipsq_pending_mp_add(conn_t *connp, ipif_t *ipif, queue_t *q, mblk_t *add_mp, |
| int waitfor) |
| { |
| ipsq_t *ipsq; |
| |
| ASSERT(IAM_WRITER_IPIF(ipif)); |
| ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); |
| ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL)); |
| /* |
| * M_IOCDATA from ioctls, M_IOCTL from tunnel ioctls, |
| * M_ERROR/M_HANGUP from driver |
| */ |
| ASSERT((DB_TYPE(add_mp) == M_IOCDATA) || (DB_TYPE(add_mp) == M_IOCTL) || |
| (DB_TYPE(add_mp) == M_ERROR) || (DB_TYPE(add_mp) == M_HANGUP)); |
| |
| ipsq = ipif->ipif_ill->ill_phyint->phyint_ipsq; |
| if (connp != NULL) { |
| ASSERT(MUTEX_HELD(&connp->conn_lock)); |
| /* |
| * Return error if the conn has started closing. The conn |
| * could have finished cleaning up the pending mp list, |
| * If so we should not add another mp to the list negating |
| * the cleanup. |
| */ |
| if (connp->conn_state_flags & CONN_CLOSING) |
| return (B_FALSE); |
| } |
| mutex_enter(&ipsq->ipsq_lock); |
| ipsq->ipsq_pending_ipif = ipif; |
| /* |
| * Note down the queue in b_queue. This will be returned by |
	 * ipsq_pending_mp_get. The caller will then use these values to
	 * restart the processing.
| */ |
| add_mp->b_next = NULL; |
| add_mp->b_queue = q; |
| ipsq->ipsq_pending_mp = add_mp; |
| ipsq->ipsq_waitfor = waitfor; |
| /* |
	 * ipsq_current_ipif is needed to restart the operation from
	 * ipif_ill_refrele_tail when the last reference to the ipif/ill
	 * is gone. Since this is not an ioctl, ipsq_current_ipif has not
	 * been set until now.
| */ |
| if (DB_TYPE(add_mp) == M_ERROR || DB_TYPE(add_mp) == M_HANGUP) { |
| ASSERT(ipsq->ipsq_current_ipif == NULL); |
| ipsq->ipsq_current_ipif = ipif; |
| ipsq->ipsq_last_cmd = DB_TYPE(add_mp); |
| } |
| if (connp != NULL) |
| connp->conn_oper_pending_ill = ipif->ipif_ill; |
| mutex_exit(&ipsq->ipsq_lock); |
| return (B_TRUE); |
| } |
| |
| /* |
| * Retrieve the ipsq_pending_mp and return it. There can be only 1 mp |
| * queued in the list. |
| */ |
| mblk_t * |
| ipsq_pending_mp_get(ipsq_t *ipsq, conn_t **connpp) |
| { |
| mblk_t *curr = NULL; |
| |
| mutex_enter(&ipsq->ipsq_lock); |
| *connpp = NULL; |
| if (ipsq->ipsq_pending_mp == NULL) { |
| mutex_exit(&ipsq->ipsq_lock); |
| return (NULL); |
| } |
| |
| /* There can be only 1 such excl message */ |
| curr = ipsq->ipsq_pending_mp; |
| ASSERT(curr != NULL && curr->b_next == NULL); |
| ipsq->ipsq_pending_ipif = NULL; |
| ipsq->ipsq_pending_mp = NULL; |
| ipsq->ipsq_waitfor = 0; |
| mutex_exit(&ipsq->ipsq_lock); |
| |
| if (CONN_Q(curr->b_queue)) { |
| /* |
| * This mp did a refhold on the conn, at the start of the ioctl. |
| * So we can safely return a pointer to the conn to the caller. |
| */ |
| *connpp = Q_TO_CONN(curr->b_queue); |
| } else { |
| *connpp = NULL; |
| } |
| curr->b_next = NULL; |
| curr->b_prev = NULL; |
| return (curr); |
| } |
| |
| /* |
 * Clean up the ioctl mp queued in ipsq_pending_mp:
| * - Called in the ill_delete path |
| * - Called in the M_ERROR or M_HANGUP path on the ill. |
| * - Called in the conn close path. |
| */ |
| boolean_t |
| ipsq_pending_mp_cleanup(ill_t *ill, conn_t *connp) |
| { |
| mblk_t *mp; |
| ipsq_t *ipsq; |
| queue_t *q; |
| ipif_t *ipif; |
| |
| ASSERT(IAM_WRITER_ILL(ill)); |
| ipsq = ill->ill_phyint->phyint_ipsq; |
| mutex_enter(&ipsq->ipsq_lock); |
| /* |
| * If connp is null, unconditionally clean up the ipsq_pending_mp. |
| * This happens in M_ERROR/M_HANGUP. We need to abort the current ioctl |
| * even if it is meant for another ill, since we have to enqueue |
| * a new mp now in ipsq_pending_mp to complete the ipif_down. |
	 * If connp is non-null, we are called from the conn close path.
| */ |
| mp = ipsq->ipsq_pending_mp; |
| if (mp == NULL || (connp != NULL && |
| mp->b_queue != CONNP_TO_WQ(connp))) { |
| mutex_exit(&ipsq->ipsq_lock); |
| return (B_FALSE); |
| } |
| /* Now remove from the ipsq_pending_mp */ |
| ipsq->ipsq_pending_mp = NULL; |
| q = mp->b_queue; |
| mp->b_next = NULL; |
| mp->b_prev = NULL; |
| mp->b_queue = NULL; |
| |
| /* If MOVE was in progress, clear the move_in_progress fields also. */ |
| ill = ipsq->ipsq_pending_ipif->ipif_ill; |
| if (ill->ill_move_in_progress) { |
| ILL_CLEAR_MOVE(ill); |
| } else if (ill->ill_up_ipifs) { |
| ill_group_cleanup(ill); |
| } |
| |
| ipif = ipsq->ipsq_pending_ipif; |
| ipsq->ipsq_pending_ipif = NULL; |
| ipsq->ipsq_waitfor = 0; |
| ipsq->ipsq_current_ipif = NULL; |
| mutex_exit(&ipsq->ipsq_lock); |
| |
| if (DB_TYPE(mp) == M_IOCTL || DB_TYPE(mp) == M_IOCDATA) { |
| ip_ioctl_finish(q, mp, ENXIO, connp != NULL ? CONN_CLOSE : |
| NO_COPYOUT, connp != NULL ? ipif : NULL, NULL); |
| } else { |
| /* |
		 * IP-MT XXX In the case of TLI/XTI bind / optmgmt this
		 * can't be just inet_freemsg; we have to restart it,
		 * otherwise the thread will be stuck.
| */ |
| inet_freemsg(mp); |
| } |
| return (B_TRUE); |
| } |
| |
| /* |
 * The ill is closing. Clean up all the pending mps. Called exclusively
 * towards the end of ill_delete. The refcount has gone to 0. So nobody
 * knows this ill, and hence nobody can add an mp to this list.
| */ |
| static void |
| ill_pending_mp_cleanup(ill_t *ill) |
| { |
| mblk_t *mp; |
| queue_t *q; |
| |
| ASSERT(IAM_WRITER_ILL(ill)); |
| |
| mutex_enter(&ill->ill_lock); |
| /* |
| * Every mp on the pending mp list originating from an ioctl |
| * added 1 to the conn refcnt, at the start of the ioctl. |
	 * So bump it down now. See comments in ip_wput_nondata().
| */ |
| while (ill->ill_pending_mp != NULL) { |
| mp = ill->ill_pending_mp; |
| ill->ill_pending_mp = mp->b_next; |
| mutex_exit(&ill->ill_lock); |
| |
| q = mp->b_queue; |
| ASSERT(CONN_Q(q)); |
| mp->b_next = NULL; |
| mp->b_prev = NULL; |
| mp->b_queue = NULL; |
| ip_ioctl_finish(q, mp, ENXIO, NO_COPYOUT, NULL, NULL); |
| mutex_enter(&ill->ill_lock); |
| } |
| ill->ill_pending_ipif = NULL; |
| |
| mutex_exit(&ill->ill_lock); |
| } |
| |
| /* |
| * Called in the conn close path and ill delete path |
| */ |
| static void |
| ipsq_xopq_mp_cleanup(ill_t *ill, conn_t *connp) |
| { |
| ipsq_t *ipsq; |
| mblk_t *prev; |
| mblk_t *curr; |
| mblk_t *next; |
| queue_t *q; |
| mblk_t *tmp_list = NULL; |
| |
| ASSERT(IAM_WRITER_ILL(ill)); |
| if (connp != NULL) |
| q = CONNP_TO_WQ(connp); |
| else |
| q = ill->ill_wq; |
| |
| ipsq = ill->ill_phyint->phyint_ipsq; |
| /* |
	 * Clean up the ioctl mp's queued in ipsq_xopq_mphead, if any.
	 * In the case of an ioctl from a conn, there can be only 1 mp
	 * queued on the ipsq. If an ill is being unplumbed, only messages
	 * related to this ill are flushed, such as M_ERROR or M_HANGUP
	 * messages. ioctls meant for this ill from conns are not flushed;
	 * they will be processed during ipsq_exit, will not find the ill,
	 * and will return an error.
| */ |
| mutex_enter(&ipsq->ipsq_lock); |
| for (prev = NULL, curr = ipsq->ipsq_xopq_mphead; curr != NULL; |
| curr = next) { |
| next = curr->b_next; |
| if (curr->b_queue == q || curr->b_queue == RD(q)) { |
| /* Unlink the mblk from the pending mp list */ |
| if (prev != NULL) { |
| prev->b_next = curr->b_next; |
| } else { |
| ASSERT(ipsq->ipsq_xopq_mphead == curr); |
| ipsq->ipsq_xopq_mphead = curr->b_next; |
| } |
| if (ipsq->ipsq_xopq_mptail == curr) |
| ipsq->ipsq_xopq_mptail = prev; |
| /* |
			 * Create a temporary list and release the ipsq lock.
			 * New elements are added to the head of the tmp_list.
| */ |
| curr->b_next = tmp_list; |
| tmp_list = curr; |
| } else { |
| prev = curr; |
| } |
| } |
| mutex_exit(&ipsq->ipsq_lock); |
| |
| while (tmp_list != NULL) { |
| curr = tmp_list; |
| tmp_list = curr->b_next; |
| curr->b_next = NULL; |
| curr->b_prev = NULL; |
| curr->b_queue = NULL; |
| if (DB_TYPE(curr) == M_IOCTL || DB_TYPE(curr) == M_IOCDATA) { |
| ip_ioctl_finish(q, curr, ENXIO, connp != NULL ? |
| CONN_CLOSE : NO_COPYOUT, NULL, NULL); |
| } else { |
| /* |
			 * IP-MT XXX In the case of TLI/XTI bind / optmgmt
			 * this can't be just inet_freemsg; we have to
			 * restart it, otherwise the thread will be stuck.
| */ |
| inet_freemsg(curr); |
| } |
| } |
| } |
| |
| /* |
| * This conn has started closing. Cleanup any pending ioctl from this conn. |
| * STREAMS ensures that there can be at most 1 ioctl pending on a stream. |
| */ |
| void |
| conn_ioctl_cleanup(conn_t *connp) |
| { |
| mblk_t *curr; |
| ipsq_t *ipsq; |
| ill_t *ill; |
| boolean_t refheld; |
| |
| /* |
	 * Is any exclusive ioctl pending? If so, clean it up. If the
	 * ioctl has not yet started, the mp is pending in the list headed
	 * by ipsq_xopq_mphead. If the ioctl has started, the mp could be
	 * present in ipsq_pending_mp. If the ioctl timed out in the
	 * streamhead but is currently executing, the mp is not queued
	 * anywhere, but conn_oper_pending_ill is null. The conn close
	 * will wait till the conn_ref drops to zero.
| */ |
| mutex_enter(&connp->conn_lock); |
| ill = connp->conn_oper_pending_ill; |
| if (ill == NULL) { |
| mutex_exit(&connp->conn_lock); |
| return; |
| } |
| |
| curr = ill_pending_mp_get(ill, &connp, 0); |
| if (curr != NULL) { |
| mutex_exit(&connp->conn_lock); |
| CONN_DEC_REF(connp); |
| inet_freemsg(curr); |
| return; |
| } |
| /* |
| * We may not be able to refhold the ill if the ill/ipif |
| * is changing. But we need to make sure that the ill will |
| * not vanish. So we just bump up the ill_waiter count. |
| */ |
| refheld = ill_waiter_inc(ill); |
| mutex_exit(&connp->conn_lock); |
| if (refheld) { |
| if (ipsq_enter(ill, B_TRUE)) { |
| ill_waiter_dcr(ill); |
| /* |
| * Check whether this ioctl has started and is |
| * pending now in ipsq_pending_mp. If it is not |
| * found there then check whether this ioctl has |
| * not even started and is in the ipsq_xopq list. |
| */ |
| if (!ipsq_pending_mp_cleanup(ill, connp)) |
| ipsq_xopq_mp_cleanup(ill, connp); |
| ipsq = ill->ill_phyint->phyint_ipsq; |
| ipsq_exit(ipsq, B_TRUE, B_TRUE); |
| return; |
| } |
| } |
| |
| /* |
| * The ill is also closing and we could not bump up the |
| * ill_waiter_count or we could not enter the ipsq. Leave |
	 * the cleanup to ill_delete.
| */ |
| mutex_enter(&connp->conn_lock); |
| while (connp->conn_oper_pending_ill != NULL) |
| cv_wait(&connp->conn_refcv, &connp->conn_lock); |
| mutex_exit(&connp->conn_lock); |
| if (refheld) |
| ill_waiter_dcr(ill); |
| } |
| |
| /* |
| * ipcl_walk function for cleaning up conn_*_ill fields. |
| */ |
| static void |
| conn_cleanup_ill(conn_t *connp, caddr_t arg) |
| { |
| ill_t *ill = (ill_t *)arg; |
| ire_t *ire; |
| |
| mutex_enter(&connp->conn_lock); |
| if (connp->conn_multicast_ill == ill) { |
| /* Revert to late binding */ |
| connp->conn_multicast_ill = NULL; |
| connp->conn_orig_multicast_ifindex = 0; |
| } |
| if (connp->conn_incoming_ill == ill) |
| connp->conn_incoming_ill = NULL; |
| if (connp->conn_outgoing_ill == ill) |
| connp->conn_outgoing_ill = NULL; |
| if (connp->conn_outgoing_pill == ill) |
| connp->conn_outgoing_pill = NULL; |
| if (connp->conn_nofailover_ill == ill) |
| connp->conn_nofailover_ill = NULL; |
| if (connp->conn_xmit_if_ill == ill) |
| connp->conn_xmit_if_ill = NULL; |
| if (connp->conn_ire_cache != NULL) { |
| ire = connp->conn_ire_cache; |
| /* |
| * ip_newroute creates IRE_CACHE with ire_stq coming from |
		 * interface X and ipif coming from interface Y, if interfaces
		 * X and Y are part of the same IPMP group. Thus whenever
| * interface X goes down, remove all references to it by |
| * checking both on ire_ipif and ire_stq. |
| */ |
| if ((ire->ire_ipif != NULL && ire->ire_ipif->ipif_ill == ill) || |
| (ire->ire_type == IRE_CACHE && |
| ire->ire_stq == ill->ill_wq)) { |
| connp->conn_ire_cache = NULL; |
| mutex_exit(&connp->conn_lock); |
| ire_refrele_notr(ire); |
| return; |
| } |
| } |
| mutex_exit(&connp->conn_lock); |
}
| |
| /* ARGSUSED */ |
| void |
| ipif_all_down_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) |
| { |
| ill_t *ill = q->q_ptr; |
| ipif_t *ipif; |
| |
| ASSERT(IAM_WRITER_IPSQ(ipsq)); |
| for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) |
| ipif_down_tail(ipif); |
| ill_down_tail(ill); |
| freemsg(mp); |
| ipsq->ipsq_current_ipif = NULL; |
| } |
| |
| /* |
 * ill_down_start is called when we want to down this ill and bring it up
 * again. It is called when we receive an M_ERROR / M_HANGUP. In this case
 * we shut down all interfaces, but don't tear down any plumbing.
| */ |
| boolean_t |
| ill_down_start(queue_t *q, mblk_t *mp) |
| { |
| ill_t *ill; |
| ipif_t *ipif; |
| |
| ill = q->q_ptr; |
| |
| ASSERT(IAM_WRITER_ILL(ill)); |
| |
| for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) |
| (void) ipif_down(ipif, NULL, NULL); |
| |
| ill_down(ill); |
| |
| (void) ipsq_pending_mp_cleanup(ill, NULL); |
| mutex_enter(&ill->ill_lock); |
| /* |
| * Atomically test and add the pending mp if references are |
| * still active. |
| */ |
| if (!ill_is_quiescent(ill)) { |
| /* |
		 * Get rid of any pending mps and clean up. The call will
		 * not fail since we are passing a null connp.
| */ |
| (void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq, |
| mp, ILL_DOWN); |
| mutex_exit(&ill->ill_lock); |
| return (B_FALSE); |
| } |
| mutex_exit(&ill->ill_lock); |
| return (B_TRUE); |
| } |
| |
| static void |
| ill_down(ill_t *ill) |
| { |
| /* Blow off any IREs dependent on this ILL. */ |
| ire_walk(ill_downi, (char *)ill); |
| |
| mutex_enter(&ire_mrtun_lock); |
| if (ire_mrtun_count != 0) { |
| mutex_exit(&ire_mrtun_lock); |
| ire_walk_ill_mrtun(0, 0, ill_downi_mrtun_srcif, |
| (char *)ill, NULL); |
| } else { |
| mutex_exit(&ire_mrtun_lock); |
| } |
| |
| /* |
	 * If any interface-based forwarding table exists,
	 * blow off the ires there that depend on this ill.
| */ |
| mutex_enter(&ire_srcif_table_lock); |
| if (ire_srcif_table_count > 0) { |
| mutex_exit(&ire_srcif_table_lock); |
| ire_walk_srcif_table_v4(ill_downi_mrtun_srcif, (char *)ill); |
| } else { |
| mutex_exit(&ire_srcif_table_lock); |
| } |
| |
| /* Remove any conn_*_ill depending on this ill */ |
| ipcl_walk(conn_cleanup_ill, (caddr_t)ill); |
| |
| if (ill->ill_group != NULL) { |
| illgrp_delete(ill); |
| } |
}
| |
| static void |
| ill_down_tail(ill_t *ill) |
| { |
| int i; |
| |
| /* Destroy ill_srcif_table if it exists */ |
	/* Lock not really required because nobody should be able to access */
| mutex_enter(&ill->ill_lock); |
| if (ill->ill_srcif_table != NULL) { |
| ill->ill_srcif_refcnt = 0; |
| for (i = 0; i < IP_SRCIF_TABLE_SIZE; i++) { |
| rw_destroy(&ill->ill_srcif_table[i].irb_lock); |
| } |
| kmem_free(ill->ill_srcif_table, |
| IP_SRCIF_TABLE_SIZE * sizeof (irb_t)); |
| ill->ill_srcif_table = NULL; |
| ill->ill_srcif_refcnt = 0; |
| ill->ill_mrtun_refcnt = 0; |
| } |
| mutex_exit(&ill->ill_lock); |
| } |
| |
| /* |
| * ire_walk routine used to delete every IRE that depends on queues |
| * associated with 'ill'. (Always called as writer.) |
| */ |
| static void |
| ill_downi(ire_t *ire, char *ill_arg) |
| { |
| ill_t *ill = (ill_t *)ill_arg; |
| |
| /* |
| * ip_newroute creates IRE_CACHE with ire_stq coming from |
	 * interface X and ipif coming from interface Y, if interfaces
| * X and Y are part of the same IPMP group. Thus whenever interface |
| * X goes down, remove all references to it by checking both |
| * on ire_ipif and ire_stq. |
| */ |
| if ((ire->ire_ipif != NULL && ire->ire_ipif->ipif_ill == ill) || |
| (ire->ire_type == IRE_CACHE && ire->ire_stq == ill->ill_wq)) { |
| ire_delete(ire); |
| } |
| } |
| |
| /* |
 * A separate routine for deleting revtun and srcif-based routes
 * is needed because the ires are only deleted when the interface
 * is unplumbed. These ires also have ire_in_ill non-null.
 * We want to keep mobile IP specific code separate.
| */ |
| static void |
| ill_downi_mrtun_srcif(ire_t *ire, char *ill_arg) |
| { |
| ill_t *ill = (ill_t *)ill_arg; |
| |
| ASSERT(ire->ire_in_ill != NULL); |
| |
| if ((ire->ire_in_ill != NULL && ire->ire_in_ill == ill) || |
| (ire->ire_stq == ill->ill_wq) || (ire->ire_stq == ill->ill_rq)) { |
| ire_delete(ire); |
| } |
| } |
| |
| /* |
| * Remove ire/nce from the fastpath list. |
| */ |
| void |
| ill_fastpath_nack(ill_t *ill) |
| { |
| if (ill->ill_isv6) { |
| nce_fastpath_list_dispatch(ill, NULL, NULL); |
| } else { |
| ire_fastpath_list_dispatch(ill, NULL, NULL); |
| } |
| } |
| |
| /* Consume an M_IOCACK of the fastpath probe. */ |
| void |
| ill_fastpath_ack(ill_t *ill, mblk_t *mp) |
| { |
| mblk_t *mp1 = mp; |
| |
| /* |
	 * If this was the first attempt, turn on the fastpath probing.
| */ |
| mutex_enter(&ill->ill_lock); |
| if (ill->ill_dlpi_fastpath_state == IDMS_INPROGRESS) |
| ill->ill_dlpi_fastpath_state = IDMS_OK; |
| mutex_exit(&ill->ill_lock); |
| |
| /* Free the M_IOCACK mblk, hold on to the data */ |
| mp = mp->b_cont; |
| freeb(mp1); |
| if (mp == NULL) |
| return; |
| if (mp->b_cont != NULL) { |
| /* |
| * Update all IRE's or NCE's that are waiting for |
| * fastpath update. |
| */ |
| if (ill->ill_isv6) { |
| /* |
			 * Update the nce's in the fastpath list.
| */ |
| nce_fastpath_list_dispatch(ill, |
| ndp_fastpath_update, mp); |
| } else { |
| |
| /* |
			 * Update the ire's in the fastpath list.
| */ |
| ire_fastpath_list_dispatch(ill, |
| ire_fastpath_update, mp); |
| /* |
			 * Check if we need to traverse the reverse tunnel
			 * table. Since there is only a single ire_type
			 * (IRE_MIPRTUN) in the table, we don't need to match
			 * on ire_type. We have to check ire_mrtun_count and
			 * not ill_mrtun_refcnt, since ill_mrtun_refcnt is set
			 * on the incoming ill and here we are dealing with
			 * the outgoing ill.
| */ |
| mutex_enter(&ire_mrtun_lock); |
| if (ire_mrtun_count != 0) { |
| mutex_exit(&ire_mrtun_lock); |
| ire_walk_ill_mrtun(MATCH_IRE_WQ, IRE_MIPRTUN, |
| (void (*)(ire_t *, void *)) |
| ire_fastpath_update, mp, ill); |
| } else { |
| mutex_exit(&ire_mrtun_lock); |
| } |
| } |
| mp1 = mp->b_cont; |
| freeb(mp); |
| mp = mp1; |
| } else { |
| ip0dbg(("ill_fastpath_ack: no b_cont\n")); |
| } |
| |
| freeb(mp); |
| } |
| |
| /* |
| * Throw an M_IOCTL message downstream asking "do you know fastpath?" |
| * The data portion of the request is a dl_unitdata_req_t template for |
| * what we would send downstream in the absence of a fastpath confirmation. |
| */ |
| int |
| ill_fastpath_probe(ill_t *ill, mblk_t *dlur_mp) |
| { |
| struct iocblk *ioc; |
| mblk_t *mp; |
| |
| if (dlur_mp == NULL) |
| return (EINVAL); |
| |
| mutex_enter(&ill->ill_lock); |
| switch (ill->ill_dlpi_fastpath_state) { |
| case IDMS_FAILED: |
| /* |
| * Driver NAKed the first fastpath ioctl - assume it doesn't |
| * support it. |
| */ |
| mutex_exit(&ill->ill_lock); |
| return (ENOTSUP); |
| case IDMS_UNKNOWN: |
| /* This is the first probe */ |
| ill->ill_dlpi_fastpath_state = IDMS_INPROGRESS; |
| break; |
| default: |
| break; |
| } |
| mutex_exit(&ill->ill_lock); |
| |
| if ((mp = mkiocb(DL_IOC_HDR_INFO)) == NULL) |
| return (EAGAIN); |
| |
| mp->b_cont = copyb(dlur_mp); |
| if (mp->b_cont == NULL) { |
| freeb(mp); |
| return (EAGAIN); |
| } |
| |
| ioc = (struct iocblk *)mp->b_rptr; |
| ioc->ioc_count = msgdsize(mp->b_cont); |
| |
| putnext(ill->ill_wq, mp); |
| return (0); |
| } |
| |
| void |
| ill_capability_probe(ill_t *ill) |
| { |
| /* |
| * Do so only if negotiation is enabled, capabilities are unknown, |
| * and a capability negotiation is not already in progress. |
| */ |
| if (ill->ill_capab_state != IDMS_UNKNOWN && |
| ill->ill_capab_state != IDMS_RENEG) |
| return; |
| |
| ill->ill_capab_state = IDMS_INPROGRESS; |
| ip1dbg(("ill_capability_probe: starting capability negotiation\n")); |
| ill_capability_proto(ill, DL_CAPABILITY_REQ, NULL); |
| } |
| |
| void |
| ill_capability_reset(ill_t *ill) |
| { |
| mblk_t *sc_mp = NULL; |
| mblk_t *tmp; |
| |
| /* |
| * Note here that we reset the state to UNKNOWN, and later send |
| * down the DL_CAPABILITY_REQ without first setting the state to |
| * INPROGRESS. We do this in order to distinguish the |
| * DL_CAPABILITY_ACK response which may come back in response to |
| * a "reset" apart from the "probe" DL_CAPABILITY_REQ. This would |
| * also handle the case where the driver doesn't send us back |
| * a DL_CAPABILITY_ACK in response, since the "probe" routine |
| * requires the state to be in UNKNOWN anyway. In any case, all |
| * features are turned off until the state reaches IDMS_OK. |
| */ |
| ill->ill_capab_state = IDMS_UNKNOWN; |
| |
| /* |
| * Disable sub-capabilities and request a list of sub-capability |
| * messages which will be sent down to the driver. Each handler |
| * allocates the corresponding dl_capability_sub_t inside an |
| * mblk, and links it to the existing sc_mp mblk, or return it |
| * as sc_mp if it's the first sub-capability (the passed in |
| * sc_mp is NULL). Upon returning from all capability handlers, |
| * sc_mp will be pulled-up, before passing it downstream. |
| */ |
| ill_capability_mdt_reset(ill, &sc_mp); |
| ill_capability_hcksum_reset(ill, &sc_mp); |
| ill_capability_zerocopy_reset(ill, &sc_mp); |
| ill_capability_ipsec_reset(ill, &sc_mp); |
| ill_capability_dls_reset(ill, &sc_mp); |
| |
| /* Nothing to send down in order to disable the capabilities? */ |
| if (sc_mp == NULL) |
| return; |
| |
| tmp = msgpullup(sc_mp, -1); |
| freemsg(sc_mp); |
| if ((sc_mp = tmp) == NULL) { |
| cmn_err(CE_WARN, "ill_capability_reset: unable to send down " |
| "DL_CAPABILITY_REQ (ENOMEM)\n"); |
| return; |
| } |
| |
| ip1dbg(("ill_capability_reset: resetting negotiated capabilities\n")); |
| ill_capability_proto(ill, DL_CAPABILITY_REQ, sc_mp); |
| } |
| |
| /* |
| * Request or set new-style hardware capabilities supported by DLS provider. |
| */ |
| static void |
| ill_capability_proto(ill_t *ill, int type, mblk_t *reqp) |
| { |
| mblk_t *mp; |
| dl_capability_req_t *capb; |
| size_t size = 0; |
| uint8_t *ptr; |
| |
| if (reqp != NULL) |
| size = MBLKL(reqp); |
| |
| mp = ip_dlpi_alloc(sizeof (dl_capability_req_t) + size, type); |
| if (mp == NULL) { |
| freemsg(reqp); |
| return; |
| } |
| ptr = mp->b_rptr; |
| |
| capb = (dl_capability_req_t *)ptr; |
| ptr += sizeof (dl_capability_req_t); |
| |
| if (reqp != NULL) { |
| capb->dl_sub_offset = sizeof (dl_capability_req_t); |
| capb->dl_sub_length = size; |
| bcopy(reqp->b_rptr, ptr, size); |
| ptr += size; |
| mp->b_cont = reqp->b_cont; |
| freeb(reqp); |
| } |
| ASSERT(ptr == mp->b_wptr); |
| |
| ill_dlpi_send(ill, mp); |
| } |
| |
| static void |
| ill_capability_id_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *outers) |
| { |
| dl_capab_id_t *id_ic; |
| uint_t sub_dl_cap = outers->dl_cap; |
| dl_capability_sub_t *inners; |
| uint8_t *capend; |
| |
| ASSERT(sub_dl_cap == DL_CAPAB_ID_WRAPPER); |
| |
| /* |
| * Note: range checks here are not absolutely sufficient to |
| * make us robust against malformed messages sent by drivers; |
| * this is in keeping with the rest of IP's dlpi handling. |
| * (Remember, it's coming from something else in the kernel |
| * address space) |
| */ |
| |
| capend = (uint8_t *)(outers + 1) + outers->dl_length; |
| if (capend > mp->b_wptr) { |
| cmn_err(CE_WARN, "ill_capability_id_ack: " |
| "malformed sub-capability too long for mblk"); |
| return; |
| } |
| |
	id_ic = (dl_capab_id_t *)(outers + 1);

	/*
	 * Make sure the outer sub-capability is large enough to hold a
	 * dl_capab_id_t before looking at the encapsulated one.
	 */
	if (outers->dl_length < sizeof (*id_ic)) {
		cmn_err(CE_WARN, "ill_capability_id_ack: malformed "
		    "ID sub-capability too short for mblk");
		return;
	}

	inners = &id_ic->id_subcap;
	if (inners->dl_length > (outers->dl_length - sizeof (*inners))) {
		cmn_err(CE_WARN, "ill_capability_id_ack: malformed "
		    "encapsulated capab type %d too long for mblk",
		    inners->dl_cap);
		return;
	}
| |
| if (!dlcapabcheckqid(&id_ic->id_mid, ill->ill_lmod_rq)) { |
| ip1dbg(("ill_capability_id_ack: mid token for capab type %d " |
| "isn't as expected; pass-thru module(s) detected, " |
| "discarding capability\n", inners->dl_cap)); |
| return; |
| } |
| |
| /* Process the encapsulated sub-capability */ |
| ill_capability_dispatch(ill, mp, inners, B_TRUE); |
| } |
| |
| /* |
| * Process Multidata Transmit capability negotiation ack received from a |
| * DLS Provider. isub must point to the sub-capability (DL_CAPAB_MDT) of a |
| * DL_CAPABILITY_ACK message. |
| */ |
| static void |
| ill_capability_mdt_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) |
| { |
| mblk_t *nmp = NULL; |
| dl_capability_req_t *oc; |
| dl_capab_mdt_t *mdt_ic, *mdt_oc; |
| ill_mdt_capab_t **ill_mdt_capab; |
| uint_t sub_dl_cap = isub->dl_cap; |
| uint8_t *capend; |
| |
| ASSERT(sub_dl_cap == DL_CAPAB_MDT); |
| |
| ill_mdt_capab = (ill_mdt_capab_t **)&ill->ill_mdt_capab; |
| |
| /* |
| * Note: range checks here are not absolutely sufficient to |
| * make us robust against malformed messages sent by drivers; |
| * this is in keeping with the rest of IP's dlpi handling. |
| * (Remember, it's coming from something else in the kernel |
| * address space) |
| */ |
| |
| capend = (uint8_t *)(isub + 1) + isub->dl_length; |
| if (capend > mp->b_wptr) { |
| cmn_err(CE_WARN, "ill_capability_mdt_ack: " |
| "malformed sub-capability too long for mblk"); |
| return; |
| } |
| |
| mdt_ic = (dl_capab_mdt_t *)(isub + 1); |
| |
| if (mdt_ic->mdt_version != MDT_VERSION_2) { |
| cmn_err(CE_CONT, "ill_capability_mdt_ack: " |
| "unsupported MDT sub-capability (version %d, expected %d)", |
| mdt_ic->mdt_version, MDT_VERSION_2); |
| return; |
| } |
| |
| if (!dlcapabcheckqid(&mdt_ic->mdt_mid, ill->ill_lmod_rq)) { |
| ip1dbg(("ill_capability_mdt_ack: mid token for MDT " |
| "capability isn't as expected; pass-thru module(s) " |
| "detected, discarding capability\n")); |
| return; |
| } |
| |
	if (mdt_ic->mdt_flags & DL_CAPAB_MDT_ENABLE) {
| if (*ill_mdt_capab == NULL) { |
| *ill_mdt_capab = kmem_zalloc(sizeof (ill_mdt_capab_t), |
| KM_NOSLEEP); |
| |
| if (*ill_mdt_capab == NULL) { |
| cmn_err(CE_WARN, "ill_capability_mdt_ack: " |
| "could not enable MDT version %d " |
| "for %s (ENOMEM)\n", MDT_VERSION_2, |
| ill->ill_name); |
| return; |
| } |
| } |
| |
| ip1dbg(("ill_capability_mdt_ack: interface %s supports " |
| "MDT version %d (%d bytes leading, %d bytes trailing " |
| "header spaces, %d max pld bufs, %d span limit)\n", |
| ill->ill_name, MDT_VERSION_2, |
| mdt_ic->mdt_hdr_head, mdt_ic->mdt_hdr_tail, |
| mdt_ic->mdt_max_pld, mdt_ic->mdt_span_limit)); |
| |
| (*ill_mdt_capab)->ill_mdt_version = MDT_VERSION_2; |
| (*ill_mdt_capab)->ill_mdt_on = 1; |
| /* |
| * Round the following values to the nearest 32-bit; ULP |
| * may further adjust them to accomodate for additional |
| * protocol headers. We pass these values to ULP during |
| * bind time. |
| */ |
| (*ill_mdt_capab)->ill_mdt_hdr_head = |
| roundup(mdt_ic->mdt_hdr_head, 4); |
| (*ill_mdt_capab)->ill_mdt_hdr_tail = |
| roundup(mdt_ic->mdt_hdr_tail, 4); |
| (*ill_mdt_capab)->ill_mdt_max_pld = mdt_ic->mdt_max_pld; |
| (*ill_mdt_capab)->ill_mdt_span_limit = mdt_ic->mdt_span_limit; |
| |
| ill->ill_capabilities |= ILL_CAPAB_MDT; |
| } else { |
| uint_t size; |
| uchar_t *rptr; |
| |
| size = sizeof (dl_capability_req_t) + |
| sizeof (dl_capability_sub_t) + sizeof (dl_capab_mdt_t); |
| |
| if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { |
| cmn_err(CE_WARN, "ill_capability_mdt_ack: " |
| "could not enable MDT for %s (ENOMEM)\n", |
| ill->ill_name); |
| return; |
| } |
| |
| rptr = nmp->b_rptr; |
| /* initialize dl_capability_req_t */ |
| oc = (dl_capability_req_t *)nmp->b_rptr; |
| oc->dl_sub_offset = sizeof (dl_capability_req_t); |
| oc->dl_sub_length = sizeof (dl_capability_sub_t) + |
| sizeof (dl_capab_mdt_t); |
| nmp->b_rptr += sizeof (dl_capability_req_t); |
| |
| /* initialize dl_capability_sub_t */ |
| bcopy(isub, nmp->b_rptr, sizeof (*isub)); |
| nmp->b_rptr += sizeof (*isub); |
| |
| /* initialize dl_capab_mdt_t */ |
| mdt_oc = (dl_capab_mdt_t *)nmp->b_rptr; |
| bcopy(mdt_ic, mdt_oc, sizeof (*mdt_ic)); |
| |
| nmp->b_rptr = rptr; |
| |
| ip1dbg(("ill_capability_mdt_ack: asking interface %s " |
| "to enable MDT version %d\n", ill->ill_name, |
| MDT_VERSION_2)); |
| |
| /* set ENABLE flag */ |
| mdt_oc->mdt_flags |= DL_CAPAB_MDT_ENABLE; |
| |
| /* nmp points to a DL_CAPABILITY_REQ message to enable MDT */ |
| ill_dlpi_send(ill, nmp); |
| } |
| } |
| |
| static void |
| ill_capability_mdt_reset(ill_t *ill, mblk_t **sc_mp) |
| { |
| mblk_t *mp; |
| dl_capab_mdt_t *mdt_subcap; |
| dl_capability_sub_t *dl_subcap; |
| int size; |
| |
| if (!ILL_MDT_CAPABLE(ill)) |
| return; |
| |
| ASSERT(ill->ill_mdt_capab != NULL); |
| /* |
| * Clear the capability flag for MDT but retain the ill_mdt_capab |
| * structure since it's possible that another thread is still |
| * referring to it. The structure only gets deallocated when |
| * we destroy the ill. |
| */ |
| ill->ill_capabilities &= ~ILL_CAPAB_MDT; |
| |
| size = sizeof (*dl_subcap) + sizeof (*mdt_subcap); |
| |
| mp = allocb(size, BPRI_HI); |
| if (mp == NULL) { |
| ip1dbg(("ill_capability_mdt_reset: unable to allocate " |
| "request to disable MDT\n")); |
| return; |
| } |
| |
| mp->b_wptr = mp->b_rptr + size; |
| |
| dl_subcap = (dl_capability_sub_t *)mp->b_rptr; |
| dl_subcap->dl_cap = DL_CAPAB_MDT; |
| dl_subcap->dl_length = sizeof (*mdt_subcap); |
| |
| mdt_subcap = (dl_capab_mdt_t *)(dl_subcap + 1); |
| mdt_subcap->mdt_version = ill->ill_mdt_capab->ill_mdt_version; |
| mdt_subcap->mdt_flags = 0; |
| mdt_subcap->mdt_hdr_head = 0; |
| mdt_subcap->mdt_hdr_tail = 0; |
| |
| if (*sc_mp != NULL) |
| linkb(*sc_mp, mp); |
| else |
| *sc_mp = mp; |
| } |
| |
| /* |
| * Send a DL_NOTIFY_REQ to the specified ill to enable |
| * DL_NOTE_PROMISC_ON/OFF_PHYS notifications. |
| * Invoked by ill_capability_ipsec_ack() before enabling IPsec hardware |
| * acceleration. |
| * Returns B_TRUE on success, B_FALSE if the message could not be sent. |
| */ |
| static boolean_t |
| ill_enable_promisc_notify(ill_t *ill) |
| { |
| mblk_t *mp; |
| dl_notify_req_t *req; |
| |
| IPSECHW_DEBUG(IPSECHW_PKT, ("ill_enable_promisc_notify:\n")); |
| |
| mp = ip_dlpi_alloc(sizeof (dl_notify_req_t), DL_NOTIFY_REQ); |
| if (mp == NULL) |
| return (B_FALSE); |
| |
| req = (dl_notify_req_t *)mp->b_rptr; |
| req->dl_notifications = DL_NOTE_PROMISC_ON_PHYS | |
| DL_NOTE_PROMISC_OFF_PHYS; |
| |
| ill_dlpi_send(ill, mp); |
| |
| return (B_TRUE); |
| } |
| |
| |
| /* |
| * Allocate an IPsec capability request which will be filled by our |
| * caller to turn on support for one or more algorithms. |
| */ |
| static mblk_t * |
| ill_alloc_ipsec_cap_req(ill_t *ill, dl_capability_sub_t *isub) |
| { |
	mblk_t *nmp;
	dl_capability_req_t *ocap;
	dl_capab_ipsec_t *ocip;
	dl_capab_ipsec_t *icip;
	uint8_t *ptr;

	icip = (dl_capab_ipsec_t *)(isub + 1);
| |
| /* |
| * The first time around, we send a DL_NOTIFY_REQ to enable |
| * PROMISC_ON/OFF notification from the provider. We need to |
| * do this before enabling the algorithms to avoid leakage of |
| * cleartext packets. |
| */ |
| |
| if (!ill_enable_promisc_notify(ill)) |
| return (NULL); |
| |
| /* |
| * Allocate new mblk which will contain a new capability |
| * request to enable the capabilities. |
| */ |
| |
| nmp = ip_dlpi_alloc(sizeof (dl_capability_req_t) + |
| sizeof (dl_capability_sub_t) + isub->dl_length, DL_CAPABILITY_REQ); |
| if (nmp == NULL) |
| return (NULL); |
| |
| ptr = nmp->b_rptr; |
| |
| /* initialize dl_capability_req_t */ |
| ocap = (dl_capability_req_t *)ptr; |
| ocap->dl_sub_offset = sizeof (dl_capability_req_t); |
| ocap->dl_sub_length = sizeof (dl_capability_sub_t) + isub->dl_length; |
| ptr += sizeof (dl_capability_req_t); |
| |
| /* initialize dl_capability_sub_t */ |
| bcopy(isub, ptr, sizeof (*isub)); |
| ptr += sizeof (*isub); |
| |
| /* initialize dl_capab_ipsec_t */ |
| ocip = (dl_capab_ipsec_t *)ptr; |
| bcopy(icip, ocip, sizeof (*icip)); |
| |
| nmp->b_wptr = (uchar_t *)(&ocip->cip_data[0]); |
| return (nmp); |
| } |
| |
| /* |
| * Process an IPsec capability negotiation ack received from a DLS Provider. |
| * isub must point to the sub-capability (DL_CAPAB_IPSEC_AH or |
| * DL_CAPAB_IPSEC_ESP) of a DL_CAPABILITY_ACK message. |
| */ |
| static void |
| ill_capability_ipsec_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) |
| { |
| dl_capab_ipsec_t *icip; |
| dl_capab_ipsec_alg_t *ialg; /* ptr to input alg spec. */ |
| dl_capab_ipsec_alg_t *oalg; /* ptr to output alg spec. */ |
| uint_t cipher, nciphers; |
| mblk_t *nmp; |
| uint_t alg_len; |
| boolean_t need_sadb_dump; |
| uint_t sub_dl_cap = isub->dl_cap; |
| ill_ipsec_capab_t **ill_capab; |
| uint64_t ill_capab_flag; |
| uint8_t *capend, *ciphend; |
| boolean_t sadb_resync; |
| |
| ASSERT(sub_dl_cap == DL_CAPAB_IPSEC_AH || |
| sub_dl_cap == DL_CAPAB_IPSEC_ESP); |
| |
| if (sub_dl_cap == DL_CAPAB_IPSEC_AH) { |
| ill_capab = (ill_ipsec_capab_t **)&ill->ill_ipsec_capab_ah; |
| ill_capab_flag = ILL_CAPAB_AH; |
| } else { |
| ill_capab = (ill_ipsec_capab_t **)&ill->ill_ipsec_capab_esp; |
| ill_capab_flag = ILL_CAPAB_ESP; |
| } |
| |
| /* |
| * If the ill capability structure exists, then this incoming |
| * DL_CAPABILITY_ACK is a response to a "renegotiation" cycle. |
| * If this is so, then we'd need to resynchronize the SADB |
| * after re-enabling the offloaded ciphers. |
| */ |
| sadb_resync = (*ill_capab != NULL); |
| |
| /* |
| * Note: range checks here are not absolutely sufficient to |
| * make us robust against malformed messages sent by drivers; |
| * this is in keeping with the rest of IP's dlpi handling. |
| * (Remember, it's coming from something else in the kernel |
| * address space) |
| */ |
| |
| capend = (uint8_t *)(isub + 1) + isub->dl_length; |
| if (capend > mp->b_wptr) { |
| cmn_err(CE_WARN, "ill_capability_ipsec_ack: " |
| "malformed sub-capability too long for mblk"); |
| return; |
| } |
| |
| /* |
| * There are two types of acks we process here: |
| * 1. acks in reply to a (first form) generic capability req |
| * (no ENABLE flag set) |
	 * 2. acks in reply to an ENABLE capability req.
	 *    (ENABLE flag set)
| * |
| * We process the subcapability passed as argument as follows: |
| * 1 do initializations |
| * 1.1 initialize nmp = NULL |
| * 1.2 set need_sadb_dump to B_FALSE |
| * 2 for each cipher in subcapability: |
| * 2.1 if ENABLE flag is set: |
| * 2.1.1 update per-ill ipsec capabilities info |
| * 2.1.2 set need_sadb_dump to B_TRUE |
| * 2.2 if ENABLE flag is not set: |
| * 2.2.1 if nmp is NULL: |
| * 2.2.1.1 allocate and initialize nmp |
| * 2.2.1.2 init current pos in nmp |
| * 2.2.2 copy current cipher to current pos in nmp |
| * 2.2.3 set ENABLE flag in nmp |
| * 2.2.4 update current pos |
| * 3 if nmp is not equal to NULL, send enable request |
| * 3.1 send capability request |
| * 4 if need_sadb_dump is B_TRUE |
| * 4.1 enable promiscuous on/off notifications |
	 * 4.2 call ill_dlpi_send(isub->dl_cap) to send all
	 *     AH or ESP SAs to the interface.
| */ |
| |
| nmp = NULL; |
| oalg = NULL; |
| need_sadb_dump = B_FALSE; |
| icip = (dl_capab_ipsec_t *)(isub + 1); |
| ialg = (dl_capab_ipsec_alg_t *)(&icip->cip_data[0]); |
| |
| nciphers = icip->cip_nciphers; |
| ciphend = (uint8_t *)(ialg + icip->cip_nciphers); |
| |
| if (ciphend > capend) { |
| cmn_err(CE_WARN, "ill_capability_ipsec_ack: " |
| "too many ciphers for sub-capability len"); |
| return; |
| } |
| |
| for (cipher = 0; cipher < nciphers; cipher++) { |
| alg_len = sizeof (dl_capab_ipsec_alg_t); |
| |
| if (ialg->alg_flag & DL_CAPAB_ALG_ENABLE) { |
| /* |
| * TBD: when we provide a way to disable capabilities |
| * from above, need to manage the request-pending state |
| * and fail if we were not expecting this ACK. |
| */ |
| IPSECHW_DEBUG(IPSECHW_CAPAB, |
| ("ill_capability_ipsec_ack: got ENABLE ACK\n")); |
| |
| /* |
| * Update IPsec capabilities for this ill |
| */ |
| |
| if (*ill_capab == NULL) { |
| IPSECHW_DEBUG(IPSECHW_CAPAB, |
| ("ill_capability_ipsec_ack: " |
| "allocating ipsec_capab for ill\n")); |
| *ill_capab = ill_ipsec_capab_alloc(); |
| |
| if (*ill_capab == NULL) { |
| cmn_err(CE_WARN, |
| "ill_capability_ipsec_ack: " |
| "could not enable IPsec Hardware " |
| "acceleration for %s (ENOMEM)\n", |
| ill->ill_name); |
| return; |
| } |
| } |
| |
| ASSERT(ialg->alg_type == DL_CAPAB_IPSEC_ALG_AUTH || |
| ialg->alg_type == DL_CAPAB_IPSEC_ALG_ENCR); |
| |
| if (ialg->alg_prim >= MAX_IPSEC_ALGS) { |
| cmn_err(CE_WARN, |
| "ill_capability_ipsec_ack: " |
| "malformed IPsec algorithm id %d", |
| ialg->alg_prim); |
| continue; |
| } |
| |
| if (ialg->alg_type == DL_CAPAB_IPSEC_ALG_AUTH) { |
| IPSEC_ALG_ENABLE((*ill_capab)->auth_hw_algs, |
| ialg->alg_prim); |
| } else { |
| ipsec_capab_algparm_t *alp; |
| |
| IPSEC_ALG_ENABLE((*ill_capab)->encr_hw_algs, |
| ialg->alg_prim); |
| if (!ill_ipsec_capab_resize_algparm(*ill_capab, |
| ialg->alg_prim)) { |
| cmn_err(CE_WARN, |
| "ill_capability_ipsec_ack: " |
| "no space for IPsec alg id %d", |
| ialg->alg_prim); |
| continue; |
| } |
| alp = &((*ill_capab)->encr_algparm[ |
| ialg->alg_prim]); |
| alp->minkeylen = ialg->alg_minbits; |
| alp->maxkeylen = ialg->alg_maxbits; |
| } |
| ill->ill_capabilities |= ill_capab_flag; |
| /* |
| * indicate that a capability was enabled, which |
| * will be used below to kick off a SADB dump |
| * to the ill. |
| */ |
| need_sadb_dump = B_TRUE; |
| } else { |
| IPSECHW_DEBUG(IPSECHW_CAPAB, |
| ("ill_capability_ipsec_ack: enabling alg 0x%x\n", |
| ialg->alg_prim)); |
| |
| if (nmp == NULL) { |
| nmp = ill_alloc_ipsec_cap_req(ill, isub); |
| if (nmp == NULL) { |
| /* |
| * Sending the PROMISC_ON/OFF |
| * notification request failed. |
| * We cannot enable the algorithms |
| * since the Provider will not |
| * notify IP of promiscous mode |
| * changes, which could lead |
| * to leakage of packets. |
| */ |
| cmn_err(CE_WARN, |
| "ill_capability_ipsec_ack: " |
| "could not enable IPsec Hardware " |
| "acceleration for %s (ENOMEM)\n", |
| ill->ill_name); |
| return; |
| } |
| /* ptr to current output alg specifier */ |
| oalg = (dl_capab_ipsec_alg_t *)nmp->b_wptr; |
| } |
| |
| /* |
| * Copy current alg specifier, set ENABLE |
| * flag, and advance to next output alg. |
| * For now we enable all IPsec capabilities. |
| */ |
| ASSERT(oalg != NULL); |
| bcopy(ialg, oalg, alg_len); |
| oalg->alg_flag |= DL_CAPAB_ALG_ENABLE; |
| nmp->b_wptr += alg_len; |
| oalg = (dl_capab_ipsec_alg_t *)nmp->b_wptr; |
| } |
| |
| /* move to next input algorithm specifier */ |
| ialg = (dl_capab_ipsec_alg_t *) |
| ((char *)ialg + alg_len); |
| } |
| |
| if (nmp != NULL) |
| /* |
| * nmp points to a DL_CAPABILITY_REQ message to enable |
| * IPsec hardware acceleration. |
| */ |
| ill_dlpi_send(ill, nmp); |
| |
| if (need_sadb_dump) |
| /* |
| * An acknowledgement corresponding to a request to |
| * enable acceleration was received, notify SADB. |
| */ |
| ill_ipsec_capab_add(ill, sub_dl_cap, sadb_resync); |
| } |
| |
| /* |
| * Given an mblk with enough space in it, create sub-capability entries for |
| * DL_CAPAB_IPSEC_{AH,ESP} types which consist of previously-advertised |
| * offloaded ciphers (both AUTH and ENCR) with their enable flags cleared, |
| * in preparation for the reset the DL_CAPABILITY_REQ message. |
| */ |
| static void |
| ill_fill_ipsec_reset(uint_t nciphers, int stype, uint_t slen, |
| ill_ipsec_capab_t *ill_cap, mblk_t *mp) |
| { |
| dl_capab_ipsec_t *oipsec; |
| dl_capab_ipsec_alg_t *oalg; |
| dl_capability_sub_t *dl_subcap; |
| int i, k; |
| |
| ASSERT(nciphers > 0); |
| ASSERT(ill_cap != NULL); |
| ASSERT(mp != NULL); |
| ASSERT(MBLKTAIL(mp) >= sizeof (*dl_subcap) + sizeof (*oipsec) + slen); |
| |
| /* dl_capability_sub_t for "stype" */ |
| dl_subcap = (dl_capability_sub_t *)mp->b_wptr; |
| dl_subcap->dl_cap = stype; |
| dl_subcap->dl_length = sizeof (dl_capab_ipsec_t) + slen; |
| mp->b_wptr += sizeof (dl_capability_sub_t); |
| |
| /* dl_capab_ipsec_t for "stype" */ |
| oipsec = (dl_capab_ipsec_t *)mp->b_wptr; |
| oipsec->cip_version = 1; |
| oipsec->cip_nciphers = nciphers; |
| mp->b_wptr = (uchar_t *)&oipsec->cip_data[0]; |
| |
| /* create entries for "stype" AUTH ciphers */ |
| for (i = 0; i < ill_cap->algs_size; i++) { |
| for (k = 0; k < BITSPERBYTE; k++) { |
| if ((ill_cap->auth_hw_algs[i] & (1 << k)) == 0) |
| continue; |
| |
| oalg = (dl_capab_ipsec_alg_t *)mp->b_wptr; |
| bzero((void *)oalg, sizeof (*oalg)); |
| oalg->alg_type = DL_CAPAB_IPSEC_ALG_AUTH; |
| oalg->alg_prim = k + (BITSPERBYTE * i); |
| mp->b_wptr += sizeof (dl_capab_ipsec_alg_t); |
| } |
| } |
| /* create entries for "stype" ENCR ciphers */ |
| for (i = 0; i < ill_cap->algs_size; i++) { |
| for (k = 0; k < BITSPERBYTE; k++) { |
| if ((ill_cap->encr_hw_algs[i] & (1 << k)) == 0) |
| continue; |
| |
| oalg = (dl_capab_ipsec_alg_t *)mp->b_wptr; |
| bzero((void *)oalg, sizeof (*oalg)); |
| oalg->alg_type = DL_CAPAB_IPSEC_ALG_ENCR; |
| oalg->alg_prim = k + (BITSPERBYTE * i); |
| mp->b_wptr += sizeof (dl_capab_ipsec_alg_t); |
| } |
| } |
| } |
| |
| /* |
| * Macro to count number of 1s in a byte (8-bit word). The total count is |
| * accumulated into the passed-in argument (sum). We could use SPARCv9's |
| * POPC instruction, but our macro is more flexible for an arbitrary length |
| * of bytes, such as {auth,encr}_hw_algs. These variables are currently |
| * 256-bits long (MAX_IPSEC_ALGS), so if we know for sure that the length |
| * stays that way, we can reduce the number of iterations required. |
| */ |
| #define COUNT_1S(val, sum) { \ |
| uint8_t x = val & 0xff; \ |
| x = (x & 0x55) + ((x >> 1) & 0x55); \ |
| x = (x & 0x33) + ((x >> 2) & 0x33); \ |
| sum += (x & 0xf) + ((x >> 4) & 0xf); \ |
| } |
| |
| /* ARGSUSED */ |
| static void |
| ill_capability_ipsec_reset(ill_t *ill, mblk_t **sc_mp) |
| { |
| mblk_t *mp; |
| ill_ipsec_capab_t *cap_ah = ill->ill_ipsec_capab_ah; |
| ill_ipsec_capab_t *cap_esp = ill->ill_ipsec_capab_esp; |
| uint64_t ill_capabilities = ill->ill_capabilities; |
| int ah_cnt = 0, esp_cnt = 0; |
| int ah_len = 0, esp_len = 0; |
| int i, size = 0; |
| |
| if (!(ill_capabilities & (ILL_CAPAB_AH | ILL_CAPAB_ESP))) |
| return; |
| |
| ASSERT(cap_ah != NULL || !(ill_capabilities & ILL_CAPAB_AH)); |
| ASSERT(cap_esp != NULL || !(ill_capabilities & ILL_CAPAB_ESP)); |
| |
| /* Find out the number of ciphers for AH */ |
| if (cap_ah != NULL) { |
| for (i = 0; i < cap_ah->algs_size; i++) { |
| COUNT_1S(cap_ah->auth_hw_algs[i], ah_cnt); |
| COUNT_1S(cap_ah->encr_hw_algs[i], ah_cnt); |
| } |
| if (ah_cnt > 0) { |
| size += sizeof (dl_capability_sub_t) + |
| sizeof (dl_capab_ipsec_t); |
| /* dl_capab_ipsec_t contains one dl_capab_ipsec_alg_t */ |
| ah_len = (ah_cnt - 1) * sizeof (dl_capab_ipsec_alg_t); |
| size += ah_len; |
| } |
| } |
| |
| /* Find out the number of ciphers for ESP */ |
| if (cap_esp != NULL) { |
| for (i = 0; i < cap_esp->algs_size; i++) { |
| COUNT_1S(cap_esp->auth_hw_algs[i], esp_cnt); |
| COUNT_1S(cap_esp->encr_hw_algs[i], esp_cnt); |
| } |
| if (esp_cnt > 0) { |
| size += sizeof (dl_capability_sub_t) + |
| sizeof (dl_capab_ipsec_t); |
| /* dl_capab_ipsec_t contains one dl_capab_ipsec_alg_t */ |
| esp_len = (esp_cnt - 1) * sizeof (dl_capab_ipsec_alg_t); |
| size += esp_len; |
| } |
| } |
| |
| if (size == 0) { |
| ip1dbg(("ill_capability_ipsec_reset: capabilities exist but " |
| "there's nothing to reset\n")); |
| return; |
| } |
| |
| mp = allocb(size, BPRI_HI); |
| if (mp == NULL) { |
| ip1dbg(("ill_capability_ipsec_reset: unable to allocate " |
| "request to disable IPSEC Hardware Acceleration\n")); |
| return; |
| } |
| |
| /* |
| * Clear the capability flags for IPSec HA but retain the ill |
| * capability structures since it's possible that another thread |
| * is still referring to them. The structures only get deallocated |
| * when we destroy the ill. |
| * |
| * Various places check the flags to see if the ill is capable of |
| * hardware acceleration, and by clearing them we ensure that new |
| * outbound IPSec packets are sent down encrypted. |
| */ |
| ill->ill_capabilities &= ~(ILL_CAPAB_AH | ILL_CAPAB_ESP); |
| |
| /* Fill in DL_CAPAB_IPSEC_AH sub-capability entries */ |
| if (ah_cnt > 0) { |
| ill_fill_ipsec_reset(ah_cnt, DL_CAPAB_IPSEC_AH, ah_len, |
| cap_ah, mp); |
| ASSERT(mp->b_rptr + size >= mp->b_wptr); |
| } |
| |
| /* Fill in DL_CAPAB_IPSEC_ESP sub-capability entries */ |
| if (esp_cnt > 0) { |
| ill_fill_ipsec_reset(esp_cnt, DL_CAPAB_IPSEC_ESP, esp_len, |
| cap_esp, mp); |
| ASSERT(mp->b_rptr + size >= mp->b_wptr); |
| } |
| |
| /* |
| * At this point we've composed a bunch of sub-capabilities to be |
| * encapsulated in a DL_CAPABILITY_REQ and later sent downstream |
| * by the caller. Upon receiving this reset message, the driver |
| * must stop inbound decryption (by destroying all inbound SAs) |
| * and let the corresponding packets come in encrypted. |
| */ |
| |
| if (*sc_mp != NULL) |
| linkb(*sc_mp, mp); |
| else |
| *sc_mp = mp; |
| } |
| |
| static void |
| ill_capability_dispatch(ill_t *ill, mblk_t *mp, dl_capability_sub_t *subp, |
| boolean_t encapsulated) |
| { |
| boolean_t legacy = B_FALSE; |
| |
| /* |
| * If this DL_CAPABILITY_ACK came in as a response to our "reset" |
| * DL_CAPABILITY_REQ, ignore it during this cycle. We've just |
| * instructed the driver to disable its advertised capabilities, |
| * so there's no point in accepting any response at this moment. |
| */ |
| if (ill->ill_capab_state == IDMS_UNKNOWN) |
| return; |
| |
| /* |
| * Note that only the following two sub-capabilities may be |
| * considered as "legacy", since their original definitions |
| * do not incorporate the dl_mid_t module ID token, and hence |
| * may require the use of the wrapper sub-capability. |
| */ |
| switch (subp->dl_cap) { |
| case DL_CAPAB_IPSEC_AH: |
| case DL_CAPAB_IPSEC_ESP: |
| legacy = B_TRUE; |
| break; |
| } |
| |
| /* |
| * For legacy sub-capabilities which don't incorporate a queue_t |
| * pointer in their structures, discard them if we detect that |
| * there are intermediate modules in between IP and the driver. |
| */ |
| if (!encapsulated && legacy && ill->ill_lmod_cnt > 1) { |
| ip1dbg(("ill_capability_dispatch: unencapsulated capab type " |
| "%d discarded; %d module(s) present below IP\n", |
| subp->dl_cap, ill->ill_lmod_cnt)); |
| return; |
| } |
| |
| switch (subp->dl_cap) { |
| case DL_CAPAB_IPSEC_AH: |
| case DL_CAPAB_IPSEC_ESP: |
| ill_capability_ipsec_ack(ill, mp, subp); |
| break; |
| case DL_CAPAB_MDT: |
| ill_capability_mdt_ack(ill, mp, subp); |
| break; |
| case DL_CAPAB_HCKSUM: |
| ill_capability_hcksum_ack(ill, mp, subp); |
| break; |
| case DL_CAPAB_ZEROCOPY: |
| ill_capability_zerocopy_ack(ill, mp, subp); |
| break; |
| case DL_CAPAB_POLL: |
| if (!SOFT_RINGS_ENABLED()) |
| ill_capability_dls_ack(ill, mp, subp); |
| break; |
| case DL_CAPAB_SOFT_RING: |
| if (SOFT_RINGS_ENABLED()) |
| ill_capability_dls_ack(ill, mp, subp); |
| break; |
| default: |
| ip1dbg(("ill_capability_dispatch: unknown capab type %d\n", |
| subp->dl_cap)); |
| } |
| } |
| |
| /* |
| * As part of negotiating polling capability, the driver tells us |
| * the default (or normal) blanking interval and packet threshold |
| * (the receive timer fires if blanking interval is reached or |
| * the packet threshold is reached). |
| * |
 * As part of manipulating the polling interval, we always use our
 * estimated interval (avg service time * number of packets queued
 * on the squeue) but we try to blank for a minimum of
 * rr_normal_blank_time * rr_max_blank_ratio. We disable the
 * packet threshold during this time. When we are not in polling mode
 * we set the blank interval typically lower, rr_normal_blank_time *
 * rr_min_blank_ratio, but raise the packet count by a factor of
 * rr_min_pkt_cnt_ratio so that we still get chains where possible,
 * albeit over a shorter interval.
 */
| #define RR_MAX_BLANK_RATIO 20 |
| #define RR_MIN_BLANK_RATIO 10 |
| #define RR_MAX_PKT_CNT_RATIO 3 |
| #define RR_MIN_PKT_CNT_RATIO 3 |
| |
| /* |
| * These can be tuned via /etc/system. |
| */ |
| int rr_max_blank_ratio = RR_MAX_BLANK_RATIO; |
| int rr_min_blank_ratio = RR_MIN_BLANK_RATIO; |
| int rr_max_pkt_cnt_ratio = RR_MAX_PKT_CNT_RATIO; |
| int rr_min_pkt_cnt_ratio = RR_MIN_PKT_CNT_RATIO; |
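
/*
 * A worked example with the default ratios above: assuming a driver that
 * advertises mrf_normal_blank_time = 128 and mrf_normal_pkt_count = 8
 * (illustrative values; the time units are defined by the driver),
 * ill_ring_add() below would compute:
 *
 *	rr_max_blank_time = 128 * 20 = 2560
 *	rr_min_blank_time = 128 * 10 = 1280
 *	rr_max_pkt_cnt	  =   8 *  3 =	 24
 *	rr_min_pkt_cnt	  =   8 *  3 =	 24
 */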
| |
| static mac_resource_handle_t |
| ill_ring_add(void *arg, mac_resource_t *mrp) |
| { |
| ill_t *ill = (ill_t *)arg; |
| mac_rx_fifo_t *mrfp = (mac_rx_fifo_t *)mrp; |
| ill_rx_ring_t *rx_ring; |
| int ip_rx_index; |
| |
| ASSERT(mrp != NULL); |
| if (mrp->mr_type != MAC_RX_FIFO) { |
| return (NULL); |
| } |
| ASSERT(ill != NULL); |
| ASSERT(ill->ill_dls_capab != NULL); |
| |
| mutex_enter(&ill->ill_lock); |
| for (ip_rx_index = 0; ip_rx_index < ILL_MAX_RINGS; ip_rx_index++) { |
| rx_ring = &ill->ill_dls_capab->ill_ring_tbl[ip_rx_index]; |
| ASSERT(rx_ring != NULL); |
| |
| if (rx_ring->rr_ring_state == ILL_RING_FREE) { |
| time_t normal_blank_time = |
| mrfp->mrf_normal_blank_time; |
| uint_t normal_pkt_cnt = |
| mrfp->mrf_normal_pkt_count; |
| |
| bzero(rx_ring, sizeof (ill_rx_ring_t)); |
| |
| rx_ring->rr_blank = mrfp->mrf_blank; |
| rx_ring->rr_handle = mrfp->mrf_arg; |
| rx_ring->rr_ill = ill; |
| rx_ring->rr_normal_blank_time = normal_blank_time; |
| rx_ring->rr_normal_pkt_cnt = normal_pkt_cnt; |
| |
| rx_ring->rr_max_blank_time = |
| normal_blank_time * rr_max_blank_ratio; |
| rx_ring->rr_min_blank_time = |
| normal_blank_time * rr_min_blank_ratio; |
| rx_ring->rr_max_pkt_cnt = |
| normal_pkt_cnt * rr_max_pkt_cnt_ratio; |
| rx_ring->rr_min_pkt_cnt = |
| normal_pkt_cnt * rr_min_pkt_cnt_ratio; |
| |
| rx_ring->rr_ring_state = ILL_RING_INUSE; |
| mutex_exit(&ill->ill_lock); |
| |
| DTRACE_PROBE2(ill__ring__add, (void *), ill, |
| (int), ip_rx_index); |
| return ((mac_resource_handle_t)rx_ring); |
| } |
| } |
| |
| /* |
| * We ran out of ILL_MAX_RINGS worth rx_ring structures. If |
| * we have devices which can overwhelm this limit, ILL_MAX_RING |
| * should be made configurable. Meanwhile it cause no panic because |
| * driver will pass ip_input a NULL handle which will make |
| * IP allocate the default squeue and Polling mode will not |
| * be used for this ring. |
| */ |
| cmn_err(CE_NOTE, "Reached maximum number of receiving rings (%d) " |
| "for %s\n", ILL_MAX_RINGS, ill->ill_name); |
| |
| mutex_exit(&ill->ill_lock); |
| return (NULL); |
| } |
| |
| static boolean_t |
| ill_capability_dls_init(ill_t *ill) |
| { |
| ill_dls_capab_t *ill_dls = ill->ill_dls_capab; |
| conn_t *connp; |
| size_t sz; |
| |
| if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) { |
| if (ill_dls == NULL) { |
| cmn_err(CE_PANIC, "ill_capability_dls_init: " |
| "soft_ring enabled for ill=%s (%p) but data " |
| "structs uninitialized\n", ill->ill_name, |
| (void *)ill); |
| } |
| return (B_TRUE); |
| } else if (ill->ill_capabilities & ILL_CAPAB_POLL) { |
| if (ill_dls == NULL) { |
| cmn_err(CE_PANIC, "ill_capability_dls_init: " |
| "polling enabled for ill=%s (%p) but data " |
| "structs uninitialized\n", ill->ill_name, |
| (void *)ill); |
| } |
| return (B_TRUE); |
| } |
| |
| if (ill_dls != NULL) { |
| ill_rx_ring_t *rx_ring = ill_dls->ill_ring_tbl; |
| /* Soft_Ring or polling is being re-enabled */ |
| |
| connp = ill_dls->ill_unbind_conn; |
| ASSERT(rx_ring != NULL); |
| bzero((void *)ill_dls, sizeof (ill_dls_capab_t)); |
| bzero((void *)rx_ring, |
| sizeof (ill_rx_ring_t) * ILL_MAX_RINGS); |
| ill_dls->ill_ring_tbl = rx_ring; |
| ill_dls->ill_unbind_conn = connp; |
| return (B_TRUE); |
| } |
| |
| if ((connp = ipcl_conn_create(IPCL_TCPCONN, KM_NOSLEEP)) == NULL) |
| return (B_FALSE); |
| |
| sz = sizeof (ill_dls_capab_t); |
| sz += sizeof (ill_rx_ring_t) * ILL_MAX_RINGS; |
| |
| ill_dls = kmem_zalloc(sz, KM_NOSLEEP); |
| if (ill_dls == NULL) { |
| |