| /* |
| * CDDL HEADER START |
| * |
| * The contents of this file are subject to the terms of the |
| * Common Development and Distribution License (the "License"). |
| * You may not use this file except in compliance with the License. |
| * |
| * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE |
| * or http://www.opensolaris.org/os/licensing. |
| * See the License for the specific language governing permissions |
| * and limitations under the License. |
| * |
| * When distributing Covered Code, include this CDDL HEADER in each |
| * file and include the License file at usr/src/OPENSOLARIS.LICENSE. |
| * If applicable, add the following below this CDDL HEADER, with the |
| * fields enclosed by brackets "[]" replaced with your own identifying |
| * information: Portions Copyright [yyyy] [name of copyright owner] |
| * |
| * CDDL HEADER END |
| */ |
| /* |
| * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. |
| * Copyright (c) 1990 Mentat Inc. |
| * Copyright (c) 2013 by Delphix. All rights reserved. |
| * Copyright (c) 2016, Joyent, Inc. All rights reserved. |
| * Copyright (c) 2014, OmniTI Computer Consulting, Inc. All rights reserved. |
| */ |
| |
| /* |
| * This file contains the interface control functions for IP. |
| */ |
| |
| #include <sys/types.h> |
| #include <sys/stream.h> |
| #include <sys/dlpi.h> |
| #include <sys/stropts.h> |
| #include <sys/strsun.h> |
| #include <sys/sysmacros.h> |
| #include <sys/strsubr.h> |
| #include <sys/strlog.h> |
| #include <sys/ddi.h> |
| #include <sys/sunddi.h> |
| #include <sys/cmn_err.h> |
| #include <sys/kstat.h> |
| #include <sys/debug.h> |
| #include <sys/zone.h> |
| #include <sys/sunldi.h> |
| #include <sys/file.h> |
| #include <sys/bitmap.h> |
| #include <sys/cpuvar.h> |
| #include <sys/time.h> |
| #include <sys/ctype.h> |
| #include <sys/kmem.h> |
| #include <sys/systm.h> |
| #include <sys/param.h> |
| #include <sys/socket.h> |
| #include <sys/isa_defs.h> |
| #include <net/if.h> |
| #include <net/if_arp.h> |
| #include <net/if_types.h> |
| #include <net/if_dl.h> |
| #include <net/route.h> |
| #include <sys/sockio.h> |
| #include <netinet/in.h> |
| #include <netinet/ip6.h> |
| #include <netinet/icmp6.h> |
| #include <netinet/igmp_var.h> |
| #include <sys/policy.h> |
| #include <sys/ethernet.h> |
| #include <sys/callb.h> |
| #include <sys/md5.h> |
| |
| #include <inet/common.h> /* for various inet/mi.h and inet/nd.h needs */ |
| #include <inet/mi.h> |
| #include <inet/nd.h> |
| #include <inet/tunables.h> |
| #include <inet/arp.h> |
| #include <inet/ip_arp.h> |
| #include <inet/mib2.h> |
| #include <inet/ip.h> |
| #include <inet/ip6.h> |
| #include <inet/ip6_asp.h> |
| #include <inet/tcp.h> |
| #include <inet/ip_multi.h> |
| #include <inet/ip_ire.h> |
| #include <inet/ip_ftable.h> |
| #include <inet/ip_rts.h> |
| #include <inet/ip_ndp.h> |
| #include <inet/ip_if.h> |
| #include <inet/ip_impl.h> |
| #include <inet/sctp_ip.h> |
| #include <inet/ip_netinfo.h> |
| #include <inet/ilb_ip.h> |
| |
| #include <netinet/igmp.h> |
| #include <inet/ip_listutils.h> |
| #include <inet/ipclassifier.h> |
| #include <sys/mac_client.h> |
| #include <sys/dld.h> |
| #include <sys/mac_flow.h> |
| |
| #include <sys/systeminfo.h> |
| #include <sys/bootconf.h> |
| |
| #include <sys/tsol/tndb.h> |
| #include <sys/tsol/tnet.h> |
| |
| #include <inet/rawip_impl.h> /* needed for icmp_stack_t */ |
| #include <inet/udp_impl.h> /* needed for udp_stack_t */ |
| |
| /* The character which tells where the ill_name ends */ |
| #define IPIF_SEPARATOR_CHAR ':' |
| |
| /* IP ioctl function table entry */ |
typedef struct ipft_s {
	int	ipft_cmd;	/* ioctl command number */
	pfi_t	ipft_pfi;	/* ioctl handler function */
	int	ipft_min_size;	/* minimum size of the ioctl data */
	int	ipft_flags;	/* IPFT_F_* flags below */
} ipft_t;
| #define IPFT_F_NO_REPLY 0x1 /* IP ioctl does not expect any reply */ |
| #define IPFT_F_SELF_REPLY 0x2 /* ioctl callee does the ioctl reply */ |
| |
| static int nd_ill_forward_get(queue_t *, mblk_t *, caddr_t, cred_t *); |
| static int nd_ill_forward_set(queue_t *q, mblk_t *mp, |
| char *value, caddr_t cp, cred_t *ioc_cr); |
| |
| static boolean_t ill_is_quiescent(ill_t *); |
| static boolean_t ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask); |
| static ip_m_t *ip_m_lookup(t_uscalar_t mac_type); |
| static int ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, |
| mblk_t *mp, boolean_t need_up); |
| static int ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, |
| mblk_t *mp, boolean_t need_up); |
| static int ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid, |
| queue_t *q, mblk_t *mp, boolean_t need_up); |
| static int ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, |
| mblk_t *mp); |
| static int ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q, |
| mblk_t *mp); |
| static int ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t, in6_addr_t, |
| queue_t *q, mblk_t *mp, boolean_t need_up); |
| static int ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, |
| int ioccmd, struct linkblk *li); |
| static ipaddr_t ip_subnet_mask(ipaddr_t addr, ipif_t **, ip_stack_t *); |
| static void ip_wput_ioctl(queue_t *q, mblk_t *mp); |
| static void ipsq_flush(ill_t *ill); |
| |
| static int ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen, |
| queue_t *q, mblk_t *mp, boolean_t need_up); |
| static void ipsq_delete(ipsq_t *); |
| |
| static ipif_t *ipif_allocate(ill_t *ill, int id, uint_t ire_type, |
| boolean_t initialize, boolean_t insert, int *errorp); |
| static ire_t **ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep); |
| static void ipif_delete_bcast_ires(ipif_t *ipif); |
| static int ipif_add_ires_v4(ipif_t *, boolean_t); |
| static boolean_t ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif, |
| boolean_t isv6); |
| static int ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp); |
| static void ipif_free(ipif_t *ipif); |
| static void ipif_free_tail(ipif_t *ipif); |
| static void ipif_set_default(ipif_t *ipif); |
| static int ipif_set_values(queue_t *q, mblk_t *mp, |
| char *interf_name, uint_t *ppa); |
| static int ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp, |
| queue_t *q); |
| static ipif_t *ipif_lookup_on_name(char *name, size_t namelen, |
| boolean_t do_alloc, boolean_t *exists, boolean_t isv6, zoneid_t zoneid, |
| ip_stack_t *); |
| static ipif_t *ipif_lookup_on_name_async(char *name, size_t namelen, |
| boolean_t isv6, zoneid_t zoneid, queue_t *q, mblk_t *mp, ipsq_func_t func, |
| int *error, ip_stack_t *); |
| |
| static int ill_alloc_ppa(ill_if_t *, ill_t *); |
| static void ill_delete_interface_type(ill_if_t *); |
| static int ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q); |
| static void ill_dl_down(ill_t *ill); |
| static void ill_down(ill_t *ill); |
| static void ill_down_ipifs(ill_t *, boolean_t); |
| static void ill_free_mib(ill_t *ill); |
| static void ill_glist_delete(ill_t *); |
| static void ill_phyint_reinit(ill_t *ill); |
| static void ill_set_nce_router_flags(ill_t *, boolean_t); |
| static void ill_set_phys_addr_tail(ipsq_t *, queue_t *, mblk_t *, void *); |
| static void ill_replumb_tail(ipsq_t *, queue_t *, mblk_t *, void *); |
| |
| static ip_v6intfid_func_t ip_ether_v6intfid, ip_ib_v6intfid; |
| static ip_v6intfid_func_t ip_ipv4_v6intfid, ip_ipv6_v6intfid; |
| static ip_v6intfid_func_t ip_ipmp_v6intfid, ip_nodef_v6intfid; |
| static ip_v6intfid_func_t ip_ipv4_v6destintfid, ip_ipv6_v6destintfid; |
| static ip_v4mapinfo_func_t ip_ether_v4_mapping; |
| static ip_v6mapinfo_func_t ip_ether_v6_mapping; |
| static ip_v4mapinfo_func_t ip_ib_v4_mapping; |
| static ip_v6mapinfo_func_t ip_ib_v6_mapping; |
| static ip_v4mapinfo_func_t ip_mbcast_mapping; |
| static void ip_cgtp_bcast_add(ire_t *, ip_stack_t *); |
| static void ip_cgtp_bcast_delete(ire_t *, ip_stack_t *); |
| static void phyint_free(phyint_t *); |
| |
| static void ill_capability_dispatch(ill_t *, mblk_t *, dl_capability_sub_t *); |
| static void ill_capability_id_ack(ill_t *, mblk_t *, dl_capability_sub_t *); |
| static void ill_capability_vrrp_ack(ill_t *, mblk_t *, dl_capability_sub_t *); |
| static void ill_capability_hcksum_ack(ill_t *, mblk_t *, dl_capability_sub_t *); |
| static void ill_capability_hcksum_reset_fill(ill_t *, mblk_t *); |
| static void ill_capability_zerocopy_ack(ill_t *, mblk_t *, |
| dl_capability_sub_t *); |
| static void ill_capability_zerocopy_reset_fill(ill_t *, mblk_t *); |
| static void ill_capability_dld_reset_fill(ill_t *, mblk_t *); |
| static void ill_capability_dld_ack(ill_t *, mblk_t *, |
| dl_capability_sub_t *); |
| static void ill_capability_dld_enable(ill_t *); |
| static void ill_capability_ack_thr(void *); |
| static void ill_capability_lso_enable(ill_t *); |
| |
| static ill_t *ill_prev_usesrc(ill_t *); |
| static int ill_relink_usesrc_ills(ill_t *, ill_t *, uint_t); |
| static void ill_disband_usesrc_group(ill_t *); |
| static void ip_sioctl_garp_reply(mblk_t *, ill_t *, void *, int); |
| |
| #ifdef DEBUG |
| static void ill_trace_cleanup(const ill_t *); |
| static void ipif_trace_cleanup(const ipif_t *); |
| #endif |
| |
| static void ill_dlpi_clear_deferred(ill_t *ill); |
| |
| static void phyint_flags_init(phyint_t *, t_uscalar_t); |
| |
| /* |
 * If we go over the memory footprint limit more than once in this msec
| * interval, we'll start pruning aggressively. |
| */ |
| int ip_min_frag_prune_time = 0; |
| |
| static ipft_t ip_ioctl_ftbl[] = { |
| { IP_IOC_IRE_DELETE, ip_ire_delete, sizeof (ipid_t), 0 }, |
| { IP_IOC_IRE_DELETE_NO_REPLY, ip_ire_delete, sizeof (ipid_t), |
| IPFT_F_NO_REPLY }, |
| { IP_IOC_RTS_REQUEST, ip_rts_request, 0, IPFT_F_SELF_REPLY }, |
| { 0 } |
| }; |
| |
| /* Simple ICMP IP Header Template */ |
| static ipha_t icmp_ipha = { |
| IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP |
| }; |
| |
| static uchar_t ip_six_byte_all_ones[] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF }; |
| |
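/*
 * Media table: maps each DLPI mac type to its interface type, IPv4/IPv6
 * SAP values, and the address-mapping and interface-id functions used for
 * that medium.  ip_m_lookup() selects an entry by DLPI mac type.
 */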
| static ip_m_t ip_m_tbl[] = { |
| { DL_ETHER, IFT_ETHER, ETHERTYPE_IP, ETHERTYPE_IPV6, |
| ip_ether_v4_mapping, ip_ether_v6_mapping, ip_ether_v6intfid, |
| ip_nodef_v6intfid }, |
| { DL_CSMACD, IFT_ISO88023, ETHERTYPE_IP, ETHERTYPE_IPV6, |
| ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid, |
| ip_nodef_v6intfid }, |
| { DL_TPB, IFT_ISO88024, ETHERTYPE_IP, ETHERTYPE_IPV6, |
| ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid, |
| ip_nodef_v6intfid }, |
| { DL_TPR, IFT_ISO88025, ETHERTYPE_IP, ETHERTYPE_IPV6, |
| ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid, |
| ip_nodef_v6intfid }, |
| { DL_FDDI, IFT_FDDI, ETHERTYPE_IP, ETHERTYPE_IPV6, |
| ip_ether_v4_mapping, ip_ether_v6_mapping, ip_ether_v6intfid, |
| ip_nodef_v6intfid }, |
| { DL_IB, IFT_IB, ETHERTYPE_IP, ETHERTYPE_IPV6, |
| ip_ib_v4_mapping, ip_ib_v6_mapping, ip_ib_v6intfid, |
| ip_nodef_v6intfid }, |
| { DL_IPV4, IFT_IPV4, IPPROTO_ENCAP, IPPROTO_IPV6, |
| ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv4_v6intfid, |
| ip_ipv4_v6destintfid }, |
| { DL_IPV6, IFT_IPV6, IPPROTO_ENCAP, IPPROTO_IPV6, |
| ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv6_v6intfid, |
| ip_ipv6_v6destintfid }, |
| { DL_6TO4, IFT_6TO4, IPPROTO_ENCAP, IPPROTO_IPV6, |
| ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv4_v6intfid, |
| ip_nodef_v6intfid }, |
| { SUNW_DL_VNI, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6, |
| NULL, NULL, ip_nodef_v6intfid, ip_nodef_v6intfid }, |
| { SUNW_DL_IPMP, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6, |
| NULL, NULL, ip_ipmp_v6intfid, ip_nodef_v6intfid }, |
| { DL_OTHER, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6, |
| ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid, |
| ip_nodef_v6intfid } |
| }; |
| |
| char ipif_loopback_name[] = "lo0"; |
| |
| /* These are used by all IP network modules. */ |
| sin6_t sin6_null; /* Zero address for quick clears */ |
| sin_t sin_null; /* Zero address for quick clears */ |
| |
| /* When set search for unused ipif_seqid */ |
| static ipif_t ipif_zero; |
| |
| /* |
 * The ppa arena is created after this many
 * interfaces have been plumbed.
| */ |
uint_t	ill_no_arena = 12;	/* Settable in /etc/system */
| |
| /* |
 * Allocate per-interface MIBs.
 * Returns B_TRUE on success, B_FALSE otherwise.
 * The ipsq may not yet be allocated (loopback case).
| */ |
| static boolean_t |
| ill_allocate_mibs(ill_t *ill) |
| { |
| /* Already allocated? */ |
| if (ill->ill_ip_mib != NULL) { |
| if (ill->ill_isv6) |
| ASSERT(ill->ill_icmp6_mib != NULL); |
| return (B_TRUE); |
| } |
| |
| ill->ill_ip_mib = kmem_zalloc(sizeof (*ill->ill_ip_mib), |
| KM_NOSLEEP); |
| if (ill->ill_ip_mib == NULL) { |
| return (B_FALSE); |
| } |
| |
| /* Setup static information */ |
| SET_MIB(ill->ill_ip_mib->ipIfStatsEntrySize, |
| sizeof (mib2_ipIfStatsEntry_t)); |
| if (ill->ill_isv6) { |
| ill->ill_ip_mib->ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv6; |
| SET_MIB(ill->ill_ip_mib->ipIfStatsAddrEntrySize, |
| sizeof (mib2_ipv6AddrEntry_t)); |
| SET_MIB(ill->ill_ip_mib->ipIfStatsRouteEntrySize, |
| sizeof (mib2_ipv6RouteEntry_t)); |
| SET_MIB(ill->ill_ip_mib->ipIfStatsNetToMediaEntrySize, |
| sizeof (mib2_ipv6NetToMediaEntry_t)); |
| SET_MIB(ill->ill_ip_mib->ipIfStatsMemberEntrySize, |
| sizeof (ipv6_member_t)); |
| SET_MIB(ill->ill_ip_mib->ipIfStatsGroupSourceEntrySize, |
| sizeof (ipv6_grpsrc_t)); |
| } else { |
| ill->ill_ip_mib->ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv4; |
| SET_MIB(ill->ill_ip_mib->ipIfStatsAddrEntrySize, |
| sizeof (mib2_ipAddrEntry_t)); |
| SET_MIB(ill->ill_ip_mib->ipIfStatsRouteEntrySize, |
| sizeof (mib2_ipRouteEntry_t)); |
| SET_MIB(ill->ill_ip_mib->ipIfStatsNetToMediaEntrySize, |
| sizeof (mib2_ipNetToMediaEntry_t)); |
| SET_MIB(ill->ill_ip_mib->ipIfStatsMemberEntrySize, |
| sizeof (ip_member_t)); |
| SET_MIB(ill->ill_ip_mib->ipIfStatsGroupSourceEntrySize, |
| sizeof (ip_grpsrc_t)); |
| |
| /* |
| * For a v4 ill, we are done at this point, because per ill |
| * icmp mibs are only used for v6. |
| */ |
| return (B_TRUE); |
| } |
| |
| ill->ill_icmp6_mib = kmem_zalloc(sizeof (*ill->ill_icmp6_mib), |
| KM_NOSLEEP); |
| if (ill->ill_icmp6_mib == NULL) { |
| kmem_free(ill->ill_ip_mib, sizeof (*ill->ill_ip_mib)); |
| ill->ill_ip_mib = NULL; |
| return (B_FALSE); |
| } |
| /* static icmp info */ |
| ill->ill_icmp6_mib->ipv6IfIcmpEntrySize = |
| sizeof (mib2_ipv6IfIcmpEntry_t); |
| /* |
| * The ipIfStatsIfindex and ipv6IfIcmpIndex will be assigned later |
| * after the phyint merge occurs in ipif_set_values -> ill_glist_insert |
| * -> ill_phyint_reinit |
| */ |
| return (B_TRUE); |
| } |
| |
| /* |
| * Completely vaporize a lower level tap and all associated interfaces. |
| * ill_delete is called only out of ip_close when the device control |
| * stream is being closed. |
| */ |
| void |
| ill_delete(ill_t *ill) |
| { |
| ipif_t *ipif; |
| ill_t *prev_ill; |
| ip_stack_t *ipst = ill->ill_ipst; |
| |
| /* |
| * ill_delete may be forcibly entering the ipsq. The previous |
| * ioctl may not have completed and may need to be aborted. |
	 * ipsq_flush takes care of it. If we don't need to enter the
	 * ipsq forcibly, the 2nd invocation of ipsq_flush in
| * ill_delete_tail is sufficient. |
| */ |
| ipsq_flush(ill); |
| |
| /* |
| * Nuke all interfaces. ipif_free will take down the interface, |
| * remove it from the list, and free the data structure. |
| * Walk down the ipif list and remove the logical interfaces |
| * first before removing the main ipif. We can't unplumb |
| * zeroth interface first in the case of IPv6 as update_conn_ill |
| * -> ip_ll_multireq de-references ill_ipif for checking |
| * POINTOPOINT. |
| * |
	 * If ill_ipif was not properly initialized (i.e., low on memory),
	 * then there are no interfaces to clean up; in that case just clean
	 * up the ill.
| */ |
| for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) |
| ipif_free(ipif); |
| |
| /* |
| * clean out all the nce_t entries that depend on this |
| * ill for the ill_phys_addr. |
| */ |
| nce_flush(ill, B_TRUE); |
| |
| /* Clean up msgs on pending upcalls for mrouted */ |
| reset_mrt_ill(ill); |
| |
| update_conn_ill(ill, ipst); |
| |
| /* |
| * Remove multicast references added as a result of calls to |
| * ip_join_allmulti(). |
| */ |
| ip_purge_allmulti(ill); |
| |
| /* |
| * If the ill being deleted is under IPMP, boot it out of the illgrp. |
| */ |
| if (IS_UNDER_IPMP(ill)) |
| ipmp_ill_leave_illgrp(ill); |
| |
| /* |
| * ill_down will arrange to blow off any IRE's dependent on this |
| * ILL, and shut down fragmentation reassembly. |
| */ |
| ill_down(ill); |
| |
| /* Let SCTP know, so that it can remove this from its list. */ |
| sctp_update_ill(ill, SCTP_ILL_REMOVE); |
| |
| /* |
| * Walk all CONNs that can have a reference on an ire or nce for this |
| * ill (we actually walk all that now have stale references). |
| */ |
| ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ipst); |
| |
| /* With IPv6 we have dce_ifindex. Cleanup for neatness */ |
| if (ill->ill_isv6) |
| dce_cleanup(ill->ill_phyint->phyint_ifindex, ipst); |
| |
| /* |
| * If an address on this ILL is being used as a source address then |
| * clear out the pointers in other ILLs that point to this ILL. |
| */ |
| rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER); |
| if (ill->ill_usesrc_grp_next != NULL) { |
| if (ill->ill_usesrc_ifindex == 0) { /* usesrc ILL ? */ |
| ill_disband_usesrc_group(ill); |
| } else { /* consumer of the usesrc ILL */ |
| prev_ill = ill_prev_usesrc(ill); |
| prev_ill->ill_usesrc_grp_next = |
| ill->ill_usesrc_grp_next; |
| } |
| } |
| rw_exit(&ipst->ips_ill_g_usesrc_lock); |
| } |
| |
| static void |
| ipif_non_duplicate(ipif_t *ipif) |
| { |
| ill_t *ill = ipif->ipif_ill; |
| mutex_enter(&ill->ill_lock); |
| if (ipif->ipif_flags & IPIF_DUPLICATE) { |
| ipif->ipif_flags &= ~IPIF_DUPLICATE; |
| ASSERT(ill->ill_ipif_dup_count > 0); |
| ill->ill_ipif_dup_count--; |
| } |
| mutex_exit(&ill->ill_lock); |
| } |
| |
| /* |
| * ill_delete_tail is called from ip_modclose after all references |
| * to the closing ill are gone. The wait is done in ip_modclose |
| */ |
| void |
| ill_delete_tail(ill_t *ill) |
| { |
| mblk_t **mpp; |
| ipif_t *ipif; |
| ip_stack_t *ipst = ill->ill_ipst; |
| |
| for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { |
| ipif_non_duplicate(ipif); |
| (void) ipif_down_tail(ipif); |
| } |
| |
| ASSERT(ill->ill_ipif_dup_count == 0); |
| |
| /* |
| * If polling capability is enabled (which signifies direct |
| * upcall into IP and driver has ill saved as a handle), |
| * we need to make sure that unbind has completed before we |
| * let the ill disappear and driver no longer has any reference |
| * to this ill. |
| */ |
| mutex_enter(&ill->ill_lock); |
| while (ill->ill_state_flags & ILL_DL_UNBIND_IN_PROGRESS) |
| cv_wait(&ill->ill_cv, &ill->ill_lock); |
| mutex_exit(&ill->ill_lock); |
| ASSERT(!(ill->ill_capabilities & |
| (ILL_CAPAB_DLD | ILL_CAPAB_DLD_POLL | ILL_CAPAB_DLD_DIRECT))); |
| |
| if (ill->ill_net_type != IRE_LOOPBACK) |
| qprocsoff(ill->ill_rq); |
| |
| /* |
	 * We do an ipsq_flush once again now. New messages could have
	 * arrived from below (M_ERROR or M_HANGUP). Similarly, ioctls
	 * could also have arrived if an ioctl thread had looked up
	 * the ill before we set the ILL_CONDEMNED flag, but had not yet
	 * enqueued the ioctl when we did the last ipsq_flush.
| */ |
| ipsq_flush(ill); |
| |
| /* |
| * Free capabilities. |
| */ |
| if (ill->ill_hcksum_capab != NULL) { |
| kmem_free(ill->ill_hcksum_capab, sizeof (ill_hcksum_capab_t)); |
| ill->ill_hcksum_capab = NULL; |
| } |
| |
| if (ill->ill_zerocopy_capab != NULL) { |
| kmem_free(ill->ill_zerocopy_capab, |
| sizeof (ill_zerocopy_capab_t)); |
| ill->ill_zerocopy_capab = NULL; |
| } |
| |
| if (ill->ill_lso_capab != NULL) { |
| kmem_free(ill->ill_lso_capab, sizeof (ill_lso_capab_t)); |
| ill->ill_lso_capab = NULL; |
| } |
| |
| if (ill->ill_dld_capab != NULL) { |
| kmem_free(ill->ill_dld_capab, sizeof (ill_dld_capab_t)); |
| ill->ill_dld_capab = NULL; |
| } |
| |
| /* Clean up ill_allowed_ips* related state */ |
| if (ill->ill_allowed_ips != NULL) { |
| ASSERT(ill->ill_allowed_ips_cnt > 0); |
| kmem_free(ill->ill_allowed_ips, |
| ill->ill_allowed_ips_cnt * sizeof (in6_addr_t)); |
| ill->ill_allowed_ips = NULL; |
| ill->ill_allowed_ips_cnt = 0; |
| } |
| |
| while (ill->ill_ipif != NULL) |
| ipif_free_tail(ill->ill_ipif); |
| |
| /* |
	 * We have removed all references to ilm from conns and the ones
	 * joined within the kernel.
	 *
	 * We don't walk conns, mrts and ires because
	 *
	 * 1) update_conn_ill and reset_mrt_ill clean up conns and mrts.
	 * 2) ill_down -> ill_downi walks all the ires and cleans up
	 *    ill references.
| */ |
| |
| /* |
| * If this ill is an IPMP meta-interface, blow away the illgrp. This |
| * is safe to do because the illgrp has already been unlinked from the |
| * group by I_PUNLINK, and thus SIOCSLIFGROUPNAME cannot find it. |
| */ |
| if (IS_IPMP(ill)) { |
| ipmp_illgrp_destroy(ill->ill_grp); |
| ill->ill_grp = NULL; |
| } |
| |
| if (ill->ill_mphysaddr_list != NULL) { |
| multiphysaddr_t *mpa, *tmpa; |
| |
| mpa = ill->ill_mphysaddr_list; |
| ill->ill_mphysaddr_list = NULL; |
| while (mpa) { |
| tmpa = mpa->mpa_next; |
| kmem_free(mpa, sizeof (*mpa)); |
| mpa = tmpa; |
| } |
| } |
| /* |
| * Take us out of the list of ILLs. ill_glist_delete -> phyint_free |
| * could free the phyint. No more reference to the phyint after this |
| * point. |
| */ |
| (void) ill_glist_delete(ill); |
| |
| if (ill->ill_frag_ptr != NULL) { |
| uint_t count; |
| |
| for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) { |
| mutex_destroy(&ill->ill_frag_hash_tbl[count].ipfb_lock); |
| } |
| mi_free(ill->ill_frag_ptr); |
| ill->ill_frag_ptr = NULL; |
| ill->ill_frag_hash_tbl = NULL; |
| } |
| |
| freemsg(ill->ill_nd_lla_mp); |
| /* Free all retained control messages. */ |
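	/*
	 * The fields from ill_first_mp_to_free through ill_last_mp_to_free
	 * are treated as a contiguous array of mblk pointers; walk each slot
	 * and free any chain queued there.
	 */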
| mpp = &ill->ill_first_mp_to_free; |
| do { |
| while (mpp[0]) { |
| mblk_t *mp; |
| mblk_t *mp1; |
| |
| mp = mpp[0]; |
| mpp[0] = mp->b_next; |
| for (mp1 = mp; mp1 != NULL; mp1 = mp1->b_cont) { |
| mp1->b_next = NULL; |
| mp1->b_prev = NULL; |
| } |
| freemsg(mp); |
| } |
| } while (mpp++ != &ill->ill_last_mp_to_free); |
| |
| ill_free_mib(ill); |
| |
| #ifdef DEBUG |
| ill_trace_cleanup(ill); |
| #endif |
| |
| /* The default multicast interface might have changed */ |
| ire_increment_multicast_generation(ipst, ill->ill_isv6); |
| |
| /* Drop refcnt here */ |
| netstack_rele(ill->ill_ipst->ips_netstack); |
| ill->ill_ipst = NULL; |
| } |
| |
| static void |
| ill_free_mib(ill_t *ill) |
| { |
| ip_stack_t *ipst = ill->ill_ipst; |
| |
| /* |
| * MIB statistics must not be lost, so when an interface |
| * goes away the counter values will be added to the global |
| * MIBs. |
| */ |
| if (ill->ill_ip_mib != NULL) { |
| if (ill->ill_isv6) { |
| ip_mib2_add_ip_stats(&ipst->ips_ip6_mib, |
| ill->ill_ip_mib); |
| } else { |
| ip_mib2_add_ip_stats(&ipst->ips_ip_mib, |
| ill->ill_ip_mib); |
| } |
| |
| kmem_free(ill->ill_ip_mib, sizeof (*ill->ill_ip_mib)); |
| ill->ill_ip_mib = NULL; |
| } |
| if (ill->ill_icmp6_mib != NULL) { |
| ip_mib2_add_icmp6_stats(&ipst->ips_icmp6_mib, |
| ill->ill_icmp6_mib); |
| kmem_free(ill->ill_icmp6_mib, sizeof (*ill->ill_icmp6_mib)); |
| ill->ill_icmp6_mib = NULL; |
| } |
| } |
| |
| /* |
| * Concatenate together a physical address and a sap. |
| * |
| * Sap_lengths are interpreted as follows: |
| * sap_length == 0 ==> no sap |
| * sap_length > 0 ==> sap is at the head of the dlpi address |
| * sap_length < 0 ==> sap is at the tail of the dlpi address |
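 *
 * For example, with a 6-byte physical address and sap_length == -2 the
 * resulting layout is <6 bytes phys><2 bytes sap>, while sap_length == 2
 * yields <2 bytes sap><6 bytes phys>.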
| */ |
| static void |
| ill_dlur_copy_address(uchar_t *phys_src, uint_t phys_length, |
| t_scalar_t sap_src, t_scalar_t sap_length, uchar_t *dst) |
| { |
| uint16_t sap_addr = (uint16_t)sap_src; |
| |
| if (sap_length == 0) { |
| if (phys_src == NULL) |
| bzero(dst, phys_length); |
| else |
| bcopy(phys_src, dst, phys_length); |
| } else if (sap_length < 0) { |
| if (phys_src == NULL) |
| bzero(dst, phys_length); |
| else |
| bcopy(phys_src, dst, phys_length); |
| bcopy(&sap_addr, (char *)dst + phys_length, sizeof (sap_addr)); |
| } else { |
| bcopy(&sap_addr, dst, sizeof (sap_addr)); |
| if (phys_src == NULL) |
| bzero((char *)dst + sap_length, phys_length); |
| else |
| bcopy(phys_src, (char *)dst + sap_length, phys_length); |
| } |
| } |
| |
| /* |
| * Generate a dl_unitdata_req mblk for the device and address given. |
| * addr_length is the length of the physical portion of the address. |
 * If addr is NULL, an all-zero address of the specified length is used.
 * The total DLPI address length in the generated request is addr_length
 * plus the absolute value of sap_length.
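 *
 * For example (a sketch, not a specific call site): an all-ones Ethernet
 * broadcast header template could be built with
 * ill_dlur_gen(ip_six_byte_all_ones, ETHERADDRL, ETHERTYPE_IP, -2),
 * assuming the usual Ethernet sap_length of -2.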
| */ |
| mblk_t * |
| ill_dlur_gen(uchar_t *addr, uint_t addr_length, t_uscalar_t sap, |
| t_scalar_t sap_length) |
| { |
| dl_unitdata_req_t *dlur; |
| mblk_t *mp; |
| t_scalar_t abs_sap_length; /* absolute value */ |
| |
| abs_sap_length = ABS(sap_length); |
| mp = ip_dlpi_alloc(sizeof (*dlur) + addr_length + abs_sap_length, |
| DL_UNITDATA_REQ); |
| if (mp == NULL) |
| return (NULL); |
| dlur = (dl_unitdata_req_t *)mp->b_rptr; |
	/* HACK: accommodate incompatible DLPI drivers */
| if (addr_length == 8) |
| addr_length = 6; |
| dlur->dl_dest_addr_length = addr_length + abs_sap_length; |
| dlur->dl_dest_addr_offset = sizeof (*dlur); |
| dlur->dl_priority.dl_min = 0; |
| dlur->dl_priority.dl_max = 0; |
| ill_dlur_copy_address(addr, addr_length, sap, sap_length, |
| (uchar_t *)&dlur[1]); |
| return (mp); |
| } |
| |
| /* |
| * Add the pending mp to the list. There can be only 1 pending mp |
| * in the list. Any exclusive ioctl that needs to wait for a response |
| * from another module or driver needs to use this function to set |
| * the ipx_pending_mp to the ioctl mblk and wait for the response from |
| * the other module/driver. This is also used while waiting for the |
| * ipif/ill/ire refcnts to drop to zero in bringing down an ipif. |
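 * The queued mp is later retrieved via ipsq_pending_mp_get() or cleaned up
 * via ipsq_pending_mp_cleanup().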
| */ |
| boolean_t |
| ipsq_pending_mp_add(conn_t *connp, ipif_t *ipif, queue_t *q, mblk_t *add_mp, |
| int waitfor) |
| { |
| ipxop_t *ipx = ipif->ipif_ill->ill_phyint->phyint_ipsq->ipsq_xop; |
| |
| ASSERT(IAM_WRITER_IPIF(ipif)); |
| ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); |
| ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL)); |
| ASSERT(ipx->ipx_pending_mp == NULL); |
| /* |
| * The caller may be using a different ipif than the one passed into |
| * ipsq_current_start() (e.g., suppose an ioctl that came in on the V4 |
| * ill needs to wait for the V6 ill to quiesce). So we can't ASSERT |
| * that `ipx_current_ipif == ipif'. |
| */ |
| ASSERT(ipx->ipx_current_ipif != NULL); |
| |
| /* |
| * M_IOCDATA from ioctls, M_ERROR/M_HANGUP/M_PROTO/M_PCPROTO from the |
| * driver. |
| */ |
| ASSERT((DB_TYPE(add_mp) == M_IOCDATA) || (DB_TYPE(add_mp) == M_ERROR) || |
| (DB_TYPE(add_mp) == M_HANGUP) || (DB_TYPE(add_mp) == M_PROTO) || |
| (DB_TYPE(add_mp) == M_PCPROTO)); |
| |
| if (connp != NULL) { |
| ASSERT(MUTEX_HELD(&connp->conn_lock)); |
| /* |
| * Return error if the conn has started closing. The conn |
		 * could have finished cleaning up the pending mp list.
		 * If so, we should not add another mp to the list,
		 * negating the cleanup.
| */ |
| if (connp->conn_state_flags & CONN_CLOSING) |
| return (B_FALSE); |
| } |
| mutex_enter(&ipx->ipx_lock); |
| ipx->ipx_pending_ipif = ipif; |
| /* |
| * Note down the queue in b_queue. This will be returned by |
	 * ipsq_pending_mp_get. The caller will then use these values to
	 * restart the processing.
| */ |
| add_mp->b_next = NULL; |
| add_mp->b_queue = q; |
| ipx->ipx_pending_mp = add_mp; |
| ipx->ipx_waitfor = waitfor; |
| mutex_exit(&ipx->ipx_lock); |
| |
| if (connp != NULL) |
| connp->conn_oper_pending_ill = ipif->ipif_ill; |
| |
| return (B_TRUE); |
| } |
| |
| /* |
| * Retrieve the ipx_pending_mp and return it. There can be only 1 mp |
| * queued in the list. |
| */ |
| mblk_t * |
| ipsq_pending_mp_get(ipsq_t *ipsq, conn_t **connpp) |
| { |
| mblk_t *curr = NULL; |
| ipxop_t *ipx = ipsq->ipsq_xop; |
| |
| *connpp = NULL; |
| mutex_enter(&ipx->ipx_lock); |
| if (ipx->ipx_pending_mp == NULL) { |
| mutex_exit(&ipx->ipx_lock); |
| return (NULL); |
| } |
| |
| /* There can be only 1 such excl message */ |
| curr = ipx->ipx_pending_mp; |
| ASSERT(curr->b_next == NULL); |
| ipx->ipx_pending_ipif = NULL; |
| ipx->ipx_pending_mp = NULL; |
| ipx->ipx_waitfor = 0; |
| mutex_exit(&ipx->ipx_lock); |
| |
| if (CONN_Q(curr->b_queue)) { |
| /* |
| * This mp did a refhold on the conn, at the start of the ioctl. |
| * So we can safely return a pointer to the conn to the caller. |
| */ |
| *connpp = Q_TO_CONN(curr->b_queue); |
| } else { |
| *connpp = NULL; |
| } |
| curr->b_next = NULL; |
| curr->b_prev = NULL; |
| return (curr); |
| } |
| |
| /* |
| * Cleanup the ioctl mp queued in ipx_pending_mp |
| * - Called in the ill_delete path |
| * - Called in the M_ERROR or M_HANGUP path on the ill. |
| * - Called in the conn close path. |
| * |
| * Returns success on finding the pending mblk associated with the ioctl or |
| * exclusive operation in progress, failure otherwise. |
| */ |
| boolean_t |
| ipsq_pending_mp_cleanup(ill_t *ill, conn_t *connp) |
| { |
| mblk_t *mp; |
| ipxop_t *ipx; |
| queue_t *q; |
| ipif_t *ipif; |
| int cmd; |
| |
| ASSERT(IAM_WRITER_ILL(ill)); |
| ipx = ill->ill_phyint->phyint_ipsq->ipsq_xop; |
| |
| mutex_enter(&ipx->ipx_lock); |
| mp = ipx->ipx_pending_mp; |
| if (connp != NULL) { |
| if (mp == NULL || mp->b_queue != CONNP_TO_WQ(connp)) { |
| /* |
| * Nothing to clean since the conn that is closing |
| * does not have a matching pending mblk in |
| * ipx_pending_mp. |
| */ |
| mutex_exit(&ipx->ipx_lock); |
| return (B_FALSE); |
| } |
| } else { |
| /* |
| * A non-zero ill_error signifies we are called in the |
| * M_ERROR or M_HANGUP path and we need to unconditionally |
| * abort any current ioctl and do the corresponding cleanup. |
| * A zero ill_error means we are in the ill_delete path and |
| * we do the cleanup only if there is a pending mp. |
| */ |
| if (mp == NULL && ill->ill_error == 0) { |
| mutex_exit(&ipx->ipx_lock); |
| return (B_FALSE); |
| } |
| } |
| |
| /* Now remove from the ipx_pending_mp */ |
| ipx->ipx_pending_mp = NULL; |
| ipif = ipx->ipx_pending_ipif; |
| ipx->ipx_pending_ipif = NULL; |
| ipx->ipx_waitfor = 0; |
| ipx->ipx_current_ipif = NULL; |
| cmd = ipx->ipx_current_ioctl; |
| ipx->ipx_current_ioctl = 0; |
| ipx->ipx_current_done = B_TRUE; |
| mutex_exit(&ipx->ipx_lock); |
| |
| if (mp == NULL) |
| return (B_FALSE); |
| |
| q = mp->b_queue; |
| mp->b_next = NULL; |
| mp->b_prev = NULL; |
| mp->b_queue = NULL; |
| |
| if (DB_TYPE(mp) == M_IOCTL || DB_TYPE(mp) == M_IOCDATA) { |
| DTRACE_PROBE4(ipif__ioctl, |
| char *, "ipsq_pending_mp_cleanup", |
| int, cmd, ill_t *, ipif == NULL ? NULL : ipif->ipif_ill, |
| ipif_t *, ipif); |
| if (connp == NULL) { |
| ip_ioctl_finish(q, mp, ENXIO, NO_COPYOUT, NULL); |
| } else { |
| ip_ioctl_finish(q, mp, ENXIO, CONN_CLOSE, NULL); |
| mutex_enter(&ipif->ipif_ill->ill_lock); |
| ipif->ipif_state_flags &= ~IPIF_CHANGING; |
| mutex_exit(&ipif->ipif_ill->ill_lock); |
| } |
| } else { |
| inet_freemsg(mp); |
| } |
| return (B_TRUE); |
| } |
| |
| /* |
| * Called in the conn close path and ill delete path |
| */ |
| static void |
| ipsq_xopq_mp_cleanup(ill_t *ill, conn_t *connp) |
| { |
| ipsq_t *ipsq; |
| mblk_t *prev; |
| mblk_t *curr; |
| mblk_t *next; |
| queue_t *wq, *rq = NULL; |
| mblk_t *tmp_list = NULL; |
| |
| ASSERT(IAM_WRITER_ILL(ill)); |
| if (connp != NULL) |
| wq = CONNP_TO_WQ(connp); |
| else |
| wq = ill->ill_wq; |
| |
| /* |
| * In the case of lo0 being unplumbed, ill_wq will be NULL. Guard |
| * against this here. |
| */ |
| if (wq != NULL) |
| rq = RD(wq); |
| |
| ipsq = ill->ill_phyint->phyint_ipsq; |
| /* |
| * Cleanup the ioctl mp's queued in ipsq_xopq_pending_mp if any. |
| * In the case of ioctl from a conn, there can be only 1 mp |
| * queued on the ipsq. If an ill is being unplumbed flush all |
| * the messages. |
| */ |
| mutex_enter(&ipsq->ipsq_lock); |
| for (prev = NULL, curr = ipsq->ipsq_xopq_mphead; curr != NULL; |
| curr = next) { |
| next = curr->b_next; |
| if (connp == NULL || |
| (curr->b_queue == wq || curr->b_queue == rq)) { |
| /* Unlink the mblk from the pending mp list */ |
| if (prev != NULL) { |
| prev->b_next = curr->b_next; |
| } else { |
| ASSERT(ipsq->ipsq_xopq_mphead == curr); |
| ipsq->ipsq_xopq_mphead = curr->b_next; |
| } |
| if (ipsq->ipsq_xopq_mptail == curr) |
| ipsq->ipsq_xopq_mptail = prev; |
| /* |
			 * Create a temporary list and release the ipsq lock.
			 * New elements are added to the head of the tmp_list.
| */ |
| curr->b_next = tmp_list; |
| tmp_list = curr; |
| } else { |
| prev = curr; |
| } |
| } |
| mutex_exit(&ipsq->ipsq_lock); |
| |
| while (tmp_list != NULL) { |
| curr = tmp_list; |
| tmp_list = curr->b_next; |
| curr->b_next = NULL; |
| curr->b_prev = NULL; |
| wq = curr->b_queue; |
| curr->b_queue = NULL; |
| if (DB_TYPE(curr) == M_IOCTL || DB_TYPE(curr) == M_IOCDATA) { |
| DTRACE_PROBE4(ipif__ioctl, |
| char *, "ipsq_xopq_mp_cleanup", |
| int, 0, ill_t *, NULL, ipif_t *, NULL); |
| ip_ioctl_finish(wq, curr, ENXIO, connp != NULL ? |
| CONN_CLOSE : NO_COPYOUT, NULL); |
| } else { |
| /* |
| * IP-MT XXX In the case of TLI/XTI bind / optmgmt |
			 * this can't be just inet_freemsg. We have to
			 * restart it, otherwise the thread will be stuck.
| */ |
| inet_freemsg(curr); |
| } |
| } |
| } |
| |
| /* |
| * This conn has started closing. Cleanup any pending ioctl from this conn. |
| * STREAMS ensures that there can be at most 1 active ioctl on a stream. |
| */ |
| void |
| conn_ioctl_cleanup(conn_t *connp) |
| { |
| ipsq_t *ipsq; |
| ill_t *ill; |
| boolean_t refheld; |
| |
| /* |
| * Check for a queued ioctl. If the ioctl has not yet started, the mp |
| * is pending in the list headed by ipsq_xopq_head. If the ioctl has |
| * started the mp could be present in ipx_pending_mp. Note that if |
| * conn_oper_pending_ill is NULL, the ioctl may still be in flight and |
| * not yet queued anywhere. In this case, the conn close code will wait |
| * until the conn_ref is dropped. If the stream was a tcp stream, then |
| * tcp_close will wait first until all ioctls have completed for this |
| * conn. |
| */ |
| mutex_enter(&connp->conn_lock); |
| ill = connp->conn_oper_pending_ill; |
| if (ill == NULL) { |
| mutex_exit(&connp->conn_lock); |
| return; |
| } |
| |
| /* |
| * We may not be able to refhold the ill if the ill/ipif |
| * is changing. But we need to make sure that the ill will |
| * not vanish. So we just bump up the ill_waiter count. |
| */ |
| refheld = ill_waiter_inc(ill); |
| mutex_exit(&connp->conn_lock); |
| if (refheld) { |
| if (ipsq_enter(ill, B_TRUE, NEW_OP)) { |
| ill_waiter_dcr(ill); |
| /* |
| * Check whether this ioctl has started and is |
| * pending. If it is not found there then check |
| * whether this ioctl has not even started and is in |
| * the ipsq_xopq list. |
| */ |
| if (!ipsq_pending_mp_cleanup(ill, connp)) |
| ipsq_xopq_mp_cleanup(ill, connp); |
| ipsq = ill->ill_phyint->phyint_ipsq; |
| ipsq_exit(ipsq); |
| return; |
| } |
| } |
| |
| /* |
| * The ill is also closing and we could not bump up the |
| * ill_waiter_count or we could not enter the ipsq. Leave |
| * the cleanup to ill_delete |
| */ |
| mutex_enter(&connp->conn_lock); |
| while (connp->conn_oper_pending_ill != NULL) |
| cv_wait(&connp->conn_refcv, &connp->conn_lock); |
| mutex_exit(&connp->conn_lock); |
| if (refheld) |
| ill_waiter_dcr(ill); |
| } |
| |
| /* |
| * ipcl_walk function for cleaning up conn_*_ill fields. |
| * Note that we leave ixa_multicast_ifindex, conn_incoming_ifindex, and |
| * conn_bound_if in place. We prefer dropping |
| * packets instead of sending them out the wrong interface, or accepting |
| * packets from the wrong ifindex. |
| */ |
| static void |
| conn_cleanup_ill(conn_t *connp, caddr_t arg) |
| { |
| ill_t *ill = (ill_t *)arg; |
| |
| mutex_enter(&connp->conn_lock); |
| if (connp->conn_dhcpinit_ill == ill) { |
| connp->conn_dhcpinit_ill = NULL; |
| ASSERT(ill->ill_dhcpinit != 0); |
| atomic_dec_32(&ill->ill_dhcpinit); |
| ill_set_inputfn(ill); |
| } |
| mutex_exit(&connp->conn_lock); |
| } |
| |
| static int |
| ill_down_ipifs_tail(ill_t *ill) |
| { |
| ipif_t *ipif; |
| int err; |
| |
| ASSERT(IAM_WRITER_ILL(ill)); |
| for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { |
| ipif_non_duplicate(ipif); |
| /* |
| * ipif_down_tail will call arp_ll_down on the last ipif |
| * and typically return EINPROGRESS when the DL_UNBIND is sent. |
| */ |
| if ((err = ipif_down_tail(ipif)) != 0) |
| return (err); |
| } |
| return (0); |
| } |
| |
| /* ARGSUSED */ |
| void |
| ipif_all_down_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) |
| { |
| ASSERT(IAM_WRITER_IPSQ(ipsq)); |
| (void) ill_down_ipifs_tail(q->q_ptr); |
| freemsg(mp); |
| ipsq_current_finish(ipsq); |
| } |
| |
| /* |
 * ill_down_start is called when we want to down this ill and bring it up
 * again. It is called when we receive an M_ERROR / M_HANGUP. In this case
 * we shut down all interfaces, but don't tear down any plumbing.
| */ |
| boolean_t |
| ill_down_start(queue_t *q, mblk_t *mp) |
| { |
| ill_t *ill = q->q_ptr; |
| ipif_t *ipif; |
| |
| ASSERT(IAM_WRITER_ILL(ill)); |
| /* |
| * It is possible that some ioctl is already in progress while we |
| * received the M_ERROR / M_HANGUP in which case, we need to abort |
| * the ioctl. ill_down_start() is being processed as CUR_OP rather |
| * than as NEW_OP since the cause of the M_ERROR / M_HANGUP may prevent |
| * the in progress ioctl from ever completing. |
| * |
| * The thread that started the ioctl (if any) must have returned, |
| * since we are now executing as writer. After the 2 calls below, |
| * the state of the ipsq and the ill would reflect no trace of any |
| * pending operation. Subsequently if there is any response to the |
| * original ioctl from the driver, it would be discarded as an |
| * unsolicited message from the driver. |
| */ |
| (void) ipsq_pending_mp_cleanup(ill, NULL); |
| ill_dlpi_clear_deferred(ill); |
| |
| for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) |
| (void) ipif_down(ipif, NULL, NULL); |
| |
| ill_down(ill); |
| |
| /* |
| * Walk all CONNs that can have a reference on an ire or nce for this |
| * ill (we actually walk all that now have stale references). |
| */ |
| ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ill->ill_ipst); |
| |
| /* With IPv6 we have dce_ifindex. Cleanup for neatness */ |
| if (ill->ill_isv6) |
| dce_cleanup(ill->ill_phyint->phyint_ifindex, ill->ill_ipst); |
| |
| ipsq_current_start(ill->ill_phyint->phyint_ipsq, ill->ill_ipif, 0); |
| |
| /* |
| * Atomically test and add the pending mp if references are active. |
| */ |
| mutex_enter(&ill->ill_lock); |
| if (!ill_is_quiescent(ill)) { |
| /* call cannot fail since `conn_t *' argument is NULL */ |
| (void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq, |
| mp, ILL_DOWN); |
| mutex_exit(&ill->ill_lock); |
| return (B_FALSE); |
| } |
| mutex_exit(&ill->ill_lock); |
| return (B_TRUE); |
| } |
| |
| static void |
| ill_down(ill_t *ill) |
| { |
| mblk_t *mp; |
| ip_stack_t *ipst = ill->ill_ipst; |
| |
| /* |
| * Blow off any IREs dependent on this ILL. |
| * The caller needs to handle conn_ixa_cleanup |
| */ |
| ill_delete_ires(ill); |
| |
| ire_walk_ill(0, 0, ill_downi, ill, ill); |
| |
| /* Remove any conn_*_ill depending on this ill */ |
| ipcl_walk(conn_cleanup_ill, (caddr_t)ill, ipst); |
| |
| /* |
| * Free state for additional IREs. |
| */ |
| mutex_enter(&ill->ill_saved_ire_lock); |
| mp = ill->ill_saved_ire_mp; |
| ill->ill_saved_ire_mp = NULL; |
| ill->ill_saved_ire_cnt = 0; |
| mutex_exit(&ill->ill_saved_ire_lock); |
| freemsg(mp); |
| } |
| |
| /* |
| * ire_walk routine used to delete every IRE that depends on |
| * 'ill'. (Always called as writer, and may only be called from ire_walk.) |
| * |
| * Note: since the routes added by the kernel are deleted separately, |
| * this will only be 1) IRE_IF_CLONE and 2) manually added IRE_INTERFACE. |
| * |
| * We also remove references on ire_nce_cache entries that refer to the ill. |
| */ |
| void |
| ill_downi(ire_t *ire, char *ill_arg) |
| { |
| ill_t *ill = (ill_t *)ill_arg; |
| nce_t *nce; |
| |
| mutex_enter(&ire->ire_lock); |
| nce = ire->ire_nce_cache; |
| if (nce != NULL && nce->nce_ill == ill) |
| ire->ire_nce_cache = NULL; |
| else |
| nce = NULL; |
| mutex_exit(&ire->ire_lock); |
| if (nce != NULL) |
| nce_refrele(nce); |
| if (ire->ire_ill == ill) { |
| /* |
| * The existing interface binding for ire must be |
| * deleted before trying to bind the route to another |
| * interface. However, since we are using the contents of the |
| * ire after ire_delete, the caller has to ensure that |
| * CONDEMNED (deleted) ire's are not removed from the list |
| * when ire_delete() returns. Currently ill_downi() is |
| * only called as part of ire_walk*() routines, so that |
| * the irb_refhold() done by ire_walk*() will ensure that |
| * ire_delete() does not lead to ire_inactive(). |
| */ |
| ASSERT(ire->ire_bucket->irb_refcnt > 0); |
| ire_delete(ire); |
| if (ire->ire_unbound) |
| ire_rebind(ire); |
| } |
| } |
| |
| /* Remove IRE_IF_CLONE on this ill */ |
| void |
| ill_downi_if_clone(ire_t *ire, char *ill_arg) |
| { |
| ill_t *ill = (ill_t *)ill_arg; |
| |
| ASSERT(ire->ire_type & IRE_IF_CLONE); |
| if (ire->ire_ill == ill) |
| ire_delete(ire); |
| } |
| |
| /* Consume an M_IOCACK of the fastpath probe. */ |
| void |
| ill_fastpath_ack(ill_t *ill, mblk_t *mp) |
| { |
| mblk_t *mp1 = mp; |
| |
| /* |
| * If this was the first attempt turn on the fastpath probing. |
| */ |
| mutex_enter(&ill->ill_lock); |
| if (ill->ill_dlpi_fastpath_state == IDS_INPROGRESS) |
| ill->ill_dlpi_fastpath_state = IDS_OK; |
| mutex_exit(&ill->ill_lock); |
| |
| /* Free the M_IOCACK mblk, hold on to the data */ |
| mp = mp->b_cont; |
| freeb(mp1); |
| if (mp == NULL) |
| return; |
| if (mp->b_cont != NULL) |
| nce_fastpath_update(ill, mp); |
| else |
| ip0dbg(("ill_fastpath_ack: no b_cont\n")); |
| freemsg(mp); |
| } |
| |
| /* |
| * Throw an M_IOCTL message downstream asking "do you know fastpath?" |
| * The data portion of the request is a dl_unitdata_req_t template for |
| * what we would send downstream in the absence of a fastpath confirmation. |
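 * A successful M_IOCACK reply is consumed by ill_fastpath_ack() above,
 * which hands the returned header information to nce_fastpath_update().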
| */ |
| int |
| ill_fastpath_probe(ill_t *ill, mblk_t *dlur_mp) |
| { |
| struct iocblk *ioc; |
| mblk_t *mp; |
| |
| if (dlur_mp == NULL) |
| return (EINVAL); |
| |
| mutex_enter(&ill->ill_lock); |
| switch (ill->ill_dlpi_fastpath_state) { |
| case IDS_FAILED: |
| /* |
| * Driver NAKed the first fastpath ioctl - assume it doesn't |
| * support it. |
| */ |
| mutex_exit(&ill->ill_lock); |
| return (ENOTSUP); |
| case IDS_UNKNOWN: |
| /* This is the first probe */ |
| ill->ill_dlpi_fastpath_state = IDS_INPROGRESS; |
| break; |
| default: |
| break; |
| } |
| mutex_exit(&ill->ill_lock); |
| |
| if ((mp = mkiocb(DL_IOC_HDR_INFO)) == NULL) |
| return (EAGAIN); |
| |
| mp->b_cont = copyb(dlur_mp); |
| if (mp->b_cont == NULL) { |
| freeb(mp); |
| return (EAGAIN); |
| } |
| |
| ioc = (struct iocblk *)mp->b_rptr; |
| ioc->ioc_count = msgdsize(mp->b_cont); |
| |
| DTRACE_PROBE3(ill__dlpi, char *, "ill_fastpath_probe", |
| char *, "DL_IOC_HDR_INFO", ill_t *, ill); |
| putnext(ill->ill_wq, mp); |
| return (0); |
| } |
| |
| void |
| ill_capability_probe(ill_t *ill) |
| { |
| mblk_t *mp; |
| |
| ASSERT(IAM_WRITER_ILL(ill)); |
| |
| if (ill->ill_dlpi_capab_state != IDCS_UNKNOWN && |
| ill->ill_dlpi_capab_state != IDCS_FAILED) |
| return; |
| |
| /* |
| * We are starting a new cycle of capability negotiation. |
| * Free up the capab reset messages of any previous incarnation. |
| * We will do a fresh allocation when we get the response to our probe |
| */ |
| if (ill->ill_capab_reset_mp != NULL) { |
| freemsg(ill->ill_capab_reset_mp); |
| ill->ill_capab_reset_mp = NULL; |
| } |
| |
| ip1dbg(("ill_capability_probe: starting capability negotiation\n")); |
| |
| mp = ip_dlpi_alloc(sizeof (dl_capability_req_t), DL_CAPABILITY_REQ); |
| if (mp == NULL) |
| return; |
| |
| ill_capability_send(ill, mp); |
| ill->ill_dlpi_capab_state = IDCS_PROBE_SENT; |
| } |
| |
| void |
| ill_capability_reset(ill_t *ill, boolean_t reneg) |
| { |
| ASSERT(IAM_WRITER_ILL(ill)); |
| |
| if (ill->ill_dlpi_capab_state != IDCS_OK) |
| return; |
| |
| ill->ill_dlpi_capab_state = reneg ? IDCS_RENEG : IDCS_RESET_SENT; |
| |
| ill_capability_send(ill, ill->ill_capab_reset_mp); |
| ill->ill_capab_reset_mp = NULL; |
| /* |
| * We turn off all capabilities except those pertaining to |
| * direct function call capabilities viz. ILL_CAPAB_DLD* |
| * which will be turned off by the corresponding reset functions. |
| */ |
| ill->ill_capabilities &= ~(ILL_CAPAB_HCKSUM | ILL_CAPAB_ZEROCOPY); |
| } |
| |
| static void |
| ill_capability_reset_alloc(ill_t *ill) |
| { |
| mblk_t *mp; |
| size_t size = 0; |
| int err; |
| dl_capability_req_t *capb; |
| |
| ASSERT(IAM_WRITER_ILL(ill)); |
| ASSERT(ill->ill_capab_reset_mp == NULL); |
| |
| if (ILL_HCKSUM_CAPABLE(ill)) { |
| size += sizeof (dl_capability_sub_t) + |
| sizeof (dl_capab_hcksum_t); |
| } |
| |
| if (ill->ill_capabilities & ILL_CAPAB_ZEROCOPY) { |
| size += sizeof (dl_capability_sub_t) + |
| sizeof (dl_capab_zerocopy_t); |
| } |
| |
| if (ill->ill_capabilities & ILL_CAPAB_DLD) { |
| size += sizeof (dl_capability_sub_t) + |
| sizeof (dl_capab_dld_t); |
| } |
| |
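	/* With STR_NOSIG, allocb_wait() waits for memory instead of failing */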
| mp = allocb_wait(size + sizeof (dl_capability_req_t), BPRI_MED, |
| STR_NOSIG, &err); |
| |
| mp->b_datap->db_type = M_PROTO; |
| bzero(mp->b_rptr, size + sizeof (dl_capability_req_t)); |
| |
| capb = (dl_capability_req_t *)mp->b_rptr; |
| capb->dl_primitive = DL_CAPABILITY_REQ; |
| capb->dl_sub_offset = sizeof (dl_capability_req_t); |
| capb->dl_sub_length = size; |
| |
| mp->b_wptr += sizeof (dl_capability_req_t); |
| |
| /* |
| * Each handler fills in the corresponding dl_capability_sub_t |
	 * inside the mblk.
| */ |
| ill_capability_hcksum_reset_fill(ill, mp); |
| ill_capability_zerocopy_reset_fill(ill, mp); |
| ill_capability_dld_reset_fill(ill, mp); |
| |
| ill->ill_capab_reset_mp = mp; |
| } |
| |
| static void |
| ill_capability_id_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *outers) |
| { |
| dl_capab_id_t *id_ic; |
| uint_t sub_dl_cap = outers->dl_cap; |
| dl_capability_sub_t *inners; |
| uint8_t *capend; |
| |
| ASSERT(sub_dl_cap == DL_CAPAB_ID_WRAPPER); |
| |
| /* |
| * Note: range checks here are not absolutely sufficient to |
| * make us robust against malformed messages sent by drivers; |
| * this is in keeping with the rest of IP's dlpi handling. |
| * (Remember, it's coming from something else in the kernel |
| * address space) |
| */ |
| |
| capend = (uint8_t *)(outers + 1) + outers->dl_length; |
| if (capend > mp->b_wptr) { |
| cmn_err(CE_WARN, "ill_capability_id_ack: " |
| "malformed sub-capability too long for mblk"); |
| return; |
| } |
| |
| id_ic = (dl_capab_id_t *)(outers + 1); |
| |
| if (outers->dl_length < sizeof (*id_ic) || |
| (inners = &id_ic->id_subcap, |
| inners->dl_length > (outers->dl_length - sizeof (*inners)))) { |
| cmn_err(CE_WARN, "ill_capability_id_ack: malformed " |
| "encapsulated capab type %d too long for mblk", |
| inners->dl_cap); |
| return; |
| } |
| |
| if (!dlcapabcheckqid(&id_ic->id_mid, ill->ill_lmod_rq)) { |
| ip1dbg(("ill_capability_id_ack: mid token for capab type %d " |
| "isn't as expected; pass-thru module(s) detected, " |
| "discarding capability\n", inners->dl_cap)); |
| return; |
| } |
| |
| /* Process the encapsulated sub-capability */ |
| ill_capability_dispatch(ill, mp, inners); |
| } |
| |
| static void |
| ill_capability_dld_reset_fill(ill_t *ill, mblk_t *mp) |
| { |
| dl_capability_sub_t *dl_subcap; |
| |
| if (!(ill->ill_capabilities & ILL_CAPAB_DLD)) |
| return; |
| |
| /* |
| * The dl_capab_dld_t that follows the dl_capability_sub_t is not |
| * initialized below since it is not used by DLD. |
| */ |
| dl_subcap = (dl_capability_sub_t *)mp->b_wptr; |
| dl_subcap->dl_cap = DL_CAPAB_DLD; |
| dl_subcap->dl_length = sizeof (dl_capab_dld_t); |
| |
| mp->b_wptr += sizeof (dl_capability_sub_t) + sizeof (dl_capab_dld_t); |
| } |
| |
| static void |
| ill_capability_dispatch(ill_t *ill, mblk_t *mp, dl_capability_sub_t *subp) |
| { |
| /* |
| * If no ipif was brought up over this ill, this DL_CAPABILITY_REQ/ACK |
| * is only to get the VRRP capability. |
| * |
| * Note that we cannot check ill_ipif_up_count here since |
| * ill_ipif_up_count is only incremented when the resolver is setup. |
| * That is done asynchronously, and can race with this function. |
| */ |
| if (!ill->ill_dl_up) { |
| if (subp->dl_cap == DL_CAPAB_VRRP) |
| ill_capability_vrrp_ack(ill, mp, subp); |
| return; |
| } |
| |
| switch (subp->dl_cap) { |
| case DL_CAPAB_HCKSUM: |
| ill_capability_hcksum_ack(ill, mp, subp); |
| break; |
| case DL_CAPAB_ZEROCOPY: |
| ill_capability_zerocopy_ack(ill, mp, subp); |
| break; |
| case DL_CAPAB_DLD: |
| ill_capability_dld_ack(ill, mp, subp); |
| break; |
| case DL_CAPAB_VRRP: |
| break; |
| default: |
| ip1dbg(("ill_capability_dispatch: unknown capab type %d\n", |
| subp->dl_cap)); |
| } |
| } |
| |
| /* |
| * Process the vrrp capability received from a DLS Provider. isub must point |
| * to the sub-capability (DL_CAPAB_VRRP) of a DL_CAPABILITY_ACK message. |
| */ |
| static void |
| ill_capability_vrrp_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) |
| { |
| dl_capab_vrrp_t *vrrp; |
| uint_t sub_dl_cap = isub->dl_cap; |
| uint8_t *capend; |
| |
| ASSERT(IAM_WRITER_ILL(ill)); |
| ASSERT(sub_dl_cap == DL_CAPAB_VRRP); |
| |
| /* |
| * Note: range checks here are not absolutely sufficient to |
| * make us robust against malformed messages sent by drivers; |
| * this is in keeping with the rest of IP's dlpi handling. |
| * (Remember, it's coming from something else in the kernel |
| * address space) |
| */ |
| capend = (uint8_t *)(isub + 1) + isub->dl_length; |
| if (capend > mp->b_wptr) { |
| cmn_err(CE_WARN, "ill_capability_vrrp_ack: " |
| "malformed sub-capability too long for mblk"); |
| return; |
| } |
| vrrp = (dl_capab_vrrp_t *)(isub + 1); |
| |
| /* |
| * Compare the IP address family and set ILLF_VRRP for the right ill. |
| */ |
| if ((vrrp->vrrp_af == AF_INET6 && ill->ill_isv6) || |
| (vrrp->vrrp_af == AF_INET && !ill->ill_isv6)) { |
| ill->ill_flags |= ILLF_VRRP; |
| } |
| } |
| |
| /* |
| * Process a hardware checksum offload capability negotiation ack received |
 * from a DLS Provider. isub must point to the sub-capability
 * (DL_CAPAB_HCKSUM) of a DL_CAPABILITY_ACK message.
| */ |
| static void |
| ill_capability_hcksum_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) |
| { |
| dl_capability_req_t *ocap; |
| dl_capab_hcksum_t *ihck, *ohck; |
| ill_hcksum_capab_t **ill_hcksum; |
| mblk_t *nmp = NULL; |
| uint_t sub_dl_cap = isub->dl_cap; |
| uint8_t *capend; |
| |
| ASSERT(sub_dl_cap == DL_CAPAB_HCKSUM); |
| |
| ill_hcksum = (ill_hcksum_capab_t **)&ill->ill_hcksum_capab; |
| |
| /* |
| * Note: range checks here are not absolutely sufficient to |
| * make us robust against malformed messages sent by drivers; |
| * this is in keeping with the rest of IP's dlpi handling. |
| * (Remember, it's coming from something else in the kernel |
| * address space) |
| */ |
| capend = (uint8_t *)(isub + 1) + isub->dl_length; |
| if (capend > mp->b_wptr) { |
| cmn_err(CE_WARN, "ill_capability_hcksum_ack: " |
| "malformed sub-capability too long for mblk"); |
| return; |
| } |
| |
| /* |
| * There are two types of acks we process here: |
| * 1. acks in reply to a (first form) generic capability req |
| * (no ENABLE flag set) |
	 * 2. acks in reply to an ENABLE capability req.
| * (ENABLE flag set) |
| */ |
| ihck = (dl_capab_hcksum_t *)(isub + 1); |
| |
| if (ihck->hcksum_version != HCKSUM_VERSION_1) { |
| cmn_err(CE_CONT, "ill_capability_hcksum_ack: " |
| "unsupported hardware checksum " |
| "sub-capability (version %d, expected %d)", |
| ihck->hcksum_version, HCKSUM_VERSION_1); |
| return; |
| } |
| |
| if (!dlcapabcheckqid(&ihck->hcksum_mid, ill->ill_lmod_rq)) { |
| ip1dbg(("ill_capability_hcksum_ack: mid token for hardware " |
| "checksum capability isn't as expected; pass-thru " |
| "module(s) detected, discarding capability\n")); |
| return; |
| } |
| |
| #define CURR_HCKSUM_CAPAB \ |
| (HCKSUM_INET_PARTIAL | HCKSUM_INET_FULL_V4 | \ |
| HCKSUM_INET_FULL_V6 | HCKSUM_IPHDRCKSUM) |
| |
| if ((ihck->hcksum_txflags & HCKSUM_ENABLE) && |
| (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB)) { |
| /* do ENABLE processing */ |
| if (*ill_hcksum == NULL) { |
| *ill_hcksum = kmem_zalloc(sizeof (ill_hcksum_capab_t), |
| KM_NOSLEEP); |
| |
| if (*ill_hcksum == NULL) { |
| cmn_err(CE_WARN, "ill_capability_hcksum_ack: " |
| "could not enable hcksum version %d " |
| "for %s (ENOMEM)\n", HCKSUM_CURRENT_VERSION, |
| ill->ill_name); |
| return; |
| } |
| } |
| |
| (*ill_hcksum)->ill_hcksum_version = ihck->hcksum_version; |
| (*ill_hcksum)->ill_hcksum_txflags = ihck->hcksum_txflags; |
| ill->ill_capabilities |= ILL_CAPAB_HCKSUM; |
| ip1dbg(("ill_capability_hcksum_ack: interface %s " |
| "has enabled hardware checksumming\n ", |
| ill->ill_name)); |
| } else if (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB) { |
| /* |
| * Enabling hardware checksum offload |
| * Currently IP supports {TCP,UDP}/IPv4 |
| * partial and full cksum offload and |
| * IPv4 header checksum offload. |
| * Allocate new mblk which will |
| * contain a new capability request |
| * to enable hardware checksum offload. |
| */ |
| uint_t size; |
| uchar_t *rptr; |
| |
| size = sizeof (dl_capability_req_t) + |
| sizeof (dl_capability_sub_t) + isub->dl_length; |
| |
| if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { |
| cmn_err(CE_WARN, "ill_capability_hcksum_ack: " |
| "could not enable hardware cksum for %s (ENOMEM)\n", |
| ill->ill_name); |
| return; |
| } |
| |
| rptr = nmp->b_rptr; |
| /* initialize dl_capability_req_t */ |
| ocap = (dl_capability_req_t *)nmp->b_rptr; |
| ocap->dl_sub_offset = |
| sizeof (dl_capability_req_t); |
| ocap->dl_sub_length = |
| sizeof (dl_capability_sub_t) + |
| isub->dl_length; |
| nmp->b_rptr += sizeof (dl_capability_req_t); |
| |
| /* initialize dl_capability_sub_t */ |
| bcopy(isub, nmp->b_rptr, sizeof (*isub)); |
| nmp->b_rptr += sizeof (*isub); |
| |
| /* initialize dl_capab_hcksum_t */ |
| ohck = (dl_capab_hcksum_t *)nmp->b_rptr; |
| bcopy(ihck, ohck, sizeof (*ihck)); |
| |
| nmp->b_rptr = rptr; |
| ASSERT(nmp->b_wptr == (nmp->b_rptr + size)); |
| |
| /* Set ENABLE flag */ |
| ohck->hcksum_txflags &= CURR_HCKSUM_CAPAB; |
| ohck->hcksum_txflags |= HCKSUM_ENABLE; |
| |
| /* |
| * nmp points to a DL_CAPABILITY_REQ message to enable |
| * hardware checksum acceleration. |
| */ |
| ill_capability_send(ill, nmp); |
| } else { |
| ip1dbg(("ill_capability_hcksum_ack: interface %s has " |
| "advertised %x hardware checksum capability flags\n", |
| ill->ill_name, ihck->hcksum_txflags)); |
| } |
| } |
| |
| static void |
| ill_capability_hcksum_reset_fill(ill_t *ill, mblk_t *mp) |
| { |
| dl_capab_hcksum_t *hck_subcap; |
| dl_capability_sub_t *dl_subcap; |
| |
| if (!ILL_HCKSUM_CAPABLE(ill)) |
| return; |
| |
| ASSERT(ill->ill_hcksum_capab != NULL); |
| |
| dl_subcap = (dl_capability_sub_t *)mp->b_wptr; |
| dl_subcap->dl_cap = DL_CAPAB_HCKSUM; |
| dl_subcap->dl_length = sizeof (*hck_subcap); |
| |
| hck_subcap = (dl_capab_hcksum_t *)(dl_subcap + 1); |
| hck_subcap->hcksum_version = ill->ill_hcksum_capab->ill_hcksum_version; |
| hck_subcap->hcksum_txflags = 0; |
| |
| mp->b_wptr += sizeof (*dl_subcap) + sizeof (*hck_subcap); |
| } |
| |
| static void |
| ill_capability_zerocopy_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) |
| { |
| mblk_t *nmp = NULL; |
| dl_capability_req_t *oc; |
| dl_capab_zerocopy_t *zc_ic, *zc_oc; |
| ill_zerocopy_capab_t **ill_zerocopy_capab; |
| uint_t sub_dl_cap = isub->dl_cap; |
| uint8_t *capend; |
| |
| ASSERT(sub_dl_cap == DL_CAPAB_ZEROCOPY); |
| |
| ill_zerocopy_capab = (ill_zerocopy_capab_t **)&ill->ill_zerocopy_capab; |
| |
| /* |
| * Note: range checks here are not absolutely sufficient to |
| * make us robust against malformed messages sent by drivers; |
| * this is in keeping with the rest of IP's dlpi handling. |
| * (Remember, it's coming from something else in the kernel |
| * address space) |
| */ |
| capend = (uint8_t *)(isub + 1) + isub->dl_length; |
| if (capend > mp->b_wptr) { |
| cmn_err(CE_WARN, "ill_capability_zerocopy_ack: " |
| "malformed sub-capability too long for mblk"); |
| return; |
| } |
| |
| zc_ic = (dl_capab_zerocopy_t *)(isub + 1); |
| if (zc_ic->zerocopy_version != ZEROCOPY_VERSION_1) { |
| cmn_err(CE_CONT, "ill_capability_zerocopy_ack: " |
| "unsupported ZEROCOPY sub-capability (version %d, " |
| "expected %d)", zc_ic->zerocopy_version, |
| ZEROCOPY_VERSION_1); |
| return; |
| } |
| |
| if (!dlcapabcheckqid(&zc_ic->zerocopy_mid, ill->ill_lmod_rq)) { |
| ip1dbg(("ill_capability_zerocopy_ack: mid token for zerocopy " |
| "capability isn't as expected; pass-thru module(s) " |
| "detected, discarding capability\n")); |
| return; |
| } |
| |
| if ((zc_ic->zerocopy_flags & DL_CAPAB_VMSAFE_MEM) != 0) { |
| if (*ill_zerocopy_capab == NULL) { |
| *ill_zerocopy_capab = |
| kmem_zalloc(sizeof (ill_zerocopy_capab_t), |
| KM_NOSLEEP); |
| |
| if (*ill_zerocopy_capab == NULL) { |
| cmn_err(CE_WARN, "ill_capability_zerocopy_ack: " |
| "could not enable Zero-copy version %d " |
| "for %s (ENOMEM)\n", ZEROCOPY_VERSION_1, |
| ill->ill_name); |
| return; |
| } |
| } |
| |
| ip1dbg(("ill_capability_zerocopy_ack: interface %s " |
| "supports Zero-copy version %d\n", ill->ill_name, |
| ZEROCOPY_VERSION_1)); |
| |
| (*ill_zerocopy_capab)->ill_zerocopy_version = |
| zc_ic->zerocopy_version; |
| (*ill_zerocopy_capab)->ill_zerocopy_flags = |
| zc_ic->zerocopy_flags; |
| |
| ill->ill_capabilities |= ILL_CAPAB_ZEROCOPY; |
| } else { |
| uint_t size; |
| uchar_t *rptr; |
| |
| size = sizeof (dl_capability_req_t) + |
| sizeof (dl_capability_sub_t) + |
| sizeof (dl_capab_zerocopy_t); |
| |
| if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { |
| cmn_err(CE_WARN, "ill_capability_zerocopy_ack: " |
| "could not enable zerocopy for %s (ENOMEM)\n", |
| ill->ill_name); |
| return; |
| } |
| |
| rptr = nmp->b_rptr; |
| /* initialize dl_capability_req_t */ |
| oc = (dl_capability_req_t *)rptr; |
| oc->dl_sub_offset = sizeof (dl_capability_req_t); |
| oc->dl_sub_length = sizeof (dl_capability_sub_t) + |
| sizeof (dl_capab_zerocopy_t); |
| rptr += sizeof (dl_capability_req_t); |
| |
| /* initialize dl_capability_sub_t */ |
| bcopy(isub, rptr, sizeof (*isub)); |
| rptr += sizeof (*isub); |
| |
| /* initialize dl_capab_zerocopy_t */ |
| zc_oc = (dl_capab_zerocopy_t *)rptr; |
| *zc_oc = *zc_ic; |
| |
| ip1dbg(("ill_capability_zerocopy_ack: asking interface %s " |
| "to enable zero-copy version %d\n", ill->ill_name, |
| ZEROCOPY_VERSION_1)); |
| |
| /* set VMSAFE_MEM flag */ |
| zc_oc->zerocopy_flags |= DL_CAPAB_VMSAFE_MEM; |
| |
| /* nmp points to a DL_CAPABILITY_REQ message to enable zcopy */ |
| ill_capability_send(ill, nmp); |
| } |
| } |
| |
| static void |
| ill_capability_zerocopy_reset_fill(ill_t *ill, mblk_t *mp) |
| { |
| dl_capab_zerocopy_t *zerocopy_subcap; |
| dl_capability_sub_t *dl_subcap; |
| |
| if (!(ill->ill_capabilities & ILL_CAPAB_ZEROCOPY)) |
| return; |
| |
| ASSERT(ill->ill_zerocopy_capab != NULL); |
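/*
* Append a DL_CAPAB_ZEROCOPY sub-capability with zerocopy_flags cleared;
* the zeroed flags in the reset request ask the driver to disable
* zero-copy.
*/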
| |
| dl_subcap = (dl_capability_sub_t *)mp->b_wptr; |
| dl_subcap->dl_cap = DL_CAPAB_ZEROCOPY; |
| dl_subcap->dl_length = sizeof (*zerocopy_subcap); |
| |
| zerocopy_subcap = (dl_capab_zerocopy_t *)(dl_subcap + 1); |
| zerocopy_subcap->zerocopy_version = |
| ill->ill_zerocopy_capab->ill_zerocopy_version; |
| zerocopy_subcap->zerocopy_flags = 0; |
| |
| mp->b_wptr += sizeof (*dl_subcap) + sizeof (*zerocopy_subcap); |
| } |
| |
| /* |
| * DLD capability |
| * Refer to dld.h for more information regarding the purpose and usage |
| * of this capability. |
| */ |
| static void |
| ill_capability_dld_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) |
| { |
| dl_capab_dld_t *dld_ic, dld; |
| uint_t sub_dl_cap = isub->dl_cap; |
| uint8_t *capend; |
| ill_dld_capab_t *idc; |
| |
| ASSERT(IAM_WRITER_ILL(ill)); |
| ASSERT(sub_dl_cap == DL_CAPAB_DLD); |
| |
| /* |
| * Note: range checks here are not absolutely sufficient to |
| * make us robust against malformed messages sent by drivers; |
| * this is in keeping with the rest of IP's dlpi handling. |
| * (Remember, it's coming from something else in the kernel |
| * address space) |
| */ |
| capend = (uint8_t *)(isub + 1) + isub->dl_length; |
| if (capend > mp->b_wptr) { |
| cmn_err(CE_WARN, "ill_capability_dld_ack: " |
| "malformed sub-capability too long for mblk"); |
| return; |
| } |
| dld_ic = (dl_capab_dld_t *)(isub + 1); |
| if (dld_ic->dld_version != DLD_CURRENT_VERSION) { |
| cmn_err(CE_CONT, "ill_capability_dld_ack: " |
| "unsupported DLD sub-capability (version %d, " |
| "expected %d)", dld_ic->dld_version, |
| DLD_CURRENT_VERSION); |
| return; |
| } |
| if (!dlcapabcheckqid(&dld_ic->dld_mid, ill->ill_lmod_rq)) { |
| ip1dbg(("ill_capability_dld_ack: mid token for dld " |
| "capability isn't as expected; pass-thru module(s) " |
| "detected, discarding capability\n")); |
| return; |
| } |
| |
| /* |
| * Copy locally to ensure alignment. |
| */ |
| bcopy(dld_ic, &dld, sizeof (dl_capab_dld_t)); |
| |
| if ((idc = ill->ill_dld_capab) == NULL) { |
| idc = kmem_zalloc(sizeof (ill_dld_capab_t), KM_NOSLEEP); |
| if (idc == NULL) { |
| cmn_err(CE_WARN, "ill_capability_dld_ack: " |
| "could not enable DLD version %d " |
| "for %s (ENOMEM)\n", DLD_CURRENT_VERSION, |
| ill->ill_name); |
| return; |
| } |
| ill->ill_dld_capab = idc; |
| } |
| idc->idc_capab_df = (ip_capab_func_t)dld.dld_capab; |
| idc->idc_capab_dh = (void *)dld.dld_capab_handle; |
| ip1dbg(("ill_capability_dld_ack: interface %s " |
| "supports DLD version %d\n", ill->ill_name, DLD_CURRENT_VERSION)); |
| |
| ill_capability_dld_enable(ill); |
| } |
| |
| /* |
| * Typically capability negotiation between IP and the driver happens via |
* DLPI message exchange. However, GLD also offers a direct function call
* mechanism to exchange the DLD_CAPAB_DIRECT and DLD_CAPAB_POLL capabilities.
* Arbitrary function calls into IP or GLD are not permitted, since both
* of them are protected by their own perimeter mechanism. The perimeter can
* be viewed as a coarse lock or serialization mechanism. The hierarchy of
* these perimeters is IP -> MAC. Thus, for example, to enable squeue
* polling, IP needs to enter its perimeter, then call ill_mac_perim_enter
* to enter the mac perimeter and then do the direct function calls into
* GLD to enable squeue polling. The ring-related callbacks from the mac into
* the stack to add, bind, quiesce, restart or clean up a ring are all
* protected by the mac perimeter.
| */ |
| static void |
| ill_mac_perim_enter(ill_t *ill, mac_perim_handle_t *mphp) |
| { |
| ill_dld_capab_t *idc = ill->ill_dld_capab; |
| int err; |
| |
| err = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, mphp, |
| DLD_ENABLE); |
| ASSERT(err == 0); |
| } |
| |
| static void |
| ill_mac_perim_exit(ill_t *ill, mac_perim_handle_t mph) |
| { |
| ill_dld_capab_t *idc = ill->ill_dld_capab; |
| int err; |
| |
| err = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, mph, |
| DLD_DISABLE); |
| ASSERT(err == 0); |
| } |
| |
| boolean_t |
| ill_mac_perim_held(ill_t *ill) |
| { |
| ill_dld_capab_t *idc = ill->ill_dld_capab; |
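/*
* DLD_QUERY on DLD_CAPAB_PERIM asks the driver whether the mac
* perimeter for this ill is currently held.
*/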
| |
| return (idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, NULL, |
| DLD_QUERY)); |
| } |
| |
| static void |
| ill_capability_direct_enable(ill_t *ill) |
| { |
| ill_dld_capab_t *idc = ill->ill_dld_capab; |
| ill_dld_direct_t *idd = &idc->idc_direct; |
| dld_capab_direct_t direct; |
| int rc; |
| |
| ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill)); |
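/*
* Negotiate the DLD_CAPAB_DIRECT capability: hand the driver IP's
* receive entry point (ip_input) and, on success, record the driver's
* direct transmit, tx-callback registration and flow-control query
* entry points in idc_direct.
*/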
| |
| bzero(&direct, sizeof (direct)); |
| direct.di_rx_cf = (uintptr_t)ip_input; |
| direct.di_rx_ch = ill; |
| |
| rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_DIRECT, &direct, |
| DLD_ENABLE); |
| if (rc == 0) { |
| idd->idd_tx_df = (ip_dld_tx_t)direct.di_tx_df; |
| idd->idd_tx_dh = direct.di_tx_dh; |
| idd->idd_tx_cb_df = (ip_dld_callb_t)direct.di_tx_cb_df; |
| idd->idd_tx_cb_dh = direct.di_tx_cb_dh; |
| idd->idd_tx_fctl_df = (ip_dld_fctl_t)direct.di_tx_fctl_df; |
| idd->idd_tx_fctl_dh = direct.di_tx_fctl_dh; |
| ASSERT(idd->idd_tx_cb_df != NULL); |
| ASSERT(idd->idd_tx_fctl_df != NULL); |
| ASSERT(idd->idd_tx_df != NULL); |
| /* |
| * One time registration of flow enable callback function |
| */ |
| ill->ill_flownotify_mh = idd->idd_tx_cb_df(idd->idd_tx_cb_dh, |
| ill_flow_enable, ill); |
| ill->ill_capabilities |= ILL_CAPAB_DLD_DIRECT; |
| DTRACE_PROBE1(direct_on, (ill_t *), ill); |
| } else { |
| cmn_err(CE_WARN, "warning: could not enable DIRECT " |
| "capability, rc = %d\n", rc); |
| DTRACE_PROBE2(direct_off, (ill_t *), ill, (int), rc); |
| } |
| } |
| |
| static void |
| ill_capability_poll_enable(ill_t *ill) |
| { |
| ill_dld_capab_t *idc = ill->ill_dld_capab; |
| dld_capab_poll_t poll; |
| int rc; |
| |
| ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill)); |
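/*
* Negotiate the DLD_CAPAB_POLL capability: pass the squeue ring
* add/remove/quiesce/restart/bind callbacks to the driver so that the
* rings can be hooked into IP's squeue polling machinery.
*/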
| |
| bzero(&poll, sizeof (poll)); |
| poll.poll_ring_add_cf = (uintptr_t)ip_squeue_add_ring; |
| poll.poll_ring_remove_cf = (uintptr_t)ip_squeue_clean_ring; |
| poll.poll_ring_quiesce_cf = (uintptr_t)ip_squeue_quiesce_ring; |
| poll.poll_ring_restart_cf = (uintptr_t)ip_squeue_restart_ring; |
| poll.poll_ring_bind_cf = (uintptr_t)ip_squeue_bind_ring; |
| poll.poll_ring_ch = ill; |
| rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_POLL, &poll, |
| DLD_ENABLE); |
| if (rc == 0) { |
| ill->ill_capabilities |= ILL_CAPAB_DLD_POLL; |
| DTRACE_PROBE1(poll_on, (ill_t *), ill); |
| } else { |
| ip1dbg(("warning: could not enable POLL " |
| "capability, rc = %d\n", rc)); |
| DTRACE_PROBE2(poll_off, (ill_t *), ill, (int), rc); |
| } |
| } |
| |
| /* |
| * Enable the LSO capability. |
| */ |
| static void |
| ill_capability_lso_enable(ill_t *ill) |
| { |
| ill_dld_capab_t *idc = ill->ill_dld_capab; |
| dld_capab_lso_t lso; |
| int rc; |
| |
| ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill)); |
| |
| if (ill->ill_lso_capab == NULL) { |
| ill->ill_lso_capab = kmem_zalloc(sizeof (ill_lso_capab_t), |
| KM_NOSLEEP); |
| if (ill->ill_lso_capab == NULL) { |
| cmn_err(CE_WARN, "ill_capability_lso_enable: " |
| "could not enable LSO for %s (ENOMEM)\n", |
| ill->ill_name); |
| return; |
| } |
| } |
| |
| bzero(&lso, sizeof (lso)); |
| if ((rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_LSO, &lso, |
| DLD_ENABLE)) == 0) { |
| ill->ill_lso_capab->ill_lso_flags = lso.lso_flags; |
| ill->ill_lso_capab->ill_lso_max = lso.lso_max; |
| ill->ill_capabilities |= ILL_CAPAB_LSO; |
| ip1dbg(("ill_capability_lso_enable: interface %s " |
| "has enabled LSO\n ", ill->ill_name)); |
| } else { |
| kmem_free(ill->ill_lso_capab, sizeof (ill_lso_capab_t)); |
| ill->ill_lso_capab = NULL; |
| DTRACE_PROBE2(lso_off, (ill_t *), ill, (int), rc); |
| } |
| } |
| |
| static void |
| ill_capability_dld_enable(ill_t *ill) |
| { |
| mac_perim_handle_t mph; |
| |
| ASSERT(IAM_WRITER_ILL(ill)); |
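/*
* Enable the function-call based capabilities negotiated through
* DL_CAPAB_DLD: direct transmit/receive, squeue polling and LSO.
* These are enabled under the mac perimeter.
*/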
| |
| if (ill->ill_isv6) |
| return; |
| |
| ill_mac_perim_enter(ill, &mph); |
| if (!ill->ill_isv6) { |
| ill_capability_direct_enable(ill); |
| ill_capability_poll_enable(ill); |
| ill_capability_lso_enable(ill); |
| } |
| ill->ill_capabilities |= ILL_CAPAB_DLD; |
| ill_mac_perim_exit(ill, mph); |
| } |
| |
| static void |
| ill_capability_dld_disable(ill_t *ill) |
| { |
| ill_dld_capab_t *idc; |
| ill_dld_direct_t *idd; |
| mac_perim_handle_t mph; |
| |
| ASSERT(IAM_WRITER_ILL(ill)); |
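/*
* Tear down the function-call based capabilities (direct, poll, LSO)
* that were enabled in ill_capability_dld_enable(), again under the
* mac perimeter.
*/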
| |
| if (!(ill->ill_capabilities & ILL_CAPAB_DLD)) |
| return; |
| |
| ill_mac_perim_enter(ill, &mph); |
| |
| idc = ill->ill_dld_capab; |
| if ((ill->ill_capabilities & ILL_CAPAB_DLD_DIRECT) != 0) { |
| /* |
| * For performance we avoid locks in the transmit data path |
| * and don't maintain a count of the number of threads using |
| * direct calls. Thus some threads could be using direct |
| * transmit calls to GLD, even after the capability mechanism |
| * turns it off. This is still safe since the handles used in |
| * the direct calls continue to be valid until the unplumb is |
| * completed. Remove the callback that was added (1-time) at |
| * capab enable time. |
| */ |
| mutex_enter(&ill->ill_lock); |
| ill->ill_capabilities &= ~ILL_CAPAB_DLD_DIRECT; |
| mutex_exit(&ill->ill_lock); |
| if (ill->ill_flownotify_mh != NULL) { |
| idd = &idc->idc_direct; |
| idd->idd_tx_cb_df(idd->idd_tx_cb_dh, NULL, |
| ill->ill_flownotify_mh); |
| ill->ill_flownotify_mh = NULL; |
| } |
| (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_DIRECT, |
| NULL, DLD_DISABLE); |
| } |
| |
| if ((ill->ill_capabilities & ILL_CAPAB_DLD_POLL) != 0) { |
| ill->ill_capabilities &= ~ILL_CAPAB_DLD_POLL; |
| ip_squeue_clean_all(ill); |
| (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_POLL, |
| NULL, DLD_DISABLE); |
| } |
| |
| if ((ill->ill_capabilities & ILL_CAPAB_LSO) != 0) { |
| ASSERT(ill->ill_lso_capab != NULL); |
| /* |
| * Clear the capability flag for LSO but retain the |
| * ill_lso_capab structure since it's possible that another |
| * thread is still referring to it. The structure only gets |
| * deallocated when we destroy the ill. |
| */ |
| |
| ill->ill_capabilities &= ~ILL_CAPAB_LSO; |
| (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_LSO, |
| NULL, DLD_DISABLE); |
| } |
| |
| ill->ill_capabilities &= ~ILL_CAPAB_DLD; |
| ill_mac_perim_exit(ill, mph); |
| } |
| |
| /* |
| * Capability Negotiation protocol |
| * |
| * We don't wait for DLPI capability operations to finish during interface |
* bringup or teardown. Doing so would introduce more asynchrony, and the
* interface up/down operations would need multiple returns and restarts.
| * Instead the 'ipsq_current_ipif' of the ipsq is not cleared as long as |
| * the 'ill_dlpi_deferred' chain is non-empty. This ensures that the next |
| * exclusive operation won't start until the DLPI operations of the previous |
| * exclusive operation complete. |
| * |
| * The capability state machine is shown below. |
| * |
| * state next state event, action |
| * |
| * IDCS_UNKNOWN IDCS_PROBE_SENT ill_capability_probe |
| * IDCS_PROBE_SENT IDCS_OK ill_capability_ack |
| * IDCS_PROBE_SENT IDCS_FAILED ip_rput_dlpi_writer (nack) |
| * IDCS_OK IDCS_RENEG Receipt of DL_NOTE_CAPAB_RENEG |
| * IDCS_OK IDCS_RESET_SENT ill_capability_reset |
| * IDCS_RESET_SENT IDCS_UNKNOWN ill_capability_ack_thr |
| * IDCS_RENEG IDCS_PROBE_SENT ill_capability_ack_thr -> |
| * ill_capability_probe. |
| */ |
| |
| /* |
| * Dedicated thread started from ip_stack_init that handles capability |
| * disable. This thread ensures the taskq dispatch does not fail by waiting |
| * for resources using TQ_SLEEP. The taskq mechanism is used to ensure |
| * that direct calls to DLD are done in a cv_waitable context. |
| */ |
| void |
| ill_taskq_dispatch(ip_stack_t *ipst) |
| { |
| callb_cpr_t cprinfo; |
| char name[64]; |
| mblk_t *mp; |
| |
| (void) snprintf(name, sizeof (name), "ill_taskq_dispatch_%d", |
| ipst->ips_netstack->netstack_stackid); |
| CALLB_CPR_INIT(&cprinfo, &ipst->ips_capab_taskq_lock, callb_generic_cpr, |
| name); |
| mutex_enter(&ipst->ips_capab_taskq_lock); |
| |
| for (;;) { |
| mp = ipst->ips_capab_taskq_head; |
| while (mp != NULL) { |
| ipst->ips_capab_taskq_head = mp->b_next; |
| if (ipst->ips_capab_taskq_head == NULL) |
| ipst->ips_capab_taskq_tail = NULL; |
| mutex_exit(&ipst->ips_capab_taskq_lock); |
| mp->b_next = NULL; |
| |
| VERIFY(taskq_dispatch(system_taskq, |
| ill_capability_ack_thr, mp, TQ_SLEEP) != 0); |
| mutex_enter(&ipst->ips_capab_taskq_lock); |
| mp = ipst->ips_capab_taskq_head; |
| } |
| |
| if (ipst->ips_capab_taskq_quit) |
| break; |
| CALLB_CPR_SAFE_BEGIN(&cprinfo); |
| cv_wait(&ipst->ips_capab_taskq_cv, &ipst->ips_capab_taskq_lock); |
| CALLB_CPR_SAFE_END(&cprinfo, &ipst->ips_capab_taskq_lock); |
| } |
| VERIFY(ipst->ips_capab_taskq_head == NULL); |
| VERIFY(ipst->ips_capab_taskq_tail == NULL); |
| CALLB_CPR_EXIT(&cprinfo); |
| thread_exit(); |
| } |
| |
| /* |
| * Consume a new-style hardware capabilities negotiation ack. |
| * Called via taskq on receipt of DL_CAPABILITY_ACK. |
| */ |
| static void |
| ill_capability_ack_thr(void *arg) |
| { |
| mblk_t *mp = arg; |
| dl_capability_ack_t *capp; |
| dl_capability_sub_t *subp, *endp; |
| ill_t *ill; |
| boolean_t reneg; |
| |
| ill = (ill_t *)mp->b_prev; |
| mp->b_prev = NULL; |
| |
| VERIFY(ipsq_enter(ill, B_FALSE, CUR_OP) == B_TRUE); |
| |
| if (ill->ill_dlpi_capab_state == IDCS_RESET_SENT || |
| ill->ill_dlpi_capab_state == IDCS_RENEG) { |
| /* |
| * We have received the ack for our DL_CAPAB reset request. |
* There isn't anything in the message that needs processing.
* All message-based capabilities have been disabled; now
* do the function-call based capability disable.
| */ |
| reneg = ill->ill_dlpi_capab_state == IDCS_RENEG; |
| ill_capability_dld_disable(ill); |
| ill->ill_dlpi_capab_state = IDCS_UNKNOWN; |
| if (reneg) |
| ill_capability_probe(ill); |
| goto done; |
| } |
| |
| if (ill->ill_dlpi_capab_state == IDCS_PROBE_SENT) |
| ill->ill_dlpi_capab_state = IDCS_OK; |
| |
| capp = (dl_capability_ack_t *)mp->b_rptr; |
| |
| if (capp->dl_sub_length == 0) { |
| /* no new-style capabilities */ |
| goto done; |
| } |
| |
| /* make sure the driver supplied correct dl_sub_length */ |
| if ((sizeof (*capp) + capp->dl_sub_length) > MBLKL(mp)) { |
| ip0dbg(("ill_capability_ack: bad DL_CAPABILITY_ACK, " |
| "invalid dl_sub_length (%d)\n", capp->dl_sub_length)); |
| goto done; |
| } |
| |
| #define SC(base, offset) (dl_capability_sub_t *)(((uchar_t *)(base))+(offset)) |
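/* SC(base, offset): a dl_capability_sub_t pointer 'offset' bytes past 'base' */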
| /* |
| * There are sub-capabilities. Process the ones we know about. |
* Loop until we don't have room for another sub-cap header.
| */ |
| for (subp = SC(capp, capp->dl_sub_offset), |
| endp = SC(subp, capp->dl_sub_length - sizeof (*subp)); |
| subp <= endp; |
| subp = SC(subp, sizeof (dl_capability_sub_t) + subp->dl_length)) { |
| |
| switch (subp->dl_cap) { |
| case DL_CAPAB_ID_WRAPPER: |
| ill_capability_id_ack(ill, mp, subp); |
| break; |
| default: |
| ill_capability_dispatch(ill, mp, subp); |
| break; |
| } |
| } |
| #undef SC |
| done: |
| inet_freemsg(mp); |
| ill_capability_done(ill); |
| ipsq_exit(ill->ill_phyint->phyint_ipsq); |
| } |
| |
| /* |
* The ack processing needs to run in a taskq thread to provide a
* cv_waitable context, so we dispatch ill_capability_ack_thr rather
* than processing the message in place.
| */ |
| void |
| ill_capability_ack(ill_t *ill, mblk_t *mp) |
| { |
| ip_stack_t *ipst = ill->ill_ipst; |
| |
| mp->b_prev = (mblk_t *)ill; |
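/* b_prev carries the ill; ill_capability_ack_thr() recovers it there. */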
| ASSERT(mp->b_next == NULL); |
| |
| if (taskq_dispatch(system_taskq, ill_capability_ack_thr, mp, |
| TQ_NOSLEEP) != 0) |
| return; |
| |
| /* |
| * The taskq dispatch failed. Signal the ill_taskq_dispatch thread |
| * which will do the dispatch using TQ_SLEEP to guarantee success. |
| */ |
| mutex_enter(&ipst->ips_capab_taskq_lock); |
| if (ipst->ips_capab_taskq_head == NULL) { |
| ASSERT(ipst->ips_capab_taskq_tail == NULL); |
| ipst->ips_capab_taskq_head = mp; |
| } else { |
| ipst->ips_capab_taskq_tail->b_next = mp; |
| } |
| ipst->ips_capab_taskq_tail = mp; |
| |
| cv_signal(&ipst->ips_capab_taskq_cv); |
| mutex_exit(&ipst->ips_capab_taskq_lock); |
| } |
| |
| /* |
| * This routine is called to scan the fragmentation reassembly table for |
| * the specified ILL for any packets that are starting to smell. |
| * dead_interval is the maximum time in seconds that will be tolerated. It |
| * will either be the value specified in ip_g_frag_timeout, or zero if the |
| * ILL is shutting down and it is time to blow everything off. |
| * |
| * It returns the number of seconds (as a time_t) that the next frag timer |
| * should be scheduled for, 0 meaning that the timer doesn't need to be |
| * re-started. Note that the method of calculating next_timeout isn't |
| * entirely accurate since time will flow between the time we grab |
| * current_time and the time we schedule the next timeout. This isn't a |
* big problem since this is the timer for sending ICMP reassembly time
* exceeded messages, and it doesn't have to be exactly accurate.
*
* This function is sometimes called as writer, although this is not
* required.
| */ |
| time_t |
| ill_frag_timeout(ill_t *ill, time_t dead_interval) |
| { |
| ipfb_t *ipfb; |
| ipfb_t *endp; |
| ipf_t *ipf; |
| ipf_t *ipfnext; |
| mblk_t *mp; |
| time_t current_time = gethrestime_sec(); |
| time_t next_timeout = 0; |
| uint32_t hdr_length; |
| mblk_t *send_icmp_head; |
| mblk_t *send_icmp_head_v6; |
| ip_stack_t *ipst = ill->ill_ipst; |
| ip_recv_attr_t iras; |
| |
| bzero(&iras, sizeof (iras)); |
| iras.ira_flags = 0; |
| iras.ira_ill = iras.ira_rill = ill; |
| iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex; |
| iras.ira_rifindex = iras.ira_ruifindex; |
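/*
* iras serves as a template of receive attributes for the delayed
* ICMP time exceeded errors generated further below.
*/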
| |
| ipfb = ill->ill_frag_hash_tbl; |
| if (ipfb == NULL) |
return (0);
| endp = &ipfb[ILL_FRAG_HASH_TBL_COUNT]; |
| /* Walk the frag hash table. */ |
| for (; ipfb < endp; ipfb++) { |
| send_icmp_head = NULL; |
| send_icmp_head_v6 = NULL; |
| mutex_enter(&ipfb->ipfb_lock); |
while ((ipf = ipfb->ipfb_ipf) != NULL) {
| time_t frag_time = current_time - ipf->ipf_timestamp; |
| time_t frag_timeout; |
| |
| if (frag_time < dead_interval) { |
| /* |
| * There are some outstanding fragments |
| * that will timeout later. Make note of |
| * the time so that we can reschedule the |
| * next timeout appropriately. |
| */ |
| frag_timeout = dead_interval - frag_time; |
| if (next_timeout == 0 || |
| frag_timeout < next_timeout) { |
| next_timeout = frag_timeout; |
| } |
| break; |
| } |
| /* Time's up. Get it out of here. */ |
| hdr_length = ipf->ipf_nf_hdr_len; |
| ipfnext = ipf->ipf_hash_next; |
| if (ipfnext) |
| ipfnext->ipf_ptphn = ipf->ipf_ptphn; |
| *ipf->ipf_ptphn = ipfnext; |
| mp = ipf->ipf_mp->b_cont; |
| for (; mp; mp = mp->b_cont) { |
| /* Extra points for neatness. */ |
| IP_REASS_SET_START(mp, 0); |
| IP_REASS_SET_END(mp, 0); |
| } |
| mp = ipf->ipf_mp->b_cont; |
| atomic_add_32(&ill->ill_frag_count, -ipf->ipf_count); |
| ASSERT(ipfb->ipfb_count >= ipf->ipf_count); |
| ipfb->ipfb_count -= ipf->ipf_count; |
| ASSERT(ipfb->ipfb_frag_pkts > 0); |
| ipfb->ipfb_frag_pkts--; |
| /* |
| * We do not send any icmp message from here because |
| * we currently are holding the ipfb_lock for this |
| * hash chain. If we try and send any icmp messages |
| * from here we may end up via a put back into ip |
| * trying to get the same lock, causing a recursive |
| * mutex panic. Instead we build a list and send all |
| * the icmp messages after we have dropped the lock. |
| */ |
| if (ill->ill_isv6) { |
| if (hdr_length != 0) { |
| mp->b_next = send_icmp_head_v6; |
| send_icmp_head_v6 = mp; |
| } else { |
| freemsg(mp); |
| } |
| } else { |
| if (hdr_length != 0) { |
| mp->b_next = send_icmp_head; |
| send_icmp_head = mp; |
| } else { |
| freemsg(mp); |
| } |
| } |
| BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmFails); |
| ip_drop_input("ipIfStatsReasmFails", ipf->ipf_mp, ill); |
| freeb(ipf->ipf_mp); |
| } |
| mutex_exit(&ipfb->ipfb_lock); |
| /* |
| * Now need to send any icmp messages that we delayed from |
| * above. |
| */ |
| while (send_icmp_head_v6 != NULL) { |
| ip6_t *ip6h; |
| |
| mp = send_icmp_head_v6; |
| send_icmp_head_v6 = send_icmp_head_v6->b_next; |
| mp->b_next = NULL; |
| ip6h = (ip6_t *)mp->b_rptr; |
| iras.ira_flags = 0; |
| /* |
| * This will result in an incorrect ALL_ZONES zoneid |
| * for multicast packets, but we |
| * don't send ICMP errors for those in any case. |
| */ |
| iras.ira_zoneid = |
| ipif_lookup_addr_zoneid_v6(&ip6h->ip6_dst, |
| ill, ipst); |
| ip_drop_input("ICMP_TIME_EXCEEDED reass", mp, ill); |
| icmp_time_exceeded_v6(mp, |
| ICMP_REASSEMBLY_TIME_EXCEEDED, B_FALSE, |
| &iras); |
| ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE)); |
| } |
| while (send_icmp_head != NULL) { |
| ipaddr_t dst; |
| |
| mp = send_icmp_head; |
| send_icmp_head = send_icmp_head->b_next; |
| mp->b_next = NULL; |
| |
| dst = ((ipha_t *)mp->b_rptr)->ipha_dst; |
| |
| iras.ira_flags = IRAF_IS_IPV4; |
| /* |
| * This will result in an incorrect ALL_ZONES zoneid |
| * for broadcast and multicast packets, but we |
| * don't send ICMP errors for those in any case. |
| */ |
| iras.ira_zoneid = ipif_lookup_addr_zoneid(dst, |
| ill, ipst); |
| ip_drop_input("ICMP_TIME_EXCEEDED reass", mp, ill); |
| icmp_time_exceeded(mp, |
| ICMP_REASSEMBLY_TIME_EXCEEDED, &iras); |
| ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE)); |
| } |
| } |
| /* |
| * A non-dying ILL will use the return value to decide whether to |
| * restart the frag timer, and for how long. |
| */ |
| return (next_timeout); |
| } |
| |
| /* |
| * This routine is called when the approximate count of mblk memory used |
| * for the specified ILL has exceeded max_count. |
| */ |
| void |
| ill_frag_prune(ill_t *ill, uint_t max_count) |
| { |
| ipfb_t *ipfb; |
| ipf_t *ipf; |
| size_t count; |
| clock_t now; |
| |
| /* |
* If we are called again within ip_min_frag_prune_time msecs of the
* last prune, increment ill_frag_free_num_pkts so that more of the
* oldest packets are freed from each bucket below; otherwise reset
* the count to zero.
| */ |
| mutex_enter(&ill->ill_lock); |
| now = ddi_get_lbolt(); |
| if (TICK_TO_MSEC(now - ill->ill_last_frag_clean_time) <= |
| (ip_min_frag_prune_time != 0 ? |
| ip_min_frag_prune_time : msec_per_tick)) { |
| |
| ill->ill_frag_free_num_pkts++; |
| |
| } else { |
| ill->ill_frag_free_num_pkts = 0; |
| } |
| ill->ill_last_frag_clean_time = now; |
| mutex_exit(&ill->ill_lock); |
| |
| /* |
| * free ill_frag_free_num_pkts oldest packets from each bucket. |
| */ |
| if (ill->ill_frag_free_num_pkts != 0) { |
| int ix; |
| |
| for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) { |
| ipfb = &ill->ill_frag_hash_tbl[ix]; |
| mutex_enter(&ipfb->ipfb_lock); |
| if (ipfb->ipfb_ipf != NULL) { |
| ill_frag_free_pkts(ill, ipfb, ipfb->ipfb_ipf, |
| ill->ill_frag_free_num_pkts); |
| } |
| mutex_exit(&ipfb->ipfb_lock); |
| } |
| } |
| /* |
| * While the reassembly list for this ILL is too big, prune a fragment |
| * queue by age, oldest first. |
| */ |
| while (ill->ill_frag_count > max_count) { |
| int ix; |
| ipfb_t *oipfb = NULL; |
| uint_t oldest = UINT_MAX; |
| |
| count = 0; |
| for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) { |
| ipfb = &ill->ill_frag_hash_tbl[ix]; |
| mutex_enter(&ipfb->ipfb_lock); |
| ipf = ipfb->ipfb_ipf; |
| if (ipf != NULL && ipf->ipf_gen < oldest) { |
| oldest = ipf->ipf_gen; |
| oipfb = ipfb; |
| } |
| count += ipfb->ipfb_count; |
| mutex_exit(&ipfb->ipfb_lock); |
| } |
| if (oipfb == NULL) |
| break; |
| |
| if (count <= max_count) |
| return; /* Somebody beat us to it, nothing to do */ |
| mutex_enter(&oipfb->ipfb_lock); |
| ipf = oipfb->ipfb_ipf; |
| if (ipf != NULL) { |
| ill_frag_free_pkts(ill, oipfb, ipf, 1); |
| } |
| mutex_exit(&oipfb->ipfb_lock); |
| } |
| } |
| |
| /* |
| * free 'free_cnt' fragmented packets starting at ipf. |
| */ |
| void |
| ill_frag_free_pkts(ill_t *ill, ipfb_t *ipfb, ipf_t *ipf, int free_cnt) |
| { |
| size_t count; |
| mblk_t *mp; |
| mblk_t *tmp; |
| ipf_t **ipfp = ipf->ipf_ptphn; |
| |
| ASSERT(MUTEX_HELD(&ipfb->ipfb_lock)); |
| ASSERT(ipfp != NULL); |
| ASSERT(ipf != NULL); |
| |
| while (ipf != NULL && free_cnt-- > 0) { |
| count = ipf->ipf_count; |
| mp = ipf->ipf_mp; |
| ipf = ipf->ipf_hash_next; |
| for (tmp = mp; tmp; tmp = tmp->b_cont) { |
| IP_REASS_SET_START(tmp, 0); |
| IP_REASS_SET_END(tmp, 0); |
| } |
| atomic_add_32(&ill->ill_frag_count, -count); |
| ASSERT(ipfb->ipfb_count >= count); |
| ipfb->ipfb_count -= count; |
| ASSERT(ipfb->ipfb_frag_pkts > 0); |
| ipfb->ipfb_frag_pkts--; |
| BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmFails); |
| ip_drop_input("ipIfStatsReasmFails", mp, ill); |
| freemsg(mp); |
| } |
| |
| if (ipf) |
| ipf->ipf_ptphn = ipfp; |
| ipfp[0] = ipf; |
| } |
| |
| /* |
| * Helper function for ill_forward_set(). |
| */ |
| static void |
| ill_forward_set_on_ill(ill_t *ill, boolean_t enable) |
| { |
| ip_stack_t *ipst = ill->ill_ipst; |
| |
| ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ipst->ips_ill_g_lock)); |
| |
| ip1dbg(("ill_forward_set: %s %s forwarding on %s", |
| (enable ? "Enabling" : "Disabling"), |
| (ill->ill_isv6 ? "IPv6" : "IPv4"), ill->ill_name)); |
| mutex_enter(&ill->ill_lock); |
| if (enable) |
| ill->ill_flags |= ILLF_ROUTER; |
| else |
| ill->ill_flags &= ~ILLF_ROUTER; |
| mutex_exit(&ill->ill_lock); |
| if (ill->ill_isv6) |
| ill_set_nce_router_flags(ill, enable); |
| /* Notify routing socket listeners of this change. */ |
| if (ill->ill_ipif != NULL) |
| ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT); |
| } |
| |
| /* |
| * Set an ill's ILLF_ROUTER flag appropriately. Send up RTS_IFINFO routing |
| * socket messages for each interface whose flags we change. |
| */ |
| int |
| ill_forward_set(ill_t *ill, boolean_t enable) |
| { |
| ipmp_illgrp_t *illg; |
| ip_stack_t *ipst = ill->ill_ipst; |
| |
| ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ipst->ips_ill_g_lock)); |
| |
| if ((enable && (ill->ill_flags & ILLF_ROUTER)) || |
| (!enable && !(ill->ill_flags & ILLF_ROUTER))) |
| return (0); |
| |
| if (IS_LOOPBACK(ill)) |
| return (EINVAL); |
| |
| if (enable && ill->ill_allowed_ips_cnt > 0) |
| return (EPERM); |
| |
| if (IS_IPMP(ill) || IS_UNDER_IPMP(ill)) { |
| /* |
| * Update all of the interfaces in the group. |
| */ |
| illg = ill->ill_grp; |
| ill = list_head(&illg->ig_if); |
| for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) |
| ill_forward_set_on_ill(ill, enable); |
| |
| /* |
| * Update the IPMP meta-interface. |
| */ |
| ill_forward_set_on_ill(ipmp_illgrp_ipmp_ill(illg), enable); |
| return (0); |
| } |
| |
| ill_forward_set_on_ill(ill, enable); |
| return (0); |
| } |
| |
| /* |
| * Based on the ILLF_ROUTER flag of an ill, make sure all local nce's for |
| * addresses assigned to the ill have the NCE_F_ISROUTER flag appropriately |
| * set or clear. |
| */ |
| static void |
| ill_set_nce_router_flags(ill_t *ill, boolean_t enable) |
| { |
| ipif_t *ipif; |
| ncec_t *ncec; |
| nce_t *nce; |
| |
| for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { |
| /* |
| * NOTE: we match across the illgrp because nce's for |
| * addresses on IPMP interfaces have an nce_ill that points to |
| * the bound underlying ill. |
| */ |
| nce = nce_lookup_v6(ill, &ipif->ipif_v6lcl_addr); |
| if (nce != NULL) { |
| ncec = nce->nce_common; |
| mutex_enter(&ncec->ncec_lock); |
| if (enable) |
| ncec->ncec_flags |= NCE_F_ISROUTER; |
| else |
| ncec->ncec_flags &= ~NCE_F_ISROUTER; |
| mutex_exit(&ncec->ncec_lock); |
| nce_refrele(nce); |
| } |
| } |
| } |
| |
| /* |
* Initializes the context structure and returns the first ill in the list.
* Currently start_list and end_list can have values:
| * MAX_G_HEADS Traverse both IPV4 and IPV6 lists. |
| * IP_V4_G_HEAD Traverse IPV4 list only. |
| * IP_V6_G_HEAD Traverse IPV6 list only. |
| */ |
| |
| /* |
| * We don't check for CONDEMNED ills here. Caller must do that if |
| * necessary under the ill lock. |
| */ |
| ill_t * |
| ill_first(int start_list, int end_list, ill_walk_context_t *ctx, |
| ip_stack_t *ipst) |
| { |
| ill_if_t *ifp; |
| ill_t *ill; |
| avl_tree_t *avl_tree; |
| |
| ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock)); |
| ASSERT(end_list <= MAX_G_HEADS && start_list >= 0); |
| |
| /* |
| * setup the lists to search |
| */ |
| if (end_list != MAX_G_HEADS) { |
| ctx->ctx_current_list = start_list; |
| ctx->ctx_last_list = end_list; |
| } else { |
| ctx->ctx_last_list = MAX_G_HEADS - 1; |
| ctx->ctx_current_list = 0; |
| } |
| |
| while (ctx->ctx_current_list <= ctx->ctx_last_list) { |
| ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list, |