blob: 7e6cf3b4d74e7474d1fde83dd70d4cbf86ffe262 [file] [log] [blame]
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1990 Mentat Inc.
* Copyright (c) 2017 OmniTI Computer Consulting, Inc. All rights reserved.
* Copyright (c) 2016 by Delphix. All rights reserved.
* Copyright (c) 2018 Joyent, Inc. All rights reserved.
*/
#include <sys/types.h>
#include <sys/stream.h>
#include <sys/dlpi.h>
#include <sys/stropts.h>
#include <sys/sysmacros.h>
#include <sys/strsubr.h>
#include <sys/strlog.h>
#include <sys/strsun.h>
#include <sys/zone.h>
#define _SUN_TPI_VERSION 2
#include <sys/tihdr.h>
#include <sys/xti_inet.h>
#include <sys/ddi.h>
#include <sys/suntpi.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/kobj.h>
#include <sys/modctl.h>
#include <sys/atomic.h>
#include <sys/policy.h>
#include <sys/priv.h>
#include <sys/taskq.h>
#include <sys/systm.h>
#include <sys/param.h>
#include <sys/kmem.h>
#include <sys/sdt.h>
#include <sys/socket.h>
#include <sys/vtrace.h>
#include <sys/isa_defs.h>
#include <sys/mac.h>
#include <net/if.h>
#include <net/if_arp.h>
#include <net/route.h>
#include <sys/sockio.h>
#include <netinet/in.h>
#include <net/if_dl.h>
#include <inet/common.h>
#include <inet/mi.h>
#include <inet/mib2.h>
#include <inet/nd.h>
#include <inet/arp.h>
#include <inet/snmpcom.h>
#include <inet/optcom.h>
#include <inet/kstatcom.h>
#include <netinet/igmp_var.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet/sctp.h>
#include <inet/ip.h>
#include <inet/ip_impl.h>
#include <inet/ip6.h>
#include <inet/ip6_asp.h>
#include <inet/tcp.h>
#include <inet/tcp_impl.h>
#include <inet/ip_multi.h>
#include <inet/ip_if.h>
#include <inet/ip_ire.h>
#include <inet/ip_ftable.h>
#include <inet/ip_rts.h>
#include <inet/ip_ndp.h>
#include <inet/ip_listutils.h>
#include <netinet/igmp.h>
#include <netinet/ip_mroute.h>
#include <inet/ipp_common.h>
#include <net/pfkeyv2.h>
#include <inet/sadb.h>
#include <inet/ipsec_impl.h>
#include <inet/iptun/iptun_impl.h>
#include <inet/ipdrop.h>
#include <inet/ip_netinfo.h>
#include <inet/ilb_ip.h>
#include <sys/ethernet.h>
#include <net/if_types.h>
#include <sys/cpuvar.h>
#include <ipp/ipp.h>
#include <ipp/ipp_impl.h>
#include <ipp/ipgpc/ipgpc.h>
#include <sys/pattr.h>
#include <inet/ipclassifier.h>
#include <inet/sctp_ip.h>
#include <inet/sctp/sctp_impl.h>
#include <inet/udp_impl.h>
#include <inet/rawip_impl.h>
#include <inet/rts_impl.h>
#include <sys/tsol/label.h>
#include <sys/tsol/tnet.h>
#include <sys/squeue_impl.h>
#include <inet/ip_arp.h>
#include <sys/clock_impl.h> /* For LBOLT_FASTPATH{,64} */
/*
 * Squeue entry policy.  Values for the squeue switch:
 *	IP_SQUEUE_ENTER_NODRAIN:	SQ_NODRAIN
 *	IP_SQUEUE_ENTER:		SQ_PROCESS
 *	IP_SQUEUE_FILL:			SQ_FILL
 * ip_squeue_flag holds the corresponding SQ_* value actually used at
 * runtime (NOTE(review): presumably derived from ip_squeue_enter via
 * ip_squeue_switch() at init — confirm).
 */
int ip_squeue_enter = IP_SQUEUE_ENTER;	/* Setable in /etc/system */
int ip_squeue_flag;

/*
 * Polling and module-close tunables; all setable in /etc/system.
 * ip_poll_normal_ticks is the tick equivalent of ip_poll_normal_ms
 * (NOTE(review): presumably computed from it at init — confirm).
 */
int ip_poll_normal_ms = 100;
int ip_poll_normal_ticks = 0;
int ip_modclose_ackwait_ms = 3000;	/* how long to wait for DLPI acks on close */

/*
 * Debug-thread bookkeeping.  It would be nice to have these present only
 * in DEBUG systems, but the current design of the global symbol checking
 * logic requires them to be unconditionally present.
 */
uint_t ip_thread_data;			/* TSD key for debug support */
krwlock_t ip_thread_rwlock;		/* protects ip_thread_list */
list_t ip_thread_list;			/* list of IP threads (debug support) */
/*
 * Structure to represent a linked list of msgblks, tracked by both head
 * and tail pointers.  Used by the ip_snmp_* functions to accumulate
 * reply data.
 */
struct listptr_s {
	mblk_t	*lp_head;	/* pointer to the head of the list */
	mblk_t	*lp_tail;	/* pointer to the tail of the list */
};
typedef struct listptr_s listptr_t;
/*
 * This is used by ip_snmp_get_mib2_ip_route_media and
 * ip_snmp_get_mib2_ip6_route_media to carry the lists of return data
 * while walking the routing tables.
 */
typedef struct iproutedata_s {
	uint_t		ird_idx;	/* NOTE(review): walker index — confirm use */
	uint_t		ird_flags;	/* IRD_* flags; see below */
	listptr_t	ird_route;	/* ipRouteEntryTable */
	listptr_t	ird_netmedia;	/* ipNetToMediaEntryTable */
	listptr_t	ird_attrs;	/* ipRouteAttributeTable */
} iproutedata_t;

/* ird_flags: include ire_testhidden and IRE_IF_CLONE routes in the report */
#define IRD_REPORT_ALL 0x01
/*
 * Cluster networking hook vectors.  These should be NULL when booted as a
 * non-cluster; the cluster framework fills them in when present.
 */

/*
 * Hook function to check whether a specified IP address is a shared IP
 * address in the cluster.
 */
int (*cl_inet_isclusterwide)(netstackid_t stack_id, uint8_t protocol,
    sa_family_t addr_family, uint8_t *laddrp, void *args) = NULL;

/*
 * Hook function to generate a cluster-wide IP fragment identifier.
 */
uint32_t (*cl_inet_ipident)(netstackid_t stack_id, uint8_t protocol,
    sa_family_t addr_family, uint8_t *laddrp, uint8_t *faddrp,
    void *args) = NULL;

/*
 * Hook function to generate a cluster-wide SPI.
 */
void (*cl_inet_getspi)(netstackid_t, uint8_t, uint8_t *, size_t,
    void *) = NULL;

/*
 * Hook function to verify whether the SPI is already utilized.
 */
int (*cl_inet_checkspi)(netstackid_t, uint8_t, uint32_t, void *) = NULL;

/*
 * Hook function to delete the SPI from the cluster-wide repository.
 */
void (*cl_inet_deletespi)(netstackid_t, uint8_t, uint32_t, void *) = NULL;

/*
 * Hook function to inform the cluster when a packet is received on an
 * IDLE SA.
 */
void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t,
    in6_addr_t, in6_addr_t, void *) = NULL;
/*
* Synchronization notes:
*
* IP is a fully D_MP STREAMS module/driver. Thus it does not depend on any
* MT level protection given by STREAMS. IP uses a combination of its own
* internal serialization mechanism and standard Solaris locking techniques.
* The internal serialization is per phyint. This is used to serialize
* plumbing operations, IPMP operations, most set ioctls, etc.
*
* Plumbing is a long sequence of operations involving message
* exchanges between IP, ARP and device drivers. Many set ioctls are typically
* involved in plumbing operations. A natural model is to serialize these
* ioctls one per ill. For example plumbing of hme0 and qfe0 can go on in
* parallel without any interference. But various set ioctls on hme0 are best
* serialized, along with IPMP operations and processing of DLPI control
* messages received from drivers on a per phyint basis. This serialization is
* provided by the ipsq_t and primitives operating on this. Details can
* be found in ip_if.c above the core primitives operating on ipsq_t.
*
* Lookups of an ipif or ill by a thread return a refheld ipif / ill.
 * Similarly, lookup of an ire by a thread also returns a refheld ire.
* In addition ipif's and ill's referenced by the ire are also indirectly
* refheld. Thus no ipif or ill can vanish as long as an ipif is refheld
* directly or indirectly. For example an SIOCSLIFADDR ioctl that changes the
* address of an ipif has to go through the ipsq_t. This ensures that only
* one such exclusive operation proceeds at any time on the ipif. It then
* waits for all refcnts
* associated with this ipif to come down to zero. The address is changed
* only after the ipif has been quiesced. Then the ipif is brought up again.
* More details are described above the comment in ip_sioctl_flags.
*
* Packet processing is based mostly on IREs and are fully multi-threaded
* using standard Solaris MT techniques.
*
* There are explicit locks in IP to handle:
* - The ip_g_head list maintained by mi_open_link() and friends.
*
* - The reassembly data structures (one lock per hash bucket)
*
* - conn_lock is meant to protect conn_t fields. The fields actually
* protected by conn_lock are documented in the conn_t definition.
*
* - ire_lock to protect some of the fields of the ire, IRE tables
* (one lock per hash bucket). Refer to ip_ire.c for details.
*
* - ndp_g_lock and ncec_lock for protecting NCEs.
*
* - ill_lock protects fields of the ill and ipif. Details in ip.h
*
* - ill_g_lock: This is a global reader/writer lock. Protects the following
* * The AVL tree based global multi list of all ills.
* * The linked list of all ipifs of an ill
* * The <ipsq-xop> mapping
* * <ill-phyint> association
* Insertion/deletion of an ill in the system, insertion/deletion of an ipif
* into an ill, changing the <ipsq-xop> mapping of an ill, changing the
* <ill-phyint> assoc of an ill will all have to hold the ill_g_lock as
* writer for the actual duration of the insertion/deletion/change.
*
* - ill_lock: This is a per ill mutex.
* It protects some members of the ill_t struct; see ip.h for details.
* It also protects the <ill-phyint> assoc.
* It also protects the list of ipifs hanging off the ill.
*
* - ipsq_lock: This is a per ipsq_t mutex lock.
* This protects some members of the ipsq_t struct; see ip.h for details.
* It also protects the <ipsq-ipxop> mapping
*
* - ipx_lock: This is a per ipxop_t mutex lock.
* This protects some members of the ipxop_t struct; see ip.h for details.
*
* - phyint_lock: This is a per phyint mutex lock. Protects just the
* phyint_flags
*
* - ip_addr_avail_lock: This is used to ensure the uniqueness of IP addresses.
* This lock is held in ipif_up_done and the ipif is marked IPIF_UP and the
* uniqueness check also done atomically.
*
* - ill_g_usesrc_lock: This readers/writer lock protects the usesrc
* group list linked by ill_usesrc_grp_next. It also protects the
* ill_usesrc_ifindex field. It is taken as a writer when a member of the
* group is being added or deleted. This lock is taken as a reader when
* walking the list/group(eg: to get the number of members in a usesrc group).
* Note, it is only necessary to take this lock if the ill_usesrc_grp_next
* field is changing state i.e from NULL to non-NULL or vice-versa. For
* example, it is not necessary to take this lock in the initial portion
* of ip_sioctl_slifusesrc or at all in ip_sioctl_flags since these
* operations are executed exclusively and that ensures that the "usesrc
* group state" cannot change. The "usesrc group state" change can happen
* only in the latter part of ip_sioctl_slifusesrc and in ill_delete.
*
 * Changing <ill-phyint>, <ipsq-xop> associations:
*
* To change the <ill-phyint> association, the ill_g_lock must be held
* as writer, and the ill_locks of both the v4 and v6 instance of the ill
* must be held.
*
* To change the <ipsq-xop> association, the ill_g_lock must be held as
* writer, the ipsq_lock must be held, and one must be writer on the ipsq.
* This is only done when ills are added or removed from IPMP groups.
*
* To add or delete an ipif from the list of ipifs hanging off the ill,
* ill_g_lock (writer) and ill_lock must be held and the thread must be
* a writer on the associated ipsq.
*
* To add or delete an ill to the system, the ill_g_lock must be held as
* writer and the thread must be a writer on the associated ipsq.
*
* To add or delete an ilm to an ill, the ill_lock must be held and the thread
* must be a writer on the associated ipsq.
*
* Lock hierarchy
*
* Some lock hierarchy scenarios are listed below.
*
* ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock -> ipx_lock
* ill_g_lock -> ill_lock(s) -> phyint_lock
* ill_g_lock -> ndp_g_lock -> ill_lock -> ncec_lock
* ill_g_lock -> ip_addr_avail_lock
* conn_lock -> irb_lock -> ill_lock -> ire_lock
* ill_g_lock -> ip_g_nd_lock
* ill_g_lock -> ips_ipmp_lock -> ill_lock -> nce_lock
* ill_g_lock -> ndp_g_lock -> ill_lock -> ncec_lock -> nce_lock
* arl_lock -> ill_lock
* ips_ire_dep_lock -> irb_lock
*
* When more than 1 ill lock is needed to be held, all ill lock addresses
* are sorted on address and locked starting from highest addressed lock
* downward.
*
* Multicast scenarios
* ips_ill_g_lock -> ill_mcast_lock
* conn_ilg_lock -> ips_ill_g_lock -> ill_lock
* ill_mcast_serializer -> ill_mcast_lock -> ips_ipmp_lock -> ill_lock
* ill_mcast_serializer -> ill_mcast_lock -> connf_lock -> conn_lock
* ill_mcast_serializer -> ill_mcast_lock -> conn_ilg_lock
* ill_mcast_serializer -> ill_mcast_lock -> ips_igmp_timer_lock
*
* IPsec scenarios
*
* ipsa_lock -> ill_g_lock -> ill_lock
* ill_g_usesrc_lock -> ill_g_lock -> ill_lock
*
* Trusted Solaris scenarios
*
* igsa_lock -> gcgrp_rwlock -> gcgrp_lock
* igsa_lock -> gcdb_lock
* gcgrp_rwlock -> ire_lock
* gcgrp_rwlock -> gcdb_lock
*
* squeue(sq_lock), flow related (ft_lock, fe_lock) locking
*
* cpu_lock --> ill_lock --> sqset_lock --> sq_lock
* sq_lock -> conn_lock -> QLOCK(q)
* ill_lock -> ft_lock -> fe_lock
*
* Routing/forwarding table locking notes:
*
* Lock acquisition order: Radix tree lock, irb_lock.
* Requirements:
* i. Walker must not hold any locks during the walker callback.
* ii Walker must not see a truncated tree during the walk because of any node
* deletion.
* iii Existing code assumes ire_bucket is valid if it is non-null and is used
* in many places in the code to walk the irb list. Thus even if all the
* ires in a bucket have been deleted, we still can't free the radix node
* until the ires have actually been inactive'd (freed).
*
* Tree traversal - Need to hold the global tree lock in read mode.
 * Before dropping the global tree lock, need to increment the ire_refcnt
 * to ensure that the radix node can't be deleted.
*
* Tree add - Need to hold the global tree lock in write mode to add a
* radix node. To prevent the node from being deleted, increment the
* irb_refcnt, after the node is added to the tree. The ire itself is
* added later while holding the irb_lock, but not the tree lock.
*
* Tree delete - Need to hold the global tree lock and irb_lock in write mode.
* All associated ires must be inactive (i.e. freed), and irb_refcnt
* must be zero.
*
* Walker - Increment irb_refcnt before calling the walker callback. Hold the
* global tree lock (read mode) for traversal.
*
* IRE dependencies - In some cases we hold ips_ire_dep_lock across ire_refrele
* hence we will acquire irb_lock while holding ips_ire_dep_lock.
*
* IPsec notes :
*
* IP interacts with the IPsec code (AH/ESP) by storing IPsec attributes
* in the ip_xmit_attr_t ip_recv_attr_t. For outbound datagrams, the
* ip_xmit_attr_t has the
* information used by the IPsec code for applying the right level of
* protection. The information initialized by IP in the ip_xmit_attr_t
* is determined by the per-socket policy or global policy in the system.
* For inbound datagrams, the ip_recv_attr_t
* starts out with nothing in it. It gets filled
* with the right information if it goes through the AH/ESP code, which
* happens if the incoming packet is secure. The information initialized
* by AH/ESP, is later used by IP (during fanouts to ULP) to see whether
* the policy requirements needed by per-socket policy or global policy
* is met or not.
*
* For fully connected sockets i.e dst, src [addr, port] is known,
* conn_policy_cached is set indicating that policy has been cached.
* conn_in_enforce_policy may or may not be set depending on whether
* there is a global policy match or per-socket policy match.
 * Policy inheriting happens in ip_policy_set once the destination is known.
* Once the right policy is set on the conn_t, policy cannot change for
* this socket. This makes life simpler for TCP (UDP ?) where
* re-transmissions go out with the same policy. For symmetry, policy
* is cached for fully connected UDP sockets also. Thus if policy is cached,
* it also implies that policy is latched i.e policy cannot change
* on these sockets. As we have the right policy on the conn, we don't
* have to lookup global policy for every outbound and inbound datagram
* and thus serving as an optimization. Note that a global policy change
* does not affect fully connected sockets if they have policy. If fully
* connected sockets did not have any policy associated with it, global
* policy change may affect them.
*
* IP Flow control notes:
* ---------------------
* Non-TCP streams are flow controlled by IP. The way this is accomplished
* differs when ILL_CAPAB_DLD_DIRECT is enabled for that IP instance. When
* ILL_DIRECT_CAPABLE(ill) is TRUE, IP can do direct function calls into
* GLDv3. Otherwise packets are sent down to lower layers using STREAMS
* functions.
*
* Per Tx ring udp flow control:
* This is applicable only when ILL_CAPAB_DLD_DIRECT capability is set in
* the ill (i.e. ILL_DIRECT_CAPABLE(ill) is true).
*
* The underlying link can expose multiple Tx rings to the GLDv3 mac layer.
* To achieve best performance, outgoing traffic need to be fanned out among
* these Tx ring. mac_tx() is called (via str_mdata_fastpath_put()) to send
* traffic out of the NIC and it takes a fanout hint. UDP connections pass
* the address of connp as fanout hint to mac_tx(). Under flow controlled
* condition, mac_tx() returns a non-NULL cookie (ip_mac_tx_cookie_t). This
* cookie points to a specific Tx ring that is blocked. The cookie is used to
* hash into an idl_tx_list[] entry in idl_tx_list[] array. Each idl_tx_list_t
* point to drain_lists (idl_t's). These drain list will store the blocked UDP
* connp's. The drain list is not a single list but a configurable number of
* lists.
*
* The diagram below shows idl_tx_list_t's and their drain_lists. ip_stack_t
* has an array of idl_tx_list_t. The size of the array is TX_FANOUT_SIZE
* which is equal to 128. This array in turn contains a pointer to idl_t[],
* the ip drain list. The idl_t[] array size is MIN(max_ncpus, 8). The drain
* list will point to the list of connp's that are flow controlled.
*
* --------------- ------- ------- -------
* |->|drain_list[0]|-->|connp|-->|connp|-->|connp|-->
* | --------------- ------- ------- -------
* | --------------- ------- ------- -------
* |->|drain_list[1]|-->|connp|-->|connp|-->|connp|-->
* ---------------- | --------------- ------- ------- -------
* |idl_tx_list[0]|->| --------------- ------- ------- -------
* ---------------- |->|drain_list[2]|-->|connp|-->|connp|-->|connp|-->
* | --------------- ------- ------- -------
* . . . . .
* | --------------- ------- ------- -------
* |->|drain_list[n]|-->|connp|-->|connp|-->|connp|-->
* --------------- ------- ------- -------
* --------------- ------- ------- -------
* |->|drain_list[0]|-->|connp|-->|connp|-->|connp|-->
* | --------------- ------- ------- -------
* | --------------- ------- ------- -------
* ---------------- |->|drain_list[1]|-->|connp|-->|connp|-->|connp|-->
* |idl_tx_list[1]|->| --------------- ------- ------- -------
* ---------------- | . . . .
* | --------------- ------- ------- -------
* |->|drain_list[n]|-->|connp|-->|connp|-->|connp|-->
* --------------- ------- ------- -------
* .....
* ----------------
* |idl_tx_list[n]|-> ...
* ----------------
*
* When mac_tx() returns a cookie, the cookie is hashed into an index into
* ips_idl_tx_list[], and conn_drain_insert() is called with the idl_tx_list
* to insert the conn onto. conn_drain_insert() asserts flow control for the
* sockets via su_txq_full() (non-STREAMS) or QFULL on conn_wq (STREAMS).
* Further, conn_blocked is set to indicate that the conn is blocked.
*
* GLDv3 calls ill_flow_enable() when flow control is relieved. The cookie
* passed in the call to ill_flow_enable() identifies the blocked Tx ring and
* is again hashed to locate the appropriate idl_tx_list, which is then
* drained via conn_walk_drain(). conn_walk_drain() goes through each conn in
* the drain list and calls conn_drain_remove() to clear flow control (via
* calling su_txq_full() or clearing QFULL), and remove the conn from the
* drain list.
*
* Note that the drain list is not a single list but a (configurable) array of
* lists (8 elements by default). Synchronization between drain insertion and
* flow control wakeup is handled by using idl_txl->txl_lock, and only
* conn_drain_insert() and conn_drain_remove() manipulate the drain list.
*
* Flow control via STREAMS is used when ILL_DIRECT_CAPABLE() returns FALSE.
* On the send side, if the packet cannot be sent down to the driver by IP
* (canput() fails), ip_xmit() drops the packet and returns EWOULDBLOCK to the
* caller, who may then invoke ixa_check_drain_insert() to insert the conn on
* the 0'th drain list. When ip_wsrv() runs on the ill_wq because flow
* control has been relieved, the blocked conns in the 0'th drain list are
* drained as in the non-STREAMS case.
*
* In both the STREAMS and non-STREAMS cases, the sockfs upcall to set QFULL
* is done when the conn is inserted into the drain list (conn_drain_insert())
* and cleared when the conn is removed from the it (conn_drain_remove()).
*
* IPQOS notes:
*
* IPQoS Policies are applied to packets using IPPF (IP Policy framework)
* and IPQoS modules. IPPF includes hooks in IP at different control points
* (callout positions) which direct packets to IPQoS modules for policy
* processing. Policies, if present, are global.
*
* The callout positions are located in the following paths:
* o local_in (packets destined for this host)
 * o local_out (packets originating from this host)
* o fwd_in (packets forwarded by this m/c - inbound)
* o fwd_out (packets forwarded by this m/c - outbound)
* Hooks at these callout points can be enabled/disabled using the ndd variable
* ip_policy_mask (a bit mask with the 4 LSB indicating the callout positions).
* By default all the callout positions are enabled.
*
* Outbound (local_out)
* Hooks are placed in ire_send_wire_v4 and ire_send_wire_v6.
*
* Inbound (local_in)
* Hooks are placed in ip_fanout_v4 and ip_fanout_v6.
*
* Forwarding (in and out)
* Hooks are placed in ire_recv_forward_v4/v6.
*
* IP Policy Framework processing (IPPF processing)
* Policy processing for a packet is initiated by ip_process, which ascertains
* that the classifier (ipgpc) is loaded and configured, failing which the
 * packet resumes normal processing in IP. If the classifier is present, the
* packet is acted upon by one or more IPQoS modules (action instances), per
* filters configured in ipgpc and resumes normal IP processing thereafter.
* An action instance can drop a packet in course of its processing.
*
* Zones notes:
*
* The partitioning rules for networking are as follows:
* 1) Packets coming from a zone must have a source address belonging to that
* zone.
* 2) Packets coming from a zone can only be sent on a physical interface on
* which the zone has an IP address.
* 3) Between two zones on the same machine, packet delivery is only allowed if
* there's a matching route for the destination and zone in the forwarding
* table.
* 4) The TCP and UDP port spaces are per-zone; that is, two processes in
* different zones can bind to the same port with the wildcard address
* (INADDR_ANY).
*
* The granularity of interface partitioning is at the logical interface level.
* Therefore, every zone has its own IP addresses, and incoming packets can be
* attributed to a zone unambiguously. A logical interface is placed into a zone
* using the SIOCSLIFZONE ioctl; this sets the ipif_zoneid field in the ipif_t
* structure. Rule (1) is implemented by modifying the source address selection
* algorithm so that the list of eligible addresses is filtered based on the
* sending process zone.
*
* The Internet Routing Entries (IREs) are either exclusive to a zone or shared
* across all zones, depending on their type. Here is the break-up:
*
* IRE type Shared/exclusive
* -------- ----------------
* IRE_BROADCAST Exclusive
* IRE_DEFAULT (default routes) Shared (*)
* IRE_LOCAL Exclusive (x)
* IRE_LOOPBACK Exclusive
* IRE_PREFIX (net routes) Shared (*)
* IRE_IF_NORESOLVER (interface routes) Exclusive
* IRE_IF_RESOLVER (interface routes) Exclusive
* IRE_IF_CLONE (interface routes) Exclusive
* IRE_HOST (host routes) Shared (*)
*
* (*) A zone can only use a default or off-subnet route if the gateway is
* directly reachable from the zone, that is, if the gateway's address matches
* one of the zone's logical interfaces.
*
* (x) IRE_LOCAL are handled a bit differently.
* When ip_restrict_interzone_loopback is set (the default),
* ire_route_recursive restricts loopback using an IRE_LOCAL
 * between zones to the case when L2 would have conceptually looped the packet
* back, i.e. the loopback which is required since neither Ethernet drivers
* nor Ethernet hardware loops them back. This is the case when the normal
* routes (ignoring IREs with different zoneids) would send out the packet on
 * the same ill as the ill with which the IRE_LOCAL is associated.
*
* Multiple zones can share a common broadcast address; typically all zones
* share the 255.255.255.255 address. Incoming as well as locally originated
* broadcast packets must be dispatched to all the zones on the broadcast
* network. For directed broadcasts (e.g. 10.16.72.255) this is not trivial
* since some zones may not be on the 10.16.72/24 network. To handle this, each
* zone has its own set of IRE_BROADCAST entries; then, broadcast packets are
* sent to every zone that has an IRE_BROADCAST entry for the destination
* address on the input ill, see ip_input_broadcast().
*
* Applications in different zones can join the same multicast group address.
* The same logic applies for multicast as for broadcast. ip_input_multicast
* dispatches packets to all zones that have members on the physical interface.
*/
/*
 * Squeue fanout flags:
 *	0: No fanout.
 *	1: Fanout across all squeues.
 */
boolean_t ip_squeue_fanout = 0;

/*
 * Maximum dups allowed per packet (bounds duplicate fragments accepted;
 * NOTE(review): presumably enforced during reassembly — confirm).
 */
uint_t ip_max_frag_dups = 10;
/*
 * Forward declarations: the STREAMS open entry point, link-layer header
 * attachment, and the inbound ICMP helpers (verification, error fanout,
 * redirect handling and generation of ICMP error/reply packets).
 */
static int ip_open(queue_t *q, dev_t *devp, int flag, int sflag,
    cred_t *credp, boolean_t isv6);
static mblk_t *ip_xmit_attach_llhdr(mblk_t *, nce_t *);
static boolean_t icmp_inbound_verify_v4(mblk_t *, icmph_t *, ip_recv_attr_t *);
static void icmp_inbound_too_big_v4(icmph_t *, ip_recv_attr_t *);
static void icmp_inbound_error_fanout_v4(mblk_t *, icmph_t *,
    ip_recv_attr_t *);
static void icmp_options_update(ipha_t *);
static void icmp_param_problem(mblk_t *, uint8_t, ip_recv_attr_t *);
static void icmp_pkt(mblk_t *, void *, size_t, ip_recv_attr_t *);
static mblk_t *icmp_pkt_err_ok(mblk_t *, ip_recv_attr_t *);
static void icmp_redirect_v4(mblk_t *mp, ipha_t *, icmph_t *,
    ip_recv_attr_t *);
static void icmp_send_redirect(mblk_t *, ipaddr_t, ip_recv_attr_t *);
static void icmp_send_reply_v4(mblk_t *, ipha_t *, icmph_t *,
    ip_recv_attr_t *);
/*
 * Utility routines: DLPI message allocation, dotted-quad address
 * formatting, mblk carving, netmask derivation, name/value lookup and the
 * STREAMS read-side put procedures.
 */
mblk_t *ip_dlpi_alloc(size_t, t_uscalar_t);
char *ip_dot_addr(ipaddr_t, char *);
mblk_t *ip_carve_mp(mblk_t **, ssize_t);
static char *ip_dot_saddr(uchar_t *, char *);
static int ip_lrput(queue_t *, mblk_t *);
ipaddr_t ip_net_mask(ipaddr_t);
char *ip_nv_lookup(nv_t *, int);
int ip_rput(queue_t *, mblk_t *);
static void ip_rput_dlpi_writer(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp,
    void *dummy_arg);
/*
 * SNMP support: entry points and per-MIB-table helpers used to build the
 * MIB2 reply chains (see struct iproutedata_s above for the route/media
 * walkers), plus the fragmentation header-copy helper.
 */
int ip_snmp_get(queue_t *, mblk_t *, int, boolean_t);
static mblk_t *ip_snmp_get_mib2_ip(queue_t *, mblk_t *,
    mib2_ipIfStatsEntry_t *, ip_stack_t *, boolean_t);
static mblk_t *ip_snmp_get_mib2_ip_traffic_stats(queue_t *, mblk_t *,
    ip_stack_t *, boolean_t);
static mblk_t *ip_snmp_get_mib2_ip6(queue_t *, mblk_t *, ip_stack_t *,
    boolean_t);
static mblk_t *ip_snmp_get_mib2_icmp(queue_t *, mblk_t *, ip_stack_t *ipst);
static mblk_t *ip_snmp_get_mib2_icmp6(queue_t *, mblk_t *, ip_stack_t *ipst);
static mblk_t *ip_snmp_get_mib2_igmp(queue_t *, mblk_t *, ip_stack_t *ipst);
static mblk_t *ip_snmp_get_mib2_multi(queue_t *, mblk_t *, ip_stack_t *ipst);
static mblk_t *ip_snmp_get_mib2_ip_addr(queue_t *, mblk_t *,
    ip_stack_t *ipst, boolean_t);
static mblk_t *ip_snmp_get_mib2_ip6_addr(queue_t *, mblk_t *,
    ip_stack_t *ipst, boolean_t);
static mblk_t *ip_snmp_get_mib2_ip_group_src(queue_t *, mblk_t *,
    ip_stack_t *ipst);
static mblk_t *ip_snmp_get_mib2_ip6_group_src(queue_t *, mblk_t *,
    ip_stack_t *ipst);
static mblk_t *ip_snmp_get_mib2_ip_group_mem(queue_t *, mblk_t *,
    ip_stack_t *ipst);
static mblk_t *ip_snmp_get_mib2_ip6_group_mem(queue_t *, mblk_t *,
    ip_stack_t *ipst);
static mblk_t *ip_snmp_get_mib2_virt_multi(queue_t *, mblk_t *,
    ip_stack_t *ipst);
static mblk_t *ip_snmp_get_mib2_multi_rtable(queue_t *, mblk_t *,
    ip_stack_t *ipst);
static mblk_t *ip_snmp_get_mib2_ip_route_media(queue_t *, mblk_t *, int,
    ip_stack_t *ipst);
static mblk_t *ip_snmp_get_mib2_ip6_route_media(queue_t *, mblk_t *, int,
    ip_stack_t *ipst);
static void ip_snmp_get2_v4(ire_t *, iproutedata_t *);
static void ip_snmp_get2_v6_route(ire_t *, iproutedata_t *);
static void ip_snmp_get2_v4_media(ncec_t *, void *);
static void ip_snmp_get2_v6_media(ncec_t *, void *);
int ip_snmp_set(queue_t *, int, int, uchar_t *, int);
static mblk_t *ip_fragment_copyhdr(uchar_t *, int, int, ip_stack_t *,
    mblk_t *);
/*
 * Flow-control drain list management (see the "IP Flow control notes"
 * comment above), netstack lifecycle hooks, multirt membership helper,
 * squeue switch, and kstat/observability init/fini/update routines.
 */
static void conn_drain_init(ip_stack_t *);
static void conn_drain_fini(ip_stack_t *);
static void conn_drain(conn_t *connp, boolean_t closing);
static void conn_walk_drain(ip_stack_t *, idl_tx_list_t *);
static void conn_walk_sctp(pfv_t, void *, zoneid_t, netstack_t *);
static void *ip_stack_init(netstackid_t stackid, netstack_t *ns);
static void ip_stack_shutdown(netstackid_t stackid, void *arg);
static void ip_stack_fini(netstackid_t stackid, void *arg);
static int ip_multirt_apply_membership(int (*fn)(conn_t *, boolean_t,
    const in6_addr_t *, ipaddr_t, uint_t, mcast_record_t, const in6_addr_t *),
    ire_t *, conn_t *, boolean_t, const in6_addr_t *, mcast_record_t,
    const in6_addr_t *);
static int ip_squeue_switch(int);
static void *ip_kstat_init(netstackid_t, ip_stack_t *);
static void ip_kstat_fini(netstackid_t, kstat_t *);
static int ip_kstat_update(kstat_t *kp, int rw);
static void *icmp_kstat_init(netstackid_t);
static void icmp_kstat_fini(netstackid_t, kstat_t *);
static int icmp_kstat_update(kstat_t *kp, int rw);
static void *ip_kstat2_init(netstackid_t, ip_stat_t *);
static void ip_kstat2_fini(netstackid_t, kstat_t *);
static void ipobs_init(ip_stack_t *);
static void ipobs_fini(ip_stack_t *);
static int ip_tp_cpu_update(cpu_setup_t, int, void *);
/* All-ones IPv4 address (255.255.255.255). */
ipaddr_t ip_g_all_ones = IP_HOST_MASK;

static long ip_rput_pullups;	/* NOTE(review): pullup counter — confirm use */
int dohwcksum = 1;		/* use h/w cksum if supported by the hardware */

/* Minor-number arenas for device instances. */
vmem_t *ip_minor_arena_sa; /* for minor nos. from INET_MIN_DEV+2 thru 2^^18-1 */
vmem_t *ip_minor_arena_la; /* for minor nos. from 2^^18 thru 2^^32-1 */

int ip_debug;			/* debug message verbosity level */

/*
 * Multirouting/CGTP stuff
 */
int ip_cgtp_filter_rev = CGTP_FILTER_REV;	/* CGTP hooks version */

/*
 * IP tunables related declarations. Definitions are in ip_tunables.c
 */
extern mod_prop_info_t ip_propinfo_tbl[];
extern int ip_propinfo_count;
/*
* Table of IP ioctls encoding the various properties of the ioctl and
* indexed based on the last byte of the ioctl command. Occasionally there
* is a clash, and there is more than 1 ioctl with the same last byte.
* In such a case 1 ioctl is encoded in the ndx table and the remaining
* ioctls are encoded in the misc table. An entry in the ndx table is
* retrieved by indexing on the last byte of the ioctl command and comparing
* the ioctl command with the value in the ndx table. In the event of a
* mismatch the misc table is then searched sequentially for the desired
* ioctl command.
*
* Entry: <command> <copyin_size> <flags> <cmd_type> <function> <restart_func>
*/
ip_ioctl_cmd_t ip_ndx_ioctl_table[] = {
	/* 000 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 001 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 002 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 003 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 004 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 005 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 006 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 007 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 008 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 009 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 010 */ { SIOCADDRT, sizeof (struct rtentry), IPI_PRIV,
			MISC_CMD, ip_siocaddrt, NULL },
	/* 011 */ { SIOCDELRT, sizeof (struct rtentry), IPI_PRIV,
			MISC_CMD, ip_siocdelrt, NULL },
	/* 012 */ { SIOCSIFADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
			IF_CMD, ip_sioctl_addr, ip_sioctl_addr_restart },
	/* 013 */ { SIOCGIFADDR, sizeof (struct ifreq), IPI_GET_CMD,
			IF_CMD, ip_sioctl_get_addr, NULL },
	/* 014 */ { SIOCSIFDSTADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
			IF_CMD, ip_sioctl_dstaddr, ip_sioctl_dstaddr_restart },
	/* 015 */ { SIOCGIFDSTADDR, sizeof (struct ifreq),
			IPI_GET_CMD, IF_CMD, ip_sioctl_get_dstaddr, NULL },
	/* 016 */ { SIOCSIFFLAGS, sizeof (struct ifreq),
			IPI_PRIV | IPI_WR,
			IF_CMD, ip_sioctl_flags, ip_sioctl_flags_restart },
	/* 017 */ { SIOCGIFFLAGS, sizeof (struct ifreq),
			IPI_MODOK | IPI_GET_CMD,
			IF_CMD, ip_sioctl_get_flags, NULL },
	/* 018 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 019 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* copyin size cannot be coded for SIOCGIFCONF */
	/* 020 */ { O_SIOCGIFCONF, 0, IPI_GET_CMD,
			MISC_CMD, ip_sioctl_get_ifconf, NULL },
	/* 021 */ { SIOCSIFMTU, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
			IF_CMD, ip_sioctl_mtu, NULL },
	/* 022 */ { SIOCGIFMTU, sizeof (struct ifreq), IPI_GET_CMD,
			IF_CMD, ip_sioctl_get_mtu, NULL },
	/* 023 */ { SIOCGIFBRDADDR, sizeof (struct ifreq),
			IPI_GET_CMD, IF_CMD, ip_sioctl_get_brdaddr, NULL },
	/* 024 */ { SIOCSIFBRDADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
			IF_CMD, ip_sioctl_brdaddr, NULL },
	/* 025 */ { SIOCGIFNETMASK, sizeof (struct ifreq),
			IPI_GET_CMD, IF_CMD, ip_sioctl_get_netmask, NULL },
	/* 026 */ { SIOCSIFNETMASK, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
			IF_CMD, ip_sioctl_netmask, ip_sioctl_netmask_restart },
	/* 027 */ { SIOCGIFMETRIC, sizeof (struct ifreq),
			IPI_GET_CMD, IF_CMD, ip_sioctl_get_metric, NULL },
	/* 028 */ { SIOCSIFMETRIC, sizeof (struct ifreq), IPI_PRIV,
			IF_CMD, ip_sioctl_metric, NULL },
	/* 029 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* See 166-168 below for extended SIOC*XARP ioctls */
	/* 030 */ { SIOCSARP, sizeof (struct arpreq), IPI_PRIV | IPI_WR,
			ARP_CMD, ip_sioctl_arp, NULL },
	/* 031 */ { SIOCGARP, sizeof (struct arpreq), IPI_GET_CMD,
			ARP_CMD, ip_sioctl_arp, NULL },
	/* 032 */ { SIOCDARP, sizeof (struct arpreq), IPI_PRIV | IPI_WR,
			ARP_CMD, ip_sioctl_arp, NULL },
	/* 033 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 034 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 035 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 036 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 037 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 038 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 039 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 040 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 041 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 042 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 043 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 044 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 045 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 046 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 047 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 048 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 049 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 050 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 051 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 052 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 053 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 054 */ { IF_UNITSEL, sizeof (int), IPI_PRIV | IPI_WR | IPI_MODOK,
			MISC_CMD, if_unitsel, if_unitsel_restart },
	/* 055 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 056 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 057 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 058 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 059 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 060 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 061 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 062 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 063 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 064 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 065 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 066 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 067 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 068 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 069 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 070 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 071 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 072 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 073 */ { SIOCSIFNAME, sizeof (struct ifreq),
			IPI_PRIV | IPI_WR | IPI_MODOK,
			IF_CMD, ip_sioctl_sifname, NULL },
	/* 074 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 075 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 076 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 077 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 078 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 079 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 080 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 081 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 082 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 083 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 084 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 085 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 086 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 087 */ { SIOCGIFNUM, sizeof (int), IPI_GET_CMD,
			MISC_CMD, ip_sioctl_get_ifnum, NULL },
	/* 088 */ { SIOCGIFMUXID, sizeof (struct ifreq), IPI_GET_CMD,
			IF_CMD, ip_sioctl_get_muxid, NULL },
	/* 089 */ { SIOCSIFMUXID, sizeof (struct ifreq),
			IPI_PRIV | IPI_WR, IF_CMD, ip_sioctl_muxid, NULL },
	/* Both if and lif variants share same func */
	/* 090 */ { SIOCGIFINDEX, sizeof (struct ifreq), IPI_GET_CMD,
			IF_CMD, ip_sioctl_get_lifindex, NULL },
	/* Both if and lif variants share same func */
	/* 091 */ { SIOCSIFINDEX, sizeof (struct ifreq),
			IPI_PRIV | IPI_WR, IF_CMD, ip_sioctl_slifindex, NULL },
	/* copyin size cannot be coded for SIOCGIFCONF */
	/* 092 */ { SIOCGIFCONF, 0, IPI_GET_CMD,
			MISC_CMD, ip_sioctl_get_ifconf, NULL },
	/* 093 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 094 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 095 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 096 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 097 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 098 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 099 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 100 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 101 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 102 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 103 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 104 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 105 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 106 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 107 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 108 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 109 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 110 */ { SIOCLIFREMOVEIF, sizeof (struct lifreq),
			IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_removeif,
			ip_sioctl_removeif_restart },
	/* 111 */ { SIOCLIFADDIF, sizeof (struct lifreq),
			IPI_GET_CMD | IPI_PRIV | IPI_WR,
			LIF_CMD, ip_sioctl_addif, NULL },
#define	SIOCLIFADDR_NDX 112
	/* 112 */ { SIOCSLIFADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
			LIF_CMD, ip_sioctl_addr, ip_sioctl_addr_restart },
	/* 113 */ { SIOCGLIFADDR, sizeof (struct lifreq),
			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_addr, NULL },
	/* 114 */ { SIOCSLIFDSTADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
			LIF_CMD, ip_sioctl_dstaddr, ip_sioctl_dstaddr_restart },
	/* 115 */ { SIOCGLIFDSTADDR, sizeof (struct lifreq),
			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_dstaddr, NULL },
	/* 116 */ { SIOCSLIFFLAGS, sizeof (struct lifreq),
			IPI_PRIV | IPI_WR,
			LIF_CMD, ip_sioctl_flags, ip_sioctl_flags_restart },
	/* 117 */ { SIOCGLIFFLAGS, sizeof (struct lifreq),
			IPI_GET_CMD | IPI_MODOK,
			LIF_CMD, ip_sioctl_get_flags, NULL },
	/* 118 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 119 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 120 */ { O_SIOCGLIFCONF, 0, IPI_GET_CMD, MISC_CMD,
			ip_sioctl_get_lifconf, NULL },
	/* 121 */ { SIOCSLIFMTU, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
			LIF_CMD, ip_sioctl_mtu, NULL },
	/* 122 */ { SIOCGLIFMTU, sizeof (struct lifreq), IPI_GET_CMD,
			LIF_CMD, ip_sioctl_get_mtu, NULL },
	/* 123 */ { SIOCGLIFBRDADDR, sizeof (struct lifreq),
			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_brdaddr, NULL },
	/* 124 */ { SIOCSLIFBRDADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
			LIF_CMD, ip_sioctl_brdaddr, NULL },
	/* 125 */ { SIOCGLIFNETMASK, sizeof (struct lifreq),
			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_netmask, NULL },
	/* 126 */ { SIOCSLIFNETMASK, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
			LIF_CMD, ip_sioctl_netmask, ip_sioctl_netmask_restart },
	/* 127 */ { SIOCGLIFMETRIC, sizeof (struct lifreq),
			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_metric, NULL },
	/* 128 */ { SIOCSLIFMETRIC, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
			LIF_CMD, ip_sioctl_metric, NULL },
	/* 129 */ { SIOCSLIFNAME, sizeof (struct lifreq),
			IPI_PRIV | IPI_WR | IPI_MODOK,
			LIF_CMD, ip_sioctl_slifname,
			ip_sioctl_slifname_restart },
	/* 130 */ { SIOCGLIFNUM, sizeof (struct lifnum), IPI_GET_CMD,
			MISC_CMD, ip_sioctl_get_lifnum, NULL },
	/* 131 */ { SIOCGLIFMUXID, sizeof (struct lifreq),
			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_muxid, NULL },
	/* 132 */ { SIOCSLIFMUXID, sizeof (struct lifreq),
			IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_muxid, NULL },
	/* 133 */ { SIOCGLIFINDEX, sizeof (struct lifreq),
			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_lifindex, NULL },
	/* 134 */ { SIOCSLIFINDEX, sizeof (struct lifreq),
			IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_slifindex, NULL },
	/* 135 */ { SIOCSLIFTOKEN, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
			LIF_CMD, ip_sioctl_token, NULL },
	/* 136 */ { SIOCGLIFTOKEN, sizeof (struct lifreq),
			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_token, NULL },
	/* 137 */ { SIOCSLIFSUBNET, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
			LIF_CMD, ip_sioctl_subnet, ip_sioctl_subnet_restart },
	/* 138 */ { SIOCGLIFSUBNET, sizeof (struct lifreq),
			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_subnet, NULL },
	/* 139 */ { SIOCSLIFLNKINFO, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
			LIF_CMD, ip_sioctl_lnkinfo, NULL },
	/* 140 */ { SIOCGLIFLNKINFO, sizeof (struct lifreq),
			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_lnkinfo, NULL },
	/* 141 */ { SIOCLIFDELND, sizeof (struct lifreq), IPI_PRIV,
			LIF_CMD, ip_siocdelndp_v6, NULL },
	/* 142 */ { SIOCLIFGETND, sizeof (struct lifreq), IPI_GET_CMD,
			LIF_CMD, ip_siocqueryndp_v6, NULL },
	/* 143 */ { SIOCLIFSETND, sizeof (struct lifreq), IPI_PRIV,
			LIF_CMD, ip_siocsetndp_v6, NULL },
	/* 144 */ { SIOCTMYADDR, sizeof (struct sioc_addrreq), IPI_GET_CMD,
			MISC_CMD, ip_sioctl_tmyaddr, NULL },
	/* 145 */ { SIOCTONLINK, sizeof (struct sioc_addrreq), IPI_GET_CMD,
			MISC_CMD, ip_sioctl_tonlink, NULL },
	/* 146 */ { SIOCTMYSITE, sizeof (struct sioc_addrreq), 0,
			MISC_CMD, ip_sioctl_tmysite, NULL },
	/* 147 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 148 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* Old *IPSECONFIG ioctls are now deprecated, now see spdsock.c */
	/* 149 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 150 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 151 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 152 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 153 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 154 */ { SIOCGLIFBINDING, sizeof (struct lifreq), IPI_GET_CMD,
			LIF_CMD, ip_sioctl_get_binding, NULL },
	/* 155 */ { SIOCSLIFGROUPNAME, sizeof (struct lifreq),
			IPI_PRIV | IPI_WR,
			LIF_CMD, ip_sioctl_groupname, ip_sioctl_groupname },
	/* 156 */ { SIOCGLIFGROUPNAME, sizeof (struct lifreq),
			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_groupname, NULL },
	/* 157 */ { SIOCGLIFGROUPINFO, sizeof (lifgroupinfo_t),
			IPI_GET_CMD, MISC_CMD, ip_sioctl_groupinfo, NULL },
	/* Leave 158-160 unused; used to be SIOC*IFARP ioctls */
	/* 158 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 159 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 160 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 161 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* These are handled in ip_sioctl_copyin_setup itself */
	/* 162 */ { SIOCGIP6ADDRPOLICY, 0, IPI_NULL_BCONT,
			MISC_CMD, NULL, NULL },
	/* 163 */ { SIOCSIP6ADDRPOLICY, 0, IPI_PRIV | IPI_NULL_BCONT,
			MISC_CMD, NULL, NULL },
	/* 164 */ { SIOCGDSTINFO, 0, IPI_GET_CMD, MISC_CMD, NULL, NULL },
	/* 165 */ { SIOCGLIFCONF, 0, IPI_GET_CMD, MISC_CMD,
			ip_sioctl_get_lifconf, NULL },
	/* 166 */ { SIOCSXARP, sizeof (struct xarpreq), IPI_PRIV | IPI_WR,
			XARP_CMD, ip_sioctl_arp, NULL },
	/* 167 */ { SIOCGXARP, sizeof (struct xarpreq), IPI_GET_CMD,
			XARP_CMD, ip_sioctl_arp, NULL },
	/* 168 */ { SIOCDXARP, sizeof (struct xarpreq), IPI_PRIV | IPI_WR,
			XARP_CMD, ip_sioctl_arp, NULL },
	/* SIOCPOPSOCKFS is not handled by IP */
	/* 169 */ { IPI_DONTCARE /* SIOCPOPSOCKFS */, 0, 0, 0, NULL, NULL },
	/* 170 */ { SIOCGLIFZONE, sizeof (struct lifreq),
			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_lifzone, NULL },
	/* 171 */ { SIOCSLIFZONE, sizeof (struct lifreq),
			IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_slifzone,
			ip_sioctl_slifzone_restart },
	/* 172-174 are SCTP ioctls and not handled by IP */
	/* 172 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 173 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 174 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* 175 */ { SIOCGLIFUSESRC, sizeof (struct lifreq),
			IPI_GET_CMD, LIF_CMD,
			ip_sioctl_get_lifusesrc, NULL },
	/* 176 */ { SIOCSLIFUSESRC, sizeof (struct lifreq),
			IPI_PRIV | IPI_WR,
			LIF_CMD, ip_sioctl_slifusesrc,
			NULL },
	/* 177 */ { SIOCGLIFSRCOF, 0, IPI_GET_CMD, MISC_CMD,
			ip_sioctl_get_lifsrcof, NULL },
	/* 178 */ { SIOCGMSFILTER, sizeof (struct group_filter), IPI_GET_CMD,
			MSFILT_CMD, ip_sioctl_msfilter, NULL },
	/* 179 */ { SIOCSMSFILTER, sizeof (struct group_filter), 0,
			MSFILT_CMD, ip_sioctl_msfilter, NULL },
	/* 180 */ { SIOCGIPMSFILTER, sizeof (struct ip_msfilter), IPI_GET_CMD,
			MSFILT_CMD, ip_sioctl_msfilter, NULL },
	/* 181 */ { SIOCSIPMSFILTER, sizeof (struct ip_msfilter), 0,
			MSFILT_CMD, ip_sioctl_msfilter, NULL },
	/* 182 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* SIOCSENABLESDP is handled by SDP */
	/* 183 */ { IPI_DONTCARE /* SIOCSENABLESDP */, 0, 0, 0, NULL, NULL },
	/* 184 */ { IPI_DONTCARE /* SIOCSQPTR */, 0, 0, 0, NULL, NULL },
	/* 185 */ { SIOCGIFHWADDR, sizeof (struct ifreq), IPI_GET_CMD,
			IF_CMD, ip_sioctl_get_ifhwaddr, NULL },
	/* 186 */ { IPI_DONTCARE /* SIOCGSTAMP */, 0, 0, 0, NULL, NULL },
	/* 187 */ { SIOCILB, 0, IPI_PRIV | IPI_GET_CMD, MISC_CMD,
			ip_sioctl_ilb_cmd, NULL },
	/* 188 */ { SIOCGETPROP, 0, IPI_GET_CMD, 0, NULL, NULL },
	/* 189 */ { SIOCSETPROP, 0, IPI_PRIV | IPI_WR, 0, NULL, NULL },
	/* 190 */ { SIOCGLIFDADSTATE, sizeof (struct lifreq),
			IPI_GET_CMD, LIF_CMD, ip_sioctl_get_dadstate, NULL },
	/* 191 */ { SIOCSLIFPREFIX, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
			LIF_CMD, ip_sioctl_prefix, ip_sioctl_prefix_restart },
	/* 192 */ { SIOCGLIFHWADDR, sizeof (struct lifreq), IPI_GET_CMD,
			LIF_CMD, ip_sioctl_get_lifhwaddr, NULL }
};

/* Number of entries in the ndx table (indexed by last ioctl byte). */
int ip_ndx_ioctl_count = sizeof (ip_ndx_ioctl_table) / sizeof (ip_ioctl_cmd_t);
/*
 * Ioctls whose last command byte collides with an entry already present in
 * ip_ndx_ioctl_table; this table is searched sequentially on an ndx miss.
 */
ip_ioctl_cmd_t ip_misc_ioctl_table[] = {
	{ I_LINK,	0, IPI_PRIV | IPI_WR, 0, NULL, NULL },
	{ I_UNLINK,	0, IPI_PRIV | IPI_WR, 0, NULL, NULL },
	{ I_PLINK,	0, IPI_PRIV | IPI_WR, 0, NULL, NULL },
	{ I_PUNLINK,	0, IPI_PRIV | IPI_WR, 0, NULL, NULL },
	{ ND_GET,	0, 0, 0, NULL, NULL },
	{ ND_SET,	0, IPI_PRIV | IPI_WR, 0, NULL, NULL },
	{ IP_IOCTL,	0, 0, 0, NULL, NULL },
	{ SIOCGETVIFCNT, sizeof (struct sioc_vif_req), IPI_GET_CMD,
		MISC_CMD, mrt_ioctl, NULL },
	{ SIOCGETSGCNT,	sizeof (struct sioc_sg_req), IPI_GET_CMD,
		MISC_CMD, mrt_ioctl, NULL },
	{ SIOCGETLSGCNT, sizeof (struct sioc_lsg_req), IPI_GET_CMD,
		MISC_CMD, mrt_ioctl, NULL }
};

/* Number of entries in the misc (collision) table. */
int ip_misc_ioctl_count =
    sizeof (ip_misc_ioctl_table) / sizeof (ip_ioctl_cmd_t);
int conn_drain_nthreads;		/* Number of drainers reqd. */
					/* Settable in /etc/system */
/* IRE bucket sizing knobs. Defined in ip_ire.c */
extern uint32_t ip_ire_max_bucket_cnt, ip6_ire_max_bucket_cnt;
extern uint32_t ip_ire_min_bucket_cnt, ip6_ire_min_bucket_cnt;
extern uint32_t ip_ire_mem_ratio, ip_ire_cpu_ratio;
/* Name/value table mapping IRE type constants to printable strings. */
static nv_t	ire_nv_arr[] = {
	{ IRE_BROADCAST, "BROADCAST" },
	{ IRE_LOCAL, "LOCAL" },
	{ IRE_LOOPBACK, "LOOPBACK" },
	{ IRE_DEFAULT, "DEFAULT" },
	{ IRE_PREFIX, "PREFIX" },
	{ IRE_IF_NORESOLVER, "IF_NORESOL" },
	{ IRE_IF_RESOLVER, "IF_RESOLV" },
	{ IRE_IF_CLONE, "IF_CLONE" },
	{ IRE_HOST, "HOST" },
	{ IRE_MULTICAST, "MULTICAST" },
	{ IRE_NOROUTE, "NOROUTE" },
	{ 0 }	/* terminator */
};

/* Exported handle for the table above. */
nv_t	*ire_nv_tbl = ire_nv_arr;
/*
 * Simple ICMP IP Header Template: version/hdr-length word set for a
 * minimal header and protocol set to ICMP; all other fields zero.
 */
static ipha_t icmp_ipha = {
	IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP
};

/* STREAMS module identification and default flow-control parameters. */
struct module_info ip_mod_info = {
	IP_MOD_ID, IP_MOD_NAME, IP_MOD_MINPSZ, IP_MOD_MAXPSZ, IP_MOD_HIWAT,
	IP_MOD_LOWAT
};
/*
 * Duplicate static symbols within a module confuses mdb; so we avoid the
 * problem by making the symbols here distinct from those in udp.c.
 */

/*
 * Entry points for IP as a device and as a module.
 * We have separate open functions for the /dev/ip and /dev/ip6 devices.
 */
/* Upper read side, IPv4: put / service / open / close / admin / module info */
static struct qinit iprinitv4 = {
	ip_rput, NULL, ip_openv4, ip_close, NULL, &ip_mod_info
};

/* Upper read side, IPv6 (non-static: referenced from elsewhere) */
struct qinit iprinitv6 = {
	ip_rput_v6, NULL, ip_openv6, ip_close, NULL, &ip_mod_info
};

/* Upper write side, shared by v4 and v6 */
static struct qinit ipwinit = {
	ip_wput_nondata, ip_wsrv, NULL, NULL, NULL, &ip_mod_info
};

/* Lower read side (mux) */
static struct qinit iplrinit = {
	ip_lrput, NULL, ip_openv4, ip_close, NULL, &ip_mod_info
};

/* Lower write side (mux) */
static struct qinit iplwinit = {
	ip_lwput, NULL, NULL, NULL, NULL, &ip_mod_info
};

/* For AF_INET aka /dev/ip */
struct streamtab ipinfov4 = {
	&iprinitv4, &ipwinit, &iplrinit, &iplwinit
};

/* For AF_INET6 aka /dev/ip6 */
struct streamtab ipinfov6 = {
	&iprinitv6, &ipwinit, &iplrinit, &iplwinit
};

#ifdef	DEBUG
/* Debug-only switch to skip SCTP checksum verification. */
boolean_t skip_sctp_cksum = B_FALSE;
#endif
/*
* Generate an ICMP fragmentation needed message.
* When called from ip_output side a minimal ip_recv_attr_t needs to be
* constructed by the caller.
*/
void
icmp_frag_needed(mblk_t *mp, int mtu, ip_recv_attr_t *ira)
{
	icmph_t		frag_icmph;
	ip_stack_t	*ipst = ira->ira_ill->ill_ipst;

	/* Rate-limit/validate; a NULL return means the error was suppressed. */
	mp = icmp_pkt_err_ok(mp, ira);
	if (mp == NULL)
		return;

	/*
	 * Build a "destination unreachable / fragmentation needed" header
	 * carrying the next-hop MTU (RFC 1191).
	 */
	bzero(&frag_icmph, sizeof (icmph_t));
	frag_icmph.icmph_type = ICMP_DEST_UNREACHABLE;
	frag_icmph.icmph_code = ICMP_FRAGMENTATION_NEEDED;
	frag_icmph.icmph_du_mtu = htons((uint16_t)mtu);

	BUMP_MIB(&ipst->ips_icmp_mib, icmpOutFragNeeded);
	BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDestUnreachs);
	icmp_pkt(mp, &frag_icmph, sizeof (icmph_t), ira);
}
/*
* icmp_inbound_v4 deals with ICMP messages that are handled by IP.
* If the ICMP message is consumed by IP, i.e., it should not be delivered
* to any IPPROTO_ICMP raw sockets, then it returns NULL.
 * Likewise, if the ICMP error is malformed (too short, etc.), then it
* returns NULL. The caller uses this to determine whether or not to send
* to raw sockets.
*
* All error messages are passed to the matching transport stream.
*
* The following cases are handled by icmp_inbound:
* 1) It needs to send a reply back and possibly delivering it
* to the "interested" upper clients.
* 2) Return the mblk so that the caller can pass it to the RAW socket clients.
* 3) It needs to change some values in IP only.
* 4) It needs to change some values in IP and upper layers e.g TCP
* by delivering an error to the upper layers.
*
 * We handle the above four cases in the context of IPsec in the
* following way :
*
* 1) Send the reply back in the same way as the request came in.
* If it came in encrypted, it goes out encrypted. If it came in
* clear, it goes out in clear. Thus, this will prevent chosen
* plain text attack.
* 2) The client may or may not expect things to come in secure.
* If it comes in secure, the policy constraints are checked
* before delivering it to the upper layers. If it comes in
* clear, ipsec_inbound_accept_clear will decide whether to
* accept this in clear or not. In both the cases, if the returned
* message (IP header + 8 bytes) that caused the icmp message has
* AH/ESP headers, it is sent up to AH/ESP for validation before
* sending up. If there are only 8 bytes of returned message, then
* upper client will not be notified.
 * 3) Check with global policy to see whether it matches the constraints.
* But this will be done only if icmp_accept_messages_in_clear is
* zero.
* 4) If we need to change both in IP and ULP, then the decision taken
* while affecting the values in IP and while delivering up to TCP
* should be the same.
*
* There are two cases.
*
* a) If we reject data at the IP layer (ipsec_check_global_policy()
* failed), we will not deliver it to the ULP, even though they
* are *willing* to accept in *clear*. This is fine as our global
 *	disposition to icmp messages asks us to reject the datagram.
*
* b) If we accept data at the IP layer (ipsec_check_global_policy()
* succeeded or icmp_accept_messages_in_clear is 1), and not able
* to deliver it to ULP (policy failed), it can lead to
* consistency problems. The cases known at this time are
* ICMP_DESTINATION_UNREACHABLE messages with following code
* values :
*
* - ICMP_FRAGMENTATION_NEEDED : IP adapts to the new value
* and Upper layer rejects. Then the communication will
* come to a stop. This is solved by making similar decisions
* at both levels. Currently, when we are unable to deliver
* to the Upper Layer (due to policy failures) while IP has
* adjusted dce_pmtu, the next outbound datagram would
* generate a local ICMP_FRAGMENTATION_NEEDED message - which
* will be with the right level of protection. Thus the right
* value will be communicated even if we are not able to
* communicate when we get from the wire initially. But this
* assumes there would be at least one outbound datagram after
* IP has adjusted its dce_pmtu value. To make things
* simpler, we accept in clear after the validation of
* AH/ESP headers.
*
* - Other ICMP ERRORS : We may not be able to deliver it to the
* upper layer depending on the level of protection the upper
* layer expects and the disposition in ipsec_inbound_accept_clear().
* ipsec_inbound_accept_clear() decides whether a given ICMP error
* should be accepted in clear when the Upper layer expects secure.
* Thus the communication may get aborted by some bad ICMP
* packets.
*/
mblk_t *
icmp_inbound_v4(mblk_t *mp, ip_recv_attr_t *ira)
{
	icmph_t		*icmph;
	ipha_t		*ipha;		/* Outer header */
	int		ip_hdr_length;	/* Outer header length */
	boolean_t	interested;
	ipif_t		*ipif;
	uint32_t	ts;
	uint32_t	*tsp;
	timestruc_t	now;
	ill_t		*ill = ira->ira_ill;
	ip_stack_t	*ipst = ill->ill_ipst;
	zoneid_t	zoneid = ira->ira_zoneid;
	int		len_needed;
	mblk_t		*mp_ret = NULL;	/* copy handed back for RAW sockets */

	ipha = (ipha_t *)mp->b_rptr;

	BUMP_MIB(&ipst->ips_icmp_mib, icmpInMsgs);

	ip_hdr_length = ira->ira_ip_hdr_length;

	/*
	 * Make sure the IP header plus the fixed-size ICMP header is
	 * contiguous in the first mblk.  If the whole packet is shorter
	 * than that it is truncated and dropped; otherwise pull it up.
	 */
	if ((mp->b_wptr - mp->b_rptr) < (ip_hdr_length + ICMPH_SIZE)) {
		if (ira->ira_pktlen < (ip_hdr_length + ICMPH_SIZE)) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
			ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill);
			freemsg(mp);
			return (NULL);
		}
		/* Last chance to get real. */
		ipha = ip_pullup(mp, ip_hdr_length + ICMPH_SIZE, ira);
		if (ipha == NULL) {
			BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors);
			freemsg(mp);
			return (NULL);
		}
	}

	/* The IP header will always be a multiple of four bytes */
	icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
	ip2dbg(("icmp_inbound_v4: type %d code %d\n", icmph->icmph_type,
	    icmph->icmph_code));

	/*
	 * We will set "interested" to "true" if we should pass a copy to
	 * the transport or if we handle the packet locally.
	 */
	interested = B_FALSE;
	switch (icmph->icmph_type) {
	case ICMP_ECHO_REPLY:
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInEchoReps);
		break;
	case ICMP_DEST_UNREACHABLE:
		if (icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED)
			BUMP_MIB(&ipst->ips_icmp_mib, icmpInFragNeeded);
		interested = B_TRUE;	/* Pass up to transport */
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInDestUnreachs);
		break;
	case ICMP_SOURCE_QUENCH:
		interested = B_TRUE;	/* Pass up to transport */
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInSrcQuenchs);
		break;
	case ICMP_REDIRECT:
		if (!ipst->ips_ip_ignore_redirect)
			interested = B_TRUE;
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInRedirects);
		break;
	case ICMP_ECHO_REQUEST:
		/*
		 * Whether to respond to echo requests that come in as IP
		 * broadcasts or as IP multicast is subject to debate
		 * (what isn't?).  We aim to please, you pick it.
		 * Default is do it.
		 */
		if (ira->ira_flags & IRAF_MULTICAST) {
			/* multicast: respond based on tunable */
			interested = ipst->ips_ip_g_resp_to_echo_mcast;
		} else if (ira->ira_flags & IRAF_BROADCAST) {
			/* broadcast: respond based on tunable */
			interested = ipst->ips_ip_g_resp_to_echo_bcast;
		} else {
			/* unicast: always respond */
			interested = B_TRUE;
		}
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInEchos);
		if (!interested) {
			/* We never pass these to RAW sockets */
			freemsg(mp);
			return (NULL);
		}

		/* Check db_ref to make sure we can modify the packet. */
		if (mp->b_datap->db_ref > 1) {
			mblk_t	*mp1;

			mp1 = copymsg(mp);
			freemsg(mp);
			if (!mp1) {
				BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
				return (NULL);
			}
			mp = mp1;
			/* Refresh pointers into the (new) writable copy. */
			ipha = (ipha_t *)mp->b_rptr;
			icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
		}
		icmph->icmph_type = ICMP_ECHO_REPLY;
		BUMP_MIB(&ipst->ips_icmp_mib, icmpOutEchoReps);
		icmp_send_reply_v4(mp, ipha, icmph, ira);
		return (NULL);

	case ICMP_ROUTER_ADVERTISEMENT:
	case ICMP_ROUTER_SOLICITATION:
		break;
	case ICMP_TIME_EXCEEDED:
		interested = B_TRUE;	/* Pass up to transport */
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInTimeExcds);
		break;
	case ICMP_PARAM_PROBLEM:
		interested = B_TRUE;	/* Pass up to transport */
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInParmProbs);
		break;
	case ICMP_TIME_STAMP_REQUEST:
		/* Response to Time Stamp Requests is local policy. */
		if (ipst->ips_ip_g_resp_to_timestamp) {
			if (ira->ira_flags & IRAF_MULTIBROADCAST)
				interested =
				    ipst->ips_ip_g_resp_to_timestamp_bcast;
			else
				interested = B_TRUE;
		}
		if (!interested) {
			/* We never pass these to RAW sockets */
			freemsg(mp);
			return (NULL);
		}

		/* Make sure we have enough of the packet */
		len_needed = ip_hdr_length + ICMPH_SIZE +
		    3 * sizeof (uint32_t);

		if (mp->b_wptr - mp->b_rptr < len_needed) {
			ipha = ip_pullup(mp, len_needed, ira);
			if (ipha == NULL) {
				BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
				ip_drop_input("ipIfStatsInDiscards - ip_pullup",
				    mp, ill);
				freemsg(mp);
				return (NULL);
			}
			/* Refresh following the pullup. */
			icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
		}
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInTimestamps);
		/* Check db_ref to make sure we can modify the packet. */
		if (mp->b_datap->db_ref > 1) {
			mblk_t	*mp1;

			mp1 = copymsg(mp);
			freemsg(mp);
			if (!mp1) {
				BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
				return (NULL);
			}
			mp = mp1;
			/* Refresh pointers into the (new) writable copy. */
			ipha = (ipha_t *)mp->b_rptr;
			icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
		}
		icmph->icmph_type = ICMP_TIME_STAMP_REPLY;
		tsp = (uint32_t *)&icmph[1];
		tsp++;		/* Skip past 'originate time' */
		/* Compute # of milliseconds since midnight */
		gethrestime(&now);
		ts = (now.tv_sec % (24 * 60 * 60)) * 1000 +
		    NSEC2MSEC(now.tv_nsec);
		*tsp++ = htonl(ts);	/* Lay in 'receive time' */
		*tsp++ = htonl(ts);	/* Lay in 'send time' */
		BUMP_MIB(&ipst->ips_icmp_mib, icmpOutTimestampReps);
		icmp_send_reply_v4(mp, ipha, icmph, ira);
		return (NULL);

	case ICMP_TIME_STAMP_REPLY:
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInTimestampReps);
		break;
	case ICMP_INFO_REQUEST:
		/* Per RFC 1122 3.2.2.7, ignore this. */
	case ICMP_INFO_REPLY:
		break;
	case ICMP_ADDRESS_MASK_REQUEST:
		if (ira->ira_flags & IRAF_MULTIBROADCAST) {
			interested =
			    ipst->ips_ip_respond_to_address_mask_broadcast;
		} else {
			interested = B_TRUE;
		}
		if (!interested) {
			/* We never pass these to RAW sockets */
			freemsg(mp);
			return (NULL);
		}
		/* Ensure the mask field following the ICMP header is present */
		len_needed = ip_hdr_length + ICMPH_SIZE + IP_ADDR_LEN;
		if (mp->b_wptr - mp->b_rptr < len_needed) {
			ipha = ip_pullup(mp, len_needed, ira);
			if (ipha == NULL) {
				BUMP_MIB(ill->ill_ip_mib,
				    ipIfStatsInTruncatedPkts);
				ip_drop_input("ipIfStatsInTruncatedPkts", mp,
				    ill);
				freemsg(mp);
				return (NULL);
			}
			/* Refresh following the pullup. */
			icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
		}
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInAddrMasks);
		/* Check db_ref to make sure we can modify the packet. */
		if (mp->b_datap->db_ref > 1) {
			mblk_t	*mp1;

			mp1 = copymsg(mp);
			freemsg(mp);
			if (!mp1) {
				BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
				return (NULL);
			}
			mp = mp1;
			/* Refresh pointers into the (new) writable copy. */
			ipha = (ipha_t *)mp->b_rptr;
			icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
		}
		/*
		 * Need the ipif with the mask be the same as the source
		 * address of the mask reply. For unicast we have a specific
		 * ipif. For multicast/broadcast we only handle onlink
		 * senders, and use the source address to pick an ipif.
		 */
		ipif = ipif_lookup_addr(ipha->ipha_dst, ill, zoneid, ipst);
		if (ipif == NULL) {
			/* Broadcast or multicast */
			ipif = ipif_lookup_remote(ill, ipha->ipha_src, zoneid);
			if (ipif == NULL) {
				freemsg(mp);
				return (NULL);
			}
		}
		icmph->icmph_type = ICMP_ADDRESS_MASK_REPLY;
		bcopy(&ipif->ipif_net_mask, &icmph[1], IP_ADDR_LEN);
		ipif_refrele(ipif);
		BUMP_MIB(&ipst->ips_icmp_mib, icmpOutAddrMaskReps);
		icmp_send_reply_v4(mp, ipha, icmph, ira);
		return (NULL);

	case ICMP_ADDRESS_MASK_REPLY:
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInAddrMaskReps);
		break;
	default:
		interested = B_TRUE;	/* Pass up to transport */
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInUnknowns);
		break;
	}
	/*
	 * See if there is an ICMP client to avoid an extra copymsg/freemsg
	 * if there isn't one.
	 */
	if (ipst->ips_ipcl_proto_fanout_v4[IPPROTO_ICMP].connf_head != NULL) {
		/* If there is an ICMP client and we want one too, copy it. */

		if (!interested) {
			/* Caller will deliver to RAW sockets */
			return (mp);
		}
		mp_ret = copymsg(mp);
		if (mp_ret == NULL) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
			ip_drop_input("ipIfStatsInDiscards - copymsg", mp, ill);
		}
	} else if (!interested) {
		/* Neither we nor raw sockets are interested. Drop packet now */
		freemsg(mp);
		return (NULL);
	}

	/*
	 * ICMP error or redirect packet. Make sure we have enough of
	 * the header and that db_ref == 1 since we might end up modifying
	 * the packet.
	 */
	if (mp->b_cont != NULL) {
		if (ip_pullup(mp, -1, ira) == NULL) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
			ip_drop_input("ipIfStatsInDiscards - ip_pullup",
			    mp, ill);
			freemsg(mp);
			return (mp_ret);
		}
	}

	if (mp->b_datap->db_ref > 1) {
		mblk_t	*mp1;

		mp1 = copymsg(mp);
		if (mp1 == NULL) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
			ip_drop_input("ipIfStatsInDiscards - copymsg", mp, ill);
			freemsg(mp);
			return (mp_ret);
		}
		freemsg(mp);
		mp = mp1;
	}

	/*
	 * In case mp has changed, verify the message before any further
	 * processes.
	 */
	ipha = (ipha_t *)mp->b_rptr;
	icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
	if (!icmp_inbound_verify_v4(mp, icmph, ira)) {
		freemsg(mp);
		return (mp_ret);
	}

	switch (icmph->icmph_type) {
	case ICMP_REDIRECT:
		icmp_redirect_v4(mp, ipha, icmph, ira);
		break;
	case ICMP_DEST_UNREACHABLE:
		if (icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED) {
			/* Update DCE and adjust MTU in icmp header if needed */
			icmp_inbound_too_big_v4(icmph, ira);
		}
		/* FALLTHROUGH */
	default:
		icmp_inbound_error_fanout_v4(mp, icmph, ira);
		break;
	}
	return (mp_ret);
}
/*
 * Send an ICMP echo, timestamp or address mask reply.
 * The caller has already updated the payload part of the packet.
 * We handle the ICMP checksum, IP source address selection and feed
 * the packet into ip_output_simple.
 *
 * On return mp has been consumed: either dropped in the IPsec failure
 * path (after ip_drop_packet has been done for us) or handed off to
 * ip_output_simple().
 */
static void
icmp_send_reply_v4(mblk_t *mp, ipha_t *ipha, icmph_t *icmph,
    ip_recv_attr_t *ira)
{
	uint_t		ip_hdr_length = ira->ira_ip_hdr_length;
	ill_t		*ill = ira->ira_ill;
	ip_stack_t	*ipst = ill->ill_ipst;
	ip_xmit_attr_t	ixas;

	/* Send out an ICMP packet */
	icmph->icmph_checksum = 0;
	icmph->icmph_checksum = IP_CSUM(mp, ip_hdr_length, 0);
	/* Reset time to live. */
	ipha->ipha_ttl = ipst->ips_ip_def_ttl;
	{
		/* Swap source and destination addresses */
		ipaddr_t tmp;

		tmp = ipha->ipha_src;
		ipha->ipha_src = ipha->ipha_dst;
		ipha->ipha_dst = tmp;
	}
	/* Let ip_output_simple assign a fresh IP ident. */
	ipha->ipha_ident = 0;
	if (!IS_SIMPLE_IPH(ipha))
		icmp_options_update(ipha);

	/* Build transmit attributes for a basic unlabeled IPv4 send. */
	bzero(&ixas, sizeof (ixas));
	ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4;
	ixas.ixa_zoneid = ira->ira_zoneid;
	ixas.ixa_cred = kcred;
	ixas.ixa_cpid = NOPID;
	ixas.ixa_tsl = ira->ira_tsl;	/* Behave as a multi-level responder */
	ixas.ixa_ifindex = 0;
	ixas.ixa_ipst = ipst;
	ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;

	if (!(ira->ira_flags & IRAF_IPSEC_SECURE)) {
		/*
		 * This packet should go out the same way as it
		 * came in i.e in clear, independent of the IPsec policy
		 * for transmitting packets.
		 */
		ixas.ixa_flags |= IXAF_NO_IPSEC;
	} else {
		if (!ipsec_in_to_out(ira, &ixas, mp, ipha, NULL)) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
			/* Note: mp already consumed and ip_drop_packet done */
			return;
		}
	}
	if (ira->ira_flags & IRAF_MULTIBROADCAST) {
		/*
		 * Not one or our addresses (IRE_LOCALs), thus we let
		 * ip_output_simple pick the source.
		 */
		ipha->ipha_src = INADDR_ANY;
		ixas.ixa_flags |= IXAF_SET_SOURCE;
	}
	/* Should we send with DF and use dce_pmtu? */
	if (ipst->ips_ipv4_icmp_return_pmtu) {
		ixas.ixa_flags |= IXAF_PMTU_DISCOVERY;
		ipha->ipha_fragment_offset_and_flags |= IPH_DF_HTONS;
	}

	BUMP_MIB(&ipst->ips_icmp_mib, icmpOutMsgs);

	(void) ip_output_simple(mp, &ixas);
	ixa_cleanup(&ixas);
}
/*
 * Verify the ICMP messages for either for ICMP error or redirect packet.
 * The caller should have fully pulled up the message. If it's a redirect
 * packet, only basic checks on IP header will be done; otherwise, verify
 * the packet by looking at the included ULP header.
 *
 * Called before icmp_inbound_error_fanout_v4 is called.
 *
 * Returns B_TRUE if the embedded packet passes the checks, B_FALSE
 * otherwise.  Never frees mp: on B_FALSE the caller is responsible for
 * freeing it; only MIB counters (and, on truncation, ip_drop_input)
 * are handled here.
 */
static boolean_t
icmp_inbound_verify_v4(mblk_t *mp, icmph_t *icmph, ip_recv_attr_t *ira)
{
	ill_t		*ill = ira->ira_ill;
	int		hdr_length;
	ip_stack_t	*ipst = ira->ira_ill->ill_ipst;
	conn_t		*connp;
	ipha_t		*ipha;	/* Inner IP header */

	/* The IP header of the packet in error follows the ICMP header. */
	ipha = (ipha_t *)&icmph[1];
	if ((uchar_t *)ipha + IP_SIMPLE_HDR_LENGTH > mp->b_wptr)
		goto truncated;

	hdr_length = IPH_HDR_LENGTH(ipha);

	if ((IPH_HDR_VERSION(ipha) != IPV4_VERSION))
		goto discard_pkt;

	if (hdr_length < sizeof (ipha_t))
		goto truncated;

	if ((uchar_t *)ipha + hdr_length > mp->b_wptr)
		goto truncated;

	/*
	 * Stop here for ICMP_REDIRECT.
	 */
	if (icmph->icmph_type == ICMP_REDIRECT)
		return (B_TRUE);

	/*
	 * ICMP errors only.
	 */
	switch (ipha->ipha_protocol) {
	case IPPROTO_UDP:
		/*
		 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
		 * transport header.
		 */
		if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN >
		    mp->b_wptr)
			goto truncated;
		break;
	case IPPROTO_TCP: {
		tcpha_t		*tcpha;

		/*
		 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
		 * transport header.
		 */
		if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN >
		    mp->b_wptr)
			goto truncated;

		/*
		 * Reverse lookup: the embedded header is the one we sent,
		 * so src/dst are swapped relative to our conn.  Give the
		 * ULP a chance to veto the error via conn_verifyicmp
		 * (e.g. TCP checks the sequence is in the send window).
		 */
		tcpha = (tcpha_t *)((uchar_t *)ipha + hdr_length);
		connp = ipcl_tcp_lookup_reversed_ipv4(ipha, tcpha, TCPS_LISTEN,
		    ipst);
		if (connp == NULL)
			goto discard_pkt;

		if ((connp->conn_verifyicmp != NULL) &&
		    !connp->conn_verifyicmp(connp, tcpha, icmph, NULL, ira)) {
			CONN_DEC_REF(connp);
			goto discard_pkt;
		}
		CONN_DEC_REF(connp);
		break;
	}
	case IPPROTO_SCTP:
		/*
		 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
		 * transport header.
		 */
		if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN >
		    mp->b_wptr)
			goto truncated;
		break;
	case IPPROTO_ESP:
	case IPPROTO_AH:
		break;
	case IPPROTO_ENCAP:
		/* Self-encapsulation needs a complete inner IP header. */
		if ((uchar_t *)ipha + hdr_length + sizeof (ipha_t) >
		    mp->b_wptr)
			goto truncated;
		break;
	default:
		break;
	}

	return (B_TRUE);

discard_pkt:
	/* Bogus ICMP error. */
	BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
	return (B_FALSE);

truncated:
	/* We pulled up everything already. Must be truncated */
	BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
	ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill);
	return (B_FALSE);
}
/*
 * Table from RFC 1191: "plateau" MTU values used to guess the next
 * smaller path MTU when a Fragmentation Needed ICMP arrives without a
 * usable next-hop MTU.  Must remain sorted in descending order; see
 * the linear scan in icmp_inbound_too_big_v4().
 */
static int icmp_frag_size_table[] =
{ 32000, 17914, 8166, 4352, 2002, 1496, 1006, 508, 296, 68 };
/*
 * Process received ICMP Packet too big.
 * Just handles the DCE create/update, including using the above table of
 * PMTU guesses. The caller is responsible for validating the packet before
 * passing it in and also to fanout the ICMP error to any matching transport
 * conns. Assumes the message has been fully pulled up and verified.
 *
 * Before getting here, the caller has called icmp_inbound_verify_v4()
 * that should have verified with ULP to prevent undoing the changes we're
 * going to make to DCE. For example, TCP might have verified that the packet
 * which generated error is in the send window.
 *
 * This function may rewrite the MTU in the ICMP header of the packet
 * (icmph_du_mtu/icmph_du_zero) to reflect the value stored in the DCE;
 * the caller should pass the packet to the matching ULP after this
 * returns.
 */
static void
icmp_inbound_too_big_v4(icmph_t *icmph, ip_recv_attr_t *ira)
{
	dce_t		*dce;
	int		old_mtu;
	int		mtu, orig_mtu;
	ipaddr_t	dst;
	boolean_t	disable_pmtud;
	ill_t		*ill = ira->ira_ill;
	ip_stack_t	*ipst = ill->ill_ipst;
	uint_t		hdr_length;
	ipha_t		*ipha;

	/* Caller already pulled up everything. */
	ipha = (ipha_t *)&icmph[1];
	ASSERT(icmph->icmph_type == ICMP_DEST_UNREACHABLE &&
	    icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED);
	ASSERT(ill != NULL);

	hdr_length = IPH_HDR_LENGTH(ipha);

	/*
	 * We handle path MTU for source routed packets since the DCE
	 * is looked up using the final destination.
	 */
	dst = ip_get_dst(ipha);

	dce = dce_lookup_and_add_v4(dst, ipst);
	if (dce == NULL) {
		/* Couldn't add a unique one - ENOMEM */
		ip1dbg(("icmp_inbound_too_big_v4: no dce for 0x%x\n",
		    ntohl(dst)));
		return;
	}

	/* Check for MTU discovery advice as described in RFC 1191 */
	mtu = ntohs(icmph->icmph_du_mtu);
	orig_mtu = mtu;
	disable_pmtud = B_FALSE;

	mutex_enter(&dce->dce_lock);
	/* Current effective PMTU: cached value, or the interface MTU. */
	if (dce->dce_flags & DCEF_PMTU)
		old_mtu = dce->dce_pmtu;
	else
		old_mtu = ill->ill_mtu;

	if (icmph->icmph_du_zero != 0 || mtu < ipst->ips_ip_pmtu_min) {
		uint32_t length;
		int	i;

		/*
		 * No valid next-hop MTU in the message (pre-RFC 1191
		 * router, du_zero set) or an implausibly small one.
		 * Use the table from RFC 1191 to figure out
		 * the next "plateau" based on the length in
		 * the original IP packet.
		 */
		length = ntohs(ipha->ipha_length);
		DTRACE_PROBE2(ip4__pmtu__guess, dce_t *, dce,
		    uint32_t, length);
		if (old_mtu <= length &&
		    old_mtu >= length - hdr_length) {
			/*
			 * Handle broken BSD 4.2 systems that
			 * return the wrong ipha_length in ICMP
			 * errors.
			 */
			ip1dbg(("Wrong mtu: sent %d, dce %d\n",
			    length, old_mtu));
			length -= hdr_length;
		}
		/* Find the first plateau strictly below the packet length. */
		for (i = 0; i < A_CNT(icmp_frag_size_table); i++) {
			if (length > icmp_frag_size_table[i])
				break;
		}
		if (i == A_CNT(icmp_frag_size_table)) {
			/* Smaller than IP_MIN_MTU! */
			ip1dbg(("Too big for packet size %d\n",
			    length));
			disable_pmtud = B_TRUE;
			mtu = ipst->ips_ip_pmtu_min;
		} else {
			mtu = icmp_frag_size_table[i];
			ip1dbg(("Calculated mtu %d, packet size %d, "
			    "before %d\n", mtu, length, old_mtu));
			if (mtu < ipst->ips_ip_pmtu_min) {
				mtu = ipst->ips_ip_pmtu_min;
				disable_pmtud = B_TRUE;
			}
		}
	}
	if (disable_pmtud)
		dce->dce_flags |= DCEF_TOO_SMALL_PMTU;
	else
		dce->dce_flags &= ~DCEF_TOO_SMALL_PMTU;

	/* A Packet Too Big can only lower the PMTU, never raise it. */
	dce->dce_pmtu = MIN(old_mtu, mtu);

	/* Prepare to send the new max frag size for the ULP. */
	icmph->icmph_du_zero = 0;
	icmph->icmph_du_mtu = htons((uint16_t)dce->dce_pmtu);
	DTRACE_PROBE4(ip4__pmtu__change, icmph_t *, icmph, dce_t *,
	    dce, int, orig_mtu, int, mtu);

	/* We now have a PMTU for sure */
	dce->dce_flags |= DCEF_PMTU;
	dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
	mutex_exit(&dce->dce_lock);

	/*
	 * After dropping the lock the new value is visible to everyone.
	 * Then we bump the generation number so any cached values reinspect
	 * the dce_t.
	 */
	dce_increment_generation(dce);
	dce_refrele(dce);
}
/*
 * Handle an ICMP error whose packet-in-error is Self-Encapsulated
 * (IP-in-IP with identical outer and inner addresses), called from
 * icmp_inbound_error_fanout_v4.  Strip the outer IP header by sliding
 * the inner header -- and everything following it -- down over the
 * outer one, so that the error can be fanned out as if the inner
 * packet had been sent directly.  Returns the modified mblk.
 */
static mblk_t *
icmp_inbound_self_encap_error_v4(mblk_t *mp, ipha_t *ipha, ipha_t *in_ipha)
{
	int	outer_hdr_len;	/* Bytes consumed by the outer IP header */
	int	remain_len;	/* Inner header plus trailing payload */

	ASSERT(mp->b_datap->db_type == M_DATA);

	/* icmp_inbound_v4 has already pulled up the whole error packet */
	ASSERT(mp->b_cont == NULL);

	outer_hdr_len = (uchar_t *)in_ipha - (uchar_t *)ipha;
	remain_len = msgdsize(mp) - ((uchar_t *)in_ipha - mp->b_rptr);

	/* Slide the inner header and its payload over the outer header. */
	bcopy((uchar_t *)in_ipha, (uchar_t *)ipha, remain_len);

	/* The message shrank by exactly the removed outer header. */
	mp->b_wptr -= outer_hdr_len;
	return (mp);
}
/*
 * Try to pass the ICMP message upstream in case the ULP cares.
 *
 * If the packet that caused the ICMP error is secure, we send
 * it to AH/ESP to make sure that the attached packet has a
 * valid association. ipha in the code below points to the
 * IP header of the packet that caused the error.
 *
 * For IPsec cases, we let the next-layer-up (which has access to
 * cached policy on the conn_t, or can query the SPD directly)
 * subtract out any IPsec overhead if they must. We therefore make no
 * adjustments here for IPsec overhead.
 *
 * IFN could have been generated locally or by some router.
 *
 * LOCAL : ire_send_wire (before calling ipsec_out_process) can call
 * icmp_frag_needed/icmp_pkt2big_v6 to generated a local IFN.
 * This happens because IP adjusted its value of MTU on an
 * earlier IFN message and could not tell the upper layer,
 * the new adjusted value of MTU e.g. Packet was encrypted
 * or there was not enough information to fanout to upper
 * layers. Thus on the next outbound datagram, ire_send_wire
 * generates the IFN, where IPsec processing has *not* been
 * done.
 *
 * Note that we retain ixa_fragsize across IPsec thus once
 * we have picking ixa_fragsize and entered ipsec_out_process we do
 * no change the fragsize even if the path MTU changes before
 * we reach ip_output_post_ipsec.
 *
 * In the local case, IRAF_LOOPBACK will be set indicating
 * that IFN was generated locally.
 *
 * ROUTER : IFN could be secure or non-secure.
 *
 * * SECURE : We use the IPSEC_IN to fanout to AH/ESP if the
 * packet in error has AH/ESP headers to validate the AH/ESP
 * headers. AH/ESP will verify whether there is a valid SA or
 * not and send it back. We will fanout again if we have more
 * data in the packet.
 *
 * If the packet in error does not have AH/ESP, we handle it
 * like any other case.
 *
 * * NON_SECURE : If the packet in error has AH/ESP headers, we send it
 * up to AH/ESP for validation. AH/ESP will verify whether there is a
 * valid SA or not and send it back. We will fanout again if
 * we have more data in the packet.
 *
 * If the packet in error does not have AH/ESP, we handle it
 * like any other case.
 *
 * The caller must have called icmp_inbound_verify_v4.
 *
 * This function consumes mp on every path, either directly (freemsg)
 * or by handing it off to a fanout routine or ULP.
 */
static void
icmp_inbound_error_fanout_v4(mblk_t *mp, icmph_t *icmph, ip_recv_attr_t *ira)
{
	uint16_t	*up;	/* Pointer to ports in ULP header */
	uint32_t	ports;	/* reversed ports for fanout */
	ipha_t		ripha;	/* With reversed addresses */
	ipha_t		*ipha;	/* Inner IP header */
	uint_t		hdr_length;	/* Inner IP header length */
	tcpha_t		*tcpha;
	conn_t		*connp;
	ill_t		*ill = ira->ira_ill;
	ip_stack_t	*ipst = ill->ill_ipst;
	ipsec_stack_t	*ipss = ipst->ips_netstack->netstack_ipsec;
	ill_t		*rill = ira->ira_rill;

	/* Caller already pulled up everything. */
	ipha = (ipha_t *)&icmph[1];
	ASSERT((uchar_t *)&ipha[1] <= mp->b_wptr);
	ASSERT(mp->b_cont == NULL);

	hdr_length = IPH_HDR_LENGTH(ipha);
	ira->ira_protocol = ipha->ipha_protocol;

	/*
	 * We need a separate IP header with the source and destination
	 * addresses reversed to do fanout/classification because the ipha in
	 * the ICMP error is in the form we sent it out.
	 */
	ripha.ipha_src = ipha->ipha_dst;
	ripha.ipha_dst = ipha->ipha_src;
	ripha.ipha_protocol = ipha->ipha_protocol;
	ripha.ipha_version_and_hdr_length = ipha->ipha_version_and_hdr_length;

	ip2dbg(("icmp_inbound_error_v4: proto %d %x to %x: %d/%d\n",
	    ripha.ipha_protocol, ntohl(ipha->ipha_src),
	    ntohl(ipha->ipha_dst),
	    icmph->icmph_type, icmph->icmph_code));

	switch (ipha->ipha_protocol) {
	case IPPROTO_UDP:
		up = (uint16_t *)((uchar_t *)ipha + hdr_length);

		/* Attempt to find a client stream based on port. */
		ip2dbg(("icmp_inbound_error_v4: UDP ports %d to %d\n",
		    ntohs(up[0]), ntohs(up[1])));

		/* Note that we send error to all matches. */
		ira->ira_flags |= IRAF_ICMP_ERROR;
		ip_fanout_udp_multi_v4(mp, &ripha, up[0], up[1], ira);
		ira->ira_flags &= ~IRAF_ICMP_ERROR;
		return;

	case IPPROTO_TCP:
		/*
		 * Find a TCP client stream for this packet.
		 * Note that we do a reverse lookup since the header is
		 * in the form we sent it out.
		 */
		tcpha = (tcpha_t *)((uchar_t *)ipha + hdr_length);
		connp = ipcl_tcp_lookup_reversed_ipv4(ipha, tcpha, TCPS_LISTEN,
		    ipst);
		if (connp == NULL)
			goto discard_pkt;

		if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) ||
		    (ira->ira_flags & IRAF_IPSEC_SECURE)) {
			mp = ipsec_check_inbound_policy(mp, connp,
			    ipha, NULL, ira);
			if (mp == NULL) {
				BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
				/* Note that mp is NULL */
				ip_drop_input("ipIfStatsInDiscards", mp, ill);
				CONN_DEC_REF(connp);
				return;
			}
		}

		ira->ira_flags |= IRAF_ICMP_ERROR;
		/*
		 * Detach the ills from ira for the duration of the
		 * hand-off to the ULP; restored below before we return.
		 */
		ira->ira_ill = ira->ira_rill = NULL;
		if (IPCL_IS_TCP(connp)) {
			SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
			    connp->conn_recvicmp, connp, ira, SQ_FILL,
			    SQTAG_TCP_INPUT_ICMP_ERR);
		} else {
			/* Not TCP; must be SOCK_RAW, IPPROTO_TCP */
			(connp->conn_recv)(connp, mp, NULL, ira);
			CONN_DEC_REF(connp);
		}
		ira->ira_ill = ill;
		ira->ira_rill = rill;
		ira->ira_flags &= ~IRAF_ICMP_ERROR;
		return;

	case IPPROTO_SCTP:
		up = (uint16_t *)((uchar_t *)ipha + hdr_length);
		/* Find a SCTP client stream for this packet. */
		((uint16_t *)&ports)[0] = up[1];
		((uint16_t *)&ports)[1] = up[0];

		ira->ira_flags |= IRAF_ICMP_ERROR;
		ip_fanout_sctp(mp, &ripha, NULL, ports, ira);
		ira->ira_flags &= ~IRAF_ICMP_ERROR;
		return;

	case IPPROTO_ESP:
	case IPPROTO_AH:
		if (!ipsec_loaded(ipss)) {
			ip_proto_not_sup(mp, ira);
			return;
		}

		/* Let AH/ESP validate the SA and strip their header. */
		if (ipha->ipha_protocol == IPPROTO_ESP)
			mp = ipsecesp_icmp_error(mp, ira);
		else
			mp = ipsecah_icmp_error(mp, ira);
		if (mp == NULL)
			return;

		/* Just in case ipsec didn't preserve the NULL b_cont */
		if (mp->b_cont != NULL) {
			if (!pullupmsg(mp, -1))
				goto discard_pkt;
		}

		/*
		 * Note that ira_pktlen and ira_ip_hdr_length are no longer
		 * correct, but we don't use them any more here.
		 *
		 * If successful, the mp has been modified to not include
		 * the ESP/AH header so we can fanout to the ULP's icmp
		 * error handler.
		 */
		if (mp->b_wptr - mp->b_rptr < IP_SIMPLE_HDR_LENGTH)
			goto truncated;

		/* Verify the modified message before any further processes. */
		ipha = (ipha_t *)mp->b_rptr;
		hdr_length = IPH_HDR_LENGTH(ipha);
		icmph = (icmph_t *)&mp->b_rptr[hdr_length];
		if (!icmp_inbound_verify_v4(mp, icmph, ira)) {
			freemsg(mp);
			return;
		}

		/* Recurse to fan out the now AH/ESP-free error. */
		icmp_inbound_error_fanout_v4(mp, icmph, ira);
		return;

	case IPPROTO_ENCAP: {
		/* Look for self-encapsulated packets that caused an error */
		ipha_t *in_ipha;

		/*
		 * Caller has verified that length has to be
		 * at least the size of IP header.
		 */
		ASSERT(hdr_length >= sizeof (ipha_t));

		/*
		 * Check the sanity of the inner IP header like
		 * we did for the outer header.
		 */
		in_ipha = (ipha_t *)((uchar_t *)ipha + hdr_length);
		if ((IPH_HDR_VERSION(in_ipha) != IPV4_VERSION)) {
			goto discard_pkt;
		}
		if (IPH_HDR_LENGTH(in_ipha) < sizeof (ipha_t)) {
			goto discard_pkt;
		}
		/* Check for Self-encapsulated tunnels */
		if (in_ipha->ipha_src == ipha->ipha_src &&
		    in_ipha->ipha_dst == ipha->ipha_dst) {

			/* Strip the outer header and re-verify. */
			mp = icmp_inbound_self_encap_error_v4(mp, ipha,
			    in_ipha);
			if (mp == NULL)
				goto discard_pkt;

			/*
			 * Just in case self_encap didn't preserve the NULL
			 * b_cont
			 */
			if (mp->b_cont != NULL) {
				if (!pullupmsg(mp, -1))
					goto discard_pkt;
			}
			/*
			 * Note that ira_pktlen and ira_ip_hdr_length are no
			 * longer correct, but we don't use them any more here.
			 */
			if (mp->b_wptr - mp->b_rptr < IP_SIMPLE_HDR_LENGTH)
				goto truncated;

			/*
			 * Verify the modified message before any further
			 * processes.
			 */
			ipha = (ipha_t *)mp->b_rptr;
			hdr_length = IPH_HDR_LENGTH(ipha);
			icmph = (icmph_t *)&mp->b_rptr[hdr_length];
			if (!icmp_inbound_verify_v4(mp, icmph, ira)) {
				freemsg(mp);
				return;
			}

			/*
			 * The packet in error is self-encapsulated.
			 * And we are finding it further encapsulated
			 * which we could not have possibly generated.
			 */
			if (ipha->ipha_protocol == IPPROTO_ENCAP) {
				goto discard_pkt;
			}
			icmp_inbound_error_fanout_v4(mp, icmph, ira);
			return;
		}
		/* No self-encapsulated */
	}
	/* FALLTHROUGH */
	case IPPROTO_IPV6:
		if ((connp = ipcl_iptun_classify_v4(&ripha.ipha_src,
		    &ripha.ipha_dst, ipst)) != NULL) {
			ira->ira_flags |= IRAF_ICMP_ERROR;
			connp->conn_recvicmp(connp, mp, NULL, ira);
			CONN_DEC_REF(connp);
			ira->ira_flags &= ~IRAF_ICMP_ERROR;
			return;
		}
		/*
		 * No IP tunnel is interested, fallthrough and see
		 * if a raw socket will want it.
		 */
		/* FALLTHROUGH */
	default:
		ira->ira_flags |= IRAF_ICMP_ERROR;
		ip_fanout_proto_v4(mp, &ripha, ira);
		ira->ira_flags &= ~IRAF_ICMP_ERROR;
		return;
	}
	/* NOTREACHED */
discard_pkt:
	BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
	ip1dbg(("icmp_inbound_error_fanout_v4: drop pkt\n"));
	ip_drop_input("ipIfStatsInDiscards", mp, ill);
	freemsg(mp);
	return;

truncated:
	/* We pulled up everything already. Must be truncated */
	BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
	ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill);
	freemsg(mp);
}
/*
 * Common IP options parser.
 *
 * Initialize *optp to walk the options carried in the IPv4 header
 * pointed to by ipha, then hand off to ipoptp_next() so the caller
 * immediately gets the first option back.
 */
uint8_t
ipoptp_first(ipoptp_t *optp, ipha_t *ipha)
{
	uint32_t	optlen;		/* combined length of all options */

	/*
	 * The header-length field counts 32-bit words; anything beyond
	 * the five-word simple header is options.
	 */
	optlen = ipha->ipha_version_and_hdr_length -
	    (uint8_t)((IP_VERSION << 4) + IP_SIMPLE_HDR_LENGTH_IN_WORDS);
	optlen <<= 2;

	optp->ipoptp_flags = 0;
	optp->ipoptp_next = (uint8_t *)(&ipha[1]);
	optp->ipoptp_end = optp->ipoptp_next + optlen;
	return (ipoptp_next(optp));
}
/*
 * Variant of ipoptp_first() for callers holding a bare option buffer
 * of totallen bytes rather than a full ipha_t.
 */
uint8_t
ipoptp_first2(ipoptp_t *optp, uint32_t totallen, uint8_t *opt)
{
	optp->ipoptp_flags = 0;
	optp->ipoptp_next = opt;
	optp->ipoptp_end = opt + totallen;
	return (ipoptp_next(optp));
}
/*
 * Common IP options parser: extract next option.
 *
 * Returns the option type, with optp->ipoptp_cur and optp->ipoptp_len
 * describing the option, or IPOPT_EOL when the options are exhausted.
 * On a malformed option, IPOPTP_ERROR is set in ipoptp_flags and either
 * the offending option type or IPOPT_EOL is returned, depending on how
 * far parsing got.
 */
uint8_t
ipoptp_next(ipoptp_t *optp)
{
	uint8_t *end = optp->ipoptp_end;
	uint8_t *cur = optp->ipoptp_next;
	uint8_t opt, len, pointer;

	/*
	 * If cur > end already, then the ipoptp_end or ipoptp_next pointer
	 * has been corrupted.
	 */
	ASSERT(cur <= end);
	if (cur == end)
		return (IPOPT_EOL);

	opt = cur[IPOPT_OPTVAL];

	/*
	 * Skip any NOP options.
	 */
	while (opt == IPOPT_NOP) {
		cur++;
		if (cur == end)
			return (IPOPT_EOL);
		opt = cur[IPOPT_OPTVAL];
	}

	if (opt == IPOPT_EOL)
		return (IPOPT_EOL);

	/*
	 * Option requiring a length.
	 */
	if ((cur + 1) >= end) {
		/* No room for the mandatory length octet. */
		optp->ipoptp_flags |= IPOPTP_ERROR;
		return (IPOPT_EOL);
	}
	len = cur[IPOPT_OLEN];
	if (len < 2) {
		/* Length must cover at least the type and length octets. */
		optp->ipoptp_flags |= IPOPTP_ERROR;
		return (IPOPT_EOL);
	}
	optp->ipoptp_cur = cur;
	optp->ipoptp_len = len;
	optp->ipoptp_next = cur + len;
	if (cur + len > end) {
		/* Option claims more bytes than the buffer holds. */
		optp->ipoptp_flags |= IPOPTP_ERROR;
		return (IPOPT_EOL);
	}

	/*
	 * For the options which require a pointer field, make sure
	 * its there, and make sure it points to either something
	 * inside this option, or the end of the option.
	 */
	switch (opt) {
	case IPOPT_RR:
	case IPOPT_TS:
	case IPOPT_LSRR:
	case IPOPT_SSRR:
		if (len <= IPOPT_OFFSET) {
			optp->ipoptp_flags |= IPOPTP_ERROR;
			return (opt);
		}
		pointer = cur[IPOPT_OFFSET];
		if (pointer - 1 > len) {
			optp->ipoptp_flags |= IPOPTP_ERROR;
			return (opt);
		}
		break;
	}

	/*
	 * Sanity check the pointer field based on the type of the
	 * option.  (pointer was set by the switch above for every
	 * option type examined here.)
	 */
	switch (opt) {
	case IPOPT_RR:
	case IPOPT_SSRR:
	case IPOPT_LSRR:
		if (pointer < IPOPT_MINOFF_SR)
			optp->ipoptp_flags |= IPOPTP_ERROR;
		break;
	case IPOPT_TS:
		if (pointer < IPOPT_MINOFF_IT)
			optp->ipoptp_flags |= IPOPTP_ERROR;
		/*
		 * Note that the Internet Timestamp option also
		 * contains two four bit fields (the Overflow field,
		 * and the Flag field), which follow the pointer
		 * field. We don't need to check that these fields
		 * fall within the length of the option because this
		 * was implicitely done above. We've checked that the
		 * pointer value is at least IPOPT_MINOFF_IT, and that
		 * it falls within the option. Since IPOPT_MINOFF_IT >
		 * IPOPT_POS_OV_FLG, we don't need the explicit check.
		 */
		ASSERT(len > IPOPT_POS_OV_FLG);
		break;
	}

	return (opt);
}
/*
 * Use the outgoing IP header to create an IP_OPTIONS option the way
 * it was passed down from the application.
 *
 * This is compatible with BSD in that it returns
 * the reverse source route with the final destination
 * as the last entry. The first 4 bytes of the option
 * will contain the final destination.
 *
 * Returns the number of bytes written to 'buf' (padded with IPOPT_EOL
 * to a multiple of 4), or 0 if the conn carries no IPv4 options or the
 * stored option length is not 32-bit aligned.
 */
int
ip_opt_get_user(conn_t *connp, uchar_t *buf)
{
	ipoptp_t	opts;
	uchar_t		*opt;
	uint8_t		optval;
	uint8_t		optlen;
	uint32_t	len = 0;
	uchar_t		*buf1 = buf;	/* Start of output; final dst slot */
	uint32_t	totallen;
	ipaddr_t	dst;
	ip_pkt_t	*ipp = &connp->conn_xmit_ipp;

	if (!(ipp->ipp_fields & IPPF_IPV4_OPTIONS))
		return (0);

	totallen = ipp->ipp_ipv4_options_len;
	if (totallen & 0x3)
		return (0);

	buf += IP_ADDR_LEN;	/* Leave room for final destination */
	len += IP_ADDR_LEN;
	bzero(buf1, IP_ADDR_LEN);

	dst = connp->conn_faddr_v4;

	for (optval = ipoptp_first2(&opts, totallen, ipp->ipp_ipv4_options);
	    optval != IPOPT_EOL;
	    optval = ipoptp_next(&opts)) {
		int	off;

		opt = opts.ipoptp_cur;
		if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
			break;
		}
		optlen = opts.ipoptp_len;
		switch (optval) {
		case IPOPT_SSRR:
		case IPOPT_LSRR:

			/*
			 * Insert destination as the first entry in the source
			 * route and move down the entries on step.
			 * The last entry gets placed at buf1.
			 */
			buf[IPOPT_OPTVAL] = optval;
			buf[IPOPT_OLEN] = optlen;
			buf[IPOPT_OFFSET] = optlen;

			off = optlen - IP_ADDR_LEN;
			if (off < 0) {
				/* No entries in source route */
				break;
			}
			/* Last entry in source route if not already set */
			if (dst == INADDR_ANY)
				bcopy(opt + off, buf1, IP_ADDR_LEN);
			off -= IP_ADDR_LEN;

			/* Shift the remaining entries down one slot. */
			while (off > 0) {
				bcopy(opt + off,
				    buf + off + IP_ADDR_LEN,
				    IP_ADDR_LEN);
				off -= IP_ADDR_LEN;
			}
			/* ipha_dst into first slot */
			bcopy(&dst, buf + off + IP_ADDR_LEN,
			    IP_ADDR_LEN);
			buf += optlen;
			len += optlen;
			break;

		default:
			/* All other options are copied through verbatim. */
			bcopy(opt, buf, optlen);
			buf += optlen;
			len += optlen;
			break;
		}
	}
	/*
	 * Removed the unused "done:" label that used to sit here; nothing
	 * in this function jumps to it.
	 */
	/* Pad the resulting options to a 32-bit boundary */
	while (len & 0x3) {
		*buf++ = IPOPT_EOL;
		len++;
	}
	return (len);
}
/*
 * Update any record route or timestamp options to include this host.
 * Reverse any source route option.
 * This routine assumes that the options are well formed i.e. that they
 * have already been checked.
 *
 * NOTE(review): despite the first sentence above, the switch below only
 * handles IPOPT_SSRR/IPOPT_LSRR; record route and timestamp options are
 * left untouched by this function.
 */
static void
icmp_options_update(ipha_t *ipha)
{
	ipoptp_t	opts;
	uchar_t		*opt;
	uint8_t		optval;
	ipaddr_t	src;		/* Our local address */
	ipaddr_t	dst;

	ip2dbg(("icmp_options_update\n"));
	src = ipha->ipha_src;
	dst = ipha->ipha_dst;

	for (optval = ipoptp_first(&opts, ipha);
	    optval != IPOPT_EOL;
	    optval = ipoptp_next(&opts)) {
		ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0);
		opt = opts.ipoptp_cur;
		ip2dbg(("icmp_options_update: opt %d, len %d\n",
		    optval, opts.ipoptp_len));
		switch (optval) {
			int off1, off2;
		case IPOPT_SSRR:
		case IPOPT_LSRR:
			/*
			 * Reverse the source route.  The first entry
			 * should be the next to last one in the current
			 * source route (the last entry is our address).
			 * The last entry should be the final destination.
			 */
			off1 = IPOPT_MINOFF_SR - 1;
			off2 = opt[IPOPT_OFFSET] - IP_ADDR_LEN - 1;
			if (off2 < 0) {
				/* No entries in source route */
				ip1dbg((
				    "icmp_options_update: bad src route\n"));
				break;
			}
			/* Exchange the last hop with ipha_dst via 'dst'. */
			bcopy((char *)opt + off2, &dst, IP_ADDR_LEN);
			bcopy(&ipha->ipha_dst, (char *)opt + off2, IP_ADDR_LEN);
			bcopy(&dst, &ipha->ipha_dst, IP_ADDR_LEN);
			off2 -= IP_ADDR_LEN;
			/* Reverse the remaining entries in place. */
			while (off1 < off2) {
				bcopy((char *)opt + off1, &src, IP_ADDR_LEN);
				bcopy((char *)opt + off2, (char *)opt + off1,
				    IP_ADDR_LEN);
				bcopy(&src, (char *)opt + off2, IP_ADDR_LEN);
				off1 += IP_ADDR_LEN;
				off2 -= IP_ADDR_LEN;
			}
			/* Rewind the pointer to the first hop. */
			opt[IPOPT_OFFSET] = IPOPT_MINOFF_SR;
			break;
		}
	}
}
/*
* Process received ICMP Redirect messages.
* Assumes the caller has verified that the headers are in the pulled up mblk.
* Consumes mp.
*/
static void
icmp_redirect_v4(mblk_t *mp, ipha_t *ipha, icmph_t *icmph, ip_recv_attr_t *ira)
{
ire_t *ire, *nire;
ire_t *prev_ire;
ipaddr_t src, dst, gateway;
ip_stack_t *ipst = ira->ira_ill->ill_ipst;
ipha_t *inner_ipha; /* Inner IP header */
/* Caller already pulled up everything. */
inner_ipha = (ipha_t *)&icmph[1];
src = ipha->ipha_src;
dst = inner_ipha->ipha_dst;
gateway = icmph->icmph_rd_gateway;
/* Make sure the new gateway is reachable somehow. */
ire = ire_ftable_lookup_v4(gateway, 0, 0, IRE_ONLINK, NULL,
ALL_ZONES, NULL, MATCH_IRE_TYPE, 0, ipst, NULL);
/*
* Make sure we had a route for the dest in question and that
* that route was pointing to the old gateway (the source of the
* redirect packet.)
* We do longest match and then compare ire_gateway_addr below.
*/
prev_ire = ire_ftable_lookup_v4(dst, 0, 0, 0, NULL, ALL_ZONES,
NULL, MATCH_IRE_DSTONLY, 0, ipst, NULL);
/*
* Check that
* the redirect was not from ourselves
* the new gateway and the old gateway are directly reachable
*/
if (prev_ire == NULL || ire == NULL ||
(prev_ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK)) ||
(prev_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
!(ire->ire_type & IRE_IF_ALL) ||
prev_ire->ire_gateway_addr != src) {
BUMP_MIB(&ipst->ips_icmp_mib, icmpInBadRedirects);
ip_drop_input("icmpInBadRedirects - ire", mp, ira->ira_ill);
freemsg(mp);
if (ire != NULL)
ire_refrele(ire);
if (prev_ire != NULL)
ire_refrele(prev_ire);
return;
}
ire_refrele(prev_ire);
ire_refrele(ire);
/*
* TODO: more precise handling for cases 0, 2, 3, the latter two
* require TOS routing
*/
switch (icmph->icmph_code) {
case 0:
case 1:
/* TODO: TOS specificity for cases 2 and 3 */
case 2:
case 3:
break;
default:
BUMP_MIB(&ipst->ips_icmp_mib, icmpInBadRedirects);
ip_drop_input("icmpInBadRedirects - code", mp, ira->ira_ill);
freemsg(mp);
return;
}
/*
* Create a Route Association. This will allow us to remember that
* someone we believe told us to use the particular gateway.
*/
ire = ire_create(
(uchar_t *)&dst, /* dest addr */
(uchar_t *)&ip_g_all_ones, /* mask */
(uchar_t *)&gateway, /* gateway addr */
IRE_HOST,
NULL, /* ill */
ALL_ZONES,
(RTF_DYNAMIC | RTF_GATEWAY | RTF_HOST),
NULL, /* tsol_gc_t */
ipst);
if (ire == NULL) {
freemsg(mp);
return;
}
nire = ire_add(ire);
/* Check if it was a duplicate entry */
if (nire != NULL && nire != ire) {
ASSERT(nire->ire_identical_ref > 1);
ire_delete(nire);
ire_refrele(nire);
nire = NULL;
}
ire = nire;
if (ire != NULL) {
ire_refrele(ire); /* Held in ire_add */
/* tell routing sockets that we received a redirect */
ip_rts_change(RTM_REDIRECT, dst, gateway, IP_HOST_MASK, 0, src,
(RTF_DYNAMIC | RTF_GATEWAY | RTF_HOST),