blob: 83e13a8bd12c06156d791b733707cb3ca4a4fa52 [file] [log] [blame]
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1990 Mentat Inc. */
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/types.h>
#include <sys/stream.h>
#include <sys/dlpi.h>
#include <sys/stropts.h>
#include <sys/sysmacros.h>
#include <sys/strsubr.h>
#include <sys/strlog.h>
#include <sys/strsun.h>
#include <sys/zone.h>
#define _SUN_TPI_VERSION 2
#include <sys/tihdr.h>
#include <sys/xti_inet.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/kobj.h>
#include <sys/modctl.h>
#include <sys/atomic.h>
#include <sys/policy.h>
#include <sys/priv.h>
#include <sys/systm.h>
#include <sys/param.h>
#include <sys/kmem.h>
#include <sys/sdt.h>
#include <sys/socket.h>
#include <sys/vtrace.h>
#include <sys/isa_defs.h>
#include <net/if.h>
#include <net/if_arp.h>
#include <net/route.h>
#include <sys/sockio.h>
#include <netinet/in.h>
#include <net/if_dl.h>
#include <inet/common.h>
#include <inet/mi.h>
#include <inet/mib2.h>
#include <inet/nd.h>
#include <inet/arp.h>
#include <inet/snmpcom.h>
#include <inet/kstatcom.h>
#include <netinet/igmp_var.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet/sctp.h>
#include <inet/ip.h>
#include <inet/ip_impl.h>
#include <inet/ip6.h>
#include <inet/ip6_asp.h>
#include <inet/tcp.h>
#include <inet/tcp_impl.h>
#include <inet/ip_multi.h>
#include <inet/ip_if.h>
#include <inet/ip_ire.h>
#include <inet/ip_ftable.h>
#include <inet/ip_rts.h>
#include <inet/optcom.h>
#include <inet/ip_ndp.h>
#include <inet/ip_listutils.h>
#include <netinet/igmp.h>
#include <netinet/ip_mroute.h>
#include <inet/ipp_common.h>
#include <net/pfkeyv2.h>
#include <inet/ipsec_info.h>
#include <inet/sadb.h>
#include <inet/ipsec_impl.h>
#include <sys/iphada.h>
#include <inet/tun.h>
#include <inet/ipdrop.h>
#include <inet/ip_netinfo.h>
#include <sys/ethernet.h>
#include <net/if_types.h>
#include <sys/cpuvar.h>
#include <ipp/ipp.h>
#include <ipp/ipp_impl.h>
#include <ipp/ipgpc/ipgpc.h>
#include <sys/multidata.h>
#include <sys/pattr.h>
#include <inet/ipclassifier.h>
#include <inet/sctp_ip.h>
#include <inet/sctp/sctp_impl.h>
#include <inet/udp_impl.h>
#include <sys/sunddi.h>
#include <sys/tsol/label.h>
#include <sys/tsol/tnet.h>
#include <rpc/pmap_prot.h>
/*
* Values for squeue switch (each name is listed with the squeue routine
* it is paired with):
* IP_SQUEUE_ENTER_NODRAIN: squeue_enter_nodrain
* IP_SQUEUE_ENTER: squeue_enter
* IP_SQUEUE_FILL: squeue_fill
*
* NOTE(review): the initializer below is the literal 2; which IP_SQUEUE_*
* name that corresponds to is not visible in this file -- confirm against
* the header that defines them.
*/
int ip_squeue_enter = 2; /* Settable in /etc/system */
/*
* Squeue function used for IP input.
* NOTE(review): presumably selected from ip_squeue_enter through
* ip_squeue_switch(), which returns a squeue_func_t -- TODO confirm.
*/
squeue_func_t ip_input_proc;
/*
* Convert the value x to an mblk_t pointer, casting through uintptr_t.
* NOTE(review): the name suggests the result is stored as a flag in a
* b_prev field rather than dereferenced -- confirm at the call sites.
*/
#define SET_BPREV_FLAG(x) ((mblk_t *)(uintptr_t)(x))
/*
* Name strings for the TCP and SCTP devices ("tcp", "tcp6", "sctp",
* "sctp6") and one major_t variable per name.
* NOTE(review): presumably each *_MAJ is filled in with the major number
* looked up from the matching name during initialization -- that code is
* not visible in this part of the file.
*/
#define TCP6 "tcp6"
#define TCP "tcp"
#define SCTP "sctp"
#define SCTP6 "sctp6"
major_t TCP6_MAJ;
major_t TCP_MAJ;
major_t SCTP_MAJ;
major_t SCTP6_MAJ;
/*
* Settable in /etc/system
*
* NOTE(review): the _ms suffix suggests millisecond values and _ticks a
* clock-tick value (it starts at 0, so it is presumably computed at run
* time) -- confirm where these are consumed.
*/
int ip_poll_normal_ms = 100;
int ip_poll_normal_ticks = 0;
int ip_modclose_ackwait_ms = 3000;
/*
* Head/tail pair describing a linked list of msgblks.
* Used by the ip_snmp_ functions.
*/
typedef struct listptr_s {
	mblk_t *lp_head;	/* first msgblk of the list */
	mblk_t *lp_tail;	/* last msgblk of the list */
} listptr_t;
/*
* This is used by ip_snmp_get_mib2_ip_route_media and
* ip_snmp_get_mib2_ip6_route_media to carry the lists of return data.
* It is also the second argument type of ip_snmp_get2_v4,
* ip_snmp_get2_v6_route and ip_snmp_get2_v6_media, whose prototypes
* appear further down in this file.
*/
typedef struct iproutedata_s {
/* NOTE(review): the meaning of ird_idx is not visible here -- confirm */
uint_t ird_idx;
listptr_t ird_route; /* ipRouteEntryTable */
listptr_t ird_netmedia; /* ipNetToMediaEntryTable */
listptr_t ird_attrs; /* ipRouteAttributeTable */
} iproutedata_t;
/*
* Cluster specific hooks. These should be NULL when booted as a
* non-clustered system.
*/
/*
* Hook functions to enable cluster networking.
* On non-clustered systems these vectors must always be NULL.
*
* Hook function to check whether the specified IP address is a shared
* IP address in the cluster. It takes the protocol, the address family
* and a pointer to the address bytes (laddrp). Initialized to NULL here.
*/
int (*cl_inet_isclusterwide)(uint8_t protocol,
sa_family_t addr_family, uint8_t *laddrp) = NULL;
/*
* Hook function to generate a cluster-wide IP fragment identifier.
* It takes the protocol, the address family and two address pointers
* (laddrp, faddrp) and returns a uint32_t. Initialized to NULL here.
*/
uint32_t (*cl_inet_ipident)(uint8_t protocol, sa_family_t addr_family,
uint8_t *laddrp, uint8_t *faddrp) = NULL;
/*
* Synchronization notes:
*
* IP is a fully D_MP STREAMS module/driver. Thus it does not depend on any
* MT level protection given by STREAMS. IP uses a combination of its own
* internal serialization mechanism and standard Solaris locking techniques.
* The internal serialization is per phyint (no IPMP) or per IPMP group.
* This is used to serialize plumbing operations, IPMP operations, certain
* multicast operations, most set ioctls, igmp/mld timers etc.
*
* Plumbing is a long sequence of operations involving message
* exchanges between IP, ARP and device drivers. Many set ioctls are typically
* involved in plumbing operations. A natural model is to serialize these
* ioctls one per ill. For example plumbing of hme0 and qfe0 can go on in
* parallel without any interference. But various set ioctls on hme0 are best
* serialized. However if the system uses IPMP, the operations are easier if
* they are serialized on a per IPMP group basis since IPMP operations
* happen across ill's of a group. Thus the lowest common denominator is to
* serialize most set ioctls, multicast join/leave operations, IPMP operations
* igmp/mld timer operations, and processing of DLPI control messages received
* from drivers on a per IPMP group basis. If the system does not employ
* IPMP the serialization is on a per phyint basis. This serialization is
* provided by the ipsq_t and primitives operating on this. Details can
* be found in ip_if.c above the core primitives operating on ipsq_t.
*
* Lookups of an ipif or ill by a thread return a refheld ipif / ill.
* Similarly, lookup of an ire by a thread also returns a refheld ire.
* In addition ipif's and ill's referenced by the ire are also indirectly
* refheld. Thus no ipif or ill can vanish nor can critical parameters like
* the ipif's address or netmask change as long as an ipif is refheld
* directly or indirectly. For example an SIOCLIFADDR ioctl that changes the
* address of an ipif has to go through the ipsq_t. This ensures that only
* 1 such exclusive operation proceeds at any time on the ipif. It then
* deletes all ires associated with this ipif, and waits for all refcnts
* associated with this ipif to come down to zero. The address is changed
* only after the ipif has been quiesced. Then the ipif is brought up again.
* More details are described above the comment in ip_sioctl_flags.
*
* Packet processing is based mostly on IREs and is fully multi-threaded
* using standard Solaris MT techniques.
*
* There are explicit locks in IP to handle:
* - The ip_g_head list maintained by mi_open_link() and friends.
*
* - The reassembly data structures (one lock per hash bucket)
*
* - conn_lock is meant to protect conn_t fields. The fields actually
* protected by conn_lock are documented in the conn_t definition.
*
* - ire_lock to protect some of the fields of the ire, IRE tables
* (one lock per hash bucket). Refer to ip_ire.c for details.
*
* - ndp_g_lock and nce_lock for protecting NCEs.
*
* - ill_lock protects fields of the ill and ipif. Details in ip.h
*
* - ill_g_lock: This is a global reader/writer lock. Protects the following
* * The AVL tree based global multi list of all ills.
* * The linked list of all ipifs of an ill
* * The <ill-ipsq> mapping
* * The ipsq->ipsq_phyint_list threaded by phyint_ipsq_next
* * The illgroup list threaded by ill_group_next.
* * <ill-phyint> association
* Insertion/deletion of an ill in the system, insertion/deletion of an ipif
* into an ill, changing the <ill-ipsq> mapping of an ill, insertion/deletion
* of an ill into the illgrp list, changing the <ill-phyint> assoc of an ill
* will all have to hold the ill_g_lock as writer for the actual duration
* of the insertion/deletion/change. More details about the <ill-ipsq> mapping
* may be found in the IPMP section.
*
* - ill_lock: This is a per ill mutex.
* It protects some members of the ill and is documented below.
* It also protects the <ill-ipsq> mapping
* It also protects the illgroup list threaded by ill_group_next.
* It also protects the <ill-phyint> assoc.
* It also protects the list of ipifs hanging off the ill.
*
* - ipsq_lock: This is a per ipsq_t mutex lock.
* This protects all the other members of the ipsq struct except
* ipsq_refs and ipsq_phyint_list which are protected by ill_g_lock
*
* - illgrp_lock: This is a per ill_group mutex lock.
* The only thing it protects is the illgrp_ill_schednext member of ill_group
* which dictates which is the next ill in an ill_group that is to be chosen
* for sending outgoing packets, through creation of an IRE_CACHE that
* references this ill.
*
* - phyint_lock: This is a per phyint mutex lock. Protects just the
* phyint_flags
*
* - ip_g_nd_lock: This is a global reader/writer lock.
* Any call to nd_load to load a new parameter to the ND table must hold the
* lock as writer. ND_GET/ND_SET routines that read the ND table hold the lock
* as reader.
*
* - ip_addr_avail_lock: This is used to ensure the uniqueness of IP addresses.
* This lock is held in ipif_up_done so that marking the ipif IPIF_UP and the
* uniqueness check are done atomically.
*
* - ipsec_capab_ills_lock: This readers/writer lock protects the global
* lists of IPsec capable ills (ipsec_capab_ills_{ah,esp}). It is taken
* as a writer when adding or deleting elements from these lists, and
* as a reader when walking these lists to send a SADB update to the
* IPsec capable ills.
*
* - ill_g_usesrc_lock: This readers/writer lock protects the usesrc
* group list linked by ill_usesrc_grp_next. It also protects the
* ill_usesrc_ifindex field. It is taken as a writer when a member of the
* group is being added or deleted. This lock is taken as a reader when
* walking the list/group(eg: to get the number of members in a usesrc group).
* Note, it is only necessary to take this lock if the ill_usesrc_grp_next
* field is changing state i.e from NULL to non-NULL or vice-versa. For
* example, it is not necessary to take this lock in the initial portion
* of ip_sioctl_slifusesrc or at all in ip_sioctl_groupname and
* ip_sioctl_flags since these operations are executed exclusively and
* that ensures that the "usesrc group state" cannot change. The "usesrc
* group state" change can happen only in the latter part of
* ip_sioctl_slifusesrc and in ill_delete.
*
* Changing <ill-phyint>, <ill-ipsq>, <ill-illgroup> associations.
*
* To change the <ill-phyint> association, the ill_g_lock must be held
* as writer, and the ill_locks of both the v4 and v6 instance of the ill
* must be held.
*
* To change the <ill-ipsq> association the ill_g_lock must be held as writer
* and the ill_lock of the ill in question must be held.
*
* To change the <ill-illgroup> association the ill_g_lock must be held as
* writer and the ill_lock of the ill in question must be held.
*
* To add or delete an ipif from the list of ipifs hanging off the ill,
* ill_g_lock (writer) and ill_lock must be held and the thread must be
* a writer on the associated ipsq.
*
* To add or delete an ill to the system, the ill_g_lock must be held as
* writer and the thread must be a writer on the associated ipsq.
*
* To add or delete an ilm to an ill, the ill_lock must be held and the thread
* must be a writer on the associated ipsq.
*
* Lock hierarchy
*
* Some lock hierarchy scenarios are listed below.
*
* ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock
* ill_g_lock -> illgrp_lock -> ill_lock
* ill_g_lock -> ill_lock(s) -> phyint_lock
* ill_g_lock -> ndp_g_lock -> ill_lock -> nce_lock
* ill_g_lock -> ip_addr_avail_lock
* conn_lock -> irb_lock -> ill_lock -> ire_lock
* ill_g_lock -> ip_g_nd_lock
*
* When more than 1 ill lock is needed to be held, all ill lock addresses
* are sorted on address and locked starting from highest addressed lock
* downward.
*
* Mobile-IP scenarios
*
* irb_lock -> ill_lock -> ire_mrtun_lock
* irb_lock -> ill_lock -> ire_srcif_table_lock
*
* IPsec scenarios
*
* ipsa_lock -> ill_g_lock -> ill_lock
* ipsec_capab_ills_lock -> ill_g_lock -> ill_lock
* ipsec_capab_ills_lock -> ipsa_lock
* ill_g_usesrc_lock -> ill_g_lock -> ill_lock
*
* Trusted Solaris scenarios
*
* igsa_lock -> gcgrp_rwlock -> gcgrp_lock
* igsa_lock -> gcdb_lock
* gcgrp_rwlock -> ire_lock
* gcgrp_rwlock -> gcdb_lock
*
*
* Routing/forwarding table locking notes:
*
* Lock acquisition order: Radix tree lock, irb_lock.
* Requirements:
* i. Walker must not hold any locks during the walker callback.
* ii Walker must not see a truncated tree during the walk because of any node
* deletion.
* iii Existing code assumes ire_bucket is valid if it is non-null and is used
* in many places in the code to walk the irb list. Thus even if all the
* ires in a bucket have been deleted, we still can't free the radix node
* until the ires have actually been inactive'd (freed).
*
* Tree traversal - Need to hold the global tree lock in read mode.
* Before dropping the global tree lock, need to either increment the ire_refcnt
* to ensure that the radix node can't be deleted.
*
* Tree add - Need to hold the global tree lock in write mode to add a
* radix node. To prevent the node from being deleted, increment the
* irb_refcnt, after the node is added to the tree. The ire itself is
* added later while holding the irb_lock, but not the tree lock.
*
* Tree delete - Need to hold the global tree lock and irb_lock in write mode.
* All associated ires must be inactive (i.e. freed), and irb_refcnt
* must be zero.
*
* Walker - Increment irb_refcnt before calling the walker callback. Hold the
* global tree lock (read mode) for traversal.
*
* IPSEC notes :
*
* IP interacts with the IPSEC code (AH/ESP) by tagging a M_CTL message
* in front of the actual packet. For outbound datagrams, the M_CTL
* contains a ipsec_out_t (defined in ipsec_info.h), which has the
* information used by the IPSEC code for applying the right level of
* protection. The information initialized by IP in the ipsec_out_t
* is determined by the per-socket policy or global policy in the system.
* For inbound datagrams, the M_CTL contains a ipsec_in_t (defined in
* ipsec_info.h) which starts out with nothing in it. It gets filled
* with the right information if it goes through the AH/ESP code, which
* happens if the incoming packet is secure. The information initialized
* by AH/ESP, is later used by IP(during fanouts to ULP) to see whether
* the policy requirements needed by per-socket policy or global policy
* is met or not.
*
* If there is both per-socket policy (set using setsockopt) and there
* is also global policy match for the 5 tuples of the socket,
* ipsec_override_policy() makes the decision of which one to use.
*
* For fully connected sockets i.e dst, src [addr, port] is known,
* conn_policy_cached is set indicating that policy has been cached.
* conn_in_enforce_policy may or may not be set depending on whether
* there is a global policy match or per-socket policy match.
* Policy inheriting happens in ip_bind during the ipa_conn_t bind.
* Once the right policy is set on the conn_t, policy cannot change for
* this socket. This makes life simpler for TCP (UDP ?) where
* re-transmissions go out with the same policy. For symmetry, policy
* is cached for fully connected UDP sockets also. Thus if policy is cached,
* it also implies that policy is latched i.e policy cannot change
* on these sockets. As we have the right policy on the conn, we don't
* have to lookup global policy for every outbound and inbound datagram
* and thus serving as an optimization. Note that a global policy change
* does not affect fully connected sockets if they have policy. If fully
* connected sockets did not have any policy associated with it, global
* policy change may affect them.
*
* IP Flow control notes:
*
* Non-TCP streams are flow controlled by IP. On the send side, if the packet
* cannot be sent down to the driver by IP, because of a canput failure, IP
* does a putq on the conn_wq. This will cause ip_wsrv to run on the conn_wq.
* ip_wsrv in turn, inserts the conn in a list of conn's that need to be drained
* when the flowcontrol condition subsides. Ultimately STREAMS backenables the
* ip_wsrv on the IP module, which in turn does a qenable of the conn_wq of the
* first conn in the list of conn's to be drained. ip_wsrv on this conn drains
* the queued messages, and removes the conn from the drain list, if all
* messages were drained. It also qenables the next conn in the drain list to
* continue the drain process.
*
* In reality the drain list is not a single list, but a configurable number
* of lists. The ip_wsrv on the IP module, qenables the first conn in each
* list. If the ip_wsrv of the next qenabled conn does not run, because the
* stream closes, ip_close takes responsibility to qenable the next conn in
* the drain list. The directly called ip_wput path always does a putq, if
* it cannot putnext. Thus synchronization problems are handled between
* ip_wsrv and ip_close. conn_drain_insert and conn_drain_tail are the only
* functions that manipulate this drain list. Furthermore conn_drain_insert
* is called only from ip_wsrv, and there can be only 1 instance of ip_wsrv
* running on a queue at any time. conn_drain_tail can be simultaneously called
* from both ip_wsrv and ip_close.
*
* IPQOS notes:
*
* IPQoS Policies are applied to packets using IPPF (IP Policy framework)
* and IPQoS modules. IPPF includes hooks in IP at different control points
* (callout positions) which direct packets to IPQoS modules for policy
* processing. Policies, if present, are global.
*
* The callout positions are located in the following paths:
* o local_in (packets destined for this host)
* o local_out (packets originating from this host)
* o fwd_in (packets forwarded by this m/c - inbound)
* o fwd_out (packets forwarded by this m/c - outbound)
* Hooks at these callout points can be enabled/disabled using the ndd variable
* ip_policy_mask (a bit mask with the 4 LSB indicating the callout positions).
* By default all the callout positions are enabled.
*
* Outbound (local_out)
* Hooks are placed in ip_wput_ire and ipsec_out_process.
*
* Inbound (local_in)
* Hooks are placed in ip_proto_input, icmp_inbound, ip_fanout_proto and
* TCP and UDP fanout routines.
*
* Forwarding (in and out)
* Hooks are placed in ip_rput_forward and ip_mrtun_forward.
*
* IP Policy Framework processing (IPPF processing)
* Policy processing for a packet is initiated by ip_process, which ascertains
* that the classifier (ipgpc) is loaded and configured, failing which the
* packet resumes normal processing in IP. If the classifier is present, the
* packet is acted upon by one or more IPQoS modules (action instances), per
* filters configured in ipgpc and resumes normal IP processing thereafter.
* An action instance can drop a packet in course of its processing.
*
* A boolean variable, ip_policy, is used in all the fanout routines that can
* invoke ip_process for a packet. This variable indicates if the packet should
* be sent for policy processing. The variable is set to B_TRUE by default,
* i.e. when the routines are invoked in the normal ip processing path for a
* packet. The two exceptions being ip_wput_local and icmp_inbound_error_fanout;
* ip_policy is set to B_FALSE for all the routines called in these two
* functions because, in the former case, we don't process loopback traffic
* currently while in the latter, the packets have already been processed in
* icmp_inbound.
*
* Zones notes:
*
* The partitioning rules for networking are as follows:
* 1) Packets coming from a zone must have a source address belonging to that
* zone.
* 2) Packets coming from a zone can only be sent on a physical interface on
* which the zone has an IP address.
* 3) Between two zones on the same machine, packet delivery is only allowed if
* there's a matching route for the destination and zone in the forwarding
* table.
* 4) The TCP and UDP port spaces are per-zone; that is, two processes in
* different zones can bind to the same port with the wildcard address
* (INADDR_ANY).
*
* The granularity of interface partitioning is at the logical interface level.
* Therefore, every zone has its own IP addresses, and incoming packets can be
* attributed to a zone unambiguously. A logical interface is placed into a zone
* using the SIOCSLIFZONE ioctl; this sets the ipif_zoneid field in the ipif_t
* structure. Rule (1) is implemented by modifying the source address selection
* algorithm so that the list of eligible addresses is filtered based on the
* sending process zone.
*
* The Internet Routing Entries (IREs) are either exclusive to a zone or shared
* across all zones, depending on their type. Here is the break-up:
*
* IRE type Shared/exclusive
* -------- ----------------
* IRE_BROADCAST Exclusive
* IRE_DEFAULT (default routes) Shared (*)
* IRE_LOCAL Exclusive (x)
* IRE_LOOPBACK Exclusive
* IRE_PREFIX (net routes) Shared (*)
* IRE_CACHE Exclusive
* IRE_IF_NORESOLVER (interface routes) Exclusive
* IRE_IF_RESOLVER (interface routes) Exclusive
* IRE_HOST (host routes) Shared (*)
*
* (*) A zone can only use a default or off-subnet route if the gateway is
* directly reachable from the zone, that is, if the gateway's address matches
* one of the zone's logical interfaces.
*
* (x) IRE_LOCAL are handled a bit differently, since for all other entries
* in ire_ctable and IRE_INTERFACE, ire_src_addr is what can be used as source
* when sending packets using the IRE. For IRE_LOCAL ire_src_addr is the IP
* address of the zone itself (the destination). Since IRE_LOCAL is used
* for communication between zones, ip_wput_ire has special logic to set
* the right source address when sending using an IRE_LOCAL.
*
* Furthermore, when ip_restrict_interzone_loopback is set (the default),
* ire_cache_lookup restricts loopback using an IRE_LOCAL
* between zones to the case when L2 would have conceptually looped the packet
* back, i.e. the loopback which is required since neither Ethernet drivers
* nor Ethernet hardware loops them back. This is the case when the normal
* routes (ignoring IREs with different zoneids) would send out the packet on
* the same ill (or ill group) as the ill with which the IRE_LOCAL is
* associated.
*
* Multiple zones can share a common broadcast address; typically all zones
* share the 255.255.255.255 address. Incoming as well as locally originated
* broadcast packets must be dispatched to all the zones on the broadcast
* network. For directed broadcasts (e.g. 10.16.72.255) this is not trivial
* since some zones may not be on the 10.16.72/24 network. To handle this, each
* zone has its own set of IRE_BROADCAST entries; then, broadcast packets are
* sent to every zone that has an IRE_BROADCAST entry for the destination
* address on the input ill, see conn_wantpacket().
*
* Applications in different zones can join the same multicast group address.
* For IPv4, group memberships are per-logical interface, so they're already
* inherently part of a zone. For IPv6, group memberships are per-physical
* interface, so we distinguish IPv6 group memberships based on group address,
* interface and zoneid. In both cases, received multicast packets are sent to
* every zone for which a group membership entry exists. On IPv6 we need to
* check that the target zone still has an address on the receiving physical
* interface; it could have been removed since the application issued the
* IPV6_JOIN_GROUP.
*/
/*
* Squeue Fanout flags:
* 0: No fanout.
* 1: Fanout across all squeues
*
* NOTE(review): the variable is declared boolean_t but initialized with
* the literal 0, matching the 0/1 values documented above.
*/
boolean_t ip_squeue_fanout = 0;
/*
* Maximum dups allowed per packet.
* NOTE(review): going by the variable name, "dups" means duplicate
* fragments seen for one packet -- confirm in the reassembly code.
*/
uint_t ip_max_frag_dups = 10;
/*
* True when the ipha_version_and_hdr_length field of the header pointed
* to by ipha equals IP_SIMPLE_HDR_VERSION.
* NOTE(review): presumably that value denotes a header without IP
* options -- confirm against the definition of IP_SIMPLE_HDR_VERSION.
*/
#define IS_SIMPLE_IPH(ipha) \
((ipha)->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION)
/* RFC1122 Conformance: the forwarding default is IP_FORWARD_NEVER. */
#define IP_FORWARD_DEFAULT IP_FORWARD_NEVER
/* Maximum ill name length; defined as the same value as LIFNAMSIZ. */
#define ILL_MAX_NAMELEN LIFNAMSIZ
/*
* Forward declarations of file-local (static) routines: three conn/output
* helpers followed by the icmp_* routines.
*/
static int conn_set_held_ipif(conn_t *, ipif_t **, ipif_t *);
static mblk_t *ip_wput_attach_llhdr(mblk_t *, ire_t *, ip_proc_t, uint32_t);
static void ip_ipsec_out_prepend(mblk_t *, mblk_t *, ill_t *);
/* icmp_* routines; every one of them is static to this file. */
static void icmp_frag_needed(queue_t *, mblk_t *, int, zoneid_t,
ip_stack_t *);
static void icmp_inbound(queue_t *, mblk_t *, boolean_t, ill_t *, int,
uint32_t, boolean_t, boolean_t, ill_t *, zoneid_t);
static ipaddr_t icmp_get_nexthop_addr(ipha_t *, ill_t *, zoneid_t, mblk_t *mp);
static boolean_t icmp_inbound_too_big(icmph_t *, ipha_t *, ill_t *, zoneid_t,
mblk_t *, int, ip_stack_t *);
static void icmp_inbound_error_fanout(queue_t *, ill_t *, mblk_t *,
icmph_t *, ipha_t *, int, int, boolean_t, boolean_t,
ill_t *, zoneid_t);
static void icmp_options_update(ipha_t *);
static void icmp_param_problem(queue_t *, mblk_t *, uint8_t, zoneid_t,
ip_stack_t *);
static void icmp_pkt(queue_t *, mblk_t *, void *, size_t, boolean_t,
zoneid_t zoneid, ip_stack_t *);
static mblk_t *icmp_pkt_err_ok(mblk_t *, ip_stack_t *);
static void icmp_redirect(ill_t *, mblk_t *);
static void icmp_send_redirect(queue_t *, mblk_t *, ipaddr_t,
ip_stack_t *);
/*
* Forward declarations of general ip_* routines. The ones declared
* without `static` (ip_dlpi_alloc, ip_dot_addr, ip_carve_mp, ip_close,
* ip_net_mask, ip_newroute, ip_nv_lookup, ip_rput, ip_rput_forward) have
* external linkage; the rest are private to this file.
*/
static void ip_arp_news(queue_t *, mblk_t *);
static boolean_t ip_bind_insert_ire(mblk_t *, ire_t *, iulp_t *,
ip_stack_t *);
mblk_t *ip_dlpi_alloc(size_t, t_uscalar_t);
char *ip_dot_addr(ipaddr_t, char *);
mblk_t *ip_carve_mp(mblk_t **, ssize_t);
int ip_close(queue_t *, int);
static char *ip_dot_saddr(uchar_t *, char *);
/* Fanout routines (generic protocol, TCP, UDP). */
static void ip_fanout_proto(queue_t *, mblk_t *, ill_t *, ipha_t *, uint_t,
boolean_t, boolean_t, ill_t *, zoneid_t);
static void ip_fanout_tcp(queue_t *, mblk_t *, ill_t *, ipha_t *, uint_t,
boolean_t, boolean_t, zoneid_t);
static void ip_fanout_udp(queue_t *, mblk_t *, ill_t *, ipha_t *, uint32_t,
boolean_t, uint_t, boolean_t, boolean_t, ill_t *, zoneid_t);
static void ip_lrput(queue_t *, mblk_t *);
static void ip_mrtun_forward(ire_t *, ill_t *, mblk_t *);
ipaddr_t ip_net_mask(ipaddr_t);
void ip_newroute(queue_t *, mblk_t *, ipaddr_t, ill_t *, conn_t *,
zoneid_t, ip_stack_t *);
static void ip_newroute_ipif(queue_t *, mblk_t *, ipif_t *, ipaddr_t,
conn_t *, uint32_t, zoneid_t, ip_opt_info_t *);
char *ip_nv_lookup(nv_t *, int);
static boolean_t ip_check_for_ipsec_opt(queue_t *, mblk_t *);
/*
* ip_param_* routines.
* NOTE(review): the signatures look like named-dispatch (ndd) get/set
* callbacks for the parameter table defined later in this file -- confirm.
*/
static int ip_param_get(queue_t *, mblk_t *, caddr_t, cred_t *);
static int ip_param_generic_get(queue_t *, mblk_t *, caddr_t, cred_t *);
static boolean_t ip_param_register(IDP *ndp, ipparam_t *, size_t,
ipndp_t *, size_t);
static int ip_param_set(queue_t *, mblk_t *, char *, caddr_t, cred_t *);
/* ip_rput* routines. */
void ip_rput(queue_t *, mblk_t *);
static void ip_rput_dlpi_writer(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp,
void *dummy_arg);
void ip_rput_forward(ire_t *, ipha_t *, mblk_t *, ill_t *);
static int ip_rput_forward_options(mblk_t *, ipha_t *, ire_t *,
ip_stack_t *);
static boolean_t ip_rput_local_options(queue_t *, mblk_t *, ipha_t *,
ire_t *, ip_stack_t *);
static boolean_t ip_rput_multimblk_ipoptions(queue_t *, ill_t *,
mblk_t *, ipha_t **, ipaddr_t *, ip_stack_t *);
static int ip_rput_options(queue_t *, mblk_t *, ipha_t *, ipaddr_t *,
ip_stack_t *);
static boolean_t ip_rput_fragment(queue_t *, mblk_t **, ipha_t *, uint32_t *,
uint16_t *);
/*
* Forward declarations of the ip_snmp_* routines. ip_snmp_get and
* ip_snmp_set have external linkage; all the others are static.
* Every ip_snmp_get_mib2_* routine takes a queue_t * and an mblk_t *,
* ends with an ip_stack_t * argument, and returns an mblk_t *.
*/
int ip_snmp_get(queue_t *, mblk_t *);
static mblk_t *ip_snmp_get_mib2_ip(queue_t *, mblk_t *,
mib2_ipIfStatsEntry_t *, ip_stack_t *);
static mblk_t *ip_snmp_get_mib2_ip_traffic_stats(queue_t *, mblk_t *,
ip_stack_t *);
static mblk_t *ip_snmp_get_mib2_ip6(queue_t *, mblk_t *, ip_stack_t *);
static mblk_t *ip_snmp_get_mib2_icmp(queue_t *, mblk_t *, ip_stack_t *ipst);
static mblk_t *ip_snmp_get_mib2_icmp6(queue_t *, mblk_t *, ip_stack_t *ipst);
static mblk_t *ip_snmp_get_mib2_igmp(queue_t *, mblk_t *, ip_stack_t *ipst);
static mblk_t *ip_snmp_get_mib2_multi(queue_t *, mblk_t *, ip_stack_t *ipst);
static mblk_t *ip_snmp_get_mib2_ip_addr(queue_t *, mblk_t *,
ip_stack_t *ipst);
static mblk_t *ip_snmp_get_mib2_ip6_addr(queue_t *, mblk_t *,
ip_stack_t *ipst);
static mblk_t *ip_snmp_get_mib2_ip_group_src(queue_t *, mblk_t *,
ip_stack_t *ipst);
static mblk_t *ip_snmp_get_mib2_ip6_group_src(queue_t *, mblk_t *,
ip_stack_t *ipst);
static mblk_t *ip_snmp_get_mib2_ip_group_mem(queue_t *, mblk_t *,
ip_stack_t *ipst);
static mblk_t *ip_snmp_get_mib2_ip6_group_mem(queue_t *, mblk_t *,
ip_stack_t *ipst);
static mblk_t *ip_snmp_get_mib2_virt_multi(queue_t *, mblk_t *,
ip_stack_t *ipst);
static mblk_t *ip_snmp_get_mib2_multi_rtable(queue_t *, mblk_t *,
ip_stack_t *ipst);
static mblk_t *ip_snmp_get_mib2_ip_route_media(queue_t *, mblk_t *,
ip_stack_t *ipst);
static mblk_t *ip_snmp_get_mib2_ip6_route_media(queue_t *, mblk_t *,
ip_stack_t *ipst);
/* Helpers that take the iproutedata_t list carrier defined above. */
static void ip_snmp_get2_v4(ire_t *, iproutedata_t *);
static void ip_snmp_get2_v6_route(ire_t *, iproutedata_t *);
static int ip_snmp_get2_v6_media(nce_t *, iproutedata_t *);
int ip_snmp_set(queue_t *, int, int, uchar_t *, int);
/*
* Forward declarations of further file-local routines: source-route
* checks, output/fragmentation helpers, the conn drain-list routines
* (see "IP Flow control notes" in the large comment above), and the
* per-stack init/shutdown/fini routines.
*/
static boolean_t ip_source_routed(ipha_t *, ip_stack_t *);
static boolean_t ip_source_route_included(ipha_t *);
static void ip_trash_ire_reclaim_stack(ip_stack_t *);
static void ip_wput_frag(ire_t *, mblk_t *, ip_pkt_t, uint32_t, uint32_t,
zoneid_t, ip_stack_t *);
static mblk_t *ip_wput_frag_copyhdr(uchar_t *, int, int, ip_stack_t *);
static void ip_wput_local_options(ipha_t *, ip_stack_t *);
static int ip_wput_options(queue_t *, mblk_t *, ipha_t *, boolean_t,
zoneid_t, ip_stack_t *);
/* Drain-list management. */
static void conn_drain_init(ip_stack_t *);
static void conn_drain_fini(ip_stack_t *);
static void conn_drain_tail(conn_t *connp, boolean_t closing);
static void conn_walk_drain(ip_stack_t *);
static void conn_walk_fanout_table(connf_t *, uint_t, pfv_t, void *,
zoneid_t);
/* Per-netstack lifecycle routines, keyed by netstackid_t. */
static void *ip_stack_init(netstackid_t stackid, netstack_t *ns);
static void ip_stack_shutdown(netstackid_t stackid, void *arg);
static void ip_stack_fini(netstackid_t stackid, void *arg);
static boolean_t conn_wantpacket(conn_t *, ill_t *, ipha_t *, int,
zoneid_t);
static void ip_arp_done(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp,
void *dummy_arg);
/*
* Forward declarations of parameter set/get handlers, multirouting/CGTP
* helpers, kstat routines and the TCP input helper.
* NOTE(review): the handlers taking (queue_t *, mblk_t *, [char *,]
* caddr_t, cred_t *) look like named-dispatch get/set callbacks -- confirm
* against where they are registered.
*/
static int ip_forward_set(queue_t *, mblk_t *, char *, caddr_t, cred_t *);
static int ip_multirt_apply_membership(int (*fn)(conn_t *, boolean_t,
ipaddr_t, ipaddr_t, uint_t *, mcast_record_t, ipaddr_t, mblk_t *), ire_t *,
conn_t *, boolean_t, ipaddr_t, mcast_record_t, ipaddr_t, mblk_t *);
static void ip_multirt_bad_mtu(ire_t *, uint32_t);
static int ip_cgtp_filter_get(queue_t *, mblk_t *, caddr_t, cred_t *);
static int ip_cgtp_filter_set(queue_t *, mblk_t *, char *,
caddr_t, cred_t *);
/* Declared extern: these two are defined outside this file. */
extern int ip_squeue_bind_set(queue_t *q, mblk_t *mp, char *value,
caddr_t cp, cred_t *cr);
extern int ip_squeue_profile_set(queue_t *, mblk_t *, char *, caddr_t,
cred_t *);
static int ip_input_proc_set(queue_t *q, mblk_t *mp, char *value,
caddr_t cp, cred_t *cr);
static int ip_int_set(queue_t *, mblk_t *, char *, caddr_t,
cred_t *);
/* Takes an int and returns a squeue_func_t. */
static squeue_func_t ip_squeue_switch(int);
/* kstat creation, teardown and update routines for ip and icmp. */
static void *ip_kstat_init(netstackid_t, ip_stack_t *);
static void ip_kstat_fini(netstackid_t, kstat_t *);
static int ip_kstat_update(kstat_t *kp, int rw);
static void *icmp_kstat_init(netstackid_t);
static void icmp_kstat_fini(netstackid_t, kstat_t *);
static int icmp_kstat_update(kstat_t *kp, int rw);
static void *ip_kstat2_init(netstackid_t, ip_stat_t *);
static void ip_kstat2_fini(netstackid_t, kstat_t *);
static int ip_conn_report(queue_t *, mblk_t *, caddr_t, cred_t *);
static mblk_t *ip_tcp_input(mblk_t *, ipha_t *, ill_t *, boolean_t,
ire_t *, mblk_t *, uint_t, queue_t *, ill_rx_ring_t *);
/* Forward declaration; the definition appears later in this file. */
static void	ip_rput_process_forward(queue_t *, mblk_t *, ire_t *,
    ipha_t *, ill_t *, boolean_t);
/*
* Global IPv4 address constant initialised from IP_HOST_MASK (by its name,
* the all-ones address -- the value of IP_HOST_MASK is defined elsewhere).
*/
ipaddr_t ip_g_all_ones = IP_HOST_MASK;
/* How long, in seconds, we allow frags to hang around. */
#define IP_FRAG_TIMEOUT 60
/*
* Threshold which determines whether MDT should be used when
* generating IP fragments; payload size must be greater than
* this threshold for MDT to take place.
*/
#define IP_WPUT_FRAG_MDT_MIN 32768
/* Settable in /etc/system only; defaults to IP_WPUT_FRAG_MDT_MIN. */
int ip_wput_frag_mdt_min = IP_WPUT_FRAG_MDT_MIN;
/*
* Exposed read/write through the "ip_rput_pullups" entry of lcl_ndp_arr
* (nd_get_long / nd_set_long).
*/
static long ip_rput_pullups;
int dohwcksum = 1; /* use h/w cksum if supported by the hardware */
vmem_t *ip_minor_arena;
int ip_debug;
/* Only present in DEBUG builds. */
#ifdef DEBUG
uint32_t ipsechw_debug = 0;
#endif
/*
* Multirouting/CGTP stuff
*/
cgtp_filter_ops_t *ip_cgtp_filter_ops; /* CGTP hooks */
int ip_cgtp_filter_rev = CGTP_FILTER_REV; /* CGTP hooks version */
boolean_t ip_cgtp_filter; /* Enable/disable CGTP hooks */
/*
* XXX following really should only be in a header. Would need more
* header and .c clean up first.
*/
extern optdb_obj_t ip_opt_obj;
ulong_t ip_squeue_enter_unbound = 0;
/*
* Named Dispatch Parameter Table.
* All of these are alterable, within the min/max values given, at run time.
*
* Each row is { min, max, initial value, name }. The rows are positional
* initializers, so the column order must match the ipparam_t layout.
*
* The final row is only a real parameter in DEBUG builds; non-DEBUG builds
* get an empty-named { 0, 0, 0, "" } row in its place, so the table has the
* same number of rows either way.
*/
static ipparam_t lcl_param_arr[] = {
/* min max value name */
{ 0, 1, 0, "ip_respond_to_address_mask_broadcast"},
{ 0, 1, 1, "ip_respond_to_echo_broadcast"},
{ 0, 1, 1, "ip_respond_to_echo_multicast"},
{ 0, 1, 0, "ip_respond_to_timestamp"},
{ 0, 1, 0, "ip_respond_to_timestamp_broadcast"},
{ 0, 1, 1, "ip_send_redirects"},
{ 0, 1, 0, "ip_forward_directed_broadcasts"},
{ 0, 10, 0, "ip_debug"},
{ 0, 10, 0, "ip_mrtdebug"},
{ 5000, 999999999, 60000, "ip_ire_timer_interval" },
{ 60000, 999999999, 1200000, "ip_ire_arp_interval" },
{ 60000, 999999999, 60000, "ip_ire_redirect_interval" },
{ 1, 255, 255, "ip_def_ttl" },
{ 0, 1, 0, "ip_forward_src_routed"},
{ 0, 256, 32, "ip_wroff_extra" },
{ 5000, 999999999, 600000, "ip_ire_pathmtu_interval" },
{ 8, 65536, 64, "ip_icmp_return_data_bytes" },
{ 0, 1, 1, "ip_path_mtu_discovery" },
{ 0, 240, 30, "ip_ignore_delete_time" },
{ 0, 1, 0, "ip_ignore_redirect" },
{ 0, 1, 1, "ip_output_queue" },
{ 1, 254, 1, "ip_broadcast_ttl" },
{ 0, 99999, 100, "ip_icmp_err_interval" },
{ 1, 99999, 10, "ip_icmp_err_burst" },
{ 0, 999999999, 1000000, "ip_reass_queue_bytes" },
{ 0, 1, 0, "ip_strict_dst_multihoming" },
{ 1, MAX_ADDRS_PER_IF, 256, "ip_addrs_per_if"},
{ 0, 1, 0, "ipsec_override_persocket_policy" },
{ 0, 1, 1, "icmp_accept_clear_messages" },
{ 0, 1, 1, "igmp_accept_clear_messages" },
{ 2, 999999999, ND_DELAY_FIRST_PROBE_TIME,
"ip_ndp_delay_first_probe_time"},
{ 1, 999999999, ND_MAX_UNICAST_SOLICIT,
"ip_ndp_max_unicast_solicit"},
{ 1, 255, IPV6_MAX_HOPS, "ip6_def_hops" },
{ 8, IPV6_MIN_MTU, IPV6_MIN_MTU, "ip6_icmp_return_data_bytes" },
{ 0, 1, 0, "ip6_forward_src_routed"},
{ 0, 1, 1, "ip6_respond_to_echo_multicast"},
{ 0, 1, 1, "ip6_send_redirects"},
{ 0, 1, 0, "ip6_ignore_redirect" },
{ 0, 1, 0, "ip6_strict_dst_multihoming" },
{ 1, 8, 3, "ip_ire_reclaim_fraction" },
{ 0, 999999, 1000, "ipsec_policy_log_interval" },
{ 0, 1, 1, "pim_accept_clear_messages" },
{ 1000, 20000, 2000, "ip_ndp_unsolicit_interval" },
{ 1, 20, 3, "ip_ndp_unsolicit_count" },
{ 0, 1, 1, "ip6_ignore_home_address_opt" },
{ 0, 15, 0, "ip_policy_mask" },
{ 1000, 60000, 1000, "ip_multirt_resolution_interval" },
{ 0, 255, 1, "ip_multirt_ttl" },
{ 0, 1, 1, "ip_multidata_outbound" },
{ 0, 3600000, 300000, "ip_ndp_defense_interval" },
{ 0, 999999, 60*60*24, "ip_max_temp_idle" },
{ 0, 1000, 1, "ip_max_temp_defend" },
{ 0, 1000, 3, "ip_max_defend" },
{ 0, 999999, 30, "ip_defend_interval" },
{ 0, 3600000, 300000, "ip_dup_recovery" },
{ 0, 1, 1, "ip_restrict_interzone_loopback" },
{ 0, 1, 1, "ip_lso_outbound" },
#ifdef DEBUG
{ 0, 1, 0, "ip6_drop_inbound_icmpv6" },
#else
/* Placeholder row so the row count matches DEBUG builds. */
{ 0, 0, 0, "" },
#endif
};
/*
* Extended NDP table
* The addresses for the first two are filled in to be ips_ip_g_forward
* and ips_ipv6_forward at init time.
*
* Each row is { get function, set function, data pointer, name }. A NULL
* set function is used for the report-style entries that supply only a
* getter.
*
* The IPNDP_*_OFFSET macros record the array index of the entry that
* immediately follows them. They are hard-coded, so inserting, removing or
* reordering rows above one of them requires updating the macro to match
* (ip_cgtp_filter is currently the 17th row, i.e. index 16).
*/
static ipndp_t lcl_ndp_arr[] = {
/* getf setf data name */
#define IPNDP_IP_FORWARDING_OFFSET 0
{ ip_param_generic_get, ip_forward_set, NULL,
"ip_forwarding" },
#define IPNDP_IP6_FORWARDING_OFFSET 1
{ ip_param_generic_get, ip_forward_set, NULL,
"ip6_forwarding" },
{ ip_ill_report, NULL, NULL,
"ip_ill_status" },
{ ip_ipif_report, NULL, NULL,
"ip_ipif_status" },
{ ip_ire_report, NULL, NULL,
"ipv4_ire_status" },
{ ip_ire_report_mrtun, NULL, NULL,
"ipv4_mrtun_ire_status" },
{ ip_ire_report_srcif, NULL, NULL,
"ipv4_srcif_ire_status" },
{ ip_ire_report_v6, NULL, NULL,
"ipv6_ire_status" },
{ ip_conn_report, NULL, NULL,
"ip_conn_status" },
/* Backed directly by the file-scope counter ip_rput_pullups. */
{ nd_get_long, nd_set_long, (caddr_t)&ip_rput_pullups,
"ip_rput_pullups" },
{ ndp_report, NULL, NULL,
"ip_ndp_cache_report" },
{ ip_srcid_report, NULL, NULL,
"ip_srcid_status" },
{ ip_param_generic_get, ip_squeue_profile_set,
(caddr_t)&ip_squeue_profile, "ip_squeue_profile" },
{ ip_param_generic_get, ip_squeue_bind_set,
(caddr_t)&ip_squeue_bind, "ip_squeue_bind" },
{ ip_param_generic_get, ip_input_proc_set,
(caddr_t)&ip_squeue_enter, "ip_squeue_enter" },
{ ip_param_generic_get, ip_int_set,
(caddr_t)&ip_squeue_fanout, "ip_squeue_fanout" },
#define IPNDP_CGTP_FILTER_OFFSET 16
{ ip_cgtp_filter_get, ip_cgtp_filter_set, NULL,
"ip_cgtp_filter" },
{ ip_param_generic_get, ip_int_set,
(caddr_t)&ip_soft_rings_cnt, "ip_soft_rings_cnt" },
};
/*
* Table of IP ioctls encoding the various properties of the ioctl and
* indexed based on the last byte of the ioctl command. Occasionally there
* is a clash, and there is more than 1 ioctl with the same last byte.
* In such a case 1 ioctl is encoded in the ndx table and the remaining
* ioctls are encoded in the misc table. An entry in the ndx table is
* retrieved by indexing on the last byte of the ioctl command and comparing
* the ioctl command with the value in the ndx table. In the event of a
* mismatch the misc table is then searched sequentially for the desired
* ioctl command.
*
* Entry: <command> <copyin_size> <flags> <cmd_type> <function> <restart_func>
*
* Because lookup is by position, every slot must be present: unused slots
* are filled with an IPI_DONTCARE row, and the numeric label in front of
* each row is its array index. Do not insert or delete rows without
* keeping those indices (and SIOCLIFADDR_NDX) correct.
*/
ip_ioctl_cmd_t ip_ndx_ioctl_table[] = {
/* 000 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 001 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 002 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 003 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 004 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 005 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 006 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 007 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 008 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 009 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 010 */ { SIOCADDRT, sizeof (struct rtentry), IPI_PRIV,
MISC_CMD, ip_siocaddrt, NULL },
/* 011 */ { SIOCDELRT, sizeof (struct rtentry), IPI_PRIV,
MISC_CMD, ip_siocdelrt, NULL },
/* 012 */ { SIOCSIFADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
IF_CMD, ip_sioctl_addr, ip_sioctl_addr_restart },
/* 013 */ { SIOCGIFADDR, sizeof (struct ifreq), IPI_GET_CMD | IPI_REPL,
IF_CMD, ip_sioctl_get_addr, NULL },
/* 014 */ { SIOCSIFDSTADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
IF_CMD, ip_sioctl_dstaddr, ip_sioctl_dstaddr_restart },
/* 015 */ { SIOCGIFDSTADDR, sizeof (struct ifreq),
IPI_GET_CMD | IPI_REPL,
IF_CMD, ip_sioctl_get_dstaddr, NULL },
/* 016 */ { SIOCSIFFLAGS, sizeof (struct ifreq),
IPI_PRIV | IPI_WR | IPI_REPL,
IF_CMD, ip_sioctl_flags, ip_sioctl_flags_restart },
/* 017 */ { SIOCGIFFLAGS, sizeof (struct ifreq),
IPI_MODOK | IPI_GET_CMD | IPI_REPL,
IF_CMD, ip_sioctl_get_flags, NULL },
/* 018 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 019 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* copyin size cannot be coded for SIOCGIFCONF */
/* 020 */ { O_SIOCGIFCONF, 0, IPI_GET_CMD | IPI_REPL,
MISC_CMD, ip_sioctl_get_ifconf, NULL },
/* 021 */ { SIOCSIFMTU, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
IF_CMD, ip_sioctl_mtu, NULL },
/* 022 */ { SIOCGIFMTU, sizeof (struct ifreq), IPI_GET_CMD | IPI_REPL,
IF_CMD, ip_sioctl_get_mtu, NULL },
/* 023 */ { SIOCGIFBRDADDR, sizeof (struct ifreq),
IPI_GET_CMD | IPI_REPL,
IF_CMD, ip_sioctl_get_brdaddr, NULL },
/* 024 */ { SIOCSIFBRDADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
IF_CMD, ip_sioctl_brdaddr, NULL },
/* 025 */ { SIOCGIFNETMASK, sizeof (struct ifreq),
IPI_GET_CMD | IPI_REPL,
IF_CMD, ip_sioctl_get_netmask, NULL },
/* 026 */ { SIOCSIFNETMASK, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
IF_CMD, ip_sioctl_netmask, ip_sioctl_netmask_restart },
/* 027 */ { SIOCGIFMETRIC, sizeof (struct ifreq),
IPI_GET_CMD | IPI_REPL,
IF_CMD, ip_sioctl_get_metric, NULL },
/* 028 */ { SIOCSIFMETRIC, sizeof (struct ifreq), IPI_PRIV,
IF_CMD, ip_sioctl_metric, NULL },
/* 029 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* See 166-168 below for extended SIOC*XARP ioctls */
/* 030 */ { SIOCSARP, sizeof (struct arpreq), IPI_PRIV,
MISC_CMD, ip_sioctl_arp, NULL },
/* 031 */ { SIOCGARP, sizeof (struct arpreq), IPI_GET_CMD | IPI_REPL,
MISC_CMD, ip_sioctl_arp, NULL },
/* 032 */ { SIOCDARP, sizeof (struct arpreq), IPI_PRIV,
MISC_CMD, ip_sioctl_arp, NULL },
/* 033 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 034 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 035 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 036 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 037 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 038 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 039 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 040 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 041 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 042 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 043 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 044 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 045 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 046 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 047 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 048 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 049 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 050 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 051 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 052 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 053 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 054 */ { IF_UNITSEL, sizeof (int), IPI_PRIV | IPI_WR | IPI_MODOK,
MISC_CMD, if_unitsel, if_unitsel_restart },
/* 055 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 056 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 057 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 058 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 059 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 060 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 061 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 062 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 063 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 064 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 065 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 066 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 067 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 068 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 069 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 070 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 071 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 072 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 073 */ { SIOCSIFNAME, sizeof (struct ifreq),
IPI_PRIV | IPI_WR | IPI_MODOK,
IF_CMD, ip_sioctl_sifname, NULL },
/* 074 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 075 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 076 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 077 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 078 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 079 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 080 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 081 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 082 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 083 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 084 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 085 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 086 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 087 */ { SIOCGIFNUM, sizeof (int), IPI_GET_CMD | IPI_REPL,
MISC_CMD, ip_sioctl_get_ifnum, NULL },
/* 088 */ { SIOCGIFMUXID, sizeof (struct ifreq), IPI_GET_CMD | IPI_REPL,
IF_CMD, ip_sioctl_get_muxid, NULL },
/* 089 */ { SIOCSIFMUXID, sizeof (struct ifreq),
IPI_PRIV | IPI_WR | IPI_REPL,
IF_CMD, ip_sioctl_muxid, NULL },
/* Both if and lif variants share same func */
/* 090 */ { SIOCGIFINDEX, sizeof (struct ifreq), IPI_GET_CMD | IPI_REPL,
IF_CMD, ip_sioctl_get_lifindex, NULL },
/* Both if and lif variants share same func */
/* 091 */ { SIOCSIFINDEX, sizeof (struct ifreq),
IPI_PRIV | IPI_WR | IPI_REPL,
IF_CMD, ip_sioctl_slifindex, NULL },
/* copyin size cannot be coded for SIOCGIFCONF */
/* 092 */ { SIOCGIFCONF, 0, IPI_GET_CMD | IPI_REPL,
MISC_CMD, ip_sioctl_get_ifconf, NULL },
/* 093 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 094 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 095 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 096 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 097 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 098 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 099 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 100 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 101 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 102 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 103 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 104 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 105 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 106 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 107 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 108 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 109 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 110 */ { SIOCLIFREMOVEIF, sizeof (struct lifreq),
IPI_PRIV | IPI_WR | IPI_REPL,
LIF_CMD, ip_sioctl_removeif,
ip_sioctl_removeif_restart },
/* 111 */ { SIOCLIFADDIF, sizeof (struct lifreq),
IPI_GET_CMD | IPI_PRIV | IPI_WR | IPI_REPL,
LIF_CMD, ip_sioctl_addif, NULL },
#define SIOCLIFADDR_NDX 112
/* 112 */ { SIOCSLIFADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
LIF_CMD, ip_sioctl_addr, ip_sioctl_addr_restart },
/* 113 */ { SIOCGLIFADDR, sizeof (struct lifreq),
IPI_GET_CMD | IPI_REPL,
LIF_CMD, ip_sioctl_get_addr, NULL },
/* 114 */ { SIOCSLIFDSTADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
LIF_CMD, ip_sioctl_dstaddr, ip_sioctl_dstaddr_restart },
/* 115 */ { SIOCGLIFDSTADDR, sizeof (struct lifreq),
IPI_GET_CMD | IPI_REPL,
LIF_CMD, ip_sioctl_get_dstaddr, NULL },
/* 116 */ { SIOCSLIFFLAGS, sizeof (struct lifreq),
IPI_PRIV | IPI_WR | IPI_REPL,
LIF_CMD, ip_sioctl_flags, ip_sioctl_flags_restart },
/* 117 */ { SIOCGLIFFLAGS, sizeof (struct lifreq),
IPI_GET_CMD | IPI_MODOK | IPI_REPL,
LIF_CMD, ip_sioctl_get_flags, NULL },
/* 118 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 119 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/*
* NOTE(review): in entry 120 IPI_REPL is OR-ed into the <cmd_type> slot
* rather than the <flags> slot (compare entry 020, O_SIOCGIFCONF) --
* confirm whether this is intended.
*/
/* 120 */ { O_SIOCGLIFCONF, 0, IPI_GET_CMD, MISC_CMD | IPI_REPL,
ip_sioctl_get_lifconf, NULL },
/* 121 */ { SIOCSLIFMTU, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
LIF_CMD, ip_sioctl_mtu, NULL },
/* 122 */ { SIOCGLIFMTU, sizeof (struct lifreq), IPI_GET_CMD | IPI_REPL,
LIF_CMD, ip_sioctl_get_mtu, NULL },
/* 123 */ { SIOCGLIFBRDADDR, sizeof (struct lifreq),
IPI_GET_CMD | IPI_REPL,
LIF_CMD, ip_sioctl_get_brdaddr, NULL },
/* 124 */ { SIOCSLIFBRDADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
LIF_CMD, ip_sioctl_brdaddr, NULL },
/* 125 */ { SIOCGLIFNETMASK, sizeof (struct lifreq),
IPI_GET_CMD | IPI_REPL,
LIF_CMD, ip_sioctl_get_netmask, NULL },
/* 126 */ { SIOCSLIFNETMASK, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
LIF_CMD, ip_sioctl_netmask, ip_sioctl_netmask_restart },
/* 127 */ { SIOCGLIFMETRIC, sizeof (struct lifreq),
IPI_GET_CMD | IPI_REPL,
LIF_CMD, ip_sioctl_get_metric, NULL },
/* 128 */ { SIOCSLIFMETRIC, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
LIF_CMD, ip_sioctl_metric, NULL },
/* 129 */ { SIOCSLIFNAME, sizeof (struct lifreq),
IPI_PRIV | IPI_WR | IPI_MODOK | IPI_REPL,
LIF_CMD, ip_sioctl_slifname,
ip_sioctl_slifname_restart },
/* 130 */ { SIOCGLIFNUM, sizeof (struct lifnum), IPI_GET_CMD | IPI_REPL,
MISC_CMD, ip_sioctl_get_lifnum, NULL },
/* 131 */ { SIOCGLIFMUXID, sizeof (struct lifreq),
IPI_GET_CMD | IPI_REPL,
LIF_CMD, ip_sioctl_get_muxid, NULL },
/* 132 */ { SIOCSLIFMUXID, sizeof (struct lifreq),
IPI_PRIV | IPI_WR | IPI_REPL,
LIF_CMD, ip_sioctl_muxid, NULL },
/* 133 */ { SIOCGLIFINDEX, sizeof (struct lifreq),
IPI_GET_CMD | IPI_REPL,
LIF_CMD, ip_sioctl_get_lifindex, 0 },
/* 134 */ { SIOCSLIFINDEX, sizeof (struct lifreq),
IPI_PRIV | IPI_WR | IPI_REPL,
LIF_CMD, ip_sioctl_slifindex, 0 },
/* 135 */ { SIOCSLIFTOKEN, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
LIF_CMD, ip_sioctl_token, NULL },
/* 136 */ { SIOCGLIFTOKEN, sizeof (struct lifreq),
IPI_GET_CMD | IPI_REPL,
LIF_CMD, ip_sioctl_get_token, NULL },
/* 137 */ { SIOCSLIFSUBNET, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
LIF_CMD, ip_sioctl_subnet, ip_sioctl_subnet_restart },
/* 138 */ { SIOCGLIFSUBNET, sizeof (struct lifreq),
IPI_GET_CMD | IPI_REPL,
LIF_CMD, ip_sioctl_get_subnet, NULL },
/* 139 */ { SIOCSLIFLNKINFO, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
LIF_CMD, ip_sioctl_lnkinfo, NULL },
/* 140 */ { SIOCGLIFLNKINFO, sizeof (struct lifreq),
IPI_GET_CMD | IPI_REPL,
LIF_CMD, ip_sioctl_get_lnkinfo, NULL },
/* 141 */ { SIOCLIFDELND, sizeof (struct lifreq), IPI_PRIV,
LIF_CMD, ip_siocdelndp_v6, NULL },
/* 142 */ { SIOCLIFGETND, sizeof (struct lifreq), IPI_GET_CMD,
LIF_CMD, ip_siocqueryndp_v6, NULL },
/* 143 */ { SIOCLIFSETND, sizeof (struct lifreq), IPI_PRIV,
LIF_CMD, ip_siocsetndp_v6, NULL },
/* 144 */ { SIOCTMYADDR, sizeof (struct sioc_addrreq), IPI_GET_CMD,
MISC_CMD, ip_sioctl_tmyaddr, NULL },
/* 145 */ { SIOCTONLINK, sizeof (struct sioc_addrreq), IPI_GET_CMD,
MISC_CMD, ip_sioctl_tonlink, NULL },
/* 146 */ { SIOCTMYSITE, sizeof (struct sioc_addrreq), 0,
MISC_CMD, ip_sioctl_tmysite, NULL },
/* 147 */ { SIOCGTUNPARAM, sizeof (struct iftun_req), IPI_REPL,
TUN_CMD, ip_sioctl_tunparam, NULL },
/* 148 */ { SIOCSTUNPARAM, sizeof (struct iftun_req),
IPI_PRIV | IPI_WR,
TUN_CMD, ip_sioctl_tunparam, NULL },
/* IPSECioctls handled in ip_sioctl_copyin_setup itself */
/* 149 */ { SIOCFIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL },
/* 150 */ { SIOCSIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL },
/* 151 */ { SIOCDIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL },
/* 152 */ { SIOCLIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL },
/* 153 */ { SIOCLIFFAILOVER, sizeof (struct lifreq),
IPI_PRIV | IPI_WR | IPI_REPL,
LIF_CMD, ip_sioctl_move, ip_sioctl_move },
/* 154 */ { SIOCLIFFAILBACK, sizeof (struct lifreq),
IPI_PRIV | IPI_WR | IPI_REPL,
LIF_CMD, ip_sioctl_move, ip_sioctl_move },
/* 155 */ { SIOCSLIFGROUPNAME, sizeof (struct lifreq),
IPI_PRIV | IPI_WR,
LIF_CMD, ip_sioctl_groupname, ip_sioctl_groupname },
/* 156 */ { SIOCGLIFGROUPNAME, sizeof (struct lifreq),
IPI_GET_CMD | IPI_REPL,
LIF_CMD, ip_sioctl_get_groupname, NULL },
/* 157 */ { SIOCGLIFOINDEX, sizeof (struct lifreq),
IPI_GET_CMD | IPI_REPL,
LIF_CMD, ip_sioctl_get_oindex, NULL },
/* Leave 158-160 unused; used to be SIOC*IFARP ioctls */
/* 158 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 159 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 160 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 161 */ { SIOCSLIFOINDEX, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
LIF_CMD, ip_sioctl_slifoindex, NULL },
/* These are handled in ip_sioctl_copyin_setup itself */
/* 162 */ { SIOCGIP6ADDRPOLICY, 0, IPI_NULL_BCONT,
MISC_CMD, NULL, NULL },
/* 163 */ { SIOCSIP6ADDRPOLICY, 0, IPI_PRIV | IPI_NULL_BCONT,
MISC_CMD, NULL, NULL },
/* 164 */ { SIOCGDSTINFO, 0, IPI_GET_CMD, MISC_CMD, NULL, NULL },
/*
* NOTE(review): as with entry 120, IPI_REPL sits in the <cmd_type> slot of
* entry 165 rather than in <flags> (compare entry 092, SIOCGIFCONF) --
* confirm whether this is intended.
*/
/* 165 */ { SIOCGLIFCONF, 0, IPI_GET_CMD, MISC_CMD | IPI_REPL,
ip_sioctl_get_lifconf, NULL },
/* 166 */ { SIOCSXARP, sizeof (struct xarpreq), IPI_PRIV,
MISC_CMD, ip_sioctl_xarp, NULL },
/* 167 */ { SIOCGXARP, sizeof (struct xarpreq), IPI_GET_CMD | IPI_REPL,
MISC_CMD, ip_sioctl_xarp, NULL },
/* 168 */ { SIOCDXARP, sizeof (struct xarpreq), IPI_PRIV,
MISC_CMD, ip_sioctl_xarp, NULL },
/* SIOCPOPSOCKFS is not handled by IP */
/* 169 */ { IPI_DONTCARE /* SIOCPOPSOCKFS */, 0, 0, 0, NULL, NULL },
/* 170 */ { SIOCGLIFZONE, sizeof (struct lifreq),
IPI_GET_CMD | IPI_REPL,
LIF_CMD, ip_sioctl_get_lifzone, NULL },
/* 171 */ { SIOCSLIFZONE, sizeof (struct lifreq),
IPI_PRIV | IPI_WR | IPI_REPL,
LIF_CMD, ip_sioctl_slifzone,
ip_sioctl_slifzone_restart },
/* 172-174 are SCTP ioctls and not handled by IP */
/* 172 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 173 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 174 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* 175 */ { SIOCGLIFUSESRC, sizeof (struct lifreq),
IPI_GET_CMD, LIF_CMD,
ip_sioctl_get_lifusesrc, 0 },
/* 176 */ { SIOCSLIFUSESRC, sizeof (struct lifreq),
IPI_PRIV | IPI_WR,
LIF_CMD, ip_sioctl_slifusesrc,
NULL },
/* 177 */ { SIOCGLIFSRCOF, 0, IPI_GET_CMD, MISC_CMD,
ip_sioctl_get_lifsrcof, NULL },
/* 178 */ { SIOCGMSFILTER, sizeof (struct group_filter), IPI_GET_CMD,
MISC_CMD, ip_sioctl_msfilter, NULL },
/* 179 */ { SIOCSMSFILTER, sizeof (struct group_filter), IPI_WR,
MISC_CMD, ip_sioctl_msfilter, NULL },
/* 180 */ { SIOCGIPMSFILTER, sizeof (struct ip_msfilter), IPI_GET_CMD,
MISC_CMD, ip_sioctl_msfilter, NULL },
/* 181 */ { SIOCSIPMSFILTER, sizeof (struct ip_msfilter), IPI_WR,
MISC_CMD, ip_sioctl_msfilter, NULL },
/* 182 */ { SIOCSIPMPFAILBACK, sizeof (int), IPI_PRIV, MISC_CMD,
ip_sioctl_set_ipmpfailback, NULL }
};
/* Number of rows in ip_ndx_ioctl_table, computed from the array itself. */
int ip_ndx_ioctl_count = sizeof (ip_ndx_ioctl_table) / sizeof (ip_ioctl_cmd_t);
/*
 * Overflow ioctl table: holds the ioctls that cannot live in
 * ip_ndx_ioctl_table. Rows use the same six-field layout as that table:
 *
 *	<command> <copyin_size> <flags> <cmd_type> <function> <restart_func>
 *
 * Every row spells out all six fields, including a NULL restart function,
 * so that the rows read the same way as those in ip_ndx_ioctl_table.
 */
ip_ioctl_cmd_t ip_misc_ioctl_table[] = {
	{ OSIOCGTUNPARAM, sizeof (struct old_iftun_req),
	    IPI_GET_CMD | IPI_REPL, TUN_CMD, ip_sioctl_tunparam, NULL },
	{ OSIOCSTUNPARAM, sizeof (struct old_iftun_req), IPI_PRIV | IPI_WR,
	    TUN_CMD, ip_sioctl_tunparam, NULL },
	{ I_LINK, 0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL },
	{ I_UNLINK, 0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL },
	{ I_PLINK, 0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL },
	{ I_PUNLINK, 0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL },
	{ ND_GET, 0, IPI_PASS_DOWN, 0, NULL, NULL },
	{ ND_SET, 0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL },
	{ IP_IOCTL, 0, 0, 0, NULL, NULL },
	{ SIOCGETVIFCNT, sizeof (struct sioc_vif_req), IPI_REPL | IPI_GET_CMD,
	    MISC_CMD, mrt_ioctl, NULL },
	{ SIOCGETSGCNT, sizeof (struct sioc_sg_req), IPI_REPL | IPI_GET_CMD,
	    MISC_CMD, mrt_ioctl, NULL },
	{ SIOCGETLSGCNT, sizeof (struct sioc_lsg_req), IPI_REPL | IPI_GET_CMD,
	    MISC_CMD, mrt_ioctl, NULL }
};

/* Number of rows in ip_misc_ioctl_table, computed from the array itself. */
int ip_misc_ioctl_count =
    sizeof (ip_misc_ioctl_table) / sizeof (ip_ioctl_cmd_t);
int conn_drain_nthreads; /* Number of drainers reqd. */
/* Settable in /etc/system */
/*
* Defined in ip_ire.c. These are only declared here so this file can refer
* to them; their initial values live with the definitions.
*/
extern uint32_t ip_ire_max_bucket_cnt, ip6_ire_max_bucket_cnt;
extern uint32_t ip_ire_min_bucket_cnt, ip6_ire_min_bucket_cnt;
extern uint32_t ip_ire_mem_ratio, ip_ire_cpu_ratio;
/*
* Table pairing each IRE type constant with a short printable name.
* The list is terminated by an all-zero entry.
*/
static nv_t ire_nv_arr[] = {
{ IRE_BROADCAST, "BROADCAST" },
{ IRE_LOCAL, "LOCAL" },
{ IRE_LOOPBACK, "LOOPBACK" },
{ IRE_CACHE, "CACHE" },
{ IRE_DEFAULT, "DEFAULT" },
{ IRE_PREFIX, "PREFIX" },
{ IRE_IF_NORESOLVER, "IF_NORESOL" },
{ IRE_IF_RESOLVER, "IF_RESOLV" },
{ IRE_HOST, "HOST" },
{ 0 }
};
/* Non-static handle on the (static) table above. */
nv_t *ire_nv_tbl = ire_nv_arr;
/* Defined in ip_netinfo.c */
extern ddi_taskq_t *eventq_queue_nic;
/*
* Simple ICMP IP Header Template
* Only the first and seventh positional fields are non-zero:
* IP_SIMPLE_HDR_VERSION and IPPROTO_ICMP (presumably the version/header-length
* and protocol fields of ipha_t -- confirm against the ipha_t definition).
*/
static ipha_t icmp_ipha = {
IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP
};
/*
* STREAMS module_info shared by all four qinit structures below. Per
* module_info(9S) the positional fields are: module id, module name,
* minimum packet size, maximum packet size, high-water mark, low-water mark.
*/
struct module_info ip_mod_info = {
IP_MOD_ID, IP_MOD_NAME, 1, INFPSZ, 65536, 1024
};
/*
* Duplicate static symbols within a module confuses mdb; so we avoid the
* problem by making the symbols here distinct from those in udp.c.
*
* Per qinit(9S) the positional fields used below are: put procedure,
* service procedure, open, close, admin (unused, NULL) and module_info.
* Only the write side (ipwinit) supplies a service procedure, ip_wsrv.
*/
static struct qinit iprinit = {
(pfi_t)ip_rput, NULL, ip_open, ip_close, NULL,
&ip_mod_info
};
static struct qinit ipwinit = {
(pfi_t)ip_wput, (pfi_t)ip_wsrv, ip_open, ip_close, NULL,
&ip_mod_info
};
static struct qinit iplrinit = {
(pfi_t)ip_lrput, NULL, ip_open, ip_close, NULL,
&ip_mod_info
};
static struct qinit iplwinit = {
(pfi_t)ip_lwput, NULL, ip_open, ip_close, NULL,
&ip_mod_info
};
/*
* Per streamtab(9S) the four slots are, in order: read init, write init,
* lower (mux) read init, lower (mux) write init.
*/
struct streamtab ipinfo = {
&iprinit, &ipwinit, &iplrinit, &iplwinit
};
/* DEBUG-only flag; starts out B_FALSE. */
#ifdef DEBUG
static boolean_t skip_sctp_cksum = B_FALSE;
#endif
/*
 * Attach a zoneid to a message by means of a leading ipsec_out_t M_CTL, for
 * later use by functions like ip_rput_v6(), ip_output(), etc.
 *
 * If mp already begins with an M_CTL, that block is treated as an
 * ipsec_out_t, its zoneid is overwritten in place and mp is returned.
 * Otherwise a new ipsec_out_t is allocated, marked as not secure, stamped
 * with the zoneid and linked in front of mp; the new head is returned.
 *
 * Returns NULL if the allocation fails; mp is not modified in that case.
 * zoneid must not be ALL_ZONES.
 */
mblk_t *
ip_prepend_zoneid(mblk_t *mp, zoneid_t zoneid, ip_stack_t *ipst)
{
	ipsec_out_t	*out_info;
	mblk_t		*ctl_mp;

	ASSERT(zoneid != ALL_ZONES);

	if (mp->b_datap->db_type != M_CTL) {
		/* No control block in front yet: allocate one. */
		ctl_mp = ipsec_alloc_ipsec_out(ipst->ips_netstack);
		if (ctl_mp == NULL)
			return (NULL);

		out_info = (ipsec_out_t *)ctl_mp->b_rptr;
		/* This is not a secure packet */
		out_info->ipsec_out_secure = B_FALSE;
		out_info->ipsec_out_zoneid = zoneid;
		ctl_mp->b_cont = mp;
		return (ctl_mp);
	}

	/* An M_CTL is already present: just record the zoneid in it. */
	out_info = (ipsec_out_t *)mp->b_rptr;
	ASSERT(out_info->ipsec_out_type == IPSEC_OUT);
	out_info->ipsec_out_zoneid = zoneid;
	return (mp);
}
/*
 * Copy a message that may be headed by an IPsec M_CTL.
 *
 * Anything that is not an M_CTL of type IPSEC_OUT or IPSEC_IN is handed
 * straight to copymsg(). (M_CTL is also used for delivering ICMP error
 * messages upstream to transport layers, which is why the info type has to
 * be checked as well as the block type.)
 *
 * For IPSEC_OUT / IPSEC_IN messages only the continuation (mp->b_cont) is
 * copied with copymsg(); the copy is then handed, together with the original
 * M_CTL and its netstack, to ipsec_out_tag() or ipsec_in_tag(), whose result
 * is returned.
 */
mblk_t *
ip_copymsg(mblk_t *mp)
{
	ipsec_info_t	*info;
	mblk_t		*payload;

	if (mp->b_datap->db_type == M_CTL) {
		info = (ipsec_info_t *)mp->b_rptr;

		if (info->ipsec_info_type == IPSEC_OUT) {
			payload = copymsg(mp->b_cont);
			return (ipsec_out_tag(mp, payload,
			    ((ipsec_out_t *)info)->ipsec_out_ns));
		}

		if (info->ipsec_info_type == IPSEC_IN) {
			payload = copymsg(mp->b_cont);
			return (ipsec_in_tag(mp, payload,
			    ((ipsec_in_t *)info)->ipsec_in_ns));
		}
	}

	/* Plain data, or an M_CTL that is not IPsec bookkeeping. */
	return (copymsg(mp));
}
/*
 * Generate an ICMP "destination unreachable / fragmentation needed" error
 * for the packet in mp.
 *
 * mtu is truncated to 16 bits and placed in the header in network byte
 * order. The message may carry a leading M_CTL; EXTRACT_PKT_MP() separates
 * it from the data. If icmp_pkt_err_ok() declines the packet (returns NULL),
 * only the leading M_CTL, if any, is freed here and nothing is sent.
 * Otherwise the icmpOutFragNeeded and icmpOutDestUnreachs counters are bumped
 * and the error is handed to icmp_pkt().
 */
static void
icmp_frag_needed(queue_t *q, mblk_t *mp, int mtu, zoneid_t zoneid,
    ip_stack_t *ipst)
{
	mblk_t		*first_mp;
	boolean_t	mctl_present;
	icmph_t		err_hdr;

	EXTRACT_PKT_MP(mp, first_mp, mctl_present);

	mp = icmp_pkt_err_ok(mp, ipst);
	if (mp == NULL) {
		if (mctl_present)
			freeb(first_mp);
		return;
	}

	bzero(&err_hdr, sizeof (icmph_t));
	err_hdr.icmph_type = ICMP_DEST_UNREACHABLE;
	err_hdr.icmph_code = ICMP_FRAGMENTATION_NEEDED;
	err_hdr.icmph_du_mtu = htons((uint16_t)mtu);

	BUMP_MIB(&ipst->ips_icmp_mib, icmpOutFragNeeded);
	BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDestUnreachs);

	icmp_pkt(q, first_mp, &err_hdr, sizeof (icmph_t), mctl_present,
	    zoneid, ipst);
}
/*
* icmp_inbound deals with ICMP messages in the following ways.
*
* 1) It needs to send a reply back and possibly delivering it
* to the "interested" upper clients.
* 2) It needs to send it to the upper clients only.
* 3) It needs to change some values in IP only.
* 4) It needs to change some values in IP and upper layers e.g TCP.
*
* We need to accommodate icmp messages coming in clear until we get
* everything secure from the wire. If icmp_accept_clear_messages
* is zero we check with the global policy and act accordingly. If
* it is non-zero, we accept the message without any checks. But
* *this does not mean* that this will be delivered to the upper
* clients. By accepting we might send replies back, change our MTU
* value etc. but delivery to the ULP/clients depends on their policy
* dispositions.
*
* We handle the above 4 cases in the context of IPSEC in the
* following way :
*
* 1) Send the reply back in the same way as the request came in.
* If it came in encrypted, it goes out encrypted. If it came in
* clear, it goes out in clear. Thus, this will prevent chosen
* plain text attack.
* 2) The client may or may not expect things to come in secure.
* If it comes in secure, the policy constraints are checked
* before delivering it to the upper layers. If it comes in
* clear, ipsec_inbound_accept_clear will decide whether to
* accept this in clear or not. In both the cases, if the returned
* message (IP header + 8 bytes) that caused the icmp message has
* AH/ESP headers, it is sent up to AH/ESP for validation before
* sending up. If there are only 8 bytes of returned message, then
* upper client will not be notified.
* 3) Check with global policy to see whether it matches the constraints.
* But this will be done only if icmp_accept_clear_messages is
* zero.
* 4) If we need to change both in IP and ULP, then the decision taken
* while affecting the values in IP and while delivering up to TCP
* should be the same.
*
* There are two cases.
*
* a) If we reject data at the IP layer (ipsec_check_global_policy()
* failed), we will not deliver it to the ULP, even though they
* are *willing* to accept in *clear*. This is fine as our global
* disposition to icmp messages asks us reject the datagram.
*
* b) If we accept data at the IP layer (ipsec_check_global_policy()
* succeeded or icmp_accept_clear_messages is 1), and not able
* to deliver it to ULP (policy failed), it can lead to
* consistency problems. The cases known at this time are
* ICMP_DESTINATION_UNREACHABLE messages with following code
* values :
*
* - ICMP_FRAGMENTATION_NEEDED : IP adapts to the new value
* and Upper layer rejects. Then the communication will
* come to a stop. This is solved by making similar decisions
* at both levels. Currently, when we are unable to deliver
* to the Upper Layer (due to policy failures) while IP has
* adjusted ire_max_frag, the next outbound datagram would
* generate a local ICMP_FRAGMENTATION_NEEDED message - which
* will be with the right level of protection. Thus the right
* value will be communicated even if we are not able to
* communicate when we get from the wire initially. But this
* assumes there would be at least one outbound datagram after
* IP has adjusted its ire_max_frag value. To make things
* simpler, we accept in clear after the validation of
* AH/ESP headers.
*
* - Other ICMP ERRORS : We may not be able to deliver it to the
* upper layer depending on the level of protection the upper
* layer expects and the disposition in ipsec_inbound_accept_clear().
* ipsec_inbound_accept_clear() decides whether a given ICMP error
* should be accepted in clear when the Upper layer expects secure.
* Thus the communication may get aborted by some bad ICMP
* packets.
*
* IPQoS Notes:
* The only instance when a packet is sent for processing is when there
* isn't an ICMP client and if we are interested in it.
* If there is a client, IPPF processing will take place in the
* ip_fanout_proto routine.
*
* Zones notes:
* The packet is only processed in the context of the specified zone: typically
* only this zone will reply to an echo request, and only interested clients in
* this zone will receive a copy of the packet. This means that the caller must
* call icmp_inbound() for each relevant zone.
*/
static void
icmp_inbound(queue_t *q, mblk_t *mp, boolean_t broadcast, ill_t *ill,
    int sum_valid, uint32_t sum, boolean_t mctl_present, boolean_t ip_policy,
    ill_t *recv_ill, zoneid_t zoneid)
{
	icmph_t		*icmph;
	ipha_t		*ipha;
	int		iph_hdr_length;
	int		hdr_length;
	boolean_t	interested;
	uint32_t	ts;
	uchar_t		*wptr;
	ipif_t		*ipif;
	mblk_t		*first_mp;
	ipsec_in_t	*ii;
	ire_t		*src_ire;
	boolean_t	onlink;
	timestruc_t	now;
	uint32_t	ill_index;
	ip_stack_t	*ipst;

	/*
	 * Parameter notes:
	 *   mp / mctl_present - when mctl_present is set, mp is a leading
	 *	M_CTL (IPSEC_IN) block and the IP datagram is mp->b_cont;
	 *	otherwise mp is the datagram itself. first_mp always refers
	 *	to the head of the chain and is what gets freed on drop.
	 *   broadcast	- the packet arrived as an IP broadcast.
	 *   sum_valid / sum - if sum_valid is non-zero, sum is used as the
	 *	already computed ICMP checksum result; otherwise the checksum
	 *	is computed here with IP_CSUM().
	 *   ip_policy	- gates IPQoS (IPP_LOCAL_IN) processing.
	 *   recv_ill	- recorded as ipsec_in_rill_index and handed to the
	 *	fanout routines.
	 *   zoneid	- zone in whose context the packet is processed.
	 */
	ASSERT(ill != NULL);
	ipst = ill->ill_ipst;

	first_mp = mp;
	if (mctl_present) {
		mp = first_mp->b_cont;
		ASSERT(mp != NULL);
	}

	ipha = (ipha_t *)mp->b_rptr;
	if (ipst->ips_icmp_accept_clear_messages == 0) {
		first_mp = ipsec_check_global_policy(first_mp, NULL,
		    ipha, NULL, mctl_present, ipst->ips_netstack);
		if (first_mp == NULL)
			return;
	}

	/*
	 * On a labeled system, we have to check whether the zone itself is
	 * permitted to receive raw traffic.
	 */
	if (is_system_labeled()) {
		if (zoneid == ALL_ZONES)
			zoneid = tsol_packet_to_zoneid(mp);
		if (!tsol_can_accept_raw(mp, B_FALSE)) {
			ip1dbg(("icmp_inbound: zone %d can't receive raw",
			    zoneid));
			BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors);
			freemsg(first_mp);
			return;
		}
	}

	/*
	 * We have accepted the ICMP message. It means that we will
	 * respond to the packet if needed. It may not be delivered
	 * to the upper client depending on the policy constraints
	 * and the disposition in ipsec_inbound_accept_clear.
	 */
	ASSERT(ill != NULL);

	BUMP_MIB(&ipst->ips_icmp_mib, icmpInMsgs);
	iph_hdr_length = IPH_HDR_LENGTH(ipha);
	if ((mp->b_wptr - mp->b_rptr) < (iph_hdr_length + ICMPH_SIZE)) {
		/* Last chance to get real. */
		if (!pullupmsg(mp, iph_hdr_length + ICMPH_SIZE)) {
			BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors);
			freemsg(first_mp);
			return;
		}
		/* Refresh iph following the pullup. */
		ipha = (ipha_t *)mp->b_rptr;
	}
	/* ICMP header checksum, including checksum field, should be zero. */
	if (sum_valid ? (sum != 0 && sum != 0xFFFF) :
	    IP_CSUM(mp, iph_hdr_length, 0)) {
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInCksumErrs);
		freemsg(first_mp);
		return;
	}
	/* The IP header will always be a multiple of four bytes */
	icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
	ip2dbg(("icmp_inbound: type %d code %d\n", icmph->icmph_type,
	    icmph->icmph_code));
	wptr = (uchar_t *)icmph + ICMPH_SIZE;
	/* We will set "interested" to "true" if we want a copy */
	interested = B_FALSE;
	switch (icmph->icmph_type) {
	case ICMP_ECHO_REPLY:
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInEchoReps);
		break;
	case ICMP_DEST_UNREACHABLE:
		if (icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED)
			BUMP_MIB(&ipst->ips_icmp_mib, icmpInFragNeeded);
		interested = B_TRUE;	/* Pass up to transport */
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInDestUnreachs);
		break;
	case ICMP_SOURCE_QUENCH:
		interested = B_TRUE;	/* Pass up to transport */
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInSrcQuenchs);
		break;
	case ICMP_REDIRECT:
		if (!ipst->ips_ip_ignore_redirect)
			interested = B_TRUE;
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInRedirects);
		break;
	case ICMP_ECHO_REQUEST:
		/*
		 * Whether to respond to echo requests that come in as IP
		 * broadcasts or as IP multicast is subject to debate
		 * (what isn't?). We aim to please, you pick it.
		 * Default is do it.
		 */
		if (!broadcast && !CLASSD(ipha->ipha_dst)) {
			/* unicast: always respond */
			interested = B_TRUE;
		} else if (CLASSD(ipha->ipha_dst)) {
			/* multicast: respond based on tunable */
			interested = ipst->ips_ip_g_resp_to_echo_mcast;
		} else if (broadcast) {
			/* broadcast: respond based on tunable */
			interested = ipst->ips_ip_g_resp_to_echo_bcast;
		}
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInEchos);
		break;
	case ICMP_ROUTER_ADVERTISEMENT:
	case ICMP_ROUTER_SOLICITATION:
		break;
	case ICMP_TIME_EXCEEDED:
		interested = B_TRUE;	/* Pass up to transport */
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInTimeExcds);
		break;
	case ICMP_PARAM_PROBLEM:
		interested = B_TRUE;	/* Pass up to transport */
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInParmProbs);
		break;
	case ICMP_TIME_STAMP_REQUEST:
		/* Response to Time Stamp Requests is local policy. */
		if (ipst->ips_ip_g_resp_to_timestamp &&
		    /* So is whether to respond if it was an IP broadcast. */
		    (!broadcast || ipst->ips_ip_g_resp_to_timestamp_bcast)) {
			int tstamp_len = 3 * sizeof (uint32_t);

			if (wptr + tstamp_len > mp->b_wptr) {
				if (!pullupmsg(mp, wptr + tstamp_len -
				    mp->b_rptr)) {
					BUMP_MIB(ill->ill_ip_mib,
					    ipIfStatsInDiscards);
					freemsg(first_mp);
					return;
				}
				/* Refresh ipha following the pullup. */
				ipha = (ipha_t *)mp->b_rptr;
				icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
				wptr = (uchar_t *)icmph + ICMPH_SIZE;
			}
			interested = B_TRUE;
		}
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInTimestamps);
		break;
	case ICMP_TIME_STAMP_REPLY:
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInTimestampReps);
		break;
	case ICMP_INFO_REQUEST:
		/* Per RFC 1122 3.2.2.7, ignore this. */
	case ICMP_INFO_REPLY:
		break;
	case ICMP_ADDRESS_MASK_REQUEST:
		if ((ipst->ips_ip_respond_to_address_mask_broadcast ||
		    !broadcast) &&
		    /* TODO m_pullup of complete header? */
		    (mp->b_datap->db_lim - wptr) >= IP_ADDR_LEN)
			interested = B_TRUE;
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInAddrMasks);
		break;
	case ICMP_ADDRESS_MASK_REPLY:
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInAddrMaskReps);
		break;
	default:
		interested = B_TRUE;	/* Pass up to transport */
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInUnknowns);
		break;
	}
	/* See if there is an ICMP client. */
	if (ipst->ips_ipcl_proto_fanout[IPPROTO_ICMP].connf_head != NULL) {
		/* If there is an ICMP client and we want one too, copy it. */
		mblk_t *first_mp1;

		if (!interested) {
			ip_fanout_proto(q, first_mp, ill, ipha, 0, mctl_present,
			    ip_policy, recv_ill, zoneid);
			return;
		}
		first_mp1 = ip_copymsg(first_mp);
		if (first_mp1 != NULL) {
			ip_fanout_proto(q, first_mp1, ill, ipha,
			    0, mctl_present, ip_policy, recv_ill, zoneid);
		}
	} else if (!interested) {
		freemsg(first_mp);
		return;
	} else {
		/*
		 * Initiate policy processing for this packet if ip_policy
		 * is true.
		 */
		if (IPP_ENABLED(IPP_LOCAL_IN, ipst) && ip_policy) {
			ill_index = ill->ill_phyint->phyint_ifindex;
			ip_process(IPP_LOCAL_IN, &mp, ill_index);
			if (mp == NULL) {
				/* Data was consumed; only the M_CTL is left. */
				if (mctl_present) {
					freeb(first_mp);
				}
				BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors);
				return;
			}
		}
	}
	/* We want to do something with it. */
	/* Check db_ref to make sure we can modify the packet. */
	if (mp->b_datap->db_ref > 1) {
		mblk_t *first_mp1;

		first_mp1 = ip_copymsg(first_mp);
		freemsg(first_mp);
		if (!first_mp1) {
			BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
			return;
		}
		first_mp = first_mp1;
		if (mctl_present) {
			mp = first_mp->b_cont;
			ASSERT(mp != NULL);
		} else {
			mp = first_mp;
		}
		/* Re-derive every pointer into the (new) data block. */
		ipha = (ipha_t *)mp->b_rptr;
		icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
		wptr = (uchar_t *)icmph + ICMPH_SIZE;
	}
	switch (icmph->icmph_type) {
	case ICMP_ADDRESS_MASK_REQUEST:
		ipif = ipif_lookup_remote(ill, ipha->ipha_src, zoneid);
		if (ipif == NULL) {
			freemsg(first_mp);
			return;
		}
		/*
		 * outgoing interface must be IPv4
		 */
		ASSERT(ipif != NULL && !ipif->ipif_isv6);
		icmph->icmph_type = ICMP_ADDRESS_MASK_REPLY;
		bcopy(&ipif->ipif_net_mask, wptr, IP_ADDR_LEN);
		ipif_refrele(ipif);
		BUMP_MIB(&ipst->ips_icmp_mib, icmpOutAddrMaskReps);
		break;
	case ICMP_ECHO_REQUEST:
		icmph->icmph_type = ICMP_ECHO_REPLY;
		BUMP_MIB(&ipst->ips_icmp_mib, icmpOutEchoReps);
		break;
	case ICMP_TIME_STAMP_REQUEST: {
		uint32_t *tsp;

		icmph->icmph_type = ICMP_TIME_STAMP_REPLY;
		tsp = (uint32_t *)wptr;
		tsp++;		/* Skip past 'originate time' */
		/* Compute # of milliseconds since midnight */
		gethrestime(&now);
		ts = (now.tv_sec % (24 * 60 * 60)) * 1000 +
		    now.tv_nsec / (NANOSEC / MILLISEC);
		*tsp++ = htonl(ts);	/* Lay in 'receive time' */
		*tsp++ = htonl(ts);	/* Lay in 'send time' */
		BUMP_MIB(&ipst->ips_icmp_mib, icmpOutTimestampReps);
		break;
	}
	default:
		/*
		 * ICMP error (or unknown type): from here on ipha refers to
		 * the IP header quoted inside the ICMP payload, not to the
		 * header of the ICMP packet itself.
		 */
		ipha = (ipha_t *)&icmph[1];
		if ((uchar_t *)&ipha[1] > mp->b_wptr) {
			if (!pullupmsg(mp, (uchar_t *)&ipha[1] - mp->b_rptr)) {
				BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
				freemsg(first_mp);
				return;
			}
			icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
			ipha = (ipha_t *)&icmph[1];
		}
		if ((IPH_HDR_VERSION(ipha) != IPV4_VERSION)) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
			freemsg(first_mp);
			return;
		}
		hdr_length = IPH_HDR_LENGTH(ipha);
		if (hdr_length < sizeof (ipha_t)) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
			freemsg(first_mp);
			return;
		}
		if ((uchar_t *)ipha + hdr_length > mp->b_wptr) {
			if (!pullupmsg(mp,
			    (uchar_t *)ipha + hdr_length - mp->b_rptr)) {
				BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
				freemsg(first_mp);
				return;
			}
			icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
			ipha = (ipha_t *)&icmph[1];
		}
		switch (icmph->icmph_type) {
		case ICMP_REDIRECT:
			/*
			 * As there is no upper client to deliver, we don't
			 * need the first_mp any more.
			 */
			if (mctl_present) {
				freeb(first_mp);
			}
			icmp_redirect(ill, mp);
			return;
		case ICMP_DEST_UNREACHABLE:
			if (icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED) {
				if (!icmp_inbound_too_big(icmph, ipha, ill,
				    zoneid, mp, iph_hdr_length, ipst)) {
					freemsg(first_mp);
					return;
				}
				/*
				 * icmp_inbound_too_big() may alter mp.
				 * Resynch ipha and icmph accordingly.
				 */
				icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
				ipha = (ipha_t *)&icmph[1];
			}
			/* FALLTHRU */
		default :
			/*
			 * IPQoS notes: Since we have already done IPQoS
			 * processing we don't want to do it again in
			 * the fanout routines called by
			 * icmp_inbound_error_fanout, hence the last
			 * argument, ip_policy, is B_FALSE.
			 */
			icmp_inbound_error_fanout(q, ill, first_mp, icmph,
			    ipha, iph_hdr_length, hdr_length, mctl_present,
			    B_FALSE, recv_ill, zoneid);
		}
		return;
	}
	/* Send out an ICMP packet */
	icmph->icmph_checksum = 0;
	icmph->icmph_checksum = IP_CSUM(mp, iph_hdr_length, 0);
	if (icmph->icmph_checksum == 0)
		icmph->icmph_checksum = 0xFFFF;
	if (broadcast || CLASSD(ipha->ipha_dst)) {
		ipif_t	*ipif_chosen;
		/*
		 * Make it look like it was directed to us, so we don't look
		 * like a fool with a broadcast or multicast source address.
		 */
		ipif = ipif_lookup_remote(ill, ipha->ipha_src, zoneid);
		/*
		 * Make sure that we haven't grabbed an interface that's DOWN.
		 */
		if (ipif != NULL) {
			ipif_chosen = ipif_select_source(ipif->ipif_ill,
			    ipha->ipha_src, zoneid);
			if (ipif_chosen != NULL) {
				ipif_refrele(ipif);
				ipif = ipif_chosen;
			}
		}
		if (ipif == NULL) {
			ip0dbg(("icmp_inbound: "
			    "No source for broadcast/multicast:\n"
			    "\tsrc 0x%x dst 0x%x ill %p "
			    "ipif_lcl_addr 0x%x\n",
			    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst),
			    (void *)ill,
			    ill->ill_ipif->ipif_lcl_addr));
			freemsg(first_mp);
			return;
		}
		ASSERT(ipif != NULL && !ipif->ipif_isv6);
		ipha->ipha_dst = ipif->ipif_src_addr;
		ipif_refrele(ipif);
	}
	/* Reset time to live. */
	ipha->ipha_ttl = ipst->ips_ip_def_ttl;
	{
		/* Swap source and destination addresses */
		ipaddr_t tmp;

		tmp = ipha->ipha_src;
		ipha->ipha_src = ipha->ipha_dst;
		ipha->ipha_dst = tmp;
	}
	ipha->ipha_ident = 0;
	if (!IS_SIMPLE_IPH(ipha))
		icmp_options_update(ipha);

	/*
	 * ICMP echo replies should go out on the same interface
	 * the request came on as probes used by in.mpathd for detecting
	 * NIC failures are ECHO packets. We turn-off load spreading
	 * by setting ipsec_in_attach_if to B_TRUE, which is copied
	 * to ipsec_out_attach_if by ipsec_in_to_out called later in this
	 * function. This is in turn handled by ip_wput and ip_newroute
	 * to make sure that the packet goes out on the interface it came
	 * in on. If we don't turnoff load spreading, the packets might get
	 * dropped if there are no non-FAILED/INACTIVE interfaces for it
	 * to go out and in.mpathd would wrongly detect a failure or
	 * mis-detect a NIC failure for link failure. As load spreading
	 * can happen only if ill_group is not NULL, we do only for
	 * that case and this does not affect the normal case.
	 *
	 * We turn off load spreading only on echo packets that came from
	 * on-link hosts. If the interface route has been deleted, this will
	 * not be enforced as we can't do much. For off-link hosts, as the
	 * default routes in IPv4 does not typically have an ire_ipif
	 * pointer, we can't force MATCH_IRE_ILL in ip_wput/ip_newroute.
	 * Moreover, expecting a default route through this interface may
	 * not be correct. We use ipha_dst because of the swap above.
	 */
	onlink = B_FALSE;
	if (icmph->icmph_type == ICMP_ECHO_REPLY && ill->ill_group != NULL) {
		/*
		 * First, we need to make sure that it is not one of our
		 * local addresses. If we set onlink when it is one of
		 * our local addresses, we will end up creating IRE_CACHES
		 * for one of our local addresses. Then, we will never
		 * accept packets for them afterwards.
		 */
		src_ire = ire_ctable_lookup(ipha->ipha_dst, 0, IRE_LOCAL,
		    NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
		if (src_ire == NULL) {
			ipif = ipif_get_next_ipif(NULL, ill);
			if (ipif == NULL) {
				BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
				/*
				 * Free from the head of the chain: when
				 * mctl_present is set, first_mp is the
				 * IPSEC_IN M_CTL that still links to mp, and
				 * freeing only mp would leak it. Without an
				 * M_CTL, first_mp and mp are the same block.
				 */
				freemsg(first_mp);
				return;
			}
			src_ire = ire_ftable_lookup(ipha->ipha_dst, 0, 0,
			    IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0,
			    NULL, MATCH_IRE_ILL | MATCH_IRE_TYPE, ipst);
			ipif_refrele(ipif);
			if (src_ire != NULL) {
				onlink = B_TRUE;
				ire_refrele(src_ire);
			}
		} else {
			ire_refrele(src_ire);
		}
	}
	if (!mctl_present) {
		/*
		 * This packet should go out the same way as it
		 * came in i.e in clear. To make sure that global
		 * policy will not be applied to this in ip_wput_ire,
		 * we attach a IPSEC_IN mp and clear ipsec_in_secure.
		 */
		ASSERT(first_mp == mp);
		first_mp = ipsec_in_alloc(B_TRUE, ipst->ips_netstack);
		if (first_mp == NULL) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
			/* first_mp was just overwritten; mp is the packet. */
			freemsg(mp);
			return;
		}
		ii = (ipsec_in_t *)first_mp->b_rptr;

		/* This is not a secure packet */
		ii->ipsec_in_secure = B_FALSE;
		if (onlink) {
			ii->ipsec_in_attach_if = B_TRUE;
			ii->ipsec_in_ill_index =
			    ill->ill_phyint->phyint_ifindex;
			ii->ipsec_in_rill_index =
			    recv_ill->ill_phyint->phyint_ifindex;
		}
		first_mp->b_cont = mp;
	} else if (onlink) {
		ii = (ipsec_in_t *)first_mp->b_rptr;
		ii->ipsec_in_attach_if = B_TRUE;
		ii->ipsec_in_ill_index = ill->ill_phyint->phyint_ifindex;
		ii->ipsec_in_rill_index = recv_ill->ill_phyint->phyint_ifindex;
		ii->ipsec_in_ns = ipst->ips_netstack;	/* No netstack_hold */
	} else {
		ii = (ipsec_in_t *)first_mp->b_rptr;
		ii->ipsec_in_ns = ipst->ips_netstack;	/* No netstack_hold */
	}
	ii->ipsec_in_zoneid = zoneid;
	ASSERT(zoneid != ALL_ZONES);
	if (!ipsec_in_to_out(first_mp, ipha, NULL)) {
		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
		return;
	}
	BUMP_MIB(&ipst->ips_icmp_mib, icmpOutMsgs);
	put(WR(q), first_mp);
}
/*
 * Given the IP header (ipha) of a packet quoted in an ICMP error, locate the
 * local conn_t that sent it by doing a reverse lookup (the quoted header is
 * in the form we sent it out) and return the next-hop address recorded on
 * that conn (conn_nexthop_v4) if conn_nexthop_set is set.
 *
 * Returns INADDR_ANY when no conn matches or the conn has no next hop set.
 * The transport ports are read from directly behind the quoted IP header.
 */
static ipaddr_t
icmp_get_nexthop_addr(ipha_t *ipha, ill_t *ill, zoneid_t zoneid, mblk_t *mp)
{
	ip_stack_t	*ipst = ill->ill_ipst;
	int		iph_len = IPH_HDR_LENGTH(ipha);
	uint16_t	*ulp = (uint16_t *)((uchar_t *)ipha + iph_len);
	conn_t		*connp = NULL;
	connf_t		*connfp;
	uint32_t	rports;
	ipaddr_t	nexthop;

	switch (ipha->ipha_protocol) {
	case IPPROTO_TCP:
		/* Reverse lookup on the quoted TCP header. */
		connp = ipcl_tcp_lookup_reversed_ipv4(ipha,
		    (tcph_t *)((uchar_t *)ipha + iph_len), TCPS_LISTEN, ipst);
		break;

	case IPPROTO_UDP: {
		uint32_t	host_ports;
		uint32_t	dstport, srcport;

		/* Build the port pair with source and destination swapped. */
		((uint16_t *)&rports)[0] = ulp[1];
		((uint16_t *)&rports)[1] = ulp[0];

		/* Extract ports in net byte order */
		host_ports = ntohl(rports);
		dstport = htons(host_ports & 0xFFFF);
		srcport = htons(host_ports >> 16);

		connfp = &ipst->ips_ipcl_udp_fanout[
		    IPCL_UDP_HASH(dstport, ipst)];
		mutex_enter(&connfp->connf_lock);
		/* Walk the bucket; take a reference on the first match. */
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_UDP_MATCH(connp, dstport,
			    ipha->ipha_src, srcport, ipha->ipha_dst) &&
			    IPCL_ZONE_MATCH(connp, zoneid)) {
				CONN_INC_REF(connp);
				break;
			}
		}
		mutex_exit(&connfp->connf_lock);
		break;
	}

	case IPPROTO_SCTP: {
		in6_addr_t	map_src, map_dst;

		/* Addresses are swapped for the reverse lookup. */
		IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &map_src);
		IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &map_dst);
		((uint16_t *)&rports)[0] = ulp[1];
		((uint16_t *)&rports)[1] = ulp[0];

		connp = sctp_find_conn(&map_src, &map_dst, rports,
		    0, zoneid, ipst->ips_netstack->netstack_sctp);
		if (connp != NULL) {
			/* Swap the SCTP hold for a conn reference. */
			CONN_INC_REF(connp);
			SCTP_REFRELE(CONN2SCTP(connp));
		} else {
			/* No SCTP association; try the raw classifier. */
			connp = ipcl_classify_raw(mp, IPPROTO_SCTP,
			    zoneid, rports, ipha, ipst);
		}
		break;
	}

	default: {
		ipha_t	ripha;

		/* Only the fields needed for matching are filled in. */
		ripha.ipha_src = ipha->ipha_dst;
		ripha.ipha_dst = ipha->ipha_src;
		ripha.ipha_protocol = ipha->ipha_protocol;

		connfp = &ipst->ips_ipcl_proto_fanout[
		    ipha->ipha_protocol];
		mutex_enter(&connfp->connf_lock);
		connp = connfp->connf_head;
		while (connp != NULL &&
		    !IPCL_PROTO_MATCH(connp, ipha->ipha_protocol, &ripha,
		    ill, 0, zoneid)) {
			connp = connp->conn_next;
		}
		if (connp != NULL)
			CONN_INC_REF(connp);
		mutex_exit(&connfp->connf_lock);
		break;
	}
	}

	if (connp == NULL)
		return (INADDR_ANY);

	nexthop = INADDR_ANY;
	if (connp->conn_nexthop_set)
		nexthop = connp->conn_nexthop_v4;
	/* Drop the reference taken by the lookup above. */
	CONN_DEC_REF(connp);
	return (nexthop);
}
/*
 * MTU "plateau" table from RFC 1191 (Table 7-1), in descending order.
 * icmp_inbound_too_big() scans it for the first entry strictly smaller than
 * the length of the datagram that triggered a "fragmentation needed" error
 * when the router did not report a usable next-hop MTU.
 *
 * The IEEE 802.3 plateau is 1492 in the RFC; this table previously had 1496,
 * which is larger than that link MTU and therefore never helps on such a path.
 */
static int icmp_frag_size_table[] = {
	32000, 17914, 8166, 4352, 2002, 1492, 1006, 508, 296, 68
};
/*
* Process received ICMP Packet too big (ICMP_DEST_UNREACHABLE with code
* ICMP_FRAGMENTATION_NEEDED).
*
* Lowers ire_max_frag on the IRE(s) matching the destination of the packet
* that caused the error. When the message carries no usable MTU, a value is
* derived from icmp_frag_size_table and written back into the ICMP header
* (icmph_du_zero / icmph_du_mtu) so the ULP can see it.
*
* This function does not do the fanout to the transport itself; the caller
* (icmp_inbound) does that after a B_TRUE return.
*
* Assumes the message has been pulled up till the IP header that caused
* the error.
*
* icmph		- ICMP header of the received message.
* ipha			- IP header of the packet that caused the error.
* iph_hdr_length	- offset of the ICMP header from mp->b_rptr; used to
*			  recompute icmph/ipha after a pullup.
* mp			- the message; it may be pulled up here, so the caller
*			  must recompute any pointers it holds into mp.
*
* Returns B_FALSE on failure (source route present in the original packet,
* pullup failure, or no matching IRE) and B_TRUE on success.
*/
static boolean_t
icmp_inbound_too_big(icmph_t *icmph, ipha_t *ipha, ill_t *ill,
zoneid_t zoneid, mblk_t *mp, int iph_hdr_length,
ip_stack_t *ipst)
{
ire_t *ire, *first_ire;
int mtu;
int hdr_length;
ipaddr_t nexthop_addr;
ASSERT(icmph->icmph_type == ICMP_DEST_UNREACHABLE &&
icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED);
ASSERT(ill != NULL);
/* Header length of the packet in error, not of the ICMP packet. */
hdr_length = IPH_HDR_LENGTH(ipha);
/* Drop if the original packet contained a source route */
if (ip_source_route_included(ipha)) {
return (B_FALSE);
}
/*
* Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of transport
* header.
*/
if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN >
mp->b_wptr) {
if (!pullupmsg(mp, (uchar_t *)ipha + hdr_length +
ICMP_MIN_TP_HDR_LEN - mp->b_rptr)) {
BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
ip1dbg(("icmp_inbound_too_big: insufficient hdr\n"));
return (B_FALSE);
}
/* The pullup may have moved the data; recompute both pointers. */
icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
ipha = (ipha_t *)&icmph[1];
}
/* INADDR_ANY here means no next hop was found for the sending conn. */
nexthop_addr = icmp_get_nexthop_addr(ipha, ill, zoneid, mp);
if (nexthop_addr != INADDR_ANY) {
/* nexthop set */
first_ire = ire_ctable_lookup(ipha->ipha_dst,
nexthop_addr, 0, NULL, ALL_ZONES, MBLK_GETLABEL(mp),
MATCH_IRE_MARK_PRIVATE_ADDR | MATCH_IRE_GW, ipst);
} else {
/* nexthop not set */
first_ire = ire_ctable_lookup(ipha->ipha_dst, 0, IRE_CACHE,
NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
}
if (!first_ire) {
ip1dbg(("icmp_inbound_too_big: no route for 0x%x\n",
ntohl(ipha->ipha_dst)));
return (B_FALSE);
}
/* Check for MTU discovery advice as described in RFC 1191 */
mtu = ntohs(icmph->icmph_du_mtu);
/*
* Walk the IREs chained from first_ire that have the same destination
* address, holding the bucket lock as reader for the whole walk.
*/
rw_enter(&first_ire->ire_bucket->irb_lock, RW_READER);
for (ire = first_ire; ire != NULL && ire->ire_addr == ipha->ipha_dst;
ire = ire->ire_next) {
/*
* Look for the connection to which this ICMP message is
* directed. If it has the IP_NEXTHOP option set, then the
* search is limited to IREs with the MATCH_IRE_PRIVATE
* option. Else the search is limited to regular IREs.
*/
if (((ire->ire_marks & IRE_MARK_PRIVATE_ADDR) &&
(nexthop_addr != ire->ire_gateway_addr)) ||
(!(ire->ire_marks & IRE_MARK_PRIVATE_ADDR) &&
(nexthop_addr != INADDR_ANY)))
continue;
mutex_enter(&ire->ire_lock);
if (icmph->icmph_du_zero == 0 && mtu > 68) {
/* Reduce the IRE max frag value as advised. */
ip1dbg(("Received mtu from router: %d (was %d)\n",
mtu, ire->ire_max_frag));
ire->ire_max_frag = MIN(ire->ire_max_frag, mtu);
} else {
uint32_t length;
int i;
/*
* Use the table from RFC 1191 to figure out
* the next "plateau" based on the length in
* the original IP packet.
*/
length = ntohs(ipha->ipha_length);
if (ire->ire_max_frag <= length &&
ire->ire_max_frag >= length - hdr_length) {
/*
* Handle broken BSD 4.2 systems that
* return the wrong iph_length in ICMP
* errors.
*/
ip1dbg(("Wrong mtu: sent %d, ire %d\n",
length, ire->ire_max_frag));
length -= hdr_length;
}
/* Find the first table entry strictly below length. */
for (i = 0; i < A_CNT(icmp_frag_size_table); i++) {
if (length > icmp_frag_size_table[i])
break;
}
if (i == A_CNT(icmp_frag_size_table)) {
/* Smaller than 68! */
ip1dbg(("Too big for packet size %d\n",
length));
ire->ire_max_frag = MIN(ire->ire_max_frag, 576);
ire->ire_frag_flag = 0;
} else {
mtu = icmp_frag_size_table[i];
ip1dbg(("Calculated mtu %d, packet size %d, "
"before %d", mtu, length,
ire->ire_max_frag));
ire->ire_max_frag = MIN(ire->ire_max_frag, mtu);
ip1dbg((", after %d\n", ire->ire_max_frag));
}
/* Record the new max frag size for the ULP. */
/*
* NOTE(review): this clears icmph_du_zero in the packet, so
* for any further IRE visited by this loop the test above is
* decided by the current value of mtu alone (the plateau
* chosen here, or the value read from the packet if the
* "Smaller than 68" branch was taken) - confirm intended.
*/
icmph->icmph_du_zero = 0;
icmph->icmph_du_mtu =
htons((uint16_t)ire->ire_max_frag);
}
mutex_exit(&ire->ire_lock);
}
rw_exit(&first_ire->ire_bucket->irb_lock);
/* Release first_ire now that the walk is finished. */
ire_refrele(first_ire);
return (B_TRUE);
}
/*
 * If the packet in error is Self-Encapsulated, icmp_inbound_error_fanout
 * calls this function.
 *
 * Strips the outer (encapsulating) IP header from the packet quoted in an
 * ICMP error by sliding everything that follows it down over it.
 *
 * mp			- M_DATA message holding the ICMP error; it is
 *			  pulled up into a single block here.
 * iph_hdr_length	- offset of the ICMP header from mp->b_rptr.
 * hdr_length		- length of the outer IP header inside the ICMP
 *			  payload, i.e. the header to be removed.
 *
 * Returns mp on success. Returns NULL if the pullup fails or if the message
 * is too short to hold the IP header, the ICMP header and the outer header;
 * mp is not freed by this function in either case.
 */
static mblk_t *
icmp_inbound_self_encap_error(mblk_t *mp, int iph_hdr_length, int hdr_length)
{
	ipha_t	*ipha;
	icmph_t	*icmph;
	ipha_t	*in_ipha;
	int	length;

	ASSERT(mp->b_datap->db_type == M_DATA);

	/*
	 * For Self-encapsulated packets, we added an extra IP header
	 * without the options. Inner IP header is the one from which
	 * the outer IP header was formed. Thus, we need to remove the
	 * outer IP header. To do this, we pullup the whole message
	 * and overlay whatever follows the outer IP header over the
	 * outer IP header.
	 */
	if (!pullupmsg(mp, -1))
		return (NULL);

	/*
	 * The length that we want to overlay is following the inner
	 * IP header. Subtracting the IP header + icmp header + outer
	 * IP header's length should give us the length that we want to
	 * overlay.
	 *
	 * Do the arithmetic in signed int so that a message shorter than
	 * those three headers is detected here instead of being handed to
	 * bcopy() as a huge unsigned count (and moving b_wptr backwards
	 * past the data).
	 */
	length = (int)msgdsize(mp) - iph_hdr_length -
	    (int)sizeof (icmph_t) - hdr_length;
	if (length < 0)
		return (NULL);

	icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
	ipha = (ipha_t *)&icmph[1];
	in_ipha = (ipha_t *)((uchar_t *)ipha + hdr_length);

	/*
	 * Overlay whatever follows the inner header over the
	 * outer header.
	 */
	bcopy((uchar_t *)in_ipha, (uchar_t *)ipha, length);

	/* Set the wptr to account for the outer header */
	mp->b_wptr -= hdr_length;
	return (mp);
}
/*
* Try to pass the ICMP message upstream in case the ULP cares.
*
* If the packet that caused the ICMP error is secure, we send
* it to AH/ESP to make sure that the attached packet has a
* valid association. ipha in the code below points to the
* IP header of the packet that caused the error.
*
* We handle ICMP_FRAGMENTATION_NEEDED(IFN) message differently
* in the context of IPSEC. Normally we tell the upper layer
* whenever we send the ire (including ip_bind), the IPSEC header
* length in ire_ipsec_overhead. TCP can deduce the MSS as it
* has both the MTU (ire_max_frag) and the ire_ipsec_overhead.
* Similarly, we pass the new MTU icmph_du_mtu and TCP does the
* same thing. As TCP has the IPSEC options size that needs to be
* adjusted, we just pass the MTU unchanged.
*
* IFN could have been generated locally or by some router.
*
* LOCAL : *ip_wput_ire -> icmp_frag_needed could have generated this.
* This happens because IP adjusted its value of MTU on an
* earlier IFN message and could not tell the upper layer,
* the new adjusted value of MTU e.g. Packet was encrypted
* or there was not enough information to fanout to upper
* layers. Thus on the next outbound datagram, ip_wput_ire
* generates the IFN, where IPSEC processing has *not* been
* done.
*
* *ip_wput_ire_fragmentit -> ip_wput_frag -> icmp_frag_needed
* could have generated this. This happens because ire_max_frag
* value in IP was set to a new value, while the IPSEC processing
* was being done and after we made the fragmentation check in
* ip_wput_ire. Thus on return from IPSEC processing,
* ip_wput_ipsec_out finds that the new length is > ire_max_frag
* and generates the IFN. As IPSEC processing is over, we fanout
* to AH/ESP to remove the header.
*
* In both these cases, ipsec_in_loopback will be set indicating
* that IFN was generated locally.
*
* ROUTER : IFN could be secure or non-secure.
*
* * SECURE : We use the IPSEC_IN to fanout to AH/ESP if the
* packet in error has AH/ESP headers to validate the AH/ESP
* headers. AH/ESP will verify whether there is a valid SA or
* not and send it back. We will fanout again if we have more
* data in the packet.
*
* If the packet in error does not have AH/ESP, we handle it
* like any other case.
*
* * NON_SECURE : If the packet in error has AH/ESP headers,
* we attach a dummy ipsec_in and send it up to AH/ESP
* for validation. AH/ESP will verify whether there is a
* valid SA or not and send it back. We will fanout again if
* we have more data in the packet.
*
* If the packet in error does not have AH/ESP, we handle it
* like any other case.
*/
static void
icmp_inbound_error_fanout(queue_t *q, ill_t *ill, mblk_t *mp,
icmph_t *icmph, ipha_t *ipha, int iph_hdr_length, int hdr_length,
boolean_t mctl_present, boolean_t ip_policy, ill_t *recv_ill,
zoneid_t zoneid)
{
uint16_t *up; /* Pointer to ports in ULP header */
uint32_t ports; /* reversed ports for fanout */
ipha_t ripha; /* With reversed addresses */
mblk_t *first_mp;
ipsec_in_t *ii;
tcph_t *tcph;
conn_t *connp;
ip_stack_t *ipst;
ASSERT(ill != NULL);
ASSERT(recv_ill != NULL);
ipst = recv_ill->ill_ipst;
first_mp = mp;
if (mctl_present) {
mp = first_mp->b_cont;
ASSERT(mp != NULL);
ii = (ipsec_in_t *)first_mp->b_rptr;
ASSERT(ii->ipsec_in_type == IPSEC_IN);
} else {
ii = NULL;
}
switch (ipha->ipha_protocol) {
case IPPROTO_UDP:
/*
* Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
* transport header.
*/
if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN >
mp->b_wptr) {
if (!pullupmsg(mp, (uchar_t *)ipha + hdr_length +
ICMP_MIN_TP_HDR_LEN - mp->b_rptr)) {
goto discard_pkt;
}
icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
ipha = (ipha_t *)&icmph[1];
}
up = (uint16_t *)((uchar_t *)ipha + hdr_length);
/*
* Attempt to find a client stream based on port.
* Note that we do a reverse lookup since the header is
* in the form we sent it out.
* The ripha header is only used for the IP_UDP_MATCH and we
* only set the src and dst addresses and protocol.
*/
ripha.ipha_src = ipha->ipha_dst;
ripha.ipha_dst = ipha->ipha_src;
ripha.ipha_protocol = ipha->ipha_protocol;
((uint16_t *)&ports)[0] = up[1];
((uint16_t *)&ports)[1] = up[0];
ip2dbg(("icmp_inbound_error: UDP %x:%d to %x:%d: %d/%d\n",
ntohl(ipha->ipha_src), ntohs(up[0]),
ntohl(ipha->ipha_dst), ntohs(up[1]),
icmph->icmph_type, icmph->icmph_code));
/* Have to change db_type after any pullupmsg */
DB_TYPE(mp) = M_CTL;
ip_fanout_udp(q, first_mp, ill, &ripha, ports, B_FALSE, 0,
mctl_present, ip_policy, recv_ill, zoneid);
return;
case IPPROTO_TCP:
/*
* Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
* transport header.
*/
if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN >
mp->b_wptr) {
if (!pullupmsg(mp, (uchar_t *)ipha + hdr_length +
ICMP_MIN_TP_HDR_LEN - mp->b_rptr)) {
goto discard_pkt;
}
icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
ipha = (ipha_t *)&icmph[1];
}
/*
* Find a TCP client stream for this packet.
* Note that we do a reverse lookup since the header is
* in the form we sent it out.
*/
tcph = (tcph_t *)((uchar_t *)ipha + hdr_length);
connp = ipcl_tcp_lookup_reversed_ipv4(ipha, tcph, TCPS_LISTEN,
ipst);
if (connp == NULL)
goto discard_pkt;
/* Have to change db_type after any pullupmsg */
DB_TYPE(mp) = M_CTL;
squeue_fill(connp->conn_sqp, first_mp, tcp_input,
connp, SQTAG_TCP_INPUT_ICMP_ERR);
return;
case IPPROTO_SCTP:
/*
* Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
* transport header.
*/
if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN >
mp->b_wptr) {
if (!pullupmsg(mp, (uchar_t *)ipha + hdr_length +
ICMP_MIN_TP_HDR_LEN - mp->b_rptr)) {
goto discard_pkt;
}
icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
ipha = (ipha_t *)&icmph[1];
}
up = (uint16_t *)((uchar_t *)ipha + hdr_length);
/*
* Find a SCTP client stream for this packet.
* Note that we do a reverse lookup since the header is
* in the form we sent it out.
* The ripha header is only used for the matching and we
* only set the src and dst addresses, protocol, and version.
*/
ripha.ipha_src = ipha->ipha_dst;
ripha.ipha_dst = ipha->ipha_src;
ripha.ipha_protocol = ipha->ipha_protocol;
ripha.ipha_version_and_hdr_length =
ipha->ipha_version_and_hdr_length;
((uint16_t *)&ports)[0] = up[1];
((uint16_t *)&ports)[1] = up[0];
/* Have to change db_type after any pullupmsg */
DB_TYPE(mp) = M_CTL;
ip_fanout_sctp(first_mp, recv_ill, &ripha, ports, 0,
mctl_present, ip_policy, 0, zoneid);
return;
case IPPROTO_ESP:
case IPPROTO_AH: {
int ipsec_rc;
ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec;
/*
* We need a IPSEC_IN in the front to fanout to AH/ESP.
* We will re-use the IPSEC_IN if it is already present as
* AH/ESP will not affect any fields in the IPSEC_IN for
* ICMP errors. If there is no IPSEC_IN, allocate a new
* one and attach it in the front.
*/
if (ii != NULL) {
/*
* ip_fanout_proto_again converts the ICMP errors
* that come back from AH/ESP to M_DATA so that
* if it is non-AH/ESP and we do a pullupmsg in
* this function, it would work. Convert it back
* to M_CTL before we send up as this is a ICMP
* error. This could have been generated locally or
* by some router. Validate the inner IPSEC
* headers.
*
* NOTE : ill_index is used by ip_fanout_proto_again
* to locate the ill.
*/
ASSERT(ill != NULL);
ii->ipsec_in_ill_index =
ill->ill_phyint->phyint_ifindex;
ii->ipsec_in_rill_index =
recv_ill->ill_phyint->phyint_ifindex;
DB_TYPE(first_mp->b_cont) = M_CTL;
} else {
/*
* IPSEC_IN is not present. We attach a ipsec_in
* message and send up to IPSEC for validating
* and removing the IPSEC headers. Clear