| /* |
| * CDDL HEADER START |
| * |
| * The contents of this file are subject to the terms of the |
| * Common Development and Distribution License (the "License"). |
| * You may not use this file except in compliance with the License. |
| * |
| * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE |
| * or http://www.opensolaris.org/os/licensing. |
| * See the License for the specific language governing permissions |
| * and limitations under the License. |
| * |
| * When distributing Covered Code, include this CDDL HEADER in each |
| * file and include the License file at usr/src/OPENSOLARIS.LICENSE. |
| * If applicable, add the following below this CDDL HEADER, with the |
| * fields enclosed by brackets "[]" replaced with your own identifying |
| * information: Portions Copyright [yyyy] [name of copyright owner] |
| * |
| * CDDL HEADER END |
| */ |
| |
| /* |
| * Copyright 2010 Sun Microsystems, Inc. All rights reserved. |
| * Use is subject to license terms. |
| */ |
| /* Copyright (c) 1990 Mentat Inc. */ |
| |
| #include <sys/types.h> |
| #include <sys/stream.h> |
| #include <sys/dlpi.h> |
| #include <sys/stropts.h> |
| #include <sys/sysmacros.h> |
| #include <sys/strsubr.h> |
| #include <sys/strlog.h> |
| #include <sys/strsun.h> |
| #include <sys/zone.h> |
| #define _SUN_TPI_VERSION 2 |
| #include <sys/tihdr.h> |
| #include <sys/xti_inet.h> |
| #include <sys/ddi.h> |
| #include <sys/suntpi.h> |
| #include <sys/cmn_err.h> |
| #include <sys/debug.h> |
| #include <sys/kobj.h> |
| #include <sys/modctl.h> |
| #include <sys/atomic.h> |
| #include <sys/policy.h> |
| #include <sys/priv.h> |
| #include <sys/taskq.h> |
| |
| #include <sys/systm.h> |
| #include <sys/param.h> |
| #include <sys/kmem.h> |
| #include <sys/sdt.h> |
| #include <sys/socket.h> |
| #include <sys/vtrace.h> |
| #include <sys/isa_defs.h> |
| #include <sys/mac.h> |
| #include <net/if.h> |
| #include <net/if_arp.h> |
| #include <net/route.h> |
| #include <sys/sockio.h> |
| #include <netinet/in.h> |
| #include <net/if_dl.h> |
| |
| #include <inet/common.h> |
| #include <inet/mi.h> |
| #include <inet/mib2.h> |
| #include <inet/nd.h> |
| #include <inet/arp.h> |
| #include <inet/snmpcom.h> |
| #include <inet/optcom.h> |
| #include <inet/kstatcom.h> |
| |
| #include <netinet/igmp_var.h> |
| #include <netinet/ip6.h> |
| #include <netinet/icmp6.h> |
| #include <netinet/sctp.h> |
| |
| #include <inet/ip.h> |
| #include <inet/ip_impl.h> |
| #include <inet/ip6.h> |
| #include <inet/ip6_asp.h> |
| #include <inet/tcp.h> |
| #include <inet/tcp_impl.h> |
| #include <inet/ip_multi.h> |
| #include <inet/ip_if.h> |
| #include <inet/ip_ire.h> |
| #include <inet/ip_ftable.h> |
| #include <inet/ip_rts.h> |
| #include <inet/ip_ndp.h> |
| #include <inet/ip_listutils.h> |
| #include <netinet/igmp.h> |
| #include <netinet/ip_mroute.h> |
| #include <inet/ipp_common.h> |
| |
| #include <net/pfkeyv2.h> |
| #include <inet/sadb.h> |
| #include <inet/ipsec_impl.h> |
| #include <inet/iptun/iptun_impl.h> |
| #include <inet/ipdrop.h> |
| #include <inet/ip_netinfo.h> |
| #include <inet/ilb_ip.h> |
| |
| #include <sys/ethernet.h> |
| #include <net/if_types.h> |
| #include <sys/cpuvar.h> |
| |
| #include <ipp/ipp.h> |
| #include <ipp/ipp_impl.h> |
| #include <ipp/ipgpc/ipgpc.h> |
| |
| #include <sys/pattr.h> |
| #include <inet/ipclassifier.h> |
| #include <inet/sctp_ip.h> |
| #include <inet/sctp/sctp_impl.h> |
| #include <inet/udp_impl.h> |
| #include <inet/rawip_impl.h> |
| #include <inet/rts_impl.h> |
| |
| #include <sys/tsol/label.h> |
| #include <sys/tsol/tnet.h> |
| |
| #include <sys/squeue_impl.h> |
| #include <inet/ip_arp.h> |
| |
| #include <sys/clock_impl.h> /* For LBOLT_FASTPATH{,64} */ |
| |
| /* |
| * Values for squeue switch: |
| * IP_SQUEUE_ENTER_NODRAIN: SQ_NODRAIN |
| * IP_SQUEUE_ENTER: SQ_PROCESS |
| * IP_SQUEUE_FILL: SQ_FILL |
| */ |
int ip_squeue_enter = IP_SQUEUE_ENTER;	/* Setable in /etc/system */

/* SQ_* flag corresponding to ip_squeue_enter; presumably set at init — verify */
int ip_squeue_flag;

/*
 * Setable in /etc/system
 */
int ip_poll_normal_ms = 100;	/* poll interval, milliseconds */
int ip_poll_normal_ticks = 0;	/* presumably ip_poll_normal_ms in ticks, converted at init — verify */
int ip_modclose_ackwait_ms = 3000;	/* max wait (ms) for acks during module close */

/*
 * It would be nice to have these present only in DEBUG systems, but the
 * current design of the global symbol checking logic requires them to be
 * unconditionally present.
 */
uint_t ip_thread_data;			/* TSD key for debug support */
krwlock_t ip_thread_rwlock;		/* NOTE(review): presumably guards ip_thread_list — verify */
list_t ip_thread_list;			/* list of IP threads, for debug support */
| |
| /* |
| * Structure to represent a linked list of msgblks. Used by ip_snmp_ functions. |
| */ |
| |
/*
 * Head/tail pair describing a chain of mblks, so new mblks can be
 * appended at the tail without walking the list.
 */
struct listptr_s {
	mblk_t	*lp_head;	/* pointer to the head of the list */
	mblk_t	*lp_tail;	/* pointer to the tail of the list */
};

typedef struct listptr_s listptr_t;
| |
| /* |
| * This is used by ip_snmp_get_mib2_ip_route_media and |
| * ip_snmp_get_mib2_ip6_route_media to carry the lists of return data. |
| */ |
typedef struct iproutedata_s {
	uint_t		ird_idx;	/* walk index; usage in ip_snmp_get2_* (not shown) */
	uint_t		ird_flags;	/* IRD_* flags, see below */
	listptr_t	ird_route;	/* ipRouteEntryTable */
	listptr_t	ird_netmedia;	/* ipNetToMediaEntryTable */
	listptr_t	ird_attrs;	/* ipRouteAttributeTable */
} iproutedata_t;

/* Include ire_testhidden and IRE_IF_CLONE routes in the report */
#define	IRD_REPORT_ALL	0x01
| |
| /* |
| * Cluster specific hooks. These should be NULL when booted as a non-cluster |
| */ |
| |
| /* |
| * Hook functions to enable cluster networking |
| * On non-clustered systems these vectors must always be NULL. |
| * |
 * Hook function to check whether a specified IP address is a shared IP
 * address in the cluster
| * |
| */ |
int (*cl_inet_isclusterwide)(netstackid_t stack_id, uint8_t protocol,
    sa_family_t addr_family, uint8_t *laddrp, void *args) = NULL;

/*
 * Hook function to generate a cluster-wide IP fragment identifier.
 */
uint32_t (*cl_inet_ipident)(netstackid_t stack_id, uint8_t protocol,
    sa_family_t addr_family, uint8_t *laddrp, uint8_t *faddrp,
    void *args) = NULL;

/*
 * Hook function to generate a cluster-wide SPI.
 */
void (*cl_inet_getspi)(netstackid_t, uint8_t, uint8_t *, size_t,
    void *) = NULL;

/*
 * Hook function to verify whether an SPI is already utilized.
 */

int (*cl_inet_checkspi)(netstackid_t, uint8_t, uint32_t, void *) = NULL;

/*
 * Hook function to delete an SPI from the cluster-wide repository.
 */

void (*cl_inet_deletespi)(netstackid_t, uint8_t, uint32_t, void *) = NULL;

/*
 * Hook function to inform the cluster when a packet is received on an
 * idle security association (SA).
 */

void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t,
    in6_addr_t, in6_addr_t, void *) = NULL;
| |
| /* |
| * Synchronization notes: |
| * |
| * IP is a fully D_MP STREAMS module/driver. Thus it does not depend on any |
| * MT level protection given by STREAMS. IP uses a combination of its own |
| * internal serialization mechanism and standard Solaris locking techniques. |
| * The internal serialization is per phyint. This is used to serialize |
| * plumbing operations, IPMP operations, most set ioctls, etc. |
| * |
| * Plumbing is a long sequence of operations involving message |
| * exchanges between IP, ARP and device drivers. Many set ioctls are typically |
| * involved in plumbing operations. A natural model is to serialize these |
| * ioctls one per ill. For example plumbing of hme0 and qfe0 can go on in |
| * parallel without any interference. But various set ioctls on hme0 are best |
| * serialized, along with IPMP operations and processing of DLPI control |
| * messages received from drivers on a per phyint basis. This serialization is |
| * provided by the ipsq_t and primitives operating on this. Details can |
| * be found in ip_if.c above the core primitives operating on ipsq_t. |
| * |
| * Lookups of an ipif or ill by a thread return a refheld ipif / ill. |
 * Similarly, lookup of an ire by a thread also returns a refheld ire.
| * In addition ipif's and ill's referenced by the ire are also indirectly |
| * refheld. Thus no ipif or ill can vanish as long as an ipif is refheld |
| * directly or indirectly. For example an SIOCSLIFADDR ioctl that changes the |
| * address of an ipif has to go through the ipsq_t. This ensures that only |
| * one such exclusive operation proceeds at any time on the ipif. It then |
| * waits for all refcnts |
| * associated with this ipif to come down to zero. The address is changed |
| * only after the ipif has been quiesced. Then the ipif is brought up again. |
| * More details are described above the comment in ip_sioctl_flags. |
| * |
| * Packet processing is based mostly on IREs and are fully multi-threaded |
| * using standard Solaris MT techniques. |
| * |
| * There are explicit locks in IP to handle: |
| * - The ip_g_head list maintained by mi_open_link() and friends. |
| * |
| * - The reassembly data structures (one lock per hash bucket) |
| * |
| * - conn_lock is meant to protect conn_t fields. The fields actually |
| * protected by conn_lock are documented in the conn_t definition. |
| * |
| * - ire_lock to protect some of the fields of the ire, IRE tables |
| * (one lock per hash bucket). Refer to ip_ire.c for details. |
| * |
| * - ndp_g_lock and ncec_lock for protecting NCEs. |
| * |
| * - ill_lock protects fields of the ill and ipif. Details in ip.h |
| * |
| * - ill_g_lock: This is a global reader/writer lock. Protects the following |
| * * The AVL tree based global multi list of all ills. |
| * * The linked list of all ipifs of an ill |
| * * The <ipsq-xop> mapping |
| * * <ill-phyint> association |
| * Insertion/deletion of an ill in the system, insertion/deletion of an ipif |
| * into an ill, changing the <ipsq-xop> mapping of an ill, changing the |
| * <ill-phyint> assoc of an ill will all have to hold the ill_g_lock as |
| * writer for the actual duration of the insertion/deletion/change. |
| * |
| * - ill_lock: This is a per ill mutex. |
| * It protects some members of the ill_t struct; see ip.h for details. |
| * It also protects the <ill-phyint> assoc. |
| * It also protects the list of ipifs hanging off the ill. |
| * |
| * - ipsq_lock: This is a per ipsq_t mutex lock. |
| * This protects some members of the ipsq_t struct; see ip.h for details. |
| * It also protects the <ipsq-ipxop> mapping |
| * |
| * - ipx_lock: This is a per ipxop_t mutex lock. |
| * This protects some members of the ipxop_t struct; see ip.h for details. |
| * |
| * - phyint_lock: This is a per phyint mutex lock. Protects just the |
| * phyint_flags |
| * |
| * - ip_g_nd_lock: This is a global reader/writer lock. |
| * Any call to nd_load to load a new parameter to the ND table must hold the |
| * lock as writer. ND_GET/ND_SET routines that read the ND table hold the lock |
| * as reader. |
| * |
| * - ip_addr_avail_lock: This is used to ensure the uniqueness of IP addresses. |
| * This lock is held in ipif_up_done and the ipif is marked IPIF_UP and the |
| * uniqueness check also done atomically. |
| * |
| * - ill_g_usesrc_lock: This readers/writer lock protects the usesrc |
| * group list linked by ill_usesrc_grp_next. It also protects the |
| * ill_usesrc_ifindex field. It is taken as a writer when a member of the |
| * group is being added or deleted. This lock is taken as a reader when |
| * walking the list/group(eg: to get the number of members in a usesrc group). |
| * Note, it is only necessary to take this lock if the ill_usesrc_grp_next |
| * field is changing state i.e from NULL to non-NULL or vice-versa. For |
| * example, it is not necessary to take this lock in the initial portion |
| * of ip_sioctl_slifusesrc or at all in ip_sioctl_flags since these |
| * operations are executed exclusively and that ensures that the "usesrc |
| * group state" cannot change. The "usesrc group state" change can happen |
| * only in the latter part of ip_sioctl_slifusesrc and in ill_delete. |
| * |
 * Changing <ill-phyint>, <ipsq-xop> associations:
| * |
| * To change the <ill-phyint> association, the ill_g_lock must be held |
| * as writer, and the ill_locks of both the v4 and v6 instance of the ill |
| * must be held. |
| * |
| * To change the <ipsq-xop> association, the ill_g_lock must be held as |
| * writer, the ipsq_lock must be held, and one must be writer on the ipsq. |
| * This is only done when ills are added or removed from IPMP groups. |
| * |
| * To add or delete an ipif from the list of ipifs hanging off the ill, |
| * ill_g_lock (writer) and ill_lock must be held and the thread must be |
| * a writer on the associated ipsq. |
| * |
| * To add or delete an ill to the system, the ill_g_lock must be held as |
| * writer and the thread must be a writer on the associated ipsq. |
| * |
| * To add or delete an ilm to an ill, the ill_lock must be held and the thread |
| * must be a writer on the associated ipsq. |
| * |
| * Lock hierarchy |
| * |
| * Some lock hierarchy scenarios are listed below. |
| * |
| * ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock -> ipx_lock |
| * ill_g_lock -> ill_lock(s) -> phyint_lock |
| * ill_g_lock -> ndp_g_lock -> ill_lock -> ncec_lock |
| * ill_g_lock -> ip_addr_avail_lock |
| * conn_lock -> irb_lock -> ill_lock -> ire_lock |
| * ill_g_lock -> ip_g_nd_lock |
| * ill_g_lock -> ips_ipmp_lock -> ill_lock -> nce_lock |
| * ill_g_lock -> ndp_g_lock -> ill_lock -> ncec_lock -> nce_lock |
| * arl_lock -> ill_lock |
| * ips_ire_dep_lock -> irb_lock |
| * |
| * When more than 1 ill lock is needed to be held, all ill lock addresses |
| * are sorted on address and locked starting from highest addressed lock |
| * downward. |
| * |
| * Multicast scenarios |
| * ips_ill_g_lock -> ill_mcast_lock |
| * conn_ilg_lock -> ips_ill_g_lock -> ill_lock |
| * ill_mcast_serializer -> ill_mcast_lock -> ips_ipmp_lock -> ill_lock |
| * ill_mcast_serializer -> ill_mcast_lock -> connf_lock -> conn_lock |
| * ill_mcast_serializer -> ill_mcast_lock -> conn_ilg_lock |
| * ill_mcast_serializer -> ill_mcast_lock -> ips_igmp_timer_lock |
| * |
| * IPsec scenarios |
| * |
| * ipsa_lock -> ill_g_lock -> ill_lock |
| * ill_g_usesrc_lock -> ill_g_lock -> ill_lock |
| * |
| * Trusted Solaris scenarios |
| * |
| * igsa_lock -> gcgrp_rwlock -> gcgrp_lock |
| * igsa_lock -> gcdb_lock |
| * gcgrp_rwlock -> ire_lock |
| * gcgrp_rwlock -> gcdb_lock |
| * |
| * squeue(sq_lock), flow related (ft_lock, fe_lock) locking |
| * |
| * cpu_lock --> ill_lock --> sqset_lock --> sq_lock |
| * sq_lock -> conn_lock -> QLOCK(q) |
| * ill_lock -> ft_lock -> fe_lock |
| * |
| * Routing/forwarding table locking notes: |
| * |
| * Lock acquisition order: Radix tree lock, irb_lock. |
| * Requirements: |
| * i. Walker must not hold any locks during the walker callback. |
| * ii Walker must not see a truncated tree during the walk because of any node |
| * deletion. |
| * iii Existing code assumes ire_bucket is valid if it is non-null and is used |
| * in many places in the code to walk the irb list. Thus even if all the |
| * ires in a bucket have been deleted, we still can't free the radix node |
| * until the ires have actually been inactive'd (freed). |
| * |
| * Tree traversal - Need to hold the global tree lock in read mode. |
 * Before dropping the global tree lock, need to increment the ire_refcnt
| * to ensure that the radix node can't be deleted. |
| * |
| * Tree add - Need to hold the global tree lock in write mode to add a |
| * radix node. To prevent the node from being deleted, increment the |
| * irb_refcnt, after the node is added to the tree. The ire itself is |
| * added later while holding the irb_lock, but not the tree lock. |
| * |
| * Tree delete - Need to hold the global tree lock and irb_lock in write mode. |
| * All associated ires must be inactive (i.e. freed), and irb_refcnt |
| * must be zero. |
| * |
| * Walker - Increment irb_refcnt before calling the walker callback. Hold the |
| * global tree lock (read mode) for traversal. |
| * |
| * IRE dependencies - In some cases we hold ips_ire_dep_lock across ire_refrele |
| * hence we will acquire irb_lock while holding ips_ire_dep_lock. |
| * |
| * IPsec notes : |
| * |
| * IP interacts with the IPsec code (AH/ESP) by storing IPsec attributes |
| * in the ip_xmit_attr_t ip_recv_attr_t. For outbound datagrams, the |
| * ip_xmit_attr_t has the |
| * information used by the IPsec code for applying the right level of |
| * protection. The information initialized by IP in the ip_xmit_attr_t |
| * is determined by the per-socket policy or global policy in the system. |
| * For inbound datagrams, the ip_recv_attr_t |
| * starts out with nothing in it. It gets filled |
| * with the right information if it goes through the AH/ESP code, which |
| * happens if the incoming packet is secure. The information initialized |
| * by AH/ESP, is later used by IP (during fanouts to ULP) to see whether |
| * the policy requirements needed by per-socket policy or global policy |
| * is met or not. |
| * |
| * For fully connected sockets i.e dst, src [addr, port] is known, |
| * conn_policy_cached is set indicating that policy has been cached. |
| * conn_in_enforce_policy may or may not be set depending on whether |
| * there is a global policy match or per-socket policy match. |
 * Policy inheritance happens in ip_policy_set once the destination is known.
| * Once the right policy is set on the conn_t, policy cannot change for |
| * this socket. This makes life simpler for TCP (UDP ?) where |
| * re-transmissions go out with the same policy. For symmetry, policy |
| * is cached for fully connected UDP sockets also. Thus if policy is cached, |
| * it also implies that policy is latched i.e policy cannot change |
| * on these sockets. As we have the right policy on the conn, we don't |
| * have to lookup global policy for every outbound and inbound datagram |
| * and thus serving as an optimization. Note that a global policy change |
| * does not affect fully connected sockets if they have policy. If fully |
| * connected sockets did not have any policy associated with it, global |
| * policy change may affect them. |
| * |
| * IP Flow control notes: |
| * --------------------- |
| * Non-TCP streams are flow controlled by IP. The way this is accomplished |
| * differs when ILL_CAPAB_DLD_DIRECT is enabled for that IP instance. When |
| * ILL_DIRECT_CAPABLE(ill) is TRUE, IP can do direct function calls into |
| * GLDv3. Otherwise packets are sent down to lower layers using STREAMS |
| * functions. |
| * |
| * Per Tx ring udp flow control: |
| * This is applicable only when ILL_CAPAB_DLD_DIRECT capability is set in |
| * the ill (i.e. ILL_DIRECT_CAPABLE(ill) is true). |
| * |
| * The underlying link can expose multiple Tx rings to the GLDv3 mac layer. |
| * To achieve best performance, outgoing traffic need to be fanned out among |
| * these Tx ring. mac_tx() is called (via str_mdata_fastpath_put()) to send |
| * traffic out of the NIC and it takes a fanout hint. UDP connections pass |
| * the address of connp as fanout hint to mac_tx(). Under flow controlled |
| * condition, mac_tx() returns a non-NULL cookie (ip_mac_tx_cookie_t). This |
| * cookie points to a specific Tx ring that is blocked. The cookie is used to |
| * hash into an idl_tx_list[] entry in idl_tx_list[] array. Each idl_tx_list_t |
| * point to drain_lists (idl_t's). These drain list will store the blocked UDP |
| * connp's. The drain list is not a single list but a configurable number of |
| * lists. |
| * |
| * The diagram below shows idl_tx_list_t's and their drain_lists. ip_stack_t |
| * has an array of idl_tx_list_t. The size of the array is TX_FANOUT_SIZE |
| * which is equal to 128. This array in turn contains a pointer to idl_t[], |
| * the ip drain list. The idl_t[] array size is MIN(max_ncpus, 8). The drain |
| * list will point to the list of connp's that are flow controlled. |
| * |
| * --------------- ------- ------- ------- |
| * |->|drain_list[0]|-->|connp|-->|connp|-->|connp|--> |
| * | --------------- ------- ------- ------- |
| * | --------------- ------- ------- ------- |
| * |->|drain_list[1]|-->|connp|-->|connp|-->|connp|--> |
| * ---------------- | --------------- ------- ------- ------- |
| * |idl_tx_list[0]|->| --------------- ------- ------- ------- |
| * ---------------- |->|drain_list[2]|-->|connp|-->|connp|-->|connp|--> |
| * | --------------- ------- ------- ------- |
| * . . . . . |
| * | --------------- ------- ------- ------- |
| * |->|drain_list[n]|-->|connp|-->|connp|-->|connp|--> |
| * --------------- ------- ------- ------- |
| * --------------- ------- ------- ------- |
| * |->|drain_list[0]|-->|connp|-->|connp|-->|connp|--> |
| * | --------------- ------- ------- ------- |
| * | --------------- ------- ------- ------- |
| * ---------------- |->|drain_list[1]|-->|connp|-->|connp|-->|connp|--> |
| * |idl_tx_list[1]|->| --------------- ------- ------- ------- |
| * ---------------- | . . . . |
| * | --------------- ------- ------- ------- |
| * |->|drain_list[n]|-->|connp|-->|connp|-->|connp|--> |
| * --------------- ------- ------- ------- |
| * ..... |
| * ---------------- |
| * |idl_tx_list[n]|-> ... |
| * ---------------- |
| * |
| * When mac_tx() returns a cookie, the cookie is used to hash into a |
| * idl_tx_list in ips_idl_tx_list[] array. Then conn_drain_insert() is |
| * called passing idl_tx_list. The connp gets inserted in a drain list |
| * pointed to by idl_tx_list. conn_drain_list() asserts flow control for |
| * the sockets (non stream based) and sets QFULL condition on the conn_wq |
| * of streams sockets, or the su_txqfull for non-streams sockets. |
| * connp->conn_direct_blocked will be set to indicate the blocked |
| * condition. |
| * |
| * GLDv3 mac layer calls ill_flow_enable() when flow control is relieved. |
| * A cookie is passed in the call to ill_flow_enable() that identifies the |
| * blocked Tx ring. This cookie is used to get to the idl_tx_list that |
| * contains the blocked connp's. conn_walk_drain() uses the idl_tx_list_t |
| * and goes through each conn in the drain list and calls conn_idl_remove |
| * for the conn to clear the qfull condition for the conn, as well as to |
| * remove the conn from the idl list. In addition, streams based sockets |
| * will have the conn_wq enabled, causing ip_wsrv to run for the |
| * conn. ip_wsrv drains the queued messages, and removes the conn from the |
| * drain list, if all messages were drained. It also notifies the |
| * conn_upcalls for the conn to signal that flow-control has opened up. |
| * |
| * In reality the drain list is not a single list, but a configurable number |
| * of lists. conn_walk_drain() in the IP module, notifies the conn_upcalls for |
| * each conn in the list. conn_drain_insert and conn_drain_tail are the only |
| * functions that manipulate this drain list. conn_drain_insert is called in |
| * from the protocol layer when conn_ip_output returns EWOULDBLOCK. |
| * (as opposed to from ip_wsrv context for STREAMS |
| * case -- see below). The synchronization between drain insertion and flow |
| * control wakeup is handled by using idl_txl->txl_lock. |
| * |
| * Flow control using STREAMS: |
| * When ILL_DIRECT_CAPABLE() is not TRUE, STREAMS flow control mechanism |
| * is used. On the send side, if the packet cannot be sent down to the |
| * driver by IP, because of a canput failure, ip_xmit drops the packet |
| * and returns EWOULDBLOCK to the caller, who may then invoke |
| * ixa_check_drain_insert to insert the conn on the 0'th drain list. |
| * When ip_wsrv runs on the ill_wq because flow control has been relieved, the |
 * blocked conns in the 0'th drain list are drained as with the
| * non-STREAMS case. |
| * |
| * In both the STREAMS and non-STREAMS case, the sockfs upcall to set |
| * qfull is done when the conn is inserted into the drain list |
| * (conn_drain_insert()) and cleared when the conn is removed from the drain |
| * list (conn_idl_remove()). |
| * |
| * IPQOS notes: |
| * |
| * IPQoS Policies are applied to packets using IPPF (IP Policy framework) |
| * and IPQoS modules. IPPF includes hooks in IP at different control points |
| * (callout positions) which direct packets to IPQoS modules for policy |
| * processing. Policies, if present, are global. |
| * |
| * The callout positions are located in the following paths: |
| * o local_in (packets destined for this host) |
 * o local_out (packets originating from this host)
| * o fwd_in (packets forwarded by this m/c - inbound) |
| * o fwd_out (packets forwarded by this m/c - outbound) |
| * Hooks at these callout points can be enabled/disabled using the ndd variable |
| * ip_policy_mask (a bit mask with the 4 LSB indicating the callout positions). |
| * By default all the callout positions are enabled. |
| * |
| * Outbound (local_out) |
| * Hooks are placed in ire_send_wire_v4 and ire_send_wire_v6. |
| * |
| * Inbound (local_in) |
| * Hooks are placed in ip_fanout_v4 and ip_fanout_v6. |
| * |
| * Forwarding (in and out) |
| * Hooks are placed in ire_recv_forward_v4/v6. |
| * |
| * IP Policy Framework processing (IPPF processing) |
| * Policy processing for a packet is initiated by ip_process, which ascertains |
| * that the classifier (ipgpc) is loaded and configured, failing which the |
 * packet resumes normal processing in IP. If the classifier is present, the
| * packet is acted upon by one or more IPQoS modules (action instances), per |
| * filters configured in ipgpc and resumes normal IP processing thereafter. |
| * An action instance can drop a packet in course of its processing. |
| * |
| * Zones notes: |
| * |
| * The partitioning rules for networking are as follows: |
| * 1) Packets coming from a zone must have a source address belonging to that |
| * zone. |
| * 2) Packets coming from a zone can only be sent on a physical interface on |
| * which the zone has an IP address. |
| * 3) Between two zones on the same machine, packet delivery is only allowed if |
| * there's a matching route for the destination and zone in the forwarding |
| * table. |
| * 4) The TCP and UDP port spaces are per-zone; that is, two processes in |
| * different zones can bind to the same port with the wildcard address |
| * (INADDR_ANY). |
| * |
| * The granularity of interface partitioning is at the logical interface level. |
| * Therefore, every zone has its own IP addresses, and incoming packets can be |
| * attributed to a zone unambiguously. A logical interface is placed into a zone |
| * using the SIOCSLIFZONE ioctl; this sets the ipif_zoneid field in the ipif_t |
| * structure. Rule (1) is implemented by modifying the source address selection |
| * algorithm so that the list of eligible addresses is filtered based on the |
| * sending process zone. |
| * |
| * The Internet Routing Entries (IREs) are either exclusive to a zone or shared |
| * across all zones, depending on their type. Here is the break-up: |
| * |
| * IRE type Shared/exclusive |
| * -------- ---------------- |
| * IRE_BROADCAST Exclusive |
| * IRE_DEFAULT (default routes) Shared (*) |
| * IRE_LOCAL Exclusive (x) |
| * IRE_LOOPBACK Exclusive |
| * IRE_PREFIX (net routes) Shared (*) |
| * IRE_IF_NORESOLVER (interface routes) Exclusive |
| * IRE_IF_RESOLVER (interface routes) Exclusive |
| * IRE_IF_CLONE (interface routes) Exclusive |
| * IRE_HOST (host routes) Shared (*) |
| * |
| * (*) A zone can only use a default or off-subnet route if the gateway is |
| * directly reachable from the zone, that is, if the gateway's address matches |
| * one of the zone's logical interfaces. |
| * |
| * (x) IRE_LOCAL are handled a bit differently. |
| * When ip_restrict_interzone_loopback is set (the default), |
| * ire_route_recursive restricts loopback using an IRE_LOCAL |
| * between zone to the case when L2 would have conceptually looped the packet |
| * back, i.e. the loopback which is required since neither Ethernet drivers |
| * nor Ethernet hardware loops them back. This is the case when the normal |
| * routes (ignoring IREs with different zoneids) would send out the packet on |
 * the same ill as the ill with which the IRE_LOCAL is associated.
| * |
| * Multiple zones can share a common broadcast address; typically all zones |
| * share the 255.255.255.255 address. Incoming as well as locally originated |
| * broadcast packets must be dispatched to all the zones on the broadcast |
| * network. For directed broadcasts (e.g. 10.16.72.255) this is not trivial |
| * since some zones may not be on the 10.16.72/24 network. To handle this, each |
| * zone has its own set of IRE_BROADCAST entries; then, broadcast packets are |
| * sent to every zone that has an IRE_BROADCAST entry for the destination |
| * address on the input ill, see ip_input_broadcast(). |
| * |
| * Applications in different zones can join the same multicast group address. |
| * The same logic applies for multicast as for broadcast. ip_input_multicast |
| * dispatches packets to all zones that have members on the physical interface. |
| */ |
| |
| /* |
| * Squeue Fanout flags: |
| * 0: No fanout. |
| * 1: Fanout across all squeues |
| */ |
| boolean_t ip_squeue_fanout = 0; |
| |
| /* |
| * Maximum dups allowed per packet. |
| */ |
| uint_t ip_max_frag_dups = 10; |
| |
| /* RFC 1122 Conformance */ |
| #define IP_FORWARD_DEFAULT IP_FORWARD_NEVER |
| |
/* STREAMS open entry point; shared by the IPv4 and IPv6 device nodes. */
static int	ip_open(queue_t *q, dev_t *devp, int flag, int sflag,
		    cred_t *credp, boolean_t isv6);
/* Attaches a link-layer header from the nce for transmit (per name; body not shown). */
static mblk_t	*ip_xmit_attach_llhdr(mblk_t *, nce_t *);

/* Inbound ICMPv4 validation, error fanout, and error/redirect generation. */
static boolean_t icmp_inbound_verify_v4(mblk_t *, icmph_t *, ip_recv_attr_t *);
static void	icmp_inbound_too_big_v4(icmph_t *, ip_recv_attr_t *);
static void	icmp_inbound_error_fanout_v4(mblk_t *, icmph_t *,
		    ip_recv_attr_t *);
static void	icmp_options_update(ipha_t *);
static void	icmp_param_problem(mblk_t *, uint8_t, ip_recv_attr_t *);
static void	icmp_pkt(mblk_t *, void *, size_t, ip_recv_attr_t *);
static mblk_t	*icmp_pkt_err_ok(mblk_t *, ip_recv_attr_t *);
static void	icmp_redirect_v4(mblk_t *mp, ipha_t *, icmph_t *,
		    ip_recv_attr_t *);
static void	icmp_send_redirect(mblk_t *, ipaddr_t, ip_recv_attr_t *);
static void	icmp_send_reply_v4(mblk_t *, ipha_t *, icmph_t *,
		    ip_recv_attr_t *);

/* Misc utilities, ndd parameter plumbing, and STREAMS put/close routines. */
mblk_t		*ip_dlpi_alloc(size_t, t_uscalar_t);
char		*ip_dot_addr(ipaddr_t, char *);
mblk_t		*ip_carve_mp(mblk_t **, ssize_t);
int		ip_close(queue_t *, int);
static char	*ip_dot_saddr(uchar_t *, char *);
static void	ip_lrput(queue_t *, mblk_t *);
ipaddr_t	ip_net_mask(ipaddr_t);
char		*ip_nv_lookup(nv_t *, int);
static int	ip_param_get(queue_t *, mblk_t *, caddr_t, cred_t *);
static int	ip_param_generic_get(queue_t *, mblk_t *, caddr_t, cred_t *);
static boolean_t ip_param_register(IDP *ndp, ipparam_t *, size_t,
    ipndp_t *, size_t);
static int	ip_param_set(queue_t *, mblk_t *, char *, caddr_t, cred_t *);
void		ip_rput(queue_t *, mblk_t *);
static void	ip_rput_dlpi_writer(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp,
		    void *dummy_arg);

/* SNMP/MIB2 retrieval: one helper per MIB table; see struct iproutedata_s. */
int		ip_snmp_get(queue_t *, mblk_t *, int);
static mblk_t	*ip_snmp_get_mib2_ip(queue_t *, mblk_t *,
		    mib2_ipIfStatsEntry_t *, ip_stack_t *);
static mblk_t	*ip_snmp_get_mib2_ip_traffic_stats(queue_t *, mblk_t *,
		    ip_stack_t *);
static mblk_t	*ip_snmp_get_mib2_ip6(queue_t *, mblk_t *, ip_stack_t *);
static mblk_t	*ip_snmp_get_mib2_icmp(queue_t *, mblk_t *, ip_stack_t *ipst);
static mblk_t	*ip_snmp_get_mib2_icmp6(queue_t *, mblk_t *, ip_stack_t *ipst);
static mblk_t	*ip_snmp_get_mib2_igmp(queue_t *, mblk_t *, ip_stack_t *ipst);
static mblk_t	*ip_snmp_get_mib2_multi(queue_t *, mblk_t *, ip_stack_t *ipst);
static mblk_t	*ip_snmp_get_mib2_ip_addr(queue_t *, mblk_t *,
		    ip_stack_t *ipst);
static mblk_t	*ip_snmp_get_mib2_ip6_addr(queue_t *, mblk_t *,
		    ip_stack_t *ipst);
static mblk_t	*ip_snmp_get_mib2_ip_group_src(queue_t *, mblk_t *,
		    ip_stack_t *ipst);
static mblk_t	*ip_snmp_get_mib2_ip6_group_src(queue_t *, mblk_t *,
		    ip_stack_t *ipst);
static mblk_t	*ip_snmp_get_mib2_ip_group_mem(queue_t *, mblk_t *,
		    ip_stack_t *ipst);
static mblk_t	*ip_snmp_get_mib2_ip6_group_mem(queue_t *, mblk_t *,
		    ip_stack_t *ipst);
static mblk_t	*ip_snmp_get_mib2_virt_multi(queue_t *, mblk_t *,
		    ip_stack_t *ipst);
static mblk_t	*ip_snmp_get_mib2_multi_rtable(queue_t *, mblk_t *,
		    ip_stack_t *ipst);
static mblk_t	*ip_snmp_get_mib2_ip_route_media(queue_t *, mblk_t *, int,
		    ip_stack_t *ipst);
static mblk_t	*ip_snmp_get_mib2_ip6_route_media(queue_t *, mblk_t *, int,
		    ip_stack_t *ipst);
static void	ip_snmp_get2_v4(ire_t *, iproutedata_t *);
static void	ip_snmp_get2_v6_route(ire_t *, iproutedata_t *);
static int	ip_snmp_get2_v4_media(ncec_t *, iproutedata_t *);
static int	ip_snmp_get2_v6_media(ncec_t *, iproutedata_t *);
int		ip_snmp_set(queue_t *, int, int, uchar_t *, int);

/* Fragmentation helper. */
static mblk_t	*ip_fragment_copyhdr(uchar_t *, int, int, ip_stack_t *,
		    mblk_t *);

/* Flow-control drain lists; see the "IP Flow control notes" comment above. */
static void	conn_drain_init(ip_stack_t *);
static void	conn_drain_fini(ip_stack_t *);
static void	conn_drain_tail(conn_t *connp, boolean_t closing);

static void	conn_walk_drain(ip_stack_t *, idl_tx_list_t *);
static void	conn_walk_sctp(pfv_t, void *, zoneid_t, netstack_t *);

/* Per-netstack (per-zone/stack instance) init/shutdown/fini callbacks. */
static void	*ip_stack_init(netstackid_t stackid, netstack_t *ns);
static void	ip_stack_shutdown(netstackid_t stackid, void *arg);
static void	ip_stack_fini(netstackid_t stackid, void *arg);

/* ndd set handler for IP forwarding. */
static int	ip_forward_set(queue_t *, mblk_t *, char *, caddr_t, cred_t *);

/* Applies a multicast membership operation across multirt routes. */
static int	ip_multirt_apply_membership(int (*fn)(conn_t *, boolean_t,
    const in6_addr_t *, ipaddr_t, uint_t, mcast_record_t, const in6_addr_t *),
    ire_t *, conn_t *, boolean_t, const in6_addr_t *, mcast_record_t,
    const in6_addr_t *);

/* ndd get/set handlers for the CGTP filter module hook. */
static int	ip_cgtp_filter_get(queue_t *, mblk_t *, caddr_t, cred_t *);
static int	ip_cgtp_filter_set(queue_t *, mblk_t *, char *,
    caddr_t, cred_t *);
| static int ip_input_proc_set(queue_t *q, mblk_t *mp, char *value, |
| caddr_t cp, cred_t *cr); |
| static int ip_int_set(queue_t *, mblk_t *, char *, caddr_t, |
| cred_t *); |
| static int ip_squeue_switch(int); |
| |
| static void *ip_kstat_init(netstackid_t, ip_stack_t *); |
| static void ip_kstat_fini(netstackid_t, kstat_t *); |
| static int ip_kstat_update(kstat_t *kp, int rw); |
| static void *icmp_kstat_init(netstackid_t); |
| static void icmp_kstat_fini(netstackid_t, kstat_t *); |
| static int icmp_kstat_update(kstat_t *kp, int rw); |
| static void *ip_kstat2_init(netstackid_t, ip_stat_t *); |
| static void ip_kstat2_fini(netstackid_t, kstat_t *); |
| |
| static void ipobs_init(ip_stack_t *); |
| static void ipobs_fini(ip_stack_t *); |
| |
/* The all-ones IPv4 address/netmask (255.255.255.255). */
ipaddr_t ip_g_all_ones = IP_HOST_MASK;

/* How long, in seconds, we allow frags to hang around. */
#define	IP_FRAG_TIMEOUT		15
#define	IPV6_FRAG_TIMEOUT	60

/* Counter of pullups on the read side — presumably bumped in ip_rput; confirm at use sites. */
static long ip_rput_pullups;
int dohwcksum = 1;	/* use h/w cksum if supported by the hardware */

vmem_t *ip_minor_arena_sa; /* for minor nos. from INET_MIN_DEV+2 thru 2^^18-1 */
vmem_t *ip_minor_arena_la; /* for minor nos. from 2^^18 thru 2^^32-1 */

/* Debug level; exported as the "ip_debug" ndd variable (see lcl_ndp_arr). */
int ip_debug;

/*
 * Multirouting/CGTP stuff
 */
int ip_cgtp_filter_rev = CGTP_FILTER_REV;	/* CGTP hooks version */
| |
| /* |
| * Named Dispatch Parameter Table. |
| * All of these are alterable, within the min/max values given, at run time. |
| */ |
/*
 * Per-entry {min, max, default, name}; alterable at run time within the
 * given min/max bounds (see the Named Dispatch comment above).
 */
static ipparam_t	lcl_param_arr[] = {
	/* min	max	value	name */
	{ 0, 1, 0, "ip_respond_to_address_mask_broadcast"},
	{ 0, 1, 1, "ip_respond_to_echo_broadcast"},
	{ 0, 1, 1, "ip_respond_to_echo_multicast"},
	{ 0, 1, 0, "ip_respond_to_timestamp"},
	{ 0, 1, 0, "ip_respond_to_timestamp_broadcast"},
	{ 0, 1, 1, "ip_send_redirects"},
	{ 0, 1, 0, "ip_forward_directed_broadcasts"},
	{ 0, 10, 0, "ip_mrtdebug"},
	{ 1, 8, 3, "ip_ire_reclaim_fraction" },
	{ 1, 8, 3, "ip_nce_reclaim_fraction" },
	{ 1, 8, 3, "ip_dce_reclaim_fraction" },
	{ 1, 255, 255, "ip_def_ttl" },
	{ 0, 1, 0, "ip_forward_src_routed"},
	{ 0, 256, 32, "ip_wroff_extra" },
	{ 2, 999999999, 60*20, "ip_pathmtu_interval" },	/* In seconds */
	{ 8, 65536, 64, "ip_icmp_return_data_bytes" },
	{ 0, 1, 1, "ip_path_mtu_discovery" },
	{ 68, 65535, 576, "ip_pmtu_min" },
	{ 0, 1, 0, "ip_ignore_redirect" },
	{ 0, 1, 0, "ip_arp_icmp_error" },
	{ 1, 254, 1, "ip_broadcast_ttl" },
	{ 0, 99999, 100, "ip_icmp_err_interval" },
	{ 1, 99999, 10, "ip_icmp_err_burst" },
	{ 0, 999999999, 1000000, "ip_reass_queue_bytes" },
	/*
	 * See comments for ip_strict_src_multihoming for an explanation
	 * of the semantics of ip_strict_dst_multihoming
	 */
	{ 0, 1, 0, "ip_strict_dst_multihoming" },
	{ 1, MAX_ADDRS_PER_IF, 256, "ip_addrs_per_if"},
	{ 0, 1, 0, "ipsec_override_persocket_policy" },
	{ 0, 1, 1, "icmp_accept_clear_messages" },
	{ 0, 1, 1, "igmp_accept_clear_messages" },
	{ 2, 999999999, ND_DELAY_FIRST_PROBE_TIME,
	    "ip_ndp_delay_first_probe_time"},
	{ 1, 999999999, ND_MAX_UNICAST_SOLICIT,
	    "ip_ndp_max_unicast_solicit"},
	{ 1, 255, IPV6_MAX_HOPS, "ip6_def_hops" },
	{ 8, IPV6_MIN_MTU, IPV6_MIN_MTU, "ip6_icmp_return_data_bytes" },
	{ 0, 1, 0, "ip6_forward_src_routed"},
	{ 0, 1, 1, "ip6_respond_to_echo_multicast"},
	{ 0, 1, 1, "ip6_send_redirects"},
	{ 0, 1, 0, "ip6_ignore_redirect" },
	/*
	 * See comments for ip6_strict_src_multihoming for an explanation
	 * of the semantics of ip6_strict_dst_multihoming
	 */
	{ 0, 1, 0, "ip6_strict_dst_multihoming" },

	{ 0, 2, 2, "ip_src_check" },

	{ 0, 999999, 1000, "ipsec_policy_log_interval" },

	{ 0, 1, 1, "pim_accept_clear_messages" },
	{ 1000, 20000, 2000, "ip_ndp_unsolicit_interval" },
	{ 1, 20, 3, "ip_ndp_unsolicit_count" },
	{ 0, 1, 1, "ip6_ignore_home_address_opt" },
	{ 0, 15, 0, "ip_policy_mask" },
	{ 0, 2, 2, "ip_ecmp_behavior" },
	{ 0, 255, 1, "ip_multirt_ttl" },
	{ 0, 3600, 60, "ip_ire_badcnt_lifetime" },	/* In seconds */
	{ 0, 999999, 60*60*24, "ip_max_temp_idle" },
	{ 0, 1000, 1, "ip_max_temp_defend" },
	/*
	 * when a conflict of an active address is detected,
	 * defend up to ip_max_defend times, within any
	 * ip_defend_interval span.
	 */
	{ 0, 1000, 3, "ip_max_defend" },
	{ 0, 999999, 30, "ip_defend_interval" },
	{ 0, 3600000, 300000, "ip_dup_recovery" },
	{ 0, 1, 1, "ip_restrict_interzone_loopback" },
	{ 0, 1, 1, "ip_lso_outbound" },
	{ IGMP_V1_ROUTER, IGMP_V3_ROUTER, IGMP_V3_ROUTER, "igmp_max_version" },
	{ MLD_V1_ROUTER, MLD_V2_ROUTER, MLD_V2_ROUTER, "mld_max_version" },
#ifdef DEBUG
	{ 0, 1, 0, "ip6_drop_inbound_icmpv6" },
#else
	{ 0, 0, 0, "" },
#endif
	/* delay before sending first probe: */
	{ 0, 20000, 1000, "arp_probe_delay" },
	{ 0, 20000, 100, "arp_fastprobe_delay" },
	/* interval at which DAD probes are sent: */
	{ 10, 20000, 1500, "arp_probe_interval" },
	{ 10, 20000, 150, "arp_fastprobe_interval" },
	/* setting probe count to 0 will disable ARP probing for DAD. */
	{ 0, 20, 3, "arp_probe_count" },
	{ 0, 20, 3, "arp_fastprobe_count" },

	{ 0, 3600000, 15000, "ipv4_dad_announce_interval"},
	{ 0, 3600000, 15000, "ipv6_dad_announce_interval"},
	/*
	 * Rate limiting parameters for DAD defense used in
	 * ill_defend_rate_limit():
	 * defend_rate : pkts/hour permitted
	 * defend_interval : time that can elapse before we send out a
	 *			DAD defense.
	 * defend_period: denominator for defend_rate (in seconds).
	 */
	{ 0, 3600000, 300000, "arp_defend_interval"},
	{ 0, 20000, 100, "arp_defend_rate"},
	{ 0, 3600000, 300000, "ndp_defend_interval"},
	{ 0, 20000, 100, "ndp_defend_rate"},
	{ 5, 86400, 3600, "arp_defend_period"},
	{ 5, 86400, 3600, "ndp_defend_period"},
	{ 0, 1, 1, "ipv4_icmp_return_pmtu" },
	{ 0, 1, 1, "ipv6_icmp_return_pmtu" },
	/*
	 * publish count/interval values used to announce local addresses
	 * for IPv4, IPv6.
	 */
	{ 1, 20, 5, "ip_arp_publish_count" },
	{ 1000, 20000, 2000, "ip_arp_publish_interval" },
	/*
	 * The ip*strict_src_multihoming and ip*strict_dst_multihoming provide
	 * a range of choices for setting strong/weak/preferred end-system
	 * behavior. The semantics for setting these are:
	 *
	 * ip*_strict_dst_multihoming = 0
	 *	weak end system model for managing ip destination addresses.
	 *	A packet with IP dst D1 that's received on interface I1 will be
	 *	accepted as long as D1 is one of the local addresses on
	 *	the machine, even if D1 is not configured on I1.
	 * ip*strict_dst_multihoming = 1
	 *	strong end system model for managing ip destination addresses.
	 *	A packet with IP dst D1 that's received on interface I1 will be
	 *	accepted if, and only if, D1 is configured on I1.
	 *
	 * ip*strict_src_multihoming = 0
	 *	Source agnostic route selection for outgoing packets: the
	 *	outgoing interface for a packet will be computed using
	 *	default algorithms for route selection, where the route
	 *	with the longest matching prefix is chosen for the output
	 *	unless other route selection constraints are explicitly
	 *	specified during routing table lookup.  This may result
	 *	in packet being sent out on interface I2 with source
	 *	address S1, even though S1 is not a configured address on I2.
	 * ip*strict_src_multihoming = 1
	 *	Preferred source aware route selection for outgoing packets: for
	 *	a packet with source S2, destination D2, the route selection
	 *	algorithm will first attempt to find a route for the destination
	 *	that goes out through an interface where S2 is
	 *	configured. If such a route cannot be found, then the
	 *	best-matching route for D2 will be selected.
	 * ip*strict_src_multihoming = 2
	 *	Source aware route selection for outgoing packets: a packet will
	 *	be sent out on an interface I2 only if the src address S2 of the
	 *	packet is a configured address on I2. In conjunction with
	 *	the setting 'ip_strict_dst_multihoming == 1', this will result in
	 *	the implementation of Strong ES as defined in Section 3.3.4.2 of
	 *	RFC 1122
	 */
	{ 0, 2, 0, "ip_strict_src_multihoming" },
	{ 0, 2, 0, "ip6_strict_src_multihoming" }
};
| |
| /* |
| * Extended NDP table |
| * The addresses for the first two are filled in to be ips_ip_g_forward |
| * and ips_ipv6_forward at init time. |
| */ |
static ipndp_t lcl_ndp_arr[] = {
	/* getf			setf		data	name */
	/*
	 * The NULL data pointers of the first two entries are patched at
	 * init time to point at ips_ip_g_forward / ips_ipv6_forward (see
	 * the comment above this table).
	 */
#define	IPNDP_IP_FORWARDING_OFFSET	0
	{ ip_param_generic_get,	ip_forward_set,	NULL,
	    "ip_forwarding" },
#define	IPNDP_IP6_FORWARDING_OFFSET	1
	{ ip_param_generic_get,	ip_forward_set,	NULL,
	    "ip6_forwarding" },
	{ ip_param_generic_get, ip_input_proc_set,
	    (caddr_t)&ip_squeue_enter, "ip_squeue_enter" },
	{ ip_param_generic_get, ip_int_set,
	    (caddr_t)&ip_squeue_fanout, "ip_squeue_fanout" },
#define	IPNDP_CGTP_FILTER_OFFSET	4
	{ ip_cgtp_filter_get, ip_cgtp_filter_set, NULL,
	    "ip_cgtp_filter" },
	{ ip_param_generic_get, ip_int_set, (caddr_t)&ip_debug,
	    "ip_debug" },
};
| |
| /* |
| * Table of IP ioctls encoding the various properties of the ioctl and |
| * indexed based on the last byte of the ioctl command. Occasionally there |
| * is a clash, and there is more than 1 ioctl with the same last byte. |
| * In such a case 1 ioctl is encoded in the ndx table and the remaining |
| * ioctls are encoded in the misc table. An entry in the ndx table is |
| * retrieved by indexing on the last byte of the ioctl command and comparing |
| * the ioctl command with the value in the ndx table. In the event of a |
| * mismatch the misc table is then searched sequentially for the desired |
| * ioctl command. |
| * |
| * Entry: <command> <copyin_size> <flags> <cmd_type> <function> <restart_func> |
| */ |
| ip_ioctl_cmd_t ip_ndx_ioctl_table[] = { |
| /* 000 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 001 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 002 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 003 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 004 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 005 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 006 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 007 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 008 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 009 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| |
| /* 010 */ { SIOCADDRT, sizeof (struct rtentry), IPI_PRIV, |
| MISC_CMD, ip_siocaddrt, NULL }, |
| /* 011 */ { SIOCDELRT, sizeof (struct rtentry), IPI_PRIV, |
| MISC_CMD, ip_siocdelrt, NULL }, |
| |
| /* 012 */ { SIOCSIFADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR, |
| IF_CMD, ip_sioctl_addr, ip_sioctl_addr_restart }, |
| /* 013 */ { SIOCGIFADDR, sizeof (struct ifreq), IPI_GET_CMD, |
| IF_CMD, ip_sioctl_get_addr, NULL }, |
| |
| /* 014 */ { SIOCSIFDSTADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR, |
| IF_CMD, ip_sioctl_dstaddr, ip_sioctl_dstaddr_restart }, |
| /* 015 */ { SIOCGIFDSTADDR, sizeof (struct ifreq), |
| IPI_GET_CMD, IF_CMD, ip_sioctl_get_dstaddr, NULL }, |
| |
| /* 016 */ { SIOCSIFFLAGS, sizeof (struct ifreq), |
| IPI_PRIV | IPI_WR, |
| IF_CMD, ip_sioctl_flags, ip_sioctl_flags_restart }, |
| /* 017 */ { SIOCGIFFLAGS, sizeof (struct ifreq), |
| IPI_MODOK | IPI_GET_CMD, |
| IF_CMD, ip_sioctl_get_flags, NULL }, |
| |
| /* 018 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 019 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| |
| /* copyin size cannot be coded for SIOCGIFCONF */ |
| /* 020 */ { O_SIOCGIFCONF, 0, IPI_GET_CMD, |
| MISC_CMD, ip_sioctl_get_ifconf, NULL }, |
| |
| /* 021 */ { SIOCSIFMTU, sizeof (struct ifreq), IPI_PRIV | IPI_WR, |
| IF_CMD, ip_sioctl_mtu, NULL }, |
| /* 022 */ { SIOCGIFMTU, sizeof (struct ifreq), IPI_GET_CMD, |
| IF_CMD, ip_sioctl_get_mtu, NULL }, |
| /* 023 */ { SIOCGIFBRDADDR, sizeof (struct ifreq), |
| IPI_GET_CMD, IF_CMD, ip_sioctl_get_brdaddr, NULL }, |
| /* 024 */ { SIOCSIFBRDADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR, |
| IF_CMD, ip_sioctl_brdaddr, NULL }, |
| /* 025 */ { SIOCGIFNETMASK, sizeof (struct ifreq), |
| IPI_GET_CMD, IF_CMD, ip_sioctl_get_netmask, NULL }, |
| /* 026 */ { SIOCSIFNETMASK, sizeof (struct ifreq), IPI_PRIV | IPI_WR, |
| IF_CMD, ip_sioctl_netmask, ip_sioctl_netmask_restart }, |
| /* 027 */ { SIOCGIFMETRIC, sizeof (struct ifreq), |
| IPI_GET_CMD, IF_CMD, ip_sioctl_get_metric, NULL }, |
| /* 028 */ { SIOCSIFMETRIC, sizeof (struct ifreq), IPI_PRIV, |
| IF_CMD, ip_sioctl_metric, NULL }, |
| /* 029 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| |
| /* See 166-168 below for extended SIOC*XARP ioctls */ |
| /* 030 */ { SIOCSARP, sizeof (struct arpreq), IPI_PRIV | IPI_WR, |
| ARP_CMD, ip_sioctl_arp, NULL }, |
| /* 031 */ { SIOCGARP, sizeof (struct arpreq), IPI_GET_CMD, |
| ARP_CMD, ip_sioctl_arp, NULL }, |
| /* 032 */ { SIOCDARP, sizeof (struct arpreq), IPI_PRIV | IPI_WR, |
| ARP_CMD, ip_sioctl_arp, NULL }, |
| |
| /* 033 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 034 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 035 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 036 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 037 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 038 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 039 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 040 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 041 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 042 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 043 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 044 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 045 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 046 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 047 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 048 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 049 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 050 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 051 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 052 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 053 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| |
| /* 054 */ { IF_UNITSEL, sizeof (int), IPI_PRIV | IPI_WR | IPI_MODOK, |
| MISC_CMD, if_unitsel, if_unitsel_restart }, |
| |
| /* 055 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 056 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 057 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 058 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 059 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 060 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 061 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 062 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 063 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 064 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 065 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 066 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 067 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 068 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 069 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 070 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 071 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 072 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| |
| /* 073 */ { SIOCSIFNAME, sizeof (struct ifreq), |
| IPI_PRIV | IPI_WR | IPI_MODOK, |
| IF_CMD, ip_sioctl_sifname, NULL }, |
| |
| /* 074 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 075 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 076 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 077 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 078 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 079 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 080 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 081 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 082 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 083 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 084 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 085 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 086 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| |
| /* 087 */ { SIOCGIFNUM, sizeof (int), IPI_GET_CMD, |
| MISC_CMD, ip_sioctl_get_ifnum, NULL }, |
| /* 088 */ { SIOCGIFMUXID, sizeof (struct ifreq), IPI_GET_CMD, |
| IF_CMD, ip_sioctl_get_muxid, NULL }, |
| /* 089 */ { SIOCSIFMUXID, sizeof (struct ifreq), |
| IPI_PRIV | IPI_WR, IF_CMD, ip_sioctl_muxid, NULL }, |
| |
| /* Both if and lif variants share same func */ |
| /* 090 */ { SIOCGIFINDEX, sizeof (struct ifreq), IPI_GET_CMD, |
| IF_CMD, ip_sioctl_get_lifindex, NULL }, |
| /* Both if and lif variants share same func */ |
| /* 091 */ { SIOCSIFINDEX, sizeof (struct ifreq), |
| IPI_PRIV | IPI_WR, IF_CMD, ip_sioctl_slifindex, NULL }, |
| |
| /* copyin size cannot be coded for SIOCGIFCONF */ |
| /* 092 */ { SIOCGIFCONF, 0, IPI_GET_CMD, |
| MISC_CMD, ip_sioctl_get_ifconf, NULL }, |
| /* 093 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 094 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 095 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 096 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 097 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 098 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 099 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 100 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 101 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 102 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 103 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 104 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 105 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 106 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 107 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 108 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 109 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| |
| /* 110 */ { SIOCLIFREMOVEIF, sizeof (struct lifreq), |
| IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_removeif, |
| ip_sioctl_removeif_restart }, |
| /* 111 */ { SIOCLIFADDIF, sizeof (struct lifreq), |
| IPI_GET_CMD | IPI_PRIV | IPI_WR, |
| LIF_CMD, ip_sioctl_addif, NULL }, |
| #define SIOCLIFADDR_NDX 112 |
| /* 112 */ { SIOCSLIFADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR, |
| LIF_CMD, ip_sioctl_addr, ip_sioctl_addr_restart }, |
| /* 113 */ { SIOCGLIFADDR, sizeof (struct lifreq), |
| IPI_GET_CMD, LIF_CMD, ip_sioctl_get_addr, NULL }, |
| /* 114 */ { SIOCSLIFDSTADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR, |
| LIF_CMD, ip_sioctl_dstaddr, ip_sioctl_dstaddr_restart }, |
| /* 115 */ { SIOCGLIFDSTADDR, sizeof (struct lifreq), |
| IPI_GET_CMD, LIF_CMD, ip_sioctl_get_dstaddr, NULL }, |
| /* 116 */ { SIOCSLIFFLAGS, sizeof (struct lifreq), |
| IPI_PRIV | IPI_WR, |
| LIF_CMD, ip_sioctl_flags, ip_sioctl_flags_restart }, |
| /* 117 */ { SIOCGLIFFLAGS, sizeof (struct lifreq), |
| IPI_GET_CMD | IPI_MODOK, |
| LIF_CMD, ip_sioctl_get_flags, NULL }, |
| |
| /* 118 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 119 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| |
| /* 120 */ { O_SIOCGLIFCONF, 0, IPI_GET_CMD, MISC_CMD, |
| ip_sioctl_get_lifconf, NULL }, |
| /* 121 */ { SIOCSLIFMTU, sizeof (struct lifreq), IPI_PRIV | IPI_WR, |
| LIF_CMD, ip_sioctl_mtu, NULL }, |
| /* 122 */ { SIOCGLIFMTU, sizeof (struct lifreq), IPI_GET_CMD, |
| LIF_CMD, ip_sioctl_get_mtu, NULL }, |
| /* 123 */ { SIOCGLIFBRDADDR, sizeof (struct lifreq), |
| IPI_GET_CMD, LIF_CMD, ip_sioctl_get_brdaddr, NULL }, |
| /* 124 */ { SIOCSLIFBRDADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR, |
| LIF_CMD, ip_sioctl_brdaddr, NULL }, |
| /* 125 */ { SIOCGLIFNETMASK, sizeof (struct lifreq), |
| IPI_GET_CMD, LIF_CMD, ip_sioctl_get_netmask, NULL }, |
| /* 126 */ { SIOCSLIFNETMASK, sizeof (struct lifreq), IPI_PRIV | IPI_WR, |
| LIF_CMD, ip_sioctl_netmask, ip_sioctl_netmask_restart }, |
| /* 127 */ { SIOCGLIFMETRIC, sizeof (struct lifreq), |
| IPI_GET_CMD, LIF_CMD, ip_sioctl_get_metric, NULL }, |
| /* 128 */ { SIOCSLIFMETRIC, sizeof (struct lifreq), IPI_PRIV | IPI_WR, |
| LIF_CMD, ip_sioctl_metric, NULL }, |
| /* 129 */ { SIOCSLIFNAME, sizeof (struct lifreq), |
| IPI_PRIV | IPI_WR | IPI_MODOK, |
| LIF_CMD, ip_sioctl_slifname, |
| ip_sioctl_slifname_restart }, |
| |
| /* 130 */ { SIOCGLIFNUM, sizeof (struct lifnum), IPI_GET_CMD, |
| MISC_CMD, ip_sioctl_get_lifnum, NULL }, |
| /* 131 */ { SIOCGLIFMUXID, sizeof (struct lifreq), |
| IPI_GET_CMD, LIF_CMD, ip_sioctl_get_muxid, NULL }, |
| /* 132 */ { SIOCSLIFMUXID, sizeof (struct lifreq), |
| IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_muxid, NULL }, |
| /* 133 */ { SIOCGLIFINDEX, sizeof (struct lifreq), |
| IPI_GET_CMD, LIF_CMD, ip_sioctl_get_lifindex, 0 }, |
| /* 134 */ { SIOCSLIFINDEX, sizeof (struct lifreq), |
| IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_slifindex, 0 }, |
| /* 135 */ { SIOCSLIFTOKEN, sizeof (struct lifreq), IPI_PRIV | IPI_WR, |
| LIF_CMD, ip_sioctl_token, NULL }, |
| /* 136 */ { SIOCGLIFTOKEN, sizeof (struct lifreq), |
| IPI_GET_CMD, LIF_CMD, ip_sioctl_get_token, NULL }, |
| /* 137 */ { SIOCSLIFSUBNET, sizeof (struct lifreq), IPI_PRIV | IPI_WR, |
| LIF_CMD, ip_sioctl_subnet, ip_sioctl_subnet_restart }, |
| /* 138 */ { SIOCGLIFSUBNET, sizeof (struct lifreq), |
| IPI_GET_CMD, LIF_CMD, ip_sioctl_get_subnet, NULL }, |
| /* 139 */ { SIOCSLIFLNKINFO, sizeof (struct lifreq), IPI_PRIV | IPI_WR, |
| LIF_CMD, ip_sioctl_lnkinfo, NULL }, |
| |
| /* 140 */ { SIOCGLIFLNKINFO, sizeof (struct lifreq), |
| IPI_GET_CMD, LIF_CMD, ip_sioctl_get_lnkinfo, NULL }, |
| /* 141 */ { SIOCLIFDELND, sizeof (struct lifreq), IPI_PRIV, |
| LIF_CMD, ip_siocdelndp_v6, NULL }, |
| /* 142 */ { SIOCLIFGETND, sizeof (struct lifreq), IPI_GET_CMD, |
| LIF_CMD, ip_siocqueryndp_v6, NULL }, |
| /* 143 */ { SIOCLIFSETND, sizeof (struct lifreq), IPI_PRIV, |
| LIF_CMD, ip_siocsetndp_v6, NULL }, |
| /* 144 */ { SIOCTMYADDR, sizeof (struct sioc_addrreq), IPI_GET_CMD, |
| MISC_CMD, ip_sioctl_tmyaddr, NULL }, |
| /* 145 */ { SIOCTONLINK, sizeof (struct sioc_addrreq), IPI_GET_CMD, |
| MISC_CMD, ip_sioctl_tonlink, NULL }, |
| /* 146 */ { SIOCTMYSITE, sizeof (struct sioc_addrreq), 0, |
| MISC_CMD, ip_sioctl_tmysite, NULL }, |
| /* 147 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 148 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* IPSECioctls handled in ip_sioctl_copyin_setup itself */ |
| /* 149 */ { SIOCFIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL }, |
| /* 150 */ { SIOCSIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL }, |
| /* 151 */ { SIOCDIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL }, |
| /* 152 */ { SIOCLIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL }, |
| |
| /* 153 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| |
| /* 154 */ { SIOCGLIFBINDING, sizeof (struct lifreq), IPI_GET_CMD, |
| LIF_CMD, ip_sioctl_get_binding, NULL }, |
| /* 155 */ { SIOCSLIFGROUPNAME, sizeof (struct lifreq), |
| IPI_PRIV | IPI_WR, |
| LIF_CMD, ip_sioctl_groupname, ip_sioctl_groupname }, |
| /* 156 */ { SIOCGLIFGROUPNAME, sizeof (struct lifreq), |
| IPI_GET_CMD, LIF_CMD, ip_sioctl_get_groupname, NULL }, |
| /* 157 */ { SIOCGLIFGROUPINFO, sizeof (lifgroupinfo_t), |
| IPI_GET_CMD, MISC_CMD, ip_sioctl_groupinfo, NULL }, |
| |
| /* Leave 158-160 unused; used to be SIOC*IFARP ioctls */ |
| /* 158 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 159 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 160 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| |
| /* 161 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| |
| /* These are handled in ip_sioctl_copyin_setup itself */ |
| /* 162 */ { SIOCGIP6ADDRPOLICY, 0, IPI_NULL_BCONT, |
| MISC_CMD, NULL, NULL }, |
| /* 163 */ { SIOCSIP6ADDRPOLICY, 0, IPI_PRIV | IPI_NULL_BCONT, |
| MISC_CMD, NULL, NULL }, |
| /* 164 */ { SIOCGDSTINFO, 0, IPI_GET_CMD, MISC_CMD, NULL, NULL }, |
| |
| /* 165 */ { SIOCGLIFCONF, 0, IPI_GET_CMD, MISC_CMD, |
| ip_sioctl_get_lifconf, NULL }, |
| |
| /* 166 */ { SIOCSXARP, sizeof (struct xarpreq), IPI_PRIV | IPI_WR, |
| XARP_CMD, ip_sioctl_arp, NULL }, |
| /* 167 */ { SIOCGXARP, sizeof (struct xarpreq), IPI_GET_CMD, |
| XARP_CMD, ip_sioctl_arp, NULL }, |
| /* 168 */ { SIOCDXARP, sizeof (struct xarpreq), IPI_PRIV | IPI_WR, |
| XARP_CMD, ip_sioctl_arp, NULL }, |
| |
| /* SIOCPOPSOCKFS is not handled by IP */ |
| /* 169 */ { IPI_DONTCARE /* SIOCPOPSOCKFS */, 0, 0, 0, NULL, NULL }, |
| |
| /* 170 */ { SIOCGLIFZONE, sizeof (struct lifreq), |
| IPI_GET_CMD, LIF_CMD, ip_sioctl_get_lifzone, NULL }, |
| /* 171 */ { SIOCSLIFZONE, sizeof (struct lifreq), |
| IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_slifzone, |
| ip_sioctl_slifzone_restart }, |
| /* 172-174 are SCTP ioctls and not handled by IP */ |
| /* 172 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 173 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 174 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* 175 */ { SIOCGLIFUSESRC, sizeof (struct lifreq), |
| IPI_GET_CMD, LIF_CMD, |
| ip_sioctl_get_lifusesrc, 0 }, |
| /* 176 */ { SIOCSLIFUSESRC, sizeof (struct lifreq), |
| IPI_PRIV | IPI_WR, |
| LIF_CMD, ip_sioctl_slifusesrc, |
| NULL }, |
| /* 177 */ { SIOCGLIFSRCOF, 0, IPI_GET_CMD, MISC_CMD, |
| ip_sioctl_get_lifsrcof, NULL }, |
| /* 178 */ { SIOCGMSFILTER, sizeof (struct group_filter), IPI_GET_CMD, |
| MSFILT_CMD, ip_sioctl_msfilter, NULL }, |
| /* 179 */ { SIOCSMSFILTER, sizeof (struct group_filter), 0, |
| MSFILT_CMD, ip_sioctl_msfilter, NULL }, |
| /* 180 */ { SIOCGIPMSFILTER, sizeof (struct ip_msfilter), IPI_GET_CMD, |
| MSFILT_CMD, ip_sioctl_msfilter, NULL }, |
| /* 181 */ { SIOCSIPMSFILTER, sizeof (struct ip_msfilter), 0, |
| MSFILT_CMD, ip_sioctl_msfilter, NULL }, |
| /* 182 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, |
| /* SIOCSENABLESDP is handled by SDP */ |
| /* 183 */ { IPI_DONTCARE /* SIOCSENABLESDP */, 0, 0, 0, NULL, NULL }, |
| /* 184 */ { IPI_DONTCARE /* SIOCSQPTR */, 0, 0, 0, NULL, NULL }, |
| /* 185 */ { IPI_DONTCARE /* SIOCGIFHWADDR */, 0, 0, 0, NULL, NULL }, |
| /* 186 */ { IPI_DONTCARE /* SIOCGSTAMP */, 0, 0, 0, NULL, NULL }, |
| /* 187 */ { SIOCILB, 0, IPI_PRIV | IPI_GET_CMD, MISC_CMD, |
| ip_sioctl_ilb_cmd, NULL }, |
| }; |
| |
| int ip_ndx_ioctl_count = sizeof (ip_ndx_ioctl_table) / sizeof (ip_ioctl_cmd_t); |
| |
/*
 * Overflow table for ioctls whose low command byte clashes with an entry
 * already in ip_ndx_ioctl_table; searched sequentially (see comment above
 * ip_ndx_ioctl_table).  Entries with five initializers leave the restart
 * function implicitly zero (no restart handler).
 */
ip_ioctl_cmd_t ip_misc_ioctl_table[] = {
	{ I_LINK,	0, IPI_PRIV | IPI_WR, 0, NULL, NULL },
	{ I_UNLINK,	0, IPI_PRIV | IPI_WR, 0, NULL, NULL },
	{ I_PLINK,	0, IPI_PRIV | IPI_WR, 0, NULL, NULL },
	{ I_PUNLINK,	0, IPI_PRIV | IPI_WR, 0, NULL, NULL },
	{ ND_GET,	0, 0, 0, NULL, NULL },
	{ ND_SET,	0, IPI_PRIV | IPI_WR, 0, NULL, NULL },
	{ IP_IOCTL,	0, 0, 0, NULL, NULL },
	{ SIOCGETVIFCNT, sizeof (struct sioc_vif_req), IPI_GET_CMD,
		MISC_CMD, mrt_ioctl},
	{ SIOCGETSGCNT,	sizeof (struct sioc_sg_req), IPI_GET_CMD,
		MISC_CMD, mrt_ioctl},
	{ SIOCGETLSGCNT, sizeof (struct sioc_lsg_req), IPI_GET_CMD,
		MISC_CMD, mrt_ioctl}
};
| |
/* Number of entries in ip_misc_ioctl_table. */
int ip_misc_ioctl_count =
    sizeof (ip_misc_ioctl_table) / sizeof (ip_ioctl_cmd_t);

/* Number of drainers reqd.; settable in /etc/system. */
int conn_drain_nthreads;

/* IRE bucket tunables — defined in ip_ire.c. */
extern uint32_t ip_ire_max_bucket_cnt, ip6_ire_max_bucket_cnt;
extern uint32_t ip_ire_min_bucket_cnt, ip6_ire_min_bucket_cnt;
extern uint32_t ip_ire_mem_ratio, ip_ire_cpu_ratio;
| |
/*
 * Name/value table mapping IRE type bits to printable names; zero entry
 * terminates the table.  Presumably consumed via ip_nv_lookup() — confirm
 * at use sites.
 */
static nv_t	ire_nv_arr[] = {
	{ IRE_BROADCAST, "BROADCAST" },
	{ IRE_LOCAL, "LOCAL" },
	{ IRE_LOOPBACK, "LOOPBACK" },
	{ IRE_DEFAULT, "DEFAULT" },
	{ IRE_PREFIX, "PREFIX" },
	{ IRE_IF_NORESOLVER, "IF_NORESOL" },
	{ IRE_IF_RESOLVER, "IF_RESOLV" },
	{ IRE_IF_CLONE, "IF_CLONE" },
	{ IRE_HOST, "HOST" },
	{ IRE_MULTICAST, "MULTICAST" },
	{ IRE_NOROUTE, "NOROUTE" },
	{ 0 }
};

/* Exported pointer to the table above. */
nv_t	*ire_nv_tbl = ire_nv_arr;
| |
/*
 * Simple ICMP IP Header Template: version/header-length byte and protocol
 * set; all remaining fields zero, to be filled in before transmission.
 */
static ipha_t icmp_ipha = {
	IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP
};

/* STREAMS module info shared by all of IP's qinit structures below. */
struct module_info ip_mod_info = {
	IP_MOD_ID, IP_MOD_NAME, IP_MOD_MINPSZ, IP_MOD_MAXPSZ, IP_MOD_HIWAT,
	IP_MOD_LOWAT
};
| |
| /* |
| * Duplicate static symbols within a module confuses mdb; so we avoid the |
| * problem by making the symbols here distinct from those in udp.c. |
| */ |
| |
| /* |
| * Entry points for IP as a device and as a module. |
| * We have separate open functions for the /dev/ip and /dev/ip6 devices. |
| */ |
/* Upper read-side queue for /dev/ip (IPv4). */
static struct qinit iprinitv4 = {
	(pfi_t)ip_rput, NULL, ip_openv4, ip_close, NULL,
	&ip_mod_info
};

/* Upper read-side queue for /dev/ip6 (IPv6); exported (not static). */
struct qinit iprinitv6 = {
	(pfi_t)ip_rput_v6, NULL, ip_openv6, ip_close, NULL,
	&ip_mod_info
};

/* Upper write-side queue, shared by both address families. */
static struct qinit ipwinit = {
	(pfi_t)ip_wput_nondata, (pfi_t)ip_wsrv, NULL, NULL, NULL,
	&ip_mod_info
};

/* Lower read-side queue — presumably for streams plumbed below IP; confirm. */
static struct qinit iplrinit = {
	(pfi_t)ip_lrput, NULL, ip_openv4, ip_close, NULL,
	&ip_mod_info
};

/* Lower write-side queue. */
static struct qinit iplwinit = {
	(pfi_t)ip_lwput, NULL, NULL, NULL, NULL,
	&ip_mod_info
};

/* For AF_INET aka /dev/ip */
struct streamtab ipinfov4 = {
	&iprinitv4, &ipwinit, &iplrinit, &iplwinit
};

/* For AF_INET6 aka /dev/ip6 */
struct streamtab ipinfov6 = {
	&iprinitv6, &ipwinit, &iplrinit, &iplwinit
};
| |
#ifdef DEBUG
/* Debug-only knob; NOTE(review): appears to bypass SCTP checksum handling — confirm at use sites. */
boolean_t skip_sctp_cksum = B_FALSE;
#endif
| |
| /* |
| * Generate an ICMP fragmentation needed message. |
| * When called from ip_output side a minimal ip_recv_attr_t needs to be |
| * constructed by the caller. |
| */ |
| void |
| icmp_frag_needed(mblk_t *mp, int mtu, ip_recv_attr_t *ira) |
| { |
| icmph_t icmph; |
| ip_stack_t *ipst = ira->ira_ill->ill_ipst; |
| |
| mp = icmp_pkt_err_ok(mp, ira); |
| if (mp == NULL) |
| return; |
| |
| bzero(&icmph, sizeof (icmph_t)); |
| icmph.icmph_type = ICMP_DEST_UNREACHABLE; |
| icmph.icmph_code = ICMP_FRAGMENTATION_NEEDED; |
| icmph.icmph_du_mtu = htons((uint16_t)mtu); |
| BUMP_MIB(&ipst->ips_icmp_mib, icmpOutFragNeeded); |
| BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDestUnreachs); |
| |
| icmp_pkt(mp, &icmph, sizeof (icmph_t), ira); |
| } |
| |
| /* |
| * icmp_inbound_v4 deals with ICMP messages that are handled by IP. |
| * If the ICMP message is consumed by IP, i.e., it should not be delivered |
| * to any IPPROTO_ICMP raw sockets, then it returns NULL. |
| * Likewise, if the ICMP error is misformed (too short, etc), then it |
| * returns NULL. The caller uses this to determine whether or not to send |
| * to raw sockets. |
| * |
| * All error messages are passed to the matching transport stream. |
| * |
| * The following cases are handled by icmp_inbound: |
| * 1) It needs to send a reply back and possibly delivering it |
| * to the "interested" upper clients. |
| * 2) Return the mblk so that the caller can pass it to the RAW socket clients. |
| * 3) It needs to change some values in IP only. |
| * 4) It needs to change some values in IP and upper layers e.g TCP |
| * by delivering an error to the upper layers. |
| * |
 * We handle the above four cases in the context of IPsec in the
 * following way :
| * |
| * 1) Send the reply back in the same way as the request came in. |
| * If it came in encrypted, it goes out encrypted. If it came in |
| * clear, it goes out in clear. Thus, this will prevent chosen |
| * plain text attack. |
| * 2) The client may or may not expect things to come in secure. |
| * If it comes in secure, the policy constraints are checked |
| * before delivering it to the upper layers. If it comes in |
| * clear, ipsec_inbound_accept_clear will decide whether to |
| * accept this in clear or not. In both the cases, if the returned |
| * message (IP header + 8 bytes) that caused the icmp message has |
| * AH/ESP headers, it is sent up to AH/ESP for validation before |
| * sending up. If there are only 8 bytes of returned message, then |
| * upper client will not be notified. |
 * 3) Check with global policy to see whether it matches the constraints.
| * But this will be done only if icmp_accept_messages_in_clear is |
| * zero. |
| * 4) If we need to change both in IP and ULP, then the decision taken |
| * while affecting the values in IP and while delivering up to TCP |
| * should be the same. |
| * |
| * There are two cases. |
| * |
| * a) If we reject data at the IP layer (ipsec_check_global_policy() |
| * failed), we will not deliver it to the ULP, even though they |
| * are *willing* to accept in *clear*. This is fine as our global |
| * disposition to icmp messages asks us reject the datagram. |
| * |
| * b) If we accept data at the IP layer (ipsec_check_global_policy() |
| * succeeded or icmp_accept_messages_in_clear is 1), and not able |
| * to deliver it to ULP (policy failed), it can lead to |
| * consistency problems. The cases known at this time are |
| * ICMP_DESTINATION_UNREACHABLE messages with following code |
| * values : |
| * |
| * - ICMP_FRAGMENTATION_NEEDED : IP adapts to the new value |
| * and Upper layer rejects. Then the communication will |
| * come to a stop. This is solved by making similar decisions |
| * at both levels. Currently, when we are unable to deliver |
| * to the Upper Layer (due to policy failures) while IP has |
| * adjusted dce_pmtu, the next outbound datagram would |
| * generate a local ICMP_FRAGMENTATION_NEEDED message - which |
| * will be with the right level of protection. Thus the right |
| * value will be communicated even if we are not able to |
| * communicate when we get from the wire initially. But this |
| * assumes there would be at least one outbound datagram after |
| * IP has adjusted its dce_pmtu value. To make things |
| * simpler, we accept in clear after the validation of |
| * AH/ESP headers. |
| * |
| * - Other ICMP ERRORS : We may not be able to deliver it to the |
| * upper layer depending on the level of protection the upper |
| * layer expects and the disposition in ipsec_inbound_accept_clear(). |
| * ipsec_inbound_accept_clear() decides whether a given ICMP error |
| * should be accepted in clear when the Upper layer expects secure. |
| * Thus the communication may get aborted by some bad ICMP |
| * packets. |
| */ |
mblk_t *
icmp_inbound_v4(mblk_t *mp, ip_recv_attr_t *ira)
{
	icmph_t	*icmph;
	ipha_t	*ipha;		/* Outer header */
	int	ip_hdr_length;	/* Outer header length */
	boolean_t	interested;
	ipif_t	*ipif;
	uint32_t	ts;
	uint32_t	*tsp;
	timestruc_t	now;
	ill_t	*ill = ira->ira_ill;
	ip_stack_t	*ipst = ill->ill_ipst;
	zoneid_t	zoneid = ira->ira_zoneid;
	int	len_needed;
	mblk_t	*mp_ret = NULL;	/* Copy returned for RAW socket fanout */

	ipha = (ipha_t *)mp->b_rptr;

	BUMP_MIB(&ipst->ips_icmp_mib, icmpInMsgs);

	/*
	 * Make sure the IP header plus the fixed part of the ICMP header
	 * are contiguous in the first mblk before we inspect them.
	 */
	ip_hdr_length = ira->ira_ip_hdr_length;
	if ((mp->b_wptr - mp->b_rptr) < (ip_hdr_length + ICMPH_SIZE)) {
		if (ira->ira_pktlen < (ip_hdr_length + ICMPH_SIZE)) {
			/* The whole packet is shorter than an ICMP header */
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
			ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill);
			freemsg(mp);
			return (NULL);
		}
		/* Last chance to get real. */
		ipha = ip_pullup(mp, ip_hdr_length + ICMPH_SIZE, ira);
		if (ipha == NULL) {
			BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors);
			freemsg(mp);
			return (NULL);
		}
	}

	/* The IP header will always be a multiple of four bytes */
	icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
	ip2dbg(("icmp_inbound_v4: type %d code %d\n", icmph->icmph_type,
	    icmph->icmph_code));

	/*
	 * We will set "interested" to "true" if we should pass a copy to
	 * the transport or if we handle the packet locally.
	 */
	interested = B_FALSE;
	switch (icmph->icmph_type) {
	case ICMP_ECHO_REPLY:
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInEchoReps);
		break;
	case ICMP_DEST_UNREACHABLE:
		if (icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED)
			BUMP_MIB(&ipst->ips_icmp_mib, icmpInFragNeeded);
		interested = B_TRUE;	/* Pass up to transport */
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInDestUnreachs);
		break;
	case ICMP_SOURCE_QUENCH:
		interested = B_TRUE;	/* Pass up to transport */
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInSrcQuenchs);
		break;
	case ICMP_REDIRECT:
		if (!ipst->ips_ip_ignore_redirect)
			interested = B_TRUE;
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInRedirects);
		break;
	case ICMP_ECHO_REQUEST:
		/*
		 * Whether to respond to echo requests that come in as IP
		 * broadcasts or as IP multicast is subject to debate
		 * (what isn't?).  We aim to please, you pick it.
		 * Default is do it.
		 */
		if (ira->ira_flags & IRAF_MULTICAST) {
			/* multicast: respond based on tunable */
			interested = ipst->ips_ip_g_resp_to_echo_mcast;
		} else if (ira->ira_flags & IRAF_BROADCAST) {
			/* broadcast: respond based on tunable */
			interested = ipst->ips_ip_g_resp_to_echo_bcast;
		} else {
			/* unicast: always respond */
			interested = B_TRUE;
		}
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInEchos);
		if (!interested) {
			/* We never pass these to RAW sockets */
			freemsg(mp);
			return (NULL);
		}

		/* Check db_ref to make sure we can modify the packet. */
		if (mp->b_datap->db_ref > 1) {
			mblk_t	*mp1;

			mp1 = copymsg(mp);
			freemsg(mp);
			if (!mp1) {
				BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
				return (NULL);
			}
			mp = mp1;
			/* Refresh pointers into the private copy */
			ipha = (ipha_t *)mp->b_rptr;
			icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
		}
		/* Turn the request into a reply in place and send it back */
		icmph->icmph_type = ICMP_ECHO_REPLY;
		BUMP_MIB(&ipst->ips_icmp_mib, icmpOutEchoReps);
		icmp_send_reply_v4(mp, ipha, icmph, ira);
		return (NULL);

	case ICMP_ROUTER_ADVERTISEMENT:
	case ICMP_ROUTER_SOLICITATION:
		break;
	case ICMP_TIME_EXCEEDED:
		interested = B_TRUE;	/* Pass up to transport */
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInTimeExcds);
		break;
	case ICMP_PARAM_PROBLEM:
		interested = B_TRUE;	/* Pass up to transport */
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInParmProbs);
		break;
	case ICMP_TIME_STAMP_REQUEST:
		/* Response to Time Stamp Requests is local policy. */
		if (ipst->ips_ip_g_resp_to_timestamp) {
			if (ira->ira_flags & IRAF_MULTIBROADCAST)
				interested =
				    ipst->ips_ip_g_resp_to_timestamp_bcast;
			else
				interested = B_TRUE;
		}
		if (!interested) {
			/* We never pass these to RAW sockets */
			freemsg(mp);
			return (NULL);
		}

		/*
		 * Make sure we have enough of the packet: the ICMP header
		 * plus the three 32-bit timestamps (originate, receive,
		 * transmit).
		 */
		len_needed = ip_hdr_length + ICMPH_SIZE +
		    3 * sizeof (uint32_t);

		if (mp->b_wptr - mp->b_rptr < len_needed) {
			ipha = ip_pullup(mp, len_needed, ira);
			if (ipha == NULL) {
				BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
				ip_drop_input("ipIfStatsInDiscards - ip_pullup",
				    mp, ill);
				freemsg(mp);
				return (NULL);
			}
			/* Refresh following the pullup. */
			icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
		}
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInTimestamps);
		/* Check db_ref to make sure we can modify the packet. */
		if (mp->b_datap->db_ref > 1) {
			mblk_t	*mp1;

			mp1 = copymsg(mp);
			freemsg(mp);
			if (!mp1) {
				BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
				return (NULL);
			}
			mp = mp1;
			ipha = (ipha_t *)mp->b_rptr;
			icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
		}
		icmph->icmph_type = ICMP_TIME_STAMP_REPLY;
		tsp = (uint32_t *)&icmph[1];
		tsp++;		/* Skip past 'originate time' */
		/* Compute # of milliseconds since midnight */
		gethrestime(&now);
		ts = (now.tv_sec % (24 * 60 * 60)) * 1000 +
		    now.tv_nsec / (NANOSEC / MILLISEC);
		*tsp++ = htonl(ts);	/* Lay in 'receive time' */
		*tsp++ = htonl(ts);	/* Lay in 'send time' */
		BUMP_MIB(&ipst->ips_icmp_mib, icmpOutTimestampReps);
		icmp_send_reply_v4(mp, ipha, icmph, ira);
		return (NULL);

	case ICMP_TIME_STAMP_REPLY:
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInTimestampReps);
		break;
	case ICMP_INFO_REQUEST:
		/* Per RFC 1122 3.2.2.7, ignore this. */
	case ICMP_INFO_REPLY:
		break;
	case ICMP_ADDRESS_MASK_REQUEST:
		if (ira->ira_flags & IRAF_MULTIBROADCAST) {
			interested =
			    ipst->ips_ip_respond_to_address_mask_broadcast;
		} else {
			interested = B_TRUE;
		}
		if (!interested) {
			/* We never pass these to RAW sockets */
			freemsg(mp);
			return (NULL);
		}
		/* Need the ICMP header plus room for the 4-byte mask */
		len_needed = ip_hdr_length + ICMPH_SIZE + IP_ADDR_LEN;
		if (mp->b_wptr - mp->b_rptr < len_needed) {
			ipha = ip_pullup(mp, len_needed, ira);
			if (ipha == NULL) {
				BUMP_MIB(ill->ill_ip_mib,
				    ipIfStatsInTruncatedPkts);
				ip_drop_input("ipIfStatsInTruncatedPkts", mp,
				    ill);
				freemsg(mp);
				return (NULL);
			}
			/* Refresh following the pullup. */
			icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
		}
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInAddrMasks);
		/* Check db_ref to make sure we can modify the packet. */
		if (mp->b_datap->db_ref > 1) {
			mblk_t	*mp1;

			mp1 = copymsg(mp);
			freemsg(mp);
			if (!mp1) {
				BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
				return (NULL);
			}
			mp = mp1;
			ipha = (ipha_t *)mp->b_rptr;
			icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
		}
		/*
		 * Need the ipif with the mask be the same as the source
		 * address of the mask reply. For unicast we have a specific
		 * ipif. For multicast/broadcast we only handle onlink
		 * senders, and use the source address to pick an ipif.
		 */
		ipif = ipif_lookup_addr(ipha->ipha_dst, ill, zoneid, ipst);
		if (ipif == NULL) {
			/* Broadcast or multicast */
			ipif = ipif_lookup_remote(ill, ipha->ipha_src, zoneid);
			if (ipif == NULL) {
				freemsg(mp);
				return (NULL);
			}
		}
		icmph->icmph_type = ICMP_ADDRESS_MASK_REPLY;
		bcopy(&ipif->ipif_net_mask, &icmph[1], IP_ADDR_LEN);
		ipif_refrele(ipif);
		BUMP_MIB(&ipst->ips_icmp_mib, icmpOutAddrMaskReps);
		icmp_send_reply_v4(mp, ipha, icmph, ira);
		return (NULL);

	case ICMP_ADDRESS_MASK_REPLY:
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInAddrMaskReps);
		break;
	default:
		interested = B_TRUE;	/* Pass up to transport */
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInUnknowns);
		break;
	}
	/*
	 * See if there is an ICMP client to avoid an extra copymsg/freemsg
	 * if there isn't one.
	 */
	if (ipst->ips_ipcl_proto_fanout_v4[IPPROTO_ICMP].connf_head != NULL) {
		/* If there is an ICMP client and we want one too, copy it. */

		if (!interested) {
			/* Caller will deliver to RAW sockets */
			return (mp);
		}
		mp_ret = copymsg(mp);
		if (mp_ret == NULL) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
			ip_drop_input("ipIfStatsInDiscards - copymsg", mp, ill);
		}
	} else if (!interested) {
		/* Neither we nor raw sockets are interested. Drop packet now */
		freemsg(mp);
		return (NULL);
	}

	/*
	 * ICMP error or redirect packet. Make sure we have enough of
	 * the header and that db_ref == 1 since we might end up modifying
	 * the packet.
	 */
	if (mp->b_cont != NULL) {
		if (ip_pullup(mp, -1, ira) == NULL) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
			ip_drop_input("ipIfStatsInDiscards - ip_pullup",
			    mp, ill);
			freemsg(mp);
			return (mp_ret);
		}
	}

	if (mp->b_datap->db_ref > 1) {
		mblk_t	*mp1;

		mp1 = copymsg(mp);
		if (mp1 == NULL) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
			ip_drop_input("ipIfStatsInDiscards - copymsg", mp, ill);
			freemsg(mp);
			return (mp_ret);
		}
		freemsg(mp);
		mp = mp1;
	}

	/*
	 * In case mp has changed, verify the message before any further
	 * processes.
	 */
	ipha = (ipha_t *)mp->b_rptr;
	icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
	if (!icmp_inbound_verify_v4(mp, icmph, ira)) {
		freemsg(mp);
		return (mp_ret);
	}

	switch (icmph->icmph_type) {
	case ICMP_REDIRECT:
		icmp_redirect_v4(mp, ipha, icmph, ira);
		break;
	case ICMP_DEST_UNREACHABLE:
		if (icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED) {
			/* Update DCE and adjust MTU in icmp header if needed */
			icmp_inbound_too_big_v4(icmph, ira);
		}
		/* FALLTHRU */
	default:
		icmp_inbound_error_fanout_v4(mp, icmph, ira);
		break;
	}
	return (mp_ret);
}
| |
| /* |
| * Send an ICMP echo, timestamp or address mask reply. |
| * The caller has already updated the payload part of the packet. |
| * We handle the ICMP checksum, IP source address selection and feed |
| * the packet into ip_output_simple. |
| */ |
static void
icmp_send_reply_v4(mblk_t *mp, ipha_t *ipha, icmph_t *icmph,
    ip_recv_attr_t *ira)
{
	uint_t	ip_hdr_length = ira->ira_ip_hdr_length;
	ill_t	*ill = ira->ira_ill;
	ip_stack_t	*ipst = ill->ill_ipst;
	ip_xmit_attr_t ixas;

	/* Send out an ICMP packet */
	icmph->icmph_checksum = 0;
	/* Checksum covers the message starting at the ICMP header */
	icmph->icmph_checksum = IP_CSUM(mp, ip_hdr_length, 0);
	/* Reset time to live. */
	ipha->ipha_ttl = ipst->ips_ip_def_ttl;
	{
		/* Swap source and destination addresses */
		ipaddr_t tmp;

		tmp = ipha->ipha_src;
		ipha->ipha_src = ipha->ipha_dst;
		ipha->ipha_dst = tmp;
	}
	ipha->ipha_ident = 0;
	/* Fix up any IP options for the reversed direction */
	if (!IS_SIMPLE_IPH(ipha))
		icmp_options_update(ipha);

	/* Build minimal transmit attributes for ip_output_simple() */
	bzero(&ixas, sizeof (ixas));
	ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4;
	ixas.ixa_zoneid = ira->ira_zoneid;
	ixas.ixa_cred = kcred;
	ixas.ixa_cpid = NOPID;
	ixas.ixa_tsl = ira->ira_tsl;	/* Behave as a multi-level responder */
	ixas.ixa_ifindex = 0;
	ixas.ixa_ipst = ipst;
	ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;

	if (!(ira->ira_flags & IRAF_IPSEC_SECURE)) {
		/*
		 * This packet should go out the same way as it
		 * came in i.e in clear, independent of the IPsec policy
		 * for transmitting packets.
		 */
		ixas.ixa_flags |= IXAF_NO_IPSEC;
	} else {
		/* Reflect the inbound IPsec protection onto the reply */
		if (!ipsec_in_to_out(ira, &ixas, mp, ipha, NULL)) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
			/* Note: mp already consumed and ip_drop_packet done */
			return;
		}
	}
	if (ira->ira_flags & IRAF_MULTIBROADCAST) {
		/*
		 * Not one of our addresses (IRE_LOCALs), thus we let
		 * ip_output_simple pick the source.
		 */
		ipha->ipha_src = INADDR_ANY;
		ixas.ixa_flags |= IXAF_SET_SOURCE;
	}
	/* Should we send with DF and use dce_pmtu? */
	if (ipst->ips_ipv4_icmp_return_pmtu) {
		ixas.ixa_flags |= IXAF_PMTU_DISCOVERY;
		ipha->ipha_fragment_offset_and_flags |= IPH_DF_HTONS;
	}

	BUMP_MIB(&ipst->ips_icmp_mib, icmpOutMsgs);

	(void) ip_output_simple(mp, &ixas);
	ixa_cleanup(&ixas);
}
| |
| /* |
| * Verify the ICMP messages for either for ICMP error or redirect packet. |
| * The caller should have fully pulled up the message. If it's a redirect |
| * packet, only basic checks on IP header will be done; otherwise, verify |
| * the packet by looking at the included ULP header. |
| * |
| * Called before icmp_inbound_error_fanout_v4 is called. |
| */ |
static boolean_t
icmp_inbound_verify_v4(mblk_t *mp, icmph_t *icmph, ip_recv_attr_t *ira)
{
	ill_t		*ill = ira->ira_ill;
	int		hdr_length;
	ip_stack_t	*ipst = ira->ira_ill->ill_ipst;
	conn_t		*connp;
	ipha_t		*ipha;	/* Inner IP header */

	/* The packet in error starts right after the ICMP header */
	ipha = (ipha_t *)&icmph[1];
	if ((uchar_t *)ipha + IP_SIMPLE_HDR_LENGTH > mp->b_wptr)
		goto truncated;

	hdr_length = IPH_HDR_LENGTH(ipha);

	if ((IPH_HDR_VERSION(ipha) != IPV4_VERSION))
		goto discard_pkt;

	/* An IPv4 header can never be shorter than the base header */
	if (hdr_length < sizeof (ipha_t))
		goto truncated;

	/* The full inner IP header (including options) must be present */
	if ((uchar_t *)ipha + hdr_length > mp->b_wptr)
		goto truncated;

	/*
	 * Stop here for ICMP_REDIRECT.
	 */
	if (icmph->icmph_type == ICMP_REDIRECT)
		return (B_TRUE);

	/*
	 * ICMP errors only.
	 */
	switch (ipha->ipha_protocol) {
	case IPPROTO_UDP:
		/*
		 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
		 * transport header.
		 */
		if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN >
		    mp->b_wptr)
			goto truncated;
		break;
	case IPPROTO_TCP: {
		tcpha_t *tcpha;

		/*
		 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
		 * transport header.
		 */
		if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN >
		    mp->b_wptr)
			goto truncated;

		/*
		 * Let the matched TCP connection sanity-check the error
		 * through its conn_verifyicmp hook (if any), e.g. that it
		 * refers to a segment actually in flight.
		 */
		tcpha = (tcpha_t *)((uchar_t *)ipha + hdr_length);
		connp = ipcl_tcp_lookup_reversed_ipv4(ipha, tcpha, TCPS_LISTEN,
		    ipst);
		if (connp == NULL)
			goto discard_pkt;

		if ((connp->conn_verifyicmp != NULL) &&
		    !connp->conn_verifyicmp(connp, tcpha, icmph, NULL, ira)) {
			CONN_DEC_REF(connp);
			goto discard_pkt;
		}
		CONN_DEC_REF(connp);
		break;
	}
	case IPPROTO_SCTP:
		/*
		 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
		 * transport header.
		 */
		if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN >
		    mp->b_wptr)
			goto truncated;
		break;
	case IPPROTO_ESP:
	case IPPROTO_AH:
		break;
	case IPPROTO_ENCAP:
		/* Self-encapsulation: need the full inner IP header too */
		if ((uchar_t *)ipha + hdr_length + sizeof (ipha_t) >
		    mp->b_wptr)
			goto truncated;
		break;
	default:
		break;
	}

	return (B_TRUE);

discard_pkt:
	/* Bogus ICMP error. */
	BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
	return (B_FALSE);

truncated:
	/* We pulled up everything already. Must be truncated */
	BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
	ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill);
	return (B_FALSE);
}
| |
/*
 * Plateau table of common MTU sizes from RFC 1191, used to guess the next
 * path MTU when a "fragmentation needed" message carries no valid MTU
 * advice.  Must stay sorted in descending order: the lookup scans for the
 * first entry strictly smaller than the offending packet length.
 * Read-only, hence const.
 */
static const int icmp_frag_size_table[] =
    { 32000, 17914, 8166, 4352, 2002, 1496, 1006, 508, 296, 68 };
| |
| /* |
| * Process received ICMP Packet too big. |
| * Just handles the DCE create/update, including using the above table of |
| * PMTU guesses. The caller is responsible for validating the packet before |
| * passing it in and also to fanout the ICMP error to any matching transport |
| * conns. Assumes the message has been fully pulled up and verified. |
| * |
| * Before getting here, the caller has called icmp_inbound_verify_v4() |
| * that should have verified with ULP to prevent undoing the changes we're |
| * going to make to DCE. For example, TCP might have verified that the packet |
| * which generated error is in the send window. |
| * |
 * In some cases this function modifies the MTU in the ICMP header of the
 * packet; the caller should pass the packet to the matching ULP after this
 * returns.
| */ |
static void
icmp_inbound_too_big_v4(icmph_t *icmph, ip_recv_attr_t *ira)
{
	dce_t		*dce;
	int		old_mtu;
	int		mtu, orig_mtu;
	ipaddr_t	dst;
	boolean_t	disable_pmtud;
	ill_t		*ill = ira->ira_ill;
	ip_stack_t	*ipst = ill->ill_ipst;
	uint_t		hdr_length;
	ipha_t		*ipha;

	/* Caller already pulled up everything. */
	ipha = (ipha_t *)&icmph[1];
	ASSERT(icmph->icmph_type == ICMP_DEST_UNREACHABLE &&
	    icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED);
	ASSERT(ill != NULL);

	hdr_length = IPH_HDR_LENGTH(ipha);

	/*
	 * We handle path MTU for source routed packets since the DCE
	 * is looked up using the final destination.
	 */
	dst = ip_get_dst(ipha);

	dce = dce_lookup_and_add_v4(dst, ipst);
	if (dce == NULL) {
		/* Couldn't add a unique one - ENOMEM */
		ip1dbg(("icmp_inbound_too_big_v4: no dce for 0x%x\n",
		    ntohl(dst)));
		return;
	}

	/* Check for MTU discovery advice as described in RFC 1191 */
	mtu = ntohs(icmph->icmph_du_mtu);
	orig_mtu = mtu;
	disable_pmtud = B_FALSE;

	/* Hold dce_lock while reading/updating dce_pmtu and dce_flags */
	mutex_enter(&dce->dce_lock);
	if (dce->dce_flags & DCEF_PMTU)
		old_mtu = dce->dce_pmtu;
	else
		old_mtu = ill->ill_mtu;

	if (icmph->icmph_du_zero != 0 || mtu < ipst->ips_ip_pmtu_min) {
		uint32_t length;
		int	i;

		/*
		 * No (usable) MTU advice from the router.
		 * Use the table from RFC 1191 to figure out
		 * the next "plateau" based on the length in
		 * the original IP packet.
		 */
		length = ntohs(ipha->ipha_length);
		DTRACE_PROBE2(ip4__pmtu__guess, dce_t *, dce,
		    uint32_t, length);
		if (old_mtu <= length &&
		    old_mtu >= length - hdr_length) {
			/*
			 * Handle broken BSD 4.2 systems that
			 * return the wrong ipha_length in ICMP
			 * errors.
			 */
			ip1dbg(("Wrong mtu: sent %d, dce %d\n",
			    length, old_mtu));
			length -= hdr_length;
		}
		/* Find the first plateau strictly smaller than 'length' */
		for (i = 0; i < A_CNT(icmp_frag_size_table); i++) {
			if (length > icmp_frag_size_table[i])
				break;
		}
		if (i == A_CNT(icmp_frag_size_table)) {
			/* Smaller than IP_MIN_MTU! */
			ip1dbg(("Too big for packet size %d\n",
			    length));
			disable_pmtud = B_TRUE;
			mtu = ipst->ips_ip_pmtu_min;
		} else {
			mtu = icmp_frag_size_table[i];
			ip1dbg(("Calculated mtu %d, packet size %d, "
			    "before %d\n", mtu, length, old_mtu));
			if (mtu < ipst->ips_ip_pmtu_min) {
				mtu = ipst->ips_ip_pmtu_min;
				disable_pmtud = B_TRUE;
			}
		}
	}
	if (disable_pmtud)
		dce->dce_flags |= DCEF_TOO_SMALL_PMTU;
	else
		dce->dce_flags &= ~DCEF_TOO_SMALL_PMTU;

	/* The PMTU never increases as a result of this message */
	dce->dce_pmtu = MIN(old_mtu, mtu);
	/* Prepare to send the new max frag size for the ULP. */
	icmph->icmph_du_zero = 0;
	icmph->icmph_du_mtu = htons((uint16_t)dce->dce_pmtu);
	DTRACE_PROBE4(ip4__pmtu__change, icmph_t *, icmph, dce_t *,
	    dce, int, orig_mtu, int, mtu);

	/* We now have a PMTU for sure */
	dce->dce_flags |= DCEF_PMTU;
	dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
	mutex_exit(&dce->dce_lock);
	/*
	 * After dropping the lock the new value is visible to everyone.
	 * Then we bump the generation number so any cached values reinspect
	 * the dce_t.
	 */
	dce_increment_generation(dce);
	dce_refrele(dce);
}
| |
| /* |
| * If the packet in error is Self-Encapsulated, icmp_inbound_error_fanout_v4 |
| * calls this function. |
| */ |
static mblk_t *
icmp_inbound_self_encap_error_v4(mblk_t *mp, ipha_t *ipha, ipha_t *in_ipha)
{
	int length;

	ASSERT(mp->b_datap->db_type == M_DATA);

	/* icmp_inbound_v4 has already pulled up the whole error packet */
	ASSERT(mp->b_cont == NULL);

	/*
	 * The length that we want to overlay is the inner header
	 * and what follows it.
	 */
	length = msgdsize(mp) - ((uchar_t *)in_ipha - mp->b_rptr);

	/*
	 * Overlay the inner header and whatever follows it over the
	 * outer header.
	 *
	 * NOTE: source and destination overlap (in_ipha lies within the
	 * same data block as ipha); this relies on the kernel bcopy()
	 * handling overlapping regions -- do not convert to memcpy().
	 */
	bcopy((uchar_t *)in_ipha, (uchar_t *)ipha, length);

	/* Adjust for what we removed (the stripped outer header) */
	mp->b_wptr -= (uchar_t *)in_ipha - (uchar_t *)ipha;
	return (mp);
}
| |
| /* |
| * Try to pass the ICMP message upstream in case the ULP cares. |
| * |
| * If the packet that caused the ICMP error is secure, we send |
| * it to AH/ESP to make sure that the attached packet has a |
| * valid association. ipha in the code below points to the |
| * IP header of the packet that caused the error. |
| * |
| * For IPsec cases, we let the next-layer-up (which has access to |
| * cached policy on the conn_t, or can query the SPD directly) |
| * subtract out any IPsec overhead if they must. We therefore make no |
| * adjustments here for IPsec overhead. |
| * |
| * IFN could have been generated locally or by some router. |
| * |
 * LOCAL : ire_send_wire (before calling ipsec_out_process) can call
 * icmp_frag_needed/icmp_pkt2big_v6 to generate a local IFN.
| * This happens because IP adjusted its value of MTU on an |
| * earlier IFN message and could not tell the upper layer, |
| * the new adjusted value of MTU e.g. Packet was encrypted |
| * or there was not enough information to fanout to upper |
| * layers. Thus on the next outbound datagram, ire_send_wire |
| * generates the IFN, where IPsec processing has *not* been |
| * done. |
| * |
 * Note that we retain ixa_fragsize across IPsec; thus once
 * we have picked ixa_fragsize and entered ipsec_out_process we do
 * not change the fragsize even if the path MTU changes before
| * we reach ip_output_post_ipsec. |
| * |
| * In the local case, IRAF_LOOPBACK will be set indicating |
| * that IFN was generated locally. |
| * |
| * ROUTER : IFN could be secure or non-secure. |
| * |
| * * SECURE : We use the IPSEC_IN to fanout to AH/ESP if the |
| * packet in error has AH/ESP headers to validate the AH/ESP |
| * headers. AH/ESP will verify whether there is a valid SA or |
| * not and send it back. We will fanout again if we have more |
| * data in the packet. |
| * |
| * If the packet in error does not have AH/ESP, we handle it |
| * like any other case. |
| * |
| * * NON_SECURE : If the packet in error has AH/ESP headers, we send it |
| * up to AH/ESP for validation. AH/ESP will verify whether there is a |
| * valid SA or not and send it back. We will fanout again if |
| * we have more data in the packet. |
| * |
| * If the packet in error does not have AH/ESP, we handle it |
| * like any other case. |
| * |
| * The caller must have called icmp_inbound_verify_v4. |
| */ |
| static void |
| icmp_inbound_error_fanout_v4(mblk_t *mp, icmph_t *icmph, ip_recv_attr_t *ira) |
| { |
| uint16_t *up; /* Pointer to ports in ULP header */ |
| uint32_t ports; /* reversed ports for fanout */ |
| ipha_t ripha; /* With reversed addresses */ |
| ipha_t *ipha; /* Inner IP header */ |
| uint_t hdr_length; /* Inner IP header length */ |
| tcpha_t *tcpha; |
| conn_t *connp; |
| ill_t *ill = ira->ira_ill; |
| ip_stack_t *ipst = ill->ill_ipst; |
| ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; |
| ill_t *rill = ira->ira_rill; |
| |
| /* Caller already pulled up everything. */ |
| ipha = (ipha_t *)&icmph[1]; |
| ASSERT((uchar_t *)&ipha[1] <= mp->b_wptr); |
| ASSERT(mp->b_cont == NULL); |
| |
| hdr_length = IPH_HDR_LENGTH(ipha); |
| ira->ira_protocol = ipha->ipha_protocol; |
| |
| /* |
| * We need a separate IP header with the source and destination |
| * addresses reversed to do fanout/classification because the ipha in |
| * the ICMP error is in the form we sent it out. |
| */ |
| ripha.ipha_src = ipha->ipha_dst; |
| ripha.ipha_dst = ipha->ipha_src; |
| ripha.ipha_protocol = ipha->ipha_protocol; |
| ripha.ipha_version_and_hdr_length = ipha->ipha_version_and_hdr_length; |
| |
| ip2dbg(("icmp_inbound_error_v4: proto %d %x to %x: %d/%d\n", |
| ripha.ipha_protocol, ntohl(ipha->ipha_src), |
| ntohl(ipha->ipha_dst), |
| icmph->icmph_type, icmph->icmph_code)); |
| |
| switch (ipha->ipha_protocol) { |
| case IPPROTO_UDP: |
| up = (uint16_t *)((uchar_t *)ipha + hdr_length); |
| |
| /* Attempt to find a client stream based on port. */ |
| ip2dbg(("icmp_inbound_error_v4: UDP ports %d to %d\n", |
| ntohs(up[0]), ntohs(up[1]))); |
| |
| /* Note that we send error to all matches. */ |
| ira->ira_flags |= IRAF_ICMP_ERROR; |
| ip_fanout_udp_multi_v4(mp, &ripha, up[0], up[1], ira); |
| ira->ira_flags &= ~IRAF_ICMP_ERROR; |
| return; |
| |
| case IPPROTO_TCP: |
| /* |
| * Find a TCP client stream for this packet. |
| * Note that we do a reverse lookup since the header is |
| * in the form we sent it out. |
| */ |
| tcpha = (tcpha_t *)((uchar_t *)ipha + hdr_length); |
| connp = ipcl_tcp_lookup_reversed_ipv4(ipha, tcpha, TCPS_LISTEN, |
| ipst); |
| if (connp == NULL) |
| goto discard_pkt; |
| |
| if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) || |
| (ira->ira_flags & IRAF_IPSEC_SECURE)) { |
| mp = ipsec_check_inbound_policy(mp, connp, |
| ipha, NULL, ira); |
| if (mp == NULL) { |
| BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); |
| /* Note that mp is NULL */ |
| ip_drop_input("ipIfStatsInDiscards", mp, ill); |
| CONN_DEC_REF(connp); |
| return; |
| } |
| } |
| |
| ira->ira_flags |= IRAF_ICMP_ERROR; |
| ira->ira_ill = ira->ira_rill = NULL; |
| if (IPCL_IS_TCP(connp)) { |
| SQUEUE_ENTER_ONE(connp->conn_sqp, mp, |
| connp->conn_recvicmp, connp, ira, SQ_FILL, |
| SQTAG_TCP_INPUT_ICMP_ERR); |
| } else { |
| /* Not TCP; must be SOCK_RAW, IPPROTO_TCP */ |
| (connp->conn_recv)(connp, mp, NULL, ira); |
| CONN_DEC_REF(connp); |
| } |
| ira->ira_ill = ill; |
| ira->ira_rill = rill; |
| ira->ira_flags &= ~IRAF_ICMP_ERROR; |
| return; |
| |
| case IPPROTO_SCTP: |
| up = (uint16_t *)((uchar_t *)ipha + hdr_length); |
| /* Find a SCTP client stream for this packet. */ |
| ((uint16_t *)&ports)[0] = up[1]; |
| ((uint16_t *)&ports)[1] = up[0]; |
| |
| ira->ira_flags |= IRAF_ICMP_ERROR; |
| ip_fanout_sctp(mp, &ripha, NULL, ports, ira); |
| ira->ira_flags &= ~IRAF_ICMP_ERROR; |
| return; |
| |
| case IPPROTO_ESP: |
| case IPPROTO_AH: |
| if (!ipsec_loaded(ipss)) { |
| ip_proto_not_sup(mp, ira); |
| return; |
| } |
| |
| if (ipha->ipha_protocol == IPPROTO_ESP) |
| mp = ipsecesp_icmp_error(mp, ira); |
| else |
| mp = ipsecah_icmp_error(mp, ira); |
| if (mp == NULL) |
| return; |
| |
| /* Just in case ipsec didn't preserve the NULL b_cont */ |
| if (mp->b_cont != NULL) { |
| if (!pullupmsg(mp, -1)) |
| goto discard_pkt; |
| } |
| |
| /* |
| * Note that ira_pktlen and ira_ip_hdr_length are no longer |
| * correct, but we don't use them any more here. |
| * |
| * If succesful, the mp has been modified to not include |
| * the ESP/AH header so we can fanout to the ULP's icmp |
| * error handler. |
| */ |
| if (mp->b_wptr - mp->b_rptr < IP_SIMPLE_HDR_LENGTH) |
| goto truncated; |
| |
| /* Verify the modified message before any further processes. */ |
| ipha = (ipha_t *)mp->b_rptr; |
| hdr_length = IPH_HDR_LENGTH(ipha); |
| icmph = (icmph_t *)&mp->b_rptr[hdr_length]; |
| if (!icmp_inbound_verify_v4(mp, icmph, ira)) { |
| freemsg(mp); |
| return; |
| } |
| |
| icmp_inbound_error_fanout_v4(mp, icmph, ira); |
| return; |
| |
| case IPPROTO_ENCAP: { |
| /* Look for self-encapsulated packets that caused an error */ |
| ipha_t *in_ipha; |
| |
| /* |
| * Caller has verified that length has to be |
| * at least the size of IP header. |
| */ |
| ASSERT(hdr_length >= sizeof (ipha_t)); |
| /* |
| * Check the sanity of the inner IP header like |
| * we did for the outer header. |
| */ |
| in_ipha = (ipha_t *)((uchar_t *)ipha + hdr_length); |
| if ((IPH_HDR_VERSION(in_ipha) != IPV4_VERSION)) { |
| goto discard_pkt; |
| } |
| if (IPH_HDR_LENGTH(in_ipha) < sizeof (ipha_t)) { |
| goto discard_pkt; |
| } |
| /* Check for Self-encapsulated tunnels */ |
| if (in_ipha->ipha_src == ipha->ipha_src && |
| in_ipha->ipha_dst == ipha->ipha_dst) { |
| |
| mp = icmp_inbound_self_encap_error_v4(mp, ipha, |
| in_ipha); |
| if (mp == NULL) |
| goto discard_pkt; |
| |
| /* |
| * Just in case self_encap didn't preserve the NULL |
| * b_cont |
| */ |
| if (mp->b_cont != NULL) { |
| if (!pullupmsg(mp, -1)) |
| goto discard_pkt; |
| } |
| /* |
| * Note that ira_pktlen and ira_ip_hdr_length are no |
| * longer correct, but we don't use them any more here. |
| */ |
| if (mp->b_wptr - mp->b_rptr < IP_SIMPLE_HDR_LENGTH) |
| goto truncated; |
| |
| /* |
| * Verify the modified message before any further |
| * processes. |
| */ |
| ipha = (ipha_t *)mp->b_rptr; |
| hdr_length = IPH_HDR_LENGTH(ipha); |
| icmph = (icmph_t *)&mp->b_rptr[hdr_length]; |
| if (!icmp_inbound_verify_v4(mp, icmph, ira)) { |
| freemsg(mp); |
| return; |
| } |
| |
| /* |
			 * The packet in error is self-encapsulated.
| * And we are finding it further encapsulated |
| * which we could not have possibly generated. |
| */ |
| if (ipha->ipha_protocol == IPPROTO_ENCAP) { |
| goto discard_pkt; |
| } |
| icmp_inbound_error_fanout_v4(mp, icmph, ira); |
| return; |
| } |
| /* No self-encapsulated */ |
| /* FALLTHRU */ |
| } |
| case IPPROTO_IPV6: |
| if ((connp = ipcl_iptun_classify_v4(&ripha.ipha_src, |
| &ripha.ipha_dst, ipst)) != NULL) { |
| ira->ira_flags |= IRAF_ICMP_ERROR; |
| connp->conn_recvicmp(connp, mp, NULL, ira); |
| CONN_DEC_REF(connp); |
| ira->ira_flags &= ~IRAF_ICMP_ERROR; |
| return; |
| } |
| /* |
| * No IP tunnel is interested, fallthrough and see |
| * if a raw socket will want it. |
| */ |
| /* FALLTHRU */ |
| default: |
| ira->ira_flags |= IRAF_ICMP_ERROR; |
| ip_fanout_proto_v4(mp, &ripha, ira); |
| ira->ira_flags &= ~IRAF_ICMP_ERROR; |
| return; |
| } |
| /* NOTREACHED */ |
| discard_pkt: |
| BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); |
| ip1dbg(("icmp_inbound_error_fanout_v4: drop pkt\n")); |
| ip_drop_input("ipIfStatsInDiscards", mp, ill); |
| freemsg(mp); |
| return; |
| |
| truncated: |
	/* We pulled up everything already. Must be truncated */
| BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts); |
| ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill); |
| freemsg(mp); |
| } |
| |
| /* |
| * Common IP options parser. |
| * |
| * Setup routine: fill in *optp with options-parsing state, then |
| * tail-call ipoptp_next to return the first option. |
| */ |
| uint8_t |
| ipoptp_first(ipoptp_t *optp, ipha_t *ipha) |
| { |
| uint32_t totallen; /* total length of all options */ |
| |
| totallen = ipha->ipha_version_and_hdr_length - |
| (uint8_t)((IP_VERSION << 4) + IP_SIMPLE_HDR_LENGTH_IN_WORDS); |
| totallen <<= 2; |
| optp->ipoptp_next = (uint8_t *)(&ipha[1]); |
| optp->ipoptp_end = optp->ipoptp_next + totallen; |
| optp->ipoptp_flags = 0; |
| return (ipoptp_next(optp)); |
| } |
| |
| /* Like above but without an ipha_t */ |
| uint8_t |
| ipoptp_first2(ipoptp_t *optp, uint32_t totallen, uint8_t *opt) |
| { |
| optp->ipoptp_next = opt; |
| optp->ipoptp_end = optp->ipoptp_next + totallen; |
| optp->ipoptp_flags = 0; |
| return (ipoptp_next(optp)); |
| } |
| |
| /* |
| * Common IP options parser: extract next option. |
| */ |
| uint8_t |
| ipoptp_next(ipoptp_t *optp) |
| { |
| uint8_t *end = optp->ipoptp_end; |
| uint8_t *cur = optp->ipoptp_next; |
| uint8_t opt, len, pointer; |
| |
| /* |
| * If cur > end already, then the ipoptp_end or ipoptp_next pointer |
| * has been corrupted. |
| */ |
| ASSERT(cur <= end); |
| |
| if (cur == end) |
| return (IPOPT_EOL); |
| |
| opt = cur[IPOPT_OPTVAL]; |
| |
| /* |
| * Skip any NOP options. |
| */ |
| while (opt == IPOPT_NOP) { |
| cur++; |
| if (cur == end) |
| return (IPOPT_EOL); |
| opt = cur[IPOPT_OPTVAL]; |
| } |
| |
| if (opt == IPOPT_EOL) |
| return (IPOPT_EOL); |
| |
| /* |
| * Option requiring a length. |
| */ |
| if ((cur + 1) >= end) { |
| optp->ipoptp_flags |= IPOPTP_ERROR; |
| return (IPOPT_EOL); |
| } |
| len = cur[IPOPT_OLEN]; |
| if (len < 2) { |
| optp->ipoptp_flags |= IPOPTP_ERROR; |
| return (IPOPT_EOL); |
| } |
| optp->ipoptp_cur = cur; |
| optp->ipoptp_len = len; |
| optp->ipoptp_next = cur + len; |
| if (cur + len > end) { |
| optp->ipoptp_flags |= IPOPTP_ERROR; |
| return (IPOPT_EOL); |
| } |
| |
| /* |
| * For the options which require a pointer field, make sure |
| * its there, and make sure it points to either something |
| * inside this option, or the end of the option. |
| */ |
| switch (opt) { |
| case IPOPT_RR: |
| case IPOPT_TS: |
| case IPOPT_LSRR: |
| case IPOPT_SSRR: |
| if (len <= IPOPT_OFFSET) { |
| optp->ipoptp_flags |= IPOPTP_ERROR; |
| return (opt); |
| } |
| pointer = cur[IPOPT_OFFSET]; |
| if (pointer - 1 > len) { |
| optp->ipoptp_flags |= IPOPTP_ERROR; |
| return (opt); |
| } |
| break; |
| } |
| |
| /* |
| * Sanity check the pointer field based on the type of the |
| * option. |
| */ |
| switch (opt) { |
| case IPOPT_RR: |
| case IPOPT_SSRR: |
| case IPOPT_LSRR: |
| if (pointer < IPOPT_MINOFF_SR) |
| optp->ipoptp_flags |= IPOPTP_ERROR; |
| break; |
| case IPOPT_TS: |
| if (pointer < IPOPT_MINOFF_IT) |
| optp->ipoptp_flags |= IPOPTP_ERROR; |
| /* |
| * Note that the Internet Timestamp option also |
| * contains two four bit fields (the Overflow field, |
| * and the Flag field), which follow the pointer |
| * field. We don't need to check that these fields |
| * fall within the length of the option because this |
| * was implicitely done above. We've checked that the |
| * pointer value is at least IPOPT_MINOFF_IT, and that |
| * it falls within the option. Since IPOPT_MINOFF_IT > |
| * IPOPT_POS_OV_FLG, we don't need the explicit check. |
| */ |
| ASSERT(len > IPOPT_POS_OV_FLG); |
| break; |
| } |
| |
| return (opt); |
| } |
| |
| /* |
| * Use the outgoing IP header to create an IP_OPTIONS option the way |
| * it was passed down from the application. |
| * |
| * This is compatible with BSD in that it returns |
| * the reverse source route with the final destination |
| * as the last entry. The first 4 bytes of the option |
| * will contain the final destination. |
| */ |
| int |
| ip_opt_get_user(conn_t *connp, uchar_t *buf) |
| { |
| ipoptp_t opts; |
| |