blob: 6063fa01d22b680a9d92046b3354e7f365a11fba [file] [log] [blame]
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
ja97890fecf4ec2006-02-07 02:27:51 -08005 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07007 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
dh155122f4b3ec62007-01-19 16:59:38 -080021
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -070022/*
meem66b718c2010-04-12 21:02:11 -040023 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 1990 Mentat Inc.
Dan McDonald7199b8e2017-02-01 14:55:57 -050025 * Copyright (c) 2017 OmniTI Computer Consulting, Inc. All rights reserved.
Daniel Hoffman48bbca82017-02-17 11:48:20 -080026 * Copyright (c) 2016 by Delphix. All rights reserved.
Andy Fiddaman221e47f2020-09-18 20:04:57 +000027 * Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
Dan McDonald9495f632021-10-30 00:14:30 -040028 * Copyright 2021 Joyent, Inc.
Robert Mustacchi0accf552022-10-22 23:25:41 +000029 * Copyright 2022 Oxide Computer Company
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -070030 */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -070031
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -070032#include <sys/types.h>
33#include <sys/stream.h>
34#include <sys/dlpi.h>
35#include <sys/stropts.h>
36#include <sys/sysmacros.h>
37#include <sys/strsubr.h>
38#include <sys/strlog.h>
39#include <sys/strsun.h>
40#include <sys/zone.h>
41#define _SUN_TPI_VERSION 2
42#include <sys/tihdr.h>
43#include <sys/xti_inet.h>
44#include <sys/ddi.h>
Erik Nordmarkbd670b32009-11-11 11:49:49 -080045#include <sys/suntpi.h>
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -070046#include <sys/cmn_err.h>
47#include <sys/debug.h>
48#include <sys/kobj.h>
49#include <sys/modctl.h>
50#include <sys/atomic.h>
51#include <sys/policy.h>
jpk45916cd2006-03-24 12:29:20 -080052#include <sys/priv.h>
Eric Chengda14ceb2008-12-04 18:16:10 -080053#include <sys/taskq.h>
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -070054
55#include <sys/systm.h>
56#include <sys/param.h>
57#include <sys/kmem.h>
dr146992381a2a92006-10-20 16:37:58 -070058#include <sys/sdt.h>
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -070059#include <sys/socket.h>
60#include <sys/vtrace.h>
61#include <sys/isa_defs.h>
dr1469921b47e082008-01-20 23:43:45 -080062#include <sys/mac.h>
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -070063#include <net/if.h>
64#include <net/if_arp.h>
65#include <net/route.h>
66#include <sys/sockio.h>
67#include <netinet/in.h>
68#include <net/if_dl.h>
69
70#include <inet/common.h>
71#include <inet/mi.h>
72#include <inet/mib2.h>
73#include <inet/nd.h>
74#include <inet/arp.h>
75#include <inet/snmpcom.h>
nordmarkfc80c0d2007-10-11 22:57:36 -070076#include <inet/optcom.h>
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -070077#include <inet/kstatcom.h>
78
79#include <netinet/igmp_var.h>
80#include <netinet/ip6.h>
81#include <netinet/icmp6.h>
82#include <netinet/sctp.h>
83
84#include <inet/ip.h>
masputraff550d02005-10-22 22:50:14 -070085#include <inet/ip_impl.h>
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -070086#include <inet/ip6.h>
87#include <inet/ip6_asp.h>
88#include <inet/tcp.h>
masputraff550d02005-10-22 22:50:14 -070089#include <inet/tcp_impl.h>
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -070090#include <inet/ip_multi.h>
91#include <inet/ip_if.h>
92#include <inet/ip_ire.h>
sangeetac793af92006-08-11 05:59:29 -070093#include <inet/ip_ftable.h>
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -070094#include <inet/ip_rts.h>
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -070095#include <inet/ip_ndp.h>
96#include <inet/ip_listutils.h>
97#include <netinet/igmp.h>
98#include <netinet/ip_mroute.h>
99#include <inet/ipp_common.h>
Sebastien Roy45a4b792017-08-01 13:21:40 -0400100#include <inet/cc.h>
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700101
102#include <net/pfkeyv2.h>
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700103#include <inet/sadb.h>
104#include <inet/ipsec_impl.h>
Sebastien Roy2b24ab62009-09-22 22:04:45 -0400105#include <inet/iptun/iptun_impl.h>
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700106#include <inet/ipdrop.h>
dr146992381a2a92006-10-20 16:37:58 -0700107#include <inet/ip_netinfo.h>
Sangeeta Misradbed73c2009-11-03 23:15:19 -0800108#include <inet/ilb_ip.h>
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700109
110#include <sys/ethernet.h>
111#include <net/if_types.h>
112#include <sys/cpuvar.h>
113
114#include <ipp/ipp.h>
115#include <ipp/ipp_impl.h>
116#include <ipp/ipgpc/ipgpc.h>
117
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700118#include <sys/pattr.h>
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700119#include <inet/ipclassifier.h>
120#include <inet/sctp_ip.h>
priyankabe547022006-06-21 12:18:43 -0700121#include <inet/sctp/sctp_impl.h>
masputraff550d02005-10-22 22:50:14 -0700122#include <inet/udp_impl.h>
nordmarkfc80c0d2007-10-11 22:57:36 -0700123#include <inet/rawip_impl.h>
124#include <inet/rts_impl.h>
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700125
jpk45916cd2006-03-24 12:29:20 -0800126#include <sys/tsol/label.h>
127#include <sys/tsol/tnet.h>
128
Eric Chengda14ceb2008-12-04 18:16:10 -0800129#include <sys/squeue_impl.h>
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800130#include <inet/ip_arp.h>
jpk45916cd2006-03-24 12:29:20 -0800131
Erik Nordmarkb36a5612009-11-19 11:04:40 -0800132#include <sys/clock_impl.h> /* For LBOLT_FASTPATH{,64} */
133
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700134/*
135 * Values for squeue switch:
Eric Chengda14ceb2008-12-04 18:16:10 -0800136 * IP_SQUEUE_ENTER_NODRAIN: SQ_NODRAIN
137 * IP_SQUEUE_ENTER: SQ_PROCESS
138 * IP_SQUEUE_FILL: SQ_FILL
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700139 */
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800140int ip_squeue_enter = IP_SQUEUE_ENTER; /* Settable in /etc/system; defaults to IP_SQUEUE_ENTER */
dh155122f4b3ec62007-01-19 16:59:38 -0800141
Eric Chengda14ceb2008-12-04 18:16:10 -0800142int ip_squeue_flag;
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700143
dh155122f4b3ec62007-01-19 16:59:38 -0800144/*
145 * Settable in /etc/system
146 */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700147int ip_poll_normal_ms = 100;
148int ip_poll_normal_ticks = 0;
yz147064e7176232006-12-13 10:43:15 -0800149int ip_modclose_ackwait_ms = 3000;
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700150
151/*
carlsonj6a8288c2007-09-11 04:26:06 -0700152 * It would be nice to have these present only in DEBUG systems, but the
153 * current design of the global symbol checking logic requires them to be
154 * unconditionally present.
155 */
156uint_t ip_thread_data; /* TSD key for debug support */
157krwlock_t ip_thread_rwlock;
158list_t ip_thread_list;
159
160/*
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700161 * Structure to represent a linked list of msgblks. Used by ip_snmp_ functions.
162 */
163
164struct listptr_s {
165 mblk_t *lp_head; /* pointer to the head of the list */
166 mblk_t *lp_tail; /* pointer to the tail of the list */
167};
168
/* Short type name for struct listptr_s; iproutedata_t holds three of these. */
169typedef struct listptr_s listptr_t;
170
171/*
jpk45916cd2006-03-24 12:29:20 -0800172 * This is used by ip_snmp_get_mib2_ip_route_media and
173 * ip_snmp_get_mib2_ip6_route_media to carry the lists of return data.
174 */
175typedef struct iproutedata_s {
/* NOTE(review): meaning of ird_idx is not visible in this definition; confirm against its users. */
176 uint_t ird_idx;
meeme11c3f42009-01-06 20:16:25 -0500177 uint_t ird_flags; /* IRD_* flag bits, e.g. IRD_REPORT_ALL */
jpk45916cd2006-03-24 12:29:20 -0800178 listptr_t ird_route; /* mblk list for ipRouteEntryTable */
179 listptr_t ird_netmedia; /* mblk list for ipNetToMediaEntryTable */
180 listptr_t ird_attrs; /* mblk list for ipRouteAttributeTable */
181} iproutedata_t;
182
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800183/* Include ire_testhidden and IRE_IF_CLONE routes */
184#define IRD_REPORT_ALL 0x01
meeme11c3f42009-01-06 20:16:25 -0500185
jpk45916cd2006-03-24 12:29:20 -0800186/*
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700187 * Cluster specific hooks. These should be NULL when not booted as a cluster.
188 */
189
190/*
191 * Hook functions to enable cluster networking
192 * On non-clustered systems these vectors must always be NULL.
193 *
194 * Hook function to check whether the specified IP address is a shared IP address
195 * in the cluster
196 *
197 */
Lu Huafeng8e4b7702008-12-17 12:37:29 +0800198int (*cl_inet_isclusterwide)(netstackid_t stack_id, uint8_t protocol,
199 sa_family_t addr_family, uint8_t *laddrp, void *args) = NULL;
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700200
201/*
202 * Hook function to generate cluster wide ip fragment identifier
203 */
Lu Huafeng8e4b7702008-12-17 12:37:29 +0800204uint32_t (*cl_inet_ipident)(netstackid_t stack_id, uint8_t protocol,
205 sa_family_t addr_family, uint8_t *laddrp, uint8_t *faddrp,
206 void *args) = NULL;
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700207
208/*
Thejaswini Singarajipura9c2c14a2008-09-29 19:18:37 -0400209 * Hook function to generate cluster wide SPI.
210 */
Lu Huafeng8e4b7702008-12-17 12:37:29 +0800211void (*cl_inet_getspi)(netstackid_t, uint8_t, uint8_t *, size_t,
212 void *) = NULL;
Thejaswini Singarajipura9c2c14a2008-09-29 19:18:37 -0400213
214/*
215 * Hook function to verify if the SPI is already utilized.
216 */
217
Lu Huafeng8e4b7702008-12-17 12:37:29 +0800218int (*cl_inet_checkspi)(netstackid_t, uint8_t, uint32_t, void *) = NULL;
Thejaswini Singarajipura9c2c14a2008-09-29 19:18:37 -0400219
220/*
221 * Hook function to delete the SPI from the cluster wide repository.
222 */
223
Lu Huafeng8e4b7702008-12-17 12:37:29 +0800224void (*cl_inet_deletespi)(netstackid_t, uint8_t, uint32_t, void *) = NULL;
Thejaswini Singarajipura9c2c14a2008-09-29 19:18:37 -0400225
226/*
227 * Hook function to inform the cluster when packet received on an IDLE SA
228 */
229
Lu Huafeng8e4b7702008-12-17 12:37:29 +0800230void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t,
231 in6_addr_t, in6_addr_t, void *) = NULL;
Thejaswini Singarajipura9c2c14a2008-09-29 19:18:37 -0400232
233/*
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700234 * Synchronization notes:
235 *
236 * IP is a fully D_MP STREAMS module/driver. Thus it does not depend on any
237 * MT level protection given by STREAMS. IP uses a combination of its own
238 * internal serialization mechanism and standard Solaris locking techniques.
meeme11c3f42009-01-06 20:16:25 -0500239 * The internal serialization is per phyint. This is used to serialize
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800240 * plumbing operations, IPMP operations, most set ioctls, etc.
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700241 *
242 * Plumbing is a long sequence of operations involving message
243 * exchanges between IP, ARP and device drivers. Many set ioctls are typically
244 * involved in plumbing operations. A natural model is to serialize these
245 * ioctls one per ill. For example plumbing of hme0 and qfe0 can go on in
246 * parallel without any interference. But various set ioctls on hme0 are best
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800247 * serialized, along with IPMP operations and processing of DLPI control
248 * messages received from drivers on a per phyint basis. This serialization is
249 * provided by the ipsq_t and primitives operating on this. Details can
250 * be found in ip_if.c above the core primitives operating on ipsq_t.
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700251 *
252 * Lookups of an ipif or ill by a thread return a refheld ipif / ill.
253 * Similarly lookup of an ire by a thread also returns a refheld ire.
254 * In addition ipif's and ill's referenced by the ire are also indirectly
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800255 * refheld. Thus no ipif or ill can vanish as long as an ipif is refheld
meeme11c3f42009-01-06 20:16:25 -0500256 * directly or indirectly. For example an SIOCSLIFADDR ioctl that changes the
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700257 * address of an ipif has to go through the ipsq_t. This ensures that only
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800258 * one such exclusive operation proceeds at any time on the ipif. It then
259 * waits for all refcnts
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700260 * associated with this ipif to come down to zero. The address is changed
261 * only after the ipif has been quiesced. Then the ipif is brought up again.
262 * More details are described above the comment in ip_sioctl_flags.
263 *
264 * Packet processing is based mostly on IREs and is fully multi-threaded
265 * using standard Solaris MT techniques.
266 *
267 * There are explicit locks in IP to handle:
268 * - The ip_g_head list maintained by mi_open_link() and friends.
269 *
270 * - The reassembly data structures (one lock per hash bucket)
271 *
272 * - conn_lock is meant to protect conn_t fields. The fields actually
273 * protected by conn_lock are documented in the conn_t definition.
274 *
275 * - ire_lock to protect some of the fields of the ire, IRE tables
276 * (one lock per hash bucket). Refer to ip_ire.c for details.
277 *
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800278 * - ndp_g_lock and ncec_lock for protecting NCEs.
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700279 *
280 * - ill_lock protects fields of the ill and ipif. Details in ip.h
281 *
282 * - ill_g_lock: This is a global reader/writer lock. Protects the following
283 * * The AVL tree based global multi list of all ills.
284 * * The linked list of all ipifs of an ill
meeme11c3f42009-01-06 20:16:25 -0500285 * * The <ipsq-xop> mapping
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700286 * * <ill-phyint> association
287 * Insertion/deletion of an ill in the system, insertion/deletion of an ipif
meeme11c3f42009-01-06 20:16:25 -0500288 * into an ill, changing the <ipsq-xop> mapping of an ill, changing the
289 * <ill-phyint> assoc of an ill will all have to hold the ill_g_lock as
290 * writer for the actual duration of the insertion/deletion/change.
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700291 *
292 * - ill_lock: This is a per ill mutex.
meeme11c3f42009-01-06 20:16:25 -0500293 * It protects some members of the ill_t struct; see ip.h for details.
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700294 * It also protects the <ill-phyint> assoc.
295 * It also protects the list of ipifs hanging off the ill.
296 *
297 * - ipsq_lock: This is a per ipsq_t mutex lock.
meeme11c3f42009-01-06 20:16:25 -0500298 * This protects some members of the ipsq_t struct; see ip.h for details.
299 * It also protects the <ipsq-xop> mapping
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700300 *
meeme11c3f42009-01-06 20:16:25 -0500301 * - ipx_lock: This is a per ipxop_t mutex lock.
302 * This protects some members of the ipxop_t struct; see ip.h for details.
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700303 *
304 * - phyint_lock: This is a per phyint mutex lock. Protects just the
305 * phyint_flags
306 *
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700307 * - ip_addr_avail_lock: This is used to ensure the uniqueness of IP addresses.
308 * This lock is held in ipif_up_done and the ipif is marked IPIF_UP and the
309 * uniqueness check also done atomically.
310 *
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700311 * - ill_g_usesrc_lock: This readers/writer lock protects the usesrc
312 * group list linked by ill_usesrc_grp_next. It also protects the
313 * ill_usesrc_ifindex field. It is taken as a writer when a member of the
314 * group is being added or deleted. This lock is taken as a reader when
315 * walking the list/group(eg: to get the number of members in a usesrc group).
316 * Note, it is only necessary to take this lock if the ill_usesrc_grp_next
317 * field is changing state i.e from NULL to non-NULL or vice-versa. For
318 * example, it is not necessary to take this lock in the initial portion
meeme11c3f42009-01-06 20:16:25 -0500319 * of ip_sioctl_slifusesrc or at all in ip_sioctl_flags since these
320 * operations are executed exclusively and that ensures that the "usesrc
321 * group state" cannot change. The "usesrc group state" change can happen
322 * only in the latter part of ip_sioctl_slifusesrc and in ill_delete.
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700323 *
meeme11c3f42009-01-06 20:16:25 -0500324 * Changing <ill-phyint>, <ipsq-xop> associations:
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700325 *
326 * To change the <ill-phyint> association, the ill_g_lock must be held
327 * as writer, and the ill_locks of both the v4 and v6 instance of the ill
328 * must be held.
329 *
meeme11c3f42009-01-06 20:16:25 -0500330 * To change the <ipsq-xop> association, the ill_g_lock must be held as
331 * writer, the ipsq_lock must be held, and one must be writer on the ipsq.
332 * This is only done when ills are added or removed from IPMP groups.
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700333 *
334 * To add or delete an ipif from the list of ipifs hanging off the ill,
335 * ill_g_lock (writer) and ill_lock must be held and the thread must be
meeme11c3f42009-01-06 20:16:25 -0500336 * a writer on the associated ipsq.
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700337 *
338 * To add or delete an ill to the system, the ill_g_lock must be held as
339 * writer and the thread must be a writer on the associated ipsq.
340 *
341 * To add or delete an ilm to an ill, the ill_lock must be held and the thread
342 * must be a writer on the associated ipsq.
343 *
344 * Lock hierarchy
345 *
346 * Some lock hierarchy scenarios are listed below.
347 *
meeme11c3f42009-01-06 20:16:25 -0500348 * ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock -> ipx_lock
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700349 * ill_g_lock -> ill_lock(s) -> phyint_lock
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800350 * ill_g_lock -> ndp_g_lock -> ill_lock -> ncec_lock
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700351 * ill_g_lock -> ip_addr_avail_lock
352 * conn_lock -> irb_lock -> ill_lock -> ire_lock
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700353 * ill_g_lock -> ip_g_nd_lock
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800354 * ill_g_lock -> ips_ipmp_lock -> ill_lock -> nce_lock
355 * ill_g_lock -> ndp_g_lock -> ill_lock -> ncec_lock -> nce_lock
356 * arl_lock -> ill_lock
357 * ips_ire_dep_lock -> irb_lock
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700358 *
359 * When more than 1 ill lock is needed to be held, all ill lock addresses
360 * are sorted on address and locked starting from highest addressed lock
361 * downward.
362 *
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800363 * Multicast scenarios
364 * ips_ill_g_lock -> ill_mcast_lock
365 * conn_ilg_lock -> ips_ill_g_lock -> ill_lock
366 * ill_mcast_serializer -> ill_mcast_lock -> ips_ipmp_lock -> ill_lock
367 * ill_mcast_serializer -> ill_mcast_lock -> connf_lock -> conn_lock
368 * ill_mcast_serializer -> ill_mcast_lock -> conn_ilg_lock
369 * ill_mcast_serializer -> ill_mcast_lock -> ips_igmp_timer_lock
370 *
jpk45916cd2006-03-24 12:29:20 -0800371 * IPsec scenarios
372 *
373 * ipsa_lock -> ill_g_lock -> ill_lock
jpk45916cd2006-03-24 12:29:20 -0800374 * ill_g_usesrc_lock -> ill_g_lock -> ill_lock
375 *
376 * Trusted Solaris scenarios
377 *
378 * igsa_lock -> gcgrp_rwlock -> gcgrp_lock
379 * igsa_lock -> gcdb_lock
380 * gcgrp_rwlock -> ire_lock
381 * gcgrp_rwlock -> gcdb_lock
382 *
Eric Chengda14ceb2008-12-04 18:16:10 -0800383 * squeue(sq_lock), flow related (ft_lock, fe_lock) locking
384 *
385 * cpu_lock --> ill_lock --> sqset_lock --> sq_lock
386 * sq_lock -> conn_lock -> QLOCK(q)
387 * ill_lock -> ft_lock -> fe_lock
sangeetac793af92006-08-11 05:59:29 -0700388 *
389 * Routing/forwarding table locking notes:
390 *
391 * Lock acquisition order: Radix tree lock, irb_lock.
392 * Requirements:
393 * i. Walker must not hold any locks during the walker callback.
394 * ii Walker must not see a truncated tree during the walk because of any node
395 * deletion.
396 * iii Existing code assumes ire_bucket is valid if it is non-null and is used
397 * in many places in the code to walk the irb list. Thus even if all the
398 * ires in a bucket have been deleted, we still can't free the radix node
399 * until the ires have actually been inactive'd (freed).
400 *
401 * Tree traversal - Need to hold the global tree lock in read mode.
402 * Before dropping the global tree lock, need to increment the ire_refcnt
403 * to ensure that the radix node can't be deleted.
404 *
405 * Tree add - Need to hold the global tree lock in write mode to add a
406 * radix node. To prevent the node from being deleted, increment the
407 * irb_refcnt, after the node is added to the tree. The ire itself is
408 * added later while holding the irb_lock, but not the tree lock.
409 *
410 * Tree delete - Need to hold the global tree lock and irb_lock in write mode.
411 * All associated ires must be inactive (i.e. freed), and irb_refcnt
412 * must be zero.
413 *
414 * Walker - Increment irb_refcnt before calling the walker callback. Hold the
415 * global tree lock (read mode) for traversal.
416 *
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800417 * IRE dependencies - In some cases we hold ips_ire_dep_lock across ire_refrele
418 * hence we will acquire irb_lock while holding ips_ire_dep_lock.
419 *
danmcd437220c2007-09-04 06:48:33 -0700420 * IPsec notes :
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700421 *
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800422 * IP interacts with the IPsec code (AH/ESP) by storing IPsec attributes
423 * in the ip_xmit_attr_t and ip_recv_attr_t. For outbound datagrams, the
424 * ip_xmit_attr_t has the
danmcd437220c2007-09-04 06:48:33 -0700425 * information used by the IPsec code for applying the right level of
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800426 * protection. The information initialized by IP in the ip_xmit_attr_t
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700427 * is determined by the per-socket policy or global policy in the system.
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800428 * For inbound datagrams, the ip_recv_attr_t
429 * starts out with nothing in it. It gets filled
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700430 * with the right information if it goes through the AH/ESP code, which
431 * happens if the incoming packet is secure. The information initialized
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800432 * by AH/ESP, is later used by IP (during fanouts to ULP) to see whether
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700433 * the policy requirements needed by per-socket policy or global policy
434 * are met or not.
435 *
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700436 * For fully connected sockets i.e dst, src [addr, port] is known,
437 * conn_policy_cached is set indicating that policy has been cached.
438 * conn_in_enforce_policy may or may not be set depending on whether
439 * there is a global policy match or per-socket policy match.
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800440 * Policy inheriting happens in ip_policy_set once the destination is known.
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700441 * Once the right policy is set on the conn_t, policy cannot change for
442 * this socket. This makes life simpler for TCP (UDP ?) where
443 * re-transmissions go out with the same policy. For symmetry, policy
444 * is cached for fully connected UDP sockets also. Thus if policy is cached,
445 * it also implies that policy is latched i.e policy cannot change
446 * on these sockets. As we have the right policy on the conn, we don't
447 * have to look up global policy for every outbound and inbound datagram,
448 * which serves as an optimization. Note that a global policy change
449 * does not affect fully connected sockets if they have policy. If fully
450 * connected sockets did not have any policy associated with it, global
451 * policy change may affect them.
452 *
453 * IP Flow control notes:
Venugopal Iyerae6aa222009-02-17 01:31:30 -0800454 * ---------------------
455 * Non-TCP streams are flow controlled by IP. The way this is accomplished
456 * differs when ILL_CAPAB_DLD_DIRECT is enabled for that IP instance. When
457 * ILL_DIRECT_CAPABLE(ill) is TRUE, IP can do direct function calls into
458 * GLDv3. Otherwise packets are sent down to lower layers using STREAMS
459 * functions.
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700460 *
Venugopal Iyerae6aa222009-02-17 01:31:30 -0800461 * Per Tx ring udp flow control:
462 * This is applicable only when ILL_CAPAB_DLD_DIRECT capability is set in
463 * the ill (i.e. ILL_DIRECT_CAPABLE(ill) is true).
464 *
465 * The underlying link can expose multiple Tx rings to the GLDv3 mac layer.
466 * To achieve best performance, outgoing traffic needs to be fanned out among
467 * these Tx rings. mac_tx() is called (via str_mdata_fastpath_put()) to send
468 * traffic out of the NIC and it takes a fanout hint. UDP connections pass
469 * the address of connp as fanout hint to mac_tx(). Under flow controlled
470 * condition, mac_tx() returns a non-NULL cookie (ip_mac_tx_cookie_t). This
471 * cookie points to a specific Tx ring that is blocked. The cookie is used to
472 * hash into an entry in the idl_tx_list[] array. Each idl_tx_list_t
473 * points to drain_lists (idl_t's). These drain lists will store the blocked UDP
474 * connp's. The drain list is not a single list but a configurable number of
475 * lists.
476 *
477 * The diagram below shows idl_tx_list_t's and their drain_lists. ip_stack_t
478 * has an array of idl_tx_list_t. The size of the array is TX_FANOUT_SIZE
479 * which is equal to 128. This array in turn contains a pointer to idl_t[],
480 * the ip drain list. The idl_t[] array size is MIN(max_ncpus, 8). The drain
481 * list will point to the list of connp's that are flow controlled.
482 *
483 * --------------- ------- ------- -------
484 * |->|drain_list[0]|-->|connp|-->|connp|-->|connp|-->
485 * | --------------- ------- ------- -------
486 * | --------------- ------- ------- -------
487 * |->|drain_list[1]|-->|connp|-->|connp|-->|connp|-->
488 * ---------------- | --------------- ------- ------- -------
489 * |idl_tx_list[0]|->| --------------- ------- ------- -------
490 * ---------------- |->|drain_list[2]|-->|connp|-->|connp|-->|connp|-->
491 * | --------------- ------- ------- -------
492 * . . . . .
493 * | --------------- ------- ------- -------
494 * |->|drain_list[n]|-->|connp|-->|connp|-->|connp|-->
495 * --------------- ------- ------- -------
496 * --------------- ------- ------- -------
497 * |->|drain_list[0]|-->|connp|-->|connp|-->|connp|-->
498 * | --------------- ------- ------- -------
499 * | --------------- ------- ------- -------
500 * ---------------- |->|drain_list[1]|-->|connp|-->|connp|-->|connp|-->
501 * |idl_tx_list[1]|->| --------------- ------- ------- -------
502 * ---------------- | . . . .
503 * | --------------- ------- ------- -------
504 * |->|drain_list[n]|-->|connp|-->|connp|-->|connp|-->
505 * --------------- ------- ------- -------
506 * .....
507 * ----------------
508 * |idl_tx_list[n]|-> ...
509 * ----------------
510 *
meem3344d752010-03-27 02:33:20 -0400511 * When mac_tx() returns a cookie, the cookie is hashed into an index into
512 * ips_idl_tx_list[], and conn_drain_insert() is called with the idl_tx_list
513 * to insert the conn onto. conn_drain_insert() asserts flow control for the
514 * sockets via su_txq_full() (non-STREAMS) or QFULL on conn_wq (STREAMS).
515 * Further, conn_blocked is set to indicate that the conn is blocked.
Venugopal Iyerae6aa222009-02-17 01:31:30 -0800516 *
meem3344d752010-03-27 02:33:20 -0400517 * GLDv3 calls ill_flow_enable() when flow control is relieved. The cookie
518 * passed in the call to ill_flow_enable() identifies the blocked Tx ring and
519 * is again hashed to locate the appropriate idl_tx_list, which is then
520 * drained via conn_walk_drain(). conn_walk_drain() goes through each conn in
521 * the drain list and calls conn_drain_remove() to clear flow control (via
522 * calling su_txq_full() or clearing QFULL), and remove the conn from the
523 * drain list.
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700524 *
meem3344d752010-03-27 02:33:20 -0400525 * Note that the drain list is not a single list but a (configurable) array of
526 * lists (8 elements by default). Synchronization between drain insertion and
527 * flow control wakeup is handled by using idl_txl->txl_lock, and only
528 * conn_drain_insert() and conn_drain_remove() manipulate the drain list.
Venugopal Iyerae6aa222009-02-17 01:31:30 -0800529 *
meem3344d752010-03-27 02:33:20 -0400530 * Flow control via STREAMS is used when ILL_DIRECT_CAPABLE() returns FALSE.
531 * On the send side, if the packet cannot be sent down to the driver by IP
532 * (canput() fails), ip_xmit() drops the packet and returns EWOULDBLOCK to the
533 * caller, who may then invoke ixa_check_drain_insert() to insert the conn on
534 * the 0'th drain list. When ip_wsrv() runs on the ill_wq because flow
535 * control has been relieved, the blocked conns in the 0'th drain list are
536 * drained as in the non-STREAMS case.
Venugopal Iyerae6aa222009-02-17 01:31:30 -0800537 *
meem3344d752010-03-27 02:33:20 -0400538 * In both the STREAMS and non-STREAMS cases, the sockfs upcall to set QFULL
539 * is done when the conn is inserted into the drain list (conn_drain_insert())
540 * and cleared when the conn is removed from it (conn_drain_remove()).
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700541 *
542 * IPQOS notes:
543 *
544 * IPQoS Policies are applied to packets using IPPF (IP Policy framework)
545 * and IPQoS modules. IPPF includes hooks in IP at different control points
546 * (callout positions) which direct packets to IPQoS modules for policy
547 * processing. Policies, if present, are global.
548 *
549 * The callout positions are located in the following paths:
550 * o local_in (packets destined for this host)
551 * o local_out (packets originating from this host)
552 * o fwd_in (packets forwarded by this m/c - inbound)
553 * o fwd_out (packets forwarded by this m/c - outbound)
554 * Hooks at these callout points can be enabled/disabled using the ndd variable
555 * ip_policy_mask (a bit mask with the 4 LSB indicating the callout positions).
556 * By default all the callout positions are enabled.
557 *
558 * Outbound (local_out)
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800559 * Hooks are placed in ire_send_wire_v4 and ire_send_wire_v6.
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700560 *
561 * Inbound (local_in)
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800562 * Hooks are placed in ip_fanout_v4 and ip_fanout_v6.
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700563 *
564 * Forwarding (in and out)
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800565 * Hooks are placed in ire_recv_forward_v4/v6.
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700566 *
567 * IP Policy Framework processing (IPPF processing)
568 * Policy processing for a packet is initiated by ip_process, which ascertains
569 * that the classifier (ipgpc) is loaded and configured, failing which the
570 * packet resumes normal processing in IP. If the classifier is present, the
571 * packet is acted upon by one or more IPQoS modules (action instances), per
572 * filters configured in ipgpc and resumes normal IP processing thereafter.
573 * An action instance can drop a packet in course of its processing.
574 *
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700575 * Zones notes:
576 *
577 * The partitioning rules for networking are as follows:
578 * 1) Packets coming from a zone must have a source address belonging to that
579 * zone.
580 * 2) Packets coming from a zone can only be sent on a physical interface on
581 * which the zone has an IP address.
582 * 3) Between two zones on the same machine, packet delivery is only allowed if
583 * there's a matching route for the destination and zone in the forwarding
584 * table.
585 * 4) The TCP and UDP port spaces are per-zone; that is, two processes in
586 * different zones can bind to the same port with the wildcard address
587 * (INADDR_ANY).
588 *
589 * The granularity of interface partitioning is at the logical interface level.
590 * Therefore, every zone has its own IP addresses, and incoming packets can be
591 * attributed to a zone unambiguously. A logical interface is placed into a zone
592 * using the SIOCSLIFZONE ioctl; this sets the ipif_zoneid field in the ipif_t
593 * structure. Rule (1) is implemented by modifying the source address selection
594 * algorithm so that the list of eligible addresses is filtered based on the
595 * sending process zone.
596 *
597 * The Internet Routing Entries (IREs) are either exclusive to a zone or shared
598 * across all zones, depending on their type. Here is the break-up:
599 *
600 * IRE type Shared/exclusive
601 * -------- ----------------
602 * IRE_BROADCAST Exclusive
603 * IRE_DEFAULT (default routes) Shared (*)
nordmark5597b602006-09-14 18:05:27 -0700604 * IRE_LOCAL Exclusive (x)
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700605 * IRE_LOOPBACK Exclusive
606 * IRE_PREFIX (net routes) Shared (*)
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700607 * IRE_IF_NORESOLVER (interface routes) Exclusive
608 * IRE_IF_RESOLVER (interface routes) Exclusive
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800609 * IRE_IF_CLONE (interface routes) Exclusive
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700610 * IRE_HOST (host routes) Shared (*)
611 *
612 * (*) A zone can only use a default or off-subnet route if the gateway is
613 * directly reachable from the zone, that is, if the gateway's address matches
614 * one of the zone's logical interfaces.
615 *
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800616 * (x) IRE_LOCAL are handled a bit differently.
617 * When ip_restrict_interzone_loopback is set (the default),
618 * ire_route_recursive restricts loopback using an IRE_LOCAL
nordmark5597b602006-09-14 18:05:27 -0700619 * between zones to the case when L2 would have conceptually looped the packet
620 * back, i.e. the loopback which is required since neither Ethernet drivers
621 * nor Ethernet hardware loops them back. This is the case when the normal
622 * routes (ignoring IREs with different zoneids) would send out the packet on
meeme11c3f42009-01-06 20:16:25 -0500623 * the same ill as the ill with which the IRE_LOCAL is associated.
nordmark5597b602006-09-14 18:05:27 -0700624 *
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700625 * Multiple zones can share a common broadcast address; typically all zones
626 * share the 255.255.255.255 address. Incoming as well as locally originated
627 * broadcast packets must be dispatched to all the zones on the broadcast
628 * network. For directed broadcasts (e.g. 10.16.72.255) this is not trivial
629 * since some zones may not be on the 10.16.72/24 network. To handle this, each
630 * zone has its own set of IRE_BROADCAST entries; then, broadcast packets are
631 * sent to every zone that has an IRE_BROADCAST entry for the destination
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800632 * address on the input ill, see ip_input_broadcast().
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700633 *
634 * Applications in different zones can join the same multicast group address.
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800635 * The same logic applies for multicast as for broadcast. ip_input_multicast
636 * dispatches packets to all zones that have members on the physical interface.
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700637 */
638
639/*
640 * Squeue Fanout flags:
641 * 0: No fanout (the initial value set below).
642 * 1: Fanout across all squeues.
643 */
644boolean_t ip_squeue_fanout = 0;
645
646/*
647 * Maximum dups allowed per packet; initialized to 10.
 * NOTE(review): going by the variable name, "dups" presumably means
 * duplicate fragments of a single packet -- confirm at its use sites.
648 */
649uint_t ip_max_frag_dups = 10;
650
nordmarkfc80c0d2007-10-11 22:57:36 -0700651static int ip_open(queue_t *q, dev_t *devp, int flag, int sflag,
652 cred_t *credp, boolean_t isv6);
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800653static mblk_t *ip_xmit_attach_llhdr(mblk_t *, nce_t *);
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700654
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800655static boolean_t icmp_inbound_verify_v4(mblk_t *, icmph_t *, ip_recv_attr_t *);
656static void icmp_inbound_too_big_v4(icmph_t *, ip_recv_attr_t *);
657static void icmp_inbound_error_fanout_v4(mblk_t *, icmph_t *,
658 ip_recv_attr_t *);
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700659static void icmp_options_update(ipha_t *);
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800660static void icmp_param_problem(mblk_t *, uint8_t, ip_recv_attr_t *);
661static void icmp_pkt(mblk_t *, void *, size_t, ip_recv_attr_t *);
662static mblk_t *icmp_pkt_err_ok(mblk_t *, ip_recv_attr_t *);
663static void icmp_redirect_v4(mblk_t *mp, ipha_t *, icmph_t *,
664 ip_recv_attr_t *);
665static void icmp_send_redirect(mblk_t *, ipaddr_t, ip_recv_attr_t *);
666static void icmp_send_reply_v4(mblk_t *, ipha_t *, icmph_t *,
667 ip_recv_attr_t *);
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700668
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700669mblk_t *ip_dlpi_alloc(size_t, t_uscalar_t);
670char *ip_dot_addr(ipaddr_t, char *);
671mblk_t *ip_carve_mp(mblk_t **, ssize_t);
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700672static char *ip_dot_saddr(uchar_t *, char *);
Toomas Soome8a06b3d2018-10-15 22:13:16 +0300673static int ip_lrput(queue_t *, mblk_t *);
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700674ipaddr_t ip_net_mask(ipaddr_t);
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700675char *ip_nv_lookup(nv_t *, int);
Toomas Soome8a06b3d2018-10-15 22:13:16 +0300676int ip_rput(queue_t *, mblk_t *);
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700677static void ip_rput_dlpi_writer(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp,
678 void *dummy_arg);
Baban Kenkre6f773e22010-07-23 16:52:46 -0400679int ip_snmp_get(queue_t *, mblk_t *, int, boolean_t);
apersson31736642006-12-19 17:33:00 -0800680static mblk_t *ip_snmp_get_mib2_ip(queue_t *, mblk_t *,
Baban Kenkre6f773e22010-07-23 16:52:46 -0400681 mib2_ipIfStatsEntry_t *, ip_stack_t *, boolean_t);
dh155122f4b3ec62007-01-19 16:59:38 -0800682static mblk_t *ip_snmp_get_mib2_ip_traffic_stats(queue_t *, mblk_t *,
Baban Kenkre6f773e22010-07-23 16:52:46 -0400683 ip_stack_t *, boolean_t);
684static mblk_t *ip_snmp_get_mib2_ip6(queue_t *, mblk_t *, ip_stack_t *,
685 boolean_t);
dh155122f4b3ec62007-01-19 16:59:38 -0800686static mblk_t *ip_snmp_get_mib2_icmp(queue_t *, mblk_t *, ip_stack_t *ipst);
687static mblk_t *ip_snmp_get_mib2_icmp6(queue_t *, mblk_t *, ip_stack_t *ipst);
688static mblk_t *ip_snmp_get_mib2_igmp(queue_t *, mblk_t *, ip_stack_t *ipst);
689static mblk_t *ip_snmp_get_mib2_multi(queue_t *, mblk_t *, ip_stack_t *ipst);
690static mblk_t *ip_snmp_get_mib2_ip_addr(queue_t *, mblk_t *,
Baban Kenkre6f773e22010-07-23 16:52:46 -0400691 ip_stack_t *ipst, boolean_t);
dh155122f4b3ec62007-01-19 16:59:38 -0800692static mblk_t *ip_snmp_get_mib2_ip6_addr(queue_t *, mblk_t *,
Baban Kenkre6f773e22010-07-23 16:52:46 -0400693 ip_stack_t *ipst, boolean_t);
dh155122f4b3ec62007-01-19 16:59:38 -0800694static mblk_t *ip_snmp_get_mib2_ip_group_src(queue_t *, mblk_t *,
695 ip_stack_t *ipst);
696static mblk_t *ip_snmp_get_mib2_ip6_group_src(queue_t *, mblk_t *,
697 ip_stack_t *ipst);
698static mblk_t *ip_snmp_get_mib2_ip_group_mem(queue_t *, mblk_t *,
699 ip_stack_t *ipst);
700static mblk_t *ip_snmp_get_mib2_ip6_group_mem(queue_t *, mblk_t *,
701 ip_stack_t *ipst);
702static mblk_t *ip_snmp_get_mib2_virt_multi(queue_t *, mblk_t *,
703 ip_stack_t *ipst);
704static mblk_t *ip_snmp_get_mib2_multi_rtable(queue_t *, mblk_t *,
705 ip_stack_t *ipst);
meeme11c3f42009-01-06 20:16:25 -0500706static mblk_t *ip_snmp_get_mib2_ip_route_media(queue_t *, mblk_t *, int,
dh155122f4b3ec62007-01-19 16:59:38 -0800707 ip_stack_t *ipst);
meeme11c3f42009-01-06 20:16:25 -0500708static mblk_t *ip_snmp_get_mib2_ip6_route_media(queue_t *, mblk_t *, int,
dh155122f4b3ec62007-01-19 16:59:38 -0800709 ip_stack_t *ipst);
jpk45916cd2006-03-24 12:29:20 -0800710static void ip_snmp_get2_v4(ire_t *, iproutedata_t *);
711static void ip_snmp_get2_v6_route(ire_t *, iproutedata_t *);
Toomas Soome8a06b3d2018-10-15 22:13:16 +0300712static void ip_snmp_get2_v4_media(ncec_t *, void *);
713static void ip_snmp_get2_v6_media(ncec_t *, void *);
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700714int ip_snmp_set(queue_t *, int, int, uchar_t *, int);
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700715
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800716static mblk_t *ip_fragment_copyhdr(uchar_t *, int, int, ip_stack_t *,
Erik Nordmarkde8c4a12009-02-12 08:42:06 -0800717 mblk_t *);
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700718
dh155122f4b3ec62007-01-19 16:59:38 -0800719static void conn_drain_init(ip_stack_t *);
720static void conn_drain_fini(ip_stack_t *);
meem3344d752010-03-27 02:33:20 -0400721static void conn_drain(conn_t *connp, boolean_t closing);
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700722
Venugopal Iyerae6aa222009-02-17 01:31:30 -0800723static void conn_walk_drain(ip_stack_t *, idl_tx_list_t *);
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800724static void conn_walk_sctp(pfv_t, void *, zoneid_t, netstack_t *);
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700725
dh155122f4b3ec62007-01-19 16:59:38 -0800726static void *ip_stack_init(netstackid_t stackid, netstack_t *ns);
727static void ip_stack_shutdown(netstackid_t stackid, void *arg);
728static void ip_stack_fini(netstackid_t stackid, void *arg);
729
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700730static int ip_multirt_apply_membership(int (*fn)(conn_t *, boolean_t,
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800731 const in6_addr_t *, ipaddr_t, uint_t, mcast_record_t, const in6_addr_t *),
732 ire_t *, conn_t *, boolean_t, const in6_addr_t *, mcast_record_t,
733 const in6_addr_t *);
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700734
Eric Chengda14ceb2008-12-04 18:16:10 -0800735static int ip_squeue_switch(int);
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700736
dh155122f4b3ec62007-01-19 16:59:38 -0800737static void *ip_kstat_init(netstackid_t, ip_stack_t *);
738static void ip_kstat_fini(netstackid_t, kstat_t *);
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700739static int ip_kstat_update(kstat_t *kp, int rw);
dh155122f4b3ec62007-01-19 16:59:38 -0800740static void *icmp_kstat_init(netstackid_t);
741static void icmp_kstat_fini(netstackid_t, kstat_t *);
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700742static int icmp_kstat_update(kstat_t *kp, int rw);
dh155122f4b3ec62007-01-19 16:59:38 -0800743static void *ip_kstat2_init(netstackid_t, ip_stat_t *);
744static void ip_kstat2_fini(netstackid_t, kstat_t *);
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700745
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800746static void ipobs_init(ip_stack_t *);
747static void ipobs_fini(ip_stack_t *);
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700748
Kacheong Poon5dd46ab2010-07-19 17:27:45 -0700749static int ip_tp_cpu_update(cpu_setup_t, int, void *);
750
/*
 * Global ipaddr_t initialized from IP_HOST_MASK.
 * NOTE(review): the name suggests the all-ones IPv4 address -- confirm
 * against the definition of IP_HOST_MASK.
 */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700751ipaddr_t ip_g_all_ones = IP_HOST_MASK;
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700752
/*
 * File-local counter with no explicit initializer, so it starts at zero.
 * NOTE(review): presumably counts pullups done on the receive path --
 * confirm at its use sites.
 */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700753static long ip_rput_pullups;
754int dohwcksum = 1; /* use h/w cksum if supported by the hardware */
755
/* Two vmem arenas for minor numbers, split by range as noted on each. */
gt145670aa92d852008-01-11 12:54:58 -0800756vmem_t *ip_minor_arena_sa; /* for minor nos. from INET_MIN_DEV+2 thru 2^^18-1 */
757vmem_t *ip_minor_arena_la; /* for minor nos. from 2^^18 thru 2^^32-1 */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700758
/*
 * No explicit initializer, so it starts at zero.
 * NOTE(review): presumably a debug verbosity level -- confirm.
 */
dh155122f4b3ec62007-01-19 16:59:38 -0800759int ip_debug;
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700760
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700761/*
762 * Multirouting/CGTP state.
 * ip_cgtp_filter_rev is initialized from CGTP_FILTER_REV and records the
 * CGTP hooks version.
763 */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700764int ip_cgtp_filter_rev = CGTP_FILTER_REV; /* CGTP hooks version */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700765
766/*
Girish Moodalbail6e91bba2010-03-26 17:53:11 -0400767 * IP tunables related declarations. Definitions are in ip_tunables.c
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700768 */
Girish Moodalbail6e91bba2010-03-26 17:53:11 -0400769extern mod_prop_info_t ip_propinfo_tbl[];
770extern int ip_propinfo_count;
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700771
772/*
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700773 * Table of IP ioctls encoding the various properties of the ioctl and
774 * indexed based on the last byte of the ioctl command. Occasionally there
775 * is a clash, and there is more than 1 ioctl with the same last byte.
776 * In such a case 1 ioctl is encoded in the ndx table and the remaining
777 * ioctls are encoded in the misc table. An entry in the ndx table is
778 * retrieved by indexing on the last byte of the ioctl command and comparing
779 * the ioctl command with the value in the ndx table. In the event of a
780 * mismatch the misc table is then searched sequentially for the desired
781 * ioctl command.
782 *
783 * Entry: <command> <copyin_size> <flags> <cmd_type> <function> <restart_func>
784 */
785ip_ioctl_cmd_t ip_ndx_ioctl_table[] = {
786 /* 000 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
787 /* 001 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
788 /* 002 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
789 /* 003 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
790 /* 004 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
791 /* 005 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
792 /* 006 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
793 /* 007 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
794 /* 008 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
795 /* 009 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
796
797 /* 010 */ { SIOCADDRT, sizeof (struct rtentry), IPI_PRIV,
798 MISC_CMD, ip_siocaddrt, NULL },
799 /* 011 */ { SIOCDELRT, sizeof (struct rtentry), IPI_PRIV,
800 MISC_CMD, ip_siocdelrt, NULL },
801
802 /* 012 */ { SIOCSIFADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
803 IF_CMD, ip_sioctl_addr, ip_sioctl_addr_restart },
meeme11c3f42009-01-06 20:16:25 -0500804 /* 013 */ { SIOCGIFADDR, sizeof (struct ifreq), IPI_GET_CMD,
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700805 IF_CMD, ip_sioctl_get_addr, NULL },
806
807 /* 014 */ { SIOCSIFDSTADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
808 IF_CMD, ip_sioctl_dstaddr, ip_sioctl_dstaddr_restart },
809 /* 015 */ { SIOCGIFDSTADDR, sizeof (struct ifreq),
meeme11c3f42009-01-06 20:16:25 -0500810 IPI_GET_CMD, IF_CMD, ip_sioctl_get_dstaddr, NULL },
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700811
812 /* 016 */ { SIOCSIFFLAGS, sizeof (struct ifreq),
meeme11c3f42009-01-06 20:16:25 -0500813 IPI_PRIV | IPI_WR,
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700814 IF_CMD, ip_sioctl_flags, ip_sioctl_flags_restart },
815 /* 017 */ { SIOCGIFFLAGS, sizeof (struct ifreq),
meeme11c3f42009-01-06 20:16:25 -0500816 IPI_MODOK | IPI_GET_CMD,
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700817 IF_CMD, ip_sioctl_get_flags, NULL },
818
819 /* 018 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
820 /* 019 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
821
822 /* copyin size cannot be coded for SIOCGIFCONF */
meem98e93c22007-08-31 12:48:28 -0700823 /* 020 */ { O_SIOCGIFCONF, 0, IPI_GET_CMD,
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700824 MISC_CMD, ip_sioctl_get_ifconf, NULL },
825
826 /* 021 */ { SIOCSIFMTU, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
827 IF_CMD, ip_sioctl_mtu, NULL },
meeme11c3f42009-01-06 20:16:25 -0500828 /* 022 */ { SIOCGIFMTU, sizeof (struct ifreq), IPI_GET_CMD,
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700829 IF_CMD, ip_sioctl_get_mtu, NULL },
830 /* 023 */ { SIOCGIFBRDADDR, sizeof (struct ifreq),
meeme11c3f42009-01-06 20:16:25 -0500831 IPI_GET_CMD, IF_CMD, ip_sioctl_get_brdaddr, NULL },
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700832 /* 024 */ { SIOCSIFBRDADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
833 IF_CMD, ip_sioctl_brdaddr, NULL },
834 /* 025 */ { SIOCGIFNETMASK, sizeof (struct ifreq),
meeme11c3f42009-01-06 20:16:25 -0500835 IPI_GET_CMD, IF_CMD, ip_sioctl_get_netmask, NULL },
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700836 /* 026 */ { SIOCSIFNETMASK, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
837 IF_CMD, ip_sioctl_netmask, ip_sioctl_netmask_restart },
838 /* 027 */ { SIOCGIFMETRIC, sizeof (struct ifreq),
meeme11c3f42009-01-06 20:16:25 -0500839 IPI_GET_CMD, IF_CMD, ip_sioctl_get_metric, NULL },
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700840 /* 028 */ { SIOCSIFMETRIC, sizeof (struct ifreq), IPI_PRIV,
841 IF_CMD, ip_sioctl_metric, NULL },
842 /* 029 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
843
844 /* See 166-168 below for extended SIOC*XARP ioctls */
meeme11c3f42009-01-06 20:16:25 -0500845 /* 030 */ { SIOCSARP, sizeof (struct arpreq), IPI_PRIV | IPI_WR,
meem98e93c22007-08-31 12:48:28 -0700846 ARP_CMD, ip_sioctl_arp, NULL },
meeme11c3f42009-01-06 20:16:25 -0500847 /* 031 */ { SIOCGARP, sizeof (struct arpreq), IPI_GET_CMD,
meem98e93c22007-08-31 12:48:28 -0700848 ARP_CMD, ip_sioctl_arp, NULL },
meeme11c3f42009-01-06 20:16:25 -0500849 /* 032 */ { SIOCDARP, sizeof (struct arpreq), IPI_PRIV | IPI_WR,
meem98e93c22007-08-31 12:48:28 -0700850 ARP_CMD, ip_sioctl_arp, NULL },
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700851
852 /* 033 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
853 /* 034 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
854 /* 035 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
855 /* 036 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
856 /* 037 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
857 /* 038 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
858 /* 039 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
859 /* 040 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
860 /* 041 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
861 /* 042 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
862 /* 043 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
863 /* 044 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
864 /* 045 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
865 /* 046 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
866 /* 047 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
867 /* 048 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
868 /* 049 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
869 /* 050 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
870 /* 051 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
871 /* 052 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
872 /* 053 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
873
874 /* 054 */ { IF_UNITSEL, sizeof (int), IPI_PRIV | IPI_WR | IPI_MODOK,
875 MISC_CMD, if_unitsel, if_unitsel_restart },
876
877 /* 055 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
878 /* 056 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
879 /* 057 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
880 /* 058 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
881 /* 059 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
882 /* 060 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
883 /* 061 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
884 /* 062 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
885 /* 063 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
886 /* 064 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
887 /* 065 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
888 /* 066 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
889 /* 067 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
890 /* 068 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
891 /* 069 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
892 /* 070 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
893 /* 071 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
894 /* 072 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
895
896 /* 073 */ { SIOCSIFNAME, sizeof (struct ifreq),
897 IPI_PRIV | IPI_WR | IPI_MODOK,
898 IF_CMD, ip_sioctl_sifname, NULL },
899
900 /* 074 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
901 /* 075 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
902 /* 076 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
903 /* 077 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
904 /* 078 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
905 /* 079 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
906 /* 080 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
907 /* 081 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
908 /* 082 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
909 /* 083 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
910 /* 084 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
911 /* 085 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
912 /* 086 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
913
meeme11c3f42009-01-06 20:16:25 -0500914 /* 087 */ { SIOCGIFNUM, sizeof (int), IPI_GET_CMD,
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700915 MISC_CMD, ip_sioctl_get_ifnum, NULL },
meeme11c3f42009-01-06 20:16:25 -0500916 /* 088 */ { SIOCGIFMUXID, sizeof (struct ifreq), IPI_GET_CMD,
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700917 IF_CMD, ip_sioctl_get_muxid, NULL },
918 /* 089 */ { SIOCSIFMUXID, sizeof (struct ifreq),
meeme11c3f42009-01-06 20:16:25 -0500919 IPI_PRIV | IPI_WR, IF_CMD, ip_sioctl_muxid, NULL },
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700920
921 /* Both if and lif variants share same func */
meeme11c3f42009-01-06 20:16:25 -0500922 /* 090 */ { SIOCGIFINDEX, sizeof (struct ifreq), IPI_GET_CMD,
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700923 IF_CMD, ip_sioctl_get_lifindex, NULL },
924 /* Both if and lif variants share same func */
925 /* 091 */ { SIOCSIFINDEX, sizeof (struct ifreq),
meeme11c3f42009-01-06 20:16:25 -0500926 IPI_PRIV | IPI_WR, IF_CMD, ip_sioctl_slifindex, NULL },
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700927
928 /* copyin size cannot be coded for SIOCGIFCONF */
meem98e93c22007-08-31 12:48:28 -0700929 /* 092 */ { SIOCGIFCONF, 0, IPI_GET_CMD,
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700930 MISC_CMD, ip_sioctl_get_ifconf, NULL },
931 /* 093 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
932 /* 094 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
933 /* 095 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
934 /* 096 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
935 /* 097 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
936 /* 098 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
937 /* 099 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
938 /* 100 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
939 /* 101 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
940 /* 102 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
941 /* 103 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
942 /* 104 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
943 /* 105 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
944 /* 106 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
945 /* 107 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
946 /* 108 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
947 /* 109 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
948
949 /* 110 */ { SIOCLIFREMOVEIF, sizeof (struct lifreq),
meeme11c3f42009-01-06 20:16:25 -0500950 IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_removeif,
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700951 ip_sioctl_removeif_restart },
952 /* 111 */ { SIOCLIFADDIF, sizeof (struct lifreq),
meeme11c3f42009-01-06 20:16:25 -0500953 IPI_GET_CMD | IPI_PRIV | IPI_WR,
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700954 LIF_CMD, ip_sioctl_addif, NULL },
955#define SIOCLIFADDR_NDX 112
956 /* 112 */ { SIOCSLIFADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
957 LIF_CMD, ip_sioctl_addr, ip_sioctl_addr_restart },
958 /* 113 */ { SIOCGLIFADDR, sizeof (struct lifreq),
meeme11c3f42009-01-06 20:16:25 -0500959 IPI_GET_CMD, LIF_CMD, ip_sioctl_get_addr, NULL },
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700960 /* 114 */ { SIOCSLIFDSTADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
961 LIF_CMD, ip_sioctl_dstaddr, ip_sioctl_dstaddr_restart },
962 /* 115 */ { SIOCGLIFDSTADDR, sizeof (struct lifreq),
meeme11c3f42009-01-06 20:16:25 -0500963 IPI_GET_CMD, LIF_CMD, ip_sioctl_get_dstaddr, NULL },
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700964 /* 116 */ { SIOCSLIFFLAGS, sizeof (struct lifreq),
meeme11c3f42009-01-06 20:16:25 -0500965 IPI_PRIV | IPI_WR,
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700966 LIF_CMD, ip_sioctl_flags, ip_sioctl_flags_restart },
967 /* 117 */ { SIOCGLIFFLAGS, sizeof (struct lifreq),
meeme11c3f42009-01-06 20:16:25 -0500968 IPI_GET_CMD | IPI_MODOK,
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700969 LIF_CMD, ip_sioctl_get_flags, NULL },
970
971 /* 118 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
972 /* 119 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
973
meem98e93c22007-08-31 12:48:28 -0700974 /* 120 */ { O_SIOCGLIFCONF, 0, IPI_GET_CMD, MISC_CMD,
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700975 ip_sioctl_get_lifconf, NULL },
976 /* 121 */ { SIOCSLIFMTU, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
977 LIF_CMD, ip_sioctl_mtu, NULL },
meeme11c3f42009-01-06 20:16:25 -0500978 /* 122 */ { SIOCGLIFMTU, sizeof (struct lifreq), IPI_GET_CMD,
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700979 LIF_CMD, ip_sioctl_get_mtu, NULL },
980 /* 123 */ { SIOCGLIFBRDADDR, sizeof (struct lifreq),
meeme11c3f42009-01-06 20:16:25 -0500981 IPI_GET_CMD, LIF_CMD, ip_sioctl_get_brdaddr, NULL },
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700982 /* 124 */ { SIOCSLIFBRDADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
983 LIF_CMD, ip_sioctl_brdaddr, NULL },
984 /* 125 */ { SIOCGLIFNETMASK, sizeof (struct lifreq),
meeme11c3f42009-01-06 20:16:25 -0500985 IPI_GET_CMD, LIF_CMD, ip_sioctl_get_netmask, NULL },
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700986 /* 126 */ { SIOCSLIFNETMASK, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
987 LIF_CMD, ip_sioctl_netmask, ip_sioctl_netmask_restart },
988 /* 127 */ { SIOCGLIFMETRIC, sizeof (struct lifreq),
meeme11c3f42009-01-06 20:16:25 -0500989 IPI_GET_CMD, LIF_CMD, ip_sioctl_get_metric, NULL },
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700990 /* 128 */ { SIOCSLIFMETRIC, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
991 LIF_CMD, ip_sioctl_metric, NULL },
992 /* 129 */ { SIOCSLIFNAME, sizeof (struct lifreq),
meeme11c3f42009-01-06 20:16:25 -0500993 IPI_PRIV | IPI_WR | IPI_MODOK,
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700994 LIF_CMD, ip_sioctl_slifname,
995 ip_sioctl_slifname_restart },
996
meeme11c3f42009-01-06 20:16:25 -0500997 /* 130 */ { SIOCGLIFNUM, sizeof (struct lifnum), IPI_GET_CMD,
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700998 MISC_CMD, ip_sioctl_get_lifnum, NULL },
999 /* 131 */ { SIOCGLIFMUXID, sizeof (struct lifreq),
meeme11c3f42009-01-06 20:16:25 -05001000 IPI_GET_CMD, LIF_CMD, ip_sioctl_get_muxid, NULL },
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001001 /* 132 */ { SIOCSLIFMUXID, sizeof (struct lifreq),
meeme11c3f42009-01-06 20:16:25 -05001002 IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_muxid, NULL },
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001003 /* 133 */ { SIOCGLIFINDEX, sizeof (struct lifreq),
meeme11c3f42009-01-06 20:16:25 -05001004 IPI_GET_CMD, LIF_CMD, ip_sioctl_get_lifindex, 0 },
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001005 /* 134 */ { SIOCSLIFINDEX, sizeof (struct lifreq),
meeme11c3f42009-01-06 20:16:25 -05001006 IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_slifindex, 0 },
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001007 /* 135 */ { SIOCSLIFTOKEN, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
1008 LIF_CMD, ip_sioctl_token, NULL },
1009 /* 136 */ { SIOCGLIFTOKEN, sizeof (struct lifreq),
meeme11c3f42009-01-06 20:16:25 -05001010 IPI_GET_CMD, LIF_CMD, ip_sioctl_get_token, NULL },
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001011 /* 137 */ { SIOCSLIFSUBNET, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
1012 LIF_CMD, ip_sioctl_subnet, ip_sioctl_subnet_restart },
1013 /* 138 */ { SIOCGLIFSUBNET, sizeof (struct lifreq),
meeme11c3f42009-01-06 20:16:25 -05001014 IPI_GET_CMD, LIF_CMD, ip_sioctl_get_subnet, NULL },
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001015 /* 139 */ { SIOCSLIFLNKINFO, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
1016 LIF_CMD, ip_sioctl_lnkinfo, NULL },
1017
1018 /* 140 */ { SIOCGLIFLNKINFO, sizeof (struct lifreq),
meeme11c3f42009-01-06 20:16:25 -05001019 IPI_GET_CMD, LIF_CMD, ip_sioctl_get_lnkinfo, NULL },
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001020 /* 141 */ { SIOCLIFDELND, sizeof (struct lifreq), IPI_PRIV,
1021 LIF_CMD, ip_siocdelndp_v6, NULL },
1022 /* 142 */ { SIOCLIFGETND, sizeof (struct lifreq), IPI_GET_CMD,
1023 LIF_CMD, ip_siocqueryndp_v6, NULL },
1024 /* 143 */ { SIOCLIFSETND, sizeof (struct lifreq), IPI_PRIV,
1025 LIF_CMD, ip_siocsetndp_v6, NULL },
1026 /* 144 */ { SIOCTMYADDR, sizeof (struct sioc_addrreq), IPI_GET_CMD,
1027 MISC_CMD, ip_sioctl_tmyaddr, NULL },
1028 /* 145 */ { SIOCTONLINK, sizeof (struct sioc_addrreq), IPI_GET_CMD,
1029 MISC_CMD, ip_sioctl_tonlink, NULL },
1030 /* 146 */ { SIOCTMYSITE, sizeof (struct sioc_addrreq), 0,
1031 MISC_CMD, ip_sioctl_tmysite, NULL },
Sebastien Roy2b24ab62009-09-22 22:04:45 -04001032 /* 147 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
1033 /* 148 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
Dan McDonald843ea702014-01-19 11:47:59 -05001034
1035 /* Old *IPSECONFIG ioctls are now deprecated, now see spdsock.c */
1036 /* 149 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
1037 /* 150 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
1038 /* 151 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
1039 /* 152 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001040
meeme11c3f42009-01-06 20:16:25 -05001041 /* 153 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
1042
meemc445e3e2009-02-05 14:37:01 -05001043 /* 154 */ { SIOCGLIFBINDING, sizeof (struct lifreq), IPI_GET_CMD,
1044 LIF_CMD, ip_sioctl_get_binding, NULL },
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001045 /* 155 */ { SIOCSLIFGROUPNAME, sizeof (struct lifreq),
meeme11c3f42009-01-06 20:16:25 -05001046 IPI_PRIV | IPI_WR,
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001047 LIF_CMD, ip_sioctl_groupname, ip_sioctl_groupname },
1048 /* 156 */ { SIOCGLIFGROUPNAME, sizeof (struct lifreq),
meeme11c3f42009-01-06 20:16:25 -05001049 IPI_GET_CMD, LIF_CMD, ip_sioctl_get_groupname, NULL },
1050 /* 157 */ { SIOCGLIFGROUPINFO, sizeof (lifgroupinfo_t),
1051 IPI_GET_CMD, MISC_CMD, ip_sioctl_groupinfo, NULL },
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001052
1053 /* Leave 158-160 unused; used to be SIOC*IFARP ioctls */
1054 /* 158 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
1055 /* 159 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
1056 /* 160 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
1057
meeme11c3f42009-01-06 20:16:25 -05001058 /* 161 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001059
1060 /* These are handled in ip_sioctl_copyin_setup itself */
1061 /* 162 */ { SIOCGIP6ADDRPOLICY, 0, IPI_NULL_BCONT,
1062 MISC_CMD, NULL, NULL },
1063 /* 163 */ { SIOCSIP6ADDRPOLICY, 0, IPI_PRIV | IPI_NULL_BCONT,
1064 MISC_CMD, NULL, NULL },
1065 /* 164 */ { SIOCGDSTINFO, 0, IPI_GET_CMD, MISC_CMD, NULL, NULL },
1066
meem98e93c22007-08-31 12:48:28 -07001067 /* 165 */ { SIOCGLIFCONF, 0, IPI_GET_CMD, MISC_CMD,
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001068 ip_sioctl_get_lifconf, NULL },
1069
meeme11c3f42009-01-06 20:16:25 -05001070 /* 166 */ { SIOCSXARP, sizeof (struct xarpreq), IPI_PRIV | IPI_WR,
meem98e93c22007-08-31 12:48:28 -07001071 XARP_CMD, ip_sioctl_arp, NULL },
meeme11c3f42009-01-06 20:16:25 -05001072 /* 167 */ { SIOCGXARP, sizeof (struct xarpreq), IPI_GET_CMD,
meem98e93c22007-08-31 12:48:28 -07001073 XARP_CMD, ip_sioctl_arp, NULL },
meeme11c3f42009-01-06 20:16:25 -05001074 /* 168 */ { SIOCDXARP, sizeof (struct xarpreq), IPI_PRIV | IPI_WR,
meem98e93c22007-08-31 12:48:28 -07001075 XARP_CMD, ip_sioctl_arp, NULL },
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001076
1077 /* SIOCPOPSOCKFS is not handled by IP */
1078 /* 169 */ { IPI_DONTCARE /* SIOCPOPSOCKFS */, 0, 0, 0, NULL, NULL },
1079
1080 /* 170 */ { SIOCGLIFZONE, sizeof (struct lifreq),
meeme11c3f42009-01-06 20:16:25 -05001081 IPI_GET_CMD, LIF_CMD, ip_sioctl_get_lifzone, NULL },
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001082 /* 171 */ { SIOCSLIFZONE, sizeof (struct lifreq),
meeme11c3f42009-01-06 20:16:25 -05001083 IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_slifzone,
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001084 ip_sioctl_slifzone_restart },
1085 /* 172-174 are SCTP ioctls and not handled by IP */
1086 /* 172 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
1087 /* 173 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
1088 /* 174 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
1089 /* 175 */ { SIOCGLIFUSESRC, sizeof (struct lifreq),
1090 IPI_GET_CMD, LIF_CMD,
1091 ip_sioctl_get_lifusesrc, 0 },
1092 /* 176 */ { SIOCSLIFUSESRC, sizeof (struct lifreq),
1093 IPI_PRIV | IPI_WR,
1094 LIF_CMD, ip_sioctl_slifusesrc,
1095 NULL },
1096 /* 177 */ { SIOCGLIFSRCOF, 0, IPI_GET_CMD, MISC_CMD,
1097 ip_sioctl_get_lifsrcof, NULL },
1098 /* 178 */ { SIOCGMSFILTER, sizeof (struct group_filter), IPI_GET_CMD,
meem98e93c22007-08-31 12:48:28 -07001099 MSFILT_CMD, ip_sioctl_msfilter, NULL },
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001100 /* 179 */ { SIOCSMSFILTER, sizeof (struct group_filter), 0,
meem98e93c22007-08-31 12:48:28 -07001101 MSFILT_CMD, ip_sioctl_msfilter, NULL },
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001102 /* 180 */ { SIOCGIPMSFILTER, sizeof (struct ip_msfilter), IPI_GET_CMD,
meem98e93c22007-08-31 12:48:28 -07001103 MSFILT_CMD, ip_sioctl_msfilter, NULL },
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001104 /* 181 */ { SIOCSIPMSFILTER, sizeof (struct ip_msfilter), 0,
meem98e93c22007-08-31 12:48:28 -07001105 MSFILT_CMD, ip_sioctl_msfilter, NULL },
meeme11c3f42009-01-06 20:16:25 -05001106 /* 182 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
meeme704a8f2007-10-30 11:15:43 -07001107 /* SIOCSENABLESDP is handled by SDP */
1108 /* 183 */ { IPI_DONTCARE /* SIOCSENABLESDP */, 0, 0, 0, NULL, NULL },
Yu Xiangning0f1702c2008-12-11 20:04:13 -08001109 /* 184 */ { IPI_DONTCARE /* SIOCSQPTR */, 0, 0, 0, NULL, NULL },
Darren Reeda6911612010-08-12 16:05:23 -07001110 /* 185 */ { SIOCGIFHWADDR, sizeof (struct ifreq), IPI_GET_CMD,
1111 IF_CMD, ip_sioctl_get_ifhwaddr, NULL },
Sangeeta Misradbed73c2009-11-03 23:15:19 -08001112 /* 186 */ { IPI_DONTCARE /* SIOCGSTAMP */, 0, 0, 0, NULL, NULL },
1113 /* 187 */ { SIOCILB, 0, IPI_PRIV | IPI_GET_CMD, MISC_CMD,
1114 ip_sioctl_ilb_cmd, NULL },
Girish Moodalbail6e91bba2010-03-26 17:53:11 -04001115 /* 188 */ { SIOCGETPROP, 0, IPI_GET_CMD, 0, NULL, NULL },
1116 /* 189 */ { SIOCSETPROP, 0, IPI_PRIV | IPI_WR, 0, NULL, NULL},
1117 /* 190 */ { SIOCGLIFDADSTATE, sizeof (struct lifreq),
1118 IPI_GET_CMD, LIF_CMD, ip_sioctl_get_dadstate, NULL },
1119 /* 191 */ { SIOCSLIFPREFIX, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
Darren Reeda6911612010-08-12 16:05:23 -07001120 LIF_CMD, ip_sioctl_prefix, ip_sioctl_prefix_restart },
1121 /* 192 */ { SIOCGLIFHWADDR, sizeof (struct lifreq), IPI_GET_CMD,
1122 LIF_CMD, ip_sioctl_get_lifhwaddr, NULL }
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001123};
1124
1125int ip_ndx_ioctl_count = sizeof (ip_ndx_ioctl_table) / sizeof (ip_ioctl_cmd_t);
1126
1127ip_ioctl_cmd_t ip_misc_ioctl_table[] = {
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001128 { I_LINK, 0, IPI_PRIV | IPI_WR, 0, NULL, NULL },
1129 { I_UNLINK, 0, IPI_PRIV | IPI_WR, 0, NULL, NULL },
1130 { I_PLINK, 0, IPI_PRIV | IPI_WR, 0, NULL, NULL },
1131 { I_PUNLINK, 0, IPI_PRIV | IPI_WR, 0, NULL, NULL },
1132 { ND_GET, 0, 0, 0, NULL, NULL },
1133 { ND_SET, 0, IPI_PRIV | IPI_WR, 0, NULL, NULL },
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001134 { IP_IOCTL, 0, 0, 0, NULL, NULL },
meeme11c3f42009-01-06 20:16:25 -05001135 { SIOCGETVIFCNT, sizeof (struct sioc_vif_req), IPI_GET_CMD,
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001136 MISC_CMD, mrt_ioctl},
meeme11c3f42009-01-06 20:16:25 -05001137 { SIOCGETSGCNT, sizeof (struct sioc_sg_req), IPI_GET_CMD,
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001138 MISC_CMD, mrt_ioctl},
meeme11c3f42009-01-06 20:16:25 -05001139 { SIOCGETLSGCNT, sizeof (struct sioc_lsg_req), IPI_GET_CMD,
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001140 MISC_CMD, mrt_ioctl}
1141};
1142
1143int ip_misc_ioctl_count =
1144 sizeof (ip_misc_ioctl_table) / sizeof (ip_ioctl_cmd_t);
1145
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001146int conn_drain_nthreads; /* Number of drainers reqd. */
1147 /* Settable in /etc/system */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001148/* Defined in ip_ire.c */
1149extern uint32_t ip_ire_max_bucket_cnt, ip6_ire_max_bucket_cnt;
1150extern uint32_t ip_ire_min_bucket_cnt, ip6_ire_min_bucket_cnt;
1151extern uint32_t ip_ire_mem_ratio, ip_ire_cpu_ratio;
1152
1153static nv_t ire_nv_arr[] = {
1154 { IRE_BROADCAST, "BROADCAST" },
1155 { IRE_LOCAL, "LOCAL" },
1156 { IRE_LOOPBACK, "LOOPBACK" },
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001157 { IRE_DEFAULT, "DEFAULT" },
1158 { IRE_PREFIX, "PREFIX" },
1159 { IRE_IF_NORESOLVER, "IF_NORESOL" },
1160 { IRE_IF_RESOLVER, "IF_RESOLV" },
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001161 { IRE_IF_CLONE, "IF_CLONE" },
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001162 { IRE_HOST, "HOST" },
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001163 { IRE_MULTICAST, "MULTICAST" },
1164 { IRE_NOROUTE, "NOROUTE" },
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001165 { 0 }
1166};
1167
1168nv_t *ire_nv_tbl = ire_nv_arr;
1169
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001170/* Simple ICMP IP Header Template */
1171static ipha_t icmp_ipha = {
1172 IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP
1173};
1174
1175struct module_info ip_mod_info = {
Yu Xiangning0f1702c2008-12-11 20:04:13 -08001176 IP_MOD_ID, IP_MOD_NAME, IP_MOD_MINPSZ, IP_MOD_MAXPSZ, IP_MOD_HIWAT,
1177 IP_MOD_LOWAT
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001178};
1179
carlsonj69bb4bb2006-08-14 14:10:48 -07001180/*
1181 * Duplicate static symbols within a module confuse mdb; so we avoid the
1182 * problem by making the symbols here distinct from those in udp.c.
1183 */
1184
nordmarkfc80c0d2007-10-11 22:57:36 -07001185/*
1186 * Entry points for IP as a device and as a module.
nordmarkfc80c0d2007-10-11 22:57:36 -07001187 * We have separate open functions for the /dev/ip and /dev/ip6 devices.
1188 */
1189static struct qinit iprinitv4 = {
Toomas Soome8a06b3d2018-10-15 22:13:16 +03001190 ip_rput, NULL, ip_openv4, ip_close, NULL, &ip_mod_info
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001191};
1192
nordmarkfc80c0d2007-10-11 22:57:36 -07001193struct qinit iprinitv6 = {
Toomas Soome8a06b3d2018-10-15 22:13:16 +03001194 ip_rput_v6, NULL, ip_openv6, ip_close, NULL, &ip_mod_info
nordmarkfc80c0d2007-10-11 22:57:36 -07001195};
1196
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001197static struct qinit ipwinit = {
Toomas Soome8a06b3d2018-10-15 22:13:16 +03001198 ip_wput_nondata, ip_wsrv, NULL, NULL, NULL, &ip_mod_info
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001199};
1200
carlsonj69bb4bb2006-08-14 14:10:48 -07001201static struct qinit iplrinit = {
Toomas Soome8a06b3d2018-10-15 22:13:16 +03001202 ip_lrput, NULL, ip_openv4, ip_close, NULL, &ip_mod_info
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001203};
1204
carlsonj69bb4bb2006-08-14 14:10:48 -07001205static struct qinit iplwinit = {
Toomas Soome8a06b3d2018-10-15 22:13:16 +03001206 ip_lwput, NULL, NULL, NULL, NULL, &ip_mod_info
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001207};
1208
nordmarkfc80c0d2007-10-11 22:57:36 -07001209/* For AF_INET aka /dev/ip */
1210struct streamtab ipinfov4 = {
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001211 &iprinitv4, &ipwinit, &iplrinit, &iplwinit
nordmarkfc80c0d2007-10-11 22:57:36 -07001212};
1213
1214/* For AF_INET6 aka /dev/ip6 */
1215struct streamtab ipinfov6 = {
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001216 &iprinitv6, &ipwinit, &iplrinit, &iplwinit
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001217};
1218
#if defined(DEBUG)
/*
 * DEBUG-only global, B_FALSE by default.  NOTE(review): by its name this
 * presumably lets SCTP checksum verification be skipped; the consumers are
 * not in this part of the file -- confirm before relying on it.
 */
boolean_t	skip_sctp_cksum = B_FALSE;
#endif	/* DEBUG */
nordmark5597b602006-09-14 18:05:27 -07001222
1223/*
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001224 * Generate an ICMP fragmentation needed message.
1225 * When called from ip_output side a minimal ip_recv_attr_t needs to be
1226 * constructed by the caller.
nordmark5597b602006-09-14 18:05:27 -07001227 */
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001228void
1229icmp_frag_needed(mblk_t *mp, int mtu, ip_recv_attr_t *ira)
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001230{
1231 icmph_t icmph;
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001232 ip_stack_t *ipst = ira->ira_ill->ill_ipst;
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001233
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001234 mp = icmp_pkt_err_ok(mp, ira);
1235 if (mp == NULL)
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001236 return;
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001237
1238 bzero(&icmph, sizeof (icmph_t));
1239 icmph.icmph_type = ICMP_DEST_UNREACHABLE;
1240 icmph.icmph_code = ICMP_FRAGMENTATION_NEEDED;
1241 icmph.icmph_du_mtu = htons((uint16_t)mtu);
dh155122f4b3ec62007-01-19 16:59:38 -08001242 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutFragNeeded);
1243 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDestUnreachs);
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001244
1245 icmp_pkt(mp, &icmph, sizeof (icmph_t), ira);
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001246}
1247
1248/*
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001249 * icmp_inbound_v4 deals with ICMP messages that are handled by IP.
1250 * If the ICMP message is consumed by IP, i.e., it should not be delivered
1251 * to any IPPROTO_ICMP raw sockets, then it returns NULL.
1252 * Likewise, if the ICMP error is misformed (too short, etc), then it
1253 * returns NULL. The caller uses this to determine whether or not to send
1254 * to raw sockets.
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001255 *
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001256 * All error messages are passed to the matching transport stream.
1257 *
1258 * The following cases are handled by icmp_inbound:
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001259 * 1) It needs to send a reply back and possibly delivering it
1260 * to the "interested" upper clients.
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001261 * 2) Return the mblk so that the caller can pass it to the RAW socket clients.
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001262 * 3) It needs to change some values in IP only.
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001263 * 4) It needs to change some values in IP and upper layers e.g TCP
1264 * by delivering an error to the upper layers.
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001265 *
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001266 * We handle the above three cases in the context of IPsec in the
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001267 * following way :
1268 *
1269 * 1) Send the reply back in the same way as the request came in.
1270 * If it came in encrypted, it goes out encrypted. If it came in
1271 * clear, it goes out in clear. Thus, this will prevent chosen
1272 * plain text attack.
1273 * 2) The client may or may not expect things to come in secure.
1274 * If it comes in secure, the policy constraints are checked
1275 * before delivering it to the upper layers. If it comes in
1276 * clear, ipsec_inbound_accept_clear will decide whether to
1277 * accept this in clear or not. In both the cases, if the returned
1278 * message (IP header + 8 bytes) that caused the icmp message has
1279 * AH/ESP headers, it is sent up to AH/ESP for validation before
1280 * sending up. If there are only 8 bytes of returned message, then
1281 * upper client will not be notified.
1282 * 3) Check with global policy to see whether it matches the constaints.
1283 * But this will be done only if icmp_accept_messages_in_clear is
1284 * zero.
1285 * 4) If we need to change both in IP and ULP, then the decision taken
1286 * while affecting the values in IP and while delivering up to TCP
1287 * should be the same.
1288 *
Toomas Soome8a06b3d2018-10-15 22:13:16 +03001289 * There are two cases.
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001290 *
Toomas Soome8a06b3d2018-10-15 22:13:16 +03001291 * a) If we reject data at the IP layer (ipsec_check_global_policy()
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001292 * failed), we will not deliver it to the ULP, even though they
1293 * are *willing* to accept in *clear*. This is fine as our global
1294 * disposition to icmp messages asks us reject the datagram.
1295 *
1296 * b) If we accept data at the IP layer (ipsec_check_global_policy()
1297 * succeeded or icmp_accept_messages_in_clear is 1), and not able
1298 * to deliver it to ULP (policy failed), it can lead to
1299 * consistency problems. The cases known at this time are
1300 * ICMP_DESTINATION_UNREACHABLE messages with following code
1301 * values :
1302 *
1303 * - ICMP_FRAGMENTATION_NEEDED : IP adapts to the new value
1304 * and Upper layer rejects. Then the communication will
1305 * come to a stop. This is solved by making similar decisions
1306 * at both levels. Currently, when we are unable to deliver
1307 * to the Upper Layer (due to policy failures) while IP has
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001308 * adjusted dce_pmtu, the next outbound datagram would
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001309 * generate a local ICMP_FRAGMENTATION_NEEDED message - which
1310 * will be with the right level of protection. Thus the right
1311 * value will be communicated even if we are not able to
1312 * communicate when we get from the wire initially. But this
1313 * assumes there would be at least one outbound datagram after
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001314 * IP has adjusted its dce_pmtu value. To make things
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001315 * simpler, we accept in clear after the validation of
1316 * AH/ESP headers.
1317 *
1318 * - Other ICMP ERRORS : We may not be able to deliver it to the
1319 * upper layer depending on the level of protection the upper
1320 * layer expects and the disposition in ipsec_inbound_accept_clear().
1321 * ipsec_inbound_accept_clear() decides whether a given ICMP error
1322 * should be accepted in clear when the Upper layer expects secure.
1323 * Thus the communication may get aborted by some bad ICMP
1324 * packets.
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001325 */
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001326mblk_t *
1327icmp_inbound_v4(mblk_t *mp, ip_recv_attr_t *ira)
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001328{
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001329 icmph_t *icmph;
1330 ipha_t *ipha; /* Outer header */
1331 int ip_hdr_length; /* Outer header length */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001332 boolean_t interested;
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001333 ipif_t *ipif;
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001334 uint32_t ts;
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001335 uint32_t *tsp;
1336 timestruc_t now;
1337 ill_t *ill = ira->ira_ill;
1338 ip_stack_t *ipst = ill->ill_ipst;
1339 zoneid_t zoneid = ira->ira_zoneid;
1340 int len_needed;
1341 mblk_t *mp_ret = NULL;
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001342
1343 ipha = (ipha_t *)mp->b_rptr;
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001344
dh155122f4b3ec62007-01-19 16:59:38 -08001345 BUMP_MIB(&ipst->ips_icmp_mib, icmpInMsgs);
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001346
1347 ip_hdr_length = ira->ira_ip_hdr_length;
1348 if ((mp->b_wptr - mp->b_rptr) < (ip_hdr_length + ICMPH_SIZE)) {
1349 if (ira->ira_pktlen < (ip_hdr_length + ICMPH_SIZE)) {
1350 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
1351 ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill);
1352 freemsg(mp);
1353 return (NULL);
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001354 }
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001355 /* Last chance to get real. */
1356 ipha = ip_pullup(mp, ip_hdr_length + ICMPH_SIZE, ira);
1357 if (ipha == NULL) {
1358 BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors);
1359 freemsg(mp);
1360 return (NULL);
1361 }
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001362 }
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001363
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001364 /* The IP header will always be a multiple of four bytes */
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001365 icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
1366 ip2dbg(("icmp_inbound_v4: type %d code %d\n", icmph->icmph_type,
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001367 icmph->icmph_code));
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001368
1369 /*
1370 * We will set "interested" to "true" if we should pass a copy to
1371 * the transport or if we handle the packet locally.
1372 */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001373 interested = B_FALSE;
1374 switch (icmph->icmph_type) {
1375 case ICMP_ECHO_REPLY:
dh155122f4b3ec62007-01-19 16:59:38 -08001376 BUMP_MIB(&ipst->ips_icmp_mib, icmpInEchoReps);
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001377 break;
1378 case ICMP_DEST_UNREACHABLE:
1379 if (icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED)
dh155122f4b3ec62007-01-19 16:59:38 -08001380 BUMP_MIB(&ipst->ips_icmp_mib, icmpInFragNeeded);
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001381 interested = B_TRUE; /* Pass up to transport */
dh155122f4b3ec62007-01-19 16:59:38 -08001382 BUMP_MIB(&ipst->ips_icmp_mib, icmpInDestUnreachs);
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001383 break;
1384 case ICMP_SOURCE_QUENCH:
1385 interested = B_TRUE; /* Pass up to transport */
dh155122f4b3ec62007-01-19 16:59:38 -08001386 BUMP_MIB(&ipst->ips_icmp_mib, icmpInSrcQuenchs);
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001387 break;
1388 case ICMP_REDIRECT:
dh155122f4b3ec62007-01-19 16:59:38 -08001389 if (!ipst->ips_ip_ignore_redirect)
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001390 interested = B_TRUE;
dh155122f4b3ec62007-01-19 16:59:38 -08001391 BUMP_MIB(&ipst->ips_icmp_mib, icmpInRedirects);
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001392 break;
1393 case ICMP_ECHO_REQUEST:
1394 /*
1395 * Whether to respond to echo requests that come in as IP
1396 * broadcasts or as IP multicast is subject to debate
1397 * (what isn't?). We aim to please, you pick it.
1398 * Default is do it.
1399 */
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001400 if (ira->ira_flags & IRAF_MULTICAST) {
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001401 /* multicast: respond based on tunable */
dh155122f4b3ec62007-01-19 16:59:38 -08001402 interested = ipst->ips_ip_g_resp_to_echo_mcast;
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001403 } else if (ira->ira_flags & IRAF_BROADCAST) {
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001404 /* broadcast: respond based on tunable */
dh155122f4b3ec62007-01-19 16:59:38 -08001405 interested = ipst->ips_ip_g_resp_to_echo_bcast;
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001406 } else {
1407 /* unicast: always respond */
1408 interested = B_TRUE;
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001409 }
dh155122f4b3ec62007-01-19 16:59:38 -08001410 BUMP_MIB(&ipst->ips_icmp_mib, icmpInEchos);
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001411 if (!interested) {
1412 /* We never pass these to RAW sockets */
1413 freemsg(mp);
1414 return (NULL);
1415 }
1416
1417 /* Check db_ref to make sure we can modify the packet. */
1418 if (mp->b_datap->db_ref > 1) {
1419 mblk_t *mp1;
1420
1421 mp1 = copymsg(mp);
1422 freemsg(mp);
1423 if (!mp1) {
1424 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
1425 return (NULL);
1426 }
1427 mp = mp1;
1428 ipha = (ipha_t *)mp->b_rptr;
1429 icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
1430 }
1431 icmph->icmph_type = ICMP_ECHO_REPLY;
1432 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutEchoReps);
1433 icmp_send_reply_v4(mp, ipha, icmph, ira);
1434 return (NULL);
1435
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001436 case ICMP_ROUTER_ADVERTISEMENT:
1437 case ICMP_ROUTER_SOLICITATION:
1438 break;
1439 case ICMP_TIME_EXCEEDED:
1440 interested = B_TRUE; /* Pass up to transport */
dh155122f4b3ec62007-01-19 16:59:38 -08001441 BUMP_MIB(&ipst->ips_icmp_mib, icmpInTimeExcds);
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001442 break;
1443 case ICMP_PARAM_PROBLEM:
1444 interested = B_TRUE; /* Pass up to transport */
dh155122f4b3ec62007-01-19 16:59:38 -08001445 BUMP_MIB(&ipst->ips_icmp_mib, icmpInParmProbs);
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001446 break;
1447 case ICMP_TIME_STAMP_REQUEST:
1448 /* Response to Time Stamp Requests is local policy. */
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001449 if (ipst->ips_ip_g_resp_to_timestamp) {
1450 if (ira->ira_flags & IRAF_MULTIBROADCAST)
1451 interested =
1452 ipst->ips_ip_g_resp_to_timestamp_bcast;
1453 else
1454 interested = B_TRUE;
1455 }
1456 if (!interested) {
1457 /* We never pass these to RAW sockets */
1458 freemsg(mp);
1459 return (NULL);
1460 }
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001461
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001462 /* Make sure we have enough of the packet */
1463 len_needed = ip_hdr_length + ICMPH_SIZE +
1464 3 * sizeof (uint32_t);
1465
1466 if (mp->b_wptr - mp->b_rptr < len_needed) {
1467 ipha = ip_pullup(mp, len_needed, ira);
1468 if (ipha == NULL) {
1469 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1470 ip_drop_input("ipIfStatsInDiscards - ip_pullup",
1471 mp, ill);
1472 freemsg(mp);
1473 return (NULL);
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001474 }
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001475 /* Refresh following the pullup. */
1476 icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001477 }
dh155122f4b3ec62007-01-19 16:59:38 -08001478 BUMP_MIB(&ipst->ips_icmp_mib, icmpInTimestamps);
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001479 /* Check db_ref to make sure we can modify the packet. */
1480 if (mp->b_datap->db_ref > 1) {
1481 mblk_t *mp1;
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001482
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001483 mp1 = copymsg(mp);
1484 freemsg(mp);
1485 if (!mp1) {
1486 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
1487 return (NULL);
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001488 }
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001489 mp = mp1;
1490 ipha = (ipha_t *)mp->b_rptr;
1491 icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001492 }
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001493 icmph->icmph_type = ICMP_TIME_STAMP_REPLY;
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001494 tsp = (uint32_t *)&icmph[1];
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001495 tsp++; /* Skip past 'originate time' */
1496 /* Compute # of milliseconds since midnight */
1497 gethrestime(&now);
1498 ts = (now.tv_sec % (24 * 60 * 60)) * 1000 +
Josef 'Jeff' Sipek19449252014-04-29 13:05:25 -04001499 NSEC2MSEC(now.tv_nsec);
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001500 *tsp++ = htonl(ts); /* Lay in 'receive time' */
1501 *tsp++ = htonl(ts); /* Lay in 'send time' */
dh155122f4b3ec62007-01-19 16:59:38 -08001502 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutTimestampReps);
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001503 icmp_send_reply_v4(mp, ipha, icmph, ira);
1504 return (NULL);
1505
1506 case ICMP_TIME_STAMP_REPLY:
1507 BUMP_MIB(&ipst->ips_icmp_mib, icmpInTimestampReps);
1508 break;
1509 case ICMP_INFO_REQUEST:
1510 /* Per RFC 1122 3.2.2.7, ignore this. */
1511 case ICMP_INFO_REPLY:
1512 break;
1513 case ICMP_ADDRESS_MASK_REQUEST:
1514 if (ira->ira_flags & IRAF_MULTIBROADCAST) {
1515 interested =
1516 ipst->ips_ip_respond_to_address_mask_broadcast;
1517 } else {
1518 interested = B_TRUE;
1519 }
1520 if (!interested) {
1521 /* We never pass these to RAW sockets */
1522 freemsg(mp);
1523 return (NULL);
1524 }
1525 len_needed = ip_hdr_length + ICMPH_SIZE + IP_ADDR_LEN;
1526 if (mp->b_wptr - mp->b_rptr < len_needed) {
1527 ipha = ip_pullup(mp, len_needed, ira);
1528 if (ipha == NULL) {
1529 BUMP_MIB(ill->ill_ip_mib,
1530 ipIfStatsInTruncatedPkts);
1531 ip_drop_input("ipIfStatsInTruncatedPkts", mp,
1532 ill);
1533 freemsg(mp);
1534 return (NULL);
1535 }
1536 /* Refresh following the pullup. */
1537 icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
1538 }
1539 BUMP_MIB(&ipst->ips_icmp_mib, icmpInAddrMasks);
1540 /* Check db_ref to make sure we can modify the packet. */
1541 if (mp->b_datap->db_ref > 1) {
1542 mblk_t *mp1;
1543
1544 mp1 = copymsg(mp);
1545 freemsg(mp);
1546 if (!mp1) {
1547 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
1548 return (NULL);
1549 }
1550 mp = mp1;
1551 ipha = (ipha_t *)mp->b_rptr;
1552 icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
1553 }
1554 /*
1555 * Need the ipif with the mask be the same as the source
1556 * address of the mask reply. For unicast we have a specific
1557 * ipif. For multicast/broadcast we only handle onlink
1558 * senders, and use the source address to pick an ipif.
1559 */
1560 ipif = ipif_lookup_addr(ipha->ipha_dst, ill, zoneid, ipst);
1561 if (ipif == NULL) {
1562 /* Broadcast or multicast */
1563 ipif = ipif_lookup_remote(ill, ipha->ipha_src, zoneid);
1564 if (ipif == NULL) {
1565 freemsg(mp);
1566 return (NULL);
1567 }
1568 }
1569 icmph->icmph_type = ICMP_ADDRESS_MASK_REPLY;
1570 bcopy(&ipif->ipif_net_mask, &icmph[1], IP_ADDR_LEN);
1571 ipif_refrele(ipif);
1572 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutAddrMaskReps);
1573 icmp_send_reply_v4(mp, ipha, icmph, ira);
1574 return (NULL);
1575
1576 case ICMP_ADDRESS_MASK_REPLY:
1577 BUMP_MIB(&ipst->ips_icmp_mib, icmpInAddrMaskReps);
1578 break;
1579 default:
1580 interested = B_TRUE; /* Pass up to transport */
1581 BUMP_MIB(&ipst->ips_icmp_mib, icmpInUnknowns);
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001582 break;
1583 }
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001584 /*
1585 * See if there is an ICMP client to avoid an extra copymsg/freemsg
1586 * if there isn't one.
1587 */
1588 if (ipst->ips_ipcl_proto_fanout_v4[IPPROTO_ICMP].connf_head != NULL) {
1589 /* If there is an ICMP client and we want one too, copy it. */
1590
1591 if (!interested) {
1592 /* Caller will deliver to RAW sockets */
1593 return (mp);
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001594 }
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001595 mp_ret = copymsg(mp);
1596 if (mp_ret == NULL) {
apersson31736642006-12-19 17:33:00 -08001597 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001598 ip_drop_input("ipIfStatsInDiscards - copymsg", mp, ill);
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001599 }
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001600 } else if (!interested) {
1601 /* Neither we nor raw sockets are interested. Drop packet now */
1602 freemsg(mp);
1603 return (NULL);
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001604 }
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001605
1606 /*
1607 * ICMP error or redirect packet. Make sure we have enough of
1608 * the header and that db_ref == 1 since we might end up modifying
1609 * the packet.
1610 */
1611 if (mp->b_cont != NULL) {
1612 if (ip_pullup(mp, -1, ira) == NULL) {
1613 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1614 ip_drop_input("ipIfStatsInDiscards - ip_pullup",
1615 mp, ill);
1616 freemsg(mp);
1617 return (mp_ret);
1618 }
1619 }
1620
1621 if (mp->b_datap->db_ref > 1) {
1622 mblk_t *mp1;
1623
1624 mp1 = copymsg(mp);
1625 if (mp1 == NULL) {
1626 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1627 ip_drop_input("ipIfStatsInDiscards - copymsg", mp, ill);
1628 freemsg(mp);
1629 return (mp_ret);
1630 }
1631 freemsg(mp);
1632 mp = mp1;
1633 }
1634
1635 /*
1636 * In case mp has changed, verify the message before any further
1637 * processes.
1638 */
1639 ipha = (ipha_t *)mp->b_rptr;
1640 icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
1641 if (!icmp_inbound_verify_v4(mp, icmph, ira)) {
1642 freemsg(mp);
1643 return (mp_ret);
1644 }
1645
1646 switch (icmph->icmph_type) {
1647 case ICMP_REDIRECT:
1648 icmp_redirect_v4(mp, ipha, icmph, ira);
1649 break;
1650 case ICMP_DEST_UNREACHABLE:
1651 if (icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED) {
1652 /* Update DCE and adjust MTU is icmp header if needed */
1653 icmp_inbound_too_big_v4(icmph, ira);
1654 }
Toomas Soomea9f62b12018-03-27 08:17:48 +03001655 /* FALLTHROUGH */
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001656 default:
1657 icmp_inbound_error_fanout_v4(mp, icmph, ira);
1658 break;
1659 }
1660 return (mp_ret);
1661}
1662
1663/*
1664 * Send an ICMP echo, timestamp or address mask reply.
1665 * The caller has already updated the payload part of the packet.
1666 * We handle the ICMP checksum, IP source address selection and feed
1667 * the packet into ip_output_simple.
1668 */
1669static void
1670icmp_send_reply_v4(mblk_t *mp, ipha_t *ipha, icmph_t *icmph,
1671 ip_recv_attr_t *ira)
1672{
1673 uint_t ip_hdr_length = ira->ira_ip_hdr_length;
1674 ill_t *ill = ira->ira_ill;
1675 ip_stack_t *ipst = ill->ill_ipst;
1676 ip_xmit_attr_t ixas;
1677
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001678 /* Send out an ICMP packet */
1679 icmph->icmph_checksum = 0;
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001680 icmph->icmph_checksum = IP_CSUM(mp, ip_hdr_length, 0);
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001681 /* Reset time to live. */
dh155122f4b3ec62007-01-19 16:59:38 -08001682 ipha->ipha_ttl = ipst->ips_ip_def_ttl;
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001683 {
1684 /* Swap source and destination addresses */
1685 ipaddr_t tmp;
1686
1687 tmp = ipha->ipha_src;
1688 ipha->ipha_src = ipha->ipha_dst;
1689 ipha->ipha_dst = tmp;
1690 }
1691 ipha->ipha_ident = 0;
1692 if (!IS_SIMPLE_IPH(ipha))
1693 icmp_options_update(ipha);
1694
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001695 bzero(&ixas, sizeof (ixas));
1696 ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4;
1697 ixas.ixa_zoneid = ira->ira_zoneid;
1698 ixas.ixa_cred = kcred;
1699 ixas.ixa_cpid = NOPID;
1700 ixas.ixa_tsl = ira->ira_tsl; /* Behave as a multi-level responder */
1701 ixas.ixa_ifindex = 0;
1702 ixas.ixa_ipst = ipst;
1703 ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
1704
1705 if (!(ira->ira_flags & IRAF_IPSEC_SECURE)) {
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001706 /*
1707 * This packet should go out the same way as it
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001708 * came in i.e in clear, independent of the IPsec policy
1709 * for transmitting packets.
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001710 */
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001711 ixas.ixa_flags |= IXAF_NO_IPSEC;
1712 } else {
1713 if (!ipsec_in_to_out(ira, &ixas, mp, ipha, NULL)) {
apersson31736642006-12-19 17:33:00 -08001714 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001715 /* Note: mp already consumed and ip_drop_packet done */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001716 return;
1717 }
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001718 }
1719 if (ira->ira_flags & IRAF_MULTIBROADCAST) {
1720 /*
1721 * Not one or our addresses (IRE_LOCALs), thus we let
1722 * ip_output_simple pick the source.
1723 */
1724 ipha->ipha_src = INADDR_ANY;
1725 ixas.ixa_flags |= IXAF_SET_SOURCE;
1726 }
1727 /* Should we send with DF and use dce_pmtu? */
1728 if (ipst->ips_ipv4_icmp_return_pmtu) {
1729 ixas.ixa_flags |= IXAF_PMTU_DISCOVERY;
1730 ipha->ipha_fragment_offset_and_flags |= IPH_DF_HTONS;
1731 }
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001732
dh155122f4b3ec62007-01-19 16:59:38 -08001733 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutMsgs);
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001734
1735 (void) ip_output_simple(mp, &ixas);
1736 ixa_cleanup(&ixas);
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001737}
1738
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001739/*
1740 * Verify an ICMP message that is either an ICMP error or a redirect packet.
1741 * The caller should have fully pulled up the message. If it's a redirect
1742 * packet, only basic checks on IP header will be done; otherwise, verify
1743 * the packet by looking at the included ULP header.
1744 *
1745 * Called before icmp_inbound_error_fanout_v4 is called.
1746 */
1747static boolean_t
1748icmp_inbound_verify_v4(mblk_t *mp, icmph_t *icmph, ip_recv_attr_t *ira)
priyankabe547022006-06-21 12:18:43 -07001749{
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001750 ill_t *ill = ira->ira_ill;
1751 int hdr_length;
1752 ip_stack_t *ipst = ira->ira_ill->ill_ipst;
1753 conn_t *connp;
1754 ipha_t *ipha; /* Inner IP header */
priyankabe547022006-06-21 12:18:43 -07001755
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001756 ipha = (ipha_t *)&icmph[1];
1757 if ((uchar_t *)ipha + IP_SIMPLE_HDR_LENGTH > mp->b_wptr)
1758 goto truncated;
1759
1760 hdr_length = IPH_HDR_LENGTH(ipha);
1761
1762 if ((IPH_HDR_VERSION(ipha) != IPV4_VERSION))
1763 goto discard_pkt;
1764
1765 if (hdr_length < sizeof (ipha_t))
1766 goto truncated;
1767
1768 if ((uchar_t *)ipha + hdr_length > mp->b_wptr)
1769 goto truncated;
1770
1771 /*
1772 * Stop here for ICMP_REDIRECT.
1773 */
1774 if (icmph->icmph_type == ICMP_REDIRECT)
1775 return (B_TRUE);
1776
1777 /*
1778 * ICMP errors only.
1779 */
priyankabe547022006-06-21 12:18:43 -07001780 switch (ipha->ipha_protocol) {
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001781 case IPPROTO_UDP:
1782 /*
1783 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
1784 * transport header.
1785 */
1786 if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN >
1787 mp->b_wptr)
1788 goto truncated;
1789 break;
1790 case IPPROTO_TCP: {
1791 tcpha_t *tcpha;
priyankabe547022006-06-21 12:18:43 -07001792
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001793 /*
1794 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
1795 * transport header.
1796 */
1797 if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN >
1798 mp->b_wptr)
1799 goto truncated;
1800
1801 tcpha = (tcpha_t *)((uchar_t *)ipha + hdr_length);
1802 connp = ipcl_tcp_lookup_reversed_ipv4(ipha, tcpha, TCPS_LISTEN,
1803 ipst);
1804 if (connp == NULL)
1805 goto discard_pkt;
1806
1807 if ((connp->conn_verifyicmp != NULL) &&
1808 !connp->conn_verifyicmp(connp, tcpha, icmph, NULL, ira)) {
1809 CONN_DEC_REF(connp);
1810 goto discard_pkt;
priyankabe547022006-06-21 12:18:43 -07001811 }
priyankabe547022006-06-21 12:18:43 -07001812 CONN_DEC_REF(connp);
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001813 break;
priyankabe547022006-06-21 12:18:43 -07001814 }
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001815 case IPPROTO_SCTP:
1816 /*
1817 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
1818 * transport header.
1819 */
1820 if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN >
1821 mp->b_wptr)
1822 goto truncated;
1823 break;
1824 case IPPROTO_ESP:
1825 case IPPROTO_AH:
1826 break;
1827 case IPPROTO_ENCAP:
1828 if ((uchar_t *)ipha + hdr_length + sizeof (ipha_t) >
1829 mp->b_wptr)
1830 goto truncated;
1831 break;
1832 default:
1833 break;
1834 }
1835
1836 return (B_TRUE);
1837
1838discard_pkt:
1839 /* Bogus ICMP error. */
1840 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1841 return (B_FALSE);
1842
1843truncated:
1844 /* We pulled up everthing already. Must be truncated */
1845 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
1846 ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill);
1847 return (B_FALSE);
priyankabe547022006-06-21 12:18:43 -07001848}
1849
/*
 * Path MTU "plateau" values, after the table in RFC 1191, listed in
 * descending order.
 *
 * NOTE(review): RFC 1191 Table 7-1 lists 1492 where this table has 1496;
 * confirm whether the difference is intentional before changing it.
 */
static int icmp_frag_size_table[] = {
	32000,
	17914,
	8166,
	4352,
	2002,
	1496,
	1006,
	508,
	296,
	68
};
1853
1854/*
1855 * Process received ICMP Packet too big.
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001856 * Just handles the DCE create/update, including using the above table of
1857 * PMTU guesses. The caller is responsible for validating the packet before
1858 * passing it in and also to fanout the ICMP error to any matching transport
1859 * conns. Assumes the message has been fully pulled up and verified.
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001860 *
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001861 * Before getting here, the caller has called icmp_inbound_verify_v4()
1862 * that should have verified with ULP to prevent undoing the changes we're
1863 * going to make to DCE. For example, TCP might have verified that the packet
1864 * which generated error is in the send window.
1865 *
1866 * In some cases this modifies the MTU in the ICMP header; the caller
1867 * should pass the packet to the matching ULP after this returns.
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001868 */
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001869static void
1870icmp_inbound_too_big_v4(icmph_t *icmph, ip_recv_attr_t *ira)
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001871{
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001872 dce_t *dce;
1873 int old_mtu;
1874 int mtu, orig_mtu;
1875 ipaddr_t dst;
1876 boolean_t disable_pmtud;
1877 ill_t *ill = ira->ira_ill;
1878 ip_stack_t *ipst = ill->ill_ipst;
1879 uint_t hdr_length;
1880 ipha_t *ipha;
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001881
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001882 /* Caller already pulled up everything. */
1883 ipha = (ipha_t *)&icmph[1];
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001884 ASSERT(icmph->icmph_type == ICMP_DEST_UNREACHABLE &&
1885 icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED);
apersson31736642006-12-19 17:33:00 -08001886 ASSERT(ill != NULL);
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001887
1888 hdr_length = IPH_HDR_LENGTH(ipha);
1889
priyankabe547022006-06-21 12:18:43 -07001890 /*
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001891 * We handle path MTU for source routed packets since the DCE
1892 * is looked up using the final destination.
priyankabe547022006-06-21 12:18:43 -07001893 */
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001894 dst = ip_get_dst(ipha);
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001895
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001896 dce = dce_lookup_and_add_v4(dst, ipst);
1897 if (dce == NULL) {
1898 /* Couldn't add a unique one - ENOMEM */
1899 ip1dbg(("icmp_inbound_too_big_v4: no dce for 0x%x\n",
1900 ntohl(dst)));
1901 return;
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001902 }
Kacheong Poon16c9d762008-11-13 10:19:37 -08001903
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001904 /* Check for MTU discovery advice as described in RFC 1191 */
1905 mtu = ntohs(icmph->icmph_du_mtu);
Kacheong Poon16c9d762008-11-13 10:19:37 -08001906 orig_mtu = mtu;
1907 disable_pmtud = B_FALSE;
1908
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001909 mutex_enter(&dce->dce_lock);
1910 if (dce->dce_flags & DCEF_PMTU)
1911 old_mtu = dce->dce_pmtu;
1912 else
1913 old_mtu = ill->ill_mtu;
1914
1915 if (icmph->icmph_du_zero != 0 || mtu < ipst->ips_ip_pmtu_min) {
1916 uint32_t length;
1917 int i;
1918
priyankabe547022006-06-21 12:18:43 -07001919 /*
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001920 * Use the table from RFC 1191 to figure out
1921 * the next "plateau" based on the length in
1922 * the original IP packet.
priyankabe547022006-06-21 12:18:43 -07001923 */
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001924 length = ntohs(ipha->ipha_length);
1925 DTRACE_PROBE2(ip4__pmtu__guess, dce_t *, dce,
1926 uint32_t, length);
1927 if (old_mtu <= length &&