blob: c081c44a0484844606b50aa48a065ddfde3efc9e [file] [log] [blame]
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
kcpoon5dddb8b2006-01-06 00:24:46 -08005 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07007 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
kcpoon5dddb8b2006-01-06 00:24:46 -080021
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -070022/*
Yuri Pankov59927d32014-01-08 18:32:42 +040023 * Copyright (c) 1990 Mentat Inc.
meem1f197382010-04-03 14:24:23 -040024 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
Yuri Pankov0b905b42017-07-07 18:55:34 +030025 * Copyright 2017 Nexenta Systems, Inc.
Dan McDonald7199b8e2017-02-01 14:55:57 -050026 * Copyright 2017 OmniTI Computer Consulting, Inc. All rights reserved.
Dan McDonald42c5ef02019-02-22 14:42:52 -050027 * Copyright 2019, Joyent, Inc.
Ryan Goodfellow2514b112022-06-09 07:52:45 -070028 * Copyright 2022 Oxide Computer Company
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -070029 */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -070030
31#ifndef _INET_IP_H
32#define _INET_IP_H
33
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -070034#ifdef __cplusplus
35extern "C" {
36#endif
37
38#include <sys/isa_defs.h>
39#include <sys/types.h>
40#include <inet/mib2.h>
41#include <inet/nd.h>
42#include <sys/atomic.h>
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -070043#include <net/if_dl.h>
44#include <net/if.h>
45#include <netinet/ip.h>
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -070046#include <netinet/igmp.h>
dr146992381a2a92006-10-20 16:37:58 -070047#include <sys/neti.h>
48#include <sys/hook.h>
49#include <sys/hook_event.h>
50#include <sys/hook_impl.h>
dh155122f4b3ec62007-01-19 16:59:38 -080051#include <inet/ip_stack.h>
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -070052
53#ifdef _KERNEL
54#include <netinet/ip6.h>
55#include <sys/avl.h>
Eric Chengda14ceb2008-12-04 18:16:10 -080056#include <sys/list.h>
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -070057#include <sys/vmem.h>
58#include <sys/squeue.h>
jpk45916cd2006-03-24 12:29:20 -080059#include <net/route.h>
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -070060#include <sys/systm.h>
sangeetac793af92006-08-11 05:59:29 -070061#include <net/radix.h>
carlsonj6a8288c2007-09-11 04:26:06 -070062#include <sys/modhash.h>
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -070063
/* CONN_DEBUG is enabled only in DEBUG builds. */
#ifdef DEBUG
#define	CONN_DEBUG
#endif

/* IP_DEBUG is defined unconditionally here. */
#define	IP_DEBUG
/*
 * The mt-streams(9F) flags for the IP module; put here so that other
 * "drivers" that are actually IP (e.g., ICMP, UDP) can use the same set
 * of flags.
 */
#define	IP_DEVMTFLAGS D_MP
masputraff550d02005-10-22 22:50:14 -070075#endif /* _KERNEL */
76
/* Module names and device paths for IP and the transports built on it. */
#define	IP_MOD_NAME	"ip"
#define	IP_DEV_NAME	"/dev/ip"
#define	IP6_DEV_NAME	"/dev/ip6"

#define	UDP_MOD_NAME	"udp"
#define	UDP_DEV_NAME	"/dev/udp"
#define	UDP6_DEV_NAME	"/dev/udp6"

#define	TCP_MOD_NAME	"tcp"
#define	TCP_DEV_NAME	"/dev/tcp"
#define	TCP6_DEV_NAME	"/dev/tcp6"

#define	SCTP_MOD_NAME	"sctp"
90
/*
 * 32-bit IPv4 address type.  Guarded by _IPADDR_T so that another header
 * providing the same typedef does not cause a duplicate definition.
 */
#ifndef _IPADDR_T
#define	_IPADDR_T
typedef uint32_t ipaddr_t;
#endif
95
/* Number of bits in an address */
#define	IP_ABITS		32
#define	IPV4_ABITS		IP_ABITS
#define	IPV6_ABITS		128
#define	IP_MAX_HW_LEN		40

/*
 * All-ones IPv4 mask.  The expansion is fully parenthesized so that it is
 * safe next to sizeof and postfix operators.
 */
#define	IP_HOST_MASK		((ipaddr_t)0xffffffffU)

/*
 * Checksum helpers.  IP_CSUM complements the result of ip_cksum() and keeps
 * the low 16 bits; the *_PARTIAL forms return the helper's result unchanged.
 */
#define	IP_CSUM(mp, off, sum)		(~ip_cksum(mp, off, sum) & 0xFFFF)
#define	IP_CSUM_PARTIAL(mp, off, sum)	ip_cksum(mp, off, sum)
#define	IP_BCSUM_PARTIAL(bp, len, sum)	bcksum(bp, len, sum)

/* Number of buckets, and byte size, of the per-ill fragment hash table. */
#define	ILL_FRAG_HASH_TBL_COUNT	((unsigned int)64)
#define	ILL_FRAG_HASH_TBL_SIZE	(ILL_FRAG_HASH_TBL_COUNT * sizeof (ipfb_t))

#define	IPV4_ADDR_LEN			4
#define	IP_ADDR_LEN			IPV4_ADDR_LEN
#define	IP_ARP_PROTO_TYPE		0x0800

#define	IPV4_VERSION			4
#define	IP_VERSION			IPV4_VERSION
#define	IP_SIMPLE_HDR_LENGTH_IN_WORDS	5
#define	IP_SIMPLE_HDR_LENGTH		20
#define	IP_MAX_HDR_LENGTH		60

#define	IP_MAX_OPT_LENGTH	(IP_MAX_HDR_LENGTH - IP_SIMPLE_HDR_LENGTH)

#define	IP_MIN_MTU		(IP_MAX_HDR_LENGTH + 8)	/* 68 bytes */

/*
 * XXX IP_MAXPACKET is defined in <netinet/ip.h> as well. At some point the
 * 2 files should be cleaned up to remove all redundant definitions.
 */
#define	IP_MAXPACKET		65535
/* First header byte of an option-less IPv4 header: version 4, 5 words. */
#define	IP_SIMPLE_HDR_VERSION \
	((IP_VERSION << 4) | IP_SIMPLE_HDR_LENGTH_IN_WORDS)

#define	UDPH_SIZE		8
134
/*
 * Constants and type definitions to support IP IOCTL commands
 */
#define	IP_IOCTL			(('i'<<8)|'p')
#define	IP_IOC_IRE_DELETE		4
#define	IP_IOC_IRE_DELETE_NO_REPLY	5
#define	IP_IOC_RTS_REQUEST		7
142
143/* Common definitions used by IP IOCTL data structures */
144typedef struct ipllcmd_s {
145 uint_t ipllc_cmd;
146 uint_t ipllc_name_offset;
147 uint_t ipllc_name_length;
148} ipllc_t;
149
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700150/* IP IRE Delete Command Structure. */
151typedef struct ipid_s {
152 ipllc_t ipid_ipllc;
153 uint_t ipid_ire_type;
154 uint_t ipid_addr_offset;
155 uint_t ipid_addr_length;
156 uint_t ipid_mask_offset;
157 uint_t ipid_mask_length;
158} ipid_t;
159
160#define ipid_cmd ipid_ipllc.ipllc_cmd
161
162#ifdef _KERNEL
/*
 * Temporary state for ip options parser.
 */
typedef struct ipoptp_s
{
	uint8_t		*ipoptp_next;	/* next option to look at */
	uint8_t		*ipoptp_end;	/* end of options */
	uint8_t		*ipoptp_cur;	/* start of current option */
	uint8_t		ipoptp_len;	/* length of current option */
	uint32_t	ipoptp_flags;
} ipoptp_t;

/*
 * Flag(s) for ipoptp_flags
 */
#define	IPOPTP_ERROR	0x00000001
179#endif /* _KERNEL */
180
/* Controls forwarding of IP packets, set via ipadm(8)/ndd(8) */
#define	IP_FORWARD_NEVER	0
#define	IP_FORWARD_ALWAYS	1

/* True when the given IP stack instance has forwarding set to "always". */
#define	WE_ARE_FORWARDING(ipst)	\
	((ipst)->ips_ip_forwarding == IP_FORWARD_ALWAYS)
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700186
/*
 * Header length in bytes: the low nibble of the first header byte counts
 * 32-bit words, hence the shift by 2.  The argument is parenthesized before
 * the cast so that pointer expressions such as (p + off) are cast as a whole.
 */
#define	IPH_HDR_LENGTH(ipha)						\
	((int)(((ipha_t *)(ipha))->ipha_version_and_hdr_length & 0xF) << 2)

/* IP version: the high nibble of the first header byte. */
#define	IPH_HDR_VERSION(ipha)						\
	((int)(((ipha_t *)(ipha))->ipha_version_and_hdr_length) >> 4)
192
193#ifdef _KERNEL
/*
 * IP reassembly macros.  We hide starting and ending offsets in b_next and
 * b_prev of messages on the reassembly queue.  The messages are chained using
 * b_cont.  These macros are used in ip_reassemble() so we don't have to see
 * the ugly casts and assignments.
 * Note that the offsets are <= 64k i.e. a uint_t is sufficient to represent
 * them.
 */
#define	IP_REASS_START(mp)		((uint_t)(uintptr_t)((mp)->b_next))
#define	IP_REASS_SET_START(mp, u)	\
	((mp)->b_next = (mblk_t *)(uintptr_t)(u))
#define	IP_REASS_END(mp)		((uint_t)(uintptr_t)((mp)->b_prev))
#define	IP_REASS_SET_END(mp, u)		\
	((mp)->b_prev = (mblk_t *)(uintptr_t)(u))

/* Reassembly status values; distinct bits. */
#define	IP_REASS_COMPLETE	0x1
#define	IP_REASS_PARTIAL	0x2
#define	IP_REASS_FAILED		0x4
211#define IP_REASS_FAILED 0x4
212
213/*
214 * Test to determine whether this is a module instance of IP or a
215 * driver instance of IP.
216 */
217#define CONN_Q(q) (WR(q)->q_next == NULL)
218
219#define Q_TO_CONN(q) ((conn_t *)(q)->q_ptr)
220#define Q_TO_TCP(q) (Q_TO_CONN((q))->conn_tcp)
masputraff550d02005-10-22 22:50:14 -0700221#define Q_TO_UDP(q) (Q_TO_CONN((q))->conn_udp)
nordmarkfc80c0d2007-10-11 22:57:36 -0700222#define Q_TO_ICMP(q) (Q_TO_CONN((q))->conn_icmp)
223#define Q_TO_RTS(q) (Q_TO_CONN((q))->conn_rts)
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700224
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800225#define CONNP_TO_WQ(connp) ((connp)->conn_wq)
226#define CONNP_TO_RQ(connp) ((connp)->conn_rq)
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700227
/*
 * Take / drop the conn_lock of the conn attached to q.  Both are no-ops
 * when q is NULL or CONN_Q(q) is false.  The argument is parenthesized in
 * the NULL test so that compound expressions (e.g. a ?: expression) are
 * compared as a whole.  Note that q is evaluated more than once.
 */
#define	GRAB_CONN_LOCK(q)	{				\
	if ((q) != NULL && CONN_Q(q))				\
		mutex_enter(&(Q_TO_CONN(q))->conn_lock);	\
}

#define	RELEASE_CONN_LOCK(q)	{				\
	if ((q) != NULL && CONN_Q(q))				\
		mutex_exit(&(Q_TO_CONN(q))->conn_lock);		\
}
237
/*
 * Ref counter macros for ioctls. This provides a guard for TCP to stop
 * tcp_close from removing the rq/wq whilst an ioctl is still in flight on the
 * stream. The ioctl could have been queued on e.g. an ipsq. tcp_close will wait
 * until the ioctlref count is zero before proceeding.
 * Ideally conn_oper_pending_ill would be used for this purpose. However, in the
 * case where an ioctl is aborted or interrupted, it can be cleared prematurely.
 * There are also some race possibilities between ip and the stream head which
 * can also end up with conn_oper_pending_ill being cleared prematurely. So, to
 * avoid these situations, we use a dedicated ref counter for ioctls which is
 * used in addition to and in parallel with the normal conn_ref count.
 */

/*
 * Caller must already hold conn_lock.  Note that this macro RELEASES
 * conn_lock after bumping the count.
 */
#define	CONN_INC_IOCTLREF_LOCKED(connp)	{			\
	ASSERT(MUTEX_HELD(&(connp)->conn_lock));		\
	DTRACE_PROBE1(conn__inc__ioctlref, conn_t *, (connp));	\
	(connp)->conn_ioctlref++;				\
	mutex_exit(&(connp)->conn_lock);			\
}

/* Takes conn_lock, bumps the count and drops the lock again. */
#define	CONN_INC_IOCTLREF(connp)	{			\
	mutex_enter(&(connp)->conn_lock);			\
	CONN_INC_IOCTLREF_LOCKED(connp);			\
}

/*
 * Drops one ioctl reference under conn_lock.  When the count reaches zero
 * while CONN_CLOSING is set, waiters on conn_cv are woken.
 */
#define	CONN_DEC_IOCTLREF(connp)	{			\
	mutex_enter(&(connp)->conn_lock);			\
	DTRACE_PROBE1(conn__dec__ioctlref, conn_t *, (connp));	\
	/* Make sure conn_ioctlref will not underflow. */	\
	ASSERT((connp)->conn_ioctlref != 0);			\
	if ((--(connp)->conn_ioctlref == 0) &&			\
	    ((connp)->conn_state_flags & CONN_CLOSING)) {	\
		cv_broadcast(&(connp)->conn_cv);		\
	}							\
	mutex_exit(&(connp)->conn_lock);			\
}
273
274
/*
 * Complete the pending operation. Usually an ioctl. Can also
 * be a bind or option management request that got enqueued
 * in an ipsq_t. Called on completion of the operation.
 *
 * Clears conn_oper_pending_ill and wakes conn_refcv waiters under
 * conn_lock, then drops a conn reference after releasing the lock.
 */
#define	CONN_OPER_PENDING_DONE(connp)	{			\
	mutex_enter(&(connp)->conn_lock);			\
	(connp)->conn_oper_pending_ill = NULL;			\
	cv_broadcast(&(connp)->conn_refcv);			\
	mutex_exit(&(connp)->conn_lock);			\
	CONN_DEC_REF(connp);					\
}
287
/*
 * Values for squeue switch:
 */
#define	IP_SQUEUE_ENTER_NODRAIN	1
#define	IP_SQUEUE_ENTER		2
#define	IP_SQUEUE_FILL		3

/* Defined elsewhere; NOTE(review): presumably holds one of the values above. */
extern int ip_squeue_flag;
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700296
297/* IP Fragmentation Reassembly Header */
298typedef struct ipf_s {
299 struct ipf_s *ipf_hash_next;
300 struct ipf_s **ipf_ptphn; /* Pointer to previous hash next. */
301 uint32_t ipf_ident; /* Ident to match. */
302 uint8_t ipf_protocol; /* Protocol to match. */
303 uchar_t ipf_last_frag_seen : 1; /* Last fragment seen ? */
304 time_t ipf_timestamp; /* Reassembly start time. */
305 mblk_t *ipf_mp; /* mblk we live in. */
306 mblk_t *ipf_tail_mp; /* Frag queue tail pointer. */
307 int ipf_hole_cnt; /* Number of holes (hard-case). */
308 int ipf_end; /* Tail end offset (0 -> hard-case). */
309 uint_t ipf_gen; /* Frag queue generation */
310 size_t ipf_count; /* Count of bytes used by frag */
311 uint_t ipf_nf_hdr_len; /* Length of nonfragmented header */
312 in6_addr_t ipf_v6src; /* IPv6 source address */
313 in6_addr_t ipf_v6dst; /* IPv6 dest address */
314 uint_t ipf_prev_nexthdr_offset; /* Offset for nexthdr value */
315 uint8_t ipf_ecn; /* ECN info for the fragments */
316 uint8_t ipf_num_dups; /* Number of times dup frags recvd */
masputraff550d02005-10-22 22:50:14 -0700317 uint16_t ipf_checksum_flags; /* Hardware checksum flags */
318 uint32_t ipf_checksum; /* Partial checksum of fragment data */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700319} ipf_t;
320
/*
 * IPv4 Fragments
 *
 * A packet is a fragment when either its fragment offset is non-zero or the
 * "more fragments" flag is set.  The argument is the header field as stored
 * (network byte order); it is converted and tested once against both masks.
 */
#define	IS_V4_FRAGMENT(ipha_fragment_offset_and_flags)			\
	((ntohs(ipha_fragment_offset_and_flags) &			\
	    (IPH_OFFSET | IPH_MF)) != 0)

/* IPv4 views of the IPv6-sized source/destination fields of ipf_t. */
#define	ipf_src	V4_PART_OF_V6(ipf_v6src)
#define	ipf_dst	V4_PART_OF_V6(ipf_v6dst)
330
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700331#endif /* _KERNEL */
332
/* ICMP types */
#define	ICMP_ECHO_REPLY			0
#define	ICMP_DEST_UNREACHABLE		3
#define	ICMP_SOURCE_QUENCH		4
#define	ICMP_REDIRECT			5
#define	ICMP_ECHO_REQUEST		8
#define	ICMP_ROUTER_ADVERTISEMENT	9
#define	ICMP_ROUTER_SOLICITATION	10
#define	ICMP_TIME_EXCEEDED		11
#define	ICMP_PARAM_PROBLEM		12
#define	ICMP_TIME_STAMP_REQUEST		13
#define	ICMP_TIME_STAMP_REPLY		14
#define	ICMP_INFO_REQUEST		15
#define	ICMP_INFO_REPLY			16
#define	ICMP_ADDRESS_MASK_REQUEST	17
#define	ICMP_ADDRESS_MASK_REPLY		18

/*
 * Evaluates to true if the ICMP type is an ICMP error.  Only the four
 * types listed below qualify; ICMP_REDIRECT is not treated as an error
 * here.  The argument is evaluated more than once.
 */
#define	ICMP_IS_ERROR(type)	(		\
	(type) == ICMP_DEST_UNREACHABLE ||	\
	(type) == ICMP_SOURCE_QUENCH ||		\
	(type) == ICMP_TIME_EXCEEDED ||		\
	(type) == ICMP_PARAM_PROBLEM)
356
/* ICMP_TIME_EXCEEDED codes */
#define	ICMP_TTL_EXCEEDED		0
#define	ICMP_REASSEMBLY_TIME_EXCEEDED	1

/* ICMP_DEST_UNREACHABLE codes */
#define	ICMP_NET_UNREACHABLE		0
#define	ICMP_HOST_UNREACHABLE		1
#define	ICMP_PROTOCOL_UNREACHABLE	2
#define	ICMP_PORT_UNREACHABLE		3
#define	ICMP_FRAGMENTATION_NEEDED	4
#define	ICMP_SOURCE_ROUTE_FAILED	5
#define	ICMP_DEST_NET_UNKNOWN		6
#define	ICMP_DEST_HOST_UNKNOWN		7
#define	ICMP_SRC_HOST_ISOLATED		8
#define	ICMP_DEST_NET_UNREACH_ADMIN	9
#define	ICMP_DEST_HOST_UNREACH_ADMIN	10
#define	ICMP_DEST_NET_UNREACH_TOS	11
#define	ICMP_DEST_HOST_UNREACH_TOS	12
375
376/* ICMP Header Structure */
377typedef struct icmph_s {
378 uint8_t icmph_type;
379 uint8_t icmph_code;
380 uint16_t icmph_checksum;
381 union {
382 struct { /* ECHO request/response structure */
383 uint16_t u_echo_ident;
384 uint16_t u_echo_seqnum;
385 } u_echo;
386 struct { /* Destination unreachable structure */
387 uint16_t u_du_zero;
388 uint16_t u_du_mtu;
389 } u_du;
390 struct { /* Parameter problem structure */
391 uint8_t u_pp_ptr;
392 uint8_t u_pp_rsvd[3];
393 } u_pp;
394 struct { /* Redirect structure */
395 ipaddr_t u_rd_gateway;
396 } u_rd;
397 } icmph_u;
398} icmph_t;
399
/* Shorthand accessors for the type-specific members of icmph_t. */
#define	icmph_echo_ident	icmph_u.u_echo.u_echo_ident
#define	icmph_echo_seqnum	icmph_u.u_echo.u_echo_seqnum
#define	icmph_du_zero		icmph_u.u_du.u_du_zero
#define	icmph_du_mtu		icmph_u.u_du.u_du_mtu
#define	icmph_pp_ptr		icmph_u.u_pp.u_pp_ptr
#define	icmph_rd_gateway	icmph_u.u_rd.u_rd_gateway

#define	ICMPH_SIZE	8

/*
 * Minimum length of transport layer header included in an ICMP error
 * message for it to be considered valid.
 */
#define	ICMP_MIN_TP_HDR_LEN	8
414
415/* Aligned IP header */
416typedef struct ipha_s {
417 uint8_t ipha_version_and_hdr_length;
418 uint8_t ipha_type_of_service;
419 uint16_t ipha_length;
420 uint16_t ipha_ident;
421 uint16_t ipha_fragment_offset_and_flags;
422 uint8_t ipha_ttl;
423 uint8_t ipha_protocol;
424 uint16_t ipha_hdr_checksum;
425 ipaddr_t ipha_src;
426 ipaddr_t ipha_dst;
427} ipha_t;
428
/*
 * IP Flags
 *
 * Some of these constant names are copied for the DTrace IP provider in
 * usr/src/lib/libdtrace/common/{ip.d.in, ip.sed.in}, which should be kept
 * in sync.
 */
#define	IPH_DF		0x4000	/* Don't fragment */
#define	IPH_MF		0x2000	/* More fragments to come */
#define	IPH_OFFSET	0x1FFF	/* Where the offset lives */

/*
 * Byte-order specific values: the same masks expressed as they appear when
 * the 16-bit field is loaded without byte swapping.
 */
#ifdef _BIG_ENDIAN
#define	IPH_DF_HTONS	0x4000	/* Don't fragment */
#define	IPH_MF_HTONS	0x2000	/* More fragments to come */
#define	IPH_OFFSET_HTONS 0x1FFF	/* Where the offset lives */
#else
#define	IPH_DF_HTONS	0x0040	/* Don't fragment */
#define	IPH_MF_HTONS	0x0020	/* More fragments to come */
#define	IPH_OFFSET_HTONS 0xFF1F	/* Where the offset lives */
#endif
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700450
/* ECN code points for IPv4 TOS byte and IPv6 traffic class octet. */
#define	IPH_ECN_NECT	0x0	/* Not ECN-Capable Transport */
#define	IPH_ECN_ECT1	0x1	/* ECN-Capable Transport, ECT(1) */
#define	IPH_ECN_ECT0	0x2	/* ECN-Capable Transport, ECT(0) */
#define	IPH_ECN_CE	0x3	/* ECN-Congestion Experienced (CE) */
456
meeme11c3f42009-01-06 20:16:25 -0500457struct ill_s;
458
Sebastien Roy2b24ab62009-09-22 22:04:45 -0400459typedef void ip_v6intfid_func_t(struct ill_s *, in6_addr_t *);
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800460typedef void ip_v6mapinfo_func_t(struct ill_s *, uchar_t *, uchar_t *);
461typedef void ip_v4mapinfo_func_t(struct ill_s *, uchar_t *, uchar_t *);
meeme11c3f42009-01-06 20:16:25 -0500462
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700463/* IP Mac info structure */
464typedef struct ip_m_s {
meeme11c3f42009-01-06 20:16:25 -0500465 t_uscalar_t ip_m_mac_type; /* From <sys/dlpi.h> */
466 int ip_m_type; /* From <net/if_types.h> */
Sebastien Roy2b24ab62009-09-22 22:04:45 -0400467 t_uscalar_t ip_m_ipv4sap;
468 t_uscalar_t ip_m_ipv6sap;
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800469 ip_v4mapinfo_func_t *ip_m_v4mapping;
470 ip_v6mapinfo_func_t *ip_m_v6mapping;
meeme11c3f42009-01-06 20:16:25 -0500471 ip_v6intfid_func_t *ip_m_v6intfid;
Sebastien Roy2b24ab62009-09-22 22:04:45 -0400472 ip_v6intfid_func_t *ip_m_v6destintfid;
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700473} ip_m_t;
474
475/*
476 * The following functions attempt to reduce the link layer dependency
477 * of the IP stack. The current set of link specific operations are:
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800478 * a. map from IPv4 class D (224.0/4) multicast address range or the
479 * IPv6 multicast address range (ff00::/8) to the link layer multicast
480 * address.
481 * b. derive the default IPv6 interface identifier from the interface.
482 * c. derive the default IPv6 destination interface identifier from
meeme11c3f42009-01-06 20:16:25 -0500483 * the interface (point-to-point only).
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700484 */
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800485extern void ip_mcast_mapping(struct ill_s *, uchar_t *, uchar_t *);
Sebastien Roy2b24ab62009-09-22 22:04:45 -0400486/* ip_m_v6*intfid return void and are never NULL */
487#define MEDIA_V6INTFID(ip_m, ill, v6ptr) (ip_m)->ip_m_v6intfid(ill, v6ptr)
meeme11c3f42009-01-06 20:16:25 -0500488#define MEDIA_V6DESTINTFID(ip_m, ill, v6ptr) \
Sebastien Roy2b24ab62009-09-22 22:04:45 -0400489 (ip_m)->ip_m_v6destintfid(ill, v6ptr)
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700490
/* Router entry types */
#define	IRE_BROADCAST		0x0001	/* Route entry for broadcast address */
#define	IRE_DEFAULT		0x0002	/* Route entry for default gateway */
#define	IRE_LOCAL		0x0004	/* Route entry for local address */
#define	IRE_LOOPBACK		0x0008	/* Route entry for loopback address */
#define	IRE_PREFIX		0x0010	/* Route entry for prefix routes */
#ifndef _KERNEL
/* Keep so user-level still compiles */
#define	IRE_CACHE		0x0020	/* Cached Route entry */
#endif
#define	IRE_IF_NORESOLVER	0x0040	/* Route entry for local interface */
					/* net without any address mapping. */
#define	IRE_IF_RESOLVER		0x0080	/* Route entry for local interface */
					/* net with resolver. */
#define	IRE_HOST		0x0100	/* Host route entry */
/* Keep so user-level still compiles */
#define	IRE_HOST_REDIRECT	0x0200	/* only used for T_SVR4_OPTMGMT_REQ */
#define	IRE_IF_CLONE		0x0400	/* Per host clone of IRE_IF */
#define	IRE_MULTICAST		0x0800	/* Special - not in table */
#define	IRE_NOROUTE		0x1000	/* Special - not in table */

#define	IRE_INTERFACE		(IRE_IF_NORESOLVER | IRE_IF_RESOLVER)

#define	IRE_IF_ALL		(IRE_IF_NORESOLVER | IRE_IF_RESOLVER | \
				    IRE_IF_CLONE)
#define	IRE_OFFSUBNET		(IRE_DEFAULT | IRE_PREFIX | IRE_HOST)
#define	IRE_OFFLINK		IRE_OFFSUBNET
/*
 * Note that we view IRE_NOROUTE as ONLINK since we can "send" to them without
 * going through a router; the result of sending will be an error/icmp error.
 */
#define	IRE_ONLINK		(IRE_IF_ALL|IRE_LOCAL|IRE_LOOPBACK| \
				    IRE_BROADCAST|IRE_MULTICAST|IRE_NOROUTE)
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700524
/* Arguments to ire_flush_cache() */
#define	IRE_FLUSH_DELETE	0
#define	IRE_FLUSH_ADD		1
#define	IRE_FLUSH_GWCHANGE	2

/*
 * Flags to ire_route_recursive
 */
#define	IRR_NONE		0
#define	IRR_ALLOCATE		1	/* OK to allocate IRE_IF_CLONE */
#define	IRR_INCOMPLETE		2	/* OK to return incomplete chain */
537/*
/*
 * Open/close synchronization flags.
 * These are kept in a separate field in the conn and the synchronization
 * depends on the atomic 32 bit access to that field.
 */
#define	CONN_CLOSING		0x01	/* ip_close waiting for ip_wsrv */
#define	CONN_CONDEMNED		0x02	/* conn is closing, no more refs */
#define	CONN_INCIPIENT		0x04	/* conn not yet visible, no refs */
#define	CONN_QUIESCED		0x08	/* conn is now quiescent */
#define	CONN_UPDATE_ILL		0x10	/* conn_update_ill in progress */
georges325b8062007-02-06 07:01:31 -0800547
/*
 * Flags for dce_flags field. Specifies which information has been set.
 * dce_ident is always present, but the other ones are identified by the flags.
 */
#define	DCEF_DEFAULT		0x0001	/* Default DCE - no pmtu or uinfo */
#define	DCEF_PMTU		0x0002	/* Different than interface MTU */
#define	DCEF_UINFO		0x0004	/* dce_uinfo set */
#define	DCEF_TOO_SMALL_PMTU	0x0008	/* Smaller than IPv4 MIN */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700556
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800557#ifdef _KERNEL
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700558/*
559 * Extra structures need for per-src-addr filtering (IGMPv3/MLDv2)
560 */
561#define MAX_FILTER_SIZE 64
562
563typedef struct slist_s {
564 int sl_numsrc;
565 in6_addr_t sl_addr[MAX_FILTER_SIZE];
566} slist_t;
567
568/*
569 * Following struct is used to maintain retransmission state for
570 * a multicast group. One rtx_state_t struct is an in-line field
571 * of the ilm_t struct; the slist_ts in the rtx_state_t struct are
572 * alloc'd as needed.
573 */
574typedef struct rtx_state_s {
575 uint_t rtx_timer; /* retrans timer */
576 int rtx_cnt; /* retrans count */
577 int rtx_fmode_cnt; /* retrans count for fmode change */
578 slist_t *rtx_allow;
579 slist_t *rtx_block;
580} rtx_state_t;
581
582/*
583 * Used to construct list of multicast address records that will be
584 * sent in a single listener report.
585 */
586typedef struct mrec_s {
587 struct mrec_s *mrec_next;
588 uint8_t mrec_type;
589 uint8_t mrec_auxlen; /* currently unused */
590 in6_addr_t mrec_group;
591 slist_t mrec_srcs;
592} mrec_t;
593
594/* Group membership list per upper conn */
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800595
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700596/*
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800597 * We record the multicast information from the socket option in
598 * ilg_ifaddr/ilg_ifindex. This allows rejoining the group in the case when
599 * the ifaddr (or ifindex) disappears and later reappears, potentially on
600 * a different ill. The IPv6 multicast socket options and ioctls all specify
601 * the interface using an ifindex. For IPv4 some socket options/ioctls use
602 * the interface address and others use the index. We record here the method
603 * that was actually used (and leave the other of ilg_ifaddr or ilg_ifindex)
604 * at zero so that we can rejoin the way the application intended.
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700605 *
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800606 * We track the ill on which we will or already have joined an ilm using
607 * ilg_ill. When we have succeeded joining the ilm and have a refhold on it
608 * then we set ilg_ilm. Thus intentionally there is a window where ilg_ill is
609 * set and ilg_ilm is not set. This allows clearing ilg_ill as a signal that
610 * the ill is being unplumbed and the ilm should be discarded.
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700611 *
612 * ilg records the state of multicast memberships of a socket end point.
613 * ilm records the state of multicast memberships with the driver and is
614 * maintained per interface.
615 *
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800616 * The ilg state is protected by conn_ilg_lock.
617 * The ilg will not be freed until ilg_refcnt drops to zero.
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700618 */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700619typedef struct ilg_s {
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800620 struct ilg_s *ilg_next;
621 struct ilg_s **ilg_ptpn;
622 struct conn_s *ilg_connp; /* Back pointer to get lock */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700623 in6_addr_t ilg_v6group;
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800624 ipaddr_t ilg_ifaddr; /* For some IPv4 cases */
625 uint_t ilg_ifindex; /* IPv6 and some other IPv4 cases */
626 struct ill_s *ilg_ill; /* Where ilm is joined. No refhold */
627 struct ilm_s *ilg_ilm; /* With ilm_refhold */
628 uint_t ilg_refcnt;
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700629 mcast_record_t ilg_fmode; /* MODE_IS_INCLUDE/MODE_IS_EXCLUDE */
630 slist_t *ilg_filter;
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800631 boolean_t ilg_condemned; /* Conceptually deleted */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700632} ilg_t;
633
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700634/*
meem79242222008-07-29 18:39:05 -0700635 * Multicast address list entry for ill.
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800636 * ilm_ill is used by IPv4 and IPv6
637 *
638 * The ilm state (and other multicast state on the ill) is protected by
639 * ill_mcast_lock. Operations that change state on both an ilg and ilm
640 * in addition use ill_mcast_serializer to ensure that we can't have
641 * interleaving between e.g., add and delete operations for the same conn_t,
Sowmini Varadhanf1c454b2010-01-11 10:29:23 -0500642 * group, and ill. The ill_mcast_serializer is also used to ensure that
643 * multicast group joins do not occur on an interface that is in the process
644 * of joining an IPMP group.
dh155122f4b3ec62007-01-19 16:59:38 -0800645 *
646 * The comment below (and for other netstack_t references) refers
647 * to the fact that we only do netstack_hold in particular cases,
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800648 * such as the references from open endpoints (ill_t and conn_t's
dh155122f4b3ec62007-01-19 16:59:38 -0800649 * pointers). Internally within IP we rely on IP's ability to cleanup e.g.
650 * ire_t's when an ill goes away.
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700651 */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700652typedef struct ilm_s {
653 in6_addr_t ilm_v6addr;
654 int ilm_refcnt;
655 uint_t ilm_timer; /* IGMP/MLD query resp timer, in msec */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700656 struct ilm_s *ilm_next; /* Linked list for each ill */
657 uint_t ilm_state; /* state of the membership */
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800658 struct ill_s *ilm_ill; /* Back pointer to ill - ill_ilm_cnt */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700659 zoneid_t ilm_zoneid;
660 int ilm_no_ilg_cnt; /* number of joins w/ no ilg */
661 mcast_record_t ilm_fmode; /* MODE_IS_INCLUDE/MODE_IS_EXCLUDE */
662 slist_t *ilm_filter; /* source filter list */
663 slist_t *ilm_pendsrcs; /* relevant src addrs for pending req */
664 rtx_state_t ilm_rtx; /* SCR retransmission state */
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800665 ipaddr_t ilm_ifaddr; /* For IPv4 netstat */
dh155122f4b3ec62007-01-19 16:59:38 -0800666 ip_stack_t *ilm_ipst; /* Does not have a netstack_hold */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700667} ilm_t;
668
669#define ilm_addr V4_PART_OF_V6(ilm_v6addr)
670
/*
 * Soft reference to an IPsec SA.
 *
 * On relative terms, conn's can be persistent (living as long as the
 * processes which create them), while SA's are ephemeral (dying when
 * they hit their time-based or byte-based lifetimes).
 *
 * We could hold a hard reference to an SA from an ipsec_latch_t,
 * but this would cause expired SA's to linger for a potentially
 * unbounded time.
 *
 * Instead, we remember the hash bucket number and bucket generation
 * in addition to the pointer.  The bucket generation is incremented on
 * each deletion.
 */
typedef struct ipsa_ref_s
{
	struct ipsa_s	*ipsr_sa;	/* the SA itself */
	struct isaf_s	*ipsr_bucket;	/* bucket recorded with the SA */
	uint64_t	ipsr_gen;	/* bucket generation when recorded */
} ipsa_ref_t;
692
693/*
694 * IPsec "latching" state.
695 *
696 * In the presence of IPsec policy, fully-bound conn's bind a connection
697 * to more than just the 5-tuple, but also a specific IPsec action and
698 * identity-pair.
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800699 * The identity pair is accessed from both the receive and transmit side
700 * hence it is maintained in the ipsec_latch_t structure. conn_latch and
701 * ixa_ipsec_latch points to it.
702 * The policy and actions are stored in conn_latch_in_policy and
703 * conn_latch_in_action for the inbound side, and in ixa_ipsec_policy and
704 * ixa_ipsec_action for the transmit side.
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700705 *
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800706 * As an optimization, we also cache soft references to IPsec SA's in
707 * ip_xmit_attr_t so that we can fast-path around most of the work needed for
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700708 * outbound IPsec SA selection.
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700709 */
710typedef struct ipsec_latch_s
711{
712 kmutex_t ipl_lock;
713 uint32_t ipl_refcnt;
714
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700715 struct ipsid_s *ipl_local_cid;
716 struct ipsid_s *ipl_remote_cid;
717 unsigned int
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700718 ipl_ids_latched : 1,
719
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800720 ipl_pad_to_bit_31 : 31;
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700721} ipsec_latch_t;
722
/*
 * Take a reference on an ipsec_latch_t.  The ASSERT fires if the count
 * is zero after the increment, i.e. if it wrapped.
 *
 * Both macros are wrapped in do { } while (0) so that each one expands to
 * a single statement and can be followed by a semicolon inside an unbraced
 * if/else.  `ipl' is evaluated more than once; do not pass an expression
 * with side effects.
 */
#define	IPLATCH_REFHOLD(ipl) do {				\
	atomic_inc_32(&(ipl)->ipl_refcnt);			\
	ASSERT((ipl)->ipl_refcnt != 0);				\
} while (0)

/*
 * Drop a reference on an ipsec_latch_t and call iplatch_free() when the
 * count reaches zero.  membar_exit() is issued before the decrement.
 */
#define	IPLATCH_REFRELE(ipl) do {				\
	ASSERT((ipl)->ipl_refcnt != 0);				\
	membar_exit();						\
	if (atomic_dec_32_nv(&(ipl)->ipl_refcnt) == 0)		\
		iplatch_free(ipl);				\
} while (0)
734
/*
 * Forward declaration of conn_t.  struct conn_s is left incomplete here,
 * so only pointers to it can be used.
 */
struct conn_s;
typedef struct conn_s conn_t;
739
740/*
meem79242222008-07-29 18:39:05 -0700741 * This is used to match an inbound/outbound datagram with policy.
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700742 */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700743typedef struct ipsec_selector {
744 in6_addr_t ips_local_addr_v6;
745 in6_addr_t ips_remote_addr_v6;
746 uint16_t ips_local_port;
747 uint16_t ips_remote_port;
748 uint8_t ips_icmp_type;
749 uint8_t ips_icmp_code;
750 uint8_t ips_protocol;
751 uint8_t ips_isv4 : 1,
752 ips_is_icmp_inv_acq: 1;
753} ipsec_selector_t;
754
/*
 * IPv4 addresses live in the *first* 32-bit word of the selector's
 * address fields, not the last; this simplifies the prefix match/mask
 * code in spd.c.
 */
#define	ips_local_addr_v4	ips_local_addr_v6.s6_addr32[0]
#define	ips_remote_addr_v4	ips_remote_addr_v6.s6_addr32[0]
762
/* Values used in IP by the IPsec code */
#define	IPSEC_OUTBOUND			B_TRUE
#define	IPSEC_INBOUND			B_FALSE

/*
 * Policy failures come in two variants: either the packet came in secure
 * when that was not needed (IPSEC_POLICY_???_NOT_NEEDED), or it does not
 * have the desired level of protection (IPSEC_POLICY_MISMATCH).
 */
#define	IPSEC_POLICY_NOT_NEEDED		0
#define	IPSEC_POLICY_MISMATCH		1
#define	IPSEC_POLICY_AUTH_NOT_NEEDED	2
#define	IPSEC_POLICY_ENCR_NOT_NEEDED	3
#define	IPSEC_POLICY_SE_NOT_NEEDED	4
#define	IPSEC_POLICY_MAX		5	/* Always max + 1. */
778
/*
 * IPsec policy must be consulted for a conn when either
 *
 * 1) per-socket policy is present, which is indicated by
 *    conn_in_enforce_policy (inbound) or conn_out_enforce_policy
 *    (outbound), or
 * 2) no policy has been cached on the conn and the global policy for the
 *    given direction and IP version is non-empty.
 */
#define	CONN_INBOUND_POLICY_PRESENT(connp, ipss)			\
	((connp)->conn_in_enforce_policy ||				\
	(!(connp)->conn_policy_cached &&				\
	(ipss)->ipsec_inbound_v4_policy_present))

#define	CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss)			\
	((connp)->conn_in_enforce_policy ||				\
	(!(connp)->conn_policy_cached &&				\
	(ipss)->ipsec_inbound_v6_policy_present))

#define	CONN_OUTBOUND_POLICY_PRESENT(connp, ipss)			\
	((connp)->conn_out_enforce_policy ||				\
	(!(connp)->conn_policy_cached &&				\
	(ipss)->ipsec_outbound_v4_policy_present))

#define	CONN_OUTBOUND_POLICY_PRESENT_V6(connp, ipss)			\
	((connp)->conn_out_enforce_policy ||				\
	(!(connp)->conn_policy_cached &&				\
	(ipss)->ipsec_outbound_v6_policy_present))
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700805
806/*
807 * Information cached in IRE for upper layer protocol (ULP).
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700808 */
809typedef struct iulp_s {
810 boolean_t iulp_set; /* Is any metric set? */
811 uint32_t iulp_ssthresh; /* Slow start threshold (TCP). */
812 clock_t iulp_rtt; /* Guestimate in millisecs. */
813 clock_t iulp_rtt_sd; /* Cached value of RTT variance. */
814 uint32_t iulp_spipe; /* Send pipe size. */
815 uint32_t iulp_rpipe; /* Receive pipe size. */
816 uint32_t iulp_rtomax; /* Max round trip timeout. */
817 uint32_t iulp_sack; /* Use SACK option (TCP)? */
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800818 uint32_t iulp_mtu; /* Setable with routing sockets */
819
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700820 uint32_t
821 iulp_tstamp_ok : 1, /* Use timestamp option (TCP)? */
822 iulp_wscale_ok : 1, /* Use window scale option (TCP)? */
823 iulp_ecn_ok : 1, /* Enable ECN (for TCP)? */
824 iulp_pmtud_ok : 1, /* Enable PMTUd? */
825
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800826 /* These three are passed out by ip_set_destination */
827 iulp_localnet: 1, /* IRE_ONLINK */
828 iulp_loopback: 1, /* IRE_LOOPBACK */
829 iulp_local: 1, /* IRE_LOCAL */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700830
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800831 iulp_not_used : 25;
832} iulp_t;
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700833
834/*
meem3344d752010-03-27 02:33:20 -0400835 * The conn drain list structure (idl_t), protected by idl_lock. Each conn_t
836 * inserted in the list points back at this idl_t using conn_idl, and is
837 * chained by conn_drain_next and conn_drain_prev, which are also protected by
838 * idl_lock. When flow control is relieved, either ip_wsrv() (STREAMS) or
839 * ill_flow_enable() (non-STREAMS) will call conn_drain().
Venugopal Iyerae6aa222009-02-17 01:31:30 -0800840 *
841 * The conn drain list, idl_t, itself is part of tx cookie list structure.
842 * A tx cookie list points to a blocked Tx ring and contains the list of
843 * all conn's that are blocked due to the flow-controlled Tx ring (via
844 * the idl drain list). Note that a link can have multiple Tx rings. The
845 * drain list will store the conn's blocked due to Tx ring being flow
846 * controlled.
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700847 */
Venugopal Iyerae6aa222009-02-17 01:31:30 -0800848
849typedef uintptr_t ip_mac_tx_cookie_t;
850typedef struct idl_s idl_t;
851typedef struct idl_tx_list_s idl_tx_list_t;
852
853struct idl_tx_list_s {
854 ip_mac_tx_cookie_t txl_cookie;
855 kmutex_t txl_lock; /* Lock for this list */
856 idl_t *txl_drain_list;
857 int txl_drain_index;
858};
859
860struct idl_s {
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700861 conn_t *idl_conn; /* Head of drain list */
862 kmutex_t idl_lock; /* Lock for this list */
Venugopal Iyerae6aa222009-02-17 01:31:30 -0800863 idl_tx_list_t *idl_itl;
864};
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700865
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700866/*
867 * Interface route structure which holds the necessary information to recreate
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800868 * routes that are tied to an interface i.e. have ire_ill set.
869 *
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700870 * These routes which were initially created via a routing socket or via the
871 * SIOCADDRT ioctl may be gateway routes (RTF_GATEWAY being set) or may be
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800872 * traditional interface routes. When an ill comes back up after being
873 * down, this information will be used to recreate the routes. These
874 * are part of an mblk_t chain that hangs off of the ILL (ill_saved_ire_mp).
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700875 */
876typedef struct ifrt_s {
877 ushort_t ifrt_type; /* Type of IRE */
878 in6_addr_t ifrt_v6addr; /* Address IRE represents. */
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800879 in6_addr_t ifrt_v6gateway_addr; /* Gateway if IRE_OFFLINK */
880 in6_addr_t ifrt_v6setsrc_addr; /* Src addr if RTF_SETSRC */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700881 in6_addr_t ifrt_v6mask; /* Mask for matching IRE. */
882 uint32_t ifrt_flags; /* flags related to route */
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800883 iulp_t ifrt_metrics; /* Routing socket metrics */
884 zoneid_t ifrt_zoneid; /* zoneid for route */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700885} ifrt_t;
886
887#define ifrt_addr V4_PART_OF_V6(ifrt_v6addr)
888#define ifrt_gateway_addr V4_PART_OF_V6(ifrt_v6gateway_addr)
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700889#define ifrt_mask V4_PART_OF_V6(ifrt_v6mask)
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800890#define ifrt_setsrc_addr V4_PART_OF_V6(ifrt_v6setsrc_addr)
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700891
/* Number of IP addresses that can be hosted on a physical interface */
#define	MAX_ADDRS_PER_IF		8192

/*
 * Number of source addresses to be considered for source address
 * selection.  Used by ipif_select_source_v4/v6.
 */
#define	MAX_IPIF_SELECT_SOURCE		50
899
#ifdef IP_DEBUG
/*
 * Structures used to trace refholds and refreles for debugging.
 */
#define	TR_STACK_DEPTH	14	/* entries in tr_buf_t's tr_stack */
#define	TR_BUF_MAX	38	/* entries in th_trace_t's th_trbuf */

typedef struct tr_buf_s {
	int		tr_depth;
	clock_t		tr_time;
	pc_t		tr_stack[TR_STACK_DEPTH];
} tr_buf_t;

typedef struct th_trace_s {
	int		th_refcnt;
	uint_t		th_trace_lastref;
	kthread_t	*th_id;
	tr_buf_t	th_trbuf[TR_BUF_MAX];
} th_trace_t;

typedef struct th_hash_s {
	list_node_t	thh_link;
	mod_hash_t	*thh_hash;
	ip_stack_t	*thh_ipst;
} th_hash_t;
#endif
925
/* Values for ipif_state_flags */
#define	IPIF_CONDEMNED		0x1	/* The ipif is being removed */
#define	IPIF_CHANGING		0x2	/* A critical ipif field is changing */
#define	IPIF_SET_LINKLOCAL	0x10	/* transient flag during bringup */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700930
931/* IP interface structure, one per local address */
932typedef struct ipif_s {
933 struct ipif_s *ipif_next;
934 struct ill_s *ipif_ill; /* Back pointer to our ill */
935 int ipif_id; /* Logical unit number */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700936 in6_addr_t ipif_v6lcl_addr; /* Local IP address for this if. */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700937 in6_addr_t ipif_v6subnet; /* Subnet prefix for this if. */
938 in6_addr_t ipif_v6net_mask; /* Net mask for this interface. */
939 in6_addr_t ipif_v6brd_addr; /* Broadcast addr for this interface. */
940 in6_addr_t ipif_v6pp_dst_addr; /* Point-to-point dest address. */
941 uint64_t ipif_flags; /* Interface flags. */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700942 uint_t ipif_ire_type; /* IRE_LOCAL or IRE_LOOPBACK */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700943
944 /*
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800945 * The packet count in the ipif contain the sum of the
946 * packet counts in dead IRE_LOCAL/LOOPBACK for this ipif.
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700947 */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700948 uint_t ipif_ib_pkt_count; /* Inbound packets for our dead IREs */
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800949
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700950 /* Exclusive bit fields, protected by ipsq_t */
951 unsigned int
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700952 ipif_was_up : 1, /* ipif was up before */
carlsonj69bb4bb2006-08-14 14:10:48 -0700953 ipif_addr_ready : 1, /* DAD is done */
954 ipif_was_dup : 1, /* DAD had failed */
Sowmini Varadhan3efde6d2009-05-07 20:59:19 -0400955 ipif_added_nce : 1, /* nce added for local address */
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800956
957 ipif_pad_to_31 : 28;
958
959 ilm_t *ipif_allhosts_ilm; /* For all-nodes join */
960 ilm_t *ipif_solmulti_ilm; /* For IPv6 solicited multicast join */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700961
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700962 uint_t ipif_seqid; /* unique index across all ills */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700963 uint_t ipif_state_flags; /* See IPIF_* flag defs above */
964 uint_t ipif_refcnt; /* active consistent reader cnt */
sowmini968d2fd2008-03-21 06:08:04 -0700965
sowmini968d2fd2008-03-21 06:08:04 -0700966 zoneid_t ipif_zoneid; /* zone ID number */
carlsonj69bb4bb2006-08-14 14:10:48 -0700967 timeout_id_t ipif_recovery_id; /* Timer for DAD recovery */
carlsonj6a8288c2007-09-11 04:26:06 -0700968 boolean_t ipif_trace_disable; /* True when alloc fails */
meeme11c3f42009-01-06 20:16:25 -0500969 /*
970 * For an IPMP interface, ipif_bound_ill tracks the ill whose hardware
971 * information this ipif is associated with via ARP/NDP. We can use
972 * an ill pointer (rather than an index) because only ills that are
973 * part of a group will be pointed to, and an ill cannot disappear
974 * while it's in a group.
975 */
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800976 struct ill_s *ipif_bound_ill;
977 struct ipif_s *ipif_bound_next; /* bound ipif chain */
978 boolean_t ipif_bound; /* B_TRUE if we successfully bound */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700979
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800980 struct ire_s *ipif_ire_local; /* Our IRE_LOCAL or LOOPBACK */
Erik Nordmark0e0e37a2009-11-17 11:42:22 -0800981 struct ire_s *ipif_ire_if; /* Our IRE_INTERFACE */
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800982} ipif_t;
sowmini968d2fd2008-03-21 06:08:04 -0700983
984/*
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -0700985 * The following table lists the protection levels of the various members
986 * of the ipif_t. The following notation is used.
987 *
988 * Write once - Written to only once at the time of bringing up
989 * the interface and can be safely read after the bringup without any lock.
990 *
991 * ipsq - Need to execute in the ipsq to perform the indicated access.
992 *
993 * ill_lock - Need to hold this mutex to perform the indicated access.
994 *
995 * ill_g_lock - Need to hold this rw lock as reader/writer for read access or
996 * write access respectively.
997 *
 * down ill - Written to only when the ill is down (i.e. all ipifs are down)
999 * up ill - Read only when the ill is up (i.e. at least 1 ipif is up)
1000 *
1001 * Table of ipif_t members and their protection
1002 *
meem8df01f72007-05-30 16:02:35 -07001003 * ipif_next ipsq + ill_lock + ipsq OR ill_lock OR
1004 * ill_g_lock ill_g_lock
meemb051ecf2006-12-27 21:32:46 -08001005 * ipif_ill ipsq + down ipif write once
1006 * ipif_id ipsq + down ipif write once
meemb051ecf2006-12-27 21:32:46 -08001007 * ipif_v6lcl_addr ipsq + down ipif up ipif
meemb051ecf2006-12-27 21:32:46 -08001008 * ipif_v6subnet ipsq + down ipif up ipif
1009 * ipif_v6net_mask ipsq + down ipif up ipif
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001010 *
1011 * ipif_v6brd_addr
1012 * ipif_v6pp_dst_addr
1013 * ipif_flags ill_lock ill_lock
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001014 * ipif_ire_type ipsq + down ill up ill
1015 *
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001016 * ipif_ib_pkt_count Approx
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001017 *
1018 * bit fields ill_lock ill_lock
1019 *
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001020 * ipif_allhosts_ilm ipsq ipsq
1021 * ipif_solmulti_ilm ipsq ipsq
1022 *
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001023 * ipif_seqid ipsq Write once
1024 *
1025 * ipif_state_flags ill_lock ill_lock
1026 * ipif_refcnt ill_lock ill_lock
meeme11c3f42009-01-06 20:16:25 -05001027 * ipif_bound_ill ipsq + ipmp_lock ipsq OR ipmp_lock
1028 * ipif_bound_next ipsq ipsq
1029 * ipif_bound ipsq ipsq
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001030 *
1031 * ipif_ire_local ipsq + ips_ill_g_lock ipsq OR ips_ill_g_lock
Erik Nordmark0e0e37a2009-11-17 11:42:22 -08001032 * ipif_ire_if ipsq + ips_ill_g_lock ipsq OR ips_ill_g_lock
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001033 */
1034
/*
 * Return values from ip_laddr_verify_{v4,v6}
 */
typedef enum {
	IPVL_UNICAST_UP,
	IPVL_UNICAST_DOWN,
	IPVL_MCAST,
	IPVL_BCAST,
	IPVL_BAD
} ip_laddr_t;
1040
1041
/*
 * Hash `tid' to a bucket index: the value is cast to uintptr_t, shifted
 * right by 6 and masked with (IP_TR_HASH_MAX - 1), so the result never
 * exceeds IP_TR_HASH_MAX - 1.  The mask reaches every bucket only if
 * IP_TR_HASH_MAX is a power of two.  `tid' is parenthesized so that an
 * expression argument is cast as a whole rather than operand by operand.
 */
#define	IP_TR_HASH(tid)	((((uintptr_t)(tid)) >> 6) & (IP_TR_HASH_MAX - 1))
1043
/*
 * Reference tracing hooks.  On DEBUG builds they call the matching
 * *_trace_ref()/*_untrace_ref() routine; otherwise they expand to nothing
 * and the argument is not evaluated.
 */
#if defined(DEBUG)
#define	IPIF_TRACE_REF(ipif)	ipif_trace_ref(ipif)
#define	ILL_TRACE_REF(ill)	ill_trace_ref(ill)
#define	IPIF_UNTRACE_REF(ipif)	ipif_untrace_ref(ipif)
#define	ILL_UNTRACE_REF(ill)	ill_untrace_ref(ill)
#else	/* !DEBUG */
#define	IPIF_TRACE_REF(ipif)
#define	ILL_TRACE_REF(ill)
#define	IPIF_UNTRACE_REF(ipif)
#define	ILL_UNTRACE_REF(ill)
#endif	/* DEBUG */
1055
/* IPv4 compatibility macros: V4_PART_OF_V6() views of the ipif addresses */
#define	ipif_lcl_addr		V4_PART_OF_V6(ipif_v6lcl_addr)
#define	ipif_subnet		V4_PART_OF_V6(ipif_v6subnet)
#define	ipif_net_mask		V4_PART_OF_V6(ipif_v6net_mask)
#define	ipif_brd_addr		V4_PART_OF_V6(ipif_v6brd_addr)
#define	ipif_pp_dst_addr	V4_PART_OF_V6(ipif_v6pp_dst_addr)

/* Easy backreference from an ipif to its ill's ill_isv6. */
#define	ipif_isv6		ipif_ill->ill_isv6
1065
/* Index of SIOCLIFADDR in the ndx ioctl table */
#define	SIOCLIFADDR_NDX	112

/*
 * Mode values passed to ip_ioctl_finish when finishing an ioctl.
 */
#define	CONN_CLOSE	1	/* No mi_copy */
#define	COPYOUT		2	/* do an mi_copyout if needed */
#define	NO_COPYOUT	3	/* do an mi_copy_done */

/* COPYOUT when IPI_GET_CMD is set in ipi_flags, NO_COPYOUT otherwise. */
#define	IPI2MODE(ipi)	\
	((((ipi)->ipi_flags & IPI_GET_CMD) != 0) ? COPYOUT : NO_COPYOUT)
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001075
1076/*
meeme11c3f42009-01-06 20:16:25 -05001077 * The IP-MT design revolves around the serialization objects ipsq_t (IPSQ)
1078 * and ipxop_t (exclusive operation or "xop"). Becoming "writer" on an IPSQ
1079 * ensures that no other threads can become "writer" on any IPSQs sharing that
1080 * IPSQ's xop until the writer thread is done.
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001081 *
meeme11c3f42009-01-06 20:16:25 -05001082 * Each phyint points to one IPSQ that remains fixed over the phyint's life.
1083 * Each IPSQ points to one xop that can change over the IPSQ's life. If a
1084 * phyint is *not* in an IPMP group, then its IPSQ will refer to the IPSQ's
1085 * "own" xop (ipsq_ownxop). If a phyint *is* part of an IPMP group, then its
1086 * IPSQ will refer to the "group" xop, which is shorthand for the xop of the
1087 * IPSQ of the IPMP meta-interface's phyint. Thus, all phyints that are part
1088 * of the same IPMP group will have their IPSQ's point to the group xop, and
1089 * thus becoming "writer" on any phyint in the group will prevent any other
1090 * writer on any other phyint in the group. All IPSQs sharing the same xop
1091 * are chained together through ipsq_next (in the degenerate common case,
1092 * ipsq_next simply refers to itself). Note that the group xop is guaranteed
1093 * to exist at least as long as there are members in the group, since the IPMP
1094 * meta-interface can only be destroyed if the group is empty.
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001095 *
meeme11c3f42009-01-06 20:16:25 -05001096 * Incoming exclusive operation requests are enqueued on the IPSQ they arrived
1097 * on rather than the xop. This makes switching xop's (as would happen when a
1098 * phyint leaves an IPMP group) simple, because after the phyint leaves the
1099 * group, any operations enqueued on its IPSQ can be safely processed with
1100 * respect to its new xop, and any operations enqueued on the IPSQs of its
1101 * former group can be processed with respect to their existing group xop.
1102 * Even so, switching xops is a subtle dance; see ipsq_dq() for details.
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001103 *
 * An IPSQ's "own" xop is embedded within the IPSQ itself since they have
 * identical lifetimes, and because doing so simplifies pointer management.
 * While each phyint and IPSQ point to each other, it is not possible to free
 * the IPSQ when the phyint is freed, since we may still be *inside* the IPSQ
 * when the phyint is being freed.  Thus, ipsq_phyint is set to NULL when the
 * phyint is freed, and the IPSQ free is later done in ipsq_exit().
1110 *
1111 * ipsq_t synchronization: read write
1112 *
1113 * ipsq_xopq_mphead ipx_lock ipx_lock
1114 * ipsq_xopq_mptail ipx_lock ipx_lock
 *	ipsq_switch_mp		ipsq_lock		ipsq_lock
1116 * ipsq_phyint write once write once
1117 * ipsq_next RW_READER ill_g_lock RW_WRITER ill_g_lock
Toomas Soome8a06b3d2018-10-15 22:13:16 +03001118 * ipsq_xop ipsq_lock or ipsq ipsq_lock + ipsq
meeme11c3f42009-01-06 20:16:25 -05001119 * ipsq_swxop ipsq ipsq
Toomas Soome8a06b3d2018-10-15 22:13:16 +03001120 * ipsq_ownxop see ipxop_t see ipxop_t
meeme11c3f42009-01-06 20:16:25 -05001121 * ipsq_ipst write once write once
1122 *
1123 * ipxop_t synchronization: read write
1124 *
Toomas Soome8a06b3d2018-10-15 22:13:16 +03001125 * ipx_writer ipx_lock ipx_lock
1126 * ipx_xop_queued ipx_lock ipx_lock
meeme11c3f42009-01-06 20:16:25 -05001127 * ipx_mphead ipx_lock ipx_lock
1128 * ipx_mptail ipx_lock ipx_lock
1129 * ipx_ipsq write once write once
 *	ipx_ipsq_queued		ipx_lock		ipx_lock
1131 * ipx_waitfor ipsq or ipx_lock ipsq + ipx_lock
1132 * ipx_reentry_cnt ipsq or ipx_lock ipsq + ipx_lock
1133 * ipx_current_done ipsq ipsq
1134 * ipx_current_ioctl ipsq ipsq
1135 * ipx_current_ipif ipsq or ipx_lock ipsq + ipx_lock
1136 * ipx_pending_ipif ipsq or ipx_lock ipsq + ipx_lock
1137 * ipx_pending_mp ipsq or ipx_lock ipsq + ipx_lock
1138 * ipx_forced ipsq ipsq
1139 * ipx_depth ipsq ipsq
1140 * ipx_stack ipsq ipsq
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001141 */
meeme11c3f42009-01-06 20:16:25 -05001142typedef struct ipxop_s {
1143 kmutex_t ipx_lock; /* see above */
Toomas Soome8a06b3d2018-10-15 22:13:16 +03001144 kthread_t *ipx_writer; /* current owner */
meeme11c3f42009-01-06 20:16:25 -05001145 mblk_t *ipx_mphead; /* messages tied to this op */
1146 mblk_t *ipx_mptail;
1147 struct ipsq_s *ipx_ipsq; /* associated ipsq */
1148 boolean_t ipx_ipsq_queued; /* ipsq using xop has queued op */
1149 int ipx_waitfor; /* waiting; values encoded below */
1150 int ipx_reentry_cnt;
1151 boolean_t ipx_current_done; /* is the current operation done? */
1152 int ipx_current_ioctl; /* current ioctl, or 0 if no ioctl */
1153 ipif_t *ipx_current_ipif; /* ipif for current op */
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001154 ipif_t *ipx_pending_ipif; /* ipif for ipx_pending_mp */
Toomas Soome8a06b3d2018-10-15 22:13:16 +03001155 mblk_t *ipx_pending_mp; /* current ioctl mp while waiting */
1156 boolean_t ipx_forced; /* debugging aid */
carlsonj6a8288c2007-09-11 04:26:06 -07001157#ifdef DEBUG
meeme11c3f42009-01-06 20:16:25 -05001158 int ipx_depth; /* debugging aid */
1159#define IPX_STACK_DEPTH 15
1160 pc_t ipx_stack[IPX_STACK_DEPTH]; /* debugging aid */
carlsonj6a8288c2007-09-11 04:26:06 -07001161#endif
meeme11c3f42009-01-06 20:16:25 -05001162} ipxop_t;
1163
1164typedef struct ipsq_s {
1165 kmutex_t ipsq_lock; /* see above */
1166 mblk_t *ipsq_switch_mp; /* op to handle right after switch */
1167 mblk_t *ipsq_xopq_mphead; /* list of excl ops (mostly ioctls) */
1168 mblk_t *ipsq_xopq_mptail;
1169 struct phyint *ipsq_phyint; /* associated phyint */
1170 struct ipsq_s *ipsq_next; /* next ipsq sharing ipsq_xop */
1171 struct ipxop_s *ipsq_xop; /* current xop synchronization info */
1172 struct ipxop_s *ipsq_swxop; /* switch xop to on ipsq_exit() */
1173 struct ipxop_s ipsq_ownxop; /* our own xop (may not be in-use) */
1174 ip_stack_t *ipsq_ipst; /* does not have a netstack_hold */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001175} ipsq_t;
1176
/*
 * Values stored in ipx_waitfor.
 */
enum {
	/* ipif_down() waiting for refcnts to drop */
	IPIF_DOWN = 1,
	/* ill_down() waiting for refcnts to drop */
	ILL_DOWN,
	/* ipif_free() waiting for refcnts to drop */
	IPIF_FREE,
	/* ill unplumb waiting for refcnts to drop */
	ILL_FREE
};
1186
/* Operation types accepted by ipsq_try_enter() */
#define	CUR_OP		0	/* request writer within current operation */
#define	NEW_OP		1	/* request writer for a new operation */
#define	SWITCH_OP	2	/* request writer once IPSQ XOP switches */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001191
/*
 * Kstats tracked on each IPMP meta-interface.  The order of the
 * enumerators must match ipmp_kstats[] in ip/ipmp.c.
 */
enum {
	IPMP_KSTAT_OBYTES,
	IPMP_KSTAT_OBYTES64,
	IPMP_KSTAT_RBYTES,
	IPMP_KSTAT_RBYTES64,
	IPMP_KSTAT_OPACKETS,
	IPMP_KSTAT_OPACKETS64,
	IPMP_KSTAT_OERRORS,
	IPMP_KSTAT_IPACKETS,
	IPMP_KSTAT_IPACKETS64,
	IPMP_KSTAT_IERRORS,
	IPMP_KSTAT_MULTIRCV,
	IPMP_KSTAT_MULTIXMT,
	IPMP_KSTAT_BRDCSTRCV,
	IPMP_KSTAT_BRDCSTXMT,
	IPMP_KSTAT_LINK_UP,
	IPMP_KSTAT_MAX			/* keep last */
};
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001204
1205/*
1206 * phyint represents state that is common to both IPv4 and IPv6 interfaces.
1207 * There is a separate ill_t representing IPv4 and IPv6 which has a
meem79242222008-07-29 18:39:05 -07001208 * backpointer to the phyint structure for accessing common state.
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001209 */
1210typedef struct phyint {
1211 struct ill_s *phyint_illv4;
1212 struct ill_s *phyint_illv6;
meeme11c3f42009-01-06 20:16:25 -05001213 uint_t phyint_ifindex; /* SIOCSLIFINDEX */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001214 uint64_t phyint_flags;
1215 avl_node_t phyint_avl_by_index; /* avl tree by index */
1216 avl_node_t phyint_avl_by_name; /* avl tree by name */
1217 kmutex_t phyint_lock;
1218 struct ipsq_s *phyint_ipsq; /* back pointer to ipsq */
meeme11c3f42009-01-06 20:16:25 -05001219 struct ipmp_grp_s *phyint_grp; /* associated IPMP group */
1220 char phyint_name[LIFNAMSIZ]; /* physical interface name */
1221 uint64_t phyint_kstats0[IPMP_KSTAT_MAX]; /* baseline kstats */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001222} phyint_t;
1223
1224#define CACHE_ALIGN_SIZE 64
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001225#define CACHE_ALIGN(align_struct) P2ROUNDUP(sizeof (struct align_struct),\
1226 CACHE_ALIGN_SIZE)
1227struct _phyint_list_s_ {
1228 avl_tree_t phyint_list_avl_by_index; /* avl tree by index */
1229 avl_tree_t phyint_list_avl_by_name; /* avl tree by name */
1230};
1231
1232typedef union phyint_list_u {
1233 struct _phyint_list_s_ phyint_list_s;
1234 char phyint_list_filler[CACHE_ALIGN(_phyint_list_s_)];
1235} phyint_list_t;
1236
1237#define phyint_list_avl_by_index phyint_list_s.phyint_list_avl_by_index
1238#define phyint_list_avl_by_name phyint_list_s.phyint_list_avl_by_name
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001239
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001240/*
1241 * Fragmentation hash bucket
1242 */
1243typedef struct ipfb_s {
1244 struct ipf_s *ipfb_ipf; /* List of ... */
1245 size_t ipfb_count; /* Count of bytes used by frag(s) */
1246 kmutex_t ipfb_lock; /* Protect all ipf in list */
1247 uint_t ipfb_frag_pkts; /* num of distinct fragmented pkts */
1248} ipfb_t;
1249
1250/*
1251 * IRE bucket structure. Usually there is an array of such structures,
1252 * each pointing to a linked list of ires. irb_refcnt counts the number
1253 * of walkers of a given hash bucket. Usually the reference count is
1254 * bumped up if the walker wants no IRES to be DELETED while walking the
1255 * list. Bumping up does not PREVENT ADDITION. This allows walking a given
1256 * hash bucket without stumbling up on a free pointer.
sangeetac793af92006-08-11 05:59:29 -07001257 *
1258 * irb_t structures in ip_ftable are dynamically allocated and freed.
apersson31736642006-12-19 17:33:00 -08001259 * In order to identify the irb_t structures that can be safely kmem_free'd
sangeetac793af92006-08-11 05:59:29 -07001260 * we need to ensure that
1261 * - the irb_refcnt is quiescent, indicating no other walkers,
1262 * - no other threads or ire's are holding references to the irb,
1263 * i.e., irb_nire == 0,
1264 * - there are no active ire's in the bucket, i.e., irb_ire_cnt == 0
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001265 */
1266typedef struct irb {
1267 struct ire_s *irb_ire; /* First ire in this bucket */
1268 /* Should be first in this struct */
1269 krwlock_t irb_lock; /* Protect this bucket */
1270 uint_t irb_refcnt; /* Protected by irb_lock */
1271 uchar_t irb_marks; /* CONDEMNED ires in this bucket ? */
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001272#define IRB_MARK_CONDEMNED 0x0001 /* Contains some IRE_IS_CONDEMNED */
1273#define IRB_MARK_DYNAMIC 0x0002 /* Dynamically allocated */
1274 /* Once IPv6 uses radix then IRB_MARK_DYNAMIC will be always be set */
sangeetac793af92006-08-11 05:59:29 -07001275 uint_t irb_ire_cnt; /* Num of active IRE in this bucket */
sangeetac793af92006-08-11 05:59:29 -07001276 int irb_nire; /* Num of ftable ire's that ref irb */
dh155122f4b3ec62007-01-19 16:59:38 -08001277 ip_stack_t *irb_ipst; /* Does not have a netstack_hold */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001278} irb_t;
1279
Ravi Chandra Nallan7f125a52010-07-13 18:17:30 +05301280/*
1281 * This is the structure used to store the multicast physical addresses
1282 * that an interface has joined.
1283 * The refcnt keeps track of the number of multicast IP addresses mapping
1284 * to a physical multicast address.
1285 */
1286typedef struct multiphysaddr_s {
1287 struct multiphysaddr_s *mpa_next;
1288 char mpa_addr[IP_MAX_HW_LEN];
1289 int mpa_refcnt;
1290} multiphysaddr_t;
1291
/*
 * Given a pointer to the rt_irb member of an rt_t, return a pointer to the
 * rt_t containing it, by subtracting offsetof(rt_t, rt_irb).  The expansion
 * is fully parenthesized so that the cast applies before any following
 * `->', as in IRB2RT(irb)->member.
 */
#define	IRB2RT(irb)	((rt_t *)((caddr_t)(irb) - offsetof(rt_t, rt_irb)))
1293
/*
 * Forward declarations.  Each typedef also declares its (incomplete)
 * struct tag.
 */
typedef struct dce_s		dce_t;
typedef struct ire_s		ire_t;
typedef struct ncec_s		ncec_t;
typedef struct nce_s		nce_t;
typedef struct ip_recv_attr_s	ip_recv_attr_t;
typedef struct ip_xmit_attr_s	ip_xmit_attr_t;

typedef struct tsol_ire_gw_secattr_s	tsol_ire_gw_secattr_t;
1310
1311/*
1312 * This is a structure for a one-element route cache that is passed
1313 * by reference between ip_input and ill_inputfn.
1314 */
1315typedef struct {
1316 ire_t *rtc_ire;
1317 ipaddr_t rtc_ipaddr;
1318 in6_addr_t rtc_ip6addr;
1319} rtc_t;
1320
/*
 * Note: Temporarily use 64 bits, and will probably go back to 32 bits after
 * more cleanup work is done.
 */
typedef uint64_t iaflags_t;

1327/* The ill input function pointer type */
1328typedef void (*pfillinput_t)(mblk_t *, void *, void *, ip_recv_attr_t *,
1329 rtc_t *);
1330
1331/* The ire receive function pointer type */
1332typedef void (*pfirerecv_t)(ire_t *, mblk_t *, void *, ip_recv_attr_t *);
1333
1334/* The ire send and postfrag function pointer types */
1335typedef int (*pfiresend_t)(ire_t *, mblk_t *, void *,
1336 ip_xmit_attr_t *, uint32_t *);
1337typedef int (*pfirepostfrag_t)(mblk_t *, nce_t *, iaflags_t, uint_t, uint32_t,
1338 zoneid_t, zoneid_t, uintptr_t *);
1339
sangeetac793af92006-08-11 05:59:29 -07001340
/* Indices of the per-IP-version ill list heads. */
#define	IP_V4_G_HEAD	0
#define	IP_V6_G_HEAD	1

/* Number of list heads (one per IP version above). */
#define	MAX_G_HEADS	2

1346/*
1347 * unpadded ill_if structure
1348 */
Toomas Soome8a06b3d2018-10-15 22:13:16 +03001349struct _ill_if_s_ {
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001350 union ill_if_u *illif_next;
1351 union ill_if_u *illif_prev;
1352 avl_tree_t illif_avl_by_ppa; /* AVL tree sorted on ppa */
1353 vmem_t *illif_ppa_arena; /* ppa index space */
1354 uint16_t illif_mcast_v1; /* hints for */
1355 uint16_t illif_mcast_v2; /* [igmp|mld]_slowtimo */
1356 int illif_name_len; /* name length */
1357 char illif_name[LIFNAMSIZ]; /* name of interface type */
1358};
1359
1360/* cache aligned ill_if structure */
Toomas Soome8a06b3d2018-10-15 22:13:16 +03001361typedef union ill_if_u {
1362 struct _ill_if_s_ ill_if_s;
1363 char illif_filler[CACHE_ALIGN(_ill_if_s_)];
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001364} ill_if_t;
1365
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001366#define illif_next ill_if_s.illif_next
1367#define illif_prev ill_if_s.illif_prev
1368#define illif_avl_by_ppa ill_if_s.illif_avl_by_ppa
1369#define illif_ppa_arena ill_if_s.illif_ppa_arena
1370#define illif_mcast_v1 ill_if_s.illif_mcast_v1
1371#define illif_mcast_v2 ill_if_s.illif_mcast_v2
1372#define illif_name ill_if_s.illif_name
1373#define illif_name_len ill_if_s.illif_name_len
1374
/* State kept across the steps of a walk over the ill lists. */
typedef struct ill_walk_context_s {
	int	ctx_current_list; /* current list being searched */
	int	ctx_last_list;	 /* last list to search */
} ill_walk_context_t;

1380/*
dh155122f4b3ec62007-01-19 16:59:38 -08001381 * ill_g_heads structure, one for IPV4 and one for IPV6
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001382 */
1383struct _ill_g_head_s_ {
1384 ill_if_t *ill_g_list_head;
1385 ill_if_t *ill_g_list_tail;
1386};
1387
1388typedef union ill_g_head_u {
1389 struct _ill_g_head_s_ ill_g_head_s;
1390 char ill_g_head_filler[CACHE_ALIGN(_ill_g_head_s_)];
1391} ill_g_head_t;
1392
1393#define ill_g_list_head ill_g_head_s.ill_g_list_head
1394#define ill_g_list_tail ill_g_head_s.ill_g_list_tail
1395
/* Head of the ill list for IPv4, IPv6, or list index `i' in stack `ipst'. */
#define	IP_V4_ILL_G_LIST(ipst)	\
	(ipst)->ips_ill_g_heads[IP_V4_G_HEAD].ill_g_list_head
#define	IP_V6_ILL_G_LIST(ipst)	\
	(ipst)->ips_ill_g_heads[IP_V6_G_HEAD].ill_g_list_head
#define	IP_VX_ILL_G_LIST(i, ipst)	\
	(ipst)->ips_ill_g_heads[i].ill_g_list_head

/*
 * Start an ill walk by calling ill_first() with the first and last list
 * index for the IPv4 list, the IPv6 list, or (passing MAX_G_HEADS for both)
 * all lists.
 */
#define	ILL_START_WALK_V4(ctx_ptr, ipst)	\
	ill_first(IP_V4_G_HEAD, IP_V4_G_HEAD, ctx_ptr, ipst)
#define	ILL_START_WALK_V6(ctx_ptr, ipst)	\
	ill_first(IP_V6_G_HEAD, IP_V6_G_HEAD, ctx_ptr, ipst)
#define	ILL_START_WALK_ALL(ctx_ptr, ipst)	\
	ill_first(MAX_G_HEADS, MAX_G_HEADS, ctx_ptr, ipst)

/*
 * Capabilities, possible flags for ill_capabilities.
 */
#define	ILL_CAPAB_LSO		0x04		/* Large Send Offload */
#define	ILL_CAPAB_HCKSUM	0x08		/* Hardware checksumming */
#define	ILL_CAPAB_ZEROCOPY	0x10		/* Zero-copy */
#define	ILL_CAPAB_DLD		0x20		/* DLD capabilities */
#define	ILL_CAPAB_DLD_POLL	0x40		/* Polling */
#define	ILL_CAPAB_DLD_DIRECT	0x80		/* Direct function call */

/*
 * Per-ill Hardware Checksumming capabilities.
 */
typedef struct ill_hcksum_capab_s ill_hcksum_capab_t;

/*
 * Per-ill Zero-copy capabilities.
 */
typedef struct ill_zerocopy_capab_s ill_zerocopy_capab_t;

/*
 * DLD capabilities.
 */
typedef struct ill_dld_capab_s ill_dld_capab_t;

/*
 * Per-ill polling resource map.
 */
typedef struct ill_rx_ring ill_rx_ring_t;

/*
 * Per-ill Large Send Offload capabilities.
 */
typedef struct ill_lso_capab_s ill_lso_capab_t;

/* The following are ill_state_flags */
#define	ILL_LL_SUBNET_PENDING	0x01	/* Waiting for DL_INFO_ACK from drv */
#define	ILL_CONDEMNED		0x02	/* No more new ref's to the ILL */
#define	ILL_DL_UNBIND_IN_PROGRESS	0x04	/* UNBIND_REQ is sent */
/*
 * ILL_DOWN_IN_PROGRESS is set to ensure the following:
 * - no packets are sent to the driver after the DL_UNBIND_REQ is sent,
 * - no longstanding references will be acquired on objects that are being
 *   brought down.
 */
#define	ILL_DOWN_IN_PROGRESS	0x08

/* Is this an ILL whose source address is used by other ILL's ? */
#define	IS_USESRC_ILL(ill)			\
	(((ill)->ill_usesrc_ifindex == 0) &&	\
	((ill)->ill_usesrc_grp_next != NULL))

/* Is this a client/consumer of the usesrc ILL ? */
#define	IS_USESRC_CLI_ILL(ill)			\
	(((ill)->ill_usesrc_ifindex != 0) &&	\
	((ill)->ill_usesrc_grp_next != NULL))

/* Is this a virtual network interface (vni) ILL ? */
#define	IS_VNI(ill)							\
	(((ill)->ill_phyint->phyint_flags & (PHYI_LOOPBACK|PHYI_VIRTUAL)) == \
	PHYI_VIRTUAL)

/* Is this a loopback ILL? */
#define	IS_LOOPBACK(ill) \
	((ill)->ill_phyint->phyint_flags & PHYI_LOOPBACK)

/* Is this an IPMP meta-interface ILL? */
#define	IS_IPMP(ill)							\
	((ill)->ill_phyint->phyint_flags & PHYI_IPMP)

/* Is this ILL under an IPMP meta-interface? (aka "in a group?") */
#define	IS_UNDER_IPMP(ill)						\
	((ill)->ill_grp != NULL && !IS_IPMP(ill))

/* Is ill1 in the same illgrp as ill2? */
#define	IS_IN_SAME_ILLGRP(ill1, ill2)					\
	((ill1)->ill_grp != NULL && ((ill1)->ill_grp == (ill2)->ill_grp))

/* Is ill1 on the same LAN as ill2? */
#define	IS_ON_SAME_LAN(ill1, ill2)					\
	((ill1) == (ill2) || IS_IN_SAME_ILLGRP(ill1, ill2))

/* The ill of the other IP version that hangs off the same phyint. */
#define	ILL_OTHER(ill)							\
	((ill)->ill_isv6 ? (ill)->ill_phyint->phyint_illv4 :		\
	    (ill)->ill_phyint->phyint_illv6)

1496/*
1497 * IPMP group ILL state structure -- up to two per IPMP group (V4 and V6).
1498 * Created when the V4 and/or V6 IPMP meta-interface is I_PLINK'd. It is
1499 * guaranteed to persist while there are interfaces of that type in the group.
1500 * In general, most fields are accessed outside of the IPSQ (e.g., in the
1501 * datapath), and thus use locks in addition to the IPSQ for protection.
1502 *
1503 * synchronization: read write
1504 *
1505 * ig_if ipsq or ill_g_lock ipsq and ill_g_lock
1506 * ig_actif ipsq or ipmp_lock ipsq and ipmp_lock
1507 * ig_nactif ipsq or ipmp_lock ipsq and ipmp_lock
1508 * ig_next_ill ipsq or ipmp_lock ipsq and ipmp_lock
1509 * ig_ipmp_ill write once write once
1510 * ig_cast_ill ipsq or ipmp_lock ipsq and ipmp_lock
1511 * ig_arpent ipsq ipsq
1512 * ig_mtu ipsq ipsq
Erik Nordmark1eee1702010-08-16 15:30:54 -07001513 * ig_mc_mtu ipsq ipsq
meeme11c3f42009-01-06 20:16:25 -05001514 */
1515typedef struct ipmp_illgrp_s {
Toomas Soome8a06b3d2018-10-15 22:13:16 +03001516 list_t ig_if; /* list of all interfaces */
meeme11c3f42009-01-06 20:16:25 -05001517 list_t ig_actif; /* list of active interfaces */
1518 uint_t ig_nactif; /* number of active interfaces */
1519 struct ill_s *ig_next_ill; /* next active interface to use */
1520 struct ill_s *ig_ipmp_ill; /* backpointer to IPMP meta-interface */
1521 struct ill_s *ig_cast_ill; /* nominated ill for multi/broadcast */
1522 list_t ig_arpent; /* list of ARP entries */
Erik Nordmark1eee1702010-08-16 15:30:54 -07001523 uint_t ig_mtu; /* ig_ipmp_ill->ill_mtu */
1524 uint_t ig_mc_mtu; /* ig_ipmp_ill->ill_mc_mtu */
meeme11c3f42009-01-06 20:16:25 -05001525} ipmp_illgrp_t;
1526
1527/*
1528 * IPMP group state structure -- one per IPMP group. Created when the
1529 * IPMP meta-interface is plumbed; it is guaranteed to persist while there
1530 * are interfaces in it.
1531 *
1532 * ipmp_grp_t synchronization: read write
1533 *
1534 * gr_name ipmp_lock ipmp_lock
1535 * gr_ifname write once write once
1536 * gr_mactype ipmp_lock ipmp_lock
1537 * gr_phyint write once write once
1538 * gr_nif ipmp_lock ipmp_lock
1539 * gr_nactif ipsq ipsq
1540 * gr_v4 ipmp_lock ipmp_lock
1541 * gr_v6 ipmp_lock ipmp_lock
1542 * gr_nv4 ipmp_lock ipmp_lock
1543 * gr_nv6 ipmp_lock ipmp_lock
1544 * gr_pendv4 ipmp_lock ipmp_lock
1545 * gr_pendv6 ipmp_lock ipmp_lock
1546 * gr_linkdownmp ipsq ipsq
1547 * gr_ksp ipmp_lock ipmp_lock
1548 * gr_kstats0 atomic atomic
1549 */
1550typedef struct ipmp_grp_s {
1551 char gr_name[LIFGRNAMSIZ]; /* group name */
1552 char gr_ifname[LIFNAMSIZ]; /* interface name */
1553 t_uscalar_t gr_mactype; /* DLPI mactype of group */
1554 phyint_t *gr_phyint; /* IPMP group phyint */
1555 uint_t gr_nif; /* number of interfaces in group */
Toomas Soome8a06b3d2018-10-15 22:13:16 +03001556 uint_t gr_nactif; /* number of active interfaces */
meeme11c3f42009-01-06 20:16:25 -05001557 ipmp_illgrp_t *gr_v4; /* V4 group information */
1558 ipmp_illgrp_t *gr_v6; /* V6 group information */
1559 uint_t gr_nv4; /* number of ills in V4 group */
1560 uint_t gr_nv6; /* number of ills in V6 group */
Toomas Soome8a06b3d2018-10-15 22:13:16 +03001561 uint_t gr_pendv4; /* number of pending ills in V4 group */
1562 uint_t gr_pendv6; /* number of pending ills in V6 group */
meeme11c3f42009-01-06 20:16:25 -05001563 mblk_t *gr_linkdownmp; /* message used to bring link down */
1564 kstat_t *gr_ksp; /* group kstat pointer */
1565 uint64_t gr_kstats0[IPMP_KSTAT_MAX]; /* baseline group kstats */
1566} ipmp_grp_t;
1567
1568/*
1569 * IPMP ARP entry -- one per SIOCS*ARP entry tied to the group. Used to keep
1570 * ARP up-to-date as the active set of interfaces in the group changes.
1571 */
1572typedef struct ipmp_arpent_s {
Toomas Soome8a06b3d2018-10-15 22:13:16 +03001573 ipaddr_t ia_ipaddr; /* IP address for this entry */
1574 boolean_t ia_proxyarp; /* proxy ARP entry? */
1575 boolean_t ia_notified; /* ARP notified about this entry? */
1576 list_node_t ia_node; /* next ARP entry in list */
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001577 uint16_t ia_flags; /* nce_flags for the address */
1578 size_t ia_lladdr_len;
1579 uchar_t *ia_lladdr;
meeme11c3f42009-01-06 20:16:25 -05001580} ipmp_arpent_t;
1581
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001582struct arl_s;
1583
1584/*
1585 * Per-ill capabilities.
1586 */
1587struct ill_hcksum_capab_s {
1588 uint_t ill_hcksum_version; /* interface version */
1589 uint_t ill_hcksum_txflags; /* capabilities on transmit */
1590};
1591
1592struct ill_zerocopy_capab_s {
1593 uint_t ill_zerocopy_version; /* interface version */
1594 uint_t ill_zerocopy_flags; /* capabilities */
1595};
1596
1597struct ill_lso_capab_s {
1598 uint_t ill_lso_flags; /* capabilities */
Robert Mustacchi62366fb2020-04-01 15:30:20 +00001599 uint_t ill_lso_max_tcpv4; /* maximum size of payload */
1600 uint_t ill_lso_max_tcpv6; /* maximum size of payload */
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001601};
1602
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001603/*
1604 * IP Lower level Structure.
1605 * Instance data structure in ip_open when there is a device below us.
1606 */
1607typedef struct ill_s {
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001608 pfillinput_t ill_inputfn; /* Fast input function selector */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001609 ill_if_t *ill_ifptr; /* pointer to interface type */
1610 queue_t *ill_rq; /* Read queue. */
1611 queue_t *ill_wq; /* Write queue. */
1612
1613 int ill_error; /* Error value sent up by device. */
1614
1615 ipif_t *ill_ipif; /* Interface chain for this ILL. */
1616
1617 uint_t ill_ipif_up_count; /* Number of IPIFs currently up. */
1618 uint_t ill_max_frag; /* Max IDU from DLPI. */
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001619 uint_t ill_current_frag; /* Current IDU from DLPI. */
1620 uint_t ill_mtu; /* User-specified MTU; SIOCSLIFMTU */
Erik Nordmark1eee1702010-08-16 15:30:54 -07001621 uint_t ill_mc_mtu; /* MTU for multi/broadcast */
Girish Moodalbail6e91bba2010-03-26 17:53:11 -04001622 uint_t ill_metric; /* BSD if metric, for compatibility. */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001623 char *ill_name; /* Our name. */
carlsonj69bb4bb2006-08-14 14:10:48 -07001624 uint_t ill_ipif_dup_count; /* Number of duplicate addresses. */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001625 uint_t ill_name_length; /* Name length, incl. terminator. */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001626 uint_t ill_net_type; /* IRE_IF_RESOLVER/IRE_IF_NORESOLVER. */
1627 /*
1628 * Physical Point of Attachment num. If DLPI style 1 provider
1629 * then this is derived from the devname.
1630 */
1631 uint_t ill_ppa;
1632 t_uscalar_t ill_sap;
1633 t_scalar_t ill_sap_length; /* Including sign (for position) */
1634 uint_t ill_phys_addr_length; /* Excluding the sap. */
1635 uint_t ill_bcast_addr_length; /* Only set when the DL provider */
1636 /* supports broadcast. */
1637 t_uscalar_t ill_mactype;
1638 uint8_t *ill_frag_ptr; /* Reassembly state. */
1639 timeout_id_t ill_frag_timer_id; /* timeout id for the frag timer */
1640 ipfb_t *ill_frag_hash_tbl; /* Fragment hash list head. */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001641
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001642 krwlock_t ill_mcast_lock; /* Protects multicast state */
1643 kmutex_t ill_mcast_serializer; /* Serialize across ilg and ilm state */
meem79242222008-07-29 18:39:05 -07001644 ilm_t *ill_ilm; /* Multicast membership for ill */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001645 uint_t ill_global_timer; /* for IGMPv3/MLDv2 general queries */
1646 int ill_mcast_type; /* type of router which is querier */
1647 /* on this interface */
1648 uint16_t ill_mcast_v1_time; /* # slow timeouts since last v1 qry */
1649 uint16_t ill_mcast_v2_time; /* # slow timeouts since last v2 qry */
1650 uint8_t ill_mcast_v1_tset; /* 1 => timer is set; 0 => not set */
1651 uint8_t ill_mcast_v2_tset; /* 1 => timer is set; 0 => not set */
1652
1653 uint8_t ill_mcast_rv; /* IGMPv3/MLDv2 robustness variable */
1654 int ill_mcast_qi; /* IGMPv3/MLDv2 query interval var */
1655
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001656 /*
1657 * All non-NULL cells between 'ill_first_mp_to_free' and
1658 * 'ill_last_mp_to_free' are freed in ill_delete.
1659 */
1660#define ill_first_mp_to_free ill_bcast_mp
1661 mblk_t *ill_bcast_mp; /* DLPI header for broadcasts. */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001662 mblk_t *ill_unbind_mp; /* unbind mp from ill_dl_up() */
Philip Kirkb127ac42008-11-06 06:47:54 -05001663 mblk_t *ill_promiscoff_mp; /* for ill_leave_allmulti() */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001664 mblk_t *ill_dlpi_deferred; /* b_next chain of control messages */
Sebastien Roy2b24ab62009-09-22 22:04:45 -04001665 mblk_t *ill_dest_addr_mp; /* mblk which holds ill_dest_addr */
Cathy Zhou5d460ea2009-03-17 20:14:50 -07001666 mblk_t *ill_replumb_mp; /* replumb mp from ill_replumb() */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001667 mblk_t *ill_phys_addr_mp; /* mblk which holds ill_phys_addr */
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001668 mblk_t *ill_mcast_deferred; /* b_next chain of IGMP/MLD packets */
1669#define ill_last_mp_to_free ill_mcast_deferred
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001670
1671 cred_t *ill_credp; /* opener's credentials */
1672 uint8_t *ill_phys_addr; /* ill_phys_addr_mp->b_rptr + off */
Sebastien Roy2b24ab62009-09-22 22:04:45 -04001673 uint8_t *ill_dest_addr; /* ill_dest_addr_mp->b_rptr + off */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001674
1675 uint_t ill_state_flags; /* see ILL_* flags above */
1676
1677 /* Following bit fields protected by ipsq_t */
1678 uint_t
1679 ill_needs_attach : 1,
1680 ill_reserved : 1,
1681 ill_isv6 : 1,
1682 ill_dlpi_style_set : 1,
1683
1684 ill_ifname_pending : 1,
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001685 ill_logical_down : 1,
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001686 ill_dl_up : 1,
1687 ill_up_ipifs : 1,
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001688
carlsonj69bb4bb2006-08-14 14:10:48 -07001689 ill_note_link : 1, /* supports link-up notification */
yz1470648fb46f22007-09-21 07:56:36 -07001690 ill_capab_reneg : 1, /* capability renegotiation to be done */
Eric Chengda14ceb2008-12-04 18:16:10 -08001691 ill_dld_capab_inprog : 1, /* direct dld capab call in prog */
Philip Kirkb127ac42008-11-06 06:47:54 -05001692 ill_need_recover_multicast : 1,
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001693
1694 ill_replumbing : 1,
1695 ill_arl_dlpi_pending : 1,
Sowmini Varadhanf1c454b2010-01-11 10:29:23 -05001696 ill_grp_pending : 1,
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001697
Sowmini Varadhanf1c454b2010-01-11 10:29:23 -05001698 ill_pad_to_bit_31 : 17;
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001699
1700 /* Following bit fields protected by ill_lock */
1701 uint_t
1702 ill_fragtimer_executing : 1,
1703 ill_fragtimer_needrestart : 1,
Sebastien Roy2b24ab62009-09-22 22:04:45 -04001704 ill_manual_token : 1, /* system won't override ill_token */
Girish Moodalbail6e91bba2010-03-26 17:53:11 -04001705 /*
1706 * ill_manual_linklocal : system will not change the
1707 * linklocal whenever ill_token changes.
1708 */
1709 ill_manual_linklocal : 1,
Sebastien Roy2b24ab62009-09-22 22:04:45 -04001710
Sebastien Roy792bd772009-12-21 15:22:35 -05001711 ill_manual_dst_linklocal : 1, /* same for pt-pt dst linklocal */
1712
Dan McDonald42c5ef02019-02-22 14:42:52 -05001713 ill_mcast_ncec_cleanup : 1, /* Reaping mcast ncecs. */
1714 ill_pad_bit_31 : 26;
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001715
1716 /*
1717 * Used in SIOCSIFMUXID and SIOCGIFMUXID for 'ifconfig unplumb'.
1718 */
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001719 int ill_muxid; /* muxid returned from plink */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001720
georges0a5d9592008-05-30 10:00:54 -07001721 /* Used for IP frag reassembly throttling on a per ILL basis. */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001722 uint_t ill_ipf_gen; /* Generation of next fragment queue */
georges0a5d9592008-05-30 10:00:54 -07001723 uint_t ill_frag_count; /* Count of all reassembly mblk bytes */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001724 uint_t ill_frag_free_num_pkts; /* num of fragmented packets to free */
1725 clock_t ill_last_frag_clean_time; /* time when frag's were pruned */
1726 int ill_type; /* From <net/if_types.h> */
ja978904d876312006-10-10 06:05:36 -07001727 uint_t ill_dlpi_multicast_state; /* See below IDS_* */
1728 uint_t ill_dlpi_fastpath_state; /* See below IDS_* */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001729
1730 /*
1731 * Capabilities related fields.
1732 */
Eric Chengda14ceb2008-12-04 18:16:10 -08001733 uint_t ill_dlpi_capab_state; /* State of capability query, IDCS_* */
1734 uint_t ill_capab_pending_cnt;
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001735 uint64_t ill_capabilities; /* Enabled capabilities, ILL_CAPAB_* */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001736 ill_hcksum_capab_t *ill_hcksum_capab; /* H/W cksumming capabilities */
1737 ill_zerocopy_capab_t *ill_zerocopy_capab; /* Zero-copy capabilities */
Eric Chengda14ceb2008-12-04 18:16:10 -08001738 ill_dld_capab_t *ill_dld_capab; /* DLD capabilities */
1739 ill_lso_capab_t *ill_lso_capab; /* Large Segment Offload capabilities */
1740 mblk_t *ill_capab_reset_mp; /* Preallocated mblk for capab reset */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001741
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001742 uint8_t ill_max_hops; /* Maximum hops for any logical interface */
meeme11c3f42009-01-06 20:16:25 -05001743 uint_t ill_user_mtu; /* User-specified MTU via SIOCSLIFLNKINFO */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001744 uint32_t ill_reachable_time; /* Value for ND algorithm in msec */
1745 uint32_t ill_reachable_retrans_time; /* Value for ND algorithm msec */
1746 uint_t ill_max_buf; /* Max # of req to buffer for ND */
Sebastien Roy2b24ab62009-09-22 22:04:45 -04001747 in6_addr_t ill_token; /* IPv6 interface id */
1748 in6_addr_t ill_dest_token; /* Destination IPv6 interface id */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001749 uint_t ill_token_length;
1750 uint32_t ill_xmit_count; /* ndp max multicast xmits */
apersson31736642006-12-19 17:33:00 -08001751 mib2_ipIfStatsEntry_t *ill_ip_mib; /* ver indep. interface mib */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001752 mib2_ipv6IfIcmpEntry_t *ill_icmp6_mib; /* Per interface mib */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001753
1754 phyint_t *ill_phyint;
1755 uint64_t ill_flags;
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001756
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001757 kmutex_t ill_lock; /* Please see table below */
1758 /*
1759 * The ill_nd_lla* fields handle the link layer address option
1760 * from neighbor discovery. This is used for external IPv6
1761 * address resolution.
1762 */
1763 mblk_t *ill_nd_lla_mp; /* mblk which holds ill_nd_lla */
1764 uint8_t *ill_nd_lla; /* Link Layer Address */
1765 uint_t ill_nd_lla_len; /* Link Layer Address length */
1766 /*
Sebastien Roy2b24ab62009-09-22 22:04:45 -04001767 * We have 4 phys_addr_req's sent down. This field keeps track
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001768 * of which one is pending.
1769 */
1770 t_uscalar_t ill_phys_addr_pend; /* which dl_phys_addr_req pending */
1771 /*
1772 * Used to save errors that occur during plumbing
1773 */
1774 uint_t ill_ifname_pending_err;
1775 avl_node_t ill_avl_byppa; /* avl node based on ppa */
Dan McDonald42c5ef02019-02-22 14:42:52 -05001776 uint_t ill_mcast_nces; /* Number of NCEs that are multicast. */
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001777 list_t ill_nce; /* pointer to nce_s list */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001778 uint_t ill_refcnt; /* active refcnt by threads */
sowmini384ad172008-04-08 12:13:12 -07001779 uint_t ill_ire_cnt; /* ires associated with this ill */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001780 kcondvar_t ill_cv;
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001781 uint_t ill_ncec_cnt; /* ncecs associated with this ill */
sowmini384ad172008-04-08 12:13:12 -07001782 uint_t ill_nce_cnt; /* nces associated with this ill */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001783 uint_t ill_waiters; /* threads waiting in ipsq_enter */
1784 /*
1785 * Contains the upper read queue pointer of the module immediately
1786 * beneath IP. This field allows IP to validate sub-capability
1787 * acknowledgments coming up from downstream.
1788 */
1789 queue_t *ill_lmod_rq; /* read queue pointer of module below */
1790 uint_t ill_lmod_cnt; /* number of modules beneath IP */
1791 ip_m_t *ill_media; /* media specific params/functions */
1792 t_uscalar_t ill_dlpi_pending; /* Last DLPI primitive issued */
1793 uint_t ill_usesrc_ifindex; /* use src addr from this ILL */
1794 struct ill_s *ill_usesrc_grp_next; /* Next ILL in the usesrc group */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001795 boolean_t ill_trace_disable; /* True when alloc fails */
dh155122f4b3ec62007-01-19 16:59:38 -08001796 zoneid_t ill_zoneid;
1797 ip_stack_t *ill_ipst; /* Corresponds to a netstack_hold */
meeme704a8f2007-10-30 11:15:43 -07001798 uint32_t ill_dhcpinit; /* IP_DHCPINIT_IFs for ill */
Eric Chengda14ceb2008-12-04 18:16:10 -08001799 void *ill_flownotify_mh; /* Tx flow ctl, mac cb handle */
sowmini384ad172008-04-08 12:13:12 -07001800 uint_t ill_ilm_cnt; /* ilms referencing this ill */
Philip Kirkb127ac42008-11-06 06:47:54 -05001801 uint_t ill_ipallmulti_cnt; /* ip_join_allmulti() calls */
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001802 ilm_t *ill_ipallmulti_ilm;
1803
1804 mblk_t *ill_saved_ire_mp; /* Allocated for each extra IRE */
1805 /* with ire_ill set so they can */
1806 /* survive the ill going down and up. */
1807 kmutex_t ill_saved_ire_lock; /* Protects ill_saved_ire_mp, cnt */
1808 uint_t ill_saved_ire_cnt; /* # entries */
1809 struct arl_ill_common_s *ill_common;
1810 ire_t *ill_ire_multicast; /* IRE_MULTICAST for ill */
1811 clock_t ill_defend_start; /* start of 1 hour period */
1812 uint_t ill_defend_count; /* # of announce/defends per ill */
meeme11c3f42009-01-06 20:16:25 -05001813 /*
1814 * IPMP fields.
1815 */
1816 ipmp_illgrp_t *ill_grp; /* IPMP group information */
Toomas Soome8a06b3d2018-10-15 22:13:16 +03001817 list_node_t ill_actnode; /* next active ill in group */
meeme11c3f42009-01-06 20:16:25 -05001818 list_node_t ill_grpnode; /* next ill in group */
1819 ipif_t *ill_src_ipif; /* source address selection rotor */
1820 ipif_t *ill_move_ipif; /* ipif awaiting move to new ill */
1821 boolean_t ill_nom_cast; /* nominated for mcast/bcast */
1822 uint_t ill_bound_cnt; /* # of data addresses bound to ill */
1823 ipif_t *ill_bound_ipif; /* ipif chain bound to ill */
1824 timeout_id_t ill_refresh_tid; /* ill refresh retry timeout id */
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001825
1826 uint32_t ill_mrouter_cnt; /* mrouter allmulti joins */
Sowmini Varadhan550b6e42010-07-01 17:10:52 -04001827 uint32_t ill_allowed_ips_cnt;
1828 in6_addr_t *ill_allowed_ips;
Ravi Chandra Nallan7f125a52010-07-13 18:17:30 +05301829
1830 /* list of multicast physical addresses joined on this ill */
1831 multiphysaddr_t *ill_mphysaddr_list;
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001832} ill_t;
1833
/*
 * ILL_FREE_OK() means that there are no incoming pointer references
 * to the ill.
 */
#define	ILL_FREE_OK(ill)					\
	((ill)->ill_ire_cnt == 0 && (ill)->ill_ilm_cnt == 0 &&	\
	(ill)->ill_ncec_cnt == 0 && (ill)->ill_nce_cnt == 0)

/*
 * An ipif/ill can be marked down only when the ire and ncec references
 * to that ipif/ill goes to zero. ILL_DOWN_OK() is a necessary condition
 * for the quiescence checks. See comments above IPIF_DOWN_OK for details
 * on why ires and nces are selectively considered for this macro.
 *
 * The argument is parenthesized at each use so that any pointer-valued
 * expression (e.g. &ill_var) may be passed.
 */
#define	ILL_DOWN_OK(ill)					\
	((ill)->ill_ire_cnt == 0 && (ill)->ill_ncec_cnt == 0 &&	\
	(ill)->ill_nce_cnt == 0)

1852/*
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001853 * The following table lists the protection levels of the various members
1854 * of the ill_t. Same notation as that used for ipif_t above is used.
1855 *
1856 * Write Read
1857 *
1858 * ill_ifptr ill_g_lock + s Write once
1859 * ill_rq ipsq Write once
1860 * ill_wq ipsq Write once
1861 *
1862 * ill_error ipsq None
1863 * ill_ipif ill_g_lock + ipsq ill_g_lock OR ipsq
meem8df01f72007-05-30 16:02:35 -07001864 * ill_ipif_up_count ill_lock + ipsq ill_lock OR ipsq
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001865 * ill_max_frag ill_lock ill_lock
1866 * ill_current_frag ill_lock ill_lock
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001867 *
meemb051ecf2006-12-27 21:32:46 -08001868 * ill_name ill_g_lock + ipsq Write once
1869 * ill_name_length ill_g_lock + ipsq Write once
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001870 * ill_ndd_name ipsq Write once
1871 * ill_net_type ipsq Write once
meemb051ecf2006-12-27 21:32:46 -08001872 * ill_ppa ill_g_lock + ipsq Write once
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001873 * ill_sap ipsq + down ill Write once
1874 * ill_sap_length ipsq + down ill Write once
1875 * ill_phys_addr_length ipsq + down ill Write once
1876 *
1877 * ill_bcast_addr_length ipsq ipsq
1878 * ill_mactype ipsq ipsq
1879 * ill_frag_ptr ipsq ipsq
1880 *
1881 * ill_frag_timer_id ill_lock ill_lock
1882 * ill_frag_hash_tbl ipsq up ill
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001883 * ill_ilm ill_mcast_lock(WRITER) ill_mcast_lock(READER)
1884 * ill_global_timer ill_mcast_lock(WRITER) ill_mcast_lock(READER)
1885 * ill_mcast_type ill_mcast_lock(WRITER) ill_mcast_lock(READER)
1886 * ill_mcast_v1_time ill_mcast_lock(WRITER) ill_mcast_lock(READER)
1887 * ill_mcast_v2_time ill_mcast_lock(WRITER) ill_mcast_lock(READER)
1888 * ill_mcast_v1_tset ill_mcast_lock(WRITER) ill_mcast_lock(READER)
1889 * ill_mcast_v2_tset ill_mcast_lock(WRITER) ill_mcast_lock(READER)
1890 * ill_mcast_rv ill_mcast_lock(WRITER) ill_mcast_lock(READER)
1891 * ill_mcast_qi ill_mcast_lock(WRITER) ill_mcast_lock(READER)
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001892 *
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001893 * ill_down_mp ipsq ipsq
meem8df01f72007-05-30 16:02:35 -07001894 * ill_dlpi_deferred ill_lock ill_lock
Thirumalai Srinivasan75718342009-07-07 10:46:23 -07001895 * ill_dlpi_pending ipsq + ill_lock ipsq or ill_lock or
1896 * absence of ipsq writer.
meemb051ecf2006-12-27 21:32:46 -08001897 * ill_phys_addr_mp ipsq + down ill only when ill is up
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001898 * ill_mcast_deferred ill_lock ill_lock
meemb051ecf2006-12-27 21:32:46 -08001899 * ill_phys_addr ipsq + down ill only when ill is up
Sebastien Roy2b24ab62009-09-22 22:04:45 -04001900 * ill_dest_addr_mp ipsq + down ill only when ill is up
1901 * ill_dest_addr ipsq + down ill only when ill is up
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001902 *
1903 * ill_state_flags ill_lock ill_lock
1904 * exclusive bit flags ipsq_t ipsq_t
1905 * shared bit flags ill_lock ill_lock
1906 *
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001907 * ill_muxid ipsq Not atomic
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001908 *
1909 * ill_ipf_gen Not atomic
 * ill_frag_count		atomics			atomics
 * ill_type			ipsq + down ill		only when ill is up
 * ill_dlpi_multicast_state	ill_lock		ill_lock
 * ill_dlpi_fastpath_state	ill_lock		ill_lock
 * ill_dlpi_capab_state		ipsq			ipsq
 * ill_max_hops			ipsq			Not atomic
 *
 * ill_mtu			ill_lock		None
 * ill_mc_mtu			ill_lock		None
 *
 * ill_user_mtu			ipsq + ill_lock		ill_lock
 * ill_reachable_time		ipsq + ill_lock		ill_lock
 * ill_reachable_retrans_time	ipsq + ill_lock		ill_lock
 * ill_max_buf			ipsq + ill_lock		ill_lock
 *
 * Next 2 fields need ill_lock because of the get ioctls. They should not
 * report partially updated results without executing in the ipsq.
 * ill_token			ipsq + ill_lock		ill_lock
 * ill_token_length		ipsq + ill_lock		ill_lock
 * ill_dest_token		ipsq + down ill		only when ill is up
 * ill_xmit_count		ipsq + down ill		write once
 * ill_ip6_mib			ipsq + down ill		only when ill is up
 * ill_icmp6_mib		ipsq + down ill		only when ill is up
 *
 * ill_phyint			ipsq, ill_g_lock, ill_lock	Any of them
 * ill_flags			ill_lock		ill_lock
 * ill_nd_lla_mp		ipsq + down ill		only when ill is up
 * ill_nd_lla			ipsq + down ill		only when ill is up
 * ill_nd_lla_len		ipsq + down ill		only when ill is up
 * ill_phys_addr_pend		ipsq + down ill		only when ill is up
 * ill_ifname_pending_err	ipsq			ipsq
 * ill_avl_byppa		ipsq, ill_g_lock	write once
 *
 * ill_fastpath_list		ill_lock		ill_lock
 * ill_refcnt			ill_lock		ill_lock
 * ill_ire_cnt			ill_lock		ill_lock
 * ill_cv			ill_lock		ill_lock
 * ill_mcast_nces		ill_lock		ill_lock
 * ill_ncec_cnt			ill_lock		ill_lock
 * ill_nce_cnt			ill_lock		ill_lock
 * ill_ilm_cnt			ill_lock		ill_lock
 * ill_src_ipif			ill_g_lock		ill_g_lock
 * ill_trace			ill_lock		ill_lock
 * ill_usesrc_grp_next		ill_g_usesrc_lock	ill_g_usesrc_lock
 * ill_dhcpinit			atomics			atomics
 * ill_flownotify_mh		write once		write once
 * ill_capab_pending_cnt	ipsq			ipsq
 * ill_ipallmulti_cnt		ill_lock		ill_lock
 * ill_ipallmulti_ilm		ill_lock		ill_lock
 * ill_saved_ire_mp		ill_saved_ire_lock	ill_saved_ire_lock
 * ill_saved_ire_cnt		ill_saved_ire_lock	ill_saved_ire_lock
 * ill_arl			???			???
 * ill_ire_multicast		ipsq + quiescent	none
 * ill_bound_ipif		ipsq			ipsq
 * ill_actnode			ipsq + ipmp_lock	ipsq OR ipmp_lock
 * ill_grpnode			ipsq + ill_g_lock	ipsq OR ill_g_lock
 * ill_src_ipif			ill_g_lock		ill_g_lock
 * ill_move_ipif		ipsq			ipsq
 * ill_nom_cast			ipsq			ipsq OR advisory
 * ill_refresh_tid		ill_lock		ill_lock
 * ill_grp (for IPMP ill)	write once		write once
 * ill_grp (for underlying ill)	ipsq + ill_g_lock	ipsq OR ill_g_lock
 * ill_grp_pending		ill_mcast_serializer	ill_mcast_serializer
 * ill_mrouter_cnt		atomics			atomics
 * ill_mphysaddr_list		ill_lock		ill_lock
 *
 * NOTE: It's OK to make heuristic decisions on an underlying interface
 *	 by using IS_UNDER_IPMP() or comparing ill_grp's raw pointer value.
 */
1979
1980/*
1981 * For ioctl restart mechanism see ip_reprocess_ioctl()
1982 */
1983struct ip_ioctl_cmd_s;
1984
1985typedef int (*ifunc_t)(ipif_t *, struct sockaddr_in *, queue_t *, mblk_t *,
1986 struct ip_ioctl_cmd_s *, void *);
1987
1988typedef struct ip_ioctl_cmd_s {
1989 int ipi_cmd;
1990 size_t ipi_copyin_size;
1991 uint_t ipi_flags;
1992 uint_t ipi_cmd_type;
1993 ifunc_t ipi_func;
1994 ifunc_t ipi_func_restart;
1995} ip_ioctl_cmd_t;
1996
/*
 * ipi_cmd_type:
 *
 * IF_CMD		1	old style ifreq cmd
 * LIF_CMD		2	new style lifreq cmd
 * ARP_CMD		3	arpreq cmd
 * XARP_CMD		4	xarpreq cmd
 * MSFILT_CMD		5	multicast source filter cmd
 * MISC_CMD		6	misc cmd (not a more specific one above)
 *
 * Only the first enumerator is given an explicit value; the rest follow
 * sequentially, so the order below must match the table above.
 */

enum { IF_CMD = 1, LIF_CMD, ARP_CMD, XARP_CMD, MSFILT_CMD, MISC_CMD };
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07002009
#define	IPI_DONTCARE	0	/* For ioctl encoded values that don't matter */

/* Flag values in ipi_flags (each is a distinct bit; 0x10 is unused) */
#define	IPI_PRIV	0x1	/* Command requires PRIV_SYS_IP_CONFIG */
#define	IPI_MODOK	0x2	/* Permitted on mod instance of IP */
#define	IPI_WR		0x4	/* Need to grab writer access */
#define	IPI_GET_CMD	0x8	/* branch to mi_copyout on success */
/*	unused		0x10	*/
#define	IPI_NULL_BCONT	0x20	/* ioctl has no data and hence no b_cont */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07002019
2020extern ip_ioctl_cmd_t ip_ndx_ioctl_table[];
2021extern ip_ioctl_cmd_t ip_misc_ioctl_table[];
2022extern int ip_ndx_ioctl_count;
2023extern int ip_misc_ioctl_count;
2024
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07002025/* Passed down by ARP to IP during I_PLINK/I_PUNLINK */
2026typedef struct ipmx_s {
2027 char ipmx_name[LIFNAMSIZ]; /* if name */
2028 uint_t
2029 ipmx_arpdev_stream : 1, /* This is the arp stream */
2030 ipmx_notused : 31;
2031} ipmx_t;
2032
/*
 * State for detecting if a driver supports certain features.
 * Support for DL_ENABMULTI_REQ uses ill_dlpi_multicast_state.
 * Support for DLPI M_DATA fastpath uses ill_dlpi_fastpath_state.
 */
#define	IDS_UNKNOWN	0	/* No DLPI request sent */
#define	IDS_INPROGRESS	1	/* DLPI request sent */
#define	IDS_OK		2	/* DLPI request completed successfully */
#define	IDS_FAILED	3	/* DLPI request failed */
/*
 * Support for DL_CAPABILITY_REQ uses ill_dlpi_capab_state.
 * Enumerators are implicitly numbered from 0 in the order listed.
 */
enum {
	IDCS_UNKNOWN,
	IDCS_PROBE_SENT,
	IDCS_OK,
	IDCS_RESET_SENT,
	IDCS_RENEG,
	IDCS_FAILED
};
2052
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07002053/* Extended NDP Management Structure */
2054typedef struct ipndp_s {
2055 ndgetf_t ip_ndp_getf;
2056 ndsetf_t ip_ndp_setf;
2057 caddr_t ip_ndp_data;
2058 char *ip_ndp_name;
2059} ipndp_t;
2060
/*
 * IXA Notification types
 * Passed as the ixa_notify_type_t argument of an ixa_notify_t callback.
 */
typedef enum {
	IXAN_LSO,	/* LSO capability change */
	IXAN_PMTU,	/* PMTU change */
	IXAN_ZCOPY	/* ZEROCOPY capability change */
} ixa_notify_type_t;
2067
2068typedef uint_t ixa_notify_arg_t;
2069
2070typedef void (*ixa_notify_t)(void *, ip_xmit_attr_t *ixa, ixa_notify_type_t,
2071 ixa_notify_arg_t);
2072
/*
 * Attribute flags that are common to the transmit and receive attributes.
 * They occupy the top four bits; IAF_MASK is the union of all of them.
 */
#define	IAF_IS_IPV4		0x80000000	/* ipsec_*_v4 */
#define	IAF_TRUSTED_ICMP	0x40000000	/* ipsec_*_icmp_loopback */
#define	IAF_NO_LOOP_ZONEID_SET	0x20000000	/* Zone that shouldn't have */
						/* a copy */
#define	IAF_LOOPBACK_COPY	0x10000000	/* For multi and broadcast */

#define	IAF_MASK		0xf0000000	/* Flags that are common */
2083
2084/*
2085 * Transmit side attributes used between the transport protocols and IP as
2086 * well as inside IP. It is also used to cache information in the conn_t i.e.
2087 * replaces conn_ire and the IPsec caching in the conn_t.
2088 */
2089struct ip_xmit_attr_s {
2090 iaflags_t ixa_flags; /* IXAF_*. See below */
2091
2092 uint32_t ixa_free_flags; /* IXA_FREE_*. See below */
2093 uint32_t ixa_refcnt; /* Using atomics */
2094
2095 /*
2096 * Always initialized independently of ixa_flags settings.
2097 * Used by ip_xmit so we keep them up front for cache locality.
2098 */
2099 uint32_t ixa_xmit_hint; /* For ECMP and GLD TX ring fanout */
2100 uint_t ixa_pktlen; /* Always set. For frag and stats */
2101 zoneid_t ixa_zoneid; /* Assumed always set */
2102
2103 /* Always set for conn_ip_output(); might be stale */
2104 /*
2105 * Since TCP keeps the conn_t around past the process going away
2106 * we need to use the "notr" (e.g, ire_refhold_notr) for ixa_ire,
2107 * ixa_nce, and ixa_dce.
2108 */
2109 ire_t *ixa_ire; /* Forwarding table entry */
2110 uint_t ixa_ire_generation;
2111 nce_t *ixa_nce; /* Neighbor cache entry */
2112 dce_t *ixa_dce; /* Destination cache entry */
2113 uint_t ixa_dce_generation;
2114 uint_t ixa_src_generation; /* If IXAF_VERIFY_SOURCE */
2115
2116 uint32_t ixa_src_preferences; /* prefs for src addr select */
2117 uint32_t ixa_pmtu; /* IXAF_VERIFY_PMTU */
2118
2119 /* Set by ULP if IXAF_VERIFY_PMTU; otherwise set by IP */
2120 uint32_t ixa_fragsize;
2121
2122 int8_t ixa_use_min_mtu; /* IXAF_USE_MIN_MTU values */
2123
2124 pfirepostfrag_t ixa_postfragfn; /* Set internally in IP */
2125
2126 in6_addr_t ixa_nexthop_v6; /* IXAF_NEXTHOP_SET */
2127#define ixa_nexthop_v4 V4_PART_OF_V6(ixa_nexthop_v6)
2128
2129 zoneid_t ixa_no_loop_zoneid; /* IXAF_NO_LOOP_ZONEID_SET */
2130
2131 uint_t ixa_scopeid; /* For IPv6 link-locals */
2132
2133 uint_t ixa_broadcast_ttl; /* IXAF_BROACAST_TTL_SET */
2134
2135 uint_t ixa_multicast_ttl; /* Assumed set for multicast */
2136 uint_t ixa_multicast_ifindex; /* Assumed set for multicast */
2137 ipaddr_t ixa_multicast_ifaddr; /* Assumed set for multicast */
2138
2139 int ixa_raw_cksum_offset; /* If IXAF_SET_RAW_CKSUM */
2140
2141 uint32_t ixa_ident; /* For IPv6 fragment header */
2142
Alan Maguire9cd928f2010-05-27 17:29:51 -04002143 uint64_t ixa_conn_id; /* Used by DTrace */
Erik Nordmarkbd670b32009-11-11 11:49:49 -08002144 /*
2145 * Cached LSO information.
2146 */
2147 ill_lso_capab_t ixa_lso_capab; /* Valid when IXAF_LSO_CAPAB */
2148
2149 uint64_t ixa_ipsec_policy_gen; /* Generation from iph_gen */
2150 /*
2151 * The following IPsec fields are only initialized when
2152 * IXAF_IPSEC_SECURE is set. Otherwise they contain garbage.
2153 */
2154 ipsec_latch_t *ixa_ipsec_latch; /* Just the ids */
Toomas Soome8a06b3d2018-10-15 22:13:16 +03002155 struct ipsa_s *ixa_ipsec_ah_sa; /* Hard reference SA for AH */
2156 struct ipsa_s *ixa_ipsec_esp_sa; /* Hard reference SA for ESP */
2157 struct ipsec_policy_s *ixa_ipsec_policy; /* why are we here? */
Erik Nordmarkbd670b32009-11-11 11:49:49 -08002158 struct ipsec_action_s *ixa_ipsec_action; /* For reflected packets */
2159 ipsa_ref_t ixa_ipsec_ref[2]; /* Soft reference to SA */
2160 /* 0: ESP, 1: AH */
2161
2162 /*
2163 * The selectors here are potentially different than the SPD rule's
2164 * selectors, and we need to have both available for IKEv2.
2165 *
2166 * NOTE: "Source" and "Dest" are w.r.t. outbound datagrams. Ports can
2167 * be zero, and the protocol number is needed to make the ports
2168 * significant.
2169 */
2170 uint16_t ixa_ipsec_src_port; /* Source port number of d-gram. */
2171 uint16_t ixa_ipsec_dst_port; /* Destination port number of d-gram. */
2172 uint8_t ixa_ipsec_icmp_type; /* ICMP type of d-gram */
2173 uint8_t ixa_ipsec_icmp_code; /* ICMP code of d-gram */
2174
2175 sa_family_t ixa_ipsec_inaf; /* Inner address family */
2176#define IXA_MAX_ADDRLEN 4 /* Max addr len. (in 32-bit words) */
2177 uint32_t ixa_ipsec_insrc[IXA_MAX_ADDRLEN]; /* Inner src address */
2178 uint32_t ixa_ipsec_indst[IXA_MAX_ADDRLEN]; /* Inner dest address */
2179 uint8_t ixa_ipsec_insrcpfx; /* Inner source prefix */
2180 uint8_t ixa_ipsec_indstpfx; /* Inner destination prefix */
2181
2182 uint8_t ixa_ipsec_proto; /* IP protocol number for d-gram. */
2183
2184 /* Always initialized independently of ixa_flags settings */
2185 uint_t ixa_ifindex; /* Assumed always set */
2186 uint16_t ixa_ip_hdr_length; /* Points to ULP header */
2187 uint8_t ixa_protocol; /* Protocol number for ULP cksum */
2188 ts_label_t *ixa_tsl; /* Always set. NULL if not TX */
2189 ip_stack_t *ixa_ipst; /* Always set */
2190 uint32_t ixa_extra_ident; /* Set if LSO */
2191 cred_t *ixa_cred; /* For getpeerucred */
2192 pid_t ixa_cpid; /* For getpeerucred */
2193
2194#ifdef DEBUG
2195 kthread_t *ixa_curthread; /* For serialization assert */
2196#endif
2197 squeue_t *ixa_sqp; /* Set from conn_sqp as a hint */
2198 uintptr_t ixa_cookie; /* cookie to use for tx flow control */
2199
2200 /*
2201 * Must be set by ULP if any of IXAF_VERIFY_LSO, IXAF_VERIFY_PMTU,
2202 * or IXAF_VERIFY_ZCOPY is set.
2203 */
2204 ixa_notify_t ixa_notify; /* Registered upcall notify function */
2205 void *ixa_notify_cookie; /* ULP cookie for ixa_notify */
Jerry Jelinek7c6d7022012-02-13 19:50:26 +00002206
2207 uint_t ixa_tcpcleanup; /* Used by conn_ixa_cleanup */
Erik Nordmarkbd670b32009-11-11 11:49:49 -08002208};
2209
/*
 * Flags to indicate which transmit attributes are set.
 * Split into "xxx_SET" ones which indicate that the "xxx" field is set, and
 * single flags.
 */
#define	IXAF_REACH_CONF		0x00000001	/* Reachability confirmation */
#define	IXAF_BROADCAST_TTL_SET	0x00000002	/* ixa_broadcast_ttl valid */
#define	IXAF_SET_SOURCE		0x00000004	/* Replace if broadcast */
#define	IXAF_USE_MIN_MTU	0x00000008	/* IPV6_USE_MIN_MTU */

#define	IXAF_DONTFRAG		0x00000010	/* IP*_DONTFRAG */
#define	IXAF_VERIFY_PMTU	0x00000020	/* ixa_pmtu/ixa_fragsize set */
2222#define IXAF_PMTU_DISCOVERY 0x00000040 /* Crea