blob: a0157d3c488d0c2f6e67f2d2d9c8e2bd68fad1a1 [file] [log] [blame]
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2018 Joyent, Inc.
 */
/* Copyright (c) 1990 Mentat Inc. */
27
28#include <sys/types.h>
29#include <sys/stream.h>
30#include <sys/strsubr.h>
31#include <sys/dlpi.h>
32#include <sys/strsun.h>
33#include <sys/zone.h>
34#include <sys/ddi.h>
35#include <sys/sunddi.h>
36#include <sys/cmn_err.h>
37#include <sys/debug.h>
38#include <sys/atomic.h>
39
40#include <sys/systm.h>
41#include <sys/param.h>
42#include <sys/kmem.h>
43#include <sys/sdt.h>
44#include <sys/socket.h>
45#include <sys/mac.h>
46#include <net/if.h>
47#include <net/if_arp.h>
48#include <net/route.h>
49#include <sys/sockio.h>
50#include <netinet/in.h>
51#include <net/if_dl.h>
52
53#include <inet/common.h>
54#include <inet/mi.h>
55#include <inet/mib2.h>
56#include <inet/nd.h>
57#include <inet/arp.h>
58#include <inet/snmpcom.h>
59#include <inet/kstatcom.h>
60
61#include <netinet/igmp_var.h>
62#include <netinet/ip6.h>
63#include <netinet/icmp6.h>
64#include <netinet/sctp.h>
65
66#include <inet/ip.h>
67#include <inet/ip_impl.h>
68#include <inet/ip6.h>
69#include <inet/ip6_asp.h>
70#include <inet/tcp.h>
71#include <inet/ip_multi.h>
72#include <inet/ip_if.h>
73#include <inet/ip_ire.h>
74#include <inet/ip_ftable.h>
75#include <inet/ip_rts.h>
76#include <inet/optcom.h>
77#include <inet/ip_ndp.h>
78#include <inet/ip_listutils.h>
79#include <netinet/igmp.h>
80#include <netinet/ip_mroute.h>
81#include <inet/ipp_common.h>
82
83#include <net/pfkeyv2.h>
84#include <inet/sadb.h>
85#include <inet/ipsec_impl.h>
86#include <inet/ipdrop.h>
87#include <inet/ip_netinfo.h>
88
89#include <sys/pattr.h>
90#include <inet/ipclassifier.h>
91#include <inet/sctp_ip.h>
92#include <inet/sctp/sctp_impl.h>
93#include <inet/udp_impl.h>
94#include <sys/sunddi.h>
95
96#include <sys/tsol/label.h>
97#include <sys/tsol/tnet.h>
98
Erik Nordmarkb36a5612009-11-19 11:04:40 -080099#include <sys/clock_impl.h> /* For LBOLT_FASTPATH{,64} */
100
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800101#ifdef DEBUG
102extern boolean_t skip_sctp_cksum;
103#endif
104
105static int ip_verify_nce(mblk_t *, ip_xmit_attr_t *);
106static int ip_verify_dce(mblk_t *, ip_xmit_attr_t *);
107static boolean_t ip_verify_lso(ill_t *, ip_xmit_attr_t *);
108static boolean_t ip_verify_zcopy(ill_t *, ip_xmit_attr_t *);
109static void ip_output_simple_broadcast(ip_xmit_attr_t *, mblk_t *);
110
/*
 * There are two types of output functions for IP used for different
 * purposes:
 * - ip_output_simple() is used when sending ICMP errors, TCP resets, etc.,
 *   when there is no context in the form of a conn_t. However, there is an
 *   ip_xmit_attr_t that the callers use to influence interface selection
 *   (needed for ICMP echo as well as IPv6 link-locals) and IPsec.
 *
 * - conn_ip_output() is used when sending packets with a conn_t and
 *   ip_set_destination has been called to cache information. In that case
 *   various socket options are recorded in the ip_xmit_attr_t and should
 *   be taken into account.
 */
124
125/*
126 * The caller *must* have called conn_connect() or ip_attr_connect()
127 * before calling conn_ip_output(). The caller needs to redo that each time
128 * the destination IP address or port changes, as well as each time there is
129 * a change to any socket option that would modify how packets are routed out
130 * of the box (e.g., SO_DONTROUTE, IP_NEXTHOP, IP_BOUND_IF).
131 *
132 * The ULP caller has to serialize the use of a single ip_xmit_attr_t.
133 * We assert for that here.
134 */
135int
136conn_ip_output(mblk_t *mp, ip_xmit_attr_t *ixa)
137{
138 iaflags_t ixaflags = ixa->ixa_flags;
139 ire_t *ire;
140 nce_t *nce;
141 dce_t *dce;
142 ill_t *ill;
143 ip_stack_t *ipst = ixa->ixa_ipst;
144 int error;
145
146 /* We defer ipIfStatsHCOutRequests until an error or we have an ill */
147
148 ASSERT(ixa->ixa_ire != NULL);
149 /* Note there is no ixa_nce when reject and blackhole routes */
150 ASSERT(ixa->ixa_dce != NULL); /* Could be default dce */
151
152#ifdef DEBUG
153 ASSERT(ixa->ixa_curthread == NULL);
154 ixa->ixa_curthread = curthread;
155#endif
156
157 /*
158 * Even on labeled systems we can have a NULL ixa_tsl e.g.,
159 * for IGMP/MLD traffic.
160 */
161
162 ire = ixa->ixa_ire;
163
164 /*
165 * If the ULP says the (old) IRE resulted in reachability we
166 * record this before determine whether to use a new IRE.
167 * No locking for performance reasons.
168 */
169 if (ixaflags & IXAF_REACH_CONF)
170 ire->ire_badcnt = 0;
171
172 /*
173 * Has routing changed since we cached the results of the lookup?
174 *
175 * This check captures all of:
176 * - the cached ire being deleted (by means of the special
177 * IRE_GENERATION_CONDEMNED)
178 * - A potentially better ire being added (ire_generation being
179 * increased)
180 * - A deletion of the nexthop ire that was used when we did the
181 * lookup.
182 * - An addition of a potentially better nexthop ire.
183 * The last two are handled by walking and increasing the generation
184 * number on all dependant IREs in ire_flush_cache().
185 *
186 * The check also handles all cases of RTF_REJECT and RTF_BLACKHOLE
187 * since we ensure that each time we set ixa_ire to such an IRE we
188 * make sure the ixa_ire_generation does not match (by using
189 * IRE_GENERATION_VERIFY).
190 */
191 if (ire->ire_generation != ixa->ixa_ire_generation) {
192 error = ip_verify_ire(mp, ixa);
193 if (error != 0) {
194 ip_drop_output("ipIfStatsOutDiscards - verify ire",
195 mp, NULL);
196 goto drop;
197 }
198 ire = ixa->ixa_ire;
199 ASSERT(ire != NULL);
200 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
201#ifdef DEBUG
202 ASSERT(ixa->ixa_curthread == curthread);
203 ixa->ixa_curthread = NULL;
204#endif
205 ire->ire_ob_pkt_count++;
206 /* ixa_dce might be condemned; use default one */
207 return ((ire->ire_sendfn)(ire, mp, mp->b_rptr, ixa,
208 &ipst->ips_dce_default->dce_ident));
209 }
210 /*
211 * If the ncec changed then ip_verify_ire already set
212 * ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
213 * so we can recheck the interface mtu.
214 */
215
216 /*
217 * Note that ire->ire_generation could already have changed.
218 * We catch that next time we send a packet.
219 */
220 }
221
222 /*
223 * No need to lock access to ixa_nce since the ip_xmit_attr usage
224 * is single threaded.
225 */
226 ASSERT(ixa->ixa_nce != NULL);
227 nce = ixa->ixa_nce;
228 if (nce->nce_is_condemned) {
229 error = ip_verify_nce(mp, ixa);
230 /*
231 * In case ZEROCOPY capability become not available, we
232 * copy the message and free the original one. We might
233 * be copying more data than needed but it doesn't hurt
234 * since such change rarely happens.
235 */
236 switch (error) {
237 case 0:
238 break;
239 case ENOTSUP: { /* ZEROCOPY */
240 mblk_t *nmp;
241
242 if ((nmp = copymsg(mp)) != NULL) {
243 freemsg(mp);
244 mp = nmp;
245
246 break;
247 }
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800248 }
Toomas Soomea9f62b12018-03-27 08:17:48 +0300249 /* FALLTHROUGH */
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800250 default:
251 ip_drop_output("ipIfStatsOutDiscards - verify nce",
252 mp, NULL);
253 goto drop;
254 }
255 ire = ixa->ixa_ire;
256 ASSERT(ire != NULL);
257 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
258#ifdef DEBUG
259 ASSERT(ixa->ixa_curthread == curthread);
260 ixa->ixa_curthread = NULL;
261#endif
262 ire->ire_ob_pkt_count++;
263 /* ixa_dce might be condemned; use default one */
264 return ((ire->ire_sendfn)(ire, mp, mp->b_rptr,
265 ixa, &ipst->ips_dce_default->dce_ident));
266 }
267 ASSERT(ixa->ixa_nce != NULL);
268 nce = ixa->ixa_nce;
269
270 /*
271 * Note that some other event could already have made
272 * the new nce condemned. We catch that next time we
273 * try to send a packet.
274 */
275 }
276 /*
277 * If there is no per-destination dce_t then we have a reference to
278 * the default dce_t (which merely contains the dce_ipid).
279 * The generation check captures both the introduction of a
280 * per-destination dce_t (e.g., due to ICMP packet too big) and
281 * any change to the per-destination dce (including it becoming
282 * condemned by use of the special DCE_GENERATION_CONDEMNED).
283 */
284 dce = ixa->ixa_dce;
285
286 /*
287 * To avoid a periodic timer to increase the path MTU we
288 * look at dce_last_change_time each time we send a packet.
289 */
Erik Nordmarkb36a5612009-11-19 11:04:40 -0800290 if (dce->dce_flags & DCEF_PMTU) {
291 int64_t now = LBOLT_FASTPATH64;
292
293 if ((TICK_TO_SEC(now) - dce->dce_last_change_time >
294 ipst->ips_ip_pathmtu_interval)) {
295 /*
296 * Older than 20 minutes. Drop the path MTU information.
297 * Since the path MTU changes as a result of this,
298 * twiddle ixa_dce_generation to make us go through the
299 * dce verification code in conn_ip_output.
300 */
301 mutex_enter(&dce->dce_lock);
302 dce->dce_flags &= ~(DCEF_PMTU|DCEF_TOO_SMALL_PMTU);
303 dce->dce_last_change_time = TICK_TO_SEC(now);
304 mutex_exit(&dce->dce_lock);
305 dce_increment_generation(dce);
306 }
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800307 }
308
309 if (dce->dce_generation != ixa->ixa_dce_generation) {
310 error = ip_verify_dce(mp, ixa);
311 if (error != 0) {
312 ip_drop_output("ipIfStatsOutDiscards - verify dce",
313 mp, NULL);
314 goto drop;
315 }
316 dce = ixa->ixa_dce;
317
318 /*
319 * Note that some other event could already have made the
320 * new dce's generation number change.
321 * We catch that next time we try to send a packet.
322 */
323 }
324
325 ill = nce->nce_ill;
326
327 /*
328 * An initial ixa_fragsize was set in ip_set_destination
329 * and we update it if any routing changes above.
330 * A change to ill_mtu with ifconfig will increase all dce_generation
Erik Nordmark1eee1702010-08-16 15:30:54 -0700331 * so that we will detect that with the generation check. Ditto for
332 * ill_mc_mtu.
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800333 */
334
335 /*
336 * Caller needs to make sure IXAF_VERIFY_SRC is not set if
337 * conn_unspec_src.
338 */
339 if ((ixaflags & IXAF_VERIFY_SOURCE) &&
340 ixa->ixa_src_generation != ipst->ips_src_generation) {
341 /* Check if the IP source is still assigned to the host. */
342 uint_t gen;
343
344 if (!ip_verify_src(mp, ixa, &gen)) {
345 /* Don't send a packet with a source that isn't ours */
346 error = EADDRNOTAVAIL;
347 ip_drop_output("ipIfStatsOutDiscards - invalid src",
348 mp, NULL);
349 goto drop;
350 }
351 /* The source is still valid - update the generation number */
352 ixa->ixa_src_generation = gen;
353 }
354
355 /*
356 * We don't have an IRE when we fragment, hence ire_ob_pkt_count
357 * can only count the use prior to fragmentation. However the MIB
358 * counters on the ill will be incremented in post fragmentation.
359 */
360 ire->ire_ob_pkt_count++;
361 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests);
362
363 /*
364 * Based on ire_type and ire_flags call one of:
365 * ire_send_local_v* - for IRE_LOCAL and IRE_LOOPBACK
366 * ire_send_multirt_v* - if RTF_MULTIRT
367 * ire_send_noroute_v* - if RTF_REJECT or RTF_BLACHOLE
368 * ire_send_multicast_v* - for IRE_MULTICAST
369 * ire_send_broadcast_v4 - for IRE_BROADCAST
370 * ire_send_wire_v* - for the rest.
371 */
372#ifdef DEBUG
373 ASSERT(ixa->ixa_curthread == curthread);
374 ixa->ixa_curthread = NULL;
375#endif
376 return ((ire->ire_sendfn)(ire, mp, mp->b_rptr, ixa, &dce->dce_ident));
377
378drop:
379 if (ixaflags & IXAF_IS_IPV4) {
380 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
381 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
382 } else {
383 BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsHCOutRequests);
384 BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards);
385 }
386 freemsg(mp);
387#ifdef DEBUG
388 ASSERT(ixa->ixa_curthread == curthread);
389 ixa->ixa_curthread = NULL;
390#endif
391 return (error);
392}
393
394/*
395 * Handle both IPv4 and IPv6. Sets the generation number
396 * to allow the caller to know when to call us again.
397 * Returns true if the source address in the packet is a valid source.
398 * We handle callers which try to send with a zero address (since we only
399 * get here if UNSPEC_SRC is not set).
400 */
401boolean_t
402ip_verify_src(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp)
403{
404 ip_stack_t *ipst = ixa->ixa_ipst;
405
406 /*
407 * Need to grab the generation number before we check to
408 * avoid a race with a change to the set of local addresses.
409 * No lock needed since the thread which updates the set of local
410 * addresses use ipif/ill locks and exit those (hence a store memory
411 * barrier) before doing the atomic increase of ips_src_generation.
412 */
413 if (generationp != NULL)
414 *generationp = ipst->ips_src_generation;
415
416 if (ixa->ixa_flags & IXAF_IS_IPV4) {
417 ipha_t *ipha = (ipha_t *)mp->b_rptr;
418
419 if (ipha->ipha_src == INADDR_ANY)
420 return (B_FALSE);
421
422 return (ip_laddr_verify_v4(ipha->ipha_src, ixa->ixa_zoneid,
423 ipst, B_FALSE) != IPVL_BAD);
424 } else {
425 ip6_t *ip6h = (ip6_t *)mp->b_rptr;
426 uint_t scopeid;
427
428 if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src))
429 return (B_FALSE);
430
431 if (ixa->ixa_flags & IXAF_SCOPEID_SET)
432 scopeid = ixa->ixa_scopeid;
433 else
434 scopeid = 0;
435
436 return (ip_laddr_verify_v6(&ip6h->ip6_src, ixa->ixa_zoneid,
437 ipst, B_FALSE, scopeid) != IPVL_BAD);
438 }
439}
440
441/*
442 * Handle both IPv4 and IPv6. Reverify/recalculate the IRE to use.
443 */
444int
445ip_verify_ire(mblk_t *mp, ip_xmit_attr_t *ixa)
446{
447 uint_t gen;
448 ire_t *ire;
449 nce_t *nce;
450 int error;
451 boolean_t multirt = B_FALSE;
452
453 /*
454 * Redo ip_select_route.
455 * Need to grab generation number as part of the lookup to
456 * avoid race.
457 */
458 error = 0;
459 ire = ip_select_route_pkt(mp, ixa, &gen, &error, &multirt);
460 ASSERT(ire != NULL); /* IRE_NOROUTE if none found */
461 if (error != 0) {
462 ire_refrele(ire);
463 return (error);
464 }
465
466 if (ixa->ixa_ire != NULL)
467 ire_refrele_notr(ixa->ixa_ire);
468#ifdef DEBUG
469 ire_refhold_notr(ire);
470 ire_refrele(ire);
471#endif
472 ixa->ixa_ire = ire;
473 ixa->ixa_ire_generation = gen;
474 if (multirt) {
475 if (ixa->ixa_flags & IXAF_IS_IPV4)
476 ixa->ixa_postfragfn = ip_postfrag_multirt_v4;
477 else
478 ixa->ixa_postfragfn = ip_postfrag_multirt_v6;
479 ixa->ixa_flags |= IXAF_MULTIRT_MULTICAST;
480 } else {
481 ixa->ixa_postfragfn = ire->ire_postfragfn;
482 ixa->ixa_flags &= ~IXAF_MULTIRT_MULTICAST;
483 }
484
485 /*
486 * Don't look for an nce for reject or blackhole.
487 * They have ire_generation set to IRE_GENERATION_VERIFY which
488 * makes conn_ip_output avoid references to ixa_nce.
489 */
490 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
491 ASSERT(ixa->ixa_ire_generation == IRE_GENERATION_VERIFY);
492 ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
493 return (0);
494 }
495
496 /* The NCE could now be different */
497 nce = ire_to_nce_pkt(ire, mp);
498 if (nce == NULL) {
499 /*
500 * Allocation failure. Make sure we redo ire/nce selection
501 * next time we send.
502 */
503 ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
504 ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
505 return (ENOBUFS);
506 }
507 if (nce == ixa->ixa_nce) {
508 /* No change */
509 nce_refrele(nce);
510 return (0);
511 }
512
513 /*
514 * Since the path MTU might change as a result of this
515 * route change, we twiddle ixa_dce_generation to
516 * make conn_ip_output go through the ip_verify_dce code.
517 */
518 ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
519
520 if (ixa->ixa_nce != NULL)
521 nce_refrele(ixa->ixa_nce);
522 ixa->ixa_nce = nce;
523 return (0);
524}
525
526/*
527 * Handle both IPv4 and IPv6. Reverify/recalculate the NCE to use.
528 */
529static int
530ip_verify_nce(mblk_t *mp, ip_xmit_attr_t *ixa)
531{
532 ire_t *ire = ixa->ixa_ire;
533 nce_t *nce;
534 int error = 0;
535 ipha_t *ipha = NULL;
536 ip6_t *ip6h = NULL;
537
538 if (ire->ire_ipversion == IPV4_VERSION)
539 ipha = (ipha_t *)mp->b_rptr;
540 else
541 ip6h = (ip6_t *)mp->b_rptr;
542
543 nce = ire_handle_condemned_nce(ixa->ixa_nce, ire, ipha, ip6h, B_TRUE);
544 if (nce == NULL) {
545 /* Try to find a better ire */
546 return (ip_verify_ire(mp, ixa));
547 }
548
549 /*
550 * The hardware offloading capabilities, for example LSO, of the
551 * interface might have changed, so do sanity verification here.
552 */
553 if (ixa->ixa_flags & IXAF_VERIFY_LSO) {
554 if (!ip_verify_lso(nce->nce_ill, ixa)) {
555 ASSERT(ixa->ixa_notify != NULL);
556 ixa->ixa_notify(ixa->ixa_notify_cookie, ixa,
557 IXAN_LSO, 0);
558 error = ENOTSUP;
559 }
560 }
561
562 /*
563 * Verify ZEROCOPY capability of underlying ill. Notify the ULP with
564 * any ZEROCOPY changes. In case ZEROCOPY capability is not available
565 * any more, return error so that conn_ip_output() can take care of
566 * the ZEROCOPY message properly. It's safe to continue send the
567 * message when ZEROCOPY newly become available.
568 */
569 if (ixa->ixa_flags & IXAF_VERIFY_ZCOPY) {
570 if (!ip_verify_zcopy(nce->nce_ill, ixa)) {
571 ASSERT(ixa->ixa_notify != NULL);
572 ixa->ixa_notify(ixa->ixa_notify_cookie, ixa,
573 IXAN_ZCOPY, 0);
574 if ((ixa->ixa_flags & IXAF_ZCOPY_CAPAB) == 0)
575 error = ENOTSUP;
576 }
577 }
578
579 /*
580 * Since the path MTU might change as a result of this
581 * change, we twiddle ixa_dce_generation to
582 * make conn_ip_output go through the ip_verify_dce code.
583 */
584 ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
585
586 nce_refrele(ixa->ixa_nce);
587 ixa->ixa_nce = nce;
588 return (error);
589}
590
591/*
592 * Handle both IPv4 and IPv6. Reverify/recalculate the DCE to use.
593 */
594static int
595ip_verify_dce(mblk_t *mp, ip_xmit_attr_t *ixa)
596{
597 dce_t *dce;
598 uint_t gen;
599 uint_t pmtu;
600
601 dce = dce_lookup_pkt(mp, ixa, &gen);
602 ASSERT(dce != NULL);
603
604 dce_refrele_notr(ixa->ixa_dce);
605#ifdef DEBUG
606 dce_refhold_notr(dce);
607 dce_refrele(dce);
608#endif
609 ixa->ixa_dce = dce;
610 ixa->ixa_dce_generation = gen;
611
612 /* Extract the (path) mtu from the dce, ncec_ill etc */
613 pmtu = ip_get_pmtu(ixa);
614
615 /*
616 * Tell ULP about PMTU changes - increase or decrease - by returning
617 * an error if IXAF_VERIFY_PMTU is set. In such case, ULP should update
618 * both ixa_pmtu and ixa_fragsize appropriately.
619 *
620 * If ULP doesn't set that flag then we need to update ixa_fragsize
621 * since routing could have changed the ill after after ixa_fragsize
622 * was set previously in the conn_ip_output path or in
623 * ip_set_destination.
624 *
625 * In case of LSO, ixa_fragsize might be greater than ixa_pmtu.
626 *
627 * In the case of a path MTU increase we send the packet after the
628 * notify to the ULP.
629 */
630 if (ixa->ixa_flags & IXAF_VERIFY_PMTU) {
631 if (ixa->ixa_pmtu != pmtu) {
632 uint_t oldmtu = ixa->ixa_pmtu;
633
634 DTRACE_PROBE2(verify_pmtu, uint32_t, pmtu,
635 uint32_t, ixa->ixa_pmtu);
636 ASSERT(ixa->ixa_notify != NULL);
637 ixa->ixa_notify(ixa->ixa_notify_cookie, ixa,
638 IXAN_PMTU, pmtu);
639 if (pmtu < oldmtu)
640 return (EMSGSIZE);
641 }
642 } else {
643 ixa->ixa_fragsize = pmtu;
644 }
645 return (0);
646}
647
648/*
649 * Verify LSO usability. Keep the return value simple to indicate whether
650 * the LSO capability has changed. Handle both IPv4 and IPv6.
651 */
652static boolean_t
653ip_verify_lso(ill_t *ill, ip_xmit_attr_t *ixa)
654{
655 ill_lso_capab_t *lsoc = &ixa->ixa_lso_capab;
656 ill_lso_capab_t *new_lsoc = ill->ill_lso_capab;
657
658 if (ixa->ixa_flags & IXAF_LSO_CAPAB) {
659 /*
660 * Not unsable any more.
661 */
662 if ((ixa->ixa_flags & IXAF_IPSEC_SECURE) ||
663 (ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) ||
664 (ixa->ixa_ire->ire_flags & RTF_MULTIRT) ||
665 ((ixa->ixa_flags & IXAF_IS_IPV4) ?
666 !ILL_LSO_TCP_IPV4_USABLE(ill) :
667 !ILL_LSO_TCP_IPV6_USABLE(ill))) {
668 ixa->ixa_flags &= ~IXAF_LSO_CAPAB;
669
670 return (B_FALSE);
671 }
672
673 /*
674 * Capability has changed, refresh the copy in ixa.
675 */
Robert Mustacchi62366fb2020-04-01 15:30:20 +0000676 if (lsoc->ill_lso_max_tcpv4 != new_lsoc->ill_lso_max_tcpv4 ||
677 lsoc->ill_lso_max_tcpv6 != new_lsoc->ill_lso_max_tcpv6) {
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800678 *lsoc = *new_lsoc;
679
680 return (B_FALSE);
681 }
682 } else { /* Was not usable */
683 if (!(ixa->ixa_flags & IXAF_IPSEC_SECURE) &&
684 !(ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) &&
685 !(ixa->ixa_ire->ire_flags & RTF_MULTIRT) &&
686 ((ixa->ixa_flags & IXAF_IS_IPV4) ?
687 ILL_LSO_TCP_IPV4_USABLE(ill) :
688 ILL_LSO_TCP_IPV6_USABLE(ill))) {
689 *lsoc = *new_lsoc;
690 ixa->ixa_flags |= IXAF_LSO_CAPAB;
691
692 return (B_FALSE);
693 }
694 }
695
696 return (B_TRUE);
697}
698
699/*
700 * Verify ZEROCOPY usability. Keep the return value simple to indicate whether
701 * the ZEROCOPY capability has changed. Handle both IPv4 and IPv6.
702 */
703static boolean_t
704ip_verify_zcopy(ill_t *ill, ip_xmit_attr_t *ixa)
705{
706 if (ixa->ixa_flags & IXAF_ZCOPY_CAPAB) {
707 /*
708 * Not unsable any more.
709 */
710 if ((ixa->ixa_flags & IXAF_IPSEC_SECURE) ||
711 (ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) ||
712 (ixa->ixa_ire->ire_flags & RTF_MULTIRT) ||
713 !ILL_ZCOPY_USABLE(ill)) {
714 ixa->ixa_flags &= ~IXAF_ZCOPY_CAPAB;
715
716 return (B_FALSE);
717 }
718 } else { /* Was not usable */
719 if (!(ixa->ixa_flags & IXAF_IPSEC_SECURE) &&
720 !(ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) &&
721 !(ixa->ixa_ire->ire_flags & RTF_MULTIRT) &&
722 ILL_ZCOPY_USABLE(ill)) {
723 ixa->ixa_flags |= IXAF_ZCOPY_CAPAB;
724
725 return (B_FALSE);
726 }
727 }
728
729 return (B_TRUE);
730}
731
732
733/*
734 * When there is no conn_t context, this will send a packet.
735 * The caller must *not* have called conn_connect() or ip_attr_connect()
736 * before calling ip_output_simple().
737 * Handles IPv4 and IPv6. Returns zero or an errno such as ENETUNREACH.
738 * Honors IXAF_SET_SOURCE.
739 *
740 * We acquire the ire and after calling ire_sendfn we release
741 * the hold on the ire. Ditto for the nce and dce.
742 *
743 * This assumes that the caller has set the following in ip_xmit_attr_t:
744 * ixa_tsl, ixa_zoneid, and ixa_ipst must always be set.
745 * If ixa_ifindex is non-zero it means send out that ill. (If it is
746 * an upper IPMP ill we load balance across the group; if a lower we send
747 * on that lower ill without load balancing.)
748 * IXAF_IS_IPV4 must be set correctly.
749 * If IXAF_IPSEC_SECURE is set then the ixa_ipsec_* fields must be set.
750 * If IXAF_NO_IPSEC is set we'd skip IPsec policy lookup.
751 * If neither of those two are set we do an IPsec policy lookup.
752 *
753 * We handle setting things like
754 * ixa_pktlen
755 * ixa_ip_hdr_length
756 * ixa->ixa_protocol
757 *
758 * The caller may set ixa_xmit_hint, which is used for ECMP selection and
759 * transmit ring selecting in GLD.
760 *
761 * The caller must do an ixa_cleanup() to release any IPsec references
762 * after we return.
763 */
764int
765ip_output_simple(mblk_t *mp, ip_xmit_attr_t *ixa)
766{
767 ts_label_t *effective_tsl = NULL;
768 int err;
769
770 ASSERT(ixa->ixa_ipst != NULL);
771
772 if (is_system_labeled()) {
773 ip_stack_t *ipst = ixa->ixa_ipst;
774
775 if (ixa->ixa_flags & IXAF_IS_IPV4) {
776 err = tsol_check_label_v4(ixa->ixa_tsl, ixa->ixa_zoneid,
777 &mp, CONN_MAC_DEFAULT, B_FALSE, ixa->ixa_ipst,
778 &effective_tsl);
779 } else {
780 err = tsol_check_label_v6(ixa->ixa_tsl, ixa->ixa_zoneid,
781 &mp, CONN_MAC_DEFAULT, B_FALSE, ixa->ixa_ipst,
782 &effective_tsl);
783 }
784 if (err != 0) {
785 ip2dbg(("tsol_check: label check failed (%d)\n", err));
786 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
787 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
788 ip_drop_output("tsol_check_label", mp, NULL);
789 freemsg(mp);
790 return (err);
791 }
792 if (effective_tsl != NULL) {
793 /* Update the label */
794 ip_xmit_attr_replace_tsl(ixa, effective_tsl);
795 }
796 }
797
798 if (ixa->ixa_flags & IXAF_IS_IPV4)
799 return (ip_output_simple_v4(mp, ixa));
800 else
801 return (ip_output_simple_v6(mp, ixa));
802}
803
804int
805ip_output_simple_v4(mblk_t *mp, ip_xmit_attr_t *ixa)
806{
807 ipha_t *ipha;
808 ipaddr_t firsthop; /* In IP header */
809 ipaddr_t dst; /* End of source route, or ipha_dst if none */
810 ire_t *ire;
811 ipaddr_t setsrc; /* RTF_SETSRC */
812 int error;
813 ill_t *ill = NULL;
814 dce_t *dce = NULL;
815 nce_t *nce;
816 iaflags_t ixaflags = ixa->ixa_flags;
817 ip_stack_t *ipst = ixa->ixa_ipst;
818 boolean_t repeat = B_FALSE;
819 boolean_t multirt = B_FALSE;
Rafael Vanonid3d50732009-11-13 01:32:32 -0800820 int64_t now;
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800821
822 ipha = (ipha_t *)mp->b_rptr;
823 ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION);
824
825 /*
826 * Even on labeled systems we can have a NULL ixa_tsl e.g.,
827 * for IGMP/MLD traffic.
828 */
829
830 /* Caller already set flags */
831 ASSERT(ixa->ixa_flags & IXAF_IS_IPV4);
832
833 ASSERT(ixa->ixa_nce == NULL);
834
835 ixa->ixa_pktlen = ntohs(ipha->ipha_length);
836 ASSERT(ixa->ixa_pktlen == msgdsize(mp));
837 ixa->ixa_ip_hdr_length = IPH_HDR_LENGTH(ipha);
838 ixa->ixa_protocol = ipha->ipha_protocol;
839
840 /*
841 * Assumes that source routed packets have already been massaged by
842 * the ULP (ip_massage_options) and as a result ipha_dst is the next
843 * hop in the source route. The final destination is used for IPsec
844 * policy and DCE lookup.
845 */
846 firsthop = ipha->ipha_dst;
847 dst = ip_get_dst(ipha);
848
849repeat_ire:
850 error = 0;
851 setsrc = INADDR_ANY;
Sowmini Varadhan44b099c2010-02-17 22:59:58 -0500852 ire = ip_select_route_v4(firsthop, ipha->ipha_src, ixa, NULL,
853 &setsrc, &error, &multirt);
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800854 ASSERT(ire != NULL); /* IRE_NOROUTE if none found */
855 if (error != 0) {
856 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
857 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
858 ip_drop_output("ipIfStatsOutDiscards - select route", mp, NULL);
859 freemsg(mp);
860 goto done;
861 }
862
863 if (ire->ire_flags & (RTF_BLACKHOLE|RTF_REJECT)) {
864 /* ire_ill might be NULL hence need to skip some code */
865 if (ixaflags & IXAF_SET_SOURCE)
866 ipha->ipha_src = htonl(INADDR_LOOPBACK);
867 ixa->ixa_fragsize = IP_MAXPACKET;
868 ill = NULL;
869 nce = NULL;
870 ire->ire_ob_pkt_count++;
871 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
872 /* No dce yet; use default one */
873 error = (ire->ire_sendfn)(ire, mp, ipha, ixa,
874 &ipst->ips_dce_default->dce_ident);
875 goto done;
876 }
877
878 /* Note that ipha_dst is only used for IRE_MULTICAST */
879 nce = ire_to_nce(ire, ipha->ipha_dst, NULL);
880 if (nce == NULL) {
881 /* Allocation failure? */
882 ip_drop_output("ire_to_nce", mp, ill);
883 freemsg(mp);
884 error = ENOBUFS;
885 goto done;
886 }
887 if (nce->nce_is_condemned) {
888 nce_t *nce1;
889
890 nce1 = ire_handle_condemned_nce(nce, ire, ipha, NULL, B_TRUE);
891 nce_refrele(nce);
892 if (nce1 == NULL) {
893 if (!repeat) {
894 /* Try finding a better IRE */
895 repeat = B_TRUE;
896 ire_refrele(ire);
897 goto repeat_ire;
898 }
899 /* Tried twice - drop packet */
900 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
901 ip_drop_output("No nce", mp, ill);
902 freemsg(mp);
903 error = ENOBUFS;
904 goto done;
905 }
906 nce = nce1;
907 }
908
909 /*
910 * For multicast with multirt we have a flag passed back from
911 * ire_lookup_multi_ill_v4 since we don't have an IRE for each
912 * possible multicast address.
913 * We also need a flag for multicast since we can't check
914 * whether RTF_MULTIRT is set in ixa_ire for multicast.
915 */
916 if (multirt) {
917 ixa->ixa_postfragfn = ip_postfrag_multirt_v4;
918 ixa->ixa_flags |= IXAF_MULTIRT_MULTICAST;
919 } else {
920 ixa->ixa_postfragfn = ire->ire_postfragfn;
921 ixa->ixa_flags &= ~IXAF_MULTIRT_MULTICAST;
922 }
923 ASSERT(ixa->ixa_nce == NULL);
924 ixa->ixa_nce = nce;
925
926 /*
927 * Check for a dce_t with a path mtu.
928 */
929 dce = dce_lookup_v4(dst, ipst, NULL);
930 ASSERT(dce != NULL);
931
932 if (!(ixaflags & IXAF_PMTU_DISCOVERY)) {
933 ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire);
934 } else if (dce->dce_flags & DCEF_PMTU) {
935 /*
936 * To avoid a periodic timer to increase the path MTU we
937 * look at dce_last_change_time each time we send a packet.
938 */
Rafael Vanonid3d50732009-11-13 01:32:32 -0800939 now = ddi_get_lbolt64();
940 if (TICK_TO_SEC(now) - dce->dce_last_change_time >
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800941 ipst->ips_ip_pathmtu_interval) {
942 /*
943 * Older than 20 minutes. Drop the path MTU information.
944 */
945 mutex_enter(&dce->dce_lock);
946 dce->dce_flags &= ~(DCEF_PMTU|DCEF_TOO_SMALL_PMTU);
Rafael Vanonid3d50732009-11-13 01:32:32 -0800947 dce->dce_last_change_time = TICK_TO_SEC(now);
Erik Nordmarkbd670b32009-11-11 11:49:49 -0800948 mutex_exit(&dce->dce_lock);
949 dce_increment_generation(dce);
950 ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire);
951 } else {
952 uint_t fragsize;
953
954 fragsize = ip_get_base_mtu(nce->nce_ill, ire);
955 if (fragsize > dce->dce_pmtu)
956 fragsize = dce->dce_pmtu;
957 ixa->ixa_fragsize = fragsize;
958 }
959 } else {
960 ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire);
961 }
962
963 /*
964 * We use use ire_nexthop_ill (and not ncec_ill) to avoid the under ipmp
965 * interface for source address selection.
966 */
967 ill = ire_nexthop_ill(ire);
968
969 if (ixaflags & IXAF_SET_SOURCE) {
970 ipaddr_t src;
971
972 /*
973 * We use the final destination to get
974 * correct selection for source routed packets
975 */
976
977 /* If unreachable we have no ill but need some source */
978 if (ill == NULL) {
979 src = htonl(INADDR_LOOPBACK);
980 error = 0;
981 } else {
982 error = ip_select_source_v4(ill, setsrc, dst,
983 ixa->ixa_multicast_ifaddr, ixa->ixa_zoneid, ipst,
984 &src, NULL, NULL);
985 }
986 if (error != 0) {
987 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests);
988 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
989 ip_drop_output("ipIfStatsOutDiscards - no source",
990 mp, ill);
991 freemsg(mp);
992 goto done;
993 }
994 ipha->ipha_src = src;
995 } else if (ixaflags & IXAF_VERIFY_SOURCE) {
996 /* Check if the IP source is assigned to the host. */
997 if (!ip_verify_src(mp, ixa, NULL)) {
998 /* Don't send a packet with a source that isn't ours */
999 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
1000 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
1001 ip_drop_output("ipIfStatsOutDiscards - invalid source",
1002 mp, ill);
1003 freemsg(mp);
1004 error = EADDRNOTAVAIL;
1005 goto done;
1006 }
1007 }
1008
1009
1010 /*
1011 * Check against global IPsec policy to set the AH/ESP attributes.
1012 * IPsec will set IXAF_IPSEC_* and ixa_ipsec_* as appropriate.
1013 */
1014 if (!(ixaflags & (IXAF_NO_IPSEC|IXAF_IPSEC_SECURE))) {
1015 ASSERT(ixa->ixa_ipsec_policy == NULL);
1016 mp = ip_output_attach_policy(mp, ipha, NULL, NULL, ixa);
1017 if (mp == NULL) {
1018 /* MIB and ip_drop_packet already done */
1019 return (EHOSTUNREACH); /* IPsec policy failure */
1020 }
1021 }
1022
1023 if (ill != NULL) {
1024 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests);
1025 } else {
1026 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
1027 }
1028
1029 /*
1030 * We update the statistics on the most specific IRE i.e., the first
1031 * one we found.
1032 * We don't have an IRE when we fragment, hence ire_ob_pkt_count
1033 * can only count the use prior to fragmentation. However the MIB
1034 * counters on the ill will be incremented in post fragmentation.
1035 */
1036 ire->ire_ob_pkt_count++;
1037
1038 /*
1039 * Based on ire_type and ire_flags call one of:
1040 * ire_send_local_v4 - for IRE_LOCAL and IRE_LOOPBACK
1041 * ire_send_multirt_v4 - if RTF_MULTIRT
 *    ire_send_noroute_v4 - if RTF_REJECT or RTF_BLACKHOLE
1043 * ire_send_multicast_v4 - for IRE_MULTICAST
1044 * ire_send_broadcast_v4 - for IRE_BROADCAST
1045 * ire_send_wire_v4 - for the rest.
1046 */
1047 error = (ire->ire_sendfn)(ire, mp, ipha, ixa, &dce->dce_ident);
1048done:
1049 ire_refrele(ire);
1050 if (dce != NULL)
1051 dce_refrele(dce);
1052 if (ill != NULL)
1053 ill_refrele(ill);
1054 if (ixa->ixa_nce != NULL)
1055 nce_refrele(ixa->ixa_nce);
1056 ixa->ixa_nce = NULL;
1057 return (error);
1058}
1059
1060/*
1061 * ire_sendfn() functions.
1062 * These functions use the following xmit_attr:
1063 * - ixa_fragsize - read to determine whether or not to fragment
1064 * - IXAF_IPSEC_SECURE - to determine whether or not to invoke IPsec
1065 * - ixa_ipsec_* are used inside IPsec
1066 * - IXAF_SET_SOURCE - replace IP source in broadcast case.
1067 * - IXAF_LOOPBACK_COPY - for multicast and broadcast
1068 */
1069
1070
1071/*
1072 * ire_sendfn for IRE_LOCAL and IRE_LOOPBACK
1073 *
1074 * The checks for restrict_interzone_loopback are done in ire_route_recursive.
1075 */
1076/* ARGSUSED4 */
1077int
1078ire_send_local_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
1079 ip_xmit_attr_t *ixa, uint32_t *identp)
1080{
1081 ipha_t *ipha = (ipha_t *)iph_arg;
1082 ip_stack_t *ipst = ixa->ixa_ipst;
1083 ill_t *ill = ire->ire_ill;
1084 ip_recv_attr_t iras; /* NOTE: No bzero for performance */
1085 uint_t pktlen = ixa->ixa_pktlen;
1086
1087 /*
1088 * No fragmentation, no nce, no application of IPsec,
1089 * and no ipha_ident assignment.
1090 *
1091 * Note different order between IP provider and FW_HOOKS than in
1092 * send_wire case.
1093 */
1094
1095 /*
1096 * DTrace this as ip:::send. A packet blocked by FW_HOOKS will fire the
1097 * send probe, but not the receive probe.
1098 */
1099 DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
1100 ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, NULL,
1101 int, 1);
1102
1103 if (HOOKS4_INTERESTED_LOOPBACK_OUT(ipst)) {
Toomas Soomeab82c292019-12-28 14:24:51 +02001104 int error = 0;
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001105
1106 DTRACE_PROBE4(ip4__loopback__out__start, ill_t *, NULL,
1107 ill_t *, ill, ipha_t *, ipha, mblk_t *, mp);
1108 FW_HOOKS(ipst->ips_ip4_loopback_out_event,
1109 ipst->ips_ipv4firewall_loopback_out,
1110 NULL, ill, ipha, mp, mp, 0, ipst, error);
1111 DTRACE_PROBE1(ip4__loopback__out__end, mblk_t *, mp);
1112 if (mp == NULL)
1113 return (error);
1114
1115 /*
1116 * Even if the destination was changed by the filter we use the
1117 * forwarding decision that was made based on the address
1118 * in ip_output/ip_set_destination.
1119 */
1120 /* Length could be different */
1121 ipha = (ipha_t *)mp->b_rptr;
1122 pktlen = ntohs(ipha->ipha_length);
1123 }
1124
1125 /*
1126 * If a callback is enabled then we need to know the
1127 * source and destination zoneids for the packet. We already
1128 * have those handy.
1129 */
1130 if (ipst->ips_ip4_observe.he_interested) {
1131 zoneid_t szone, dzone;
1132 zoneid_t stackzoneid;
1133
1134 stackzoneid = netstackid_to_zoneid(
1135 ipst->ips_netstack->netstack_stackid);
1136
1137 if (stackzoneid == GLOBAL_ZONEID) {
1138 /* Shared-IP zone */
1139 dzone = ire->ire_zoneid;
1140 szone = ixa->ixa_zoneid;
1141 } else {
1142 szone = dzone = stackzoneid;
1143 }
1144 ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill, ipst);
1145 }
1146
1147 /* Handle lo0 stats */
1148 ipst->ips_loopback_packets++;
1149
1150 /* Map ixa to ira including IPsec policies */
1151 ipsec_out_to_in(ixa, ill, &iras);
1152 iras.ira_pktlen = pktlen;
1153
1154 if (!IS_SIMPLE_IPH(ipha)) {
1155 ip_output_local_options(ipha, ipst);
1156 iras.ira_flags |= IRAF_IPV4_OPTIONS;
1157 }
1158
1159 if (HOOKS4_INTERESTED_LOOPBACK_IN(ipst)) {
Toomas Soomeab82c292019-12-28 14:24:51 +02001160 int error = 0;
Erik Nordmarkbd670b32009-11-11 11:49:49 -08001161
1162 DTRACE_PROBE4(ip4__loopback__in__start, ill_t *, ill,
1163 ill_t *, NULL, ipha_t *, ipha, mblk_t *, mp);
1164 FW_HOOKS(ipst->ips_ip4_loopback_in_event,
1165 ipst->ips_ipv4firewall_loopback_in,
1166 ill, NULL, ipha, mp, mp, 0, ipst, error);
1167
1168 DTRACE_PROBE1(ip4__loopback__in__end, mblk_t *, mp);
1169 if (mp == NULL) {
1170 ira_cleanup(&iras, B_FALSE);
1171 return (error);
1172 }
1173 /*
1174 * Even if the destination was changed by the filter we use the
1175 * forwarding decision that was made based on the address
1176 * in ip_output/ip_set_destination.
1177 */
1178 /* Length could be different */
1179 ipha = (ipha_t *)mp->b_rptr;
1180 pktlen = iras.ira_pktlen = ntohs(ipha->ipha_length);
1181 }
1182
1183 DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
1184 ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, NULL,
1185 int, 1);
1186
1187 ire->ire_ib_pkt_count++;
1188 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives);
1189 UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, pktlen);
1190
1191 /* Destined to ire_zoneid - use that for fanout */
1192 iras.ira_zoneid = ire->ire_zoneid;
1193
1194 if (is_system_labeled()) {
1195 iras.ira_flags |= IRAF_SYSTEM_LABELED;
1196
1197 /*
1198 * This updates ira_cred, ira_tsl and ira_free_flags based
1199 * on the label. We don't expect this to ever fail for
1200 * loopback packets, so we silently drop the packet should it
1201 * fail.
1202 */
1203 if (!tsol_get_pkt_label(mp, IPV4_VERSION, &iras)) {
1204 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1205 ip_drop_input("tsol_get_pkt_label", mp, ill);
1206 freemsg(mp);
1207 return (0);
1208 }
1209 ASSERT(iras.ira_tsl != NULL);
1210
1211 /* tsol_get_pkt_label sometimes does pullupmsg */
1212 ipha = (ipha_t *)mp->b_rptr;
1213 }
1214
1215 ip_fanout_v4(mp, ipha, &iras);
1216
1217 /* We moved any IPsec refs from ixa to iras */
1218 ira_cleanup(&iras, B_FALSE);
1219 return (0);
1220}
1221
/*
 * ire_sendfn for IRE_BROADCAST
 * If the broadcast address is present on multiple ills and ixa_ifindex
 * isn't set, then we generate
 * a separate datagram (potentially with different source address) for
 * those ills. In any case, only one copy is looped back to ip_input_v4.
 *
 * ire:		the IRE_BROADCAST selected for this destination
 * mp:		the packet; consumed on all paths
 * iph_arg:	the packet's IPv4 header (within mp)
 * ixa:		transmit attributes; ixa_nce is temporarily redirected at
 *		each additional ill while its copy is sent
 * identp:	location of the IP ident counter, passed to ire_send_wire_v4
 *
 * Returns the errno from sending the primary (caller's) IRE; errors on the
 * additional per-ill copies are deliberately ignored.
 */
int
ire_send_broadcast_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
    ip_xmit_attr_t *ixa, uint32_t *identp)
{
	ipha_t		*ipha = (ipha_t *)iph_arg;
	ip_stack_t	*ipst = ixa->ixa_ipst;
	irb_t		*irb = ire->ire_bucket;
	ire_t		*ire1;
	mblk_t		*mp1;
	ipha_t		*ipha1;
	iaflags_t	ixaflags = ixa->ixa_flags;
	nce_t		*nce1, *nce_orig;

	/*
	 * Unless ire_send_multirt_v4 already set a ttl, force the
	 * ttl to a smallish value.
	 */
	if (!(ixa->ixa_flags & IXAF_NO_TTL_CHANGE)) {
		/*
		 * To avoid broadcast storms, we usually set the TTL to 1 for
		 * broadcasts. This can
		 * be overridden stack-wide through the ip_broadcast_ttl
		 * ndd tunable, or on a per-connection basis through the
		 * IP_BROADCAST_TTL socket option.
		 *
		 * If SO_DONTROUTE/IXAF_DONTROUTE is set, then ire_send_wire_v4
		 * will force ttl to one after we've set this.
		 */
		if (ixaflags & IXAF_BROADCAST_TTL_SET)
			ipha->ipha_ttl = ixa->ixa_broadcast_ttl;
		else
			ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl;
	}
	/*
	 * Make sure we get a loopback copy (after IPsec and frag)
	 * Skip hardware checksum so that loopback copy is checksummed.
	 */
	ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM;

	/* Do we need to potentially generate multiple copies? */
	if (irb->irb_ire_cnt == 1 || ixa->ixa_ifindex != 0)
		return (ire_send_wire_v4(ire, mp, ipha, ixa, identp));

	/*
	 * Loop over all IRE_BROADCAST in the bucket (might only be one).
	 * Note that everything in the bucket has the same destination address.
	 */
	irb_refhold(irb);
	for (ire1 = irb->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) {
		/* We do the main IRE after the end of the loop */
		if (ire1 == ire)
			continue;

		/*
		 * Only IREs for the same IP address should be in the same
		 * bucket.
		 * But could have IRE_HOSTs in the case of CGTP.
		 * If we find any multirt routes we bail out of the loop
		 * and just do the single packet at the end; ip_postfrag_multirt
		 * will duplicate the packet.
		 */
		ASSERT(ire1->ire_addr == ire->ire_addr);
		if (!(ire1->ire_type & IRE_BROADCAST))
			continue;

		if (IRE_IS_CONDEMNED(ire1))
			continue;

		if (ixa->ixa_zoneid != ALL_ZONES &&
		    ire->ire_zoneid != ire1->ire_zoneid)
			continue;

		ASSERT(ire->ire_ill != ire1->ire_ill && ire1->ire_ill != NULL);

		if (ire1->ire_flags & RTF_MULTIRT)
			break;

		/*
		 * For IPMP we only send for the ipmp_ill. arp_nce_init() will
		 * ensure that this goes out on the cast_ill.
		 */
		if (IS_UNDER_IPMP(ire1->ire_ill))
			continue;

		/* Copy the packet for this additional interface */
		mp1 = copymsg(mp);
		if (mp1 == NULL) {
			BUMP_MIB(ire1->ire_ill->ill_ip_mib,
			    ipIfStatsOutDiscards);
			ip_drop_output("ipIfStatsOutDiscards",
			    mp, ire1->ire_ill);
			continue;
		}

		ipha1 = (ipha_t *)mp1->b_rptr;
		if (ixa->ixa_flags & IXAF_SET_SOURCE) {
			/*
			 * Need to pick a different source address for each
			 * interface. If we have a global IPsec policy and
			 * no per-socket policy then we punt to
			 * ip_output_simple_v4 using a separate ip_xmit_attr_t.
			 */
			if (ixaflags & IXAF_IPSEC_GLOBAL_POLICY) {
				ip_output_simple_broadcast(ixa, mp1);
				continue;
			}
			/* Pick a new source address for each interface */
			if (ip_select_source_v4(ire1->ire_ill, INADDR_ANY,
			    ipha1->ipha_dst, INADDR_ANY, ixa->ixa_zoneid, ipst,
			    &ipha1->ipha_src, NULL, NULL) != 0) {
				BUMP_MIB(ire1->ire_ill->ill_ip_mib,
				    ipIfStatsOutDiscards);
				ip_drop_output("ipIfStatsOutDiscards - select "
				    "broadcast source", mp1, ire1->ire_ill);
				freemsg(mp1);
				continue;
			}
			/*
			 * Check against global IPsec policy to set the AH/ESP
			 * attributes. IPsec will set IXAF_IPSEC_* and
			 * ixa_ipsec_* as appropriate.
			 *
			 * NOTE(review): this passes the original header
			 * (ipha) rather than the copy's header (ipha1) to
			 * the policy check, i.e. the just-selected per-ill
			 * source is not used as a policy selector -
			 * presumably acceptable since the destination is
			 * the same, but confirm.
			 */
			if (!(ixaflags & (IXAF_NO_IPSEC|IXAF_IPSEC_SECURE))) {
				ASSERT(ixa->ixa_ipsec_policy == NULL);
				mp1 = ip_output_attach_policy(mp1, ipha, NULL,
				    NULL, ixa);
				if (mp1 == NULL) {
					/*
					 * MIB and ip_drop_packet already
					 * done
					 */
					continue;
				}
			}
		}
		/* Make sure we have an NCE on this ill */
		nce1 = arp_nce_init(ire1->ire_ill, ire1->ire_addr,
		    ire1->ire_type);
		if (nce1 == NULL) {
			BUMP_MIB(ire1->ire_ill->ill_ip_mib,
			    ipIfStatsOutDiscards);
			ip_drop_output("ipIfStatsOutDiscards - broadcast nce",
			    mp1, ire1->ire_ill);
			freemsg(mp1);
			continue;
		}
		/* Temporarily redirect ixa_nce at this ill's NCE */
		nce_orig = ixa->ixa_nce;
		ixa->ixa_nce = nce1;

		ire_refhold(ire1);
		/*
		 * Ignore any errors here. We just collect the errno for
		 * the main ire below
		 */
		(void) ire_send_wire_v4(ire1, mp1, ipha1, ixa, identp);
		ire_refrele(ire1);

		/* Restore the caller's NCE */
		ixa->ixa_nce = nce_orig;
		nce_refrele(nce1);

		/* Only the first copy is looped back to ip_input_v4 */
		ixa->ixa_flags &= ~IXAF_LOOPBACK_COPY;
	}
	irb_refrele(irb);
	/* Finally, the main one */

	/*
	 * For IPMP we only send broadcasts on the ipmp_ill.
	 */
	if (IS_UNDER_IPMP(ire->ire_ill)) {
		freemsg(mp);
		return (0);
	}

	return (ire_send_wire_v4(ire, mp, ipha, ixa, identp));
}
1403
1404/*
1405 * Send a packet using a different source address and different
1406 * IPsec policy.
1407 */
1408static void
1409ip_output_simple_broadcast(ip_xmit_attr_t *ixa, mblk_t *mp)
1410{
1411 ip_xmit_attr_t ixas;
1412
1413 bzero(&ixas, sizeof (ixas));
1414 ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4;
1415 ixas.ixa_zoneid = ixa->ixa_zoneid;
1416 ixas.ixa_ifindex = 0;
1417 ixas.ixa_ipst = ixa->ixa_ipst;
1418 ixas.ixa_cred = ixa->ixa_cred;
1419 ixas.ixa_cpid = ixa->ixa_cpid;
1420 ixas.ixa_tsl = ixa->ixa_tsl;
1421 ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
1422
1423 (void) ip_output_simple(mp, &ixas);
1424 ixa_cleanup(&ixas);
1425}
1426
1427
1428static void
1429multirt_check_v4(ire_t *ire, ipha_t *ipha, ip_xmit_attr_t *ixa)
1430{
1431 ip_stack_t *ipst = ixa->ixa_ipst;
1432
1433 /* Limit the TTL on multirt packets */
1434 if (ire->ire_type & IRE_MULTICAST) {
1435 if (ipha->ipha_ttl > 1) {
1436 ip2dbg(("ire_send_multirt_v4: forcing multicast "
1437 "multirt TTL to 1 (was %d), dst 0x%08x\n",
1438 ipha->ipha_ttl, ntohl(ire->ire_addr)));
1439 ipha->ipha_ttl = 1;
1440 }
1441 ixa->ixa_flags |= IXAF_NO_TTL_CHANGE;
1442 } else if ((ipst->ips_ip_multirt_ttl > 0) &&
1443 (ipha->ipha_ttl > ipst->ips_ip_multirt_ttl)) {
1444 ipha->ipha_ttl = ipst->ips_ip_multirt_ttl;
1445 /*
1446 * Need to ensure we don't increase the ttl should we go through
1447 * ire_send_broadcast or multicast.
1448 */
1449 ixa->ixa_flags |= IXAF_NO_TTL_CHANGE;
1450 }
1451}
1452
/*
 * ire_sendfn for IRE_MULTICAST
 *
 * ire:		the IRE_MULTICAST entry selected for this destination
 * mp:		the packet; consumed on all paths
 * iph_arg:	the packet's IPv4 header (within mp)
 * ixa:		transmit attributes; loopback-copy flags are updated here
 * identp:	location of the IP ident counter, passed to ire_send_wire_v4
 *
 * Decides whether a loopback copy of the (post-IPsec, post-fragmentation)
 * packet must be delivered back to ip_input_v4, applies the multicast TTL,
 * and hands the packet to ire_send_wire_v4.
 */
int
ire_send_multicast_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
    ip_xmit_attr_t *ixa, uint32_t *identp)
{
	ipha_t		*ipha = (ipha_t *)iph_arg;
	ip_stack_t	*ipst = ixa->ixa_ipst;
	ill_t		*ill = ire->ire_ill;
	iaflags_t	ixaflags = ixa->ixa_flags;

	/*
	 * The IRE_MULTICAST is the same whether or not multirt is in use.
	 * Hence we need special-case code.
	 */
	if (ixaflags & IXAF_MULTIRT_MULTICAST)
		multirt_check_v4(ire, ipha, ixa);

	/*
	 * Check if anything in ip_input_v4 wants a copy of the transmitted
	 * packet (after IPsec and fragmentation)
	 *
	 * 1. Multicast routers always need a copy unless SO_DONTROUTE is set
	 *    RSVP and the rsvp daemon is an example of a
	 *    protocol and user level process that
	 *    handles its own routing. Hence, it uses the
	 *    SO_DONTROUTE option to accomplish this.
	 * 2. If the sender has set IP_MULTICAST_LOOP, then we just
	 *    check whether there are any receivers for the group on the ill
	 *    (ignoring the zoneid).
	 * 3. If IP_MULTICAST_LOOP is not set, then we check if there are
	 *    any members in other shared-IP zones.
	 *    If such members exist, then we indicate that the sending zone
	 *    shouldn't get a loopback copy to preserve the IP_MULTICAST_LOOP
	 *    behavior.
	 *
	 * When we loopback we skip hardware checksum to make sure loopback
	 * copy is checksummed.
	 *
	 * Note that ire_ill is the upper in the case of IPMP.
	 */
	ixa->ixa_flags &= ~(IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM);
	if (ipst->ips_ip_g_mrouter && ill->ill_mrouter_cnt > 0 &&
	    !(ixaflags & IXAF_DONTROUTE)) {
		/* Case 1: an mrouter is running on this ill */
		ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM;
	} else if (ixaflags & IXAF_MULTICAST_LOOP) {
		/*
		 * Case 2: if this zone or any other zone has members then
		 * loopback a copy.
		 */
		if (ill_hasmembers_v4(ill, ipha->ipha_dst))
			ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM;
	} else if (ipst->ips_netstack->netstack_numzones > 1) {
		/*
		 * Case 3: this zone should not have a copy. But there are
		 * some other zones which might have members.
		 */
		if (ill_hasmembers_otherzones_v4(ill, ipha->ipha_dst,
		    ixa->ixa_zoneid)) {
			/* Exclude the sending zone from the loopback copy */
			ixa->ixa_flags |= IXAF_NO_LOOP_ZONEID_SET;
			ixa->ixa_no_loop_zoneid = ixa->ixa_zoneid;
			ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM;
		}
	}

	/*
	 * Unless ire_send_multirt_v4 or icmp_output_hdrincl already set a ttl,
	 * force the ttl to the IP_MULTICAST_TTL value
	 */
	if (!(ixaflags & IXAF_NO_TTL_CHANGE)) {
		ipha->ipha_ttl = ixa->ixa_multicast_ttl;
	}

	return (ire_send_wire_v4(ire, mp, ipha, ixa, identp));
}
1529
1530/*
1531 * ire_sendfn for IREs with RTF_MULTIRT
1532 */
1533int
1534ire_send_multirt_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
1535 ip_xmit_attr_t *ixa, uint32_t *identp)
1536{
1537 ipha_t *ipha = (ipha_t *)iph_arg;
1538
1539 multirt_check_v4(ire, ipha, ixa);
1540
1541 if (ire->ire_type & IRE_MULTICAST)
1542 return (ire_send_multicast_v4(ire, mp, ipha, ixa, identp));
1543 else if (ire->ire_type & IRE_BROADCAST)
1544 return (ire_send_broadcast_v4(ire, mp, ipha, ixa, identp));
1545 else
1546 return (ire_send_wire_v4(ire, mp, ipha, ixa, identp));
1547}
1548
/*
 * ire_sendfn for IREs with RTF_REJECT/RTF_BLACKHOLE, including IRE_NOROUTE
 *
 * ire:		the reject/blackhole/noroute IRE
 * mp:		the packet; consumed on all paths
 * iph_arg:	the packet's IPv4 header (within mp)
 * ixa:		transmit attributes
 * identp:	location of the IP ident counter; an ident is assigned so
 *		any generated ICMP error carries a sensible value
 *
 * RTF_BLACKHOLE silently discards and returns 0; otherwise an ICMP
 * unreachable is generated and EHOSTUNREACH returned.
 */
int
ire_send_noroute_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
    ip_xmit_attr_t *ixa, uint32_t *identp)
{
	ip_stack_t	*ipst = ixa->ixa_ipst;
	ipha_t		*ipha = (ipha_t *)iph_arg;
	ill_t		*ill;
	ip_recv_attr_t	iras;
	boolean_t	dummy;

	/* We assign an IP ident for nice errors */
	ipha->ipha_ident = atomic_inc_32_nv(identp);

	BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutNoRoutes);

	if (ire->ire_type & IRE_NOROUTE) {
		/* A lack of a route as opposed to RTF_REJECT|BLACKHOLE */
		ip_rts_change(RTM_MISS, ipha->ipha_dst, 0, 0, 0, 0, 0, 0,
		    RTA_DST, ipst);
	}

	if (ire->ire_flags & RTF_BLACKHOLE) {
		ip_drop_output("ipIfStatsOutNoRoutes RTF_BLACKHOLE", mp, NULL);
		freemsg(mp);
		/* No error even for local senders - silent blackhole */
		return (0);
	}
	ip_drop_output("ipIfStatsOutNoRoutes RTF_REJECT", mp, NULL);

	/*
	 * We need an ill_t for the ip_recv_attr_t even though this packet
	 * was never received and icmp_unreachable doesn't currently use
	 * ira_ill.
	 *
	 * NOTE(review): the isv6 argument tests IRAF_IS_IPV4 (an ira flag)
	 * against ixa_flags - this relies on IRAF_IS_IPV4 and IXAF_IS_IPV4
	 * sharing the same common IAF_* bit; confirm against ip.h.
	 */
	ill = ill_lookup_on_name("lo0", B_FALSE,
	    !(ixa->ixa_flags & IRAF_IS_IPV4), &dummy, ipst);
	if (ill == NULL) {
		freemsg(mp);
		return (EHOSTUNREACH);
	}

	bzero(&iras, sizeof (iras));
	/* Map ixa to ira including IPsec policies */
	ipsec_out_to_in(ixa, ill, &iras);

	/* Pick the ICMP code matching why delivery failed */
	if (ip_source_routed(ipha, ipst)) {
		icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, &iras);
	} else {
		icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, &iras);
	}
	/* We moved any IPsec refs from ixa to iras */
	ira_cleanup(&iras, B_FALSE);
	ill_refrele(ill);
	return (EHOSTUNREACH);
}
1607
/*
 * Calculate a checksum ignoring any hardware capabilities
 *
 * Computes the ULP (TCP/UDP/SCTP) checksum in software, then the IPv4
 * header checksum. For other protocols only the header checksum is done.
 *
 * Returns B_FALSE if the packet was too short for the checksum. Caller
 * should free and do stats.
 * NOTE(review): as visible here this v4 variant always returns B_TRUE;
 * the B_FALSE case presumably applies to a sibling variant - confirm.
 */
static boolean_t
ip_output_sw_cksum_v4(mblk_t *mp, ipha_t *ipha, ip_xmit_attr_t *ixa)
{
	ip_stack_t	*ipst = ixa->ixa_ipst;
	uint_t		pktlen = ixa->ixa_pktlen;
	uint16_t	*cksump;
	uint32_t	cksum;
	uint8_t		protocol = ixa->ixa_protocol;
	uint16_t	ip_hdr_length = ixa->ixa_ip_hdr_length;
	ipaddr_t	dst = ipha->ipha_dst;
	ipaddr_t	src = ipha->ipha_src;

	/* Just in case it contained garbage */
	DB_CKSUMFLAGS(mp) &= ~HCK_FLAGS;

	/*
	 * Calculate ULP checksum. For TCP/UDP we set up cksump (location of
	 * the checksum field) and cksum (pseudo-header protocol constant);
	 * SCTP uses its own CRC and skips straight to the header checksum.
	 */
	if (protocol == IPPROTO_TCP) {
		cksump = IPH_TCPH_CHECKSUMP(ipha, ip_hdr_length);
		cksum = IP_TCP_CSUM_COMP;
	} else if (protocol == IPPROTO_UDP) {
		cksump = IPH_UDPH_CHECKSUMP(ipha, ip_hdr_length);
		cksum = IP_UDP_CSUM_COMP;
	} else if (protocol == IPPROTO_SCTP) {
		sctp_hdr_t	*sctph;

		ASSERT(MBLKL(mp) >= (ip_hdr_length + sizeof (*sctph)));
		sctph = (sctp_hdr_t *)(mp->b_rptr + ip_hdr_length);
		/*
		 * Zero out the checksum field to ensure proper
		 * checksum calculation.
		 */
		sctph->sh_chksum = 0;
#ifdef	DEBUG
		if (!skip_sctp_cksum)
#endif
			sctph->sh_chksum = sctp_cksum(mp, ip_hdr_length);
		goto ip_hdr_cksum;
	} else {
		goto ip_hdr_cksum;
	}

	/* The ULP checksum field is in the first mblk */
	ASSERT(((uchar_t *)cksump) + sizeof (uint16_t) <= mp->b_wptr);

	/*
	 * We accumulate the pseudo header checksum in cksum.
	 * This is pretty hairy code, so watch close. One
	 * thing to keep in mind is that UDP and TCP have
	 * stored their respective datagram lengths in their
	 * checksum fields. This lines things up real nice.
	 */
	cksum += (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF);

	cksum = IP_CSUM(mp, ip_hdr_length, cksum);
	/*
	 * For UDP/IPv4 a zero means that the packet wasn't checksummed.
	 * Change to 0xffff (~0 truncated to 16 bits).
	 */
	if (protocol == IPPROTO_UDP && cksum == 0)
		*cksump = ~cksum;
	else
		*cksump = cksum;

	IP_STAT(ipst, ip_out_sw_cksum);
	IP_STAT_UPDATE(ipst, ip_out_sw_cksum_bytes, pktlen);

ip_hdr_cksum:
	/* Calculate IPv4 header checksum */
	ipha->ipha_hdr_checksum = 0;
	ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
	return (B_TRUE);
}
1688
/*
 * Calculate the ULP checksum - try to use hardware.
 * In the case of MULTIRT, broadcast or multicast the
 * IXAF_NO_HW_CKSUM is set in which case we use software.
 *
 * If the hardware supports IP header checksum offload; then clear the
 * contents of IP header checksum field as expected by NIC.
 * Do this only if we offloaded either full or partial sum.
 *
 * Returns B_FALSE if the packet was too short for the checksum. Caller
 * should free and do stats.
 * NOTE(review): as visible here, every path either delegates to
 * ip_output_sw_cksum_v4 or returns B_TRUE directly - confirm where the
 * B_FALSE case arises.
 */
static boolean_t
ip_output_cksum_v4(iaflags_t ixaflags, mblk_t *mp, ipha_t *ipha,
    ip_xmit_attr_t *ixa, ill_t *ill)
{
	uint_t		pktlen = ixa->ixa_pktlen;
	uint16_t	*cksump;
	uint16_t	hck_flags;
	uint32_t	cksum;
	uint8_t		protocol = ixa->ixa_protocol;
	uint16_t	ip_hdr_length = ixa->ixa_ip_hdr_length;

	/* Fall back to software when offload is disallowed or unavailable */
	if ((ixaflags & IXAF_NO_HW_CKSUM) || !ILL_HCKSUM_CAPABLE(ill) ||
	    !dohwcksum) {
		return (ip_output_sw_cksum_v4(mp, ipha, ixa));
	}

	/*
	 * Calculate ULP checksum. Note that we don't use cksump and cksum
	 * if the ill has FULL support.
	 */
	if (protocol == IPPROTO_TCP) {
		cksump = IPH_TCPH_CHECKSUMP(ipha, ip_hdr_length);
		cksum = IP_TCP_CSUM_COMP;	/* Pseudo-header cksum */
	} else if (protocol == IPPROTO_UDP) {
		cksump = IPH_UDPH_CHECKSUMP(ipha, ip_hdr_length);
		cksum = IP_UDP_CSUM_COMP;	/* Pseudo-header cksum */
	} else if (protocol == IPPROTO_SCTP) {
		sctp_hdr_t	*sctph;

		ASSERT(MBLKL(mp) >= (ip_hdr_length + sizeof (*sctph)));
		sctph = (sctp_hdr_t *)(mp->b_rptr + ip_hdr_length);
		/*
		 * Zero out the checksum field to ensure proper
		 * checksum calculation.
		 */
		sctph->sh_chksum = 0;
#ifdef	DEBUG
		if (!skip_sctp_cksum)
#endif
			sctph->sh_chksum = sctp_cksum(mp, ip_hdr_length);
		goto ip_hdr_cksum;
	} else if (protocol == IPPROTO_ICMP) {
		/*
		 * Note that we always calculate a SW checksum for ICMP. In the
		 * future, if HW support for ICMP is advertised, we can change
		 * this.
		 */
		return (ip_output_sw_cksum_v4(mp, ipha, ixa));
	} else {
		/* No ULP checksum for other protocols - header only */
	ip_hdr_cksum:
		/* Calculate IPv4 header checksum */
		ipha->ipha_hdr_checksum = 0;
		ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
		return (B_TRUE);
	}

	/* The ULP checksum field is in the first mblk */
	ASSERT(((uchar_t *)cksump) + sizeof (uint16_t) <= mp->b_wptr);

	/*
	 * Underlying interface supports hardware checksum offload for
	 * the payload; leave the payload checksum for the hardware to
	 * calculate. N.B: We only need to set up checksum info on the
	 * first mblk.
	 */
	hck_flags = ill->ill_hcksum_capab->ill_hcksum_txflags;

	DB_CKSUMFLAGS(mp) &= ~HCK_FLAGS;
	if (hck_flags & HCKSUM_INET_FULL_V4) {
		/*
		 * Hardware calculates pseudo-header, header and the
		 * payload checksums, so clear the checksum field in
		 * the protocol header.
		 */
		*cksump = 0;
		DB_CKSUMFLAGS(mp) |= HCK_FULLCKSUM;

		ipha->ipha_hdr_checksum = 0;
		if (hck_flags & HCKSUM_IPHDRCKSUM) {
			/* NIC also computes the IPv4 header checksum */
			DB_CKSUMFLAGS(mp) |= HCK_IPV4_HDRCKSUM;
		} else {
			ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
		}
		return (B_TRUE);
	}
	if ((hck_flags) & HCKSUM_INET_PARTIAL) {
		ipaddr_t dst = ipha->ipha_dst;
		ipaddr_t src = ipha->ipha_src;
		/*
		 * Partial checksum offload has been enabled. Fill
		 * the checksum field in the protocol header with the
		 * pseudo-header checksum value.
		 *
		 * We accumulate the pseudo header checksum in cksum.
		 * This is pretty hairy code, so watch close. One
		 * thing to keep in mind is that UDP and TCP have
		 * stored their respective datagram lengths in their
		 * checksum fields. This lines things up real nice.
		 */
		cksum += (dst >> 16) + (dst & 0xFFFF) +
		    (src >> 16) + (src & 0xFFFF);
		cksum += *(cksump);
		cksum = (cksum & 0xFFFF) + (cksum >> 16);
		*(cksump) = (cksum & 0xFFFF) + (cksum >> 16);

		/*
		 * Offsets are relative to beginning of IP header.
		 */
		DB_CKSUMSTART(mp) = ip_hdr_length;
		DB_CKSUMSTUFF(mp) = (uint8_t *)cksump - (uint8_t *)ipha;
		DB_CKSUMEND(mp) = pktlen;
		DB_CKSUMFLAGS(mp) |= HCK_PARTIALCKSUM;

		ipha->ipha_hdr_checksum = 0;
		if (hck_flags & HCKSUM_IPHDRCKSUM) {
			/* NIC also computes the IPv4 header checksum */
			DB_CKSUMFLAGS(mp) |= HCK_IPV4_HDRCKSUM;
		} else {
			ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
		}
		return (B_TRUE);
	}
	/* Hardware capabilities include neither full nor partial IPv4 */
	return (ip_output_sw_cksum_v4(mp, ipha, ixa));
}
1825
1826/*
1827 * ire_sendfn for offlink and onlink destinations.
1828 * Also called from the multicast, broadcast, multirt send functions.
1829 *
1830 * Assumes that the caller has a hold on the ire.
1831 *
1832 * This function doesn't care if the IRE just became condemned since that
1833 * can happen at any time.
1834 */
1835/* ARGSUSED */
1836int
1837ire_send_wire_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
1838 ip_xmit_attr_t *ixa, uint32_t *identp)
1839{
1840 ip_stack_t *ipst = ixa->ixa_ipst;
1841 ipha_t *ipha = (ipha_t *)iph_arg;
1842 iaflags_t ixaflags = ixa->ixa_flags;
1843 ill_t *ill;
1844
1845 ASSERT(ixa->ixa_nce != NULL);
1846 ill = ixa->ixa_nce->nce_ill;
1847
1848 if (ixaflags & IXAF_DONTROUTE)
1849 ipha->ipha_ttl = 1;
1850
1851 /*
1852 * Assign an ident value for this packet. There could be other
1853 * threads targeting the same destination, so we have to arrange
1854 * for a atomic increment. Note that we use a 32-bit atomic add
1855 * because it has better performance than its 16-bit sibling.
1856 *
1857 * Normally ixa_extra_ident is 0, but in the case of LSO it will
1858 * be the number of TCP segments that the driver/hardware will
1859 * extraly construct.
1860 *
1861 * If running in cluster mode and if the source address
1862 * belongs to a replicated service then vector through
1863 * cl_inet_ipident vector to allocate ip identifier
1864 * NOTE: This is a contract private interface with the
1865 * clustering group.
1866 */
1867 if (cl_inet_ipident != NULL) {
1868 ipaddr_t src = ipha->ipha_src;
1869 ipaddr_t dst = ipha->ipha_dst;
1870 netstackid_t stack_id = ipst->ips_netstack->netstack_stackid;
1871
1872 ASSERT(cl_inet_isclusterwide != NULL);
1873 if ((*cl_inet_isclusterwide)(stack_id, IPPROTO_IP,
1874 AF_INET, (uint8_t *)(uintptr_t)src, NULL)) {
1875 /*
1876 * Note: not correct with LSO since we can't allocate
1877 * ixa_extra_ident+1 consecutive values.
1878 */
1879 ipha->ipha_ident = (*cl_inet_ipident)(stack_id,
1880 IPPROTO_IP, AF_INET, (uint8_t *)(uintptr_t)src,
1881 (uint8_t *)(uintptr_t)dst, NULL);
1882 } else {
1883 ipha->ipha_ident = atomic_add_32_nv(identp,
1884 ixa->ixa_extra_ident + 1);
1885 }
1886 } else {
1887 ipha->ipha_ident = atomic_add_32_nv(identp,
1888 ixa->ixa_extra_ident + 1);
1889 }
1890#ifndef _BIG_ENDIAN
1891 ipha->ipha_ident = htons(ipha->ipha_ident);
1892#endif
1893
1894 /*
1895 * This might set b_band, thus the IPsec and fragmentation
1896 * code in IP ensures that b_band is updated in the first mblk.
1897 */
1898 if (IPP_ENABLED(IPP_LOCAL_OUT, ipst)) {
1899 /* ip_process translates an IS_UNDER_IPMP */
1900 mp = ip_process(IPP_LOCAL_OUT, mp, ill, ill);
1901 if (mp == NULL) {
1902 /* ip_drop_packet and MIB done */
1903 return (0); /* Might just be delayed */
1904 }
1905 }
1906
1907 /*
1908 * Verify any IPv4 options.
1909 *
1910 * The presense of IP options also forces the network stack to
1911 * calculate the checksum in software. This is because:
1912 *
1913 * Wrap around: certain partial-checksum NICs (eri, ce) limit
1914 * the size of "start offset" width to 6-bit. This effectively
1915 * sets the largest value of the offset to 64-bytes, starting
1916 * from the MAC header. When the cumulative MAC and IP headers
1917 * exceed such limit, the offset will wrap around. This causes
1918 * the checksum to be calculated at the wrong place.
1919 *
1920 * IPv4 source routing: none of the full-checksum capable NICs
1921 * is capable of correctly handling the IPv4 source-routing
1922 * option for purposes of calculating the pseudo-header; the
1923 * actual destination is different from the destination in the
1924 * header which is that of the next-hop. (This case may not be
1925 * true for NICs which can parse IPv6 extension headers, but
1926 * we choose to simplify the implementation by not offloading
1927 * checksum when they are present.)
1928 */
1929 if (!IS_SIMPLE_IPH(ipha)) {
1930 ixaflags = ixa->ixa_flags |= IXAF_NO_HW_CKSUM;
1931 /* An IS_UNDER_IPMP ill is ok here */
1932 if (ip_output_options(mp, ipha, ixa, ill)) {
1933 /* Packet has been consumed and ICMP error sent */
1934 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
1935 return (EINVAL);
1936 }
1937 }
1938
1939 /*
1940 * To handle IPsec/iptun's labeling needs we need to tag packets
1941 * while we still have ixa_tsl
1942 */
1943 if (is_system_labeled() && ixa->ixa_tsl != NULL &&
1944 (ill->ill_mactype == DL_6TO4 || ill->ill_mactype == DL_IPV4 ||
1945 ill->ill_mactype == DL_IPV6)) {
1946 cred_t *newcr;
1947
1948 newcr = copycred_from_tslabel(ixa->ixa_cred, ixa->ixa_tsl,
1949 KM_NOSLEEP);
1950 if (newcr == NULL) {
1951 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
1952 ip_drop_output("ipIfStatsOutDiscards - newcr",
1953 mp, ill);
1954 freemsg(mp);
1955 return (ENOBUFS);
1956 }
1957 mblk_setcred(mp, newcr, NOPID);
1958 crfree(newcr); /* mblk_setcred did its own crhold */
1959 }
1960
1961 if (ixa->ixa_pktlen > ixa->ixa_fragsize ||
1962 (ixaflags & IXAF_IPSEC_SECURE)) {
1963 uint32_t pktlen;
1964
1965 pktlen = ixa->ixa_pktlen;
1966 if (ixaflags & IXAF_IPSEC_SECURE)
1967 pktlen += ipsec_out_extra_length(ixa);
1968
1969 if (pktlen > IP_MAXPACKET)
1970 return (EMSGSIZE);
1971
1972 if (ixaflags & IXAF_SET_ULP_CKSUM) {
1973 /*
1974 * Compute ULP checksum and IP header checksum
1975 * using software
1976 */
1977 if (!ip_output_sw_cksum_v4(mp, ipha, ixa)) {
1978 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
1979 ip_drop_output("ipIfStatsOutDiscards", mp, ill);
1980 freemsg(mp);
1981 return (EINVAL);
1982 }
1983 } else {
1984 /* Calculate IPv4 header checksum */
1985 ipha->ipha_hdr_checksum = 0;
1986 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
1987 }
1988
1989 /*
1990 * If this packet would generate a icmp_frag_needed
1991 * message, we need to handle it before we do the IPsec
1992 * processing. Otherwise, we need to strip the IPsec
1993 * headers before we send up the message to the ULPs
1994 * which becomes messy and difficult.
1995 *
1996 * We check using IXAF_DONTFRAG. The DF bit in the header
1997 * is not inspected - it will be copied to any generated
1998 * fragments.
1999 */
2000 if ((pktlen > ixa->ixa_fragsize) &&
2001 (ixaflags & IXAF_DONTFRAG)) {
2002 /* Generate ICMP and return error */
2003 ip_recv_attr_t iras;
2004
2005 DTRACE_PROBE4(ip4__fragsize__fail, uint_t, pktlen,
2006 uint_t, ixa->ixa_fragsize, uint_t, ixa->ixa_pktlen,
2007 uint_t, ixa->ixa_pmtu);
2008
2009 bzero(&iras, sizeof (iras));
2010 /* Map ixa to ira including IPsec policies */
2011 ipsec_out_to_in(ixa, ill, &iras);
2012
2013 ip_drop_output("ICMP_FRAG_NEEDED", mp, ill);
2014 icmp_frag_needed(mp, ixa->ixa_fragsize, &iras);
2015 /* We moved any IPsec refs from ixa to iras */
2016 ira_cleanup(&iras, B_FALSE);
2017 return (EMSGSIZE);
2018 }
2019 DTRACE_PROBE4(ip4__fragsize__ok, uint_t, pktlen,
2020 uint_t, ixa->ixa_fragsize, uint_t, ixa->ixa_pktlen,
2021 uint_t, ixa->ixa_pmtu);
2022
2023 if (ixaflags & IXAF_IPSEC_SECURE) {
2024 /*
2025 * Pass in sufficient information so that
2026 * IPsec can determine whether to fragment, and
2027 * which function to call after fragmentation.
2028 */
2029 return (ipsec_out_process(mp, ixa));
2030 }
2031 return (ip_fragment_v4(mp, ixa->ixa_nce, ixaflags,
2032 ixa->ixa_pktlen, ixa->ixa_fragsize, ixa->ixa_xmit_hint,
2033 ixa->ixa_zoneid, ixa->ixa_no_loop_zoneid,
2034 ixa->ixa_postfragfn, &ixa->ixa_cookie));
2035 }
2036 if (ixaflags & IXAF_SET_ULP_CKSUM) {
2037 /* Compute ULP checksum and IP header checksum */
2038 /* An IS_UNDER_IPMP ill is ok here */
2039 if (!ip_output_cksum_v4(ixaflags, mp, ipha, ixa, ill)) {
2040 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2041 ip_drop_output("ipIfStatsOutDiscards", mp, ill);
2042 freemsg(mp);
2043 return (EINVAL);
2044 }
2045 } else {
2046 /* Calculate IPv4 header checksum */
2047 ipha->ipha_hdr_checksum = 0;
2048 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
2049 }
2050 return ((ixa->ixa_postfragfn)(mp, ixa->ixa_nce, ixaflags,
2051 ixa->ixa_pktlen, ixa->ixa_xmit_hint, ixa->ixa_zoneid,
2052 ixa->ixa_no_loop_zoneid, &ixa->ixa_cookie));
2053}
2054
2055/*
2056 * Send mp into ip_input
2057 * Common for IPv4 and IPv6
2058 */
2059void
2060ip_postfrag_loopback(mblk_t *mp, nce_t *nce, iaflags_t ixaflags,
2061 uint_t pkt_len, zoneid_t nolzid)
2062{
2063 rtc_t rtc;
2064 ill_t *ill = nce->nce_ill;
2065 ip_recv_attr_t iras; /* NOTE: No bzero for performance */
2066 ncec_t *ncec;
2067
2068 ncec = nce->nce_common;
2069 iras.ira_flags = IRAF_VERIFY_IP_CKSUM | IRAF_VERIFY_ULP_CKSUM |
2070 IRAF_LOOPBACK | IRAF_L2SRC_LOOPBACK;
2071 if (ncec->ncec_flags & NCE_F_BCAST)
2072 iras.ira_flags |= IRAF_L2DST_BROADCAST;
2073 else if (ncec->ncec_flags & NCE_F_MCAST)
2074 iras.ira_flags |= IRAF_L2DST_MULTICAST;
2075
2076 iras.ira_free_flags = 0;
2077 iras.ira_cred = NULL;
2078 iras.ira_cpid = NOPID;
2079 iras.ira_tsl = NULL;
2080 iras.ira_zoneid = ALL_ZONES;
2081 iras.ira_pktlen = pkt_len;
2082 UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, iras.ira_pktlen);
2083 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives);
2084
2085 if (ixaflags & IXAF_IS_IPV4)
2086 iras.ira_flags |= IRAF_IS_IPV4;
2087
2088 iras.ira_ill = iras.ira_rill = ill;
2089 iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
2090 iras.ira_rifindex = iras.ira_ruifindex;
2091 iras.ira_mhip = NULL;
2092
2093 iras.ira_flags |= ixaflags & IAF_MASK;
2094 iras.ira_no_loop_zoneid = nolzid;
2095
2096 /* Broadcast and multicast doesn't care about the squeue */
2097 iras.ira_sqp = NULL;
2098
2099 rtc.rtc_ire = NULL;
2100 if (ixaflags & IXAF_IS_IPV4) {
2101 ipha_t *ipha = (ipha_t *)mp->b_rptr;
2102
2103 rtc.rtc_ipaddr = INADDR_ANY;
2104
2105 (*ill->ill_inputfn)(mp, ipha, &ipha->ipha_dst, &iras, &rtc);
2106 if (rtc.rtc_ire != NULL) {
2107 ASSERT(rtc.rtc_ipaddr != INADDR_ANY);
2108 ire_refrele(rtc.rtc_ire);
2109 }
2110 } else {
2111 ip6_t *ip6h = (ip6_t *)mp->b_rptr;
2112
2113 rtc.rtc_ip6addr = ipv6_all_zeros;
2114
2115 (*ill->ill_inputfn)(mp, ip6h, &ip6h->ip6_dst, &iras, &rtc);
2116 if (rtc.rtc_ire != NULL) {
2117 ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&rtc.rtc_ip6addr));
2118 ire_refrele(rtc.rtc_ire);
2119 }
2120 }
2121 /* Any references to clean up? No hold on ira */
2122 if (iras.ira_flags & (IRAF_IPSEC_SECURE|IRAF_SYSTEM_LABELED))
2123 ira_cleanup(&iras, B_FALSE);
2124}
2125
2126/*
2127 * Post fragmentation function for IRE_MULTICAST and IRE_BROADCAST which
2128 * looks at the IXAF_LOOPBACK_COPY flag.
2129 * Common for IPv4 and IPv6.
2130 *
2131 * If the loopback copy fails (due to no memory) but we send the packet out
 * on the wire we return no failure. Only in the case we suppress the wire
2133 * sending do we take the loopback failure into account.
2134 *
2135 * Note that we do not perform DTRACE_IP7 and FW_HOOKS for the looped back copy.
2136 * Those operations are performed on this packet in ip_xmit() and it would
2137 * be odd to do it twice for the same packet.
2138 */
2139int
2140ip_postfrag_loopcheck(mblk_t *mp, nce_t *nce, iaflags_t ixaflags,
2141 uint_t pkt_len, uint32_t xmit_hint, zoneid_t szone, zoneid_t nolzid,
2142 uintptr_t *ixacookie)
2143{
2144 ill_t *ill = nce->nce_ill;
2145 int error = 0;
2146
2147 /*
2148 * Check for IXAF_LOOPBACK_COPY - send a copy to ip as if the driver
2149 * had looped it back
2150 */
2151 if (ixaflags & IXAF_LOOPBACK_COPY) {
2152 mblk_t *mp1;
2153
2154 mp1 = copymsg(mp);
2155 if (mp1 == NULL) {
2156 /* Failed to deliver the loopback copy. */
2157 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2158 ip_drop_output("ipIfStatsOutDiscards", mp, ill);
2159 error = ENOBUFS;
2160 } else {
2161 ip_postfrag_loopback(mp1, nce, ixaflags, pkt_len,
2162 nolzid);
2163 }
2164 }
2165
2166 /*
2167 * If TTL = 0 then only do the loopback to this host i.e. we are
2168 * done. We are also done if this was the
2169 * loopback interface since it is sufficient
2170 * to loopback one copy of a multicast packet.
2171 */
2172 if (ixaflags & IXAF_IS_IPV4) {
2173 ipha_t *ipha = (ipha_t *)mp->b_rptr;
2174
2175 if (ipha->ipha_ttl == 0) {
2176 ip_drop_output("multicast ipha_ttl not sent to wire",
2177 mp, ill);
2178 freemsg(mp);
2179 return (error);
2180 }
2181 } else {
2182 ip6_t *ip6h = (ip6_t *)mp->b_rptr;
2183
2184 if (ip6h->ip6_hops == 0) {
2185 ip_drop_output("multicast ipha_ttl not sent to wire",
2186 mp, ill);
2187 freemsg(mp);
2188 return (error);
2189 }
2190 }
2191 if (nce->nce_ill->ill_wq == NULL) {
2192 /* Loopback interface */
2193 ip_drop_output("multicast on lo0 not sent to wire", mp, ill);
2194 freemsg(mp);
2195 return (error);
2196 }
2197
2198 return (ip_xmit(mp, nce, ixaflags, pkt_len, xmit_hint, szone, 0,
2199 ixacookie));
2200}
2201
2202/*
2203 * Post fragmentation function for RTF_MULTIRT routes.
2204 * Since IRE_BROADCASTs can have RTF_MULTIRT, this function
2205 * checks IXAF_LOOPBACK_COPY.
2206 *
2207 * If no packet is sent due to failures then we return an errno, but if at
2208 * least one succeeded we return zero.
2209 */
2210int
2211ip_postfrag_multirt_v4(mblk_t *mp, nce_t *nce, iaflags_t ixaflags,
2212 uint_t pkt_len, uint32_t xmit_hint, zoneid_t szone, zoneid_t nolzid,
2213 uintptr_t *ixacookie)
2214{
2215 irb_t *irb;
2216 ipha_t *ipha = (ipha_t *)mp->b_rptr;
2217 ire_t *ire;
2218 ire_t *ire1;
2219 mblk_t *mp1;
2220 nce_t *nce1;
2221 ill_t *ill = nce->nce_ill;
2222 ill_t *ill1;
2223 ip_stack_t *ipst = ill->ill_ipst;
2224 int error = 0;
2225 int num_sent = 0;
2226 int err;
2227 uint_t ire_type;
2228 ipaddr_t nexthop;
2229
2230 ASSERT(ixaflags & IXAF_IS_IPV4);
2231
2232 /* Check for IXAF_LOOPBACK_COPY */
2233 if (ixaflags & IXAF_LOOPBACK_COPY) {
2234 mblk_t *mp1;
2235
2236 mp1 = copymsg(mp);
2237 if (mp1 == NULL) {
2238 /* Failed to deliver the loopback copy. */
2239 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2240 ip_drop_output("ipIfStatsOutDiscards", mp, ill);
2241 error = ENOBUFS;
2242 } else {
2243 ip_postfrag_loopback(mp1, nce, ixaflags, pkt_len,
2244 nolzid);
2245 }
2246 }
2247
2248 /*
2249 * Loop over RTF_MULTIRT for ipha_dst in the same bucket. Send
2250 * a copy to each one.
2251 * Use the nce (nexthop) and ipha_dst to find the ire.
2252 *
2253 * MULTIRT is not designed to work with shared-IP zones thus we don't
2254 * need to pass a zoneid or a label to the IRE lookup.
2255 */
2256 if (V4_PART_OF_V6(nce->nce_addr) == ipha->ipha_dst) {
2257 /* Broadcast and multicast case */
2258 ire = ire_ftable_lookup_v4(ipha->ipha_dst, 0, 0, 0,
2259 NULL, ALL_ZONES, NULL, MATCH_IRE_DSTONLY, 0, ipst, NULL);
2260 } else {
2261 ipaddr_t v4addr = V4_PART_OF_V6(nce->nce_addr);
2262
2263 /* Unicast case */
2264 ire = ire_ftable_lookup_v4(ipha->ipha_dst, 0, v4addr, 0,
2265 NULL, ALL_ZONES, NULL, MATCH_IRE_GW, 0, ipst, NULL);
2266 }
2267
2268 if (ire == NULL ||
2269 (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
2270 !(ire->ire_flags & RTF_MULTIRT)) {
2271 /* Drop */
2272 ip_drop_output("ip_postfrag_multirt didn't find route",
2273 mp, nce->nce_ill);
2274 if (ire != NULL)
2275 ire_refrele(ire);
2276 return (ENETUNREACH);
2277 }
2278
2279 irb = ire->ire_bucket;
2280 irb_refhold(irb);
2281 for (ire1 = irb->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) {
2282 /*
2283 * For broadcast we can have a mixture of IRE_BROADCAST and
2284 * IRE_HOST due to the manually added IRE_HOSTs that are used
2285 * to trigger the creation of the special CGTP broadcast routes.
2286 * Thus we have to skip if ire_type doesn't match the original.
2287 */
2288 if (IRE_IS_CONDEMNED(ire1) ||
2289 !(ire1->ire_flags & RTF_MULTIRT) ||
2290 ire1->ire_type != ire->ire_type)
2291 continue;
2292
2293 /* Do the ire argument one after the loop */
2294 if (ire1 == ire)
2295 continue;
2296
2297 ill1 = ire_nexthop_ill(ire1);
2298 if (ill1 == NULL) {
2299 /*
2300 * This ire might not have been picked by
2301 * ire_route_recursive, in which case ire_dep might
2302 * not have been setup yet.
2303 * We kick ire_route_recursive to try to resolve
2304 * starting at ire1.
2305 */
2306 ire_t *ire2;
Sowmini Varadhan44b099c2010-02-17 22:59:58 -05002307 uint_t match_flags = MATCH_IRE_DSTONLY;
Erik Nordmarkbd670b32009-11-11 11:49:49 -08002308
Sowmini Varadhan44b099c2010-02-17 22:59:58 -05002309 if (ire1->ire_ill != NULL)
2310 match_flags |= MATCH_IRE_ILL;
Erik Nordmarkbd670b32009-11-11 11:49:49 -08002311 ire2 = ire_route_recursive_impl_v4(ire1,
2312 ire1->ire_addr, ire1->ire_type, ire1->ire_ill,
Sowmini Varadhan44b099c2010-02-17 22:59:58 -05002313 ire1->ire_zoneid, NULL, match_flags,
Erik Nordmark9e3469d2010-01-08 08:42:20 -08002314 IRR_ALLOCATE, 0, ipst, NULL, NULL, NULL);
Erik Nordmarkbd670b32009-11-11 11:49:49 -08002315 if (ire2 != NULL)
2316 ire_refrele(ire2);
2317 ill1 = ire_nexthop_ill(ire1);
2318 }
2319
2320 if (ill1 == NULL) {
2321 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2322 ip_drop_output("ipIfStatsOutDiscards - no ill",
2323 mp, ill);
2324 error = ENETUNREACH;
2325 continue;
2326 }
2327
2328 /* Pick the addr and type to use for arp_nce_init */
2329 if (nce->nce_common->ncec_flags & NCE_F_BCAST) {
2330 ire_type = IRE_BROADCAST;
2331 nexthop = ire1->ire_gateway_addr;
2332 } else if (nce->nce_common->ncec_flags & NCE_F_MCAST) {
2333 ire_type = IRE_MULTICAST;
2334 nexthop = ipha->ipha_dst;
2335 } else {
2336 ire_type = ire1->ire_type; /* Doesn't matter */
2337 nexthop = ire1->ire_gateway_addr;
2338 }
2339
2340 /* If IPMP meta or under, then we just drop */
2341 if (ill1->ill_grp != NULL) {
2342 BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards);
2343 ip_drop_output("ipIfStatsOutDiscards - IPMP",
2344 mp, ill1);
2345 ill_refrele(ill1);
2346 error = ENETUNREACH;
2347 continue;
2348 }
2349
2350 nce1 = arp_nce_init(ill1, nexthop, ire_type);
2351 if (nce1 == NULL) {
2352 BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards);
2353 ip_drop_output("ipIfStatsOutDiscards - no nce",
2354 mp, ill1);
2355 ill_refrele(ill1);
2356 error = ENETUNREACH;
2357 continue;
2358 }
2359 mp1 = copymsg(mp);
2360 if (mp1 == NULL) {
2361 BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards);
2362 ip_drop_output("ipIfStatsOutDiscards", mp, ill1);
2363 nce_refrele(nce1);
2364 ill_refrele(ill1);
2365 error = ENOBUFS;
2366 continue;
2367 }
2368 /* Preserve HW checksum for this copy */
2369 DB_CKSUMSTART(mp1) = DB_CKSUMSTART(mp);
2370 DB_CKSUMSTUFF(mp1) = DB_CKSUMSTUFF(mp);
2371 DB_CKSUMEND(mp1) = DB_CKSUMEND(mp);
2372 DB_CKSUMFLAGS(mp1) = DB_CKSUMFLAGS(mp);
2373 DB_LSOMSS(mp1) = DB_LSOMSS(mp);
2374
2375 ire1->ire_ob_pkt_count++;
2376 err = ip_xmit(mp1, nce1, ixaflags, pkt_len, xmit_hint, szone,
2377 0, ixacookie);
2378 if (err == 0)
2379 num_sent++;
2380 else
2381 error = err;
2382 nce_refrele(nce1);
2383 ill_refrele(ill1);
2384 }
2385 irb_refrele(irb);
2386 ire_refrele(ire);
2387 /* Finally, the main one */
2388 err = ip_xmit(mp, nce, ixaflags, pkt_len, xmit_hint, szone, 0,
2389 ixacookie);
2390 if (err == 0)
2391 num_sent++;
2392 else
2393 error = err;
2394 if (num_sent > 0)
2395 return (0);
2396 else
2397 return (error);
2398}
2399
2400/*
2401 * Verify local connectivity. This check is called by ULP fusion code.
2402 * The generation number on an IRE_LOCAL or IRE_LOOPBACK only changes if
2403 * the interface is brought down and back up. So we simply fail the local
2404 * process. The caller, TCP Fusion, should unfuse the connection.
2405 */
2406boolean_t
2407ip_output_verify_local(ip_xmit_attr_t *ixa)
2408{
2409 ire_t *ire = ixa->ixa_ire;
2410
2411 if (!(ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)))
2412 return (B_FALSE);
2413
2414 return (ixa->ixa_ire->ire_generation == ixa->ixa_ire_generation);
2415}
2416
2417/*
2418 * Local process for ULP loopback, TCP Fusion. Handle both IPv4 and IPv6.
2419 *
2420 * The caller must call ip_output_verify_local() first. This function handles
2421 * IPobs, FW_HOOKS, and/or IPsec cases sequentially.
2422 */
2423mblk_t *
2424ip_output_process_local(mblk_t *mp, ip_xmit_attr_t *ixa, boolean_t hooks_out,
2425 boolean_t hooks_in, conn_t *peer_connp)
2426{
2427 ill_t *ill = ixa->ixa_ire->ire_ill;
2428 ipha_t *ipha = NULL;
2429 ip6_t *ip6h = NULL;
2430 ip_stack_t *ipst = ixa->ixa_ipst;
2431 iaflags_t ixaflags = ixa->ixa_flags;
2432 ip_recv_attr_t iras;
2433 int error;
2434
2435 ASSERT(mp != NULL);
2436
2437 if (ixaflags & IXAF_IS_IPV4) {
2438 ipha = (ipha_t *)mp->b_rptr;
2439
2440 /*
2441 * If a callback is enabled then we need to know the
2442 * source and destination zoneids for the packet. We already
2443 * have those handy.
2444 */
2445 if (ipst->ips_ip4_observe.he_interested) {
2446 zoneid_t szone, dzone;
2447 zoneid_t stackzoneid;
2448
2449 stackzoneid = netstackid_to_zoneid(
2450 ipst->ips_netstack->netstack_stackid);
2451
2452 if (stackzoneid == GLOBAL_ZONEID) {
2453 /* Shared-IP zone */
2454 dzone = ixa->ixa_ire->ire_zoneid;
2455 szone = ixa->ixa_zoneid;
2456 } else {
2457 szone = dzone = stackzoneid;
2458 }
2459 ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill,
2460 ipst);
2461 }
2462 DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
2463 ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *,
2464 NULL, int, 1);
2465
2466 /* FW_HOOKS: LOOPBACK_OUT */
2467 if (hooks_out) {
2468 DTRACE_PROBE4(ip4__loopback__out__start, ill_t *, NULL,
2469 ill_t *, ill, ipha_t *, ipha, mblk_t *, mp);
2470 FW_HOOKS(ipst->ips_ip4_loopback_out_event,
2471 ipst->ips_ipv4firewall_loopback_out,
2472 NULL, ill, ipha, mp, mp, 0, ipst, error);
2473 DTRACE_PROBE1(ip4__loopback__out__end, mblk_t *, mp);
2474 }
2475 if (mp == NULL)
2476 return (NULL);
2477
2478 /* FW_HOOKS: LOOPBACK_IN */
2479 if (hooks_in) {
2480 DTRACE_PROBE4(ip4__loopback__in__start, ill_t *, ill,
2481 ill_t *, NULL, ipha_t *, ipha, mblk_t *, mp);
2482 FW_HOOKS(ipst->ips_ip4_loopback_in_event,
2483 ipst->ips_ipv4firewall_loopback_in,
2484 ill, NULL, ipha, mp, mp, 0, ipst, error);
2485 DTRACE_PROBE1(ip4__loopback__in__end, mblk_t *, mp);
2486 }
2487 if (mp == NULL)
2488 return (NULL);
2489
2490 DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
2491 ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *,
2492 NULL, int, 1);
2493
2494 /* Inbound IPsec polocies */
2495 if (peer_connp != NULL) {
2496 /* Map ixa to ira including IPsec policies. */
2497 ipsec_out_to_in(ixa, ill, &iras);
2498 mp = ipsec_check_inbound_policy(mp, peer_connp, ipha,
2499 NULL, &iras);
2500 }
2501 } else {
2502 ip6h = (ip6_t *)mp->b_rptr;
2503
2504 /*
2505 * If a callback is enabled then we need to know the
2506 * source and destination zoneids for the packet. We already
2507 * have those handy.
2508 */
2509 if (ipst->ips_ip6_observe.he_interested) {
2510 zoneid_t szone, dzone;
2511 zoneid_t stackzoneid;
2512
2513 stackzoneid = netstackid_to_zoneid(
2514 ipst->ips_netstack->netstack_stackid);
2515
2516 if (stackzoneid == GLOBAL_ZONEID) {
2517 /* Shared-IP zone */
2518 dzone = ixa->ixa_ire->ire_zoneid;
2519 szone = ixa->ixa_zoneid;
2520 } else {
2521 szone = dzone = stackzoneid;
2522 }
2523 ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill,
2524 ipst);
2525 }
2526 DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
2527 ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, ip6_t *,
2528 ip6h, int, 1);
2529
2530 /* FW_HOOKS: LOOPBACK_OUT */
2531 if (hooks_out) {
2532 DTRACE_PROBE4(ip6__loopback__out__start, ill_t *, NULL,
2533 ill_t *, ill, ip6_t *, ip6h, mblk_t *, mp);
2534 FW_HOOKS6(ipst->ips_ip6_loopback_out_event,
2535 ipst->ips_ipv6firewall_loopback_out,
2536 NULL, ill, ip6h, mp, mp, 0, ipst, error);
2537 DTRACE_PROBE1(ip6__loopback__out__end, mblk_t *, mp);
2538 }
2539 if (mp == NULL)
2540 return (NULL);
2541
2542 /* FW_HOOKS: LOOPBACK_IN */
2543 if (hooks_in) {
2544 DTRACE_PROBE4(ip6__loopback__in__start, ill_t *, ill,
2545 ill_t *, NULL, ip6_t *, ip6h, mblk_t *, mp);
2546 FW_HOOKS6(ipst->ips_ip6_loopback_in_event,
2547 ipst->ips_ipv6firewall_loopback_in,
2548 ill, NULL, ip6h, mp, mp, 0, ipst, error);
2549 DTRACE_PROBE1(ip6__loopback__in__end, mblk_t *, mp);
2550 }
2551 if (mp == NULL)
2552 return (NULL);
2553
2554 DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
2555 ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, ip6_t *,
2556 ip6h, int, 1);
2557
2558 /* Inbound IPsec polocies */
2559 if (peer_connp != NULL) {
2560 /* Map ixa to ira including IPsec policies. */
2561 ipsec_out_to_in(ixa, ill, &iras);
2562 mp = ipsec_check_inbound_policy(mp, peer_connp, NULL,
2563 ip6h, &iras);
2564 }
2565 }
2566
2567 if (mp == NULL) {
2568 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2569 ip_drop_input("ipIfStatsInDiscards", NULL, ill);
2570 }
2571
2572 return (mp);
2573}