/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2014, Joyent, Inc. All rights reserved.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */
/*
* University Copyright- Copyright (c) 1982, 1986, 1988
* The Regents of the University of California
* All Rights Reserved
*
* University Acknowledgment- Portions of this document are derived from
* software developed by the University of California, Berkeley, and its
* contributors.
*/
/*
* VM - shared or copy-on-write from a vnode/anonymous memory.
*/
#include <sys/types.h>
#include <sys/param.h>
#include <sys/t_lock.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/mman.h>
#include <sys/debug.h>
#include <sys/cred.h>
#include <sys/vmsystm.h>
#include <sys/tuneable.h>
#include <sys/bitmap.h>
#include <sys/swap.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>
#include <sys/vtrace.h>
#include <sys/cmn_err.h>
#include <sys/callb.h>
#include <sys/vm.h>
#include <sys/dumphdr.h>
#include <sys/lgrp.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_vn.h>
#include <vm/pvn.h>
#include <vm/anon.h>
#include <vm/page.h>
#include <vm/vpage.h>
#include <sys/proc.h>
#include <sys/task.h>
#include <sys/project.h>
#include <sys/zone.h>
#include <sys/shm_impl.h>
/*
* Private seg op routines.
*/
static int segvn_dup(struct seg *seg, struct seg *newseg);
static int segvn_unmap(struct seg *seg, caddr_t addr, size_t len);
static void segvn_free(struct seg *seg);
static faultcode_t segvn_fault(struct hat *hat, struct seg *seg,
caddr_t addr, size_t len, enum fault_type type,
enum seg_rw rw);
static faultcode_t segvn_faulta(struct seg *seg, caddr_t addr);
static int segvn_setprot(struct seg *seg, caddr_t addr,
size_t len, uint_t prot);
static int segvn_checkprot(struct seg *seg, caddr_t addr,
size_t len, uint_t prot);
static int segvn_kluster(struct seg *seg, caddr_t addr, ssize_t delta);
static size_t segvn_swapout(struct seg *seg);
static int segvn_sync(struct seg *seg, caddr_t addr, size_t len,
int attr, uint_t flags);
static size_t segvn_incore(struct seg *seg, caddr_t addr, size_t len,
char *vec);
static int segvn_lockop(struct seg *seg, caddr_t addr, size_t len,
int attr, int op, ulong_t *lockmap, size_t pos);
static int segvn_getprot(struct seg *seg, caddr_t addr, size_t len,
uint_t *protv);
static u_offset_t segvn_getoffset(struct seg *seg, caddr_t addr);
static int segvn_gettype(struct seg *seg, caddr_t addr);
static int segvn_getvp(struct seg *seg, caddr_t addr,
struct vnode **vpp);
static int segvn_advise(struct seg *seg, caddr_t addr, size_t len,
uint_t behav);
static void segvn_dump(struct seg *seg);
static int segvn_pagelock(struct seg *seg, caddr_t addr, size_t len,
struct page ***ppp, enum lock_type type, enum seg_rw rw);
static int segvn_setpagesize(struct seg *seg, caddr_t addr, size_t len,
uint_t szc);
static int segvn_getmemid(struct seg *seg, caddr_t addr,
memid_t *memidp);
static lgrp_mem_policy_info_t *segvn_getpolicy(struct seg *, caddr_t);
static int segvn_capable(struct seg *seg, segcapability_t capable);
struct seg_ops segvn_ops = {
segvn_dup,
segvn_unmap,
segvn_free,
segvn_fault,
segvn_faulta,
segvn_setprot,
segvn_checkprot,
segvn_kluster,
segvn_swapout,
segvn_sync,
segvn_incore,
segvn_lockop,
segvn_getprot,
segvn_getoffset,
segvn_gettype,
segvn_getvp,
segvn_advise,
segvn_dump,
segvn_pagelock,
segvn_setpagesize,
segvn_getmemid,
segvn_getpolicy,
segvn_capable,
};
/*
* Common zfod structures, provided as a shorthand for others to use.
*/
static segvn_crargs_t zfod_segvn_crargs =
SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
static segvn_crargs_t kzfod_segvn_crargs =
SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_USER,
PROT_ALL & ~PROT_USER);
static segvn_crargs_t stack_noexec_crargs =
SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_EXEC, PROT_ALL);
caddr_t zfod_argsp = (caddr_t)&zfod_segvn_crargs; /* user zfod argsp */
caddr_t kzfod_argsp = (caddr_t)&kzfod_segvn_crargs; /* kernel zfod argsp */
caddr_t stack_exec_argsp = (caddr_t)&zfod_segvn_crargs; /* executable stack */
caddr_t stack_noexec_argsp = (caddr_t)&stack_noexec_crargs; /* noexec stack */
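/*
 * A minimal illustration (not from this file; the call site is hypothetical):
 * code elsewhere in the kernel typically creates a zero-fill-on-demand
 * segment by handing one of these argsp pointers to as_map() together with
 * segvn_create, roughly:
 *
 *	(void) as_map(as, addr, len, segvn_create, zfod_argsp);
 *
 * The kzfod and stack variants differ only in the initial and maximum
 * protections baked into SEGVN_ZFOD_ARGS above.
 */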
#define vpgtob(n) ((n) * sizeof (struct vpage)) /* For brevity */
size_t segvn_comb_thrshld = UINT_MAX; /* patchable -- see 1196681 */
size_t segvn_pglock_comb_thrshld = (1UL << 16); /* 64K */
size_t segvn_pglock_comb_balign = (1UL << 16); /* 64K */
uint_t segvn_pglock_comb_bshift;
size_t segvn_pglock_comb_palign;
static int segvn_concat(struct seg *, struct seg *, int);
static int segvn_extend_prev(struct seg *, struct seg *,
struct segvn_crargs *, size_t);
static int segvn_extend_next(struct seg *, struct seg *,
struct segvn_crargs *, size_t);
static void segvn_softunlock(struct seg *, caddr_t, size_t, enum seg_rw);
static void segvn_pagelist_rele(page_t **);
static void segvn_setvnode_mpss(vnode_t *);
static void segvn_relocate_pages(page_t **, page_t *);
static int segvn_full_szcpages(page_t **, uint_t, int *, uint_t *);
static int segvn_fill_vp_pages(struct segvn_data *, vnode_t *, u_offset_t,
uint_t, page_t **, page_t **, uint_t *, int *);
static faultcode_t segvn_fault_vnodepages(struct hat *, struct seg *, caddr_t,
caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t, int);
static faultcode_t segvn_fault_anonpages(struct hat *, struct seg *, caddr_t,
caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t, int);
static faultcode_t segvn_faultpage(struct hat *, struct seg *, caddr_t,
u_offset_t, struct vpage *, page_t **, uint_t,
enum fault_type, enum seg_rw, int);
static void segvn_vpage(struct seg *);
static size_t segvn_count_swap_by_vpages(struct seg *);
static void segvn_purge(struct seg *seg);
static int segvn_reclaim(void *, caddr_t, size_t, struct page **,
enum seg_rw, int);
static int shamp_reclaim(void *, caddr_t, size_t, struct page **,
enum seg_rw, int);
static int sameprot(struct seg *, caddr_t, size_t);
static int segvn_demote_range(struct seg *, caddr_t, size_t, int, uint_t);
static int segvn_clrszc(struct seg *);
static struct seg *segvn_split_seg(struct seg *, caddr_t);
static int segvn_claim_pages(struct seg *, struct vpage *, u_offset_t,
ulong_t, uint_t);
static void segvn_hat_rgn_unload_callback(caddr_t, caddr_t, caddr_t,
size_t, void *, u_offset_t);
static struct kmem_cache *segvn_cache;
static struct kmem_cache **segvn_szc_cache;
#ifdef VM_STATS
static struct segvnvmstats_str {
ulong_t fill_vp_pages[31];
ulong_t fltvnpages[49];
ulong_t fullszcpages[10];
ulong_t relocatepages[3];
ulong_t fltanpages[17];
ulong_t pagelock[2];
ulong_t demoterange[3];
} segvnvmstats;
#endif /* VM_STATS */
#define SDR_RANGE 1 /* demote entire range */
#define SDR_END 2 /* demote non aligned ends only */
#define CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr) { \
if ((len) != 0) { \
lpgaddr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz); \
ASSERT(lpgaddr >= (seg)->s_base); \
lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)((addr) + \
(len)), pgsz); \
ASSERT(lpgeaddr > lpgaddr); \
ASSERT(lpgeaddr <= (seg)->s_base + (seg)->s_size); \
} else { \
lpgeaddr = lpgaddr = (addr); \
} \
}
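/*
 * A worked example of CALC_LPG_REGION with hypothetical values, assuming a
 * 4M large page size (pgsz = 0x400000): for addr = 0x400123000 and
 * len = 0x2000, lpgaddr is rounded down to 0x400000000 and lpgeaddr is
 * rounded up to 0x400400000, so callers always operate on whole large pages
 * covering the requested range.
 */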
/*ARGSUSED*/
static int
segvn_cache_constructor(void *buf, void *cdrarg, int kmflags)
{
struct segvn_data *svd = buf;
rw_init(&svd->lock, NULL, RW_DEFAULT, NULL);
mutex_init(&svd->segfree_syncmtx, NULL, MUTEX_DEFAULT, NULL);
svd->svn_trnext = svd->svn_trprev = NULL;
return (0);
}
/*ARGSUSED1*/
static void
segvn_cache_destructor(void *buf, void *cdrarg)
{
struct segvn_data *svd = buf;
rw_destroy(&svd->lock);
mutex_destroy(&svd->segfree_syncmtx);
}
/*ARGSUSED*/
static int
svntr_cache_constructor(void *buf, void *cdrarg, int kmflags)
{
bzero(buf, sizeof (svntr_t));
return (0);
}
/*
* Patching this variable to non-zero allows the system to run with
* stacks marked as "not executable". It's a bit of a kludge, but is
* provided as a tweakable for platforms that export those ABIs
* (e.g. sparc V8) that have executable stacks enabled by default.
* There are also some restrictions for platforms that don't actually
* implement 'noexec' protections.
*
* Once enabled, the system is (therefore) unable to provide a fully
* ABI-compliant execution environment, though practically speaking,
* most everything works. The exceptions are generally some interpreters
* and debuggers that create executable code on the stack and jump
* into it (without explicitly mprotecting the address range to include
* PROT_EXEC).
*
 * One important class of applications that this disables is those
* that have been transformed into malicious agents using one of the
* numerous "buffer overflow" attacks. See 4007890.
*/
int noexec_user_stack = 0;
int noexec_user_stack_log = 1;
int segvn_lpg_disable = 0;
uint_t segvn_maxpgszc = 0;
ulong_t segvn_vmpss_clrszc_cnt;
ulong_t segvn_vmpss_clrszc_err;
ulong_t segvn_fltvnpages_clrszc_cnt;
ulong_t segvn_fltvnpages_clrszc_err;
ulong_t segvn_setpgsz_align_err;
ulong_t segvn_setpgsz_anon_align_err;
ulong_t segvn_setpgsz_getattr_err;
ulong_t segvn_setpgsz_eof_err;
ulong_t segvn_faultvnmpss_align_err1;
ulong_t segvn_faultvnmpss_align_err2;
ulong_t segvn_faultvnmpss_align_err3;
ulong_t segvn_faultvnmpss_align_err4;
ulong_t segvn_faultvnmpss_align_err5;
ulong_t segvn_vmpss_pageio_deadlk_err;
int segvn_use_regions = 1;
/*
 * Segvn supports a text replication optimization for NUMA platforms. Text
 * replicas are represented by anon maps (amp). There's one amp per text file
 * region per lgroup. A process chooses the amp for each of its text mappings
 * based on the lgroup assignment of its main thread (t_tid = 1). All
 * processes that want a replica on a particular lgroup for the same text file
 * mapping share the same amp. amps are looked up in the svntr_hashtab hash
 * table with vp,off,size,szc used as the key. Text replication segments are
 * read-only MAP_PRIVATE|MAP_TEXT segments that map a vnode. Replication is
 * achieved by forcing COW faults from the vnode to the amp and mapping amp
 * pages instead of vnode pages. The replication amp is assigned to a segment
 * when it takes its first pagefault. To handle main thread lgroup rehoming,
 * segvn_trasync_thread periodically rechecks whether the process still maps
 * an amp local to the main thread. If not, the async thread forces the
 * process to remap to an amp in the new home lgroup of the main thread. The
 * current text replication implementation only benefits workloads that do
 * most of their work in the main thread of a process, or whose threads all
 * run in the same lgroup. To extend the text replication benefit to other
 * types of multithreaded workloads, further work would be needed in the hat
 * layer to allow the same virtual address in the same hat to simultaneously
 * map different physical addresses (i.e. page table replication would be
 * needed for x86).
 *
 * amp pages are used instead of vnode pages only as long as the segment has
 * a very simple life cycle: it's created via segvn_create(), handles S_EXEC
 * (S_READ) pagefaults and is fully unmapped. If anything more complicated
 * happens, such as a protection change, a real COW fault, a pagesize change,
 * an MC_LOCK request or a partial unmap, we turn off text replication by
 * converting the segment back to a vnode-only segment (unmap the segment's
 * address range and set svd->amp to NULL).
 *
 * The original file can be changed after an amp is inserted into
 * svntr_hashtab. Processes that are launched after the file has already
 * changed can't use the replicas created prior to the file change. To
 * implement this functionality hash entries are timestamped. A replica can
 * only be used if the current file modification time is the same as the
 * timestamp saved when the hash entry was created. However, timestamps alone
 * are not sufficient to detect file modification via mmap(MAP_SHARED)
 * mappings, so we deal with file changes via MAP_SHARED mappings differently.
 * When writable MAP_SHARED mappings are created to vnodes marked as
 * executable, we mark all existing replicas for this vnode as not usable for
 * future text mappings. And we don't create new replicas for files that
 * currently have potentially writable MAP_SHARED mappings (i.e.
 * vn_is_mapped(V_WRITE) is true).
*/
#define SEGVN_TEXTREPL_MAXBYTES_FACTOR (20)
size_t segvn_textrepl_max_bytes_factor = SEGVN_TEXTREPL_MAXBYTES_FACTOR;
static ulong_t svntr_hashtab_sz = 512;
static svntr_bucket_t *svntr_hashtab = NULL;
static struct kmem_cache *svntr_cache;
static svntr_stats_t *segvn_textrepl_stats;
static ksema_t segvn_trasync_sem;
int segvn_disable_textrepl = 1;
size_t textrepl_size_thresh = (size_t)-1;
size_t segvn_textrepl_bytes = 0;
size_t segvn_textrepl_max_bytes = 0;
clock_t segvn_update_textrepl_interval = 0;
int segvn_update_tr_time = 10;
int segvn_disable_textrepl_update = 0;
static void segvn_textrepl(struct seg *);
static void segvn_textunrepl(struct seg *, int);
static void segvn_inval_trcache(vnode_t *);
static void segvn_trasync_thread(void);
static void segvn_trupdate_wakeup(void *);
static void segvn_trupdate(void);
static void segvn_trupdate_seg(struct seg *, segvn_data_t *, svntr_t *,
ulong_t);
/*
* Initialize segvn data structures
*/
void
segvn_init(void)
{
uint_t maxszc;
uint_t szc;
size_t pgsz;
segvn_cache = kmem_cache_create("segvn_cache",
sizeof (struct segvn_data), 0,
segvn_cache_constructor, segvn_cache_destructor, NULL,
NULL, NULL, 0);
if (segvn_lpg_disable == 0) {
szc = maxszc = page_num_pagesizes() - 1;
if (szc == 0) {
segvn_lpg_disable = 1;
}
if (page_get_pagesize(0) != PAGESIZE) {
panic("segvn_init: bad szc 0");
/*NOTREACHED*/
}
while (szc != 0) {
pgsz = page_get_pagesize(szc);
if (pgsz <= PAGESIZE || !IS_P2ALIGNED(pgsz, pgsz)) {
panic("segvn_init: bad szc %d", szc);
/*NOTREACHED*/
}
szc--;
}
if (segvn_maxpgszc == 0 || segvn_maxpgszc > maxszc)
segvn_maxpgszc = maxszc;
}
if (segvn_maxpgszc) {
segvn_szc_cache = (struct kmem_cache **)kmem_alloc(
(segvn_maxpgszc + 1) * sizeof (struct kmem_cache *),
KM_SLEEP);
}
for (szc = 1; szc <= segvn_maxpgszc; szc++) {
char str[32];
(void) sprintf(str, "segvn_szc_cache%d", szc);
segvn_szc_cache[szc] = kmem_cache_create(str,
page_get_pagecnt(szc) * sizeof (page_t *), 0,
NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG);
}
if (segvn_use_regions && !hat_supported(HAT_SHARED_REGIONS, NULL))
segvn_use_regions = 0;
/*
* For now shared regions and text replication segvn support
 * are mutually exclusive. This is acceptable because a
 * significant benefit from text replication has so far only
 * been observed on AMD64 NUMA platforms (due to their
 * relatively small L2$ size), and we don't currently support
 * shared regions on x86.
*/
if (segvn_use_regions && !segvn_disable_textrepl) {
segvn_disable_textrepl = 1;
}
#if defined(_LP64)
if (lgrp_optimizations() && textrepl_size_thresh != (size_t)-1 &&
!segvn_disable_textrepl) {
ulong_t i;
size_t hsz = svntr_hashtab_sz * sizeof (svntr_bucket_t);
svntr_cache = kmem_cache_create("svntr_cache",
sizeof (svntr_t), 0, svntr_cache_constructor, NULL,
NULL, NULL, NULL, 0);
svntr_hashtab = kmem_zalloc(hsz, KM_SLEEP);
for (i = 0; i < svntr_hashtab_sz; i++) {
mutex_init(&svntr_hashtab[i].tr_lock, NULL,
MUTEX_DEFAULT, NULL);
}
segvn_textrepl_max_bytes = ptob(physmem) /
segvn_textrepl_max_bytes_factor;
segvn_textrepl_stats = kmem_zalloc(NCPU *
sizeof (svntr_stats_t), KM_SLEEP);
sema_init(&segvn_trasync_sem, 0, NULL, SEMA_DEFAULT, NULL);
(void) thread_create(NULL, 0, segvn_trasync_thread,
NULL, 0, &p0, TS_RUN, minclsyspri);
}
#endif
if (!ISP2(segvn_pglock_comb_balign) ||
segvn_pglock_comb_balign < PAGESIZE) {
segvn_pglock_comb_balign = 1UL << 16; /* 64K */
}
segvn_pglock_comb_bshift = highbit(segvn_pglock_comb_balign) - 1;
segvn_pglock_comb_palign = btop(segvn_pglock_comb_balign);
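	/*
	 * For example, with the default 64K combining alignment and 4K base
	 * pages (an assumption for illustration): highbit(0x10000) is 17, so
	 * segvn_pglock_comb_bshift becomes 16 and segvn_pglock_comb_palign
	 * becomes btop(64K) = 16 pages.
	 */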
}
#define SEGVN_PAGEIO ((void *)0x1)
#define SEGVN_NOPAGEIO ((void *)0x2)
static void
segvn_setvnode_mpss(vnode_t *vp)
{
int err;
ASSERT(vp->v_mpssdata == NULL ||
vp->v_mpssdata == SEGVN_PAGEIO ||
vp->v_mpssdata == SEGVN_NOPAGEIO);
if (vp->v_mpssdata == NULL) {
if (vn_vmpss_usepageio(vp)) {
err = VOP_PAGEIO(vp, (page_t *)NULL,
(u_offset_t)0, 0, 0, CRED(), NULL);
} else {
err = ENOSYS;
}
/*
* set v_mpssdata just once per vnode life
* so that it never changes.
*/
mutex_enter(&vp->v_lock);
if (vp->v_mpssdata == NULL) {
if (err == EINVAL) {
vp->v_mpssdata = SEGVN_PAGEIO;
} else {
vp->v_mpssdata = SEGVN_NOPAGEIO;
}
}
mutex_exit(&vp->v_lock);
}
}
int
segvn_create(struct seg *seg, void *argsp)
{
struct segvn_crargs *a = (struct segvn_crargs *)argsp;
struct segvn_data *svd;
size_t swresv = 0;
struct cred *cred;
struct anon_map *amp;
int error = 0;
size_t pgsz;
lgrp_mem_policy_t mpolicy = LGRP_MEM_POLICY_DEFAULT;
int use_rgn = 0;
int trok = 0;
ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
if (a->type != MAP_PRIVATE && a->type != MAP_SHARED) {
panic("segvn_create type");
/*NOTREACHED*/
}
/*
* Check arguments. If a shared anon structure is given then
* it is illegal to also specify a vp.
*/
if (a->amp != NULL && a->vp != NULL) {
panic("segvn_create anon_map");
/*NOTREACHED*/
}
if (a->type == MAP_PRIVATE && (a->flags & MAP_TEXT) &&
a->vp != NULL && a->prot == (PROT_USER | PROT_READ | PROT_EXEC) &&
segvn_use_regions) {
use_rgn = 1;
}
/* MAP_NORESERVE on a MAP_SHARED segment is meaningless. */
if (a->type == MAP_SHARED)
a->flags &= ~MAP_NORESERVE;
if (a->szc != 0) {
if (segvn_lpg_disable != 0 || (a->szc == AS_MAP_NO_LPOOB) ||
(a->amp != NULL && a->type == MAP_PRIVATE) ||
(a->flags & MAP_NORESERVE) || seg->s_as == &kas) {
a->szc = 0;
} else {
if (a->szc > segvn_maxpgszc)
a->szc = segvn_maxpgszc;
pgsz = page_get_pagesize(a->szc);
if (!IS_P2ALIGNED(seg->s_base, pgsz) ||
!IS_P2ALIGNED(seg->s_size, pgsz)) {
a->szc = 0;
} else if (a->vp != NULL) {
if (IS_SWAPFSVP(a->vp) || VN_ISKAS(a->vp)) {
/*
* paranoid check.
* hat_page_demote() is not supported
* on swapfs pages.
*/
a->szc = 0;
} else if (map_addr_vacalign_check(seg->s_base,
a->offset & PAGEMASK)) {
a->szc = 0;
}
} else if (a->amp != NULL) {
pgcnt_t anum = btopr(a->offset);
pgcnt_t pgcnt = page_get_pagecnt(a->szc);
if (!IS_P2ALIGNED(anum, pgcnt)) {
a->szc = 0;
}
}
}
}
/*
* If segment may need private pages, reserve them now.
*/
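/*
 * For example (illustrative, restating the condition below): an anonymous
 * MAP_PRIVATE mapping, or a MAP_PRIVATE file mapping with PROT_WRITE,
 * reserves swap for its whole size here unless MAP_NORESERVE was requested;
 * a MAP_SHARED file mapping reserves nothing at this point.
 */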
if (!(a->flags & MAP_NORESERVE) && ((a->vp == NULL && a->amp == NULL) ||
(a->type == MAP_PRIVATE && (a->prot & PROT_WRITE)))) {
if (anon_resv_zone(seg->s_size,
seg->s_as->a_proc->p_zone) == 0)
return (EAGAIN);
swresv = seg->s_size;
TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
seg, swresv, 1);
}
/*
* Reserve any mapping structures that may be required.
*
* Don't do it for segments that may use regions. It's currently a
* noop in the hat implementations anyway.
*/
if (!use_rgn) {
hat_map(seg->s_as->a_hat, seg->s_base, seg->s_size, HAT_MAP);
}
if (a->cred) {
cred = a->cred;
crhold(cred);
} else {
crhold(cred = CRED());
}
/* Inform the vnode of the new mapping */
if (a->vp != NULL) {
error = VOP_ADDMAP(a->vp, a->offset & PAGEMASK,
seg->s_as, seg->s_base, seg->s_size, a->prot,
a->maxprot, a->type, cred, NULL);
if (error) {
if (swresv != 0) {
anon_unresv_zone(swresv,
seg->s_as->a_proc->p_zone);
TRACE_3(TR_FAC_VM, TR_ANON_PROC,
"anon proc:%p %lu %u", seg, swresv, 0);
}
crfree(cred);
if (!use_rgn) {
hat_unload(seg->s_as->a_hat, seg->s_base,
seg->s_size, HAT_UNLOAD_UNMAP);
}
return (error);
}
/*
* svntr_hashtab will be NULL if we support shared regions.
*/
trok = ((a->flags & MAP_TEXT) &&
(seg->s_size > textrepl_size_thresh ||
(a->flags & _MAP_TEXTREPL)) &&
lgrp_optimizations() && svntr_hashtab != NULL &&
a->type == MAP_PRIVATE && swresv == 0 &&
!(a->flags & MAP_NORESERVE) &&
seg->s_as != &kas && a->vp->v_type == VREG);
ASSERT(!trok || !use_rgn);
}
/*
* MAP_NORESERVE mappings don't count towards the VSZ of a process
* until we fault the pages in.
*/
if ((a->vp == NULL || a->vp->v_type != VREG) &&
a->flags & MAP_NORESERVE) {
seg->s_as->a_resvsize -= seg->s_size;
}
/*
* If more than one segment in the address space, and they're adjacent
* virtually, try to concatenate them. Don't concatenate if an
* explicit anon_map structure was supplied (e.g., SystemV shared
* memory) or if we'll use text replication for this segment.
*/
if (a->amp == NULL && !use_rgn && !trok) {
struct seg *pseg, *nseg;
struct segvn_data *psvd, *nsvd;
lgrp_mem_policy_t ppolicy, npolicy;
uint_t lgrp_mem_policy_flags = 0;
extern lgrp_mem_policy_t lgrp_mem_default_policy;
/*
 * Memory policy flags (lgrp_mem_policy_flags) are valid when
 * extending stack/heap segments.
*/
if ((a->vp == NULL) && (a->type == MAP_PRIVATE) &&
!(a->flags & MAP_NORESERVE) && (seg->s_as != &kas)) {
lgrp_mem_policy_flags = a->lgrp_mem_policy_flags;
} else {
/*
* Get policy when not extending it from another segment
*/
mpolicy = lgrp_mem_policy_default(seg->s_size, a->type);
}
/*
* First, try to concatenate the previous and new segments
*/
pseg = AS_SEGPREV(seg->s_as, seg);
if (pseg != NULL &&
pseg->s_base + pseg->s_size == seg->s_base &&
pseg->s_ops == &segvn_ops) {
/*
* Get memory allocation policy from previous segment.
* When extension is specified (e.g. for heap) apply
* this policy to the new segment regardless of the
 * outcome of segment concatenation. The existing policy is
 * extended if it is non-default; otherwise the default policy,
 * based on the extended segment size, is used.
*/
psvd = (struct segvn_data *)pseg->s_data;
ppolicy = psvd->policy_info.mem_policy;
if (lgrp_mem_policy_flags ==
LGRP_MP_FLAG_EXTEND_UP) {
if (ppolicy != lgrp_mem_default_policy) {
mpolicy = ppolicy;
} else {
mpolicy = lgrp_mem_policy_default(
pseg->s_size + seg->s_size,
a->type);
}
}
if (mpolicy == ppolicy &&
(pseg->s_size + seg->s_size <=
segvn_comb_thrshld || psvd->amp == NULL) &&
segvn_extend_prev(pseg, seg, a, swresv) == 0) {
/*
* success! now try to concatenate
* with following seg
*/
crfree(cred);
nseg = AS_SEGNEXT(pseg->s_as, pseg);
if (nseg != NULL &&
nseg != pseg &&
nseg->s_ops == &segvn_ops &&
pseg->s_base + pseg->s_size ==
nseg->s_base)
(void) segvn_concat(pseg, nseg, 0);
ASSERT(pseg->s_szc == 0 ||
(a->szc == pseg->s_szc &&
IS_P2ALIGNED(pseg->s_base, pgsz) &&
IS_P2ALIGNED(pseg->s_size, pgsz)));
return (0);
}
}
/*
* Failed, so try to concatenate with following seg
*/
nseg = AS_SEGNEXT(seg->s_as, seg);
if (nseg != NULL &&
seg->s_base + seg->s_size == nseg->s_base &&
nseg->s_ops == &segvn_ops) {
/*
* Get memory allocation policy from next segment.
* When extension is specified (e.g. for stack) apply
* this policy to the new segment regardless of the
 * outcome of segment concatenation. The existing policy is
 * extended if it is non-default; otherwise the default policy,
 * based on the extended segment size, is used.
*/
nsvd = (struct segvn_data *)nseg->s_data;
npolicy = nsvd->policy_info.mem_policy;
if (lgrp_mem_policy_flags ==
LGRP_MP_FLAG_EXTEND_DOWN) {
if (npolicy != lgrp_mem_default_policy) {
mpolicy = npolicy;
} else {
mpolicy = lgrp_mem_policy_default(
nseg->s_size + seg->s_size,
a->type);
}
}
if (mpolicy == npolicy &&
segvn_extend_next(seg, nseg, a, swresv) == 0) {
crfree(cred);
ASSERT(nseg->s_szc == 0 ||
(a->szc == nseg->s_szc &&
IS_P2ALIGNED(nseg->s_base, pgsz) &&
IS_P2ALIGNED(nseg->s_size, pgsz)));
return (0);
}
}
}
if (a->vp != NULL) {
VN_HOLD(a->vp);
if (a->type == MAP_SHARED)
lgrp_shm_policy_init(NULL, a->vp);
}
svd = kmem_cache_alloc(segvn_cache, KM_SLEEP);
seg->s_ops = &segvn_ops;
seg->s_data = (void *)svd;
seg->s_szc = a->szc;
svd->seg = seg;
svd->vp = a->vp;
/*
 * Anonymous mappings have no backing file, so the offset is meaningless.
*/
svd->offset = a->vp ? (a->offset & PAGEMASK) : 0;
svd->prot = a->prot;
svd->maxprot = a->maxprot;
svd->pageprot = 0;
svd->type = a->type;
svd->vpage = NULL;
svd->cred = cred;
svd->advice = MADV_NORMAL;
svd->pageadvice = 0;
svd->flags = (ushort_t)a->flags;
svd->softlockcnt = 0;
svd->softlockcnt_sbase = 0;
svd->softlockcnt_send = 0;
svd->rcookie = HAT_INVALID_REGION_COOKIE;
svd->pageswap = 0;
if (a->szc != 0 && a->vp != NULL) {
segvn_setvnode_mpss(a->vp);
}
if (svd->type == MAP_SHARED && svd->vp != NULL &&
(svd->vp->v_flag & VVMEXEC) && (svd->prot & PROT_WRITE)) {
ASSERT(vn_is_mapped(svd->vp, V_WRITE));
segvn_inval_trcache(svd->vp);
}
amp = a->amp;
if ((svd->amp = amp) == NULL) {
svd->anon_index = 0;
if (svd->type == MAP_SHARED) {
svd->swresv = 0;
/*
* Shared mappings to a vp need no other setup.
* If we have a shared mapping to an anon_map object
* which hasn't been allocated yet, allocate the
* struct now so that it will be properly shared
* by remembering the swap reservation there.
*/
if (a->vp == NULL) {
svd->amp = anonmap_alloc(seg->s_size, swresv,
ANON_SLEEP);
svd->amp->a_szc = seg->s_szc;
}
} else {
/*
* Private mapping (with or without a vp).
* Allocate anon_map when needed.
*/
svd->swresv = swresv;
}
} else {
pgcnt_t anon_num;
/*
* Mapping to an existing anon_map structure without a vp.
 * For now we will ensure that the segment size isn't larger
* than the size - offset gives us. Later on we may wish to
* have the anon array dynamically allocated itself so that
* we don't always have to allocate all the anon pointer slots.
* This of course involves adding extra code to check that we
* aren't trying to use an anon pointer slot beyond the end
* of the currently allocated anon array.
*/
if ((amp->size - a->offset) < seg->s_size) {
panic("segvn_create anon_map size");
/*NOTREACHED*/
}
anon_num = btopr(a->offset);
if (a->type == MAP_SHARED) {
/*
* SHARED mapping to a given anon_map.
*/
ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
amp->refcnt++;
if (a->szc > amp->a_szc) {
amp->a_szc = a->szc;
}
ANON_LOCK_EXIT(&amp->a_rwlock);
svd->anon_index = anon_num;
svd->swresv = 0;
} else {
/*
* PRIVATE mapping to a given anon_map.
* Make sure that all the needed anon
* structures are created (so that we will
* share the underlying pages if nothing
* is written by this mapping) and then
* duplicate the anon array as is done
* when a privately mapped segment is dup'ed.
*/
struct anon *ap;
caddr_t addr;
caddr_t eaddr;
ulong_t anon_idx;
int hat_flag = HAT_LOAD;
if (svd->flags & MAP_TEXT) {
hat_flag |= HAT_LOAD_TEXT;
}
svd->amp = anonmap_alloc(seg->s_size, 0, ANON_SLEEP);
svd->amp->a_szc = seg->s_szc;
svd->anon_index = 0;
svd->swresv = swresv;
/*
* Prevent 2 threads from allocating anon
* slots simultaneously.
*/
ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
eaddr = seg->s_base + seg->s_size;
for (anon_idx = anon_num, addr = seg->s_base;
addr < eaddr; addr += PAGESIZE, anon_idx++) {
page_t *pp;
if ((ap = anon_get_ptr(amp->ahp,
anon_idx)) != NULL)
continue;
/*
* Allocate the anon struct now.
* Might as well load up translation
* to the page while we're at it...
*/
pp = anon_zero(seg, addr, &ap, cred);
if (ap == NULL || pp == NULL) {
panic("segvn_create anon_zero");
/*NOTREACHED*/
}
/*
* Re-acquire the anon_map lock and
* initialize the anon array entry.
*/
ASSERT(anon_get_ptr(amp->ahp,
anon_idx) == NULL);
(void) anon_set_ptr(amp->ahp, anon_idx, ap,
ANON_SLEEP);
ASSERT(seg->s_szc == 0);
ASSERT(!IS_VMODSORT(pp->p_vnode));
ASSERT(use_rgn == 0);
hat_memload(seg->s_as->a_hat, addr, pp,
svd->prot & ~PROT_WRITE, hat_flag);
page_unlock(pp);
}
ASSERT(seg->s_szc == 0);
anon_dup(amp->ahp, anon_num, svd->amp->ahp,
0, seg->s_size);
ANON_LOCK_EXIT(&amp->a_rwlock);
}
}
/*
* Set default memory allocation policy for segment
*
* Always set policy for private memory at least for initialization
* even if this is a shared memory segment
*/
(void) lgrp_privm_policy_set(mpolicy, &svd->policy_info, seg->s_size);
if (svd->type == MAP_SHARED)
(void) lgrp_shm_policy_set(mpolicy, svd->amp, svd->anon_index,
svd->vp, svd->offset, seg->s_size);
if (use_rgn) {
ASSERT(!trok);
ASSERT(svd->amp == NULL);
svd->rcookie = hat_join_region(seg->s_as->a_hat, seg->s_base,
seg->s_size, (void *)svd->vp, svd->offset, svd->prot,
(uchar_t)seg->s_szc, segvn_hat_rgn_unload_callback,
HAT_REGION_TEXT);
}
ASSERT(!trok || !(svd->prot & PROT_WRITE));
svd->tr_state = trok ? SEGVN_TR_INIT : SEGVN_TR_OFF;
return (0);
}
/*
* Concatenate two existing segments, if possible.
* Return 0 on success, -1 if two segments are not compatible
* or -2 on memory allocation failure.
* If amp_cat == 1 then try and concat segments with anon maps
*/
static int
segvn_concat(struct seg *seg1, struct seg *seg2, int amp_cat)
{
struct segvn_data *svd1 = seg1->s_data;
struct segvn_data *svd2 = seg2->s_data;
struct anon_map *amp1 = svd1->amp;
struct anon_map *amp2 = svd2->amp;
struct vpage *vpage1 = svd1->vpage;
struct vpage *vpage2 = svd2->vpage, *nvpage = NULL;
size_t size, nvpsize;
pgcnt_t npages1, npages2;
ASSERT(seg1->s_as && seg2->s_as && seg1->s_as == seg2->s_as);
ASSERT(AS_WRITE_HELD(seg1->s_as, &seg1->s_as->a_lock));
ASSERT(seg1->s_ops == seg2->s_ops);
if (HAT_IS_REGION_COOKIE_VALID(svd1->rcookie) ||
HAT_IS_REGION_COOKIE_VALID(svd2->rcookie)) {
return (-1);
}
/* both segments exist, try to merge them */
#define incompat(x) (svd1->x != svd2->x)
if (incompat(vp) || incompat(maxprot) ||
(!svd1->pageadvice && !svd2->pageadvice && incompat(advice)) ||
(!svd1->pageprot && !svd2->pageprot && incompat(prot)) ||
incompat(type) || incompat(cred) || incompat(flags) ||
seg1->s_szc != seg2->s_szc || incompat(policy_info.mem_policy) ||
(svd2->softlockcnt > 0) || svd1->softlockcnt_send > 0)
return (-1);
#undef incompat
/*
* vp == NULL implies zfod, offset doesn't matter
*/
if (svd1->vp != NULL &&
svd1->offset + seg1->s_size != svd2->offset) {
return (-1);
}
/*
* Don't concatenate if either segment uses text replication.
*/
if (svd1->tr_state != SEGVN_TR_OFF || svd2->tr_state != SEGVN_TR_OFF) {
return (-1);
}
/*
* Fail early if we're not supposed to concatenate
* segments with non NULL amp.
*/
if (amp_cat == 0 && (amp1 != NULL || amp2 != NULL)) {
return (-1);
}
if (svd1->vp == NULL && svd1->type == MAP_SHARED) {
if (amp1 != amp2) {
return (-1);
}
if (amp1 != NULL && svd1->anon_index + btop(seg1->s_size) !=
svd2->anon_index) {
return (-1);
}
ASSERT(amp1 == NULL || amp1->refcnt >= 2);
}
/*
* If either seg has vpages, create a new merged vpage array.
*/
if (vpage1 != NULL || vpage2 != NULL) {
struct vpage *vp, *evp;
npages1 = seg_pages(seg1);
npages2 = seg_pages(seg2);
nvpsize = vpgtob(npages1 + npages2);
if ((nvpage = kmem_zalloc(nvpsize, KM_NOSLEEP)) == NULL) {
return (-2);
}
if (vpage1 != NULL) {
bcopy(vpage1, nvpage, vpgtob(npages1));
} else {
evp = nvpage + npages1;
for (vp = nvpage; vp < evp; vp++) {
VPP_SETPROT(vp, svd1->prot);
VPP_SETADVICE(vp, svd1->advice);
}
}
if (vpage2 != NULL) {
bcopy(vpage2, nvpage + npages1, vpgtob(npages2));
} else {
evp = nvpage + npages1 + npages2;
for (vp = nvpage + npages1; vp < evp; vp++) {
VPP_SETPROT(vp, svd2->prot);
VPP_SETADVICE(vp, svd2->advice);
}
}
if (svd2->pageswap && (!svd1->pageswap && svd1->swresv)) {
ASSERT(svd1->swresv == seg1->s_size);
ASSERT(!(svd1->flags & MAP_NORESERVE));
ASSERT(!(svd2->flags & MAP_NORESERVE));
evp = nvpage + npages1;
for (vp = nvpage; vp < evp; vp++) {
VPP_SETSWAPRES(vp);
}
}
if (svd1->pageswap && (!svd2->pageswap && svd2->swresv)) {
ASSERT(svd2->swresv == seg2->s_size);
ASSERT(!(svd1->flags & MAP_NORESERVE));
ASSERT(!(svd2->flags & MAP_NORESERVE));
vp = nvpage + npages1;
evp = vp + npages2;
for (; vp < evp; vp++) {
VPP_SETSWAPRES(vp);
}
}
}
ASSERT((vpage1 != NULL || vpage2 != NULL) ||
(svd1->pageswap == 0 && svd2->pageswap == 0));
/*
* If either segment has private pages, create a new merged anon
 * array. If merging shared anon segments, just decrement the anon map's
* refcnt.
*/
if (amp1 != NULL && svd1->type == MAP_SHARED) {
ASSERT(amp1 == amp2 && svd1->vp == NULL);
ANON_LOCK_ENTER(&amp1->a_rwlock, RW_WRITER);
ASSERT(amp1->refcnt >= 2);
amp1->refcnt--;
ANON_LOCK_EXIT(&amp1->a_rwlock);
svd2->amp = NULL;
} else if (amp1 != NULL || amp2 != NULL) {
struct anon_hdr *nahp;
struct anon_map *namp = NULL;
size_t asize;
ASSERT(svd1->type == MAP_PRIVATE);
asize = seg1->s_size + seg2->s_size;
if ((nahp = anon_create(btop(asize), ANON_NOSLEEP)) == NULL) {
if (nvpage != NULL) {
kmem_free(nvpage, nvpsize);
}
return (-2);
}
if (amp1 != NULL) {
/*
* XXX anon rwlock is not really needed because
* this is a private segment and we are writers.
*/
ANON_LOCK_ENTER(&amp1->a_rwlock, RW_WRITER);
ASSERT(amp1->refcnt == 1);
if (anon_copy_ptr(amp1->ahp, svd1->anon_index,
nahp, 0, btop(seg1->s_size), ANON_NOSLEEP)) {
anon_release(nahp, btop(asize));
ANON_LOCK_EXIT(&amp1->a_rwlock);
if (nvpage != NULL) {
kmem_free(nvpage, nvpsize);
}
return (-2);
}
}
if (amp2 != NULL) {
ANON_LOCK_ENTER(&amp2->a_rwlock, RW_WRITER);
ASSERT(amp2->refcnt == 1);
if (anon_copy_ptr(amp2->ahp, svd2->anon_index,
nahp, btop(seg1->s_size), btop(seg2->s_size),
ANON_NOSLEEP)) {
anon_release(nahp, btop(asize));
ANON_LOCK_EXIT(&amp2->a_rwlock);
if (amp1 != NULL) {
ANON_LOCK_EXIT(&amp1->a_rwlock);
}
if (nvpage != NULL) {
kmem_free(nvpage, nvpsize);
}
return (-2);
}
}
if (amp1 != NULL) {
namp = amp1;
anon_release(amp1->ahp, btop(amp1->size));
}
if (amp2 != NULL) {
if (namp == NULL) {
ASSERT(amp1 == NULL);
namp = amp2;
anon_release(amp2->ahp, btop(amp2->size));
} else {
amp2->refcnt--;
ANON_LOCK_EXIT(&amp2->a_rwlock);
anonmap_free(amp2);
}
svd2->amp = NULL; /* needed for seg_free */
}
namp->ahp = nahp;
namp->size = asize;
svd1->amp = namp;
svd1->anon_index = 0;
ANON_LOCK_EXIT(&namp->a_rwlock);
}
/*
* Now free the old vpage structures.
*/
if (nvpage != NULL) {
if (vpage1 != NULL) {
kmem_free(vpage1, vpgtob(npages1));
}
if (vpage2 != NULL) {
svd2->vpage = NULL;
kmem_free(vpage2, vpgtob(npages2));
}
if (svd2->pageprot) {
svd1->pageprot = 1;
}
if (svd2->pageadvice) {
svd1->pageadvice = 1;
}
if (svd2->pageswap) {
svd1->pageswap = 1;
}
svd1->vpage = nvpage;
}
/* all looks ok, merge segments */
svd1->swresv += svd2->swresv;
svd2->swresv = 0; /* so seg_free doesn't release swap space */
size = seg2->s_size;
seg_free(seg2);
seg1->s_size += size;
return (0);
}
/*
* Extend the previous segment (seg1) to include the
* new segment (seg2 + a), if possible.
* Return 0 on success.
*/
static int
segvn_extend_prev(struct seg *seg1, struct seg *seg2,
    struct segvn_crargs *a, size_t swresv)
{
struct segvn_data *svd1 = (struct segvn_data *)seg1->s_data;
size_t size;
struct anon_map *amp1;
struct vpage *new_vpage;
/*
* We don't need any segment level locks for "segvn" data
* since the address space is "write" locked.
*/
ASSERT(seg1->s_as && AS_WRITE_HELD(seg1->s_as, &seg1->s_as->a_lock));
if (HAT_IS_REGION_COOKIE_VALID(svd1->rcookie)) {
return (-1);
}
/* second segment is new, try to extend first */
/* XXX - should also check cred */
if (svd1->vp != a->vp || svd1->maxprot != a->maxprot ||
(!svd1->pageprot && (svd1->prot != a->prot)) ||
svd1->type != a->type || svd1->flags != a->flags ||
seg1->s_szc != a->szc || svd1->softlockcnt_send > 0)
return (-1);
/* vp == NULL implies zfod, offset doesn't matter */
if (svd1->vp != NULL &&
svd1->offset + seg1->s_size != (a->offset & PAGEMASK))
return (-1);
if (svd1->tr_state != SEGVN_TR_OFF) {
return (-1);
}
amp1 = svd1->amp;
if (amp1) {
pgcnt_t newpgs;
/*
* Segment has private pages, can data structures
* be expanded?
*
* Acquire the anon_map lock to prevent it from changing,
* if it is shared. This ensures that the anon_map
* will not change while a thread which has a read/write
* lock on an address space references it.
* XXX - Don't need the anon_map lock at all if "refcnt"
* is 1.
*
* Can't grow a MAP_SHARED segment with an anonmap because
* there may be existing anon slots where we want to extend
* the segment and we wouldn't know what to do with them
* (e.g., for tmpfs right thing is to just leave them there,
* for /dev/zero they should be cleared out).
*/
if (svd1->type == MAP_SHARED)
return (-1);
ANON_LOCK_ENTER(&amp1->a_rwlock, RW_WRITER);
if (amp1->refcnt > 1) {
ANON_LOCK_EXIT(&amp1->a_rwlock);
return (-1);
}
newpgs = anon_grow(amp1->ahp, &svd1->anon_index,
btop(seg1->s_size), btop(seg2->s_size), ANON_NOSLEEP);
if (newpgs == 0) {
ANON_LOCK_EXIT(&amp1->a_rwlock);
return (-1);
}
amp1->size = ptob(newpgs);
ANON_LOCK_EXIT(&amp1->a_rwlock);
}
if (svd1->vpage != NULL) {
struct vpage *vp, *evp;
new_vpage =
kmem_zalloc(vpgtob(seg_pages(seg1) + seg_pages(seg2)),
KM_NOSLEEP);
if (new_vpage == NULL)
return (-1);
bcopy(svd1->vpage, new_vpage, vpgtob(seg_pages(seg1)));
kmem_free(svd1->vpage, vpgtob(seg_pages(seg1)));
svd1->vpage = new_vpage;
vp = new_vpage + seg_pages(seg1);
evp = vp + seg_pages(seg2);
for (; vp < evp; vp++)
VPP_SETPROT(vp, a->prot);
if (svd1->pageswap && swresv) {
ASSERT(!(svd1->flags & MAP_NORESERVE));
ASSERT(swresv == seg2->s_size);
vp = new_vpage + seg_pages(seg1);
for (; vp < evp; vp++) {
VPP_SETSWAPRES(vp);
}
}
}
ASSERT(svd1->vpage != NULL || svd1->pageswap == 0);
size = seg2->s_size;
seg_free(seg2);
seg1->s_size += size;
svd1->swresv += swresv;
if (svd1->pageprot && (a->prot & PROT_WRITE) &&
svd1->type == MAP_SHARED && svd1->vp != NULL &&
(svd1->vp->v_flag & VVMEXEC)) {
ASSERT(vn_is_mapped(svd1->vp, V_WRITE));
segvn_inval_trcache(svd1->vp);
}
return (0);
}
/*
* Extend the next segment (seg2) to include the
* new segment (seg1 + a), if possible.
* Return 0 on success.
*/
static int
segvn_extend_next(
struct seg *seg1,
struct seg *seg2,
struct segvn_crargs *a,
size_t swresv)
{
struct segvn_data *svd2 = (struct segvn_data *)seg2->s_data;
size_t size;
struct anon_map *amp2;
struct vpage *new_vpage;
/*
* We don't need any segment level locks for "segvn" data
* since the address space is "write" locked.
*/
ASSERT(seg2->s_as && AS_WRITE_HELD(seg2->s_as, &seg2->s_as->a_lock));
if (HAT_IS_REGION_COOKIE_VALID(svd2->rcookie)) {
return (-1);
}
/* first segment is new, try to extend second */
/* XXX - should also check cred */
if (svd2->vp != a->vp || svd2->maxprot != a->maxprot ||
(!svd2->pageprot && (svd2->prot != a->prot)) ||
svd2->type != a->type || svd2->flags != a->flags ||
seg2->s_szc != a->szc || svd2->softlockcnt_sbase > 0)
return (-1);
/* vp == NULL implies zfod, offset doesn't matter */
if (svd2->vp != NULL &&
(a->offset & PAGEMASK) + seg1->s_size != svd2->offset)
return (-1);
if (svd2->tr_state != SEGVN_TR_OFF) {
return (-1);
}
amp2 = svd2->amp;
if (amp2) {
pgcnt_t newpgs;
/*
* Segment has private pages, can data structures
* be expanded?
*
* Acquire the anon_map lock to prevent it from changing,
* if it is shared. This ensures that the anon_map
* will not change while a thread which has a read/write
* lock on an address space references it.
*
* XXX - Don't need the anon_map lock at all if "refcnt"
* is 1.
*/
if (svd2->type == MAP_SHARED)
return (-1);
ANON_LOCK_ENTER(&amp2->a_rwlock, RW_WRITER);
if (amp2->refcnt > 1) {
ANON_LOCK_EXIT(&amp2->a_rwlock);
return (-1);
}
newpgs = anon_grow(amp2->ahp, &svd2->anon_index,
btop(seg2->s_size), btop(seg1->s_size),
ANON_NOSLEEP | ANON_GROWDOWN);
if (newpgs == 0) {
ANON_LOCK_EXIT(&amp2->a_rwlock);
return (-1);
}
amp2->size = ptob(newpgs);
ANON_LOCK_EXIT(&amp2->a_rwlock);
}
if (svd2->vpage != NULL) {
struct vpage *vp, *evp;
new_vpage =
kmem_zalloc(vpgtob(seg_pages(seg1) + seg_pages(seg2)),
KM_NOSLEEP);
if (new_vpage == NULL) {
/* Not merging segments so adjust anon_index back */
if (amp2)
svd2->anon_index += seg_pages(seg1);
return (-1);
}
bcopy(svd2->vpage, new_vpage + seg_pages(seg1),
vpgtob(seg_pages(seg2)));
kmem_free(svd2->vpage, vpgtob(seg_pages(seg2)));
svd2->vpage = new_vpage;
vp = new_vpage;
evp = vp + seg_pages(seg1);
for (; vp < evp; vp++)
VPP_SETPROT(vp, a->prot);
if (svd2->pageswap && swresv) {
ASSERT(!(svd2->flags & MAP_NORESERVE));
ASSERT(swresv == seg1->s_size);
vp = new_vpage;
for (; vp < evp; vp++) {
VPP_SETSWAPRES(vp);
}
}
}
ASSERT(svd2->vpage != NULL || svd2->pageswap == 0);
size = seg1->s_size;
seg_free(seg1);
seg2->s_size += size;
seg2->s_base -= size;
svd2->offset -= size;
svd2->swresv += swresv;
if (svd2->pageprot && (a->prot & PROT_WRITE) &&
svd2->type == MAP_SHARED && svd2->vp != NULL &&
(svd2->vp->v_flag & VVMEXEC)) {
ASSERT(vn_is_mapped(svd2->vp, V_WRITE));
segvn_inval_trcache(svd2->vp);
}
return (0);
}
static int
segvn_dup(struct seg *seg, struct seg *newseg)
{
struct segvn_data *svd = (struct segvn_data *)seg->s_data;
struct segvn_data *newsvd;
pgcnt_t npages = seg_pages(seg);
int error = 0;
uint_t prot;
size_t len;
struct anon_map *amp;
ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
ASSERT(newseg->s_as->a_proc->p_parent == curproc);
/*
* If segment has anon reserved, reserve more for the new seg.
* For a MAP_NORESERVE segment swresv will be a count of all the
* allocated anon slots; thus we reserve for the child as many slots
* as the parent has allocated. This semantic prevents the child or
 * parent from dying during a copy-on-write fault caused by trying
* to write a shared pre-existing anon page.
*/
if ((len = svd->swresv) != 0) {
if (anon_resv(svd->swresv) == 0)
return (ENOMEM);
TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
seg, len, 0);
}
newsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP);
newseg->s_ops = &segvn_ops;
newseg->s_data = (void *)newsvd;
newseg->s_szc = seg->s_szc;
newsvd->seg = newseg;
if ((newsvd->vp = svd->vp) != NULL) {
VN_HOLD(svd->vp);
if (svd->type == MAP_SHARED)
lgrp_shm_policy_init(NULL, svd->vp);
}
newsvd->offset = svd->offset;
newsvd->prot = svd->prot;
newsvd->maxprot = svd->maxprot;
newsvd->pageprot = svd->pageprot;
newsvd->type = svd->type;
newsvd->cred = svd->cred;
crhold(newsvd->cred);
newsvd->advice = svd->advice;
newsvd->pageadvice = svd->pageadvice;
newsvd->swresv = svd->swresv;
newsvd->pageswap = svd->pageswap;
newsvd->flags = svd->flags;
newsvd->softlockcnt = 0;
newsvd->softlockcnt_sbase = 0;
newsvd->softlockcnt_send = 0;
newsvd->policy_info = svd->policy_info;
newsvd->rcookie = HAT_INVALID_REGION_COOKIE;
if ((amp = svd->amp) == NULL || svd->tr_state == SEGVN_TR_ON) {
/*
* Not attaching to a shared anon object.
*/
ASSERT(!HAT_IS_REGION_COOKIE_VALID(svd->rcookie) ||
svd->tr_state == SEGVN_TR_OFF);
if (svd->tr_state == SEGVN_TR_ON) {
ASSERT(newsvd->vp != NULL && amp != NULL);
newsvd->tr_state = SEGVN_TR_INIT;
} else {
newsvd->tr_state = svd->tr_state;
}
newsvd->amp = NULL;
newsvd->anon_index = 0;
} else {
/* regions for now are only used on pure vnode segments */
ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
ASSERT(svd->tr_state == SEGVN_TR_OFF);
newsvd->tr_state = SEGVN_TR_OFF;
if (svd->type == MAP_SHARED) {
newsvd->amp = amp;
ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
amp->refcnt++;
ANON_LOCK_EXIT(&amp->a_rwlock);
newsvd->anon_index = svd->anon_index;
} else {
int reclaim = 1;
/*
* Allocate and initialize new anon_map structure.
*/
newsvd->amp = anonmap_alloc(newseg->s_size, 0,
ANON_SLEEP);
newsvd->amp->a_szc = newseg->s_szc;
newsvd->anon_index = 0;
/*
* We don't have to acquire the anon_map lock
* for the new segment (since it belongs to an
* address space that is still not associated
* with any process), or the segment in the old
* address space (since all threads in it
* are stopped while duplicating the address space).
*/
/*
* The goal of the following code is to make sure that
* softlocked pages do not end up as copy on write
* pages. This would cause problems where one
* thread writes to a page that is COW and a different
* thread in the same process has softlocked it. The
* softlock lock would move away from this process
* because the write would cause this process to get
* a copy (without the softlock).
*
* The strategy here is to just break the
* sharing on pages that could possibly be
* softlocked.
*/
retry:
if (svd->softlockcnt) {
struct anon *ap, *newap;
size_t i;
uint_t vpprot;
page_t *anon_pl[1+1], *pp;
caddr_t addr;
ulong_t old_idx = svd->anon_index;
ulong_t new_idx = 0;
/*
 * The softlock count might be non-zero
* because some pages are still stuck in the
* cache for lazy reclaim. Flush the cache
* now. This should drop the count to zero.
* [or there is really I/O going on to these
* pages]. Note, we have the writers lock so
* nothing gets inserted during the flush.
*/
if (reclaim == 1) {
segvn_purge(seg);
reclaim = 0;
goto retry;
}
i = btopr(seg->s_size);
addr = seg->s_base;
/*
* XXX break cow sharing using PAGESIZE
* pages. They will be relocated into larger
* pages at fault time.
*/
while (i-- > 0) {
if (ap = anon_get_ptr(amp->ahp,
old_idx)) {
error = anon_getpage(&ap,
&vpprot, anon_pl, PAGESIZE,
seg, addr, S_READ,
svd->cred);
if (error) {
newsvd->vpage = NULL;
goto out;
}
/*
* prot need not be computed
* below 'cause anon_private is
* going to ignore it anyway
* as child doesn't inherit
* pagelock from parent.
*/
prot = svd->pageprot ?
VPP_PROT(
&svd->vpage[
seg_page(seg, addr)])
: svd->prot;
pp = anon_private(&newap,
newseg, addr, prot,
anon_pl[0], 0,
newsvd->cred);
if (pp == NULL) {
/* no mem abort */
newsvd->vpage = NULL;
error = ENOMEM;
goto out;
}
(void) anon_set_ptr(
newsvd->amp->ahp, new_idx,
newap, ANON_SLEEP);
page_unlock(pp);
}
addr += PAGESIZE;
old_idx++;
new_idx++;
}
} else { /* common case */
if (seg->s_szc != 0) {
/*
* If at least one of anon slots of a
* large page exists then make sure
* all anon slots of a large page
* exist to avoid partial cow sharing
* of a large page in the future.
*/
anon_dup_fill_holes(amp->ahp,
svd->anon_index, newsvd->amp->ahp,
0, seg->s_size, seg->s_szc,
svd->vp != NULL);
} else {
anon_dup(amp->ahp, svd->anon_index,
newsvd->amp->ahp, 0, seg->s_size);
}
hat_clrattr(seg->s_as->a_hat, seg->s_base,
seg->s_size, PROT_WRITE);
}
}
}
/*
* If necessary, create a vpage structure for the new segment.
* Do not copy any page lock indications.
*/
if (svd->vpage != NULL) {
uint_t i;
struct vpage *ovp = svd->vpage;
struct vpage *nvp;
nvp = newsvd->vpage =
kmem_alloc(vpgtob(npages), KM_SLEEP);
for (i = 0; i < npages; i++) {
*nvp = *ovp++;
VPP_CLRPPLOCK(nvp++);
}
} else
newsvd->vpage = NULL;
/* Inform the vnode of the new mapping */
if (newsvd->vp != NULL) {
error = VOP_ADDMAP(newsvd->vp, (offset_t)newsvd->offset,
newseg->s_as, newseg->s_base, newseg->s_size, newsvd->prot,
newsvd->maxprot, newsvd->type, newsvd->cred, NULL);
}
out:
if (error == 0 && HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) {
ASSERT(newsvd->amp == NULL);
ASSERT(newsvd->tr_state == SEGVN_TR_OFF);
newsvd->rcookie = svd->rcookie;
hat_dup_region(newseg->s_as->a_hat, newsvd->rcookie);
}
return (error);
}
/*
* callback function to invoke free_vp_pages() for only those pages actually
* processed by the HAT when a shared region is destroyed.
*/
extern int free_pages;
static void
segvn_hat_rgn_unload_callback(caddr_t saddr, caddr_t eaddr, caddr_t r_saddr,
size_t r_size, void *r_obj, u_offset_t r_objoff)
{
u_offset_t off;
size_t len;
vnode_t *vp = (vnode_t *)r_obj;
ASSERT(eaddr > saddr);
ASSERT(saddr >= r_saddr);
ASSERT(saddr < r_saddr + r_size);
ASSERT(eaddr > r_saddr);
ASSERT(eaddr <= r_saddr + r_size);
ASSERT(vp != NULL);
if (!free_pages) {
return;
}
len = eaddr - saddr;
off = (saddr - r_saddr) + r_objoff;
free_vp_pages(vp, off, len);
}
/*
* callback function used by segvn_unmap to invoke free_vp_pages() for only
* those pages actually processed by the HAT
*/
static void
segvn_hat_unload_callback(hat_callback_t *cb)
{
struct seg *seg = cb->hcb_data;
struct segvn_data *svd = (struct segvn_data *)seg->s_data;
size_t len;
u_offset_t off;
ASSERT(svd->vp != NULL);
ASSERT(cb->hcb_end_addr > cb->hcb_start_addr);
ASSERT(cb->hcb_start_addr >= seg->s_base);
len = cb->hcb_end_addr - cb->hcb_start_addr;
off = cb->hcb_start_addr - seg->s_base;
free_vp_pages(svd->vp, svd->offset + off, len);
}
/*
* This function determines the number of bytes of swap reserved by
* a segment for which per-page accounting is present. It is used to
* calculate the correct value of a segvn_data's swresv.
*/
static size_t
segvn_count_swap_by_vpages(struct seg *seg)
{
struct segvn_data *svd = (struct segvn_data *)seg->s_data;
struct vpage *vp, *evp;
size_t nswappages = 0;
ASSERT(svd->pageswap);
ASSERT(svd->vpage != NULL);
evp = &svd->vpage[seg_page(seg, seg->s_base + seg->s_size)];
for (vp = svd->vpage; vp < evp; vp++) {
if (VPP_ISSWAPRES(vp))
nswappages++;
}
return (nswappages << PAGESHIFT);
}
static int
segvn_unmap(struct seg *seg, caddr_t addr, size_t len)
{
struct segvn_data *svd = (struct segvn_data *)seg->s_data;
struct segvn_data *nsvd;
struct seg *nseg;
struct anon_map *amp;
pgcnt_t opages; /* old segment size in pages */
pgcnt_t npages; /* new segment size in pages */
pgcnt_t dpages; /* pages being deleted (unmapped) */
hat_callback_t callback; /* used for free_vp_pages() */
hat_callback_t *cbp = NULL;
caddr_t nbase;
size_t nsize;
size_t oswresv;
int reclaim = 1;
/*
* We don't need any segment level locks for "segvn" data
* since the address space is "write" locked.
*/
ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
/*
* Fail the unmap if pages are SOFTLOCKed through this mapping.
* softlockcnt is protected from change by the as write lock.
*/
retry:
if (svd->softlockcnt > 0) {
ASSERT(svd->tr_state == SEGVN_TR_OFF);
/*
 * If this is a shared segment, a non-zero softlockcnt
 * means locked pages are still in use.
*/
if (svd->type == MAP_SHARED) {
return (EAGAIN);
}
/*
 * Since we hold the writer's lock, nobody can fill
 * the cache during the purge. The flush either succeeds
 * or we still have pending I/Os.
*/
if (reclaim == 1) {
segvn_purge(seg);
reclaim = 0;
goto retry;
}
return (EAGAIN);
}
/*
* Check for bad sizes
*/
if (addr < seg->s_base || addr + len > seg->s_base + seg->s_size ||
(len & PAGEOFFSET) || ((uintptr_t)addr & PAGEOFFSET)) {
panic("segvn_unmap");
/*NOTREACHED*/
}
if (seg->s_szc != 0) {
size_t pgsz = page_get_pagesize(seg->s_szc);
int err;
if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) {
ASSERT(seg->s_base != addr || seg->s_size != len);
if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) {
ASSERT(svd->amp == NULL);
ASSERT(svd->tr_state == SEGVN_TR_OFF);
hat_leave_region(seg->s_as->a_hat,
svd->rcookie, HAT_REGION_TEXT);
svd->rcookie = HAT_INVALID_REGION_COOKIE;
/*
* could pass a flag to segvn_demote_range()
* below to tell it not to do any unloads but
* this case is rare enough to not bother for
* now.
*/
} else if (svd->tr_state == SEGVN_TR_INIT) {
svd->tr_state = SEGVN_TR_OFF;
} else if (svd->tr_state == SEGVN_TR_ON) {
ASSERT(svd->amp != NULL);
segvn_textunrepl(seg, 1);
ASSERT(svd->amp == NULL);
ASSERT(svd->tr_state == SEGVN_TR_OFF);
}
VM_STAT_ADD(segvnvmstats.demoterange[0]);
err = segvn_demote_range(seg, addr, len, SDR_END, 0);
if (err == 0) {
return (IE_RETRY);
}
return (err);
}
}
/* Inform the vnode of the unmapping. */
if (svd->vp) {
int error;
error = VOP_DELMAP(svd->vp,
(offset_t)svd->offset + (uintptr_t)(addr - seg->s_base),
seg->s_as, addr, len, svd->prot, svd->maxprot,
svd->type, svd->cred, NULL);
if (error == EAGAIN)
return (error);
}
/*
* Remove any page locks set through this mapping.
 * If text replication is not off, no page locks could have been
 * established via this mapping.
*/
if (svd->tr_state == SEGVN_TR_OFF) {
(void) segvn_lockop(seg, addr, len, 0, MC_UNLOCK, NULL, 0);
}
if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) {
ASSERT(svd->amp == NULL);
ASSERT(svd->tr_state == SEGVN_TR_OFF);
ASSERT(svd->type == MAP_PRIVATE);
hat_leave_region(seg->s_as->a_hat, svd->rcookie,
HAT_REGION_TEXT);
svd->rcookie = HAT_INVALID_REGION_COOKIE;
} else if (svd->tr_state == SEGVN_TR_ON) {
ASSERT(svd->amp != NULL);
ASSERT(svd->pageprot == 0 && !(svd->prot & PROT_WRITE));
segvn_textunrepl(seg, 1);
ASSERT(svd->amp == NULL && svd->tr_state == SEGVN_TR_OFF);
} else {
if (svd->tr_state != SEGVN_TR_OFF) {
ASSERT(svd->tr_state == SEGVN_TR_INIT);
svd->tr_state = SEGVN_TR_OFF;
}
/*
* Unload any hardware translations in the range to be taken
* out. Use a callback to invoke free_vp_pages() effectively.
*/
if (svd->vp != NULL && free_pages != 0) {
callback.hcb_data = seg;
callback.hcb_function = segvn_hat_unload_callback;
cbp = &callback;
}
hat_unload_callback(seg->s_as->a_hat, addr, len,
HAT_UNLOAD_UNMAP, cbp);
if (svd->type == MAP_SHARED && svd->vp != NULL &&
(svd->vp->v_flag & VVMEXEC) &&
((svd->prot & PROT_WRITE) || svd->pageprot)) {
segvn_inval_trcache(svd->vp);
}
}
/*
* Check for entire segment
*/
if (addr == seg->s_base && len == seg->s_size) {
seg_free(seg);
return (0);
}
opages = seg_pages(seg);
dpages = btop(len);
npages = opages - dpages;
amp = svd->amp;
ASSERT(amp == NULL || amp->a_szc >= seg->s_szc);
/*
* Check for beginning of segment
*/
if (addr == seg->s_base) {
if (svd->vpage != NULL) {
size_t nbytes;
struct vpage *ovpage;
ovpage = svd->vpage; /* keep pointer to vpage */
nbytes = vpgtob(npages);
svd->vpage = kmem_alloc(nbytes, KM_SLEEP);
bcopy(&ovpage[dpages], svd->vpage, nbytes);
/* free up old vpage */
kmem_free(ovpage, vpgtob(opages));
}
if (amp != NULL) {
ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) {
/*
* Shared anon map is no longer in use. Before
* freeing its pages purge all entries from
* pcache that belong to this amp.
*/
if (svd->type == MAP_SHARED) {
ASSERT(amp->refcnt == 1);
ASSERT(svd->softlockcnt == 0);
anonmap_purge(amp);
}
/*
* Free up now unused parts of anon_map array.
*/
if (amp->a_szc == seg->s_szc) {
if (seg->s_szc != 0) {
anon_free_pages(amp->ahp,
svd->anon_index, len,
seg->s_szc);
} else {
anon_free(amp->ahp,
svd->anon_index,
len);
}
} else {
ASSERT(svd->type == MAP_SHARED);
ASSERT(amp->a_szc > seg->s_szc);
anon_shmap_free_pages(amp,
svd->anon_index, len);
}
/*
* Unreserve swap space for the
* unmapped chunk of this segment in
* case it's MAP_SHARED
*/
if (svd->type == MAP_SHARED) {
anon_unresv_zone(len,
seg->s_as->a_proc->p_zone);
amp->swresv -= len;
}
}
ANON_LOCK_EXIT(&amp->a_rwlock);
svd->anon_index += dpages;
}
if (svd->vp != NULL)
svd->offset += len;
seg->s_base += len;
seg->s_size -= len;
if (svd->swresv) {
if (svd->flags & MAP_NORESERVE) {
ASSERT(amp);
oswresv = svd->swresv;
svd->swresv = ptob(anon_pages(amp->ahp,
svd->anon_index, npages));
anon_unresv_zone(oswresv - svd->swresv,
seg->s_as->a_proc->p_zone);
if (SEG_IS_PARTIAL_RESV(seg))
seg->s_as->a_resvsize -= oswresv -
svd->swresv;
} else {
size_t unlen;
if (svd->pageswap) {
oswresv = svd->swresv;
svd->swresv =
segvn_count_swap_by_vpages(seg);
ASSERT(oswresv >= svd->swresv);
unlen = oswresv - svd->swresv;
} else {
svd->swresv -= len;
ASSERT(svd->swresv == seg->s_size);
unlen = len;
}
anon_unresv_zone(unlen,
seg->s_as->a_proc->p_zone);
}
TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
seg, len, 0);
}
return (0);
}
/*
* Check for end of segment
*/
if (addr + len == seg->s_base + seg->s_size) {
if (svd->vpage != NULL) {
size_t nbytes;
struct vpage *ovpage;
ovpage = svd->vpage; /* keep pointer to vpage */
nbytes = vpgtob(npages);
svd->vpage = kmem_alloc(nbytes, KM_SLEEP);
bcopy(ovpage, svd->vpage, nbytes);
/* free up old vpage */
kmem_free(ovpage, vpgtob(opages));
}
if (amp != NULL) {
ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) {
/*
* Free up now unused parts of anon_map array.
*/
ulong_t an_idx = svd->anon_index + npages;
/*
* Shared anon map is no longer in use. Before
* freeing its pages purge all entries from
* pcache that belong to this amp.
*/
if (svd->type == MAP_SHARED) {
ASSERT(amp->refcnt == 1);
ASSERT(svd->softlockcnt == 0);
anonmap_purge(amp);
}
if (amp->a_szc == seg->s_szc) {
if (seg->s_szc != 0) {
anon_free_pages(amp->ahp,
an_idx, len,
seg->s_szc);
} else {
anon_free(amp->ahp, an_idx,
len);
}
} else {
ASSERT(svd->type == MAP_SHARED);
ASSERT(amp->a_szc > seg->s_szc);
anon_shmap_free_pages(amp,
an_idx, len);
}
/*
* Unreserve swap space for the
* unmapped chunk of this segment in
* case it's MAP_SHARED
*/
if (svd->type == MAP_SHARED) {
anon_unresv_zone(len,
seg->s_as->a_proc->p_zone);
amp->swresv -= len;
}
}
ANON_LOCK_EXIT(&amp->a_rwlock);
}
seg->s_size -= len;
if (svd->swresv) {
if (svd->flags & MAP_NORESERVE) {
ASSERT(amp);
oswresv = svd->swresv;
svd->swresv = ptob(anon_pages(amp->ahp,
svd->anon_index, npages));
anon_unresv_zone(oswresv - svd->swresv,
seg->s_as->a_proc->p_zone);
if (SEG_IS_PARTIAL_RESV(seg))
seg->s_as->a_resvsize -= oswresv -
svd->swresv;
} else {
size_t unlen;
if (svd->pageswap) {
oswresv = svd->swresv;
svd->swresv =
segvn_count_swap_by_vpages(seg);
ASSERT(oswresv >= svd->swresv);
unlen = oswresv - svd->swresv;
} else {
svd->swresv -= len;
ASSERT(svd->swresv == seg->s_size);
unlen = len;
}
anon_unresv_zone(unlen,
seg->s_as->a_proc->p_zone);
}
TRACE_3(TR_FAC_VM, TR_ANON_PROC,
"anon proc:%p %lu %u", seg, len, 0);
}
return (0);
}
/*
 * The section to go is in the middle of the segment, so we
 * have to make it into two segments. nseg is made for
 * the high end while seg is cut down at the low end.
*/
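/*
* Illustrative layout (addresses grow to the right):
*
*	before:	[ s_base ........................... s_base + s_size )
*	unmap:	            [ addr ....... addr + len )
*	after:	[ seg (low) )                         [ nseg (high)  )
*
* seg keeps the range below addr, while nseg covers everything from
* addr + len up to the old end of the segment.
*/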
nbase = addr + len; /* new seg base */
nsize = (seg->s_base + seg->s_size) - nbase; /* new seg size */
seg->s_size = addr - seg->s_base; /* shrink old seg */
nseg = seg_alloc(seg->s_as, nbase, nsize);
if (nseg == NULL) {
panic("segvn_unmap seg_alloc");
/*NOTREACHED*/
}
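/*
* The new (high end) segment inherits a copy of the old segvn_data but
* gets its own offset and starts with no swap reservation and no
* softlock counts; vpage, anon map and swap accounting are fixed up
* below.
*/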
nseg->s_ops = seg->s_ops;
nsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP);
nseg->s_data = (void *)nsvd;
nseg->s_szc = seg->s_szc;
*nsvd = *svd;
nsvd->seg = nseg;
nsvd->offset = svd->offset + (uintptr_t)(nseg->s_base - seg->s_base);
nsvd->swresv = 0;
nsvd->softlockcnt = 0;
nsvd->softlockcnt_sbase = 0;
nsvd->softlockcnt_send = 0;
ASSERT(nsvd->rcookie == HAT_INVALID_REGION_COOKIE);
if (svd->vp != NULL) {
VN_HOLD(nsvd->vp);
if (nsvd->type == MAP_SHARED)
lgrp_shm_policy_init(NULL, nsvd->vp);
}
crhold(svd->cred);
if (svd->vpage == NULL) {
nsvd->vpage = NULL;
} else {
/* need to split vpage into two arrays */
size_t nbytes;
struct vpage *ovpage;
ovpage = svd->vpage; /* keep pointer to vpage */
npages = seg_pages(seg); /* seg has shrunk */
nbytes = vpgtob(npages);
svd->vpage = kmem_alloc(nbytes, KM_SLEEP);
bcopy(ovpage, svd->vpage, nbytes);
npages = seg_pages(nseg);
nbytes = vpgtob(npages);
nsvd->vpage = kmem_alloc(nbytes, KM_SLEEP);
bcopy(&ovpage[opages - npages], nsvd->vpage, nbytes);
/* free up old vpage */
kmem_free(ovpage, vpgtob(opages));
}
if (amp == NULL) {
nsvd->amp = NULL;
nsvd->anon_index = 0;
} else {
/*
* Need to create a new anon map for the new segment.
* We'll also allocate a new smaller array for the old
* smaller segment to save space.
*/
opages = btop((uintptr_t)(addr - seg->s_base));
ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) {
/*
* Free up now unused parts of anon_map array.
*/
ulong_t an_idx = svd->anon_index + opages;
/*
* Shared anon map is no longer in use. Before
* freeing its pages purge all entries from
* pcache that belong to this amp.
*/
if (svd->type == MAP_SHARED) {
ASSERT(amp->refcnt == 1);
ASSERT(svd->softlockcnt == 0);
anonmap_purge(amp);
}
if (amp->a_szc == seg->s_szc) {
if (seg->s_szc != 0) {
anon_free_pages(amp->ahp, an_idx, len,
seg->s_szc);
} else {
anon_free(amp->ahp, an_idx,
len);
}
} else {
ASSERT(svd->type == MAP_SHARED);
ASSERT(amp->a_szc > seg->s_szc);
anon_shmap_free_pages(amp, an_idx, len);
}
/*
* Unreserve swap space for the
* unmapped chunk of this segment in
* case it's MAP_SHARED
*/
if (svd->type == MAP_SHARED) {
anon_unresv_zone(len,
seg->s_as->a_proc->p_zone);
amp->swresv -= len;
}
}
nsvd->anon_index = svd->anon_index +
btop((uintptr_t)(nseg->s_base - seg->s_base));
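/*
* A shared mapping keeps referencing the same anon_map from both
* segments; a private mapping has its anon array split into two
* separate maps.
*/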
if (svd->type == MAP_SHARED) {
amp->refcnt++;
nsvd->amp = amp;
} else {
struct anon_map *namp;
struct anon_hdr *nahp;
ASSERT(svd->type == MAP_PRIVATE);
nahp = anon_create(btop(seg->s_size), ANON_SLEEP);
namp = anonmap_alloc(nseg->s_size, 0, ANON_SLEEP);
namp->a_szc = seg->s_szc;
(void) anon_copy_ptr(amp->ahp, svd->anon_index, nahp,
0, btop(seg->s_size), ANON_SLEEP);
(void) anon_copy_ptr(amp->ahp, nsvd->anon_index,
namp->ahp, 0, btop(nseg->s_size), ANON_SLEEP);
anon_release(amp->ahp, btop(amp->size));
svd->anon_index = 0;
nsvd->anon_index = 0;
amp->ahp = nahp;
amp->size = seg->s_size;
nsvd->amp = namp;
}
ANON_LOCK_EXIT(&amp->a_rwlock);
}
if (svd->swresv) {
if (svd->flags & MAP_NORESERVE) {
ASSERT(amp);
oswresv = svd->swresv;
svd->swresv = ptob(anon_pages(amp->ahp,
svd->anon_index, btop(seg->s_size)));
nsvd->swresv = ptob(anon_pages(nsvd->amp->ahp,
nsvd->anon_index, btop(nseg->s_size)));
ASSERT(oswresv >= (svd->swresv + nsvd->swresv));
anon_unresv_zone(oswresv - (svd->swresv + nsvd->swresv),
seg->s_as->a_proc->p_zone);
if (SEG_IS_PARTIAL_RESV(seg))
seg->s_as->a_resvsize -= oswresv -
(svd->swresv + nsvd->swresv);
} else {
size_t unlen;
if (svd->pageswap) {
oswresv = svd->swresv;
svd->swresv = segvn_count_swap_by_vpages(seg);
nsvd->swresv = segvn_count_swap_by_vpages(nseg);
ASSERT(oswresv >= (svd->swresv + nsvd->swresv));
unlen = oswresv - (svd->swresv + nsvd->swresv);
} else {
if (seg->s_size + nseg->s_size + len !=
svd->swresv) {
panic("segvn_unmap: cannot split "
"swap reservation");
/*NOTREACHED*/
}
svd->swresv = seg->s_size;
nsvd->swresv = nseg->s_size;
unlen = len;
}
anon_unresv_zone(unlen,
seg->s_as->a_proc->p_zone);
}
TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
seg, len, 0);
}
return (0); /* I'm glad that's all over with! */
}
static void
segvn_free(struct seg *seg)
{
struct segvn_data *svd = (struct segvn_data *)seg->s_data;
pgcnt_t npages = seg_pages(seg);
struct anon_map *amp;
size_t len;
/*
* We don't need any segment level locks for "segvn" data
* since the address space is "write" locked.
*/
ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
ASSERT(svd->tr_state == SEGVN_TR_OFF);
ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
/*
* Be sure to unlock pages. XXX Why do things get freed instead
* of unmapped? XXX
*/
(void) segvn_lockop(seg, seg->s_base, seg->s_size,
0, MC_UNLOCK, NULL, 0);
/*
* Deallocate the vpage and anon pointers if necessary and possible.
*/
if (svd->vpage != NULL) {
kmem_free(svd->vpage, vpgtob(npages));
svd->vpage = NULL;
}
if ((amp = svd->amp) != NULL) {
/*
* If there are no more references to this anon_map
* structure, then deallocate the structure after freeing
* up all the anon slot pointers that we can.
*/
ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
ASSERT(amp->a_szc >= seg->s_szc);
if (--amp->refcnt == 0) {
if (svd->type == MAP_PRIVATE) {
/*
* Private - we only need to anon_free
* the part that this segment refers to.
*/
if (seg->s_szc != 0) {
anon_free_pages(amp->ahp,
svd->anon_index, seg->s_size,
seg->s_szc);
} else {
anon_free(amp->ahp, svd->anon_index,
seg->s_size);
}
} else {
/*
* Shared anon map is no longer in use. Before
* freeing its pages purge all entries from
* pcache that belong to this amp.
*/
ASSERT(svd->softlockcnt == 0);
anonmap_purge(amp);
/*
* Shared - anon_free the entire
* anon_map's worth of anon slots and
* release any swap reservation.
*/
if (amp->a_szc != 0) {
anon_shmap_free_pages(amp, 0,
amp->size);
} else {
anon_free(amp->ahp, 0, amp->size);
}
if ((len = amp->swresv) != 0) {
anon_unresv_zone(len,
seg->s_as->a_proc->p_zone);
TRACE_3(TR_FAC_VM, TR_ANON_PROC,
"anon proc:%p %lu %u", seg, len, 0);
}
}
svd->amp = NULL;
ANON_LOCK_EXIT(&amp->a_rwlock);
anonmap_free(amp);
} else if (svd->type == MAP_PRIVATE) {
/*
* We had a private mapping which still has
* a held anon_map so just free up all the
* anon slot pointers that we were using.
*/
if (seg->s_szc != 0) {
anon_free_pages(amp->ahp, svd->anon_index,
seg->s_size, seg->s_szc);
} else {
anon_free(amp->ahp, svd->anon_index,
seg->s_size);
}
ANON_LOCK_EXIT(&amp->a_rwlock);
} else {
ANON_LOCK_EXIT(&amp->a_rwlock);
}
}
/*
* Release swap reservation.
*/
if ((len = svd->swresv) != 0) {
anon_unresv_zone(svd->swresv,
seg->s_as->a_proc->p_zone);
TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
seg, len, 0);
if (SEG_IS_PARTIAL_RESV(seg))
seg->s_as->a_resvsize -= svd->swresv;
svd->swresv = 0;
}
/*
* Release claim on vnode, credentials, and finally free the
* private data.
*/
if (svd->vp != NULL) {
if (svd->type == MAP_SHARED)
lgrp_shm_policy_fini(NULL, svd->vp);
VN_RELE(svd->vp);
svd->vp = NULL;
}
crfree(svd->cred);
svd->pageprot = 0;
svd->pageadvice = 0;
svd->pageswap = 0;
svd->cred = NULL;
/*
* Take segfree_syncmtx lock to let segvn_reclaim() finish if it's
* still working with this segment without holding as lock (in case
* it's called by pcache async thread).
*/
ASSERT(svd->softlockcnt == 0);
mutex_enter(&svd->segfree_syncmtx);
mutex_exit(&svd->segfree_syncmtx);
seg->s_data = NULL;
kmem_cache_free(segvn_cache, svd);
}
/*
* Do an F_SOFTUNLOCK call over the range requested. The range must have
* already been F_SOFTLOCK'ed.
* Caller must always match addr and len of a softunlock with a previous
* softlock with exactly the same addr and len.
*/
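/*
* Illustrative sketch only (not taken from this file): a caller that
* soft-locks a range through as_fault() is expected to later unlock
* exactly the same range, for example:
*
*	if (as_fault(as->a_hat, as, addr, len, F_SOFTLOCK, S_READ) == 0) {
*		... access the locked pages ...
*		(void) as_fault(as->a_hat, as, addr, len,
*		    F_SOFTUNLOCK, S_READ);
*	}
*
* The F_SOFTUNLOCK eventually reaches this routine with the matching
* addr and len.
*/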
static void
segvn_softunlock(struct seg *seg, caddr_t addr, size_t len, enum seg_rw rw)
{
struct segvn_data *svd = (struct segvn_data *)seg->s_data;
page_t *pp;
caddr_t adr;
struct vnode *vp;
u_offset_t offset;
ulong_t anon_index;
struct anon_map *amp;
struct anon *ap = NULL;
ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock));
if ((amp = svd->amp) != NULL)
anon_index = svd->anon_index + seg_page(seg, addr);
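/*
* Undo the hat-level lock taken at F_SOFTLOCK time; a segment that is
* part of a shared HAT region goes through the region interface.
*/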
if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) {
ASSERT(svd->tr_state == SEGVN_TR_OFF);
hat_unlock_region(seg->s_as->a_hat, addr, len, svd->rcookie);
} else {
hat_unlock(seg->s_as->a_hat, addr, len);
}
for (adr = addr; adr < addr + len; adr += PAGESIZE) {
if (amp != NULL) {
ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
if ((ap = anon_get_ptr(amp->ahp, anon_index++))
!= NULL) {
swap_xlate(ap, &vp, &offset);
} else {
vp = svd->vp;
offset = svd->offset +
(uintptr_t)(adr - seg->s_base);
}
ANON_LOCK_EXIT(&amp->a_rwlock);
} else {
vp = svd->vp;
offset = svd->offset +
(uintptr_t)(adr - seg->s_base);
}
/*
* Use page_find() instead of page_lookup() to
* find the page since we know that it is locked.
*/
pp = page_find(vp, offset);
if (pp == NULL) {
panic(
"segvn_softunlock: addr %p, ap %p, vp %p, off %llx",
(void *)adr, (void *)ap, (void *)vp, offset);
/*NOTREACHED*/
}
if (rw == S_WRITE) {
hat_setrefmod(pp);
if (seg->s_as->a_vbits)
hat_setstat(seg->s_as, adr, PAGESIZE,
P_REF | P_MOD);
} else if (rw != S_OTHER) {
hat_setref(pp);
if (seg->s_as->a_vbits)
hat_setstat(seg->s_as, adr, PAGESIZE, P_REF);
}
TRACE_3(TR_FAC_VM, TR_SEGVN_FAULT,
"segvn_fault:pp %p vp %p offset %llx", pp, vp, offset);
page_unlock(pp);
}
ASSERT(svd->softlockcnt >= btop(len));
if (!atomic_add_long_nv((ulong_t *)&svd->softlockcnt, -btop(len))) {
/*
* All SOFTLOCKS are gone. Wakeup any waiting
* unmappers so they can try again to unmap.
* Check for waiters first without the mutex
* held so we don't always grab the mutex on
* softunlocks.
*/
if (AS_ISUNMAPWAIT(seg->s_as)) {
mutex_enter(&seg->s_as->a_contents);
if (AS_ISUNMAPWAIT(seg->s_as)) {
AS_CLRUNMAPWAIT(seg->s_as);
cv_broadcast(&seg->s_as->a_cv);
}
mutex_exit(&seg->s_as->a_contents);
}
}
}
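/*
* Sentinel stored into a page list slot once segvn_faultpage() has
* consumed that page, so segvn_pagelist_rele() knows not to unlock it
* again.
*/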
#define PAGE_HANDLED ((page_t *)-1)
/*
* Release all the pages in the NULL terminated ppp list
* which haven't already been converted to PAGE_HANDLED.
*/
static void
segvn_pagelist_rele(page_t **ppp)
{
for (; *ppp != NULL; ppp++) {
if (*ppp != PAGE_HANDLED)
page_unlock(*ppp);
}
}
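/*
* Non-zero allows a copy-on-write fault to "steal" the original page
* instead of copying it when nothing else is using the page (see the
* cow handling in segvn_faultpage()).
*/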
static int stealcow = 1;
/*
* Workaround for viking chip bug. See bug id 1220902.
* To fix this down in pagefault() would require importing so
* much of the as and segvn code as to be unmaintainable.
*/
int enable_mbit_wa = 0;
/*
* Handles all the dirty work of getting the right
* anonymous pages and loading up the translations.
* This routine is called only from segvn_fault()
* when looping over the range of addresses requested.
*
* The basic algorithm here is:
* If this is an anon_zero case
* Call anon_zero to allocate page
* Load up translation
* Return
* endif
* If this is an anon page
* Use anon_getpage to get the page
* else
* Find page in pl[] list passed in
* endif
* If not a cow
* Load up the translation to the page
* return
* endif
* Call anon_private to handle cow
* Load up (writable) translation to new page
*/
static faultcode_t
segvn_faultpage(
struct hat *hat, /* the hat to use for mapping */
struct seg *seg, /* seg_vn of interest */
caddr_t addr, /* address in as */
u_offset_t off, /* offset in vp */
struct vpage *vpage, /* pointer to vpage for vp, off */
page_t *pl[], /* object source page pointer */
uint_t vpprot, /* access allowed to object pages */
enum fault_type type, /* type of fault */
enum seg_rw rw, /* type of access at fault */
int brkcow) /* we may need to break cow */
{
struct segvn_data *svd = (struct segvn_data *)seg->s_data;
page_t *pp, **ppp;
uint_t pageflags = 0;
page_t *anon_pl[1 + 1];
page_t *opp = NULL; /* original page */
uint_t prot;