| /* |
| * CDDL HEADER START |
| * |
| * The contents of this file are subject to the terms of the |
| * Common Development and Distribution License (the "License"). |
| * You may not use this file except in compliance with the License. |
| * |
| * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE |
| * or http://www.opensolaris.org/os/licensing. |
| * See the License for the specific language governing permissions |
| * and limitations under the License. |
| * |
| * When distributing Covered Code, include this CDDL HEADER in each |
| * file and include the License file at usr/src/OPENSOLARIS.LICENSE. |
| * If applicable, add the following below this CDDL HEADER, with the |
| * fields enclosed by brackets "[]" replaced with your own identifying |
| * information: Portions Copyright [yyyy] [name of copyright owner] |
| * |
| * CDDL HEADER END |
| */ |
| /* |
| * Copyright 2010 Sun Microsystems, Inc. All rights reserved. |
| * Use is subject to license terms. |
| */ |
| |
| /* |
| * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T. |
| * All rights reserved. |
| */ |
| |
| /* |
| * Copyright (c) 2013, Joyent, Inc. All rights reserved. |
| * Copyright 2015 Nexenta Systems, Inc. All rights reserved. |
| * Copyright 2022 Oxide Computer Company |
| */ |
| |
| #include <sys/param.h> |
| #include <sys/types.h> |
| #include <sys/systm.h> |
| #include <sys/cred.h> |
| #include <sys/time.h> |
| #include <sys/vnode.h> |
| #include <sys/vfs.h> |
| #include <sys/vfs_opreg.h> |
| #include <sys/file.h> |
| #include <sys/filio.h> |
| #include <sys/uio.h> |
| #include <sys/buf.h> |
| #include <sys/mman.h> |
| #include <sys/pathname.h> |
| #include <sys/dirent.h> |
| #include <sys/debug.h> |
| #include <sys/vmsystm.h> |
| #include <sys/fcntl.h> |
| #include <sys/flock.h> |
| #include <sys/swap.h> |
| #include <sys/errno.h> |
| #include <sys/strsubr.h> |
| #include <sys/sysmacros.h> |
| #include <sys/kmem.h> |
| #include <sys/cmn_err.h> |
| #include <sys/pathconf.h> |
| #include <sys/utsname.h> |
| #include <sys/dnlc.h> |
| #include <sys/acl.h> |
| #include <sys/systeminfo.h> |
| #include <sys/atomic.h> |
| #include <sys/policy.h> |
| #include <sys/sdt.h> |
| #include <sys/zone.h> |
| |
| #include <rpc/types.h> |
| #include <rpc/auth.h> |
| #include <rpc/clnt.h> |
| #include <rpc/rpc_rdma.h> |
| |
| #include <nfs/nfs.h> |
| #include <nfs/nfs_clnt.h> |
| #include <nfs/rnode.h> |
| #include <nfs/nfs_acl.h> |
| #include <nfs/lm.h> |
| |
| #include <vm/hat.h> |
| #include <vm/as.h> |
| #include <vm/page.h> |
| #include <vm/pvn.h> |
| #include <vm/seg.h> |
| #include <vm/seg_map.h> |
| #include <vm/seg_kpm.h> |
| #include <vm/seg_vn.h> |
| |
| #include <fs/fs_subr.h> |
| |
| #include <sys/ddi.h> |
| |
| static int nfs3_rdwrlbn(vnode_t *, page_t *, u_offset_t, size_t, int, |
| cred_t *); |
| static int nfs3write(vnode_t *, caddr_t, u_offset_t, int, cred_t *, |
| stable_how *); |
| static int nfs3read(vnode_t *, caddr_t, offset_t, int, size_t *, cred_t *); |
| static int nfs3setattr(vnode_t *, struct vattr *, int, cred_t *); |
| static int nfs3_accessx(void *, int, cred_t *); |
| static int nfs3lookup_dnlc(vnode_t *, char *, vnode_t **, cred_t *); |
| static int nfs3lookup_otw(vnode_t *, char *, vnode_t **, cred_t *, int); |
| static int nfs3create(vnode_t *, char *, struct vattr *, enum vcexcl, |
| int, vnode_t **, cred_t *, int); |
| static int nfs3excl_create_settimes(vnode_t *, struct vattr *, cred_t *); |
| static int nfs3mknod(vnode_t *, char *, struct vattr *, enum vcexcl, |
| int, vnode_t **, cred_t *); |
| static int nfs3rename(vnode_t *, char *, vnode_t *, char *, cred_t *, |
| caller_context_t *); |
| static int do_nfs3readdir(vnode_t *, rddir_cache *, cred_t *); |
| static void nfs3readdir(vnode_t *, rddir_cache *, cred_t *); |
| static void nfs3readdirplus(vnode_t *, rddir_cache *, cred_t *); |
| static int nfs3_bio(struct buf *, stable_how *, cred_t *); |
| static int nfs3_getapage(vnode_t *, u_offset_t, size_t, uint_t *, |
| page_t *[], size_t, struct seg *, caddr_t, |
| enum seg_rw, cred_t *); |
| static void nfs3_readahead(vnode_t *, u_offset_t, caddr_t, struct seg *, |
| cred_t *); |
| static int nfs3_sync_putapage(vnode_t *, page_t *, u_offset_t, size_t, |
| int, cred_t *); |
| static int nfs3_sync_pageio(vnode_t *, page_t *, u_offset_t, size_t, |
| int, cred_t *); |
| static int nfs3_commit(vnode_t *, offset3, count3, cred_t *); |
| static void nfs3_set_mod(vnode_t *); |
| static void nfs3_get_commit(vnode_t *); |
| static void nfs3_get_commit_range(vnode_t *, u_offset_t, size_t); |
| static int nfs3_putpage_commit(vnode_t *, offset_t, size_t, cred_t *); |
| static int nfs3_commit_vp(vnode_t *, u_offset_t, size_t, cred_t *); |
| static int nfs3_sync_commit(vnode_t *, page_t *, offset3, count3, |
| cred_t *); |
| static void nfs3_async_commit(vnode_t *, page_t *, offset3, count3, |
| cred_t *); |
| static void nfs3_delmap_callback(struct as *, void *, uint_t); |
| |
/*
 * Pseudo error values used to report certain special conditions that
 * callers have to treat separately from ordinary errors.  Both values
 * are negative.
 */
#define	NFS_EOF			-98
#define	NFS_VERF_MISMATCH	-97

/*
 * ALIGN64 advances ptr to the next 64-bit boundary and shrinks sz by the
 * number of bytes that were skipped.  x is scratch storage: afterwards it
 * holds the number of bytes skipped (0 when ptr was already aligned).
 *
 * NOTE: the macro expands to more than one statement and is not wrapped
 * in do { } while (0), so it must not be used as the sole body of an
 * unbraced if/else/loop.  ptr and sz are expanded unparenthesized on the
 * left of a compound assignment, so pass plain lvalues.
 */
#define	ALIGN64(x, ptr, sz)						\
	x = ((uintptr_t)(ptr)) & (sizeof (uint64_t) - 1);		\
	if (x) {							\
		x = sizeof (uint64_t) - (x);				\
		sz -= (x);						\
		ptr += (x);						\
	}
| |
| /* |
| * These are the vnode ops routines which implement the vnode interface to |
| * the networked file system. These routines just take their parameters, |
| * make them look networkish by putting the right info into interface structs, |
| * and then call the appropriate remote routine(s) to do the work. |
| * |
| * Note on directory name lookup caching: If we detect a stale fhandle, |
| * we purge the directory cache relative to that vnode. This way, the |
| * user won't get burned by the cache repeatedly. See <nfs/rnode.h> for |
| * more details on rnode locking. |
| */ |
| |
| static int nfs3_open(vnode_t **, int, cred_t *, caller_context_t *); |
| static int nfs3_close(vnode_t *, int, int, offset_t, cred_t *, |
| caller_context_t *); |
| static int nfs3_read(vnode_t *, struct uio *, int, cred_t *, |
| caller_context_t *); |
| static int nfs3_write(vnode_t *, struct uio *, int, cred_t *, |
| caller_context_t *); |
| static int nfs3_ioctl(vnode_t *, int, intptr_t, int, cred_t *, int *, |
| caller_context_t *); |
| static int nfs3_getattr(vnode_t *, struct vattr *, int, cred_t *, |
| caller_context_t *); |
| static int nfs3_setattr(vnode_t *, struct vattr *, int, cred_t *, |
| caller_context_t *); |
| static int nfs3_access(vnode_t *, int, int, cred_t *, caller_context_t *); |
| static int nfs3_readlink(vnode_t *, struct uio *, cred_t *, |
| caller_context_t *); |
| static int nfs3_fsync(vnode_t *, int, cred_t *, caller_context_t *); |
| static void nfs3_inactive(vnode_t *, cred_t *, caller_context_t *); |
| static int nfs3_lookup(vnode_t *, char *, vnode_t **, |
| struct pathname *, int, vnode_t *, cred_t *, |
| caller_context_t *, int *, pathname_t *); |
| static int nfs3_create(vnode_t *, char *, struct vattr *, enum vcexcl, |
| int, vnode_t **, cred_t *, int, caller_context_t *, |
| vsecattr_t *); |
| static int nfs3_remove(vnode_t *, char *, cred_t *, caller_context_t *, |
| int); |
| static int nfs3_link(vnode_t *, vnode_t *, char *, cred_t *, |
| caller_context_t *, int); |
| static int nfs3_rename(vnode_t *, char *, vnode_t *, char *, cred_t *, |
| caller_context_t *, int); |
| static int nfs3_mkdir(vnode_t *, char *, struct vattr *, vnode_t **, |
| cred_t *, caller_context_t *, int, vsecattr_t *); |
| static int nfs3_rmdir(vnode_t *, char *, vnode_t *, cred_t *, |
| caller_context_t *, int); |
| static int nfs3_symlink(vnode_t *, char *, struct vattr *, char *, |
| cred_t *, caller_context_t *, int); |
| static int nfs3_readdir(vnode_t *, struct uio *, cred_t *, int *, |
| caller_context_t *, int); |
| static int nfs3_fid(vnode_t *, fid_t *, caller_context_t *); |
| static int nfs3_rwlock(vnode_t *, int, caller_context_t *); |
| static void nfs3_rwunlock(vnode_t *, int, caller_context_t *); |
| static int nfs3_seek(vnode_t *, offset_t, offset_t *, caller_context_t *); |
| static int nfs3_getpage(vnode_t *, offset_t, size_t, uint_t *, |
| page_t *[], size_t, struct seg *, caddr_t, |
| enum seg_rw, cred_t *, caller_context_t *); |
| static int nfs3_putpage(vnode_t *, offset_t, size_t, int, cred_t *, |
| caller_context_t *); |
| static int nfs3_map(vnode_t *, offset_t, struct as *, caddr_t *, size_t, |
| uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *); |
| static int nfs3_addmap(vnode_t *, offset_t, struct as *, caddr_t, size_t, |
| uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *); |
| static int nfs3_frlock(vnode_t *, int, struct flock64 *, int, offset_t, |
| struct flk_callback *, cred_t *, caller_context_t *); |
| static int nfs3_space(vnode_t *, int, struct flock64 *, int, offset_t, |
| cred_t *, caller_context_t *); |
| static int nfs3_realvp(vnode_t *, vnode_t **, caller_context_t *); |
| static int nfs3_delmap(vnode_t *, offset_t, struct as *, caddr_t, size_t, |
| uint_t, uint_t, uint_t, cred_t *, caller_context_t *); |
| static int nfs3_pathconf(vnode_t *, int, ulong_t *, cred_t *, |
| caller_context_t *); |
| static int nfs3_pageio(vnode_t *, page_t *, u_offset_t, size_t, int, |
| cred_t *, caller_context_t *); |
| static void nfs3_dispose(vnode_t *, page_t *, int, int, cred_t *, |
| caller_context_t *); |
| static int nfs3_setsecattr(vnode_t *, vsecattr_t *, int, cred_t *, |
| caller_context_t *); |
| static int nfs3_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *, |
| caller_context_t *); |
| static int nfs3_shrlock(vnode_t *, int, struct shrlock *, int, cred_t *, |
| caller_context_t *); |
| |
| struct vnodeops *nfs3_vnodeops; |
| |
| const fs_operation_def_t nfs3_vnodeops_template[] = { |
| VOPNAME_OPEN, { .vop_open = nfs3_open }, |
| VOPNAME_CLOSE, { .vop_close = nfs3_close }, |
| VOPNAME_READ, { .vop_read = nfs3_read }, |
| VOPNAME_WRITE, { .vop_write = nfs3_write }, |
| VOPNAME_IOCTL, { .vop_ioctl = nfs3_ioctl }, |
| VOPNAME_GETATTR, { .vop_getattr = nfs3_getattr }, |
| VOPNAME_SETATTR, { .vop_setattr = nfs3_setattr }, |
| VOPNAME_ACCESS, { .vop_access = nfs3_access }, |
| VOPNAME_LOOKUP, { .vop_lookup = nfs3_lookup }, |
| VOPNAME_CREATE, { .vop_create = nfs3_create }, |
| VOPNAME_REMOVE, { .vop_remove = nfs3_remove }, |
| VOPNAME_LINK, { .vop_link = nfs3_link }, |
| VOPNAME_RENAME, { .vop_rename = nfs3_rename }, |
| VOPNAME_MKDIR, { .vop_mkdir = nfs3_mkdir }, |
| VOPNAME_RMDIR, { .vop_rmdir = nfs3_rmdir }, |
| VOPNAME_READDIR, { .vop_readdir = nfs3_readdir }, |
| VOPNAME_SYMLINK, { .vop_symlink = nfs3_symlink }, |
| VOPNAME_READLINK, { .vop_readlink = nfs3_readlink }, |
| VOPNAME_FSYNC, { .vop_fsync = nfs3_fsync }, |
| VOPNAME_INACTIVE, { .vop_inactive = nfs3_inactive }, |
| VOPNAME_FID, { .vop_fid = nfs3_fid }, |
| VOPNAME_RWLOCK, { .vop_rwlock = nfs3_rwlock }, |
| VOPNAME_RWUNLOCK, { .vop_rwunlock = nfs3_rwunlock }, |
| VOPNAME_SEEK, { .vop_seek = nfs3_seek }, |
| VOPNAME_FRLOCK, { .vop_frlock = nfs3_frlock }, |
| VOPNAME_SPACE, { .vop_space = nfs3_space }, |
| VOPNAME_REALVP, { .vop_realvp = nfs3_realvp }, |
| VOPNAME_GETPAGE, { .vop_getpage = nfs3_getpage }, |
| VOPNAME_PUTPAGE, { .vop_putpage = nfs3_putpage }, |
| VOPNAME_MAP, { .vop_map = nfs3_map }, |
| VOPNAME_ADDMAP, { .vop_addmap = nfs3_addmap }, |
| VOPNAME_DELMAP, { .vop_delmap = nfs3_delmap }, |
| /* no separate nfs3_dump */ |
| VOPNAME_DUMP, { .vop_dump = nfs_dump }, |
| VOPNAME_PATHCONF, { .vop_pathconf = nfs3_pathconf }, |
| VOPNAME_PAGEIO, { .vop_pageio = nfs3_pageio }, |
| VOPNAME_DISPOSE, { .vop_dispose = nfs3_dispose }, |
| VOPNAME_SETSECATTR, { .vop_setsecattr = nfs3_setsecattr }, |
| VOPNAME_GETSECATTR, { .vop_getsecattr = nfs3_getsecattr }, |
| VOPNAME_SHRLOCK, { .vop_shrlock = nfs3_shrlock }, |
| VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, |
| NULL, NULL |
| }; |
| |
| /* |
| * XXX: This is referenced in modstubs.s |
| */ |
| struct vnodeops * |
| nfs3_getvnodeops(void) |
| { |
| return (nfs3_vnodeops); |
| } |
| |
| /* ARGSUSED */ |
| static int |
| nfs3_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) |
| { |
| int error; |
| struct vattr va; |
| rnode_t *rp; |
| vnode_t *vp; |
| |
| vp = *vpp; |
| if (nfs_zone() != VTOMI(vp)->mi_zone) |
| return (EIO); |
| rp = VTOR(vp); |
| mutex_enter(&rp->r_statelock); |
| if (rp->r_cred == NULL) { |
| crhold(cr); |
| rp->r_cred = cr; |
| } |
| mutex_exit(&rp->r_statelock); |
| |
| /* |
| * If there is no cached data or if close-to-open |
| * consistency checking is turned off, we can avoid |
| * the over the wire getattr. Otherwise, if the |
| * file system is mounted readonly, then just verify |
| * the caches are up to date using the normal mechanism. |
| * Else, if the file is not mmap'd, then just mark |
| * the attributes as timed out. They will be refreshed |
| * and the caches validated prior to being used. |
| * Else, the file system is mounted writeable so |
| * force an over the wire GETATTR in order to ensure |
| * that all cached data is valid. |
| */ |
| if (vp->v_count > 1 || |
| ((vn_has_cached_data(vp) || HAVE_RDDIR_CACHE(rp)) && |
| !(VTOMI(vp)->mi_flags & MI_NOCTO))) { |
| if (vn_is_readonly(vp)) |
| error = nfs3_validate_caches(vp, cr); |
| else if (rp->r_mapcnt == 0 && vp->v_count == 1) { |
| PURGE_ATTRCACHE(vp); |
| error = 0; |
| } else { |
| va.va_mask = AT_ALL; |
| error = nfs3_getattr_otw(vp, &va, cr); |
| } |
| } else |
| error = 0; |
| |
| return (error); |
| } |
| |
| /* ARGSUSED */ |
| static int |
| nfs3_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, |
| caller_context_t *ct) |
| { |
| rnode_t *rp; |
| int error; |
| struct vattr va; |
| |
| /* |
| * zone_enter(2) prevents processes from changing zones with NFS files |
| * open; if we happen to get here from the wrong zone we can't do |
| * anything over the wire. |
| */ |
| if (VTOMI(vp)->mi_zone != nfs_zone()) { |
| /* |
| * We could attempt to clean up locks, except we're sure |
| * that the current process didn't acquire any locks on |
| * the file: any attempt to lock a file belong to another zone |
| * will fail, and one can't lock an NFS file and then change |
| * zones, as that fails too. |
| * |
| * Returning an error here is the sane thing to do. A |
| * subsequent call to VN_RELE() which translates to a |
| * nfs3_inactive() will clean up state: if the zone of the |
| * vnode's origin is still alive and kicking, an async worker |
| * thread will handle the request (from the correct zone), and |
| * everything (minus the commit and final nfs3_getattr_otw() |
| * call) should be OK. If the zone is going away |
| * nfs_async_inactive() will throw away cached pages inline. |
| */ |
| return (EIO); |
| } |
| |
| /* |
| * If we are using local locking for this filesystem, then |
| * release all of the SYSV style record locks. Otherwise, |
| * we are doing network locking and we need to release all |
| * of the network locks. All of the locks held by this |
| * process on this file are released no matter what the |
| * incoming reference count is. |
| */ |
| if (VTOMI(vp)->mi_flags & MI_LLOCK) { |
| cleanlocks(vp, ttoproc(curthread)->p_pid, 0); |
| cleanshares(vp, ttoproc(curthread)->p_pid); |
| } else |
| nfs_lockrelease(vp, flag, offset, cr); |
| |
| if (count > 1) |
| return (0); |
| |
| /* |
| * If the file has been `unlinked', then purge the |
| * DNLC so that this vnode will get reycled quicker |
| * and the .nfs* file on the server will get removed. |
| */ |
| rp = VTOR(vp); |
| if (rp->r_unldvp != NULL) |
| dnlc_purge_vp(vp); |
| |
| /* |
| * If the file was open for write and there are pages, |
| * then if the file system was mounted using the "no-close- |
| * to-open" semantics, then start an asynchronous flush |
| * of the all of the pages in the file. |
| * else the file system was not mounted using the "no-close- |
| * to-open" semantics, then do a synchronous flush and |
| * commit of all of the dirty and uncommitted pages. |
| * |
| * The asynchronous flush of the pages in the "nocto" path |
| * mostly just associates a cred pointer with the rnode so |
| * writes which happen later will have a better chance of |
| * working. It also starts the data being written to the |
| * server, but without unnecessarily delaying the application. |
| */ |
| if ((flag & FWRITE) && vn_has_cached_data(vp)) { |
| if (VTOMI(vp)->mi_flags & MI_NOCTO) { |
| error = nfs3_putpage(vp, (offset_t)0, 0, B_ASYNC, |
| cr, ct); |
| if (error == EAGAIN) |
| error = 0; |
| } else |
| error = nfs3_putpage_commit(vp, (offset_t)0, 0, cr); |
| if (!error) { |
| mutex_enter(&rp->r_statelock); |
| error = rp->r_error; |
| rp->r_error = 0; |
| mutex_exit(&rp->r_statelock); |
| } |
| } else { |
| mutex_enter(&rp->r_statelock); |
| error = rp->r_error; |
| rp->r_error = 0; |
| mutex_exit(&rp->r_statelock); |
| } |
| |
| /* |
| * If RWRITEATTR is set, then issue an over the wire GETATTR to |
| * refresh the attribute cache with a set of attributes which |
| * weren't returned from a WRITE. This will enable the close- |
| * to-open processing to work. |
| */ |
| if (rp->r_flags & RWRITEATTR) |
| (void) nfs3_getattr_otw(vp, &va, cr); |
| |
| return (error); |
| } |
| |
| /* ARGSUSED */ |
| static int |
| nfs3_directio_read(vnode_t *vp, struct uio *uiop, cred_t *cr) |
| { |
| mntinfo_t *mi; |
| READ3args args; |
| READ3uiores res; |
| int tsize; |
| offset_t offset; |
| ssize_t count; |
| int error; |
| int douprintf; |
| failinfo_t fi; |
| char *sv_hostname; |
| |
| mi = VTOMI(vp); |
| ASSERT(nfs_zone() == VTOMI(vp)->mi_zone); |
| sv_hostname = VTOR(vp)->r_server->sv_hostname; |
| |
| douprintf = 1; |
| args.file = *VTOFH3(vp); |
| fi.vp = vp; |
| fi.fhp = (caddr_t)&args.file; |
| fi.copyproc = nfs3copyfh; |
| fi.lookupproc = nfs3lookup; |
| fi.xattrdirproc = acl_getxattrdir3; |
| |
| res.uiop = uiop; |
| |
| res.wlist = NULL; |
| |
| offset = uiop->uio_loffset; |
| count = uiop->uio_resid; |
| |
| do { |
| if (mi->mi_io_kstats) { |
| mutex_enter(&mi->mi_lock); |
| kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); |
| mutex_exit(&mi->mi_lock); |
| } |
| |
| do { |
| tsize = MIN(mi->mi_tsize, count); |
| args.offset = (offset3)offset; |
| args.count = (count3)tsize; |
| res.size = (uint_t)tsize; |
| args.res_uiop = uiop; |
| args.res_data_val_alt = NULL; |
| |
| error = rfs3call(mi, NFSPROC3_READ, |
| xdr_READ3args, (caddr_t)&args, |
| xdr_READ3uiores, (caddr_t)&res, cr, |
| &douprintf, &res.status, 0, &fi); |
| } while (error == ENFS_TRYAGAIN); |
| |
| if (mi->mi_io_kstats) { |
| mutex_enter(&mi->mi_lock); |
| kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats)); |
| mutex_exit(&mi->mi_lock); |
| } |
| |
| if (error) |
| return (error); |
| |
| error = geterrno3(res.status); |
| if (error) |
| return (error); |
| |
| if (res.count != res.size) { |
| zcmn_err(getzoneid(), CE_WARN, |
| "nfs3_directio_read: server %s returned incorrect amount", |
| sv_hostname); |
| return (EIO); |
| } |
| count -= res.count; |
| offset += res.count; |
| if (mi->mi_io_kstats) { |
| mutex_enter(&mi->mi_lock); |
| KSTAT_IO_PTR(mi->mi_io_kstats)->reads++; |
| KSTAT_IO_PTR(mi->mi_io_kstats)->nread += res.count; |
| mutex_exit(&mi->mi_lock); |
| } |
| lwp_stat_update(LWP_STAT_INBLK, 1); |
| } while (count && !res.eof); |
| |
| return (0); |
| } |
| |
| /* ARGSUSED */ |
| static int |
| nfs3_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr, |
| caller_context_t *ct) |
| { |
| rnode_t *rp; |
| u_offset_t off; |
| offset_t diff; |
| int on; |
| size_t n; |
| caddr_t base; |
| uint_t flags; |
| int error = 0; |
| mntinfo_t *mi; |
| |
| rp = VTOR(vp); |
| mi = VTOMI(vp); |
| |
| ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER)); |
| |
| if (nfs_zone() != mi->mi_zone) |
| return (EIO); |
| |
| if (vp->v_type != VREG) |
| return (EISDIR); |
| |
| if (uiop->uio_resid == 0) |
| return (0); |
| |
| if (uiop->uio_loffset < 0 || uiop->uio_loffset + uiop->uio_resid < 0) |
| return (EINVAL); |
| |
| /* |
| * Bypass VM if caching has been disabled (e.g., locking) or if |
| * using client-side direct I/O and the file is not mmap'd and |
| * there are no cached pages. |
| */ |
| if ((vp->v_flag & VNOCACHE) || |
| (((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) && |
| rp->r_mapcnt == 0 && rp->r_inmap == 0 && |
| !vn_has_cached_data(vp))) { |
| return (nfs3_directio_read(vp, uiop, cr)); |
| } |
| |
| do { |
| off = uiop->uio_loffset & MAXBMASK; /* mapping offset */ |
| on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */ |
| n = MIN(MAXBSIZE - on, uiop->uio_resid); |
| |
| error = nfs3_validate_caches(vp, cr); |
| if (error) |
| break; |
| |
| mutex_enter(&rp->r_statelock); |
| while (rp->r_flags & RINCACHEPURGE) { |
| if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) { |
| mutex_exit(&rp->r_statelock); |
| return (EINTR); |
| } |
| } |
| diff = rp->r_size - uiop->uio_loffset; |
| mutex_exit(&rp->r_statelock); |
| if (diff <= 0) |
| break; |
| if (diff < n) |
| n = (size_t)diff; |
| |
| if (vpm_enable) { |
| /* |
| * Copy data. |
| */ |
| error = vpm_data_copy(vp, off + on, n, uiop, |
| 1, NULL, 0, S_READ); |
| } else { |
| base = segmap_getmapflt(segkmap, vp, off + on, n, 1, |
| S_READ); |
| |
| error = uiomove(base + on, n, UIO_READ, uiop); |
| } |
| |
| if (!error) { |
| /* |
| * If read a whole block or read to eof, |
| * won't need this buffer again soon. |
| */ |
| mutex_enter(&rp->r_statelock); |
| if (n + on == MAXBSIZE || |
| uiop->uio_loffset == rp->r_size) |
| flags = SM_DONTNEED; |
| else |
| flags = 0; |
| mutex_exit(&rp->r_statelock); |
| if (vpm_enable) { |
| error = vpm_sync_pages(vp, off, n, flags); |
| } else { |
| error = segmap_release(segkmap, base, flags); |
| } |
| } else { |
| if (vpm_enable) { |
| (void) vpm_sync_pages(vp, off, n, 0); |
| } else { |
| (void) segmap_release(segkmap, base, 0); |
| } |
| } |
| } while (!error && uiop->uio_resid > 0); |
| |
| return (error); |
| } |
| /* |
| * Write entry point (installed as .vop_write): write the data described |
| * by uiop to the regular file vp. Returns 0 on success or an errno value |
| * (EISDIR, EIO, EINTR, EINVAL, EFBIG, ESTALE, or whatever the lower-level |
| * write path reports). |
| * |
| * With FAPPEND, r_rwlock is re-taken as a writer if it was held as a |
| * reader, and the write offset is reset to the file size obtained via |
| * nfs3getattr(). The request is clipped to the file size limit |
| * (uio_llimit); the clipped byte count is kept in `remainder' and added |
| * back to uio_resid before returning. |
| * |
| * The data is either sent straight to the server through nfs3write() |
| * (the nfs3_fwrite path) or copied through the page cache one MAXBSIZE |
| * window at a time. r_lkserlock is held as reader across both paths and |
| * dropped at `bottom'. |
| */ |
| /* ARGSUSED */ |
| static int |
| nfs3_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr, |
| caller_context_t *ct) |
| { |
| rlim64_t limit = uiop->uio_llimit; |
| rnode_t *rp; |
| u_offset_t off; |
| caddr_t base; |
| uint_t flags; |
| int remainder; |
| size_t n; |
| int on; |
| int error; |
| int resid; |
| offset_t offset; |
| mntinfo_t *mi; |
| uint_t bsize; |
| |
| rp = VTOR(vp); |
| |
| if (vp->v_type != VREG) |
| return (EISDIR); |
| |
| mi = VTOMI(vp); |
| if (nfs_zone() != mi->mi_zone) |
| return (EIO); |
| if (uiop->uio_resid == 0) |
| return (0); |
| |
| if (ioflag & FAPPEND) { |
| struct vattr va; |
| |
| /* |
| * Must serialize if appending. |
| */ |
| if (nfs_rw_lock_held(&rp->r_rwlock, RW_READER)) { |
| nfs_rw_exit(&rp->r_rwlock); |
| if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, |
| INTR(vp))) |
| return (EINTR); |
| } |
| |
| va.va_mask = AT_SIZE; |
| error = nfs3getattr(vp, &va, cr); |
| if (error) |
| return (error); |
| uiop->uio_loffset = va.va_size; |
| } |
| /* |
| * `offset' first holds the end offset of the request for the overflow |
| * and limit checks; later it holds the saved starting offset of the chunk |
| * in progress so that it can be restored on error. |
| */ |
| offset = uiop->uio_loffset + uiop->uio_resid; |
| |
| if (uiop->uio_loffset < 0 || offset < 0) |
| return (EINVAL); |
| |
| if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) |
| limit = MAXOFFSET_T; |
| |
| /* |
| * Check to make sure that the process will not exceed |
| * its limit on file size. It is okay to write up to |
| * the limit, but not beyond. Thus, the write which |
| * reaches the limit will be short and the next write |
| * will return an error. |
| */ |
| remainder = 0; |
| if (offset > limit) { |
| remainder = offset - limit; |
| uiop->uio_resid = limit - uiop->uio_loffset; |
| if (uiop->uio_resid <= 0) { |
| proc_t *p = ttoproc(curthread); |
| |
| uiop->uio_resid += remainder; |
| mutex_enter(&p->p_lock); |
| (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], |
| p->p_rctls, p, RCA_UNSAFE_SIGINFO); |
| mutex_exit(&p->p_lock); |
| return (EFBIG); |
| } |
| } |
| |
| if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp))) |
| return (EINTR); |
| |
| /* |
| * Bypass VM if caching has been disabled (e.g., locking) or if |
| * using client-side direct I/O and the file is not mmap'd and |
| * there are no cached pages. |
| */ |
| if ((vp->v_flag & VNOCACHE) || |
| (((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) && |
| rp->r_mapcnt == 0 && rp->r_inmap == 0 && |
| !vn_has_cached_data(vp))) { |
| size_t bufsize; |
| int count; |
| u_offset_t org_offset; |
| stable_how stab_comm; |
| /* |
| * Also entered by `goto' from the cached path below when writerp() |
| * fails with EACCES. |
| */ |
| nfs3_fwrite: |
| if (rp->r_flags & RSTALE) { |
| resid = uiop->uio_resid; |
| offset = uiop->uio_loffset; |
| error = rp->r_error; |
| /* |
| * A close may have cleared r_error, if so, |
| * propagate ESTALE error return properly |
| */ |
| if (error == 0) |
| error = ESTALE; |
| goto bottom; |
| } |
| bufsize = MIN(uiop->uio_resid, mi->mi_stsize); |
| base = kmem_alloc(bufsize, KM_SLEEP); |
| do { |
| if (ioflag & FDSYNC) |
| stab_comm = DATA_SYNC; |
| else |
| stab_comm = FILE_SYNC; |
| resid = uiop->uio_resid; |
| offset = uiop->uio_loffset; |
| count = MIN(uiop->uio_resid, bufsize); |
| org_offset = uiop->uio_loffset; |
| error = uiomove(base, count, UIO_WRITE, uiop); |
| if (!error) { |
| error = nfs3write(vp, base, org_offset, |
| count, cr, &stab_comm); |
| } |
| } while (!error && uiop->uio_resid > 0); |
| kmem_free(base, bufsize); |
| goto bottom; |
| } |
| |
| /* |
| * Cached path: copy the data into the page cache one MAXBSIZE-aligned |
| * window at a time. |
| */ |
| bsize = vp->v_vfsp->vfs_bsize; |
| |
| do { |
| off = uiop->uio_loffset & MAXBMASK; /* mapping offset */ |
| on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */ |
| n = MIN(MAXBSIZE - on, uiop->uio_resid); |
| |
| resid = uiop->uio_resid; |
| offset = uiop->uio_loffset; |
| |
| if (rp->r_flags & RSTALE) { |
| error = rp->r_error; |
| /* |
| * A close may have cleared r_error, if so, |
| * propagate ESTALE error return properly |
| */ |
| if (error == 0) |
| error = ESTALE; |
| break; |
| } |
| |
| /* |
| * Don't create dirty pages faster than they |
| * can be cleaned so that the system doesn't |
| * get imbalanced. If the async queue is |
| * maxed out, then wait for it to drain before |
| * creating more dirty pages. Also, wait for |
| * any threads doing pagewalks in the vop_getattr |
| * entry points so that they don't block for |
| * long periods. |
| */ |
| mutex_enter(&rp->r_statelock); |
| while ((mi->mi_max_threads != 0 && |
| rp->r_awcount > 2 * mi->mi_max_threads) || |
| rp->r_gcount > 0) { |
| if (INTR(vp)) { |
| klwp_t *lwp = ttolwp(curthread); |
| |
| if (lwp != NULL) |
| lwp->lwp_nostop++; |
| if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) { |
| mutex_exit(&rp->r_statelock); |
| if (lwp != NULL) |
| lwp->lwp_nostop--; |
| error = EINTR; |
| goto bottom; |
| } |
| if (lwp != NULL) |
| lwp->lwp_nostop--; |
| } else |
| cv_wait(&rp->r_cv, &rp->r_statelock); |
| } |
| mutex_exit(&rp->r_statelock); |
| |
| /* |
| * Touch the page and fault it in if it is not in core |
| * before segmap_getmapflt or vpm_data_copy can lock it. |
| * This is to avoid the deadlock if the buffer is mapped |
| * to the same file through mmap which we want to write. |
| */ |
| uio_prefaultpages((long)n, uiop); |
| |
| if (vpm_enable) { |
| /* |
| * It will use kpm mappings, so no need to |
| * pass an address. |
| */ |
| error = writerp(rp, NULL, n, uiop, 0); |
| } else { |
| if (segmap_kpm) { |
| int pon = uiop->uio_loffset & PAGEOFFSET; |
| size_t pn = MIN(PAGESIZE - pon, |
| uiop->uio_resid); |
| int pagecreate; |
| |
| mutex_enter(&rp->r_statelock); |
| pagecreate = (pon == 0) && (pn == PAGESIZE || |
| uiop->uio_loffset + pn >= rp->r_size); |
| mutex_exit(&rp->r_statelock); |
| |
| base = segmap_getmapflt(segkmap, vp, off + on, |
| pn, !pagecreate, S_WRITE); |
| |
| error = writerp(rp, base + pon, n, uiop, |
| pagecreate); |
| |
| } else { |
| base = segmap_getmapflt(segkmap, vp, off + on, |
| n, 0, S_READ); |
| error = writerp(rp, base + on, n, uiop, 0); |
| } |
| } |
| |
| if (!error) { |
| if (mi->mi_flags & MI_NOAC) |
| flags = SM_WRITE; |
| else if ((uiop->uio_loffset % bsize) == 0 || |
| IS_SWAPVP(vp)) { |
| /* |
| * Have written a whole block. |
| * Start an asynchronous write |
| * and mark the buffer to |
| * indicate that it won't be |
| * needed again soon. |
| */ |
| flags = SM_WRITE | SM_ASYNC | SM_DONTNEED; |
| } else |
| flags = 0; |
| if ((ioflag & (FSYNC|FDSYNC)) || |
| (rp->r_flags & ROUTOFSPACE)) { |
| flags &= ~SM_ASYNC; |
| flags |= SM_WRITE; |
| } |
| if (vpm_enable) { |
| error = vpm_sync_pages(vp, off, n, flags); |
| } else { |
| error = segmap_release(segkmap, base, flags); |
| } |
| } else { |
| if (vpm_enable) { |
| (void) vpm_sync_pages(vp, off, n, 0); |
| } else { |
| (void) segmap_release(segkmap, base, 0); |
| } |
| /* |
| * In the event that we got an access error while |
| * faulting in a page for a write-only file just |
| * force a write. |
| */ |
| if (error == EACCES) |
| goto nfs3_fwrite; |
| } |
| } while (!error && uiop->uio_resid > 0); |
| /* |
| * On failure, rewind the uio to where the failed chunk started (resid and |
| * offset were saved before each chunk was attempted). In both cases give |
| * back the bytes that were clipped off for the file size limit. |
| */ |
| bottom: |
| if (error) { |
| uiop->uio_resid = resid + remainder; |
| uiop->uio_loffset = offset; |
| } else |
| uiop->uio_resid += remainder; |
| |
| nfs_rw_exit(&rp->r_lkserlock); |
| |
| return (error); |
| } |
| |
| /* |
| * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED} |
| */ |
| static int |
| nfs3_rdwrlbn(vnode_t *vp, page_t *pp, u_offset_t off, size_t len, |
| int flags, cred_t *cr) |
| { |
| struct buf *bp; |
| int error; |
| page_t *savepp; |
| uchar_t fsdata; |
| stable_how stab_comm; |
| |
| ASSERT(nfs_zone() == VTOMI(vp)->mi_zone); |
| bp = pageio_setup(pp, len, vp, flags); |
| ASSERT(bp != NULL); |
| |
| /* |
| * pageio_setup should have set b_addr to 0. This |
| * is correct since we want to do I/O on a page |
| * boundary. bp_mapin will use this addr to calculate |
| * an offset, and then set b_addr to the kernel virtual |
| * address it allocated for us. |
| */ |
| ASSERT(bp->b_un.b_addr == 0); |
| |
| bp->b_edev = 0; |
| bp->b_dev = 0; |
| bp->b_lblkno = lbtodb(off); |
| bp->b_file = vp; |
| bp->b_offset = (offset_t)off; |
| bp_mapin(bp); |
| |
| /* |
| * Calculate the desired level of stability to write data |
| * on the server and then mark all of the pages to reflect |
| * this. |
| */ |
| if ((flags & (B_WRITE|B_ASYNC)) == (B_WRITE|B_ASYNC) && |
| freemem > desfree) { |
| stab_comm = UNSTABLE; |
| fsdata = C_DELAYCOMMIT; |
| } else { |
| stab_comm = FILE_SYNC; |
| fsdata = C_NOCOMMIT; |
| } |
| |
| savepp = pp; |
| do { |
| pp->p_fsdata = fsdata; |
| } while ((pp = pp->p_next) != savepp); |
| |
| error = nfs3_bio(bp, &stab_comm, cr); |
| |
| bp_mapout(bp); |
| pageio_done(bp); |
| |
| /* |
| * If the server wrote pages in a more stable fashion than |
| * was requested, then clear all of the marks in the pages |
| * indicating that COMMIT operations were required. |
| */ |
| if (stab_comm != UNSTABLE && fsdata == C_DELAYCOMMIT) { |
| do { |
| pp->p_fsdata = C_NOCOMMIT; |
| } while ((pp = pp->p_next) != savepp); |
| } |
| |
| return (error); |
| } |
| |
| /* |
| * Write to file. Writes to remote server in largest size |
| * chunks that the server can handle. Write is synchronous. |
| */ |
| static int |
| nfs3write(vnode_t *vp, caddr_t base, u_offset_t offset, int count, cred_t *cr, |
| stable_how *stab_comm) |
| { |
| mntinfo_t *mi; |
| WRITE3args args; |
| WRITE3res res; |
| int error; |
| int tsize; |
| rnode_t *rp; |
| int douprintf; |
| |
| rp = VTOR(vp); |
| mi = VTOMI(vp); |
| |
| ASSERT(nfs_zone() == mi->mi_zone); |
| |
| args.file = *VTOFH3(vp); |
| args.stable = *stab_comm; |
| |
| *stab_comm = FILE_SYNC; |
| |
| douprintf = 1; |
| |
| do { |
| if ((vp->v_flag & VNOCACHE) || |
| (rp->r_flags & RDIRECTIO) || |
| (mi->mi_flags & MI_DIRECTIO)) |
| tsize = MIN(mi->mi_stsize, count); |
| else |
| tsize = MIN(mi->mi_curwrite, count); |
| args.offset = (offset3)offset; |
| args.count = (count3)tsize; |
| args.data.data_len = (uint_t)tsize; |
| args.data.data_val = base; |
| |
| if (mi->mi_io_kstats) { |
| mutex_enter(&mi->mi_lock); |
| kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); |
| mutex_exit(&mi->mi_lock); |
| } |
| args.mblk = NULL; |
| do { |
| error = rfs3call(mi, NFSPROC3_WRITE, |
| xdr_WRITE3args, (caddr_t)&args, |
| xdr_WRITE3res, (caddr_t)&res, cr, |
| &douprintf, &res.status, 0, NULL); |
| } while (error == ENFS_TRYAGAIN); |
| if (mi->mi_io_kstats) { |
| mutex_enter(&mi->mi_lock); |
| kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats)); |
| mutex_exit(&mi->mi_lock); |
| } |
| |
| if (error) |
| return (error); |
| error = geterrno3(res.status); |
| if (!error) { |
| if (res.resok.count > args.count) { |
| zcmn_err(getzoneid(), CE_WARN, |
| "nfs3write: server %s wrote %u, " |
| "requested was %u", |
| rp->r_server->sv_hostname, |
| res.resok.count, args.count); |
| return (EIO); |
| } |
| if (res.resok.committed == UNSTABLE) { |
| *stab_comm = UNSTABLE; |
| if (args.stable == DATA_SYNC || |
| args.stable == FILE_SYNC) { |
| zcmn_err(getzoneid(), CE_WARN, |
| "nfs3write: server %s did not commit to stable storage", |
| rp->r_server->sv_hostname); |
| return (EIO); |
| } |
| } |
| tsize = (int)res.resok.count; |
| count -= tsize; |
| base += tsize; |
| offset += tsize; |
| if (mi->mi_io_kstats) { |
| mutex_enter(&mi->mi_lock); |
| KSTAT_IO_PTR(mi->mi_io_kstats)->writes++; |
| KSTAT_IO_PTR(mi->mi_io_kstats)->nwritten += |
| tsize; |
| mutex_exit(&mi->mi_lock); |
| } |
| lwp_stat_update(LWP_STAT_OUBLK, 1); |
| mutex_enter(&rp->r_statelock); |
| if (rp->r_flags & RHAVEVERF) { |
| if (rp->r_verf != res.resok.verf) { |
| nfs3_set_mod(vp); |
| rp->r_verf = res.resok.verf; |
| /* |
| * If the data was written UNSTABLE, |
| * then might as well stop because |
| * the whole block will have to get |
| * rewritten anyway. |
| */ |
| if (*stab_comm == UNSTABLE) { |
| mutex_exit(&rp->r_statelock); |
| break; |
| } |
| } |
| } else { |
| rp->r_verf = res.resok.verf; |
| rp->r_flags |= RHAVEVERF; |
| } |
| /* |
| * Mark the attribute cache as timed out and |
| * set RWRITEATTR to indicate that the file |
| * was modified with a WRITE operation and |
| * that the attributes can not be trusted. |
| */ |
| PURGE_ATTRCACHE_LOCKED(rp); |
| rp->r_flags |= RWRITEATTR; |
| mutex_exit(&rp->r_statelock); |
| } |
| } while (!error && count); |
| |
| return (error); |
| } |
| |
| /* |
| * Read from a file. Reads data in largest chunks our interface can handle. |
| */ |
| static int |
| nfs3read(vnode_t *vp, caddr_t base, offset_t offset, int count, size_t *residp, |
| cred_t *cr) |
| { |
| mntinfo_t *mi; |
| READ3args args; |
| READ3vres res; |
| int tsize; |
| int error; |
| int douprintf; |
| failinfo_t fi; |
| rnode_t *rp; |
| struct vattr va; |
| hrtime_t t; |
| |
| rp = VTOR(vp); |
| mi = VTOMI(vp); |
| ASSERT(nfs_zone() == mi->mi_zone); |
| douprintf = 1; |
| |
| args.file = *VTOFH3(vp); |
| fi.vp = vp; |
| fi.fhp = (caddr_t)&args.file; |
| fi.copyproc = nfs3copyfh; |
| fi.lookupproc = nfs3lookup; |
| fi.xattrdirproc = acl_getxattrdir3; |
| |
| res.pov.fres.vp = vp; |
| res.pov.fres.vap = &va; |
| |
| res.wlist = NULL; |
| *residp = count; |
| do { |
| if (mi->mi_io_kstats) { |
| mutex_enter(&mi->mi_lock); |
| kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); |
| mutex_exit(&mi->mi_lock); |
| } |
| |
| do { |
| if ((vp->v_flag & VNOCACHE) || |
| (rp->r_flags & RDIRECTIO) || |
| (mi->mi_flags & MI_DIRECTIO)) |
| tsize = MIN(mi->mi_tsize, count); |
| else |
| tsize = MIN(mi->mi_curread, count); |
| res.data.data_val = base; |
| res.data.data_len = tsize; |
| args.offset = (offset3)offset; |
| args.count = (count3)tsize; |
| args.res_uiop = NULL; |
| args.res_data_val_alt = base; |
| |
| t = gethrtime(); |
| error = rfs3call(mi, NFSPROC3_READ, |
| xdr_READ3args, (caddr_t)&args, |
| xdr_READ3vres, (caddr_t)&res, cr, |
| &douprintf, &res.status, 0, &fi); |
| } while (error == ENFS_TRYAGAIN); |
| |
| if (mi->mi_io_kstats) { |
| mutex_enter(&mi->mi_lock); |
| kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats)); |
| mutex_exit(&mi->mi_lock); |
| } |
| |
| if (error) |
| return (error); |
| |
| error = geterrno3(res.status); |
| if (error) |
| return (error); |
| |
| if (res.count != res.data.data_len) { |
| zcmn_err(getzoneid(), CE_WARN, |
| "nfs3read: server %s returned incorrect amount", |
| rp->r_server->sv_hostname); |
| return (EIO); |
| } |
| |
| count -= res.count; |
| *residp = count; |
| base += res.count; |
| offset += res.count; |
| if (mi->mi_io_kstats) { |
| mutex_enter(&mi->mi_lock); |
| KSTAT_IO_PTR(mi->mi_io_kstats)->reads++; |
| KSTAT_IO_PTR(mi->mi_io_kstats)->nread += res.count; |
| mutex_exit(&mi->mi_lock); |
| } |
| lwp_stat_update(LWP_STAT_INBLK, 1); |
| } while (count && !res.eof); |
| |
| if (res.pov.attributes) { |
| mutex_enter(&rp->r_statelock); |
| if (!CACHE_VALID(rp, va.va_mtime, va.va_size)) { |
| mutex_exit(&rp->r_statelock); |
| PURGE_ATTRCACHE(vp); |
| } else { |
| if (rp->r_mtime <= t) |
| nfs_attrcache_va(vp, &va); |
| mutex_exit(&rp->r_statelock); |
| } |
| } |
| |
| return (0); |
| } |
| |
| /* ARGSUSED */ |
| static int |
| nfs3_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp, |
| caller_context_t *ct) |
| { |
| |
| if (nfs_zone() != VTOMI(vp)->mi_zone) |
| return (EIO); |
| switch (cmd) { |
| case _FIODIRECTIO: |
| return (nfs_directio(vp, (int)arg, cr)); |
| default: |
| return (ENOTTY); |
| } |
| } |
| |
| /* ARGSUSED */ |
| static int |
| nfs3_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr, |
| caller_context_t *ct) |
| { |
| int error; |
| rnode_t *rp; |
| |
| if (nfs_zone() != VTOMI(vp)->mi_zone) |
| return (EIO); |
| /* |
| * If it has been specified that the return value will |
| * just be used as a hint, and we are only being asked |
| * for size, fsid or rdevid, then return the client's |
| * notion of these values without checking to make sure |
| * that the attribute cache is up to date. |
| * The whole point is to avoid an over the wire GETATTR |
| * call. |
| */ |
| rp = VTOR(vp); |
| if (flags & ATTR_HINT) { |
| if (vap->va_mask == |
| (vap->va_mask & (AT_SIZE | AT_FSID | AT_RDEV))) { |
| mutex_enter(&rp->r_statelock); |
| if (vap->va_mask | AT_SIZE) |
| vap->va_size = rp->r_size; |
| if (vap->va_mask | AT_FSID) |
| vap->va_fsid = rp->r_attr.va_fsid; |
| if (vap->va_mask | AT_RDEV) |
| vap->va_rdev = rp->r_attr.va_rdev; |
| mutex_exit(&rp->r_statelock); |
| return (0); |
| } |
| } |
| |
| /* |
| * Only need to flush pages if asking for the mtime |
| * and if there any dirty pages or any outstanding |
| * asynchronous (write) requests for this file. |
| */ |
| if (vap->va_mask & AT_MTIME) { |
| if (vn_has_cached_data(vp) && |
| ((rp->r_flags & RDIRTY) || rp->r_awcount > 0)) { |
| mutex_enter(&rp->r_statelock); |
| rp->r_gcount++; |
| mutex_exit(&rp->r_statelock); |
| error = nfs3_putpage(vp, (offset_t)0, 0, 0, cr, ct); |
| mutex_enter(&rp->r_statelock); |
| if (error && (error == ENOSPC || error == EDQUOT)) { |
| if (!rp->r_error) |
| rp->r_error = error; |
| } |
| if (--rp->r_gcount == 0) |
| cv_broadcast(&rp->r_cv); |
| mutex_exit(&rp->r_statelock); |
| } |
| } |
| |
| return (nfs3getattr(vp, vap, cr)); |
| } |
| |
| /*ARGSUSED4*/ |
| static int |
| nfs3_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr, |
| caller_context_t *ct) |
| { |
| int error; |
| struct vattr va; |
| |
| if (vap->va_mask & AT_NOSET) |
| return (EINVAL); |
| if (nfs_zone() != VTOMI(vp)->mi_zone) |
| return (EIO); |
| |
| va.va_mask = AT_UID | AT_MODE; |
| error = nfs3getattr(vp, &va, cr); |
| if (error) |
| return (error); |
| |
| error = secpolicy_vnode_setattr(cr, vp, vap, &va, flags, nfs3_accessx, |
| vp); |
| if (error) |
| return (error); |
| |
| error = nfs3setattr(vp, vap, flags, cr); |
| |
| if (error == 0 && (vap->va_mask & AT_SIZE) && vap->va_size == 0) |
| vnevent_truncate(vp, ct); |
| |
| return (error); |
| } |
| |
| static int |
| nfs3setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr) |
| { |
| int error; |
| uint_t mask; |
| SETATTR3args args; |
| SETATTR3res res; |
| int douprintf; |
| rnode_t *rp; |
| struct vattr va; |
| mode_t omode; |
| vsecattr_t *vsp; |
| hrtime_t t; |
| |
| ASSERT(nfs_zone() == VTOMI(vp)->mi_zone); |
| mask = vap->va_mask; |
| |
| rp = VTOR(vp); |
| |
| /* |
| * Only need to flush pages if there are any pages and |
| * if the file is marked as dirty in some fashion. The |
| * file must be flushed so that we can accurately |
| * determine the size of the file and the cached data |
| * after the SETATTR returns. A file is considered to |
| * be dirty if it is either marked with RDIRTY, has |
| * outstanding i/o's active, or is mmap'd. In this |
| * last case, we can't tell whether there are dirty |
| * pages, so we flush just to be sure. |
| */ |
| if (vn_has_cached_data(vp) && |
| ((rp->r_flags & RDIRTY) || |
| rp->r_count > 0 || |
| rp->r_mapcnt > 0)) { |
| ASSERT(vp->v_type != VCHR); |
| error = nfs3_putpage(vp, (offset_t)0, 0, 0, cr, NULL); |
| if (error && (error == ENOSPC || error == EDQUOT)) { |
| mutex_enter(&rp->r_statelock); |
| if (!rp->r_error) |
| rp->r_error = error; |
| mutex_exit(&rp->r_statelock); |
| } |
| } |
| |
| args.object = *RTOFH3(rp); |
| /* |
| * If the intent is for the server to set the times, |
| * there is no point in have the mask indicating set mtime or |
| * atime, because the vap values may be junk, and so result |
| * in an overflow error. Remove these flags from the vap mask |
| * before calling in this case, and restore them afterwards. |
| */ |
| if ((mask & (AT_ATIME | AT_MTIME)) && !(flags & ATTR_UTIME)) { |
| /* Use server times, so don't set the args time fields */ |
| vap->va_mask &= ~(AT_ATIME | AT_MTIME); |
| error = vattr_to_sattr3(vap, &args.new_attributes); |
| vap->va_mask |= (mask & (AT_ATIME | AT_MTIME)); |
| if (mask & AT_ATIME) { |
| args.new_attributes.atime.set_it = SET_TO_SERVER_TIME; |
| } |
| if (mask & AT_MTIME) { |
| args.new_attributes.mtime.set_it = SET_TO_SERVER_TIME; |
| } |
| } else { |
| /* Either do not set times or use the client specified times */ |
| error = vattr_to_sattr3(vap, &args.new_attributes); |
| } |
| |
| if (error) { |
| /* req time field(s) overflow - return immediately */ |
| return (error); |
| } |
| |
| va.va_mask = AT_MODE | AT_CTIME; |
| error = nfs3getattr(vp, &va, cr); |
| if (error) |
| return (error); |
| omode = va.va_mode; |
| |
| tryagain: |
| if (mask & AT_SIZE) { |
| args.guard.check = TRUE; |
| args.guard.obj_ctime.seconds = va.va_ctime.tv_sec; |
| args.guard.obj_ctime.nseconds = va.va_ctime.tv_nsec; |
| } else |
| args.guard.check = FALSE; |
| |
| douprintf = 1; |
| |
| t = gethrtime(); |
| |
| error = rfs3call(VTOMI(vp), NFSPROC3_SETATTR, |
| xdr_SETATTR3args, (caddr_t)&args, |
| xdr_SETATTR3res, (caddr_t)&res, cr, |
| &douprintf, &res.status, 0, NULL); |
| |
| /* |
| * Purge the access cache and ACL cache if changing either the |
| * owner of the file, the group owner, or the mode. These may |
| * change the access permissions of the file, so purge old |
| * information and start over again. |
| */ |
| if (mask & (AT_UID | AT_GID | AT_MODE)) { |
| (void) nfs_access_purge_rp(rp); |
| if (rp->r_secattr != NULL) { |
| mutex_enter(&rp->r_statelock); |
| vsp = rp->r_secattr; |
| rp->r_secattr = NULL; |
| mutex_exit(&rp->r_statelock); |
| if (vsp != NULL) |
| nfs_acl_free(vsp); |
| } |
| } |
| |
| if (error) { |
| PURGE_ATTRCACHE(vp); |
| return (error); |
| } |
| |
| error = geterrno3(res.status); |
| if (!error) { |
| /* |
| * If changing the size of the file, invalidate |
| * any local cached data which is no longer part |
| * of the file. We also possibly invalidate the |
| * last page in the file. We could use |
| * pvn_vpzero(), but this would mark the page as |
| * modified and require it to be written back to |
| * the server for no particularly good reason. |
| * This way, if we access it, then we bring it |
| * back in. A read should be cheaper than a |
| * write. |
| */ |
| if (mask & AT_SIZE) { |
| nfs_invalidate_pages(vp, |
| (vap->va_size & PAGEMASK), cr); |
| } |
| nfs3_cache_wcc_data(vp, &res.resok.obj_wcc, t, cr); |
| /* |
| * Some servers will change the mode to clear the setuid |
| * and setgid bits when changing the uid or gid. The |
| * client needs to compensate appropriately. |
| */ |
| if (mask & (AT_UID | AT_GID)) { |
| int terror; |
| |
| va.va_mask = AT_MODE; |
| terror = nfs3getattr(vp, &va, cr); |
| if (!terror && |
| (((mask & AT_MODE) && va.va_mode != vap->va_mode) || |
| (!(mask & AT_MODE) && va.va_mode != omode))) { |
| va.va_mask = AT_MODE; |
| if (mask & AT_MODE) |
| va.va_mode = vap->va_mode; |
| else |
| va.va_mode = omode; |
| (void) nfs3setattr(vp, &va, 0, cr); |
| } |
| } |
| } else { |
| nfs3_cache_wcc_data(vp, &res.resfail.obj_wcc, t, cr); |
| /* |
| * If we got back a "not synchronized" error, then |
| * we need to retry with a new guard value. The |
| * guard value used is the change time. If the |
| * server returned post_op_attr, then we can just |
| * retry because we have the latest attributes. |
| * Otherwise, we issue a GETATTR to get the latest |
| * attributes and then retry. If we couldn't get |
| * the attributes this way either, then we give |
| * up because we can't complete the operation as |
| * required. |
| */ |
| if (res.status == NFS3ERR_NOT_SYNC) { |
| va.va_mask = AT_CTIME; |
| if (nfs3getattr(vp, &va, cr) == 0) |
| goto tryagain; |
| } |
| PURGE_STALE_FH(error, vp, cr); |
| } |
| |
| return (error); |
| } |
| |
| static int |
| nfs3_accessx(void *vp, int mode, cred_t *cr) |
| { |
| ASSERT(nfs_zone() == VTOMI((vnode_t *)vp)->mi_zone); |
| return (nfs3_access(vp, mode, 0, cr, NULL)); |
| } |
| |
| /* ARGSUSED */ |
| static int |
| nfs3_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct) |
| { |
| int error; |
| ACCESS3args args; |
| ACCESS3res res; |
| int douprintf; |
| uint32 acc; |
| rnode_t *rp; |
| cred_t *cred, *ncr, *ncrfree = NULL; |
| failinfo_t fi; |
| nfs_access_type_t cacc; |
| hrtime_t t; |
| |
| acc = 0; |
| if (nfs_zone() != VTOMI(vp)->mi_zone) |
| return (EIO); |
| if (mode & VREAD) |
| acc |= ACCESS3_READ; |
| if (mode & VWRITE) { |
| if (vn_is_readonly(vp) && !IS_DEVVP(vp)) |
| return (EROFS); |
| if (vp->v_type == VDIR) |
| acc |= ACCESS3_DELETE; |
| acc |= ACCESS3_MODIFY | ACCESS3_EXTEND; |
| } |
| if (mode & VEXEC) { |
| if (vp->v_type == VDIR) |
| acc |= ACCESS3_LOOKUP; |
| else |
| acc |= ACCESS3_EXECUTE; |
| } |
| |
| rp = VTOR(vp); |
| args.object = *VTOFH3(vp); |
| if (vp->v_type == VDIR) { |
| args.access = ACCESS3_READ | ACCESS3_DELETE | ACCESS3_MODIFY | |
| ACCESS3_EXTEND | ACCESS3_LOOKUP; |
| } else { |
| args.access = ACCESS3_READ | ACCESS3_MODIFY | ACCESS3_EXTEND | |
| ACCESS3_EXECUTE; |
| } |
| fi.vp = vp; |
| fi.fhp = (caddr_t)&args.object; |
| fi.copyproc = nfs3copyfh; |
| fi.lookupproc = nfs3lookup; |
| fi.xattrdirproc = acl_getxattrdir3; |
| |
| cred = cr; |
| /* |
| * ncr and ncrfree both initially |
| * point to the memory area returned |
| * by crnetadjust(); |
| * ncrfree not NULL when exiting means |
| * that we need to release it |
| */ |
| ncr = crnetadjust(cred); |
| ncrfree = ncr; |
| tryagain: |
| if (rp->r_acache != NULL) { |
| cacc = nfs_access_check(rp, acc, cred); |
| if (cacc == NFS_ACCESS_ALLOWED) { |
| if (ncrfree != NULL) |
| crfree(ncrfree); |
| return (0); |
| } |
| if (cacc == NFS_ACCESS_DENIED) { |
| /* |
| * If the cred can be adjusted, try again |
| * with the new cred. |
| */ |
| if (ncr != NULL) { |
| cred = ncr; |
| ncr = NULL; |
| goto tryagain; |
| } |
| if (ncrfree != NULL) |
| crfree(ncrfree); |
| return (EACCES); |
| } |
| } |
| |
| douprintf = 1; |
| |
| t = gethrtime(); |
| |
| error = rfs3call(VTOMI(vp), NFSPROC3_ACCESS, |
| xdr_ACCESS3args, (caddr_t)&args, |
| xdr_ACCESS3res, (caddr_t)&res, cred, |
| &douprintf, &res.status, 0, &fi); |
| |
| if (error) { |
| if (ncrfree != NULL) |
| crfree(ncrfree); |
| return (error); |
| } |
| |
| error = geterrno3(res.status); |
| if (!error) { |
| nfs3_cache_post_op_attr(vp, &res.resok.obj_attributes, t, cr); |
| nfs_access_cache(rp, args.access, res.resok.access, cred); |
| /* |
| * we just cached results with cred; if cred is the |
| * adjusted credentials from crnetadjust, we do not want |
| * to release them before exiting: hence setting ncrfree |
| * to NULL |
| */ |
| if (cred != cr) |
| ncrfree = NULL; |
| if ((acc & res.resok.access) != acc) { |
| /* |
| * If the cred can be adjusted, try again |
| * with the new cred. |
| */ |
| if (ncr != NULL) { |
| cred = ncr; |
| ncr = NULL; |
| goto tryagain; |
| } |
| error = EACCES; |
| } |
| } else { |
| nfs3_cache_post_op_attr(vp, &res.resfail.obj_attributes, t, cr); |
| PURGE_STALE_FH(error, vp, cr); |
| } |
| |
| if (ncrfree != NULL) |
| crfree(ncrfree); |
| |
| return (error); |
| } |
| |
| static int nfs3_do_symlink_cache = 1; |
| |
| /* ARGSUSED */ |
| static int |
| nfs3_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr, caller_context_t *ct) |
| { |
| int error; |
| READLINK3args args; |
| READLINK3res res; |
| nfspath3 resdata_backup; |
| rnode_t *rp; |
| int douprintf; |
| int len; |
| failinfo_t fi; |
| hrtime_t t; |
| |
| /* |
| * Can't readlink anything other than a symbolic link. |
| */ |
| if (vp->v_type != VLNK) |
| return (EINVAL); |
| if (nfs_zone() != VTOMI(vp)->mi_zone) |
| return (EIO); |
| |
| rp = VTOR(vp); |
| if (nfs3_do_symlink_cache && rp->r_symlink.contents != NULL) { |
| error = nfs3_validate_caches(vp, cr); |
| if (error) |
| return (error); |
| mutex_enter(&rp->r_statelock); |
| if (rp->r_symlink.contents != NULL) { |
| error = uiomove(rp->r_symlink.contents, |
| rp->r_symlink.len, UIO_READ, uiop); |
| mutex_exit(&rp->r_statelock); |
| return (error); |
| } |
| mutex_exit(&rp->r_statelock); |
| } |
| |
| args.symlink = *VTOFH3(vp); |
| fi.vp = vp; |
| fi.fhp = (caddr_t)&args.symlink; |
| fi.copyproc = nfs3copyfh; |
| fi.lookupproc = nfs3lookup; |
| fi.xattrdirproc = acl_getxattrdir3; |
| |
| res.resok.data = kmem_alloc(MAXPATHLEN, KM_SLEEP); |
| |
| resdata_backup = res.resok.data; |
| |
| douprintf = 1; |
| |
| t = gethrtime(); |
| |
| error = rfs3call(VTOMI(vp), NFSPROC3_READLINK, |
| xdr_READLINK3args, (caddr_t)&args, |
| xdr_READLINK3res, (caddr_t)&res, cr, |
| &douprintf, &res.status, 0, &fi); |
| |
| if (res.resok.data == nfs3nametoolong) |
| error = EINVAL; |
| |
| if (error) { |
| kmem_free(resdata_backup, MAXPATHLEN); |
| return (error); |
| } |
| |
| error = geterrno3(res.status); |
| if (!error) { |
| nfs3_cache_post_op_attr(vp, &res.resok.symlink_attributes, t, |
| cr); |
| len = strlen(res.resok.data); |
| error = uiomove(res.resok.data, len, UIO_READ, uiop); |
| if (nfs3_do_symlink_cache && rp->r_symlink.contents == NULL) { |
| mutex_enter(&rp->r_statelock); |
| if (rp->r_symlink.contents == NULL) { |
| rp->r_symlink.contents = res.resok.data; |
| rp->r_symlink.len = len; |
| rp->r_symlink.size = MAXPATHLEN; |
| mutex_exit(&rp->r_statelock); |
| } else { |
| mutex_exit(&rp->r_statelock); |
| |
| kmem_free((void *)res.resok.data, MAXPATHLEN); |
| } |
| } else { |
| kmem_free((void *)res.resok.data, MAXPATHLEN); |
| } |
| } else { |
| nfs3_cache_post_op_attr(vp, |
| &res.resfail.symlink_attributes, t, cr); |
| PURGE_STALE_FH(error, vp, cr); |
| |
| kmem_free((void *)res.resok.data, MAXPATHLEN); |
| |
| } |
| |
| /* |
| * The over the wire error for attempting to readlink something |
| * other than a symbolic link is ENXIO. However, we need to |
| * return EINVAL instead of ENXIO, so we map it here. |
| */ |
| return (error == ENXIO ? EINVAL : error); |
| } |
| |
| /* |
| * Flush local dirty pages to stable storage on the server. |
| * |
| * If FNODSYNC is specified, then there is nothing to do because |
| * metadata changes are not cached on the client before being |
| * sent to the server. |
| */ |
| /* ARGSUSED */ |
| static int |
| nfs3_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct) |
| { |
| int error; |
| |
| if ((syncflag & FNODSYNC) || IS_SWAPVP(vp)) |
| return (0); |
| if (nfs_zone() != VTOMI(vp)->mi_zone) |
| return (EIO); |
| |
| error = nfs3_putpage_commit(vp, (offset_t)0, 0, cr); |
| if (!error) |
| error = VTOR(vp)->r_error; |
| return (error); |
| } |
| |
| /* |
| * Weirdness: if the file was removed or the target of a rename |
| * operation while it was open, it got renamed instead. Here we |
| * remove the renamed file. |
| */ |
| /* ARGSUSED */ |
| static void |
| nfs3_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) |
| { |
| rnode_t *rp; |
| |
| ASSERT(vp != DNLC_NO_VNODE); |
| |
| /* |
| * If this is coming from the wrong zone, we let someone in the right |
| * zone take care of it asynchronously. We can get here due to |
| * VN_RELE() being called from pageout() or fsflush(). This call may |
| * potentially turn into an expensive no-op if, for instance, v_count |
| * gets incremented in the meantime, but it's still correct. |
| */ |
| if (nfs_zone() != VTOMI(vp)->mi_zone) { |
| nfs_async_inactive(vp, cr, nfs3_inactive); |
| return; |
| } |
| |
| rp = VTOR(vp); |
| redo: |
| if (rp->r_unldvp != NULL) { |
| /* |
| * Save the vnode pointer for the directory where the |
| * unlinked-open file got renamed, then set it to NULL |
| * to prevent another thread from getting here before |
| * we're done with the remove. While we have the |
| * statelock, make local copies of the pertinent rnode |
| * fields. If we weren't to do this in an atomic way, the |
| * the unl* fields could become inconsistent with respect |
| * to each other due to a race condition between this |
| * code and nfs_remove(). See bug report 1034328. |
| */ |
| mutex_enter(&rp->r_statelock); |
| if (rp->r_unldvp != NULL) { |
| vnode_t *unldvp; |
| char *unlname; |
| cred_t *unlcred; |
| REMOVE3args args; |
| REMOVE3res res; |
| int douprintf; |
| int error; |
| hrtime_t t; |
| |
| unldvp = rp->r_unldvp; |
| rp->r_unldvp = NULL; |
| unlname = rp->r_unlname; |
| rp->r_unlname = NULL; |
| unlcred = rp->r_unlcred; |
| rp->r_unlcred = NULL; |
| mutex_exit(&rp->r_statelock); |
| |
| /* |
| * If there are any dirty pages left, then flush |
| * them. This is unfortunate because they just |
| * may get thrown away during the remove operation, |
| * but we have to do this for correctness. |
| */ |
| if (vn_has_cached_data(vp) && |
| ((rp->r_flags & RDIRTY) || rp->r_count > 0)) { |
| ASSERT(vp->v_type != VCHR); |
| error = nfs3_putpage(vp, (offset_t)0, 0, 0, |
| cr, ct); |
| if (error) { |
| mutex_enter(&rp->r_statelock); |
| if (!rp->r_error) |
| rp->r_error = error; |
| mutex_exit(&rp->r_statelock); |
| } |
| } |
| |
| /* |
| * Do the remove operation on the renamed file |
| */ |
| setdiropargs3(&args.object, unlname, unldvp); |
| |
| douprintf = 1; |
| |
| t = gethrtime(); |
| |
| error = rfs3call(VTOMI(unldvp), NFSPROC3_REMOVE, |
| xdr_diropargs3, (caddr_t)&args, |
| xdr_REMOVE3res, (caddr_t)&res, unlcred, |
| &douprintf, &res.status, 0, NULL); |
| |
| if (error) { |
| PURGE_ATTRCACHE(unldvp); |
| } else { |
| error = geterrno3(res.status); |
| if (!error) { |
| nfs3_cache_wcc_data(unldvp, |
| &res.resok.dir_wcc, t, cr); |
| if (HAVE_RDDIR_CACHE(VTOR(unldvp))) |
| nfs_purge_rddir_cache(unldvp); |
| } else { |
| nfs3_cache_wcc_data(unldvp, |
| &res.resfail.dir_wcc, t, cr); |
| PURGE_STALE_FH(error, unldvp, cr); |
| } |
| } |
| |
| /* |
| * Release stuff held for the remove |
| */ |
| VN_RELE(unldvp); |
| kmem_free(unlname, MAXNAMELEN); |
| crfree(unlcred); |
| goto redo; |
| } |
| mutex_exit(&rp->r_statelock); |
| } |
| |
| rp_addfree(rp, cr); |
| } |
| |
| /* |
| * Remote file system operations having to do with directory manipulation. |
| */ |
| |
| /* ARGSUSED */ |
| static int |
| nfs3_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp, |
| int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, |
| int *direntflags, pathname_t *realpnp) |
| { |
| int error; |
| vnode_t *vp; |
| vnode_t *avp = NULL; |
| rnode_t *drp; |
| |
| if (nfs_zone() != VTOMI(dvp)->mi_zone) |
| return (EPERM); |
| |
| drp = VTOR(dvp); |
| |
| /* |
| * Are we looking up extended attributes? If so, "dvp" is |
| * the file or directory for which we want attributes, and |
| * we need a lookup of the hidden attribute directory |
| * before we lookup the rest of the path. |
| */ |
| if (flags & LOOKUP_XATTR) { |
| bool_t cflag = ((flags & CREATE_XATTR_DIR) != 0); |
| mntinfo_t *mi; |
| |
| mi = VTOMI(dvp); |
| if (!(mi->mi_flags & MI_EXTATTR)) |
| return (EINVAL); |
| |
| if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR(dvp))) |
| return (EINTR); |
| |
| (void) nfs3lookup_dnlc(dvp, XATTR_DIR_NAME, &avp, cr); |
| if (avp == NULL) |
| error = acl_getxattrdir3(dvp, &avp, cflag, cr, 0); |
| else |
| error = 0; |
| |
| nfs_rw_exit(&drp->r_rwlock); |
| |
| if (error) { |
| if (mi->mi_flags & MI_EXTATTR) |
| return (error); |
| return (EINVAL); |
| } |
| dvp = avp; |
| drp = VTOR(dvp); |
| } |
| |
| if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR(dvp))) { |
| error = EINTR; |
| goto out; |
| } |
| |
| error = nfs3lookup(dvp, nm, vpp, pnp, flags, rdir, cr, 0); |
| |
| nfs_rw_exit(&drp->r_rwlock); |
| |
| /* |
| * If vnode is a device, create special vnode. |
| */ |
| if (!error && IS_DEVVP(*vpp)) { |
| vp = *vpp; |
| *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr); |
| VN_RELE(vp); |
| } |
| |
| out: |
| if (avp != NULL) |
| VN_RELE(avp); |
| |
| return (error); |
| } |
| |
| static int nfs3_lookup_neg_cache = 1; |
| |
| #ifdef DEBUG |
| static int nfs3_lookup_dnlc_hits = 0; |
| static int nfs3_lookup_dnlc_misses = 0; |
| static int nfs3_lookup_dnlc_neg_hits = 0; |
| static int nfs3_lookup_dnlc_disappears = 0; |
| static int nfs3_lookup_dnlc_lookups = 0; |
| #endif |
| |
| /* ARGSUSED */ |
| int |
| nfs3lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp, |
| int flags, vnode_t *rdir, cred_t *cr, int rfscall_flags) |
| { |
| int error; |
| rnode_t *drp; |
| |
| ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone); |
| /* |
| * If lookup is for "", just return dvp. Don't need |
| * to send it over the wire, look it up in the dnlc, |
| * or perform any access checks. |
| */ |
| if (*nm == '\0') { |
| VN_HOLD(dvp); |
| *vpp = dvp; |
| return (0); |
| } |
| |
| /* |
| * Can't do lookups in non-directories. |
| */ |
| if (dvp->v_type != VDIR) |
| return (ENOTDIR); |
| |
| /* |
| * If we're called with RFSCALL_SOFT, it's important that |
| * the only rfscall is one we make directly; if we permit |
| * an access call because we're looking up "." or validating |
| * a dnlc hit, we'll deadlock because that rfscall will not |
| * have the RFSCALL_SOFT set. |
| */ |
| if (rfscall_flags & RFSCALL_SOFT) |
| goto callit; |
| |
| /* |
| * If lookup is for ".", just return dvp. Don't need |
| * to send it over the wire or look it up in the dnlc, |
| * just need to check access. |
| */ |
| if (strcmp(nm, ".") == 0) { |
| error = nfs3_access(dvp, VEXEC, 0, cr, NULL); |
| if (error) |
| return (error); |
| VN_HOLD(dvp); |
| *vpp = dvp; |
| return (0); |
| } |
| |
| drp = VTOR(dvp); |
| if (!(drp->r_flags & RLOOKUP)) { |
| mutex_enter(&drp->r_statelock); |
| drp->r_flags |= RLOOKUP; |
| mutex_exit(&drp->r_statelock); |
| } |
| |
| /* |
| * Lookup this name in the DNLC. If there was a valid entry, |
| * then return the results of the lookup. |
| */ |
| error = nfs3lookup_dnlc(dvp, nm, vpp, cr); |
| if (error || *vpp != NULL) |
| return (error); |
| |
| callit: |
| error = nfs3lookup_otw(dvp, nm, vpp, cr, rfscall_flags); |
| |
| return (error); |
| } |
| |
| static int |
| nfs3lookup_dnlc(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr) |
| { |
| int error; |
| vnode_t *vp; |
| |
| ASSERT(*nm != '\0'); |
| ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone); |
| /* |
| * Lookup this name in the DNLC. If successful, then validate |
| * the caches and then recheck the DNLC. The DNLC is rechecked |
| * just in case this entry got invalidated during the call |
| * to nfs3_validate_caches. |
| * |
| * An assumption is being made that it is safe to say that a |
| * file exists which may not on the server. Any operations to |
| * the server will fail with ESTALE. |
| */ |
| #ifdef DEBUG |
| nfs3_lookup_dnlc_lookups++; |
| #endif |
| vp = dnlc_lookup(dvp, nm); |
| if (vp != NULL) { |
| VN_RELE(vp); |
| if (vp == DNLC_NO_VNODE && !vn_is_readonly(dvp)) { |
| PURGE_ATTRCACHE(dvp); |
| } |
| error = nfs3_validate_caches(dvp, cr); |
| if (error) |
| return (error); |
| vp = dnlc_lookup(dvp, nm); |
| if (vp != NULL) { |
| error = nfs3_access(dvp, VEXEC, 0, cr, NULL); |
| if (error) { |
| VN_RELE(vp); |
| return (error); |
| } |
| if (vp == DNLC_NO_VNODE) { |
| VN_RELE(vp); |
| #ifdef DEBUG |
| nfs3_lookup_dnlc_neg_hits++; |
| #endif |
| return (ENOENT); |
| } |
| *vpp = vp; |
| #ifdef DEBUG |
| nfs3_lookup_dnlc_hits++; |
| #endif |
| return (0); |
| } |
| #ifdef DEBUG |
| nfs3_lookup_dnlc_disappears++; |
| #endif |
| } |
| #ifdef DEBUG |
| else |
| nfs3_lookup_dnlc_misses++; |
| #endif |
| |
| *vpp = NULL; |
| |
| return (0); |
| } |
| |
| static int |
| nfs3lookup_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr, |
| int rfscall_flags) |
| { |
| int error; |
| LOOKUP3args args; |
| LOOKUP3vres res; |
| int douprintf; |
| struct vattr vattr; |
| struct vattr dvattr; |
| vnode_t *vp; |
| failinfo_t fi; |
| hrtime_t t; |
| |
| ASSERT(*nm != '\0'); |
| ASSERT(dvp->v_type == VDIR); |
| ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone); |
| |
| setdiropargs3(&args.what, nm, dvp); |
| |
| fi.vp = dvp; |
| fi.fhp = (caddr_t)&args.what.dir; |
| fi.copyproc = nfs3copyfh; |
| fi.lookupproc = nfs3lookup; |
| fi.xattrdirproc = acl_getxattrdir3; |
| res.obj_attributes.fres.vp = dvp; |
| res.obj_attributes.fres.vap = &vattr; |
| res.dir_attributes.fres.vp = dvp; |
| res.dir_attributes.fres.vap = &dvattr; |
| |
| douprintf = 1; |
| |
| t = gethrtime(); |
| |
| error = rfs3call(VTOMI(dvp), NFSPROC3_LOOKUP, |
| xdr_diropargs3, (caddr_t)&args, |
| xdr_LOOKUP3vres, (caddr_t)&res, cr, |
| &douprintf, &res.status, rfscall_flags, &fi); |
| |
| if (error) |
| return (error); |
| |
| nfs3_cache_post_op_vattr(dvp, &res.dir_attributes, t, cr); |
| |
| error = geterrno3(res.status); |
| if (error) { |
| PURGE_STALE_FH(error, dvp, cr); |
| if (error == ENOENT && nfs3_lookup_neg_cache) |
| dnlc_enter(dvp, nm, DNLC_NO_VNODE); |
| return (error); |
| } |
| |
| if (res.obj_attributes.attributes) { |
| vp = makenfs3node_va(&res.object, res.obj_attributes.fres.vap, |
| dvp->v_vfsp, t, cr, VTOR(dvp)->r_path, nm); |
| } else { |
| vp = makenfs3node_va(&res.object, NULL, |
| dvp->v_vfsp, t, cr, VTOR(dvp)->r_path, nm); |
| if (vp->v_type == VNON) { |
| vattr.va_mask = AT_TYPE; |
| error = nfs3getattr(vp, &vattr, cr); |
| if (error) { |
| VN_RELE(vp); |
| return (error); |
| } |
| vp->v_type = vattr.va_type; |
| } |
| } |
| |
| if (!(rfscall_flags & RFSCALL_SOFT)) |
| dnlc_update(dvp, nm, vp); |
| |
| *vpp = vp; |
| |
| return (error); |
| } |
| |
| #ifdef DEBUG |
| static int nfs3_create_misses = 0; |
| #endif |
| |
| /* ARGSUSED */ |
| static int |
| nfs3_create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive, |
| int mode, vnode_t **vpp, cred_t *cr, int lfaware, caller_context_t *ct, |
| vsecattr_t *vsecp) |
| { |
| int error; |
| vnode_t *vp; |
| rnode_t *rp; |
| struct vattr vattr; |
| rnode_t *drp; |
| vnode_t *tempvp; |
| |
| drp = VTOR(dvp); |
| if (nfs_zone() != VTOMI(dvp)->mi_zone) |
| return (EPERM); |
| if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp))) |
| return (EINTR); |
| |
| top: |
| /* |
| * We make a copy of the attributes because the caller does not |
| * expect us to change what va points to. |
| */ |
| vattr = *va; |
| |
| /* |
| * If the pathname is "", just use dvp. Don't need |
| * to send it over the wire, look it up in the dnlc, |
| * or perform any access checks. |
| */ |
| if (*nm == '\0') { |
| error = 0; |
| VN_HOLD(dvp); |
| vp = dvp; |
| /* |
| * If the pathname is ".", just use dvp. Don't need |
| * to send it over the wire or look it up in the dnlc, |
| * just need to check access. |
| */ |
| } else if (strcmp(nm, ".") == 0) { |
| error = nfs3_access(dvp, VEXEC, 0, cr, ct); |
| if (error) { |
| nfs_rw_exit(&drp->r_rwlock); |
| return (error); |
| } |
| VN_HOLD(dvp); |
| vp = dvp; |
| /* |
| * We need to go over the wire, just to be sure whether the |
| * file exists or not. Using the DNLC can be dangerous in |
| * this case when making a decision regarding existence. |
| */ |
| } else { |
| error = nfs3lookup_otw(dvp, nm, &vp, cr, 0); |
| } |
| if (!error) { |
| if (exclusive == EXCL) |
| error = EEXIST; |
| else if (vp->v_type == VDIR && (mode & VWRITE)) |
| error = EISDIR; |
| else { |
| /* |
| * If vnode is a device, create special vnode. |
| */ |
| if (IS_DEVVP(vp)) { |
| tempvp = vp; |
| vp = specvp(vp, vp->v_rdev, vp->v_type, cr); |
| VN_RELE(tempvp); |
| } |
| if (!(error = VOP_ACCESS(vp, mode, 0, cr, ct))) { |
| if ((vattr.va_mask & AT_SIZE) && |
| vp->v_type == VREG) { |
| rp = VTOR(vp); |
| /* |
| * Check here for large file handled |
| * by LF-unaware process (as |
| * ufs_create() does) |
| */ |
| if (!(lfaware & FOFFMAX)) { |
| mutex_enter(&rp->r_statelock); |
| if (rp->r_size > MAXOFF32_T) |
| error = EOVERFLOW; |
| mutex_exit(&rp->r_statelock); |
| } |
| if (!error) { |
| vattr.va_mask = AT_SIZE; |
| error = nfs3setattr(vp, |
| &vattr, 0, cr); |
| |
| /* |
| * Existing file was truncated; |
| * emit a create event. |
| */ |
| vnevent_create(vp, ct); |
| } |
| } |
| } |
| } |
| nfs_rw_exit(&drp->r_rwlock); |
| if (error) { |
| VN_RELE(vp); |
| } else { |
| *vpp = vp; |
| } |
| |
| return (error); |
| } |
| |
| dnlc_remove(dvp, nm); |
| |
| /* |
| * Decide what the group-id of the created file should be. |
| * Set it in attribute list as advisory... |
| */ |
| error = setdirgid(dvp, &vattr.va_gid, cr); |
| if (error) { |
| nfs_rw_exit(&drp->r_rwlock); |
| return (error); |
| } |
| vattr.va_mask |= AT_GID; |
| |
| ASSERT(vattr.va_mask & AT_TYPE); |
| if (vattr.va_type == VREG) { |
| ASSERT(vattr.va_mask & AT_MODE); |
| if (MANDMODE(vattr.va_mode)) { |
| nfs_rw_exit(&drp->r_rwlock); |
| return (EACCES); |
| } |
| error = nfs3create(dvp, nm, &vattr, exclusive, mode, vpp, cr, |
| lfaware); |
| /* |
| * If this is not an exclusive create, then the CREATE |
| * request will be made with the GUARDED mode set. This |
| * means that the server will return EEXIST if the file |
| * exists. The file could exist because of a retransmitted |
| * request. In this case, we recover by starting over and |
| * checking to see whether the file exists. This second |
| * time through it should and a CREATE request will not be |
| * sent. |
| * |
| * This handles the problem of a dangling CREATE request |
| * which contains attributes which indicate that the file |
| * should be truncated. This retransmitted request could |
| * possibly truncate valid data in the file if not caught |
| * by the duplicate request mechanism on the server or if |
| * not caught by other means. The scenario is: |
| * |
| * Client transmits CREATE request with size = 0 |
| * Client times out, retransmits request. |
| * Response to the first request arrives from the server |
| * and the client proceeds on. |
| * Client writes data to the file. |
| * The server now processes retransmitted CREATE request |
| * and truncates file. |
| * |
| * The use of the GUARDED CREATE request prevents this from |
| * happening because the retransmitted CREATE would fail |
| * with EEXIST and would not truncate the file. |
| */ |
| if (error == EEXIST && exclusive == NONEXCL) { |
| #ifdef DEBUG |
| nfs3_create_misses++; |
| #endif |
| goto top; |
| } |
| nfs_rw_exit(&drp->r_rwlock); |
| return (error); |
| } |
| error = nfs3mknod(dvp, nm, &vattr, exclusive, mode, vpp, cr); |
| nfs_rw_exit(&drp->r_rwlock); |
| return (error); |
| } |
| |
| /* ARGSUSED */ |
| static int |
| nfs3create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive, |
| int mode, vnode_t **vpp, cred_t *cr, int lfaware) |
| { |
| int error; |
| CREATE3args args; |
| CREATE3res res; |
| int douprintf; |
| vnode_t *vp; |
| struct vattr vattr; |
| nfstime3 *verfp; |
| rnode_t *rp; |
| timestruc_t now; |
| hrtime_t t; |
| |
| ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone); |
| setdiropargs3(&args.where, nm, dvp); |
| if (exclusive == EXCL) { |
| args.how.mode = EXCLUSIVE; |
| /* |
| * Construct the create verifier. This verifier needs |
| * to be unique between different clients. It also needs |
| * to vary for each exclusive create request generated |
| * from the client to the server. |
| * |
| * The first attempt is made to use the hostid and a |
| * unique number on the client. If the hostid has not |
| * been set, the high resolution time that the exclusive |
| * create request is being made is used. This will work |
| * unless two different clients, both with the hostid |
| * not set, attempt an exclusive create request on the |
| * same file, at exactly the same clock time. The |
| * chances of this happening seem small enough to be |
| * reasonable. |
| */ |
| verfp = (nfstime3 *)&args.how.createhow3_u.verf; |
| verfp->seconds = zone_get_hostid(NULL); |
| if (verfp->seconds != 0) |
| verfp->nseconds = newnum(); |
| else { |
| gethrestime(&now); |
| verfp->seconds = now.tv_sec; |
| verfp->nseconds = now.tv_nsec; |
| } |
| /* |
| * Since the server will use this value for the mtime, |
| * make sure that it can't overflow. Zero out the MSB. |
| * The actual value does not matter here, only its uniqeness. |
| */ |
| verfp->seconds %= INT32_MAX; |
| } else { |
| /* |
| * Issue the non-exclusive create in guarded mode. This |
| * may result in some false EEXIST responses for |
| * retransmitted requests, but these will be handled at |
| * a higher level. By using GUARDED, duplicate requests |
| * to do file truncation and possible access problems |
| * can be avoided. |
| */ |
| args.how.mode = GUARDED; |
| error = vattr_to_sattr3(va, |
| &args.how.createhow3_u.obj_attributes); |
| if (error) { |
| /* req time field(s) overflow - return immediately */ |
| return (error); |
| } |
| } |
| |
| douprintf = 1; |
| |
| t = gethrtime(); |
| |
| error = rfs3call(VTOMI(dvp), NFSPROC3_CREATE, |
| xdr_CREATE3args, (caddr_t)&args, |
| xdr_CREATE3res, (caddr_t)&res, cr, |
| &douprintf, &res.status, 0, NULL); |
| |
| if (error) { |
| PURGE_ATTRCACHE(dvp); |
| return (error); |
| } |
| |
| error = geterrno3(res.status); |
| if (!error) { |
| nfs3_cache_wcc_data(dvp, &res.resok.dir_wcc, t, cr); |
| if (HAVE_RDDIR_CACHE(VTOR(dvp))) |
| nfs_purge_rddir_cache(dvp); |
| |
| /* |
| * On exclusive create the times need to be explicitly |
| * set to clear any potential verifier that may be stored |
| * in one of these fields (see comment below). This |
| * is done here to cover the case where no post op attrs |
| * were returned or a 'invalid' time was returned in |
| * the attributes. |
| */ |
| if (exclusive == EXCL) |
| va->va_mask |= (AT_MTIME | AT_ATIME); |
| |
| if (!res.resok.obj.handle_follows) { |
| error = nfs3lookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0); |
| if (error) |
| return (error); |
| } else { |
| if (res.resok.obj_attributes.attributes) { |
| vp = makenfs3node(&res.resok.obj.handle, |
| &res.resok.obj_attributes.attr, |
| dvp->v_vfsp, t, cr, NULL, NULL); |
| } else { |
| vp = makenfs3node(&res.resok.obj.handle, NULL, |
| dvp->v_vfsp, t, cr, NULL, NULL); |
| |
| /* |
| * On an exclusive create, it is possible |
| * that attributes were returned but those |
| * postop attributes failed to decode |
| * properly. If this is the case, |
| * then most likely the atime or mtime |
| * were invalid for our client; this |
| * is caused by the server storing the |
| * create verifier in one of the time |
| * fields(most likely mtime). |
| * So... we are going to setattr just the |
| * atime/mtime to clear things up. |
| */ |
| if (exclusive == EXCL) { |
| if (error = |
| nfs3excl_create_settimes(vp, |
| va, cr)) { |
| /* |
| * Setting the times failed. |
| * Remove the file and return |
| * the error. |
| */ |
| VN_RELE(vp); |
| (void) nfs3_remove(dvp, |
| nm, cr, NULL, 0); |
| return (error); |
| } |
| } |
| |
| /* |
| * This handles the non-exclusive case |
| * and the exclusive case where no post op |
| * attrs were returned. |
| */ |
| if (vp->v_type == VNON) { |
| vattr.va_mask = AT_TYPE; |
| error = nfs3getattr(vp, &vattr, cr); |
| if (error) { |
| VN_RELE(vp); |
| return (error); |
| } |
| vp->v_type = vattr.va_type; |
| } |
| } |
| dnlc_update(dvp, nm, vp); |
| } |
| |
| rp = VTOR(vp); |
| |
| /* |
| * Check here for large file handled by |
| * LF-unaware process (as ufs_create() does) |
| */ |
| if ((va->va_mask & AT_SIZE) && vp->v_type == VREG && |
| !(lfaware & FOFFMAX)) { |
| mutex_enter(&rp->r_statelock); |
| if (rp->r_size > MAXOFF32_T) { |
| mutex_exit(&rp->r_statelock); |
| VN_RELE(vp); |
| return (EOVERFLOW); |
| } |
| mutex_exit(&rp->r_statelock); |
| } |
| |
| if (exclusive == EXCL && |
| (va->va_mask & ~(AT_GID | AT_SIZE))) { |
| /* |
| * If doing an exclusive create, then generate |
| * a SETATTR to set the initial attributes. |
| * Try to set the mtime and the atime to the |
| * server's current time. It is somewhat |
| * expected that these fields will be used to |
| * store the exclusive create cookie. If not, |
| * server implementors will need to know that |
| * a SETATTR will follow an exclusive create |
| * and the cookie should be destroyed if |
| * appropriate. This work may have been done |
| * earlier in this function if post op attrs |
| * were not available. |
| * |
| * The AT_GID and AT_SIZE bits are turned off |
| * so that the SETATTR request will not attempt |
| * to process these. The gid will be set |
| * separately if appropriate. The size is turned |
| * off because it is assumed that a new file will |
| * be created empty and if the file wasn't empty, |
| * then the exclusive create will have failed |
| * because the file must have existed already. |
| * Therefore, no truncate operation is needed. |
| */ |
| va->va_mask &= ~(AT_GID | AT_SIZE); |
| error = nfs3setattr(vp, va, 0, cr); |
| if (error) { |
| /* |
| * Couldn't correct the attributes of |
| * the newly created file and the |
| * attributes are wrong. Remove the |
| * file and return an error to the |
| * application. |
| */ |
| VN_RELE(vp); |
| (void) nfs3_remove(dvp, nm, cr, NULL, 0); |
| return (error); |
| } |
| } |
| |
| if (va->va_gid != rp->r_attr.va_gid) { |
| /* |
| * If the gid on the file isn't right, then |
| * generate a SETATTR to attempt to change |
| * it. This may or may not work, depending |
| * upon the server's semantics for allowing |
| * file ownership changes. |
| */ |
| va->va_mask = AT_GID; |
| (void) nfs3setattr(vp, va, 0, cr); |
| } |
| |
| /* |
| * If vnode is a device create special vnode |
| */ |
| if (IS_DEVVP(vp)) { |
| *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr); |
| VN_RELE(vp); |
| } else |
| *vpp = vp; |
| } else { |
| nfs3_cache_wcc_data(dvp, &res.resfail.dir_wcc, t, cr); |
| PURGE_STALE_FH(error, dvp, cr); |
| } |
| |
| return (error); |
| } |
| |
/*
 * Special setattr function to take care of resetting the atime/mtime
 * after a successful exclusive create.  This function exists to avoid
 * handling attributes from the server; after an exclusive create the
 * atime/mtime fields may be 'invalid' in the client's view and
 * therefore cannot be trusted.
 */
| static int |
| nfs3excl_create_settimes(vnode_t *vp, struct vattr *vap, cred_t *cr) |
| { |
| int error; |
| uint_t mask; |
| SETATTR3args args; |
| SETATTR3res res; |
| int douprintf; |
| rnode_t *rp; |
| hrtime_t t; |
| |
| ASSERT(nfs_zone() == VTOMI(vp)->mi_zone); |
| /* save the caller's mask so that it can be reset later */ |
| mask = vap->va_mask; |
| |
| rp = VTOR(vp); |
| |
| args.object = *RTOFH3(rp); |
| args.guard.check = FALSE; |
| |
| /* Use the mask to initialize the arguments */ |
| vap->va_mask = 0; |
| error = vattr_to_sattr3(vap, &args.new_attributes); |
| |
| /* We want to set just atime/mtime on this request */ |
| args.new_attributes.atime.set_it = SET_TO_SERVER_TIME; |
| args.new_attributes.mtime.set_it = SET_TO_SERVER_TIME; |
| |
| douprintf = 1; |
| |
| t = gethrtime(); |
| |
| error = rfs3call(VTOMI(vp), NFSPROC3_SETATTR, |
| xdr_SETATTR3args, (caddr_t)&args, |
| xdr_SETATTR3res, (caddr_t)&res, cr, |
| &douprintf, &res.status, 0, NULL); |
| |
| if (error) { |
| vap->va_mask = mask; |
| return (error); |
| } |
| |
| error = geterrno3(res.status); |
| if (!error) { |
| /* |
| * It is important to pick up the attributes. |
| * Since this is the exclusive create path, the |
| * attributes on the initial create were ignored |
| * and we need these to have the correct info. |
| */ |
| nfs3_cache_wcc_data(vp, &res.resok.obj_wcc, t, cr); |
| /* |
| * No need to do the atime/mtime work again so clear |
| |