| /* |
| * CDDL HEADER START |
| * |
| * The contents of this file are subject to the terms of the |
| * Common Development and Distribution License (the "License"). |
| * You may not use this file except in compliance with the License. |
| * |
| * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE |
| * or http://www.opensolaris.org/os/licensing. |
| * See the License for the specific language governing permissions |
| * and limitations under the License. |
| * |
| * When distributing Covered Code, include this CDDL HEADER in each |
| * file and include the License file at usr/src/OPENSOLARIS.LICENSE. |
| * If applicable, add the following below this CDDL HEADER, with the |
| * fields enclosed by brackets "[]" replaced with your own identifying |
| * information: Portions Copyright [yyyy] [name of copyright owner] |
| * |
| * CDDL HEADER END |
| */ |
| /* |
| * Copyright 2006 Sun Microsystems, Inc. All rights reserved. |
| * Use is subject to license terms. |
| * |
| * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T. |
| * All rights reserved. |
| */ |
| |
| #pragma ident "%Z%%M% %I% %E% SMI" |
| |
| #include <sys/param.h> |
| #include <sys/types.h> |
| #include <sys/systm.h> |
| #include <sys/cred_impl.h> |
| #include <sys/proc.h> |
| #include <sys/user.h> |
| #include <sys/time.h> |
| #include <sys/buf.h> |
| #include <sys/vfs.h> |
| #include <sys/vnode.h> |
| #include <sys/socket.h> |
| #include <sys/uio.h> |
| #include <sys/tiuser.h> |
| #include <sys/swap.h> |
| #include <sys/errno.h> |
| #include <sys/debug.h> |
| #include <sys/kmem.h> |
| #include <sys/kstat.h> |
| #include <sys/cmn_err.h> |
| #include <sys/vtrace.h> |
| #include <sys/session.h> |
| #include <sys/dnlc.h> |
| #include <sys/bitmap.h> |
| #include <sys/acl.h> |
| #include <sys/ddi.h> |
| #include <sys/pathname.h> |
| #include <sys/flock.h> |
| #include <sys/dirent.h> |
| #include <sys/callb.h> |
| #include <sys/atomic.h> |
| #include <sys/list.h> |
| #include <sys/tsol/tnet.h> |
| #include <sys/priv.h> |
| |
| #include <inet/ip6.h> |
| |
| #include <rpc/types.h> |
| #include <rpc/xdr.h> |
| #include <rpc/auth.h> |
| #include <rpc/clnt.h> |
| |
| #include <nfs/nfs.h> |
| #include <nfs/nfs4.h> |
| #include <nfs/nfs_clnt.h> |
| #include <nfs/rnode.h> |
| #include <nfs/nfs_acl.h> |
| |
| /* |
 * The hash queues for access to active and cached rnodes
| * are organized as doubly linked lists. A reader/writer lock |
| * for each hash bucket is used to control access and to synchronize |
| * lookups, additions, and deletions from the hash queue. |
| * |
| * The rnode freelist is organized as a doubly linked list with |
| * a head pointer. Additions and deletions are synchronized via |
| * a single mutex. |
| * |
 * In order to add an rnode to the free list, it must be hashed into
 * a hash queue and the exclusive lock for the hash queue must be
 * held.  If an rnode is not hashed into a hash queue, then it is
 * destroyed because it holds no information about the file worth
 * reusing.  The exclusive lock for the hash queue must be held in
 * order to prevent a lookup in the hash queue from finding the
 * rnode, using it, and assuming that it is not on the freelist.
 * The lookup in the hash queue will have the hash queue locked,
 * either exclusive or shared.
| * |
 * The vnode reference count for each rnode is not allowed to drop
 * below 1.  This prevents external entities, such as the VM
 * subsystem, from acquiring references to vnodes already on the
 * freelist and then trying to place them back on the freelist
 * when their reference is released.  This means that when an
 * rnode is looked up in the hash queues, either the rnode is
 * removed from the freelist and the freelist's reference is
 * transferred to the new reference, or the vnode reference count
 * must be incremented accordingly.  The mutex for the freelist
 * must be held in order to accurately test whether the rnode is
 * on the freelist or not.  The hash queue lock might be held
 * shared, and it is possible that two different threads may race
 * to remove the rnode from the freelist.  This race can be
 * resolved by holding the mutex for the freelist.  Please note
 * that the mutex for the freelist does not need to be held if the
 * rnode is not on the freelist.  It cannot be placed on the
 * freelist due to the requirement that the thread putting the
 * rnode on the freelist must hold the exclusive lock for the hash
 * queue, while the thread doing the lookup in the hash queue holds
 * either a shared or exclusive lock on the hash queue.
| * |
| * The lock ordering is: |
| * |
| * hash bucket lock -> vnode lock |
| * hash bucket lock -> freelist lock |
| */ |
| static rhashq_t *rtable; |
| |
| static kmutex_t rpfreelist_lock; |
| static rnode_t *rpfreelist = NULL; |
| static long rnew = 0; |
| long nrnode = 0; |
| |
| static int rtablesize; |
| static int rtablemask; |
| |
| static int hashlen = 4; |
| |
| static struct kmem_cache *rnode_cache; |
| |
| /* |
| * Mutex to protect the following variables: |
| * nfs_major |
| * nfs_minor |
| */ |
| kmutex_t nfs_minor_lock; |
| int nfs_major; |
| int nfs_minor; |
| |
/* Do we allow pre-epoch (negative) time values over the wire? */
| bool_t nfs_allow_preepoch_time = FALSE; /* default: do not allow preepoch */ |
| |
| /* |
| * Access cache |
| */ |
| static acache_hash_t *acache; |
| static long nacache; /* used strictly to size the number of hash queues */ |
| |
| static int acachesize; |
| static int acachemask; |
| static struct kmem_cache *acache_cache; |
| |
| /* |
| * Client side utilities |
| */ |
| |
| /* |
| * client side statistics |
| */ |
| static const struct clstat clstat_tmpl = { |
| { "calls", KSTAT_DATA_UINT64 }, |
| { "badcalls", KSTAT_DATA_UINT64 }, |
| { "clgets", KSTAT_DATA_UINT64 }, |
| { "cltoomany", KSTAT_DATA_UINT64 }, |
| #ifdef DEBUG |
| { "clalloc", KSTAT_DATA_UINT64 }, |
| { "noresponse", KSTAT_DATA_UINT64 }, |
| { "failover", KSTAT_DATA_UINT64 }, |
| { "remap", KSTAT_DATA_UINT64 }, |
| #endif |
| }; |
| |
| /* |
 * The following are statistics that describe the behavior of the system as a
 * whole and do not correspond to any one particular zone.
| */ |
| #ifdef DEBUG |
| static struct clstat_debug { |
| kstat_named_t nrnode; /* number of allocated rnodes */ |
| kstat_named_t access; /* size of access cache */ |
| kstat_named_t dirent; /* size of readdir cache */ |
| kstat_named_t dirents; /* size of readdir buf cache */ |
| kstat_named_t reclaim; /* number of reclaims */ |
| kstat_named_t clreclaim; /* number of cl reclaims */ |
| kstat_named_t f_reclaim; /* number of free reclaims */ |
| kstat_named_t a_reclaim; /* number of active reclaims */ |
| kstat_named_t r_reclaim; /* number of rnode reclaims */ |
| kstat_named_t rpath; /* bytes used to store rpaths */ |
| } clstat_debug = { |
| { "nrnode", KSTAT_DATA_UINT64 }, |
| { "access", KSTAT_DATA_UINT64 }, |
| { "dirent", KSTAT_DATA_UINT64 }, |
| { "dirents", KSTAT_DATA_UINT64 }, |
| { "reclaim", KSTAT_DATA_UINT64 }, |
| { "clreclaim", KSTAT_DATA_UINT64 }, |
| { "f_reclaim", KSTAT_DATA_UINT64 }, |
| { "a_reclaim", KSTAT_DATA_UINT64 }, |
| { "r_reclaim", KSTAT_DATA_UINT64 }, |
| { "r_path", KSTAT_DATA_UINT64 }, |
| }; |
| #endif /* DEBUG */ |
| |
| /* |
| * We keep a global list of per-zone client data, so we can clean up all zones |
| * if we get low on memory. |
| */ |
| static list_t nfs_clnt_list; |
| static kmutex_t nfs_clnt_list_lock; |
| static zone_key_t nfsclnt_zone_key; |
| |
| static struct kmem_cache *chtab_cache; |
| |
| /* |
| * Some servers do not properly update the attributes of the |
| * directory when changes are made. To allow interoperability |
 * with these broken servers, the nfs_disable_rddir_cache
 * parameter can be set in /etc/system, as shown below.
| */ |
| int nfs_disable_rddir_cache = 0; |
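/*
 * For example, adding the following line to /etc/system (and
 * rebooting) disables client-side caching of readdir results:
 *
 *	set nfs:nfs_disable_rddir_cache = 1
 */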
| |
| int clget(clinfo_t *, servinfo_t *, cred_t *, CLIENT **, |
| struct chtab **); |
| void clfree(CLIENT *, struct chtab *); |
| static int acl_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **, |
| struct chtab **, struct nfs_clnt *); |
| static int nfs_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **, |
| struct chtab **, struct nfs_clnt *); |
| static void clreclaim(void *); |
| static int nfs_feedback(int, int, mntinfo_t *); |
| static int rfscall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t, |
| caddr_t, cred_t *, int *, enum clnt_stat *, int, |
| failinfo_t *); |
| static int aclcall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t, |
| caddr_t, cred_t *, int *, int, failinfo_t *); |
| static void rinactive(rnode_t *, cred_t *); |
| static int rtablehash(nfs_fhandle *); |
| static vnode_t *make_rnode(nfs_fhandle *, rhashq_t *, struct vfs *, |
| struct vnodeops *, |
| int (*)(vnode_t *, page_t *, u_offset_t *, size_t *, int, |
| cred_t *), |
| int (*)(const void *, const void *), int *, cred_t *, |
| char *, char *); |
| static void rp_rmfree(rnode_t *); |
| static void rp_addhash(rnode_t *); |
| static void rp_rmhash_locked(rnode_t *); |
| static rnode_t *rfind(rhashq_t *, nfs_fhandle *, struct vfs *); |
| static void destroy_rnode(rnode_t *); |
| static void rddir_cache_free(rddir_cache *); |
| static int nfs_free_data_reclaim(rnode_t *); |
| static int nfs_active_data_reclaim(rnode_t *); |
| static int nfs_free_reclaim(void); |
| static int nfs_active_reclaim(void); |
| static int nfs_rnode_reclaim(void); |
| static void nfs_reclaim(void *); |
| static int failover_safe(failinfo_t *); |
| static void failover_newserver(mntinfo_t *mi); |
| static void failover_thread(mntinfo_t *mi); |
| static int failover_wait(mntinfo_t *); |
| static int failover_remap(failinfo_t *); |
| static int failover_lookup(char *, vnode_t *, |
| int (*)(vnode_t *, char *, vnode_t **, |
| struct pathname *, int, vnode_t *, cred_t *, int), |
| int (*)(vnode_t *, vnode_t **, bool_t, cred_t *, int), |
| vnode_t **); |
| static void nfs_free_r_path(rnode_t *); |
| static void nfs_set_vroot(vnode_t *); |
| static char *nfs_getsrvnames(mntinfo_t *, size_t *); |
| |
| /* |
| * from rpcsec module (common/rpcsec) |
| */ |
| extern int sec_clnt_geth(CLIENT *, struct sec_data *, cred_t *, AUTH **); |
| extern void sec_clnt_freeh(AUTH *); |
| extern void sec_clnt_freeinfo(struct sec_data *); |
| |
| /* |
| * used in mount policy |
| */ |
| extern ts_label_t *getflabel_cipso(vfs_t *); |
| |
| /* |
| * EIO or EINTR are not recoverable errors. |
| */ |
#define IS_RECOVERABLE_ERROR(error) !(((error) == EINTR) || ((error) == EIO))
| |
| /* |
| * Common handle get program for NFS, NFS ACL, and NFS AUTH client. |
| */ |
| static int |
| clget_impl(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl, |
| struct chtab **chp, struct nfs_clnt *nfscl) |
| { |
| struct chhead *ch, *newch; |
| struct chhead **plistp; |
| struct chtab *cp; |
| int error; |
| k_sigset_t smask; |
| |
| if (newcl == NULL || chp == NULL || ci == NULL) |
| return (EINVAL); |
| |
| *newcl = NULL; |
| *chp = NULL; |
| |
| /* |
| * Find an unused handle or create one |
| */ |
| newch = NULL; |
| nfscl->nfscl_stat.clgets.value.ui64++; |
| top: |
| /* |
| * Find the correct entry in the cache to check for free |
| * client handles. The search is based on the RPC program |
| * number, program version number, dev_t for the transport |
| * device, and the protocol family. |
| */ |
| mutex_enter(&nfscl->nfscl_chtable_lock); |
| plistp = &nfscl->nfscl_chtable; |
| for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) { |
| if (ch->ch_prog == ci->cl_prog && |
| ch->ch_vers == ci->cl_vers && |
| ch->ch_dev == svp->sv_knconf->knc_rdev && |
| (strcmp(ch->ch_protofmly, |
| svp->sv_knconf->knc_protofmly) == 0)) |
| break; |
| plistp = &ch->ch_next; |
| } |
| |
| /* |
| * If we didn't find a cache entry for this quadruple, then |
| * create one. If we don't have one already preallocated, |
| * then drop the cache lock, create one, and then start over. |
| * If we did have a preallocated entry, then just add it to |
| * the front of the list. |
| */ |
| if (ch == NULL) { |
| if (newch == NULL) { |
| mutex_exit(&nfscl->nfscl_chtable_lock); |
| newch = kmem_alloc(sizeof (*newch), KM_SLEEP); |
| newch->ch_timesused = 0; |
| newch->ch_prog = ci->cl_prog; |
| newch->ch_vers = ci->cl_vers; |
| newch->ch_dev = svp->sv_knconf->knc_rdev; |
| newch->ch_protofmly = kmem_alloc( |
| strlen(svp->sv_knconf->knc_protofmly) + 1, |
| KM_SLEEP); |
| (void) strcpy(newch->ch_protofmly, |
| svp->sv_knconf->knc_protofmly); |
| newch->ch_list = NULL; |
| goto top; |
| } |
| ch = newch; |
| newch = NULL; |
| ch->ch_next = nfscl->nfscl_chtable; |
| nfscl->nfscl_chtable = ch; |
| /* |
| * We found a cache entry, but if it isn't on the front of the |
| * list, then move it to the front of the list to try to take |
| * advantage of locality of operations. |
| */ |
| } else if (ch != nfscl->nfscl_chtable) { |
| *plistp = ch->ch_next; |
| ch->ch_next = nfscl->nfscl_chtable; |
| nfscl->nfscl_chtable = ch; |
| } |
| |
| /* |
| * If there was a free client handle cached, then remove it |
| * from the list, init it, and use it. |
| */ |
| if (ch->ch_list != NULL) { |
| cp = ch->ch_list; |
| ch->ch_list = cp->ch_list; |
| mutex_exit(&nfscl->nfscl_chtable_lock); |
| if (newch != NULL) { |
| kmem_free(newch->ch_protofmly, |
| strlen(newch->ch_protofmly) + 1); |
| kmem_free(newch, sizeof (*newch)); |
| } |
| (void) clnt_tli_kinit(cp->ch_client, svp->sv_knconf, |
| &svp->sv_addr, ci->cl_readsize, ci->cl_retrans, cr); |
| error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr, |
| &cp->ch_client->cl_auth); |
| if (error || cp->ch_client->cl_auth == NULL) { |
| CLNT_DESTROY(cp->ch_client); |
| kmem_cache_free(chtab_cache, cp); |
| return ((error != 0) ? error : EINTR); |
| } |
| ch->ch_timesused++; |
| *newcl = cp->ch_client; |
| *chp = cp; |
| return (0); |
| } |
| |
| /* |
| * There weren't any free client handles which fit, so allocate |
| * a new one and use that. |
| */ |
| #ifdef DEBUG |
| atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, 1); |
| #endif |
| mutex_exit(&nfscl->nfscl_chtable_lock); |
| |
| nfscl->nfscl_stat.cltoomany.value.ui64++; |
| if (newch != NULL) { |
| kmem_free(newch->ch_protofmly, strlen(newch->ch_protofmly) + 1); |
| kmem_free(newch, sizeof (*newch)); |
| } |
| |
| cp = kmem_cache_alloc(chtab_cache, KM_SLEEP); |
| cp->ch_head = ch; |
| |
| sigintr(&smask, (int)ci->cl_flags & MI_INT); |
| error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, ci->cl_prog, |
| ci->cl_vers, ci->cl_readsize, ci->cl_retrans, cr, &cp->ch_client); |
| sigunintr(&smask); |
| |
| if (error != 0) { |
| kmem_cache_free(chtab_cache, cp); |
| #ifdef DEBUG |
| atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -1); |
| #endif |
| /* |
| * Warning is unnecessary if error is EINTR. |
| */ |
| if (error != EINTR) { |
| nfs_cmn_err(error, CE_WARN, |
| "clget: couldn't create handle: %m\n"); |
| } |
| return (error); |
| } |
| (void) CLNT_CONTROL(cp->ch_client, CLSET_PROGRESS, NULL); |
| auth_destroy(cp->ch_client->cl_auth); |
| error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr, |
| &cp->ch_client->cl_auth); |
| if (error || cp->ch_client->cl_auth == NULL) { |
| CLNT_DESTROY(cp->ch_client); |
| kmem_cache_free(chtab_cache, cp); |
| #ifdef DEBUG |
| atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -1); |
| #endif |
| return ((error != 0) ? error : EINTR); |
| } |
| ch->ch_timesused++; |
| *newcl = cp->ch_client; |
| ASSERT(cp->ch_client->cl_nosignal == FALSE); |
| *chp = cp; |
| return (0); |
| } |
| |
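/*
 * A typical caller brackets a single RPC with clget()/clfree(), along
 * these lines (an illustrative sketch, not code from this file):
 *
 *	CLIENT *client;
 *	struct chtab *ch;
 *
 *	error = clget(ci, svp, cr, &client, &ch);
 *	if (error == 0) {
 *		status = CLNT_CALL(client, which, xdrargs, argsp,
 *		    xdrres, resp, wait);
 *		clfree(client, ch);
 *	}
 */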
| int |
| clget(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl, |
| struct chtab **chp) |
| { |
| struct nfs_clnt *nfscl; |
| |
| nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone()); |
| ASSERT(nfscl != NULL); |
| |
| return (clget_impl(ci, svp, cr, newcl, chp, nfscl)); |
| } |
| |
| static int |
| acl_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl, |
| struct chtab **chp, struct nfs_clnt *nfscl) |
| { |
| clinfo_t ci; |
| int error; |
| |
| /* |
| * Set read buffer size to rsize |
| * and add room for RPC headers. |
| */ |
| ci.cl_readsize = mi->mi_tsize; |
| if (ci.cl_readsize != 0) |
| ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA); |
| |
| /* |
 * If this is a soft mount and the server is down, just try
 * once, i.e., do not retransmit.
| */ |
| if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN)) |
| ci.cl_retrans = 0; |
| else |
| ci.cl_retrans = mi->mi_retrans; |
| |
| ci.cl_prog = NFS_ACL_PROGRAM; |
| ci.cl_vers = mi->mi_vers; |
| ci.cl_flags = mi->mi_flags; |
| |
| /* |
| * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS |
| * security flavor, the client tries to establish a security context |
| * by contacting the server. If the connection is timed out or reset, |
| * e.g. server reboot, we will try again. |
| */ |
| do { |
| error = clget_impl(&ci, svp, cr, newcl, chp, nfscl); |
| |
| if (error == 0) |
| break; |
| |
| /* |
| * For forced unmount or zone shutdown, bail out, no retry. |
| */ |
| if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { |
| error = EIO; |
| break; |
| } |
| |
| /* do not retry for softmount */ |
| if (!(mi->mi_flags & MI_HARD)) |
| break; |
| |
| /* let the caller deal with the failover case */ |
| if (FAILOVER_MOUNT(mi)) |
| break; |
| |
| } while (error == ETIMEDOUT || error == ECONNRESET); |
| |
| return (error); |
| } |
| |
| static int |
| nfs_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl, |
| struct chtab **chp, struct nfs_clnt *nfscl) |
| { |
| clinfo_t ci; |
| int error; |
| |
| /* |
| * Set read buffer size to rsize |
| * and add room for RPC headers. |
| */ |
| ci.cl_readsize = mi->mi_tsize; |
| if (ci.cl_readsize != 0) |
| ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA); |
| |
| /* |
 * If this is a soft mount and the server is down, just try
 * once, i.e., do not retransmit.
| */ |
| if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN)) |
| ci.cl_retrans = 0; |
| else |
| ci.cl_retrans = mi->mi_retrans; |
| |
| ci.cl_prog = mi->mi_prog; |
| ci.cl_vers = mi->mi_vers; |
| ci.cl_flags = mi->mi_flags; |
| |
| /* |
| * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS |
| * security flavor, the client tries to establish a security context |
 * by contacting the server. If the connection times out or is reset,
 * e.g. due to a server reboot, we will try again.
| */ |
| do { |
| error = clget_impl(&ci, svp, cr, newcl, chp, nfscl); |
| |
| if (error == 0) |
| break; |
| |
| /* |
| * For forced unmount or zone shutdown, bail out, no retry. |
| */ |
| if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { |
| error = EIO; |
| break; |
| } |
| |
| /* do not retry for softmount */ |
| if (!(mi->mi_flags & MI_HARD)) |
| break; |
| |
| /* let the caller deal with the failover case */ |
| if (FAILOVER_MOUNT(mi)) |
| break; |
| |
| } while (error == ETIMEDOUT || error == ECONNRESET); |
| |
| return (error); |
| } |
| |
| static void |
| clfree_impl(CLIENT *cl, struct chtab *cp, struct nfs_clnt *nfscl) |
| { |
| if (cl->cl_auth != NULL) { |
| sec_clnt_freeh(cl->cl_auth); |
| cl->cl_auth = NULL; |
| } |
| |
| /* |
| * Timestamp this cache entry so that we know when it was last |
| * used. |
| */ |
| cp->ch_freed = gethrestime_sec(); |
| |
| /* |
| * Add the free client handle to the front of the list. |
| * This way, the list will be sorted in youngest to oldest |
| * order. |
| */ |
| mutex_enter(&nfscl->nfscl_chtable_lock); |
| cp->ch_list = cp->ch_head->ch_list; |
| cp->ch_head->ch_list = cp; |
| mutex_exit(&nfscl->nfscl_chtable_lock); |
| } |
| |
| void |
| clfree(CLIENT *cl, struct chtab *cp) |
| { |
| struct nfs_clnt *nfscl; |
| |
| nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone()); |
| ASSERT(nfscl != NULL); |
| |
| clfree_impl(cl, cp, nfscl); |
| } |
| |
| #define CL_HOLDTIME 60 /* time to hold client handles */ |
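/*
 * Note that a cl_holdtime of 0 would make every cached handle
 * eligible for reclamation regardless of age; clreclaim() below
 * passes CL_HOLDTIME so that handles returned to the cache within
 * the last minute survive.
 */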
| |
| static void |
| clreclaim_zone(struct nfs_clnt *nfscl, uint_t cl_holdtime) |
| { |
| struct chhead *ch; |
| struct chtab *cp; /* list of objects that can be reclaimed */ |
| struct chtab *cpe; |
| struct chtab *cpl; |
| struct chtab **cpp; |
| #ifdef DEBUG |
| int n = 0; |
| #endif |
| |
| /* |
| * Need to reclaim some memory, so step through the cache |
| * looking through the lists for entries which can be freed. |
| */ |
| cp = NULL; |
| |
| mutex_enter(&nfscl->nfscl_chtable_lock); |
| |
| /* |
| * Here we step through each non-NULL quadruple and start to |
| * construct the reclaim list pointed to by cp. Note that |
| * cp will contain all eligible chtab entries. When this traversal |
| * completes, chtab entries from the last quadruple will be at the |
 * front of cp and entries from previously inspected quadruples will
 * have been appended to the rear of cp.
| */ |
| for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) { |
| if (ch->ch_list == NULL) |
| continue; |
| /* |
 * Search each list for entries older than
| * cl_holdtime seconds. The lists are maintained |
| * in youngest to oldest order so that when the |
| * first entry is found which is old enough, then |
| * all of the rest of the entries on the list will |
| * be old enough as well. |
| */ |
| cpl = ch->ch_list; |
| cpp = &ch->ch_list; |
| while (cpl != NULL && |
| cpl->ch_freed + cl_holdtime > gethrestime_sec()) { |
| cpp = &cpl->ch_list; |
| cpl = cpl->ch_list; |
| } |
| if (cpl != NULL) { |
| *cpp = NULL; |
| if (cp != NULL) { |
| cpe = cpl; |
| while (cpe->ch_list != NULL) |
| cpe = cpe->ch_list; |
| cpe->ch_list = cp; |
| } |
| cp = cpl; |
| } |
| } |
| |
| mutex_exit(&nfscl->nfscl_chtable_lock); |
| |
| /* |
| * If cp is empty, then there is nothing to reclaim here. |
| */ |
| if (cp == NULL) |
| return; |
| |
| /* |
| * Step through the list of entries to free, destroying each client |
| * handle and kmem_free'ing the memory for each entry. |
| */ |
| while (cp != NULL) { |
| #ifdef DEBUG |
| n++; |
| #endif |
| CLNT_DESTROY(cp->ch_client); |
| cpl = cp->ch_list; |
| kmem_cache_free(chtab_cache, cp); |
| cp = cpl; |
| } |
| |
| #ifdef DEBUG |
| /* |
| * Update clalloc so that nfsstat shows the current number |
| * of allocated client handles. |
| */ |
| atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -n); |
| #endif |
| } |
| |
| /* ARGSUSED */ |
| static void |
| clreclaim(void *all) |
| { |
| struct nfs_clnt *nfscl; |
| |
| #ifdef DEBUG |
| clstat_debug.clreclaim.value.ui64++; |
| #endif |
| /* |
| * The system is low on memory; go through and try to reclaim some from |
| * every zone on the system. |
| */ |
| mutex_enter(&nfs_clnt_list_lock); |
| nfscl = list_head(&nfs_clnt_list); |
| for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl)) |
| clreclaim_zone(nfscl, CL_HOLDTIME); |
| mutex_exit(&nfs_clnt_list_lock); |
| } |
| |
| /* |
| * Minimum time-out values indexed by call type |
 * These units are in "eighths" of a second to avoid multiplies
| */ |
| static unsigned int minimum_timeo[] = { |
| 6, 7, 10 |
| }; |
| |
| /* |
 * Back off for retransmission timeout; MAXTIMO is in units of hz (clock ticks)
| */ |
| #define MAXTIMO (20*hz) |
| #define backoff(tim) (((tim) < MAXTIMO) ? dobackoff(tim) : (tim)) |
| #define dobackoff(tim) ((((tim) << 1) > MAXTIMO) ? MAXTIMO : ((tim) << 1)) |
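/*
 * For example, with hz = 100 (so MAXTIMO = 2000 ticks) and an
 * initial timeo of 1 * hz, successive backoff() calls yield 200,
 * 400, 800, 1600, 2000, 2000, ... ticks; the timeout doubles until
 * it is capped at MAXTIMO.
 */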
| |
| #define MIN_NFS_TSIZE 512 /* minimum "chunk" of NFS IO */ |
| #define REDUCE_NFS_TIME (hz/2) /* rtxcur we try to keep under */ |
| #define INCREASE_NFS_TIME (hz/3*8) /* srtt we try to keep under (scaled*8) */ |
| |
| /* |
| * Function called when rfscall notices that we have been |
| * re-transmitting, or when we get a response without retransmissions. |
| * Return 1 if the transfer size was adjusted down - 0 if no change. |
| */ |
| static int |
| nfs_feedback(int flag, int which, mntinfo_t *mi) |
| { |
| int kind; |
| int r = 0; |
| |
| mutex_enter(&mi->mi_lock); |
| if (flag == FEEDBACK_REXMIT1) { |
| if (mi->mi_timers[NFS_CALLTYPES].rt_rtxcur != 0 && |
| mi->mi_timers[NFS_CALLTYPES].rt_rtxcur < REDUCE_NFS_TIME) |
| goto done; |
| if (mi->mi_curread > MIN_NFS_TSIZE) { |
| mi->mi_curread /= 2; |
| if (mi->mi_curread < MIN_NFS_TSIZE) |
| mi->mi_curread = MIN_NFS_TSIZE; |
| r = 1; |
| } |
| |
| if (mi->mi_curwrite > MIN_NFS_TSIZE) { |
| mi->mi_curwrite /= 2; |
| if (mi->mi_curwrite < MIN_NFS_TSIZE) |
| mi->mi_curwrite = MIN_NFS_TSIZE; |
| r = 1; |
| } |
| } else if (flag == FEEDBACK_OK) { |
| kind = mi->mi_timer_type[which]; |
| if (kind == 0 || |
| mi->mi_timers[kind].rt_srtt >= INCREASE_NFS_TIME) |
| goto done; |
| if (kind == 1) { |
| if (mi->mi_curread >= mi->mi_tsize) |
| goto done; |
| mi->mi_curread += MIN_NFS_TSIZE; |
| if (mi->mi_curread > mi->mi_tsize/2) |
| mi->mi_curread = mi->mi_tsize; |
| } else if (kind == 2) { |
| if (mi->mi_curwrite >= mi->mi_stsize) |
| goto done; |
| mi->mi_curwrite += MIN_NFS_TSIZE; |
| if (mi->mi_curwrite > mi->mi_stsize/2) |
| mi->mi_curwrite = mi->mi_stsize; |
| } |
| } |
| done: |
| mutex_exit(&mi->mi_lock); |
| return (r); |
| } |
| |
| #ifdef DEBUG |
| static int rfs2call_hits = 0; |
| static int rfs2call_misses = 0; |
| #endif |
| |
| int |
| rfs2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, |
| xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf, |
| enum nfsstat *statusp, int flags, failinfo_t *fi) |
| { |
| int rpcerror; |
| enum clnt_stat rpc_status; |
| |
| ASSERT(statusp != NULL); |
| |
| rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp, |
| cr, douprintf, &rpc_status, flags, fi); |
| if (!rpcerror) { |
| /* |
| * See crnetadjust() for comments. |
| */ |
| if (*statusp == NFSERR_ACCES && |
| (cr = crnetadjust(cr)) != NULL) { |
| #ifdef DEBUG |
| rfs2call_hits++; |
| #endif |
| rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, |
| resp, cr, douprintf, NULL, flags, fi); |
| crfree(cr); |
| #ifdef DEBUG |
| if (*statusp == NFSERR_ACCES) |
| rfs2call_misses++; |
| #endif |
| } |
| } else if (rpc_status == RPC_PROCUNAVAIL) { |
| *statusp = NFSERR_OPNOTSUPP; |
| rpcerror = 0; |
| } |
| |
| return (rpcerror); |
| } |
| |
#define NFS3_JUKEBOX_DELAY (10 * hz)
| |
| static clock_t nfs3_jukebox_delay = 0; |
| |
| #ifdef DEBUG |
| static int rfs3call_hits = 0; |
| static int rfs3call_misses = 0; |
| #endif |
| |
| int |
| rfs3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, |
| xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf, |
| nfsstat3 *statusp, int flags, failinfo_t *fi) |
| { |
| int rpcerror; |
| int user_informed; |
| |
| user_informed = 0; |
| do { |
| rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp, |
| cr, douprintf, NULL, flags, fi); |
| if (!rpcerror) { |
| cred_t *crr; |
| if (*statusp == NFS3ERR_JUKEBOX) { |
| if (ttoproc(curthread) == &p0) { |
| rpcerror = EAGAIN; |
| break; |
| } |
| if (!user_informed) { |
| user_informed = 1; |
| uprintf( |
| "file temporarily unavailable on the server, retrying...\n"); |
| } |
| delay(nfs3_jukebox_delay); |
| } |
| /* |
| * See crnetadjust() for comments. |
| */ |
| else if (*statusp == NFS3ERR_ACCES && |
| (crr = crnetadjust(cr)) != NULL) { |
| #ifdef DEBUG |
| rfs3call_hits++; |
| #endif |
| rpcerror = rfscall(mi, which, xdrargs, argsp, |
| xdrres, resp, crr, douprintf, |
| NULL, flags, fi); |
| |
| crfree(crr); |
| #ifdef DEBUG |
| if (*statusp == NFS3ERR_ACCES) |
| rfs3call_misses++; |
| #endif |
| } |
| } |
| } while (!rpcerror && *statusp == NFS3ERR_JUKEBOX); |
| |
| return (rpcerror); |
| } |
| |
| #define VALID_FH(fi) (VTOR(fi->vp)->r_server == VTOMI(fi->vp)->mi_curr_serv) |
| #define INC_READERS(mi) { \ |
| mi->mi_readers++; \ |
| } |
| #define DEC_READERS(mi) { \ |
| mi->mi_readers--; \ |
| if (mi->mi_readers == 0) \ |
| cv_broadcast(&mi->mi_failover_cv); \ |
| } |
| |
| static int |
| rfscall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, |
| xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf, |
| enum clnt_stat *rpc_status, int flags, failinfo_t *fi) |
| { |
| CLIENT *client; |
| struct chtab *ch; |
| cred_t *cr = icr; |
| enum clnt_stat status; |
| struct rpc_err rpcerr; |
| struct timeval wait; |
| int timeo; /* in units of hz */ |
| int my_rsize, my_wsize; |
| bool_t tryagain; |
| bool_t cred_cloned = FALSE; |
| k_sigset_t smask; |
| servinfo_t *svp; |
| struct nfs_clnt *nfscl; |
| zoneid_t zoneid = getzoneid(); |
| #ifdef DEBUG |
| char *bufp; |
| #endif |
| |
| |
| TRACE_2(TR_FAC_NFS, TR_RFSCALL_START, |
| "rfscall_start:which %d mi %p", which, mi); |
| |
| nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone()); |
| ASSERT(nfscl != NULL); |
| |
| nfscl->nfscl_stat.calls.value.ui64++; |
| mi->mi_reqs[which].value.ui64++; |
| |
| rpcerr.re_status = RPC_SUCCESS; |
| |
| /* |
| * In case of forced unmount or zone shutdown, return EIO. |
| */ |
| |
| if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { |
| rpcerr.re_status = RPC_FAILED; |
| rpcerr.re_errno = EIO; |
| return (rpcerr.re_errno); |
| } |
| |
| /* |
| * Remember the transfer sizes in case |
| * nfs_feedback changes them underneath us. |
| */ |
| my_rsize = mi->mi_curread; |
| my_wsize = mi->mi_curwrite; |
| |
| /* |
| * NFS client failover support |
| * |
| * If this rnode is not in sync with the current server (VALID_FH), |
| * we'd like to do a remap to get in sync. We can be interrupted |
| * in failover_remap(), and if so we'll bail. Otherwise, we'll |
| * use the best info we have to try the RPC. Part of that is |
| * unconditionally updating the filehandle copy kept for V3. |
| * |
 * Locking: INC_READERS/DEC_READERS is a poor man's interruptible
 * rw_enter(); we're trying to keep the current server from being
 * changed on us until we're done with the remapping and have a
 * matching client handle.  We don't want to send a filehandle
 * to the wrong host.
| */ |
| failoverretry: |
| if (FAILOVER_MOUNT(mi)) { |
| mutex_enter(&mi->mi_lock); |
| if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) { |
| if (failover_wait(mi)) { |
| mutex_exit(&mi->mi_lock); |
| return (EINTR); |
| } |
| } |
| INC_READERS(mi); |
| mutex_exit(&mi->mi_lock); |
| if (fi) { |
| if (!VALID_FH(fi) && |
| !(flags & RFSCALL_SOFT) && failover_safe(fi)) { |
| int remaperr; |
| |
| svp = mi->mi_curr_serv; |
| remaperr = failover_remap(fi); |
| if (remaperr != 0) { |
| #ifdef DEBUG |
| if (remaperr != EINTR) |
| nfs_cmn_err(remaperr, CE_WARN, |
| "rfscall couldn't failover: %m"); |
| #endif |
| mutex_enter(&mi->mi_lock); |
| DEC_READERS(mi); |
| mutex_exit(&mi->mi_lock); |
| /* |
| * If failover_remap returns ETIMEDOUT |
 * and the filesystem is hard mounted,
| * we have to retry the call with a new |
| * server. |
| */ |
| if ((mi->mi_flags & MI_HARD) && |
| IS_RECOVERABLE_ERROR(remaperr)) { |
| if (svp == mi->mi_curr_serv) |
| failover_newserver(mi); |
| rpcerr.re_status = RPC_SUCCESS; |
| goto failoverretry; |
| } |
| rpcerr.re_errno = remaperr; |
| return (remaperr); |
| } |
| } |
| if (fi->fhp && fi->copyproc) |
| (*fi->copyproc)(fi->fhp, fi->vp); |
| } |
| } |
| |
| /* For TSOL, use a new cred which has net_mac_aware flag */ |
| if (!cred_cloned && is_system_labeled()) { |
| cred_cloned = TRUE; |
| cr = crdup(icr); |
| (void) setpflags(NET_MAC_AWARE, 1, cr); |
| } |
| |
| /* |
| * clget() calls clnt_tli_kinit() which clears the xid, so we |
| * are guaranteed to reprocess the retry as a new request. |
| */ |
| svp = mi->mi_curr_serv; |
| rpcerr.re_errno = nfs_clget(mi, svp, cr, &client, &ch, nfscl); |
| |
| if (FAILOVER_MOUNT(mi)) { |
| mutex_enter(&mi->mi_lock); |
| DEC_READERS(mi); |
| mutex_exit(&mi->mi_lock); |
| |
| if ((rpcerr.re_errno == ETIMEDOUT || |
| rpcerr.re_errno == ECONNRESET) && |
| failover_safe(fi)) { |
| if (svp == mi->mi_curr_serv) |
| failover_newserver(mi); |
| goto failoverretry; |
| } |
| } |
| if (rpcerr.re_errno != 0) |
| return (rpcerr.re_errno); |
| |
| if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD || |
| svp->sv_knconf->knc_semantics == NC_TPI_COTS) { |
| timeo = (mi->mi_timeo * hz) / 10; |
| } else { |
| mutex_enter(&mi->mi_lock); |
| timeo = CLNT_SETTIMERS(client, |
| &(mi->mi_timers[mi->mi_timer_type[which]]), |
| &(mi->mi_timers[NFS_CALLTYPES]), |
| (minimum_timeo[mi->mi_call_type[which]]*hz)>>3, |
| (void (*)())NULL, (caddr_t)mi, 0); |
| mutex_exit(&mi->mi_lock); |
| } |
| |
| /* |
| * If hard mounted fs, retry call forever unless hard error occurs. |
| */ |
| do { |
| tryagain = FALSE; |
| |
| if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { |
| status = RPC_FAILED; |
| rpcerr.re_status = RPC_FAILED; |
| rpcerr.re_errno = EIO; |
| break; |
| } |
| |
| TICK_TO_TIMEVAL(timeo, &wait); |
| |
| /* |
 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
 * and SIGTERM (preserving the existing masks).
 * Mask out SIGINT if the mount option nointr is specified.
| */ |
| sigintr(&smask, (int)mi->mi_flags & MI_INT); |
| if (!(mi->mi_flags & MI_INT)) |
| client->cl_nosignal = TRUE; |
| |
| /* |
| * If there is a current signal, then don't bother |
| * even trying to send out the request because we |
| * won't be able to block waiting for the response. |
| * Simply assume RPC_INTR and get on with it. |
| */ |
| if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING)) |
| status = RPC_INTR; |
| else { |
| status = CLNT_CALL(client, which, xdrargs, argsp, |
| xdrres, resp, wait); |
| } |
| |
| if (!(mi->mi_flags & MI_INT)) |
| client->cl_nosignal = FALSE; |
| /* |
| * restore original signal mask |
| */ |
| sigunintr(&smask); |
| |
| switch (status) { |
| case RPC_SUCCESS: |
| if ((mi->mi_flags & MI_DYNAMIC) && |
| mi->mi_timer_type[which] != 0 && |
| (mi->mi_curread != my_rsize || |
| mi->mi_curwrite != my_wsize)) |
| (void) nfs_feedback(FEEDBACK_OK, which, mi); |
| break; |
| |
| case RPC_INTR: |
| /* |
| * There is no way to recover from this error, |
| * even if mount option nointr is specified. |
| * SIGKILL, for example, cannot be blocked. |
| */ |
| rpcerr.re_status = RPC_INTR; |
| rpcerr.re_errno = EINTR; |
| break; |
| |
| case RPC_UDERROR: |
| /* |
| * If the NFS server is local (vold) and |
 * it goes away, then we get RPC_UDERROR.
 * This is a retryable error, so we would
 * otherwise loop; check whether the
 * specific error was ECONNRESET, which
 * indicates that the target did not
 * exist at all.  If so,
| * return with RPC_PROGUNAVAIL and |
| * ECONNRESET to indicate why. |
| */ |
| CLNT_GETERR(client, &rpcerr); |
| if (rpcerr.re_errno == ECONNRESET) { |
| rpcerr.re_status = RPC_PROGUNAVAIL; |
| rpcerr.re_errno = ECONNRESET; |
| break; |
| } |
| /*FALLTHROUGH*/ |
| |
| default: /* probably RPC_TIMEDOUT */ |
| if (IS_UNRECOVERABLE_RPC(status)) |
| break; |
| |
| /* |
| * increment server not responding count |
| */ |
| mutex_enter(&mi->mi_lock); |
| mi->mi_noresponse++; |
| mutex_exit(&mi->mi_lock); |
| #ifdef DEBUG |
| nfscl->nfscl_stat.noresponse.value.ui64++; |
| #endif |
| |
| if (!(mi->mi_flags & MI_HARD)) { |
| if (!(mi->mi_flags & MI_SEMISOFT) || |
| (mi->mi_ss_call_type[which] == 0)) |
| break; |
| } |
| |
| /* |
| * The call is in progress (over COTS). |
| * Try the CLNT_CALL again, but don't |
| * print a noisy error message. |
| */ |
| if (status == RPC_INPROGRESS) { |
| tryagain = TRUE; |
| break; |
| } |
| |
| if (flags & RFSCALL_SOFT) |
| break; |
| |
| /* |
| * On zone shutdown, just move on. |
| */ |
| if (zone_status_get(curproc->p_zone) >= |
| ZONE_IS_SHUTTING_DOWN) { |
| rpcerr.re_status = RPC_FAILED; |
| rpcerr.re_errno = EIO; |
| break; |
| } |
| |
| /* |
| * NFS client failover support |
| * |
| * If the current server just failed us, we'll |
| * start the process of finding a new server. |
| * After that, we can just retry. |
| */ |
| if (FAILOVER_MOUNT(mi) && failover_safe(fi)) { |
| if (svp == mi->mi_curr_serv) |
| failover_newserver(mi); |
| clfree_impl(client, ch, nfscl); |
| goto failoverretry; |
| } |
| |
| tryagain = TRUE; |
| timeo = backoff(timeo); |
| mutex_enter(&mi->mi_lock); |
| if (!(mi->mi_flags & MI_PRINTED)) { |
| mi->mi_flags |= MI_PRINTED; |
| mutex_exit(&mi->mi_lock); |
| #ifdef DEBUG |
| zprintf(zoneid, |
| "NFS%d server %s not responding still trying\n", |
| mi->mi_vers, svp->sv_hostname); |
| #else |
| zprintf(zoneid, |
| "NFS server %s not responding still trying\n", |
| svp->sv_hostname); |
| #endif |
| } else |
| mutex_exit(&mi->mi_lock); |
| if (*douprintf && curproc->p_sessp->s_vp != NULL) { |
| *douprintf = 0; |
| if (!(mi->mi_flags & MI_NOPRINT)) |
| #ifdef DEBUG |
| uprintf( |
| "NFS%d server %s not responding still trying\n", |
| mi->mi_vers, svp->sv_hostname); |
| #else |
| uprintf( |
| "NFS server %s not responding still trying\n", |
| svp->sv_hostname); |
| #endif |
| } |
| |
| /* |
| * If doing dynamic adjustment of transfer |
| * size and if it's a read or write call |
| * and if the transfer size changed while |
| * retransmitting or if the feedback routine |
| * changed the transfer size, |
| * then exit rfscall so that the transfer |
| * size can be adjusted at the vnops level. |
| */ |
| if ((mi->mi_flags & MI_DYNAMIC) && |
| mi->mi_timer_type[which] != 0 && |
| (mi->mi_curread != my_rsize || |
| mi->mi_curwrite != my_wsize || |
| nfs_feedback(FEEDBACK_REXMIT1, which, mi))) { |
| /* |
| * On read or write calls, return |
| * back to the vnode ops level if |
| * the transfer size changed. |
| */ |
| clfree_impl(client, ch, nfscl); |
| if (cred_cloned) |
| crfree(cr); |
| return (ENFS_TRYAGAIN); |
| } |
| } |
| } while (tryagain); |
| |
| if (status != RPC_SUCCESS) { |
| /* |
| * Let soft mounts use the timed out message. |
| */ |
| if (status == RPC_INPROGRESS) |
| status = RPC_TIMEDOUT; |
| nfscl->nfscl_stat.badcalls.value.ui64++; |
| if (status != RPC_INTR) { |
| mutex_enter(&mi->mi_lock); |
| mi->mi_flags |= MI_DOWN; |
| mutex_exit(&mi->mi_lock); |
| CLNT_GETERR(client, &rpcerr); |
| #ifdef DEBUG |
| bufp = clnt_sperror(client, svp->sv_hostname); |
| zprintf(zoneid, "NFS%d %s failed for %s\n", |
| mi->mi_vers, mi->mi_rfsnames[which], bufp); |
| if (curproc->p_sessp->s_vp != NULL) { |
| if (!(mi->mi_flags & MI_NOPRINT)) { |
| uprintf("NFS%d %s failed for %s\n", |
| mi->mi_vers, mi->mi_rfsnames[which], |
| bufp); |
| } |
| } |
| kmem_free(bufp, MAXPATHLEN); |
| #else |
| zprintf(zoneid, |
| "NFS %s failed for server %s: error %d (%s)\n", |
| mi->mi_rfsnames[which], svp->sv_hostname, |
| status, clnt_sperrno(status)); |
| if (curproc->p_sessp->s_vp != NULL) { |
| if (!(mi->mi_flags & MI_NOPRINT)) { |
| uprintf( |
| "NFS %s failed for server %s: error %d (%s)\n", |
| mi->mi_rfsnames[which], |
| svp->sv_hostname, status, |
| clnt_sperrno(status)); |
| } |
| } |
| #endif |
| /* |
| * when CLNT_CALL() fails with RPC_AUTHERROR, |
| * re_errno is set appropriately depending on |
| * the authentication error |
| */ |
| if (status == RPC_VERSMISMATCH || |
| status == RPC_PROGVERSMISMATCH) |
| rpcerr.re_errno = EIO; |
| } |
| } else { |
| /* |
 * Test the MI_DOWN and MI_PRINTED flags without
 * holding the mi_lock mutex.  If they are both clear,
 * then it is okay to skip the down and printed
 * processing.  This saves a mutex_enter and
 * mutex_exit pair for a normal, successful RPC,
 * which would otherwise be pure overhead.
| */ |
| if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) { |
| mutex_enter(&mi->mi_lock); |
| mi->mi_flags &= ~MI_DOWN; |
| if (mi->mi_flags & MI_PRINTED) { |
| mi->mi_flags &= ~MI_PRINTED; |
| mutex_exit(&mi->mi_lock); |
| #ifdef DEBUG |
| if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)) |
| zprintf(zoneid, "NFS%d server %s ok\n", |
| mi->mi_vers, svp->sv_hostname); |
| #else |
| if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)) |
| zprintf(zoneid, "NFS server %s ok\n", |
| svp->sv_hostname); |
| #endif |
| } else |
| mutex_exit(&mi->mi_lock); |
| } |
| |
| if (*douprintf == 0) { |
| if (!(mi->mi_flags & MI_NOPRINT)) |
| #ifdef DEBUG |
| if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)) |
| uprintf("NFS%d server %s ok\n", |
| mi->mi_vers, svp->sv_hostname); |
| #else |
| if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)) |
| uprintf("NFS server %s ok\n", svp->sv_hostname); |
| #endif |
| *douprintf = 1; |
| } |
| } |
| |
| clfree_impl(client, ch, nfscl); |
| if (cred_cloned) |
| crfree(cr); |
| |
| ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0); |
| |
| if (rpc_status != NULL) |
| *rpc_status = rpcerr.re_status; |
| |
| TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d", |
| rpcerr.re_errno); |
| |
| return (rpcerr.re_errno); |
| } |
| |
| #ifdef DEBUG |
| static int acl2call_hits = 0; |
| static int acl2call_misses = 0; |
| #endif |
| |
| int |
| acl2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, |
| xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf, |
| enum nfsstat *statusp, int flags, failinfo_t *fi) |
| { |
| int rpcerror; |
| |
| rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp, |
| cr, douprintf, flags, fi); |
| if (!rpcerror) { |
| /* |
| * See comments with crnetadjust(). |
| */ |
| if (*statusp == NFSERR_ACCES && |
| (cr = crnetadjust(cr)) != NULL) { |
| #ifdef DEBUG |
| acl2call_hits++; |
| #endif |
| rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, |
| resp, cr, douprintf, flags, fi); |
| crfree(cr); |
| #ifdef DEBUG |
| if (*statusp == NFSERR_ACCES) |
| acl2call_misses++; |
| #endif |
| } |
| } |
| |
| return (rpcerror); |
| } |
| |
| #ifdef DEBUG |
| static int acl3call_hits = 0; |
| static int acl3call_misses = 0; |
| #endif |
| |
| int |
| acl3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, |
| xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf, |
| nfsstat3 *statusp, int flags, failinfo_t *fi) |
| { |
| int rpcerror; |
| int user_informed; |
| |
| user_informed = 0; |
| |
| do { |
| rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp, |
| cr, douprintf, flags, fi); |
| if (!rpcerror) { |
| cred_t *crr; |
| if (*statusp == NFS3ERR_JUKEBOX) { |
| if (!user_informed) { |
| user_informed = 1; |
| uprintf( |
| "file temporarily unavailable on the server, retrying...\n"); |
| } |
| delay(nfs3_jukebox_delay); |
| } |
| /* |
| * See crnetadjust() for comments. |
| */ |
| else if (*statusp == NFS3ERR_ACCES && |
| (crr = crnetadjust(cr)) != NULL) { |
| #ifdef DEBUG |
| acl3call_hits++; |
| #endif |
| rpcerror = aclcall(mi, which, xdrargs, argsp, |
| xdrres, resp, crr, douprintf, flags, fi); |
| |
| crfree(crr); |
| #ifdef DEBUG |
| if (*statusp == NFS3ERR_ACCES) |
| acl3call_misses++; |
| #endif |
| } |
| } |
| } while (!rpcerror && *statusp == NFS3ERR_JUKEBOX); |
| |
| return (rpcerror); |
| } |
| |
| static int |
| aclcall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, |
| xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf, |
| int flags, failinfo_t *fi) |
| { |
| CLIENT *client; |
| struct chtab *ch; |
| cred_t *cr = icr; |
| bool_t cred_cloned = FALSE; |
| enum clnt_stat status; |
| struct rpc_err rpcerr; |
| struct timeval wait; |
| int timeo; /* in units of hz */ |
| #if 0 /* notyet */ |
| int my_rsize, my_wsize; |
| #endif |
| bool_t tryagain; |
| k_sigset_t smask; |
| servinfo_t *svp; |
| struct nfs_clnt *nfscl; |
| zoneid_t zoneid = getzoneid(); |
| #ifdef DEBUG |
| char *bufp; |
| #endif |
| |
| #if 0 /* notyet */ |
| TRACE_2(TR_FAC_NFS, TR_RFSCALL_START, |
| "rfscall_start:which %d mi %p", which, mi); |
| #endif |
| |
| nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone()); |
| ASSERT(nfscl != NULL); |
| |
| nfscl->nfscl_stat.calls.value.ui64++; |
| mi->mi_aclreqs[which].value.ui64++; |
| |
| rpcerr.re_status = RPC_SUCCESS; |
| |
| if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { |
| rpcerr.re_status = RPC_FAILED; |
| rpcerr.re_errno = EIO; |
| return (rpcerr.re_errno); |
| } |
| |
| #if 0 /* notyet */ |
| /* |
| * Remember the transfer sizes in case |
| * nfs_feedback changes them underneath us. |
| */ |
| my_rsize = mi->mi_curread; |
| my_wsize = mi->mi_curwrite; |
| #endif |
| |
| /* |
| * NFS client failover support |
| * |
| * If this rnode is not in sync with the current server (VALID_FH), |
| * we'd like to do a remap to get in sync. We can be interrupted |
| * in failover_remap(), and if so we'll bail. Otherwise, we'll |
| * use the best info we have to try the RPC. Part of that is |
| * unconditionally updating the filehandle copy kept for V3. |
| * |
 * Locking: INC_READERS/DEC_READERS is a poor man's interruptible
 * rw_enter(); we're trying to keep the current server from being
 * changed on us until we're done with the remapping and have a
 * matching client handle.  We don't want to send a filehandle
 * to the wrong host.
| */ |
| failoverretry: |
| if (FAILOVER_MOUNT(mi)) { |
| mutex_enter(&mi->mi_lock); |
| if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) { |
| if (failover_wait(mi)) { |
| mutex_exit(&mi->mi_lock); |
| return (EINTR); |
| } |
| } |
| INC_READERS(mi); |
| mutex_exit(&mi->mi_lock); |
| if (fi) { |
| if (!VALID_FH(fi) && |
| !(flags & RFSCALL_SOFT) && failover_safe(fi)) { |
| int remaperr; |
| |
| svp = mi->mi_curr_serv; |
| remaperr = failover_remap(fi); |
| if (remaperr != 0) { |
| #ifdef DEBUG |
| if (remaperr != EINTR) |
| nfs_cmn_err(remaperr, CE_WARN, |
| "aclcall couldn't failover: %m"); |
| #endif |
| mutex_enter(&mi->mi_lock); |
| DEC_READERS(mi); |
| mutex_exit(&mi->mi_lock); |
| |
| /* |
| * If failover_remap returns ETIMEDOUT |
 * and the filesystem is hard mounted,
| * we have to retry the call with a new |
| * server. |
| */ |
| if ((mi->mi_flags & MI_HARD) && |
| IS_RECOVERABLE_ERROR(remaperr)) { |
| if (svp == mi->mi_curr_serv) |
| failover_newserver(mi); |
| rpcerr.re_status = RPC_SUCCESS; |
| goto failoverretry; |
| } |
| return (remaperr); |
| } |
| } |
| if (fi->fhp && fi->copyproc) |
| (*fi->copyproc)(fi->fhp, fi->vp); |
| } |
| } |
| |
| /* For TSOL, use a new cred which has net_mac_aware flag */ |
| if (!cred_cloned && is_system_labeled()) { |
| cred_cloned = TRUE; |
| cr = crdup(icr); |
| (void) setpflags(NET_MAC_AWARE, 1, cr); |
| } |
| |
| /* |
| * acl_clget() calls clnt_tli_kinit() which clears the xid, so we |
| * are guaranteed to reprocess the retry as a new request. |
| */ |
| svp = mi->mi_curr_serv; |
| rpcerr.re_errno = acl_clget(mi, svp, cr, &client, &ch, nfscl); |
| if (FAILOVER_MOUNT(mi)) { |
| mutex_enter(&mi->mi_lock); |
| DEC_READERS(mi); |
| mutex_exit(&mi->mi_lock); |
| |
| if ((rpcerr.re_errno == ETIMEDOUT || |
| rpcerr.re_errno == ECONNRESET) && |
| failover_safe(fi)) { |
| if (svp == mi->mi_curr_serv) |
| failover_newserver(mi); |
| goto failoverretry; |
| } |
| } |
| if (rpcerr.re_errno != 0) { |
| if (cred_cloned) |
| crfree(cr); |
| return (rpcerr.re_errno); |
| } |
| |
| if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD || |
| svp->sv_knconf->knc_semantics == NC_TPI_COTS) { |
| timeo = (mi->mi_timeo * hz) / 10; |
| } else { |
| mutex_enter(&mi->mi_lock); |
| timeo = CLNT_SETTIMERS(client, |
| &(mi->mi_timers[mi->mi_acl_timer_type[which]]), |
| &(mi->mi_timers[NFS_CALLTYPES]), |
| (minimum_timeo[mi->mi_acl_call_type[which]]*hz)>>3, |
| (void (*)()) 0, (caddr_t)mi, 0); |
| mutex_exit(&mi->mi_lock); |
| } |
| |
| /* |
| * If hard mounted fs, retry call forever unless hard error occurs. |
| */ |
| do { |
| tryagain = FALSE; |
| |
| if (FS_OR_ZONE_GONE(mi->mi_vfsp)) { |
| status = RPC_FAILED; |
| rpcerr.re_status = RPC_FAILED; |
| rpcerr.re_errno = EIO; |
| break; |
| } |
| |
| TICK_TO_TIMEVAL(timeo, &wait); |
| |
| /* |
 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
 * and SIGTERM (preserving the existing masks).
 * Mask out SIGINT if the mount option nointr is specified.
| */ |
| sigintr(&smask, (int)mi->mi_flags & MI_INT); |
| if (!(mi->mi_flags & MI_INT)) |
| client->cl_nosignal = TRUE; |
| |
| /* |
| * If there is a current signal, then don't bother |
| * even trying to send out the request because we |
| * won't be able to block waiting for the response. |
| * Simply assume RPC_INTR and get on with it. |
| */ |
| if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING)) |
| status = RPC_INTR; |
| else { |
| status = CLNT_CALL(client, which, xdrargs, argsp, |
| xdrres, resp, wait); |
| } |
| |
| if (!(mi->mi_flags & MI_INT)) |
| client->cl_nosignal = FALSE; |
| /* |
| * restore original signal mask |
| */ |
| sigunintr(&smask); |
| |
| switch (status) { |
| case RPC_SUCCESS: |
| #if 0 /* notyet */ |
| if ((mi->mi_flags & MI_DYNAMIC) && |
| mi->mi_timer_type[which] != 0 && |
| (mi->mi_curread != my_rsize || |
| mi->mi_curwrite != my_wsize)) |
| (void) nfs_feedback(FEEDBACK_OK, which, mi); |
| #endif |
| break; |
| |
| /* |
| * Unfortunately, there are servers in the world which |
| * are not coded correctly. They are not prepared to |
| * handle RPC requests to the NFS port which are not |
| * NFS requests. Thus, they may try to process the |
| * NFS_ACL request as if it were an NFS request. This |
| * does not work. Generally, an error will be generated |
| * on the client because it will not be able to decode |
| * the response from the server. However, it seems |
| * possible that the server may not be able to decode |
 * the arguments.  Thus, the criterion for deciding
 * whether the server supports NFS_ACL or not is whether
 * any of the following RPC errors are returned from CLNT_CALL.
| */ |
| case RPC_CANTDECODERES: |
| case RPC_PROGUNAVAIL: |
| case RPC_CANTDECODEARGS: |
| case RPC_PROGVERSMISMATCH: |
| mutex_enter(&mi->mi_lock); |
| mi->mi_flags &= ~(MI_ACL | MI_EXTATTR); |
| mutex_exit(&mi->mi_lock); |
| break; |
| |
| /* |
| * If the server supports NFS_ACL but not the new ops |
| * for extended attributes, make sure we don't retry. |
| */ |
| case RPC_PROCUNAVAIL: |
| mutex_enter(&mi->mi_lock); |
| mi->mi_flags &= ~MI_EXTATTR; |
| mutex_exit(&mi->mi_lock); |
| break; |
| |
| case RPC_INTR: |
| /* |
| * There is no way to recover from this error, |
| * even if mount option nointr is specified. |
| * SIGKILL, for example, cannot be blocked. |
| */ |
| rpcerr.re_status = RPC_INTR; |
| rpcerr.re_errno = EINTR; |
| break; |
| |
| case RPC_UDERROR: |
| /* |
| * If the NFS server is local (vold) and |
 * it goes away, then we get RPC_UDERROR.
 * This is a retryable error, so we would
 * otherwise loop; check whether the
 * specific error was ECONNRESET, which
 * indicates that the target did not
 * exist at all.  If so,
| * return with RPC_PROGUNAVAIL and |
| * ECONNRESET to indicate why. |
| */ |
| CLNT_GETERR(client, &rpcerr); |
| if (rpcerr.re_errno == ECONNRESET) { |
| rpcerr.re_status = RPC_PROGUNAVAIL; |
| rpcerr.re_errno = ECONNRESET; |
| break; |
| } |
| /*FALLTHROUGH*/ |
| |
| default: /* probably RPC_TIMEDOUT */ |
| if (IS_UNRECOVERABLE_RPC(status)) |
| break; |
| |
| /* |
| * increment server not responding count |
| */ |
| mutex_enter(&mi->mi_lock); |
| mi->mi_noresponse++; |
| mutex_exit(&mi->mi_lock); |
| #ifdef DEBUG |
| nfscl->nfscl_stat.noresponse.value.ui64++; |
| #endif |
| |
| if (!(mi->mi_flags & MI_HARD)) { |
| if (!(mi->mi_flags & MI_SEMISOFT) || |
| (mi->mi_acl_ss_call_type[which] == 0)) |
| break; |
| } |
| |
| /* |
| * The call is in progress (over COTS). |
| * Try the CLNT_CALL again, but don't |
| * print a noisy error message. |
| */ |
| if (status == RPC_INPROGRESS) { |
| tryagain = TRUE; |
| break; |
| } |
| |
| if (flags & RFSCALL_SOFT) |
| break; |
| |
| /* |
| * On zone shutdown, just move on. |
| */ |
| if (zone_status_get(curproc->p_zone) >= |
| ZONE_IS_SHUTTING_DOWN) { |
| rpcerr.re_status = RPC_FAILED; |
| rpcerr.re_errno = EIO; |
| break; |
| } |
| |
| /* |
| * NFS client failover support |
| * |
| * If the current server just failed us, we'll |
| * start the process of finding a new server. |
| * After that, we can just retry. |
| */ |
| if (FAILOVER_MOUNT(mi) && failover_safe(fi)) { |
| if (svp == mi->mi_curr_serv) |
| failover_newserver(mi); |
| clfree_impl(client, ch, nfscl); |
| goto failoverretry; |
| } |
| |
| tryagain = TRUE; |
| timeo = backoff(timeo); |
| mutex_enter(&mi->mi_lock); |
| if (!(mi->mi_flags & MI_PRINTED)) { |
| mi->mi_flags |= MI_PRINTED; |
| mutex_exit(&mi->mi_lock); |
| #ifdef DEBUG |
| zprintf(zoneid, |
| "NFS_ACL%d server %s not responding still trying\n", |
| mi->mi_vers, svp->sv_hostname); |
| #else |
| zprintf(zoneid, |
| "NFS server %s not responding still trying\n", |
| svp->sv_hostname); |
| #endif |
| } else |
| mutex_exit(&mi->mi_lock); |
| if (*douprintf && curproc->p_sessp->s_vp != NULL) { |
| *douprintf = 0; |
| if (!(mi->mi_flags & MI_NOPRINT)) |
| #ifdef DEBUG |
| uprintf( |
| "NFS_ACL%d server %s not responding still trying\n", |
| mi->mi_vers, svp->sv_hostname); |
| #else |
| uprintf( |
| "NFS server %s not responding still trying\n", |
| svp->sv_hostname); |
| #endif |
| } |
| |
| #if 0 /* notyet */ |
| /* |
| * If doing dynamic adjustment of transfer |
| * size and if it's a read or write call |
| * and if the transfer size changed while |
| * retransmitting or if the feedback routine |
| * changed the transfer size, |
| * then exit rfscall so that the transfer |
| * size can be adjusted at the vnops level. |
| */ |
| if ((mi->mi_flags & MI_DYNAMIC) && |
| mi->mi_acl_timer_type[which] != 0 && |
| (mi->mi_curread != my_rsize || |
| mi->mi_curwrite != my_wsize || |
| nfs_feedback(FEEDBACK_REXMIT1, which, mi))) { |
| /* |
| * On read or write calls, return |
| * back to the vnode ops level if |
| * the transfer size changed. |
| */ |
| clfree_impl(client, ch, nfscl); |
| if (cred_cloned) |
| crfree(cr); |
| return (ENFS_TRYAGAIN); |
| } |
| #endif |
| } |
| } while (tryagain); |
| |
| if (status != RPC_SUCCESS) { |
| /* |
| * Let soft mounts use the timed out message. |
| */ |
| if (status == RPC_INPROGRESS) |
| status = RPC_TIMEDOUT; |
| nfscl->nfscl_stat.badcalls.value.ui64++; |
| if (status == RPC_CANTDECODERES || |
| status == RPC_PROGUNAVAIL || |
| status == RPC_PROCUNAVAIL || |
| status == RPC_CANTDECODEARGS || |
| status == RPC_PROGVERSMISMATCH) |
| CLNT_GETERR(client, &rpcerr); |
| else if (status != RPC_INTR) { |
| mutex_enter(&mi->mi_lock); |
| mi->mi_flags |= MI_DOWN; |
| mutex_exit(&mi->mi_lock); |
| CLNT_GETERR(client, &rpcerr); |
| #ifdef DEBUG |
| bufp = clnt_sperror(client, svp->sv_hostname); |
| zprintf(zoneid, "NFS_ACL%d %s failed for %s\n", |
| mi->mi_vers, mi->mi_aclnames[which], bufp); |
| if (curproc->p_sessp->s_vp != NULL) { |
| if (!(mi->mi_flags & MI_NOPRINT)) { |
| uprintf("NFS_ACL%d %s failed for %s\n", |
| mi->mi_vers, mi->mi_aclnames[which], |
| bufp); |
| } |
| } |
| kmem_free(bufp, MAXPATHLEN); |
| #else |
| zprintf(zoneid, |
| "NFS %s failed for server %s: error %d (%s)\n", |
| mi->mi_aclnames[which], svp->sv_hostname, |
| status, clnt_sperrno(status)); |
| if (curproc->p_sessp->s_vp != NULL) { |
| if (!(mi->mi_flags & MI_NOPRINT)) |
| uprintf( |
| "NFS %s failed for server %s: error %d (%s)\n", |
| mi->mi_aclnames[which], |
| svp->sv_hostname, status, |
| clnt_sperrno(status)); |
| } |
| #endif |
| /* |
| * when CLNT_CALL() fails with RPC_AUTHERROR, |
| * re_errno is set appropriately depending on |
| * the authentication error |
| */ |
| if (status == RPC_VERSMISMATCH || |
| status == RPC_PROGVERSMISMATCH) |
| rpcerr.re_errno = EIO; |
| } |
| } else { |
| /* |
 * Test the MI_DOWN and MI_PRINTED flags without
 * holding the mi_lock mutex.  If they are both clear,
 * then it is okay to skip the down and printed
 * processing.  This saves a mutex_enter and
 * mutex_exit pair for a normal, successful RPC,
 * which would otherwise be pure overhead.
| */ |
| if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) { |
| mutex_enter(&mi->mi_lock); |
| mi->mi_flags &= ~MI_DOWN; |
| if (mi->mi_flags & MI_PRINTED) { |
| mi->mi_flags &= ~MI_PRINTED; |
| mutex_exit(&mi->mi_lock); |
| #ifdef DEBUG |
| zprintf(zoneid, "NFS_ACL%d server %s ok\n", |
| mi->mi_vers, svp->sv_hostname); |
| #else |
| zprintf(zoneid, "NFS server %s ok\n", |
| svp->sv_hostname); |
| #endif |
| } else |
| mutex_exit(&mi->mi_lock); |
| } |
| |
| if (*douprintf == 0) { |
| if (!(mi->mi_flags & MI_NOPRINT)) |
| #ifdef DEBUG |
| uprintf("NFS_ACL%d server %s ok\n", |
| mi->mi_vers, svp->sv_hostname); |
| #else |
| uprintf("NFS server %s ok\n", svp->sv_hostname); |
| #endif |
| *douprintf = 1; |
| } |
| } |
| |
| clfree_impl(client, ch, nfscl); |
| if (cred_cloned) |
| crfree(cr); |
| |
| ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0); |
| |
| #if 0 /* notyet */ |
| TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d", |
| rpcerr.re_errno); |
| #endif |
| |
| return (rpcerr.re_errno); |
| } |
| |
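/*
 * Convert a vattr into an over-the-wire NFS version 2 sattr.  Fields
 * not selected in va_mask are set to (uint32_t)-1, which the protocol
 * treats as "do not change".  Returns EOVERFLOW if a time value will
 * not fit in the protocol's 32-bit fields.
 */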
| int |
| vattr_to_sattr(struct vattr *vap, struct nfssattr *sa) |
| { |
| uint_t mask = vap->va_mask; |
| |
| if (!(mask & AT_MODE)) |
| sa->sa_mode = (uint32_t)-1; |
| else |
| sa->sa_mode = vap->va_mode; |
| if (!(mask & AT_UID)) |
| sa->sa_uid = (uint32_t)-1; |
| else |
| sa->sa_uid = (uint32_t)vap->va_uid; |
| if (!(mask & AT_GID)) |
| sa->sa_gid = (uint32_t)-1; |
| else |
| sa->sa_gid = (uint32_t)vap->va_gid; |
| if (!(mask & AT_SIZE)) |
| sa->sa_size = (uint32_t)-1; |
| else |
| sa->sa_size = (uint32_t)vap->va_size; |
| if (!(mask & AT_ATIME)) |
| sa->sa_atime.tv_sec = sa->sa_atime.tv_usec = (int32_t)-1; |
| else { |
| /* check time validity */ |
| if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) { |
| return (EOVERFLOW); |
| } |
| sa->sa_atime.tv_sec = vap->va_atime.tv_sec; |
| sa->sa_atime.tv_usec = vap->va_atime.tv_nsec / 1000; |
| } |
| if (!(mask & AT_MTIME)) |
| sa->sa_mtime.tv_sec = sa->sa_mtime.tv_usec = (int32_t)-1; |
| else { |
| /* check time validity */ |
| if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) { |
| return (EOVERFLOW); |
| } |
| sa->sa_mtime.tv_sec = vap->va_mtime.tv_sec; |
| sa->sa_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000; |
| } |
| return (0); |
| } |
| |
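| /* |
| * Convert a vattr into an NFS Version 3 sattr3.  Unlike the Version 2 |
| * form, each sattr3 field carries an explicit set_it discriminator, |
| * so unselected attributes are marked FALSE (or DONT_CHANGE for the |
| * times) rather than being encoded as -1. |
| */ |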
| int |
| vattr_to_sattr3(struct vattr *vap, sattr3 *sa) |
| { |
| uint_t mask = vap->va_mask; |
| |
| if (!(mask & AT_MODE)) |
| sa->mode.set_it = FALSE; |
| else { |
| sa->mode.set_it = TRUE; |
| sa->mode.mode = (mode3)vap->va_mode; |
| } |
| if (!(mask & AT_UID)) |
| sa->uid.set_it = FALSE; |
| else { |
| sa->uid.set_it = TRUE; |
| sa->uid.uid = (uid3)vap->va_uid; |
| } |
| if (!(mask & AT_GID)) |
| sa->gid.set_it = FALSE; |
| else { |
| sa->gid.set_it = TRUE; |
| sa->gid.gid = (gid3)vap->va_gid; |
| } |
| if (!(mask & AT_SIZE)) |
| sa->size.set_it = FALSE; |
| else { |
| sa->size.set_it = TRUE; |
| sa->size.size = (size3)vap->va_size; |
| } |
| if (!(mask & AT_ATIME)) |
| sa->atime.set_it = DONT_CHANGE; |
| else { |
| /* check time validity */ |
| if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) { |
| return (EOVERFLOW); |
| } |
| sa->atime.set_it = SET_TO_CLIENT_TIME; |
| sa->atime.atime.seconds = (uint32)vap->va_atime.tv_sec; |
| sa->atime.atime.nseconds = (uint32)vap->va_atime.tv_nsec; |
| } |
| if (!(mask & AT_MTIME)) |
| sa->mtime.set_it = DONT_CHANGE; |
| else { |
| /* check time validity */ |
| if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) { |
| return (EOVERFLOW); |
| } |
| sa->mtime.set_it = SET_TO_CLIENT_TIME; |
| sa->mtime.mtime.seconds = (uint32)vap->va_mtime.tv_sec; |
| sa->mtime.mtime.nseconds = (uint32)vap->va_mtime.tv_nsec; |
| } |
| return (0); |
| } |
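| |
| /* |
| * As an illustration of vattr_to_sattr3() only (a hypothetical |
| * caller, not code from this file): a truncate-to-zero request |
| * would select just the size: |
| * |
| * struct vattr va; |
| * sattr3 sa; |
| * |
| * va.va_mask = AT_SIZE; |
| * va.va_size = 0; |
| * error = vattr_to_sattr3(&va, &sa); |
| * |
| * On success, sa.size.set_it is TRUE and sa.size.size is 0, while |
| * every other set_it field is left FALSE (DONT_CHANGE for the |
| * times). |
| */ |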
| |
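| /* |
| * Fill in the NFS Version 2 directory-operation arguments: the |
| * directory's file handle plus the component name being operated on. |
| */ |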
| void |
| setdiropargs(struct nfsdiropargs *da, char *nm, vnode_t *dvp) |
| { |
| |
| da->da_fhandle = VTOFH(dvp); |
| da->da_name = nm; |
| da->da_flags = 0; |
| } |
| |
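| /* |
| * The NFS Version 3 analogue of setdiropargs(). |
| */ |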
| void |
| setdiropargs3(diropargs3 *da, char *nm, vnode_t *dvp) |
| { |
| |
| da->dirp = VTOFH3(dvp); |
| da->name = nm; |
| } |
| |
| int |
| setdirgid(vnode_t *dvp, gid_t *gidp, cred_t *cr) |
| { |
| int error; |
| rnode_t *rp; |
| struct vattr va; |
| |
| va.va_mask = AT_MODE | AT_GID; |
| error = VOP_GETATTR(dvp, &va, 0, cr); |
| if (error) |
| return (error); |
| |
| /* |
| * To determine the expected group-id of the created file: |
| * 1) If the filesystem was not mounted with the Old-BSD-compatible |
| * GRPID option, and the directory's set-gid bit is clear, |
| * then use the process's gid. |
| * 2) Otherwise, set the group-id to the gid of the parent directory. |
| */ |
| rp = VTOR(dvp); |
| mutex_enter(&rp->r_statelock); |
| if (!(VTOMI(dvp)->mi_flags & MI_GRPID) && !(va.va_mode & VSGID)) |
| *gidp = crgetgid(cr); |
| else |
| *gidp = va.va_gid; |
| mutex_exit(&rp->r_statelock); |
| return (0); |
| } |
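| |
| /* |
| * For example, for setdirgid() above (hypothetical values): in a |
| * directory that is mode 2775 (set-gid) with group staff, a file |
| * created by a process whose gid is other inherits group staff |
| * regardless of the GRPID mount option; in the same directory |
| * without the set-gid bit, and without GRPID, the new file gets |
| * group other. |
| */ |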
| |
| int |
| setdirmode(vnode_t *dvp, mode_t *omp, cred_t *cr) |
| { |
| int error; |
| struct vattr va; |
| |
| va.va_mask = AT_MODE; |
| error = VOP_GETATTR(dvp, &va, 0, cr); |
| if (error) |
| return (error); |
| |
| /* |
| * Modify the expected mode (om) so that the set-gid bit matches |
| * that of the parent directory (dvp). |
| */ |
| if (va.va_mode & VSGID) |
| *omp |= VSGID; |
| else |
| *omp &= ~VSGID; |
| return (0); |
| } |
| |
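| /* |
| * Keep the vnode's VSWAPLIKE flag in sync with the file's mode: a |
| * regular file whose sticky (VSVTX) bit is set but whose execute |
| * (VEXEC) bit is clear is treated as "swap-like" by the VM system. |
| */ |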
| void |
| nfs_setswaplike(vnode_t *vp, vattr_t *vap) |
| { |
| |
| if (vp->v_type == VREG && (vap->va_mode & (VEXEC | VSVTX)) == VSVTX) { |
| if (!(vp->v_flag & VSWAPLIKE)) { |
| mutex_enter(&vp->v_lock); |
| vp->v_flag |= VSWAPLIKE; |
| mutex_exit(&vp->v_lock); |
| } |
| } else { |
| if (vp->v_flag & VSWAPLIKE) { |
| mutex_enter(&vp->v_lock); |
| vp->v_flag &= ~VSWAPLIKE; |
| mutex_exit(&vp->v_lock); |
| } |
| } |
| } |
| |
| /* |
| * Free the resources associated with an rnode. |
| */ |
| static void |
| rinactive(rnode_t *rp, cred_t *cr) |
| { |
| vnode_t *vp; |
| cred_t *cred; |
| char *contents; |
| int size; |
| vsecattr_t *vsp; |
| int error; |
| nfs3_pathconf_info *info; |
| |
| /* |
| * Before freeing anything, wait until all asynchronous |
| * activity is done on this rnode. This will allow all |
| * asynchronous read ahead and write behind i/o's to |
| * finish. |
| */ |
| mutex_enter(&rp->r_statelock); |
| while (rp->r_count > 0) |
| cv_wait(&rp->r_cv, &rp->r_statelock); |
| mutex_exit(&rp->r_statelock); |
| |
| /* |
| * Flush and invalidate all pages associated with the vnode. |
| */ |
| vp = RTOV(rp); |
| if (vn_has_cached_data(vp)) { |
| ASSERT(vp->v_type != VCHR); |
| if ((rp->r_flags & RDIRTY) && !rp->r_error) { |
| error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, 0, cr); |
| if (error && (error == ENOSPC || error == EDQUOT)) { |
| mutex_enter(&rp->r_statelock); |
| if (!rp->r_error) |
| rp->r_error = error; |
| mutex_exit(&rp->r_statelock); |
| } |
| } |
| nfs_invalidate_pages(vp, (u_offset_t)0, cr); |
| } |
| |
| /* |
| * Free any held credentials and caches which may be associated |
| * with this rnode. |
| */ |
| mutex_enter(&rp->r_statelock); |
| cred = rp->r_cred; |
| rp->r_cred = NULL; |
| contents = rp->r_symlink.contents; |
| size = rp->r_symlink.size; |
| rp->r_symlink.contents = NULL; |
| vsp = rp->r_secattr; |
| rp->r_secattr = NULL; |
| info = rp->r_pathconf; |
| rp->r_pathconf = NULL; |
| mutex_exit(&rp->r_statelock); |
| |
| /* |
| * Free the held credential. |
| */ |
| if (cred != NULL) |
| crfree(cred); |
| |
| /* |
| * Free the access cache entries. |
| */ |
| (void) nfs_access_purge_rp(rp); |
| |
| /* |
| * Free the readdir cache entries. |
| */ |
| if (HAVE_RDDIR_CACHE(rp)) |
| nfs_purge_rddir_cache(vp); |
| |
| /* |
| * Free the symbolic link cache. |
| */ |
| if (contents != NULL) |
| kmem_free((void *)contents, size); |
| |
| /* |
| * Free any cached ACL. |
| */ |
| if (vsp != NULL) |
| nfs_acl_free(vsp); |
| |
| /* |
| * Free any cached pathconf information. |
| */ |
| if (info != NULL) |
| kmem_free(info, sizeof (*info)); |
| } |
| |
| /* |
| * Return a vnode for the given NFS Version 2 file handle. |
| * If no rnode exists for this fhandle, create one and put it |
| * into the hash queues. If the rnode for this fhandle |
| * already exists, return it. |
| * |
| * Note: make_rnode() may upgrade the hash bucket lock to exclusive. |
| */ |
| vnode_t * |
| makenfsnode(fhandle_t *fh, struct nfsfattr *attr, struct vfs *vfsp, |
| hrtime_t t, cred_t *cr, char *dnm, char *nm) |
| { |
| int newnode; |
| int index; |
| vnode_t *vp; |
| nfs_fhandle nfh; |
| vattr_t va; |
| |
| nfh.fh_len = NFS_FHSIZE; |
| bcopy(fh, nfh.fh_buf, NFS_FHSIZE); |
| |
| index = rtablehash(&nfh); |
| rw_enter(&rtable[index].r_lock, RW_READER); |
| |
| vp = make_rnode(&nfh, &rtable[index], vfsp, nfs_vnodeops, |
| nfs_putapage, nfs_rddir_compar, &newnode, cr, dnm, nm); |
| |
| if (attr != NULL) { |
| if (!newnode) { |
| rw_exit(&rtable[index].r_lock); |
| (void) nfs_cache_fattr(vp, attr, &va, t, cr); |
| } else { |
| if (attr->na_type < NFNON || attr->na_type > NFSOC) |
| vp->v_type = VBAD; |
| else |
| vp->v_type = n2v_type(attr); |
| /* |
| * A translation here seems to be necessary |
| * because this function can be called |
| * with `attr' that has come from the wire, |
| * and been operated on by vattr_to_nattr(). |
| * See nfsrootvp()->VOP_GETATTR()->nfsgetattr() |
| * ->nfs_getattr_otw()->rfscall()->vattr_to_nattr() |
| * ->makenfsnode(). |
| */ |
| if ((attr->na_rdev & 0xffff0000) == 0) |
| vp->v_rdev = nfsv2_expdev(attr->na_rdev); |
| else |
| vp->v_rdev = expldev(n2v_rdev(attr)); |
| nfs_attrcache(vp, attr, t); |
| rw_exit(&rtable[index].r_lock); |
| } |
| } else { |
| if (newnode) { |
| PURGE_ATTRCACHE(vp); |
| } |
| rw_exit(&rtable[index].r_lock); |
| } |
| |
| return (vp); |
| } |
| |
| /* |
| * Return a vnode for the given NFS Version 3 file handle. |
| * If no rnode exists for this fhandle, create one and put it |
| * into the hash queues. If the rnode for this fhandle |
| * already exists, return it. |
| * |
| * Note: make_rnode() may upgrade the hash bucket lock to exclusive. |
| */ |
| vnode_t * |
| makenfs3node_va(nfs_fh3 *fh, vattr_t *vap, struct vfs *vfsp, hrtime_t t, |
| cred_t *cr, char *dnm, char *nm) |
| { |
| int newnode; |
| int index; |
| vnode_t *vp; |
| |
| index = rtablehash((nfs_fhandle *)fh); |
| rw_enter(&rtable[index].r_lock, RW_READER); |
| |
| vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp, |
| nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr, |
| dnm, nm); |
| |
| if (vap == NULL) { |
| if (newnode) { |
| PURGE_ATTRCACHE(vp); |
| } |
| rw_exit(&rtable[index].r_lock); |
| return (vp); |
| } |
| |
| if (!newnode) { |
| rw_exit(&rtable[index].r_lock); |
| nfs_attr_cache(vp, vap, t, cr); |
| } else { |
| rnode_t *rp = VTOR(vp); |
| |
| vp->v_type = vap->va_type; |
| vp->v_rdev = vap->va_rdev; |
| |
| mutex_enter(&rp->r_statelock); |
| if (rp->r_mtime <= t) |
| nfs_attrcache_va(vp, vap); |
| mutex_exit(&rp->r_statelock); |
| rw_exit(&rtable[index].r_lock); |
| } |
| |
| return (vp); |
| } |
| |
| vnode_t * |
| makenfs3node(nfs_fh3 *fh, fattr3 *attr, struct vfs *vfsp, hrtime_t t, |
| cred_t *cr, char *dnm, char *nm) |
| { |
| int newnode; |
| int index; |
| vnode_t *vp; |
| vattr_t va; |
| |
| index = rtablehash((nfs_fhandle *)fh); |
| rw_enter(&rtable[index].r_lock, RW_READER); |
| |
| vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp, |
| nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr, |
| dnm, nm); |
| |
| if (attr == NULL) { |
| if (newnode) { |
| PURGE_ATTRCACHE(vp); |
| } |
| rw_exit(&rtable[index].r_lock); |
| return (vp); |
| } |
| |
| if (!newnode) { |
| rw_exit(&rtable[index].r_lock); |
| (void) nfs3_cache_fattr3(vp, attr, &va, t, cr); |
| } else { |
| if (attr->type < NF3REG || attr->type > NF3FIFO) |
| vp->v_type = VBAD; |
| else |
| vp->v_type = nf3_to_vt[attr->type]; |
| vp->v_rdev = makedevice(attr->rdev.specdata1, |
| attr->rdev.specdata2); |
| nfs3_attrcache(vp, attr, t); |
| rw_exit(&rtable[index].r_lock); |
| } |
| |
| return (vp); |
| } |
| |
| /* |
| * Read this comment before making changes to rtablehash()! |
| * This is a hash function in which seemingly obvious and harmless |
| * changes can cause escalations costing millions of dollars! |
| * Know what you are doing. |
| * |
| * rtablehash() implements Jenkins' one-at-a-time hash algorithm. The |
| * algorithm is currently detailed here: |
| * |
| * http://burtleburtle.net/bob/hash/doobs.html |
| * |
| * Of course, the above link may not be valid by the time you are reading |
| * this, but suffice it to say that the one-at-a-time algorithm works well in |
| * almost all cases. If you are changing the algorithm be sure to verify that |
| * the hash algorithm still provides even distribution in all cases and with |
| * any server returning filehandles in whatever order (sequential or random). |
| */ |
| static int |
| rtablehash(nfs_fhandle *fh) |
| { |
| ulong_t hash, len, i; |
| char *key; |
| |
| key = fh->fh_buf; |
| len = (ulong_t)fh->fh_len; |
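| /* mix in one byte of the filehandle at a time */ |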
| for (hash = 0, i = 0; i < len; i++) { |
| hash += key[i]; |
| hash += (hash << 10); |
| hash ^= (hash >> 6); |
| } |
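| /* final mixing to spread the accumulated bits across the word */ |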
| hash += (hash << 3); |
| hash ^= (hash >> 11); |
| hash += (hash << 15); |
| return (hash & rtablemask); |
| } |
| |
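| /* |
| * Common code for makenfsnode() and makenfs3node*(): look up the |
| * rnode for the given filehandle in the hash bucket rhtp, or create |
| * one if none exists, recycling an rnode from the freelist once the |
| * allocation limit (nrnode) has been reached.  Entered with the |
| * bucket lock held for reading; the lock may be dropped and |
| * reacquired, and is taken as a writer in order to add a new rnode. |
| * *newnode is set to indicate whether a new rnode was created. |
| */ |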
| static vnode_t * |
| make_rnode(nfs_fhandle *fh, rhashq_t *rhtp, struct vfs *vfsp, |
| struct vnodeops *vops, |
| int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *, int, cred_t *), |
| int (*compar)(const void *, const void *), |
| int *newnode, cred_t *cr, char *dnm, char *nm) |
| { |
| rnode_t *rp; |
| rnode_t *trp; |
| vnode_t *vp; |
| mntinfo_t *mi; |
| |
| ASSERT(RW_READ_HELD(&rhtp->r_lock)); |
| |
| mi = VFTOMI(vfsp); |
| start: |
| if ((rp = rfind(rhtp, fh, vfsp)) != NULL) { |
| vp = RTOV(rp); |
| nfs_set_vroot(vp); |
| *newnode = 0; |
| return (vp); |
| } |
| rw_exit(&rhtp->r_lock); |
| |
| mutex_enter(&rpfreelist_lock); |
| if (rpfreelist != NULL && rnew >= nrnode) { |
| rp = rpfreelist; |
| rp_rmfree(rp); |
| mutex_exit(&rpfreelist_lock); |
| |
| vp = RTOV(rp); |
| |
| if (rp->r_flags & RHASHED) { |
| rw_enter(&rp->r_hashq->r_lock, RW_WRITER); |
| mutex_enter(&vp->v_lock); |
| if (vp->v_count > 1) { |
| vp->v_count--; |
| mutex_exit(&vp->v_lock); |
| rw_exit(&rp->r_hashq->r_lock); |
| rw_enter(&rhtp->r_lock, RW_READER); |
| goto start; |
| } |
| mutex_exit(&vp->v_lock); |
| rp_rmhash_locked(rp); |
| rw_exit(&rp->r_hashq->r_lock); |
| } |
| |
| rinactive(rp, cr); |
| |
| mutex_enter(&vp->v_lock); |
| if (vp->v_count > 1) { |
| vp->v_count--; |
| mutex_exit(&vp->v_lock); |
| rw_enter(&rhtp->r_lock, RW_READER); |
| goto start; |
| } |
| mutex_exit(&vp->v_lock); |
| vn_invalid(vp); |
| /* |
| * destroy old locks before bzero'ing and |
| * recreating the locks below. |
| */ |
| nfs_rw_destroy(&rp->r_rwlock); |
| nfs_rw_destroy(&rp->r_lkserlock); |
| mutex_destroy(&rp->r_statelock); |
| cv_destroy(&rp->r_cv); |
| cv_destroy(&rp->r_commit.c_cv); |
| nfs_free_r_path(rp); |
| avl_destroy(&rp->r_dir); |
| /* |
| * Make sure that if the rnode is recycled, the |
| * VFS reference count is decremented properly |
| * before reuse. |
| */ |
| VFS_RELE(vp->v_vfsp); |
| vn_reinit(vp); |
| } else { |
| vnode_t *new_vp; |
| |
| mutex_exit(&rpfreelist_lock); |
| |
| rp = kmem_cache_alloc(rnode_cache, KM_SLEEP); |
| new_vp = vn_alloc(KM_SLEEP); |
| |
| atomic_add_long((ulong_t *)&rnew, 1); |
| #ifdef DEBUG |
| clstat_debug.nrnode.value.ui64++; |
| #endif |
| vp = new_vp; |
| } |
| |
| bzero(rp, sizeof (*rp)); |
| rp->r_vnode = vp; |
| nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL); |
| nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL); |
| mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL); |
| cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL); |
| cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL); |
| rp->r_fh.fh_len = fh->fh_len; |
| bcopy(fh->fh_buf, rp->r_fh.fh_buf, fh->fh_len); |
| rp->r_server = mi->mi_curr_serv; |
| if (FAILOVER_MOUNT(mi)) { |
| /* |
| * If replicated servers, stash pathnames |
| */ |
| if (dnm != NULL && nm != NULL) { |
| char *s, *p; |
| uint_t len; |
| |
| len = (uint_t)(strlen(dnm) + strlen(nm) + 2); |
| rp->r_path = kmem_alloc(len, KM_SLEEP); |
| #ifdef DEBUG |
| clstat_debug.rpath.value.ui64 += len; |
| #endif |
| s = rp->r_path; |
| for (p = dnm; *p; p++) |
| *s++ = *p; |
| *s++ = '/'; |
| for (p = nm; *p; p++) |
| *s++ = *p; |
| *s = '\0'; |
| } else { |
| /* special case for root */ |
| rp->r_path = kmem_alloc(2, KM_SLEEP); |
| #ifdef DEBUG |
| clstat_debug.rpath.value.ui64 += 2; |
| #endif |
| *rp->r_path = '.'; |
| *(rp->r_path + 1) = '\0'; |
| } |
| } |
| VFS_HOLD(vfsp); |
| rp->r_putapage = putapage; |
| rp->r_hashq = rhtp; |
| rp->r_flags = RREADDIRPLUS; |
| avl_create(&rp->r_dir, compar, sizeof (rddir_cache), |
| offsetof(rddir_cache, tree)); |
| vn_setops(vp, vops); |
| vp->v_data = (caddr_t)rp; |
| vp->v_vfsp = vfsp; |
| vp->v_type = VNON; |
| nfs_set_vroot(vp); |
| |
| /* |
| * There is a race window in which another thread may |
| * have allocated an rnode for this filehandle while no |
| * locks were held, so check the hash queue again and |
| * recover if one is found. |
| */ |
| rw_enter(&rhtp->r_lock, RW_WRITER); |
| if ((trp = rfind(rhtp, fh, vfsp)) != NULL) { |
| vp = RTOV(trp); |
| nfs_set_vroot(vp); |
| *newnode = 0; |
| rw_exit(&rhtp->r_lock); |
| rp_addfree(rp, cr); |
| rw_enter(&rhtp->r_lock, RW_READER); |
| return (vp); |
| } |
| rp_addhash(rp); |
| *newnode = 1; |
| return (vp); |
| } |
| |
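| /* |
| * Mark the vnode with VROOT if its filehandle matches the filehandle |
| * of the root of the server's exported filesystem. |
| */ |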
| static void |
| nfs_set_vroot(vnode_t *vp) |
| { |
| rnode_t *rp; |
| nfs_fhandle *rootfh; |
| |
| rp = VTOR(vp); |
| rootfh = &rp->r_server->sv_fhandle; |
| if (rootfh->fh_len == rp->r_fh.fh_len && |
| bcmp(rootfh->fh_buf, rp->r_fh.fh_buf, rp->r_fh.fh_len) == 0) { |
| if (!(vp->v_flag & VROOT)) { |
| mutex_enter(&vp->v_lock); |
| vp->v_flag |= VROOT; |
| mutex_exit(&vp->v_lock); |
| } |
| } |
| } |
| |
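| /* |
| * Free the pathname stashed in the rnode for failover mounts and |
| * update the debug statistics accordingly. |
| */ |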
| static void |
| nfs_free_r_path(rnode_t *rp) |
| { |
| char *path; |
| size_t len; |
| |
| path = rp->r_path; |
| if (path) { |
| rp->r_path = NULL; |
| len = strlen(path) + 1; |
| kmem_free(path, len); |
| #ifdef DEBUG |
| clstat_debug.rpath.value.ui64 -= len; |
| #endif |
| } |
| } |
| |
| /* |
| * Put an rnode on the free list. |
| * |
| * Rnodes which were allocated above and beyond the normal limit |
| * are immediately freed. |
| */ |
| void |
| rp_addfree(rnode_t *rp, cred_t *cr) |
| { |
| vnode_t *vp; |
| struct vfs *vfsp; |
| |
| vp = RTOV(rp); |
| ASSERT(vp->v_count >= 1); |
| ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL); |
| |
| /* |
| * If we have too many rnodes allocated and there are no |
| * references to this rnode, or if the rnode is no longer |
| * accessible because it does not reside in the hash queues, |
| * or if an i/o error occurred while writing to the file, |
| * then just free it instead of putting it on the rnode |
| * freelist. |
| */ |
| vfsp = vp->v_vfsp; |
| if (((rnew > nrnode || !(rp->r_flags & RHASHED) || rp->r_error || |
| (vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) { |
| if (rp->r_flags & RHASHED) { |
| rw_enter(&rp->r_hashq->r_lock, RW_WRITER); |
| mutex_enter(&vp->v_lock); |
| if (vp->v_count > 1) { |
| vp->v_count--; |
| mutex_exit(&vp->v_lock); |
| rw_exit(&rp->r_hashq->r_lock); |
| return; |
| } |
| mutex_exit(&vp->v_lock); |
| rp_rmhash_locked(rp); |
| rw_exit(&rp->r_hashq->r_lock); |
| } |
| |
| rinactive(rp, cr); |
| |
| /* |
| * Recheck the vnode reference count.  We need to |
| * make sure that another reference has not been |
| * acquired while we were not holding v_lock.  The |
| * rnode is not in the rnode hash queues, so the |
| * only way for a reference to have been acquired |
| * is via a VOP_PUTPAGE, either because the rnode |
| * was marked with RDIRTY or because of a modified |
| * page.  This reference may have been acquired |
| * before our call to rinactive.  The i/o may have |
| * been completed, thus allowing rinactive to |
| * complete, but the reference to the vnode may |
| * not have been released yet.  In any case, the |
| * rnode cannot be destroyed until the other |
| * references to this vnode have been released. |
| * The other references will take care of either |
| * destroying the rnode or placing it on the rnode |
| * freelist.  If there are no other references, |
| * then the rnode may be safely destroyed. |
| */ |
| mutex_enter(&vp->v_lock); |
| if (vp->v_count > 1) { |
| vp->v_count--; |
| mutex_exit(&vp->v_lock); |
| return; |
| } |
| mutex_exit(&vp->v_lock); |
| |
| destroy_rnode(rp); |
| return; |
| } |
| |
| /* |
| * Lock the hash queue and then recheck the reference |
| * count to ensure that no other thread has acquired a |
| * reference in the meantime; such a reference would mean |
| * that the rnode must not be placed on the freelist.  If |
| * another reference has been acquired, then just release |
| * this one and let the other thread complete the |
| * processing of adding this rnode to the freelist. |
| */ |
| rw_enter(&rp->r_hashq->r_lock, RW_WRITER); |
| |
| mutex_enter(&vp->v_lock); |
| if (vp->v_count > 1) { |
| vp->v_count--; |
| mutex_exit(&vp->v_lock); |
| rw_exit(&rp->r_hashq->r_lock); |
| return; |
| } |
| mutex_exit(&vp->v_lock); |
| |
| /* |
| * If there is no cached data or metadata for this file, then |
| * put the rnode on the front of the freelist so that it will |
| * be reused before other rnodes which may have cached data or |
| * metadata associated with them. |
| */ |
| mutex_enter(&rpfreelist_lock); |
| if (rpfreelist == NULL) { |
| rp->r_freef = rp; |
| rp->r_freeb = rp; |
| rpfreelist = rp; |
| } else { |
| rp->r_freef = rpfreelist; |
| rp->r_freeb = rpfreelist->r_freeb; |
| rpfreelist->r_freeb->r_freef = rp; |
| rpfreelist->r_freeb = rp; |
| if (!vn_has_cached_data(vp) && |
| !HAVE_RDDIR_CACHE(rp) && |
| rp->r_symlink.contents == NULL && |
| rp->r_secattr == NULL && |
| rp->r_pathconf == NULL) |
| rpfreelist = rp; |
| } |
| mutex_exit(&rpfreelist_lock); |
| |
| rw_exit(&rp->r_hashq->r_lock); |
| } |
| |
| /* |
| * Remove an rnode from the free list. |
| * |
| * The caller must be holding rpfreelist_lock and the rnode |
| * must be on the freelist. |
| */ |
| static void |
| rp_rmfree(rnode_t *rp) |
| { |
| |
| ASSERT(MUTEX_HELD(&rpfreelist_lock)); |
| ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL); |
| |
| if (rp == rpfreelist) { |
| rpfreelist = rp->r_freef; |
| if (rp == rpfreelist) |
| rpfreelist = NULL; |
| } |
| |
| rp->r_freeb->r_freef = rp->r_freef; |
| rp->r_freef->r_freeb = rp->r_freeb; |
| |
| rp->r_freef = rp->r_freeb = NULL; |
| } |
| |
| /* |
| * Put an rnode in the hash table. |
| * |
| * The caller must be holding the exclusive hash queue lock. |
| */ |
| static void |
| rp_addhash(rnode_t *rp) |
| { |
| |
| ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock)); |
| ASSERT(!(rp->r_flags & RHASHED)); |
| |
| rp->r_hashf = rp->r_hashq->r_hashf; |
| rp-> |