/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*
* Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
* All rights reserved.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cred_impl.h>
#include <sys/proc.h>
#include <sys/user.h>
#include <sys/time.h>
#include <sys/buf.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <sys/tiuser.h>
#include <sys/swap.h>
#include <sys/errno.h>
#include <sys/debug.h>
#include <sys/kmem.h>
#include <sys/kstat.h>
#include <sys/cmn_err.h>
#include <sys/vtrace.h>
#include <sys/session.h>
#include <sys/dnlc.h>
#include <sys/bitmap.h>
#include <sys/acl.h>
#include <sys/ddi.h>
#include <sys/pathname.h>
#include <sys/flock.h>
#include <sys/dirent.h>
#include <sys/callb.h>
#include <sys/atomic.h>
#include <sys/list.h>
#include <sys/tsol/tnet.h>
#include <sys/priv.h>
#include <inet/ip6.h>
#include <rpc/types.h>
#include <rpc/xdr.h>
#include <rpc/auth.h>
#include <rpc/clnt.h>
#include <nfs/nfs.h>
#include <nfs/nfs4.h>
#include <nfs/nfs_clnt.h>
#include <nfs/rnode.h>
#include <nfs/nfs_acl.h>
/*
* The hash queues for access to active and cached rnodes
* are organized as doubly linked lists. A reader/writer lock
* for each hash bucket is used to control access and to synchronize
* lookups, additions, and deletions from the hash queue.
*
* The rnode freelist is organized as a doubly linked list with
* a head pointer. Additions and deletions are synchronized via
* a single mutex.
*
* In order to add an rnode to the free list, it must be hashed into
* a hash queue and the exclusive lock to the hash queue must be held.
* If an rnode is not hashed into a hash queue, then it is destroyed
* because it represents no valuable information that can be reused
* about the file. The exclusive lock to the hash queue must be
* held in order to prevent a lookup in the hash queue from finding
* the rnode and using it and assuming that the rnode is not on the
* freelist. The lookup in the hash queue will have the hash queue
* locked, either exclusive or shared.
*
* The vnode reference count for each rnode is not allowed to drop
* below 1. This prevents external entities, such as the VM
* subsystem, from acquiring references to vnodes already on the
* freelist and then trying to place them back on the freelist
* when their reference is released. This means that when an
* rnode is looked up in the hash queues, either the rnode
* is removed from the freelist and that reference is transferred to
* the new reference, or the vnode reference count must be incremented
* accordingly. The mutex for the freelist must be held in order to
* accurately test to see if the rnode is on the freelist or not.
* The hash queue lock might be held shared and it is possible that
* two different threads may race to remove the rnode from the
* freelist. This race can be resolved by holding the mutex for the
* freelist. Please note that the mutex for the freelist does not
* need to be held if the rnode is not on the freelist. It cannot be
* placed on the freelist due to the requirement that the thread
* putting the rnode on the freelist must hold the exclusive lock
* to the hash queue and the thread doing the lookup in the hash
* queue is holding either a shared or exclusive lock to the hash
* queue.
*
* The lock ordering is:
*
* hash bucket lock -> vnode lock
* hash bucket lock -> freelist lock
*/
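/*
 * A minimal sketch (for illustration only, not a code path below) of
 * the discipline described above, assuming the rnode of interest has
 * already been found in its hash bucket with that bucket's lock held:
 *
 * mutex_enter(&rpfreelist_lock);
 * if (rp is on the freelist)
 * rp_rmfree(rp); (reuse the freelist's hold)
 * else
 * VN_HOLD(RTOV(rp)); (otherwise take a new hold)
 * mutex_exit(&rpfreelist_lock);
 *
 * The hash bucket lock is dropped only after the freelist lock has
 * been released, preserving the lock ordering above.
 */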
static rhashq_t *rtable;
static kmutex_t rpfreelist_lock;
static rnode_t *rpfreelist = NULL;
static long rnew = 0;
long nrnode = 0;
static int rtablesize;
static int rtablemask;
static int hashlen = 4;
static struct kmem_cache *rnode_cache;
/*
* Mutex to protect the following variables:
* nfs_major
* nfs_minor
*/
kmutex_t nfs_minor_lock;
int nfs_major;
int nfs_minor;
/* Do we allow preepoch (negative) time values otw? */
bool_t nfs_allow_preepoch_time = FALSE; /* default: do not allow preepoch */
/*
* Access cache
*/
static acache_hash_t *acache;
static long nacache; /* used strictly to size the number of hash queues */
static int acachesize;
static int acachemask;
static struct kmem_cache *acache_cache;
/*
* Client side utilities
*/
/*
* client side statistics
*/
static const struct clstat clstat_tmpl = {
{ "calls", KSTAT_DATA_UINT64 },
{ "badcalls", KSTAT_DATA_UINT64 },
{ "clgets", KSTAT_DATA_UINT64 },
{ "cltoomany", KSTAT_DATA_UINT64 },
#ifdef DEBUG
{ "clalloc", KSTAT_DATA_UINT64 },
{ "noresponse", KSTAT_DATA_UINT64 },
{ "failover", KSTAT_DATA_UINT64 },
{ "remap", KSTAT_DATA_UINT64 },
#endif
};
/*
* The following are statistics that describe behavior of the system as a whole
* and don't correspond to any one particular zone.
*/
#ifdef DEBUG
static struct clstat_debug {
kstat_named_t nrnode; /* number of allocated rnodes */
kstat_named_t access; /* size of access cache */
kstat_named_t dirent; /* size of readdir cache */
kstat_named_t dirents; /* size of readdir buf cache */
kstat_named_t reclaim; /* number of reclaims */
kstat_named_t clreclaim; /* number of cl reclaims */
kstat_named_t f_reclaim; /* number of free reclaims */
kstat_named_t a_reclaim; /* number of active reclaims */
kstat_named_t r_reclaim; /* number of rnode reclaims */
kstat_named_t rpath; /* bytes used to store rpaths */
} clstat_debug = {
{ "nrnode", KSTAT_DATA_UINT64 },
{ "access", KSTAT_DATA_UINT64 },
{ "dirent", KSTAT_DATA_UINT64 },
{ "dirents", KSTAT_DATA_UINT64 },
{ "reclaim", KSTAT_DATA_UINT64 },
{ "clreclaim", KSTAT_DATA_UINT64 },
{ "f_reclaim", KSTAT_DATA_UINT64 },
{ "a_reclaim", KSTAT_DATA_UINT64 },
{ "r_reclaim", KSTAT_DATA_UINT64 },
{ "r_path", KSTAT_DATA_UINT64 },
};
#endif /* DEBUG */
/*
* We keep a global list of per-zone client data, so we can clean up all zones
* if we get low on memory.
*/
static list_t nfs_clnt_list;
static kmutex_t nfs_clnt_list_lock;
static zone_key_t nfsclnt_zone_key;
static struct kmem_cache *chtab_cache;
/*
* Some servers do not properly update the attributes of the
* directory when changes are made. To allow interoperability
* with these broken servers, the nfs_disable_rddir_cache
* parameter must be set in /etc/system
*/
int nfs_disable_rddir_cache = 0;
int clget(clinfo_t *, servinfo_t *, cred_t *, CLIENT **,
struct chtab **);
void clfree(CLIENT *, struct chtab *);
static int acl_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
struct chtab **, struct nfs_clnt *);
static int nfs_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
struct chtab **, struct nfs_clnt *);
static void clreclaim(void *);
static int nfs_feedback(int, int, mntinfo_t *);
static int rfscall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
caddr_t, cred_t *, int *, enum clnt_stat *, int,
failinfo_t *);
static int aclcall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
caddr_t, cred_t *, int *, int, failinfo_t *);
static void rinactive(rnode_t *, cred_t *);
static int rtablehash(nfs_fhandle *);
static vnode_t *make_rnode(nfs_fhandle *, rhashq_t *, struct vfs *,
struct vnodeops *,
int (*)(vnode_t *, page_t *, u_offset_t *, size_t *, int,
cred_t *),
int (*)(const void *, const void *), int *, cred_t *,
char *, char *);
static void rp_rmfree(rnode_t *);
static void rp_addhash(rnode_t *);
static void rp_rmhash_locked(rnode_t *);
static rnode_t *rfind(rhashq_t *, nfs_fhandle *, struct vfs *);
static void destroy_rnode(rnode_t *);
static void rddir_cache_free(rddir_cache *);
static int nfs_free_data_reclaim(rnode_t *);
static int nfs_active_data_reclaim(rnode_t *);
static int nfs_free_reclaim(void);
static int nfs_active_reclaim(void);
static int nfs_rnode_reclaim(void);
static void nfs_reclaim(void *);
static int failover_safe(failinfo_t *);
static void failover_newserver(mntinfo_t *mi);
static void failover_thread(mntinfo_t *mi);
static int failover_wait(mntinfo_t *);
static int failover_remap(failinfo_t *);
static int failover_lookup(char *, vnode_t *,
int (*)(vnode_t *, char *, vnode_t **,
struct pathname *, int, vnode_t *, cred_t *, int),
int (*)(vnode_t *, vnode_t **, bool_t, cred_t *, int),
vnode_t **);
static void nfs_free_r_path(rnode_t *);
static void nfs_set_vroot(vnode_t *);
static char *nfs_getsrvnames(mntinfo_t *, size_t *);
/*
* from rpcsec module (common/rpcsec)
*/
extern int sec_clnt_geth(CLIENT *, struct sec_data *, cred_t *, AUTH **);
extern void sec_clnt_freeh(AUTH *);
extern void sec_clnt_freeinfo(struct sec_data *);
/*
* used in mount policy
*/
extern ts_label_t *getflabel_cipso(vfs_t *);
/*
* EIO or EINTR are not recoverable errors.
*/
#define IS_RECOVERABLE_ERROR(error) !((error == EINTR) || (error == EIO))
/*
* Common handle get program for NFS, NFS ACL, and NFS AUTH client.
*/
static int
clget_impl(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
struct chtab **chp, struct nfs_clnt *nfscl)
{
struct chhead *ch, *newch;
struct chhead **plistp;
struct chtab *cp;
int error;
k_sigset_t smask;
if (newcl == NULL || chp == NULL || ci == NULL)
return (EINVAL);
*newcl = NULL;
*chp = NULL;
/*
* Find an unused handle or create one
*/
newch = NULL;
nfscl->nfscl_stat.clgets.value.ui64++;
top:
/*
* Find the correct entry in the cache to check for free
* client handles. The search is based on the RPC program
* number, program version number, dev_t for the transport
* device, and the protocol family.
*/
mutex_enter(&nfscl->nfscl_chtable_lock);
plistp = &nfscl->nfscl_chtable;
for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) {
if (ch->ch_prog == ci->cl_prog &&
ch->ch_vers == ci->cl_vers &&
ch->ch_dev == svp->sv_knconf->knc_rdev &&
(strcmp(ch->ch_protofmly,
svp->sv_knconf->knc_protofmly) == 0))
break;
plistp = &ch->ch_next;
}
/*
* If we didn't find a cache entry for this quadruple, then
* create one. If we don't have one already preallocated,
* then drop the cache lock, create one, and then start over.
* If we did have a preallocated entry, then just add it to
* the front of the list.
*/
if (ch == NULL) {
if (newch == NULL) {
mutex_exit(&nfscl->nfscl_chtable_lock);
newch = kmem_alloc(sizeof (*newch), KM_SLEEP);
newch->ch_timesused = 0;
newch->ch_prog = ci->cl_prog;
newch->ch_vers = ci->cl_vers;
newch->ch_dev = svp->sv_knconf->knc_rdev;
newch->ch_protofmly = kmem_alloc(
strlen(svp->sv_knconf->knc_protofmly) + 1,
KM_SLEEP);
(void) strcpy(newch->ch_protofmly,
svp->sv_knconf->knc_protofmly);
newch->ch_list = NULL;
goto top;
}
ch = newch;
newch = NULL;
ch->ch_next = nfscl->nfscl_chtable;
nfscl->nfscl_chtable = ch;
/*
* We found a cache entry, but if it isn't on the front of the
* list, then move it to the front of the list to try to take
* advantage of locality of operations.
*/
} else if (ch != nfscl->nfscl_chtable) {
*plistp = ch->ch_next;
ch->ch_next = nfscl->nfscl_chtable;
nfscl->nfscl_chtable = ch;
}
/*
* If there was a free client handle cached, then remove it
* from the list, init it, and use it.
*/
if (ch->ch_list != NULL) {
cp = ch->ch_list;
ch->ch_list = cp->ch_list;
mutex_exit(&nfscl->nfscl_chtable_lock);
if (newch != NULL) {
kmem_free(newch->ch_protofmly,
strlen(newch->ch_protofmly) + 1);
kmem_free(newch, sizeof (*newch));
}
(void) clnt_tli_kinit(cp->ch_client, svp->sv_knconf,
&svp->sv_addr, ci->cl_readsize, ci->cl_retrans, cr);
error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr,
&cp->ch_client->cl_auth);
if (error || cp->ch_client->cl_auth == NULL) {
CLNT_DESTROY(cp->ch_client);
kmem_cache_free(chtab_cache, cp);
return ((error != 0) ? error : EINTR);
}
ch->ch_timesused++;
*newcl = cp->ch_client;
*chp = cp;
return (0);
}
/*
* There weren't any free client handles which fit, so allocate
* a new one and use that.
*/
#ifdef DEBUG
atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, 1);
#endif
mutex_exit(&nfscl->nfscl_chtable_lock);
nfscl->nfscl_stat.cltoomany.value.ui64++;
if (newch != NULL) {
kmem_free(newch->ch_protofmly, strlen(newch->ch_protofmly) + 1);
kmem_free(newch, sizeof (*newch));
}
cp = kmem_cache_alloc(chtab_cache, KM_SLEEP);
cp->ch_head = ch;
sigintr(&smask, (int)ci->cl_flags & MI_INT);
error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, ci->cl_prog,
ci->cl_vers, ci->cl_readsize, ci->cl_retrans, cr, &cp->ch_client);
sigunintr(&smask);
if (error != 0) {
kmem_cache_free(chtab_cache, cp);
#ifdef DEBUG
atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -1);
#endif
/*
* Warning is unnecessary if error is EINTR.
*/
if (error != EINTR) {
nfs_cmn_err(error, CE_WARN,
"clget: couldn't create handle: %m\n");
}
return (error);
}
(void) CLNT_CONTROL(cp->ch_client, CLSET_PROGRESS, NULL);
auth_destroy(cp->ch_client->cl_auth);
error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr,
&cp->ch_client->cl_auth);
if (error || cp->ch_client->cl_auth == NULL) {
CLNT_DESTROY(cp->ch_client);
kmem_cache_free(chtab_cache, cp);
#ifdef DEBUG
atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -1);
#endif
return ((error != 0) ? error : EINTR);
}
ch->ch_timesused++;
*newcl = cp->ch_client;
ASSERT(cp->ch_client->cl_nosignal == FALSE);
*chp = cp;
return (0);
}
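/*
 * Get a client handle for the calling zone; a thin wrapper around
 * clget_impl() which looks up the zone's nfs_clnt data.
 */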
int
clget(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
struct chtab **chp)
{
struct nfs_clnt *nfscl;
nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
ASSERT(nfscl != NULL);
return (clget_impl(ci, svp, cr, newcl, chp, nfscl));
}
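/*
 * Get a client handle for the NFS ACL program, retrying on ETIMEDOUT
 * or ECONNRESET for hard, non-failover mounts.
 */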
static int
acl_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
struct chtab **chp, struct nfs_clnt *nfscl)
{
clinfo_t ci;
int error;
/*
* Set read buffer size to rsize
* and add room for RPC headers.
*/
ci.cl_readsize = mi->mi_tsize;
if (ci.cl_readsize != 0)
ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);
/*
* If this is a soft mount and the server is down, just try once;
* that is, do not retransmit.
*/
if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN))
ci.cl_retrans = 0;
else
ci.cl_retrans = mi->mi_retrans;
ci.cl_prog = NFS_ACL_PROGRAM;
ci.cl_vers = mi->mi_vers;
ci.cl_flags = mi->mi_flags;
/*
* clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS
* security flavor, the client tries to establish a security context
* by contacting the server. If the connection is timed out or reset,
* e.g. server reboot, we will try again.
*/
do {
error = clget_impl(&ci, svp, cr, newcl, chp, nfscl);
if (error == 0)
break;
/*
* For forced unmount or zone shutdown, bail out, no retry.
*/
if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
error = EIO;
break;
}
/* do not retry for softmount */
if (!(mi->mi_flags & MI_HARD))
break;
/* let the caller deal with the failover case */
if (FAILOVER_MOUNT(mi))
break;
} while (error == ETIMEDOUT || error == ECONNRESET);
return (error);
}
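/*
 * Get a client handle for the NFS program itself; structured the same
 * way as acl_clget() above.
 */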
static int
nfs_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
struct chtab **chp, struct nfs_clnt *nfscl)
{
clinfo_t ci;
int error;
/*
* Set read buffer size to rsize
* and add room for RPC headers.
*/
ci.cl_readsize = mi->mi_tsize;
if (ci.cl_readsize != 0)
ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);
/*
* If this is a soft mount and the server is down, just try once;
* that is, do not retransmit.
*/
if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN))
ci.cl_retrans = 0;
else
ci.cl_retrans = mi->mi_retrans;
ci.cl_prog = mi->mi_prog;
ci.cl_vers = mi->mi_vers;
ci.cl_flags = mi->mi_flags;
/*
* clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS
* security flavor, the client tries to establish a security context
* by contacting the server. If the connection is timed out or reset,
* e.g. server reboot, we will try again.
*/
do {
error = clget_impl(&ci, svp, cr, newcl, chp, nfscl);
if (error == 0)
break;
/*
* For forced unmount or zone shutdown, bail out, no retry.
*/
if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
error = EIO;
break;
}
/* do not retry for softmount */
if (!(mi->mi_flags & MI_HARD))
break;
/* let the caller deal with the failover case */
if (FAILOVER_MOUNT(mi))
break;
} while (error == ETIMEDOUT || error == ECONNRESET);
return (error);
}
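/*
 * Release a client handle: free its auth handle, timestamp it, and
 * return it to the front of its cache entry's free list.
 */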
static void
clfree_impl(CLIENT *cl, struct chtab *cp, struct nfs_clnt *nfscl)
{
if (cl->cl_auth != NULL) {
sec_clnt_freeh(cl->cl_auth);
cl->cl_auth = NULL;
}
/*
* Timestamp this cache entry so that we know when it was last
* used.
*/
cp->ch_freed = gethrestime_sec();
/*
* Add the free client handle to the front of the list.
* This way, the list will be sorted in youngest to oldest
* order.
*/
mutex_enter(&nfscl->nfscl_chtable_lock);
cp->ch_list = cp->ch_head->ch_list;
cp->ch_head->ch_list = cp;
mutex_exit(&nfscl->nfscl_chtable_lock);
}
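/*
 * Release a client handle for the calling zone; a thin wrapper around
 * clfree_impl().
 */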
void
clfree(CLIENT *cl, struct chtab *cp)
{
struct nfs_clnt *nfscl;
nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
ASSERT(nfscl != NULL);
clfree_impl(cl, cp, nfscl);
}
#define CL_HOLDTIME 60 /* time to hold client handles */
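/*
 * Destroy any cached client handles in the given zone which have not
 * been used for at least cl_holdtime seconds.
 */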
static void
clreclaim_zone(struct nfs_clnt *nfscl, uint_t cl_holdtime)
{
struct chhead *ch;
struct chtab *cp; /* list of objects that can be reclaimed */
struct chtab *cpe;
struct chtab *cpl;
struct chtab **cpp;
#ifdef DEBUG
int n = 0;
#endif
/*
* Need to reclaim some memory, so step through the cache
* looking through the lists for entries which can be freed.
*/
cp = NULL;
mutex_enter(&nfscl->nfscl_chtable_lock);
/*
* Here we step through each non-NULL quadruple and start to
* construct the reclaim list pointed to by cp. Note that
* cp will contain all eligible chtab entries. When this traversal
* completes, chtab entries from the last quadruple will be at the
* front of cp and entries from previously inspected quadruples have
* been appended to the rear of cp.
*/
for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) {
if (ch->ch_list == NULL)
continue;
/*
* Search each list for entries older than
* cl_holdtime seconds. The lists are maintained
* in youngest to oldest order so that when the
* first entry is found which is old enough, then
* all of the rest of the entries on the list will
* be old enough as well.
*/
cpl = ch->ch_list;
cpp = &ch->ch_list;
while (cpl != NULL &&
cpl->ch_freed + cl_holdtime > gethrestime_sec()) {
cpp = &cpl->ch_list;
cpl = cpl->ch_list;
}
if (cpl != NULL) {
*cpp = NULL;
if (cp != NULL) {
cpe = cpl;
while (cpe->ch_list != NULL)
cpe = cpe->ch_list;
cpe->ch_list = cp;
}
cp = cpl;
}
}
mutex_exit(&nfscl->nfscl_chtable_lock);
/*
* If cp is empty, then there is nothing to reclaim here.
*/
if (cp == NULL)
return;
/*
* Step through the list of entries to free, destroying each client
* handle and kmem_free'ing the memory for each entry.
*/
while (cp != NULL) {
#ifdef DEBUG
n++;
#endif
CLNT_DESTROY(cp->ch_client);
cpl = cp->ch_list;
kmem_cache_free(chtab_cache, cp);
cp = cpl;
}
#ifdef DEBUG
/*
* Update clalloc so that nfsstat shows the current number
* of allocated client handles.
*/
atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -n);
#endif
}
/* ARGSUSED */
static void
clreclaim(void *all)
{
struct nfs_clnt *nfscl;
#ifdef DEBUG
clstat_debug.clreclaim.value.ui64++;
#endif
/*
* The system is low on memory; go through and try to reclaim some from
* every zone on the system.
*/
mutex_enter(&nfs_clnt_list_lock);
nfscl = list_head(&nfs_clnt_list);
for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl))
clreclaim_zone(nfscl, CL_HOLDTIME);
mutex_exit(&nfs_clnt_list_lock);
}
/*
* Minimum time-out values indexed by call type
* These units are in "eighths" of a second to avoid multiplies
*/
static unsigned int minimum_timeo[] = {
6, 7, 10
};
/*
* Back off for retransmission timeout; MAXTIMO is in units of clock ticks (hz)
*/
#define MAXTIMO (20*hz)
#define backoff(tim) (((tim) < MAXTIMO) ? dobackoff(tim) : (tim))
#define dobackoff(tim) ((((tim) << 1) > MAXTIMO) ? MAXTIMO : ((tim) << 1))
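/*
 * For example, assuming hz = 100, a timeout which starts at one second
 * (100 ticks) backs off through 200, 400, 800 and 1600 ticks and is
 * then clamped at MAXTIMO (2000 ticks, i.e. 20 seconds).
 */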
#define MIN_NFS_TSIZE 512 /* minimum "chunk" of NFS IO */
#define REDUCE_NFS_TIME (hz/2) /* rtxcur we try to keep under */
#define INCREASE_NFS_TIME (hz/3*8) /* srtt we try to keep under (scaled*8) */
/*
* Function called when rfscall notices that we have been
* re-transmitting, or when we get a response without retransmissions.
* Return 1 if the transfer size was adjusted down - 0 if no change.
*/
static int
nfs_feedback(int flag, int which, mntinfo_t *mi)
{
int kind;
int r = 0;
mutex_enter(&mi->mi_lock);
if (flag == FEEDBACK_REXMIT1) {
if (mi->mi_timers[NFS_CALLTYPES].rt_rtxcur != 0 &&
mi->mi_timers[NFS_CALLTYPES].rt_rtxcur < REDUCE_NFS_TIME)
goto done;
if (mi->mi_curread > MIN_NFS_TSIZE) {
mi->mi_curread /= 2;
if (mi->mi_curread < MIN_NFS_TSIZE)
mi->mi_curread = MIN_NFS_TSIZE;
r = 1;
}
if (mi->mi_curwrite > MIN_NFS_TSIZE) {
mi->mi_curwrite /= 2;
if (mi->mi_curwrite < MIN_NFS_TSIZE)
mi->mi_curwrite = MIN_NFS_TSIZE;
r = 1;
}
} else if (flag == FEEDBACK_OK) {
kind = mi->mi_timer_type[which];
if (kind == 0 ||
mi->mi_timers[kind].rt_srtt >= INCREASE_NFS_TIME)
goto done;
if (kind == 1) {
if (mi->mi_curread >= mi->mi_tsize)
goto done;
mi->mi_curread += MIN_NFS_TSIZE;
if (mi->mi_curread > mi->mi_tsize/2)
mi->mi_curread = mi->mi_tsize;
} else if (kind == 2) {
if (mi->mi_curwrite >= mi->mi_stsize)
goto done;
mi->mi_curwrite += MIN_NFS_TSIZE;
if (mi->mi_curwrite > mi->mi_stsize/2)
mi->mi_curwrite = mi->mi_stsize;
}
}
done:
mutex_exit(&mi->mi_lock);
return (r);
}
#ifdef DEBUG
static int rfs2call_hits = 0;
static int rfs2call_misses = 0;
#endif
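/*
 * Issue an NFS Version 2 RPC. If the server returns NFSERR_ACCES, the
 * call is retried once with the credential adjusted by crnetadjust().
 * RPC_PROCUNAVAIL from the server is mapped to NFSERR_OPNOTSUPP.
 */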
int
rfs2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
enum nfsstat *statusp, int flags, failinfo_t *fi)
{
int rpcerror;
enum clnt_stat rpc_status;
ASSERT(statusp != NULL);
rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp,
cr, douprintf, &rpc_status, flags, fi);
if (!rpcerror) {
/*
* See crnetadjust() for comments.
*/
if (*statusp == NFSERR_ACCES &&
(cr = crnetadjust(cr)) != NULL) {
#ifdef DEBUG
rfs2call_hits++;
#endif
rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres,
resp, cr, douprintf, NULL, flags, fi);
crfree(cr);
#ifdef DEBUG
if (*statusp == NFSERR_ACCES)
rfs2call_misses++;
#endif
}
} else if (rpc_status == RPC_PROCUNAVAIL) {
*statusp = NFSERR_OPNOTSUPP;
rpcerror = 0;
}
return (rpcerror);
}
#define NFS3_JUKEBOX_DELAY 10 * hz
static clock_t nfs3_jukebox_delay = 0;
#ifdef DEBUG
static int rfs3call_hits = 0;
static int rfs3call_misses = 0;
#endif
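/*
 * Issue an NFS Version 3 RPC. NFS3ERR_JUKEBOX replies are retried
 * after a delay (with a one-time message to the user), and
 * NFS3ERR_ACCES is retried once with a crnetadjust()ed credential.
 */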
int
rfs3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
nfsstat3 *statusp, int flags, failinfo_t *fi)
{
int rpcerror;
int user_informed;
user_informed = 0;
do {
rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp,
cr, douprintf, NULL, flags, fi);
if (!rpcerror) {
cred_t *crr;
if (*statusp == NFS3ERR_JUKEBOX) {
if (ttoproc(curthread) == &p0) {
rpcerror = EAGAIN;
break;
}
if (!user_informed) {
user_informed = 1;
uprintf(
"file temporarily unavailable on the server, retrying...\n");
}
delay(nfs3_jukebox_delay);
}
/*
* See crnetadjust() for comments.
*/
else if (*statusp == NFS3ERR_ACCES &&
(crr = crnetadjust(cr)) != NULL) {
#ifdef DEBUG
rfs3call_hits++;
#endif
rpcerror = rfscall(mi, which, xdrargs, argsp,
xdrres, resp, crr, douprintf,
NULL, flags, fi);
crfree(crr);
#ifdef DEBUG
if (*statusp == NFS3ERR_ACCES)
rfs3call_misses++;
#endif
}
}
} while (!rpcerror && *statusp == NFS3ERR_JUKEBOX);
return (rpcerror);
}
#define VALID_FH(fi) (VTOR(fi->vp)->r_server == VTOMI(fi->vp)->mi_curr_serv)
#define INC_READERS(mi) { \
mi->mi_readers++; \
}
#define DEC_READERS(mi) { \
mi->mi_readers--; \
if (mi->mi_readers == 0) \
cv_broadcast(&mi->mi_failover_cv); \
}
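/*
 * Common RPC engine for the NFS program: obtains a client handle,
 * issues CLNT_CALL() with timeout backoff and retry for hard mounts,
 * and drives client failover and dynamic transfer size feedback.
 */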
static int
rfscall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf,
enum clnt_stat *rpc_status, int flags, failinfo_t *fi)
{
CLIENT *client;
struct chtab *ch;
cred_t *cr = icr;
enum clnt_stat status;
struct rpc_err rpcerr;
struct timeval wait;
int timeo; /* in units of hz */
int my_rsize, my_wsize;
bool_t tryagain;
bool_t cred_cloned = FALSE;
k_sigset_t smask;
servinfo_t *svp;
struct nfs_clnt *nfscl;
zoneid_t zoneid = getzoneid();
#ifdef DEBUG
char *bufp;
#endif
TRACE_2(TR_FAC_NFS, TR_RFSCALL_START,
"rfscall_start:which %d mi %p", which, mi);
nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
ASSERT(nfscl != NULL);
nfscl->nfscl_stat.calls.value.ui64++;
mi->mi_reqs[which].value.ui64++;
rpcerr.re_status = RPC_SUCCESS;
/*
* In case of forced unmount or zone shutdown, return EIO.
*/
if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
rpcerr.re_status = RPC_FAILED;
rpcerr.re_errno = EIO;
return (rpcerr.re_errno);
}
/*
* Remember the transfer sizes in case
* nfs_feedback changes them underneath us.
*/
my_rsize = mi->mi_curread;
my_wsize = mi->mi_curwrite;
/*
* NFS client failover support
*
* If this rnode is not in sync with the current server (VALID_FH),
* we'd like to do a remap to get in sync. We can be interrupted
* in failover_remap(), and if so we'll bail. Otherwise, we'll
* use the best info we have to try the RPC. Part of that is
* unconditionally updating the filehandle copy kept for V3.
*
* Locking: INC_READERS/DEC_READERS is a poor man's interruptible
* rw_enter(); we're trying to keep the current server from being
* changed on us until we're done with the remapping and have a
* matching client handle. We don't want to send a filehandle
* to the wrong host.
*/
failoverretry:
if (FAILOVER_MOUNT(mi)) {
mutex_enter(&mi->mi_lock);
if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
if (failover_wait(mi)) {
mutex_exit(&mi->mi_lock);
return (EINTR);
}
}
INC_READERS(mi);
mutex_exit(&mi->mi_lock);
if (fi) {
if (!VALID_FH(fi) &&
!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
int remaperr;
svp = mi->mi_curr_serv;
remaperr = failover_remap(fi);
if (remaperr != 0) {
#ifdef DEBUG
if (remaperr != EINTR)
nfs_cmn_err(remaperr, CE_WARN,
"rfscall couldn't failover: %m");
#endif
mutex_enter(&mi->mi_lock);
DEC_READERS(mi);
mutex_exit(&mi->mi_lock);
/*
* If failover_remap returns ETIMEDOUT
* and the filesystem is hard mounted
* we have to retry the call with a new
* server.
*/
if ((mi->mi_flags & MI_HARD) &&
IS_RECOVERABLE_ERROR(remaperr)) {
if (svp == mi->mi_curr_serv)
failover_newserver(mi);
rpcerr.re_status = RPC_SUCCESS;
goto failoverretry;
}
rpcerr.re_errno = remaperr;
return (remaperr);
}
}
if (fi->fhp && fi->copyproc)
(*fi->copyproc)(fi->fhp, fi->vp);
}
}
/* For TSOL, use a new cred which has net_mac_aware flag */
if (!cred_cloned && is_system_labeled()) {
cred_cloned = TRUE;
cr = crdup(icr);
(void) setpflags(NET_MAC_AWARE, 1, cr);
}
/*
* clget() calls clnt_tli_kinit() which clears the xid, so we
* are guaranteed to reprocess the retry as a new request.
*/
svp = mi->mi_curr_serv;
rpcerr.re_errno = nfs_clget(mi, svp, cr, &client, &ch, nfscl);
if (FAILOVER_MOUNT(mi)) {
mutex_enter(&mi->mi_lock);
DEC_READERS(mi);
mutex_exit(&mi->mi_lock);
if ((rpcerr.re_errno == ETIMEDOUT ||
rpcerr.re_errno == ECONNRESET) &&
failover_safe(fi)) {
if (svp == mi->mi_curr_serv)
failover_newserver(mi);
goto failoverretry;
}
}
if (rpcerr.re_errno != 0)
return (rpcerr.re_errno);
if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
svp->sv_knconf->knc_semantics == NC_TPI_COTS) {
timeo = (mi->mi_timeo * hz) / 10;
} else {
mutex_enter(&mi->mi_lock);
timeo = CLNT_SETTIMERS(client,
&(mi->mi_timers[mi->mi_timer_type[which]]),
&(mi->mi_timers[NFS_CALLTYPES]),
(minimum_timeo[mi->mi_call_type[which]]*hz)>>3,
(void (*)())NULL, (caddr_t)mi, 0);
mutex_exit(&mi->mi_lock);
}
/*
* If hard mounted fs, retry call forever unless hard error occurs.
*/
do {
tryagain = FALSE;
if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
status = RPC_FAILED;
rpcerr.re_status = RPC_FAILED;
rpcerr.re_errno = EIO;
break;
}
TICK_TO_TIMEVAL(timeo, &wait);
/*
* Mask out all signals except SIGHUP, SIGINT, SIGQUIT
* and SIGTERM. (Preserving the existing masks).
* Mask out SIGINT if mount option nointr is specified.
*/
sigintr(&smask, (int)mi->mi_flags & MI_INT);
if (!(mi->mi_flags & MI_INT))
client->cl_nosignal = TRUE;
/*
* If there is a current signal, then don't bother
* even trying to send out the request because we
* won't be able to block waiting for the response.
* Simply assume RPC_INTR and get on with it.
*/
if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
status = RPC_INTR;
else {
status = CLNT_CALL(client, which, xdrargs, argsp,
xdrres, resp, wait);
}
if (!(mi->mi_flags & MI_INT))
client->cl_nosignal = FALSE;
/*
* restore original signal mask
*/
sigunintr(&smask);
switch (status) {
case RPC_SUCCESS:
if ((mi->mi_flags & MI_DYNAMIC) &&
mi->mi_timer_type[which] != 0 &&
(mi->mi_curread != my_rsize ||
mi->mi_curwrite != my_wsize))
(void) nfs_feedback(FEEDBACK_OK, which, mi);
break;
case RPC_INTR:
/*
* There is no way to recover from this error,
* even if mount option nointr is specified.
* SIGKILL, for example, cannot be blocked.
*/
rpcerr.re_status = RPC_INTR;
rpcerr.re_errno = EINTR;
break;
case RPC_UDERROR:
/*
* If the NFS server is local (vold) and
* it goes away then we get RPC_UDERROR.
* This is a retryable error, so we would
* loop; check to see if the specific
* error was ECONNRESET, indicating that the
* target did not exist at all. If so,
* return with RPC_PROGUNAVAIL and
* ECONNRESET to indicate why.
*/
CLNT_GETERR(client, &rpcerr);
if (rpcerr.re_errno == ECONNRESET) {
rpcerr.re_status = RPC_PROGUNAVAIL;
rpcerr.re_errno = ECONNRESET;
break;
}
/*FALLTHROUGH*/
default: /* probably RPC_TIMEDOUT */
if (IS_UNRECOVERABLE_RPC(status))
break;
/*
* increment server not responding count
*/
mutex_enter(&mi->mi_lock);
mi->mi_noresponse++;
mutex_exit(&mi->mi_lock);
#ifdef DEBUG
nfscl->nfscl_stat.noresponse.value.ui64++;
#endif
if (!(mi->mi_flags & MI_HARD)) {
if (!(mi->mi_flags & MI_SEMISOFT) ||
(mi->mi_ss_call_type[which] == 0))
break;
}
/*
* The call is in progress (over COTS).
* Try the CLNT_CALL again, but don't
* print a noisy error message.
*/
if (status == RPC_INPROGRESS) {
tryagain = TRUE;
break;
}
if (flags & RFSCALL_SOFT)
break;
/*
* On zone shutdown, just move on.
*/
if (zone_status_get(curproc->p_zone) >=
ZONE_IS_SHUTTING_DOWN) {
rpcerr.re_status = RPC_FAILED;
rpcerr.re_errno = EIO;
break;
}
/*
* NFS client failover support
*
* If the current server just failed us, we'll
* start the process of finding a new server.
* After that, we can just retry.
*/
if (FAILOVER_MOUNT(mi) && failover_safe(fi)) {
if (svp == mi->mi_curr_serv)
failover_newserver(mi);
clfree_impl(client, ch, nfscl);
goto failoverretry;
}
tryagain = TRUE;
timeo = backoff(timeo);
mutex_enter(&mi->mi_lock);
if (!(mi->mi_flags & MI_PRINTED)) {
mi->mi_flags |= MI_PRINTED;
mutex_exit(&mi->mi_lock);
#ifdef DEBUG
zprintf(zoneid,
"NFS%d server %s not responding still trying\n",
mi->mi_vers, svp->sv_hostname);
#else
zprintf(zoneid,
"NFS server %s not responding still trying\n",
svp->sv_hostname);
#endif
} else
mutex_exit(&mi->mi_lock);
if (*douprintf && curproc->p_sessp->s_vp != NULL) {
*douprintf = 0;
if (!(mi->mi_flags & MI_NOPRINT))
#ifdef DEBUG
uprintf(
"NFS%d server %s not responding still trying\n",
mi->mi_vers, svp->sv_hostname);
#else
uprintf(
"NFS server %s not responding still trying\n",
svp->sv_hostname);
#endif
}
/*
* If doing dynamic adjustment of transfer
* size and if it's a read or write call
* and if the transfer size changed while
* retransmitting or if the feedback routine
* changed the transfer size,
* then exit rfscall so that the transfer
* size can be adjusted at the vnops level.
*/
if ((mi->mi_flags & MI_DYNAMIC) &&
mi->mi_timer_type[which] != 0 &&
(mi->mi_curread != my_rsize ||
mi->mi_curwrite != my_wsize ||
nfs_feedback(FEEDBACK_REXMIT1, which, mi))) {
/*
* On read or write calls, return
* back to the vnode ops level if
* the transfer size changed.
*/
clfree_impl(client, ch, nfscl);
if (cred_cloned)
crfree(cr);
return (ENFS_TRYAGAIN);
}
}
} while (tryagain);
if (status != RPC_SUCCESS) {
/*
* Let soft mounts use the timed out message.
*/
if (status == RPC_INPROGRESS)
status = RPC_TIMEDOUT;
nfscl->nfscl_stat.badcalls.value.ui64++;
if (status != RPC_INTR) {
mutex_enter(&mi->mi_lock);
mi->mi_flags |= MI_DOWN;
mutex_exit(&mi->mi_lock);
CLNT_GETERR(client, &rpcerr);
#ifdef DEBUG
bufp = clnt_sperror(client, svp->sv_hostname);
zprintf(zoneid, "NFS%d %s failed for %s\n",
mi->mi_vers, mi->mi_rfsnames[which], bufp);
if (curproc->p_sessp->s_vp != NULL) {
if (!(mi->mi_flags & MI_NOPRINT)) {
uprintf("NFS%d %s failed for %s\n",
mi->mi_vers, mi->mi_rfsnames[which],
bufp);
}
}
kmem_free(bufp, MAXPATHLEN);
#else
zprintf(zoneid,
"NFS %s failed for server %s: error %d (%s)\n",
mi->mi_rfsnames[which], svp->sv_hostname,
status, clnt_sperrno(status));
if (curproc->p_sessp->s_vp != NULL) {
if (!(mi->mi_flags & MI_NOPRINT)) {
uprintf(
"NFS %s failed for server %s: error %d (%s)\n",
mi->mi_rfsnames[which],
svp->sv_hostname, status,
clnt_sperrno(status));
}
}
#endif
/*
* when CLNT_CALL() fails with RPC_AUTHERROR,
* re_errno is set appropriately depending on
* the authentication error
*/
if (status == RPC_VERSMISMATCH ||
status == RPC_PROGVERSMISMATCH)
rpcerr.re_errno = EIO;
}
} else {
/*
* Test the value of mi_down and mi_printed without
* holding the mi_lock mutex. If they are both zero,
* then it is okay to skip the down and printed
* processing. This saves on a mutex_enter and
* mutex_exit pair for a normal, successful RPC.
* This was just complete overhead.
*/
if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) {
mutex_enter(&mi->mi_lock);
mi->mi_flags &= ~MI_DOWN;
if (mi->mi_flags & MI_PRINTED) {
mi->mi_flags &= ~MI_PRINTED;
mutex_exit(&mi->mi_lock);
#ifdef DEBUG
if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
zprintf(zoneid, "NFS%d server %s ok\n",
mi->mi_vers, svp->sv_hostname);
#else
if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
zprintf(zoneid, "NFS server %s ok\n",
svp->sv_hostname);
#endif
} else
mutex_exit(&mi->mi_lock);
}
if (*douprintf == 0) {
if (!(mi->mi_flags & MI_NOPRINT))
#ifdef DEBUG
if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
uprintf("NFS%d server %s ok\n",
mi->mi_vers, svp->sv_hostname);
#else
if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
uprintf("NFS server %s ok\n", svp->sv_hostname);
#endif
*douprintf = 1;
}
}
clfree_impl(client, ch, nfscl);
if (cred_cloned)
crfree(cr);
ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);
if (rpc_status != NULL)
*rpc_status = rpcerr.re_status;
TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d",
rpcerr.re_errno);
return (rpcerr.re_errno);
}
#ifdef DEBUG
static int acl2call_hits = 0;
static int acl2call_misses = 0;
#endif
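/*
 * Issue an NFS ACL Version 2 RPC, retrying once with an adjusted
 * credential on NFSERR_ACCES, as in rfs2call().
 */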
int
acl2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
enum nfsstat *statusp, int flags, failinfo_t *fi)
{
int rpcerror;
rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp,
cr, douprintf, flags, fi);
if (!rpcerror) {
/*
* See comments with crnetadjust().
*/
if (*statusp == NFSERR_ACCES &&
(cr = crnetadjust(cr)) != NULL) {
#ifdef DEBUG
acl2call_hits++;
#endif
rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres,
resp, cr, douprintf, flags, fi);
crfree(cr);
#ifdef DEBUG
if (*statusp == NFSERR_ACCES)
acl2call_misses++;
#endif
}
}
return (rpcerror);
}
#ifdef DEBUG
static int acl3call_hits = 0;
static int acl3call_misses = 0;
#endif
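/*
 * Issue an NFS ACL Version 3 RPC, handling NFS3ERR_JUKEBOX and
 * NFS3ERR_ACCES the same way rfs3call() does.
 */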
int
acl3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
nfsstat3 *statusp, int flags, failinfo_t *fi)
{
int rpcerror;
int user_informed;
user_informed = 0;
do {
rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp,
cr, douprintf, flags, fi);
if (!rpcerror) {
cred_t *crr;
if (*statusp == NFS3ERR_JUKEBOX) {
if (!user_informed) {
user_informed = 1;
uprintf(
"file temporarily unavailable on the server, retrying...\n");
}
delay(nfs3_jukebox_delay);
}
/*
* See crnetadjust() for comments.
*/
else if (*statusp == NFS3ERR_ACCES &&
(crr = crnetadjust(cr)) != NULL) {
#ifdef DEBUG
acl3call_hits++;
#endif
rpcerror = aclcall(mi, which, xdrargs, argsp,
xdrres, resp, crr, douprintf, flags, fi);
crfree(crr);
#ifdef DEBUG
if (*statusp == NFS3ERR_ACCES)
acl3call_misses++;
#endif
}
}
} while (!rpcerror && *statusp == NFS3ERR_JUKEBOX);
return (rpcerror);
}
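/*
 * Common RPC engine for the NFS ACL program; a close analogue of
 * rfscall() above, except that dynamic transfer size feedback is not
 * yet enabled and unexpected RPC errors are used to detect servers
 * which do not support NFS_ACL or the extended attribute operations.
 */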
static int
aclcall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf,
int flags, failinfo_t *fi)
{
CLIENT *client;
struct chtab *ch;
cred_t *cr = icr;
bool_t cred_cloned = FALSE;
enum clnt_stat status;
struct rpc_err rpcerr;
struct timeval wait;
int timeo; /* in units of hz */
#if 0 /* notyet */
int my_rsize, my_wsize;
#endif
bool_t tryagain;
k_sigset_t smask;
servinfo_t *svp;
struct nfs_clnt *nfscl;
zoneid_t zoneid = getzoneid();
#ifdef DEBUG
char *bufp;
#endif
#if 0 /* notyet */
TRACE_2(TR_FAC_NFS, TR_RFSCALL_START,
"rfscall_start:which %d mi %p", which, mi);
#endif
nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
ASSERT(nfscl != NULL);
nfscl->nfscl_stat.calls.value.ui64++;
mi->mi_aclreqs[which].value.ui64++;
rpcerr.re_status = RPC_SUCCESS;
if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
rpcerr.re_status = RPC_FAILED;
rpcerr.re_errno = EIO;
return (rpcerr.re_errno);
}
#if 0 /* notyet */
/*
* Remember the transfer sizes in case
* nfs_feedback changes them underneath us.
*/
my_rsize = mi->mi_curread;
my_wsize = mi->mi_curwrite;
#endif
/*
* NFS client failover support
*
* If this rnode is not in sync with the current server (VALID_FH),
* we'd like to do a remap to get in sync. We can be interrupted
* in failover_remap(), and if so we'll bail. Otherwise, we'll
* use the best info we have to try the RPC. Part of that is
* unconditionally updating the filehandle copy kept for V3.
*
* Locking: INC_READERS/DEC_READERS is a poor man's interruptible
* rw_enter(); we're trying to keep the current server from being
* changed on us until we're done with the remapping and have a
* matching client handle. We don't want to send a filehandle
* to the wrong host.
*/
failoverretry:
if (FAILOVER_MOUNT(mi)) {
mutex_enter(&mi->mi_lock);
if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
if (failover_wait(mi)) {
mutex_exit(&mi->mi_lock);
return (EINTR);
}
}
INC_READERS(mi);
mutex_exit(&mi->mi_lock);
if (fi) {
if (!VALID_FH(fi) &&
!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
int remaperr;
svp = mi->mi_curr_serv;
remaperr = failover_remap(fi);
if (remaperr != 0) {
#ifdef DEBUG
if (remaperr != EINTR)
nfs_cmn_err(remaperr, CE_WARN,
"aclcall couldn't failover: %m");
#endif
mutex_enter(&mi->mi_lock);
DEC_READERS(mi);
mutex_exit(&mi->mi_lock);
/*
* If failover_remap returns ETIMEDOUT
* and the filesystem is hard mounted
* we have to retry the call with a new
* server.
*/
if ((mi->mi_flags & MI_HARD) &&
IS_RECOVERABLE_ERROR(remaperr)) {
if (svp == mi->mi_curr_serv)
failover_newserver(mi);
rpcerr.re_status = RPC_SUCCESS;
goto failoverretry;
}
return (remaperr);
}
}
if (fi->fhp && fi->copyproc)
(*fi->copyproc)(fi->fhp, fi->vp);
}
}
/* For TSOL, use a new cred which has net_mac_aware flag */
if (!cred_cloned && is_system_labeled()) {
cred_cloned = TRUE;
cr = crdup(icr);
(void) setpflags(NET_MAC_AWARE, 1, cr);
}
/*
* acl_clget() calls clnt_tli_kinit() which clears the xid, so we
* are guaranteed to reprocess the retry as a new request.
*/
svp = mi->mi_curr_serv;
rpcerr.re_errno = acl_clget(mi, svp, cr, &client, &ch, nfscl);
if (FAILOVER_MOUNT(mi)) {
mutex_enter(&mi->mi_lock);
DEC_READERS(mi);
mutex_exit(&mi->mi_lock);
if ((rpcerr.re_errno == ETIMEDOUT ||
rpcerr.re_errno == ECONNRESET) &&
failover_safe(fi)) {
if (svp == mi->mi_curr_serv)
failover_newserver(mi);
goto failoverretry;
}
}
if (rpcerr.re_errno != 0) {
if (cred_cloned)
crfree(cr);
return (rpcerr.re_errno);
}
if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
svp->sv_knconf->knc_semantics == NC_TPI_COTS) {
timeo = (mi->mi_timeo * hz) / 10;
} else {
mutex_enter(&mi->mi_lock);
timeo = CLNT_SETTIMERS(client,
&(mi->mi_timers[mi->mi_acl_timer_type[which]]),
&(mi->mi_timers[NFS_CALLTYPES]),
(minimum_timeo[mi->mi_acl_call_type[which]]*hz)>>3,
(void (*)()) 0, (caddr_t)mi, 0);
mutex_exit(&mi->mi_lock);
}
/*
* If hard mounted fs, retry call forever unless hard error occurs.
*/
do {
tryagain = FALSE;
if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
status = RPC_FAILED;
rpcerr.re_status = RPC_FAILED;
rpcerr.re_errno = EIO;
break;
}
TICK_TO_TIMEVAL(timeo, &wait);
/*
* Mask out all signals except SIGHUP, SIGINT, SIGQUIT
* and SIGTERM. (Preserving the existing masks).
* Mask out SIGINT if mount option nointr is specified.
*/
sigintr(&smask, (int)mi->mi_flags & MI_INT);
if (!(mi->mi_flags & MI_INT))
client->cl_nosignal = TRUE;
/*
* If there is a current signal, then don't bother
* even trying to send out the request because we
* won't be able to block waiting for the response.
* Simply assume RPC_INTR and get on with it.
*/
if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
status = RPC_INTR;
else {
status = CLNT_CALL(client, which, xdrargs, argsp,
xdrres, resp, wait);
}
if (!(mi->mi_flags & MI_INT))
client->cl_nosignal = FALSE;
/*
* restore original signal mask
*/
sigunintr(&smask);
switch (status) {
case RPC_SUCCESS:
#if 0 /* notyet */
if ((mi->mi_flags & MI_DYNAMIC) &&
mi->mi_timer_type[which] != 0 &&
(mi->mi_curread != my_rsize ||
mi->mi_curwrite != my_wsize))
(void) nfs_feedback(FEEDBACK_OK, which, mi);
#endif
break;
/*
* Unfortunately, there are servers in the world which
* are not coded correctly. They are not prepared to
* handle RPC requests to the NFS port which are not
* NFS requests. Thus, they may try to process the
* NFS_ACL request as if it were an NFS request. This
* does not work. Generally, an error will be generated
* on the client because it will not be able to decode
* the response from the server. However, it seems
* possible that the server may not be able to decode
* the arguments. Thus, the criteria for deciding
* whether the server supports NFS_ACL or not is whether
* the following RPC errors are returned from CLNT_CALL.
*/
case RPC_CANTDECODERES:
case RPC_PROGUNAVAIL:
case RPC_CANTDECODEARGS:
case RPC_PROGVERSMISMATCH:
mutex_enter(&mi->mi_lock);
mi->mi_flags &= ~(MI_ACL | MI_EXTATTR);
mutex_exit(&mi->mi_lock);
break;
/*
* If the server supports NFS_ACL but not the new ops
* for extended attributes, make sure we don't retry.
*/
case RPC_PROCUNAVAIL:
mutex_enter(&mi->mi_lock);
mi->mi_flags &= ~MI_EXTATTR;
mutex_exit(&mi->mi_lock);
break;
case RPC_INTR:
/*
* There is no way to recover from this error,
* even if mount option nointr is specified.
* SIGKILL, for example, cannot be blocked.
*/
rpcerr.re_status = RPC_INTR;
rpcerr.re_errno = EINTR;
break;
case RPC_UDERROR:
/*
* If the NFS server is local (vold) and
* it goes away then we get RPC_UDERROR.
* This is a retryable error, so we would
* loop; check to see if the specific
* error was ECONNRESET, indicating that the
* target did not exist at all. If so,
* return with RPC_PROGUNAVAIL and
* ECONNRESET to indicate why.
*/
CLNT_GETERR(client, &rpcerr);
if (rpcerr.re_errno == ECONNRESET) {
rpcerr.re_status = RPC_PROGUNAVAIL;
rpcerr.re_errno = ECONNRESET;
break;
}
/*FALLTHROUGH*/
default: /* probably RPC_TIMEDOUT */
if (IS_UNRECOVERABLE_RPC(status))
break;
/*
* increment server not responding count
*/
mutex_enter(&mi->mi_lock);
mi->mi_noresponse++;
mutex_exit(&mi->mi_lock);
#ifdef DEBUG
nfscl->nfscl_stat.noresponse.value.ui64++;
#endif
if (!(mi->mi_flags & MI_HARD)) {
if (!(mi->mi_flags & MI_SEMISOFT) ||
(mi->mi_acl_ss_call_type[which] == 0))
break;
}
/*
* The call is in progress (over COTS).
* Try the CLNT_CALL again, but don't
* print a noisy error message.
*/
if (status == RPC_INPROGRESS) {
tryagain = TRUE;
break;
}
if (flags & RFSCALL_SOFT)
break;
/*
* On zone shutdown, just move on.
*/
if (zone_status_get(curproc->p_zone) >=
ZONE_IS_SHUTTING_DOWN) {
rpcerr.re_status = RPC_FAILED;
rpcerr.re_errno = EIO;
break;
}
/*
* NFS client failover support
*
* If the current server just failed us, we'll
* start the process of finding a new server.
* After that, we can just retry.
*/
if (FAILOVER_MOUNT(mi) && failover_safe(fi)) {
if (svp == mi->mi_curr_serv)
failover_newserver(mi);
clfree_impl(client, ch, nfscl);
goto failoverretry;
}
tryagain = TRUE;
timeo = backoff(timeo);
mutex_enter(&mi->mi_lock);
if (!(mi->mi_flags & MI_PRINTED)) {
mi->mi_flags |= MI_PRINTED;
mutex_exit(&mi->mi_lock);
#ifdef DEBUG
zprintf(zoneid,
"NFS_ACL%d server %s not responding still trying\n",
mi->mi_vers, svp->sv_hostname);
#else
zprintf(zoneid,
"NFS server %s not responding still trying\n",
svp->sv_hostname);
#endif
} else
mutex_exit(&mi->mi_lock);
if (*douprintf && curproc->p_sessp->s_vp != NULL) {
*douprintf = 0;
if (!(mi->mi_flags & MI_NOPRINT))
#ifdef DEBUG
uprintf(
"NFS_ACL%d server %s not responding still trying\n",
mi->mi_vers, svp->sv_hostname);
#else
uprintf(
"NFS server %s not responding still trying\n",
svp->sv_hostname);
#endif
}
#if 0 /* notyet */
/*
* If doing dynamic adjustment of transfer
* size and if it's a read or write call
* and if the transfer size changed while
* retransmitting or if the feedback routine
* changed the transfer size,
* then exit rfscall so that the transfer
* size can be adjusted at the vnops level.
*/
if ((mi->mi_flags & MI_DYNAMIC) &&
mi->mi_acl_timer_type[which] != 0 &&
(mi->mi_curread != my_rsize ||
mi->mi_curwrite != my_wsize ||
nfs_feedback(FEEDBACK_REXMIT1, which, mi))) {
/*
* On read or write calls, return
* back to the vnode ops level if
* the transfer size changed.
*/
clfree_impl(client, ch, nfscl);
if (cred_cloned)
crfree(cr);
return (ENFS_TRYAGAIN);
}
#endif
}
} while (tryagain);
if (status != RPC_SUCCESS) {
/*
* Let soft mounts use the timed out message.
*/
if (status == RPC_INPROGRESS)
status = RPC_TIMEDOUT;
nfscl->nfscl_stat.badcalls.value.ui64++;
if (status == RPC_CANTDECODERES ||
status == RPC_PROGUNAVAIL ||
status == RPC_PROCUNAVAIL ||
status == RPC_CANTDECODEARGS ||
status == RPC_PROGVERSMISMATCH)
CLNT_GETERR(client, &rpcerr);
else if (status != RPC_INTR) {
mutex_enter(&mi->mi_lock);
mi->mi_flags |= MI_DOWN;
mutex_exit(&mi->mi_lock);
CLNT_GETERR(client, &rpcerr);
#ifdef DEBUG
bufp = clnt_sperror(client, svp->sv_hostname);
zprintf(zoneid, "NFS_ACL%d %s failed for %s\n",
mi->mi_vers, mi->mi_aclnames[which], bufp);
if (curproc->p_sessp->s_vp != NULL) {
if (!(mi->mi_flags & MI_NOPRINT)) {
uprintf("NFS_ACL%d %s failed for %s\n",
mi->mi_vers, mi->mi_aclnames[which],
bufp);
}
}
kmem_free(bufp, MAXPATHLEN);
#else
zprintf(zoneid,
"NFS %s failed for server %s: error %d (%s)\n",
mi->mi_aclnames[which], svp->sv_hostname,
status, clnt_sperrno(status));
if (curproc->p_sessp->s_vp != NULL) {
if (!(mi->mi_flags & MI_NOPRINT))
uprintf(
"NFS %s failed for server %s: error %d (%s)\n",
mi->mi_aclnames[which],
svp->sv_hostname, status,
clnt_sperrno(status));
}
#endif
/*
* when CLNT_CALL() fails with RPC_AUTHERROR,
* re_errno is set appropriately depending on
* the authentication error
*/
if (status == RPC_VERSMISMATCH ||
status == RPC_PROGVERSMISMATCH)
rpcerr.re_errno = EIO;
}
} else {
/*
* Test the value of mi_down and mi_printed without
* holding the mi_lock mutex. If they are both zero,
* then it is okay to skip the down and printed
* processing. This saves on a mutex_enter and
* mutex_exit pair for a normal, successful RPC.
* This was just complete overhead.
*/
if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) {
mutex_enter(&mi->mi_lock);
mi->mi_flags &= ~MI_DOWN;
if (mi->mi_flags & MI_PRINTED) {
mi->mi_flags &= ~MI_PRINTED;
mutex_exit(&mi->mi_lock);
#ifdef DEBUG
zprintf(zoneid, "NFS_ACL%d server %s ok\n",
mi->mi_vers, svp->sv_hostname);
#else
zprintf(zoneid, "NFS server %s ok\n",
svp->sv_hostname);
#endif
} else
mutex_exit(&mi->mi_lock);
}
if (*douprintf == 0) {
if (!(mi->mi_flags & MI_NOPRINT))
#ifdef DEBUG
uprintf("NFS_ACL%d server %s ok\n",
mi->mi_vers, svp->sv_hostname);
#else
uprintf("NFS server %s ok\n", svp->sv_hostname);
#endif
*douprintf = 1;
}
}
clfree_impl(client, ch, nfscl);
if (cred_cloned)
crfree(cr);
ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);
#if 0 /* notyet */
TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d",
rpcerr.re_errno);
#endif
return (rpcerr.re_errno);
}
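/*
 * Convert a vattr into the over-the-wire NFS Version 2 sattr form.
 * Fields which are not being set are marked with -1; times which do
 * not fit in 32 bits cause EOVERFLOW.
 */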
int
vattr_to_sattr(struct vattr *vap, struct nfssattr *sa)
{
uint_t mask = vap->va_mask;
if (!(mask & AT_MODE))
sa->sa_mode = (uint32_t)-1;
else
sa->sa_mode = vap->va_mode;
if (!(mask & AT_UID))
sa->sa_uid = (uint32_t)-1;
else
sa->sa_uid = (uint32_t)vap->va_uid;
if (!(mask & AT_GID))
sa->sa_gid = (uint32_t)-1;
else
sa->sa_gid = (uint32_t)vap->va_gid;
if (!(mask & AT_SIZE))
sa->sa_size = (uint32_t)-1;
else
sa->sa_size = (uint32_t)vap->va_size;
if (!(mask & AT_ATIME))
sa->sa_atime.tv_sec = sa->sa_atime.tv_usec = (int32_t)-1;
else {
/* check time validity */
if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
return (EOVERFLOW);
}
sa->sa_atime.tv_sec = vap->va_atime.tv_sec;
sa->sa_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
}
if (!(mask & AT_MTIME))
sa->sa_mtime.tv_sec = sa->sa_mtime.tv_usec = (int32_t)-1;
else {
/* check time validity */
if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) {
return (EOVERFLOW);
}
sa->sa_mtime.tv_sec = vap->va_mtime.tv_sec;
sa->sa_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
}
return (0);
}
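/*
 * Convert a vattr into the over-the-wire NFS Version 3 sattr3 form,
 * using the set_it discriminators to indicate which fields to change.
 */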
int
vattr_to_sattr3(struct vattr *vap, sattr3 *sa)
{
uint_t mask = vap->va_mask;
if (!(mask & AT_MODE))
sa->mode.set_it = FALSE;
else {
sa->mode.set_it = TRUE;
sa->mode.mode = (mode3)vap->va_mode;
}
if (!(mask & AT_UID))
sa->uid.set_it = FALSE;
else {
sa->uid.set_it = TRUE;
sa->uid.uid = (uid3)vap->va_uid;
}
if (!(mask & AT_GID))
sa->gid.set_it = FALSE;
else {
sa->gid.set_it = TRUE;
sa->gid.gid = (gid3)vap->va_gid;
}
if (!(mask & AT_SIZE))
sa->size.set_it = FALSE;
else {
sa->size.set_it = TRUE;
sa->size.size = (size3)vap->va_size;
}
if (!(mask & AT_ATIME))
sa->atime.set_it = DONT_CHANGE;
else {
/* check time validity */
if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
return (EOVERFLOW);
}
sa->atime.set_it = SET_TO_CLIENT_TIME;
sa->atime.atime.seconds = (uint32)vap->va_atime.tv_sec;
sa->atime.atime.nseconds = (uint32)vap->va_atime.tv_nsec;
}
if (!(mask & AT_MTIME))
sa->mtime.set_it = DONT_CHANGE;
else {
/* check time validity */
if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) {
return (EOVERFLOW);
}
sa->mtime.set_it = SET_TO_CLIENT_TIME;
sa->mtime.mtime.seconds = (uint32)vap->va_mtime.tv_sec;
sa->mtime.mtime.nseconds = (uint32)vap->va_mtime.tv_nsec;
}
return (0);
}
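/*
 * Fill in NFS Version 2 directory operation arguments: the parent
 * directory's filehandle and the component name.
 */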
void
setdiropargs(struct nfsdiropargs *da, char *nm, vnode_t *dvp)
{
da->da_fhandle = VTOFH(dvp);
da->da_name = nm;
da->da_flags = 0;
}
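/*
 * Fill in NFS Version 3 directory operation arguments.
 */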
void
setdiropargs3(diropargs3 *da, char *nm, vnode_t *dvp)
{
da->dirp = VTOFH3(dvp);
da->name = nm;
}
int
setdirgid(vnode_t *dvp, gid_t *gidp, cred_t *cr)
{
int error;
rnode_t *rp;
struct vattr va;
va.va_mask = AT_MODE | AT_GID;
error = VOP_GETATTR(dvp, &va, 0, cr);
if (error)
return (error);
/*
* To determine the expected group-id of the created file:
* 1) If the filesystem was not mounted with the Old-BSD-compatible
* GRPID option, and the directory's set-gid bit is clear,
* then use the process's gid.
* 2) Otherwise, set the group-id to the gid of the parent directory.
*/
rp = VTOR(dvp);
mutex_enter(&rp->r_statelock);
if (!(VTOMI(dvp)->mi_flags & MI_GRPID) && !(va.va_mode & VSGID))
*gidp = crgetgid(cr);
else
*gidp = va.va_gid;
mutex_exit(&rp->r_statelock);
return (0);
}
int
setdirmode(vnode_t *dvp, mode_t *omp, cred_t *cr)
{
int error;
struct vattr va;
va.va_mask = AT_MODE;
error = VOP_GETATTR(dvp, &va, 0, cr);
if (error)
return (error);
/*
* Modify the expected mode (om) so that the set-gid bit matches
* that of the parent directory (dvp).
*/
if (va.va_mode & VSGID)
*omp |= VSGID;
else
*omp &= ~VSGID;
return (0);
}
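/*
 * Set or clear VSWAPLIKE on the vnode: a regular file whose mode has
 * the sticky bit set but owner-execute clear is treated as swap-like.
 */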
void
nfs_setswaplike(vnode_t *vp, vattr_t *vap)
{
if (vp->v_type == VREG && (vap->va_mode & (VEXEC | VSVTX)) == VSVTX) {
if (!(vp->v_flag & VSWAPLIKE)) {
mutex_enter(&vp->v_lock);
vp->v_flag |= VSWAPLIKE;
mutex_exit(&vp->v_lock);
}
} else {
if (vp->v_flag & VSWAPLIKE) {
mutex_enter(&vp->v_lock);
vp->v_flag &= ~VSWAPLIKE;
mutex_exit(&vp->v_lock);
}
}
}
/*
* Free the resources associated with an rnode.
*/
static void
rinactive(rnode_t *rp, cred_t *cr)
{
vnode_t *vp;
cred_t *cred;
char *contents;
int size;
vsecattr_t *vsp;
int error;
nfs3_pathconf_info *info;
/*
* Before freeing anything, wait until all asynchronous
* activity is done on this rnode. This will allow all
* asynchronous read ahead and write behind i/o's to
* finish.
*/
mutex_enter(&rp->r_statelock);
while (rp->r_count > 0)
cv_wait(&rp->r_cv, &rp->r_statelock);
mutex_exit(&rp->r_statelock);
/*
* Flush and invalidate all pages associated with the vnode.
*/
vp = RTOV(rp);
if (vn_has_cached_data(vp)) {
ASSERT(vp->v_type != VCHR);
if ((rp->r_flags & RDIRTY) && !rp->r_error) {
error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, 0, cr);
if (error && (error == ENOSPC || error == EDQUOT)) {
mutex_enter(&rp->r_statelock);
if (!rp->r_error)
rp->r_error = error;
mutex_exit(&rp->r_statelock);
}
}
nfs_invalidate_pages(vp, (u_offset_t)0, cr);
}
/*
* Free any held credentials and caches which may be associated
* with this rnode.
*/
mutex_enter(&rp->r_statelock);
cred = rp->r_cred;
rp->r_cred = NULL;
contents = rp->r_symlink.contents;
size = rp->r_symlink.size;
rp->r_symlink.contents = NULL;
vsp = rp->r_secattr;
rp->r_secattr = NULL;
info = rp->r_pathconf;
rp->r_pathconf = NULL;
mutex_exit(&rp->r_statelock);
/*
* Free the held credential.
*/
if (cred != NULL)
crfree(cred);
/*
* Free the access cache entries.
*/
(void) nfs_access_purge_rp(rp);
/*
* Free the readdir cache entries.
*/
if (HAVE_RDDIR_CACHE(rp))
nfs_purge_rddir_cache(vp);
/*
* Free the symbolic link cache.
*/
if (contents != NULL) {
kmem_free((void *)contents, size);
}
/*
* Free any cached ACL.
*/
if (vsp != NULL)
nfs_acl_free(vsp);
/*
* Free any cached pathconf information.
*/
if (info != NULL)
kmem_free(info, sizeof (*info));
}
/*
* Return a vnode for the given NFS Version 2 file handle.
* If no rnode exists for this fhandle, create one and put it
* into the hash queues. If the rnode for this fhandle
* already exists, return it.
*
* Note: make_rnode() may upgrade the hash bucket lock to exclusive.
*/
vnode_t *
makenfsnode(fhandle_t *fh, struct nfsfattr *attr, struct vfs *vfsp,
hrtime_t t, cred_t *cr, char *dnm, char *nm)
{
int newnode;
int index;
vnode_t *vp;
nfs_fhandle nfh;
vattr_t va;
nfh.fh_len = NFS_FHSIZE;
bcopy(fh, nfh.fh_buf, NFS_FHSIZE);
index = rtablehash(&nfh);
rw_enter(&rtable[index].r_lock, RW_READER);
vp = make_rnode(&nfh, &rtable[index], vfsp, nfs_vnodeops,
nfs_putapage, nfs_rddir_compar, &newnode, cr, dnm, nm);
if (attr != NULL) {
if (!newnode) {
rw_exit(&rtable[index].r_lock);
(void) nfs_cache_fattr(vp, attr, &va, t, cr);
} else {
if (attr->na_type < NFNON || attr->na_type > NFSOC)
vp->v_type = VBAD;
else
vp->v_type = n2v_type(attr);
/*
* A translation here seems to be necessary
* because this function can be called
* with `attr' that has come from the wire,
* and been operated on by vattr_to_nattr().
* See nfsrootvp()->VOP_GETATTR()->nfsgetattr()
* ->nfs_getattr_otw()->rfscall()->vattr_to_nattr()
* ->makenfsnode().
*/
if ((attr->na_rdev & 0xffff0000) == 0)
vp->v_rdev = nfsv2_expdev(attr->na_rdev);
else
vp->v_rdev = expldev(n2v_rdev(attr));
nfs_attrcache(vp, attr, t);
rw_exit(&rtable[index].r_lock);
}
} else {
if (newnode) {
PURGE_ATTRCACHE(vp);
}
rw_exit(&rtable[index].r_lock);
}
return (vp);
}
/*
* Return a vnode for the given NFS Version 3 file handle.
* If no rnode exists for this fhandle, create one and put it
* into the hash queues. If the rnode for this fhandle
* already exists, return it.
*
* Note: make_rnode() may upgrade the hash bucket lock to exclusive.
*/
vnode_t *
makenfs3node_va(nfs_fh3 *fh, vattr_t *vap, struct vfs *vfsp, hrtime_t t,
cred_t *cr, char *dnm, char *nm)
{
int newnode;
int index;
vnode_t *vp;
index = rtablehash((nfs_fhandle *)fh);
rw_enter(&rtable[index].r_lock, RW_READER);
vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp,
nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr,
dnm, nm);
if (vap == NULL) {
if (newnode) {
PURGE_ATTRCACHE(vp);
}
rw_exit(&rtable[index].r_lock);
return (vp);
}
if (!newnode) {
rw_exit(&rtable[index].r_lock);
nfs_attr_cache(vp, vap, t, cr);
} else {
rnode_t *rp = VTOR(vp);
vp->v_type = vap->va_type;
vp->v_rdev = vap->va_rdev;
mutex_enter(&rp->r_statelock);
if (rp->r_mtime <= t)
nfs_attrcache_va(vp, vap);
mutex_exit(&rp->r_statelock);
rw_exit(&rtable[index].r_lock);
}
return (vp);
}
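/*
 * Same as makenfs3node_va() above, but takes over-the-wire fattr3
 * attributes rather than a vattr.
 */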
vnode_t *
makenfs3node(nfs_fh3 *fh, fattr3 *attr, struct vfs *vfsp, hrtime_t t,
cred_t *cr, char *dnm, char *nm)
{
int newnode;
int index;
vnode_t *vp;
vattr_t va;
index = rtablehash((nfs_fhandle *)fh);
rw_enter(&rtable[index].r_lock, RW_READER);
vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp,
nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr,
dnm, nm);
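/*
* Cache any supplied attributes, which here arrive in the
* over-the-wire fattr3 form rather than as a vattr.
*/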
if (attr == NULL) {
if (newnode) {
PURGE_ATTRCACHE(vp);
}
rw_exit(&rtable[index].r_lock);
return (vp);
}
if (!newnode) {
rw_exit(&rtable[index].r_lock);
(void) nfs3_cache_fattr3(vp, attr, &va, t, cr);
} else {
if (attr->type < NF3REG || attr->type > NF3FIFO)
vp->v_type = VBAD;
else
vp->v_type = nf3_to_vt[attr->type];
vp->v_rdev = makedevice(attr->rdev.specdata1,
attr->rdev.specdata2);
nfs3_attrcache(vp, attr, t);
rw_exit(&rtable[index].r_lock);
}
return (vp);
}
/*
* Read this comment before making changes to rtablehash()!
* This is a hash function in which seemingly obvious and harmless
* changes can cause escalations costing millions of dollars!
* Know what you are doing.
*
* rtablehash() implements Jenkins' one-at-a-time hash algorithm. The
* algorithm is currently detailed here:
*
* http://burtleburtle.net/bob/hash/doobs.html
*
* Of course, the above link may not be valid by the time you are reading
* this, but suffice it to say that the one-at-a-time algorithm works well in
* almost all cases. If you are changing the algorithm be sure to verify that
* the hash algorithm still provides even distribution in all cases and with
* any server returning filehandles in whatever order (sequential or random).
*/
static int
rtablehash(nfs_fhandle *fh)
{
ulong_t hash, len, i;
char *key;
key = fh->fh_buf;
len = (ulong_t)fh->fh_len;
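/*
* Mix each byte of the file handle into the hash using the
* one-at-a-time add/shift/xor steps.
*/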
for (hash = 0, i = 0; i < len; i++) {
hash += key[i];
hash += (hash << 10);
hash ^= (hash >> 6);
}
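/*
* Final avalanche passes; mask the result down to a hash
* bucket index.
*/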
hash += (hash << 3);
hash ^= (hash >> 11);
hash += (hash << 15);
return (hash & rtablemask);
}
static vnode_t *
make_rnode(nfs_fhandle *fh, rhashq_t *rhtp, struct vfs *vfsp,
struct vnodeops *vops,
int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *, int, cred_t *),
int (*compar)(const void *, const void *),
int *newnode, cred_t *cr, char *dnm, char *nm)
{
rnode_t *rp;
rnode_t *trp;
vnode_t *vp;
mntinfo_t *mi;
ASSERT(RW_READ_HELD(&rhtp->r_lock));
mi = VFTOMI(vfsp);
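/*
* Look for an existing rnode for this file handle. The search
* restarts at 'start' whenever the hash bucket lock has to be
* dropped, since another thread may have created or reused an
* rnode for the same file handle in the meantime.
*/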
start:
if ((rp = rfind(rhtp, fh, vfsp)) != NULL) {
vp = RTOV(rp);
nfs_set_vroot(vp);
*newnode = 0;
return (vp);
}
rw_exit(&rhtp->r_lock);
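/*
* No existing rnode was found. If the number of allocated
* rnodes has reached the limit and the freelist is not empty,
* recycle an rnode from the freelist; otherwise allocate a
* fresh one.
*/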
mutex_enter(&rpfreelist_lock);
if (rpfreelist != NULL && rnew >= nrnode) {
rp = rpfreelist;
rp_rmfree(rp);
mutex_exit(&rpfreelist_lock);
vp = RTOV(rp);
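/*
* If the reclaimed rnode is still hashed, remove it from its
* hash queue, but back out and retry the lookup if another
* thread has acquired a reference to the vnode in the
* meantime.
*/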
if (rp->r_flags & RHASHED) {
rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
mutex_enter(&vp->v_lock);
if (vp->v_count > 1) {
vp->v_count--;
mutex_exit(&vp->v_lock);
rw_exit(&rp->r_hashq->r_lock);
rw_enter(&rhtp->r_lock, RW_READER);
goto start;
}
mutex_exit(&vp->v_lock);
rp_rmhash_locked(rp);
rw_exit(&rp->r_hashq->r_lock);
}
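/*
* Release the caches and pages still associated with the old
* identity of the reclaimed rnode.
*/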
rinactive(rp, cr);
mutex_enter(&vp->v_lock);
if (vp->v_count > 1) {
vp->v_count--;
mutex_exit(&vp->v_lock);
rw_enter(&rhtp->r_lock, RW_READER);
goto start;
}
mutex_exit(&vp->v_lock);
vn_invalid(vp);
/*
* Destroy the old locks before bzero'ing the rnode and
* recreating the locks below.
*/
nfs_rw_destroy(&rp->r_rwlock);
nfs_rw_destroy(&rp->r_lkserlock);
mutex_destroy(&rp->r_statelock);
cv_destroy(&rp->r_cv);
cv_destroy(&rp->r_commit.c_cv);
nfs_free_r_path(rp);
avl_destroy(&rp->r_dir);
/*
* Make sure that if the rnode is recycled, the hold on the
* old vfs is released before the rnode is reused.
*/
VFS_RELE(vp->v_vfsp);
vn_reinit(vp);
} else {
vnode_t *new_vp;
mutex_exit(&rpfreelist_lock);
rp = kmem_cache_alloc(rnode_cache, KM_SLEEP);
new_vp = vn_alloc(KM_SLEEP);
atomic_add_long((ulong_t *)&rnew, 1);
#ifdef DEBUG
clstat_debug.nrnode.value.ui64++;
#endif
vp = new_vp;
}
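/*
* Initialize (or reinitialize) the rnode from scratch and
* copy in the file handle.
*/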
bzero(rp, sizeof (*rp));
rp->r_vnode = vp;
nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL);
nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL);
mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL);
cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL);
rp->r_fh.fh_len = fh->fh_len;
bcopy(fh->fh_buf, rp->r_fh.fh_buf, fh->fh_len);
rp->r_server = mi->mi_curr_serv;
if (FAILOVER_MOUNT(mi)) {
/*
* If the mount has replicated (failover) servers, stash the
* pathname for use during failover.
*/
if (dnm != NULL && nm != NULL) {
char *s, *p;
uint_t len;
len = (uint_t)(strlen(dnm) + strlen(nm) + 2);
rp->r_path = kmem_alloc(len, KM_SLEEP);
#ifdef DEBUG
clstat_debug.rpath.value.ui64 += len;
#endif
s = rp->r_path;
for (p = dnm; *p; p++)
*s++ = *p;
*s++ = '/';
for (p = nm; *p; p++)
*s++ = *p;
*s = '\0';
} else {
/* special case for root */
rp->r_path = kmem_alloc(2, KM_SLEEP);
#ifdef DEBUG
clstat_debug.rpath.value.ui64 += 2;
#endif
*rp->r_path = '.';
*(rp->r_path + 1) = '\0';
}
}
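/*
* Hold the vfs so that it cannot disappear while this rnode
* references it; the hold is released when the rnode is
* recycled (see the VFS_RELE() above) or destroyed.
*/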
VFS_HOLD(vfsp);
rp->r_putapage = putapage;
rp->r_hashq = rhtp;
rp->r_flags = RREADDIRPLUS;
avl_create(&rp->r_dir, compar, sizeof (rddir_cache),
offsetof(rddir_cache, tree));
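/*
* Finish initializing the embedded vnode: install the vnode
* operations, link it to this rnode and vfs, and mark it as
* the root vnode if it matches the server's root file handle.
*/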
vn_setops(vp, vops);
vp->v_data = (caddr_t)rp;
vp->v_vfsp = vfsp;
vp->v_type = VNON;
nfs_set_vroot(vp);
/*
* There is a race condition if another thread allocates an
* rnode for this file handle while no locks are held, so
* check the hash queue again and recover if one is found.
*/
rw_enter(&rhtp->r_lock, RW_WRITER);
if ((trp = rfind(rhtp, fh, vfsp)) != NULL) {
vp = RTOV(trp);
nfs_set_vroot(vp);
*newnode = 0;
rw_exit(&rhtp->r_lock);
rp_addfree(rp, cr);
rw_enter(&rhtp->r_lock, RW_READER);
return (vp);
}
rp_addhash(rp);
*newnode = 1;
return (vp);
}
static void
nfs_set_vroot(vnode_t *vp)
{
rnode_t *rp;
nfs_fhandle *rootfh;
rp = VTOR(vp);
rootfh = &rp->r_server->sv_fhandle;
if (rootfh->fh_len == rp->r_fh.fh_len &&
bcmp(rootfh->fh_buf, rp->r_fh.fh_buf, rp->r_fh.fh_len) == 0) {
if (!(vp->v_flag & VROOT)) {
mutex_enter(&vp->v_lock);
vp->v_flag |= VROOT;
mutex_exit(&vp->v_lock);
}
}
}
static void
nfs_free_r_path(rnode_t *rp)
{
char *path;
size_t len;
path = rp->r_path;
if (path) {
rp->r_path = NULL;
len = strlen(path) + 1;
kmem_free(path, len);
#ifdef DEBUG
clstat_debug.rpath.value.ui64 -= len;
#endif
}
}
/*
* Put an rnode on the free list.
*
* Rnodes which were allocated above and beyond the normal limit
* are immediately freed.
*/
void
rp_addfree(rnode_t *rp, cred_t *cr)
{
vnode_t *vp;
struct vfs *vfsp;
vp = RTOV(rp);
ASSERT(vp->v_count >= 1);
ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
/*
* If we have too many rnodes allocated and there are no
* references to this rnode, or if the rnode is no longer
* accessible because it does not reside in the hash queues,
* or if an i/o error occurred while writing to the file,
* then just free it instead of putting it on the rnode
* freelist.
*/
vfsp = vp->v_vfsp;
if (((rnew > nrnode || !(rp->r_flags & RHASHED) || rp->r_error ||
(vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) {
if (rp->r_flags & RHASHED) {
rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
mutex_enter(&vp->v_lock);
if (vp->v_count > 1) {
vp->v_count--;
mutex_exit(&vp->v_lock);
rw_exit(&rp->r_hashq->r_lock);
return;
}
mutex_exit(&vp->v_lock);
rp_rmhash_locked(rp);
rw_exit(&rp->r_hashq->r_lock);
}
rinactive(rp, cr);
/*
* Recheck the vnode reference count. We need to
* make sure that another reference has not been
* acquired while we were not holding v_lock. The
* rnode is not in the rnode hash queues, so the
* only way for a reference to have been acquired
* is from a VOP_PUTPAGE issued because the rnode was
* marked RDIRTY or because it has a modified page. This
* reference may have been acquired before our call
* to rinactive. The i/o may have been completed,
* thus allowing rinactive to complete, but the
* reference to the vnode may not have been released
* yet. In any case, the rnode can not be destroyed
* until the other references to this vnode have been
* released. The other references will take care of
* either destroying the rnode or placing it on the
* rnode freelist. If there are no other references,
* then the rnode may be safely destroyed.
*/
mutex_enter(&vp->v_lock);
if (vp->v_count > 1) {
vp->v_count--;
mutex_exit(&vp->v_lock);
return;
}
mutex_exit(&vp->v_lock);
destroy_rnode(rp);
return;
}
/*
* Lock the hash queue and then recheck the reference count to
* ensure that no other thread has acquired a reference in the
* meantime; such a reference would indicate that the rnode
* should not be placed on the freelist. If another reference
* has been acquired, then just release this one and let the
* other thread complete the processing of adding this rnode
* to the freelist.
*/
rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
mutex_enter(&vp->v_lock);
if (vp->v_count > 1) {
vp->v_count--;
mutex_exit(&vp->v_lock);
rw_exit(&rp->r_hashq->r_lock);
return;
}
mutex_exit(&vp->v_lock);
/*
* If there is no cached data or metadata for this file, then
* put the rnode on the front of the freelist so that it will
* be reused before other rnodes which may have cached data or
* metadata associated with them.
*/
mutex_enter(&rpfreelist_lock);
if (rpfreelist == NULL) {
rp->r_freef = rp;
rp->r_freeb = rp;
rpfreelist = rp;
} else {
rp->r_freef = rpfreelist;
rp->r_freeb = rpfreelist->r_freeb;
rpfreelist->r_freeb->r_freef = rp;
rpfreelist->r_freeb = rp;
if (!vn_has_cached_data(vp) &&
!HAVE_RDDIR_CACHE(rp) &&
rp->r_symlink.contents == NULL &&
rp->r_secattr == NULL &&
rp->r_pathconf == NULL)
rpfreelist = rp;
}
mutex_exit(&rpfreelist_lock);
rw_exit(&rp->r_hashq->r_lock);
}
/*
* Remove an rnode from the free list.
*
* The caller must be holding rpfreelist_lock and the rnode
* must be on the freelist.
*/
static void
rp_rmfree(rnode_t *rp)
{
ASSERT(MUTEX_HELD(&rpfreelist_lock));
ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL);
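/*
* If rp is at the front of the freelist, advance the head;
* if rp was the only element, the list becomes empty.
*/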
if (rp == rpfreelist) {
rpfreelist = rp->r_freef;
if (rp == rpfreelist)
rpfreelist = NULL;
}
rp->r_freeb->r_freef = rp->r_freef;
rp->r_freef->r_freeb = rp->r_freeb;
rp->r_freef = rp->r_freeb = NULL;
}
/*
* Put an rnode in the hash table.
*
* The caller must be holding the exclusive hash queue lock.
*/
static void
rp_addhash(rnode_t *rp)
{
ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
ASSERT(!(rp->r_flags & RHASHED));
rp->r_hashf = rp->r_hashq->r_hashf;
rp->r_hashq->r_hashf = rp;
rp->r_hashb = (rnode_t *)rp->r_hashq;
rp->r_hashf->r_hashb = rp;
mutex_enter(&rp->r_statelock);
rp->r_flags |= RHASHED;
mutex_exit(&rp->r_statelock);
}