/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
* Copyright 2012 Milan Jurik. All rights reserved.
* Copyright (c) 2016 by Delphix. All rights reserved.
* Copyright 2017 Joyent, Inc.
*/
/*
* Overview of the RSM Kernel Agent:
* ---------------------------------
*
* rsm.c constitutes the implementation of the RSM kernel agent. The RSM
* kernel agent is a pseudo device driver which makes use of the RSMPI
* interface on behalf of the RSMAPI user library.
*
* The kernel agent functionality can be categorized into the following
* components:
* 1. Driver Infrastructure
* 2. Export/Import Segment Management
* 3. Internal resource allocation/deallocation
*
* The driver infrastructure includes the basic module loading entry points
* like _init, _info, _fini to load, unload and report information about
 * the driver module. The driver infrastructure also includes the
 * autoconfiguration entry points, namely attach, detach and getinfo,
 * for device autoconfiguration.
*
* The kernel agent is a pseudo character device driver and exports
* a cb_ops structure which defines the driver entry points for character
* device access. This includes the open and close entry points. The
 * other entry points provided are ioctl, devmap, segmap and chpoll. The
 * read and write entry points are not used since the device is memory
 * mapped. Also, ddi_prop_op is used for the prop_op entry point.
*
* The ioctl entry point supports a number of commands, which are used by
 * the RSMAPI library in order to export and import segments. These
 * include commands for binding and rebinding the physical pages
 * allocated to the virtual address range, publishing the export segment,
 * unpublishing and republishing an export segment, creating an
 * import segment and a virtual connection from this import segment to
 * an export segment, and performing scatter-gather data transfers and
 * barrier operations.
*
*
* Export and Import segments:
* ---------------------------
*
* In order to create an RSM export segment a process allocates a range in its
* virtual address space for the segment using standard Solaris interfaces.
* The process then calls RSMAPI, which in turn makes an ioctl call to the
* RSM kernel agent for an allocation of physical memory pages and for
* creation of the export segment by binding these pages to the virtual
* address range. These pages are locked in memory so that remote accesses
* are always applied to the correct page. Then the RSM segment is published,
* again via RSMAPI making an ioctl to the RSM kernel agent, and a segment id
* is assigned to it.
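 *
 * A minimal user-level sketch of this flow (illustrative only; error
 * handling omitted, "sci0" is a placeholder controller name, the call
 * signatures should be treated as indicative of the RSMAPI man pages
 * rather than authoritative, and vaddr/len is the page-aligned range
 * reserved above):
 *
 *	rsmapi_controller_handle_t ctl;
 *	rsm_memseg_export_handle_t eseg;
 *	rsm_memseg_id_t segid = 0;	(0 asks the system to assign an id)
 *
 *	(void) rsm_get_controller("sci0", &ctl);
 *	(void) rsm_memseg_export_create(ctl, &eseg, vaddr, len, 0);
 *	(void) rsm_memseg_export_publish(eseg, &segid, NULL, 0);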
*
* In order to import a published RSM segment, RSMAPI creates an import
* segment and forms a virtual connection across the interconnect to the
* export segment, via an ioctl into the kernel agent with the connect
 * command. The import segment setup is completed by mapping the
 * local device memory into the importer's virtual address space. The
 * mapping of the import segment is handled by the segmap/devmap
 * infrastructure described in the next section.
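 *
 * The import side, sketched at the same level (again illustrative;
 * node_id, segid and the permission value are placeholders):
 *
 *	rsm_memseg_import_handle_t iseg;
 *	void *va;
 *
 *	(void) rsm_memseg_import_connect(ctl, node_id, segid,
 *	    RSM_PERM_RDWR, &iseg);
 *	(void) rsm_memseg_import_map(iseg, &va, RSM_MAP_NONE,
 *	    RSM_PERM_RDWR, 0, len);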
*
* Segmap and Devmap interfaces:
*
* The RSM kernel agent allows device memory to be directly accessed by user
* threads via memory mapping. In order to do so, the RSM kernel agent
* supports the devmap and segmap entry points.
*
 * The segmap entry point (rsm_segmap) is responsible for setting up a memory
 * mapping as requested by mmap. The devmap entry point (rsm_devmap) is
 * responsible for exporting the device memory to the user applications.
 * rsm_segmap calls RSMPI rsm_map to allocate device memory. Then
 * control is transferred to the devmap_setup call, which calls rsm_devmap.
*
* rsm_devmap validates the user mapping to the device or kernel memory
* and passes the information to the system for setting up the mapping. The
 * actual setting up of the mapping is done by devmap_devmem_setup (for
 * device memory) or devmap_umem_setup (for kernel memory). Callbacks are
* registered for device context management via the devmap_devmem_setup
* or devmap_umem_setup calls. The callbacks are rsmmap_map, rsmmap_unmap,
* rsmmap_access, rsmmap_dup. The callbacks are called when a new mapping
* is created, a mapping is freed, a mapping is accessed or an existing
* mapping is duplicated respectively. These callbacks allow the RSM kernel
* agent to maintain state information associated with the mappings.
* The state information is mainly in the form of a cookie list for the import
* segment for which mapping has been done.
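 *
 * Schematically, the registered callback vector has the shape of the
 * devmap_callback_ctl structure from devmap_devmem_setup(9F) (a sketch,
 * with the role of each member noted in parentheses):
 *
 *	static struct devmap_callback_ctl rsm_devmap_ops = {
 *		DEVMAP_OPS_REV,
 *		rsmmap_map,	(new mapping created)
 *		rsmmap_access,	(mapping accessed)
 *		rsmmap_dup,	(mapping duplicated)
 *		rsmmap_unmap	(mapping freed)
 *	};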
*
* Forced disconnect of import segments:
*
* When an exported segment is unpublished, the exporter sends a forced
* disconnect message to all its importers. The importer segments are
* unloaded and disconnected. This involves unloading the original
* mappings and remapping to a preallocated kernel trash page. This is
* done by devmap_umem_remap. The trash/dummy page is a kernel page,
* preallocated by the kernel agent during attach using ddi_umem_alloc with
* the DDI_UMEM_TRASH flag set. This avoids a core dump in the application
* due to unloading of the original mappings.
*
* Additionally every segment has a mapping generation number associated
* with it. This is an entry in the barrier generation page, created
* during attach time. This mapping generation number for the import
* segments is incremented on a force disconnect to notify the application
* of the force disconnect. On this notification, the application needs
* to reconnect the segment to establish a new legitimate mapping.
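 *
 * From the importer's side, the RSMAPI barrier calls expose this
 * generation check; roughly (illustrative sequence only):
 *
 *	rsmapi_barrier_t bar;
 *
 *	(void) rsm_memseg_import_init_barrier(iseg, RSM_BAR_DEFAULT, &bar);
 *	(void) rsm_memseg_import_open_barrier(&bar);
 *	... access the mapped segment ...
 *	if (rsm_memseg_import_close_barrier(&bar) != RSM_SUCCESS)
 *		the generation changed; unmap, reconnect and retry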
*
*
* Locks used in the kernel agent:
* -------------------------------
*
* The kernel agent uses a variety of mutexes and condition variables for
* mutual exclusion of the shared data structures and for synchronization
* between the various threads. Some of the locks are described as follows.
*
 * Each resource structure, which represents either an export or an import
 * segment, has a lock associated with it: the resource mutex, rsmrc_lock.
 * This is used directly by the RSMRC_LOCK and RSMRC_UNLOCK macros and in
 * the rsmseglock_acquire and rsmseglock_release macros. An additional
* lock called the rsmsi_lock is used for the shared import data structure
* that is relevant for resources representing import segments. There is
* also a condition variable associated with the resource called s_cv. This
* is used to wait for events like the segment state change etc.
*
 * The resource structures are allocated from a pool of resource structures,
 * called rsm_resource. This pool is protected via a reader-writer lock,
 * rsm_resource.rsmrc_lock (distinct from the per-resource mutex above).
*
* There are two separate hash tables, one for the export segments and
* one for the import segments. The export segments are inserted into the
* export segment hash table only after they have been published and the
* import segments are inserted in the import segments list only after they
* have successfully connected to an exported segment. These tables are
* protected via reader-writer locks.
*
* Debug Support in the kernel agent:
* ----------------------------------
*
* Debugging support in the kernel agent is provided by the following
* macros.
*
 * DBG_PRINTF((category, level, message)) is a macro which logs a debug
 * message to the kernel agent's debug buffer, rsmka_dbg. This debug buffer
 * can be viewed in kmdb as *rsmka_dbg/s. The message is logged based
 * on the definition of the category and level. All messages that belong to
 * the specified category (rsmdbg_category) and are of an equal or greater
 * severity than the specified level (rsmdbg_level) are logged. The message
 * is a string which uses the same formatting rules as the strings used in
 * printf.
*
* The category defines which component of the kernel agent has logged this
* message. There are a number of categories that have been defined such as
* RSM_KERNEL_AGENT, RSM_OPS, RSM_IMPORT, RSM_EXPORT etc. A macro,
* DBG_ADDCATEGORY is used to add in another category to the currently
* specified category value so that the component using this new category
* can also effectively log debug messages. Thus, the category of a specific
* message is some combination of the available categories and we can define
* sub-categories if we want a finer level of granularity.
*
 * The level defines the severity of the message. Different level values are
 * defined, with RSM_ERR being the most severe and RSM_DEBUG_VERBOSE being
 * the least severe (debug level 0).
*
* DBG_DEFINE and DBG_DEFINE_STR are macros provided to declare a debug
* variable or a string respectively.
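 *
 * A typical call, matching usage later in this file:
 *
 *	DBG_PRINTF((RSM_KERNEL_AGENT | RSM_DDI, RSM_DEBUG_VERBOSE,
 *	    "rsm_attach enter\n"));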
*
*
* NOTES:
*
* Special Fork and Exec Handling:
* -------------------------------
*
 * The backing physical pages of an exported segment are always locked down.
 * Thus, there are two cases in which a process having exported segments
 * will cause a CPU to hang: (1) the process invokes exec; (2) a process
 * forks and invokes exit before the duped file descriptors for the export
 * segments are closed in the child process. The hang is caused because the
 * address space release algorithm in the Solaris VM subsystem is based on a
 * non-blocking loop which does not terminate while segments are locked
 * down. In addition to this, the Solaris VM subsystem lacks a callback
 * mechanism to the rsm kernel agent to allow unlocking these export
 * segment pages.
*
 * In order to circumvent this problem, the kernel agent does the following.
 * The Solaris VM subsystem keeps memory segments in increasing order of
 * virtual addresses. Thus a special page (special_exit_offset) is allocated
 * by the kernel agent and is mmapped into the heap area of the process
 * address space (the mmap is done by the RSMAPI library). During the mmap
 * processing of this special page by the devmap infrastructure, a callback
 * (the same devmap context management callbacks discussed above) is
 * registered for an unmap.
*
 * As discussed above, this page is processed by the Solaris address space
 * release code before any of the exported segment pages (which are allocated
 * from high memory). It is during this processing that the unmap callback gets
* called and this callback is responsible for force destroying the exported
* segments and thus eliminating the problem of locked pages.
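 *
 * Schematically, the library-side arrangement is (hypothetical call;
 * the actual offset constant is internal to the RSMAPI library):
 *
 *	(void) mmap(NULL, PAGESIZE, PROT_READ, MAP_SHARED, rsm_fd,
 *	    special_exit_offset);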
*
* Flow-control:
* ------------
*
 * A credit based flow control algorithm is used for messages whose
 * processing cannot be done in the interrupt context because it might
 * involve invoking rsmpi calls, or might take a long time to complete
 * or might need to allocate resources. The algorithm operates on a per
 * path basis. To send a message the pathend needs to have a credit and
 * it consumes one for every message that is flow controlled. On the
 * receiving pathend the message is put on a msgbuf_queue and a task is
 * dispatched to the worker thread, recv_taskq, where it is processed.
 * After processing the message, the receiving pathend dequeues it,
 * and if it has processed more than RSMIPC_LOTSFREE_MSGBUFS messages,
 * it sends credits back to the sender pathend.
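 *
 * In pseudocode (schematic only, not the actual function names):
 *
 *	sender:
 *		if (path->credits > 0) {
 *			path->credits--;
 *			send(msg);
 *		} else
 *			defer msg until a credit grant arrives;
 *
 *	receiver:
 *		enqueue msg on msgbuf_queue;
 *		dispatch a processing task to recv_taskq;
 *		after processing, if (msgs handled > RSMIPC_LOTSFREE_MSGBUFS)
 *			send credits back to the sender pathend;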
*
* RSM_DRTEST:
* -----------
*
 * This is used to enable DR testing using a test driver on test
 * platforms which do not support DR.
*
*/
#include <sys/types.h>
#include <sys/param.h>
#include <sys/user.h>
#include <sys/buf.h>
#include <sys/systm.h>
#include <sys/cred.h>
#include <sys/vm.h>
#include <sys/uio.h>
#include <vm/seg.h>
#include <vm/page.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/errno.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/mman.h>
#include <sys/open.h>
#include <sys/atomic.h>
#include <sys/mem_config.h>
#include <sys/ddi.h>
#include <sys/devops.h>
#include <sys/ddidevmap.h>
#include <sys/sunddi.h>
#include <sys/esunddi.h>
#include <sys/ddi_impldefs.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/modctl.h>
#include <sys/policy.h>
#include <sys/taskq.h>
#include <sys/rsm/rsm_common.h>
#include <sys/rsm/rsmapi_common.h>
#include <sys/rsm/rsm.h>
#include <rsm_in.h>
#include <sys/rsm/rsmka_path_int.h>
#include <sys/rsm/rsmpi.h>
#include <sys/debug.h>
#include <sys/tuneable.h>
#ifdef RSM_DRTEST
extern int rsm_kphysm_setup_func_register(kphysm_setup_vector_t *vec,
void *arg);
extern void rsm_kphysm_setup_func_unregister(kphysm_setup_vector_t *vec,
void *arg);
#endif
extern void dbg_printf(int category, int level, char *fmt, ...);
extern void rsmka_pathmanager_init();
extern void rsmka_pathmanager_cleanup();
extern void rele_sendq_token(sendq_token_t *);
extern rsm_addr_t get_remote_hwaddr(adapter_t *, rsm_node_id_t);
extern rsm_node_id_t get_remote_nodeid(adapter_t *, rsm_addr_t);
extern int rsmka_topology_ioctl(caddr_t, int, int);
extern pri_t maxclsyspri;
extern work_queue_t work_queue;
extern kmutex_t ipc_info_lock;
extern kmutex_t ipc_info_cvlock;
extern kcondvar_t ipc_info_cv;
extern kmutex_t path_hold_cvlock;
extern kcondvar_t path_hold_cv;
extern kmutex_t rsmka_buf_lock;
extern path_t *rsm_find_path(char *, int, rsm_addr_t);
extern adapter_t *rsmka_lookup_adapter(char *, int);
extern sendq_token_t *rsmka_get_sendq_token(rsm_node_id_t, sendq_token_t *);
extern boolean_t rsmka_do_path_active(path_t *, int);
extern boolean_t rsmka_check_node_alive(rsm_node_id_t);
extern void rsmka_release_adapter(adapter_t *);
extern void rsmka_enqueue_msgbuf(path_t *path, void *data);
extern void rsmka_dequeue_msgbuf(path_t *path);
extern msgbuf_elem_t *rsmka_gethead_msgbuf(path_t *path);
/* lint -w2 */
static int rsm_open(dev_t *, int, int, cred_t *);
static int rsm_close(dev_t, int, int, cred_t *);
static int rsm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode,
cred_t *credp, int *rvalp);
static int rsm_devmap(dev_t, devmap_cookie_t, offset_t, size_t, size_t *,
uint_t);
static int rsm_segmap(dev_t, off_t, struct as *, caddr_t *, off_t, uint_t,
uint_t, uint_t, cred_t *);
static int rsm_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
struct pollhead **phpp);
static int rsm_info(dev_info_t *, ddi_info_cmd_t, void *, void **);
static int rsm_attach(dev_info_t *, ddi_attach_cmd_t);
static int rsm_detach(dev_info_t *, ddi_detach_cmd_t);
static int rsmipc_send(rsm_node_id_t, rsmipc_request_t *, rsmipc_reply_t *);
static void rsm_force_unload(rsm_node_id_t, rsm_memseg_id_t, boolean_t);
static void rsm_send_importer_disconnects(rsm_memseg_id_t, rsm_node_id_t);
static void rsm_send_republish(rsm_memseg_id_t, rsmapi_access_entry_t *, int,
rsm_permission_t);
static void rsm_export_force_destroy(ddi_umem_cookie_t *);
static void rsmacl_free(rsmapi_access_entry_t *, int);
static void rsmpiacl_free(rsm_access_entry_t *, int);
static int rsm_inc_pgcnt(pgcnt_t);
static void rsm_dec_pgcnt(pgcnt_t);
static void rsm_free_mapinfo(rsm_mapinfo_t *mapinfop);
static rsm_mapinfo_t *rsm_get_mapinfo(rsmseg_t *, off_t, size_t, off_t *,
size_t *);
static void exporter_quiesce();
static void rsmseg_suspend(rsmseg_t *, int *);
static void rsmsegshare_suspend(rsmseg_t *);
static int rsmseg_resume(rsmseg_t *, void **);
static int rsmsegshare_resume(rsmseg_t *);
static struct cb_ops rsm_cb_ops = {
rsm_open, /* open */
rsm_close, /* close */
nodev, /* strategy */
nodev, /* print */
nodev, /* dump */
nodev, /* read */
nodev, /* write */
rsm_ioctl, /* ioctl */
rsm_devmap, /* devmap */
NULL, /* mmap */
rsm_segmap, /* segmap */
rsm_chpoll, /* poll */
ddi_prop_op, /* cb_prop_op */
0, /* streamtab */
D_NEW|D_MP|D_DEVMAP, /* Driver compatibility flag */
0,
0,
0
};
static struct dev_ops rsm_ops = {
DEVO_REV, /* devo_rev, */
0, /* refcnt */
rsm_info, /* get_dev_info */
nulldev, /* identify */
nulldev, /* probe */
rsm_attach, /* attach */
rsm_detach, /* detach */
nodev, /* reset */
&rsm_cb_ops, /* driver operations */
(struct bus_ops *)0, /* bus operations */
0,
ddi_quiesce_not_needed, /* quiesce */
};
/*
* Module linkage information for the kernel.
*/
static struct modldrv modldrv = {
&mod_driverops, /* Type of module. This one is a pseudo driver */
"Remote Shared Memory Driver",
&rsm_ops, /* driver ops */
};
static struct modlinkage modlinkage = {
MODREV_1,
(void *)&modldrv,
0,
0,
0
};
static void rsm_dr_callback_post_add(void *arg, pgcnt_t delta);
static int rsm_dr_callback_pre_del(void *arg, pgcnt_t delta);
static void rsm_dr_callback_post_del(void *arg, pgcnt_t delta, int cancelled);
static kphysm_setup_vector_t rsm_dr_callback_vec = {
KPHYSM_SETUP_VECTOR_VERSION,
rsm_dr_callback_post_add,
rsm_dr_callback_pre_del,
rsm_dr_callback_post_del
};
/* This flag can be changed to 0 to help with PIT testing */
int rsmka_modunloadok = 1;
int no_reply_cnt = 0;
uint64_t rsm_ctrlmsg_errcnt = 0;
uint64_t rsm_ipcsend_errcnt = 0;
#define MAX_NODES 64
static struct rsm_driver_data rsm_drv_data;
static struct rsmresource_table rsm_resource;
static void rsmresource_insert(minor_t, rsmresource_t *, rsm_resource_type_t);
static void rsmresource_destroy(void);
static int rsmresource_alloc(minor_t *);
static rsmresource_t *rsmresource_free(minor_t rnum);
static int rsm_closeconnection(rsmseg_t *seg, void **cookie);
static int rsm_unpublish(rsmseg_t *seg, int mode);
static int rsm_unbind(rsmseg_t *seg);
static uint_t rsmhash(rsm_memseg_id_t key);
static void rsmhash_alloc(rsmhash_table_t *rhash, int size);
static void rsmhash_free(rsmhash_table_t *rhash, int size);
static void *rsmhash_getbkt(rsmhash_table_t *rhash, uint_t hashval);
static void **rsmhash_bktaddr(rsmhash_table_t *rhash, uint_t hashval);
static int rsm_send_notimporting(rsm_node_id_t dest, rsm_memseg_id_t segid,
void *cookie);
int rsm_disconnect(rsmseg_t *seg);
void rsmseg_unload(rsmseg_t *);
void rsm_suspend_complete(rsm_node_id_t src_node, int flag);
rsm_intr_hand_ret_t rsm_srv_func(rsm_controller_object_t *chd,
rsm_intr_q_op_t opcode, rsm_addr_t src,
void *data, size_t size, rsm_intr_hand_arg_t arg);
static void rsm_intr_callback(void *, rsm_addr_t, rsm_intr_hand_arg_t);
rsm_node_id_t my_nodeid;
/* cookie, va, offsets and length for the barrier */
static rsm_gnum_t *bar_va;
static ddi_umem_cookie_t bar_cookie;
static off_t barrier_offset;
static size_t barrier_size;
static int max_segs;
/* cookie for the trash memory */
static ddi_umem_cookie_t remap_cookie;
static rsm_memseg_id_t rsm_nextavail_segmentid;
extern taskq_t *work_taskq;
extern char *taskq_name;
static dev_info_t *rsm_dip; /* private copy of devinfo pointer */
static rsmhash_table_t rsm_export_segs; /* list of exported segs */
rsmhash_table_t rsm_import_segs; /* list of imported segs */
static rsmhash_table_t rsm_event_queues; /* list of event queues */
static rsm_ipc_t rsm_ipc; /* ipc info */
/* list of nodes to which RSMIPC_MSG_SUSPEND has been sent */
static list_head_t rsm_suspend_list;
/* list of descriptors for remote importers */
static importers_table_t importer_list;
kmutex_t rsm_suspend_cvlock;
kcondvar_t rsm_suspend_cv;
static kmutex_t rsm_lock;
adapter_t loopback_adapter;
rsm_controller_attr_t loopback_attr;
int rsmipc_send_controlmsg(path_t *path, int msgtype);
void rsmka_init_loopback();
int rsmka_null_seg_create(
rsm_controller_handle_t,
rsm_memseg_export_handle_t *,
size_t,
uint_t,
rsm_memory_local_t *,
rsm_resource_callback_t,
rsm_resource_callback_arg_t);
int rsmka_null_seg_destroy(
rsm_memseg_export_handle_t);
int rsmka_null_bind(
rsm_memseg_export_handle_t,
off_t,
rsm_memory_local_t *,
rsm_resource_callback_t,
rsm_resource_callback_arg_t);
int rsmka_null_unbind(
rsm_memseg_export_handle_t,
off_t,
size_t);
int rsmka_null_rebind(
rsm_memseg_export_handle_t,
off_t,
rsm_memory_local_t *,
rsm_resource_callback_t,
rsm_resource_callback_arg_t);
int rsmka_null_publish(
rsm_memseg_export_handle_t,
rsm_access_entry_t [],
uint_t,
rsm_memseg_id_t,
rsm_resource_callback_t,
rsm_resource_callback_arg_t);
int rsmka_null_republish(
rsm_memseg_export_handle_t,
rsm_access_entry_t [],
uint_t,
rsm_resource_callback_t,
rsm_resource_callback_arg_t);
int rsmka_null_unpublish(
rsm_memseg_export_handle_t);
rsm_ops_t null_rsmpi_ops;
/*
* data and locks to keep track of total amount of exported memory
*/
static pgcnt_t rsm_pgcnt;
static pgcnt_t rsm_pgcnt_max; /* max allowed */
static kmutex_t rsm_pgcnt_lock;
static int rsm_enable_dr;
static char loopback_str[] = "loopback";
int rsm_hash_size;
/*
* The locking model is as follows:
*
* Local operations:
 * find resource - grab reader lock on resource list
* insert rc - grab writer lock
* delete rc - grab writer lock and resource mutex
* read/write - no lock
*
* Remote invocations:
* find resource - grab read lock and resource mutex
*
* State:
* resource state - grab resource mutex
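 *
 * For example, the remote-invocation lookup pattern (a sketch; see
 * rsmresource_lookup() below for the real code):
 *
 *	rw_enter(&rsm_resource.rsmrc_lock, RW_READER);
 *	p = <slot for rnum>;
 *	mutex_enter(&p->rsmrc_lock);	(resource mutex, rwlock still held)
 *	rw_exit(&rsm_resource.rsmrc_lock);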
*/
int
_init(void)
{
int e;
e = mod_install(&modlinkage);
if (e != 0) {
return (e);
}
mutex_init(&rsm_lock, NULL, MUTEX_DRIVER, NULL);
mutex_init(&rsmka_buf_lock, NULL, MUTEX_DEFAULT, NULL);
rw_init(&rsm_resource.rsmrc_lock, NULL, RW_DRIVER, NULL);
rsm_hash_size = RSM_HASHSZ;
rw_init(&rsm_export_segs.rsmhash_rw, NULL, RW_DRIVER, NULL);
rw_init(&rsm_import_segs.rsmhash_rw, NULL, RW_DRIVER, NULL);
mutex_init(&importer_list.lock, NULL, MUTEX_DRIVER, NULL);
mutex_init(&rsm_ipc.lock, NULL, MUTEX_DRIVER, NULL);
cv_init(&rsm_ipc.cv, NULL, CV_DRIVER, 0);
mutex_init(&rsm_suspend_cvlock, NULL, MUTEX_DRIVER, NULL);
cv_init(&rsm_suspend_cv, NULL, CV_DRIVER, 0);
mutex_init(&rsm_drv_data.drv_lock, NULL, MUTEX_DRIVER, NULL);
cv_init(&rsm_drv_data.drv_cv, NULL, CV_DRIVER, 0);
rsm_ipc.count = RSMIPC_SZ;
rsm_ipc.wanted = 0;
rsm_ipc.sequence = 0;
(void) mutex_init(&rsm_pgcnt_lock, NULL, MUTEX_DRIVER, NULL);
for (e = 0; e < RSMIPC_SZ; e++) {
rsmipc_slot_t *slot = &rsm_ipc.slots[e];
RSMIPC_SET(slot, RSMIPC_FREE);
mutex_init(&slot->rsmipc_lock, NULL, MUTEX_DRIVER, NULL);
cv_init(&slot->rsmipc_cv, NULL, CV_DRIVER, 0);
}
/*
* Initialize the suspend message list
*/
rsm_suspend_list.list_head = NULL;
mutex_init(&rsm_suspend_list.list_lock, NULL, MUTEX_DRIVER, NULL);
/*
* It is assumed here that configuration data is available
* during system boot since _init may be called at that time.
*/
rsmka_pathmanager_init();
DBG_PRINTF((RSM_KERNEL_AGENT, RSM_DEBUG_VERBOSE,
"rsm: _init done\n"));
return (DDI_SUCCESS);
}
int
_info(struct modinfo *modinfop)
{
return (mod_info(&modlinkage, modinfop));
}
int
_fini(void)
{
int e;
DBG_PRINTF((RSM_KERNEL_AGENT, RSM_DEBUG_VERBOSE,
"rsm: _fini enter\n"));
/*
* The rsmka_modunloadok flag is simply used to help with
* the PIT testing. Make this flag 0 to disallow modunload.
*/
if (rsmka_modunloadok == 0)
return (EBUSY);
/* rsm_detach will be called as a result of mod_remove */
e = mod_remove(&modlinkage);
if (e) {
DBG_PRINTF((RSM_KERNEL_AGENT, RSM_ERR,
"Unable to fini RSM %x\n", e));
return (e);
}
rsmka_pathmanager_cleanup();
rw_destroy(&rsm_resource.rsmrc_lock);
rw_destroy(&rsm_export_segs.rsmhash_rw);
rw_destroy(&rsm_import_segs.rsmhash_rw);
rw_destroy(&rsm_event_queues.rsmhash_rw);
mutex_destroy(&importer_list.lock);
mutex_destroy(&rsm_ipc.lock);
cv_destroy(&rsm_ipc.cv);
(void) mutex_destroy(&rsm_suspend_list.list_lock);
(void) mutex_destroy(&rsm_pgcnt_lock);
DBG_PRINTF((RSM_KERNEL_AGENT, RSM_DEBUG_VERBOSE, "_fini done\n"));
return (DDI_SUCCESS);
}
/*ARGSUSED1*/
static int
rsm_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
{
minor_t rnum;
int percent;
int ret;
DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_DDI);
DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_attach enter\n"));
switch (cmd) {
case DDI_ATTACH:
break;
case DDI_RESUME:
default:
DBG_PRINTF((category, RSM_ERR,
"rsm:rsm_attach - cmd not supported\n"));
return (DDI_FAILURE);
}
if (rsm_dip != NULL) {
DBG_PRINTF((category, RSM_ERR,
"rsm:rsm_attach - supports only "
"one instance\n"));
return (DDI_FAILURE);
}
rsm_enable_dr = ddi_prop_get_int(DDI_DEV_T_ANY, devi,
DDI_PROP_DONTPASS | DDI_PROP_NOTPROM,
"enable-dynamic-reconfiguration", 1);
mutex_enter(&rsm_drv_data.drv_lock);
rsm_drv_data.drv_state = RSM_DRV_REG_PROCESSING;
mutex_exit(&rsm_drv_data.drv_lock);
if (rsm_enable_dr) {
#ifdef RSM_DRTEST
ret = rsm_kphysm_setup_func_register(&rsm_dr_callback_vec,
(void *)NULL);
#else
ret = kphysm_setup_func_register(&rsm_dr_callback_vec,
(void *)NULL);
#endif
if (ret != 0) {
mutex_exit(&rsm_drv_data.drv_lock);
cmn_err(CE_CONT, "rsm:rsm_attach - Dynamic "
"reconfiguration setup failed\n");
return (DDI_FAILURE);
}
}
mutex_enter(&rsm_drv_data.drv_lock);
ASSERT(rsm_drv_data.drv_state == RSM_DRV_REG_PROCESSING);
rsm_drv_data.drv_state = RSM_DRV_OK;
cv_broadcast(&rsm_drv_data.drv_cv);
mutex_exit(&rsm_drv_data.drv_lock);
/*
* page_list_read_lock();
* xx_setup();
* page_list_read_unlock();
*/
rsm_hash_size = ddi_prop_get_int(DDI_DEV_T_ANY, devi,
DDI_PROP_DONTPASS | DDI_PROP_NOTPROM,
"segment-hashtable-size", RSM_HASHSZ);
if (rsm_hash_size == 0) {
DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
"rsm: segment-hashtable-size in rsm.conf "
"must be greater than 0, defaulting to 128\n"));
rsm_hash_size = RSM_HASHSZ;
}
DBG_PRINTF((category, RSM_DEBUG, "rsm_attach rsm_hash_size: %d\n",
rsm_hash_size));
rsm_pgcnt = 0;
percent = ddi_prop_get_int(DDI_DEV_T_ANY, devi,
DDI_PROP_DONTPASS | DDI_PROP_NOTPROM,
"max-exported-memory", 0);
if (percent < 0) {
DBG_PRINTF((category, RSM_ERR,
"rsm:rsm_attach not enough memory available to "
"export, or max-exported-memory set incorrectly.\n"));
return (DDI_FAILURE);
}
/* 0 indicates no fixed upper limit. maxmem is the max */
/* available pageable physical mem */
rsm_pgcnt_max = (percent*maxmem)/100;
if (rsm_pgcnt_max > 0) {
DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
"rsm: Available physical memory = %lu pages, "
"Max exportable memory = %lu pages",
maxmem, rsm_pgcnt_max));
}
/*
* Create minor number
*/
if (rsmresource_alloc(&rnum) != RSM_SUCCESS) {
DBG_PRINTF((category, RSM_ERR,
"rsm: rsm_attach - Unable to get "
"minor number\n"));
return (DDI_FAILURE);
}
ASSERT(rnum == RSM_DRIVER_MINOR);
if (ddi_create_minor_node(devi, DRIVER_NAME, S_IFCHR,
rnum, DDI_PSEUDO, NULL) == DDI_FAILURE) {
DBG_PRINTF((category, RSM_ERR,
"rsm: rsm_attach - unable to allocate "
"minor #\n"));
return (DDI_FAILURE);
}
rsm_dip = devi;
/*
* Allocate the hashtables
*/
rsmhash_alloc(&rsm_export_segs, rsm_hash_size);
rsmhash_alloc(&rsm_import_segs, rsm_hash_size);
importer_list.bucket = (importing_token_t **)
kmem_zalloc(rsm_hash_size * sizeof (importing_token_t *), KM_SLEEP);
/*
* Allocate a resource struct
*/
{
rsmresource_t *p;
p = (rsmresource_t *)kmem_zalloc(sizeof (*p), KM_SLEEP);
mutex_init(&p->rsmrc_lock, NULL, MUTEX_DRIVER, (void *) NULL);
rsmresource_insert(rnum, p, RSM_RESOURCE_BAR);
}
/*
* Based on the rsm.conf property max-segments, determine the maximum
* number of segments that can be exported/imported. This is then used
* to determine the size for barrier failure pages.
*/
/* First get the max number of segments from the rsm.conf file */
max_segs = ddi_prop_get_int(DDI_DEV_T_ANY, devi,
DDI_PROP_DONTPASS | DDI_PROP_NOTPROM,
"max-segments", 0);
if (max_segs == 0) {
/* Use default number of segments */
max_segs = RSM_MAX_NUM_SEG;
}
/*
 * Based on the max number of segments allowed, determine the barrier
 * page size. Add 1 to max_segs since the barrier page itself uses
 * a slot.
*/
barrier_size = roundup((max_segs + 1) * sizeof (rsm_gnum_t),
PAGESIZE);
/*
* allocation of the barrier failure page
*/
bar_va = (rsm_gnum_t *)ddi_umem_alloc(barrier_size,
DDI_UMEM_SLEEP, &bar_cookie);
/*
* Set the barrier_offset
*/
barrier_offset = 0;
/*
 * Allocate trash memory and get a cookie for it. This will be used
 * when remapping segments during force disconnects. Allocate the
 * trash memory with a large, page-aligned size.
*/
(void) ddi_umem_alloc((size_t)TRASHSIZE,
DDI_UMEM_TRASH, &remap_cookie);
/* initialize user segment id allocation variable */
rsm_nextavail_segmentid = (rsm_memseg_id_t)RSM_USER_APP_ID_BASE;
/*
* initialize the null_rsmpi_ops vector and the loopback adapter
*/
rsmka_init_loopback();
ddi_report_dev(devi);
DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_attach done\n"));
return (DDI_SUCCESS);
}
/*
 * The call to mod_remove in the _fini routine will cause the system
 * to call rsm_detach.
*/
/*ARGSUSED*/
static int
rsm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_DDI);
DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_detach enter\n"));
switch (cmd) {
case DDI_DETACH:
break;
default:
DBG_PRINTF((category, RSM_ERR,
"rsm:rsm_detach - cmd %x not supported\n",
cmd));
return (DDI_FAILURE);
}
mutex_enter(&rsm_drv_data.drv_lock);
while (rsm_drv_data.drv_state != RSM_DRV_OK)
cv_wait(&rsm_drv_data.drv_cv, &rsm_drv_data.drv_lock);
rsm_drv_data.drv_state = RSM_DRV_UNREG_PROCESSING;
mutex_exit(&rsm_drv_data.drv_lock);
/*
* Unregister the DR callback functions
*/
if (rsm_enable_dr) {
#ifdef RSM_DRTEST
rsm_kphysm_setup_func_unregister(&rsm_dr_callback_vec,
(void *)NULL);
#else
kphysm_setup_func_unregister(&rsm_dr_callback_vec,
(void *)NULL);
#endif
}
mutex_enter(&rsm_drv_data.drv_lock);
ASSERT(rsm_drv_data.drv_state == RSM_DRV_UNREG_PROCESSING);
rsm_drv_data.drv_state = RSM_DRV_NEW;
mutex_exit(&rsm_drv_data.drv_lock);
ASSERT(rsm_suspend_list.list_head == NULL);
/*
* Release all resources, seglist, controller, ...
*/
/* remove intersend queues */
/* remove registered services */
ddi_remove_minor_node(dip, DRIVER_NAME);
rsm_dip = NULL;
/*
* Free minor zero resource
*/
{
rsmresource_t *p;
p = rsmresource_free(RSM_DRIVER_MINOR);
if (p) {
mutex_destroy(&p->rsmrc_lock);
kmem_free((void *)p, sizeof (*p));
}
}
/*
* Free resource table
*/
rsmresource_destroy();
/*
* Free the hash tables
*/
rsmhash_free(&rsm_export_segs, rsm_hash_size);
rsmhash_free(&rsm_import_segs, rsm_hash_size);
kmem_free((void *)importer_list.bucket,
rsm_hash_size * sizeof (importing_token_t *));
importer_list.bucket = NULL;
/* free barrier page */
if (bar_cookie != NULL) {
ddi_umem_free(bar_cookie);
}
bar_va = NULL;
bar_cookie = NULL;
/*
* Free the memory allocated for the trash
*/
if (remap_cookie != NULL) {
ddi_umem_free(remap_cookie);
}
remap_cookie = NULL;
DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_detach done\n"));
return (DDI_SUCCESS);
}
/*ARGSUSED*/
static int
rsm_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
register int error;
DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_DDI);
DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_info enter\n"));
switch (infocmd) {
case DDI_INFO_DEVT2DEVINFO:
if (rsm_dip == NULL)
error = DDI_FAILURE;
else {
*result = (void *)rsm_dip;
error = DDI_SUCCESS;
}
break;
case DDI_INFO_DEVT2INSTANCE:
*result = (void *)0;
error = DDI_SUCCESS;
break;
default:
error = DDI_FAILURE;
}
DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_info done\n"));
return (error);
}
adapter_t *
rsm_getadapter(rsm_ioctlmsg_t *msg, int mode)
{
adapter_t *adapter;
char adapter_devname[MAXNAMELEN];
int instance;
DBG_DEFINE(category,
RSM_KERNEL_AGENT | RSM_IMPORT | RSM_EXPORT | RSM_IOCTL);
DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_getadapter enter\n"));
instance = msg->cnum;
if ((msg->cname_len <= 0) || (msg->cname_len > MAXNAMELEN)) {
return (NULL);
}
if (ddi_copyin(msg->cname, adapter_devname, msg->cname_len, mode))
return (NULL);
if (strcmp(adapter_devname, "loopback") == 0)
return (&loopback_adapter);
adapter = rsmka_lookup_adapter(adapter_devname, instance);
DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_getadapter done\n"));
return (adapter);
}
/*
* *********************** Resource Number Management ********************
* All resources are stored in a simple hash table. The table is an array
* of pointers to resource blks. Each blk contains:
* base - base number of this blk
* used - number of used slots in this blk.
* blks - array of pointers to resource items.
* An entry in a resource blk is empty if it's NULL.
*
 * We start with no resource array. Each time we run out of slots, we
 * reallocate a larger array, copy the existing pointers into it, and
 * allocate a new resource blk which is added to the hash table.
*
* The resource control block contains:
 * root - array of pointers to resource blks
* sz - current size of array.
* len - last valid entry in array.
*
* A search operation based on a resource number is as follows:
* index = rnum / RESOURCE_BLKSZ;
* ASSERT(index < resource_block.len);
* ASSERT(index < resource_block.sz);
* offset = rnum % RESOURCE_BLKSZ;
* ASSERT(offset >= resource_block.root[index]->base);
* ASSERT(offset < resource_block.root[index]->base + RESOURCE_BLKSZ);
* return resource_block.root[index]->blks[offset];
*
 * A resource blk is freed when its used count reaches zero.
*/
static int
rsmresource_alloc(minor_t *rnum)
{
/* search for available resource slot */
int i, j, empty = -1;
rsmresource_blk_t *blk;
DBG_PRINTF((RSM_KERNEL_ALL, RSM_DEBUG_VERBOSE,
"rsmresource_alloc enter\n"));
rw_enter(&rsm_resource.rsmrc_lock, RW_WRITER);
/* Try to find an empty slot */
for (i = 0; i < rsm_resource.rsmrc_len; i++) {
blk = rsm_resource.rsmrc_root[i];
if (blk != NULL && blk->rsmrcblk_avail > 0) {
/* found an empty slot in this blk */
for (j = 0; j < RSMRC_BLKSZ; j++) {
if (blk->rsmrcblk_blks[j] == NULL) {
*rnum = (minor_t)
(j + (i * RSMRC_BLKSZ));
/*
* obey gen page limits
*/
if (*rnum >= max_segs + 1) {
if (empty < 0) {
rw_exit(&rsm_resource.
rsmrc_lock);
DBG_PRINTF((
RSM_KERNEL_ALL,
RSM_ERR,
"rsmresource"
"_alloc failed:"
"not enough res"
"%d\n", *rnum));
return (RSMERR_INSUFFICIENT_RESOURCES);
} else {
/* use empty slot */
break;
}
}
blk->rsmrcblk_blks[j] = RSMRC_RESERVED;
blk->rsmrcblk_avail--;
rw_exit(&rsm_resource.rsmrc_lock);
DBG_PRINTF((RSM_KERNEL_ALL,
RSM_DEBUG_VERBOSE,
"rsmresource_alloc done\n"));
return (RSM_SUCCESS);
}
}
} else if (blk == NULL && empty < 0) {
/* remember first empty slot */
empty = i;
}
}
/* Couldn't find anything, allocate a new blk */
/*
* Do we need to reallocate the root array
*/
if (empty < 0) {
if (rsm_resource.rsmrc_len == rsm_resource.rsmrc_sz) {
/*
* Allocate new array and copy current stuff into it
*/
rsmresource_blk_t **p;
uint_t newsz = (uint_t)rsm_resource.rsmrc_sz +
RSMRC_BLKSZ;
/*
 * Don't allocate more than the max valid rnum
*/
if (rsm_resource.rsmrc_len*RSMRC_BLKSZ >=
max_segs + 1) {
rw_exit(&rsm_resource.rsmrc_lock);
return (RSMERR_INSUFFICIENT_RESOURCES);
}
p = (rsmresource_blk_t **)kmem_zalloc(
newsz * sizeof (*p),
KM_SLEEP);
if (rsm_resource.rsmrc_root) {
uint_t oldsz;
oldsz = (uint_t)(rsm_resource.rsmrc_sz *
(int)sizeof (*p));
/*
* Copy old data into new space and
* free old stuff
*/
bcopy(rsm_resource.rsmrc_root, p, oldsz);
kmem_free(rsm_resource.rsmrc_root, oldsz);
}
rsm_resource.rsmrc_root = p;
rsm_resource.rsmrc_sz = (int)newsz;
}
empty = rsm_resource.rsmrc_len;
rsm_resource.rsmrc_len++;
}
/*
* Allocate a new blk
*/
blk = (rsmresource_blk_t *)kmem_zalloc(sizeof (*blk), KM_SLEEP);
ASSERT(rsm_resource.rsmrc_root[empty] == NULL);
rsm_resource.rsmrc_root[empty] = blk;
blk->rsmrcblk_avail = RSMRC_BLKSZ - 1;
/*
* Allocate slot
*/
*rnum = (minor_t)(empty * RSMRC_BLKSZ);
/*
* watch out not to exceed bounds of barrier page
*/
if (*rnum >= max_segs + 1) {
rw_exit(&rsm_resource.rsmrc_lock);
DBG_PRINTF((RSM_KERNEL_ALL, RSM_ERR,
"rsmresource_alloc failed %d\n", *rnum));
return (RSMERR_INSUFFICIENT_RESOURCES);
}
blk->rsmrcblk_blks[0] = RSMRC_RESERVED;
rw_exit(&rsm_resource.rsmrc_lock);
DBG_PRINTF((RSM_KERNEL_ALL, RSM_DEBUG_VERBOSE,
"rsmresource_alloc done\n"));
return (RSM_SUCCESS);
}
static rsmresource_t *
rsmresource_free(minor_t rnum)
{
 /* locate the slot for this resource number */
int i, j;
rsmresource_blk_t *blk;
rsmresource_t *p;
DBG_PRINTF((RSM_KERNEL_ALL, RSM_DEBUG_VERBOSE,
"rsmresource_free enter\n"));
i = (int)(rnum / RSMRC_BLKSZ);
j = (int)(rnum % RSMRC_BLKSZ);
if (i >= rsm_resource.rsmrc_len) {
DBG_PRINTF((RSM_KERNEL_ALL, RSM_DEBUG_VERBOSE,
"rsmresource_free done\n"));
return (NULL);
}
rw_enter(&rsm_resource.rsmrc_lock, RW_WRITER);
ASSERT(rsm_resource.rsmrc_root);
ASSERT(i < rsm_resource.rsmrc_len);
ASSERT(i < rsm_resource.rsmrc_sz);
blk = rsm_resource.rsmrc_root[i];
if (blk == NULL) {
rw_exit(&rsm_resource.rsmrc_lock);
DBG_PRINTF((RSM_KERNEL_ALL, RSM_DEBUG_VERBOSE,
"rsmresource_free done\n"));
return (NULL);
}
ASSERT(blk->rsmrcblk_blks[j]); /* reserved or full */
p = blk->rsmrcblk_blks[j];
if (p == RSMRC_RESERVED) {
p = NULL;
}
blk->rsmrcblk_blks[j] = NULL;
blk->rsmrcblk_avail++;
if (blk->rsmrcblk_avail == RSMRC_BLKSZ) {
/* free this blk */
kmem_free(blk, sizeof (*blk));
rsm_resource.rsmrc_root[i] = NULL;
}
rw_exit(&rsm_resource.rsmrc_lock);
DBG_PRINTF((RSM_KERNEL_ALL, RSM_DEBUG_VERBOSE,
"rsmresource_free done\n"));
return (p);
}
static rsmresource_t *
rsmresource_lookup(minor_t rnum, int lock)
{
int i, j;
rsmresource_blk_t *blk;
rsmresource_t *p;
DBG_PRINTF((RSM_KERNEL_ALL, RSM_DEBUG_VERBOSE,
"rsmresource_lookup enter\n"));
 /* Find resource and lock it in READER mode */
 /* compute the blk index and offset for this resource number */
i = (int)(rnum / RSMRC_BLKSZ);
j = (int)(rnum % RSMRC_BLKSZ);
if (i >= rsm_resource.rsmrc_len) {
DBG_PRINTF((RSM_KERNEL_ALL, RSM_DEBUG_VERBOSE,
"rsmresource_lookup done\n"));
return (NULL);
}
rw_enter(&rsm_resource.rsmrc_lock, RW_READER);
blk = rsm_resource.rsmrc_root[i];
if (blk != NULL) {
ASSERT(i < rsm_resource.rsmrc_len);
ASSERT(i < rsm_resource.rsmrc_sz);
p = blk->rsmrcblk_blks[j];
if (lock == RSM_LOCK) {
if (p != RSMRC_RESERVED) {
mutex_enter(&p->rsmrc_lock);
} else {
p = NULL;
}
}
} else {
p = NULL;
}
rw_exit(&rsm_resource.rsmrc_lock);
DBG_PRINTF((RSM_KERNEL_ALL, RSM_DEBUG_VERBOSE,
"rsmresource_lookup done\n"));
return (p);
}
static void
rsmresource_insert(minor_t rnum, rsmresource_t *p, rsm_resource_type_t type)
{
 /* Insert the resource into the slot reserved by rsmresource_alloc */
int i, j;
rsmresource_blk_t *blk;
DBG_PRINTF((RSM_KERNEL_ALL, RSM_DEBUG_VERBOSE,
"rsmresource_insert enter\n"));
i = (int)(rnum / RSMRC_BLKSZ);
j = (int)(rnum % RSMRC_BLKSZ);
p->rsmrc_type = type;
p->rsmrc_num = rnum;
rw_enter(&rsm_resource.rsmrc_lock, RW_READER);
ASSERT(rsm_resource.rsmrc_root);
ASSERT(i < rsm_resource.rsmrc_len);
ASSERT(i < rsm_resource.rsmrc_sz);
blk = rsm_resource.rsmrc_root[i];
ASSERT(blk);
ASSERT(blk->rsmrcblk_blks[j] == RSMRC_RESERVED);
blk->rsmrcblk_blks[j] = p;
DBG_PRINTF((RSM_KERNEL_ALL, RSM_DEBUG_VERBOSE,
"rsmresource_insert done\n"));
rw_exit(&rsm_resource.rsmrc_lock);
}
static void
rsmresource_destroy()
{
int i, j;
DBG_PRINTF((RSM_KERNEL_ALL, RSM_DEBUG_VERBOSE,
"rsmresource_destroy enter\n"));
rw_enter(&rsm_resource.rsmrc_lock, RW_WRITER);
for (i = 0; i < rsm_resource.rsmrc_len; i++) {
rsmresource_blk_t *blk;
blk = rsm_resource.rsmrc_root[i];
if (blk == NULL) {
continue;
}
for (j = 0; j < RSMRC_BLKSZ; j++) {
if (blk->rsmrcblk_blks[j] != NULL) {
DBG_PRINTF((RSM_KERNEL_ALL, RSM_DEBUG_VERBOSE,
"Not null slot %d, %lx\n", j,
(size_t)blk->rsmrcblk_blks[j]));
}
}
kmem_free(blk, sizeof (*blk));
rsm_resource.rsmrc_root[i] = NULL;
}
if (rsm_resource.rsmrc_root) {
i = rsm_resource.rsmrc_sz * (int)sizeof (rsmresource_blk_t *);
kmem_free(rsm_resource.rsmrc_root, (uint_t)i);
rsm_resource.rsmrc_root = NULL;
rsm_resource.rsmrc_len = 0;
rsm_resource.rsmrc_sz = 0;
}
DBG_PRINTF((RSM_KERNEL_ALL, RSM_DEBUG_VERBOSE,
"rsmresource_destroy done\n"));
rw_exit(&rsm_resource.rsmrc_lock);
}
/* ******************** Generic Key Hash Table Management ********* */
static rsmresource_t *
rsmhash_lookup(rsmhash_table_t *rhash, rsm_memseg_id_t key,
rsm_resource_state_t state)
{
rsmresource_t *p;
uint_t hashval;
DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL);
DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmhash_lookup enter\n"));
hashval = rsmhash(key);
DBG_PRINTF((category, RSM_DEBUG_LVL2, "rsmhash_lookup %u=%d\n",
key, hashval));
rw_enter(&rhash->rsmhash_rw, RW_READER);
p = (rsmresource_t *)rsmhash_getbkt(rhash, hashval);
for (; p; p = p->rsmrc_next) {
if (p->rsmrc_key == key) {
/* acquire resource lock */
RSMRC_LOCK(p);
break;
}
}
rw_exit(&rhash->rsmhash_rw);
if (p != NULL && p->rsmrc_state != state) {
/* state changed, release lock and return null */
RSMRC_UNLOCK(p);
DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
"rsmhash_lookup done: state changed\n"));
return (NULL);
}
DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmhash_lookup done\n"));
return (p);
}
static void
rsmhash_rm(rsmhash_table_t *rhash, rsmresource_t *rcelm)
{
rsmresource_t *p, **back;
uint_t hashval;
DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL);
DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmhash_rm enter\n"));
hashval = rsmhash(rcelm->rsmrc_key);
DBG_PRINTF((category, RSM_DEBUG_LVL2, "rsmhash_rm %u=%d\n",
rcelm->rsmrc_key, hashval));
/*
* It's ok not to find the segment.
*/
rw_enter(&rhash->rsmhash_rw, RW_WRITER);
back = (rsmresource_t **)rsmhash_bktaddr(rhash, hashval);
for (; (p = *back) != NULL; back = &p->rsmrc_next) {
if (p == rcelm) {
*back = rcelm->rsmrc_next;
break;
}
}
DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmhash_rm done\n"));
rw_exit(&rhash->rsmhash_rw);
}
static int
rsmhash_add(rsmhash_table_t *rhash, rsmresource_t *new, rsm_memseg_id_t key,
int dup_check, rsm_resource_state_t state)
{
rsmresource_t *p = NULL, **bktp;
uint_t hashval;
DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL);
DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmhash_add enter\n"));
/* lock table */
rw_enter(&rhash->rsmhash_rw, RW_WRITER);
/*
* If the current resource state is other than the state passed in
* then the resource is (probably) already on the list. eg. for an
* import segment if the state is not RSM_STATE_NEW then it's on the
* list already.
*/
RSMRC_LOCK(new);
if (new->rsmrc_state != state) {
RSMRC_UNLOCK(new);
rw_exit(&rhash->rsmhash_rw);
return (RSMERR_BAD_SEG_HNDL);
}
hashval = rsmhash(key);
DBG_PRINTF((category, RSM_DEBUG_LVL2, "rsmhash_add %d\n", hashval));
if (dup_check) {
/*
* Used for checking export segments; don't want to have
* the same key used for multiple segments.
*/
p = (rsmresource_t *)rsmhash_getbkt(rhash, hashval);
for (; p; p = p->rsmrc_next) {
if (p->rsmrc_key == key) {
RSMRC_UNLOCK(new);
break;
}
}
}
if (p == NULL) {
/* Key doesn't exist, add it */
bktp = (rsmresource_t **)rsmhash_bktaddr(rhash, hashval);
new->rsmrc_key = key;
new->rsmrc_next = *bktp;
*bktp = new;
}
rw_exit(&rhash->rsmhash_rw);
DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmhash_add done\n"));
return (p == NULL ? RSM_SUCCESS : RSMERR_SEGID_IN_USE);
}
/*
* XOR each byte of the key.
*/
static uint_t
rsmhash(rsm_memseg_id_t key)
{
uint_t hash = key;
hash ^= (key >> 8);
hash ^= (key >> 16);
hash ^= (key >> 24);
return (hash % rsm_hash_size);
}
/*
* generic function to get a specific bucket
*/
static void *
rsmhash_getbkt(rsmhash_table_t *rhash, uint_t hashval)
{
if (rhash->bucket == NULL)
return (NULL);
else
return ((void *)rhash->bucket[hashval]);
}
/*
* generic function to get a specific bucket's address
*/
static void **
rsmhash_bktaddr(rsmhash_table_t *rhash, uint_t hashval)
{
if (rhash->bucket == NULL)
return (NULL);
else
return ((void **)&(rhash->bucket[hashval]));
}
/*
* generic function to alloc a hash table
*/
static void
rsmhash_alloc(rsmhash_table_t *rhash, int size)
{
rhash->bucket = (rsmresource_t **)
kmem_zalloc(size * sizeof (rsmresource_t *), KM_SLEEP);
}
/*
* generic function to free a hash table
*/
static void
rsmhash_free(rsmhash_table_t *rhash, int size)
{
kmem_free((void *)rhash->bucket, size * sizeof (caddr_t));
rhash->bucket = NULL;
}
/* *********************** Exported Segment Key Management ************ */
#define rsmexport_add(new, key) \
rsmhash_add(&rsm_export_segs, (rsmresource_t *)new, key, 1, \
RSM_STATE_BIND)
#define rsmexport_rm(arg) \
rsmhash_rm(&rsm_export_segs, (rsmresource_t *)(arg))
#define rsmexport_lookup(key) \
(rsmseg_t *)rsmhash_lookup(&rsm_export_segs, key, RSM_STATE_EXPORT)
/* ************************** Import Segment List Management ********** */
/*
* Add segment to import list. This will be useful for paging and loopback
* segment unloading.
*/
#define rsmimport_add(arg, key) \
rsmhash_add(&rsm_import_segs, (rsmresource_t *)(arg), (key), 0, \
RSM_STATE_NEW)
#define rsmimport_rm(arg) \
rsmhash_rm(&rsm_import_segs, (rsmresource_t *)(arg))
/*
* #define rsmimport_lookup(key) \
* (rsmseg_t *)rsmhash_lookup(&rsm_import_segs, (key), RSM_STATE_CONNECT)
*/
/*
 * Increase the ref count and make the import segment point to the
 * shared data structure. Return a pointer to the shared data struct;
 * it is locked upon return.
*/
static rsm_import_share_t *
rsmshare_get(rsm_memseg_id_t key, rsm_node_id_t node, adapter_t *adapter,
rsmseg_t *segp)
{
uint_t hash;
rsmresource_t *p;
rsm_import_share_t *shdatap;
DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL);
DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmshare_get enter\n"));
hash = rsmhash(key);
/* lock table */
rw_enter(&rsm_import_segs.rsmhash_rw, RW_WRITER);
DBG_PRINTF((category, RSM_DEBUG_LVL2, "rsmshare_get:key=%u, hash=%d\n",
key, hash));
p = (rsmresource_t *)rsmhash_getbkt(&rsm_import_segs, hash);
for (; p; p = p->rsmrc_next) {
/*
* Look for an entry that is importing the same exporter
* with the share data structure allocated.
*/
if ((p->rsmrc_key == key) &&
(p->rsmrc_node == node) &&
(p->rsmrc_adapter == adapter) &&
(((rsmseg_t *)p)->s_share != NULL)) {
shdatap = ((rsmseg_t *)p)->s_share;
break;
}
}
if (p == NULL) {
/* we are the first importer, create the shared data struct */
shdatap = kmem_zalloc(sizeof (rsm_import_share_t), KM_SLEEP);
shdatap->rsmsi_state = RSMSI_STATE_NEW;
shdatap->rsmsi_segid = key;
shdatap->rsmsi_node = node;
mutex_init(&shdatap->rsmsi_lock, NULL, MUTEX_DRIVER, NULL);
cv_init(&shdatap->rsmsi_cv, NULL, CV_DRIVER, 0);
}
rsmseglock_acquire(segp);
/* we grab the shared lock before returning from this function */
mutex_enter(&shdatap->rsmsi_lock);
shdatap->rsmsi_refcnt++;
segp->s_share = shdatap;
rsmseglock_release(segp);
rw_exit(&rsm_import_segs.rsmhash_rw);
DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmshare_get done\n"));
return (shdatap);
}
/*
* the shared data structure should be locked before calling
* rsmsharecv_signal().
* Change the state and signal any waiting segments.
*/
void
rsmsharecv_signal(rsmseg_t *seg, int oldstate, int newstate)
{
ASSERT(rsmsharelock_held(seg));
if (seg->s_share->rsmsi_state == oldstate) {
seg->s_share->rsmsi_state = newstate;
cv_broadcast(&seg->s_share->rsmsi_cv);
}
}
/*
* Add to the hash table
*/
static void
importer_list_add(rsm_node_id_t node, rsm_memseg_id_t key, rsm_addr_t hwaddr,
void *cookie)
{
importing_token_t *head;
importing_token_t *new_token;
int index;
DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL);
DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "importer_list_add enter\n"));
new_token = kmem_zalloc(sizeof (importing_token_t), KM_SLEEP);
new_token->importing_node = node;
new_token->key = key;
new_token->import_segment_cookie = cookie;
new_token->importing_adapter_hwaddr = hwaddr;
index = rsmhash(key);
mutex_enter(&importer_list.lock);
head = importer_list.bucket[index];
importer_list.bucket[index] = new_token;
new_token->next = head;
mutex_exit(&importer_list.lock);
DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "importer_list_add done\n"));
}
static void
importer_list_rm(rsm_node_id_t node, rsm_memseg_id_t key, void *cookie)
{
importing_token_t *prev, *token = NULL;
int index;
DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL);
DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "importer_list_rm enter\n"));
index = rsmhash(key);
mutex_enter(&importer_list.lock);
token = importer_list.bucket[index];
prev = token;
while (token != NULL) {
if (token->importing_node == node &&
token->import_segment_cookie == cookie) {
if (prev == token)
importer_list.bucket[index] = token->next;
else
prev->next = token->next;
kmem_free((void *)token, sizeof (*token));
break;
} else {
prev = token;
token = token->next;
}
}
mutex_exit(&importer_list.lock);
DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "importer_list_rm done\n"));
}
/* **************************Segment Structure Management ************* */
/*
* Free segment structure
*/
static void
rsmseg_free(rsmseg_t *seg)
{
DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL);
DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmseg_free enter\n"));
/* need to take seglock here to avoid race with rsmmap_unmap() */
rsmseglock_acquire(seg);
if (seg->s_ckl != NULL) {
/* Segment is still busy */
seg->s_state = RSM_STATE_END;
rsmseglock_release(seg);
DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
"rsmseg_free done\n"));
return;
}
rsmseglock_release(seg);
ASSERT(seg->s_state == RSM_STATE_END || seg->s_state == RSM_STATE_NEW);
/*
 * If it's an importer, decrement the refcount,
 * and if it's down to zero, free the shared data structure.
 * This is also where failed rsm_connect() attempts drop their reference.
*/
if (seg->s_share != NULL) {
ASSERT(seg->s_type == RSM_RESOURCE_IMPORT_SEGMENT);
rsmsharelock_acquire(seg);
ASSERT(seg->s_share->rsmsi_refcnt > 0);
seg->s_share->rsmsi_refcnt--;
if (seg->s_share->rsmsi_refcnt == 0) {
rsmsharelock_release(seg);
mutex_destroy(&seg->s_share->rsmsi_lock);
cv_destroy(&seg->s_share->rsmsi_cv);
kmem_free((void *)(seg->s_share),
sizeof (rsm_import_share_t));
} else {
rsmsharelock_release(seg);
}
/*
* The following needs to be done after any
* rsmsharelock calls which use seg->s_share.
*/
seg->s_share = NULL;
}
cv_destroy(&seg->s_cv);
mutex_destroy(&seg->s_lock);
rsmacl_free(seg->s_acl, seg->s_acl_len);
rsmpiacl_free(seg->s_acl_in, seg->s_acl_len);
if (seg->s_adapter)
rsmka_release_adapter(seg->s_adapter);
kmem_free((void *)seg, sizeof (*seg));
DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmseg_free done\n"));
}
static rsmseg_t *
rsmseg_alloc(minor_t num, struct cred *cred)
{
rsmseg_t *new;
DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL);
DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmseg_alloc enter\n"));
/*
* allocate memory for new segment. This should be a segkmem cache.
*/
new = (rsmseg_t *)kmem_zalloc(sizeof (*new), KM_SLEEP);
new->s_state = RSM_STATE_NEW;
new->s_minor = num;
new->s_acl_len = 0;
new->s_cookie = NULL;
new->s_adapter = NULL;
new->s_mode = 0777 & ~PTOU((ttoproc(curthread)))->u_cmask;
/* we don't have a key yet, will set at export/connect */
new->s_uid = crgetuid(cred);
new->s_gid = crgetgid(cred);
mutex_init(&new->s_lock, NULL, MUTEX_DRIVER, (void *)NULL);
cv_init(&new->s_cv, NULL, CV_DRIVER, 0);
DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmseg_alloc done\n"));
return (new);
}
/* ******************************** Driver Open/Close/Poll *************** */
/*ARGSUSED1*/
static int
rsm_open(dev_t *devp, int flag, int otyp, struct cred *cred)
{
minor_t rnum;
DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL| RSM_DDI);
DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_open enter\n"));
/*
* Char only
*/
if (otyp != OTYP_CHR) {
DBG_PRINTF((category, RSM_ERR, "rsm_open: bad otyp\n"));
return (EINVAL);
}
/*
 * Only minor device zero can be opened; clones are used for resources.
*/
if (getminor(*devp) != RSM_DRIVER_MINOR) {
DBG_PRINTF((category, RSM_ERR,
"rsm_open: bad minor %d\n", getminor(*devp)));
return (ENODEV);
}
if ((flag & FEXCL) != 0 && secpolicy_excl_open(cred) != 0) {
DBG_PRINTF((category, RSM_ERR, "rsm_open: bad perm\n"));
return (EPERM);
}
if (!(flag & FWRITE)) {
/*
* The library function _rsm_librsm_init calls open for
* /dev/rsm with flag set to O_RDONLY. We want a valid
* file descriptor to be returned for minor device zero.
*/
DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
"rsm_open RDONLY done\n"));
return (DDI_SUCCESS);
}
/*
* - allocate new minor number and segment.
* - add segment to list of all segments.
* - set minordev data to segment
* - update devp argument to new device
* - update s_cred to cred; make sure you do crhold(cred);
*/
/* allocate a new resource number */
if (rsmresource_alloc(&rnum) == RSM_SUCCESS) {
/*
* We will bind this minor to a specific resource in first
* ioctl
*/
*devp = makedevice(getmajor(*devp), rnum);
} else {
return (EAGAIN);
}
DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_open done\n"));
return (DDI_SUCCESS);
}
static void
rsmseg_close(rsmseg_t *seg, int force_flag)
{
int e = RSM_SUCCESS;
DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL| RSM_DDI);
DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmseg_close enter\n"));
rsmseglock_acquire(seg);
if (!force_flag && (seg->s_hdr.rsmrc_type ==
RSM_RESOURCE_EXPORT_SEGMENT)) {
/*
 * If we are processing rsm_close, wait for force_destroy
 * processing to complete since force_destroy processing
 * needs to finish before we can free the segment.
 * force_destroy is only for export segments.
*/
while (seg->s_flags & RSM_FORCE_DESTROY_WAIT) {
cv_wait(&seg->s_cv, &seg->s_lock);
}
}
rsmseglock_release(seg);
/* It's ok to read the state without a lock */
switch (seg->s_state) {
case RSM_STATE_EXPORT:
case RSM_STATE_EXPORT_QUIESCING:
case RSM_STATE_EXPORT_QUIESCED:
e = rsm_unpublish(seg, 1);
/* FALLTHRU */
case RSM_STATE_BIND_QUIESCED:
/* FALLTHRU */
case RSM_STATE_BIND:
e = rsm_unbind(seg);
if (e != RSM_SUCCESS && force_flag == 1)
return;
ASSERT(seg->s_hdr.rsmrc_type == RSM_RESOURCE_EXPORT_SEGMENT);
/* FALLTHRU */
case RSM_STATE_NEW_QUIESCED:
rsmseglock_acquire(seg);
seg->s_state = RSM_STATE_NEW;
cv_broadcast(&seg->s_cv);
rsmseglock_release(seg);
break;
case RSM_STATE_NEW:
break;
case RSM_STATE_ZOMBIE:
/*
* Segments in this state have been removed off the
* exported segments list and have been unpublished
* and unbind. These segments have been removed during
* a callback to the rsm_export_force_destroy, which
* is called for the purpose of unlocking these
* exported memory segments when a process exits but
 * leaves the segments locked down, since rsm_close
 * is not called for the segments. This can happen
* when a process calls fork or exec and then exits.
* Once the segments are in the ZOMBIE state, all that
* remains is to destroy them when rsm_close is called.
* This is done here. Thus, for such segments the
* the state is changed to new so that later in this
* function rsmseg_free is called.
*/
rsmseglock_acquire(seg);
seg->s_state = RSM_STATE_NEW;
rsmseglock_release(seg);
break;
case RSM_STATE_MAP_QUIESCE:
case RSM_STATE_ACTIVE:
/* Disconnect will handle the unmap */
case RSM_STATE_CONN_QUIESCE:
case RSM_STATE_CONNECT:
case RSM_STATE_DISCONNECT:
ASSERT(seg->s_hdr.rsmrc_type == RSM_RESOURCE_IMPORT_SEGMENT);
(void) rsm_disconnect(seg);
break;
case RSM_STATE_MAPPING:
/*FALLTHRU*/
case RSM_STATE_END:
DBG_PRINTF((category, RSM_ERR,
"Invalid segment state %d in rsm_close\n", seg->s_state));
break;
default:
DBG_PRINTF((category, RSM_ERR,
"Invalid segment state %d in rsm_close\n", seg->s_state));
break;
}
/*
* check state.
* - make sure you do crfree(s_cred);
* release segment and minor number
*/
ASSERT(seg->s_state == RSM_STATE_NEW);
/*
 * The export_force_destroy callback, which unlocks the exported
 * segments of a process that does a fork or exec and then exits,
 * calls this function with the force flag set to 1, indicating that
 * the segment state must be converted to ZOMBIE. This state means
 * that the segments still exist and have been unlocked; most
 * importantly, the only operation allowed on them is destruction
 * on a subsequent rsm_close.
*/
if (force_flag) {
rsmseglock_acquire(seg);
seg->s_state = RSM_STATE_ZOMBIE;
rsmseglock_release(seg);
} else {
rsmseg_free(seg);
}
DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmseg_close done\n"));
}
static int
rsm_close(dev_t dev, int flag, int otyp, cred_t *cred)
{
minor_t rnum = getminor(dev);
rsmresource_t *res;
DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_FUNC_ALL| RSM_DDI);
DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_close enter\n"));
flag = flag; cred = cred;
if (otyp != OTYP_CHR)
return (EINVAL);
DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rnum = %d\n", rnum));
/*
* At this point we are the last reference to the resource.
* Free resource number from resource table.
* It's ok to remove number before we free the segment.
* We need to lock the resource to protect against remote calls.
*/
if (rnum == RSM_DRIVER_MINOR ||
(res = rsmresource_free(rnum)) == NULL) {
DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_close done\n"));
return (DDI_SUCCESS);
}
switch (res->rsmrc_type) {
case RSM_RESOURCE_EXPORT_SEGMENT:
case RSM_RESOURCE_IMPORT_SEGMENT:
rsmseg_close((rsmseg_t *)res, 0);
break;
case RSM_RESOURCE_BAR:
DBG_PRINTF((category, RSM_ERR, "bad resource in rsm_close\n"));
break;
default:
break;
}
DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_close done\n"));
return (DDI_SUCCESS);
}
/*
 * rsm_inc_pgcnt
 *
 * Description:	increment the rsm page counter.
 *
 * Parameters:	pgcnt_t	pnum;	number of pages to be used
 *
 * Returns:	RSM_SUCCESS if the memory limit is not exceeded
 *		RSMERR_INSUFFICIENT_MEM if the limit would be exceeded;
 *		in this case the page counter remains unchanged.
 *
 */
static int
rsm_inc_pgcnt(pgcnt_t pnum)
{
DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_EXPORT);
if (rsm_pgcnt_max == 0) { /* no upper limit has been set */
return (RSM_SUCCESS);
}
mutex_enter(&rsm_pgcnt_lock);
if (rsm_pgcnt + pnum > rsm_pgcnt_max) {
/* ensure that limits have not been exceeded */
mutex_exit(&rsm_pgcnt_lock);
return (RSMERR_INSUFFICIENT_MEM);
}
rsm_pgcnt += pnum;
DBG_PRINTF((category, RSM_DEBUG, "rsm_pgcnt incr to %d.\n",
rsm_pgcnt));
mutex_exit(&rsm_pgcnt_lock);
return (RSM_SUCCESS);
}
/*
* rsm_dec_pgcnt
*
* Description: decrement rsm page counter.
*
* Parameters: pgcnt_t pnum; number of pages freed
*
*/
static void
rsm_dec_pgcnt(pgcnt_t pnum)
{
DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_EXPORT);
if (rsm_pgcnt_max == 0) { /* no upper limit has been set */
return;
}
mutex_enter(&rsm_pgcnt_lock);
ASSERT(rsm_pgcnt >= pnum);
rsm_pgcnt -= pnum;
DBG_PRINTF((category, RSM_DEBUG, "rsm_pgcnt decr to %d.\n",
rsm_pgcnt));
mutex_exit(&rsm_pgcnt_lock);
}
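/*
 * A minimal usage sketch of the counter pair above, mirroring what
 * rsm_bind_pages()/rsm_unbind_pages() below do (the caller shown is
 * hypothetical):
 *
 *	pgcnt_t pnum = btopr(len);
 *
 *	if (rsm_inc_pgcnt(pnum) != RSM_SUCCESS)
 *		return (RSMERR_INSUFFICIENT_MEM);	limit would be hit
 *	...lock the pages; later, when unlocking them...
 *	rsm_dec_pgcnt(pnum);
 */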
static struct umem_callback_ops rsm_as_ops = {
UMEM_CALLBACK_VERSION, /* version number */
rsm_export_force_destroy,
};
static int
rsm_bind_pages(ddi_umem_cookie_t *cookie, caddr_t vaddr, size_t len,
proc_t *procp)
{
int error = RSM_SUCCESS;
ulong_t pnum;
struct umem_callback_ops *callbackops = &rsm_as_ops;
DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_EXPORT);
DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_bind_pages enter\n"));
/*
* Make sure vaddr and len are aligned on a page boundary
*/
if ((uintptr_t)vaddr & (PAGESIZE - 1)) {
return (RSMERR_BAD_ADDR);
}
if (len & (PAGESIZE - 1)) {
return (RSMERR_BAD_LENGTH);
}
/*
* Find number of pages
*/
pnum = btopr(len);
error = rsm_inc_pgcnt(pnum);
if (error != RSM_SUCCESS) {
DBG_PRINTF((category, RSM_ERR,
"rsm_bind_pages:mem limit exceeded\n"));
return (RSMERR_INSUFFICIENT_MEM);
}
error = umem_lockmemory(vaddr, len,
DDI_UMEMLOCK_WRITE|DDI_UMEMLOCK_READ|DDI_UMEMLOCK_LONGTERM,
cookie,
callbackops, procp);
if (error) {
rsm_dec_pgcnt(pnum);
DBG_PRINTF((category, RSM_ERR,
"rsm_bind_pages:ddi_umem_lock failed\n"));
		/*
		 * umem_lockmemory, in the case of failure, returns one
		 * of the following three errors. These are translated
		 * into the RSMERR namespace and returned.
		 */
if (error == EFAULT)
return (RSMERR_BAD_ADDR);
else if (error == EACCES)
return (RSMERR_PERM_DENIED);
else
return (RSMERR_INSUFFICIENT_MEM);
}
DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_bind_pages done\n"));
return (error);
}
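/*
 * For illustration, with a hypothetical PAGESIZE of 0x2000 (8K) the
 * alignment tests in rsm_bind_pages() above work out as follows:
 *
 *	vaddr = 0x10000:  0x10000 & 0x1fff == 0     aligned, accepted
 *	vaddr = 0x10010:  0x10010 & 0x1fff == 0x10  RSMERR_BAD_ADDR
 *	len   = 0x4000:   0x4000 & 0x1fff == 0      aligned; btopr() -> 2 pages
 */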
static int
rsm_unbind_pages(rsmseg_t *seg)
{
DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_EXPORT);
DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_unbind_pages enter\n"));
ASSERT(rsmseglock_held(seg));
if (seg->s_cookie != NULL) {
/* unlock address range */
ddi_umem_unlock(seg->s_cookie);
rsm_dec_pgcnt(btopr(seg->s_len));
seg->s_cookie = NULL;
}
DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_unbind_pages done\n"));
return (RSM_SUCCESS);
}
static int
rsm_bind(rsmseg_t *seg, rsm_ioctlmsg_t *msg, intptr_t dataptr, int mode)
{
int e;
adapter_t *adapter;
DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_EXPORT);
DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_bind enter\n"));
adapter = rsm_getadapter(msg, mode);
if (adapter == NULL) {
DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
"rsm_bind done:no adapter\n"));
return (RSMERR_CTLR_NOT_PRESENT);
}
/* lock address range */
if (msg->vaddr == NULL) {
rsmka_release_adapter(adapter);
DBG_PRINTF((category, RSM_ERR,
"rsm: rsm_bind done: invalid vaddr\n"));
return (RSMERR_BAD_ADDR);
}
if (msg->len <= 0) {
rsmka_release_adapter(adapter);
DBG_PRINTF((category, RSM_ERR,
"rsm_bind: invalid length\n"));
return (RSMERR_BAD_LENGTH);
}
/* Lock segment */
rsmseglock_acquire(seg);
while (seg->s_state == RSM_STATE_NEW_QUIESCED) {
if (cv_wait_sig(&seg->s_cv, &seg->s_lock) == 0) {
DBG_PRINTF((category, RSM_DEBUG,
"rsm_bind done: cv_wait INTERRUPTED"));
rsmka_release_adapter(adapter);
rsmseglock_release(seg);
return (RSMERR_INTERRUPTED);
}
}
ASSERT(seg->s_state == RSM_STATE_NEW);
ASSERT(seg->s_cookie == NULL);
e = rsm_bind_pages(&seg->s_cookie, msg->vaddr, msg->len, curproc);
if (e == RSM_SUCCESS) {
seg->s_flags |= RSM_USER_MEMORY;
if (msg->perm & RSM_ALLOW_REBIND) {
seg->s_flags |= RSMKA_ALLOW_UNBIND_REBIND;
}
if (msg->perm & RSM_CREATE_SEG_DONTWAIT) {
seg->s_flags |= RSMKA_SET_RESOURCE_DONTWAIT;
}
seg->s_region.r_vaddr = msg->vaddr;
		/*
		 * Set the s_pid value in the segment structure. This is
		 * used to identify the exported segments belonging to a
		 * particular process so that, when the process exits,
		 * these segments can be unlocked forcefully even if
		 * rsm_close is not called on process exit, since there
		 * may be other processes referencing them (for example
		 * after a fork or exec).
		 * The s_pid value is also used to authenticate the
		 * process doing a publish or unpublish on the export
		 * segment: only the creator of the export segment may
		 * publish, unpublish, or unbind it.
		 */
seg->s_pid = ddi_get_pid();
seg->s_len = msg->len;
seg->s_state = RSM_STATE_BIND;
seg->s_adapter = adapter;
seg->s_proc = curproc;
} else {
rsmka_release_adapter(adapter);
DBG_PRINTF((category, RSM_WARNING,
"unable to lock down pages\n"));
}
msg->rnum = seg->s_minor;
/* Unlock segment */
rsmseglock_release(seg);
if (e == RSM_SUCCESS) {
/* copyout the resource number */
#ifdef _MULTI_DATAMODEL
if ((mode & DATAMODEL_MASK) == DATAMODEL_ILP32) {
rsm_ioctlmsg32_t msg32;
msg32.rnum = msg->rnum;
if (ddi_copyout((caddr_t)&msg32.rnum,
(caddr_t)&((rsm_ioctlmsg32_t *)dataptr)->rnum,
sizeof (minor_t), mode)) {
rsmka_release_adapter(adapter);
e = RSMERR_BAD_ADDR;
}
		} else
#endif
if (ddi_copyout((caddr_t)&msg->rnum,
(caddr_t)&((rsm_ioctlmsg_t *)dataptr)->rnum,
sizeof (minor_t), mode)) {
rsmka_release_adapter(adapter);
e = RSMERR_BAD_ADDR;
}
}
DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_bind done\n"));
return (e);
}
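/*
 * A sketch of the copyout at the end of rsm_bind(): only the rnum field
 * is written back to the caller, at the offset that field has in the
 * caller's data model (struct layouts abbreviated for illustration):
 *
 *	ILP32 caller:	&((rsm_ioctlmsg32_t *)dataptr)->rnum
 *	LP64 caller:	&((rsm_ioctlmsg_t *)dataptr)->rnum
 *
 * ddi_copyout() uses the ioctl mode flags to perform the transfer that
 * matches the calling process.
 */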
static void
rsm_remap_local_importers(rsm_node_id_t src_nodeid,
rsm_memseg_id_t ex_segid, ddi_umem_cookie_t cookie)
{
rsmresource_t *p = NULL;
rsmhash_table_t *rhash = &rsm_import_segs;
uint_t index;
DBG_PRINTF((RSM_KERNEL_AGENT | RSM_FUNC_ALL, RSM_DEBUG_VERBOSE,
"rsm_remap_local_importers enter\n"));
index = rsmhash(ex_segid);
rw_enter(&rhash->rsmhash_rw, RW_READER);
p = rsmhash_getbkt(rhash, index);
for (; p; p = p->rsmrc_next) {
rsmseg_t *seg = (rsmseg_t *)p;
rsmseglock_acquire(seg);
/*
* Change the s_cookie value of only the local importers
* which have been mapped (in state RSM_STATE_ACTIVE).
* Note that there is no need to change the s_cookie value
* if the imported segment is in RSM_STATE_MAPPING since
* eventually the s_cookie will be updated via the mapping
* functionality.
*/
if ((seg->s_segid == ex_segid) && (seg->s_node == src_nodeid) &&
(seg->s_state == RSM_STATE_ACTIVE)) {
seg->s_cookie = cookie;
}
rsmseglock_release(seg);
}
rw_exit(&rhash->rsmhash_rw);
DBG_PRINTF((RSM_KERNEL_AGENT | RSM_FUNC_ALL, RSM_DEBUG_VERBOSE,
"rsm_remap_local_importers done\n"));
}
static int
rsm_rebind(rsmseg_t *seg, rsm_ioctlmsg_t *msg)
{
int e;
adapter_t *adapter;
ddi_umem_cookie_t cookie;
DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_EXPORT);
DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_rebind enter\n"));
/* Check for permissions to rebind */
if (!(seg->s_flags & RSMKA_ALLOW_UNBIND_REBIND)) {
return (RSMERR_REBIND_NOT_ALLOWED);
}
if (seg->s_pid != ddi_get_pid() &&
ddi_get_pid() != 0) {
DBG_PRINTF((category, RSM_ERR, "rsm_rebind: Not owner\n"));
return (RSMERR_NOT_CREATOR);
}
	/*
	 * Partial rebind is not allowed; hence the length passed in
	 * must be the same as the segment length.
	 */
if (msg->vaddr == NULL) {
DBG_PRINTF((category, RSM_ERR,
"rsm_rebind done: null msg->vaddr\n"));
return (RSMERR_BAD_ADDR);
}
if (msg->len != seg->s_len) {
DBG_PRINTF((category, RSM_ERR,
"rsm_rebind: invalid length\n"));
return (RSMERR_BAD_LENGTH);
}
/* Lock segment */
rsmseglock_acquire(seg);
while ((seg->s_state == RSM_STATE_BIND_QUIESCED) ||
(seg->s_state == RSM_STATE_EXPORT_QUIESCING) ||
(seg->s_state == RSM_STATE_EXPORT_QUIESCED)) {
if (cv_wait_sig(&seg->s_cv, &seg->s_lock) == 0) {
rsmseglock_release(seg);
DBG_PRINTF((category, RSM_DEBUG,
"rsm_rebind done: cv_wait INTERRUPTED"));
return (RSMERR_INTERRUPTED);
}
}
/* verify segment state */
if ((seg->s_state != RSM_STATE_BIND) &&
(seg->s_state != RSM_STATE_EXPORT)) {
/* Unlock segment */
rsmseglock_release(seg);
DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
"rsm_rebind done: invalid state\n"));
return (RSMERR_BAD_SEG_HNDL);
}
ASSERT(seg->s_cookie != NULL);
if (msg->vaddr == seg->s_region.r_vaddr) {
rsmseglock_release(seg);
DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_rebind done\n"));
return (RSM_SUCCESS);
}
e = rsm_bind_pages(&cookie, msg->vaddr, msg->len, curproc);
if (e == RSM_SUCCESS) {
struct buf *xbuf;
dev_t sdev = 0;
rsm_memory_local_t mem;
xbuf = ddi_umem_iosetup(cookie, 0, msg->len, B_WRITE,
sdev, 0, NULL, DDI_UMEM_SLEEP);
ASSERT(xbuf != NULL);
mem.ms_type = RSM_MEM_BUF;
mem.ms_bp = xbuf;
adapter = seg->s_adapter;
e = adapter->rsmpi_ops->rsm_rebind(
seg->s_handle.out, 0, &mem,
RSM_RESOURCE_DONTWAIT, NULL);
if (e == RSM_SUCCESS) {
/*
* unbind the older pages, and unload local importers;
* but don't disconnect importers
*/
(void) rsm_unbind_pages(seg);
seg->s_cookie = cookie;
seg->s_region.r_vaddr = msg->vaddr;
rsm_remap_local_importers(my_nodeid, seg->s_segid,
cookie);
} else {
			/*
			 * Unbind the pages associated with "cookie" by
			 * the rsm_bind_pages call above. This mirrors
			 * what the rsm_unbind_pages routine does for
			 * seg->s_cookie.
			 */
ddi_umem_unlock(cookie);
rsm_dec_pgcnt(btopr(msg->len));
DBG_PRINTF((category, RSM_ERR,
"rsm_rebind failed with %d\n", e));
}
/*
* At present there is no dependency on the existence of xbuf.
* So we can free it here. If in the future this changes, it can
* be freed sometime during the segment destroy.
*/
freerbuf(xbuf);
}
/* Unlock segment */
rsmseglock_release(seg);
DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_rebind done\n"));
return (e);
}
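/*
 * A simplified sketch of the successful rebind sequence implemented
 * above (error handling omitted):
 *
 *	rsm_bind_pages(&cookie, msg->vaddr, msg->len, curproc)
 *						lock the new range
 *	adapter->rsmpi_ops->rsm_rebind(...)	switch the RSMPI binding
 *	rsm_unbind_pages(seg)			unlock the old range
 *	seg->s_cookie = cookie			adopt the new cookie
 *	rsm_remap_local_importers(...)		update mapped local importers
 */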
static int
rsm_unbind(rsmseg_t *seg)
{
DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_EXPORT);
DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_unbind enter\n"));
rsmseglock_acquire(seg);
/* verify segment state */
if ((seg->s_state != RSM_STATE_BIND) &&
(seg->s_state != RSM_STATE_BIND_QUIESCED)) {
rsmseglock_release(seg);
DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
"rsm_unbind: invalid state\n"));
return (RSMERR_BAD_SEG_HNDL);
}
/* unlock current range */
(void) rsm_unbind_pages(seg);
if (seg->s_state == RSM_STATE_BIND) {
seg->s_state = RSM_STATE_NEW;
} else if (seg->s_state == RSM_STATE_BIND_QUIESCED) {
seg->s_state = RSM_STATE_NEW_QUIESCED;
}
rsmseglock_release(seg);
DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_unbind done\n"));
return (RSM_SUCCESS);
}
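/*
 * The bind-related state transitions handled by rsm_bind()/rsm_unbind()
 * above can be summarized as:
 *
 *	RSM_STATE_NEW		 --rsm_bind-->	 RSM_STATE_BIND
 *	RSM_STATE_BIND		 --rsm_unbind--> RSM_STATE_NEW
 *	RSM_STATE_BIND_QUIESCED	 --rsm_unbind--> RSM_STATE_NEW_QUIESCED
 */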
/* **************************** Exporter Access List Management ******* */
static void
rsmacl_free(rsmapi_access_entry_t *acl, int acl_len)
{
int acl_sz;
DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_EXPORT);
DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmacl_free enter\n"));
/* acl could be NULL */
if (acl != NULL && acl_len > 0) {
acl_sz = acl_len * sizeof (rsmapi_access_entry_t);
kmem_free((void *)acl, acl_sz);
}
DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmacl_free done\n"));
}
static void
rsmpiacl_free(rsm_access_entry_t *acl, int acl_len)
{
int acl_sz;
DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_EXPORT);
DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmpiacl_free enter\n"));
if (acl != NULL && acl_len > 0) {
acl_sz = acl_len * sizeof (rsm_access_entry_t);
kmem_free((void *)acl, acl_sz);
}
DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmpiacl_free done\n"));
}
static int
rsmacl_build(rsm_ioctlmsg_t *msg, int mode,
rsmapi_access_entry_t **list, int *len, int loopback)
{
rsmapi_access_entry_t *acl;
int acl_len;
int i;
DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_EXPORT);
DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmacl_build enter\n"));
*len = 0;
*list = NULL;
acl_len = msg->acl_len;
if ((loopback && acl_len > 1) || (acl_len < 0) ||
(acl_len > MAX_NODES)) {
DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
"rsmacl_build done: acl invalid\n"));
return (RSMERR_BAD_ACL);
}
if (acl_len > 0 && acl_len <= MAX_NODES) {
size_t acl_size = acl_len * sizeof (rsmapi_access_entry_t);
acl = kmem_alloc(acl_size, KM_SLEEP);
if (ddi_copyin((caddr_t)msg->acl, (caddr_t)acl,
acl_size, mode)) {
kmem_free((void *) acl, acl_size);
DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
"rsmacl_build done: BAD_ADDR\n"));
return (RSMERR_BAD_ADDR);
}
/*
* Verify access list
*/
for (i = 0; i < acl_len; i++) {
if (acl[i].ae_node > MAX_NODES ||
(loopback && (acl[i].ae_node != my_nodeid)) ||
acl[i].ae_permission > RSM_ACCESS_TRUSTED) {
/* invalid entry */
kmem_free((void *) acl, acl_size);
DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
"rsmacl_build done: EINVAL\n"));
return (RSMERR_BAD_ACL);
}
}
*len = acl_len;
*list = acl;
}
DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmacl_build done\n"));
return (DDI_SUCCESS);
}
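/*
 * For illustration, a two-entry access list as an RSMAPI caller might
 * supply it to rsmacl_build() (the node ids are hypothetical):
 *
 *	rsmapi_access_entry_t acl[2] = {
 *		{ 5, RSM_PERM_RDWR },		node 5: read/write
 *		{ 9, RSM_PERM_READ }		node 9: read-only
 *	};
 *	msg->acl = acl;
 *	msg->acl_len = 2;
 *
 * Entries whose ae_node exceeds MAX_NODES or whose ae_permission
 * exceeds RSM_ACCESS_TRUSTED are rejected with RSMERR_BAD_ACL.
 */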
static int
rsmpiacl_create(rsmapi_access_entry_t *src, rsm_access_entry_t **dest,
int acl_len, adapter_t *adapter)
{
rsm_access_entry_t *acl;
rsm_addr_t hwaddr;
int i;
DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_EXPORT);
DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmpiacl_create enter\n"));
if (src != NULL) {
size_t acl_size = acl_len * sizeof (rsm_access_entry_t);
acl = kmem_alloc(acl_size, KM_SLEEP);
/*
* translate access list
*/
for (i = 0; i < acl_len; i++) {
if (src[i].ae_node == my_nodeid) {
acl[i].ae_addr = adapter->hwaddr;
} else {
hwaddr = get_remote_hwaddr(adapter,
src[i].ae_node);
if ((int64_t)hwaddr < 0) {
/* invalid hwaddr */
kmem_free((void *) acl, acl_size);
DBG_PRINTF((category,
RSM_DEBUG_VERBOSE,
"rsmpiacl_create done:"
"EINVAL hwaddr\n"));
return (RSMERR_INTERNAL_ERROR);
}
acl[i].ae_addr = hwaddr;
}
/* rsmpi understands only RSM_PERM_XXXX */
acl[i].ae_permission =
src[i].ae_permission & RSM_PERM_RDWR;
}
*dest = acl;
} else {
*dest = NULL;
}
DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmpiacl_create done\n"));
return (RSM_SUCCESS);
}
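/*
 * Sketch of the translation performed above for a single entry (the
 * values are hypothetical): an incoming { ae_node = 5, ae_permission =
 * RSM_ACCESS_TRUSTED } becomes
 *
 *	{ ae_addr = get_remote_hwaddr(adapter, 5),
 *	  ae_permission = RSM_ACCESS_TRUSTED & RSM_PERM_RDWR }
 *
 * since rsmpi understands only the RSM_PERM_XXXX bits.
 */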
static int
rsmsegacl_validate(rsmipc_request_t *req, rsm_node_id_t rnode,
rsmipc_reply_t *reply)
{
int i;
rsmseg_t *seg;
rsm_memseg_id_t key = req->rsmipc_key;
rsm_permission_t perm = req->rsmipc_perm;
DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_EXPORT);
DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
"rsmsegacl_validate enter\n"));
	/*
	 * Find the segment and grab its lock. The reason we grab the
	 * segment lock inside the search is to avoid a race in which
	 * the segment is being deleted while we already hold a pointer
	 * to it.
	 */
seg = rsmexport_lookup(key);
if (!seg) {
DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
"rsmsegacl_validate done: %u ENXIO\n", key));
return (RSMERR_SEG_NOT_PUBLISHED);
}
ASSERT(rsmseglock_held(seg));
ASSERT(seg->s_state == RSM_STATE_EXPORT);
/*
* We implement a 2-level protection scheme.
* First, we check if local/remote host has access rights.
* Second, we check if the user has access rights.
*
* This routine only validates the rnode access_list
*/
if (seg->s_acl_len > 0) {
/*
* Check host access list
*/
ASSERT(seg->s_acl != NULL);
for (i = 0; i < seg->s_acl_len; i++) {
if (seg->s_acl[i].ae_node == rnode) {
perm &= seg->s_acl[i].ae_permission;
goto found;
}
}
/* rnode is not found in the list */
rsmseglock_release(seg);
DBG_PRINTF((category, RSM_DEBUG_VERBOSE,
"rsmsegacl_validate done: EPERM\n"));
return (RSMERR_SEG_NOT_PUBLISHED_TO_NODE);
} else {
/* use default owner creation umask */
perm &= seg->s_mode;
}
found:
/* update perm for this node */
reply->rsmipc_mode = perm;
reply->rsmipc_uid = seg->s_uid;
reply->rsmipc_gid = seg->s_gid;
reply->rsmipc_segid = seg->s_segid;
reply->rsmipc_seglen = seg->s_len;
/*
* Perm of requesting node is valid; source will validate user
*/
rsmseglock_release(seg);
	/*
	 * Add the importer to the list right away; if the connect
	 * fails, the importer will ask the exporter to remove it.
	 */
importer_list_add(rnode, key, req->rsmipc_adapter_hwaddr,
req->rsmipc_segment_cookie);
DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsmsegacl_validate done\n"));
return (RSM_SUCCESS);
}
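/*
 * For illustration of the host-level check above (values hypothetical):
 * a request from node 5 asking for RSM_PERM_RDWR against an exporter
 * ACL containing { 5, RSM_PERM_READ } yields
 *
 *	perm = RSM_PERM_RDWR & RSM_PERM_READ = RSM_PERM_READ
 *	reply->rsmipc_mode = RSM_PERM_READ
 *
 * The importing side then performs the second, user-level check using
 * the returned rsmipc_uid/rsmipc_gid and mode.
 */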
/* ************************** Exporter Calls ************************* */
static int
rsm_publish(rsmseg_t *seg, rsm_ioctlmsg_t *msg, intptr_t dataptr, int mode)
{
int e;
int acl_len;
rsmapi_access_entry_t *acl;
rsm_access_entry_t *rsmpi_acl;
rsm_memory_local_t mem;
struct buf *xbuf;
dev_t sdev = 0;
adapter_t *adapter;
rsm_memseg_id_t segment_id = 0;
int loopback_flag = 0;
int create_flags = 0;
rsm_resource_callback_t callback_flag;
DBG_DEFINE(category, RSM_KERNEL_AGENT | RSM_EXPORT);
DBG_PRINTF((category, RSM_DEBUG_VERBOSE, "rsm_publish enter\n"));
if (seg->s_adapter == &loopback_adapter)
loopback_flag = 1;
if (seg->s_pid != ddi_get_pid() &&
ddi_get_pid() != 0) {
DBG_PRINTF((category, RSM_ERR,
"rsm_publish: Not creator\n"));
return (RSMERR_NOT_CREATOR);
}
/*
* Get per node access list
*/
e = rsmacl_build(msg, mode, &acl, &acl_len, loopback_flag);
if (e != DDI_SUCCESS) {
DBG_PRINTF((category, RSM_ERR,
"rsm_publish done: rsmacl_build failed\n"));
return (e);
}
/*
* The application provided msg->key is used for resolving a
* segment id according to the following:
* key = 0 Kernel Agent selects the segment id
* key <= RSM_DLPI_ID_END Reserved for system usage except
* RSMLIB range
* key < RSM_USER_APP_ID_BASE segment id = key
* key >= RSM_USER_APP_ID_BASE Reserved for KA selections
*
	 * rsm_nextavail_segmentid is initialized to 0x80000000 and
	 * overflows to zero after 0x80000000 allocations.
	 * An algorithm is needed which allows reinitialization and
	 * provides for reallocation after overflow. For now,
	 * RSMERR_INSUFFICIENT_RESOURCES is returned once the overflow
	 * condition has occurred.
	 */
if (msg->key == 0) {
mutex_enter(&rsm_lock);
segment_id = rsm_nextavail_segmentid;
if (segment_id != 0) {
rsm_nextavail_segmentid++;
mutex_exit(&rsm_lock);
} else {
mutex_exit(&rsm_lock);
DBG_PRINTF((category, RSM_ERR,
"rsm_publish done: no more keys avlbl\n"));
return (RSMERR_INSUFFICIENT_RESOURCES);
}
} else if BETWEEN(msg->key, RSM_RSMLIB_ID_BASE, RSM_RSMLIB_ID_END)
/* range reserved for internal use by base/ndi libraries */
segment_id = msg->key;
else if (msg->key <= RSM_DLPI_ID_END)
return (RSMERR_RESERVED_SEGID);
	else if (msg->key <= (uint_t)RSM_USER_APP_ID_BASE - 1)
segment_id = msg->key;
else {
DBG_PRINTF((category, RSM_ERR,
"rsm_publish done: invalid key %u\n", msg->key));
return (RSMERR_RESERVED_SEGID);
}
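	/*
	 * To summarize the key resolution above (first matching rule
	 * wins; ranges are symbolic, no specific values assumed):
	 *
	 *	key == 0			KA picks rsm_nextavail_segmentid
	 *	key in RSMLIB range		segment_id = key
	 *	key <= RSM_DLPI_ID_END		RSMERR_RESERVED_SEGID
	 *	key <  RSM_USER_APP_ID_BASE	segment_id = key
	 *	otherwise			RSMERR_RESERVED_SEGID
	 */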
/* Add key to exportlist; The segment lock is held on success */
e = rsmexport_add(seg, segment_id);
if (e) {
rsmacl_free(acl, acl_len);
DBG_PRINTF((category, RSM_ERR,
"rsm_publish done: export_add failed: %d\n", e));
return (e);
}
seg->s_segid = segment_id;
if ((seg->s_state != RSM_STATE_BIND) &&
(seg->s_state != RSM_STATE_BIND_QUIESCED)) {
/* state changed since then, free acl and return */
rsmseglock_release(seg);
rsmexport_rm(seg);