| /* |
| * CDDL HEADER START |
| * |
| * The contents of this file are subject to the terms of the |
| * Common Development and Distribution License (the "License"). |
| * You may not use this file except in compliance with the License. |
| * |
| * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE |
| * or http://www.opensolaris.org/os/licensing. |
| * See the License for the specific language governing permissions |
| * and limitations under the License. |
| * |
| * When distributing Covered Code, include this CDDL HEADER in each |
| * file and include the License file at usr/src/OPENSOLARIS.LICENSE. |
| * If applicable, add the following below this CDDL HEADER, with the |
| * fields enclosed by brackets "[]" replaced with your own identifying |
| * information: Portions Copyright [yyyy] [name of copyright owner] |
| * |
| * CDDL HEADER END |
| */ |
| /* |
| * Copyright 2009 Sun Microsystems, Inc. All rights reserved. |
| * Use is subject to license terms. |
| */ |
| |
| /* |
| * Copyright (c) 2007, The Ohio State University. All rights reserved. |
| * |
| * Portions of this source code is developed by the team members of |
| * The Ohio State University's Network-Based Computing Laboratory (NBCL), |
| * headed by Professor Dhabaleswar K. (DK) Panda. |
| * |
| * Acknowledgements to contributions from developers: |
| * Ranjit Noronha: noronha@cse.ohio-state.edu |
| * Lei Chai : chail@cse.ohio-state.edu |
| * Weikuan Yu : yuw@cse.ohio-state.edu |
| * |
| */ |
| |
| /* |
| * The rpcib plugin. Implements the interface for RDMATF's |
| * interaction with IBTF. |
| */ |
| |
| #include <sys/param.h> |
| #include <sys/types.h> |
| #include <sys/user.h> |
| #include <sys/systm.h> |
| #include <sys/sysmacros.h> |
| #include <sys/proc.h> |
| #include <sys/socket.h> |
| #include <sys/file.h> |
| #include <sys/stream.h> |
| #include <sys/strsubr.h> |
| #include <sys/stropts.h> |
| #include <sys/errno.h> |
| #include <sys/kmem.h> |
| #include <sys/debug.h> |
| #include <sys/pathname.h> |
| #include <sys/kstat.h> |
| #include <sys/t_lock.h> |
| #include <sys/ddi.h> |
| #include <sys/cmn_err.h> |
| #include <sys/time.h> |
| #include <sys/isa_defs.h> |
| #include <sys/callb.h> |
| #include <sys/sunddi.h> |
| #include <sys/sunndi.h> |
| #include <sys/sdt.h> |
| #include <sys/ib/ibtl/ibti.h> |
| #include <rpc/rpc.h> |
| #include <rpc/ib.h> |
| #include <sys/modctl.h> |
| #include <sys/kstr.h> |
| #include <sys/sockio.h> |
| #include <sys/vnode.h> |
| #include <sys/tiuser.h> |
| #include <net/if.h> |
| #include <net/if_types.h> |
| #include <sys/cred.h> |
| #include <rpc/rpc_rdma.h> |
| #include <nfs/nfs.h> |
| #include <sys/atomic.h> |
| |
| #define NFS_RDMA_PORT 2050 |
| |
| /* |
| * Convenience structure used by rpcib_get_ib_addresses() |
| */ |
| typedef struct rpcib_ipaddrs { |
| void *ri_list; /* pointer to list of addresses */ |
| uint_t ri_count; /* number of addresses in list */ |
| uint_t ri_size; /* size of ri_list in bytes */ |
| } rpcib_ipaddrs_t; |
| |
| /* |
| * Prototype declarations for driver ops |
| */ |
| static int rpcib_attach(dev_info_t *, ddi_attach_cmd_t); |
| static int rpcib_getinfo(dev_info_t *, ddi_info_cmd_t, |
| void *, void **); |
| static int rpcib_detach(dev_info_t *, ddi_detach_cmd_t); |
| static boolean_t rpcib_rdma_capable_interface(struct lifreq *); |
| static int rpcib_do_ip_ioctl(int, int, void *); |
| static boolean_t rpcib_get_ib_addresses(rpcib_ipaddrs_t *, rpcib_ipaddrs_t *); |
| static int rpcib_cache_kstat_update(kstat_t *, int); |
| static void rib_force_cleanup(void *); |
| |
| struct { |
| kstat_named_t cache_limit; |
| kstat_named_t cache_allocation; |
| kstat_named_t cache_hits; |
| kstat_named_t cache_misses; |
| kstat_named_t cache_misses_above_the_limit; |
| } rpcib_kstat = { |
| {"cache_limit", KSTAT_DATA_UINT64 }, |
| {"cache_allocation", KSTAT_DATA_UINT64 }, |
| {"cache_hits", KSTAT_DATA_UINT64 }, |
| {"cache_misses", KSTAT_DATA_UINT64 }, |
| {"cache_misses_above_the_limit", KSTAT_DATA_UINT64 }, |
| }; |
| |
| /* rpcib cb_ops */ |
| static struct cb_ops rpcib_cbops = { |
| nulldev, /* open */ |
| nulldev, /* close */ |
| nodev, /* strategy */ |
| nodev, /* print */ |
| nodev, /* dump */ |
| nodev, /* read */ |
| nodev, /* write */ |
| nodev, /* ioctl */ |
| nodev, /* devmap */ |
| nodev, /* mmap */ |
| nodev, /* segmap */ |
| nochpoll, /* poll */ |
| ddi_prop_op, /* prop_op */ |
| NULL, /* stream */ |
| D_MP, /* cb_flag */ |
| CB_REV, /* rev */ |
| nodev, /* int (*cb_aread)() */ |
| nodev /* int (*cb_awrite)() */ |
| }; |
| |
| /* |
| * Device options |
| */ |
| static struct dev_ops rpcib_ops = { |
| DEVO_REV, /* devo_rev, */ |
| 0, /* refcnt */ |
| rpcib_getinfo, /* info */ |
| nulldev, /* identify */ |
| nulldev, /* probe */ |
| rpcib_attach, /* attach */ |
| rpcib_detach, /* detach */ |
| nodev, /* reset */ |
| &rpcib_cbops, /* driver ops - devctl interfaces */ |
| NULL, /* bus operations */ |
| NULL, /* power */ |
| ddi_quiesce_not_needed, /* quiesce */ |
| }; |
| |
| /* |
| * Module linkage information. |
| */ |
| |
| static struct modldrv rib_modldrv = { |
| &mod_driverops, /* Driver module */ |
| "RPCIB plugin driver", /* Driver name and version */ |
| &rpcib_ops, /* Driver ops */ |
| }; |
| |
| static struct modlinkage rib_modlinkage = { |
| MODREV_1, |
| (void *)&rib_modldrv, |
| NULL |
| }; |
| |
| typedef struct rib_lrc_entry { |
| struct rib_lrc_entry *forw; |
| struct rib_lrc_entry *back; |
| char *lrc_buf; |
| |
| uint32_t lrc_len; |
| void *avl_node; |
| bool_t registered; |
| |
| struct mrc lrc_mhandle; |
| bool_t lrc_on_freed_list; |
| } rib_lrc_entry_t; |
| |
| typedef struct cache_struct { |
| rib_lrc_entry_t r; |
| uint32_t len; |
| uint32_t elements; |
| kmutex_t node_lock; |
| avl_node_t avl_link; |
| } cache_avl_struct_t; |
| |
| static uint64_t rib_total_buffers = 0; |
| uint64_t cache_limit = 100 * 1024 * 1024; |
| static volatile uint64_t cache_allocation = 0; |
| static uint64_t cache_watermark = 80 * 1024 * 1024; |
| static uint64_t cache_hits = 0; |
| static uint64_t cache_misses = 0; |
| static uint64_t cache_cold_misses = 0; |
| static uint64_t cache_hot_misses = 0; |
| static uint64_t cache_misses_above_the_limit = 0; |
| static bool_t stats_enabled = FALSE; |
| |
| static uint64_t max_unsignaled_rws = 5; |
| |
| /* |
| * rib_stat: private data pointer used when registering |
| * with the IBTF. It is returned to the consumer |
| * in all callbacks. |
| */ |
| static rpcib_state_t *rib_stat = NULL; |
| |
| #define RNR_RETRIES IBT_RNR_RETRY_1 |
| #define MAX_PORTS 2 |
| |
| int preposted_rbufs = RDMA_BUFS_GRANT; |
| int send_threshold = 1; |
| |
| /* |
| * State of the plugin. |
| * ACCEPT = accepting new connections and requests. |
| * NO_ACCEPT = not accepting new connection and requests. |
| * This should eventually move to rpcib_state_t structure, since this |
| * will tell in which state the plugin is for a particular type of service |
| * like NFS, NLM or v4 Callback deamon. The plugin might be in accept |
| * state for one and in no_accept state for the other. |
| */ |
| int plugin_state; |
| kmutex_t plugin_state_lock; |
| |
| ldi_ident_t rpcib_li; |
| |
| /* |
| * RPCIB RDMATF operations |
| */ |
| #if defined(MEASURE_POOL_DEPTH) |
| static void rib_posted_rbufs(uint32_t x) { return; } |
| #endif |
| static rdma_stat rib_reachable(int addr_type, struct netbuf *, void **handle); |
| static rdma_stat rib_disconnect(CONN *conn); |
| static void rib_listen(struct rdma_svc_data *rd); |
| static void rib_listen_stop(struct rdma_svc_data *rd); |
| static rdma_stat rib_registermem(CONN *conn, caddr_t adsp, caddr_t buf, |
| uint_t buflen, struct mrc *buf_handle); |
| static rdma_stat rib_deregistermem(CONN *conn, caddr_t buf, |
| struct mrc buf_handle); |
| static rdma_stat rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp, |
| caddr_t buf, uint_t buflen, struct mrc *buf_handle); |
| static rdma_stat rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf, |
| struct mrc buf_handle); |
| static rdma_stat rib_registermemsync(CONN *conn, caddr_t adsp, caddr_t buf, |
| uint_t buflen, struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle, |
| void *lrc); |
| static rdma_stat rib_deregistermemsync(CONN *conn, caddr_t buf, |
| struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle, void *); |
| static rdma_stat rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, |
| caddr_t buf, int len, int cpu); |
| |
| static rdma_stat rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf); |
| |
| static void rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf); |
| static void *rib_rbuf_alloc(CONN *, rdma_buf_t *); |
| |
| static void rib_rbuf_free(CONN *conn, int ptype, void *buf); |
| |
| static rdma_stat rib_send(CONN *conn, struct clist *cl, uint32_t msgid); |
| static rdma_stat rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid); |
| static rdma_stat rib_post_resp(CONN *conn, struct clist *cl, uint32_t msgid); |
| static rdma_stat rib_post_resp_remove(CONN *conn, uint32_t msgid); |
| static rdma_stat rib_post_recv(CONN *conn, struct clist *cl); |
| static rdma_stat rib_recv(CONN *conn, struct clist **clp, uint32_t msgid); |
| static rdma_stat rib_read(CONN *conn, struct clist *cl, int wait); |
| static rdma_stat rib_write(CONN *conn, struct clist *cl, int wait); |
| static rdma_stat rib_ping_srv(int addr_type, struct netbuf *, rib_hca_t **); |
| static rdma_stat rib_conn_get(struct netbuf *, int addr_type, void *, CONN **); |
| static rdma_stat rib_conn_release(CONN *conn); |
| static rdma_stat rib_getinfo(rdma_info_t *info); |
| |
| static rib_lrc_entry_t *rib_get_cache_buf(CONN *conn, uint32_t len); |
| static void rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *buf); |
| static void rib_destroy_cache(rib_hca_t *hca); |
| static void rib_server_side_cache_reclaim(void *argp); |
| static int avl_compare(const void *t1, const void *t2); |
| |
| static void rib_stop_services(rib_hca_t *); |
| static void rib_close_channels(rib_conn_list_t *); |
| |
| /* |
| * RPCIB addressing operations |
| */ |
| |
| /* |
| * RDMA operations the RPCIB module exports |
| */ |
| static rdmaops_t rib_ops = { |
| rib_reachable, |
| rib_conn_get, |
| rib_conn_release, |
| rib_listen, |
| rib_listen_stop, |
| rib_registermem, |
| rib_deregistermem, |
| rib_registermemsync, |
| rib_deregistermemsync, |
| rib_syncmem, |
| rib_reg_buf_alloc, |
| rib_reg_buf_free, |
| rib_send, |
| rib_send_resp, |
| rib_post_resp, |
| rib_post_resp_remove, |
| rib_post_recv, |
| rib_recv, |
| rib_read, |
| rib_write, |
| rib_getinfo, |
| }; |
| |
| /* |
| * RDMATF RPCIB plugin details |
| */ |
| static rdma_mod_t rib_mod = { |
| "ibtf", /* api name */ |
| RDMATF_VERS_1, |
| 0, |
| &rib_ops, /* rdma op vector for ibtf */ |
| }; |
| |
| static rdma_stat open_hcas(rpcib_state_t *); |
| static rdma_stat rib_qp_init(rib_qp_t *, int); |
| static void rib_svc_scq_handler(ibt_cq_hdl_t, void *); |
| static void rib_clnt_scq_handler(ibt_cq_hdl_t, void *); |
| static void rib_clnt_rcq_handler(ibt_cq_hdl_t, void *); |
| static void rib_svc_rcq_handler(ibt_cq_hdl_t, void *); |
| static rib_bufpool_t *rib_rbufpool_create(rib_hca_t *hca, int ptype, int num); |
| static rdma_stat rib_reg_mem(rib_hca_t *, caddr_t adsp, caddr_t, uint_t, |
| ibt_mr_flags_t, ibt_mr_hdl_t *, ibt_mr_desc_t *); |
| static rdma_stat rib_reg_mem_user(rib_hca_t *, caddr_t, uint_t, ibt_mr_flags_t, |
| ibt_mr_hdl_t *, ibt_mr_desc_t *, caddr_t); |
| static rdma_stat rib_conn_to_srv(rib_hca_t *, rib_qp_t *, ibt_path_info_t *, |
| ibt_ip_addr_t *, ibt_ip_addr_t *); |
| static rdma_stat rib_clnt_create_chan(rib_hca_t *, struct netbuf *, |
| rib_qp_t **); |
| static rdma_stat rib_svc_create_chan(rib_hca_t *, caddr_t, uint8_t, |
| rib_qp_t **); |
| static rdma_stat rib_sendwait(rib_qp_t *, struct send_wid *); |
| static struct send_wid *rib_init_sendwait(uint32_t, int, rib_qp_t *); |
| static int rib_free_sendwait(struct send_wid *); |
| static struct rdma_done_list *rdma_done_add(rib_qp_t *qp, uint32_t xid); |
| static void rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd); |
| static void rdma_done_rem_list(rib_qp_t *); |
| static void rdma_done_notify(rib_qp_t *qp, uint32_t xid); |
| |
| static void rib_async_handler(void *, |
| ibt_hca_hdl_t, ibt_async_code_t, ibt_async_event_t *); |
| static rdma_stat rib_rem_rep(rib_qp_t *, struct reply *); |
| static struct svc_recv *rib_init_svc_recv(rib_qp_t *, ibt_wr_ds_t *); |
| static int rib_free_svc_recv(struct svc_recv *); |
| static struct recv_wid *rib_create_wid(rib_qp_t *, ibt_wr_ds_t *, uint32_t); |
| static void rib_free_wid(struct recv_wid *); |
| static rdma_stat rib_disconnect_channel(CONN *, rib_conn_list_t *); |
| static void rib_detach_hca(rib_hca_t *); |
| static rdma_stat rib_chk_srv_ibaddr(struct netbuf *, int, |
| ibt_path_info_t *, ibt_ip_addr_t *, ibt_ip_addr_t *); |
| |
| /* |
| * Registration with IBTF as a consumer |
| */ |
| static struct ibt_clnt_modinfo_s rib_modinfo = { |
| IBTI_V2, |
| IBT_GENERIC, |
| rib_async_handler, /* async event handler */ |
| NULL, /* Memory Region Handler */ |
| "nfs/ib" |
| }; |
| |
| /* |
| * Global strucuture |
| */ |
| |
| typedef struct rpcib_s { |
| dev_info_t *rpcib_dip; |
| kmutex_t rpcib_mutex; |
| } rpcib_t; |
| |
| rpcib_t rpcib; |
| |
/*
 * Debug knob for the rpcib kernel module, settable from /etc/system.
 * Values greater than 1 select how much debugging output is produced.
 */
int	rib_debug = 0;
| |
| int |
| _init(void) |
| { |
| int error; |
| |
| error = mod_install((struct modlinkage *)&rib_modlinkage); |
| if (error != 0) { |
| /* |
| * Could not load module |
| */ |
| return (error); |
| } |
| mutex_init(&plugin_state_lock, NULL, MUTEX_DRIVER, NULL); |
| return (0); |
| } |
| |
| int |
| _fini() |
| { |
| int status; |
| |
| if ((status = rdma_unregister_mod(&rib_mod)) != RDMA_SUCCESS) { |
| return (EBUSY); |
| } |
| |
| /* |
| * Remove module |
| */ |
| if ((status = mod_remove(&rib_modlinkage)) != 0) { |
| (void) rdma_register_mod(&rib_mod); |
| return (status); |
| } |
| mutex_destroy(&plugin_state_lock); |
| return (0); |
| } |
| |
| int |
| _info(struct modinfo *modinfop) |
| { |
| return (mod_info(&rib_modlinkage, modinfop)); |
| } |
| |
| /* |
| * rpcib_getinfo() |
| * Given the device number, return the devinfo pointer or the |
| * instance number. |
| * Note: always succeed DDI_INFO_DEVT2INSTANCE, even before attach. |
| */ |
| |
| /*ARGSUSED*/ |
| static int |
| rpcib_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result) |
| { |
| int ret = DDI_SUCCESS; |
| |
| switch (cmd) { |
| case DDI_INFO_DEVT2DEVINFO: |
| if (rpcib.rpcib_dip != NULL) |
| *result = rpcib.rpcib_dip; |
| else { |
| *result = NULL; |
| ret = DDI_FAILURE; |
| } |
| break; |
| |
| case DDI_INFO_DEVT2INSTANCE: |
| *result = NULL; |
| break; |
| |
| default: |
| ret = DDI_FAILURE; |
| } |
| return (ret); |
| } |
| |
| static int |
| rpcib_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) |
| { |
| ibt_status_t ibt_status; |
| rdma_stat r_status; |
| |
| switch (cmd) { |
| case DDI_ATTACH: |
| break; |
| case DDI_RESUME: |
| return (DDI_SUCCESS); |
| default: |
| return (DDI_FAILURE); |
| } |
| |
| mutex_init(&rpcib.rpcib_mutex, NULL, MUTEX_DRIVER, NULL); |
| |
| mutex_enter(&rpcib.rpcib_mutex); |
| if (rpcib.rpcib_dip != NULL) { |
| mutex_exit(&rpcib.rpcib_mutex); |
| return (DDI_FAILURE); |
| } |
| rpcib.rpcib_dip = dip; |
| mutex_exit(&rpcib.rpcib_mutex); |
| /* |
| * Create the "rpcib" minor-node. |
| */ |
| if (ddi_create_minor_node(dip, |
| "rpcib", S_IFCHR, 0, DDI_PSEUDO, 0) != DDI_SUCCESS) { |
| /* Error message, no cmn_err as they print on console */ |
| return (DDI_FAILURE); |
| } |
| |
| if (rib_stat == NULL) { |
| rib_stat = kmem_zalloc(sizeof (*rib_stat), KM_SLEEP); |
| mutex_init(&rib_stat->open_hca_lock, NULL, MUTEX_DRIVER, NULL); |
| } |
| |
| rib_stat->hca_count = ibt_get_hca_list(&rib_stat->hca_guids); |
| if (rib_stat->hca_count < 1) { |
| mutex_destroy(&rib_stat->open_hca_lock); |
| kmem_free(rib_stat, sizeof (*rib_stat)); |
| rib_stat = NULL; |
| return (DDI_FAILURE); |
| } |
| |
| ibt_status = ibt_attach(&rib_modinfo, dip, |
| (void *)rib_stat, &rib_stat->ibt_clnt_hdl); |
| |
| if (ibt_status != IBT_SUCCESS) { |
| ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count); |
| mutex_destroy(&rib_stat->open_hca_lock); |
| kmem_free(rib_stat, sizeof (*rib_stat)); |
| rib_stat = NULL; |
| return (DDI_FAILURE); |
| } |
| |
| mutex_enter(&rib_stat->open_hca_lock); |
| if (open_hcas(rib_stat) != RDMA_SUCCESS) { |
| ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count); |
| (void) ibt_detach(rib_stat->ibt_clnt_hdl); |
| mutex_exit(&rib_stat->open_hca_lock); |
| mutex_destroy(&rib_stat->open_hca_lock); |
| kmem_free(rib_stat, sizeof (*rib_stat)); |
| rib_stat = NULL; |
| return (DDI_FAILURE); |
| } |
| mutex_exit(&rib_stat->open_hca_lock); |
| |
| /* |
| * Register with rdmatf |
| */ |
| rib_mod.rdma_count = rib_stat->hca_count; |
| r_status = rdma_register_mod(&rib_mod); |
| if (r_status != RDMA_SUCCESS && r_status != RDMA_REG_EXIST) { |
| rib_detach_hca(rib_stat->hca); |
| ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count); |
| (void) ibt_detach(rib_stat->ibt_clnt_hdl); |
| mutex_destroy(&rib_stat->open_hca_lock); |
| kmem_free(rib_stat, sizeof (*rib_stat)); |
| rib_stat = NULL; |
| return (DDI_FAILURE); |
| } |
| |
| |
| return (DDI_SUCCESS); |
| } |
| |
| /*ARGSUSED*/ |
| static int |
| rpcib_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) |
| { |
| switch (cmd) { |
| |
| case DDI_DETACH: |
| break; |
| |
| case DDI_SUSPEND: |
| default: |
| return (DDI_FAILURE); |
| } |
| |
| /* |
| * Detach the hca and free resources |
| */ |
| mutex_enter(&plugin_state_lock); |
| plugin_state = NO_ACCEPT; |
| mutex_exit(&plugin_state_lock); |
| rib_detach_hca(rib_stat->hca); |
| ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count); |
| (void) ibt_detach(rib_stat->ibt_clnt_hdl); |
| |
| mutex_enter(&rpcib.rpcib_mutex); |
| rpcib.rpcib_dip = NULL; |
| mutex_exit(&rpcib.rpcib_mutex); |
| |
| mutex_destroy(&rpcib.rpcib_mutex); |
| return (DDI_SUCCESS); |
| } |
| |
| |
| static void rib_rbufpool_free(rib_hca_t *, int); |
| static void rib_rbufpool_deregister(rib_hca_t *, int); |
| static void rib_rbufpool_destroy(rib_hca_t *hca, int ptype); |
| static struct reply *rib_addreplylist(rib_qp_t *, uint32_t); |
| static rdma_stat rib_rem_replylist(rib_qp_t *); |
| static int rib_remreply(rib_qp_t *, struct reply *); |
| static rdma_stat rib_add_connlist(CONN *, rib_conn_list_t *); |
| static rdma_stat rib_rm_conn(CONN *, rib_conn_list_t *); |
| |
| |
| /* |
| * One CQ pair per HCA |
| */ |
| static rdma_stat |
| rib_create_cq(rib_hca_t *hca, uint32_t cq_size, ibt_cq_handler_t cq_handler, |
| rib_cq_t **cqp, rpcib_state_t *ribstat) |
| { |
| rib_cq_t *cq; |
| ibt_cq_attr_t cq_attr; |
| uint32_t real_size; |
| ibt_status_t status; |
| rdma_stat error = RDMA_SUCCESS; |
| |
| cq = kmem_zalloc(sizeof (rib_cq_t), KM_SLEEP); |
| cq->rib_hca = hca; |
| cq_attr.cq_size = cq_size; |
| cq_attr.cq_flags = IBT_CQ_NO_FLAGS; |
| status = ibt_alloc_cq(hca->hca_hdl, &cq_attr, &cq->rib_cq_hdl, |
| &real_size); |
| if (status != IBT_SUCCESS) { |
| cmn_err(CE_WARN, "rib_create_cq: ibt_alloc_cq() failed," |
| " status=%d", status); |
| error = RDMA_FAILED; |
| goto fail; |
| } |
| ibt_set_cq_handler(cq->rib_cq_hdl, cq_handler, ribstat); |
| |
| /* |
| * Enable CQ callbacks. CQ Callbacks are single shot |
| * (e.g. you have to call ibt_enable_cq_notify() |
| * after each callback to get another one). |
| */ |
| status = ibt_enable_cq_notify(cq->rib_cq_hdl, IBT_NEXT_COMPLETION); |
| if (status != IBT_SUCCESS) { |
| cmn_err(CE_WARN, "rib_create_cq: " |
| "enable_cq_notify failed, status %d", status); |
| error = RDMA_FAILED; |
| goto fail; |
| } |
| *cqp = cq; |
| |
| return (error); |
| fail: |
| if (cq->rib_cq_hdl) |
| (void) ibt_free_cq(cq->rib_cq_hdl); |
| if (cq) |
| kmem_free(cq, sizeof (rib_cq_t)); |
| return (error); |
| } |
| |
| static rdma_stat |
| open_hcas(rpcib_state_t *ribstat) |
| { |
| rib_hca_t *hca; |
| ibt_status_t ibt_status; |
| rdma_stat status; |
| ibt_hca_portinfo_t *pinfop; |
| ibt_pd_flags_t pd_flags = IBT_PD_NO_FLAGS; |
| uint_t size, cq_size; |
| int i; |
| kstat_t *ksp; |
| cache_avl_struct_t example_avl_node; |
| char rssc_name[32]; |
| |
| ASSERT(MUTEX_HELD(&ribstat->open_hca_lock)); |
| |
| if (ribstat->hcas == NULL) |
| ribstat->hcas = kmem_zalloc(ribstat->hca_count * |
| sizeof (rib_hca_t), KM_SLEEP); |
| |
| /* |
| * Open a hca and setup for RDMA |
| */ |
| for (i = 0; i < ribstat->hca_count; i++) { |
| ibt_status = ibt_open_hca(ribstat->ibt_clnt_hdl, |
| ribstat->hca_guids[i], |
| &ribstat->hcas[i].hca_hdl); |
| if (ibt_status != IBT_SUCCESS) { |
| continue; |
| } |
| ribstat->hcas[i].hca_guid = ribstat->hca_guids[i]; |
| hca = &(ribstat->hcas[i]); |
| hca->ibt_clnt_hdl = ribstat->ibt_clnt_hdl; |
| hca->state = HCA_INITED; |
| |
| /* |
| * query HCA info |
| */ |
| ibt_status = ibt_query_hca(hca->hca_hdl, &hca->hca_attrs); |
| if (ibt_status != IBT_SUCCESS) { |
| goto fail1; |
| } |
| |
| /* |
| * One PD (Protection Domain) per HCA. |
| * A qp is allowed to access a memory region |
| * only when it's in the same PD as that of |
| * the memory region. |
| */ |
| ibt_status = ibt_alloc_pd(hca->hca_hdl, pd_flags, &hca->pd_hdl); |
| if (ibt_status != IBT_SUCCESS) { |
| goto fail1; |
| } |
| |
| /* |
| * query HCA ports |
| */ |
| ibt_status = ibt_query_hca_ports(hca->hca_hdl, |
| 0, &pinfop, &hca->hca_nports, &size); |
| if (ibt_status != IBT_SUCCESS) { |
| goto fail2; |
| } |
| hca->hca_ports = pinfop; |
| hca->hca_pinfosz = size; |
| pinfop = NULL; |
| |
| cq_size = DEF_CQ_SIZE; /* default cq size */ |
| /* |
| * Create 2 pairs of cq's (1 pair for client |
| * and the other pair for server) on this hca. |
| * If number of qp's gets too large, then several |
| * cq's will be needed. |
| */ |
| status = rib_create_cq(hca, cq_size, rib_svc_rcq_handler, |
| &hca->svc_rcq, ribstat); |
| if (status != RDMA_SUCCESS) { |
| goto fail3; |
| } |
| |
| status = rib_create_cq(hca, cq_size, rib_svc_scq_handler, |
| &hca->svc_scq, ribstat); |
| if (status != RDMA_SUCCESS) { |
| goto fail3; |
| } |
| |
| status = rib_create_cq(hca, cq_size, rib_clnt_rcq_handler, |
| &hca->clnt_rcq, ribstat); |
| if (status != RDMA_SUCCESS) { |
| goto fail3; |
| } |
| |
| status = rib_create_cq(hca, cq_size, rib_clnt_scq_handler, |
| &hca->clnt_scq, ribstat); |
| if (status != RDMA_SUCCESS) { |
| goto fail3; |
| } |
| |
| /* |
| * Create buffer pools. |
| * Note rib_rbuf_create also allocates memory windows. |
| */ |
| hca->recv_pool = rib_rbufpool_create(hca, |
| RECV_BUFFER, MAX_BUFS); |
| if (hca->recv_pool == NULL) { |
| goto fail3; |
| } |
| |
| hca->send_pool = rib_rbufpool_create(hca, |
| SEND_BUFFER, MAX_BUFS); |
| if (hca->send_pool == NULL) { |
| rib_rbufpool_destroy(hca, RECV_BUFFER); |
| goto fail3; |
| } |
| |
| if (hca->server_side_cache == NULL) { |
| (void) sprintf(rssc_name, |
| "rib_server_side_cache_%04d", i); |
| hca->server_side_cache = kmem_cache_create( |
| rssc_name, |
| sizeof (cache_avl_struct_t), 0, |
| NULL, |
| NULL, |
| rib_server_side_cache_reclaim, |
| hca, NULL, 0); |
| } |
| |
| avl_create(&hca->avl_tree, |
| avl_compare, |
| sizeof (cache_avl_struct_t), |
| (uint_t)(uintptr_t)&example_avl_node.avl_link- |
| (uint_t)(uintptr_t)&example_avl_node); |
| |
| rw_init(&hca->avl_rw_lock, |
| NULL, RW_DRIVER, hca->iblock); |
| mutex_init(&hca->cache_allocation, |
| NULL, MUTEX_DRIVER, NULL); |
| hca->avl_init = TRUE; |
| |
| /* Create kstats for the cache */ |
| ASSERT(INGLOBALZONE(curproc)); |
| |
| if (!stats_enabled) { |
| ksp = kstat_create_zone("unix", 0, "rpcib_cache", "rpc", |
| KSTAT_TYPE_NAMED, |
| sizeof (rpcib_kstat) / sizeof (kstat_named_t), |
| KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, |
| GLOBAL_ZONEID); |
| if (ksp) { |
| ksp->ks_data = (void *) &rpcib_kstat; |
| ksp->ks_update = rpcib_cache_kstat_update; |
| kstat_install(ksp); |
| stats_enabled = TRUE; |
| } |
| } |
| if (NULL == hca->reg_cache_clean_up) { |
| hca->reg_cache_clean_up = ddi_taskq_create(NULL, |
| "REG_CACHE_CLEANUP", 1, TASKQ_DEFAULTPRI, 0); |
| } |
| |
| /* |
| * Initialize the registered service list and |
| * the lock |
| */ |
| hca->service_list = NULL; |
| rw_init(&hca->service_list_lock, NULL, RW_DRIVER, hca->iblock); |
| |
| mutex_init(&hca->cb_lock, NULL, MUTEX_DRIVER, hca->iblock); |
| cv_init(&hca->cb_cv, NULL, CV_DRIVER, NULL); |
| rw_init(&hca->cl_conn_list.conn_lock, NULL, RW_DRIVER, |
| hca->iblock); |
| rw_init(&hca->srv_conn_list.conn_lock, NULL, RW_DRIVER, |
| hca->iblock); |
| rw_init(&hca->state_lock, NULL, RW_DRIVER, hca->iblock); |
| mutex_init(&hca->inuse_lock, NULL, MUTEX_DRIVER, hca->iblock); |
| hca->inuse = TRUE; |
| /* |
| * XXX One hca only. Add multi-hca functionality if needed |
| * later. |
| */ |
| ribstat->hca = hca; |
| ribstat->nhca_inited++; |
| ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz); |
| break; |
| |
| fail3: |
| ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz); |
| fail2: |
| (void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl); |
| fail1: |
| (void) ibt_close_hca(hca->hca_hdl); |
| |
| } |
| if (ribstat->hca != NULL) |
| return (RDMA_SUCCESS); |
| else |
| return (RDMA_FAILED); |
| } |
| |
| /* |
| * Callback routines |
| */ |
| |
| /* |
| * SCQ handlers |
| */ |
| /* ARGSUSED */ |
| static void |
| rib_clnt_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg) |
| { |
| ibt_status_t ibt_status; |
| ibt_wc_t wc; |
| int i; |
| |
| /* |
| * Re-enable cq notify here to avoid missing any |
| * completion queue notification. |
| */ |
| (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); |
| |
| ibt_status = IBT_SUCCESS; |
| while (ibt_status != IBT_CQ_EMPTY) { |
| bzero(&wc, sizeof (wc)); |
| ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); |
| if (ibt_status != IBT_SUCCESS) |
| return; |
| |
| /* |
| * Got a send completion |
| */ |
| if (wc.wc_id != NULL) { /* XXX can it be otherwise ???? */ |
| struct send_wid *wd = (struct send_wid *)(uintptr_t)wc.wc_id; |
| CONN *conn = qptoc(wd->qp); |
| |
| mutex_enter(&wd->sendwait_lock); |
| switch (wc.wc_status) { |
| case IBT_WC_SUCCESS: |
| wd->status = RDMA_SUCCESS; |
| break; |
| case IBT_WC_WR_FLUSHED_ERR: |
| wd->status = RDMA_FAILED; |
| break; |
| default: |
| /* |
| * RC Send Q Error Code Local state Remote State |
| * ==================== =========== ============ |
| * IBT_WC_BAD_RESPONSE_ERR ERROR None |
| * IBT_WC_LOCAL_LEN_ERR ERROR None |
| * IBT_WC_LOCAL_CHAN_OP_ERR ERROR None |
| * IBT_WC_LOCAL_PROTECT_ERR ERROR None |
| * IBT_WC_MEM_WIN_BIND_ERR ERROR None |
| * IBT_WC_REMOTE_INVALID_REQ_ERR ERROR ERROR |
| * IBT_WC_REMOTE_ACCESS_ERR ERROR ERROR |
| * IBT_WC_REMOTE_OP_ERR ERROR ERROR |
| * IBT_WC_RNR_NAK_TIMEOUT_ERR ERROR None |
| * IBT_WC_TRANS_TIMEOUT_ERR ERROR None |
| * IBT_WC_WR_FLUSHED_ERR None None |
| */ |
| /* |
| * Channel in error state. Set connection to |
| * ERROR and cleanup will happen either from |
| * conn_release or from rib_conn_get |
| */ |
| wd->status = RDMA_FAILED; |
| mutex_enter(&conn->c_lock); |
| if (conn->c_state != C_DISCONN_PEND) |
| conn->c_state = C_ERROR_CONN; |
| mutex_exit(&conn->c_lock); |
| break; |
| } |
| |
| if (wd->cv_sig == 1) { |
| /* |
| * Notify poster |
| */ |
| cv_signal(&wd->wait_cv); |
| mutex_exit(&wd->sendwait_lock); |
| } else { |
| /* |
| * Poster not waiting for notification. |
| * Free the send buffers and send_wid |
| */ |
| for (i = 0; i < wd->nsbufs; i++) { |
| rib_rbuf_free(qptoc(wd->qp), SEND_BUFFER, |
| (void *)(uintptr_t)wd->sbufaddr[i]); |
| } |
| mutex_exit(&wd->sendwait_lock); |
| (void) rib_free_sendwait(wd); |
| } |
| } |
| } |
| } |
| |
| /* ARGSUSED */ |
| static void |
| rib_svc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg) |
| { |
| ibt_status_t ibt_status; |
| ibt_wc_t wc; |
| int i; |
| |
| /* |
| * Re-enable cq notify here to avoid missing any |
| * completion queue notification. |
| */ |
| (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); |
| |
| ibt_status = IBT_SUCCESS; |
| while (ibt_status != IBT_CQ_EMPTY) { |
| bzero(&wc, sizeof (wc)); |
| ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); |
| if (ibt_status != IBT_SUCCESS) |
| return; |
| |
| /* |
| * Got a send completion |
| */ |
| if (wc.wc_id != NULL) { /* XXX NULL possible ???? */ |
| struct send_wid *wd = |
| (struct send_wid *)(uintptr_t)wc.wc_id; |
| mutex_enter(&wd->sendwait_lock); |
| if (wd->cv_sig == 1) { |
| /* |
| * Update completion status and notify poster |
| */ |
| if (wc.wc_status == IBT_WC_SUCCESS) |
| wd->status = RDMA_SUCCESS; |
| else |
| wd->status = RDMA_FAILED; |
| cv_signal(&wd->wait_cv); |
| mutex_exit(&wd->sendwait_lock); |
| } else { |
| /* |
| * Poster not waiting for notification. |
| * Free the send buffers and send_wid |
| */ |
| for (i = 0; i < wd->nsbufs; i++) { |
| rib_rbuf_free(qptoc(wd->qp), |
| SEND_BUFFER, |
| (void *)(uintptr_t)wd->sbufaddr[i]); |
| } |
| mutex_exit(&wd->sendwait_lock); |
| (void) rib_free_sendwait(wd); |
| } |
| } |
| } |
| } |
| |
| /* |
| * RCQ handler |
| */ |
| /* ARGSUSED */ |
| static void |
| rib_clnt_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg) |
| { |
| rib_qp_t *qp; |
| ibt_status_t ibt_status; |
| ibt_wc_t wc; |
| struct recv_wid *rwid; |
| |
| /* |
| * Re-enable cq notify here to avoid missing any |
| * completion queue notification. |
| */ |
| (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); |
| |
| ibt_status = IBT_SUCCESS; |
| while (ibt_status != IBT_CQ_EMPTY) { |
| bzero(&wc, sizeof (wc)); |
| ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); |
| if (ibt_status != IBT_SUCCESS) |
| return; |
| |
| rwid = (struct recv_wid *)(uintptr_t)wc.wc_id; |
| qp = rwid->qp; |
| if (wc.wc_status == IBT_WC_SUCCESS) { |
| XDR inxdrs, *xdrs; |
| uint_t xid, vers, op, find_xid = 0; |
| struct reply *r; |
| CONN *conn = qptoc(qp); |
| uint32_t rdma_credit = 0; |
| |
| xdrs = &inxdrs; |
| xdrmem_create(xdrs, (caddr_t)(uintptr_t)rwid->addr, |
| wc.wc_bytes_xfer, XDR_DECODE); |
| /* |
| * Treat xid as opaque (xid is the first entity |
| * in the rpc rdma message). |
| */ |
| xid = *(uint32_t *)(uintptr_t)rwid->addr; |
| |
| /* Skip xid and set the xdr position accordingly. */ |
| XDR_SETPOS(xdrs, sizeof (uint32_t)); |
| (void) xdr_u_int(xdrs, &vers); |
| (void) xdr_u_int(xdrs, &rdma_credit); |
| (void) xdr_u_int(xdrs, &op); |
| XDR_DESTROY(xdrs); |
| |
| if (vers != RPCRDMA_VERS) { |
| /* |
| * Invalid RPC/RDMA version. Cannot |
| * interoperate. Set connection to |
| * ERROR state and bail out. |
| */ |
| mutex_enter(&conn->c_lock); |
| if (conn->c_state != C_DISCONN_PEND) |
| conn->c_state = C_ERROR_CONN; |
| mutex_exit(&conn->c_lock); |
| rib_rbuf_free(conn, RECV_BUFFER, |
| (void *)(uintptr_t)rwid->addr); |
| rib_free_wid(rwid); |
| continue; |
| } |
| |
| mutex_enter(&qp->replylist_lock); |
| for (r = qp->replylist; r != NULL; r = r->next) { |
| if (r->xid == xid) { |
| find_xid = 1; |
| switch (op) { |
| case RDMA_MSG: |
| case RDMA_NOMSG: |
| case RDMA_MSGP: |
| r->status = RDMA_SUCCESS; |
| r->vaddr_cq = rwid->addr; |
| r->bytes_xfer = |
| wc.wc_bytes_xfer; |
| cv_signal(&r->wait_cv); |
| break; |
| default: |
| rib_rbuf_free(qptoc(qp), |
| RECV_BUFFER, |
| (void *)(uintptr_t) |
| rwid->addr); |
| break; |
| } |
| break; |
| } |
| } |
| mutex_exit(&qp->replylist_lock); |
| if (find_xid == 0) { |
| /* RPC caller not waiting for reply */ |
| |
| DTRACE_PROBE1(rpcib__i__nomatchxid1, |
| int, xid); |
| |
| rib_rbuf_free(qptoc(qp), RECV_BUFFER, |
| (void *)(uintptr_t)rwid->addr); |
| } |
| } else if (wc.wc_status == IBT_WC_WR_FLUSHED_ERR) { |
| CONN *conn = qptoc(qp); |
| |
| /* |
| * Connection being flushed. Just free |
| * the posted buffer |
| */ |
| rib_rbuf_free(conn, RECV_BUFFER, |
| (void *)(uintptr_t)rwid->addr); |
| } else { |
| CONN *conn = qptoc(qp); |
| /* |
| * RC Recv Q Error Code Local state Remote State |
| * ==================== =========== ============ |
| * IBT_WC_LOCAL_ACCESS_ERR ERROR ERROR when NAK recvd |
| * IBT_WC_LOCAL_LEN_ERR ERROR ERROR when NAK recvd |
| * IBT_WC_LOCAL_PROTECT_ERR ERROR ERROR when NAK recvd |
| * IBT_WC_LOCAL_CHAN_OP_ERR ERROR ERROR when NAK recvd |
| * IBT_WC_REMOTE_INVALID_REQ_ERR ERROR ERROR when NAK recvd |
| * IBT_WC_WR_FLUSHED_ERR None None |
| */ |
| /* |
| * Channel in error state. Set connection |
| * in ERROR state. |
| */ |
| mutex_enter(&conn->c_lock); |
| if (conn->c_state != C_DISCONN_PEND) |
| conn->c_state = C_ERROR_CONN; |
| mutex_exit(&conn->c_lock); |
| rib_rbuf_free(conn, RECV_BUFFER, |
| (void *)(uintptr_t)rwid->addr); |
| } |
| rib_free_wid(rwid); |
| } |
| } |
| |
| /* Server side */ |
| /* ARGSUSED */ |
| static void |
| rib_svc_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg) |
| { |
| rdma_recv_data_t *rdp; |
| rib_qp_t *qp; |
| ibt_status_t ibt_status; |
| ibt_wc_t wc; |
| struct svc_recv *s_recvp; |
| CONN *conn; |
| mblk_t *mp; |
| |
| /* |
| * Re-enable cq notify here to avoid missing any |
| * completion queue notification. |
| */ |
| (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); |
| |
| ibt_status = IBT_SUCCESS; |
| while (ibt_status != IBT_CQ_EMPTY) { |
| bzero(&wc, sizeof (wc)); |
| ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); |
| if (ibt_status != IBT_SUCCESS) |
| return; |
| |
| s_recvp = (struct svc_recv *)(uintptr_t)wc.wc_id; |
| qp = s_recvp->qp; |
| conn = qptoc(qp); |
| mutex_enter(&qp->posted_rbufs_lock); |
| qp->n_posted_rbufs--; |
| #if defined(MEASURE_POOL_DEPTH) |
| rib_posted_rbufs(preposted_rbufs - qp->n_posted_rbufs); |
| #endif |
| if (qp->n_posted_rbufs == 0) |
| cv_signal(&qp->posted_rbufs_cv); |
| mutex_exit(&qp->posted_rbufs_lock); |
| |
| if (wc.wc_status == IBT_WC_SUCCESS) { |
| XDR inxdrs, *xdrs; |
| uint_t xid, vers, op; |
| uint32_t rdma_credit; |
| |
| xdrs = &inxdrs; |
| /* s_recvp->vaddr stores data */ |
| xdrmem_create(xdrs, (caddr_t)(uintptr_t)s_recvp->vaddr, |
| wc.wc_bytes_xfer, XDR_DECODE); |
| |
| /* |
| * Treat xid as opaque (xid is the first entity |
| * in the rpc rdma message). |
| */ |
| xid = *(uint32_t *)(uintptr_t)s_recvp->vaddr; |
| /* Skip xid and set the xdr position accordingly. */ |
| XDR_SETPOS(xdrs, sizeof (uint32_t)); |
| if (!xdr_u_int(xdrs, &vers) || |
| !xdr_u_int(xdrs, &rdma_credit) || |
| !xdr_u_int(xdrs, &op)) { |
| rib_rbuf_free(conn, RECV_BUFFER, |
| (void *)(uintptr_t)s_recvp->vaddr); |
| XDR_DESTROY(xdrs); |
| (void) rib_free_svc_recv(s_recvp); |
| continue; |
| } |
| XDR_DESTROY(xdrs); |
| |
| if (vers != RPCRDMA_VERS) { |
| /* |
| * Invalid RPC/RDMA version. |
| * Drop rpc rdma message. |
| */ |
| rib_rbuf_free(conn, RECV_BUFFER, |
| (void *)(uintptr_t)s_recvp->vaddr); |
| (void) rib_free_svc_recv(s_recvp); |
| continue; |
| } |
| /* |
| * Is this for RDMA_DONE? |
| */ |
| if (op == RDMA_DONE) { |
| rib_rbuf_free(conn, RECV_BUFFER, |
| (void *)(uintptr_t)s_recvp->vaddr); |
| /* |
| * Wake up the thread waiting on |
| * a RDMA_DONE for xid |
| */ |
| mutex_enter(&qp->rdlist_lock); |
| rdma_done_notify(qp, xid); |
| mutex_exit(&qp->rdlist_lock); |
| (void) rib_free_svc_recv(s_recvp); |
| continue; |
| } |
| |
| mutex_enter(&plugin_state_lock); |
| if (plugin_state == ACCEPT) { |
| while ((mp = allocb(sizeof (*rdp), BPRI_LO)) |
| == NULL) |
| (void) strwaitbuf( |
| sizeof (*rdp), BPRI_LO); |
| /* |
| * Plugin is in accept state, hence the master |
| * transport queue for this is still accepting |
| * requests. Hence we can call svc_queuereq to |
| * queue this recieved msg. |
| */ |
| rdp = (rdma_recv_data_t *)mp->b_rptr; |
| rdp->conn = conn; |
| rdp->rpcmsg.addr = |
| (caddr_t)(uintptr_t)s_recvp->vaddr; |
| rdp->rpcmsg.type = RECV_BUFFER; |
| rdp->rpcmsg.len = wc.wc_bytes_xfer; |
| rdp->status = wc.wc_status; |
| mutex_enter(&conn->c_lock); |
| conn->c_ref++; |
| mutex_exit(&conn->c_lock); |
| mp->b_wptr += sizeof (*rdp); |
| svc_queuereq((queue_t *)rib_stat->q, mp); |
| mutex_exit(&plugin_state_lock); |
| } else { |
| /* |
| * The master transport for this is going |
| * away and the queue is not accepting anymore |
| * requests for krpc, so don't do anything, just |
| * free the msg. |
| */ |
| mutex_exit(&plugin_state_lock); |
| rib_rbuf_free(conn, RECV_BUFFER, |
| (void *)(uintptr_t)s_recvp->vaddr); |
| } |
| } else { |
| rib_rbuf_free(conn, RECV_BUFFER, |
| (void *)(uintptr_t)s_recvp->vaddr); |
| } |
| (void) rib_free_svc_recv(s_recvp); |
| } |
| } |
| |
| /* |
| * Handles DR event of IBT_HCA_DETACH_EVENT. |
| */ |
| /* ARGSUSED */ |
| static void |
| rib_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl, |
| ibt_async_code_t code, ibt_async_event_t *event) |
| { |
| |
| switch (code) { |
| case IBT_HCA_ATTACH_EVENT: |
| /* ignore */ |
| break; |
| case IBT_HCA_DETACH_EVENT: |
| { |
| ASSERT(rib_stat->hca->hca_hdl == hca_hdl); |
| rib_detach_hca(rib_stat->hca); |
| #ifdef DEBUG |
| cmn_err(CE_NOTE, "rib_async_handler(): HCA being detached!\n"); |
| #endif |
| break; |
| } |
| #ifdef DEBUG |
| case IBT_EVENT_PATH_MIGRATED: |
| cmn_err(CE_NOTE, "rib_async_handler(): " |
| "IBT_EVENT_PATH_MIGRATED\n"); |
| break; |
| case IBT_EVENT_SQD: |
| cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_SQD\n"); |
| break; |
| case IBT_EVENT_COM_EST: |
| cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_COM_EST\n"); |
| break; |
| case IBT_ERROR_CATASTROPHIC_CHAN: |
| cmn_err(CE_NOTE, "rib_async_handler(): " |
| "IBT_ERROR_CATASTROPHIC_CHAN\n"); |
| break; |
| case IBT_ERROR_INVALID_REQUEST_CHAN: |
| cmn_err(CE_NOTE, "rib_async_handler(): " |
| "IBT_ERROR_INVALID_REQUEST_CHAN\n"); |
| break; |
| case IBT_ERROR_ACCESS_VIOLATION_CHAN: |
| cmn_err(CE_NOTE, "rib_async_handler(): " |
| "IBT_ERROR_ACCESS_VIOLATION_CHAN\n"); |
| break; |
| case IBT_ERROR_PATH_MIGRATE_REQ: |
| cmn_err(CE_NOTE, "rib_async_handler(): " |
| "IBT_ERROR_PATH_MIGRATE_REQ\n"); |
| break; |
| case IBT_ERROR_CQ: |
| cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_CQ\n"); |
| break; |
| case IBT_ERROR_PORT_DOWN: |
| cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_PORT_DOWN\n"); |
| break; |
| case IBT_EVENT_PORT_UP: |
| cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_PORT_UP\n"); |
| break; |
| case IBT_ASYNC_OPAQUE1: |
| cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE1\n"); |
| break; |
| case IBT_ASYNC_OPAQUE2: |
| cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE2\n"); |
| break; |
| case IBT_ASYNC_OPAQUE3: |
| cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE3\n"); |
| break; |
| case IBT_ASYNC_OPAQUE4: |
| cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE4\n"); |
| break; |
| #endif |
| default: |
| break; |
| } |
| } |
| |
| /* |
| * Client's reachable function. |
| */ |
| static rdma_stat |
| rib_reachable(int addr_type, struct netbuf *raddr, void **handle) |
| { |
| rib_hca_t *hca; |
| rdma_stat status; |
| |
| /* |
| * First check if a hca is still attached |
| */ |
| *handle = NULL; |
| rw_enter(&rib_stat->hca->state_lock, RW_READER); |
| if (rib_stat->hca->state != HCA_INITED) { |
| rw_exit(&rib_stat->hca->state_lock); |
| return (RDMA_FAILED); |
| } |
| status = rib_ping_srv(addr_type, raddr, &hca); |
| rw_exit(&rib_stat->hca->state_lock); |
| |
| if (status == RDMA_SUCCESS) { |
| *handle = (void *)hca; |
| return (RDMA_SUCCESS); |
| } else { |
| *handle = NULL; |
| DTRACE_PROBE(rpcib__i__pingfailed); |
| return (RDMA_FAILED); |
| } |
| } |
| |
| /* Client side qp creation */ |
| static rdma_stat |
| rib_clnt_create_chan(rib_hca_t *hca, struct netbuf *raddr, rib_qp_t **qp) |
| { |
| rib_qp_t *kqp = NULL; |
| CONN *conn; |
| rdma_clnt_cred_ctrl_t *cc_info; |
| |
| ASSERT(qp != NULL); |
| *qp = NULL; |
| |
| kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP); |
| conn = qptoc(kqp); |
| kqp->hca = hca; |
| kqp->rdmaconn.c_rdmamod = &rib_mod; |
| kqp->rdmaconn.c_private = (caddr_t)kqp; |
| |
| kqp->mode = RIB_CLIENT; |
| kqp->chan_flags = IBT_BLOCKING; |
| conn->c_raddr.buf = kmem_alloc(raddr->len, KM_SLEEP); |
| bcopy(raddr->buf, conn->c_raddr.buf, raddr->len); |
| conn->c_raddr.len = conn->c_raddr.maxlen = raddr->len; |
| /* |
| * Initialize |
| */ |
| cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL); |
| cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL); |
| mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock); |
| mutex_init(&kqp->replylist_lock, NULL, MUTEX_DRIVER, hca->iblock); |
| mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock); |
| mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock); |
| cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL); |
| mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock); |
| /* |
| * Initialize the client credit control |
| * portion of the rdmaconn struct. |
| */ |
| kqp->rdmaconn.c_cc_type = RDMA_CC_CLNT; |
| cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc; |
| cc_info->clnt_cc_granted_ops = 0; |
| cc_info->clnt_cc_in_flight_ops = 0; |
| cv_init(&cc_info->clnt_cc_cv, NULL, CV_DEFAULT, NULL); |
| |
| *qp = kqp; |
| return (RDMA_SUCCESS); |
| } |
| |
| /* Server side qp creation */ |
| static rdma_stat |
| rib_svc_create_chan(rib_hca_t *hca, caddr_t q, uint8_t port, rib_qp_t **qp) |
| { |
| rib_qp_t *kqp = NULL; |
| ibt_chan_sizes_t chan_sizes; |
| ibt_rc_chan_alloc_args_t qp_attr; |
| ibt_status_t ibt_status; |
| rdma_srv_cred_ctrl_t *cc_info; |
| |
| *qp = NULL; |
| |
| kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP); |
| kqp->hca = hca; |
| kqp->port_num = port; |
| kqp->rdmaconn.c_rdmamod = &rib_mod; |
| kqp->rdmaconn.c_private = (caddr_t)kqp; |
| |
| /* |
| * Create the qp handle |
| */ |
| bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t)); |
| qp_attr.rc_scq = hca->svc_scq->rib_cq_hdl; |
| qp_attr.rc_rcq = hca->svc_rcq->rib_cq_hdl; |
| qp_attr.rc_pd = hca->pd_hdl; |
| qp_attr.rc_hca_port_num = port; |
| qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX; |
| qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX; |
| qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE; |
| qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE; |
| qp_attr.rc_clone_chan = NULL; |
| qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR; |
| qp_attr.rc_flags = IBT_WR_SIGNALED; |
| |
| rw_enter(&hca->state_lock, RW_READER); |
| if (hca->state != HCA_DETACHED) { |
| ibt_status = ibt_alloc_rc_channel(hca->hca_hdl, |
| IBT_ACHAN_NO_FLAGS, &qp_attr, &kqp->qp_hdl, |
| &chan_sizes); |
| } else { |
| rw_exit(&hca->state_lock); |
| goto fail; |
| } |
| rw_exit(&hca->state_lock); |
| |
| if (ibt_status != IBT_SUCCESS) { |
| DTRACE_PROBE1(rpcib__i_svccreatechanfail, |
| int, ibt_status); |
| goto fail; |
| } |
| |
| kqp->mode = RIB_SERVER; |
| kqp->chan_flags = IBT_BLOCKING; |
| kqp->q = q; /* server ONLY */ |
| |
| cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL); |
| cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL); |
| mutex_init(&kqp->replylist_lock, NULL, MUTEX_DEFAULT, hca->iblock); |
| mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock); |
| mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock); |
| mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock); |
| cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL); |
| mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock); |
| /* |
| * Set the private data area to qp to be used in callbacks |
| */ |
| ibt_set_chan_private(kqp->qp_hdl, (void *)kqp); |
| kqp->rdmaconn.c_state = C_CONNECTED; |
| |
| /* |
| * Initialize the server credit control |
| * portion of the rdmaconn struct. |
| */ |
| kqp->rdmaconn.c_cc_type = RDMA_CC_SRV; |
| cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_srv_cc; |
| cc_info->srv_cc_buffers_granted = preposted_rbufs; |
| cc_info->srv_cc_cur_buffers_used = 0; |
| cc_info->srv_cc_posted = preposted_rbufs; |
| |
| *qp = kqp; |
| |
| return (RDMA_SUCCESS); |
| fail: |
| if (kqp) |
| kmem_free(kqp, sizeof (rib_qp_t)); |
| |
| return (RDMA_FAILED); |
| } |
| |
| /* ARGSUSED */ |
| ibt_cm_status_t |
| rib_clnt_cm_handler(void *clnt_hdl, ibt_cm_event_t *event, |
| ibt_cm_return_args_t *ret_args, void *priv_data, |
| ibt_priv_data_len_t len) |
| { |
| rpcib_state_t *ribstat; |
| rib_hca_t *hca; |
| |
| ribstat = (rpcib_state_t *)clnt_hdl; |
| hca = (rib_hca_t *)ribstat->hca; |
| |
| switch (event->cm_type) { |
| |
| /* got a connection close event */ |
| case IBT_CM_EVENT_CONN_CLOSED: |
| { |
| CONN *conn; |
| rib_qp_t *qp; |
| |
| /* check reason why connection was closed */ |
| switch (event->cm_event.closed) { |
| case IBT_CM_CLOSED_DREP_RCVD: |
| case IBT_CM_CLOSED_DREQ_TIMEOUT: |
| case IBT_CM_CLOSED_DUP: |
| case IBT_CM_CLOSED_ABORT: |
| case IBT_CM_CLOSED_ALREADY: |
| /* |
| * These cases indicate the local end initiated |
| * the closing of the channel. Nothing to do here. |
| */ |
| break; |
| default: |
| /* |
| * Reason for CONN_CLOSED event must be one of |
| * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD |
| * or IBT_CM_CLOSED_STALE. These indicate cases were |
| * the remote end is closing the channel. In these |
| * cases free the channel and transition to error |
| * state |
| */ |
| qp = ibt_get_chan_private(event->cm_channel); |
| conn = qptoc(qp); |
| mutex_enter(&conn->c_lock); |
| if (conn->c_state == C_DISCONN_PEND) { |
| mutex_exit(&conn->c_lock); |
| break; |
| } |
| |
| conn->c_state = C_ERROR_CONN; |
| |
| /* |
| * Free the rc_channel. Channel has already |
| * transitioned to ERROR state and WRs have been |
| * FLUSHED_ERR already. |
| */ |
| (void) ibt_free_channel(qp->qp_hdl); |
| qp->qp_hdl = NULL; |
| |
| /* |
| * Free the conn if c_ref is down to 0 already |
| */ |
| if (conn->c_ref == 0) { |
| /* |
| * Remove from list and free conn |
| */ |
| conn->c_state = C_DISCONN_PEND; |
| mutex_exit(&conn->c_lock); |
| (void) rib_disconnect_channel(conn, |
| &hca->cl_conn_list); |
| } else { |
| mutex_exit(&conn->c_lock); |
| } |
| #ifdef DEBUG |
| if (rib_debug) |
| cmn_err(CE_NOTE, "rib_clnt_cm_handler: " |
| "(CONN_CLOSED) channel disconnected"); |
| #endif |
| break; |
| } |
| break; |
| } |
| default: |
| break; |
| } |
| return (IBT_CM_ACCEPT); |
| } |
| |
| /* Check server ib address */ |
| rdma_stat |
| rib_chk_srv_ibaddr(struct netbuf *raddr, |
| int addr_type, ibt_path_info_t *path, ibt_ip_addr_t *s_ip, |
| ibt_ip_addr_t *d_ip) |
| { |
| struct sockaddr_in *sin4; |
| struct sockaddr_in6 *sin6; |
| ibt_status_t ibt_status; |
| ibt_ip_path_attr_t ipattr; |
| uint8_t npaths = 0; |
| ibt_path_ip_src_t srcip; |
| |
| ASSERT(raddr->buf != NULL); |
| |
| (void) bzero(path, sizeof (ibt_path_info_t)); |
| |
| switch (addr_type) { |
| case AF_INET: |
| sin4 = (struct sockaddr_in *)raddr->buf; |
| d_ip->family = AF_INET; |
| d_ip->un.ip4addr = sin4->sin_addr.s_addr; |
| break; |
| |
| case AF_INET6: |
| sin6 = (struct sockaddr_in6 *)raddr->buf; |
| d_ip->family = AF_INET6; |
| d_ip->un.ip6addr = sin6->sin6_addr; |
| break; |
| |
| default: |
| return (RDMA_INVAL); |
| } |
| |
| bzero(&ipattr, sizeof (ibt_ip_path_attr_t)); |
| bzero(&srcip, sizeof (ibt_path_ip_src_t)); |
| |
| ipattr.ipa_dst_ip = d_ip; |
| ipattr.ipa_hca_guid = rib_stat->hca->hca_guid; |
| ipattr.ipa_ndst = 1; |
| ipattr.ipa_max_paths = 1; |
| npaths = 0; |
| |
| ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl, |
| IBT_PATH_NO_FLAGS, |
| &ipattr, |
| path, |
| &npaths, |
| &srcip); |
| |
| if (ibt_status != IBT_SUCCESS || |
| npaths < 1 || |
| path->pi_hca_guid != rib_stat->hca->hca_guid) { |
| |
| bzero(s_ip, sizeof (ibt_path_ip_src_t)); |
| return (RDMA_FAILED); |
| } |
| |
| if (srcip.ip_primary.family == AF_INET) { |
| s_ip->family = AF_INET; |
| s_ip->un.ip4addr = srcip.ip_primary.un.ip4addr; |
| } else { |
| s_ip->family = AF_INET6; |
| s_ip->un.ip6addr = srcip.ip_primary.un.ip6addr; |
| } |
| |
| return (RDMA_SUCCESS); |
| } |
| |
| |
| /* |
| * Connect to the server. |
| */ |
| rdma_stat |
| rib_conn_to_srv(rib_hca_t *hca, rib_qp_t *qp, ibt_path_info_t *path, |
| ibt_ip_addr_t *s_ip, ibt_ip_addr_t *d_ip) |
| { |
| ibt_chan_open_args_t chan_args; /* channel args */ |
| ibt_chan_sizes_t chan_sizes; |
| ibt_rc_chan_alloc_args_t qp_attr; |
| ibt_status_t ibt_status; |
| ibt_rc_returns_t ret_args; /* conn reject info */ |
| int refresh = REFRESH_ATTEMPTS; /* refresh if IBT_CM_CONN_STALE */ |
| ibt_ip_cm_info_t ipcm_info; |
| uint8_t cmp_ip_pvt[IBT_IP_HDR_PRIV_DATA_SZ]; |
| |
| |
| (void) bzero(&chan_args, sizeof (chan_args)); |
| (void) bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t)); |
| (void) bzero(&ipcm_info, sizeof (ibt_ip_cm_info_t)); |
| |
| switch (ipcm_info.src_addr.family = s_ip->family) { |
| case AF_INET: |
| ipcm_info.src_addr.un.ip4addr = s_ip->un.ip4addr; |
| break; |
| case AF_INET6: |
| ipcm_info.src_addr.un.ip6addr = s_ip->un.ip6addr; |
| break; |
| } |
| |
| switch (ipcm_info.dst_addr.family = d_ip->family) { |
| case AF_INET: |
| ipcm_info.dst_addr.un.ip4addr = d_ip->un.ip4addr; |
| break; |
| case AF_INET6: |
| ipcm_info.dst_addr.un.ip6addr = d_ip->un.ip6addr; |
| break; |
| } |
| |
| ipcm_info.src_port = NFS_RDMA_PORT; |
| |
| ibt_status = ibt_format_ip_private_data(&ipcm_info, |
| IBT_IP_HDR_PRIV_DATA_SZ, cmp_ip_pvt); |
| |
| if (ibt_status != IBT_SUCCESS) { |
| cmn_err(CE_WARN, "ibt_format_ip_private_data failed\n"); |
| return (-1); |
| } |
| |
| qp_attr.rc_hca_port_num = path->pi_prim_cep_path.cep_hca_port_num; |
| /* Alloc a RC channel */ |
| qp_attr.rc_scq = hca->clnt_scq->rib_cq_hdl; |
| qp_attr.rc_rcq = hca->clnt_rcq->rib_cq_hdl; |
| qp_attr.rc_pd = hca->pd_hdl; |
| qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX; |
| qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX; |
| qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE; |
| qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE; |
| qp_attr.rc_clone_chan = NULL; |
| qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR; |
| qp_attr.rc_flags = IBT_WR_SIGNALED; |
| |
| path->pi_sid = ibt_get_ip_sid(IPPROTO_TCP, NFS_RDMA_PORT); |
| chan_args.oc_path = path; |
| chan_args.oc_cm_handler = rib_clnt_cm_handler; |
| chan_args.oc_cm_clnt_private = (void *)rib_stat; |
| chan_args.oc_rdma_ra_out = 4; |
| chan_args.oc_rdma_ra_in = 4; |
| chan_args.oc_path_retry_cnt = 2; |
| chan_args.oc_path_rnr_retry_cnt = RNR_RETRIES; |
| chan_args.oc_priv_data = cmp_ip_pvt; |
| chan_args.oc_priv_data_len = IBT_IP_HDR_PRIV_DATA_SZ; |
| |
| refresh: |
| rw_enter(&hca->state_lock, RW_READER); |
| if (hca->state != HCA_DETACHED) { |
| ibt_status = ibt_alloc_rc_channel(hca->hca_hdl, |
| IBT_ACHAN_NO_FLAGS, |
| &qp_attr, &qp->qp_hdl, |
| &chan_sizes); |
| } else { |
| rw_exit(&hca->state_lock); |
| return (RDMA_FAILED); |
| } |
| rw_exit(&hca->state_lock); |
| |
| if (ibt_status != IBT_SUCCESS) { |
| DTRACE_PROBE1(rpcib__i_conntosrv, |
| int, ibt_status); |
| return (RDMA_FAILED); |
| } |
| |
| /* Connect to the Server */ |
| (void) bzero(&ret_args, sizeof (ret_args)); |
| mutex_enter(&qp->cb_lock); |
| ibt_status = ibt_open_rc_channel(qp->qp_hdl, IBT_OCHAN_NO_FLAGS, |
| IBT_BLOCKING, &chan_args, &ret_args); |
| if (ibt_status != IBT_SUCCESS) { |
| DTRACE_PROBE2(rpcib__i_openrctosrv, |
| int, ibt_status, int, ret_args.rc_status); |
| |
| (void) ibt_free_channel(qp->qp_hdl); |
| qp->qp_hdl = NULL; |
| mutex_exit(&qp->cb_lock); |
| if (refresh-- && ibt_status == IBT_CM_FAILURE && |
| ret_args.rc_status == IBT_CM_CONN_STALE) { |
| /* |
| * Got IBT_CM_CONN_STALE probably because of stale |
| * data on the passive end of a channel that existed |
| * prior to reboot. Retry establishing a channel |
| * REFRESH_ATTEMPTS times, during which time the |
| * stale conditions on the server might clear up. |
| */ |
| goto refresh; |
| } |
| return (RDMA_FAILED); |
| } |
| mutex_exit(&qp->cb_lock); |
| /* |
| * Set the private data area to qp to be used in callbacks |
| */ |
| ibt_set_chan_private(qp->qp_hdl, (void *)qp); |
| return (RDMA_SUCCESS); |
| } |
| |
| rdma_stat |
| rib_ping_srv(int addr_type, struct netbuf *raddr, rib_hca_t **hca) |
| { |
| uint_t i; |
| ibt_path_info_t path; |
| ibt_status_t ibt_status; |
| uint8_t num_paths_p; |
| ibt_ip_path_attr_t ipattr; |
| ibt_ip_addr_t dstip; |
| ibt_path_ip_src_t srcip; |
| rpcib_ipaddrs_t addrs4; |
| rpcib_ipaddrs_t addrs6; |
| struct sockaddr_in *sinp; |
| struct sockaddr_in6 *sin6p; |
| rdma_stat retval = RDMA_SUCCESS; |
| |
| *hca = NULL; |
| ASSERT(raddr->buf != NULL); |
| |
| bzero(&path, sizeof (ibt_path_info_t)); |
| bzero(&ipattr, sizeof (ibt_ip_path_attr_t)); |
| bzero(&srcip, sizeof (ibt_path_ip_src_t)); |
| |
| if (!rpcib_get_ib_addresses(&addrs4, &addrs6) || |
| (addrs4.ri_count == 0 && addrs6.ri_count == 0)) { |
| retval = RDMA_FAILED; |
| goto done; |
| } |
| |
| /* Prep the destination address */ |
| switch (addr_type) { |
| case AF_INET: |
| sinp = (struct sockaddr_in *)raddr->buf; |
| dstip.family = AF_INET; |
| dstip.un.ip4addr = sinp->sin_addr.s_addr; |
| sinp = addrs4.ri_list; |
| |
| for (i = 0; i < addrs4.ri_count; i++) { |
| num_paths_p = 0; |
| ipattr.ipa_dst_ip = &dstip; |
| ipattr.ipa_hca_guid = rib_stat->hca->hca_guid; |
| ipattr.ipa_ndst = 1; |
| ipattr.ipa_max_paths = 1; |
| ipattr.ipa_src_ip.family = dstip.family; |
| ipattr.ipa_src_ip.un.ip4addr = sinp[i].sin_addr.s_addr; |
| |
| ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl, |
| IBT_PATH_NO_FLAGS, &ipattr, &path, &num_paths_p, |
| &srcip); |
| if (ibt_status == IBT_SUCCESS && |
| num_paths_p != 0 && |
| path.pi_hca_guid == rib_stat->hca->hca_guid) { |
| *hca = rib_stat->hca; |
| goto done; |
| } |
| } |
| retval = RDMA_FAILED; |
| break; |
| |
| case AF_INET6: |
| sin6p = (struct sockaddr_in6 *)raddr->buf; |
| dstip.family = AF_INET6; |
| dstip.un.ip6addr = sin6p->sin6_addr; |
| sin6p = addrs6.ri_list; |
| |
| for (i = 0; i < addrs6.ri_count; i++) { |
| num_paths_p = 0; |
| ipattr.ipa_dst_ip = &dstip; |
| ipattr.ipa_hca_guid = rib_stat->hca->hca_guid; |
| ipattr.ipa_ndst = 1; |
| ipattr.ipa_max_paths = 1; |
| ipattr.ipa_src_ip.family = dstip.family; |
| ipattr.ipa_src_ip.un.ip6addr = sin6p[i].sin6_addr; |
| |
| ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl, |
| IBT_PATH_NO_FLAGS, &ipattr, &path, &num_paths_p, |
| &srcip); |
| if (ibt_status == IBT_SUCCESS && |
| num_paths_p != 0 && |
| path.pi_hca_guid == rib_stat->hca->hca_guid) { |
| *hca = rib_stat->hca; |
| goto done; |
| } |
| } |
| retval = RDMA_FAILED; |
| break; |
| |
| default: |
| retval = RDMA_INVAL; |
| break; |
| } |
| done: |
| if (addrs4.ri_size > 0) |
| kmem_free(addrs4.ri_list, addrs4.ri_size); |
| if (addrs6.ri_size > 0) |
| kmem_free(addrs6.ri_list, addrs6.ri_size); |
| return (retval); |
| } |
| |
| /* |
| * Close channel, remove from connection list and |
| * free up resources allocated for that channel. |
| */ |
| rdma_stat |
| rib_disconnect_channel(CONN *conn, rib_conn_list_t *conn_list) |
| { |
| rib_qp_t *qp = ctoqp(conn); |
| rib_hca_t *hca; |
| |
| /* |
| * c_ref == 0 and connection is in C_DISCONN_PEND |
| */ |
| hca = qp->hca; |
| if (conn_list != NULL) |
| (void) rib_rm_conn(conn, conn_list); |
| |
| if (qp->qp_hdl != NULL) { |
| /* |
| * If the channel has not been establised, |
| * ibt_flush_channel is called to flush outstanding WRs |
| * on the Qs. Otherwise, ibt_close_rc_channel() is |
| * called. The channel is then freed. |
| */ |
| if (conn_list != NULL) |
| (void) ibt_close_rc_channel(qp->qp_hdl, |
| IBT_BLOCKING, NULL, 0, NULL, NULL, 0); |
| else |
| (void) ibt_flush_channel(qp->qp_hdl); |
| |
| mutex_enter(&qp->posted_rbufs_lock); |
| while (qp->n_posted_rbufs) |
| cv_wait(&qp->posted_rbufs_cv, &qp->posted_rbufs_lock); |
| mutex_exit(&qp->posted_rbufs_lock); |
| (void) ibt_free_channel(qp->qp_hdl); |
| qp->qp_hdl = NULL; |
| } |
| |
| ASSERT(qp->rdlist == NULL); |
| |
| if (qp->replylist != NULL) { |
| (void) rib_rem_replylist(qp); |
| } |
| |
| cv_destroy(&qp->cb_conn_cv); |
| cv_destroy(&qp->posted_rbufs_cv); |
| mutex_destroy(&qp->cb_lock); |
| |
| mutex_destroy(&qp->replylist_lock); |
| mutex_destroy(&qp->posted_rbufs_lock); |
| mutex_destroy(&qp->rdlist_lock); |
| |
| cv_destroy(&conn->c_cv); |
| mutex_destroy(&conn->c_lock); |
| |
| if (conn->c_raddr.buf != NULL) { |
| kmem_free(conn->c_raddr.buf, conn->c_raddr.len); |
| } |
| if (conn->c_laddr.buf != NULL) { |
| kmem_free(conn->c_laddr.buf, conn->c_laddr.len); |
| } |
| |
| /* |
| * Credit control cleanup. |
| */ |
| if (qp->rdmaconn.c_cc_type == RDMA_CC_CLNT) { |
| rdma_clnt_cred_ctrl_t *cc_info; |
| cc_info = &qp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc; |
| cv_destroy(&cc_info->clnt_cc_cv); |
| } |
| |
| kmem_free(qp, sizeof (rib_qp_t)); |
| |
| /* |
| * If HCA has been DETACHED and the srv/clnt_conn_list is NULL, |
| * then the hca is no longer being used. |
| */ |
| if (conn_list != NULL) { |
| rw_enter(&hca->state_lock, RW_READER); |
| if (hca->state == HCA_DETACHED) { |
| rw_enter(&hca->srv_conn_list.conn_lock, RW_READER); |
| if (hca->srv_conn_list.conn_hd == NULL) { |
| rw_enter(&hca->cl_conn_list.conn_lock, |
| RW_READER); |
| |
| if (hca->cl_conn_list.conn_hd == NULL) { |
| mutex_enter(&hca->inuse_lock); |
| hca->inuse = FALSE; |
| cv_signal(&hca->cb_cv); |
| mutex_exit(&hca->inuse_lock); |
| } |
| rw_exit(&hca->cl_conn_list.conn_lock); |
| } |
| rw_exit(&hca->srv_conn_list.conn_lock); |
| } |
| rw_exit(&hca->state_lock); |
| } |
| |
| return (RDMA_SUCCESS); |
| } |
| |
| /* |
| * Wait for send completion notification. Only on receiving a |
| * notification be it a successful or error completion, free the |
| * send_wid. |
| */ |
| static rdma_stat |
| rib_sendwait(rib_qp_t *qp, struct send_wid *wd) |
| { |
| clock_t timout, cv_wait_ret; |
| rdma_stat error = RDMA_SUCCESS; |
| int i; |
| |
| /* |
| * Wait for send to complete |
| */ |
| ASSERT(wd != NULL); |
| mutex_enter(&wd->sendwait_lock); |
| if (wd->status == (uint_t)SEND_WAIT) { |
| timout = drv_usectohz(SEND_WAIT_TIME * 1000000) + |
| ddi_get_lbolt(); |
| |
| if (qp->mode == RIB_SERVER) { |
| while ((cv_wait_ret = cv_timedwait(&wd->wait_cv, |
| &wd->sendwait_lock, timout)) > 0 && |
| wd->status == (uint_t)SEND_WAIT) |
| ; |
| switch (cv_wait_ret) { |
| case -1: /* timeout */ |
| DTRACE_PROBE(rpcib__i__srvsendwait__timeout); |
| |
| wd->cv_sig = 0; /* no signal needed */ |
| error = RDMA_TIMEDOUT; |
| break; |
| default: /* got send completion */ |
| break; |
| } |
| } else { |
| while ((cv_wait_ret = cv_timedwait_sig(&wd->wait_cv, |
| &wd->sendwait_lock, timout)) > 0 && |
| wd->status == (uint_t)SEND_WAIT) |
| ; |
| switch (cv_wait_ret) { |
| case -1: /* timeout */ |
| DTRACE_PROBE(rpcib__i__clntsendwait__timeout); |
| |
| wd->cv_sig = 0; /* no signal needed */ |
| error = RDMA_TIMEDOUT; |
| break; |
| case 0: /* interrupted */ |
| DTRACE_PROBE(rpcib__i__clntsendwait__intr); |
| |
| wd->cv_sig = 0; /* no signal needed */ |
| error = RDMA_INTR; |
| break; |
| default: /* got send completion */ |
| break; |
| } |
| } |
| } |
| |
| if (wd->status != (uint_t)SEND_WAIT) { |
| /* got send completion */ |
| if (wd->status != RDMA_SUCCESS) { |
| error = wd->status; |
| if (wd->status != RDMA_CONNLOST) |
| error = RDMA_FAILED; |
| } |
| for (i = 0; i < wd->nsbufs; i++) { |
| rib_rbuf_free(qptoc(qp), SEND_BUFFER, |
| (void *)(uintptr_t)wd->sbufaddr[i]); |
| } |
| mutex_exit(&wd->sendwait_lock); |
| (void) rib_free_sendwait(wd); |
| } else { |
| mutex_exit(&wd->sendwait_lock); |
| } |
| return (error); |
| } |
| |
| static struct send_wid * |
| rib_init_sendwait(uint32_t xid, int cv_sig, rib_qp_t *qp) |
| { |
| struct send_wid *wd; |
| |
| wd = kmem_zalloc(sizeof (struct send_wid), KM_SLEEP); |
| wd->xid = xid; |
| wd->cv_sig = cv_sig; |
| wd->qp = qp; |
| cv_init(&wd->wait_cv, NULL, CV_DEFAULT, NULL); |
| mutex_init(&wd->sendwait_lock, NULL, MUTEX_DRIVER, NULL); |
| wd->status = (uint_t)SEND_WAIT; |
| |
| return (wd); |
| } |
| |
| static int |
| rib_free_sendwait(struct send_wid *wdesc) |
| { |
| cv_destroy(&wdesc->wait_cv); |
| mutex_destroy(&wdesc->sendwait_lock); |
| kmem_free(wdesc, sizeof (*wdesc)); |
| |
| return (0); |
| } |
| |
| static rdma_stat |
| rib_rem_rep(rib_qp_t *qp, struct reply *rep) |
| { |
| mutex_enter(&qp->replylist_lock); |
| if (rep != NULL) { |
| (void) rib_remreply(qp, rep); |
| mutex_exit(&qp->replylist_lock); |
| return (RDMA_SUCCESS); |
| } |
| mutex_exit(&qp->replylist_lock); |
| return (RDMA_FAILED); |
| } |
| |
| /* |
| * Send buffers are freed here only in case of error in posting |
| * on QP. If the post succeeded, the send buffers are freed upon |
| * send completion in rib_sendwait() or in the scq_handler. |
| */ |
| rdma_stat |
| rib_send_and_wait(CONN *conn, struct clist *cl, uint32_t msgid, |
| int send_sig, int cv_sig, caddr_t *swid) |
| { |
| struct send_wid *wdesc; |
| struct clist *clp; |
| ibt_status_t ibt_status = IBT_SUCCESS; |
| rdma_stat ret = RDMA_SUCCESS; |
| ibt_send_wr_t tx_wr; |
| int i, nds; |
| ibt_wr_ds_t sgl[DSEG_MAX]; |
| uint_t total_msg_size; |
| rib_qp_t *qp; |
| |
| qp = ctoqp(conn); |
| |
| ASSERT(cl != NULL); |
| |
| bzero(&tx_wr, sizeof (ibt_send_wr_t)); |
| |
| nds = 0; |
| total_msg_size = 0; |
| clp = cl; |
| while (clp != NULL) { |
| if (nds >= DSEG_MAX) { |
| DTRACE_PROBE(rpcib__i__sendandwait_dsegmax_exceeded); |
| return (RDMA_FAILED); |
| } |
| sgl[nds].ds_va = clp->w.c_saddr; |
| sgl[nds].ds_key = clp->c_smemhandle.mrc_lmr; /* lkey */ |
| sgl[nds].ds_len = clp->c_len; |
| total_msg_size += clp->c_len; |
| clp = clp->c_next; |
| nds++; |
| } |
| |
| if (send_sig) { |
| /* Set SEND_SIGNAL flag. */ |
| tx_wr.wr_flags = IBT_WR_SEND_SIGNAL; |
| wdesc = rib_init_sendwait(msgid, cv_sig, qp); |
| *swid = (caddr_t)wdesc; |
| } else { |
| tx_wr.wr_flags = IBT_WR_NO_FLAGS; |
| wdesc = rib_init_sendwait(msgid, 0, qp); |
| *swid = (caddr_t)wdesc; |
| } |
| wdesc->nsbufs = nds; |
| for (i = 0; i < nds; i++) { |
| wdesc->sbufaddr[i] = sgl[i].ds_va; |
| } |
| |
| tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc; |
| tx_wr.wr_opcode = IBT_WRC_SEND; |
| tx_wr.wr_trans = IBT_RC_SRV; |
| tx_wr.wr_nds = nds; |
| tx_wr.wr_sgl = sgl; |
| |
| mutex_enter(&conn->c_lock); |
| if (conn->c_state == C_CONNECTED) { |
| ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL); |
| } |
| if (conn->c_state != C_CONNECTED || |
| ibt_status != IBT_SUCCESS) { |
| if (conn->c_state != C_DISCONN_PEND) |
| conn->c_state = C_ERROR_CONN; |
| mutex_exit(&conn->c_lock); |
| for (i = 0; i < nds; i++) { |
| rib_rbuf_free(conn, SEND_BUFFER, |
| (void *)(uintptr_t)wdesc->sbufaddr[i]); |
| } |
| |
| (void) rib_free_sendwait(wdesc); |
| |
| return (RDMA_CONNLOST); |
| } |
| mutex_exit(&conn->c_lock); |
| |
| if (send_sig) { |
| if (cv_sig) { |
| /* |
| * cv_wait for send to complete. |
| * We can fail due to a timeout or signal or |
| * unsuccessful send. |
| */ |
| ret = rib_sendwait(qp, wdesc); |
| |
| return (ret); |
| } |
| } |
| |
| return (RDMA_SUCCESS); |
| } |
| |
| |
| rdma_stat |
| rib_send(CONN *conn, struct clist *cl, uint32_t msgid) |
| { |
| rdma_stat ret; |
| caddr_t wd; |
| |
| /* send-wait & cv_signal */ |
| ret = rib_send_and_wait(conn, cl, msgid, 1, 1, &wd); |
| return (ret); |
| } |
| |
| /* |
| * Server interface (svc_rdma_ksend). |
| * Send RPC reply and wait for RDMA_DONE. |
| */ |
| rdma_stat |
| rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid) |
| { |
| rdma_stat ret = RDMA_SUCCESS; |
| struct rdma_done_list *rd; |
| clock_t timout, cv_wait_ret; |
| caddr_t *wid = NULL; |
| rib_qp_t *qp = ctoqp(conn); |
| |
| mutex_enter(&qp->rdlist_lock); |
| rd = rdma_done_add(qp, msgid); |
| |
| /* No cv_signal (whether send-wait or no-send-wait) */ |
| ret = rib_send_and_wait(conn, cl, msgid, 1, 0, wid); |
| |
| if (ret != RDMA_SUCCESS) { |
| rdma_done_rm(qp, rd); |
| } else { |
| /* |
| * Wait for RDMA_DONE from remote end |
| */ |
| timout = |
| drv_usectohz(REPLY_WAIT_TIME * 1000000) + ddi_get_lbolt(); |
| cv_wait_ret = cv_timedwait(&rd->rdma_done_cv, |
| &qp->rdlist_lock, |
| timout); |
| |
| rdma_done_rm(qp, rd); |
| |
| if (cv_wait_ret < 0) { |
| ret = RDMA_TIMEDOUT; |
| } |
| } |
| |
| mutex_exit(&qp->rdlist_lock); |
| return (ret); |
| } |
| |
| static struct recv_wid * |
| rib_create_wid(rib_qp_t *qp, ibt_wr_ds_t *sgl, uint32_t msgid) |
| { |
| struct recv_wid *rwid; |
| |
| rwid = kmem_zalloc(sizeof (struct recv_wid), KM_SLEEP); |
| rwid->xid = msgid; |
| rwid->addr = sgl->ds_va; |
| rwid->qp = qp; |
| |
| return (rwid); |
| } |
| |
| static void |
| rib_free_wid(struct recv_wid *rwid) |
| { |
| kmem_free(rwid, sizeof (struct recv_wid)); |
| } |
| |
| rdma_stat |
| rib_clnt_post(CONN* conn, struct clist *cl, uint32_t msgid) |
| { |
| rib_qp_t *qp = ctoqp(conn); |
| struct clist *clp = cl; |
| struct reply *rep; |
| struct recv_wid *rwid; |
| int nds; |
| ibt_wr_ds_t sgl[DSEG_MAX]; |
| ibt_recv_wr_t recv_wr; |
| rdma_stat ret; |
| ibt_status_t ibt_status; |
| |
| /* |
| * rdma_clnt_postrecv uses RECV_BUFFER. |
| */ |
| |
| nds = 0; |
| while (cl != NULL) { |
| if (nds >= DSEG_MAX) { |
| ret = RDMA_FAILED; |
| goto done; |
| } |
| sgl[nds].ds_va = cl->w.c_saddr; |
| sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */ |
| sgl[nds].ds_len = cl->c_len; |
| cl = cl->c_next; |
| nds++; |
| } |
| |
| if (nds != 1) { |
| ret = RDMA_FAILED; |
| goto done; |
| } |
| |
| bzero(&recv_wr, sizeof (ibt_recv_wr_t)); |
| recv_wr.wr_nds = nds; |
| recv_wr.wr_sgl = sgl; |
| |
| rwid = rib_create_wid(qp, &sgl[0], msgid); |
| if (rwid) { |
| recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)rwid; |
| } else { |
| ret = RDMA_NORESOURCE; |
| goto done; |
| } |
| rep = rib_addreplylist(qp, msgid); |
| if (!rep) { |
| rib_free_wid(rwid); |
| ret = RDMA_NORESOURCE; |
| goto done; |
| } |
| |
| mutex_enter(&conn->c_lock); |
| |
| if (conn->c_state == C_CONNECTED) { |
| ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL); |
| } |
| |
| if (conn->c_state != C_CONNECTED || |
| ibt_status != IBT_SUCCESS) { |
| if (conn->c_state != C_DISCONN_PEND) |
| conn->c_state = C_ERROR_CONN; |
| mutex_exit(&conn->c_lock); |
| rib_free_wid(rwid); |
| (void) rib_rem_rep(qp, rep); |
| ret = RDMA_CONNLOST; |
| goto done; |
| } |
| mutex_exit(&conn->c_lock); |
| return (RDMA_SUCCESS); |
| |
| done: |
| while (clp != NULL) { |
| rib_rbuf_free(conn, RECV_BUFFER, |
| (void *)(uintptr_t)clp->w.c_saddr3); |
| clp = clp->c_next; |
| } |
| return (ret); |
| } |
| |
| rdma_stat |
| rib_svc_post(CONN* conn, struct clist *cl) |
| { |
| rib_qp_t *qp = ctoqp(conn); |
| struct svc_recv *s_recvp; |
| int nds; |
| ibt_wr_ds_t sgl[DSEG_MAX]; |
| ibt_recv_wr_t recv_wr; |
| ibt_status_t ibt_status; |
| |
| nds = 0; |
| while (cl != NULL) { |
| if (nds >= DSEG_MAX) { |
| return (RDMA_FAILED); |
| } |
| sgl[nds].ds_va = cl->w.c_saddr; |
| sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */ |
| sgl[nds].ds_len = cl->c_len; |
| cl = cl->c_next; |
| nds++; |
| } |
| |
| if (nds != 1) { |
| rib_rbuf_free(conn, RECV_BUFFER, |
| (caddr_t)(uintptr_t)sgl[0].ds_va); |
| |
| return (RDMA_FAILED); |
| } |
| |
| bzero(&recv_wr, sizeof (ibt_recv_wr_t)); |
| recv_wr.wr_nds = nds; |
| recv_wr.wr_sgl = sgl; |
| |
| s_recvp = rib_init_svc_recv(qp, &sgl[0]); |
| /* Use s_recvp's addr as wr id */ |
| recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)s_recvp; |
| mutex_enter(&conn->c_lock); |
| if (conn->c_state == C_CONNECTED) { |
| ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL); |
| } |
| if (conn->c_state != C_CONNECTED || |
| ibt_status != IBT_SUCCESS) { |
| if (conn->c_state != C_DISCONN_PEND) |
| conn->c_state = C_ERROR_CONN; |
| mutex_exit(&conn->c_lock); |
| rib_rbuf_free(conn, RECV_BUFFER, |
| (caddr_t)(uintptr_t)sgl[0].ds_va); |
| (void) rib_free_svc_recv(s_recvp); |
| |
| return (RDMA_CONNLOST); |
| } |
| mutex_exit(&conn->c_lock); |
| |
| return (RDMA_SUCCESS); |
| } |
| |
| /* Client */ |
| rdma_stat |
| rib_post_resp(CONN* conn, struct clist *cl, uint32_t msgid) |
| { |
| |
| return (rib_clnt_post(conn, cl, msgid)); |
| } |
| |
| /* Client */ |
| rdma_stat |
| rib_post_resp_remove(CONN* conn, uint32_t msgid) |
| { |
| rib_qp_t *qp = ctoqp(conn); |
| struct reply *rep; |
| |
| mutex_enter(&qp->replylist_lock); |
| for (rep = qp->replylist; rep != NULL; rep = rep->next) { |
| if (rep->xid == msgid) { |
| if (rep->vaddr_cq) { |
| rib_rbuf_free(conn, RECV_BUFFER, |
| (caddr_t)(uintptr_t)rep->vaddr_cq); |
| } |
| (void) rib_remreply(qp, rep); |
| break; |
| } |
| } |
| mutex_exit(&qp->replylist_lock); |
| |
| return (RDMA_SUCCESS); |
| } |
| |
| /* Server */ |
| rdma_stat |
| rib_post_recv(CONN *conn, struct clist *cl) |
| { |
| rib_qp_t *qp = ctoqp(conn); |
| |
| if (rib_svc_post(conn, cl) == RDMA_SUCCESS) { |
| mutex_enter(&qp->posted_rbufs_lock); |
| qp->n_posted_rbufs++; |
| mutex_exit(&qp->posted_rbufs_lock); |
| return (RDMA_SUCCESS); |
| } |
| return (RDMA_FAILED); |
| } |
| |
| /* |
| * Client side only interface to "recv" the rpc reply buf |
| * posted earlier by rib_post_resp(conn, cl, msgid). |
| */ |
| rdma_stat |
| rib_recv(CONN *conn, struct clist **clp, uint32_t msgid) |
| { |
| struct reply *rep = NULL; |
| clock_t timout, cv_wait_ret; |
| rdma_stat ret = RDMA_SUCCESS; |
| rib_qp_t *qp = ctoqp(conn); |
| |
| /* |
| * Find the reply structure for this msgid |
| */ |
| mutex_enter(&qp->replylist_lock); |
| |
| for (rep = qp->replylist; rep != NULL; rep = rep->next) { |
| if (rep->xid == msgid) |
| break; |
| } |
| |
| if (rep != NULL) { |
| /* |
| * If message not yet received, wait. |
| */ |
| if (rep->status == (uint_t)REPLY_WAIT) { |
| timout = ddi_get_lbolt() + |
| drv_usectohz(REPLY_WAIT_TIME * 1000000); |
| |
| while ((cv_wait_ret = cv_timedwait_sig(&rep->wait_cv, |
| &qp->replylist_lock, timout)) > 0 && |
| rep->status == (uint_t)REPLY_WAIT) |
| ; |
| |
| switch (cv_wait_ret) { |
| case -1: /* timeout */ |
| ret = RDMA_TIMEDOUT; |
| break; |
| case 0: |
| ret = RDMA_INTR; |
| break; |
| default: |
| break; |
| } |
| } |
| |
| if (rep->status == RDMA_SUCCESS) { |
| struct clist *cl = NULL; |
| |
| /* |
| * Got message successfully |
| */ |
| clist_add(&cl, 0, rep->bytes_xfer, NULL, |
| (caddr_t)(uintptr_t)rep->vaddr_cq, NULL, NULL); |
| *clp = cl; |
| } else { |
| if (rep->status != (uint_t)REPLY_WAIT) { |
| /* |
| * Got error in reply message. Free |
| * recv buffer here. |
| */ |
| ret = rep->status; |
| rib_rbuf_free(conn, RECV_BUFFER, |
| (caddr_t)(uintptr_t)rep->vaddr_cq); |
| } |
| } |
| (void) rib_remreply(qp, rep); |
| } else { |
| /* |
| * No matching reply structure found for given msgid on the |
| * reply wait list. |
| */ |
| ret = RDMA_INVAL; |
| DTRACE_PROBE(rpcib__i__nomatchxid2); |
| } |
| |
| /* |
| * Done. |
| */ |
| mutex_exit(&qp->replylist_lock); |
| return (ret); |
| } |
| |
| /* |
| * RDMA write a buffer to the remote address. |
| */ |
| rdma_stat |
| rib_write(CONN *conn, struct clist *cl, int wait) |
| { |
| ibt_send_wr_t tx_wr; |
| int cv_sig; |
| int i; |
| ibt_wr_ds_t sgl[DSEG_MAX]; |
| struct send_wid *wdesc; |
| ibt_status_t ibt_status; |
| rdma_stat ret = RDMA_SUCCESS; |
| rib_qp_t *qp = ctoqp(conn); |
| uint64_t n_writes = 0; |
| bool_t force_wait = FALSE; |
| |
| if (cl == NULL) { |
| return (RDMA_FAILED); |
| } |
| |
| |
| while ((cl != NULL)) { |
| if (cl->c_len > 0) { |
| bzero(&tx_wr, sizeof (ibt_send_wr_t)); |
| tx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->u.c_daddr; |
| tx_wr.wr.rc.rcwr.rdma.rdma_rkey = |
| cl->c_dmemhandle.mrc_rmr; /* rkey */ |
| sgl[0].ds_va = cl->w.c_saddr; |
| sgl[0].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */ |
| sgl[0].ds_len = cl->c_len; |
| |
| if (wait) { |
| tx_wr.wr_flags = IBT_WR_SEND_SIGNAL; |
| cv_sig = 1; |
| } else { |
| if (n_writes > max_unsignaled_rws) { |
| n_writes = 0; |
| force_wait = TRUE; |
| tx_wr.wr_flags = IBT_WR_SEND_SIGNAL; |
| cv_sig = 1; |
| } else { |
| tx_wr.wr_flags = IBT_WR_NO_FLAGS; |
| cv_sig = 0; |
| } |
| } |
| |
| wdesc = rib_init_sendwait(0, cv_sig, qp); |
| tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc; |
| tx_wr.wr_opcode = IBT_WRC_RDMAW; |
| tx_wr.wr_trans = IBT_RC_SRV; |
| tx_wr.wr_nds = 1; |
| tx_wr.wr_sgl = sgl; |
| |
| mutex_enter(&conn->c_lock); |
| if (conn->c_state == C_CONNECTED) { |
| ibt_status = |
| ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL); |
| } |
| if (conn->c_state != C_CONNECTED || |
| ibt_status != IBT_SUCCESS) { |
| if (conn->c_state != C_DISCONN_PEND) |
| conn->c_state = C_ERROR_CONN; |
| mutex_exit(&conn->c_lock); |
| (void) rib_free_sendwait(wdesc); |
| return (RDMA_CONNLOST); |
| } |
| mutex_exit(&conn->c_lock); |
| |
| /* |
| * Wait for send to complete |
| */ |
| if (wait || force_wait) { |
| force_wait = FALSE; |
| ret = rib_sendwait(qp, wdesc); |
| if (ret != 0) { |
| return (ret); |
| } |
| } else { |
| mutex_enter(&wdesc->sendwait_lock); |
| for (i = 0; i < wdesc->nsbufs; i++) { |
| rib_rbuf_free(qptoc(qp), SEND_BUFFER, |
| (void *)(uintptr_t) |
| wdesc->sbufaddr[i]); |
| } |
| mutex_exit(&wdesc->sendwait_lock); |
| (void) rib_free_sendwait(wdesc); |
| } |
| n_writes ++; |
| } |
| cl = cl->c_next; |
| } |
| return (RDMA_SUCCESS); |
| } |
| |
| /* |
| * RDMA Read a buffer from the remote address. |
| */ |
| rdma_stat |
| rib_read(CONN *conn, struct clist *cl, int wait) |
| { |
| ibt_send_wr_t rx_wr; |
| int cv_sig; |
| int i; |
| ibt_wr_ds_t sgl; |
| struct send_wid *wdesc; |
| ibt_status_t ibt_status = IBT_SUCCESS; |
| rdma_stat ret = RDMA_SUCCESS; |
| rib_qp_t *qp = ctoqp(conn); |
| |
| if (cl == NULL) { |
| return (RDMA_FAILED); |
| } |
| |
| while (cl != NULL) { |
| bzero(&rx_wr, sizeof (ibt_send_wr_t)); |
| /* |
| * Remote address is at the head chunk item in list. |
| */ |
| rx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->w.c_saddr; |
| rx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_smemhandle.mrc_rmr; |
| |
| sgl.ds_va = cl->u.c_daddr; |
| sgl.ds_key = cl->c_dmemhandle.mrc_lmr; /* lkey */ |
| sgl.ds_len = cl->c_len; |
| |
| if (wait) { |
| rx_wr.wr_flags = IBT_WR_SEND_SIGNAL; |
| cv_sig = 1; |
| } else { |
| rx_wr.wr_flags = IBT_WR_NO_FLAGS; |
| cv_sig = 0; |
| } |
| |
| wdesc = rib_init_sendwait(0, cv_sig, qp); |
| rx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc; |
| rx_wr.wr_opcode = IBT_WRC_RDMAR; |
| rx_wr.wr_trans = IBT_RC_SRV; |
| rx_wr.wr_nds = 1; |
| rx_wr.wr_sgl = &sgl; |
| |
| mutex_enter(&conn->c_lock); |
| if (conn->c_state == C_CONNECTED) { |
| ibt_status = ibt_post_send(qp->qp_hdl, &rx_wr, 1, NULL); |
| } |
| if (conn->c_state != C_CONNECTED || |
| ibt_status != IBT_SUCCESS) { |
| if (conn->c_state != C_DISCONN_PEND) |
| conn->c_state = C_ERROR_CONN; |
| mutex_exit(&conn->c_lock); |
| (void) rib_free_sendwait(wdesc); |
| return (RDMA_CONNLOST); |
| } |
| mutex_exit(&conn->c_lock); |
| |
| /* |
| * Wait for send to complete if this is the |
| * last item in the list. |
| */ |
| if (wait && cl->c_next == NULL) { |
| ret = rib_sendwait(qp, wdesc); |
| if (ret != 0) { |
| return (ret); |
| } |
| } else { |
| mutex_enter(&wdesc->sendwait_lock); |
| for (i = 0; i < wdesc->nsbufs; i++) { |
| rib_rbuf_free(qptoc(qp), SEND_BUFFER, |
| (void *)(uintptr_t)wdesc->sbufaddr[i]); |
| } |
| mutex_exit(&wdesc->sendwait_lock); |
| (void) rib_free_sendwait(wdesc); |
| } |
| cl = cl->c_next; |
| } |
| return (RDMA_SUCCESS); |
| } |
| |
| /* |
| * rib_srv_cm_handler() |
| * Connection Manager callback to handle RC connection requests. |
| */ |
| /* ARGSUSED */ |
| static ibt_cm_status_t |
| rib_srv_cm_handler(void *any, ibt_cm_event_t *event, |
| ibt_cm_return_args_t *ret_args, void *priv_data, |
| ibt_priv_data_len_t len) |
| { |
| queue_t *q; |
| rib_qp_t *qp; |
| rpcib_state_t *ribstat; |
| rib_hca_t *hca; |
| rdma_stat status = RDMA_SUCCESS; |
| int i; |
| struct clist cl; |
| rdma_buf_t rdbuf = {0}; |
| void *buf = NULL; |
| CONN *conn; |
| ibt_ip_cm_info_t ipinfo; |
| struct sockaddr_in *s; |
| struct sockaddr_in6 *s6; |
| int sin_size = sizeof (struct sockaddr_in); |
| int in_size = sizeof (struct in_addr); |
| int sin6_size = sizeof (struct sockaddr_in6); |
| |
| ASSERT(any != NULL); |
|