From: AMEET M. PARANJAPE <aparanja@redhat.com> Date: Mon, 4 May 2009 10:40:26 -0400 Subject: [openib] ehca: fix performance during creation of QPs Message-id: 20090504143743.23800.31697.sendpatchset@squad5-lp1.lab.bos.redhat.com O-Subject: [PATCH RHEL5.4 BZ498527] ehca performance impact during creation of queue pairs Bugzilla: 498527 RH-Acked-by: Doug Ledford <dledford@redhat.com> RHBZ#: ====== https://bugzilla.redhat.com/show_bug.cgi?id=498527 Description: =========== This patch contains performance improvements for the ehca driver. It skips code which is not necessary for userspace queue pairs and replaces vmalloc() calls with kmalloc() for the queue page list, falling back to vmalloc() only if kmalloc() fails. RHEL Version Found: ================ RHEL 5.3 kABI Status: ============ No symbols were harmed. Brew: ===== Built on all platforms. http://brewweb.devel.redhat.com/brew/taskinfo?taskID=1784974 Upstream Status: ================ The patch has already been applied upstream for linux-2.6.31, as you can see below: http://lkml.org/lkml/2009/4/21/290 http://lkml.org/lkml/2009/4/21/292 http://lkml.org/lkml/2009/4/21/293 Test Status: ============ If a userspace application tries to allocate a large number of queue pairs, the performance of the creation process degrades rapidly and results in soft lockup errors: BUG: soft lockup - CPU#10 stuck for 10s! [mpi_lapi_gen_64:21687] REGS: c000001bc72a7340 TRAP: 0901 Tainted: G (2.6.18-128.el5) TASK = c000001e4ad98d40[21687] 'mpi_lapi_gen_64' THREAD: c000001bc72a4000 CPU: 10 NIP [C0000000003C8E3C] ._write_lock+0x44/0x80 LR [C0000000000DB550] .__get_vm_area_node+0xd0/0x1f8 Call Trace: After applying this patch, this problem is no longer seen. 
=============================================================== Ameet Paranjape 978-392-3903 ext 23903 IBM on-site partner Proposed Patch: =============== diff --git a/drivers/infiniband/hw/ehca/ehca_main.c b/drivers/infiniband/hw/ehca/ehca_main.c index c57e9f6..88f9bb9 100644 --- a/drivers/infiniband/hw/ehca/ehca_main.c +++ b/drivers/infiniband/hw/ehca/ehca_main.c @@ -50,7 +50,7 @@ #include "ehca_tools.h" #include "hcp_if.h" -#define HCAD_VERSION "SVNEHCA_0026" +#define HCAD_VERSION "SVNEHCA_0027" MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Christoph Raisch <raisch@de.ibm.com>"); diff --git a/drivers/infiniband/hw/ehca/ehca_qp.c b/drivers/infiniband/hw/ehca/ehca_qp.c index e3537e2..9eb7605 100644 --- a/drivers/infiniband/hw/ehca/ehca_qp.c +++ b/drivers/infiniband/hw/ehca/ehca_qp.c @@ -461,7 +461,7 @@ static struct ehca_qp *internal_create_qp( ib_device); struct ib_ucontext *context = NULL; u64 h_ret; - int is_llqp = 0, has_srq = 0; + int is_llqp = 0, has_srq = 0, is_user = 0; int qp_type, max_send_sge, max_recv_sge, ret; /* h_call's out parameters */ @@ -603,9 +603,6 @@ static struct ehca_qp *internal_create_qp( } } - if (pd->uobject && udata) - context = pd->uobject->context; - my_qp = kmem_cache_zalloc(qp_cache, GFP_KERNEL); if (!my_qp) { ehca_err(pd->device, "pd=%p not enough memory to alloc qp", pd); @@ -613,6 +610,11 @@ static struct ehca_qp *internal_create_qp( return ERR_PTR(-ENOMEM); } + if (pd->uobject && udata) { + is_user = 1; + context = pd->uobject->context; + } + atomic_set(&my_qp->nr_events, 0); init_waitqueue_head(&my_qp->wait_completion); spin_lock_init(&my_qp->spinlock_s); @@ -701,7 +703,7 @@ static struct ehca_qp *internal_create_qp( (parms.squeue.is_small || parms.rqueue.is_small); } - h_ret = hipz_h_alloc_resource_qp(shca->ipz_hca_handle, &parms); + h_ret = hipz_h_alloc_resource_qp(shca->ipz_hca_handle, &parms, is_user); if (h_ret != H_SUCCESS) { ehca_err(pd->device, "h_alloc_resource_qp() failed h_ret=%li", h_ret); @@ -763,18 +765,20 @@ 
static struct ehca_qp *internal_create_qp( goto create_qp_exit2; } - my_qp->sq_map.entries = my_qp->ipz_squeue.queue_length / - my_qp->ipz_squeue.qe_size; - my_qp->sq_map.map = vmalloc(my_qp->sq_map.entries * + if (!is_user) { + my_qp->sq_map.entries = my_qp->ipz_squeue.queue_length / + my_qp->ipz_squeue.qe_size; + my_qp->sq_map.map = vmalloc(my_qp->sq_map.entries * sizeof(struct ehca_qmap_entry)); - if (!my_qp->sq_map.map) { - ehca_err(pd->device, "Couldn't allocate squeue " - "map ret=%i", ret); - goto create_qp_exit3; + if (!my_qp->sq_map.map) { + ehca_err(pd->device, "Couldn't allocate squeue " + "map ret=%i", ret); + goto create_qp_exit3; + } + INIT_LIST_HEAD(&my_qp->sq_err_node); + /* to avoid the generation of bogus flush CQEs */ + reset_queue_map(&my_qp->sq_map); } - INIT_LIST_HEAD(&my_qp->sq_err_node); - /* to avoid the generation of bogus flush CQEs */ - reset_queue_map(&my_qp->sq_map); } if (HAS_RQ(my_qp)) { @@ -786,20 +790,21 @@ static struct ehca_qp *internal_create_qp( "and pages ret=%i", ret); goto create_qp_exit4; } - - my_qp->rq_map.entries = my_qp->ipz_rqueue.queue_length / - my_qp->ipz_rqueue.qe_size; - my_qp->rq_map.map = vmalloc(my_qp->rq_map.entries * + if (!is_user) { + my_qp->rq_map.entries = my_qp->ipz_rqueue.queue_length / + my_qp->ipz_rqueue.qe_size; + my_qp->rq_map.map = vmalloc(my_qp->rq_map.entries * sizeof(struct ehca_qmap_entry)); - if (!my_qp->rq_map.map) { - ehca_err(pd->device, "Couldn't allocate squeue " + if (!my_qp->rq_map.map) { + ehca_err(pd->device, "Couldn't allocate squeue " "map ret=%i", ret); - goto create_qp_exit5; + goto create_qp_exit5; + } + INIT_LIST_HEAD(&my_qp->rq_err_node); + /* to avoid the generation of bogus flush CQEs */ + reset_queue_map(&my_qp->rq_map); } - INIT_LIST_HEAD(&my_qp->rq_err_node); - /* to avoid the generation of bogus flush CQEs */ - reset_queue_map(&my_qp->rq_map); - } else if (init_attr->srq) { + } else if (init_attr->srq && !is_user) { /* this is a base QP, use the queue map of the SRQ */ 
my_qp->rq_map = my_srq->rq_map; INIT_LIST_HEAD(&my_qp->rq_err_node); @@ -912,7 +917,7 @@ create_qp_exit7: kfree(my_qp->mod_qp_parm); create_qp_exit6: - if (HAS_RQ(my_qp)) + if (HAS_RQ(my_qp) && !is_user) vfree(my_qp->rq_map.map); create_qp_exit5: @@ -920,7 +925,7 @@ create_qp_exit5: ipz_queue_dtor(my_pd, &my_qp->ipz_rqueue); create_qp_exit4: - if (HAS_SQ(my_qp)) + if (HAS_SQ(my_qp) && !is_user) vfree(my_qp->sq_map.map); create_qp_exit3: @@ -1238,6 +1243,7 @@ static int internal_modify_qp(struct ib_qp *ibqp, u64 update_mask; u64 h_ret; int bad_wqe_cnt = 0; + int is_user = 0; int squeue_locked = 0; unsigned long flags = 0; @@ -1260,6 +1266,8 @@ static int internal_modify_qp(struct ib_qp *ibqp, ret = ehca2ib_return_code(h_ret); goto modify_qp_exit1; } + if (ibqp->uobject) + is_user = 1; qp_cur_state = ehca2ib_qp_state(mqpcb->qp_state); @@ -1722,7 +1730,8 @@ static int internal_modify_qp(struct ib_qp *ibqp, goto modify_qp_exit2; } } - if ((qp_new_state == IB_QPS_ERR) && (qp_cur_state != IB_QPS_ERR)) { + if ((qp_new_state == IB_QPS_ERR) && (qp_cur_state != IB_QPS_ERR) + && !is_user) { ret = check_for_left_cqes(my_qp, shca); if (ret) goto modify_qp_exit2; @@ -1732,16 +1741,17 @@ static int internal_modify_qp(struct ib_qp *ibqp, ipz_qeit_reset(&my_qp->ipz_rqueue); ipz_qeit_reset(&my_qp->ipz_squeue); - if (qp_cur_state == IB_QPS_ERR) { + if (qp_cur_state == IB_QPS_ERR && !is_user) { del_from_err_list(my_qp->send_cq, &my_qp->sq_err_node); if (HAS_RQ(my_qp)) del_from_err_list(my_qp->recv_cq, &my_qp->rq_err_node); } - reset_queue_map(&my_qp->sq_map); + if (!is_user) + reset_queue_map(&my_qp->sq_map); - if (HAS_RQ(my_qp)) + if (HAS_RQ(my_qp) && !is_user) reset_queue_map(&my_qp->rq_map); } @@ -2132,10 +2142,12 @@ static int internal_destroy_qp(struct ib_device *dev, struct ehca_qp *my_qp, int ret; u64 h_ret; u8 port_num; + int is_user = 0; enum ib_qp_type qp_type; unsigned long flags; if (uobject) { + is_user = 1; if (my_qp->mm_count_galpa || my_qp->mm_count_rqueue || 
my_qp->mm_count_squeue) { ehca_err(dev, "Resources still referenced in " @@ -2162,10 +2174,10 @@ static int internal_destroy_qp(struct ib_device *dev, struct ehca_qp *my_qp, * SRQs will never get into an error list and do not have a recv_cq, * so we need to skip them here. */ - if (HAS_RQ(my_qp) && !IS_SRQ(my_qp)) + if (HAS_RQ(my_qp) && !IS_SRQ(my_qp) && !is_user) del_from_err_list(my_qp->recv_cq, &my_qp->rq_err_node); - if (HAS_SQ(my_qp)) + if (HAS_SQ(my_qp) && !is_user) del_from_err_list(my_qp->send_cq, &my_qp->sq_err_node); /* now wait until all pending events have completed */ @@ -2204,12 +2216,14 @@ static int internal_destroy_qp(struct ib_device *dev, struct ehca_qp *my_qp, if (HAS_RQ(my_qp)) { ipz_queue_dtor(my_pd, &my_qp->ipz_rqueue); - vfree(my_qp->rq_map.map); + if (!is_user) + vfree(my_qp->rq_map.map); } if (HAS_SQ(my_qp)) { ipz_queue_dtor(my_pd, &my_qp->ipz_squeue); - vfree(my_qp->sq_map.map); + if (!is_user) + vfree(my_qp->sq_map.map); } kmem_cache_free(qp_cache, my_qp); atomic_dec(&shca->num_qps); diff --git a/drivers/infiniband/hw/ehca/hcp_if.c b/drivers/infiniband/hw/ehca/hcp_if.c index 415d3a4..c7cc5d7 100644 --- a/drivers/infiniband/hw/ehca/hcp_if.c +++ b/drivers/infiniband/hw/ehca/hcp_if.c @@ -284,7 +284,7 @@ u64 hipz_h_alloc_resource_cq(const struct ipz_adapter_handle adapter_handle, param->act_pages = (u32)outs[4]; if (ret == H_SUCCESS) - hcp_galpas_ctor(&cq->galpas, outs[5], outs[6]); + hcp_galpas_ctor(&cq->galpas, 0, outs[5], outs[6]); if (ret == H_NOT_ENOUGH_RESOURCES) ehca_gen_err("Not enough resources. 
ret=%li", ret); @@ -293,7 +293,7 @@ u64 hipz_h_alloc_resource_cq(const struct ipz_adapter_handle adapter_handle, } u64 hipz_h_alloc_resource_qp(const struct ipz_adapter_handle adapter_handle, - struct ehca_alloc_qp_parms *parms) + struct ehca_alloc_qp_parms *parms, int is_user) { u64 ret; u64 allocate_controls, max_r10_reg, r11, r12; @@ -359,7 +359,7 @@ u64 hipz_h_alloc_resource_qp(const struct ipz_adapter_handle adapter_handle, (u32)EHCA_BMASK_GET(H_ALL_RES_QP_RQUEUE_SIZE_PAGES, outs[4]); if (ret == H_SUCCESS) - hcp_galpas_ctor(&parms->galpas, outs[6], outs[6]); + hcp_galpas_ctor(&parms->galpas, is_user, outs[6], outs[6]); if (ret == H_NOT_ENOUGH_RESOURCES) ehca_gen_err("Not enough resources. ret=%li", ret); diff --git a/drivers/infiniband/hw/ehca/hcp_if.h b/drivers/infiniband/hw/ehca/hcp_if.h index 2c3c6e0..39c1c36 100644 --- a/drivers/infiniband/hw/ehca/hcp_if.h +++ b/drivers/infiniband/hw/ehca/hcp_if.h @@ -78,7 +78,7 @@ u64 hipz_h_alloc_resource_cq(const struct ipz_adapter_handle adapter_handle, * initialize resources, create empty QPPTs (2 rings). 
*/ u64 hipz_h_alloc_resource_qp(const struct ipz_adapter_handle adapter_handle, - struct ehca_alloc_qp_parms *parms); + struct ehca_alloc_qp_parms *parms, int is_user); u64 hipz_h_query_port(const struct ipz_adapter_handle adapter_handle, const u8 port_id, diff --git a/drivers/infiniband/hw/ehca/hcp_phyp.c b/drivers/infiniband/hw/ehca/hcp_phyp.c index 2148210..fc3a245 100644 --- a/drivers/infiniband/hw/ehca/hcp_phyp.c +++ b/drivers/infiniband/hw/ehca/hcp_phyp.c @@ -54,12 +54,15 @@ int hcall_unmap_page(u64 mapaddr) return 0; } -int hcp_galpas_ctor(struct h_galpas *galpas, +int hcp_galpas_ctor(struct h_galpas *galpas, int is_user, u64 paddr_kernel, u64 paddr_user) { - int ret = hcall_map_page(paddr_kernel, &galpas->kernel.fw_handle); - if (ret) - return ret; + if (!is_user) { + int ret = hcall_map_page(paddr_kernel, &galpas->kernel.fw_handle); + if (ret) + return ret; + } else + galpas->kernel.fw_handle = NULL; galpas->user.fw_handle = paddr_user; diff --git a/drivers/infiniband/hw/ehca/hcp_phyp.h b/drivers/infiniband/hw/ehca/hcp_phyp.h index 5305c2a..204227d 100644 --- a/drivers/infiniband/hw/ehca/hcp_phyp.h +++ b/drivers/infiniband/hw/ehca/hcp_phyp.h @@ -78,7 +78,7 @@ static inline void hipz_galpa_store(struct h_galpa galpa, u32 offset, u64 value) *(volatile u64 __force *)addr = value; } -int hcp_galpas_ctor(struct h_galpas *galpas, +int hcp_galpas_ctor(struct h_galpas *galpas, int is_user, u64 paddr_kernel, u64 paddr_user); int hcp_galpas_dtor(struct h_galpas *galpas); diff --git a/drivers/infiniband/hw/ehca/ipz_pt_fn.c b/drivers/infiniband/hw/ehca/ipz_pt_fn.c index c3a3284..1227c59 100644 --- a/drivers/infiniband/hw/ehca/ipz_pt_fn.c +++ b/drivers/infiniband/hw/ehca/ipz_pt_fn.c @@ -220,10 +220,13 @@ int ipz_queue_ctor(struct ehca_pd *pd, struct ipz_queue *queue, queue->small_page = NULL; /* allocate queue page pointers */ - queue->queue_pages = vmalloc(nr_of_pages * sizeof(void *)); + queue->queue_pages = kmalloc(nr_of_pages * sizeof(void *), GFP_KERNEL); if 
(!queue->queue_pages) { - ehca_gen_err("Couldn't allocate queue page list"); - return 0; + queue->queue_pages = vmalloc(nr_of_pages * sizeof(void *)); + if (!queue->queue_pages) { + ehca_gen_err("Couldn't allocate queue page list"); + return 0; + } } memset(queue->queue_pages, 0, nr_of_pages * sizeof(void *)); @@ -240,7 +243,10 @@ int ipz_queue_ctor(struct ehca_pd *pd, struct ipz_queue *queue, ipz_queue_ctor_exit0: ehca_gen_err("Couldn't alloc pages queue=%p " "nr_of_pages=%x", queue, nr_of_pages); - vfree(queue->queue_pages); + if (is_vmalloc_addr(queue->queue_pages)) + vfree(queue->queue_pages); + else + kfree(queue->queue_pages); return 0; } @@ -262,7 +268,10 @@ int ipz_queue_dtor(struct ehca_pd *pd, struct ipz_queue *queue) free_page((unsigned long)queue->queue_pages[i]); } - vfree(queue->queue_pages); + if (is_vmalloc_addr(queue->queue_pages)) + vfree(queue->queue_pages); + else + kfree(queue->queue_pages); return 1; }