FreeBSD/Linux Kernel Cross Reference
sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_cm.c


    1 /*-
    2  * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0
    3  *
    4  * Copyright (c) 2006 Mellanox Technologies. All rights reserved
    5  *
    6  * This software is available to you under a choice of one of two
    7  * licenses.  You may choose to be licensed under the terms of the GNU
    8  * General Public License (GPL) Version 2, available from the file
    9  * COPYING in the main directory of this source tree, or the
   10  * OpenIB.org BSD license below:
   11  *
   12  *     Redistribution and use in source and binary forms, with or
   13  *     without modification, are permitted provided that the following
   14  *     conditions are met:
   15  *
   16  *      - Redistributions of source code must retain the above
   17  *        copyright notice, this list of conditions and the following
   18  *        disclaimer.
   19  *
   20  *      - Redistributions in binary form must reproduce the above
   21  *        copyright notice, this list of conditions and the following
   22  *        disclaimer in the documentation and/or other materials
   23  *        provided with the distribution.
   24  *
   25  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   26  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   27  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   28  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
   29  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   30  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   31  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
   32  * SOFTWARE.
   33  */
   34 
   35 #include <sys/cdefs.h>
   36 __FBSDID("$FreeBSD$");
   37 
   38 #include "ipoib.h"
   39 
   40 #ifdef CONFIG_INFINIBAND_IPOIB_CM
   41 
   42 #include <netinet/ip.h>
   43 #include <netinet/ip_icmp.h>
   44 #include <netinet/icmp6.h>
   45 
   46 #include <rdma/ib_cm.h>
   47 #include <rdma/ib_cache.h>
   48 #include <linux/delay.h>
   49 
   50 int ipoib_max_conn_qp = 128;
   51 
   52 module_param_named(max_nonsrq_conn_qp, ipoib_max_conn_qp, int, 0444);
   53 MODULE_PARM_DESC(max_nonsrq_conn_qp,
   54                  "Max number of connected-mode QPs per interface "
   55                  "(applied only if shared receive queue is not available)");
   56 
   57 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA
   58 static int data_debug_level;
   59 
   60 module_param_named(cm_data_debug_level, data_debug_level, int, 0644);
   61 MODULE_PARM_DESC(cm_data_debug_level,
   62                  "Enable data path debug tracing for connected mode if > 0");
   63 #endif
   64 
   65 #define IPOIB_CM_IETF_ID 0x1000000000000000ULL
   66 
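       /*
        * Connection aging: the receive path refreshes a passive connection's
        * timestamp at most once per IPOIB_CM_RX_UPDATE_TIME, and only for
        * completions whose wr_id has the IPOIB_CM_RX_UPDATE_MASK bits clear
        * (roughly one receive in four).  The stale task is rescheduled with a
        * delay of IPOIB_CM_RX_DELAY and tears down connections that have been
        * idle for longer than IPOIB_CM_RX_TIMEOUT.
        */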
   67 #define IPOIB_CM_RX_UPDATE_TIME (256 * HZ)
   68 #define IPOIB_CM_RX_TIMEOUT     (2 * 256 * HZ)
   69 #define IPOIB_CM_RX_DELAY       (3 * 256 * HZ)
   70 #define IPOIB_CM_RX_UPDATE_MASK (0x3)
   71 
   72 static struct ib_qp_attr ipoib_cm_err_attr = {
   73         .qp_state = IB_QPS_ERR
   74 };
   75 
   76 #define IPOIB_CM_RX_DRAIN_WRID 0xffffffff
   77 
   78 static struct ib_send_wr ipoib_cm_rx_drain_wr = {
   79         .wr_id = IPOIB_CM_RX_DRAIN_WRID,
   80         .opcode = IB_WR_SEND,
   81 };
   82 
   83 static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,
   84                                struct ib_cm_event *event);
   85 
   86 static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv *priv, struct ipoib_cm_rx_buf *rx_req)
   87 {
   88 
   89         ipoib_dma_unmap_rx(priv, (struct ipoib_rx_buf *)rx_req);
   90 
   91 }
   92 
   93 static int ipoib_cm_post_receive_srq(struct ipoib_dev_priv *priv, int id)
   94 {
   95         const struct ib_recv_wr *bad_wr;
   96         struct ipoib_rx_buf *rx_req;
   97         struct mbuf *m;
   98         int ret;
   99         int i;
  100 
  101         rx_req = (struct ipoib_rx_buf *)&priv->cm.srq_ring[id];
  102         for (m = rx_req->mb, i = 0; m != NULL; m = m->m_next, i++) {
  103                 priv->cm.rx_sge[i].addr = rx_req->mapping[i];
  104                 priv->cm.rx_sge[i].length = m->m_len;
  105         }
  106 
  107         priv->cm.rx_wr.num_sge = i;
  108         priv->cm.rx_wr.wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV;
  109 
  110         ret = ib_post_srq_recv(priv->cm.srq, &priv->cm.rx_wr, &bad_wr);
  111         if (unlikely(ret)) {
  112                 ipoib_warn(priv, "post srq failed for buf %d (%d)\n", id, ret);
  113                 ipoib_dma_unmap_rx(priv, rx_req);
  114                 m_freem(priv->cm.srq_ring[id].mb);
  115                 priv->cm.srq_ring[id].mb = NULL;
  116         }
  117 
  118         return ret;
  119 }
  120 
  121 static int ipoib_cm_post_receive_nonsrq(struct ipoib_dev_priv *priv,
  122                                         struct ipoib_cm_rx *rx,
  123                                         struct ib_recv_wr *wr,
  124                                         struct ib_sge *sge, int id)
  125 {
  126         struct ipoib_rx_buf *rx_req;
  127         const struct ib_recv_wr *bad_wr;
  128         struct mbuf *m;
  129         int ret;
  130         int i;
  131 
  132         rx_req = (struct ipoib_rx_buf *)&rx->rx_ring[id];
  133         for (m = rx_req->mb, i = 0; m != NULL; m = m->m_next, i++) {
  134                 sge[i].addr = rx_req->mapping[i];
  135                 sge[i].length = m->m_len;
  136         }
  137 
  138         wr->num_sge = i;
  139         wr->wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV;
  140 
  141         ret = ib_post_recv(rx->qp, wr, &bad_wr);
  142         if (unlikely(ret)) {
  143                 ipoib_warn(priv, "post recv failed for buf %d (%d)\n", id, ret);
  144                 ipoib_dma_unmap_rx(priv, rx_req);
  145                 m_freem(rx->rx_ring[id].mb);
  146                 rx->rx_ring[id].mb = NULL;
  147         }
  148 
  149         return ret;
  150 }
  151 
  152 static struct mbuf *
  153 ipoib_cm_alloc_rx_mb(struct ipoib_dev_priv *priv, struct ipoib_cm_rx_buf *rx_req)
  154 {
  155         return ipoib_alloc_map_mb(priv, (struct ipoib_rx_buf *)rx_req,
  156             sizeof(struct ipoib_pseudoheader), priv->cm.max_cm_mtu, IPOIB_CM_RX_SG);
  157 }
  158 
  159 static void ipoib_cm_free_rx_ring(struct ipoib_dev_priv *priv,
  160                                   struct ipoib_cm_rx_buf *rx_ring)
  161 {
  162         int i;
  163 
  164         for (i = 0; i < ipoib_recvq_size; ++i)
  165                 if (rx_ring[i].mb) {
  166                         ipoib_cm_dma_unmap_rx(priv, &rx_ring[i]);
  167                         m_freem(rx_ring[i].mb);
  168                 }
  169 
  170         kfree(rx_ring);
  171 }
  172 
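       /*
        * RX teardown: ipoib_cm_rx_event_handler() moves a QP that reports
        * IB_EVENT_QP_LAST_WQE_REACHED onto rx_flush_list and calls
        * ipoib_cm_start_rx_drain(), which posts a single drain WR (wr_id
        * IPOIB_CM_RX_DRAIN_WRID) on one of the error-state QPs and splices
        * the flush list onto rx_drain_list.  The drain WR completes on the
        * receive CQ (the RX QPs use it as their send CQ); when
        * ipoib_cm_handle_rx_wc() sees that completion, it splices
        * rx_drain_list onto rx_reap_list and queues the reap task, which
        * destroys the drained QPs.
        */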
  173 static void ipoib_cm_start_rx_drain(struct ipoib_dev_priv *priv)
  174 {
  175         const struct ib_send_wr *bad_wr;
  176         struct ipoib_cm_rx *p;
  177 
  178         /* We only reserved 1 extra slot in CQ for drain WRs, so
  179          * make sure we have at most 1 outstanding WR. */
  180         if (list_empty(&priv->cm.rx_flush_list) ||
  181             !list_empty(&priv->cm.rx_drain_list))
  182                 return;
  183 
  184         /*
   185          * QPs on the flush list are in the error state.  This way, a
   186          * "flush error" WC will be generated immediately for each WR we post.
  187          */
  188         p = list_entry(priv->cm.rx_flush_list.next, typeof(*p), list);
  189         if (ib_post_send(p->qp, &ipoib_cm_rx_drain_wr, &bad_wr))
  190                 ipoib_warn(priv, "failed to post drain wr\n");
  191 
  192         list_splice_init(&priv->cm.rx_flush_list, &priv->cm.rx_drain_list);
  193 }
  194 
  195 static void ipoib_cm_rx_event_handler(struct ib_event *event, void *ctx)
  196 {
  197         struct ipoib_cm_rx *p = ctx;
  198         struct ipoib_dev_priv *priv = p->priv;
  199         unsigned long flags;
  200 
  201         if (event->event != IB_EVENT_QP_LAST_WQE_REACHED)
  202                 return;
  203 
  204         spin_lock_irqsave(&priv->lock, flags);
  205         list_move(&p->list, &priv->cm.rx_flush_list);
  206         p->state = IPOIB_CM_RX_FLUSH;
  207         ipoib_cm_start_rx_drain(priv);
  208         spin_unlock_irqrestore(&priv->lock, flags);
  209 }
  210 
  211 static struct ib_qp *ipoib_cm_create_rx_qp(struct ipoib_dev_priv *priv,
  212                                            struct ipoib_cm_rx *p)
  213 {
  214         struct ib_qp_init_attr attr = {
  215                 .event_handler = ipoib_cm_rx_event_handler,
  216                 .send_cq = priv->recv_cq, /* For drain WR */
  217                 .recv_cq = priv->recv_cq,
  218                 .srq = priv->cm.srq,
  219                 .cap.max_send_wr = 1, /* For drain WR */
  220                 .cap.max_send_sge = 1,
  221                 .sq_sig_type = IB_SIGNAL_ALL_WR,
  222                 .qp_type = IB_QPT_RC,
  223                 .qp_context = p,
  224         };
  225 
  226         if (!ipoib_cm_has_srq(priv)) {
  227                 attr.cap.max_recv_wr  = ipoib_recvq_size;
  228                 attr.cap.max_recv_sge = priv->cm.num_frags;
  229         }
  230 
  231         return ib_create_qp(priv->pd, &attr);
  232 }
  233 
  234 static int ipoib_cm_modify_rx_qp(struct ipoib_dev_priv *priv,
  235                                  struct ib_cm_id *cm_id, struct ib_qp *qp,
  236                                  unsigned psn)
  237 {
  238         struct ib_qp_attr qp_attr;
  239         int qp_attr_mask, ret;
  240 
  241         qp_attr.qp_state = IB_QPS_INIT;
  242         ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
  243         if (ret) {
  244                 ipoib_warn(priv, "failed to init QP attr for INIT: %d\n", ret);
  245                 return ret;
  246         }
  247         ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
  248         if (ret) {
  249                 ipoib_warn(priv, "failed to modify QP to INIT: %d\n", ret);
  250                 return ret;
  251         }
  252         qp_attr.qp_state = IB_QPS_RTR;
  253         ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
  254         if (ret) {
  255                 ipoib_warn(priv, "failed to init QP attr for RTR: %d\n", ret);
  256                 return ret;
  257         }
  258         qp_attr.rq_psn = psn;
  259         ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
  260         if (ret) {
  261                 ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret);
  262                 return ret;
  263         }
  264 
  265         /*
  266          * Current Mellanox HCA firmware won't generate completions
  267          * with error for drain WRs unless the QP has been moved to
  268          * RTS first. This work-around leaves a window where a QP has
  269          * moved to error asynchronously, but this will eventually get
  270          * fixed in firmware, so let's not error out if modify QP
  271          * fails.
  272          */
  273         qp_attr.qp_state = IB_QPS_RTS;
  274         ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
  275         if (ret) {
  276                 ipoib_warn(priv, "failed to init QP attr for RTS: %d\n", ret);
  277                 return 0;
  278         }
  279         ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
  280         if (ret) {
  281                 ipoib_warn(priv, "failed to modify QP to RTS: %d\n", ret);
  282                 return 0;
  283         }
  284 
  285         return 0;
  286 }
  287 
  288 static void ipoib_cm_init_rx_wr(struct ipoib_dev_priv *priv,
  289                                 struct ib_recv_wr *wr,
  290                                 struct ib_sge *sge)
  291 {
  292         int i;
  293 
  294         for (i = 0; i < IPOIB_CM_RX_SG; i++)
  295                 sge[i].lkey = priv->pd->local_dma_lkey;
  296 
  297         wr->next    = NULL;
  298         wr->sg_list = sge;
  299         wr->num_sge = 1;
  300 }
  301 
  302 static int ipoib_cm_nonsrq_init_rx(struct ipoib_dev_priv *priv,
  303     struct ib_cm_id *cm_id, struct ipoib_cm_rx *rx)
  304 {
  305         struct {
  306                 struct ib_recv_wr wr;
  307                 struct ib_sge sge[IPOIB_CM_RX_SG];
  308         } *t;
  309         int ret;
  310         int i;
  311 
  312         rx->rx_ring = kzalloc(ipoib_recvq_size * sizeof *rx->rx_ring, GFP_KERNEL);
  313         if (!rx->rx_ring) {
  314                 printk(KERN_WARNING "%s: failed to allocate CM non-SRQ ring (%d entries)\n",
  315                        priv->ca->name, ipoib_recvq_size);
  316                 return -ENOMEM;
  317         }
  318 
  319         memset(rx->rx_ring, 0, ipoib_recvq_size * sizeof *rx->rx_ring);
  320 
  321         t = kmalloc(sizeof *t, GFP_KERNEL);
  322         if (!t) {
  323                 ret = -ENOMEM;
  324                 goto err_free;
  325         }
  326 
  327         ipoib_cm_init_rx_wr(priv, &t->wr, t->sge);
  328 
  329         spin_lock_irq(&priv->lock);
  330 
  331         if (priv->cm.nonsrq_conn_qp >= ipoib_max_conn_qp) {
  332                 spin_unlock_irq(&priv->lock);
  333                 ib_send_cm_rej(cm_id, IB_CM_REJ_NO_QP, NULL, 0, NULL, 0);
  334                 ret = -EINVAL;
  335                 goto err_free;
  336         } else
  337                 ++priv->cm.nonsrq_conn_qp;
  338 
  339         spin_unlock_irq(&priv->lock);
  340 
  341         for (i = 0; i < ipoib_recvq_size; ++i) {
  342                 if (!ipoib_cm_alloc_rx_mb(priv, &rx->rx_ring[i])) {
  343                         ipoib_warn(priv, "failed to allocate receive buffer %d\n", i);
   344                         ret = -ENOMEM;
   345                         goto err_count;
  346                 }
  347                 ret = ipoib_cm_post_receive_nonsrq(priv, rx, &t->wr, t->sge, i);
  348                 if (ret) {
  349                         ipoib_warn(priv, "ipoib_cm_post_receive_nonsrq "
  350                                    "failed for buf %d\n", i);
  351                         ret = -EIO;
  352                         goto err_count;
  353                 }
  354         }
  355 
  356         rx->recv_count = ipoib_recvq_size;
  357 
  358         kfree(t);
  359 
  360         return 0;
  361 
  362 err_count:
  363         spin_lock_irq(&priv->lock);
  364         --priv->cm.nonsrq_conn_qp;
  365         spin_unlock_irq(&priv->lock);
  366 
  367 err_free:
  368         kfree(t);
  369         ipoib_cm_free_rx_ring(priv, rx->rx_ring);
  370 
  371         return ret;
  372 }
  373 
  374 static int ipoib_cm_send_rep(struct ipoib_dev_priv *priv, struct ib_cm_id *cm_id,
  375                              struct ib_qp *qp, struct ib_cm_req_event_param *req,
  376                              unsigned psn)
  377 {
  378         struct ipoib_cm_data data = {};
  379         struct ib_cm_rep_param rep = {};
  380 
  381         data.qpn = cpu_to_be32(priv->qp->qp_num);
  382         data.mtu = cpu_to_be32(priv->cm.max_cm_mtu);
  383 
  384         rep.private_data = &data;
  385         rep.private_data_len = sizeof data;
  386         rep.flow_control = 0;
  387         rep.rnr_retry_count = req->rnr_retry_count;
  388         rep.srq = ipoib_cm_has_srq(priv);
  389         rep.qp_num = qp->qp_num;
  390         rep.starting_psn = psn;
  391         return ib_send_cm_rep(cm_id, &rep);
  392 }
  393 
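       /*
        * Passive side: ipoib_cm_req_handler() accepts an incoming REQ by
        * creating an RC QP, moving it through INIT/RTR/RTS with a random
        * starting PSN, allocating a private receive ring when no SRQ is
        * available, adding the connection to the passive_ids LRU list, and
        * replying with a REP that carries our QP number and connected-mode
        * MTU in its private data (ipoib_cm_send_rep()).
        */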
  394 static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
  395 {
  396         struct ipoib_dev_priv *priv = cm_id->context;
  397         struct ipoib_cm_rx *p;
  398         unsigned psn;
  399         int ret;
  400 
  401         ipoib_dbg(priv, "REQ arrived\n");
  402         p = kzalloc(sizeof *p, GFP_KERNEL);
  403         if (!p)
  404                 return -ENOMEM;
  405         p->priv = priv;
  406         p->id = cm_id;
  407         cm_id->context = p;
  408         p->state = IPOIB_CM_RX_LIVE;
  409         p->jiffies = jiffies;
  410         INIT_LIST_HEAD(&p->list);
  411 
  412         p->qp = ipoib_cm_create_rx_qp(priv, p);
  413         if (IS_ERR(p->qp)) {
  414                 ret = PTR_ERR(p->qp);
  415                 goto err_qp;
  416         }
  417 
  418         psn = random() & 0xffffff;
  419         ret = ipoib_cm_modify_rx_qp(priv, cm_id, p->qp, psn);
  420         if (ret)
  421                 goto err_modify;
  422 
  423         if (!ipoib_cm_has_srq(priv)) {
  424                 ret = ipoib_cm_nonsrq_init_rx(priv, cm_id, p);
  425                 if (ret)
  426                         goto err_modify;
  427         }
  428 
  429         spin_lock_irq(&priv->lock);
  430         queue_delayed_work(ipoib_workqueue,
  431                            &priv->cm.stale_task, IPOIB_CM_RX_DELAY);
  432         /* Add this entry to passive ids list head, but do not re-add it
  433          * if IB_EVENT_QP_LAST_WQE_REACHED has moved it to flush list. */
  434         p->jiffies = jiffies;
  435         if (p->state == IPOIB_CM_RX_LIVE)
  436                 list_move(&p->list, &priv->cm.passive_ids);
  437         spin_unlock_irq(&priv->lock);
  438 
  439         ret = ipoib_cm_send_rep(priv, cm_id, p->qp, &event->param.req_rcvd, psn);
  440         if (ret) {
  441                 ipoib_warn(priv, "failed to send REP: %d\n", ret);
  442                 if (ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE))
  443                         ipoib_warn(priv, "unable to move qp to error state\n");
  444         }
  445         return 0;
  446 
  447 err_modify:
  448         ib_destroy_qp(p->qp);
  449 err_qp:
  450         kfree(p);
  451         return ret;
  452 }
  453 
  454 static int ipoib_cm_rx_handler(struct ib_cm_id *cm_id,
  455                                struct ib_cm_event *event)
  456 {
  457         struct ipoib_cm_rx *p;
  458         struct ipoib_dev_priv *priv;
  459 
  460         switch (event->event) {
  461         case IB_CM_REQ_RECEIVED:
  462                 return ipoib_cm_req_handler(cm_id, event);
  463         case IB_CM_DREQ_RECEIVED:
  464                 p = cm_id->context;
  465                 ib_send_cm_drep(cm_id, NULL, 0);
  466                 /* Fall through */
  467         case IB_CM_REJ_RECEIVED:
  468                 p = cm_id->context;
  469                 priv = p->priv;
  470                 if (ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE))
  471                         ipoib_warn(priv, "unable to move qp to error state\n");
  472                 /* Fall through */
  473         default:
  474                 return 0;
  475         }
  476 }
  477 
  478 void ipoib_cm_handle_rx_wc(struct ipoib_dev_priv *priv, struct ib_wc *wc)
  479 {
  480         struct ipoib_cm_rx_buf saverx;
  481         struct ipoib_cm_rx_buf *rx_ring;
  482         unsigned int wr_id = wc->wr_id & ~(IPOIB_OP_CM | IPOIB_OP_RECV);
  483         struct ifnet *dev = priv->dev;
  484         struct mbuf *mb, *newmb;
  485         struct ipoib_cm_rx *p;
  486         int has_srq;
  487 
  488         ipoib_dbg_data(priv, "cm recv completion: id %d, status: %d\n",
  489                        wr_id, wc->status);
  490 
  491         if (unlikely(wr_id >= ipoib_recvq_size)) {
  492                 if (wr_id == (IPOIB_CM_RX_DRAIN_WRID & ~(IPOIB_OP_CM | IPOIB_OP_RECV))) {
  493                         spin_lock(&priv->lock);
  494                         list_splice_init(&priv->cm.rx_drain_list, &priv->cm.rx_reap_list);
  495                         ipoib_cm_start_rx_drain(priv);
  496                         if (priv->cm.id != NULL)
  497                                 queue_work(ipoib_workqueue,
  498                                     &priv->cm.rx_reap_task);
  499                         spin_unlock(&priv->lock);
  500                 } else
  501                         ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n",
  502                                    wr_id, ipoib_recvq_size);
  503                 goto done;
  504         }
  505 
  506         p = wc->qp->qp_context;
  507 
  508         has_srq = ipoib_cm_has_srq(priv);
  509         rx_ring = has_srq ? priv->cm.srq_ring : p->rx_ring;
  510 
  511         mb = rx_ring[wr_id].mb;
  512 
  513         if (unlikely(wc->status != IB_WC_SUCCESS)) {
  514                 ipoib_dbg(priv, "cm recv error "
  515                            "(status=%d, wrid=%d vend_err %x)\n",
  516                            wc->status, wr_id, wc->vendor_err);
  517                 if_inc_counter(dev, IFCOUNTER_IERRORS, 1);
  518                 if (has_srq)
  519                         goto repost;
  520                 else {
  521                         if (!--p->recv_count) {
  522                                 spin_lock(&priv->lock);
  523                                 list_move(&p->list, &priv->cm.rx_reap_list);
  524                                 queue_work(ipoib_workqueue, &priv->cm.rx_reap_task);
  525                                 spin_unlock(&priv->lock);
  526                         }
  527                         goto done;
  528                 }
  529         }
  530 
  531         if (unlikely(!(wr_id & IPOIB_CM_RX_UPDATE_MASK))) {
  532                 if (p && time_after_eq(jiffies, p->jiffies + IPOIB_CM_RX_UPDATE_TIME)) {
  533                         p->jiffies = jiffies;
  534                         /* Move this entry to list head, but do not re-add it
  535                          * if it has been moved out of list. */
  536                         if (p->state == IPOIB_CM_RX_LIVE)
  537                                 list_move(&p->list, &priv->cm.passive_ids);
  538                 }
  539         }
  540 
  541         memcpy(&saverx, &rx_ring[wr_id], sizeof(saverx));
  542         newmb = ipoib_cm_alloc_rx_mb(priv, &rx_ring[wr_id]);
  543         if (unlikely(!newmb)) {
  544                 /*
  545                  * If we can't allocate a new RX buffer, dump
  546                  * this packet and reuse the old buffer.
  547                  */
  548                 ipoib_dbg(priv, "failed to allocate receive buffer %d\n", wr_id);
  549                 if_inc_counter(dev, IFCOUNTER_IERRORS, 1);
  550                 memcpy(&rx_ring[wr_id], &saverx, sizeof(saverx));
  551                 goto repost;
  552         }
  553 
  554         ipoib_cm_dma_unmap_rx(priv, &saverx);
  555 
  556         ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
  557                        wc->byte_len, wc->slid);
  558 
  559         ipoib_dma_mb(priv, mb, wc->byte_len);
  560 
  561         mb->m_pkthdr.rcvif = dev;
  562 
  563         M_PREPEND(mb, sizeof(struct ipoib_pseudoheader), M_NOWAIT);
  564         if (likely(mb != NULL)) {
  565                 struct ipoib_header *ibh;
  566 
  567                 if_inc_counter(dev, IFCOUNTER_IPACKETS, 1);
  568                 if_inc_counter(dev, IFCOUNTER_IBYTES, mb->m_pkthdr.len);
  569 
  570                 /* fixup destination infiniband address */
  571                 ibh = mtod(mb, struct ipoib_header *);
  572                 memset(ibh->hwaddr, 0, 4);
  573                 memcpy(ibh->hwaddr + 4, priv->local_gid.raw, sizeof(union ib_gid));
  574 
  575                 dev->if_input(dev, mb);
  576         } else {
  577                 if_inc_counter(dev, IFCOUNTER_IERRORS, 1);
  578         }
  579 repost:
  580         if (has_srq) {
  581                 if (unlikely(ipoib_cm_post_receive_srq(priv, wr_id)))
  582                         ipoib_warn(priv, "ipoib_cm_post_receive_srq failed "
  583                                    "for buf %d\n", wr_id);
  584         } else {
  585                 if (unlikely(ipoib_cm_post_receive_nonsrq(priv, p,
  586                                                           &priv->cm.rx_wr,
  587                                                           priv->cm.rx_sge,
  588                                                           wr_id))) {
  589                         --p->recv_count;
  590                         ipoib_warn(priv, "ipoib_cm_post_receive_nonsrq failed "
  591                                    "for buf %d\n", wr_id);
  592                 }
  593         }
  594 done:
  595         return;
  596 }
  597 
  598 static inline int post_send(struct ipoib_dev_priv *priv,
  599                             struct ipoib_cm_tx *tx,
  600                             struct ipoib_cm_tx_buf *tx_req,
  601                             unsigned int wr_id)
  602 {
  603         const struct ib_send_wr *bad_wr;
  604         struct mbuf *mb = tx_req->mb;
  605         u64 *mapping = tx_req->mapping;
  606         struct mbuf *m;
  607         int i;
  608 
  609         for (m = mb, i = 0; m != NULL; m = m->m_next, i++) {
  610                 priv->tx_sge[i].addr = mapping[i];
  611                 priv->tx_sge[i].length = m->m_len;
  612         }
  613         priv->tx_wr.wr.num_sge = i;
  614         priv->tx_wr.wr.wr_id = wr_id | IPOIB_OP_CM;
  615         priv->tx_wr.wr.opcode = IB_WR_SEND;
  616 
  617         return ib_post_send(tx->qp, &priv->tx_wr.wr, &bad_wr);
  618 }
  619 
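       /*
        * TX flow control: ipoib_cm_send() counts every posted WR in
        * priv->tx_outstanding and, once the ring is full (ipoib_sendq_size
        * entries), arms the send CQ and sets IFF_DRV_OACTIVE to stop the
        * interface queue.  ipoib_cm_handle_tx_wc() decrements the count as
        * completions arrive and clears IFF_DRV_OACTIVE once the ring drains
        * to half full.
        */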
  620 void ipoib_cm_send(struct ipoib_dev_priv *priv, struct mbuf *mb, struct ipoib_cm_tx *tx)
  621 {
  622         struct ipoib_cm_tx_buf *tx_req;
  623         struct ifnet *dev = priv->dev;
  624 
  625         if (unlikely(priv->tx_outstanding > MAX_SEND_CQE)) {
  626                 while (ipoib_poll_tx(priv, false))
  627                         ;       /* nothing */
  628         }
  629 
  630         m_adj(mb, sizeof(struct ipoib_pseudoheader));
  631         if (unlikely(mb->m_pkthdr.len > tx->mtu)) {
  632                 ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n",
  633                            mb->m_pkthdr.len, tx->mtu);
  634                 if_inc_counter(dev, IFCOUNTER_OERRORS, 1);
  635                 ipoib_cm_mb_too_long(priv, mb, IPOIB_CM_MTU(tx->mtu));
  636                 return;
  637         }
  638 
  639         ipoib_dbg_data(priv, "sending packet: head 0x%x length %d connection 0x%x\n",
  640                        tx->tx_head, mb->m_pkthdr.len, tx->qp->qp_num);
  641 
  642 
  643         /*
  644          * We put the mb into the tx_ring _before_ we call post_send()
  645          * because it's entirely possible that the completion handler will
  646          * run before we execute anything after the post_send().  That
  647          * means we have to make sure everything is properly recorded and
  648          * our state is consistent before we call post_send().
  649          */
  650         tx_req = &tx->tx_ring[tx->tx_head & (ipoib_sendq_size - 1)];
  651         tx_req->mb = mb;
  652         if (unlikely(ipoib_dma_map_tx(priv->ca, (struct ipoib_tx_buf *)tx_req,
  653             priv->cm.num_frags))) {
  654                 if_inc_counter(dev, IFCOUNTER_OERRORS, 1);
  655                 if (tx_req->mb)
  656                         m_freem(tx_req->mb);
  657                 return;
  658         }
  659 
  660         if (unlikely(post_send(priv, tx, tx_req, tx->tx_head & (ipoib_sendq_size - 1)))) {
  661                 ipoib_warn(priv, "post_send failed\n");
  662                 if_inc_counter(dev, IFCOUNTER_OERRORS, 1);
  663                 ipoib_dma_unmap_tx(priv->ca, (struct ipoib_tx_buf *)tx_req);
  664                 m_freem(mb);
  665         } else {
  666                 ++tx->tx_head;
  667 
  668                 if (++priv->tx_outstanding == ipoib_sendq_size) {
  669                         ipoib_dbg(priv, "TX ring 0x%x full, stopping kernel net queue\n",
  670                                   tx->qp->qp_num);
  671                         if (ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP))
  672                                 ipoib_warn(priv, "request notify on send CQ failed\n");
  673                         dev->if_drv_flags |= IFF_DRV_OACTIVE;
  674                 }
  675         }
  676 
  677 }
  678 
  679 void ipoib_cm_handle_tx_wc(struct ipoib_dev_priv *priv, struct ib_wc *wc)
  680 {
  681         struct ipoib_cm_tx *tx = wc->qp->qp_context;
  682         unsigned int wr_id = wc->wr_id & ~IPOIB_OP_CM;
  683         struct ifnet *dev = priv->dev;
  684         struct ipoib_cm_tx_buf *tx_req;
  685 
  686         ipoib_dbg_data(priv, "cm send completion: id %d, status: %d\n",
  687                        wr_id, wc->status);
  688 
  689         if (unlikely(wr_id >= ipoib_sendq_size)) {
  690                 ipoib_warn(priv, "cm send completion event with wrid %d (> %d)\n",
  691                            wr_id, ipoib_sendq_size);
  692                 return;
  693         }
  694 
  695         tx_req = &tx->tx_ring[wr_id];
  696 
  697         ipoib_dma_unmap_tx(priv->ca, (struct ipoib_tx_buf *)tx_req);
  698 
  699         /* FIXME: is this right? Shouldn't we only increment on success? */
  700         if_inc_counter(dev, IFCOUNTER_OPACKETS, 1);
  701 
  702         m_freem(tx_req->mb);
  703 
  704         ++tx->tx_tail;
  705         if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
  706             (dev->if_drv_flags & IFF_DRV_OACTIVE) != 0 &&
  707             test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
  708                 dev->if_drv_flags &= ~IFF_DRV_OACTIVE;
  709 
  710         if (wc->status != IB_WC_SUCCESS &&
  711             wc->status != IB_WC_WR_FLUSH_ERR) {
  712                 struct ipoib_path *path;
  713 
  714                 ipoib_dbg(priv, "failed cm send event "
  715                            "(status=%d, wrid=%d vend_err %x)\n",
  716                            wc->status, wr_id, wc->vendor_err);
  717 
  718                 path = tx->path;
  719 
  720                 if (path) {
  721                         path->cm = NULL;
  722                         rb_erase(&path->rb_node, &priv->path_tree);
  723                         list_del(&path->list);
  724                 }
  725 
  726                 if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
  727                         list_move(&tx->list, &priv->cm.reap_list);
  728                         queue_work(ipoib_workqueue, &priv->cm.reap_task);
  729                 }
  730 
  731                 clear_bit(IPOIB_FLAG_OPER_UP, &tx->flags);
  732         }
  733 
  734 }
  735 
  736 int ipoib_cm_dev_open(struct ipoib_dev_priv *priv)
  737 {
  738         int ret;
  739 
  740         if (!IPOIB_CM_SUPPORTED(IF_LLADDR(priv->dev)))
  741                 return 0;
  742 
  743         priv->cm.id = ib_create_cm_id(priv->ca, ipoib_cm_rx_handler, priv);
  744         if (IS_ERR(priv->cm.id)) {
  745                 printk(KERN_WARNING "%s: failed to create CM ID\n", priv->ca->name);
  746                 ret = PTR_ERR(priv->cm.id);
  747                 goto err_cm;
  748         }
  749 
  750         ret = ib_cm_listen(priv->cm.id, cpu_to_be64(IPOIB_CM_IETF_ID | priv->qp->qp_num), 0);
  751         if (ret) {
  752                 printk(KERN_WARNING "%s: failed to listen on ID 0x%llx\n", priv->ca->name,
  753                        IPOIB_CM_IETF_ID | priv->qp->qp_num);
  754                 goto err_listen;
  755         }
  756 
  757         return 0;
  758 
  759 err_listen:
  760         ib_destroy_cm_id(priv->cm.id);
  761 err_cm:
  762         priv->cm.id = NULL;
  763         return ret;
  764 }
  765 
  766 static void ipoib_cm_free_rx_reap_list(struct ipoib_dev_priv *priv)
  767 {
  768         struct ipoib_cm_rx *rx, *n;
  769         LIST_HEAD(list);
  770 
  771         spin_lock_irq(&priv->lock);
  772         list_splice_init(&priv->cm.rx_reap_list, &list);
  773         spin_unlock_irq(&priv->lock);
  774 
  775         list_for_each_entry_safe(rx, n, &list, list) {
  776                 ib_destroy_cm_id(rx->id);
  777                 ib_destroy_qp(rx->qp);
  778                 if (!ipoib_cm_has_srq(priv)) {
  779                         ipoib_cm_free_rx_ring(priv, rx->rx_ring);
  780                         spin_lock_irq(&priv->lock);
  781                         --priv->cm.nonsrq_conn_qp;
  782                         spin_unlock_irq(&priv->lock);
  783                 }
  784                 kfree(rx);
  785         }
  786 }
  787 
  788 void ipoib_cm_dev_stop(struct ipoib_dev_priv *priv)
  789 {
  790         struct ipoib_cm_rx *p;
  791         unsigned long begin;
  792         int ret;
  793 
  794         if (!IPOIB_CM_SUPPORTED(IF_LLADDR(priv->dev)) || !priv->cm.id)
  795                 return;
  796 
  797         ib_destroy_cm_id(priv->cm.id);
  798         priv->cm.id = NULL;
  799 
  800         cancel_work_sync(&priv->cm.rx_reap_task);
  801 
  802         spin_lock_irq(&priv->lock);
  803         while (!list_empty(&priv->cm.passive_ids)) {
  804                 p = list_entry(priv->cm.passive_ids.next, typeof(*p), list);
  805                 list_move(&p->list, &priv->cm.rx_error_list);
  806                 p->state = IPOIB_CM_RX_ERROR;
  807                 spin_unlock_irq(&priv->lock);
  808                 ret = ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE);
  809                 if (ret)
  810                         ipoib_warn(priv, "unable to move qp to error state: %d\n", ret);
  811                 spin_lock_irq(&priv->lock);
  812         }
  813 
  814         /* Wait for all RX to be drained */
  815         begin = jiffies;
  816 
  817         while (!list_empty(&priv->cm.rx_error_list) ||
  818                !list_empty(&priv->cm.rx_flush_list) ||
  819                !list_empty(&priv->cm.rx_drain_list)) {
  820                 if (time_after(jiffies, begin + 5 * HZ)) {
  821                         ipoib_warn(priv, "RX drain timing out\n");
  822 
  823                         /*
  824                          * assume the HW is wedged and just free up everything.
  825                          */
  826                         list_splice_init(&priv->cm.rx_flush_list,
  827                                          &priv->cm.rx_reap_list);
  828                         list_splice_init(&priv->cm.rx_error_list,
  829                                          &priv->cm.rx_reap_list);
  830                         list_splice_init(&priv->cm.rx_drain_list,
  831                                          &priv->cm.rx_reap_list);
  832                         break;
  833                 }
  834                 spin_unlock_irq(&priv->lock);
  835                 msleep(1);
  836                 ipoib_drain_cq(priv);
  837                 spin_lock_irq(&priv->lock);
  838         }
  839 
  840         spin_unlock_irq(&priv->lock);
  841 
  842         ipoib_cm_free_rx_reap_list(priv);
  843 
  844         cancel_delayed_work_sync(&priv->cm.stale_task);
  845 }
  846 
  847 static int ipoib_cm_rep_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
  848 {
  849         struct ipoib_cm_tx *p = cm_id->context;
  850         struct ipoib_dev_priv *priv = p->priv;
  851         struct ipoib_cm_data *data = event->private_data;
  852         struct epoch_tracker et;
  853         struct ifqueue mbqueue;
  854         struct ib_qp_attr qp_attr;
  855         int qp_attr_mask, ret;
  856         struct mbuf *mb;
  857 
  858         ipoib_dbg(priv, "cm rep handler\n");
  859         p->mtu = be32_to_cpu(data->mtu);
  860 
  861         if (p->mtu <= IPOIB_ENCAP_LEN) {
  862                 ipoib_warn(priv, "Rejecting connection: mtu %d <= %d\n",
  863                            p->mtu, IPOIB_ENCAP_LEN);
  864                 return -EINVAL;
  865         }
  866 
  867         qp_attr.qp_state = IB_QPS_RTR;
  868         ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
  869         if (ret) {
  870                 ipoib_warn(priv, "failed to init QP attr for RTR: %d\n", ret);
  871                 return ret;
  872         }
  873 
  874         qp_attr.rq_psn = 0 /* FIXME */;
  875         ret = ib_modify_qp(p->qp, &qp_attr, qp_attr_mask);
  876         if (ret) {
  877                 ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret);
  878                 return ret;
  879         }
  880 
  881         qp_attr.qp_state = IB_QPS_RTS;
  882         ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
  883         if (ret) {
  884                 ipoib_warn(priv, "failed to init QP attr for RTS: %d\n", ret);
  885                 return ret;
  886         }
  887         ret = ib_modify_qp(p->qp, &qp_attr, qp_attr_mask);
  888         if (ret) {
  889                 ipoib_warn(priv, "failed to modify QP to RTS: %d\n", ret);
  890                 return ret;
  891         }
  892 
  893         bzero(&mbqueue, sizeof(mbqueue));
  894 
  895         spin_lock_irq(&priv->lock);
  896         set_bit(IPOIB_FLAG_OPER_UP, &p->flags);
  897         if (p->path)
  898                 for (;;) {
  899                         _IF_DEQUEUE(&p->path->queue, mb);
  900                         if (mb == NULL)
  901                                 break;
  902                         _IF_ENQUEUE(&mbqueue, mb);
  903                 }
  904         spin_unlock_irq(&priv->lock);
  905 
  906         NET_EPOCH_ENTER(et);
  907         for (;;) {
  908                 struct ifnet *dev = p->priv->dev;
  909                 _IF_DEQUEUE(&mbqueue, mb);
  910                 if (mb == NULL)
  911                         break;
  912                 mb->m_pkthdr.rcvif = dev;
  913                 if (dev->if_transmit(dev, mb))
  914                         ipoib_warn(priv, "dev_queue_xmit failed "
  915                                    "to requeue packet\n");
  916         }
  917         NET_EPOCH_EXIT(et);
  918 
  919         ret = ib_send_cm_rtu(cm_id, NULL, 0);
  920         if (ret) {
  921                 ipoib_warn(priv, "failed to send RTU: %d\n", ret);
  922                 return ret;
  923         }
  924         return 0;
  925 }
  926 
  927 static struct ib_qp *ipoib_cm_create_tx_qp(struct ipoib_dev_priv *priv,
  928     struct ipoib_cm_tx *tx)
  929 {
  930         struct ib_qp_init_attr attr = {
  931                 .send_cq                = priv->send_cq,
  932                 .recv_cq                = priv->recv_cq,
  933                 .srq                    = priv->cm.srq,
  934                 .cap.max_send_wr        = ipoib_sendq_size,
  935                 .cap.max_send_sge       = priv->cm.num_frags,
  936                 .sq_sig_type            = IB_SIGNAL_ALL_WR,
  937                 .qp_type                = IB_QPT_RC,
  938                 .qp_context             = tx
  939         };
  940 
  941         return ib_create_qp(priv->pd, &attr);
  942 }
  943 
  944 static int ipoib_cm_send_req(struct ipoib_dev_priv *priv,
  945                              struct ib_cm_id *id, struct ib_qp *qp,
  946                              u32 qpn,
  947                              struct ib_sa_path_rec *pathrec)
  948 {
  949         struct ipoib_cm_data data = {};
  950         struct ib_cm_req_param req = {};
  951 
  952         ipoib_dbg(priv, "cm send req\n");
  953 
  954         data.qpn = cpu_to_be32(priv->qp->qp_num);
  955         data.mtu = cpu_to_be32(priv->cm.max_cm_mtu);
  956 
  957         req.primary_path                = pathrec;
  958         req.alternate_path              = NULL;
  959         req.service_id                  = cpu_to_be64(IPOIB_CM_IETF_ID | qpn);
  960         req.qp_num                      = qp->qp_num;
  961         req.qp_type                     = qp->qp_type;
  962         req.private_data                = &data;
  963         req.private_data_len            = sizeof data;
  964         req.flow_control                = 0;
  965 
  966         req.starting_psn                = 0; /* FIXME */
  967 
  968         /*
  969          * Pick some arbitrary defaults here; we could make these
  970          * module parameters if anyone cared about setting them.
  971          */
  972         req.responder_resources         = 4;
  973         req.remote_cm_response_timeout  = 20;
  974         req.local_cm_response_timeout   = 20;
  975         req.retry_count                 = 0; /* RFC draft warns against retries */
  976         req.rnr_retry_count             = 0; /* RFC draft warns against retries */
  977         req.max_cm_retries              = 15;
  978         req.srq                         = ipoib_cm_has_srq(priv);
  979         return ib_send_cm_req(id, &req);
  980 }
  981 
  982 static int ipoib_cm_modify_tx_init(struct ipoib_dev_priv *priv,
  983                                   struct ib_cm_id *cm_id, struct ib_qp *qp)
  984 {
  985         struct ib_qp_attr qp_attr;
  986         int qp_attr_mask, ret;
  987         ret = ib_find_pkey(priv->ca, priv->port, priv->pkey, &qp_attr.pkey_index);
  988         if (ret) {
  989                 ipoib_warn(priv, "pkey 0x%x not found: %d\n", priv->pkey, ret);
  990                 return ret;
  991         }
  992 
  993         qp_attr.qp_state = IB_QPS_INIT;
  994         qp_attr.qp_access_flags = IB_ACCESS_LOCAL_WRITE;
  995         qp_attr.port_num = priv->port;
  996         qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX | IB_QP_PORT;
  997 
  998         ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
  999         if (ret) {
 1000                 ipoib_warn(priv, "failed to modify tx QP to INIT: %d\n", ret);
 1001                 return ret;
 1002         }
 1003         return 0;
 1004 }
 1005 
 1006 static int ipoib_cm_tx_init(struct ipoib_cm_tx *p, u32 qpn,
 1007                             struct ib_sa_path_rec *pathrec)
 1008 {
 1009         struct ipoib_dev_priv *priv = p->priv;
 1010         int ret;
 1011 
 1012         p->tx_ring = kzalloc(ipoib_sendq_size * sizeof *p->tx_ring, GFP_KERNEL);
 1013         if (!p->tx_ring) {
 1014                 ipoib_warn(priv, "failed to allocate tx ring\n");
 1015                 ret = -ENOMEM;
 1016                 goto err_tx;
 1017         }
 1018         memset(p->tx_ring, 0, ipoib_sendq_size * sizeof *p->tx_ring);
 1019 
 1020         p->qp = ipoib_cm_create_tx_qp(p->priv, p);
 1021         if (IS_ERR(p->qp)) {
 1022                 ret = PTR_ERR(p->qp);
 1023                 ipoib_warn(priv, "failed to allocate tx qp: %d\n", ret);
 1024                 goto err_qp;
 1025         }
 1026 
 1027         p->id = ib_create_cm_id(priv->ca, ipoib_cm_tx_handler, p);
 1028         if (IS_ERR(p->id)) {
 1029                 ret = PTR_ERR(p->id);
 1030                 ipoib_warn(priv, "failed to create tx cm id: %d\n", ret);
 1031                 goto err_id;
 1032         }
 1033 
 1034         ret = ipoib_cm_modify_tx_init(p->priv, p->id,  p->qp);
 1035         if (ret) {
 1036                 ipoib_warn(priv, "failed to modify tx qp to rtr: %d\n", ret);
 1037                 goto err_modify;
 1038         }
 1039 
 1040         ret = ipoib_cm_send_req(p->priv, p->id, p->qp, qpn, pathrec);
 1041         if (ret) {
 1042                 ipoib_warn(priv, "failed to send cm req: %d\n", ret);
 1043                 goto err_send_cm;
 1044         }
 1045 
 1046         ipoib_dbg(priv, "Request connection 0x%x for gid %pI6 qpn 0x%x\n",
 1047                   p->qp->qp_num, pathrec->dgid.raw, qpn);
 1048 
 1049         return 0;
 1050 
 1051 err_send_cm:
 1052 err_modify:
 1053         ib_destroy_cm_id(p->id);
 1054 err_id:
 1055         p->id = NULL;
 1056         ib_destroy_qp(p->qp);
 1057 err_qp:
 1058         p->qp = NULL;
 1059         kfree(p->tx_ring);
 1060 err_tx:
 1061         return ret;
 1062 }
 1063 
 1064 static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p)
 1065 {
 1066         struct ipoib_dev_priv *priv = p->priv;
 1067         struct ifnet *dev = priv->dev;
 1068         struct ipoib_cm_tx_buf *tx_req;
 1069         unsigned long begin;
 1070 
 1071         ipoib_dbg(priv, "Destroy active connection 0x%x head 0x%x tail 0x%x\n",
 1072                   p->qp ? p->qp->qp_num : 0, p->tx_head, p->tx_tail);
 1073 
 1074         if (p->path)
 1075                 ipoib_path_free(priv, p->path);
 1076 
 1077         if (p->id)
 1078                 ib_destroy_cm_id(p->id);
 1079 
 1080         if (p->tx_ring) {
 1081                 /* Wait for all sends to complete */
 1082                 begin = jiffies;
 1083                 while ((int) p->tx_tail - (int) p->tx_head < 0) {
 1084                         if (time_after(jiffies, begin + 5 * HZ)) {
 1085                                 ipoib_warn(priv, "timing out; %d sends not completed\n",
 1086                                            p->tx_head - p->tx_tail);
 1087                                 goto timeout;
 1088                         }
 1089 
 1090                         msleep(1);
 1091                 }
 1092         }
 1093 
 1094 timeout:
 1095 
 1096         while ((int) p->tx_tail - (int) p->tx_head < 0) {
 1097                 tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 1)];
 1098                 ipoib_dma_unmap_tx(priv->ca, (struct ipoib_tx_buf *)tx_req);
 1099                 m_freem(tx_req->mb);
 1100                 ++p->tx_tail;
 1101                 if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
 1102                     (dev->if_drv_flags & IFF_DRV_OACTIVE) != 0 &&
 1103                     test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
 1104                         dev->if_drv_flags &= ~IFF_DRV_OACTIVE;
 1105         }
 1106 
 1107         if (p->qp)
 1108                 ib_destroy_qp(p->qp);
 1109 
 1110         kfree(p->tx_ring);
 1111         kfree(p);
 1112 }
 1113 
 1114 static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,
 1115                                struct ib_cm_event *event)
 1116 {
 1117         struct ipoib_cm_tx *tx = cm_id->context;
 1118         struct ipoib_dev_priv *priv = tx->priv;
 1119         struct ipoib_path *path;
 1120         unsigned long flags;
 1121         int ret;
 1122 
 1123         switch (event->event) {
 1124         case IB_CM_DREQ_RECEIVED:
 1125                 ipoib_dbg(priv, "DREQ received.\n");
 1126                 ib_send_cm_drep(cm_id, NULL, 0);
 1127                 break;
 1128         case IB_CM_REP_RECEIVED:
 1129                 ipoib_dbg(priv, "REP received.\n");
 1130                 ret = ipoib_cm_rep_handler(cm_id, event);
 1131                 if (ret)
 1132                         ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED,
 1133                                        NULL, 0, NULL, 0);
 1134                 break;
 1135         case IB_CM_REQ_ERROR:
 1136         case IB_CM_REJ_RECEIVED:
 1137         case IB_CM_TIMEWAIT_EXIT:
 1138                 ipoib_dbg(priv, "CM error %d.\n", event->event);
 1139                 spin_lock_irqsave(&priv->lock, flags);
 1140                 path = tx->path;
 1141 
 1142                 if (path) {
 1143                         path->cm = NULL;
 1144                         tx->path = NULL;
 1145                         rb_erase(&path->rb_node, &priv->path_tree);
 1146                         list_del(&path->list);
 1147                 }
 1148 
 1149                 if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
 1150                         list_move(&tx->list, &priv->cm.reap_list);
 1151                         queue_work(ipoib_workqueue, &priv->cm.reap_task);
 1152                 }
 1153 
 1154                 spin_unlock_irqrestore(&priv->lock, flags);
 1155                 if (path)
 1156                         ipoib_path_free(tx->priv, path);
 1157                 break;
 1158         default:
 1159                 break;
 1160         }
 1161 
 1162         return 0;
 1163 }
 1164 
 1165 struct ipoib_cm_tx *ipoib_cm_create_tx(struct ipoib_dev_priv *priv,
 1166     struct ipoib_path *path)
 1167 {
 1168         struct ipoib_cm_tx *tx;
 1169 
 1170         tx = kzalloc(sizeof *tx, GFP_ATOMIC);
 1171         if (!tx)
 1172                 return NULL;
 1173 
 1174         ipoib_dbg(priv, "Creating cm tx\n");
 1175         path->cm = tx;
 1176         tx->path = path;
 1177         tx->priv = priv;
 1178         list_add(&tx->list, &priv->cm.start_list);
 1179         set_bit(IPOIB_FLAG_INITIALIZED, &tx->flags);
 1180         queue_work(ipoib_workqueue, &priv->cm.start_task);
 1181         return tx;
 1182 }
 1183 
 1184 void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx)
 1185 {
 1186         struct ipoib_dev_priv *priv = tx->priv;
 1187         if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
 1188                 spin_lock(&priv->lock);
 1189                 list_move(&tx->list, &priv->cm.reap_list);
 1190                 spin_unlock(&priv->lock);
 1191                 queue_work(ipoib_workqueue, &priv->cm.reap_task);
 1192                 ipoib_dbg(priv, "Reap connection for gid %pI6\n",
 1193                           tx->path->pathrec.dgid.raw);
 1194                 tx->path = NULL;
 1195         }
 1196 }
 1197 
 1198 static void ipoib_cm_tx_start(struct work_struct *work)
 1199 {
 1200         struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
 1201                                                    cm.start_task);
 1202         struct ipoib_path *path;
 1203         struct ipoib_cm_tx *p;
 1204         unsigned long flags;
 1205         int ret;
 1206 
 1207         struct ib_sa_path_rec pathrec;
 1208         u32 qpn;
 1209 
 1210         ipoib_dbg(priv, "cm start task\n");
 1211         spin_lock_irqsave(&priv->lock, flags);
 1212 
 1213         while (!list_empty(&priv->cm.start_list)) {
 1214                 p = list_entry(priv->cm.start_list.next, typeof(*p), list);
 1215                 list_del_init(&p->list);
 1216                 path = p->path;
 1217                 qpn = IPOIB_QPN(path->hwaddr);
 1218                 memcpy(&pathrec, &p->path->pathrec, sizeof pathrec);
 1219 
 1220                 spin_unlock_irqrestore(&priv->lock, flags);
 1221 
 1222                 ret = ipoib_cm_tx_init(p, qpn, &pathrec);
 1223 
 1224                 spin_lock_irqsave(&priv->lock, flags);
 1225 
 1226                 if (ret) {
 1227                         path = p->path;
 1228                         if (path) {
 1229                                 path->cm = NULL;
 1230                                 rb_erase(&path->rb_node, &priv->path_tree);
 1231                                 list_del(&path->list);
 1232                                 ipoib_path_free(priv, path);
 1233                         }
 1234                         list_del(&p->list);
 1235                         kfree(p);
 1236                 }
 1237         }
 1238 
 1239         spin_unlock_irqrestore(&priv->lock, flags);
 1240 }
 1241 
 1242 static void ipoib_cm_tx_reap(struct work_struct *work)
 1243 {
 1244         struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
 1245                                                    cm.reap_task);
 1246         struct ipoib_cm_tx *p;
 1247         unsigned long flags;
 1248 
 1249         spin_lock_irqsave(&priv->lock, flags);
 1250 
 1251         while (!list_empty(&priv->cm.reap_list)) {
 1252                 p = list_entry(priv->cm.reap_list.next, typeof(*p), list);
 1253                 list_del(&p->list);
 1254                 spin_unlock_irqrestore(&priv->lock, flags);
 1255                 ipoib_cm_tx_destroy(p);
 1256                 spin_lock_irqsave(&priv->lock, flags);
 1257         }
 1258 
 1259         spin_unlock_irqrestore(&priv->lock, flags);
 1260 }
 1261 
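       /*
        * Path-MTU feedback: packets larger than the connection MTU are handed
        * to ipoib_cm_mb_too_long(), which queues the mbuf and schedules this
        * task.  ipoib_cm_mb_reap() then emits an ICMP "fragmentation needed"
        * or ICMPv6 "packet too big" error using the interface's multicast
        * MTU, and frees packets of any other protocol.
        */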
 1262 static void ipoib_cm_mb_reap(struct work_struct *work)
 1263 {
 1264         struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
 1265                                                    cm.mb_task);
 1266         struct mbuf *mb;
 1267         unsigned long flags;
 1268 #if defined(INET) || defined(INET6)
 1269         unsigned mtu = priv->mcast_mtu;
 1270 #endif
 1271         uint16_t proto;
 1272 
 1273         spin_lock_irqsave(&priv->lock, flags);
 1274 
 1275         CURVNET_SET_QUIET(priv->dev->if_vnet);
 1276 
 1277         for (;;) {
 1278                 IF_DEQUEUE(&priv->cm.mb_queue, mb);
 1279                 if (mb == NULL)
 1280                         break;
 1281                 spin_unlock_irqrestore(&priv->lock, flags);
 1282 
 1283                 proto = htons(*mtod(mb, uint16_t *));
 1284                 m_adj(mb, IPOIB_ENCAP_LEN);
 1285                 switch (proto) {
 1286 #if defined(INET)
 1287                 case ETHERTYPE_IP:
 1288                         icmp_error(mb, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG, 0, mtu);
 1289                         break;
 1290 #endif
 1291 #if defined(INET6)
 1292                 case ETHERTYPE_IPV6:
 1293                         icmp6_error(mb, ICMP6_PACKET_TOO_BIG, 0, mtu);
 1294                         break;
 1295 #endif
 1296                 default:
 1297                         m_freem(mb);
 1298                 }
 1299 
 1300                 spin_lock_irqsave(&priv->lock, flags);
 1301         }
 1302 
 1303         CURVNET_RESTORE();
 1304 
 1305         spin_unlock_irqrestore(&priv->lock, flags);
 1306 }
 1307 
 1308 void
 1309 ipoib_cm_mb_too_long(struct ipoib_dev_priv *priv, struct mbuf *mb, unsigned int mtu)
 1310 {
 1311         int e = priv->cm.mb_queue.ifq_len; 
 1312 
 1313         IF_ENQUEUE(&priv->cm.mb_queue, mb);
 1314         if (e == 0)
 1315                 queue_work(ipoib_workqueue, &priv->cm.mb_task);
 1316 }
 1317 
 1318 static void ipoib_cm_rx_reap(struct work_struct *work)
 1319 {
 1320         ipoib_cm_free_rx_reap_list(container_of(work, struct ipoib_dev_priv,
 1321                                                 cm.rx_reap_task));
 1322 }
 1323 
 1324 static void ipoib_cm_stale_task(struct work_struct *work)
 1325 {
 1326         struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
 1327                                                    cm.stale_task.work);
 1328         struct ipoib_cm_rx *p;
 1329         int ret;
 1330 
 1331         spin_lock_irq(&priv->lock);
 1332         while (!list_empty(&priv->cm.passive_ids)) {
 1333                 /* List is sorted by LRU, start from tail,
 1334                  * stop when we see a recently used entry */
 1335                 p = list_entry(priv->cm.passive_ids.prev, typeof(*p), list);
 1336                 if (time_before_eq(jiffies, p->jiffies + IPOIB_CM_RX_TIMEOUT))
 1337                         break;
 1338                 list_move(&p->list, &priv->cm.rx_error_list);
 1339                 p->state = IPOIB_CM_RX_ERROR;
 1340                 spin_unlock_irq(&priv->lock);
 1341                 ret = ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE);
 1342                 if (ret)
 1343                         ipoib_warn(priv, "unable to move qp to error state: %d\n", ret);
 1344                 spin_lock_irq(&priv->lock);
 1345         }
 1346 
 1347         if (!list_empty(&priv->cm.passive_ids))
 1348                 queue_delayed_work(ipoib_workqueue,
 1349                                    &priv->cm.stale_task, IPOIB_CM_RX_DELAY);
 1350         spin_unlock_irq(&priv->lock);
 1351 }
 1352 
 1353 
 1354 static void ipoib_cm_create_srq(struct ipoib_dev_priv *priv, int max_sge)
 1355 {
 1356         struct ib_srq_init_attr srq_init_attr = {
 1357                 .attr = {
 1358                         .max_wr  = ipoib_recvq_size,
 1359                         .max_sge = max_sge
 1360                 }
 1361         };
 1362 
 1363         priv->cm.srq = ib_create_srq(priv->pd, &srq_init_attr);
 1364         if (IS_ERR(priv->cm.srq)) {
 1365                 if (PTR_ERR(priv->cm.srq) != -ENOSYS)
 1366                         printk(KERN_WARNING "%s: failed to allocate SRQ, error %ld\n",
 1367                                priv->ca->name, PTR_ERR(priv->cm.srq));
 1368                 priv->cm.srq = NULL;
 1369                 return;
 1370         }
 1371 
 1372         priv->cm.srq_ring = kzalloc(ipoib_recvq_size * sizeof *priv->cm.srq_ring, GFP_KERNEL);
 1373         if (!priv->cm.srq_ring) {
 1374                 printk(KERN_WARNING "%s: failed to allocate CM SRQ ring (%d entries)\n",
 1375                        priv->ca->name, ipoib_recvq_size);
 1376                 ib_destroy_srq(priv->cm.srq);
 1377                 priv->cm.srq = NULL;
 1378                 return;
 1379         }
 1380 
 1381         memset(priv->cm.srq_ring, 0, ipoib_recvq_size * sizeof *priv->cm.srq_ring);
 1382 }
 1383 
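       /*
        * ipoib_cm_dev_init() prefers a shared receive queue: with an SRQ,
        * max_cm_mtu is sized to max_srq_sge MJUMPAGESIZE fragments and one
        * srq_ring serves every passive connection; without one, each
        * connection gets its own receive ring (capped at ipoib_max_conn_qp
        * connections) and the fixed IPOIB_CM_MAX_MTU is used.  The first
        * link-address byte is set to IPOIB_FLAGS_RC to advertise
        * connected-mode support.
        */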
 1384 int ipoib_cm_dev_init(struct ipoib_dev_priv *priv)
 1385 {
 1386         struct ifnet *dev = priv->dev;
 1387         int i;
 1388         int max_srq_sge;
 1389 
 1390         INIT_LIST_HEAD(&priv->cm.passive_ids);
 1391         INIT_LIST_HEAD(&priv->cm.reap_list);
 1392         INIT_LIST_HEAD(&priv->cm.start_list);
 1393         INIT_LIST_HEAD(&priv->cm.rx_error_list);
 1394         INIT_LIST_HEAD(&priv->cm.rx_flush_list);
 1395         INIT_LIST_HEAD(&priv->cm.rx_drain_list);
 1396         INIT_LIST_HEAD(&priv->cm.rx_reap_list);
 1397         INIT_WORK(&priv->cm.start_task, ipoib_cm_tx_start);
 1398         INIT_WORK(&priv->cm.reap_task, ipoib_cm_tx_reap);
 1399         INIT_WORK(&priv->cm.mb_task, ipoib_cm_mb_reap);
 1400         INIT_WORK(&priv->cm.rx_reap_task, ipoib_cm_rx_reap);
 1401         INIT_DELAYED_WORK(&priv->cm.stale_task, ipoib_cm_stale_task);
 1402 
 1403         bzero(&priv->cm.mb_queue, sizeof(priv->cm.mb_queue));
 1404         mtx_init(&priv->cm.mb_queue.ifq_mtx,
 1405             dev->if_xname, "if send queue", MTX_DEF);
 1406 
 1407         max_srq_sge = priv->ca->attrs.max_srq_sge;
 1408 
 1409         ipoib_dbg(priv, "max_srq_sge=%d\n", max_srq_sge);
 1410 
 1411         max_srq_sge = min_t(int, IPOIB_CM_RX_SG, max_srq_sge);
 1412         ipoib_cm_create_srq(priv, max_srq_sge);
 1413         if (ipoib_cm_has_srq(priv)) {
 1414                 priv->cm.max_cm_mtu = max_srq_sge * MJUMPAGESIZE;
 1415                 priv->cm.num_frags  = max_srq_sge;
 1416                 ipoib_dbg(priv, "max_cm_mtu = 0x%x, num_frags=%d\n",
 1417                           priv->cm.max_cm_mtu, priv->cm.num_frags);
 1418         } else {
 1419                 priv->cm.max_cm_mtu = IPOIB_CM_MAX_MTU;
 1420                 priv->cm.num_frags  = IPOIB_CM_RX_SG;
 1421         }
 1422 
 1423         ipoib_cm_init_rx_wr(priv, &priv->cm.rx_wr, priv->cm.rx_sge);
 1424 
 1425         if (ipoib_cm_has_srq(priv)) {
 1426                 for (i = 0; i < ipoib_recvq_size; ++i) {
 1427                         if (!ipoib_cm_alloc_rx_mb(priv, &priv->cm.srq_ring[i])) {
 1428                                 ipoib_warn(priv, "failed to allocate "
 1429                                            "receive buffer %d\n", i);
 1430                                 ipoib_cm_dev_cleanup(priv);
 1431                                 return -ENOMEM;
 1432                         }
 1433 
 1434                         if (ipoib_cm_post_receive_srq(priv, i)) {
 1435                                 ipoib_warn(priv, "ipoib_cm_post_receive_srq "
 1436                                            "failed for buf %d\n", i);
 1437                                 ipoib_cm_dev_cleanup(priv);
 1438                                 return -EIO;
 1439                         }
 1440                 }
 1441         }
 1442 
 1443         IF_LLADDR(priv->dev)[0] = IPOIB_FLAGS_RC;
 1444         return 0;
 1445 }
 1446 
 1447 void ipoib_cm_dev_cleanup(struct ipoib_dev_priv *priv)
 1448 {
 1449         int ret;
 1450 
 1451         if (!priv->cm.srq)
 1452                 return;
 1453 
 1454         ipoib_dbg(priv, "Cleanup ipoib connected mode.\n");
 1455 
 1456         ret = ib_destroy_srq(priv->cm.srq);
 1457         if (ret)
 1458                 ipoib_warn(priv, "ib_destroy_srq failed: %d\n", ret);
 1459 
 1460         priv->cm.srq = NULL;
 1461         if (!priv->cm.srq_ring)
 1462                 return;
 1463 
 1464         ipoib_cm_free_rx_ring(priv, priv->cm.srq_ring);
 1465         priv->cm.srq_ring = NULL;
 1466 
 1467         mtx_destroy(&priv->cm.mb_queue.ifq_mtx);
 1468 }
 1469 
 1470 #endif /* CONFIG_INFINIBAND_IPOIB_CM */
