The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/dev/mlx4/mlx4_core/mlx4_catas.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*
    2  * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
    3  * Copyright (c) 2007, 2008, 2014 Mellanox Technologies. All rights reserved.
    4  *
    5  * This software is available to you under a choice of one of two
    6  * licenses.  You may choose to be licensed under the terms of the GNU
    7  * General Public License (GPL) Version 2, available from the file
    8  * COPYING in the main directory of this source tree, or the
    9  * OpenIB.org BSD license below:
   10  *
   11  *     Redistribution and use in source and binary forms, with or
   12  *     without modification, are permitted provided that the following
   13  *     conditions are met:
   14  *
   15  *      - Redistributions of source code must retain the above
   16  *        copyright notice, this list of conditions and the following
   17  *        disclaimer.
   18  *
   19  *      - Redistributions in binary form must reproduce the above
   20  *        copyright notice, this list of conditions and the following
   21  *        disclaimer in the documentation and/or other materials
   22  *        provided with the distribution.
   23  *
   24  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   25  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   26  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   27  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
   28  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   29  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   30  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
   31  * SOFTWARE.
   32  */
   33 
   34 #define LINUXKPI_PARAM_PREFIX mlx4_
   35 
   36 #include <linux/workqueue.h>
   37 #include <linux/module.h>
   38 
   39 #include <asm/byteorder.h>
   40 
   41 #include "mlx4.h"
   42 
   43 #define         MLX4_CATAS_POLL_INTERVAL        (5 * HZ)
   44 
   45 
   46 
   47 int mlx4_internal_err_reset = 1; /* if zero, mlx4_enter_error_state() returns at once and poll_catas() queues no reset work */
   48 module_param_named(internal_err_reset, mlx4_internal_err_reset,  int, 0644);
   49 MODULE_PARM_DESC(internal_err_reset,
   50                  "Reset device on internal errors if non-zero (default 1)");
   51 
   52 static int read_vendor_id(struct mlx4_dev *dev)
   53 {
   54         u16 vendor_id = 0;
   55         int ret;
   56 
   57         ret = pci_read_config_word(dev->persist->pdev, 0, &vendor_id);
   58         if (ret) {
   59                 mlx4_err(dev, "Failed to read vendor ID, ret=%d\n", ret);
   60                 return ret;
   61         }
   62 
   63         if (vendor_id == 0xffff) {
   64                 mlx4_err(dev, "PCI can't be accessed to read vendor id\n");
   65                 return -EINVAL;
   66         }
   67 
   68         return 0;
   69 }
   70 
   71 static int mlx4_reset_master(struct mlx4_dev *dev)
   72 {
   73         int err = 0;
   74 
   75         if (mlx4_is_master(dev))
   76                 mlx4_report_internal_err_comm_event(dev);
   77 
   78         if (!pci_channel_offline(dev->persist->pdev)) {
   79                 err = read_vendor_id(dev);
   80                 /* If PCI can't be accessed to read vendor ID we assume that its
   81                  * link was disabled and chip was already reset.
   82                  */
   83                 if (err)
   84                         return 0;
   85 
   86                 err = mlx4_reset(dev);
   87                 if (err)
   88                         mlx4_err(dev, "Fail to reset HCA\n");
   89         }
   90 
   91         return err;
   92 }
   93 
   94 static int mlx4_reset_slave(struct mlx4_dev *dev)
   95 {
   96 #define COM_CHAN_RST_REQ_OFFSET 0x10
   97 #define COM_CHAN_RST_ACK_OFFSET 0x08
   98 
   99         u32 comm_flags;
  100         u32 rst_req;
  101         u32 rst_ack;
  102         unsigned long end;
  103         struct mlx4_priv *priv = mlx4_priv(dev);
  104 
  105         if (pci_channel_offline(dev->persist->pdev))
  106                 return 0;
  107 
  108         comm_flags = swab32(readl((__iomem char *)priv->mfunc.comm +
  109                                   MLX4_COMM_CHAN_FLAGS));
  110         if (comm_flags == 0xffffffff) {
  111                 mlx4_err(dev, "VF reset is not needed\n");
  112                 return 0;
  113         }
  114 
  115         if (!(dev->caps.vf_caps & MLX4_VF_CAP_FLAG_RESET)) {
  116                 mlx4_err(dev, "VF reset is not supported\n");
  117                 return -EOPNOTSUPP;
  118         }
  119 
  120         rst_req = (comm_flags & (u32)(1 << COM_CHAN_RST_REQ_OFFSET)) >>
  121                 COM_CHAN_RST_REQ_OFFSET;
  122         rst_ack = (comm_flags & (u32)(1 << COM_CHAN_RST_ACK_OFFSET)) >>
  123                 COM_CHAN_RST_ACK_OFFSET;
  124         if (rst_req != rst_ack) {
  125                 mlx4_err(dev, "Communication channel isn't sync, fail to send reset\n");
  126                 return -EIO;
  127         }
  128 
  129         rst_req ^= 1;
  130         mlx4_warn(dev, "VF is sending reset request to Firmware\n");
  131         comm_flags = rst_req << COM_CHAN_RST_REQ_OFFSET;
  132         __raw_writel((__force u32)cpu_to_be32(comm_flags),
  133                      (__iomem char *)priv->mfunc.comm + MLX4_COMM_CHAN_FLAGS);
  134         /* Make sure that our comm channel write doesn't
  135          * get mixed in with writes from another CPU.
  136          */
  137         mmiowb();
  138 
  139         end = msecs_to_jiffies(MLX4_COMM_TIME) + jiffies;
  140         while (time_before(jiffies, end)) {
  141                 comm_flags = swab32(readl((__iomem char *)priv->mfunc.comm +
  142                                           MLX4_COMM_CHAN_FLAGS));
  143                 rst_ack = (comm_flags & (u32)(1 << COM_CHAN_RST_ACK_OFFSET)) >>
  144                         COM_CHAN_RST_ACK_OFFSET;
  145 
  146                 /* Reading rst_req again since the communication channel can
  147                  * be reset at any time by the PF and all its bits will be
  148                  * set to zero.
  149                  */
  150                 rst_req = (comm_flags & (u32)(1 << COM_CHAN_RST_REQ_OFFSET)) >>
  151                         COM_CHAN_RST_REQ_OFFSET;
  152 
  153                 if (rst_ack == rst_req) {
  154                         mlx4_warn(dev, "VF Reset succeed\n");
  155                         return 0;
  156                 }
  157                 cond_resched();
  158         }
  159         mlx4_err(dev, "Fail to send reset over the communication channel\n");
  160         return -ETIMEDOUT;
  161 }
  162 
  163 static int mlx4_comm_internal_err(u32 slave_read)
  164 {
  165         return (u32)COMM_CHAN_EVENT_INTERNAL_ERR ==
  166                 (slave_read & (u32)COMM_CHAN_EVENT_INTERNAL_ERR) ? 1 : 0;
  167 }
  168 
  169 void mlx4_enter_error_state(struct mlx4_dev_persistent *persist)
  170 {
  171         int err;
  172         struct mlx4_dev *dev;
  173 
  174         if (!mlx4_internal_err_reset)
  175                 return;
  176 
  177         mutex_lock(&persist->device_state_mutex);
  178         if (persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR)
  179                 goto out;
  180 
  181         dev = persist->dev;
  182         mlx4_err(dev, "device is going to be reset\n");
  183         if (mlx4_is_slave(dev))
  184                 err = mlx4_reset_slave(dev);
  185         else
  186                 err = mlx4_reset_master(dev);
  187         BUG_ON(err != 0);
  188 
  189         dev->persist->state |= MLX4_DEVICE_STATE_INTERNAL_ERROR;
  190         mlx4_err(dev, "device was reset successfully\n");
  191         mutex_unlock(&persist->device_state_mutex);
  192 
  193         /* At that step HW was already reset, now notify clients */
  194         mlx4_dispatch_event(dev, MLX4_DEV_EVENT_CATASTROPHIC_ERROR, 0);
  195         mlx4_cmd_wake_completions(dev);
  196         return;
  197 
  198 out:
  199         mutex_unlock(&persist->device_state_mutex);
  200 }
  201 
  202 static void mlx4_handle_error_state(struct mlx4_dev_persistent *persist)
  203 {
  204         int err = 0;
  205 
  206         mlx4_enter_error_state(persist);
  207         mutex_lock(&persist->interface_state_mutex);
  208         if (persist->interface_state & MLX4_INTERFACE_STATE_UP &&
  209             !(persist->interface_state & MLX4_INTERFACE_STATE_DELETION)) {
  210                 err = mlx4_restart_one(persist->pdev);
  211                 mlx4_info(persist->dev, "mlx4_restart_one was ended, ret=%d\n",
  212                           err);
  213         }
  214         mutex_unlock(&persist->interface_state_mutex);
  215 }
  216 
  217 static void dump_err_buf(struct mlx4_dev *dev)
  218 {
  219         struct mlx4_priv *priv = mlx4_priv(dev);
  220 
  221         int i;
  222 
  223         mlx4_err(dev, "Internal error detected:\n");
  224         for (i = 0; i < priv->fw.catas_size; ++i)
  225                 mlx4_err(dev, "  buf[%02x]: %08x\n",
  226                          i, swab32(readl(priv->catas_err.map + i)));
  227 }
  228 
  229 static void poll_catas(unsigned long dev_ptr)
  230 {
  231         struct mlx4_dev *dev = (struct mlx4_dev *) dev_ptr;
  232         struct mlx4_priv *priv = mlx4_priv(dev);
  233         u32 slave_read;
  234 
  235         if (mlx4_is_slave(dev)) {
  236                 slave_read = swab32(readl(&priv->mfunc.comm->slave_read));
  237                 if (mlx4_comm_internal_err(slave_read)) {
  238                         mlx4_warn(dev, "Internal error detected on the communication channel\n");
  239                         goto internal_err;
  240                 }
  241         } else if (readl(priv->catas_err.map)) {
  242                 dump_err_buf(dev);
  243                 goto internal_err;
  244         }
  245 
  246         if (dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) {
  247                 mlx4_warn(dev, "Internal error mark was detected on device\n");
  248                 goto internal_err;
  249         }
  250 
  251         mod_timer(&priv->catas_err.timer,
  252                   round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL));
  253         return;
  254 
  255 internal_err:
  256         if (mlx4_internal_err_reset)
  257                 queue_work(dev->persist->catas_wq, &dev->persist->catas_work);
  258 }
  259 
  260 static void catas_reset(struct work_struct *work)
  261 {
  262         struct mlx4_dev_persistent *persist =
  263                 container_of(work, struct mlx4_dev_persistent,
  264                              catas_work);
  265 
  266         mlx4_handle_error_state(persist);
  267 }
  268 
  269 void mlx4_start_catas_poll(struct mlx4_dev *dev)
  270 {
  271         struct mlx4_priv *priv = mlx4_priv(dev);
  272         phys_addr_t addr;
  273 
  274         INIT_LIST_HEAD(&priv->catas_err.list);
  275         init_timer(&priv->catas_err.timer);
  276         priv->catas_err.map = NULL;
  277 
  278         if (!mlx4_is_slave(dev)) {
  279                 addr = pci_resource_start(dev->persist->pdev,
  280                                           priv->fw.catas_bar) +
  281                                           priv->fw.catas_offset;
  282 
  283                 priv->catas_err.map = ioremap(addr, priv->fw.catas_size * 4);
  284                 if (!priv->catas_err.map) {
  285                         mlx4_warn(dev, "Failed to map internal error buffer at 0x%llx\n",
  286                                   (unsigned long long)addr);
  287                         return;
  288                 }
  289         }
  290 
  291         priv->catas_err.timer.data     = (unsigned long) dev;
  292         priv->catas_err.timer.function = poll_catas;
  293         priv->catas_err.timer.expires  =
  294                 round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL);
  295         add_timer(&priv->catas_err.timer);
  296 }
  297 
  298 void mlx4_stop_catas_poll(struct mlx4_dev *dev)
  299 {
  300         struct mlx4_priv *priv = mlx4_priv(dev);
  301 
  302         del_timer_sync(&priv->catas_err.timer);
  303 
  304         if (priv->catas_err.map) {
  305                 iounmap(priv->catas_err.map);
  306                 priv->catas_err.map = NULL;
  307         }
  308 
  309         if (dev->persist->interface_state & MLX4_INTERFACE_STATE_DELETION)
  310                 flush_workqueue(dev->persist->catas_wq);
  311 }
  312 
  313 int  mlx4_catas_init(struct mlx4_dev *dev)
  314 {
  315         INIT_WORK(&dev->persist->catas_work, catas_reset);
  316         dev->persist->catas_wq = create_singlethread_workqueue("mlx4_health");
  317         if (!dev->persist->catas_wq)
  318                 return -ENOMEM;
  319 
  320         return 0;
  321 }
  322 
  323 void mlx4_catas_end(struct mlx4_dev *dev)
  324 {
  325         if (dev->persist->catas_wq) {
  326                 destroy_workqueue(dev->persist->catas_wq);
  327                 dev->persist->catas_wq = NULL;
  328         }
  329 }

Cache object: 99412508b6b4df4955aed09f818426a2


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.