FreeBSD/Linux Kernel Cross Reference
sys/dev/xen/blkback/blkback.c


    1 /*-
    2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
    3  *
    4  * Copyright (c) 2009-2012 Spectra Logic Corporation
    5  * All rights reserved.
    6  *
    7  * Redistribution and use in source and binary forms, with or without
    8  * modification, are permitted provided that the following conditions
    9  * are met:
   10  * 1. Redistributions of source code must retain the above copyright
   11  *    notice, this list of conditions, and the following disclaimer,
   12  *    without modification.
   13  * 2. Redistributions in binary form must reproduce at minimum a disclaimer
   14  *    substantially similar to the "NO WARRANTY" disclaimer below
   15  *    ("Disclaimer") and any redistribution must be conditioned upon
   16  *    including a substantially similar Disclaimer requirement for further
   17  *    binary redistribution.
   18  *
   19  * NO WARRANTY
   20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   21  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   22  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
   23  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   24  * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
   28  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
   29  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   30  * POSSIBILITY OF SUCH DAMAGES.
   31  *
   32  * Authors: Justin T. Gibbs     (Spectra Logic Corporation)
   33  *          Ken Merry           (Spectra Logic Corporation)
   34  */
   35 #include <sys/cdefs.h>
   36 __FBSDID("$FreeBSD$");
   37 
   38 /**
   39  * \file blkback.c
   40  *
   41  * \brief Device driver supporting the vending of block storage from
   42  *        a FreeBSD domain to other domains.
   43  */
   44 
   45 #include <sys/param.h>
   46 #include <sys/systm.h>
   47 #include <sys/kernel.h>
   48 #include <sys/malloc.h>
   49 
   50 #include <sys/bio.h>
   51 #include <sys/bus.h>
   52 #include <sys/conf.h>
   53 #include <sys/devicestat.h>
   54 #include <sys/disk.h>
   55 #include <sys/fcntl.h>
   56 #include <sys/filedesc.h>
   57 #include <sys/kdb.h>
   58 #include <sys/module.h>
   59 #include <sys/namei.h>
   60 #include <sys/proc.h>
   61 #include <sys/rman.h>
   62 #include <sys/taskqueue.h>
   63 #include <sys/types.h>
   64 #include <sys/vnode.h>
   65 #include <sys/mount.h>
   66 #include <sys/sysctl.h>
   67 #include <sys/bitstring.h>
   68 #include <sys/sdt.h>
   69 
   70 #include <geom/geom.h>
   71 
   72 #include <machine/_inttypes.h>
   73 
   74 #include <vm/vm.h>
   75 #include <vm/vm_extern.h>
   76 #include <vm/vm_kern.h>
   77 
   78 #include <xen/xen-os.h>
   79 #include <xen/blkif.h>
   80 #include <xen/gnttab.h>
   81 #include <xen/xen_intr.h>
   82 
   83 #include <contrib/xen/event_channel.h>
   84 #include <contrib/xen/grant_table.h>
   85 
   86 #include <xen/xenbus/xenbusvar.h>
   87 
   88 /*--------------------------- Compile-time Tunables --------------------------*/
   89 /**
   90  * The maximum number of shared memory ring pages we will allow in a
   91  * negotiated block-front/back communication channel.  Allow enough
   92  * ring space for all requests to be XBB_MAX_REQUEST_SIZE'd.
   93  */
   94 #define XBB_MAX_RING_PAGES              32
   95 
   96 /**
   97  * The maximum number of outstanding request blocks (request headers plus
   98  * additional segment blocks) we will allow in a negotiated block-front/back
   99  * communication channel.
  100  */
  101 #define XBB_MAX_REQUESTS                                        \
  102         __CONST_RING_SIZE(blkif, PAGE_SIZE * XBB_MAX_RING_PAGES)
  103 
  104 /**
  105  * \brief Define to enable rudimentary request logging to the console.
  106  */
  107 #undef XBB_DEBUG
  108 
  109 /*---------------------------------- Macros ----------------------------------*/
  110 /**
  111  * Custom malloc type for all driver allocations.
  112  */
  113 static MALLOC_DEFINE(M_XENBLOCKBACK, "xbbd", "Xen Block Back Driver Data");
  114 
  115 #ifdef XBB_DEBUG
  116 #define DPRINTF(fmt, args...)                                   \
  117     printf("xbb(%s:%d): " fmt, __FUNCTION__, __LINE__, ##args)
  118 #else
  119 #define DPRINTF(fmt, args...) do {} while(0)
  120 #endif
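
/*
 * Illustrative example (editorial sketch, not part of the driver): with
 * XBB_DEBUG defined, a call such as
 *
 *     DPRINTF("connected, ring_pages=%u\n", xbb->ring_config.ring_pages);
 *
 * expands to a console printf prefixed with the enclosing function name
 * and line number; with XBB_DEBUG undefined it compiles to an empty
 * do {} while(0) statement, so call sites need no #ifdef guards.
 */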
  121 
  122 /**
  123  * The maximum mapped region size per request we will allow in a negotiated
  124  * block-front/back communication channel.
  125  * Use old default of MAXPHYS == 128K.
  126  */
  127 #define XBB_MAX_REQUEST_SIZE                                    \
  128         MIN(128 * 1024, BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE)
  129 
  130 /**
  131  * The maximum number of segments (within a request header and accompanying
  132  * segment blocks) per request we will allow in a negotiated block-front/back
  133  * communication channel.
  134  */
  135 #define XBB_MAX_SEGMENTS_PER_REQUEST                            \
  136         (MIN(UIO_MAXIOV,                                        \
  137              MIN(BLKIF_MAX_SEGMENTS_PER_REQUEST,                \
  138                  (XBB_MAX_REQUEST_SIZE / PAGE_SIZE) + 1)))
  139 
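/*
 * Worked example (editorial, assuming the usual x86 values PAGE_SIZE = 4096,
 * BLKIF_MAX_SEGMENTS_PER_REQUEST = 11, and UIO_MAXIOV = 1024):
 *
 *     XBB_MAX_REQUEST_SIZE         = MIN(131072, 11 * 4096)             = 45056
 *     XBB_MAX_SEGMENTS_PER_REQUEST = MIN(1024, MIN(11, 45056/4096 + 1)) = 11
 *
 * i.e. a single front-end request is bounded by the 11-segment blkif limit
 * rather than by the 128K MAXPHYS-derived cap.
 */
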
  140 /**
  141  * The maximum number of ring pages that we can allow per request list.
  142  * We limit this to the maximum number of segments per request, because
  143  * that is already a reasonable number of segments to aggregate.  This
  144  * number should never be smaller than XBB_MAX_SEGMENTS_PER_REQUEST,
  145  * because that would leave situations where we can't dispatch even one
  146  * large request.
  147  */
  148 #define XBB_MAX_SEGMENTS_PER_REQLIST XBB_MAX_SEGMENTS_PER_REQUEST
  149 
  150 /*--------------------------- Forward Declarations ---------------------------*/
  151 struct xbb_softc;
  152 struct xbb_xen_req;
  153 
  154 static void xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt,
  155                               ...) __attribute__((format(printf, 3, 4)));
  156 static int  xbb_shutdown(struct xbb_softc *xbb);
  157 
  158 /*------------------------------ Data Structures -----------------------------*/
  159 
  160 STAILQ_HEAD(xbb_xen_req_list, xbb_xen_req);
  161 
  162 typedef enum {
  163         XBB_REQLIST_NONE        = 0x00,
  164         XBB_REQLIST_MAPPED      = 0x01
  165 } xbb_reqlist_flags;
  166 
  167 struct xbb_xen_reqlist {
  168         /**
  169          * Back reference to the parent block back instance for this
  170          * request.  Used during bio_done handling.
  171          */
  172         struct xbb_softc        *xbb;
  173 
  174         /**
  175          * BLKIF_OP code for this request.
  176          */
  177         int                      operation;
  178 
  179         /**
  180          * Set to BLKIF_RSP_* to indicate request status.
  181          *
  182          * This field allows an error status to be recorded even if the
  183          * delivery of this status must be deferred.  Deferred reporting
  184          * is necessary, for example, when an error is detected during
  185          * completion processing of one bio when other bios for this
  186          * request are still outstanding.
  187          */
  188         int                      status;
  189 
  190         /**
  191          * Number of 512 byte sectors not transferred.
  192          */
  193         int                      residual_512b_sectors;
  194 
  195         /**
  196          * Starting sector number of the first request in the list.
  197          */
  198         off_t                    starting_sector_number;
  199 
  200         /**
  201          * If we're going to coalesce, the next contiguous sector would be
  202          * this one.
  203          */
  204         off_t                    next_contig_sector;
  205 
  206         /**
  207          * Number of child requests in the list.
  208          */
  209         int                      num_children;
  210 
  211         /**
  212          * Number of I/O requests still pending on the backend.
  213          */
  214         int                      pendcnt;
  215 
  216         /**
  217          * Total number of segments for requests in the list.
  218          */
  219         int                      nr_segments;
  220 
  221         /**
  222          * Flags for this particular request list.
  223          */
  224         xbb_reqlist_flags        flags;
  225 
  226         /**
  227          * Kernel virtual address space reserved for this request
  228          * list structure and used to map the remote domain's pages for
  229          * this I/O into our domain's address space.
  230          */
  231         uint8_t                 *kva;
  232 
  233         /**
  234          * Base pseudo-physical address corresponding to the start
  235          * of this request's kva region.
  236          */
  237         uint64_t                 gnt_base;
  238 
  239         /**
  240          * Array of grant handles (one per page) used to map this request.
  241          */
  242         grant_handle_t          *gnt_handles;
  243 
  244         /**
  245          * Device statistics request ordering type (ordered or simple).
  246          */
  247         devstat_tag_type         ds_tag_type;
  248 
  249         /**
  250          * Device statistics request type (read, write, no_data).
  251          */
  252         devstat_trans_flags      ds_trans_type;
  253 
  254         /**
  255          * The start time for this request.
  256          */
  257         struct bintime           ds_t0;
  258 
  259         /**
  260          * Linked list of contiguous requests with the same operation type.
  261          */
  262         struct xbb_xen_req_list  contig_req_list;
  263 
  264         /**
  265          * Linked list links used to aggregate idle requests in the
  266          * request list free pool (xbb->reqlist_free_stailq) and pending
  267          * requests waiting for execution (xbb->reqlist_pending_stailq).
  268          */
  269         STAILQ_ENTRY(xbb_xen_reqlist) links;
  270 };
  271 
  272 STAILQ_HEAD(xbb_xen_reqlist_list, xbb_xen_reqlist);
  273 
  274 /**
  275  * \brief Object tracking an in-flight I/O from a Xen VBD consumer.
  276  */
  277 struct xbb_xen_req {
  278         /**
  279          * Linked list links used to aggregate requests into a reqlist
  280          * and to store them in the request free pool.
  281          */
  282         STAILQ_ENTRY(xbb_xen_req) links;
  283 
  284         /**
  285          * The remote domain's identifier for this I/O request.
  286          */
  287         uint64_t                  id;
  288 
  289         /**
  290          * The number of pages currently mapped for this request.
  291          */
  292         int                       nr_pages;
  293 
  294         /**
  295          * The number of 512 byte sectors comprising this request.
  296          */
  297         int                       nr_512b_sectors;
  298 
  299         /**
  300          * BLKIF_OP code for this request.
  301          */
  302         int                       operation;
  303 
  304         /**
  305          * Storage used for non-native ring requests.
  306          */
  307         blkif_request_t          ring_req_storage;
  308 
  309         /**
  310          * Pointer to the Xen request in the ring.
  311          */
  312         blkif_request_t         *ring_req;
  313 
  314         /**
  315          * Consumer index for this request.
  316          */
  317         RING_IDX                 req_ring_idx;
  318 
  319         /**
  320          * The start time for this request.
  321          */
  322         struct bintime           ds_t0;
  323 
  324         /**
  325          * Pointer back to our parent request list.
  326          */
  327         struct xbb_xen_reqlist  *reqlist;
  328 };
  329 SLIST_HEAD(xbb_xen_req_slist, xbb_xen_req);
  330 
  331 /**
  332  * \brief Configuration data for the shared memory request ring
  333  *        used to communicate with the front-end client of this
  334  *        driver.
  335  */
  336 struct xbb_ring_config {
  337         /** KVA address where ring memory is mapped. */
  338         vm_offset_t     va;
  339 
  340         /** The pseudo-physical address where ring memory is mapped.*/
  341         uint64_t        gnt_addr;
  342 
  343         /**
  344          * Grant table handles, one per-ring page, returned by the
  345          * hypervisor upon mapping of the ring and required to
  346          * unmap it when a connection is torn down.
  347          */
  348         grant_handle_t  handle[XBB_MAX_RING_PAGES];
  349 
  350         /**
  351          * The device bus address returned by the hypervisor when
  352          * mapping the ring and required to unmap it when a connection
  353          * is torn down.
  354          */
  355         uint64_t        bus_addr[XBB_MAX_RING_PAGES];
  356 
  357         /** The number of ring pages mapped for the current connection. */
  358         u_int           ring_pages;
  359 
  360         /**
  361          * The grant references, one per-ring page, supplied by the
  362          * front-end, allowing us to reference the ring pages in the
  363          * front-end's domain and to map these pages into our own domain.
  364          */
  365         grant_ref_t     ring_ref[XBB_MAX_RING_PAGES];
  366 
  367         /** The interrupt driven event channel used to signal ring events. */
  368         evtchn_port_t   evtchn;
  369 };
  370 
  371 /**
  372  * Per-instance connection state flags.
  373  */
  374 typedef enum
  375 {
  376         /**
  377          * The front-end requested a read-only mount of the
  378          * back-end device/file.
  379          */
  380         XBBF_READ_ONLY         = 0x01,
  381 
  382         /** Communication with the front-end has been established. */
  383         XBBF_RING_CONNECTED    = 0x02,
  384 
  385         /**
  386          * Front-end requests exist in the ring and are waiting for
  387          * xbb_xen_req objects to free up.
  388          */
  389         XBBF_RESOURCE_SHORTAGE = 0x04,
  390 
  391         /** Connection teardown in progress. */
  392         XBBF_SHUTDOWN          = 0x08,
  393 
  394         /** A thread is already performing shutdown processing. */
  395         XBBF_IN_SHUTDOWN       = 0x10
  396 } xbb_flag_t;
  397 
  398 /** Backend device type.  */
  399 typedef enum {
  400         /** Backend type unknown. */
  401         XBB_TYPE_NONE           = 0x00,
  402 
  403         /**
  404          * Backend type disk (access via cdev switch
  405          * strategy routine).
  406          */
  407         XBB_TYPE_DISK           = 0x01,
  408 
  409         /** Backend type file (access via vnode operations). */
  410         XBB_TYPE_FILE           = 0x02
  411 } xbb_type;
  412 
  413 /**
  414  * \brief Structure used to memoize information about a per-request
  415  *        scatter-gather list.
  416  *
  417  * The chief benefit of using this data structure is it avoids having
  418  * to reparse the possibly discontiguous S/G list in the original
  419  * request.  Due to the way that the mapping of the memory backing an
  420  * I/O transaction is handled by Xen, a second pass is unavoidable.
  421  * At least this way the second walk is a simple array traversal.
  422  *
  423  * \note A single Scatter/Gather element in the block interface covers
  424  *       at most 1 machine page.  In this context a sector (blkif
  425  *       nomenclature, not what I'd choose) is a 512b aligned unit
  426  *       of mapping within the machine page referenced by an S/G
  427  *       element.
  428  */
  429 struct xbb_sg {
  430         /** The number of 512b data chunks mapped in this S/G element. */
  431         int16_t nsect;
  432 
  433         /**
  434          * The index (0 based) of the first 512b data chunk mapped
  435          * in this S/G element.
  436          */
  437         uint8_t first_sect;
  438 
  439         /**
  440          * The index (0 based) of the last 512b data chunk mapped
  441          * in this S/G element.
  442          */
  443         uint8_t last_sect;
  444 };
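
/*
 * Illustrative sketch (editorial, not part of the driver): a blkif S/G
 * element with first_sect = 2 and last_sect = 5 maps four 512b chunks of
 * its page, so the memoized entry would be filled in roughly as
 *
 *     xbb_sg->first_sect = sg->first_sect;                       // 2
 *     xbb_sg->last_sect  = sg->last_sect;                        // 5
 *     xbb_sg->nsect      = sg->last_sect - sg->first_sect + 1;   // 4
 *
 * where sg is the blkif_request_segment being parsed.
 */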
  445 
  446 /**
  447  * Character device backend specific configuration data.
  448  */
  449 struct xbb_dev_data {
  450         /** Cdev used for device backend access.  */
  451         struct cdev   *cdev;
  452 
  453         /** Cdev switch used for device backend access.  */
  454         struct cdevsw *csw;
  455 
  456         /** Used to hold a reference on opened cdev backend devices. */
  457         int            dev_ref;
  458 };
  459 
  460 /**
  461  * File backend specific configuration data.
  462  */
  463 struct xbb_file_data {
  464         /** Credentials to use for vnode backed (file based) I/O. */
  465         struct ucred   *cred;
  466 
  467         /**
  468          * \brief Array of io vectors used to process file based I/O.
  469          *
  470          * Only a single file based request is outstanding per-xbb instance,
  471          * so we only need one of these.
  472          */
  473         struct iovec    xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST];
  474 };
  475 
  476 /**
  477  * Collection of backend type specific data.
  478  */
  479 union xbb_backend_data {
  480         struct xbb_dev_data  dev;
  481         struct xbb_file_data file;
  482 };
  483 
  484 /**
  485  * Function signature of backend specific I/O handlers.
  486  */
  487 typedef int (*xbb_dispatch_t)(struct xbb_softc *xbb,
  488                               struct xbb_xen_reqlist *reqlist, int operation,
  489                               int flags);
  490 
  491 /**
  492  * Per-instance configuration data.
  493  */
  494 struct xbb_softc {
  495         /**
  496          * Task-queue used to process I/O requests.
  497          */
  498         struct taskqueue         *io_taskqueue;
  499 
  500         /**
  501          * Single "run the request queue" task enqueued
  502          * on io_taskqueue.
  503          */
  504         struct task               io_task;
  505 
  506         /** Device type for this instance. */
  507         xbb_type                  device_type;
  508 
  509         /** NewBus device corresponding to this instance. */
  510         device_t                  dev;
  511 
  512         /** Backend specific dispatch routine for this instance. */
  513         xbb_dispatch_t            dispatch_io;
  514 
  515         /** The number of requests outstanding on the backend device/file. */
  516         int                       active_request_count;
  517 
  518         /** Free pool of request tracking structures. */
  519         struct xbb_xen_req_list   request_free_stailq;
  520 
  521         /** Array, sized at connection time, of request tracking structures. */
  522         struct xbb_xen_req       *requests;
  523 
  524         /** Free pool of request list structures. */
  525         struct xbb_xen_reqlist_list reqlist_free_stailq;
  526 
  527         /** List of pending request lists awaiting execution. */
  528         struct xbb_xen_reqlist_list reqlist_pending_stailq;
  529 
  530         /** Array, sized at connection time, of request list structures. */
  531         struct xbb_xen_reqlist   *request_lists;
  532 
  533         /**
  534          * Global pool of kva used for mapping remote domain ring
  535          * and I/O transaction data.
  536          */
  537         vm_offset_t               kva;
  538 
  539         /** Pseudo-physical address corresponding to kva. */
  540         uint64_t                  gnt_base_addr;
  541 
  542         /** The size of the global kva pool. */
  543         int                       kva_size;
  544 
  545         /** The size of the KVA area used for request lists. */
  546         int                       reqlist_kva_size;
  547 
  548         /** The number of pages of KVA used for request lists */
  549         int                       reqlist_kva_pages;
  550 
  551         /** Bitmap of free KVA pages */
  552         bitstr_t                 *kva_free;
  553 
  554         /**
  555          * \brief Cached value of the front-end's domain id.
  556          * 
  557          * This value is used once for each mapped page in
  558          * a transaction.  We cache it to avoid incurring the
  559          * cost of an ivar access every time this is needed.
  560          */
  561         domid_t                   otherend_id;
  562 
  563         /**
  564          * \brief The blkif protocol abi in effect.
  565          *
  566          * There are situations where the back and front ends can
  567          * have a different, native abi (e.g. intel x86_64 and
  568          * 32bit x86 domains on the same machine).  The back-end
  569          * always accommodates the front-end's native abi.  That
  570          * value is pulled from the XenStore and recorded here.
  571          */
  572         int                       abi;
  573 
  574         /**
  575          * \brief The maximum number of requests and request lists allowed
  576          *        to be in flight at a time.
  577          *
  578          * This value is negotiated via the XenStore.
  579          */
  580         u_int                     max_requests;
  581 
  582         /**
  583          * \brief The maximum number of segments (1 page per segment)
  584          *        that can be mapped by a request.
  585          *
  586          * This value is negotiated via the XenStore.
  587          */
  588         u_int                     max_request_segments;
  589 
  590         /**
  591          * \brief Maximum number of segments per request list.
  592          *
  593          * This value is derived from and will generally be larger than
  594          * max_request_segments.
  595          */
  596         u_int                     max_reqlist_segments;
  597 
  598         /**
  599          * The maximum size of any request to this back-end
  600          * device.
  601          *
  602          * This value is negotiated via the XenStore.
  603          */
  604         u_int                     max_request_size;
  605 
  606         /**
  607          * The maximum size of any request list.  This is derived directly
  608          * from max_reqlist_segments.
  609          */
  610         u_int                     max_reqlist_size;
  611 
  612         /** Various configuration and state bit flags. */
  613         xbb_flag_t                flags;
  614 
  615         /** Ring mapping and interrupt configuration data. */
  616         struct xbb_ring_config    ring_config;
  617 
  618         /** Runtime, cross-abi safe, structures for ring access. */
  619         blkif_back_rings_t        rings;
  620 
  621         /** IRQ mapping for the communication ring event channel. */
  622         xen_intr_handle_t         xen_intr_handle;
  623 
  624         /**
  625          * \brief Backend access mode flags (e.g. write, or read-only).
  626          *
  627          * This value is passed to us by the front-end via the XenStore.
  628          */
  629         char                     *dev_mode;
  630 
  631         /**
  632          * \brief Backend device type (e.g. "disk", "cdrom", "floppy").
  633          *
  634          * This value is passed to us by the front-end via the XenStore.
  635          * Currently unused.
  636          */
  637         char                     *dev_type;
  638 
  639         /**
  640          * \brief Backend device/file identifier.
  641          *
  642          * This value is passed to us by the front-end via the XenStore.
  643          * We expect this to be a POSIX path indicating the file or
  644          * device to open.
  645          */
  646         char                     *dev_name;
  647 
  648         /**
  649          * Vnode corresponding to the backend device node or file
  650          * we are accessing.
  651          */
  652         struct vnode             *vn;
  653 
  654         union xbb_backend_data    backend;
  655 
  656         /** The native sector size of the backend. */
  657         u_int                     sector_size;
  658 
  659         /** log2 of sector_size.  */
  660         u_int                     sector_size_shift;
  661 
  662         /** Size in bytes of the backend device or file.  */
  663         off_t                     media_size;
  664 
  665         /**
  666          * \brief media_size expressed in terms of the backend native
  667          *        sector size.
  668          *
  669          * (e.g. xbb->media_size >> xbb->sector_size_shift).
  670          */
  671         uint64_t                  media_num_sectors;
  672 
  673         /**
  674          * \brief Array of memoized scatter gather data computed during the
  675          *        conversion of blkif ring requests to internal xbb_xen_req
  676          *        structures.
  677          *
  678          * Ring processing is serialized so we only need one of these.
  679          */
  680         struct xbb_sg             xbb_sgs[XBB_MAX_SEGMENTS_PER_REQLIST];
  681 
  682         /**
  683          * Temporary grant table map used in xbb_dispatch_io().  When
  684          * XBB_MAX_SEGMENTS_PER_REQLIST gets large, keeping this on the
  685          * stack could cause a stack overflow.
  686          */
  687         struct gnttab_map_grant_ref   maps[XBB_MAX_SEGMENTS_PER_REQLIST];
  688 
  689         /** Mutex protecting per-instance data. */
  690         struct mtx                lock;
  691 
  692         /**
  693          * Resource representing allocated physical address space
  694          * associated with our per-instance kva region.
  695          */
  696         struct resource          *pseudo_phys_res;
  697 
  698         /** Resource id for allocated physical address space. */
  699         int                       pseudo_phys_res_id;
  700 
  701         /**
  702          * I/O statistics from BlockBack dispatch down.  These are
  703          * coalesced requests, and we start them right before execution.
  704          */
  705         struct devstat           *xbb_stats;
  706 
  707         /**
  708          * I/O statistics coming into BlockBack.  These are the requests as
  709          * we get them from BlockFront.  They are started as soon as we
  710          * receive a request, and completed when the I/O is complete.
  711          */
  712         struct devstat           *xbb_stats_in;
  713 
  714         /** Disable sending flush to the backend */
  715         int                       disable_flush;
  716 
  717         /** Send a real flush for every N flush requests */
  718         int                       flush_interval;
  719 
  720         /** Count of flush requests in the interval */
  721         int                       flush_count;
  722 
  723         /** Don't coalesce requests if this is set */
  724         int                       no_coalesce_reqs;
  725 
  726         /** Number of requests we have received */
  727         uint64_t                  reqs_received;
  728 
  729         /** Number of requests we have completed */
  730         uint64_t                  reqs_completed;
  731 
  732         /** Number of requests we queued but not pushed */
  733         uint64_t                  reqs_queued_for_completion;
  734 
  735         /** Number of requests we completed with an error status */
  736         uint64_t                  reqs_completed_with_error;
  737 
  738         /** How many forced dispatches (i.e. without coalescing) have happened */
  739         uint64_t                  forced_dispatch;
  740 
  741         /** How many normal dispatches have happened */
  742         uint64_t                  normal_dispatch;
  743 
  744         /** How many total dispatches have happened */
  745         uint64_t                  total_dispatch;
  746 
  747         /** How many times we have run out of KVA */
  748         uint64_t                  kva_shortages;
  749 
  750         /** How many times we have run out of request structures */
  751         uint64_t                  request_shortages;
  752 
  753         /** Watch to wait for hotplug script execution */
  754         struct xs_watch           hotplug_watch;
  755 
  756         /** Got the needed data from hotplug scripts? */
  757         bool                      hotplug_done;
  758 };
  759 
  760 /*---------------------------- Request Processing ----------------------------*/
  761 /**
  762  * Allocate an internal transaction tracking structure from the free pool.
  763  *
  764  * \param xbb  Per-instance xbb configuration structure.
  765  *
  766  * \return  On success, a pointer to the allocated xbb_xen_req structure.
  767  *          Otherwise NULL.
  768  */
  769 static inline struct xbb_xen_req *
  770 xbb_get_req(struct xbb_softc *xbb)
  771 {
  772         struct xbb_xen_req *req;
  773 
  774         req = NULL;
  775 
  776         mtx_assert(&xbb->lock, MA_OWNED);
  777 
  778         if ((req = STAILQ_FIRST(&xbb->request_free_stailq)) != NULL) {
  779                 STAILQ_REMOVE_HEAD(&xbb->request_free_stailq, links);
  780                 xbb->active_request_count++;
  781         }
  782 
  783         return (req);
  784 }
  785 
  786 /**
  787  * Return an allocated transaction tracking structure to the free pool.
  788  *
  789  * \param xbb  Per-instance xbb configuration structure.
  790  * \param req  The request structure to free.
  791  */
  792 static inline void
  793 xbb_release_req(struct xbb_softc *xbb, struct xbb_xen_req *req)
  794 {
  795         mtx_assert(&xbb->lock, MA_OWNED);
  796 
  797         STAILQ_INSERT_HEAD(&xbb->request_free_stailq, req, links);
  798         xbb->active_request_count--;
  799 
  800         KASSERT(xbb->active_request_count >= 0,
  801                 ("xbb_release_req: negative active count"));
  802 }
  803 
  804 /**
  805  * Return an xbb_xen_req_list of allocated xbb_xen_reqs to the free pool.
  806  *
  807  * \param xbb       Per-instance xbb configuration structure.
  808  * \param req_list  The list of requests to free.
  809  * \param nreqs     The number of items in the list.
  810  */
  811 static inline void
  812 xbb_release_reqs(struct xbb_softc *xbb, struct xbb_xen_req_list *req_list,
  813                  int nreqs)
  814 {
  815         mtx_assert(&xbb->lock, MA_OWNED);
  816 
  817         STAILQ_CONCAT(&xbb->request_free_stailq, req_list);
  818         xbb->active_request_count -= nreqs;
  819 
  820         KASSERT(xbb->active_request_count >= 0,
  821                 ("xbb_release_reqs: negative active count"));
  822 }
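
/*
 * Usage sketch (editorial): all three free-pool helpers above assume the
 * caller holds the per-instance lock, e.g.
 *
 *     mtx_lock(&xbb->lock);
 *     req = xbb_get_req(xbb);
 *     if (req == NULL) {
 *             // Note the shortage and retry after a wakeup.
 *     }
 *     mtx_unlock(&xbb->lock);
 */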
  823 
  824 /**
  825  * Given a page index and 512b sector offset within that page,
  826  * calculate an offset into a request's kva region.
  827  *
  828  * \param reqlist The request structure whose kva region will be accessed.
  829  * \param pagenr  The page index used to compute the kva offset.
  830  * \param sector  The 512b sector index used to compute the page relative
  831  *                kva offset.
  832  *
  833  * \return  The computed global KVA offset.
  834  */
  835 static inline uint8_t *
  836 xbb_reqlist_vaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
  837 {
  838         return (reqlist->kva + (PAGE_SIZE * pagenr) + (sector << 9));
  839 }
  840 
  841 /**
  842  * Given a page number and 512b sector offset within that page,
  843  * calculate an offset into the request's memory region that the
  844  * underlying backend device/file should use for I/O.
  845  *
  846  * \param reqlist The request structure whose I/O region will be accessed.
  847  * \param pagenr  The page index used to compute the I/O offset.
  848  * \param sector  The 512b sector index used to compute the page relative
  849  *                I/O offset.
  850  *
  851  * \return  The computed global I/O address.
  852  *
  853  * Depending on configuration, this will either be a local bounce buffer
  854  * or a pointer to the memory mapped in from the front-end domain for
  855  * this request.
  856  */
  857 static inline uint8_t *
  858 xbb_reqlist_ioaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
  859 {
  860         return (xbb_reqlist_vaddr(reqlist, pagenr, sector));
  861 }
  862 
  863 /**
  864  * Given a page index and 512b sector offset within that page, calculate
  865  * an offset into the local pseudo-physical address space used to map a
  866  * front-end's request data into a request.
  867  *
  868  * \param reqlist The request list structure whose pseudo-physical region
  869  *                will be accessed.
  870  * \param pagenr  The page index used to compute the pseudo-physical offset.
  871  * \param sector  The 512b sector index used to compute the page relative
  872  *                pseudo-physical offset.
  873  *
  874  * \return  The computed global pseudo-physical address.
  875  *
  876  * Depending on configuration, this will either be a local bounce buffer
  877  * or a pointer to the memory mapped in from the front-end domain for
  878  * this request.
  879  */
  880 static inline uintptr_t
  881 xbb_get_gntaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
  882 {
  883         struct xbb_softc *xbb;
  884 
  885         xbb = reqlist->xbb;
  886 
  887         return ((uintptr_t)(xbb->gnt_base_addr +
  888                 (uintptr_t)(reqlist->kva - xbb->kva) +
  889                 (PAGE_SIZE * pagenr) + (sector << 9)));
  890 }
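
/*
 * Worked example (editorial, assuming a 4 KiB PAGE_SIZE): for pagenr = 2
 * and sector = 3,
 *
 *     xbb_reqlist_vaddr(reqlist, 2, 3) == reqlist->kva + 2*4096 + 3*512
 *
 * and xbb_get_gntaddr() returns the same offset rebased onto the
 * instance's pseudo-physical region:
 *
 *     xbb->gnt_base_addr + (reqlist->kva - xbb->kva) + 2*4096 + (3 << 9)
 */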
  891 
  892 /**
  893  * Get Kernel Virtual Address space for mapping requests.
  894  *
  895  * \param xbb         Per-instance xbb configuration structure.
  896  * \param nr_pages    Number of pages needed.
  897  *
  898  * The per-instance lock is acquired and released internally.
  899  *
  900  * \return  On success, a pointer to the allocated KVA region.  Otherwise NULL.
  901  *
  902  * Note:  This should be unnecessary once we have either chaining or
  903  * scatter/gather support for struct bio.  At that point we'll be able to
  904  * put multiple addresses and lengths in one bio/bio chain and won't need
  905  * to map everything into one virtual segment.
  906  */
  907 static uint8_t *
  908 xbb_get_kva(struct xbb_softc *xbb, int nr_pages)
  909 {
  910         int first_clear;
  911         int num_clear;
  912         uint8_t *free_kva;
  913         int      i;
  914 
  915         KASSERT(nr_pages != 0, ("xbb_get_kva of zero length"));
  916 
  917         first_clear = 0;
  918         free_kva = NULL;
  919 
  920         mtx_lock(&xbb->lock);
  921 
  922         /*
  923          * Look for the first available page.  If there are none, we're done.
  924          */
  925         bit_ffc(xbb->kva_free, xbb->reqlist_kva_pages, &first_clear);
  926 
  927         if (first_clear == -1)
  928                 goto bailout;
  929 
  930         /*
  931          * Starting at the first available page, look for consecutive free
  932          * pages that will satisfy the user's request.
  933          */
  934         for (i = first_clear, num_clear = 0; i < xbb->reqlist_kva_pages; i++) {
  935                 /*
  936                  * If this is true, the page is used, so we have to reset
  937                  * the number of clear pages and the first clear page
  938                  * (since it pointed to a region with an insufficient number
  939                  * of clear pages).
  940                  */
  941                 if (bit_test(xbb->kva_free, i)) {
  942                         num_clear = 0;
  943                         first_clear = -1;
  944                         continue;
  945                 }
  946 
  947                 if (first_clear == -1)
  948                         first_clear = i;
  949 
  950                 /*
  951                  * If this is true, we've found a large enough free region
  952                  * to satisfy the request.
  953                  */
  954                 if (++num_clear == nr_pages) {
  955                         bit_nset(xbb->kva_free, first_clear,
  956                                  first_clear + nr_pages - 1);
  957 
  958                         free_kva = xbb->kva +
  959                                 (uint8_t *)((intptr_t)first_clear * PAGE_SIZE);
  960 
  961                         KASSERT(free_kva >= (uint8_t *)xbb->kva &&
  962                                 free_kva + (nr_pages * PAGE_SIZE) <=
  963                                 (uint8_t *)xbb->ring_config.va,
  964                                 ("Free KVA %p len %d out of range, "
  965                                  "kva = %#jx, ring VA = %#jx\n", free_kva,
  966                                  nr_pages * PAGE_SIZE, (uintmax_t)xbb->kva,
  967                                  (uintmax_t)xbb->ring_config.va));
  968                         break;
  969                 }
  970         }
  971 
  972 bailout:
  973 
  974         if (free_kva == NULL) {
  975                 xbb->flags |= XBBF_RESOURCE_SHORTAGE;
  976                 xbb->kva_shortages++;
  977         }
  978 
  979         mtx_unlock(&xbb->lock);
  980 
  981         return (free_kva);
  982 }
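
/*
 * Illustrative example (editorial): the allocator above is a first-fit scan
 * of the kva_free bitstring.  With, say, reqlist_kva_pages = 8, nr_pages = 3
 * and a bitmap of 1 1 0 1 0 0 0 1 (1 = page in use), the scan rejects the
 * lone free page at index 2, finds the run starting at index 4, marks bits
 * 4-6 with bit_nset(), and returns xbb->kva + 4 * PAGE_SIZE.
 */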
  983 
  984 /**
  985  * Free allocated KVA.
  986  *
  987  * \param xbb       Per-instance xbb configuration structure.
  988  * \param kva_ptr   Pointer to allocated KVA region.  
  989  * \param nr_pages  Number of pages in the KVA region.
  990  */
  991 static void
  992 xbb_free_kva(struct xbb_softc *xbb, uint8_t *kva_ptr, int nr_pages)
  993 {
  994         intptr_t start_page;
  995 
  996         mtx_assert(&xbb->lock, MA_OWNED);
  997 
  998         start_page = (intptr_t)(kva_ptr - xbb->kva) >> PAGE_SHIFT;
  999         bit_nclear(xbb->kva_free, start_page, start_page + nr_pages - 1);
 1000 
 1001 }
 1002 
 1003 /**
 1004  * Unmap the front-end pages associated with this I/O request.
 1005  *
 1006  * \param reqlist  The request list structure to unmap.
 1007  */
 1008 static void
 1009 xbb_unmap_reqlist(struct xbb_xen_reqlist *reqlist)
 1010 {
 1011         struct gnttab_unmap_grant_ref unmap[XBB_MAX_SEGMENTS_PER_REQLIST];
 1012         u_int                         i;
 1013         u_int                         invcount;
 1014         int                           error __diagused;
 1015 
 1016         invcount = 0;
 1017         for (i = 0; i < reqlist->nr_segments; i++) {
 1018                 if (reqlist->gnt_handles[i] == GRANT_REF_INVALID)
 1019                         continue;
 1020 
 1021                 unmap[invcount].host_addr    = xbb_get_gntaddr(reqlist, i, 0);
 1022                 unmap[invcount].dev_bus_addr = 0;
 1023                 unmap[invcount].handle       = reqlist->gnt_handles[i];
 1024                 reqlist->gnt_handles[i]      = GRANT_REF_INVALID;
 1025                 invcount++;
 1026         }
 1027 
 1028         error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
 1029                                           unmap, invcount);
 1030         KASSERT(error == 0, ("Grant table operation failed"));
 1031 }
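
/*
 * Editorial sketch (assumptions, simplified): the mapping side batches
 * struct gnttab_map_grant_ref entries in the same way before issuing a
 * single GNTTABOP_map_grant_ref hypercall; one entry is filled in roughly
 * as
 *
 *     map->host_addr = xbb_get_gntaddr(reqlist, seg_idx, 0);
 *     map->flags     = GNTMAP_host_map;
 *     map->ref       = sg->gref;
 *     map->dom       = xbb->otherend_id;
 *
 * with GNTMAP_readonly added when the backend only needs to read the
 * guest's pages (i.e. a write to the backing store).
 */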
 1032 
 1033 /**
 1034  * Allocate an internal transaction tracking structure from the free pool.
 1035  *
 1036  * \param xbb  Per-instance xbb configuration structure.
 1037  *
 1038  * \return  On success, a pointer to the allocated xbb_xen_reqlist structure.
 1039  *          Otherwise NULL.
 1040  */
 1041 static inline struct xbb_xen_reqlist *
 1042 xbb_get_reqlist(struct xbb_softc *xbb)
 1043 {
 1044         struct xbb_xen_reqlist *reqlist;
 1045 
 1046         reqlist = NULL;
 1047 
 1048         mtx_assert(&xbb->lock, MA_OWNED);
 1049 
 1050         if ((reqlist = STAILQ_FIRST(&xbb->reqlist_free_stailq)) != NULL) {
 1051                 STAILQ_REMOVE_HEAD(&xbb->reqlist_free_stailq, links);
 1052                 reqlist->flags = XBB_REQLIST_NONE;
 1053                 reqlist->kva = NULL;
 1054                 reqlist->status = BLKIF_RSP_OKAY;
 1055                 reqlist->residual_512b_sectors = 0;
 1056                 reqlist->num_children = 0;
 1057                 reqlist->nr_segments = 0;
 1058                 STAILQ_INIT(&reqlist->contig_req_list);
 1059         }
 1060 
 1061         return (reqlist);
 1062 }
 1063 
 1064 /**
 1065  * Return an allocated transaction tracking structure to the free pool.
 1066  *
 1067  * \param xbb        Per-instance xbb configuration structure.
 1068  * \param reqlist    The request list structure to free.
 1069  * \param wakeup     If set, wakeup the work thread if freeing this reqlist
 1070  *                   during a resource shortage condition.
 1071  */
 1072 static inline void
 1073 xbb_release_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist,
 1074                     int wakeup)
 1075 {
 1076 
 1077         mtx_assert(&xbb->lock, MA_OWNED);
 1078 
 1079         if (wakeup) {
 1080                 wakeup = xbb->flags & XBBF_RESOURCE_SHORTAGE;
 1081                 xbb->flags &= ~XBBF_RESOURCE_SHORTAGE;
 1082         }
 1083 
 1084         if (reqlist->kva != NULL)
 1085                 xbb_free_kva(xbb, reqlist->kva, reqlist->nr_segments);
 1086 
 1087         xbb_release_reqs(xbb, &reqlist->contig_req_list, reqlist->num_children);
 1088 
 1089         STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links);
 1090 
 1091         if ((xbb->flags & XBBF_SHUTDOWN) != 0) {
 1092                 /*
 1093                  * Shutdown is in progress.  See if we can
 1094                  * progress further now that one more request
 1095                  * has completed and been returned to the
 1096                  * free pool.
 1097                  */
 1098                 xbb_shutdown(xbb);
 1099         }
 1100 
 1101         if (wakeup != 0)
 1102                 taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 
 1103 }
 1104 
 1105 /**
 1106  * Request resources and do basic request setup.
 1107  *
 1108  * \param xbb          Per-instance xbb configuration structure.
 1109  * \param reqlist      Pointer to reqlist pointer.
 1110  * \param ring_req     Pointer to a block ring request.
 1111  * \param ring_idx     The ring index of this request.
 1112  *
 1113  * \return  0 for success, non-zero for failure.
 1114  */
 1115 static int
 1116 xbb_get_resources(struct xbb_softc *xbb, struct xbb_xen_reqlist **reqlist,
 1117                   blkif_request_t *ring_req, RING_IDX ring_idx)
 1118 {
 1119         struct xbb_xen_reqlist *nreqlist;
 1120         struct xbb_xen_req     *nreq;
 1121 
 1122         nreqlist = NULL;
 1123         nreq     = NULL;
 1124 
 1125         mtx_lock(&xbb->lock);
 1126 
 1127         /*
 1128          * We don't allow new resources to be allocated if we're in the
 1129          * process of shutting down.
 1130          */
 1131         if ((xbb->flags & XBBF_SHUTDOWN) != 0) {
 1132                 mtx_unlock(&xbb->lock);
 1133                 return (1);
 1134         }
 1135 
 1136         /*
 1137          * Allocate a reqlist if the caller doesn't have one already.
 1138          */
 1139         if (*reqlist == NULL) {
 1140                 nreqlist = xbb_get_reqlist(xbb);
 1141                 if (nreqlist == NULL)
 1142                         goto bailout_error;
 1143         }
 1144 
 1145         /* We always allocate a request. */
 1146         nreq = xbb_get_req(xbb);
 1147         if (nreq == NULL)
 1148                 goto bailout_error;
 1149 
 1150         mtx_unlock(&xbb->lock);
 1151 
 1152         if (*reqlist == NULL) {
 1153                 *reqlist = nreqlist;
 1154                 nreqlist->operation = ring_req->operation;
 1155                 nreqlist->starting_sector_number = ring_req->sector_number;
 1156                 STAILQ_INSERT_TAIL(&xbb->reqlist_pending_stailq, nreqlist,
 1157                                    links);
 1158         }
 1159 
 1160         nreq->reqlist = *reqlist;
 1161         nreq->req_ring_idx = ring_idx;
 1162         nreq->id = ring_req->id;
 1163         nreq->operation = ring_req->operation;
 1164 
 1165         if (xbb->abi != BLKIF_PROTOCOL_NATIVE) {
 1166                 bcopy(ring_req, &nreq->ring_req_storage, sizeof(*ring_req));
 1167                 nreq->ring_req = &nreq->ring_req_storage;
 1168         } else {
 1169                 nreq->ring_req = ring_req;
 1170         }
 1171 
 1172         binuptime(&nreq->ds_t0);
 1173         devstat_start_transaction(xbb->xbb_stats_in, &nreq->ds_t0);
 1174         STAILQ_INSERT_TAIL(&(*reqlist)->contig_req_list, nreq, links);
 1175         (*reqlist)->num_children++;
 1176         (*reqlist)->nr_segments += ring_req->nr_segments;
 1177 
 1178         return (0);
 1179 
 1180 bailout_error:
 1181 
 1182         /*
 1183          * We're out of resources, so set the shortage flag.  The next time
 1184          * a request is released, we'll try waking up the work thread to
 1185          * see if we can allocate more resources.
 1186          */
 1187         xbb->flags |= XBBF_RESOURCE_SHORTAGE;
 1188         xbb->request_shortages++;
 1189 
 1190         if (nreq != NULL)
 1191                 xbb_release_req(xbb, nreq);
 1192 
 1193         if (nreqlist != NULL)
 1194                 xbb_release_reqlist(xbb, nreqlist, /*wakeup*/ 0);
 1195 
 1196         mtx_unlock(&xbb->lock);
 1197 
 1198         return (1);
 1199 }
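
/*
 * Caller sketch (editorial, simplified to the native ABI): the ring scan in
 * the work-queue handler hands each shared-ring slot to xbb_get_resources(),
 * growing the current reqlist while requests remain contiguous and
 * dispatching it when they do not:
 *
 *     ring_req = RING_GET_REQUEST(&xbb->rings.native, cons);
 *     if (xbb_get_resources(xbb, &reqlist, ring_req, cons) != 0)
 *             break;      // resource shortage; retried after a wakeup
 */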
 1200 
 1201 /**
 1202  * Create and queue a response to a blkif request.
 1203  * 
 1204  * \param xbb     Per-instance xbb configuration structure.
 1205  * \param req     The request structure to which to respond.
 1206  * \param status  The status code to report.  See BLKIF_RSP_*
 1207  *                in sys/contrib/xen/io/blkif.h.
 1208  */
 1209 static void
 1210 xbb_queue_response(struct xbb_softc *xbb, struct xbb_xen_req *req, int status)
 1211 {
 1212         blkif_response_t *resp;
 1213 
 1214         /*
 1215          * The mutex is required here, and should be held across this call
 1216          * until after the subsequent call to xbb_push_responses().  This
 1217          * is to guarantee that another context won't queue responses and
 1218          * push them while we're active.
 1219          *
 1220          * That could lead to the other end being notified of responses
 1221          * before the resources have been freed on this end.  The other end
 1222          * would then be able to queue additional I/O, and we may run out
 1223          * of resources because we haven't freed them all yet.
 1224          */
 1225         mtx_assert(&xbb->lock, MA_OWNED);
 1226 
 1227         /*
 1228          * Place on the response ring for the relevant domain.
 1229          * For now, only the spacing between entries is different
 1230          * in the different ABIs, not the response entry layout.
 1231          */
 1232         switch (xbb->abi) {
 1233         case BLKIF_PROTOCOL_NATIVE:
 1234                 resp = RING_GET_RESPONSE(&xbb->rings.native,
 1235                                          xbb->rings.native.rsp_prod_pvt);
 1236                 break;
 1237         case BLKIF_PROTOCOL_X86_32:
 1238                 resp = (blkif_response_t *)
 1239                     RING_GET_RESPONSE(&xbb->rings.x86_32,
 1240                                       xbb->rings.x86_32.rsp_prod_pvt);
 1241                 break;
 1242         case BLKIF_PROTOCOL_X86_64:
 1243                 resp = (blkif_response_t *)
 1244                     RING_GET_RESPONSE(&xbb->rings.x86_64,
 1245                                       xbb->rings.x86_64.rsp_prod_pvt);
 1246                 break;
 1247         default:
 1248                 panic("Unexpected blkif protocol ABI.");
 1249         }
 1250 
 1251         resp->id        = req->id;
 1252         resp->operation = req->operation;
 1253         resp->status    = status;
 1254 
 1255         if (status != BLKIF_RSP_OKAY)
 1256                 xbb->reqs_completed_with_error++;
 1257 
 1258         xbb->rings.common.rsp_prod_pvt++;
 1259 
 1260         xbb->reqs_queued_for_completion++;
 1261 
 1262 }
 1263 
 1264 /**
 1265  * Send queued responses to blkif requests.
 1266  * 
 1267  * \param xbb            Per-instance xbb configuration structure.
 1268  * \param run_taskqueue  Flag that is set to 1 if the taskqueue
 1269  *                       should be run, 0 if it does not need to be run.
 1270  * \param notify         Flag that is set to 1 if the other end should be
 1271  *                       notified via irq, 0 if the other end should not be
 1272  *                       notified.
 1273  */
 1274 static void
 1275 xbb_push_responses(struct xbb_softc *xbb, int *run_taskqueue, int *notify)
 1276 {
 1277         int more_to_do;
 1278 
 1279         /*
 1280          * The mutex is required here.
 1281          */
 1282         mtx_assert(&xbb->lock, MA_OWNED);
 1283 
 1284         more_to_do = 0;
 1285 
 1286         RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xbb->rings.common, *notify);
 1287 
 1288         if (xbb->rings.common.rsp_prod_pvt == xbb->rings.common.req_cons) {
 1289                 /*
 1290                  * Tail check for pending requests. Allows frontend to avoid
 1291                  * notifications if requests are already in flight (lower
 1292                  * overheads and promotes batching).
 1293                  */
 1294                 RING_FINAL_CHECK_FOR_REQUESTS(&xbb->rings.common, more_to_do);
 1295         } else if (RING_HAS_UNCONSUMED_REQUESTS(&xbb->rings.common)) {
 1296                 more_to_do = 1;
 1297         }
 1298 
 1299         xbb->reqs_completed += xbb->reqs_queued_for_completion;
 1300         xbb->reqs_queued_for_completion = 0;
 1301 
 1302         *run_taskqueue = more_to_do;
 1303 }
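
/*
 * Usage sketch (editorial): responses are queued and pushed under a single
 * hold of the lock so the front-end can never observe a response before
 * this end has released the corresponding resources:
 *
 *     mtx_lock(&xbb->lock);
 *     xbb_queue_response(xbb, nreq, status);
 *     ...
 *     xbb_push_responses(xbb, &run_taskqueue, &notify);
 *     mtx_unlock(&xbb->lock);
 *     if (notify)
 *             xen_intr_signal(xbb->xen_intr_handle);
 */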
 1304 
 1305 /**
 1306  * Complete a request list.
 1307  *
 1308  * \param xbb        Per-instance xbb configuration structure.
 1309  * \param reqlist    Allocated internal request list structure.
 1310  */
 1311 static void
 1312 xbb_complete_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist)
 1313 {
 1314         struct xbb_xen_req *nreq;
 1315         off_t               sectors_sent;
 1316         int                 notify, run_taskqueue;
 1317 
 1318         sectors_sent = 0;
 1319 
 1320         if (reqlist->flags & XBB_REQLIST_MAPPED)
 1321                 xbb_unmap_reqlist(reqlist);
 1322 
 1323         mtx_lock(&xbb->lock);
 1324 
 1325         /*
 1326          * All I/O is done, send the response. A lock is not necessary
 1327          * to protect the request list, because all requests have
 1328          * completed.  Therefore this is the only context accessing this
 1329          * reqlist right now.  However, in order to make sure that no one
 1330          * else queues responses onto the queue or pushes them to the other
 1331          * side while we're active, we need to hold the lock across the
 1332          * calls to xbb_queue_response() and xbb_push_responses().
 1333          */
 1334         STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) {
 1335                 off_t cur_sectors_sent;
 1336 
 1337                 /* Put this response on the ring, but don't push yet */
 1338                 xbb_queue_response(xbb, nreq, reqlist->status);
 1339 
 1340                 /* We don't report bytes sent if there is an error. */
 1341                 if (reqlist->status == BLKIF_RSP_OKAY)
 1342                         cur_sectors_sent = nreq->nr_512b_sectors;
 1343                 else
 1344                         cur_sectors_sent = 0;
 1345 
 1346                 sectors_sent += cur_sectors_sent;
 1347 
 1348                 devstat_end_transaction(xbb->xbb_stats_in,
 1349                                         /*bytes*/cur_sectors_sent << 9,
 1350                                         reqlist->ds_tag_type,
 1351                                         reqlist->ds_trans_type,
 1352                                         /*now*/NULL,
 1353                                         /*then*/&nreq->ds_t0);
 1354         }
 1355 
 1356         /*
 1357          * Take out any sectors not sent.  If we wind up negative (which
 1358          * might happen if an error is reported as well as a residual), just
 1359          * report 0 sectors sent.
 1360          */
 1361         sectors_sent -= reqlist->residual_512b_sectors;
 1362         if (sectors_sent < 0)
 1363                 sectors_sent = 0;
 1364 
 1365         devstat_end_transaction(xbb->xbb_stats,
 1366                                 /*bytes*/ sectors_sent << 9,
 1367                                 reqlist->ds_tag_type,
 1368                                 reqlist->ds_trans_type,
 1369                                 /*now*/NULL,
 1370                                 /*then*/&reqlist->ds_t0);
 1371 
 1372         xbb_release_reqlist(xbb, reqlist, /*wakeup*/ 1);
 1373 
 1374         xbb_push_responses(xbb, &run_taskqueue, &notify);
 1375 
 1376         mtx_unlock(&xbb->lock);
 1377 
 1378         if (run_taskqueue)
 1379                 taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 
 1380 
 1381         if (notify)
 1382                 xen_intr_signal(xbb->xen_intr_handle);
 1383 }
 1384 
 1385 /**
 1386  * Completion handler for buffer I/O requests issued by the device
 1387  * backend driver.
 1388  *
 1389  * \param bio  The buffer I/O request on which to perform completion
 1390  *             processing.
 1391  */
 1392 static void
 1393 xbb_bio_done(struct bio *bio)
 1394 {
 1395         struct xbb_softc       *xbb;
 1396         struct xbb_xen_reqlist *reqlist;
 1397 
 1398         reqlist = bio->bio_caller1;
 1399         xbb     = reqlist->xbb;
 1400 
 1401         reqlist->residual_512b_sectors += bio->bio_resid >> 9;
 1402 
 1403         /*
 1404          * This is a bit imprecise.  With aggregated I/O a single
 1405          * request list can contain multiple front-end requests and
 1406          * multiple bios may point to a single request.  By carefully
 1407          * walking the request list, we could map residuals and errors
 1408          * back to the original front-end request, but the interface
 1409          * isn't sufficiently rich for us to properly report the error.
 1410          * So, we just treat the entire request list as having failed if an
 1411          * error occurs on any part.  And, if an error occurs, we treat
 1412          * the amount of data transferred as 0.
 1413          *
 1414          * For residuals, we report it on the overall aggregated device,
 1415          * but not on the individual requests, since we don't currently
 1416          * do the work to determine which front-end request to which the
 1417          * residual applies.
 1418          */
 1419         if (bio->bio_error) {
 1420                 DPRINTF("BIO returned error %d for operation on device %s\n",
 1421                         bio->bio_error, xbb->dev_name);
 1422                 reqlist->status = BLKIF_RSP_ERROR;
 1423 
 1424                 if (bio->bio_error == ENXIO
 1425                  && xenbus_get_state(xbb->dev) == XenbusStateConnected) {
 1426                         /*
 1427                          * Backend device has disappeared.  Signal the
 1428                          * front-end that we (the device proxy) want to
 1429                          * go away.
 1430                          */
 1431                         xenbus_set_state(xbb->dev, XenbusStateClosing);
 1432                 }
 1433         }
 1434 
 1435         /*
 1436          * Decrement the pending count for the request list.  When we're
 1437          * done with the requests, send status back for all of them.
 1438          */
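              /*
               * atomic_fetchadd_int() returns the value held prior to
               * the subtraction, so a result of 1 means this bio was
               * the last one outstanding for the request list.
               */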
 1439         if (atomic_fetchadd_int(&reqlist->pendcnt, -1) == 1)
 1440                 xbb_complete_reqlist(xbb, reqlist);
 1441 
 1442         g_destroy_bio(bio);
 1443 }
 1444 
 1445 /**
 1446  * Parse a blkif request into an internal request structure and send
 1447  * it to the backend for processing.
 1448  *
 1449  * \param xbb       Per-instance xbb configuration structure.
 1450  * \param reqlist   Allocated internal request list structure.
 1451  *
 1452  * \return          On success, 0.  For resource shortages, non-zero.
 1453  *  
 1454  * This routine performs the backend-common aspects of request parsing,
 1455  * including compiling an internal request structure, parsing the S/G
 1456  * list and any secondary ring requests in which they may reside, and
 1457  * mapping front-end I/O pages into our domain.
 1458  */
 1459 static int
 1460 xbb_dispatch_io(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist)
 1461 {
 1462         struct xbb_sg                *xbb_sg;
 1463         struct gnttab_map_grant_ref  *map;
 1464         struct blkif_request_segment *sg;
 1465         struct blkif_request_segment *last_block_sg;
 1466         struct xbb_xen_req           *nreq;
 1467         u_int                         nseg;
 1468         u_int                         seg_idx;
 1469         u_int                         block_segs;
 1470         int                           nr_sects;
 1471         int                           total_sects;
 1472         int                           operation;
 1473         uint8_t                       bio_flags;
 1474         int                           error;
 1475 
 1476         reqlist->ds_tag_type = DEVSTAT_TAG_SIMPLE;
 1477         bio_flags            = 0;
 1478         total_sects          = 0;
 1479         nr_sects             = 0;
 1480 
 1481         /*
 1482          * First determine whether we have enough free KVA to satisfy this
 1483          * request list.  If not, tell xbb_run_queue() so it can go to
 1484          * sleep until we have more KVA.
 1485          */
 1486         reqlist->kva = NULL;
 1487         if (reqlist->nr_segments != 0) {
 1488                 reqlist->kva = xbb_get_kva(xbb, reqlist->nr_segments);
 1489                 if (reqlist->kva == NULL) {
 1490                         /*
 1491                          * If we're out of KVA, return ENOMEM.
 1492                          */
 1493                         return (ENOMEM);
 1494                 }
 1495         }
 1496 
 1497         binuptime(&reqlist->ds_t0);
 1498         devstat_start_transaction(xbb->xbb_stats, &reqlist->ds_t0);
 1499 
 1500         switch (reqlist->operation) {
 1501         case BLKIF_OP_WRITE_BARRIER:
 1502                 bio_flags       |= BIO_ORDERED;
 1503                 reqlist->ds_tag_type = DEVSTAT_TAG_ORDERED;
 1504                 /* FALLTHROUGH */
 1505         case BLKIF_OP_WRITE:
 1506                 operation = BIO_WRITE;
 1507                 reqlist->ds_trans_type = DEVSTAT_WRITE;
 1508                 if ((xbb->flags & XBBF_READ_ONLY) != 0) {
 1509                         DPRINTF("Attempt to write to read only device %s\n",
 1510                                 xbb->dev_name);
 1511                         reqlist->status = BLKIF_RSP_ERROR;
 1512                         goto send_response;
 1513                 }
 1514                 break;
 1515         case BLKIF_OP_READ:
 1516                 operation = BIO_READ;
 1517                 reqlist->ds_trans_type = DEVSTAT_READ;
 1518                 break;
 1519         case BLKIF_OP_FLUSH_DISKCACHE:
 1520                 /*
 1521                  * If the user has requested that we disable flush
 1522                  * support, just complete the request successfully
 1523                  * without dispatching it to the backend.
 1524                  */
 1525                 if (xbb->disable_flush != 0) {
 1526                         goto send_response;
 1527                 }
 1528 
 1529                 /*
 1530                  * The user has requested that we only send a real flush
 1531                  * for every N flush requests.  So keep count, and either
 1532                  * complete the request immediately or queue it for the
 1533                  * backend.
 1534                  */
 1535                 if (xbb->flush_interval != 0) {
 1536                         if (++(xbb->flush_count) < xbb->flush_interval) {
 1537                                 goto send_response;
 1538                         } else
 1539                                 xbb->flush_count = 0;
 1540                 }
 1541 
 1542                 operation = BIO_FLUSH;
 1543                 reqlist->ds_tag_type = DEVSTAT_TAG_ORDERED;
 1544                 reqlist->ds_trans_type = DEVSTAT_NO_DATA;
 1545                 goto do_dispatch;
 1546                 /*NOTREACHED*/
 1547         default:
 1548                 DPRINTF("error: unknown block io operation [%d]\n",
 1549                         reqlist->operation);
 1550                 reqlist->status = BLKIF_RSP_ERROR;
 1551                 goto send_response;
 1552         }
 1553 
 1554         reqlist->xbb  = xbb;
 1555         xbb_sg        = xbb->xbb_sgs;
 1556         map           = xbb->maps;
 1557         seg_idx       = 0;
 1558 
 1559         STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) {
 1560                 blkif_request_t         *ring_req;
 1561 
 1562                 ring_req              = nreq->ring_req;
 1563                 nr_sects              = 0;
 1564                 nseg                  = ring_req->nr_segments;
 1565                 nreq->nr_pages        = nseg;
 1566                 nreq->nr_512b_sectors = 0;
 1567                 sg                    = NULL;
 1568 
 1569                 /* Check that number of segments is sane. */
 1570                 if (__predict_false(nseg == 0)
 1571                  || __predict_false(nseg > xbb->max_request_segments)) {
 1572                         DPRINTF("Bad number of segments in request (%d)\n",
 1573                                 nseg);
 1574                         reqlist->status = BLKIF_RSP_ERROR;
 1575                         goto send_response;
 1576                 }
 1577 
 1578                 block_segs    = nseg;
 1579                 sg            = ring_req->seg;
 1580                 last_block_sg = sg + block_segs;
 1581 
 1582                 while (sg < last_block_sg) {
 1583                         KASSERT(seg_idx <
 1584                                 XBB_MAX_SEGMENTS_PER_REQLIST,
 1585                                 ("seg_idx %d is too large, max "
 1586                                 "segs %d\n", seg_idx,
 1587                                 XBB_MAX_SEGMENTS_PER_REQLIST));
 1588 
 1589                         xbb_sg->first_sect = sg->first_sect;
 1590                         xbb_sg->last_sect  = sg->last_sect;
 1591                         xbb_sg->nsect =
 1592                             (int8_t)(sg->last_sect -
 1593                             sg->first_sect + 1);
 1594 
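                              /*
                               * PAGE_SIZE >> 9 is the number of 512-byte
                               * sectors in a page (8 on a 4 KiB page), so
                               * last_sect must index within the page and
                               * the segment must cover at least one sector.
                               */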
 1595                         if ((sg->last_sect >= (PAGE_SIZE >> 9))
 1596                          || (xbb_sg->nsect <= 0)) {
 1597                                 reqlist->status = BLKIF_RSP_ERROR;
 1598                                 goto send_response;
 1599                         }
 1600 
 1601                         nr_sects += xbb_sg->nsect;
 1602                         map->host_addr = xbb_get_gntaddr(reqlist,
 1603                                                 seg_idx, /*sector*/0);
 1604                         KASSERT(map->host_addr + PAGE_SIZE <=
 1605                                 xbb->ring_config.gnt_addr,
 1606                                 ("Host address %#jx len %d overlaps "
 1607                                  "ring address %#jx\n",
 1608                                 (uintmax_t)map->host_addr, PAGE_SIZE,
 1609                                 (uintmax_t)xbb->ring_config.gnt_addr));
 1610 
 1611                         map->flags     = GNTMAP_host_map;
 1612                         map->ref       = sg->gref;
 1613                         map->dom       = xbb->otherend_id;
 1614                         if (operation == BIO_WRITE)
 1615                                 map->flags |= GNTMAP_readonly;
 1616                         sg++;
 1617                         map++;
 1618                         xbb_sg++;
 1619                         seg_idx++;
 1620                 }
 1621 
 1622                 /* Convert to the disk's sector size */
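                      /*
                       * For example, with a 4096-byte backing sector size
                       * (sector_size_shift == 12), 8 front-end 512-byte
                       * sectors become (8 << 9) >> 12 == 1 backing sector.
                       */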
 1623                 nreq->nr_512b_sectors = nr_sects;
 1624                 nr_sects = (nr_sects << 9) >> xbb->sector_size_shift;
 1625                 total_sects += nr_sects;
 1626 
 1627                 if ((nreq->nr_512b_sectors &
 1628                     ((xbb->sector_size >> 9) - 1)) != 0) {
 1629                         device_printf(xbb->dev, "%s: I/O size (%d) is not "
 1630                                       "a multiple of the backing store sector "
 1631                                       "size (%d)\n", __func__,
 1632                                       nreq->nr_512b_sectors << 9,
 1633                                       xbb->sector_size);
 1634                         reqlist->status = BLKIF_RSP_ERROR;
 1635                         goto send_response;
 1636                 }
 1637         }
 1638 
 1639         error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
 1640                                           xbb->maps, reqlist->nr_segments);
 1641         if (error != 0)
 1642                 panic("Grant table operation failed (%d)", error);
 1643 
 1644         reqlist->flags |= XBB_REQLIST_MAPPED;
 1645 
 1646         for (seg_idx = 0, map = xbb->maps; seg_idx < reqlist->nr_segments;
 1647              seg_idx++, map++){
 1648                 if (__predict_false(map->status != 0)) {
 1649                         DPRINTF("invalid buffer -- could not remap "
 1650                                 "it (%d)\n", map->status);
 1651                         DPRINTF("Mapping(%d): Host Addr 0x%"PRIx64", flags "
 1652                                 "0x%x ref 0x%x, dom %d\n", seg_idx,
 1653                                 map->host_addr, map->flags, map->ref,
 1654                                 map->dom);
 1655                         reqlist->status = BLKIF_RSP_ERROR;
 1656                         goto send_response;
 1657                 }
 1658 
 1659                 reqlist->gnt_handles[seg_idx] = map->handle;
 1660         }
 1661         if (reqlist->starting_sector_number + total_sects >
 1662             xbb->media_num_sectors) {
 1663                 DPRINTF("%s of [%" PRIu64 ",%" PRIu64 "] "
 1664                         "extends past end of device %s\n",
 1665                         operation == BIO_READ ? "read" : "write",
 1666                         reqlist->starting_sector_number,
 1667                         reqlist->starting_sector_number + total_sects,
 1668                         xbb->dev_name); 
 1669                 reqlist->status = BLKIF_RSP_ERROR;
 1670                 goto send_response;
 1671         }
 1672 
 1673 do_dispatch:
 1674 
 1675         error = xbb->dispatch_io(xbb,
 1676                                  reqlist,
 1677                                  operation,
 1678                                  bio_flags);
 1679 
 1680         if (error != 0) {
 1681                 reqlist->status = BLKIF_RSP_ERROR;
 1682                 goto send_response;
 1683         }
 1684 
 1685         return (0);
 1686 
 1687 send_response:
 1688 
 1689         xbb_complete_reqlist(xbb, reqlist);
 1690 
 1691         return (0);
 1692 }
 1693 
 1694 static __inline int
 1695 xbb_count_sects(blkif_request_t *ring_req)
 1696 {
 1697         int i;
 1698         int cur_size = 0;
 1699 
 1700         for (i = 0; i < ring_req->nr_segments; i++) {
 1701                 int nsect;
 1702 
 1703                 nsect = (int8_t)(ring_req->seg[i].last_sect -
 1704                         ring_req->seg[i].first_sect + 1);
 1705                 if (nsect <= 0)
 1706                         break;
 1707 
 1708                 cur_size += nsect;
 1709         }
 1710 
 1711         return (cur_size);
 1712 }
 1713 
 1714 /**
 1715  * Process incoming requests from the shared communication ring in response
 1716  * to a signal on the ring's event channel.
 1717  *
 1718  * \param context  Callback argument registered during task initialization -
 1719  *                 the xbb_softc for this instance.
 1720  * \param pending  The number of taskqueue_enqueue events that have
 1721  *                 occurred since this handler was last run.
 1722  */
 1723 static void
 1724 xbb_run_queue(void *context, int pending)
 1725 {
 1726         struct xbb_softc       *xbb;
 1727         blkif_back_rings_t     *rings;
 1728         RING_IDX                rp;
 1729         uint64_t                cur_sector;
 1730         int                     cur_operation;
 1731         struct xbb_xen_reqlist *reqlist;
 1732 
 1733         xbb   = (struct xbb_softc *)context;
 1734         rings = &xbb->rings;
 1735 
 1736         /*
 1737          * Work gather and dispatch loop.  Note that we have a bias here
 1738          * towards gathering I/O sent by blockfront.  We first gather up
 1739          * everything in the ring, as long as we have resources.  Then we
 1740          * dispatch one request, and then attempt to gather up any
 1741          * additional requests that have come in while we were dispatching
 1742          * the request.
 1743          *
 1744          * This allows us to get a clearer picture (via devstat) of how
 1745          * many requests blockfront is queueing to us at any given time.
 1746          */
 1747         for (;;) {
 1748                 int retval;
 1749 
 1750                 /*
 1751                  * Initialize reqlist to the last element in the pending
 1752                  * queue, if there is one.  This allows us to add more
 1753                  * requests to that request list, if we have room.
 1754                  */
 1755                 reqlist = STAILQ_LAST(&xbb->reqlist_pending_stailq,
 1756                                       xbb_xen_reqlist, links);
 1757                 if (reqlist != NULL) {
 1758                         cur_sector = reqlist->next_contig_sector;
 1759                         cur_operation = reqlist->operation;
 1760                 } else {
 1761                         cur_operation = 0;
 1762                         cur_sector    = 0;
 1763                 }
 1764 
 1765                 /*
 1766                  * Cache req_prod to avoid accessing a cache line shared
 1767                  * with the frontend.
 1768                  */
 1769                 rp = rings->common.sring->req_prod;
 1770 
 1771                 /* Ensure we see queued requests up to 'rp'. */
 1772                 rmb();
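                      /*
                       * The read barrier pairs with the write barrier the
                       * frontend issues before advancing req_prod, so the
                       * contents of every request published up to 'rp' are
                       * visible before we dereference them below.
                       */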
 1773 
 1774                 /**
 1775                  * Run so long as there is work to consume and the generation
 1776                  * of a response will not overflow the ring.
 1777                  *
 1778                  * @note There's a 1 to 1 relationship between requests and
 1779                  *       responses, so an overflow should never occur.  This
 1780                  *       test is to protect our domain from digesting bogus
 1781                  *       data.  Shouldn't we log this?
 1782                  */
 1783                 while (rings->common.req_cons != rp
 1784                     && RING_REQUEST_CONS_OVERFLOW(&rings->common,
 1785                                                   rings->common.req_cons) == 0){
 1786                         blkif_request_t         ring_req_storage;
 1787                         blkif_request_t        *ring_req;
 1788                         int                     cur_size;
 1789 
 1790                         switch (xbb->abi) {
 1791                         case BLKIF_PROTOCOL_NATIVE:
 1792                                 ring_req = RING_GET_REQUEST(&xbb->rings.native,
 1793                                     rings->common.req_cons);
 1794                                 break;
 1795                         case BLKIF_PROTOCOL_X86_32:
 1796                         {
 1797                                 struct blkif_x86_32_request *ring_req32;
 1798 
 1799                                 ring_req32 = RING_GET_REQUEST(
 1800                                     &xbb->rings.x86_32, rings->common.req_cons);
 1801                                 blkif_get_x86_32_req(&ring_req_storage,
 1802                                                      ring_req32);
 1803                                 ring_req = &ring_req_storage;
 1804                                 break;
 1805                         }
 1806                         case BLKIF_PROTOCOL_X86_64:
 1807                         {
 1808                                 struct blkif_x86_64_request *ring_req64;
 1809 
 1810                                 ring_req64 = RING_GET_REQUEST(
 1811                                     &xbb->rings.x86_64, rings->common.req_cons);
 1812                                 blkif_get_x86_64_req(&ring_req_storage,
 1813                                                      ring_req64);
 1814                                 ring_req = &ring_req_storage;
 1815                                 break;
 1816                         }
 1817                         default:
 1818                                 panic("Unexpected blkif protocol ABI.");
 1819                                 /* NOTREACHED */
 1820                         } 
 1821 
 1822                         /*
 1823                          * Check for situations that would require closing
 1824                          * off this I/O for further coalescing:
 1825                          *  - Coalescing is turned off.
 1826                          *  - Current I/O is out of sequence with the previous
 1827                          *    I/O.
 1828                          *  - Coalesced I/O would be too large.
 1829                          */
 1830                         if ((reqlist != NULL)
 1831                          && ((xbb->no_coalesce_reqs != 0)
 1832                           || ((xbb->no_coalesce_reqs == 0)
 1833                            && ((ring_req->sector_number != cur_sector)
 1834                             || (ring_req->operation != cur_operation)
 1835                             || ((ring_req->nr_segments + reqlist->nr_segments) >
 1836                                  xbb->max_reqlist_segments))))) {
 1837                                 reqlist = NULL;
 1838                         }
 1839 
 1840                         /*
 1841                          * Grab and check for all resources in one shot.
 1842                          * If we can't get all of the resources we need,
 1843                          * the shortage is noted and the thread will get
 1844                          * woken up when more resources are available.
 1845                          */
 1846                         retval = xbb_get_resources(xbb, &reqlist, ring_req,
 1847                                                    xbb->rings.common.req_cons);
 1848 
 1849                         if (retval != 0) {
 1850                                 /*
 1851                                  * Resource shortage has been recorded.
 1852                                  * We'll be scheduled to run once a request
 1853                                  * object frees up due to a completion.
 1854                                  */
 1855                                 break;
 1856                         }
 1857 
 1858                         /*
 1859                          * Signify that we can overwrite this request with
 1860                          * a response by incrementing our consumer index.
 1861                          * The response won't be generated until after
 1862                          * we've already consumed all necessary data out
 1863                          * of the version of the request in the ring buffer
 1864                          * (for native mode).  We must update the consumer
 1865                          * index before issuing back-end I/O so there is
 1866                          * no possibility that it will complete and a
 1867                          * response be generated before we make room in 
 1868                          * the queue for that response.
 1869                          */
 1870                         xbb->rings.common.req_cons++;
 1871                         xbb->reqs_received++;
 1872 
 1873                         cur_size = xbb_count_sects(ring_req);
 1874                         cur_sector = ring_req->sector_number + cur_size;
 1875                         reqlist->next_contig_sector = cur_sector;
 1876                         cur_operation = ring_req->operation;
 1877                 }
 1878 
 1879                 /* Check for I/O to dispatch */
 1880                 reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq);
 1881                 if (reqlist == NULL) {
 1882                         /*
 1883                          * We're out of work to do, put the task queue to
 1884                          * sleep.
 1885                          */
 1886                         break;
 1887                 }
 1888 
 1889                 /*
 1890                  * Grab the first request off the queue and attempt
 1891                  * to dispatch it.
 1892                  */
 1893                 STAILQ_REMOVE_HEAD(&xbb->reqlist_pending_stailq, links);
 1894 
 1895                 retval = xbb_dispatch_io(xbb, reqlist);
 1896                 if (retval != 0) {
 1897                         /*
 1898                          * xbb_dispatch_io() returns non-zero only when
 1899                          * there is a resource shortage.  If that's the
 1900                          * case, re-queue this request on the head of the
 1901                          * queue, and go to sleep until we have more
 1902                          * resources.
 1903                          */
 1904                         STAILQ_INSERT_HEAD(&xbb->reqlist_pending_stailq,
 1905                                            reqlist, links);
 1906                         break;
 1907                 } else {
 1908                         /*
 1909                          * If we still have anything on the queue after
 1910                          * removing the head entry, that is because we
 1911                          * met one of the criteria to create a new
 1912                          * request list (outlined above), and we'll call
 1913                          * that a forced dispatch for statistical purposes.
 1914                          *
 1915                          * Otherwise, if there is only one element on the
 1916                          * queue, we coalesced everything available on
 1917                          * the ring and we'll call that a normal dispatch.
 1918                          */
 1919                         reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq);
 1920 
 1921                         if (reqlist != NULL)
 1922                                 xbb->forced_dispatch++;
 1923                         else
 1924                                 xbb->normal_dispatch++;
 1925 
 1926                         xbb->total_dispatch++;
 1927                 }
 1928         }
 1929 }
 1930 
 1931 /**
 1932  * Interrupt handler bound to the shared ring's event channel.
 1933  *
 1934  * \param arg  Callback argument registered during event channel
 1935  *             binding - the xbb_softc for this instance.
 1936  */
 1937 static int
 1938 xbb_filter(void *arg)
 1939 {
 1940         struct xbb_softc *xbb;
 1941 
 1942         /* Defer to taskqueue thread. */
 1943         xbb = (struct xbb_softc *)arg;
 1944         taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 
 1945 
 1946         return (FILTER_HANDLED);
 1947 }
 1948 
 1949 SDT_PROVIDER_DEFINE(xbb);
 1950 SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_dev, flush, "int");
 1951 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, read, "int", "uint64_t",
 1952                   "uint64_t");
 1953 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, write, "int",
 1954                   "uint64_t", "uint64_t");
 1955 
 1956 /*----------------------------- Backend Handlers -----------------------------*/
 1957 /**
 1958  * Backend handler for character device access.
 1959  *
 1960  * \param xbb        Per-instance xbb configuration structure.
 1961  * \param reqlist    Allocated internal request list structure.
 1962  * \param operation  BIO_* I/O operation code.
 1963  * \param bio_flags  Additional bio_flag data to pass to any generated
 1964  *                   bios (e.g. BIO_ORDERED).
 1965  *
 1966  * \return  0 for success, errno codes for failure.
 1967  */
 1968 static int
 1969 xbb_dispatch_dev(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist,
 1970                  int operation, int bio_flags)
 1971 {
 1972         struct xbb_dev_data *dev_data;
 1973         struct bio          *bios[XBB_MAX_SEGMENTS_PER_REQLIST];
 1974         off_t                bio_offset;
 1975         struct bio          *bio;
 1976         struct xbb_sg       *xbb_sg;
 1977         u_int                nbio;
 1978         u_int                bio_idx;
 1979         u_int                nseg;
 1980         u_int                seg_idx;
 1981         int                  error;
 1982 
 1983         dev_data   = &xbb->backend.dev;
 1984         bio_offset = (off_t)reqlist->starting_sector_number
 1985                    << xbb->sector_size_shift;
 1986         error      = 0;
 1987         nbio       = 0;
 1988         bio_idx    = 0;
 1989 
 1990         if (operation == BIO_FLUSH) {
 1991                 bio = g_new_bio();
 1992                 if (__predict_false(bio == NULL)) {
 1993                         DPRINTF("Unable to allocate bio for BIO_FLUSH\n");
 1994                         error = ENOMEM;
 1995                         return (error);
 1996                 }
 1997 
 1998                 bio->bio_cmd     = BIO_FLUSH;
 1999                 bio->bio_flags  |= BIO_ORDERED;
 2000                 bio->bio_dev     = dev_data->cdev;
 2001                 bio->bio_offset  = 0;
 2002                 bio->bio_data    = NULL;
 2003                 bio->bio_done    = xbb_bio_done;
 2004                 bio->bio_caller1 = reqlist;
 2005                 bio->bio_pblkno  = 0;
 2006 
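                      /*
                       * A single flush bio covers the entire request list,
                       * so xbb_bio_done() will complete the list as soon
                       * as this one bio finishes.
                       */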
 2007                 reqlist->pendcnt = 1;
 2008 
 2009                 SDT_PROBE1(xbb, kernel, xbb_dispatch_dev, flush,
 2010                            device_get_unit(xbb->dev));
 2011 
 2012                 (*dev_data->csw->d_strategy)(bio);
 2013 
 2014                 return (0);
 2015         }
 2016 
 2017         xbb_sg = xbb->xbb_sgs;
 2018         bio    = NULL;
 2019         nseg = reqlist->nr_segments;
 2020 
 2021         for (seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) {
 2022                 /*
 2023                  * KVA will not be contiguous, so any additional
 2024                  * I/O will need to be represented in a new bio.
 2025                  */
 2026                 if ((bio != NULL)
 2027                  && (xbb_sg->first_sect != 0)) {
 2028                         if ((bio->bio_length & (xbb->sector_size - 1)) != 0) {
 2029                                 printf("%s: Discontiguous I/O request "
 2030                                        "from domain %d ends on "
 2031                                        "non-sector boundary\n",
 2032                                        __func__, xbb->otherend_id);
 2033                                 error = EINVAL;
 2034                                 goto fail_free_bios;
 2035                         }
 2036                         bio = NULL;
 2037                 }
 2038 
 2039                 if (bio == NULL) {
 2040                         /*
 2041                          * Make sure that the start of this bio is
 2042                          * aligned to a device sector.
 2043                          */
 2044                         if ((bio_offset & (xbb->sector_size - 1)) != 0){
 2045                                 printf("%s: Misaligned I/O request "
 2046                                        "from domain %d\n", __func__,
 2047                                        xbb->otherend_id);
 2048                                 error = EINVAL;
 2049                                 goto fail_free_bios;
 2050                         }
 2051 
 2052                         bio = bios[nbio++] = g_new_bio();
 2053                         if (__predict_false(bio == NULL)) {
 2054                                 error = ENOMEM;
 2055                                 goto fail_free_bios;
 2056                         }
 2057                         bio->bio_cmd     = operation;
 2058                         bio->bio_flags  |= bio_flags;
 2059                         bio->bio_dev     = dev_data->cdev;
 2060                         bio->bio_offset  = bio_offset;
 2061                         bio->bio_data    = xbb_reqlist_ioaddr(reqlist, seg_idx,
 2062                                                 xbb_sg->first_sect);
 2063                         bio->bio_done    = xbb_bio_done;
 2064                         bio->bio_caller1 = reqlist;
 2065                         bio->bio_pblkno  = bio_offset >> xbb->sector_size_shift;
 2066                 }
 2067 
 2068                 bio->bio_length += xbb_sg->nsect << 9;
 2069                 bio->bio_bcount  = bio->bio_length;
 2070                 bio_offset      += xbb_sg->nsect << 9;
 2071 
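                      /*
                       * (PAGE_SIZE - 512) >> 9 is the index of the last
                       * 512-byte sector in a page (7 on a 4 KiB page).
                       * If this segment stops short of it, the next
                       * segment's KVA will not be adjacent to this one.
                       */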
 2072                 if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9) {
 2073                         if ((bio->bio_length & (xbb->sector_size - 1)) != 0) {
 2074                                 printf("%s: Discontiguous I/O request "
 2075                                        "from domain %d ends on "
 2076                                        "non-sector boundary\n",
 2077                                        __func__, xbb->otherend_id);
 2078                                 error = EINVAL;
 2079                                 goto fail_free_bios;
 2080                         }
 2081                         /*
 2082                          * KVA will not be contiguous, so any additional
 2083                          * I/O will need to be represented in a new bio.
 2084                          */
 2085                         bio = NULL;
 2086                 }
 2087         }
 2088 
 2089         reqlist->pendcnt = nbio;
 2090 
 2091         for (bio_idx = 0; bio_idx < nbio; bio_idx++)
 2092         {
 2093                 if (operation == BIO_READ) {
 2094                         SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, read,
 2095                                    device_get_unit(xbb->dev),
 2096                                    bios[bio_idx]->bio_offset,
 2097                                    bios[bio_idx]->bio_length);
 2098                 } else if (operation == BIO_WRITE) {
 2099                         SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, write,
 2100                                    device_get_unit(xbb->dev),
 2101                                    bios[bio_idx]->bio_offset,
 2102                                    bios[bio_idx]->bio_length);
 2103                 }
 2104                 (*dev_data->csw->d_strategy)(bios[bio_idx]);
 2105         }
 2106 
 2107         return (error);
 2108 
 2109 fail_free_bios:
 2110         for (bio_idx = 0; bio_idx < (nbio-1); bio_idx++)
 2111                 g_destroy_bio(bios[bio_idx]);
 2112 
 2113         return (error);
 2114 }
 2115 
 2116 SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_file, flush, "int");
 2117 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, read, "int", "uint64_t",
 2118                   "uint64_t");
 2119 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, write, "int",
 2120                   "uint64_t", "uint64_t");
 2121 
 2122 /**
 2123  * Backend handler for file access.
 2124  *
 2125  * \param xbb        Per-instance xbb configuration structure.
 2126  * \param reqlist    Allocated internal request list.
 2127  * \param operation  BIO_* I/O operation code.
 2128  * \param flags      Additional bio_flag data to pass to any generated bios
 2129  *                   (e.g. BIO_ORDERED).
 2130  *
 2131  * \return  0 for success, errno codes for failure.
 2132  */
 2133 static int
 2134 xbb_dispatch_file(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist,
 2135                   int operation, int flags)
 2136 {
 2137         struct xbb_file_data *file_data;
 2138         u_int                 seg_idx;
 2139         u_int                 nseg;
 2140         struct uio            xuio;
 2141         struct xbb_sg        *xbb_sg;
 2142         struct iovec         *xiovec;
 2143         int                   error;
 2144 
 2145         file_data = &xbb->backend.file;
 2146         error = 0;
 2147         bzero(&xuio, sizeof(xuio));
 2148 
 2149         switch (operation) {
 2150         case BIO_READ:
 2151                 xuio.uio_rw = UIO_READ;
 2152                 break;
 2153         case BIO_WRITE:
 2154                 xuio.uio_rw = UIO_WRITE;
 2155                 break;
 2156         case BIO_FLUSH: {
 2157                 struct mount *mountpoint;
 2158 
 2159                 SDT_PROBE1(xbb, kernel, xbb_dispatch_file, flush,
 2160                            device_get_unit(xbb->dev));
 2161 
 2162                 (void) vn_start_write(xbb->vn, &mountpoint, V_WAIT);
 2163 
 2164                 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY);
 2165                 error = VOP_FSYNC(xbb->vn, MNT_WAIT, curthread);
 2166                 VOP_UNLOCK(xbb->vn);
 2167 
 2168                 vn_finished_write(mountpoint);
 2169 
 2170                 goto bailout_send_response;
 2171                 /* NOTREACHED */
 2172         }
 2173         default:
 2174                 panic("invalid operation %d", operation);
 2175                 /* NOTREACHED */
 2176         }
 2177         xuio.uio_offset = (vm_offset_t)reqlist->starting_sector_number
 2178                         << xbb->sector_size_shift;
 2179         xuio.uio_segflg = UIO_SYSSPACE;
 2180         xuio.uio_iov = file_data->xiovecs;
 2181         xuio.uio_iovcnt = 0;
 2182         xbb_sg = xbb->xbb_sgs;
 2183         nseg = reqlist->nr_segments;
 2184 
 2185         for (xiovec = NULL, seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) {
 2186                 /*
 2187                  * If the first sector is not 0, the KVA will
 2188                  * not be contiguous and we'll need to go on
 2189                  * to another segment.
 2190                  */
 2191                 if (xbb_sg->first_sect != 0)
 2192                         xiovec = NULL;
 2193 
 2194                 if (xiovec == NULL) {
 2195                         xiovec = &file_data->xiovecs[xuio.uio_iovcnt];
 2196                         xiovec->iov_base = xbb_reqlist_ioaddr(reqlist,
 2197                             seg_idx, xbb_sg->first_sect);
 2198                         xiovec->iov_len = 0;
 2199                         xuio.uio_iovcnt++;
 2200                 }
 2201 
 2202                 xiovec->iov_len += xbb_sg->nsect << 9;
 2203 
 2204                 xuio.uio_resid += xbb_sg->nsect << 9;
 2205 
 2206                 /*
 2207                  * If the last sector is not the full page
 2208                  * size count, the next segment will not be
 2209                  * contiguous in KVA and we need a new iovec.
 2210                  */
 2211                 if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9)
 2212                         xiovec = NULL;
 2213         }
 2214 
 2215         xuio.uio_td = curthread;
 2216 
 2217         switch (operation) {
 2218         case BIO_READ:
 2219 
 2220                 SDT_PROBE3(xbb, kernel, xbb_dispatch_file, read,
 2221                            device_get_unit(xbb->dev), xuio.uio_offset,
 2222                            xuio.uio_resid);
 2223 
 2224                 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY);
 2225 
 2226                 /*
 2227                  * UFS pays attention to IO_DIRECT for reads.  If the
 2228                  * DIRECTIO option is configured into the kernel, it calls
 2229                  * ffs_rawread().  But that only works for single-segment
 2230                  * uios with user space addresses.  In our case, with a
 2231                  * kernel uio, it still reads into the buffer cache, but it
 2232                  * will just try to release the buffer from the cache later
 2233                  * on in ffs_read().
 2234                  *
 2235                  * ZFS does not pay attention to IO_DIRECT for reads.
 2236                  *
 2237                  * UFS does not pay attention to IO_SYNC for reads.
 2238                  *
 2239                  * ZFS pays attention to IO_SYNC (which translates into the
 2240                  * Solaris define FRSYNC for zfs_read()) for reads.  It
 2241                  * attempts to sync the file before reading.
 2242                  *
 2243                  * So, to attempt to provide some barrier semantics in the
 2244                  * BIO_ORDERED case, set both IO_DIRECT and IO_SYNC.  
 2245                  */
 2246                 error = VOP_READ(xbb->vn, &xuio, (flags & BIO_ORDERED) ? 
 2247                                  (IO_DIRECT|IO_SYNC) : 0, file_data->cred);
 2248 
 2249                 VOP_UNLOCK(xbb->vn);
 2250                 break;
 2251         case BIO_WRITE: {
 2252                 struct mount *mountpoint;
 2253 
 2254                 SDT_PROBE3(xbb, kernel, xbb_dispatch_file, write,
 2255                            device_get_unit(xbb->dev), xuio.uio_offset,
 2256                            xuio.uio_resid);
 2257 
 2258                 (void)vn_start_write(xbb->vn, &mountpoint, V_WAIT);
 2259 
 2260                 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY);
 2261 
 2262                 /*
 2263                  * UFS pays attention to IO_DIRECT for writes.  The write
 2264                  * is done asynchronously.  (Normally the write would just
 2265                  * get put into the cache.)
 2266                  *
 2267                  * UFS pays attention to IO_SYNC for writes.  It will
 2268                  * attempt to write the buffer out synchronously if that
 2269                  * flag is set.
 2270                  *
 2271                  * ZFS does not pay attention to IO_DIRECT for writes.
 2272                  *
 2273                  * ZFS pays attention to IO_SYNC (a.k.a. FSYNC or FRSYNC)
 2274                  * for writes.  It will flush the transaction from the
 2275                  * cache before returning.
 2276                  *
 2277                  * So if we've got the BIO_ORDERED flag set, we want
 2278                  * IO_SYNC in either the UFS or ZFS case.
 2279                  */
 2280                 error = VOP_WRITE(xbb->vn, &xuio, (flags & BIO_ORDERED) ?
 2281                                   IO_SYNC : 0, file_data->cred);
 2282                 VOP_UNLOCK(xbb->vn);
 2283 
 2284                 vn_finished_write(mountpoint);
 2285 
 2286                 break;
 2287         }
 2288         default:
 2289                 panic("invalid operation %d", operation);
 2290                 /* NOTREACHED */
 2291         }
 2292 
 2293 bailout_send_response:
 2294 
 2295         if (error != 0)
 2296                 reqlist->status = BLKIF_RSP_ERROR;
 2297 
 2298         xbb_complete_reqlist(xbb, reqlist);
 2299 
 2300         return (0);
 2301 }
 2302 
 2303 /*--------------------------- Backend Configuration --------------------------*/
 2304 /**
 2305  * Close and clean up any backend device/file-specific state for this
 2306  * block back instance.
 2307  *
 2308  * \param xbb  Per-instance xbb configuration structure.
 2309  */
 2310 static void
 2311 xbb_close_backend(struct xbb_softc *xbb)
 2312 {
 2313         DROP_GIANT();
 2314         DPRINTF("closing dev=%s\n", xbb->dev_name);
 2315         if (xbb->vn) {
 2316                 int flags = FREAD;
 2317 
 2318                 if ((xbb->flags & XBBF_READ_ONLY) == 0)
 2319                         flags |= FWRITE;
 2320 
 2321                 switch (xbb->device_type) {
 2322                 case XBB_TYPE_DISK:
 2323                         if (xbb->backend.dev.csw) {
 2324                                 dev_relthread(xbb->backend.dev.cdev,
 2325                                               xbb->backend.dev.dev_ref);
 2326                                 xbb->backend.dev.csw  = NULL;
 2327                                 xbb->backend.dev.cdev = NULL;
 2328                         }
 2329                         break;
 2330                 case XBB_TYPE_FILE:
 2331                         break;
 2332                 case XBB_TYPE_NONE:
 2333                 default:
 2334                         panic("Unexpected backend type.");
 2335                         break;
 2336                 }
 2337 
 2338                 (void)vn_close(xbb->vn, flags, NOCRED, curthread);
 2339                 xbb->vn = NULL;
 2340 
 2341                 switch (xbb->device_type) {
 2342                 case XBB_TYPE_DISK:
 2343                         break;
 2344                 case XBB_TYPE_FILE:
 2345                         if (xbb->backend.file.cred != NULL) {
 2346                                 crfree(xbb->backend.file.cred);
 2347                                 xbb->backend.file.cred = NULL;
 2348                         }
 2349                         break;
 2350                 case XBB_TYPE_NONE:
 2351                 default:
 2352                         panic("Unexpected backend type.");
 2353                         break;
 2354                 }
 2355         }
 2356         PICKUP_GIANT();
 2357 }
 2358 
 2359 /**
 2360  * Open a character device to be used for backend I/O.
 2361  *
 2362  * \param xbb  Per-instance xbb configuration structure.
 2363  *
 2364  * \return  0 for success, errno codes for failure.
 2365  */
 2366 static int
 2367 xbb_open_dev(struct xbb_softc *xbb)
 2368 {
 2369         struct vattr   vattr;
 2370         struct cdev   *dev;
 2371         struct cdevsw *devsw;
 2372         int            error;
 2373 
 2374         xbb->device_type = XBB_TYPE_DISK;
 2375         xbb->dispatch_io = xbb_dispatch_dev;
 2376         xbb->backend.dev.cdev = xbb->vn->v_rdev;
 2377         xbb->backend.dev.csw = dev_refthread(xbb->backend.dev.cdev,
 2378                                              &xbb->backend.dev.dev_ref);
 2379         if (xbb->backend.dev.csw == NULL)
 2380                 panic("Unable to retrieve device switch");
 2381 
 2382         error = VOP_GETATTR(xbb->vn, &vattr, NOCRED);
 2383         if (error) {
 2384                 xenbus_dev_fatal(xbb->dev, error, "error getting "
 2385                                  "vnode attributes for device %s",
 2386                                  xbb->dev_name);
 2387                 return (error);
 2388         }
 2389 
 2390         dev = xbb->vn->v_rdev;
 2391         devsw = dev->si_devsw;
 2392         if (!devsw->d_ioctl) {
 2393                 xenbus_dev_fatal(xbb->dev, ENODEV, "no d_ioctl for "
 2394                                  "device %s!", xbb->dev_name);
 2395                 return (ENODEV);
 2396         }
 2397 
 2398         error = devsw->d_ioctl(dev, DIOCGSECTORSIZE,
 2399                                (caddr_t)&xbb->sector_size, FREAD,
 2400                                curthread);
 2401         if (error) {
 2402                 xenbus_dev_fatal(xbb->dev, error,
 2403                                  "error calling ioctl DIOCGSECTORSIZE "
 2404                                  "for device %s", xbb->dev_name);
 2405                 return (error);
 2406         }
 2407 
 2408         error = devsw->d_ioctl(dev, DIOCGMEDIASIZE,
 2409                                (caddr_t)&xbb->media_size, FREAD,
 2410                                curthread);
 2411         if (error) {
 2412                 xenbus_dev_fatal(xbb->dev, error,
 2413                                  "error calling ioctl DIOCGMEDIASIZE "
 2414                                  "for device %s", xbb->dev_name);
 2415                 return (error);
 2416         }
 2417 
 2418         return (0);
 2419 }
 2420 
 2421 /**
 2422  * Open a file to be used for backend I/O.
 2423  *
 2424  * \param xbb  Per-instance xbb configuration structure.
 2425  *
 2426  * \return  0 for success, errno codes for failure.
 2427  */
 2428 static int
 2429 xbb_open_file(struct xbb_softc *xbb)
 2430 {
 2431         struct xbb_file_data *file_data;
 2432         struct vattr          vattr;
 2433         int                   error;
 2434 
 2435         file_data = &xbb->backend.file;
 2436         xbb->device_type = XBB_TYPE_FILE;
 2437         xbb->dispatch_io = xbb_dispatch_file;
 2438         error = VOP_GETATTR(xbb->vn, &vattr, curthread->td_ucred);
 2439         if (error != 0) {
 2440                 xenbus_dev_fatal(xbb->dev, error,
 2441                                  "error calling VOP_GETATTR()"
 2442                                  "for file %s", xbb->dev_name);
 2443                 return (error);
 2444         }
 2445 
 2446         /*
 2447          * Verify that we have the ability to upgrade to exclusive
 2448          * access on this file so we can trap errors at open instead
 2449          * of reporting them during first access.
 2450          */
 2451         if (VOP_ISLOCKED(xbb->vn) != LK_EXCLUSIVE) {
 2452                 vn_lock(xbb->vn, LK_UPGRADE | LK_RETRY);
 2453                 if (VN_IS_DOOMED(xbb->vn)) {
 2454                         error = EBADF;
 2455                         xenbus_dev_fatal(xbb->dev, error,
 2456                                          "error locking file %s",
 2457                                          xbb->dev_name);
 2458 
 2459                         return (error);
 2460                 }
 2461         }
 2462 
 2463         file_data->cred = crhold(curthread->td_ucred);
 2464         xbb->media_size = vattr.va_size;
 2465 
 2466         /*
 2467          * XXX KDM vattr.va_blocksize may be larger than 512 bytes here.
 2468          * With ZFS, it is 131072 bytes.  Block sizes that large don't work
 2469          * with disklabel and UFS on FreeBSD at least.  Large block sizes
 2470          * may not work with other OSes as well.  So just export a sector
 2471          * size of 512 bytes, which should work with any OS or
 2472          * application.  Since our backing is a file, any block size will
 2473          * work fine for the backing store.
 2474          */
 2475 #if 0
 2476         xbb->sector_size = vattr.va_blocksize;
 2477 #endif
 2478         xbb->sector_size = 512;
 2479 
 2480         /*
 2481          * Sanity check.  The media size has to be at least one
 2482          * sector long.
 2483          */
 2484         if (xbb->media_size < xbb->sector_size) {
 2485                 error = EINVAL;
 2486                 xenbus_dev_fatal(xbb->dev, error,
 2487                                  "file %s size %ju < block size %u",
 2488                                  xbb->dev_name,
 2489                                  (uintmax_t)xbb->media_size,
 2490                                  xbb->sector_size);
 2491         }
 2492         return (error);
 2493 }
 2494 
 2495 /**
 2496  * Open the backend provider for this connection.
 2497  *
 2498  * \param xbb  Per-instance xbb configuration structure.
 2499  *
 2500  * \return  0 for success, errno codes for failure.
 2501  */
 2502 static int
 2503 xbb_open_backend(struct xbb_softc *xbb)
 2504 {
 2505         struct nameidata nd;
 2506         int              flags;
 2507         int              error;
 2508 
 2509         flags = FREAD;
 2510         error = 0;
 2511 
 2512         DPRINTF("opening dev=%s\n", xbb->dev_name);
 2513 
 2514         if (rootvnode == NULL) {
 2515                 xenbus_dev_fatal(xbb->dev, ENOENT,
 2516                                  "Root file system not mounted");
 2517                 return (ENOENT);
 2518         }
 2519 
 2520         if ((xbb->flags & XBBF_READ_ONLY) == 0)
 2521                 flags |= FWRITE;
 2522 
 2523         pwd_ensure_dirs();
 2524 
 2525  again:
 2526         NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, xbb->dev_name);
 2527         error = vn_open(&nd, &flags, 0, NULL);
 2528         if (error) {
 2529                 /*
 2530                  * This is the only reasonable guess we can make for the
 2531                  * path if the user doesn't give us a fully qualified one.
 2532                  * If they want to specify a file, they need to supply the
 2533                  * full path.
 2534                  */
 2535                 if (xbb->dev_name[0] != '/') {
 2536                         char *dev_path = "/dev/";
 2537                         char *dev_name;
 2538 
 2539                         /* Try adding device path at beginning of name */
 2540                         dev_name = malloc(strlen(xbb->dev_name)
 2541                                         + strlen(dev_path) + 1,
 2542                                           M_XENBLOCKBACK, M_NOWAIT);
 2543                         if (dev_name) {
 2544                                 sprintf(dev_name, "%s%s", dev_path,
 2545                                         xbb->dev_name);
 2546                                 free(xbb->dev_name, M_XENBLOCKBACK);
 2547                                 xbb->dev_name = dev_name;
 2548                                 goto again;
 2549                         }
 2550                 }
 2551                 xenbus_dev_fatal(xbb->dev, error, "error opening device %s",
 2552                                  xbb->dev_name);
 2553                 return (error);
 2554         }
 2555 
 2556         NDFREE_PNBUF(&nd);
 2557                 
 2558         xbb->vn = nd.ni_vp;
 2559 
 2560         /* We only support disks and files. */
 2561         if (vn_isdisk_error(xbb->vn, &error)) {
 2562                 error = xbb_open_dev(xbb);
 2563         } else if (xbb->vn->v_type == VREG) {
 2564                 error = xbb_open_file(xbb);
 2565         } else {
 2566                 error = EINVAL;
 2567                 xenbus_dev_fatal(xbb->dev, error, "%s is not a disk "
 2568                                  "or file", xbb->dev_name);
 2569         }
 2570         VOP_UNLOCK(xbb->vn);
 2571 
 2572         if (error != 0) {
 2573                 xbb_close_backend(xbb);
 2574                 return (error);
 2575         }
 2576 
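              /*
               * fls() returns the 1-based index of the most significant
               * set bit, so for a power-of-two sector size this yields
               * log2(sector_size); e.g. fls(512) - 1 == 9.
               */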
 2577         xbb->sector_size_shift = fls(xbb->sector_size) - 1;
 2578         xbb->media_num_sectors = xbb->media_size >> xbb->sector_size_shift;
 2579 
 2580         DPRINTF("opened %s=%s sector_size=%u media_size=%" PRId64 "\n",
 2581                 (xbb->device_type == XBB_TYPE_DISK) ? "dev" : "file",
 2582                 xbb->dev_name, xbb->sector_size, xbb->media_size);
 2583 
 2584         return (0);
 2585 }
 2586 
 2587 /*------------------------ Inter-Domain Communication ------------------------*/
 2588 /**
 2589  * Free dynamically allocated KVA or pseudo-physical address allocations.
 2590  *
 2591  * \param xbb  Per-instance xbb configuration structure.
 2592  */
 2593 static void
 2594 xbb_free_communication_mem(struct xbb_softc *xbb)
 2595 {
 2596         if (xbb->kva != 0) {
 2597                 if (xbb->pseudo_phys_res != NULL) {
 2598                         xenmem_free(xbb->dev, xbb->pseudo_phys_res_id,
 2599                             xbb->pseudo_phys_res);
 2600                         xbb->pseudo_phys_res = NULL;
 2601                 }
 2602         }
 2603         xbb->kva = 0;
 2604         xbb->gnt_base_addr = 0;
 2605         if (xbb->kva_free != NULL) {
 2606                 free(xbb->kva_free, M_XENBLOCKBACK);
 2607                 xbb->kva_free = NULL;
 2608         }
 2609 }
 2610 
 2611 /**
 2612  * Cleanup all inter-domain communication mechanisms.
 2613  *
 2614  * \param xbb  Per-instance xbb configuration structure.
 2615  */
 2616 static int
 2617 xbb_disconnect(struct xbb_softc *xbb)
 2618 {
 2619         DPRINTF("\n");
 2620 
 2621         mtx_unlock(&xbb->lock);
 2622         xen_intr_unbind(&xbb->xen_intr_handle);
 2623         if (xbb->io_taskqueue != NULL)
 2624                 taskqueue_drain(xbb->io_taskqueue, &xbb->io_task);
 2625         mtx_lock(&xbb->lock);
 2626 
 2627         /*
 2628          * No new interrupts can generate work, but we must wait
 2629          * for all currently active requests to drain.
 2630          */
 2631         if (xbb->active_request_count != 0)
 2632                 return (EAGAIN);
 2633 
 2634         if (xbb->flags & XBBF_RING_CONNECTED) {
 2635                 struct gnttab_unmap_grant_ref  ops[XBB_MAX_RING_PAGES];
 2636                 struct gnttab_unmap_grant_ref *op;
 2637                 unsigned int ring_idx;
 2638                 int error;
 2639 
 2640                 for (ring_idx = 0, op = ops;
 2641                      ring_idx < xbb->ring_config.ring_pages;
 2642                      ring_idx++, op++) {
 2643                         op->host_addr    = xbb->ring_config.gnt_addr
 2644                                          + (ring_idx * PAGE_SIZE);
 2645                         op->dev_bus_addr = xbb->ring_config.bus_addr[ring_idx];
 2646                         op->handle       = xbb->ring_config.handle[ring_idx];
 2647                 }
 2648 
 2649                 error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, ops,
 2650                                                   xbb->ring_config.ring_pages);
 2651                 if (error != 0)
 2652                         panic("Grant table op failed (%d)", error);
 2653 
 2654                 xbb->flags &= ~XBBF_RING_CONNECTED;
 2655         }
 2656 
 2657         xbb_free_communication_mem(xbb);
 2658 
 2659         if (xbb->requests != NULL) {
 2660                 free(xbb->requests, M_XENBLOCKBACK);
 2661                 xbb->requests = NULL;
 2662         }
 2663 
 2664         if (xbb->request_lists != NULL) {
 2665                 struct xbb_xen_reqlist *reqlist;
 2666                 int i;
 2667 
 2668                 /* There is one request list for every allocated request. */
 2669                 for (i = 0, reqlist = xbb->request_lists;
 2670                      i < xbb->max_requests; i++, reqlist++){
 2671                         if (reqlist->gnt_handles != NULL) {
 2672                                 free(reqlist->gnt_handles, M_XENBLOCKBACK);
 2673                                 reqlist->gnt_handles = NULL;
 2674                         }
 2675                 }
 2676                 free(xbb->request_lists, M_XENBLOCKBACK);
 2677                 xbb->request_lists = NULL;
 2678         }
 2679 
 2680         return (0);
 2681 }
 2682 
 2683 /**
 2684  * Map shared memory ring into domain local address space, initialize
 2685  * ring control structures, and bind an interrupt to the event channel
 2686  * used to notify us of ring changes.
 2687  *
 2688  * \param xbb  Per-instance xbb configuration structure.
 2689  */
 2690 static int
 2691 xbb_connect_ring(struct xbb_softc *xbb)
 2692 {
 2693         struct gnttab_map_grant_ref  gnts[XBB_MAX_RING_PAGES];
 2694         struct gnttab_map_grant_ref *gnt;
 2695         u_int                        ring_idx;
 2696         int                          error;
 2697 
 2698         if ((xbb->flags & XBBF_RING_CONNECTED) != 0)
 2699                 return (0);
 2700 
 2701         /*
 2702          * Kva for our ring is at the tail of the region of kva allocated
 2703          * by xbb_alloc_communication_mem().
 2704          */
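              /*
               * In other words, the communication region is laid out as
               * [ I/O segment KVA | ring pages ], with the ring occupying
               * the final ring_config.ring_pages * PAGE_SIZE bytes.
               */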
 2705         xbb->ring_config.va = xbb->kva
 2706                             + (xbb->kva_size
 2707                              - (xbb->ring_config.ring_pages * PAGE_SIZE));
 2708         xbb->ring_config.gnt_addr = xbb->gnt_base_addr
 2709                                   + (xbb->kva_size
 2710                                    - (xbb->ring_config.ring_pages * PAGE_SIZE));
 2711 
 2712         for (ring_idx = 0, gnt = gnts;
 2713              ring_idx < xbb->ring_config.ring_pages;
 2714              ring_idx++, gnt++) {
 2715                 gnt->host_addr = xbb->ring_config.gnt_addr
 2716                                + (ring_idx * PAGE_SIZE);
 2717                 gnt->flags     = GNTMAP_host_map;
 2718                 gnt->ref       = xbb->ring_config.ring_ref[ring_idx];
 2719                 gnt->dom       = xbb->otherend_id;
 2720         }
 2721 
 2722         error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, gnts,
 2723                                           xbb->ring_config.ring_pages);
 2724         if (error)
 2725                 panic("blkback: Ring page grant table op failed (%d)", error);
 2726 
 2727         for (ring_idx = 0, gnt = gnts;
 2728              ring_idx < xbb->ring_config.ring_pages;
 2729              ring_idx++, gnt++) {
 2730                 if (gnt->status != 0) {
 2731                         struct gnttab_unmap_grant_ref unmap[XBB_MAX_RING_PAGES];
 2732                         unsigned int i, j;
 2733 
 2734                         xbb->ring_config.va = 0;
 2735                         xenbus_dev_fatal(xbb->dev, EACCES,
 2736                                          "Ring shared page mapping failed. "
 2737                                          "Status %d.", gnt->status);
 2738 
 2739                         /* Unmap everything to avoid leaking grant table maps */
 2740                         for (i = 0, j = 0; i < xbb->ring_config.ring_pages;
 2741                             i++) {
 2742                                 if (gnts[i].status != GNTST_okay)
 2743                                         continue;
 2744 
 2745                                 unmap[j].host_addr = gnts[i].host_addr;
 2746                                 unmap[j].dev_bus_addr = gnts[i].dev_bus_addr;
 2747                                 unmap[j++].handle = gnts[i].handle;
 2748                         }
 2749                         if (j != 0) {
 2750                                 error = HYPERVISOR_grant_table_op(
 2751                                     GNTTABOP_unmap_grant_ref, unmap, j);
 2752                                 if (error != 0)
 2753                                         panic("Unable to unmap grants (%d)",
 2754                                             error);
 2755                         }
 2756                         return (EACCES);
 2757                 }
 2758                 xbb->ring_config.handle[ring_idx]   = gnt->handle;
 2759                 xbb->ring_config.bus_addr[ring_idx] = gnt->dev_bus_addr;
 2760         }
 2761 
 2762         /* Initialize the ring based on ABI. */
 2763         switch (xbb->abi) {
 2764         case BLKIF_PROTOCOL_NATIVE:
 2765         {
 2766                 blkif_sring_t *sring;
 2767                 sring = (blkif_sring_t *)xbb->ring_config.va;
 2768                 BACK_RING_INIT(&xbb->rings.native, sring,
 2769                                xbb->ring_config.ring_pages * PAGE_SIZE);
 2770                 break;
 2771         }
 2772         case BLKIF_PROTOCOL_X86_32:
 2773         {
 2774                 blkif_x86_32_sring_t *sring_x86_32;
 2775                 sring_x86_32 = (blkif_x86_32_sring_t *)xbb->ring_config.va;
 2776                 BACK_RING_INIT(&xbb->rings.x86_32, sring_x86_32,
 2777                                xbb->ring_config.ring_pages * PAGE_SIZE);
 2778                 break;
 2779         }
 2780         case BLKIF_PROTOCOL_X86_64:
 2781         {
 2782                 blkif_x86_64_sring_t *sring_x86_64;
 2783                 sring_x86_64 = (blkif_x86_64_sring_t *)xbb->ring_config.va;
 2784                 BACK_RING_INIT(&xbb->rings.x86_64, sring_x86_64,
 2785                                xbb->ring_config.ring_pages * PAGE_SIZE);
 2786                 break;
 2787         }
 2788         default:
 2789                 panic("Unexpected blkif protocol ABI.");
 2790         }
 2791 
 2792         xbb->flags |= XBBF_RING_CONNECTED;
 2793 
 2794         error = xen_intr_bind_remote_port(xbb->dev,
 2795                                           xbb->otherend_id,
 2796                                           xbb->ring_config.evtchn,
 2797                                           xbb_filter,
 2798                                           /*ithread_handler*/NULL,
 2799                                           /*arg*/xbb,
 2800                                           INTR_TYPE_BIO | INTR_MPSAFE,
 2801                                           &xbb->xen_intr_handle);
 2802         if (error) {
 2803                 xenbus_dev_fatal(xbb->dev, error, "binding event channel");
 2804                 return (error);
 2805         }
 2806 
 2807         DPRINTF("rings connected!\n");
 2808 
 2809         return (0);
 2810 }
 2811 
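
/*
 * Illustrative sketch, not from blkback.c: once xbb_connect_ring() has
 * mapped the shared pages and run BACK_RING_INIT(), requests are consumed
 * with the standard Xen ring macros.  This is a minimal, synchronous
 * consumer for the native ABI only; the name xbb_ring_poll_sketch() and the
 * immediate completion of every request are assumptions for illustration.
 * The real driver validates and coalesces requests and completes them
 * asynchronously from its taskqueue.
 */
#if 0   /* illustrative only */
static void
xbb_ring_poll_sketch(struct xbb_softc *xbb)
{
        blkif_back_ring_t *ring = &xbb->rings.native;
        blkif_request_t   *req;
        blkif_response_t  *rsp;
        RING_IDX           rc, rp;
        int                notify;

        rc = ring->req_cons;
        rp = ring->sring->req_prod;
        rmb();          /* Ensure we see requests up to rp before reading them. */

        while (rc != rp) {
                req = RING_GET_REQUEST(ring, rc);
                ring->req_cons = ++rc;

                /*
                 * A real backend would map the request's segments and queue
                 * the I/O; completing it here only demonstrates the response
                 * half of the ring protocol.
                 */
                rsp = RING_GET_RESPONSE(ring, ring->rsp_prod_pvt++);
                rsp->id        = req->id;
                rsp->operation = req->operation;
                rsp->status    = BLKIF_RSP_OKAY;
        }

        RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(ring, notify);
        if (notify)
                xen_intr_signal(xbb->xen_intr_handle);
}
#endif
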
 2812 /**
 2813  * Size KVA and pseudo-physical address allocations based on negotiated
 2814  * values for the size and number of I/O requests, and the size of our
 2815  * communication ring.
 2816  *
 2817  * \param xbb  Per-instance xbb configuration structure.
 2818  *
 2819  * These address spaces are used to dynamically map pages in the
 2820  * front-end's domain into our own.
 2821  */
 2822 static int
 2823 xbb_alloc_communication_mem(struct xbb_softc *xbb)
 2824 {
 2825         xbb->reqlist_kva_pages = xbb->max_requests * xbb->max_request_segments;
 2826         xbb->reqlist_kva_size = xbb->reqlist_kva_pages * PAGE_SIZE;
 2827         xbb->kva_size = xbb->reqlist_kva_size +
 2828                         (xbb->ring_config.ring_pages * PAGE_SIZE);
 2829 
 2830         xbb->kva_free = bit_alloc(xbb->reqlist_kva_pages, M_XENBLOCKBACK, M_NOWAIT);
 2831         if (xbb->kva_free == NULL)
 2832                 return (ENOMEM);
 2833 
 2834         DPRINTF("%s: kva_size = %d, reqlist_kva_size = %d\n",
 2835                 device_get_nameunit(xbb->dev), xbb->kva_size,
 2836                 xbb->reqlist_kva_size);
 2837         /*
 2838          * Reserve a range of pseudo physical memory that we can map
 2839          * into kva.  These pages will only be backed by machine
 2840          * pages ("real memory") during the lifetime of front-end requests
 2841          * via grant table operations.
 2842          */
 2843         xbb->pseudo_phys_res_id = 0;
 2844         xbb->pseudo_phys_res = xenmem_alloc(xbb->dev, &xbb->pseudo_phys_res_id,
 2845             xbb->kva_size);
 2846         if (xbb->pseudo_phys_res == NULL) {
 2847                 xbb->kva = 0;
 2848                 return (ENOMEM);
 2849         }
 2850         xbb->kva = (vm_offset_t)rman_get_virtual(xbb->pseudo_phys_res);
 2851         xbb->gnt_base_addr = rman_get_start(xbb->pseudo_phys_res);
 2852 
 2853         DPRINTF("%s: kva: %#jx, gnt_base_addr: %#jx\n",
 2854                 device_get_nameunit(xbb->dev), (uintmax_t)xbb->kva,
 2855                 (uintmax_t)xbb->gnt_base_addr); 
 2856         return (0);
 2857 }
 2858 
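
/*
 * Illustrative sketch, not from blkback.c: the kva_free bitmap allocated
 * above tracks which pages of the request-list KVA range are in use.  A
 * minimal, hypothetical single-page reserve/release pair using bitstring(3)
 * (bit_ffc/bit_set/bit_clear) is shown below; the names ending in _sketch
 * are assumptions.  The real driver hands out contiguous multi-page runs
 * sized to a request list and keeps the shared ring at the tail of the
 * region, so this only illustrates the bitmap bookkeeping.
 */
#if 0   /* illustrative only */
static vm_offset_t
xbb_kva_page_reserve_sketch(struct xbb_softc *xbb)
{
        int pageidx;

        /* Find the first clear bit, i.e. a free page in the reqlist range. */
        bit_ffc(xbb->kva_free, xbb->reqlist_kva_pages, &pageidx);
        if (pageidx == -1)
                return (0);
        bit_set(xbb->kva_free, pageidx);
        return (xbb->kva + ((vm_offset_t)pageidx * PAGE_SIZE));
}

static void
xbb_kva_page_release_sketch(struct xbb_softc *xbb, vm_offset_t kva)
{
        bit_clear(xbb->kva_free, (int)((kva - xbb->kva) / PAGE_SIZE));
}
#endif
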
 2859 /**
 2860  * Collect front-end information from the XenStore.
 2861  *
 2862  * \param xbb  Per-instance xbb configuration structure.
 2863  */
 2864 static int
 2865 xbb_collect_frontend_info(struct xbb_softc *xbb)
 2866 {
 2867         char        protocol_abi[64];
 2868         const char *otherend_path;
 2869         int         error;
 2870         u_int       ring_idx;
 2871         u_int       ring_page_order;
 2872         size_t      ring_size;
 2873 
 2874         otherend_path = xenbus_get_otherend_path(xbb->dev);
 2875 
 2876         /*
 2877          * Protocol defaults valid even if all negotiation fails.
 2878          */
 2879         xbb->ring_config.ring_pages = 1;
 2880         xbb->max_request_segments   = BLKIF_MAX_SEGMENTS_PER_REQUEST;
 2881         xbb->max_request_size       = xbb->max_request_segments * PAGE_SIZE;
 2882 
 2883         /*
 2884          * Mandatory data (used in all versions of the protocol) first.
 2885          */
 2886         error = xs_scanf(XST_NIL, otherend_path,
 2887                          "event-channel", NULL, "%" PRIu32,
 2888                          &xbb->ring_config.evtchn);
 2889         if (error != 0) {
 2890                 xenbus_dev_fatal(xbb->dev, error,
 2891                                  "Unable to retrieve event-channel information "
 2892                                  "from frontend %s.  Unable to connect.",
 2893                                  xenbus_get_otherend_path(xbb->dev));
 2894                 return (error);
 2895         }
 2896 
 2897         /*
 2898          * These fields are initialized to legacy protocol defaults
 2899          * so we only need to fail if reading the updated value succeeds
 2900          * and the new value is outside of its allowed range.
 2901          *
 2902          * \note xs_gather() returns on the first encountered error, so
 2903          *       we must use independent calls in order to guarantee
 2904          *       we don't miss information in a sparsely populated front-end
 2905          *       tree.
 2906          *
 2907          * \note xs_scanf() does not update variables for unmatched
 2908          *       fields.
 2909          */
 2910         ring_page_order = 0;
 2911         xbb->max_requests = 32;
 2912 
 2913         (void)xs_scanf(XST_NIL, otherend_path,
 2914                        "ring-page-order", NULL, "%u",
 2915                        &ring_page_order);
 2916         xbb->ring_config.ring_pages = 1 << ring_page_order;
 2917         ring_size = PAGE_SIZE * xbb->ring_config.ring_pages;
 2918         xbb->max_requests = BLKIF_MAX_RING_REQUESTS(ring_size);
 2919 
 2920         if (xbb->ring_config.ring_pages > XBB_MAX_RING_PAGES) {
 2921                 xenbus_dev_fatal(xbb->dev, EINVAL,
 2922                                  "Front-end specified ring-pages of %u "
 2923                                  "exceeds backend limit of %u.  "
 2924                                  "Unable to connect.",
 2925                                  xbb->ring_config.ring_pages,
 2926                                  XBB_MAX_RING_PAGES);
 2927                 return (EINVAL);
 2928         }
 2929 
 2930         if (xbb->ring_config.ring_pages == 1) {
 2931                 error = xs_gather(XST_NIL, otherend_path,
 2932                                   "ring-ref", "%" PRIu32,
 2933                                   &xbb->ring_config.ring_ref[0],
 2934                                   NULL);
 2935                 if (error != 0) {
 2936                         xenbus_dev_fatal(xbb->dev, error,
 2937                                          "Unable to retrieve ring information "
 2938                                          "from frontend %s.  Unable to "
 2939                                          "connect.",
 2940                                          xenbus_get_otherend_path(xbb->dev));
 2941                         return (error);
 2942                 }
 2943         } else {
 2944                 /* Multi-page ring format. */
 2945                 for (ring_idx = 0; ring_idx < xbb->ring_config.ring_pages;
 2946                      ring_idx++) {
 2947                         char ring_ref_name[] = "ring_refXX";
 2948 
 2949                         snprintf(ring_ref_name, sizeof(ring_ref_name),
 2950                                  "ring-ref%u", ring_idx);
 2951                         error = xs_scanf(XST_NIL, otherend_path,
 2952                                          ring_ref_name, NULL, "%" PRIu32,
 2953                                          &xbb->ring_config.ring_ref[ring_idx]);
 2954                         if (error != 0) {
 2955                                 xenbus_dev_fatal(xbb->dev, error,
 2956                                                  "Failed to retrieve grant "
 2957                                                  "reference for page %u of "
 2958                                                  "shared ring.  Unable "
 2959                                                  "to connect.", ring_idx);
 2960                                 return (error);
 2961                         }
 2962                 }
 2963         }
 2964 
 2965         error = xs_gather(XST_NIL, otherend_path,
 2966                           "protocol", "%63s", protocol_abi,
 2967                           NULL); 
 2968         if (error != 0
 2969          || !strcmp(protocol_abi, XEN_IO_PROTO_ABI_NATIVE)) {
 2970                 /*
 2971                  * Assume native if the frontend has not
 2972                  * published ABI data or it has published and
 2973                  * matches our own ABI.
 2974                  */
 2975                 xbb->abi = BLKIF_PROTOCOL_NATIVE;
 2976         } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_32)) {
 2977                 xbb->abi = BLKIF_PROTOCOL_X86_32;
 2978         } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_64)) {
 2979                 xbb->abi = BLKIF_PROTOCOL_X86_64;
 2980         } else {
 2981                 xenbus_dev_fatal(xbb->dev, EINVAL,
 2982                                  "Unknown protocol ABI (%s) published by "
 2983                                  "frontend.  Unable to connect.", protocol_abi);
 2984                 return (EINVAL);
 2985         }
 2986         return (0);
 2987 }
 2988 
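
/*
 * Illustrative sketch, not from blkback.c: the negotiation above reduces to
 * a little arithmetic.  The front-end publishes "ring-page-order"; the
 * back-end derives the page count, the byte size of the shared ring, and
 * the number of request slots it holds.  For the native ABI on 4 KiB pages
 * this works out to roughly 32 slots per ring page (order 0 gives 32,
 * order 1 gives 64, and so on), since the ring size is rounded down to a
 * power of two.  The function name below is an assumption.
 */
#if 0   /* illustrative only */
static void
xbb_ring_geometry_sketch(u_int ring_page_order)
{
        u_int  ring_pages   = 1u << ring_page_order;
        size_t ring_size    = (size_t)ring_pages * PAGE_SIZE;
        u_int  max_requests = BLKIF_MAX_RING_REQUESTS(ring_size);

        printf("order %u: %u page(s), %zu bytes, %u request slots\n",
               ring_page_order, ring_pages, ring_size, max_requests);
}
#endif
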
 2989 /**
 2990  * Allocate per-request data structures given request size and number
 2991  * information negotiated with the front-end.
 2992  *
 2993  * \param xbb  Per-instance xbb configuration structure.
 2994  */
 2995 static int
 2996 xbb_alloc_requests(struct xbb_softc *xbb)
 2997 {
 2998         struct xbb_xen_req *req;
 2999         struct xbb_xen_req *last_req;
 3000 
 3001         /*
 3002          * Allocate request bookkeeping data structures.
 3003          */
 3004         xbb->requests = malloc(xbb->max_requests * sizeof(*xbb->requests),
 3005                                M_XENBLOCKBACK, M_NOWAIT|M_ZERO);
 3006         if (xbb->requests == NULL) {
 3007                 xenbus_dev_fatal(xbb->dev, ENOMEM, 
 3008                                   "Unable to allocate request structures");
 3009                 return (ENOMEM);
 3010         }
 3011 
 3012         req      = xbb->requests;
 3013         last_req = &xbb->requests[xbb->max_requests - 1];
 3014         STAILQ_INIT(&xbb->request_free_stailq);
 3015         while (req <= last_req) {
 3016                 STAILQ_INSERT_TAIL(&xbb->request_free_stailq, req, links);
 3017                 req++;
 3018         }
 3019         return (0);
 3020 }
 3021 
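
/*
 * Illustrative sketch, not from blkback.c: with the requests array threaded
 * onto request_free_stailq above, run-time allocation is a constant-time
 * free-list pop and release is a push.  The helpers below are hypothetical
 * (the _sketch names are assumptions); the driver's real equivalents also
 * maintain counters such as active_request_count and request_shortages.
 */
#if 0   /* illustrative only */
static struct xbb_xen_req *
xbb_get_req_sketch(struct xbb_softc *xbb)
{
        struct xbb_xen_req *req;

        mtx_lock(&xbb->lock);
        req = STAILQ_FIRST(&xbb->request_free_stailq);
        if (req != NULL)
                STAILQ_REMOVE_HEAD(&xbb->request_free_stailq, links);
        mtx_unlock(&xbb->lock);
        return (req);
}

static void
xbb_release_req_sketch(struct xbb_softc *xbb, struct xbb_xen_req *req)
{
        mtx_lock(&xbb->lock);
        STAILQ_INSERT_TAIL(&xbb->request_free_stailq, req, links);
        mtx_unlock(&xbb->lock);
}
#endif
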
 3022 static int
 3023 xbb_alloc_request_lists(struct xbb_softc *xbb)
 3024 {
 3025         struct xbb_xen_reqlist *reqlist;
 3026         int                     i;
 3027 
 3028         /*
 3029          * If no requests can be merged, we need 1 request list per
 3030          * in flight request.
 3031          */
 3032         xbb->request_lists = malloc(xbb->max_requests *
 3033                 sizeof(*xbb->request_lists), M_XENBLOCKBACK, M_NOWAIT|M_ZERO);
 3034         if (xbb->request_lists == NULL) {
 3035                 xenbus_dev_fatal(xbb->dev, ENOMEM, 
 3036                                   "Unable to allocate request list structures");
 3037                 return (ENOMEM);
 3038         }
 3039 
 3040         STAILQ_INIT(&xbb->reqlist_free_stailq);
 3041         STAILQ_INIT(&xbb->reqlist_pending_stailq);
 3042         for (i = 0; i < xbb->max_requests; i++) {
 3043                 int seg;
 3044 
 3045                 reqlist      = &xbb->request_lists[i];
 3046 
 3047                 reqlist->xbb = xbb;
 3048 
 3049                 reqlist->gnt_handles = malloc(xbb->max_reqlist_segments *
 3050                                               sizeof(*reqlist->gnt_handles),
 3051                                               M_XENBLOCKBACK, M_NOWAIT|M_ZERO);
 3052                 if (reqlist->gnt_handles == NULL) {
 3053                         xenbus_dev_fatal(xbb->dev, ENOMEM,
 3054                                           "Unable to allocate request "
 3055                                           "grant references");
 3056                         return (ENOMEM);
 3057                 }
 3058 
 3059                 for (seg = 0; seg < xbb->max_reqlist_segments; seg++)
 3060                         reqlist->gnt_handles[seg] = GRANT_REF_INVALID;
 3061 
 3062                 STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links);
 3063         }
 3064         return (0);
 3065 }
 3066 
 3067 /**
 3068  * Supply information about the physical device to the frontend
 3069  * via XenBus.
 3070  *
 3071  * \param xbb  Per-instance xbb configuration structure.
 3072  */
 3073 static int
 3074 xbb_publish_backend_info(struct xbb_softc *xbb)
 3075 {
 3076         struct xs_transaction xst;
 3077         const char           *our_path;
 3078         const char           *leaf;
 3079         int                   error;
 3080 
 3081         our_path = xenbus_get_node(xbb->dev);
 3082         while (1) {
 3083                 error = xs_transaction_start(&xst);
 3084                 if (error != 0) {
 3085                         xenbus_dev_fatal(xbb->dev, error,
 3086                                          "Error publishing backend info "
 3087                                          "(start transaction)");
 3088                         return (error);
 3089                 }
 3090 
 3091                 leaf = "sectors";
 3092                 error = xs_printf(xst, our_path, leaf,
 3093                                   "%"PRIu64, xbb->media_num_sectors);
 3094                 if (error != 0)
 3095                         break;
 3096 
 3097                 /* XXX Support all VBD attributes here. */
 3098                 leaf = "info";
 3099                 error = xs_printf(xst, our_path, leaf, "%u",
 3100                                   xbb->flags & XBBF_READ_ONLY
 3101                                 ? VDISK_READONLY : 0);
 3102                 if (error != 0)
 3103                         break;
 3104 
 3105                 leaf = "sector-size";
 3106                 error = xs_printf(xst, our_path, leaf, "%u",
 3107                                   xbb->sector_size);
 3108                 if (error != 0)
 3109                         break;
 3110 
 3111                 error = xs_transaction_end(xst, 0);
 3112                 if (error == 0) {
 3113                         return (0);
 3114                 } else if (error != EAGAIN) {
 3115                         xenbus_dev_fatal(xbb->dev, error, "ending transaction");
 3116                         return (error);
 3117                 }
 3118         }
 3119 
 3120         xenbus_dev_fatal(xbb->dev, error, "writing %s/%s",
 3121                         our_path, leaf);
 3122         xs_transaction_end(xst, 1);
 3123         return (error);
 3124 }
 3125 
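
/*
 * Illustrative sketch, not from blkback.c: the loop above is the standard
 * XenStore transaction idiom.  Start a transaction, write the leaves, and
 * retry the whole batch when the commit fails with EAGAIN because another
 * writer raced us.  A stripped-down, hypothetical helper for a single
 * key/value pair (the name is an assumption) looks like this.
 */
#if 0   /* illustrative only */
static int
xs_printf_txn_sketch(const char *path, const char *leaf, const char *value)
{
        struct xs_transaction xst;
        int error;

        do {
                error = xs_transaction_start(&xst);
                if (error != 0)
                        return (error);
                error = xs_printf(xst, path, leaf, "%s", value);
                if (error != 0) {
                        xs_transaction_end(xst, /*abort*/1);
                        return (error);
                }
                error = xs_transaction_end(xst, /*abort*/0);
        } while (error == EAGAIN);

        return (error);
}
#endif
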
 3126 /**
 3127  * Connect to our blkfront peer now that it has completed publishing
 3128  * its configuration into the XenStore.
 3129  *
 3130  * \param xbb  Per-instance xbb configuration structure.
 3131  */
 3132 static void
 3133 xbb_connect(struct xbb_softc *xbb)
 3134 {
 3135         int error;
 3136 
 3137         if (!xbb->hotplug_done ||
 3138             (xenbus_get_state(xbb->dev) != XenbusStateInitWait) ||
 3139             (xbb_collect_frontend_info(xbb) != 0))
 3140                 return;
 3141 
 3142         xbb->flags &= ~XBBF_SHUTDOWN;
 3143 
 3144         /*
 3145          * We limit the maximum number of reqlist segments to the maximum
 3146          * number of segments in the ring, or our absolute maximum,
 3147          * whichever is smaller.
 3148          */
 3149         xbb->max_reqlist_segments = MIN(xbb->max_request_segments *
 3150                 xbb->max_requests, XBB_MAX_SEGMENTS_PER_REQLIST);
 3151 
 3152         /*
 3153          * The maximum size is simply a function of the number of segments
 3154          * we can handle.
 3155          */
 3156         xbb->max_reqlist_size = xbb->max_reqlist_segments * PAGE_SIZE;
 3157 
 3158         /* Allocate resources whose size depends on front-end configuration. */
 3159         error = xbb_alloc_communication_mem(xbb);
 3160         if (error != 0) {
 3161                 xenbus_dev_fatal(xbb->dev, error,
 3162                                  "Unable to allocate communication memory");
 3163                 return;
 3164         }
 3165 
 3166         error = xbb_publish_backend_info(xbb);
 3167         if (error != 0) {
 3168                 xenbus_dev_fatal(xbb->dev, error,
 3169                     "Unable to publish device information");
 3170                 return;
 3171         }
 3172 
 3173         error = xbb_alloc_requests(xbb);
 3174         if (error != 0) {
 3175                 /* Specific errors are reported by xbb_alloc_requests(). */
 3176                 return;
 3177         }
 3178 
 3179         error = xbb_alloc_request_lists(xbb);
 3180         if (error != 0) {
 3181                 /* Specific errors are reported by xbb_alloc_request_lists(). */
 3182                 return;
 3183         }
 3184 
 3185         /*
 3186          * Connect communication channel.
 3187          */
 3188         error = xbb_connect_ring(xbb);
 3189         if (error != 0) {
 3190                 /* Specific errors are reported by xbb_connect_ring(). */
 3191                 return;
 3192         }
 3193 
 3194         /* Ready for I/O. */
 3195         xenbus_set_state(xbb->dev, XenbusStateConnected);
 3196 }
 3197 
 3198 /*-------------------------- Device Teardown Support -------------------------*/
 3199 /**
 3200  * Perform device shutdown functions.
 3201  *
 3202  * \param xbb  Per-instance xbb configuration structure.
 3203  *
 3204  * Mark this instance as shutting down, wait for any active I/O on the
 3205  * backend device/file to drain, disconnect from the front-end, and notify
 3206  * any waiters (e.g. a thread invoking our detach method) that detach can
 3207  * now proceed.
 3208  */
 3209 static int
 3210 xbb_shutdown(struct xbb_softc *xbb)
 3211 {
 3212         XenbusState frontState;
 3213         int         error;
 3214 
 3215         DPRINTF("\n");
 3216 
 3217         /*
 3218          * Due to the need to drop our mutex during some
 3219          * xenbus operations, it is possible for two threads
 3220          * to attempt to close out shutdown processing at
 3221          * the same time.  Tell the caller that hits this
 3222          * race to try back later. 
 3223          */
 3224         if ((xbb->flags & XBBF_IN_SHUTDOWN) != 0)
 3225                 return (EAGAIN);
 3226 
 3227         xbb->flags |= XBBF_IN_SHUTDOWN;
 3228         mtx_unlock(&xbb->lock);
 3229 
 3230         if (xbb->hotplug_watch.node != NULL) {
 3231                 xs_unregister_watch(&xbb->hotplug_watch);
 3232                 free(xbb->hotplug_watch.node, M_XENBLOCKBACK);
 3233                 xbb->hotplug_watch.node = NULL;
 3234         }
 3235 
 3236         if (xenbus_get_state(xbb->dev) < XenbusStateClosing)
 3237                 xenbus_set_state(xbb->dev, XenbusStateClosing);
 3238 
 3239         frontState = xenbus_get_otherend_state(xbb->dev);
 3240         mtx_lock(&xbb->lock);
 3241         xbb->flags &= ~XBBF_IN_SHUTDOWN;
 3242 
 3243         /* Wait for the frontend to disconnect (if it's connected). */
 3244         if (frontState == XenbusStateConnected)
 3245                 return (EAGAIN);
 3246 
 3247         DPRINTF("\n");
 3248 
 3249         /* Indicate shutdown is in progress. */
 3250         xbb->flags |= XBBF_SHUTDOWN;
 3251 
 3252         /* Disconnect from the front-end. */
 3253         error = xbb_disconnect(xbb);
 3254         if (error != 0) {
 3255                 /*
 3256                  * Requests still outstanding.  We'll be called again
 3257                  * once they complete.
 3258                  */
 3259                 KASSERT(error == EAGAIN,
 3260                         ("%s: Unexpected xbb_disconnect() failure %d",
 3261                          __func__, error));
 3262 
 3263                 return (error);
 3264         }
 3265 
 3266         DPRINTF("\n");
 3267 
 3268         /* Indicate to xbb_detach() that it is safe to proceed. */
 3269         wakeup(xbb);
 3270 
 3271         return (0);
 3272 }
 3273 
 3274 /**
 3275  * Report an attach time error to the console and Xen, and cleanup
 3276  * this instance by forcing immediate detach processing.
 3277  *
 3278  * \param xbb  Per-instance xbb configuration structure.
 3279  * \param err  Errno describing the error.
 3280  * \param fmt  Printf style format and arguments
 3281  */
 3282 static void
 3283 xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt, ...)
 3284 {
 3285         va_list ap;
 3286         va_list ap_hotplug;
 3287 
 3288         va_start(ap, fmt);
 3289         va_copy(ap_hotplug, ap);
 3290         xs_vprintf(XST_NIL, xenbus_get_node(xbb->dev),
 3291                   "hotplug-error", fmt, ap_hotplug);
 3292         va_end(ap_hotplug);
 3293         xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
 3294                   "hotplug-status", "error");
 3295 
 3296         xenbus_dev_vfatal(xbb->dev, err, fmt, ap);
 3297         va_end(ap);
 3298 
 3299         xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
 3300                   "online", "");
 3301         mtx_lock(&xbb->lock);
 3302         xbb_shutdown(xbb);
 3303         mtx_unlock(&xbb->lock);
 3304 }
 3305 
 3306 /*---------------------------- NewBus Entrypoints ----------------------------*/
 3307 /**
 3308  * Inspect a XenBus device and claim it if it is of the appropriate type.
 3309  * 
 3310  * \param dev  NewBus device object representing a candidate XenBus device.
 3311  *
 3312  * \return  0 for success, errno codes for failure.
 3313  */
 3314 static int
 3315 xbb_probe(device_t dev)
 3316 {
 3317 
 3318         if (strcmp(xenbus_get_type(dev), "vbd"))
 3319                 return (ENXIO);
 3320 
 3321         /* Only attach if Xen creates IOMMU entries for grant mapped pages. */
 3322         if (!xen_has_iommu_maps()) {
 3323                 static bool warned;
 3324 
 3325                 if (!warned) {
 3326                         warned = true;
 3327                         printf(
 3328         "xen-blkback disabled due to grant maps lacking IOMMU entries\n");
 3329                 }
 3330                 return (ENXIO);
 3331         }
 3332 
 3333         device_set_desc(dev, "Backend Virtual Block Device");
 3334         device_quiet(dev);
 3335         return (0);
 3336 }
 3337 
 3338 /**
 3339  * Setup sysctl variables to control various Block Back parameters.
 3340  *
 3341  * \param xbb  Xen Block Back softc.
 3342  *
 3343  */
 3344 static void
 3345 xbb_setup_sysctl(struct xbb_softc *xbb)
 3346 {
 3347         struct sysctl_ctx_list *sysctl_ctx = NULL;
 3348         struct sysctl_oid      *sysctl_tree = NULL;
 3349 
 3350         sysctl_ctx = device_get_sysctl_ctx(xbb->dev);
 3351         if (sysctl_ctx == NULL)
 3352                 return;
 3353 
 3354         sysctl_tree = device_get_sysctl_tree(xbb->dev);
 3355         if (sysctl_tree == NULL)
 3356                 return;
 3357 
 3358         SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
 3359                        "disable_flush", CTLFLAG_RW, &xbb->disable_flush, 0,
 3360                        "fake the flush command");
 3361 
 3362         SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
 3363                        "flush_interval", CTLFLAG_RW, &xbb->flush_interval, 0,
 3364                        "send a real flush for N flush requests");
 3365 
 3366         SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
 3367                        "no_coalesce_reqs", CTLFLAG_RW, &xbb->no_coalesce_reqs, 0,
 3368                        "Don't coalesce contiguous requests");
 3369 
 3370         SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
 3371                          "reqs_received", CTLFLAG_RW, &xbb->reqs_received,
 3372                          "how many I/O requests we have received");
 3373 
 3374         SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
 3375                          "reqs_completed", CTLFLAG_RW, &xbb->reqs_completed,
 3376                          "how many I/O requests have been completed");
 3377 
 3378         SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
 3379                          "reqs_queued_for_completion", CTLFLAG_RW,
 3380                          &xbb->reqs_queued_for_completion,
 3381                          "how many I/O requests queued but not yet pushed");
 3382 
 3383         SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
 3384                          "reqs_completed_with_error", CTLFLAG_RW,
 3385                          &xbb->reqs_completed_with_error,
 3386                          "how many I/O requests completed with error status");
 3387 
 3388         SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
 3389                          "forced_dispatch", CTLFLAG_RW, &xbb->forced_dispatch,
 3390                          "how many I/O dispatches were forced");
 3391 
 3392         SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
 3393                          "normal_dispatch", CTLFLAG_RW, &xbb->normal_dispatch,
 3394                          "how many I/O dispatches were normal");
 3395 
 3396         SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
 3397                          "total_dispatch", CTLFLAG_RW, &xbb->total_dispatch,
 3398                          "total number of I/O dispatches");
 3399 
 3400         SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
 3401                          "kva_shortages", CTLFLAG_RW, &xbb->kva_shortages,
 3402                          "how many times we have run out of KVA");
 3403 
 3404         SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
 3405                          "request_shortages", CTLFLAG_RW,
 3406                          &xbb->request_shortages,
 3407                          "how many times we have run out of requests");
 3408 
 3409         SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
 3410                         "max_requests", CTLFLAG_RD, &xbb->max_requests, 0,
 3411                         "maximum outstanding requests (negotiated)");
 3412 
 3413         SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
 3414                         "max_request_segments", CTLFLAG_RD,
 3415                         &xbb->max_request_segments, 0,
 3416                         "maximum number of pages per request (negotiated)");
 3417 
 3418         SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
 3419                         "max_request_size", CTLFLAG_RD,
 3420                         &xbb->max_request_size, 0,
 3421                         "maximum size in bytes of a request (negotiated)");
 3422 
 3423         SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
 3424                         "ring_pages", CTLFLAG_RD,
 3425                         &xbb->ring_config.ring_pages, 0,
 3426                         "communication channel pages (negotiated)");
 3427 }
 3428 
 3429 static void
 3430 xbb_attach_disk(device_t dev)
 3431 {
 3432         struct xbb_softc        *xbb;
 3433         int                      error;
 3434 
 3435         xbb = device_get_softc(dev);
 3436 
 3437         KASSERT(xbb->hotplug_done, ("Missing hotplug execution"));
 3438 
 3439         /* Parse fopen style mode flags. */
 3440         if (strchr(xbb->dev_mode, 'w') == NULL)
 3441                 xbb->flags |= XBBF_READ_ONLY;
 3442 
 3443         /*
 3444          * Verify the physical device is present and can support
 3445          * the desired I/O mode.
 3446          */
 3447         error = xbb_open_backend(xbb);
 3448         if (error != 0) {
 3449                 xbb_attach_failed(xbb, error, "Unable to open %s",
 3450                                   xbb->dev_name);
 3451                 return;
 3452         }
 3453 
 3454         /* Use devstat(9) for recording statistics. */
 3455         xbb->xbb_stats = devstat_new_entry("xbb", device_get_unit(xbb->dev),
 3456                                            xbb->sector_size,
 3457                                            DEVSTAT_ALL_SUPPORTED,
 3458                                            DEVSTAT_TYPE_DIRECT
 3459                                          | DEVSTAT_TYPE_IF_OTHER,
 3460                                            DEVSTAT_PRIORITY_OTHER);
 3461 
 3462         xbb->xbb_stats_in = devstat_new_entry("xbbi", device_get_unit(xbb->dev),
 3463                                               xbb->sector_size,
 3464                                               DEVSTAT_ALL_SUPPORTED,
 3465                                               DEVSTAT_TYPE_DIRECT
 3466                                             | DEVSTAT_TYPE_IF_OTHER,
 3467                                               DEVSTAT_PRIORITY_OTHER);
 3468         /*
 3469          * Setup sysctl variables.
 3470          */
 3471         xbb_setup_sysctl(xbb);
 3472 
 3473         /*
 3474          * Create a taskqueue for doing work that must occur from a
 3475          * thread context.
 3476          */
 3477         xbb->io_taskqueue = taskqueue_create_fast(device_get_nameunit(dev),
 3478                                                   M_NOWAIT,
 3479                                                   taskqueue_thread_enqueue,
 3480                                                   /*context*/&xbb->io_taskqueue);
 3481         if (xbb->io_taskqueue == NULL) {
 3482                 xbb_attach_failed(xbb, ENOMEM, "Unable to create taskqueue");
 3483                 return;
 3484         }
 3485 
 3486         taskqueue_start_threads(&xbb->io_taskqueue,
 3487                                 /*num threads*/1,
 3488                                 /*priority*/PWAIT,
 3489                                 /*thread name*/
 3490                                 "%s taskq", device_get_nameunit(dev));
 3491 
 3492         /* Update hot-plug status to satisfy xend. */
 3493         error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
 3494                           "hotplug-status", "connected");
 3495         if (error) {
 3496                 xbb_attach_failed(xbb, error, "writing %s/hotplug-status",
 3497                                   xenbus_get_node(xbb->dev));
 3498                 return;
 3499         }
 3500 
 3501         /* The front end might be waiting for the backend; connect if so. */
 3502         if (xenbus_get_otherend_state(xbb->dev) == XenbusStateInitialised)
 3503                 xbb_connect(xbb);
 3504 }
 3505 
 3506 static void
 3507 xbb_attach_cb(struct xs_watch *watch, const char **vec, unsigned int len)
 3508 {
 3509         device_t dev;
 3510         struct xbb_softc *xbb;
 3511         int error;
 3512 
 3513         dev = (device_t)watch->callback_data;
 3514         xbb = device_get_softc(dev);
 3515 
 3516         error = xs_gather(XST_NIL, xenbus_get_node(dev), "physical-device-path",
 3517             NULL, &xbb->dev_name, NULL);
 3518         if (error != 0)
 3519                 return;
 3520 
 3521         xs_unregister_watch(watch);
 3522         free(watch->node, M_XENBLOCKBACK);
 3523         watch->node = NULL;
 3524         xbb->hotplug_done = true;
 3525 
 3526         /* Collect physical device information. */
 3527         error = xs_gather(XST_NIL, xenbus_get_otherend_path(dev), "device-type",
 3528             NULL, &xbb->dev_type, NULL);
 3529         if (error != 0)
 3530                 xbb->dev_type = NULL;
 3531 
 3532         error = xs_gather(XST_NIL, xenbus_get_node(dev), "mode", NULL,
 3533            &xbb->dev_mode, NULL);
 3534         if (error != 0) {
 3535                 xbb_attach_failed(xbb, error, "reading backend fields at %s",
 3536                     xenbus_get_node(dev));
 3537                 return;
 3538         }
 3539 
 3540         xbb_attach_disk(dev);
 3541 }
 3542 
 3543 /**
 3544  * Attach to a XenBus device that has been claimed by our probe routine.
 3545  *
 3546  * \param dev  NewBus device object representing this Xen Block Back instance.
 3547  *
 3548  * \return  0 for success, errno codes for failure.
 3549  */
 3550 static int
 3551 xbb_attach(device_t dev)
 3552 {
 3553         struct xbb_softc        *xbb;
 3554         int                      error;
 3555         u_int                    max_ring_page_order;
 3556         struct sbuf             *watch_path;
 3557 
 3558         DPRINTF("Attaching to %s\n", xenbus_get_node(dev));
 3559 
 3560         /*
 3561          * Basic initialization.
 3562          * After this block it is safe to call xbb_detach()
 3563          * to clean up any allocated data for this instance.
 3564          */
 3565         xbb = device_get_softc(dev);
 3566         xbb->dev = dev;
 3567         xbb->otherend_id = xenbus_get_otherend_id(dev);
 3568         TASK_INIT(&xbb->io_task, /*priority*/0, xbb_run_queue, xbb);
 3569         mtx_init(&xbb->lock, device_get_nameunit(dev), NULL, MTX_DEF);
 3570 
 3571         /*
 3572          * Publish protocol capabilities for consumption by the
 3573          * front-end.
 3574          */
 3575         error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
 3576                           "feature-barrier", "1");
 3577         if (error) {
 3578                 xbb_attach_failed(xbb, error, "writing %s/feature-barrier",
 3579                                   xenbus_get_node(xbb->dev));
 3580                 return (error);
 3581         }
 3582 
 3583         error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
 3584                           "feature-flush-cache", "1");
 3585         if (error) {
 3586                 xbb_attach_failed(xbb, error, "writing %s/feature-flush-cache",
 3587                                   xenbus_get_node(xbb->dev));
 3588                 return (error);
 3589         }
 3590 
 3591         max_ring_page_order = flsl(XBB_MAX_RING_PAGES) - 1;
 3592         error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
 3593                           "max-ring-page-order", "%u", max_ring_page_order);
 3594         if (error) {
 3595                 xbb_attach_failed(xbb, error, "writing %s/max-ring-page-order",
 3596                                   xenbus_get_node(xbb->dev));
 3597                 return (error);
 3598         }
 3599 
 3600         /* Tell the toolstack blkback has attached. */
 3601         xenbus_set_state(dev, XenbusStateInitWait);
 3602 
 3603         if (xbb->hotplug_done) {
 3604                 xbb_attach_disk(dev);
 3605                 return (0);
 3606         }
 3607 
 3608         /*
 3609          * We need to wait for hotplug script execution before
 3610          * moving forward.
 3611          */
 3612         watch_path = xs_join(xenbus_get_node(xbb->dev), "physical-device-path");
 3613         xbb->hotplug_watch.callback_data = (uintptr_t)dev;
 3614         xbb->hotplug_watch.callback = xbb_attach_cb;
 3615         KASSERT(xbb->hotplug_watch.node == NULL, ("watch node already setup"));
 3616         xbb->hotplug_watch.node = strdup(sbuf_data(watch_path), M_XENBLOCKBACK);
 3617         /*
 3618          * We don't care which path was updated, only that the value of this
 3619          * single node has changed, so there's no need to queue more than one
 3620          * event.
 3621          */
 3622         xbb->hotplug_watch.max_pending = 1;
 3623         sbuf_delete(watch_path);
 3624         error = xs_register_watch(&xbb->hotplug_watch);
 3625         if (error != 0) {
 3626                 xbb_attach_failed(xbb, error, "failed to create watch on %s",
 3627                     xbb->hotplug_watch.node);
 3628                 free(xbb->hotplug_watch.node, M_XENBLOCKBACK);
 3629                 return (error);
 3630         }
 3631 
 3632         return (0);
 3633 }
 3634 
 3635 /**
 3636  * Detach from a block back device instance.
 3637  *
 3638  * \param dev  NewBus device object representing this Xen Block Back instance.
 3639  *
 3640  * \return  0 for success, errno codes for failure.
 3641  * 
 3642  * \note A block back device may be detached at any time in its life-cycle,
 3643  *       including part way through the attach process.  For this reason,
 3644  *       initialization order and the initialization state checks in this
 3645  *       routine must be carefully coupled so that attach time failures
 3646  *       are gracefully handled.
 3647  */
 3648 static int
 3649 xbb_detach(device_t dev)
 3650 {
 3651         struct xbb_softc *xbb;
 3652 
 3653         DPRINTF("\n");
 3654 
 3655         xbb = device_get_softc(dev);
 3656         mtx_lock(&xbb->lock);
 3657         while (xbb_shutdown(xbb) == EAGAIN) {
 3658                 msleep(xbb, &xbb->lock, /*wakeup prio unchanged*/0,
 3659                        "xbb_shutdown", 0);
 3660         }
 3661         mtx_unlock(&xbb->lock);
 3662 
 3663         DPRINTF("\n");
 3664 
 3665         if (xbb->io_taskqueue != NULL)
 3666                 taskqueue_free(xbb->io_taskqueue);
 3667 
 3668         if (xbb->xbb_stats != NULL)
 3669                 devstat_remove_entry(xbb->xbb_stats);
 3670 
 3671         if (xbb->xbb_stats_in != NULL)
 3672                 devstat_remove_entry(xbb->xbb_stats_in);
 3673 
 3674         xbb_close_backend(xbb);
 3675 
 3676         if (xbb->dev_mode != NULL) {
 3677                 free(xbb->dev_mode, M_XENSTORE);
 3678                 xbb->dev_mode = NULL;
 3679         }
 3680 
 3681         if (xbb->dev_type != NULL) {
 3682                 free(xbb->dev_type, M_XENSTORE);
 3683                 xbb->dev_type = NULL;
 3684         }
 3685 
 3686         if (xbb->dev_name != NULL) {
 3687                 free(xbb->dev_name, M_XENSTORE);
 3688                 xbb->dev_name = NULL;
 3689         }
 3690 
 3691         mtx_destroy(&xbb->lock);
 3692         return (0);
 3693 }
 3694 
 3695 /**
 3696  * Prepare this block back device for suspension of this VM.
 3697  * 
 3698  * \param dev  NewBus device object representing this Xen Block Back instance.
 3699  *
 3700  * \return  0 for success, errno codes for failure.
 3701  */
 3702 static int
 3703 xbb_suspend(device_t dev)
 3704 {
 3705 #ifdef NOT_YET
 3706         struct xbb_softc *sc = device_get_softc(dev);
 3707 
 3708         /* Prevent new requests being issued until we fix things up. */
 3709         mtx_lock(&sc->xb_io_lock);
 3710         sc->connected = BLKIF_STATE_SUSPENDED;
 3711         mtx_unlock(&sc->xb_io_lock);
 3712 #endif
 3713 
 3714         return (0);
 3715 }
 3716 
 3717 /**
 3718  * Perform any processing required to recover from a suspended state.
 3719  * 
 3720  * \param dev  NewBus device object representing this Xen Block Back instance.
 3721  *
 3722  * \return  0 for success, errno codes for failure.
 3723  */
 3724 static int
 3725 xbb_resume(device_t dev)
 3726 {
 3727         return (0);
 3728 }
 3729 
 3730 /**
 3731  * Handle state changes expressed via the XenStore by our front-end peer.
 3732  *
 3733  * \param dev             NewBus device object representing this Xen
 3734  *                        Block Back instance.
 3735  * \param frontend_state  The new state of the front-end.
 3738  */
 3739 static void
 3740 xbb_frontend_changed(device_t dev, XenbusState frontend_state)
 3741 {
 3742         struct xbb_softc *xbb = device_get_softc(dev);
 3743 
 3744         DPRINTF("frontend_state=%s, xbb_state=%s\n",
 3745                 xenbus_strstate(frontend_state),
 3746                 xenbus_strstate(xenbus_get_state(xbb->dev)));
 3747 
 3748         switch (frontend_state) {
 3749         case XenbusStateInitialising:
 3750                 break;
 3751         case XenbusStateInitialised:
 3752         case XenbusStateConnected:
 3753                 xbb_connect(xbb);
 3754                 break;
 3755         case XenbusStateClosing:
 3756         case XenbusStateClosed:
 3757                 mtx_lock(&xbb->lock);
 3758                 xbb_shutdown(xbb);
 3759                 mtx_unlock(&xbb->lock);
 3760                 if (frontend_state == XenbusStateClosed)
 3761                         xenbus_set_state(xbb->dev, XenbusStateClosed);
 3762                 break;
 3763         default:
 3764                 xenbus_dev_fatal(xbb->dev, EINVAL, "saw state %d at frontend",
 3765                                  frontend_state);
 3766                 break;
 3767         }
 3768 }
 3769 
 3770 /*---------------------------- NewBus Registration ---------------------------*/
 3771 static device_method_t xbb_methods[] = {
 3772         /* Device interface */
 3773         DEVMETHOD(device_probe,         xbb_probe),
 3774         DEVMETHOD(device_attach,        xbb_attach),
 3775         DEVMETHOD(device_detach,        xbb_detach),
 3776         DEVMETHOD(device_shutdown,      bus_generic_shutdown),
 3777         DEVMETHOD(device_suspend,       xbb_suspend),
 3778         DEVMETHOD(device_resume,        xbb_resume),
 3779 
 3780         /* Xenbus interface */
 3781         DEVMETHOD(xenbus_otherend_changed, xbb_frontend_changed),
 3782         { 0, 0 }
 3783 };
 3784 
 3785 static driver_t xbb_driver = {
 3786         "xbbd",
 3787         xbb_methods,
 3788         sizeof(struct xbb_softc),
 3789 };
 3790 
 3791 DRIVER_MODULE(xbbd, xenbusb_back, xbb_driver, 0, 0);
