1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 *
4 * Copyright (C) 2012-2014 Intel Corporation
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31
32 #include <sys/param.h>
33 #include <sys/bus.h>
34 #include <sys/conf.h>
35 #include <sys/domainset.h>
36 #include <sys/proc.h>
37
38 #include <dev/pci/pcivar.h>
39
40 #include "nvme_private.h"
41
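/*
 * error_print_t selects how much gets logged when a tracker completes with
 * an error: nothing, only terminal (non-retried) failures, or every failure.
 * DO_NOT_RETRY is the DNR value used when manually completing a tracker so
 * that nvme_completion_is_retry() will not resubmit it.
 */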
42 typedef enum error_print { ERROR_PRINT_NONE, ERROR_PRINT_NO_RETRY, ERROR_PRINT_ALL } error_print_t;
43 #define DO_NOT_RETRY 1
44
45 static void _nvme_qpair_submit_request(struct nvme_qpair *qpair,
46 struct nvme_request *req);
47 static void nvme_qpair_destroy(struct nvme_qpair *qpair);
48
49 struct nvme_opcode_string {
50 uint16_t opc;
51 const char * str;
52 };
53
54 static struct nvme_opcode_string admin_opcode[] = {
55 { NVME_OPC_DELETE_IO_SQ, "DELETE IO SQ" },
56 { NVME_OPC_CREATE_IO_SQ, "CREATE IO SQ" },
57 { NVME_OPC_GET_LOG_PAGE, "GET LOG PAGE" },
58 { NVME_OPC_DELETE_IO_CQ, "DELETE IO CQ" },
59 { NVME_OPC_CREATE_IO_CQ, "CREATE IO CQ" },
60 { NVME_OPC_IDENTIFY, "IDENTIFY" },
61 { NVME_OPC_ABORT, "ABORT" },
62 { NVME_OPC_SET_FEATURES, "SET FEATURES" },
63 { NVME_OPC_GET_FEATURES, "GET FEATURES" },
64 { NVME_OPC_ASYNC_EVENT_REQUEST, "ASYNC EVENT REQUEST" },
65 { NVME_OPC_FIRMWARE_ACTIVATE, "FIRMWARE ACTIVATE" },
66 { NVME_OPC_FIRMWARE_IMAGE_DOWNLOAD, "FIRMWARE IMAGE DOWNLOAD" },
67 { NVME_OPC_DEVICE_SELF_TEST, "DEVICE SELF-TEST" },
68 { NVME_OPC_NAMESPACE_ATTACHMENT, "NAMESPACE ATTACHMENT" },
69 { NVME_OPC_KEEP_ALIVE, "KEEP ALIVE" },
70 { NVME_OPC_DIRECTIVE_SEND, "DIRECTIVE SEND" },
71 { NVME_OPC_DIRECTIVE_RECEIVE, "DIRECTIVE RECEIVE" },
72 { NVME_OPC_VIRTUALIZATION_MANAGEMENT, "VIRTUALIZATION MANAGEMENT" },
73 { NVME_OPC_NVME_MI_SEND, "NVME-MI SEND" },
74 { NVME_OPC_NVME_MI_RECEIVE, "NVME-MI RECEIVE" },
75 { NVME_OPC_DOORBELL_BUFFER_CONFIG, "DOORBELL BUFFER CONFIG" },
76 { NVME_OPC_FORMAT_NVM, "FORMAT NVM" },
77 { NVME_OPC_SECURITY_SEND, "SECURITY SEND" },
78 { NVME_OPC_SECURITY_RECEIVE, "SECURITY RECEIVE" },
79 { NVME_OPC_SANITIZE, "SANITIZE" },
80 { NVME_OPC_GET_LBA_STATUS, "GET LBA STATUS" },
81 { 0xFFFF, "ADMIN COMMAND" }
82 };
83
84 static struct nvme_opcode_string io_opcode[] = {
85 { NVME_OPC_FLUSH, "FLUSH" },
86 { NVME_OPC_WRITE, "WRITE" },
87 { NVME_OPC_READ, "READ" },
88 { NVME_OPC_WRITE_UNCORRECTABLE, "WRITE UNCORRECTABLE" },
89 { NVME_OPC_COMPARE, "COMPARE" },
90 { NVME_OPC_WRITE_ZEROES, "WRITE ZEROES" },
91 { NVME_OPC_DATASET_MANAGEMENT, "DATASET MANAGEMENT" },
92 { NVME_OPC_VERIFY, "VERIFY" },
93 { NVME_OPC_RESERVATION_REGISTER, "RESERVATION REGISTER" },
94 { NVME_OPC_RESERVATION_REPORT, "RESERVATION REPORT" },
95 { NVME_OPC_RESERVATION_ACQUIRE, "RESERVATION ACQUIRE" },
96 { NVME_OPC_RESERVATION_RELEASE, "RESERVATION RELEASE" },
97 { 0xFFFF, "IO COMMAND" }
98 };
99
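/*
 * The opcode tables above are terminated by a 0xFFFF sentinel whose string is
 * the generic fallback ("ADMIN COMMAND" / "IO COMMAND").  The lookup helpers
 * below walk the table linearly and return the sentinel's string when an
 * opcode is not listed, so they never return NULL.
 */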
100 static const char *
101 get_admin_opcode_string(uint16_t opc)
102 {
103 struct nvme_opcode_string *entry;
104
105 entry = admin_opcode;
106
107 while (entry->opc != 0xFFFF) {
108 if (entry->opc == opc)
109 return (entry->str);
110 entry++;
111 }
112 return (entry->str);
113 }
114
115 static const char *
116 get_io_opcode_string(uint16_t opc)
117 {
118 struct nvme_opcode_string *entry;
119
120 entry = io_opcode;
121
122 while (entry->opc != 0xFFFF) {
123 if (entry->opc == opc)
124 return (entry->str);
125 entry++;
126 }
127 return (entry->str);
128 }
129
130 static void
131 nvme_admin_qpair_print_command(struct nvme_qpair *qpair,
132 struct nvme_command *cmd)
133 {
134
135 nvme_printf(qpair->ctrlr, "%s (%02x) sqid:%d cid:%d nsid:%x "
136 "cdw10:%08x cdw11:%08x\n",
137 get_admin_opcode_string(cmd->opc), cmd->opc, qpair->id, cmd->cid,
138 le32toh(cmd->nsid), le32toh(cmd->cdw10), le32toh(cmd->cdw11));
139 }
140
141 static void
142 nvme_io_qpair_print_command(struct nvme_qpair *qpair,
143 struct nvme_command *cmd)
144 {
145
146 switch (cmd->opc) {
147 case NVME_OPC_WRITE:
148 case NVME_OPC_READ:
149 case NVME_OPC_WRITE_UNCORRECTABLE:
150 case NVME_OPC_COMPARE:
151 case NVME_OPC_WRITE_ZEROES:
152 case NVME_OPC_VERIFY:
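/*
 * For data commands the starting LBA is split across cdw10 (low
 * 32 bits) and cdw11 (high 32 bits), and cdw12 bits 15:0 hold the
 * number of logical blocks as a zero-based count, hence the +1.
 */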
153 nvme_printf(qpair->ctrlr, "%s sqid:%d cid:%d nsid:%d "
154 "lba:%llu len:%d\n",
155 get_io_opcode_string(cmd->opc), qpair->id, cmd->cid, le32toh(cmd->nsid),
156 ((unsigned long long)le32toh(cmd->cdw11) << 32) + le32toh(cmd->cdw10),
157 (le32toh(cmd->cdw12) & 0xFFFF) + 1);
158 break;
159 case NVME_OPC_FLUSH:
160 case NVME_OPC_DATASET_MANAGEMENT:
161 case NVME_OPC_RESERVATION_REGISTER:
162 case NVME_OPC_RESERVATION_REPORT:
163 case NVME_OPC_RESERVATION_ACQUIRE:
164 case NVME_OPC_RESERVATION_RELEASE:
165 nvme_printf(qpair->ctrlr, "%s sqid:%d cid:%d nsid:%d\n",
166 get_io_opcode_string(cmd->opc), qpair->id, cmd->cid, le32toh(cmd->nsid));
167 break;
168 default:
169 nvme_printf(qpair->ctrlr, "%s (%02x) sqid:%d cid:%d nsid:%d\n",
170 get_io_opcode_string(cmd->opc), cmd->opc, qpair->id,
171 cmd->cid, le32toh(cmd->nsid));
172 break;
173 }
174 }
175
176 static void
177 nvme_qpair_print_command(struct nvme_qpair *qpair, struct nvme_command *cmd)
178 {
179 if (qpair->id == 0)
180 nvme_admin_qpair_print_command(qpair, cmd);
181 else
182 nvme_io_qpair_print_command(qpair, cmd);
183 if (nvme_verbose_cmd_dump) {
184 nvme_printf(qpair->ctrlr,
185 "nsid:%#x rsvd2:%#x rsvd3:%#x mptr:%#jx prp1:%#jx prp2:%#jx\n",
186 cmd->nsid, cmd->rsvd2, cmd->rsvd3, (uintmax_t)cmd->mptr,
187 (uintmax_t)cmd->prp1, (uintmax_t)cmd->prp2);
188 nvme_printf(qpair->ctrlr,
189 "cdw10: %#x cdw11:%#x cdw12:%#x cdw13:%#x cdw14:%#x cdw15:%#x\n",
190 cmd->cdw10, cmd->cdw11, cmd->cdw12, cmd->cdw13, cmd->cdw14,
191 cmd->cdw15);
192 }
193 }
194
195 struct nvme_status_string {
196 uint16_t sc;
197 const char * str;
198 };
199
200 static struct nvme_status_string generic_status[] = {
201 { NVME_SC_SUCCESS, "SUCCESS" },
202 { NVME_SC_INVALID_OPCODE, "INVALID OPCODE" },
203 { NVME_SC_INVALID_FIELD, "INVALID FIELD" },
204 { NVME_SC_COMMAND_ID_CONFLICT, "COMMAND ID CONFLICT" },
205 { NVME_SC_DATA_TRANSFER_ERROR, "DATA TRANSFER ERROR" },
206 { NVME_SC_ABORTED_POWER_LOSS, "ABORTED - POWER LOSS" },
207 { NVME_SC_INTERNAL_DEVICE_ERROR, "INTERNAL DEVICE ERROR" },
208 { NVME_SC_ABORTED_BY_REQUEST, "ABORTED - BY REQUEST" },
209 { NVME_SC_ABORTED_SQ_DELETION, "ABORTED - SQ DELETION" },
210 { NVME_SC_ABORTED_FAILED_FUSED, "ABORTED - FAILED FUSED" },
211 { NVME_SC_ABORTED_MISSING_FUSED, "ABORTED - MISSING FUSED" },
212 { NVME_SC_INVALID_NAMESPACE_OR_FORMAT, "INVALID NAMESPACE OR FORMAT" },
213 { NVME_SC_COMMAND_SEQUENCE_ERROR, "COMMAND SEQUENCE ERROR" },
214 { NVME_SC_INVALID_SGL_SEGMENT_DESCR, "INVALID SGL SEGMENT DESCRIPTOR" },
215 { NVME_SC_INVALID_NUMBER_OF_SGL_DESCR, "INVALID NUMBER OF SGL DESCRIPTORS" },
216 { NVME_SC_DATA_SGL_LENGTH_INVALID, "DATA SGL LENGTH INVALID" },
217 { NVME_SC_METADATA_SGL_LENGTH_INVALID, "METADATA SGL LENGTH INVALID" },
218 { NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID, "SGL DESCRIPTOR TYPE INVALID" },
219 { NVME_SC_INVALID_USE_OF_CMB, "INVALID USE OF CONTROLLER MEMORY BUFFER" },
220 { NVME_SC_PRP_OFFET_INVALID, "PRP OFFSET INVALID" },
221 { NVME_SC_ATOMIC_WRITE_UNIT_EXCEEDED, "ATOMIC WRITE UNIT EXCEEDED" },
222 { NVME_SC_OPERATION_DENIED, "OPERATION DENIED" },
223 { NVME_SC_SGL_OFFSET_INVALID, "SGL OFFSET INVALID" },
224 { NVME_SC_HOST_ID_INCONSISTENT_FORMAT, "HOST IDENTIFIER INCONSISTENT FORMAT" },
225 { NVME_SC_KEEP_ALIVE_TIMEOUT_EXPIRED, "KEEP ALIVE TIMEOUT EXPIRED" },
226 { NVME_SC_KEEP_ALIVE_TIMEOUT_INVALID, "KEEP ALIVE TIMEOUT INVALID" },
227 { NVME_SC_ABORTED_DUE_TO_PREEMPT, "COMMAND ABORTED DUE TO PREEMPT AND ABORT" },
228 { NVME_SC_SANITIZE_FAILED, "SANITIZE FAILED" },
229 { NVME_SC_SANITIZE_IN_PROGRESS, "SANITIZE IN PROGRESS" },
230 { NVME_SC_SGL_DATA_BLOCK_GRAN_INVALID, "SGL DATA BLOCK GRANULARITY INVALID" },
231 { NVME_SC_NOT_SUPPORTED_IN_CMB, "COMMAND NOT SUPPORTED FOR QUEUE IN CMB" },
232 { NVME_SC_NAMESPACE_IS_WRITE_PROTECTED, "NAMESPACE IS WRITE PROTECTED" },
233 { NVME_SC_COMMAND_INTERRUPTED, "COMMAND INTERRUPTED" },
234 { NVME_SC_TRANSIENT_TRANSPORT_ERROR, "TRANSIENT TRANSPORT ERROR" },
235
236 { NVME_SC_LBA_OUT_OF_RANGE, "LBA OUT OF RANGE" },
237 { NVME_SC_CAPACITY_EXCEEDED, "CAPACITY EXCEEDED" },
238 { NVME_SC_NAMESPACE_NOT_READY, "NAMESPACE NOT READY" },
239 { NVME_SC_RESERVATION_CONFLICT, "RESERVATION CONFLICT" },
240 { NVME_SC_FORMAT_IN_PROGRESS, "FORMAT IN PROGRESS" },
241 { 0xFFFF, "GENERIC" }
242 };
243
244 static struct nvme_status_string command_specific_status[] = {
245 { NVME_SC_COMPLETION_QUEUE_INVALID, "INVALID COMPLETION QUEUE" },
246 { NVME_SC_INVALID_QUEUE_IDENTIFIER, "INVALID QUEUE IDENTIFIER" },
247 { NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED, "MAX QUEUE SIZE EXCEEDED" },
248 { NVME_SC_ABORT_COMMAND_LIMIT_EXCEEDED, "ABORT CMD LIMIT EXCEEDED" },
249 { NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED, "ASYNC LIMIT EXCEEDED" },
250 { NVME_SC_INVALID_FIRMWARE_SLOT, "INVALID FIRMWARE SLOT" },
251 { NVME_SC_INVALID_FIRMWARE_IMAGE, "INVALID FIRMWARE IMAGE" },
252 { NVME_SC_INVALID_INTERRUPT_VECTOR, "INVALID INTERRUPT VECTOR" },
253 { NVME_SC_INVALID_LOG_PAGE, "INVALID LOG PAGE" },
254 { NVME_SC_INVALID_FORMAT, "INVALID FORMAT" },
255 { NVME_SC_FIRMWARE_REQUIRES_RESET, "FIRMWARE REQUIRES RESET" },
256 { NVME_SC_INVALID_QUEUE_DELETION, "INVALID QUEUE DELETION" },
257 { NVME_SC_FEATURE_NOT_SAVEABLE, "FEATURE IDENTIFIER NOT SAVEABLE" },
258 { NVME_SC_FEATURE_NOT_CHANGEABLE, "FEATURE NOT CHANGEABLE" },
259 { NVME_SC_FEATURE_NOT_NS_SPECIFIC, "FEATURE NOT NAMESPACE SPECIFIC" },
260 { NVME_SC_FW_ACT_REQUIRES_NVMS_RESET, "FIRMWARE ACTIVATION REQUIRES NVM SUBSYSTEM RESET" },
261 { NVME_SC_FW_ACT_REQUIRES_RESET, "FIRMWARE ACTIVATION REQUIRES RESET" },
262 { NVME_SC_FW_ACT_REQUIRES_TIME, "FIRMWARE ACTIVATION REQUIRES MAXIMUM TIME VIOLATION" },
263 { NVME_SC_FW_ACT_PROHIBITED, "FIRMWARE ACTIVATION PROHIBITED" },
264 { NVME_SC_OVERLAPPING_RANGE, "OVERLAPPING RANGE" },
265 { NVME_SC_NS_INSUFFICIENT_CAPACITY, "NAMESPACE INSUFFICIENT CAPACITY" },
266 { NVME_SC_NS_ID_UNAVAILABLE, "NAMESPACE IDENTIFIER UNAVAILABLE" },
267 { NVME_SC_NS_ALREADY_ATTACHED, "NAMESPACE ALREADY ATTACHED" },
268 { NVME_SC_NS_IS_PRIVATE, "NAMESPACE IS PRIVATE" },
269 { NVME_SC_NS_NOT_ATTACHED, "NS NOT ATTACHED" },
270 { NVME_SC_THIN_PROV_NOT_SUPPORTED, "THIN PROVISIONING NOT SUPPORTED" },
271 { NVME_SC_CTRLR_LIST_INVALID, "CONTROLLER LIST INVALID" },
272 { NVME_SC_SELF_TEST_IN_PROGRESS, "DEVICE SELF-TEST IN PROGRESS" },
273 { NVME_SC_BOOT_PART_WRITE_PROHIB, "BOOT PARTITION WRITE PROHIBITED" },
274 { NVME_SC_INVALID_CTRLR_ID, "INVALID CONTROLLER IDENTIFIER" },
275 { NVME_SC_INVALID_SEC_CTRLR_STATE, "INVALID SECONDARY CONTROLLER STATE" },
276 { NVME_SC_INVALID_NUM_OF_CTRLR_RESRC, "INVALID NUMBER OF CONTROLLER RESOURCES" },
277 { NVME_SC_INVALID_RESOURCE_ID, "INVALID RESOURCE IDENTIFIER" },
278 { NVME_SC_SANITIZE_PROHIBITED_WPMRE, "SANITIZE PROHIBITED WRITE PERSISTENT MEMORY REGION ENABLED" },
279 { NVME_SC_ANA_GROUP_ID_INVALID, "ANA GROUP IDENTIFIER INVALID" },
280 { NVME_SC_ANA_ATTACH_FAILED, "ANA ATTACH FAILED" },
281
282 { NVME_SC_CONFLICTING_ATTRIBUTES, "CONFLICTING ATTRIBUTES" },
283 { NVME_SC_INVALID_PROTECTION_INFO, "INVALID PROTECTION INFO" },
284 { NVME_SC_ATTEMPTED_WRITE_TO_RO_PAGE, "WRITE TO RO PAGE" },
285 { 0xFFFF, "COMMAND SPECIFIC" }
286 };
287
288 static struct nvme_status_string media_error_status[] = {
289 { NVME_SC_WRITE_FAULTS, "WRITE FAULTS" },
290 { NVME_SC_UNRECOVERED_READ_ERROR, "UNRECOVERED READ ERROR" },
291 { NVME_SC_GUARD_CHECK_ERROR, "GUARD CHECK ERROR" },
292 { NVME_SC_APPLICATION_TAG_CHECK_ERROR, "APPLICATION TAG CHECK ERROR" },
293 { NVME_SC_REFERENCE_TAG_CHECK_ERROR, "REFERENCE TAG CHECK ERROR" },
294 { NVME_SC_COMPARE_FAILURE, "COMPARE FAILURE" },
295 { NVME_SC_ACCESS_DENIED, "ACCESS DENIED" },
296 { NVME_SC_DEALLOCATED_OR_UNWRITTEN, "DEALLOCATED OR UNWRITTEN LOGICAL BLOCK" },
297 { 0xFFFF, "MEDIA ERROR" }
298 };
299
300 static struct nvme_status_string path_related_status[] = {
301 { NVME_SC_INTERNAL_PATH_ERROR, "INTERNAL PATH ERROR" },
302 { NVME_SC_ASYMMETRIC_ACCESS_PERSISTENT_LOSS, "ASYMMETRIC ACCESS PERSISTENT LOSS" },
303 { NVME_SC_ASYMMETRIC_ACCESS_INACCESSIBLE, "ASYMMETRIC ACCESS INACCESSIBLE" },
304 { NVME_SC_ASYMMETRIC_ACCESS_TRANSITION, "ASYMMETRIC ACCESS TRANSITION" },
305 { NVME_SC_CONTROLLER_PATHING_ERROR, "CONTROLLER PATHING ERROR" },
306 { NVME_SC_HOST_PATHING_ERROR, "HOST PATHING ERROR" },
307 { NVME_SC_COMMAND_ABORTED_BY_HOST, "COMMAND ABORTED BY HOST" },
308 { 0xFFFF, "PATH RELATED" },
309 };
310
311 static const char *
312 get_status_string(uint16_t sct, uint16_t sc)
313 {
314 struct nvme_status_string *entry;
315
316 switch (sct) {
317 case NVME_SCT_GENERIC:
318 entry = generic_status;
319 break;
320 case NVME_SCT_COMMAND_SPECIFIC:
321 entry = command_specific_status;
322 break;
323 case NVME_SCT_MEDIA_ERROR:
324 entry = media_error_status;
325 break;
326 case NVME_SCT_PATH_RELATED:
327 entry = path_related_status;
328 break;
329 case NVME_SCT_VENDOR_SPECIFIC:
330 return ("VENDOR SPECIFIC");
331 default:
332 return ("RESERVED");
333 }
334
335 while (entry->sc != 0xFFFF) {
336 if (entry->sc == sc)
337 return (entry->str);
338 entry++;
339 }
340 return (entry->str);
341 }
342
343 static void
344 nvme_qpair_print_completion(struct nvme_qpair *qpair,
345 struct nvme_completion *cpl)
346 {
347 uint8_t sct, sc, crd, m, dnr;
348
349 sct = NVME_STATUS_GET_SCT(cpl->status);
350 sc = NVME_STATUS_GET_SC(cpl->status);
351 crd = NVME_STATUS_GET_CRD(cpl->status);
352 m = NVME_STATUS_GET_M(cpl->status);
353 dnr = NVME_STATUS_GET_DNR(cpl->status);
354
355 nvme_printf(qpair->ctrlr, "%s (%02x/%02x) crd:%x m:%x dnr:%x "
356 "sqid:%d cid:%d cdw0:%x\n",
357 get_status_string(sct, sc), sct, sc, crd, m, dnr,
358 cpl->sqid, cpl->cid, cpl->cdw0);
359 }
360
361 static bool
362 nvme_completion_is_retry(const struct nvme_completion *cpl)
363 {
364 uint8_t sct, sc, dnr;
365
366 sct = NVME_STATUS_GET_SCT(cpl->status);
367 sc = NVME_STATUS_GET_SC(cpl->status);
368 dnr = NVME_STATUS_GET_DNR(cpl->status); /* Do Not Retry Bit */
369
370 /*
371 * TODO: spec is not clear how commands that are aborted due
372 * to TLER will be marked. So for now, it seems
373 * NAMESPACE_NOT_READY is the only case where we should
374 * look at the DNR bit. Requests failed with ABORTED_BY_REQUEST
375 * set the DNR bit correctly since the driver controls that.
376 */
377 switch (sct) {
378 case NVME_SCT_GENERIC:
379 switch (sc) {
380 case NVME_SC_ABORTED_BY_REQUEST:
381 case NVME_SC_NAMESPACE_NOT_READY:
382 if (dnr)
383 return (0);
384 else
385 return (1);
386 case NVME_SC_INVALID_OPCODE:
387 case NVME_SC_INVALID_FIELD:
388 case NVME_SC_COMMAND_ID_CONFLICT:
389 case NVME_SC_DATA_TRANSFER_ERROR:
390 case NVME_SC_ABORTED_POWER_LOSS:
391 case NVME_SC_INTERNAL_DEVICE_ERROR:
392 case NVME_SC_ABORTED_SQ_DELETION:
393 case NVME_SC_ABORTED_FAILED_FUSED:
394 case NVME_SC_ABORTED_MISSING_FUSED:
395 case NVME_SC_INVALID_NAMESPACE_OR_FORMAT:
396 case NVME_SC_COMMAND_SEQUENCE_ERROR:
397 case NVME_SC_LBA_OUT_OF_RANGE:
398 case NVME_SC_CAPACITY_EXCEEDED:
399 default:
400 return (0);
401 }
402 case NVME_SCT_COMMAND_SPECIFIC:
403 case NVME_SCT_MEDIA_ERROR:
404 return (0);
405 case NVME_SCT_PATH_RELATED:
406 switch (sc) {
407 case NVME_SC_INTERNAL_PATH_ERROR:
408 if (dnr)
409 return (0);
410 else
411 return (1);
412 default:
413 return (0);
414 }
415 case NVME_SCT_VENDOR_SPECIFIC:
416 default:
417 return (0);
418 }
419 }
420
421 static void
422 nvme_qpair_complete_tracker(struct nvme_tracker *tr,
423 struct nvme_completion *cpl, error_print_t print_on_error)
424 {
425 struct nvme_qpair * qpair = tr->qpair;
426 struct nvme_request *req;
427 bool retry, error, retriable;
428
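/*
 * A command is retried only when it failed with a status that
 * nvme_completion_is_retry() considers retriable and it has not yet
 * used up nvme_retry_count attempts; retries and terminal failures
 * are tallied separately in the qpair counters.
 */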
429 req = tr->req;
430 error = nvme_completion_is_error(cpl);
431 retriable = nvme_completion_is_retry(cpl);
432 retry = error && retriable && req->retries < nvme_retry_count;
433 if (retry)
434 qpair->num_retries++;
435 if (error && req->retries >= nvme_retry_count && retriable)
436 qpair->num_failures++;
437
438 if (error && (print_on_error == ERROR_PRINT_ALL ||
439 (!retry && print_on_error == ERROR_PRINT_NO_RETRY))) {
440 nvme_qpair_print_command(qpair, &req->cmd);
441 nvme_qpair_print_completion(qpair, cpl);
442 }
443
444 qpair->act_tr[cpl->cid] = NULL;
445
446 KASSERT(cpl->cid == req->cmd.cid, ("cpl cid does not match cmd cid\n"));
447
448 if (!retry) {
449 if (req->type != NVME_REQUEST_NULL) {
450 bus_dmamap_sync(qpair->dma_tag_payload,
451 tr->payload_dma_map,
452 BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
453 }
454 if (req->cb_fn)
455 req->cb_fn(req->cb_arg, cpl);
456 }
457
458 mtx_lock(&qpair->lock);
459
460 if (retry) {
461 req->retries++;
462 nvme_qpair_submit_tracker(qpair, tr);
463 } else {
464 if (req->type != NVME_REQUEST_NULL) {
465 bus_dmamap_unload(qpair->dma_tag_payload,
466 tr->payload_dma_map);
467 }
468
469 nvme_free_request(req);
470 tr->req = NULL;
471
472 TAILQ_REMOVE(&qpair->outstanding_tr, tr, tailq);
473 TAILQ_INSERT_HEAD(&qpair->free_tr, tr, tailq);
474
475 /*
476 * If the controller is in the middle of resetting, don't
477 * try to submit queued requests here - let the reset logic
478 * handle that instead.
479 */
480 if (!STAILQ_EMPTY(&qpair->queued_req) &&
481 !qpair->ctrlr->is_resetting) {
482 req = STAILQ_FIRST(&qpair->queued_req);
483 STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq);
484 _nvme_qpair_submit_request(qpair, req);
485 }
486 }
487
488 mtx_unlock(&qpair->lock);
489 }
490
491 static void
492 nvme_qpair_manual_complete_tracker(
493 struct nvme_tracker *tr, uint32_t sct, uint32_t sc, uint32_t dnr,
494 error_print_t print_on_error)
495 {
496 struct nvme_completion cpl;
497
498 memset(&cpl, 0, sizeof(cpl));
499
500 struct nvme_qpair * qpair = tr->qpair;
501
502 cpl.sqid = qpair->id;
503 cpl.cid = tr->cid;
504 cpl.status |= (sct & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT;
505 cpl.status |= (sc & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
506 cpl.status |= (dnr & NVME_STATUS_DNR_MASK) << NVME_STATUS_DNR_SHIFT;
507 nvme_qpair_complete_tracker(tr, &cpl, print_on_error);
508 }
509
510 void
511 nvme_qpair_manual_complete_request(struct nvme_qpair *qpair,
512 struct nvme_request *req, uint32_t sct, uint32_t sc)
513 {
514 struct nvme_completion cpl;
515 bool error;
516
517 memset(&cpl, 0, sizeof(cpl));
518 cpl.sqid = qpair->id;
519 cpl.status |= (sct & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT;
520 cpl.status |= (sc & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
521
522 error = nvme_completion_is_error(&cpl);
523
524 if (error) {
525 nvme_qpair_print_command(qpair, &req->cmd);
526 nvme_qpair_print_completion(qpair, &cpl);
527 }
528
529 if (req->cb_fn)
530 req->cb_fn(req->cb_arg, &cpl);
531
532 nvme_free_request(req);
533 }
534
535 bool
536 nvme_qpair_process_completions(struct nvme_qpair *qpair)
537 {
538 struct nvme_tracker *tr;
539 struct nvme_completion cpl;
540 int done = 0;
541 bool in_panic = dumping || SCHEDULER_STOPPED();
542
543 /*
544 * qpair is not enabled, likely because a controller reset is in
545 * progress. Ignore the interrupt - any I/O that was associated with
546 * this interrupt will get retried when the reset is complete. Any
547 * pending completions for when we're in startup will be completed
548 * as soon as initialization is complete and we start sending commands
549 * to the device.
550 */
551 if (qpair->recovery_state != RECOVERY_NONE) {
552 qpair->num_ignored++;
553 return (false);
554 }
555
556 /*
557 * Sanity check initialization. After we reset the hardware, the phase
558 * is defined to be 1. So if we get here with zero prior calls and the
559 * phase is 0, it means that we've lost a race between the
560 * initialization and the ISR running. With the phase wrong, we'll
561 * process a bunch of completions that aren't really completions leading
562 * to a KASSERT below.
563 */
564 KASSERT(!(qpair->num_intr_handler_calls == 0 && qpair->phase == 0),
565 ("%s: Phase wrong for first interrupt call.",
566 device_get_nameunit(qpair->ctrlr->dev)));
567
568 qpair->num_intr_handler_calls++;
569
570 bus_dmamap_sync(qpair->dma_tag, qpair->queuemem_map,
571 BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
572 /*
573 * A panic can stop the CPU this routine is running on at any point. If
574 * we're called during a panic, complete the cq_head wrap protocol for
575 * the case where we are interrupted just after the increment at 1
576 * below, but before we can reset cq_head to zero at 2. Also cope with
577 * the case where we do the zero at 2, but may or may not have done the
578 * phase adjustment at step 3. The panic machinery flushes all pending
579 * memory writes, so we can make these strong ordering assumptions
580 * that would otherwise be unwise if we were racing in real time.
581 */
582 if (__predict_false(in_panic)) {
583 if (qpair->cq_head == qpair->num_entries) {
584 /*
585 * cq_head has reached num_entries, so the store at 2 below has
586 * not happened yet: zero cq_head here and toggle the phase to
587 * finish the wrap that the panic interrupted.
588 */
589 qpair->cq_head = 0;
590 qpair->phase = !qpair->phase;
591 } else if (qpair->cq_head == 0) {
592 /*
593 * In this case, we know that the zeroing at 2 happened, but we
594 * don't know whether the phase toggle at 3 did. To recover, we
595 * look at the last completion entry and set the phase to the
596 * opposite of the phase recorded there, which gets us back in
597 * sync.
598 */
599 cpl = qpair->cpl[qpair->num_entries - 1];
600 nvme_completion_swapbytes(&cpl);
601 qpair->phase = !NVME_STATUS_GET_P(cpl.status);
602 }
603 }
604
605 while (1) {
606 uint16_t status;
607
608 /*
609 * We need to do this dance to avoid a race between the host and
610 * the device where the device overtakes the host while the host
611 * is reading this record, leaving the status field 'new' and
612 * the sqhd and cid fields potentially stale. If the phase
613 * doesn't match, that means status hasn't yet been updated and
614 * we'll get any pending changes next time. It also means that
615 * the phase must be the same the second time. We have to sync
616 * before reading to ensure any bouncing completes.
617 */
618 status = le16toh(qpair->cpl[qpair->cq_head].status);
619 if (NVME_STATUS_GET_P(status) != qpair->phase)
620 break;
621
622 bus_dmamap_sync(qpair->dma_tag, qpair->queuemem_map,
623 BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
624 cpl = qpair->cpl[qpair->cq_head];
625 nvme_completion_swapbytes(&cpl);
626
627 KASSERT(
628 NVME_STATUS_GET_P(status) == NVME_STATUS_GET_P(cpl.status),
629 ("Phase unexpectedly inconsistent"));
630
631 if (cpl.cid < qpair->num_trackers)
632 tr = qpair->act_tr[cpl.cid];
633 else
634 tr = NULL;
635
636 done++;
637 if (tr != NULL) {
638 nvme_qpair_complete_tracker(tr, &cpl, ERROR_PRINT_ALL);
639 qpair->sq_head = cpl.sqhd;
640 } else if (!in_panic) {
641 /*
642 * A missing tracker is normally an error. However, a
643 * panic can stop the CPU this routine is running on
644 * after completing an I/O but before updating
645 * qpair->cq_head at 1 below. Later, we re-enter this
646 * routine to poll I/O associated with the kernel
647 * dump. We find that the tr has been set to null before
648 * calling the completion routine. If it hasn't
649 * completed (or it triggers a panic), then '1' below
650 * won't have updated cq_head. Rather than panic again,
651 * ignore this condition because it's not unexpected.
652 */
653 nvme_printf(qpair->ctrlr,
654 "cpl (cid = %u) does not map to outstanding cmd\n",
655 cpl.cid);
656 /* nvme_dump_completion expects device endianness */
657 nvme_dump_completion(&qpair->cpl[qpair->cq_head]);
658 KASSERT(0, ("received completion for unknown cmd"));
659 }
660
661 /*
662 * There's a number of races with the following (see above) when
663 * the system panics. We compensate for each one of them by
664 * using the atomic store to force strong ordering (at least when
665 * viewed in the aftermath of a panic).
666 */
667 if (++qpair->cq_head == qpair->num_entries) { /* 1 */
668 atomic_store_rel_int(&qpair->cq_head, 0); /* 2 */
669 qpair->phase = !qpair->phase; /* 3 */
670 }
671 }
672
673 if (done != 0) {
674 bus_space_write_4(qpair->ctrlr->bus_tag, qpair->ctrlr->bus_handle,
675 qpair->cq_hdbl_off, qpair->cq_head);
676 }
677
678 return (done != 0);
679 }
680
681 static void
682 nvme_qpair_msi_handler(void *arg)
683 {
684 struct nvme_qpair *qpair = arg;
685
686 nvme_qpair_process_completions(qpair);
687 }
688
689 int
690 nvme_qpair_construct(struct nvme_qpair *qpair,
691 uint32_t num_entries, uint32_t num_trackers,
692 struct nvme_controller *ctrlr)
693 {
694 struct nvme_tracker *tr;
695 size_t cmdsz, cplsz, prpsz, allocsz, prpmemsz;
696 uint64_t queuemem_phys, prpmem_phys, list_phys;
697 uint8_t *queuemem, *prpmem, *prp_list;
698 int i, err;
699
700 qpair->vector = ctrlr->msi_count > 1 ? qpair->id : 0;
701 qpair->num_entries = num_entries;
702 qpair->num_trackers = num_trackers;
703 qpair->ctrlr = ctrlr;
704
705 mtx_init(&qpair->lock, "nvme qpair lock", NULL, MTX_DEF);
706
707 /* Note: NVMe PRP format is restricted to 4-byte alignment. */
708 err = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev),
709 4, ctrlr->page_size, BUS_SPACE_MAXADDR,
710 BUS_SPACE_MAXADDR, NULL, NULL, ctrlr->max_xfer_size,
711 howmany(ctrlr->max_xfer_size, ctrlr->page_size) + 1,
712 ctrlr->page_size, 0,
713 NULL, NULL, &qpair->dma_tag_payload);
714 if (err != 0) {
715 nvme_printf(ctrlr, "payload tag create failed %d\n", err);
716 goto out;
717 }
718
719 /*
720 * Each component must be page aligned, and individual PRP lists
721 * cannot cross a page boundary.
722 */
723 cmdsz = qpair->num_entries * sizeof(struct nvme_command);
724 cmdsz = roundup2(cmdsz, ctrlr->page_size);
725 cplsz = qpair->num_entries * sizeof(struct nvme_completion);
726 cplsz = roundup2(cplsz, ctrlr->page_size);
727 /*
728 * For commands requiring more than 2 PRP entries, one PRP will be
729 * embedded in the command (prp1), and the rest of the PRP entries
730 * will be in a list pointed to by the command (prp2).
731 */
732 prpsz = sizeof(uint64_t) *
733 howmany(ctrlr->max_xfer_size, ctrlr->page_size);
734 prpmemsz = qpair->num_trackers * prpsz;
735 allocsz = cmdsz + cplsz + prpmemsz;
736
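/*
 * The queue memory is carved out of a single allocation, laid out as
 * [submission queue commands][completion queue entries][per-tracker PRP
 * lists], with the command and completion regions rounded up to the
 * controller page size (see the pointer/physical address assignments below).
 */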
737 err = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev),
738 ctrlr->page_size, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL,
739 allocsz, 1, allocsz, 0, NULL, NULL, &qpair->dma_tag);
740 if (err != 0) {
741 nvme_printf(ctrlr, "tag create failed %d\n", err);
742 goto out;
743 }
744 bus_dma_tag_set_domain(qpair->dma_tag, qpair->domain);
745
746 if (bus_dmamem_alloc(qpair->dma_tag, (void **)&queuemem,
747 BUS_DMA_COHERENT | BUS_DMA_NOWAIT, &qpair->queuemem_map)) {
748 nvme_printf(ctrlr, "failed to alloc qpair memory\n");
749 goto out;
750 }
751
752 if (bus_dmamap_load(qpair->dma_tag, qpair->queuemem_map,
753 queuemem, allocsz, nvme_single_map, &queuemem_phys, 0) != 0) {
754 nvme_printf(ctrlr, "failed to load qpair memory\n");
755 bus_dmamem_free(qpair->dma_tag, queuemem,
756 qpair->queuemem_map);
757 goto out;
758 }
759
760 qpair->num_cmds = 0;
761 qpair->num_intr_handler_calls = 0;
762 qpair->num_retries = 0;
763 qpair->num_failures = 0;
764 qpair->num_ignored = 0;
765 qpair->cmd = (struct nvme_command *)queuemem;
766 qpair->cpl = (struct nvme_completion *)(queuemem + cmdsz);
767 prpmem = (uint8_t *)(queuemem + cmdsz + cplsz);
768 qpair->cmd_bus_addr = queuemem_phys;
769 qpair->cpl_bus_addr = queuemem_phys + cmdsz;
770 prpmem_phys = queuemem_phys + cmdsz + cplsz;
771
772 callout_init(&qpair->timer, 1);
773 qpair->timer_armed = false;
774 qpair->recovery_state = RECOVERY_WAITING;
775
776 /*
777 * Calculate the stride of the doorbell register. Many emulators set this
778 * value to correspond to a cache line. However, some hardware has set
779 * it to various small values.
780 */
781 qpair->sq_tdbl_off = nvme_mmio_offsetof(doorbell[0]) +
782 (qpair->id << (ctrlr->dstrd + 1));
783 qpair->cq_hdbl_off = nvme_mmio_offsetof(doorbell[0]) +
784 (qpair->id << (ctrlr->dstrd + 1)) + (1 << ctrlr->dstrd);
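/*
 * Worked example (illustrative, assuming ctrlr->dstrd already carries the
 * +2 bias so that 1 << dstrd is the doorbell stride in bytes): with
 * CAP.DSTRD = 0 the stride is 4 bytes, so queue pair 1 uses the SQ tail
 * doorbell at doorbell[0] + 8 and the CQ head doorbell at doorbell[0] + 12,
 * matching the spec's layout of SQyTDBL/CQyHDBL pairs.
 */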
785
786 TAILQ_INIT(&qpair->free_tr);
787 TAILQ_INIT(&qpair->outstanding_tr);
788 STAILQ_INIT(&qpair->queued_req);
789
790 list_phys = prpmem_phys;
791 prp_list = prpmem;
792 for (i = 0; i < qpair->num_trackers; i++) {
793 if (list_phys + prpsz > prpmem_phys + prpmemsz) {
794 qpair->num_trackers = i;
795 break;
796 }
797
798 /*
799 * Make sure that the PRP list for this tracker doesn't
800 * overflow to another nvme page.
801 */
802 if (trunc_page(list_phys) !=
803 trunc_page(list_phys + prpsz - 1)) {
804 list_phys = roundup2(list_phys, ctrlr->page_size);
805 prp_list =
806 (uint8_t *)roundup2((uintptr_t)prp_list, ctrlr->page_size);
807 }
808
809 tr = malloc_domainset(sizeof(*tr), M_NVME,
810 DOMAINSET_PREF(qpair->domain), M_ZERO | M_WAITOK);
811 bus_dmamap_create(qpair->dma_tag_payload, 0,
812 &tr->payload_dma_map);
813 tr->cid = i;
814 tr->qpair = qpair;
815 tr->prp = (uint64_t *)prp_list;
816 tr->prp_bus_addr = list_phys;
817 TAILQ_INSERT_HEAD(&qpair->free_tr, tr, tailq);
818 list_phys += prpsz;
819 prp_list += prpsz;
820 }
821
822 if (qpair->num_trackers == 0) {
823 nvme_printf(ctrlr, "failed to allocate enough trackers\n");
824 goto out;
825 }
826
827 qpair->act_tr = malloc_domainset(sizeof(struct nvme_tracker *) *
828 qpair->num_entries, M_NVME, DOMAINSET_PREF(qpair->domain),
829 M_ZERO | M_WAITOK);
830
831 if (ctrlr->msi_count > 1) {
832 /*
833 * MSI-X vector resource IDs start at 1, so we add one to
834 * the queue's vector to get the corresponding rid to use.
835 */
836 qpair->rid = qpair->vector + 1;
837
838 qpair->res = bus_alloc_resource_any(ctrlr->dev, SYS_RES_IRQ,
839 &qpair->rid, RF_ACTIVE);
840 if (qpair->res == NULL) {
841 nvme_printf(ctrlr, "unable to allocate MSI\n");
842 goto out;
843 }
844 if (bus_setup_intr(ctrlr->dev, qpair->res,
845 INTR_TYPE_MISC | INTR_MPSAFE, NULL,
846 nvme_qpair_msi_handler, qpair, &qpair->tag) != 0) {
847 nvme_printf(ctrlr, "unable to setup MSI\n");
848 goto out;
849 }
850 if (qpair->id == 0) {
851 bus_describe_intr(ctrlr->dev, qpair->res, qpair->tag,
852 "admin");
853 } else {
854 bus_describe_intr(ctrlr->dev, qpair->res, qpair->tag,
855 "io%d", qpair->id - 1);
856 }
857 }
858
859 return (0);
860
861 out:
862 nvme_qpair_destroy(qpair);
863 return (ENOMEM);
864 }
865
866 static void
867 nvme_qpair_destroy(struct nvme_qpair *qpair)
868 {
869 struct nvme_tracker *tr;
870
871 callout_drain(&qpair->timer);
872
873 if (qpair->tag) {
874 bus_teardown_intr(qpair->ctrlr->dev, qpair->res, qpair->tag);
875 qpair->tag = NULL;
876 }
877
878 if (qpair->act_tr) {
879 free(qpair->act_tr, M_NVME);
880 qpair->act_tr = NULL;
881 }
882
883 while (!TAILQ_EMPTY(&qpair->free_tr)) {
884 tr = TAILQ_FIRST(&qpair->free_tr);
885 TAILQ_REMOVE(&qpair->free_tr, tr, tailq);
886 bus_dmamap_destroy(qpair->dma_tag_payload,
887 tr->payload_dma_map);
888 free(tr, M_NVME);
889 }
890
891 if (qpair->cmd != NULL) {
892 bus_dmamap_unload(qpair->dma_tag, qpair->queuemem_map);
893 bus_dmamem_free(qpair->dma_tag, qpair->cmd,
894 qpair->queuemem_map);
895 qpair->cmd = NULL;
896 }
897
898 if (qpair->dma_tag) {
899 bus_dma_tag_destroy(qpair->dma_tag);
900 qpair->dma_tag = NULL;
901 }
902
903 if (qpair->dma_tag_payload) {
904 bus_dma_tag_destroy(qpair->dma_tag_payload);
905 qpair->dma_tag_payload = NULL;
906 }
907
908 if (mtx_initialized(&qpair->lock))
909 mtx_destroy(&qpair->lock);
910
911 if (qpair->res) {
912 bus_release_resource(qpair->ctrlr->dev, SYS_RES_IRQ,
913 rman_get_rid(qpair->res), qpair->res);
914 qpair->res = NULL;
915 }
916 }
917
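/*
 * Asynchronous Event Requests are intentionally left outstanding until the
 * controller has something to report, so they must be completed by hand
 * (with SQ DELETION status and no error print) before the admin qpair can
 * be torn down or disabled.
 */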
918 static void
919 nvme_admin_qpair_abort_aers(struct nvme_qpair *qpair)
920 {
921 struct nvme_tracker *tr;
922
923 tr = TAILQ_FIRST(&qpair->outstanding_tr);
924 while (tr != NULL) {
925 if (tr->req->cmd.opc == NVME_OPC_ASYNC_EVENT_REQUEST) {
926 nvme_qpair_manual_complete_tracker(tr,
927 NVME_SCT_GENERIC, NVME_SC_ABORTED_SQ_DELETION, 0,
928 ERROR_PRINT_NONE);
929 tr = TAILQ_FIRST(&qpair->outstanding_tr);
930 } else {
931 tr = TAILQ_NEXT(tr, tailq);
932 }
933 }
934 }
935
936 void
937 nvme_admin_qpair_destroy(struct nvme_qpair *qpair)
938 {
939
940 nvme_admin_qpair_abort_aers(qpair);
941 nvme_qpair_destroy(qpair);
942 }
943
944 void
945 nvme_io_qpair_destroy(struct nvme_qpair *qpair)
946 {
947
948 nvme_qpair_destroy(qpair);
949 }
950
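/*
 * Periodic timeout handler.  The per-qpair recovery state machine works as
 * follows: RECOVERY_NONE scans outstanding trackers for an expired deadline;
 * RECOVERY_START reads CSTS.CFS and either polls for missed completions or
 * escalates; RECOVERY_RESET requests a controller reset; RECOVERY_WAITING
 * means a reset is already in flight, so just wait for it to finish.
 */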
951 static void
952 nvme_qpair_timeout(void *arg)
953 {
954 struct nvme_qpair *qpair = arg;
955 struct nvme_controller *ctrlr = qpair->ctrlr;
956 struct nvme_tracker *tr;
957 sbintime_t now;
958 bool idle;
959 uint32_t csts;
960 uint8_t cfs;
961
962 mtx_lock(&qpair->lock);
963 idle = TAILQ_EMPTY(&qpair->outstanding_tr);
964 again:
965 switch (qpair->recovery_state) {
966 case RECOVERY_NONE:
967 if (idle)
968 break;
969 now = getsbinuptime();
970 idle = true;
971 TAILQ_FOREACH(tr, &qpair->outstanding_tr, tailq) {
972 if (tr->deadline == SBT_MAX)
973 continue;
974 idle = false;
975 if (now > tr->deadline) {
976 /*
977 * We're now passed our earliest deadline. We
978 * need to do expensive things to cope, but next
979 * time. Flag that and close the door to any
980 * further processing.
981 */
982 qpair->recovery_state = RECOVERY_START;
983 nvme_printf(ctrlr, "RECOVERY_START %jd vs %jd\n",
984 (uintmax_t)now, (uintmax_t)tr->deadline);
985 break;
986 }
987 }
988 break;
989 case RECOVERY_START:
990 /*
991 * Read csts to get value of cfs - controller fatal status.
992 * If no fatal status, try to call the completion routine, and
993 * if that completes transactions, report a missed interrupt and
994 * return (this may need to be rate limited). Otherwise, if
995 * aborts are enabled and the controller is not reporting
996 * fatal status, abort the command. Otherwise, just reset the
997 * controller and hope for the best.
998 */
999 csts = nvme_mmio_read_4(ctrlr, csts);
1000 cfs = (csts >> NVME_CSTS_REG_CFS_SHIFT) & NVME_CSTS_REG_CFS_MASK;
1001 if (cfs) {
1002 nvme_printf(ctrlr, "Controller in fatal status, resetting\n");
1003 qpair->recovery_state = RECOVERY_RESET;
1004 goto again;
1005 }
1006 mtx_unlock(&qpair->lock);
1007 if (nvme_qpair_process_completions(qpair)) {
1008 nvme_printf(ctrlr, "Completions present in output without an interrupt\n");
1009 qpair->recovery_state = RECOVERY_NONE;
1010 } else {
1011 nvme_printf(ctrlr, "timeout with nothing complete, resetting\n");
1012 qpair->recovery_state = RECOVERY_RESET;
1013 mtx_lock(&qpair->lock);
1014 goto again;
1015 }
1016 mtx_lock(&qpair->lock);
1017 break;
1018 case RECOVERY_RESET:
1019 /*
1020 * If we get here due to a possible surprise hot-unplug event,
1021 * then we let nvme_ctrlr_reset confirm and fail the
1022 * controller.
1023 */
1024 nvme_printf(ctrlr, "Resetting controller due to a timeout%s.\n",
1025 (csts == 0xffffffff) ? " and possible hot unplug" :
1026 (cfs ? " and fatal error status" : ""));
1027 nvme_printf(ctrlr, "RECOVERY_WAITING\n");
1028 qpair->recovery_state = RECOVERY_WAITING;
1029 nvme_ctrlr_reset(ctrlr);
1030 break;
1031 case RECOVERY_WAITING:
1032 nvme_printf(ctrlr, "waiting\n");
1033 break;
1034 }
1035
1036 /*
1037 * Rearm the timeout.
1038 */
1039 if (!idle) {
1040 callout_schedule_sbt(&qpair->timer, SBT_1S / 2, SBT_1S / 2, 0);
1041 } else {
1042 qpair->timer_armed = false;
1043 }
1044 mtx_unlock(&qpair->lock);
1045 }
1046
1047 /*
1048 * Submit the tracker to the hardware. Must already be in the
1049 * outstanding queue when called.
1050 */
1051 void
1052 nvme_qpair_submit_tracker(struct nvme_qpair *qpair, struct nvme_tracker *tr)
1053 {
1054 struct nvme_request *req;
1055 struct nvme_controller *ctrlr;
1056 int timeout;
1057
1058 mtx_assert(&qpair->lock, MA_OWNED);
1059
1060 req = tr->req;
1061 req->cmd.cid = tr->cid;
1062 qpair->act_tr[tr->cid] = tr;
1063 ctrlr = qpair->ctrlr;
1064
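/*
 * Polled requests (nvme_completion_poll_cb) get a short one second
 * deadline; everything else uses the controller's timeout_period.  The
 * shared per-qpair timer runs at roughly half-second granularity and is
 * armed lazily the first time a timed request is submitted.
 */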
1065 if (req->timeout) {
1066 if (req->cb_fn == nvme_completion_poll_cb)
1067 timeout = 1;
1068 else
1069 timeout = ctrlr->timeout_period;
1070 tr->deadline = getsbinuptime() + timeout * SBT_1S;
1071 if (!qpair->timer_armed) {
1072 qpair->timer_armed = true;
1073 callout_reset_sbt_on(&qpair->timer, SBT_1S / 2, SBT_1S / 2,
1074 nvme_qpair_timeout, qpair, qpair->cpu, 0);
1075 }
1076 } else
1077 tr->deadline = SBT_MAX;
1078
1079 /* Copy the command from the tracker to the submission queue. */
1080 memcpy(&qpair->cmd[qpair->sq_tail], &req->cmd, sizeof(req->cmd));
1081
1082 if (++qpair->sq_tail == qpair->num_entries)
1083 qpair->sq_tail = 0;
1084
1085 bus_dmamap_sync(qpair->dma_tag, qpair->queuemem_map,
1086 BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
1087 bus_space_write_4(qpair->ctrlr->bus_tag, qpair->ctrlr->bus_handle,
1088 qpair->sq_tdbl_off, qpair->sq_tail);
1089 qpair->num_cmds++;
1090 }
1091
1092 static void
1093 nvme_payload_map(void *arg, bus_dma_segment_t *seg, int nseg, int error)
1094 {
1095 struct nvme_tracker *tr = arg;
1096 uint32_t cur_nseg;
1097
1098 /*
1099 * If the mapping operation failed, return immediately. The caller
1100 * is responsible for detecting the error status and failing the
1101 * tracker manually.
1102 */
1103 if (error != 0) {
1104 nvme_printf(tr->qpair->ctrlr,
1105 "nvme_payload_map err %d\n", error);
1106 return;
1107 }
1108
1109 /*
1110 * Note that we specified ctrlr->page_size for alignment and max
1111 * segment size when creating the bus dma tags. So here we can safely
1112 * just transfer each segment to its associated PRP entry.
1113 */
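/*
 * Illustrative example of the PRP rules applied below: with a 4 KiB
 * controller page size, a page-aligned 16 KiB transfer maps to four
 * segments; prp1 gets the first segment's address, and prp2 points at the
 * tracker's pre-allocated PRP list holding the remaining three addresses.
 * A two segment transfer instead places the second address directly in prp2.
 */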
1114 tr->req->cmd.prp1 = htole64(seg[0].ds_addr);
1115
1116 if (nseg == 2) {
1117 tr->req->cmd.prp2 = htole64(seg[1].ds_addr);
1118 } else if (nseg > 2) {
1119 cur_nseg = 1;
1120 tr->req->cmd.prp2 = htole64((uint64_t)tr->prp_bus_addr);
1121 while (cur_nseg < nseg) {
1122 tr->prp[cur_nseg-1] =
1123 htole64((uint64_t)seg[cur_nseg].ds_addr);
1124 cur_nseg++;
1125 }
1126 } else {
1127 /*
1128 * prp2 should not be used by the controller
1129 * since there is only one segment, but set
1130 * to 0 just to be safe.
1131 */
1132 tr->req->cmd.prp2 = 0;
1133 }
1134
1135 bus_dmamap_sync(tr->qpair->dma_tag_payload, tr->payload_dma_map,
1136 BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
1137 nvme_qpair_submit_tracker(tr->qpair, tr);
1138 }
1139
1140 static void
1141 _nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req)
1142 {
1143 struct nvme_tracker *tr;
1144 int err = 0;
1145
1146 mtx_assert(&qpair->lock, MA_OWNED);
1147
1148 tr = TAILQ_FIRST(&qpair->free_tr);
1149 req->qpair = qpair;
1150
1151 if (tr == NULL || qpair->recovery_state != RECOVERY_NONE) {
1152 /*
1153 * No tracker is available, or the qpair is disabled due to
1154 * an in-progress controller-level reset or controller
1155 * failure.
1156 */
1157
1158 if (qpair->ctrlr->is_failed) {
1159 /*
1160 * The controller has failed, so fail the request.
1161 */
1162 nvme_qpair_manual_complete_request(qpair, req,
1163 NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST);
1164 } else {
1165 /*
1166 * Put the request on the qpair's request queue to be
1167 * processed when a tracker frees up via a command
1168 * completion or when the controller reset is
1169 * completed.
1170 */
1171 STAILQ_INSERT_TAIL(&qpair->queued_req, req, stailq);
1172 }
1173 return;
1174 }
1175
1176 TAILQ_REMOVE(&qpair->free_tr, tr, tailq);
1177 TAILQ_INSERT_TAIL(&qpair->outstanding_tr, tr, tailq);
1178 tr->deadline = SBT_MAX;
1179 tr->req = req;
1180
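/*
 * Map the payload according to the request type: VADDR (kernel virtual
 * buffer), BIO (struct bio), CCB (CAM ccb), or NULL (no payload).  The
 * DMA load callback, nvme_payload_map(), submits the tracker once the
 * mapping completes; NULL requests are submitted immediately.
 */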
1181 switch (req->type) {
1182 case NVME_REQUEST_VADDR:
1183 KASSERT(req->payload_size <= qpair->ctrlr->max_xfer_size,
1184 ("payload_size (%d) exceeds max_xfer_size (%d)\n",
1185 req->payload_size, qpair->ctrlr->max_xfer_size));
1186 err = bus_dmamap_load(tr->qpair->dma_tag_payload,
1187 tr->payload_dma_map, req->u.payload, req->payload_size,
1188 nvme_payload_map, tr, 0);
1189 if (err != 0)
1190 nvme_printf(qpair->ctrlr,
1191 "bus_dmamap_load returned 0x%x!\n", err);
1192 break;
1193 case NVME_REQUEST_NULL:
1194 nvme_qpair_submit_tracker(tr->qpair, tr);
1195 break;
1196 case NVME_REQUEST_BIO:
1197 KASSERT(req->u.bio->bio_bcount <= qpair->ctrlr->max_xfer_size,
1198 ("bio->bio_bcount (%jd) exceeds max_xfer_size (%d)\n",
1199 (intmax_t)req->u.bio->bio_bcount,
1200 qpair->ctrlr->max_xfer_size));
1201 err = bus_dmamap_load_bio(tr->qpair->dma_tag_payload,
1202 tr->payload_dma_map, req->u.bio, nvme_payload_map, tr, 0);
1203 if (err != 0)
1204 nvme_printf(qpair->ctrlr,
1205 "bus_dmamap_load_bio returned 0x%x!\n", err);
1206 break;
1207 case NVME_REQUEST_CCB:
1208 err = bus_dmamap_load_ccb(tr->qpair->dma_tag_payload,
1209 tr->payload_dma_map, req->u.payload,
1210 nvme_payload_map, tr, 0);
1211 if (err != 0)
1212 nvme_printf(qpair->ctrlr,
1213 "bus_dmamap_load_ccb returned 0x%x!\n", err);
1214 break;
1215 default:
1216 panic("unknown nvme request type 0x%x\n", req->type);
1217 break;
1218 }
1219
1220 if (err != 0) {
1221 /*
1222 * The dmamap operation failed, so we manually fail the
1223 * tracker here with DATA_TRANSFER_ERROR status.
1224 *
1225 * nvme_qpair_manual_complete_tracker must not be called
1226 * with the qpair lock held.
1227 */
1228 mtx_unlock(&qpair->lock);
1229 nvme_qpair_manual_complete_tracker(tr, NVME_SCT_GENERIC,
1230 NVME_SC_DATA_TRANSFER_ERROR, DO_NOT_RETRY, ERROR_PRINT_ALL);
1231 mtx_lock(&qpair->lock);
1232 }
1233 }
1234
1235 void
1236 nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req)
1237 {
1238
1239 mtx_lock(&qpair->lock);
1240 _nvme_qpair_submit_request(qpair, req);
1241 mtx_unlock(&qpair->lock);
1242 }
1243
1244 static void
1245 nvme_qpair_enable(struct nvme_qpair *qpair)
1246 {
1247 mtx_assert(&qpair->lock, MA_OWNED);
1248
1249 qpair->recovery_state = RECOVERY_NONE;
1250 }
1251
1252 void
1253 nvme_qpair_reset(struct nvme_qpair *qpair)
1254 {
1255
1256 qpair->sq_head = qpair->sq_tail = qpair->cq_head = 0;
1257
1258 /*
1259 * First time through the completion queue, HW will set phase
1260 * bit on completions to 1. So set this to 1 here, indicating
1261 * we're looking for a 1 to know which entries have completed.
1262 * We'll toggle the bit each time the completion queue
1263 * rolls over.
1264 */
1265 qpair->phase = 1;
1266
1267 memset(qpair->cmd, 0,
1268 qpair->num_entries * sizeof(struct nvme_command));
1269 memset(qpair->cpl, 0,
1270 qpair->num_entries * sizeof(struct nvme_completion));
1271 }
1272
1273 void
1274 nvme_admin_qpair_enable(struct nvme_qpair *qpair)
1275 {
1276 struct nvme_tracker *tr;
1277 struct nvme_tracker *tr_temp;
1278
1279 /*
1280 * Manually abort each outstanding admin command. Do not retry
1281 * admin commands found here, since they will be left over from
1282 * a controller reset and it's likely the context in which the
1283 * command was issued no longer applies.
1284 */
1285 TAILQ_FOREACH_SAFE(tr, &qpair->outstanding_tr, tailq, tr_temp) {
1286 nvme_printf(qpair->ctrlr,
1287 "aborting outstanding admin command\n");
1288 nvme_qpair_manual_complete_tracker(tr, NVME_SCT_GENERIC,
1289 NVME_SC_ABORTED_BY_REQUEST, DO_NOT_RETRY, ERROR_PRINT_ALL);
1290 }
1291
1292 mtx_lock(&qpair->lock);
1293 nvme_qpair_enable(qpair);
1294 mtx_unlock(&qpair->lock);
1295 }
1296
1297 void
1298 nvme_io_qpair_enable(struct nvme_qpair *qpair)
1299 {
1300 STAILQ_HEAD(, nvme_request) temp;
1301 struct nvme_tracker *tr;
1302 struct nvme_tracker *tr_temp;
1303 struct nvme_request *req;
1304
1305 /*
1306 * Manually abort each outstanding I/O. This normally results in a
1307 * retry, unless the retry count on the associated request has
1308 * reached its limit.
1309 */
1310 TAILQ_FOREACH_SAFE(tr, &qpair->outstanding_tr, tailq, tr_temp) {
1311 nvme_printf(qpair->ctrlr, "aborting outstanding i/o\n");
1312 nvme_qpair_manual_complete_tracker(tr, NVME_SCT_GENERIC,
1313 NVME_SC_ABORTED_BY_REQUEST, 0, ERROR_PRINT_NO_RETRY);
1314 }
1315
1316 mtx_lock(&qpair->lock);
1317
1318 nvme_qpair_enable(qpair);
1319
1320 STAILQ_INIT(&temp);
1321 STAILQ_SWAP(&qpair->queued_req, &temp, nvme_request);
1322
1323 while (!STAILQ_EMPTY(&temp)) {
1324 req = STAILQ_FIRST(&temp);
1325 STAILQ_REMOVE_HEAD(&temp, stailq);
1326 nvme_printf(qpair->ctrlr, "resubmitting queued i/o\n");
1327 nvme_qpair_print_command(qpair, &req->cmd);
1328 _nvme_qpair_submit_request(qpair, req);
1329 }
1330
1331 mtx_unlock(&qpair->lock);
1332 }
1333
1334 static void
1335 nvme_qpair_disable(struct nvme_qpair *qpair)
1336 {
1337 struct nvme_tracker *tr, *tr_temp;
1338
1339 mtx_lock(&qpair->lock);
1340 qpair->recovery_state = RECOVERY_WAITING;
1341 TAILQ_FOREACH_SAFE(tr, &qpair->outstanding_tr, tailq, tr_temp) {
1342 tr->deadline = SBT_MAX;
1343 }
1344 mtx_unlock(&qpair->lock);
1345 }
1346
1347 void
1348 nvme_admin_qpair_disable(struct nvme_qpair *qpair)
1349 {
1350
1351 nvme_qpair_disable(qpair);
1352 nvme_admin_qpair_abort_aers(qpair);
1353 }
1354
1355 void
1356 nvme_io_qpair_disable(struct nvme_qpair *qpair)
1357 {
1358
1359 nvme_qpair_disable(qpair);
1360 }
1361
1362 void
1363 nvme_qpair_fail(struct nvme_qpair *qpair)
1364 {
1365 struct nvme_tracker *tr;
1366 struct nvme_request *req;
1367
1368 if (!mtx_initialized(&qpair->lock))
1369 return;
1370
1371 mtx_lock(&qpair->lock);
1372
1373 while (!STAILQ_EMPTY(&qpair->queued_req)) {
1374 req = STAILQ_FIRST(&qpair->queued_req);
1375 STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq);
1376 nvme_printf(qpair->ctrlr, "failing queued i/o\n");
1377 mtx_unlock(&qpair->lock);
1378 nvme_qpair_manual_complete_request(qpair, req, NVME_SCT_GENERIC,
1379 NVME_SC_ABORTED_BY_REQUEST);
1380 mtx_lock(&qpair->lock);
1381 }
1382
1383 /* Manually abort each outstanding I/O. */
1384 while (!TAILQ_EMPTY(&qpair->outstanding_tr)) {
1385 tr = TAILQ_FIRST(&qpair->outstanding_tr);
1386 /*
1387 * Do not remove the tracker. The abort_tracker path will
1388 * do that for us.
1389 */
1390 nvme_printf(qpair->ctrlr, "failing outstanding i/o\n");
1391 mtx_unlock(&qpair->lock);
1392 nvme_qpair_manual_complete_tracker(tr, NVME_SCT_GENERIC,
1393 NVME_SC_ABORTED_BY_REQUEST, DO_NOT_RETRY, ERROR_PRINT_ALL);
1394 mtx_lock(&qpair->lock);
1395 }
1396
1397 mtx_unlock(&qpair->lock);
1398 }