1 /*
2 * Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23 * SUCH DAMAGE.
24 */
25
26 /*
27 * $FreeBSD: releng/8.4/sys/dev/netmap/netmap.c 231742 2012-02-15 06:16:52Z luigi $
28 * $Id: netmap.c 9795 2011-12-02 11:39:08Z luigi $
29 *
30 * This module supports memory mapped access to network devices,
31 * see netmap(4).
32 *
33 * The module uses a large memory pool allocated by the kernel
34 * and accessible as mmapped memory by multiple userspace threads/processes.
35 * The memory pool contains packet buffers and "netmap rings",
36 * i.e. user-accessible copies of the interface's queues.
37 *
38 * Access to the network card works like this:
39 * 1. a process/thread issues one or more open() on /dev/netmap, to create
40 * select()able file descriptors on which events are reported.
41 * 2. on each descriptor, the process issues an ioctl() to identify
42 * the interface that should report events to the file descriptor.
43 * 3. on each descriptor, the process issues an mmap() request to
44 * map the shared memory region within the process' address space.
45 * The list of interesting queues is indicated by a location in
46 * the shared memory region.
47 * 4. using the functions in the netmap(4) userspace API, a process
48 * can look up the occupation state of a queue, access memory buffers,
49 * and retrieve received packets or enqueue packets to transmit.
50 * 5. using some ioctl()s the process can synchronize the userspace view
51 * of the queue with the actual status in the kernel. This includes both
52 * receiving the notification of new packets, and transmitting new
53 * packets on the output interface.
54 * 6. select() or poll() can be used to wait for events on individual
55 * transmit or receive queues (or all queues for a given interface).
56 */
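/*
 * A minimal userspace sketch of the sequence above (illustrative only,
 * not compiled here; error handling omitted). The macros and the nmreq
 * fields are the ones exported through <net/netmap.h> and
 * <net/netmap_user.h>, as documented in netmap(4); "em0" is just an
 * example interface name:
 *
 *	struct nmreq req;
 *	struct netmap_if *nifp;
 *	struct netmap_ring *ring;
 *	struct pollfd pfd;
 *	char *mem;
 *	int fd;
 *
 *	fd = open("/dev/netmap", O_RDWR);		// step 1
 *	bzero(&req, sizeof(req));
 *	strncpy(req.nr_name, "em0", sizeof(req.nr_name));
 *	ioctl(fd, NIOCREGIF, &req);			// step 2
 *	mem = mmap(0, req.nr_memsize, PROT_READ | PROT_WRITE,
 *	    MAP_SHARED, fd, 0);				// step 3
 *	nifp = NETMAP_IF(mem, req.nr_offset);		// step 4
 *	ring = NETMAP_TXRING(nifp, 0);
 *	for (;;) {
 *		pfd.fd = fd;
 *		pfd.events = POLLOUT;
 *		poll(&pfd, 1, -1);			// steps 5-6
 *		while (ring->avail > 0) {
 *			struct netmap_slot *slot = &ring->slot[ring->cur];
 *			char *buf = NETMAP_BUF(ring, slot->buf_idx);
 *			// ... write a frame into buf, set slot->len ...
 *			ring->cur = NETMAP_RING_NEXT(ring, ring->cur);
 *			ring->avail--;
 *		}
 *	}
 */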
57
58 #include <sys/cdefs.h> /* prerequisite */
59 __FBSDID("$FreeBSD: releng/8.4/sys/dev/netmap/netmap.c 231742 2012-02-15 06:16:52Z luigi $");
60
61 #include <sys/types.h>
62 #include <sys/module.h>
63 #include <sys/errno.h>
64 #include <sys/param.h> /* defines used in kernel.h */
65 #include <sys/jail.h>
66 #include <sys/kernel.h> /* types used in module initialization */
67 #include <sys/conf.h> /* cdevsw struct */
68 #include <sys/uio.h> /* uio struct */
69 #include <sys/sockio.h>
70 #include <sys/socketvar.h> /* struct socket */
71 #include <sys/malloc.h>
72 #include <sys/mman.h> /* PROT_EXEC */
73 #include <sys/poll.h>
74 #include <sys/proc.h>
75 #include <vm/vm.h> /* vtophys */
76 #include <vm/pmap.h> /* vtophys */
77 #include <sys/socket.h> /* sockaddrs */
78 #include <machine/bus.h>
79 #include <sys/selinfo.h>
80 #include <sys/sysctl.h>
81 #include <net/if.h>
82 #include <net/bpf.h> /* BIOCIMMEDIATE */
83 #include <net/vnet.h>
84 #include <net/netmap.h>
85 #include <dev/netmap/netmap_kern.h>
86 #include <machine/bus.h> /* bus_dmamap_* */
87
88 MALLOC_DEFINE(M_NETMAP, "netmap", "Network memory map");
89
90 /*
91 * lock and unlock for the netmap memory allocator
92 */
93 #define NMA_LOCK() mtx_lock(&netmap_mem_d->nm_mtx);
94 #define NMA_UNLOCK() mtx_unlock(&netmap_mem_d->nm_mtx);
95 struct netmap_mem_d;
96 static struct netmap_mem_d *netmap_mem_d; /* Our memory allocator. */
97
98 u_int netmap_total_buffers;
99 char *netmap_buffer_base; /* address of an invalid buffer */
100
101 /* user-controlled variables */
102 int netmap_verbose;
103
104 static int netmap_no_timestamp; /* don't timestamp on rxsync */
105
106 SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args");
107 SYSCTL_INT(_dev_netmap, OID_AUTO, verbose,
108 CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode");
109 SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp,
110 CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp");
111 int netmap_buf_size = 2048;
112 TUNABLE_INT("hw.netmap.buf_size", &netmap_buf_size);
113 SYSCTL_INT(_dev_netmap, OID_AUTO, buf_size,
114 CTLFLAG_RD, &netmap_buf_size, 0, "Size of packet buffers");
115 int netmap_mitigate = 1;
116 SYSCTL_INT(_dev_netmap, OID_AUTO, mitigate, CTLFLAG_RW, &netmap_mitigate, 0, "");
117 int netmap_no_pendintr;
118 SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr,
119 CTLFLAG_RW, &netmap_no_pendintr, 0, "Always look for new received packets.");
120
121
122
123 /*----- memory allocator -----------------*/
124 /*
125 * Here we have the low level routines for memory allocator
126 * and its primary users.
127 */
128
129 /*
130 * Default amount of memory pre-allocated by the module.
131 * We start with a large size and then shrink our demand
132 * according to what is available when the module is loaded.
133 * At the moment the block is contiguous, but we can easily
134 * restrict our demand to smaller units (16..64k)
135 */
136 #define NETMAP_MEMORY_SIZE (64 * 1024 * 4096)
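/*
 * Rough arithmetic for the default: 64 * 1024 * 4096 bytes = 256 MB.
 * With the 1/200 split applied in netmap_memory_init(), about 1.3 MB
 * is reserved for netmap_if and ring metadata, and the remainder holds
 * on the order of 130000 packet buffers of NETMAP_BUF_SIZE (2 KB) each.
 */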
137 static void * netmap_malloc(size_t size, const char *msg);
138 static void netmap_free(void *addr, const char *msg);
139
140 #define netmap_if_malloc(len) netmap_malloc(len, "nifp")
141 #define netmap_if_free(v) netmap_free((v), "nifp")
142
143 #define netmap_ring_malloc(len) netmap_malloc(len, "ring")
144 #define netmap_free_rings(na) \
145 netmap_free((na)->tx_rings[0].ring, "shadow rings");
146
147 /*
148 * Allocator for a pool of packet buffers. For each buffer we have
149 * one entry in the bitmap to signal the state. Allocation scans
150 * the bitmap, but since this is done only on attach, we are not
151 * too worried about performance
152 * XXX if we need to allocate small blocks, a translation
153 * table is used both for kernel virtual address and physical
154 * addresses.
155 */
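/*
 * Bitmap encoding: buffer i is tracked by bit (i % 32) of word (i / 32),
 * so e.g. buffer 37 maps to bit 5 of bitmap[1]; a set bit means "free".
 */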
156 struct netmap_buf_pool {
157 u_int total_buffers; /* total buffers. */
158 u_int free;
159 u_int bufsize;
160 char *base; /* buffer base address */
161 uint32_t *bitmap; /* one bit per buffer, 1 means free */
162 };
163 struct netmap_buf_pool nm_buf_pool;
164 SYSCTL_INT(_dev_netmap, OID_AUTO, total_buffers,
165 CTLFLAG_RD, &nm_buf_pool.total_buffers, 0, "total_buffers");
166 SYSCTL_INT(_dev_netmap, OID_AUTO, free_buffers,
167 CTLFLAG_RD, &nm_buf_pool.free, 0, "free_buffers");
168
169
170
171
172 /*
173 * Allocate n buffers from the ring, and fill the slot.
174 * Buffer 0 is the 'junk' buffer.
175 */
176 static void
177 netmap_new_bufs(struct netmap_if *nifp __unused,
178 struct netmap_slot *slot, u_int n)
179 {
180 struct netmap_buf_pool *p = &nm_buf_pool;
181 uint32_t bi = 0; /* index in the bitmap */
182 uint32_t mask, j, i = 0; /* slot counter */
183
184 if (n > p->free) {
185 D("only %d out of %d buffers available", i, n);
186 return;
187 }
188 /* termination is guaranteed by p->free */
189 while (i < n && p->free > 0) {
190 uint32_t cur = p->bitmap[bi];
191 if (cur == 0) { /* bitmask is fully used */
192 bi++;
193 continue;
194 }
195 /* locate a slot */
196 for (j = 0, mask = 1; (cur & mask) == 0; j++, mask <<= 1) ;
197 p->bitmap[bi] &= ~mask; /* slot in use */
198 p->free--;
199 slot[i].buf_idx = bi*32+j;
200 slot[i].len = p->bufsize;
201 slot[i].flags = NS_BUF_CHANGED;
202 i++;
203 }
204 ND("allocated %d buffers, %d available", n, p->free);
205 }
206
207
208 static void
209 netmap_free_buf(struct netmap_if *nifp __unused, uint32_t i)
210 {
211 struct netmap_buf_pool *p = &nm_buf_pool;
212
213 uint32_t pos, mask;
214 if (i >= p->total_buffers) {
215 D("invalid free index %d", i);
216 return;
217 }
218 pos = i / 32;
219 mask = 1 << (i % 32);
220 if (p->bitmap[pos] & mask) {
221 D("slot %d already free", i);
222 return;
223 }
224 p->bitmap[pos] |= mask;
225 p->free++;
226 }
227
228
229 /* Descriptor of the memory objects handled by our memory allocator. */
230 struct netmap_mem_obj {
231 TAILQ_ENTRY(netmap_mem_obj) nmo_next; /* next object in the
232 chain. */
233 int nmo_used; /* flag set on used memory objects. */
234 size_t nmo_size; /* size of the memory area reserved for the
235 object. */
236 void *nmo_data; /* pointer to the memory area. */
237 };
238
239 /* Wrap our memory objects to make them ``chainable``. */
240 TAILQ_HEAD(netmap_mem_obj_h, netmap_mem_obj);
241
242
243 /* Descriptor of our custom memory allocator. */
244 struct netmap_mem_d {
245 struct mtx nm_mtx; /* lock used to handle the chain of memory
246 objects. */
247 struct netmap_mem_obj_h nm_molist; /* list of memory objects */
248 size_t nm_size; /* total amount of memory used for rings etc. */
249 size_t nm_totalsize; /* total amount of allocated memory
250 (the difference is used for buffers) */
251 size_t nm_buf_start; /* offset of packet buffers.
252 This is page-aligned. */
253 size_t nm_buf_len; /* total memory for buffers */
254 void *nm_buffer; /* pointer to the whole pre-allocated memory
255 area. */
256 };
257
258 /* Shorthand to compute a netmap interface offset. */
259 #define netmap_if_offset(v) \
260 ((char *) (v) - (char *) netmap_mem_d->nm_buffer)
261 /* .. and get a physical address given a memory offset */
262 #define netmap_ofstophys(o) \
263 (vtophys(netmap_mem_d->nm_buffer) + (o))
264
265
266 /*------ netmap memory allocator -------*/
267 /*
268 * Request for a chunk of memory.
269 *
270 * Memory objects are arranged into a list, hence we need to walk this
271 * list until we find an object with the needed amount of data free.
272 * This sounds like a completely inefficient implementation, but
273 * since allocation is only done once, at setup time, we can live
274 * with it.
275 *
276 * Return NULL on failure.
277 */
278 static void *
279 netmap_malloc(size_t size, __unused const char *msg)
280 {
281 struct netmap_mem_obj *mem_obj, *new_mem_obj;
282 void *ret = NULL;
283
284 NMA_LOCK();
285 TAILQ_FOREACH(mem_obj, &netmap_mem_d->nm_molist, nmo_next) {
286 if (mem_obj->nmo_used != 0 || mem_obj->nmo_size < size)
287 continue;
288
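/*
 * Found a free chunk big enough: carve the request out of its head.
 * The new descriptor tracks the (used) head, while the existing
 * descriptor keeps whatever is left of the tail.
 */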
289 new_mem_obj = malloc(sizeof(struct netmap_mem_obj), M_NETMAP,
290 M_WAITOK | M_ZERO);
291 TAILQ_INSERT_BEFORE(mem_obj, new_mem_obj, nmo_next);
292
293 new_mem_obj->nmo_used = 1;
294 new_mem_obj->nmo_size = size;
295 new_mem_obj->nmo_data = mem_obj->nmo_data;
296 memset(new_mem_obj->nmo_data, 0, new_mem_obj->nmo_size);
297
298 mem_obj->nmo_size -= size;
299 mem_obj->nmo_data = (char *) mem_obj->nmo_data + size;
300 if (mem_obj->nmo_size == 0) {
301 TAILQ_REMOVE(&netmap_mem_d->nm_molist, mem_obj,
302 nmo_next);
303 free(mem_obj, M_NETMAP);
304 }
305
306 ret = new_mem_obj->nmo_data;
307
308 break;
309 }
310 NMA_UNLOCK();
311 ND("%s: %d bytes at %p", msg, size, ret);
312
313 return (ret);
314 }
315
316 /*
317 * Return the memory to the allocator.
318 *
319 * While freeing a memory object, we try to merge adjacent chunks in
320 * order to reduce memory fragmentation.
321 */
322 static void
323 netmap_free(void *addr, const char *msg)
324 {
325 size_t size;
326 struct netmap_mem_obj *cur, *prev, *next;
327
328 if (addr == NULL) {
329 D("NULL addr for %s", msg);
330 return;
331 }
332
333 NMA_LOCK();
334 TAILQ_FOREACH(cur, &netmap_mem_d->nm_molist, nmo_next) {
335 if (cur->nmo_data == addr && cur->nmo_used)
336 break;
337 }
338 if (cur == NULL) {
339 NMA_UNLOCK();
340 D("invalid addr %s %p", msg, addr);
341 return;
342 }
343
344 size = cur->nmo_size;
345 cur->nmo_used = 0;
346
347 /* merge current chunk of memory with the previous one,
348 if present. */
349 prev = TAILQ_PREV(cur, netmap_mem_obj_h, nmo_next);
350 if (prev && prev->nmo_used == 0) {
351 TAILQ_REMOVE(&netmap_mem_d->nm_molist, cur, nmo_next);
352 prev->nmo_size += cur->nmo_size;
353 free(cur, M_NETMAP);
354 cur = prev;
355 }
356
357 /* merge with the next one */
358 next = TAILQ_NEXT(cur, nmo_next);
359 if (next && next->nmo_used == 0) {
360 TAILQ_REMOVE(&netmap_mem_d->nm_molist, next, nmo_next);
361 cur->nmo_size += next->nmo_size;
362 free(next, M_NETMAP);
363 }
364 NMA_UNLOCK();
365 ND("freed %s %d bytes at %p", msg, size, addr);
366 }
367
368
369 /*
370 * Create and return a new ``netmap_if`` object, and possibly also
371 * rings and packet buffers.
372 *
373 * Return NULL on failure.
374 */
375 static void *
376 netmap_if_new(const char *ifname, struct netmap_adapter *na)
377 {
378 struct netmap_if *nifp;
379 struct netmap_ring *ring;
380 char *buff;
381 u_int i, len, ofs;
382 u_int n = na->num_queues + 1; /* shorthand, include stack queue */
383
384 /*
385 * the descriptor is followed inline by an array of offsets
386 * to the tx and rx rings in the shared memory region.
387 */
388 len = sizeof(struct netmap_if) + 2 * n * sizeof(ssize_t);
389 nifp = netmap_if_malloc(len);
390 if (nifp == NULL)
391 return (NULL);
392
393 /* initialize base fields */
394 *(int *)(uintptr_t)&nifp->ni_num_queues = na->num_queues;
395 strncpy(nifp->ni_name, ifname, IFNAMSIZ);
396
397 (na->refcount)++; /* XXX atomic ? we are under lock */
398 if (na->refcount > 1)
399 goto final;
400
401 /*
402 * If this is the first instance, allocate the shadow rings and
403 * buffers for this card (one for each hw queue, one for the host).
404 * The rings are contiguous, but have variable size.
405 * The entire block is reachable at
406 * na->tx_rings[0].ring
407 */
408
409 len = n * (2 * sizeof(struct netmap_ring) +
410 (na->num_tx_desc + na->num_rx_desc) *
411 sizeof(struct netmap_slot) );
412 buff = netmap_ring_malloc(len);
413 if (buff == NULL) {
414 D("failed to allocate %d bytes for %s shadow ring",
415 len, ifname);
416 error:
417 (na->refcount)--;
418 netmap_if_free(nifp);
419 return (NULL);
420 }
421 /* do we have the buffers? we need num_tx_desc buffers for
422 * each tx ring and num_rx_desc buffers for each rx ring. */
423 len = n * (na->num_tx_desc + na->num_rx_desc);
424 NMA_LOCK();
425 if (nm_buf_pool.free < len) {
426 NMA_UNLOCK();
427 netmap_free(buff, "not enough bufs");
428 goto error;
429 }
430 /*
431 * in the kring, store the pointers to the shared rings
432 * and initialize the rings. We are under NMA_LOCK().
433 */
434 ofs = 0;
435 for (i = 0; i < n; i++) {
436 struct netmap_kring *kring;
437 int numdesc;
438
439 /* Transmit rings */
440 kring = &na->tx_rings[i];
441 numdesc = na->num_tx_desc;
442 bzero(kring, sizeof(*kring));
443 kring->na = na;
444
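/*
 * ring->buf_ofs is the signed offset from this ring to the start of
 * the buffer pool; userspace adds it to the ring address (see
 * NETMAP_BUF() in net/netmap_user.h) to locate packet buffers without
 * knowing absolute addresses.
 */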
445 ring = kring->ring = (struct netmap_ring *)(buff + ofs);
446 *(ssize_t *)(uintptr_t)&ring->buf_ofs =
447 nm_buf_pool.base - (char *)ring;
448 ND("txring[%d] at %p ofs %d", i, ring, ring->buf_ofs);
449 *(uint32_t *)(uintptr_t)&ring->num_slots =
450 kring->nkr_num_slots = numdesc;
451
452 /*
453 * IMPORTANT:
454 * Always keep one slot empty, so we can detect new
455 * transmissions comparing cur and nr_hwcur (they are
456 * the same only if there are no new transmissions).
457 */
458 ring->avail = kring->nr_hwavail = numdesc - 1;
459 ring->cur = kring->nr_hwcur = 0;
460 *(uint16_t *)(uintptr_t)&ring->nr_buf_size = NETMAP_BUF_SIZE;
461 netmap_new_bufs(nifp, ring->slot, numdesc);
462
463 ofs += sizeof(struct netmap_ring) +
464 numdesc * sizeof(struct netmap_slot);
465
466 /* Receive rings */
467 kring = &na->rx_rings[i];
468 numdesc = na->num_rx_desc;
469 bzero(kring, sizeof(*kring));
470 kring->na = na;
471
472 ring = kring->ring = (struct netmap_ring *)(buff + ofs);
473 *(ssize_t *)(uintptr_t)&ring->buf_ofs =
474 nm_buf_pool.base - (char *)ring;
475 ND("rxring[%d] at %p offset %d", i, ring, ring->buf_ofs);
476 *(uint32_t *)(uintptr_t)&ring->num_slots =
477 kring->nkr_num_slots = numdesc;
478 ring->cur = kring->nr_hwcur = 0;
479 ring->avail = kring->nr_hwavail = 0; /* empty */
480 *(uint16_t *)(uintptr_t)&ring->nr_buf_size = NETMAP_BUF_SIZE;
481 netmap_new_bufs(nifp, ring->slot, numdesc);
482 ofs += sizeof(struct netmap_ring) +
483 numdesc * sizeof(struct netmap_slot);
484 }
485 NMA_UNLOCK();
486 for (i = 0; i < n+1; i++) {
487 // XXX initialize the selrecord structs.
488 }
489 final:
490 /*
491 * fill the slots for the rx and tx queues. They contain the offset
492 * between the ring and nifp, so the information is usable in
493 * userspace to reach the ring from the nifp.
494 */
495 for (i = 0; i < n; i++) {
496 char *base = (char *)nifp;
497 *(ssize_t *)(uintptr_t)&nifp->ring_ofs[i] =
498 (char *)na->tx_rings[i].ring - base;
499 *(ssize_t *)(uintptr_t)&nifp->ring_ofs[i+n] =
500 (char *)na->rx_rings[i].ring - base;
501 }
502 return (nifp);
503 }
504
505 /*
506 * Initialize the memory allocator.
507 *
508 * Create the descriptor for the memory, allocate the pool of memory
509 * and initialize the list of memory objects with a single chunk
510 * containing the whole pre-allocated memory marked as free.
511 *
512 * Start with a large size, then halve as needed if we fail to
513 * allocate the block. While halving, always add one extra page
514 * because buffers 0 and 1 are used for special purposes.
515 * Return 0 on success, errno otherwise.
516 */
517 static int
518 netmap_memory_init(void)
519 {
520 struct netmap_mem_obj *mem_obj;
521 void *buf = NULL;
522 int i, n, sz = NETMAP_MEMORY_SIZE;
523 int extra_sz = 0; // space for rings and two spare buffers
524
525 for (; sz >= 1<<20; sz >>=1) {
526 extra_sz = sz/200;
527 extra_sz = (extra_sz + 2*PAGE_SIZE - 1) & ~(PAGE_SIZE-1);
528 buf = contigmalloc(sz + extra_sz,
529 M_NETMAP,
530 M_WAITOK | M_ZERO,
531 0, /* low address */
532 -1UL, /* high address */
533 PAGE_SIZE, /* alignment */
534 0 /* boundary */
535 );
536 if (buf)
537 break;
538 }
539 if (buf == NULL)
540 return (ENOMEM);
541 sz += extra_sz;
542 netmap_mem_d = malloc(sizeof(struct netmap_mem_d), M_NETMAP,
543 M_WAITOK | M_ZERO);
544 mtx_init(&netmap_mem_d->nm_mtx, "netmap memory allocator lock", NULL,
545 MTX_DEF);
546 TAILQ_INIT(&netmap_mem_d->nm_molist);
547 netmap_mem_d->nm_buffer = buf;
548 netmap_mem_d->nm_totalsize = sz;
549
550 /*
551 * A buffer takes 2k, a slot takes 8 bytes + ring overhead,
552 * so the ratio is 200:1. In other words, we can use 1/200 of
553 * the memory for the rings, and the rest for the buffers,
554 * and be sure we never run out.
555 */
556 netmap_mem_d->nm_size = sz/200;
557 netmap_mem_d->nm_buf_start =
558 (netmap_mem_d->nm_size + PAGE_SIZE - 1) & ~(PAGE_SIZE-1);
559 netmap_mem_d->nm_buf_len = sz - netmap_mem_d->nm_buf_start;
560
561 nm_buf_pool.base = netmap_mem_d->nm_buffer;
562 nm_buf_pool.base += netmap_mem_d->nm_buf_start;
563 netmap_buffer_base = nm_buf_pool.base;
564 D("netmap_buffer_base %p (offset %d)",
565 netmap_buffer_base, (int)netmap_mem_d->nm_buf_start);
566 /* number of buffers, they all start as free */
567
568 netmap_total_buffers = nm_buf_pool.total_buffers =
569 netmap_mem_d->nm_buf_len / NETMAP_BUF_SIZE;
570 nm_buf_pool.bufsize = NETMAP_BUF_SIZE;
571
572 D("Have %d MB, use %dKB for rings, %d buffers at %p",
573 (sz >> 20), (int)(netmap_mem_d->nm_size >> 10),
574 nm_buf_pool.total_buffers, nm_buf_pool.base);
575
576 /* allocate and initialize the bitmap. Entries 0 and 1 are considered
577 * always busy (used as defaults when there are no buffers left).
578 */
579 n = (nm_buf_pool.total_buffers + 31) / 32;
580 nm_buf_pool.bitmap = malloc(sizeof(uint32_t) * n, M_NETMAP,
581 M_WAITOK | M_ZERO);
582 nm_buf_pool.bitmap[0] = ~3; /* slot 0 and 1 always busy */
583 for (i = 1; i < n; i++)
584 nm_buf_pool.bitmap[i] = ~0;
585 nm_buf_pool.free = nm_buf_pool.total_buffers - 2;
586
587 mem_obj = malloc(sizeof(struct netmap_mem_obj), M_NETMAP,
588 M_WAITOK | M_ZERO);
589 TAILQ_INSERT_HEAD(&netmap_mem_d->nm_molist, mem_obj, nmo_next);
590 mem_obj->nmo_used = 0;
591 mem_obj->nmo_size = netmap_mem_d->nm_size;
592 mem_obj->nmo_data = netmap_mem_d->nm_buffer;
593
594 return (0);
595 }
596
597
598 /*
599 * Finalize the memory allocator.
600 *
601 * Free all the memory objects contained inside the list, and deallocate
602 * the pool of memory; finally free the memory allocator descriptor.
603 */
604 static void
605 netmap_memory_fini(void)
606 {
607 struct netmap_mem_obj *mem_obj;
608
609 while (!TAILQ_EMPTY(&netmap_mem_d->nm_molist)) {
610 mem_obj = TAILQ_FIRST(&netmap_mem_d->nm_molist);
611 TAILQ_REMOVE(&netmap_mem_d->nm_molist, mem_obj, nmo_next);
612 if (mem_obj->nmo_used == 1) {
613 printf("netmap: leaked %d bytes at %p\n",
614 (int)mem_obj->nmo_size,
615 mem_obj->nmo_data);
616 }
617 free(mem_obj, M_NETMAP);
618 }
619 contigfree(netmap_mem_d->nm_buffer, netmap_mem_d->nm_totalsize, M_NETMAP);
620 // XXX mutex_destroy(nm_mtx);
621 free(netmap_mem_d, M_NETMAP);
622 }
623 /*------------- end of memory allocator -----------------*/
624
625
626 /* Structure associated to each thread which registered an interface. */
627 struct netmap_priv_d {
628 struct netmap_if *np_nifp; /* netmap interface descriptor. */
629
630 struct ifnet *np_ifp; /* device for which we hold a reference */
631 int np_ringid; /* from the ioctl */
632 u_int np_qfirst, np_qlast; /* range of rings to scan */
633 uint16_t np_txpoll;
634 };
635
636
637 static struct cdev *netmap_dev; /* /dev/netmap character device. */
638
639
640 static d_mmap_t netmap_mmap;
641 static d_ioctl_t netmap_ioctl;
642 static d_poll_t netmap_poll;
643
644 #ifdef NETMAP_KEVENT
645 static d_kqfilter_t netmap_kqfilter;
646 #endif
647
648 static struct cdevsw netmap_cdevsw = {
649 .d_version = D_VERSION,
650 .d_name = "netmap",
651 .d_mmap = netmap_mmap,
652 .d_ioctl = netmap_ioctl,
653 .d_poll = netmap_poll,
654 #ifdef NETMAP_KEVENT
655 .d_kqfilter = netmap_kqfilter,
656 #endif
657 };
658
659 #ifdef NETMAP_KEVENT
660 static int netmap_kqread(struct knote *, long);
661 static int netmap_kqwrite(struct knote *, long);
662 static void netmap_kqdetach(struct knote *);
663
664 static struct filterops netmap_read_filterops = {
665 .f_isfd = 1,
666 .f_attach = NULL,
667 .f_detach = netmap_kqdetach,
668 .f_event = netmap_kqread,
669 };
670
671 static struct filterops netmap_write_filterops = {
672 .f_isfd = 1,
673 .f_attach = NULL,
674 .f_detach = netmap_kqdetach,
675 .f_event = netmap_kqwrite,
676 };
677
678 /*
679 * support for the kevent() system call.
680 *
681 * This is the kevent filter, and is executed each time a new event
682 * is triggered on the device. This function executes some operation
683 * depending on the received filter.
684 *
685 * The implementation should test the filters and should implement
686 * filter operations we are interested in (a full list in /sys/event.h).
687 *
688 * On a match we should:
689 * - set kn->kn_fop
690 * - set kn->kn_hook
691 * - call knlist_add() to deliver the event to the application.
692 *
693 * Return 0 if the event should be delivered to the application.
694 */
695 static int
696 netmap_kqfilter(struct cdev *dev, struct knote *kn)
697 {
698 /* declare variables needed to read/write */
699
700 switch(kn->kn_filter) {
701 case EVFILT_READ:
702 if (netmap_verbose)
703 D("%s kqfilter: EVFILT_READ" ifp->if_xname);
704
705 /* read operations */
706 kn->kn_fop = &netmap_read_filterops;
707 break;
708
709 case EVFILT_WRITE:
710 if (netmap_verbose)
711 D("%s kqfilter: EVFILT_WRITE" ifp->if_xname);
712
713 /* write operations */
714 kn->kn_fop = &netmap_write_filterops;
715 break;
716
717 default:
718 if (netmap_verbose)
719 D("%s kqfilter: invalid filter" ifp->if_xname);
720 return(EINVAL);
721 }
722
723 kn->kn_hook = 0;//
724 knlist_add(&netmap_sc->tun_rsel.si_note, kn, 0);
725
726 return (0);
727 }
728 #endif /* NETMAP_KEVENT */
729
730
731 /*
732 * File descriptor's private data destructor.
733 *
734 * Call nm_register(ifp,0) to stop netmap mode on the interface and
735 * revert to normal operation. We expect that np_ifp has not gone away.
736 */
737 static void
738 netmap_dtor_locked(void *data)
739 {
740 struct netmap_priv_d *priv = data;
741 struct ifnet *ifp = priv->np_ifp;
742 struct netmap_adapter *na = NA(ifp);
743 struct netmap_if *nifp = priv->np_nifp;
744
745 na->refcount--;
746 if (na->refcount <= 0) { /* last instance */
747 u_int i;
748
749 D("deleting last netmap instance for %s", ifp->if_xname);
750 /*
751 * there is a race here with *_netmap_task() and
752 * netmap_poll(), which don't run under NETMAP_REG_LOCK.
753 * na->refcount == 0 && na->ifp->if_capenable & IFCAP_NETMAP
754 * (aka NETMAP_DELETING(na)) are a unique marker that the
755 * device is dying.
756 * Before destroying stuff we sleep a bit, and then complete
757 * the job. NIOCREG should realize the condition and
758 * loop until they can continue; the other routines
759 * should check the condition at entry and quit if
760 * they cannot run.
761 */
762 na->nm_lock(ifp, NETMAP_REG_UNLOCK, 0);
763 tsleep(na, 0, "NIOCUNREG", 4);
764 na->nm_lock(ifp, NETMAP_REG_LOCK, 0);
765 na->nm_register(ifp, 0); /* off, clear IFCAP_NETMAP */
766 /* Wake up any sleeping threads. netmap_poll will
767 * then return POLLERR
768 */
769 for (i = 0; i < na->num_queues + 2; i++) {
770 selwakeuppri(&na->tx_rings[i].si, PI_NET);
771 selwakeuppri(&na->rx_rings[i].si, PI_NET);
772 }
773 /* release all buffers */
774 NMA_LOCK();
775 for (i = 0; i < na->num_queues + 1; i++) {
776 int j, lim;
777 struct netmap_ring *ring;
778
779 ND("tx queue %d", i);
780 ring = na->tx_rings[i].ring;
781 lim = na->tx_rings[i].nkr_num_slots;
782 for (j = 0; j < lim; j++)
783 netmap_free_buf(nifp, ring->slot[j].buf_idx);
784
785 ND("rx queue %d", i);
786 ring = na->rx_rings[i].ring;
787 lim = na->rx_rings[i].nkr_num_slots;
788 for (j = 0; j < lim; j++)
789 netmap_free_buf(nifp, ring->slot[j].buf_idx);
790 }
791 NMA_UNLOCK();
792 netmap_free_rings(na);
793 wakeup(na);
794 }
795 netmap_if_free(nifp);
796 }
797
798
799 static void
800 netmap_dtor(void *data)
801 {
802 struct netmap_priv_d *priv = data;
803 struct ifnet *ifp = priv->np_ifp;
804 struct netmap_adapter *na = NA(ifp);
805
806 na->nm_lock(ifp, NETMAP_REG_LOCK, 0);
807 netmap_dtor_locked(data);
808 na->nm_lock(ifp, NETMAP_REG_UNLOCK, 0);
809
810 if_rele(ifp);
811 bzero(priv, sizeof(*priv)); /* XXX for safety */
812 free(priv, M_DEVBUF);
813 }
814
815
816 /*
817 * mmap(2) support for the "netmap" device.
818 *
819 * Expose all the memory previously allocated by our custom memory
820 * allocator: this way the user has only to issue a single mmap(2), and
821 * can work on all the data structures flawlessly.
822 *
823 * Return 0 on success, -1 otherwise.
824 */
825 static int
826 #if __FreeBSD_version < 900000
827 netmap_mmap(__unused struct cdev *dev, vm_offset_t offset, vm_paddr_t *paddr,
828 int nprot)
829 #else
830 netmap_mmap(__unused struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr,
831 int nprot, __unused vm_memattr_t *memattr)
832 #endif
833 {
834 if (nprot & PROT_EXEC)
835 return (-1); // XXX -1 or EINVAL ?
836
837 ND("request for offset 0x%x", (uint32_t)offset);
838 *paddr = netmap_ofstophys(offset);
839
840 return (0);
841 }
842
843
844 /*
845 * Handlers for synchronization of the queues from/to the host.
846 *
847 * netmap_sync_to_host() passes packets up. We are called from a
848 * system call in user process context, and the only contention
849 * can be among multiple user threads erroneously calling
850 * this routine concurrently. In principle we should not even
851 * need to lock.
852 */
853 static void
854 netmap_sync_to_host(struct netmap_adapter *na)
855 {
856 struct netmap_kring *kring = &na->tx_rings[na->num_queues];
857 struct netmap_ring *ring = kring->ring;
858 struct mbuf *head = NULL, *tail = NULL, *m;
859 u_int k, n, lim = kring->nkr_num_slots - 1;
860
861 k = ring->cur;
862 if (k > lim) {
863 netmap_ring_reinit(kring);
864 return;
865 }
866 // na->nm_lock(na->ifp, NETMAP_CORE_LOCK, 0);
867
868 /* Take packets from hwcur to cur and pass them up.
869 * In case of no buffers we give up. At the end of the loop,
870 * the queue is drained in all cases.
871 */
872 for (n = kring->nr_hwcur; n != k;) {
873 struct netmap_slot *slot = &ring->slot[n];
874
875 n = (n == lim) ? 0 : n + 1;
876 if (slot->len < 14 || slot->len > NETMAP_BUF_SIZE) {
877 D("bad pkt at %d len %d", n, slot->len);
878 continue;
879 }
880 m = m_devget(NMB(slot), slot->len, 0, na->ifp, NULL);
881
882 if (m == NULL)
883 break;
884 if (tail)
885 tail->m_nextpkt = m;
886 else
887 head = m;
888 tail = m;
889 m->m_nextpkt = NULL;
890 }
891 kring->nr_hwcur = k;
892 kring->nr_hwavail = ring->avail = lim;
893 // na->nm_lock(na->ifp, NETMAP_CORE_UNLOCK, 0);
894
895 /* send packets up, outside the lock */
896 while ((m = head) != NULL) {
897 head = head->m_nextpkt;
898 m->m_nextpkt = NULL;
899 if (netmap_verbose & NM_VERB_HOST)
900 D("sending up pkt %p size %d", m, MBUF_LEN(m));
901 NM_SEND_UP(na->ifp, m);
902 }
903 }
904
905 /*
906 * rxsync backend for packets coming from the host stack.
907 * They have been put in the queue by netmap_start() so we
908 * need to protect access to the kring using a lock.
909 *
910 * This routine also does the selrecord if called from the poll handler
911 * (we know because td != NULL).
912 */
913 static void
914 netmap_sync_from_host(struct netmap_adapter *na, struct thread *td)
915 {
916 struct netmap_kring *kring = &na->rx_rings[na->num_queues];
917 struct netmap_ring *ring = kring->ring;
918 int error = 1, delta;
919 u_int k = ring->cur, lim = kring->nkr_num_slots;
920
921 na->nm_lock(na->ifp, NETMAP_CORE_LOCK, 0);
922 if (k >= lim) /* bad value */
923 goto done;
924 delta = k - kring->nr_hwcur;
925 if (delta < 0)
926 delta += lim;
927 kring->nr_hwavail -= delta;
928 if (kring->nr_hwavail < 0) /* error */
929 goto done;
930 kring->nr_hwcur = k;
931 error = 0;
932 k = ring->avail = kring->nr_hwavail;
933 if (k == 0 && td)
934 selrecord(td, &kring->si);
935 if (k && (netmap_verbose & NM_VERB_HOST))
936 D("%d pkts from stack", k);
937 done:
938 na->nm_lock(na->ifp, NETMAP_CORE_UNLOCK, 0);
939 if (error)
940 netmap_ring_reinit(kring);
941 }
942
943
944 /*
945 * get a refcounted reference to an interface.
946 * Return ENXIO if the interface does not exist, EINVAL if netmap
947 * is not supported by the interface.
948 * If successful, hold a reference.
949 */
950 static int
951 get_ifp(const char *name, struct ifnet **ifp)
952 {
953 *ifp = ifunit_ref(name);
954 if (*ifp == NULL)
955 return (ENXIO);
956 /* can do this if the capability exists and if_pspare[0]
957 * points to the netmap descriptor.
958 */
959 if ((*ifp)->if_capabilities & IFCAP_NETMAP && NA(*ifp))
960 return 0; /* valid pointer, we hold the refcount */
961 if_rele(*ifp);
962 return EINVAL; // not NETMAP capable
963 }
964
965
966 /*
967 * Error routine called when txsync/rxsync detects an error.
968 * Can't do much more than resetting cur = hwcur, avail = hwavail.
969 * Return 1 on reinit.
970 *
971 * This routine is only called by the upper half of the kernel.
972 * It only reads hwcur (which is changed only by the upper half, too)
973 * and hwavail (which may be changed by the lower half, but only on
974 * a tx ring and only to increase it, so any error will be recovered
975 * on the next call). For the above, we don't strictly need to call
976 * it under lock.
977 */
978 int
979 netmap_ring_reinit(struct netmap_kring *kring)
980 {
981 struct netmap_ring *ring = kring->ring;
982 u_int i, lim = kring->nkr_num_slots - 1;
983 int errors = 0;
984
985 D("called for %s", kring->na->ifp->if_xname);
986 if (ring->cur > lim)
987 errors++;
988 for (i = 0; i <= lim; i++) {
989 u_int idx = ring->slot[i].buf_idx;
990 u_int len = ring->slot[i].len;
991 if (idx < 2 || idx >= netmap_total_buffers) {
992 if (!errors++)
993 D("bad buffer at slot %d idx %d len %d ", i, idx, len);
994 ring->slot[i].buf_idx = 0;
995 ring->slot[i].len = 0;
996 } else if (len > NETMAP_BUF_SIZE) {
997 ring->slot[i].len = 0;
998 if (!errors++)
999 D("bad len %d at slot %d idx %d",
1000 len, i, idx);
1001 }
1002 }
1003 if (errors) {
1004 int pos = kring - kring->na->tx_rings;
1005 int n = kring->na->num_queues + 2;
1006
1007 D("total %d errors", errors);
1008 errors++;
1009 D("%s %s[%d] reinit, cur %d -> %d avail %d -> %d",
1010 kring->na->ifp->if_xname,
1011 pos < n ? "TX" : "RX", pos < n ? pos : pos - n,
1012 ring->cur, kring->nr_hwcur,
1013 ring->avail, kring->nr_hwavail);
1014 ring->cur = kring->nr_hwcur;
1015 ring->avail = kring->nr_hwavail;
1016 }
1017 return (errors ? 1 : 0);
1018 }
1019
1020
1021 /*
1022 * Set the ring ID. For devices with a single queue, a request
1023 * for all rings is the same as a single ring.
1024 */
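/*
 * Encoding examples: ringid == 0 selects all hardware rings,
 * (NETMAP_HW_RING | i) selects hardware ring i only, NETMAP_SW_RING
 * selects the host ("software") ring, and OR-ing NETMAP_NO_TX_POLL
 * disables the implicit txsync performed by poll().
 */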
1025 static int
1026 netmap_set_ringid(struct netmap_priv_d *priv, u_int ringid)
1027 {
1028 struct ifnet *ifp = priv->np_ifp;
1029 struct netmap_adapter *na = NA(ifp);
1030 u_int i = ringid & NETMAP_RING_MASK;
1031 /* first time we don't lock */
1032 int need_lock = (priv->np_qfirst != priv->np_qlast);
1033
1034 if ( (ringid & NETMAP_HW_RING) && i >= na->num_queues) {
1035 D("invalid ring id %d", i);
1036 return (EINVAL);
1037 }
1038 if (need_lock)
1039 na->nm_lock(ifp, NETMAP_CORE_LOCK, 0);
1040 priv->np_ringid = ringid;
1041 if (ringid & NETMAP_SW_RING) {
1042 priv->np_qfirst = na->num_queues;
1043 priv->np_qlast = na->num_queues + 1;
1044 } else if (ringid & NETMAP_HW_RING) {
1045 priv->np_qfirst = i;
1046 priv->np_qlast = i + 1;
1047 } else {
1048 priv->np_qfirst = 0;
1049 priv->np_qlast = na->num_queues;
1050 }
1051 priv->np_txpoll = (ringid & NETMAP_NO_TX_POLL) ? 0 : 1;
1052 if (need_lock)
1053 na->nm_lock(ifp, NETMAP_CORE_UNLOCK, 0);
1054 if (ringid & NETMAP_SW_RING)
1055 D("ringid %s set to SW RING", ifp->if_xname);
1056 else if (ringid & NETMAP_HW_RING)
1057 D("ringid %s set to HW RING %d", ifp->if_xname,
1058 priv->np_qfirst);
1059 else
1060 D("ringid %s set to all %d HW RINGS", ifp->if_xname,
1061 priv->np_qlast);
1062 return 0;
1063 }
1064
1065 /*
1066 * ioctl(2) support for the "netmap" device.
1067 *
1068 * Following a list of accepted commands:
1069 * - NIOCGINFO
1070 * - SIOCGIFADDR just for convenience
1071 * - NIOCREGIF
1072 * - NIOCUNREGIF
1073 * - NIOCTXSYNC
1074 * - NIOCRXSYNC
1075 *
1076 * Return 0 on success, errno otherwise.
1077 */
1078 static int
1079 netmap_ioctl(__unused struct cdev *dev, u_long cmd, caddr_t data,
1080 __unused int fflag, struct thread *td)
1081 {
1082 struct netmap_priv_d *priv = NULL;
1083 struct ifnet *ifp;
1084 struct nmreq *nmr = (struct nmreq *) data;
1085 struct netmap_adapter *na;
1086 int error;
1087 u_int i;
1088 struct netmap_if *nifp;
1089
1090 CURVNET_SET(TD_TO_VNET(td));
1091
1092 error = devfs_get_cdevpriv((void **)&priv);
1093 if (error != ENOENT && error != 0) {
1094 CURVNET_RESTORE();
1095 return (error);
1096 }
1097
1098 error = 0; /* Could be ENOENT */
1099 switch (cmd) {
1100 case NIOCGINFO: /* return capabilities etc */
1101 /* memsize is always valid */
1102 nmr->nr_memsize = netmap_mem_d->nm_totalsize;
1103 nmr->nr_offset = 0;
1104 nmr->nr_numrings = 0;
1105 nmr->nr_numslots = 0;
1106 if (nmr->nr_name[0] == '\0') /* just get memory info */
1107 break;
1108 error = get_ifp(nmr->nr_name, &ifp); /* get a refcount */
1109 if (error)
1110 break;
1111 na = NA(ifp); /* retrieve netmap_adapter */
1112 nmr->nr_numrings = na->num_queues;
1113 nmr->nr_numslots = na->num_tx_desc;
1114 if_rele(ifp); /* return the refcount */
1115 break;
1116
1117 case NIOCREGIF:
1118 if (priv != NULL) { /* thread already registered */
1119 error = netmap_set_ringid(priv, nmr->nr_ringid);
1120 break;
1121 }
1122 /* find the interface and a reference */
1123 error = get_ifp(nmr->nr_name, &ifp); /* keep reference */
1124 if (error)
1125 break;
1126 na = NA(ifp); /* retrieve netmap adapter */
1127 /*
1128 * Allocate the private per-thread structure.
1129 * XXX perhaps we can use a blocking malloc ?
1130 */
1131 priv = malloc(sizeof(struct netmap_priv_d), M_DEVBUF,
1132 M_NOWAIT | M_ZERO);
1133 if (priv == NULL) {
1134 error = ENOMEM;
1135 if_rele(ifp); /* return the refcount */
1136 break;
1137 }
1138
1139 for (i = 10; i > 0; i--) {
1140 na->nm_lock(ifp, NETMAP_REG_LOCK, 0);
1141 if (!NETMAP_DELETING(na))
1142 break;
1143 na->nm_lock(ifp, NETMAP_REG_UNLOCK, 0);
1144 tsleep(na, 0, "NIOCREGIF", hz/10);
1145 }
1146 if (i == 0) {
1147 D("too many NIOCREGIF attempts, give up");
1148 error = EINVAL;
1149 free(priv, M_DEVBUF);
1150 if_rele(ifp); /* return the refcount */
1151 break;
1152 }
1153
1154 priv->np_ifp = ifp; /* store the reference */
1155 error = netmap_set_ringid(priv, nmr->nr_ringid);
1156 if (error)
1157 goto error;
1158 priv->np_nifp = nifp = netmap_if_new(nmr->nr_name, na);
1159 if (nifp == NULL) { /* allocation failed */
1160 error = ENOMEM;
1161 } else if (ifp->if_capenable & IFCAP_NETMAP) {
1162 /* was already set */
1163 } else {
1164 /* Otherwise set the card in netmap mode
1165 * and make it use the shared buffers.
1166 */
1167 error = na->nm_register(ifp, 1); /* mode on */
1168 if (error)
1169 netmap_dtor_locked(priv);
1170 }
1171
1172 if (error) { /* reg. failed, release priv and ref */
1173 error:
1174 na->nm_lock(ifp, NETMAP_REG_UNLOCK, 0);
1175 if_rele(ifp); /* return the refcount */
1176 bzero(priv, sizeof(*priv));
1177 free(priv, M_DEVBUF);
1178 break;
1179 }
1180
1181 na->nm_lock(ifp, NETMAP_REG_UNLOCK, 0);
1182 error = devfs_set_cdevpriv(priv, netmap_dtor);
1183
1184 if (error != 0) {
1185 /* could not assign the private storage for the
1186 * thread, call the destructor explicitly.
1187 */
1188 netmap_dtor(priv);
1189 break;
1190 }
1191
1192 /* return the offset of the netmap_if object */
1193 nmr->nr_numrings = na->num_queues;
1194 nmr->nr_numslots = na->num_tx_desc;
1195 nmr->nr_memsize = netmap_mem_d->nm_totalsize;
1196 nmr->nr_offset = netmap_if_offset(nifp);
1197 break;
1198
1199 case NIOCUNREGIF:
1200 if (priv == NULL) {
1201 error = ENXIO;
1202 break;
1203 }
1204
1205 /* the interface is unregistered inside the
1206 destructor of the private data. */
1207 devfs_clear_cdevpriv();
1208 break;
1209
1210 case NIOCTXSYNC:
1211 case NIOCRXSYNC:
1212 if (priv == NULL) {
1213 error = ENXIO;
1214 break;
1215 }
1216 ifp = priv->np_ifp; /* we have a reference */
1217 na = NA(ifp); /* retrieve netmap adapter */
1218
1219 if (priv->np_qfirst == na->num_queues) {
1220 /* queues to/from host */
1221 if (cmd == NIOCTXSYNC)
1222 netmap_sync_to_host(na);
1223 else
1224 netmap_sync_from_host(na, NULL);
1225 break;
1226 }
1227
1228 for (i = priv->np_qfirst; i < priv->np_qlast; i++) {
1229 if (cmd == NIOCTXSYNC) {
1230 struct netmap_kring *kring = &na->tx_rings[i];
1231 if (netmap_verbose & NM_VERB_TXSYNC)
1232 D("sync tx ring %d cur %d hwcur %d",
1233 i, kring->ring->cur,
1234 kring->nr_hwcur);
1235 na->nm_txsync(ifp, i, 1 /* do lock */);
1236 if (netmap_verbose & NM_VERB_TXSYNC)
1237 D("after sync tx ring %d cur %d hwcur %d",
1238 i, kring->ring->cur,
1239 kring->nr_hwcur);
1240 } else {
1241 na->nm_rxsync(ifp, i, 1 /* do lock */);
1242 microtime(&na->rx_rings[i].ring->ts);
1243 }
1244 }
1245
1246 break;
1247
1248 case BIOCIMMEDIATE:
1249 case BIOCGHDRCMPLT:
1250 case BIOCSHDRCMPLT:
1251 case BIOCSSEESENT:
1252 D("ignore BIOCIMMEDIATE/BIOCSHDRCMPLT/BIOCSHDRCMPLT/BIOCSSEESENT");
1253 break;
1254
1255 default:
1256 {
1257 /*
1258 * allow device calls
1259 */
1260 struct socket so;
1261 bzero(&so, sizeof(so));
1262 error = get_ifp(nmr->nr_name, &ifp); /* keep reference */
1263 if (error)
1264 break;
1265 so.so_vnet = ifp->if_vnet;
1266 // so->so_proto not null.
1267 error = ifioctl(&so, cmd, data, td);
1268 if_rele(ifp);
1269 }
1270 }
1271
1272 CURVNET_RESTORE();
1273 return (error);
1274 }
1275
1276
1277 /*
1278 * select(2) and poll(2) handlers for the "netmap" device.
1279 *
1280 * Can be called for one or more queues.
1281 * Return the event mask corresponding to ready events.
1282 * If there are no ready events, do a selrecord on either individual
1283 * selfd or on the global one.
1284 * Device-dependent parts (locking and sync of tx/rx rings)
1285 * are done through callbacks.
1286 */
1287 static int
1288 netmap_poll(__unused struct cdev *dev, int events, struct thread *td)
1289 {
1290 struct netmap_priv_d *priv = NULL;
1291 struct netmap_adapter *na;
1292 struct ifnet *ifp;
1293 struct netmap_kring *kring;
1294 u_int core_lock, i, check_all, want_tx, want_rx, revents = 0;
1295 enum {NO_CL, NEED_CL, LOCKED_CL }; /* see below */
1296
1297 if (devfs_get_cdevpriv((void **)&priv) != 0 || priv == NULL)
1298 return POLLERR;
1299
1300 ifp = priv->np_ifp;
1301 // XXX check for deleting() ?
1302 if ( (ifp->if_capenable & IFCAP_NETMAP) == 0)
1303 return POLLERR;
1304
1305 if (netmap_verbose & 0x8000)
1306 D("device %s events 0x%x", ifp->if_xname, events);
1307 want_tx = events & (POLLOUT | POLLWRNORM);
1308 want_rx = events & (POLLIN | POLLRDNORM);
1309
1310 na = NA(ifp); /* retrieve netmap adapter */
1311
1312 /* how many queues we are scanning */
1313 i = priv->np_qfirst;
1314 if (i == na->num_queues) { /* from/to host */
1315 if (priv->np_txpoll || want_tx) {
1316 /* push any packets up, then we are always ready */
1317 kring = &na->tx_rings[i];
1318 netmap_sync_to_host(na);
1319 revents |= want_tx;
1320 }
1321 if (want_rx) {
1322 kring = &na->rx_rings[i];
1323 if (kring->ring->avail == 0)
1324 netmap_sync_from_host(na, td);
1325 if (kring->ring->avail > 0) {
1326 revents |= want_rx;
1327 }
1328 }
1329 return (revents);
1330 }
1331
1332 /*
1333 * check_all is set if the card has more than one queue and
1334 * the client is polling all of them. If true, we sleep on
1335 * the "global" selfd, otherwise we sleep on individual selfd
1336 * (we can only sleep on one of them per direction).
1337 * The interrupt routine in the driver should always wake on
1338 * the individual selfd, and also on the global one if the card
1339 * has more than one ring.
1340 *
1341 * If the card has only one lock, we just use that.
1342 * If the card has separate ring locks, we just use those
1343 * unless we are doing check_all, in which case the whole
1344 * loop is wrapped by the global lock.
1345 * We acquire locks only when necessary: if poll is called
1346 * when buffers are available, we can just return without locks.
1347 *
1348 * rxsync() is only called if we run out of buffers on a POLLIN.
1349 * txsync() is called if we run out of buffers on POLLOUT, or
1350 * there are pending packets to send. The latter can be disabled
1351 * passing NETMAP_NO_TX_POLL in the NIOCREG call.
1352 */
1353 check_all = (i + 1 != priv->np_qlast);
1354
1355 /*
1356 * core_lock indicates what to do with the core lock.
1357 * The core lock is used when either the card has no individual
1358 * locks, or it has individual locks but we are checking all
1359 * rings so we need the core lock to avoid missing wakeup events.
1360 *
1361 * It has three possible states:
1362 * NO_CL we don't need to use the core lock, e.g.
1363 * because we are protected by individual locks.
1364 * NEED_CL we need the core lock. In this case, when we
1365 * call the lock routine, move to LOCKED_CL
1366 * to remember to release the lock once done.
1367 * LOCKED_CL core lock is set, so we need to release it.
1368 */
1369 core_lock = (check_all || !na->separate_locks) ? NEED_CL : NO_CL;
1370 /*
1371 * We start with a lock free round which is good if we have
1372 * data available. If this fails, then lock and call the sync
1373 * routines.
1374 */
1375 for (i = priv->np_qfirst; want_rx && i < priv->np_qlast; i++) {
1376 kring = &na->rx_rings[i];
1377 if (kring->ring->avail > 0) {
1378 revents |= want_rx;
1379 want_rx = 0; /* also breaks the loop */
1380 }
1381 }
1382 for (i = priv->np_qfirst; want_tx && i < priv->np_qlast; i++) {
1383 kring = &na->tx_rings[i];
1384 if (kring->ring->avail > 0) {
1385 revents |= want_tx;
1386 want_tx = 0; /* also breaks the loop */
1387 }
1388 }
1389
1390 /*
1391 * If we want to push packets out (priv->np_txpoll) or want_tx is
1392 * still set, we do need to run the txsync calls (on all rings,
1393 * to avoid that the tx rings stall).
1394 */
1395 if (priv->np_txpoll || want_tx) {
1396 for (i = priv->np_qfirst; i < priv->np_qlast; i++) {
1397 kring = &na->tx_rings[i];
1398 /*
1399 * Skip the current ring if want_tx == 0
1400 * (we have already done a successful sync on
1401 * a previous ring) AND kring->cur == kring->hwcur
1402 * (there are no pending transmissions for this ring).
1403 */
1404 if (!want_tx && kring->ring->cur == kring->nr_hwcur)
1405 continue;
1406 if (core_lock == NEED_CL) {
1407 na->nm_lock(ifp, NETMAP_CORE_LOCK, 0);
1408 core_lock = LOCKED_CL;
1409 }
1410 if (na->separate_locks)
1411 na->nm_lock(ifp, NETMAP_TX_LOCK, i);
1412 if (netmap_verbose & NM_VERB_TXSYNC)
1413 D("send %d on %s %d",
1414 kring->ring->cur,
1415 ifp->if_xname, i);
1416 if (na->nm_txsync(ifp, i, 0 /* no lock */))
1417 revents |= POLLERR;
1418
1419 /* Check avail/call selrecord only if called with POLLOUT */
1420 if (want_tx) {
1421 if (kring->ring->avail > 0) {
1422 /* stop at the first ring. We don't risk
1423 * starvation.
1424 */
1425 revents |= want_tx;
1426 want_tx = 0;
1427 } else if (!check_all)
1428 selrecord(td, &kring->si);
1429 }
1430 if (na->separate_locks)
1431 na->nm_lock(ifp, NETMAP_TX_UNLOCK, i);
1432 }
1433 }
1434
1435 /*
1436 * now if want_rx is still set we need to lock and rxsync.
1437 * Do it on all rings because otherwise we starve.
1438 */
1439 if (want_rx) {
1440 for (i = priv->np_qfirst; i < priv->np_qlast; i++) {
1441 kring = &na->rx_rings[i];
1442 if (core_lock == NEED_CL) {
1443 na->nm_lock(ifp, NETMAP_CORE_LOCK, 0);
1444 core_lock = LOCKED_CL;
1445 }
1446 if (na->separate_locks)
1447 na->nm_lock(ifp, NETMAP_RX_LOCK, i);
1448
1449 if (na->nm_rxsync(ifp, i, 0 /* no lock */))
1450 revents |= POLLERR;
1451 if (netmap_no_timestamp == 0 ||
1452 kring->ring->flags & NR_TIMESTAMP) {
1453 microtime(&kring->ring->ts);
1454 }
1455
1456 if (kring->ring->avail > 0)
1457 revents |= want_rx;
1458 else if (!check_all)
1459 selrecord(td, &kring->si);
1460 if (na->separate_locks)
1461 na->nm_lock(ifp, NETMAP_RX_UNLOCK, i);
1462 }
1463 }
1464 if (check_all && revents == 0) {
1465 i = na->num_queues + 1; /* the global queue */
1466 if (want_tx)
1467 selrecord(td, &na->tx_rings[i].si);
1468 if (want_rx)
1469 selrecord(td, &na->rx_rings[i].si);
1470 }
1471 if (core_lock == LOCKED_CL)
1472 na->nm_lock(ifp, NETMAP_CORE_UNLOCK, 0);
1473
1474 return (revents);
1475 }
1476
1477 /*------- driver support routines ------*/
1478
1479 /*
1480 * default lock wrapper. On linux we use mostly netmap-specific locks.
1481 */
1482 static void
1483 netmap_lock_wrapper(struct ifnet *_a, int what, u_int queueid)
1484 {
1485 struct netmap_adapter *na = NA(_a);
1486
1487 switch (what) {
1488 #ifndef __FreeBSD__ /* some systems do not need a lock on register */
1489 case NETMAP_REG_LOCK:
1490 case NETMAP_REG_UNLOCK:
1491 break;
1492 #endif
1493
1494 case NETMAP_CORE_LOCK:
1495 mtx_lock(&na->core_lock);
1496 break;
1497
1498 case NETMAP_CORE_UNLOCK:
1499 mtx_unlock(&na->core_lock);
1500 break;
1501
1502 case NETMAP_TX_LOCK:
1503 mtx_lock(&na->tx_rings[queueid].q_lock);
1504 break;
1505
1506 case NETMAP_TX_UNLOCK:
1507 mtx_unlock(&na->tx_rings[queueid].q_lock);
1508 break;
1509
1510 case NETMAP_RX_LOCK:
1511 mtx_lock(&na->rx_rings[queueid].q_lock);
1512 break;
1513
1514 case NETMAP_RX_UNLOCK:
1515 mtx_unlock(&na->rx_rings[queueid].q_lock);
1516 break;
1517 }
1518 }
1519
1520
1521 /*
1522 * Initialize a ``netmap_adapter`` object created by driver on attach.
1523 * We allocate a block of memory with room for a struct netmap_adapter
1524 * plus two sets of N+2 struct netmap_kring (where N is the number
1525 * of hardware rings):
1526 * krings 0..N-1 are for the hardware queues.
1527 * kring N is for the host stack queue
1528 * kring N+1 is only used for the selinfo for all queues.
1529 * Return 0 on success, ENOMEM otherwise.
1530 */
1531 int
1532 netmap_attach(struct netmap_adapter *na, int num_queues)
1533 {
1534 int n = num_queues + 2;
1535 int size = sizeof(*na) + 2 * n * sizeof(struct netmap_kring);
1536 void *buf;
1537 struct ifnet *ifp = na->ifp;
1538 int i;
1539
1540 if (ifp == NULL) {
1541 D("ifp not set, giving up");
1542 return EINVAL;
1543 }
1544 na->refcount = 0;
1545 na->num_queues = num_queues;
1546
1547 buf = malloc(size, M_DEVBUF, M_NOWAIT | M_ZERO);
1548 if (buf) {
1549 WNA(ifp) = buf;
1550 na->tx_rings = (void *)((char *)buf + sizeof(*na));
1551 na->rx_rings = na->tx_rings + n;
1552 na->buff_size = NETMAP_BUF_SIZE;
1553 bcopy(na, buf, sizeof(*na));
1554 ifp->if_capabilities |= IFCAP_NETMAP;
1555
1556 na = buf;
1557 if (na->nm_lock == NULL)
1558 na->nm_lock = netmap_lock_wrapper;
1559 mtx_init(&na->core_lock, "netmap core lock", NULL, MTX_DEF);
1560 for (i = 0 ; i < num_queues; i++)
1561 mtx_init(&na->tx_rings[i].q_lock, "netmap txq lock", NULL, MTX_DEF);
1562 for (i = 0 ; i < num_queues; i++)
1563 mtx_init(&na->rx_rings[i].q_lock, "netmap rxq lock", NULL, MTX_DEF);
1564 }
1565 D("%s for %s", buf ? "ok" : "failed", ifp->if_xname);
1566
1567 return (buf ? 0 : ENOMEM);
1568 }
1569
1570
1571 /*
1572 * Free the allocated memory linked to the given ``netmap_adapter``
1573 * object.
1574 */
1575 void
1576 netmap_detach(struct ifnet *ifp)
1577 {
1578 u_int i;
1579 struct netmap_adapter *na = NA(ifp);
1580
1581 if (!na)
1582 return;
1583
1584 for (i = 0; i < na->num_queues + 2; i++) {
1585 knlist_destroy(&na->tx_rings[i].si.si_note);
1586 knlist_destroy(&na->rx_rings[i].si.si_note);
1587 }
1588 bzero(na, sizeof(*na));
1589 WNA(ifp) = NULL;
1590 free(na, M_DEVBUF);
1591 }
1592
1593
1594 /*
1595 * Intercept packets from the network stack and pass them
1596 * to netmap as incoming packets on the 'software' ring.
1597 * We are not locked when called.
1598 */
1599 int
1600 netmap_start(struct ifnet *ifp, struct mbuf *m)
1601 {
1602 struct netmap_adapter *na = NA(ifp);
1603 struct netmap_kring *kring = &na->rx_rings[na->num_queues];
1604 u_int i, len = MBUF_LEN(m);
1605 int error = EBUSY, lim = kring->nkr_num_slots - 1;
1606 struct netmap_slot *slot;
1607
1608 if (netmap_verbose & NM_VERB_HOST)
1609 D("%s packet %d len %d from the stack", ifp->if_xname,
1610 kring->nr_hwcur + kring->nr_hwavail, len);
1611 na->nm_lock(ifp, NETMAP_CORE_LOCK, 0);
1612 if (kring->nr_hwavail >= lim) {
1613 D("stack ring %s full\n", ifp->if_xname);
1614 goto done; /* no space */
1615 }
1616 if (len > na->buff_size) {
1617 D("drop packet size %d > %d", len, na->buff_size);
1618 goto done; /* too long for us */
1619 }
1620
1621 /* compute the insert position */
1622 i = kring->nr_hwcur + kring->nr_hwavail;
1623 if (i > lim)
1624 i -= lim + 1;
1625 slot = &kring->ring->slot[i];
1626 m_copydata(m, 0, len, NMB(slot));
1627 slot->len = len;
1628 kring->nr_hwavail++;
1629 if (netmap_verbose & NM_VERB_HOST)
1630 D("wake up host ring %s %d", na->ifp->if_xname, na->num_queues);
1631 selwakeuppri(&kring->si, PI_NET);
1632 error = 0;
1633 done:
1634 na->nm_lock(ifp, NETMAP_CORE_UNLOCK, 0);
1635
1636 /* release the mbuf in both the success and failure cases. As an
1637 * alternative, put the mbuf in a free list and free the list
1638 * only when really necessary.
1639 */
1640 m_freem(m);
1641
1642 return (error);
1643 }
1644
1645
1646 /*
1647 * netmap_reset() is called by the driver routines when reinitializing
1648 * a ring. The driver is in charge of locking to protect the kring.
1649 * If netmap mode is not set just return NULL.
1650 */
1651 struct netmap_slot *
1652 netmap_reset(struct netmap_adapter *na, enum txrx tx, int n,
1653 u_int new_cur)
1654 {
1655 struct netmap_kring *kring;
1656 struct netmap_ring *ring;
1657 int new_hwofs, lim;
1658
1659 if (na == NULL)
1660 return NULL; /* no netmap support here */
1661 if (!(na->ifp->if_capenable & IFCAP_NETMAP))
1662 return NULL; /* nothing to reinitialize */
1663 kring = tx == NR_TX ? na->tx_rings + n : na->rx_rings + n;
1664 ring = kring->ring;
1665 lim = kring->nkr_num_slots - 1;
1666
1667 if (tx == NR_TX)
1668 new_hwofs = kring->nr_hwcur - new_cur;
1669 else
1670 new_hwofs = kring->nr_hwcur + kring->nr_hwavail - new_cur;
1671 if (new_hwofs > lim)
1672 new_hwofs -= lim + 1;
1673
1674 /* Always set the new offset value and realign the ring. */
1675 kring->nkr_hwofs = new_hwofs;
1676 if (tx == NR_TX)
1677 kring->nr_hwavail = kring->nkr_num_slots - 1;
1678 D("new hwofs %d on %s %s[%d]",
1679 kring->nkr_hwofs, na->ifp->if_xname,
1680 tx == NR_TX ? "TX" : "RX", n);
1681
1682 /*
1683 * We do the wakeup here, but the ring is not yet reconfigured.
1684 * However, we are under lock so there are no races.
1685 */
1686 selwakeuppri(&kring->si, PI_NET);
1687 selwakeuppri(&kring[na->num_queues + 1 - n].si, PI_NET);
1688 return kring->ring->slot;
1689 }
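/*
 * Driver-side usage sketch (hypothetical names, for illustration only):
 * when re-initializing ring ring_nr of a netmap-capable NIC, the driver
 * would do something along the lines of
 *
 *	struct netmap_slot *slot = netmap_reset(na, NR_TX, ring_nr, 0);
 *	if (slot != NULL) {
 *		// netmap mode: program descriptor l with the netmap buffer
 *		uint64_t paddr = vtophys(NMB(slot + l));
 *		// ... write paddr into the hardware descriptor ...
 *	}
 */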
1690
1691
1692 /*
1693 * Default functions to handle rx/tx interrupts
1694 * we have 4 cases:
1695 * 1 ring, single lock:
1696 * lock(core); wake(i=0); unlock(core)
1697 * N rings, single lock:
1698 * lock(core); wake(i); wake(N+1) unlock(core)
1699 * 1 ring, separate locks: (i=0)
1700 * lock(i); wake(i); unlock(i)
1701 * N rings, separate locks:
1702 * lock(i); wake(i); unlock(i); lock(core) wake(N+1) unlock(core)
1703 */
1704 int netmap_rx_irq(struct ifnet *ifp, int q, int *work_done)
1705 {
1706 struct netmap_adapter *na;
1707 struct netmap_kring *r;
1708
1709 if (!(ifp->if_capenable & IFCAP_NETMAP))
1710 return 0;
1711 na = NA(ifp);
1712 r = work_done ? na->rx_rings : na->tx_rings;
1713 if (na->separate_locks) {
1714 mtx_lock(&r[q].q_lock);
1715 selwakeuppri(&r[q].si, PI_NET);
1716 mtx_unlock(&r[q].q_lock);
1717 if (na->num_queues > 1) {
1718 mtx_lock(&na->core_lock);
1719 selwakeuppri(&r[na->num_queues + 1].si, PI_NET);
1720 mtx_unlock(&na->core_lock);
1721 }
1722 } else {
1723 mtx_lock(&na->core_lock);
1724 selwakeuppri(&r[q].si, PI_NET);
1725 if (na->num_queues > 1)
1726 selwakeuppri(&r[na->num_queues + 1].si, PI_NET);
1727 mtx_unlock(&na->core_lock);
1728 }
1729 if (work_done)
1730 *work_done = 1; /* do not fire napi again */
1731 return 1;
1732 }
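/*
 * Typical driver-side use (sketch; "sc", "rxr" and "me" are placeholders
 * for the driver's own softc and ring state): the rx interrupt handler
 * calls this first and returns early when the ring is in netmap mode,
 * since userspace is woken up by the call itself, e.g.
 *
 *	if (netmap_rx_irq(sc->ifp, rxr->me, &work_done))
 *		return;
 */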
1733
1734 /*
1735 * Module loader.
1736 *
1737 * Create the /dev/netmap device and initialize all global
1738 * variables.
1739 *
1740 * Return 0 on success, errno on failure.
1741 */
1742 static int
1743 netmap_init(void)
1744 {
1745 int error;
1746
1747
1748 error = netmap_memory_init();
1749 if (error != 0) {
1750 printf("netmap: unable to initialize the memory allocator.");
1751 return (error);
1752 }
1753 printf("netmap: loaded module with %d Mbytes\n",
1754 (int)(netmap_mem_d->nm_totalsize >> 20));
1755
1756 netmap_dev = make_dev(&netmap_cdevsw, 0, UID_ROOT, GID_WHEEL, 0660,
1757 "netmap");
1758
1759 return (0);
1760 }
1761
1762
1763 /*
1764 * Module unloader.
1765 *
1766 * Free all the memory, and destroy the ``/dev/netmap`` device.
1767 */
1768 static void
1769 netmap_fini(void)
1770 {
1771 destroy_dev(netmap_dev);
1772
1773 netmap_memory_fini();
1774
1775 printf("netmap: unloaded module.\n");
1776 }
1777
1778
1779 /*
1780 * Kernel entry point.
1781 *
1782 * Initialize/finalize the module and return.
1783 *
1784 * Return 0 on success, errno on failure.
1785 */
1786 static int
1787 netmap_loader(__unused struct module *module, int event, __unused void *arg)
1788 {
1789 int error = 0;
1790
1791 switch (event) {
1792 case MOD_LOAD:
1793 error = netmap_init();
1794 break;
1795
1796 case MOD_UNLOAD:
1797 netmap_fini();
1798 break;
1799
1800 default:
1801 error = EOPNOTSUPP;
1802 break;
1803 }
1804
1805 return (error);
1806 }
1807
1808
1809 DEV_MODULE(netmap, netmap_loader, NULL);