1 /*-
2 * Copyright (c) 2010-2012 Citrix Inc.
3 * Copyright (c) 2009-2012,2016-2017 Microsoft Corp.
4 * Copyright (c) 2012 NetApp Inc.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice unmodified, this list of conditions, and the following
12 * disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29 /*-
30 * Copyright (c) 2004-2006 Kip Macy
31 * All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 *
42 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52 * SUCH DAMAGE.
53 */
54
55 #include <sys/cdefs.h>
56 __FBSDID("$FreeBSD$");
57
58 #include "opt_hn.h"
59 #include "opt_inet6.h"
60 #include "opt_inet.h"
61 #include "opt_rss.h"
62
63 #include <sys/param.h>
64 #include <sys/systm.h>
65 #include <sys/bus.h>
66 #include <sys/counter.h>
67 #include <sys/kernel.h>
68 #include <sys/limits.h>
69 #include <sys/malloc.h>
70 #include <sys/mbuf.h>
71 #include <sys/module.h>
72 #include <sys/queue.h>
73 #include <sys/lock.h>
74 #include <sys/proc.h>
75 #include <sys/rmlock.h>
76 #include <sys/sbuf.h>
77 #include <sys/sched.h>
78 #include <sys/smp.h>
79 #include <sys/socket.h>
80 #include <sys/sockio.h>
81 #include <sys/sx.h>
82 #include <sys/sysctl.h>
83 #include <sys/taskqueue.h>
84 #include <sys/buf_ring.h>
85 #include <sys/eventhandler.h>
86 #include <sys/epoch.h>
87
88 #include <machine/atomic.h>
89 #include <machine/in_cksum.h>
90
91 #include <net/bpf.h>
92 #include <net/ethernet.h>
93 #include <net/if.h>
94 #include <net/if_dl.h>
95 #include <net/if_media.h>
96 #include <net/if_types.h>
97 #include <net/if_var.h>
98 #include <net/rndis.h>
99 #ifdef RSS
100 #include <net/rss_config.h>
101 #endif
102
103 #include <netinet/in_systm.h>
104 #include <netinet/in.h>
105 #include <netinet/ip.h>
106 #include <netinet/ip6.h>
107 #include <netinet/tcp.h>
108 #include <netinet/tcp_lro.h>
109 #include <netinet/udp.h>
110
111 #include <dev/hyperv/include/hyperv.h>
112 #include <dev/hyperv/include/hyperv_busdma.h>
113 #include <dev/hyperv/include/vmbus.h>
114 #include <dev/hyperv/include/vmbus_xact.h>
115
116 #include <dev/hyperv/netvsc/ndis.h>
117 #include <dev/hyperv/netvsc/if_hnreg.h>
118 #include <dev/hyperv/netvsc/if_hnvar.h>
119 #include <dev/hyperv/netvsc/hn_nvs.h>
120 #include <dev/hyperv/netvsc/hn_rndis.h>
121
122 #include "vmbus_if.h"
123
124 #define HN_IFSTART_SUPPORT
125
126 #define HN_RING_CNT_DEF_MAX 8
127
128 #define HN_VFMAP_SIZE_DEF 8
129
130 #define HN_XPNT_VF_ATTWAIT_MIN 2 /* seconds */
131
132 /* YYY should get it from the underlying channel */
133 #define HN_TX_DESC_CNT 512
134
135 #define HN_RNDIS_PKT_LEN \
136 (sizeof(struct rndis_packet_msg) + \
137 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) + \
138 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) + \
139 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) + \
140 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
141 #define HN_RNDIS_PKT_BOUNDARY PAGE_SIZE
142 #define HN_RNDIS_PKT_ALIGN CACHE_LINE_SIZE
143
144 #define HN_TX_DATA_BOUNDARY PAGE_SIZE
145 #define HN_TX_DATA_MAXSIZE IP_MAXPACKET
146 #define HN_TX_DATA_SEGSIZE PAGE_SIZE
147 /* -1 for RNDIS packet message */
148 #define HN_TX_DATA_SEGCNT_MAX (HN_GPACNT_MAX - 1)
149
150 #define HN_DIRECT_TX_SIZE_DEF 128
151
152 #define HN_EARLY_TXEOF_THRESH 8
153
154 #define HN_PKTBUF_LEN_DEF (16 * 1024)
155
156 #define HN_LROENT_CNT_DEF 128
157
158 #define HN_LRO_LENLIM_MULTIRX_DEF (12 * ETHERMTU)
159 #define HN_LRO_LENLIM_DEF (25 * ETHERMTU)
160 /* YYY 2*MTU is a bit rough, but should be good enough. */
161 #define HN_LRO_LENLIM_MIN(ifp) (2 * (ifp)->if_mtu)
162
163 #define HN_LRO_ACKCNT_DEF 1
164
165 #define HN_LOCK_INIT(sc) \
166 sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
167 #define HN_LOCK_DESTROY(sc) sx_destroy(&(sc)->hn_lock)
168 #define HN_LOCK_ASSERT(sc) sx_assert(&(sc)->hn_lock, SA_XLOCKED)
169 #define HN_LOCK(sc) \
170 do { \
171 while (sx_try_xlock(&(sc)->hn_lock) == 0) { \
172 /* Relinquish cpu to avoid deadlock */ \
173 sched_relinquish(curthread); \
174 DELAY(1000); \
175 } \
176 } while (0)
177 #define HN_UNLOCK(sc) sx_xunlock(&(sc)->hn_lock)
178
179 #define HN_CSUM_IP_MASK (CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
180 #define HN_CSUM_IP6_MASK (CSUM_IP6_TCP | CSUM_IP6_UDP)
181 #define HN_CSUM_IP_HWASSIST(sc) \
182 ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
183 #define HN_CSUM_IP6_HWASSIST(sc) \
184 ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)
185
186 #define HN_PKTSIZE_MIN(align) \
187 roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
188 HN_RNDIS_PKT_LEN, (align))
189 #define HN_PKTSIZE(m, align) \
190 roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))
191
192 #ifdef RSS
193 #define HN_RING_IDX2CPU(sc, idx) rss_getcpu((idx) % rss_getnumbuckets())
194 #else
195 #define HN_RING_IDX2CPU(sc, idx) (((sc)->hn_cpu + (idx)) % mp_ncpus)
196 #endif
197
198 struct hn_txdesc {
199 #ifndef HN_USE_TXDESC_BUFRING
200 SLIST_ENTRY(hn_txdesc) link;
201 #endif
202 STAILQ_ENTRY(hn_txdesc) agg_link;
203
204 /* Aggregated txdescs, in sending order. */
205 STAILQ_HEAD(, hn_txdesc) agg_list;
206
207 /* The oldest packet, if transmission aggregation happens. */
208 struct mbuf *m;
209 struct hn_tx_ring *txr;
210 int refs;
211 uint32_t flags; /* HN_TXD_FLAG_ */
212 struct hn_nvs_sendctx send_ctx;
213 uint32_t chim_index;
214 int chim_size;
215
216 bus_dmamap_t data_dmap;
217
218 bus_addr_t rndis_pkt_paddr;
219 struct rndis_packet_msg *rndis_pkt;
220 bus_dmamap_t rndis_pkt_dmap;
221 };
222
223 #define HN_TXD_FLAG_ONLIST 0x0001
224 #define HN_TXD_FLAG_DMAMAP 0x0002
225 #define HN_TXD_FLAG_ONAGG 0x0004
226
227 #define HN_NDIS_PKTINFO_SUBALLOC 0x01
228 #define HN_NDIS_PKTINFO_1ST_FRAG 0x02
229 #define HN_NDIS_PKTINFO_LAST_FRAG 0x04
230
231 struct packet_info_id {
232 uint8_t ver;
233 uint8_t flag;
234 uint16_t pkt_id;
235 };
236
237 #define NDIS_PKTINFOID_SZ sizeof(struct packet_info_id)
238
239
240 struct hn_rxinfo {
241 const uint32_t *vlan_info;
242 const uint32_t *csum_info;
243 const uint32_t *hash_info;
244 const uint32_t *hash_value;
245 const struct packet_info_id *pktinfo_id;
246 };
247
248 struct hn_rxvf_setarg {
249 struct hn_rx_ring *rxr;
250 struct ifnet *vf_ifp;
251 };
252
253 #define HN_RXINFO_VLAN 0x0001
254 #define HN_RXINFO_CSUM 0x0002
255 #define HN_RXINFO_HASHINF 0x0004
256 #define HN_RXINFO_HASHVAL 0x0008
257 #define HN_RXINFO_PKTINFO_ID 0x0010
258 #define HN_RXINFO_ALL \
259 (HN_RXINFO_VLAN | \
260 HN_RXINFO_CSUM | \
261 HN_RXINFO_HASHINF | \
262 HN_RXINFO_HASHVAL | \
263 HN_RXINFO_PKTINFO_ID)
264
265 static int hn_probe(device_t);
266 static int hn_attach(device_t);
267 static int hn_detach(device_t);
268 static int hn_shutdown(device_t);
269 static void hn_chan_callback(struct vmbus_channel *,
270 void *);
271
272 static void hn_init(void *);
273 static int hn_ioctl(struct ifnet *, u_long, caddr_t);
274 #ifdef HN_IFSTART_SUPPORT
275 static void hn_start(struct ifnet *);
276 #endif
277 static int hn_transmit(struct ifnet *, struct mbuf *);
278 static void hn_xmit_qflush(struct ifnet *);
279 static int hn_ifmedia_upd(struct ifnet *);
280 static void hn_ifmedia_sts(struct ifnet *,
281 struct ifmediareq *);
282
283 static void hn_ifnet_event(void *, struct ifnet *, int);
284 static void hn_ifaddr_event(void *, struct ifnet *);
285 static void hn_ifnet_attevent(void *, struct ifnet *);
286 static void hn_ifnet_detevent(void *, struct ifnet *);
287 static void hn_ifnet_lnkevent(void *, struct ifnet *, int);
288
289 static bool hn_ismyvf(const struct hn_softc *,
290 const struct ifnet *);
291 static void hn_rxvf_change(struct hn_softc *,
292 struct ifnet *, bool);
293 static void hn_rxvf_set(struct hn_softc *, struct ifnet *);
294 static void hn_rxvf_set_task(void *, int);
295 static void hn_xpnt_vf_input(struct ifnet *, struct mbuf *);
296 static int hn_xpnt_vf_iocsetflags(struct hn_softc *);
297 static int hn_xpnt_vf_iocsetcaps(struct hn_softc *,
298 struct ifreq *);
299 static void hn_xpnt_vf_saveifflags(struct hn_softc *);
300 static bool hn_xpnt_vf_isready(struct hn_softc *);
301 static void hn_xpnt_vf_setready(struct hn_softc *);
302 static void hn_xpnt_vf_init_taskfunc(void *, int);
303 static void hn_xpnt_vf_init(struct hn_softc *);
304 static void hn_xpnt_vf_setenable(struct hn_softc *);
305 static void hn_xpnt_vf_setdisable(struct hn_softc *, bool);
306 static void hn_vf_rss_fixup(struct hn_softc *, bool);
307 static void hn_vf_rss_restore(struct hn_softc *);
308
309 static int hn_rndis_rxinfo(const void *, int,
310 struct hn_rxinfo *);
311 static void hn_rndis_rx_data(struct hn_rx_ring *,
312 const void *, int);
313 static void hn_rndis_rx_status(struct hn_softc *,
314 const void *, int);
315 static void hn_rndis_init_fixat(struct hn_softc *, int);
316
317 static void hn_nvs_handle_notify(struct hn_softc *,
318 const struct vmbus_chanpkt_hdr *);
319 static void hn_nvs_handle_comp(struct hn_softc *,
320 struct vmbus_channel *,
321 const struct vmbus_chanpkt_hdr *);
322 static void hn_nvs_handle_rxbuf(struct hn_rx_ring *,
323 struct vmbus_channel *,
324 const struct vmbus_chanpkt_hdr *);
325 static void hn_nvs_ack_rxbuf(struct hn_rx_ring *,
326 struct vmbus_channel *, uint64_t);
327
328 #if __FreeBSD_version >= 1100099
329 static int hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
330 static int hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
331 #endif
332 static int hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
333 static int hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
334 #if __FreeBSD_version < 1100095
335 static int hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
336 #else
337 static int hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
338 #endif
339 static int hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
340 static int hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
341 static int hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
342 static int hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
343 static int hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
344 static int hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
345 static int hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
346 #ifndef RSS
347 static int hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
348 static int hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
349 #endif
350 static int hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
351 static int hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS);
352 static int hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS);
353 static int hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
354 static int hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
355 static int hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
356 static int hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
357 static int hn_polling_sysctl(SYSCTL_HANDLER_ARGS);
358 static int hn_vf_sysctl(SYSCTL_HANDLER_ARGS);
359 static int hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS);
360 static int hn_vflist_sysctl(SYSCTL_HANDLER_ARGS);
361 static int hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS);
362 static int hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS);
363 static int hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS);
364
365 static void hn_stop(struct hn_softc *, bool);
366 static void hn_init_locked(struct hn_softc *);
367 static int hn_chan_attach(struct hn_softc *,
368 struct vmbus_channel *);
369 static void hn_chan_detach(struct hn_softc *,
370 struct vmbus_channel *);
371 static int hn_attach_subchans(struct hn_softc *);
372 static void hn_detach_allchans(struct hn_softc *);
373 static void hn_chan_rollup(struct hn_rx_ring *,
374 struct hn_tx_ring *);
375 static void hn_set_ring_inuse(struct hn_softc *, int);
376 static int hn_synth_attach(struct hn_softc *, int);
377 static void hn_synth_detach(struct hn_softc *);
378 static int hn_synth_alloc_subchans(struct hn_softc *,
379 int *);
380 static bool hn_synth_attachable(const struct hn_softc *);
381 static void hn_suspend(struct hn_softc *);
382 static void hn_suspend_data(struct hn_softc *);
383 static void hn_suspend_mgmt(struct hn_softc *);
384 static void hn_resume(struct hn_softc *);
385 static void hn_resume_data(struct hn_softc *);
386 static void hn_resume_mgmt(struct hn_softc *);
387 static void hn_suspend_mgmt_taskfunc(void *, int);
388 static void hn_chan_drain(struct hn_softc *,
389 struct vmbus_channel *);
390 static void hn_disable_rx(struct hn_softc *);
391 static void hn_drain_rxtx(struct hn_softc *, int);
392 static void hn_polling(struct hn_softc *, u_int);
393 static void hn_chan_polling(struct vmbus_channel *, u_int);
394 static void hn_mtu_change_fixup(struct hn_softc *);
395
396 static void hn_update_link_status(struct hn_softc *);
397 static void hn_change_network(struct hn_softc *);
398 static void hn_link_taskfunc(void *, int);
399 static void hn_netchg_init_taskfunc(void *, int);
400 static void hn_netchg_status_taskfunc(void *, int);
401 static void hn_link_status(struct hn_softc *);
402
403 static int hn_create_rx_data(struct hn_softc *, int);
404 static void hn_destroy_rx_data(struct hn_softc *);
405 static int hn_check_iplen(const struct mbuf *, int);
406 static void hn_rxpkt_proto(const struct mbuf *, int *, int *);
407 static int hn_set_rxfilter(struct hn_softc *, uint32_t);
408 static int hn_rxfilter_config(struct hn_softc *);
409 static int hn_rss_reconfig(struct hn_softc *);
410 static void hn_rss_ind_fixup(struct hn_softc *);
411 static void hn_rss_mbuf_hash(struct hn_softc *, uint32_t);
412 static int hn_rxpkt(struct hn_rx_ring *);
413 static uint32_t hn_rss_type_fromndis(uint32_t);
414 static uint32_t hn_rss_type_tondis(uint32_t);
415
416 static int hn_tx_ring_create(struct hn_softc *, int);
417 static void hn_tx_ring_destroy(struct hn_tx_ring *);
418 static int hn_create_tx_data(struct hn_softc *, int);
419 static void hn_fixup_tx_data(struct hn_softc *);
420 static void hn_fixup_rx_data(struct hn_softc *);
421 static void hn_destroy_tx_data(struct hn_softc *);
422 static void hn_txdesc_dmamap_destroy(struct hn_txdesc *);
423 static void hn_txdesc_gc(struct hn_tx_ring *,
424 struct hn_txdesc *);
425 static int hn_encap(struct ifnet *, struct hn_tx_ring *,
426 struct hn_txdesc *, struct mbuf **);
427 static int hn_txpkt(struct ifnet *, struct hn_tx_ring *,
428 struct hn_txdesc *);
429 static void hn_set_chim_size(struct hn_softc *, int);
430 static void hn_set_tso_maxsize(struct hn_softc *, int, int);
431 static bool hn_tx_ring_pending(struct hn_tx_ring *);
432 static void hn_tx_ring_qflush(struct hn_tx_ring *);
433 static void hn_resume_tx(struct hn_softc *, int);
434 static void hn_set_txagg(struct hn_softc *);
435 static void *hn_try_txagg(struct ifnet *,
436 struct hn_tx_ring *, struct hn_txdesc *,
437 int);
438 static int hn_get_txswq_depth(const struct hn_tx_ring *);
439 static void hn_txpkt_done(struct hn_nvs_sendctx *,
440 struct hn_softc *, struct vmbus_channel *,
441 const void *, int);
442 static int hn_txpkt_sglist(struct hn_tx_ring *,
443 struct hn_txdesc *);
444 static int hn_txpkt_chim(struct hn_tx_ring *,
445 struct hn_txdesc *);
446 static int hn_xmit(struct hn_tx_ring *, int);
447 static void hn_xmit_taskfunc(void *, int);
448 static void hn_xmit_txeof(struct hn_tx_ring *);
449 static void hn_xmit_txeof_taskfunc(void *, int);
450 #ifdef HN_IFSTART_SUPPORT
451 static int hn_start_locked(struct hn_tx_ring *, int);
452 static void hn_start_taskfunc(void *, int);
453 static void hn_start_txeof(struct hn_tx_ring *);
454 static void hn_start_txeof_taskfunc(void *, int);
455 #endif
456
457 static int hn_rsc_sysctl(SYSCTL_HANDLER_ARGS);
458
459 SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
460 "Hyper-V network interface");
461
/* Trust TCP segment verification on the host side. */
463 static int hn_trust_hosttcp = 1;
464 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
465 &hn_trust_hosttcp, 0,
466 "Trust tcp segment verification on host side, "
467 "when csum info is missing (global setting)");
468
/* Trust UDP datagram verification on the host side. */
470 static int hn_trust_hostudp = 1;
471 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
472 &hn_trust_hostudp, 0,
473 "Trust udp datagram verification on host side, "
474 "when csum info is missing (global setting)");
475
/* Trust IP packet verification on the host side. */
477 static int hn_trust_hostip = 1;
478 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
479 &hn_trust_hostip, 0,
480 "Trust ip packet verification on host side, "
481 "when csum info is missing (global setting)");
482
483 /*
484 * Offload UDP/IPv4 checksum.
485 */
486 static int hn_enable_udp4cs = 1;
487 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp4cs, CTLFLAG_RDTUN,
488 &hn_enable_udp4cs, 0, "Offload UDP/IPv4 checksum");
489
490 /*
491 * Offload UDP/IPv6 checksum.
492 */
493 static int hn_enable_udp6cs = 1;
494 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp6cs, CTLFLAG_RDTUN,
495 &hn_enable_udp6cs, 0, "Offload UDP/IPv6 checksum");
496
497 /* Stats. */
498 static counter_u64_t hn_udpcs_fixup;
499 SYSCTL_COUNTER_U64(_hw_hn, OID_AUTO, udpcs_fixup, CTLFLAG_RW,
500 &hn_udpcs_fixup, "# of UDP checksum fixup");
501
502 /*
503 * See hn_set_hlen().
504 *
505 * This value is for Azure. For Hyper-V, set this above
506 * 65536 to disable UDP datagram checksum fixup.
507 */
508 static int hn_udpcs_fixup_mtu = 1420;
509 SYSCTL_INT(_hw_hn, OID_AUTO, udpcs_fixup_mtu, CTLFLAG_RWTUN,
510 &hn_udpcs_fixup_mtu, 0, "UDP checksum fixup MTU threshold");
511
512 /* Limit TSO burst size */
513 static int hn_tso_maxlen = IP_MAXPACKET;
514 SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
515 &hn_tso_maxlen, 0, "TSO burst limit");
516
517 /* Limit chimney send size */
518 static int hn_tx_chimney_size = 0;
519 SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
520 &hn_tx_chimney_size, 0, "Chimney send packet size limit");
521
522 /* Limit the size of packet for direct transmission */
523 static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
524 SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
525 &hn_direct_tx_size, 0, "Size of the packet for direct transmission");
526
527 /* # of LRO entries per RX ring */
528 #if defined(INET) || defined(INET6)
529 #if __FreeBSD_version >= 1100095
530 static int hn_lro_entry_count = HN_LROENT_CNT_DEF;
531 SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
532 &hn_lro_entry_count, 0, "LRO entry count");
533 #endif
534 #endif
535
536 static int hn_tx_taskq_cnt = 1;
537 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
538 &hn_tx_taskq_cnt, 0, "# of TX taskqueues");
539
540 #define HN_TX_TASKQ_M_INDEP 0
541 #define HN_TX_TASKQ_M_GLOBAL 1
542 #define HN_TX_TASKQ_M_EVTTQ 2
543
544 static int hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
545 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
546 &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
547 "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");
548
549 #ifndef HN_USE_TXDESC_BUFRING
550 static int hn_use_txdesc_bufring = 0;
551 #else
552 static int hn_use_txdesc_bufring = 1;
553 #endif
554 SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
555 &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");
556
557 #ifdef HN_IFSTART_SUPPORT
558 /* Use ifnet.if_start instead of ifnet.if_transmit */
559 static int hn_use_if_start = 0;
560 SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
561 &hn_use_if_start, 0, "Use if_start TX method");
562 #endif
563
564 /* # of channels to use */
565 static int hn_chan_cnt = 0;
566 SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
567 &hn_chan_cnt, 0,
568 "# of channels to use; each channel has one RX ring and one TX ring");
569
570 /* # of transmit rings to use */
571 static int hn_tx_ring_cnt = 0;
572 SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
573 &hn_tx_ring_cnt, 0, "# of TX rings to use");
574
/* Software TX ring depth */
576 static int hn_tx_swq_depth = 0;
577 SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
578 &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");
579
580 /* Enable sorted LRO, and the depth of the per-channel mbuf queue */
581 #if __FreeBSD_version >= 1100095
582 static u_int hn_lro_mbufq_depth = 0;
583 SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
584 &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
585 #endif
586
587 /* Packet transmission aggregation size limit */
588 static int hn_tx_agg_size = -1;
589 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
590 &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");
591
592 /* Packet transmission aggregation count limit */
593 static int hn_tx_agg_pkts = -1;
594 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
595 &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");
596
597 /* VF list */
598 SYSCTL_PROC(_hw_hn, OID_AUTO, vflist,
599 CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0,
600 hn_vflist_sysctl, "A",
601 "VF list");
602
603 /* VF mapping */
604 SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap,
605 CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0,
606 hn_vfmap_sysctl, "A",
607 "VF mapping");
608
609 /* Transparent VF */
610 static int hn_xpnt_vf = 1;
611 SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN,
    &hn_xpnt_vf, 0, "Transparent VF mode");
613
614 /* Accurate BPF support for Transparent VF */
615 static int hn_xpnt_vf_accbpf = 0;
616 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN,
617 &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF");
618
/* Extra wait for the transparent VF attach routine; unit: seconds. */
620 static int hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
621 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN,
622 &hn_xpnt_vf_attwait, 0,
    "Extra wait for transparent VF attach routine; unit: seconds");
624
625 static u_int hn_cpu_index; /* next CPU for channel */
626 static struct taskqueue **hn_tx_taskque;/* shared TX taskqueues */
627
628 static struct rmlock hn_vfmap_lock;
629 static int hn_vfmap_size;
630 static struct ifnet **hn_vfmap;
631
632 #ifndef RSS
633 static const uint8_t
634 hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
635 0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
636 0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
637 0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
638 0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
639 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
640 };
641 #endif /* !RSS */
642
643 static const struct hyperv_guid hn_guid = {
644 .hv_guid = {
645 0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46,
646 0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e }
647 };
648
649 static device_method_t hn_methods[] = {
650 /* Device interface */
651 DEVMETHOD(device_probe, hn_probe),
652 DEVMETHOD(device_attach, hn_attach),
653 DEVMETHOD(device_detach, hn_detach),
654 DEVMETHOD(device_shutdown, hn_shutdown),
655 DEVMETHOD_END
656 };
657
658 static driver_t hn_driver = {
659 "hn",
660 hn_methods,
661 sizeof(struct hn_softc)
662 };
663
664 DRIVER_MODULE(hn, vmbus, hn_driver, 0, 0);
665 MODULE_VERSION(hn, 1);
666 MODULE_DEPEND(hn, vmbus, 1, 1, 1);
667
668 #if __FreeBSD_version >= 1100099
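/*
 * Apply the LRO aggregation length limit to all RX rings.
 */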
669 static void
670 hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
671 {
672 int i;
673
674 for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
675 sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
676 }
677 #endif
678
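/*
 * Transmit an RNDIS data packet described by the TX ring's GPA
 * (guest physical address) list, i.e. without using a chimney
 * sending buffer.
 */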
679 static int
680 hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
681 {
682
683 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
684 txd->chim_size == 0, ("invalid rndis sglist txd"));
685 return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
686 &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
687 }
688
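/*
 * Transmit an RNDIS data packet that has been copied into a chimney
 * sending buffer; the NVS command only carries the buffer index and
 * the data size.
 */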
689 static int
690 hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
691 {
692 struct hn_nvs_rndis rndis;
693
694 KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
695 txd->chim_size > 0, ("invalid rndis chim txd"));
696
697 rndis.nvs_type = HN_NVS_TYPE_RNDIS;
698 rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
699 rndis.nvs_chim_idx = txd->chim_index;
700 rndis.nvs_chim_sz = txd->chim_size;
701
702 return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
703 &rndis, sizeof(rndis), &txd->send_ctx));
704 }
705
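/*
 * Allocate a chimney sending buffer slot from the bitmap without
 * holding any lock; returns HN_NVS_CHIM_IDX_INVALID if all slots
 * are in use.
 */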
706 static __inline uint32_t
707 hn_chim_alloc(struct hn_softc *sc)
708 {
709 int i, bmap_cnt = sc->hn_chim_bmap_cnt;
710 u_long *bmap = sc->hn_chim_bmap;
711 uint32_t ret = HN_NVS_CHIM_IDX_INVALID;
712
713 for (i = 0; i < bmap_cnt; ++i) {
714 int idx;
715
716 idx = ffsl(~bmap[i]);
717 if (idx == 0)
718 continue;
719
720 --idx; /* ffsl is 1-based */
721 KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
722 ("invalid i %d and idx %d", i, idx));
723
724 if (atomic_testandset_long(&bmap[i], idx))
725 continue;
726
727 ret = i * LONG_BIT + idx;
728 break;
729 }
730 return (ret);
731 }
732
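/*
 * Return a chimney sending buffer slot to the allocation bitmap.
 */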
733 static __inline void
734 hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
735 {
736 u_long mask;
737 uint32_t idx;
738
739 idx = chim_idx / LONG_BIT;
740 KASSERT(idx < sc->hn_chim_bmap_cnt,
741 ("invalid chimney index 0x%x", chim_idx));
742
743 mask = 1UL << (chim_idx % LONG_BIT);
744 KASSERT(sc->hn_chim_bmap[idx] & mask,
745 ("index bitmap 0x%lx, chimney index %u, "
746 "bitmap idx %d, bitmask 0x%lx",
747 sc->hn_chim_bmap[idx], chim_idx, idx, mask));
748
749 atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
750 }
751
752 #if defined(INET6) || defined(INET)
753
754 #define PULLUP_HDR(m, len) \
755 do { \
756 if (__predict_false((m)->m_len < (len))) { \
757 (m) = m_pullup((m), (len)); \
758 if ((m) == NULL) \
759 return (NULL); \
760 } \
761 } while (0)
762
763 /*
 * NOTE: If this function fails, m_head will be freed.
765 */
766 static __inline struct mbuf *
767 hn_tso_fixup(struct mbuf *m_head)
768 {
769 struct ether_vlan_header *evl;
770 struct tcphdr *th;
771 int ehlen;
772
773 KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));
774
775 PULLUP_HDR(m_head, sizeof(*evl));
776 evl = mtod(m_head, struct ether_vlan_header *);
777 if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
778 ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
779 else
780 ehlen = ETHER_HDR_LEN;
781 m_head->m_pkthdr.l2hlen = ehlen;
782
783 #ifdef INET
784 if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
785 struct ip *ip;
786 int iphlen;
787
788 PULLUP_HDR(m_head, ehlen + sizeof(*ip));
789 ip = mtodo(m_head, ehlen);
790 iphlen = ip->ip_hl << 2;
791 m_head->m_pkthdr.l3hlen = iphlen;
792
793 PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
794 th = mtodo(m_head, ehlen + iphlen);
795
796 ip->ip_len = 0;
797 ip->ip_sum = 0;
798 th->th_sum = in_pseudo(ip->ip_src.s_addr,
799 ip->ip_dst.s_addr, htons(IPPROTO_TCP));
800 }
801 #endif
802 #if defined(INET6) && defined(INET)
803 else
804 #endif
805 #ifdef INET6
806 {
807 struct ip6_hdr *ip6;
808
809 PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
810 ip6 = mtodo(m_head, ehlen);
811 if (ip6->ip6_nxt != IPPROTO_TCP) {
812 m_freem(m_head);
813 return (NULL);
814 }
815 m_head->m_pkthdr.l3hlen = sizeof(*ip6);
816
817 PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
818 th = mtodo(m_head, ehlen + sizeof(*ip6));
819
820 ip6->ip6_plen = 0;
821 th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
822 }
823 #endif
824 return (m_head);
825 }
826
827 /*
 * NOTE: If this function fails, m_head will be freed.
829 */
830 static __inline struct mbuf *
831 hn_set_hlen(struct mbuf *m_head)
832 {
833 const struct ether_vlan_header *evl;
834 int ehlen;
835
836 PULLUP_HDR(m_head, sizeof(*evl));
837 evl = mtod(m_head, const struct ether_vlan_header *);
838 if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
839 ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
840 else
841 ehlen = ETHER_HDR_LEN;
842 m_head->m_pkthdr.l2hlen = ehlen;
843
844 #ifdef INET
845 if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP_UDP)) {
846 const struct ip *ip;
847 int iphlen;
848
849 PULLUP_HDR(m_head, ehlen + sizeof(*ip));
850 ip = mtodo(m_head, ehlen);
851 iphlen = ip->ip_hl << 2;
852 m_head->m_pkthdr.l3hlen = iphlen;
853
854 /*
		 * UDP checksum offload does not work in Azure if the
		 * following conditions are met:
857 * - sizeof(IP hdr + UDP hdr + payload) > 1420.
858 * - IP_DF is not set in the IP hdr.
859 *
		 * Fall back to software checksum for these UDP datagrams.
861 */
862 if ((m_head->m_pkthdr.csum_flags & CSUM_IP_UDP) &&
863 m_head->m_pkthdr.len > hn_udpcs_fixup_mtu + ehlen &&
864 (ntohs(ip->ip_off) & IP_DF) == 0) {
865 uint16_t off = ehlen + iphlen;
866
867 counter_u64_add(hn_udpcs_fixup, 1);
868 PULLUP_HDR(m_head, off + sizeof(struct udphdr));
869 *(uint16_t *)(m_head->m_data + off +
870 m_head->m_pkthdr.csum_data) = in_cksum_skip(
871 m_head, m_head->m_pkthdr.len, off);
872 m_head->m_pkthdr.csum_flags &= ~CSUM_IP_UDP;
873 }
874 }
875 #endif
876 #if defined(INET6) && defined(INET)
877 else
878 #endif
879 #ifdef INET6
880 {
881 const struct ip6_hdr *ip6;
882
883 PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
884 ip6 = mtodo(m_head, ehlen);
885 if (ip6->ip6_nxt != IPPROTO_TCP &&
886 ip6->ip6_nxt != IPPROTO_UDP) {
887 m_freem(m_head);
888 return (NULL);
889 }
890 m_head->m_pkthdr.l3hlen = sizeof(*ip6);
891 }
892 #endif
893 return (m_head);
894 }
895
896 /*
 * NOTE: If this function fails, m_head will be freed.
898 */
899 static __inline struct mbuf *
900 hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn)
901 {
902 const struct tcphdr *th;
903 int ehlen, iphlen;
904
905 *tcpsyn = 0;
906 ehlen = m_head->m_pkthdr.l2hlen;
907 iphlen = m_head->m_pkthdr.l3hlen;
908
909 PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
910 th = mtodo(m_head, ehlen + iphlen);
911 if (th->th_flags & TH_SYN)
912 *tcpsyn = 1;
913 return (m_head);
914 }
915
916 #undef PULLUP_HDR
917
918 #endif /* INET6 || INET */
919
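/*
 * Program the RNDIS RX filter, if it differs from the cached value.
 */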
920 static int
921 hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
922 {
923 int error = 0;
924
925 HN_LOCK_ASSERT(sc);
926
927 if (sc->hn_rx_filter != filter) {
928 error = hn_rndis_set_rxfilter(sc, filter);
929 if (!error)
930 sc->hn_rx_filter = filter;
931 }
932 return (error);
933 }
934
935 static int
936 hn_rxfilter_config(struct hn_softc *sc)
937 {
938 struct ifnet *ifp = sc->hn_ifp;
939 uint32_t filter;
940
941 HN_LOCK_ASSERT(sc);
942
943 /*
944 * If the non-transparent mode VF is activated, we don't know how
945 * its RX filter is configured, so stick the synthetic device in
	 * the promiscuous mode.
947 */
948 if ((ifp->if_flags & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) {
949 filter = NDIS_PACKET_TYPE_PROMISCUOUS;
950 } else {
951 filter = NDIS_PACKET_TYPE_DIRECTED;
952 if (ifp->if_flags & IFF_BROADCAST)
953 filter |= NDIS_PACKET_TYPE_BROADCAST;
954 /* TODO: support multicast list */
955 if ((ifp->if_flags & IFF_ALLMULTI) ||
956 !CK_STAILQ_EMPTY(&ifp->if_multiaddrs))
957 filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
958 }
959 return (hn_set_rxfilter(sc, filter));
960 }
961
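/*
 * Compute the transmission aggregation limits (size, packet count and
 * alignment) from the administrative settings and the host/RNDIS
 * capabilities, then propagate them to all TX rings.
 */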
962 static void
963 hn_set_txagg(struct hn_softc *sc)
964 {
965 uint32_t size, pkts;
966 int i;
967
968 /*
969 * Setup aggregation size.
970 */
971 if (sc->hn_agg_size < 0)
972 size = UINT32_MAX;
973 else
974 size = sc->hn_agg_size;
975
976 if (sc->hn_rndis_agg_size < size)
977 size = sc->hn_rndis_agg_size;
978
979 /* NOTE: We only aggregate packets using chimney sending buffers. */
980 if (size > (uint32_t)sc->hn_chim_szmax)
981 size = sc->hn_chim_szmax;
982
983 if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
984 /* Disable */
985 size = 0;
986 pkts = 0;
987 goto done;
988 }
989
990 /* NOTE: Type of the per TX ring setting is 'int'. */
991 if (size > INT_MAX)
992 size = INT_MAX;
993
994 /*
995 * Setup aggregation packet count.
996 */
997 if (sc->hn_agg_pkts < 0)
998 pkts = UINT32_MAX;
999 else
1000 pkts = sc->hn_agg_pkts;
1001
1002 if (sc->hn_rndis_agg_pkts < pkts)
1003 pkts = sc->hn_rndis_agg_pkts;
1004
1005 if (pkts <= 1) {
1006 /* Disable */
1007 size = 0;
1008 pkts = 0;
1009 goto done;
1010 }
1011
1012 /* NOTE: Type of the per TX ring setting is 'short'. */
1013 if (pkts > SHRT_MAX)
1014 pkts = SHRT_MAX;
1015
1016 done:
1017 /* NOTE: Type of the per TX ring setting is 'short'. */
1018 if (sc->hn_rndis_agg_align > SHRT_MAX) {
1019 /* Disable */
1020 size = 0;
1021 pkts = 0;
1022 }
1023
1024 if (bootverbose) {
1025 if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
1026 size, pkts, sc->hn_rndis_agg_align);
1027 }
1028
1029 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
1030 struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
1031
1032 mtx_lock(&txr->hn_tx_lock);
1033 txr->hn_agg_szmax = size;
1034 txr->hn_agg_pktmax = pkts;
1035 txr->hn_agg_align = sc->hn_rndis_agg_align;
1036 mtx_unlock(&txr->hn_tx_lock);
1037 }
1038 }
1039
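/*
 * Depth of the software TX queue (IFQ or buf_ring): the
 * hw.hn.tx_swq_depth tunable, but no less than the number of
 * TX descriptors.
 */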
1040 static int
1041 hn_get_txswq_depth(const struct hn_tx_ring *txr)
1042 {
1043
1044 KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
1045 if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
1046 return txr->hn_txdesc_cnt;
1047 return hn_tx_swq_depth;
1048 }
1049
1050 static int
1051 hn_rss_reconfig(struct hn_softc *sc)
1052 {
1053 int error;
1054
1055 HN_LOCK_ASSERT(sc);
1056
1057 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
1058 return (ENXIO);
1059
1060 /*
1061 * Disable RSS first.
1062 *
1063 * NOTE:
1064 * Direct reconfiguration by setting the UNCHG flags does
1065 * _not_ work properly.
1066 */
1067 if (bootverbose)
1068 if_printf(sc->hn_ifp, "disable RSS\n");
1069 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
1070 if (error) {
1071 if_printf(sc->hn_ifp, "RSS disable failed\n");
1072 return (error);
1073 }
1074
1075 /*
1076 * Reenable the RSS w/ the updated RSS key or indirect
1077 * table.
1078 */
1079 if (bootverbose)
1080 if_printf(sc->hn_ifp, "reconfig RSS\n");
1081 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
1082 if (error) {
1083 if_printf(sc->hn_ifp, "RSS reconfig failed\n");
1084 return (error);
1085 }
1086 return (0);
1087 }
1088
1089 static void
1090 hn_rss_ind_fixup(struct hn_softc *sc)
1091 {
1092 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
1093 int i, nchan;
1094
1095 nchan = sc->hn_rx_ring_inuse;
1096 KASSERT(nchan > 1, ("invalid # of channels %d", nchan));
1097
1098 /*
1099 * Check indirect table to make sure that all channels in it
1100 * can be used.
1101 */
1102 for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
1103 if (rss->rss_ind[i] >= nchan) {
1104 if_printf(sc->hn_ifp,
1105 "RSS indirect table %d fixup: %u -> %d\n",
1106 i, rss->rss_ind[i], nchan - 1);
1107 rss->rss_ind[i] = nchan - 1;
1108 }
1109 }
1110 }
1111
1112 static int
1113 hn_ifmedia_upd(struct ifnet *ifp __unused)
1114 {
1115
1116 return EOPNOTSUPP;
1117 }
1118
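/*
 * Report the synthetic NIC as 10GbE full-duplex; the link state is
 * derived from the driver's link flags.
 */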
1119 static void
1120 hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
1121 {
1122 struct hn_softc *sc = ifp->if_softc;
1123
1124 ifmr->ifm_status = IFM_AVALID;
1125 ifmr->ifm_active = IFM_ETHER;
1126
1127 if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
1128 ifmr->ifm_active |= IFM_NONE;
1129 return;
1130 }
1131 ifmr->ifm_status |= IFM_ACTIVE;
1132 ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
1133 }
1134
1135 static void
1136 hn_rxvf_set_task(void *xarg, int pending __unused)
1137 {
1138 struct hn_rxvf_setarg *arg = xarg;
1139
1140 arg->rxr->hn_rxvf_ifp = arg->vf_ifp;
1141 }
1142
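/*
 * Point each RX ring at the given VF ifnet (or NULL).  For rings that
 * are in use, the update is run on the ring's channel task to
 * serialize with the RX path.
 */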
1143 static void
1144 hn_rxvf_set(struct hn_softc *sc, struct ifnet *vf_ifp)
1145 {
1146 struct hn_rx_ring *rxr;
1147 struct hn_rxvf_setarg arg;
1148 struct task task;
1149 int i;
1150
1151 HN_LOCK_ASSERT(sc);
1152
1153 TASK_INIT(&task, 0, hn_rxvf_set_task, &arg);
1154
1155 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
1156 rxr = &sc->hn_rx_ring[i];
1157
1158 if (i < sc->hn_rx_ring_inuse) {
1159 arg.rxr = rxr;
1160 arg.vf_ifp = vf_ifp;
1161 vmbus_chan_run_task(rxr->hn_chan, &task);
1162 } else {
1163 rxr->hn_rxvf_ifp = vf_ifp;
1164 }
1165 }
1166 }
1167
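/*
 * Return true if ifp is the VF companion of this synthetic NIC, i.e.
 * an Ethernet interface (other than hn(4) itself, lagg or vlan) that
 * shares our link-level address.
 */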
1168 static bool
1169 hn_ismyvf(const struct hn_softc *sc, const struct ifnet *ifp)
1170 {
1171 const struct ifnet *hn_ifp;
1172
1173 hn_ifp = sc->hn_ifp;
1174
1175 if (ifp == hn_ifp)
1176 return (false);
1177
1178 if (ifp->if_alloctype != IFT_ETHER)
1179 return (false);
1180
1181 /* Ignore lagg/vlan interfaces */
1182 if (strcmp(ifp->if_dname, "lagg") == 0 ||
1183 strcmp(ifp->if_dname, "vlan") == 0)
1184 return (false);
1185
1186 /*
1187 * During detach events ifp->if_addr might be NULL.
1188 * Make sure the bcmp() below doesn't panic on that:
1189 */
1190 if (ifp->if_addr == NULL || hn_ifp->if_addr == NULL)
1191 return (false);
1192
1193 if (bcmp(IF_LLADDR(ifp), IF_LLADDR(hn_ifp), ETHER_ADDR_LEN) != 0)
1194 return (false);
1195
1196 return (true);
1197 }
1198
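/*
 * Handle activation/deactivation of a non-transparent mode VF:
 * adjust the RX filter, switch the NVS datapath between the VF and
 * the synthetic device, and suspend or resume link status management
 * accordingly.
 */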
1199 static void
1200 hn_rxvf_change(struct hn_softc *sc, struct ifnet *ifp, bool rxvf)
1201 {
1202 struct ifnet *hn_ifp;
1203
1204 HN_LOCK(sc);
1205
1206 if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
1207 goto out;
1208
1209 if (!hn_ismyvf(sc, ifp))
1210 goto out;
1211 hn_ifp = sc->hn_ifp;
1212
1213 if (rxvf) {
1214 if (sc->hn_flags & HN_FLAG_RXVF)
1215 goto out;
1216
1217 sc->hn_flags |= HN_FLAG_RXVF;
1218 hn_rxfilter_config(sc);
1219 } else {
1220 if (!(sc->hn_flags & HN_FLAG_RXVF))
1221 goto out;
1222
1223 sc->hn_flags &= ~HN_FLAG_RXVF;
1224 if (hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
1225 hn_rxfilter_config(sc);
1226 else
1227 hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
1228 }
1229
1230 hn_nvs_set_datapath(sc,
1231 rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH);
1232
1233 hn_rxvf_set(sc, rxvf ? ifp : NULL);
1234
1235 if (rxvf) {
1236 hn_vf_rss_fixup(sc, true);
1237 hn_suspend_mgmt(sc);
1238 sc->hn_link_flags &=
1239 ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG);
1240 if_link_state_change(hn_ifp, LINK_STATE_DOWN);
1241 } else {
1242 hn_vf_rss_restore(sc);
1243 hn_resume_mgmt(sc);
1244 }
1245
1246 devctl_notify("HYPERV_NIC_VF", hn_ifp->if_xname,
1247 rxvf ? "VF_UP" : "VF_DOWN", NULL);
1248
1249 if (bootverbose) {
1250 if_printf(hn_ifp, "datapath is switched %s %s\n",
1251 rxvf ? "to" : "from", ifp->if_xname);
1252 }
1253 out:
1254 HN_UNLOCK(sc);
1255 }
1256
1257 static void
1258 hn_ifnet_event(void *arg, struct ifnet *ifp, int event)
1259 {
1260
1261 if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN)
1262 return;
1263 hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP);
1264 }
1265
1266 static void
1267 hn_ifaddr_event(void *arg, struct ifnet *ifp)
1268 {
1269
1270 hn_rxvf_change(arg, ifp, ifp->if_flags & IFF_UP);
1271 }
1272
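/*
 * Pass SIOCSIFCAP through to the VF, then mirror the VF's resulting
 * enabled capabilities and hardware assists back onto hn(4).
 */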
1273 static int
1274 hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr)
1275 {
1276 struct ifnet *ifp, *vf_ifp;
1277 uint64_t tmp;
1278 int error;
1279
1280 HN_LOCK_ASSERT(sc);
1281 ifp = sc->hn_ifp;
1282 vf_ifp = sc->hn_vf_ifp;
1283
1284 /*
1285 * Fix up requested capabilities w/ supported capabilities,
1286 * since the supported capabilities could have been changed.
1287 */
1288 ifr->ifr_reqcap &= ifp->if_capabilities;
1289 /* Pass SIOCSIFCAP to VF. */
1290 error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFCAP, (caddr_t)ifr);
1291
1292 /*
1293 * NOTE:
	 * The error will be propagated to the callers; however, it
1295 * is _not_ useful here.
1296 */
1297
1298 /*
1299 * Merge VF's enabled capabilities.
1300 */
1301 ifp->if_capenable = vf_ifp->if_capenable & ifp->if_capabilities;
1302
1303 tmp = vf_ifp->if_hwassist & HN_CSUM_IP_HWASSIST(sc);
1304 if (ifp->if_capenable & IFCAP_TXCSUM)
1305 ifp->if_hwassist |= tmp;
1306 else
1307 ifp->if_hwassist &= ~tmp;
1308
1309 tmp = vf_ifp->if_hwassist & HN_CSUM_IP6_HWASSIST(sc);
1310 if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
1311 ifp->if_hwassist |= tmp;
1312 else
1313 ifp->if_hwassist &= ~tmp;
1314
1315 tmp = vf_ifp->if_hwassist & CSUM_IP_TSO;
1316 if (ifp->if_capenable & IFCAP_TSO4)
1317 ifp->if_hwassist |= tmp;
1318 else
1319 ifp->if_hwassist &= ~tmp;
1320
1321 tmp = vf_ifp->if_hwassist & CSUM_IP6_TSO;
1322 if (ifp->if_capenable & IFCAP_TSO6)
1323 ifp->if_hwassist |= tmp;
1324 else
1325 ifp->if_hwassist &= ~tmp;
1326
1327 return (error);
1328 }
1329
1330 static int
1331 hn_xpnt_vf_iocsetflags(struct hn_softc *sc)
1332 {
1333 struct ifnet *vf_ifp;
1334 struct ifreq ifr;
1335
1336 HN_LOCK_ASSERT(sc);
1337 vf_ifp = sc->hn_vf_ifp;
1338
1339 memset(&ifr, 0, sizeof(ifr));
1340 strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1341 ifr.ifr_flags = vf_ifp->if_flags & 0xffff;
1342 ifr.ifr_flagshigh = vf_ifp->if_flags >> 16;
1343 return (vf_ifp->if_ioctl(vf_ifp, SIOCSIFFLAGS, (caddr_t)&ifr));
1344 }
1345
1346 static void
1347 hn_xpnt_vf_saveifflags(struct hn_softc *sc)
1348 {
1349 struct ifnet *ifp = sc->hn_ifp;
1350 int allmulti = 0;
1351
1352 HN_LOCK_ASSERT(sc);
1353
1354 /* XXX vlan(4) style mcast addr maintenance */
1355 if (!CK_STAILQ_EMPTY(&ifp->if_multiaddrs))
1356 allmulti = IFF_ALLMULTI;
1357
1358 /* Always set the VF's if_flags */
1359 sc->hn_vf_ifp->if_flags = ifp->if_flags | allmulti;
1360 }
1361
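/*
 * if_input replacement installed on the transparent VF: tap BPF and
 * update statistics on the VF, then fix up rcvif and feed the packets
 * into hn(4)'s if_input, so the stack sees them on the hn(4)
 * interface.
 */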
1362 static void
1363 hn_xpnt_vf_input(struct ifnet *vf_ifp, struct mbuf *m)
1364 {
1365 struct rm_priotracker pt;
1366 struct ifnet *hn_ifp = NULL;
1367 struct mbuf *mn;
1368
1369 /*
1370 * XXX racy, if hn(4) ever detached.
1371 */
1372 rm_rlock(&hn_vfmap_lock, &pt);
1373 if (vf_ifp->if_index < hn_vfmap_size)
1374 hn_ifp = hn_vfmap[vf_ifp->if_index];
1375 rm_runlock(&hn_vfmap_lock, &pt);
1376
1377 if (hn_ifp != NULL) {
1378 for (mn = m; mn != NULL; mn = mn->m_nextpkt) {
1379 /*
1380 * Allow tapping on the VF.
1381 */
1382 ETHER_BPF_MTAP(vf_ifp, mn);
1383
1384 /*
1385 * Update VF stats.
1386 */
1387 if ((vf_ifp->if_capenable & IFCAP_HWSTATS) == 0) {
1388 if_inc_counter(vf_ifp, IFCOUNTER_IBYTES,
1389 mn->m_pkthdr.len);
1390 }
1391 /*
1392 * XXX IFCOUNTER_IMCAST
			 * Updating this stat is rather invasive, since it
			 * requires two checks on the mbuf: the length check
			 * and the ethernet header check.  As of this writing,
			 * all multicast packets go directly to hn(4), which
			 * makes imcast stat updating in the VF pointless.
1398 */
1399
1400 /*
1401 * Fix up rcvif and increase hn(4)'s ipackets.
1402 */
1403 mn->m_pkthdr.rcvif = hn_ifp;
1404 if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
1405 }
1406 /*
1407 * Go through hn(4)'s if_input.
1408 */
1409 hn_ifp->if_input(hn_ifp, m);
1410 } else {
1411 /*
1412 * In the middle of the transition; free this
1413 * mbuf chain.
1414 */
1415 while (m != NULL) {
1416 mn = m->m_nextpkt;
1417 m->m_nextpkt = NULL;
1418 m_freem(m);
1419 m = mn;
1420 }
1421 }
1422 }
1423
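/*
 * After an MTU change, recompute the TSO size limit and, if needed,
 * raise the LRO length limit to the minimum allowed for the new MTU.
 */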
1424 static void
1425 hn_mtu_change_fixup(struct hn_softc *sc)
1426 {
1427 struct ifnet *ifp;
1428
1429 HN_LOCK_ASSERT(sc);
1430 ifp = sc->hn_ifp;
1431
1432 hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
1433 #if __FreeBSD_version >= 1100099
1434 if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp))
1435 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
1436 #endif
1437 }
1438
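/*
 * Translate NDIS hash types to the kernel's RSS_TYPE_ bits; the
 * reverse translation follows below.
 */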
1439 static uint32_t
1440 hn_rss_type_fromndis(uint32_t rss_hash)
1441 {
1442 uint32_t types = 0;
1443
1444 if (rss_hash & NDIS_HASH_IPV4)
1445 types |= RSS_TYPE_IPV4;
1446 if (rss_hash & NDIS_HASH_TCP_IPV4)
1447 types |= RSS_TYPE_TCP_IPV4;
1448 if (rss_hash & NDIS_HASH_IPV6)
1449 types |= RSS_TYPE_IPV6;
1450 if (rss_hash & NDIS_HASH_IPV6_EX)
1451 types |= RSS_TYPE_IPV6_EX;
1452 if (rss_hash & NDIS_HASH_TCP_IPV6)
1453 types |= RSS_TYPE_TCP_IPV6;
1454 if (rss_hash & NDIS_HASH_TCP_IPV6_EX)
1455 types |= RSS_TYPE_TCP_IPV6_EX;
1456 if (rss_hash & NDIS_HASH_UDP_IPV4_X)
1457 types |= RSS_TYPE_UDP_IPV4;
1458 return (types);
1459 }
1460
1461 static uint32_t
1462 hn_rss_type_tondis(uint32_t types)
1463 {
1464 uint32_t rss_hash = 0;
1465
1466 KASSERT((types & (RSS_TYPE_UDP_IPV6 | RSS_TYPE_UDP_IPV6_EX)) == 0,
1467 ("UDP6 and UDP6EX are not supported"));
1468
1469 if (types & RSS_TYPE_IPV4)
1470 rss_hash |= NDIS_HASH_IPV4;
1471 if (types & RSS_TYPE_TCP_IPV4)
1472 rss_hash |= NDIS_HASH_TCP_IPV4;
1473 if (types & RSS_TYPE_IPV6)
1474 rss_hash |= NDIS_HASH_IPV6;
1475 if (types & RSS_TYPE_IPV6_EX)
1476 rss_hash |= NDIS_HASH_IPV6_EX;
1477 if (types & RSS_TYPE_TCP_IPV6)
1478 rss_hash |= NDIS_HASH_TCP_IPV6;
1479 if (types & RSS_TYPE_TCP_IPV6_EX)
1480 rss_hash |= NDIS_HASH_TCP_IPV6_EX;
1481 if (types & RSS_TYPE_UDP_IPV4)
1482 rss_hash |= NDIS_HASH_UDP_IPV4_X;
1483 return (rss_hash);
1484 }
1485
1486 static void
1487 hn_rss_mbuf_hash(struct hn_softc *sc, uint32_t mbuf_hash)
1488 {
1489 int i;
1490
1491 HN_LOCK_ASSERT(sc);
1492
1493 for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1494 sc->hn_rx_ring[i].hn_mbuf_hash = mbuf_hash;
1495 }
1496
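/*
 * Align the synthetic parts' RSS configuration with the VF's: adopt
 * the VF's Toeplitz key, hash on the intersection of both sides'
 * hash types, and stop delivering mbuf hash values of types the two
 * sides would disagree on.
 */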
1497 static void
1498 hn_vf_rss_fixup(struct hn_softc *sc, bool reconf)
1499 {
1500 struct ifnet *ifp, *vf_ifp;
1501 struct ifrsshash ifrh;
1502 struct ifrsskey ifrk;
1503 int error;
1504 uint32_t my_types, diff_types, mbuf_types = 0;
1505
1506 HN_LOCK_ASSERT(sc);
1507 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
1508 ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));
1509
1510 if (sc->hn_rx_ring_inuse == 1) {
1511 /* No RSS on synthetic parts; done. */
1512 return;
1513 }
1514 if ((sc->hn_rss_hcap & NDIS_HASH_FUNCTION_TOEPLITZ) == 0) {
1515 /* Synthetic parts do not support Toeplitz; done. */
1516 return;
1517 }
1518
1519 ifp = sc->hn_ifp;
1520 vf_ifp = sc->hn_vf_ifp;
1521
1522 /*
1523 * Extract VF's RSS key. Only 40 bytes key for Toeplitz is
1524 * supported.
1525 */
1526 memset(&ifrk, 0, sizeof(ifrk));
1527 strlcpy(ifrk.ifrk_name, vf_ifp->if_xname, sizeof(ifrk.ifrk_name));
1528 error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSKEY, (caddr_t)&ifrk);
1529 if (error) {
1530 if_printf(ifp, "%s SIOCGIFRSSKEY failed: %d\n",
1531 vf_ifp->if_xname, error);
1532 goto done;
1533 }
1534 if (ifrk.ifrk_func != RSS_FUNC_TOEPLITZ) {
1535 if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
1536 vf_ifp->if_xname, ifrk.ifrk_func);
1537 goto done;
1538 }
1539 if (ifrk.ifrk_keylen != NDIS_HASH_KEYSIZE_TOEPLITZ) {
1540 if_printf(ifp, "%s invalid RSS Toeplitz key length %d\n",
1541 vf_ifp->if_xname, ifrk.ifrk_keylen);
1542 goto done;
1543 }
1544
1545 /*
1546 * Extract VF's RSS hash. Only Toeplitz is supported.
1547 */
1548 memset(&ifrh, 0, sizeof(ifrh));
1549 strlcpy(ifrh.ifrh_name, vf_ifp->if_xname, sizeof(ifrh.ifrh_name));
1550 error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSHASH, (caddr_t)&ifrh);
1551 if (error) {
1552 if_printf(ifp, "%s SIOCGRSSHASH failed: %d\n",
1553 vf_ifp->if_xname, error);
1554 goto done;
1555 }
1556 if (ifrh.ifrh_func != RSS_FUNC_TOEPLITZ) {
1557 if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
1558 vf_ifp->if_xname, ifrh.ifrh_func);
1559 goto done;
1560 }
1561
1562 my_types = hn_rss_type_fromndis(sc->hn_rss_hcap);
1563 if ((ifrh.ifrh_types & my_types) == 0) {
		/* No common hash types; this would disable RSS, so ignore it. */
1565 if_printf(ifp, "%s intersection of RSS types failed. "
1566 "VF %#x, mine %#x\n", vf_ifp->if_xname,
1567 ifrh.ifrh_types, my_types);
1568 goto done;
1569 }
1570
1571 diff_types = my_types ^ ifrh.ifrh_types;
1572 my_types &= ifrh.ifrh_types;
1573 mbuf_types = my_types;
1574
1575 /*
	 * Detect RSS hash value/type conflicts.
	 *
	 * NOTE:
	 * We don't disable the hash type, but stop delivering the hash
	 * value/type through mbufs on the RX path.
1581 *
1582 * XXX If HN_CAP_UDPHASH is set in hn_caps, then UDP 4-tuple
1583 * hash is delivered with type of TCP_IPV4. This means if
1584 * UDP_IPV4 is enabled, then TCP_IPV4 should be forced, at
1585 * least to hn_mbuf_hash. However, given that _all_ of the
1586 * NICs implement TCP_IPV4, this will _not_ impose any issues
1587 * here.
1588 */
1589 if ((my_types & RSS_TYPE_IPV4) &&
1590 (diff_types & ifrh.ifrh_types &
1591 (RSS_TYPE_TCP_IPV4 | RSS_TYPE_UDP_IPV4))) {
1592 /* Conflict; disable IPV4 hash type/value delivery. */
1593 if_printf(ifp, "disable IPV4 mbuf hash delivery\n");
1594 mbuf_types &= ~RSS_TYPE_IPV4;
1595 }
1596 if ((my_types & RSS_TYPE_IPV6) &&
1597 (diff_types & ifrh.ifrh_types &
1598 (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
1599 RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
1600 RSS_TYPE_IPV6_EX))) {
1601 /* Conflict; disable IPV6 hash type/value delivery. */
1602 if_printf(ifp, "disable IPV6 mbuf hash delivery\n");
1603 mbuf_types &= ~RSS_TYPE_IPV6;
1604 }
1605 if ((my_types & RSS_TYPE_IPV6_EX) &&
1606 (diff_types & ifrh.ifrh_types &
1607 (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
1608 RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
1609 RSS_TYPE_IPV6))) {
1610 /* Conflict; disable IPV6_EX hash type/value delivery. */
1611 if_printf(ifp, "disable IPV6_EX mbuf hash delivery\n");
1612 mbuf_types &= ~RSS_TYPE_IPV6_EX;
1613 }
1614 if ((my_types & RSS_TYPE_TCP_IPV6) &&
1615 (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6_EX)) {
1616 /* Conflict; disable TCP_IPV6 hash type/value delivery. */
1617 if_printf(ifp, "disable TCP_IPV6 mbuf hash delivery\n");
1618 mbuf_types &= ~RSS_TYPE_TCP_IPV6;
1619 }
1620 if ((my_types & RSS_TYPE_TCP_IPV6_EX) &&
1621 (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6)) {
1622 /* Conflict; disable TCP_IPV6_EX hash type/value delivery. */
1623 if_printf(ifp, "disable TCP_IPV6_EX mbuf hash delivery\n");
1624 mbuf_types &= ~RSS_TYPE_TCP_IPV6_EX;
1625 }
1626 if ((my_types & RSS_TYPE_UDP_IPV6) &&
1627 (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6_EX)) {
1628 /* Conflict; disable UDP_IPV6 hash type/value delivery. */
1629 if_printf(ifp, "disable UDP_IPV6 mbuf hash delivery\n");
1630 mbuf_types &= ~RSS_TYPE_UDP_IPV6;
1631 }
1632 if ((my_types & RSS_TYPE_UDP_IPV6_EX) &&
1633 (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6)) {
1634 /* Conflict; disable UDP_IPV6_EX hash type/value delivery. */
1635 if_printf(ifp, "disable UDP_IPV6_EX mbuf hash delivery\n");
1636 mbuf_types &= ~RSS_TYPE_UDP_IPV6_EX;
1637 }
1638
1639 /*
1640 * Indirect table does not matter.
1641 */
1642
1643 sc->hn_rss_hash = (sc->hn_rss_hcap & NDIS_HASH_FUNCTION_MASK) |
1644 hn_rss_type_tondis(my_types);
1645 memcpy(sc->hn_rss.rss_key, ifrk.ifrk_key, sizeof(sc->hn_rss.rss_key));
1646 sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
1647
1648 if (reconf) {
1649 error = hn_rss_reconfig(sc);
1650 if (error) {
1651 /* XXX roll-back? */
1652 if_printf(ifp, "hn_rss_reconfig failed: %d\n", error);
1653 /* XXX keep going. */
1654 }
1655 }
1656 done:
1657 /* Hash deliverability for mbufs. */
1658 hn_rss_mbuf_hash(sc, hn_rss_type_tondis(mbuf_types));
1659 }
1660
1661 static void
1662 hn_vf_rss_restore(struct hn_softc *sc)
1663 {
1664
1665 HN_LOCK_ASSERT(sc);
1666 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
1667 ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));
1668
1669 if (sc->hn_rx_ring_inuse == 1)
1670 goto done;
1671
1672 /*
1673 * Restore hash types. Key does _not_ matter.
1674 */
1675 if (sc->hn_rss_hash != sc->hn_rss_hcap) {
1676 int error;
1677
1678 sc->hn_rss_hash = sc->hn_rss_hcap;
1679 error = hn_rss_reconfig(sc);
1680 if (error) {
1681 if_printf(sc->hn_ifp, "hn_rss_reconfig failed: %d\n",
1682 error);
1683 /* XXX keep going. */
1684 }
1685 }
1686 done:
1687 /* Hash deliverability for mbufs. */
1688 hn_rss_mbuf_hash(sc, NDIS_HASH_ALL);
1689 }
1690
1691 static void
1692 hn_xpnt_vf_setready(struct hn_softc *sc)
1693 {
1694 struct ifnet *ifp, *vf_ifp;
1695 struct ifreq ifr;
1696
1697 HN_LOCK_ASSERT(sc);
1698 ifp = sc->hn_ifp;
1699 vf_ifp = sc->hn_vf_ifp;
1700
1701 /*
1702 * Mark the VF ready.
1703 */
1704 sc->hn_vf_rdytick = 0;
1705
1706 /*
1707 * Save information for restoration.
1708 */
1709 sc->hn_saved_caps = ifp->if_capabilities;
1710 sc->hn_saved_tsomax = ifp->if_hw_tsomax;
1711 sc->hn_saved_tsosegcnt = ifp->if_hw_tsomaxsegcount;
1712 sc->hn_saved_tsosegsz = ifp->if_hw_tsomaxsegsize;
1713
1714 /*
1715 * Intersect supported/enabled capabilities.
1716 *
1717 * NOTE:
1718 * if_hwassist is not changed here.
1719 */
1720 ifp->if_capabilities &= vf_ifp->if_capabilities;
1721 ifp->if_capenable &= ifp->if_capabilities;
1722
1723 /*
1724 * Fix TSO settings.
1725 */
1726 if (ifp->if_hw_tsomax > vf_ifp->if_hw_tsomax)
1727 ifp->if_hw_tsomax = vf_ifp->if_hw_tsomax;
1728 if (ifp->if_hw_tsomaxsegcount > vf_ifp->if_hw_tsomaxsegcount)
1729 ifp->if_hw_tsomaxsegcount = vf_ifp->if_hw_tsomaxsegcount;
1730 if (ifp->if_hw_tsomaxsegsize > vf_ifp->if_hw_tsomaxsegsize)
1731 ifp->if_hw_tsomaxsegsize = vf_ifp->if_hw_tsomaxsegsize;
1732
1733 /*
1734 * Change VF's enabled capabilities.
1735 */
1736 memset(&ifr, 0, sizeof(ifr));
1737 strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1738 ifr.ifr_reqcap = ifp->if_capenable;
1739 hn_xpnt_vf_iocsetcaps(sc, &ifr);
1740
1741 if (ifp->if_mtu != ETHERMTU) {
1742 int error;
1743
1744 /*
1745 * Change VF's MTU.
1746 */
1747 memset(&ifr, 0, sizeof(ifr));
1748 strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1749 ifr.ifr_mtu = ifp->if_mtu;
1750 error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, (caddr_t)&ifr);
1751 if (error) {
1752 if_printf(ifp, "%s SIOCSIFMTU %u failed\n",
1753 vf_ifp->if_xname, ifp->if_mtu);
1754 if (ifp->if_mtu > ETHERMTU) {
1755 if_printf(ifp, "change MTU to %d\n", ETHERMTU);
1756
1757 /*
1758 * XXX
1759 * No need to adjust the synthetic parts' MTU;
1760 * failure of the adjustment will cause us
1761 * infinite headache.
1762 */
1763 ifp->if_mtu = ETHERMTU;
1764 hn_mtu_change_fixup(sc);
1765 }
1766 }
1767 }
1768 }
1769
1770 static bool
1771 hn_xpnt_vf_isready(struct hn_softc *sc)
1772 {
1773
1774 HN_LOCK_ASSERT(sc);
1775
1776 if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL)
1777 return (false);
1778
1779 if (sc->hn_vf_rdytick == 0)
1780 return (true);
1781
1782 if (sc->hn_vf_rdytick > ticks)
1783 return (false);
1784
1785 /* Mark VF as ready. */
1786 hn_xpnt_vf_setready(sc);
1787 return (true);
1788 }
1789
1790 static void
1791 hn_xpnt_vf_setenable(struct hn_softc *sc)
1792 {
1793 int i;
1794
1795 HN_LOCK_ASSERT(sc);
1796
1797 /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1798 rm_wlock(&sc->hn_vf_lock);
1799 sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED;
1800 rm_wunlock(&sc->hn_vf_lock);
1801
1802 for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1803 sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_XPNT_VF;
1804 }
1805
1806 static void
1807 hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf)
1808 {
1809 int i;
1810
1811 HN_LOCK_ASSERT(sc);
1812
1813 /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1814 rm_wlock(&sc->hn_vf_lock);
1815 sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED;
1816 if (clear_vf)
1817 sc->hn_vf_ifp = NULL;
1818 rm_wunlock(&sc->hn_vf_lock);
1819
1820 for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1821 sc->hn_rx_ring[i].hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF;
1822 }
1823
1824 static void
1825 hn_xpnt_vf_init(struct hn_softc *sc)
1826 {
1827 int error;
1828
1829 HN_LOCK_ASSERT(sc);
1830
1831 KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
1832 ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
1833
1834 if (bootverbose) {
1835 if_printf(sc->hn_ifp, "try bringing up %s\n",
1836 sc->hn_vf_ifp->if_xname);
1837 }
1838
1839 /*
1840 * Bring the VF up.
1841 */
1842 hn_xpnt_vf_saveifflags(sc);
1843 sc->hn_vf_ifp->if_flags |= IFF_UP;
1844 error = hn_xpnt_vf_iocsetflags(sc);
1845 if (error) {
1846 if_printf(sc->hn_ifp, "bringing up %s failed: %d\n",
1847 sc->hn_vf_ifp->if_xname, error);
1848 return;
1849 }
1850
1851 /*
1852 * NOTE:
1853 * Datapath setting must happen _after_ bringing the VF up.
1854 */
1855 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
1856
1857 /*
1858 * NOTE:
1859	 * Fix up RSS-related bits _after_ the VF is brought up, since
1860	 * many VFs generate their RSS keys during initialization.
1861 */
1862 hn_vf_rss_fixup(sc, true);
1863
1864 /* Mark transparent mode VF as enabled. */
1865 hn_xpnt_vf_setenable(sc);
1866 }
1867
1868 static void
1869 hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused)
1870 {
1871 struct hn_softc *sc = xsc;
1872
1873 HN_LOCK(sc);
1874
1875 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
1876 goto done;
1877 if (sc->hn_vf_ifp == NULL)
1878 goto done;
1879 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
1880 goto done;
1881
1882 if (sc->hn_vf_rdytick != 0) {
1883 /* Mark VF as ready. */
1884 hn_xpnt_vf_setready(sc);
1885 }
1886
1887 if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) {
1888 /*
1889 * Delayed VF initialization.
1890 */
1891 if (bootverbose) {
1892 if_printf(sc->hn_ifp, "delayed initialize %s\n",
1893 sc->hn_vf_ifp->if_xname);
1894 }
1895 hn_xpnt_vf_init(sc);
1896 }
1897 done:
1898 HN_UNLOCK(sc);
1899 }
1900
1901 static void
1902 hn_ifnet_attevent(void *xsc, struct ifnet *ifp)
1903 {
1904 struct hn_softc *sc = xsc;
1905
1906 HN_LOCK(sc);
1907
1908 if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
1909 goto done;
1910
1911 if (!hn_ismyvf(sc, ifp))
1912 goto done;
1913
1914 if (sc->hn_vf_ifp != NULL) {
1915 if_printf(sc->hn_ifp, "%s was attached as VF\n",
1916 sc->hn_vf_ifp->if_xname);
1917 goto done;
1918 }
1919
1920 if (hn_xpnt_vf && ifp->if_start != NULL) {
1921 /*
1922 * ifnet.if_start is _not_ supported by transparent
1923 * mode VF; mainly due to the IFF_DRV_OACTIVE flag.
1924 */
1925 if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported "
1926 "in transparent VF mode.\n", ifp->if_xname);
1927 goto done;
1928 }
1929
1930 rm_wlock(&hn_vfmap_lock);
1931
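	/*
	 * Grow the global VF map on demand; it maps a VF's ifnet index to
	 * the owning hn(4) ifnet.
	 */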
1932 if (ifp->if_index >= hn_vfmap_size) {
1933 struct ifnet **newmap;
1934 int newsize;
1935
1936 newsize = ifp->if_index + HN_VFMAP_SIZE_DEF;
1937 newmap = malloc(sizeof(struct ifnet *) * newsize, M_DEVBUF,
1938 M_WAITOK | M_ZERO);
1939
1940 memcpy(newmap, hn_vfmap,
1941 sizeof(struct ifnet *) * hn_vfmap_size);
1942 free(hn_vfmap, M_DEVBUF);
1943 hn_vfmap = newmap;
1944 hn_vfmap_size = newsize;
1945 }
1946 KASSERT(hn_vfmap[ifp->if_index] == NULL,
1947 ("%s: ifindex %d was mapped to %s",
1948 ifp->if_xname, ifp->if_index, hn_vfmap[ifp->if_index]->if_xname));
1949 hn_vfmap[ifp->if_index] = sc->hn_ifp;
1950
1951 rm_wunlock(&hn_vfmap_lock);
1952
1953 /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1954 rm_wlock(&sc->hn_vf_lock);
1955 KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
1956 ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
1957 sc->hn_vf_ifp = ifp;
1958 rm_wunlock(&sc->hn_vf_lock);
1959
1960 if (hn_xpnt_vf) {
1961 int wait_ticks;
1962
1963 /*
1964 * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp.
1965 * Save vf_ifp's current if_input for later restoration.
1966 */
1967 sc->hn_vf_input = ifp->if_input;
1968 ifp->if_input = hn_xpnt_vf_input;
1969
1970 /*
1971 * Stop link status management; use the VF's.
1972 */
1973 hn_suspend_mgmt(sc);
1974
1975 /*
1976		 * Give the VF some time to complete its attach routine.
1977 */
1978 wait_ticks = hn_xpnt_vf_attwait * hz;
1979 sc->hn_vf_rdytick = ticks + wait_ticks;
1980
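		/*
		 * Schedule the delayed bring-up; hn_xpnt_vf_init_taskfunc()
		 * runs once the grace period has elapsed.
		 */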
1981 taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init,
1982 wait_ticks);
1983 }
1984 done:
1985 HN_UNLOCK(sc);
1986 }
1987
1988 static void
1989 hn_ifnet_detevent(void *xsc, struct ifnet *ifp)
1990 {
1991 struct hn_softc *sc = xsc;
1992
1993 HN_LOCK(sc);
1994
1995 if (sc->hn_vf_ifp == NULL)
1996 goto done;
1997
1998 if (!hn_ismyvf(sc, ifp))
1999 goto done;
2000
2001 if (hn_xpnt_vf) {
2002 /*
2003 * Make sure that the delayed initialization is not running.
2004 *
2005 * NOTE:
2006 * - This lock _must_ be released, since the hn_vf_init task
2007 * will try holding this lock.
2008 * - It is safe to release this lock here, since the
2009 * hn_ifnet_attevent() is interlocked by the hn_vf_ifp.
2010 *
2011 * XXX racy, if hn(4) ever detached.
2012 */
2013 HN_UNLOCK(sc);
2014 taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init);
2015 HN_LOCK(sc);
2016
2017 KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved",
2018 sc->hn_ifp->if_xname));
2019 ifp->if_input = sc->hn_vf_input;
2020 sc->hn_vf_input = NULL;
2021
2022 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) &&
2023 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED))
2024 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
2025
2026 if (sc->hn_vf_rdytick == 0) {
2027 /*
2028 * The VF was ready; restore some settings.
2029 */
2030 sc->hn_ifp->if_capabilities = sc->hn_saved_caps;
2031 /*
2032 * NOTE:
2033 * There is _no_ need to fixup if_capenable and
2034 * if_hwassist, since the if_capabilities before
2035 * restoration was an intersection of the VF's
2036			 * if_capabilities and the synthetic device's
2037			 * if_capabilities.
2038 */
2039 sc->hn_ifp->if_hw_tsomax = sc->hn_saved_tsomax;
2040 sc->hn_ifp->if_hw_tsomaxsegcount =
2041 sc->hn_saved_tsosegcnt;
2042 sc->hn_ifp->if_hw_tsomaxsegsize = sc->hn_saved_tsosegsz;
2043 }
2044
2045 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
2046 /*
2047 * Restore RSS settings.
2048 */
2049 hn_vf_rss_restore(sc);
2050
2051 /*
2052 * Resume link status management, which was suspended
2053 * by hn_ifnet_attevent().
2054 */
2055 hn_resume_mgmt(sc);
2056 }
2057 }
2058
2059 /* Mark transparent mode VF as disabled. */
2060 hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */);
2061
2062 rm_wlock(&hn_vfmap_lock);
2063
2064 KASSERT(ifp->if_index < hn_vfmap_size,
2065 ("ifindex %d, vfmapsize %d", ifp->if_index, hn_vfmap_size));
2066 if (hn_vfmap[ifp->if_index] != NULL) {
2067 KASSERT(hn_vfmap[ifp->if_index] == sc->hn_ifp,
2068 ("%s: ifindex %d was mapped to %s",
2069 ifp->if_xname, ifp->if_index,
2070 hn_vfmap[ifp->if_index]->if_xname));
2071 hn_vfmap[ifp->if_index] = NULL;
2072 }
2073
2074 rm_wunlock(&hn_vfmap_lock);
2075 done:
2076 HN_UNLOCK(sc);
2077 }
2078
2079 static void
2080 hn_ifnet_lnkevent(void *xsc, struct ifnet *ifp, int link_state)
2081 {
2082 struct hn_softc *sc = xsc;
2083
2084 if (sc->hn_vf_ifp == ifp)
2085 if_link_state_change(sc->hn_ifp, link_state);
2086 }
2087
2088 static int
2089 hn_probe(device_t dev)
2090 {
2091
2092 if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) == 0) {
2093 device_set_desc(dev, "Hyper-V Network Interface");
2094 return BUS_PROBE_DEFAULT;
2095 }
2096 return ENXIO;
2097 }
2098
2099 static int
2100 hn_attach(device_t dev)
2101 {
2102 struct hn_softc *sc = device_get_softc(dev);
2103 struct sysctl_oid_list *child;
2104 struct sysctl_ctx_list *ctx;
2105 uint8_t eaddr[ETHER_ADDR_LEN];
2106 struct ifnet *ifp = NULL;
2107 int error, ring_cnt, tx_ring_cnt;
2108 uint32_t mtu;
2109
2110 sc->hn_dev = dev;
2111 sc->hn_prichan = vmbus_get_channel(dev);
2112 HN_LOCK_INIT(sc);
2113 rm_init(&sc->hn_vf_lock, "hnvf");
2114 if (hn_xpnt_vf && hn_xpnt_vf_accbpf)
2115 sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
2116
2117 /*
2118 * Initialize these tunables once.
2119 */
2120 sc->hn_agg_size = hn_tx_agg_size;
2121 sc->hn_agg_pkts = hn_tx_agg_pkts;
2122
2123 /*
2124 * Setup taskqueue for transmission.
2125 */
2126 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
2127 int i;
2128
2129 sc->hn_tx_taskqs =
2130 malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
2131 M_DEVBUF, M_WAITOK);
2132 for (i = 0; i < hn_tx_taskq_cnt; ++i) {
2133 sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
2134 M_WAITOK, taskqueue_thread_enqueue,
2135 &sc->hn_tx_taskqs[i]);
2136 taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
2137 "%s tx%d", device_get_nameunit(dev), i);
2138 }
2139 } else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
2140 sc->hn_tx_taskqs = hn_tx_taskque;
2141 }
2142
2143 /*
2144	 * Set up the taskqueue for management tasks, e.g. link status.
2145 */
2146 sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
2147 taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
2148 taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
2149 device_get_nameunit(dev));
2150 TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
2151 TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
2152 TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
2153 hn_netchg_status_taskfunc, sc);
2154
2155 if (hn_xpnt_vf) {
2156 /*
2157		 * Set up the taskqueue for VF tasks, e.g. delayed VF bring-up.
2158 */
2159 sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK,
2160 taskqueue_thread_enqueue, &sc->hn_vf_taskq);
2161 taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf",
2162 device_get_nameunit(dev));
2163 TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0,
2164 hn_xpnt_vf_init_taskfunc, sc);
2165 }
2166
2167 /*
2168	 * Allocate the ifnet and set up its name early, so that if_printf
2169	 * can be used by functions that are called before
2170	 * ether_ifattach().
2171 */
2172 ifp = sc->hn_ifp = if_alloc(IFT_ETHER);
2173 ifp->if_softc = sc;
2174 if_initname(ifp, device_get_name(dev), device_get_unit(dev));
2175
2176 /*
2177 * Initialize ifmedia earlier so that it can be unconditionally
2178	 * destroyed if an error happens later on.
2179 */
2180 ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);
2181
2182 /*
2183 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
2184 * to use (tx_ring_cnt).
2185 *
2186 * NOTE:
2187	 * The # of RX rings to use is the same as the # of channels to use.
2188 */
2189 ring_cnt = hn_chan_cnt;
2190 if (ring_cnt <= 0) {
2191 /* Default */
2192 ring_cnt = mp_ncpus;
2193 if (ring_cnt > HN_RING_CNT_DEF_MAX)
2194 ring_cnt = HN_RING_CNT_DEF_MAX;
2195 } else if (ring_cnt > mp_ncpus) {
2196 ring_cnt = mp_ncpus;
2197 }
2198 #ifdef RSS
2199 if (ring_cnt > rss_getnumbuckets())
2200 ring_cnt = rss_getnumbuckets();
2201 #endif
2202
2203 tx_ring_cnt = hn_tx_ring_cnt;
2204 if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
2205 tx_ring_cnt = ring_cnt;
2206 #ifdef HN_IFSTART_SUPPORT
2207 if (hn_use_if_start) {
2208 /* ifnet.if_start only needs one TX ring. */
2209 tx_ring_cnt = 1;
2210 }
2211 #endif
2212
2213 /*
2214 * Set the leader CPU for channels.
2215 */
2216 sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;
2217
2218 /*
2219	 * Create enough TX/RX rings, even if only a limited number of
2220	 * channels can be allocated.
2221 */
2222 error = hn_create_tx_data(sc, tx_ring_cnt);
2223 if (error)
2224 goto failed;
2225 error = hn_create_rx_data(sc, ring_cnt);
2226 if (error)
2227 goto failed;
2228
2229 /*
2230 * Create transaction context for NVS and RNDIS transactions.
2231 */
2232 sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
2233 HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
2234 if (sc->hn_xact == NULL) {
2235 error = ENXIO;
2236 goto failed;
2237 }
2238
2239 /*
2240 * Install orphan handler for the revocation of this device's
2241 * primary channel.
2242 *
2243 * NOTE:
2244 * The processing order is critical here:
2245 * Install the orphan handler, _before_ testing whether this
2246 * device's primary channel has been revoked or not.
2247 */
2248 vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
2249 if (vmbus_chan_is_revoked(sc->hn_prichan)) {
2250 error = ENXIO;
2251 goto failed;
2252 }
2253
2254 /*
2255 * Attach the synthetic parts, i.e. NVS and RNDIS.
2256 */
2257 error = hn_synth_attach(sc, ETHERMTU);
2258 if (error)
2259 goto failed;
2260
2261 error = hn_rndis_get_eaddr(sc, eaddr);
2262 if (error)
2263 goto failed;
2264
2265 error = hn_rndis_get_mtu(sc, &mtu);
2266 if (error)
2267 mtu = ETHERMTU;
2268 else if (bootverbose)
2269 device_printf(dev, "RNDIS mtu %u\n", mtu);
2270
2271 #if __FreeBSD_version >= 1100099
2272 if (sc->hn_rx_ring_inuse > 1) {
2273 /*
2274 * Reduce TCP segment aggregation limit for multiple
2275 * RX rings to increase ACK timeliness.
2276 */
2277 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
2278 }
2279 #endif
2280
2281 /*
2282	 * Fix up TX/RX settings after the synthetic parts are attached.
2283 */
2284 hn_fixup_tx_data(sc);
2285 hn_fixup_rx_data(sc);
2286
2287 ctx = device_get_sysctl_ctx(dev);
2288 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
2289 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
2290 &sc->hn_nvs_ver, 0, "NVS version");
2291 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
2292 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2293 hn_ndis_version_sysctl, "A", "NDIS version");
2294 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
2295 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2296 hn_caps_sysctl, "A", "capabilities");
2297 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
2298 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2299 hn_hwassist_sysctl, "A", "hwassist");
2300 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_max",
2301 CTLFLAG_RD, &ifp->if_hw_tsomax, 0, "max TSO size");
2302 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegcnt",
2303 CTLFLAG_RD, &ifp->if_hw_tsomaxsegcount, 0,
2304 "max # of TSO segments");
2305 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegsz",
2306 CTLFLAG_RD, &ifp->if_hw_tsomaxsegsize, 0,
2307 "max size of TSO segment");
2308 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
2309 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2310 hn_rxfilter_sysctl, "A", "rxfilter");
2311 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
2312 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2313 hn_rss_hash_sysctl, "A", "RSS hash");
2314 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hashcap",
2315 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2316 hn_rss_hcap_sysctl, "A", "RSS hash capabilities");
2317 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "mbuf_hash",
2318 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2319 hn_rss_mbuf_sysctl, "A", "RSS hash for mbufs");
2320 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
2321 CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
2322 #ifndef RSS
2323 /*
2324	 * Don't allow RSS key/indirect table changes when the kernel RSS option is defined.
2325 */
2326 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
2327 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2328 hn_rss_key_sysctl, "IU", "RSS key");
2329 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
2330 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2331 hn_rss_ind_sysctl, "IU", "RSS indirect table");
2332 #endif
2333 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
2334 CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
2335 "RNDIS offered packet transmission aggregation size limit");
2336 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
2337 CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
2338 "RNDIS offered packet transmission aggregation count limit");
2339 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
2340 CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
2341 "RNDIS packet transmission aggregation alignment");
2342 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
2343 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2344 hn_txagg_size_sysctl, "I",
2345 "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
2346 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
2347 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2348 hn_txagg_pkts_sysctl, "I",
2349 "Packet transmission aggregation packets, "
2350 "0 -- disable, -1 -- auto");
2351 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling",
2352 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2353 hn_polling_sysctl, "I",
2354 "Polling frequency: [100,1000000], 0 disable polling");
2355 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf",
2356 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2357 hn_vf_sysctl, "A", "Virtual Function's name");
2358 if (!hn_xpnt_vf) {
2359 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf",
2360 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2361 hn_rxvf_sysctl, "A", "activated Virtual Function's name");
2362 } else {
2363 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled",
2364 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2365 hn_xpnt_vf_enabled_sysctl, "I",
2366 "Transparent VF enabled");
2367 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf",
2368 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2369 hn_xpnt_vf_accbpf_sysctl, "I",
2370 "Accurate BPF for transparent VF");
2371 }
2372
2373 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rsc_switch",
2374 CTLTYPE_UINT | CTLFLAG_RW, sc, 0, hn_rsc_sysctl, "A",
2375 "switch to rsc");
2376
2377 /*
2378 * Setup the ifmedia, which has been initialized earlier.
2379 */
2380 ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
2381 ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
2382 /* XXX ifmedia_set really should do this for us */
2383 sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;
2384
2385 /*
2386 * Setup the ifnet for this interface.
2387 */
2388
2389 ifp->if_baudrate = IF_Gbps(10);
2390 ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
2391 ifp->if_ioctl = hn_ioctl;
2392 ifp->if_init = hn_init;
2393 #ifdef HN_IFSTART_SUPPORT
2394 if (hn_use_if_start) {
2395 int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);
2396
2397 ifp->if_start = hn_start;
2398 IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
2399 ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
2400 IFQ_SET_READY(&ifp->if_snd);
2401 } else
2402 #endif
2403 {
2404 ifp->if_transmit = hn_transmit;
2405 ifp->if_qflush = hn_xmit_qflush;
2406 }
2407
2408 ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE;
2409 #ifdef foo
2410	/* We can't distinguish IPv6 packets from IPv4 packets on the RX path. */
2411 ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
2412 #endif
2413 if (sc->hn_caps & HN_CAP_VLAN) {
2414 /* XXX not sure about VLAN_MTU. */
2415 ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
2416 }
2417
2418 ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
2419 if (ifp->if_hwassist & HN_CSUM_IP_MASK)
2420 ifp->if_capabilities |= IFCAP_TXCSUM;
2421 if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
2422 ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
2423 if (sc->hn_caps & HN_CAP_TSO4) {
2424 ifp->if_capabilities |= IFCAP_TSO4;
2425 ifp->if_hwassist |= CSUM_IP_TSO;
2426 }
2427 if (sc->hn_caps & HN_CAP_TSO6) {
2428 ifp->if_capabilities |= IFCAP_TSO6;
2429 ifp->if_hwassist |= CSUM_IP6_TSO;
2430 }
2431
2432 /* Enable all available capabilities by default. */
2433 ifp->if_capenable = ifp->if_capabilities;
2434
2435 /*
2436 * Disable IPv6 TSO and TXCSUM by default, they still can
2437 * be enabled through SIOCSIFCAP.
2438 */
2439 ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
2440 ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);
2441
2442 if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
2443 /*
2444 * Lock hn_set_tso_maxsize() to simplify its
2445 * internal logic.
2446 */
2447 HN_LOCK(sc);
2448 hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
2449 HN_UNLOCK(sc);
2450 ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
2451 ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
2452 }
2453
2454 ether_ifattach(ifp, eaddr);
2455
2456 if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
2457 if_printf(ifp, "TSO segcnt %u segsz %u\n",
2458 ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
2459 }
2460 if (mtu < ETHERMTU) {
2461 if_printf(ifp, "fixup mtu %u -> %u\n", ifp->if_mtu, mtu);
2462 ifp->if_mtu = mtu;
2463 }
2464
2465 /* Inform the upper layer about the long frame support. */
2466 ifp->if_hdrlen = sizeof(struct ether_vlan_header);
2467
2468 /*
2469 * Kick off link status check.
2470 */
2471 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
2472 hn_update_link_status(sc);
2473
2474 if (!hn_xpnt_vf) {
2475 sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event,
2476 hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY);
2477 sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event,
2478 hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY);
2479 } else {
2480 sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event,
2481 hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY);
2482 }
2483
2484 /*
2485 * NOTE:
2486	 * Subscribe to the ether_ifattach event, instead of the ifnet_arrival
2487	 * event, since the interface's LLADDR is needed; the LLADDR is not yet
2488	 * available when the ifnet_arrival event is triggered.
2489 */
2490 sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event,
2491 hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY);
2492 sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event,
2493 hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY);
2494
2495 return (0);
2496 failed:
2497 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
2498 hn_synth_detach(sc);
2499 hn_detach(dev);
2500 return (error);
2501 }
2502
2503 static int
2504 hn_detach(device_t dev)
2505 {
2506 struct hn_softc *sc = device_get_softc(dev);
2507 struct ifnet *ifp = sc->hn_ifp, *vf_ifp;
2508
2509 if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
2510 /*
2511		 * In case the vmbus missed the orphan handler
2512		 * installation.
2513 */
2514 vmbus_xact_ctx_orphan(sc->hn_xact);
2515 }
2516
2517 if (sc->hn_ifaddr_evthand != NULL)
2518 EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand);
2519 if (sc->hn_ifnet_evthand != NULL)
2520 EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand);
2521 if (sc->hn_ifnet_atthand != NULL) {
2522 EVENTHANDLER_DEREGISTER(ether_ifattach_event,
2523 sc->hn_ifnet_atthand);
2524 }
2525 if (sc->hn_ifnet_dethand != NULL) {
2526 EVENTHANDLER_DEREGISTER(ifnet_departure_event,
2527 sc->hn_ifnet_dethand);
2528 }
2529 if (sc->hn_ifnet_lnkhand != NULL)
2530 EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand);
2531
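	/*
	 * Snapshot hn_vf_ifp; the compiler barrier keeps the load from
	 * being reordered or re-read around the NULL check below.
	 */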
2532 vf_ifp = sc->hn_vf_ifp;
2533 __compiler_membar();
2534 if (vf_ifp != NULL)
2535 hn_ifnet_detevent(sc, vf_ifp);
2536
2537 if (device_is_attached(dev)) {
2538 HN_LOCK(sc);
2539 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
2540 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2541 hn_stop(sc, true);
2542 /*
2543 * NOTE:
2544			 * hn_stop() only suspends the data path, so the
2545			 * management tasks have to be suspended manually here.
2546 */
2547 hn_suspend_mgmt(sc);
2548 hn_synth_detach(sc);
2549 }
2550 HN_UNLOCK(sc);
2551 ether_ifdetach(ifp);
2552 }
2553
2554 ifmedia_removeall(&sc->hn_media);
2555 hn_destroy_rx_data(sc);
2556 hn_destroy_tx_data(sc);
2557
2558 if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
2559 int i;
2560
2561 for (i = 0; i < hn_tx_taskq_cnt; ++i)
2562 taskqueue_free(sc->hn_tx_taskqs[i]);
2563 free(sc->hn_tx_taskqs, M_DEVBUF);
2564 }
2565 taskqueue_free(sc->hn_mgmt_taskq0);
2566 if (sc->hn_vf_taskq != NULL)
2567 taskqueue_free(sc->hn_vf_taskq);
2568
2569 if (sc->hn_xact != NULL) {
2570 /*
2571 * Uninstall the orphan handler _before_ the xact is
2572 * destructed.
2573 */
2574 vmbus_chan_unset_orphan(sc->hn_prichan);
2575 vmbus_xact_ctx_destroy(sc->hn_xact);
2576 }
2577
2578 if_free(ifp);
2579
2580 HN_LOCK_DESTROY(sc);
2581 rm_destroy(&sc->hn_vf_lock);
2582 return (0);
2583 }
2584
2585 static int
2586 hn_shutdown(device_t dev)
2587 {
2588
2589 return (0);
2590 }
2591
2592 static void
2593 hn_link_status(struct hn_softc *sc)
2594 {
2595 uint32_t link_status;
2596 int error;
2597
2598 error = hn_rndis_get_linkstatus(sc, &link_status);
2599 if (error) {
2600 /* XXX what to do? */
2601 return;
2602 }
2603
2604 if (link_status == NDIS_MEDIA_STATE_CONNECTED)
2605 sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
2606 else
2607 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2608 if_link_state_change(sc->hn_ifp,
2609 (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
2610 LINK_STATE_UP : LINK_STATE_DOWN);
2611 }
2612
2613 static void
2614 hn_link_taskfunc(void *xsc, int pending __unused)
2615 {
2616 struct hn_softc *sc = xsc;
2617
2618 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
2619 return;
2620 hn_link_status(sc);
2621 }
2622
2623 static void
2624 hn_netchg_init_taskfunc(void *xsc, int pending __unused)
2625 {
2626 struct hn_softc *sc = xsc;
2627
2628 /* Prevent any link status checks from running. */
2629 sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;
2630
2631 /*
2632	 * Fake up a [link down --> link up] state change; a 5-second
2633	 * delay is used, which closely simulates the miibus reaction
2634	 * to a link down event.
2635 */
2636 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2637 if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
2638 taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
2639 &sc->hn_netchg_status, 5 * hz);
2640 }
2641
2642 static void
2643 hn_netchg_status_taskfunc(void *xsc, int pending __unused)
2644 {
2645 struct hn_softc *sc = xsc;
2646
2647 /* Re-allow link status checks. */
2648 sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
2649 hn_link_status(sc);
2650 }
2651
2652 static void
2653 hn_update_link_status(struct hn_softc *sc)
2654 {
2655
2656 if (sc->hn_mgmt_taskq != NULL)
2657 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
2658 }
2659
2660 static void
2661 hn_change_network(struct hn_softc *sc)
2662 {
2663
2664 if (sc->hn_mgmt_taskq != NULL)
2665 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
2666 }
2667
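/*
 * Load the mbuf chain into the txdesc's DMA map; if the chain has too
 * many segments, collapse it once and retry the load.
 */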
2668 static __inline int
2669 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
2670 struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
2671 {
2672 struct mbuf *m = *m_head;
2673 int error;
2674
2675 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));
2676
2677 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
2678 m, segs, nsegs, BUS_DMA_NOWAIT);
2679 if (error == EFBIG) {
2680 struct mbuf *m_new;
2681
2682 m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
2683 if (m_new == NULL)
2684 return ENOBUFS;
2685 else
2686 *m_head = m = m_new;
2687 txr->hn_tx_collapsed++;
2688
2689 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
2690 txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
2691 }
2692 if (!error) {
2693 bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
2694 BUS_DMASYNC_PREWRITE);
2695 txd->flags |= HN_TXD_FLAG_DMAMAP;
2696 }
2697 return error;
2698 }
2699
2700 static __inline int
2701 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
2702 {
2703
2704 KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
2705 ("put an onlist txd %#x", txd->flags));
2706 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2707 ("put an onagg txd %#x", txd->flags));
2708
2709 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2710 if (atomic_fetchadd_int(&txd->refs, -1) != 1)
2711 return 0;
2712
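	/*
	 * Last reference dropped: first release any txdescs that were
	 * aggregated onto this one, then reclaim this txdesc's resources.
	 */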
2713 if (!STAILQ_EMPTY(&txd->agg_list)) {
2714 struct hn_txdesc *tmp_txd;
2715
2716 while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
2717 int freed __diagused;
2718
2719 KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
2720			    ("recursive aggregation on aggregated txdesc"));
2721 KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
2722 ("not aggregated txdesc"));
2723 KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2724 ("aggregated txdesc uses dmamap"));
2725 KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
2726 ("aggregated txdesc consumes "
2727 "chimney sending buffer"));
2728 KASSERT(tmp_txd->chim_size == 0,
2729 ("aggregated txdesc has non-zero "
2730 "chimney sending size"));
2731
2732 STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
2733 tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
2734 freed = hn_txdesc_put(txr, tmp_txd);
2735 KASSERT(freed, ("failed to free aggregated txdesc"));
2736 }
2737 }
2738
2739 if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
2740 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2741 ("chim txd uses dmamap"));
2742 hn_chim_free(txr->hn_sc, txd->chim_index);
2743 txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
2744 txd->chim_size = 0;
2745 } else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
2746 bus_dmamap_sync(txr->hn_tx_data_dtag,
2747 txd->data_dmap, BUS_DMASYNC_POSTWRITE);
2748 bus_dmamap_unload(txr->hn_tx_data_dtag,
2749 txd->data_dmap);
2750 txd->flags &= ~HN_TXD_FLAG_DMAMAP;
2751 }
2752
2753 if (txd->m != NULL) {
2754 m_freem(txd->m);
2755 txd->m = NULL;
2756 }
2757
2758 txd->flags |= HN_TXD_FLAG_ONLIST;
2759 #ifndef HN_USE_TXDESC_BUFRING
2760 mtx_lock_spin(&txr->hn_txlist_spin);
2761 KASSERT(txr->hn_txdesc_avail >= 0 &&
2762 txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
2763 ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
2764 txr->hn_txdesc_avail++;
2765 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
2766 mtx_unlock_spin(&txr->hn_txlist_spin);
2767 #else /* HN_USE_TXDESC_BUFRING */
2768 #ifdef HN_DEBUG
2769 atomic_add_int(&txr->hn_txdesc_avail, 1);
2770 #endif
2771 buf_ring_enqueue(txr->hn_txdesc_br, txd);
2772 #endif /* !HN_USE_TXDESC_BUFRING */
2773
2774 return 1;
2775 }
2776
2777 static __inline struct hn_txdesc *
2778 hn_txdesc_get(struct hn_tx_ring *txr)
2779 {
2780 struct hn_txdesc *txd;
2781
2782 #ifndef HN_USE_TXDESC_BUFRING
2783 mtx_lock_spin(&txr->hn_txlist_spin);
2784 txd = SLIST_FIRST(&txr->hn_txlist);
2785 if (txd != NULL) {
2786 KASSERT(txr->hn_txdesc_avail > 0,
2787 ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
2788 txr->hn_txdesc_avail--;
2789 SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
2790 }
2791 mtx_unlock_spin(&txr->hn_txlist_spin);
2792 #else
2793 txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
2794 #endif
2795
2796 if (txd != NULL) {
2797 #ifdef HN_USE_TXDESC_BUFRING
2798 #ifdef HN_DEBUG
2799 atomic_subtract_int(&txr->hn_txdesc_avail, 1);
2800 #endif
2801 #endif /* HN_USE_TXDESC_BUFRING */
2802 KASSERT(txd->m == NULL && txd->refs == 0 &&
2803 STAILQ_EMPTY(&txd->agg_list) &&
2804 txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
2805 txd->chim_size == 0 &&
2806 (txd->flags & HN_TXD_FLAG_ONLIST) &&
2807 (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
2808 (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
2809 txd->flags &= ~HN_TXD_FLAG_ONLIST;
2810 txd->refs = 1;
2811 }
2812 return txd;
2813 }
2814
2815 static __inline void
2816 hn_txdesc_hold(struct hn_txdesc *txd)
2817 {
2818
2819 /* 0->1 transition will never work */
2820 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2821 atomic_add_int(&txd->refs, 1);
2822 }
2823
2824 static __inline void
2825 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
2826 {
2827
2828 KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2829 ("recursive aggregation on aggregating txdesc"));
2830
2831 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2832 ("already aggregated"));
2833 KASSERT(STAILQ_EMPTY(&txd->agg_list),
2834 ("recursive aggregation on to-be-aggregated txdesc"));
2835
2836 txd->flags |= HN_TXD_FLAG_ONAGG;
2837 STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
2838 }
2839
2840 static bool
2841 hn_tx_ring_pending(struct hn_tx_ring *txr)
2842 {
2843 bool pending = false;
2844
2845 #ifndef HN_USE_TXDESC_BUFRING
2846 mtx_lock_spin(&txr->hn_txlist_spin);
2847 if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
2848 pending = true;
2849 mtx_unlock_spin(&txr->hn_txlist_spin);
2850 #else
2851 if (!buf_ring_full(txr->hn_txdesc_br))
2852 pending = true;
2853 #endif
2854 return (pending);
2855 }
2856
2857 static __inline void
2858 hn_txeof(struct hn_tx_ring *txr)
2859 {
2860 txr->hn_has_txeof = 0;
2861 txr->hn_txeof(txr);
2862 }
2863
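/*
 * NVS send-completion callback: release the txdesc and, once enough
 * completions have accumulated, run the TX completion processing early.
 */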
2864 static void
2865 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
2866 struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
2867 {
2868 struct hn_txdesc *txd = sndc->hn_cbarg;
2869 struct hn_tx_ring *txr;
2870
2871 txr = txd->txr;
2872 KASSERT(txr->hn_chan == chan,
2873 ("channel mismatch, on chan%u, should be chan%u",
2874 vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));
2875
2876 txr->hn_has_txeof = 1;
2877 hn_txdesc_put(txr, txd);
2878
2879 ++txr->hn_txdone_cnt;
2880 if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
2881 txr->hn_txdone_cnt = 0;
2882 if (txr->hn_oactive)
2883 hn_txeof(txr);
2884 }
2885 }
2886
2887 static void
2888 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
2889 {
2890 #if defined(INET) || defined(INET6)
2891 struct epoch_tracker et;
2892
2893 NET_EPOCH_ENTER(et);
2894 tcp_lro_flush_all(&rxr->hn_lro);
2895 NET_EPOCH_EXIT(et);
2896 #endif
2897
2898 /*
2899 * NOTE:
2900	 * 'txr' could be NULL if multiple channels and the
2901	 * ifnet.if_start method are enabled.
2902 */
2903 if (txr == NULL || !txr->hn_has_txeof)
2904 return;
2905
2906 txr->hn_txdone_cnt = 0;
2907 hn_txeof(txr);
2908 }
2909
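/*
 * Convert an offset counted from the beginning of the RNDIS packet
 * message into the on-wire form, which is counted from the
 * rm_dataoffset field.
 */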
2910 static __inline uint32_t
2911 hn_rndis_pktmsg_offset(uint32_t ofs)
2912 {
2913
2914 KASSERT(ofs >= sizeof(struct rndis_packet_msg),
2915 ("invalid RNDIS packet msg offset %u", ofs));
2916 return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
2917 }
2918
2919 static __inline void *
2920 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
2921 size_t pi_dlen, uint32_t pi_type)
2922 {
2923 const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
2924 struct rndis_pktinfo *pi;
2925
2926 KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
2927 ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));
2928
2929 /*
2930 * Per-packet-info does not move; it only grows.
2931 *
2932 * NOTE:
2933 * rm_pktinfooffset in this phase counts from the beginning
2934 * of rndis_packet_msg.
2935 */
2936 KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
2937 ("%u pktinfo overflows RNDIS packet msg", pi_type));
2938 pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
2939 pkt->rm_pktinfolen);
2940 pkt->rm_pktinfolen += pi_size;
2941
2942 pi->rm_size = pi_size;
2943 pi->rm_type = pi_type;
2944 pi->rm_internal = 0;
2945 pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;
2946
2947 return (pi->rm_data);
2948 }
2949
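/*
 * Send the txdesc currently being aggregated on this TX ring and
 * reset the ring's aggregation state.
 */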
2950 static __inline int
2951 hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
2952 {
2953 struct hn_txdesc *txd;
2954 struct mbuf *m;
2955 int error, pkts;
2956
2957 txd = txr->hn_agg_txd;
2958 KASSERT(txd != NULL, ("no aggregate txdesc"));
2959
2960 /*
2961 * Since hn_txpkt() will reset this temporary stat, save
2962 * it now, so that oerrors can be updated properly, if
2963 * hn_txpkt() ever fails.
2964 */
2965 pkts = txr->hn_stat_pkts;
2966
2967 /*
2968 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
2969 * failure, save it for later freeing, if hn_txpkt() ever
2970 * fails.
2971 */
2972 m = txd->m;
2973 error = hn_txpkt(ifp, txr, txd);
2974 if (__predict_false(error)) {
2975 /* txd is freed, but m is not. */
2976 m_freem(m);
2977
2978 txr->hn_flush_failed++;
2979 if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
2980 }
2981
2982 /* Reset all aggregation states. */
2983 txr->hn_agg_txd = NULL;
2984 txr->hn_agg_szleft = 0;
2985 txr->hn_agg_pktleft = 0;
2986 txr->hn_agg_prevpkt = NULL;
2987
2988 return (error);
2989 }
2990
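/*
 * Try to send this packet through the chimney (pre-posted send) buffer,
 * aggregating it onto a pending txdesc when possible.  Returns a pointer
 * into the chimney buffer where the RNDIS packet should be composed, or
 * NULL if chimney sending cannot be used.
 */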
2991 static void *
2992 hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
2993 int pktsize)
2994 {
2995 void *chim;
2996
2997 if (txr->hn_agg_txd != NULL) {
2998 if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
2999 struct hn_txdesc *agg_txd = txr->hn_agg_txd;
3000 struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
3001 int olen;
3002
3003 /*
3004			 * Update the previous RNDIS packet's total length;
3005			 * it can be increased due to the mandatory alignment
3006 * padding for this RNDIS packet. And update the
3007 * aggregating txdesc's chimney sending buffer size
3008 * accordingly.
3009 *
3010 * XXX
3011 * Zero-out the padding, as required by the RNDIS spec.
3012 */
3013 olen = pkt->rm_len;
3014 pkt->rm_len = roundup2(olen, txr->hn_agg_align);
3015 agg_txd->chim_size += pkt->rm_len - olen;
3016
3017 /* Link this txdesc to the parent. */
3018 hn_txdesc_agg(agg_txd, txd);
3019
3020 chim = (uint8_t *)pkt + pkt->rm_len;
3021 /* Save the current packet for later fixup. */
3022 txr->hn_agg_prevpkt = chim;
3023
3024 txr->hn_agg_pktleft--;
3025 txr->hn_agg_szleft -= pktsize;
3026 if (txr->hn_agg_szleft <=
3027 HN_PKTSIZE_MIN(txr->hn_agg_align)) {
3028 /*
3029 * Probably can't aggregate more packets,
3030 * flush this aggregating txdesc proactively.
3031 */
3032 txr->hn_agg_pktleft = 0;
3033 }
3034 /* Done! */
3035 return (chim);
3036 }
3037 hn_flush_txagg(ifp, txr);
3038 }
3039 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
3040
3041 txr->hn_tx_chimney_tried++;
3042 txd->chim_index = hn_chim_alloc(txr->hn_sc);
3043 if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
3044 return (NULL);
3045 txr->hn_tx_chimney++;
3046
3047 chim = txr->hn_sc->hn_chim +
3048 (txd->chim_index * txr->hn_sc->hn_chim_szmax);
3049
3050 if (txr->hn_agg_pktmax > 1 &&
3051 txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
3052 txr->hn_agg_txd = txd;
3053 txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
3054 txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
3055 txr->hn_agg_prevpkt = chim;
3056 }
3057 return (chim);
3058 }
3059
3060 /*
3061 * NOTE:
3062 * If this function fails, then both txd and m_head0 will be freed.
3063 */
3064 static int
3065 hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
3066 struct mbuf **m_head0)
3067 {
3068 bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
3069 int error, nsegs, i;
3070 struct mbuf *m_head = *m_head0;
3071 struct rndis_packet_msg *pkt;
3072 uint32_t *pi_data;
3073 void *chim = NULL;
3074 int pkt_hlen, pkt_size;
3075
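	/*
	 * Choose between chimney sending (copy the packet into the
	 * pre-posted send buffer) and SG sending (DMA-map the mbuf chain),
	 * based on the packet size.
	 */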
3076 pkt = txd->rndis_pkt;
3077 pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
3078 if (pkt_size < txr->hn_chim_size) {
3079 chim = hn_try_txagg(ifp, txr, txd, pkt_size);
3080 if (chim != NULL)
3081 pkt = chim;
3082 } else {
3083 if (txr->hn_agg_txd != NULL)
3084 hn_flush_txagg(ifp, txr);
3085 }
3086
3087 pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
3088 pkt->rm_len = m_head->m_pkthdr.len;
3089 pkt->rm_dataoffset = 0;
3090 pkt->rm_datalen = m_head->m_pkthdr.len;
3091 pkt->rm_oobdataoffset = 0;
3092 pkt->rm_oobdatalen = 0;
3093 pkt->rm_oobdataelements = 0;
3094 pkt->rm_pktinfooffset = sizeof(*pkt);
3095 pkt->rm_pktinfolen = 0;
3096 pkt->rm_vchandle = 0;
3097 pkt->rm_reserved = 0;
3098
3099 if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
3100 /*
3101 * Set the hash value for this packet.
3102 */
3103 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3104 HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
3105
3106 if (M_HASHTYPE_ISHASH(m_head))
3107 /*
3108			 * The flowid field contains the hash value the host
3109			 * supplied on the RX path, if this is an IP forwarding pkt.
3110			 * Set the same hash value so the host can send it on
3111			 * the CPU where it was received.
3112 */
3113 *pi_data = m_head->m_pkthdr.flowid;
3114 else
3115 /*
3116 * Otherwise just put the tx queue index.
3117 */
3118 *pi_data = txr->hn_tx_idx;
3119 }
3120
3121 if (m_head->m_flags & M_VLANTAG) {
3122 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3123 NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
3124 *pi_data = NDIS_VLAN_INFO_MAKE(
3125 EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
3126 EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
3127 EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
3128 }
3129
3130 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
3131 #if defined(INET6) || defined(INET)
3132 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3133 NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
3134 #ifdef INET
3135 if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
3136 *pi_data = NDIS_LSO2_INFO_MAKEIPV4(
3137 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3138 m_head->m_pkthdr.tso_segsz);
3139 }
3140 #endif
3141 #if defined(INET6) && defined(INET)
3142 else
3143 #endif
3144 #ifdef INET6
3145 {
3146 *pi_data = NDIS_LSO2_INFO_MAKEIPV6(
3147 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3148 m_head->m_pkthdr.tso_segsz);
3149 }
3150 #endif
3151 #endif /* INET6 || INET */
3152 } else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
3153 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3154 NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
3155 if (m_head->m_pkthdr.csum_flags &
3156 (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
3157 *pi_data = NDIS_TXCSUM_INFO_IPV6;
3158 } else {
3159 *pi_data = NDIS_TXCSUM_INFO_IPV4;
3160 if (m_head->m_pkthdr.csum_flags & CSUM_IP)
3161 *pi_data |= NDIS_TXCSUM_INFO_IPCS;
3162 }
3163
3164 if (m_head->m_pkthdr.csum_flags &
3165 (CSUM_IP_TCP | CSUM_IP6_TCP)) {
3166 *pi_data |= NDIS_TXCSUM_INFO_MKTCPCS(
3167 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3168 } else if (m_head->m_pkthdr.csum_flags &
3169 (CSUM_IP_UDP | CSUM_IP6_UDP)) {
3170 *pi_data |= NDIS_TXCSUM_INFO_MKUDPCS(
3171 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3172 }
3173 }
3174
3175 pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
3176 /* Fixup RNDIS packet message total length */
3177 pkt->rm_len += pkt_hlen;
3178 /* Convert RNDIS packet message offsets */
3179 pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen);
3180 pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
3181
3182 /*
3183 * Fast path: Chimney sending.
3184 */
3185 if (chim != NULL) {
3186 struct hn_txdesc *tgt_txd = txd;
3187
3188 if (txr->hn_agg_txd != NULL) {
3189 tgt_txd = txr->hn_agg_txd;
3190 #ifdef INVARIANTS
3191 *m_head0 = NULL;
3192 #endif
3193 }
3194
3195 KASSERT(pkt == chim,
3196 ("RNDIS pkt not in chimney sending buffer"));
3197 KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
3198 ("chimney sending buffer is not used"));
3199 tgt_txd->chim_size += pkt->rm_len;
3200
3201 m_copydata(m_head, 0, m_head->m_pkthdr.len,
3202 ((uint8_t *)chim) + pkt_hlen);
3203
3204 txr->hn_gpa_cnt = 0;
3205 txr->hn_sendpkt = hn_txpkt_chim;
3206 goto done;
3207 }
3208
3209 KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
3210 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
3211 ("chimney buffer is used"));
3212 KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));
3213
3214 error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
3215 if (__predict_false(error)) {
3216 int freed __diagused;
3217
3218 /*
3219 * This mbuf is not linked w/ the txd yet, so free it now.
3220 */
3221 m_freem(m_head);
3222 *m_head0 = NULL;
3223
3224 freed = hn_txdesc_put(txr, txd);
3225 KASSERT(freed != 0,
3226 ("fail to free txd upon txdma error"));
3227
3228 txr->hn_txdma_failed++;
3229 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
3230 return error;
3231 }
3232 *m_head0 = m_head;
3233
3234 /* +1 RNDIS packet message */
3235 txr->hn_gpa_cnt = nsegs + 1;
3236
3237 /* send packet with page buffer */
3238 txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
3239 txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
3240 txr->hn_gpa[0].gpa_len = pkt_hlen;
3241
3242 /*
3243 * Fill the page buffers with mbuf info after the page
3244 * buffer for RNDIS packet message.
3245 */
3246 for (i = 0; i < nsegs; ++i) {
3247 struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];
3248
3249 gpa->gpa_page = atop(segs[i].ds_addr);
3250 gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
3251 gpa->gpa_len = segs[i].ds_len;
3252 }
3253
3254 txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
3255 txd->chim_size = 0;
3256 txr->hn_sendpkt = hn_txpkt_sglist;
3257 done:
3258 txd->m = m_head;
3259
3260 /* Set the completion routine */
3261 hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);
3262
3263 /* Update temporary stats for later use. */
3264 txr->hn_stat_pkts++;
3265 txr->hn_stat_size += m_head->m_pkthdr.len;
3266 if (m_head->m_flags & M_MCAST)
3267 txr->hn_stat_mcasts++;
3268
3269 return 0;
3270 }
3271
3272 /*
3273 * NOTE:
3274 * If this function fails, then txd will be freed, but the mbuf
3275 * associated w/ the txd will _not_ be freed.
3276 */
3277 static int
3278 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
3279 {
3280 int error, send_failed = 0, has_bpf;
3281
3282 again:
3283 has_bpf = bpf_peers_present(ifp->if_bpf);
3284 if (has_bpf) {
3285 /*
3286 * Make sure that this txd and any aggregated txds are not
3287 * freed before ETHER_BPF_MTAP.
3288 */
3289 hn_txdesc_hold(txd);
3290 }
3291 error = txr->hn_sendpkt(txr, txd);
3292 if (!error) {
3293 if (has_bpf) {
3294 const struct hn_txdesc *tmp_txd;
3295
3296 ETHER_BPF_MTAP(ifp, txd->m);
3297 STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
3298 ETHER_BPF_MTAP(ifp, tmp_txd->m);
3299 }
3300
3301 if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
3302 #ifdef HN_IFSTART_SUPPORT
3303 if (!hn_use_if_start)
3304 #endif
3305 {
3306 if_inc_counter(ifp, IFCOUNTER_OBYTES,
3307 txr->hn_stat_size);
3308 if (txr->hn_stat_mcasts != 0) {
3309 if_inc_counter(ifp, IFCOUNTER_OMCASTS,
3310 txr->hn_stat_mcasts);
3311 }
3312 }
3313 txr->hn_pkts += txr->hn_stat_pkts;
3314 txr->hn_sends++;
3315 }
3316 if (has_bpf)
3317 hn_txdesc_put(txr, txd);
3318
3319 if (__predict_false(error)) {
3320 int freed __diagused;
3321
3322 /*
3323 * This should "really rarely" happen.
3324 *
3325 * XXX Too many RX to be acked or too many sideband
3326 * commands to run? Ask netvsc_channel_rollup()
3327 * to kick start later.
3328 */
3329 txr->hn_has_txeof = 1;
3330 if (!send_failed) {
3331 txr->hn_send_failed++;
3332 send_failed = 1;
3333 /*
3334			 * Try sending again after setting hn_has_txeof,
3335			 * in case we missed the last
3336 * netvsc_channel_rollup().
3337 */
3338 goto again;
3339 }
3340 if_printf(ifp, "send failed\n");
3341
3342 /*
3343 * Caller will perform further processing on the
3344 * associated mbuf, so don't free it in hn_txdesc_put();
3345 * only unload it from the DMA map in hn_txdesc_put(),
3346 * if it was loaded.
3347 */
3348 txd->m = NULL;
3349 freed = hn_txdesc_put(txr, txd);
3350 KASSERT(freed != 0,
3351 ("fail to free txd upon send error"));
3352
3353 txr->hn_send_failed++;
3354 }
3355
3356 /* Reset temporary stats, after this sending is done. */
3357 txr->hn_stat_size = 0;
3358 txr->hn_stat_pkts = 0;
3359 txr->hn_stat_mcasts = 0;
3360
3361 return (error);
3362 }
3363
3364 /*
3365 * Append the specified data to the indicated mbuf chain.
3366 * Extend the mbuf chain if the new data does not fit in
3367 * existing space.
3368 *
3369 * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
3370 * There should be an equivalent in the kernel mbuf code,
3371 * but there does not appear to be one yet.
3372 *
3373 * Differs from m_append() in that additional mbufs are
3374 * allocated with cluster size MJUMPAGESIZE, and filled
3375 * accordingly.
3376 *
3377 * Return the last mbuf in the chain, or NULL if a new mbuf
3378 * could not be allocated.
3379 */
3380 static struct mbuf *
3381 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
3382 {
3383 struct mbuf *m, *n;
3384 int remainder, space;
3385
3386 for (m = m0; m->m_next != NULL; m = m->m_next)
3387 ;
3388 remainder = len;
3389 space = M_TRAILINGSPACE(m);
3390 if (space > 0) {
3391 /*
3392 * Copy into available space.
3393 */
3394 if (space > remainder)
3395 space = remainder;
3396 bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
3397 m->m_len += space;
3398 cp += space;
3399 remainder -= space;
3400 }
3401 while (remainder > 0) {
3402 /*
3403 * Allocate a new mbuf; could check space
3404 * and allocate a cluster instead.
3405 */
3406 n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE);
3407 if (n == NULL)
3408 return NULL;
3409 n->m_len = min(MJUMPAGESIZE, remainder);
3410 bcopy(cp, mtod(n, caddr_t), n->m_len);
3411 cp += n->m_len;
3412 remainder -= n->m_len;
3413 m->m_next = n;
3414 m = n;
3415 }
3416
3417 return m;
3418 }
3419
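/*
 * Feed an mbuf to LRO: queue it when the LRO mbuf queue is enabled,
 * otherwise run the per-packet LRO path directly.
 */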
3420 #if defined(INET) || defined(INET6)
3421 static __inline int
3422 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
3423 {
3424 #if __FreeBSD_version >= 1100095
3425 if (hn_lro_mbufq_depth) {
3426 tcp_lro_queue_mbuf(lc, m);
3427 return 0;
3428 }
3429 #endif
3430 return tcp_lro_rx(lc, m, 0);
3431 }
3432 #endif
3433
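/*
 * Convert the received RSC fragments on this RX ring into an mbuf,
 * attach RX checksum/VLAN/RSS metadata, and hand the packet to either
 * LRO or the ifnet input routine.
 */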
3434 static int
3435 hn_rxpkt(struct hn_rx_ring *rxr)
3436 {
3437 struct ifnet *ifp, *hn_ifp = rxr->hn_ifp;
3438 struct mbuf *m_new, *n;
3439 int size, do_lro = 0, do_csum = 1, is_vf = 0;
3440 int hash_type = M_HASHTYPE_NONE;
3441 int l3proto = ETHERTYPE_MAX, l4proto = IPPROTO_DONE;
3442 int i;
3443
3444 ifp = hn_ifp;
3445 if (rxr->hn_rxvf_ifp != NULL) {
3446 /*
3447 * Non-transparent mode VF; pretend this packet is from
3448 * the VF.
3449 */
3450 ifp = rxr->hn_rxvf_ifp;
3451 is_vf = 1;
3452 } else if (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF) {
3453 /* Transparent mode VF. */
3454 is_vf = 1;
3455 }
3456
3457 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
3458 /*
3459 * NOTE:
3460 * See the NOTE of hn_rndis_init_fixat(). This
3461		 * function can be reached immediately after the
3462		 * RNDIS is initialized but before the ifnet is
3463		 * set up on the hn_attach() path; drop the unexpected
3464 * packets.
3465 */
3466 return (0);
3467 }
3468
3469 if (__predict_false(rxr->rsc.pktlen < ETHER_HDR_LEN)) {
3470 if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1);
3471 return (0);
3472 }
3473
3474 if (rxr->rsc.cnt == 1 && rxr->rsc.pktlen <= MHLEN) {
3475 m_new = m_gethdr(M_NOWAIT, MT_DATA);
3476 if (m_new == NULL) {
3477 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3478 return (0);
3479 }
3480 memcpy(mtod(m_new, void *), rxr->rsc.frag_data[0],
3481 rxr->rsc.frag_len[0]);
3482 m_new->m_pkthdr.len = m_new->m_len = rxr->rsc.frag_len[0];
3483 } else {
3484 /*
3485 * Get an mbuf with a cluster. For packets 2K or less,
3486 * get a standard 2K cluster. For anything larger, get a
3487 * 4K cluster. Any buffers larger than 4K can cause problems
3488 * if looped around to the Hyper-V TX channel, so avoid them.
3489 */
3490 size = MCLBYTES;
3491 if (rxr->rsc.pktlen > MCLBYTES) {
3492 /* 4096 */
3493 size = MJUMPAGESIZE;
3494 }
3495
3496 m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
3497 if (m_new == NULL) {
3498 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3499 return (0);
3500 }
3501
3502 n = m_new;
3503 for (i = 0; i < rxr->rsc.cnt; i++) {
3504 n = hv_m_append(n, rxr->rsc.frag_len[i],
3505 rxr->rsc.frag_data[i]);
3506 if (n == NULL) {
3507 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3508 return (0);
3509 } else {
3510 m_new->m_pkthdr.len += rxr->rsc.frag_len[i];
3511 }
3512 }
3513 }
3514 if (rxr->rsc.pktlen <= MHLEN)
3515 rxr->hn_small_pkts++;
3516
3517 m_new->m_pkthdr.rcvif = ifp;
3518
3519 if (__predict_false((hn_ifp->if_capenable & IFCAP_RXCSUM) == 0))
3520 do_csum = 0;
3521
3522 /* receive side checksum offload */
3523 if (rxr->rsc.csum_info != NULL) {
3524 /* IP csum offload */
3525 if ((*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
3526 m_new->m_pkthdr.csum_flags |=
3527 (CSUM_IP_CHECKED | CSUM_IP_VALID);
3528 rxr->hn_csum_ip++;
3529 }
3530
3531 /* TCP/UDP csum offload */
3532 if ((*(rxr->rsc.csum_info) & (NDIS_RXCSUM_INFO_UDPCS_OK |
3533 NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
3534 m_new->m_pkthdr.csum_flags |=
3535 (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3536 m_new->m_pkthdr.csum_data = 0xffff;
3537 if (*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_TCPCS_OK)
3538 rxr->hn_csum_tcp++;
3539 else
3540 rxr->hn_csum_udp++;
3541 }
3542
3543 /*
3544 * XXX
3545		 * As of this writing (Oct 28th, 2016), the host side turns
3546 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
3547 * the do_lro setting here is actually _not_ accurate. We
3548 * depend on the RSS hash type check to reset do_lro.
3549 */
3550 if ((*(rxr->rsc.csum_info) &
3551 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
3552 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
3553 do_lro = 1;
3554 } else {
3555 hn_rxpkt_proto(m_new, &l3proto, &l4proto);
3556 if (l3proto == ETHERTYPE_IP) {
3557 if (l4proto == IPPROTO_TCP) {
3558 if (do_csum &&
3559 (rxr->hn_trust_hcsum &
3560 HN_TRUST_HCSUM_TCP)) {
3561 rxr->hn_csum_trusted++;
3562 m_new->m_pkthdr.csum_flags |=
3563 (CSUM_IP_CHECKED | CSUM_IP_VALID |
3564 CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3565 m_new->m_pkthdr.csum_data = 0xffff;
3566 }
3567 do_lro = 1;
3568 } else if (l4proto == IPPROTO_UDP) {
3569 if (do_csum &&
3570 (rxr->hn_trust_hcsum &
3571 HN_TRUST_HCSUM_UDP)) {
3572 rxr->hn_csum_trusted++;
3573 m_new->m_pkthdr.csum_flags |=
3574 (CSUM_IP_CHECKED | CSUM_IP_VALID |
3575 CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3576 m_new->m_pkthdr.csum_data = 0xffff;
3577 }
3578 } else if (l4proto != IPPROTO_DONE && do_csum &&
3579 (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
3580 rxr->hn_csum_trusted++;
3581 m_new->m_pkthdr.csum_flags |=
3582 (CSUM_IP_CHECKED | CSUM_IP_VALID);
3583 }
3584 }
3585 }
3586
3587 if (rxr->rsc.vlan_info != NULL) {
3588 m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
3589 NDIS_VLAN_INFO_ID(*(rxr->rsc.vlan_info)),
3590 NDIS_VLAN_INFO_PRI(*(rxr->rsc.vlan_info)),
3591 NDIS_VLAN_INFO_CFI(*(rxr->rsc.vlan_info)));
3592 m_new->m_flags |= M_VLANTAG;
3593 }
3594
3595 /*
3596	 * If VF is activated (transparent/non-transparent mode does not
3597 * matter here).
3598 *
3599 * - Disable LRO
3600 *
3601 * hn(4) will only receive broadcast packets, multicast packets,
3602	 * TCP SYN and SYN|ACK (in Azure); LRO is useless for these
3603	 * packet types.
3604 *
3605 * For non-transparent, we definitely _cannot_ enable LRO at
3606 * all, since the LRO flush will use hn(4) as the receiving
3607 * interface; i.e. hn_ifp->if_input(hn_ifp, m).
3608 */
3609 if (is_vf)
3610 do_lro = 0;
3611
3612 /*
3613	 * If VF is activated (transparent/non-transparent mode does not
3614 * matter here), do _not_ mess with unsupported hash types or
3615 * functions.
3616 */
3617 if (rxr->rsc.hash_info != NULL) {
3618 rxr->hn_rss_pkts++;
3619 m_new->m_pkthdr.flowid = *(rxr->rsc.hash_value);
3620 if (!is_vf)
3621 hash_type = M_HASHTYPE_OPAQUE_HASH;
3622 if ((*(rxr->rsc.hash_info) & NDIS_HASH_FUNCTION_MASK) ==
3623 NDIS_HASH_FUNCTION_TOEPLITZ) {
3624 uint32_t type = (*(rxr->rsc.hash_info) & NDIS_HASH_TYPE_MASK &
3625 rxr->hn_mbuf_hash);
3626
3627 /*
3628 * NOTE:
3629			 * do_lro is reset if the hash types are not TCP
3630 * related. See the comment in the above csum_flags
3631 * setup section.
3632 */
3633 switch (type) {
3634 case NDIS_HASH_IPV4:
3635 hash_type = M_HASHTYPE_RSS_IPV4;
3636 do_lro = 0;
3637 break;
3638
3639 case NDIS_HASH_TCP_IPV4:
3640 hash_type = M_HASHTYPE_RSS_TCP_IPV4;
3641 if (rxr->hn_rx_flags & HN_RX_FLAG_UDP_HASH) {
3642 int def_htype = M_HASHTYPE_OPAQUE_HASH;
3643
3644 if (is_vf)
3645 def_htype = M_HASHTYPE_NONE;
3646
3647 /*
3648 * UDP 4-tuple hash is delivered as
3649 * TCP 4-tuple hash.
3650 */
3651 if (l3proto == ETHERTYPE_MAX) {
3652 hn_rxpkt_proto(m_new,
3653 &l3proto, &l4proto);
3654 }
3655 if (l3proto == ETHERTYPE_IP) {
3656 if (l4proto == IPPROTO_UDP &&
3657 (rxr->hn_mbuf_hash &
3658 NDIS_HASH_UDP_IPV4_X)) {
3659 hash_type =
3660 M_HASHTYPE_RSS_UDP_IPV4;
3661 do_lro = 0;
3662 } else if (l4proto !=
3663 IPPROTO_TCP) {
3664 hash_type = def_htype;
3665 do_lro = 0;
3666 }
3667 } else {
3668 hash_type = def_htype;
3669 do_lro = 0;
3670 }
3671 }
3672 break;
3673
3674 case NDIS_HASH_IPV6:
3675 hash_type = M_HASHTYPE_RSS_IPV6;
3676 do_lro = 0;
3677 break;
3678
3679 case NDIS_HASH_IPV6_EX:
3680 hash_type = M_HASHTYPE_RSS_IPV6_EX;
3681 do_lro = 0;
3682 break;
3683
3684 case NDIS_HASH_TCP_IPV6:
3685 hash_type = M_HASHTYPE_RSS_TCP_IPV6;
3686 break;
3687
3688 case NDIS_HASH_TCP_IPV6_EX:
3689 hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
3690 break;
3691 }
3692 }
3693 } else if (!is_vf) {
3694 m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
3695 hash_type = M_HASHTYPE_OPAQUE;
3696 }
3697 M_HASHTYPE_SET(m_new, hash_type);
3698
3699 if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
3700 if (hn_ifp != ifp) {
3701 const struct ether_header *eh;
3702
3703 /*
3704 * Non-transparent mode VF is activated.
3705 */
3706
3707 /*
3708 * Allow tapping on hn(4).
3709 */
3710 ETHER_BPF_MTAP(hn_ifp, m_new);
3711
3712 /*
3713 * Update hn(4)'s stats.
3714 */
3715 if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
3716 if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len);
3717 /* Checked at the beginning of this function. */
3718 KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame"));
3719 eh = mtod(m_new, struct ether_header *);
3720 if (ETHER_IS_MULTICAST(eh->ether_dhost))
3721 if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1);
3722 }
3723 rxr->hn_pkts++;
3724
3725 if ((hn_ifp->if_capenable & IFCAP_LRO) && do_lro) {
3726 #if defined(INET) || defined(INET6)
3727 struct lro_ctrl *lro = &rxr->hn_lro;
3728
3729 if (lro->lro_cnt) {
3730 rxr->hn_lro_tried++;
3731 if (hn_lro_rx(lro, m_new) == 0) {
3732 /* DONE! */
3733 return 0;
3734 }
3735 }
3736 #endif
3737 }
3738 ifp->if_input(ifp, m_new);
3739
3740 return (0);
3741 }
3742
3743 static int
3744 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
3745 {
3746 struct hn_softc *sc = ifp->if_softc;
3747 struct ifreq *ifr = (struct ifreq *)data, ifr_vf;
3748 struct ifnet *vf_ifp;
3749 int mask, error = 0;
3750 struct ifrsskey *ifrk;
3751 struct ifrsshash *ifrh;
3752 uint32_t mtu;
3753
3754 switch (cmd) {
3755 case SIOCSIFMTU:
3756 if (ifr->ifr_mtu > HN_MTU_MAX) {
3757 error = EINVAL;
3758 break;
3759 }
3760
3761 HN_LOCK(sc);
3762
3763 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3764 HN_UNLOCK(sc);
3765 break;
3766 }
3767
3768 if ((sc->hn_caps & HN_CAP_MTU) == 0) {
3769 /* Can't change MTU */
3770 HN_UNLOCK(sc);
3771 error = EOPNOTSUPP;
3772 break;
3773 }
3774
3775 if (ifp->if_mtu == ifr->ifr_mtu) {
3776 HN_UNLOCK(sc);
3777 break;
3778 }
3779
3780 if (hn_xpnt_vf_isready(sc)) {
3781 vf_ifp = sc->hn_vf_ifp;
3782 ifr_vf = *ifr;
3783 strlcpy(ifr_vf.ifr_name, vf_ifp->if_xname,
3784 sizeof(ifr_vf.ifr_name));
3785 error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU,
3786 (caddr_t)&ifr_vf);
3787 if (error) {
3788 HN_UNLOCK(sc);
3789 if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n",
3790 vf_ifp->if_xname, ifr->ifr_mtu, error);
3791 break;
3792 }
3793 }
3794
3795 /*
3796 * Suspend this interface before the synthetic parts
3797 * are torn down.
3798 */
3799 hn_suspend(sc);
3800
3801 /*
3802 * Detach the synthetic parts, i.e. NVS and RNDIS.
3803 */
3804 hn_synth_detach(sc);
3805
3806 /*
3807 * Reattach the synthetic parts, i.e. NVS and RNDIS,
3808 * with the new MTU setting.
3809 */
3810 error = hn_synth_attach(sc, ifr->ifr_mtu);
3811 if (error) {
3812 HN_UNLOCK(sc);
3813 break;
3814 }
3815
3816 error = hn_rndis_get_mtu(sc, &mtu);
3817 if (error)
3818 mtu = ifr->ifr_mtu;
3819 else if (bootverbose)
3820 if_printf(ifp, "RNDIS mtu %u\n", mtu);
3821
3822 /*
3823 * Commit the requested MTU, after the synthetic parts
3824 * have been successfully attached.
3825 */
3826 if (mtu >= ifr->ifr_mtu) {
3827 mtu = ifr->ifr_mtu;
3828 } else {
3829 if_printf(ifp, "fixup mtu %d -> %u\n",
3830 ifr->ifr_mtu, mtu);
3831 }
3832 ifp->if_mtu = mtu;
3833
3834 /*
3835 * Synthetic parts' reattach may change the chimney
3836 * sending size; update it.
3837 */
3838 if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
3839 hn_set_chim_size(sc, sc->hn_chim_szmax);
3840
3841 /*
3842 * Make sure that various parameters based on MTU are
3843 * still valid, after the MTU change.
3844 */
3845 hn_mtu_change_fixup(sc);
3846
3847 /*
3848 * All done! Resume the interface now.
3849 */
3850 hn_resume(sc);
3851
3852 if ((sc->hn_flags & HN_FLAG_RXVF) ||
3853 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
3854 /*
3855 * Since we have reattached the NVS part,
3856 * switch the datapath back to the VF; the
3857 * setting was lost when the NVS was detached.
3858 */
3859 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
3860 }
3861
3862 HN_UNLOCK(sc);
3863 break;
3864
3865 case SIOCSIFFLAGS:
3866 HN_LOCK(sc);
3867
3868 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3869 HN_UNLOCK(sc);
3870 break;
3871 }
3872
3873 if (hn_xpnt_vf_isready(sc))
3874 hn_xpnt_vf_saveifflags(sc);
3875
3876 if (ifp->if_flags & IFF_UP) {
3877 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3878 /*
3879 * Caller might hold a mutex, e.g.
3880 * bpf; use busy-wait for the RNDIS
3881 * reply.
3882 */
3883 HN_NO_SLEEPING(sc);
3884 hn_rxfilter_config(sc);
3885 HN_SLEEPING_OK(sc);
3886
3887 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
3888 error = hn_xpnt_vf_iocsetflags(sc);
3889 } else {
3890 hn_init_locked(sc);
3891 }
3892 } else {
3893 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
3894 hn_stop(sc, false);
3895 }
3896 sc->hn_if_flags = ifp->if_flags;
3897
3898 HN_UNLOCK(sc);
3899 break;
3900
3901 case SIOCSIFCAP:
3902 HN_LOCK(sc);
3903
3904 if (hn_xpnt_vf_isready(sc)) {
3905 ifr_vf = *ifr;
3906 strlcpy(ifr_vf.ifr_name, sc->hn_vf_ifp->if_xname,
3907 sizeof(ifr_vf.ifr_name));
3908 error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf);
3909 HN_UNLOCK(sc);
3910 break;
3911 }
3912
3913 /*
3914 * Fix up requested capabilities w/ supported capabilities,
3915 * since the supported capabilities could have been changed.
3916 */
3917 mask = (ifr->ifr_reqcap & ifp->if_capabilities) ^
3918 ifp->if_capenable;
3919
3920 if (mask & IFCAP_TXCSUM) {
3921 ifp->if_capenable ^= IFCAP_TXCSUM;
3922 if (ifp->if_capenable & IFCAP_TXCSUM)
3923 ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
3924 else
3925 ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
3926 }
3927 if (mask & IFCAP_TXCSUM_IPV6) {
3928 ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
3929 if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
3930 ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
3931 else
3932 ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
3933 }
3934
3935 /* TODO: flip RNDIS offload parameters for RXCSUM. */
3936 if (mask & IFCAP_RXCSUM)
3937 ifp->if_capenable ^= IFCAP_RXCSUM;
3938 #ifdef foo
3939 /* We can't distinguish IPv6 from IPv4 packets on the RX path. */
3940 if (mask & IFCAP_RXCSUM_IPV6)
3941 ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
3942 #endif
3943
3944 if (mask & IFCAP_LRO)
3945 ifp->if_capenable ^= IFCAP_LRO;
3946
3947 if (mask & IFCAP_TSO4) {
3948 ifp->if_capenable ^= IFCAP_TSO4;
3949 if (ifp->if_capenable & IFCAP_TSO4)
3950 ifp->if_hwassist |= CSUM_IP_TSO;
3951 else
3952 ifp->if_hwassist &= ~CSUM_IP_TSO;
3953 }
3954 if (mask & IFCAP_TSO6) {
3955 ifp->if_capenable ^= IFCAP_TSO6;
3956 if (ifp->if_capenable & IFCAP_TSO6)
3957 ifp->if_hwassist |= CSUM_IP6_TSO;
3958 else
3959 ifp->if_hwassist &= ~CSUM_IP6_TSO;
3960 }
3961
3962 HN_UNLOCK(sc);
3963 break;
3964
3965 case SIOCADDMULTI:
3966 case SIOCDELMULTI:
3967 HN_LOCK(sc);
3968
3969 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3970 HN_UNLOCK(sc);
3971 break;
3972 }
3973 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3974 /*
3975 * Multicast maintenance may hold a mutex; use
3976 * busy-wait for the RNDIS reply.
3977 */
3978 HN_NO_SLEEPING(sc);
3979 hn_rxfilter_config(sc);
3980 HN_SLEEPING_OK(sc);
3981 }
3982
3983 /* XXX vlan(4) style mcast addr maintenance */
3984 if (hn_xpnt_vf_isready(sc)) {
3985 int old_if_flags;
3986
3987 old_if_flags = sc->hn_vf_ifp->if_flags;
3988 hn_xpnt_vf_saveifflags(sc);
3989
3990 if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) &&
3991 ((old_if_flags ^ sc->hn_vf_ifp->if_flags) &
3992 IFF_ALLMULTI))
3993 error = hn_xpnt_vf_iocsetflags(sc);
3994 }
3995
3996 HN_UNLOCK(sc);
3997 break;
3998
3999 case SIOCSIFMEDIA:
4000 case SIOCGIFMEDIA:
4001 HN_LOCK(sc);
4002 if (hn_xpnt_vf_isready(sc)) {
4003 /*
4004 * SIOCGIFMEDIA expects ifmediareq, so don't
4005 * create and pass ifr_vf to the VF here; just
4006 * replace the ifr_name.
4007 */
4008 vf_ifp = sc->hn_vf_ifp;
4009 strlcpy(ifr->ifr_name, vf_ifp->if_xname,
4010 sizeof(ifr->ifr_name));
4011 error = vf_ifp->if_ioctl(vf_ifp, cmd, data);
4012 /* Restore the ifr_name. */
4013 strlcpy(ifr->ifr_name, ifp->if_xname,
4014 sizeof(ifr->ifr_name));
4015 HN_UNLOCK(sc);
4016 break;
4017 }
4018 HN_UNLOCK(sc);
4019 error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
4020 break;
4021
4022 case SIOCGIFRSSHASH:
4023 ifrh = (struct ifrsshash *)data;
4024 HN_LOCK(sc);
4025 if (sc->hn_rx_ring_inuse == 1) {
4026 HN_UNLOCK(sc);
4027 ifrh->ifrh_func = RSS_FUNC_NONE;
4028 ifrh->ifrh_types = 0;
4029 break;
4030 }
4031
4032 if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
4033 ifrh->ifrh_func = RSS_FUNC_TOEPLITZ;
4034 else
4035 ifrh->ifrh_func = RSS_FUNC_PRIVATE;
4036 ifrh->ifrh_types = hn_rss_type_fromndis(sc->hn_rss_hash);
4037 HN_UNLOCK(sc);
4038 break;
4039
4040 case SIOCGIFRSSKEY:
4041 ifrk = (struct ifrsskey *)data;
4042 HN_LOCK(sc);
4043 if (sc->hn_rx_ring_inuse == 1) {
4044 HN_UNLOCK(sc);
4045 ifrk->ifrk_func = RSS_FUNC_NONE;
4046 ifrk->ifrk_keylen = 0;
4047 break;
4048 }
4049 if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
4050 ifrk->ifrk_func = RSS_FUNC_TOEPLITZ;
4051 else
4052 ifrk->ifrk_func = RSS_FUNC_PRIVATE;
4053 ifrk->ifrk_keylen = NDIS_HASH_KEYSIZE_TOEPLITZ;
4054 memcpy(ifrk->ifrk_key, sc->hn_rss.rss_key,
4055 NDIS_HASH_KEYSIZE_TOEPLITZ);
4056 HN_UNLOCK(sc);
4057 break;
4058
4059 default:
4060 error = ether_ioctl(ifp, cmd, data);
4061 break;
4062 }
4063 return (error);
4064 }
4065
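/*
 * Bring the interface down: clear RUNNING, disable polling, disable the
 * transparent mode VF (switching the datapath back to synthetic) if it
 * is enabled, then suspend data transfers.
 */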
4066 static void
4067 hn_stop(struct hn_softc *sc, bool detaching)
4068 {
4069 struct ifnet *ifp = sc->hn_ifp;
4070 int i;
4071
4072 HN_LOCK_ASSERT(sc);
4073
4074 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
4075 ("synthetic parts were not attached"));
4076
4077 /* Clear RUNNING bit ASAP. */
4078 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
4079
4080 /* Disable polling. */
4081 hn_polling(sc, 0);
4082
4083 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
4084 KASSERT(sc->hn_vf_ifp != NULL,
4085 ("%s: VF is not attached", ifp->if_xname));
4086
4087 /* Mark transparent mode VF as disabled. */
4088 hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */);
4089
4090 /*
4091 * NOTE:
4092 * Datapath setting must happen _before_ bringing
4093 * the VF down.
4094 */
4095 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
4096
4097 /*
4098 * Bring the VF down.
4099 */
4100 hn_xpnt_vf_saveifflags(sc);
4101 sc->hn_vf_ifp->if_flags &= ~IFF_UP;
4102 hn_xpnt_vf_iocsetflags(sc);
4103 }
4104
4105 /* Suspend data transfers. */
4106 hn_suspend_data(sc);
4107
4108 /* Clear OACTIVE bit. */
4109 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4110 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4111 sc->hn_tx_ring[i].hn_oactive = 0;
4112
4113 /*
4114 * If the non-transparent mode VF is active, make sure
4115 * that the RX filter still allows packet reception.
4116 */
4117 if (!detaching && (sc->hn_flags & HN_FLAG_RXVF))
4118 hn_rxfilter_config(sc);
4119 }
4120
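/*
 * Bring the interface up: program the RX filter, clear OACTIVE and the
 * TX "suspended" state, initialize the transparent mode VF if it is
 * ready, then set RUNNING and re-enable polling if it was requested.
 */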
4121 static void
4122 hn_init_locked(struct hn_softc *sc)
4123 {
4124 struct ifnet *ifp = sc->hn_ifp;
4125 int i;
4126
4127 HN_LOCK_ASSERT(sc);
4128
4129 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
4130 return;
4131
4132 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
4133 return;
4134
4135 /* Configure RX filter */
4136 hn_rxfilter_config(sc);
4137
4138 /* Clear OACTIVE bit. */
4139 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4140 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4141 sc->hn_tx_ring[i].hn_oactive = 0;
4142
4143 /* Clear TX 'suspended' bit. */
4144 hn_resume_tx(sc, sc->hn_tx_ring_inuse);
4145
4146 if (hn_xpnt_vf_isready(sc)) {
4147 /* Initialize transparent VF. */
4148 hn_xpnt_vf_init(sc);
4149 }
4150
4151 /* Everything is ready; unleash! */
4152 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
4153
4154 /* Re-enable polling if requested. */
4155 if (sc->hn_pollhz > 0)
4156 hn_polling(sc, sc->hn_pollhz);
4157 }
4158
4159 static void
4160 hn_init(void *xsc)
4161 {
4162 struct hn_softc *sc = xsc;
4163
4164 HN_LOCK(sc);
4165 hn_init_locked(sc);
4166 HN_UNLOCK(sc);
4167 }
4168
4169 #if __FreeBSD_version >= 1100099
4170
4171 static int
4172 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
4173 {
4174 struct hn_softc *sc = arg1;
4175 unsigned int lenlim;
4176 int error;
4177
4178 lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
4179 error = sysctl_handle_int(oidp, &lenlim, 0, req);
4180 if (error || req->newptr == NULL)
4181 return error;
4182
4183 HN_LOCK(sc);
4184 if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
4185 lenlim > TCP_LRO_LENGTH_MAX) {
4186 HN_UNLOCK(sc);
4187 return EINVAL;
4188 }
4189 hn_set_lro_lenlim(sc, lenlim);
4190 HN_UNLOCK(sc);
4191
4192 return 0;
4193 }
4194
4195 static int
4196 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
4197 {
4198 struct hn_softc *sc = arg1;
4199 int ackcnt, error, i;
4200
4201 /*
4202 * lro_ackcnt_lim is the append count limit;
4203 * +1 turns it into the aggregation limit.
4204 */
4205 ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
4206 error = sysctl_handle_int(oidp, &ackcnt, 0, req);
4207 if (error || req->newptr == NULL)
4208 return error;
4209
4210 if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
4211 return EINVAL;
4212
4213 /*
4214 * Convert aggregation limit back to append
4215 * count limit.
4216 */
4217 --ackcnt;
4218 HN_LOCK(sc);
4219 for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
4220 sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
4221 HN_UNLOCK(sc);
4222 return 0;
4223 }
4224
4225 #endif
4226
4227 static int
4228 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
4229 {
4230 struct hn_softc *sc = arg1;
4231 int hcsum = arg2;
4232 int on, error, i;
4233
4234 on = 0;
4235 if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
4236 on = 1;
4237
4238 error = sysctl_handle_int(oidp, &on, 0, req);
4239 if (error || req->newptr == NULL)
4240 return error;
4241
4242 HN_LOCK(sc);
4243 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4244 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4245
4246 if (on)
4247 rxr->hn_trust_hcsum |= hcsum;
4248 else
4249 rxr->hn_trust_hcsum &= ~hcsum;
4250 }
4251 HN_UNLOCK(sc);
4252 return 0;
4253 }
4254
4255 static int
4256 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
4257 {
4258 struct hn_softc *sc = arg1;
4259 int chim_size, error;
4260
4261 chim_size = sc->hn_tx_ring[0].hn_chim_size;
4262 error = sysctl_handle_int(oidp, &chim_size, 0, req);
4263 if (error || req->newptr == NULL)
4264 return error;
4265
4266 if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
4267 return EINVAL;
4268
4269 HN_LOCK(sc);
4270 hn_set_chim_size(sc, chim_size);
4271 HN_UNLOCK(sc);
4272 return 0;
4273 }
4274
4275 #if __FreeBSD_version < 1100095
4276 static int
4277 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
4278 {
4279 struct hn_softc *sc = arg1;
4280 int ofs = arg2, i, error;
4281 struct hn_rx_ring *rxr;
4282 uint64_t stat;
4283
4284 stat = 0;
4285 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4286 rxr = &sc->hn_rx_ring[i];
4287 stat += *((int *)((uint8_t *)rxr + ofs));
4288 }
4289
4290 error = sysctl_handle_64(oidp, &stat, 0, req);
4291 if (error || req->newptr == NULL)
4292 return error;
4293
4294 /* Zero out this stat. */
4295 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4296 rxr = &sc->hn_rx_ring[i];
4297 *((int *)((uint8_t *)rxr + ofs)) = 0;
4298 }
4299 return 0;
4300 }
4301 #else
4302 static int
4303 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
4304 {
4305 struct hn_softc *sc = arg1;
4306 int ofs = arg2, i, error;
4307 struct hn_rx_ring *rxr;
4308 uint64_t stat;
4309
4310 stat = 0;
4311 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4312 rxr = &sc->hn_rx_ring[i];
4313 stat += *((uint64_t *)((uint8_t *)rxr + ofs));
4314 }
4315
4316 error = sysctl_handle_64(oidp, &stat, 0, req);
4317 if (error || req->newptr == NULL)
4318 return error;
4319
4320 /* Zero out this stat. */
4321 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4322 rxr = &sc->hn_rx_ring[i];
4323 *((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
4324 }
4325 return 0;
4326 }
4327
4328 #endif
4329
4330 static int
4331 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4332 {
4333 struct hn_softc *sc = arg1;
4334 int ofs = arg2, i, error;
4335 struct hn_rx_ring *rxr;
4336 u_long stat;
4337
4338 stat = 0;
4339 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4340 rxr = &sc->hn_rx_ring[i];
4341 stat += *((u_long *)((uint8_t *)rxr + ofs));
4342 }
4343
4344 error = sysctl_handle_long(oidp, &stat, 0, req);
4345 if (error || req->newptr == NULL)
4346 return error;
4347
4348 /* Zero out this stat. */
4349 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4350 rxr = &sc->hn_rx_ring[i];
4351 *((u_long *)((uint8_t *)rxr + ofs)) = 0;
4352 }
4353 return 0;
4354 }
4355
4356 static int
4357 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4358 {
4359 struct hn_softc *sc = arg1;
4360 int ofs = arg2, i, error;
4361 struct hn_tx_ring *txr;
4362 u_long stat;
4363
4364 stat = 0;
4365 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4366 txr = &sc->hn_tx_ring[i];
4367 stat += *((u_long *)((uint8_t *)txr + ofs));
4368 }
4369
4370 error = sysctl_handle_long(oidp, &stat, 0, req);
4371 if (error || req->newptr == NULL)
4372 return error;
4373
4374 /* Zero out this stat. */
4375 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4376 txr = &sc->hn_tx_ring[i];
4377 *((u_long *)((uint8_t *)txr + ofs)) = 0;
4378 }
4379 return 0;
4380 }
4381
4382 static int
4383 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
4384 {
4385 struct hn_softc *sc = arg1;
4386 int ofs = arg2, i, error, conf;
4387 struct hn_tx_ring *txr;
4388
4389 txr = &sc->hn_tx_ring[0];
4390 conf = *((int *)((uint8_t *)txr + ofs));
4391
4392 error = sysctl_handle_int(oidp, &conf, 0, req);
4393 if (error || req->newptr == NULL)
4394 return error;
4395
4396 HN_LOCK(sc);
4397 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4398 txr = &sc->hn_tx_ring[i];
4399 *((int *)((uint8_t *)txr + ofs)) = conf;
4400 }
4401 HN_UNLOCK(sc);
4402
4403 return 0;
4404 }
4405
4406 static int
4407 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
4408 {
4409 struct hn_softc *sc = arg1;
4410 int error, size;
4411
4412 size = sc->hn_agg_size;
4413 error = sysctl_handle_int(oidp, &size, 0, req);
4414 if (error || req->newptr == NULL)
4415 return (error);
4416
4417 HN_LOCK(sc);
4418 sc->hn_agg_size = size;
4419 hn_set_txagg(sc);
4420 HN_UNLOCK(sc);
4421
4422 return (0);
4423 }
4424
4425 static int
4426 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
4427 {
4428 struct hn_softc *sc = arg1;
4429 int error, pkts;
4430
4431 pkts = sc->hn_agg_pkts;
4432 error = sysctl_handle_int(oidp, &pkts, 0, req);
4433 if (error || req->newptr == NULL)
4434 return (error);
4435
4436 HN_LOCK(sc);
4437 sc->hn_agg_pkts = pkts;
4438 hn_set_txagg(sc);
4439 HN_UNLOCK(sc);
4440
4441 return (0);
4442 }
4443
4444 static int
4445 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
4446 {
4447 struct hn_softc *sc = arg1;
4448 int pkts;
4449
4450 pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
4451 return (sysctl_handle_int(oidp, &pkts, 0, req));
4452 }
4453
4454 static int
4455 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
4456 {
4457 struct hn_softc *sc = arg1;
4458 int align;
4459
4460 align = sc->hn_tx_ring[0].hn_agg_align;
4461 return (sysctl_handle_int(oidp, &align, 0, req));
4462 }
4463
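/*
 * Switch one VMBus channel between interrupt mode (pollhz == 0) and
 * polling mode at the given frequency.
 */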
4464 static void
4465 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz)
4466 {
4467 if (pollhz == 0)
4468 vmbus_chan_poll_disable(chan);
4469 else
4470 vmbus_chan_poll_enable(chan, pollhz);
4471 }
4472
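/*
 * Apply the polling setting to the primary channel and all in-use
 * sub-channels.
 */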
4473 static void
4474 hn_polling(struct hn_softc *sc, u_int pollhz)
4475 {
4476 int nsubch = sc->hn_rx_ring_inuse - 1;
4477
4478 HN_LOCK_ASSERT(sc);
4479
4480 if (nsubch > 0) {
4481 struct vmbus_channel **subch;
4482 int i;
4483
4484 subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
4485 for (i = 0; i < nsubch; ++i)
4486 hn_chan_polling(subch[i], pollhz);
4487 vmbus_subchan_rel(subch, nsubch);
4488 }
4489 hn_chan_polling(sc->hn_prichan, pollhz);
4490 }
4491
4492 static int
4493 hn_polling_sysctl(SYSCTL_HANDLER_ARGS)
4494 {
4495 struct hn_softc *sc = arg1;
4496 int pollhz, error;
4497
4498 pollhz = sc->hn_pollhz;
4499 error = sysctl_handle_int(oidp, &pollhz, 0, req);
4500 if (error || req->newptr == NULL)
4501 return (error);
4502
4503 if (pollhz != 0 &&
4504 (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX))
4505 return (EINVAL);
4506
4507 HN_LOCK(sc);
4508 if (sc->hn_pollhz != pollhz) {
4509 sc->hn_pollhz = pollhz;
4510 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) &&
4511 (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
4512 hn_polling(sc, sc->hn_pollhz);
4513 }
4514 HN_UNLOCK(sc);
4515
4516 return (0);
4517 }
4518
4519 static int
4520 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
4521 {
4522 struct hn_softc *sc = arg1;
4523 char verstr[16];
4524
4525 snprintf(verstr, sizeof(verstr), "%u.%u",
4526 HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
4527 HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
4528 return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
4529 }
4530
4531 static int
4532 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
4533 {
4534 struct hn_softc *sc = arg1;
4535 char caps_str[128];
4536 uint32_t caps;
4537
4538 HN_LOCK(sc);
4539 caps = sc->hn_caps;
4540 HN_UNLOCK(sc);
4541 snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
4542 return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
4543 }
4544
4545 static int
4546 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
4547 {
4548 struct hn_softc *sc = arg1;
4549 char assist_str[128];
4550 uint32_t hwassist;
4551
4552 HN_LOCK(sc);
4553 hwassist = sc->hn_ifp->if_hwassist;
4554 HN_UNLOCK(sc);
4555 snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
4556 return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
4557 }
4558
4559 static int
4560 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
4561 {
4562 struct hn_softc *sc = arg1;
4563 char filter_str[128];
4564 uint32_t filter;
4565
4566 HN_LOCK(sc);
4567 filter = sc->hn_rx_filter;
4568 HN_UNLOCK(sc);
4569 snprintf(filter_str, sizeof(filter_str), "%b", filter,
4570 NDIS_PACKET_TYPES);
4571 return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
4572 }
4573
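/*
 * Sysctl handler for the RSC control knob; updating it triggers an RNDIS
 * offload reconfiguration using the current RNDIS MTU.
 */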
4574 static int
4575 hn_rsc_sysctl(SYSCTL_HANDLER_ARGS)
4576 {
4577 struct hn_softc *sc = arg1;
4578 uint32_t mtu;
4579 int error;
4580 HN_LOCK(sc);
4581 error = hn_rndis_get_mtu(sc, &mtu);
4582 if (error) {
4583 if_printf(sc->hn_ifp, "failed to get mtu\n");
4584 goto back;
4585 }
4586 error = SYSCTL_OUT(req, &(sc->hn_rsc_ctrl), sizeof(sc->hn_rsc_ctrl));
4587 if (error || req->newptr == NULL)
4588 goto back;
4589
4590 error = SYSCTL_IN(req, &(sc->hn_rsc_ctrl), sizeof(sc->hn_rsc_ctrl));
4591 if (error)
4592 goto back;
4593 error = hn_rndis_reconf_offload(sc, mtu);
4594 back:
4595 HN_UNLOCK(sc);
4596 return (error);
4597 }
4598 #ifndef RSS
4599
4600 static int
4601 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
4602 {
4603 struct hn_softc *sc = arg1;
4604 int error;
4605
4606 HN_LOCK(sc);
4607
4608 error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4609 if (error || req->newptr == NULL)
4610 goto back;
4611
4612 if ((sc->hn_flags & HN_FLAG_RXVF) ||
4613 (hn_xpnt_vf && sc->hn_vf_ifp != NULL)) {
4614 /*
4615 * The RSS key is synchronized w/ the VF's; don't
4616 * allow users to change it.
4617 */
4618 error = EBUSY;
4619 goto back;
4620 }
4621
4622 error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4623 if (error)
4624 goto back;
4625 sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
4626
4627 if (sc->hn_rx_ring_inuse > 1) {
4628 error = hn_rss_reconfig(sc);
4629 } else {
4630 /* Not RSS capable, at least for now; just save the RSS key. */
4631 error = 0;
4632 }
4633 back:
4634 HN_UNLOCK(sc);
4635 return (error);
4636 }
4637
4638 static int
4639 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
4640 {
4641 struct hn_softc *sc = arg1;
4642 int error;
4643
4644 HN_LOCK(sc);
4645
4646 error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4647 if (error || req->newptr == NULL)
4648 goto back;
4649
4650 /*
4651 * Don't allow the RSS indirect table to be changed if this
4652 * interface is not currently RSS capable.
4653 */
4654 if (sc->hn_rx_ring_inuse == 1) {
4655 error = EOPNOTSUPP;
4656 goto back;
4657 }
4658
4659 error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4660 if (error)
4661 goto back;
4662 sc->hn_flags |= HN_FLAG_HAS_RSSIND;
4663
4664 hn_rss_ind_fixup(sc);
4665 error = hn_rss_reconfig(sc);
4666 back:
4667 HN_UNLOCK(sc);
4668 return (error);
4669 }
4670
4671 #endif /* !RSS */
4672
4673 static int
4674 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
4675 {
4676 struct hn_softc *sc = arg1;
4677 char hash_str[128];
4678 uint32_t hash;
4679
4680 HN_LOCK(sc);
4681 hash = sc->hn_rss_hash;
4682 HN_UNLOCK(sc);
4683 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4684 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4685 }
4686
4687 static int
4688 hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS)
4689 {
4690 struct hn_softc *sc = arg1;
4691 char hash_str[128];
4692 uint32_t hash;
4693
4694 HN_LOCK(sc);
4695 hash = sc->hn_rss_hcap;
4696 HN_UNLOCK(sc);
4697 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4698 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4699 }
4700
4701 static int
4702 hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS)
4703 {
4704 struct hn_softc *sc = arg1;
4705 char hash_str[128];
4706 uint32_t hash;
4707
4708 HN_LOCK(sc);
4709 hash = sc->hn_rx_ring[0].hn_mbuf_hash;
4710 HN_UNLOCK(sc);
4711 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4712 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4713 }
4714
4715 static int
4716 hn_vf_sysctl(SYSCTL_HANDLER_ARGS)
4717 {
4718 struct hn_softc *sc = arg1;
4719 char vf_name[IFNAMSIZ + 1];
4720 struct ifnet *vf_ifp;
4721
4722 HN_LOCK(sc);
4723 vf_name[0] = '\0';
4724 vf_ifp = sc->hn_vf_ifp;
4725 if (vf_ifp != NULL)
4726 snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
4727 HN_UNLOCK(sc);
4728 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4729 }
4730
4731 static int
4732 hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS)
4733 {
4734 struct hn_softc *sc = arg1;
4735 char vf_name[IFNAMSIZ + 1];
4736 struct ifnet *vf_ifp;
4737
4738 HN_LOCK(sc);
4739 vf_name[0] = '\0';
4740 vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp;
4741 if (vf_ifp != NULL)
4742 snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
4743 HN_UNLOCK(sc);
4744 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4745 }
4746
4747 static int
4748 hn_vflist_sysctl(SYSCTL_HANDLER_ARGS)
4749 {
4750 struct rm_priotracker pt;
4751 struct sbuf *sb;
4752 int error, i;
4753 bool first;
4754
4755 error = sysctl_wire_old_buffer(req, 0);
4756 if (error != 0)
4757 return (error);
4758
4759 sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4760 if (sb == NULL)
4761 return (ENOMEM);
4762
4763 rm_rlock(&hn_vfmap_lock, &pt);
4764
4765 first = true;
4766 for (i = 0; i < hn_vfmap_size; ++i) {
4767 struct epoch_tracker et;
4768 struct ifnet *ifp;
4769
4770 if (hn_vfmap[i] == NULL)
4771 continue;
4772
4773 NET_EPOCH_ENTER(et);
4774 ifp = ifnet_byindex(i);
4775 if (ifp != NULL) {
4776 if (first)
4777 sbuf_printf(sb, "%s", ifp->if_xname);
4778 else
4779 sbuf_printf(sb, " %s", ifp->if_xname);
4780 first = false;
4781 }
4782 NET_EPOCH_EXIT(et);
4783 }
4784
4785 rm_runlock(&hn_vfmap_lock, &pt);
4786
4787 error = sbuf_finish(sb);
4788 sbuf_delete(sb);
4789 return (error);
4790 }
4791
4792 static int
4793 hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS)
4794 {
4795 struct rm_priotracker pt;
4796 struct sbuf *sb;
4797 int error, i;
4798 bool first;
4799
4800 error = sysctl_wire_old_buffer(req, 0);
4801 if (error != 0)
4802 return (error);
4803
4804 sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4805 if (sb == NULL)
4806 return (ENOMEM);
4807
4808 rm_rlock(&hn_vfmap_lock, &pt);
4809
4810 first = true;
4811 for (i = 0; i < hn_vfmap_size; ++i) {
4812 struct epoch_tracker et;
4813 struct ifnet *ifp, *hn_ifp;
4814
4815 hn_ifp = hn_vfmap[i];
4816 if (hn_ifp == NULL)
4817 continue;
4818
4819 NET_EPOCH_ENTER(et);
4820 ifp = ifnet_byindex(i);
4821 if (ifp != NULL) {
4822 if (first) {
4823 sbuf_printf(sb, "%s:%s", ifp->if_xname,
4824 hn_ifp->if_xname);
4825 } else {
4826 sbuf_printf(sb, " %s:%s", ifp->if_xname,
4827 hn_ifp->if_xname);
4828 }
4829 first = false;
4830 }
4831 NET_EPOCH_EXIT(et);
4832 }
4833
4834 rm_runlock(&hn_vfmap_lock, &pt);
4835
4836 error = sbuf_finish(sb);
4837 sbuf_delete(sb);
4838 return (error);
4839 }
4840
4841 static int
4842 hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS)
4843 {
4844 struct hn_softc *sc = arg1;
4845 int error, onoff = 0;
4846
4847 if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF)
4848 onoff = 1;
4849 error = sysctl_handle_int(oidp, &onoff, 0, req);
4850 if (error || req->newptr == NULL)
4851 return (error);
4852
4853 HN_LOCK(sc);
4854 /* NOTE: hn_vf_lock for hn_transmit() */
4855 rm_wlock(&sc->hn_vf_lock);
4856 if (onoff)
4857 sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
4858 else
4859 sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF;
4860 rm_wunlock(&sc->hn_vf_lock);
4861 HN_UNLOCK(sc);
4862
4863 return (0);
4864 }
4865
4866 static int
4867 hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS)
4868 {
4869 struct hn_softc *sc = arg1;
4870 int enabled = 0;
4871
4872 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
4873 enabled = 1;
4874 return (sysctl_handle_int(oidp, &enabled, 0, req));
4875 }
4876
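/*
 * Verify that the IPv4 header, and the TCP/UDP header if present, are
 * fully contained in the first mbuf and are self-consistent.  Returns
 * the IP protocol of the packet, or IPPROTO_DONE if the checks fail,
 * e.g. the packet is truncated or is an IP fragment.
 */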
4877 static int
4878 hn_check_iplen(const struct mbuf *m, int hoff)
4879 {
4880 const struct ip *ip;
4881 int len, iphlen, iplen;
4882 const struct tcphdr *th;
4883 int thoff; /* TCP data offset */
4884
4885 len = hoff + sizeof(struct ip);
4886
4887 /* The packet must be at least the size of an IP header. */
4888 if (m->m_pkthdr.len < len)
4889 return IPPROTO_DONE;
4890
4891 /* The fixed IP header must reside completely in the first mbuf. */
4892 if (m->m_len < len)
4893 return IPPROTO_DONE;
4894
4895 ip = mtodo(m, hoff);
4896
4897 /* Bound check the packet's stated IP header length. */
4898 iphlen = ip->ip_hl << 2;
4899 if (iphlen < sizeof(struct ip)) /* minimum header length */
4900 return IPPROTO_DONE;
4901
4902 /* The full IP header must reside completely in the one mbuf. */
4903 if (m->m_len < hoff + iphlen)
4904 return IPPROTO_DONE;
4905
4906 iplen = ntohs(ip->ip_len);
4907
4908 /*
4909 * Check that the amount of data in the buffers is at
4910 * least as much as the IP header would have us expect.
4911 */
4912 if (m->m_pkthdr.len < hoff + iplen)
4913 return IPPROTO_DONE;
4914
4915 /*
4916 * Ignore IP fragments.
4917 */
4918 if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
4919 return IPPROTO_DONE;
4920
4921 /*
4922 * The TCP/IP or UDP/IP header must be entirely contained within
4923 * the first fragment of a packet.
4924 */
4925 switch (ip->ip_p) {
4926 case IPPROTO_TCP:
4927 if (iplen < iphlen + sizeof(struct tcphdr))
4928 return IPPROTO_DONE;
4929 if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
4930 return IPPROTO_DONE;
4931 th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
4932 thoff = th->th_off << 2;
4933 if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
4934 return IPPROTO_DONE;
4935 if (m->m_len < hoff + iphlen + thoff)
4936 return IPPROTO_DONE;
4937 break;
4938 case IPPROTO_UDP:
4939 if (iplen < iphlen + sizeof(struct udphdr))
4940 return IPPROTO_DONE;
4941 if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
4942 return IPPROTO_DONE;
4943 break;
4944 default:
4945 if (iplen < iphlen)
4946 return IPPROTO_DONE;
4947 break;
4948 }
4949 return ip->ip_p;
4950 }
4951
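/*
 * Extract the L3 ethertype and L4 IP protocol of a received frame,
 * taking an optional VLAN header into account.  Non-IPv4 frames report
 * IPPROTO_DONE as the L4 protocol.
 */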
4952 static void
4953 hn_rxpkt_proto(const struct mbuf *m_new, int *l3proto, int *l4proto)
4954 {
4955 const struct ether_header *eh;
4956 uint16_t etype;
4957 int hoff;
4958
4959 hoff = sizeof(*eh);
4960 /* Checked at the beginning of the caller. */
4961 KASSERT(m_new->m_len >= hoff, ("not ethernet frame"));
4962
4963 eh = mtod(m_new, const struct ether_header *);
4964 etype = ntohs(eh->ether_type);
4965 if (etype == ETHERTYPE_VLAN) {
4966 const struct ether_vlan_header *evl;
4967
4968 hoff = sizeof(*evl);
4969 if (m_new->m_len < hoff)
4970 return;
4971 evl = mtod(m_new, const struct ether_vlan_header *);
4972 etype = ntohs(evl->evl_proto);
4973 }
4974 *l3proto = etype;
4975
4976 if (etype == ETHERTYPE_IP)
4977 *l4proto = hn_check_iplen(m_new, hoff);
4978 else
4979 *l4proto = IPPROTO_DONE;
4980 }
4981
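/*
 * Allocate the shared RXBUF, the per-ring bufrings, LRO state and packet
 * buffers, and create the dev.hn.UNIT.rx sysctl tree along with the RX
 * statistics/tuning sysctls.
 */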
4982 static int
4983 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
4984 {
4985 struct sysctl_oid_list *child;
4986 struct sysctl_ctx_list *ctx;
4987 device_t dev = sc->hn_dev;
4988 #if defined(INET) || defined(INET6)
4989 #if __FreeBSD_version >= 1100095
4990 int lroent_cnt;
4991 #endif
4992 #endif
4993 int i;
4994
4995 /*
4996 * Create RXBUF for reception.
4997 *
4998 * NOTE:
4999 * - It is shared by all channels.
5000 * - A large enough buffer is allocated; certain versions of the
5001 * NVS may further limit the usable space.
5002 */
5003 sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
5004 PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma,
5005 BUS_DMA_WAITOK | BUS_DMA_ZERO);
5006 if (sc->hn_rxbuf == NULL) {
5007 device_printf(sc->hn_dev, "allocate rxbuf failed\n");
5008 return (ENOMEM);
5009 }
5010
5011 sc->hn_rx_ring_cnt = ring_cnt;
5012 sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
5013
5014 sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
5015 M_DEVBUF, M_WAITOK | M_ZERO);
5016
5017 #if defined(INET) || defined(INET6)
5018 #if __FreeBSD_version >= 1100095
5019 lroent_cnt = hn_lro_entry_count;
5020 if (lroent_cnt < TCP_LRO_ENTRIES)
5021 lroent_cnt = TCP_LRO_ENTRIES;
5022 if (bootverbose)
5023 device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
5024 #endif
5025 #endif /* INET || INET6 */
5026
5027 ctx = device_get_sysctl_ctx(dev);
5028 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
5029
5030 /* Create dev.hn.UNIT.rx sysctl tree */
5031 sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
5032 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5033
5034 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
5035 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
5036
5037 rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
5038 PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE,
5039 &rxr->hn_br_dma, BUS_DMA_WAITOK);
5040 if (rxr->hn_br == NULL) {
5041 device_printf(dev, "allocate bufring failed\n");
5042 return (ENOMEM);
5043 }
5044
5045 if (hn_trust_hosttcp)
5046 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
5047 if (hn_trust_hostudp)
5048 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
5049 if (hn_trust_hostip)
5050 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
5051 rxr->hn_mbuf_hash = NDIS_HASH_ALL;
5052 rxr->hn_ifp = sc->hn_ifp;
5053 if (i < sc->hn_tx_ring_cnt)
5054 rxr->hn_txr = &sc->hn_tx_ring[i];
5055 rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
5056 rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
5057 rxr->hn_rx_idx = i;
5058 rxr->hn_rxbuf = sc->hn_rxbuf;
5059
5060 /*
5061 * Initialize LRO.
5062 */
5063 #if defined(INET) || defined(INET6)
5064 #if __FreeBSD_version >= 1100095
5065 tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
5066 hn_lro_mbufq_depth);
5067 #else
5068 tcp_lro_init(&rxr->hn_lro);
5069 rxr->hn_lro.ifp = sc->hn_ifp;
5070 #endif
5071 #if __FreeBSD_version >= 1100099
5072 rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
5073 rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
5074 #endif
5075 #endif /* INET || INET6 */
5076
5077 if (sc->hn_rx_sysctl_tree != NULL) {
5078 char name[16];
5079
5080 /*
5081 * Create per RX ring sysctl tree:
5082 * dev.hn.UNIT.rx.RINGID
5083 */
5084 snprintf(name, sizeof(name), "%d", i);
5085 rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
5086 SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
5087 OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5088
5089 if (rxr->hn_rx_sysctl_tree != NULL) {
5090 SYSCTL_ADD_ULONG(ctx,
5091 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5092 OID_AUTO, "packets",
5093 CTLFLAG_RW | CTLFLAG_STATS, &rxr->hn_pkts,
5094 "# of packets received");
5095 SYSCTL_ADD_ULONG(ctx,
5096 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5097 OID_AUTO, "rss_pkts",
5098 CTLFLAG_RW | CTLFLAG_STATS,
5099 &rxr->hn_rss_pkts,
5100 "# of packets w/ RSS info received");
5101 SYSCTL_ADD_ULONG(ctx,
5102 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5103 OID_AUTO, "rsc_pkts",
5104 CTLFLAG_RW | CTLFLAG_STATS,
5105 &rxr->hn_rsc_pkts,
5106 "# of RSC packets received");
5107 SYSCTL_ADD_ULONG(ctx,
5108 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5109 OID_AUTO, "rsc_drop",
5110 CTLFLAG_RW | CTLFLAG_STATS,
5111 &rxr->hn_rsc_drop,
5112 "# of RSC fragments dropped");
5113 SYSCTL_ADD_INT(ctx,
5114 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5115 OID_AUTO, "pktbuf_len", CTLFLAG_RD,
5116 &rxr->hn_pktbuf_len, 0,
5117 "Temporary channel packet buffer length");
5118 }
5119 }
5120 }
5121
5122 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
5123 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5124 __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
5125 #if __FreeBSD_version < 1100095
5126 hn_rx_stat_int_sysctl,
5127 #else
5128 hn_rx_stat_u64_sysctl,
5129 #endif
5130 "LU", "LRO queued");
5131 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
5132 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5133 __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
5134 #if __FreeBSD_version < 1100095
5135 hn_rx_stat_int_sysctl,
5136 #else
5137 hn_rx_stat_u64_sysctl,
5138 #endif
5139 "LU", "LRO flushed");
5140 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
5141 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5142 __offsetof(struct hn_rx_ring, hn_lro_tried),
5143 hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
5144 #if __FreeBSD_version >= 1100099
5145 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
5146 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5147 hn_lro_lenlim_sysctl, "IU",
5148 "Max # of data bytes to be aggregated by LRO");
5149 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
5150 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5151 hn_lro_ackcnt_sysctl, "I",
5152 "Max # of ACKs to be aggregated by LRO");
5153 #endif
5154 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
5155 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
5156 hn_trust_hcsum_sysctl, "I",
5157 "Trust tcp segment verification on host side, "
5158 "when csum info is missing");
5159 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
5160 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
5161 hn_trust_hcsum_sysctl, "I",
5162 "Trust udp datagram verification on host side, "
5163 "when csum info is missing");
5164 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
5165 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
5166 hn_trust_hcsum_sysctl, "I",
5167 "Trust ip packet verification on host side, "
5168 "when csum info is missing");
5169 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
5170 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5171 __offsetof(struct hn_rx_ring, hn_csum_ip),
5172 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
5173 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
5174 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5175 __offsetof(struct hn_rx_ring, hn_csum_tcp),
5176 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
5177 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
5178 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5179 __offsetof(struct hn_rx_ring, hn_csum_udp),
5180 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
5181 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
5182 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5183 __offsetof(struct hn_rx_ring, hn_csum_trusted),
5184 hn_rx_stat_ulong_sysctl, "LU",
5185 "# of packets that we trust host's csum verification");
5186 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
5187 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5188 __offsetof(struct hn_rx_ring, hn_small_pkts),
5189 hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
5190 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
5191 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5192 __offsetof(struct hn_rx_ring, hn_ack_failed),
5193 hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
5194 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
5195 CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
5196 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
5197 CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
5198
5199 return (0);
5200 }
5201
5202 static void
5203 hn_destroy_rx_data(struct hn_softc *sc)
5204 {
5205 int i;
5206
5207 if (sc->hn_rxbuf != NULL) {
5208 if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0)
5209 hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
5210 else
5211 device_printf(sc->hn_dev, "RXBUF is referenced\n");
5212 sc->hn_rxbuf = NULL;
5213 }
5214
5215 if (sc->hn_rx_ring_cnt == 0)
5216 return;
5217
5218 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
5219 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
5220
5221 if (rxr->hn_br == NULL)
5222 continue;
5223 if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) {
5224 hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
5225 } else {
5226 device_printf(sc->hn_dev,
5227 "%dth channel bufring is referenced", i);
5228 }
5229 rxr->hn_br = NULL;
5230
5231 #if defined(INET) || defined(INET6)
5232 tcp_lro_free(&rxr->hn_lro);
5233 #endif
5234 free(rxr->hn_pktbuf, M_DEVBUF);
5235 }
5236 free(sc->hn_rx_ring, M_DEVBUF);
5237 sc->hn_rx_ring = NULL;
5238
5239 sc->hn_rx_ring_cnt = 0;
5240 sc->hn_rx_ring_inuse = 0;
5241 }
5242
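/*
 * Set up one TX ring: its locks, TX descriptor pool, RNDIS packet
 * message DMA resources, taskqueue binding and per-ring sysctl nodes.
 */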
5243 static int
5244 hn_tx_ring_create(struct hn_softc *sc, int id)
5245 {
5246 struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
5247 device_t dev = sc->hn_dev;
5248 bus_dma_tag_t parent_dtag;
5249 int error, i;
5250
5251 txr->hn_sc = sc;
5252 txr->hn_tx_idx = id;
5253
5254 #ifndef HN_USE_TXDESC_BUFRING
5255 mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
5256 #endif
5257 mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
5258
5259 txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
5260 txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
5261 M_DEVBUF, M_WAITOK | M_ZERO);
5262 #ifndef HN_USE_TXDESC_BUFRING
5263 SLIST_INIT(&txr->hn_txlist);
5264 #else
5265 txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
5266 M_WAITOK, &txr->hn_tx_lock);
5267 #endif
5268
5269 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) {
5270 txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ(
5271 device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id));
5272 } else {
5273 txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt];
5274 }
5275
5276 #ifdef HN_IFSTART_SUPPORT
5277 if (hn_use_if_start) {
5278 txr->hn_txeof = hn_start_txeof;
5279 TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
5280 TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
5281 } else
5282 #endif
5283 {
5284 int br_depth;
5285
5286 txr->hn_txeof = hn_xmit_txeof;
5287 TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
5288 TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
5289
5290 br_depth = hn_get_txswq_depth(txr);
5291 txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
5292 M_WAITOK, &txr->hn_tx_lock);
5293 }
5294
5295 txr->hn_direct_tx_size = hn_direct_tx_size;
5296
5297 /*
5298 * Always schedule transmission instead of trying to do direct
5299 * transmission. This gives the best performance so far.
5300 */
5301 txr->hn_sched_tx = 1;
5302
5303 parent_dtag = bus_get_dma_tag(dev);
5304
5305 /* DMA tag for RNDIS packet messages. */
5306 error = bus_dma_tag_create(parent_dtag, /* parent */
5307 HN_RNDIS_PKT_ALIGN, /* alignment */
5308 HN_RNDIS_PKT_BOUNDARY, /* boundary */
5309 BUS_SPACE_MAXADDR, /* lowaddr */
5310 BUS_SPACE_MAXADDR, /* highaddr */
5311 NULL, NULL, /* filter, filterarg */
5312 HN_RNDIS_PKT_LEN, /* maxsize */
5313 1, /* nsegments */
5314 HN_RNDIS_PKT_LEN, /* maxsegsize */
5315 0, /* flags */
5316 NULL, /* lockfunc */
5317 NULL, /* lockfuncarg */
5318 &txr->hn_tx_rndis_dtag);
5319 if (error) {
5320 device_printf(dev, "failed to create rndis dmatag\n");
5321 return error;
5322 }
5323
5324 /* DMA tag for data. */
5325 error = bus_dma_tag_create(parent_dtag, /* parent */
5326 1, /* alignment */
5327 HN_TX_DATA_BOUNDARY, /* boundary */
5328 BUS_SPACE_MAXADDR, /* lowaddr */
5329 BUS_SPACE_MAXADDR, /* highaddr */
5330 NULL, NULL, /* filter, filterarg */
5331 HN_TX_DATA_MAXSIZE, /* maxsize */
5332 HN_TX_DATA_SEGCNT_MAX, /* nsegments */
5333 HN_TX_DATA_SEGSIZE, /* maxsegsize */
5334 0, /* flags */
5335 NULL, /* lockfunc */
5336 NULL, /* lockfuncarg */
5337 &txr->hn_tx_data_dtag);
5338 if (error) {
5339 device_printf(dev, "failed to create data dmatag\n");
5340 return error;
5341 }
5342
5343 for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
5344 struct hn_txdesc *txd = &txr->hn_txdesc[i];
5345
5346 txd->txr = txr;
5347 txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
5348 STAILQ_INIT(&txd->agg_list);
5349
5350 /*
5351 * Allocate and load RNDIS packet message.
5352 */
5353 error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
5354 (void **)&txd->rndis_pkt,
5355 BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
5356 &txd->rndis_pkt_dmap);
5357 if (error) {
5358 device_printf(dev,
5359 "failed to allocate rndis_packet_msg, %d\n", i);
5360 return error;
5361 }
5362
5363 error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
5364 txd->rndis_pkt_dmap,
5365 txd->rndis_pkt, HN_RNDIS_PKT_LEN,
5366 hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
5367 BUS_DMA_NOWAIT);
5368 if (error) {
5369 device_printf(dev,
5370 "failed to load rndis_packet_msg, %d\n", i);
5371 bus_dmamem_free(txr->hn_tx_rndis_dtag,
5372 txd->rndis_pkt, txd->rndis_pkt_dmap);
5373 return error;
5374 }
5375
5376 /* DMA map for TX data. */
5377 error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
5378 &txd->data_dmap);
5379 if (error) {
5380 device_printf(dev,
5381 "failed to allocate tx data dmamap\n");
5382 bus_dmamap_unload(txr->hn_tx_rndis_dtag,
5383 txd->rndis_pkt_dmap);
5384 bus_dmamem_free(txr->hn_tx_rndis_dtag,
5385 txd->rndis_pkt, txd->rndis_pkt_dmap);
5386 return error;
5387 }
5388
5389 /* All set; put it on the list. */
5390 txd->flags |= HN_TXD_FLAG_ONLIST;
5391 #ifndef HN_USE_TXDESC_BUFRING
5392 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
5393 #else
5394 buf_ring_enqueue(txr->hn_txdesc_br, txd);
5395 #endif
5396 }
5397 txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
5398
5399 if (sc->hn_tx_sysctl_tree != NULL) {
5400 struct sysctl_oid_list *child;
5401 struct sysctl_ctx_list *ctx;
5402 char name[16];
5403
5404 /*
5405 * Create per TX ring sysctl tree:
5406 * dev.hn.UNIT.tx.RINGID
5407 */
5408 ctx = device_get_sysctl_ctx(dev);
5409 child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
5410
5411 snprintf(name, sizeof(name), "%d", id);
5412 txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
5413 name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5414
5415 if (txr->hn_tx_sysctl_tree != NULL) {
5416 child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
5417
5418 #ifdef HN_DEBUG
5419 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
5420 CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
5421 "# of available TX descs");
5422 #endif
5423 #ifdef HN_IFSTART_SUPPORT
5424 if (!hn_use_if_start)
5425 #endif
5426 {
5427 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
5428 CTLFLAG_RD, &txr->hn_oactive, 0,
5429 "over active");
5430 }
5431 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
5432 CTLFLAG_RW | CTLFLAG_STATS, &txr->hn_pkts,
5433 "# of packets transmitted");
5434 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
5435 CTLFLAG_RW | CTLFLAG_STATS, &txr->hn_sends,
5436 "# of sends");
5437 }
5438 }
5439
5440 return 0;
5441 }
5442
5443 static void
5444 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
5445 {
5446 struct hn_tx_ring *txr = txd->txr;
5447
5448 KASSERT(txd->m == NULL, ("still has mbuf installed"));
5449 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
5450
5451 bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
5452 bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
5453 txd->rndis_pkt_dmap);
5454 bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
5455 }
5456
5457 static void
5458 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd)
5459 {
5460
5461 KASSERT(txd->refs == 0 || txd->refs == 1,
5462 ("invalid txd refs %d", txd->refs));
5463
5464 /* Aggregated txds will be freed by their aggregating txd. */
5465 if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) {
5466 int freed __diagused;
5467
5468 freed = hn_txdesc_put(txr, txd);
5469 KASSERT(freed, ("can't free txdesc"));
5470 }
5471 }
5472
5473 static void
5474 hn_tx_ring_destroy(struct hn_tx_ring *txr)
5475 {
5476 int i;
5477
5478 if (txr->hn_txdesc == NULL)
5479 return;
5480
5481 /*
5482 * NOTE:
5483 * Because the freeing of aggregated txds will be deferred
5484 * to the aggregating txd, two passes are used here:
5485 * - The first pass GCes any pending txds. This GC is necessary,
5486 * since if the channels are revoked, hypervisor will not
5487 * deliver send-done for all pending txds.
5488 * - The second pass frees the busdma resources, i.e. after all
5489 * txds have been freed.
5490 */
5491 for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5492 hn_txdesc_gc(txr, &txr->hn_txdesc[i]);
5493 for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5494 hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]);
5495
5496 if (txr->hn_tx_data_dtag != NULL)
5497 bus_dma_tag_destroy(txr->hn_tx_data_dtag);
5498 if (txr->hn_tx_rndis_dtag != NULL)
5499 bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
5500
5501 #ifdef HN_USE_TXDESC_BUFRING
5502 buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
5503 #endif
5504
5505 free(txr->hn_txdesc, M_DEVBUF);
5506 txr->hn_txdesc = NULL;
5507
5508 if (txr->hn_mbuf_br != NULL)
5509 buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
5510
5511 #ifndef HN_USE_TXDESC_BUFRING
5512 mtx_destroy(&txr->hn_txlist_spin);
5513 #endif
5514 mtx_destroy(&txr->hn_tx_lock);
5515 }
5516
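/*
 * Allocate the shared chimney sending buffer (TXBUF), create all TX
 * rings and attach the dev.hn.UNIT.tx sysctl tree along with the TX
 * statistics/tuning sysctls.
 */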
5517 static int
5518 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
5519 {
5520 struct sysctl_oid_list *child;
5521 struct sysctl_ctx_list *ctx;
5522 int i;
5523
5524 /*
5525 * Create TXBUF for chimney sending.
5526 *
5527 * NOTE: It is shared by all channels.
5528 */
5529 sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
5530 PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
5531 BUS_DMA_WAITOK | BUS_DMA_ZERO);
5532 if (sc->hn_chim == NULL) {
5533 device_printf(sc->hn_dev, "allocate txbuf failed\n");
5534 return (ENOMEM);
5535 }
5536
5537 sc->hn_tx_ring_cnt = ring_cnt;
5538 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
5539
5540 sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
5541 M_DEVBUF, M_WAITOK | M_ZERO);
5542
5543 ctx = device_get_sysctl_ctx(sc->hn_dev);
5544 child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
5545
5546 /* Create dev.hn.UNIT.tx sysctl tree */
5547 sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
5548 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5549
5550 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
5551 int error;
5552
5553 error = hn_tx_ring_create(sc, i);
5554 if (error)
5555 return error;
5556 }
5557
5558 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
5559 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5560 __offsetof(struct hn_tx_ring, hn_no_txdescs),
5561 hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
5562 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
5563 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5564 __offsetof(struct hn_tx_ring, hn_send_failed),
5565 hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure");
5566 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
5567 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5568 __offsetof(struct hn_tx_ring, hn_txdma_failed),
5569 hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure");
5570 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
5571 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5572 __offsetof(struct hn_tx_ring, hn_flush_failed),
5573 hn_tx_stat_ulong_sysctl, "LU",
5574 "# of packet transmission aggregation flush failure");
5575 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
5576 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5577 __offsetof(struct hn_tx_ring, hn_tx_collapsed),
5578 hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed");
5579 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
5580 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5581 __offsetof(struct hn_tx_ring, hn_tx_chimney),
5582 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
5583 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
5584 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5585 __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
5586 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
5587 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
5588 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
5589 "# of total TX descs");
5590 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
5591 CTLFLAG_RD, &sc->hn_chim_szmax, 0,
5592 "Chimney send packet size upper boundary");
5593 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
5594 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5595 hn_chim_size_sysctl, "I", "Chimney send packet size limit");
5596 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
5597 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5598 __offsetof(struct hn_tx_ring, hn_direct_tx_size),
5599 hn_tx_conf_int_sysctl, "I",
5600 "Size of the packet for direct transmission");
5601 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
5602 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5603 __offsetof(struct hn_tx_ring, hn_sched_tx),
5604 hn_tx_conf_int_sysctl, "I",
5605 "Always schedule transmission "
5606 "instead of doing direct transmission");
5607 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
5608 CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
5609 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
5610 CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
5611 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
5612 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
5613 "Applied packet transmission aggregation size");
5614 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
5615 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5616 hn_txagg_pktmax_sysctl, "I",
5617 "Applied packet transmission aggregation packets");
5618 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
5619 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5620 hn_txagg_align_sysctl, "I",
5621 "Applied packet transmission aggregation alignment");
5622
5623 return 0;
5624 }
5625
5626 static void
5627 hn_set_chim_size(struct hn_softc *sc, int chim_size)
5628 {
5629 int i;
5630
5631 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5632 sc->hn_tx_ring[i].hn_chim_size = chim_size;
5633 }
5634
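/*
 * Clamp the advertised maximum TSO size into the range allowed by NDIS
 * and, when the transparent mode VF is ready, by the VF as well.
 */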
5635 static void
5636 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
5637 {
5638 struct ifnet *ifp = sc->hn_ifp;
5639 u_int hw_tsomax;
5640 int tso_minlen;
5641
5642 HN_LOCK_ASSERT(sc);
5643
5644 if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
5645 return;
5646
5647 KASSERT(sc->hn_ndis_tso_sgmin >= 2,
5648 ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
5649 tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
5650
5651 KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
5652 sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
5653 ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
5654
5655 if (tso_maxlen < tso_minlen)
5656 tso_maxlen = tso_minlen;
5657 else if (tso_maxlen > IP_MAXPACKET)
5658 tso_maxlen = IP_MAXPACKET;
5659 if (tso_maxlen > sc->hn_ndis_tso_szmax)
5660 tso_maxlen = sc->hn_ndis_tso_szmax;
5661 hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
5662
5663 if (hn_xpnt_vf_isready(sc)) {
5664 if (hw_tsomax > sc->hn_vf_ifp->if_hw_tsomax)
5665 hw_tsomax = sc->hn_vf_ifp->if_hw_tsomax;
5666 }
5667 ifp->if_hw_tsomax = hw_tsomax;
5668 if (bootverbose)
5669 if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
5670 }
5671
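/*
 * Propagate the chimney send size and the checksum offload assistance
 * bits, derived from the host capabilities, to all TX rings; enable
 * HASHVAL pktinfo on TX if the host supports it.
 */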
5672 static void
5673 hn_fixup_tx_data(struct hn_softc *sc)
5674 {
5675 uint64_t csum_assist;
5676 int i;
5677
5678 hn_set_chim_size(sc, sc->hn_chim_szmax);
5679 if (hn_tx_chimney_size > 0 &&
5680 hn_tx_chimney_size < sc->hn_chim_szmax)
5681 hn_set_chim_size(sc, hn_tx_chimney_size);
5682
5683 csum_assist = 0;
5684 if (sc->hn_caps & HN_CAP_IPCS)
5685 csum_assist |= CSUM_IP;
5686 if (sc->hn_caps & HN_CAP_TCP4CS)
5687 csum_assist |= CSUM_IP_TCP;
5688 if ((sc->hn_caps & HN_CAP_UDP4CS) && hn_enable_udp4cs)
5689 csum_assist |= CSUM_IP_UDP;
5690 if (sc->hn_caps & HN_CAP_TCP6CS)
5691 csum_assist |= CSUM_IP6_TCP;
5692 if ((sc->hn_caps & HN_CAP_UDP6CS) && hn_enable_udp6cs)
5693 csum_assist |= CSUM_IP6_UDP;
5694 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5695 sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
5696
5697 if (sc->hn_caps & HN_CAP_HASHVAL) {
5698 /*
5699 * Support HASHVAL pktinfo on TX path.
5700 */
5701 if (bootverbose)
5702 if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
5703 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5704 sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
5705 }
5706 }
5707
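/*
 * Flag all RX rings to expect UDP hash values when the host advertises
 * the UDP hash capability.
 */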
5708 static void
5709 hn_fixup_rx_data(struct hn_softc *sc)
5710 {
5711
5712 if (sc->hn_caps & HN_CAP_UDPHASH) {
5713 int i;
5714
5715 for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
5716 sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_UDP_HASH;
5717 }
5718 }
5719
5720 static void
5721 hn_destroy_tx_data(struct hn_softc *sc)
5722 {
5723 int i;
5724
5725 if (sc->hn_chim != NULL) {
5726 if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) {
5727 hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
5728 } else {
5729 device_printf(sc->hn_dev,
5730 "chimney sending buffer is referenced");
5731 }
5732 sc->hn_chim = NULL;
5733 }
5734
5735 if (sc->hn_tx_ring_cnt == 0)
5736 return;
5737
5738 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5739 hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
5740
5741 free(sc->hn_tx_ring, M_DEVBUF);
5742 sc->hn_tx_ring = NULL;
5743
5744 sc->hn_tx_ring_cnt = 0;
5745 sc->hn_tx_ring_inuse = 0;
5746 }
5747
5748 #ifdef HN_IFSTART_SUPPORT
5749
5750 static void
5751 hn_start_taskfunc(void *xtxr, int pending __unused)
5752 {
5753 struct hn_tx_ring *txr = xtxr;
5754
5755 mtx_lock(&txr->hn_tx_lock);
5756 hn_start_locked(txr, 0);
5757 mtx_unlock(&txr->hn_tx_lock);
5758 }
5759
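/*
 * Drain the if_snd queue in if_start mode.  A non-zero "len" limits
 * direct transmission: once a packet longer than "len" is seen, it is
 * requeued and 1 is returned so the caller can defer the rest to the TX
 * taskqueue.
 */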
5760 static int
5761 hn_start_locked(struct hn_tx_ring *txr, int len)
5762 {
5763 struct hn_softc *sc = txr->hn_sc;
5764 struct ifnet *ifp = sc->hn_ifp;
5765 int sched = 0;
5766
5767 KASSERT(hn_use_if_start,
5768 ("hn_start_locked is called, when if_start is disabled"));
5769 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5770 mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5771 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5772
5773 if (__predict_false(txr->hn_suspended))
5774 return (0);
5775
5776 if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
5777 IFF_DRV_RUNNING)
5778 return (0);
5779
5780 while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
5781 struct hn_txdesc *txd;
5782 struct mbuf *m_head;
5783 int error;
5784
5785 IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
5786 if (m_head == NULL)
5787 break;
5788
5789 if (len > 0 && m_head->m_pkthdr.len > len) {
5790 /*
5791 * This send could be time consuming; let the caller
5792 * dispatch this packet (and any follow-up packets)
5793 * to the TX taskqueue.
5794 */
5795 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5796 sched = 1;
5797 break;
5798 }
5799
5800 #if defined(INET6) || defined(INET)
5801 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
5802 m_head = hn_tso_fixup(m_head);
5803 if (__predict_false(m_head == NULL)) {
5804 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5805 continue;
5806 }
5807 } else if (m_head->m_pkthdr.csum_flags &
5808 (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
5809 m_head = hn_set_hlen(m_head);
5810 if (__predict_false(m_head == NULL)) {
5811 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5812 continue;
5813 }
5814 }
5815 #endif
5816
5817 txd = hn_txdesc_get(txr);
5818 if (txd == NULL) {
5819 txr->hn_no_txdescs++;
5820 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5821 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5822 break;
5823 }
5824
5825 error = hn_encap(ifp, txr, txd, &m_head);
5826 if (error) {
5827 /* Both txd and m_head are freed */
5828 KASSERT(txr->hn_agg_txd == NULL,
5829 ("encap failed w/ pending aggregating txdesc"));
5830 continue;
5831 }
5832
5833 if (txr->hn_agg_pktleft == 0) {
5834 if (txr->hn_agg_txd != NULL) {
5835 KASSERT(m_head == NULL,
5836 ("pending mbuf for aggregating txdesc"));
5837 error = hn_flush_txagg(ifp, txr);
5838 if (__predict_false(error)) {
5839 atomic_set_int(&ifp->if_drv_flags,
5840 IFF_DRV_OACTIVE);
5841 break;
5842 }
5843 } else {
5844 KASSERT(m_head != NULL, ("mbuf was freed"));
5845 error = hn_txpkt(ifp, txr, txd);
5846 if (__predict_false(error)) {
5847 /* txd is freed, but m_head is not */
5848 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5849 atomic_set_int(&ifp->if_drv_flags,
5850 IFF_DRV_OACTIVE);
5851 break;
5852 }
5853 }
5854 }
5855 #ifdef INVARIANTS
5856 else {
5857 KASSERT(txr->hn_agg_txd != NULL,
5858 ("no aggregating txdesc"));
5859 KASSERT(m_head == NULL,
5860 ("pending mbuf for aggregating txdesc"));
5861 }
5862 #endif
5863 }
5864
5865 	/* Flush pending aggregated transmission. */
5866 if (txr->hn_agg_txd != NULL)
5867 hn_flush_txagg(ifp, txr);
5868 return (sched);
5869 }
5870
5871 static void
5872 hn_start(struct ifnet *ifp)
5873 {
5874 struct hn_softc *sc = ifp->if_softc;
5875 struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
5876
5877 if (txr->hn_sched_tx)
5878 goto do_sched;
5879
5880 if (mtx_trylock(&txr->hn_tx_lock)) {
5881 int sched;
5882
5883 sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5884 mtx_unlock(&txr->hn_tx_lock);
5885 if (!sched)
5886 return;
5887 }
5888 do_sched:
5889 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
5890 }
5891
5892 static void
5893 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
5894 {
5895 struct hn_tx_ring *txr = xtxr;
5896
5897 mtx_lock(&txr->hn_tx_lock);
5898 atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
5899 hn_start_locked(txr, 0);
5900 mtx_unlock(&txr->hn_tx_lock);
5901 }
5902
5903 static void
5904 hn_start_txeof(struct hn_tx_ring *txr)
5905 {
5906 struct hn_softc *sc = txr->hn_sc;
5907 struct ifnet *ifp = sc->hn_ifp;
5908
5909 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5910
5911 if (txr->hn_sched_tx)
5912 goto do_sched;
5913
5914 if (mtx_trylock(&txr->hn_tx_lock)) {
5915 int sched;
5916
5917 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5918 sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5919 mtx_unlock(&txr->hn_tx_lock);
5920 if (sched) {
5921 taskqueue_enqueue(txr->hn_tx_taskq,
5922 &txr->hn_tx_task);
5923 }
5924 } else {
5925 do_sched:
5926 		/*
5927 		 * Clear OACTIVE early, in the hope that others
5928 		 * can catch up.  The task will clear the flag
5929 		 * again while holding hn_tx_lock to avoid
5930 		 * possible races.
5931 		 */
5932 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5933 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
5934 }
5935 }
5936
5937 #endif /* HN_IFSTART_SUPPORT */
5938
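/*
 * if_transmit TX path: drain the ring's buf_ring, encapsulating and
 * aggregating packets.  Like hn_start_locked(), returns non-zero if
 * the remaining packets should be dispatched to the TX taskqueue.
 */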
5939 static int
5940 hn_xmit(struct hn_tx_ring *txr, int len)
5941 {
5942 struct hn_softc *sc = txr->hn_sc;
5943 struct ifnet *ifp = sc->hn_ifp;
5944 struct mbuf *m_head;
5945 int sched = 0;
5946
5947 mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5948 #ifdef HN_IFSTART_SUPPORT
5949 KASSERT(hn_use_if_start == 0,
5950 	    ("hn_xmit is called when if_start is enabled"));
5951 #endif
5952 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5953
5954 if (__predict_false(txr->hn_suspended))
5955 return (0);
5956
5957 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
5958 return (0);
5959
5960 while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
5961 struct hn_txdesc *txd;
5962 int error;
5963
5964 if (len > 0 && m_head->m_pkthdr.len > len) {
5965 			/*
5966 			 * Sending this packet could be time consuming; let
5967 			 * callers dispatch it (and any packets that follow)
5968 			 * to the TX taskqueue.
5969 			 */
5970 drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5971 sched = 1;
5972 break;
5973 }
5974
5975 txd = hn_txdesc_get(txr);
5976 if (txd == NULL) {
5977 txr->hn_no_txdescs++;
5978 drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5979 txr->hn_oactive = 1;
5980 break;
5981 }
5982
5983 error = hn_encap(ifp, txr, txd, &m_head);
5984 if (error) {
5985 /* Both txd and m_head are freed; discard */
5986 KASSERT(txr->hn_agg_txd == NULL,
5987 ("encap failed w/ pending aggregating txdesc"));
5988 drbr_advance(ifp, txr->hn_mbuf_br);
5989 continue;
5990 }
5991
5992 if (txr->hn_agg_pktleft == 0) {
5993 if (txr->hn_agg_txd != NULL) {
5994 KASSERT(m_head == NULL,
5995 ("pending mbuf for aggregating txdesc"));
5996 error = hn_flush_txagg(ifp, txr);
5997 if (__predict_false(error)) {
5998 txr->hn_oactive = 1;
5999 break;
6000 }
6001 } else {
6002 KASSERT(m_head != NULL, ("mbuf was freed"));
6003 error = hn_txpkt(ifp, txr, txd);
6004 if (__predict_false(error)) {
6005 /* txd is freed, but m_head is not */
6006 drbr_putback(ifp, txr->hn_mbuf_br,
6007 m_head);
6008 txr->hn_oactive = 1;
6009 break;
6010 }
6011 }
6012 }
6013 #ifdef INVARIANTS
6014 else {
6015 KASSERT(txr->hn_agg_txd != NULL,
6016 ("no aggregating txdesc"));
6017 KASSERT(m_head == NULL,
6018 ("pending mbuf for aggregating txdesc"));
6019 }
6020 #endif
6021
6022 /* Sent */
6023 drbr_advance(ifp, txr->hn_mbuf_br);
6024 }
6025
6026 	/* Flush pending aggregated transmission. */
6027 if (txr->hn_agg_txd != NULL)
6028 hn_flush_txagg(ifp, txr);
6029 return (sched);
6030 }
6031
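/*
 * if_transmit entry point.  If a transparent VF is active, hand the
 * packet directly to the VF (tapping BPF as configured) and account
 * for it; otherwise fix up TSO/checksum headers, select a TX ring
 * from the flow id (small TCP SYN segments are pinned to ring 0),
 * enqueue onto the ring's buf_ring, and transmit either directly or
 * through the TX taskqueue.
 */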
6032 static int
6033 hn_transmit(struct ifnet *ifp, struct mbuf *m)
6034 {
6035 struct hn_softc *sc = ifp->if_softc;
6036 struct hn_tx_ring *txr;
6037 int error, idx = 0;
6038
6039 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
6040 struct rm_priotracker pt;
6041
6042 rm_rlock(&sc->hn_vf_lock, &pt);
6043 if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
6044 struct mbuf *m_bpf = NULL;
6045 int obytes, omcast;
6046
6047 obytes = m->m_pkthdr.len;
6048 omcast = (m->m_flags & M_MCAST) != 0;
6049
6050 if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) {
6051 if (bpf_peers_present(ifp->if_bpf)) {
6052 m_bpf = m_copypacket(m, M_NOWAIT);
6053 if (m_bpf == NULL) {
6054 /*
6055 * Failed to grab a shallow
6056 * copy; tap now.
6057 */
6058 ETHER_BPF_MTAP(ifp, m);
6059 }
6060 }
6061 } else {
6062 ETHER_BPF_MTAP(ifp, m);
6063 }
6064
6065 error = sc->hn_vf_ifp->if_transmit(sc->hn_vf_ifp, m);
6066 rm_runlock(&sc->hn_vf_lock, &pt);
6067
6068 if (m_bpf != NULL) {
6069 if (!error)
6070 ETHER_BPF_MTAP(ifp, m_bpf);
6071 m_freem(m_bpf);
6072 }
6073
6074 if (error == ENOBUFS) {
6075 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
6076 } else if (error) {
6077 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
6078 } else {
6079 if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
6080 if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes);
6081 if (omcast) {
6082 if_inc_counter(ifp, IFCOUNTER_OMCASTS,
6083 omcast);
6084 }
6085 }
6086 return (error);
6087 }
6088 rm_runlock(&sc->hn_vf_lock, &pt);
6089 }
6090
6091 #if defined(INET6) || defined(INET)
6092 /*
6093 * Perform TSO packet header fixup or get l2/l3 header length now,
6094 * since packet headers should be cache-hot.
6095 */
6096 if (m->m_pkthdr.csum_flags & CSUM_TSO) {
6097 m = hn_tso_fixup(m);
6098 if (__predict_false(m == NULL)) {
6099 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
6100 			return (EIO);
6101 }
6102 } else if (m->m_pkthdr.csum_flags &
6103 (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
6104 m = hn_set_hlen(m);
6105 if (__predict_false(m == NULL)) {
6106 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
6107 			return (EIO);
6108 }
6109 }
6110 #endif
6111
6112 /*
6113 * Select the TX ring based on flowid
6114 */
6115 if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
6116 #ifdef RSS
6117 uint32_t bid;
6118
6119 if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m),
6120 &bid) == 0)
6121 idx = bid % sc->hn_tx_ring_inuse;
6122 else
6123 #endif
6124 {
6125 #if defined(INET6) || defined(INET)
6126 int tcpsyn = 0;
6127
6128 if (m->m_pkthdr.len < 128 &&
6129 (m->m_pkthdr.csum_flags &
6130 (CSUM_IP_TCP | CSUM_IP6_TCP)) &&
6131 (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) {
6132 m = hn_check_tcpsyn(m, &tcpsyn);
6133 if (__predict_false(m == NULL)) {
6134 if_inc_counter(ifp,
6135 IFCOUNTER_OERRORS, 1);
6136 return (EIO);
6137 }
6138 }
6139 #else
6140 const int tcpsyn = 0;
6141 #endif
6142 if (tcpsyn)
6143 idx = 0;
6144 else
6145 idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
6146 }
6147 }
6148 txr = &sc->hn_tx_ring[idx];
6149
6150 error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
6151 if (error) {
6152 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
6153 		return (error);
6154 }
6155
6156 if (txr->hn_oactive)
6157 		return (0);
6158
6159 if (txr->hn_sched_tx)
6160 goto do_sched;
6161
6162 if (mtx_trylock(&txr->hn_tx_lock)) {
6163 int sched;
6164
6165 sched = hn_xmit(txr, txr->hn_direct_tx_size);
6166 mtx_unlock(&txr->hn_tx_lock);
6167 if (!sched)
6168 			return (0);
6169 }
6170 do_sched:
6171 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
6172 	return (0);
6173 }
6174
6175 static void
6176 hn_tx_ring_qflush(struct hn_tx_ring *txr)
6177 {
6178 struct mbuf *m;
6179
6180 mtx_lock(&txr->hn_tx_lock);
6181 while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
6182 m_freem(m);
6183 mtx_unlock(&txr->hn_tx_lock);
6184 }
6185
6186 static void
6187 hn_xmit_qflush(struct ifnet *ifp)
6188 {
6189 struct hn_softc *sc = ifp->if_softc;
6190 struct rm_priotracker pt;
6191 int i;
6192
6193 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
6194 hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6195 if_qflush(ifp);
6196
6197 rm_rlock(&sc->hn_vf_lock, &pt);
6198 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
6199 sc->hn_vf_ifp->if_qflush(sc->hn_vf_ifp);
6200 rm_runlock(&sc->hn_vf_lock, &pt);
6201 }
6202
6203 static void
6204 hn_xmit_txeof(struct hn_tx_ring *txr)
6205 {
6206
6207 if (txr->hn_sched_tx)
6208 goto do_sched;
6209
6210 if (mtx_trylock(&txr->hn_tx_lock)) {
6211 int sched;
6212
6213 txr->hn_oactive = 0;
6214 sched = hn_xmit(txr, txr->hn_direct_tx_size);
6215 mtx_unlock(&txr->hn_tx_lock);
6216 if (sched) {
6217 taskqueue_enqueue(txr->hn_tx_taskq,
6218 &txr->hn_tx_task);
6219 }
6220 } else {
6221 do_sched:
6222 		/*
6223 		 * Clear oactive early, in the hope that others
6224 		 * can catch up.  The task will clear oactive
6225 		 * again while holding hn_tx_lock to avoid
6226 		 * possible races.
6227 		 */
6228 txr->hn_oactive = 0;
6229 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6230 }
6231 }
6232
6233 static void
6234 hn_xmit_taskfunc(void *xtxr, int pending __unused)
6235 {
6236 struct hn_tx_ring *txr = xtxr;
6237
6238 mtx_lock(&txr->hn_tx_lock);
6239 hn_xmit(txr, 0);
6240 mtx_unlock(&txr->hn_tx_lock);
6241 }
6242
6243 static void
6244 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
6245 {
6246 struct hn_tx_ring *txr = xtxr;
6247
6248 mtx_lock(&txr->hn_tx_lock);
6249 txr->hn_oactive = 0;
6250 hn_xmit(txr, 0);
6251 mtx_unlock(&txr->hn_tx_lock);
6252 }
6253
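/*
 * Link a vmbus channel to its RX ring (and TX ring, if one exists
 * for this sub-index), bind the channel to a CPU, and open it with
 * the ring's TX/RX bufring.
 */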
6254 static int
6255 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
6256 {
6257 struct vmbus_chan_br cbr;
6258 struct hn_rx_ring *rxr;
6259 struct hn_tx_ring *txr = NULL;
6260 int idx, error;
6261
6262 idx = vmbus_chan_subidx(chan);
6263
6264 /*
6265 * Link this channel to RX/TX ring.
6266 */
6267 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6268 	    ("invalid channel index %d, should be >= 0 and < %d",
6269 idx, sc->hn_rx_ring_inuse));
6270 rxr = &sc->hn_rx_ring[idx];
6271 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
6272 ("RX ring %d already attached", idx));
6273 rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
6274 rxr->hn_chan = chan;
6275
6276 if (bootverbose) {
6277 if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
6278 idx, vmbus_chan_id(chan));
6279 }
6280
6281 if (idx < sc->hn_tx_ring_inuse) {
6282 txr = &sc->hn_tx_ring[idx];
6283 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
6284 ("TX ring %d already attached", idx));
6285 txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
6286
6287 txr->hn_chan = chan;
6288 if (bootverbose) {
6289 if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
6290 idx, vmbus_chan_id(chan));
6291 }
6292 }
6293
6294 /* Bind this channel to a proper CPU. */
6295 vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));
6296
6297 /*
6298 * Open this channel
6299 */
6300 cbr.cbr = rxr->hn_br;
6301 cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
6302 cbr.cbr_txsz = HN_TXBR_SIZE;
6303 cbr.cbr_rxsz = HN_RXBR_SIZE;
6304 error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
6305 if (error) {
6306 if (error == EISCONN) {
6307 if_printf(sc->hn_ifp, "bufring is connected after "
6308 "chan%u open failure\n", vmbus_chan_id(chan));
6309 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6310 } else {
6311 if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
6312 vmbus_chan_id(chan), error);
6313 }
6314 }
6315 return (error);
6316 }
6317
6318 static void
6319 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
6320 {
6321 struct hn_rx_ring *rxr;
6322 int idx, error;
6323
6324 idx = vmbus_chan_subidx(chan);
6325
6326 	/*
6327 	 * Unlink this channel from its RX/TX ring.
6328 	 */
6329 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6330 	    ("invalid channel index %d, should be >= 0 and < %d",
6331 	    idx, sc->hn_rx_ring_inuse));
6332 rxr = &sc->hn_rx_ring[idx];
6333 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
6334 ("RX ring %d is not attached", idx));
6335 rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
6336
6337 if (idx < sc->hn_tx_ring_inuse) {
6338 struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
6339
6340 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
6341 		    ("TX ring %d is not attached", idx));
6342 txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
6343 }
6344
6345 /*
6346 * Close this channel.
6347 *
6348 * NOTE:
6349 * Channel closing does _not_ destroy the target channel.
6350 */
6351 error = vmbus_chan_close_direct(chan);
6352 if (error == EISCONN) {
6353 if_printf(sc->hn_ifp, "chan%u bufring is connected "
6354 "after being closed\n", vmbus_chan_id(chan));
6355 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6356 } else if (error) {
6357 if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
6358 vmbus_chan_id(chan), error);
6359 }
6360 }
6361
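/*
 * Attach all allocated sub-channels.  Individual attach failures are
 * not fatal here; on error the caller detaches every channel.
 */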
6362 static int
6363 hn_attach_subchans(struct hn_softc *sc)
6364 {
6365 struct vmbus_channel **subchans;
6366 int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6367 int i, error = 0;
6368
6369 KASSERT(subchan_cnt > 0, ("no sub-channels"));
6370
6371 /* Attach the sub-channels. */
6372 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6373 for (i = 0; i < subchan_cnt; ++i) {
6374 int error1;
6375
6376 error1 = hn_chan_attach(sc, subchans[i]);
6377 if (error1) {
6378 error = error1;
6379 /* Move on; all channels will be detached later. */
6380 }
6381 }
6382 vmbus_subchan_rel(subchans, subchan_cnt);
6383
6384 if (error) {
6385 if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
6386 } else {
6387 if (bootverbose) {
6388 if_printf(sc->hn_ifp, "%d sub-channels attached\n",
6389 subchan_cnt);
6390 }
6391 }
6392 return (error);
6393 }
6394
6395 static void
6396 hn_detach_allchans(struct hn_softc *sc)
6397 {
6398 struct vmbus_channel **subchans;
6399 int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6400 int i;
6401
6402 if (subchan_cnt == 0)
6403 goto back;
6404
6405 /* Detach the sub-channels. */
6406 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6407 for (i = 0; i < subchan_cnt; ++i)
6408 hn_chan_detach(sc, subchans[i]);
6409 vmbus_subchan_rel(subchans, subchan_cnt);
6410
6411 back:
6412 /*
6413 * Detach the primary channel, _after_ all sub-channels
6414 * are detached.
6415 */
6416 hn_chan_detach(sc, sc->hn_prichan);
6417
6418 /* Wait for sub-channels to be destroyed, if any. */
6419 vmbus_subchan_drain(sc->hn_prichan);
6420
6421 #ifdef INVARIANTS
6422 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6423 KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
6424 HN_RX_FLAG_ATTACHED) == 0,
6425 ("%dth RX ring is still attached", i));
6426 }
6427 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
6428 KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
6429 HN_TX_FLAG_ATTACHED) == 0,
6430 ("%dth TX ring is still attached", i));
6431 }
6432 #endif
6433 }
6434
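/*
 * Ask NVS for sub-channels to back multiple RX/TX rings.  On input
 * *nsubch is the desired number of sub-channels; on output it is the
 * number actually granted (0 if RSS is unavailable or only a single
 * channel can be used).
 */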
6435 static int
6436 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
6437 {
6438 struct vmbus_channel **subchans;
6439 int nchan, rxr_cnt, error;
6440
6441 nchan = *nsubch + 1;
6442 if (nchan == 1) {
6443 /*
6444 * Multiple RX/TX rings are not requested.
6445 */
6446 *nsubch = 0;
6447 return (0);
6448 }
6449
6450 /*
6451 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
6452 * table entries.
6453 */
6454 error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
6455 if (error) {
6456 /* No RSS; this is benign. */
6457 *nsubch = 0;
6458 return (0);
6459 }
6460 if (bootverbose) {
6461 if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
6462 rxr_cnt, nchan);
6463 }
6464
6465 if (nchan > rxr_cnt)
6466 nchan = rxr_cnt;
6467 if (nchan == 1) {
6468 if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
6469 *nsubch = 0;
6470 return (0);
6471 }
6472
6473 /*
6474 * Allocate sub-channels from NVS.
6475 */
6476 *nsubch = nchan - 1;
6477 error = hn_nvs_alloc_subchans(sc, nsubch);
6478 if (error || *nsubch == 0) {
6479 /* Failed to allocate sub-channels. */
6480 *nsubch = 0;
6481 return (0);
6482 }
6483
6484 /*
6485 * Wait for all sub-channels to become ready before moving on.
6486 */
6487 subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
6488 vmbus_subchan_rel(subchans, *nsubch);
6489 return (0);
6490 }
6491
6492 static bool
6493 hn_synth_attachable(const struct hn_softc *sc)
6494 {
6495 int i;
6496
6497 if (sc->hn_flags & HN_FLAG_ERRORS)
6498 return (false);
6499
6500 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6501 const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
6502
6503 if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
6504 return (false);
6505 }
6506 return (true);
6507 }
6508
6509 /*
6510 * Make sure that the RX filter is zero after the successful
6511 * RNDIS initialization.
6512 *
6513 * NOTE:
6514 * Under certain conditions on certain versions of Hyper-V,
6515 * the RNDIS rxfilter is _not_ zero on the hypervisor side
6516 * after the successful RNDIS initialization, which breaks
6517 * the assumption of any following code (well, it breaks the
6518 * RNDIS API contract actually). Clear the RNDIS rxfilter
6519 * explicitly, drain packets sneaking through, and drain the
6520 * interrupt taskqueues scheduled due to the stealth packets.
6521 */
6522 static void
6523 hn_rndis_init_fixat(struct hn_softc *sc, int nchan)
6524 {
6525
6526 hn_disable_rx(sc);
6527 hn_drain_rxtx(sc, nchan);
6528 }
6529
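/*
 * Attach the synthetic parts in order: primary channel, NVS, RNDIS,
 * sub-channels, and finally the RSS key and indirect table.  On
 * failure whatever has been attached is torn down again.
 */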
6530 static int
6531 hn_synth_attach(struct hn_softc *sc, int mtu)
6532 {
6533 #define ATTACHED_NVS 0x0002
6534 #define ATTACHED_RNDIS 0x0004
6535
6536 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
6537 int error, nsubch, nchan = 1, i, rndis_inited;
6538 uint32_t old_caps, attached = 0;
6539
6540 KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
6541 ("synthetic parts were attached"));
6542
6543 if (!hn_synth_attachable(sc))
6544 return (ENXIO);
6545
6546 /* Save capabilities for later verification. */
6547 old_caps = sc->hn_caps;
6548 sc->hn_caps = 0;
6549
6550 /* Clear RSS stuffs. */
6551 sc->hn_rss_ind_size = 0;
6552 sc->hn_rss_hash = 0;
6553 sc->hn_rss_hcap = 0;
6554
6555 /*
6556 * Attach the primary channel _before_ attaching NVS and RNDIS.
6557 */
6558 error = hn_chan_attach(sc, sc->hn_prichan);
6559 if (error)
6560 goto failed;
6561
6562 /*
6563 * Attach NVS.
6564 */
6565 error = hn_nvs_attach(sc, mtu);
6566 if (error)
6567 goto failed;
6568 attached |= ATTACHED_NVS;
6569
6570 /*
6571 * Attach RNDIS _after_ NVS is attached.
6572 */
6573 error = hn_rndis_attach(sc, mtu, &rndis_inited);
6574 if (rndis_inited)
6575 attached |= ATTACHED_RNDIS;
6576 if (error)
6577 goto failed;
6578
6579 /*
6580 * Make sure capabilities are not changed.
6581 */
6582 if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
6583 if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
6584 old_caps, sc->hn_caps);
6585 error = ENXIO;
6586 goto failed;
6587 }
6588
6589 /*
6590 * Allocate sub-channels for multi-TX/RX rings.
6591 *
6592 * NOTE:
6593 * The # of RX rings that can be used is equivalent to the # of
6594 * channels to be requested.
6595 */
6596 nsubch = sc->hn_rx_ring_cnt - 1;
6597 error = hn_synth_alloc_subchans(sc, &nsubch);
6598 if (error)
6599 goto failed;
6600 /* NOTE: _Full_ synthetic parts detach is required now. */
6601 sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
6602
6603 /*
6604 * Set the # of TX/RX rings that could be used according to
6605 * the # of channels that NVS offered.
6606 */
6607 nchan = nsubch + 1;
6608 hn_set_ring_inuse(sc, nchan);
6609 if (nchan == 1) {
6610 /* Only the primary channel can be used; done */
6611 goto back;
6612 }
6613
6614 /*
6615 * Attach the sub-channels.
6616 *
6617 * NOTE: hn_set_ring_inuse() _must_ have been called.
6618 */
6619 error = hn_attach_subchans(sc);
6620 if (error)
6621 goto failed;
6622
6623 /*
6624 * Configure RSS key and indirect table _after_ all sub-channels
6625 * are attached.
6626 */
6627 if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
6628 /*
6629 * RSS key is not set yet; set it to the default RSS key.
6630 */
6631 if (bootverbose)
6632 if_printf(sc->hn_ifp, "setup default RSS key\n");
6633 #ifdef RSS
6634 rss_getkey(rss->rss_key);
6635 #else
6636 memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
6637 #endif
6638 sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
6639 }
6640
6641 if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
6642 /*
6643 * RSS indirect table is not set yet; set it up in round-
6644 * robin fashion.
6645 */
6646 if (bootverbose) {
6647 if_printf(sc->hn_ifp, "setup default RSS indirect "
6648 "table\n");
6649 }
6650 for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
6651 uint32_t subidx;
6652
6653 #ifdef RSS
6654 subidx = rss_get_indirection_to_bucket(i);
6655 #else
6656 subidx = i;
6657 #endif
6658 rss->rss_ind[i] = subidx % nchan;
6659 }
6660 sc->hn_flags |= HN_FLAG_HAS_RSSIND;
6661 } else {
6662 /*
6663 * # of usable channels may be changed, so we have to
6664 * make sure that all entries in RSS indirect table
6665 * are valid.
6666 *
6667 * NOTE: hn_set_ring_inuse() _must_ have been called.
6668 */
6669 hn_rss_ind_fixup(sc);
6670 }
6671
6672 sc->hn_rss_hash = sc->hn_rss_hcap;
6673 if ((sc->hn_flags & HN_FLAG_RXVF) ||
6674 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
6675 /* NOTE: Don't reconfigure RSS; will do immediately. */
6676 hn_vf_rss_fixup(sc, false);
6677 }
6678 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
6679 if (error)
6680 goto failed;
6681 back:
6682 /*
6683 * Fixup transmission aggregation setup.
6684 */
6685 hn_set_txagg(sc);
6686 hn_rndis_init_fixat(sc, nchan);
6687 return (0);
6688
6689 failed:
6690 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
6691 hn_rndis_init_fixat(sc, nchan);
6692 hn_synth_detach(sc);
6693 } else {
6694 if (attached & ATTACHED_RNDIS) {
6695 hn_rndis_init_fixat(sc, nchan);
6696 hn_rndis_detach(sc);
6697 }
6698 if (attached & ATTACHED_NVS)
6699 hn_nvs_detach(sc);
6700 hn_chan_detach(sc, sc->hn_prichan);
6701 /* Restore old capabilities. */
6702 sc->hn_caps = old_caps;
6703 }
6704 return (error);
6705
6706 #undef ATTACHED_RNDIS
6707 #undef ATTACHED_NVS
6708 }
6709
6710 /*
6711 * NOTE:
6712  * The interface must have been suspended through hn_suspend() before
6713  * this function gets called.
6714 */
6715 static void
6716 hn_synth_detach(struct hn_softc *sc)
6717 {
6718
6719 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
6720 ("synthetic parts were not attached"));
6721
6722 /* Detach the RNDIS first. */
6723 hn_rndis_detach(sc);
6724
6725 /* Detach NVS. */
6726 hn_nvs_detach(sc);
6727
6728 /* Detach all of the channels. */
6729 hn_detach_allchans(sc);
6730
6731 if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_rxbuf_gpadl != 0) {
6732 /*
6733 * Host is post-Win2016, disconnect RXBUF from primary channel here.
6734 */
6735 int error;
6736
6737 error = vmbus_chan_gpadl_disconnect(sc->hn_prichan,
6738 sc->hn_rxbuf_gpadl);
6739 if (error) {
6740 if_printf(sc->hn_ifp,
6741 "rxbuf gpadl disconn failed: %d\n", error);
6742 sc->hn_flags |= HN_FLAG_RXBUF_REF;
6743 }
6744 sc->hn_rxbuf_gpadl = 0;
6745 }
6746
6747 if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_chim_gpadl != 0) {
6748 /*
6749 * Host is post-Win2016, disconnect chimney sending buffer from
6750 * primary channel here.
6751 */
6752 int error;
6753
6754 error = vmbus_chan_gpadl_disconnect(sc->hn_prichan,
6755 sc->hn_chim_gpadl);
6756 if (error) {
6757 if_printf(sc->hn_ifp,
6758 "chim gpadl disconn failed: %d\n", error);
6759 sc->hn_flags |= HN_FLAG_CHIM_REF;
6760 }
6761 sc->hn_chim_gpadl = 0;
6762 }
6763 sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
6764 }
6765
6766 static void
6767 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
6768 {
6769 KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
6770 ("invalid ring count %d", ring_cnt));
6771
6772 if (sc->hn_tx_ring_cnt > ring_cnt)
6773 sc->hn_tx_ring_inuse = ring_cnt;
6774 else
6775 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
6776 sc->hn_rx_ring_inuse = ring_cnt;
6777
6778 #ifdef RSS
6779 if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) {
6780 if_printf(sc->hn_ifp, "# of RX rings (%d) does not match "
6781 "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse,
6782 rss_getnumbuckets());
6783 }
6784 #endif
6785
6786 if (bootverbose) {
6787 if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
6788 sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
6789 }
6790 }
6791
6792 static void
6793 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
6794 {
6795
6796 /*
6797 * NOTE:
6798 * The TX bufring will not be drained by the hypervisor,
6799 * if the primary channel is revoked.
6800 */
6801 while (!vmbus_chan_rx_empty(chan) ||
6802 (!vmbus_chan_is_revoked(sc->hn_prichan) &&
6803 !vmbus_chan_tx_empty(chan)))
6804 pause("waitch", 1);
6805 vmbus_chan_intr_drain(chan);
6806 }
6807
6808 static void
6809 hn_disable_rx(struct hn_softc *sc)
6810 {
6811
6812 /*
6813 * Disable RX by clearing RX filter forcefully.
6814 */
6815 sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
6816 hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */
6817
6818 /*
6819 * Give RNDIS enough time to flush all pending data packets.
6820 */
6821 pause("waitrx", (200 * hz) / 1000);
6822 }
6823
6824 /*
6825 * NOTE:
6826 * RX/TX _must_ have been suspended/disabled, before this function
6827 * is called.
6828 */
6829 static void
6830 hn_drain_rxtx(struct hn_softc *sc, int nchan)
6831 {
6832 struct vmbus_channel **subch = NULL;
6833 int nsubch;
6834
6835 /*
6836 * Drain RX/TX bufrings and interrupts.
6837 */
6838 nsubch = nchan - 1;
6839 if (nsubch > 0)
6840 subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
6841
6842 if (subch != NULL) {
6843 int i;
6844
6845 for (i = 0; i < nsubch; ++i)
6846 hn_chan_drain(sc, subch[i]);
6847 }
6848 hn_chan_drain(sc, sc->hn_prichan);
6849
6850 if (subch != NULL)
6851 vmbus_subchan_rel(subch, nsubch);
6852 }
6853
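/*
 * Suspend the data path: mark all TX rings suspended, wait for
 * pending sends to complete, clear the RX filter, drain the channel
 * bufrings, and finally drain any pending TX tasks.
 */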
6854 static void
6855 hn_suspend_data(struct hn_softc *sc)
6856 {
6857 struct hn_tx_ring *txr;
6858 int i;
6859
6860 HN_LOCK_ASSERT(sc);
6861
6862 /*
6863 * Suspend TX.
6864 */
6865 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6866 txr = &sc->hn_tx_ring[i];
6867
6868 mtx_lock(&txr->hn_tx_lock);
6869 txr->hn_suspended = 1;
6870 mtx_unlock(&txr->hn_tx_lock);
6871 		/* No one is able to send more packets now. */
6872
6873 /*
6874 * Wait for all pending sends to finish.
6875 *
6876 * NOTE:
6877 * We will _not_ receive all pending send-done, if the
6878 * primary channel is revoked.
6879 */
6880 while (hn_tx_ring_pending(txr) &&
6881 !vmbus_chan_is_revoked(sc->hn_prichan))
6882 pause("hnwtx", 1 /* 1 tick */);
6883 }
6884
6885 /*
6886 * Disable RX.
6887 */
6888 hn_disable_rx(sc);
6889
6890 /*
6891 * Drain RX/TX.
6892 */
6893 hn_drain_rxtx(sc, sc->hn_rx_ring_inuse);
6894
6895 /*
6896 * Drain any pending TX tasks.
6897 *
6898 * NOTE:
6899 * The above hn_drain_rxtx() can dispatch TX tasks, so the TX
6900 * tasks will have to be drained _after_ the above hn_drain_rxtx().
6901 */
6902 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6903 txr = &sc->hn_tx_ring[i];
6904
6905 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
6906 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
6907 }
6908 }
6909
6910 static void
6911 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
6912 {
6913
6914 ((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
6915 }
6916
6917 static void
6918 hn_suspend_mgmt(struct hn_softc *sc)
6919 {
6920 struct task task;
6921
6922 HN_LOCK_ASSERT(sc);
6923
6924 /*
6925 	 * Make sure that hn_mgmt_taskq0 can no longer be accessed
6926 * through hn_mgmt_taskq.
6927 */
6928 TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
6929 vmbus_chan_run_task(sc->hn_prichan, &task);
6930
6931 /*
6932 * Make sure that all pending management tasks are completed.
6933 */
6934 taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
6935 taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
6936 taskqueue_drain_all(sc->hn_mgmt_taskq0);
6937 }
6938
6939 static void
6940 hn_suspend(struct hn_softc *sc)
6941 {
6942
6943 /* Disable polling. */
6944 hn_polling(sc, 0);
6945
6946 /*
6947 * If the non-transparent mode VF is activated, the synthetic
6948 * device is receiving packets, so the data path of the
6949 * synthetic device must be suspended.
6950 */
6951 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
6952 (sc->hn_flags & HN_FLAG_RXVF))
6953 hn_suspend_data(sc);
6954 hn_suspend_mgmt(sc);
6955 }
6956
6957 static void
6958 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
6959 {
6960 int i;
6961
6962 KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
6963 ("invalid TX ring count %d", tx_ring_cnt));
6964
6965 for (i = 0; i < tx_ring_cnt; ++i) {
6966 struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6967
6968 mtx_lock(&txr->hn_tx_lock);
6969 txr->hn_suspended = 0;
6970 mtx_unlock(&txr->hn_tx_lock);
6971 }
6972 }
6973
6974 static void
6975 hn_resume_data(struct hn_softc *sc)
6976 {
6977 int i;
6978
6979 HN_LOCK_ASSERT(sc);
6980
6981 /*
6982 * Re-enable RX.
6983 */
6984 hn_rxfilter_config(sc);
6985
6986 /*
6987 * Make sure to clear suspend status on "all" TX rings,
6988 * since hn_tx_ring_inuse can be changed after
6989 * hn_suspend_data().
6990 */
6991 hn_resume_tx(sc, sc->hn_tx_ring_cnt);
6992
6993 #ifdef HN_IFSTART_SUPPORT
6994 if (!hn_use_if_start)
6995 #endif
6996 {
6997 /*
6998 * Flush unused drbrs, since hn_tx_ring_inuse may be
6999 * reduced.
7000 */
7001 for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
7002 hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
7003 }
7004
7005 /*
7006 * Kick start TX.
7007 */
7008 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
7009 struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
7010
7011 /*
7012 * Use txeof task, so that any pending oactive can be
7013 * cleared properly.
7014 */
7015 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
7016 }
7017 }
7018
7019 static void
7020 hn_resume_mgmt(struct hn_softc *sc)
7021 {
7022
7023 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
7024
7025 /*
7026 * Kick off network change detection, if it was pending.
7027 * If no network change was pending, start link status
7028 * checks, which is more lightweight than network change
7029 * detection.
7030 */
7031 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
7032 hn_change_network(sc);
7033 else
7034 hn_update_link_status(sc);
7035 }
7036
7037 static void
7038 hn_resume(struct hn_softc *sc)
7039 {
7040
7041 /*
7042 * If the non-transparent mode VF is activated, the synthetic
7043 	 * device has to receive packets, so the data path of the
7044 * synthetic device must be resumed.
7045 */
7046 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
7047 (sc->hn_flags & HN_FLAG_RXVF))
7048 hn_resume_data(sc);
7049
7050 /*
7051 * Don't resume link status change if VF is attached/activated.
7052 * - In the non-transparent VF mode, the synthetic device marks
7053 * link down until the VF is deactivated; i.e. VF is down.
7054 * - In transparent VF mode, VF's media status is used until
7055 * the VF is detached.
7056 */
7057 if ((sc->hn_flags & HN_FLAG_RXVF) == 0 &&
7058 !(hn_xpnt_vf && sc->hn_vf_ifp != NULL))
7059 hn_resume_mgmt(sc);
7060
7061 /*
7062 * Re-enable polling if this interface is running and
7063 * the polling is requested.
7064 */
7065 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0)
7066 hn_polling(sc, sc->hn_pollhz);
7067 }
7068
7069 static void
7070 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
7071 {
7072 const struct rndis_status_msg *msg;
7073 int ofs;
7074
7075 if (dlen < sizeof(*msg)) {
7076 if_printf(sc->hn_ifp, "invalid RNDIS status\n");
7077 return;
7078 }
7079 msg = data;
7080
7081 switch (msg->rm_status) {
7082 case RNDIS_STATUS_MEDIA_CONNECT:
7083 case RNDIS_STATUS_MEDIA_DISCONNECT:
7084 hn_update_link_status(sc);
7085 break;
7086
7087 case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
7088 case RNDIS_STATUS_LINK_SPEED_CHANGE:
7089 /* Not really useful; ignore. */
7090 break;
7091
7092 case RNDIS_STATUS_NETWORK_CHANGE:
7093 ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
7094 if (dlen < ofs + msg->rm_stbuflen ||
7095 msg->rm_stbuflen < sizeof(uint32_t)) {
7096 if_printf(sc->hn_ifp, "network changed\n");
7097 } else {
7098 uint32_t change;
7099
7100 memcpy(&change, ((const uint8_t *)msg) + ofs,
7101 sizeof(change));
7102 if_printf(sc->hn_ifp, "network changed, change %u\n",
7103 change);
7104 }
7105 hn_change_network(sc);
7106 break;
7107
7108 default:
7109 if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
7110 msg->rm_status);
7111 break;
7112 }
7113 }
7114
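/*
 * Walk the RNDIS per-packet-info list and record pointers to the
 * VLAN, checksum, hash value/info and packet-info-id entries in
 * 'info'.  Returns EINVAL if any entry is malformed.
 */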
7115 static int
7116 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
7117 {
7118 const struct rndis_pktinfo *pi = info_data;
7119 uint32_t mask = 0;
7120
7121 while (info_dlen != 0) {
7122 const void *data;
7123 uint32_t dlen;
7124
7125 if (__predict_false(info_dlen < sizeof(*pi)))
7126 return (EINVAL);
7127 if (__predict_false(info_dlen < pi->rm_size))
7128 return (EINVAL);
7129 info_dlen -= pi->rm_size;
7130
7131 if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
7132 return (EINVAL);
7133 if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
7134 return (EINVAL);
7135 dlen = pi->rm_size - pi->rm_pktinfooffset;
7136 data = pi->rm_data;
7137
7138 if (pi->rm_internal == 1) {
7139 switch (pi->rm_type) {
7140 case NDIS_PKTINFO_IT_PKTINFO_ID:
7141 if (__predict_false(dlen < NDIS_PKTINFOID_SZ))
7142 return (EINVAL);
7143 info->pktinfo_id =
7144 (const struct packet_info_id *)data;
7145 mask |= HN_RXINFO_PKTINFO_ID;
7146 break;
7147
7148 default:
7149 goto next;
7150 }
7151 } else {
7152 switch (pi->rm_type) {
7153 case NDIS_PKTINFO_TYPE_VLAN:
7154 if (__predict_false(dlen
7155 < NDIS_VLAN_INFO_SIZE))
7156 return (EINVAL);
7157 info->vlan_info = (const uint32_t *)data;
7158 mask |= HN_RXINFO_VLAN;
7159 break;
7160
7161 case NDIS_PKTINFO_TYPE_CSUM:
7162 if (__predict_false(dlen
7163 < NDIS_RXCSUM_INFO_SIZE))
7164 return (EINVAL);
7165 info->csum_info = (const uint32_t *)data;
7166 mask |= HN_RXINFO_CSUM;
7167 break;
7168
7169 case HN_NDIS_PKTINFO_TYPE_HASHVAL:
7170 if (__predict_false(dlen
7171 < HN_NDIS_HASH_VALUE_SIZE))
7172 return (EINVAL);
7173 info->hash_value = (const uint32_t *)data;
7174 mask |= HN_RXINFO_HASHVAL;
7175 break;
7176
7177 case HN_NDIS_PKTINFO_TYPE_HASHINF:
7178 if (__predict_false(dlen
7179 < HN_NDIS_HASH_INFO_SIZE))
7180 return (EINVAL);
7181 info->hash_info = (const uint32_t *)data;
7182 mask |= HN_RXINFO_HASHINF;
7183 break;
7184
7185 default:
7186 goto next;
7187 }
7188 }
7189
7190 if (mask == HN_RXINFO_ALL) {
7191 /* All found; done */
7192 break;
7193 }
7194 next:
7195 pi = (const struct rndis_pktinfo *)
7196 ((const uint8_t *)pi + pi->rm_size);
7197 }
7198
7199 /*
7200 * Final fixup.
7201 * - If there is no hash value, invalidate the hash info.
7202 */
7203 if ((mask & HN_RXINFO_HASHVAL) == 0)
7204 info->hash_info = NULL;
7205 return (0);
7206 }
7207
7208 static __inline bool
7209 hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
7210 {
7211
7212 if (off < check_off) {
7213 if (__predict_true(off + len <= check_off))
7214 return (false);
7215 } else if (off > check_off) {
7216 if (__predict_true(check_off + check_len <= off))
7217 return (false);
7218 }
7219 return (true);
7220 }
7221
7222 static __inline void
7223 hn_rsc_add_data(struct hn_rx_ring *rxr, const void *data,
7224 uint32_t len, struct hn_rxinfo *info)
7225 {
7226 uint32_t cnt = rxr->rsc.cnt;
7227
7228 if (cnt) {
7229 rxr->rsc.pktlen += len;
7230 } else {
7231 rxr->rsc.vlan_info = info->vlan_info;
7232 rxr->rsc.csum_info = info->csum_info;
7233 rxr->rsc.hash_info = info->hash_info;
7234 rxr->rsc.hash_value = info->hash_value;
7235 rxr->rsc.pktlen = len;
7236 }
7237
7238 rxr->rsc.frag_data[cnt] = data;
7239 rxr->rsc.frag_len[cnt] = len;
7240 rxr->rsc.cnt++;
7241 }
7242
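/*
 * Validate an RNDIS data message (length, data/OOB/pktinfo offsets
 * and overlap), extract its per-packet-info, collect RSC fragments,
 * and hand the completed packet to hn_rxpkt().
 */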
7243 static void
7244 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
7245 {
7246 const struct rndis_packet_msg *pkt;
7247 struct hn_rxinfo info;
7248 int data_off, pktinfo_off, data_len, pktinfo_len;
7249 	bool rsc_more = false;
7250
7251 /*
7252 * Check length.
7253 */
7254 if (__predict_false(dlen < sizeof(*pkt))) {
7255 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
7256 return;
7257 }
7258 pkt = data;
7259
7260 if (__predict_false(dlen < pkt->rm_len)) {
7261 if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
7262 "dlen %d, msglen %u\n", dlen, pkt->rm_len);
7263 return;
7264 }
7265 if (__predict_false(pkt->rm_len <
7266 pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
7267 if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
7268 "msglen %u, data %u, oob %u, pktinfo %u\n",
7269 pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
7270 pkt->rm_pktinfolen);
7271 return;
7272 }
7273 if (__predict_false(pkt->rm_datalen == 0)) {
7274 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
7275 return;
7276 }
7277
7278 /*
7279 	 * Check offsets.
7280 */
7281 #define IS_OFFSET_INVALID(ofs) \
7282 ((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN || \
7283 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))
7284
7285 /* XXX Hyper-V does not meet data offset alignment requirement */
7286 if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
7287 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7288 "data offset %u\n", pkt->rm_dataoffset);
7289 return;
7290 }
7291 if (__predict_false(pkt->rm_oobdataoffset > 0 &&
7292 IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
7293 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7294 "oob offset %u\n", pkt->rm_oobdataoffset);
7295 return;
7296 }
7297 if (__predict_true(pkt->rm_pktinfooffset > 0) &&
7298 __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
7299 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7300 "pktinfo offset %u\n", pkt->rm_pktinfooffset);
7301 return;
7302 }
7303
7304 #undef IS_OFFSET_INVALID
7305
7306 data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
7307 data_len = pkt->rm_datalen;
7308 pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
7309 pktinfo_len = pkt->rm_pktinfolen;
7310
7311 /*
7312 * Check OOB coverage.
7313 */
7314 if (__predict_false(pkt->rm_oobdatalen != 0)) {
7315 int oob_off, oob_len;
7316
7317 if_printf(rxr->hn_ifp, "got oobdata\n");
7318 oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
7319 oob_len = pkt->rm_oobdatalen;
7320
7321 if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
7322 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7323 "oob overflow, msglen %u, oob abs %d len %d\n",
7324 pkt->rm_len, oob_off, oob_len);
7325 return;
7326 }
7327
7328 /*
7329 * Check against data.
7330 */
7331 if (hn_rndis_check_overlap(oob_off, oob_len,
7332 data_off, data_len)) {
7333 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7334 "oob overlaps data, oob abs %d len %d, "
7335 "data abs %d len %d\n",
7336 oob_off, oob_len, data_off, data_len);
7337 return;
7338 }
7339
7340 /*
7341 * Check against pktinfo.
7342 */
7343 if (pktinfo_len != 0 &&
7344 hn_rndis_check_overlap(oob_off, oob_len,
7345 pktinfo_off, pktinfo_len)) {
7346 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7347 "oob overlaps pktinfo, oob abs %d len %d, "
7348 "pktinfo abs %d len %d\n",
7349 oob_off, oob_len, pktinfo_off, pktinfo_len);
7350 return;
7351 }
7352 }
7353
7354 /*
7355 * Check per-packet-info coverage and find useful per-packet-info.
7356 */
7357 info.vlan_info = NULL;
7358 info.csum_info = NULL;
7359 info.hash_info = NULL;
7360 info.pktinfo_id = NULL;
7361
7362 if (__predict_true(pktinfo_len != 0)) {
7363 bool overlap;
7364 int error;
7365
7366 if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
7367 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7368 "pktinfo overflow, msglen %u, "
7369 "pktinfo abs %d len %d\n",
7370 pkt->rm_len, pktinfo_off, pktinfo_len);
7371 return;
7372 }
7373
7374 /*
7375 * Check packet info coverage.
7376 */
7377 overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
7378 data_off, data_len);
7379 if (__predict_false(overlap)) {
7380 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7381 "pktinfo overlap data, pktinfo abs %d len %d, "
7382 "data abs %d len %d\n",
7383 pktinfo_off, pktinfo_len, data_off, data_len);
7384 return;
7385 }
7386
7387 /*
7388 * Find useful per-packet-info.
7389 */
7390 error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
7391 pktinfo_len, &info);
7392 if (__predict_false(error)) {
7393 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
7394 "pktinfo\n");
7395 return;
7396 }
7397 }
7398
7399 if (__predict_false(data_off + data_len > pkt->rm_len)) {
7400 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7401 "data overflow, msglen %u, data abs %d len %d\n",
7402 pkt->rm_len, data_off, data_len);
7403 return;
7404 }
7405
7406 /* Identify RSC fragments, drop invalid packets */
7407 if ((info.pktinfo_id != NULL) &&
7408 (info.pktinfo_id->flag & HN_NDIS_PKTINFO_SUBALLOC)) {
7409 if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_1ST_FRAG) {
7410 rxr->rsc.cnt = 0;
7411 rxr->hn_rsc_pkts++;
7412 } else if (rxr->rsc.cnt == 0)
7413 goto drop;
7414
7415 rsc_more = true;
7416
7417 if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_LAST_FRAG)
7418 rsc_more = false;
7419
7420 if (rsc_more && rxr->rsc.is_last)
7421 goto drop;
7422 } else {
7423 rxr->rsc.cnt = 0;
7424 }
7425
7426 if (__predict_false(rxr->rsc.cnt >= HN_NVS_RSC_MAX))
7427 goto drop;
7428
7429 /* Store data in per rx ring structure */
7430 hn_rsc_add_data(rxr,((const uint8_t *)pkt) + data_off,
7431 data_len, &info);
7432
7433 if (rsc_more)
7434 return;
7435
7436 hn_rxpkt(rxr);
7437 rxr->rsc.cnt = 0;
7438 return;
7439 drop:
7440 rxr->hn_rsc_drop++;
7441 return;
7442 }
7443
7444 static __inline void
7445 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
7446 {
7447 const struct rndis_msghdr *hdr;
7448
7449 if (__predict_false(dlen < sizeof(*hdr))) {
7450 if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
7451 return;
7452 }
7453 hdr = data;
7454
7455 if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
7456 /* Hot data path. */
7457 hn_rndis_rx_data(rxr, data, dlen);
7458 /* Done! */
7459 return;
7460 }
7461
7462 if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
7463 hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
7464 else
7465 hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
7466 }
7467
7468 static void
7469 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
7470 {
7471 const struct hn_nvs_hdr *hdr;
7472
7473 if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
7474 if_printf(sc->hn_ifp, "invalid nvs notify\n");
7475 return;
7476 }
7477 hdr = VMBUS_CHANPKT_CONST_DATA(pkt);
7478
7479 if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
7480 /* Useless; ignore */
7481 return;
7482 }
7483 if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
7484 }
7485
7486 static void
7487 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
7488 const struct vmbus_chanpkt_hdr *pkt)
7489 {
7490 struct hn_nvs_sendctx *sndc;
7491
7492 sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
7493 sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
7494 VMBUS_CHANPKT_DATALEN(pkt));
7495 /*
7496 * NOTE:
7497 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
7498 * its callback.
7499 */
7500 }
7501
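/*
 * Handle an RXBUF channel packet: each of its ranges refers to one
 * RNDIS message in the shared RX buffer.  Process every range, then
 * ack the RXBUF so the hypervisor can recycle it.
 */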
7502 static void
7503 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
7504 const struct vmbus_chanpkt_hdr *pkthdr)
7505 {
7506 struct epoch_tracker et;
7507 const struct vmbus_chanpkt_rxbuf *pkt;
7508 const struct hn_nvs_hdr *nvs_hdr;
7509 int count, i, hlen;
7510
7511 if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
7512 if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
7513 return;
7514 }
7515 nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);
7516
7517 /* Make sure that this is a RNDIS message. */
7518 if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
7519 if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
7520 nvs_hdr->nvs_type);
7521 return;
7522 }
7523
7524 hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
7525 if (__predict_false(hlen < sizeof(*pkt))) {
7526 if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
7527 return;
7528 }
7529 pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;
7530
7531 if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
7532 if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
7533 pkt->cp_rxbuf_id);
7534 return;
7535 }
7536
7537 count = pkt->cp_rxbuf_cnt;
7538 if (__predict_false(hlen <
7539 __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
7540 if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
7541 return;
7542 }
7543
7544 NET_EPOCH_ENTER(et);
7545 /* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
7546 for (i = 0; i < count; ++i) {
7547 int ofs, len;
7548
7549 ofs = pkt->cp_rxbuf[i].rb_ofs;
7550 len = pkt->cp_rxbuf[i].rb_len;
7551 if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
7552 if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
7553 "ofs %d, len %d\n", i, ofs, len);
7554 continue;
7555 }
7556
7557 rxr->rsc.is_last = (i == (count - 1));
7558 hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
7559 }
7560 NET_EPOCH_EXIT(et);
7561
7562 /*
7563 * Ack the consumed RXBUF associated w/ this channel packet,
7564 * so that this RXBUF can be recycled by the hypervisor.
7565 */
7566 hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
7567 }
7568
7569 static void
7570 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
7571 uint64_t tid)
7572 {
7573 struct hn_nvs_rndis_ack ack;
7574 int retries, error;
7575
7576 ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
7577 ack.nvs_status = HN_NVS_STATUS_OK;
7578
7579 retries = 0;
7580 again:
7581 error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
7582 VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
7583 if (__predict_false(error == EAGAIN)) {
7584 /*
7585 * NOTE:
7586 * This should _not_ happen in real world, since the
7587 * consumption of the TX bufring from the TX path is
7588 * controlled.
7589 */
7590 if (rxr->hn_ack_failed == 0)
7591 if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
7592 rxr->hn_ack_failed++;
7593 retries++;
7594 if (retries < 10) {
7595 DELAY(100);
7596 goto again;
7597 }
7598 /* RXBUF leaks! */
7599 if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
7600 }
7601 }
7602
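/*
 * Per-channel callback: receive channel packets in a loop, growing
 * the packet buffer on ENOBUFS, and dispatch completion, RXBUF and
 * inband packets to their handlers.
 */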
7603 static void
7604 hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
7605 {
7606 struct hn_rx_ring *rxr = xrxr;
7607 struct hn_softc *sc = rxr->hn_ifp->if_softc;
7608
7609 for (;;) {
7610 struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
7611 int error, pktlen;
7612
7613 pktlen = rxr->hn_pktbuf_len;
7614 error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
7615 if (__predict_false(error == ENOBUFS)) {
7616 void *nbuf;
7617 int nlen;
7618
7619 /*
7620 * Expand channel packet buffer.
7621 *
7622 * XXX
7623 * Use M_WAITOK here, since allocation failure
7624 * is fatal.
7625 */
7626 nlen = rxr->hn_pktbuf_len * 2;
7627 while (nlen < pktlen)
7628 nlen *= 2;
7629 nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);
7630
7631 if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
7632 rxr->hn_pktbuf_len, nlen);
7633
7634 free(rxr->hn_pktbuf, M_DEVBUF);
7635 rxr->hn_pktbuf = nbuf;
7636 rxr->hn_pktbuf_len = nlen;
7637 /* Retry! */
7638 continue;
7639 } else if (__predict_false(error == EAGAIN)) {
7640 /* No more channel packets; done! */
7641 break;
7642 }
7643 KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));
7644
7645 switch (pkt->cph_type) {
7646 case VMBUS_CHANPKT_TYPE_COMP:
7647 hn_nvs_handle_comp(sc, chan, pkt);
7648 break;
7649
7650 case VMBUS_CHANPKT_TYPE_RXBUF:
7651 hn_nvs_handle_rxbuf(rxr, chan, pkt);
7652 break;
7653
7654 case VMBUS_CHANPKT_TYPE_INBAND:
7655 hn_nvs_handle_notify(sc, pkt);
7656 break;
7657
7658 default:
7659 if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
7660 pkt->cph_type);
7661 break;
7662 }
7663 }
7664 hn_chan_rollup(rxr, rxr->hn_txr);
7665 }
7666
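/*
 * Module-wide initialization: allocate the UDP checksum fixup
 * counter, sanitize tunables, set up the VF map, and create the
 * global TX taskqueues when that mode is selected on a Hyper-V
 * guest.
 */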
7667 static void
7668 hn_sysinit(void *arg __unused)
7669 {
7670 int i;
7671
7672 hn_udpcs_fixup = counter_u64_alloc(M_WAITOK);
7673
7674 #ifdef HN_IFSTART_SUPPORT
7675 /*
7676 * Don't use ifnet.if_start if transparent VF mode is requested;
7677 * mainly due to the IFF_DRV_OACTIVE flag.
7678 */
7679 if (hn_xpnt_vf && hn_use_if_start) {
7680 hn_use_if_start = 0;
7681 		printf("hn: transparent VF mode, if_transmit will be used, "
7682 "instead of if_start\n");
7683 }
7684 #endif
7685 if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) {
7686 printf("hn: invalid transparent VF attach routing "
7687 "wait timeout %d, reset to %d\n",
7688 hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN);
7689 hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
7690 }
7691
7692 /*
7693 * Initialize VF map.
7694 */
7695 rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE);
7696 hn_vfmap_size = HN_VFMAP_SIZE_DEF;
7697 hn_vfmap = malloc(sizeof(struct ifnet *) * hn_vfmap_size, M_DEVBUF,
7698 M_WAITOK | M_ZERO);
7699
7700 /*
7701 * Fix the # of TX taskqueues.
7702 */
7703 if (hn_tx_taskq_cnt <= 0)
7704 hn_tx_taskq_cnt = 1;
7705 else if (hn_tx_taskq_cnt > mp_ncpus)
7706 hn_tx_taskq_cnt = mp_ncpus;
7707
7708 /*
7709 * Fix the TX taskqueue mode.
7710 */
7711 switch (hn_tx_taskq_mode) {
7712 case HN_TX_TASKQ_M_INDEP:
7713 case HN_TX_TASKQ_M_GLOBAL:
7714 case HN_TX_TASKQ_M_EVTTQ:
7715 break;
7716 default:
7717 hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
7718 break;
7719 }
7720
7721 if (vm_guest != VM_GUEST_HV)
7722 return;
7723
7724 if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL)
7725 return;
7726
7727 hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
7728 M_DEVBUF, M_WAITOK);
7729 for (i = 0; i < hn_tx_taskq_cnt; ++i) {
7730 hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK,
7731 taskqueue_thread_enqueue, &hn_tx_taskque[i]);
7732 taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET,
7733 "hn tx%d", i);
7734 }
7735 }
7736 SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL);
7737
7738 static void
7739 hn_sysuninit(void *arg __unused)
7740 {
7741
7742 if (hn_tx_taskque != NULL) {
7743 int i;
7744
7745 for (i = 0; i < hn_tx_taskq_cnt; ++i)
7746 taskqueue_free(hn_tx_taskque[i]);
7747 free(hn_tx_taskque, M_DEVBUF);
7748 }
7749
7750 if (hn_vfmap != NULL)
7751 free(hn_vfmap, M_DEVBUF);
7752 rm_destroy(&hn_vfmap_lock);
7753
7754 counter_u64_free(hn_udpcs_fixup);
7755 }
7756 SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL);