FreeBSD/Linux Kernel Cross Reference
sys/dev/hyperv/netvsc/if_hn.c


    1 /*-
    2  * Copyright (c) 2010-2012 Citrix Inc.
    3  * Copyright (c) 2009-2012,2016-2017 Microsoft Corp.
    4  * Copyright (c) 2012 NetApp Inc.
    5  * All rights reserved.
    6  *
    7  * Redistribution and use in source and binary forms, with or without
    8  * modification, are permitted provided that the following conditions
    9  * are met:
   10  * 1. Redistributions of source code must retain the above copyright
   11  *    notice unmodified, this list of conditions, and the following
   12  *    disclaimer.
   13  * 2. Redistributions in binary form must reproduce the above copyright
   14  *    notice, this list of conditions and the following disclaimer in the
   15  *    documentation and/or other materials provided with the distribution.
   16  *
   17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
   18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
   20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
   21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
   22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
   26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
   27  */
   28 
   29 /*-
   30  * Copyright (c) 2004-2006 Kip Macy
   31  * All rights reserved.
   32  *
   33  * Redistribution and use in source and binary forms, with or without
   34  * modification, are permitted provided that the following conditions
   35  * are met:
   36  * 1. Redistributions of source code must retain the above copyright
   37  *    notice, this list of conditions and the following disclaimer.
   38  * 2. Redistributions in binary form must reproduce the above copyright
   39  *    notice, this list of conditions and the following disclaimer in the
   40  *    documentation and/or other materials provided with the distribution.
   41  *
   42  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
   43  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   44  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   45  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
   46  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   47  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   48  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   49  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   50  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   51  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   52  * SUCH DAMAGE.
   53  */
   54 
   55 #include <sys/cdefs.h>
   56 __FBSDID("$FreeBSD$");
   57 
   58 #include "opt_hn.h"
   59 #include "opt_inet6.h"
   60 #include "opt_inet.h"
   61 #include "opt_rss.h"
   62 
   63 #include <sys/param.h>
   64 #include <sys/systm.h>
   65 #include <sys/bus.h>
   66 #include <sys/counter.h>
   67 #include <sys/kernel.h>
   68 #include <sys/limits.h>
   69 #include <sys/malloc.h>
   70 #include <sys/mbuf.h>
   71 #include <sys/module.h>
   72 #include <sys/queue.h>
   73 #include <sys/lock.h>
   74 #include <sys/proc.h>
   75 #include <sys/rmlock.h>
   76 #include <sys/sbuf.h>
   77 #include <sys/sched.h>
   78 #include <sys/smp.h>
   79 #include <sys/socket.h>
   80 #include <sys/sockio.h>
   81 #include <sys/sx.h>
   82 #include <sys/sysctl.h>
   83 #include <sys/taskqueue.h>
   84 #include <sys/buf_ring.h>
   85 #include <sys/eventhandler.h>
   86 #include <sys/epoch.h>
   87 
   88 #include <machine/atomic.h>
   89 #include <machine/in_cksum.h>
   90 
   91 #include <net/bpf.h>
   92 #include <net/ethernet.h>
   93 #include <net/if.h>
   94 #include <net/if_dl.h>
   95 #include <net/if_media.h>
   96 #include <net/if_types.h>
   97 #include <net/if_var.h>
   98 #include <net/rndis.h>
   99 #ifdef RSS
  100 #include <net/rss_config.h>
  101 #endif
  102 
  103 #include <netinet/in_systm.h>
  104 #include <netinet/in.h>
  105 #include <netinet/ip.h>
  106 #include <netinet/ip6.h>
  107 #include <netinet/tcp.h>
  108 #include <netinet/tcp_lro.h>
  109 #include <netinet/udp.h>
  110 
  111 #include <dev/hyperv/include/hyperv.h>
  112 #include <dev/hyperv/include/hyperv_busdma.h>
  113 #include <dev/hyperv/include/vmbus.h>
  114 #include <dev/hyperv/include/vmbus_xact.h>
  115 
  116 #include <dev/hyperv/netvsc/ndis.h>
  117 #include <dev/hyperv/netvsc/if_hnreg.h>
  118 #include <dev/hyperv/netvsc/if_hnvar.h>
  119 #include <dev/hyperv/netvsc/hn_nvs.h>
  120 #include <dev/hyperv/netvsc/hn_rndis.h>
  121 
  122 #include "vmbus_if.h"
  123 
  124 #define HN_IFSTART_SUPPORT
  125 
  126 #define HN_RING_CNT_DEF_MAX             8
  127 
  128 #define HN_VFMAP_SIZE_DEF               8
  129 
  130 #define HN_XPNT_VF_ATTWAIT_MIN          2       /* seconds */
  131 
  132 /* YYY should get it from the underlying channel */
  133 #define HN_TX_DESC_CNT                  512
  134 
  135 #define HN_RNDIS_PKT_LEN                                        \
  136         (sizeof(struct rndis_packet_msg) +                      \
  137          HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +       \
  138          HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +           \
  139          HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +           \
  140          HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
  141 #define HN_RNDIS_PKT_BOUNDARY           PAGE_SIZE
  142 #define HN_RNDIS_PKT_ALIGN              CACHE_LINE_SIZE
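
/*
 * NOTE: HN_RNDIS_PKT_LEN above is the worst-case size of the per-packet
 * RNDIS header: the fixed rndis_packet_msg plus one per-packet-info
 * record each for the hash value, the VLAN tag, LSO2, and TX checksum
 * metadata.  Reserving the worst case up front lets a txdesc prepend
 * any combination of them without reallocating.
 */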
  143 
  144 #define HN_TX_DATA_BOUNDARY             PAGE_SIZE
  145 #define HN_TX_DATA_MAXSIZE              IP_MAXPACKET
  146 #define HN_TX_DATA_SEGSIZE              PAGE_SIZE
  147 /* -1 for RNDIS packet message */
  148 #define HN_TX_DATA_SEGCNT_MAX           (HN_GPACNT_MAX - 1)
  149 
  150 #define HN_DIRECT_TX_SIZE_DEF           128
  151 
  152 #define HN_EARLY_TXEOF_THRESH           8
  153 
  154 #define HN_PKTBUF_LEN_DEF               (16 * 1024)
  155 
  156 #define HN_LROENT_CNT_DEF               128
  157 
  158 #define HN_LRO_LENLIM_MULTIRX_DEF       (12 * ETHERMTU)
  159 #define HN_LRO_LENLIM_DEF               (25 * ETHERMTU)
  160 /* YYY 2*MTU is a bit rough, but should be good enough. */
  161 #define HN_LRO_LENLIM_MIN(ifp)          (2 * (ifp)->if_mtu)
  162 
  163 #define HN_LRO_ACKCNT_DEF               1
  164 
  165 #define HN_LOCK_INIT(sc)                \
  166         sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
  167 #define HN_LOCK_DESTROY(sc)             sx_destroy(&(sc)->hn_lock)
  168 #define HN_LOCK_ASSERT(sc)              sx_assert(&(sc)->hn_lock, SA_XLOCKED)
  169 #define HN_LOCK(sc)                                     \
  170 do {                                                    \
  171         while (sx_try_xlock(&(sc)->hn_lock) == 0) {     \
  172                 /* Relinquish cpu to avoid deadlock */  \
  173                 sched_relinquish(curthread);            \
  174                 DELAY(1000);                            \
  175         }                                               \
  176 } while (0)
  177 #define HN_UNLOCK(sc)                   sx_xunlock(&(sc)->hn_lock)
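
/*
 * NOTE: HN_LOCK() spins on sx_try_xlock() instead of blocking in
 * sx_xlock().  A plausible reading (not stated in the original code):
 * the current holder may be running on a thread bound to this CPU, so
 * yielding through sched_relinquish() plus a short DELAY() gives it a
 * chance to finish and drop the lock.
 */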
  178 
  179 #define HN_CSUM_IP_MASK                 (CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
  180 #define HN_CSUM_IP6_MASK                (CSUM_IP6_TCP | CSUM_IP6_UDP)
  181 #define HN_CSUM_IP_HWASSIST(sc)         \
  182         ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
  183 #define HN_CSUM_IP6_HWASSIST(sc)        \
  184         ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)
  185 
  186 #define HN_PKTSIZE_MIN(align)           \
  187         roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
  188             HN_RNDIS_PKT_LEN, (align))
  189 #define HN_PKTSIZE(m, align)            \
  190         roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))
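
/*
 * Example of the sizing above, assuming CACHE_LINE_SIZE is 64 and
 * HN_RNDIS_PKT_LEN works out to, say, 200 bytes: a 1514-byte frame
 * needs HN_PKTSIZE() = roundup2(1514 + 200, 64) = 1728 bytes.
 */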
  191 
  192 #ifdef RSS
  193 #define HN_RING_IDX2CPU(sc, idx)        rss_getcpu((idx) % rss_getnumbuckets())
  194 #else
  195 #define HN_RING_IDX2CPU(sc, idx)        (((sc)->hn_cpu + (idx)) % mp_ncpus)
  196 #endif
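
/*
 * E.g. without RSS, hn_cpu == 2 and mp_ncpus == 4 map ring 3 to
 * CPU (2 + 3) % 4 == 1: rings are spread round-robin starting at the
 * per-device base CPU.
 */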
  197 
  198 struct hn_txdesc {
  199 #ifndef HN_USE_TXDESC_BUFRING
  200         SLIST_ENTRY(hn_txdesc)          link;
  201 #endif
  202         STAILQ_ENTRY(hn_txdesc)         agg_link;
  203 
  204         /* Aggregated txdescs, in sending order. */
  205         STAILQ_HEAD(, hn_txdesc)        agg_list;
  206 
  207         /* The oldest packet, if transmission aggregation happens. */
  208         struct mbuf                     *m;
  209         struct hn_tx_ring               *txr;
  210         int                             refs;
  211         uint32_t                        flags;  /* HN_TXD_FLAG_ */
  212         struct hn_nvs_sendctx           send_ctx;
  213         uint32_t                        chim_index;
  214         int                             chim_size;
  215 
  216         bus_dmamap_t                    data_dmap;
  217 
  218         bus_addr_t                      rndis_pkt_paddr;
  219         struct rndis_packet_msg         *rndis_pkt;
  220         bus_dmamap_t                    rndis_pkt_dmap;
  221 };
  222 
  223 #define HN_TXD_FLAG_ONLIST              0x0001
  224 #define HN_TXD_FLAG_DMAMAP              0x0002
  225 #define HN_TXD_FLAG_ONAGG               0x0004
  226 
  227 #define HN_NDIS_PKTINFO_SUBALLOC        0x01
  228 #define HN_NDIS_PKTINFO_1ST_FRAG        0x02
  229 #define HN_NDIS_PKTINFO_LAST_FRAG       0x04
  230 
  231 struct packet_info_id {
  232         uint8_t                         ver;
  233         uint8_t                         flag;
  234         uint16_t                        pkt_id;
  235 };
  236 
  237 #define NDIS_PKTINFOID_SZ               sizeof(struct packet_info_id)
  238 
  239 
  240 struct hn_rxinfo {
  241         const uint32_t                  *vlan_info;
  242         const uint32_t                  *csum_info;
  243         const uint32_t                  *hash_info;
  244         const uint32_t                  *hash_value;
  245         const struct packet_info_id     *pktinfo_id;
  246 };
  247 
  248 struct hn_rxvf_setarg {
  249         struct hn_rx_ring       *rxr;
  250         struct ifnet            *vf_ifp;
  251 };
  252 
  253 #define HN_RXINFO_VLAN                  0x0001
  254 #define HN_RXINFO_CSUM                  0x0002
  255 #define HN_RXINFO_HASHINF               0x0004
  256 #define HN_RXINFO_HASHVAL               0x0008
  257 #define HN_RXINFO_PKTINFO_ID            0x0010
  258 #define HN_RXINFO_ALL                   \
  259         (HN_RXINFO_VLAN |               \
  260          HN_RXINFO_CSUM |               \
  261          HN_RXINFO_HASHINF |            \
  262          HN_RXINFO_HASHVAL |            \
  263          HN_RXINFO_PKTINFO_ID)
  264 
  265 static int                      hn_probe(device_t);
  266 static int                      hn_attach(device_t);
  267 static int                      hn_detach(device_t);
  268 static int                      hn_shutdown(device_t);
  269 static void                     hn_chan_callback(struct vmbus_channel *,
  270                                     void *);
  271 
  272 static void                     hn_init(void *);
  273 static int                      hn_ioctl(struct ifnet *, u_long, caddr_t);
  274 #ifdef HN_IFSTART_SUPPORT
  275 static void                     hn_start(struct ifnet *);
  276 #endif
  277 static int                      hn_transmit(struct ifnet *, struct mbuf *);
  278 static void                     hn_xmit_qflush(struct ifnet *);
  279 static int                      hn_ifmedia_upd(struct ifnet *);
  280 static void                     hn_ifmedia_sts(struct ifnet *,
  281                                     struct ifmediareq *);
  282 
  283 static void                     hn_ifnet_event(void *, struct ifnet *, int);
  284 static void                     hn_ifaddr_event(void *, struct ifnet *);
  285 static void                     hn_ifnet_attevent(void *, struct ifnet *);
  286 static void                     hn_ifnet_detevent(void *, struct ifnet *);
  287 static void                     hn_ifnet_lnkevent(void *, struct ifnet *, int);
  288 
  289 static bool                     hn_ismyvf(const struct hn_softc *,
  290                                     const struct ifnet *);
  291 static void                     hn_rxvf_change(struct hn_softc *,
  292                                     struct ifnet *, bool);
  293 static void                     hn_rxvf_set(struct hn_softc *, struct ifnet *);
  294 static void                     hn_rxvf_set_task(void *, int);
  295 static void                     hn_xpnt_vf_input(struct ifnet *, struct mbuf *);
  296 static int                      hn_xpnt_vf_iocsetflags(struct hn_softc *);
  297 static int                      hn_xpnt_vf_iocsetcaps(struct hn_softc *,
  298                                     struct ifreq *);
  299 static void                     hn_xpnt_vf_saveifflags(struct hn_softc *);
  300 static bool                     hn_xpnt_vf_isready(struct hn_softc *);
  301 static void                     hn_xpnt_vf_setready(struct hn_softc *);
  302 static void                     hn_xpnt_vf_init_taskfunc(void *, int);
  303 static void                     hn_xpnt_vf_init(struct hn_softc *);
  304 static void                     hn_xpnt_vf_setenable(struct hn_softc *);
  305 static void                     hn_xpnt_vf_setdisable(struct hn_softc *, bool);
  306 static void                     hn_vf_rss_fixup(struct hn_softc *, bool);
  307 static void                     hn_vf_rss_restore(struct hn_softc *);
  308 
  309 static int                      hn_rndis_rxinfo(const void *, int,
  310                                     struct hn_rxinfo *);
  311 static void                     hn_rndis_rx_data(struct hn_rx_ring *,
  312                                     const void *, int);
  313 static void                     hn_rndis_rx_status(struct hn_softc *,
  314                                     const void *, int);
  315 static void                     hn_rndis_init_fixat(struct hn_softc *, int);
  316 
  317 static void                     hn_nvs_handle_notify(struct hn_softc *,
  318                                     const struct vmbus_chanpkt_hdr *);
  319 static void                     hn_nvs_handle_comp(struct hn_softc *,
  320                                     struct vmbus_channel *,
  321                                     const struct vmbus_chanpkt_hdr *);
  322 static void                     hn_nvs_handle_rxbuf(struct hn_rx_ring *,
  323                                     struct vmbus_channel *,
  324                                     const struct vmbus_chanpkt_hdr *);
  325 static void                     hn_nvs_ack_rxbuf(struct hn_rx_ring *,
  326                                     struct vmbus_channel *, uint64_t);
  327 
  328 #if __FreeBSD_version >= 1100099
  329 static int                      hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
  330 static int                      hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
  331 #endif
  332 static int                      hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
  333 static int                      hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
  334 #if __FreeBSD_version < 1100095
  335 static int                      hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
  336 #else
  337 static int                      hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
  338 #endif
  339 static int                      hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
  340 static int                      hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
  341 static int                      hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
  342 static int                      hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
  343 static int                      hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
  344 static int                      hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
  345 static int                      hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
  346 #ifndef RSS
  347 static int                      hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
  348 static int                      hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
  349 #endif
  350 static int                      hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
  351 static int                      hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS);
  352 static int                      hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS);
  353 static int                      hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
  354 static int                      hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
  355 static int                      hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
  356 static int                      hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
  357 static int                      hn_polling_sysctl(SYSCTL_HANDLER_ARGS);
  358 static int                      hn_vf_sysctl(SYSCTL_HANDLER_ARGS);
  359 static int                      hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS);
  360 static int                      hn_vflist_sysctl(SYSCTL_HANDLER_ARGS);
  361 static int                      hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS);
  362 static int                      hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS);
  363 static int                      hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS);
  364 
  365 static void                     hn_stop(struct hn_softc *, bool);
  366 static void                     hn_init_locked(struct hn_softc *);
  367 static int                      hn_chan_attach(struct hn_softc *,
  368                                     struct vmbus_channel *);
  369 static void                     hn_chan_detach(struct hn_softc *,
  370                                     struct vmbus_channel *);
  371 static int                      hn_attach_subchans(struct hn_softc *);
  372 static void                     hn_detach_allchans(struct hn_softc *);
  373 static void                     hn_chan_rollup(struct hn_rx_ring *,
  374                                     struct hn_tx_ring *);
  375 static void                     hn_set_ring_inuse(struct hn_softc *, int);
  376 static int                      hn_synth_attach(struct hn_softc *, int);
  377 static void                     hn_synth_detach(struct hn_softc *);
  378 static int                      hn_synth_alloc_subchans(struct hn_softc *,
  379                                     int *);
  380 static bool                     hn_synth_attachable(const struct hn_softc *);
  381 static void                     hn_suspend(struct hn_softc *);
  382 static void                     hn_suspend_data(struct hn_softc *);
  383 static void                     hn_suspend_mgmt(struct hn_softc *);
  384 static void                     hn_resume(struct hn_softc *);
  385 static void                     hn_resume_data(struct hn_softc *);
  386 static void                     hn_resume_mgmt(struct hn_softc *);
  387 static void                     hn_suspend_mgmt_taskfunc(void *, int);
  388 static void                     hn_chan_drain(struct hn_softc *,
  389                                     struct vmbus_channel *);
  390 static void                     hn_disable_rx(struct hn_softc *);
  391 static void                     hn_drain_rxtx(struct hn_softc *, int);
  392 static void                     hn_polling(struct hn_softc *, u_int);
  393 static void                     hn_chan_polling(struct vmbus_channel *, u_int);
  394 static void                     hn_mtu_change_fixup(struct hn_softc *);
  395 
  396 static void                     hn_update_link_status(struct hn_softc *);
  397 static void                     hn_change_network(struct hn_softc *);
  398 static void                     hn_link_taskfunc(void *, int);
  399 static void                     hn_netchg_init_taskfunc(void *, int);
  400 static void                     hn_netchg_status_taskfunc(void *, int);
  401 static void                     hn_link_status(struct hn_softc *);
  402 
  403 static int                      hn_create_rx_data(struct hn_softc *, int);
  404 static void                     hn_destroy_rx_data(struct hn_softc *);
  405 static int                      hn_check_iplen(const struct mbuf *, int);
  406 static void                     hn_rxpkt_proto(const struct mbuf *, int *, int *);
  407 static int                      hn_set_rxfilter(struct hn_softc *, uint32_t);
  408 static int                      hn_rxfilter_config(struct hn_softc *);
  409 static int                      hn_rss_reconfig(struct hn_softc *);
  410 static void                     hn_rss_ind_fixup(struct hn_softc *);
  411 static void                     hn_rss_mbuf_hash(struct hn_softc *, uint32_t);
  412 static int                      hn_rxpkt(struct hn_rx_ring *);
  413 static uint32_t                 hn_rss_type_fromndis(uint32_t);
  414 static uint32_t                 hn_rss_type_tondis(uint32_t);
  415 
  416 static int                      hn_tx_ring_create(struct hn_softc *, int);
  417 static void                     hn_tx_ring_destroy(struct hn_tx_ring *);
  418 static int                      hn_create_tx_data(struct hn_softc *, int);
  419 static void                     hn_fixup_tx_data(struct hn_softc *);
  420 static void                     hn_fixup_rx_data(struct hn_softc *);
  421 static void                     hn_destroy_tx_data(struct hn_softc *);
  422 static void                     hn_txdesc_dmamap_destroy(struct hn_txdesc *);
  423 static void                     hn_txdesc_gc(struct hn_tx_ring *,
  424                                     struct hn_txdesc *);
  425 static int                      hn_encap(struct ifnet *, struct hn_tx_ring *,
  426                                     struct hn_txdesc *, struct mbuf **);
  427 static int                      hn_txpkt(struct ifnet *, struct hn_tx_ring *,
  428                                     struct hn_txdesc *);
  429 static void                     hn_set_chim_size(struct hn_softc *, int);
  430 static void                     hn_set_tso_maxsize(struct hn_softc *, int, int);
  431 static bool                     hn_tx_ring_pending(struct hn_tx_ring *);
  432 static void                     hn_tx_ring_qflush(struct hn_tx_ring *);
  433 static void                     hn_resume_tx(struct hn_softc *, int);
  434 static void                     hn_set_txagg(struct hn_softc *);
  435 static void                     *hn_try_txagg(struct ifnet *,
  436                                     struct hn_tx_ring *, struct hn_txdesc *,
  437                                     int);
  438 static int                      hn_get_txswq_depth(const struct hn_tx_ring *);
  439 static void                     hn_txpkt_done(struct hn_nvs_sendctx *,
  440                                     struct hn_softc *, struct vmbus_channel *,
  441                                     const void *, int);
  442 static int                      hn_txpkt_sglist(struct hn_tx_ring *,
  443                                     struct hn_txdesc *);
  444 static int                      hn_txpkt_chim(struct hn_tx_ring *,
  445                                     struct hn_txdesc *);
  446 static int                      hn_xmit(struct hn_tx_ring *, int);
  447 static void                     hn_xmit_taskfunc(void *, int);
  448 static void                     hn_xmit_txeof(struct hn_tx_ring *);
  449 static void                     hn_xmit_txeof_taskfunc(void *, int);
  450 #ifdef HN_IFSTART_SUPPORT
  451 static int                      hn_start_locked(struct hn_tx_ring *, int);
  452 static void                     hn_start_taskfunc(void *, int);
  453 static void                     hn_start_txeof(struct hn_tx_ring *);
  454 static void                     hn_start_txeof_taskfunc(void *, int);
  455 #endif
  456 
  457 static int                      hn_rsc_sysctl(SYSCTL_HANDLER_ARGS);
  458 
  459 SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
  460     "Hyper-V network interface");
  461 
  462 /* Trust tcp segment verification on host side. */
  463 static int                      hn_trust_hosttcp = 1;
  464 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
  465     &hn_trust_hosttcp, 0,
  466     "Trust tcp segment verification on host side, "
  467     "when csum info is missing (global setting)");
  468 
   469 /* Trust udp datagram verification on host side. */
  470 static int                      hn_trust_hostudp = 1;
  471 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
  472     &hn_trust_hostudp, 0,
  473     "Trust udp datagram verification on host side, "
  474     "when csum info is missing (global setting)");
  475 
   476 /* Trust ip packet verification on host side. */
  477 static int                      hn_trust_hostip = 1;
  478 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
  479     &hn_trust_hostip, 0,
  480     "Trust ip packet verification on host side, "
  481     "when csum info is missing (global setting)");
  482 
  483 /*
  484  * Offload UDP/IPv4 checksum.
  485  */
  486 static int                      hn_enable_udp4cs = 1;
  487 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp4cs, CTLFLAG_RDTUN,
  488     &hn_enable_udp4cs, 0, "Offload UDP/IPv4 checksum");
  489 
  490 /*
  491  * Offload UDP/IPv6 checksum.
  492  */
  493 static int                      hn_enable_udp6cs = 1;
  494 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp6cs, CTLFLAG_RDTUN,
  495     &hn_enable_udp6cs, 0, "Offload UDP/IPv6 checksum");
  496 
  497 /* Stats. */
  498 static counter_u64_t            hn_udpcs_fixup;
  499 SYSCTL_COUNTER_U64(_hw_hn, OID_AUTO, udpcs_fixup, CTLFLAG_RW,
   500     &hn_udpcs_fixup, "# of UDP checksum fixups");
  501 
  502 /*
  503  * See hn_set_hlen().
  504  *
  505  * This value is for Azure.  For Hyper-V, set this above
  506  * 65536 to disable UDP datagram checksum fixup.
  507  */
  508 static int                      hn_udpcs_fixup_mtu = 1420;
  509 SYSCTL_INT(_hw_hn, OID_AUTO, udpcs_fixup_mtu, CTLFLAG_RWTUN,
  510     &hn_udpcs_fixup_mtu, 0, "UDP checksum fixup MTU threshold");
  511 
  512 /* Limit TSO burst size */
  513 static int                      hn_tso_maxlen = IP_MAXPACKET;
  514 SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
  515     &hn_tso_maxlen, 0, "TSO burst limit");
  516 
  517 /* Limit chimney send size */
  518 static int                      hn_tx_chimney_size = 0;
  519 SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
  520     &hn_tx_chimney_size, 0, "Chimney send packet size limit");
  521 
   522 /* Limit the packet size for direct transmission */
  523 static int                      hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
  524 SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
  525     &hn_direct_tx_size, 0, "Size of the packet for direct transmission");
  526 
  527 /* # of LRO entries per RX ring */
  528 #if defined(INET) || defined(INET6)
  529 #if __FreeBSD_version >= 1100095
  530 static int                      hn_lro_entry_count = HN_LROENT_CNT_DEF;
  531 SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
  532     &hn_lro_entry_count, 0, "LRO entry count");
  533 #endif
  534 #endif
  535 
  536 static int                      hn_tx_taskq_cnt = 1;
  537 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
  538     &hn_tx_taskq_cnt, 0, "# of TX taskqueues");
  539 
  540 #define HN_TX_TASKQ_M_INDEP     0
  541 #define HN_TX_TASKQ_M_GLOBAL    1
  542 #define HN_TX_TASKQ_M_EVTTQ     2
  543 
  544 static int                      hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
  545 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
  546     &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
  547     "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");
  548 
  549 #ifndef HN_USE_TXDESC_BUFRING
  550 static int                      hn_use_txdesc_bufring = 0;
  551 #else
  552 static int                      hn_use_txdesc_bufring = 1;
  553 #endif
  554 SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
  555     &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");
  556 
  557 #ifdef HN_IFSTART_SUPPORT
  558 /* Use ifnet.if_start instead of ifnet.if_transmit */
  559 static int                      hn_use_if_start = 0;
  560 SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
  561     &hn_use_if_start, 0, "Use if_start TX method");
  562 #endif
  563 
  564 /* # of channels to use */
  565 static int                      hn_chan_cnt = 0;
  566 SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
  567     &hn_chan_cnt, 0,
  568     "# of channels to use; each channel has one RX ring and one TX ring");
  569 
  570 /* # of transmit rings to use */
  571 static int                      hn_tx_ring_cnt = 0;
  572 SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
  573     &hn_tx_ring_cnt, 0, "# of TX rings to use");
  574 
   575 /* Software TX ring depth */
  576 static int                      hn_tx_swq_depth = 0;
  577 SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
  578     &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");
  579 
  580 /* Enable sorted LRO, and the depth of the per-channel mbuf queue */
  581 #if __FreeBSD_version >= 1100095
  582 static u_int                    hn_lro_mbufq_depth = 0;
  583 SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
  584     &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
  585 #endif
  586 
  587 /* Packet transmission aggregation size limit */
  588 static int                      hn_tx_agg_size = -1;
  589 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
  590     &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");
  591 
  592 /* Packet transmission aggregation count limit */
  593 static int                      hn_tx_agg_pkts = -1;
  594 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
  595     &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");
  596 
  597 /* VF list */
  598 SYSCTL_PROC(_hw_hn, OID_AUTO, vflist,
  599     CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0,
  600     hn_vflist_sysctl, "A",
  601     "VF list");
  602 
  603 /* VF mapping */
  604 SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap,
  605     CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0,
  606     hn_vfmap_sysctl, "A",
  607     "VF mapping");
  608 
  609 /* Transparent VF */
  610 static int                      hn_xpnt_vf = 1;
  611 SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN,
   612     &hn_xpnt_vf, 0, "Transparent VF mode");
  613 
  614 /* Accurate BPF support for Transparent VF */
  615 static int                      hn_xpnt_vf_accbpf = 0;
  616 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN,
  617     &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF");
  618 
   619 /* Extra wait for the transparent VF attach routine; unit: seconds. */
  620 static int                      hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
  621 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN,
  622     &hn_xpnt_vf_attwait, 0,
  623     "Extra wait for transparent VF attach routing; unit: seconds");
  624 
  625 static u_int                    hn_cpu_index;   /* next CPU for channel */
  626 static struct taskqueue         **hn_tx_taskque;/* shared TX taskqueues */
  627 
  628 static struct rmlock            hn_vfmap_lock;
  629 static int                      hn_vfmap_size;
  630 static struct ifnet             **hn_vfmap;
  631 
  632 #ifndef RSS
  633 static const uint8_t
  634 hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
  635         0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
  636         0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
  637         0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
  638         0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
  639         0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
  640 };
  641 #endif  /* !RSS */
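
/*
 * NOTE: hn_rss_key_default above is the 40-byte Toeplitz sample key
 * from Microsoft's RSS specification; many NIC drivers ship the same
 * default key.
 */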
  642 
  643 static const struct hyperv_guid hn_guid = {
  644         .hv_guid = {
  645             0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46,
  646             0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e }
  647 };
  648 
  649 static device_method_t hn_methods[] = {
  650         /* Device interface */
  651         DEVMETHOD(device_probe,         hn_probe),
  652         DEVMETHOD(device_attach,        hn_attach),
  653         DEVMETHOD(device_detach,        hn_detach),
  654         DEVMETHOD(device_shutdown,      hn_shutdown),
  655         DEVMETHOD_END
  656 };
  657 
  658 static driver_t hn_driver = {
  659         "hn",
  660         hn_methods,
  661         sizeof(struct hn_softc)
  662 };
  663 
  664 DRIVER_MODULE(hn, vmbus, hn_driver, 0, 0);
  665 MODULE_VERSION(hn, 1);
  666 MODULE_DEPEND(hn, vmbus, 1, 1, 1);
  667 
  668 #if __FreeBSD_version >= 1100099
  669 static void
  670 hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
  671 {
  672         int i;
  673 
  674         for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
  675                 sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
  676 }
  677 #endif
  678 
  679 static int
  680 hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
  681 {
  682 
  683         KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
  684             txd->chim_size == 0, ("invalid rndis sglist txd"));
  685         return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
  686             &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
  687 }
  688 
  689 static int
  690 hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
  691 {
  692         struct hn_nvs_rndis rndis;
  693 
  694         KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
  695             txd->chim_size > 0, ("invalid rndis chim txd"));
  696 
  697         rndis.nvs_type = HN_NVS_TYPE_RNDIS;
  698         rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
  699         rndis.nvs_chim_idx = txd->chim_index;
  700         rndis.nvs_chim_sz = txd->chim_size;
  701 
  702         return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
  703             &rndis, sizeof(rndis), &txd->send_ctx));
  704 }
  705 
  706 static __inline uint32_t
  707 hn_chim_alloc(struct hn_softc *sc)
  708 {
  709         int i, bmap_cnt = sc->hn_chim_bmap_cnt;
  710         u_long *bmap = sc->hn_chim_bmap;
  711         uint32_t ret = HN_NVS_CHIM_IDX_INVALID;
  712 
  713         for (i = 0; i < bmap_cnt; ++i) {
  714                 int idx;
  715 
  716                 idx = ffsl(~bmap[i]);
  717                 if (idx == 0)
  718                         continue;
  719 
  720                 --idx; /* ffsl is 1-based */
  721                 KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
  722                     ("invalid i %d and idx %d", i, idx));
  723 
  724                 if (atomic_testandset_long(&bmap[i], idx))
  725                         continue;
  726 
  727                 ret = i * LONG_BIT + idx;
  728                 break;
  729         }
  730         return (ret);
  731 }
  732 
  733 static __inline void
  734 hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
  735 {
  736         u_long mask;
  737         uint32_t idx;
  738 
  739         idx = chim_idx / LONG_BIT;
  740         KASSERT(idx < sc->hn_chim_bmap_cnt,
  741             ("invalid chimney index 0x%x", chim_idx));
  742 
  743         mask = 1UL << (chim_idx % LONG_BIT);
  744         KASSERT(sc->hn_chim_bmap[idx] & mask,
  745             ("index bitmap 0x%lx, chimney index %u, "
  746              "bitmap idx %d, bitmask 0x%lx",
  747              sc->hn_chim_bmap[idx], chim_idx, idx, mask));
  748 
  749         atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
  750 }
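
/*
 * Usage sketch for the two helpers above (hypothetical caller, not
 * part of this driver): together they form a lock-free allocator over
 * the chimney sending-buffer bitmap; concurrent allocators race
 * through atomic_testandset_long(), so no mutex is required.
 *
 *	uint32_t idx = hn_chim_alloc(sc);
 *	if (idx == HN_NVS_CHIM_IDX_INVALID)
 *		return (ENOBUFS);
 *	... copy the RNDIS packet into chimney slot 'idx' ...
 *	hn_chim_free(sc, idx);
 */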
  751 
  752 #if defined(INET6) || defined(INET)
  753 
  754 #define PULLUP_HDR(m, len)                              \
  755 do {                                                    \
  756         if (__predict_false((m)->m_len < (len))) {      \
  757                 (m) = m_pullup((m), (len));             \
  758                 if ((m) == NULL)                        \
  759                         return (NULL);                  \
  760         }                                               \
  761 } while (0)
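
/*
 * NOTE: m_pullup() may replace the leading mbuf, which is why
 * PULLUP_HDR() reassigns (m); any pointer previously obtained via
 * mtod() must be re-fetched after each invocation, as the functions
 * below do.
 */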
  762 
  763 /*
  764  * NOTE: If this function failed, the m_head would be freed.
  765  */
  766 static __inline struct mbuf *
  767 hn_tso_fixup(struct mbuf *m_head)
  768 {
  769         struct ether_vlan_header *evl;
  770         struct tcphdr *th;
  771         int ehlen;
  772 
  773         KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));
  774 
  775         PULLUP_HDR(m_head, sizeof(*evl));
  776         evl = mtod(m_head, struct ether_vlan_header *);
  777         if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
  778                 ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
  779         else
  780                 ehlen = ETHER_HDR_LEN;
  781         m_head->m_pkthdr.l2hlen = ehlen;
  782 
  783 #ifdef INET
  784         if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
  785                 struct ip *ip;
  786                 int iphlen;
  787 
  788                 PULLUP_HDR(m_head, ehlen + sizeof(*ip));
  789                 ip = mtodo(m_head, ehlen);
  790                 iphlen = ip->ip_hl << 2;
  791                 m_head->m_pkthdr.l3hlen = iphlen;
  792 
  793                 PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
  794                 th = mtodo(m_head, ehlen + iphlen);
  795 
  796                 ip->ip_len = 0;
  797                 ip->ip_sum = 0;
  798                 th->th_sum = in_pseudo(ip->ip_src.s_addr,
  799                     ip->ip_dst.s_addr, htons(IPPROTO_TCP));
  800         }
  801 #endif
  802 #if defined(INET6) && defined(INET)
  803         else
  804 #endif
  805 #ifdef INET6
  806         {
  807                 struct ip6_hdr *ip6;
  808 
  809                 PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
  810                 ip6 = mtodo(m_head, ehlen);
  811                 if (ip6->ip6_nxt != IPPROTO_TCP) {
  812                         m_freem(m_head);
  813                         return (NULL);
  814                 }
  815                 m_head->m_pkthdr.l3hlen = sizeof(*ip6);
  816 
  817                 PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
  818                 th = mtodo(m_head, ehlen + sizeof(*ip6));
  819 
  820                 ip6->ip6_plen = 0;
  821                 th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
  822         }
  823 #endif
  824         return (m_head);
  825 }
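
/*
 * NOTE on hn_tso_fixup() above: for LSO the host fills in the IP total
 * length and per-segment TCP checksum, so the driver zeroes
 * ip_len/ip6_plen and seeds th_sum with the pseudo-header checksum
 * computed over the addresses and protocol only, the usual convention
 * for TSO-capable hardware.
 */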
  826 
  827 /*
  828  * NOTE: If this function failed, the m_head would be freed.
  829  */
  830 static __inline struct mbuf *
  831 hn_set_hlen(struct mbuf *m_head)
  832 {
  833         const struct ether_vlan_header *evl;
  834         int ehlen;
  835 
  836         PULLUP_HDR(m_head, sizeof(*evl));
  837         evl = mtod(m_head, const struct ether_vlan_header *);
  838         if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
  839                 ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
  840         else
  841                 ehlen = ETHER_HDR_LEN;
  842         m_head->m_pkthdr.l2hlen = ehlen;
  843 
  844 #ifdef INET
  845         if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP_UDP)) {
  846                 const struct ip *ip;
  847                 int iphlen;
  848 
  849                 PULLUP_HDR(m_head, ehlen + sizeof(*ip));
  850                 ip = mtodo(m_head, ehlen);
  851                 iphlen = ip->ip_hl << 2;
  852                 m_head->m_pkthdr.l3hlen = iphlen;
  853 
  854                 /*
   855                  * UDP checksum offload does not work on Azure if the
   856                  * following conditions are met:
   857                  * - sizeof(IP hdr + UDP hdr + payload) > 1420.
   858                  * - IP_DF is not set in the IP hdr.
   859                  *
   860                  * Fall back to software checksum for these UDP datagrams.
  861                  */
  862                 if ((m_head->m_pkthdr.csum_flags & CSUM_IP_UDP) &&
  863                     m_head->m_pkthdr.len > hn_udpcs_fixup_mtu + ehlen &&
  864                     (ntohs(ip->ip_off) & IP_DF) == 0) {
  865                         uint16_t off = ehlen + iphlen;
  866 
  867                         counter_u64_add(hn_udpcs_fixup, 1);
  868                         PULLUP_HDR(m_head, off + sizeof(struct udphdr));
  869                         *(uint16_t *)(m_head->m_data + off +
  870                             m_head->m_pkthdr.csum_data) = in_cksum_skip(
  871                             m_head, m_head->m_pkthdr.len, off);
  872                         m_head->m_pkthdr.csum_flags &= ~CSUM_IP_UDP;
  873                 }
  874         }
  875 #endif
  876 #if defined(INET6) && defined(INET)
  877         else
  878 #endif
  879 #ifdef INET6
  880         {
  881                 const struct ip6_hdr *ip6;
  882 
  883                 PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
  884                 ip6 = mtodo(m_head, ehlen);
  885                 if (ip6->ip6_nxt != IPPROTO_TCP &&
  886                     ip6->ip6_nxt != IPPROTO_UDP) {
  887                         m_freem(m_head);
  888                         return (NULL);
  889                 }
  890                 m_head->m_pkthdr.l3hlen = sizeof(*ip6);
  891         }
  892 #endif
  893         return (m_head);
  894 }
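
/*
 * Worked example of the UDP fixup threshold in hn_set_hlen(): with the
 * default hn_udpcs_fixup_mtu of 1420 and an untagged Ethernet header
 * (ehlen == 14), a UDP datagram is checksummed in software once the
 * whole frame exceeds 1434 bytes while IP_DF is clear.
 */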
  895 
  896 /*
  897  * NOTE: If this function failed, the m_head would be freed.
  898  */
  899 static __inline struct mbuf *
  900 hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn)
  901 {
  902         const struct tcphdr *th;
  903         int ehlen, iphlen;
  904 
  905         *tcpsyn = 0;
  906         ehlen = m_head->m_pkthdr.l2hlen;
  907         iphlen = m_head->m_pkthdr.l3hlen;
  908 
  909         PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
  910         th = mtodo(m_head, ehlen + iphlen);
  911         if (th->th_flags & TH_SYN)
  912                 *tcpsyn = 1;
  913         return (m_head);
  914 }
  915 
  916 #undef PULLUP_HDR
  917 
  918 #endif  /* INET6 || INET */
  919 
  920 static int
  921 hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
  922 {
  923         int error = 0;
  924 
  925         HN_LOCK_ASSERT(sc);
  926 
  927         if (sc->hn_rx_filter != filter) {
  928                 error = hn_rndis_set_rxfilter(sc, filter);
  929                 if (!error)
  930                         sc->hn_rx_filter = filter;
  931         }
  932         return (error);
  933 }
  934 
  935 static int
  936 hn_rxfilter_config(struct hn_softc *sc)
  937 {
  938         struct ifnet *ifp = sc->hn_ifp;
  939         uint32_t filter;
  940 
  941         HN_LOCK_ASSERT(sc);
  942 
  943         /*
  944          * If the non-transparent mode VF is activated, we don't know how
  945          * its RX filter is configured, so stick the synthetic device in
   946          * promiscuous mode.
  947          */
  948         if ((ifp->if_flags & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) {
  949                 filter = NDIS_PACKET_TYPE_PROMISCUOUS;
  950         } else {
  951                 filter = NDIS_PACKET_TYPE_DIRECTED;
  952                 if (ifp->if_flags & IFF_BROADCAST)
  953                         filter |= NDIS_PACKET_TYPE_BROADCAST;
  954                 /* TODO: support multicast list */
  955                 if ((ifp->if_flags & IFF_ALLMULTI) ||
  956                     !CK_STAILQ_EMPTY(&ifp->if_multiaddrs))
  957                         filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
  958         }
  959         return (hn_set_rxfilter(sc, filter));
  960 }
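
/*
 * E.g. a typical up interface (IFF_BROADCAST set, some multicast
 * memberships, no IFF_PROMISC, no VF) ends up with
 * NDIS_PACKET_TYPE_DIRECTED | _BROADCAST | _ALL_MULTICAST.
 */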
  961 
  962 static void
  963 hn_set_txagg(struct hn_softc *sc)
  964 {
  965         uint32_t size, pkts;
  966         int i;
  967 
  968         /*
  969          * Setup aggregation size.
  970          */
  971         if (sc->hn_agg_size < 0)
  972                 size = UINT32_MAX;
  973         else
  974                 size = sc->hn_agg_size;
  975 
  976         if (sc->hn_rndis_agg_size < size)
  977                 size = sc->hn_rndis_agg_size;
  978 
  979         /* NOTE: We only aggregate packets using chimney sending buffers. */
  980         if (size > (uint32_t)sc->hn_chim_szmax)
  981                 size = sc->hn_chim_szmax;
  982 
  983         if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
  984                 /* Disable */
  985                 size = 0;
  986                 pkts = 0;
  987                 goto done;
  988         }
  989 
  990         /* NOTE: Type of the per TX ring setting is 'int'. */
  991         if (size > INT_MAX)
  992                 size = INT_MAX;
  993 
  994         /*
  995          * Setup aggregation packet count.
  996          */
  997         if (sc->hn_agg_pkts < 0)
  998                 pkts = UINT32_MAX;
  999         else
 1000                 pkts = sc->hn_agg_pkts;
 1001 
 1002         if (sc->hn_rndis_agg_pkts < pkts)
 1003                 pkts = sc->hn_rndis_agg_pkts;
 1004 
 1005         if (pkts <= 1) {
 1006                 /* Disable */
 1007                 size = 0;
 1008                 pkts = 0;
 1009                 goto done;
 1010         }
 1011 
 1012         /* NOTE: Type of the per TX ring setting is 'short'. */
 1013         if (pkts > SHRT_MAX)
 1014                 pkts = SHRT_MAX;
 1015 
 1016 done:
 1017         /* NOTE: Type of the per TX ring setting is 'short'. */
 1018         if (sc->hn_rndis_agg_align > SHRT_MAX) {
 1019                 /* Disable */
 1020                 size = 0;
 1021                 pkts = 0;
 1022         }
 1023 
 1024         if (bootverbose) {
 1025                 if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
 1026                     size, pkts, sc->hn_rndis_agg_align);
 1027         }
 1028 
 1029         for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
 1030                 struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
 1031 
 1032                 mtx_lock(&txr->hn_tx_lock);
 1033                 txr->hn_agg_szmax = size;
 1034                 txr->hn_agg_pktmax = pkts;
 1035                 txr->hn_agg_align = sc->hn_rndis_agg_align;
 1036                 mtx_unlock(&txr->hn_tx_lock);
 1037         }
 1038 }
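
/*
 * Example of the clamping in hn_set_txagg() (hypothetical numbers):
 * with hn_agg_size == -1, hn_rndis_agg_size == 32768 and
 * hn_chim_szmax == 16384, the effective size limit is 16384.
 * Aggregation is disabled outright when the limit cannot hold more
 * than two minimally-sized packets, or when at most one packet may be
 * aggregated.
 */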
 1039 
 1040 static int
 1041 hn_get_txswq_depth(const struct hn_tx_ring *txr)
 1042 {
 1043 
 1044         KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
 1045         if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
 1046                 return txr->hn_txdesc_cnt;
 1047         return hn_tx_swq_depth;
 1048 }
 1049 
 1050 static int
 1051 hn_rss_reconfig(struct hn_softc *sc)
 1052 {
 1053         int error;
 1054 
 1055         HN_LOCK_ASSERT(sc);
 1056 
 1057         if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
 1058                 return (ENXIO);
 1059 
 1060         /*
 1061          * Disable RSS first.
 1062          *
 1063          * NOTE:
 1064          * Direct reconfiguration by setting the UNCHG flags does
 1065          * _not_ work properly.
 1066          */
 1067         if (bootverbose)
 1068                 if_printf(sc->hn_ifp, "disable RSS\n");
 1069         error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
 1070         if (error) {
 1071                 if_printf(sc->hn_ifp, "RSS disable failed\n");
 1072                 return (error);
 1073         }
 1074 
 1075         /*
 1076          * Reenable the RSS w/ the updated RSS key or indirect
 1077          * table.
 1078          */
 1079         if (bootverbose)
 1080                 if_printf(sc->hn_ifp, "reconfig RSS\n");
 1081         error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
 1082         if (error) {
 1083                 if_printf(sc->hn_ifp, "RSS reconfig failed\n");
 1084                 return (error);
 1085         }
 1086         return (0);
 1087 }
 1088 
 1089 static void
 1090 hn_rss_ind_fixup(struct hn_softc *sc)
 1091 {
 1092         struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
 1093         int i, nchan;
 1094 
 1095         nchan = sc->hn_rx_ring_inuse;
 1096         KASSERT(nchan > 1, ("invalid # of channels %d", nchan));
 1097 
 1098         /*
 1099          * Check indirect table to make sure that all channels in it
 1100          * can be used.
 1101          */
 1102         for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
 1103                 if (rss->rss_ind[i] >= nchan) {
 1104                         if_printf(sc->hn_ifp,
 1105                             "RSS indirect table %d fixup: %u -> %d\n",
 1106                             i, rss->rss_ind[i], nchan - 1);
 1107                         rss->rss_ind[i] = nchan - 1;
 1108                 }
 1109         }
 1110 }
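
/*
 * E.g. if the indirect table was built while 4 channels were in use
 * but only 2 remain, entries referencing channels 2 and 3 are clamped
 * to 1, the last usable channel.
 */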
 1111 
 1112 static int
 1113 hn_ifmedia_upd(struct ifnet *ifp __unused)
 1114 {
 1115 
 1116         return EOPNOTSUPP;
 1117 }
 1118 
 1119 static void
 1120 hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
 1121 {
 1122         struct hn_softc *sc = ifp->if_softc;
 1123 
 1124         ifmr->ifm_status = IFM_AVALID;
 1125         ifmr->ifm_active = IFM_ETHER;
 1126 
 1127         if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
 1128                 ifmr->ifm_active |= IFM_NONE;
 1129                 return;
 1130         }
 1131         ifmr->ifm_status |= IFM_ACTIVE;
 1132         ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
 1133 }
 1134 
 1135 static void
 1136 hn_rxvf_set_task(void *xarg, int pending __unused)
 1137 {
 1138         struct hn_rxvf_setarg *arg = xarg;
 1139 
 1140         arg->rxr->hn_rxvf_ifp = arg->vf_ifp;
 1141 }
 1142 
 1143 static void
 1144 hn_rxvf_set(struct hn_softc *sc, struct ifnet *vf_ifp)
 1145 {
 1146         struct hn_rx_ring *rxr;
 1147         struct hn_rxvf_setarg arg;
 1148         struct task task;
 1149         int i;
 1150 
 1151         HN_LOCK_ASSERT(sc);
 1152 
 1153         TASK_INIT(&task, 0, hn_rxvf_set_task, &arg);
 1154 
 1155         for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
 1156                 rxr = &sc->hn_rx_ring[i];
 1157 
 1158                 if (i < sc->hn_rx_ring_inuse) {
 1159                         arg.rxr = rxr;
 1160                         arg.vf_ifp = vf_ifp;
 1161                         vmbus_chan_run_task(rxr->hn_chan, &task);
 1162                 } else {
 1163                         rxr->hn_rxvf_ifp = vf_ifp;
 1164                 }
 1165         }
 1166 }
 1167 
 1168 static bool
 1169 hn_ismyvf(const struct hn_softc *sc, const struct ifnet *ifp)
 1170 {
 1171         const struct ifnet *hn_ifp;
 1172 
 1173         hn_ifp = sc->hn_ifp;
 1174 
 1175         if (ifp == hn_ifp)
 1176                 return (false);
 1177 
 1178         if (ifp->if_alloctype != IFT_ETHER)
 1179                 return (false);
 1180 
 1181         /* Ignore lagg/vlan interfaces */
 1182         if (strcmp(ifp->if_dname, "lagg") == 0 ||
 1183             strcmp(ifp->if_dname, "vlan") == 0)
 1184                 return (false);
 1185 
 1186         /*
 1187          * During detach events ifp->if_addr might be NULL.
 1188          * Make sure the bcmp() below doesn't panic on that:
 1189          */
 1190         if (ifp->if_addr == NULL || hn_ifp->if_addr == NULL)
 1191                 return (false);
 1192 
 1193         if (bcmp(IF_LLADDR(ifp), IF_LLADDR(hn_ifp), ETHER_ADDR_LEN) != 0)
 1194                 return (false);
 1195 
 1196         return (true);
 1197 }
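
/*
 * NOTE: hn_ismyvf() matches purely by link-level address: any Ethernet
 * ifnet other than lagg/vlan pseudo-interfaces that carries the same
 * MAC as the synthetic device is treated as its VF.
 */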
 1198 
 1199 static void
 1200 hn_rxvf_change(struct hn_softc *sc, struct ifnet *ifp, bool rxvf)
 1201 {
 1202         struct ifnet *hn_ifp;
 1203 
 1204         HN_LOCK(sc);
 1205 
 1206         if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
 1207                 goto out;
 1208 
 1209         if (!hn_ismyvf(sc, ifp))
 1210                 goto out;
 1211         hn_ifp = sc->hn_ifp;
 1212 
 1213         if (rxvf) {
 1214                 if (sc->hn_flags & HN_FLAG_RXVF)
 1215                         goto out;
 1216 
 1217                 sc->hn_flags |= HN_FLAG_RXVF;
 1218                 hn_rxfilter_config(sc);
 1219         } else {
 1220                 if (!(sc->hn_flags & HN_FLAG_RXVF))
 1221                         goto out;
 1222 
 1223                 sc->hn_flags &= ~HN_FLAG_RXVF;
 1224                 if (hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
 1225                         hn_rxfilter_config(sc);
 1226                 else
 1227                         hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
 1228         }
 1229 
 1230         hn_nvs_set_datapath(sc,
 1231             rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH);
 1232 
 1233         hn_rxvf_set(sc, rxvf ? ifp : NULL);
 1234 
 1235         if (rxvf) {
 1236                 hn_vf_rss_fixup(sc, true);
 1237                 hn_suspend_mgmt(sc);
 1238                 sc->hn_link_flags &=
 1239                     ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG);
 1240                 if_link_state_change(hn_ifp, LINK_STATE_DOWN);
 1241         } else {
 1242                 hn_vf_rss_restore(sc);
 1243                 hn_resume_mgmt(sc);
 1244         }
 1245 
 1246         devctl_notify("HYPERV_NIC_VF", hn_ifp->if_xname,
 1247             rxvf ? "VF_UP" : "VF_DOWN", NULL);
 1248 
 1249         if (bootverbose) {
 1250                 if_printf(hn_ifp, "datapath is switched %s %s\n",
 1251                     rxvf ? "to" : "from", ifp->if_xname);
 1252         }
 1253 out:
 1254         HN_UNLOCK(sc);
 1255 }
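
/*
 * Summary of the switch-over in hn_rxvf_change() above: the RX filter
 * is reprogrammed first (promiscuous while the VF owns the datapath),
 * then the NVS datapath is flipped, then the per-ring rxvf pointers
 * are updated on their channels.  Only the VF_UP direction also fixes
 * up RSS, suspends the management tasks, and reports the synthetic
 * link down.
 */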
 1256 
 1257 static void
 1258 hn_ifnet_event(void *arg, struct ifnet *ifp, int event)
 1259 {
 1260 
 1261         if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN)
 1262                 return;
 1263         hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP);
 1264 }
 1265 
 1266 static void
 1267 hn_ifaddr_event(void *arg, struct ifnet *ifp)
 1268 {
 1269 
 1270         hn_rxvf_change(arg, ifp, ifp->if_flags & IFF_UP);
 1271 }
 1272 
 1273 static int
 1274 hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr)
 1275 {
 1276         struct ifnet *ifp, *vf_ifp;
 1277         uint64_t tmp;
 1278         int error;
 1279 
 1280         HN_LOCK_ASSERT(sc);
 1281         ifp = sc->hn_ifp;
 1282         vf_ifp = sc->hn_vf_ifp;
 1283 
 1284         /*
 1285          * Fix up requested capabilities w/ supported capabilities,
 1286          * since the supported capabilities could have been changed.
 1287          */
 1288         ifr->ifr_reqcap &= ifp->if_capabilities;
 1289         /* Pass SIOCSIFCAP to VF. */
 1290         error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFCAP, (caddr_t)ifr);
 1291 
 1292         /*
 1293          * NOTE:
 1294          * The error will be propagated to the callers, however, it
 1295          * is _not_ useful here.
 1296          */
 1297 
 1298         /*
 1299          * Merge VF's enabled capabilities.
 1300          */
 1301         ifp->if_capenable = vf_ifp->if_capenable & ifp->if_capabilities;
 1302 
 1303         tmp = vf_ifp->if_hwassist & HN_CSUM_IP_HWASSIST(sc);
 1304         if (ifp->if_capenable & IFCAP_TXCSUM)
 1305                 ifp->if_hwassist |= tmp;
 1306         else
 1307                 ifp->if_hwassist &= ~tmp;
 1308 
 1309         tmp = vf_ifp->if_hwassist & HN_CSUM_IP6_HWASSIST(sc);
 1310         if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
 1311                 ifp->if_hwassist |= tmp;
 1312         else
 1313                 ifp->if_hwassist &= ~tmp;
 1314 
 1315         tmp = vf_ifp->if_hwassist & CSUM_IP_TSO;
 1316         if (ifp->if_capenable & IFCAP_TSO4)
 1317                 ifp->if_hwassist |= tmp;
 1318         else
 1319                 ifp->if_hwassist &= ~tmp;
 1320 
 1321         tmp = vf_ifp->if_hwassist & CSUM_IP6_TSO;
 1322         if (ifp->if_capenable & IFCAP_TSO6)
 1323                 ifp->if_hwassist |= tmp;
 1324         else
 1325                 ifp->if_hwassist &= ~tmp;
 1326 
 1327         return (error);
 1328 }
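
/*
 * The four if/else blocks above all apply the same rule: a group of
 * hardware-assist bits is ORed in when its controlling capability is
 * enabled and masked out otherwise.  A minimal standalone sketch of that
 * rule (merge_assist() is hypothetical, not part of this driver):
 */
#if 0
#include <stdint.h>

static uint64_t
merge_assist(uint64_t hwassist, uint64_t bits, int cap_enabled)
{
        /* OR the bits in when enabled, clear them when not. */
        return (cap_enabled ? (hwassist | bits) : (hwassist & ~bits));
}
#endif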
 1329 
 1330 static int
 1331 hn_xpnt_vf_iocsetflags(struct hn_softc *sc)
 1332 {
 1333         struct ifnet *vf_ifp;
 1334         struct ifreq ifr;
 1335 
 1336         HN_LOCK_ASSERT(sc);
 1337         vf_ifp = sc->hn_vf_ifp;
 1338 
 1339         memset(&ifr, 0, sizeof(ifr));
 1340         strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
 1341         ifr.ifr_flags = vf_ifp->if_flags & 0xffff;
 1342         ifr.ifr_flagshigh = vf_ifp->if_flags >> 16;
 1343         return (vf_ifp->if_ioctl(vf_ifp, SIOCSIFFLAGS, (caddr_t)&ifr));
 1344 }
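
/*
 * hn_xpnt_vf_iocsetflags() above splits the 32-bit if_flags across the
 * 16-bit ifr_flags and ifr_flagshigh fields of struct ifreq.  A standalone
 * sketch of the split and the matching reassembly (pack_flags() and
 * unpack_flags() are hypothetical helpers):
 */
#if 0
#include <stdint.h>

struct flag_halves {
        uint16_t        low;    /* flags & 0xffff */
        uint16_t        high;   /* flags >> 16 */
};

static struct flag_halves
pack_flags(uint32_t flags)
{
        struct flag_halves h = { flags & 0xffff, flags >> 16 };

        return (h);
}

static uint32_t
unpack_flags(struct flag_halves h)
{
        return ((uint32_t)h.high << 16 | h.low);
}
#endif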
 1345 
 1346 static void
 1347 hn_xpnt_vf_saveifflags(struct hn_softc *sc)
 1348 {
 1349         struct ifnet *ifp = sc->hn_ifp;
 1350         int allmulti = 0;
 1351 
 1352         HN_LOCK_ASSERT(sc);
 1353 
 1354         /* XXX vlan(4) style mcast addr maintenance */
 1355         if (!CK_STAILQ_EMPTY(&ifp->if_multiaddrs))
 1356                 allmulti = IFF_ALLMULTI;
 1357 
 1358         /* Always set the VF's if_flags */
 1359         sc->hn_vf_ifp->if_flags = ifp->if_flags | allmulti;
 1360 }
 1361 
 1362 static void
 1363 hn_xpnt_vf_input(struct ifnet *vf_ifp, struct mbuf *m)
 1364 {
 1365         struct rm_priotracker pt;
 1366         struct ifnet *hn_ifp = NULL;
 1367         struct mbuf *mn;
 1368 
 1369         /*
 1370          * XXX racy, if hn(4) is ever detached.
 1371          */
 1372         rm_rlock(&hn_vfmap_lock, &pt);
 1373         if (vf_ifp->if_index < hn_vfmap_size)
 1374                 hn_ifp = hn_vfmap[vf_ifp->if_index];
 1375         rm_runlock(&hn_vfmap_lock, &pt);
 1376 
 1377         if (hn_ifp != NULL) {
 1378                 for (mn = m; mn != NULL; mn = mn->m_nextpkt) {
 1379                         /*
 1380                          * Allow tapping on the VF.
 1381                          */
 1382                         ETHER_BPF_MTAP(vf_ifp, mn);
 1383 
 1384                         /*
 1385                          * Update VF stats.
 1386                          */
 1387                         if ((vf_ifp->if_capenable & IFCAP_HWSTATS) == 0) {
 1388                                 if_inc_counter(vf_ifp, IFCOUNTER_IBYTES,
 1389                                     mn->m_pkthdr.len);
 1390                         }
 1391                         /*
 1392                          * XXX IFCOUNTER_IMCAST
 1393                          * This stat updating is kinda invasive, since it
 1394                          * requires two checks on the mbuf: the length check
 1395                          * and the ethernet header check.  As of this writing,
 1396                          * all multicast packets go directly to hn(4), which
 1397                          * makes imcast stat updating in the VF a vain effort.
 1398                          */
 1399 
 1400                         /*
 1401                          * Fix up rcvif and increase hn(4)'s ipackets.
 1402                          */
 1403                         mn->m_pkthdr.rcvif = hn_ifp;
 1404                         if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
 1405                 }
 1406                 /*
 1407                  * Go through hn(4)'s if_input.
 1408                  */
 1409                 hn_ifp->if_input(hn_ifp, m);
 1410         } else {
 1411                 /*
 1412                  * In the middle of the transition; free this
 1413                  * mbuf chain.
 1414                  */
 1415                 while (m != NULL) {
 1416                         mn = m->m_nextpkt;
 1417                         m->m_nextpkt = NULL;
 1418                         m_freem(m);
 1419                         m = mn;
 1420                 }
 1421         }
 1422 }
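
/*
 * The fallback branch above tears down an m_nextpkt-linked packet chain
 * by detaching each packet before freeing it, so the free never follows
 * a dangling link.  The same unlink-then-free walk on a generic singly
 * linked list (struct pkt is a hypothetical stand-in for struct mbuf):
 */
#if 0
#include <stdlib.h>

struct pkt {
        struct pkt      *next;
        /* payload omitted */
};

static void
free_chain(struct pkt *p)
{

        while (p != NULL) {
                struct pkt *n = p->next;

                p->next = NULL;         /* detach before freeing */
                free(p);
                p = n;
        }
}
#endif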
 1423 
 1424 static void
 1425 hn_mtu_change_fixup(struct hn_softc *sc)
 1426 {
 1427         struct ifnet *ifp;
 1428 
 1429         HN_LOCK_ASSERT(sc);
 1430         ifp = sc->hn_ifp;
 1431 
 1432         hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
 1433 #if __FreeBSD_version >= 1100099
 1434         if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp))
 1435                 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
 1436 #endif
 1437 }
 1438 
 1439 static uint32_t
 1440 hn_rss_type_fromndis(uint32_t rss_hash)
 1441 {
 1442         uint32_t types = 0;
 1443 
 1444         if (rss_hash & NDIS_HASH_IPV4)
 1445                 types |= RSS_TYPE_IPV4;
 1446         if (rss_hash & NDIS_HASH_TCP_IPV4)
 1447                 types |= RSS_TYPE_TCP_IPV4;
 1448         if (rss_hash & NDIS_HASH_IPV6)
 1449                 types |= RSS_TYPE_IPV6;
 1450         if (rss_hash & NDIS_HASH_IPV6_EX)
 1451                 types |= RSS_TYPE_IPV6_EX;
 1452         if (rss_hash & NDIS_HASH_TCP_IPV6)
 1453                 types |= RSS_TYPE_TCP_IPV6;
 1454         if (rss_hash & NDIS_HASH_TCP_IPV6_EX)
 1455                 types |= RSS_TYPE_TCP_IPV6_EX;
 1456         if (rss_hash & NDIS_HASH_UDP_IPV4_X)
 1457                 types |= RSS_TYPE_UDP_IPV4;
 1458         return (types);
 1459 }
 1460 
 1461 static uint32_t
 1462 hn_rss_type_tondis(uint32_t types)
 1463 {
 1464         uint32_t rss_hash = 0;
 1465 
 1466         KASSERT((types & (RSS_TYPE_UDP_IPV6 | RSS_TYPE_UDP_IPV6_EX)) == 0,
 1467             ("UDP6 and UDP6EX are not supported"));
 1468 
 1469         if (types & RSS_TYPE_IPV4)
 1470                 rss_hash |= NDIS_HASH_IPV4;
 1471         if (types & RSS_TYPE_TCP_IPV4)
 1472                 rss_hash |= NDIS_HASH_TCP_IPV4;
 1473         if (types & RSS_TYPE_IPV6)
 1474                 rss_hash |= NDIS_HASH_IPV6;
 1475         if (types & RSS_TYPE_IPV6_EX)
 1476                 rss_hash |= NDIS_HASH_IPV6_EX;
 1477         if (types & RSS_TYPE_TCP_IPV6)
 1478                 rss_hash |= NDIS_HASH_TCP_IPV6;
 1479         if (types & RSS_TYPE_TCP_IPV6_EX)
 1480                 rss_hash |= NDIS_HASH_TCP_IPV6_EX;
 1481         if (types & RSS_TYPE_UDP_IPV4)
 1482                 rss_hash |= NDIS_HASH_UDP_IPV4_X;
 1483         return (rss_hash);
 1484 }
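
/*
 * hn_rss_type_fromndis() and hn_rss_type_tondis() translate between NDIS
 * hash bits and the kernel's RSS_TYPE_* bits one flag at a time, so any
 * mask built from the supported flags round-trips losslessly.  A sketch
 * of that property as a self-check (assumes the two functions and the
 * NDIS_HASH_* masks used above are in scope):
 */
#if 0
static void
hn_rss_type_selftest(void)
{
        uint32_t ndis = NDIS_HASH_IPV4 | NDIS_HASH_TCP_IPV4 |
            NDIS_HASH_TCP_IPV6;

        /* From NDIS to RSS types and back; no bit may be lost. */
        KASSERT(hn_rss_type_tondis(hn_rss_type_fromndis(ndis)) == ndis,
            ("RSS hash type conversion is not lossless"));
}
#endif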
 1485 
 1486 static void
 1487 hn_rss_mbuf_hash(struct hn_softc *sc, uint32_t mbuf_hash)
 1488 {
 1489         int i;
 1490 
 1491         HN_LOCK_ASSERT(sc);
 1492 
 1493         for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
 1494                 sc->hn_rx_ring[i].hn_mbuf_hash = mbuf_hash;
 1495 }
 1496 
 1497 static void
 1498 hn_vf_rss_fixup(struct hn_softc *sc, bool reconf)
 1499 {
 1500         struct ifnet *ifp, *vf_ifp;
 1501         struct ifrsshash ifrh;
 1502         struct ifrsskey ifrk;
 1503         int error;
 1504         uint32_t my_types, diff_types, mbuf_types = 0;
 1505 
 1506         HN_LOCK_ASSERT(sc);
 1507         KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
 1508             ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));
 1509 
 1510         if (sc->hn_rx_ring_inuse == 1) {
 1511                 /* No RSS on synthetic parts; done. */
 1512                 return;
 1513         }
 1514         if ((sc->hn_rss_hcap & NDIS_HASH_FUNCTION_TOEPLITZ) == 0) {
 1515                 /* Synthetic parts do not support Toeplitz; done. */
 1516                 return;
 1517         }
 1518 
 1519         ifp = sc->hn_ifp;
 1520         vf_ifp = sc->hn_vf_ifp;
 1521 
 1522         /*
 1523          * Extract VF's RSS key.  Only a 40-byte Toeplitz key is
 1524          * supported.
 1525          */
 1526         memset(&ifrk, 0, sizeof(ifrk));
 1527         strlcpy(ifrk.ifrk_name, vf_ifp->if_xname, sizeof(ifrk.ifrk_name));
 1528         error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSKEY, (caddr_t)&ifrk);
 1529         if (error) {
 1530                 if_printf(ifp, "%s SIOCGIFRSSKEY failed: %d\n",
 1531                     vf_ifp->if_xname, error);
 1532                 goto done;
 1533         }
 1534         if (ifrk.ifrk_func != RSS_FUNC_TOEPLITZ) {
 1535                 if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
 1536                     vf_ifp->if_xname, ifrk.ifrk_func);
 1537                 goto done;
 1538         }
 1539         if (ifrk.ifrk_keylen != NDIS_HASH_KEYSIZE_TOEPLITZ) {
 1540                 if_printf(ifp, "%s invalid RSS Toeplitz key length %d\n",
 1541                     vf_ifp->if_xname, ifrk.ifrk_keylen);
 1542                 goto done;
 1543         }
 1544 
 1545         /*
 1546          * Extract VF's RSS hash.  Only Toeplitz is supported.
 1547          */
 1548         memset(&ifrh, 0, sizeof(ifrh));
 1549         strlcpy(ifrh.ifrh_name, vf_ifp->if_xname, sizeof(ifrh.ifrh_name));
 1550         error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSHASH, (caddr_t)&ifrh);
 1551         if (error) {
 1552                 if_printf(ifp, "%s SIOCGIFRSSHASH failed: %d\n",
 1553                     vf_ifp->if_xname, error);
 1554                 goto done;
 1555         }
 1556         if (ifrh.ifrh_func != RSS_FUNC_TOEPLITZ) {
 1557                 if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
 1558                     vf_ifp->if_xname, ifrh.ifrh_func);
 1559                 goto done;
 1560         }
 1561 
 1562         my_types = hn_rss_type_fromndis(sc->hn_rss_hcap);
 1563         if ((ifrh.ifrh_types & my_types) == 0) {
 1564                 /* An empty intersection would disable RSS; ignore it. */
 1565                 if_printf(ifp, "%s intersection of RSS types failed.  "
 1566                     "VF %#x, mine %#x\n", vf_ifp->if_xname,
 1567                     ifrh.ifrh_types, my_types);
 1568                 goto done;
 1569         }
 1570 
 1571         diff_types = my_types ^ ifrh.ifrh_types;
 1572         my_types &= ifrh.ifrh_types;
 1573         mbuf_types = my_types;
 1574 
 1575         /*
 1576          * Detect RSS hash value/type conflicts.
 1577          *
 1578          * NOTE:
 1579          * We don't disable the hash type, but stop delivering the
 1580          * hash value/type through mbufs on the RX path.
 1581          *
 1582          * XXX If HN_CAP_UDPHASH is set in hn_caps, then UDP 4-tuple
 1583          * hash is delivered with type of TCP_IPV4.  This means if
 1584          * UDP_IPV4 is enabled, then TCP_IPV4 should be forced, at
 1585          * least to hn_mbuf_hash.  However, given that _all_ of the
 1586          * NICs implement TCP_IPV4, this will _not_ impose any issues
 1587          * here.
 1588          */
 1589         if ((my_types & RSS_TYPE_IPV4) &&
 1590             (diff_types & ifrh.ifrh_types &
 1591              (RSS_TYPE_TCP_IPV4 | RSS_TYPE_UDP_IPV4))) {
 1592                 /* Conflict; disable IPV4 hash type/value delivery. */
 1593                 if_printf(ifp, "disable IPV4 mbuf hash delivery\n");
 1594                 mbuf_types &= ~RSS_TYPE_IPV4;
 1595         }
 1596         if ((my_types & RSS_TYPE_IPV6) &&
 1597             (diff_types & ifrh.ifrh_types &
 1598              (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
 1599               RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
 1600               RSS_TYPE_IPV6_EX))) {
 1601                 /* Conflict; disable IPV6 hash type/value delivery. */
 1602                 if_printf(ifp, "disable IPV6 mbuf hash delivery\n");
 1603                 mbuf_types &= ~RSS_TYPE_IPV6;
 1604         }
 1605         if ((my_types & RSS_TYPE_IPV6_EX) &&
 1606             (diff_types & ifrh.ifrh_types &
 1607              (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
 1608               RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
 1609               RSS_TYPE_IPV6))) {
 1610                 /* Conflict; disable IPV6_EX hash type/value delivery. */
 1611                 if_printf(ifp, "disable IPV6_EX mbuf hash delivery\n");
 1612                 mbuf_types &= ~RSS_TYPE_IPV6_EX;
 1613         }
 1614         if ((my_types & RSS_TYPE_TCP_IPV6) &&
 1615             (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6_EX)) {
 1616                 /* Conflict; disable TCP_IPV6 hash type/value delivery. */
 1617                 if_printf(ifp, "disable TCP_IPV6 mbuf hash delivery\n");
 1618                 mbuf_types &= ~RSS_TYPE_TCP_IPV6;
 1619         }
 1620         if ((my_types & RSS_TYPE_TCP_IPV6_EX) &&
 1621             (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6)) {
 1622                 /* Conflict; disable TCP_IPV6_EX hash type/value delivery. */
 1623                 if_printf(ifp, "disable TCP_IPV6_EX mbuf hash delivery\n");
 1624                 mbuf_types &= ~RSS_TYPE_TCP_IPV6_EX;
 1625         }
 1626         if ((my_types & RSS_TYPE_UDP_IPV6) &&
 1627             (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6_EX)) {
 1628                 /* Conflict; disable UDP_IPV6 hash type/value delivery. */
 1629                 if_printf(ifp, "disable UDP_IPV6 mbuf hash delivery\n");
 1630                 mbuf_types &= ~RSS_TYPE_UDP_IPV6;
 1631         }
 1632         if ((my_types & RSS_TYPE_UDP_IPV6_EX) &&
 1633             (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6)) {
 1634                 /* Conflict; disable UDP_IPV6_EX hash type/value delivery. */
 1635                 if_printf(ifp, "disable UDP_IPV6_EX mbuf hash delivery\n");
 1636                 mbuf_types &= ~RSS_TYPE_UDP_IPV6_EX;
 1637         }
 1638 
 1639         /*
 1640          * Indirect table does not matter.
 1641          */
 1642 
 1643         sc->hn_rss_hash = (sc->hn_rss_hcap & NDIS_HASH_FUNCTION_MASK) |
 1644             hn_rss_type_tondis(my_types);
 1645         memcpy(sc->hn_rss.rss_key, ifrk.ifrk_key, sizeof(sc->hn_rss.rss_key));
 1646         sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
 1647 
 1648         if (reconf) {
 1649                 error = hn_rss_reconfig(sc);
 1650                 if (error) {
 1651                         /* XXX roll-back? */
 1652                         if_printf(ifp, "hn_rss_reconfig failed: %d\n", error);
 1653                         /* XXX keep going. */
 1654                 }
 1655         }
 1656 done:
 1657         /* Hash deliverability for mbufs. */
 1658         hn_rss_mbuf_hash(sc, hn_rss_type_tondis(mbuf_types));
 1659 }
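
/*
 * For reference, the Toeplitz hash that hn_vf_rss_fixup() insists on:
 * every set bit of the input selects a 32-bit window of the 40-byte key,
 * and the selected windows are XORed together.  A standalone userland
 * sketch (toeplitz_hash() is hypothetical, not this driver's
 * implementation):
 */
#if 0
#include <stdint.h>
#include <stddef.h>

static uint32_t
toeplitz_hash(const uint8_t *key, size_t keylen,
    const uint8_t *data, size_t datalen)
{
        uint32_t hash = 0, window;
        size_t i;
        int b;

        /* Initial window: the left-most 32 bits of the key. */
        window = (uint32_t)key[0] << 24 | (uint32_t)key[1] << 16 |
            (uint32_t)key[2] << 8 | key[3];
        for (i = 0; i < datalen; i++) {
                for (b = 7; b >= 0; b--) {
                        if (data[i] & (1 << b))
                                hash ^= window;
                        /* Slide the window right by one key bit. */
                        window <<= 1;
                        if (i + 4 < keylen && (key[i + 4] & (1 << b)))
                                window |= 1;
                }
        }
        return (hash);
}
#endif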
 1660 
 1661 static void
 1662 hn_vf_rss_restore(struct hn_softc *sc)
 1663 {
 1664 
 1665         HN_LOCK_ASSERT(sc);
 1666         KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
 1667             ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));
 1668 
 1669         if (sc->hn_rx_ring_inuse == 1)
 1670                 goto done;
 1671 
 1672         /*
 1673          * Restore hash types.  Key does _not_ matter.
 1674          */
 1675         if (sc->hn_rss_hash != sc->hn_rss_hcap) {
 1676                 int error;
 1677 
 1678                 sc->hn_rss_hash = sc->hn_rss_hcap;
 1679                 error = hn_rss_reconfig(sc);
 1680                 if (error) {
 1681                         if_printf(sc->hn_ifp, "hn_rss_reconfig failed: %d\n",
 1682                             error);
 1683                         /* XXX keep going. */
 1684                 }
 1685         }
 1686 done:
 1687         /* Hash deliverability for mbufs. */
 1688         hn_rss_mbuf_hash(sc, NDIS_HASH_ALL);
 1689 }
 1690 
 1691 static void
 1692 hn_xpnt_vf_setready(struct hn_softc *sc)
 1693 {
 1694         struct ifnet *ifp, *vf_ifp;
 1695         struct ifreq ifr;
 1696 
 1697         HN_LOCK_ASSERT(sc);
 1698         ifp = sc->hn_ifp;
 1699         vf_ifp = sc->hn_vf_ifp;
 1700 
 1701         /*
 1702          * Mark the VF ready.
 1703          */
 1704         sc->hn_vf_rdytick = 0;
 1705 
 1706         /*
 1707          * Save information for restoration.
 1708          */
 1709         sc->hn_saved_caps = ifp->if_capabilities;
 1710         sc->hn_saved_tsomax = ifp->if_hw_tsomax;
 1711         sc->hn_saved_tsosegcnt = ifp->if_hw_tsomaxsegcount;
 1712         sc->hn_saved_tsosegsz = ifp->if_hw_tsomaxsegsize;
 1713 
 1714         /*
 1715          * Intersect supported/enabled capabilities.
 1716          *
 1717          * NOTE:
 1718          * if_hwassist is not changed here.
 1719          */
 1720         ifp->if_capabilities &= vf_ifp->if_capabilities;
 1721         ifp->if_capenable &= ifp->if_capabilities;
 1722 
 1723         /*
 1724          * Fix TSO settings.
 1725          */
 1726         if (ifp->if_hw_tsomax > vf_ifp->if_hw_tsomax)
 1727                 ifp->if_hw_tsomax = vf_ifp->if_hw_tsomax;
 1728         if (ifp->if_hw_tsomaxsegcount > vf_ifp->if_hw_tsomaxsegcount)
 1729                 ifp->if_hw_tsomaxsegcount = vf_ifp->if_hw_tsomaxsegcount;
 1730         if (ifp->if_hw_tsomaxsegsize > vf_ifp->if_hw_tsomaxsegsize)
 1731                 ifp->if_hw_tsomaxsegsize = vf_ifp->if_hw_tsomaxsegsize;
 1732 
 1733         /*
 1734          * Change VF's enabled capabilities.
 1735          */
 1736         memset(&ifr, 0, sizeof(ifr));
 1737         strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
 1738         ifr.ifr_reqcap = ifp->if_capenable;
 1739         hn_xpnt_vf_iocsetcaps(sc, &ifr);
 1740 
 1741         if (ifp->if_mtu != ETHERMTU) {
 1742                 int error;
 1743 
 1744                 /*
 1745                  * Change VF's MTU.
 1746                  */
 1747                 memset(&ifr, 0, sizeof(ifr));
 1748                 strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
 1749                 ifr.ifr_mtu = ifp->if_mtu;
 1750                 error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, (caddr_t)&ifr);
 1751                 if (error) {
 1752                         if_printf(ifp, "%s SIOCSIFMTU %u failed\n",
 1753                             vf_ifp->if_xname, ifp->if_mtu);
 1754                         if (ifp->if_mtu > ETHERMTU) {
 1755                                 if_printf(ifp, "change MTU to %d\n", ETHERMTU);
 1756 
 1757                                 /*
 1758                                  * XXX
 1759                                  * No need to adjust the synthetic parts' MTU;
 1760                  * failure of the adjustment would cause us
 1761                  * endless headaches.
 1762                                  */
 1763                                 ifp->if_mtu = ETHERMTU;
 1764                                 hn_mtu_change_fixup(sc);
 1765                         }
 1766                 }
 1767         }
 1768 }
 1769 
 1770 static bool
 1771 hn_xpnt_vf_isready(struct hn_softc *sc)
 1772 {
 1773 
 1774         HN_LOCK_ASSERT(sc);
 1775 
 1776         if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL)
 1777                 return (false);
 1778 
 1779         if (sc->hn_vf_rdytick == 0)
 1780                 return (true);
 1781 
 1782         if (sc->hn_vf_rdytick > ticks)
 1783                 return (false);
 1784 
 1785         /* Mark VF as ready. */
 1786         hn_xpnt_vf_setready(sc);
 1787         return (true);
 1788 }
 1789 
 1790 static void
 1791 hn_xpnt_vf_setenable(struct hn_softc *sc)
 1792 {
 1793         int i;
 1794 
 1795         HN_LOCK_ASSERT(sc);
 1796 
 1797         /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
 1798         rm_wlock(&sc->hn_vf_lock);
 1799         sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED;
 1800         rm_wunlock(&sc->hn_vf_lock);
 1801 
 1802         for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
 1803                 sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_XPNT_VF;
 1804 }
 1805 
 1806 static void
 1807 hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf)
 1808 {
 1809         int i;
 1810 
 1811         HN_LOCK_ASSERT(sc);
 1812 
 1813         /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
 1814         rm_wlock(&sc->hn_vf_lock);
 1815         sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED;
 1816         if (clear_vf)
 1817                 sc->hn_vf_ifp = NULL;
 1818         rm_wunlock(&sc->hn_vf_lock);
 1819 
 1820         for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
 1821                 sc->hn_rx_ring[i].hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF;
 1822 }
 1823 
 1824 static void
 1825 hn_xpnt_vf_init(struct hn_softc *sc)
 1826 {
 1827         int error;
 1828 
 1829         HN_LOCK_ASSERT(sc);
 1830 
 1831         KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
 1832             ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
 1833 
 1834         if (bootverbose) {
 1835                 if_printf(sc->hn_ifp, "try bringing up %s\n",
 1836                     sc->hn_vf_ifp->if_xname);
 1837         }
 1838 
 1839         /*
 1840          * Bring the VF up.
 1841          */
 1842         hn_xpnt_vf_saveifflags(sc);
 1843         sc->hn_vf_ifp->if_flags |= IFF_UP;
 1844         error = hn_xpnt_vf_iocsetflags(sc);
 1845         if (error) {
 1846                 if_printf(sc->hn_ifp, "bringing up %s failed: %d\n",
 1847                     sc->hn_vf_ifp->if_xname, error);
 1848                 return;
 1849         }
 1850 
 1851         /*
 1852          * NOTE:
 1853          * Datapath setting must happen _after_ bringing the VF up.
 1854          */
 1855         hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
 1856 
 1857         /*
 1858          * NOTE:
 1859          * Fix up RSS related bits _after_ the VF is brought up, since
 1860          * many VFs generate their RSS key during initialization.
 1861          */
 1862         hn_vf_rss_fixup(sc, true);
 1863 
 1864         /* Mark transparent mode VF as enabled. */
 1865         hn_xpnt_vf_setenable(sc);
 1866 }
 1867 
 1868 static void
 1869 hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused)
 1870 {
 1871         struct hn_softc *sc = xsc;
 1872 
 1873         HN_LOCK(sc);
 1874 
 1875         if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
 1876                 goto done;
 1877         if (sc->hn_vf_ifp == NULL)
 1878                 goto done;
 1879         if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
 1880                 goto done;
 1881 
 1882         if (sc->hn_vf_rdytick != 0) {
 1883                 /* Mark VF as ready. */
 1884                 hn_xpnt_vf_setready(sc);
 1885         }
 1886 
 1887         if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) {
 1888                 /*
 1889                  * Delayed VF initialization.
 1890                  */
 1891                 if (bootverbose) {
 1892                         if_printf(sc->hn_ifp, "delayed initialize %s\n",
 1893                             sc->hn_vf_ifp->if_xname);
 1894                 }
 1895                 hn_xpnt_vf_init(sc);
 1896         }
 1897 done:
 1898         HN_UNLOCK(sc);
 1899 }
 1900 
 1901 static void
 1902 hn_ifnet_attevent(void *xsc, struct ifnet *ifp)
 1903 {
 1904         struct hn_softc *sc = xsc;
 1905 
 1906         HN_LOCK(sc);
 1907 
 1908         if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
 1909                 goto done;
 1910 
 1911         if (!hn_ismyvf(sc, ifp))
 1912                 goto done;
 1913 
 1914         if (sc->hn_vf_ifp != NULL) {
 1915                 if_printf(sc->hn_ifp, "%s was attached as VF\n",
 1916                     sc->hn_vf_ifp->if_xname);
 1917                 goto done;
 1918         }
 1919 
 1920         if (hn_xpnt_vf && ifp->if_start != NULL) {
 1921                 /*
 1922                  * ifnet.if_start is _not_ supported in transparent
 1923                  * VF mode, mainly due to the IFF_DRV_OACTIVE flag.
 1924                  */
 1925                 if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported "
 1926                     "in transparent VF mode.\n", ifp->if_xname);
 1927                 goto done;
 1928         }
 1929 
 1930         rm_wlock(&hn_vfmap_lock);
 1931 
 1932         if (ifp->if_index >= hn_vfmap_size) {
 1933                 struct ifnet **newmap;
 1934                 int newsize;
 1935 
 1936                 newsize = ifp->if_index + HN_VFMAP_SIZE_DEF;
 1937                 newmap = malloc(sizeof(struct ifnet *) * newsize, M_DEVBUF,
 1938                     M_WAITOK | M_ZERO);
 1939 
 1940                 memcpy(newmap, hn_vfmap,
 1941                     sizeof(struct ifnet *) * hn_vfmap_size);
 1942                 free(hn_vfmap, M_DEVBUF);
 1943                 hn_vfmap = newmap;
 1944                 hn_vfmap_size = newsize;
 1945         }
 1946         KASSERT(hn_vfmap[ifp->if_index] == NULL,
 1947             ("%s: ifindex %d was mapped to %s",
 1948              ifp->if_xname, ifp->if_index, hn_vfmap[ifp->if_index]->if_xname));
 1949         hn_vfmap[ifp->if_index] = sc->hn_ifp;
 1950 
 1951         rm_wunlock(&hn_vfmap_lock);
 1952 
 1953         /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
 1954         rm_wlock(&sc->hn_vf_lock);
 1955         KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
 1956             ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
 1957         sc->hn_vf_ifp = ifp;
 1958         rm_wunlock(&sc->hn_vf_lock);
 1959 
 1960         if (hn_xpnt_vf) {
 1961                 int wait_ticks;
 1962 
 1963                 /*
 1964                  * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp.
 1965                  * Save vf_ifp's current if_input for later restoration.
 1966                  */
 1967                 sc->hn_vf_input = ifp->if_input;
 1968                 ifp->if_input = hn_xpnt_vf_input;
 1969 
 1970                 /*
 1971                  * Stop link status management; use the VF's.
 1972                  */
 1973                 hn_suspend_mgmt(sc);
 1974 
 1975                 /*
 1976                  * Give the VF some time to complete its attach routine.
 1977                  */
 1978                 wait_ticks = hn_xpnt_vf_attwait * hz;
 1979                 sc->hn_vf_rdytick = ticks + wait_ticks;
 1980 
 1981                 taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init,
 1982                     wait_ticks);
 1983         }
 1984 done:
 1985         HN_UNLOCK(sc);
 1986 }
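
/*
 * hn_vfmap above is a grow-on-demand array indexed by if_index: when an
 * index falls outside the map, a larger zeroed copy replaces it under the
 * writer lock.  The growth step in isolation (grow_map() and its slack
 * parameter are hypothetical):
 */
#if 0
#include <stdlib.h>
#include <string.h>

static void **
grow_map(void **map, size_t *size, size_t want, size_t slack)
{
        size_t newsize = want + slack;  /* over-allocate to amortize */
        void **newmap = calloc(newsize, sizeof(void *));

        if (newmap == NULL)
                return (NULL);
        memcpy(newmap, map, *size * sizeof(void *));
        free(map);
        *size = newsize;
        return (newmap);
}
#endif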
 1987 
 1988 static void
 1989 hn_ifnet_detevent(void *xsc, struct ifnet *ifp)
 1990 {
 1991         struct hn_softc *sc = xsc;
 1992 
 1993         HN_LOCK(sc);
 1994 
 1995         if (sc->hn_vf_ifp == NULL)
 1996                 goto done;
 1997 
 1998         if (!hn_ismyvf(sc, ifp))
 1999                 goto done;
 2000 
 2001         if (hn_xpnt_vf) {
 2002                 /*
 2003                  * Make sure that the delayed initialization is not running.
 2004                  *
 2005                  * NOTE:
 2006                  * - This lock _must_ be released, since the hn_vf_init task
 2007                  *   will try holding this lock.
 2008                  * - It is safe to release this lock here, since
 2009                  *   hn_ifnet_attevent() is interlocked by hn_vf_ifp.
 2010                  *
 2011                  * XXX racy, if hn(4) is ever detached.
 2012                  */
 2013                 HN_UNLOCK(sc);
 2014                 taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init);
 2015                 HN_LOCK(sc);
 2016 
 2017                 KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved",
 2018                     sc->hn_ifp->if_xname));
 2019                 ifp->if_input = sc->hn_vf_input;
 2020                 sc->hn_vf_input = NULL;
 2021 
 2022                 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) &&
 2023                     (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED))
 2024                         hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
 2025 
 2026                 if (sc->hn_vf_rdytick == 0) {
 2027                         /*
 2028                          * The VF was ready; restore some settings.
 2029                          */
 2030                         sc->hn_ifp->if_capabilities = sc->hn_saved_caps;
 2031                         /*
 2032                          * NOTE:
 2033                          * There is _no_ need to fix up if_capenable and
 2034                          * if_hwassist, since the if_capabilities before
 2035                          * restoration was an intersection of the VF's
 2036                          * if_capabilities and the synthetic device's
 2037                          * if_capabilities.
 2038                          */
 2039                         sc->hn_ifp->if_hw_tsomax = sc->hn_saved_tsomax;
 2040                         sc->hn_ifp->if_hw_tsomaxsegcount =
 2041                             sc->hn_saved_tsosegcnt;
 2042                         sc->hn_ifp->if_hw_tsomaxsegsize = sc->hn_saved_tsosegsz;
 2043                 }
 2044 
 2045                 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
 2046                         /*
 2047                          * Restore RSS settings.
 2048                          */
 2049                         hn_vf_rss_restore(sc);
 2050 
 2051                         /*
 2052                          * Resume link status management, which was suspended
 2053                          * by hn_ifnet_attevent().
 2054                          */
 2055                         hn_resume_mgmt(sc);
 2056                 }
 2057         }
 2058 
 2059         /* Mark transparent mode VF as disabled. */
 2060         hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */);
 2061 
 2062         rm_wlock(&hn_vfmap_lock);
 2063 
 2064         KASSERT(ifp->if_index < hn_vfmap_size,
 2065             ("ifindex %d, vfmapsize %d", ifp->if_index, hn_vfmap_size));
 2066         if (hn_vfmap[ifp->if_index] != NULL) {
 2067                 KASSERT(hn_vfmap[ifp->if_index] == sc->hn_ifp,
 2068                     ("%s: ifindex %d was mapped to %s",
 2069                      ifp->if_xname, ifp->if_index,
 2070                      hn_vfmap[ifp->if_index]->if_xname));
 2071                 hn_vfmap[ifp->if_index] = NULL;
 2072         }
 2073 
 2074         rm_wunlock(&hn_vfmap_lock);
 2075 done:
 2076         HN_UNLOCK(sc);
 2077 }
 2078 
 2079 static void
 2080 hn_ifnet_lnkevent(void *xsc, struct ifnet *ifp, int link_state)
 2081 {
 2082         struct hn_softc *sc = xsc;
 2083 
 2084         if (sc->hn_vf_ifp == ifp)
 2085                 if_link_state_change(sc->hn_ifp, link_state);
 2086 }
 2087 
 2088 static int
 2089 hn_probe(device_t dev)
 2090 {
 2091 
 2092         if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) == 0) {
 2093                 device_set_desc(dev, "Hyper-V Network Interface");
 2094                 return BUS_PROBE_DEFAULT;
 2095         }
 2096         return ENXIO;
 2097 }
 2098 
 2099 static int
 2100 hn_attach(device_t dev)
 2101 {
 2102         struct hn_softc *sc = device_get_softc(dev);
 2103         struct sysctl_oid_list *child;
 2104         struct sysctl_ctx_list *ctx;
 2105         uint8_t eaddr[ETHER_ADDR_LEN];
 2106         struct ifnet *ifp = NULL;
 2107         int error, ring_cnt, tx_ring_cnt;
 2108         uint32_t mtu;
 2109 
 2110         sc->hn_dev = dev;
 2111         sc->hn_prichan = vmbus_get_channel(dev);
 2112         HN_LOCK_INIT(sc);
 2113         rm_init(&sc->hn_vf_lock, "hnvf");
 2114         if (hn_xpnt_vf && hn_xpnt_vf_accbpf)
 2115                 sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
 2116 
 2117         /*
 2118          * Initialize these tunables once.
 2119          */
 2120         sc->hn_agg_size = hn_tx_agg_size;
 2121         sc->hn_agg_pkts = hn_tx_agg_pkts;
 2122 
 2123         /*
 2124          * Setup taskqueue for transmission.
 2125          */
 2126         if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
 2127                 int i;
 2128 
 2129                 sc->hn_tx_taskqs =
 2130                     malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
 2131                     M_DEVBUF, M_WAITOK);
 2132                 for (i = 0; i < hn_tx_taskq_cnt; ++i) {
 2133                         sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
 2134                             M_WAITOK, taskqueue_thread_enqueue,
 2135                             &sc->hn_tx_taskqs[i]);
 2136                         taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
 2137                             "%s tx%d", device_get_nameunit(dev), i);
 2138                 }
 2139         } else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
 2140                 sc->hn_tx_taskqs = hn_tx_taskque;
 2141         }
 2142 
 2143         /*
 2144          * Setup taskqueue for management tasks, e.g. link status.
 2145          */
 2146         sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
 2147             taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
 2148         taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
 2149             device_get_nameunit(dev));
 2150         TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
 2151         TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
 2152         TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
 2153             hn_netchg_status_taskfunc, sc);
 2154 
 2155         if (hn_xpnt_vf) {
 2156                 /*
 2157                  * Setup taskqueue for VF tasks, e.g. delayed VF bringing up.
 2158                  */
 2159                 sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK,
 2160                     taskqueue_thread_enqueue, &sc->hn_vf_taskq);
 2161                 taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf",
 2162                     device_get_nameunit(dev));
 2163                 TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0,
 2164                     hn_xpnt_vf_init_taskfunc, sc);
 2165         }
 2166 
 2167         /*
 2168          * Allocate ifnet and set up its name early, so that if_printf
 2169          * can be used by functions that will be called after
 2170          * ether_ifattach().
 2171          */
 2172         ifp = sc->hn_ifp = if_alloc(IFT_ETHER);
 2173         ifp->if_softc = sc;
 2174         if_initname(ifp, device_get_name(dev), device_get_unit(dev));
 2175 
 2176         /*
 2177          * Initialize ifmedia early so that it can be unconditionally
 2178          * destroyed if an error happens later on.
 2179          */
 2180         ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);
 2181 
 2182         /*
 2183          * Figure out the # of RX rings (ring_cnt) and the # of TX rings
 2184          * to use (tx_ring_cnt).
 2185          *
 2186          * NOTE:
 2187          * The # of RX rings to use is the same as the # of channels to use.
 2188          */
 2189         ring_cnt = hn_chan_cnt;
 2190         if (ring_cnt <= 0) {
 2191                 /* Default */
 2192                 ring_cnt = mp_ncpus;
 2193                 if (ring_cnt > HN_RING_CNT_DEF_MAX)
 2194                         ring_cnt = HN_RING_CNT_DEF_MAX;
 2195         } else if (ring_cnt > mp_ncpus) {
 2196                 ring_cnt = mp_ncpus;
 2197         }
 2198 #ifdef RSS
 2199         if (ring_cnt > rss_getnumbuckets())
 2200                 ring_cnt = rss_getnumbuckets();
 2201 #endif
 2202 
 2203         tx_ring_cnt = hn_tx_ring_cnt;
 2204         if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
 2205                 tx_ring_cnt = ring_cnt;
 2206 #ifdef HN_IFSTART_SUPPORT
 2207         if (hn_use_if_start) {
 2208                 /* ifnet.if_start only needs one TX ring. */
 2209                 tx_ring_cnt = 1;
 2210         }
 2211 #endif
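
        /*
         * The ring-count selection above, distilled: the RX ring count
         * defaults to the CPU count capped by HN_RING_CNT_DEF_MAX, an
         * explicit tunable is clamped to the CPU count, and the TX ring
         * count may never exceed the RX ring count.  A standalone sketch
         * omitting the RSS-bucket and if_start special cases (ring_cnts()
         * is hypothetical):
         */
#if 0
static void
ring_cnts(int tunable_rx, int tunable_tx, int ncpus, int def_max,
    int *rx_cnt, int *tx_cnt)
{
        int rx = tunable_rx;

        if (rx <= 0) {
                rx = ncpus;             /* default: one ring per CPU */
                if (rx > def_max)
                        rx = def_max;
        } else if (rx > ncpus) {
                rx = ncpus;             /* never more rings than CPUs */
        }
        *rx_cnt = rx;
        *tx_cnt = (tunable_tx <= 0 || tunable_tx > rx) ? rx : tunable_tx;
}
#endif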
 2212 
 2213         /*
 2214          * Set the leader CPU for channels.
 2215          */
 2216         sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;
 2217 
 2218         /*
 2219          * Create enough TX/RX rings, even if only limited number of
 2220          * channels can be allocated.
 2221          */
 2222         error = hn_create_tx_data(sc, tx_ring_cnt);
 2223         if (error)
 2224                 goto failed;
 2225         error = hn_create_rx_data(sc, ring_cnt);
 2226         if (error)
 2227                 goto failed;
 2228 
 2229         /*
 2230          * Create transaction context for NVS and RNDIS transactions.
 2231          */
 2232         sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
 2233             HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
 2234         if (sc->hn_xact == NULL) {
 2235                 error = ENXIO;
 2236                 goto failed;
 2237         }
 2238 
 2239         /*
 2240          * Install orphan handler for the revocation of this device's
 2241          * primary channel.
 2242          *
 2243          * NOTE:
 2244          * The processing order is critical here:
 2245          * Install the orphan handler, _before_ testing whether this
 2246          * device's primary channel has been revoked or not.
 2247          */
 2248         vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
 2249         if (vmbus_chan_is_revoked(sc->hn_prichan)) {
 2250                 error = ENXIO;
 2251                 goto failed;
 2252         }
 2253 
 2254         /*
 2255          * Attach the synthetic parts, i.e. NVS and RNDIS.
 2256          */
 2257         error = hn_synth_attach(sc, ETHERMTU);
 2258         if (error)
 2259                 goto failed;
 2260 
 2261         error = hn_rndis_get_eaddr(sc, eaddr);
 2262         if (error)
 2263                 goto failed;
 2264 
 2265         error = hn_rndis_get_mtu(sc, &mtu);
 2266         if (error)
 2267                 mtu = ETHERMTU;
 2268         else if (bootverbose)
 2269                 device_printf(dev, "RNDIS mtu %u\n", mtu);
 2270 
 2271 #if __FreeBSD_version >= 1100099
 2272         if (sc->hn_rx_ring_inuse > 1) {
 2273                 /*
 2274                  * Reduce TCP segment aggregation limit for multiple
 2275                  * RX rings to increase ACK timeliness.
 2276                  */
 2277                 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
 2278         }
 2279 #endif
 2280 
 2281         /*
 2282          * Fixup TX/RX stuffs after synthetic parts are attached.
 2283          */
 2284         hn_fixup_tx_data(sc);
 2285         hn_fixup_rx_data(sc);
 2286 
 2287         ctx = device_get_sysctl_ctx(dev);
 2288         child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
 2289         SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
 2290             &sc->hn_nvs_ver, 0, "NVS version");
 2291         SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
 2292             CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
 2293             hn_ndis_version_sysctl, "A", "NDIS version");
 2294         SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
 2295             CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
 2296             hn_caps_sysctl, "A", "capabilities");
 2297         SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
 2298             CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
 2299             hn_hwassist_sysctl, "A", "hwassist");
 2300         SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_max",
 2301             CTLFLAG_RD, &ifp->if_hw_tsomax, 0, "max TSO size");
 2302         SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegcnt",
 2303             CTLFLAG_RD, &ifp->if_hw_tsomaxsegcount, 0,
 2304             "max # of TSO segments");
 2305         SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegsz",
 2306             CTLFLAG_RD, &ifp->if_hw_tsomaxsegsize, 0,
 2307             "max size of TSO segment");
 2308         SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
 2309             CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
 2310             hn_rxfilter_sysctl, "A", "rxfilter");
 2311         SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
 2312             CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
 2313             hn_rss_hash_sysctl, "A", "RSS hash");
 2314         SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hashcap",
 2315             CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
 2316             hn_rss_hcap_sysctl, "A", "RSS hash capabilities");
 2317         SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "mbuf_hash",
 2318             CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
 2319             hn_rss_mbuf_sysctl, "A", "RSS hash for mbufs");
 2320         SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
 2321             CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
 2322 #ifndef RSS
 2323         /*
 2324          * Don't allow RSS key/indirect table changes if option RSS is defined.
 2325          */
 2326         SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
 2327             CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
 2328             hn_rss_key_sysctl, "IU", "RSS key");
 2329         SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
 2330             CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
 2331             hn_rss_ind_sysctl, "IU", "RSS indirect table");
 2332 #endif
 2333         SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
 2334             CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
 2335             "RNDIS offered packet transmission aggregation size limit");
 2336         SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
 2337             CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
 2338             "RNDIS offered packet transmission aggregation count limit");
 2339         SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
 2340             CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
 2341             "RNDIS packet transmission aggregation alignment");
 2342         SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
 2343             CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
 2344             hn_txagg_size_sysctl, "I",
 2345             "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
 2346         SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
 2347             CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
 2348             hn_txagg_pkts_sysctl, "I",
 2349             "Packet transmission aggregation packets, "
 2350             "0 -- disable, -1 -- auto");
 2351         SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling",
 2352             CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
 2353             hn_polling_sysctl, "I",
 2354             "Polling frequency: [100,1000000], 0 disable polling");
 2355         SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf",
 2356             CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
 2357             hn_vf_sysctl, "A", "Virtual Function's name");
 2358         if (!hn_xpnt_vf) {
 2359                 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf",
 2360                     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
 2361                     hn_rxvf_sysctl, "A", "activated Virtual Function's name");
 2362         } else {
 2363                 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled",
 2364                     CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
 2365                     hn_xpnt_vf_enabled_sysctl, "I",
 2366                     "Transparent VF enabled");
 2367                 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf",
 2368                     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
 2369                     hn_xpnt_vf_accbpf_sysctl, "I",
 2370                     "Accurate BPF for transparent VF");
 2371         }
 2372 
 2373         SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rsc_switch",
 2374             CTLTYPE_UINT | CTLFLAG_RW, sc, 0, hn_rsc_sysctl, "A",
 2375             "switch to rsc");
 2376 
 2377         /*
 2378          * Setup the ifmedia, which has been initialized earlier.
 2379          */
 2380         ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
 2381         ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
 2382         /* XXX ifmedia_set really should do this for us */
 2383         sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;
 2384 
 2385         /*
 2386          * Setup the ifnet for this interface.
 2387          */
 2388 
 2389         ifp->if_baudrate = IF_Gbps(10);
 2390         ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
 2391         ifp->if_ioctl = hn_ioctl;
 2392         ifp->if_init = hn_init;
 2393 #ifdef HN_IFSTART_SUPPORT
 2394         if (hn_use_if_start) {
 2395                 int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);
 2396 
 2397                 ifp->if_start = hn_start;
 2398                 IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
 2399                 ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
 2400                 IFQ_SET_READY(&ifp->if_snd);
 2401         } else
 2402 #endif
 2403         {
 2404                 ifp->if_transmit = hn_transmit;
 2405                 ifp->if_qflush = hn_xmit_qflush;
 2406         }
 2407 
 2408         ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE;
 2409 #ifdef foo
 2410         /* We can't distinguish IPv6 packets from IPv4 packets on the RX path. */
 2411         ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
 2412 #endif
 2413         if (sc->hn_caps & HN_CAP_VLAN) {
 2414                 /* XXX not sure about VLAN_MTU. */
 2415                 ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
 2416         }
 2417 
 2418         ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
 2419         if (ifp->if_hwassist & HN_CSUM_IP_MASK)
 2420                 ifp->if_capabilities |= IFCAP_TXCSUM;
 2421         if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
 2422                 ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
 2423         if (sc->hn_caps & HN_CAP_TSO4) {
 2424                 ifp->if_capabilities |= IFCAP_TSO4;
 2425                 ifp->if_hwassist |= CSUM_IP_TSO;
 2426         }
 2427         if (sc->hn_caps & HN_CAP_TSO6) {
 2428                 ifp->if_capabilities |= IFCAP_TSO6;
 2429                 ifp->if_hwassist |= CSUM_IP6_TSO;
 2430         }
 2431 
 2432         /* Enable all available capabilities by default. */
 2433         ifp->if_capenable = ifp->if_capabilities;
 2434 
 2435         /*
 2436          * Disable IPv6 TSO and TXCSUM by default; they can still
 2437          * be enabled through SIOCSIFCAP.
 2438          */
 2439         ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
 2440         ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);
 2441 
 2442         if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
 2443                 /*
 2444                  * Lock hn_set_tso_maxsize() to simplify its
 2445                  * internal logic.
 2446                  */
 2447                 HN_LOCK(sc);
 2448                 hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
 2449                 HN_UNLOCK(sc);
 2450                 ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
 2451                 ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
 2452         }
 2453 
 2454         ether_ifattach(ifp, eaddr);
 2455 
 2456         if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
 2457                 if_printf(ifp, "TSO segcnt %u segsz %u\n",
 2458                     ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
 2459         }
 2460         if (mtu < ETHERMTU) {
 2461                 if_printf(ifp, "fixup mtu %u -> %u\n", ifp->if_mtu, mtu);
 2462                 ifp->if_mtu = mtu;
 2463         }
 2464 
 2465         /* Inform the upper layer about long frame support. */
 2466         ifp->if_hdrlen = sizeof(struct ether_vlan_header);
 2467 
 2468         /*
 2469          * Kick off link status check.
 2470          */
 2471         sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
 2472         hn_update_link_status(sc);
 2473 
 2474         if (!hn_xpnt_vf) {
 2475                 sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event,
 2476                     hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY);
 2477                 sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event,
 2478                     hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY);
 2479         } else {
 2480                 sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event,
 2481                     hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY);
 2482         }
 2483 
 2484         /*
 2485          * NOTE:
 2486          * Subscribe to the ether_ifattach event instead of the ifnet_arrival
 2487          * event, since the interface's LLADDR is needed; the LLADDR is not
 2488          * available when the ifnet_arrival event is triggered.
 2489          */
 2490         sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event,
 2491             hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY);
 2492         sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event,
 2493             hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY);
 2494 
 2495         return (0);
 2496 failed:
 2497         if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
 2498                 hn_synth_detach(sc);
 2499         hn_detach(dev);
 2500         return (error);
 2501 }
 2502 
 2503 static int
 2504 hn_detach(device_t dev)
 2505 {
 2506         struct hn_softc *sc = device_get_softc(dev);
 2507         struct ifnet *ifp = sc->hn_ifp, *vf_ifp;
 2508 
 2509         if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
 2510                 /*
 2511                  * In case the vmbus missed the orphan handler
 2512                  * installation.
 2513                  */
 2514                 vmbus_xact_ctx_orphan(sc->hn_xact);
 2515         }
 2516 
 2517         if (sc->hn_ifaddr_evthand != NULL)
 2518                 EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand);
 2519         if (sc->hn_ifnet_evthand != NULL)
 2520                 EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand);
 2521         if (sc->hn_ifnet_atthand != NULL) {
 2522                 EVENTHANDLER_DEREGISTER(ether_ifattach_event,
 2523                     sc->hn_ifnet_atthand);
 2524         }
 2525         if (sc->hn_ifnet_dethand != NULL) {
 2526                 EVENTHANDLER_DEREGISTER(ifnet_departure_event,
 2527                     sc->hn_ifnet_dethand);
 2528         }
 2529         if (sc->hn_ifnet_lnkhand != NULL)
 2530                 EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand);
 2531 
 2532         vf_ifp = sc->hn_vf_ifp;
 2533         __compiler_membar();
 2534         if (vf_ifp != NULL)
 2535                 hn_ifnet_detevent(sc, vf_ifp);
 2536 
 2537         if (device_is_attached(dev)) {
 2538                 HN_LOCK(sc);
 2539                 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
 2540                         if (ifp->if_drv_flags & IFF_DRV_RUNNING)
 2541                                 hn_stop(sc, true);
 2542                         /*
 2543                          * NOTE:
 2544                          * hn_stop() only suspends the data path, so
 2545                          * management tasks have to be suspended manually here.
 2546                          */
 2547                         hn_suspend_mgmt(sc);
 2548                         hn_synth_detach(sc);
 2549                 }
 2550                 HN_UNLOCK(sc);
 2551                 ether_ifdetach(ifp);
 2552         }
 2553 
 2554         ifmedia_removeall(&sc->hn_media);
 2555         hn_destroy_rx_data(sc);
 2556         hn_destroy_tx_data(sc);
 2557 
 2558         if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
 2559                 int i;
 2560 
 2561                 for (i = 0; i < hn_tx_taskq_cnt; ++i)
 2562                         taskqueue_free(sc->hn_tx_taskqs[i]);
 2563                 free(sc->hn_tx_taskqs, M_DEVBUF);
 2564         }
 2565         taskqueue_free(sc->hn_mgmt_taskq0);
 2566         if (sc->hn_vf_taskq != NULL)
 2567                 taskqueue_free(sc->hn_vf_taskq);
 2568 
 2569         if (sc->hn_xact != NULL) {
 2570                 /*
 2571                  * Uninstall the orphan handler _before_ the xact is
 2572                  * destroyed.
 2573                  */
 2574                 vmbus_chan_unset_orphan(sc->hn_prichan);
 2575                 vmbus_xact_ctx_destroy(sc->hn_xact);
 2576         }
 2577 
 2578         if_free(ifp);
 2579 
 2580         HN_LOCK_DESTROY(sc);
 2581         rm_destroy(&sc->hn_vf_lock);
 2582         return (0);
 2583 }
 2584 
 2585 static int
 2586 hn_shutdown(device_t dev)
 2587 {
 2588 
 2589         return (0);
 2590 }
 2591 
 2592 static void
 2593 hn_link_status(struct hn_softc *sc)
 2594 {
 2595         uint32_t link_status;
 2596         int error;
 2597 
 2598         error = hn_rndis_get_linkstatus(sc, &link_status);
 2599         if (error) {
 2600                 /* XXX what to do? */
 2601                 return;
 2602         }
 2603 
 2604         if (link_status == NDIS_MEDIA_STATE_CONNECTED)
 2605                 sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
 2606         else
 2607                 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
 2608         if_link_state_change(sc->hn_ifp,
 2609             (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
 2610             LINK_STATE_UP : LINK_STATE_DOWN);
 2611 }
 2612 
 2613 static void
 2614 hn_link_taskfunc(void *xsc, int pending __unused)
 2615 {
 2616         struct hn_softc *sc = xsc;
 2617 
 2618         if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
 2619                 return;
 2620         hn_link_status(sc);
 2621 }
 2622 
 2623 static void
 2624 hn_netchg_init_taskfunc(void *xsc, int pending __unused)
 2625 {
 2626         struct hn_softc *sc = xsc;
 2627 
 2628         /* Prevent any link status checks from running. */
 2629         sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;
 2630 
 2631         /*
 2632          * Fake up a [link down --> link up] state change; a 5-second
 2633          * delay is used, which closely simulates the miibus reaction
 2634          * to a link down event.
 2635          */
 2636         sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
 2637         if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
 2638         taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
 2639             &sc->hn_netchg_status, 5 * hz);
 2640 }
 2641 
 2642 static void
 2643 hn_netchg_status_taskfunc(void *xsc, int pending __unused)
 2644 {
 2645         struct hn_softc *sc = xsc;
 2646 
 2647         /* Re-allow link status checks. */
 2648         sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
 2649         hn_link_status(sc);
 2650 }
 2651 
 2652 static void
 2653 hn_update_link_status(struct hn_softc *sc)
 2654 {
 2655 
 2656         if (sc->hn_mgmt_taskq != NULL)
 2657                 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
 2658 }
 2659 
 2660 static void
 2661 hn_change_network(struct hn_softc *sc)
 2662 {
 2663 
 2664         if (sc->hn_mgmt_taskq != NULL)
 2665                 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
 2666 }
 2667 
 2668 static __inline int
 2669 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
 2670     struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
 2671 {
 2672         struct mbuf *m = *m_head;
 2673         int error;
 2674 
 2675         KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));
 2676 
 2677         error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
 2678             m, segs, nsegs, BUS_DMA_NOWAIT);
 2679         if (error == EFBIG) {
 2680                 struct mbuf *m_new;
 2681 
 2682                 m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
 2683                 if (m_new == NULL)
 2684                         return ENOBUFS;
 2685                 else
 2686                         *m_head = m = m_new;
 2687                 txr->hn_tx_collapsed++;
 2688 
 2689                 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
 2690                     txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
 2691         }
 2692         if (!error) {
 2693                 bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
 2694                     BUS_DMASYNC_PREWRITE);
 2695                 txd->flags |= HN_TXD_FLAG_DMAMAP;
 2696         }
 2697         return error;
 2698 }
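
/*
 * hn_txdesc_dmamap_load() above retries a failed DMA load exactly once
 * after collapsing the mbuf chain into fewer segments.  The control flow
 * in isolation (try_load() and defrag() are hypothetical stand-ins for
 * bus_dmamap_load_mbuf_sg() and m_collapse()):
 */
#if 0
#include <errno.h>

static int
load_with_defrag(int (*try_load)(void *), void *(*defrag)(void *),
    void **obj)
{
        int error = try_load(*obj);

        if (error == EFBIG) {           /* too many segments */
                void *newobj = defrag(*obj);

                if (newobj == NULL)
                        return (ENOBUFS);
                *obj = newobj;
                error = try_load(*obj); /* one retry only */
        }
        return (error);
}
#endif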
 2699 
 2700 static __inline int
 2701 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
 2702 {
 2703 
 2704         KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
 2705             ("put an onlist txd %#x", txd->flags));
 2706         KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
 2707             ("put an onagg txd %#x", txd->flags));
 2708 
 2709         KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
 2710         if (atomic_fetchadd_int(&txd->refs, -1) != 1)
 2711                 return 0;
 2712 
 2713         if (!STAILQ_EMPTY(&txd->agg_list)) {
 2714                 struct hn_txdesc *tmp_txd;
 2715 
 2716                 while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
 2717                         int freed __diagused;
 2718 
 2719                         KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
 2720                             ("recursive aggregation on aggregated txdesc"));
 2721                         KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
 2722                             ("not aggregated txdesc"));
 2723                         KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
 2724                             ("aggregated txdesc uses dmamap"));
 2725                         KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
 2726                             ("aggregated txdesc consumes "
 2727                              "chimney sending buffer"));
 2728                         KASSERT(tmp_txd->chim_size == 0,
 2729                             ("aggregated txdesc has non-zero "
 2730                              "chimney sending size"));
 2731 
 2732                         STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
 2733                         tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
 2734                         freed = hn_txdesc_put(txr, tmp_txd);
 2735                         KASSERT(freed, ("failed to free aggregated txdesc"));
 2736                 }
 2737         }
 2738 
 2739         if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
 2740                 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
 2741                     ("chim txd uses dmamap"));
 2742                 hn_chim_free(txr->hn_sc, txd->chim_index);
 2743                 txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
 2744                 txd->chim_size = 0;
 2745         } else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
 2746                 bus_dmamap_sync(txr->hn_tx_data_dtag,
 2747                     txd->data_dmap, BUS_DMASYNC_POSTWRITE);
 2748                 bus_dmamap_unload(txr->hn_tx_data_dtag,
 2749                     txd->data_dmap);
 2750                 txd->flags &= ~HN_TXD_FLAG_DMAMAP;
 2751         }
 2752 
 2753         if (txd->m != NULL) {
 2754                 m_freem(txd->m);
 2755                 txd->m = NULL;
 2756         }
 2757 
 2758         txd->flags |= HN_TXD_FLAG_ONLIST;
 2759 #ifndef HN_USE_TXDESC_BUFRING
 2760         mtx_lock_spin(&txr->hn_txlist_spin);
 2761         KASSERT(txr->hn_txdesc_avail >= 0 &&
 2762             txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
 2763             ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
 2764         txr->hn_txdesc_avail++;
 2765         SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
 2766         mtx_unlock_spin(&txr->hn_txlist_spin);
 2767 #else   /* HN_USE_TXDESC_BUFRING */
 2768 #ifdef HN_DEBUG
 2769         atomic_add_int(&txr->hn_txdesc_avail, 1);
 2770 #endif
 2771         buf_ring_enqueue(txr->hn_txdesc_br, txd);
 2772 #endif  /* !HN_USE_TXDESC_BUFRING */
 2773 
 2774         return 1;
 2775 }
 2776 
 2777 static __inline struct hn_txdesc *
 2778 hn_txdesc_get(struct hn_tx_ring *txr)
 2779 {
 2780         struct hn_txdesc *txd;
 2781 
 2782 #ifndef HN_USE_TXDESC_BUFRING
 2783         mtx_lock_spin(&txr->hn_txlist_spin);
 2784         txd = SLIST_FIRST(&txr->hn_txlist);
 2785         if (txd != NULL) {
 2786                 KASSERT(txr->hn_txdesc_avail > 0,
 2787                     ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
 2788                 txr->hn_txdesc_avail--;
 2789                 SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
 2790         }
 2791         mtx_unlock_spin(&txr->hn_txlist_spin);
 2792 #else
 2793         txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
 2794 #endif
 2795 
 2796         if (txd != NULL) {
 2797 #ifdef HN_USE_TXDESC_BUFRING
 2798 #ifdef HN_DEBUG
 2799                 atomic_subtract_int(&txr->hn_txdesc_avail, 1);
 2800 #endif
 2801 #endif  /* HN_USE_TXDESC_BUFRING */
 2802                 KASSERT(txd->m == NULL && txd->refs == 0 &&
 2803                     STAILQ_EMPTY(&txd->agg_list) &&
 2804                     txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
 2805                     txd->chim_size == 0 &&
 2806                     (txd->flags & HN_TXD_FLAG_ONLIST) &&
 2807                     (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
 2808                     (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
 2809                 txd->flags &= ~HN_TXD_FLAG_ONLIST;
 2810                 txd->refs = 1;
 2811         }
 2812         return txd;
 2813 }
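
      /*
       * NOTE:
       * hn_txdesc_get()/hn_txdesc_put() above manage the free txdesc
       * pool in one of two compile-time flavors: a spinlock-protected
       * SLIST, or a lockless buf_ring(9) when HN_USE_TXDESC_BUFRING is
       * defined, in which case hn_txdesc_avail is maintained only for
       * HN_DEBUG builds.
       */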
 2814 
 2815 static __inline void
 2816 hn_txdesc_hold(struct hn_txdesc *txd)
 2817 {
 2818 
 2819         /* 0->1 transition will never work */
 2820         KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
 2821         atomic_add_int(&txd->refs, 1);
 2822 }
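
      /*
       * NOTE:
       * txd->refs follows the usual reference-count pattern:
       * hn_txdesc_get() hands out a descriptor with refs == 1,
       * hn_txdesc_hold() adds a reference, and hn_txdesc_put()
       * recycles the descriptor on the 1 -> 0 transition.  Holding a
       * descriptor whose refs already reached 0 would race with that
       * recycling, hence the assertion above.
       */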
 2823 
 2824 static __inline void
 2825 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
 2826 {
 2827 
 2828         KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
 2829             ("recursive aggregation on aggregating txdesc"));
 2830 
 2831         KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
 2832             ("already aggregated"));
 2833         KASSERT(STAILQ_EMPTY(&txd->agg_list),
 2834             ("recursive aggregation on to-be-aggregated txdesc"));
 2835 
 2836         txd->flags |= HN_TXD_FLAG_ONAGG;
 2837         STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
 2838 }
 2839 
 2840 static bool
 2841 hn_tx_ring_pending(struct hn_tx_ring *txr)
 2842 {
 2843         bool pending = false;
 2844 
 2845 #ifndef HN_USE_TXDESC_BUFRING
 2846         mtx_lock_spin(&txr->hn_txlist_spin);
 2847         if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
 2848                 pending = true;
 2849         mtx_unlock_spin(&txr->hn_txlist_spin);
 2850 #else
 2851         if (!buf_ring_full(txr->hn_txdesc_br))
 2852                 pending = true;
 2853 #endif
 2854         return (pending);
 2855 }
 2856 
 2857 static __inline void
 2858 hn_txeof(struct hn_tx_ring *txr)
 2859 {
 2860         txr->hn_has_txeof = 0;
 2861         txr->hn_txeof(txr);
 2862 }
 2863 
 2864 static void
 2865 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
 2866     struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
 2867 {
 2868         struct hn_txdesc *txd = sndc->hn_cbarg;
 2869         struct hn_tx_ring *txr;
 2870 
 2871         txr = txd->txr;
 2872         KASSERT(txr->hn_chan == chan,
 2873             ("channel mismatch, on chan%u, should be chan%u",
 2874              vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));
 2875 
 2876         txr->hn_has_txeof = 1;
 2877         hn_txdesc_put(txr, txd);
 2878 
 2879         ++txr->hn_txdone_cnt;
 2880         if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
 2881                 txr->hn_txdone_cnt = 0;
 2882                 if (txr->hn_oactive)
 2883                         hn_txeof(txr);
 2884         }
 2885 }
 2886 
 2887 static void
 2888 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
 2889 {
 2890 #if defined(INET) || defined(INET6)
 2891         struct epoch_tracker et;
 2892 
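              /*
               * tcp_lro_flush_all() may hand merged packets to
               * if_input(); that path must run inside the network
               * epoch, hence the NET_EPOCH_ENTER/EXIT bracket below.
               */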
 2893         NET_EPOCH_ENTER(et);
 2894         tcp_lro_flush_all(&rxr->hn_lro);
 2895         NET_EPOCH_EXIT(et);
 2896 #endif
 2897 
 2898         /*
 2899          * NOTE:
 2900          * 'txr' could be NULL, if multiple channels and the
 2901          * ifnet.if_start method are enabled.
 2902          */
 2903         if (txr == NULL || !txr->hn_has_txeof)
 2904                 return;
 2905 
 2906         txr->hn_txdone_cnt = 0;
 2907         hn_txeof(txr);
 2908 }
 2909 
 2910 static __inline uint32_t
 2911 hn_rndis_pktmsg_offset(uint32_t ofs)
 2912 {
 2913 
 2914         KASSERT(ofs >= sizeof(struct rndis_packet_msg),
 2915             ("invalid RNDIS packet msg offset %u", ofs));
 2916         return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
 2917 }
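
      /*
       * NOTE:
       * hn_encap() tracks offsets from the beginning of the RNDIS
       * packet message, while the wire format expects them to be
       * relative to the rm_dataoffset field; the subtraction above
       * converts the former to the latter.
       */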
 2918 
 2919 static __inline void *
 2920 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
 2921     size_t pi_dlen, uint32_t pi_type)
 2922 {
 2923         const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
 2924         struct rndis_pktinfo *pi;
 2925 
 2926         KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
 2927             ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));
 2928 
 2929         /*
 2930          * Per-packet-info does not move; it only grows.
 2931          *
 2932          * NOTE:
 2933          * rm_pktinfooffset in this phase counts from the beginning
 2934          * of rndis_packet_msg.
 2935          */
 2936         KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
 2937             ("%u pktinfo overflows RNDIS packet msg", pi_type));
 2938         pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
 2939             pkt->rm_pktinfolen);
 2940         pkt->rm_pktinfolen += pi_size;
 2941 
 2942         pi->rm_size = pi_size;
 2943         pi->rm_type = pi_type;
 2944         pi->rm_internal = 0;
 2945         pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;
 2946 
 2947         return (pi->rm_data);
 2948 }
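
      /*
       * Rough layout of the RNDIS packet message being built here and
       * in hn_encap() below; per-packet-info entries are appended back
       * to back after the fixed header, and the payload follows them:
       *
       *   +---------------------------+ 0
       *   | rndis_packet_msg header   |
       *   +---------------------------+ rm_pktinfooffset
       *   | pktinfo 0 ... pktinfo N   |  (rm_pktinfolen bytes)
       *   +---------------------------+ pkt_hlen
       *   | packet data               |
       *   +---------------------------+
       */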
 2949 
 2950 static __inline int
 2951 hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
 2952 {
 2953         struct hn_txdesc *txd;
 2954         struct mbuf *m;
 2955         int error, pkts;
 2956 
 2957         txd = txr->hn_agg_txd;
 2958         KASSERT(txd != NULL, ("no aggregate txdesc"));
 2959 
 2960         /*
 2961          * Since hn_txpkt() will reset this temporary stat, save
 2962          * it now, so that oerrors can be updated properly, if
 2963          * hn_txpkt() ever fails.
 2964          */
 2965         pkts = txr->hn_stat_pkts;
 2966 
 2967         /*
 2968          * Since txd's mbuf will _not_ be freed upon hn_txpkt()
 2969          * failure, save it now, so that it can be freed if
 2970          * hn_txpkt() ever fails.
 2971          */
 2972         m = txd->m;
 2973         error = hn_txpkt(ifp, txr, txd);
 2974         if (__predict_false(error)) {
 2975                 /* txd is freed, but m is not. */
 2976                 m_freem(m);
 2977 
 2978                 txr->hn_flush_failed++;
 2979                 if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
 2980         }
 2981 
 2982         /* Reset all aggregation states. */
 2983         txr->hn_agg_txd = NULL;
 2984         txr->hn_agg_szleft = 0;
 2985         txr->hn_agg_pktleft = 0;
 2986         txr->hn_agg_prevpkt = NULL;
 2987 
 2988         return (error);
 2989 }
 2990 
 2991 static void *
 2992 hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
 2993     int pktsize)
 2994 {
 2995         void *chim;
 2996 
 2997         if (txr->hn_agg_txd != NULL) {
 2998                 if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
 2999                         struct hn_txdesc *agg_txd = txr->hn_agg_txd;
 3000                         struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
 3001                         int olen;
 3002 
 3003                         /*
 3004                          * Update the previous RNDIS packet's total length;
 3005                          * it may be increased due to the mandatory alignment
 3006                          * padding for this RNDIS packet.  Also update the
 3007                          * aggregating txdesc's chimney sending buffer size
 3008                          * accordingly.
 3009                          *
 3010                          * XXX
 3011                          * Zero-out the padding, as required by the RNDIS spec.
 3012                          */
 3013                         olen = pkt->rm_len;
 3014                         pkt->rm_len = roundup2(olen, txr->hn_agg_align);
 3015                         agg_txd->chim_size += pkt->rm_len - olen;
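                              /*
                               * E.g. with olen == 123 and hn_agg_align == 8,
                               * rm_len becomes 128 and chim_size grows by the
                               * 5 bytes of padding.
                               */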
 3016 
 3017                         /* Link this txdesc to the parent. */
 3018                         hn_txdesc_agg(agg_txd, txd);
 3019 
 3020                         chim = (uint8_t *)pkt + pkt->rm_len;
 3021                         /* Save the current packet for later fixup. */
 3022                         txr->hn_agg_prevpkt = chim;
 3023 
 3024                         txr->hn_agg_pktleft--;
 3025                         txr->hn_agg_szleft -= pktsize;
 3026                         if (txr->hn_agg_szleft <=
 3027                             HN_PKTSIZE_MIN(txr->hn_agg_align)) {
 3028                                 /*
 3029                                  * Probably can't aggregate more packets,
 3030                                  * flush this aggregating txdesc proactively.
 3031                                  */
 3032                                 txr->hn_agg_pktleft = 0;
 3033                         }
 3034                         /* Done! */
 3035                         return (chim);
 3036                 }
 3037                 hn_flush_txagg(ifp, txr);
 3038         }
 3039         KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
 3040 
 3041         txr->hn_tx_chimney_tried++;
 3042         txd->chim_index = hn_chim_alloc(txr->hn_sc);
 3043         if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
 3044                 return (NULL);
 3045         txr->hn_tx_chimney++;
 3046 
 3047         chim = txr->hn_sc->hn_chim +
 3048             (txd->chim_index * txr->hn_sc->hn_chim_szmax);
 3049 
 3050         if (txr->hn_agg_pktmax > 1 &&
 3051             txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
 3052                 txr->hn_agg_txd = txd;
 3053                 txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
 3054                 txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
 3055                 txr->hn_agg_prevpkt = chim;
 3056         }
 3057         return (chim);
 3058 }
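
      /*
       * NOTE:
       * hn_try_txagg() has three possible outcomes: a pointer into the
       * current aggregating chimney buffer (txd has been linked to the
       * aggregating txdesc), a pointer to a freshly allocated chimney
       * buffer (txd itself may become the new aggregating txdesc), or
       * NULL when no chimney buffer is available, in which case the
       * caller falls back to the sglist path.
       */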
 3059 
 3060 /*
 3061  * NOTE:
 3062  * If this function fails, then both txd and m_head0 will be freed.
 3063  */
 3064 static int
 3065 hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
 3066     struct mbuf **m_head0)
 3067 {
 3068         bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
 3069         int error, nsegs, i;
 3070         struct mbuf *m_head = *m_head0;
 3071         struct rndis_packet_msg *pkt;
 3072         uint32_t *pi_data;
 3073         void *chim = NULL;
 3074         int pkt_hlen, pkt_size;
 3075 
 3076         pkt = txd->rndis_pkt;
 3077         pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
 3078         if (pkt_size < txr->hn_chim_size) {
 3079                 chim = hn_try_txagg(ifp, txr, txd, pkt_size);
 3080                 if (chim != NULL)
 3081                         pkt = chim;
 3082         } else {
 3083                 if (txr->hn_agg_txd != NULL)
 3084                         hn_flush_txagg(ifp, txr);
 3085         }
 3086 
 3087         pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
 3088         pkt->rm_len = m_head->m_pkthdr.len;
 3089         pkt->rm_dataoffset = 0;
 3090         pkt->rm_datalen = m_head->m_pkthdr.len;
 3091         pkt->rm_oobdataoffset = 0;
 3092         pkt->rm_oobdatalen = 0;
 3093         pkt->rm_oobdataelements = 0;
 3094         pkt->rm_pktinfooffset = sizeof(*pkt);
 3095         pkt->rm_pktinfolen = 0;
 3096         pkt->rm_vchandle = 0;
 3097         pkt->rm_reserved = 0;
 3098 
 3099         if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
 3100                 /*
 3101                  * Set the hash value for this packet.
 3102                  */
 3103                 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
 3104                     HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
 3105 
 3106                 if (M_HASHTYPE_ISHASH(m_head))
 3107                         /*
 3108                          * The flowid field contains the hash value that the
 3109                          * host set in the RX queue, if this is an IP forwarding
 3110                          * packet.  Set the same hash value, so the host can
 3111                          * send it on the CPU where it was received.
 3112                          */
 3113                         *pi_data = m_head->m_pkthdr.flowid;
 3114                 else
 3115                         /*
 3116                          * Otherwise just put the tx queue index.
 3117                          */
 3118                         *pi_data = txr->hn_tx_idx;
 3119         }
 3120 
 3121         if (m_head->m_flags & M_VLANTAG) {
 3122                 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
 3123                     NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
 3124                 *pi_data = NDIS_VLAN_INFO_MAKE(
 3125                     EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
 3126                     EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
 3127                     EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
 3128         }
 3129 
 3130         if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
 3131 #if defined(INET6) || defined(INET)
 3132                 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
 3133                     NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
 3134 #ifdef INET
 3135                 if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
 3136                         *pi_data = NDIS_LSO2_INFO_MAKEIPV4(
 3137                             m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
 3138                             m_head->m_pkthdr.tso_segsz);
 3139                 }
 3140 #endif
 3141 #if defined(INET6) && defined(INET)
 3142                 else
 3143 #endif
 3144 #ifdef INET6
 3145                 {
 3146                         *pi_data = NDIS_LSO2_INFO_MAKEIPV6(
 3147                             m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
 3148                             m_head->m_pkthdr.tso_segsz);
 3149                 }
 3150 #endif
 3151 #endif  /* INET6 || INET */
 3152         } else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
 3153                 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
 3154                     NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
 3155                 if (m_head->m_pkthdr.csum_flags &
 3156                     (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
 3157                         *pi_data = NDIS_TXCSUM_INFO_IPV6;
 3158                 } else {
 3159                         *pi_data = NDIS_TXCSUM_INFO_IPV4;
 3160                         if (m_head->m_pkthdr.csum_flags & CSUM_IP)
 3161                                 *pi_data |= NDIS_TXCSUM_INFO_IPCS;
 3162                 }
 3163 
 3164                 if (m_head->m_pkthdr.csum_flags &
 3165                     (CSUM_IP_TCP | CSUM_IP6_TCP)) {
 3166                         *pi_data |= NDIS_TXCSUM_INFO_MKTCPCS(
 3167                             m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
 3168                 } else if (m_head->m_pkthdr.csum_flags &
 3169                     (CSUM_IP_UDP | CSUM_IP6_UDP)) {
 3170                         *pi_data |= NDIS_TXCSUM_INFO_MKUDPCS(
 3171                             m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
 3172                 }
 3173         }
 3174 
 3175         pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
 3176         /* Fixup RNDIS packet message total length */
 3177         pkt->rm_len += pkt_hlen;
 3178         /* Convert RNDIS packet message offsets */
 3179         pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen);
 3180         pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
 3181 
 3182         /*
 3183          * Fast path: Chimney sending.
 3184          */
 3185         if (chim != NULL) {
 3186                 struct hn_txdesc *tgt_txd = txd;
 3187 
 3188                 if (txr->hn_agg_txd != NULL) {
 3189                         tgt_txd = txr->hn_agg_txd;
 3190 #ifdef INVARIANTS
 3191                         *m_head0 = NULL;
 3192 #endif
 3193                 }
 3194 
 3195                 KASSERT(pkt == chim,
 3196                     ("RNDIS pkt not in chimney sending buffer"));
 3197                 KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
 3198                     ("chimney sending buffer is not used"));
 3199                 tgt_txd->chim_size += pkt->rm_len;
 3200 
 3201                 m_copydata(m_head, 0, m_head->m_pkthdr.len,
 3202                     ((uint8_t *)chim) + pkt_hlen);
 3203 
 3204                 txr->hn_gpa_cnt = 0;
 3205                 txr->hn_sendpkt = hn_txpkt_chim;
 3206                 goto done;
 3207         }
 3208 
 3209         KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
 3210         KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
 3211             ("chimney buffer is used"));
 3212         KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));
 3213 
 3214         error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
 3215         if (__predict_false(error)) {
 3216                 int freed __diagused;
 3217 
 3218                 /*
 3219                  * This mbuf is not linked w/ the txd yet, so free it now.
 3220                  */
 3221                 m_freem(m_head);
 3222                 *m_head0 = NULL;
 3223 
 3224                 freed = hn_txdesc_put(txr, txd);
 3225                 KASSERT(freed != 0,
 3226                     ("failed to free txd upon txdma error"));
 3227 
 3228                 txr->hn_txdma_failed++;
 3229                 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 3230                 return error;
 3231         }
 3232         *m_head0 = m_head;
 3233 
 3234         /* +1 RNDIS packet message */
 3235         txr->hn_gpa_cnt = nsegs + 1;
 3236 
 3237         /* send packet with page buffer */
 3238         txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
 3239         txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
 3240         txr->hn_gpa[0].gpa_len = pkt_hlen;
 3241 
 3242         /*
 3243          * Fill the page buffers with mbuf info after the page
 3244          * buffer for RNDIS packet message.
 3245          */
 3246         for (i = 0; i < nsegs; ++i) {
 3247                 struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];
 3248 
 3249                 gpa->gpa_page = atop(segs[i].ds_addr);
 3250                 gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
 3251                 gpa->gpa_len = segs[i].ds_len;
 3252         }
 3253 
 3254         txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
 3255         txd->chim_size = 0;
 3256         txr->hn_sendpkt = hn_txpkt_sglist;
 3257 done:
 3258         txd->m = m_head;
 3259 
 3260         /* Set the completion routine */
 3261         hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);
 3262 
 3263         /* Update temporary stats for later use. */
 3264         txr->hn_stat_pkts++;
 3265         txr->hn_stat_size += m_head->m_pkthdr.len;
 3266         if (m_head->m_flags & M_MCAST)
 3267                 txr->hn_stat_mcasts++;
 3268 
 3269         return 0;
 3270 }
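
      /*
       * NOTE:
       * hn_encap() leaves the txdesc ready for hn_txpkt(): on the
       * chimney path the RNDIS message and payload have been copied
       * into the chimney buffer and hn_sendpkt is hn_txpkt_chim; on
       * the sglist path hn_gpa[] holds the page of the RNDIS message
       * plus one entry per DMA segment, and hn_sendpkt is
       * hn_txpkt_sglist.
       */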
 3271 
 3272 /*
 3273  * NOTE:
 3274  * If this function fails, then txd will be freed, but the mbuf
 3275  * associated w/ the txd will _not_ be freed.
 3276  */
 3277 static int
 3278 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
 3279 {
 3280         int error, send_failed = 0, has_bpf;
 3281 
 3282 again:
 3283         has_bpf = bpf_peers_present(ifp->if_bpf);
 3284         if (has_bpf) {
 3285                 /*
 3286                  * Make sure that this txd and any aggregated txds are not
 3287                  * freed before ETHER_BPF_MTAP.
 3288                  */
 3289                 hn_txdesc_hold(txd);
 3290         }
 3291         error = txr->hn_sendpkt(txr, txd);
 3292         if (!error) {
 3293                 if (has_bpf) {
 3294                         const struct hn_txdesc *tmp_txd;
 3295 
 3296                         ETHER_BPF_MTAP(ifp, txd->m);
 3297                         STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
 3298                                 ETHER_BPF_MTAP(ifp, tmp_txd->m);
 3299                 }
 3300 
 3301                 if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
 3302 #ifdef HN_IFSTART_SUPPORT
 3303                 if (!hn_use_if_start)
 3304 #endif
 3305                 {
 3306                         if_inc_counter(ifp, IFCOUNTER_OBYTES,
 3307                             txr->hn_stat_size);
 3308                         if (txr->hn_stat_mcasts != 0) {
 3309                                 if_inc_counter(ifp, IFCOUNTER_OMCASTS,
 3310                                     txr->hn_stat_mcasts);
 3311                         }
 3312                 }
 3313                 txr->hn_pkts += txr->hn_stat_pkts;
 3314                 txr->hn_sends++;
 3315         }
 3316         if (has_bpf)
 3317                 hn_txdesc_put(txr, txd);
 3318 
 3319         if (__predict_false(error)) {
 3320                 int freed __diagused;
 3321 
 3322                 /*
 3323                  * This should happen only very rarely.
 3324                  *
 3325                  * XXX Too many RX packets to be acked, or too many
 3326                  * sideband commands to run?  Ask netvsc_channel_rollup()
 3327                  * to kick-start TX completion processing later.
 3328                  */
 3329                 txr->hn_has_txeof = 1;
 3330                 if (!send_failed) {
 3331                         txr->hn_send_failed++;
 3332                         send_failed = 1;
 3333                         /*
 3334                          * Try sending again, now that hn_has_txeof is
 3335                          * set, in case we missed the last
 3336                          * netvsc_channel_rollup().
 3337                          */
 3338                         goto again;
 3339                 }
 3340                 if_printf(ifp, "send failed\n");
 3341 
 3342                 /*
 3343                  * Caller will perform further processing on the
 3344                  * associated mbuf, so don't free it in hn_txdesc_put();
 3345                  * only unload it from the DMA map in hn_txdesc_put(),
 3346                  * if it was loaded.
 3347                  */
 3348                 txd->m = NULL;
 3349                 freed = hn_txdesc_put(txr, txd);
 3350                 KASSERT(freed != 0,
 3351                     ("failed to free txd upon send error"));
 3352 
 3353                 txr->hn_send_failed++;
 3354         }
 3355 
 3356         /* Reset temporary stats, after this sending is done. */
 3357         txr->hn_stat_size = 0;
 3358         txr->hn_stat_pkts = 0;
 3359         txr->hn_stat_mcasts = 0;
 3360 
 3361         return (error);
 3362 }
 3363 
 3364 /*
 3365  * Append the specified data to the indicated mbuf chain.
 3366  * Extend the mbuf chain if the new data does not fit in
 3367  * the existing space.
 3368  *
 3369  * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
 3370  * There should be an equivalent in the kernel mbuf code,
 3371  * but there does not appear to be one yet.
 3372  *
 3373  * Differs from m_append() in that additional mbufs are
 3374  * allocated with cluster size MJUMPAGESIZE, and filled
 3375  * accordingly.
 3376  *
 3377  * Return the last mbuf in the chain, or NULL if a new mbuf
 3378  * could not be allocated.
 3379  */
 3380 static struct mbuf *
 3381 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
 3382 {
 3383         struct mbuf *m, *n;
 3384         int remainder, space;
 3385 
 3386         for (m = m0; m->m_next != NULL; m = m->m_next)
 3387                 ;
 3388         remainder = len;
 3389         space = M_TRAILINGSPACE(m);
 3390         if (space > 0) {
 3391                 /*
 3392                  * Copy into available space.
 3393                  */
 3394                 if (space > remainder)
 3395                         space = remainder;
 3396                 bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
 3397                 m->m_len += space;
 3398                 cp += space;
 3399                 remainder -= space;
 3400         }
 3401         while (remainder > 0) {
 3402                 /*
 3403                  * Allocate a new mbuf, backed by a MJUMPAGESIZE
 3404                  * cluster, and fill it.
 3405                  */
 3406                 n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE);
 3407                 if (n == NULL)
 3408                         return NULL;
 3409                 n->m_len = min(MJUMPAGESIZE, remainder);
 3410                 bcopy(cp, mtod(n, caddr_t), n->m_len);
 3411                 cp += n->m_len;
 3412                 remainder -= n->m_len;
 3413                 m->m_next = n;
 3414                 m = n;
 3415         }
 3416 
 3417         return m;
 3418 }
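
      /*
       * Illustrative sketch (hypothetical, not driver code): coalescing
       * two received fragments with hv_m_append(), the same way
       * hn_rxpkt() below walks rxr->rsc.  The example_coalesce() name
       * and parameters are made up for illustration.
       */
      #if 0
      static struct mbuf *
      example_coalesce(c_caddr_t frag0, int len0, c_caddr_t frag1, int len1)
      {
              struct mbuf *m0, *n;

              m0 = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, MJUMPAGESIZE);
              if (m0 == NULL)
                      return (NULL);
              /* hv_m_append() returns the chain tail, or NULL on ENOBUFS. */
              n = hv_m_append(m0, len0, frag0);
              if (n != NULL)
                      n = hv_m_append(n, len1, frag1);
              if (n == NULL) {
                      m_freem(m0);
                      return (NULL);
              }
              /* hv_m_append() does not touch the pkthdr; fix it up here. */
              m0->m_pkthdr.len = len0 + len1;
              return (m0);
      }
      #endif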
 3419 
 3420 #if defined(INET) || defined(INET6)
 3421 static __inline int
 3422 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
 3423 {
 3424 #if __FreeBSD_version >= 1100095
 3425         if (hn_lro_mbufq_depth) {
 3426                 tcp_lro_queue_mbuf(lc, m);
 3427                 return 0;
 3428         }
 3429 #endif
 3430         return tcp_lro_rx(lc, m, 0);
 3431 }
 3432 #endif
 3433 
 3434 static int
 3435 hn_rxpkt(struct hn_rx_ring *rxr)
 3436 {
 3437         struct ifnet *ifp, *hn_ifp = rxr->hn_ifp;
 3438         struct mbuf *m_new, *n;
 3439         int size, do_lro = 0, do_csum = 1, is_vf = 0;
 3440         int hash_type = M_HASHTYPE_NONE;
 3441         int l3proto = ETHERTYPE_MAX, l4proto = IPPROTO_DONE;
 3442         int i;
 3443 
 3444         ifp = hn_ifp;
 3445         if (rxr->hn_rxvf_ifp != NULL) {
 3446                 /*
 3447                  * Non-transparent mode VF; pretend this packet is from
 3448                  * the VF.
 3449                  */
 3450                 ifp = rxr->hn_rxvf_ifp;
 3451                 is_vf = 1;
 3452         } else if (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF) {
 3453                 /* Transparent mode VF. */
 3454                 is_vf = 1;
 3455         }
 3456 
 3457         if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
 3458                 /*
 3459                  * NOTE:
 3460                  * See the NOTE of hn_rndis_init_fixat().  This
 3461                  * function can be reached, immediately after the
 3462                  * RNDIS is initialized but before the ifnet is
 3463                  * setup on the hn_attach() path; drop the unexpected
 3464                  * packets.
 3465                  */
 3466                 return (0);
 3467         }
 3468 
 3469         if (__predict_false(rxr->rsc.pktlen < ETHER_HDR_LEN)) {
 3470                 if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1);
 3471                 return (0);
 3472         }
 3473 
 3474         if (rxr->rsc.cnt == 1 && rxr->rsc.pktlen <= MHLEN) {
 3475                 m_new = m_gethdr(M_NOWAIT, MT_DATA);
 3476                 if (m_new == NULL) {
 3477                         if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
 3478                         return (0);
 3479                 }
 3480                 memcpy(mtod(m_new, void *), rxr->rsc.frag_data[0],
 3481                     rxr->rsc.frag_len[0]);
 3482                 m_new->m_pkthdr.len = m_new->m_len = rxr->rsc.frag_len[0];
 3483         } else {
 3484                 /*
 3485                  * Get an mbuf with a cluster.  For packets 2K or less,
 3486                  * get a standard 2K cluster.  For anything larger, get a
 3487                  * 4K cluster.  Any buffers larger than 4K can cause problems
 3488                  * if looped around to the Hyper-V TX channel, so avoid them.
 3489                  */
 3490                 size = MCLBYTES;
 3491                 if (rxr->rsc.pktlen > MCLBYTES) {
 3492                         /* 4096 */
 3493                         size = MJUMPAGESIZE;
 3494                 }
 3495 
 3496                 m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
 3497                 if (m_new == NULL) {
 3498                         if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
 3499                         return (0);
 3500                 }
 3501 
 3502                 n = m_new;
 3503                 for (i = 0; i < rxr->rsc.cnt; i++) {
 3504                         n = hv_m_append(n, rxr->rsc.frag_len[i],
 3505                             rxr->rsc.frag_data[i]);
 3506                         if (n == NULL) {
 3507                                 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
 3508                                 return (0);
 3509                         } else {
 3510                                 m_new->m_pkthdr.len += rxr->rsc.frag_len[i];
 3511                         }
 3512                 }
 3513         }
 3514         if (rxr->rsc.pktlen <= MHLEN)
 3515                 rxr->hn_small_pkts++;
 3516 
 3517         m_new->m_pkthdr.rcvif = ifp;
 3518 
 3519         if (__predict_false((hn_ifp->if_capenable & IFCAP_RXCSUM) == 0))
 3520                 do_csum = 0;
 3521 
 3522         /* receive side checksum offload */
 3523         if (rxr->rsc.csum_info != NULL) {
 3524                 /* IP csum offload */
 3525                 if ((*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
 3526                         m_new->m_pkthdr.csum_flags |=
 3527                             (CSUM_IP_CHECKED | CSUM_IP_VALID);
 3528                         rxr->hn_csum_ip++;
 3529                 }
 3530 
 3531                 /* TCP/UDP csum offload */
 3532                 if ((*(rxr->rsc.csum_info) & (NDIS_RXCSUM_INFO_UDPCS_OK |
 3533                      NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
 3534                         m_new->m_pkthdr.csum_flags |=
 3535                             (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
 3536                         m_new->m_pkthdr.csum_data = 0xffff;
 3537                         if (*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_TCPCS_OK)
 3538                                 rxr->hn_csum_tcp++;
 3539                         else
 3540                                 rxr->hn_csum_udp++;
 3541                 }
 3542 
 3543                 /*
 3544                  * XXX
 3545                  * As of this writing (Oct 28th, 2016), the host side turns
 3546                  * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
 3547                  * the do_lro setting here is actually _not_ accurate.  We
 3548                  * depend on the RSS hash type check to reset do_lro.
 3549                  */
 3550                 if ((*(rxr->rsc.csum_info) &
 3551                      (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
 3552                     (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
 3553                         do_lro = 1;
 3554         } else {
 3555                 hn_rxpkt_proto(m_new, &l3proto, &l4proto);
 3556                 if (l3proto == ETHERTYPE_IP) {
 3557                         if (l4proto == IPPROTO_TCP) {
 3558                                 if (do_csum &&
 3559                                     (rxr->hn_trust_hcsum &
 3560                                      HN_TRUST_HCSUM_TCP)) {
 3561                                         rxr->hn_csum_trusted++;
 3562                                         m_new->m_pkthdr.csum_flags |=
 3563                                            (CSUM_IP_CHECKED | CSUM_IP_VALID |
 3564                                             CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
 3565                                         m_new->m_pkthdr.csum_data = 0xffff;
 3566                                 }
 3567                                 do_lro = 1;
 3568                         } else if (l4proto == IPPROTO_UDP) {
 3569                                 if (do_csum &&
 3570                                     (rxr->hn_trust_hcsum &
 3571                                      HN_TRUST_HCSUM_UDP)) {
 3572                                         rxr->hn_csum_trusted++;
 3573                                         m_new->m_pkthdr.csum_flags |=
 3574                                            (CSUM_IP_CHECKED | CSUM_IP_VALID |
 3575                                             CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
 3576                                         m_new->m_pkthdr.csum_data = 0xffff;
 3577                                 }
 3578                         } else if (l4proto != IPPROTO_DONE && do_csum &&
 3579                             (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
 3580                                 rxr->hn_csum_trusted++;
 3581                                 m_new->m_pkthdr.csum_flags |=
 3582                                     (CSUM_IP_CHECKED | CSUM_IP_VALID);
 3583                         }
 3584                 }
 3585         }
 3586 
 3587         if (rxr->rsc.vlan_info != NULL) {
 3588                 m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
 3589                     NDIS_VLAN_INFO_ID(*(rxr->rsc.vlan_info)),
 3590                     NDIS_VLAN_INFO_PRI(*(rxr->rsc.vlan_info)),
 3591                     NDIS_VLAN_INFO_CFI(*(rxr->rsc.vlan_info)));
 3592                 m_new->m_flags |= M_VLANTAG;
 3593         }
 3594 
 3595         /*
 3596          * If the VF is activated (transparent/non-transparent mode does
 3597          * not matter here):
 3598          *
 3599          * - Disable LRO
 3600          *
 3601          *   hn(4) will only receive broadcast packets, multicast packets,
 3602          *   and TCP SYN and SYN|ACK (in Azure); LRO is useless for these
 3603          *   packet types.
 3604          *
 3605          *   For non-transparent, we definitely _cannot_ enable LRO at
 3606          *   all, since the LRO flush will use hn(4) as the receiving
 3607          *   interface; i.e. hn_ifp->if_input(hn_ifp, m).
 3608          */
 3609         if (is_vf)
 3610                 do_lro = 0;
 3611 
 3612         /*
 3613          * If the VF is activated (transparent/non-transparent mode does not
 3614          * matter here), do _not_ mess with unsupported hash types or
 3615          * functions.
 3616          */
 3617         if (rxr->rsc.hash_info != NULL) {
 3618                 rxr->hn_rss_pkts++;
 3619                 m_new->m_pkthdr.flowid = *(rxr->rsc.hash_value);
 3620                 if (!is_vf)
 3621                         hash_type = M_HASHTYPE_OPAQUE_HASH;
 3622                 if ((*(rxr->rsc.hash_info) & NDIS_HASH_FUNCTION_MASK) ==
 3623                     NDIS_HASH_FUNCTION_TOEPLITZ) {
 3624                         uint32_t type = (*(rxr->rsc.hash_info) & NDIS_HASH_TYPE_MASK &
 3625                             rxr->hn_mbuf_hash);
 3626 
 3627                         /*
 3628                          * NOTE:
 3629                          * do_lro is reset, if the hash types are not TCP
 3630                          * related.  See the comment in the above csum_flags
 3631                          * setup section.
 3632                          */
 3633                         switch (type) {
 3634                         case NDIS_HASH_IPV4:
 3635                                 hash_type = M_HASHTYPE_RSS_IPV4;
 3636                                 do_lro = 0;
 3637                                 break;
 3638 
 3639                         case NDIS_HASH_TCP_IPV4:
 3640                                 hash_type = M_HASHTYPE_RSS_TCP_IPV4;
 3641                                 if (rxr->hn_rx_flags & HN_RX_FLAG_UDP_HASH) {
 3642                                         int def_htype = M_HASHTYPE_OPAQUE_HASH;
 3643 
 3644                                         if (is_vf)
 3645                                                 def_htype = M_HASHTYPE_NONE;
 3646 
 3647                                         /*
 3648                                          * UDP 4-tuple hash is delivered as
 3649                                          * TCP 4-tuple hash.
 3650                                          */
 3651                                         if (l3proto == ETHERTYPE_MAX) {
 3652                                                 hn_rxpkt_proto(m_new,
 3653                                                     &l3proto, &l4proto);
 3654                                         }
 3655                                         if (l3proto == ETHERTYPE_IP) {
 3656                                                 if (l4proto == IPPROTO_UDP &&
 3657                                                     (rxr->hn_mbuf_hash &
 3658                                                      NDIS_HASH_UDP_IPV4_X)) {
 3659                                                         hash_type =
 3660                                                         M_HASHTYPE_RSS_UDP_IPV4;
 3661                                                         do_lro = 0;
 3662                                                 } else if (l4proto !=
 3663                                                     IPPROTO_TCP) {
 3664                                                         hash_type = def_htype;
 3665                                                         do_lro = 0;
 3666                                                 }
 3667                                         } else {
 3668                                                 hash_type = def_htype;
 3669                                                 do_lro = 0;
 3670                                         }
 3671                                 }
 3672                                 break;
 3673 
 3674                         case NDIS_HASH_IPV6:
 3675                                 hash_type = M_HASHTYPE_RSS_IPV6;
 3676                                 do_lro = 0;
 3677                                 break;
 3678 
 3679                         case NDIS_HASH_IPV6_EX:
 3680                                 hash_type = M_HASHTYPE_RSS_IPV6_EX;
 3681                                 do_lro = 0;
 3682                                 break;
 3683 
 3684                         case NDIS_HASH_TCP_IPV6:
 3685                                 hash_type = M_HASHTYPE_RSS_TCP_IPV6;
 3686                                 break;
 3687 
 3688                         case NDIS_HASH_TCP_IPV6_EX:
 3689                                 hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
 3690                                 break;
 3691                         }
 3692                 }
 3693         } else if (!is_vf) {
 3694                 m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
 3695                 hash_type = M_HASHTYPE_OPAQUE;
 3696         }
 3697         M_HASHTYPE_SET(m_new, hash_type);
 3698 
 3699         if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
 3700         if (hn_ifp != ifp) {
 3701                 const struct ether_header *eh;
 3702 
 3703                 /*
 3704                  * Non-transparent mode VF is activated.
 3705                  */
 3706 
 3707                 /*
 3708                  * Allow tapping on hn(4).
 3709                  */
 3710                 ETHER_BPF_MTAP(hn_ifp, m_new);
 3711 
 3712                 /*
 3713                  * Update hn(4)'s stats.
 3714                  */
 3715                 if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
 3716                 if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len);
 3717                 /* Checked at the beginning of this function. */
 3718                 KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame"));
 3719                 eh = mtod(m_new, struct ether_header *);
 3720                 if (ETHER_IS_MULTICAST(eh->ether_dhost))
 3721                         if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1);
 3722         }
 3723         rxr->hn_pkts++;
 3724 
 3725         if ((hn_ifp->if_capenable & IFCAP_LRO) && do_lro) {
 3726 #if defined(INET) || defined(INET6)
 3727                 struct lro_ctrl *lro = &rxr->hn_lro;
 3728 
 3729                 if (lro->lro_cnt) {
 3730                         rxr->hn_lro_tried++;
 3731                         if (hn_lro_rx(lro, m_new) == 0) {
 3732                                 /* DONE! */
 3733                                 return 0;
 3734                         }
 3735                 }
 3736 #endif
 3737         }
 3738         ifp->if_input(ifp, m_new);
 3739 
 3740         return (0);
 3741 }
 3742 
 3743 static int
 3744 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
 3745 {
 3746         struct hn_softc *sc = ifp->if_softc;
 3747         struct ifreq *ifr = (struct ifreq *)data, ifr_vf;
 3748         struct ifnet *vf_ifp;
 3749         int mask, error = 0;
 3750         struct ifrsskey *ifrk;
 3751         struct ifrsshash *ifrh;
 3752         uint32_t mtu;
 3753 
 3754         switch (cmd) {
 3755         case SIOCSIFMTU:
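                /*
                 * Changing the MTU requires reattaching the synthetic
                 * parts: suspend, detach NVS/RNDIS, reattach them with
                 * the new MTU, re-apply the MTU-dependent settings, and
                 * resume.  The VF, if ready, is updated first, so that a
                 * failure there aborts the change early.
                 */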
 3756                 if (ifr->ifr_mtu > HN_MTU_MAX) {
 3757                         error = EINVAL;
 3758                         break;
 3759                 }
 3760 
 3761                 HN_LOCK(sc);
 3762 
 3763                 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
 3764                         HN_UNLOCK(sc);
 3765                         break;
 3766                 }
 3767 
 3768                 if ((sc->hn_caps & HN_CAP_MTU) == 0) {
 3769                         /* Can't change MTU */
 3770                         HN_UNLOCK(sc);
 3771                         error = EOPNOTSUPP;
 3772                         break;
 3773                 }
 3774 
 3775                 if (ifp->if_mtu == ifr->ifr_mtu) {
 3776                         HN_UNLOCK(sc);
 3777                         break;
 3778                 }
 3779 
 3780                 if (hn_xpnt_vf_isready(sc)) {
 3781                         vf_ifp = sc->hn_vf_ifp;
 3782                         ifr_vf = *ifr;
 3783                         strlcpy(ifr_vf.ifr_name, vf_ifp->if_xname,
 3784                             sizeof(ifr_vf.ifr_name));
 3785                         error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU,
 3786                             (caddr_t)&ifr_vf);
 3787                         if (error) {
 3788                                 HN_UNLOCK(sc);
 3789                                 if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n",
 3790                                     vf_ifp->if_xname, ifr->ifr_mtu, error);
 3791                                 break;
 3792                         }
 3793                 }
 3794 
 3795                 /*
 3796                  * Suspend this interface before the synthetic parts
 3797                  * are ripped.
 3798                  */
 3799                 hn_suspend(sc);
 3800 
 3801                 /*
 3802                  * Detach the synthetic parts, i.e. NVS and RNDIS.
 3803                  */
 3804                 hn_synth_detach(sc);
 3805 
 3806                 /*
 3807                  * Reattach the synthetic parts, i.e. NVS and RNDIS,
 3808                  * with the new MTU setting.
 3809                  */
 3810                 error = hn_synth_attach(sc, ifr->ifr_mtu);
 3811                 if (error) {
 3812                         HN_UNLOCK(sc);
 3813                         break;
 3814                 }
 3815 
 3816                 error = hn_rndis_get_mtu(sc, &mtu);
 3817                 if (error)
 3818                         mtu = ifr->ifr_mtu;
 3819                 else if (bootverbose)
 3820                         if_printf(ifp, "RNDIS mtu %u\n", mtu);
 3821 
 3822                 /*
 3823                  * Commit the requested MTU, after the synthetic parts
 3824                  * have been successfully attached.
 3825                  */
 3826                 if (mtu >= ifr->ifr_mtu) {
 3827                         mtu = ifr->ifr_mtu;
 3828                 } else {
 3829                         if_printf(ifp, "fixup mtu %d -> %u\n",
 3830                             ifr->ifr_mtu, mtu);
 3831                 }
 3832                 ifp->if_mtu = mtu;
 3833 
 3834                 /*
 3835                  * Synthetic parts' reattach may change the chimney
 3836                  * sending size; update it.
 3837                  */
 3838                 if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
 3839                         hn_set_chim_size(sc, sc->hn_chim_szmax);
 3840 
 3841                 /*
 3842                  * Make sure that various parameters based on MTU are
 3843                  * still valid, after the MTU change.
 3844                  */
 3845                 hn_mtu_change_fixup(sc);
 3846 
 3847                 /*
 3848                  * All done!  Resume the interface now.
 3849                  */
 3850                 hn_resume(sc);
 3851 
 3852                 if ((sc->hn_flags & HN_FLAG_RXVF) ||
 3853                     (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
 3854                         /*
 3855                          * Since we have reattached the NVS part,
 3856                          * change the datapath to the VF again, in case
 3857                          * it was lost when the NVS was detached.
 3858                          */
 3859                         hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
 3860                 }
 3861 
 3862                 HN_UNLOCK(sc);
 3863                 break;
 3864 
 3865         case SIOCSIFFLAGS:
 3866                 HN_LOCK(sc);
 3867 
 3868                 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
 3869                         HN_UNLOCK(sc);
 3870                         break;
 3871                 }
 3872 
 3873                 if (hn_xpnt_vf_isready(sc))
 3874                         hn_xpnt_vf_saveifflags(sc);
 3875 
 3876                 if (ifp->if_flags & IFF_UP) {
 3877                         if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
 3878                                 /*
 3879                                  * Caller might hold a mutex, e.g.
 3880                                  * bpf; use busy-wait for the RNDIS
 3881                                  * reply.
 3882                                  */
 3883                                 HN_NO_SLEEPING(sc);
 3884                                 hn_rxfilter_config(sc);
 3885                                 HN_SLEEPING_OK(sc);
 3886 
 3887                                 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
 3888                                         error = hn_xpnt_vf_iocsetflags(sc);
 3889                         } else {
 3890                                 hn_init_locked(sc);
 3891                         }
 3892                 } else {
 3893                         if (ifp->if_drv_flags & IFF_DRV_RUNNING)
 3894                                 hn_stop(sc, false);
 3895                 }
 3896                 sc->hn_if_flags = ifp->if_flags;
 3897 
 3898                 HN_UNLOCK(sc);
 3899                 break;
 3900 
 3901         case SIOCSIFCAP:
 3902                 HN_LOCK(sc);
 3903 
 3904                 if (hn_xpnt_vf_isready(sc)) {
 3905                         ifr_vf = *ifr;
 3906                         strlcpy(ifr_vf.ifr_name, sc->hn_vf_ifp->if_xname,
 3907                             sizeof(ifr_vf.ifr_name));
 3908                         error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf);
 3909                         HN_UNLOCK(sc);
 3910                         break;
 3911                 }
 3912 
 3913                 /*
 3914                  * Fix up requested capabilities w/ supported capabilities,
 3915                  * since the supported capabilities could have been changed.
 3916                  */
 3917                 mask = (ifr->ifr_reqcap & ifp->if_capabilities) ^
 3918                     ifp->if_capenable;
 3919 
 3920                 if (mask & IFCAP_TXCSUM) {
 3921                         ifp->if_capenable ^= IFCAP_TXCSUM;
 3922                         if (ifp->if_capenable & IFCAP_TXCSUM)
 3923                                 ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
 3924                         else
 3925                                 ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
 3926                 }
 3927                 if (mask & IFCAP_TXCSUM_IPV6) {
 3928                         ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
 3929                         if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
 3930                                 ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
 3931                         else
 3932                                 ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
 3933                 }
 3934 
 3935                 /* TODO: flip RNDIS offload parameters for RXCSUM. */
 3936                 if (mask & IFCAP_RXCSUM)
 3937                         ifp->if_capenable ^= IFCAP_RXCSUM;
 3938 #ifdef foo
 3939                 /* We can't diff IPv6 packets from IPv4 packets on RX path. */
 3940                 if (mask & IFCAP_RXCSUM_IPV6)
 3941                         ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
 3942 #endif
 3943 
 3944                 if (mask & IFCAP_LRO)
 3945                         ifp->if_capenable ^= IFCAP_LRO;
 3946 
 3947                 if (mask & IFCAP_TSO4) {
 3948                         ifp->if_capenable ^= IFCAP_TSO4;
 3949                         if (ifp->if_capenable & IFCAP_TSO4)
 3950                                 ifp->if_hwassist |= CSUM_IP_TSO;
 3951                         else
 3952                                 ifp->if_hwassist &= ~CSUM_IP_TSO;
 3953                 }
 3954                 if (mask & IFCAP_TSO6) {
 3955                         ifp->if_capenable ^= IFCAP_TSO6;
 3956                         if (ifp->if_capenable & IFCAP_TSO6)
 3957                                 ifp->if_hwassist |= CSUM_IP6_TSO;
 3958                         else
 3959                                 ifp->if_hwassist &= ~CSUM_IP6_TSO;
 3960                 }
 3961 
 3962                 HN_UNLOCK(sc);
 3963                 break;
 3964 
 3965         case SIOCADDMULTI:
 3966         case SIOCDELMULTI:
 3967                 HN_LOCK(sc);
 3968 
 3969                 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
 3970                         HN_UNLOCK(sc);
 3971                         break;
 3972                 }
 3973                 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
 3974                         /*
 3975                          * Multicast uses mutex; use busy-wait for
 3976                          * the RNDIS reply.
 3977                          */
 3978                         HN_NO_SLEEPING(sc);
 3979                         hn_rxfilter_config(sc);
 3980                         HN_SLEEPING_OK(sc);
 3981                 }
 3982 
 3983                 /* XXX vlan(4) style mcast addr maintenance */
 3984                 if (hn_xpnt_vf_isready(sc)) {
 3985                         int old_if_flags;
 3986 
 3987                         old_if_flags = sc->hn_vf_ifp->if_flags;
 3988                         hn_xpnt_vf_saveifflags(sc);
 3989 
 3990                         if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) &&
 3991                             ((old_if_flags ^ sc->hn_vf_ifp->if_flags) &
 3992                              IFF_ALLMULTI))
 3993                                 error = hn_xpnt_vf_iocsetflags(sc);
 3994                 }
 3995 
 3996                 HN_UNLOCK(sc);
 3997                 break;
 3998 
 3999         case SIOCSIFMEDIA:
 4000         case SIOCGIFMEDIA:
 4001                 HN_LOCK(sc);
 4002                 if (hn_xpnt_vf_isready(sc)) {
 4003                         /*
 4004                          * SIOCGIFMEDIA expects ifmediareq, so don't
 4005                          * create and pass ifr_vf to the VF here; just
 4006                          * replace the ifr_name.
 4007                          */
 4008                         vf_ifp = sc->hn_vf_ifp;
 4009                         strlcpy(ifr->ifr_name, vf_ifp->if_xname,
 4010                             sizeof(ifr->ifr_name));
 4011                         error = vf_ifp->if_ioctl(vf_ifp, cmd, data);
 4012                         /* Restore the ifr_name. */
 4013                         strlcpy(ifr->ifr_name, ifp->if_xname,
 4014                             sizeof(ifr->ifr_name));
 4015                         HN_UNLOCK(sc);
 4016                         break;
 4017                 }
 4018                 HN_UNLOCK(sc);
 4019                 error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
 4020                 break;
 4021 
 4022         case SIOCGIFRSSHASH:
 4023                 ifrh = (struct ifrsshash *)data;
 4024                 HN_LOCK(sc);
 4025                 if (sc->hn_rx_ring_inuse == 1) {
 4026                         HN_UNLOCK(sc);
 4027                         ifrh->ifrh_func = RSS_FUNC_NONE;
 4028                         ifrh->ifrh_types = 0;
 4029                         break;
 4030                 }
 4031 
 4032                 if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
 4033                         ifrh->ifrh_func = RSS_FUNC_TOEPLITZ;
 4034                 else
 4035                         ifrh->ifrh_func = RSS_FUNC_PRIVATE;
 4036                 ifrh->ifrh_types = hn_rss_type_fromndis(sc->hn_rss_hash);
 4037                 HN_UNLOCK(sc);
 4038                 break;
 4039 
 4040         case SIOCGIFRSSKEY:
 4041                 ifrk = (struct ifrsskey *)data;
 4042                 HN_LOCK(sc);
 4043                 if (sc->hn_rx_ring_inuse == 1) {
 4044                         HN_UNLOCK(sc);
 4045                         ifrk->ifrk_func = RSS_FUNC_NONE;
 4046                         ifrk->ifrk_keylen = 0;
 4047                         break;
 4048                 }
 4049                 if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
 4050                         ifrk->ifrk_func = RSS_FUNC_TOEPLITZ;
 4051                 else
 4052                         ifrk->ifrk_func = RSS_FUNC_PRIVATE;
 4053                 ifrk->ifrk_keylen = NDIS_HASH_KEYSIZE_TOEPLITZ;
 4054                 memcpy(ifrk->ifrk_key, sc->hn_rss.rss_key,
 4055                     NDIS_HASH_KEYSIZE_TOEPLITZ);
 4056                 HN_UNLOCK(sc);
 4057                 break;
 4058 
 4059         default:
 4060                 error = ether_ioctl(ifp, cmd, data);
 4061                 break;
 4062         }
 4063         return (error);
 4064 }
 4065 
 4066 static void
 4067 hn_stop(struct hn_softc *sc, bool detaching)
 4068 {
 4069         struct ifnet *ifp = sc->hn_ifp;
 4070         int i;
 4071 
 4072         HN_LOCK_ASSERT(sc);
 4073 
 4074         KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
 4075             ("synthetic parts were not attached"));
 4076 
 4077         /* Clear RUNNING bit ASAP. */
 4078         atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
 4079 
 4080         /* Disable polling. */
 4081         hn_polling(sc, 0);
 4082 
 4083         if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
 4084                 KASSERT(sc->hn_vf_ifp != NULL,
 4085                     ("%s: VF is not attached", ifp->if_xname));
 4086 
 4087                 /* Mark transparent mode VF as disabled. */
 4088                 hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */);
 4089 
 4090                 /*
 4091                  * NOTE:
 4092                  * Datapath setting must happen _before_ bringing
 4093                  * the VF down.
 4094                  */
 4095                 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
 4096 
 4097                 /*
 4098                  * Bring the VF down.
 4099                  */
 4100                 hn_xpnt_vf_saveifflags(sc);
 4101                 sc->hn_vf_ifp->if_flags &= ~IFF_UP;
 4102                 hn_xpnt_vf_iocsetflags(sc);
 4103         }
 4104 
 4105         /* Suspend data transfers. */
 4106         hn_suspend_data(sc);
 4107 
 4108         /* Clear OACTIVE bit. */
 4109         atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
 4110         for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
 4111                 sc->hn_tx_ring[i].hn_oactive = 0;
 4112 
 4113         /*
 4114          * If the non-transparent mode VF is active, make sure
 4115          * that the RX filter still allows packet reception.
 4116          */
 4117         if (!detaching && (sc->hn_flags & HN_FLAG_RXVF))
 4118                 hn_rxfilter_config(sc);
 4119 }
 4120 
 4121 static void
 4122 hn_init_locked(struct hn_softc *sc)
 4123 {
 4124         struct ifnet *ifp = sc->hn_ifp;
 4125         int i;
 4126 
 4127         HN_LOCK_ASSERT(sc);
 4128 
 4129         if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
 4130                 return;
 4131 
 4132         if (ifp->if_drv_flags & IFF_DRV_RUNNING)
 4133                 return;
 4134 
 4135         /* Configure RX filter */
 4136         hn_rxfilter_config(sc);
 4137 
 4138         /* Clear OACTIVE bit. */
 4139         atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
 4140         for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
 4141                 sc->hn_tx_ring[i].hn_oactive = 0;
 4142 
 4143         /* Clear TX 'suspended' bit. */
 4144         hn_resume_tx(sc, sc->hn_tx_ring_inuse);
 4145 
 4146         if (hn_xpnt_vf_isready(sc)) {
 4147                 /* Initialize transparent VF. */
 4148                 hn_xpnt_vf_init(sc);
 4149         }
 4150 
 4151         /* Everything is ready; unleash! */
 4152         atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
 4153 
 4154         /* Re-enable polling if requested. */
 4155         if (sc->hn_pollhz > 0)
 4156                 hn_polling(sc, sc->hn_pollhz);
 4157 }
 4158 
 4159 static void
 4160 hn_init(void *xsc)
 4161 {
 4162         struct hn_softc *sc = xsc;
 4163 
 4164         HN_LOCK(sc);
 4165         hn_init_locked(sc);
 4166         HN_UNLOCK(sc);
 4167 }
 4168 
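      /*
       * The sysctl handlers below share a common pattern: snapshot the
       * current value, let sysctl_handle_int() export it and optionally
       * accept a new value, return early on error or on a read-only
       * access (req->newptr == NULL), validate the new value, and then
       * apply it under HN_LOCK.  E.g. from userland (unit 0 and the
       * value shown are only an illustration):
       *
       *   # sysctl dev.hn.0.lro_length_lim            <- read
       *   # sysctl dev.hn.0.lro_length_lim=65535      <- validate + apply
       */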
 4169 #if __FreeBSD_version >= 1100099
 4170 
 4171 static int
 4172 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
 4173 {
 4174         struct hn_softc *sc = arg1;
 4175         unsigned int lenlim;
 4176         int error;
 4177 
 4178         lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
 4179         error = sysctl_handle_int(oidp, &lenlim, 0, req);
 4180         if (error || req->newptr == NULL)
 4181                 return error;
 4182 
 4183         HN_LOCK(sc);
 4184         if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
 4185             lenlim > TCP_LRO_LENGTH_MAX) {
 4186                 HN_UNLOCK(sc);
 4187                 return EINVAL;
 4188         }
 4189         hn_set_lro_lenlim(sc, lenlim);
 4190         HN_UNLOCK(sc);
 4191 
 4192         return 0;
 4193 }
 4194 
 4195 static int
 4196 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
 4197 {
 4198         struct hn_softc *sc = arg1;
 4199         int ackcnt, error, i;
 4200 
 4201         /*
 4202          * lro_ackcnt_lim is the append count limit;
 4203          * add 1 to turn it into the aggregation limit.
 4204          */
 4205         ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
 4206         error = sysctl_handle_int(oidp, &ackcnt, 0, req);
 4207         if (error || req->newptr == NULL)
 4208                 return error;
 4209 
 4210         if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
 4211                 return EINVAL;
 4212 
 4213         /*
 4214          * Convert aggregation limit back to append
 4215          * count limit.
 4216          */
 4217         --ackcnt;
 4218         HN_LOCK(sc);
 4219         for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
 4220                 sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
 4221         HN_UNLOCK(sc);
 4222         return 0;
 4223 }
 4224 
 4225 #endif
 4226 
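      /*
       * Toggle trusting the host-side checksum verification for one
       * traffic class.  arg2 carries the HN_TRUST_HCSUM_TCP/UDP/IP bit
       * selected at registration time (see hn_create_rx_data()); the
       * handler sets or clears that bit on every RX ring.
       */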
 4227 static int
 4228 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
 4229 {
 4230         struct hn_softc *sc = arg1;
 4231         int hcsum = arg2;
 4232         int on, error, i;
 4233 
 4234         on = 0;
 4235         if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
 4236                 on = 1;
 4237 
 4238         error = sysctl_handle_int(oidp, &on, 0, req);
 4239         if (error || req->newptr == NULL)
 4240                 return error;
 4241 
 4242         HN_LOCK(sc);
 4243         for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
 4244                 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
 4245 
 4246                 if (on)
 4247                         rxr->hn_trust_hcsum |= hcsum;
 4248                 else
 4249                         rxr->hn_trust_hcsum &= ~hcsum;
 4250         }
 4251         HN_UNLOCK(sc);
 4252         return 0;
 4253 }
 4254 
 4255 static int
 4256 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
 4257 {
 4258         struct hn_softc *sc = arg1;
 4259         int chim_size, error;
 4260 
 4261         chim_size = sc->hn_tx_ring[0].hn_chim_size;
 4262         error = sysctl_handle_int(oidp, &chim_size, 0, req);
 4263         if (error || req->newptr == NULL)
 4264                 return error;
 4265 
 4266         if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
 4267                 return EINVAL;
 4268 
 4269         HN_LOCK(sc);
 4270         hn_set_chim_size(sc, chim_size);
 4271         HN_UNLOCK(sc);
 4272         return 0;
 4273 }
 4274 
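      /*
       * Per-ring statistics are exported through a single handler per
       * type: arg2 is a byte offset into struct hn_rx_ring (set up with
       * __offsetof() at registration), a read sums the field across all
       * rings, and a write of any value zeroes the field on every ring.
       * E.g. (hypothetical unit 0):
       *
       *   # sysctl dev.hn.0.csum_tcp=0    <- reset the aggregated counter
       */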
 4275 #if __FreeBSD_version < 1100095
 4276 static int
 4277 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
 4278 {
 4279         struct hn_softc *sc = arg1;
 4280         int ofs = arg2, i, error;
 4281         struct hn_rx_ring *rxr;
 4282         uint64_t stat;
 4283 
 4284         stat = 0;
 4285         for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
 4286                 rxr = &sc->hn_rx_ring[i];
 4287                 stat += *((int *)((uint8_t *)rxr + ofs));
 4288         }
 4289 
 4290         error = sysctl_handle_64(oidp, &stat, 0, req);
 4291         if (error || req->newptr == NULL)
 4292                 return error;
 4293 
 4294         /* Zero out this stat. */
 4295         for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
 4296                 rxr = &sc->hn_rx_ring[i];
 4297                 *((int *)((uint8_t *)rxr + ofs)) = 0;
 4298         }
 4299         return 0;
 4300 }
 4301 #else
 4302 static int
 4303 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
 4304 {
 4305         struct hn_softc *sc = arg1;
 4306         int ofs = arg2, i, error;
 4307         struct hn_rx_ring *rxr;
 4308         uint64_t stat;
 4309 
 4310         stat = 0;
 4311         for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
 4312                 rxr = &sc->hn_rx_ring[i];
 4313                 stat += *((uint64_t *)((uint8_t *)rxr + ofs));
 4314         }
 4315 
 4316         error = sysctl_handle_64(oidp, &stat, 0, req);
 4317         if (error || req->newptr == NULL)
 4318                 return error;
 4319 
 4320         /* Zero out this stat. */
 4321         for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
 4322                 rxr = &sc->hn_rx_ring[i];
 4323                 *((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
 4324         }
 4325         return 0;
 4326 }
 4327 
 4328 #endif
 4329 
 4330 static int
 4331 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
 4332 {
 4333         struct hn_softc *sc = arg1;
 4334         int ofs = arg2, i, error;
 4335         struct hn_rx_ring *rxr;
 4336         u_long stat;
 4337 
 4338         stat = 0;
 4339         for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
 4340                 rxr = &sc->hn_rx_ring[i];
 4341                 stat += *((u_long *)((uint8_t *)rxr + ofs));
 4342         }
 4343 
 4344         error = sysctl_handle_long(oidp, &stat, 0, req);
 4345         if (error || req->newptr == NULL)
 4346                 return error;
 4347 
 4348         /* Zero out this stat. */
 4349         for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
 4350                 rxr = &sc->hn_rx_ring[i];
 4351                 *((u_long *)((uint8_t *)rxr + ofs)) = 0;
 4352         }
 4353         return 0;
 4354 }
 4355 
 4356 static int
 4357 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
 4358 {
 4359         struct hn_softc *sc = arg1;
 4360         int ofs = arg2, i, error;
 4361         struct hn_tx_ring *txr;
 4362         u_long stat;
 4363 
 4364         stat = 0;
 4365         for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
 4366                 txr = &sc->hn_tx_ring[i];
 4367                 stat += *((u_long *)((uint8_t *)txr + ofs));
 4368         }
 4369 
 4370         error = sysctl_handle_long(oidp, &stat, 0, req);
 4371         if (error || req->newptr == NULL)
 4372                 return error;
 4373 
 4374         /* Zero out this stat. */
 4375         for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
 4376                 txr = &sc->hn_tx_ring[i];
 4377                 *((u_long *)((uint8_t *)txr + ofs)) = 0;
 4378         }
 4379         return 0;
 4380 }
 4381 
 4382 static int
 4383 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
 4384 {
 4385         struct hn_softc *sc = arg1;
 4386         int ofs = arg2, i, error, conf;
 4387         struct hn_tx_ring *txr;
 4388 
 4389         txr = &sc->hn_tx_ring[0];
 4390         conf = *((int *)((uint8_t *)txr + ofs));
 4391 
 4392         error = sysctl_handle_int(oidp, &conf, 0, req);
 4393         if (error || req->newptr == NULL)
 4394                 return error;
 4395 
 4396         HN_LOCK(sc);
 4397         for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
 4398                 txr = &sc->hn_tx_ring[i];
 4399                 *((int *)((uint8_t *)txr + ofs)) = conf;
 4400         }
 4401         HN_UNLOCK(sc);
 4402 
 4403         return 0;
 4404 }
 4405 
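      /*
       * TX aggregation knobs: hn_agg_size (bytes) and hn_agg_pkts
       * (packets) are stored in the softc and re-applied through
       * hn_set_txagg(); hn_agg_pktmax and hn_agg_align below are
       * read-only views taken from ring 0.
       */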
 4406 static int
 4407 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
 4408 {
 4409         struct hn_softc *sc = arg1;
 4410         int error, size;
 4411 
 4412         size = sc->hn_agg_size;
 4413         error = sysctl_handle_int(oidp, &size, 0, req);
 4414         if (error || req->newptr == NULL)
 4415                 return (error);
 4416 
 4417         HN_LOCK(sc);
 4418         sc->hn_agg_size = size;
 4419         hn_set_txagg(sc);
 4420         HN_UNLOCK(sc);
 4421 
 4422         return (0);
 4423 }
 4424 
 4425 static int
 4426 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
 4427 {
 4428         struct hn_softc *sc = arg1;
 4429         int error, pkts;
 4430 
 4431         pkts = sc->hn_agg_pkts;
 4432         error = sysctl_handle_int(oidp, &pkts, 0, req);
 4433         if (error || req->newptr == NULL)
 4434                 return (error);
 4435 
 4436         HN_LOCK(sc);
 4437         sc->hn_agg_pkts = pkts;
 4438         hn_set_txagg(sc);
 4439         HN_UNLOCK(sc);
 4440 
 4441         return (0);
 4442 }
 4443 
 4444 static int
 4445 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
 4446 {
 4447         struct hn_softc *sc = arg1;
 4448         int pkts;
 4449 
 4450         pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
 4451         return (sysctl_handle_int(oidp, &pkts, 0, req));
 4452 }
 4453 
 4454 static int
 4455 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
 4456 {
 4457         struct hn_softc *sc = arg1;
 4458         int align;
 4459 
 4460         align = sc->hn_tx_ring[0].hn_agg_align;
 4461         return (sysctl_handle_int(oidp, &align, 0, req));
 4462 }
 4463 
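      /*
       * Channel polling: pollhz == 0 reverts a channel to interrupt
       * driven operation, any other value polls it at pollhz Hz.
       * hn_polling() applies the setting to all sub-channels first and
       * to the primary channel last.
       */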
 4464 static void
 4465 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz)
 4466 {
 4467         if (pollhz == 0)
 4468                 vmbus_chan_poll_disable(chan);
 4469         else
 4470                 vmbus_chan_poll_enable(chan, pollhz);
 4471 }
 4472 
 4473 static void
 4474 hn_polling(struct hn_softc *sc, u_int pollhz)
 4475 {
 4476         int nsubch = sc->hn_rx_ring_inuse - 1;
 4477 
 4478         HN_LOCK_ASSERT(sc);
 4479 
 4480         if (nsubch > 0) {
 4481                 struct vmbus_channel **subch;
 4482                 int i;
 4483 
 4484                 subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
 4485                 for (i = 0; i < nsubch; ++i)
 4486                         hn_chan_polling(subch[i], pollhz);
 4487                 vmbus_subchan_rel(subch, nsubch);
 4488         }
 4489         hn_chan_polling(sc->hn_prichan, pollhz);
 4490 }
 4491 
 4492 static int
 4493 hn_polling_sysctl(SYSCTL_HANDLER_ARGS)
 4494 {
 4495         struct hn_softc *sc = arg1;
 4496         int pollhz, error;
 4497 
 4498         pollhz = sc->hn_pollhz;
 4499         error = sysctl_handle_int(oidp, &pollhz, 0, req);
 4500         if (error || req->newptr == NULL)
 4501                 return (error);
 4502 
 4503         if (pollhz != 0 &&
 4504             (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX))
 4505                 return (EINVAL);
 4506 
 4507         HN_LOCK(sc);
 4508         if (sc->hn_pollhz != pollhz) {
 4509                 sc->hn_pollhz = pollhz;
 4510                 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) &&
 4511                     (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
 4512                         hn_polling(sc, sc->hn_pollhz);
 4513         }
 4514         HN_UNLOCK(sc);
 4515 
 4516         return (0);
 4517 }
 4518 
 4519 static int
 4520 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
 4521 {
 4522         struct hn_softc *sc = arg1;
 4523         char verstr[16];
 4524 
 4525         snprintf(verstr, sizeof(verstr), "%u.%u",
 4526             HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
 4527             HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
 4528         return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
 4529 }
 4530 
 4531 static int
 4532 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
 4533 {
 4534         struct hn_softc *sc = arg1;
 4535         char caps_str[128];
 4536         uint32_t caps;
 4537 
 4538         HN_LOCK(sc);
 4539         caps = sc->hn_caps;
 4540         HN_UNLOCK(sc);
 4541         snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
 4542         return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
 4543 }
 4544 
 4545 static int
 4546 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
 4547 {
 4548         struct hn_softc *sc = arg1;
 4549         char assist_str[128];
 4550         uint32_t hwassist;
 4551 
 4552         HN_LOCK(sc);
 4553         hwassist = sc->hn_ifp->if_hwassist;
 4554         HN_UNLOCK(sc);
 4555         snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
 4556         return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
 4557 }
 4558 
 4559 static int
 4560 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
 4561 {
 4562         struct hn_softc *sc = arg1;
 4563         char filter_str[128];
 4564         uint32_t filter;
 4565 
 4566         HN_LOCK(sc);
 4567         filter = sc->hn_rx_filter;
 4568         HN_UNLOCK(sc);
 4569         snprintf(filter_str, sizeof(filter_str), "%b", filter,
 4570             NDIS_PACKET_TYPES);
 4571         return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
 4572 }
 4573 
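      /*
       * RSC (receive segment coalescing) control: export hn_rsc_ctrl,
       * accept a new value, and re-negotiate the RNDIS offload
       * parameters with the current MTU via hn_rndis_reconf_offload().
       */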
 4574 static int
 4575 hn_rsc_sysctl(SYSCTL_HANDLER_ARGS)
 4576 {
 4577         struct hn_softc *sc = arg1;
 4578         uint32_t mtu;
 4579         int error;
 4580         HN_LOCK(sc);
 4581         error = hn_rndis_get_mtu(sc, &mtu);
 4582         if (error) {
 4583                 if_printf(sc->hn_ifp, "failed to get mtu\n");
 4584                 goto back;
 4585         }
 4586         error = SYSCTL_OUT(req, &(sc->hn_rsc_ctrl), sizeof(sc->hn_rsc_ctrl));
 4587         if (error || req->newptr == NULL)
 4588                 goto back;
 4589 
 4590         error = SYSCTL_IN(req, &(sc->hn_rsc_ctrl), sizeof(sc->hn_rsc_ctrl));
 4591         if (error)
 4592                 goto back;
 4593         error = hn_rndis_reconf_offload(sc, mtu);
 4594 back:
 4595         HN_UNLOCK(sc);
 4596         return (error);
 4597 }
 4598 #ifndef RSS
 4599 
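      /*
       * Export the RSS key as raw bytes.  Writes are refused with EBUSY
       * while a VF is active, since the key is kept in sync with the
       * VF's; a new key triggers hn_rss_reconfig() only when more than
       * one RX ring is in use.
       */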
 4600 static int
 4601 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
 4602 {
 4603         struct hn_softc *sc = arg1;
 4604         int error;
 4605 
 4606         HN_LOCK(sc);
 4607 
 4608         error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
 4609         if (error || req->newptr == NULL)
 4610                 goto back;
 4611 
 4612         if ((sc->hn_flags & HN_FLAG_RXVF) ||
 4613             (hn_xpnt_vf && sc->hn_vf_ifp != NULL)) {
 4614                 /*
 4615                  * The RSS key is synchronized with the VF's; don't
 4616                  * allow users to change it.
 4617                  */
 4618                 error = EBUSY;
 4619                 goto back;
 4620         }
 4621 
 4622         error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
 4623         if (error)
 4624                 goto back;
 4625         sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
 4626 
 4627         if (sc->hn_rx_ring_inuse > 1) {
 4628                 error = hn_rss_reconfig(sc);
 4629         } else {
 4630                 /* Not RSS capable, at least for now; just save the RSS key. */
 4631                 error = 0;
 4632         }
 4633 back:
 4634         HN_UNLOCK(sc);
 4635         return (error);
 4636 }
 4637 
 4638 static int
 4639 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
 4640 {
 4641         struct hn_softc *sc = arg1;
 4642         int error;
 4643 
 4644         HN_LOCK(sc);
 4645 
 4646         error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
 4647         if (error || req->newptr == NULL)
 4648                 goto back;
 4649 
 4650         /*
 4651          * Don't allow RSS indirect table changes if this interface
 4652          * is not currently RSS capable.
 4653          */
 4654         if (sc->hn_rx_ring_inuse == 1) {
 4655                 error = EOPNOTSUPP;
 4656                 goto back;
 4657         }
 4658 
 4659         error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
 4660         if (error)
 4661                 goto back;
 4662         sc->hn_flags |= HN_FLAG_HAS_RSSIND;
 4663 
 4664         hn_rss_ind_fixup(sc);
 4665         error = hn_rss_reconfig(sc);
 4666 back:
 4667         HN_UNLOCK(sc);
 4668         return (error);
 4669 }
 4670 
 4671 #endif  /* !RSS */
 4672 
 4673 static int
 4674 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
 4675 {
 4676         struct hn_softc *sc = arg1;
 4677         char hash_str[128];
 4678         uint32_t hash;
 4679 
 4680         HN_LOCK(sc);
 4681         hash = sc->hn_rss_hash;
 4682         HN_UNLOCK(sc);
 4683         snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
 4684         return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
 4685 }
 4686 
 4687 static int
 4688 hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS)
 4689 {
 4690         struct hn_softc *sc = arg1;
 4691         char hash_str[128];
 4692         uint32_t hash;
 4693 
 4694         HN_LOCK(sc);
 4695         hash = sc->hn_rss_hcap;
 4696         HN_UNLOCK(sc);
 4697         snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
 4698         return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
 4699 }
 4700 
 4701 static int
 4702 hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS)
 4703 {
 4704         struct hn_softc *sc = arg1;
 4705         char hash_str[128];
 4706         uint32_t hash;
 4707 
 4708         HN_LOCK(sc);
 4709         hash = sc->hn_rx_ring[0].hn_mbuf_hash;
 4710         HN_UNLOCK(sc);
 4711         snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
 4712         return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
 4713 }
 4714 
 4715 static int
 4716 hn_vf_sysctl(SYSCTL_HANDLER_ARGS)
 4717 {
 4718         struct hn_softc *sc = arg1;
 4719         char vf_name[IFNAMSIZ + 1];
 4720         struct ifnet *vf_ifp;
 4721 
 4722         HN_LOCK(sc);
 4723         vf_name[0] = '\0';
 4724         vf_ifp = sc->hn_vf_ifp;
 4725         if (vf_ifp != NULL)
 4726                 snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
 4727         HN_UNLOCK(sc);
 4728         return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
 4729 }
 4730 
 4731 static int
 4732 hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS)
 4733 {
 4734         struct hn_softc *sc = arg1;
 4735         char vf_name[IFNAMSIZ + 1];
 4736         struct ifnet *vf_ifp;
 4737 
 4738         HN_LOCK(sc);
 4739         vf_name[0] = '\0';
 4740         vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp;
 4741         if (vf_ifp != NULL)
 4742                 snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
 4743         HN_UNLOCK(sc);
 4744         return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
 4745 }
 4746 
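      /*
       * Dump the global VF map under its read lock: "vflist" emits the
       * names of all VF ifnets that have an hn(4) mapping, while "vfmap"
       * below emits "VF:hn" name pairs.
       */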
 4747 static int
 4748 hn_vflist_sysctl(SYSCTL_HANDLER_ARGS)
 4749 {
 4750         struct rm_priotracker pt;
 4751         struct sbuf *sb;
 4752         int error, i;
 4753         bool first;
 4754 
 4755         error = sysctl_wire_old_buffer(req, 0);
 4756         if (error != 0)
 4757                 return (error);
 4758 
 4759         sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
 4760         if (sb == NULL)
 4761                 return (ENOMEM);
 4762 
 4763         rm_rlock(&hn_vfmap_lock, &pt);
 4764 
 4765         first = true;
 4766         for (i = 0; i < hn_vfmap_size; ++i) {
 4767                 struct epoch_tracker et;
 4768                 struct ifnet *ifp;
 4769 
 4770                 if (hn_vfmap[i] == NULL)
 4771                         continue;
 4772 
 4773                 NET_EPOCH_ENTER(et);
 4774                 ifp = ifnet_byindex(i);
 4775                 if (ifp != NULL) {
 4776                         if (first)
 4777                                 sbuf_printf(sb, "%s", ifp->if_xname);
 4778                         else
 4779                                 sbuf_printf(sb, " %s", ifp->if_xname);
 4780                         first = false;
 4781                 }
 4782                 NET_EPOCH_EXIT(et);
 4783         }
 4784 
 4785         rm_runlock(&hn_vfmap_lock, &pt);
 4786 
 4787         error = sbuf_finish(sb);
 4788         sbuf_delete(sb);
 4789         return (error);
 4790 }
 4791 
 4792 static int
 4793 hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS)
 4794 {
 4795         struct rm_priotracker pt;
 4796         struct sbuf *sb;
 4797         int error, i;
 4798         bool first;
 4799 
 4800         error = sysctl_wire_old_buffer(req, 0);
 4801         if (error != 0)
 4802                 return (error);
 4803 
 4804         sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
 4805         if (sb == NULL)
 4806                 return (ENOMEM);
 4807 
 4808         rm_rlock(&hn_vfmap_lock, &pt);
 4809 
 4810         first = true;
 4811         for (i = 0; i < hn_vfmap_size; ++i) {
 4812                 struct epoch_tracker et;
 4813                 struct ifnet *ifp, *hn_ifp;
 4814 
 4815                 hn_ifp = hn_vfmap[i];
 4816                 if (hn_ifp == NULL)
 4817                         continue;
 4818 
 4819                 NET_EPOCH_ENTER(et);
 4820                 ifp = ifnet_byindex(i);
 4821                 if (ifp != NULL) {
 4822                         if (first) {
 4823                                 sbuf_printf(sb, "%s:%s", ifp->if_xname,
 4824                                     hn_ifp->if_xname);
 4825                         } else {
 4826                                 sbuf_printf(sb, " %s:%s", ifp->if_xname,
 4827                                     hn_ifp->if_xname);
 4828                         }
 4829                         first = false;
 4830                 }
 4831                 NET_EPOCH_EXIT(et);
 4832         }
 4833 
 4834         rm_runlock(&hn_vfmap_lock, &pt);
 4835 
 4836         error = sbuf_finish(sb);
 4837         sbuf_delete(sb);
 4838         return (error);
 4839 }
 4840 
 4841 static int
 4842 hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS)
 4843 {
 4844         struct hn_softc *sc = arg1;
 4845         int error, onoff = 0;
 4846 
 4847         if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF)
 4848                 onoff = 1;
 4849         error = sysctl_handle_int(oidp, &onoff, 0, req);
 4850         if (error || req->newptr == NULL)
 4851                 return (error);
 4852 
 4853         HN_LOCK(sc);
 4854         /* NOTE: hn_vf_lock for hn_transmit() */
 4855         rm_wlock(&sc->hn_vf_lock);
 4856         if (onoff)
 4857                 sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
 4858         else
 4859                 sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF;
 4860         rm_wunlock(&sc->hn_vf_lock);
 4861         HN_UNLOCK(sc);
 4862 
 4863         return (0);
 4864 }
 4865 
 4866 static int
 4867 hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS)
 4868 {
 4869         struct hn_softc *sc = arg1;
 4870         int enabled = 0;
 4871 
 4872         if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
 4873                 enabled = 1;
 4874         return (sysctl_handle_int(oidp, &enabled, 0, req));
 4875 }
 4876 
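      /*
       * Sanity-check a received IPv4 packet: the IP header, and for
       * TCP/UDP the transport header as well, must be sane and fully
       * contained in the first mbuf, and the packet must not be a
       * fragment.  Returns the IP protocol number on success and
       * IPPROTO_DONE otherwise.
       */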
 4877 static int
 4878 hn_check_iplen(const struct mbuf *m, int hoff)
 4879 {
 4880         const struct ip *ip;
 4881         int len, iphlen, iplen;
 4882         const struct tcphdr *th;
 4883         int thoff;                              /* TCP data offset */
 4884 
 4885         len = hoff + sizeof(struct ip);
 4886 
 4887         /* The packet must be at least the size of an IP header. */
 4888         if (m->m_pkthdr.len < len)
 4889                 return IPPROTO_DONE;
 4890 
 4891         /* The fixed IP header must reside completely in the first mbuf. */
 4892         if (m->m_len < len)
 4893                 return IPPROTO_DONE;
 4894 
 4895         ip = mtodo(m, hoff);
 4896 
 4897         /* Bounds-check the packet's stated IP header length. */
 4898         iphlen = ip->ip_hl << 2;
 4899         if (iphlen < sizeof(struct ip))         /* minimum header length */
 4900                 return IPPROTO_DONE;
 4901 
 4902         /* The full IP header must reside completely in the one mbuf. */
 4903         if (m->m_len < hoff + iphlen)
 4904                 return IPPROTO_DONE;
 4905 
 4906         iplen = ntohs(ip->ip_len);
 4907 
 4908         /*
 4909          * Check that the amount of data in the buffers is at
 4910          * least as much as the IP header would have us expect.
 4911          */
 4912         if (m->m_pkthdr.len < hoff + iplen)
 4913                 return IPPROTO_DONE;
 4914 
 4915         /*
 4916          * Ignore IP fragments.
 4917          */
 4918         if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
 4919                 return IPPROTO_DONE;
 4920 
 4921         /*
 4922          * The TCP/IP or UDP/IP header must be entirely contained within
 4923          * the first fragment of a packet.
 4924          */
 4925         switch (ip->ip_p) {
 4926         case IPPROTO_TCP:
 4927                 if (iplen < iphlen + sizeof(struct tcphdr))
 4928                         return IPPROTO_DONE;
 4929                 if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
 4930                         return IPPROTO_DONE;
 4931                 th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
 4932                 thoff = th->th_off << 2;
 4933                 if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
 4934                         return IPPROTO_DONE;
 4935                 if (m->m_len < hoff + iphlen + thoff)
 4936                         return IPPROTO_DONE;
 4937                 break;
 4938         case IPPROTO_UDP:
 4939                 if (iplen < iphlen + sizeof(struct udphdr))
 4940                         return IPPROTO_DONE;
 4941                 if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
 4942                         return IPPROTO_DONE;
 4943                 break;
 4944         default:
 4945                 if (iplen < iphlen)
 4946                         return IPPROTO_DONE;
 4947                 break;
 4948         }
 4949         return ip->ip_p;
 4950 }
 4951 
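      /*
       * Classify a received frame: *l3proto is set to the ethertype
       * (looking through an 802.1Q tag if present), and for IPv4
       * *l4proto is set to the protocol validated by hn_check_iplen();
       * everything else reports IPPROTO_DONE.
       */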
 4952 static void
 4953 hn_rxpkt_proto(const struct mbuf *m_new, int *l3proto, int *l4proto)
 4954 {
 4955         const struct ether_header *eh;
 4956         uint16_t etype;
 4957         int hoff;
 4958 
 4959         hoff = sizeof(*eh);
 4960         /* Checked at the beginning of this function. */
 4961         KASSERT(m_new->m_len >= hoff, ("not ethernet frame"));
 4962 
 4963         eh = mtod(m_new, const struct ether_header *);
 4964         etype = ntohs(eh->ether_type);
 4965         if (etype == ETHERTYPE_VLAN) {
 4966                 const struct ether_vlan_header *evl;
 4967 
 4968                 hoff = sizeof(*evl);
 4969                 if (m_new->m_len < hoff)
 4970                         return;
 4971                 evl = mtod(m_new, const struct ether_vlan_header *);
 4972                 etype = ntohs(evl->evl_proto);
 4973         }
 4974         *l3proto = etype;
 4975 
 4976         if (etype == ETHERTYPE_IP)
 4977                 *l4proto = hn_check_iplen(m_new, hoff);
 4978         else
 4979                 *l4proto = IPPROTO_DONE;
 4980 }
 4981 
 4982 static int
 4983 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
 4984 {
 4985         struct sysctl_oid_list *child;
 4986         struct sysctl_ctx_list *ctx;
 4987         device_t dev = sc->hn_dev;
 4988 #if defined(INET) || defined(INET6)
 4989 #if __FreeBSD_version >= 1100095
 4990         int lroent_cnt;
 4991 #endif
 4992 #endif
 4993         int i;
 4994 
 4995         /*
 4996          * Create RXBUF for reception.
 4997          *
 4998          * NOTE:
 4999          * - It is shared by all channels.
 5000          * - A large enough buffer is allocated; certain versions of
 5001          *   the NVS may further limit the usable space.
 5002          */
 5003         sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
 5004             PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma,
 5005             BUS_DMA_WAITOK | BUS_DMA_ZERO);
 5006         if (sc->hn_rxbuf == NULL) {
 5007                 device_printf(sc->hn_dev, "allocate rxbuf failed\n");
 5008                 return (ENOMEM);
 5009         }
 5010 
 5011         sc->hn_rx_ring_cnt = ring_cnt;
 5012         sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
 5013 
 5014         sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
 5015             M_DEVBUF, M_WAITOK | M_ZERO);
 5016 
 5017 #if defined(INET) || defined(INET6)
 5018 #if __FreeBSD_version >= 1100095
 5019         lroent_cnt = hn_lro_entry_count;
 5020         if (lroent_cnt < TCP_LRO_ENTRIES)
 5021                 lroent_cnt = TCP_LRO_ENTRIES;
 5022         if (bootverbose)
 5023                 device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
 5024 #endif
 5025 #endif  /* INET || INET6 */
 5026 
 5027         ctx = device_get_sysctl_ctx(dev);
 5028         child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
 5029 
 5030         /* Create dev.hn.UNIT.rx sysctl tree */
 5031         sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
 5032             CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
 5033 
 5034         for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
 5035                 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
 5036 
 5037                 rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
 5038                     PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE,
 5039                     &rxr->hn_br_dma, BUS_DMA_WAITOK);
 5040                 if (rxr->hn_br == NULL) {
 5041                         device_printf(dev, "allocate bufring failed\n");
 5042                         return (ENOMEM);
 5043                 }
 5044 
 5045                 if (hn_trust_hosttcp)
 5046                         rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
 5047                 if (hn_trust_hostudp)
 5048                         rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
 5049                 if (hn_trust_hostip)
 5050                         rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
 5051                 rxr->hn_mbuf_hash = NDIS_HASH_ALL;
 5052                 rxr->hn_ifp = sc->hn_ifp;
 5053                 if (i < sc->hn_tx_ring_cnt)
 5054                         rxr->hn_txr = &sc->hn_tx_ring[i];
 5055                 rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
 5056                 rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
 5057                 rxr->hn_rx_idx = i;
 5058                 rxr->hn_rxbuf = sc->hn_rxbuf;
 5059 
 5060                 /*
 5061                  * Initialize LRO.
 5062                  */
 5063 #if defined(INET) || defined(INET6)
 5064 #if __FreeBSD_version >= 1100095
 5065                 tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
 5066                     hn_lro_mbufq_depth);
 5067 #else
 5068                 tcp_lro_init(&rxr->hn_lro);
 5069                 rxr->hn_lro.ifp = sc->hn_ifp;
 5070 #endif
 5071 #if __FreeBSD_version >= 1100099
 5072                 rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
 5073                 rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
 5074 #endif
 5075 #endif  /* INET || INET6 */
 5076 
 5077                 if (sc->hn_rx_sysctl_tree != NULL) {
 5078                         char name[16];
 5079 
 5080                         /*
 5081                          * Create per RX ring sysctl tree:
 5082                          * dev.hn.UNIT.rx.RINGID
 5083                          */
 5084                         snprintf(name, sizeof(name), "%d", i);
 5085                         rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
 5086                             SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
 5087                             OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
 5088 
 5089                         if (rxr->hn_rx_sysctl_tree != NULL) {
 5090                                 SYSCTL_ADD_ULONG(ctx,
 5091                                     SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
 5092                                     OID_AUTO, "packets",
 5093                                     CTLFLAG_RW | CTLFLAG_STATS, &rxr->hn_pkts,
 5094                                     "# of packets received");
 5095                                 SYSCTL_ADD_ULONG(ctx,
 5096                                     SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
 5097                                     OID_AUTO, "rss_pkts",
 5098                                     CTLFLAG_RW | CTLFLAG_STATS,
 5099                                     &rxr->hn_rss_pkts,
 5100                                     "# of packets w/ RSS info received");
 5101                                 SYSCTL_ADD_ULONG(ctx,
 5102                                     SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
 5103                                     OID_AUTO, "rsc_pkts",
 5104                                     CTLFLAG_RW | CTLFLAG_STATS,
 5105                                     &rxr->hn_rsc_pkts,
 5106                                     "# of RSC packets received");
 5107                                 SYSCTL_ADD_ULONG(ctx,
 5108                                     SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
 5109                                     OID_AUTO, "rsc_drop",
 5110                                     CTLFLAG_RW | CTLFLAG_STATS,
 5111                                     &rxr->hn_rsc_drop,
 5112                                     "# of RSC fragments dropped");
 5113                                 SYSCTL_ADD_INT(ctx,
 5114                                     SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
 5115                                     OID_AUTO, "pktbuf_len", CTLFLAG_RD,
 5116                                     &rxr->hn_pktbuf_len, 0,
 5117                                     "Temporary channel packet buffer length");
 5118                         }
 5119                 }
 5120         }
 5121 
 5122         SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
 5123             CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
 5124             __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
 5125 #if __FreeBSD_version < 1100095
 5126             hn_rx_stat_int_sysctl,
 5127 #else
 5128             hn_rx_stat_u64_sysctl,
 5129 #endif
 5130             "LU", "LRO queued");
 5131         SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
 5132             CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
 5133             __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
 5134 #if __FreeBSD_version < 1100095
 5135             hn_rx_stat_int_sysctl,
 5136 #else
 5137             hn_rx_stat_u64_sysctl,
 5138 #endif
 5139             "LU", "LRO flushed");
 5140         SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
 5141             CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
 5142             __offsetof(struct hn_rx_ring, hn_lro_tried),
 5143             hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
 5144 #if __FreeBSD_version >= 1100099
 5145         SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
 5146             CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
 5147             hn_lro_lenlim_sysctl, "IU",
 5148             "Max # of data bytes to be aggregated by LRO");
 5149         SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
 5150             CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
 5151             hn_lro_ackcnt_sysctl, "I",
 5152             "Max # of ACKs to be aggregated by LRO");
 5153 #endif
 5154         SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
 5155             CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
 5156             hn_trust_hcsum_sysctl, "I",
 5157             "Trust tcp segment verification on host side "
 5158             "when csum info is missing");
 5159         SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
 5160             CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
 5161             hn_trust_hcsum_sysctl, "I",
 5162             "Trust udp datagram verification on host side "
 5163             "when csum info is missing");
 5164         SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
 5165             CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
 5166             hn_trust_hcsum_sysctl, "I",
 5167             "Trust ip packet verification on host side "
 5168             "when csum info is missing");
 5169         SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
 5170             CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
 5171             __offsetof(struct hn_rx_ring, hn_csum_ip),
 5172             hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
 5173         SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
 5174             CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
 5175             __offsetof(struct hn_rx_ring, hn_csum_tcp),
 5176             hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
 5177         SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
 5178             CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
 5179             __offsetof(struct hn_rx_ring, hn_csum_udp),
 5180             hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
 5181         SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
 5182             CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
 5183             __offsetof(struct hn_rx_ring, hn_csum_trusted),
 5184             hn_rx_stat_ulong_sysctl, "LU",
 5185             "# of packets that we trust host's csum verification");
 5186         SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
 5187             CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
 5188             __offsetof(struct hn_rx_ring, hn_small_pkts),
 5189             hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
 5190         SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
 5191             CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
 5192             __offsetof(struct hn_rx_ring, hn_ack_failed),
 5193             hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
 5194         SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
 5195             CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
 5196         SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
 5197             CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
 5198 
 5199         return (0);
 5200 }
 5201 
 5202 static void
 5203 hn_destroy_rx_data(struct hn_softc *sc)
 5204 {
 5205         int i;
 5206 
 5207         if (sc->hn_rxbuf != NULL) {
 5208                 if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0)
 5209                         hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
 5210                 else
 5211                         device_printf(sc->hn_dev, "RXBUF is referenced\n");
 5212                 sc->hn_rxbuf = NULL;
 5213         }
 5214 
 5215         if (sc->hn_rx_ring_cnt == 0)
 5216                 return;
 5217 
 5218         for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
 5219                 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
 5220 
 5221                 if (rxr->hn_br == NULL)
 5222                         continue;
 5223                 if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) {
 5224                         hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
 5225                 } else {
 5226                         device_printf(sc->hn_dev,
 5227                             "%dth channel bufring is referenced\n", i);
 5228                 }
 5229                 rxr->hn_br = NULL;
 5230 
 5231 #if defined(INET) || defined(INET6)
 5232                 tcp_lro_free(&rxr->hn_lro);
 5233 #endif
 5234                 free(rxr->hn_pktbuf, M_DEVBUF);
 5235         }
 5236         free(sc->hn_rx_ring, M_DEVBUF);
 5237         sc->hn_rx_ring = NULL;
 5238 
 5239         sc->hn_rx_ring_cnt = 0;
 5240         sc->hn_rx_ring_inuse = 0;
 5241 }
 5242 
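      /*
       * Set up one TX ring: allocate the txdesc array, give every txdesc
       * a pre-mapped RNDIS packet message plus a DMA map for its data,
       * pick the transmission taskqueue, and hang per-ring statistics
       * off dev.hn.UNIT.tx.RINGID.
       */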
 5243 static int
 5244 hn_tx_ring_create(struct hn_softc *sc, int id)
 5245 {
 5246         struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
 5247         device_t dev = sc->hn_dev;
 5248         bus_dma_tag_t parent_dtag;
 5249         int error, i;
 5250 
 5251         txr->hn_sc = sc;
 5252         txr->hn_tx_idx = id;
 5253 
 5254 #ifndef HN_USE_TXDESC_BUFRING
 5255         mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
 5256 #endif
 5257         mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
 5258 
 5259         txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
 5260         txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
 5261             M_DEVBUF, M_WAITOK | M_ZERO);
 5262 #ifndef HN_USE_TXDESC_BUFRING
 5263         SLIST_INIT(&txr->hn_txlist);
 5264 #else
 5265         txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
 5266             M_WAITOK, &txr->hn_tx_lock);
 5267 #endif
 5268 
 5269         if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) {
 5270                 txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ(
 5271                     device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id));
 5272         } else {
 5273                 txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt];
 5274         }
 5275 
 5276 #ifdef HN_IFSTART_SUPPORT
 5277         if (hn_use_if_start) {
 5278                 txr->hn_txeof = hn_start_txeof;
 5279                 TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
 5280                 TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
 5281         } else
 5282 #endif
 5283         {
 5284                 int br_depth;
 5285 
 5286                 txr->hn_txeof = hn_xmit_txeof;
 5287                 TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
 5288                 TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
 5289 
 5290                 br_depth = hn_get_txswq_depth(txr);
 5291                 txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
 5292                     M_WAITOK, &txr->hn_tx_lock);
 5293         }
 5294 
 5295         txr->hn_direct_tx_size = hn_direct_tx_size;
 5296 
 5297         /*
 5298          * Always schedule transmission instead of trying to do direct
 5299          * transmission.  This gives the best performance so far.
 5300          */
 5301         txr->hn_sched_tx = 1;
 5302 
 5303         parent_dtag = bus_get_dma_tag(dev);
 5304 
 5305         /* DMA tag for RNDIS packet messages. */
 5306         error = bus_dma_tag_create(parent_dtag, /* parent */
 5307             HN_RNDIS_PKT_ALIGN,         /* alignment */
 5308             HN_RNDIS_PKT_BOUNDARY,      /* boundary */
 5309             BUS_SPACE_MAXADDR,          /* lowaddr */
 5310             BUS_SPACE_MAXADDR,          /* highaddr */
 5311             NULL, NULL,                 /* filter, filterarg */
 5312             HN_RNDIS_PKT_LEN,           /* maxsize */
 5313             1,                          /* nsegments */
 5314             HN_RNDIS_PKT_LEN,           /* maxsegsize */
 5315             0,                          /* flags */
 5316             NULL,                       /* lockfunc */
 5317             NULL,                       /* lockfuncarg */
 5318             &txr->hn_tx_rndis_dtag);
 5319         if (error) {
 5320                 device_printf(dev, "failed to create rndis dmatag\n");
 5321                 return error;
 5322         }
 5323 
 5324         /* DMA tag for data. */
 5325         error = bus_dma_tag_create(parent_dtag, /* parent */
 5326             1,                          /* alignment */
 5327             HN_TX_DATA_BOUNDARY,        /* boundary */
 5328             BUS_SPACE_MAXADDR,          /* lowaddr */
 5329             BUS_SPACE_MAXADDR,          /* highaddr */
 5330             NULL, NULL,                 /* filter, filterarg */
 5331             HN_TX_DATA_MAXSIZE,         /* maxsize */
 5332             HN_TX_DATA_SEGCNT_MAX,      /* nsegments */
 5333             HN_TX_DATA_SEGSIZE,         /* maxsegsize */
 5334             0,                          /* flags */
 5335             NULL,                       /* lockfunc */
 5336             NULL,                       /* lockfuncarg */
 5337             &txr->hn_tx_data_dtag);
 5338         if (error) {
 5339                 device_printf(dev, "failed to create data dmatag\n");
 5340                 return error;
 5341         }
 5342 
 5343         for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
 5344                 struct hn_txdesc *txd = &txr->hn_txdesc[i];
 5345 
 5346                 txd->txr = txr;
 5347                 txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
 5348                 STAILQ_INIT(&txd->agg_list);
 5349 
 5350                 /*
 5351                  * Allocate and load RNDIS packet message.
 5352                  */
 5353                 error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
 5354                     (void **)&txd->rndis_pkt,
 5355                     BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
 5356                     &txd->rndis_pkt_dmap);
 5357                 if (error) {
 5358                         device_printf(dev,
 5359                             "failed to allocate rndis_packet_msg, %d\n", i);
 5360                         return error;
 5361                 }
 5362 
 5363                 error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
 5364                     txd->rndis_pkt_dmap,
 5365                     txd->rndis_pkt, HN_RNDIS_PKT_LEN,
 5366                     hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
 5367                     BUS_DMA_NOWAIT);
 5368                 if (error) {
 5369                         device_printf(dev,
 5370                             "failed to load rndis_packet_msg, %d\n", i);
 5371                         bus_dmamem_free(txr->hn_tx_rndis_dtag,
 5372                             txd->rndis_pkt, txd->rndis_pkt_dmap);
 5373                         return error;
 5374                 }
 5375 
 5376                 /* DMA map for TX data. */
 5377                 error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
 5378                     &txd->data_dmap);
 5379                 if (error) {
 5380                         device_printf(dev,
 5381                             "failed to allocate tx data dmamap\n");
 5382                         bus_dmamap_unload(txr->hn_tx_rndis_dtag,
 5383                             txd->rndis_pkt_dmap);
 5384                         bus_dmamem_free(txr->hn_tx_rndis_dtag,
 5385                             txd->rndis_pkt, txd->rndis_pkt_dmap);
 5386                         return error;
 5387                 }
 5388 
 5389                 /* All set, put it to list */
 5390                 txd->flags |= HN_TXD_FLAG_ONLIST;
 5391 #ifndef HN_USE_TXDESC_BUFRING
 5392                 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
 5393 #else
 5394                 buf_ring_enqueue(txr->hn_txdesc_br, txd);
 5395 #endif
 5396         }
 5397         txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
 5398 
 5399         if (sc->hn_tx_sysctl_tree != NULL) {
 5400                 struct sysctl_oid_list *child;
 5401                 struct sysctl_ctx_list *ctx;
 5402                 char name[16];
 5403 
 5404                 /*
 5405                  * Create per TX ring sysctl tree:
 5406                  * dev.hn.UNIT.tx.RINGID
 5407                  */
 5408                 ctx = device_get_sysctl_ctx(dev);
 5409                 child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
 5410 
 5411                 snprintf(name, sizeof(name), "%d", id);
 5412                 txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
 5413                     name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
 5414 
 5415                 if (txr->hn_tx_sysctl_tree != NULL) {
 5416                         child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
 5417 
 5418 #ifdef HN_DEBUG
 5419                         SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
 5420                             CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
 5421                             "# of available TX descs");
 5422 #endif
 5423 #ifdef HN_IFSTART_SUPPORT
 5424                         if (!hn_use_if_start)
 5425 #endif
 5426                         {
 5427                                 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
 5428                                     CTLFLAG_RD, &txr->hn_oactive, 0,
 5429                                     "over active");
 5430                         }
 5431                         SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
 5432                             CTLFLAG_RW | CTLFLAG_STATS, &txr->hn_pkts,
 5433                             "# of packets transmitted");
 5434                         SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
 5435                             CTLFLAG_RW | CTLFLAG_STATS, &txr->hn_sends,
 5436                             "# of sends");
 5437                 }
 5438         }
 5439 
 5440         return 0;
 5441 }
 5442 
 5443 static void
 5444 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
 5445 {
 5446         struct hn_tx_ring *txr = txd->txr;
 5447 
 5448         KASSERT(txd->m == NULL, ("still has mbuf installed"));
 5449         KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
 5450 
 5451         bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
 5452         bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
 5453             txd->rndis_pkt_dmap);
 5454         bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
 5455 }
 5456 
 5457 static void
 5458 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd)
 5459 {
 5460 
 5461         KASSERT(txd->refs == 0 || txd->refs == 1,
 5462             ("invalid txd refs %d", txd->refs));
 5463 
 5464         /* Aggregated txds will be freed by their aggregating txd. */
 5465         if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) {
 5466                 int freed __diagused;
 5467 
 5468                 freed = hn_txdesc_put(txr, txd);
 5469                 KASSERT(freed, ("can't free txdesc"));
 5470         }
 5471 }
 5472 
 5473 static void
 5474 hn_tx_ring_destroy(struct hn_tx_ring *txr)
 5475 {
 5476         int i;
 5477 
 5478         if (txr->hn_txdesc == NULL)
 5479                 return;
 5480 
 5481         /*
 5482          * NOTE:
 5483          * Because the freeing of aggregated txds will be deferred
 5484          * to the aggregating txd, two passes are used here:
 5485          * - The first pass GCs any pending txds.  This GC is necessary,
 5486          *   since if the channels are revoked, the hypervisor will not
 5487          *   deliver send-done for all pending txds.
 5488          * - The second pass frees the busdma resources, i.e. it runs
 5489          *   after all txds have been freed.
 5490          */
 5491         for (i = 0; i < txr->hn_txdesc_cnt; ++i)
 5492                 hn_txdesc_gc(txr, &txr->hn_txdesc[i]);
 5493         for (i = 0; i < txr->hn_txdesc_cnt; ++i)
 5494                 hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]);
 5495 
 5496         if (txr->hn_tx_data_dtag != NULL)
 5497                 bus_dma_tag_destroy(txr->hn_tx_data_dtag);
 5498         if (txr->hn_tx_rndis_dtag != NULL)
 5499                 bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
 5500 
 5501 #ifdef HN_USE_TXDESC_BUFRING
 5502         buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
 5503 #endif
 5504 
 5505         free(txr->hn_txdesc, M_DEVBUF);
 5506         txr->hn_txdesc = NULL;
 5507 
 5508         if (txr->hn_mbuf_br != NULL)
 5509                 buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
 5510 
 5511 #ifndef HN_USE_TXDESC_BUFRING
 5512         mtx_destroy(&txr->hn_txlist_spin);
 5513 #endif
 5514         mtx_destroy(&txr->hn_tx_lock);
 5515 }
 5516 
 5517 static int
 5518 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
 5519 {
 5520         struct sysctl_oid_list *child;
 5521         struct sysctl_ctx_list *ctx;
 5522         int i;
 5523 
 5524         /*
 5525          * Create TXBUF for chimney sending.
 5526          *
 5527          * NOTE: It is shared by all channels.
 5528          */
 5529         sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
 5530             PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
 5531             BUS_DMA_WAITOK | BUS_DMA_ZERO);
 5532         if (sc->hn_chim == NULL) {
 5533                 device_printf(sc->hn_dev, "allocate txbuf failed\n");
 5534                 return (ENOMEM);
 5535         }
 5536 
 5537         sc->hn_tx_ring_cnt = ring_cnt;
 5538         sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
 5539 
 5540         sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
 5541             M_DEVBUF, M_WAITOK | M_ZERO);
 5542 
 5543         ctx = device_get_sysctl_ctx(sc->hn_dev);
 5544         child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
 5545 
 5546         /* Create dev.hn.UNIT.tx sysctl tree */
 5547         sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
 5548             CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
 5549 
 5550         for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
 5551                 int error;
 5552 
 5553                 error = hn_tx_ring_create(sc, i);
 5554                 if (error)
 5555                         return (error);
 5556         }
 5557 
 5558         SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
 5559             CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
 5560             __offsetof(struct hn_tx_ring, hn_no_txdescs),
 5561             hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
 5562         SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
 5563             CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
 5564             __offsetof(struct hn_tx_ring, hn_send_failed),
 5565             hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v send failures");
 5566         SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
 5567             CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
 5568             __offsetof(struct hn_tx_ring, hn_txdma_failed),
 5569             hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failures");
 5570         SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
 5571             CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
 5572             __offsetof(struct hn_tx_ring, hn_flush_failed),
 5573             hn_tx_stat_ulong_sysctl, "LU",
 5574             "# of packet transmission aggregation flush failures");
 5575         SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
 5576             CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
 5577             __offsetof(struct hn_tx_ring, hn_tx_collapsed),
 5578             hn_tx_stat_ulong_sysctl, "LU", "# of TX mbufs collapsed");
 5579         SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
 5580             CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
 5581             __offsetof(struct hn_tx_ring, hn_tx_chimney),
 5582             hn_tx_stat_ulong_sysctl, "LU", "# of chimney sends");
 5583         SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
 5584             CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
 5585             __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
 5586             hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
 5587         SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
 5588             CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
 5589             "# of total TX descs");
 5590         SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
 5591             CTLFLAG_RD, &sc->hn_chim_szmax, 0,
 5592             "Chimney send packet size upper boundary");
 5593         SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
 5594             CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
 5595             hn_chim_size_sysctl, "I", "Chimney send packet size limit");
 5596         SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
 5597             CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
 5598             __offsetof(struct hn_tx_ring, hn_direct_tx_size),
 5599             hn_tx_conf_int_sysctl, "I",
 5600             "Size of the packet for direct transmission");
 5601         SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
 5602             CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
 5603             __offsetof(struct hn_tx_ring, hn_sched_tx),
 5604             hn_tx_conf_int_sysctl, "I",
 5605             "Always schedule transmission "
 5606             "instead of doing direct transmission");
 5607         SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
 5608             CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
 5609         SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
 5610             CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
 5611         SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
 5612             CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
 5613             "Applied packet transmission aggregation size");
 5614         SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
 5615             CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
 5616             hn_txagg_pktmax_sysctl, "I",
 5617             "Applied packet transmission aggregation packets");
 5618         SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
 5619             CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
 5620             hn_txagg_align_sysctl, "I",
 5621             "Applied packet transmission aggregation alignment");
 5622 
 5623         return (0);
 5624 }
 5625 
 5626 static void
 5627 hn_set_chim_size(struct hn_softc *sc, int chim_size)
 5628 {
 5629         int i;
 5630 
 5631         for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
 5632                 sc->hn_tx_ring[i].hn_chim_size = chim_size;
 5633 }
 5634 
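/*
 * Clamp the requested TSO size into the range the NDIS backend can
 * accept: at least hn_ndis_tso_sgmin MTU-sized segments and at most
 * min(IP_MAXPACKET, hn_ndis_tso_szmax), minus the Ethernet and VLAN
 * header bytes.  For example, with sgmin 2 and a 1500 byte MTU the
 * lower bound would be 3000 bytes.  When a transparent VF is ready,
 * its if_hw_tsomax further caps the result.
 */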
 5635 static void
 5636 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
 5637 {
 5638         struct ifnet *ifp = sc->hn_ifp;
 5639         u_int hw_tsomax;
 5640         int tso_minlen;
 5641 
 5642         HN_LOCK_ASSERT(sc);
 5643 
 5644         if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
 5645                 return;
 5646 
 5647         KASSERT(sc->hn_ndis_tso_sgmin >= 2,
 5648             ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
 5649         tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
 5650 
 5651         KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
 5652             sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
 5653             ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
 5654 
 5655         if (tso_maxlen < tso_minlen)
 5656                 tso_maxlen = tso_minlen;
 5657         else if (tso_maxlen > IP_MAXPACKET)
 5658                 tso_maxlen = IP_MAXPACKET;
 5659         if (tso_maxlen > sc->hn_ndis_tso_szmax)
 5660                 tso_maxlen = sc->hn_ndis_tso_szmax;
 5661         hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
 5662 
 5663         if (hn_xpnt_vf_isready(sc)) {
 5664                 if (hw_tsomax > sc->hn_vf_ifp->if_hw_tsomax)
 5665                         hw_tsomax = sc->hn_vf_ifp->if_hw_tsomax;
 5666         }
 5667         ifp->if_hw_tsomax = hw_tsomax;
 5668         if (bootverbose)
 5669                 if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
 5670 }
 5671 
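/*
 * Propagate negotiated capabilities to the TX rings: apply the chimney
 * size limit (honoring the hn_tx_chimney_size tunable), translate the
 * HN_CAP_* checksum capability bits into CSUM_* assist flags, and mark
 * the rings for HASHVAL pktinfo if the host supports it.
 */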
 5672 static void
 5673 hn_fixup_tx_data(struct hn_softc *sc)
 5674 {
 5675         uint64_t csum_assist;
 5676         int i;
 5677 
 5678         hn_set_chim_size(sc, sc->hn_chim_szmax);
 5679         if (hn_tx_chimney_size > 0 &&
 5680             hn_tx_chimney_size < sc->hn_chim_szmax)
 5681                 hn_set_chim_size(sc, hn_tx_chimney_size);
 5682 
 5683         csum_assist = 0;
 5684         if (sc->hn_caps & HN_CAP_IPCS)
 5685                 csum_assist |= CSUM_IP;
 5686         if (sc->hn_caps & HN_CAP_TCP4CS)
 5687                 csum_assist |= CSUM_IP_TCP;
 5688         if ((sc->hn_caps & HN_CAP_UDP4CS) && hn_enable_udp4cs)
 5689                 csum_assist |= CSUM_IP_UDP;
 5690         if (sc->hn_caps & HN_CAP_TCP6CS)
 5691                 csum_assist |= CSUM_IP6_TCP;
 5692         if ((sc->hn_caps & HN_CAP_UDP6CS) && hn_enable_udp6cs)
 5693                 csum_assist |= CSUM_IP6_UDP;
 5694         for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
 5695                 sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
 5696 
 5697         if (sc->hn_caps & HN_CAP_HASHVAL) {
 5698                 /*
 5699                  * Support HASHVAL pktinfo on TX path.
 5700                  */
 5701                 if (bootverbose)
 5702                         if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
 5703                 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
 5704                         sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
 5705         }
 5706 }
 5707 
 5708 static void
 5709 hn_fixup_rx_data(struct hn_softc *sc)
 5710 {
 5711 
 5712         if (sc->hn_caps & HN_CAP_UDPHASH) {
 5713                 int i;
 5714 
 5715                 for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
 5716                         sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_UDP_HASH;
 5717         }
 5718 }
 5719 
 5720 static void
 5721 hn_destroy_tx_data(struct hn_softc *sc)
 5722 {
 5723         int i;
 5724 
 5725         if (sc->hn_chim != NULL) {
 5726                 if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) {
 5727                         hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
 5728                 } else {
 5729                         device_printf(sc->hn_dev,
 5730                             "chimney sending buffer is referenced\n");
 5731                 }
 5732                 sc->hn_chim = NULL;
 5733         }
 5734 
 5735         if (sc->hn_tx_ring_cnt == 0)
 5736                 return;
 5737 
 5738         for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
 5739                 hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
 5740 
 5741         free(sc->hn_tx_ring, M_DEVBUF);
 5742         sc->hn_tx_ring = NULL;
 5743 
 5744         sc->hn_tx_ring_cnt = 0;
 5745         sc->hn_tx_ring_inuse = 0;
 5746 }
 5747 
 5748 #ifdef HN_IFSTART_SUPPORT
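/*
 * The functions below implement the legacy if_start transmit path,
 * which drains the single if_snd queue on TX ring 0 and uses the
 * IFF_DRV_OACTIVE flag for flow control; the drbr-based hn_transmit
 * path further below is used when if_start support is disabled.
 */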
 5749 
 5750 static void
 5751 hn_start_taskfunc(void *xtxr, int pending __unused)
 5752 {
 5753         struct hn_tx_ring *txr = xtxr;
 5754 
 5755         mtx_lock(&txr->hn_tx_lock);
 5756         hn_start_locked(txr, 0);
 5757         mtx_unlock(&txr->hn_tx_lock);
 5758 }
 5759 
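/*
 * Dequeue and transmit packets from if_snd.  A non-zero "len" acts as
 * a direct-transmission threshold: any packet longer than it is put
 * back and 1 is returned, asking the caller to reschedule the send to
 * the TX taskqueue instead of doing it in the current context.
 */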
 5760 static int
 5761 hn_start_locked(struct hn_tx_ring *txr, int len)
 5762 {
 5763         struct hn_softc *sc = txr->hn_sc;
 5764         struct ifnet *ifp = sc->hn_ifp;
 5765         int sched = 0;
 5766 
 5767         KASSERT(hn_use_if_start,
 5768             ("hn_start_locked is called when if_start is disabled"));
 5769         KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
 5770         mtx_assert(&txr->hn_tx_lock, MA_OWNED);
 5771         KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
 5772 
 5773         if (__predict_false(txr->hn_suspended))
 5774                 return (0);
 5775 
 5776         if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
 5777             IFF_DRV_RUNNING)
 5778                 return (0);
 5779 
 5780         while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
 5781                 struct hn_txdesc *txd;
 5782                 struct mbuf *m_head;
 5783                 int error;
 5784 
 5785                 IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
 5786                 if (m_head == NULL)
 5787                         break;
 5788 
 5789                 if (len > 0 && m_head->m_pkthdr.len > len) {
 5790                         /*
 5791                          * This send could be time consuming; let
 5792                          * callers dispatch this packet (and any
 5793                          * follow-up packets) to the tx taskqueue.
 5794                          */
 5795                         IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
 5796                         sched = 1;
 5797                         break;
 5798                 }
 5799 
 5800 #if defined(INET6) || defined(INET)
 5801                 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
 5802                         m_head = hn_tso_fixup(m_head);
 5803                         if (__predict_false(m_head == NULL)) {
 5804                                 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 5805                                 continue;
 5806                         }
 5807                 } else if (m_head->m_pkthdr.csum_flags &
 5808                     (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
 5809                         m_head = hn_set_hlen(m_head);
 5810                         if (__predict_false(m_head == NULL)) {
 5811                                 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 5812                                 continue;
 5813                         }
 5814                 }
 5815 #endif
 5816 
 5817                 txd = hn_txdesc_get(txr);
 5818                 if (txd == NULL) {
 5819                         txr->hn_no_txdescs++;
 5820                         IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
 5821                         atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
 5822                         break;
 5823                 }
 5824 
 5825                 error = hn_encap(ifp, txr, txd, &m_head);
 5826                 if (error) {
 5827                         /* Both txd and m_head are freed */
 5828                         KASSERT(txr->hn_agg_txd == NULL,
 5829                             ("encap failed w/ pending aggregating txdesc"));
 5830                         continue;
 5831                 }
 5832 
 5833                 if (txr->hn_agg_pktleft == 0) {
 5834                         if (txr->hn_agg_txd != NULL) {
 5835                                 KASSERT(m_head == NULL,
 5836                                     ("pending mbuf for aggregating txdesc"));
 5837                                 error = hn_flush_txagg(ifp, txr);
 5838                                 if (__predict_false(error)) {
 5839                                         atomic_set_int(&ifp->if_drv_flags,
 5840                                             IFF_DRV_OACTIVE);
 5841                                         break;
 5842                                 }
 5843                         } else {
 5844                                 KASSERT(m_head != NULL, ("mbuf was freed"));
 5845                                 error = hn_txpkt(ifp, txr, txd);
 5846                                 if (__predict_false(error)) {
 5847                                         /* txd is freed, but m_head is not */
 5848                                         IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
 5849                                         atomic_set_int(&ifp->if_drv_flags,
 5850                                             IFF_DRV_OACTIVE);
 5851                                         break;
 5852                                 }
 5853                         }
 5854                 }
 5855 #ifdef INVARIANTS
 5856                 else {
 5857                         KASSERT(txr->hn_agg_txd != NULL,
 5858                             ("no aggregating txdesc"));
 5859                         KASSERT(m_head == NULL,
 5860                             ("pending mbuf for aggregating txdesc"));
 5861                 }
 5862 #endif
 5863         }
 5864 
 5865         /* Flush pending aggregated transmission. */
 5866         if (txr->hn_agg_txd != NULL)
 5867                 hn_flush_txagg(ifp, txr);
 5868         return (sched);
 5869 }
 5870 
 5871 static void
 5872 hn_start(struct ifnet *ifp)
 5873 {
 5874         struct hn_softc *sc = ifp->if_softc;
 5875         struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
 5876 
 5877         if (txr->hn_sched_tx)
 5878                 goto do_sched;
 5879 
 5880         if (mtx_trylock(&txr->hn_tx_lock)) {
 5881                 int sched;
 5882 
 5883                 sched = hn_start_locked(txr, txr->hn_direct_tx_size);
 5884                 mtx_unlock(&txr->hn_tx_lock);
 5885                 if (!sched)
 5886                         return;
 5887         }
 5888 do_sched:
 5889         taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
 5890 }
 5891 
 5892 static void
 5893 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
 5894 {
 5895         struct hn_tx_ring *txr = xtxr;
 5896 
 5897         mtx_lock(&txr->hn_tx_lock);
 5898         atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
 5899         hn_start_locked(txr, 0);
 5900         mtx_unlock(&txr->hn_tx_lock);
 5901 }
 5902 
 5903 static void
 5904 hn_start_txeof(struct hn_tx_ring *txr)
 5905 {
 5906         struct hn_softc *sc = txr->hn_sc;
 5907         struct ifnet *ifp = sc->hn_ifp;
 5908 
 5909         KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
 5910 
 5911         if (txr->hn_sched_tx)
 5912                 goto do_sched;
 5913 
 5914         if (mtx_trylock(&txr->hn_tx_lock)) {
 5915                 int sched;
 5916 
 5917                 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
 5918                 sched = hn_start_locked(txr, txr->hn_direct_tx_size);
 5919                 mtx_unlock(&txr->hn_tx_lock);
 5920                 if (sched) {
 5921                         taskqueue_enqueue(txr->hn_tx_taskq,
 5922                             &txr->hn_tx_task);
 5923                 }
 5924         } else {
 5925 do_sched:
 5926                 /*
 5927                  * Release OACTIVE early, in the hope that others
 5928                  * can catch up.  The task will clear the flag again
 5929                  * while holding hn_tx_lock to avoid possible
 5930                  * races.
 5931                  */
 5932                 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
 5933                 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
 5934         }
 5935 }
 5936 
 5937 #endif  /* HN_IFSTART_SUPPORT */
 5938 
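/*
 * Multiqueue analog of hn_start_locked(): drain the per-ring mbuf
 * buf_ring, aggregating packets where possible.  As above, a non-zero
 * "len" makes oversized packets be put back and the function return 1,
 * so that the caller can defer the work to the TX taskqueue.
 */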
 5939 static int
 5940 hn_xmit(struct hn_tx_ring *txr, int len)
 5941 {
 5942         struct hn_softc *sc = txr->hn_sc;
 5943         struct ifnet *ifp = sc->hn_ifp;
 5944         struct mbuf *m_head;
 5945         int sched = 0;
 5946 
 5947         mtx_assert(&txr->hn_tx_lock, MA_OWNED);
 5948 #ifdef HN_IFSTART_SUPPORT
 5949         KASSERT(hn_use_if_start == 0,
 5950             ("hn_xmit is called when if_start is enabled"));
 5951 #endif
 5952         KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
 5953 
 5954         if (__predict_false(txr->hn_suspended))
 5955                 return (0);
 5956 
 5957         if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
 5958                 return (0);
 5959 
 5960         while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
 5961                 struct hn_txdesc *txd;
 5962                 int error;
 5963 
 5964                 if (len > 0 && m_head->m_pkthdr.len > len) {
 5965                         /*
 5966                          * This send could be time consuming; let
 5967                          * callers dispatch this packet (and any
 5968                          * follow-up packets) to the tx taskqueue.
 5969                          */
 5970                         drbr_putback(ifp, txr->hn_mbuf_br, m_head);
 5971                         sched = 1;
 5972                         break;
 5973                 }
 5974 
 5975                 txd = hn_txdesc_get(txr);
 5976                 if (txd == NULL) {
 5977                         txr->hn_no_txdescs++;
 5978                         drbr_putback(ifp, txr->hn_mbuf_br, m_head);
 5979                         txr->hn_oactive = 1;
 5980                         break;
 5981                 }
 5982 
 5983                 error = hn_encap(ifp, txr, txd, &m_head);
 5984                 if (error) {
 5985                         /* Both txd and m_head are freed; discard */
 5986                         KASSERT(txr->hn_agg_txd == NULL,
 5987                             ("encap failed w/ pending aggregating txdesc"));
 5988                         drbr_advance(ifp, txr->hn_mbuf_br);
 5989                         continue;
 5990                 }
 5991 
 5992                 if (txr->hn_agg_pktleft == 0) {
 5993                         if (txr->hn_agg_txd != NULL) {
 5994                                 KASSERT(m_head == NULL,
 5995                                     ("pending mbuf for aggregating txdesc"));
 5996                                 error = hn_flush_txagg(ifp, txr);
 5997                                 if (__predict_false(error)) {
 5998                                         txr->hn_oactive = 1;
 5999                                         break;
 6000                                 }
 6001                         } else {
 6002                                 KASSERT(m_head != NULL, ("mbuf was freed"));
 6003                                 error = hn_txpkt(ifp, txr, txd);
 6004                                 if (__predict_false(error)) {
 6005                                         /* txd is freed, but m_head is not */
 6006                                         drbr_putback(ifp, txr->hn_mbuf_br,
 6007                                             m_head);
 6008                                         txr->hn_oactive = 1;
 6009                                         break;
 6010                                 }
 6011                         }
 6012                 }
 6013 #ifdef INVARIANTS
 6014                 else {
 6015                         KASSERT(txr->hn_agg_txd != NULL,
 6016                             ("no aggregating txdesc"));
 6017                         KASSERT(m_head == NULL,
 6018                             ("pending mbuf for aggregating txdesc"));
 6019                 }
 6020 #endif
 6021 
 6022                 /* Sent */
 6023                 drbr_advance(ifp, txr->hn_mbuf_br);
 6024         }
 6025 
 6026         /* Flush pending aggregated transmission. */
 6027         if (txr->hn_agg_txd != NULL)
 6028                 hn_flush_txagg(ifp, txr);
 6029         return (sched);
 6030 }
 6031 
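/*
 * if_transmit method.  If a transparent VF is active, hand the packet
 * to the VF (tapping BPF as configured) and account for it; otherwise
 * fix up the TSO/checksum headers while they are cache-hot, pick a TX
 * ring from the mbuf flowid (small TCP SYN segments are pinned to
 * ring 0), and enqueue the packet onto that ring's buf_ring.
 */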
 6032 static int
 6033 hn_transmit(struct ifnet *ifp, struct mbuf *m)
 6034 {
 6035         struct hn_softc *sc = ifp->if_softc;
 6036         struct hn_tx_ring *txr;
 6037         int error, idx = 0;
 6038 
 6039         if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
 6040                 struct rm_priotracker pt;
 6041 
 6042                 rm_rlock(&sc->hn_vf_lock, &pt);
 6043                 if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
 6044                         struct mbuf *m_bpf = NULL;
 6045                         int obytes, omcast;
 6046 
 6047                         obytes = m->m_pkthdr.len;
 6048                         omcast = (m->m_flags & M_MCAST) != 0;
 6049 
 6050                         if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) {
 6051                                 if (bpf_peers_present(ifp->if_bpf)) {
 6052                                         m_bpf = m_copypacket(m, M_NOWAIT);
 6053                                         if (m_bpf == NULL) {
 6054                                                 /*
 6055                                                  * Failed to grab a shallow
 6056                                                  * copy; tap now.
 6057                                                  */
 6058                                                 ETHER_BPF_MTAP(ifp, m);
 6059                                         }
 6060                                 }
 6061                         } else {
 6062                                 ETHER_BPF_MTAP(ifp, m);
 6063                         }
 6064 
 6065                         error = sc->hn_vf_ifp->if_transmit(sc->hn_vf_ifp, m);
 6066                         rm_runlock(&sc->hn_vf_lock, &pt);
 6067 
 6068                         if (m_bpf != NULL) {
 6069                                 if (!error)
 6070                                         ETHER_BPF_MTAP(ifp, m_bpf);
 6071                                 m_freem(m_bpf);
 6072                         }
 6073 
 6074                         if (error == ENOBUFS) {
 6075                                 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
 6076                         } else if (error) {
 6077                                 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 6078                         } else {
 6079                                 if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
 6080                                 if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes);
 6081                                 if (omcast) {
 6082                                         if_inc_counter(ifp, IFCOUNTER_OMCASTS,
 6083                                             omcast);
 6084                                 }
 6085                         }
 6086                         return (error);
 6087                 }
 6088                 rm_runlock(&sc->hn_vf_lock, &pt);
 6089         }
 6090 
 6091 #if defined(INET6) || defined(INET)
 6092         /*
 6093          * Perform TSO packet header fixup or get l2/l3 header length now,
 6094          * since packet headers should be cache-hot.
 6095          */
 6096         if (m->m_pkthdr.csum_flags & CSUM_TSO) {
 6097                 m = hn_tso_fixup(m);
 6098                 if (__predict_false(m == NULL)) {
 6099                         if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 6100                         return (EIO);
 6101                 }
 6102         } else if (m->m_pkthdr.csum_flags &
 6103             (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
 6104                 m = hn_set_hlen(m);
 6105                 if (__predict_false(m == NULL)) {
 6106                         if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 6107                         return (EIO);
 6108                 }
 6109         }
 6110 #endif
 6111 
 6112         /*
 6113          * Select the TX ring based on flowid
 6114          */
 6115         if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
 6116 #ifdef RSS
 6117                 uint32_t bid;
 6118 
 6119                 if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m),
 6120                     &bid) == 0)
 6121                         idx = bid % sc->hn_tx_ring_inuse;
 6122                 else
 6123 #endif
 6124                 {
 6125 #if defined(INET6) || defined(INET)
 6126                         int tcpsyn = 0;
 6127 
 6128                         if (m->m_pkthdr.len < 128 &&
 6129                             (m->m_pkthdr.csum_flags &
 6130                              (CSUM_IP_TCP | CSUM_IP6_TCP)) &&
 6131                             (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) {
 6132                                 m = hn_check_tcpsyn(m, &tcpsyn);
 6133                                 if (__predict_false(m == NULL)) {
 6134                                         if_inc_counter(ifp,
 6135                                             IFCOUNTER_OERRORS, 1);
 6136                                         return (EIO);
 6137                                 }
 6138                         }
 6139 #else
 6140                         const int tcpsyn = 0;
 6141 #endif
 6142                         if (tcpsyn)
 6143                                 idx = 0;
 6144                         else
 6145                                 idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
 6146                 }
 6147         }
 6148         txr = &sc->hn_tx_ring[idx];
 6149 
 6150         error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
 6151         if (error) {
 6152                 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
 6153                 return (error);
 6154         }
 6155 
 6156         if (txr->hn_oactive)
 6157                 return (0);
 6158 
 6159         if (txr->hn_sched_tx)
 6160                 goto do_sched;
 6161 
 6162         if (mtx_trylock(&txr->hn_tx_lock)) {
 6163                 int sched;
 6164 
 6165                 sched = hn_xmit(txr, txr->hn_direct_tx_size);
 6166                 mtx_unlock(&txr->hn_tx_lock);
 6167                 if (!sched)
 6168                         return (0);
 6169         }
 6170 do_sched:
 6171         taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
 6172         return (0);
 6173 }
 6174 
 6175 static void
 6176 hn_tx_ring_qflush(struct hn_tx_ring *txr)
 6177 {
 6178         struct mbuf *m;
 6179 
 6180         mtx_lock(&txr->hn_tx_lock);
 6181         while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
 6182                 m_freem(m);
 6183         mtx_unlock(&txr->hn_tx_lock);
 6184 }
 6185 
 6186 static void
 6187 hn_xmit_qflush(struct ifnet *ifp)
 6188 {
 6189         struct hn_softc *sc = ifp->if_softc;
 6190         struct rm_priotracker pt;
 6191         int i;
 6192 
 6193         for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
 6194                 hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
 6195         if_qflush(ifp);
 6196 
 6197         rm_rlock(&sc->hn_vf_lock, &pt);
 6198         if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
 6199                 sc->hn_vf_ifp->if_qflush(sc->hn_vf_ifp);
 6200         rm_runlock(&sc->hn_vf_lock, &pt);
 6201 }
 6202 
 6203 static void
 6204 hn_xmit_txeof(struct hn_tx_ring *txr)
 6205 {
 6206 
 6207         if (txr->hn_sched_tx)
 6208                 goto do_sched;
 6209 
 6210         if (mtx_trylock(&txr->hn_tx_lock)) {
 6211                 int sched;
 6212 
 6213                 txr->hn_oactive = 0;
 6214                 sched = hn_xmit(txr, txr->hn_direct_tx_size);
 6215                 mtx_unlock(&txr->hn_tx_lock);
 6216                 if (sched) {
 6217                         taskqueue_enqueue(txr->hn_tx_taskq,
 6218                             &txr->hn_tx_task);
 6219                 }
 6220         } else {
 6221 do_sched:
 6222                 /*
 6223                  * Release oactive early, in the hope that others
 6224                  * can catch up.  The task will clear oactive again
 6225                  * while holding hn_tx_lock to avoid possible
 6226                  * races.
 6227                  */
 6228                 txr->hn_oactive = 0;
 6229                 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
 6230         }
 6231 }
 6232 
 6233 static void
 6234 hn_xmit_taskfunc(void *xtxr, int pending __unused)
 6235 {
 6236         struct hn_tx_ring *txr = xtxr;
 6237 
 6238         mtx_lock(&txr->hn_tx_lock);
 6239         hn_xmit(txr, 0);
 6240         mtx_unlock(&txr->hn_tx_lock);
 6241 }
 6242 
 6243 static void
 6244 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
 6245 {
 6246         struct hn_tx_ring *txr = xtxr;
 6247 
 6248         mtx_lock(&txr->hn_tx_lock);
 6249         txr->hn_oactive = 0;
 6250         hn_xmit(txr, 0);
 6251         mtx_unlock(&txr->hn_tx_lock);
 6252 }
 6253 
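/*
 * Attach a VMBus channel to its RX ring (and TX ring, when the channel
 * sub-index falls within the TX rings in use), bind the channel to a
 * proper CPU, and open it with the ring's bufring as channel buffer.
 */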
 6254 static int
 6255 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
 6256 {
 6257         struct vmbus_chan_br cbr;
 6258         struct hn_rx_ring *rxr;
 6259         struct hn_tx_ring *txr = NULL;
 6260         int idx, error;
 6261 
 6262         idx = vmbus_chan_subidx(chan);
 6263 
 6264         /*
 6265          * Link this channel to RX/TX ring.
 6266          */
 6267         KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
 6268             ("invalid channel index %d, should be >= 0 && < %d",
 6269              idx, sc->hn_rx_ring_inuse));
 6270         rxr = &sc->hn_rx_ring[idx];
 6271         KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
 6272             ("RX ring %d already attached", idx));
 6273         rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
 6274         rxr->hn_chan = chan;
 6275 
 6276         if (bootverbose) {
 6277                 if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
 6278                     idx, vmbus_chan_id(chan));
 6279         }
 6280 
 6281         if (idx < sc->hn_tx_ring_inuse) {
 6282                 txr = &sc->hn_tx_ring[idx];
 6283                 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
 6284                     ("TX ring %d already attached", idx));
 6285                 txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
 6286 
 6287                 txr->hn_chan = chan;
 6288                 if (bootverbose) {
 6289                         if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
 6290                             idx, vmbus_chan_id(chan));
 6291                 }
 6292         }
 6293 
 6294         /* Bind this channel to a proper CPU. */
 6295         vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));
 6296 
 6297         /*
 6298          * Open this channel
 6299          */
 6300         cbr.cbr = rxr->hn_br;
 6301         cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
 6302         cbr.cbr_txsz = HN_TXBR_SIZE;
 6303         cbr.cbr_rxsz = HN_RXBR_SIZE;
 6304         error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
 6305         if (error) {
 6306                 if (error == EISCONN) {
 6307                         if_printf(sc->hn_ifp, "bufring is connected after "
 6308                             "chan%u open failure\n", vmbus_chan_id(chan));
 6309                         rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
 6310                 } else {
 6311                         if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
 6312                             vmbus_chan_id(chan), error);
 6313                 }
 6314         }
 6315         return (error);
 6316 }
 6317 
 6318 static void
 6319 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
 6320 {
 6321         struct hn_rx_ring *rxr;
 6322         int idx, error;
 6323 
 6324         idx = vmbus_chan_subidx(chan);
 6325 
 6326         /*
 6327          * Unlink this channel from the RX/TX ring.
 6328          */
 6329         KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
 6330             ("invalid channel index %d, should be >= 0 && < %d",
 6331              idx, sc->hn_rx_ring_inuse));
 6332         rxr = &sc->hn_rx_ring[idx];
 6333         KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
 6334             ("RX ring %d is not attached", idx));
 6335         rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
 6336 
 6337         if (idx < sc->hn_tx_ring_inuse) {
 6338                 struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
 6339 
 6340                 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
 6341                     ("TX ring %d is not attached", idx));
 6342                 txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
 6343         }
 6344 
 6345         /*
 6346          * Close this channel.
 6347          *
 6348          * NOTE:
 6349          * Channel closing does _not_ destroy the target channel.
 6350          */
 6351         error = vmbus_chan_close_direct(chan);
 6352         if (error == EISCONN) {
 6353                 if_printf(sc->hn_ifp, "chan%u bufring is connected "
 6354                     "after being closed\n", vmbus_chan_id(chan));
 6355                 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
 6356         } else if (error) {
 6357                 if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
 6358                     vmbus_chan_id(chan), error);
 6359         }
 6360 }
 6361 
 6362 static int
 6363 hn_attach_subchans(struct hn_softc *sc)
 6364 {
 6365         struct vmbus_channel **subchans;
 6366         int subchan_cnt = sc->hn_rx_ring_inuse - 1;
 6367         int i, error = 0;
 6368 
 6369         KASSERT(subchan_cnt > 0, ("no sub-channels"));
 6370 
 6371         /* Attach the sub-channels. */
 6372         subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
 6373         for (i = 0; i < subchan_cnt; ++i) {
 6374                 int error1;
 6375 
 6376                 error1 = hn_chan_attach(sc, subchans[i]);
 6377                 if (error1) {
 6378                         error = error1;
 6379                         /* Move on; all channels will be detached later. */
 6380                 }
 6381         }
 6382         vmbus_subchan_rel(subchans, subchan_cnt);
 6383 
 6384         if (error) {
 6385                 if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
 6386         } else {
 6387                 if (bootverbose) {
 6388                         if_printf(sc->hn_ifp, "%d sub-channels attached\n",
 6389                             subchan_cnt);
 6390                 }
 6391         }
 6392         return (error);
 6393 }
 6394 
 6395 static void
 6396 hn_detach_allchans(struct hn_softc *sc)
 6397 {
 6398         struct vmbus_channel **subchans;
 6399         int subchan_cnt = sc->hn_rx_ring_inuse - 1;
 6400         int i;
 6401 
 6402         if (subchan_cnt == 0)
 6403                 goto back;
 6404 
 6405         /* Detach the sub-channels. */
 6406         subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
 6407         for (i = 0; i < subchan_cnt; ++i)
 6408                 hn_chan_detach(sc, subchans[i]);
 6409         vmbus_subchan_rel(subchans, subchan_cnt);
 6410 
 6411 back:
 6412         /*
 6413          * Detach the primary channel, _after_ all sub-channels
 6414          * are detached.
 6415          */
 6416         hn_chan_detach(sc, sc->hn_prichan);
 6417 
 6418         /* Wait for sub-channels to be destroyed, if any. */
 6419         vmbus_subchan_drain(sc->hn_prichan);
 6420 
 6421 #ifdef INVARIANTS
 6422         for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
 6423                 KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
 6424                     HN_RX_FLAG_ATTACHED) == 0,
 6425                     ("%dth RX ring is still attached", i));
 6426         }
 6427         for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
 6428                 KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
 6429                     HN_TX_FLAG_ATTACHED) == 0,
 6430                     ("%dth TX ring is still attached", i));
 6431         }
 6432 #endif
 6433 }
 6434 
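/*
 * Ask NVS for the sub-channels backing the extra RX/TX rings.  The
 * request is capped by the RSS capabilities reported through RNDIS;
 * on any failure *nsubch is set to 0 and 0 is returned, so the caller
 * simply falls back to using the primary channel alone.
 */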
 6435 static int
 6436 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
 6437 {
 6438         struct vmbus_channel **subchans;
 6439         int nchan, rxr_cnt, error;
 6440 
 6441         nchan = *nsubch + 1;
 6442         if (nchan == 1) {
 6443                 /*
 6444                  * Multiple RX/TX rings are not requested.
 6445                  */
 6446                 *nsubch = 0;
 6447                 return (0);
 6448         }
 6449 
 6450         /*
 6451          * Query RSS capabilities, e.g. the # of RX rings and the # of
 6452          * indirect table entries.
 6453          */
 6454         error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
 6455         if (error) {
 6456                 /* No RSS; this is benign. */
 6457                 *nsubch = 0;
 6458                 return (0);
 6459         }
 6460         if (bootverbose) {
 6461                 if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
 6462                     rxr_cnt, nchan);
 6463         }
 6464 
 6465         if (nchan > rxr_cnt)
 6466                 nchan = rxr_cnt;
 6467         if (nchan == 1) {
 6468                 if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
 6469                 *nsubch = 0;
 6470                 return (0);
 6471         }
 6472 
 6473         /*
 6474          * Allocate sub-channels from NVS.
 6475          */
 6476         *nsubch = nchan - 1;
 6477         error = hn_nvs_alloc_subchans(sc, nsubch);
 6478         if (error || *nsubch == 0) {
 6479                 /* Failed to allocate sub-channels. */
 6480                 *nsubch = 0;
 6481                 return (0);
 6482         }
 6483 
 6484         /*
 6485          * Wait for all sub-channels to become ready before moving on.
 6486          */
 6487         subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
 6488         vmbus_subchan_rel(subchans, *nsubch);
 6489         return (0);
 6490 }
 6491 
 6492 static bool
 6493 hn_synth_attachable(const struct hn_softc *sc)
 6494 {
 6495         int i;
 6496 
 6497         if (sc->hn_flags & HN_FLAG_ERRORS)
 6498                 return (false);
 6499 
 6500         for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
 6501                 const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
 6502 
 6503                 if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
 6504                         return (false);
 6505         }
 6506         return (true);
 6507 }
 6508 
 6509 /*
 6510  * Make sure that the RX filter is zero after the successful
 6511  * RNDIS initialization.
 6512  *
 6513  * NOTE:
 6514  * Under certain conditions on certain versions of Hyper-V,
 6515  * the RNDIS rxfilter is _not_ zero on the hypervisor side
 6516  * after the successful RNDIS initialization, which breaks
 6517  * the assumption of any following code (well, it breaks the
 6518  * RNDIS API contract actually).  Clear the RNDIS rxfilter
 6519  * explicitly, drain packets sneaking through, and drain the
 6520  * interrupt taskqueues scheduled due to the stealth packets.
 6521  */
 6522 static void
 6523 hn_rndis_init_fixat(struct hn_softc *sc, int nchan)
 6524 {
 6525 
 6526         hn_disable_rx(sc);
 6527         hn_drain_rxtx(sc, nchan);
 6528 }
 6529 
 6530 static int
 6531 hn_synth_attach(struct hn_softc *sc, int mtu)
 6532 {
 6533 #define ATTACHED_NVS            0x0002
 6534 #define ATTACHED_RNDIS          0x0004
 6535 
 6536         struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
 6537         int error, nsubch, nchan = 1, i, rndis_inited;
 6538         uint32_t old_caps, attached = 0;
 6539 
 6540         KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
 6541             ("synthetic parts were attached"));
 6542 
 6543         if (!hn_synth_attachable(sc))
 6544                 return (ENXIO);
 6545 
 6546         /* Save capabilities for later verification. */
 6547         old_caps = sc->hn_caps;
 6548         sc->hn_caps = 0;
 6549 
 6550         /* Clear RSS stuffs. */
 6551         sc->hn_rss_ind_size = 0;
 6552         sc->hn_rss_hash = 0;
 6553         sc->hn_rss_hcap = 0;
 6554 
 6555         /*
 6556          * Attach the primary channel _before_ attaching NVS and RNDIS.
 6557          */
 6558         error = hn_chan_attach(sc, sc->hn_prichan);
 6559         if (error)
 6560                 goto failed;
 6561 
 6562         /*
 6563          * Attach NVS.
 6564          */
 6565         error = hn_nvs_attach(sc, mtu);
 6566         if (error)
 6567                 goto failed;
 6568         attached |= ATTACHED_NVS;
 6569 
 6570         /*
 6571          * Attach RNDIS _after_ NVS is attached.
 6572          */
 6573         error = hn_rndis_attach(sc, mtu, &rndis_inited);
 6574         if (rndis_inited)
 6575                 attached |= ATTACHED_RNDIS;
 6576         if (error)
 6577                 goto failed;
 6578 
 6579         /*
 6580          * Make sure capabilities are not changed.
 6581          */
 6582         if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
 6583                 if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
 6584                     old_caps, sc->hn_caps);
 6585                 error = ENXIO;
 6586                 goto failed;
 6587         }
 6588 
 6589         /*
 6590          * Allocate sub-channels for multi-TX/RX rings.
 6591          *
 6592          * NOTE:
 6593          * The # of RX rings that can be used is equivalent to the # of
 6594          * channels to be requested.
 6595          */
 6596         nsubch = sc->hn_rx_ring_cnt - 1;
 6597         error = hn_synth_alloc_subchans(sc, &nsubch);
 6598         if (error)
 6599                 goto failed;
 6600         /* NOTE: _Full_ synthetic parts detach is required now. */
 6601         sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
 6602 
 6603         /*
 6604          * Set the # of TX/RX rings that could be used according to
 6605          * the # of channels that NVS offered.
 6606          */
 6607         nchan = nsubch + 1;
 6608         hn_set_ring_inuse(sc, nchan);
 6609         if (nchan == 1) {
 6610                 /* Only the primary channel can be used; done */
 6611                 goto back;
 6612         }
 6613 
 6614         /*
 6615          * Attach the sub-channels.
 6616          *
 6617          * NOTE: hn_set_ring_inuse() _must_ have been called.
 6618          */
 6619         error = hn_attach_subchans(sc);
 6620         if (error)
 6621                 goto failed;
 6622 
 6623         /*
 6624          * Configure RSS key and indirect table _after_ all sub-channels
 6625          * are attached.
 6626          */
 6627         if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
 6628                 /*
 6629                  * RSS key is not set yet; set it to the default RSS key.
 6630                  */
 6631                 if (bootverbose)
 6632                         if_printf(sc->hn_ifp, "setup default RSS key\n");
 6633 #ifdef RSS
 6634                 rss_getkey(rss->rss_key);
 6635 #else
 6636                 memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
 6637 #endif
 6638                 sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
 6639         }
 6640 
 6641         if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
 6642                 /*
 6643                  * RSS indirect table is not set yet; set it up in round-
 6644                  * robin fashion.
 6645                  */
 6646                 if (bootverbose) {
 6647                         if_printf(sc->hn_ifp, "setup default RSS indirect "
 6648                             "table\n");
 6649                 }
 6650                 for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
 6651                         uint32_t subidx;
 6652 
 6653 #ifdef RSS
 6654                         subidx = rss_get_indirection_to_bucket(i);
 6655 #else
 6656                         subidx = i;
 6657 #endif
 6658                         rss->rss_ind[i] = subidx % nchan;
 6659                 }
 6660                 sc->hn_flags |= HN_FLAG_HAS_RSSIND;
 6661         } else {
 6662                 /*
 6663                  * # of usable channels may be changed, so we have to
 6664                  * make sure that all entries in RSS indirect table
 6665                  * are valid.
 6666                  *
 6667                  * NOTE: hn_set_ring_inuse() _must_ have been called.
 6668                  */
 6669                 hn_rss_ind_fixup(sc);
 6670         }
 6671 
 6672         sc->hn_rss_hash = sc->hn_rss_hcap;
 6673         if ((sc->hn_flags & HN_FLAG_RXVF) ||
 6674             (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
 6675                 /* NOTE: Don't reconfigure RSS here; it is done immediately below. */
 6676                 hn_vf_rss_fixup(sc, false);
 6677         }
 6678         error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
 6679         if (error)
 6680                 goto failed;
 6681 back:
 6682         /*
 6683          * Fixup transmission aggregation setup.
 6684          */
 6685         hn_set_txagg(sc);
 6686         hn_rndis_init_fixat(sc, nchan);
 6687         return (0);
 6688 
 6689 failed:
 6690         if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
 6691                 hn_rndis_init_fixat(sc, nchan);
 6692                 hn_synth_detach(sc);
 6693         } else {
 6694                 if (attached & ATTACHED_RNDIS) {
 6695                         hn_rndis_init_fixat(sc, nchan);
 6696                         hn_rndis_detach(sc);
 6697                 }
 6698                 if (attached & ATTACHED_NVS)
 6699                         hn_nvs_detach(sc);
 6700                 hn_chan_detach(sc, sc->hn_prichan);
 6701                 /* Restore old capabilities. */
 6702                 sc->hn_caps = old_caps;
 6703         }
 6704         return (error);
 6705 
 6706 #undef ATTACHED_RNDIS
 6707 #undef ATTACHED_NVS
 6708 }
 6709 
 6710 /*
 6711  * NOTE:
 6712  * The interface must have been suspended through hn_suspend() before
 6713  * this function gets called.
 6714  */
 6715 static void
 6716 hn_synth_detach(struct hn_softc *sc)
 6717 {
 6718 
 6719         KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
 6720             ("synthetic parts were not attached"));
 6721 
 6722         /* Detach the RNDIS first. */
 6723         hn_rndis_detach(sc);
 6724 
 6725         /* Detach NVS. */
 6726         hn_nvs_detach(sc);
 6727 
 6728         /* Detach all of the channels. */
 6729         hn_detach_allchans(sc);
 6730 
 6731         if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_rxbuf_gpadl != 0) {
 6732                 /*
 6733                  * Host is post-Win2016, disconnect RXBUF from primary channel here.
 6734                  */
 6735                 int error;
 6736 
 6737                 error = vmbus_chan_gpadl_disconnect(sc->hn_prichan,
 6738                     sc->hn_rxbuf_gpadl);
 6739                 if (error) {
 6740                         if_printf(sc->hn_ifp,
 6741                             "rxbuf gpadl disconn failed: %d\n", error);
 6742                         sc->hn_flags |= HN_FLAG_RXBUF_REF;
 6743                 }
 6744                 sc->hn_rxbuf_gpadl = 0;
 6745         }
 6746 
 6747         if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_chim_gpadl != 0) {
 6748                 /*
 6749                  * Host is post-Win2016, disconnect chimney sending buffer from
 6750                  * primary channel here.
 6751                  */
 6752                 int error;
 6753 
 6754                 error = vmbus_chan_gpadl_disconnect(sc->hn_prichan,
 6755                     sc->hn_chim_gpadl);
 6756                 if (error) {
 6757                         if_printf(sc->hn_ifp,
 6758                             "chim gpadl disconn failed: %d\n", error);
 6759                         sc->hn_flags |= HN_FLAG_CHIM_REF;
 6760                 }
 6761                 sc->hn_chim_gpadl = 0;
 6762         }
 6763         sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
 6764 }
 6765 
 6766 static void
 6767 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
 6768 {
 6769         KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
 6770             ("invalid ring count %d", ring_cnt));
 6771 
 6772         if (sc->hn_tx_ring_cnt > ring_cnt)
 6773                 sc->hn_tx_ring_inuse = ring_cnt;
 6774         else
 6775                 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
 6776         sc->hn_rx_ring_inuse = ring_cnt;
 6777 
 6778 #ifdef RSS
 6779         if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) {
 6780                 if_printf(sc->hn_ifp, "# of RX rings (%d) does not match "
 6781                     "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse,
 6782                     rss_getnumbuckets());
 6783         }
 6784 #endif
 6785 
 6786         if (bootverbose) {
 6787                 if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
 6788                     sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
 6789         }
 6790 }
 6791 
 6792 static void
 6793 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
 6794 {
 6795 
 6796         /*
 6797          * NOTE:
 6798          * The TX bufring will not be drained by the hypervisor,
 6799          * if the primary channel is revoked.
 6800          */
 6801         while (!vmbus_chan_rx_empty(chan) ||
 6802             (!vmbus_chan_is_revoked(sc->hn_prichan) &&
 6803              !vmbus_chan_tx_empty(chan)))
 6804                 pause("waitch", 1);
 6805         vmbus_chan_intr_drain(chan);
 6806 }
 6807 
 6808 static void
 6809 hn_disable_rx(struct hn_softc *sc)
 6810 {
 6811 
 6812         /*
 6813          * Disable RX by clearing RX filter forcefully.
 6814          */
 6815         sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
 6816         hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */
 6817 
 6818         /*
 6819          * Give RNDIS enough time to flush all pending data packets.
 6820          */
 6821         pause("waitrx", (200 * hz) / 1000);
 6822 }
 6823 
 6824 /*
 6825  * NOTE:
 6826  * RX/TX _must_ have been suspended/disabled, before this function
 6827  * is called.
 6828  */
 6829 static void
 6830 hn_drain_rxtx(struct hn_softc *sc, int nchan)
 6831 {
 6832         struct vmbus_channel **subch = NULL;
 6833         int nsubch;
 6834 
 6835         /*
 6836          * Drain RX/TX bufrings and interrupts.
 6837          */
 6838         nsubch = nchan - 1;
 6839         if (nsubch > 0)
 6840                 subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
 6841 
 6842         if (subch != NULL) {
 6843                 int i;
 6844 
 6845                 for (i = 0; i < nsubch; ++i)
 6846                         hn_chan_drain(sc, subch[i]);
 6847         }
 6848         hn_chan_drain(sc, sc->hn_prichan);
 6849 
 6850         if (subch != NULL)
 6851                 vmbus_subchan_rel(subch, nsubch);
 6852 }
 6853 
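/*
 * Quiesce the data path: mark all in-use TX rings suspended, wait for
 * pending sends to complete (unless the primary channel was revoked),
 * clear the RX filter, drain the bufrings, and finally drain the TX
 * tasks that the bufring draining may have scheduled.
 */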
 6854 static void
 6855 hn_suspend_data(struct hn_softc *sc)
 6856 {
 6857         struct hn_tx_ring *txr;
 6858         int i;
 6859 
 6860         HN_LOCK_ASSERT(sc);
 6861 
 6862         /*
 6863          * Suspend TX.
 6864          */
 6865         for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
 6866                 txr = &sc->hn_tx_ring[i];
 6867 
 6868                 mtx_lock(&txr->hn_tx_lock);
 6869                 txr->hn_suspended = 1;
 6870                 mtx_unlock(&txr->hn_tx_lock);
 6871                 /* No one is able to send more packets now. */
 6872 
 6873                 /*
 6874                  * Wait for all pending sends to finish.
 6875                  *
 6876                  * NOTE:
 6877                  * We will _not_ receive all pending send-done, if the
 6878                  * primary channel is revoked.
 6879                  */
 6880                 while (hn_tx_ring_pending(txr) &&
 6881                     !vmbus_chan_is_revoked(sc->hn_prichan))
 6882                         pause("hnwtx", 1 /* 1 tick */);
 6883         }
 6884 
 6885         /*
 6886          * Disable RX.
 6887          */
 6888         hn_disable_rx(sc);
 6889 
 6890         /*
 6891          * Drain RX/TX.
 6892          */
 6893         hn_drain_rxtx(sc, sc->hn_rx_ring_inuse);
 6894 
 6895         /*
 6896          * Drain any pending TX tasks.
 6897          *
 6898          * NOTE:
 6899          * The above hn_drain_rxtx() can dispatch TX tasks, so the TX
 6900          * tasks will have to be drained _after_ the above hn_drain_rxtx().
 6901          */
 6902         for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
 6903                 txr = &sc->hn_tx_ring[i];
 6904 
 6905                 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
 6906                 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
 6907         }
 6908 }
 6909 
 6910 static void
 6911 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
 6912 {
 6913 
 6914         ((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
 6915 }
 6916 
 6917 static void
 6918 hn_suspend_mgmt(struct hn_softc *sc)
 6919 {
 6920         struct task task;
 6921 
 6922         HN_LOCK_ASSERT(sc);
 6923 
 6924         /*
 6925          * Make sure that hn_mgmt_taskq0 can no longer be accessed
 6926          * through hn_mgmt_taskq.
 6927          */
 6928         TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
 6929         vmbus_chan_run_task(sc->hn_prichan, &task);
 6930 
 6931         /*
 6932          * Make sure that all pending management tasks are completed.
 6933          */
 6934         taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
 6935         taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
 6936         taskqueue_drain_all(sc->hn_mgmt_taskq0);
 6937 }
 6938 
 6939 static void
 6940 hn_suspend(struct hn_softc *sc)
 6941 {
 6942 
 6943         /* Disable polling. */
 6944         hn_polling(sc, 0);
 6945 
 6946         /*
 6947          * If the non-transparent mode VF is activated, the synthetic
 6948          * device is receiving packets, so the data path of the
 6949          * synthetic device must be suspended.
 6950          */
 6951         if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
 6952             (sc->hn_flags & HN_FLAG_RXVF))
 6953                 hn_suspend_data(sc);
 6954         hn_suspend_mgmt(sc);
 6955 }
 6956 
 6957 static void
 6958 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
 6959 {
 6960         int i;
 6961 
 6962         KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
 6963             ("invalid TX ring count %d", tx_ring_cnt));
 6964 
 6965         for (i = 0; i < tx_ring_cnt; ++i) {
 6966                 struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
 6967 
 6968                 mtx_lock(&txr->hn_tx_lock);
 6969                 txr->hn_suspended = 0;
 6970                 mtx_unlock(&txr->hn_tx_lock);
 6971         }
 6972 }
 6973 
 6974 static void
 6975 hn_resume_data(struct hn_softc *sc)
 6976 {
 6977         int i;
 6978 
 6979         HN_LOCK_ASSERT(sc);
 6980 
 6981         /*
 6982          * Re-enable RX.
 6983          */
 6984         hn_rxfilter_config(sc);
 6985 
 6986         /*
 6987          * Make sure to clear suspend status on "all" TX rings,
 6988          * since hn_tx_ring_inuse can be changed after
 6989          * hn_suspend_data().
 6990          */
 6991         hn_resume_tx(sc, sc->hn_tx_ring_cnt);
 6992 
 6993 #ifdef HN_IFSTART_SUPPORT
 6994         if (!hn_use_if_start)
 6995 #endif
 6996         {
 6997                 /*
 6998                  * Flush unused drbrs, since hn_tx_ring_inuse may be
 6999                  * reduced.
 7000                  */
 7001                 for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
 7002                         hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
 7003         }
 7004 
 7005         /*
 7006          * Kick start TX.
 7007          */
 7008         for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
 7009                 struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
 7010 
 7011                 /*
 7012                  * Use txeof task, so that any pending oactive can be
 7013                  * cleared properly.
 7014                  */
 7015                 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
 7016         }
 7017 }
 7018 
 7019 static void
 7020 hn_resume_mgmt(struct hn_softc *sc)
 7021 {
 7022 
 7023         sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
 7024 
 7025         /*
 7026          * Kick off network change detection, if it was pending.
 7027          * If no network change was pending, start link status
 7028          * checks, which are more lightweight than network change
 7029          * detection.
 7030          */
 7031         if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
 7032                 hn_change_network(sc);
 7033         else
 7034                 hn_update_link_status(sc);
 7035 }
 7036 
 7037 static void
 7038 hn_resume(struct hn_softc *sc)
 7039 {
 7040 
 7041         /*
 7042          * If the non-transparent mode VF is activated, the synthetic
 7043          * device has to receive packets, so the data path of the
 7044          * synthetic device must be resumed.
 7045          */
 7046         if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
 7047             (sc->hn_flags & HN_FLAG_RXVF))
 7048                 hn_resume_data(sc);
 7049 
 7050         /*
 7051          * Don't resume link status change if VF is attached/activated.
 7052          * - In the non-transparent VF mode, the synthetic device marks
 7053          *   link down until the VF is deactivated; i.e. VF is down.
 7054          * - In transparent VF mode, VF's media status is used until
 7055          *   the VF is detached.
 7056          */
 7057         if ((sc->hn_flags & HN_FLAG_RXVF) == 0 &&
 7058             !(hn_xpnt_vf && sc->hn_vf_ifp != NULL))
 7059                 hn_resume_mgmt(sc);
 7060 
 7061         /*
 7062          * Re-enable polling if this interface is running and
 7063          * the polling is requested.
 7064          */
 7065         if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0)
 7066                 hn_polling(sc, sc->hn_pollhz);
 7067 }
 7068 
 7069 static void 
 7070 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
 7071 {
 7072         const struct rndis_status_msg *msg;
 7073         int ofs;
 7074 
 7075         if (dlen < sizeof(*msg)) {
 7076                 if_printf(sc->hn_ifp, "invalid RNDIS status\n");
 7077                 return;
 7078         }
 7079         msg = data;
 7080 
 7081         switch (msg->rm_status) {
 7082         case RNDIS_STATUS_MEDIA_CONNECT:
 7083         case RNDIS_STATUS_MEDIA_DISCONNECT:
 7084                 hn_update_link_status(sc);
 7085                 break;
 7086 
 7087         case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
 7088         case RNDIS_STATUS_LINK_SPEED_CHANGE:
 7089                 /* Not really useful; ignore. */
 7090                 break;
 7091 
 7092         case RNDIS_STATUS_NETWORK_CHANGE:
 7093                 ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
 7094                 if (dlen < ofs + msg->rm_stbuflen ||
 7095                     msg->rm_stbuflen < sizeof(uint32_t)) {
 7096                         if_printf(sc->hn_ifp, "network changed\n");
 7097                 } else {
 7098                         uint32_t change;
 7099 
 7100                         memcpy(&change, ((const uint8_t *)msg) + ofs,
 7101                             sizeof(change));
 7102                         if_printf(sc->hn_ifp, "network changed, change %u\n",
 7103                             change);
 7104                 }
 7105                 hn_change_network(sc);
 7106                 break;
 7107 
 7108         default:
 7109                 if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
 7110                     msg->rm_status);
 7111                 break;
 7112         }
 7113 }
 7114 
 7115 static int
 7116 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
 7117 {
 7118         const struct rndis_pktinfo *pi = info_data;
 7119         uint32_t mask = 0;
 7120 
 7121         while (info_dlen != 0) {
 7122                 const void *data;
 7123                 uint32_t dlen;
 7124 
 7125                 if (__predict_false(info_dlen < sizeof(*pi)))
 7126                         return (EINVAL);
 7127                 if (__predict_false(info_dlen < pi->rm_size))
 7128                         return (EINVAL);
 7129                 info_dlen -= pi->rm_size;
 7130 
 7131                 if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
 7132                         return (EINVAL);
 7133                 if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
 7134                         return (EINVAL);
 7135                 dlen = pi->rm_size - pi->rm_pktinfooffset;
 7136                 data = pi->rm_data;
 7137 
 7138                 if (pi->rm_internal == 1) {
 7139                         switch (pi->rm_type) {
 7140                         case NDIS_PKTINFO_IT_PKTINFO_ID:
 7141                                 if (__predict_false(dlen < NDIS_PKTINFOID_SZ))
 7142                                         return (EINVAL);
 7143                                 info->pktinfo_id =
 7144                                     (const struct packet_info_id *)data;
 7145                                 mask |= HN_RXINFO_PKTINFO_ID;
 7146                                 break;
 7147 
 7148                         default:
 7149                                 goto next;
 7150                         }
 7151                 } else {
 7152                         switch (pi->rm_type) {
 7153                         case NDIS_PKTINFO_TYPE_VLAN:
 7154                                 if (__predict_false(dlen
 7155                                     < NDIS_VLAN_INFO_SIZE))
 7156                                         return (EINVAL);
 7157                                 info->vlan_info = (const uint32_t *)data;
 7158                                 mask |= HN_RXINFO_VLAN;
 7159                                 break;
 7160 
 7161                         case NDIS_PKTINFO_TYPE_CSUM:
 7162                                 if (__predict_false(dlen
 7163                                     < NDIS_RXCSUM_INFO_SIZE))
 7164                                         return (EINVAL);
 7165                                 info->csum_info = (const uint32_t *)data;
 7166                                 mask |= HN_RXINFO_CSUM;
 7167                                 break;
 7168 
 7169                         case HN_NDIS_PKTINFO_TYPE_HASHVAL:
 7170                                 if (__predict_false(dlen
 7171                                     < HN_NDIS_HASH_VALUE_SIZE))
 7172                                         return (EINVAL);
 7173                                 info->hash_value = (const uint32_t *)data;
 7174                                 mask |= HN_RXINFO_HASHVAL;
 7175                                 break;
 7176 
 7177                         case HN_NDIS_PKTINFO_TYPE_HASHINF:
 7178                                 if (__predict_false(dlen
 7179                                     < HN_NDIS_HASH_INFO_SIZE))
 7180                                         return (EINVAL);
 7181                                 info->hash_info = (const uint32_t *)data;
 7182                                 mask |= HN_RXINFO_HASHINF;
 7183                                 break;
 7184 
 7185                         default:
 7186                                 goto next;
 7187                         }
 7188                 }
 7189 
 7190                 if (mask == HN_RXINFO_ALL) {
 7191                         /* All found; done */
 7192                         break;
 7193                 }
 7194 next:
 7195                 pi = (const struct rndis_pktinfo *)
 7196                     ((const uint8_t *)pi + pi->rm_size);
 7197         }
 7198 
 7199         /*
 7200          * Final fixup.
 7201          * - If there is no hash value, invalidate the hash info.
 7202          */
 7203         if ((mask & HN_RXINFO_HASHVAL) == 0)
 7204                 info->hash_info = NULL;
 7205         return (0);
 7206 }
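
/*
 * Editor's note: the per-packet-info layout that hn_rndis_rxinfo() walks,
 * as implied by its checks (sizes in bytes; rm_size must be a multiple of
 * the RNDIS pktinfo alignment):
 *
 *   |<------------------------ rm_size ------------------------->|
 *   | rm_size | rm_type | rm_pktinfooffset | ... | payload (dlen)|
 *   |<-------- rm_pktinfooffset -------->|
 *
 * with dlen = rm_size - rm_pktinfooffset and the payload taken from
 * rm_data.  Each case above validates dlen against the fixed size of the
 * element it expects before publishing the payload pointer into *info.
 */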
 7207 
 7208 static __inline bool
 7209 hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
 7210 {
 7211 
 7212         if (off < check_off) {
 7213                 if (__predict_true(off + len <= check_off))
 7214                         return (false);
 7215         } else if (off > check_off) {
 7216                 if (__predict_true(check_off + check_len <= off))
 7217                         return (false);
 7218         }
 7219         return (true);
 7220 }
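
/*
 * Editor's note: hn_rndis_check_overlap() treats [off, off + len) and
 * [check_off, check_off + check_len) as half-open byte ranges and returns
 * true iff they intersect (both lengths are assumed positive, so equal
 * offsets always overlap).  For example, (off 10, len 4) vs. (off 14,
 * len 2) do not overlap, since 10 + 4 <= 14; (off 12, len 4) vs. (off 14,
 * len 2) do, because bytes 14 and 15 fall in both ranges.
 */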
 7221 
 7222 static __inline void
 7223 hn_rsc_add_data(struct hn_rx_ring *rxr, const void *data,
 7224                 uint32_t len, struct hn_rxinfo *info)
 7225 {
 7226         uint32_t cnt = rxr->rsc.cnt;
 7227 
 7228         if (cnt) {
 7229                 rxr->rsc.pktlen += len;
 7230         } else {
 7231                 rxr->rsc.vlan_info = info->vlan_info;
 7232                 rxr->rsc.csum_info = info->csum_info;
 7233                 rxr->rsc.hash_info = info->hash_info;
 7234                 rxr->rsc.hash_value = info->hash_value;
 7235                 rxr->rsc.pktlen = len;
 7236         }
 7237 
 7238         rxr->rsc.frag_data[cnt] = data;
 7239         rxr->rsc.frag_len[cnt] = len;
 7240         rxr->rsc.cnt++;
 7241 }
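
/*
 * Editor's note: only the first fragment of an aggregate (cnt == 0) donates
 * the per-packet metadata (VLAN, checksum and hash info); later fragments
 * merely extend pktlen and append a data pointer/length pair.  No bound on
 * cnt is checked here, so the caller must keep it below HN_NVS_RSC_MAX, as
 * hn_rndis_rx_data() below does before each call.
 */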
 7242 
 7243 static void
 7244 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
 7245 {
 7246         const struct rndis_packet_msg *pkt;
 7247         struct hn_rxinfo info;
 7248         int data_off, pktinfo_off, data_len, pktinfo_len;
 7249         bool rsc_more = false;
 7250 
 7251         /*
 7252          * Check length.
 7253          */
 7254         if (__predict_false(dlen < sizeof(*pkt))) {
 7255                 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
 7256                 return;
 7257         }
 7258         pkt = data;
 7259 
 7260         if (__predict_false(dlen < pkt->rm_len)) {
 7261                 if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
 7262                     "dlen %d, msglen %u\n", dlen, pkt->rm_len);
 7263                 return;
 7264         }
 7265         if (__predict_false(pkt->rm_len <
 7266             pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
 7267                 if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
 7268                     "msglen %u, data %u, oob %u, pktinfo %u\n",
 7269                     pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
 7270                     pkt->rm_pktinfolen);
 7271                 return;
 7272         }
 7273         if (__predict_false(pkt->rm_datalen == 0)) {
 7274                 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
 7275                 return;
 7276         }
 7277 
 7278         /*
 7279          * Check offsets.
 7280          */
 7281 #define IS_OFFSET_INVALID(ofs)                  \
 7282         ((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN || \
 7283          ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))
 7284 
 7285         /* XXX Hyper-V does not meet data offset alignment requirement */
 7286         if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
 7287                 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
 7288                     "data offset %u\n", pkt->rm_dataoffset);
 7289                 return;
 7290         }
 7291         if (__predict_false(pkt->rm_oobdataoffset > 0 &&
 7292             IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
 7293                 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
 7294                     "oob offset %u\n", pkt->rm_oobdataoffset);
 7295                 return;
 7296         }
 7297         if (__predict_true(pkt->rm_pktinfooffset > 0) &&
 7298             __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
 7299                 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
 7300                     "pktinfo offset %u\n", pkt->rm_pktinfooffset);
 7301                 return;
 7302         }
 7303 
 7304 #undef IS_OFFSET_INVALID
 7305 
 7306         data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
 7307         data_len = pkt->rm_datalen;
 7308         pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
 7309         pktinfo_len = pkt->rm_pktinfolen;
 7310 
 7311         /*
 7312          * Check OOB coverage.
 7313          */
 7314         if (__predict_false(pkt->rm_oobdatalen != 0)) {
 7315                 int oob_off, oob_len;
 7316 
 7317                 if_printf(rxr->hn_ifp, "got oobdata\n");
 7318                 oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
 7319                 oob_len = pkt->rm_oobdatalen;
 7320 
 7321                 if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
 7322                         if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
 7323                             "oob overflow, msglen %u, oob abs %d len %d\n",
 7324                             pkt->rm_len, oob_off, oob_len);
 7325                         return;
 7326                 }
 7327 
 7328                 /*
 7329                  * Check against data.
 7330                  */
 7331                 if (hn_rndis_check_overlap(oob_off, oob_len,
 7332                     data_off, data_len)) {
 7333                         if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
 7334                             "oob overlaps data, oob abs %d len %d, "
 7335                             "data abs %d len %d\n",
 7336                             oob_off, oob_len, data_off, data_len);
 7337                         return;
 7338                 }
 7339 
 7340                 /*
 7341                  * Check against pktinfo.
 7342                  */
 7343                 if (pktinfo_len != 0 &&
 7344                     hn_rndis_check_overlap(oob_off, oob_len,
 7345                     pktinfo_off, pktinfo_len)) {
 7346                         if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
 7347                             "oob overlaps pktinfo, oob abs %d len %d, "
 7348                             "pktinfo abs %d len %d\n",
 7349                             oob_off, oob_len, pktinfo_off, pktinfo_len);
 7350                         return;
 7351                 }
 7352         }
 7353 
 7354         /*
 7355          * Check per-packet-info coverage and find useful per-packet-info.
 7356          */
 7357         info.vlan_info = NULL;
 7358         info.csum_info = NULL;
 7359         info.hash_info = NULL;
 7360         info.pktinfo_id = NULL;
 7361 
 7362         if (__predict_true(pktinfo_len != 0)) {
 7363                 bool overlap;
 7364                 int error;
 7365 
 7366                 if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
 7367                         if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
 7368                             "pktinfo overflow, msglen %u, "
 7369                             "pktinfo abs %d len %d\n",
 7370                             pkt->rm_len, pktinfo_off, pktinfo_len);
 7371                         return;
 7372                 }
 7373 
 7374                 /*
 7375                  * Check packet info coverage.
 7376                  */
 7377                 overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
 7378                     data_off, data_len);
 7379                 if (__predict_false(overlap)) {
 7380                         if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
 7381                             "pktinfo overlap data, pktinfo abs %d len %d, "
 7382                             "data abs %d len %d\n",
 7383                             pktinfo_off, pktinfo_len, data_off, data_len);
 7384                         return;
 7385                 }
 7386 
 7387                 /*
 7388                  * Find useful per-packet-info.
 7389                  */
 7390                 error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
 7391                     pktinfo_len, &info);
 7392                 if (__predict_false(error)) {
 7393                         if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
 7394                             "pktinfo\n");
 7395                         return;
 7396                 }
 7397         }
 7398 
 7399         if (__predict_false(data_off + data_len > pkt->rm_len)) {
 7400                 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
 7401                     "data overflow, msglen %u, data abs %d len %d\n",
 7402                     pkt->rm_len, data_off, data_len);
 7403                 return;
 7404         }
 7405 
 7406         /* Identify RSC fragments, drop invalid packets */
 7407         if ((info.pktinfo_id != NULL) &&
 7408             (info.pktinfo_id->flag & HN_NDIS_PKTINFO_SUBALLOC)) {
 7409                 if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_1ST_FRAG) {
 7410                         rxr->rsc.cnt = 0;
 7411                         rxr->hn_rsc_pkts++;
 7412                 } else if (rxr->rsc.cnt == 0)
 7413                         goto drop;
 7414 
 7415                 rsc_more = true;
 7416 
 7417                 if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_LAST_FRAG)
 7418                         rsc_more = false;
 7419 
 7420                 if (rsc_more && rxr->rsc.is_last)
 7421                         goto drop;
 7422         } else {
 7423                 rxr->rsc.cnt = 0;
 7424         }
 7425 
 7426         if (__predict_false(rxr->rsc.cnt >= HN_NVS_RSC_MAX))
 7427                 goto drop;
 7428 
 7429         /* Store the data in the per-RX-ring RSC structure. */
 7430         hn_rsc_add_data(rxr, ((const uint8_t *)pkt) + data_off,
 7431             data_len, &info);
 7432 
 7433         if (rsc_more)
 7434                 return;
 7435 
 7436         hn_rxpkt(rxr);
 7437         rxr->rsc.cnt = 0;
 7438         return;
 7439 drop:
 7440         rxr->hn_rsc_drop++;
 7441         return;
 7442 }
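
/*
 * Editor's note: the RSC handling above is a small state machine keyed on
 * the SUBALLOC pktinfo flags:
 *
 *   1ST_FRAG          start a new aggregate (rsc.cnt = 0, count the packet)
 *   middle fragment   must follow a first fragment (rsc.cnt != 0), else drop
 *   LAST_FRAG         clears rsc_more, so the aggregate is handed to
 *                     hn_rxpkt() and rsc.cnt is reset
 *   no SUBALLOC flag  ordinary single-fragment packet
 *
 * A fragment that still expects more data (rsc_more) but is the last RNDIS
 * message of the channel packet (rsc.is_last) is dropped, since the rest of
 * the aggregate cannot be completed within this batch.
 */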
 7443 
 7444 static __inline void
 7445 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
 7446 {
 7447         const struct rndis_msghdr *hdr;
 7448 
 7449         if (__predict_false(dlen < sizeof(*hdr))) {
 7450                 if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
 7451                 return;
 7452         }
 7453         hdr = data;
 7454 
 7455         if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
 7456                 /* Hot data path. */
 7457                 hn_rndis_rx_data(rxr, data, dlen);
 7458                 /* Done! */
 7459                 return;
 7460         }
 7461 
 7462         if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
 7463                 hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
 7464         else
 7465                 hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
 7466 }
 7467 
 7468 static void
 7469 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
 7470 {
 7471         const struct hn_nvs_hdr *hdr;
 7472 
 7473         if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
 7474                 if_printf(sc->hn_ifp, "invalid nvs notify\n");
 7475                 return;
 7476         }
 7477         hdr = VMBUS_CHANPKT_CONST_DATA(pkt);
 7478 
 7479         if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
 7480                 /* Useless; ignore */
 7481                 return;
 7482         }
 7483         if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
 7484 }
 7485 
 7486 static void
 7487 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
 7488     const struct vmbus_chanpkt_hdr *pkt)
 7489 {
 7490         struct hn_nvs_sendctx *sndc;
 7491 
 7492         sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
 7493         sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
 7494             VMBUS_CHANPKT_DATALEN(pkt));
 7495         /*
 7496          * NOTE:
 7497          * 'sndc' CAN NOT be accessed anymore, since it can be freed by
 7498          * its callback.
 7499          */
 7500 }
 7501 
 7502 static void
 7503 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
 7504     const struct vmbus_chanpkt_hdr *pkthdr)
 7505 {
 7506         struct epoch_tracker et;
 7507         const struct vmbus_chanpkt_rxbuf *pkt;
 7508         const struct hn_nvs_hdr *nvs_hdr;
 7509         int count, i, hlen;
 7510 
 7511         if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
 7512                 if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
 7513                 return;
 7514         }
 7515         nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);
 7516 
 7517         /* Make sure that this is a RNDIS message. */
 7518         if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
 7519                 if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
 7520                     nvs_hdr->nvs_type);
 7521                 return;
 7522         }
 7523 
 7524         hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
 7525         if (__predict_false(hlen < sizeof(*pkt))) {
 7526                 if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
 7527                 return;
 7528         }
 7529         pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;
 7530 
 7531         if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
 7532                 if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
 7533                     pkt->cp_rxbuf_id);
 7534                 return;
 7535         }
 7536 
 7537         count = pkt->cp_rxbuf_cnt;
 7538         if (__predict_false(hlen <
 7539             __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
 7540                 if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
 7541                 return;
 7542         }
 7543 
 7544         NET_EPOCH_ENTER(et);
 7545         /* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
 7546         for (i = 0; i < count; ++i) {
 7547                 int ofs, len;
 7548 
 7549                 ofs = pkt->cp_rxbuf[i].rb_ofs;
 7550                 len = pkt->cp_rxbuf[i].rb_len;
 7551                 if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
 7552                         if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
 7553                             "ofs %d, len %d\n", i, ofs, len);
 7554                         continue;
 7555                 }
 7556 
 7557                 rxr->rsc.is_last = (i == (count - 1));
 7558                 hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
 7559         }
 7560         NET_EPOCH_EXIT(et);
 7561 
 7562         /*
 7563          * Ack the consumed RXBUF associated w/ this channel packet,
 7564          * so that this RXBUF can be recycled by the hypervisor.
 7565          */
 7566         hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
 7567 }
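
/*
 * Editor's note: the NET_EPOCH_ENTER/EXIT bracket above places the whole
 * batch of RNDIS messages under the network epoch, which the input path
 * reached through hn_rxpkt() requires for safe lockless access to ifnet
 * state; entering once per channel packet instead of once per frame keeps
 * the per-frame overhead down.
 */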
 7568 
 7569 static void
 7570 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
 7571     uint64_t tid)
 7572 {
 7573         struct hn_nvs_rndis_ack ack;
 7574         int retries, error;
 7575         
 7576         ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
 7577         ack.nvs_status = HN_NVS_STATUS_OK;
 7578 
 7579         retries = 0;
 7580 again:
 7581         error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
 7582             VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
 7583         if (__predict_false(error == EAGAIN)) {
 7584                 /*
 7585                  * NOTE:
 7586                  * This should _not_ happen in real world, since the
 7587                  * consumption of the TX bufring from the TX path is
 7588                  * controlled.
 7589                  */
 7590                 if (rxr->hn_ack_failed == 0)
 7591                         if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
 7592                 rxr->hn_ack_failed++;
 7593                 retries++;
 7594                 if (retries < 10) {
 7595                         DELAY(100);
 7596                         goto again;
 7597                 }
 7598                 /* RXBUF leaks! */
 7599                 if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
 7600         }
 7601 }
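
/*
 * Editor's note: the retry loop above bounds the busy-wait to nine
 * DELAY(100) spins, i.e. a bit under a millisecond, before giving up and
 * deliberately leaking the RXBUF.  Since the completion travels over the
 * TX bufring, whose consumption the driver itself paces, EAGAIN here
 * indicates abnormal host behavior rather than ordinary backpressure.
 */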
 7602 
 7603 static void
 7604 hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
 7605 {
 7606         struct hn_rx_ring *rxr = xrxr;
 7607         struct hn_softc *sc = rxr->hn_ifp->if_softc;
 7608 
 7609         for (;;) {
 7610                 struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
 7611                 int error, pktlen;
 7612 
 7613                 pktlen = rxr->hn_pktbuf_len;
 7614                 error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
 7615                 if (__predict_false(error == ENOBUFS)) {
 7616                         void *nbuf;
 7617                         int nlen;
 7618 
 7619                         /*
 7620                          * Expand channel packet buffer.
 7621                          *
 7622                          * XXX
 7623                          * Use M_WAITOK here, since allocation failure
 7624                          * is fatal.
 7625                          */
 7626                         nlen = rxr->hn_pktbuf_len * 2;
 7627                         while (nlen < pktlen)
 7628                                 nlen *= 2;
 7629                         nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);
 7630 
 7631                         if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
 7632                             rxr->hn_pktbuf_len, nlen);
 7633 
 7634                         free(rxr->hn_pktbuf, M_DEVBUF);
 7635                         rxr->hn_pktbuf = nbuf;
 7636                         rxr->hn_pktbuf_len = nlen;
 7637                         /* Retry! */
 7638                         continue;
 7639                 } else if (__predict_false(error == EAGAIN)) {
 7640                         /* No more channel packets; done! */
 7641                         break;
 7642                 }
 7643                 KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));
 7644 
 7645                 switch (pkt->cph_type) {
 7646                 case VMBUS_CHANPKT_TYPE_COMP:
 7647                         hn_nvs_handle_comp(sc, chan, pkt);
 7648                         break;
 7649 
 7650                 case VMBUS_CHANPKT_TYPE_RXBUF:
 7651                         hn_nvs_handle_rxbuf(rxr, chan, pkt);
 7652                         break;
 7653 
 7654                 case VMBUS_CHANPKT_TYPE_INBAND:
 7655                         hn_nvs_handle_notify(sc, pkt);
 7656                         break;
 7657 
 7658                 default:
 7659                         if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
 7660                             pkt->cph_type);
 7661                         break;
 7662                 }
 7663         }
 7664         hn_chan_rollup(rxr, rxr->hn_txr);
 7665 }
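
/*
 * Editor's note: the ENOBUFS handling above grows the packet buffer
 * geometrically, doubling until it covers pktlen (which the failed
 * vmbus_chan_recv_pkt() call has updated to the required size, as the
 * while loop relies on), so a ring that occasionally delivers oversized
 * packets incurs only O(log(size)) reallocations.  For example, a 4 KB
 * pktbuf facing a 9 KB packet grows 4 -> 8 -> 16 KB in one pass and the
 * receive is retried exactly once.
 */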
 7666 
 7667 static void
 7668 hn_sysinit(void *arg __unused)
 7669 {
 7670         int i;
 7671 
 7672         hn_udpcs_fixup = counter_u64_alloc(M_WAITOK);
 7673 
 7674 #ifdef HN_IFSTART_SUPPORT
 7675         /*
 7676          * Don't use ifnet.if_start if transparent VF mode is requested;
 7677          * mainly due to the IFF_DRV_OACTIVE flag.
 7678          */
 7679         if (hn_xpnt_vf && hn_use_if_start) {
 7680                 hn_use_if_start = 0;
 7681                 printf("hn: transparent VF mode, if_transmit will be used "
 7682                     "instead of if_start\n");
 7683         }
 7684 #endif
 7685         if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) {
 7686                 printf("hn: invalid transparent VF attach routing "
 7687                     "wait timeout %d, reset to %d\n",
 7688                     hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN);
 7689                 hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
 7690         }
 7691 
 7692         /*
 7693          * Initialize VF map.
 7694          */
 7695         rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE);
 7696         hn_vfmap_size = HN_VFMAP_SIZE_DEF;
 7697         hn_vfmap = malloc(sizeof(struct ifnet *) * hn_vfmap_size, M_DEVBUF,
 7698             M_WAITOK | M_ZERO);
 7699 
 7700         /*
 7701          * Fix the # of TX taskqueues.
 7702          */
 7703         if (hn_tx_taskq_cnt <= 0)
 7704                 hn_tx_taskq_cnt = 1;
 7705         else if (hn_tx_taskq_cnt > mp_ncpus)
 7706                 hn_tx_taskq_cnt = mp_ncpus;
 7707 
 7708         /*
 7709          * Fix the TX taskqueue mode.
 7710          */
 7711         switch (hn_tx_taskq_mode) {
 7712         case HN_TX_TASKQ_M_INDEP:
 7713         case HN_TX_TASKQ_M_GLOBAL:
 7714         case HN_TX_TASKQ_M_EVTTQ:
 7715                 break;
 7716         default:
 7717                 hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
 7718                 break;
 7719         }
 7720 
 7721         if (vm_guest != VM_GUEST_HV)
 7722                 return;
 7723 
 7724         if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL)
 7725                 return;
 7726 
 7727         hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
 7728             M_DEVBUF, M_WAITOK);
 7729         for (i = 0; i < hn_tx_taskq_cnt; ++i) {
 7730                 hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK,
 7731                     taskqueue_thread_enqueue, &hn_tx_taskque[i]);
 7732                 taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET,
 7733                     "hn tx%d", i);
 7734         }
 7735 }
 7736 SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL);
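
/*
 * Editor's note: SYSINIT() runs hn_sysinit() at SI_SUB_DRIVERS /
 * SI_ORDER_SECOND during boot (or at module load), i.e. before any hn(4)
 * device attaches, so the VF map and, in HN_TX_TASKQ_M_GLOBAL mode on a
 * Hyper-V guest (vm_guest == VM_GUEST_HV), the shared TX taskqueues exist
 * by attach time.  In the default HN_TX_TASKQ_M_INDEP mode each device
 * creates its own TX taskqueues instead and hn_tx_taskque stays NULL,
 * which is exactly what hn_sysuninit() below checks.
 */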
 7737 
 7738 static void
 7739 hn_sysuninit(void *arg __unused)
 7740 {
 7741 
 7742         if (hn_tx_taskque != NULL) {
 7743                 int i;
 7744 
 7745                 for (i = 0; i < hn_tx_taskq_cnt; ++i)
 7746                         taskqueue_free(hn_tx_taskque[i]);
 7747                 free(hn_tx_taskque, M_DEVBUF);
 7748         }
 7749 
 7750         if (hn_vfmap != NULL)
 7751                 free(hn_vfmap, M_DEVBUF);
 7752         rm_destroy(&hn_vfmap_lock);
 7753 
 7754         counter_u64_free(hn_udpcs_fixup);
 7755 }
 7756 SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL);
