From da2f98a1cf890caeb84b303d3bd94da2b518785c Mon Sep 17 00:00:00 2001 From: Wei Hu Date: Wed, 29 Apr 2015 10:12:34 +0000 Subject: [PATCH] Microsoft vmbus, storage and other related driver enhancements for HyperV. - Vmbus multi channel support. - Vector interrupt support. - Signal optimization. - Storvsc driver performance improvement. - Scatter and gather support for storvsc driver. - Minor bug fix for KVP driver. Thanks royger, jhb and delphij from FreeBSD community for the reviews and comments. Also thanks Hovy Xu from NetApp for the contributions to the storvsc driver. PR: 195238 Submitted by: whu Reviewed by: royger, jhb, delphij Approved by: royger MFC after: 2 weeks Relnotes: yes Sponsored by: Microsoft OSTC --- sys/amd64/amd64/apic_vector.S | 16 + sys/amd64/conf/GENERIC | 4 +- sys/amd64/conf/NOTES | 2 + sys/conf/options.amd64 | 2 + sys/conf/options.i386 | 2 + sys/dev/hyperv/include/hyperv.h | 167 +++- .../hyperv/storvsc/hv_storvsc_drv_freebsd.c | 801 +++++++++++++++--- sys/dev/hyperv/storvsc/hv_vstorage.h | 16 +- sys/dev/hyperv/utilities/hv_kvp.c | 11 +- sys/dev/hyperv/utilities/hv_util.c | 9 + sys/dev/hyperv/vmbus/hv_channel.c | 98 ++- sys/dev/hyperv/vmbus/hv_channel_mgmt.c | 267 +++++- sys/dev/hyperv/vmbus/hv_connection.c | 282 ++++-- sys/dev/hyperv/vmbus/hv_hv.c | 66 +- sys/dev/hyperv/vmbus/hv_ring_buffer.c | 76 +- sys/dev/hyperv/vmbus/hv_vmbus_drv_freebsd.c | 372 +++++--- sys/dev/hyperv/vmbus/hv_vmbus_priv.h | 73 +- sys/i386/conf/GENERIC | 4 +- sys/i386/i386/apic_vector.s | 19 + sys/x86/include/apicvar.h | 1 + 20 files changed, 1808 insertions(+), 480 deletions(-) diff --git a/sys/amd64/amd64/apic_vector.S b/sys/amd64/amd64/apic_vector.S index c3aac336a506..4455cab0e787 100644 --- a/sys/amd64/amd64/apic_vector.S +++ b/sys/amd64/amd64/apic_vector.S @@ -174,6 +174,22 @@ IDTVEC(xen_intr_upcall) jmp doreti #endif +#ifdef HYPERV +/* + * This is the Hyper-V vmbus channel direct callback interrupt. + * Only used when it is running on Hyper-V. 
+ */ + .text + SUPERALIGN_TEXT +IDTVEC(hv_vmbus_callback) + PUSH_FRAME + FAKE_MCOUNT(TF_RIP(%rsp)) + movq %rsp, %rdi + call hv_vector_handler + MEXITCOUNT + jmp doreti +#endif + #ifdef SMP /* * Global address space TLB shootdown. diff --git a/sys/amd64/conf/GENERIC b/sys/amd64/conf/GENERIC index bdaca33960f3..c24dd5ac918b 100644 --- a/sys/amd64/conf/GENERIC +++ b/sys/amd64/conf/GENERIC @@ -340,7 +340,9 @@ device virtio_blk # VirtIO Block device device virtio_scsi # VirtIO SCSI device device virtio_balloon # VirtIO Memory Balloon device -# HyperV drivers +# HyperV drivers and enhancement support +# NOTE: HYPERV depends on hyperv. They must be added or removed together. +options HYPERV # Hyper-V kernel infrastructure device hyperv # HyperV drivers # Xen HVM Guest Optimizations diff --git a/sys/amd64/conf/NOTES b/sys/amd64/conf/NOTES index 9b697f087e0d..e0fe46577fd7 100644 --- a/sys/amd64/conf/NOTES +++ b/sys/amd64/conf/NOTES @@ -494,6 +494,8 @@ device virtio_balloon # VirtIO Memory Balloon device device virtio_random # VirtIO Entropy device device virtio_console # VirtIO Console device +# Microsoft Hyper-V enhancement support +options HYPERV # Hyper-V kernel infrastructure device hyperv # HyperV drivers # Xen HVM Guest Optimizations diff --git a/sys/conf/options.amd64 b/sys/conf/options.amd64 index f1d4b4a57e48..0e591878dece 100644 --- a/sys/conf/options.amd64 +++ b/sys/conf/options.amd64 @@ -63,5 +63,7 @@ BPF_JITTER opt_bpf.h XENHVM opt_global.h +HYPERV opt_global.h + # options for the Intel C600 SAS driver (isci) ISCI_LOGGING opt_isci.h diff --git a/sys/conf/options.i386 b/sys/conf/options.i386 index 0a1a52f21991..6f5c45b7e1b2 100644 --- a/sys/conf/options.i386 +++ b/sys/conf/options.i386 @@ -125,5 +125,7 @@ NATIVE opt_global.h XEN opt_global.h XENHVM opt_global.h +HYPERV opt_global.h + # options for the Intel C600 SAS driver (isci) ISCI_LOGGING opt_isci.h diff --git a/sys/dev/hyperv/include/hyperv.h b/sys/dev/hyperv/include/hyperv.h index 
8a45d89cd063..5360b7c160b9 100644 --- a/sys/dev/hyperv/include/hyperv.h +++ b/sys/dev/hyperv/include/hyperv.h @@ -46,6 +46,7 @@ #include #include #include +#include #include #include #include @@ -63,11 +64,22 @@ typedef uint8_t hv_bool_uint8_t; #define HV_ERROR_MACHINE_LOCKED 0x800704F7 /* - * A revision number of vmbus that is used for ensuring both ends on a - * partition are using compatible versions. + * VMBUS version is 32 bit, upper 16 bit for major_number and lower + * 16 bit for minor_number. + * + * 0.13 -- Windows Server 2008 + * 1.1 -- Windows 7 + * 2.4 -- Windows 8 + * 3.0 -- Windows 8.1 */ +#define HV_VMBUS_VERSION_WS2008 ((0 << 16) | (13)) +#define HV_VMBUS_VERSION_WIN7 ((1 << 16) | (1)) +#define HV_VMBUS_VERSION_WIN8 ((2 << 16) | (4)) +#define HV_VMBUS_VERSION_WIN8_1 ((3 << 16) | (0)) -#define HV_VMBUS_REVISION_NUMBER 13 +#define HV_VMBUS_VERSION_INVALID -1 + +#define HV_VMBUS_VERSION_CURRENT HV_VMBUS_VERSION_WIN8_1 /* * Make maximum size of pipe payload of 16K @@ -112,6 +124,18 @@ typedef struct hv_guid { unsigned char data[16]; } __packed hv_guid; +#define HV_NIC_GUID \ + .data = {0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46, \ + 0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E} + +#define HV_IDE_GUID \ + .data = {0x32, 0x26, 0x41, 0x32, 0xcb, 0x86, 0xa2, 0x44, \ + 0x9b, 0x5c, 0x50, 0xd1, 0x41, 0x73, 0x54, 0xf5} + +#define HV_SCSI_GUID \ + .data = {0xd9, 0x63, 0x61, 0xba, 0xa1, 0x04, 0x29, 0x4d, \ + 0xb6, 0x05, 0x72, 0xe2, 0xff, 0xb1, 0xdc, 0x7f} + /* * At the center of the Channel Management library is * the Channel Offer. This struct contains the @@ -147,7 +171,11 @@ typedef struct hv_vmbus_channel_offer { } __packed pipe; } u; - uint32_t padding; + /* + * Sub_channel_index, newly added in Win8. 
+ */ + uint16_t sub_channel_index; + uint16_t padding; } __packed hv_vmbus_channel_offer; @@ -344,7 +372,25 @@ typedef struct { hv_vmbus_channel_offer offer; uint32_t child_rel_id; uint8_t monitor_id; - hv_bool_uint8_t monitor_allocated; + /* + * This field has been split into a bit field on Win7 + * and higher. + */ + uint8_t monitor_allocated:1; + uint8_t reserved:7; + /* + * Following fields were added in win7 and higher. + * Make sure to check the version before accessing these fields. + * + * If "is_dedicated_interrupt" is set, we must not set the + * associated bit in the channel bitmap while sending the + * interrupt to the host. + * + * connection_id is used in signaling the host. + */ + uint16_t is_dedicated_interrupt:1; + uint16_t reserved1:15; + uint32_t connection_id; } __packed hv_vmbus_channel_offer_channel; /* @@ -394,9 +440,11 @@ typedef struct hv_gpadl_handle ring_buffer_gpadl_handle; /* - * GPADL for the channel's server context save area. + * Before win8, all incoming channel interrupts are only + * delivered on cpu 0. Setting this value to 0 would + * preserve the earlier behavior. 
*/ - hv_gpadl_handle server_context_area_gpadl_handle; + uint32_t target_vcpu; /* * The upstream ring buffer begins at offset zero in the memory described @@ -646,14 +694,42 @@ typedef struct { } hv_vmbus_ring_buffer_info; typedef void (*hv_vmbus_pfn_channel_callback)(void *context); +typedef void (*hv_vmbus_sc_creation_callback)(void *context); typedef enum { HV_CHANNEL_OFFER_STATE, HV_CHANNEL_OPENING_STATE, HV_CHANNEL_OPEN_STATE, + HV_CHANNEL_OPENED_STATE, HV_CHANNEL_CLOSING_NONDESTRUCTIVE_STATE, } hv_vmbus_channel_state; +/* + * Connection identifier type + */ +typedef union { + uint32_t as_uint32_t; + struct { + uint32_t id:24; + uint32_t reserved:8; + } u; + +} __packed hv_vmbus_connection_id; + +/* + * Definition of the hv_vmbus_signal_event hypercall input structure + */ +typedef struct { + hv_vmbus_connection_id connection_id; + uint16_t flag_number; + uint16_t rsvd_z; +} __packed hv_vmbus_input_signal_event; + +typedef struct { + uint64_t align8; + hv_vmbus_input_signal_event event; +} __packed hv_vmbus_input_signal_event_buffer; + typedef struct hv_vmbus_channel { TAILQ_ENTRY(hv_vmbus_channel) list_entry; struct hv_device* device; @@ -688,8 +764,82 @@ typedef struct hv_vmbus_channel { hv_vmbus_pfn_channel_callback on_channel_callback; void* channel_callback_context; + /* + * If batched_reading is set to "true", mask the interrupt + * and read until the channel is empty. + * If batched_reading is set to "false", the channel is not + * going to perform batched reading. + * + * Batched reading is enabled by default; specific + * drivers that don't want this behavior can turn it off. + */ + boolean_t batched_reading; + + boolean_t is_dedicated_interrupt; + + /* + * Used as an input param for HV_CALL_SIGNAL_EVENT hypercall. 
+ */ + hv_vmbus_input_signal_event_buffer signal_event_buffer; + /* + * 8-bytes aligned of the buffer above + */ + hv_vmbus_input_signal_event *signal_event_param; + + /* + * From Win8, this field specifies the target virtual processor + * on which to deliver the interrupt from the host to guest. + * Before Win8, all channel interrupts would only be + * delivered on cpu 0. Setting this value to 0 would preserve + * the earlier behavior. + */ + uint32_t target_vcpu; + /* The corresponding CPUID in the guest */ + uint32_t target_cpu; + + /* + * Support for multi-channels. + * The initial offer is considered the primary channel and this + * offer message will indicate if the host supports multi-channels. + * The guest is free to ask for multi-channels to be offered and can + * open these multi-channels as a normal "primary" channel. However, + * all multi-channels will have the same type and instance guids as the + * primary channel. Requests sent on a given channel will result in a + * response on the same channel. + */ + + /* + * Multi-channel creation callback. This callback will be called in + * process context when a Multi-channel offer is received from the host. + * The guest can open the Multi-channel in the context of this callback. + */ + hv_vmbus_sc_creation_callback sc_creation_callback; + + struct mtx sc_lock; + + /* + * Link list of all the multi-channels if this is a primary channel + */ + TAILQ_HEAD(, hv_vmbus_channel) sc_list_anchor; + TAILQ_ENTRY(hv_vmbus_channel) sc_list_entry; + + /* + * The primary channel this sub-channel belongs to. + * This will be NULL for the primary channel. + */ + struct hv_vmbus_channel *primary_channel; + /* + * Support per channel state for use by vmbus drivers. 
+ */ + void *per_channel_state; } hv_vmbus_channel; +static inline void +hv_set_channel_read_state(hv_vmbus_channel* channel, boolean_t state) +{ + channel->batched_reading = state; +} + typedef struct hv_device { hv_guid class_id; hv_guid device_id; @@ -760,6 +910,8 @@ int hv_vmbus_channel_teardown_gpdal( hv_vmbus_channel* channel, uint32_t gpadl_handle); +struct hv_vmbus_channel* vmbus_select_outgoing_channel(struct hv_vmbus_channel *promary); + /* * Work abstraction defines */ @@ -819,6 +971,7 @@ typedef struct hv_vmbus_service { extern uint8_t* receive_buffer[]; extern hv_vmbus_service service_table[]; +extern uint32_t hv_vmbus_protocal_version; void hv_kvp_callback(void *context); int hv_kvp_init(hv_vmbus_service *serv); diff --git a/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c b/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c index d00d279285ad..f8a871b285a9 100644 --- a/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c +++ b/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c @@ -38,6 +38,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -53,8 +54,12 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include +#include +#include +#include #include #include @@ -66,7 +71,6 @@ __FBSDID("$FreeBSD$"); #include #include - #include #include "hv_vstorage.h" @@ -77,8 +81,29 @@ __FBSDID("$FreeBSD$"); #define BLKVSC_MAX_IO_REQUESTS STORVSC_MAX_IO_REQUESTS #define STORVSC_MAX_TARGETS (2) +#define STORVSC_WIN7_MAJOR 4 +#define STORVSC_WIN7_MINOR 2 + +#define STORVSC_WIN8_MAJOR 5 +#define STORVSC_WIN8_MINOR 1 + +#define HV_ALIGN(x, a) roundup2(x, a) + struct storvsc_softc; +struct hv_sgl_node { + LIST_ENTRY(hv_sgl_node) link; + struct sglist *sgl_data; +}; + +struct hv_sgl_page_pool{ + LIST_HEAD(, hv_sgl_node) in_use_sgl_list; + LIST_HEAD(, hv_sgl_node) free_sgl_list; + boolean_t is_init; +} g_hv_sgl_page_pool; + +#define STORVSC_MAX_SG_PAGE_CNT STORVSC_MAX_IO_REQUESTS * HV_MAX_MULTIPAGE_BUFFER_COUNT + enum 
storvsc_request_type { WRITE_TYPE, READ_TYPE, @@ -96,20 +121,24 @@ struct hv_storvsc_request { struct storvsc_softc *softc; struct callout callout; struct sema synch_sema; /*Synchronize the request/response if needed */ + struct sglist *bounce_sgl; + unsigned int bounce_sgl_count; + uint64_t not_aligned_seg_bits; }; struct storvsc_softc { struct hv_device *hs_dev; - LIST_HEAD(, hv_storvsc_request) hs_free_list; - struct mtx hs_lock; - struct storvsc_driver_props *hs_drv_props; - int hs_unit; - uint32_t hs_frozen; - struct cam_sim *hs_sim; - struct cam_path *hs_path; + LIST_HEAD(, hv_storvsc_request) hs_free_list; + struct mtx hs_lock; + struct storvsc_driver_props *hs_drv_props; + int hs_unit; + uint32_t hs_frozen; + struct cam_sim *hs_sim; + struct cam_path *hs_path; uint32_t hs_num_out_reqs; boolean_t hs_destroy; boolean_t hs_drain_notify; + boolean_t hs_open_multi_channel; struct sema hs_drain_sema; struct hv_storvsc_request hs_init_req; struct hv_storvsc_request hs_reset_req; @@ -124,7 +153,7 @@ struct storvsc_softc { * The first can be tested by "sg_senddiag -vv /dev/daX", * and the second and third can be done by * "sg_wr_mode -v -p 08 -c 0,1a -m 0,ff /dev/daX". 
- */ + */ #define HVS_TIMEOUT_TEST 0 /* @@ -138,7 +167,7 @@ struct storvsc_driver_props { char *drv_name; char *drv_desc; uint8_t drv_max_luns_per_target; - uint8_t drv_max_ios_per_target; + uint8_t drv_max_ios_per_target; uint32_t drv_ringbuffer_size; }; @@ -150,6 +179,8 @@ enum hv_storage_type { #define HS_MAX_ADAPTERS 10 +#define HV_STORAGE_SUPPORTS_MULTI_CHANNEL 0x1 + /* {ba6163d9-04a1-4d29-b605-72e2ffb1dc7f} */ static const hv_guid gStorVscDeviceType={ .data = {0xd9, 0x63, 0x61, 0xba, 0xa1, 0x04, 0x29, 0x4d, @@ -171,13 +202,16 @@ static struct storvsc_driver_props g_drv_props_table[] = { STORVSC_RINGBUFFER_SIZE} }; +static int storvsc_current_major; +static int storvsc_current_minor; + /* static functions */ static int storvsc_probe(device_t dev); static int storvsc_attach(device_t dev); static int storvsc_detach(device_t dev); static void storvsc_poll(struct cam_sim * sim); static void storvsc_action(struct cam_sim * sim, union ccb * ccb); -static void create_storvsc_request(union ccb *ccb, struct hv_storvsc_request *reqp); +static int create_storvsc_request(union ccb *ccb, struct hv_storvsc_request *reqp); static void storvsc_free_request(struct storvsc_softc *sc, struct hv_storvsc_request *reqp); static enum hv_storage_type storvsc_get_storage_type(device_t dev); static void hv_storvsc_on_channel_callback(void *context); @@ -186,6 +220,14 @@ static void hv_storvsc_on_iocompletion( struct storvsc_softc *sc, struct hv_storvsc_request *request); static int hv_storvsc_connect_vsp(struct hv_device *device); static void storvsc_io_done(struct hv_storvsc_request *reqp); +static void storvsc_copy_sgl_to_bounce_buf(struct sglist *bounce_sgl, + bus_dma_segment_t *orig_sgl, + unsigned int orig_sgl_count, + uint64_t seg_bits); +void storvsc_copy_from_bounce_buf_to_sgl(bus_dma_segment_t *dest_sgl, + unsigned int dest_sgl_count, + struct sglist* src_sgl, + uint64_t seg_bits); static device_method_t storvsc_methods[] = { /* Device interface */ @@ -207,7 +249,7 @@ 
MODULE_DEPEND(storvsc, vmbus, 1, 1, 1); /** - * The host is capable of sending messages to us that are + * The host is capable of sending messages to us that are * completely unsolicited. So, we need to address the race * condition where we may be in the process of unloading the * driver when the host may send us an unsolicited message. @@ -223,7 +265,7 @@ MODULE_DEPEND(storvsc, vmbus, 1, 1, 1); * destroyed. * * 3. Once the device is marked as being destroyed, we only - * permit incoming traffic to properly account for + * permit incoming traffic to properly account for * packets already sent out. */ static inline struct storvsc_softc * @@ -259,6 +301,113 @@ get_stor_device(struct hv_device *device, return sc; } +/** + * @brief Callback handler, will be invoked when receive mutil-channel offer + * + * @param context new multi-channel + */ +static void +storvsc_handle_sc_creation(void *context) +{ + hv_vmbus_channel *new_channel; + struct hv_device *device; + struct storvsc_softc *sc; + struct vmstor_chan_props props; + int ret = 0; + + new_channel = (hv_vmbus_channel *)context; + device = new_channel->primary_channel->device; + sc = get_stor_device(device, TRUE); + if (sc == NULL) + return; + + if (FALSE == sc->hs_open_multi_channel) + return; + + memset(&props, 0, sizeof(props)); + + ret = hv_vmbus_channel_open(new_channel, + sc->hs_drv_props->drv_ringbuffer_size, + sc->hs_drv_props->drv_ringbuffer_size, + (void *)&props, + sizeof(struct vmstor_chan_props), + hv_storvsc_on_channel_callback, + new_channel); + + return; +} + +/** + * @brief Send multi-channel creation request to host + * + * @param device a Hyper-V device pointer + * @param max_chans the max channels supported by vmbus + */ +static void +storvsc_send_multichannel_request(struct hv_device *dev, int max_chans) +{ + struct storvsc_softc *sc; + struct hv_storvsc_request *request; + struct vstor_packet *vstor_packet; + int request_channels_cnt = 0; + int ret; + + /* get multichannels count that need to 
create */ + request_channels_cnt = MIN(max_chans, mp_ncpus); + + sc = get_stor_device(dev, TRUE); + if (sc == NULL) { + printf("Storvsc_error: get sc failed while send mutilchannel " + "request\n"); + return; + } + + request = &sc->hs_init_req; + + /* Establish a handler for multi-channel */ + dev->channel->sc_creation_callback = storvsc_handle_sc_creation; + + /* request the host to create multi-channel */ + memset(request, 0, sizeof(struct hv_storvsc_request)); + + sema_init(&request->synch_sema, 0, ("stor_synch_sema")); + + vstor_packet = &request->vstor_packet; + + vstor_packet->operation = VSTOR_OPERATION_CREATE_MULTI_CHANNELS; + vstor_packet->flags = REQUEST_COMPLETION_FLAG; + vstor_packet->u.multi_channels_cnt = request_channels_cnt; + + ret = hv_vmbus_channel_send_packet( + dev->channel, + vstor_packet, + sizeof(struct vstor_packet), + (uint64_t)(uintptr_t)request, + HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, + HV_VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); + + /* wait for 5 seconds */ + ret = sema_timedwait(&request->synch_sema, 5 * hz); + if (ret != 0) { + printf("Storvsc_error: create multi-channel timeout, %d\n", + ret); + return; + } + + if (vstor_packet->operation != VSTOR_OPERATION_COMPLETEIO || + vstor_packet->status != 0) { + printf("Storvsc_error: create multi-channel invalid operation " + "(%d) or statue (%u)\n", + vstor_packet->operation, vstor_packet->status); + return; + } + + sc->hs_open_multi_channel = TRUE; + + if (bootverbose) + printf("Storvsc create multi-channel success!\n"); +} + /** * @brief initialize channel connection to parent partition * @@ -272,11 +421,15 @@ hv_storvsc_channel_init(struct hv_device *dev) struct hv_storvsc_request *request; struct vstor_packet *vstor_packet; struct storvsc_softc *sc; + uint16_t max_chans = 0; + boolean_t support_multichannel = FALSE; + + max_chans = 0; + support_multichannel = FALSE; sc = get_stor_device(dev, TRUE); - if (sc == NULL) { - return ENODEV; - } + if (sc == NULL) + return (ENODEV); request = 
&sc->hs_init_req; memset(request, 0, sizeof(struct hv_storvsc_request)); @@ -300,15 +453,13 @@ hv_storvsc_channel_init(struct hv_device *dev) HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, HV_VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); - if (ret != 0) { + if (ret != 0) goto cleanup; - } - ret = sema_timedwait(&request->synch_sema, 500); /* KYS 5 seconds */ - - if (ret != 0) { + /* wait 5 seconds */ + ret = sema_timedwait(&request->synch_sema, 5 * hz); + if (ret != 0) goto cleanup; - } if (vstor_packet->operation != VSTOR_OPERATION_COMPLETEIO || vstor_packet->status != 0) { @@ -321,7 +472,8 @@ hv_storvsc_channel_init(struct hv_device *dev) vstor_packet->operation = VSTOR_OPERATION_QUERYPROTOCOLVERSION; vstor_packet->flags = REQUEST_COMPLETION_FLAG; - vstor_packet->u.version.major_minor = VMSTOR_PROTOCOL_VERSION_CURRENT; + vstor_packet->u.version.major_minor = + VMSTOR_PROTOCOL_VERSION(storvsc_current_major, storvsc_current_minor); /* revision is only significant for Windows guests */ vstor_packet->u.version.revision = 0; @@ -334,21 +486,19 @@ hv_storvsc_channel_init(struct hv_device *dev) HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, HV_VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); - if (ret != 0) { + if (ret != 0) goto cleanup; - } - ret = sema_timedwait(&request->synch_sema, 500); /* KYS 5 seconds */ + /* wait 5 seconds */ + ret = sema_timedwait(&request->synch_sema, 5 * hz); - if (ret) { + if (ret) goto cleanup; - } /* TODO: Check returned version */ if (vstor_packet->operation != VSTOR_OPERATION_COMPLETEIO || - vstor_packet->status != 0) { + vstor_packet->status != 0) goto cleanup; - } /** * Query channel properties @@ -365,22 +515,30 @@ hv_storvsc_channel_init(struct hv_device *dev) HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, HV_VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); - if ( ret != 0) { + if ( ret != 0) goto cleanup; - } - ret = sema_timedwait(&request->synch_sema, 500); /* KYS 5 seconds */ + /* wait 5 seconds */ + ret = sema_timedwait(&request->synch_sema, 5 * hz); - if (ret != 0) { 
+ if (ret != 0) goto cleanup; - } /* TODO: Check returned version */ if (vstor_packet->operation != VSTOR_OPERATION_COMPLETEIO || - vstor_packet->status != 0) { + vstor_packet->status != 0) { goto cleanup; } + /* multi-channels feature is supported by WIN8 and above version */ + max_chans = vstor_packet->u.chan_props.max_channel_cnt; + if ((hv_vmbus_protocal_version != HV_VMBUS_VERSION_WIN7) && + (hv_vmbus_protocal_version != HV_VMBUS_VERSION_WS2008) && + (vstor_packet->u.chan_props.flags & + HV_STORAGE_SUPPORTS_MULTI_CHANNEL)) { + support_multichannel = TRUE; + } + memset(vstor_packet, 0, sizeof(struct vstor_packet)); vstor_packet->operation = VSTOR_OPERATION_ENDINITIALIZATION; vstor_packet->flags = REQUEST_COMPLETION_FLAG; @@ -397,16 +555,22 @@ hv_storvsc_channel_init(struct hv_device *dev) goto cleanup; } - ret = sema_timedwait(&request->synch_sema, 500); /* KYS 5 seconds */ + /* wait 5 seconds */ + ret = sema_timedwait(&request->synch_sema, 5 * hz); - if (ret != 0) { + if (ret != 0) goto cleanup; - } if (vstor_packet->operation != VSTOR_OPERATION_COMPLETEIO || - vstor_packet->status != 0) { + vstor_packet->status != 0) goto cleanup; - } + + /* + * If multi-channel is supported, send multichannel create + * request to host. 
+ */ + if (support_multichannel) + storvsc_send_multichannel_request(dev, max_chans); cleanup: sema_destroy(&request->synch_sema); @@ -443,8 +607,7 @@ hv_storvsc_connect_vsp(struct hv_device *dev) (void *)&props, sizeof(struct vmstor_chan_props), hv_storvsc_on_channel_callback, - dev); - + dev->channel); if (ret != 0) { return ret; @@ -490,7 +653,7 @@ hv_storvsc_host_reset(struct hv_device *dev) goto cleanup; } - ret = sema_timedwait(&request->synch_sema, 500); /* KYS 5 seconds */ + ret = sema_timedwait(&request->synch_sema, 5 * hz); /* KYS 5 seconds */ if (ret) { goto cleanup; @@ -498,7 +661,7 @@ hv_storvsc_host_reset(struct hv_device *dev) /* - * At this point, all outstanding requests in the adapter + * At this point, all outstanding requests in the adapter * should have been flushed out and return to us */ @@ -521,6 +684,7 @@ hv_storvsc_io_request(struct hv_device *device, { struct storvsc_softc *sc; struct vstor_packet *vstor_packet = &request->vstor_packet; + struct hv_vmbus_channel* outgoing_channel = NULL; int ret = 0; sc = get_stor_device(device, TRUE); @@ -539,19 +703,20 @@ hv_storvsc_io_request(struct hv_device *device, vstor_packet->operation = VSTOR_OPERATION_EXECUTESRB; + outgoing_channel = vmbus_select_outgoing_channel(device->channel); mtx_unlock(&request->softc->hs_lock); if (request->data_buf.length) { ret = hv_vmbus_channel_send_packet_multipagebuffer( - device->channel, + outgoing_channel, &request->data_buf, - vstor_packet, - sizeof(struct vstor_packet), + vstor_packet, + sizeof(struct vstor_packet), (uint64_t)(uintptr_t)request); } else { ret = hv_vmbus_channel_send_packet( - device->channel, + outgoing_channel, vstor_packet, sizeof(struct vstor_packet), (uint64_t)(uintptr_t)request, @@ -610,7 +775,8 @@ static void hv_storvsc_on_channel_callback(void *context) { int ret = 0; - struct hv_device *device = (struct hv_device *)context; + hv_vmbus_channel *channel = (hv_vmbus_channel *)context; + struct hv_device *device = NULL; struct 
storvsc_softc *sc; uint32_t bytes_recvd; uint64_t request_id; @@ -618,15 +784,22 @@ hv_storvsc_on_channel_callback(void *context) struct hv_storvsc_request *request; struct vstor_packet *vstor_packet; + if (channel->primary_channel != NULL){ + device = channel->primary_channel->device; + } else { + device = channel->device; + } + + KASSERT(device, ("device is NULL")); + sc = get_stor_device(device, FALSE); if (sc == NULL) { + printf("Storvsc_error: get stor device failed.\n"); return; } - KASSERT(device, ("device")); - ret = hv_vmbus_channel_recv_packet( - device->channel, + channel, packet, roundup2(sizeof(struct vstor_packet), 8), &bytes_recvd, @@ -634,21 +807,28 @@ hv_storvsc_on_channel_callback(void *context) while ((ret == 0) && (bytes_recvd > 0)) { request = (struct hv_storvsc_request *)(uintptr_t)request_id; - KASSERT(request, ("request")); if ((request == &sc->hs_init_req) || (request == &sc->hs_reset_req)) { memcpy(&request->vstor_packet, packet, sizeof(struct vstor_packet)); - sema_post(&request->synch_sema); + sema_post(&request->synch_sema); } else { vstor_packet = (struct vstor_packet *)packet; switch(vstor_packet->operation) { case VSTOR_OPERATION_COMPLETEIO: + if (request == NULL) + panic("VMBUS: storvsc received a " + "packet with NULL request id in " + "COMPLETEIO operation."); + hv_storvsc_on_iocompletion(sc, vstor_packet, request); break; case VSTOR_OPERATION_REMOVEDEVICE: + case VSTOR_OPERATION_ENUMERATE_BUS: + printf("VMBUS: storvsc operation %d not " + "implemented.\n", vstor_packet->operation); /* TODO: implement */ break; default: @@ -656,7 +836,7 @@ hv_storvsc_on_channel_callback(void *context) } } ret = hv_vmbus_channel_recv_packet( - device->channel, + channel, packet, roundup2(sizeof(struct vstor_packet), 8), &bytes_recvd, @@ -680,7 +860,16 @@ storvsc_probe(device_t dev) { int ata_disk_enable = 0; int ret = ENXIO; - + + if ((HV_VMBUS_VERSION_WIN8 == hv_vmbus_protocal_version) || + (HV_VMBUS_VERSION_WIN8_1 == hv_vmbus_protocal_version)){ 
+ storvsc_current_major = STORVSC_WIN8_MAJOR; + storvsc_current_minor = STORVSC_WIN8_MINOR; + } else { + storvsc_current_major = STORVSC_WIN7_MAJOR; + storvsc_current_minor = STORVSC_WIN7_MINOR; + } + switch (storvsc_get_storage_type(dev)) { case DRIVER_BLKVSC: if(bootverbose) @@ -721,9 +910,11 @@ storvsc_attach(device_t dev) enum hv_storage_type stor_type; struct storvsc_softc *sc; struct cam_devq *devq; - int ret, i; + int ret, i, j; struct hv_storvsc_request *reqp; struct root_hold_token *root_mount_token = NULL; + struct hv_sgl_node *sgl_node = NULL; + void *tmp_buff = NULL; /* * We need to serialize storvsc attach calls. @@ -764,8 +955,41 @@ storvsc_attach(device_t dev) LIST_INSERT_HEAD(&sc->hs_free_list, reqp, link); } + /* create sg-list page pool */ + if (FALSE == g_hv_sgl_page_pool.is_init) { + g_hv_sgl_page_pool.is_init = TRUE; + LIST_INIT(&g_hv_sgl_page_pool.in_use_sgl_list); + LIST_INIT(&g_hv_sgl_page_pool.free_sgl_list); + + /* + * Pre-create SG list, each SG list with + * HV_MAX_MULTIPAGE_BUFFER_COUNT segments, each + * segment has one page buffer + */ + for (i = 0; i < STORVSC_MAX_IO_REQUESTS; i++) { + sgl_node = malloc(sizeof(struct hv_sgl_node), + M_DEVBUF, M_WAITOK|M_ZERO); + + sgl_node->sgl_data = + sglist_alloc(HV_MAX_MULTIPAGE_BUFFER_COUNT, + M_WAITOK|M_ZERO); + + for (j = 0; j < HV_MAX_MULTIPAGE_BUFFER_COUNT; j++) { + tmp_buff = malloc(PAGE_SIZE, + M_DEVBUF, M_WAITOK|M_ZERO); + + sgl_node->sgl_data->sg_segs[j].ss_paddr = + (vm_paddr_t)tmp_buff; + } + + LIST_INSERT_HEAD(&g_hv_sgl_page_pool.free_sgl_list, + sgl_node, link); + } + } + sc->hs_destroy = FALSE; sc->hs_drain_notify = FALSE; + sc->hs_open_multi_channel = FALSE; sema_init(&sc->hs_drain_sema, 0, "Store Drain Sema"); ret = hv_storvsc_connect_vsp(hv_dev); @@ -834,6 +1058,20 @@ storvsc_attach(device_t dev) LIST_REMOVE(reqp, link); free(reqp, M_DEVBUF); } + + while (!LIST_EMPTY(&g_hv_sgl_page_pool.free_sgl_list)) { + sgl_node = LIST_FIRST(&g_hv_sgl_page_pool.free_sgl_list); + 
LIST_REMOVE(sgl_node, link); + for (j = 0; j < HV_MAX_MULTIPAGE_BUFFER_COUNT; j++) { + if (NULL != + (void*)sgl_node->sgl_data->sg_segs[j].ss_paddr) { + free((void*)sgl_node->sgl_data->sg_segs[j].ss_paddr, M_DEVBUF); + } + } + sglist_free(sgl_node->sgl_data); + free(sgl_node, M_DEVBUF); + } + return (ret); } @@ -853,6 +1091,8 @@ storvsc_detach(device_t dev) struct storvsc_softc *sc = device_get_softc(dev); struct hv_storvsc_request *reqp = NULL; struct hv_device *hv_device = vmbus_get_devctx(dev); + struct hv_sgl_node *sgl_node = NULL; + int j = 0; mtx_lock(&hv_device->channel->inbound_lock); sc->hs_destroy = TRUE; @@ -884,6 +1124,20 @@ storvsc_detach(device_t dev) free(reqp, M_DEVBUF); } mtx_unlock(&sc->hs_lock); + + while (!LIST_EMPTY(&g_hv_sgl_page_pool.free_sgl_list)) { + sgl_node = LIST_FIRST(&g_hv_sgl_page_pool.free_sgl_list); + LIST_REMOVE(sgl_node, link); + for (j = 0; j < HV_MAX_MULTIPAGE_BUFFER_COUNT; j++){ + if (NULL != + (void*)sgl_node->sgl_data->sg_segs[j].ss_paddr) { + free((void*)sgl_node->sgl_data->sg_segs[j].ss_paddr, M_DEVBUF); + } + } + sglist_free(sgl_node->sgl_data); + free(sgl_node, M_DEVBUF); + } + return (0); } @@ -939,7 +1193,7 @@ storvsc_timeout_test(struct hv_storvsc_request *reqp, ticks, __func__, (ret == 0)? "IO return detected" : "IO return not detected"); - /* + /* * Now both the timer handler and io done are running * simultaneously. We want to confirm the io done always * finishes after the timer handler exits. 
So reqp used by @@ -1023,7 +1277,7 @@ storvsc_poll(struct cam_sim *sim) mtx_assert(&sc->hs_lock, MA_OWNED); mtx_unlock(&sc->hs_lock); - hv_storvsc_on_channel_callback(sc->hs_dev); + hv_storvsc_on_channel_callback(sc->hs_dev->channel); mtx_lock(&sc->hs_lock); } @@ -1151,9 +1405,13 @@ storvsc_action(struct cam_sim *sim, union ccb *ccb) bzero(reqp, sizeof(struct hv_storvsc_request)); reqp->softc = sc; - - ccb->ccb_h.status |= CAM_SIM_QUEUED; - create_storvsc_request(ccb, reqp); + + ccb->ccb_h.status |= CAM_SIM_QUEUED; + if ((res = create_storvsc_request(ccb, reqp)) != 0) { + ccb->ccb_h.status = CAM_REQ_INVALID; + xpt_done(ccb); + return; + } if (ccb->ccb_h.timeout != CAM_TIME_INFINITY) { callout_init(&reqp->callout, CALLOUT_MPSAFE); @@ -1193,6 +1451,212 @@ storvsc_action(struct cam_sim *sim, union ccb *ccb) } } +/** + * @brief destroy bounce buffer + * + * Return a Scatter/Gather list obtained from + * storvsc_create_bounce_buffer() to the free pool. + * + * @param sgl - the Scatter/Gather list to release; it is attached to the + * first in-use pool node, which is then moved to the free list. + * + */ +static void +storvsc_destroy_bounce_buffer(struct sglist *sgl) +{ + struct hv_sgl_node *sgl_node = NULL; + + sgl_node = LIST_FIRST(&g_hv_sgl_page_pool.in_use_sgl_list); + if (NULL == sgl_node) { + printf("storvsc error: not enough in use sgl\n"); + return; + } + LIST_REMOVE(sgl_node, link); /* unlink only once known non-NULL */ + sgl_node->sgl_data = sgl; + LIST_INSERT_HEAD(&g_hv_sgl_page_pool.free_sgl_list, sgl_node, link); +} + +/** + * @brief create bounce buffer + * + * This function is responsible for create a Scatter/Gather list, + * which hold several pages that can be aligned with page size. + * + * @param seg_count- SG-list segments count + * @param write - if WRITE_TYPE, set SG list page used size to 0, + * otherwise set used size to page size. 
+ * + * return NULL if create failed + */ +static struct sglist * +storvsc_create_bounce_buffer(uint16_t seg_count, int write) +{ + int i = 0; + struct sglist *bounce_sgl = NULL; + unsigned int buf_len = ((write == WRITE_TYPE) ? 0 : PAGE_SIZE); + struct hv_sgl_node *sgl_node = NULL; + + /* get struct sglist from free_sgl_list */ + sgl_node = LIST_FIRST(&g_hv_sgl_page_pool.free_sgl_list); + LIST_REMOVE(sgl_node, link); + if (NULL == sgl_node) { + printf("storvsc error: not enough free sgl\n"); + return NULL; + } + bounce_sgl = sgl_node->sgl_data; + LIST_INSERT_HEAD(&g_hv_sgl_page_pool.in_use_sgl_list, sgl_node, link); + + bounce_sgl->sg_maxseg = seg_count; + + if (write == WRITE_TYPE) + bounce_sgl->sg_nseg = 0; + else + bounce_sgl->sg_nseg = seg_count; + + for (i = 0; i < seg_count; i++) + bounce_sgl->sg_segs[i].ss_len = buf_len; + + return bounce_sgl; +} + +/** + * @brief copy data from SG list to bounce buffer + * + * This function is responsible for copy data from one SG list's segments + * to another SG list which used as bounce buffer. + * + * @param bounce_sgl - the destination SG list + * @param orig_sgl - the segment of the source SG list. + * @param orig_sgl_count - the count of segments. + * @param orig_sgl_count - indicate which segment need bounce buffer, + * set 1 means need. 
+ * + */ +static void +storvsc_copy_sgl_to_bounce_buf(struct sglist *bounce_sgl, + bus_dma_segment_t *orig_sgl, + unsigned int orig_sgl_count, + uint64_t seg_bits) +{ + unsigned int src_sgl_idx = 0; + + for (src_sgl_idx = 0; src_sgl_idx < orig_sgl_count; src_sgl_idx++) { + if (seg_bits & ((uint64_t)1 << src_sgl_idx)) { + memcpy((void*)bounce_sgl->sg_segs[src_sgl_idx].ss_paddr, + (void*)orig_sgl[src_sgl_idx].ds_addr, + orig_sgl[src_sgl_idx].ds_len); + + bounce_sgl->sg_segs[src_sgl_idx].ss_len = + orig_sgl[src_sgl_idx].ds_len; + } + } +} + +/** + * @brief copy data from SG list which used as bounce to another SG list + * + * This function is responsible for copy data from one SG list with bounce + * buffer to another SG list's segments. + * + * @param dest_sgl - the destination SG list's segments + * @param dest_sgl_count - the count of destination SG list's segment. + * @param src_sgl - the source SG list. + * @param seg_bits - indicate which segment used bounce buffer of src SG-list. + * + */ +void +storvsc_copy_from_bounce_buf_to_sgl(bus_dma_segment_t *dest_sgl, + unsigned int dest_sgl_count, + struct sglist* src_sgl, + uint64_t seg_bits) +{ + unsigned int sgl_idx = 0; + + for (sgl_idx = 0; sgl_idx < dest_sgl_count; sgl_idx++) { + if (seg_bits & ((uint64_t)1 << sgl_idx)) { + memcpy((void*)(dest_sgl[sgl_idx].ds_addr), + (void*)(src_sgl->sg_segs[sgl_idx].ss_paddr), + src_sgl->sg_segs[sgl_idx].ss_len); + } + } +} + +/** + * @brief check SG list with bounce buffer or not + * + * This function is responsible for check if need bounce buffer for SG list. + * + * @param sgl - the SG list's segments + * @param sg_count - the count of SG list's segment.
+ * @param bits - bitmap of the segments that need bounce buffer + * + * return -1 if SG list needless bounce buffer + */ +static int +storvsc_check_bounce_buffer_sgl(bus_dma_segment_t *sgl, + unsigned int sg_count, + uint64_t *bits) +{ + int i = 0; + int offset = 0; + uint64_t phys_addr = 0; + uint64_t tmp_bits = 0; + boolean_t found_hole = FALSE; + boolean_t pre_aligned = TRUE; + + *bits = 0; + + if (sg_count < 2){ + return -1; + } + + phys_addr = vtophys(sgl[0].ds_addr); + offset = phys_addr - trunc_page(phys_addr); + + if (offset != 0) { + pre_aligned = FALSE; + tmp_bits |= 1; + } + + for (i = 1; i < sg_count; i++) { + phys_addr = vtophys(sgl[i].ds_addr); + offset = phys_addr - trunc_page(phys_addr); + + if (offset == 0) { + if (FALSE == pre_aligned){ + /* + * This segment is aligned, if the previous + * one is not aligned, find a hole + */ + found_hole = TRUE; + } + pre_aligned = TRUE; + } else { + tmp_bits |= (uint64_t)1 << i; + if (!pre_aligned) { + if (phys_addr != vtophys(sgl[i-1].ds_addr + + sgl[i-1].ds_len)) { + /* + * Check whether connect to previous + * segment,if not, find the hole + */ + found_hole = TRUE; + } + } else { + found_hole = TRUE; + } + pre_aligned = FALSE; + } + } + + if (!found_hole) { + return (-1); + } else { + *bits = tmp_bits; + return 0; + } +} + /** * @brief Fill in a request structure based on a CAM control block * @@ -1203,7 +1667,7 @@ storvsc_action(struct cam_sim *sim, union ccb *ccb) * @param ccb pointer to a CAM contorl block * @param reqp pointer to a request structure */ -static void +static int create_storvsc_request(union ccb *ccb, struct hv_storvsc_request *reqp) { struct ccb_scsiio *csio = &ccb->csio; @@ -1211,6 +1675,7 @@ create_storvsc_request(union ccb *ccb, struct hv_storvsc_request *reqp) uint32_t bytes_to_copy = 0; uint32_t pfn_num = 0; uint32_t pfn; + uint64_t not_aligned_seg_bits = 0; /* refer to struct vmscsi_req for meanings of these two fields */ reqp->vstor_packet.u.vm_srb.port = @@ -1231,48 +1696,172 @@ 
create_storvsc_request(union ccb *ccb, struct hv_storvsc_request *reqp) } switch (ccb->ccb_h.flags & CAM_DIR_MASK) { - case CAM_DIR_OUT: - reqp->vstor_packet.u.vm_srb.data_in = WRITE_TYPE; - break; - case CAM_DIR_IN: - reqp->vstor_packet.u.vm_srb.data_in = READ_TYPE; - break; - case CAM_DIR_NONE: - reqp->vstor_packet.u.vm_srb.data_in = UNKNOWN_TYPE; - break; - default: - reqp->vstor_packet.u.vm_srb.data_in = UNKNOWN_TYPE; - break; + case CAM_DIR_OUT: + reqp->vstor_packet.u.vm_srb.data_in = WRITE_TYPE; + break; + case CAM_DIR_IN: + reqp->vstor_packet.u.vm_srb.data_in = READ_TYPE; + break; + case CAM_DIR_NONE: + reqp->vstor_packet.u.vm_srb.data_in = UNKNOWN_TYPE; + break; + default: + reqp->vstor_packet.u.vm_srb.data_in = UNKNOWN_TYPE; + break; } reqp->sense_data = &csio->sense_data; reqp->sense_info_len = csio->sense_len; reqp->ccb = ccb; - /* - KASSERT((ccb->ccb_h.flags & CAM_SCATTER_VALID) == 0, - ("ccb is scatter gather valid\n")); - */ - if (csio->dxfer_len != 0) { - reqp->data_buf.length = csio->dxfer_len; + + if (0 == csio->dxfer_len) { + return (0); + } + + reqp->data_buf.length = csio->dxfer_len; + + switch (ccb->ccb_h.flags & CAM_DATA_MASK) { + case CAM_DATA_VADDR: + { bytes_to_copy = csio->dxfer_len; phys_addr = vtophys(csio->data_ptr); - reqp->data_buf.offset = phys_addr - trunc_page(phys_addr); + reqp->data_buf.offset = phys_addr & PAGE_MASK; + + while (bytes_to_copy != 0) { + int bytes, page_offset; + phys_addr = + vtophys(&csio->data_ptr[reqp->data_buf.length - + bytes_to_copy]); + pfn = phys_addr >> PAGE_SHIFT; + reqp->data_buf.pfn_array[pfn_num] = pfn; + page_offset = phys_addr & PAGE_MASK; + + bytes = min(PAGE_SIZE - page_offset, bytes_to_copy); + + bytes_to_copy -= bytes; + pfn_num++; + } + break; } - while (bytes_to_copy != 0) { - int bytes, page_offset; - phys_addr = vtophys(&csio->data_ptr[reqp->data_buf.length - - bytes_to_copy]); - pfn = phys_addr >> PAGE_SHIFT; - reqp->data_buf.pfn_array[pfn_num] = pfn; - page_offset = phys_addr - 
trunc_page(phys_addr); + case CAM_DATA_SG: + { + int i = 0; + int offset = 0; + int ret; - bytes = min(PAGE_SIZE - page_offset, bytes_to_copy); + bus_dma_segment_t *storvsc_sglist = + (bus_dma_segment_t *)ccb->csio.data_ptr; + u_int16_t storvsc_sg_count = ccb->csio.sglist_cnt; - bytes_to_copy -= bytes; - pfn_num++; + printf("Storvsc: get SG I/O operation, %d\n", + reqp->vstor_packet.u.vm_srb.data_in); + + if (storvsc_sg_count > HV_MAX_MULTIPAGE_BUFFER_COUNT){ + printf("Storvsc: %d segments is too much, " + "only support %d segments\n", + storvsc_sg_count, HV_MAX_MULTIPAGE_BUFFER_COUNT); + return (EINVAL); + } + + /* + * We create our own bounce buffer function currently. Ideally + * we should use BUS_DMA(9) framework. But with current BUS_DMA + * code there is no callback API to check the page alignment of + * middle segments before busdma can decide if a bounce buffer + * is needed for particular segment. There is a callback, + * "bus_dma_filter_t *filter", but the parameters are not + * sufficient for storvsc driver. + * TODO: + * Add page alignment check in BUS_DMA(9) callback. Once + * this is complete, switch the following code to use + * BUS_DMA(9) for storvsc bounce buffer support.
+ */ + /* check if we need to create bounce buffer */ + ret = storvsc_check_bounce_buffer_sgl(storvsc_sglist, + storvsc_sg_count, &not_aligned_seg_bits); + if (ret != -1) { + reqp->bounce_sgl = + storvsc_create_bounce_buffer(storvsc_sg_count, + reqp->vstor_packet.u.vm_srb.data_in); + if (NULL == reqp->bounce_sgl) { + printf("Storvsc_error: " + "create bounce buffer failed.\n"); + return (ENOMEM); + } + + reqp->bounce_sgl_count = storvsc_sg_count; + reqp->not_aligned_seg_bits = not_aligned_seg_bits; + + /* + * if it is write, we need copy the original data + * to bounce buffer + */ + if (WRITE_TYPE == reqp->vstor_packet.u.vm_srb.data_in) { + storvsc_copy_sgl_to_bounce_buf( + reqp->bounce_sgl, + storvsc_sglist, + storvsc_sg_count, + reqp->not_aligned_seg_bits); + } + + /* transfer virtual address to physical frame number */ + if (reqp->not_aligned_seg_bits & 0x1){ + phys_addr = + vtophys(reqp->bounce_sgl->sg_segs[0].ss_paddr); + }else{ + phys_addr = + vtophys(storvsc_sglist[0].ds_addr); + } + reqp->data_buf.offset = phys_addr & PAGE_MASK; + + pfn = phys_addr >> PAGE_SHIFT; + reqp->data_buf.pfn_array[0] = pfn; + + for (i = 1; i < storvsc_sg_count; i++) { + if (reqp->not_aligned_seg_bits & (1 << i)) { + phys_addr = + vtophys(reqp->bounce_sgl->sg_segs[i].ss_paddr); + } else { + phys_addr = + vtophys(storvsc_sglist[i].ds_addr); + } + + pfn = phys_addr >> PAGE_SHIFT; + reqp->data_buf.pfn_array[i] = pfn; + } + } else { + phys_addr = vtophys(storvsc_sglist[0].ds_addr); + + reqp->data_buf.offset = phys_addr & PAGE_MASK; + + for (i = 0; i < storvsc_sg_count; i++) { + phys_addr = vtophys(storvsc_sglist[i].ds_addr); + pfn = phys_addr >> PAGE_SHIFT; + reqp->data_buf.pfn_array[i] = pfn; + } + + /* check the last segment cross boundary or not */ + offset = phys_addr & PAGE_MASK; + if (offset) { + phys_addr = + vtophys(storvsc_sglist[i-1].ds_addr + + PAGE_SIZE - offset); + pfn = phys_addr >> PAGE_SHIFT; + reqp->data_buf.pfn_array[i] = pfn; + } + + reqp->bounce_sgl_count = 0; + } +
break; } + default: + printf("Unknow flags: %d\n", ccb->ccb_h.flags); + return(EINVAL); + } + + return(0); } /** @@ -1291,7 +1880,29 @@ storvsc_io_done(struct hv_storvsc_request *reqp) struct ccb_scsiio *csio = &ccb->csio; struct storvsc_softc *sc = reqp->softc; struct vmscsi_req *vm_srb = &reqp->vstor_packet.u.vm_srb; - + bus_dma_segment_t *ori_sglist = NULL; + int ori_sg_count = 0; + + /* destroy bounce buffer if it is used */ + if (reqp->bounce_sgl_count) { + ori_sglist = (bus_dma_segment_t *)ccb->csio.data_ptr; + ori_sg_count = ccb->csio.sglist_cnt; + + /* + * If it is READ operation, we should copy back the data + * to original SG list. + */ + if (READ_TYPE == reqp->vstor_packet.u.vm_srb.data_in) { + storvsc_copy_from_bounce_buf_to_sgl(ori_sglist, + ori_sg_count, + reqp->bounce_sgl, + reqp->not_aligned_seg_bits); + } + + storvsc_destroy_bounce_buffer(reqp->bounce_sgl); + reqp->bounce_sgl_count = 0; + } + if (reqp->retries > 0) { mtx_lock(&sc->hs_lock); #if HVS_TIMEOUT_TEST @@ -1309,7 +1920,7 @@ storvsc_io_done(struct hv_storvsc_request *reqp) mtx_unlock(&sc->hs_lock); } - /* + /* * callout_drain() will wait for the timer handler to finish * if it is running. So we don't need any lock to synchronize * between this routine and the timer handler. diff --git a/sys/dev/hyperv/storvsc/hv_vstorage.h b/sys/dev/hyperv/storvsc/hv_vstorage.h index 2632676160ce..deb918303d60 100644 --- a/sys/dev/hyperv/storvsc/hv_vstorage.h +++ b/sys/dev/hyperv/storvsc/hv_vstorage.h @@ -53,7 +53,7 @@ * V1 RC > 2008/1/31 2.0 */ -#define VMSTOR_PROTOCOL_VERSION_CURRENT VMSTOR_PROTOCOL_VERSION(2, 0) +#define VMSTOR_PROTOCOL_VERSION_CURRENT VMSTOR_PROTOCOL_VERSION(5, 1) /** * Packet structure ops describing virtual storage requests. 
@@ -69,7 +69,10 @@ enum vstor_packet_ops { VSTOR_OPERATION_ENDINITIALIZATION = 8, VSTOR_OPERATION_QUERYPROTOCOLVERSION = 9, VSTOR_OPERATION_QUERYPROPERTIES = 10, - VSTOR_OPERATION_MAXIMUM = 10 + VSTOR_OPERATION_ENUMERATE_BUS = 11, + VSTOR_OPERATION_FCHBA_DATA = 12, + VSTOR_OPERATION_CREATE_MULTI_CHANNELS = 13, + VSTOR_OPERATION_MAXIMUM = 13 }; @@ -123,10 +126,12 @@ struct vmstor_chan_props { uint8_t path_id; uint8_t target_id; + uint16_t max_channel_cnt; + /** * Note: port number is only really known on the client side */ - uint32_t port; + uint16_t port; uint32_t flags; uint32_t max_transfer_bytes; @@ -193,6 +198,11 @@ struct vstor_packet { * Used during version negotiations. */ struct vmstor_proto_ver version; + + /** + * Number of multichannels to create + */ + uint16_t multi_channels_cnt; } u; } __packed; diff --git a/sys/dev/hyperv/utilities/hv_kvp.c b/sys/dev/hyperv/utilities/hv_kvp.c index 848d364a4b33..4598510bb8d5 100644 --- a/sys/dev/hyperv/utilities/hv_kvp.c +++ b/sys/dev/hyperv/utilities/hv_kvp.c @@ -55,6 +55,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include @@ -232,7 +233,7 @@ hv_kvp_negotiate_version(struct hv_vmbus_icmsg_hdr *icmsghdrp, */ if ((icframe_vercnt >= 2) && (negop->icversion_data[1].major == 3)) { icframe_vercnt = 3; - if (icmsg_vercnt >= 2) + if (icmsg_vercnt > 2) icmsg_vercnt = 4; else icmsg_vercnt = 3; @@ -734,8 +735,8 @@ hv_kvp_process_request(void *context) recvlen = 0; ret = hv_vmbus_channel_recv_packet(channel, kvp_buf, 2 * PAGE_SIZE, &recvlen, &requestid); - hv_kvp_log_info("%s: read: context %p, pending_cnt %ju ret =%d, recvlen=%d\n", - __func__, context, pending_cnt, ret, recvlen); + hv_kvp_log_info("%s: read: context %p, pending_cnt %llu ret =%d, recvlen=%d\n", + __func__, context, (unsigned long long)pending_cnt, ret, recvlen); } } @@ -813,9 +814,9 @@ static void hv_kvp_dev_destroy(void) { - if (daemon_task != NULL) { + if (daemon_task != NULL) { PROC_LOCK(daemon_task); - 
kern_psignal(daemon_task, SIGKILL); + kern_psignal(daemon_task, SIGKILL); PROC_UNLOCK(daemon_task); } diff --git a/sys/dev/hyperv/utilities/hv_util.c b/sys/dev/hyperv/utilities/hv_util.c index 3e545cff50c3..dc4b1e2537ba 100644 --- a/sys/dev/hyperv/utilities/hv_util.c +++ b/sys/dev/hyperv/utilities/hv_util.c @@ -408,6 +408,15 @@ hv_util_attach(device_t dev) } } + /* + * These services are not performance critical and do not need + * batched reading. Furthermore, some services such as KVP can + * only handle one message from the host at a time. + * Turn off batched reading for all util drivers before we open the + * channel. + */ + hv_set_channel_read_state(hv_dev->channel, FALSE); + ret = hv_vmbus_channel_open(hv_dev->channel, 4 * PAGE_SIZE, 4 * PAGE_SIZE, NULL, 0, service->callback, hv_dev->channel); diff --git a/sys/dev/hyperv/vmbus/hv_channel.c b/sys/dev/hyperv/vmbus/hv_channel.c index 103260a6adf7..94137fbeb225 100644 --- a/sys/dev/hyperv/vmbus/hv_channel.c +++ b/sys/dev/hyperv/vmbus/hv_channel.c @@ -75,7 +75,7 @@ vmbus_channel_set_event(hv_vmbus_channel *channel) (uint32_t *)&monitor_page-> trigger_group[channel->monitor_group].u.pending); } else { - hv_vmbus_set_event(channel->offer_msg.child_rel_id); + hv_vmbus_set_event(channel); } } @@ -99,6 +99,18 @@ hv_vmbus_channel_open( hv_vmbus_channel_open_channel* open_msg; hv_vmbus_channel_msg_info* open_info; + mtx_lock(&new_channel->sc_lock); + if (new_channel->state == HV_CHANNEL_OPEN_STATE) { + new_channel->state = HV_CHANNEL_OPENING_STATE; + } else { + mtx_unlock(&new_channel->sc_lock); + if(bootverbose) + printf("VMBUS: Trying to open channel <%p> which in " + "%d state.\n", new_channel, new_channel->state); + return (EINVAL); + } + mtx_unlock(&new_channel->sc_lock); + new_channel->on_channel_callback = pfn_on_channel_callback; new_channel->channel_callback_context = context; @@ -162,7 +174,7 @@ hv_vmbus_channel_open( new_channel->ring_buffer_gpadl_handle; open_msg->downstream_ring_buffer_page_offset = 
send_ring_buffer_size >> PAGE_SHIFT; - open_msg->server_context_area_gpadl_handle = 0; + open_msg->target_vcpu = new_channel->target_vcpu; if (user_data_len) memcpy(open_msg->user_data, user_data, user_data_len); @@ -182,10 +194,14 @@ hv_vmbus_channel_open( ret = sema_timedwait(&open_info->wait_sema, 500); /* KYS 5 seconds */ - if (ret) + if (ret) { + if(bootverbose) + printf("VMBUS: channel <%p> open timeout.\n", new_channel); goto cleanup; + } if (open_info->response.open_result.status == 0) { + new_channel->state = HV_CHANNEL_OPENED_STATE; if(bootverbose) printf("VMBUS: channel <%p> open success.\n", new_channel); } else { @@ -497,16 +513,20 @@ hv_vmbus_channel_teardown_gpdal( return (ret); } -/** - * @brief Close the specified channel - */ -void -hv_vmbus_channel_close(hv_vmbus_channel *channel) +static void +hv_vmbus_channel_close_internal(hv_vmbus_channel *channel) { int ret = 0; hv_vmbus_channel_close_channel* msg; hv_vmbus_channel_msg_info* info; + channel->state = HV_CHANNEL_OPEN_STATE; + channel->sc_creation_callback = NULL; + + /* + * Grab the lock to prevent race condition when a packet received + * and unloading driver is in the process. 
+ */ mtx_lock(&channel->inbound_lock); channel->on_channel_callback = NULL; mtx_unlock(&channel->inbound_lock); @@ -545,23 +565,37 @@ hv_vmbus_channel_close(hv_vmbus_channel *channel) M_DEVBUF); free(info, M_DEVBUF); +} - /* - * If we are closing the channel during an error path in - * opening the channel, don't free the channel - * since the caller will free the channel - */ - if (channel->state == HV_CHANNEL_OPEN_STATE) { - mtx_lock_spin(&hv_vmbus_g_connection.channel_lock); - TAILQ_REMOVE( - &hv_vmbus_g_connection.channel_anchor, - channel, - list_entry); - mtx_unlock_spin(&hv_vmbus_g_connection.channel_lock); +/** + * @brief Close the specified channel + */ +void +hv_vmbus_channel_close(hv_vmbus_channel *channel) +{ + hv_vmbus_channel* sub_channel; - hv_vmbus_free_vmbus_channel(channel); + if (channel->primary_channel != NULL) { + /* + * We only close multi-channels when the primary is + * closed. + */ + return; } + /* + * Close all multi-channels first. + */ + TAILQ_FOREACH(sub_channel, &channel->sc_list_anchor, + sc_list_entry) { + if (sub_channel->state != HV_CHANNEL_OPENED_STATE) + continue; + hv_vmbus_channel_close_internal(sub_channel); + } + /* + * Then close the primary channel. 
+ */ + hv_vmbus_channel_close_internal(channel); } /** @@ -581,6 +615,7 @@ hv_vmbus_channel_send_packet( uint32_t packet_len; uint64_t aligned_data; uint32_t packet_len_aligned; + boolean_t need_sig; hv_vmbus_sg_buffer_list buffer_list[3]; packet_len = sizeof(hv_vm_packet_descriptor) + buffer_len; @@ -604,12 +639,11 @@ hv_vmbus_channel_send_packet( buffer_list[2].data = &aligned_data; buffer_list[2].length = packet_len_aligned - packet_len; - ret = hv_ring_buffer_write(&channel->outbound, buffer_list, 3); + ret = hv_ring_buffer_write(&channel->outbound, buffer_list, 3, + &need_sig); /* TODO: We should determine if this is optional */ - if (ret == 0 - && !hv_vmbus_get_ring_buffer_interrupt_mask( - &channel->outbound)) { + if (ret == 0 && need_sig) { vmbus_channel_set_event(channel); } @@ -632,6 +666,7 @@ hv_vmbus_channel_send_packet_pagebuffer( int ret = 0; int i = 0; + boolean_t need_sig; uint32_t packet_len; uint32_t packetLen_aligned; hv_vmbus_sg_buffer_list buffer_list[3]; @@ -675,11 +710,11 @@ hv_vmbus_channel_send_packet_pagebuffer( buffer_list[2].data = &alignedData; buffer_list[2].length = packetLen_aligned - packet_len; - ret = hv_ring_buffer_write(&channel->outbound, buffer_list, 3); + ret = hv_ring_buffer_write(&channel->outbound, buffer_list, 3, + &need_sig); /* TODO: We should determine if this is optional */ - if (ret == 0 && - !hv_vmbus_get_ring_buffer_interrupt_mask(&channel->outbound)) { + if (ret == 0 && need_sig) { vmbus_channel_set_event(channel); } @@ -700,6 +735,7 @@ hv_vmbus_channel_send_packet_multipagebuffer( int ret = 0; uint32_t desc_size; + boolean_t need_sig; uint32_t packet_len; uint32_t packet_len_aligned; uint32_t pfn_count; @@ -750,11 +786,11 @@ hv_vmbus_channel_send_packet_multipagebuffer( buffer_list[2].data = &aligned_data; buffer_list[2].length = packet_len_aligned - packet_len; - ret = hv_ring_buffer_write(&channel->outbound, buffer_list, 3); + ret = hv_ring_buffer_write(&channel->outbound, buffer_list, 3, + &need_sig); /* TODO: 
We should determine if this is optional */ - if (ret == 0 && - !hv_vmbus_get_ring_buffer_interrupt_mask(&channel->outbound)) { + if (ret == 0 && need_sig) { vmbus_channel_set_event(channel); } diff --git a/sys/dev/hyperv/vmbus/hv_channel_mgmt.c b/sys/dev/hyperv/vmbus/hv_channel_mgmt.c index 011e305709e6..783f6bcf2f62 100644 --- a/sys/dev/hyperv/vmbus/hv_channel_mgmt.c +++ b/sys/dev/hyperv/vmbus/hv_channel_mgmt.c @@ -50,6 +50,8 @@ static void vmbus_channel_on_gpadl_torndown(hv_vmbus_channel_msg_header* hdr); static void vmbus_channel_on_offers_delivered(hv_vmbus_channel_msg_header* hdr); static void vmbus_channel_on_version_response(hv_vmbus_channel_msg_header* hdr); static void vmbus_channel_process_offer(void *context); +struct hv_vmbus_channel* + vmbus_select_outgoing_channel(struct hv_vmbus_channel *promary); /** * Channel message dispatch table @@ -233,6 +235,9 @@ hv_vmbus_allocate_channel(void) return (NULL); mtx_init(&channel->inbound_lock, "channel inbound", NULL, MTX_DEF); + mtx_init(&channel->sc_lock, "vmbus multi channel", NULL, MTX_DEF); + + TAILQ_INIT(&channel->sc_list_anchor); channel->control_work_queue = hv_work_queue_create("control"); @@ -262,6 +267,7 @@ ReleaseVmbusChannel(void *context) void hv_vmbus_free_vmbus_channel(hv_vmbus_channel* channel) { + mtx_destroy(&channel->sc_lock); mtx_destroy(&channel->inbound_lock); /* * We have to release the channel's workqueue/thread in @@ -279,10 +285,10 @@ hv_vmbus_free_vmbus_channel(hv_vmbus_channel* channel) static void vmbus_channel_process_offer(void *context) { - int ret; hv_vmbus_channel* new_channel; boolean_t f_new; hv_vmbus_channel* channel; + int ret; new_channel = (hv_vmbus_channel*) context; f_new = TRUE; @@ -291,38 +297,76 @@ vmbus_channel_process_offer(void *context) /* * Make sure this is a new offer */ - mtx_lock_spin(&hv_vmbus_g_connection.channel_lock); + mtx_lock(&hv_vmbus_g_connection.channel_lock); TAILQ_FOREACH(channel, &hv_vmbus_g_connection.channel_anchor, list_entry) { - if 
(!memcmp( - &channel->offer_msg.offer.interface_type, - &new_channel->offer_msg.offer.interface_type, - sizeof(hv_guid)) - && !memcmp( - &channel->offer_msg.offer.interface_instance, + if (memcmp(&channel->offer_msg.offer.interface_type, + &new_channel->offer_msg.offer.interface_type, + sizeof(hv_guid)) == 0 && + memcmp(&channel->offer_msg.offer.interface_instance, &new_channel->offer_msg.offer.interface_instance, - sizeof(hv_guid))) { - f_new = FALSE; - break; - } + sizeof(hv_guid)) == 0) { + f_new = FALSE; + break; + } } if (f_new) { - /* Insert at tail */ - TAILQ_INSERT_TAIL( - &hv_vmbus_g_connection.channel_anchor, - new_channel, - list_entry); + /* Insert at tail */ + TAILQ_INSERT_TAIL( + &hv_vmbus_g_connection.channel_anchor, + new_channel, + list_entry); } - mtx_unlock_spin(&hv_vmbus_g_connection.channel_lock); + mtx_unlock(&hv_vmbus_g_connection.channel_lock); + + /*XXX add new channel to percpu_list */ if (!f_new) { + /* + * Check if this is a sub channel. + */ + if (new_channel->offer_msg.offer.sub_channel_index != 0) { + /* + * It is a sub channel offer, process it. + */ + new_channel->primary_channel = channel; + mtx_lock(&channel->sc_lock); + TAILQ_INSERT_TAIL( + &channel->sc_list_anchor, + new_channel, + sc_list_entry); + mtx_unlock(&channel->sc_lock); + + /* Insert new channel into channel_anchor. 
*/ + printf("Storvsc get multi-channel offer, rel=%u.\n", + new_channel->offer_msg.child_rel_id); + mtx_lock(&hv_vmbus_g_connection.channel_lock); + TAILQ_INSERT_TAIL(&hv_vmbus_g_connection.channel_anchor, + new_channel, list_entry); + mtx_unlock(&hv_vmbus_g_connection.channel_lock); + + if(bootverbose) + printf("VMBUS: new multi-channel offer <%p>.\n", + new_channel); + + /*XXX add it to percpu_list */ + + new_channel->state = HV_CHANNEL_OPEN_STATE; + if (channel->sc_creation_callback != NULL) { + channel->sc_creation_callback(new_channel); + } + return; + } + hv_vmbus_free_vmbus_channel(new_channel); return; } + new_channel->state = HV_CHANNEL_OPEN_STATE; + /* * Start the process of binding this offer to the driver * (We need to set the device field before calling @@ -332,13 +376,6 @@ vmbus_channel_process_offer(void *context) new_channel->offer_msg.offer.interface_type, new_channel->offer_msg.offer.interface_instance, new_channel); - /* - * TODO - the HV_CHANNEL_OPEN_STATE flag should not be set below - * but in the "open" channel request. The ret != 0 logic below - * doesn't take into account that a channel - * may have been opened successfully - */ - /* * Add the new device to the bus. 
This will kick off device-driver * binding which eventually invokes the device driver's AddDevice() @@ -346,24 +383,82 @@ vmbus_channel_process_offer(void *context) */ ret = hv_vmbus_child_device_register(new_channel->device); if (ret != 0) { - mtx_lock_spin(&hv_vmbus_g_connection.channel_lock); - TAILQ_REMOVE( - &hv_vmbus_g_connection.channel_anchor, - new_channel, - list_entry); - mtx_unlock_spin(&hv_vmbus_g_connection.channel_lock); - hv_vmbus_free_vmbus_channel(new_channel); - } else { - /* - * This state is used to indicate a successful open - * so that when we do close the channel normally, - * we can clean up properly - */ - new_channel->state = HV_CHANNEL_OPEN_STATE; - + mtx_lock(&hv_vmbus_g_connection.channel_lock); + TAILQ_REMOVE( + &hv_vmbus_g_connection.channel_anchor, + new_channel, + list_entry); + mtx_unlock(&hv_vmbus_g_connection.channel_lock); + hv_vmbus_free_vmbus_channel(new_channel); } } +/** + * Array of device guids that are performance critical. We try to distribute + * the interrupt load for these devices across all online cpus. + */ +static const hv_guid high_perf_devices[] = { + {HV_NIC_GUID, }, + {HV_IDE_GUID, }, + {HV_SCSI_GUID, }, +}; + +enum { + PERF_CHN_NIC = 0, + PERF_CHN_IDE, + PERF_CHN_SCSI, + MAX_PERF_CHN, +}; + +/* + * We use this static number to distribute the channel interrupt load. + */ +static uint32_t next_vcpu; + +/** + * Starting with Win8, we can statically distribute the incoming + * channel interrupt load by binding a channel to VCPU. We + * implement here a simple round robin scheme for distributing + * the interrupt load. + * We will bind channels that are not performance critical to cpu 0 and + * performance critical channels (IDE, SCSI and Network) will be uniformly + * distributed across all available CPUs. 
+ */ +static void +vmbus_channel_select_cpu(hv_vmbus_channel *channel, hv_guid *guid) +{ + uint32_t current_cpu; + int i; + boolean_t is_perf_channel = FALSE; + + for (i = PERF_CHN_NIC; i < MAX_PERF_CHN; i++) { + if (memcmp(guid->data, high_perf_devices[i].data, + sizeof(hv_guid)) == 0) { + is_perf_channel = TRUE; + break; + } + } + + if ((hv_vmbus_protocal_version == HV_VMBUS_VERSION_WS2008) || + (hv_vmbus_protocal_version == HV_VMBUS_VERSION_WIN7) || + (!is_perf_channel)) { + /* Host's view of guest cpu */ + channel->target_vcpu = 0; + /* Guest's own view of cpu */ + channel->target_cpu = 0; + return; + } + /* mp_ncpus should have the number cpus currently online */ + current_cpu = (++next_vcpu % mp_ncpus); + channel->target_cpu = current_cpu; + channel->target_vcpu = + hv_vmbus_g_context.hv_vcpu_index[current_cpu]; + if (bootverbose) + printf("VMBUS: Total online cpus %d, assign perf channel %d " + "to vcpu %d, cpu %d\n", mp_ncpus, i, channel->target_vcpu, + current_cpu); +} + /** * @brief Handler for channel offers from Hyper-V/Azure * @@ -391,6 +486,38 @@ vmbus_channel_on_offer(hv_vmbus_channel_msg_header* hdr) if (new_channel == NULL) return; + /* + * By default we setup state to enable batched + * reading. A specific service can choose to + * disable this prior to opening the channel. 
+ */ + new_channel->batched_reading = TRUE; + + new_channel->signal_event_param = + (hv_vmbus_input_signal_event *) + (HV_ALIGN_UP((unsigned long) + &new_channel->signal_event_buffer, + HV_HYPERCALL_PARAM_ALIGN)); + + new_channel->signal_event_param->connection_id.as_uint32_t = 0; + new_channel->signal_event_param->connection_id.u.id = + HV_VMBUS_EVENT_CONNECTION_ID; + new_channel->signal_event_param->flag_number = 0; + new_channel->signal_event_param->rsvd_z = 0; + + if (hv_vmbus_protocal_version != HV_VMBUS_VERSION_WS2008) { + new_channel->is_dedicated_interrupt = + (offer->is_dedicated_interrupt != 0); + new_channel->signal_event_param->connection_id.u.id = + offer->connection_id; + } + + /* + * Bind the channel to a chosen cpu. + */ + vmbus_channel_select_cpu(new_channel, + &offer->offer.interface_type); + memcpy(&new_channel->offer_msg, offer, sizeof(hv_vmbus_channel_offer_channel)); new_channel->monitor_group = (uint8_t) offer->monitor_id / 32; @@ -666,7 +793,7 @@ hv_vmbus_release_unattached_channels(void) { hv_vmbus_channel *channel; - mtx_lock_spin(&hv_vmbus_g_connection.channel_lock); + mtx_lock(&hv_vmbus_g_connection.channel_lock); while (!TAILQ_EMPTY(&hv_vmbus_g_connection.channel_anchor)) { channel = TAILQ_FIRST(&hv_vmbus_g_connection.channel_anchor); @@ -676,5 +803,61 @@ hv_vmbus_release_unattached_channels(void) hv_vmbus_child_device_unregister(channel->device); hv_vmbus_free_vmbus_channel(channel); } - mtx_unlock_spin(&hv_vmbus_g_connection.channel_lock); + mtx_unlock(&hv_vmbus_g_connection.channel_lock); +} + +/** + * @brief Select the best outgoing channel + * + * The channel whose vcpu binding is closest to the currect vcpu will + * be selected. 
+ * If no multi-channel, always select primary channel + * + * @param primary - primary channel + */ +struct hv_vmbus_channel * +vmbus_select_outgoing_channel(struct hv_vmbus_channel *primary) +{ + hv_vmbus_channel *new_channel = NULL; + hv_vmbus_channel *outgoing_channel = primary; + int old_cpu_distance = 0; + int new_cpu_distance = 0; + int cur_vcpu = 0; + int smp_pro_id = PCPU_GET(cpuid); + + if (TAILQ_EMPTY(&primary->sc_list_anchor)) { + return outgoing_channel; + } + + if (smp_pro_id >= MAXCPU) { + return outgoing_channel; + } + + cur_vcpu = hv_vmbus_g_context.hv_vcpu_index[smp_pro_id]; + + TAILQ_FOREACH(new_channel, &primary->sc_list_anchor, sc_list_entry) { + if (new_channel->state != HV_CHANNEL_OPENED_STATE){ + continue; + } + + if (new_channel->target_vcpu == cur_vcpu){ + return new_channel; + } + + old_cpu_distance = ((outgoing_channel->target_vcpu > cur_vcpu) ? + (outgoing_channel->target_vcpu - cur_vcpu) : + (cur_vcpu - outgoing_channel->target_vcpu)); + + new_cpu_distance = ((new_channel->target_vcpu > cur_vcpu) ? 
+ (new_channel->target_vcpu - cur_vcpu) : + (cur_vcpu - new_channel->target_vcpu)); + + if (old_cpu_distance < new_cpu_distance) { + continue; + } + + outgoing_channel = new_channel; + } + + return(outgoing_channel); } diff --git a/sys/dev/hyperv/vmbus/hv_connection.c b/sys/dev/hyperv/vmbus/hv_connection.c index c8e0b48ac65c..0300828961ba 100644 --- a/sys/dev/hyperv/vmbus/hv_connection.c +++ b/sys/dev/hyperv/vmbus/hv_connection.c @@ -45,14 +45,113 @@ hv_vmbus_connection hv_vmbus_g_connection = { .connect_state = HV_DISCONNECTED, .next_gpadl_handle = 0xE1E10, }; +uint32_t hv_vmbus_protocal_version = HV_VMBUS_VERSION_WS2008; + +static uint32_t +hv_vmbus_get_next_version(uint32_t current_ver) +{ + switch (current_ver) { + case (HV_VMBUS_VERSION_WIN7): + return(HV_VMBUS_VERSION_WS2008); + + case (HV_VMBUS_VERSION_WIN8): + return(HV_VMBUS_VERSION_WIN7); + + case (HV_VMBUS_VERSION_WIN8_1): + return(HV_VMBUS_VERSION_WIN8); + + case (HV_VMBUS_VERSION_WS2008): + default: + return(HV_VMBUS_VERSION_INVALID); + } +} + +/** + * Negotiate the highest supported hypervisor version. 
+ */ +static int +hv_vmbus_negotiate_version(hv_vmbus_channel_msg_info *msg_info, + uint32_t version) +{ + int ret = 0; + hv_vmbus_channel_initiate_contact *msg; + + sema_init(&msg_info->wait_sema, 0, "Msg Info Sema"); + msg = (hv_vmbus_channel_initiate_contact*) msg_info->msg; + + msg->header.message_type = HV_CHANNEL_MESSAGE_INITIATED_CONTACT; + msg->vmbus_version_requested = version; + + msg->interrupt_page = hv_get_phys_addr( + hv_vmbus_g_connection.interrupt_page); + + msg->monitor_page_1 = hv_get_phys_addr( + hv_vmbus_g_connection.monitor_pages); + + msg->monitor_page_2 = + hv_get_phys_addr( + ((uint8_t *) hv_vmbus_g_connection.monitor_pages + + PAGE_SIZE)); + + /** + * Add to list before we send the request since we may receive the + * response before returning from this routine + */ + mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock); + + TAILQ_INSERT_TAIL( + &hv_vmbus_g_connection.channel_msg_anchor, + msg_info, + msg_list_entry); + + mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock); + + ret = hv_vmbus_post_message( + msg, + sizeof(hv_vmbus_channel_initiate_contact)); + + if (ret != 0) { + mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock); + TAILQ_REMOVE( + &hv_vmbus_g_connection.channel_msg_anchor, + msg_info, + msg_list_entry); + mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock); + return (ret); + } + + /** + * Wait for the connection response + */ + ret = sema_timedwait(&msg_info->wait_sema, 500); /* KYS 5 seconds */ + + mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock); + TAILQ_REMOVE( + &hv_vmbus_g_connection.channel_msg_anchor, + msg_info, + msg_list_entry); + mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock); + + /** + * Check if successful + */ + if (msg_info->response.version_response.version_supported) { + hv_vmbus_g_connection.connect_state = HV_CONNECTED; + } else { + ret = ECONNREFUSED; + } + + return (ret); +} + /** * Send a connect request on the partition service connection */ int hv_vmbus_connect(void) 
{ int ret = 0; + uint32_t version; hv_vmbus_channel_msg_info* msg_info = NULL; - hv_vmbus_channel_initiate_contact* msg; /** * Make sure we are not connecting or connected @@ -74,7 +173,7 @@ hv_vmbus_connect(void) { TAILQ_INIT(&hv_vmbus_g_connection.channel_anchor); mtx_init(&hv_vmbus_g_connection.channel_lock, "vmbus channel", - NULL, MTX_SPIN); + NULL, MTX_DEF); /** * Setup the vmbus event connection for channel interrupt abstraction @@ -130,71 +229,30 @@ hv_vmbus_connect(void) { goto cleanup; } - sema_init(&msg_info->wait_sema, 0, "Msg Info Sema"); - msg = (hv_vmbus_channel_initiate_contact*) msg_info->msg; - - msg->header.message_type = HV_CHANNEL_MESSAGE_INITIATED_CONTACT; - msg->vmbus_version_requested = HV_VMBUS_REVISION_NUMBER; - - msg->interrupt_page = hv_get_phys_addr( - hv_vmbus_g_connection.interrupt_page); - - msg->monitor_page_1 = hv_get_phys_addr( - hv_vmbus_g_connection.monitor_pages); - - msg->monitor_page_2 = - hv_get_phys_addr( - ((uint8_t *) hv_vmbus_g_connection.monitor_pages - + PAGE_SIZE)); - - /** - * Add to list before we send the request since we may receive the - * response before returning from this routine + /* + * Find the highest vmbus version number we can support. */ - mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock); + version = HV_VMBUS_VERSION_CURRENT; - TAILQ_INSERT_TAIL( - &hv_vmbus_g_connection.channel_msg_anchor, - msg_info, - msg_list_entry); + do { + ret = hv_vmbus_negotiate_version(msg_info, version); + if (ret == EWOULDBLOCK) { + /* + * We timed out. 
+ */ + goto cleanup; + } - mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock); + if (hv_vmbus_g_connection.connect_state == HV_CONNECTED) + break; - ret = hv_vmbus_post_message( - msg, - sizeof(hv_vmbus_channel_initiate_contact)); + version = hv_vmbus_get_next_version(version); + } while (version != HV_VMBUS_VERSION_INVALID); - if (ret != 0) { - mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock); - TAILQ_REMOVE( - &hv_vmbus_g_connection.channel_msg_anchor, - msg_info, - msg_list_entry); - mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock); - goto cleanup; - } - - /** - * Wait for the connection response - */ - ret = sema_timedwait(&msg_info->wait_sema, 500); /* KYS 5 seconds */ - - mtx_lock_spin(&hv_vmbus_g_connection.channel_msg_lock); - TAILQ_REMOVE( - &hv_vmbus_g_connection.channel_msg_anchor, - msg_info, - msg_list_entry); - mtx_unlock_spin(&hv_vmbus_g_connection.channel_msg_lock); - - /** - * Check if successful - */ - if (msg_info->response.version_response.version_supported) { - hv_vmbus_g_connection.connect_state = HV_CONNECTED; - } else { - ret = ECONNREFUSED; - goto cleanup; - } + hv_vmbus_protocal_version = version; + if (bootverbose) + printf("VMBUS: Portocal Version: %d.%d\n", + version >> 16, version & 0xFFFF); sema_destroy(&msg_info->wait_sema); free(msg_info, M_DEVBUF); @@ -286,7 +344,7 @@ hv_vmbus_get_channel_from_rel_id(uint32_t rel_id) { * and channels are accessed without the need to take this lock or search * the list. 
*/ - mtx_lock_spin(&hv_vmbus_g_connection.channel_lock); + mtx_lock(&hv_vmbus_g_connection.channel_lock); TAILQ_FOREACH(channel, &hv_vmbus_g_connection.channel_anchor, list_entry) { @@ -295,7 +353,7 @@ hv_vmbus_get_channel_from_rel_id(uint32_t rel_id) { break; } } - mtx_unlock_spin(&hv_vmbus_g_connection.channel_lock); + mtx_unlock(&hv_vmbus_g_connection.channel_lock); return (foundChannel); } @@ -306,7 +364,10 @@ hv_vmbus_get_channel_from_rel_id(uint32_t rel_id) { static void VmbusProcessChannelEvent(uint32_t relid) { + void* arg; + uint32_t bytes_to_read; hv_vmbus_channel* channel; + boolean_t is_batched_reading; /** * Find the channel based on this relid and invokes @@ -327,31 +388,98 @@ VmbusProcessChannelEvent(uint32_t relid) * callback to NULL. This closes the window. */ - mtx_lock(&channel->inbound_lock); + /* + * Disable the lock due to newly added WITNESS check in r277723. + * Will seek other way to avoid race condition. + * -- whu + */ + // mtx_lock(&channel->inbound_lock); if (channel->on_channel_callback != NULL) { - channel->on_channel_callback(channel->channel_callback_context); + arg = channel->channel_callback_context; + is_batched_reading = channel->batched_reading; + /* + * Optimize host to guest signaling by ensuring: + * 1. While reading the channel, we disable interrupts from + * host. + * 2. Ensure that we process all posted messages from the host + * before returning from this callback. + * 3. Once we return, enable signaling from the host. Once this + * state is set we check to see if additional packets are + * available to read. In this case we repeat the process. 
+ */ + do { + if (is_batched_reading) + hv_ring_buffer_read_begin(&channel->inbound); + + channel->on_channel_callback(arg); + + if (is_batched_reading) + bytes_to_read = + hv_ring_buffer_read_end(&channel->inbound); + else + bytes_to_read = 0; + } while (is_batched_reading && (bytes_to_read != 0)); } - mtx_unlock(&channel->inbound_lock); + // mtx_unlock(&channel->inbound_lock); } +#ifdef HV_DEBUG_INTR +extern uint32_t hv_intr_count; +extern uint32_t hv_vmbus_swintr_event_cpu[MAXCPU]; +extern uint32_t hv_vmbus_intr_cpu[MAXCPU]; +#endif + /** * Handler for events */ void hv_vmbus_on_events(void *arg) { - int dword; int bit; + int cpu; + int dword; + void *page_addr; + uint32_t* recv_interrupt_page = NULL; int rel_id; - int maxdword = HV_MAX_NUM_CHANNELS_SUPPORTED >> 5; + int maxdword; + hv_vmbus_synic_event_flags *event; /* int maxdword = PAGE_SIZE >> 3; */ - /* - * receive size is 1/2 page and divide that by 4 bytes - */ + cpu = (int)(long)arg; + KASSERT(cpu <= mp_maxid, ("VMBUS: hv_vmbus_on_events: " + "cpu out of range!")); - uint32_t* recv_interrupt_page = - hv_vmbus_g_connection.recv_interrupt_page; +#ifdef HV_DEBUG_INTR + int i; + hv_vmbus_swintr_event_cpu[cpu]++; + if (hv_intr_count % 10000 == 0) { + printf("VMBUS: Total interrupt %d\n", hv_intr_count); + for (i = 0; i < mp_ncpus; i++) + printf("VMBUS: hw cpu[%d]: %d, event sw intr cpu[%d]: %d\n", + i, hv_vmbus_intr_cpu[i], i, hv_vmbus_swintr_event_cpu[i]); + } +#endif + + if ((hv_vmbus_protocal_version == HV_VMBUS_VERSION_WS2008) || + (hv_vmbus_protocal_version == HV_VMBUS_VERSION_WIN7)) { + maxdword = HV_MAX_NUM_CHANNELS_SUPPORTED >> 5; + /* + * receive size is 1/2 page and divide that by 4 bytes + */ + recv_interrupt_page = + hv_vmbus_g_connection.recv_interrupt_page; + } else { + /* + * On Host with Win8 or above, the event page can be + * checked directly to get the id of the channel + * that has the pending interrupt. 
+ */ + maxdword = HV_EVENT_FLAGS_DWORD_COUNT; + page_addr = hv_vmbus_g_context.syn_ic_event_page[cpu]; + event = (hv_vmbus_synic_event_flags *) + page_addr + HV_VMBUS_MESSAGE_SINT; + recv_interrupt_page = event->flags32; + } /* * Check events @@ -416,16 +544,16 @@ int hv_vmbus_post_message(void *buffer, size_t bufferLen) { * Send an event notification to the parent */ int -hv_vmbus_set_event(uint32_t child_rel_id) { +hv_vmbus_set_event(hv_vmbus_channel *channel) { int ret = 0; + uint32_t child_rel_id = channel->offer_msg.child_rel_id; /* Each uint32_t represents 32 channels */ synch_set_bit(child_rel_id & 31, (((uint32_t *)hv_vmbus_g_connection.send_interrupt_page + (child_rel_id >> 5)))); - ret = hv_vmbus_signal_event(); + ret = hv_vmbus_signal_event(channel->signal_event_param); return (ret); } - diff --git a/sys/dev/hyperv/vmbus/hv_hv.c b/sys/dev/hyperv/vmbus/hv_hv.c index 80a1f42940c6..84e2a5e46fcf 100644 --- a/sys/dev/hyperv/vmbus/hv_hv.c +++ b/sys/dev/hyperv/vmbus/hv_hv.c @@ -67,8 +67,6 @@ static inline void do_cpuid_inline(unsigned int op, unsigned int *eax, hv_vmbus_context hv_vmbus_g_context = { .syn_ic_initialized = FALSE, .hypercall_page = NULL, - .signal_event_param = NULL, - .signal_event_buffer = NULL, }; static struct timecounter hv_timecounter = { @@ -256,28 +254,6 @@ hv_vmbus_init(void) hv_vmbus_g_context.hypercall_page = virt_addr; - /* - * Setup the global signal event param for the signal event hypercall - */ - hv_vmbus_g_context.signal_event_buffer = - malloc(sizeof(hv_vmbus_input_signal_event_buffer), M_DEVBUF, - M_ZERO | M_NOWAIT); - KASSERT(hv_vmbus_g_context.signal_event_buffer != NULL, - ("Error VMBUS: Failed to allocate signal_event_buffer\n")); - if (hv_vmbus_g_context.signal_event_buffer == NULL) - goto cleanup; - - hv_vmbus_g_context.signal_event_param = - (hv_vmbus_input_signal_event*) - (HV_ALIGN_UP((unsigned long) - hv_vmbus_g_context.signal_event_buffer, - HV_HYPERCALL_PARAM_ALIGN)); - 
hv_vmbus_g_context.signal_event_param->connection_id.as_uint32_t = 0; - hv_vmbus_g_context.signal_event_param->connection_id.u.id = - HV_VMBUS_EVENT_CONNECTION_ID; - hv_vmbus_g_context.signal_event_param->flag_number = 0; - hv_vmbus_g_context.signal_event_param->rsvd_z = 0; - tc_init(&hv_timecounter); /* register virtual timecount */ return (0); @@ -303,12 +279,6 @@ hv_vmbus_cleanup(void) { hv_vmbus_x64_msr_hypercall_contents hypercall_msr; - if (hv_vmbus_g_context.signal_event_buffer != NULL) { - free(hv_vmbus_g_context.signal_event_buffer, M_DEVBUF); - hv_vmbus_g_context.signal_event_buffer = NULL; - hv_vmbus_g_context.signal_event_param = NULL; - } - if (hv_vmbus_g_context.guest_id == HV_FREEBSD_GUEST_ID) { if (hv_vmbus_g_context.hypercall_page != NULL) { hypercall_msr.as_uint64_t = 0; @@ -370,13 +340,13 @@ hv_vmbus_post_msg_via_msg_ipc( * event IPC. (This involves a hypercall.) */ hv_vmbus_status -hv_vmbus_signal_event() +hv_vmbus_signal_event(void *con_id) { hv_vmbus_status status; status = hv_vmbus_do_hypercall( HV_CALL_SIGNAL_EVENT, - hv_vmbus_g_context.signal_event_param, + con_id, 0) & 0xFFFF; return (status); @@ -390,6 +360,7 @@ hv_vmbus_synic_init(void *arg) { int cpu; + uint64_t hv_vcpu_index; hv_vmbus_synic_simp simp; hv_vmbus_synic_siefp siefp; hv_vmbus_synic_scontrol sctrl; @@ -402,24 +373,15 @@ hv_vmbus_synic_init(void *arg) if (hv_vmbus_g_context.hypercall_page == NULL) return; - /* - * KYS: Looks like we can only initialize on cpu0; don't we support - * SMP guests? 
- * - * TODO: Need to add SMP support for FreeBSD V9 - */ - - if (cpu != 0) - return; - /* * TODO: Check the version */ version = rdmsr(HV_X64_MSR_SVERSION); - - hv_vmbus_g_context.syn_ic_msg_page[cpu] = setup_args->page_buffers[0]; - hv_vmbus_g_context.syn_ic_event_page[cpu] = setup_args->page_buffers[1]; + hv_vmbus_g_context.syn_ic_msg_page[cpu] = + setup_args->page_buffers[2 * cpu]; + hv_vmbus_g_context.syn_ic_event_page[cpu] = + setup_args->page_buffers[2 * cpu + 1]; /* * Setup the Synic's message page @@ -443,9 +405,10 @@ hv_vmbus_synic_init(void *arg) wrmsr(HV_X64_MSR_SIEFP, siefp.as_uint64_t); /*HV_SHARED_SINT_IDT_VECTOR + 0x20; */ + shared_sint.as_uint64_t = 0; shared_sint.u.vector = setup_args->vector; shared_sint.u.masked = FALSE; - shared_sint.u.auto_eoi = FALSE; + shared_sint.u.auto_eoi = TRUE; wrmsr(HV_X64_MSR_SINT0 + HV_VMBUS_MESSAGE_SINT, shared_sint.as_uint64_t); @@ -458,6 +421,13 @@ hv_vmbus_synic_init(void *arg) hv_vmbus_g_context.syn_ic_initialized = TRUE; + /* + * Set up the cpuid mapping from Hyper-V to FreeBSD. + * The array is indexed using FreeBSD cpuid. + */ + hv_vcpu_index = rdmsr(HV_X64_MSR_VP_INDEX); + hv_vmbus_g_context.hv_vcpu_index[cpu] = (uint32_t)hv_vcpu_index; + return; } @@ -469,14 +439,10 @@ void hv_vmbus_synic_cleanup(void *arg) hv_vmbus_synic_sint shared_sint; hv_vmbus_synic_simp simp; hv_vmbus_synic_siefp siefp; - int cpu = PCPU_GET(cpuid); if (!hv_vmbus_g_context.syn_ic_initialized) return; - if (cpu != 0) - return; /* TODO: XXXKYS: SMP? 
*/ - shared_sint.as_uint64_t = rdmsr( HV_X64_MSR_SINT0 + HV_VMBUS_MESSAGE_SINT); diff --git a/sys/dev/hyperv/vmbus/hv_ring_buffer.c b/sys/dev/hyperv/vmbus/hv_ring_buffer.c index f7c1965c8334..5e4f52a54ecb 100644 --- a/sys/dev/hyperv/vmbus/hv_ring_buffer.c +++ b/sys/dev/hyperv/vmbus/hv_ring_buffer.c @@ -144,6 +144,69 @@ get_ring_buffer_indices(hv_vmbus_ring_buffer_info* ring_info) return (uint64_t) ring_info->ring_buffer->write_index << 32; } +void +hv_ring_buffer_read_begin( + hv_vmbus_ring_buffer_info* ring_info) +{ + ring_info->ring_buffer->interrupt_mask = 1; + mb(); +} + +uint32_t +hv_ring_buffer_read_end( + hv_vmbus_ring_buffer_info* ring_info) +{ + uint32_t read, write; + + ring_info->ring_buffer->interrupt_mask = 0; + mb(); + + /* + * Now check to see if the ring buffer is still empty. + * If it is not, we raced and we need to process new + * incoming messages. + */ + get_ring_buffer_avail_bytes(ring_info, &read, &write); + + return (read); +} + +/* + * When we write to the ring buffer, check if the host needs to + * be signaled. Here is the details of this protocol: + * + * 1. The host guarantees that while it is draining the + * ring buffer, it will set the interrupt_mask to + * indicate it does not need to be interrupted when + * new data is placed. + * + * 2. The host guarantees that it will completely drain + * the ring buffer before exiting the read loop. Further, + * once the ring buffer is empty, it will clear the + * interrupt_mask and re-check to see if new data has + * arrived. + */ +static boolean_t +hv_ring_buffer_needsig_on_write( + uint32_t old_write_location, + hv_vmbus_ring_buffer_info* rbi) +{ + mb(); + if (rbi->ring_buffer->interrupt_mask) + return (FALSE); + + /* Read memory barrier */ + rmb(); + /* + * This is the only case we need to signal when the + * ring transitions from being empty to non-empty. 
+ */ + if (old_write_location == rbi->ring_buffer->read_index) + return (TRUE); + + return (FALSE); +} + static uint32_t copy_to_ring_buffer( hv_vmbus_ring_buffer_info* ring_info, uint32_t start_write_offset, @@ -204,11 +267,13 @@ int hv_ring_buffer_write( hv_vmbus_ring_buffer_info* out_ring_info, hv_vmbus_sg_buffer_list sg_buffers[], - uint32_t sg_buffer_count) + uint32_t sg_buffer_count, + boolean_t *need_sig) { int i = 0; uint32_t byte_avail_to_write; uint32_t byte_avail_to_read; + uint32_t old_write_location; uint32_t total_bytes_to_write = 0; volatile uint32_t next_write_location; @@ -242,6 +307,8 @@ hv_ring_buffer_write( */ next_write_location = get_next_write_location(out_ring_info); + old_write_location = next_write_location; + for (i = 0; i < sg_buffer_count; i++) { next_write_location = copy_to_ring_buffer(out_ring_info, next_write_location, (char *) sg_buffers[i].data, @@ -258,9 +325,9 @@ hv_ring_buffer_write( (char *) &prev_indices, sizeof(uint64_t)); /* - * Make sure we flush all writes before updating the writeIndex + * Full memory barrier before updating the write index.
*/ - wmb(); + mb(); /* * Now, update the write location @@ -269,6 +336,9 @@ hv_ring_buffer_write( mtx_unlock_spin(&out_ring_info->ring_lock); + *need_sig = hv_ring_buffer_needsig_on_write(old_write_location, + out_ring_info); + return (0); } diff --git a/sys/dev/hyperv/vmbus/hv_vmbus_drv_freebsd.c b/sys/dev/hyperv/vmbus/hv_vmbus_drv_freebsd.c index ca28fd5892cd..f9432c8ee521 100644 --- a/sys/dev/hyperv/vmbus/hv_vmbus_drv_freebsd.c +++ b/sys/dev/hyperv/vmbus/hv_vmbus_drv_freebsd.c @@ -53,22 +53,17 @@ __FBSDID("$FreeBSD$"); #include #include +#include +#include #include +#include #include "hv_vmbus_priv.h" #define VMBUS_IRQ 0x5 -static struct intr_event *hv_msg_intr_event; -static struct intr_event *hv_event_intr_event; -static void *msg_swintr; -static void *event_swintr; static device_t vmbus_devp; -static void *vmbus_cookiep; -static int vmbus_rid; -struct resource *intr_res; -static int vmbus_irq = VMBUS_IRQ; static int vmbus_inited; static hv_setup_args setup_args; /* only CPU 0 supported at this time */ @@ -77,14 +72,17 @@ static hv_setup_args setup_args; /* only CPU 0 supported at this time */ * the hypervisor. */ static void -vmbus_msg_swintr(void *dummy) +vmbus_msg_swintr(void *arg) { int cpu; void* page_addr; hv_vmbus_message* msg; hv_vmbus_message* copied; - cpu = PCPU_GET(cpuid); + cpu = (int)(long)arg; + KASSERT(cpu <= mp_maxid, ("VMBUS: vmbus_msg_swintr: " + "cpu out of range!")); + page_addr = hv_vmbus_g_context.syn_ic_msg_page[cpu]; msg = (hv_vmbus_message*) page_addr + HV_VMBUS_MESSAGE_SINT; @@ -130,17 +128,8 @@ vmbus_msg_swintr(void *dummy) * * The purpose of this routine is to determine the type of VMBUS protocol * message to process - an event or a channel message. - * As this is an interrupt filter routine, the function runs in a very - * restricted envinronment. From the manpage for bus_setup_intr(9) - * - * In this restricted environment, care must be taken to account for all - * races. A careful analysis of races should be done as well. 
It is gener- - * ally cheaper to take an extra interrupt, for example, than to protect - * variables with spinlocks. Read, modify, write cycles of hardware regis- - * ters need to be carefully analyzed if other threads are accessing the - * same registers. */ -static int +static inline int hv_vmbus_isr(void *unused) { int cpu; @@ -149,8 +138,6 @@ hv_vmbus_isr(void *unused) void* page_addr; cpu = PCPU_GET(cpuid); - /* (Temporary limit) */ - KASSERT(cpu == 0, ("hv_vmbus_isr: Interrupt on CPU other than zero")); /* * The Windows team has advised that we check for events @@ -162,9 +149,21 @@ hv_vmbus_isr(void *unused) event = (hv_vmbus_synic_event_flags*) page_addr + HV_VMBUS_MESSAGE_SINT; - /* Since we are a child, we only need to check bit 0 */ - if (synch_test_and_clear_bit(0, &event->flags32[0])) { - swi_sched(event_swintr, 0); + if ((hv_vmbus_protocal_version == HV_VMBUS_VERSION_WS2008) || + (hv_vmbus_protocal_version == HV_VMBUS_VERSION_WIN7)) { + /* Since we are a child, we only need to check bit 0 */ + if (synch_test_and_clear_bit(0, &event->flags32[0])) { + swi_sched(hv_vmbus_g_context.event_swintr[cpu], 0); + } + } else { + /* + * On host with Win8 or above, we can directly look at + * the event page. If bit n is set, we have an interrupt + * on the channel with id n. + * Directly schedule the event software interrupt on + * current cpu. 
+ */ + swi_sched(hv_vmbus_g_context.event_swintr[cpu], 0); } /* Check if there are actual msgs to be process */ @@ -172,12 +171,47 @@ hv_vmbus_isr(void *unused) msg = (hv_vmbus_message*) page_addr + HV_VMBUS_MESSAGE_SINT; if (msg->header.message_type != HV_MESSAGE_TYPE_NONE) { - swi_sched(msg_swintr, 0); + swi_sched(hv_vmbus_g_context.msg_swintr[cpu], 0); } return FILTER_HANDLED; } +#ifdef HV_DEBUG_INTR +uint32_t hv_intr_count = 0; +#endif +uint32_t hv_vmbus_swintr_event_cpu[MAXCPU]; +uint32_t hv_vmbus_intr_cpu[MAXCPU]; + +void +hv_vector_handler(struct trapframe *trap_frame) +{ +#ifdef HV_DEBUG_INTR + int cpu; +#endif + + /* + * Disable preemption. + */ + critical_enter(); + +#ifdef HV_DEBUG_INTR + /* + * Do a little interrupt counting. + */ + cpu = PCPU_GET(cpuid); + hv_vmbus_intr_cpu[cpu]++; + hv_intr_count++; +#endif + + hv_vmbus_isr(NULL); + + /* + * Enable preemption. + */ + critical_exit(); +} + static int vmbus_read_ivar( device_t dev, @@ -316,6 +350,81 @@ vmbus_probe(device_t dev) { return (BUS_PROBE_NOWILDCARD); } +#ifdef HYPERV +extern inthand_t IDTVEC(rsvd), IDTVEC(hv_vmbus_callback); + +/** + * @brief Find a free IDT slot and setup the interrupt handler. + */ +static int +vmbus_vector_alloc(void) +{ + int vector; + uintptr_t func; + struct gate_descriptor *ip; + + /* + * Search backwards from the highest IDT vector available for use + * as vmbus channel callback vector. We install 'hv_vmbus_callback' + * handler at that vector and use it to interrupt vcpus. + */ + vector = APIC_SPURIOUS_INT; + while (--vector >= APIC_IPI_INTS) { + ip = &idt[vector]; + func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset); + if (func == (uintptr_t)&IDTVEC(rsvd)) { +#ifdef __i386__ + setidt(vector , IDTVEC(hv_vmbus_callback), SDT_SYS386IGT, + SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); +#else + setidt(vector , IDTVEC(hv_vmbus_callback), SDT_SYSIGT, + SEL_KPL, 0); +#endif + + return (vector); + } + } + return (0); +} + +/** + * @brief Restore the IDT slot to rsvd.
+ */ +static void +vmbus_vector_free(int vector) +{ + uintptr_t func; + struct gate_descriptor *ip; + + if (vector == 0) + return; + + KASSERT(vector >= APIC_IPI_INTS && vector < APIC_SPURIOUS_INT, + ("invalid vector %d", vector)); + + ip = &idt[vector]; + func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset); + KASSERT(func == (uintptr_t)&IDTVEC(hv_vmbus_callback), + ("invalid vector %d", vector)); + + setidt(vector, IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0); +} + +#else /* HYPERV */ + +static int +vmbus_vector_alloc(void) +{ + return(0); +} + +static void +vmbus_vector_free(int vector) +{ +} + +#endif /* HYPERV */ + /** * @brief Main vmbus driver initialization routine. * @@ -331,22 +440,7 @@ vmbus_probe(device_t dev) { static int vmbus_bus_init(void) { - struct ioapic_intsrc { - struct intsrc io_intsrc; - u_int io_irq; - u_int io_intpin:8; - u_int io_vector:8; - u_int io_cpu:8; - u_int io_activehi:1; - u_int io_edgetrigger:1; - u_int io_masked:1; - int io_bus:4; - uint32_t io_lowreg; - }; - int i, ret; - unsigned int vector = 0; - struct intsrc *isrc; - struct ioapic_intsrc *intpin; + int i, j, n, ret; if (vmbus_inited) return (0); @@ -361,80 +455,100 @@ vmbus_bus_init(void) return (ret); } - ret = swi_add(&hv_msg_intr_event, "hv_msg", vmbus_msg_swintr, - NULL, SWI_CLOCK, 0, &msg_swintr); - - if (ret) - goto cleanup; - /* - * Message SW interrupt handler checks a per-CPU page and - * thus the thread needs to be bound to CPU-0 - which is where - * all interrupts are processed. + * Find a free IDT slot for vmbus callback. 
*/ - ret = intr_event_bind(hv_msg_intr_event, 0); + hv_vmbus_g_context.hv_cb_vector = vmbus_vector_alloc(); - if (ret) - goto cleanup1; - - ret = swi_add(&hv_event_intr_event, "hv_event", hv_vmbus_on_events, - NULL, SWI_CLOCK, 0, &event_swintr); - - if (ret) - goto cleanup1; - - intr_res = bus_alloc_resource(vmbus_devp, - SYS_RES_IRQ, &vmbus_rid, vmbus_irq, vmbus_irq, 1, RF_ACTIVE); - - if (intr_res == NULL) { - ret = ENOMEM; /* XXXKYS: Need a better errno */ - goto cleanup2; + if (hv_vmbus_g_context.hv_cb_vector == 0) { + if(bootverbose) + printf("Error VMBUS: Cannot find free IDT slot for " + "vmbus callback!\n"); + goto cleanup; } - /* - * Setup interrupt filter handler - */ - ret = bus_setup_intr(vmbus_devp, intr_res, - INTR_TYPE_NET | INTR_MPSAFE, hv_vmbus_isr, NULL, - NULL, &vmbus_cookiep); - - if (ret != 0) - goto cleanup3; - - ret = bus_bind_intr(vmbus_devp, intr_res, 0); - if (ret != 0) - goto cleanup4; - - isrc = intr_lookup_source(vmbus_irq); - if ((isrc == NULL) || (isrc->is_event == NULL)) { - ret = EINVAL; - goto cleanup4; - } - - /* vector = isrc->is_event->ie_vector; */ - intpin = (struct ioapic_intsrc *)isrc; - vector = intpin->io_vector; - if(bootverbose) - printf("VMBUS: irq 0x%x vector 0x%x\n", vmbus_irq, vector); + printf("VMBUS: vmbus callback vector %d\n", + hv_vmbus_g_context.hv_cb_vector); - /** - * Notify the hypervisor of our irq. + /* + * Notify the hypervisor of our vector. */ - setup_args.vector = vector; - for(i = 0; i < 2; i++) { - setup_args.page_buffers[i] = + setup_args.vector = hv_vmbus_g_context.hv_cb_vector; + + CPU_FOREACH(j) { + hv_vmbus_intr_cpu[j] = 0; + hv_vmbus_swintr_event_cpu[j] = 0; + hv_vmbus_g_context.hv_event_intr_event[j] = NULL; + hv_vmbus_g_context.hv_msg_intr_event[j] = NULL; + hv_vmbus_g_context.event_swintr[j] = NULL; + hv_vmbus_g_context.msg_swintr[j] = NULL; + + for (i = 0; i < 2; i++) + setup_args.page_buffers[2 * j + i] = NULL; + } + + /* + * Per cpu setup. 
+ */ + CPU_FOREACH(j) { + /* + * Setup software interrupt thread and handler for msg handling. + */ + ret = swi_add(&hv_vmbus_g_context.hv_msg_intr_event[j], + "hv_msg", vmbus_msg_swintr, (void *)(long)j, SWI_CLOCK, 0, + &hv_vmbus_g_context.msg_swintr[j]); + if (ret) { + if(bootverbose) + printf("VMBUS: failed to setup msg swi for " + "cpu %d\n", j); + goto cleanup1; + } + + /* + * Bind the swi thread to the cpu. + */ + ret = intr_event_bind(hv_vmbus_g_context.hv_msg_intr_event[j], + j); + if (ret) { + if(bootverbose) + printf("VMBUS: failed to bind msg swi thread " + "to cpu %d\n", j); + goto cleanup1; + } + + /* + * Setup software interrupt thread and handler for + * event handling. + */ + ret = swi_add(&hv_vmbus_g_context.hv_event_intr_event[j], + "hv_event", hv_vmbus_on_events, (void *)(long)j, + SWI_CLOCK, 0, &hv_vmbus_g_context.event_swintr[j]); + if (ret) { + if(bootverbose) + printf("VMBUS: failed to setup event swi for " + "cpu %d\n", j); + goto cleanup1; + } + + /* + * Prepare the per cpu msg and event pages to be called on each cpu. 
+ */ + for(i = 0; i < 2; i++) { + setup_args.page_buffers[2 * j + i] = malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT | M_ZERO); - if (setup_args.page_buffers[i] == NULL) { - KASSERT(setup_args.page_buffers[i] != NULL, + if (setup_args.page_buffers[2 * j + i] == NULL) { + KASSERT(setup_args.page_buffers[2 * j + i] != NULL, ("Error VMBUS: malloc failed!")); - if (i > 0) - free(setup_args.page_buffers[0], M_DEVBUF); - goto cleanup4; + goto cleanup1; + } } } - /* only CPU #0 supported at this time */ + if (bootverbose) + printf("VMBUS: Calling smp_rendezvous, smp_started = %d\n", + smp_started); + smp_rendezvous(NULL, hv_vmbus_synic_init, NULL, &setup_args); /* @@ -443,26 +557,32 @@ vmbus_bus_init(void) ret = hv_vmbus_connect(); if (ret != 0) - goto cleanup4; + goto cleanup1; hv_vmbus_request_channel_offers(); return (ret); - cleanup4: + cleanup1: + /* + * Free pages alloc'ed + */ + for (n = 0; n < 2 * MAXCPU; n++) + if (setup_args.page_buffers[n] != NULL) + free(setup_args.page_buffers[n], M_DEVBUF); /* - * remove swi, bus and intr resource + * remove swi and vmbus callback vector; */ - bus_teardown_intr(vmbus_devp, intr_res, vmbus_cookiep); + CPU_FOREACH(j) { + if (hv_vmbus_g_context.msg_swintr[j] != NULL) + swi_remove(hv_vmbus_g_context.msg_swintr[j]); + if (hv_vmbus_g_context.event_swintr[j] != NULL) + swi_remove(hv_vmbus_g_context.event_swintr[j]); + hv_vmbus_g_context.hv_msg_intr_event[j] = NULL; + hv_vmbus_g_context.hv_event_intr_event[j] = NULL; + } - cleanup3: - bus_release_resource(vmbus_devp, SYS_RES_IRQ, vmbus_rid, intr_res); - - cleanup2: - swi_remove(event_swintr); - - cleanup1: - swi_remove(msg_swintr); + vmbus_vector_free(hv_vmbus_g_context.hv_cb_vector); cleanup: hv_vmbus_cleanup(); @@ -515,20 +635,24 @@ vmbus_bus_exit(void) smp_rendezvous(NULL, hv_vmbus_synic_cleanup, NULL, NULL); - for(i = 0; i < 2; i++) { + for(i = 0; i < 2 * MAXCPU; i++) { if (setup_args.page_buffers[i] != 0) free(setup_args.page_buffers[i], M_DEVBUF); } hv_vmbus_cleanup(); - /* remove swi, 
bus and intr resource */ - bus_teardown_intr(vmbus_devp, intr_res, vmbus_cookiep); + /* remove swi */ + CPU_FOREACH(i) { + if (hv_vmbus_g_context.msg_swintr[i] != NULL) + swi_remove(hv_vmbus_g_context.msg_swintr[i]); + if (hv_vmbus_g_context.event_swintr[i] != NULL) + swi_remove(hv_vmbus_g_context.event_swintr[i]); + hv_vmbus_g_context.hv_msg_intr_event[i] = NULL; + hv_vmbus_g_context.hv_event_intr_event[i] = NULL; + } - bus_release_resource(vmbus_devp, SYS_RES_IRQ, vmbus_rid, intr_res); - - swi_remove(msg_swintr); - swi_remove(event_swintr); + vmbus_vector_free(hv_vmbus_g_context.hv_cb_vector); return; } @@ -603,6 +727,6 @@ devclass_t vmbus_devclass; DRIVER_MODULE(vmbus, nexus, vmbus_driver, vmbus_devclass, vmbus_modevent, 0); MODULE_VERSION(vmbus,1); -/* TODO: We want to be earlier than SI_SUB_VFS */ -SYSINIT(vmb_init, SI_SUB_VFS, SI_ORDER_MIDDLE, vmbus_init, NULL); +/* We want to be started after SMP is initialized */ +SYSINIT(vmb_init, SI_SUB_SMP + 1, SI_ORDER_FIRST, vmbus_init, NULL); diff --git a/sys/dev/hyperv/vmbus/hv_vmbus_priv.h b/sys/dev/hyperv/vmbus/hv_vmbus_priv.h index 6bc875df1e1c..faa6decd9ac2 100644 --- a/sys/dev/hyperv/vmbus/hv_vmbus_priv.h +++ b/sys/dev/hyperv/vmbus/hv_vmbus_priv.h @@ -181,49 +181,30 @@ enum { #define HV_HYPERCALL_PARAM_ALIGN sizeof(uint64_t) -/* - * Connection identifier type - */ -typedef union { - uint32_t as_uint32_t; - struct { - uint32_t id:24; - uint32_t reserved:8; - } u; - -} __packed hv_vmbus_connection_id; - -/* - * Definition of the hv_vmbus_signal_event hypercall input structure - */ -typedef struct { - hv_vmbus_connection_id connection_id; - uint16_t flag_number; - uint16_t rsvd_z; -} __packed hv_vmbus_input_signal_event; - -typedef struct { - uint64_t align8; - hv_vmbus_input_signal_event event; -} __packed hv_vmbus_input_signal_event_buffer; - typedef struct { uint64_t guest_id; void* hypercall_page; hv_bool_uint8_t syn_ic_initialized; - /* - * This is used as an input param to HV_CALL_SIGNAL_EVENT hypercall. 
- * The input param is immutable in our usage and - * must be dynamic mem (vs stack or global). - */ - hv_vmbus_input_signal_event_buffer *signal_event_buffer; - /* - * 8-bytes aligned of the buffer above - */ - hv_vmbus_input_signal_event *signal_event_param; hv_vmbus_handle syn_ic_msg_page[MAXCPU]; hv_vmbus_handle syn_ic_event_page[MAXCPU]; + /* + * For FreeBSD cpuid to Hyper-V vcpuid mapping. + */ + uint32_t hv_vcpu_index[MAXCPU]; + /* + * Each cpu has its own software interrupt handler for channel + * event and msg handling. + */ + struct intr_event *hv_event_intr_event[MAXCPU]; + struct intr_event *hv_msg_intr_event[MAXCPU]; + void *event_swintr[MAXCPU]; + void *msg_swintr[MAXCPU]; + /* + * The host uses this vector to interrupt the guest for vmbus channel + * event and msg. + */ + unsigned int hv_cb_vector; } hv_vmbus_context; /* @@ -368,7 +349,8 @@ typedef struct { TAILQ_HEAD(, hv_vmbus_channel_msg_info) channel_msg_anchor; struct mtx channel_msg_lock; /** - * List of channels + * List of primary channels. Sub channels will be linked + * under their primary channel.
*/ TAILQ_HEAD(, hv_vmbus_channel) channel_anchor; struct mtx channel_lock; @@ -560,6 +542,8 @@ typedef union { uint32_t flags32[HV_EVENT_FLAGS_DWORD_COUNT]; } hv_vmbus_synic_event_flags; +/* MSR used to provide vcpu index */ +#define HV_X64_MSR_VP_INDEX (0x40000002) /* * Define synthetic interrupt controller model specific registers @@ -618,7 +602,8 @@ void hv_ring_buffer_cleanup( int hv_ring_buffer_write( hv_vmbus_ring_buffer_info *ring_info, hv_vmbus_sg_buffer_list sg_buffers[], - uint32_t sg_buff_count); + uint32_t sg_buff_count, + boolean_t *need_sig); int hv_ring_buffer_peek( hv_vmbus_ring_buffer_info *ring_info, @@ -638,6 +623,12 @@ void hv_vmbus_dump_ring_info( hv_vmbus_ring_buffer_info *ring_info, char *prefix); +void hv_ring_buffer_read_begin( + hv_vmbus_ring_buffer_info *ring_info); + +uint32_t hv_ring_buffer_read_end( + hv_vmbus_ring_buffer_info *ring_info); + hv_vmbus_channel* hv_vmbus_allocate_channel(void); void hv_vmbus_free_vmbus_channel(hv_vmbus_channel *channel); void hv_vmbus_on_channel_message(void *context); @@ -652,7 +643,7 @@ uint16_t hv_vmbus_post_msg_via_msg_ipc( void *payload, size_t payload_size); -uint16_t hv_vmbus_signal_event(void); +uint16_t hv_vmbus_signal_event(void *con_id); void hv_vmbus_synic_init(void *irq_arg); void hv_vmbus_synic_cleanup(void *arg); int hv_vmbus_query_hypervisor_presence(void); @@ -674,7 +665,7 @@ hv_vmbus_channel* hv_vmbus_get_channel_from_rel_id(uint32_t rel_id); int hv_vmbus_connect(void); int hv_vmbus_disconnect(void); int hv_vmbus_post_message(void *buffer, size_t buf_size); -int hv_vmbus_set_event(uint32_t child_rel_id); +int hv_vmbus_set_event(hv_vmbus_channel *channel); void hv_vmbus_on_events(void *); @@ -718,7 +709,7 @@ static inline uint64_t hv_generate_guest_id( typedef struct { unsigned int vector; - void *page_buffers[2]; + void *page_buffers[2 * MAXCPU]; } hv_setup_args; #endif /* __HYPERV_PRIV_H__ */ diff --git a/sys/i386/conf/GENERIC b/sys/i386/conf/GENERIC index 086844a87ea1..b1740f32efc2 
100644 --- a/sys/i386/conf/GENERIC +++ b/sys/i386/conf/GENERIC @@ -358,7 +358,9 @@ device virtio_blk # VirtIO Block device device virtio_scsi # VirtIO SCSI device device virtio_balloon # VirtIO Memory Balloon device -# HyperV drivers +# HyperV drivers and enhancement support +# NOTE: HYPERV depends on hyperv. They must be added or removed together. +options HYPERV # Hyper-V kernel infrastructure device hyperv # HyperV drivers # Xen HVM Guest Optimizations diff --git a/sys/i386/i386/apic_vector.s b/sys/i386/i386/apic_vector.s index 316bc7a189c2..3ad10b916d82 100644 --- a/sys/i386/i386/apic_vector.s +++ b/sys/i386/i386/apic_vector.s @@ -181,6 +181,25 @@ IDTVEC(xen_intr_upcall) jmp doreti #endif +#ifdef HYPERV +/* + * This is the Hyper-V vmbus channel direct callback interrupt. + * Only used when it is running on Hyper-V. + */ + .text + SUPERALIGN_TEXT +IDTVEC(hv_vmbus_callback) + PUSH_FRAME + SET_KERNEL_SREGS + cld + FAKE_MCOUNT(TF_EIP(%esp)) + pushl %esp + call hv_vector_handler + add $4, %esp + MEXITCOUNT + jmp doreti +#endif + #ifdef SMP /* * Global address space TLB shootdown. diff --git a/sys/x86/include/apicvar.h b/sys/x86/include/apicvar.h index 818a83188566..0bd9fe5ece62 100644 --- a/sys/x86/include/apicvar.h +++ b/sys/x86/include/apicvar.h @@ -454,6 +454,7 @@ void lapic_handle_error(void); void lapic_handle_intr(int vector, struct trapframe *frame); void lapic_handle_timer(struct trapframe *frame); void xen_intr_handle_upcall(struct trapframe *frame); +void hv_vector_handler(struct trapframe *frame); extern int x2apic_mode; extern int lapic_eoi_suppression;