diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index f6d20f6cca12c..88f1730f621b5 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -459,6 +459,14 @@ HYPERVISOR_hvm_op(int op, void *arg) return _hypercall2(unsigned long, hvm_op, op, arg); } +static inline int +HYPERVISOR_domctl( + struct xen_domctl *arg) +{ + return _hypercall1(int, domctl, arg); +} + + static inline int HYPERVISOR_tmem_op( struct tmem_op *op) diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h index 39171b3646bba..a7e82f27a778e 100644 --- a/arch/x86/include/asm/xen/hypervisor.h +++ b/arch/x86/include/asm/xen/hypervisor.h @@ -48,7 +48,11 @@ extern bool xen_hvm_need_lapic(void); static inline bool xen_x2apic_para_available(void) { +#ifdef CONFIG_XEN_PVHVM return xen_hvm_need_lapic(); +#else + return false; +#endif } #else static inline bool xen_x2apic_para_available(void) diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h index 62ca03ef5c657..496eceebb9aac 100644 --- a/arch/x86/include/asm/xen/interface.h +++ b/arch/x86/include/asm/xen/interface.h @@ -86,6 +86,7 @@ typedef long xen_long_t; /* Guest handles for primitive C types. */ __DEFINE_GUEST_HANDLE(uchar, unsigned char); __DEFINE_GUEST_HANDLE(uint, unsigned int); +__DEFINE_GUEST_HANDLE(ulong, unsigned long); DEFINE_GUEST_HANDLE(char); DEFINE_GUEST_HANDLE(int); DEFINE_GUEST_HANDLE(void); diff --git a/arch/x86/xen/efi.c b/arch/x86/xen/efi.c index 30bb2e80cfe75..a18703be9ead9 100644 --- a/arch/x86/xen/efi.c +++ b/arch/x86/xen/efi.c @@ -54,38 +54,6 @@ static efi_system_table_t efi_systab_xen __initdata = { .tables = EFI_INVALID_TABLE_ADDR /* Initialized later. */ }; -static const struct efi efi_xen __initconst = { - .systab = NULL, /* Initialized later. */ - .runtime_version = 0, /* Initialized later. */ - .mps = EFI_INVALID_TABLE_ADDR, - .acpi = EFI_INVALID_TABLE_ADDR, - .acpi20 = EFI_INVALID_TABLE_ADDR, - .smbios = EFI_INVALID_TABLE_ADDR, - .smbios3 = EFI_INVALID_TABLE_ADDR, - .sal_systab = EFI_INVALID_TABLE_ADDR, - .boot_info = EFI_INVALID_TABLE_ADDR, - .hcdp = EFI_INVALID_TABLE_ADDR, - .uga = EFI_INVALID_TABLE_ADDR, - .uv_systab = EFI_INVALID_TABLE_ADDR, - .fw_vendor = EFI_INVALID_TABLE_ADDR, - .runtime = EFI_INVALID_TABLE_ADDR, - .config_table = EFI_INVALID_TABLE_ADDR, - .get_time = xen_efi_get_time, - .set_time = xen_efi_set_time, - .get_wakeup_time = xen_efi_get_wakeup_time, - .set_wakeup_time = xen_efi_set_wakeup_time, - .get_variable = xen_efi_get_variable, - .get_next_variable = xen_efi_get_next_variable, - .set_variable = xen_efi_set_variable, - .query_variable_info = xen_efi_query_variable_info, - .update_capsule = xen_efi_update_capsule, - .query_capsule_caps = xen_efi_query_capsule_caps, - .get_next_high_mono_count = xen_efi_get_next_high_mono_count, - .reset_system = xen_efi_reset_system, - .set_virtual_address_map = NULL, /* Not used under Xen. */ - .flags = 0 /* Initialized later. */ -}; - static efi_system_table_t __init *xen_efi_probe(void) { struct xen_platform_op op = { @@ -102,7 +70,18 @@ static efi_system_table_t __init *xen_efi_probe(void) /* Here we know that Xen runs on EFI platform. 
*/ - efi = efi_xen; + efi.get_time = xen_efi_get_time; + efi.set_time = xen_efi_set_time; + efi.get_wakeup_time = xen_efi_get_wakeup_time; + efi.set_wakeup_time = xen_efi_set_wakeup_time; + efi.get_variable = xen_efi_get_variable; + efi.get_next_variable = xen_efi_get_next_variable; + efi.set_variable = xen_efi_set_variable; + efi.query_variable_info = xen_efi_query_variable_info; + efi.update_capsule = xen_efi_update_capsule; + efi.query_capsule_caps = xen_efi_query_capsule_caps; + efi.get_next_high_mono_count = xen_efi_get_next_high_mono_count; + efi.reset_system = xen_efi_reset_system; efi_systab_xen.tables = info->cfg.addr; efi_systab_xen.nr_tables = info->cfg.nent; diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 3be06f3caf3c1..ffc6cc6b7c97e 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -2,6 +2,7 @@ #include #include #include +#include #include "multicalls.h" #include "mmu.h" @@ -201,3 +202,84 @@ int xen_unmap_domain_gfn_range(struct vm_area_struct *vma, return -EINVAL; } EXPORT_SYMBOL_GPL(xen_unmap_domain_gfn_range); + +/* Note: here 'mfn' is actually gfn!!! */ +struct vm_struct * xen_remap_domain_mfn_range_in_kernel(unsigned long mfn, + int nr, unsigned domid) +{ + struct vm_struct *area; + struct remap_data rmd; + struct mmu_update mmu_update[REMAP_BATCH_SIZE]; + int batch; + unsigned long range, addr; + pgprot_t prot; + int err; + + WARN_ON(in_interrupt() || irqs_disabled()); + + area = alloc_vm_area(nr << PAGE_SHIFT, NULL); + if (!area) + return NULL; + + addr = (unsigned long)area->addr; + + prot = __pgprot(pgprot_val(PAGE_KERNEL)); + rmd.mfn = &mfn; + rmd.prot = prot; + rmd.contiguous = true; + + while (nr) { + batch = min(REMAP_BATCH_SIZE, nr); + range = (unsigned long)batch << PAGE_SHIFT; + + rmd.mmu_update = mmu_update; + err = apply_to_page_range(&init_mm, addr, range, + remap_area_mfn_pte_fn, &rmd); + if (err || HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid) < 0) + goto err; + + nr -= batch; + addr += range; + } + + xen_flush_tlb_all(); + return area; +err: + free_vm_area(area); + xen_flush_tlb_all(); + return NULL; +} +EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range_in_kernel); + +void xen_unmap_domain_mfn_range_in_kernel(struct vm_struct *area, int nr, + unsigned domid) +{ + struct remap_data rmd; + struct mmu_update mmu_update; + unsigned long range, addr = (unsigned long)area->addr; +#define INVALID_MFN (~0UL) + unsigned long invalid_mfn = INVALID_MFN; + int err; + + WARN_ON(in_interrupt() || irqs_disabled()); + + rmd.prot = PAGE_NONE; + + while (nr) { + range = (unsigned long)(1 << PAGE_SHIFT); + + rmd.mfn = &invalid_mfn; + rmd.mmu_update = &mmu_update; + err = apply_to_page_range(&init_mm, addr, range, + remap_area_mfn_pte_fn, &rmd); + BUG_ON(err); + BUG_ON(HYPERVISOR_mmu_update(&mmu_update, 1, NULL, domid) < 0); + + nr--; + addr += range; + } + + free_vm_area(area); + xen_flush_tlb_all(); +} +EXPORT_SYMBOL_GPL(xen_unmap_domain_mfn_range_in_kernel); diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index b372aad3b449c..045d6d311bde2 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -528,7 +528,8 @@ int __init efi_config_parse_tables(void *config_tables, int count, int sz, } } - efi_memattr_init(); + if (efi_enabled(EFI_MEMMAP)) + efi_memattr_init(); /* Parse the EFI Properties table if it exists */ if (efi.properties_table != EFI_INVALID_TABLE_ADDR) { diff --git a/drivers/gpu/drm/i915/Kconfig b/drivers/gpu/drm/i915/Kconfig index a5cd5dacf055c..e126c2de37cef 100644 --- 
a/drivers/gpu/drm/i915/Kconfig +++ b/drivers/gpu/drm/i915/Kconfig @@ -124,6 +124,15 @@ config DRM_I915_GVT_KVMGT Choose this option if you want to enable KVMGT support for Intel GVT-g. +config DRM_I915_GVT_XENGT + tristate "Enable XEN support for Intel GVT-g" + depends on DRM_I915_GVT + depends on XEN + default n + help + Choose this option if you want to enable XENGT support for + Intel GVT-g under XEN hypervisor environment. + menu "drm/i915 Debugging" depends on DRM_I915 depends on EXPERT diff --git a/drivers/gpu/drm/i915/gvt/Makefile b/drivers/gpu/drm/i915/gvt/Makefile index b123c20e20974..289a00b98a1e0 100644 --- a/drivers/gpu/drm/i915/gvt/Makefile +++ b/drivers/gpu/drm/i915/gvt/Makefile @@ -1,8 +1,9 @@ GVT_DIR := gvt GVT_SOURCE := gvt.o aperture_gm.o handlers.o vgpu.o trace_points.o firmware.o \ interrupt.o gtt.o cfg_space.o opregion.o mmio.o display.o edid.o \ - execlist.o scheduler.o sched_policy.o render.o cmd_parser.o + execlist.o scheduler.o sched_policy.o render.o cmd_parser.o migrate.o -ccflags-y += -I$(src) -I$(src)/$(GVT_DIR) -Wall +ccflags-y += -I$(src) -I$(src)/$(GVT_DIR) i915-y += $(addprefix $(GVT_DIR)/, $(GVT_SOURCE)) obj-$(CONFIG_DRM_I915_GVT_KVMGT) += $(GVT_DIR)/kvmgt.o +obj-$(CONFIG_DRM_I915_GVT_XENGT) += $(GVT_DIR)/xengt.o diff --git a/drivers/gpu/drm/i915/gvt/aperture_gm.c b/drivers/gpu/drm/i915/gvt/aperture_gm.c index 325618d969fee..d249e87968c28 100644 --- a/drivers/gpu/drm/i915/gvt/aperture_gm.c +++ b/drivers/gpu/drm/i915/gvt/aperture_gm.c @@ -144,8 +144,10 @@ void intel_vgpu_write_fence(struct intel_vgpu *vgpu, I915_WRITE(fence_reg_lo, 0); POSTING_READ(fence_reg_lo); - I915_WRITE(fence_reg_hi, upper_32_bits(value)); - I915_WRITE(fence_reg_lo, lower_32_bits(value)); + I915_WRITE(fence_reg_hi, + intel_gvt_reg_g2h(vgpu, upper_32_bits(value), 0xFFFFF000)); + I915_WRITE(fence_reg_lo, + intel_gvt_reg_g2h(vgpu, lower_32_bits(value), 0xFFFFF000)); POSTING_READ(fence_reg_lo); } @@ -285,8 +287,8 @@ static int alloc_resource(struct intel_vgpu *vgpu, return 0; no_enough_resource: - gvt_vgpu_err("fail to allocate resource %s\n", item); - gvt_vgpu_err("request %luMB avail %luMB max %luMB taken %luMB\n", + gvt_err("fail to allocate resource %s\n", item); + gvt_err("request %luMB avail %luMB max %luMB taken %luMB\n", BYTES_TO_MB(request), BYTES_TO_MB(avail), BYTES_TO_MB(max), BYTES_TO_MB(taken)); return -ENOSPC; diff --git a/drivers/gpu/drm/i915/gvt/cfg_space.c b/drivers/gpu/drm/i915/gvt/cfg_space.c index 40af17ec63125..f3bc92ee8b55b 100644 --- a/drivers/gpu/drm/i915/gvt/cfg_space.c +++ b/drivers/gpu/drm/i915/gvt/cfg_space.c @@ -33,6 +33,7 @@ #include "i915_drv.h" #include "gvt.h" +#include "i915_pvinfo.h" enum { INTEL_GVT_PCI_BAR_GTTMMIO = 0, @@ -101,7 +102,7 @@ int intel_vgpu_emulate_cfg_read(struct intel_vgpu *vgpu, unsigned int offset, if (WARN_ON(bytes > 4)) return -EINVAL; - if (WARN_ON(offset + bytes > INTEL_GVT_MAX_CFG_SPACE_SZ)) + if (WARN_ON(offset + bytes > vgpu->gvt->device_info.cfg_space_size)) return -EINVAL; memcpy(p_data, vgpu_cfg_space(vgpu) + offset, bytes); @@ -123,7 +124,7 @@ static int map_aperture(struct intel_vgpu *vgpu, bool map) else val = *(u32 *)(vgpu_cfg_space(vgpu) + PCI_BASE_ADDRESS_2); - first_gfn = (val + vgpu_aperture_offset(vgpu)) >> PAGE_SHIFT; + first_gfn = (val + vgpu_guest_aperture_offset(vgpu)) >> PAGE_SHIFT; first_mfn = vgpu_aperture_pa_base(vgpu) >> PAGE_SHIFT; ret = intel_gvt_hypervisor_map_gfn_to_mfn(vgpu, first_gfn, @@ -288,7 +289,7 @@ int intel_vgpu_emulate_cfg_write(struct intel_vgpu *vgpu, unsigned int offset, if (WARN_ON(bytes > 
4)) return -EINVAL; - if (WARN_ON(offset + bytes > INTEL_GVT_MAX_CFG_SPACE_SZ)) + if (WARN_ON(offset + bytes > vgpu->gvt->device_info.cfg_space_size)) return -EINVAL; /* First check if it's PCI_COMMAND */ diff --git a/drivers/gpu/drm/i915/gvt/cmd_parser.c b/drivers/gpu/drm/i915/gvt/cmd_parser.c index 41b2c3aaa04a5..58de159865865 100644 --- a/drivers/gpu/drm/i915/gvt/cmd_parser.c +++ b/drivers/gpu/drm/i915/gvt/cmd_parser.c @@ -922,7 +922,8 @@ static int cmd_handler_lrr(struct parser_exec_state *s) } static inline int cmd_address_audit(struct parser_exec_state *s, - unsigned long guest_gma, int op_size, bool index_mode); + unsigned long guest_gma, int op_size, + bool index_mode, int offset); static int cmd_handler_lrm(struct parser_exec_state *s) { @@ -942,7 +943,8 @@ static int cmd_handler_lrm(struct parser_exec_state *s) gma = cmd_gma(s, i + 1); if (gmadr_bytes == 8) gma |= (cmd_gma_hi(s, i + 2)) << 32; - ret |= cmd_address_audit(s, gma, sizeof(u32), false); + ret |= cmd_address_audit(s, gma, sizeof(u32), + false, i + 1); } i += gmadr_dw_number(s) + 1; } @@ -962,7 +964,8 @@ static int cmd_handler_srm(struct parser_exec_state *s) gma = cmd_gma(s, i + 1); if (gmadr_bytes == 8) gma |= (cmd_gma_hi(s, i + 2)) << 32; - ret |= cmd_address_audit(s, gma, sizeof(u32), false); + ret |= cmd_address_audit(s, gma, sizeof(u32), + false, i + 1); } i += gmadr_dw_number(s) + 1; } @@ -1032,7 +1035,7 @@ static int cmd_handler_pipe_control(struct parser_exec_state *s) if (cmd_val(s, 1) & (1 << 21)) index_mode = true; ret |= cmd_address_audit(s, gma, sizeof(u64), - index_mode); + index_mode, 2); } } } @@ -1364,10 +1367,13 @@ static unsigned long get_gma_bb_from_cmd(struct parser_exec_state *s, int index) } static inline int cmd_address_audit(struct parser_exec_state *s, - unsigned long guest_gma, int op_size, bool index_mode) + unsigned long guest_gma, int op_size, + bool index_mode, int offset) { struct intel_vgpu *vgpu = s->vgpu; u32 max_surface_size = vgpu->gvt->device_info.max_surface_size; + int gmadr_bytes = vgpu->gvt->device_info.gmadr_bytes_in_cmd; + u64 host_gma; int i; int ret; @@ -1382,13 +1388,21 @@ static inline int cmd_address_audit(struct parser_exec_state *s, ret = -EINVAL; goto err; } - } else if ((!vgpu_gmadr_is_valid(s->vgpu, guest_gma)) || - (!vgpu_gmadr_is_valid(s->vgpu, - guest_gma + op_size - 1))) { + } else if (!intel_gvt_ggtt_validate_range(vgpu, guest_gma, op_size)) { ret = -EINVAL; goto err; + } else + intel_gvt_ggtt_gmadr_g2h(vgpu, guest_gma, &host_gma); + + if (offset > 0) { + patch_value(s, cmd_ptr(s, offset), host_gma & GENMASK(31, 2)); + if (gmadr_bytes == 8) + patch_value(s, cmd_ptr(s, offset + 1), + (host_gma >> 32) & GENMASK(15, 0)); } + return 0; + err: gvt_vgpu_err("cmd_parser: Malicious %s detected, addr=0x%lx, len=%d!\n", s->info->name, guest_gma, op_size); @@ -1429,7 +1443,7 @@ static int cmd_handler_mi_store_data_imm(struct parser_exec_state *s) gma = (gma_high << 32) | gma_low; core_id = (cmd_val(s, 1) & (1 << 0)) ? 
1 : 0; } - ret = cmd_address_audit(s, gma + op_size * core_id, op_size, false); + ret = cmd_address_audit(s, gma + op_size * core_id, op_size, false, 1); return ret; } @@ -1473,7 +1487,7 @@ static int cmd_handler_mi_op_2f(struct parser_exec_state *s) gma_high = cmd_val(s, 2) & GENMASK(15, 0); gma = (gma_high << 32) | gma; } - ret = cmd_address_audit(s, gma, op_size, false); + ret = cmd_address_audit(s, gma, op_size, false, 1); return ret; } @@ -1513,7 +1527,8 @@ static int cmd_handler_mi_flush_dw(struct parser_exec_state *s) /* Store Data Index */ if (cmd_val(s, 0) & (1 << 21)) index_mode = true; - ret = cmd_address_audit(s, gma, sizeof(u64), index_mode); + ret = cmd_address_audit(s, (gma | (1 << 2)), + sizeof(u64), index_mode, 1); } /* Check notify bit */ if ((cmd_val(s, 0) & (1 << 8))) @@ -2414,53 +2429,13 @@ static void add_cmd_entry(struct intel_gvt *gvt, struct cmd_entry *e) hash_add(gvt->cmd_table, &e->hlist, e->info->opcode); } -#define GVT_MAX_CMD_LENGTH 20 /* In Dword */ - -static void trace_cs_command(struct parser_exec_state *s, - cycles_t cost_pre_cmd_handler, cycles_t cost_cmd_handler) -{ - /* This buffer is used by ftrace to store all commands copied from - * guest gma space. Sometimes commands can cross pages, this should - * not be handled in ftrace logic. So this is just used as a - * 'bounce buffer' - */ - u32 cmd_trace_buf[GVT_MAX_CMD_LENGTH]; - int i; - u32 cmd_len = cmd_length(s); - /* The chosen value of GVT_MAX_CMD_LENGTH are just based on - * following two considerations: - * 1) From observation, most common ring commands is not that long. - * But there are execeptions. So it indeed makes sence to observe - * longer commands. - * 2) From the performance and debugging point of view, dumping all - * contents of very commands is not necessary. - * We mgith shrink GVT_MAX_CMD_LENGTH or remove this trace event in - * future for performance considerations. 
- */ - if (unlikely(cmd_len > GVT_MAX_CMD_LENGTH)) { - gvt_dbg_cmd("cmd length exceed tracing limitation!\n"); - cmd_len = GVT_MAX_CMD_LENGTH; - } - - for (i = 0; i < cmd_len; i++) - cmd_trace_buf[i] = cmd_val(s, i); - - trace_gvt_command(s->vgpu->id, s->ring_id, s->ip_gma, cmd_trace_buf, - cmd_len, s->buf_type == RING_BUFFER_INSTRUCTION, - cost_pre_cmd_handler, cost_cmd_handler); -} - /* call the cmd handler, and advance ip */ static int cmd_parser_exec(struct parser_exec_state *s) { + struct intel_vgpu *vgpu = s->vgpu; struct cmd_info *info; u32 cmd; int ret = 0; - cycles_t t0, t1, t2; - struct parser_exec_state s_before_advance_custom; - struct intel_vgpu *vgpu = s->vgpu; - - t0 = get_cycles(); cmd = cmd_val(s, 0); @@ -2471,13 +2446,10 @@ static int cmd_parser_exec(struct parser_exec_state *s) return -EINVAL; } - gvt_dbg_cmd("%s\n", info->name); - s->info = info; - t1 = get_cycles(); - - s_before_advance_custom = *s; + trace_gvt_command(vgpu->id, s->ring_id, s->ip_gma, s->ip_va, + cmd_length(s), s->buf_type); if (info->handler) { ret = info->handler(s); @@ -2486,9 +2458,6 @@ static int cmd_parser_exec(struct parser_exec_state *s) return ret; } } - t2 = get_cycles(); - - trace_cs_command(&s_before_advance_custom, t1 - t0, t2 - t1); if (!(info->flag & F_IP_ADVANCE_CUSTOM)) { ret = cmd_advance_default(s); @@ -2522,8 +2491,6 @@ static int command_scan(struct parser_exec_state *s, gma_tail = rb_start + rb_tail; gma_bottom = rb_start + rb_len; - gvt_dbg_cmd("scan_start: start=%lx end=%lx\n", gma_head, gma_tail); - while (s->ip_gma != gma_tail) { if (s->buf_type == RING_BUFFER_INSTRUCTION) { if (!(s->ip_gma >= rb_start) || @@ -2552,8 +2519,6 @@ static int command_scan(struct parser_exec_state *s, } } - gvt_dbg_cmd("scan_end\n"); - return ret; } @@ -2586,6 +2551,11 @@ static int scan_workload(struct intel_vgpu_workload *workload) gma_head == gma_tail) return 0; + if (!intel_gvt_ggtt_validate_range(s.vgpu, s.ring_start, s.ring_size)) { + ret = -EINVAL; + goto out; + } + ret = ip_gma_set(&s, gma_head); if (ret) goto out; @@ -2629,6 +2599,11 @@ static int scan_wa_ctx(struct intel_shadow_wa_ctx *wa_ctx) s.rb_va = wa_ctx->indirect_ctx.shadow_va; s.workload = workload; + if (!intel_gvt_ggtt_validate_range(s.vgpu, s.ring_start, s.ring_size)) { + ret = -EINVAL; + goto out; + } + ret = ip_gma_set(&s, gma_head); if (ret) goto out; @@ -2687,7 +2662,7 @@ static int shadow_workload_ring_buffer(struct intel_vgpu_workload *workload) return 0; } -int intel_gvt_scan_and_shadow_workload(struct intel_vgpu_workload *workload) +int intel_gvt_scan_and_shadow_ringbuffer(struct intel_vgpu_workload *workload) { int ret; struct intel_vgpu *vgpu = workload->vgpu; diff --git a/drivers/gpu/drm/i915/gvt/cmd_parser.h b/drivers/gpu/drm/i915/gvt/cmd_parser.h index bed33514103c3..2867036430027 100644 --- a/drivers/gpu/drm/i915/gvt/cmd_parser.h +++ b/drivers/gpu/drm/i915/gvt/cmd_parser.h @@ -42,7 +42,7 @@ void intel_gvt_clean_cmd_parser(struct intel_gvt *gvt); int intel_gvt_init_cmd_parser(struct intel_gvt *gvt); -int intel_gvt_scan_and_shadow_workload(struct intel_vgpu_workload *workload); +int intel_gvt_scan_and_shadow_ringbuffer(struct intel_vgpu_workload *workload); int intel_gvt_scan_and_shadow_wa_ctx(struct intel_shadow_wa_ctx *wa_ctx); diff --git a/drivers/gpu/drm/i915/gvt/display.c b/drivers/gpu/drm/i915/gvt/display.c index e0261fcc5b504..da6ec015d0422 100644 --- a/drivers/gpu/drm/i915/gvt/display.c +++ b/drivers/gpu/drm/i915/gvt/display.c @@ -197,6 +197,12 @@ static void emulate_monitor_status_change(struct intel_vgpu 
*vgpu) (TRANS_DDI_BPC_8 | TRANS_DDI_MODE_SELECT_DP_SST | (PORT_B << TRANS_DDI_PORT_SHIFT) | TRANS_DDI_FUNC_ENABLE); + if (IS_BROADWELL(dev_priv)) { + vgpu_vreg(vgpu, PORT_CLK_SEL(PORT_B)) &= + ~PORT_CLK_SEL_MASK; + vgpu_vreg(vgpu, PORT_CLK_SEL(PORT_B)) |= + PORT_CLK_SEL_LCPLL_810; + } vgpu_vreg(vgpu, DDI_BUF_CTL(PORT_B)) |= DDI_BUF_CTL_ENABLE; vgpu_vreg(vgpu, DDI_BUF_CTL(PORT_B)) &= ~DDI_BUF_IS_IDLE; vgpu_vreg(vgpu, SDEISR) |= SDE_PORTB_HOTPLUG_CPT; @@ -211,6 +217,12 @@ static void emulate_monitor_status_change(struct intel_vgpu *vgpu) (TRANS_DDI_BPC_8 | TRANS_DDI_MODE_SELECT_DP_SST | (PORT_C << TRANS_DDI_PORT_SHIFT) | TRANS_DDI_FUNC_ENABLE); + if (IS_BROADWELL(dev_priv)) { + vgpu_vreg(vgpu, PORT_CLK_SEL(PORT_C)) &= + ~PORT_CLK_SEL_MASK; + vgpu_vreg(vgpu, PORT_CLK_SEL(PORT_C)) |= + PORT_CLK_SEL_LCPLL_810; + } vgpu_vreg(vgpu, DDI_BUF_CTL(PORT_C)) |= DDI_BUF_CTL_ENABLE; vgpu_vreg(vgpu, DDI_BUF_CTL(PORT_C)) &= ~DDI_BUF_IS_IDLE; vgpu_vreg(vgpu, SFUSE_STRAP) |= SFUSE_STRAP_DDIC_DETECTED; @@ -225,6 +237,12 @@ static void emulate_monitor_status_change(struct intel_vgpu *vgpu) (TRANS_DDI_BPC_8 | TRANS_DDI_MODE_SELECT_DP_SST | (PORT_D << TRANS_DDI_PORT_SHIFT) | TRANS_DDI_FUNC_ENABLE); + if (IS_BROADWELL(dev_priv)) { + vgpu_vreg(vgpu, PORT_CLK_SEL(PORT_D)) &= + ~PORT_CLK_SEL_MASK; + vgpu_vreg(vgpu, PORT_CLK_SEL(PORT_D)) |= + PORT_CLK_SEL_LCPLL_810; + } vgpu_vreg(vgpu, DDI_BUF_CTL(PORT_D)) |= DDI_BUF_CTL_ENABLE; vgpu_vreg(vgpu, DDI_BUF_CTL(PORT_D)) &= ~DDI_BUF_IS_IDLE; vgpu_vreg(vgpu, SFUSE_STRAP) |= SFUSE_STRAP_DDID_DETECTED; @@ -244,6 +262,10 @@ static void emulate_monitor_status_change(struct intel_vgpu *vgpu) vgpu_vreg(vgpu, DDI_BUF_CTL(PORT_A)) |= DDI_INIT_DISPLAY_DETECTED; } + + /* Clear host CRT status, so guest couldn't detect this host CRT. */ + if (IS_BROADWELL(dev_priv)) + vgpu_vreg(vgpu, PCH_ADPA) &= ~ADPA_CRT_HOTPLUG_MONITOR_MASK; } static void clean_virtual_dp_monitor(struct intel_vgpu *vgpu, int port_num) @@ -301,27 +323,27 @@ void intel_gvt_check_vblank_emulation(struct intel_gvt *gvt) { struct intel_gvt_irq *irq = &gvt->irq; struct intel_vgpu *vgpu; - bool have_enabled_pipe = false; int pipe, id; if (WARN_ON(!mutex_is_locked(&gvt->lock))) return; - hrtimer_cancel(&irq->vblank_timer.timer); - for_each_active_vgpu(gvt, vgpu, id) { for (pipe = 0; pipe < I915_MAX_PIPES; pipe++) { - have_enabled_pipe = - pipe_is_enabled(vgpu, pipe); - if (have_enabled_pipe) - break; + if (pipe_is_enabled(vgpu, pipe)) + goto out; } } - if (have_enabled_pipe) - hrtimer_start(&irq->vblank_timer.timer, - ktime_add_ns(ktime_get(), irq->vblank_timer.period), - HRTIMER_MODE_ABS); + /* all the pipes are disabled */ + hrtimer_cancel(&irq->vblank_timer.timer); + return; + +out: + hrtimer_start(&irq->vblank_timer.timer, + ktime_add_ns(ktime_get(), irq->vblank_timer.period), + HRTIMER_MODE_ABS); + } static void emulate_vblank_on_pipe(struct intel_vgpu *vgpu, int pipe) @@ -390,12 +412,13 @@ void intel_gvt_emulate_vblank(struct intel_gvt *gvt) */ void intel_vgpu_clean_display(struct intel_vgpu *vgpu) { - struct drm_i915_private *dev_priv = vgpu->gvt->dev_priv; + enum port port; - if (IS_SKYLAKE(dev_priv) || IS_KABYLAKE(dev_priv)) - clean_virtual_dp_monitor(vgpu, PORT_D); - else - clean_virtual_dp_monitor(vgpu, PORT_B); + for (port = PORT_B; port < PORT_E; port++) { + if (intel_vgpu_has_monitor_on_port(vgpu, port) && + intel_vgpu_port_is_dp(vgpu, port)) + clean_virtual_dp_monitor(vgpu, port); + } } /** @@ -411,15 +434,33 @@ void intel_vgpu_clean_display(struct intel_vgpu *vgpu) int intel_vgpu_init_display(struct 
intel_vgpu *vgpu, u64 resolution)
 {
 	struct drm_i915_private *dev_priv = vgpu->gvt->dev_priv;
+	enum port port;
+	enum intel_vgpu_port_type type;
 
 	intel_vgpu_init_i2c_edid(vgpu);
 
-	if (IS_SKYLAKE(dev_priv) || IS_KABYLAKE(dev_priv))
-		return setup_virtual_dp_monitor(vgpu, PORT_D, GVT_DP_D,
-						resolution);
-	else
-		return setup_virtual_dp_monitor(vgpu, PORT_B, GVT_DP_B,
-						resolution);
+	for (port = PORT_B; port < PORT_E; port++) {
+		if (!dev_priv->vbt.ddi_port_info[port].supports_dp)
+			continue;
+
+		switch (port) {
+		case PORT_B:
+			type = GVT_DP_B;
+			break;
+		case PORT_C:
+			type = GVT_DP_C;
+			break;
+		case PORT_D:
+			type = GVT_DP_D;
+			break;
+		default:
+			BUG();
+		}
+
+		return setup_virtual_dp_monitor(vgpu, port, type, resolution);
+	}
+
+	return -EINVAL;
 }
 
 /**
diff --git a/drivers/gpu/drm/i915/gvt/execlist.c b/drivers/gpu/drm/i915/gvt/execlist.c
index 24fe04d6307b0..705b53a35c042 100644
--- a/drivers/gpu/drm/i915/gvt/execlist.c
+++ b/drivers/gpu/drm/i915/gvt/execlist.c
@@ -46,6 +46,8 @@
 #define same_context(a, b) (((a)->context_id == (b)->context_id) && \
 		((a)->lrca == (b)->lrca))
 
+static void clean_workloads(struct intel_vgpu *vgpu, unsigned long engine_mask);
+
 static int context_switch_events[] = {
 	[RCS] = RCS_AS_CONTEXT_SWITCH,
 	[BCS] = BCS_AS_CONTEXT_SWITCH,
@@ -499,10 +501,10 @@ static void release_shadow_wa_ctx(struct intel_shadow_wa_ctx *wa_ctx)
 static int complete_execlist_workload(struct intel_vgpu_workload *workload)
 {
 	struct intel_vgpu *vgpu = workload->vgpu;
-	struct intel_vgpu_execlist *execlist =
-		&vgpu->execlist[workload->ring_id];
+	int ring_id = workload->ring_id;
+	struct intel_vgpu_execlist *execlist = &vgpu->execlist[ring_id];
 	struct intel_vgpu_workload *next_workload;
-	struct list_head *next = workload_q_head(vgpu, workload->ring_id)->next;
+	struct list_head *next = workload_q_head(vgpu, ring_id)->next;
 	bool lite_restore = false;
 	int ret;
 
@@ -512,10 +514,25 @@ static int complete_execlist_workload(struct intel_vgpu_workload *workload)
 	release_shadow_batch_buffer(workload);
 	release_shadow_wa_ctx(&workload->wa_ctx);
 
-	if (workload->status || vgpu->resetting)
+	if (workload->status || (vgpu->resetting_eng & ENGINE_MASK(ring_id))) {
+		/* If workload->status is not successful, the HW GPU has hit
+		 * a GPU hang or something has gone wrong in i915/GVT, and
+		 * GVT won't inject a context switch interrupt to the guest.
+		 * So this error is effectively a vGPU hang from the guest's
+		 * point of view, and we should emulate a vGPU hang. If there
+		 * are pending workloads that were already submitted by the
+		 * guest, we should clean them up the way the HW GPU does.
+		 *
+		 * If we are in the middle of an engine reset, the pending
+		 * workloads won't be submitted to the HW GPU and will be
+		 * cleaned up later during the reset, so doing
+		 * the workload clean up here doesn't have any impact.
+ **/ + clean_workloads(vgpu, ENGINE_MASK(ring_id)); goto out; + } - if (!list_empty(workload_q_head(vgpu, workload->ring_id))) { + if (!list_empty(workload_q_head(vgpu, ring_id))) { struct execlist_ctx_descriptor_format *this_desc, *next_desc; next_workload = container_of(next, @@ -605,6 +622,7 @@ static int submit_context(struct intel_vgpu *vgpu, int ring_id, struct list_head *q = workload_q_head(vgpu, ring_id); struct intel_vgpu_workload *last_workload = get_last_workload(q); struct intel_vgpu_workload *workload = NULL; + struct drm_i915_private *dev_priv = vgpu->gvt->dev_priv; u64 ring_context_gpa; u32 head, tail, start, ctl, ctx_ctl, per_ctx, indirect_ctx; int ret; @@ -668,6 +686,7 @@ static int submit_context(struct intel_vgpu *vgpu, int ring_id, workload->complete = complete_execlist_workload; workload->status = -EINPROGRESS; workload->emulate_schedule_in = emulate_schedule_in; + workload->shadowed = false; if (ring_id == RCS) { intel_gvt_hypervisor_read_gpa(vgpu, ring_context_gpa + @@ -701,6 +720,17 @@ static int submit_context(struct intel_vgpu *vgpu, int ring_id, return ret; } + /* Only scan and shadow the first workload in the queue + * as there is only one pre-allocated buf-obj for shadow. + */ + if (list_empty(workload_q_head(vgpu, ring_id))) { + intel_runtime_pm_get(dev_priv); + mutex_lock(&dev_priv->drm.struct_mutex); + intel_gvt_scan_and_shadow_workload(workload); + mutex_unlock(&dev_priv->drm.struct_mutex); + intel_runtime_pm_put(dev_priv); + } + queue_workload(workload); return 0; } diff --git a/drivers/gpu/drm/i915/gvt/firmware.c b/drivers/gpu/drm/i915/gvt/firmware.c index dce8d15f706f5..a26c1705430eb 100644 --- a/drivers/gpu/drm/i915/gvt/firmware.c +++ b/drivers/gpu/drm/i915/gvt/firmware.c @@ -72,11 +72,13 @@ static int expose_firmware_sysfs(struct intel_gvt *gvt) struct intel_gvt_device_info *info = &gvt->device_info; struct pci_dev *pdev = gvt->dev_priv->drm.pdev; struct intel_gvt_mmio_info *e; + struct gvt_mmio_block *block = gvt->mmio.mmio_block; + int num = gvt->mmio.num_mmio_block; struct gvt_firmware_header *h; void *firmware; void *p; unsigned long size, crc32_start; - int i; + int i, j; int ret; size = sizeof(*h) + info->mmio_size + info->cfg_space_size; @@ -102,12 +104,14 @@ static int expose_firmware_sysfs(struct intel_gvt *gvt) p = firmware + h->mmio_offset; - hash_for_each(gvt->mmio.mmio_info_table, i, e, node) { - int j; + hash_for_each(gvt->mmio.mmio_info_table, i, e, node) + *(u32 *)(p + e->offset) = I915_READ_NOTRACE(_MMIO(e->offset)); - for (j = 0; j < e->length; j += 4) - *(u32 *)(p + e->offset + j) = - I915_READ_NOTRACE(_MMIO(e->offset + j)); + for (i = 0; i < num; i++, block++) { + for (j = 0; j < block->size; j += 4) + *(u32 *)(p + INTEL_GVT_MMIO_OFFSET(block->offset) + j) = + I915_READ_NOTRACE(_MMIO(INTEL_GVT_MMIO_OFFSET( + block->offset) + j)); } memcpy(gvt->firmware.mmio, p, info->mmio_size); diff --git a/drivers/gpu/drm/i915/gvt/gtt.c b/drivers/gpu/drm/i915/gvt/gtt.c index c6f0077f590d2..6e5702108928f 100644 --- a/drivers/gpu/drm/i915/gvt/gtt.c +++ b/drivers/gpu/drm/i915/gvt/gtt.c @@ -32,7 +32,8 @@ * Bing Niu * */ - +#include +#include #include "i915_drv.h" #include "gvt.h" #include "i915_pvinfo.h" @@ -59,16 +60,15 @@ bool intel_gvt_ggtt_validate_range(struct intel_vgpu *vgpu, u64 addr, u32 size) /* translate a guest gmadr to host gmadr */ int intel_gvt_ggtt_gmadr_g2h(struct intel_vgpu *vgpu, u64 g_addr, u64 *h_addr) { - if (WARN(!vgpu_gmadr_is_valid(vgpu, g_addr), - "invalid guest gmadr %llx\n", g_addr)) + if (!vgpu_gmadr_is_valid(vgpu, 
g_addr)) return -EACCES; if (vgpu_gmadr_is_aperture(vgpu, g_addr)) *h_addr = vgpu_aperture_gmadr_base(vgpu) - + (g_addr - vgpu_aperture_offset(vgpu)); + + (g_addr - vgpu_guest_aperture_gmadr_base(vgpu)); else *h_addr = vgpu_hidden_gmadr_base(vgpu) - + (g_addr - vgpu_hidden_offset(vgpu)); + + (g_addr - vgpu_guest_hidden_gmadr_base(vgpu)); return 0; } @@ -80,10 +80,10 @@ int intel_gvt_ggtt_gmadr_h2g(struct intel_vgpu *vgpu, u64 h_addr, u64 *g_addr) return -EACCES; if (gvt_gmadr_is_aperture(vgpu->gvt, h_addr)) - *g_addr = vgpu_aperture_gmadr_base(vgpu) + *g_addr = vgpu_guest_aperture_gmadr_base(vgpu) + (h_addr - gvt_aperture_gmadr_base(vgpu->gvt)); else - *g_addr = vgpu_hidden_gmadr_base(vgpu) + *g_addr = vgpu_guest_hidden_gmadr_base(vgpu) + (h_addr - gvt_hidden_gmadr_base(vgpu->gvt)); return 0; } @@ -244,15 +244,19 @@ static u64 read_pte64(struct drm_i915_private *dev_priv, unsigned long index) return readq(addr); } +static void gtt_invalidate(struct drm_i915_private *dev_priv) +{ + mmio_hw_access_pre(dev_priv); + I915_WRITE(GFX_FLSH_CNTL_GEN6, GFX_FLSH_CNTL_EN); + mmio_hw_access_post(dev_priv); +} + static void write_pte64(struct drm_i915_private *dev_priv, unsigned long index, u64 pte) { void __iomem *addr = (gen8_pte_t __iomem *)dev_priv->ggtt.gsm + index; writeq(pte, addr); - - I915_WRITE(GFX_FLSH_CNTL_GEN6, GFX_FLSH_CNTL_EN); - POSTING_READ(GFX_FLSH_CNTL_GEN6); } static inline struct intel_gvt_gtt_entry *gtt_get_entry64(void *pt, @@ -1325,6 +1329,10 @@ static int ppgtt_handle_guest_write_page_table_bytes(void *gp, index = (pa & (PAGE_SIZE - 1)) >> info->gtt_entry_size_shift; + if (xen_initial_domain()) + /* Set guest ppgtt entry.Optional for KVMGT,but MUST for XENGT*/ + intel_gvt_hypervisor_write_gpa(vgpu, pa, p_data, bytes); + ppgtt_get_guest_entry(spt, &we, index); ops->test_pse(&we); @@ -1459,7 +1467,7 @@ void intel_vgpu_destroy_mm(struct kref *mm_ref) list_del(&mm->list); list_del(&mm->lru_list); - if (mm->has_shadow_page_table) + if (mm->has_shadow_page_table && mm->shadowed) invalidate_mm(mm); gtt->mm_free_page_table(mm); @@ -1649,7 +1657,8 @@ static int reclaim_one_mm(struct intel_gvt *gvt) continue; list_del_init(&mm->lru_list); - invalidate_mm(mm); + if (mm->has_shadow_page_table && mm->shadowed) + invalidate_mm(mm); return 1; } return 0; @@ -1815,17 +1824,15 @@ static int emulate_gtt_mmio_write(struct intel_vgpu *vgpu, unsigned int off, struct intel_vgpu_mm *ggtt_mm = vgpu->gtt.ggtt_mm; struct intel_gvt_gtt_pte_ops *ops = gvt->gtt.pte_ops; unsigned long g_gtt_index = off >> info->gtt_entry_size_shift; - unsigned long gma; + unsigned long h_gtt_index; struct intel_gvt_gtt_entry e, m; int ret; if (bytes != 4 && bytes != 8) return -EINVAL; - gma = g_gtt_index << GTT_PAGE_SHIFT; - /* the VM may configure the whole GM space when ballooning is used */ - if (!vgpu_gmadr_is_valid(vgpu, gma)) + if (intel_gvt_ggtt_index_g2h(vgpu, g_gtt_index, &h_gtt_index)) return 0; ggtt_get_guest_entry(ggtt_mm, &e, g_gtt_index); @@ -1848,7 +1855,8 @@ static int emulate_gtt_mmio_write(struct intel_vgpu *vgpu, unsigned int off, ops->set_pfn(&m, gvt->gtt.scratch_ggtt_mfn); } - ggtt_set_shadow_entry(ggtt_mm, &m, g_gtt_index); + ggtt_set_shadow_entry(ggtt_mm, &m, h_gtt_index); + gtt_invalidate(gvt->dev_priv); ggtt_set_guest_entry(ggtt_mm, &e, g_gtt_index); return 0; } @@ -2031,6 +2039,21 @@ static void intel_vgpu_free_mm(struct intel_vgpu *vgpu, int type) } } +void intel_vgpu_invalidate_ppgtt(struct intel_vgpu *vgpu) +{ + struct list_head *pos, *n; + struct intel_vgpu_mm *mm; + + list_for_each_safe(pos, n, 
&vgpu->gtt.mm_list_head) { + mm = container_of(pos, struct intel_vgpu_mm, list); + if (mm->type == INTEL_GVT_MM_PPGTT) { + list_del_init(&mm->lru_list); + if (mm->has_shadow_page_table && mm->shadowed) + invalidate_mm(mm); + } + } +} + /** * intel_vgpu_clean_gtt - clean up per-vGPU graphics memory virulization * @vgpu: a vGPU @@ -2254,6 +2277,8 @@ int intel_gvt_init_gtt(struct intel_gvt *gvt) ret = setup_spt_oos(gvt); if (ret) { gvt_err("fail to initialize SPT oos\n"); + dma_unmap_page(dev, daddr, 4096, PCI_DMA_BIDIRECTIONAL); + __free_page(gvt->gtt.scratch_ggtt_page); return ret; } } @@ -2301,8 +2326,6 @@ void intel_vgpu_reset_ggtt(struct intel_vgpu *vgpu) u32 num_entries; struct intel_gvt_gtt_entry e; - intel_runtime_pm_get(dev_priv); - memset(&e, 0, sizeof(struct intel_gvt_gtt_entry)); e.type = GTT_TYPE_GGTT_PTE; ops->set_pfn(&e, gvt->gtt.scratch_ggtt_mfn); @@ -2318,33 +2341,27 @@ void intel_vgpu_reset_ggtt(struct intel_vgpu *vgpu) for (offset = 0; offset < num_entries; offset++) ops->set_entry(NULL, &e, index + offset, false, 0, vgpu); - intel_runtime_pm_put(dev_priv); + gtt_invalidate(dev_priv); } /** * intel_vgpu_reset_gtt - reset the all GTT related status * @vgpu: a vGPU - * @dmlr: true for vGPU Device Model Level Reset, false for GT Reset * * This function is called from vfio core to reset reset all * GTT related status, including GGTT, PPGTT, scratch page. * */ -void intel_vgpu_reset_gtt(struct intel_vgpu *vgpu, bool dmlr) +void intel_vgpu_reset_gtt(struct intel_vgpu *vgpu) { int i; - ppgtt_free_all_shadow_page(vgpu); - /* Shadow pages are only created when there is no page * table tracking data, so remove page tracking data after * removing the shadow pages. */ intel_vgpu_free_mm(vgpu, INTEL_GVT_MM_PPGTT); - if (!dmlr) - return; - intel_vgpu_reset_ggtt(vgpu); /* clear scratch page for security */ diff --git a/drivers/gpu/drm/i915/gvt/gtt.h b/drivers/gpu/drm/i915/gvt/gtt.h index f88eb5e89bea0..cb12a5661ca46 100644 --- a/drivers/gpu/drm/i915/gvt/gtt.h +++ b/drivers/gpu/drm/i915/gvt/gtt.h @@ -208,7 +208,8 @@ extern void intel_vgpu_clean_gtt(struct intel_vgpu *vgpu); void intel_vgpu_reset_ggtt(struct intel_vgpu *vgpu); extern int intel_gvt_init_gtt(struct intel_gvt *gvt); -extern void intel_vgpu_reset_gtt(struct intel_vgpu *vgpu, bool dmlr); +void intel_vgpu_reset_gtt(struct intel_vgpu *vgpu); +void intel_vgpu_invalidate_ppgtt(struct intel_vgpu *vgpu); extern void intel_gvt_clean_gtt(struct intel_gvt *gvt); extern struct intel_vgpu_mm *intel_gvt_find_ppgtt_mm(struct intel_vgpu *vgpu, diff --git a/drivers/gpu/drm/i915/gvt/gvt.c b/drivers/gpu/drm/i915/gvt/gvt.c index 7dea5e5d55679..83417353ea62d 100644 --- a/drivers/gpu/drm/i915/gvt/gvt.c +++ b/drivers/gpu/drm/i915/gvt/gvt.c @@ -54,6 +54,7 @@ static const struct intel_gvt_ops intel_gvt_ops = { .vgpu_reset = intel_gvt_reset_vgpu, .vgpu_activate = intel_gvt_activate_vgpu, .vgpu_deactivate = intel_gvt_deactivate_vgpu, + .vgpu_save_restore = intel_gvt_save_restore, }; /** @@ -111,7 +112,7 @@ static void init_device_info(struct intel_gvt *gvt) if (IS_BROADWELL(gvt->dev_priv) || IS_SKYLAKE(gvt->dev_priv) || IS_KABYLAKE(gvt->dev_priv)) { info->max_support_vgpus = 8; - info->cfg_space_size = 256; + info->cfg_space_size = PCI_CFG_SPACE_EXP_SIZE; info->mmio_size = 2 * 1024 * 1024; info->mmio_bar = 0; info->gtt_start_offset = 8 * 1024 * 1024; @@ -147,7 +148,9 @@ static int gvt_service_thread(void *data) mutex_unlock(&gvt->lock); } - if (test_and_clear_bit(INTEL_GVT_REQUEST_SCHED, + if (test_bit(INTEL_GVT_REQUEST_SCHED, + (void 
*)&gvt->service_request) || + test_bit(INTEL_GVT_REQUEST_EVENT_SCHED, (void *)&gvt->service_request)) { intel_gvt_schedule(gvt); } @@ -244,7 +247,7 @@ int intel_gvt_init_device(struct drm_i915_private *dev_priv) gvt_dbg_core("init gvt device\n"); idr_init(&gvt->vgpu_idr); - + spin_lock_init(&gvt->scheduler.mmio_context_lock); mutex_init(&gvt->lock); gvt->dev_priv = dev_priv; diff --git a/drivers/gpu/drm/i915/gvt/gvt.h b/drivers/gpu/drm/i915/gvt/gvt.h index 930732e5c7806..76dae55529416 100644 --- a/drivers/gpu/drm/i915/gvt/gvt.h +++ b/drivers/gpu/drm/i915/gvt/gvt.h @@ -46,6 +46,7 @@ #include "sched_policy.h" #include "render.h" #include "cmd_parser.h" +#include "migrate.h" #define GVT_MAX_VGPU 8 @@ -99,7 +100,6 @@ struct intel_vgpu_mmio { bool disable_warn_untrack; }; -#define INTEL_GVT_MAX_CFG_SPACE_SZ 256 #define INTEL_GVT_MAX_BAR_NUM 4 struct intel_vgpu_pci_bar { @@ -108,7 +108,7 @@ struct intel_vgpu_pci_bar { }; struct intel_vgpu_cfg_space { - unsigned char virtual_cfg_space[INTEL_GVT_MAX_CFG_SPACE_SZ]; + unsigned char virtual_cfg_space[PCI_CFG_SPACE_EXP_SIZE]; struct intel_vgpu_pci_bar bar[INTEL_GVT_MAX_BAR_NUM]; }; @@ -149,7 +149,7 @@ struct intel_vgpu { bool active; bool pv_notified; bool failsafe; - bool resetting; + unsigned int resetting_eng; void *sched_data; struct vgpu_sched_ctl sched_ctl; @@ -168,6 +168,7 @@ struct intel_vgpu { ktime_t last_ctx_submit_time; DECLARE_BITMAP(tlb_handle_pending, I915_NUM_ENGINES); struct i915_gem_context *shadow_ctx; + unsigned long low_mem_max_gpfn; #if IS_ENABLED(CONFIG_DRM_I915_GVT_KVMGT) struct { @@ -196,11 +197,39 @@ struct intel_gvt_fence { unsigned long vgpu_allocated_fence_num; }; -#define INTEL_GVT_MMIO_HASH_BITS 9 +/* Special MMIO blocks. */ +struct gvt_mmio_block { + unsigned int device; + i915_reg_t offset; + unsigned int size; + gvt_mmio_func read; + gvt_mmio_func write; +}; + +#define INTEL_GVT_MMIO_HASH_BITS 11 struct intel_gvt_mmio { - u32 *mmio_attribute; + u8 *mmio_attribute; +/* Register contains RO bits */ +#define F_RO (1 << 0) +/* Register contains graphics address */ +#define F_GMADR (1 << 1) +/* Mode mask registers with high 16 bits as the mask bits */ +#define F_MODE_MASK (1 << 2) +/* This reg can be accessed by GPU commands */ +#define F_CMD_ACCESS (1 << 3) +/* This reg has been accessed by a VM */ +#define F_ACCESSED (1 << 4) +/* This reg has been accessed through GPU commands */ +#define F_CMD_ACCESSED (1 << 5) +/* This reg could be accessed by unaligned address */ +#define F_UNALIGN (1 << 6) + + struct gvt_mmio_block *mmio_block; + unsigned int num_mmio_block; + DECLARE_HASHTABLE(mmio_info_table, INTEL_GVT_MMIO_HASH_BITS); + unsigned int num_tracked_mmio; }; struct intel_gvt_firmware { @@ -257,7 +286,12 @@ static inline struct intel_gvt *to_gvt(struct drm_i915_private *i915) enum { INTEL_GVT_REQUEST_EMULATE_VBLANK = 0, + + /* Scheduling trigger by timer */ INTEL_GVT_REQUEST_SCHED = 1, + + /* Scheduling trigger by event */ + INTEL_GVT_REQUEST_EVENT_SCHED = 2, }; static inline void intel_gvt_request_service(struct intel_gvt *gvt, @@ -323,6 +357,20 @@ int intel_gvt_load_firmware(struct intel_gvt *gvt); #define vgpu_fence_base(vgpu) (vgpu->fence.base) #define vgpu_fence_sz(vgpu) (vgpu->fence.size) +/* Aperture/GM space definitions for vGPU Guest view point */ +#define vgpu_guest_aperture_offset(vgpu) \ + vgpu_vreg(vgpu, vgtif_reg(avail_rs.mappable_gmadr.base)) +#define vgpu_guest_hidden_offset(vgpu) \ + vgpu_vreg(vgpu, vgtif_reg(avail_rs.nonmappable_gmadr.base)) + +#define vgpu_guest_aperture_gmadr_base(vgpu) 
(vgpu_guest_aperture_offset(vgpu)) +#define vgpu_guest_aperture_gmadr_end(vgpu) \ + (vgpu_guest_aperture_gmadr_base(vgpu) + vgpu_aperture_sz(vgpu) - 1) + +#define vgpu_guest_hidden_gmadr_base(vgpu) (vgpu_guest_hidden_offset(vgpu)) +#define vgpu_guest_hidden_gmadr_end(vgpu) \ + (vgpu_guest_hidden_gmadr_base(vgpu) + vgpu_hidden_sz(vgpu) - 1) + struct intel_vgpu_creation_params { __u64 handle; __u64 low_gm_sz; /* in MB */ @@ -397,15 +445,17 @@ void intel_gvt_reset_vgpu_locked(struct intel_vgpu *vgpu, bool dmlr, void intel_gvt_reset_vgpu(struct intel_vgpu *vgpu); void intel_gvt_activate_vgpu(struct intel_vgpu *vgpu); void intel_gvt_deactivate_vgpu(struct intel_vgpu *vgpu); +int intel_gvt_save_restore(struct intel_vgpu *vgpu, char *buf, size_t count, + void *base, uint64_t off, bool restore); /* validating GM functions */ #define vgpu_gmadr_is_aperture(vgpu, gmadr) \ - ((gmadr >= vgpu_aperture_gmadr_base(vgpu)) && \ - (gmadr <= vgpu_aperture_gmadr_end(vgpu))) + ((gmadr >= vgpu_guest_aperture_gmadr_base(vgpu)) && \ + (gmadr <= vgpu_guest_aperture_gmadr_end(vgpu))) #define vgpu_gmadr_is_hidden(vgpu, gmadr) \ - ((gmadr >= vgpu_hidden_gmadr_base(vgpu)) && \ - (gmadr <= vgpu_hidden_gmadr_end(vgpu))) + ((gmadr >= vgpu_guest_hidden_gmadr_base(vgpu)) && \ + (gmadr <= vgpu_guest_hidden_gmadr_end(vgpu))) #define vgpu_gmadr_is_valid(vgpu, gmadr) \ ((vgpu_gmadr_is_aperture(vgpu, gmadr) || \ @@ -431,6 +481,20 @@ int intel_gvt_ggtt_index_g2h(struct intel_vgpu *vgpu, unsigned long g_index, int intel_gvt_ggtt_h2g_index(struct intel_vgpu *vgpu, unsigned long h_index, unsigned long *g_index); +/* apply guest to host gma conversion in GM registers setting */ +static inline u64 intel_gvt_reg_g2h(struct intel_vgpu *vgpu, + u32 addr, u32 mask) +{ + u64 gma; + + if (addr) { + intel_gvt_ggtt_gmadr_g2h(vgpu, + addr & mask, &gma); + addr = gma | (addr & (~mask)); + } + return addr; +} + void intel_vgpu_init_cfg_space(struct intel_vgpu *vgpu, bool primary); void intel_vgpu_reset_cfg_space(struct intel_vgpu *vgpu); @@ -450,6 +514,8 @@ int intel_vgpu_init_opregion(struct intel_vgpu *vgpu, u32 gpa); int intel_vgpu_emulate_opregion_request(struct intel_vgpu *vgpu, u32 swsci); void populate_pvinfo_page(struct intel_vgpu *vgpu); +int intel_gvt_scan_and_shadow_workload(struct intel_vgpu_workload *workload); + struct intel_gvt_ops { int (*emulate_cfg_read)(struct intel_vgpu *, unsigned int, void *, unsigned int); @@ -465,6 +531,8 @@ struct intel_gvt_ops { void (*vgpu_reset)(struct intel_vgpu *); void (*vgpu_activate)(struct intel_vgpu *); void (*vgpu_deactivate)(struct intel_vgpu *); + int (*vgpu_save_restore)(struct intel_vgpu *, char *buf, size_t count, + void *base, uint64_t off, bool restore); }; @@ -473,6 +541,80 @@ enum { GVT_FAILSAFE_INSUFFICIENT_RESOURCE, }; +static inline void mmio_hw_access_pre(struct drm_i915_private *dev_priv) +{ + intel_runtime_pm_get(dev_priv); +} + +static inline void mmio_hw_access_post(struct drm_i915_private *dev_priv) +{ + intel_runtime_pm_put(dev_priv); +} + +/** + * intel_gvt_mmio_set_accessed - mark a MMIO has been accessed + * @gvt: a GVT device + * @offset: register offset + * + */ +static inline void intel_gvt_mmio_set_accessed( + struct intel_gvt *gvt, unsigned int offset) +{ + gvt->mmio.mmio_attribute[offset >> 2] |= F_ACCESSED; +} + +/** + * intel_gvt_mmio_is_cmd_accessed - mark a MMIO could be accessed by command + * @gvt: a GVT device + * @offset: register offset + * + */ +static inline bool intel_gvt_mmio_is_cmd_access( + struct intel_gvt *gvt, unsigned int offset) +{ + return 
gvt->mmio.mmio_attribute[offset >> 2] & F_CMD_ACCESS; +} + +/** + * intel_gvt_mmio_is_unalign - mark a MMIO could be accessed unaligned + * @gvt: a GVT device + * @offset: register offset + * + */ +static inline bool intel_gvt_mmio_is_unalign( + struct intel_gvt *gvt, unsigned int offset) +{ + return gvt->mmio.mmio_attribute[offset >> 2] & F_UNALIGN; +} + +/** + * intel_gvt_mmio_set_cmd_accessed - mark a MMIO has been accessed by command + * @gvt: a GVT device + * @offset: register offset + * + */ +static inline void intel_gvt_mmio_set_cmd_accessed( + struct intel_gvt *gvt, unsigned int offset) +{ + gvt->mmio.mmio_attribute[offset >> 2] |= F_CMD_ACCESSED; +} + +/** + * intel_gvt_mmio_has_mode_mask - if a MMIO has a mode mask + * @gvt: a GVT device + * @offset: register offset + * + * Returns: + * True if a MMIO has a mode mask in its higher 16 bits, false if it isn't. + * + */ +static inline bool intel_gvt_mmio_has_mode_mask( + struct intel_gvt *gvt, unsigned int offset) +{ + return gvt->mmio.mmio_attribute[offset >> 2] & F_MODE_MASK; +} + +#include "trace.h" #include "mpt.h" #endif diff --git a/drivers/gpu/drm/i915/gvt/handlers.c b/drivers/gpu/drm/i915/gvt/handlers.c index 0ffd696545927..a321196089996 100644 --- a/drivers/gpu/drm/i915/gvt/handlers.c +++ b/drivers/gpu/drm/i915/gvt/handlers.c @@ -47,21 +47,6 @@ #define PCH_PP_OFF_DELAYS _MMIO(0xc720c) #define PCH_PP_DIVISOR _MMIO(0xc7210) -/* Register contains RO bits */ -#define F_RO (1 << 0) -/* Register contains graphics address */ -#define F_GMADR (1 << 1) -/* Mode mask registers with high 16 bits as the mask bits */ -#define F_MODE_MASK (1 << 2) -/* This reg can be accessed by GPU commands */ -#define F_CMD_ACCESS (1 << 3) -/* This reg has been accessed by a VM */ -#define F_ACCESSED (1 << 4) -/* This reg has been accessed through GPU commands */ -#define F_CMD_ACCESSED (1 << 5) -/* This reg could be accessed by unaligned address */ -#define F_UNALIGN (1 << 6) - unsigned long intel_gvt_get_device_type(struct intel_gvt *gvt) { if (IS_BROADWELL(gvt->dev_priv)) @@ -92,11 +77,22 @@ static void write_vreg(struct intel_vgpu *vgpu, unsigned int offset, memcpy(&vgpu_vreg(vgpu, offset), p_data, bytes); } +static struct intel_gvt_mmio_info *find_mmio_info(struct intel_gvt *gvt, + unsigned int offset) +{ + struct intel_gvt_mmio_info *e; + + hash_for_each_possible(gvt->mmio.mmio_info_table, e, node, offset) { + if (e->offset == offset) + return e; + } + return NULL; +} + static int new_mmio_info(struct intel_gvt *gvt, - u32 offset, u32 flags, u32 size, + u32 offset, u8 flags, u32 size, u32 addr_mask, u32 ro_mask, u32 device, - int (*read)(struct intel_vgpu *, unsigned int, void *, unsigned int), - int (*write)(struct intel_vgpu *, unsigned int, void *, unsigned int)) + gvt_mmio_func read, gvt_mmio_func write) { struct intel_gvt_mmio_info *info, *p; u32 start, end, i; @@ -116,13 +112,19 @@ static int new_mmio_info(struct intel_gvt *gvt, return -ENOMEM; info->offset = i; - p = intel_gvt_find_mmio_info(gvt, info->offset); - if (p) - gvt_err("dup mmio definition offset %x\n", + p = find_mmio_info(gvt, info->offset); + if (p) { + WARN(1, "dup mmio definition offset %x\n", info->offset); - info->size = size; - info->length = (i + 4) < end ? 4 : (end - i); - info->addr_mask = addr_mask; + kfree(info); + + /* We return -EEXIST here to make GVT-g load fail. + * So duplicated MMIO can be found as soon as + * possible. + */ + return -EEXIST; + } + info->ro_mask = ro_mask; info->device = device; info->read = read ? 
read : intel_vgpu_default_mmio_read; @@ -130,6 +132,7 @@ static int new_mmio_info(struct intel_gvt *gvt, gvt->mmio.mmio_attribute[info->offset / 4] = flags; INIT_HLIST_NODE(&info->node); hash_add(gvt->mmio.mmio_info_table, &info->node, info->offset); + gvt->mmio.num_tracked_mmio++; } return 0; } @@ -209,6 +212,7 @@ static int fence_mmio_read(struct intel_vgpu *vgpu, unsigned int off, static int fence_mmio_write(struct intel_vgpu *vgpu, unsigned int off, void *p_data, unsigned int bytes) { + struct drm_i915_private *dev_priv = vgpu->gvt->dev_priv; unsigned int fence_num = offset_to_fence_num(off); int ret; @@ -217,8 +221,10 @@ static int fence_mmio_write(struct intel_vgpu *vgpu, unsigned int off, return ret; write_vreg(vgpu, off, p_data, bytes); + mmio_hw_access_pre(dev_priv); intel_vgpu_write_fence(vgpu, fence_num, vgpu_vreg64(vgpu, fence_num_to_offset(fence_num))); + mmio_hw_access_post(dev_priv); return 0; } @@ -300,6 +306,9 @@ static int gdrst_mmio_write(struct intel_vgpu *vgpu, unsigned int offset, intel_gvt_reset_vgpu_locked(vgpu, false, engine_mask); + /* sw will wait for the device to ack the reset request */ + vgpu_vreg(vgpu, offset) = 0; + return 0; } @@ -366,21 +375,24 @@ static int lcpll_ctl_mmio_write(struct intel_vgpu *vgpu, unsigned int offset, static int dpy_reg_mmio_read(struct intel_vgpu *vgpu, unsigned int offset, void *p_data, unsigned int bytes) { - *(u32 *)p_data = (1 << 17); - return 0; -} - -static int dpy_reg_mmio_read_2(struct intel_vgpu *vgpu, unsigned int offset, - void *p_data, unsigned int bytes) -{ - *(u32 *)p_data = 3; - return 0; -} + switch (offset) { + case 0xe651c: + case 0xe661c: + case 0xe671c: + case 0xe681c: + vgpu_vreg(vgpu, offset) = 1 << 17; + break; + case 0xe6c04: + vgpu_vreg(vgpu, offset) = 0x3; + break; + case 0xe6e1c: + vgpu_vreg(vgpu, offset) = 0x2f << 16; + break; + default: + return -EINVAL; + } -static int dpy_reg_mmio_read_3(struct intel_vgpu *vgpu, unsigned int offset, - void *p_data, unsigned int bytes) -{ - *(u32 *)p_data = (0x2f << 16); + read_vreg(vgpu, offset, p_data, bytes); return 0; } @@ -1265,7 +1277,10 @@ static int gen9_trtte_write(struct intel_vgpu *vgpu, unsigned int offset, } write_vreg(vgpu, offset, p_data, bytes); /* TRTTE is not per-context */ + + mmio_hw_access_pre(dev_priv); I915_WRITE(_MMIO(offset), vgpu_vreg(vgpu, offset)); + mmio_hw_access_post(dev_priv); return 0; } @@ -1278,7 +1293,9 @@ static int gen9_trtt_chicken_write(struct intel_vgpu *vgpu, unsigned int offset, if (val & 1) { /* unblock hw logic */ + mmio_hw_access_pre(dev_priv); I915_WRITE(_MMIO(offset), val); + mmio_hw_access_post(dev_priv); } write_vreg(vgpu, offset, p_data, bytes); return 0; @@ -1415,7 +1432,20 @@ static int ring_timestamp_mmio_read(struct intel_vgpu *vgpu, { struct drm_i915_private *dev_priv = vgpu->gvt->dev_priv; + mmio_hw_access_pre(dev_priv); vgpu_vreg(vgpu, offset) = I915_READ(_MMIO(offset)); + mmio_hw_access_post(dev_priv); + return intel_vgpu_default_mmio_read(vgpu, offset, p_data, bytes); +} + +static int instdone_mmio_read(struct intel_vgpu *vgpu, + unsigned int offset, void *p_data, unsigned int bytes) +{ + struct drm_i915_private *dev_priv = vgpu->gvt->dev_priv; + + mmio_hw_access_pre(dev_priv); + vgpu_vreg(vgpu, offset) = I915_READ(_MMIO(offset)); + mmio_hw_access_post(dev_priv); return intel_vgpu_default_mmio_read(vgpu, offset, p_data, bytes); } @@ -1603,6 +1633,12 @@ static int init_generic_mmio_info(struct intel_gvt *gvt) MMIO_RING_DFH(RING_REG, D_ALL, F_CMD_ACCESS, NULL, NULL); #undef RING_REG +#define RING_REG(base) (base 
+ 0x6c) + MMIO_RING_DFH(RING_REG, D_ALL, 0, instdone_mmio_read, NULL); + MMIO_DH(RING_REG(GEN8_BSD2_RING_BASE), D_ALL, instdone_mmio_read, NULL); +#undef RING_REG + MMIO_DH(GEN7_SC_INSTDONE, D_HSW_PLUS, instdone_mmio_read, NULL); + MMIO_GM_RDR(0x2148, D_ALL, NULL, NULL); MMIO_GM_RDR(CCID, D_ALL, NULL, NULL); MMIO_GM_RDR(0x12198, D_ALL, NULL, NULL); @@ -1779,10 +1815,6 @@ static int init_generic_mmio_info(struct intel_gvt *gvt) MMIO_D(SPRSCALE(PIPE_C), D_ALL); MMIO_D(SPRSURFLIVE(PIPE_C), D_ALL); - MMIO_F(LGC_PALETTE(PIPE_A, 0), 4 * 256, 0, 0, 0, D_ALL, NULL, NULL); - MMIO_F(LGC_PALETTE(PIPE_B, 0), 4 * 256, 0, 0, 0, D_ALL, NULL, NULL); - MMIO_F(LGC_PALETTE(PIPE_C, 0), 4 * 256, 0, 0, 0, D_ALL, NULL, NULL); - MMIO_D(HTOTAL(TRANSCODER_A), D_ALL); MMIO_D(HBLANK(TRANSCODER_A), D_ALL); MMIO_D(HSYNC(TRANSCODER_A), D_ALL); @@ -1905,7 +1937,7 @@ static int init_generic_mmio_info(struct intel_gvt *gvt) MMIO_F(_PCH_DPD_AUX_CH_CTL, 6 * 4, 0, 0, 0, D_PRE_SKL, NULL, dp_aux_ch_ctl_mmio_write); - MMIO_RO(PCH_ADPA, D_ALL, 0, ADPA_CRT_HOTPLUG_MONITOR_MASK, NULL, pch_adpa_mmio_write); + MMIO_DH(PCH_ADPA, D_PRE_SKL, NULL, pch_adpa_mmio_write); MMIO_DH(_PCH_TRANSACONF, D_ALL, NULL, transconf_mmio_write); MMIO_DH(_PCH_TRANSBCONF, D_ALL, NULL, transconf_mmio_write); @@ -1991,8 +2023,8 @@ static int init_generic_mmio_info(struct intel_gvt *gvt) MMIO_DH(0xe661c, D_ALL, dpy_reg_mmio_read, NULL); MMIO_DH(0xe671c, D_ALL, dpy_reg_mmio_read, NULL); MMIO_DH(0xe681c, D_ALL, dpy_reg_mmio_read, NULL); - MMIO_DH(0xe6c04, D_ALL, dpy_reg_mmio_read_2, NULL); - MMIO_DH(0xe6e1c, D_ALL, dpy_reg_mmio_read_3, NULL); + MMIO_DH(0xe6c04, D_ALL, dpy_reg_mmio_read, NULL); + MMIO_DH(0xe6e1c, D_ALL, dpy_reg_mmio_read, NULL); MMIO_RO(PCH_PORT_HOTPLUG, D_ALL, 0, PORTA_HOTPLUG_STATUS_MASK @@ -2230,11 +2262,8 @@ static int init_generic_mmio_info(struct intel_gvt *gvt) MMIO_DH(GEN6_GDRST, D_ALL, NULL, gdrst_mmio_write); MMIO_F(FENCE_REG_GEN6_LO(0), 0x80, 0, 0, 0, D_ALL, fence_mmio_read, fence_mmio_write); - MMIO_F(VGT_PVINFO_PAGE, VGT_PVINFO_SIZE, F_UNALIGN, 0, 0, D_ALL, pvinfo_mmio_read, pvinfo_mmio_write); MMIO_DH(CPU_VGACNTRL, D_ALL, NULL, vga_control_mmio_write); - MMIO_F(MCHBAR_MIRROR_BASE_SNB, 0x40000, 0, 0, 0, D_ALL, NULL, NULL); - MMIO_D(TILECTL, D_ALL); MMIO_D(GEN6_UCGCTL1, D_ALL); @@ -2563,7 +2592,6 @@ static int init_broadwell_mmio_info(struct intel_gvt *gvt) MMIO_F(0x24d0, 48, F_CMD_ACCESS, 0, 0, D_BDW_PLUS, NULL, force_nonpriv_write); - MMIO_D(0x22040, D_BDW_PLUS); MMIO_D(0x44484, D_BDW_PLUS); MMIO_D(0x4448c, D_BDW_PLUS); @@ -2621,7 +2649,6 @@ static int init_skl_mmio_info(struct intel_gvt *gvt) MMIO_D(HSW_PWR_WELL_BIOS, D_SKL_PLUS); MMIO_DH(HSW_PWR_WELL_DRIVER, D_SKL_PLUS, NULL, skl_power_well_ctl_write); - MMIO_DH(GEN6_PCODE_MAILBOX, D_SKL_PLUS, NULL, mailbox_write); MMIO_D(0xa210, D_SKL_PLUS); MMIO_D(GEN9_MEDIA_PG_IDLE_HYSTERESIS, D_SKL_PLUS); @@ -2766,7 +2793,6 @@ static int init_skl_mmio_info(struct intel_gvt *gvt) MMIO_D(0x72380, D_SKL_PLUS); MMIO_D(0x7039c, D_SKL_PLUS); - MMIO_F(0x80000, 0x3000, 0, 0, 0, D_SKL_PLUS, NULL, NULL); MMIO_D(0x8f074, D_SKL | D_KBL); MMIO_D(0x8f004, D_SKL | D_KBL); MMIO_D(0x8f034, D_SKL | D_KBL); @@ -2814,7 +2840,6 @@ static int init_skl_mmio_info(struct intel_gvt *gvt) MMIO_D(0x320f0, D_SKL | D_KBL); MMIO_DFH(_REG_VCS2_EXCC, D_SKL_PLUS, F_CMD_ACCESS, NULL, NULL); - MMIO_DFH(_REG_VECS_EXCC, D_SKL_PLUS, F_CMD_ACCESS, NULL, NULL); MMIO_D(0x70034, D_SKL_PLUS); MMIO_D(0x71034, D_SKL_PLUS); MMIO_D(0x72034, D_SKL_PLUS); @@ -2832,34 +2857,25 @@ static int init_skl_mmio_info(struct intel_gvt *gvt) NULL, 
NULL); MMIO_D(0x4ab8, D_KBL); - MMIO_D(0x940c, D_SKL_PLUS); MMIO_D(0x2248, D_SKL_PLUS | D_KBL); - MMIO_D(0x4ab0, D_SKL | D_KBL); - MMIO_D(0x20d4, D_SKL | D_KBL); return 0; } -/** - * intel_gvt_find_mmio_info - find MMIO information entry by aligned offset - * @gvt: GVT device - * @offset: register offset - * - * This function is used to find the MMIO information entry from hash table - * - * Returns: - * pointer to MMIO information entry, NULL if not exists - */ -struct intel_gvt_mmio_info *intel_gvt_find_mmio_info(struct intel_gvt *gvt, - unsigned int offset) +static struct gvt_mmio_block *find_mmio_block(struct intel_gvt *gvt, + unsigned int offset) { - struct intel_gvt_mmio_info *e; - - WARN_ON(!IS_ALIGNED(offset, 4)); + unsigned long device = intel_gvt_get_device_type(gvt); + struct gvt_mmio_block *block = gvt->mmio.mmio_block; + int num = gvt->mmio.num_mmio_block; + int i; - hash_for_each_possible(gvt->mmio.mmio_info_table, e, node, offset) { - if (e->offset == offset) - return e; + for (i = 0; i < num; i++, block++) { + if (!(device & block->device)) + continue; + if (offset >= INTEL_GVT_MMIO_OFFSET(block->offset) && + offset < INTEL_GVT_MMIO_OFFSET(block->offset) + block->size) + return block; } return NULL; } @@ -2885,6 +2901,17 @@ void intel_gvt_clean_mmio_info(struct intel_gvt *gvt) gvt->mmio.mmio_attribute = NULL; } +/* Special MMIO blocks. */ +static struct gvt_mmio_block mmio_blocks[] = { + {D_SKL_PLUS, _MMIO(CSR_MMIO_START_RANGE), 0x3000, NULL, NULL}, + {D_ALL, _MMIO(MCHBAR_MIRROR_BASE_SNB), 0x40000, NULL, NULL}, + {D_ALL, _MMIO(VGT_PVINFO_PAGE), VGT_PVINFO_SIZE, + pvinfo_mmio_read, pvinfo_mmio_write}, + {D_ALL, LGC_PALETTE(PIPE_A, 0), 1024, NULL, NULL}, + {D_ALL, LGC_PALETTE(PIPE_B, 0), 1024, NULL, NULL}, + {D_ALL, LGC_PALETTE(PIPE_C, 0), 1024, NULL, NULL}, +}; + /** * intel_gvt_setup_mmio_info - setup MMIO information table for GVT device * @gvt: GVT device @@ -2899,9 +2926,10 @@ int intel_gvt_setup_mmio_info(struct intel_gvt *gvt) { struct intel_gvt_device_info *info = &gvt->device_info; struct drm_i915_private *dev_priv = gvt->dev_priv; + int size = info->mmio_size / 4 * sizeof(*gvt->mmio.mmio_attribute); int ret; - gvt->mmio.mmio_attribute = vzalloc(info->mmio_size); + gvt->mmio.mmio_attribute = vzalloc(size); if (!gvt->mmio.mmio_attribute) return -ENOMEM; @@ -2922,77 +2950,18 @@ int intel_gvt_setup_mmio_info(struct intel_gvt *gvt) if (ret) goto err; } + + gvt->mmio.mmio_block = mmio_blocks; + gvt->mmio.num_mmio_block = ARRAY_SIZE(mmio_blocks); + + gvt_dbg_mmio("traced %u virtual mmio registers\n", + gvt->mmio.num_tracked_mmio); return 0; err: intel_gvt_clean_mmio_info(gvt); return ret; } -/** - * intel_gvt_mmio_set_accessed - mark a MMIO has been accessed - * @gvt: a GVT device - * @offset: register offset - * - */ -void intel_gvt_mmio_set_accessed(struct intel_gvt *gvt, unsigned int offset) -{ - gvt->mmio.mmio_attribute[offset >> 2] |= - F_ACCESSED; -} - -/** - * intel_gvt_mmio_is_cmd_accessed - mark a MMIO could be accessed by command - * @gvt: a GVT device - * @offset: register offset - * - */ -bool intel_gvt_mmio_is_cmd_access(struct intel_gvt *gvt, - unsigned int offset) -{ - return gvt->mmio.mmio_attribute[offset >> 2] & - F_CMD_ACCESS; -} - -/** - * intel_gvt_mmio_is_unalign - mark a MMIO could be accessed unaligned - * @gvt: a GVT device - * @offset: register offset - * - */ -bool intel_gvt_mmio_is_unalign(struct intel_gvt *gvt, - unsigned int offset) -{ - return gvt->mmio.mmio_attribute[offset >> 2] & - F_UNALIGN; -} - -/** - * intel_gvt_mmio_set_cmd_accessed - 
mark a MMIO has been accessed by command - * @gvt: a GVT device - * @offset: register offset - * - */ -void intel_gvt_mmio_set_cmd_accessed(struct intel_gvt *gvt, - unsigned int offset) -{ - gvt->mmio.mmio_attribute[offset >> 2] |= - F_CMD_ACCESSED; -} - -/** - * intel_gvt_mmio_has_mode_mask - if a MMIO has a mode mask - * @gvt: a GVT device - * @offset: register offset - * - * Returns: - * True if a MMIO has a mode mask in its higher 16 bits, false if it isn't. - * - */ -bool intel_gvt_mmio_has_mode_mask(struct intel_gvt *gvt, unsigned int offset) -{ - return gvt->mmio.mmio_attribute[offset >> 2] & - F_MODE_MASK; -} /** * intel_vgpu_default_mmio_read - default MMIO read handler @@ -3044,3 +3013,91 @@ bool intel_gvt_in_force_nonpriv_whitelist(struct intel_gvt *gvt, { return in_whitelist(offset); } + +/** + * intel_vgpu_mmio_reg_rw - emulate tracked mmio registers + * @vgpu: a vGPU + * @offset: register offset + * @pdata: data buffer + * @bytes: data length + * + * Returns: + * Zero on success, negative error code if failed. + */ +int intel_vgpu_mmio_reg_rw(struct intel_vgpu *vgpu, unsigned int offset, + void *pdata, unsigned int bytes, bool is_read) +{ + struct intel_gvt *gvt = vgpu->gvt; + struct intel_gvt_mmio_info *mmio_info; + struct gvt_mmio_block *mmio_block; + gvt_mmio_func func; + int ret; + + if (WARN_ON(bytes > 8)) + return -EINVAL; + + /* + * Handle special MMIO blocks. + */ + mmio_block = find_mmio_block(gvt, offset); + if (mmio_block) { + func = is_read ? mmio_block->read : mmio_block->write; + if (func) + return func(vgpu, offset, pdata, bytes); + goto default_rw; + } + + /* + * Normal tracked MMIOs. + */ + mmio_info = find_mmio_info(gvt, offset); + if (!mmio_info) { + if (!vgpu->mmio.disable_warn_untrack) + gvt_vgpu_err("untracked MMIO %08x len %d\n", + offset, bytes); + goto default_rw; + } + + if (is_read) + return mmio_info->read(vgpu, offset, pdata, bytes); + else { + u64 ro_mask = mmio_info->ro_mask; + u32 old_vreg = 0, old_sreg = 0; + u64 data = 0; + + if (intel_gvt_mmio_has_mode_mask(gvt, mmio_info->offset)) { + old_vreg = vgpu_vreg(vgpu, offset); + old_sreg = vgpu_sreg(vgpu, offset); + } + + if (likely(!ro_mask)) + ret = mmio_info->write(vgpu, offset, pdata, bytes); + else if (!~ro_mask) { + gvt_vgpu_err("try to write RO reg %x\n", offset); + return 0; + } else { + /* keep the RO bits in the virtual register */ + memcpy(&data, pdata, bytes); + data &= ~ro_mask; + data |= vgpu_vreg(vgpu, offset) & ro_mask; + ret = mmio_info->write(vgpu, offset, &data, bytes); + } + + /* higher 16bits of mode ctl regs are mask bits for change */ + if (intel_gvt_mmio_has_mode_mask(gvt, mmio_info->offset)) { + u32 mask = vgpu_vreg(vgpu, offset) >> 16; + + vgpu_vreg(vgpu, offset) = (old_vreg & ~mask) + | (vgpu_vreg(vgpu, offset) & mask); + vgpu_sreg(vgpu, offset) = (old_sreg & ~mask) + | (vgpu_sreg(vgpu, offset) & mask); + } + } + + return ret; + +default_rw: + return is_read ? 
+ intel_vgpu_default_mmio_read(vgpu, offset, pdata, bytes) : + intel_vgpu_default_mmio_write(vgpu, offset, pdata, bytes); +} diff --git a/drivers/gpu/drm/i915/gvt/interrupt.c b/drivers/gpu/drm/i915/gvt/interrupt.c index 9d6812f0957f0..7a041b368f688 100644 --- a/drivers/gpu/drm/i915/gvt/interrupt.c +++ b/drivers/gpu/drm/i915/gvt/interrupt.c @@ -31,6 +31,7 @@ #include "i915_drv.h" #include "gvt.h" +#include "trace.h" /* common offset among interrupt control registers */ #define regbase_to_isr(base) (base) @@ -178,8 +179,8 @@ int intel_vgpu_reg_imr_handler(struct intel_vgpu *vgpu, struct intel_gvt_irq_ops *ops = gvt->irq.ops; u32 imr = *(u32 *)p_data; - gvt_dbg_irq("write IMR %x, new %08x, old %08x, changed %08x\n", - reg, imr, vgpu_vreg(vgpu, reg), vgpu_vreg(vgpu, reg) ^ imr); + trace_write_ir(vgpu->id, "IMR", reg, imr, vgpu_vreg(vgpu, reg), + (vgpu_vreg(vgpu, reg) ^ imr)); vgpu_vreg(vgpu, reg) = imr; @@ -209,8 +210,8 @@ int intel_vgpu_reg_master_irq_handler(struct intel_vgpu *vgpu, u32 ier = *(u32 *)p_data; u32 virtual_ier = vgpu_vreg(vgpu, reg); - gvt_dbg_irq("write MASTER_IRQ %x, new %08x, old %08x, changed %08x\n", - reg, ier, virtual_ier, virtual_ier ^ ier); + trace_write_ir(vgpu->id, "MASTER_IRQ", reg, ier, virtual_ier, + (virtual_ier ^ ier)); /* * GEN8_MASTER_IRQ is a special irq register, @@ -248,8 +249,8 @@ int intel_vgpu_reg_ier_handler(struct intel_vgpu *vgpu, struct intel_gvt_irq_info *info; u32 ier = *(u32 *)p_data; - gvt_dbg_irq("write IER %x, new %08x, old %08x, changed %08x\n", - reg, ier, vgpu_vreg(vgpu, reg), vgpu_vreg(vgpu, reg) ^ ier); + trace_write_ir(vgpu->id, "IER", reg, ier, vgpu_vreg(vgpu, reg), + (vgpu_vreg(vgpu, reg) ^ ier)); vgpu_vreg(vgpu, reg) = ier; @@ -285,8 +286,8 @@ int intel_vgpu_reg_iir_handler(struct intel_vgpu *vgpu, unsigned int reg, iir_to_regbase(reg)); u32 iir = *(u32 *)p_data; - gvt_dbg_irq("write IIR %x, new %08x, old %08x, changed %08x\n", - reg, iir, vgpu_vreg(vgpu, reg), vgpu_vreg(vgpu, reg) ^ iir); + trace_write_ir(vgpu->id, "IIR", reg, iir, vgpu_vreg(vgpu, reg), + (vgpu_vreg(vgpu, reg) ^ iir)); if (WARN_ON(!info)) return -EINVAL; @@ -411,8 +412,7 @@ static void propagate_event(struct intel_gvt_irq *irq, if (!test_bit(bit, (void *)&vgpu_vreg(vgpu, regbase_to_imr(reg_base)))) { - gvt_dbg_irq("set bit (%d) for (%s) for vgpu (%d)\n", - bit, irq_name[event], vgpu->id); + trace_propagate_event(vgpu->id, irq_name[event], bit); set_bit(bit, (void *)&vgpu_vreg(vgpu, regbase_to_iir(reg_base))); } diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c index 1ae0b4083ce10..6c27419f5fb25 100644 --- a/drivers/gpu/drm/i915/gvt/kvmgt.c +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c @@ -53,11 +53,21 @@ static const struct intel_gvt_ops *intel_gvt_ops; #define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT) #define VFIO_PCI_OFFSET_MASK (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1) +struct vfio_region; +struct intel_vgpu_regops { + size_t (*rw)(struct intel_vgpu *vgpu, char *buf, + size_t count, loff_t *ppos, bool iswrite); + void (*release)(struct intel_vgpu *vgpu, + struct vfio_region *region); +}; + struct vfio_region { u32 type; u32 subtype; size_t size; u32 flags; + const struct intel_vgpu_regops *ops; + void *data; }; struct kvmgt_pgfn { @@ -426,6 +436,131 @@ static void kvmgt_protect_table_del(struct kvmgt_guest_info *info, } } +static size_t intel_vgpu_reg_rw_device_state(struct intel_vgpu *vgpu, char *buf, + size_t count, loff_t *ppos, bool iswrite) +{ + unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - 
VFIO_PCI_NUM_REGIONS; + void *base = vgpu->vdev.region[i].data; + loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; + uint8_t state; + int rc = 0; + + if (pos >= vgpu->vdev.region[i].size) { + gvt_vgpu_err("invalid offset for Intel vgpu device state region\n"); + rc = -EINVAL; + goto exit; + } + + if (pos == 0) { + if (count != 1) { + rc = -EFAULT; + goto exit; + } + + if (iswrite) { + if (copy_from_user(&state, buf, count)) { + rc = -EFAULT; + goto exit; + } + switch (state) { + case VFIO_DEVICE_STOP: + intel_gvt_ops->vgpu_deactivate(vgpu); + break; + case VFIO_DEVICE_START: + intel_gvt_ops->vgpu_activate(vgpu); + break; + default: + rc = -EFAULT; + goto exit; + } + memcpy(base, &state, count); + } else { + if (copy_to_user(buf, base, count)) + rc = -EFAULT; + } + } else { + if (iswrite) { + if (copy_from_user(base + pos, buf, count)) { + rc = -EFAULT; + goto exit; + } + + rc = intel_gvt_ops->vgpu_save_restore(vgpu, + buf, count, base, pos, iswrite); + } else { + if (intel_gvt_ops->vgpu_save_restore(vgpu, + buf, count, base, pos, iswrite) != 0) { + rc = -EFAULT; + goto exit; + } + + if (copy_to_user(buf, base + pos, count)) + rc = -EFAULT; + } + } +exit: + return rc; +} + +static void intel_vgpu_reg_release_device_state(struct intel_vgpu *vgpu, + struct vfio_region *region) +{ + vfree(region->data); +} + +static const struct intel_vgpu_regops intel_vgpu_regops_device_state = { + .rw = intel_vgpu_reg_rw_device_state, + .release = intel_vgpu_reg_release_device_state, +}; + +static int intel_vgpu_register_region(struct intel_vgpu *vgpu, + unsigned int type, unsigned int subtype, + const struct intel_vgpu_regops *ops, + size_t size, u32 flags, void *data) +{ + struct vfio_region *region; + + region = krealloc(vgpu->vdev.region, + (vgpu->vdev.num_regions + 1) * sizeof(*region), + GFP_KERNEL); + if (!region) + return -ENOMEM; + + vgpu->vdev.region = region; + vgpu->vdev.region[vgpu->vdev.num_regions].type = type; + vgpu->vdev.region[vgpu->vdev.num_regions].subtype = subtype; + vgpu->vdev.region[vgpu->vdev.num_regions].ops = ops; + vgpu->vdev.region[vgpu->vdev.num_regions].size = size; + vgpu->vdev.region[vgpu->vdev.num_regions].flags = flags; + vgpu->vdev.region[vgpu->vdev.num_regions].data = data; + vgpu->vdev.num_regions++; + + return 0; +} + +static int kvmgt_init_migration(struct intel_vgpu *vgpu) +{ + void *base; + int ret; + + base = vzalloc(MIGRATION_IMG_MAX_SIZE); + if (base == NULL) + return -ENOMEM; + + ret = intel_vgpu_register_region(vgpu, + PCI_VENDOR_ID_INTEL | VFIO_REGION_TYPE_PCI_VENDOR_TYPE, + VFIO_REGION_SUBTYPE_DEVICE_STATE, + &intel_vgpu_regops_device_state, MIGRATION_IMG_MAX_SIZE, + VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE, + base); + if (ret) + vfree(base); + + return ret; + +} + static int intel_vgpu_create(struct kobject *kobj, struct mdev_device *mdev) { struct intel_vgpu *vgpu = NULL; @@ -546,6 +681,8 @@ static int intel_vgpu_open(struct mdev_device *mdev) if (ret) goto undo_group; + kvmgt_init_migration(vgpu); + intel_gvt_ops->vgpu_activate(vgpu); atomic_set(&vgpu->vdev.released, 0); @@ -566,6 +703,7 @@ static void __intel_vgpu_release(struct intel_vgpu *vgpu) { struct kvmgt_guest_info *info; int ret; + int i; if (!handle_valid(vgpu->handle)) return; @@ -575,6 +713,13 @@ static void __intel_vgpu_release(struct intel_vgpu *vgpu) intel_gvt_ops->vgpu_deactivate(vgpu); + for (i = 0; i < vgpu->vdev.num_regions; i++) + vgpu->vdev.region[i].ops->release(vgpu, &vgpu->vdev.region[i]); + + vgpu->vdev.num_regions = 0; + kfree(vgpu->vdev.region); + vgpu->vdev.region = 
NULL; + ret = vfio_unregister_notifier(mdev_dev(vgpu->vdev.mdev), VFIO_IOMMU_NOTIFY, &vgpu->vdev.iommu_notifier); WARN(ret, "vfio_unregister_notifier for iommu failed: %d\n", ret); @@ -642,7 +787,7 @@ static ssize_t intel_vgpu_rw(struct mdev_device *mdev, char *buf, int ret = -EINVAL; - if (index >= VFIO_PCI_NUM_REGIONS) { + if (index >= VFIO_PCI_NUM_REGIONS + vgpu->vdev.num_regions) { gvt_vgpu_err("invalid index: %u\n", index); return -EINVAL; } @@ -676,8 +821,11 @@ static ssize_t intel_vgpu_rw(struct mdev_device *mdev, char *buf, case VFIO_PCI_BAR5_REGION_INDEX: case VFIO_PCI_VGA_REGION_INDEX: case VFIO_PCI_ROM_REGION_INDEX: + break; default: - gvt_vgpu_err("unsupported region: %u\n", index); + index -= VFIO_PCI_NUM_REGIONS; + ret = vgpu->vdev.region[index].ops->rw(vgpu, buf, + count, ppos, is_write); } return ret == 0 ? count : ret; @@ -688,6 +836,10 @@ static ssize_t intel_vgpu_read(struct mdev_device *mdev, char __user *buf, { unsigned int done = 0; int ret; + unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); + + if (index >= VFIO_PCI_NUM_REGIONS) + return intel_vgpu_rw(mdev, (char *)buf, count, ppos, false); while (count) { size_t filled; @@ -748,6 +900,10 @@ static ssize_t intel_vgpu_write(struct mdev_device *mdev, { unsigned int done = 0; int ret; + unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); + + if (index >= VFIO_PCI_NUM_REGIONS) + return intel_vgpu_rw(mdev, (char *)buf, count, ppos, true); while (count) { size_t filled; @@ -940,7 +1096,8 @@ static long intel_vgpu_ioctl(struct mdev_device *mdev, unsigned int cmd, info.flags = VFIO_DEVICE_FLAGS_PCI; info.flags |= VFIO_DEVICE_FLAGS_RESET; - info.num_regions = VFIO_PCI_NUM_REGIONS; + info.num_regions = VFIO_PCI_NUM_REGIONS + + vgpu->vdev.num_regions; info.num_irqs = VFIO_PCI_NUM_IRQS; return copy_to_user((void __user *)arg, &info, minsz) ? @@ -966,7 +1123,7 @@ static long intel_vgpu_ioctl(struct mdev_device *mdev, unsigned int cmd, switch (info.index) { case VFIO_PCI_CONFIG_REGION_INDEX: info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = INTEL_GVT_MAX_CFG_SPACE_SZ; + info.size = vgpu->gvt->device_info.cfg_space_size; info.flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE; break; @@ -1002,8 +1159,7 @@ static long intel_vgpu_ioctl(struct mdev_device *mdev, unsigned int cmd, sparse->nr_areas = nr_areas; cap_type_id = VFIO_REGION_INFO_CAP_SPARSE_MMAP; - sparse->areas[0].offset = - PAGE_ALIGN(vgpu_aperture_offset(vgpu)); + sparse->areas[0].offset = 0; sparse->areas[0].size = vgpu_aperture_sz(vgpu); break; @@ -1061,6 +1217,7 @@ static long intel_vgpu_ioctl(struct mdev_device *mdev, unsigned int cmd, } if (caps.size) { + info.flags |= VFIO_REGION_INFO_FLAG_CAPS; if (info.argsz < sizeof(info) + caps.size) { info.argsz = sizeof(info) + caps.size; info.cap_offset = 0; diff --git a/drivers/gpu/drm/i915/gvt/migrate.c b/drivers/gpu/drm/i915/gvt/migrate.c new file mode 100644 index 0000000000000..d989d9baac470 --- /dev/null +++ b/drivers/gpu/drm/i915/gvt/migrate.c @@ -0,0 +1,887 @@ +/* + * Copyright(c) 2011-2016 Intel Corporation. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Yulei Zhang + * Xiao Zheng + */ + +#include "i915_drv.h" +#include "gvt.h" +#include "i915_pvinfo.h" + +#define INV (-1) +#define RULES_NUM(x) (sizeof(x)/sizeof(gvt_migration_obj_t)) +#define FOR_EACH_OBJ(obj, rules) \ + for (obj = rules; obj->region.type != GVT_MIGRATION_NONE; obj++) +#define MIG_VREG_RESTORE(vgpu, off) \ + { \ + u32 data = vgpu_vreg(vgpu, (off)); \ + u64 pa = intel_vgpu_mmio_offset_to_gpa(vgpu, off); \ + intel_vgpu_emulate_mmio_write(vgpu, pa, &data, 4); \ + } + +/* s - struct + * t - type of obj + * m - size of obj + * ops - operation override callback func + */ +#define MIGRATION_UNIT(_s, _t, _m, _ops) { \ +.img = NULL, \ +.region.type = _t, \ +.region.size = _m, \ +.ops = &(_ops), \ +.name = "["#_s":"#_t"]\0" \ +} + +#define MIGRATION_END { \ + NULL, NULL, 0, \ + {GVT_MIGRATION_NONE, 0},\ + NULL, \ + NULL \ +} + +static DEFINE_MUTEX(gvt_migration); +static int image_header_load(const struct gvt_migration_obj_t *obj, u32 size); +static int image_header_save(const struct gvt_migration_obj_t *obj); +static int vreg_load(const struct gvt_migration_obj_t *obj, u32 size); +static int vreg_save(const struct gvt_migration_obj_t *obj); +static int sreg_load(const struct gvt_migration_obj_t *obj, u32 size); +static int sreg_save(const struct gvt_migration_obj_t *obj); +static int vcfg_space_load(const struct gvt_migration_obj_t *obj, u32 size); +static int vcfg_space_save(const struct gvt_migration_obj_t *obj); +static int vggtt_load(const struct gvt_migration_obj_t *obj, u32 size); +static int vggtt_save(const struct gvt_migration_obj_t *obj); +static int workload_load(const struct gvt_migration_obj_t *obj, u32 size); +static int workload_save(const struct gvt_migration_obj_t *obj); +static int ppgtt_load(const struct gvt_migration_obj_t *obj, u32 size); +static int ppgtt_save(const struct gvt_migration_obj_t *obj); +static int opregion_load(const struct gvt_migration_obj_t *obj, u32 size); +static int opregion_save(const struct gvt_migration_obj_t *obj); + +/*********************************************** + * Internal Static Functions + ***********************************************/ +struct gvt_migration_operation_t vReg_ops = { + .pre_copy = NULL, + .pre_save = vreg_save, + .pre_load = vreg_load, + .post_load = NULL, +}; + +struct gvt_migration_operation_t sReg_ops = { + .pre_copy = NULL, + .pre_save = sreg_save, + .pre_load = sreg_load, + 
.post_load = NULL, +}; + +struct gvt_migration_operation_t vcfg_space_ops = { + .pre_copy = NULL, + .pre_save = vcfg_space_save, + .pre_load = vcfg_space_load, + .post_load = NULL, +}; + +struct gvt_migration_operation_t vgtt_info_ops = { + .pre_copy = NULL, + .pre_save = vggtt_save, + .pre_load = vggtt_load, + .post_load = NULL, +}; + +struct gvt_migration_operation_t image_header_ops = { + .pre_copy = NULL, + .pre_save = image_header_save, + .pre_load = image_header_load, + .post_load = NULL, +}; + +struct gvt_migration_operation_t workload_ops = { + .pre_copy = NULL, + .pre_save = workload_save, + .pre_load = workload_load, + .post_load = NULL, +}; + +struct gvt_migration_operation_t ppgtt_ops = { + .pre_copy = NULL, + .pre_save = ppgtt_save, + .pre_load = ppgtt_load, + .post_load = NULL, +}; + +struct gvt_migration_operation_t opregion_ops = { + .pre_copy = NULL, + .pre_save = opregion_save, + .pre_load = opregion_load, + .post_load = NULL, +}; + +/* gvt_device_objs[] are list of gvt_migration_obj_t objs + * Each obj has its operation method to save to qemu image + * and restore from qemu image during the migration. + * + * for each saved bject, it will have a region header + * struct gvt_region_t { + * region_type; + * region_size; + * } + *__________________ _________________ __________________ + *|x64 (Source) | |image region | |x64 (Target) | + *|________________| |________________| |________________| + *| Region A | | Region A | | Region A | + *| Header | | offset=0 | | allocate a page| + *| content | | | | copy data here | + *|----------------| | ... | |----------------| + *| Region B | | ... | | Region B | + *| Header | |----------------| | | + *| content | Region B | | | + *|----------------| | offset=4096 | |----------------| + * | | + * |----------------| + * + * On the target side, it will parser the incoming data copy + * from Qemu image, and apply difference restore handlers depends + * on the region type. 
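+ *
+ * The restore side (vgpu_restore()) walks the image in the same order,
+ * roughly:
+ *
+ *	read the gvt_region_t header at the current offset;
+ *	node = find_migration_obj(region.type);
+ *	node->ops->pre_load(node, region.size);
+ *	advance by sizeof(gvt_region_t) plus the bytes consumed;
+ *
+ * so each handler normally emits its region header even when it has
+ * nothing to transfer (region.size may be 0, e.g. PPGTT or workload).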
+ */ +static struct gvt_migration_obj_t gvt_device_objs[] = { + MIGRATION_UNIT(struct intel_vgpu, + GVT_MIGRATION_HEAD, + sizeof(struct gvt_image_header_t), + image_header_ops), + MIGRATION_UNIT(struct intel_vgpu, + GVT_MIGRATION_CFG_SPACE, + PCI_CFG_SPACE_EXP_SIZE, + vcfg_space_ops), + MIGRATION_UNIT(struct intel_vgpu, + GVT_MIGRATION_SREG, + GVT_MMIO_SIZE, sReg_ops), + MIGRATION_UNIT(struct intel_vgpu, + GVT_MIGRATION_VREG, + GVT_MMIO_SIZE, vReg_ops), + MIGRATION_UNIT(struct intel_vgpu, + GVT_MIGRATION_GTT, + 0, vgtt_info_ops), + MIGRATION_UNIT(struct intel_vgpu, + GVT_MIGRATION_PPGTT, + 0, ppgtt_ops), + MIGRATION_UNIT(struct intel_vgpu, + GVT_MIGRATION_WORKLOAD, + 0, workload_ops), + MIGRATION_UNIT(struct intel_vgpu, + GVT_MIGRATION_OPREGION, + INTEL_GVT_OPREGION_SIZE, opregion_ops), + MIGRATION_END, +}; + +static inline void +update_image_region_start_pos(struct gvt_migration_obj_t *obj, int pos) +{ + obj->offset = pos; +} + +static inline void +update_image_region_base(struct gvt_migration_obj_t *obj, void *base) +{ + obj->img = base; +} + +static inline void +update_status_region_base(struct gvt_migration_obj_t *obj, void *base) +{ + obj->vgpu = base; +} + +static inline struct gvt_migration_obj_t * +find_migration_obj(enum gvt_migration_type_t type) +{ + struct gvt_migration_obj_t *obj; + + for (obj = gvt_device_objs; + obj->region.type != GVT_MIGRATION_NONE; obj++) + if (obj->region.type == type) + return obj; + return NULL; +} + +static int image_header_save(const struct gvt_migration_obj_t *obj) +{ + struct gvt_region_t region; + struct gvt_image_header_t header; + struct intel_vgpu *vgpu = (struct intel_vgpu *) obj->vgpu; + + region.type = GVT_MIGRATION_HEAD; + region.size = sizeof(struct gvt_image_header_t); + memcpy(obj->img, ®ion, sizeof(struct gvt_region_t)); + + header.version = GVT_MIGRATION_VERSION; + header.data_size = obj->offset; + header.crc_check = 0; /* CRC check skipped for now*/ + + if (intel_gvt_host.hypervisor_type == INTEL_GVT_HYPERVISOR_XEN) { + header.global_data[0] = vgpu->low_mem_max_gpfn; + } + + memcpy(obj->img + sizeof(struct gvt_region_t), &header, + sizeof(struct gvt_image_header_t)); + + return sizeof(struct gvt_region_t) + sizeof(struct gvt_image_header_t); +} + +static int image_header_load(const struct gvt_migration_obj_t *obj, u32 size) +{ + struct gvt_image_header_t header; + struct intel_vgpu *vgpu = (struct intel_vgpu *) obj->vgpu; + + if (unlikely(size != sizeof(struct gvt_image_header_t))) { + gvt_err("migration obj size isn't match between target and image!" 
+ " memsize=%d imgsize=%d\n", + obj->region.size, + size); + return INV; + } + + memcpy(&header, obj->img + obj->offset, + sizeof(struct gvt_image_header_t)); + + if (intel_gvt_host.hypervisor_type == INTEL_GVT_HYPERVISOR_XEN) { + vgpu->low_mem_max_gpfn = header.global_data[0]; + } + + return header.data_size; +} + +static int vcfg_space_save(const struct gvt_migration_obj_t *obj) +{ + struct intel_vgpu *vgpu = (struct intel_vgpu *) obj->vgpu; + int n_transfer = INV; + void *src = vgpu->cfg_space.virtual_cfg_space; + void *des = obj->img + obj->offset; + + memcpy(des, &obj->region, sizeof(struct gvt_region_t)); + + des += sizeof(struct gvt_region_t); + n_transfer = obj->region.size; + + memcpy(des, src, n_transfer); + return sizeof(struct gvt_region_t) + n_transfer; +} + +static int vcfg_space_load(const struct gvt_migration_obj_t *obj, u32 size) +{ + struct intel_vgpu *vgpu = (struct intel_vgpu *) obj->vgpu; + char *dest = vgpu->cfg_space.virtual_cfg_space; + int n_transfer = INV; + + if (unlikely(size != obj->region.size)) { + gvt_err("migration obj size isn't match between target and image!" + " memsize=%d imgsize=%d\n", + obj->region.size, + size); + return n_transfer; + } else { + n_transfer = obj->region.size; + memcpy(dest, obj->img + obj->offset, n_transfer); + } + + if (intel_gvt_host.hypervisor_type == INTEL_GVT_HYPERVISOR_XEN) { +#define MIG_CFG_SPACE_WRITE(off) { \ + u32 data; \ + data = *((u32 *)(dest + (off))); \ + intel_vgpu_emulate_cfg_write(vgpu, (off), &data, sizeof(data)); \ + } + +#define MIG_CFG_SPACE_WRITE_BAR(bar) { \ + u32 data = 0x500; \ + vgpu_cfg_space(vgpu)[PCI_COMMAND] = 0; \ + intel_vgpu_emulate_cfg_write(vgpu, PCI_COMMAND, &data, 2); \ + data = *((u32 *)(dest + (bar))); \ + intel_vgpu_emulate_cfg_write(vgpu, (bar), &data, sizeof(data)); \ + data = *((u32 *)(dest + (bar)+4)); \ + intel_vgpu_emulate_cfg_write(vgpu, (bar)+4, &data, sizeof(data));\ + data = 0x503; \ + intel_vgpu_emulate_cfg_write(vgpu, PCI_COMMAND, &data, 2); \ + } + + /* reconfig bar0,1,2 with source VM's base address. + * TargetVM and SourceVM must have same bar base. + */ + MIG_CFG_SPACE_WRITE_BAR(PCI_BASE_ADDRESS_0); + MIG_CFG_SPACE_WRITE_BAR(PCI_BASE_ADDRESS_2); + MIG_CFG_SPACE_WRITE_BAR(PCI_BASE_ADDRESS_4); + + /* restore OpRegion */ + MIG_CFG_SPACE_WRITE(INTEL_GVT_PCI_OPREGION); + MIG_CFG_SPACE_WRITE(INTEL_GVT_PCI_SWSCI); + } + return n_transfer; +} + +static int sreg_save(const struct gvt_migration_obj_t *obj) +{ + struct intel_vgpu *vgpu = (struct intel_vgpu *) obj->vgpu; + int n_transfer = INV; + void *src = vgpu->mmio.sreg; + void *des = obj->img + obj->offset; + + memcpy(des, &obj->region, sizeof(struct gvt_region_t)); + + des += sizeof(struct gvt_region_t); + n_transfer = obj->region.size; + + memcpy(des, src, n_transfer); + return sizeof(struct gvt_region_t) + n_transfer; +} + +static int sreg_load(const struct gvt_migration_obj_t *obj, u32 size) +{ + struct intel_vgpu *vgpu = (struct intel_vgpu *) obj->vgpu; + void *dest = vgpu->mmio.sreg; + int n_transfer = INV; + + if (unlikely(size != obj->region.size)) { + gvt_err("migration obj size isn't match between target and image!" 
+ " memsize=%d imgsize=%d\n", + obj->region.size, + size); + return n_transfer; + } else { + n_transfer = obj->region.size; + memcpy(dest, obj->img + obj->offset, n_transfer); + } + + return n_transfer; +} + +static int opregion_save(const struct gvt_migration_obj_t *obj) +{ + struct intel_vgpu *vgpu = (struct intel_vgpu *) obj->vgpu; + int n_transfer = INV; + void *src = vgpu->opregion.va; + void *des = obj->img + obj->offset; + + memcpy(des, &obj->region, sizeof(struct gvt_region_t)); + + des += sizeof(struct gvt_region_t); + n_transfer = obj->region.size; + + memcpy(des, src, n_transfer); + return sizeof(struct gvt_region_t) + n_transfer; +} + +static int opregion_load(const struct gvt_migration_obj_t *obj, u32 size) +{ + struct intel_vgpu *vgpu = (struct intel_vgpu *) obj->vgpu; + int n_transfer = INV; + + if (unlikely(size != obj->region.size)) { + gvt_err("migration object size is not match between target \ + and image!!! memsize=%d imgsize=%d\n", + obj->region.size, + size); + return n_transfer; + } else { + vgpu_opregion(vgpu)->va = (void *)__get_free_pages(GFP_KERNEL | + __GFP_ZERO, + get_order(INTEL_GVT_OPREGION_SIZE)); + n_transfer = obj->region.size; + memcpy(vgpu_opregion(vgpu)->va, obj->img + obj->offset, n_transfer); + } + + return n_transfer; +} + +static int ppgtt_save(const struct gvt_migration_obj_t *obj) +{ + struct intel_vgpu *vgpu = (struct intel_vgpu *) obj->vgpu; + struct list_head *pos; + struct intel_vgpu_mm *mm; + struct gvt_ppgtt_entry_t entry; + struct gvt_region_t region; + int num = 0; + u32 sz = sizeof(struct gvt_ppgtt_entry_t); + void *des = obj->img + obj->offset; + + list_for_each(pos, &vgpu->gtt.mm_list_head) { + mm = container_of(pos, struct intel_vgpu_mm, list); + if (mm->type != INTEL_GVT_MM_PPGTT) + continue; + + entry.page_table_level = mm->page_table_level; + memcpy(&entry.pdp, mm->virtual_page_table, 32); + + memcpy(des + sizeof(struct gvt_region_t) + (num * sz), + &entry, sz); + num++; + } + + region.type = GVT_MIGRATION_PPGTT; + region.size = num * sz; + memcpy(des, ®ion, sizeof(struct gvt_region_t)); + + return sizeof(struct gvt_region_t) + region.size; +} + +static int ppgtt_load(const struct gvt_migration_obj_t *obj, u32 size) +{ + struct intel_vgpu *vgpu = (struct intel_vgpu *) obj->vgpu; + int n_transfer = INV; + struct gvt_ppgtt_entry_t entry; + struct intel_vgpu_mm *mm; + void *src = obj->img + obj->offset; + int i; + u32 sz = sizeof(struct gvt_ppgtt_entry_t); + + if (size == 0) + return size; + + if (unlikely(size % sz) != 0) { + gvt_err("migration obj size isn't match between target and image!" 
+ " memsize=%d imgsize=%d\n", + obj->region.size, + size); + return n_transfer; + } + + for (i = 0; i < size / sz; i++) { + memcpy(&entry, src + (i * sz), sz); + mm = intel_vgpu_create_mm(vgpu, INTEL_GVT_MM_PPGTT, + entry.pdp, entry.page_table_level, 0); + if (IS_ERR(mm)) { + gvt_vgpu_err("fail to create mm object.\n"); + return n_transfer; + } + } + + n_transfer = size; + + return n_transfer; +} + +static int vreg_save(const struct gvt_migration_obj_t *obj) +{ + struct intel_vgpu *vgpu = (struct intel_vgpu *) obj->vgpu; + int n_transfer = INV; + void *src = vgpu->mmio.vreg; + void *des = obj->img + obj->offset; + + memcpy(des, &obj->region, sizeof(struct gvt_region_t)); + + des += sizeof(struct gvt_region_t); + n_transfer = obj->region.size; + + memcpy(des, src, n_transfer); + return sizeof(struct gvt_region_t) + n_transfer; +} + +static int vreg_load(const struct gvt_migration_obj_t *obj, u32 size) +{ + struct intel_vgpu *vgpu = (struct intel_vgpu *) obj->vgpu; + void *dest = vgpu->mmio.vreg; + int n_transfer = INV; + struct drm_i915_private *dev_priv = vgpu->gvt->dev_priv; + enum pipe pipe; + + if (unlikely(size != obj->region.size)) { + gvt_err("migration obj size isn't match between target and image!" + " memsize=%d imgsize=%d\n", + obj->region.size, + size); + return n_transfer; + } else { + n_transfer = obj->region.size; + memcpy(dest, obj->img + obj->offset, n_transfer); + } + + //restore vblank emulation + for (pipe = PIPE_A; pipe < I915_MAX_PIPES; ++pipe) + MIG_VREG_RESTORE(vgpu, INTEL_GVT_MMIO_OFFSET(PIPECONF(pipe))); + + return n_transfer; +} + +static int workload_save(const struct gvt_migration_obj_t *obj) +{ + struct intel_vgpu *vgpu = (struct intel_vgpu *) obj->vgpu; + struct drm_i915_private *dev_priv = vgpu->gvt->dev_priv; + struct gvt_region_t region; + struct intel_engine_cs *engine; + struct intel_vgpu_workload *pos, *n; + unsigned int i; + struct gvt_pending_workload_t workload; + void *des = obj->img + obj->offset; + unsigned int num = 0; + u32 sz = sizeof(struct gvt_pending_workload_t); + + for_each_engine(engine, dev_priv, i) { + list_for_each_entry_safe(pos, n, + &vgpu->workload_q_head[engine->id], list) { + workload.ring_id = pos->ring_id; + memcpy(&workload.elsp_dwords, &pos->elsp_dwords, + sizeof(struct intel_vgpu_elsp_dwords)); + memcpy(des + sizeof(struct gvt_region_t) + (num * sz), + &workload, sz); + num++; + } + } + + region.type = GVT_MIGRATION_WORKLOAD; + region.size = num * sz; + memcpy(des, ®ion, sizeof(struct gvt_region_t)); + + return sizeof(struct gvt_region_t) + region.size; +} + +static int workload_load(const struct gvt_migration_obj_t *obj, u32 size) +{ + struct intel_vgpu *vgpu = (struct intel_vgpu *) obj->vgpu; + struct drm_i915_private *dev_priv = vgpu->gvt->dev_priv; + int n_transfer = INV; + struct gvt_pending_workload_t workload; + struct intel_engine_cs *engine; + void *src = obj->img + obj->offset; + u64 pa, off; + u32 sz = sizeof(struct gvt_pending_workload_t); + int i, j; + + if (size == 0) + return size; + + if (unlikely(size % sz) != 0) { + gvt_err("migration obj size isn't match between target and image!" 
+ " memsize=%d imgsize=%d\n", + obj->region.size, + size); + return n_transfer; + } + + for (i = 0; i < size / sz; i++) { + memcpy(&workload, src + (i * sz), sz); + engine = dev_priv->engine[workload.ring_id]; + off = i915_mmio_reg_offset(RING_ELSP(engine)); + pa = intel_vgpu_mmio_offset_to_gpa(vgpu, off); + for (j = 0; j < 4; j++) { + intel_vgpu_emulate_mmio_write(vgpu, pa, + &workload.elsp_dwords.data[j], 4); + } + } + + n_transfer = size; + + return n_transfer; +} + +static int +mig_ggtt_save_restore(struct intel_vgpu_mm *ggtt_mm, + void *data, u64 gm_offset, + u64 gm_sz, + bool save_to_image) +{ + struct intel_vgpu *vgpu = ggtt_mm->vgpu; + struct intel_gvt_gtt_gma_ops *gma_ops = vgpu->gvt->gtt.gma_ops; + + void *ptable; + int sz; + int shift = vgpu->gvt->device_info.gtt_entry_size_shift; + + ptable = ggtt_mm->virtual_page_table + + (gma_ops->gma_to_ggtt_pte_index(gm_offset) << shift); + sz = (gm_sz >> GTT_PAGE_SHIFT) << shift; + + if (save_to_image) + memcpy(data, ptable, sz); + else + memcpy(ptable, data, sz); + + return sz; +} + +static int vggtt_save(const struct gvt_migration_obj_t *obj) +{ + int ret = INV; + struct intel_vgpu *vgpu = (struct intel_vgpu *) obj->vgpu; + struct intel_vgpu_mm *ggtt_mm = vgpu->gtt.ggtt_mm; + void *des = obj->img + obj->offset; + struct gvt_region_t region; + int sz; + + u64 aperture_offset = vgpu_guest_aperture_offset(vgpu); + u64 aperture_sz = vgpu_aperture_sz(vgpu); + u64 hidden_gm_offset = vgpu_guest_hidden_offset(vgpu); + u64 hidden_gm_sz = vgpu_hidden_sz(vgpu); + + des += sizeof(struct gvt_region_t); + + /*TODO:512MB GTT takes total 1024KB page table size, optimization here*/ + + gvt_dbg_core("Guest aperture=0x%llx (HW: 0x%llx)," + "Guest Hidden=0x%llx (HW:0x%llx)\n", + aperture_offset, vgpu_aperture_offset(vgpu), + hidden_gm_offset, vgpu_hidden_offset(vgpu)); + + /*TODO:to be fixed after removal of address ballooning */ + ret = 0; + + /* aperture */ + sz = mig_ggtt_save_restore(ggtt_mm, des, + aperture_offset, aperture_sz, true); + des += sz; + ret += sz; + + /* hidden gm */ + sz = mig_ggtt_save_restore(ggtt_mm, des, + hidden_gm_offset, hidden_gm_sz, true); + des += sz; + ret += sz; + + /* Save the total size of this session */ + region.type = GVT_MIGRATION_GTT; + region.size = ret; + memcpy(obj->img + obj->offset, ®ion, sizeof(struct gvt_region_t)); + + ret += sizeof(struct gvt_region_t); + + return ret; +} + +static int vggtt_load(const struct gvt_migration_obj_t *obj, u32 size) +{ + int ret; + u32 ggtt_index; + void *src; + int sz; + + struct intel_vgpu *vgpu = (struct intel_vgpu *) obj->vgpu; + struct intel_vgpu_mm *ggtt_mm = vgpu->gtt.ggtt_mm; + + int shift = vgpu->gvt->device_info.gtt_entry_size_shift; + + /* offset to bar1 beginning */ + u64 dest_aperture_offset = vgpu_guest_aperture_offset(vgpu); + u64 aperture_sz = vgpu_aperture_sz(vgpu); + u64 dest_hidden_gm_offset = vgpu_guest_hidden_offset(vgpu); + u64 hidden_gm_sz = vgpu_hidden_sz(vgpu); + + gvt_dbg_core("Guest aperture=0x%llx (HW: 0x%llx)," + "Guest Hidden=0x%llx (HW:0x%llx)\n", + dest_aperture_offset, vgpu_aperture_offset(vgpu), + dest_hidden_gm_offset, vgpu_hidden_offset(vgpu)); + + if ((size>>shift) != + ((aperture_sz + hidden_gm_sz) >> GTT_PAGE_SHIFT)) { + gvt_err("ggtt restore failed due to page table size not match\n"); + return INV; + } + + ret = 0; + src = obj->img + obj->offset; + + /* aperture */ + sz = mig_ggtt_save_restore(ggtt_mm, + src, dest_aperture_offset, aperture_sz, false); + src += sz; + ret += sz; + + /* hidden GM */ + sz = mig_ggtt_save_restore(ggtt_mm, src, + 
dest_hidden_gm_offset, hidden_gm_sz, false); + ret += sz; + + /* aperture/hidden GTT emulation from Source to Target */ + for (ggtt_index = 0; ggtt_index < ggtt_mm->page_table_entry_cnt; + ggtt_index++) { + + if (vgpu_gmadr_is_valid(vgpu, ggtt_index<gvt->gtt.pte_ops; + struct intel_gvt_gtt_entry e; + u64 offset; + u64 pa; + + /* TODO: hardcode to 64bit right now */ + offset = vgpu->gvt->device_info.gtt_start_offset + + (ggtt_index<test_present(&e)) { + /* same as gtt_emulate + * _write(vgt, offset, &e.val64, 1<region.type == GVT_MIGRATION_OPREGION) && + (intel_gvt_host.hypervisor_type == INTEL_GVT_HYPERVISOR_KVM)) + continue; + + /* obj will copy data to image file img.offset */ + update_image_region_start_pos(node, n_img_actual_saved); + if (node->ops->pre_save == NULL) { + n_img = 0; + } else { + n_img = node->ops->pre_save(node); + if (n_img == INV) { + gvt_err("Save obj %s failed\n", + node->name); + n_img_actual_saved = INV; + break; + } + } + /* show GREEN on screen with colorred term */ + gvt_dbg_core("Save obj %s success with %d bytes\n", + node->name, n_img); + n_img_actual_saved += n_img; + + if (n_img_actual_saved >= MIGRATION_IMG_MAX_SIZE) { + gvt_err("Image size overflow!!! data=%d MAX=%ld\n", + n_img_actual_saved, + MIGRATION_IMG_MAX_SIZE); + /* Mark as invalid */ + n_img_actual_saved = INV; + break; + } + } + /* update the header with real image size */ + node = find_migration_obj(GVT_MIGRATION_HEAD); + update_image_region_start_pos(node, n_img_actual_saved); + node->ops->pre_save(node); + return n_img_actual_saved; +} + +static int vgpu_restore(void *img) +{ + struct gvt_migration_obj_t *node; + struct gvt_region_t region; + int n_img_actual_recv = 0; + u32 n_img_actual_size; + + /* load image header at first to get real size */ + memcpy(®ion, img, sizeof(struct gvt_region_t)); + if (region.type != GVT_MIGRATION_HEAD) { + gvt_err("Invalid image. Doesn't start with image_head\n"); + return INV; + } + + n_img_actual_recv += sizeof(struct gvt_region_t); + node = find_migration_obj(region.type); + update_image_region_start_pos(node, n_img_actual_recv); + n_img_actual_size = node->ops->pre_load(node, region.size); + if (n_img_actual_size == INV) { + gvt_err("Load img %s failed\n", node->name); + return INV; + } + + if (n_img_actual_size >= MIGRATION_IMG_MAX_SIZE) { + gvt_err("Invalid image. magic_id offset = 0x%x\n", + n_img_actual_size); + return INV; + } + + n_img_actual_recv += sizeof(struct gvt_image_header_t); + + do { + int n_img = INV; + /* parse each region head to get type and size */ + memcpy(®ion, img + n_img_actual_recv, + sizeof(struct gvt_region_t)); + node = find_migration_obj(region.type); + if (node == NULL) + break; + n_img_actual_recv += sizeof(struct gvt_region_t); + update_image_region_start_pos(node, n_img_actual_recv); + + if (node->ops->pre_load == NULL) { + n_img = 0; + } else { + n_img = node->ops->pre_load(node, region.size); + if (n_img == INV) { + /* Error occurred. 
colored as RED */ + gvt_err("Load obj %s failed\n", + node->name); + n_img_actual_recv = INV; + break; + } + } + /* show GREEN on screen with colorred term */ + gvt_dbg_core("Load obj %s success with %d bytes.\n", + node->name, n_img); + n_img_actual_recv += n_img; + } while (n_img_actual_recv < MIGRATION_IMG_MAX_SIZE); + + return n_img_actual_recv; +} + +int intel_gvt_save_restore(struct intel_vgpu *vgpu, char *buf, size_t count, + void *base, uint64_t off, bool restore) +{ + struct gvt_migration_obj_t *node; + int ret = 0; + + mutex_lock(&gvt_migration); + + FOR_EACH_OBJ(node, gvt_device_objs) { + update_image_region_base(node, base + off); + update_image_region_start_pos(node, INV); + update_status_region_base(node, vgpu); + } + + if (restore) { + vgpu->pv_notified = true; + if (vgpu_restore(base + off) == INV) { + ret = -EFAULT; + goto exit; + } + } else { + if (vgpu_save(base + off) == INV) { + ret = -EFAULT; + goto exit; + } + + } + +exit: + mutex_unlock(&gvt_migration); + + return ret; +} diff --git a/drivers/gpu/drm/i915/gvt/migrate.h b/drivers/gpu/drm/i915/gvt/migrate.h new file mode 100644 index 0000000000000..1f818c3377aef --- /dev/null +++ b/drivers/gpu/drm/i915/gvt/migrate.h @@ -0,0 +1,99 @@ +/* + * Copyright(c) 2011-2016 Intel Corporation. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Authors: + * Yulei Zhang + * Xiao Zheng + */ + +#ifndef __GVT_MIGRATE_H__ +#define __GVT_MIGRATE_H__ + +/* Assume 9MB is eough to descript VM kernel state */ +#define MIGRATION_IMG_MAX_SIZE (9*1024UL*1024UL) +#define GVT_MMIO_SIZE (2*1024UL*1024UL) +#define GVT_MIGRATION_VERSION 0 + +enum gvt_migration_type_t { + GVT_MIGRATION_NONE, + GVT_MIGRATION_HEAD, + GVT_MIGRATION_CFG_SPACE, + GVT_MIGRATION_VREG, + GVT_MIGRATION_SREG, + GVT_MIGRATION_GTT, + GVT_MIGRATION_PPGTT, + GVT_MIGRATION_WORKLOAD, + GVT_MIGRATION_OPREGION, +}; + +struct gvt_ppgtt_entry_t { + int page_table_level; + u32 pdp[8]; +}; + +struct gvt_pending_workload_t { + int ring_id; + struct intel_vgpu_elsp_dwords elsp_dwords; +}; + +struct gvt_region_t { + enum gvt_migration_type_t type; + u32 size; /* obj size of bytes to read/write */ +}; + +struct gvt_migration_obj_t { + void *img; + void *vgpu; + u32 offset; + struct gvt_region_t region; + /* operation func defines how data save-restore */ + struct gvt_migration_operation_t *ops; + char *name; +}; + +struct gvt_migration_operation_t { + /* called during pre-copy stage, VM is still alive */ + int (*pre_copy)(const struct gvt_migration_obj_t *obj); + /* called before when VM was paused, + * return bytes transferred + */ + int (*pre_save)(const struct gvt_migration_obj_t *obj); + /* called before load the state of device */ + int (*pre_load)(const struct gvt_migration_obj_t *obj, u32 size); + /* called after load the state of device, VM already alive */ + int (*post_load)(const struct gvt_migration_obj_t *obj, u32 size); +}; + +struct gvt_image_header_t { + int version; + int data_size; + u64 crc_check; + u64 global_data[64]; +}; + +struct gvt_logd_pfn { + struct rb_node node; + unsigned long gfn; + atomic_t ref_count; +}; + +#endif diff --git a/drivers/gpu/drm/i915/gvt/mmio.c b/drivers/gpu/drm/i915/gvt/mmio.c index 1ba3bdb093416..637b3db2c935f 100644 --- a/drivers/gpu/drm/i915/gvt/mmio.c +++ b/drivers/gpu/drm/i915/gvt/mmio.c @@ -50,6 +50,19 @@ int intel_vgpu_gpa_to_mmio_offset(struct intel_vgpu *vgpu, u64 gpa) return gpa - gttmmio_gpa; } +/** + * intel_vgpu_mmio_offset_to_GPA - translate a MMIO offset to GPA + * @vgpu: a vGPU + * + * Returns: + * Zero on success, negative error code if failed + */ +int intel_vgpu_mmio_offset_to_gpa(struct intel_vgpu *vgpu, u64 offset) +{ + return offset + ((*(u64 *)(vgpu_cfg_space(vgpu) + PCI_BASE_ADDRESS_0)) & + ~GENMASK(3, 0)); +} + #define reg_is_mmio(gvt, reg) \ (reg >= 0 && reg < gvt->device_info.mmio_size) @@ -123,7 +136,6 @@ int intel_vgpu_emulate_mmio_read(struct intel_vgpu *vgpu, uint64_t pa, void *p_data, unsigned int bytes) { struct intel_gvt *gvt = vgpu->gvt; - struct intel_gvt_mmio_info *mmio; unsigned int offset = 0; int ret = -EINVAL; @@ -187,32 +199,8 @@ int intel_vgpu_emulate_mmio_read(struct intel_vgpu *vgpu, uint64_t pa, goto err; } - mmio = intel_gvt_find_mmio_info(gvt, rounddown(offset, 4)); - if (mmio) { - if (!intel_gvt_mmio_is_unalign(gvt, mmio->offset)) { - if (WARN_ON(offset + bytes > mmio->offset + mmio->size)) - goto err; - if (WARN_ON(mmio->offset != offset)) - goto err; - } - ret = mmio->read(vgpu, offset, p_data, bytes); - } else { - ret = intel_vgpu_default_mmio_read(vgpu, offset, p_data, bytes); - - if (!vgpu->mmio.disable_warn_untrack) { - gvt_vgpu_err("read untracked MMIO %x(%dB) val %x\n", - offset, bytes, *(u32 *)p_data); - - if (offset == 0x206c) { - gvt_vgpu_err("------------------------------------------\n"); - gvt_vgpu_err("likely triggers a gfx reset\n"); - 
gvt_vgpu_err("------------------------------------------\n"); - vgpu->mmio.disable_warn_untrack = true; - } - } - } - - if (ret) + ret = intel_vgpu_mmio_reg_rw(vgpu, offset, p_data, bytes, true); + if (ret < 0) goto err; intel_gvt_mmio_set_accessed(gvt, offset); @@ -239,9 +227,7 @@ int intel_vgpu_emulate_mmio_write(struct intel_vgpu *vgpu, uint64_t pa, void *p_data, unsigned int bytes) { struct intel_gvt *gvt = vgpu->gvt; - struct intel_gvt_mmio_info *mmio; unsigned int offset = 0; - u32 old_vreg = 0, old_sreg = 0; int ret = -EINVAL; if (vgpu->failsafe) { @@ -296,66 +282,10 @@ int intel_vgpu_emulate_mmio_write(struct intel_vgpu *vgpu, uint64_t pa, return ret; } - mmio = intel_gvt_find_mmio_info(gvt, rounddown(offset, 4)); - if (!mmio && !vgpu->mmio.disable_warn_untrack) - gvt_dbg_mmio("vgpu%d: write untracked MMIO %x len %d val %x\n", - vgpu->id, offset, bytes, *(u32 *)p_data); - - if (!intel_gvt_mmio_is_unalign(gvt, offset)) { - if (WARN_ON(!IS_ALIGNED(offset, bytes))) - goto err; - } - - if (mmio) { - u64 ro_mask = mmio->ro_mask; - - if (!intel_gvt_mmio_is_unalign(gvt, mmio->offset)) { - if (WARN_ON(offset + bytes > mmio->offset + mmio->size)) - goto err; - if (WARN_ON(mmio->offset != offset)) - goto err; - } - - if (intel_gvt_mmio_has_mode_mask(gvt, mmio->offset)) { - old_vreg = vgpu_vreg(vgpu, offset); - old_sreg = vgpu_sreg(vgpu, offset); - } - - if (!ro_mask) { - ret = mmio->write(vgpu, offset, p_data, bytes); - } else { - /* Protect RO bits like HW */ - u64 data = 0; - - /* all register bits are RO. */ - if (ro_mask == ~(u64)0) { - gvt_vgpu_err("try to write RO reg %x\n", - offset); - ret = 0; - goto out; - } - /* keep the RO bits in the virtual register */ - memcpy(&data, p_data, bytes); - data &= ~mmio->ro_mask; - data |= vgpu_vreg(vgpu, offset) & mmio->ro_mask; - ret = mmio->write(vgpu, offset, &data, bytes); - } - - /* higher 16bits of mode ctl regs are mask bits for change */ - if (intel_gvt_mmio_has_mode_mask(gvt, mmio->offset)) { - u32 mask = vgpu_vreg(vgpu, offset) >> 16; - - vgpu_vreg(vgpu, offset) = (old_vreg & ~mask) - | (vgpu_vreg(vgpu, offset) & mask); - vgpu_sreg(vgpu, offset) = (old_sreg & ~mask) - | (vgpu_sreg(vgpu, offset) & mask); - } - } else - ret = intel_vgpu_default_mmio_write(vgpu, offset, p_data, - bytes); - if (ret) + ret = intel_vgpu_mmio_reg_rw(vgpu, offset, p_data, bytes, false); + if (ret < 0) goto err; -out: + intel_gvt_mmio_set_accessed(gvt, offset); mutex_unlock(&gvt->lock); return 0; @@ -372,20 +302,32 @@ int intel_vgpu_emulate_mmio_write(struct intel_vgpu *vgpu, uint64_t pa, * @vgpu: a vGPU * */ -void intel_vgpu_reset_mmio(struct intel_vgpu *vgpu) +void intel_vgpu_reset_mmio(struct intel_vgpu *vgpu, bool dmlr) { struct intel_gvt *gvt = vgpu->gvt; const struct intel_gvt_device_info *info = &gvt->device_info; + void *mmio = gvt->firmware.mmio; - memcpy(vgpu->mmio.vreg, gvt->firmware.mmio, info->mmio_size); - memcpy(vgpu->mmio.sreg, gvt->firmware.mmio, info->mmio_size); + if (dmlr) { + memcpy(vgpu->mmio.vreg, mmio, info->mmio_size); + memcpy(vgpu->mmio.sreg, mmio, info->mmio_size); - vgpu_vreg(vgpu, GEN6_GT_THREAD_STATUS_REG) = 0; + vgpu_vreg(vgpu, GEN6_GT_THREAD_STATUS_REG) = 0; - /* set the bit 0:2(Core C-State ) to C0 */ - vgpu_vreg(vgpu, GEN6_GT_CORE_STATUS) = 0; + /* set the bit 0:2(Core C-State ) to C0 */ + vgpu_vreg(vgpu, GEN6_GT_CORE_STATUS) = 0; + + vgpu->mmio.disable_warn_untrack = false; + } else { +#define GVT_GEN8_MMIO_RESET_OFFSET (0x44200) + /* only reset the engine related, so starting with 0x44200 + * interrupt include DE,display mmio 
related will not be + * touched + */ + memcpy(vgpu->mmio.vreg, mmio, GVT_GEN8_MMIO_RESET_OFFSET); + memcpy(vgpu->mmio.sreg, mmio, GVT_GEN8_MMIO_RESET_OFFSET); + } - vgpu->mmio.disable_warn_untrack = false; } /** @@ -405,7 +347,7 @@ int intel_vgpu_init_mmio(struct intel_vgpu *vgpu) vgpu->mmio.sreg = vgpu->mmio.vreg + info->mmio_size; - intel_vgpu_reset_mmio(vgpu); + intel_vgpu_reset_mmio(vgpu, true); return 0; } diff --git a/drivers/gpu/drm/i915/gvt/mmio.h b/drivers/gpu/drm/i915/gvt/mmio.h index 7edd66f38ef98..36993043b0904 100644 --- a/drivers/gpu/drm/i915/gvt/mmio.h +++ b/drivers/gpu/drm/i915/gvt/mmio.h @@ -60,15 +60,15 @@ struct intel_vgpu; #define D_PRE_SKL (D_SNB | D_IVB | D_HSW | D_BDW) #define D_ALL (D_SNB | D_IVB | D_HSW | D_BDW | D_SKL | D_KBL) +typedef int (*gvt_mmio_func)(struct intel_vgpu *, unsigned int, void *, + unsigned int); + struct intel_gvt_mmio_info { u32 offset; - u32 size; - u32 length; - u32 addr_mask; u64 ro_mask; u32 device; - int (*read)(struct intel_vgpu *, unsigned int, void *, unsigned int); - int (*write)(struct intel_vgpu *, unsigned int, void *, unsigned int); + gvt_mmio_func read; + gvt_mmio_func write; u32 addr_range; struct hlist_node node; }; @@ -79,8 +79,6 @@ bool intel_gvt_match_device(struct intel_gvt *gvt, unsigned long device); int intel_gvt_setup_mmio_info(struct intel_gvt *gvt); void intel_gvt_clean_mmio_info(struct intel_gvt *gvt); -struct intel_gvt_mmio_info *intel_gvt_find_mmio_info(struct intel_gvt *gvt, - unsigned int offset); #define INTEL_GVT_MMIO_OFFSET(reg) ({ \ typeof(reg) __reg = reg; \ u32 *offset = (u32 *)&__reg; \ @@ -88,22 +86,17 @@ struct intel_gvt_mmio_info *intel_gvt_find_mmio_info(struct intel_gvt *gvt, }) int intel_vgpu_init_mmio(struct intel_vgpu *vgpu); -void intel_vgpu_reset_mmio(struct intel_vgpu *vgpu); +void intel_vgpu_reset_mmio(struct intel_vgpu *vgpu, bool dmlr); void intel_vgpu_clean_mmio(struct intel_vgpu *vgpu); int intel_vgpu_gpa_to_mmio_offset(struct intel_vgpu *vgpu, u64 gpa); +int intel_vgpu_mmio_offset_to_gpa(struct intel_vgpu *vgpu, u64 offset); int intel_vgpu_emulate_mmio_read(struct intel_vgpu *vgpu, u64 pa, void *p_data, unsigned int bytes); int intel_vgpu_emulate_mmio_write(struct intel_vgpu *vgpu, u64 pa, void *p_data, unsigned int bytes); -bool intel_gvt_mmio_is_cmd_access(struct intel_gvt *gvt, - unsigned int offset); -bool intel_gvt_mmio_is_unalign(struct intel_gvt *gvt, unsigned int offset); -void intel_gvt_mmio_set_accessed(struct intel_gvt *gvt, unsigned int offset); -void intel_gvt_mmio_set_cmd_accessed(struct intel_gvt *gvt, - unsigned int offset); -bool intel_gvt_mmio_has_mode_mask(struct intel_gvt *gvt, unsigned int offset); + int intel_vgpu_default_mmio_read(struct intel_vgpu *vgpu, unsigned int offset, void *p_data, unsigned int bytes); int intel_vgpu_default_mmio_write(struct intel_vgpu *vgpu, unsigned int offset, @@ -111,4 +104,8 @@ int intel_vgpu_default_mmio_write(struct intel_vgpu *vgpu, unsigned int offset, bool intel_gvt_in_force_nonpriv_whitelist(struct intel_gvt *gvt, unsigned int offset); + +int intel_vgpu_mmio_reg_rw(struct intel_vgpu *vgpu, unsigned int offset, + void *pdata, unsigned int bytes, bool is_read); + #endif diff --git a/drivers/gpu/drm/i915/gvt/mpt.h b/drivers/gpu/drm/i915/gvt/mpt.h index 419353624c5a4..f0e5487e66886 100644 --- a/drivers/gpu/drm/i915/gvt/mpt.h +++ b/drivers/gpu/drm/i915/gvt/mpt.h @@ -133,8 +133,7 @@ static inline int intel_gvt_hypervisor_inject_msi(struct intel_vgpu *vgpu) if (WARN(control & GENMASK(15, 1), "only support one MSI format\n")) return 
-EINVAL; - gvt_dbg_irq("vgpu%d: inject msi address %x data%x\n", vgpu->id, addr, - data); + trace_inject_msi(vgpu->id, addr, data); ret = intel_gvt_host.mpt->inject_msi(vgpu->handle, addr, data); if (ret) diff --git a/drivers/gpu/drm/i915/gvt/opregion.c b/drivers/gpu/drm/i915/gvt/opregion.c index 311799136d7f6..3225c34790dbf 100644 --- a/drivers/gpu/drm/i915/gvt/opregion.c +++ b/drivers/gpu/drm/i915/gvt/opregion.c @@ -22,6 +22,7 @@ */ #include +#include #include "i915_drv.h" #include "gvt.h" @@ -30,7 +31,11 @@ static int init_vgpu_opregion(struct intel_vgpu *vgpu, u32 gpa) u8 *buf; int i; - if (WARN((vgpu_opregion(vgpu)->va), + if (xen_initial_domain() && vgpu_opregion(vgpu)->va) { + gvt_vgpu_err("opregion has been initialized already.\n"); + intel_vgpu_clean_opregion(vgpu); + } + else if (WARN((vgpu_opregion(vgpu)->va), "vgpu%d: opregion has been initialized already.\n", vgpu->id)) return -EINVAL; diff --git a/drivers/gpu/drm/i915/gvt/render.c b/drivers/gpu/drm/i915/gvt/render.c index a5e11d89df2f8..2ea542257f03b 100644 --- a/drivers/gpu/drm/i915/gvt/render.c +++ b/drivers/gpu/drm/i915/gvt/render.c @@ -35,6 +35,7 @@ #include "i915_drv.h" #include "gvt.h" +#include "trace.h" struct render_mmio { int ring_id; @@ -206,18 +207,16 @@ static void load_mocs(struct intel_vgpu *vgpu, int ring_id) offset.reg = regs[ring_id]; for (i = 0; i < 64; i++) { - gen9_render_mocs[ring_id][i] = I915_READ(offset); + gen9_render_mocs[ring_id][i] = I915_READ_FW(offset); I915_WRITE(offset, vgpu_vreg(vgpu, offset)); - POSTING_READ(offset); offset.reg += 4; } if (ring_id == RCS) { l3_offset.reg = 0xb020; for (i = 0; i < 32; i++) { - gen9_render_mocs_L3[i] = I915_READ(l3_offset); - I915_WRITE(l3_offset, vgpu_vreg(vgpu, l3_offset)); - POSTING_READ(l3_offset); + gen9_render_mocs_L3[i] = I915_READ_FW(l3_offset); + I915_WRITE_FW(l3_offset, vgpu_vreg(vgpu, l3_offset)); l3_offset.reg += 4; } } @@ -241,18 +240,16 @@ static void restore_mocs(struct intel_vgpu *vgpu, int ring_id) offset.reg = regs[ring_id]; for (i = 0; i < 64; i++) { - vgpu_vreg(vgpu, offset) = I915_READ(offset); - I915_WRITE(offset, gen9_render_mocs[ring_id][i]); - POSTING_READ(offset); + vgpu_vreg(vgpu, offset) = I915_READ_FW(offset); + I915_WRITE_FW(offset, gen9_render_mocs[ring_id][i]); offset.reg += 4; } if (ring_id == RCS) { l3_offset.reg = 0xb020; for (i = 0; i < 32; i++) { - vgpu_vreg(vgpu, l3_offset) = I915_READ(l3_offset); - I915_WRITE(l3_offset, gen9_render_mocs_L3[i]); - POSTING_READ(l3_offset); + vgpu_vreg(vgpu, l3_offset) = I915_READ_FW(l3_offset); + I915_WRITE_FW(l3_offset, gen9_render_mocs_L3[i]); l3_offset.reg += 4; } } @@ -260,7 +257,8 @@ static void restore_mocs(struct intel_vgpu *vgpu, int ring_id) #define CTX_CONTEXT_CONTROL_VAL 0x03 -void intel_gvt_load_render_mmio(struct intel_vgpu *vgpu, int ring_id) +/* Switch ring mmio values (context) from host to a vgpu. 
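+ * Called by intel_gvt_switch_mmio() with forcewake already held; uses the
+ * raw I915_READ_FW()/I915_WRITE_FW() accessors and posts a read of the
+ * last written register so the switched values take effect.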
*/ +static void switch_mmio_to_vgpu(struct intel_vgpu *vgpu, int ring_id) { struct drm_i915_private *dev_priv = vgpu->gvt->dev_priv; struct render_mmio *mmio; @@ -270,6 +268,7 @@ void intel_gvt_load_render_mmio(struct intel_vgpu *vgpu, int ring_id) u32 ctx_ctrl = reg_state[CTX_CONTEXT_CONTROL_VAL]; u32 inhibit_mask = _MASKED_BIT_ENABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT); + i915_reg_t last_reg = _MMIO(0); if (IS_SKYLAKE(vgpu->gvt->dev_priv) || IS_KABYLAKE(vgpu->gvt->dev_priv)) { @@ -285,7 +284,7 @@ void intel_gvt_load_render_mmio(struct intel_vgpu *vgpu, int ring_id) if (mmio->ring_id != ring_id) continue; - mmio->value = I915_READ(mmio->reg); + mmio->value = I915_READ_FW(mmio->reg); /* * if it is an inhibit context, load in_context mmio @@ -302,20 +301,27 @@ void intel_gvt_load_render_mmio(struct intel_vgpu *vgpu, int ring_id) else v = vgpu_vreg(vgpu, mmio->reg); - I915_WRITE(mmio->reg, v); - POSTING_READ(mmio->reg); + I915_WRITE_FW(mmio->reg, v); + last_reg = mmio->reg; - gvt_dbg_render("load reg %x old %x new %x\n", - i915_mmio_reg_offset(mmio->reg), - mmio->value, v); + trace_render_mmio(vgpu->id, "load", + i915_mmio_reg_offset(mmio->reg), + mmio->value, v); } + + /* Make sure the swiched MMIOs has taken effect. */ + if (likely(INTEL_GVT_MMIO_OFFSET(last_reg))) + I915_READ_FW(last_reg); + handle_tlb_pending_event(vgpu, ring_id); } -void intel_gvt_restore_render_mmio(struct intel_vgpu *vgpu, int ring_id) +/* Switch ring mmio values (context) from vgpu to host. */ +static void switch_mmio_to_host(struct intel_vgpu *vgpu, int ring_id) { struct drm_i915_private *dev_priv = vgpu->gvt->dev_priv; struct render_mmio *mmio; + i915_reg_t last_reg = _MMIO(0); u32 v; int i, array_size; @@ -332,7 +338,7 @@ void intel_gvt_restore_render_mmio(struct intel_vgpu *vgpu, int ring_id) if (mmio->ring_id != ring_id) continue; - vgpu_vreg(vgpu, mmio->reg) = I915_READ(mmio->reg); + vgpu_vreg(vgpu, mmio->reg) = I915_READ_FW(mmio->reg); if (mmio->mask) { vgpu_vreg(vgpu, mmio->reg) &= ~(mmio->mask << 16); @@ -343,11 +349,57 @@ void intel_gvt_restore_render_mmio(struct intel_vgpu *vgpu, int ring_id) if (mmio->in_context) continue; - I915_WRITE(mmio->reg, v); - POSTING_READ(mmio->reg); + I915_WRITE_FW(mmio->reg, v); + last_reg = mmio->reg; - gvt_dbg_render("restore reg %x old %x new %x\n", - i915_mmio_reg_offset(mmio->reg), - mmio->value, v); + trace_render_mmio(vgpu->id, "restore", + i915_mmio_reg_offset(mmio->reg), + mmio->value, v); } + + /* Make sure the swiched MMIOs has taken effect. */ + if (likely(INTEL_GVT_MMIO_OFFSET(last_reg))) + I915_READ_FW(last_reg); +} + +/** + * intel_gvt_switch_render_mmio - switch mmio context of specific engine + * @pre: the last vGPU that own the engine + * @next: the vGPU to switch to + * @ring_id: specify the engine + * + * If pre is null indicates that host own the engine. If next is null + * indicates that we are switching to host workload. + */ +void intel_gvt_switch_mmio(struct intel_vgpu *pre, + struct intel_vgpu *next, int ring_id) +{ + struct drm_i915_private *dev_priv; + + if (WARN_ON(!pre && !next)) + return; + + gvt_dbg_render("switch ring %d from %s to %s\n", ring_id, + pre ? "vGPU" : "host", next ? "vGPU" : "HOST"); + + dev_priv = pre ? pre->gvt->dev_priv : next->gvt->dev_priv; + + /** + * We are using raw mmio access wrapper to improve the + * performace for batch mmio read/write, so we need + * handle forcewake mannually. 
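+ * The _FW accessors skip the implicit forcewake handling, so the
+ * explicit forcewake_get()/put() pair below must bracket both the
+ * switch-to-host and switch-to-vgpu paths.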
+ */ + intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL); + + /** + * TODO: Optimize for vGPU to vGPU switch by merging + * switch_mmio_to_host() and switch_mmio_to_vgpu(). + */ + if (pre) + switch_mmio_to_host(pre, ring_id); + + if (next) + switch_mmio_to_vgpu(next, ring_id); + + intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL); } diff --git a/drivers/gpu/drm/i915/gvt/render.h b/drivers/gpu/drm/i915/gvt/render.h index dac1a3cc458b0..91db1d39d28f6 100644 --- a/drivers/gpu/drm/i915/gvt/render.h +++ b/drivers/gpu/drm/i915/gvt/render.h @@ -36,8 +36,8 @@ #ifndef __GVT_RENDER_H__ #define __GVT_RENDER_H__ -void intel_gvt_load_render_mmio(struct intel_vgpu *vgpu, int ring_id); +void intel_gvt_switch_mmio(struct intel_vgpu *pre, + struct intel_vgpu *next, int ring_id); -void intel_gvt_restore_render_mmio(struct intel_vgpu *vgpu, int ring_id); #endif diff --git a/drivers/gpu/drm/i915/gvt/sched_policy.c b/drivers/gpu/drm/i915/gvt/sched_policy.c index f25ff133865f1..03532dfc0cd51 100644 --- a/drivers/gpu/drm/i915/gvt/sched_policy.c +++ b/drivers/gpu/drm/i915/gvt/sched_policy.c @@ -202,11 +202,6 @@ static void tbs_sched_func(struct gvt_sched_data *sched_data) struct intel_gvt_workload_scheduler *scheduler = &gvt->scheduler; struct vgpu_sched_data *vgpu_data; struct intel_vgpu *vgpu = NULL; - static uint64_t timer_check; - - if (!(timer_check++ % GVT_TS_BALANCE_PERIOD_MS)) - gvt_balance_timeslice(sched_data); - /* no active vgpu or has already had a target */ if (list_empty(&sched_data->lru_runq_head) || scheduler->next_vgpu) goto out; @@ -231,9 +226,19 @@ static void tbs_sched_func(struct gvt_sched_data *sched_data) void intel_gvt_schedule(struct intel_gvt *gvt) { struct gvt_sched_data *sched_data = gvt->scheduler.sched_data; + static uint64_t timer_check; mutex_lock(&gvt->lock); + + if (test_and_clear_bit(INTEL_GVT_REQUEST_SCHED, + (void *)&gvt->service_request)) { + if (!(timer_check++ % GVT_TS_BALANCE_PERIOD_MS)) + gvt_balance_timeslice(sched_data); + } + clear_bit(INTEL_GVT_REQUEST_EVENT_SCHED, (void *)&gvt->service_request); + tbs_sched_func(sched_data); + mutex_unlock(&gvt->lock); } @@ -371,6 +376,7 @@ void intel_vgpu_stop_schedule(struct intel_vgpu *vgpu) { struct intel_gvt_workload_scheduler *scheduler = &vgpu->gvt->scheduler; + int ring_id; gvt_dbg_core("vgpu%d: stop schedule\n", vgpu->id); @@ -384,4 +390,13 @@ void intel_vgpu_stop_schedule(struct intel_vgpu *vgpu) scheduler->need_reschedule = true; scheduler->current_vgpu = NULL; } + + spin_lock_bh(&scheduler->mmio_context_lock); + for (ring_id = 0; ring_id < I915_NUM_ENGINES; ring_id++) { + if (scheduler->engine_owner[ring_id] == vgpu) { + intel_gvt_switch_mmio(vgpu, NULL, ring_id); + scheduler->engine_owner[ring_id] = NULL; + } + } + spin_unlock_bh(&scheduler->mmio_context_lock); } diff --git a/drivers/gpu/drm/i915/gvt/scheduler.c b/drivers/gpu/drm/i915/gvt/scheduler.c index bada32b332378..a61dd920a056e 100644 --- a/drivers/gpu/drm/i915/gvt/scheduler.c +++ b/drivers/gpu/drm/i915/gvt/scheduler.c @@ -112,9 +112,14 @@ static int populate_shadow_context(struct intel_vgpu_workload *workload) COPY_REG(rcs_indirect_ctx_offset); } #undef COPY_REG - + /* + * pin/unpin the shadow mm before using to ensure it has been + * shadowed. 
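+ * The pin shadows the guest page table if that has not happened yet,
+ * so the shadow page table referenced below is valid; the matching
+ * unpin only drops the temporary reference taken here.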
+ */ + intel_vgpu_pin_mm(workload->shadow_mm); set_context_pdp_root_pointer(shadow_ring_context, workload->shadow_mm->shadow_page_table); + intel_vgpu_unpin_mm(workload->shadow_mm); intel_gvt_hypervisor_read_gpa(vgpu, workload->ring_context_gpa + @@ -139,30 +144,42 @@ static int shadow_context_status_change(struct notifier_block *nb, struct intel_gvt *gvt = container_of(nb, struct intel_gvt, shadow_ctx_notifier_block[req->engine->id]); struct intel_gvt_workload_scheduler *scheduler = &gvt->scheduler; - struct intel_vgpu_workload *workload = - scheduler->current_workload[req->engine->id]; + enum intel_engine_id ring_id = req->engine->id; + struct intel_vgpu_workload *workload; + + if (!is_gvt_request(req)) { + spin_lock_bh(&scheduler->mmio_context_lock); + if (action == INTEL_CONTEXT_SCHEDULE_IN && + scheduler->engine_owner[ring_id]) { + /* Switch ring from vGPU to host. */ + intel_gvt_switch_mmio(scheduler->engine_owner[ring_id], + NULL, ring_id); + scheduler->engine_owner[ring_id] = NULL; + } + spin_unlock_bh(&scheduler->mmio_context_lock); - if (!is_gvt_request(req) || unlikely(!workload)) + return NOTIFY_OK; + } + + workload = scheduler->current_workload[ring_id]; + if (unlikely(!workload)) return NOTIFY_OK; switch (action) { case INTEL_CONTEXT_SCHEDULE_IN: - intel_gvt_load_render_mmio(workload->vgpu, - workload->ring_id); + spin_lock_bh(&scheduler->mmio_context_lock); + if (workload->vgpu != scheduler->engine_owner[ring_id]) { + /* Switch ring from host to vGPU or vGPU to vGPU. */ + intel_gvt_switch_mmio(scheduler->engine_owner[ring_id], + workload->vgpu, ring_id); + scheduler->engine_owner[ring_id] = workload->vgpu; + } else + gvt_dbg_sched("skip ring %d mmio switch for vgpu%d\n", + ring_id, workload->vgpu->id); + spin_unlock_bh(&scheduler->mmio_context_lock); atomic_set(&workload->shadow_ctx_active, 1); break; case INTEL_CONTEXT_SCHEDULE_OUT: - intel_gvt_restore_render_mmio(workload->vgpu, - workload->ring_id); - /* If the status is -EINPROGRESS means this workload - * doesn't meet any issue during dispatching so when - * get the SCHEDULE_OUT set the status to be zero for - * good. If the status is NOT -EINPROGRESS means there - * is something wrong happened during dispatching and - * the status should not be set to zero - */ - if (workload->status == -EINPROGRESS) - workload->status = 0; atomic_set(&workload->shadow_ctx_active, 0); break; default: @@ -173,40 +190,32 @@ static int shadow_context_status_change(struct notifier_block *nb, return NOTIFY_OK; } -static int dispatch_workload(struct intel_vgpu_workload *workload) +/** + * intel_gvt_scan_and_shadow_workload - audit the workload by scanning it and + * shadowing it as well, including the ring buffer, wa_ctx and ctx. + * @workload: an abstract entity for each execlist submission. + * + * This function is called before the workload is submitted to i915, to make + * sure the content of the workload is valid.
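+ *
+ * The expected calling pattern (see dispatch_workload() below) is roughly:
+ *
+ *	mutex_lock(&dev_priv->drm.struct_mutex);
+ *	ret = intel_gvt_scan_and_shadow_workload(workload);
+ *	...
+ *	mutex_unlock(&dev_priv->drm.struct_mutex);
+ *
+ * Returns 0 on success (or if the workload was already shadowed), a
+ * negative error code otherwise.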
+ */ +int intel_gvt_scan_and_shadow_workload(struct intel_vgpu_workload *workload) { int ring_id = workload->ring_id; struct i915_gem_context *shadow_ctx = workload->vgpu->shadow_ctx; struct drm_i915_private *dev_priv = workload->vgpu->gvt->dev_priv; - struct intel_engine_cs *engine = dev_priv->engine[ring_id]; struct drm_i915_gem_request *rq; struct intel_vgpu *vgpu = workload->vgpu; int ret; - gvt_dbg_sched("ring id %d prepare to dispatch workload %p\n", - ring_id, workload); + lockdep_assert_held(&dev_priv->drm.struct_mutex); + + if (workload->shadowed) + return 0; shadow_ctx->desc_template &= ~(0x3 << GEN8_CTX_ADDRESSING_MODE_SHIFT); shadow_ctx->desc_template |= workload->ctx_desc.addressing_mode << GEN8_CTX_ADDRESSING_MODE_SHIFT; - mutex_lock(&dev_priv->drm.struct_mutex); - - /* pin shadow context by gvt even the shadow context will be pinned - * when i915 alloc request. That is because gvt will update the guest - * context from shadow context when workload is completed, and at that - * moment, i915 may already unpined the shadow context to make the - * shadow_ctx pages invalid. So gvt need to pin itself. After update - * the guest context, gvt can unpin the shadow_ctx safely. - */ - ret = engine->context_pin(engine, shadow_ctx); - if (ret) { - gvt_vgpu_err("fail to pin shadow context\n"); - workload->status = ret; - mutex_unlock(&dev_priv->drm.struct_mutex); - return ret; - } - rq = i915_gem_request_alloc(dev_priv->engine[ring_id], shadow_ctx); if (IS_ERR(rq)) { gvt_vgpu_err("fail to allocate gem request\n"); @@ -218,7 +227,7 @@ static int dispatch_workload(struct intel_vgpu_workload *workload) workload->req = i915_gem_request_get(rq); - ret = intel_gvt_scan_and_shadow_workload(workload); + ret = intel_gvt_scan_and_shadow_ringbuffer(workload); if (ret) goto out; @@ -233,25 +242,59 @@ static int dispatch_workload(struct intel_vgpu_workload *workload) if (ret) goto out; + workload->shadowed = true; + +out: + return ret; +} + +static int dispatch_workload(struct intel_vgpu_workload *workload) +{ + int ring_id = workload->ring_id; + struct i915_gem_context *shadow_ctx = workload->vgpu->shadow_ctx; + struct drm_i915_private *dev_priv = workload->vgpu->gvt->dev_priv; + struct intel_engine_cs *engine = dev_priv->engine[ring_id]; + struct intel_vgpu *vgpu = workload->vgpu; + int ret = 0; + + gvt_dbg_sched("ring id %d prepare to dispatch workload %p\n", + ring_id, workload); + + mutex_lock(&dev_priv->drm.struct_mutex); + + ret = intel_gvt_scan_and_shadow_workload(workload); + if (ret) + goto out; + if (workload->prepare) { ret = workload->prepare(workload); if (ret) goto out; } - gvt_dbg_sched("ring id %d submit workload to i915 %p\n", - ring_id, workload->req); + /* pin shadow context by gvt even the shadow context will be pinned + * when i915 alloc request. That is because gvt will update the guest + * context from shadow context when workload is completed, and at that + * moment, i915 may already unpined the shadow context to make the + * shadow_ctx pages invalid. So gvt need to pin itself. After update + * the guest context, gvt can unpin the shadow_ctx safely. 
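+	 *
+	 * So the intended lifetime is roughly:
+	 *
+	 *	engine->context_pin(engine, shadow_ctx);	(done just below)
+	 *	... i915 executes the request ...
+	 *	update_guest_context(workload);
+	 *	engine->context_unpin(engine, shadow_ctx);	(on workload completion)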
+ */ + ret = engine->context_pin(engine, shadow_ctx); + if (ret) { + gvt_vgpu_err("fail to pin shadow context\n"); + goto out; + } - ret = 0; - workload->dispatched = true; out: if (ret) workload->status = ret; - if (!IS_ERR_OR_NULL(rq)) - i915_add_request(rq); - else - engine->context_unpin(engine, shadow_ctx); + if (!IS_ERR_OR_NULL(workload->req)) { + gvt_dbg_sched("ring id %d submit workload to i915 %p\n", + ring_id, workload->req); + i915_add_request(workload->req); + workload->dispatched = true; + } mutex_unlock(&dev_priv->drm.struct_mutex); return ret; @@ -406,9 +449,22 @@ static void complete_current_workload(struct intel_gvt *gvt, int ring_id) wait_event(workload->shadow_ctx_status_wq, !atomic_read(&workload->shadow_ctx_active)); + /* If this request caused GPU hang, req->fence.error will + * be set to -EIO. Use -EIO to set workload status so + * that when this request caused GPU hang, didn't trigger + * context switch interrupt to guest. + */ + if (likely(workload->status == -EINPROGRESS)) { + if (workload->req->fence.error == -EIO) + workload->status = -EIO; + else + workload->status = 0; + } + i915_gem_request_put(fetch_and_zero(&workload->req)); - if (!workload->status && !vgpu->resetting) { + if (!workload->status && !(vgpu->resetting_eng & + ENGINE_MASK(ring_id))) { update_guest_context(workload); for_each_set_bit(event, workload->pending_events, @@ -431,6 +487,10 @@ static void complete_current_workload(struct intel_gvt *gvt, int ring_id) atomic_dec(&vgpu->running_workload_num); wake_up(&scheduler->workload_complete_wq); + + if (gvt->scheduler.need_reschedule) + intel_gvt_request_service(gvt, INTEL_GVT_REQUEST_EVENT_SCHED); + mutex_unlock(&gvt->lock); } @@ -439,8 +499,6 @@ struct workload_thread_param { int ring_id; }; -static DEFINE_MUTEX(scheduler_mutex); - static int workload_thread(void *priv) { struct workload_thread_param *p = (struct workload_thread_param *)priv; @@ -472,8 +530,6 @@ static int workload_thread(void *priv) if (!workload) break; - mutex_lock(&scheduler_mutex); - gvt_dbg_sched("ring id %d next workload %p vgpu %d\n", workload->ring_id, workload, workload->vgpu->id); @@ -512,9 +568,6 @@ static int workload_thread(void *priv) FORCEWAKE_ALL); intel_runtime_pm_put(gvt->dev_priv); - - mutex_unlock(&scheduler_mutex); - } return 0; } diff --git a/drivers/gpu/drm/i915/gvt/scheduler.h b/drivers/gpu/drm/i915/gvt/scheduler.h index 2cd725c0573e7..0d431a968a329 100644 --- a/drivers/gpu/drm/i915/gvt/scheduler.h +++ b/drivers/gpu/drm/i915/gvt/scheduler.h @@ -42,6 +42,10 @@ struct intel_gvt_workload_scheduler { struct intel_vgpu_workload *current_workload[I915_NUM_ENGINES]; bool need_reschedule; + spinlock_t mmio_context_lock; + /* can be null when owner is host */ + struct intel_vgpu *engine_owner[I915_NUM_ENGINES]; + wait_queue_head_t workload_complete_wq; struct task_struct *thread[I915_NUM_ENGINES]; wait_queue_head_t waitq[I915_NUM_ENGINES]; @@ -78,6 +82,7 @@ struct intel_vgpu_workload { struct drm_i915_gem_request *req; /* if this workload has been dispatched to i915? 
*/ bool dispatched; + bool shadowed; int status; struct intel_vgpu_mm *shadow_mm; diff --git a/drivers/gpu/drm/i915/gvt/trace.h b/drivers/gpu/drm/i915/gvt/trace.h index 53a2d10cf3f12..8c150381d9a4e 100644 --- a/drivers/gpu/drm/i915/gvt/trace.h +++ b/drivers/gpu/drm/i915/gvt/trace.h @@ -224,58 +224,138 @@ TRACE_EVENT(oos_sync, TP_printk("%s", __entry->buf) ); -#define MAX_CMD_STR_LEN 256 TRACE_EVENT(gvt_command, - TP_PROTO(u8 vm_id, u8 ring_id, u32 ip_gma, u32 *cmd_va, u32 cmd_len, bool ring_buffer_cmd, cycles_t cost_pre_cmd_handler, cycles_t cost_cmd_handler), - - TP_ARGS(vm_id, ring_id, ip_gma, cmd_va, cmd_len, ring_buffer_cmd, cost_pre_cmd_handler, cost_cmd_handler), - - TP_STRUCT__entry( - __field(u8, vm_id) - __field(u8, ring_id) - __field(int, i) - __array(char, tmp_buf, MAX_CMD_STR_LEN) - __array(char, cmd_str, MAX_CMD_STR_LEN) - ), - - TP_fast_assign( - __entry->vm_id = vm_id; - __entry->ring_id = ring_id; - __entry->cmd_str[0] = '\0'; - snprintf(__entry->tmp_buf, MAX_CMD_STR_LEN, "VM(%d) Ring(%d): %s ip(%08x) pre handler cost (%llu), handler cost (%llu) ", vm_id, ring_id, ring_buffer_cmd ? "RB":"BB", ip_gma, cost_pre_cmd_handler, cost_cmd_handler); - strcat(__entry->cmd_str, __entry->tmp_buf); - entry->i = 0; - while (cmd_len > 0) { - if (cmd_len >= 8) { - snprintf(__entry->tmp_buf, MAX_CMD_STR_LEN, "%08x %08x %08x %08x %08x %08x %08x %08x ", - cmd_va[__entry->i], cmd_va[__entry->i+1], cmd_va[__entry->i+2], cmd_va[__entry->i+3], - cmd_va[__entry->i+4], cmd_va[__entry->i+5], cmd_va[__entry->i+6], cmd_va[__entry->i+7]); - __entry->i += 8; - cmd_len -= 8; - strcat(__entry->cmd_str, __entry->tmp_buf); - } else if (cmd_len >= 4) { - snprintf(__entry->tmp_buf, MAX_CMD_STR_LEN, "%08x %08x %08x %08x ", - cmd_va[__entry->i], cmd_va[__entry->i+1], cmd_va[__entry->i+2], cmd_va[__entry->i+3]); - __entry->i += 4; - cmd_len -= 4; - strcat(__entry->cmd_str, __entry->tmp_buf); - } else if (cmd_len >= 2) { - snprintf(__entry->tmp_buf, MAX_CMD_STR_LEN, "%08x %08x ", cmd_va[__entry->i], cmd_va[__entry->i+1]); - __entry->i += 2; - cmd_len -= 2; - strcat(__entry->cmd_str, __entry->tmp_buf); - } else if (cmd_len == 1) { - snprintf(__entry->tmp_buf, MAX_CMD_STR_LEN, "%08x ", cmd_va[__entry->i]); - __entry->i += 1; - cmd_len -= 1; - strcat(__entry->cmd_str, __entry->tmp_buf); - } - } - strcat(__entry->cmd_str, "\n"); - ), + TP_PROTO(u8 vgpu_id, u8 ring_id, u32 ip_gma, u32 *cmd_va, u32 cmd_len, + u32 buf_type), + + TP_ARGS(vgpu_id, ring_id, ip_gma, cmd_va, cmd_len, buf_type), + + TP_STRUCT__entry( + __field(u8, vgpu_id) + __field(u8, ring_id) + __field(u32, ip_gma) + __field(u32, buf_type) + __field(u32, cmd_len) + __dynamic_array(u32, raw_cmd, cmd_len) + ), + + TP_fast_assign( + __entry->vgpu_id = vgpu_id; + __entry->ring_id = ring_id; + __entry->ip_gma = ip_gma; + __entry->buf_type = buf_type; + __entry->cmd_len = cmd_len; + memcpy(__get_dynamic_array(raw_cmd), cmd_va, cmd_len * sizeof(*cmd_va)); + ), + + + TP_printk("vgpu%d ring %d: buf_type %u, ip_gma %08x, raw cmd %s", + __entry->vgpu_id, + __entry->ring_id, + __entry->buf_type, + __entry->ip_gma, + __print_array(__get_dynamic_array(raw_cmd), __entry->cmd_len, 4)) +); + +#define GVT_TEMP_STR_LEN 10 +TRACE_EVENT(write_ir, + TP_PROTO(int id, char *reg_name, unsigned int reg, unsigned int new_val, + unsigned int old_val, bool changed), + + TP_ARGS(id, reg_name, reg, new_val, old_val, changed), + + TP_STRUCT__entry( + __field(int, id) + __array(char, buf, GVT_TEMP_STR_LEN) + __field(unsigned int, reg) + __field(unsigned int, new_val) + __field(unsigned 
int, old_val) + __field(bool, changed) + ), + + TP_fast_assign( + __entry->id = id; + snprintf(__entry->buf, GVT_TEMP_STR_LEN, "%s", reg_name); + __entry->reg = reg; + __entry->new_val = new_val; + __entry->old_val = old_val; + __entry->changed = changed; + ), + + TP_printk("VM%u write [%s] %x, new %08x, old %08x, changed %08x\n", + __entry->id, __entry->buf, __entry->reg, __entry->new_val, + __entry->old_val, __entry->changed) +); + +TRACE_EVENT(propagate_event, + TP_PROTO(int id, const char *irq_name, int bit), + + TP_ARGS(id, irq_name, bit), + + TP_STRUCT__entry( + __field(int, id) + __array(char, buf, GVT_TEMP_STR_LEN) + __field(int, bit) + ), - TP_printk("%s", __entry->cmd_str) + TP_fast_assign( + __entry->id = id; + snprintf(__entry->buf, GVT_TEMP_STR_LEN, "%s", irq_name); + __entry->bit = bit; + ), + + TP_printk("Set bit (%d) for (%s) for vgpu (%d)\n", + __entry->bit, __entry->buf, __entry->id) ); + +TRACE_EVENT(inject_msi, + TP_PROTO(int id, unsigned int address, unsigned int data), + + TP_ARGS(id, address, data), + + TP_STRUCT__entry( + __field(int, id) + __field(unsigned int, address) + __field(unsigned int, data) + ), + + TP_fast_assign( + __entry->id = id; + __entry->address = address; + __entry->data = data; + ), + + TP_printk("vgpu%d:inject msi address %x data %x\n", + __entry->id, __entry->address, __entry->data) +); + +TRACE_EVENT(render_mmio, + TP_PROTO(int id, char *action, unsigned int reg, + unsigned int old_val, unsigned int new_val), + + TP_ARGS(id, action, reg, new_val, old_val), + + TP_STRUCT__entry( + __field(int, id) + __array(char, buf, GVT_TEMP_STR_LEN) + __field(unsigned int, reg) + __field(unsigned int, old_val) + __field(unsigned int, new_val) + ), + + TP_fast_assign( + __entry->id = id; + snprintf(__entry->buf, GVT_TEMP_STR_LEN, "%s", action); + __entry->reg = reg; + __entry->old_val = old_val; + __entry->new_val = new_val; + ), + + TP_printk("VM%u %s reg %x, old %08x new %08x\n", + __entry->id, __entry->buf, __entry->reg, + __entry->old_val, __entry->new_val) +); + #endif /* _GVT_TRACE_H_ */ /* This part must be out of protection */ diff --git a/drivers/gpu/drm/i915/gvt/vgpu.c b/drivers/gpu/drm/i915/gvt/vgpu.c index 6e3cbd8caec26..2ed8aaf3f4167 100644 --- a/drivers/gpu/drm/i915/gvt/vgpu.c +++ b/drivers/gpu/drm/i915/gvt/vgpu.c @@ -43,8 +43,7 @@ void populate_pvinfo_page(struct intel_vgpu *vgpu) vgpu_vreg(vgpu, vgtif_reg(version_minor)) = 0; vgpu_vreg(vgpu, vgtif_reg(display_ready)) = 0; vgpu_vreg(vgpu, vgtif_reg(vgt_id)) = vgpu->id; - vgpu_vreg(vgpu, vgtif_reg(avail_rs.mappable_gmadr.base)) = - vgpu_aperture_gmadr_base(vgpu); + vgpu_vreg(vgpu, vgtif_reg(avail_rs.mappable_gmadr.base)) = 0; vgpu_vreg(vgpu, vgtif_reg(avail_rs.mappable_gmadr.size)) = vgpu_aperture_sz(vgpu); vgpu_vreg(vgpu, vgtif_reg(avail_rs.nonmappable_gmadr.base)) = @@ -206,6 +205,7 @@ void intel_gvt_activate_vgpu(struct intel_vgpu *vgpu) { mutex_lock(&vgpu->gvt->lock); vgpu->active = true; + intel_vgpu_start_schedule(vgpu); mutex_unlock(&vgpu->gvt->lock); } @@ -480,11 +480,16 @@ void intel_gvt_reset_vgpu_locked(struct intel_vgpu *vgpu, bool dmlr, { struct intel_gvt *gvt = vgpu->gvt; struct intel_gvt_workload_scheduler *scheduler = &gvt->scheduler; + u64 maddr = vgpu_vreg(vgpu, vgtif_reg(avail_rs.mappable_gmadr.base)); + u64 unmaddr = vgpu_vreg(vgpu, + vgtif_reg(avail_rs.nonmappable_gmadr.base)); + unsigned int resetting_eng = dmlr ? 
ALL_ENGINES : engine_mask; gvt_dbg_core("------------------------------------------\n"); gvt_dbg_core("resseting vgpu%d, dmlr %d, engine_mask %08x\n", vgpu->id, dmlr, engine_mask); - vgpu->resetting = true; + + vgpu->resetting_eng = resetting_eng; intel_vgpu_stop_schedule(vgpu); /* @@ -497,14 +502,23 @@ void intel_gvt_reset_vgpu_locked(struct intel_vgpu *vgpu, bool dmlr, mutex_lock(&gvt->lock); } - intel_vgpu_reset_execlist(vgpu, dmlr ? ALL_ENGINES : engine_mask); + intel_vgpu_reset_execlist(vgpu, resetting_eng); /* full GPU reset or device model level reset */ if (engine_mask == ALL_ENGINES || dmlr) { - intel_vgpu_reset_gtt(vgpu, dmlr); - intel_vgpu_reset_resource(vgpu); - intel_vgpu_reset_mmio(vgpu); + intel_vgpu_invalidate_ppgtt(vgpu); + /*fence will not be reset during virtual reset */ + if (dmlr) { + intel_vgpu_reset_gtt(vgpu); + intel_vgpu_reset_resource(vgpu); + } + + intel_vgpu_reset_mmio(vgpu, dmlr); populate_pvinfo_page(vgpu); + vgpu_vreg(vgpu, vgtif_reg(avail_rs.mappable_gmadr.base)) = + maddr; + vgpu_vreg(vgpu, vgtif_reg(avail_rs.nonmappable_gmadr.base)) = + unmaddr; intel_vgpu_reset_display(vgpu); if (dmlr) { @@ -515,7 +529,7 @@ void intel_gvt_reset_vgpu_locked(struct intel_vgpu *vgpu, bool dmlr, } } - vgpu->resetting = false; + vgpu->resetting_eng = 0; gvt_dbg_core("reset vgpu%d done\n", vgpu->id); gvt_dbg_core("------------------------------------------\n"); } diff --git a/drivers/gpu/drm/i915/gvt/xengt.c b/drivers/gpu/drm/i915/gvt/xengt.c new file mode 100644 index 0000000000000..aa134a292c0a5 --- /dev/null +++ b/drivers/gpu/drm/i915/gvt/xengt.c @@ -0,0 +1,1824 @@ +/* + * Interfaces coupled to Xen + * + * Copyright(c) 2011-2013 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of Version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. + */ + +/* + * NOTE: + * This file contains hypervisor specific interactions to + * implement the concept of mediated pass-through framework. + * What this file provides is actually a general abstraction + * of in-kernel device model, which is not vgt specific. + * + * Now temporarily in vgt code. 
long-term this should be + * in hypervisor (xen/kvm) specific directory + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include "xengt.h" + +MODULE_AUTHOR("Intel Corporation"); +MODULE_DESCRIPTION("XenGT mediated passthrough driver"); +MODULE_LICENSE("GPL"); +MODULE_VERSION("0.1"); + +struct kobject *gvt_ctrl_kobj; +static struct kset *gvt_kset; +static DEFINE_MUTEX(gvt_sysfs_lock); + +struct xengt_struct xengt_priv; +const struct intel_gvt_ops *intel_gvt_ops; + +static ssize_t kobj_attr_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct kobj_attribute *kattr; + ssize_t ret = -EIO; + + kattr = container_of(attr, struct kobj_attribute, attr); + if (kattr->show) + ret = kattr->show(kobj, kattr, buf); + return ret; +} + +static ssize_t kobj_attr_store(struct kobject *kobj, + struct attribute *attr, const char *buf, size_t count) +{ + struct kobj_attribute *kattr; + ssize_t ret = -EIO; + + kattr = container_of(attr, struct kobj_attribute, attr); + if (kattr->store) + ret = kattr->store(kobj, kattr, buf, count); + return ret; +} + +/* + * TODO + * keep the sysfs name of create_vgt_instance no change to reuse current + * test tool-kit. Better name should be: create_xengt_instance + + * destroy_xengt_instance. + */ +static struct kobj_attribute xengt_instance_attr = +__ATTR(create_vgt_instance, 0220, NULL, xengt_sysfs_instance_manage); + +static struct kobj_attribute xengt_vm_attr = +__ATTR(vgpu_id, 0440, xengt_sysfs_vgpu_id, NULL); + +static struct kobj_attribute xengt_sch_attr = +__ATTR(schedule, 0220, NULL, xengt_sysfs_vgpu_schedule); + +static struct attribute *xengt_ctrl_attrs[] = { + &xengt_instance_attr.attr, + NULL, /* need to NULL terminate the list of attributes */ +}; + +static struct attribute *xengt_vm_attrs[] = { + &xengt_vm_attr.attr, + &xengt_sch_attr.attr, + NULL, /* need to NULL terminate the list of attributes */ +}; + +const struct sysfs_ops xengt_kobj_sysfs_ops = { + .show = kobj_attr_show, + .store = kobj_attr_store, +}; + +static struct kobj_type xengt_instance_ktype = { + .sysfs_ops = &xengt_kobj_sysfs_ops, + .default_attrs = xengt_vm_attrs, +}; + +static struct kobj_type xengt_ctrl_ktype = { + .sysfs_ops = &xengt_kobj_sysfs_ops, + .default_attrs = xengt_ctrl_attrs, +}; + +static ssize_t +device_state_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, loff_t off, size_t count) +{ + struct xengt_hvm_dev *info = container_of((kobj), struct xengt_hvm_dev, kobj); + struct intel_vgpu *vgpu = info->vgpu; + void *base = info->dev_state; + + if (!count || off < 0 || (off + count > bin_attr->size) || (off & 0x3)) + return -EINVAL; + + if (info->dev_state == NULL) + return -EINVAL; + + if (intel_gvt_ops->vgpu_save_restore(vgpu, + buf, count, base, 0, false) != 0) + return -EINVAL; + + memcpy(buf, base + off, count); + + return count; +} + +static ssize_t +device_state_write(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, loff_t off, size_t count) +{ + struct xengt_hvm_dev *info = container_of((kobj), struct xengt_hvm_dev, kobj); + struct intel_vgpu *vgpu = info->vgpu; + void *base = info->dev_state; + + if (!count || off < 0 || (off + count > bin_attr->size) || (off & 0x3)) + return -EINVAL; + + if (info->dev_state == NULL) + return -EINVAL; + + memcpy(base + off, buf, count); + + if ((off + count) 
== bin_attr->size) { + if (intel_gvt_ops->vgpu_save_restore(vgpu, + buf, count, base, 0, true) != 0) + return -EINVAL; + } + + return count; +} + +static struct bin_attribute vgpu_state_attr = { + .attr = { + .name = "device_state", + .mode = 0660 + }, + .size = MIGRATION_IMG_MAX_SIZE, + .read = device_state_read, + .write = device_state_write, +}; + +static struct intel_vgpu_type *xengt_choose_vgpu_type( + struct xengt_hvm_params *vp) +{ + struct intel_vgpu_type *vgpu_type; + unsigned int i; + + for (i = 0; i < xengt_priv.gvt->num_types; i++) { + vgpu_type = &xengt_priv.gvt->types[i]; + if ((vgpu_type->low_gm_size >> 20) == vp->aperture_sz) { + gvt_dbg_core("choose vgpu type:%d\n", i); + return vgpu_type; + } + } + + gvt_err("specify a wrong low_gm_sz in hvm.cfg: %d\n", vp->aperture_sz); + return NULL; +} + +static int xengt_sysfs_add_instance(struct xengt_hvm_params *vp) +{ + int ret = 0; + struct intel_vgpu *vgpu; + struct xengt_hvm_dev *info; + struct intel_vgpu_type *type; + + type = xengt_choose_vgpu_type(vp); + if (type == NULL) { + gvt_err("choose vgpu type failed"); + return -EINVAL; + } + mutex_lock(&gvt_sysfs_lock); + vgpu = xengt_instance_create(vp->vm_id, type); + mutex_unlock(&gvt_sysfs_lock); + if (vgpu == NULL) { + gvt_err("xengt_sysfs_add_instance failed.\n"); + ret = -EINVAL; + } else { + info = (struct xengt_hvm_dev *) vgpu->handle; + xengt_priv.vgpus[vgpu->id - 1] = vgpu; + gvt_dbg_core("add xengt instance for vm-%d with vgpu-%d.\n", + vp->vm_id, vgpu->id); + + kobject_init(&info->kobj, &xengt_instance_ktype); + info->kobj.kset = gvt_kset; + /* add kobject, NULL parent indicates using kset as parent */ + ret = kobject_add(&info->kobj, NULL, "vm%u", info->vm_id); + if (ret) { + gvt_err("%s: kobject add error: %d\n", __func__, ret); + kobject_put(&info->kobj); + } + + ret = sysfs_create_bin_file(&info->kobj, &vgpu_state_attr); + if (ret) { + gvt_err("%s: kobject add error: %d\n", __func__, ret); + kobject_put(&info->kobj); + } + } + + return ret; +} + +static struct intel_vgpu *vgpu_from_vm_id(int vm_id) +{ + int i; + + /* vm_id is negtive in del_instance call */ + if (vm_id < 0) + vm_id = -vm_id; + for (i = 0; i < GVT_MAX_VGPU_INSTANCE; i++) { + if (xengt_priv.vgpus[i]) { + struct xengt_hvm_dev *info = (struct xengt_hvm_dev *) + (xengt_priv.vgpus[i]->handle); + if (info->vm_id == vm_id) + return xengt_priv.vgpus[i]; + } + } + return NULL; +} + +static int xengt_sysfs_del_instance(struct xengt_hvm_params *vp) +{ + int ret = 0; + struct intel_vgpu *vgpu = vgpu_from_vm_id(vp->vm_id); + struct xengt_hvm_dev *info; + + if (vgpu) { + gvt_dbg_core("xengt: remove vm-%d sysfs node.\n", vp->vm_id); + + info = (struct xengt_hvm_dev *) vgpu->handle; + kobject_put(&info->kobj); + + mutex_lock(&gvt_sysfs_lock); + xengt_priv.vgpus[vgpu->id - 1] = NULL; + xengt_instance_destroy(vgpu); + mutex_unlock(&gvt_sysfs_lock); + } + + return ret; +} + +static ssize_t xengt_sysfs_vgpu_id(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int i; + + for (i = 0; i < GVT_MAX_VGPU_INSTANCE; i++) { + if (xengt_priv.vgpus[i] && + (kobj == &((struct xengt_hvm_dev *) + (xengt_priv.vgpus[i]->handle))->kobj)) { + return sprintf(buf, "%d\n", xengt_priv.vgpus[i]->id); + } + } + return 0; +} + +static ssize_t xengt_sysfs_instance_manage(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct xengt_hvm_params vp; + int param_cnt; + char param_str[64]; + int rc; + int high_gm_sz; + int low_gm_sz; + + /* We expect the param_str should be vmid,a,b,c (where 
the guest + * wants a MB aperture and b MB gm, and c fence registers) or -vmid + * (where we want to release the vgt instance). + */ + (void)sscanf(buf, "%63s", param_str); + param_cnt = sscanf(param_str, "%d,%d,%d,%d,%d,%d", &vp.vm_id, + &low_gm_sz, &high_gm_sz, &vp.fence_sz, &vp.gvt_primary, + &vp.cap); + vp.aperture_sz = low_gm_sz; + vp.gm_sz = high_gm_sz + low_gm_sz; + if (param_cnt == 1) { + if (vp.vm_id >= 0) + return -EINVAL; + } else if (param_cnt == 4 || param_cnt == 5 || param_cnt == 6) { + if (!(vp.vm_id > 0 && vp.aperture_sz > 0 && + vp.aperture_sz <= vp.gm_sz && vp.fence_sz > 0)) + return -EINVAL; + + if (param_cnt == 5 || param_cnt == 6) { + /* -1/0/1 means: not-specified, non-primary, primary */ + if (vp.gvt_primary < -1 || vp.gvt_primary > 1) + return -EINVAL; + if (vp.cap < 0 || vp.cap > 100) + return -EINVAL; + } else { + vp.cap = 0; /* default 0 means no upper cap. */ + vp.gvt_primary = -1; /* no valid value specified. */ + } + } else + return -EINVAL; + + rc = (vp.vm_id > 0) ? xengt_sysfs_add_instance(&vp) : + xengt_sysfs_del_instance(&vp); + + return rc < 0 ? rc : count; +} + +static int xengt_hvm_modified_memory(struct xengt_hvm_dev *info, uint64_t start_pfn) +{ + xen_dm_op_buf_t dm_buf[2]; + struct xen_dm_op op; + struct xen_dm_op_modified_memory *header; + struct xen_dm_op_modified_memory_extent data; + int rc; + + memset(&op, 0, sizeof(op)); + memset(&data, 0, sizeof(data)); + + op.op = XEN_DMOP_modified_memory; + header = &op.u.modified_memory; + header->nr_extents = 1; + + data.nr = 1; + data.first_pfn = start_pfn; + + dm_buf[0].h = &op; + dm_buf[0].size = sizeof(op); + + dm_buf[1].h = &data; + dm_buf[1].size = sizeof(data); + + rc = HYPERVISOR_dm_op(info->vm_id, 2, &dm_buf); + + if (rc < 0) + gvt_err("Cannot modified memory: %d!\n", rc); + + return rc; +} + +static void xengt_logd_sync(struct xengt_hvm_dev *info) +{ + struct gvt_logd_pfn *logd, *next; + + mutex_lock(&info->logd_lock); + rbtree_postorder_for_each_entry_safe(logd, next, + &info->logd_list, node) + xengt_hvm_modified_memory(info, logd->gfn); + mutex_unlock(&info->logd_lock); +} + +static ssize_t xengt_sysfs_vgpu_schedule(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct xengt_hvm_dev *info = + container_of((kobj), struct xengt_hvm_dev, kobj); + struct intel_vgpu *vgpu = info->vgpu; + int running; + + mutex_lock(&gvt_sysfs_lock); + if (sscanf(buf, "%d", &running) != 1) { + mutex_unlock(&gvt_sysfs_lock); + return -EINVAL; + } + + if (running) { + if (info->iosrv_enabled == 0) { + hvm_claim_ioreq_server_type(info, 1); + xen_hvm_toggle_iorequest_server(info, true); + } + intel_gvt_ops->vgpu_activate(vgpu); + } else { + intel_gvt_ops->vgpu_deactivate(vgpu); + if (info->iosrv_enabled != 0) { + hvm_claim_ioreq_server_type(info, 0); + xen_hvm_toggle_iorequest_server(info, false); + } + xengt_logd_sync(info); + } + + mutex_unlock(&gvt_sysfs_lock); + + return count; +} + +int xengt_sysfs_init(struct intel_gvt *gvt) +{ + int ret; + + /* + * TODO. + * keep the name of 'vgt', not 'gvt', so that current tool kit + * still could be used. 
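+	 *
+	 * With the names used below, the resulting layout should look
+	 * roughly like:
+	 *
+	 *	/sys/kernel/vgt/control/create_vgt_instance
+	 *	/sys/kernel/vgt/vm<N>/vgpu_id
+	 *	/sys/kernel/vgt/vm<N>/schedule
+	 *	/sys/kernel/vgt/vm<N>/device_state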
+ */ + gvt_kset = kset_create_and_add("vgt", NULL, kernel_kobj); + if (!gvt_kset) { + ret = -ENOMEM; + goto kset_fail; + } + + gvt_ctrl_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL); + if (!gvt_ctrl_kobj) { + ret = -ENOMEM; + goto ctrl_fail; + } + + gvt_ctrl_kobj->kset = gvt_kset; + ret = kobject_init_and_add(gvt_ctrl_kobj, &xengt_ctrl_ktype, + NULL, "control"); + if (ret) { + ret = -EINVAL; + goto kobj_fail; + } + + return 0; + +kobj_fail: + kobject_put(gvt_ctrl_kobj); +ctrl_fail: + kset_unregister(gvt_kset); +kset_fail: + return ret; +} + +void xengt_sysfs_del(void) +{ + kobject_put(gvt_ctrl_kobj); + kset_unregister(gvt_kset); +} + +/* Translate from VM's guest pfn to machine pfn */ +static unsigned long xen_g2m_pfn(domid_t vm_id, unsigned long g_pfn) +{ + struct xen_get_mfn_from_pfn pfn_arg; + int rc; + unsigned long pfn_list[1]; + + pfn_list[0] = g_pfn; + + set_xen_guest_handle(pfn_arg.pfn_list, pfn_list); + pfn_arg.nr_pfns = 1; + pfn_arg.domid = vm_id; + + rc = HYPERVISOR_memory_op(XENMEM_get_mfn_from_pfn, &pfn_arg); + if (rc < 0) { + gvt_err("failed to get mfn for gpfn 0x%lx: %d\n", g_pfn, rc); + return INTEL_GVT_INVALID_ADDR; + } + + return pfn_list[0]; +} + +static int xen_get_max_gpfn(domid_t vm_id) +{ + domid_t dom_id = vm_id; + int max_gpfn = HYPERVISOR_memory_op(XENMEM_maximum_gpfn, &dom_id); + + if (max_gpfn < 0) + max_gpfn = 0; + return max_gpfn; +} + +static int xen_pause_domain(domid_t vm_id) +{ + int rc; + struct xen_domctl domctl; + + domctl.domain = vm_id; + domctl.cmd = XEN_DOMCTL_pausedomain; + domctl.interface_version = XEN_DOMCTL_INTERFACE_VERSION; + + rc = HYPERVISOR_domctl(&domctl); + if (rc != 0) + gvt_dbg_core("xen_pause_domain fail: %d!\n", rc); + + return rc; +} + +static int xen_shutdown_domain(domid_t vm_id) +{ + int rc; + struct sched_remote_shutdown r; + + r.reason = SHUTDOWN_crash; + r.domain_id = vm_id; + rc = HYPERVISOR_sched_op(SCHEDOP_remote_shutdown, &r); + if (rc != 0) + gvt_dbg_core("xen_shutdown_domain failed: %d\n", rc); + return rc; +} + +static int xen_domain_iomem_perm(domid_t domain_id, uint64_t first_mfn, + uint64_t nr_mfns, uint8_t allow_access) +{ + struct xen_domctl arg; + int rc; + + arg.domain = domain_id; + arg.cmd = XEN_DOMCTL_iomem_permission; + arg.interface_version = XEN_DOMCTL_INTERFACE_VERSION; + arg.u.iomem_perm.first_mfn = first_mfn; + arg.u.iomem_perm.nr_mfns = nr_mfns; + arg.u.iomem_perm.allow_access = allow_access; + rc = HYPERVISOR_domctl(&arg); + + return rc; +} + +static int xen_get_nr_vcpu(domid_t vm_id) +{ + struct xen_domctl arg; + int rc; + + arg.domain = vm_id; + arg.cmd = XEN_DOMCTL_getdomaininfo; + arg.interface_version = XEN_DOMCTL_INTERFACE_VERSION; + + rc = HYPERVISOR_domctl(&arg); + if (rc < 0) { + gvt_err("HYPERVISOR_domctl fail ret=%d\n", rc); + /* assume it is UP */ + return 1; + } + + return arg.u.getdomaininfo.max_vcpu_id + 1; +} + +static int xen_hvm_memory_mapping(domid_t vm_id, uint64_t first_gfn, + uint64_t first_mfn, uint32_t nr_mfns, uint32_t add_mapping) +{ + struct xen_domctl arg; + int rc = 0, err = 0; + unsigned long done = 0, mapping_sz = 64; + + if (add_mapping) { + rc = xen_domain_iomem_perm(vm_id, first_mfn, nr_mfns, 1); + if (rc < 0) { + gvt_err("xen_domain_iomem_perm failed: %d\n", rc); + return rc; + } + } + + arg.domain = vm_id; + arg.cmd = XEN_DOMCTL_memory_mapping; + arg.interface_version = XEN_DOMCTL_INTERFACE_VERSION; + arg.u.memory_mapping.add_mapping = add_mapping; + +retry: + if (nr_mfns > 0 && mapping_sz > 0) { + while (done < nr_mfns) { + mapping_sz = min(nr_mfns - done, 
mapping_sz); + arg.u.memory_mapping.nr_mfns = mapping_sz; + arg.u.memory_mapping.first_gfn = first_gfn + done; + arg.u.memory_mapping.first_mfn = first_mfn + done; + err = HYPERVISOR_domctl(&arg); + if (err == -E2BIG) { + mapping_sz /= 2; + goto retry; + } + //Save first error status. + if (!rc) + rc = err; + + if (err && add_mapping != DPCI_REMOVE_MAPPING) + break; + done += mapping_sz; + } + + //Undo operation, if some error to mapping. + if (rc && add_mapping != DPCI_REMOVE_MAPPING) { + xen_hvm_memory_mapping(vm_id, first_gfn, first_mfn, + nr_mfns, DPCI_REMOVE_MAPPING); + } + } + + if (rc < 0) { + gvt_err("map fail: %d gfn:0x%llx mfn:0x%llx nr:%d\n", + rc, first_gfn, first_mfn, nr_mfns); + return rc; + } + + if (!add_mapping) { + rc = xen_domain_iomem_perm(vm_id, first_mfn, nr_mfns, 0); + if (rc < 0) { + gvt_err("xen_domain_iomem_perm failed: %d\n", rc); + return rc; + } + } + + return rc; +} + +static int xen_hvm_create_iorequest_server(struct xengt_hvm_dev *info) +{ + xen_dm_op_buf_t dm_buf; + struct xen_dm_op op; + struct xen_dm_op_create_ioreq_server *data; + int r; + + memset(&op, 0, sizeof(op)); + + op.op = XEN_DMOP_create_ioreq_server; + data = &op.u.create_ioreq_server; + data->handle_bufioreq = 0; + + dm_buf.h = &op; + dm_buf.size = sizeof(op); + + r = HYPERVISOR_dm_op(info->vm_id, 1, &dm_buf); + if (r < 0) { + gvt_err("Cannot create io-requset server: %d!\n", r); + return r; + } + info->iosrv_id = data->id; + + return r; +} + +static int xen_hvm_toggle_iorequest_server(struct xengt_hvm_dev *info, bool enable) +{ + xen_dm_op_buf_t dm_buf; + struct xen_dm_op op; + struct xen_dm_op_set_ioreq_server_state *data; + int r; + + if (info->iosrv_enabled == !!enable) + return 0; + + info->iosrv_enabled = !!enable; + + memset(&op, 0, sizeof(op)); + + op.op = XEN_DMOP_set_ioreq_server_state; + data = &op.u.set_ioreq_server_state; + data->id = info->iosrv_id; + data->enabled = !!enable; + + dm_buf.h = &op; + dm_buf.size = sizeof(op); + + r = HYPERVISOR_dm_op(info->vm_id, 1, &dm_buf); + if (r < 0) { + gvt_err("Cannot %s io-request server: %d!\n", + enable ? 
"enable" : "disbale", r); + return r; + } + + return r; +} + +static int xen_hvm_get_ioreq_pfn(struct xengt_hvm_dev *info, uint64_t *value) +{ + xen_dm_op_buf_t dm_buf; + struct xen_dm_op op; + struct xen_dm_op_get_ioreq_server_info *data; + int r; + + memset(&op, 0, sizeof(op)); + + op.op = XEN_DMOP_get_ioreq_server_info; + data = &op.u.get_ioreq_server_info; + data->id = info->iosrv_id; + + dm_buf.h = &op; + dm_buf.size = sizeof(op); + + r = HYPERVISOR_dm_op(info->vm_id, 1, &dm_buf); + if (r < 0) { + gvt_err("Cannot get ioreq pfn: %d!\n", r); + return r; + } + *value = data->ioreq_pfn; + return r; +} + +static int xen_hvm_destroy_iorequest_server(struct xengt_hvm_dev *info) +{ + xen_dm_op_buf_t dm_buf; + struct xen_dm_op op; + struct xen_dm_op_destroy_ioreq_server *data; + int r; + + memset(&op, 0, sizeof(op)); + + op.op = XEN_DMOP_destroy_ioreq_server; + data = &op.u.destroy_ioreq_server; + data->id = info->iosrv_id; + + dm_buf.h = &op; + dm_buf.size = sizeof(op); + + r = HYPERVISOR_dm_op(info->vm_id, 1, &dm_buf); + if (r < 0) { + gvt_err("Cannot destroy io-request server(%d): %d!\n", + info->iosrv_id, r); + return r; + } + info->iosrv_id = 0; + + return r; +} + +static struct vm_struct *xen_hvm_map_iopage(struct xengt_hvm_dev *info) +{ + uint64_t ioreq_pfn; + int rc; + + rc = xen_hvm_create_iorequest_server(info); + if (rc < 0) + return NULL; + rc = xen_hvm_get_ioreq_pfn(info, &ioreq_pfn); + if (rc < 0) { + xen_hvm_destroy_iorequest_server(info); + return NULL; + } + + return xen_remap_domain_mfn_range_in_kernel(ioreq_pfn, 1, info->vm_id); +} + +static int xen_hvm_map_io_range_to_ioreq_server(struct xengt_hvm_dev *info, + int is_mmio, uint64_t start, uint64_t end, int map) +{ + xen_dm_op_buf_t dm_buf; + struct xen_dm_op op; + struct xen_dm_op_ioreq_server_range *data; + int r; + + memset(&op, 0, sizeof(op)); + + op.op = map ? XEN_DMOP_map_io_range_to_ioreq_server : + XEN_DMOP_unmap_io_range_from_ioreq_server; + data = map ? &op.u.map_io_range_to_ioreq_server : + &op.u.unmap_io_range_from_ioreq_server; + data->id = info->iosrv_id; + data->type = is_mmio ? XEN_DMOP_IO_RANGE_MEMORY : + XEN_DMOP_IO_RANGE_PORT; + data->start = start; + data->end = end; + + dm_buf.h = &op; + dm_buf.size = sizeof(op); + + r = HYPERVISOR_dm_op(info->vm_id, 1, &dm_buf); + if (r < 0) { + gvt_err("Couldn't %s io_range 0x%llx ~ 0x%llx, vm_id:%d:%d\n", + map ? "map" : "unmap", + start, end, info->vm_id, r); + } + return r; +} + +static int xen_hvm_map_pcidev_to_ioreq_server(struct xengt_hvm_dev *info, + uint64_t sbdf) +{ + xen_dm_op_buf_t dm_buf; + struct xen_dm_op op; + struct xen_dm_op_ioreq_server_range *data; + int r; + + memset(&op, 0, sizeof(op)); + + op.op = XEN_DMOP_map_io_range_to_ioreq_server; + data = &op.u.map_io_range_to_ioreq_server; + data->id = info->iosrv_id; + data->type = XEN_DMOP_IO_RANGE_PCI; + data->start = data->end = sbdf; + + dm_buf.h = &op; + dm_buf.size = sizeof(op); + + r = HYPERVISOR_dm_op(info->vm_id, 1, &dm_buf); + if (r < 0) + gvt_err("Cannot map pci_dev to ioreq_server: %d!\n", r); + + return r; +} + +static int hvm_claim_ioreq_server_type(struct xengt_hvm_dev *info, + uint32_t set) +{ + + xen_dm_op_buf_t dm_buf; + struct xen_dm_op op; + struct xen_dm_op_map_mem_type_to_ioreq_server *data; + int r; + + memset(&op, 0, sizeof(op)); + + op.op = XEN_DMOP_map_mem_type_to_ioreq_server; + data = &op.u.map_mem_type_to_ioreq_server; + data->id = info->iosrv_id; + data->type = HVMMEM_ioreq_server; + data->flags = (set == 1) ? 
XEN_DMOP_IOREQ_MEM_ACCESS_WRITE : 0; + + dm_buf.h = &op; + dm_buf.size = sizeof(op); + + r = HYPERVISOR_dm_op(info->vm_id, 1, &dm_buf); + if (r < 0) + gvt_err("Cannot map mem type to ioreq_server\n"); + + return r; +} + +static int xen_hvm_set_mem_type(domid_t vm_id, uint16_t mem_type, + uint64_t first_pfn, uint64_t nr) +{ + xen_dm_op_buf_t dm_buf; + struct xen_dm_op op; + struct xen_dm_op_set_mem_type *data; + int r; + + memset(&op, 0, sizeof(op)); + + op.op = XEN_DMOP_set_mem_type; + data = &op.u.set_mem_type; + + data->mem_type = mem_type; + data->first_pfn = first_pfn; + data->nr = nr; + + dm_buf.h = &op; + dm_buf.size = sizeof(op); + + r = HYPERVISOR_dm_op(vm_id, 1, &dm_buf); + if (r < 0) { + gvt_err("Cannot set mem type for 0x%llx ~ 0x%llx, memtype: %x\n", + first_pfn, first_pfn+nr, mem_type); + } + return r; +} + +static int xen_hvm_wp_page_to_ioreq_server(struct xengt_hvm_dev *info, + unsigned long page, bool set) +{ + int rc = 0; + uint16_t mem_type; + + mem_type = set ? HVMMEM_ioreq_server : HVMMEM_ram_rw; + rc = xen_hvm_set_mem_type(info->vm_id, mem_type, page, 1); + if (rc < 0) { + gvt_err("set mem type of page 0x%lx to %s fail - %d!\n", page, + set ? "HVMMEM_ioreq_server" : "HVMMEM_ram_rw", rc); + } + + return rc; +} + +static int xengt_map_gfn_to_mfn(unsigned long handle, unsigned long gfn, + unsigned long mfn, unsigned int nr, bool map) +{ + int rc; + struct xengt_hvm_dev *info = (struct xengt_hvm_dev *)handle; + + if (!info) + return -EINVAL; + + if (info->on_destroy) + return 0; + + rc = xen_hvm_memory_mapping(info->vm_id, gfn, mfn, nr, + map ? DPCI_ADD_MAPPING : DPCI_REMOVE_MAPPING); + if (rc != 0) + gvt_err("xen_hvm_memory_mapping failed: %d\n", rc); + return rc; +} + +static int xengt_set_trap_area(unsigned long handle, u64 start, + u64 end, bool map) +{ + struct xengt_hvm_dev *info = (struct xengt_hvm_dev *)handle; + + if (!info) + return -EINVAL; + + return xen_hvm_map_io_range_to_ioreq_server(info, 1, start, end, map); +} + +static int xengt_set_wp_page(unsigned long handle, u64 gfn) +{ + int r; + struct xengt_hvm_dev *info = (struct xengt_hvm_dev *)handle; + + if (!info) + return -EINVAL; + + if (info->on_destroy) + return 0; + + r = xen_hvm_wp_page_to_ioreq_server(info, gfn, true); + if (r) { + gvt_err("fail to set write protection.\n"); + return -EFAULT; + } + + return 0; +} + +static int xengt_unset_wp_page(unsigned long handle, u64 gfn) +{ + int r; + struct xengt_hvm_dev *info = (struct xengt_hvm_dev *)handle; + + if (!info) + return -EINVAL; + + if (info->on_destroy) + return 0; + + r = xen_hvm_wp_page_to_ioreq_server(info, gfn, false); + if (r) { + gvt_err("fail to clear write protection.\n"); + return -EFAULT; + } + + return 0; +} + +static int xengt_hvm_vmem_init(struct intel_vgpu *vgpu) +{ + unsigned long i, j, gpfn, count; + unsigned long nr_low_1mb_bkt, nr_high_bkt, nr_high_4k_bkt; + struct xengt_hvm_dev *info = (struct xengt_hvm_dev *)vgpu->handle; + + if (!info->vm_id) + return 0; + + info->vmem_sz = xen_get_max_gpfn(info->vm_id); + info->vmem_sz <<= PAGE_SHIFT; + + nr_low_1mb_bkt = VMEM_1MB >> PAGE_SHIFT; + nr_high_bkt = (info->vmem_sz >> VMEM_BUCK_SHIFT); + nr_high_4k_bkt = (info->vmem_sz >> PAGE_SHIFT); + + info->vmem_vma_low_1mb = + vzalloc(sizeof(*info->vmem_vma) * nr_low_1mb_bkt); + info->vmem_vma = + vzalloc(sizeof(*info->vmem_vma) * nr_high_bkt); + info->vmem_vma_4k = /* TODO: really needs so big array for every page? 
*/ + vzalloc(sizeof(*info->vmem_vma) * nr_high_4k_bkt); + + if (info->vmem_vma_low_1mb == NULL || info->vmem_vma == NULL || + info->vmem_vma_4k == NULL) { + gvt_err("Insufficient memory for vmem_vma, vmem_sz=0x%llx\n", + info->vmem_sz); + goto err; + } + + /* map the low 1MB memory */ + for (i = 0; i < nr_low_1mb_bkt; i++) { + info->vmem_vma_low_1mb[i] = + xen_remap_domain_mfn_range_in_kernel(i, 1, info->vm_id); + + if (info->vmem_vma_low_1mb[i] != NULL) + continue; + + /* Don't warn on [0xa0000, 0x100000): a known non-RAM hole */ + if (i < (0xa0000 >> PAGE_SHIFT)) + gvt_err("VM%d: can't map GPFN %ld!\n", info->vm_id, i); + } + + count = 0; + /* map the >1MB memory */ + for (i = 1; i < nr_high_bkt; i++) { + gpfn = i << (VMEM_BUCK_SHIFT - PAGE_SHIFT); + info->vmem_vma[i] = xen_remap_domain_mfn_range_in_kernel( + gpfn, VMEM_BUCK_SIZE >> PAGE_SHIFT, info->vm_id); + + if (info->vmem_vma[i] != NULL) + continue; + + /* for <4G GPFNs: skip the hole after low_mem_max_gpfn */ + if (gpfn < (1 << (32 - PAGE_SHIFT)) && + vgpu->low_mem_max_gpfn != 0 && + gpfn > vgpu->low_mem_max_gpfn) + continue; + + for (j = gpfn; + j < ((i + 1) << (VMEM_BUCK_SHIFT - PAGE_SHIFT)); + j++) { + info->vmem_vma_4k[j] = + xen_remap_domain_mfn_range_in_kernel(j, 1, + info->vm_id); + + if (info->vmem_vma_4k[j]) { + count++; + gvt_dbg_mm("map 4k gpa (%lx)\n", j << PAGE_SHIFT); + } + } + + /* To reduce the number of err messages(some of them, due to + * the MMIO hole, are spurious and harmless) we only print a + * message if it's at every 64MB boundary or >4GB memory. + */ + if (!info->vmem_vma_4k[gpfn] && + ((i % 64 == 0) || (i >= (1ULL << (32 - VMEM_BUCK_SHIFT))))) + gvt_dbg_mm("VM%d: can't map gpfn 0x%lx\n", info->vm_id, gpfn); + } + + return 0; +err: + vfree(info->vmem_vma); + vfree(info->vmem_vma_low_1mb); + vfree(info->vmem_vma_4k); + info->vmem_vma = info->vmem_vma_low_1mb = info->vmem_vma_4k = NULL; + return -ENOMEM; +} + +static void xengt_vmem_destroy(struct intel_vgpu *vgpu) +{ + int i, j; + unsigned long nr_low_1mb_bkt, nr_high_bkt, nr_high_bkt_4k; + struct xengt_hvm_dev *info = (struct xengt_hvm_dev *)vgpu->handle; + + if (!info || info->vm_id == 0) + return; + + /* + * Maybe the VM hasn't accessed GEN MMIO(e.g., still in the legacy VGA + * mode), so no mapping is created yet. 
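+	 *
+	 * For reference, the three arrays torn down here mirror the lookup
+	 * done in xengt_gpa_to_va(), roughly:
+	 *
+	 *	gpa < 1MB		-> vmem_vma_low_1mb[gpa >> PAGE_SHIFT]
+	 *	1MB bucket mapped	-> vmem_vma[gpa >> VMEM_BUCK_SHIFT]
+	 *	fallback		-> vmem_vma_4k[gpa >> PAGE_SHIFT]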
+ */ + if (info->vmem_vma == NULL && info->vmem_vma_low_1mb == NULL) + return; + + nr_low_1mb_bkt = VMEM_1MB >> PAGE_SHIFT; + nr_high_bkt = (info->vmem_sz >> VMEM_BUCK_SHIFT); + nr_high_bkt_4k = (info->vmem_sz >> PAGE_SHIFT); + + for (i = 0; i < nr_low_1mb_bkt; i++) { + if (info->vmem_vma_low_1mb[i] == NULL) + continue; + xen_unmap_domain_mfn_range_in_kernel(info->vmem_vma_low_1mb[i], + 1, info->vm_id); + } + + for (i = 1; i < nr_high_bkt; i++) { + if (info->vmem_vma[i] == NULL) { + for (j = (i << (VMEM_BUCK_SHIFT - PAGE_SHIFT)); + j < ((i + 1) << (VMEM_BUCK_SHIFT - PAGE_SHIFT)); + j++) { + if (info->vmem_vma_4k[j] == NULL) + continue; + xen_unmap_domain_mfn_range_in_kernel( + info->vmem_vma_4k[j], 1, info->vm_id); + } + continue; + } + xen_unmap_domain_mfn_range_in_kernel( + info->vmem_vma[i], VMEM_BUCK_SIZE >> PAGE_SHIFT, + info->vm_id); + } + + vfree(info->vmem_vma); + vfree(info->vmem_vma_low_1mb); + vfree(info->vmem_vma_4k); +} + +static uint64_t intel_vgpu_get_bar0_addr(struct intel_vgpu *vgpu) +{ + u32 start_lo, start_hi; + u32 mem_type; + int pos = PCI_BASE_ADDRESS_0; + + start_lo = (*(u32 *)(vgpu->cfg_space.virtual_cfg_space + pos)) & + PCI_BASE_ADDRESS_MEM_MASK; + mem_type = (*(u32 *)(vgpu->cfg_space.virtual_cfg_space + pos)) & + PCI_BASE_ADDRESS_MEM_TYPE_MASK; + + switch (mem_type) { + case PCI_BASE_ADDRESS_MEM_TYPE_64: + start_hi = (*(u32 *)(vgpu->cfg_space.virtual_cfg_space + + pos + 4)); + break; + case PCI_BASE_ADDRESS_MEM_TYPE_32: + case PCI_BASE_ADDRESS_MEM_TYPE_1M: + /* 1M mem BAR treated as 32-bit BAR */ + default: + /* mem unknown type treated as 32-bit BAR */ + start_hi = 0; + break; + } + + return ((u64)start_hi << 32) | start_lo; +} + +static int xengt_hvm_mmio_emulation(struct intel_vgpu *vgpu, + struct ioreq *req) +{ + int i, sign; + void *gva; + unsigned long gpa; + uint64_t base = intel_vgpu_get_bar0_addr(vgpu); + uint64_t tmp; + int pvinfo_page; + struct xengt_hvm_dev *info = (struct xengt_hvm_dev *)vgpu->handle; + + if (info->vmem_vma == NULL) { + tmp = req->addr - base; + pvinfo_page = (tmp >= VGT_PVINFO_PAGE + && tmp < (VGT_PVINFO_PAGE + VGT_PVINFO_SIZE)); + /* + * hvmloader will read PVINFO to identify if HVM is in VGT + * or VTD. So we don't trigger HVM mapping logic here. + */ + if (!pvinfo_page && xengt_hvm_vmem_init(vgpu) < 0) { + gvt_err("can not map the memory of VM%d!!!\n", + info->vm_id); + return -EINVAL; + } + } + + sign = req->df ? 
-1 : 1; + + if (req->dir == IOREQ_READ) { + /* MMIO READ */ + if (!req->data_is_ptr) { + if (req->count != 1) + goto err_ioreq_count; + + if (intel_gvt_ops->emulate_mmio_read(vgpu, req->addr, + &req->data, req->size)) + return -EINVAL; + } else { + for (i = 0; i < req->count; i++) { + if (intel_gvt_ops->emulate_mmio_read(vgpu, + req->addr + sign * i * req->size, + &tmp, req->size)) + return -EINVAL; + + gpa = req->data + sign * i * req->size; + gva = xengt_gpa_to_va((unsigned long)info, gpa); + if (!gva) { + gvt_err("vGT: can not read gpa = 0x%lx!!!\n", gpa); + return -EFAULT; + } + memcpy(gva, &tmp, req->size); + } + } + } else { /* MMIO Write */ + if (!req->data_is_ptr) { + if (req->count != 1) + goto err_ioreq_count; + if (intel_gvt_ops->emulate_mmio_write(vgpu, + req->addr, + &req->data, req->size)) + return -EINVAL; + } else { + for (i = 0; i < req->count; i++) { + gpa = req->data + sign * i * req->size; + gva = xengt_gpa_to_va((unsigned long)info, gpa); + if (!gva) { + gvt_err("VM %d mmio access invalid gpa: 0x%lx.\n", + info->vm_id, gpa); + return -EFAULT; + } + + memcpy(&tmp, gva, req->size); + if (intel_gvt_ops->emulate_mmio_write(vgpu, + req->addr + sign * i * req->size, + &tmp, req->size)) + return -EINVAL; + } + } + } + + return 0; + +err_ioreq_count: + gvt_err("VM(%d): Unexpected %s request count(%d)\n", + info->vm_id, req->dir == IOREQ_READ ? "read" : "write", + req->count); + return -EINVAL; +} + +static bool xengt_write_cfg_space(struct intel_vgpu *vgpu, + uint64_t addr, unsigned int bytes, unsigned long val) +{ + /* Low 32 bit of addr is real address, high 32 bit is bdf */ + unsigned int port = addr & 0xffffffff; + + if (port == PCI_VENDOR_ID) { + /* Low 20 bit of val are valid low mem gpfn. */ + val &= 0xfffff; + vgpu->low_mem_max_gpfn = val; + return true; + } + if (intel_gvt_ops->emulate_cfg_write(vgpu, port, &val, bytes)) + return false; + return true; +} + +static bool xengt_read_cfg_space(struct intel_vgpu *vgpu, + uint64_t addr, unsigned int bytes, unsigned long *val) +{ + unsigned long data; + /* Low 32 bit of addr is real address, high 32 bit is bdf */ + unsigned int port = addr & 0xffffffff; + + if (intel_gvt_ops->emulate_cfg_read(vgpu, port, &data, bytes)) + return false; + memcpy(val, &data, bytes); + return true; +} + +static int xengt_hvm_pio_emulation(struct intel_vgpu *vgpu, struct ioreq *ioreq) +{ + int sign; + struct xengt_hvm_dev *info = (struct xengt_hvm_dev *)vgpu->handle; + + sign = ioreq->df ? -1 : 1; + + if (ioreq->dir == IOREQ_READ) { + /* PIO READ */ + if (!ioreq->data_is_ptr) { + if (!xengt_read_cfg_space(vgpu, + ioreq->addr, + ioreq->size, + (unsigned long *)&ioreq->data)) + return -EINVAL; + } else { + gvt_err("VGT: _hvm_pio_emulation read data_ptr %lx\n", + (long)ioreq->data); + goto err_data_ptr; + } + } else { + /* PIO WRITE */ + if (!ioreq->data_is_ptr) { + if (!xengt_write_cfg_space(vgpu, + ioreq->addr, + ioreq->size, + (unsigned long)ioreq->data)) + return -EINVAL; + } else { + gvt_err("VGT: _hvm_pio_emulation write data_ptr %lx\n", + (long)ioreq->data); + goto err_data_ptr; + } + } + return 0; +err_data_ptr: + /* The data pointer of emulation is guest physical address + * so far, which goes to Qemu emulation, but hard for + * vGT driver which doesn't know gpn_2_mfn translation. + * We may ask hypervisor to use mfn for vGT driver. + * We mark it as unsupported in case guest really it. + */ + gvt_err("VM(%d): Unsupported %s data_ptr(%lx)\n", + info->vm_id, ioreq->dir == IOREQ_READ ? 
"read" : "write", + (long)ioreq->data); + return -EINVAL; +} + +static int xengt_do_ioreq(struct intel_vgpu *vgpu, struct ioreq *ioreq) +{ + int rc = 0; + + BUG_ON(ioreq->state != STATE_IOREQ_INPROCESS); + + switch (ioreq->type) { + case IOREQ_TYPE_PCI_CONFIG: + rc = xengt_hvm_pio_emulation(vgpu, ioreq); + break; + case IOREQ_TYPE_COPY: /* MMIO */ + rc = xengt_hvm_mmio_emulation(vgpu, ioreq); + break; + case IOREQ_TYPE_INVALIDATE: + case IOREQ_TYPE_TIMEOFFSET: + break; + default: + gvt_err("Unknown ioreq type %x addr %llx size %u state %u\n", + ioreq->type, ioreq->addr, ioreq->size, ioreq->state); + rc = -EINVAL; + break; + } + + wmb(); + + return rc; +} + +static struct ioreq *xengt_get_hvm_ioreq(struct intel_vgpu *vgpu, int vcpu) +{ + struct xengt_hvm_dev *info = (struct xengt_hvm_dev *)vgpu->handle; + ioreq_t *req = &(info->iopage->vcpu_ioreq[vcpu]); + + if (req->state != STATE_IOREQ_READY) + return NULL; + + rmb(); + + req->state = STATE_IOREQ_INPROCESS; + return req; +} + +static int xengt_emulation_thread(void *priv) +{ + struct intel_vgpu *vgpu = (struct intel_vgpu *)priv; + struct xengt_hvm_dev *info = (struct xengt_hvm_dev *)vgpu->handle; + + int vcpu; + int nr_vcpus = info->nr_vcpu; + + struct ioreq *ioreq; + int irq, ret; + + gvt_dbg_core("start kthread for VM%d\n", info->vm_id); + + set_freezable(); + while (1) { + ret = wait_event_freezable(info->io_event_wq, + kthread_should_stop() || + bitmap_weight(info->ioreq_pending, nr_vcpus)); + + if (kthread_should_stop()) + return 0; + + if (ret) + gvt_err("Emulation thread(%d) waken up" + "by unexpected signal!\n", info->vm_id); + + for (vcpu = 0; vcpu < nr_vcpus; vcpu++) { + if (!test_and_clear_bit(vcpu, info->ioreq_pending)) + continue; + + ioreq = xengt_get_hvm_ioreq(vgpu, vcpu); + if (ioreq == NULL) + continue; + + if (xengt_do_ioreq(vgpu, ioreq)) { + xen_pause_domain(info->vm_id); + xen_shutdown_domain(info->vm_id); + } + + ioreq->state = STATE_IORESP_READY; + + irq = info->evtchn_irq[vcpu]; + notify_remote_via_irq(irq); + } + } + + BUG(); /* It's actually impossible to reach here */ + return 0; +} + +static inline void xengt_raise_emulation_request(struct intel_vgpu *vgpu, + int vcpu) +{ + struct xengt_hvm_dev *info = (struct xengt_hvm_dev *)vgpu->handle; + + set_bit(vcpu, info->ioreq_pending); + wake_up(&info->io_event_wq); +} + +static irqreturn_t xengt_io_req_handler(int irq, void *dev) +{ + struct intel_vgpu *vgpu; + struct xengt_hvm_dev *info; + int vcpu; + + vgpu = (struct intel_vgpu *)dev; + info = (struct xengt_hvm_dev *)vgpu->handle; + + for (vcpu = 0; vcpu < info->nr_vcpu; vcpu++) { + if (info->evtchn_irq[vcpu] == irq) + break; + } + if (vcpu == info->nr_vcpu) { + /*opps, irq is not the registered one*/ + gvt_dbg_core("Received a IOREQ w/o vcpu target\n"); + gvt_dbg_core("Possible a false request from event binding\n"); + return IRQ_NONE; + } + + xengt_raise_emulation_request(vgpu, vcpu); + + return IRQ_HANDLED; +} + +static void xengt_logd_destroy(struct xengt_hvm_dev *info) +{ + struct gvt_logd_pfn *logd; + struct rb_node *node = NULL; + + mutex_lock(&info->logd_lock); + while ((node = rb_first(&info->logd_list))) { + logd = rb_entry(node, struct gvt_logd_pfn, node); + rb_erase(&logd->node, &info->logd_list); + kfree(logd); + } + mutex_unlock(&info->logd_lock); +} + +void xengt_instance_destroy(struct intel_vgpu *vgpu) +{ + struct xengt_hvm_dev *info; + int vcpu; + + intel_gvt_ops->vgpu_deactivate(vgpu); + + info = (struct xengt_hvm_dev *)vgpu->handle; + if (info == NULL) + goto free_vgpu; + + info->on_destroy = 
true; + if (info->emulation_thread != NULL) + kthread_stop(info->emulation_thread); + + if (!info->nr_vcpu || info->evtchn_irq == NULL) + goto out1; + + if (info->iosrv_enabled != 0) { + hvm_claim_ioreq_server_type(info, 0); + xen_hvm_toggle_iorequest_server(info, false); + } + + if (info->iosrv_id != 0) + xen_hvm_destroy_iorequest_server(info); + + for (vcpu = 0; vcpu < info->nr_vcpu; vcpu++) { + if (info->evtchn_irq[vcpu] >= 0) + unbind_from_irqhandler(info->evtchn_irq[vcpu], vgpu); + } + + if (info->iopage_vma != NULL) { + xen_unmap_domain_mfn_range_in_kernel(info->iopage_vma, 1, + info->vm_id); + info->iopage_vma = NULL; + } + + kfree(info->evtchn_irq); + + if (info->dev_state) + vfree(info->dev_state); + +out1: + xengt_logd_destroy(info); + xengt_vmem_destroy(vgpu); + vgpu->handle = (unsigned long)NULL; + kfree(info); + +free_vgpu: + if (vgpu) + intel_gvt_ops->vgpu_destroy(vgpu); +} + +struct intel_vgpu *xengt_instance_create(domid_t vm_id, + struct intel_vgpu_type *vgpu_type) +{ + struct xengt_hvm_dev *info; + struct intel_vgpu *vgpu; + int vcpu, irq, rc = 0; + struct task_struct *thread; + + if (!intel_gvt_ops || !xengt_priv.gvt) + return NULL; + + vgpu = intel_gvt_ops->vgpu_create(xengt_priv.gvt, vgpu_type); + if (IS_ERR(vgpu)) + return NULL; + intel_gvt_ops->vgpu_activate(vgpu); + info = kzalloc(sizeof(struct xengt_hvm_dev), GFP_KERNEL); + if (info == NULL) + goto err; + + info->vm_id = vm_id; + info->vgpu = vgpu; + vgpu->handle = (unsigned long)info; + info->iopage_vma = xen_hvm_map_iopage(info); + if (info->iopage_vma == NULL) { + gvt_err("Failed to map HVM I/O page for VM%d\n", vm_id); + rc = -EFAULT; + goto err; + } + info->iopage = info->iopage_vma->addr; + init_waitqueue_head(&info->io_event_wq); + info->nr_vcpu = xen_get_nr_vcpu(vm_id); + info->evtchn_irq = kmalloc(info->nr_vcpu * sizeof(int), GFP_KERNEL); + if (info->evtchn_irq == NULL) { + rc = -ENOMEM; + goto err; + } + for (vcpu = 0; vcpu < info->nr_vcpu; vcpu++) + info->evtchn_irq[vcpu] = -1; + + info->dev_state = vzalloc(MIGRATION_IMG_MAX_SIZE); + if (info->dev_state == NULL) { + rc = -ENOMEM; + goto err; + } + + rc = xen_hvm_map_pcidev_to_ioreq_server(info, + PCI_BDF2(0, 0x10));//FIXME hack the dev bdf + if (rc < 0) + goto err; + + rc = hvm_claim_ioreq_server_type(info, 1); + if (rc < 0) + goto err; + + rc = xen_hvm_toggle_iorequest_server(info, 1); + if (rc < 0) + goto err; + + for (vcpu = 0; vcpu < info->nr_vcpu; vcpu++) { + irq = bind_interdomain_evtchn_to_irqhandler(vm_id, + info->iopage->vcpu_ioreq[vcpu].vp_eport, + xengt_io_req_handler, 0, + "xengt", vgpu); + if (irq < 0) { + rc = irq; + gvt_err("Failed to bind event channle: %d\n", rc); + goto err; + } + info->evtchn_irq[vcpu] = irq; + } + + thread = kthread_run(xengt_emulation_thread, vgpu, + "xengt_emulation:%d", vm_id); + if (IS_ERR(thread)) + goto err; + info->emulation_thread = thread; + + return vgpu; + +err: + xengt_instance_destroy(vgpu); + return NULL; +} + +static void *xengt_gpa_to_va(unsigned long handle, unsigned long gpa) +{ + unsigned long buck_index, buck_4k_index; + struct xengt_hvm_dev *info = (struct xengt_hvm_dev *)handle; + + if (!info->vm_id) + return (char *)mfn_to_virt(gpa>>PAGE_SHIFT) + + (gpa & (PAGE_SIZE-1)); + + if (gpa > info->vmem_sz) { + if (info->vmem_sz == 0) + xengt_hvm_vmem_init(info->vgpu); + else { + gvt_err("vGT try to access invalid gpa=0x%lx\n", gpa); + return NULL; + } + } + + /* handle the low 1MB memory */ + if (gpa < VMEM_1MB) { + buck_index = gpa >> PAGE_SHIFT; + if (!info->vmem_vma_low_1mb[buck_index]) + return 
NULL; + + return (char *)(info->vmem_vma_low_1mb[buck_index]->addr) + + (gpa & ~PAGE_MASK); + + } + + /* handle the >1MB memory */ + buck_index = gpa >> VMEM_BUCK_SHIFT; + + if (!info->vmem_vma[buck_index]) { + buck_4k_index = gpa >> PAGE_SHIFT; + if (!info->vmem_vma_4k[buck_4k_index]) { + if (buck_4k_index > info->vgpu->low_mem_max_gpfn) + gvt_err("vGT failed to map gpa=0x%lx?\n", gpa); + return NULL; + } + + return (char *)(info->vmem_vma_4k[buck_4k_index]->addr) + + (gpa & ~PAGE_MASK); + } + + return (char *)(info->vmem_vma[buck_index]->addr) + + (gpa & (VMEM_BUCK_SIZE - 1)); +} + +static int xengt_host_init(struct device *dev, void *gvt, const void *ops) +{ + int ret = -EFAULT; + + if (!gvt || !ops) + return -EINVAL; + + xengt_priv.gvt = (struct intel_gvt *)gvt; + intel_gvt_ops = (const struct intel_gvt_ops *)ops; + + ret = xengt_sysfs_init(xengt_priv.gvt); + if (ret) { + xengt_priv.gvt = NULL; + intel_gvt_ops = NULL; + } + + return ret; +} + +static void xengt_host_exit(struct device *dev, void *gvt) +{ + xengt_sysfs_del(); + xengt_priv.gvt = NULL; + intel_gvt_ops = NULL; +} + +static int xengt_attach_vgpu(void *vgpu, unsigned long *handle) +{ + /* nothing to do here */ + return 0; +} + +static void xengt_detach_vgpu(unsigned long handle) +{ + /* nothing to do here */ +} + +static int xengt_inject_msi(unsigned long handle, u32 addr_lo, u16 data) +{ + struct xengt_hvm_dev *info = (struct xengt_hvm_dev *)handle; + xen_dm_op_buf_t dm_buf; + struct xen_dm_op op; + struct xen_dm_op_inject_msi *arg; + + memset(&op, 0, sizeof(op)); + + op.op = XEN_DMOP_inject_msi; + arg = &op.u.inject_msi; + + arg->addr = (uint64_aligned_t)addr_lo; + arg->data = (uint32_t)data; + + dm_buf.h = &op; + dm_buf.size = sizeof(op); + + return HYPERVISOR_dm_op(info->vm_id, 1, &dm_buf); +} + +static unsigned long xengt_virt_to_mfn(void *addr) +{ + return virt_to_mfn(addr); +} + +static int xengt_read_gpa(unsigned long handle, unsigned long gpa, + void *buf, unsigned long len) +{ + void *va = NULL; + + if (!handle) + return -EINVAL; + + va = xengt_gpa_to_va(handle, gpa); + if (!va) { + gvt_err("GVT: can not read gpa = 0x%lx!!!\n", gpa); + return -EFAULT; + } + memcpy(buf, va, len); + return 0; +} + +static int xengt_write_gpa(unsigned long handle, unsigned long gpa, + void *buf, unsigned long len) +{ + void *va = NULL; + + if (!handle) + return -EINVAL; + + va = xengt_gpa_to_va(handle, gpa); + if (!va) { + gvt_err("GVT: can not write gpa = 0x%lx!!!\n", gpa); + return -EFAULT; + } + memcpy(va, buf, len); + return 0; +} + +static struct gvt_logd_pfn *xengt_find_logd(struct xengt_hvm_dev *info, + unsigned long gfn) +{ + struct gvt_logd_pfn *logd; + struct rb_node *node = info->logd_list.rb_node; + + while (node) { + logd = rb_entry(node, struct gvt_logd_pfn, node); + + if (gfn < logd->gfn) + node = node->rb_left; + else if (gfn > logd->gfn) + node = node->rb_right; + else + return logd; + } + return NULL; +} + +static void xengt_logd_add(struct xengt_hvm_dev *info, unsigned long gfn) +{ + struct gvt_logd_pfn *logd, *itr; + struct rb_node **node = &info->logd_list.rb_node, *parent = NULL; + + mutex_lock(&info->logd_lock); + + logd = xengt_find_logd(info, gfn); + if (logd) { + atomic_inc(&logd->ref_count); + mutex_unlock(&info->logd_lock); + return; + } + + logd = kzalloc(sizeof(struct gvt_logd_pfn), GFP_KERNEL); + if (!logd) + goto exit; + + logd->gfn = gfn; + atomic_set(&logd->ref_count, 1); + + while (*node) { + parent = *node; + itr = rb_entry(parent, struct gvt_logd_pfn, node); + + if (logd->gfn < itr->gfn) + node = 
&parent->rb_left; + else + node = &parent->rb_right; + } + rb_link_node(&logd->node, parent, node); + rb_insert_color(&logd->node, &info->logd_list); + +exit: + mutex_unlock(&info->logd_lock); + return; +} + +static unsigned long xengt_gfn_to_pfn(unsigned long handle, unsigned long gfn) +{ + struct xengt_hvm_dev *info = (struct xengt_hvm_dev *)handle; + unsigned long pfn; + + if (!info) + return -EINVAL; + + pfn = xen_g2m_pfn(info->vm_id, gfn); + + if (pfn != INTEL_GVT_INVALID_ADDR) + xengt_logd_add(info, gfn); + + return pfn; +} + +struct intel_gvt_mpt xengt_mpt = { + //.detect_host = xengt_detect_host, + .host_init = xengt_host_init, + .host_exit = xengt_host_exit, + .attach_vgpu = xengt_attach_vgpu, + .detach_vgpu = xengt_detach_vgpu, + .inject_msi = xengt_inject_msi, + .from_virt_to_mfn = xengt_virt_to_mfn, + .set_wp_page = xengt_set_wp_page, + .unset_wp_page = xengt_unset_wp_page, + .read_gpa = xengt_read_gpa, + .write_gpa = xengt_write_gpa, + .gfn_to_mfn = xengt_gfn_to_pfn, + .map_gfn_to_mfn = xengt_map_gfn_to_mfn, + .set_trap_area = xengt_set_trap_area, +}; +EXPORT_SYMBOL_GPL(xengt_mpt); + +static int __init xengt_init(void) +{ + if (!xen_initial_domain()) + return -EINVAL; + return 0; +} + +static void __exit xengt_exit(void) +{ + gvt_dbg_core("xengt: unloaded\n"); +} + +module_init(xengt_init); +module_exit(xengt_exit); diff --git a/drivers/gpu/drm/i915/gvt/xengt.h b/drivers/gpu/drm/i915/gvt/xengt.h new file mode 100644 index 0000000000000..fd9be531c76a0 --- /dev/null +++ b/drivers/gpu/drm/i915/gvt/xengt.h @@ -0,0 +1,91 @@ +#ifndef INTEL_GVT_XENGT_H +#define INTEL_GVT_XENGT_H + +extern struct intel_gvt *gvt_instance; +extern const struct intel_gvt_ops *intel_gvt_ops; + +#define PCI_BDF2(b, df) ((((b) & 0xff) << 8) | ((df) & 0xff)) + +#define MAX_HVM_VCPUS_SUPPORTED 127 + +#define VMEM_1MB (1ULL << 20) /* the size of the first 1MB */ +#define VMEM_BUCK_SHIFT 20 +#define VMEM_BUCK_SIZE (1ULL << VMEM_BUCK_SHIFT) +#define VMEM_BUCK_MASK (~(VMEM_BUCK_SIZE - 1)) + +/* + * xengt_hvm_dev is a wrapper of a vGPU instance which is reprensented by the + * intel_vgpu structure. Under xen hypervisor, the xengt_instance stands for a + * HVM device, which the related resource. + */ +struct xengt_hvm_dev { + domid_t vm_id; + struct kobject kobj; + struct intel_vgpu *vgpu; + int on_destroy; + + /* iopage_vma->addr is just iopage. We need iopage_vma on VM destroy */ + shared_iopage_t *iopage; + struct vm_struct *iopage_vma; + + /* the event channel irqs to handle HVM io request, index is vcpu id */ + int nr_vcpu; + int *evtchn_irq; + ioservid_t iosrv_id; /* io-request server id */ + int iosrv_enabled; + struct task_struct *emulation_thread; + DECLARE_BITMAP(ioreq_pending, MAX_HVM_VCPUS_SUPPORTED); + wait_queue_head_t io_event_wq; + + uint64_t vmem_sz; + /* for the 1st 1MB memory of HVM: each vm_struct means one 4K-page */ + struct vm_struct **vmem_vma_low_1mb; + /* for >1MB memory of HVM: each vm_struct means 1MB */ + struct vm_struct **vmem_vma; + /* for >1MB memory of HVM: each vm_struct means 4KB */ + struct vm_struct **vmem_vma_4k; + void *dev_state; + struct rb_root logd_list; + struct mutex logd_lock; +}; + +struct xengt_hvm_params { + int vm_id; + int aperture_sz; /* in MB */ + int gm_sz; /* in MB */ + int fence_sz; + int cap; + /* + * 0/1: config the vgt device as secondary/primary VGA, + * -1: means the ioemu doesn't supply a value + */ + int gvt_primary; +}; + +/* + * struct gvt_xengt should be a single instance to share global + * information for XENGT module. 
+ */ +#define GVT_MAX_VGPU_INSTANCE 15 +struct xengt_struct { + struct intel_gvt *gvt; + struct intel_vgpu *vgpus[GVT_MAX_VGPU_INSTANCE]; +}; + +static void *xengt_gpa_to_va(unsigned long handle, unsigned long gpa); +static ssize_t xengt_sysfs_instance_manage(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count); +static ssize_t xengt_sysfs_vgpu_id(struct kobject *kobj, + struct kobj_attribute *attr, char *buf); +static ssize_t xengt_sysfs_vgpu_schedule(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count); + +struct intel_vgpu *xengt_instance_create(domid_t vm_id, + struct intel_vgpu_type *type); +void xengt_instance_destroy(struct intel_vgpu *vgpu); +static int hvm_claim_ioreq_server_type(struct xengt_hvm_dev *info, + uint32_t set); +static int xen_hvm_toggle_iorequest_server(struct xengt_hvm_dev *info, bool enable); + + +#endif diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c index 190f6aa5d15eb..36339a8d30d1b 100644 --- a/drivers/gpu/drm/i915/i915_irq.c +++ b/drivers/gpu/drm/i915/i915_irq.c @@ -33,6 +33,7 @@ #include #include #include +#include #include "i915_drv.h" #include "i915_trace.h" #include "intel_drv.h" @@ -3461,8 +3462,12 @@ static void gen8_de_irq_postinstall(struct drm_i915_private *dev_priv) enum pipe pipe; if (INTEL_INFO(dev_priv)->gen >= 9) { - de_pipe_masked |= GEN9_PIPE_PLANE1_FLIP_DONE | - GEN9_DE_PIPE_IRQ_FAULT_ERRORS; + if (xen_initial_domain()) + de_pipe_masked |= GEN9_PIPE_PLANE1_FLIP_DONE; + else + de_pipe_masked |= GEN9_PIPE_PLANE1_FLIP_DONE | + GEN9_DE_PIPE_IRQ_FAULT_ERRORS; + de_port_masked |= GEN9_AUX_CHANNEL_B | GEN9_AUX_CHANNEL_C | GEN9_AUX_CHANNEL_D; if (IS_GEN9_LP(dev_priv)) diff --git a/drivers/gpu/drm/i915/intel_gvt.c b/drivers/gpu/drm/i915/intel_gvt.c index e1ab6432a9146..54776fee4b52d 100644 --- a/drivers/gpu/drm/i915/intel_gvt.c +++ b/drivers/gpu/drm/i915/intel_gvt.c @@ -45,7 +45,7 @@ static bool is_supported_device(struct drm_i915_private *dev_priv) return true; if (IS_SKYLAKE(dev_priv)) return true; - if (IS_KABYLAKE(dev_priv) && INTEL_DEVID(dev_priv) == 0x591D) + if (IS_KABYLAKE(dev_priv)) return true; return false; } diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index 62f44d3e7c43c..5faac1cdf8070 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -1143,6 +1143,14 @@ static u32 port_seqno(struct execlist_port *port) return port->request ? port->request->global_seqno : 0; } +static u8 gtiir[] = { + [RCS] = 0, + [BCS] = 0, + [VCS] = 1, + [VCS2] = 1, + [VECS] = 3, +}; + static int gen8_init_common_ring(struct intel_engine_cs *engine) { struct drm_i915_private *dev_priv = engine->i915; @@ -1164,8 +1172,22 @@ static int gen8_init_common_ring(struct intel_engine_cs *engine) DRM_DEBUG_DRIVER("Execlists enabled for %s\n", engine->name); - /* After a GPU reset, we may have requests to replay */ + GEM_BUG_ON(engine->id >= ARRAY_SIZE(gtiir)); + + /* + * Clear any pending interrupt state. + * + * We do it twice out of paranoia that some of the IIR are double + * buffered, and if we only reset it once there may still be + * an interrupt pending. 
+ */ + I915_WRITE(GEN8_GT_IIR(gtiir[engine->id]), + GT_CONTEXT_SWITCH_INTERRUPT << engine->irq_shift); + I915_WRITE(GEN8_GT_IIR(gtiir[engine->id]), + GT_CONTEXT_SWITCH_INTERRUPT << engine->irq_shift); clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted); + + /* After a GPU reset, we may have requests to replay */ if (!i915.enable_guc_submission && !execlists_elsp_idle(engine)) { DRM_DEBUG_DRIVER("Restarting %s from requests [0x%x, 0x%x]\n", engine->name, diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 561084ab387f3..2386e594406e7 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -423,6 +423,34 @@ static void vfio_group_put(struct vfio_group *group) kref_put_mutex(&group->kref, vfio_group_release, &vfio.group_lock); } +struct vfio_group_put_work { + struct work_struct work; + struct vfio_group *group; +}; + +static void vfio_group_put_bg(struct work_struct *work) +{ + struct vfio_group_put_work *do_work; + + do_work = container_of(work, struct vfio_group_put_work, work); + + vfio_group_put(do_work->group); + kfree(do_work); +} + +static void vfio_group_schedule_put(struct vfio_group *group) +{ + struct vfio_group_put_work *do_work; + + do_work = kmalloc(sizeof(*do_work), GFP_KERNEL); + if (WARN_ON(!do_work)) + return; + + INIT_WORK(&do_work->work, vfio_group_put_bg); + do_work->group = group; + schedule_work(&do_work->work); +} + /* Assume group_lock or group reference is held */ static void vfio_group_get(struct vfio_group *group) { @@ -762,7 +790,14 @@ static int vfio_iommu_group_notifier(struct notifier_block *nb, break; } - vfio_group_put(group); + /* + * If we're the last reference to the group, the group will be + * released, which includes unregistering the iommu group notifier. + * We hold a read-lock on that notifier list, unregistering needs + * a write-lock... deadlock. Release our reference asynchronously + * to avoid that situation. 
+ */ + vfio_group_schedule_put(group); return NOTIFY_OK; } @@ -1140,15 +1175,11 @@ static long vfio_fops_unl_ioctl(struct file *filep, ret = vfio_ioctl_set_iommu(container, arg); break; default: - down_read(&container->group_lock); - driver = container->iommu_driver; data = container->iommu_data; if (driver) /* passthrough all unrecognized ioctls */ ret = driver->ops->ioctl(data, cmd, arg); - - up_read(&container->group_lock); } return ret; @@ -1202,15 +1233,11 @@ static ssize_t vfio_fops_read(struct file *filep, char __user *buf, struct vfio_iommu_driver *driver; ssize_t ret = -EINVAL; - down_read(&container->group_lock); - driver = container->iommu_driver; if (likely(driver && driver->ops->read)) ret = driver->ops->read(container->iommu_data, buf, count, ppos); - up_read(&container->group_lock); - return ret; } @@ -1221,15 +1248,11 @@ static ssize_t vfio_fops_write(struct file *filep, const char __user *buf, struct vfio_iommu_driver *driver; ssize_t ret = -EINVAL; - down_read(&container->group_lock); - driver = container->iommu_driver; if (likely(driver && driver->ops->write)) ret = driver->ops->write(container->iommu_data, buf, count, ppos); - up_read(&container->group_lock); - return ret; } @@ -1239,14 +1262,10 @@ static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma) struct vfio_iommu_driver *driver; int ret = -EINVAL; - down_read(&container->group_lock); - driver = container->iommu_driver; if (likely(driver && driver->ops->mmap)) ret = driver->ops->mmap(container->iommu_data, vma); - up_read(&container->group_lock); - return ret; } @@ -1949,8 +1968,6 @@ int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage, goto err_pin_pages; container = group->container; - down_read(&container->group_lock); - driver = container->iommu_driver; if (likely(driver && driver->ops->pin_pages)) ret = driver->ops->pin_pages(container->iommu_data, user_pfn, @@ -1958,7 +1975,6 @@ int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage, else ret = -ENOTTY; - up_read(&container->group_lock); vfio_group_try_dissolve_container(group); err_pin_pages: @@ -1998,8 +2014,6 @@ int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, int npage) goto err_unpin_pages; container = group->container; - down_read(&container->group_lock); - driver = container->iommu_driver; if (likely(driver && driver->ops->unpin_pages)) ret = driver->ops->unpin_pages(container->iommu_data, user_pfn, @@ -2007,7 +2021,6 @@ int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, int npage) else ret = -ENOTTY; - up_read(&container->group_lock); vfio_group_try_dissolve_container(group); err_unpin_pages: @@ -2029,8 +2042,6 @@ static int vfio_register_iommu_notifier(struct vfio_group *group, return -EINVAL; container = group->container; - down_read(&container->group_lock); - driver = container->iommu_driver; if (likely(driver && driver->ops->register_notifier)) ret = driver->ops->register_notifier(container->iommu_data, @@ -2038,7 +2049,6 @@ static int vfio_register_iommu_notifier(struct vfio_group *group, else ret = -ENOTTY; - up_read(&container->group_lock); vfio_group_try_dissolve_container(group); return ret; @@ -2056,8 +2066,6 @@ static int vfio_unregister_iommu_notifier(struct vfio_group *group, return -EINVAL; container = group->container; - down_read(&container->group_lock); - driver = container->iommu_driver; if (likely(driver && driver->ops->unregister_notifier)) ret = driver->ops->unregister_notifier(container->iommu_data, @@ -2065,7 +2073,6 @@ static int 
vfio_unregister_iommu_notifier(struct vfio_group *group, else ret = -ENOTTY; - up_read(&container->group_lock); vfio_group_try_dissolve_container(group); return ret; @@ -2083,7 +2090,6 @@ static int vfio_register_group_notifier(struct vfio_group *group, unsigned long *events, struct notifier_block *nb) { - struct vfio_container *container; int ret; bool set_kvm = false; @@ -2101,9 +2107,6 @@ static int vfio_register_group_notifier(struct vfio_group *group, if (ret) return -EINVAL; - container = group->container; - down_read(&container->group_lock); - ret = blocking_notifier_chain_register(&group->notifier, nb); /* @@ -2114,7 +2117,6 @@ static int vfio_register_group_notifier(struct vfio_group *group, blocking_notifier_call_chain(&group->notifier, VFIO_GROUP_NOTIFY_SET_KVM, group->kvm); - up_read(&container->group_lock); vfio_group_try_dissolve_container(group); return ret; @@ -2123,19 +2125,14 @@ static int vfio_register_group_notifier(struct vfio_group *group, static int vfio_unregister_group_notifier(struct vfio_group *group, struct notifier_block *nb) { - struct vfio_container *container; int ret; ret = vfio_group_add_container_user(group); if (ret) return -EINVAL; - container = group->container; - down_read(&container->group_lock); - ret = blocking_notifier_chain_unregister(&group->notifier, nb); - up_read(&container->group_lock); vfio_group_try_dissolve_container(group); return ret; diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 8549cb111627f..61ddcb45e9dac 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -41,6 +41,7 @@ #include #include #include +#include #define DRIVER_VERSION "0.2" #define DRIVER_AUTHOR "Alex Williamson " @@ -1526,6 +1527,23 @@ static int vfio_domains_have_iommu_cache(struct vfio_iommu *iommu) return ret; } +static void vfio_dma_update_dirty_bitmap(struct vfio_iommu *iommu, + u64 start_addr, u64 npage, void *bitmap) +{ + u64 iova = start_addr; + struct vfio_dma *dma; + int i; + + for (i = 0; i < npage; i++) { + dma = vfio_find_dma(iommu, iova, PAGE_SIZE); + if (dma) + if (vfio_find_vpfn(dma, iova)) + set_bit(i, bitmap); + + iova += PAGE_SIZE; + } +} + static long vfio_iommu_type1_ioctl(void *iommu_data, unsigned int cmd, unsigned long arg) { @@ -1596,6 +1614,30 @@ static long vfio_iommu_type1_ioctl(void *iommu_data, return copy_to_user((void __user *)arg, &unmap, minsz) ? 
-EFAULT : 0; + } else if (cmd == VFIO_IOMMU_GET_DIRTY_BITMAP) { + struct vfio_iommu_get_dirty_bitmap d; + unsigned long bitmap_sz; + unsigned int *bitmap; + + minsz = offsetofend(struct vfio_iommu_get_dirty_bitmap, + page_nr); + + if (copy_from_user(&d, (void __user *)arg, minsz)) + return -EFAULT; + + bitmap_sz = (BITS_TO_LONGS(d.page_nr) + 1) * + sizeof(unsigned long); + bitmap = vzalloc(bitmap_sz); + vfio_dma_update_dirty_bitmap(iommu, d.start_addr, + d.page_nr, bitmap); + + if (copy_to_user((void __user *)arg + minsz, + bitmap, bitmap_sz)) { + vfree(bitmap); + return -EFAULT; + } + vfree(bitmap); + return 0; } return -ENOTTY; diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index b52852f38cff9..0fc95e40255e0 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -582,7 +582,7 @@ static void shutdown_pirq(struct irq_data *data) static void enable_pirq(struct irq_data *data) { - startup_pirq(data); + enable_dynirq(data); } static void disable_pirq(struct irq_data *data) diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index ae461050661af..aab4bc04aa477 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -296,9 +296,18 @@ struct vfio_region_info_cap_type { #define VFIO_REGION_TYPE_PCI_VENDOR_MASK (0xffff) /* 8086 Vendor sub-types */ -#define VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION (1) -#define VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG (2) -#define VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG (3) +#define VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION (1) +#define VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG (2) +#define VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG (3) + +/* Mdev sub-type for device state save and restore */ +#define VFIO_REGION_SUBTYPE_DEVICE_STATE (4) + +/* Offset in region to save device state */ +#define VFIO_DEVICE_STATE_OFFSET 1 + +#define VFIO_DEVICE_START 0 +#define VFIO_DEVICE_STOP 1 /** * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9, @@ -565,6 +574,20 @@ struct vfio_iommu_type1_dma_unmap { #define VFIO_IOMMU_ENABLE _IO(VFIO_TYPE, VFIO_BASE + 15) #define VFIO_IOMMU_DISABLE _IO(VFIO_TYPE, VFIO_BASE + 16) +/** + * VFIO_IOMMU_GET_DIRTY_BITMAP - _IOW(VFIO_TYPE, VFIO_BASE + 17, + * struct vfio_iommu_get_dirty_bitmap) + * + * Return: 0 on success, -errno on failure. + */ +struct vfio_iommu_get_dirty_bitmap { + __u64 start_addr; + __u64 page_nr; + __u8 dirty_bitmap[]; +}; + +#define VFIO_IOMMU_GET_DIRTY_BITMAP _IO(VFIO_TYPE, VFIO_BASE + 17) + /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */ /* diff --git a/include/xen/interface/hvm/dm_op.h b/include/xen/interface/hvm/dm_op.h index ee9e480bc559f..023a28330ebd2 100644 --- a/include/xen/interface/hvm/dm_op.h +++ b/include/xen/interface/hvm/dm_op.h @@ -18,15 +18,395 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. + * */ #ifndef __XEN_PUBLIC_HVM_DM_OP_H__ #define __XEN_PUBLIC_HVM_DM_OP_H__ +#include "../xen.h" + +#include "../event_channel.h" + +#ifndef uint64_aligned_t +#define uint64_aligned_t uint64_t +#endif + +/* + * IOREQ Servers + * + * The interface between an I/O emulator an Xen is called an IOREQ Server. + * A domain supports a single 'legacy' IOREQ Server which is instantiated if + * parameter... + * + * HVM_PARAM_IOREQ_PFN is read (to get the gmfn containing the synchronous + * ioreq structures), or... 
+ * HVM_PARAM_BUFIOREQ_PFN is read (to get the gmfn containing the buffered + * ioreq ring), or... + * HVM_PARAM_BUFIOREQ_EVTCHN is read (to get the event channel that Xen uses + * to request buffered I/O emulation). + * + * The following hypercalls facilitate the creation of IOREQ Servers for + * 'secondary' emulators which are invoked to implement port I/O, memory, or + * PCI config space ranges which they explicitly register. + */ + +typedef uint16_t ioservid_t; + +/* + * XEN_DMOP_create_ioreq_server: Instantiate a new IOREQ Server for a + * secondary emulator. + * + * The handed back is unique for target domain. The valur of + * should be one of HVM_IOREQSRV_BUFIOREQ_* defined in + * hvm_op.h. If the value is HVM_IOREQSRV_BUFIOREQ_OFF then the buffered + * ioreq ring will not be allocated and hence all emulation requests to + * this server will be synchronous. + */ +#define XEN_DMOP_create_ioreq_server 1 + +struct xen_dm_op_create_ioreq_server { + /* IN - should server handle buffered ioreqs */ + uint8_t handle_bufioreq; + uint8_t pad[3]; + /* OUT - server id */ + ioservid_t id; +}; + +/* + * XEN_DMOP_get_ioreq_server_info: Get all the information necessary to + * access IOREQ Server . + * + * The emulator needs to map the synchronous ioreq structures and buffered + * ioreq ring (if it exists) that Xen uses to request emulation. These are + * hosted in the target domain's gmfns and + * respectively. In addition, if the IOREQ Server is handling buffered + * emulation requests, the emulator needs to bind to event channel + * to listen for them. (The event channels used for + * synchronous emulation requests are specified in the per-CPU ioreq + * structures in ). + * If the IOREQ Server is not handling buffered emulation requests then the + * values handed back in and will both be 0. + */ +#define XEN_DMOP_get_ioreq_server_info 2 + +struct xen_dm_op_get_ioreq_server_info { + /* IN - server id */ + ioservid_t id; + uint16_t pad; + /* OUT - buffered ioreq port */ + evtchn_port_t bufioreq_port; + /* OUT - sync ioreq pfn */ + uint64_aligned_t ioreq_pfn; + /* OUT - buffered ioreq pfn */ + uint64_aligned_t bufioreq_pfn; +}; + +/* + * XEN_DMOP_map_io_range_to_ioreq_server: Register an I/O range for + * emulation by the client of + * IOREQ Server . + * XEN_DMOP_unmap_io_range_from_ioreq_server: Deregister an I/O range + * previously registered for + * emulation by the client of + * IOREQ Server . + * + * There are three types of I/O that can be emulated: port I/O, memory + * accesses and PCI config space accesses. The field denotes which + * type of range* the and (inclusive) fields are specifying. + * PCI config space ranges are specified by segment/bus/device/function + * values which should be encoded using the DMOP_PCI_SBDF helper macro + * below. + * + * NOTE: unless an emulation request falls entirely within a range mapped + * by a secondary emulator, it will not be passed to that emulator. 
+ */ +#define XEN_DMOP_map_io_range_to_ioreq_server 3 +#define XEN_DMOP_unmap_io_range_from_ioreq_server 4 + +struct xen_dm_op_ioreq_server_range { + /* IN - server id */ + ioservid_t id; + uint16_t pad; + /* IN - type of range */ + uint32_t type; +# define XEN_DMOP_IO_RANGE_PORT 0 /* I/O port range */ +# define XEN_DMOP_IO_RANGE_MEMORY 1 /* MMIO range */ +# define XEN_DMOP_IO_RANGE_PCI 2 /* PCI segment/bus/dev/func range */ + /* IN - inclusive start and end of range */ + uint64_aligned_t start, end; +}; + +#define XEN_DMOP_PCI_SBDF(s, b, d, f) \ + ((((s) & 0xffff) << 16) | \ + (((b) & 0xff) << 8) | \ + (((d) & 0x1f) << 3) | \ + ((f) & 0x07)) + +/* + * XEN_DMOP_set_ioreq_server_state: Enable or disable the IOREQ Server + * + * The IOREQ Server will not be passed any emulation requests until it is + * in the enabled state. + * Note that the contents of the ioreq_pfn and bufioreq_fn (see + * XEN_DMOP_get_ioreq_server_info) are not meaningful until the IOREQ Server + * is in the enabled state. + */ +#define XEN_DMOP_set_ioreq_server_state 5 + +struct xen_dm_op_set_ioreq_server_state { + /* IN - server id */ + ioservid_t id; + /* IN - enabled? */ + uint8_t enabled; + uint8_t pad; +}; + +/* + * XEN_DMOP_destroy_ioreq_server: Destroy the IOREQ Server . + * + * Any registered I/O ranges will be automatically deregistered. + */ +#define XEN_DMOP_destroy_ioreq_server 6 + +struct xen_dm_op_destroy_ioreq_server { + /* IN - server id */ + ioservid_t id; + uint16_t pad; +}; + +/* + * XEN_DMOP_track_dirty_vram: Track modifications to the specified pfn + * range. + * + * NOTE: The bitmap passed back to the caller is passed in a + * secondary buffer. + */ +#define XEN_DMOP_track_dirty_vram 7 + +struct xen_dm_op_track_dirty_vram { + /* IN - number of pages to be tracked */ + uint32_t nr; + uint32_t pad; + /* IN - first pfn to track */ + uint64_aligned_t first_pfn; +}; + +/* + * XEN_DMOP_set_pci_intx_level: Set the logical level of one of a domain's + * PCI INTx pins. + */ +#define XEN_DMOP_set_pci_intx_level 8 + +struct xen_dm_op_set_pci_intx_level { + /* IN - PCI INTx identification (domain:bus:device:intx) */ + uint16_t domain; + uint8_t bus, device, intx; + /* IN - Level: 0 -> deasserted, 1 -> asserted */ + uint8_t level; +}; + +/* + * XEN_DMOP_set_isa_irq_level: Set the logical level of a one of a domain's + * ISA IRQ lines. + */ +#define XEN_DMOP_set_isa_irq_level 9 + +struct xen_dm_op_set_isa_irq_level { + /* IN - ISA IRQ (0-15) */ + uint8_t isa_irq; + /* IN - Level: 0 -> deasserted, 1 -> asserted */ + uint8_t level; +}; + +/* + * XEN_DMOP_set_pci_link_route: Map a PCI INTx line to an IRQ line. + */ +#define XEN_DMOP_set_pci_link_route 10 + +struct xen_dm_op_set_pci_link_route { + /* PCI INTx line (0-3) */ + uint8_t link; + /* ISA IRQ (1-15) or 0 -> disable link */ + uint8_t isa_irq; +}; + +/* + * XEN_DMOP_modified_memory: Notify that a set of pages were modified by + * an emulator. + * + * DMOP buf 1 contains an array of xen_dm_op_modified_memory_extent with + * @nr_extents entries. + * + * On error, @nr_extents will contain the index+1 of the extent that + * had the error. It is not defined if or which pages may have been + * marked as dirty, in this event. 
+ */ +#define XEN_DMOP_modified_memory 11 + +struct xen_dm_op_modified_memory { + /* + * IN - Number of extents to be processed + * OUT -returns n+1 for failing extent + */ + uint32_t nr_extents; + /* IN/OUT - Must be set to 0 */ + uint32_t opaque; +}; + +struct xen_dm_op_modified_memory_extent { + /* IN - number of contiguous pages modified */ + uint32_t nr; + uint32_t pad; + /* IN - first pfn modified */ + uint64_aligned_t first_pfn; +}; + +/* + * XEN_DMOP_set_mem_type: Notify that a region of memory is to be treated + * in a specific way. (See definition of + * hvmmem_type_t). + * + * NOTE: In the event of a continuation (return code -ERESTART), the + * @first_pfn is set to the value of the pfn of the remaining + * region and @nr reduced to the size of the remaining region. + */ +#define XEN_DMOP_set_mem_type 12 + +struct xen_dm_op_set_mem_type { + /* IN - number of contiguous pages */ + uint32_t nr; + /* IN - new hvmmem_type_t of region */ + uint16_t mem_type; + uint16_t pad; + /* IN - first pfn in region */ + uint64_aligned_t first_pfn; +}; + +/* + * XEN_DMOP_inject_event: Inject an event into a VCPU, which will + * get taken up when it is next scheduled. + * + * Note that the caller should know enough of the state of the CPU before + * injecting, to know what the effect of injecting the event will be. + */ +#define XEN_DMOP_inject_event 13 + +struct xen_dm_op_inject_event { + /* IN - index of vCPU */ + uint32_t vcpuid; + /* IN - interrupt vector */ + uint8_t vector; + /* IN - event type (DMOP_EVENT_* ) */ + uint8_t type; +/* NB. This enumeration precisely matches hvm.h:X86_EVENTTYPE_* */ +# define XEN_DMOP_EVENT_ext_int 0 /* external interrupt */ +# define XEN_DMOP_EVENT_nmi 2 /* nmi */ +# define XEN_DMOP_EVENT_hw_exc 3 /* hardware exception */ +# define XEN_DMOP_EVENT_sw_int 4 /* software interrupt (CD nn) */ +# define XEN_DMOP_EVENT_pri_sw_exc 5 /* ICEBP (F1) */ +# define XEN_DMOP_EVENT_sw_exc 6 /* INT3 (CC), INTO (CE) */ + /* IN - instruction length */ + uint8_t insn_len; + uint8_t pad0; + /* IN - error code (or ~0 to skip) */ + uint32_t error_code; + uint32_t pad1; + /* IN - CR2 for page faults */ + uint64_aligned_t cr2; +}; + +/* + * XEN_DMOP_inject_msi: Inject an MSI for an emulated device. + */ +#define XEN_DMOP_inject_msi 14 + +struct xen_dm_op_inject_msi { + /* IN - MSI data (lower 32 bits) */ + uint32_t data; + uint32_t pad; + /* IN - MSI address (0xfeexxxxx) */ + uint64_aligned_t addr; +}; + +/* + * XEN_DMOP_map_mem_type_to_ioreq_server : map or unmap the IOREQ Server + * to specific memory type + * for specific accesses + * + * For now, flags only accept the value of XEN_DMOP_IOREQ_MEM_ACCESS_WRITE, + * which means only write operations are to be forwarded to an ioreq server. + * Support for the emulation of read operations can be added when an ioreq + * server has such requirement in future. + */ +#define XEN_DMOP_map_mem_type_to_ioreq_server 15 + +struct xen_dm_op_map_mem_type_to_ioreq_server { + ioservid_t id; /* IN - ioreq server id */ + uint16_t type; /* IN - memory type */ + uint32_t flags; /* IN - types of accesses to be forwarded to the + ioreq server. 
flags with 0 means to unmap the + ioreq server */ + +#define XEN_DMOP_IOREQ_MEM_ACCESS_READ (1u << 0) +#define XEN_DMOP_IOREQ_MEM_ACCESS_WRITE (1u << 1) + uint64_t opaque; /* IN/OUT - only used for hypercall continuation, + has to be set to zero by the caller */ +}; + +struct xen_dm_op { + uint32_t op; + uint32_t pad; + union { + struct xen_dm_op_create_ioreq_server create_ioreq_server; + struct xen_dm_op_get_ioreq_server_info get_ioreq_server_info; + struct xen_dm_op_ioreq_server_range map_io_range_to_ioreq_server; + struct xen_dm_op_ioreq_server_range unmap_io_range_from_ioreq_server; + struct xen_dm_op_set_ioreq_server_state set_ioreq_server_state; + struct xen_dm_op_destroy_ioreq_server destroy_ioreq_server; + struct xen_dm_op_track_dirty_vram track_dirty_vram; + struct xen_dm_op_set_pci_intx_level set_pci_intx_level; + struct xen_dm_op_set_isa_irq_level set_isa_irq_level; + struct xen_dm_op_set_pci_link_route set_pci_link_route; + struct xen_dm_op_modified_memory modified_memory; + struct xen_dm_op_set_mem_type set_mem_type; + struct xen_dm_op_inject_event inject_event; + struct xen_dm_op_inject_msi inject_msi; + struct xen_dm_op_map_mem_type_to_ioreq_server + map_mem_type_to_ioreq_server; + } u; +}; + struct xen_dm_op_buf { GUEST_HANDLE(void) h; xen_ulong_t size; }; -DEFINE_GUEST_HANDLE_STRUCT(xen_dm_op_buf); +typedef struct xen_dm_op_buf xen_dm_op_buf_t; +DEFINE_GUEST_HANDLE_STRUCT(xen_dm_op_buf_t); + +/* ` enum neg_errnoval + * ` HYPERVISOR_dm_op(domid_t domid, + * ` unsigned int nr_bufs, + * ` xen_dm_op_buf_t bufs[]) + * ` + * + * @domid is the domain the hypercall operates on. + * @nr_bufs is the number of buffers in the @bufs array. + * @bufs points to an array of buffers where @bufs[0] contains a struct + * xen_dm_op, describing the specific device model operation and its + * parameters. + * @bufs[1..] may be referenced in the parameters for the purposes of + * passing extra information to or from the domain. + */ #endif /* __XEN_PUBLIC_HVM_DM_OP_H__ */ + +/* + * Local variables: + * mode: C + * c-file-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff --git a/include/xen/interface/hvm/hvm_op.h b/include/xen/interface/hvm/hvm_op.h index 956a04682865b..ab95adf9b7519 100644 --- a/include/xen/interface/hvm/hvm_op.h +++ b/include/xen/interface/hvm/hvm_op.h @@ -16,50 +16,235 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. + * + * Copyright (c) 2007, Keir Fraser */ #ifndef __XEN_PUBLIC_HVM_HVM_OP_H__ #define __XEN_PUBLIC_HVM_HVM_OP_H__ -/* Get/set subcommands: the second argument of the hypercall is a - * pointer to a xen_hvm_param struct. */ +#include "../xen.h" +//#include "../trace.h" +#include "../event_channel.h" + +/* Get/set subcommands: extra argument == pointer to xen_hvm_param struct. */ #define HVMOP_set_param 0 #define HVMOP_get_param 1 struct xen_hvm_param { - domid_t domid; /* IN */ - uint32_t index; /* IN */ - uint64_t value; /* IN/OUT */ + domid_t domid; /* IN */ + uint32_t index; /* IN */ + uint64_t value; /* IN/OUT */ }; -DEFINE_GUEST_HANDLE_STRUCT(xen_hvm_param); +typedef struct xen_hvm_param xen_hvm_param_t; +DEFINE_GUEST_HANDLE_STRUCT(xen_hvm_param_t); + +/* Flushes all VCPU TLBs: @arg must be NULL. 
*/ +#define HVMOP_flush_tlbs 5 + +typedef enum { + HVMMEM_ram_rw, /* Normal read/write guest RAM */ + HVMMEM_ram_ro, /* Read-only; writes are discarded */ + HVMMEM_mmio_dm, /* Reads and write go to the device model */ + HVMMEM_unused, /* Placeholder; setting memory to this type + will fail for code after 4.7.0 */ + HVMMEM_ioreq_server /* Memory type claimed by an ioreq server; type + changes to this value are only allowed after + an ioreq server has claimed its ownership. + Only pages with HVMMEM_ram_rw are allowed to + change to this type; conversely, pages with + this type are only allowed to be changed back + to HVMMEM_ram_rw. */ +} hvmmem_type_t; /* Hint from PV drivers for pagetable destruction. */ -#define HVMOP_pagetable_dying 9 +#define HVMOP_pagetable_dying 9 struct xen_hvm_pagetable_dying { - /* Domain with a pagetable about to be destroyed. */ - domid_t domid; - /* guest physical address of the toplevel pagetable dying */ - aligned_u64 gpa; + /* Domain with a pagetable about to be destroyed. */ + domid_t domid; + uint16_t pad[3]; /* align next field on 8-byte boundary */ + /* guest physical address of the toplevel pagetable dying */ + uint64_t gpa; }; typedef struct xen_hvm_pagetable_dying xen_hvm_pagetable_dying_t; DEFINE_GUEST_HANDLE_STRUCT(xen_hvm_pagetable_dying_t); - -enum hvmmem_type_t { - HVMMEM_ram_rw, /* Normal read/write guest RAM */ - HVMMEM_ram_ro, /* Read-only; writes are discarded */ - HVMMEM_mmio_dm, /* Reads and write go to the device model */ + +/* Get the current Xen time, in nanoseconds since system boot. */ +#define HVMOP_get_time 10 +struct xen_hvm_get_time { + uint64_t now; /* OUT */ }; +typedef struct xen_hvm_get_time xen_hvm_get_time_t; +DEFINE_GUEST_HANDLE_STRUCT(xen_hvm_get_time_t); + +//#define HVMOP_xentrace 11 +//struct xen_hvm_xentrace { +// uint16_t event, extra_bytes; +// uint8_t extra[TRACE_EXTRA_MAX * sizeof(uint32_t)]; +//}; +//typedef struct xen_hvm_xentrace xen_hvm_xentrace_t; +//DEFINE_XEN_GUEST_HANDLE(xen_hvm_xentrace_t); + +/* Following tools-only interfaces may change in future. */ +#if defined(__XEN__) || defined(__XEN_TOOLS__) + +/* Deprecated by XENMEM_access_op_set_access */ +#define HVMOP_set_mem_access 12 + +/* Deprecated by XENMEM_access_op_get_access */ +#define HVMOP_get_mem_access 13 + +#endif /* defined(__XEN__) || defined(__XEN_TOOLS__) */ #define HVMOP_get_mem_type 15 /* Return hvmmem_type_t for the specified pfn. */ struct xen_hvm_get_mem_type { - /* Domain to be queried. */ - domid_t domid; - /* OUT variable. */ - uint16_t mem_type; - uint16_t pad[2]; /* align next field on 8-byte boundary */ - /* IN variable. */ - uint64_t pfn; + /* Domain to be queried. */ + domid_t domid; + /* OUT variable. */ + uint16_t mem_type; + uint16_t pad[2]; /* align next field on 8-byte boundary */ + /* IN variable. */ + uint64_t pfn; +}; +typedef struct xen_hvm_get_mem_type xen_hvm_get_mem_type_t; +DEFINE_GUEST_HANDLE_STRUCT(xen_hvm_get_mem_type_t); + +/* Following tools-only interfaces may change in future. */ +#if defined(__XEN__) || defined(__XEN_TOOLS__) + +/* + * Definitions relating to DMOP_create_ioreq_server. (Defined here for + * backwards compatibility). 
+ */ + +#define HVM_IOREQSRV_BUFIOREQ_OFF 0 +#define HVM_IOREQSRV_BUFIOREQ_LEGACY 1 +/* + * Use this when read_pointer gets updated atomically and + * the pointer pair gets read atomically: + */ +#define HVM_IOREQSRV_BUFIOREQ_ATOMIC 2 + +#endif /* defined(__XEN__) || defined(__XEN_TOOLS__) */ + +#if defined(__i386__) || defined(__x86_64__) + +/* + * HVMOP_set_evtchn_upcall_vector: Set a that should be used for event + * channel upcalls on the specified . If set, + * this vector will be used in preference to the + * domain global callback via (see + * HVM_PARAM_CALLBACK_IRQ). + */ +#define HVMOP_set_evtchn_upcall_vector 23 +struct xen_hvm_evtchn_upcall_vector { + uint32_t vcpu; + uint8_t vector; }; -DEFINE_GUEST_HANDLE_STRUCT(xen_hvm_get_mem_type); +typedef struct xen_hvm_evtchn_upcall_vector xen_hvm_evtchn_upcall_vector_t; +DEFINE_GUEST_HANDLE_STRUCT(xen_hvm_evtchn_upcall_vector_t); + +#endif /* defined(__i386__) || defined(__x86_64__) */ + +#define HVMOP_guest_request_vm_event 24 + +/* HVMOP_altp2m: perform altp2m state operations */ +#define HVMOP_altp2m 25 + +#define HVMOP_ALTP2M_INTERFACE_VERSION 0x00000001 + +struct xen_hvm_altp2m_domain_state { + /* IN or OUT variable on/off */ + uint8_t state; +}; +typedef struct xen_hvm_altp2m_domain_state xen_hvm_altp2m_domain_state_t; +DEFINE_GUEST_HANDLE_STRUCT(xen_hvm_altp2m_domain_state_t); + +struct xen_hvm_altp2m_vcpu_enable_notify { + uint32_t vcpu_id; + uint32_t pad; + /* #VE info area gfn */ + uint64_t gfn; +}; +typedef struct xen_hvm_altp2m_vcpu_enable_notify xen_hvm_altp2m_vcpu_enable_notify_t; +DEFINE_GUEST_HANDLE_STRUCT(xen_hvm_altp2m_vcpu_enable_notify_t); + +struct xen_hvm_altp2m_view { + /* IN/OUT variable */ + uint16_t view; + /* Create view only: default access type + * NOTE: currently ignored */ + uint16_t hvmmem_default_access; /* xenmem_access_t */ +}; +typedef struct xen_hvm_altp2m_view xen_hvm_altp2m_view_t; +DEFINE_GUEST_HANDLE_STRUCT(xen_hvm_altp2m_view_t); + +struct xen_hvm_altp2m_set_mem_access { + /* view */ + uint16_t view; + /* Memory type */ + uint16_t hvmmem_access; /* xenmem_access_t */ + uint32_t pad; + /* gfn */ + uint64_t gfn; +}; +typedef struct xen_hvm_altp2m_set_mem_access xen_hvm_altp2m_set_mem_access_t; +DEFINE_GUEST_HANDLE_STRUCT(xen_hvm_altp2m_set_mem_access_t); + +struct xen_hvm_altp2m_change_gfn { + /* view */ + uint16_t view; + uint16_t pad1; + uint32_t pad2; + /* old gfn */ + uint64_t old_gfn; + /* new gfn, INVALID_GFN (~0UL) means revert */ + uint64_t new_gfn; +}; +typedef struct xen_hvm_altp2m_change_gfn xen_hvm_altp2m_change_gfn_t; +DEFINE_GUEST_HANDLE_STRUCT(xen_hvm_altp2m_change_gfn_t); + +struct xen_hvm_altp2m_op { + uint32_t version; /* HVMOP_ALTP2M_INTERFACE_VERSION */ + uint32_t cmd; +/* Get/set the altp2m state for a domain */ +#define HVMOP_altp2m_get_domain_state 1 +#define HVMOP_altp2m_set_domain_state 2 +/* Set the current VCPU to receive altp2m event notifications */ +#define HVMOP_altp2m_vcpu_enable_notify 3 +/* Create a new view */ +#define HVMOP_altp2m_create_p2m 4 +/* Destroy a view */ +#define HVMOP_altp2m_destroy_p2m 5 +/* Switch view for an entire domain */ +#define HVMOP_altp2m_switch_p2m 6 +/* Notify that a page of memory is to have specific access types */ +#define HVMOP_altp2m_set_mem_access 7 +/* Change a p2m entry to have a different gfn->mfn mapping */ +#define HVMOP_altp2m_change_gfn 8 + domid_t domain; + uint16_t pad1; + uint32_t pad2; + union { + struct xen_hvm_altp2m_domain_state domain_state; + struct xen_hvm_altp2m_vcpu_enable_notify enable_notify; + struct 
xen_hvm_altp2m_view view; + struct xen_hvm_altp2m_set_mem_access set_mem_access; + struct xen_hvm_altp2m_change_gfn change_gfn; + uint8_t pad[64]; + } u; +}; +typedef struct xen_hvm_altp2m_op xen_hvm_altp2m_op_t; +DEFINE_GUEST_HANDLE_STRUCT(xen_hvm_altp2m_op_t); #endif /* __XEN_PUBLIC_HVM_HVM_OP_H__ */ + +/* + * Local variables: + * mode: C + * c-file-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff --git a/include/xen/interface/hvm/ioreq.h b/include/xen/interface/hvm/ioreq.h new file mode 100644 index 0000000000000..a9a3fd56dd128 --- /dev/null +++ b/include/xen/interface/hvm/ioreq.h @@ -0,0 +1,138 @@ +/* + * ioreq.h: I/O request definitions for device models + * Copyright (c) 2004, Intel Corporation. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#ifndef _IOREQ_H_ +#define _IOREQ_H_ + +#define IOREQ_READ 1 +#define IOREQ_WRITE 0 + +#define STATE_IOREQ_NONE 0 +#define STATE_IOREQ_READY 1 +#define STATE_IOREQ_INPROCESS 2 +#define STATE_IORESP_READY 3 + +#define IOREQ_TYPE_PIO 0 /* pio */ +#define IOREQ_TYPE_COPY 1 /* mmio ops */ +#define IOREQ_TYPE_PCI_CONFIG 2 +#define IOREQ_TYPE_TIMEOFFSET 7 +#define IOREQ_TYPE_INVALIDATE 8 /* mapcache */ + +/* + * VMExit dispatcher should cooperate with instruction decoder to + * prepare this structure and notify service OS and DM by sending + * virq. + * + * For I/O type IOREQ_TYPE_PCI_CONFIG, the physical address is formatted + * as follows: + * + * 63....48|47..40|39..35|34..32|31........0 + * SEGMENT |BUS |DEV |FN |OFFSET + */ +struct ioreq { + uint64_t addr; /* physical address */ + uint64_t data; /* data (or paddr of data) */ + uint32_t count; /* for rep prefixes */ + uint32_t size; /* size in bytes */ + uint32_t vp_eport; /* evtchn for notifications to/from device model */ + uint16_t _pad0; + uint8_t state:4; + uint8_t data_is_ptr:1; /* if 1, data above is the guest paddr + * of the real data to use. */ + uint8_t dir:1; /* 1=read, 0=write */ + uint8_t df:1; + uint8_t _pad1:1; + uint8_t type; /* I/O type */ +}; +typedef struct ioreq ioreq_t; + +struct shared_iopage { + struct ioreq vcpu_ioreq[1]; +}; +typedef struct shared_iopage shared_iopage_t; + +struct buf_ioreq { + uint8_t type; /* I/O type */ + uint8_t pad:1; + uint8_t dir:1; /* 1=read, 0=write */ + uint8_t size:2; /* 0=>1, 1=>2, 2=>4, 3=>8. 
If 8, use two buf_ioreqs */ + uint32_t addr:20;/* physical address */ + uint32_t data; /* data */ +}; +typedef struct buf_ioreq buf_ioreq_t; + +#define IOREQ_BUFFER_SLOT_NUM 511 /* 8 bytes each, plus 2 4-byte indexes */ +struct buffered_iopage { +#ifdef __XEN__ + union bufioreq_pointers { + struct { +#endif + uint32_t read_pointer; + uint32_t write_pointer; +#ifdef __XEN__ + }; + uint64_t full; + } ptrs; +#endif + buf_ioreq_t buf_ioreq[IOREQ_BUFFER_SLOT_NUM]; +}; /* NB. Size of this structure must be no greater than one page. */ +typedef struct buffered_iopage buffered_iopage_t; + +/* + * ACPI Control/Event register locations. Location is controlled by a + * version number in HVM_PARAM_ACPI_IOPORTS_LOCATION. + */ + +/* Version 0 (default): Traditional Xen locations. */ +#define ACPI_PM1A_EVT_BLK_ADDRESS_V0 0x1f40 +#define ACPI_PM1A_CNT_BLK_ADDRESS_V0 (ACPI_PM1A_EVT_BLK_ADDRESS_V0 + 0x04) +#define ACPI_PM_TMR_BLK_ADDRESS_V0 (ACPI_PM1A_EVT_BLK_ADDRESS_V0 + 0x08) +#define ACPI_GPE0_BLK_ADDRESS_V0 (ACPI_PM_TMR_BLK_ADDRESS_V0 + 0x20) +#define ACPI_GPE0_BLK_LEN_V0 0x08 + +/* Version 1: Locations preferred by modern Qemu. */ +#define ACPI_PM1A_EVT_BLK_ADDRESS_V1 0xb000 +#define ACPI_PM1A_CNT_BLK_ADDRESS_V1 (ACPI_PM1A_EVT_BLK_ADDRESS_V1 + 0x04) +#define ACPI_PM_TMR_BLK_ADDRESS_V1 (ACPI_PM1A_EVT_BLK_ADDRESS_V1 + 0x08) +#define ACPI_GPE0_BLK_ADDRESS_V1 0xafe0 +#define ACPI_GPE0_BLK_LEN_V1 0x04 + +/* Compatibility definitions for the default location (version 0). */ +#define ACPI_PM1A_EVT_BLK_ADDRESS ACPI_PM1A_EVT_BLK_ADDRESS_V0 +#define ACPI_PM1A_CNT_BLK_ADDRESS ACPI_PM1A_CNT_BLK_ADDRESS_V0 +#define ACPI_PM_TMR_BLK_ADDRESS ACPI_PM_TMR_BLK_ADDRESS_V0 +#define ACPI_GPE0_BLK_ADDRESS ACPI_GPE0_BLK_ADDRESS_V0 +#define ACPI_GPE0_BLK_LEN ACPI_GPE0_BLK_LEN_V0 + + +#endif /* _IOREQ_H_ */ + +/* + * Local variables: + * mode: C + * c-file-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff --git a/include/xen/interface/memory.h b/include/xen/interface/memory.h index 9aa8988cb340e..be0493d084f70 100644 --- a/include/xen/interface/memory.h +++ b/include/xen/interface/memory.h @@ -111,6 +111,11 @@ DEFINE_GUEST_HANDLE_STRUCT(xen_memory_exchange); #define XENMEM_current_reservation 3 #define XENMEM_maximum_reservation 4 +/* + * Returns the maximum GPFN in use by the guest, or -ve errcode on failure. + */ +#define XENMEM_maximum_gpfn 14 + /* * Returns a list of MFN bases of 2MB extents comprising the machine_to_phys * mapping table. Architectures which do not have a m2p table do not implement @@ -242,6 +247,27 @@ DEFINE_GUEST_HANDLE_STRUCT(xen_memory_map); */ #define XENMEM_machine_memory_map 10 +/* + * Translate the given guest PFNs to MFNs + */ +#define XENMEM_get_mfn_from_pfn 28 +struct xen_get_mfn_from_pfn { + /* + * Pointer to buffer to fill with list of pfn. + * for IN, it contains the guest PFN that need to translated + * for OUT, it contains the translated MFN. or INVALID_MFN if no valid translation + */ + GUEST_HANDLE(ulong) pfn_list; + + /* + * IN: Size of the pfn_array. + */ + unsigned int nr_pfns; + + /* IN: which domain */ + domid_t domid; +}; +DEFINE_GUEST_HANDLE_STRUCT(xen_get_mfn_from_pfn); /* * Prevent the balloon driver from changing the memory reservation diff --git a/include/xen/interface/vcpu.h b/include/xen/interface/vcpu.h index 98188c87f5c15..fc79cb4336115 100644 --- a/include/xen/interface/vcpu.h +++ b/include/xen/interface/vcpu.h @@ -178,4 +178,49 @@ DEFINE_GUEST_HANDLE_STRUCT(vcpu_register_vcpu_info); /* Send an NMI to the specified VCPU. 
@extra_arg == NULL. */ #define VCPUOP_send_nmi 11 + +/* Request an I/O emulation for the specified VCPU. */ +#define VCPUOP_request_io_emulation 14 +#define PV_IOREQ_READ 1 +#define PV_IOREQ_WRITE 0 + +#define PV_IOREQ_TYPE_PIO 0 /* pio */ +#define PV_IOREQ_TYPE_COPY 1 /* mmio ops */ + +struct vcpu_emul_ioreq { + uint64_t addr; /* physical address */ + uint64_t data; /* data (or paddr of data) */ + uint64_t count; /* for rep prefixes */ + uint32_t size; /* size in bytes */ + uint16_t _pad0; + uint8_t state:4; + uint8_t data_is_ptr:1; /* if 1, data above is the guest paddr + * of the real data to use. */ + uint8_t dir:1; /* 1=read, 0=write */ + uint8_t df:1; + uint8_t _pad1:1; + uint8_t type; /* I/O type */ +}; +DEFINE_GUEST_HANDLE_STRUCT(vcpu_emul_ioreq); + +#define VCPUOP_get_sysdata 16 +/* sub operations */ +#define VCPUOP_sysdata_get_segment 0 +#define VCPUOP_sysdata_read 1 +struct vcpu_sysdata_request { + uint64_t op_type; + union { + struct { + uint32_t selector; + uint32_t pad1; + uint64_t xdt_desc[2]; + }; + struct { + uint64_t src_addr; /* linear address */ + uint64_t sys_data; + uint32_t bytes; + }; + }; +}; + #endif /* __XEN_PUBLIC_VCPU_H__ */ diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h index 4f4830ef8f934..6c50927055fcb 100644 --- a/include/xen/interface/xen.h +++ b/include/xen/interface/xen.h @@ -115,6 +115,7 @@ #define VIRQ_XC_RESERVED 11 /* G. Reserved for XenClient */ #define VIRQ_ENOMEM 12 /* G. (DOM0) Low on heap memory */ #define VIRQ_XENPMU 13 /* PMC interrupt */ +#define VIRQ_VGT_GFX 15 /* (DOM0) Used for graphics interrupt */ /* Architecture-specific VIRQ definitions. */ #define VIRQ_ARCH_0 16 @@ -771,6 +772,111 @@ struct tmem_op { DEFINE_GUEST_HANDLE(u64); +/* XEN_DOMCTL_getdomaininfo */ +struct xen_domctl_getdomaininfo { + /* OUT variables. */ + domid_t domain; /* Also echoed in domctl.domain */ + /* Domain is scheduled to die. */ +#define _XEN_DOMINF_dying 0 +#define XEN_DOMINF_dying (1U<<_XEN_DOMINF_dying) + /* Domain is an HVM guest (as opposed to a PV guest). */ +#define _XEN_DOMINF_hvm_guest 1 +#define XEN_DOMINF_hvm_guest (1U<<_XEN_DOMINF_hvm_guest) + /* The guest OS has shut down. */ +#define _XEN_DOMINF_shutdown 2 +#define XEN_DOMINF_shutdown (1U<<_XEN_DOMINF_shutdown) + /* Currently paused by control software. */ +#define _XEN_DOMINF_paused 3 +#define XEN_DOMINF_paused (1U<<_XEN_DOMINF_paused) + /* Currently blocked pending an event. */ +#define _XEN_DOMINF_blocked 4 +#define XEN_DOMINF_blocked (1U<<_XEN_DOMINF_blocked) + /* Domain is currently running. */ +#define _XEN_DOMINF_running 5 +#define XEN_DOMINF_running (1U<<_XEN_DOMINF_running) + /* Being debugged. */ +#define _XEN_DOMINF_debugged 6 +#define XEN_DOMINF_debugged (1U<<_XEN_DOMINF_debugged) + /* XEN_DOMINF_shutdown guest-supplied code. */ +#define XEN_DOMINF_shutdownmask 255 +#define XEN_DOMINF_shutdownshift 16 + uint32_t flags; /* XEN_DOMINF_* */ + aligned_u64 tot_pages; + aligned_u64 max_pages; + aligned_u64 outstanding_pages; + aligned_u64 shr_pages; + aligned_u64 paged_pages; + aligned_u64 shared_info_frame; /* GMFN of shared_info struct */ + aligned_u64 cpu_time; + uint32_t nr_online_vcpus; /* Number of VCPUs currently online. */ + uint32_t max_vcpu_id; /* Maximum VCPUID in use by this domain. 
*/ + uint32_t ssidref; + xen_domain_handle_t handle; + uint32_t cpupool; +}; +DEFINE_GUEST_HANDLE_STRUCT(xen_domctl_getdomaininfo); + +#define XEN_DOMCTL_INTERFACE_VERSION 0x0000000d +#define XEN_DOMCTL_pausedomain 3 +#define XEN_DOMCTL_getdomaininfo 5 +#define XEN_DOMCTL_memory_mapping 39 +#define XEN_DOMCTL_iomem_permission 20 + + +#define XEN_DOMCTL_vgt_io_trap 700 + +#define MAX_VGT_IO_TRAP_INFO 4 + +struct vgt_io_trap_info { + uint64_t s; + uint64_t e; +}; + +struct xen_domctl_vgt_io_trap { + uint32_t n_pio; + struct vgt_io_trap_info pio[MAX_VGT_IO_TRAP_INFO]; + + uint32_t n_mmio; + struct vgt_io_trap_info mmio[MAX_VGT_IO_TRAP_INFO]; +}; + +/* Bind machine I/O address range -> HVM address range. */ +/* XEN_DOMCTL_memory_mapping */ +#define DPCI_ADD_MAPPING 1 +#define DPCI_REMOVE_MAPPING 0 +struct xen_domctl_memory_mapping { + aligned_u64 first_gfn; /* first page (hvm guest phys page) in range */ + aligned_u64 first_mfn; /* first page (machine page) in range. */ + aligned_u64 nr_mfns; /* number of pages in range (>0) */ + uint32_t add_mapping; /* Add or remove mapping */ + uint32_t padding; /* padding for 64-bit aligned struct */ +}; +typedef struct xen_domctl_memory_mapping xen_domctl_memory_mapping_t; +DEFINE_GUEST_HANDLE_STRUCT(xen_domctl_memory_mapping_t); + +/* XEN_DOMCTL_iomem_permission */ +struct xen_domctl_iomem_permission { + aligned_u64 first_mfn;/* first page (physical page number) in range */ + aligned_u64 nr_mfns; /* number of pages in range (>0) */ + uint8_t allow_access; /* allow (!0) or deny (0) access to range? */ +}; +typedef struct xen_domctl_iomem_permission xen_domctl_iomem_permission_t; +DEFINE_GUEST_HANDLE_STRUCT(xen_domctl_iomem_permission_t); + +struct xen_domctl { + uint32_t cmd; + uint32_t interface_version; /* XEN_DOMCTL_INTERFACE_VERSION */ + domid_t domain; + union { + struct xen_domctl_getdomaininfo getdomaininfo; + struct xen_domctl_vgt_io_trap vgt_io_trap; + struct xen_domctl_memory_mapping memory_mapping; + struct xen_domctl_iomem_permission iomem_perm; + uint8_t pad[256]; + }u; +}; +DEFINE_GUEST_HANDLE_STRUCT(xen_domctl); + #else /* __ASSEMBLY__ */ /* In assembly code we cannot use C numeric constant suffixes. */ diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h index c44a2ee8c8f80..790b60b042646 100644 --- a/include/xen/xen-ops.h +++ b/include/xen/xen-ops.h @@ -166,4 +166,9 @@ static inline void xen_preemptible_hcall_end(void) #endif /* CONFIG_PREEMPT */ +struct vm_struct * xen_remap_domain_mfn_range_in_kernel(unsigned long mfn, + int nr, unsigned domid); +void xen_unmap_domain_mfn_range_in_kernel(struct vm_struct *area, int nr, + unsigned domid); + #endif /* INCLUDE_XEN_OPS_H */
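
The address translation in xengt_gpa_to_va() splits a guest physical address above 1MB into a 1MB-granular bucket index (into vmem_vma[]) plus a byte offset inside that bucket. A minimal sketch of that split, assuming the VMEM_BUCK_* constants from xengt.h are in scope; addresses below VMEM_1MB instead index vmem_vma_low_1mb[] with a plain 4K page number (gpa >> PAGE_SHIFT):

static inline void example_split_gpa(unsigned long gpa,
				     unsigned long *buck_index,
				     unsigned long *offset)
{
	/* Which 1MB mapping in vmem_vma[] covers this guest address... */
	*buck_index = gpa >> VMEM_BUCK_SHIFT;
	/* ...and the byte offset within that 1MB bucket. */
	*offset = gpa & (VMEM_BUCK_SIZE - 1);
}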
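The xen_dm_op_buf packaging that xengt_inject_msi() uses for XEN_DMOP_inject_msi applies to every operation declared in dm_op.h: bufs[0] carries the struct xen_dm_op and any extra buffers follow. As an illustrative sketch only (not a helper added by this patch), toggling an IOREQ server via XEN_DMOP_set_ioreq_server_state could be packaged the same way, assuming the HYPERVISOR_dm_op wrapper that xengt_inject_msi() already relies on:

#include <linux/types.h>
#include <linux/string.h>
#include <xen/interface/hvm/dm_op.h>
#include <asm/xen/hypercall.h>	/* assumed home of the HYPERVISOR_dm_op wrapper */

static int example_set_ioreq_server_state(domid_t domid, ioservid_t id,
					  bool enabled)
{
	struct xen_dm_op op;
	xen_dm_op_buf_t dm_buf;

	memset(&op, 0, sizeof(op));
	op.op = XEN_DMOP_set_ioreq_server_state;
	op.u.set_ioreq_server_state.id = id;
	op.u.set_ioreq_server_state.enabled = enabled ? 1 : 0;

	/* bufs[0] always describes the struct xen_dm_op itself. */
	dm_buf.h = &op;
	dm_buf.size = sizeof(op);

	return HYPERVISOR_dm_op(domid, 1, &dm_buf);
}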
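On the VFIO side, the new VFIO_IOMMU_GET_DIRTY_BITMAP ioctl copies in start_addr and page_nr, then copies the bitmap back immediately after that fixed header, sized at (BITS_TO_LONGS(page_nr) + 1) longs. A hedged userspace sketch of a caller, assuming a patched linux/vfio.h and an already-configured, IOMMU-enabled container fd; the caller frees the returned buffer:

#include <stdint.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static struct vfio_iommu_get_dirty_bitmap *
example_query_dirty_bitmap(int container_fd, uint64_t start_addr, uint64_t page_nr)
{
	struct vfio_iommu_get_dirty_bitmap *req;
	/* Generous sizing: at least the (BITS_TO_LONGS(page_nr) + 1) longs the
	 * kernel writes back after the fixed header. */
	size_t bitmap_sz = (page_nr / 64 + 2) * sizeof(uint64_t);

	req = calloc(1, sizeof(*req) + bitmap_sz);
	if (!req)
		return NULL;

	req->start_addr = start_addr;
	req->page_nr = page_nr;

	if (ioctl(container_fd, VFIO_IOMMU_GET_DIRTY_BITMAP, req) < 0) {
		free(req);
		return NULL;
	}

	/* One bit per page in [start_addr, start_addr + page_nr * PAGE_SIZE):
	 * set means a pinned mapping was found for that iova (treated as dirty). */
	return req;
}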
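The struct xen_domctl and XEN_DOMCTL_getdomaininfo definitions added to xen.h are what dom0-side helpers such as xen_get_nr_vcpu() can build on. A minimal, hypothetical sketch of such a query (it may differ from the patch's actual helper), assuming the HYPERVISOR_domctl wrapper introduced by this series and dom0 privileges:

#include <linux/string.h>
#include <xen/interface/xen.h>
#include <asm/xen/hypercall.h>	/* assumed home of HYPERVISOR_domctl */

static int example_get_nr_vcpu(domid_t domid)
{
	struct xen_domctl domctl;
	int rc;

	memset(&domctl, 0, sizeof(domctl));
	domctl.cmd = XEN_DOMCTL_getdomaininfo;
	domctl.interface_version = XEN_DOMCTL_INTERFACE_VERSION;
	domctl.domain = domid;

	rc = HYPERVISOR_domctl(&domctl);
	if (rc < 0)
		return rc;

	/* max_vcpu_id is the highest VCPU id in use; ids start at 0. */
	return domctl.u.getdomaininfo.max_vcpu_id + 1;
}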
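Finally, the two helpers declared in xen-ops.h pair up as in the sketch below; this mirrors how the XenGT code maps and later tears down the HVM domain's shared ioreq page, so the frame number passed in is a frame of the target domain. Illustrative only, with error handling kept minimal:

#include <linux/vmalloc.h>
#include <xen/xen-ops.h>

/* Map one frame of domain @domid into the dom0 kernel address space. */
static void *example_map_one_frame(unsigned long gfn, unsigned int domid,
				   struct vm_struct **area_out)
{
	struct vm_struct *area;

	area = xen_remap_domain_mfn_range_in_kernel(gfn, 1, domid);
	if (!area)
		return NULL;

	*area_out = area;
	return area->addr;	/* kernel virtual address of the mapping */
}

/* Undo the mapping created above; also frees the vm_struct. */
static void example_unmap_one_frame(struct vm_struct *area, unsigned int domid)
{
	xen_unmap_domain_mfn_range_in_kernel(area, 1, domid);
}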