From 9a6e2dcfdda31275296c2a55ae10ec9ee5265459 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 10 May 2017 16:32:55 +0800 Subject: [PATCH 01/31] mc146818rtc: update periodic timer only if it is needed Currently, the timer is updated whenever RegA or RegB is written even if the periodic timer related configuration is not changed. This patch optimizes it slightly to make the update happen only if its period or enable-status is changed; later patches also depend on this optimization. Signed-off-by: Xiao Guangrong Message-Id: <20170510083259.3900-2-xiaoguangrong@tencent.com> Signed-off-by: Paolo Bonzini --- hw/timer/mc146818rtc.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/hw/timer/mc146818rtc.c b/hw/timer/mc146818rtc.c index 93de3e1cc5..7d78391b62 100644 --- a/hw/timer/mc146818rtc.c +++ b/hw/timer/mc146818rtc.c @@ -391,6 +391,7 @@ static void cmos_ioport_write(void *opaque, hwaddr addr, uint64_t data, unsigned size) { RTCState *s = opaque; + bool update_periodic_timer; if ((addr & 1) == 0) { s->cmos_index = data & 0x7f; @@ -423,6 +424,8 @@ static void cmos_ioport_write(void *opaque, hwaddr addr, } break; case RTC_REG_A: + update_periodic_timer = (s->cmos_data[RTC_REG_A] ^ data) & 0x0f; + if ((data & 0x60) == 0x60) { if (rtc_running(s)) { rtc_update_time(s); @@ -445,10 +448,17 @@ static void cmos_ioport_write(void *opaque, hwaddr addr, /* UIP bit is read only */ s->cmos_data[RTC_REG_A] = (data & ~REG_A_UIP) | (s->cmos_data[RTC_REG_A] & REG_A_UIP); - periodic_timer_update(s, qemu_clock_get_ns(rtc_clock)); + + if (update_periodic_timer) { + periodic_timer_update(s, qemu_clock_get_ns(rtc_clock)); + } + check_update_timer(s); break; case RTC_REG_B: + update_periodic_timer = (s->cmos_data[RTC_REG_B] ^ data) + & REG_B_PIE; + if (data & REG_B_SET) { /* update cmos to when the rtc was stopping */ if (rtc_running(s)) { @@ -475,7 +485,11 @@ static void cmos_ioport_write(void *opaque, hwaddr addr, qemu_irq_lower(s->irq); } 
s->cmos_data[RTC_REG_B] = data; - periodic_timer_update(s, qemu_clock_get_ns(rtc_clock)); + + if (update_periodic_timer) { + periodic_timer_update(s, qemu_clock_get_ns(rtc_clock)); + } + check_update_timer(s); break; case RTC_REG_C: From 369b41359af46bded5799c9ef8be2b641d92e043 Mon Sep 17 00:00:00 2001 From: Tai Yunfang Date: Wed, 10 May 2017 16:32:56 +0800 Subject: [PATCH 02/31] mc146818rtc: precisely count the clock for periodic timer There are two issues in current code: 1) If the period is changed by re-configuring RegA, the coalesced irq will be scaled to reflect the new period, however, it calculates the new interrupt number like this: s->irq_coalesced = (s->irq_coalesced * s->period) / period; There are some clocks will be lost if they are not enough to be squeezed to a single new period that will cause the VM clock slower In order to fix the issue, we calculate the interrupt window based on the precise clock rather than period, then the clocks lost during period is scaled can be compensated properly 2) If periodic_timer_update() is called due to RegA reconfiguration, i.e, the period is updated, current time is not the start point for the next periodic timer, instead, which should start from the last interrupt, otherwise, the clock in VM will become slow This patch takes the clocks from last interrupt to current clock into account and compensates the clocks for the next interrupt, especially if a complete interrupt was lost in this window, the time can be caught up by LOST_TICK_POLICY_SLEW Signed-off-by: Tai Yunfang Signed-off-by: Xiao Guangrong Message-Id: <20170510083259.3900-3-xiaoguangrong@tencent.com> Signed-off-by: Paolo Bonzini --- hw/timer/mc146818rtc.c | 120 +++++++++++++++++++++++++++++++++-------- 1 file changed, 97 insertions(+), 23 deletions(-) diff --git a/hw/timer/mc146818rtc.c b/hw/timer/mc146818rtc.c index 7d78391b62..aeb60cc3e3 100644 --- a/hw/timer/mc146818rtc.c +++ b/hw/timer/mc146818rtc.c @@ -146,31 +146,100 @@ static void 
rtc_coalesced_timer(void *opaque) } #endif -/* handle periodic timer */ -static void periodic_timer_update(RTCState *s, int64_t current_time) +static uint32_t rtc_periodic_clock_ticks(RTCState *s) { - int period_code, period; - int64_t cur_clock, next_irq_clock; + int period_code; + + if (!(s->cmos_data[RTC_REG_B] & REG_B_PIE)) { + return 0; + } period_code = s->cmos_data[RTC_REG_A] & 0x0f; - if (period_code != 0 - && (s->cmos_data[RTC_REG_B] & REG_B_PIE)) { - if (period_code <= 2) - period_code += 7; - /* period in 32 Khz cycles */ - period = 1 << (period_code - 1); -#ifdef TARGET_I386 - if (period != s->period) { - s->irq_coalesced = (s->irq_coalesced * s->period) / period; - DPRINTF_C("cmos: coalesced irqs scaled to %d\n", s->irq_coalesced); - } - s->period = period; -#endif + if (!period_code) { + return 0; + } + + if (period_code <= 2) { + period_code += 7; + } + + /* period in 32 Khz cycles */ + return 1 << (period_code - 1); +} + +/* + * handle periodic timer. @old_period indicates the periodic timer update + * is just due to period adjustment. + */ +static void +periodic_timer_update(RTCState *s, int64_t current_time, uint32_t old_period) +{ + uint32_t period; + int64_t cur_clock, next_irq_clock, lost_clock = 0; + + period = rtc_periodic_clock_ticks(s); + + if (period) { /* compute 32 khz clock */ cur_clock = muldiv64(current_time, RTC_CLOCK_RATE, NANOSECONDS_PER_SECOND); - next_irq_clock = (cur_clock & ~(period - 1)) + period; + /* + * if the periodic timer's update is due to period re-configuration, + * we should count the clock since last interrupt. 
+ */ + if (old_period) { + int64_t last_periodic_clock, next_periodic_clock; + + next_periodic_clock = muldiv64(s->next_periodic_time, + RTC_CLOCK_RATE, NANOSECONDS_PER_SECOND); + last_periodic_clock = next_periodic_clock - old_period; + lost_clock = cur_clock - last_periodic_clock; + assert(lost_clock >= 0); + } + +#ifdef TARGET_I386 + /* + * s->irq_coalesced can change for two reasons: + * + * a) if one or more periodic timer interrupts have been lost, + * lost_clock will be more than a period. + * + * b) when the period may be reconfigured, we expect the OS to + * treat delayed tick as the new period. So, when switching + * from a shorter to a longer period, scale down the missing ticks, + * because the OS will treat past delayed ticks as longer + * (leftovers are put back into lost_clock). When switching + * to a shorter period, scale up the missing ticks since the + * OS handler will treat past delayed ticks as shorter. + */ + if (s->lost_tick_policy == LOST_TICK_POLICY_SLEW) { + uint32_t old_irq_coalesced = s->irq_coalesced; + + s->period = period; + lost_clock += old_irq_coalesced * old_period; + s->irq_coalesced = lost_clock / s->period; + lost_clock %= s->period; + if (old_irq_coalesced != s->irq_coalesced || + old_period != s->period) { + DPRINTF_C("cmos: coalesced irqs scaled from %d to %d, " + "period scaled from %d to %d\n", old_irq_coalesced, + s->irq_coalesced, old_period, s->period); + rtc_coalesced_timer_update(s); + } + } else +#endif + { + /* + * no way to compensate the interrupt if LOST_TICK_POLICY_SLEW + * is not used, we should make the time progress anyway. 
+ */ + lost_clock = MIN(lost_clock, period); + } + + assert(lost_clock >= 0 && lost_clock <= period); + + next_irq_clock = cur_clock + period - lost_clock; s->next_periodic_time = muldiv64(next_irq_clock, NANOSECONDS_PER_SECOND, RTC_CLOCK_RATE) + 1; timer_mod(s->periodic_timer, s->next_periodic_time); @@ -186,7 +255,7 @@ static void rtc_periodic_timer(void *opaque) { RTCState *s = opaque; - periodic_timer_update(s, s->next_periodic_time); + periodic_timer_update(s, s->next_periodic_time, 0); s->cmos_data[RTC_REG_C] |= REG_C_PF; if (s->cmos_data[RTC_REG_B] & REG_B_PIE) { s->cmos_data[RTC_REG_C] |= REG_C_IRQF; @@ -391,6 +460,7 @@ static void cmos_ioport_write(void *opaque, hwaddr addr, uint64_t data, unsigned size) { RTCState *s = opaque; + uint32_t old_period; bool update_periodic_timer; if ((addr & 1) == 0) { @@ -425,6 +495,7 @@ static void cmos_ioport_write(void *opaque, hwaddr addr, break; case RTC_REG_A: update_periodic_timer = (s->cmos_data[RTC_REG_A] ^ data) & 0x0f; + old_period = rtc_periodic_clock_ticks(s); if ((data & 0x60) == 0x60) { if (rtc_running(s)) { @@ -450,7 +521,8 @@ static void cmos_ioport_write(void *opaque, hwaddr addr, (s->cmos_data[RTC_REG_A] & REG_A_UIP); if (update_periodic_timer) { - periodic_timer_update(s, qemu_clock_get_ns(rtc_clock)); + periodic_timer_update(s, qemu_clock_get_ns(rtc_clock), + old_period); } check_update_timer(s); @@ -458,6 +530,7 @@ static void cmos_ioport_write(void *opaque, hwaddr addr, case RTC_REG_B: update_periodic_timer = (s->cmos_data[RTC_REG_B] ^ data) & REG_B_PIE; + old_period = rtc_periodic_clock_ticks(s); if (data & REG_B_SET) { /* update cmos to when the rtc was stopping */ @@ -487,7 +560,8 @@ static void cmos_ioport_write(void *opaque, hwaddr addr, s->cmos_data[RTC_REG_B] = data; if (update_periodic_timer) { - periodic_timer_update(s, qemu_clock_get_ns(rtc_clock)); + periodic_timer_update(s, qemu_clock_get_ns(rtc_clock), + old_period); } check_update_timer(s); @@ -757,7 +831,7 @@ static int 
rtc_post_load(void *opaque, int version_id) uint64_t now = qemu_clock_get_ns(rtc_clock); if (now < s->next_periodic_time || now > (s->next_periodic_time + get_max_clock_jump())) { - periodic_timer_update(s, qemu_clock_get_ns(rtc_clock)); + periodic_timer_update(s, qemu_clock_get_ns(rtc_clock), 0); } } @@ -822,7 +896,7 @@ static void rtc_notify_clock_reset(Notifier *notifier, void *data) int64_t now = *(int64_t *)data; rtc_set_date_from_host(ISA_DEVICE(s)); - periodic_timer_update(s, now); + periodic_timer_update(s, now, 0); check_update_timer(s); #ifdef TARGET_I386 if (s->lost_tick_policy == LOST_TICK_POLICY_SLEW) { From 4aa70a0e9cd0c0332a8369df8c4f6d8e22fafe23 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 10 May 2017 16:32:57 +0800 Subject: [PATCH 03/31] mc146818rtc: ensure LOST_TICK_POLICY_SLEW is only enabled on TARGET_I386 Any tick policy specified on platforms other than TARGET_I386 will silently fall back to LOST_TICK_POLICY_DISCARD; this patch makes sure only TARGET_I386 can enable LOST_TICK_POLICY_SLEW. After that, we can enable LOST_TICK_POLICY_SLEW in the common code, which no longer needs to use '#ifdef TARGET_I386' to make this code x86 specific. Signed-off-by: Xiao Guangrong Message-Id: <20170510083259.3900-4-xiaoguangrong@tencent.com> Signed-off-by: Paolo Bonzini --- hw/timer/mc146818rtc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hw/timer/mc146818rtc.c b/hw/timer/mc146818rtc.c index aeb60cc3e3..4870a72015 100644 --- a/hw/timer/mc146818rtc.c +++ b/hw/timer/mc146818rtc.c @@ -974,19 +974,19 @@ static void rtc_realizefn(DeviceState *dev, Error **errp) rtc_set_date_from_host(isadev); -#ifdef TARGET_I386 switch (s->lost_tick_policy) { +#ifdef TARGET_I386 case LOST_TICK_POLICY_SLEW: s->coalesced_timer = timer_new_ns(rtc_clock, rtc_coalesced_timer, s); break; +#endif case LOST_TICK_POLICY_DISCARD: break; default: error_setg(errp, "Invalid lost tick policy."); return; } -#endif s->periodic_timer = 
timer_new_ns(rtc_clock, rtc_periodic_timer, s); s->update_timer = timer_new_ns(rtc_clock, rtc_update_timer, s); From 388ad5d2969b70242a385031caadef46328bf940 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 10 May 2017 16:32:58 +0800 Subject: [PATCH 04/31] mc146818rtc: drop unnecessary '#ifdef TARGET_I386' If the code purely depends on LOST_TICK_POLICY_SLEW, we can simply drop '#ifdef TARGET_I386' as only x86 can enable this tick policy Signed-off-by: Xiao Guangrong Message-Id: <20170510083259.3900-5-xiaoguangrong@tencent.com> Signed-off-by: Paolo Bonzini --- hw/timer/mc146818rtc.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/hw/timer/mc146818rtc.c b/hw/timer/mc146818rtc.c index 4870a72015..f9d6181d58 100644 --- a/hw/timer/mc146818rtc.c +++ b/hw/timer/mc146818rtc.c @@ -112,7 +112,6 @@ static uint64_t get_guest_rtc_ns(RTCState *s) guest_clock - s->last_update + s->offset; } -#ifdef TARGET_I386 static void rtc_coalesced_timer_update(RTCState *s) { if (s->irq_coalesced == 0) { @@ -126,6 +125,7 @@ static void rtc_coalesced_timer_update(RTCState *s) } } +#ifdef TARGET_I386 static void rtc_coalesced_timer(void *opaque) { RTCState *s = opaque; @@ -198,7 +198,6 @@ periodic_timer_update(RTCState *s, int64_t current_time, uint32_t old_period) assert(lost_clock >= 0); } -#ifdef TARGET_I386 /* * s->irq_coalesced can change for two reasons: * @@ -227,9 +226,7 @@ periodic_timer_update(RTCState *s, int64_t current_time, uint32_t old_period) s->irq_coalesced, old_period, s->period); rtc_coalesced_timer_update(s); } - } else -#endif - { + } else { /* * no way to compensate the interrupt if LOST_TICK_POLICY_SLEW * is not used, we should make the time progress anyway. 
@@ -244,9 +241,7 @@ periodic_timer_update(RTCState *s, int64_t current_time, uint32_t old_period) RTC_CLOCK_RATE) + 1; timer_mod(s->periodic_timer, s->next_periodic_time); } else { -#ifdef TARGET_I386 s->irq_coalesced = 0; -#endif timer_del(s->periodic_timer); } } @@ -835,13 +830,11 @@ static int rtc_post_load(void *opaque, int version_id) } } -#ifdef TARGET_I386 if (version_id >= 2) { if (s->lost_tick_policy == LOST_TICK_POLICY_SLEW) { rtc_coalesced_timer_update(s); } } -#endif return 0; } @@ -898,11 +891,10 @@ static void rtc_notify_clock_reset(Notifier *notifier, void *data) rtc_set_date_from_host(ISA_DEVICE(s)); periodic_timer_update(s, now, 0); check_update_timer(s); -#ifdef TARGET_I386 + if (s->lost_tick_policy == LOST_TICK_POLICY_SLEW) { rtc_coalesced_timer_update(s); } -#endif } /* set CMOS shutdown status register (index 0xF) as S3_resume(0xFE) @@ -923,12 +915,10 @@ static void rtc_reset(void *opaque) qemu_irq_lower(s->irq); -#ifdef TARGET_I386 if (s->lost_tick_policy == LOST_TICK_POLICY_SLEW) { s->irq_coalesced = 0; s->irq_reinject_on_ack_count = 0; } -#endif } static const MemoryRegionOps cmos_ops = { From e0c8b950d17a57343926ed937af10e8903b0d6cc Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 10 May 2017 16:32:59 +0800 Subject: [PATCH 05/31] mc146818rtc: embrace all x86 specific code Introduce a function, rtc_policy_slew_deliver_irq(), which delivers the irq if LOST_TICK_POLICY_SLEW is used. As that policy is only supported on x86, calling it on other platforms will trigger an assert. After that, we can move the x86 specific code to the common place. Signed-off-by: Xiao Guangrong Message-Id: <20170510083259.3900-6-xiaoguangrong@tencent.com> Signed-off-by: Paolo Bonzini --- hw/timer/mc146818rtc.c | 60 ++++++++++++++++++++++-------------------- 1 file changed, 31 insertions(+), 29 deletions(-) diff --git a/hw/timer/mc146818rtc.c b/hw/timer/mc146818rtc.c index f9d6181d58..542cd09bc1 100644 --- a/hw/timer/mc146818rtc.c +++ b/hw/timer/mc146818rtc.c @@ -125,17 +125,34 
@@ static void rtc_coalesced_timer_update(RTCState *s) } } +static QLIST_HEAD(, RTCState) rtc_devices = + QLIST_HEAD_INITIALIZER(rtc_devices); + #ifdef TARGET_I386 +void qmp_rtc_reset_reinjection(Error **errp) +{ + RTCState *s; + + QLIST_FOREACH(s, &rtc_devices, link) { + s->irq_coalesced = 0; + } +} + +static bool rtc_policy_slew_deliver_irq(RTCState *s) +{ + apic_reset_irq_delivered(); + qemu_irq_raise(s->irq); + return apic_get_irq_delivered(); +} + static void rtc_coalesced_timer(void *opaque) { RTCState *s = opaque; if (s->irq_coalesced != 0) { - apic_reset_irq_delivered(); s->cmos_data[RTC_REG_C] |= 0xc0; DPRINTF_C("cmos: injecting from timer\n"); - qemu_irq_raise(s->irq); - if (apic_get_irq_delivered()) { + if (rtc_policy_slew_deliver_irq(s)) { s->irq_coalesced--; DPRINTF_C("cmos: coalesced irqs decreased to %d\n", s->irq_coalesced); @@ -144,6 +161,12 @@ static void rtc_coalesced_timer(void *opaque) rtc_coalesced_timer_update(s); } +#else +static bool rtc_policy_slew_deliver_irq(RTCState *s) +{ + assert(0); + return false; +} #endif static uint32_t rtc_periodic_clock_ticks(RTCState *s) @@ -254,21 +277,17 @@ static void rtc_periodic_timer(void *opaque) s->cmos_data[RTC_REG_C] |= REG_C_PF; if (s->cmos_data[RTC_REG_B] & REG_B_PIE) { s->cmos_data[RTC_REG_C] |= REG_C_IRQF; -#ifdef TARGET_I386 if (s->lost_tick_policy == LOST_TICK_POLICY_SLEW) { if (s->irq_reinject_on_ack_count >= RTC_REINJECT_ON_ACK_COUNT) - s->irq_reinject_on_ack_count = 0; - apic_reset_irq_delivered(); - qemu_irq_raise(s->irq); - if (!apic_get_irq_delivered()) { + s->irq_reinject_on_ack_count = 0; + if (!rtc_policy_slew_deliver_irq(s)) { s->irq_coalesced++; rtc_coalesced_timer_update(s); DPRINTF_C("cmos: coalesced irqs increased to %d\n", s->irq_coalesced); } } else -#endif - qemu_irq_raise(s->irq); + qemu_irq_raise(s->irq); } } @@ -612,20 +631,6 @@ static void rtc_get_time(RTCState *s, struct tm *tm) rtc_from_bcd(s, s->cmos_data[RTC_CENTURY]) * 100 - 1900; } -static QLIST_HEAD(, RTCState) 
rtc_devices = - QLIST_HEAD_INITIALIZER(rtc_devices); - -#ifdef TARGET_I386 -void qmp_rtc_reset_reinjection(Error **errp) -{ - RTCState *s; - - QLIST_FOREACH(s, &rtc_devices, link) { - s->irq_coalesced = 0; - } -} -#endif - static void rtc_set_time(RTCState *s) { struct tm tm; @@ -745,22 +750,19 @@ static uint64_t cmos_ioport_read(void *opaque, hwaddr addr, if (ret & (REG_C_UF | REG_C_AF)) { check_update_timer(s); } -#ifdef TARGET_I386 + if(s->irq_coalesced && (s->cmos_data[RTC_REG_B] & REG_B_PIE) && s->irq_reinject_on_ack_count < RTC_REINJECT_ON_ACK_COUNT) { s->irq_reinject_on_ack_count++; s->cmos_data[RTC_REG_C] |= REG_C_IRQF | REG_C_PF; - apic_reset_irq_delivered(); DPRINTF_C("cmos: injecting on ack\n"); - qemu_irq_raise(s->irq); - if (apic_get_irq_delivered()) { + if (rtc_policy_slew_deliver_irq(s)) { s->irq_coalesced--; DPRINTF_C("cmos: coalesced irqs decreased to %d\n", s->irq_coalesced); } } -#endif break; default: ret = s->cmos_data[s->cmos_index]; From bd618eab7641693f0838da52c5af5c8050f831d3 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Sat, 27 May 2017 10:53:01 +0800 Subject: [PATCH 06/31] qtest: add rtc periodic timer test It tests the accuracy of rtc periodic timer which is recently improved & fixed by commit 7ffcb539a3 ("mc146818rtc: precisely count the clock for periodic timer", 2017-05-19). 
Signed-off-by: Xiao Guangrong Message-Id: <20170527025301.23499-1-xiaoguangrong@tencent.com> Signed-off-by: Paolo Bonzini --- hw/timer/mc146818rtc.c | 15 ++------- include/hw/timer/mc146818rtc_regs.h | 20 ++++++++++++ tests/rtc-test.c | 49 +++++++++++++++++++++++++++++ 3 files changed, 72 insertions(+), 12 deletions(-) diff --git a/hw/timer/mc146818rtc.c b/hw/timer/mc146818rtc.c index 542cd09bc1..1b8d3d7d4c 100644 --- a/hw/timer/mc146818rtc.c +++ b/hw/timer/mc146818rtc.c @@ -120,7 +120,7 @@ static void rtc_coalesced_timer_update(RTCState *s) /* divide each RTC interval to 2 - 8 smaller intervals */ int c = MIN(s->irq_coalesced, 7) + 1; int64_t next_clock = qemu_clock_get_ns(rtc_clock) + - muldiv64(s->period / c, NANOSECONDS_PER_SECOND, RTC_CLOCK_RATE); + periodic_clock_to_ns(s->period / c); timer_mod(s->coalesced_timer, next_clock); } } @@ -178,16 +178,8 @@ static uint32_t rtc_periodic_clock_ticks(RTCState *s) } period_code = s->cmos_data[RTC_REG_A] & 0x0f; - if (!period_code) { - return 0; - } - if (period_code <= 2) { - period_code += 7; - } - - /* period in 32 Khz cycles */ - return 1 << (period_code - 1); + return periodic_period_to_clock(period_code); } /* @@ -260,8 +252,7 @@ periodic_timer_update(RTCState *s, int64_t current_time, uint32_t old_period) assert(lost_clock >= 0 && lost_clock <= period); next_irq_clock = cur_clock + period - lost_clock; - s->next_periodic_time = muldiv64(next_irq_clock, NANOSECONDS_PER_SECOND, - RTC_CLOCK_RATE) + 1; + s->next_periodic_time = periodic_clock_to_ns(next_irq_clock) + 1; timer_mod(s->periodic_timer, s->next_periodic_time); } else { s->irq_coalesced = 0; diff --git a/include/hw/timer/mc146818rtc_regs.h b/include/hw/timer/mc146818rtc_regs.h index 6ede6c832e..c62f17bf2d 100644 --- a/include/hw/timer/mc146818rtc_regs.h +++ b/include/hw/timer/mc146818rtc_regs.h @@ -65,4 +65,24 @@ #define REG_C_AF 0x20 #define REG_C_MASK 0x70 +static inline uint32_t periodic_period_to_clock(int period_code) +{ + if (!period_code) { + return 
0; + } + + if (period_code <= 2) { + period_code += 7; + } + /* period in 32 Khz cycles */ + return 1 << (period_code - 1); +} + +#define RTC_CLOCK_RATE 32768 + +static inline int64_t periodic_clock_to_ns(int64_t clocks) +{ + return muldiv64(clocks, NANOSECONDS_PER_SECOND, RTC_CLOCK_RATE); +} + #endif diff --git a/tests/rtc-test.c b/tests/rtc-test.c index a086efd120..e78f701afb 100644 --- a/tests/rtc-test.c +++ b/tests/rtc-test.c @@ -14,6 +14,7 @@ #include "qemu/osdep.h" #include "libqtest.h" +#include "qemu/timer.h" #include "hw/timer/mc146818rtc_regs.h" static uint8_t base = 0x70; @@ -542,6 +543,52 @@ static void register_b_set_flag(void) g_assert_cmpint(cmos_read(RTC_CENTURY), ==, 0x20); } +#define RTC_PERIOD_CODE1 13 /* 8 Hz */ +#define RTC_PERIOD_CODE2 15 /* 2 Hz */ + +#define RTC_PERIOD_TEST_NR 50 + +static uint64_t wait_periodic_interrupt(uint64_t real_time) +{ + while (!get_irq(RTC_ISA_IRQ)) { + real_time = clock_step_next(); + } + + g_assert((cmos_read(RTC_REG_C) & REG_C_PF) != 0); + return real_time; +} + +static void periodic_timer(void) +{ + int i; + uint64_t period_clocks, period_time, start_time, real_time; + + /* disable all interrupts. */ + cmos_write(RTC_REG_B, cmos_read(RTC_REG_B) & + ~(REG_B_PIE | REG_B_AIE | REG_B_UIE)); + cmos_write(RTC_REG_A, RTC_PERIOD_CODE1); + /* enable periodic interrupt after properly configure the period. 
*/ + cmos_write(RTC_REG_B, cmos_read(RTC_REG_B) | REG_B_PIE); + + start_time = real_time = clock_step_next(); + + for (i = 0; i < RTC_PERIOD_TEST_NR; i++) { + cmos_write(RTC_REG_A, RTC_PERIOD_CODE1); + real_time = wait_periodic_interrupt(real_time); + cmos_write(RTC_REG_A, RTC_PERIOD_CODE2); + real_time = wait_periodic_interrupt(real_time); + } + + period_clocks = periodic_period_to_clock(RTC_PERIOD_CODE1) + + periodic_period_to_clock(RTC_PERIOD_CODE2); + period_clocks *= RTC_PERIOD_TEST_NR; + period_time = periodic_clock_to_ns(period_clocks); + + real_time -= start_time; + g_assert_cmpint(ABS((int64_t)(real_time - period_time)), <=, + NANOSECONDS_PER_SECOND * 0.5); +} + int main(int argc, char **argv) { QTestState *s = NULL; @@ -564,6 +611,8 @@ int main(int argc, char **argv) qtest_add_func("/rtc/set-year/1980", set_year_1980); qtest_add_func("/rtc/misc/register_b_set_flag", register_b_set_flag); qtest_add_func("/rtc/misc/fuzz-registers", fuzz_registers); + qtest_add_func("/rtc/periodic/interrupt", periodic_timer); + ret = g_test_run(); if (s) { From 9ba35d0b865361f9838f830672ab229a569024eb Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 9 May 2017 14:00:42 +0800 Subject: [PATCH 07/31] kvm: irqchip: trace changes on msi add/remove MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It'll be nice to know which virq belongs to which device/vector when adding msi routes, so adding two more parameters for the add trace. Meanwhile, releasing virq has no tracing before. Add one for it. 
Signed-off-by: Peter Xu Message-Id: <1494309644-18743-2-git-send-email-peterx@redhat.com> Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Paolo Bonzini --- kvm-all.c | 4 +++- trace-events | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/kvm-all.c b/kvm-all.c index 494b9256aa..1b9fe23490 100644 --- a/kvm-all.c +++ b/kvm-all.c @@ -1144,6 +1144,7 @@ void kvm_irqchip_release_virq(KVMState *s, int virq) } clear_gsi(s, virq); kvm_arch_release_virq_post(virq); + trace_kvm_irqchip_release_virq(virq); } static unsigned int kvm_hash_msi(uint32_t data) @@ -1287,7 +1288,8 @@ int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev) return -EINVAL; } - trace_kvm_irqchip_add_msi_route(virq); + trace_kvm_irqchip_add_msi_route(dev ? dev->name : (char *)"N/A", + vector, virq); kvm_add_routing_entry(s, &kroute); kvm_arch_add_msi_route_post(&kroute, vector, dev); diff --git a/trace-events b/trace-events index d7a4d94168..b496be94d4 100644 --- a/trace-events +++ b/trace-events @@ -62,8 +62,9 @@ kvm_device_ioctl(int fd, int type, void *arg) "dev fd %d, type 0x%x, arg %p" kvm_failed_reg_get(uint64_t id, const char *msg) "Warning: Unable to retrieve ONEREG %" PRIu64 " from KVM: %s" kvm_failed_reg_set(uint64_t id, const char *msg) "Warning: Unable to set ONEREG %" PRIu64 " to KVM: %s" kvm_irqchip_commit_routes(void) "" -kvm_irqchip_add_msi_route(int virq) "Adding MSI route virq=%d" +kvm_irqchip_add_msi_route(char *name, int vector, int virq) "dev %s vector %d virq %d" kvm_irqchip_update_msi_route(int virq) "Updating MSI route virq=%d" +kvm_irqchip_release_virq(int virq) "virq %d" # TCG related tracing (mostly disabled by default) # cpu-exec.c From 993b1f4b2ceb7b09a7153aa01d03bdf95972e61d Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 9 May 2017 14:00:43 +0800 Subject: [PATCH 08/31] msix: trace control bit write op MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Meanwhile, abstract a function to detect msix 
masked bit. Signed-off-by: Peter Xu Message-Id: <1494309644-18743-3-git-send-email-peterx@redhat.com> Acked-by: Michael S. Tsirkin Reviewed-by: Michael S. Tsirkin Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Paolo Bonzini --- hw/pci/msix.c | 11 +++++++++-- hw/pci/trace-events | 3 +++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/hw/pci/msix.c b/hw/pci/msix.c index bb54e8b0ac..fc5fe511b3 100644 --- a/hw/pci/msix.c +++ b/hw/pci/msix.c @@ -22,6 +22,7 @@ #include "hw/xen/xen.h" #include "qemu/range.h" #include "qapi/error.h" +#include "trace.h" #define MSIX_CAP_LENGTH 12 @@ -130,10 +131,14 @@ static void msix_handle_mask_update(PCIDevice *dev, int vector, bool was_masked) } } +static bool msix_masked(PCIDevice *dev) +{ + return dev->config[dev->msix_cap + MSIX_CONTROL_OFFSET] & MSIX_MASKALL_MASK; +} + static void msix_update_function_masked(PCIDevice *dev) { - dev->msix_function_masked = !msix_enabled(dev) || - (dev->config[dev->msix_cap + MSIX_CONTROL_OFFSET] & MSIX_MASKALL_MASK); + dev->msix_function_masked = !msix_enabled(dev) || msix_masked(dev); } /* Handle MSI-X capability config write. 
*/ @@ -148,6 +153,8 @@ void msix_write_config(PCIDevice *dev, uint32_t addr, return; } + trace_msix_write_config(dev->name, msix_enabled(dev), msix_masked(dev)); + was_masked = dev->msix_function_masked; msix_update_function_masked(dev); diff --git a/hw/pci/trace-events b/hw/pci/trace-events index 2b9cf24405..83c8f5ace7 100644 --- a/hw/pci/trace-events +++ b/hw/pci/trace-events @@ -7,3 +7,6 @@ pci_update_mappings_add(void *d, uint32_t bus, uint32_t slot, uint32_t func, int # hw/pci/pci_host.c pci_cfg_read(const char *dev, unsigned devid, unsigned fnid, unsigned offs, unsigned val) "%s %02u:%u @0x%x -> 0x%x" pci_cfg_write(const char *dev, unsigned devid, unsigned fnid, unsigned offs, unsigned val) "%s %02u:%u @0x%x <- 0x%x" + +# hw/pci/msix.c +msix_write_config(char *name, bool enabled, bool masked) "dev %s enabled %d masked %d" From fd563564222f308e1d86847efdec8555fb472536 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 9 May 2017 14:00:44 +0800 Subject: [PATCH 09/31] kvm: irqchip: skip update msi when disabled It's possible that one device kept its irqfd/virq there even when MSI/MSIX was disabled globally for that device. One example is virtio-net-pci (see commit f1d0f15a6 and virtio_pci_vq_vector_mask()). It is used as a fast path to avoid allocate/release irqfd/virq frequently when guest enables/disables MSIX. However, this fast path brought a problem to msi_route_list, that the device MSIRouteEntry is still dangling there even if MSIX disabled - then we cannot know which message to fetch, even if we can, the messages are meaningless. In this case, we can just simply ignore this entry. It's safe, since when MSIX is enabled again, we'll rebuild them no matter what. 
Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1448813 Signed-off-by: Peter Xu Message-Id: <1494309644-18743-4-git-send-email-peterx@redhat.com> Signed-off-by: Paolo Bonzini --- target/i386/kvm.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/target/i386/kvm.c b/target/i386/kvm.c index 49b6115eae..9087677d00 100644 --- a/target/i386/kvm.c +++ b/target/i386/kvm.c @@ -43,6 +43,7 @@ #include "standard-headers/asm-x86/hyperv.h" #include "hw/pci/pci.h" #include "hw/pci/msi.h" +#include "hw/pci/msix.h" #include "migration/blocker.h" #include "exec/memattrs.h" #include "trace.h" @@ -3510,12 +3511,17 @@ static void kvm_update_msi_routes_all(void *private, bool global, int cnt = 0; MSIRouteEntry *entry; MSIMessage msg; + PCIDevice *dev; + /* TODO: explicit route update */ QLIST_FOREACH(entry, &msi_route_list, list) { cnt++; - msg = pci_get_msi_message(entry->dev, entry->vector); - kvm_irqchip_update_msi_route(kvm_state, entry->virq, - msg, entry->dev); + dev = entry->dev; + if (!msix_enabled(dev) && !msi_enabled(dev)) { + continue; + } + msg = pci_get_msi_message(dev, entry->vector); + kvm_irqchip_update_msi_route(kvm_state, entry->virq, msg, dev); } kvm_irqchip_commit_routes(kvm_state); trace_kvm_x86_update_msi_routes(cnt); From 7e6478e7d4f2c4b607069bf488d57089a9d3244b Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Tue, 9 May 2017 12:04:52 -0700 Subject: [PATCH 10/31] Check the return value of fcntl in qemu_set_cloexec Assert that the return value is not an error. This issue was found by Coverity. 
CID: 1374831 Signed-off-by: Stefano Stabellini CC: groug@kaod.org CC: pbonzini@redhat.com CC: Eric Blake Message-Id: <1494356693-13190-2-git-send-email-sstabellini@kernel.org> Signed-off-by: Paolo Bonzini --- util/oslib-posix.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/util/oslib-posix.c b/util/oslib-posix.c index 7e28c161b2..048d40d9de 100644 --- a/util/oslib-posix.c +++ b/util/oslib-posix.c @@ -182,7 +182,9 @@ void qemu_set_cloexec(int fd) { int f; f = fcntl(fd, F_GETFD); - fcntl(fd, F_SETFD, f | FD_CLOEXEC); + assert(f != -1); + f = fcntl(fd, F_SETFD, f | FD_CLOEXEC); + assert(f != -1); } /* From f250a42ddaee042ad2eb02022a3ebd18fcf987de Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Tue, 16 May 2017 12:45:29 +0300 Subject: [PATCH 11/31] nbd: strict nbd_wr_syncv nbd_wr_syncv is called either from coroutine or from client negotiation code, when socket is in blocking mode. So, -EAGAIN is impossible. Furthermore, EAGAIN is confusing, as, what to read/write again? With EAGAIN as a return code we don't know how much data is already read or written by the function, so in case of EAGAIN the whole communication is broken. Signed-off-by: Vladimir Sementsov-Ogievskiy Message-Id: <20170516094533.6160-2-vsementsov@virtuozzo.com> Signed-off-by: Paolo Bonzini --- nbd/common.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/nbd/common.c b/nbd/common.c index dccbb8e9de..4db45b3ede 100644 --- a/nbd/common.c +++ b/nbd/common.c @@ -20,6 +20,10 @@ #include "qapi/error.h" #include "nbd-internal.h" +/* nbd_wr_syncv + * The function may be called from coroutine or from non-coroutine context. + * When called from non-coroutine context @ioc must be in blocking mode. 
+ */ ssize_t nbd_wr_syncv(QIOChannel *ioc, struct iovec *iov, size_t niov, @@ -42,11 +46,8 @@ ssize_t nbd_wr_syncv(QIOChannel *ioc, len = qio_channel_writev(ioc, local_iov, nlocal_iov, &local_err); } if (len == QIO_CHANNEL_ERR_BLOCK) { - if (qemu_in_coroutine()) { - qio_channel_yield(ioc, do_read ? G_IO_IN : G_IO_OUT); - } else { - return -EAGAIN; - } + assert(qemu_in_coroutine()); + qio_channel_yield(ioc, do_read ? G_IO_IN : G_IO_OUT); continue; } if (len < 0) { From f5d406fe86bb28da85824b6581e58980cc1a07f3 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Tue, 16 May 2017 12:45:30 +0300 Subject: [PATCH 12/31] nbd: read_sync and friends: return 0 on success functions read_sync, drop_sync, write_sync, and also nbd_negotiate_write, nbd_negotiate_read, nbd_negotiate_drop_sync returns number of processed bytes. But what this number can be, except requested number of bytes? Actually, underlying nbd_wr_syncv function returns a value >= 0 and != requested_bytes only on eof on read operation. So, firstly, it is impossible on write (let's add an assert) and on read it actually means, that communication is broken (except nbd_receive_reply, see below). Most of callers operate like this: if (func(..., size) != size) { /* error path */ } , i.e.: 1. They are not interested in partial success 2. Extra duplications in code (especially bad are duplications of magic numbers) 3. User doesn't see actual error message, as return code is lost. (this patch doesn't fix this point, but it makes fixing easier) Several callers handles ret >= 0 and != requested-size separately, by just returning EINVAL in this case. This patch makes read_sync and friends return EINVAL in this case, so final behavior is the same. And only one caller - nbd_receive_reply() does something not so obvious. It returns EINVAL for ret > 0 and != requested-size, like previous group, but for ret == 0 it returns 0. 
The only caller of nbd_receive_reply() - nbd_read_reply_entry() handles ret == 0 in the same way as ret < 0, so for now it doesn't matter. However, in following commits error path handling will be improved and we'll need to distinguish success from fail in this case too. So, this patch adds separate helper for this case - read_sync_eof. Signed-off-by: Vladimir Sementsov-Ogievskiy Message-Id: <20170516094533.6160-3-vsementsov@virtuozzo.com> Signed-off-by: Paolo Bonzini --- nbd/client.c | 63 ++++++++++++++-------------------- nbd/nbd-internal.h | 34 ++++++++++++++++--- nbd/server.c | 84 ++++++++++++++++++---------------------------- 3 files changed, 88 insertions(+), 93 deletions(-) diff --git a/nbd/client.c b/nbd/client.c index a58fb02cb4..6b74a628f1 100644 --- a/nbd/client.c +++ b/nbd/client.c @@ -86,9 +86,9 @@ static QTAILQ_HEAD(, NBDExport) exports = QTAILQ_HEAD_INITIALIZER(exports); */ -/* Discard length bytes from channel. Return -errno on failure, or - * the amount of bytes consumed. */ -static ssize_t drop_sync(QIOChannel *ioc, size_t size) +/* Discard length bytes from channel. Return -errno on failure and 0 on + * success*/ +static int drop_sync(QIOChannel *ioc, size_t size) { ssize_t ret = 0; char small[1024]; @@ -96,14 +96,13 @@ static ssize_t drop_sync(QIOChannel *ioc, size_t size) buffer = sizeof(small) >= size ? 
small : g_malloc(MIN(65536, size)); while (size > 0) { - ssize_t count = read_sync(ioc, buffer, MIN(65536, size)); + ssize_t count = MIN(65536, size); + ret = read_sync(ioc, buffer, MIN(65536, size)); - if (count <= 0) { + if (ret < 0) { goto cleanup; } - assert(count <= size); size -= count; - ret += count; } cleanup: @@ -136,12 +135,12 @@ static int nbd_send_option_request(QIOChannel *ioc, uint32_t opt, stl_be_p(&req.option, opt); stl_be_p(&req.length, len); - if (write_sync(ioc, &req, sizeof(req)) != sizeof(req)) { + if (write_sync(ioc, &req, sizeof(req)) < 0) { error_setg(errp, "Failed to send option request header"); return -1; } - if (len && write_sync(ioc, (char *) data, len) != len) { + if (len && write_sync(ioc, (char *) data, len) < 0) { error_setg(errp, "Failed to send option request data"); return -1; } @@ -170,7 +169,7 @@ static int nbd_receive_option_reply(QIOChannel *ioc, uint32_t opt, nbd_opt_reply *reply, Error **errp) { QEMU_BUILD_BUG_ON(sizeof(*reply) != 20); - if (read_sync(ioc, reply, sizeof(*reply)) != sizeof(*reply)) { + if (read_sync(ioc, reply, sizeof(*reply)) < 0) { error_setg(errp, "failed to read option reply"); nbd_send_opt_abort(ioc); return -1; @@ -219,7 +218,7 @@ static int nbd_handle_reply_err(QIOChannel *ioc, nbd_opt_reply *reply, goto cleanup; } msg = g_malloc(reply->length + 1); - if (read_sync(ioc, msg, reply->length) != reply->length) { + if (read_sync(ioc, msg, reply->length) < 0) { error_setg(errp, "failed to read option error message"); goto cleanup; } @@ -321,7 +320,7 @@ static int nbd_receive_list(QIOChannel *ioc, const char *want, bool *match, nbd_send_opt_abort(ioc); return -1; } - if (read_sync(ioc, &namelen, sizeof(namelen)) != sizeof(namelen)) { + if (read_sync(ioc, &namelen, sizeof(namelen)) < 0) { error_setg(errp, "failed to read option name length"); nbd_send_opt_abort(ioc); return -1; @@ -334,7 +333,7 @@ static int nbd_receive_list(QIOChannel *ioc, const char *want, bool *match, return -1; } if (namelen != 
strlen(want)) { - if (drop_sync(ioc, len) != len) { + if (drop_sync(ioc, len) < 0) { error_setg(errp, "failed to skip export name with wrong length"); nbd_send_opt_abort(ioc); return -1; @@ -343,14 +342,14 @@ static int nbd_receive_list(QIOChannel *ioc, const char *want, bool *match, } assert(namelen < sizeof(name)); - if (read_sync(ioc, name, namelen) != namelen) { + if (read_sync(ioc, name, namelen) < 0) { error_setg(errp, "failed to read export name"); nbd_send_opt_abort(ioc); return -1; } name[namelen] = '\0'; len -= namelen; - if (drop_sync(ioc, len) != len) { + if (drop_sync(ioc, len) < 0) { error_setg(errp, "failed to read export description"); nbd_send_opt_abort(ioc); return -1; @@ -477,7 +476,7 @@ int nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint16_t *flags, goto fail; } - if (read_sync(ioc, buf, 8) != 8) { + if (read_sync(ioc, buf, 8) < 0) { error_setg(errp, "Failed to read data"); goto fail; } @@ -503,7 +502,7 @@ int nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint16_t *flags, goto fail; } - if (read_sync(ioc, &magic, sizeof(magic)) != sizeof(magic)) { + if (read_sync(ioc, &magic, sizeof(magic)) < 0) { error_setg(errp, "Failed to read magic"); goto fail; } @@ -515,8 +514,7 @@ int nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint16_t *flags, uint16_t globalflags; bool fixedNewStyle = false; - if (read_sync(ioc, &globalflags, sizeof(globalflags)) != - sizeof(globalflags)) { + if (read_sync(ioc, &globalflags, sizeof(globalflags)) < 0) { error_setg(errp, "Failed to read server flags"); goto fail; } @@ -534,8 +532,7 @@ int nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint16_t *flags, } /* client requested flags */ clientflags = cpu_to_be32(clientflags); - if (write_sync(ioc, &clientflags, sizeof(clientflags)) != - sizeof(clientflags)) { + if (write_sync(ioc, &clientflags, sizeof(clientflags)) < 0) { error_setg(errp, "Failed to send clientflags field"); goto fail; } @@ -573,13 +570,13 @@ int 
nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint16_t *flags, } /* Read the response */ - if (read_sync(ioc, &s, sizeof(s)) != sizeof(s)) { + if (read_sync(ioc, &s, sizeof(s)) < 0) { error_setg(errp, "Failed to read export length"); goto fail; } *size = be64_to_cpu(s); - if (read_sync(ioc, flags, sizeof(*flags)) != sizeof(*flags)) { + if (read_sync(ioc, flags, sizeof(*flags)) < 0) { error_setg(errp, "Failed to read export flags"); goto fail; } @@ -596,14 +593,14 @@ int nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint16_t *flags, goto fail; } - if (read_sync(ioc, &s, sizeof(s)) != sizeof(s)) { + if (read_sync(ioc, &s, sizeof(s)) < 0) { error_setg(errp, "Failed to read export length"); goto fail; } *size = be64_to_cpu(s); TRACE("Size is %" PRIu64, *size); - if (read_sync(ioc, &oldflags, sizeof(oldflags)) != sizeof(oldflags)) { + if (read_sync(ioc, &oldflags, sizeof(oldflags)) < 0) { error_setg(errp, "Failed to read export flags"); goto fail; } @@ -619,7 +616,7 @@ int nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint16_t *flags, } TRACE("Size is %" PRIu64 ", export flags %" PRIx16, *size, *flags); - if (zeroes && drop_sync(ioc, 124) != 124) { + if (zeroes && drop_sync(ioc, 124) < 0) { error_setg(errp, "Failed to read reserved block"); goto fail; } @@ -744,7 +741,6 @@ int nbd_disconnect(int fd) ssize_t nbd_send_request(QIOChannel *ioc, NBDRequest *request) { uint8_t buf[NBD_REQUEST_SIZE]; - ssize_t ret; TRACE("Sending request to server: " "{ .from = %" PRIu64", .len = %" PRIu32 ", .handle = %" PRIu64 @@ -759,16 +755,7 @@ ssize_t nbd_send_request(QIOChannel *ioc, NBDRequest *request) stq_be_p(buf + 16, request->from); stl_be_p(buf + 24, request->len); - ret = write_sync(ioc, buf, sizeof(buf)); - if (ret < 0) { - return ret; - } - - if (ret != sizeof(buf)) { - LOG("writing to socket failed"); - return -EINVAL; - } - return 0; + return write_sync(ioc, buf, sizeof(buf)); } ssize_t nbd_receive_reply(QIOChannel *ioc, NBDReply *reply) @@ 
-777,7 +764,7 @@ ssize_t nbd_receive_reply(QIOChannel *ioc, NBDReply *reply) uint32_t magic; ssize_t ret; - ret = read_sync(ioc, buf, sizeof(buf)); + ret = read_sync_eof(ioc, buf, sizeof(buf)); if (ret <= 0) { return ret; } diff --git a/nbd/nbd-internal.h b/nbd/nbd-internal.h index f43d990a05..e6bbc7c4b4 100644 --- a/nbd/nbd-internal.h +++ b/nbd/nbd-internal.h @@ -94,7 +94,13 @@ #define NBD_ENOSPC 28 #define NBD_ESHUTDOWN 108 -static inline ssize_t read_sync(QIOChannel *ioc, void *buffer, size_t size) +/* read_sync_eof + * Tries to read @size bytes from @ioc. Returns number of bytes actually read. + * May return a value >= 0 and < size only on EOF, i.e. when iteratively called + * qio_channel_readv() returns 0. So, there are no needs to call read_sync_eof + * iteratively. + */ +static inline ssize_t read_sync_eof(QIOChannel *ioc, void *buffer, size_t size) { struct iovec iov = { .iov_base = buffer, .iov_len = size }; /* Sockets are kept in blocking mode in the negotiation phase. After @@ -105,12 +111,32 @@ static inline ssize_t read_sync(QIOChannel *ioc, void *buffer, size_t size) return nbd_wr_syncv(ioc, &iov, 1, size, true); } -static inline ssize_t write_sync(QIOChannel *ioc, const void *buffer, - size_t size) +/* read_sync + * Reads @size bytes from @ioc. Returns 0 on success. + */ +static inline int read_sync(QIOChannel *ioc, void *buffer, size_t size) +{ + ssize_t ret = read_sync_eof(ioc, buffer, size); + + if (ret >= 0 && ret != size) { + ret = -EINVAL; + } + + return ret < 0 ? ret : 0; +} + +/* write_sync + * Writes @size bytes to @ioc. Returns 0 on success. + */ +static inline int write_sync(QIOChannel *ioc, const void *buffer, size_t size) { struct iovec iov = { .iov_base = (void *) buffer, .iov_len = size }; - return nbd_wr_syncv(ioc, &iov, 1, size, false); + ssize_t ret = nbd_wr_syncv(ioc, &iov, 1, size, false); + + assert(ret < 0 || ret == size); + + return ret < 0 ? 
ret : 0; } struct NBDTLSHandshakeData { diff --git a/nbd/server.c b/nbd/server.c index 924a1fe2db..1e1096c762 100644 --- a/nbd/server.c +++ b/nbd/server.c @@ -112,7 +112,7 @@ static gboolean nbd_negotiate_continue(QIOChannel *ioc, return TRUE; } -static ssize_t nbd_negotiate_read(QIOChannel *ioc, void *buffer, size_t size) +static int nbd_negotiate_read(QIOChannel *ioc, void *buffer, size_t size) { ssize_t ret; guint watch; @@ -130,8 +130,7 @@ static ssize_t nbd_negotiate_read(QIOChannel *ioc, void *buffer, size_t size) } -static ssize_t nbd_negotiate_write(QIOChannel *ioc, const void *buffer, - size_t size) +static int nbd_negotiate_write(QIOChannel *ioc, const void *buffer, size_t size) { ssize_t ret; guint watch; @@ -148,24 +147,24 @@ static ssize_t nbd_negotiate_write(QIOChannel *ioc, const void *buffer, return ret; } -static ssize_t nbd_negotiate_drop_sync(QIOChannel *ioc, size_t size) +static int nbd_negotiate_drop_sync(QIOChannel *ioc, size_t size) { - ssize_t ret, dropped = size; + ssize_t ret; uint8_t *buffer = g_malloc(MIN(65536, size)); while (size > 0) { - ret = nbd_negotiate_read(ioc, buffer, MIN(65536, size)); + size_t count = MIN(65536, size); + ret = nbd_negotiate_read(ioc, buffer, count); if (ret < 0) { g_free(buffer); return ret; } - assert(ret <= size); - size -= ret; + size -= count; } g_free(buffer); - return dropped; + return 0; } /* Basic flow for negotiation @@ -206,22 +205,22 @@ static int nbd_negotiate_send_rep_len(QIOChannel *ioc, uint32_t type, type, opt, len); magic = cpu_to_be64(NBD_REP_MAGIC); - if (nbd_negotiate_write(ioc, &magic, sizeof(magic)) != sizeof(magic)) { + if (nbd_negotiate_write(ioc, &magic, sizeof(magic)) < 0) { LOG("write failed (rep magic)"); return -EINVAL; } opt = cpu_to_be32(opt); - if (nbd_negotiate_write(ioc, &opt, sizeof(opt)) != sizeof(opt)) { + if (nbd_negotiate_write(ioc, &opt, sizeof(opt)) < 0) { LOG("write failed (rep opt)"); return -EINVAL; } type = cpu_to_be32(type); - if (nbd_negotiate_write(ioc, &type, 
sizeof(type)) != sizeof(type)) { + if (nbd_negotiate_write(ioc, &type, sizeof(type)) < 0) { LOG("write failed (rep type)"); return -EINVAL; } len = cpu_to_be32(len); - if (nbd_negotiate_write(ioc, &len, sizeof(len)) != sizeof(len)) { + if (nbd_negotiate_write(ioc, &len, sizeof(len)) < 0) { LOG("write failed (rep data length)"); return -EINVAL; } @@ -256,7 +255,7 @@ nbd_negotiate_send_rep_err(QIOChannel *ioc, uint32_t type, if (ret < 0) { goto out; } - if (nbd_negotiate_write(ioc, msg, len) != len) { + if (nbd_negotiate_write(ioc, msg, len) < 0) { LOG("write failed (error message)"); ret = -EIO; } else { @@ -287,15 +286,15 @@ static int nbd_negotiate_send_rep_list(QIOChannel *ioc, NBDExport *exp) } len = cpu_to_be32(name_len); - if (nbd_negotiate_write(ioc, &len, sizeof(len)) != sizeof(len)) { + if (nbd_negotiate_write(ioc, &len, sizeof(len)) < 0) { LOG("write failed (name length)"); return -EINVAL; } - if (nbd_negotiate_write(ioc, name, name_len) != name_len) { + if (nbd_negotiate_write(ioc, name, name_len) < 0) { LOG("write failed (name buffer)"); return -EINVAL; } - if (nbd_negotiate_write(ioc, desc, desc_len) != desc_len) { + if (nbd_negotiate_write(ioc, desc, desc_len) < 0) { LOG("write failed (description buffer)"); return -EINVAL; } @@ -309,7 +308,7 @@ static int nbd_negotiate_handle_list(NBDClient *client, uint32_t length) NBDExport *exp; if (length) { - if (nbd_negotiate_drop_sync(client->ioc, length) != length) { + if (nbd_negotiate_drop_sync(client->ioc, length) < 0) { return -EIO; } return nbd_negotiate_send_rep_err(client->ioc, @@ -340,7 +339,7 @@ static int nbd_negotiate_handle_export_name(NBDClient *client, uint32_t length) LOG("Bad length received"); goto fail; } - if (nbd_negotiate_read(client->ioc, name, length) != length) { + if (nbd_negotiate_read(client->ioc, name, length) < 0) { LOG("read failed"); goto fail; } @@ -373,7 +372,7 @@ static QIOChannel *nbd_negotiate_handle_starttls(NBDClient *client, TRACE("Setting up TLS"); ioc = client->ioc; if 
(length) { - if (nbd_negotiate_drop_sync(ioc, length) != length) { + if (nbd_negotiate_drop_sync(ioc, length) < 0) { return NULL; } nbd_negotiate_send_rep_err(ioc, NBD_REP_ERR_INVALID, NBD_OPT_STARTTLS, @@ -437,8 +436,7 @@ static int nbd_negotiate_options(NBDClient *client) ... Rest of request */ - if (nbd_negotiate_read(client->ioc, &flags, sizeof(flags)) != - sizeof(flags)) { + if (nbd_negotiate_read(client->ioc, &flags, sizeof(flags)) < 0) { LOG("read failed"); return -EIO; } @@ -464,8 +462,7 @@ static int nbd_negotiate_options(NBDClient *client) uint32_t clientflags, length; uint64_t magic; - if (nbd_negotiate_read(client->ioc, &magic, sizeof(magic)) != - sizeof(magic)) { + if (nbd_negotiate_read(client->ioc, &magic, sizeof(magic)) < 0) { LOG("read failed"); return -EINVAL; } @@ -476,14 +473,14 @@ static int nbd_negotiate_options(NBDClient *client) } if (nbd_negotiate_read(client->ioc, &clientflags, - sizeof(clientflags)) != sizeof(clientflags)) { + sizeof(clientflags)) < 0) + { LOG("read failed"); return -EINVAL; } clientflags = be32_to_cpu(clientflags); - if (nbd_negotiate_read(client->ioc, &length, sizeof(length)) != - sizeof(length)) { + if (nbd_negotiate_read(client->ioc, &length, sizeof(length)) < 0) { LOG("read failed"); return -EINVAL; } @@ -513,7 +510,7 @@ static int nbd_negotiate_options(NBDClient *client) return -EINVAL; default: - if (nbd_negotiate_drop_sync(client->ioc, length) != length) { + if (nbd_negotiate_drop_sync(client->ioc, length) < 0) { return -EIO; } ret = nbd_negotiate_send_rep_err(client->ioc, @@ -551,7 +548,7 @@ static int nbd_negotiate_options(NBDClient *client) return nbd_negotiate_handle_export_name(client, length); case NBD_OPT_STARTTLS: - if (nbd_negotiate_drop_sync(client->ioc, length) != length) { + if (nbd_negotiate_drop_sync(client->ioc, length) < 0) { return -EIO; } if (client->tlscreds) { @@ -570,7 +567,7 @@ static int nbd_negotiate_options(NBDClient *client) } break; default: - if (nbd_negotiate_drop_sync(client->ioc, 
length) != length) { + if (nbd_negotiate_drop_sync(client->ioc, length) < 0) { return -EIO; } ret = nbd_negotiate_send_rep_err(client->ioc, @@ -659,12 +656,12 @@ static coroutine_fn int nbd_negotiate(NBDClientNewData *data) TRACE("TLS cannot be enabled with oldstyle protocol"); goto fail; } - if (nbd_negotiate_write(client->ioc, buf, sizeof(buf)) != sizeof(buf)) { + if (nbd_negotiate_write(client->ioc, buf, sizeof(buf)) < 0) { LOG("write failed"); goto fail; } } else { - if (nbd_negotiate_write(client->ioc, buf, 18) != 18) { + if (nbd_negotiate_write(client->ioc, buf, 18) < 0) { LOG("write failed"); goto fail; } @@ -679,7 +676,7 @@ static coroutine_fn int nbd_negotiate(NBDClientNewData *data) stq_be_p(buf + 18, client->exp->size); stw_be_p(buf + 26, client->exp->nbdflags | myflags); len = client->no_zeroes ? 10 : sizeof(buf) - 18; - if (nbd_negotiate_write(client->ioc, buf + 18, len) != len) { + if (nbd_negotiate_write(client->ioc, buf + 18, len) < 0) { LOG("write failed"); goto fail; } @@ -702,11 +699,6 @@ static ssize_t nbd_receive_request(QIOChannel *ioc, NBDRequest *request) return ret; } - if (ret != sizeof(buf)) { - LOG("read failed"); - return -EINVAL; - } - /* Request [ 0 .. 3] magic (NBD_REQUEST_MAGIC) [ 4 .. 5] flags (NBD_CMD_FLAG_FUA, ...) 
@@ -737,7 +729,6 @@ static ssize_t nbd_receive_request(QIOChannel *ioc, NBDRequest *request) static ssize_t nbd_send_reply(QIOChannel *ioc, NBDReply *reply) { uint8_t buf[NBD_REPLY_SIZE]; - ssize_t ret; reply->error = system_errno_to_nbd_errno(reply->error); @@ -754,16 +745,7 @@ static ssize_t nbd_send_reply(QIOChannel *ioc, NBDReply *reply) stl_be_p(buf + 4, reply->error); stq_be_p(buf + 8, reply->handle); - ret = write_sync(ioc, buf, sizeof(buf)); - if (ret < 0) { - return ret; - } - - if (ret != sizeof(buf)) { - LOG("writing to socket failed"); - return -EINVAL; - } - return 0; + return write_sync(ioc, buf, sizeof(buf)); } #define MAX_NBD_REQUESTS 16 @@ -1067,7 +1049,7 @@ static ssize_t nbd_co_send_reply(NBDRequestData *req, NBDReply *reply, rc = nbd_send_reply(client->ioc, reply); if (rc >= 0) { ret = write_sync(client->ioc, req->data, len); - if (ret != len) { + if (ret < 0) { rc = -EIO; } } @@ -1141,7 +1123,7 @@ static ssize_t nbd_co_receive_request(NBDRequestData *req, if (request->type == NBD_CMD_WRITE) { TRACE("Reading %" PRIu32 " byte(s)", request->len); - if (read_sync(client->ioc, req->data, request->len) != request->len) { + if (read_sync(client->ioc, req->data, request->len) < 0) { LOG("reading from socket failed"); rc = -EIO; goto out; From f2609565369429bc1619d106b200106dba29290e Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Tue, 16 May 2017 12:45:31 +0300 Subject: [PATCH 13/31] nbd: add errp parameter to nbd_wr_syncv() Will be used in following patch to provide actual error message in some cases. 
Signed-off-by: Vladimir Sementsov-Ogievskiy Message-Id: <20170516094533.6160-4-vsementsov@virtuozzo.com> Signed-off-by: Paolo Bonzini --- block/nbd-client.c | 4 ++-- include/block/nbd.h | 3 ++- nbd/common.c | 12 +++++------- nbd/nbd-internal.h | 4 ++-- 4 files changed, 11 insertions(+), 12 deletions(-) diff --git a/block/nbd-client.c b/block/nbd-client.c index 1e2952fdae..538d95e031 100644 --- a/block/nbd-client.c +++ b/block/nbd-client.c @@ -136,7 +136,7 @@ static int nbd_co_send_request(BlockDriverState *bs, rc = nbd_send_request(s->ioc, request); if (rc >= 0) { ret = nbd_wr_syncv(s->ioc, qiov->iov, qiov->niov, request->len, - false); + false, NULL); if (ret != request->len) { rc = -EIO; } @@ -165,7 +165,7 @@ static void nbd_co_receive_reply(NBDClientSession *s, } else { if (qiov && reply->error == 0) { ret = nbd_wr_syncv(s->ioc, qiov->iov, qiov->niov, request->len, - true); + true, NULL); if (ret != request->len) { reply->error = EIO; } diff --git a/include/block/nbd.h b/include/block/nbd.h index 0ed077502e..9d385ea564 100644 --- a/include/block/nbd.h +++ b/include/block/nbd.h @@ -127,7 +127,8 @@ ssize_t nbd_wr_syncv(QIOChannel *ioc, struct iovec *iov, size_t niov, size_t length, - bool do_read); + bool do_read, + Error **errp); int nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint16_t *flags, QCryptoTLSCreds *tlscreds, const char *hostname, QIOChannel **outioc, diff --git a/nbd/common.c b/nbd/common.c index 4db45b3ede..bd81637ab9 100644 --- a/nbd/common.c +++ b/nbd/common.c @@ -28,10 +28,10 @@ ssize_t nbd_wr_syncv(QIOChannel *ioc, struct iovec *iov, size_t niov, size_t length, - bool do_read) + bool do_read, + Error **errp) { ssize_t done = 0; - Error *local_err = NULL; struct iovec *local_iov = g_new(struct iovec, niov); struct iovec *local_iov_head = local_iov; unsigned int nlocal_iov = niov; @@ -41,19 +41,17 @@ ssize_t nbd_wr_syncv(QIOChannel *ioc, while (nlocal_iov > 0) { ssize_t len; if (do_read) { - len = qio_channel_readv(ioc, local_iov, 
nlocal_iov, &local_err); + len = qio_channel_readv(ioc, local_iov, nlocal_iov, errp); } else { - len = qio_channel_writev(ioc, local_iov, nlocal_iov, &local_err); + len = qio_channel_writev(ioc, local_iov, nlocal_iov, errp); } if (len == QIO_CHANNEL_ERR_BLOCK) { + /* errp should not be set */ assert(qemu_in_coroutine()); qio_channel_yield(ioc, do_read ? G_IO_IN : G_IO_OUT); continue; } if (len < 0) { - TRACE("I/O error: %s", error_get_pretty(local_err)); - error_free(local_err); - /* XXX handle Error objects */ done = -EIO; goto cleanup; } diff --git a/nbd/nbd-internal.h b/nbd/nbd-internal.h index e6bbc7c4b4..1d479fe135 100644 --- a/nbd/nbd-internal.h +++ b/nbd/nbd-internal.h @@ -108,7 +108,7 @@ static inline ssize_t read_sync_eof(QIOChannel *ioc, void *buffer, size_t size) * our request/reply. Synchronization is done with recv_coroutine, so * that this is coroutine-safe. */ - return nbd_wr_syncv(ioc, &iov, 1, size, true); + return nbd_wr_syncv(ioc, &iov, 1, size, true, NULL); } /* read_sync @@ -132,7 +132,7 @@ static inline int write_sync(QIOChannel *ioc, const void *buffer, size_t size) { struct iovec iov = { .iov_base = (void *) buffer, .iov_len = size }; - ssize_t ret = nbd_wr_syncv(ioc, &iov, 1, size, false); + ssize_t ret = nbd_wr_syncv(ioc, &iov, 1, size, false, NULL); assert(ret < 0 || ret == size); From e44ed99d1949315755bffb12a5a483ac66d4a976 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Tue, 16 May 2017 12:45:32 +0300 Subject: [PATCH 14/31] nbd: add errp to read_sync, write_sync and drop_sync There are a lot of callers of these functions which already have an errp that they fill in themselves. On the other hand, nbd_wr_syncv has an errp parameter too, so it would be great to connect them.
Signed-off-by: Vladimir Sementsov-Ogievskiy Message-Id: <20170516094533.6160-5-vsementsov@virtuozzo.com> Signed-off-by: Paolo Bonzini --- nbd/client.c | 76 +++++++++++++++++++++++----------------------- nbd/nbd-internal.h | 16 ++++++---- nbd/server.c | 12 ++++---- 3 files changed, 54 insertions(+), 50 deletions(-) diff --git a/nbd/client.c b/nbd/client.c index 6b74a628f1..f102375504 100644 --- a/nbd/client.c +++ b/nbd/client.c @@ -88,7 +88,7 @@ static QTAILQ_HEAD(, NBDExport) exports = QTAILQ_HEAD_INITIALIZER(exports); /* Discard length bytes from channel. Return -errno on failure and 0 on * success*/ -static int drop_sync(QIOChannel *ioc, size_t size) +static int drop_sync(QIOChannel *ioc, size_t size, Error **errp) { ssize_t ret = 0; char small[1024]; @@ -97,7 +97,7 @@ static int drop_sync(QIOChannel *ioc, size_t size) buffer = sizeof(small) >= size ? small : g_malloc(MIN(65536, size)); while (size > 0) { ssize_t count = MIN(65536, size); - ret = read_sync(ioc, buffer, MIN(65536, size)); + ret = read_sync(ioc, buffer, MIN(65536, size), errp); if (ret < 0) { goto cleanup; @@ -135,13 +135,13 @@ static int nbd_send_option_request(QIOChannel *ioc, uint32_t opt, stl_be_p(&req.option, opt); stl_be_p(&req.length, len); - if (write_sync(ioc, &req, sizeof(req)) < 0) { - error_setg(errp, "Failed to send option request header"); + if (write_sync(ioc, &req, sizeof(req), errp) < 0) { + error_prepend(errp, "Failed to send option request header"); return -1; } - if (len && write_sync(ioc, (char *) data, len) < 0) { - error_setg(errp, "Failed to send option request data"); + if (len && write_sync(ioc, (char *) data, len, errp) < 0) { + error_prepend(errp, "Failed to send option request data"); return -1; } @@ -169,8 +169,8 @@ static int nbd_receive_option_reply(QIOChannel *ioc, uint32_t opt, nbd_opt_reply *reply, Error **errp) { QEMU_BUILD_BUG_ON(sizeof(*reply) != 20); - if (read_sync(ioc, reply, sizeof(*reply)) < 0) { - error_setg(errp, "failed to read option reply"); + if 
(read_sync(ioc, reply, sizeof(*reply), errp) < 0) { + error_prepend(errp, "failed to read option reply"); nbd_send_opt_abort(ioc); return -1; } @@ -218,8 +218,8 @@ static int nbd_handle_reply_err(QIOChannel *ioc, nbd_opt_reply *reply, goto cleanup; } msg = g_malloc(reply->length + 1); - if (read_sync(ioc, msg, reply->length) < 0) { - error_setg(errp, "failed to read option error message"); + if (read_sync(ioc, msg, reply->length, errp) < 0) { + error_prepend(errp, "failed to read option error message"); goto cleanup; } msg[reply->length] = '\0'; @@ -320,8 +320,8 @@ static int nbd_receive_list(QIOChannel *ioc, const char *want, bool *match, nbd_send_opt_abort(ioc); return -1; } - if (read_sync(ioc, &namelen, sizeof(namelen)) < 0) { - error_setg(errp, "failed to read option name length"); + if (read_sync(ioc, &namelen, sizeof(namelen), errp) < 0) { + error_prepend(errp, "failed to read option name length"); nbd_send_opt_abort(ioc); return -1; } @@ -333,8 +333,8 @@ static int nbd_receive_list(QIOChannel *ioc, const char *want, bool *match, return -1; } if (namelen != strlen(want)) { - if (drop_sync(ioc, len) < 0) { - error_setg(errp, "failed to skip export name with wrong length"); + if (drop_sync(ioc, len, errp) < 0) { + error_prepend(errp, "failed to skip export name with wrong length"); nbd_send_opt_abort(ioc); return -1; } @@ -342,15 +342,15 @@ static int nbd_receive_list(QIOChannel *ioc, const char *want, bool *match, } assert(namelen < sizeof(name)); - if (read_sync(ioc, name, namelen) < 0) { - error_setg(errp, "failed to read export name"); + if (read_sync(ioc, name, namelen, errp) < 0) { + error_prepend(errp, "failed to read export name"); nbd_send_opt_abort(ioc); return -1; } name[namelen] = '\0'; len -= namelen; - if (drop_sync(ioc, len) < 0) { - error_setg(errp, "failed to read export description"); + if (drop_sync(ioc, len, errp) < 0) { + error_prepend(errp, "failed to read export description"); nbd_send_opt_abort(ioc); return -1; } @@ -476,8 +476,8 @@ int 
nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint16_t *flags, goto fail; } - if (read_sync(ioc, buf, 8) < 0) { - error_setg(errp, "Failed to read data"); + if (read_sync(ioc, buf, 8, errp) < 0) { + error_prepend(errp, "Failed to read data"); goto fail; } @@ -502,8 +502,8 @@ int nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint16_t *flags, goto fail; } - if (read_sync(ioc, &magic, sizeof(magic)) < 0) { - error_setg(errp, "Failed to read magic"); + if (read_sync(ioc, &magic, sizeof(magic), errp) < 0) { + error_prepend(errp, "Failed to read magic"); goto fail; } magic = be64_to_cpu(magic); @@ -514,8 +514,8 @@ int nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint16_t *flags, uint16_t globalflags; bool fixedNewStyle = false; - if (read_sync(ioc, &globalflags, sizeof(globalflags)) < 0) { - error_setg(errp, "Failed to read server flags"); + if (read_sync(ioc, &globalflags, sizeof(globalflags), errp) < 0) { + error_prepend(errp, "Failed to read server flags"); goto fail; } globalflags = be16_to_cpu(globalflags); @@ -532,8 +532,8 @@ int nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint16_t *flags, } /* client requested flags */ clientflags = cpu_to_be32(clientflags); - if (write_sync(ioc, &clientflags, sizeof(clientflags)) < 0) { - error_setg(errp, "Failed to send clientflags field"); + if (write_sync(ioc, &clientflags, sizeof(clientflags), errp) < 0) { + error_prepend(errp, "Failed to send clientflags field"); goto fail; } if (tlscreds) { @@ -570,14 +570,14 @@ int nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint16_t *flags, } /* Read the response */ - if (read_sync(ioc, &s, sizeof(s)) < 0) { - error_setg(errp, "Failed to read export length"); + if (read_sync(ioc, &s, sizeof(s), errp) < 0) { + error_prepend(errp, "Failed to read export length"); goto fail; } *size = be64_to_cpu(s); - if (read_sync(ioc, flags, sizeof(*flags)) < 0) { - error_setg(errp, "Failed to read export flags"); + if (read_sync(ioc, flags, 
sizeof(*flags), errp) < 0) { + error_prepend(errp, "Failed to read export flags"); goto fail; } be16_to_cpus(flags); @@ -593,15 +593,15 @@ int nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint16_t *flags, goto fail; } - if (read_sync(ioc, &s, sizeof(s)) < 0) { - error_setg(errp, "Failed to read export length"); + if (read_sync(ioc, &s, sizeof(s), errp) < 0) { + error_prepend(errp, "Failed to read export length"); goto fail; } *size = be64_to_cpu(s); TRACE("Size is %" PRIu64, *size); - if (read_sync(ioc, &oldflags, sizeof(oldflags)) < 0) { - error_setg(errp, "Failed to read export flags"); + if (read_sync(ioc, &oldflags, sizeof(oldflags), errp) < 0) { + error_prepend(errp, "Failed to read export flags"); goto fail; } be32_to_cpus(&oldflags); @@ -616,8 +616,8 @@ int nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint16_t *flags, } TRACE("Size is %" PRIu64 ", export flags %" PRIx16, *size, *flags); - if (zeroes && drop_sync(ioc, 124) < 0) { - error_setg(errp, "Failed to read reserved block"); + if (zeroes && drop_sync(ioc, 124, errp) < 0) { + error_prepend(errp, "Failed to read reserved block"); goto fail; } rc = 0; @@ -755,7 +755,7 @@ ssize_t nbd_send_request(QIOChannel *ioc, NBDRequest *request) stq_be_p(buf + 16, request->from); stl_be_p(buf + 24, request->len); - return write_sync(ioc, buf, sizeof(buf)); + return write_sync(ioc, buf, sizeof(buf), NULL); } ssize_t nbd_receive_reply(QIOChannel *ioc, NBDReply *reply) @@ -764,7 +764,7 @@ ssize_t nbd_receive_reply(QIOChannel *ioc, NBDReply *reply) uint32_t magic; ssize_t ret; - ret = read_sync_eof(ioc, buf, sizeof(buf)); + ret = read_sync_eof(ioc, buf, sizeof(buf), NULL); if (ret <= 0) { return ret; } diff --git a/nbd/nbd-internal.h b/nbd/nbd-internal.h index 1d479fe135..d6071640a0 100644 --- a/nbd/nbd-internal.h +++ b/nbd/nbd-internal.h @@ -100,7 +100,8 @@ * qio_channel_readv() returns 0. So, there are no needs to call read_sync_eof * iteratively. 
*/ -static inline ssize_t read_sync_eof(QIOChannel *ioc, void *buffer, size_t size) +static inline ssize_t read_sync_eof(QIOChannel *ioc, void *buffer, size_t size, + Error **errp) { struct iovec iov = { .iov_base = buffer, .iov_len = size }; /* Sockets are kept in blocking mode in the negotiation phase. After @@ -108,18 +109,20 @@ static inline ssize_t read_sync_eof(QIOChannel *ioc, void *buffer, size_t size) * our request/reply. Synchronization is done with recv_coroutine, so * that this is coroutine-safe. */ - return nbd_wr_syncv(ioc, &iov, 1, size, true, NULL); + return nbd_wr_syncv(ioc, &iov, 1, size, true, errp); } /* read_sync * Reads @size bytes from @ioc. Returns 0 on success. */ -static inline int read_sync(QIOChannel *ioc, void *buffer, size_t size) +static inline int read_sync(QIOChannel *ioc, void *buffer, size_t size, + Error **errp) { - ssize_t ret = read_sync_eof(ioc, buffer, size); + ssize_t ret = read_sync_eof(ioc, buffer, size, errp); if (ret >= 0 && ret != size) { ret = -EINVAL; + error_setg(errp, "End of file"); } return ret < 0 ? ret : 0; @@ -128,11 +131,12 @@ static inline int read_sync(QIOChannel *ioc, void *buffer, size_t size) /* write_sync * Writes @size bytes to @ioc. Returns 0 on success. 
*/ -static inline int write_sync(QIOChannel *ioc, const void *buffer, size_t size) +static inline int write_sync(QIOChannel *ioc, const void *buffer, size_t size, + Error **errp) { struct iovec iov = { .iov_base = (void *) buffer, .iov_len = size }; - ssize_t ret = nbd_wr_syncv(ioc, &iov, 1, size, false, NULL); + ssize_t ret = nbd_wr_syncv(ioc, &iov, 1, size, false, errp); assert(ret < 0 || ret == size); diff --git a/nbd/server.c b/nbd/server.c index 1e1096c762..ee59e5d234 100644 --- a/nbd/server.c +++ b/nbd/server.c @@ -124,7 +124,7 @@ static int nbd_negotiate_read(QIOChannel *ioc, void *buffer, size_t size) nbd_negotiate_continue, qemu_coroutine_self(), NULL); - ret = read_sync(ioc, buffer, size); + ret = read_sync(ioc, buffer, size, NULL); g_source_remove(watch); return ret; @@ -142,7 +142,7 @@ static int nbd_negotiate_write(QIOChannel *ioc, const void *buffer, size_t size) nbd_negotiate_continue, qemu_coroutine_self(), NULL); - ret = write_sync(ioc, buffer, size); + ret = write_sync(ioc, buffer, size, NULL); g_source_remove(watch); return ret; } @@ -694,7 +694,7 @@ static ssize_t nbd_receive_request(QIOChannel *ioc, NBDRequest *request) uint32_t magic; ssize_t ret; - ret = read_sync(ioc, buf, sizeof(buf)); + ret = read_sync(ioc, buf, sizeof(buf), NULL); if (ret < 0) { return ret; } @@ -745,7 +745,7 @@ static ssize_t nbd_send_reply(QIOChannel *ioc, NBDReply *reply) stl_be_p(buf + 4, reply->error); stq_be_p(buf + 8, reply->handle); - return write_sync(ioc, buf, sizeof(buf)); + return write_sync(ioc, buf, sizeof(buf), NULL); } #define MAX_NBD_REQUESTS 16 @@ -1048,7 +1048,7 @@ static ssize_t nbd_co_send_reply(NBDRequestData *req, NBDReply *reply, qio_channel_set_cork(client->ioc, true); rc = nbd_send_reply(client->ioc, reply); if (rc >= 0) { - ret = write_sync(client->ioc, req->data, len); + ret = write_sync(client->ioc, req->data, len, NULL); if (ret < 0) { rc = -EIO; } @@ -1123,7 +1123,7 @@ static ssize_t nbd_co_receive_request(NBDRequestData *req, if 
(request->type == NBD_CMD_WRITE) { TRACE("Reading %" PRIu32 " byte(s)", request->len); - if (read_sync(client->ioc, req->data, request->len) < 0) { + if (read_sync(client->ioc, req->data, request->len, NULL) < 0) { LOG("reading from socket failed"); rc = -EIO; goto out; From be41c100c0d67f6072ddd4910c4b6f7d239f025f Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Fri, 26 May 2017 14:09:13 +0300 Subject: [PATCH 15/31] nbd/client.c: use errp instead of LOG Move to modern errp scheme from just LOGging errors. Signed-off-by: Vladimir Sementsov-Ogievskiy Message-Id: <20170526110913.89098-1-vsementsov@virtuozzo.com> Signed-off-by: Paolo Bonzini --- block/nbd-client.c | 7 ++++++- include/block/nbd.h | 5 +++-- nbd/client.c | 30 +++++++++++++++++------------- qemu-nbd.c | 3 ++- tests/qemu-iotests/083.out | 2 ++ 5 files changed, 30 insertions(+), 17 deletions(-) diff --git a/block/nbd-client.c b/block/nbd-client.c index 538d95e031..09d955bc4d 100644 --- a/block/nbd-client.c +++ b/block/nbd-client.c @@ -28,6 +28,7 @@ */ #include "qemu/osdep.h" +#include "qapi/error.h" #include "nbd-client.h" #define HANDLE_TO_INDEX(bs, handle) ((handle) ^ ((uint64_t)(intptr_t)bs)) @@ -70,10 +71,14 @@ static coroutine_fn void nbd_read_reply_entry(void *opaque) NBDClientSession *s = opaque; uint64_t i; int ret; + Error *local_err = NULL; for (;;) { assert(s->reply.handle == 0); - ret = nbd_receive_reply(s->ioc, &s->reply); + ret = nbd_receive_reply(s->ioc, &s->reply, &local_err); + if (ret < 0) { + error_report_err(local_err); + } if (ret <= 0) { break; } diff --git a/include/block/nbd.h b/include/block/nbd.h index 9d385ea564..416257abca 100644 --- a/include/block/nbd.h +++ b/include/block/nbd.h @@ -133,9 +133,10 @@ int nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint16_t *flags, QCryptoTLSCreds *tlscreds, const char *hostname, QIOChannel **outioc, off_t *size, Error **errp); -int nbd_init(int fd, QIOChannelSocket *sioc, uint16_t flags, off_t size); +int 
nbd_init(int fd, QIOChannelSocket *sioc, uint16_t flags, off_t size, + Error **errp); ssize_t nbd_send_request(QIOChannel *ioc, NBDRequest *request); -ssize_t nbd_receive_reply(QIOChannel *ioc, NBDReply *reply); +ssize_t nbd_receive_reply(QIOChannel *ioc, NBDReply *reply, Error **errp); int nbd_client(int fd); int nbd_disconnect(int fd); diff --git a/nbd/client.c b/nbd/client.c index f102375504..595d99ed30 100644 --- a/nbd/client.c +++ b/nbd/client.c @@ -627,11 +627,13 @@ fail: } #ifdef __linux__ -int nbd_init(int fd, QIOChannelSocket *sioc, uint16_t flags, off_t size) +int nbd_init(int fd, QIOChannelSocket *sioc, uint16_t flags, off_t size, + Error **errp) { unsigned long sectors = size / BDRV_SECTOR_SIZE; if (size / BDRV_SECTOR_SIZE != sectors) { - LOG("Export size %lld too large for 32-bit kernel", (long long) size); + error_setg(errp, "Export size %lld too large for 32-bit kernel", + (long long) size); return -E2BIG; } @@ -639,7 +641,7 @@ int nbd_init(int fd, QIOChannelSocket *sioc, uint16_t flags, off_t size) if (ioctl(fd, NBD_SET_SOCK, (unsigned long) sioc->fd) < 0) { int serrno = errno; - LOG("Failed to set NBD socket"); + error_setg(errp, "Failed to set NBD socket"); return -serrno; } @@ -647,7 +649,7 @@ int nbd_init(int fd, QIOChannelSocket *sioc, uint16_t flags, off_t size) if (ioctl(fd, NBD_SET_BLKSIZE, (unsigned long)BDRV_SECTOR_SIZE) < 0) { int serrno = errno; - LOG("Failed setting NBD block size"); + error_setg(errp, "Failed setting NBD block size"); return -serrno; } @@ -659,7 +661,7 @@ int nbd_init(int fd, QIOChannelSocket *sioc, uint16_t flags, off_t size) if (ioctl(fd, NBD_SET_SIZE_BLOCKS, sectors) < 0) { int serrno = errno; - LOG("Failed setting size (in blocks)"); + error_setg(errp, "Failed setting size (in blocks)"); return -serrno; } @@ -670,12 +672,12 @@ int nbd_init(int fd, QIOChannelSocket *sioc, uint16_t flags, off_t size) if (ioctl(fd, BLKROSET, (unsigned long) &read_only) < 0) { int serrno = errno; - LOG("Failed setting read-only 
attribute"); + error_setg(errp, "Failed setting read-only attribute"); return -serrno; } } else { int serrno = errno; - LOG("Failed setting flags"); + error_setg(errp, "Failed setting flags"); return -serrno; } } @@ -723,8 +725,10 @@ int nbd_disconnect(int fd) } #else -int nbd_init(int fd, QIOChannelSocket *ioc, uint16_t flags, off_t size) +int nbd_init(int fd, QIOChannelSocket *ioc, uint16_t flags, off_t size, + Error **errp) { + error_setg(errp, "nbd_init is only supported on Linux"); return -ENOTSUP; } @@ -758,19 +762,19 @@ ssize_t nbd_send_request(QIOChannel *ioc, NBDRequest *request) return write_sync(ioc, buf, sizeof(buf), NULL); } -ssize_t nbd_receive_reply(QIOChannel *ioc, NBDReply *reply) +ssize_t nbd_receive_reply(QIOChannel *ioc, NBDReply *reply, Error **errp) { uint8_t buf[NBD_REPLY_SIZE]; uint32_t magic; ssize_t ret; - ret = read_sync_eof(ioc, buf, sizeof(buf), NULL); + ret = read_sync_eof(ioc, buf, sizeof(buf), errp); if (ret <= 0) { return ret; } if (ret != sizeof(buf)) { - LOG("read failed"); + error_setg(errp, "read failed"); return -EINVAL; } @@ -788,7 +792,7 @@ ssize_t nbd_receive_reply(QIOChannel *ioc, NBDReply *reply) if (reply->error == ESHUTDOWN) { /* This works even on mingw which lacks a native ESHUTDOWN */ - LOG("server shutting down"); + error_setg(errp, "server shutting down"); return -EINVAL; } TRACE("Got reply: { magic = 0x%" PRIx32 ", .error = % " PRId32 @@ -796,7 +800,7 @@ ssize_t nbd_receive_reply(QIOChannel *ioc, NBDReply *reply) magic, reply->error, reply->handle); if (magic != NBD_REPLY_MAGIC) { - LOG("invalid magic (got 0x%" PRIx32 ")", magic); + error_setg(errp, "invalid magic (got 0x%" PRIx32 ")", magic); return -EINVAL; } return sizeof(buf); diff --git a/qemu-nbd.c b/qemu-nbd.c index b7ab86bfa7..f60842fd86 100644 --- a/qemu-nbd.c +++ b/qemu-nbd.c @@ -288,8 +288,9 @@ static void *nbd_client_thread(void *arg) goto out_socket; } - ret = nbd_init(fd, sioc, nbdflags, size); + ret = nbd_init(fd, sioc, nbdflags, size, &local_error); 
if (ret < 0) { + error_report_err(local_error); goto out_fd; } diff --git a/tests/qemu-iotests/083.out b/tests/qemu-iotests/083.out index 0c13888ba1..a24c6bfece 100644 --- a/tests/qemu-iotests/083.out +++ b/tests/qemu-iotests/083.out @@ -69,10 +69,12 @@ read failed: Input/output error === Check disconnect 4 reply === +read failed read failed: Input/output error === Check disconnect 8 reply === +read failed read failed: Input/output error === Check disconnect before data === From 003a0cf2cd1828a1141a874428571267b117f765 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Mon, 15 May 2017 16:50:57 +0800 Subject: [PATCH 16/31] exec: simplify phys_page_find() params It really only plays with the dispatchers, so the parameter list does not need that complexity. This helps for readability at least. Signed-off-by: Peter Xu Message-Id: <1494838260-30439-2-git-send-email-peterx@redhat.com> Reviewed-by: David Gibson Signed-off-by: Paolo Bonzini --- exec.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/exec.c b/exec.c index b1db12fe36..a93e209625 100644 --- a/exec.c +++ b/exec.c @@ -374,10 +374,11 @@ static inline bool section_covers_addr(const MemoryRegionSection *section, int128_getlo(section->size), addr); } -static MemoryRegionSection *phys_page_find(PhysPageEntry lp, hwaddr addr, - Node *nodes, MemoryRegionSection *sections) +static MemoryRegionSection *phys_page_find(AddressSpaceDispatch *d, hwaddr addr) { - PhysPageEntry *p; + PhysPageEntry lp = d->phys_map, *p; + Node *nodes = d->map.nodes; + MemoryRegionSection *sections = d->map.sections; hwaddr index = addr >> TARGET_PAGE_BITS; int i; @@ -415,8 +416,7 @@ static MemoryRegionSection *address_space_lookup_region(AddressSpaceDispatch *d, section_covers_addr(section, addr)) { update = false; } else { - section = phys_page_find(d->phys_map, addr, d->map.nodes, - d->map.sections); + section = phys_page_find(d, addr); update = true; } if (resolve_subpage && section->mr->subpage) { @@ -1285,8 
+1285,7 @@ static void register_subpage(AddressSpaceDispatch *d, MemoryRegionSection *secti subpage_t *subpage; hwaddr base = section->offset_within_address_space & TARGET_PAGE_MASK; - MemoryRegionSection *existing = phys_page_find(d->phys_map, base, - d->map.nodes, d->map.sections); + MemoryRegionSection *existing = phys_page_find(d, base); MemoryRegionSection subsection = { .offset_within_address_space = base, .size = int128_make64(TARGET_PAGE_SIZE), From 2cbe2de5454cf9af44b620b2b40d56361a12a45f Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Thu, 18 May 2017 18:28:08 +0800 Subject: [PATCH 17/31] virtio-scsi: Unset hotplug handler when unrealize This matches the qbus_set_hotplug_handler in realize, and it releases the final reference to the embedded VirtIODevice so that it is properly finalized. A use-after-free is fixed with this patch, indirectly: virtio_device_instance_finalize wasn't called at hot-unplug, and the vdev->listener would be a dangling pointer in the global and the per address space listener list. See also RHBZ 1449031. Cc: qemu-stable@nongnu.org Signed-off-by: Fam Zheng Message-Id: <20170518102808.30046-1-famz@redhat.com> Signed-off-by: Paolo Bonzini --- hw/scsi/virtio-scsi.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c index 46a3e3f280..f46f06d055 100644 --- a/hw/scsi/virtio-scsi.c +++ b/hw/scsi/virtio-scsi.c @@ -918,6 +918,9 @@ void virtio_scsi_common_unrealize(DeviceState *dev, Error **errp) static void virtio_scsi_device_unrealize(DeviceState *dev, Error **errp) { + VirtIOSCSI *s = VIRTIO_SCSI(dev); + + qbus_set_hotplug_handler(BUS(&s->bus), NULL, &error_abort); virtio_scsi_common_unrealize(dev, errp); } From c8bc83a4dd29a9a33f5be81686bfe6e2e628097b Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 11 May 2017 13:35:28 +0200 Subject: [PATCH 18/31] target/i386: enable A20 automatically in system management mode Ignore env->a20_mask when running in system management mode. 
Reported-by: Anthony Xu Signed-off-by: Paolo Bonzini Message-Id: <1494502528-12670-1-git-send-email-pbonzini@redhat.com> Signed-off-by: Paolo Bonzini --- target/i386/arch_memory_mapping.c | 18 +++++++------ target/i386/cpu.h | 9 +++++++ target/i386/helper.c | 42 +++++++++++++++++-------------- 3 files changed, 42 insertions(+), 27 deletions(-) diff --git a/target/i386/arch_memory_mapping.c b/target/i386/arch_memory_mapping.c index 826aee597b..647cff2829 100644 --- a/target/i386/arch_memory_mapping.c +++ b/target/i386/arch_memory_mapping.c @@ -272,25 +272,27 @@ void x86_cpu_get_memory_mapping(CPUState *cs, MemoryMappingList *list, { X86CPU *cpu = X86_CPU(cs); CPUX86State *env = &cpu->env; + int32_t a20_mask; if (!cpu_paging_enabled(cs)) { /* paging is disabled */ return; } + a20_mask = x86_get_a20_mask(env); if (env->cr[4] & CR4_PAE_MASK) { #ifdef TARGET_X86_64 if (env->hflags & HF_LMA_MASK) { if (env->cr[4] & CR4_LA57_MASK) { hwaddr pml5e_addr; - pml5e_addr = (env->cr[3] & PLM4_ADDR_MASK) & env->a20_mask; - walk_pml5e(list, cs->as, pml5e_addr, env->a20_mask); + pml5e_addr = (env->cr[3] & PLM4_ADDR_MASK) & a20_mask; + walk_pml5e(list, cs->as, pml5e_addr, a20_mask); } else { hwaddr pml4e_addr; - pml4e_addr = (env->cr[3] & PLM4_ADDR_MASK) & env->a20_mask; - walk_pml4e(list, cs->as, pml4e_addr, env->a20_mask, + pml4e_addr = (env->cr[3] & PLM4_ADDR_MASK) & a20_mask; + walk_pml4e(list, cs->as, pml4e_addr, a20_mask, 0xffffULL << 48); } } else @@ -298,16 +300,16 @@ void x86_cpu_get_memory_mapping(CPUState *cs, MemoryMappingList *list, { hwaddr pdpe_addr; - pdpe_addr = (env->cr[3] & ~0x1f) & env->a20_mask; - walk_pdpe2(list, cs->as, pdpe_addr, env->a20_mask); + pdpe_addr = (env->cr[3] & ~0x1f) & a20_mask; + walk_pdpe2(list, cs->as, pdpe_addr, a20_mask); } } else { hwaddr pde_addr; bool pse; - pde_addr = (env->cr[3] & ~0xfff) & env->a20_mask; + pde_addr = (env->cr[3] & ~0xfff) & a20_mask; pse = !!(env->cr[4] & CR4_PSE_MASK); - walk_pde2(list, cs->as, pde_addr, env->a20_mask, 
pse); + walk_pde2(list, cs->as, pde_addr, a20_mask, pse); } } diff --git a/target/i386/cpu.h b/target/i386/cpu.h index cfe825f0a4..0facb354b5 100644 --- a/target/i386/cpu.h +++ b/target/i386/cpu.h @@ -1625,6 +1625,15 @@ static inline MemTxAttrs cpu_get_mem_attrs(CPUX86State *env) return ((MemTxAttrs) { .secure = (env->hflags & HF_SMM_MASK) != 0 }); } +static inline int32_t x86_get_a20_mask(CPUX86State *env) +{ + if (env->hflags & HF_SMM_MASK) { + return -1; + } else { + return env->a20_mask; + } +} + /* fpu_helper.c */ void cpu_set_mxcsr(CPUX86State *env, uint32_t val); void cpu_set_fpuc(CPUX86State *env, uint16_t val); diff --git a/target/i386/helper.c b/target/i386/helper.c index ee7eff2f6f..3850c56701 100644 --- a/target/i386/helper.c +++ b/target/i386/helper.c @@ -724,6 +724,7 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, X86CPU *cpu = X86_CPU(cs); CPUX86State *env = &cpu->env; uint64_t ptep, pte; + int32_t a20_mask; target_ulong pde_addr, pte_addr; int error_code = 0; int is_dirty, prot, page_size, is_write, is_user; @@ -739,6 +740,7 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, #endif is_write = is_write1 & 1; + a20_mask = x86_get_a20_mask(env); if (!(env->cr[0] & CR0_PG_MASK)) { pte = addr; #ifdef TARGET_X86_64 @@ -777,7 +779,7 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, if (la57) { pml5e_addr = ((env->cr[3] & ~0xfff) + - (((addr >> 48) & 0x1ff) << 3)) & env->a20_mask; + (((addr >> 48) & 0x1ff) << 3)) & a20_mask; pml5e = x86_ldq_phys(cs, pml5e_addr); if (!(pml5e & PG_PRESENT_MASK)) { goto do_fault; @@ -796,7 +798,7 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, } pml4e_addr = ((pml5e & PG_ADDRESS_MASK) + - (((addr >> 39) & 0x1ff) << 3)) & env->a20_mask; + (((addr >> 39) & 0x1ff) << 3)) & a20_mask; pml4e = x86_ldq_phys(cs, pml4e_addr); if (!(pml4e & PG_PRESENT_MASK)) { goto do_fault; @@ -810,7 +812,7 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, } ptep &= pml4e ^ PG_NX_MASK; pdpe_addr = ((pml4e & 
PG_ADDRESS_MASK) + (((addr >> 30) & 0x1ff) << 3)) & - env->a20_mask; + a20_mask; pdpe = x86_ldq_phys(cs, pdpe_addr); if (!(pdpe & PG_PRESENT_MASK)) { goto do_fault; @@ -835,7 +837,7 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, { /* XXX: load them when cr3 is loaded ? */ pdpe_addr = ((env->cr[3] & ~0x1f) + ((addr >> 27) & 0x18)) & - env->a20_mask; + a20_mask; pdpe = x86_ldq_phys(cs, pdpe_addr); if (!(pdpe & PG_PRESENT_MASK)) { goto do_fault; @@ -848,7 +850,7 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, } pde_addr = ((pdpe & PG_ADDRESS_MASK) + (((addr >> 21) & 0x1ff) << 3)) & - env->a20_mask; + a20_mask; pde = x86_ldq_phys(cs, pde_addr); if (!(pde & PG_PRESENT_MASK)) { goto do_fault; @@ -870,7 +872,7 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, x86_stl_phys_notdirty(cs, pde_addr, pde); } pte_addr = ((pde & PG_ADDRESS_MASK) + (((addr >> 12) & 0x1ff) << 3)) & - env->a20_mask; + a20_mask; pte = x86_ldq_phys(cs, pte_addr); if (!(pte & PG_PRESENT_MASK)) { goto do_fault; @@ -886,7 +888,7 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, /* page directory entry */ pde_addr = ((env->cr[3] & ~0xfff) + ((addr >> 20) & 0xffc)) & - env->a20_mask; + a20_mask; pde = x86_ldl_phys(cs, pde_addr); if (!(pde & PG_PRESENT_MASK)) { goto do_fault; @@ -913,7 +915,7 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, /* page directory entry */ pte_addr = ((pde & ~0xfff) + ((addr >> 10) & 0xffc)) & - env->a20_mask; + a20_mask; pte = x86_ldl_phys(cs, pte_addr); if (!(pte & PG_PRESENT_MASK)) { goto do_fault; @@ -992,7 +994,7 @@ do_check_protect_pse36: } do_mapping: - pte = pte & env->a20_mask; + pte = pte & a20_mask; /* align to page_size */ pte &= PG_ADDRESS_MASK & ~(page_size - 1); @@ -1039,11 +1041,13 @@ hwaddr x86_cpu_get_phys_page_debug(CPUState *cs, vaddr addr) CPUX86State *env = &cpu->env; target_ulong pde_addr, pte_addr; uint64_t pte; + int32_t a20_mask; uint32_t page_offset; int page_size; + a20_mask = x86_get_a20_mask(env); if 
(!(env->cr[0] & CR0_PG_MASK)) { - pte = addr & env->a20_mask; + pte = addr & a20_mask; page_size = 4096; } else if (env->cr[4] & CR4_PAE_MASK) { target_ulong pdpe_addr; @@ -1064,7 +1068,7 @@ hwaddr x86_cpu_get_phys_page_debug(CPUState *cs, vaddr addr) if (la57) { pml5e_addr = ((env->cr[3] & ~0xfff) + - (((addr >> 48) & 0x1ff) << 3)) & env->a20_mask; + (((addr >> 48) & 0x1ff) << 3)) & a20_mask; pml5e = x86_ldq_phys(cs, pml5e_addr); if (!(pml5e & PG_PRESENT_MASK)) { return -1; @@ -1074,13 +1078,13 @@ hwaddr x86_cpu_get_phys_page_debug(CPUState *cs, vaddr addr) } pml4e_addr = ((pml5e & PG_ADDRESS_MASK) + - (((addr >> 39) & 0x1ff) << 3)) & env->a20_mask; + (((addr >> 39) & 0x1ff) << 3)) & a20_mask; pml4e = x86_ldq_phys(cs, pml4e_addr); if (!(pml4e & PG_PRESENT_MASK)) { return -1; } pdpe_addr = ((pml4e & PG_ADDRESS_MASK) + - (((addr >> 30) & 0x1ff) << 3)) & env->a20_mask; + (((addr >> 30) & 0x1ff) << 3)) & a20_mask; pdpe = x86_ldq_phys(cs, pdpe_addr); if (!(pdpe & PG_PRESENT_MASK)) { return -1; @@ -1095,14 +1099,14 @@ hwaddr x86_cpu_get_phys_page_debug(CPUState *cs, vaddr addr) #endif { pdpe_addr = ((env->cr[3] & ~0x1f) + ((addr >> 27) & 0x18)) & - env->a20_mask; + a20_mask; pdpe = x86_ldq_phys(cs, pdpe_addr); if (!(pdpe & PG_PRESENT_MASK)) return -1; } pde_addr = ((pdpe & PG_ADDRESS_MASK) + - (((addr >> 21) & 0x1ff) << 3)) & env->a20_mask; + (((addr >> 21) & 0x1ff) << 3)) & a20_mask; pde = x86_ldq_phys(cs, pde_addr); if (!(pde & PG_PRESENT_MASK)) { return -1; @@ -1114,7 +1118,7 @@ hwaddr x86_cpu_get_phys_page_debug(CPUState *cs, vaddr addr) } else { /* 4 KB page */ pte_addr = ((pde & PG_ADDRESS_MASK) + - (((addr >> 12) & 0x1ff) << 3)) & env->a20_mask; + (((addr >> 12) & 0x1ff) << 3)) & a20_mask; page_size = 4096; pte = x86_ldq_phys(cs, pte_addr); } @@ -1125,7 +1129,7 @@ hwaddr x86_cpu_get_phys_page_debug(CPUState *cs, vaddr addr) uint32_t pde; /* page directory entry */ - pde_addr = ((env->cr[3] & ~0xfff) + ((addr >> 20) & 0xffc)) & env->a20_mask; + pde_addr = 
((env->cr[3] & ~0xfff) + ((addr >> 20) & 0xffc)) & a20_mask; pde = x86_ldl_phys(cs, pde_addr); if (!(pde & PG_PRESENT_MASK)) return -1; @@ -1134,14 +1138,14 @@ hwaddr x86_cpu_get_phys_page_debug(CPUState *cs, vaddr addr) page_size = 4096 * 1024; } else { /* page directory entry */ - pte_addr = ((pde & ~0xfff) + ((addr >> 10) & 0xffc)) & env->a20_mask; + pte_addr = ((pde & ~0xfff) + ((addr >> 10) & 0xffc)) & a20_mask; pte = x86_ldl_phys(cs, pte_addr); if (!(pte & PG_PRESENT_MASK)) { return -1; } page_size = 4096; } - pte = pte & env->a20_mask; + pte = pte & a20_mask; } #ifdef TARGET_X86_64 From f8c45c6550b9ff1e1f0b92709ff3213a79870879 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 1 Mar 2017 10:34:48 +0100 Subject: [PATCH 19/31] target/i386: use multiple CPU AddressSpaces This speeds up SMM switches. Later on it may remove the need to take the BQL, and it may also allow to reuse code between TCG and KVM. Signed-off-by: Paolo Bonzini --- target/i386/cpu.c | 15 +++++++---- target/i386/cpu.h | 11 +++++++- target/i386/helper.c | 54 ++++++++++++++++++++-------------------- target/i386/machine.c | 4 --- target/i386/smm_helper.c | 18 -------------- 5 files changed, 47 insertions(+), 55 deletions(-) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index ffb5267162..3f832a6a94 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -3239,7 +3239,7 @@ static void x86_cpu_machine_done(Notifier *n, void *unused) cpu->smram = g_new(MemoryRegion, 1); memory_region_init_alias(cpu->smram, OBJECT(cpu), "smram", smram, 0, 1ull << 32); - memory_region_set_enabled(cpu->smram, false); + memory_region_set_enabled(cpu->smram, true); memory_region_add_subregion_overlap(cpu->cpu_as_root, 0, cpu->smram, 1); } } @@ -3619,7 +3619,9 @@ static void x86_cpu_realizefn(DeviceState *dev, Error **errp) #ifndef CONFIG_USER_ONLY if (tcg_enabled()) { - AddressSpace *newas = g_new(AddressSpace, 1); + AddressSpace *as_normal = address_space_init_shareable(cs->memory, + "cpu-memory"); + 
AddressSpace *as_smm = g_new(AddressSpace, 1); cpu->cpu_as_mem = g_new(MemoryRegion, 1); cpu->cpu_as_root = g_new(MemoryRegion, 1); @@ -3635,9 +3637,11 @@ static void x86_cpu_realizefn(DeviceState *dev, Error **errp) get_system_memory(), 0, ~0ull); memory_region_add_subregion_overlap(cpu->cpu_as_root, 0, cpu->cpu_as_mem, 0); memory_region_set_enabled(cpu->cpu_as_mem, true); - address_space_init(newas, cpu->cpu_as_root, "CPU"); - cs->num_ases = 1; - cpu_address_space_init(cs, newas, 0); + address_space_init(as_smm, cpu->cpu_as_root, "CPU"); + + cs->num_ases = 2; + cpu_address_space_init(cs, as_normal, 0); + cpu_address_space_init(cs, as_smm, 1); /* ... SMRAM with higher priority, linked from /machine/smram. */ cpu->machine_done.notify = x86_cpu_machine_done; @@ -4053,6 +4057,7 @@ static void x86_cpu_common_class_init(ObjectClass *oc, void *data) #ifdef CONFIG_USER_ONLY cc->handle_mmu_fault = x86_cpu_handle_mmu_fault; #else + cc->asidx_from_attrs = x86_asidx_from_attrs; cc->get_memory_mapping = x86_cpu_get_memory_mapping; cc->get_phys_page_debug = x86_cpu_get_phys_page_debug; cc->write_elf64_note = x86_cpu_write_elf64_note; diff --git a/target/i386/cpu.h b/target/i386/cpu.h index 0facb354b5..de0551f775 100644 --- a/target/i386/cpu.h +++ b/target/i386/cpu.h @@ -1451,6 +1451,16 @@ int x86_cpu_handle_mmu_fault(CPUState *cpu, vaddr addr, void x86_cpu_set_a20(X86CPU *cpu, int a20_state); #ifndef CONFIG_USER_ONLY +static inline int x86_asidx_from_attrs(CPUState *cs, MemTxAttrs attrs) +{ + return !!attrs.secure; +} + +static inline AddressSpace *cpu_addressspace(CPUState *cs, MemTxAttrs attrs) +{ + return cpu_get_address_space(cs, cpu_asidx_from_attrs(cs, attrs)); +} + uint8_t x86_ldub_phys(CPUState *cs, hwaddr addr); uint32_t x86_lduw_phys(CPUState *cs, hwaddr addr); uint32_t x86_ldl_phys(CPUState *cs, hwaddr addr); @@ -1653,7 +1663,6 @@ void do_interrupt_x86_hardirq(CPUX86State *env, int intno, int is_hw); /* smm_helper.c */ void do_smm_enter(X86CPU *cpu); -void 
cpu_smm_update(X86CPU *cpu); /* apic.c */ void cpu_report_tpr_access(CPUX86State *env, TPRAccess access); diff --git a/target/i386/helper.c b/target/i386/helper.c index 3850c56701..ef0505949a 100644 --- a/target/i386/helper.c +++ b/target/i386/helper.c @@ -1403,89 +1403,89 @@ uint8_t x86_ldub_phys(CPUState *cs, hwaddr addr) { X86CPU *cpu = X86_CPU(cs); CPUX86State *env = &cpu->env; + MemTxAttrs attrs = cpu_get_mem_attrs(env); + AddressSpace *as = cpu_addressspace(cs, attrs); - return address_space_ldub(cs->as, addr, - cpu_get_mem_attrs(env), - NULL); + return address_space_ldub(as, addr, attrs, NULL); } uint32_t x86_lduw_phys(CPUState *cs, hwaddr addr) { X86CPU *cpu = X86_CPU(cs); CPUX86State *env = &cpu->env; + MemTxAttrs attrs = cpu_get_mem_attrs(env); + AddressSpace *as = cpu_addressspace(cs, attrs); - return address_space_lduw(cs->as, addr, - cpu_get_mem_attrs(env), - NULL); + return address_space_lduw(as, addr, attrs, NULL); } uint32_t x86_ldl_phys(CPUState *cs, hwaddr addr) { X86CPU *cpu = X86_CPU(cs); CPUX86State *env = &cpu->env; + MemTxAttrs attrs = cpu_get_mem_attrs(env); + AddressSpace *as = cpu_addressspace(cs, attrs); - return address_space_ldl(cs->as, addr, - cpu_get_mem_attrs(env), - NULL); + return address_space_ldl(as, addr, attrs, NULL); } uint64_t x86_ldq_phys(CPUState *cs, hwaddr addr) { X86CPU *cpu = X86_CPU(cs); CPUX86State *env = &cpu->env; + MemTxAttrs attrs = cpu_get_mem_attrs(env); + AddressSpace *as = cpu_addressspace(cs, attrs); - return address_space_ldq(cs->as, addr, - cpu_get_mem_attrs(env), - NULL); + return address_space_ldq(as, addr, attrs, NULL); } void x86_stb_phys(CPUState *cs, hwaddr addr, uint8_t val) { X86CPU *cpu = X86_CPU(cs); CPUX86State *env = &cpu->env; + MemTxAttrs attrs = cpu_get_mem_attrs(env); + AddressSpace *as = cpu_addressspace(cs, attrs); - address_space_stb(cs->as, addr, val, - cpu_get_mem_attrs(env), - NULL); + address_space_stb(as, addr, val, attrs, NULL); } void x86_stl_phys_notdirty(CPUState *cs, hwaddr 
addr, uint32_t val) { X86CPU *cpu = X86_CPU(cs); CPUX86State *env = &cpu->env; + MemTxAttrs attrs = cpu_get_mem_attrs(env); + AddressSpace *as = cpu_addressspace(cs, attrs); - address_space_stl_notdirty(cs->as, addr, val, - cpu_get_mem_attrs(env), - NULL); + address_space_stl_notdirty(as, addr, val, attrs, NULL); } void x86_stw_phys(CPUState *cs, hwaddr addr, uint32_t val) { X86CPU *cpu = X86_CPU(cs); CPUX86State *env = &cpu->env; + MemTxAttrs attrs = cpu_get_mem_attrs(env); + AddressSpace *as = cpu_addressspace(cs, attrs); - address_space_stw(cs->as, addr, val, - cpu_get_mem_attrs(env), - NULL); + address_space_stw(as, addr, val, attrs, NULL); } void x86_stl_phys(CPUState *cs, hwaddr addr, uint32_t val) { X86CPU *cpu = X86_CPU(cs); CPUX86State *env = &cpu->env; + MemTxAttrs attrs = cpu_get_mem_attrs(env); + AddressSpace *as = cpu_addressspace(cs, attrs); - address_space_stl(cs->as, addr, val, - cpu_get_mem_attrs(env), - NULL); + address_space_stl(as, addr, val, attrs, NULL); } void x86_stq_phys(CPUState *cs, hwaddr addr, uint64_t val) { X86CPU *cpu = X86_CPU(cs); CPUX86State *env = &cpu->env; + MemTxAttrs attrs = cpu_get_mem_attrs(env); + AddressSpace *as = cpu_addressspace(cs, attrs); - address_space_stq(cs->as, addr, val, - cpu_get_mem_attrs(env), - NULL); + address_space_stq(as, addr, val, attrs, NULL); } #endif diff --git a/target/i386/machine.c b/target/i386/machine.c index 3cb272948e..8c7a822e9f 100644 --- a/target/i386/machine.c +++ b/target/i386/machine.c @@ -274,10 +274,6 @@ static int cpu_post_load(void *opaque, int version_id) cpu_x86_update_dr7(env, dr7); } tlb_flush(cs); - - if (tcg_enabled()) { - cpu_smm_update(cpu); - } return 0; } diff --git a/target/i386/smm_helper.c b/target/i386/smm_helper.c index f051a77c4a..90621e5977 100644 --- a/target/i386/smm_helper.c +++ b/target/i386/smm_helper.c @@ -43,19 +43,6 @@ void helper_rsm(CPUX86State *env) #define SMM_REVISION_ID 0x00020000 #endif -/* Called with iothread lock taken */ -void 
cpu_smm_update(X86CPU *cpu) -{ - CPUX86State *env = &cpu->env; - bool smm_enabled = (env->hflags & HF_SMM_MASK); - - g_assert(qemu_mutex_iothread_locked()); - - if (cpu->smram) { - memory_region_set_enabled(cpu->smram, smm_enabled); - } -} - void do_smm_enter(X86CPU *cpu) { CPUX86State *env = &cpu->env; @@ -73,7 +60,6 @@ void do_smm_enter(X86CPU *cpu) } else { env->hflags2 |= HF2_NMI_MASK; } - cpu_smm_update(cpu); sm_state = env->smbase + 0x8000; @@ -338,10 +324,6 @@ void helper_rsm(CPUX86State *env) env->hflags2 &= ~HF2_SMM_INSIDE_NMI_MASK; env->hflags &= ~HF_SMM_MASK; - qemu_mutex_lock_iothread(); - cpu_smm_update(cpu); - qemu_mutex_unlock_iothread(); - qemu_log_mask(CPU_LOG_INT, "SMM: after RSM\n"); log_cpu_state_mask(CPU_LOG_INT, CPU(cpu), CPU_DUMP_CCOP); } From 5b003a40bb1ab14d0398e91f03393d3c6b9577cd Mon Sep 17 00:00:00 2001 From: Mihail Abakumov Date: Fri, 19 May 2017 12:36:15 +0300 Subject: [PATCH 20/31] i386: fix read/write cr with icount option Running Windows with icount causes a crash on the instruction that writes a control register (cr). This patch fixes it. Reading and writing cr cause an icount read because the cpu_get_apic_tpr and cpu_set_apic_tpr functions are called. Therefore, gen_io_start()/gen_io_end() calls are needed around these accesses.
Signed-off-by: Mihail Abakumov Message-Id: Signed-off-by: Paolo Bonzini --- target/i386/translate.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/target/i386/translate.c b/target/i386/translate.c index 674ec96d5a..ed3b896db4 100644 --- a/target/i386/translate.c +++ b/target/i386/translate.c @@ -7939,14 +7939,26 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s, gen_update_cc_op(s); gen_jmp_im(pc_start - s->cs_base); if (b & 2) { + if (s->tb->cflags & CF_USE_ICOUNT) { + gen_io_start(); + } gen_op_mov_v_reg(ot, cpu_T0, rm); gen_helper_write_crN(cpu_env, tcg_const_i32(reg), cpu_T0); + if (s->tb->cflags & CF_USE_ICOUNT) { + gen_io_end(); + } gen_jmp_im(s->pc - s->cs_base); gen_eob(s); } else { + if (s->tb->cflags & CF_USE_ICOUNT) { + gen_io_start(); + } gen_helper_read_crN(cpu_T0, cpu_env, tcg_const_i32(reg)); gen_op_mov_reg_v(ot, rm, cpu_T0); + if (s->tb->cflags & CF_USE_ICOUNT) { + gen_io_end(); + } } break; default: From ad9579aaa16d5b385922d49edac2c96c79bcfb62 Mon Sep 17 00:00:00 2001 From: "Daniel P. Berrange" Date: Thu, 25 May 2017 16:53:00 +0100 Subject: [PATCH 21/31] sockets: improve error reporting if UNIX socket path is too long The 'struct sockaddr_un' only allows 108 bytes for the socket path. If the user supplies a path, QEMU uses snprintf() to silently truncate it when too long. This is undesirable because the user will then be unable to connect to the path they asked for. If the user doesn't supply a path, QEMU builds one based on TMPDIR, but if that leads to an overlong path, it mistakenly uses error_setg_errno() with a stale errno value, because snprintf() does not set errno on truncation. In solving this the code needed some refactoring to ensure we don't pass 'un.sun_path' directly to any APIs which expect NUL-terminated strings, because the path is not required to be terminated. Signed-off-by: Daniel P. 
Berrange Message-Id: <20170525155300.22743-1-berrange@redhat.com> Reviewed-by: Eric Blake Signed-off-by: Paolo Bonzini --- util/qemu-sockets.c | 68 ++++++++++++++++++++++++++++++--------------- 1 file changed, 46 insertions(+), 22 deletions(-) diff --git a/util/qemu-sockets.c b/util/qemu-sockets.c index b39ae74fe0..82290cb687 100644 --- a/util/qemu-sockets.c +++ b/util/qemu-sockets.c @@ -845,6 +845,8 @@ static int unix_listen_saddr(UnixSocketAddress *saddr, { struct sockaddr_un un; int sock, fd; + char *pathbuf = NULL; + const char *path; sock = qemu_socket(PF_UNIX, SOCK_STREAM, 0); if (sock < 0) { @@ -852,20 +854,22 @@ static int unix_listen_saddr(UnixSocketAddress *saddr, return -1; } - memset(&un, 0, sizeof(un)); - un.sun_family = AF_UNIX; - if (saddr->path && strlen(saddr->path)) { - snprintf(un.sun_path, sizeof(un.sun_path), "%s", saddr->path); + if (saddr->path && saddr->path[0]) { + path = saddr->path; } else { const char *tmpdir = getenv("TMPDIR"); tmpdir = tmpdir ? tmpdir : "/tmp"; - if (snprintf(un.sun_path, sizeof(un.sun_path), "%s/qemu-socket-XXXXXX", - tmpdir) >= sizeof(un.sun_path)) { - error_setg_errno(errp, errno, - "TMPDIR environment variable (%s) too large", tmpdir); - goto err; - } + path = pathbuf = g_strdup_printf("%s/qemu-socket-XXXXXX", tmpdir); + } + if (strlen(path) > sizeof(un.sun_path)) { + error_setg(errp, "UNIX socket path '%s' is too long", path); + error_append_hint(errp, "Path must be less than %zu bytes\n", + sizeof(un.sun_path)); + goto err; + } + + if (pathbuf != NULL) { /* * This dummy fd usage silences the mktemp() unsecure warning. * Using mkstemp() doesn't make things more secure here @@ -873,24 +877,25 @@ static int unix_listen_saddr(UnixSocketAddress *saddr, * to unlink first and thus re-open the race window. The * worst case possible is bind() failing, i.e. a DoS attack. 
*/ - fd = mkstemp(un.sun_path); + fd = mkstemp(pathbuf); if (fd < 0) { error_setg_errno(errp, errno, - "Failed to make a temporary socket name in %s", tmpdir); + "Failed to make a temporary socket %s", pathbuf); goto err; } close(fd); - if (update_addr) { - g_free(saddr->path); - saddr->path = g_strdup(un.sun_path); - } } - if (unlink(un.sun_path) < 0 && errno != ENOENT) { + if (unlink(path) < 0 && errno != ENOENT) { error_setg_errno(errp, errno, - "Failed to unlink socket %s", un.sun_path); + "Failed to unlink socket %s", path); goto err; } + + memset(&un, 0, sizeof(un)); + un.sun_family = AF_UNIX; + strncpy(un.sun_path, path, sizeof(un.sun_path)); + if (bind(sock, (struct sockaddr*) &un, sizeof(un)) < 0) { error_setg_errno(errp, errno, "Failed to bind socket to %s", un.sun_path); goto err; @@ -900,9 +905,16 @@ static int unix_listen_saddr(UnixSocketAddress *saddr, goto err; } + if (update_addr && pathbuf) { + g_free(saddr->path); + saddr->path = pathbuf; + } else { + g_free(pathbuf); + } return sock; err: + g_free(pathbuf); closesocket(sock); return -1; } @@ -932,9 +944,16 @@ static int unix_connect_saddr(UnixSocketAddress *saddr, qemu_set_nonblock(sock); } + if (strlen(saddr->path) > sizeof(un.sun_path)) { + error_setg(errp, "UNIX socket path '%s' is too long", saddr->path); + error_append_hint(errp, "Path must be less than %zu bytes\n", + sizeof(un.sun_path)); + goto err; + } + memset(&un, 0, sizeof(un)); un.sun_family = AF_UNIX; - snprintf(un.sun_path, sizeof(un.sun_path), "%s", saddr->path); + strncpy(un.sun_path, saddr->path, sizeof(un.sun_path)); /* connect to peer */ do { @@ -956,13 +975,18 @@ static int unix_connect_saddr(UnixSocketAddress *saddr, } if (rc < 0) { - error_setg_errno(errp, -rc, "Failed to connect socket"); - close(sock); - sock = -1; + error_setg_errno(errp, -rc, "Failed to connect socket %s", + saddr->path); + goto err; } g_free(connect_state); return sock; + + err: + close(sock); + g_free(connect_state); + return -1; } #else From 
df8ad9f128c15aa0a0ebc7b24e9a22c9775b67af Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Fri, 26 May 2017 22:04:21 -0500 Subject: [PATCH 22/31] nbd: Fully initialize client in case of failed negotiation If a non-NBD client connects to qemu-nbd, we would end up with a SIGSEGV in nbd_client_put() because we were trying to unregister the client's association to the export, even though we skipped inserting the client into that list. Easy trigger in two terminals: $ qemu-nbd -p 30001 --format=raw file $ nmap 127.0.0.1 -p 30001 nmap claims that it thinks it connected to a pago-services1 server (which probably means nmap could be updated to learn the NBD protocol and give a more accurate diagnosis of the open port - but that's not our problem), then terminates immediately, so our call to nbd_negotiate() fails. The fix is to reorder nbd_co_client_start() to ensure that all initialization occurs before we ever try talking to a client in nbd_negotiate(), so that the teardown sequence on negotiation failure doesn't fault while dereferencing a half-initialized object. While debugging this, I also noticed that nbd_update_server_watch() called by nbd_client_closed() was still adding a channel to accept the next client, even when the state was no longer RUNNING. That is fixed by making nbd_can_accept() pay attention to the current state. 
Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1451614 Signed-off-by: Eric Blake Message-Id: <20170527030421.28366-1-eblake@redhat.com> Signed-off-by: Paolo Bonzini --- nbd/server.c | 10 ++++------ qemu-nbd.c | 2 +- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/nbd/server.c b/nbd/server.c index ee59e5d234..49b55f6ede 100644 --- a/nbd/server.c +++ b/nbd/server.c @@ -1358,15 +1358,13 @@ static coroutine_fn void nbd_co_client_start(void *opaque) if (exp) { nbd_export_get(exp); - } - if (nbd_negotiate(data)) { - client_close(client); - goto out; + QTAILQ_INSERT_TAIL(&exp->clients, client, next); } qemu_co_mutex_init(&client->send_lock); - if (exp) { - QTAILQ_INSERT_TAIL(&exp->clients, client, next); + if (nbd_negotiate(data)) { + client_close(client); + goto out; } nbd_client_receive_next_request(client); diff --git a/qemu-nbd.c b/qemu-nbd.c index f60842fd86..651f85ecc1 100644 --- a/qemu-nbd.c +++ b/qemu-nbd.c @@ -325,7 +325,7 @@ out: static int nbd_can_accept(void) { - return nb_fds < shared; + return state == RUNNING && nb_fds < shared; } static void nbd_export_closed(NBDExport *exp) From e2b6c1712e08bc5feafb44fdc65ab81ef2630b4b Mon Sep 17 00:00:00 2001 From: Denis Plotnikov Date: Mon, 29 May 2017 13:49:04 +0300 Subject: [PATCH 23/31] kvmclock: update system_time_msr address forcibly Do an update of system_time_msr address every time before reading the value of tsc_timestamp from guest's kvmclock page. There are no other code paths which ensure that qemu has an up-to-date value of system_time_msr. So, force this update on guest's tsc_timestamp reading. This bug affects those nested setups which turn off TPR access interception for L2 guests and that access being intercepted by L0 doesn't show up in L1. Linux bootstrap initiates kvmclock before APIC initializing causing TPR access. That's why on L1 guests, having TPR interception turned on for L2, the effect of the bug is not revealed. 
This patch fixes this problem by making sure it knows the correct system_time_msr address every time it is needed. Signed-off-by: Denis Plotnikov Message-Id: <1496054944-25623-1-git-send-email-dplotnikov@virtuozzo.com> Signed-off-by: Paolo Bonzini --- hw/i386/kvm/clock.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/hw/i386/kvm/clock.c b/hw/i386/kvm/clock.c index 13eca374cd..363d1b5743 100644 --- a/hw/i386/kvm/clock.c +++ b/hw/i386/kvm/clock.c @@ -19,6 +19,7 @@ #include "qemu/host-utils.h" #include "sysemu/sysemu.h" #include "sysemu/kvm.h" +#include "sysemu/hw_accel.h" #include "kvm_i386.h" #include "hw/sysbus.h" #include "hw/kvm/clock.h" @@ -69,6 +70,8 @@ static uint64_t kvmclock_current_nsec(KVMClockState *s) uint64_t nsec_hi; uint64_t nsec; + cpu_synchronize_state(cpu); + if (!(env->system_time_msr & 1ULL)) { /* KVM clock not active */ return 0; From 7e018385103cd7a571b9ea0d6f994af6b1129fe7 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 31 May 2017 14:37:15 +0200 Subject: [PATCH 24/31] linuxboot_dma: compile for i486 The ROM uses the cmovne instruction, which is new in Pentium Pro and does not work when running QEMU with "-cpu 486". Avoid producing that instruction. Suggested-by: Richard W.M. 
Jones Suggested-by: Thomas Huth Reported-by: Rob Landley Cc: qemu-stable@nongnu.org Signed-off-by: Paolo Bonzini --- pc-bios/linuxboot_dma.bin | Bin 1536 -> 1536 bytes pc-bios/optionrom/Makefile | 1 + 2 files changed, 1 insertion(+) diff --git a/pc-bios/linuxboot_dma.bin b/pc-bios/linuxboot_dma.bin index 218d3ab4a29bfb5ab7125ec7a4d29dad1860c673..d176f62797813e4b926dca9dfce7ce554dc1a4d6 100644 GIT binary patch literal 1536 zcmeHFL2J`s82-{_T52KL%w~}udiW-YBE*9t=s~ClrIm^s9^6H6Pf%onr07*+kWd=L zpW)d<4G1ZESU1FyUDSCf^YYaxioy<}t?!rmu^#*f?mHxJo;-Qp_kHp#Je^o$|2##; zYs)n)qh6k7XI07O)9TW>>H@3WSgzkI-MW44`qG_Jxhgq7fAt@+rpuZy7{g_FpG^eo zuoD<7Fjn!zK~Uydy3X@Fj1CnQuA}{tXz$^z=7>a?#S!%6CsARNbiO)h=v+Q~y@5O1 ze8y7!akTy&-YXMV@p3Es`0q*km+)Mtxe(3DE2)a5;c%$H0|Yu~mbnR0C7dKHR7Zsr zmrccq5lm%U=wXc39gV6@%Jl!9^%vQ9V0A-7aUkNnpxu8npp2ruY0}SsKn=BiGN9Y; L*>&K*abWuoMsO(* literal 1536 zcmeHF&ubGw6rN4Dn#RCxYN|nUh%gpJFME=L9)#^}1F5!ps0f07iy|#(O1W^nf~VP8b!3b3v@l;?V$SuIwL6uYt5>dGyH&BPBIn1?(K>6@S?eidI1hgSZ^wky zNA%Hz@f^krJcQTH!Ptb+b>Z7QOZy_v)9!Gc_A*CUnxyOgP~7<9qN#In@`mGjfw=z$ z*1p4gW?~r|cz_FdqLT)y8y*s0Z-)z#_;!*cE?PD>cN4CX-n&AUsLGXj9UuS(2NP%(`OENoGw|EGqnf#7ASHq-k?KrExJ~ z^CHnSr*VvZO%mMQmY+taei4m+{T$|c^(w4)XlpT*^|#=^eK`aF_0%gE_5j9w)aV=c zk7$UtO_Fkt>S!)b5N*mx5@orPY(na#Hj4MfwIDj65ohT<|dXkK7E?qHJ9XdoKqMjysh NXm9N~@Si)N{RVuA8xQ~h diff --git a/pc-bios/optionrom/Makefile b/pc-bios/optionrom/Makefile index fa53d9e58e..a9a9e5e7eb 100644 --- a/pc-bios/optionrom/Makefile +++ b/pc-bios/optionrom/Makefile @@ -13,6 +13,7 @@ $(call set-vpath, $(SRC_PATH)/pc-bios/optionrom) ifeq ($(lastword $(filter -O%, -O0 $(CFLAGS))),-O0) override CFLAGS += -O2 endif +override CFLAGS += -march=i486 # Drop -fstack-protector and the like QEMU_CFLAGS := $(filter -W%, $(QEMU_CFLAGS)) $(CFLAGS_NOPIE) -ffreestanding From c25a67f0c3d0c86231f9653267a222c4effa706f Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 31 May 2017 14:56:37 +0200 Subject: [PATCH 25/31] edu: fix memory leak on msi_broken platforms If msi_init fails, the thread has 
already been created and the mutex/condvar are not destroyed. Initialize everything only after the point where pci_edu_realize cannot fail. Reported-by: Markus Armbruster Cc: Peter Xu Signed-off-by: Paolo Bonzini --- hw/misc/edu.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/hw/misc/edu.c b/hw/misc/edu.c index 401039c100..01acacf142 100644 --- a/hw/misc/edu.c +++ b/hw/misc/edu.c @@ -343,6 +343,12 @@ static void pci_edu_realize(PCIDevice *pdev, Error **errp) EduState *edu = DO_UPCAST(EduState, pdev, pdev); uint8_t *pci_conf = pdev->config; + pci_config_set_interrupt_pin(pci_conf, 1); + + if (msi_init(pdev, 0, 1, true, false, errp)) { + return; + } + timer_init_ms(&edu->dma_timer, QEMU_CLOCK_VIRTUAL, edu_dma_timer, edu); qemu_mutex_init(&edu->thr_mutex); @@ -350,12 +356,6 @@ static void pci_edu_realize(PCIDevice *pdev, Error **errp) qemu_thread_create(&edu->thread, "edu", edu_fact_thread, edu, QEMU_THREAD_JOINABLE); - pci_config_set_interrupt_pin(pci_conf, 1); - - if (msi_init(pdev, 0, 1, true, false, errp)) { - return; - } - memory_region_init_io(&edu->mmio, OBJECT(edu), &edu_mmio_ops, edu, "edu-mmio", 1 << 20); pci_register_bar(pdev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY, &edu->mmio); From d45fc087c26674eedda9314b9aaefd8e061bf104 Mon Sep 17 00:00:00 2001 From: Roman Pen Date: Thu, 1 Jun 2017 10:56:04 +0200 Subject: [PATCH 26/31] i386/kvm: do not zero out segment flags if segment is unusable or not present MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a fix for the problem [1], where VMCB.CPL was set to 0 and interrupt was taken on userspace stack. The root cause lies in the specific AMD CPU behaviour which manifests itself as unusable segment attributes on SYSRET[2]. Here in this patch flags are not touched even if the segment is unusable or is not present, therefore CPL (which is stored in DPL field) should not be lost and will be successfully restored on kvm/svm kernel side. 
Also current patch should not break desired behavior described in this commit: 4cae9c97967a ("target-i386: kvm: clear unusable segments' flags in migration") since present bit will be dropped if segment is unusable or is not present. This is the second part of the whole fix of the corresponding problem [1], first part is related to kvm/svm kernel side and does exactly the same: segment attributes are not zeroed out. [1] Message id: CAJrWOzD6Xq==b-zYCDdFLgSRMPM-NkNuTSDFEtX=7MreT45i7Q@mail.gmail.com [2] Message id: 5d120f358612d73fc909f5bfa47e7bd082db0af0.1429841474.git.luto@kernel.org Signed-off-by: Roman Pen Signed-off-by: Mikhail Sennikovskii Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michael Chapman Cc: qemu-devel@nongnu.org Message-Id: <20170601085604.12980-1-roman.penyaev@profitbricks.com> Signed-off-by: Paolo Bonzini --- target/i386/kvm.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/target/i386/kvm.c b/target/i386/kvm.c index 9087677d00..5936d2761f 100644 --- a/target/i386/kvm.c +++ b/target/i386/kvm.c @@ -1301,18 +1301,14 @@ static void get_seg(SegmentCache *lhs, const struct kvm_segment *rhs) lhs->selector = rhs->selector; lhs->base = rhs->base; lhs->limit = rhs->limit; - if (rhs->unusable) { - lhs->flags = 0; - } else { - lhs->flags = (rhs->type << DESC_TYPE_SHIFT) | - (rhs->present * DESC_P_MASK) | - (rhs->dpl << DESC_DPL_SHIFT) | - (rhs->db << DESC_B_SHIFT) | - (rhs->s * DESC_S_MASK) | - (rhs->l << DESC_L_SHIFT) | - (rhs->g * DESC_G_MASK) | - (rhs->avl * DESC_AVL_MASK); - } + lhs->flags = (rhs->type << DESC_TYPE_SHIFT) | + ((rhs->present && !rhs->unusable) * DESC_P_MASK) | + (rhs->dpl << DESC_DPL_SHIFT) | + (rhs->db << DESC_B_SHIFT) | + (rhs->s * DESC_S_MASK) | + (rhs->l << DESC_L_SHIFT) | + (rhs->g * DESC_G_MASK) | + (rhs->avl * DESC_AVL_MASK); } static void kvm_getput_reg(__u64 *kvm_reg, target_ulong *qemu_reg, int set) From b8158192fadb3e346372456c25cbbc4be584a85c Mon Sep 17 00:00:00 2001 From: Abdallah Bouassida 
Date: Thu, 1 Jun 2017 11:33:15 +0200 Subject: [PATCH 27/31] target/i386: Add GDB XML description for SSE registers Add an XML description for SSE registers (XMM+MXCSR) for both X86 and X86-64 architectures in the GDB stub: - configure: Define gdb_xml_files for the X86 targets (32 and 64bit). - gdb-xml/i386-32bit-sse.xml & gdb-xml/i386-64bit-sse.xml: The XML files that contain a description of the XMM + MXCSR registers. - gdb-xml/i386-32bit.xml & gdb-xml/i386-64bit.xml: wrappers that include the XML file of the core registers and the other XML file of the SSE registers. - target/i386/cpu.c: Modify the gdb_core_xml_file to the new XML wrapper, modify the gdb_num_core_regs to fit the registers number defined in each XML file. Signed-off-by: Abdallah Bouassida Signed-off-by: Paolo Bonzini --- configure | 4 +-- gdb-xml/i386-32bit-sse.xml | 52 +++++++++++++++++++++++++++++++++ gdb-xml/i386-32bit.xml | 14 +++++++++ gdb-xml/i386-64bit-sse.xml | 60 ++++++++++++++++++++++++++++++++++++++ gdb-xml/i386-64bit.xml | 14 +++++++++ target/i386/cpu.c | 8 ++--- 6 files changed, 146 insertions(+), 6 deletions(-) create mode 100644 gdb-xml/i386-32bit-sse.xml create mode 100644 gdb-xml/i386-32bit.xml create mode 100644 gdb-xml/i386-64bit-sse.xml create mode 100644 gdb-xml/i386-64bit.xml diff --git a/configure b/configure index 13e040d28c..71f5612a65 100755 --- a/configure +++ b/configure @@ -6027,11 +6027,11 @@ TARGET_ABI_DIR="" case "$target_name" in i386) - gdb_xml_files="i386-32bit-core.xml" + gdb_xml_files="i386-32bit.xml i386-32bit-core.xml i386-32bit-sse.xml" ;; x86_64) TARGET_BASE_ARCH=i386 - gdb_xml_files="i386-64bit-core.xml" + gdb_xml_files="i386-64bit.xml i386-64bit-core.xml i386-64bit-sse.xml" ;; alpha) mttcg="yes" diff --git a/gdb-xml/i386-32bit-sse.xml b/gdb-xml/i386-32bit-sse.xml new file mode 100644 index 0000000000..57678473d6 --- /dev/null +++ b/gdb-xml/i386-32bit-sse.xml @@ -0,0 +1,52 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + diff --git a/gdb-xml/i386-32bit.xml b/gdb-xml/i386-32bit.xml new file mode 100644 index 0000000000..956fc7f45f --- /dev/null +++ b/gdb-xml/i386-32bit.xml @@ -0,0 +1,14 @@ + + + + + + + + + + diff --git a/gdb-xml/i386-64bit-sse.xml b/gdb-xml/i386-64bit-sse.xml new file mode 100644 index 0000000000..e86efc9ce5 --- /dev/null +++ b/gdb-xml/i386-64bit-sse.xml @@ -0,0 +1,60 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/gdb-xml/i386-64bit.xml b/gdb-xml/i386-64bit.xml new file mode 100644 index 0000000000..0b2f00ccbe --- /dev/null +++ b/gdb-xml/i386-64bit.xml @@ -0,0 +1,14 @@ + + + + + + + + + + diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 3f832a6a94..b2b1d20cee 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -4068,11 +4068,11 @@ static void x86_cpu_common_class_init(ObjectClass *oc, void *data) #endif cc->gdb_arch_name = x86_gdb_arch_name; #ifdef TARGET_X86_64 - cc->gdb_core_xml_file = "i386-64bit-core.xml"; - cc->gdb_num_core_regs = 40; + cc->gdb_core_xml_file = "i386-64bit.xml"; + cc->gdb_num_core_regs = 57; #else - cc->gdb_core_xml_file = "i386-32bit-core.xml"; - cc->gdb_num_core_regs = 32; + cc->gdb_core_xml_file = "i386-32bit.xml"; + cc->gdb_num_core_regs = 41; #endif #ifndef CONFIG_USER_ONLY cc->debug_excp_handler = breakpoint_handler; From 6bdcc018a6ed760b9dfe43539124e420aed83092 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 1 Jun 2017 12:44:56 +0200 Subject: [PATCH 28/31] nbd: make it thread-safe, fix qcow2 over nbd NBD is not thread safe, because it accesses s->in_flight without a CoMutex. Fixing this will be required for multiqueue. CoQueue doesn't have spurious wakeups but, when another coroutine can run between qemu_co_queue_next's wakeup and qemu_co_queue_wait's re-locking of the mutex, the wait condition can become false and a loop is necessary. 
In fact, it turns out that the loop is necessary even without this multi-threaded scenario. A particular sequence of coroutine wakeups is happening ~80% of the time when starting a guest with qcow2 image served over NBD (i.e. qemu-nbd --format=raw, and QEMU's -drive option has -format=qcow2). This patch fixes that issue too. Signed-off-by: Paolo Bonzini --- block/nbd-client.c | 30 +++++++++--------------------- 1 file changed, 9 insertions(+), 21 deletions(-) diff --git a/block/nbd-client.c b/block/nbd-client.c index 09d955bc4d..87d19c7253 100644 --- a/block/nbd-client.c +++ b/block/nbd-client.c @@ -119,6 +119,10 @@ static int nbd_co_send_request(BlockDriverState *bs, int rc, ret, i; qemu_co_mutex_lock(&s->send_mutex); + while (s->in_flight == MAX_NBD_REQUESTS) { + qemu_co_queue_wait(&s->free_sema, &s->send_mutex); + } + s->in_flight++; for (i = 0; i < MAX_NBD_REQUESTS; i++) { if (s->recv_coroutine[i] == NULL) { @@ -181,20 +185,6 @@ static void nbd_co_receive_reply(NBDClientSession *s, } } -static void nbd_coroutine_start(NBDClientSession *s, - NBDRequest *request) -{ - /* Poor man semaphore. The free_sema is locked when no other request - * can be accepted, and unlocked after receiving one reply. */ - if (s->in_flight == MAX_NBD_REQUESTS) { - qemu_co_queue_wait(&s->free_sema, NULL); - assert(s->in_flight < MAX_NBD_REQUESTS); - } - s->in_flight++; - - /* s->recv_coroutine[i] is set as soon as we get the send_lock. */ -} - static void nbd_coroutine_end(BlockDriverState *bs, NBDRequest *request) { @@ -202,13 +192,16 @@ static void nbd_coroutine_end(BlockDriverState *bs, int i = HANDLE_TO_INDEX(s, request->handle); s->recv_coroutine[i] = NULL; - s->in_flight--; - qemu_co_queue_next(&s->free_sema); /* Kick the read_reply_co to get the next reply. 
*/ if (s->read_reply_co) { aio_co_wake(s->read_reply_co); } + + qemu_co_mutex_lock(&s->send_mutex); + s->in_flight--; + qemu_co_queue_next(&s->free_sema); + qemu_co_mutex_unlock(&s->send_mutex); } int nbd_client_co_preadv(BlockDriverState *bs, uint64_t offset, @@ -226,7 +219,6 @@ int nbd_client_co_preadv(BlockDriverState *bs, uint64_t offset, assert(bytes <= NBD_MAX_BUFFER_SIZE); assert(!flags); - nbd_coroutine_start(client, &request); ret = nbd_co_send_request(bs, &request, NULL); if (ret < 0) { reply.error = -ret; @@ -256,7 +248,6 @@ int nbd_client_co_pwritev(BlockDriverState *bs, uint64_t offset, assert(bytes <= NBD_MAX_BUFFER_SIZE); - nbd_coroutine_start(client, &request); ret = nbd_co_send_request(bs, &request, qiov); if (ret < 0) { reply.error = -ret; @@ -291,7 +282,6 @@ int nbd_client_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, request.flags |= NBD_CMD_FLAG_NO_HOLE; } - nbd_coroutine_start(client, &request); ret = nbd_co_send_request(bs, &request, NULL); if (ret < 0) { reply.error = -ret; @@ -316,7 +306,6 @@ int nbd_client_co_flush(BlockDriverState *bs) request.from = 0; request.len = 0; - nbd_coroutine_start(client, &request); ret = nbd_co_send_request(bs, &request, NULL); if (ret < 0) { reply.error = -ret; @@ -342,7 +331,6 @@ int nbd_client_co_pdiscard(BlockDriverState *bs, int64_t offset, int count) return 0; } - nbd_coroutine_start(client, &request); ret = nbd_co_send_request(bs, &request, NULL); if (ret < 0) { reply.error = -ret; From d870cfdea5b5fc7934cacc9786f185d741eab308 Mon Sep 17 00:00:00 2001 From: Gonglei Date: Thu, 1 Jun 2017 19:35:15 +0800 Subject: [PATCH 29/31] kvm: don't register smram_listener when smm is off If the user disables smm with '-machine smm=off', we should not register smram_listener so that we can avoid wasting memory in kvm on the added second address space. 
Meanwhile we should assign the value of the global kvm_state before invoking the kvm_arch_init(), because pc_machine_is_smm_enabled() may use it by kvm_has_smm(). Signed-off-by: Gonglei Message-Id: <1496316915-121196-1-git-send-email-arei.gonglei@huawei.com> Signed-off-by: Paolo Bonzini --- kvm-all.c | 4 ++-- target/i386/kvm.c | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/kvm-all.c b/kvm-all.c index 1b9fe23490..44b3cf43cc 100644 --- a/kvm-all.c +++ b/kvm-all.c @@ -1748,6 +1748,8 @@ static int kvm_init(MachineState *ms) kvm_ioeventfd_any_length_allowed = (kvm_check_extension(s, KVM_CAP_IOEVENTFD_ANY_LENGTH) > 0); + kvm_state = s; + ret = kvm_arch_init(ms, s); if (ret < 0) { goto err; @@ -1757,8 +1759,6 @@ static int kvm_init(MachineState *ms) kvm_irqchip_create(ms, s); } - kvm_state = s; - if (kvm_eventfds_allowed) { s->memory_listener.listener.eventfd_add = kvm_mem_ioeventfd_add; s->memory_listener.listener.eventfd_del = kvm_mem_ioeventfd_del; diff --git a/target/i386/kvm.c b/target/i386/kvm.c index 5936d2761f..ee36502789 100644 --- a/target/i386/kvm.c +++ b/target/i386/kvm.c @@ -1255,7 +1255,9 @@ int kvm_arch_init(MachineState *ms, KVMState *s) } } - if (kvm_check_extension(s, KVM_CAP_X86_SMM)) { + if (kvm_check_extension(s, KVM_CAP_X86_SMM) && + object_dynamic_cast(OBJECT(ms), TYPE_PC_MACHINE) && + pc_machine_is_smm_enabled(PC_MACHINE(ms))) { smram_machine_done.notify = register_smram_listener; qemu_add_machine_init_done_notifier(&smram_machine_done); } From 90bb0c04214545beb75044a2742f711335103269 Mon Sep 17 00:00:00 2001 From: Felipe Franciosi Date: Fri, 19 May 2017 22:29:50 +0100 Subject: [PATCH 30/31] cpus: reset throttle_thread_scheduled after sleep Currently, the throttle_thread_scheduled flag is reset back to 0 before sleeping (as part of the throttling logic). Given that throttle_timer (well, any timer) may tick with a slight delay, it so happens that under heavy throttling (ie. 
close to or at CPU_THROTTLE_PCT_MAX) the tick may schedule a further cpu_throttle_thread() work item after the flag reset, but before the previous sleep completed. This results in the vCPU thread sleeping continuously for potentially several seconds in a row. The chances of that happening can be drastically minimised by resetting the flag after the sleep. Signed-off-by: Felipe Franciosi Signed-off-by: Malcolm Crossley Message-Id: <1495229390-18909-1-git-send-email-felipe@nutanix.com> Acked-by: Jason J. Herne Signed-off-by: Paolo Bonzini --- cpus.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpus.c b/cpus.c index 6398439946..14bb8d552e 100644 --- a/cpus.c +++ b/cpus.c @@ -677,9 +677,9 @@ static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque) sleeptime_ns = (long)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS); qemu_mutex_unlock_iothread(); - atomic_set(&cpu->throttle_thread_scheduled, 0); g_usleep(sleeptime_ns / 1000); /* Convert ns to us for usleep call */ qemu_mutex_lock_iothread(); + atomic_set(&cpu->throttle_thread_scheduled, 0); } static void cpu_throttle_timer_tick(void *opaque) From ac06724a715864942e2b5e28f92d5d5421f0a0b0 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 6 Jun 2017 16:46:26 +0200 Subject: [PATCH 31/31] docs: create config/, devel/ and spin/ subdirectories Developer documentation should be its own manual. As a start, move all developer-oriented files to a separate directory. Also move non-text files to their own directories: docs/config/ for QEMU -readconfig input, and docs/spin/ for formal models to be used with the SPIN model checker. Reviewed-by: Daniel P. 
Berrange Signed-off-by: Paolo Bonzini --- docs/{ => config}/ich9-ehci-uhci.cfg | 0 docs/{ => config}/mach-virt-graphical.cfg | 0 docs/{ => config}/mach-virt-serial.cfg | 0 docs/{ => config}/q35-emulated.cfg | 0 docs/{ => config}/q35-virtio-graphical.cfg | 0 docs/{ => config}/q35-virtio-serial.cfg | 0 docs/{ => devel}/atomics.txt | 0 docs/{ => devel}/bitmaps.md | 0 docs/{ => devel}/blkdebug.txt | 0 docs/{ => devel}/blkverify.txt | 0 docs/{ => devel}/build-system.txt | 0 docs/{ => devel}/lockcnt.txt | 0 docs/{ => devel}/memory.txt | 0 docs/{ => devel}/migration.txt | 0 docs/{ => devel}/multi-thread-tcg.txt | 0 docs/{ => devel}/multiple-iothreads.txt | 0 docs/{ => devel}/qapi-code-gen.txt | 0 docs/{ => devel}/rcu.txt | 0 docs/{ => devel}/tracing.txt | 0 docs/{ => devel}/virtio-migration.txt | 0 docs/{ => devel}/writing-qmp-commands.txt | 0 docs/{ => spin}/aio_notify.promela | 0 docs/{ => spin}/aio_notify_accept.promela | 0 docs/{ => spin}/aio_notify_bug.promela | 0 docs/{ => spin}/tcg-exclusive.promela | 0 docs/{ => spin}/win32-qemu-event.promela | 0 26 files changed, 0 insertions(+), 0 deletions(-) rename docs/{ => config}/ich9-ehci-uhci.cfg (100%) rename docs/{ => config}/mach-virt-graphical.cfg (100%) rename docs/{ => config}/mach-virt-serial.cfg (100%) rename docs/{ => config}/q35-emulated.cfg (100%) rename docs/{ => config}/q35-virtio-graphical.cfg (100%) rename docs/{ => config}/q35-virtio-serial.cfg (100%) rename docs/{ => devel}/atomics.txt (100%) rename docs/{ => devel}/bitmaps.md (100%) rename docs/{ => devel}/blkdebug.txt (100%) rename docs/{ => devel}/blkverify.txt (100%) rename docs/{ => devel}/build-system.txt (100%) rename docs/{ => devel}/lockcnt.txt (100%) rename docs/{ => devel}/memory.txt (100%) rename docs/{ => devel}/migration.txt (100%) rename docs/{ => devel}/multi-thread-tcg.txt (100%) rename docs/{ => devel}/multiple-iothreads.txt (100%) rename docs/{ => devel}/qapi-code-gen.txt (100%) rename docs/{ => devel}/rcu.txt (100%) rename docs/{ => 
devel}/tracing.txt (100%) rename docs/{ => devel}/virtio-migration.txt (100%) rename docs/{ => devel}/writing-qmp-commands.txt (100%) rename docs/{ => spin}/aio_notify.promela (100%) rename docs/{ => spin}/aio_notify_accept.promela (100%) rename docs/{ => spin}/aio_notify_bug.promela (100%) rename docs/{ => spin}/tcg-exclusive.promela (100%) rename docs/{ => spin}/win32-qemu-event.promela (100%) diff --git a/docs/ich9-ehci-uhci.cfg b/docs/config/ich9-ehci-uhci.cfg similarity index 100% rename from docs/ich9-ehci-uhci.cfg rename to docs/config/ich9-ehci-uhci.cfg diff --git a/docs/mach-virt-graphical.cfg b/docs/config/mach-virt-graphical.cfg similarity index 100% rename from docs/mach-virt-graphical.cfg rename to docs/config/mach-virt-graphical.cfg diff --git a/docs/mach-virt-serial.cfg b/docs/config/mach-virt-serial.cfg similarity index 100% rename from docs/mach-virt-serial.cfg rename to docs/config/mach-virt-serial.cfg diff --git a/docs/q35-emulated.cfg b/docs/config/q35-emulated.cfg similarity index 100% rename from docs/q35-emulated.cfg rename to docs/config/q35-emulated.cfg diff --git a/docs/q35-virtio-graphical.cfg b/docs/config/q35-virtio-graphical.cfg similarity index 100% rename from docs/q35-virtio-graphical.cfg rename to docs/config/q35-virtio-graphical.cfg diff --git a/docs/q35-virtio-serial.cfg b/docs/config/q35-virtio-serial.cfg similarity index 100% rename from docs/q35-virtio-serial.cfg rename to docs/config/q35-virtio-serial.cfg diff --git a/docs/atomics.txt b/docs/devel/atomics.txt similarity index 100% rename from docs/atomics.txt rename to docs/devel/atomics.txt diff --git a/docs/bitmaps.md b/docs/devel/bitmaps.md similarity index 100% rename from docs/bitmaps.md rename to docs/devel/bitmaps.md diff --git a/docs/blkdebug.txt b/docs/devel/blkdebug.txt similarity index 100% rename from docs/blkdebug.txt rename to docs/devel/blkdebug.txt diff --git a/docs/blkverify.txt b/docs/devel/blkverify.txt similarity index 100% rename from docs/blkverify.txt 
rename to docs/devel/blkverify.txt diff --git a/docs/build-system.txt b/docs/devel/build-system.txt similarity index 100% rename from docs/build-system.txt rename to docs/devel/build-system.txt diff --git a/docs/lockcnt.txt b/docs/devel/lockcnt.txt similarity index 100% rename from docs/lockcnt.txt rename to docs/devel/lockcnt.txt diff --git a/docs/memory.txt b/docs/devel/memory.txt similarity index 100% rename from docs/memory.txt rename to docs/devel/memory.txt diff --git a/docs/migration.txt b/docs/devel/migration.txt similarity index 100% rename from docs/migration.txt rename to docs/devel/migration.txt diff --git a/docs/multi-thread-tcg.txt b/docs/devel/multi-thread-tcg.txt similarity index 100% rename from docs/multi-thread-tcg.txt rename to docs/devel/multi-thread-tcg.txt diff --git a/docs/multiple-iothreads.txt b/docs/devel/multiple-iothreads.txt similarity index 100% rename from docs/multiple-iothreads.txt rename to docs/devel/multiple-iothreads.txt diff --git a/docs/qapi-code-gen.txt b/docs/devel/qapi-code-gen.txt similarity index 100% rename from docs/qapi-code-gen.txt rename to docs/devel/qapi-code-gen.txt diff --git a/docs/rcu.txt b/docs/devel/rcu.txt similarity index 100% rename from docs/rcu.txt rename to docs/devel/rcu.txt diff --git a/docs/tracing.txt b/docs/devel/tracing.txt similarity index 100% rename from docs/tracing.txt rename to docs/devel/tracing.txt diff --git a/docs/virtio-migration.txt b/docs/devel/virtio-migration.txt similarity index 100% rename from docs/virtio-migration.txt rename to docs/devel/virtio-migration.txt diff --git a/docs/writing-qmp-commands.txt b/docs/devel/writing-qmp-commands.txt similarity index 100% rename from docs/writing-qmp-commands.txt rename to docs/devel/writing-qmp-commands.txt diff --git a/docs/aio_notify.promela b/docs/spin/aio_notify.promela similarity index 100% rename from docs/aio_notify.promela rename to docs/spin/aio_notify.promela diff --git a/docs/aio_notify_accept.promela 
b/docs/spin/aio_notify_accept.promela similarity index 100% rename from docs/aio_notify_accept.promela rename to docs/spin/aio_notify_accept.promela diff --git a/docs/aio_notify_bug.promela b/docs/spin/aio_notify_bug.promela similarity index 100% rename from docs/aio_notify_bug.promela rename to docs/spin/aio_notify_bug.promela diff --git a/docs/tcg-exclusive.promela b/docs/spin/tcg-exclusive.promela similarity index 100% rename from docs/tcg-exclusive.promela rename to docs/spin/tcg-exclusive.promela diff --git a/docs/win32-qemu-event.promela b/docs/spin/win32-qemu-event.promela similarity index 100% rename from docs/win32-qemu-event.promela rename to docs/spin/win32-qemu-event.promela