From a0b7655523dc321b2fb56eca83f40bb65cba3f2e Mon Sep 17 00:00:00 2001 From: Valentin Rothberg Date: Tue, 4 Jul 2023 15:53:12 +0200 Subject: [PATCH] machine start: qemu: adjust backoffs Make sure that starting a qemu machine uses proper exponential backoffs and that a single variable isn't shared across multiple backoffs. DO NOT BACKPORT: I want to avoid backporting this PR to the upcoming 4.6 release as it increases the flakiness of machine start (see #17403). On my M2 machine, the flake rate seems to have increased with this change and I strongly suspect that additional/redundant sleep after waiting for the machine to be running and listening reduced the flakiness. My hope is to have more predictable behavior and find the sources of the flakes soon. [NO NEW TESTS NEEDED] - still too flaky to add a test to CI. Signed-off-by: Valentin Rothberg --- pkg/machine/qemu/machine.go | 47 ++++++++++++++++++++++++------------- 1 file changed, 31 insertions(+), 16 deletions(-) diff --git a/pkg/machine/qemu/machine.go b/pkg/machine/qemu/machine.go index 783f5cd368..783d917b68 100644 --- a/pkg/machine/qemu/machine.go +++ b/pkg/machine/qemu/machine.go @@ -417,9 +417,11 @@ func (v *MachineVM) Start(name string, opts machine.StartOptions) error { conn net.Conn err error qemuSocketConn net.Conn - wait = time.Millisecond * 500 ) + defaultBackoff := 500 * time.Millisecond + maxBackoffs := 6 + v.Starting = true if err := v.writeConfig(); err != nil { return fmt.Errorf("writing JSON file: %w", err) @@ -471,13 +473,17 @@ func (v *MachineVM) Start(name string, opts machine.StartOptions) error { if err := v.QMPMonitor.Address.Delete(); err != nil { return err } - for i := 0; i < 6; i++ { + + backoff := defaultBackoff + for i := 0; i < maxBackoffs; i++ { + if i > 0 { + time.Sleep(backoff) + backoff *= 2 + } qemuSocketConn, err = net.Dial("unix", v.QMPMonitor.Address.GetPath()) if err == nil { break } - time.Sleep(wait) - wait++ } if err != nil { return err @@ -560,7 +566,12 @@ func (v *MachineVM) Start(name string, opts machine.StartOptions) error { // The socket is not made until the qemu process is running so here // we do a backoff waiting for it. Once we have a conn, we break and // then wait to read it. - for i := 0; i < 6; i++ { + backoff = defaultBackoff + for i := 0; i < maxBackoffs; i++ { + if i > 0 { + time.Sleep(backoff) + backoff *= 2 + } conn, err = net.Dial("unix", filepath.Join(socketPath, "podman", v.Name+"_ready.sock")) if err == nil { break @@ -570,8 +581,6 @@ func (v *MachineVM) Start(name string, opts machine.StartOptions) error { if err != nil { return err } - time.Sleep(wait) - wait++ } if err != nil { return err @@ -591,18 +600,24 @@ func (v *MachineVM) Start(name string, opts machine.StartOptions) error { } } if len(v.Mounts) > 0 { - state, err := v.State(true) - if err != nil { - return err - } - listening := v.isListening() - for state != machine.Running || !listening { - time.Sleep(100 * time.Millisecond) - state, err = v.State(true) + connected := false + backoff = 500 * time.Millisecond + for i := 0; i < maxBackoffs; i++ { + if i > 0 { + time.Sleep(backoff) + backoff *= 2 + } + state, err := v.State(true) if err != nil { return err } - listening = v.isListening() + if state == machine.Running && v.isListening() { + connected = true + break + } + } + if !connected { + return fmt.Errorf("machine did not transition into running state") } } for _, mount := range v.Mounts {