mirror of
https://github.com/containers/podman.git
synced 2026-03-04 14:57:22 -05:00
machine start: qemu: adjust backoffs
Make sure that starting a qemu machine uses proper exponential backoffs and that a single variable isn't shared across multiple backoffs. DO NOT BACKPORT: I want to avoid backporting this PR to the upcoming 4.6 release as it increases the flakiness of machine start (see #17403). On my M2 machine, the flake rate seems to have increased with this change, and I strongly suspect that the additional/redundant sleep after waiting for the machine to be running and listening had reduced the flakiness. My hope is to have more predictable behavior and find the sources of the flakes soon. [NO NEW TESTS NEEDED] - still too flaky to add a test to CI. Signed-off-by: Valentin Rothberg <vrothberg@redhat.com>
This commit is contained in:
@@ -417,9 +417,11 @@ func (v *MachineVM) Start(name string, opts machine.StartOptions) error {
|
||||
conn net.Conn
|
||||
err error
|
||||
qemuSocketConn net.Conn
|
||||
wait = time.Millisecond * 500
|
||||
)
|
||||
|
||||
defaultBackoff := 500 * time.Millisecond
|
||||
maxBackoffs := 6
|
||||
|
||||
v.Starting = true
|
||||
if err := v.writeConfig(); err != nil {
|
||||
return fmt.Errorf("writing JSON file: %w", err)
|
||||
@@ -471,13 +473,17 @@ func (v *MachineVM) Start(name string, opts machine.StartOptions) error {
|
||||
if err := v.QMPMonitor.Address.Delete(); err != nil {
|
||||
return err
|
||||
}
|
||||
for i := 0; i < 6; i++ {
|
||||
|
||||
backoff := defaultBackoff
|
||||
for i := 0; i < maxBackoffs; i++ {
|
||||
if i > 0 {
|
||||
time.Sleep(backoff)
|
||||
backoff *= 2
|
||||
}
|
||||
qemuSocketConn, err = net.Dial("unix", v.QMPMonitor.Address.GetPath())
|
||||
if err == nil {
|
||||
break
|
||||
}
|
||||
time.Sleep(wait)
|
||||
wait++
|
||||
}
|
||||
if err != nil {
|
||||
return err
|
||||
@@ -560,7 +566,12 @@ func (v *MachineVM) Start(name string, opts machine.StartOptions) error {
|
||||
// The socket is not made until the qemu process is running so here
|
||||
// we do a backoff waiting for it. Once we have a conn, we break and
|
||||
// then wait to read it.
|
||||
for i := 0; i < 6; i++ {
|
||||
backoff = defaultBackoff
|
||||
for i := 0; i < maxBackoffs; i++ {
|
||||
if i > 0 {
|
||||
time.Sleep(backoff)
|
||||
backoff *= 2
|
||||
}
|
||||
conn, err = net.Dial("unix", filepath.Join(socketPath, "podman", v.Name+"_ready.sock"))
|
||||
if err == nil {
|
||||
break
|
||||
@@ -570,8 +581,6 @@ func (v *MachineVM) Start(name string, opts machine.StartOptions) error {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
time.Sleep(wait)
|
||||
wait++
|
||||
}
|
||||
if err != nil {
|
||||
return err
|
||||
@@ -591,18 +600,24 @@ func (v *MachineVM) Start(name string, opts machine.StartOptions) error {
|
||||
}
|
||||
}
|
||||
if len(v.Mounts) > 0 {
|
||||
state, err := v.State(true)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
listening := v.isListening()
|
||||
for state != machine.Running || !listening {
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
state, err = v.State(true)
|
||||
connected := false
|
||||
backoff = 500 * time.Millisecond
|
||||
for i := 0; i < maxBackoffs; i++ {
|
||||
if i > 0 {
|
||||
time.Sleep(backoff)
|
||||
backoff *= 2
|
||||
}
|
||||
state, err := v.State(true)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
listening = v.isListening()
|
||||
if state == machine.Running && v.isListening() {
|
||||
connected = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !connected {
|
||||
return fmt.Errorf("machine did not transition into running state")
|
||||
}
|
||||
}
|
||||
for _, mount := range v.Mounts {
|
||||
|
||||
Reference in New Issue
Block a user