Skip to content

Commit

Permalink
Use cenkalti/backoff/v4 for io retry logic
Browse files Browse the repository at this point in the history
This adds cenkalti/backoff/v4 as a dependency to be used for handling
exponential backoff logic for stdio connection retry attempts. Retrying
at a fixed interval is a bit naive as all of the shims would potentially
be trying to reconnect to 3 pipes continuously all in <timeout> bursts.
This allows us to space out the connections, set an upper limit on timeout
intervals and add an element of randomness to the retry attempts.

Signed-off-by: Daniel Canter <dcanter@microsoft.com>
  • Loading branch information
dcantah committed Oct 21, 2021
1 parent 0209d14 commit e30f98f
Show file tree
Hide file tree
Showing 34 changed files with 1,438 additions and 51 deletions.
7 changes: 6 additions & 1 deletion .golangci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,11 @@ issues:
# supposed to be a pretty faithful mapping to an OS call/constants, or non-generated code still checks if we're following idioms,
# while ignoring the things that are just noise or would be more of a hassle than it'd be worth to change.
exclude-rules:
- path: internal\\cmd\\
linters:
- staticcheck
Text: "SA5011"

- path: layer.go
linters:
- stylecheck
Expand All @@ -24,7 +29,7 @@ issues:
linters:
- stylecheck
Text: "ST1003:"

- path: cmd\\ncproxy\\nodenetsvc\\
linters:
- stylecheck
Expand Down
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ go 1.13
require (
github.com/BurntSushi/toml v0.3.1
github.com/Microsoft/go-winio v0.4.17
github.com/cenkalti/backoff/v4 v4.1.1
github.com/containerd/cgroups v1.0.1
github.com/containerd/console v1.0.2
github.com/containerd/containerd v1.5.7
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,8 @@ github.com/buger/jsonparser v0.0.0-20180808090653-f4dd9f5a6b44/go.mod h1:bbYlZJ7
github.com/bugsnag/bugsnag-go v0.0.0-20141110184014-b1d153021fcd/go.mod h1:2oa8nejYd4cQ/b0hMIopN0lCRxU0bueqREvZLWFrtK8=
github.com/bugsnag/osext v0.0.0-20130617224835-0dd3f918b21b/go.mod h1:obH5gd0BsqsP2LwDJ9aOkm/6J86V6lyAXCoQWGw3K50=
github.com/bugsnag/panicwrap v0.0.0-20151223152923-e2c28503fcd0/go.mod h1:D/8v3kj0zr8ZAKg1AQ6crr+5VwKN5eIywRkfhyM/+dE=
github.com/cenkalti/backoff/v4 v4.1.1 h1:G2HAfAmvm/GcKan2oOQpBXOd2tT2G57ZnZGWa1PxPBQ=
github.com/cenkalti/backoff/v4 v4.1.1/go.mod h1:scbssz8iZGpm3xbr14ovlUdkxfGXNInqkPWOWmG2CLw=
github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU=
github.com/cespare/xxhash v1.1.0/go.mod h1:XrSqR1VqqWfGrhpAt58auRo0WTKS1nRRg3ghfAqPWnc=
github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
Expand Down
77 changes: 52 additions & 25 deletions internal/cmd/io_npipe.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"context"
"fmt"
"io"
"math/rand"
"net"
"sync"
"syscall"
Expand All @@ -12,10 +13,16 @@ import (
winio "github.com/Microsoft/go-winio"
"github.com/Microsoft/go-winio/pkg/guid"
"github.com/Microsoft/hcsshim/internal/log"
"github.com/cenkalti/backoff/v4"
"github.com/sirupsen/logrus"
"golang.org/x/sys/windows"
)

func init() {
// Need to seed for the rng in backoff.NextBackoff()
rand.Seed(time.Now().UnixNano())
}

const defaultIOReconnectTimeout time.Duration = 10 * time.Second

// NewNpipeIO creates connected upstream io. It is the callers responsibility to
Expand Down Expand Up @@ -56,22 +63,37 @@ func NewNpipeIO(ctx context.Context, stdin, stdout, stderr string, terminal bool
if err != nil {
return nil, err
}
nio.sout = &nPipeRetryWriter{c, stdout, retryTimeout}
nio.sout = &nPipeRetryWriter{c, stdout, newBackOff(retryTimeout)}
}
if stderr != "" {
c, err := winio.DialPipeContext(ctx, stderr)
if err != nil {
return nil, err
}
nio.serr = &nPipeRetryWriter{c, stderr, retryTimeout}
nio.serr = &nPipeRetryWriter{c, stderr, newBackOff(retryTimeout)}
}
return nio, nil
}

type nPipeRetryWriter struct {
net.Conn
pipePath string
timeout time.Duration
backOff backoff.BackOff
}

// newBackOff returns a new BackOff interface. The values chosen are fairly conservative, the main use is to get a somewhat random
// retry timeout on each ask. This can help avoid flooding a server all at once.
func newBackOff(timeout time.Duration) backoff.BackOff {
b := &backoff.ExponentialBackOff{
InitialInterval: time.Millisecond * 200, // First backoff timeout will be somewhere in the 100 - 300 ms range given the default multiplier.
RandomizationFactor: backoff.DefaultRandomizationFactor,
Multiplier: backoff.DefaultMultiplier,
MaxInterval: time.Second * 2,
MaxElapsedTime: timeout,
Stop: backoff.Stop,
Clock: backoff.SystemClock,
}
return b
}

func (nprw *nPipeRetryWriter) Write(p []byte) (n int, err error) {
Expand All @@ -80,22 +102,45 @@ func (nprw *nPipeRetryWriter) Write(p []byte) (n int, err error) {
if err != nil {
// If the error is one that we can discern calls for a retry, attempt to redial the pipe.
if isDisconnectedErr(err) {
retryCtx, cancel := context.WithTimeout(context.Background(), nprw.timeout)
defer cancel()

// Close the old conn.
nprw.Conn.Close()
newConn, retryErr := retryDialPipe(retryCtx, nprw.pipePath)
newConn, retryErr := nprw.retryDialPipe()
if retryErr == nil {
nprw.Conn = newConn
continue
}
err = retryErr
}
}
return
}
}

// retryDialPipe is a helper to retry dialing a named pipe until the timeout of nprw.BackOff or a successful connection. This is mainly to
// assist in scenarios where the server end of the pipe has crashed/went away and is no longer accepting new connections but may
// come back online. The backoff used inside is to try and space out connections to the server as to not flood it all at once with connection
// attempts at the same interval.
func (nprw *nPipeRetryWriter) retryDialPipe() (net.Conn, error) {
// Reset the backoff object as it starts ticking down when it's created. This also ensures we can re-use it in the event the server goes
// away more than once.
nprw.backOff.Reset()
for {
backOffTime := nprw.backOff.NextBackOff()
// We don't simply use a context with a timeout and pass it to DialPipe because DialPipe only retries the connection (and thus makes use of
// the timeout) if it sees that the pipe is busy. If the server isn't up and not listening it will just error out. That's the case we're
// most likely in right now so we need our own retry logic on top.
conn, err := winio.DialPipe(nprw.pipePath, nil)
if err == nil {
return conn, nil
}
// Next backoff would go over our timeout. We've tried once more above due to the ordering of this check, but now we need to bail out.
if backOffTime == backoff.Stop {
return nil, fmt.Errorf("reached timeout while retrying dial on %s", nprw.pipePath)
}
time.Sleep(backOffTime)
}
}

// isDisconnectedErr is a helper to determine if the error received from writing to the server end of a named pipe indicates a disconnect/severed
// connection. This can be used to attempt a redial if it's expected that the server will come back online at some point.
func isDisconnectedErr(err error) bool {
Expand All @@ -106,24 +151,6 @@ func isDisconnectedErr(err error) bool {
return false
}

// retryDialPipe is a helper to retry dialing a named pipe until context timeout or a successful connection. This is mainly to
// assist in scenarios where the server end of the pipe has crashed/went away and is no longer accepting new connections but may
// come back online.
func retryDialPipe(ctx context.Context, path string) (net.Conn, error) {
for {
select {
case <-ctx.Done():
return nil, fmt.Errorf("failed to reconnect to IO pipe within timeout: %w", ctx.Err())
default:
conn, err := winio.DialPipe(path, nil)
if err == nil {
return conn, nil
}
time.Sleep(time.Millisecond * 100)
}
}
}

var _ = (UpstreamIO)(&npipeio{})

type npipeio struct {
Expand Down
2 changes: 2 additions & 0 deletions test/go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,8 @@ github.com/buger/jsonparser v0.0.0-20180808090653-f4dd9f5a6b44/go.mod h1:bbYlZJ7
github.com/bugsnag/bugsnag-go v0.0.0-20141110184014-b1d153021fcd/go.mod h1:2oa8nejYd4cQ/b0hMIopN0lCRxU0bueqREvZLWFrtK8=
github.com/bugsnag/osext v0.0.0-20130617224835-0dd3f918b21b/go.mod h1:obH5gd0BsqsP2LwDJ9aOkm/6J86V6lyAXCoQWGw3K50=
github.com/bugsnag/panicwrap v0.0.0-20151223152923-e2c28503fcd0/go.mod h1:D/8v3kj0zr8ZAKg1AQ6crr+5VwKN5eIywRkfhyM/+dE=
github.com/cenkalti/backoff/v4 v4.1.1 h1:G2HAfAmvm/GcKan2oOQpBXOd2tT2G57ZnZGWa1PxPBQ=
github.com/cenkalti/backoff/v4 v4.1.1/go.mod h1:scbssz8iZGpm3xbr14ovlUdkxfGXNInqkPWOWmG2CLw=
github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU=
github.com/cespare/xxhash v1.1.0/go.mod h1:XrSqR1VqqWfGrhpAt58auRo0WTKS1nRRg3ghfAqPWnc=
github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
Expand Down
1 change: 1 addition & 0 deletions test/vendor/github.com/Microsoft/hcsshim/go.mod

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions test/vendor/github.com/Microsoft/hcsshim/go.sum

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

77 changes: 52 additions & 25 deletions test/vendor/github.com/Microsoft/hcsshim/internal/cmd/io_npipe.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

25 changes: 25 additions & 0 deletions test/vendor/github.com/cenkalti/backoff/v4/.gitignore

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 10 additions & 0 deletions test/vendor/github.com/cenkalti/backoff/v4/.travis.yml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit e30f98f

Please sign in to comment.