Skip to content
Snippets Groups Projects
Commit aa235855 authored by Will Chandler's avatar Will Chandler
Browse files

nodes: Set connection backoff MaxDelay to 1 second

gRPC clients use an exponential backoff strategy[0] for re-establishing
connections, meaning that the longer a connection has been in a bad
state the greater the delay before the client will make its next
connection attempt. This is useful in scenarios where a very large
number of clients could trigger a thundering herd effect on a server as
it returns to service.

In a Gitaly Cluster, this means that in cases where a Gitaly node is
down for some time and a large connection backoff has been set,
Praefect may wait to try to connect for up to 120 seconds. This causes
Gitaly nodes to remain unavailable longer than necessary.

The issues addressed gRPC's default exponential backoff behavior do not
apply in this scenario as we will always have a small number of clients
(Praefect nodes), and the volume of traffic from healthchecks is dwarfed
by normal production load.

To resolve this, set the maximum backoff delay to one second.

[0] https://github.com/grpc/grpc/blob/v1.51.0/doc/connection-backoff.md

Changelog: fixed
parent dd80d2aa
No related merge requests found
......@@ -24,6 +24,7 @@ import (
prommetrics "gitlab.com/gitlab-org/gitaly/v15/internal/prometheus/metrics"
"gitlab.com/gitlab-org/gitaly/v15/internal/sidechannel"
"google.golang.org/grpc"
"google.golang.org/grpc/backoff"
healthpb "google.golang.org/grpc/health/grpc_health_v1"
)
......@@ -118,6 +119,7 @@ type leaderElectionStrategy interface {
var ErrPrimaryNotHealthy = errors.New("primary gitaly is not healthy")
const dialTimeout = 10 * time.Second
const dialMaxBackoff = 1 * time.Second
// Dial dials a node with the necessary interceptors configured.
func Dial(ctx context.Context, node *config.Node, registry *protoregistry.Registry, errorTracker tracker.ErrorTracker, handshaker client.Handshaker, sidechannelRegistry *sidechannel.Registry) (*grpc.ClientConn, error) {
......@@ -135,11 +137,16 @@ func Dial(ctx context.Context, node *config.Node, registry *protoregistry.Regist
sidechannel.NewUnaryProxy(sidechannelRegistry),
}
b := backoff.DefaultConfig
b.MaxDelay = dialMaxBackoff
connParams := grpc.ConnectParams{Backoff: b}
dialOpts := []grpc.DialOption{
grpc.WithDefaultCallOptions(grpc.ForceCodec(proxy.NewCodec())),
grpc.WithPerRPCCredentials(gitalyauth.RPCCredentialsV2(node.Token)),
grpc.WithChainStreamInterceptor(streamInterceptors...),
grpc.WithChainUnaryInterceptor(unaryInterceptors...),
grpc.WithConnectParams(connParams),
client.UnaryInterceptor(),
client.StreamInterceptor(),
}
......
......@@ -18,6 +18,8 @@ import (
"gitlab.com/gitlab-org/gitaly/v15/internal/testhelper"
"gitlab.com/gitlab-org/gitaly/v15/internal/testhelper/promtest"
"google.golang.org/grpc"
"google.golang.org/grpc/backoff"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/credentials/insecure"
"google.golang.org/grpc/health"
"google.golang.org/grpc/health/grpc_health_v1"
......@@ -562,3 +564,45 @@ func TestNodeStatus_IsHealthy(t *testing.T) {
})
})
}
func TestNodeStatus_NoBackoffDelay(t *testing.T) {
socket := testhelper.GetTemporaryGitalySocketFileName(t)
address := "unix://" + socket
_, stopServer := testhelper.NewServerWithHealthAndStop(t, socket)
b := backoff.DefaultConfig
b.MaxDelay = dialMaxBackoff
connParams := grpc.ConnectParams{Backoff: b}
dialOpts := []grpc.DialOption{grpc.WithConnectParams(connParams)}
clientConn, err := client.Dial(address, dialOpts)
require.NoError(t, err)
defer func() { require.NoError(t, clientConn.Close()) }()
latencyHistMock := &promtest.MockHistogramVec{}
node := config.Node{Storage: "gitaly-0", Address: address}
ns := newConnectionStatus(node, clientConn, testhelper.NewDiscardingLogger(t), latencyHistMock, nil)
ctx := testhelper.Context(t)
stopServer() // Kill the existing server.
// Fail repeatedly to grow exponential backoff.
// With a default config this would end up at roughly 20 seconds.
for i := 0; i < 60; i++ {
_, err = ns.CheckHealth(ctx)
testhelper.RequireGrpcCode(t, err, codes.Unavailable)
fmt.Println(time.Now())
time.Sleep(1 * time.Second)
}
_ = testhelper.NewServerWithHealth(t, socket)
// Wait long enough for a short backoff to complete, but not a default backoff.
time.Sleep(2 * time.Second)
_, err = ns.CheckHealth(ctx)
// If the default backoff duration were used connection attempts would error out
// for 10-15 more seconds.
require.NoError(t, err)
}
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment