Skip to content

Commit 25e244e

Browse files
committed
Add AC hit rate metrics with prometheus labels
This is a draft, not ready to be merged. Summary of what is added by this commit: - A Prometheus counter for the cache hit ratio of AC requests only. - Support for Prometheus labels based on custom HTTP and gRPC headers. The cache hit ratio for CAS entries is easily misinterpreted. Example: a typical action cache hit often involves 3 or more HTTP requests: GET AC 200; GET CAS 200 (.o file); GET CAS 200 (.d file); ... But a cache miss for the same action is typically a single HTTP request: GET AC 404. The ratio between all HTTP GET 200 and HTTP GET 404 responses above does not represent the cache hit ratio experienced by the user for actions. The ratio of only AC requests is easier to reason about, especially when AC requests check the existence of CAS dependencies. The number of AC hits and misses can be directly compared against the numbers printed at the end of each build by the bazel client, and against other Prometheus counters produced by remote execution systems for executed actions. An understanding of the reasons for cache misses is necessary to improve the cache hit ratio. It could be that the system has been configured in a way that prevents artifacts from being reused between different operating systems, or that the cache is only populated by CI jobs on master, potentially resulting in cache misses for other users, etc. It becomes easier to notice such patterns if the cache hit ratio can be calculated for different categories of builds. Such categories can be set as custom headers via bazel flags such as --remote_header=branch=master and applied as Prometheus labels. The mapping of headers to Prometheus labels is controlled in bazel-remote's config file. The ratio between cache uploads and cache misses is also relevant, as a view of which categories are not uploading their results. The ratio of cache uploads can also indicate whether much is uploaded but seldom requested. E.g. does it make sense to populate central caches from interactive builds, or only from CI?
Categories and custom headers could also be set to get an overview of: - Which Bazel versions are using a cache instance? - How much are separate organizations using a cache instance? - Which network does the traffic originate from? - Which products are built using the cache? - Does the traffic come via a proxy that adds its own headers? - Which requests are dummy requests for monitoring the cache, as opposed to real requests? - ...
1 parent 0a6dd55 commit 25e244e

15 files changed

Lines changed: 335 additions & 38 deletions

File tree

BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ go_library(
2222
"//config:go_default_library",
2323
"//server:go_default_library",
2424
"//utils/idle:go_default_library",
25+
"//utils/metrics:go_default_library",
2526
"//utils/rlimit:go_default_library",
2627
"@com_github_abbot_go_http_auth//:go_default_library",
2728
"@com_github_grpc_ecosystem_go_grpc_prometheus//:go_default_library",

README.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,23 @@ host: localhost
223223

224224
# If true, enable experimental remote asset API support:
225225
#experimental_remote_asset_api: true
226+
227+
# Allows mapping HTTP and gRPC headers to prometheus
228+
# labels. Headers can be set by bazel client as:
229+
# --remote_header=os=ubuntu18-04. Not all counters are
230+
# affected.
231+
#metrics:
232+
# categories:
233+
# os:
234+
# - rhel7
235+
# - rhel8
236+
# - ubuntu16-04
237+
# - ubuntu18-04
238+
# branch:
239+
# - master
240+
# user:
241+
# - ci
242+
226243
```
227244

228245
## Docker

cache/cache.go

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ const (
2121
// used for HTTP when running with the --disable_http_ac_validation
2222
// commandline flag.
2323
RAW
24+
25+
UNKNOWN
2426
)
2527

2628
func (e EntryKind) String() string {
@@ -30,7 +32,10 @@ func (e EntryKind) String() string {
3032
if e == CAS {
3133
return "cas"
3234
}
33-
return "raw"
35+
if e == RAW {
36+
return "raw"
37+
}
38+
return "unknown"
3439
}
3540

3641
// Logger is designed to be satisfied by log.Logger.

config/config.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,11 @@ type HTTPBackendConfig struct {
3535
BaseURL string `yaml:"url"`
3636
}
3737

38+
// Metrics stores configuration for prometheus metrics.
39+
type Metrics struct {
40+
Categories map[string][]string `yaml:"categories"`
41+
}
42+
3843
// Config holds the top-level configuration for bazel-remote.
3944
type Config struct {
4045
Host string `yaml:"host"`
@@ -55,6 +60,7 @@ type Config struct {
5560
DisableGRPCACDepsCheck bool `yaml:"disable_grpc_ac_deps_check"`
5661
EnableACKeyInstanceMangling bool `yaml:"enable_ac_key_instance_mangling"`
5762
EnableEndpointMetrics bool `yaml:"enable_endpoint_metrics"`
63+
Metrics *Metrics `yaml:"metrics"`
5864
ExperimentalRemoteAssetAPI bool `yaml:"experimental_remote_asset_api"`
5965
HTTPReadTimeout time.Duration `yaml:"http_read_timeout"`
6066
HTTPWriteTimeout time.Duration `yaml:"http_write_timeout"`
@@ -73,6 +79,7 @@ func New(dir string, maxSize int, host string, port int, grpcPort int,
7379
disableGRPCACDepsCheck bool,
7480
enableACKeyInstanceMangling bool,
7581
enableEndpointMetrics bool,
82+
metrics *Metrics,
7683
experimentalRemoteAssetAPI bool,
7784
httpReadTimeout time.Duration,
7885
httpWriteTimeout time.Duration) (*Config, error) {
@@ -95,6 +102,7 @@ func New(dir string, maxSize int, host string, port int, grpcPort int,
95102
DisableGRPCACDepsCheck: disableGRPCACDepsCheck,
96103
EnableACKeyInstanceMangling: enableACKeyInstanceMangling,
97104
EnableEndpointMetrics: enableEndpointMetrics,
105+
Metrics: metrics,
98106
ExperimentalRemoteAssetAPI: experimentalRemoteAssetAPI,
99107
HTTPReadTimeout: httpReadTimeout,
100108
HTTPWriteTimeout: httpWriteTimeout,

main.go

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ import (
2222
"github.com/buchgr/bazel-remote/config"
2323
"github.com/buchgr/bazel-remote/server"
2424
"github.com/buchgr/bazel-remote/utils/idle"
25+
"github.com/buchgr/bazel-remote/utils/metrics"
2526
"github.com/buchgr/bazel-remote/utils/rlimit"
2627

2728
grpc_prometheus "github.com/grpc-ecosystem/go-grpc-prometheus"
@@ -283,6 +284,7 @@ func main() {
283284
ctx.Bool("disable_grpc_ac_deps_check"),
284285
ctx.Bool("enable_ac_key_instance_mangling"),
285286
ctx.Bool("enable_endpoint_metrics"),
287+
nil,
286288
ctx.Bool("experimental_remote_asset_api"),
287289
ctx.Duration("http_read_timeout"),
288290
ctx.Duration("http_write_timeout"),
@@ -311,6 +313,7 @@ func main() {
311313

312314
accessLogger := log.New(os.Stdout, "", logFlags)
313315
errorLogger := log.New(os.Stderr, "", logFlags)
316+
metrics := metrics.NewMetrics(c.Metrics)
314317

315318
var proxyCache cache.Proxy
316319
if c.GoogleCloudStorage != nil {
@@ -344,8 +347,7 @@ func main() {
344347
}
345348

346349
validateAC := !c.DisableHTTPACValidation
347-
h := server.NewHTTPCache(diskCache, accessLogger, errorLogger, validateAC, c.EnableACKeyInstanceMangling, gitCommit)
348-
350+
h := server.NewHTTPCache(diskCache, accessLogger, errorLogger, metrics, validateAC, c.EnableACKeyInstanceMangling, gitCommit)
349351
var htpasswdSecrets auth.SecretProvider
350352
cacheHandler := h.CacheHandler
351353
if c.HtpasswdFile != "" {
@@ -444,7 +446,7 @@ func main() {
444446
validateAC,
445447
c.EnableACKeyInstanceMangling,
446448
enableRemoteAssetAPI,
447-
diskCache, accessLogger, errorLogger)
449+
diskCache, accessLogger, errorLogger, metrics)
448450
if err3 != nil {
449451
log.Fatal(err3)
450452
}

server/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ go_library(
1818
"//cache:go_default_library",
1919
"//cache/disk:go_default_library",
2020
"//utils/idle:go_default_library",
21+
"//utils/metrics:go_default_library",
2122
"@com_github_abbot_go_http_auth//:go_default_library",
2223
"@com_github_bazelbuild_remote_apis//build/bazel/remote/asset/v1:go_default_library",
2324
"@com_github_bazelbuild_remote_apis//build/bazel/remote/execution/v2:go_default_library",

server/grpc.go

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ import (
1818

1919
"github.com/buchgr/bazel-remote/cache"
2020
"github.com/buchgr/bazel-remote/cache/disk"
21-
21+
"github.com/buchgr/bazel-remote/utils/metrics"
2222
_ "github.com/mostynb/go-grpc-compression/snappy" // Register snappy
2323
_ "github.com/mostynb/go-grpc-compression/zstd" // and zstd support.
2424
)
@@ -39,6 +39,7 @@ type grpcServer struct {
3939
errorLogger cache.Logger
4040
depsCheck bool
4141
mangleACKeys bool
42+
metrics metrics.Metrics
4243
}
4344

4445
// ListenAndServeGRPC creates a new gRPC server and listens on the given
@@ -48,27 +49,28 @@ func ListenAndServeGRPC(addr string, opts []grpc.ServerOption,
4849
validateACDeps bool,
4950
mangleACKeys bool,
5051
enableRemoteAssetAPI bool,
51-
c *disk.Cache, a cache.Logger, e cache.Logger) error {
52+
c *disk.Cache, a cache.Logger, e cache.Logger, m metrics.Metrics) error {
5253

5354
listener, err := net.Listen("tcp", addr)
5455
if err != nil {
5556
return err
5657
}
5758

58-
return serveGRPC(listener, opts, validateACDeps, mangleACKeys, enableRemoteAssetAPI, c, a, e)
59+
return serveGRPC(listener, opts, validateACDeps, mangleACKeys, enableRemoteAssetAPI, c, a, e, m)
5960
}
6061

6162
func serveGRPC(l net.Listener, opts []grpc.ServerOption,
6263
validateACDepsCheck bool,
6364
mangleACKeys bool,
6465
enableRemoteAssetAPI bool,
65-
c *disk.Cache, a cache.Logger, e cache.Logger) error {
66+
c *disk.Cache, a cache.Logger, e cache.Logger, m metrics.Metrics) error {
6667

6768
srv := grpc.NewServer(opts...)
6869
s := &grpcServer{
6970
cache: c, accessLogger: a, errorLogger: e,
7071
depsCheck: validateACDepsCheck,
7172
mangleACKeys: mangleACKeys,
73+
metrics: m,
7274
}
7375
pb.RegisterActionCacheServer(srv, s)
7476
pb.RegisterCapabilitiesServer(srv, s)

server/grpc_ac.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,12 @@ import (
1313
pb "github.com/bazelbuild/remote-apis/build/bazel/remote/execution/v2"
1414
"github.com/golang/protobuf/proto"
1515
"google.golang.org/grpc/codes"
16+
"google.golang.org/grpc/metadata"
1617
"google.golang.org/grpc/peer"
1718
"google.golang.org/grpc/status"
1819

1920
"github.com/buchgr/bazel-remote/cache"
21+
"github.com/buchgr/bazel-remote/utils/metrics"
2022
)
2123

2224
var (
@@ -63,6 +65,7 @@ func (s *grpcServer) GetActionResult(ctx context.Context,
6365
}
6466
if rdr == nil || sizeBytes <= 0 {
6567
s.accessLogger.Printf("%s %s %s", logPrefix, req.ActionDigest.Hash, "NOT FOUND")
68+
s.incAcRequestMetrics(metrics.METHOD_GET, metrics.NOT_FOUND, ctx)
6669
return nil, status.Error(codes.NotFound,
6770
fmt.Sprintf("%s not found in AC", req.ActionDigest.Hash))
6871
}
@@ -82,6 +85,7 @@ func (s *grpcServer) GetActionResult(ctx context.Context,
8285
}
8386

8487
s.accessLogger.Printf("%s %s OK", logPrefix, req.ActionDigest.Hash)
88+
s.incAcRequestMetrics(metrics.METHOD_GET, metrics.OK, ctx)
8589
return result, nil
8690
}
8791

@@ -93,6 +97,7 @@ func (s *grpcServer) GetActionResult(ctx context.Context,
9397

9498
if result == nil {
9599
s.accessLogger.Printf("%s %s NOT FOUND", logPrefix, req.ActionDigest.Hash)
100+
s.incAcRequestMetrics(metrics.METHOD_GET, metrics.NOT_FOUND, ctx)
96101
return nil, status.Error(codes.NotFound,
97102
fmt.Sprintf("%s not found in AC", req.ActionDigest.Hash))
98103
}
@@ -129,6 +134,7 @@ func (s *grpcServer) GetActionResult(ctx context.Context,
129134
}
130135

131136
s.accessLogger.Printf("GRPC AC GET %s OK", req.ActionDigest.Hash)
137+
s.incAcRequestMetrics(metrics.METHOD_GET, metrics.OK, ctx)
132138

133139
return result, nil
134140
}
@@ -290,6 +296,7 @@ func (s *grpcServer) UpdateActionResult(ctx context.Context,
290296
}
291297

292298
s.accessLogger.Printf("GRPC AC PUT %s OK", req.ActionDigest.Hash)
299+
s.incAcRequestMetrics(metrics.METHOD_PUT, metrics.OK, ctx)
293300

294301
// Trivia: the RE API wants us to return the ActionResult from the
295302
// request, in order to follow this standard method style guide:
@@ -331,3 +338,8 @@ func addWorkerMetadataGRPC(ctx context.Context, ar *pb.ActionResult) {
331338

332339
ar.ExecutionMetadata.Worker = worker
333340
}
341+
342+
func (s *grpcServer) incAcRequestMetrics(method metrics.Method, status metrics.Status, ctx context.Context) {
343+
headers, _ := metadata.FromIncomingContext(ctx)
344+
s.metrics.IncomingRequestCompleted(metrics.AC, method, status, headers, metrics.GRPC)
345+
}

server/grpc_test.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ func TestMain(m *testing.M) {
7373

7474
accessLogger := testutils.NewSilentLogger()
7575
errorLogger := testutils.NewSilentLogger()
76+
metrics := testutils.NewMetricsStub()
7677

7778
listener = bufconn.Listen(bufSize)
7879

@@ -87,7 +88,7 @@ func TestMain(m *testing.M) {
8788
validateAC,
8889
mangleACKeys,
8990
enableRemoteAssetAPI,
90-
diskCache, accessLogger, errorLogger)
91+
diskCache, accessLogger, errorLogger, metrics)
9192
if err2 != nil {
9293
fmt.Println(err2)
9394
os.Exit(1)

0 commit comments

Comments
 (0)