From 0c0de999ea772859f202106bff57905200d4935c Mon Sep 17 00:00:00 2001
From: Fredrik Ekre
Date: Wed, 1 Nov 2023 23:01:16 +0100
Subject: [PATCH] Add GCCollector (#1)

---
Review notes (text between `---` and the first `diff --git` is not part of the commit):
 - Line breaks and indentation were reconstructed from a whitespace-collapsed copy of
   this patch; context-line indentation and the `index` blob ids may therefore not match
   the target tree exactly. Confirm against the repository before applying.
 - test/runtests.jl: the rendered exposition text is computed once (`str`) and reused in
   the `occursin` check instead of calling `sprint` a second time.
 - README.md: "shutdown" -> "shut down", "etc)" -> "etc.)" in the added lines.
 - All edits are line-count neutral, so the hunk headers and the diffstat are unchanged.

 CHANGELOG.md      |  3 +-
 README.md         | 20 +++++++++++--
 src/Prometheus.jl | 71 +++++++++++++++++++++++++++++++++++++++++++++--
 test/runtests.jl  | 37 ++++++++++++++++++++++++
 4 files changed, 125 insertions(+), 6 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8b023d1..32d7b70 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,7 +11,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 First stable release of Prometheus.jl:
 
- - Supported collectors: Counter, Gauge, Summary
+ - Supported basic collectors: Counter, Gauge, Summary
+ - GCCollector for metrics about allocations and garbage collection
   - Support for default and custom collector registries
   - Support for metric labeling
   - Support for exposing metrics to file and over HTTP
diff --git a/README.md b/README.md
index c975c9b..75a9f5a 100644
--- a/README.md
+++ b/README.md
@@ -28,13 +28,20 @@
 
 3. Visit in your browser. You will see something like the following
    ```
+   # HELP gc_alloc_bytes_total Total number of allocated bytes
+   # TYPE gc_alloc_bytes_total counter
+   gc_alloc_bytes_total 365578814
+
+   [...]
+
    # HELP request_count Number of handled requests
    # TYPE request_count counter
    request_count 1
    ```
-   which is how the counter is presented when Prometheus scrapes the metrics.
-   Every time you refresh, the counter will increment its value.
-   `close(server)` will shutdown the server.
+   The default output contains metrics about allocations and garbage collections (see
+   [GCCollector](#gccollector)), as well as the request counter that we added ourselves.
+   Every time you refresh, the counter will increment its value. `close(server)` will
+   shut down the server.
 
 ## Collectors
 
@@ -66,6 +73,13 @@ See for details.
 Supported methods:
  - `Prometheus.observe(summary, v)`: record the observed value `v`.
 
+### GCCollector
+
+A collector that exports metrics about allocations and garbage collection (for example
+number of allocations, number of bytes allocated, time spent in garbage collection, etc.).
+These metrics have the `gc_` prefix in their name.
+
+
 ## Labels
 
 See for details.
diff --git a/src/Prometheus.jl b/src/Prometheus.jl
index 001bb5d..c43979c 100644
--- a/src/Prometheus.jl
+++ b/src/Prometheus.jl
@@ -69,8 +69,6 @@ struct CollectorRegistry
     end
 end
 
-const DEFAULT_REGISTRY = CollectorRegistry()
-
 function register(reg::CollectorRegistry, collector::Collector)
     existing_names = Set{String}() # TODO: Cache existing_names in the registry?
     @lock reg.lock begin
@@ -401,6 +399,71 @@ function collect!(metrics::Vector, family::Family{C}) where C
     return metrics
 end
 
+############################
+# GCCollector <: Collector #
+############################
+
+mutable struct GCCollector <: Collector
+    function GCCollector(registry::Union{CollectorRegistry, Nothing}=DEFAULT_REGISTRY)
+        gcc = new()
+        if registry !== nothing
+            register(registry, gcc)
+        end
+        return gcc
+    end
+end
+
+function metric_names(::GCCollector)
+    return (
+        "gc_alloc_total", "gc_free_total", "gc_alloc_bytes_total",
+        "gc_live_bytes", "gc_seconds_total", "gc_collections_total",
+    )
+end
+
+function collect!(metrics::Vector, ::GCCollector)
+    # Snapshot the GC counters once; see base/timing.jl for the field definitions
+    gc_num = Base.gc_num()
+    gc_live_bytes = Base.gc_live_bytes()
+    # Append one Metric per name listed in metric_names(::GCCollector)
+    push!(metrics,
+        Metric(
+            "counter", "gc_alloc_total", "Total number of allocations (calls to malloc, realloc, etc)",
+            LabelNames(["type"]),
+            [
+                Sample(nothing, LabelValues(["bigalloc"]), gc_num.bigalloc),
+                Sample(nothing, LabelValues(["malloc"]), gc_num.malloc),
+                Sample(nothing, LabelValues(["poolalloc"]), gc_num.poolalloc),
+                Sample(nothing, LabelValues(["realloc"]), gc_num.realloc),
+            ],
+        ),
+        Metric(
+            "counter", "gc_free_total", "Total number of calls to free()",
+            nothing, Sample(nothing, nothing, gc_num.freecall),
+        ),
+        Metric(
+            "counter", "gc_alloc_bytes_total", "Total number of allocated bytes", nothing,
+            Sample(nothing, nothing, Base.gc_total_bytes(gc_num)),
+        ),
+        Metric(
+            "gauge", "gc_live_bytes", "Current number of live bytes", nothing,
+            Sample(nothing, nothing, gc_live_bytes),
+        ),
+        Metric(
+            "counter", "gc_seconds_total", "Total time spent in garbage collection", nothing,
+            Sample(nothing, nothing, gc_num.total_time / 10^9), # [ns] to [s]
+        ),
+        Metric(
+            "counter", "gc_collections_total", "Total number of calls to garbage collection",
+            LabelNames(["type"]),
+            [
+                Sample(nothing, LabelValues(["full"]), gc_num.full_sweep),
+                Sample(nothing, LabelValues(["minor"]), gc_num.pause - gc_num.full_sweep),
+            ],
+        ),
+    )
+    return metrics
+end
+
 
 ##############
 # Exposition #
@@ -563,4 +626,8 @@ function expose(http::HTTP.Stream, reg::CollectorRegistry = DEFAULT_REGISTRY; co
     return
 end
 
+# Default registry and collectors
+const DEFAULT_REGISTRY = CollectorRegistry()
+const GC_COLLECTOR = GCCollector(DEFAULT_REGISTRY)
+
 end # module Prometheus
diff --git a/test/runtests.jl b/test/runtests.jl
index b7eadbe..fe71e84 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -281,6 +281,43 @@ end
         """
 end
 
+@testset "Prometheus.GCCollector" begin
+    r = Prometheus.CollectorRegistry()
+    c = Prometheus.GCCollector(r)
+    @test c in r.collectors
+    # Record before and after stats and test that the metrics are in between
+    old_stats = Base.gc_num()
+    x = zeros(1024^2); x = nothing; GC.gc(); GC.gc()
+    metrics = Prometheus.collect(c)
+    x = zeros(1024^2); x = nothing; GC.gc(); GC.gc()
+    new_stats = Base.gc_num()
+    @test length(metrics) == 6
+    gc_alloc_total = metrics[findfirst(x -> x.metric_name == "gc_alloc_total", metrics)]
+    @test old_stats.bigalloc <= gc_alloc_total.samples[1].value <= new_stats.bigalloc
+    @test old_stats.malloc <= gc_alloc_total.samples[2].value <= new_stats.malloc
+    @test old_stats.poolalloc <= gc_alloc_total.samples[3].value <= new_stats.poolalloc
+    @test old_stats.realloc <= gc_alloc_total.samples[4].value <= new_stats.realloc
+    gc_free_total = metrics[findfirst(x -> x.metric_name == "gc_free_total", metrics)]
+    @test old_stats.freecall <= gc_free_total.samples.value <= new_stats.freecall
+    gc_alloc_bytes_total = metrics[findfirst(x -> x.metric_name == "gc_alloc_bytes_total", metrics)]
+    @test Base.gc_total_bytes(old_stats) <= gc_alloc_bytes_total.samples.value <= Base.gc_total_bytes(new_stats)
+    gc_seconds_total = metrics[findfirst(x -> x.metric_name == "gc_seconds_total", metrics)]
+    @test old_stats.total_time / 10^9 <= gc_seconds_total.samples.value <= new_stats.total_time / 10^9
+    # Prometheus.expose_metric(...)
+    str = sprint(Prometheus.expose_metric, gc_alloc_total)
+    @test occursin(
+        r"""
+        # HELP gc_alloc_total Total number of allocations \(calls to malloc, realloc, etc\)
+        # TYPE gc_alloc_total counter
+        gc_alloc_total{type="bigalloc"} \d+
+        gc_alloc_total{type="malloc"} \d+
+        gc_alloc_total{type="poolalloc"} \d+
+        gc_alloc_total{type="realloc"} \d+
+        """,
+        str,
+    )
+end
+
 @testset "Prometheus.expose(::Union{String, IO})" begin
     r = Prometheus.DEFAULT_REGISTRY
     empty!(r.collectors)