grafos_observe/
metrics.rs

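//! Core metrics types for fabric observability.
//!
//! All types are built on `core` atomics and need no allocator, so they work
//! in `no_std` environments (on targets that provide 64-bit atomics). The
//! [`FabricMetrics`] singleton collects system-wide counters, gauges, and
//! latency histograms.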
5
6use core::sync::atomic::{AtomicI64, AtomicU64, Ordering};
7
/// Monotonically increasing counter backed by an atomic `u64`.
///
/// Counters only go up — use them for totals like "operations completed"
/// or "bytes transferred". Every method takes `&self` and performs a single
/// relaxed atomic operation, so a counter can be updated from several
/// threads without locking. Relaxed ordering gives no ordering guarantee
/// between this counter and any other memory location.
///
/// Additions wrap around on `u64` overflow (the behaviour of
/// [`AtomicU64::fetch_add`]); no method panics.
///
/// # Examples
///
/// ```
/// use grafos_observe::MetricCounter;
///
/// let c = MetricCounter::new();
/// c.inc();
/// c.add(10);
/// assert_eq!(c.get(), 11);
///
/// let prev = c.reset();
/// assert_eq!(prev, 11);
/// assert_eq!(c.get(), 0);
/// ```
#[derive(Debug)]
pub struct MetricCounter {
    /// Running total; only touched through atomic operations.
    value: AtomicU64,
}

impl Default for MetricCounter {
    /// Equivalent to [`MetricCounter::new`]: a counter at zero.
    fn default() -> Self {
        Self::new()
    }
}

impl MetricCounter {
    /// Create a new counter starting at zero.
    ///
    /// This is a `const fn`, so it can initialise a `static`.
    pub const fn new() -> Self {
        Self {
            value: AtomicU64::new(0),
        }
    }

    /// Increment the counter by one.
    pub fn inc(&self) {
        self.add(1);
    }

    /// Increment the counter by `n` (wrapping on overflow).
    pub fn add(&self, n: u64) {
        self.value.fetch_add(n, Ordering::Relaxed);
    }

    /// Read the current counter value.
    pub fn get(&self) -> u64 {
        self.value.load(Ordering::Relaxed)
    }

    /// Reset the counter to zero. Returns the previous value.
    ///
    /// The read and the reset happen as one atomic swap, so an increment
    /// racing with `reset` is either included in the returned value or
    /// counted towards the new total — never lost.
    pub fn reset(&self) -> u64 {
        self.value.swap(0, Ordering::Relaxed)
    }
}
65
/// Current-value gauge backed by an atomic `i64`.
///
/// Gauges go up and down — use them for values like "active leases"
/// or "connections open". Every method takes `&self` and performs a single
/// relaxed atomic operation, so a gauge can be updated from several threads
/// without locking.
///
/// [`inc`](MetricGauge::inc) and [`dec`](MetricGauge::dec) wrap around on
/// `i64` overflow (the behaviour of the underlying atomic add/sub); no
/// method panics.
///
/// # Examples
///
/// ```
/// use grafos_observe::MetricGauge;
///
/// let g = MetricGauge::new();
/// g.inc();
/// g.inc();
/// g.dec();
/// assert_eq!(g.get(), 1);
///
/// g.set(-5);
/// assert_eq!(g.get(), -5);
/// ```
#[derive(Debug)]
pub struct MetricGauge {
    /// Current level; only touched through atomic operations.
    value: AtomicI64,
}

impl Default for MetricGauge {
    /// Equivalent to [`MetricGauge::new`]: a gauge at zero.
    fn default() -> Self {
        Self::new()
    }
}

impl MetricGauge {
    /// Create a new gauge starting at zero.
    ///
    /// This is a `const fn`, so it can initialise a `static`.
    pub const fn new() -> Self {
        Self {
            value: AtomicI64::new(0),
        }
    }

    /// Increment the gauge by one.
    pub fn inc(&self) {
        self.value.fetch_add(1, Ordering::Relaxed);
    }

    /// Decrement the gauge by one.
    pub fn dec(&self) {
        self.value.fetch_sub(1, Ordering::Relaxed);
    }

    /// Set the gauge to an absolute value.
    ///
    /// This is a plain store: it overwrites whatever concurrent `inc`/`dec`
    /// calls have accumulated so far.
    pub fn set(&self, val: i64) {
        self.value.store(val, Ordering::Relaxed);
    }

    /// Read the current gauge value.
    pub fn get(&self) -> i64 {
        self.value.load(Ordering::Relaxed)
    }
}
123
/// Fixed-bucket latency histogram.
///
/// Tracks the distribution of values (typically durations in microseconds)
/// across predefined buckets. No allocator needed — the bucket array is
/// inline. Uses 10 buckets: 100us, 500us, 1ms, 5ms, 10ms, 50ms, 100ms,
/// 500ms, 1s, +Inf.
///
/// Buckets are cumulative: each bucket count includes all observations that
/// also fall into lower buckets. This matches the Prometheus histogram
/// convention. Bucket bounds are inclusive (`value <= bound`).
///
/// Each observation is recorded with several independent relaxed atomic
/// updates (sum, total, then each matching bucket), so a reader running
/// concurrently with [`observe`](MetricHistogram::observe) may see a
/// partially applied observation. The running sum wraps around on `u64`
/// overflow.
///
/// # Examples
///
/// ```
/// use grafos_observe::MetricHistogram;
///
/// let h = MetricHistogram::new();
/// h.observe(50);     // 50us — lands in the <=100us bucket
/// h.observe(2_000);  // 2ms  — lands in the <=5ms bucket
///
/// assert_eq!(h.count(), 2);
/// assert_eq!(h.sum(), 2050);
/// assert_eq!(h.bucket_count(0), 1); // <=100us: only the 50us observation
/// assert_eq!(h.bucket_count(3), 2); // <=5ms: both observations
/// ```
#[derive(Debug)]
pub struct MetricHistogram {
    /// Bucket upper bounds in microseconds. The last bucket is +Inf (u64::MAX).
    bounds: [u64; Self::NUM_BUCKETS],
    /// Counts per bucket (cumulative — each bucket includes all lower buckets).
    counts: [AtomicU64; Self::NUM_BUCKETS],
    /// Sum of all observed values.
    sum: AtomicU64,
    /// Total number of observations.
    total: AtomicU64,
}

impl Default for MetricHistogram {
    /// Equivalent to [`MetricHistogram::new`]: empty, with the default bounds.
    fn default() -> Self {
        Self::new()
    }
}

impl MetricHistogram {
    /// Number of histogram buckets.
    pub const NUM_BUCKETS: usize = 10;

    /// Bucket boundaries in microseconds, in ascending order.
    pub const BUCKET_BOUNDS: [u64; Self::NUM_BUCKETS] = [
        100,       // 100us
        500,       // 500us
        1_000,     // 1ms
        5_000,     // 5ms
        10_000,    // 10ms
        50_000,    // 50ms
        100_000,   // 100ms
        500_000,   // 500ms
        1_000_000, // 1s
        u64::MAX,  // +Inf
    ];

    /// Create a new, empty histogram with the default bucket boundaries.
    ///
    /// This is a `const fn`, so it can initialise a `static`.
    pub const fn new() -> Self {
        // `AtomicU64` is not `Copy`, so `[AtomicU64::new(0); N]` is rejected.
        // Repeating a `const` item is allowed and instantiates a fresh atomic
        // for every slot, which is exactly what is wanted here; the constant
        // itself is never mutated, hence the lint is a false positive.
        #[allow(clippy::declare_interior_mutable_const)]
        const ZERO: AtomicU64 = AtomicU64::new(0);

        Self {
            bounds: Self::BUCKET_BOUNDS,
            counts: [ZERO; Self::NUM_BUCKETS],
            sum: AtomicU64::new(0),
            total: AtomicU64::new(0),
        }
    }

    /// Record an observation (value in microseconds).
    ///
    /// Adds the value to the running sum, bumps the observation total, and
    /// increments the count of every bucket whose bound is >= the value
    /// (cumulative histogram, matching Prometheus convention).
    pub fn observe(&self, value_us: u64) {
        self.sum.fetch_add(value_us, Ordering::Relaxed);
        self.total.fetch_add(1, Ordering::Relaxed);
        for (slot, &bound) in self.counts.iter().zip(self.bounds.iter()) {
            if value_us <= bound {
                slot.fetch_add(1, Ordering::Relaxed);
            }
        }
    }

    /// Read the cumulative count for a specific bucket index.
    ///
    /// Returns `0` when `index` is not a valid bucket index.
    pub fn bucket_count(&self, index: usize) -> u64 {
        self.counts
            .get(index)
            .map_or(0, |slot| slot.load(Ordering::Relaxed))
    }

    /// Read the bucket upper bound for a specific bucket index (in microseconds).
    ///
    /// Returns `u64::MAX` when `index` is not a valid bucket index — the same
    /// value as the +Inf bucket's bound.
    pub fn bucket_bound(&self, index: usize) -> u64 {
        self.bounds.get(index).copied().unwrap_or(u64::MAX)
    }

    /// Total number of observations.
    pub fn count(&self) -> u64 {
        self.total.load(Ordering::Relaxed)
    }

    /// Sum of all observed values (in microseconds).
    pub fn sum(&self) -> u64 {
        self.sum.load(Ordering::Relaxed)
    }
}
248
249/// Global fabric metrics registry.
250///
251/// Tracks system-wide counters and gauges for lease lifecycles,
252/// data-plane operations, and graph rewrites. Access the process-wide
253/// singleton via [`FabricMetrics::global()`].
254///
255/// # Examples
256///
257/// ```
258/// use grafos_observe::FabricMetrics;
259///
260/// let m = FabricMetrics::global();
261/// m.leases_total.inc();
262/// m.leases_active.inc();
263/// m.ops_total.add(5);
264/// m.bytes_read.add(1024);
265/// m.op_latency.observe(300);
266/// ```
267pub struct FabricMetrics {
268    /// Currently active leases (gauge — goes up on acquire, down on drop/expire).
269    pub leases_active: MetricGauge,
270    /// Total leases ever created (counter).
271    pub leases_total: MetricCounter,
272    /// Total leases that expired (counter).
273    pub leases_expired: MetricCounter,
274    /// Total leases that entered fenced state (counter).
275    pub leases_fenced: MetricCounter,
276    /// Total data-plane operations completed (counter).
277    pub ops_total: MetricCounter,
278    /// Total data-plane operations that failed (counter).
279    pub ops_errors: MetricCounter,
280    /// Total bytes read across all data-plane operations (counter).
281    pub bytes_read: MetricCounter,
282    /// Total bytes written across all data-plane operations (counter).
283    pub bytes_written: MetricCounter,
284    /// Total graph rewrites initiated (counter).
285    pub rewrites_total: MetricCounter,
286    /// Histogram of operation latencies in microseconds.
287    pub op_latency: MetricHistogram,
288    /// Total leases explicitly revoked (counter) — distinct from expired.
289    pub leases_revoked: MetricCounter,
290    /// Histogram of lease bind latency in microseconds.
291    pub bind_latency: MetricHistogram,
292    /// Histogram of lease renewal latency in microseconds.
293    pub renew_latency: MetricHistogram,
294    /// Histogram of lease revocation latency in microseconds.
295    pub revoke_latency: MetricHistogram,
296    /// Histogram of teardown execution latency in microseconds.
297    pub teardown_latency: MetricHistogram,
298    /// Total authentication failures (counter).
299    pub auth_failures: MetricCounter,
300    /// Total anti-replay cache rejections (counter).
301    pub replay_rejections: MetricCounter,
302    /// Total capability token validations (counter).
303    pub token_validations: MetricCounter,
304    /// Total capability token validation failures (counter).
305    pub token_failures: MetricCounter,
306    /// Total stale access attempts after revoke/expiry (counter).
307    pub stale_access_rejections: MetricCounter,
308    /// Histogram of control-plane operation latencies in microseconds.
309    pub control_latency: MetricHistogram,
310    /// Histogram of dataplane operation latencies in microseconds.
311    pub dataplane_latency: MetricHistogram,
312    /// Histogram of tasklet submit (full dispatch) latencies in microseconds.
313    pub tasklet_submit_latency: MetricHistogram,
314    /// Histogram of tasklet execution-only latencies in microseconds.
315    pub tasklet_exec_latency: MetricHistogram,
316    /// Total tasklet submissions received (counter).
317    pub tasklet_submits: MetricCounter,
318    /// Total tasklet executions that completed successfully (counter).
319    pub tasklet_completions: MetricCounter,
320    /// Total tasklet executions that failed (counter).
321    pub tasklet_failures: MetricCounter,
322    /// Histogram of tasklet wall-clock duration in microseconds.
323    pub tasklet_duration: MetricHistogram,
324    /// Total module cache hits (counter).
325    pub module_cache_hits: MetricCounter,
326    /// Total module cache misses (counter).
327    pub module_cache_misses: MetricCounter,
328    /// Total module cache stores (counter).
329    pub module_cache_stores: MetricCounter,
330    /// Total module cache hash mismatches (counter).
331    pub module_cache_hash_mismatches: MetricCounter,
332}
333
334impl Default for FabricMetrics {
335    fn default() -> Self {
336        Self::new()
337    }
338}
339
340impl FabricMetrics {
341    /// Create a new metrics instance with all values at zero.
342    pub const fn new() -> Self {
343        Self {
344            leases_active: MetricGauge::new(),
345            leases_total: MetricCounter::new(),
346            leases_expired: MetricCounter::new(),
347            leases_fenced: MetricCounter::new(),
348            ops_total: MetricCounter::new(),
349            ops_errors: MetricCounter::new(),
350            bytes_read: MetricCounter::new(),
351            bytes_written: MetricCounter::new(),
352            rewrites_total: MetricCounter::new(),
353            op_latency: MetricHistogram::new(),
354            leases_revoked: MetricCounter::new(),
355            bind_latency: MetricHistogram::new(),
356            renew_latency: MetricHistogram::new(),
357            revoke_latency: MetricHistogram::new(),
358            teardown_latency: MetricHistogram::new(),
359            auth_failures: MetricCounter::new(),
360            replay_rejections: MetricCounter::new(),
361            token_validations: MetricCounter::new(),
362            token_failures: MetricCounter::new(),
363            stale_access_rejections: MetricCounter::new(),
364            control_latency: MetricHistogram::new(),
365            dataplane_latency: MetricHistogram::new(),
366            tasklet_submit_latency: MetricHistogram::new(),
367            tasklet_exec_latency: MetricHistogram::new(),
368            tasklet_submits: MetricCounter::new(),
369            tasklet_completions: MetricCounter::new(),
370            tasklet_failures: MetricCounter::new(),
371            tasklet_duration: MetricHistogram::new(),
372            module_cache_hits: MetricCounter::new(),
373            module_cache_misses: MetricCounter::new(),
374            module_cache_stores: MetricCounter::new(),
375            module_cache_hash_mismatches: MetricCounter::new(),
376        }
377    }
378
379    /// Access the global metrics instance.
380    pub fn global() -> &'static FabricMetrics {
381        static INSTANCE: FabricMetrics = FabricMetrics::new();
382        &INSTANCE
383    }
384}
385
#[cfg(test)]
mod tests {
    //! Unit tests for the metric primitives and the `FabricMetrics` registry.

    use super::*;

    /// `inc` adds one and `add` adds an arbitrary amount.
    #[test]
    fn counter_inc_and_add() {
        let c = MetricCounter::new();
        assert_eq!(c.get(), 0);
        c.inc();
        assert_eq!(c.get(), 1);
        c.add(10);
        assert_eq!(c.get(), 11);
    }

    /// `reset` hands back the old total and leaves the counter at zero.
    #[test]
    fn counter_reset() {
        let c = MetricCounter::new();
        c.add(42);
        let prev = c.reset();
        assert_eq!(prev, 42);
        assert_eq!(c.get(), 0);
    }

    /// Gauges move in both directions and accept negative absolute values.
    #[test]
    fn gauge_inc_dec_set() {
        let g = MetricGauge::new();
        assert_eq!(g.get(), 0);
        g.inc();
        g.inc();
        assert_eq!(g.get(), 2);
        g.dec();
        assert_eq!(g.get(), 1);
        g.set(100);
        assert_eq!(g.get(), 100);
        g.set(-5);
        assert_eq!(g.get(), -5);
    }

    /// `Default` must agree with `new` for every metric type.
    #[test]
    fn defaults_start_at_zero() {
        assert_eq!(MetricCounter::default().get(), 0);
        assert_eq!(MetricGauge::default().get(), 0);

        let h = MetricHistogram::default();
        assert_eq!(h.count(), 0);
        assert_eq!(h.sum(), 0);

        let m = FabricMetrics::default();
        assert_eq!(m.ops_total.get(), 0);
        assert_eq!(m.leases_active.get(), 0);
        assert_eq!(m.op_latency.count(), 0);
    }

    #[test]
    fn histogram_observe() {
        let h = MetricHistogram::new();
        // Observation of 50us should land in the 100us bucket (index 0) and all above.
        h.observe(50);
        assert_eq!(h.count(), 1);
        assert_eq!(h.sum(), 50);
        // Bucket 0 (<=100us) should have 1 count.
        assert_eq!(h.bucket_count(0), 1);
        // Bucket 9 (+Inf) should also have 1 count.
        assert_eq!(h.bucket_count(9), 1);
    }

    #[test]
    fn histogram_multiple_observations() {
        let h = MetricHistogram::new();
        h.observe(50); // <=100us bucket
        h.observe(200); // <=500us bucket
        h.observe(2_000); // <=5ms bucket
        h.observe(999_999); // <=1s bucket

        assert_eq!(h.count(), 4);
        assert_eq!(h.sum(), 50 + 200 + 2_000 + 999_999);

        // Bucket 0 (<=100us): only the 50us observation.
        assert_eq!(h.bucket_count(0), 1);
        // Bucket 1 (<=500us): 50 + 200.
        assert_eq!(h.bucket_count(1), 2);
        // Bucket 3 (<=5ms): 50 + 200 + 2000.
        assert_eq!(h.bucket_count(3), 3);
        // Bucket 8 (<=1s): all four.
        assert_eq!(h.bucket_count(8), 4);
        // +Inf: all four.
        assert_eq!(h.bucket_count(9), 4);
    }

    /// A value exactly equal to a bound belongs to that bucket; one more
    /// microsecond pushes it into the next bucket.
    #[test]
    fn histogram_bounds_are_inclusive() {
        let h = MetricHistogram::new();
        h.observe(100);
        assert_eq!(h.bucket_count(0), 1);
        h.observe(101);
        assert_eq!(h.bucket_count(0), 1);
        assert_eq!(h.bucket_count(1), 2);
    }

    /// Values above the 1s bound are only counted by the +Inf bucket.
    #[test]
    fn histogram_huge_value_only_hits_inf_bucket() {
        let h = MetricHistogram::new();
        h.observe(u64::MAX);
        assert_eq!(h.count(), 1);
        assert_eq!(h.sum(), u64::MAX);
        assert_eq!(h.bucket_count(8), 0);
        assert_eq!(h.bucket_count(9), 1);
    }

    /// `bucket_bound` exposes exactly the published `BUCKET_BOUNDS`, and a
    /// fresh histogram has no counts in any bucket.
    #[test]
    fn histogram_bucket_bounds_match_constant() {
        let h = MetricHistogram::new();
        for (i, &bound) in MetricHistogram::BUCKET_BOUNDS.iter().enumerate() {
            assert_eq!(h.bucket_bound(i), bound);
            assert_eq!(h.bucket_count(i), 0);
        }
    }

    #[test]
    fn histogram_out_of_bounds_index() {
        let h = MetricHistogram::new();
        assert_eq!(h.bucket_count(99), 0);
        assert_eq!(h.bucket_bound(99), u64::MAX);
    }

    #[test]
    fn fabric_metrics_global_is_singleton() {
        let m1 = FabricMetrics::global();
        let m2 = FabricMetrics::global();
        // Same address.
        assert!(core::ptr::eq(m1, m2));
    }

    #[test]
    fn fabric_metrics_fields() {
        let m = FabricMetrics::new();
        m.leases_active.inc();
        m.leases_total.inc();
        m.ops_total.add(5);
        m.bytes_read.add(1024);
        m.bytes_written.add(512);
        m.rewrites_total.inc();
        m.leases_expired.inc();
        m.leases_fenced.inc();
        m.ops_errors.inc();

        assert_eq!(m.leases_active.get(), 1);
        assert_eq!(m.leases_total.get(), 1);
        assert_eq!(m.ops_total.get(), 5);
        assert_eq!(m.bytes_read.get(), 1024);
        assert_eq!(m.bytes_written.get(), 512);
        assert_eq!(m.rewrites_total.get(), 1);
        assert_eq!(m.leases_expired.get(), 1);
        assert_eq!(m.leases_fenced.get(), 1);
        assert_eq!(m.ops_errors.get(), 1);

        m.leases_revoked.inc();
        m.auth_failures.inc();
        m.replay_rejections.inc();
        m.token_validations.add(10);
        m.token_failures.inc();
        m.stale_access_rejections.inc();
        m.bind_latency.observe(100);
        m.renew_latency.observe(200);
        m.revoke_latency.observe(300);
        m.teardown_latency.observe(400);

        assert_eq!(m.leases_revoked.get(), 1);
        assert_eq!(m.auth_failures.get(), 1);
        assert_eq!(m.replay_rejections.get(), 1);
        assert_eq!(m.token_validations.get(), 10);
        assert_eq!(m.token_failures.get(), 1);
        assert_eq!(m.stale_access_rejections.get(), 1);
        assert_eq!(m.bind_latency.count(), 1);
        assert_eq!(m.renew_latency.count(), 1);
        assert_eq!(m.revoke_latency.count(), 1);
        assert_eq!(m.teardown_latency.count(), 1);
    }

    /// Covers the latency, tasklet, and module-cache fields that
    /// `fabric_metrics_fields` does not touch.
    #[test]
    fn fabric_metrics_latency_tasklet_and_cache_fields() {
        let m = FabricMetrics::new();

        m.op_latency.observe(10);
        m.control_latency.observe(20);
        m.dataplane_latency.observe(30);
        m.tasklet_submit_latency.observe(40);
        m.tasklet_exec_latency.observe(50);
        m.tasklet_duration.observe(60);

        assert_eq!(m.op_latency.sum(), 10);
        assert_eq!(m.control_latency.sum(), 20);
        assert_eq!(m.dataplane_latency.sum(), 30);
        assert_eq!(m.tasklet_submit_latency.sum(), 40);
        assert_eq!(m.tasklet_exec_latency.sum(), 50);
        assert_eq!(m.tasklet_duration.sum(), 60);

        m.tasklet_submits.add(3);
        m.tasklet_completions.add(2);
        m.tasklet_failures.inc();
        m.module_cache_hits.add(4);
        m.module_cache_misses.inc();
        m.module_cache_stores.inc();
        m.module_cache_hash_mismatches.inc();

        assert_eq!(m.tasklet_submits.get(), 3);
        assert_eq!(m.tasklet_completions.get(), 2);
        assert_eq!(m.tasklet_failures.get(), 1);
        assert_eq!(m.module_cache_hits.get(), 4);
        assert_eq!(m.module_cache_misses.get(), 1);
        assert_eq!(m.module_cache_stores.get(), 1);
        assert_eq!(m.module_cache_hash_mismatches.get(), 1);
    }
}