📊 Priority: MEDIUM - Production Readiness
Background
The application lacks performance and health metrics, making it difficult to monitor system behavior, detect anomalies, or set up alerting in production. Prometheus metrics provide standardized monitoring capabilities.
Current State - No Metrics
No instrumentation exists for:
- HTTP request duration/throughput
- WebSocket connection counts
- Database operation latency
- Error rates
- System resource usage
Recommended Solution
Part 1: Install prom-client
Run `npm install prom-client` in the backend package so the modules below can import it.
Part 2: Metrics Configuration
// backend/src/monitoring/metrics.js (NEW FILE)
import client from 'prom-client';
// Metric constructors, pulled off the prom-client default export once.
const { Counter, Gauge, Histogram } = client;

// Enable default metrics (CPU, memory, event loop lag), namespaced with the
// same `gemini_flow_` prefix used by the custom metrics below.
client.collectDefaultMetrics({
  prefix: 'gemini_flow_',
  gcDurationBuckets: [0.001, 0.01, 0.1, 1, 2, 5],
});

/** Histogram of HTTP request durations (seconds) per method / route / status code. */
export const httpRequestDuration = new Histogram({
  name: 'gemini_flow_http_request_duration_seconds',
  help: 'Duration of HTTP requests in seconds',
  labelNames: ['method', 'route', 'status_code'],
  buckets: [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2, 5],
});

/** Counter of HTTP requests per method / route / status code. */
export const httpRequestTotal = new Counter({
  name: 'gemini_flow_http_requests_total',
  help: 'Total number of HTTP requests',
  labelNames: ['method', 'route', 'status_code'],
});

/** Gauge tracking how many WebSocket connections are currently open. */
export const wsConnectionsActive = new Gauge({
  name: 'gemini_flow_websocket_connections_active',
  help: 'Number of active WebSocket connections',
});

/**
 * Counter of WebSocket messages.
 * Labels: `direction` is inbound/outbound, `type` is the event type.
 */
export const wsMessagesTotal = new Counter({
  name: 'gemini_flow_websocket_messages_total',
  help: 'Total WebSocket messages sent/received',
  labelNames: ['direction', 'type'],
});

/**
 * Histogram of database operation durations (seconds).
 * Labels: `operation` is read/write/delete, `collection` is workflows/store/sessions.
 */
export const dbOperationDuration = new Histogram({
  name: 'gemini_flow_db_operation_duration_seconds',
  help: 'Duration of database operations in seconds',
  labelNames: ['operation', 'collection'],
  buckets: [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1],
});

/**
 * Counter of database operations.
 * Labels: as for the duration histogram, plus `status` (success/error).
 */
export const dbOperationTotal = new Counter({
  name: 'gemini_flow_db_operations_total',
  help: 'Total number of database operations',
  labelNames: ['operation', 'collection', 'status'],
});

/** Gauge holding the number of workflows currently stored. */
export const workflowsTotal = new Gauge({
  name: 'gemini_flow_workflows_total',
  help: 'Total number of workflows in database',
});

/** Histogram describing how many nodes individual workflows contain. */
export const workflowNodesHistogram = new Histogram({
  name: 'gemini_flow_workflow_nodes',
  help: 'Distribution of node counts across workflows',
  buckets: [0, 10, 25, 50, 100, 250, 500, 1000],
});

/**
 * Counter of errors.
 * Labels: `type` is the error category (validation/database/network/etc),
 * `path` is where the error occurred.
 */
export const errorsTotal = new Counter({
  name: 'gemini_flow_errors_total',
  help: 'Total number of errors',
  labelNames: ['type', 'path'],
});

/** The prom-client default registry, which every metric above registers with. */
export const register = client.register;
Part 3: HTTP Metrics Middleware
// backend/src/api/middleware/metricsMiddleware.js (NEW FILE)
import { httpRequestDuration, httpRequestTotal } from '../../monitoring/metrics.js';
/**
 * Express middleware that records the duration and count of every HTTP request.
 *
 * The observation is made when the response emits `finish`, so the status code
 * and the matched route are both known by then.
 *
 * @param {object} req - Express request.
 * @param {object} res - Express response; must support `on('finish', ...)`.
 * @param {Function} next - Continues the middleware chain.
 * @returns {void}
 */
export function metricsMiddleware(req, res, next) {
  // Monotonic clock: unlike Date.now() it cannot jump backwards or forwards
  // when the system wall clock is adjusted, so durations are never negative.
  const startedAt = process.hrtime.bigint();

  res.on('finish', () => {
    const durationSeconds = Number(process.hrtime.bigint() - startedAt) / 1e9;

    // Label with the matched route *template* (mount path + route pattern),
    // never the raw URL. Raw paths contain ids and arbitrary 404 probes, which
    // would create one time series per distinct URL. Requests that matched no
    // route are collapsed into a single fixed label value.
    const route = req.route
      ? `${req.baseUrl ?? ''}${req.route.path}`
      : 'unmatched';

    const labels = {
      method: req.method,
      route,
      status_code: res.statusCode,
    };

    httpRequestDuration.observe(labels, durationSeconds);
    httpRequestTotal.inc(labels);
  });

  next();
}
Part 4: Instrument WebSocket Server
// backend/src/websocket/server.js
import { wsConnectionsActive, wsMessagesTotal } from '../monitoring/metrics.js';
/**
 * WebSocket server instrumented with connection and message metrics.
 *
 * Only the instrumented methods are shown; `this.clients` (a Map of client id
 * to socket) and `handleDisconnection` are provided by the existing class.
 */
export class WebSocketServer {
  /**
   * Registers a new connection and wires up its metrics.
   *
   * @param {object} ws - The connected socket; must support `on(event, handler)`.
   * @param {object} req - The upgrade request.
   */
  handleConnection(ws, req) {
    // ... existing auth code ...
    // slice(2, 11) takes the same nine characters the deprecated substr(2, 9) did.
    const clientId = `client-${Date.now()}-${Math.random().toString(36).slice(2, 11)}`;
    this.clients.set(clientId, ws);

    // Increment connection gauge
    wsConnectionsActive.inc();

    ws.on('message', (data) => {
      wsMessagesTotal.inc({ direction: 'inbound', type: 'message' });
      // ... handle message ...
    });

    ws.on('close', () => {
      try {
        this.handleDisconnection(clientId);
      } finally {
        // Decrement even if cleanup throws, otherwise the gauge drifts upwards.
        wsConnectionsActive.dec();
      }
    });
  }

  /**
   * Broadcasts an event and counts one outbound message per recipient.
   *
   * @param {{type: string}} event - Event to send; `type` becomes the metric label.
   * @param {?string} [excludeClientId=null] - Client that should not receive the event.
   */
  broadcast(event, excludeClientId = null) {
    // ... existing broadcast code ...

    // Only subtract the excluded client when it is actually connected. Blindly
    // subtracting 1 undercounts for unknown ids and, with no clients at all,
    // yields -1, which a counter rejects.
    const excludedCount = this.clients.has(excludeClientId) ? 1 : 0;
    const recipientCount = this.clients.size - excludedCount;

    wsMessagesTotal.inc(
      {
        direction: 'outbound',
        type: event.type,
      },
      recipientCount,
    );
  }
}
Part 5: Instrument Database Operations
// backend/src/db/database.js
import { dbOperationDuration, dbOperationTotal, workflowsTotal } from '../monitoring/metrics.js';
// Wrapper for instrumented operations
/**
 * Builds a wrapper that times a database operation and records its outcome.
 *
 * @param {string} operation - Operation label (read/write/delete).
 * @param {string} collection - Collection label (workflows/store/sessions).
 * @returns {function(function(): Promise<*>): Promise<*>} Async function that
 *   runs the given callback, resolves with its result or rethrows its error
 *   unchanged, and in both cases records the duration and a success/error count.
 */
function instrument(operation, collection) {
  return async function instrumented(fn) {
    // Monotonic clock: a wall-clock adjustment during the operation must not
    // produce a negative or inflated duration.
    const startedAt = process.hrtime.bigint();
    let status = 'success';

    try {
      return await fn();
    } catch (error) {
      status = 'error';
      throw error;
    } finally {
      const durationSeconds = Number(process.hrtime.bigint() - startedAt) / 1e9;
      dbOperationDuration.observe({ operation, collection }, durationSeconds);
      dbOperationTotal.inc({ operation, collection, status });
    }
  };
}
/**
 * Loads every workflow from WORKFLOWS_FILE, recorded as a `read` on `workflows`.
 *
 * Also refreshes the workflow-count gauge with the number of workflows read.
 *
 * @returns {Promise<Array>} The file's `workflows` value, or an empty array
 *   when that property is missing or falsy.
 */
export async function getAllWorkflows() {
  const trackRead = instrument('read', 'workflows');

  return trackRead(async () => {
    const raw = await fs.readFile(WORKFLOWS_FILE, 'utf-8');
    const workflows = JSON.parse(raw).workflows || [];

    // Update workflow count gauge
    workflowsTotal.set(workflows.length);

    return workflows;
  });
}
/**
 * Updates a workflow, recorded as a `write` on `workflows`.
 *
 * @param {*} id - Identifier of the workflow to update.
 * @param {*} updates - Changes to apply.
 * @returns {Promise<*>} Whatever the wrapped update logic resolves with.
 */
export async function updateWorkflow(id, updates) {
  const trackWrite = instrument('write', 'workflows');

  return trackWrite(async () => {
    // ... existing update code ...
  });
}
Part 6: Metrics Endpoint
// backend/src/server.js
import { register } from './monitoring/metrics.js';
import { metricsMiddleware } from './api/middleware/metricsMiddleware.js';
// Add metrics middleware (after request ID, before routes)
app.use(metricsMiddleware);

// Metrics endpoint (no auth for Prometheus scraper)
// NOTE(review): because it is unauthenticated, confirm this path is only
// reachable from the monitoring network and not exposed publicly.
app.get('/metrics', async (req, res) => {
  try {
    // Collect first so a failure never goes out with the Prometheus content type.
    const body = await register.metrics();
    res.set('Content-Type', register.contentType);
    res.end(body);
  } catch (err) {
    // A rejection inside an async handler is not forwarded to the error
    // middleware by Express 4; without this catch the scrape would hang until
    // it times out instead of failing fast.
    res.status(500).end(err instanceof Error ? err.message : String(err));
  }
});
Part 7: Error Instrumentation
// backend/src/api/middleware/errorHandler.js
import { errorsTotal } from '../../monitoring/metrics.js';
/**
 * Express error-handling middleware that counts errors before handling them.
 *
 * @param {*} err - The error passed down the middleware chain.
 * @param {object} req - Express request.
 * @param {object} res - Express response.
 * @param {Function} next - Continues the middleware chain.
 */
export function errorHandler(err, req, res, next) {
  // Use the matched route template rather than the raw request path: raw
  // paths embed ids and arbitrary client input, which would create one time
  // series per distinct URL. Errors raised before any route matched share a
  // single fixed label value.
  const routeLabel = req.route
    ? `${req.baseUrl ?? ''}${req.route.path}`
    : 'unmatched';

  // Increment error counter
  errorsTotal.inc({
    type: err?.name || 'UnknownError',
    path: routeLabel,
  });

  // ... existing error handling ...
}
Files to Create
backend/src/monitoring/metrics.js (new)
backend/src/api/middleware/metricsMiddleware.js (new)
Files to Modify
backend/src/server.js (add metrics endpoint and middleware)
backend/src/websocket/server.js (instrument connections and messages)
backend/src/db/database.js (instrument operations)
backend/src/api/middleware/errorHandler.js (instrument errors)
Prometheus Configuration
# prometheus.yml
# Scrapes the backend's /metrics endpoint every 15 seconds.
scrape_configs:
  - job_name: 'gemini-flow-backend'
    scrape_interval: 15s
    static_configs:
      - targets: ['localhost:3001']
    metrics_path: '/metrics'
Acceptance Criteria
Example Queries (PromQL)
# Average HTTP request duration by route
# (aggregate away method/status_code so there is one series per route)
sum by (route) (rate(gemini_flow_http_request_duration_seconds_sum[5m]))
/
sum by (route) (rate(gemini_flow_http_request_duration_seconds_count[5m]))
# Error rate percentage
# (both sides are summed first: the two metrics carry different label sets,
# so dividing the raw series would match nothing and return no data)
100 * sum(rate(gemini_flow_errors_total[5m]))
/
sum(rate(gemini_flow_http_requests_total[5m]))
# 95th percentile request latency
# (buckets are merged across all label combinations, keeping only `le`)
histogram_quantile(0.95,
  sum by (le) (rate(gemini_flow_http_request_duration_seconds_bucket[5m]))
)
# Active WebSocket connections
gemini_flow_websocket_connections_active
Grafana Dashboard (Optional)
Create monitoring/grafana-dashboard.json with panels for:
- Request throughput (requests/sec)
- Request latency (p50, p95, p99)
- Error rate
- WebSocket connections
- Database operation latency
- CPU and memory usage
Testing Metrics Endpoint
# Scrape metrics
curl http://localhost:3001/metrics
# Example output:
# HELP gemini_flow_http_requests_total Total number of HTTP requests
# TYPE gemini_flow_http_requests_total counter
# gemini_flow_http_requests_total{method="GET",route="/api/workflows",status_code="200"} 42
#
# HELP gemini_flow_websocket_connections_active Number of active WebSocket connections
# TYPE gemini_flow_websocket_connections_active gauge
# gemini_flow_websocket_connections_active 3
References
Additional Context
Metrics are essential for production monitoring, alerting, and capacity planning. Implement alongside structured logging (#73) for complete observability.
📊 Priority: MEDIUM - Production Readiness
Background
The application lacks performance and health metrics, making it difficult to monitor system behavior, detect anomalies, or set up alerting in production. Prometheus metrics provide standardized monitoring capabilities.
Current State - No Metrics
No instrumentation exists for:
Recommended Solution
Part 1: Install prom-client
Part 2: Metrics Configuration
Part 3: HTTP Metrics Middleware
Part 4: Instrument WebSocket Server
Part 5: Instrument Database Operations
Part 6: Metrics Endpoint
Part 7: Error Instrumentation
Files to Create
backend/src/monitoring/metrics.js (new)
backend/src/api/middleware/metricsMiddleware.js (new)
Files to Modify
backend/src/server.js (add metrics endpoint and middleware)
backend/src/websocket/server.js (instrument connections and messages)
backend/src/db/database.js (instrument operations)
backend/src/api/middleware/errorHandler.js (instrument errors)
Prometheus Configuration
Acceptance Criteria
Example Queries (PromQL)
Grafana Dashboard (Optional)
Create monitoring/grafana-dashboard.json with panels for:
Testing Metrics Endpoint
References
Additional Context
Metrics are essential for production monitoring, alerting, and capacity planning. Implement alongside structured logging (#73) for complete observability.