Introduction
“It was working yesterday, so why doesn’t it work today?” Every developer has said this at least once. In a simple monolith you can still get by reading the logs. But with distributed architectures, microservices and frequent deployments, that approach quickly shows its limits.
Modern observability goes well beyond plain monitoring. It is the ability to understand the internal state of a system from the data it exposes. Let’s look at how to put it in place effectively.
The three pillars of observability
Logs: the story of what happened
Structured logs vs. plain-text logs
// ❌ Plain-text log, hard to parse
console.log(`User ${userId} failed to login from ${ip} at ${new Date()}`);
// ✅ Structured log, easy to analyze
logger.info('User login failed', {
userId,
ip,
timestamp: new Date().toISOString(),
event: 'login_failed',
metadata: {
userAgent: req.headers['user-agent'],
attemptNumber: 3,
reason: 'invalid_credentials'
}
});
Meaningful log levels
// Logging guidance by level
const logLevels = {
ERROR: {
when: 'Something failed and requires action',
examples: ['Database connection lost', 'Payment processing failed'],
alerting: true
},
WARN: {
when: 'Something unexpected but recoverable happened',
examples: ['Retry successful after failure', 'Deprecated API used'],
alerting: false
},
INFO: {
when: 'Significant business events',
examples: ['User registered', 'Order completed', 'Service started'],
alerting: false
},
DEBUG: {
when: 'Detailed execution flow (development only)',
examples: ['Function parameters', 'SQL queries', 'HTTP requests'],
alerting: false
}
};
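In practice, the minimum level is driven by the environment so that DEBUG never reaches production. A minimal sketch, assuming winston as the structured logger (any JSON logger exposes an equivalent setting):
// Minimum log level driven by the environment (assumes winston)
const winston = require('winston');

const logger = winston.createLogger({
  // DEBUG in development, INFO and above elsewhere; LOG_LEVEL overrides both
  level: process.env.LOG_LEVEL || (process.env.NODE_ENV === 'production' ? 'info' : 'debug'),
  format: winston.format.json(),
  defaultMeta: { service: 'user-service', environment: process.env.NODE_ENV },
  transports: [new winston.transports.Console()]
});

logger.debug('SQL query executed', { query: 'SELECT 1' }); // dropped in production
logger.info('User registered', { event: 'user_registered' });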
Metrics: the system’s vital signs
The four fundamental metrics (Golden Signals)
// Instrumenting the Golden Signals (assuming prom-client as the metrics library)
const client = require('prom-client');

const metrics = {
  // Latency - response time
  latency: new client.Histogram({
    name: 'http_request_duration_seconds',
    help: 'Duration of HTTP requests in seconds',
    labelNames: ['method', 'route', 'status_code']
  }),

  // Traffic - request volume
  traffic: new client.Counter({
    name: 'http_requests_total',
    help: 'Total number of HTTP requests',
    labelNames: ['method', 'route', 'status_code']
  }),

  // Errors - error rate
  errors: new client.Counter({
    name: 'http_requests_errors_total',
    help: 'Total number of HTTP request errors',
    labelNames: ['method', 'route', 'error_type']
  }),

  // Saturation - resource utilization
  saturation: new client.Gauge({
    name: 'system_resource_usage',
    help: 'System resource usage percentage',
    labelNames: ['resource_type'] // cpu, memory, disk
  })
};
Critical business metrics
// Business metrics worth tracking (prom-client as above)
const businessMetrics = {
  // E-commerce
  orderConversionRate: new client.Gauge({ name: 'orders_conversion_rate', help: 'Order conversion rate' }),
  averageOrderValue: new client.Histogram({ name: 'order_value_euros', help: 'Order value in euros' }),
  checkoutAbandonRate: new client.Gauge({ name: 'checkout_abandon_rate', help: 'Checkout abandonment rate' }),

  // SaaS
  monthlyActiveUsers: new client.Gauge({ name: 'monthly_active_users', help: 'Monthly active users' }),
  featureUsageRate: new client.Counter({ name: 'feature_usage_total', help: 'Feature usage count', labelNames: ['feature_name'] }),
  subscriptionChurnRate: new client.Gauge({ name: 'subscription_churn_rate', help: 'Subscription churn rate' }),

  // General
  userRegistrationRate: new client.Counter({ name: 'user_registrations_total', help: 'Total user registrations' }),
  apiUsageByClient: new client.Counter({ name: 'api_calls_total', help: 'API calls per client', labelNames: ['client_id', 'endpoint'] })
};
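Gauges like orders_conversion_rate are usually not updated inline with every request; they are recomputed on a schedule. A sketch, assuming the gauges above and hypothetical countOrders / countCheckoutsStarted query helpers:
// Recompute the conversion rate every minute
// (countOrders and countCheckoutsStarted are hypothetical data-access helpers)
setInterval(async () => {
  const orders = await countOrders({ lastHour: true });
  const checkouts = await countCheckoutsStarted({ lastHour: true });
  if (checkouts > 0) {
    businessMetrics.orderConversionRate.set(orders / checkouts);
  }
}, 60 * 1000);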
Traces: following a request end to end
Distributed tracing with OpenTelemetry
// OpenTelemetry setup
const { NodeSDK } = require('@opentelemetry/sdk-node');
const { getNodeAutoInstrumentations } = require('@opentelemetry/auto-instrumentations-node');
const { JaegerExporter } = require('@opentelemetry/exporter-jaeger');
const { Resource } = require('@opentelemetry/resources');
const { SemanticResourceAttributes } = require('@opentelemetry/semantic-conventions');

const sdk = new NodeSDK({
  resource: new Resource({
    [SemanticResourceAttributes.SERVICE_NAME]: 'user-service',
    [SemanticResourceAttributes.SERVICE_VERSION]: '1.2.3',
    [SemanticResourceAttributes.DEPLOYMENT_ENVIRONMENT]: process.env.NODE_ENV
  }),
  traceExporter: new JaegerExporter({
    endpoint: 'http://jaeger:14268/api/traces'
  }),
  instrumentations: [getNodeAutoInstrumentations()]
});

sdk.start();
// Manual instrumentation for critical business logic
const { trace, context, SpanStatusCode } = require('@opentelemetry/api');

const tracer = trace.getTracer('user-service');

async function processOrder(orderId) {
  const span = tracer.startSpan('process_order');
  // Make this span the parent of the child spans created below
  const ctx = trace.setSpan(context.active(), span);
  try {
    // Load the order (getOrder is an assumed data-access helper)
    const order = await getOrder(orderId);
    span.setAttributes({
      'order.id': orderId,
      'user.id': order.userId
    });

    // Validation
    const validationSpan = tracer.startSpan('validate_order', undefined, ctx);
    await validateOrder(order);
    validationSpan.end();

    // Payment
    const paymentSpan = tracer.startSpan('process_payment', undefined, ctx);
    const payment = await processPayment(order);
    paymentSpan.setAttributes({
      'payment.amount': payment.amount,
      'payment.method': payment.method
    });
    paymentSpan.end();

    // Fulfillment
    await triggerFulfillment(order);

    span.setStatus({ code: SpanStatusCode.OK });
    return order;
  } catch (error) {
    span.recordException(error);
    span.setStatus({
      code: SpanStatusCode.ERROR,
      message: error.message
    });
    throw error;
  } finally {
    span.end();
  }
}
Practical implementation, layer by layer
Application: instrumenting the code
Observability middleware
// Full Express middleware
const observabilityMiddleware = (req, res, next) => {
  const startTime = Date.now();
  const requestId = generateRequestId(); // e.g. crypto.randomUUID()

  // Enriched log context
  req.log = logger.child({
    requestId,
    method: req.method,
    url: req.url,
    userAgent: req.headers['user-agent'],
    ip: req.ip
  });

  // Header for distributed tracing / correlation
  res.set('X-Request-ID', requestId);

  res.on('finish', () => {
    const duration = (Date.now() - startTime) / 1000;

    // Automatic metrics: the labels are built here because the route
    // and status code are only known once the response has finished
    const labels = {
      method: req.method,
      route: req.route?.path || 'unknown',
      status_code: res.statusCode
    };
    metrics.traffic.inc(labels);
    metrics.latency.observe(labels, duration);
    if (res.statusCode >= 400) {
      metrics.errors.inc({
        method: req.method,
        route: labels.route,
        error_type: `http_${res.statusCode}`
      });
    }

    // Request log
    req.log.info('Request completed', {
      statusCode: res.statusCode,
      responseTime: duration,
      contentLength: res.get('Content-Length')
    });
  });

  next();
};
Smart health checks
// Detailed health check
app.get('/health', async (req, res) => {
const checks = {
timestamp: new Date().toISOString(),
service: 'user-service',
version: process.env.npm_package_version,
uptime: process.uptime(),
dependencies: {
database: await checkDatabase(),
redis: await checkRedis(),
externalApi: await checkExternalAPI()
},
resources: {
memory: {
used: process.memoryUsage().heapUsed,
total: process.memoryUsage().heapTotal,
percentage: Math.round((process.memoryUsage().heapUsed / process.memoryUsage().heapTotal) * 100)
},
cpu: await getCPUUsage()
}
};
const isHealthy = Object.values(checks.dependencies).every(check => check.status === 'ok');
res.status(isHealthy ? 200 : 503).json(checks);
});
async function checkDatabase() {
try {
const start = Date.now();
await db.raw('SELECT 1');
return {
status: 'ok',
responseTime: Date.now() - start
};
} catch (error) {
return {
status: 'error',
error: error.message
};
}
}
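checkRedis and checkExternalAPI follow the same pattern as checkDatabase. A sketch for the Redis check, assuming a client that exposes ping() (ioredis, for example):
// Sketch of the Redis dependency check (assumes a client with ping(), e.g. ioredis)
async function checkRedis() {
  try {
    const start = Date.now();
    await redis.ping();
    return { status: 'ok', responseTime: Date.now() - start };
  } catch (error) {
    return { status: 'error', error: error.message };
  }
}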
Infrastructure: system metrics
Docker containers monitoring
# docker-compose.yml with monitoring
version: '3.8'

services:
  app:
    build: .
    environment:
      - OTEL_EXPORTER_JAEGER_ENDPOINT=http://jaeger:14268
    labels:
      - "monitoring.enable=true"
      - "monitoring.port=9090"

  # Prometheus for metrics
  prometheus:
    image: prom/prometheus
    ports:
      - "9090:9090"
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
      - '--storage.tsdb.retention.time=200h'
      - '--web.enable-lifecycle'

  # Grafana for visualization
  grafana:
    image: grafana/grafana
    ports:
      - "3001:3000"
    environment:
      - GF_SECURITY_ADMIN_PASSWORD=admin
    volumes:
      - grafana-storage:/var/lib/grafana
      - ./grafana/dashboards:/etc/grafana/provisioning/dashboards
      - ./grafana/datasources:/etc/grafana/provisioning/datasources

volumes:
  grafana-storage:
Prometheus configuration
# prometheus.yml
global:
  scrape_interval: 15s
  evaluation_interval: 15s

rule_files:
  - "alert_rules.yml"

scrape_configs:
  - job_name: 'node-app'
    static_configs:
      - targets: ['app:9090']
    scrape_interval: 5s
    metrics_path: '/metrics'

  - job_name: 'node-exporter'
    static_configs:
      - targets: ['node-exporter:9100']
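For the node-app job to have something to scrape, the application must expose /metrics on port 9090. A minimal sketch with prom-client, the library assumed in the metrics section, serving metrics on a dedicated port so they stay off the public API:
// Expose the Prometheus registry over HTTP (assumes prom-client and Express)
const express = require('express');
const client = require('prom-client');

client.collectDefaultMetrics(); // process metrics: CPU, memory, event loop lag...

const metricsApp = express();
metricsApp.get('/metrics', async (req, res) => {
  res.set('Content-Type', client.register.contentType);
  res.end(await client.register.metrics());
});
metricsApp.listen(9090); // matches the app:9090 target in prometheus.yml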
Alerting: being notified at the right time
Smart alerting rules
# alert_rules.yml
groups:
  - name: application.rules
    rules:
      # High error rate
      - alert: HighErrorRate
        expr: rate(http_requests_errors_total[5m]) / rate(http_requests_total[5m]) > 0.05
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High error rate detected"
          description: "Error rate is {{ $value | humanizePercentage }} for the last 5 minutes"

      # High latency
      - alert: HighLatency
        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High latency detected"
          description: "95th percentile latency is {{ $value }}s for the last 5 minutes"

      # Service down
      - alert: ServiceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Service is down"
          description: "{{ $labels.instance }} has been down for more than 1 minute"

  - name: business.rules
    rules:
      # Conversion drop
      - alert: ConversionRateDrop
        expr: orders_conversion_rate < 0.02
        for: 10m
        labels:
          severity: warning
          team: product
        annotations:
          summary: "Conversion rate dropped significantly"
          description: "Conversion rate is {{ $value | humanizePercentage }}, below 2%"
Contextual notifications
// Alertmanager webhook for enriched Slack notifications
app.post('/webhook/alerts', (req, res) => {
const alerts = req.body.alerts;
alerts.forEach(alert => {
const slackMessage = {
channel: getChannelForAlert(alert),
attachments: [{
color: getColorForSeverity(alert.labels.severity),
title: alert.annotations.summary,
text: alert.annotations.description,
fields: [
{
title: 'Service',
value: alert.labels.job,
short: true
},
{
title: 'Environment',
value: alert.labels.environment,
short: true
},
{
title: 'Runbook',
value: `<${getRunbookUrl(alert.labels.alertname)}|View runbook>`,
short: false
}
],
actions: [
{
name: 'acknowledge',
text: 'Acknowledge',
type: 'button',
value: alert.fingerprint
},
{
name: 'silence',
text: 'Silence 1h',
type: 'button',
value: alert.fingerprint
}
],
footer: 'Prometheus Alert',
ts: new Date(alert.startsAt).getTime() / 1000
}]
};
sendSlackMessage(slackMessage);
});
res.status(200).send('OK');
});
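getChannelForAlert, getColorForSeverity, getRunbookUrl and sendSlackMessage are left out above. A sketch of what the first three might look like; the channel names, color mapping and wiki URL are purely illustrative:
// Illustrative sketches of the alert-routing helpers (mappings and URLs are assumptions)
function getChannelForAlert(alert) {
  const channelsByTeam = { product: '#alerts-product', platform: '#alerts-platform' };
  return channelsByTeam[alert.labels.team] || '#alerts-platform';
}

function getColorForSeverity(severity) {
  return { critical: 'danger', warning: 'warning' }[severity] || 'good';
}

function getRunbookUrl(alertname) {
  // Assumes runbooks live on an internal wiki, keyed by alert name
  return `https://wiki.company.com/runbooks/${alertname}`;
}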
Dashboards and visualization
Dashboards by audience
Developer dashboard (debugging focus)
{
"dashboard": {
"title": "Developer Debug Dashboard",
"panels": [
{
"title": "Request Rate by Endpoint",
"type": "graph",
"targets": [{
"expr": "rate(http_requests_total[5m])",
"legendFormat": "{{method}} {{route}}"
}]
},
{
"title": "Error Rate by Service",
"type": "singlestat",
"targets": [{
"expr": "rate(http_requests_errors_total[5m]) / rate(http_requests_total[5m]) * 100"
}]
},
{
"title": "Recent Error Logs",
"type": "logs",
"targets": [{
"expr": "{level=\"error\"} |= ``",
"refId": "A"
}]
},
{
"title": "Database Query Performance",
"type": "heatmap",
"targets": [{
"expr": "rate(db_query_duration_seconds_bucket[5m])"
}]
}
]
}
}
Business dashboard (business-metrics focus)
{
"dashboard": {
"title": "Business Metrics Dashboard",
"panels": [
{
"title": "Daily Active Users",
"type": "stat",
"targets": [{
"expr": "daily_active_users",
"legendFormat": "DAU"
}]
},
{
"title": "Conversion Funnel",
"type": "barchart",
"targets": [
{"expr": "visits_total", "legendFormat": "Visits"},
{"expr": "signups_total", "legendFormat": "Signups"},
{"expr": "purchases_total", "legendFormat": "Purchases"}
]
},
{
"title": "Revenue Trend",
"type": "timeseries",
"targets": [{
"expr": "increase(revenue_total[1d])",
"legendFormat": "Daily Revenue"
}]
}
]
}
}
Correlating multiple sources
Smart links between tools
// Enrich logs with links to the corresponding trace and metrics
logger.info('Payment processed', {
  orderId,
  amount,
  paymentMethod,
  // Direct link to the trace
  traceUrl: `https://jaeger.company.com/trace/${traceId}`,
  // Link to correlated metrics
  metricsUrl: `https://grafana.company.com/d/payments?var-order_id=${orderId}`
});
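The traceId injected above has to come from the current span. With OpenTelemetry it can be read from the active context, as in this sketch (it returns undefined outside of a trace):
// Read the trace id of the active OpenTelemetry span
const { trace } = require('@opentelemetry/api');

function currentTraceId() {
  const activeSpan = trace.getActiveSpan();
  return activeSpan ? activeSpan.spanContext().traceId : undefined;
}

const traceId = currentTraceId();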
Best practices and pitfalls to avoid
Structuring the data
Consistent naming conventions
// Metric naming conventions (Prometheus)
const metricNamingConventions = {
// Format: component_unit_total
counters: [
'http_requests_total',
'database_queries_total',
'errors_total'
],
// Format: component_unit (current value)
gauges: [
'memory_usage_bytes',
'active_connections',
'queue_size'
],
// Format: component_unit_duration (time-based)
histograms: [
'http_request_duration_seconds',
'database_query_duration_seconds'
]
};
// Consistent labels
const labelConventions = {
common: ['service', 'environment', 'version'],
http: ['method', 'route', 'status_code'],
database: ['table', 'operation', 'result'],
business: ['user_type', 'plan', 'feature']
};
Managing cardinality
// ❌ Cardinality too high (avoid)
metrics.userActions.inc({
  userId: req.user.id,      // too many possible values
  timestamp: Date.now(),    // unbounded
  sessionId: req.sessionId  // too many unique values
});

// ✅ Cardinality under control
metrics.userActions.inc({
  action: 'login',           // limited set of values
  userType: 'premium',       // fixed categories
  country: req.user.country  // finite set of values
});
Observability during development
Local development setup
# docker-compose.dev.yml
version: '3.8'

services:
  app:
    build: .
    environment:
      - LOG_LEVEL=debug
      - OTEL_EXPORTER_JAEGER_ENDPOINT=http://localhost:14268
    volumes:
      - .:/app
    ports:
      - "3000:3000"
      - "9090:9090" # metrics

  jaeger:
    image: jaegertracing/all-in-one:latest
    ports:
      - "16686:16686" # UI
      - "14268:14268" # HTTP endpoint
    environment:
      - COLLECTOR_ZIPKIN_HTTP_PORT=9411

  prometheus:
    image: prom/prometheus
    ports:
      - "9091:9090"
    volumes:
      - ./dev-prometheus.yml:/etc/prometheus/prometheus.yml
Observability tests
// Verify that the instrumentation actually works
describe('Observability', () => {
  test('should emit metrics on HTTP requests', async () => {
    const beforeCount = await getMetricValue('http_requests_total');

    await request(app)
      .get('/api/users')
      .expect(200);

    const afterCount = await getMetricValue('http_requests_total');
    expect(afterCount).toBeGreaterThan(beforeCount);
  });

  test('should create traces for business operations', async () => {
    await processOrder(testOrderId);

    // getTracesFromJaeger is an assumed helper that queries the Jaeger API
    const traces = await getTracesFromJaeger('process_order');
    expect(traces).toHaveLength(1);
    expect(traces[0].spans).toContain('validate_order');
  });
});
Implementation plan
Phase 1: Foundations (weeks 1-2)
- Structured logs: migrate console.log calls to a structured logger
- Basic metrics: instrument HTTP requests/responses
- Health checks: a detailed /health endpoint
- Local environment: Prometheus + Grafana running locally
Phase 2: Visibility (weeks 3-4)
- Developer dashboard: essential technical metrics
- Critical alerts: service down, high error rate
- Distributed tracing: basic OpenTelemetry setup
- Log aggregation: centralize structured logs
Phase 3: Correlation (month 2)
- Business metrics: conversion, feature usage
- Detailed traces: instrument critical business logic
- Business dashboard: real-time KPIs
- Business alerts: conversion drop, churn
Phase 4: Optimization (month 3+)
- SLOs/SLIs: define performance objectives
- Runbooks: incident-resolution procedures
- Chaos engineering: test resilience
- Post-mortem process: learn from incidents
Conclusion
Effective observability is not just a matter of tools; it is a culture. It changes the way we build, deploy and maintain applications.
The goal is not to monitor everything, but to see what really matters. Start simple with the Golden Signals, then enrich progressively with business metrics and detailed traces.
In a world where systems keep getting more complex, observability is no longer a luxury: it is a necessity for sustaining quality of service and development velocity.
What will be your first step toward improving the observability of your applications?