Built a comprehensive monitoring and logging solution for Kubernetes clusters using Prometheus, Grafana, and Loki. Implemented custom dashboards and alerting rules that reduced mean time to detection by 70%.
# ConfigMap that carries the Prometheus server configuration file
# (prometheus.yml) with three Kubernetes service-discovery scrape jobs.
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
data:
  prometheus.yml: |
    global:
      scrape_interval: 15s
      evaluation_interval: 15s

    # Alerting/recording rules are loaded from every *.yml file in this directory.
    rule_files:
      - /etc/prometheus/rules/*.yml

    scrape_configs:
      # Kubernetes API server, reached through the "kubernetes" Service in the
      # "default" namespace using the pod's service-account credentials.
      - job_name: 'kubernetes-apiservers'
        kubernetes_sd_configs:
          - role: endpoints
        scheme: https
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        relabel_configs:
          # Keep only the endpoints that back default/kubernetes.
          - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name]
            action: keep
            regex: default;kubernetes

      # Node (kubelet) metrics. The node role targets the kubelet port, which is
      # normally served over HTTPS and requires authentication, so this job uses
      # the same service-account CA and token as the API server job.
      - job_name: 'kubernetes-nodes'
        kubernetes_sd_configs:
          - role: node
        scheme: https
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        relabel_configs:
          # Copy every Kubernetes node label onto the scraped series.
          - action: labelmap
            regex: __meta_kubernetes_node_label_(.+)

      # Pods that opt in with the annotation prometheus.io/scrape: "true".
      - job_name: 'kubernetes-pods'
        kubernetes_sd_configs:
          - role: pod
        relabel_configs:
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
            action: keep
            # Quoted so the value is a string, not a YAML boolean.
            regex: 'true'
{
  "dashboard": {
    "title": "Kubernetes Cluster Overview",
    "panels": [
      {
        "title": "CPU Usage by Namespace",
        "targets": [
          {
            "expr": "sum(rate(container_cpu_usage_seconds_total{container!=\"\"}[5m])) by (namespace)",
            "legendFormat": "{{ namespace }}"
          }
        ],
        "type": "graph"
      },
      {
        "title": "Memory Usage by Pod",
        "targets": [
          {
            "expr": "sum(container_memory_usage_bytes{container!=\"\"}) by (pod)",
            "legendFormat": "{{ pod }}"
          }
        ],
        "type": "graph"
      },
      {
        "title": "Network I/O",
        "targets": [
          {
            "expr": "sum(rate(container_network_receive_bytes_total[5m]))",
            "legendFormat": "Receive"
          },
          {
            "expr": "sum(rate(container_network_transmit_bytes_total[5m]))",
            "legendFormat": "Transmit"
          }
        ],
        "type": "graph"
      }
    ],
    "refresh": "10s",
    "time": {
      "from": "now-1h",
      "to": "now"
    }
  }
}
# Prometheus alerting rules for pod-level CPU, memory and restart problems.
groups:
  - name: kubernetes-alerts
    interval: 30s
    rules:
      # Fires when a pod uses more than 80% of its summed CPU limits.
      # container!="" drops the pod-level cgroup series so usage is not
      # counted twice; grouping by namespace keeps same-named pods apart.
      - alert: HighCPUUsage
        expr: |
          (sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (namespace, pod)
            / sum(container_spec_cpu_quota{container!=""}
                  / container_spec_cpu_period{container!=""}) by (namespace, pod)) > 0.8
        for: 5m
        labels:
          severity: warning
          component: kubernetes
        annotations:
          summary: "High CPU usage detected"
          description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} CPU usage is above 80%"

      # Fires when a pod uses more than 90% of its summed memory limits.
      # Limits of 0 (no limit set) are filtered out so the ratio never
      # divides by zero and reports +Inf.
      - alert: HighMemoryUsage
        expr: |
          (sum(container_memory_usage_bytes{container!=""}) by (namespace, pod)
            / sum(container_spec_memory_limit_bytes{container!=""} > 0) by (namespace, pod)) > 0.9
        for: 5m
        labels:
          severity: critical
          component: kubernetes
        annotations:
          summary: "High memory usage detected"
          description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} memory usage is above 90%"

      # Fires when a container restarted more than 5 times within the last hour.
      # increase() counts restarts over the window; rate() would be a
      # per-second value and could never realistically exceed 5.
      - alert: PodCrashLooping
        expr: increase(kube_pod_container_status_restarts_total[1h]) > 5
        for: 5m
        labels:
          severity: critical
          component: kubernetes
        annotations:
          summary: "Pod is crash looping"
          description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is crash looping"
# values.yaml for kube-prometheus-stack
# Prometheus server settings: retention, persistent storage and pod resources.
prometheus:
  prometheusSpec:
    retention: 30d
    # Persistent volume requested for the Prometheus data directory.
    storageSpec:
      volumeClaimTemplate:
        spec:
          accessModes: ["ReadWriteOnce"]
          resources:
            requests:
              storage: 50Gi
    # CPU/memory requests and limits for the Prometheus container
    # (distinct from the storage request above).
    resources:
      requests:
        memory: 2Gi
        cpu: 1000m
      limits:
        memory: 4Gi
        cpu: 2000m
# Grafana settings: admin credentials, persistence and file-based dashboards.
grafana:
  # Admin credentials are read from a pre-created Secret instead of being
  # committed in plain text. Create it before installing the chart, e.g.:
  #   kubectl create secret generic grafana-admin-credentials \
  #     --namespace monitoring \
  #     --from-literal=admin-user=admin \
  #     --from-literal=admin-password='<choose-a-password>'
  admin:
    existingSecret: grafana-admin-credentials
    userKey: admin-user
    passwordKey: admin-password
  persistence:
    enabled: true
    size: 10Gi
  # Provider that loads dashboards from files into the "Custom" folder.
  dashboardProviders:
    dashboardproviders.yaml:
      apiVersion: 1
      providers:
        - name: 'default'
          folder: 'Custom'
          type: file
          options:
            path: /var/lib/grafana/dashboards
# Loki settings: persistent storage and log retention.
# NOTE(review): this section sits in the kube-prometheus-stack values file,
# but the Loki install command shown in this file does not pass a values
# file - confirm these settings are actually applied to the Loki release.
loki:
  persistence:
    enabled: true
    size: 50Gi
  config:
    table_manager:
      # Delete log data older than 168h (7 days).
      retention_deletes_enabled: true
      retention_period: 168h
# Alertmanager routing: every alert is sent to a single Slack receiver.
alertmanager:
  config:
    route:
      group_by: ['alertname', 'cluster', 'service']
      group_wait: 10s
      group_interval: 10s
      repeat_interval: 12h
      receiver: 'slack-notifications'
    receivers:
      # Helm replaces lists wholesale, so the chart's built-in 'null' receiver
      # would disappear while the chart's default sub-route still refers to
      # it. Keeping it here avoids an "undefined receiver" configuration error.
      - name: 'null'
      - name: 'slack-notifications'
        slack_configs:
          # Placeholder: supply the real webhook URL at deploy time and keep
          # it out of version control.
          - api_url: 'YOUR_SLACK_WEBHOOK_URL'
            channel: '#alerts'
# Installation steps for the observability stack (run from a shell with
# git, helm and kubectl configured for the target cluster).

# Clone the repository
git clone https://github.com/jconover/k8s-observability-stack.git
cd k8s-observability-stack

# Add Prometheus Helm repository
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm repo add grafana https://grafana.github.io/helm-charts
helm repo update

# Create monitoring namespace
kubectl create namespace monitoring

# Install kube-prometheus-stack
helm install kube-prometheus-stack prometheus-community/kube-prometheus-stack \
  --namespace monitoring \
  --values values.yaml

# Install Loki
helm install loki grafana/loki-stack \
  --namespace monitoring \
  --set grafana.enabled=false \
  --set promtail.enabled=true

# Port forward to access Grafana
kubectl port-forward -n monitoring svc/kube-prometheus-stack-grafana 3000:80

# Access Grafana at http://localhost:3000
# Log in as "admin" with the Grafana admin password configured through
# values.yaml (the chart default "prom-operator" applies only when no
# password is configured).