1. Docker 容器化

生产级 Dockerfile

# ─── Stage 1: build ───────────────────────────────────────────
# Compiles the server binary inside the full Go toolchain image. Nothing from
# this stage reaches the final image except what stage 2 copies explicitly.
FROM golang:1.22-alpine AS builder

# git is used by `git describe` below; ca-certificates and tzdata are installed
# here so stage 2 can copy the CA bundle and zoneinfo out of this stage.
RUN apk add --no-cache git ca-certificates tzdata

WORKDIR /app

# Copy only the module files first so the dependency-download layer stays
# cached until go.mod/go.sum change.
COPY go.mod go.sum ./
RUN go mod download

# Copy the sources and compile.
# CGO_ENABLED=0 disables cgo so the binary does not depend on a libc, which the
# scratch image below does not provide. -w -s drop debug/symbol information,
# -X injects the `git describe` output into main.version, and -trimpath removes
# local file system paths from the binary.
COPY . .
RUN CGO_ENABLED=0 GOOS=linux GOARCH=amd64 \
    go build -ldflags="-w -s -X main.version=$(git describe --tags --always)" \
    -trimpath \
    -o server ./cmd/server

# ─── Stage 2: runtime (minimal image) ─────────────────────────
FROM scratch

# Copy only what the binary needs at runtime: the CA bundle (outbound TLS),
# the time zone database, and the compiled server itself.
COPY --from=builder /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/
COPY --from=builder /usr/share/zoneinfo /usr/share/zoneinfo
COPY --from=builder /app/server /server

# Run as a non-root user. Numeric UID:GID is used because this stage copies no
# /etc/passwd into the scratch image, so a user name could not be resolved.
USER 65534:65534

EXPOSE 8080
ENTRYPOINT ["/server"]
💡

多阶段构建的优势

最终镜像只包含二进制文件,无需 Go 编译器。从 ~1GB 的 golang 镜像缩减到 <20MB 的 scratch 镜像,同时消除构建工具带来的安全攻击面。

Docker Compose 本地开发

# Local development stack: the application plus PostgreSQL, Redis, Prometheus
# and Grafana. The top-level `version` key is intentionally omitted: the
# Compose Specification treats it as obsolete and Compose v2 warns about it.
services:
  # Application service
  app:
    build:
      context: .
      target: builder       # development runs on the build-stage image
    ports:
      - "8080:8080"
    environment:
      - DATABASE_URL=postgres://dev:dev@postgres:5432/myapp?sslmode=disable
      - REDIS_URL=redis:6379
      - JWT_SECRET=dev-secret-change-in-production
    volumes:
      - .:/app              # bind-mount the sources (for air hot reload)
    depends_on:
      postgres:
        condition: service_healthy   # wait for the pg_isready healthcheck
      redis:
        condition: service_started
    # NOTE(review): the `builder` stage shown in the Dockerfile does not
    # install `air`; confirm the binary is available in the image.
    command: air            # hot reload with air

  # PostgreSQL
  postgres:
    image: postgres:16-alpine
    environment:
      POSTGRES_DB: myapp
      POSTGRES_USER: dev
      POSTGRES_PASSWORD: dev
    volumes:
      - postgres_data:/var/lib/postgresql/data
    ports:
      - "5432:5432"
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U dev -d myapp"]
      interval: 5s
      timeout: 5s
      retries: 5

  # Redis (append-only persistence enabled)
  redis:
    image: redis:7-alpine
    command: redis-server --appendonly yes
    volumes:
      - redis_data:/data
    ports:
      - "6379:6379"

  # Prometheus monitoring
  prometheus:
    image: prom/prometheus:latest
    volumes:
      # Read-only: the container only needs to read its configuration.
      - ./config/prometheus.yml:/etc/prometheus/prometheus.yml:ro
    ports:
      - "9090:9090"

  # Grafana dashboards
  grafana:
    image: grafana/grafana:latest
    environment:
      - GF_SECURITY_ADMIN_PASSWORD=admin
    volumes:
      - grafana_data:/var/lib/grafana
    ports:
      - "3000:3000"
    depends_on:
      - prometheus

# Named volumes that keep service data across container restarts.
volumes:
  postgres_data:
  redis_data:
  grafana_data:

2. Kubernetes 部署

Deployment + Service

# Deployment running three replicas of myapp in the production namespace.
apiVersion: apps/v1
kind: Deployment
metadata:
  namespace: production
  name: myapp
  labels: {app: myapp}
spec:
  replicas: 3
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxUnavailable: 0   # never drop below the desired count (zero downtime)
      maxSurge: 1
  selector:
    matchLabels: {app: myapp}
  template:
    metadata:
      labels: {app: myapp}
    spec:
      # Graceful shutdown window
      terminationGracePeriodSeconds: 30
      containers:
        - name: myapp
          image: registry.example.com/myapp:v1.2.3
          ports:
            - containerPort: 8080
          # Both settings are read from the myapp-secrets Secret.
          env:
            - name: DATABASE_URL
              valueFrom:
                secretKeyRef: {name: myapp-secrets, key: database-url}
            - name: JWT_SECRET
              valueFrom:
                secretKeyRef: {name: myapp-secrets, key: jwt-secret}
          resources:
            limits:
              memory: 512Mi
              cpu: 500m
            requests:
              memory: 128Mi
              cpu: 100m
          # Readiness probe: decides whether the pod receives traffic
          readinessProbe:
            periodSeconds: 5
            initialDelaySeconds: 5
            httpGet:
              port: 8080
              path: /ready
          # Liveness probe: decides whether the container must be restarted
          livenessProbe:
            periodSeconds: 15
            initialDelaySeconds: 10
            httpGet:
              port: 8080
              path: /health
---
# Cluster-internal Service: port 80 forwards to port 8080 of the pods
# labelled app=myapp.
apiVersion: v1
kind: Service
metadata:
  namespace: production
  name: myapp-svc
spec:
  type: ClusterIP
  selector: {app: myapp}
  ports:
    - targetPort: 8080
      port: 80
---
# Exposes myapp-svc at https://api.example.com through the nginx ingress class,
# with the TLS certificate stored in the myapp-tls Secret.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: myapp-ingress
  # An Ingress can only reference Services in its own namespace, so this must
  # match the namespace of myapp-svc.
  namespace: production
  annotations:
    cert-manager.io/cluster-issuer: letsencrypt-prod
    # ingress-nginx has no `rate-limit` annotation; request rate limiting is
    # configured with limit-rps / limit-rpm / limit-connections.
    # NOTE(review): assumes "100" was meant as requests per second per client
    # IP; switch to limit-rpm if a per-minute limit was intended.
    nginx.ingress.kubernetes.io/limit-rps: "100"
spec:
  ingressClassName: nginx
  tls:
    - hosts:
        - api.example.com
      secretName: myapp-tls
  rules:
    - host: api.example.com
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: myapp-svc
                port:
                  number: 80

HPA 水平自动伸缩

# Scales the myapp Deployment between 2 and 20 replicas based on average CPU
# and memory utilization.
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  namespace: production
  name: myapp-hpa
spec:
  minReplicas: 2
  maxReplicas: 20
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: myapp
  metrics:
    # Scale out when average CPU utilization exceeds 70%
    - type: Resource
      resource:
        name: cpu
        target: {type: Utilization, averageUtilization: 70}
    # ... or when average memory utilization exceeds 80%
    - type: Resource
      resource:
        name: memory
        target: {type: Utilization, averageUtilization: 80}
  behavior:
    scaleDown:
      stabilizationWindowSeconds: 300  # 5-minute window to prevent flapping
    scaleUp:
      stabilizationWindowSeconds: 60   # 1-minute stabilization window
      policies:
        # Add at most 2 pods per 60-second period
        - type: Pods
          periodSeconds: 60
          value: 2

3. CI/CD:GitHub Actions

代码推送
git push
单元测试
go test
代码扫描
golangci-lint
构建镜像
docker build
部署 staging
kubectl apply
发布生产
manual approve
# .github/workflows/ci-cd.yml
name: CI/CD Pipeline

# Runs on pushes to main/develop and on pull requests that target main.
on:
  pull_request:
    branches:
      - main
  push:
    branches:
      - main
      - develop

# Workflow-wide variables used to build the image reference.
env:
  IMAGE_NAME: ${{ github.repository }}
  REGISTRY: ghcr.io

jobs:
  # ─── 测试与代码质量 ─────────────────────────────────────────
  # Lints the code and runs the test suite with the race detector and coverage
  # against a throwaway PostgreSQL service container.
  test:
    runs-on: ubuntu-latest
    services:
      postgres:
        image: postgres:16
        env:
          POSTGRES_PASSWORD: test
          POSTGRES_DB: testdb
        # The job runs directly on the runner, so the container port must be
        # published for the tests to reach the database on localhost.
        ports:
          - "5432:5432"
        options: >-
          --health-cmd pg_isready
          --health-interval 10s
          --health-timeout 5s
          --health-retries 5

    steps:
      - uses: actions/checkout@v4

      - name: 设置 Go 环境
        uses: actions/setup-go@v5
        with:
          go-version: '1.22'
          cache: true          # cache Go modules between runs

      - name: 安装依赖
        run: go mod download

      - name: 运行 linter
        uses: golangci/golangci-lint-action@v6
        with:
          version: latest

      - name: 运行测试
        run: |
          go test -v -race -coverprofile=coverage.out ./...
          go tool cover -func=coverage.out
        env:
          DATABASE_URL: postgres://postgres:test@localhost:5432/testdb?sslmode=disable

      - name: 上传覆盖率报告
        uses: codecov/codecov-action@v4
        with:
          files: coverage.out

  # ─── 构建并推送镜像 ─────────────────────────────────────────
  # Builds the image and pushes it to the container registry. Skipped for pull
  # requests, so only pushes to the tracked branches publish images.
  build:
    needs: test
    runs-on: ubuntu-latest
    if: github.event_name != 'pull_request'
    # Pushing to the registry with GITHUB_TOKEN requires packages: write.
    permissions:
      contents: read
      packages: write
    outputs:
      # `version` is a single bare tag. The previous `tags` output is a
      # multi-line list of fully qualified references and cannot be appended
      # after "<image>:" by downstream jobs.
      image-tag: ${{ steps.meta.outputs.version }}
      image-digest: ${{ steps.build.outputs.digest }}

    steps:
      - uses: actions/checkout@v4

      - name: 登录 Container Registry
        uses: docker/login-action@v3
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: 提取镜像元数据
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
          # The sha tag gets the highest priority so that the `version` output
          # is the immutable short commit SHA rather than the branch name.
          tags: |
            type=sha,prefix=,suffix=,format=short,priority=1000
            type=ref,event=branch

      - name: 构建并推送
        id: build
        uses: docker/build-push-action@v6
        with:
          context: .
          push: true
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
          cache-from: type=gha
          cache-to: type=gha,mode=max

  # ─── 部署到 Staging ─────────────────────────────────────────
  # Rolls the freshly built image out to the staging namespace and waits for
  # the rollout to finish.
  deploy-staging:
    needs: build
    runs-on: ubuntu-latest
    environment: staging

    steps:
      - uses: actions/checkout@v4

      # NOTE(review): no step in this job configures cluster credentials for
      # kubectl; confirm the runner/environment provides a kubeconfig.
      - name: 部署到 Kubernetes
        env:
          # Deploy by digest: it is a single immutable reference, whereas the
          # image-tag output is not guaranteed to be one bare tag.
          # NOTE(review): image names must be lowercase; confirm that
          # github.repository contains no uppercase characters.
          IMAGE: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}@${{ needs.build.outputs.image-digest }}
        run: |
          kubectl set image deployment/myapp \
            "myapp=${IMAGE}" \
            -n staging
          kubectl rollout status deployment/myapp -n staging --timeout=3m

  # ─── 部署到生产 (需人工审批) ─────────────────────────────────
  # Promotes the image to production after staging succeeded. Only runs for
  # the main branch.
  deploy-production:
    needs: [build, deploy-staging]
    runs-on: ubuntu-latest
    environment: production   # configured with required reviewers
    if: github.ref == 'refs/heads/main'

    steps:
      # NOTE(review): `kubectl set image` triggers the Deployment's own update
      # strategy rather than a blue-green switch, despite the step name.
      # NOTE(review): no step in this job configures cluster credentials for
      # kubectl; confirm the runner/environment provides a kubeconfig.
      - name: 蓝绿部署
        env:
          # Deploy by digest: it is a single immutable reference, whereas the
          # image-tag output is not guaranteed to be one bare tag.
          IMAGE: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}@${{ needs.build.outputs.image-digest }}
        run: |
          kubectl set image deployment/myapp \
            "myapp=${IMAGE}" \
            -n production
          kubectl rollout status deployment/myapp -n production --timeout=5m

4. Prometheus + Grafana 监控

package metrics

import (
    "net/http"
    "strconv"
    "time"
    "github.com/prometheus/client_golang/prometheus"
    "github.com/prometheus/client_golang/prometheus/promauto"
    "github.com/prometheus/client_golang/prometheus/promhttp"
    "github.com/gin-gonic/gin"
)

// Metric collectors for this package, created through promauto.

// httpRequestsTotal counts HTTP requests, labelled by method, route path and
// response status.
var httpRequestsTotal = promauto.NewCounterVec(
    prometheus.CounterOpts{Name: "http_requests_total", Help: "HTTP 请求总数"},
    []string{"method", "path", "status"},
)

// httpRequestDuration is a histogram of request handling time in seconds,
// labelled by method and route path.
var httpRequestDuration = promauto.NewHistogramVec(
    prometheus.HistogramOpts{
        Name:    "http_request_duration_seconds",
        Help:    "HTTP 请求处理时长",
        Buckets: []float64{.005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10},
    },
    []string{"method", "path"},
)

// activeConnections is a gauge that PrometheusMiddleware increments when a
// request enters and decrements when it leaves.
var activeConnections = promauto.NewGauge(
    prometheus.GaugeOpts{Name: "active_connections", Help: "当前活跃连接数"},
)

// userRegistrations is a business metric counting user registrations.
var userRegistrations = promauto.NewCounter(
    prometheus.CounterOpts{Name: "user_registrations_total", Help: "用户注册总数"},
)

// PrometheusMiddleware returns a Gin handler that instruments every request:
// it tracks the in-flight gauge while the request is being handled and, once
// the remaining handlers have run, records the request count and duration.
//
// The route label is the matched route pattern from c.FullPath(); when that is
// empty the label is "unknown".
func PrometheusMiddleware() gin.HandlerFunc {
    return func(c *gin.Context) {
        route := c.FullPath()
        if route == "" {
            route = "unknown"
        }

        begin := time.Now()
        activeConnections.Inc()
        defer activeConnections.Dec()

        // Hand over to the rest of the handler chain.
        c.Next()

        elapsed := time.Since(begin).Seconds()
        method := c.Request.Method
        code := strconv.Itoa(c.Writer.Status())

        httpRequestsTotal.WithLabelValues(method, route, code).Inc()
        httpRequestDuration.WithLabelValues(method, route).Observe(elapsed)
    }
}

// RegisterMetrics mounts the Prometheus HTTP handler on GET /metrics of the
// given Gin engine.
func RegisterMetrics(r *gin.Engine) {
    metricsHandler := gin.WrapH(promhttp.Handler())
    r.GET("/metrics", metricsHandler)
}

// RecordUserRegistration increments the user_registrations_total counter by
// one. Business code calls it to record a registration.
func RecordUserRegistration() {
    userRegistrations.Inc()
}
# config/prometheus.yml
# Scrapes the application's /metrics endpoint every 15 seconds.
global:
  scrape_interval: 15s

scrape_configs:
  - job_name: myapp
    metrics_path: /metrics
    static_configs:
      - targets:
          - 'app:8080'
🎯

关键监控指标 (黄金信号)

① 延迟:P50/P95/P99 请求延迟  ② 流量:每秒请求数 (RPS)  ③ 错误率:5xx 错误率  ④ 饱和度:CPU/内存使用率。这四项是判断服务健康的核心指标。