feat: 新增数据库持久化模块（Persistence），实现 Cursor + Retry 双层架构

## 核心功能 ### 1. 数据库持久化支持 - 新增完整的 Persistence 模块 (api/persistence/) - 支持三种持久化策略： * StrategyDBOnly - 仅落库，不存证 * StrategyDBAndTrustlog - 既落库又存证（推荐） * StrategyTrustlogOnly - 仅存证，不落库 - 支持多数据库：PostgreSQL, MySQL, SQLite ### 2. Cursor + Retry 双层架构 - CursorWorker：第一道防线，快速发现新记录并尝试存证 * 增量扫描 operation 表（基于时间戳游标） * 默认 10 秒扫描间隔，批量处理 100 条 * 成功更新状态，失败转入重试队列 - RetryWorker：第二道防线，处理失败记录 * 指数退避重试（1m → 2m → 4m → 8m → 16m） * 默认最多重试 5 次 * 超限自动标记为死信 ### 3. 数据库表设计 - operation 表：存储操作记录，支持可空 IP 字段 - trustlog_cursor 表：Key-Value 模式，支持多游标 - trustlog_retry 表：重试队列，支持指数退避 ### 4. 异步最终一致性 - 应用调用立即返回（仅落库） - CursorWorker 异步扫描并存证 - RetryWorker 保障失败重试 - 完整的监控和死信处理机制 ## 修改文件 ### 核心代码（11个文件） - api/persistence/cursor_worker.go - Cursor 工作器（新增） - api/persistence/repository.go - 数据仓储层（新增） - api/persistence/schema.go - 数据库 Schema（新增） - api/persistence/strategy.go - 策略管理器（新增） - api/persistence/client.go - 客户端封装（新增） - api/persistence/retry_worker.go - Retry 工作器（新增） - api/persistence/config.go - 配置管理（新增） ### 修复内部包引用（5个文件） - api/adapter/publisher.go - 修复 internal 包引用 - api/adapter/subscriber.go - 修复 internal 包引用 - api/model/envelope.go - 修复 internal 包引用 - api/model/operation.go - 修复 internal 包引用 - api/model/record.go - 修复 internal 包引用 ### 单元测试（8个文件） - api/persistence/*_test.go - 完整的单元测试 - 测试覆盖率：28.5% - 测试通过率：49/49 (100%) ### SQL 脚本（4个文件） - api/persistence/sql/postgresql.sql - PostgreSQL 建表脚本 - api/persistence/sql/mysql.sql - MySQL 建表脚本 - api/persistence/sql/sqlite.sql - SQLite 建表脚本 - api/persistence/sql/test_data.sql - 测试数据 ### 文档（2个文件） - README.md - 更新主文档，新增 Persistence 使用指南 - api/persistence/README.md - 完整的 Persistence 文档 - api/persistence/sql/README.md - SQL 脚本说明 ## 技术亮点 1. **充分利用 Cursor 游标表** - 作为任务发现队列，非简单的位置记录 - Key-Value 模式，支持多游标并发扫描 - 时间戳天然有序，增量扫描高效 2. **双层保障机制** - Cursor：正常流程，快速处理 - Retry：异常流程，可靠重试 - 职责分离，监控清晰 3. **可空 IP 字段支持** - ClientIP 和 ServerIP 使用 *string 类型 - 支持 NULL 值，符合数据库最佳实践 - 使用 sql.NullString 正确处理 4. **完整的监控支持** - 未存证记录数监控 - Cursor 延迟监控 - 重试队列长度监控 - 死信队列监控 ## 测试结果 - ✅ 单元测试：49/49 通过 (100%) - ✅ 代码覆盖率：28.5% - ✅ 编译状态：无错误 - ✅ 支持数据库：PostgreSQL, MySQL, SQLite ## Breaking Changes 无破坏性变更。Persistence 模块作为可选功能，不影响现有代码。 ## 版本信息 - 版本：v2.1.0 - Go 版本要求：1.21+ - 更新日期：2025-12-23
2025-12-23 18:59:43 +08:00
parent d313449c5c
commit 88f80ffa5e
31 changed files with 6551 additions and 36 deletions
--- a/api/persistence/cursor_worker.go
+++ b/api/persistence/cursor_worker.go
@@ -0,0 +1,387 @@
+package persistence
+
+import (
+	"context"
+	"database/sql"
+	"fmt"
+	"time"
+
+	"go.yandata.net/iod/iod/go-trustlog/api/logger"
+	"go.yandata.net/iod/iod/go-trustlog/api/model"
+)
+
+// OperationRecord 操作记录（包含数据库扩展字段）
+type OperationRecord struct {
+	OpID             string
+	OpActor          string
+	DOID             string
+	ProducerID       string
+	RequestBodyHash  string
+	ResponseBodyHash string
+	OpHash           string
+	Sign             string
+	OpSource         string
+	OpType           string
+	DOPrefix         string
+	DORepository     string
+	ClientIP         *string
+	ServerIP         *string
+	TrustlogStatus   string
+	CreatedAt        time.Time
+}
+
+// ToModel 转换为 model.Operation
+func (r *OperationRecord) ToModel() *model.Operation {
+	return &model.Operation{
+		OpID:             r.OpID,
+		OpActor:          r.OpActor,
+		Doid:             r.DOID,
+		ProducerID:       r.ProducerID,
+		RequestBodyHash:  &r.RequestBodyHash,
+		ResponseBodyHash: &r.ResponseBodyHash,
+		OpSource:         model.Source(r.OpSource),
+		OpType:           model.Type(r.OpType),
+		DoPrefix:         r.DOPrefix,
+		DoRepository:     r.DORepository,
+		ClientIP:         r.ClientIP,
+		ServerIP:         r.ServerIP,
+	}
+}
+
+// CursorWorkerConfig Cursor工作器配置
+type CursorWorkerConfig struct {
+	// ScanInterval 扫描间隔（默认10秒，快速发现新记录）
+	ScanInterval time.Duration
+	// BatchSize 批量处理大小（默认100）
+	BatchSize int
+	// CursorKey Cursor键（默认 "operation_scan"）
+	CursorKey string
+	// MaxRetryAttempt Cursor阶段最大重试次数（默认1，快速失败转入Retry）
+	MaxRetryAttempt int
+	// Enabled 是否启用Cursor工作器（默认启用）
+	Enabled bool
+}
+
+// DefaultCursorWorkerConfig 默认Cursor工作器配置
+func DefaultCursorWorkerConfig() CursorWorkerConfig {
+	return CursorWorkerConfig{
+		ScanInterval:    10 * time.Second,
+		BatchSize:       100,
+		CursorKey:       "operation_scan",
+		MaxRetryAttempt: 1,
+		Enabled:         true,
+	}
+}
+
+// CursorWorker Cursor工作器（任务发现）
+// 职责：扫描operation表，发现新的待存证记录，尝试存证
+// 成功则更新状态，失败则加入重试表
+type CursorWorker struct {
+	config  CursorWorkerConfig
+	manager *PersistenceManager
+	logger  logger.Logger
+	stopCh  chan struct{}
+}
+
+// NewCursorWorker 创建Cursor工作器
+func NewCursorWorker(config CursorWorkerConfig, manager *PersistenceManager) *CursorWorker {
+	if config.ScanInterval == 0 {
+		config.ScanInterval = 10 * time.Second
+	}
+	if config.BatchSize == 0 {
+		config.BatchSize = 100
+	}
+	if config.CursorKey == "" {
+		config.CursorKey = "operation_scan"
+	}
+	if config.MaxRetryAttempt == 0 {
+		config.MaxRetryAttempt = 1
+	}
+
+	return &CursorWorker{
+		config:  config,
+		manager: manager,
+		logger:  manager.logger,
+		stopCh:  make(chan struct{}),
+	}
+}
+
+// Start 启动Cursor工作器
+func (w *CursorWorker) Start(ctx context.Context) error {
+	if !w.config.Enabled {
+		w.logger.InfoContext(ctx, "cursor worker disabled, skipping start")
+		return nil
+	}
+
+	w.logger.InfoContext(ctx, "starting cursor worker",
+		"scanInterval", w.config.ScanInterval,
+		"batchSize", w.config.BatchSize,
+		"cursorKey", w.config.CursorKey,
+	)
+
+	// 初始化cursor（如果不存在）
+	if err := w.initCursor(ctx); err != nil {
+		return fmt.Errorf("failed to init cursor: %w", err)
+	}
+
+	// 启动定期扫描
+	go w.run(ctx)
+
+	return nil
+}
+
+// Stop 停止Cursor工作器
+func (w *CursorWorker) Stop(ctx context.Context) error {
+	w.logger.InfoContext(ctx, "stopping cursor worker")
+	close(w.stopCh)
+	return nil
+}
+
+// run 运行循环
+func (w *CursorWorker) run(ctx context.Context) {
+	ticker := time.NewTicker(w.config.ScanInterval)
+	defer ticker.Stop()
+
+	for {
+		select {
+		case <-w.stopCh:
+			w.logger.InfoContext(ctx, "cursor worker stopped")
+			return
+		case <-ticker.C:
+			w.scan(ctx)
+		}
+	}
+}
+
+// scan 扫描并处理未存证记录
+func (w *CursorWorker) scan(ctx context.Context) {
+	w.logger.DebugContext(ctx, "cursor worker scanning",
+		"cursorKey", w.config.CursorKey,
+	)
+
+	// 1. 读取cursor
+	cursor, err := w.getCursor(ctx)
+	if err != nil {
+		w.logger.ErrorContext(ctx, "failed to get cursor",
+			"error", err,
+		)
+		return
+	}
+
+	w.logger.DebugContext(ctx, "cursor position",
+		"cursor", cursor,
+	)
+
+	// 2. 扫描新记录
+	operations, err := w.findNewOperations(ctx, cursor)
+	if err != nil {
+		w.logger.ErrorContext(ctx, "failed to find new operations",
+			"error", err,
+		)
+		return
+	}
+
+	if len(operations) == 0 {
+		w.logger.DebugContext(ctx, "no new operations found")
+		return
+	}
+
+	w.logger.InfoContext(ctx, "found new operations",
+		"count", len(operations),
+	)
+
+	// 3. 处理每条记录
+	for _, op := range operations {
+		w.processOperation(ctx, op)
+	}
+}
+
+// initCursor 初始化cursor
+func (w *CursorWorker) initCursor(ctx context.Context) error {
+	cursorRepo := w.manager.GetCursorRepo()
+
+	// 创建初始cursor（使用当前时间）
+	now := time.Now().Format(time.RFC3339Nano)
+	err := cursorRepo.InitCursor(ctx, w.config.CursorKey, now)
+	if err != nil {
+		return fmt.Errorf("failed to init cursor: %w", err)
+	}
+
+	w.logger.InfoContext(ctx, "cursor initialized",
+		"cursorKey", w.config.CursorKey,
+		"initialValue", now,
+	)
+
+	return nil
+}
+
+// getCursor 获取cursor值
+func (w *CursorWorker) getCursor(ctx context.Context) (string, error) {
+	cursorRepo := w.manager.GetCursorRepo()
+
+	cursor, err := cursorRepo.GetCursor(ctx, w.config.CursorKey)
+	if err != nil {
+		return "", fmt.Errorf("failed to get cursor: %w", err)
+	}
+
+	// 如果cursor为空，使用一个很早的时间
+	if cursor == "" {
+		cursor = time.Time{}.Format(time.RFC3339Nano)
+	}
+
+	return cursor, nil
+}
+
+// updateCursor 更新cursor值
+func (w *CursorWorker) updateCursor(ctx context.Context, value string) error {
+	cursorRepo := w.manager.GetCursorRepo()
+
+	err := cursorRepo.UpdateCursor(ctx, w.config.CursorKey, value)
+	if err != nil {
+		return fmt.Errorf("failed to update cursor: %w", err)
+	}
+
+	w.logger.DebugContext(ctx, "cursor updated",
+		"cursorKey", w.config.CursorKey,
+		"newValue", value,
+	)
+
+	return nil
+}
+
+// findNewOperations 查找新的待存证记录
+func (w *CursorWorker) findNewOperations(ctx context.Context, cursor string) ([]*OperationRecord, error) {
+	db := w.manager.db
+
+	// 查询未存证的记录（created_at > cursor）
+	rows, err := db.QueryContext(ctx, `
+		SELECT op_id, op_actor, doid, producer_id, 
+		       request_body_hash, response_body_hash, op_hash, sign,
+		       op_source, op_type, do_prefix, do_repository,
+		       client_ip, server_ip, trustlog_status, created_at
+		FROM operation
+		WHERE trustlog_status = $1
+		  AND created_at > $2
+		ORDER BY created_at ASC
+		LIMIT $3
+	`, StatusNotTrustlogged, cursor, w.config.BatchSize)
+	if err != nil {
+		return nil, fmt.Errorf("failed to query operations: %w", err)
+	}
+	defer rows.Close()
+
+	var operations []*OperationRecord
+	for rows.Next() {
+		op := &OperationRecord{}
+		var clientIP, serverIP sql.NullString
+		var createdAt time.Time
+
+		err := rows.Scan(
+			&op.OpID, &op.OpActor, &op.DOID, &op.ProducerID,
+			&op.RequestBodyHash, &op.ResponseBodyHash, &op.OpHash, &op.Sign,
+			&op.OpSource, &op.OpType, &op.DOPrefix, &op.DORepository,
+			&clientIP, &serverIP, &op.TrustlogStatus, &createdAt,
+		)
+		if err != nil {
+			return nil, fmt.Errorf("failed to scan operation: %w", err)
+		}
+
+		// 处理可空字段
+		if clientIP.Valid {
+			op.ClientIP = &clientIP.String
+		}
+		if serverIP.Valid {
+			op.ServerIP = &serverIP.String
+		}
+		op.CreatedAt = createdAt
+
+		operations = append(operations, op)
+	}
+
+	return operations, nil
+}
+
+// processOperation 处理单条记录
+func (w *CursorWorker) processOperation(ctx context.Context, op *OperationRecord) {
+	w.logger.DebugContext(ctx, "processing operation",
+		"opID", op.OpID,
+	)
+
+	// 尝试存证（最多重试 MaxRetryAttempt 次）
+	var lastErr error
+	for attempt := 0; attempt <= w.config.MaxRetryAttempt; attempt++ {
+		if attempt > 0 {
+			w.logger.DebugContext(ctx, "retrying trustlog",
+				"opID", op.OpID,
+				"attempt", attempt,
+			)
+		}
+
+		err := w.tryTrustlog(ctx, op)
+		if err == nil {
+			// 成功：更新状态
+			if err := w.updateOperationStatus(ctx, op.OpID, StatusTrustlogged); err != nil {
+				w.logger.ErrorContext(ctx, "failed to update operation status",
+					"opID", op.OpID,
+					"error", err,
+				)
+			} else {
+				w.logger.InfoContext(ctx, "operation trustlogged successfully",
+					"opID", op.OpID,
+				)
+			}
+
+			// 更新cursor
+			w.updateCursor(ctx, op.CreatedAt.Format(time.RFC3339Nano))
+			return
+		}
+
+		lastErr = err
+		if attempt < w.config.MaxRetryAttempt {
+			time.Sleep(time.Second) // 简单的重试延迟
+		}
+	}
+
+	// 失败：加入重试表
+	w.logger.WarnContext(ctx, "failed to trustlog in cursor worker, adding to retry queue",
+		"opID", op.OpID,
+		"error", lastErr,
+	)
+
+	retryRepo := w.manager.GetRetryRepo()
+	nextRetryAt := time.Now().Add(1 * time.Minute) // 1分钟后重试
+	if err := retryRepo.AddRetry(ctx, op.OpID, lastErr.Error(), nextRetryAt); err != nil {
+		w.logger.ErrorContext(ctx, "failed to add to retry queue",
+			"opID", op.OpID,
+			"error", err,
+		)
+	}
+
+	// 即使失败也更新cursor（避免卡在同一条记录）
+	w.updateCursor(ctx, op.CreatedAt.Format(time.RFC3339Nano))
+}
+
+// tryTrustlog 尝试存证（调用存证系统）
+func (w *CursorWorker) tryTrustlog(ctx context.Context, op *OperationRecord) error {
+	publisher := w.manager.GetPublisher()
+	if publisher == nil {
+		return fmt.Errorf("publisher not available")
+	}
+
+	// 转换为 Operation 模型
+	modelOp := op.ToModel()
+
+	// 调用存证
+	if err := publisher.Publish(ctx, modelOp); err != nil {
+		return fmt.Errorf("failed to publish to trustlog: %w", err)
+	}
+
+	return nil
+}
+
+// updateOperationStatus 更新操作状态
+func (w *CursorWorker) updateOperationStatus(ctx context.Context, opID string, status TrustlogStatus) error {
+	opRepo := w.manager.GetOperationRepo()
+	return opRepo.UpdateStatus(ctx, opID, status)
+}
+