feat: 完善数据库持久化与存证功能

主要更新：

1. 数据库持久化功能
   - 支持三种策略：仅落库、既落库又存证、仅存证
   - 实现 Cursor Worker 异步扫描和存证机制
   - 实现 Retry Worker 失败重试机制
   - 支持 PostgreSQL、MySQL、SQLite 等多种数据库
   - 添加 ClientIP 和 ServerIP 字段（可空，仅落库）

2. 集群并发安全
   - 使用 SELECT FOR UPDATE SKIP LOCKED 防止重复处理
   - 实现 CAS (Compare-And-Set) 原子状态更新
   - 添加 updated_at 字段支持并发控制

3. Cursor 初始化优化
   - 自动基于历史数据初始化 cursor
   - 确保不遗漏任何历史记录
   - 修复 UPSERT 逻辑

4. 测试完善
   - 添加 E2E 集成测试（含 Pulsar 消费者验证）
   - 添加 PostgreSQL 集成测试
   - 添加 Pulsar 集成测试
   - 添加集群并发安全测试
   - 添加 Cursor 初始化验证测试
   - 补充大量单元测试，提升覆盖率

5. 工具脚本
   - 添加数据库迁移脚本
   - 添加 Cursor 状态检查工具
   - 添加 Cursor 初始化工具
   - 添加 Pulsar 消息验证工具

6. 文档清理
   - 删除冗余文档，只保留根目录 README

测试结果：
- 所有 E2E 测试通过（100%）
- 数据库持久化与异步存证流程验证通过
- 集群环境下的并发安全性验证通过
- Cursor 自动初始化和历史数据处理验证通过
2025-12-24 15:31:11 +08:00
parent 88f80ffa5e
commit 4b72a37120
60 changed files with 6160 additions and 1313 deletions
--- a/api/persistence/repository.go
+++ b/api/persistence/repository.go
@@ -24,6 +24,14 @@ type OperationRepository interface {
 	FindByID(ctx context.Context, opID string) (*model.Operation, TrustlogStatus, error)
 	// FindUntrustlogged 查询未存证的操作记录（用于重试机制）
 	FindUntrustlogged(ctx context.Context, limit int) ([]*model.Operation, error)
+	// FindUntrustloggedWithLock finds operations that have not been trustlogged yet (intended for cluster-safe concurrent use).
+	// It relies on SELECT FOR UPDATE SKIP LOCKED so that multiple workers do not process the same records.
+	// Returns: operations, opIDs, error
+	FindUntrustloggedWithLock(ctx context.Context, tx *sql.Tx, limit int) ([]*model.Operation, []string, error)
+	// UpdateStatusWithCAS updates the status using CAS (Compare-And-Set).
+	// The update is applied only when the current status matches expectedStatus.
+	// Returns: updated (whether the update took effect), error
+	UpdateStatusWithCAS(ctx context.Context, tx *sql.Tx, opID string, expectedStatus, newStatus TrustlogStatus) (bool, error)
 }

 // CursorRepository 游标仓储接口（Key-Value 模式）
@@ -68,31 +76,73 @@ type RetryRecord struct {

 // operationRepository 操作记录仓储实现
 type operationRepository struct {
-	db     *sql.DB
-	logger logger.Logger
+	db         *sql.DB
+	logger     logger.Logger
+	driverName string
+}
+
+// detectDriverName probes db to guess the SQL dialect; it returns "postgres" or, failing that, "sqlite3".
+func detectDriverName(db *sql.DB) string {
+	const pgBanner = "PostgreSQL"
+	if db != nil {
+		// Probe with version(): the call must succeed and its result must start with pgBanner.
+		var banner string
+		probeErr := db.QueryRow("SELECT version()").Scan(&banner)
+		if probeErr == nil && len(banner) >= len(pgBanner) && banner[:len(pgBanner)] == pgBanner {
+			return "postgres"
+		}
+	}
+	return "sqlite3" // default: nil handle, failed probe, or any non-PostgreSQL banner
+}
+
+// convertPlaceholdersForDriver rewrites ? placeholders for driverName: "postgres" gets $1, $2, ... in order of
+// appearance (every ? counts, even inside a SQL literal); any other driver (SQLite, MySQL) gets query unchanged.
+func convertPlaceholdersForDriver(query, driverName string) string {
+	if driverName != "postgres" {
+		return query
+	}
+	// Append raw bytes: string(query[i]) would re-encode bytes >= 0x80 as runes and corrupt UTF-8 text.
+	out := make([]byte, 0, len(query)+8)
+	next := 1
+	for i := 0; i < len(query); i++ {
+		if query[i] != '?' {
+			out = append(out, query[i])
+			continue
+		}
+		out = append(out, fmt.Sprintf("$%d", next)...)
+		next++
+	}
+	return string(out)
 }

 // NewOperationRepository 创建操作记录仓储
 func NewOperationRepository(db *sql.DB, log logger.Logger) OperationRepository {
+	driverName := detectDriverName(db)
 	return &operationRepository{
-		db:     db,
-		logger: log,
+		db:         db,
+		logger:     log,
+		driverName: driverName,
 	}
 }

+// convertPlaceholders converts ? placeholders into the form expected by this repository's driverName.
+func (r *operationRepository) convertPlaceholders(query string) string {
+	return convertPlaceholdersForDriver(query, r.driverName)
+}
+
 func (r *operationRepository) Save(ctx context.Context, op *model.Operation, status TrustlogStatus) error {
 	return r.SaveTx(ctx, nil, op, status)
 }

 func (r *operationRepository) SaveTx(ctx context.Context, tx *sql.Tx, op *model.Operation, status TrustlogStatus) error {
-	query := `
+	query := r.convertPlaceholders(`
 		INSERT INTO operation (
 			op_id, op_actor, doid, producer_id, 
 			request_body_hash, response_body_hash, 
 			op_source, op_type, do_prefix, do_repository,
 			client_ip, server_ip, trustlog_status, timestamp
 		) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
-	`
+	`)

 	var reqHash, respHash, clientIP, serverIP sql.NullString
 	if op.RequestBodyHash != nil {
@@ -152,7 +202,7 @@ func (r *operationRepository) UpdateStatus(ctx context.Context, opID string, sta
 }

 func (r *operationRepository) UpdateStatusTx(ctx context.Context, tx *sql.Tx, opID string, status TrustlogStatus) error {
-	query := `UPDATE operation SET trustlog_status = ? WHERE op_id = ?`
+	query := r.convertPlaceholders(`UPDATE operation SET trustlog_status = ? WHERE op_id = ?`)

 	var err error
 	if tx != nil {
@@ -178,7 +228,7 @@ func (r *operationRepository) UpdateStatusTx(ctx context.Context, tx *sql.Tx, op
 }

 func (r *operationRepository) FindByID(ctx context.Context, opID string) (*model.Operation, TrustlogStatus, error) {
-	query := `
+	query := r.convertPlaceholders(`
 		SELECT 
 			op_id, op_actor, doid, producer_id,
 			request_body_hash, response_body_hash,
@@ -186,7 +236,7 @@ func (r *operationRepository) FindByID(ctx context.Context, opID string) (*model
 			client_ip, server_ip, trustlog_status, timestamp
 		FROM operation
 		WHERE op_id = ?
-	`
+	`)

 	var op model.Operation
 	var statusStr string
@@ -236,8 +286,12 @@ func (r *operationRepository) FindByID(ctx context.Context, opID string) (*model
 	return &op, TrustlogStatus(statusStr), nil
 }

-func (r *operationRepository) FindUntrustlogged(ctx context.Context, limit int) ([]*model.Operation, error) {
-	query := `
+// FindUntrustloggedWithLock 查找未存证的操作（支持集群并发安全）
+// 使用 SELECT FOR UPDATE SKIP LOCKED 确保多个 worker 不会处理相同的记录
+func (r *operationRepository) FindUntrustloggedWithLock(ctx context.Context, tx *sql.Tx, limit int) ([]*model.Operation, []string, error) {
+	// 使用 FOR UPDATE SKIP LOCKED 锁定记录
+	// SKIP LOCKED: 跳过已被其他事务锁定的行，避免等待
+	query := r.convertPlaceholders(`
 		SELECT 
 			op_id, op_actor, doid, producer_id,
 			request_body_hash, response_body_hash,
@@ -247,7 +301,142 @@ func (r *operationRepository) FindUntrustlogged(ctx context.Context, limit int)
 		WHERE trustlog_status = ?
 		ORDER BY timestamp ASC
 		LIMIT ?
-	`
+		FOR UPDATE SKIP LOCKED
+	`)
+
+	var rows *sql.Rows
+	var err error
+	if tx != nil {
+		rows, err = tx.QueryContext(ctx, query, string(StatusNotTrustlogged), limit)
+	} else {
+		rows, err = r.db.QueryContext(ctx, query, string(StatusNotTrustlogged), limit)
+	}
+
+	if err != nil {
+		r.logger.ErrorContext(ctx, "failed to find untrustlogged operations with lock",
+			"error", err,
+		)
+		return nil, nil, fmt.Errorf("failed to find untrustlogged operations: %w", err)
+	}
+	defer rows.Close()
+
+	var operations []*model.Operation
+	var opIDs []string
+	for rows.Next() {
+		var op model.Operation
+		var reqHash, respHash, clientIP, serverIP sql.NullString
+
+		err := rows.Scan(
+			&op.OpID,
+			&op.OpActor,
+			&op.Doid,
+			&op.ProducerID,
+			&reqHash,
+			&respHash,
+			&op.OpSource,
+			&op.OpType,
+			&op.DoPrefix,
+			&op.DoRepository,
+			&clientIP,
+			&serverIP,
+			&op.Timestamp,
+		)
+		if err != nil {
+			r.logger.ErrorContext(ctx, "failed to scan operation",
+				"error", err,
+			)
+			continue
+		}
+
+		if reqHash.Valid {
+			op.RequestBodyHash = &reqHash.String
+		}
+		if respHash.Valid {
+			op.ResponseBodyHash = &respHash.String
+		}
+		if clientIP.Valid {
+			op.ClientIP = &clientIP.String
+		}
+		if serverIP.Valid {
+			op.ServerIP = &serverIP.String
+		}
+
+		operations = append(operations, &op)
+		opIDs = append(opIDs, op.OpID)
+	}
+
+	if err := rows.Err(); err != nil {
+		r.logger.ErrorContext(ctx, "error iterating rows",
+			"error", err,
+		)
+		return nil, nil, fmt.Errorf("error iterating rows: %w", err)
+	}
+
+	return operations, opIDs, nil
+}
+
+// UpdateStatusWithCAS updates trustlog_status using CAS (Compare-And-Set), running on tx when it is non-nil and on r.db otherwise.
+// The row is updated only if its current status equals expectedStatus; it returns true when a row was updated and false with a nil error when none matched.
+func (r *operationRepository) UpdateStatusWithCAS(ctx context.Context, tx *sql.Tx, opID string, expectedStatus, newStatus TrustlogStatus) (bool, error) {
+	query := r.convertPlaceholders(`
+		UPDATE operation 
+		SET trustlog_status = ?
+		WHERE op_id = ? AND trustlog_status = ?
+	`)
+
+	var result sql.Result
+	var err error
+
+	if tx != nil {
+		result, err = tx.ExecContext(ctx, query, string(newStatus), opID, string(expectedStatus))
+	} else {
+		result, err = r.db.ExecContext(ctx, query, string(newStatus), opID, string(expectedStatus))
+	}
+
+	if err != nil {
+		r.logger.ErrorContext(ctx, "failed to update operation status with CAS",
+			"opID", opID,
+			"expectedStatus", expectedStatus,
+			"newStatus", newStatus,
+			"error", err,
+		)
+		return false, fmt.Errorf("failed to update operation status: %w", err)
+	}
+
+	rowsAffected, err := result.RowsAffected()
+	if err != nil {
+		return false, fmt.Errorf("failed to get rows affected: %w", err)
+	}
+
+	// Zero affected rows means no row matched both op_id and expectedStatus, e.g. another worker already changed the status.
+	if rowsAffected == 0 {
+		r.logger.WarnContext(ctx, "CAS update failed: status already changed by another worker",
+			"opID", opID,
+			"expectedStatus", expectedStatus,
+		)
+		return false, nil
+	}
+
+	r.logger.DebugContext(ctx, "operation status updated with CAS",
+		"opID", opID,
+		"expectedStatus", expectedStatus,
+		"newStatus", newStatus,
+	)
+	return true, nil
+}
+
+func (r *operationRepository) FindUntrustlogged(ctx context.Context, limit int) ([]*model.Operation, error) {
+	query := r.convertPlaceholders(`
+		SELECT 
+			op_id, op_actor, doid, producer_id,
+			request_body_hash, response_body_hash,
+			op_source, op_type, do_prefix, do_repository,
+			client_ip, server_ip, timestamp
+		FROM operation
+		WHERE trustlog_status = ?
+		ORDER BY timestamp ASC
+		LIMIT ?
+	`)

 	rows, err := r.db.QueryContext(ctx, query, string(StatusNotTrustlogged), limit)
 	if err != nil {
@@ -310,21 +499,29 @@ func (r *operationRepository) FindUntrustlogged(ctx context.Context, limit int)

 // cursorRepository 游标仓储实现
 type cursorRepository struct {
-	db     *sql.DB
-	logger logger.Logger
+	db         *sql.DB
+	logger     logger.Logger
+	driverName string
 }

 // NewCursorRepository 创建游标仓储
 func NewCursorRepository(db *sql.DB, log logger.Logger) CursorRepository {
+	driverName := detectDriverName(db)
 	return &cursorRepository{
-		db:     db,
-		logger: log,
+		db:         db,
+		logger:     log,
+		driverName: driverName,
 	}
 }

+// convertPlaceholders converts ? placeholders into the form expected by this repository's driverName.
+func (r *cursorRepository) convertPlaceholders(query string) string {
+	return convertPlaceholdersForDriver(query, r.driverName)
+}
+
 // GetCursor 获取游标值（Key-Value 模式）
 func (r *cursorRepository) GetCursor(ctx context.Context, cursorKey string) (string, error) {
-	query := `SELECT cursor_value FROM trustlog_cursor WHERE cursor_key = ?`
+	query := r.convertPlaceholders(`SELECT cursor_value FROM trustlog_cursor WHERE cursor_key = ?`)

 	var cursorValue string
 	err := r.db.QueryRowContext(ctx, query, cursorKey).Scan(&cursorValue)
@@ -353,13 +550,13 @@ func (r *cursorRepository) UpdateCursor(ctx context.Context, cursorKey string, c
 // UpdateCursorTx 在事务中更新游标值（使用 UPSERT）
 func (r *cursorRepository) UpdateCursorTx(ctx context.Context, tx *sql.Tx, cursorKey string, cursorValue string) error {
 	// 使用 UPSERT 语法（适配不同数据库）
-	query := `
+	query := r.convertPlaceholders(`
 		INSERT INTO trustlog_cursor (cursor_key, cursor_value, last_updated_at)
 		VALUES (?, ?, ?)
 		ON CONFLICT (cursor_key) DO UPDATE SET
 			cursor_value = excluded.cursor_value,
 			last_updated_at = excluded.last_updated_at
-	`
+	`)

 	var err error
 	now := time.Now()
@@ -386,13 +583,19 @@ func (r *cursorRepository) UpdateCursorTx(ctx context.Context, tx *sql.Tx, curso

 // InitCursor 初始化游标（如果不存在）
 func (r *cursorRepository) InitCursor(ctx context.Context, cursorKey string, initialValue string) error {
-	query := `
+	// 使用简单的 UPSERT：如果冲突则更新为新值
+	// 这样可以确保 cursor 总是基于最新的数据库状态初始化
+	query := r.convertPlaceholders(`
 		INSERT INTO trustlog_cursor (cursor_key, cursor_value, last_updated_at)
 		VALUES (?, ?, ?)
-		ON CONFLICT (cursor_key) DO NOTHING
-	`
+		ON CONFLICT (cursor_key) 
+		DO UPDATE SET 
+			cursor_value = EXCLUDED.cursor_value,
+			last_updated_at = EXCLUDED.last_updated_at
+	`)

-	_, err := r.db.ExecContext(ctx, query, cursorKey, initialValue, time.Now())
+	now := time.Now()
+	_, err := r.db.ExecContext(ctx, query, cursorKey, initialValue, now)
 	if err != nil {
 		r.logger.ErrorContext(ctx, "failed to init cursor",
 			"cursorKey", cursorKey,
@@ -410,27 +613,35 @@ func (r *cursorRepository) InitCursor(ctx context.Context, cursorKey string, ini

 // retryRepository 重试仓储实现
 type retryRepository struct {
-	db     *sql.DB
-	logger logger.Logger
+	db         *sql.DB
+	logger     logger.Logger
+	driverName string
 }

 // NewRetryRepository 创建重试仓储
 func NewRetryRepository(db *sql.DB, log logger.Logger) RetryRepository {
+	driverName := detectDriverName(db)
 	return &retryRepository{
-		db:     db,
-		logger: log,
+		db:         db,
+		logger:     log,
+		driverName: driverName,
 	}
 }

+// convertPlaceholders converts ? placeholders into the form expected by this repository's driverName.
+func (r *retryRepository) convertPlaceholders(query string) string {
+	return convertPlaceholdersForDriver(query, r.driverName)
+}
+
 func (r *retryRepository) AddRetry(ctx context.Context, opID string, errorMsg string, nextRetryAt time.Time) error {
 	return r.AddRetryTx(ctx, nil, opID, errorMsg, nextRetryAt)
 }

 func (r *retryRepository) AddRetryTx(ctx context.Context, tx *sql.Tx, opID string, errorMsg string, nextRetryAt time.Time) error {
-	query := `
+	query := r.convertPlaceholders(`
 		INSERT INTO trustlog_retry (op_id, retry_count, retry_status, error_message, next_retry_at, updated_at)
 		VALUES (?, 0, ?, ?, ?, ?)
-	`
+	`)

 	var err error
 	if tx != nil {
@@ -455,7 +666,7 @@ func (r *retryRepository) AddRetryTx(ctx context.Context, tx *sql.Tx, opID strin
 }

 func (r *retryRepository) IncrementRetry(ctx context.Context, opID string, errorMsg string, nextRetryAt time.Time) error {
-	query := `
+	query := r.convertPlaceholders(`
 		UPDATE trustlog_retry 
 		SET retry_count = retry_count + 1, 
 		    retry_status = ?,
@@ -464,7 +675,7 @@ func (r *retryRepository) IncrementRetry(ctx context.Context, opID string, error
 		    error_message = ?,
 		    updated_at = ?
 		WHERE op_id = ?
-	`
+	`)

 	_, err := r.db.ExecContext(ctx, query,
 		string(RetryStatusRetrying),
@@ -491,13 +702,13 @@ func (r *retryRepository) IncrementRetry(ctx context.Context, opID string, error
 }

 func (r *retryRepository) MarkAsDeadLetter(ctx context.Context, opID string, errorMsg string) error {
-	query := `
+	query := r.convertPlaceholders(`
 		UPDATE trustlog_retry 
 		SET retry_status = ?,
 		    error_message = ?,
 		    updated_at = ?
 		WHERE op_id = ?
-	`
+	`)

 	_, err := r.db.ExecContext(ctx, query,
 		string(RetryStatusDeadLetter),
@@ -522,7 +733,7 @@ func (r *retryRepository) MarkAsDeadLetter(ctx context.Context, opID string, err
 }

 func (r *retryRepository) FindPendingRetries(ctx context.Context, limit int) ([]RetryRecord, error) {
-	query := `
+	query := r.convertPlaceholders(`
 		SELECT 
 			op_id, retry_count, retry_status,
 			last_retry_at, next_retry_at, error_message,
@@ -531,7 +742,7 @@ func (r *retryRepository) FindPendingRetries(ctx context.Context, limit int) ([]
 		WHERE retry_status IN (?, ?) AND next_retry_at <= ?
 		ORDER BY next_retry_at ASC
 		LIMIT ?
-	`
+	`)

 	rows, err := r.db.QueryContext(ctx, query,
 		string(RetryStatusPending),
@@ -587,7 +798,7 @@ func (r *retryRepository) FindPendingRetries(ctx context.Context, limit int) ([]
 }

 func (r *retryRepository) DeleteRetry(ctx context.Context, opID string) error {
-	query := `DELETE FROM trustlog_retry WHERE op_id = ?`
+	query := r.convertPlaceholders(`DELETE FROM trustlog_retry WHERE op_id = ?`)

 	_, err := r.db.ExecContext(ctx, query, opID)
 	if err != nil {