feat: 完善数据库持久化与存证功能

主要更新:

1. 数据库持久化功能
   - 支持三种策略:仅落库、既落库又存证、仅存证
   - 实现 Cursor Worker 异步扫描和存证机制
   - 实现 Retry Worker 失败重试机制
   - 支持 PostgreSQL、MySQL、SQLite 等多种数据库
   - 添加 ClientIP 和 ServerIP 字段(可空,仅落库)

2. 集群并发安全
   - 使用 SELECT FOR UPDATE SKIP LOCKED 防止重复处理
   - 实现 CAS (Compare-And-Set) 原子状态更新
   - 添加 updated_at 字段支持并发控制

3. Cursor 初始化优化
   - 自动基于历史数据初始化 cursor
   - 确保不遗漏任何历史记录
   - 修复 UPSERT 逻辑

4. 测试完善
   - 添加 E2E 集成测试(含 Pulsar 消费者验证)
   - 添加 PostgreSQL 集成测试
   - 添加 Pulsar 集成测试
   - 添加集群并发安全测试
   - 添加 Cursor 初始化验证测试
   - 补充大量单元测试,提升覆盖率

5. 工具脚本
   - 添加数据库迁移脚本
   - 添加 Cursor 状态检查工具
   - 添加 Cursor 初始化工具
   - 添加 Pulsar 消息验证工具

6. 文档清理
   - 删除冗余文档,只保留根目录 README

测试结果:
- 所有 E2E 测试通过(100%)
- 数据库持久化与异步存证流程验证通过
- 集群环境下的并发安全性验证通过
- Cursor 自动初始化和历史数据处理验证通过
This commit is contained in:
ryan
2025-12-24 15:31:11 +08:00
parent 88f80ffa5e
commit 4b72a37120
60 changed files with 6160 additions and 1313 deletions

View File

@@ -97,6 +97,8 @@ func NewCursorWorker(config CursorWorkerConfig, manager *PersistenceManager) *Cu
if config.MaxRetryAttempt == 0 {
config.MaxRetryAttempt = 1
}
// 注意:Enabled 字段需要显式设置,这里不设置默认值
// 因为在 PersistenceClient 创建时会根据 EnableCursorWorker 参数来控制
return &CursorWorker{
config: config,
@@ -153,7 +155,7 @@ func (w *CursorWorker) run(ctx context.Context) {
}
}
// scan 扫描并处理未存证记录
// scan 扫描并处理未存证记录(集群并发安全版本)
func (w *CursorWorker) scan(ctx context.Context) {
w.logger.DebugContext(ctx, "cursor worker scanning",
"cursorKey", w.config.CursorKey,
@@ -172,8 +174,20 @@ func (w *CursorWorker) scan(ctx context.Context) {
"cursor", cursor,
)
// 2. 扫描新记录
operations, err := w.findNewOperations(ctx, cursor)
// 2. 使用事务 + FOR UPDATE SKIP LOCKED 扫描新记录
// 这样可以避免多个 worker 处理相同的记录
tx, err := w.manager.db.BeginTx(ctx, &sql.TxOptions{
Isolation: sql.LevelReadCommitted,
})
if err != nil {
w.logger.ErrorContext(ctx, "failed to begin transaction",
"error", err,
)
return
}
defer tx.Rollback() // 如果没有提交,确保回滚
operations, opIDs, err := w.findNewOperationsWithLock(ctx, tx, cursor)
if err != nil {
w.logger.ErrorContext(ctx, "failed to find new operations",
"error", err,
@@ -183,33 +197,102 @@ func (w *CursorWorker) scan(ctx context.Context) {
if len(operations) == 0 {
w.logger.DebugContext(ctx, "no new operations found")
tx.Commit() // 提交空事务
return
}
w.logger.InfoContext(ctx, "found new operations",
w.logger.InfoContext(ctx, "found new operations (locked for processing)",
"count", len(operations),
"opIDs", opIDs,
)
// 3. 处理每条记录
for _, op := range operations {
w.processOperation(ctx, op)
// 3. 处理每条记录(在事务中)
successCount := 0
for i, op := range operations {
if w.processOperationInTx(ctx, tx, op) {
successCount++
}
// 每处理 10 条提交一次,避免长时间锁定
if (i+1)%10 == 0 {
if err := tx.Commit(); err != nil {
w.logger.ErrorContext(ctx, "failed to commit transaction batch",
"error", err,
"processed", i+1,
)
return
}
// 开始新事务
tx, err = w.manager.db.BeginTx(ctx, &sql.TxOptions{
Isolation: sql.LevelReadCommitted,
})
if err != nil {
w.logger.ErrorContext(ctx, "failed to begin new transaction",
"error", err,
)
return
}
defer tx.Rollback()
}
}
// 提交最后一批
if err := tx.Commit(); err != nil {
w.logger.ErrorContext(ctx, "failed to commit final transaction",
"error", err,
)
return
}
w.logger.InfoContext(ctx, "scan completed",
"total", len(operations),
"succeeded", successCount,
)
}
// initCursor 初始化cursor
func (w *CursorWorker) initCursor(ctx context.Context) error {
cursorRepo := w.manager.GetCursorRepo()
// 创建初始cursor使用当前时间
now := time.Now().Format(time.RFC3339Nano)
err := cursorRepo.InitCursor(ctx, w.config.CursorKey, now)
// 查询数据库中最早的 NOT_TRUSTLOGGED 记录
db := w.manager.db
var earliestTime sql.NullTime
err := db.QueryRowContext(ctx,
"SELECT MIN(created_at) FROM operation WHERE trustlog_status = $1",
StatusNotTrustlogged,
).Scan(&earliestTime)
if err != nil && err != sql.ErrNoRows {
w.logger.WarnContext(ctx, "failed to query earliest record, using default",
"error", err,
)
}
var initialValue string
if earliestTime.Valid {
// 使用最早记录之前 1 秒作为初始 cursor
initialValue = earliestTime.Time.Add(-1 * time.Second).Format(time.RFC3339Nano)
w.logger.InfoContext(ctx, "setting cursor based on earliest record",
"earliestRecord", earliestTime.Time,
"cursorValue", initialValue,
)
} else {
// 如果没有记录,使用一个很早的时间,确保不会漏掉任何记录
initialValue = time.Date(2020, 1, 1, 0, 0, 0, 0, time.UTC).Format(time.RFC3339Nano)
w.logger.InfoContext(ctx, "no records found, using default early time",
"cursorValue", initialValue,
)
}
err = cursorRepo.InitCursor(ctx, w.config.CursorKey, initialValue)
if err != nil {
return fmt.Errorf("failed to init cursor: %w", err)
}
w.logger.InfoContext(ctx, "cursor initialized",
"cursorKey", w.config.CursorKey,
"initialValue", now,
"initialValue", initialValue,
)
return nil
@@ -249,7 +332,71 @@ func (w *CursorWorker) updateCursor(ctx context.Context, value string) error {
return nil
}
// findNewOperations 查找新的待存证记录
// findNewOperationsWithLock selects pending operations newer than cursor
// inside tx and locks the selected rows with FOR UPDATE SKIP LOCKED.
//
// The query keeps rows whose trustlog_status equals StatusNotTrustlogged and
// whose created_at is greater than cursor, orders them by created_at
// ascending and returns at most w.config.BatchSize of them. Because of
// SKIP LOCKED, rows already locked by another transaction are skipped instead
// of waited on, so concurrent workers do not pick up the same records.
//
// It returns the scanned records together with their op IDs (same order).
// A non-nil error is returned when the query, a row scan, or the row
// iteration itself fails; in that case both slices are nil.
func (w *CursorWorker) findNewOperationsWithLock(ctx context.Context, tx *sql.Tx, cursor string) ([]*OperationRecord, []string, error) {
	query := `
		SELECT op_id, op_actor, doid, producer_id,
		       request_body_hash, response_body_hash, op_hash, sign,
		       op_source, op_type, do_prefix, do_repository,
		       client_ip, server_ip, trustlog_status, created_at
		FROM operation
		WHERE trustlog_status = $1
		  AND created_at > $2
		ORDER BY created_at ASC
		LIMIT $3
		FOR UPDATE SKIP LOCKED
	`

	rows, err := tx.QueryContext(ctx, query, StatusNotTrustlogged, cursor, w.config.BatchSize)
	if err != nil {
		return nil, nil, fmt.Errorf("failed to query operations with lock: %w", err)
	}
	defer rows.Close()

	var operations []*OperationRecord
	var opIDs []string

	for rows.Next() {
		op := &OperationRecord{}
		// client_ip and server_ip are nullable columns, so scan them through
		// sql.NullString and only set the pointer fields when a value exists.
		var clientIP, serverIP sql.NullString
		var createdAt time.Time

		err := rows.Scan(
			&op.OpID, &op.OpActor, &op.DOID, &op.ProducerID,
			&op.RequestBodyHash, &op.ResponseBodyHash, &op.OpHash, &op.Sign,
			&op.OpSource, &op.OpType, &op.DOPrefix, &op.DORepository,
			&clientIP, &serverIP, &op.TrustlogStatus, &createdAt,
		)
		if err != nil {
			return nil, nil, fmt.Errorf("failed to scan operation: %w", err)
		}

		if clientIP.Valid {
			op.ClientIP = &clientIP.String
		}
		if serverIP.Valid {
			op.ServerIP = &serverIP.String
		}
		op.CreatedAt = createdAt

		operations = append(operations, op)
		opIDs = append(opIDs, op.OpID)
	}

	// rows.Next returns false both at end-of-data and on error; without this
	// check a failed iteration would be reported as a (truncated) success.
	if err := rows.Err(); err != nil {
		return nil, nil, fmt.Errorf("failed to iterate operations: %w", err)
	}

	return operations, opIDs, nil
}
// getStringOrEmpty dereferences s, yielding the empty string for a nil pointer.
func getStringOrEmpty(s *string) string {
	if s != nil {
		return *s
	}
	return ""
}
// findNewOperations 查找新的待存证记录(旧版本,保留用于兼容)
func (w *CursorWorker) findNewOperations(ctx context.Context, cursor string) ([]*OperationRecord, error) {
db := w.manager.db
@@ -301,7 +448,63 @@ func (w *CursorWorker) findNewOperations(ctx context.Context, cursor string) ([]
return operations, nil
}
// processOperation 处理单条记录
// processOperationInTx tries to trustlog a single operation and records the
// outcome through tx.
//
// When tryTrustlog fails, the operation is added to the retry table via tx
// with the next attempt scheduled one minute from now, and false is returned
// (a failure to enqueue the retry is logged). When tryTrustlog succeeds, the
// status is switched from StatusNotTrustlogged to StatusTrustlogged with a
// compare-and-set update in tx; false is returned if that update errors or
// reports that nothing was updated. After a successful status update the
// cursor is advanced to op.CreatedAt; a failure to advance it is logged and
// does not change the result.
//
// It returns true only when op was trustlogged and its status update applied.
func (w *CursorWorker) processOperationInTx(ctx context.Context, tx *sql.Tx, op *OperationRecord) bool {
	w.logger.DebugContext(ctx, "processing operation in transaction",
		"opID", op.OpID,
	)

	// Attempt the trustlog itself.
	err := w.tryTrustlog(ctx, op)
	if err != nil {
		w.logger.WarnContext(ctx, "failed to trustlog operation",
			"opID", op.OpID,
			"error", err,
		)

		// Failure: hand the operation over to the retry table.
		retryRepo := w.manager.GetRetryRepo()
		nextRetryAt := time.Now().Add(1 * time.Minute)
		if retryErr := retryRepo.AddRetryTx(ctx, tx, op.OpID, err.Error(), nextRetryAt); retryErr != nil {
			w.logger.ErrorContext(ctx, "failed to add to retry queue",
				"opID", op.OpID,
				"error", retryErr,
			)
		}
		return false
	}

	// Success: flip the status with a compare-and-set so an already-changed
	// row is left untouched.
	opRepo := w.manager.GetOperationRepo()
	updated, err := opRepo.UpdateStatusWithCAS(ctx, tx, op.OpID, StatusNotTrustlogged, StatusTrustlogged)
	if err != nil {
		w.logger.ErrorContext(ctx, "failed to update operation status with CAS",
			"opID", op.OpID,
			"error", err,
		)
		return false
	}

	if !updated {
		// The CAS did not apply: the status is no longer StatusNotTrustlogged,
		// i.e. something else changed it in the meantime.
		w.logger.WarnContext(ctx, "operation already processed by another worker",
			"opID", op.OpID,
		)
		return false
	}

	w.logger.InfoContext(ctx, "operation trustlogged successfully",
		"opID", op.OpID,
	)

	// Advance the cursor to this record's creation time. The error used to be
	// dropped silently; surface it in the log so a stuck cursor is diagnosable.
	cursorValue := op.CreatedAt.Format(time.RFC3339Nano)
	if cursorErr := w.updateCursor(ctx, cursorValue); cursorErr != nil {
		w.logger.WarnContext(ctx, "failed to update cursor after trustlog",
			"opID", op.OpID,
			"cursorValue", cursorValue,
			"error", cursorErr,
		)
	}

	return true
}
// processOperation 处理单条记录(旧版本,保留用于兼容)
func (w *CursorWorker) processOperation(ctx context.Context, op *OperationRecord) {
w.logger.DebugContext(ctx, "processing operation",
"opID", op.OpID,
@@ -384,4 +587,3 @@ func (w *CursorWorker) updateOperationStatus(ctx context.Context, opID string, s
opRepo := w.manager.GetOperationRepo()
return opRepo.UpdateStatus(ctx, opID, status)
}