package persistence import ( "context" "database/sql" "fmt" "time" "go.yandata.net/iod/iod/go-trustlog/api/logger" "go.yandata.net/iod/iod/go-trustlog/api/model" ) // OperationRecord 操作记录(包含数据库扩展字段) type OperationRecord struct { OpID string OpActor string DOID string ProducerID string RequestBodyHash string ResponseBodyHash string OpHash string Sign string OpSource string OpType string DOPrefix string DORepository string ClientIP *string ServerIP *string TrustlogStatus string CreatedAt time.Time } // ToModel 转换为 model.Operation func (r *OperationRecord) ToModel() *model.Operation { return &model.Operation{ OpID: r.OpID, OpActor: r.OpActor, Doid: r.DOID, ProducerID: r.ProducerID, RequestBodyHash: &r.RequestBodyHash, ResponseBodyHash: &r.ResponseBodyHash, OpSource: model.Source(r.OpSource), OpType: model.Type(r.OpType), DoPrefix: r.DOPrefix, DoRepository: r.DORepository, ClientIP: r.ClientIP, ServerIP: r.ServerIP, } } // CursorWorkerConfig Cursor工作器配置 type CursorWorkerConfig struct { // ScanInterval 扫描间隔(默认10秒,快速发现新记录) ScanInterval time.Duration // BatchSize 批量处理大小(默认100) BatchSize int // CursorKey Cursor键(默认 "operation_scan") CursorKey string // MaxRetryAttempt Cursor阶段最大重试次数(默认1,快速失败转入Retry) MaxRetryAttempt int // Enabled 是否启用Cursor工作器(默认启用) Enabled bool } // DefaultCursorWorkerConfig 默认Cursor工作器配置 func DefaultCursorWorkerConfig() CursorWorkerConfig { return CursorWorkerConfig{ ScanInterval: 10 * time.Second, BatchSize: 100, CursorKey: "operation_scan", MaxRetryAttempt: 1, Enabled: true, } } // CursorWorker Cursor工作器(任务发现) // 职责:扫描operation表,发现新的待存证记录,尝试存证 // 成功则更新状态,失败则加入重试表 type CursorWorker struct { config CursorWorkerConfig manager *PersistenceManager logger logger.Logger stopCh chan struct{} } // NewCursorWorker 创建Cursor工作器 func NewCursorWorker(config CursorWorkerConfig, manager *PersistenceManager) *CursorWorker { if config.ScanInterval == 0 { config.ScanInterval = 10 * time.Second } if config.BatchSize == 0 { config.BatchSize = 100 } if config.CursorKey == "" { config.CursorKey = "operation_scan" } if config.MaxRetryAttempt == 0 { config.MaxRetryAttempt = 1 } // 注意:Enabled 字段需要显式设置,这里不设置默认值 // 因为在 PersistenceClient 创建时会根据 EnableCursorWorker 参数来控制 return &CursorWorker{ config: config, manager: manager, logger: manager.logger, stopCh: make(chan struct{}), } } // Start 启动Cursor工作器 func (w *CursorWorker) Start(ctx context.Context) error { if !w.config.Enabled { w.logger.InfoContext(ctx, "cursor worker disabled, skipping start") return nil } w.logger.InfoContext(ctx, "starting cursor worker", "scanInterval", w.config.ScanInterval, "batchSize", w.config.BatchSize, "cursorKey", w.config.CursorKey, ) // 初始化cursor(如果不存在) if err := w.initCursor(ctx); err != nil { return fmt.Errorf("failed to init cursor: %w", err) } // 启动定期扫描 go w.run(ctx) return nil } // Stop 停止Cursor工作器 func (w *CursorWorker) Stop(ctx context.Context) error { w.logger.InfoContext(ctx, "stopping cursor worker") close(w.stopCh) return nil } // run 运行循环 func (w *CursorWorker) run(ctx context.Context) { ticker := time.NewTicker(w.config.ScanInterval) defer ticker.Stop() for { select { case <-w.stopCh: w.logger.InfoContext(ctx, "cursor worker stopped") return case <-ticker.C: w.scan(ctx) } } } // scan 扫描并处理未存证记录(集群并发安全版本) func (w *CursorWorker) scan(ctx context.Context) { w.logger.DebugContext(ctx, "cursor worker scanning", "cursorKey", w.config.CursorKey, ) // 1. 读取cursor cursor, err := w.getCursor(ctx) if err != nil { w.logger.ErrorContext(ctx, "failed to get cursor", "error", err, ) return } w.logger.DebugContext(ctx, "cursor position", "cursor", cursor, ) // 2. 使用事务 + FOR UPDATE SKIP LOCKED 扫描新记录 // 这样可以避免多个 worker 处理相同的记录 tx, err := w.manager.db.BeginTx(ctx, &sql.TxOptions{ Isolation: sql.LevelReadCommitted, }) if err != nil { w.logger.ErrorContext(ctx, "failed to begin transaction", "error", err, ) return } defer tx.Rollback() // 如果没有提交,确保回滚 operations, opIDs, err := w.findNewOperationsWithLock(ctx, tx, cursor) if err != nil { w.logger.ErrorContext(ctx, "failed to find new operations", "error", err, ) return } if len(operations) == 0 { w.logger.DebugContext(ctx, "no new operations found") tx.Commit() // 提交空事务 return } w.logger.InfoContext(ctx, "found new operations (locked for processing)", "count", len(operations), "opIDs", opIDs, ) // 3. 处理每条记录(在事务中) successCount := 0 for i, op := range operations { if w.processOperationInTx(ctx, tx, op) { successCount++ } // 每处理 10 条提交一次,避免长时间锁定 if (i+1)%10 == 0 { if err := tx.Commit(); err != nil { w.logger.ErrorContext(ctx, "failed to commit transaction batch", "error", err, "processed", i+1, ) return } // 开始新事务 tx, err = w.manager.db.BeginTx(ctx, &sql.TxOptions{ Isolation: sql.LevelReadCommitted, }) if err != nil { w.logger.ErrorContext(ctx, "failed to begin new transaction", "error", err, ) return } defer tx.Rollback() } } // 提交最后一批 if err := tx.Commit(); err != nil { w.logger.ErrorContext(ctx, "failed to commit final transaction", "error", err, ) return } w.logger.InfoContext(ctx, "scan completed", "total", len(operations), "succeeded", successCount, ) } // initCursor 初始化cursor func (w *CursorWorker) initCursor(ctx context.Context) error { cursorRepo := w.manager.GetCursorRepo() // 查询数据库中最早的 NOT_TRUSTLOGGED 记录 db := w.manager.db var earliestTime sql.NullTime err := db.QueryRowContext(ctx, "SELECT MIN(created_at) FROM operation WHERE trustlog_status = $1", StatusNotTrustlogged, ).Scan(&earliestTime) if err != nil && err != sql.ErrNoRows { w.logger.WarnContext(ctx, "failed to query earliest record, using default", "error", err, ) } var initialValue string if earliestTime.Valid { // 使用最早记录之前 1 秒作为初始 cursor initialValue = earliestTime.Time.Add(-1 * time.Second).Format(time.RFC3339Nano) w.logger.InfoContext(ctx, "setting cursor based on earliest record", "earliestRecord", earliestTime.Time, "cursorValue", initialValue, ) } else { // 如果没有记录,使用一个很早的时间,确保不会漏掉任何记录 initialValue = time.Date(2020, 1, 1, 0, 0, 0, 0, time.UTC).Format(time.RFC3339Nano) w.logger.InfoContext(ctx, "no records found, using default early time", "cursorValue", initialValue, ) } err = cursorRepo.InitCursor(ctx, w.config.CursorKey, initialValue) if err != nil { return fmt.Errorf("failed to init cursor: %w", err) } w.logger.InfoContext(ctx, "cursor initialized", "cursorKey", w.config.CursorKey, "initialValue", initialValue, ) return nil } // getCursor 获取cursor值 func (w *CursorWorker) getCursor(ctx context.Context) (string, error) { cursorRepo := w.manager.GetCursorRepo() cursor, err := cursorRepo.GetCursor(ctx, w.config.CursorKey) if err != nil { return "", fmt.Errorf("failed to get cursor: %w", err) } // 如果cursor为空,使用一个很早的时间 if cursor == "" { cursor = time.Time{}.Format(time.RFC3339Nano) } return cursor, nil } // updateCursor 更新cursor值 func (w *CursorWorker) updateCursor(ctx context.Context, value string) error { cursorRepo := w.manager.GetCursorRepo() err := cursorRepo.UpdateCursor(ctx, w.config.CursorKey, value) if err != nil { return fmt.Errorf("failed to update cursor: %w", err) } w.logger.DebugContext(ctx, "cursor updated", "cursorKey", w.config.CursorKey, "newValue", value, ) return nil } // findNewOperationsWithLock 使用 FOR UPDATE SKIP LOCKED 查找新操作(集群安全) func (w *CursorWorker) findNewOperationsWithLock(ctx context.Context, tx *sql.Tx, cursor string) ([]*OperationRecord, []string, error) { // 使用 FOR UPDATE SKIP LOCKED 锁定记录 // 这样多个 worker 不会处理相同的记录 query := ` SELECT op_id, op_actor, doid, producer_id, request_body_hash, response_body_hash, op_hash, sign, op_source, op_type, do_prefix, do_repository, client_ip, server_ip, trustlog_status, created_at FROM operation WHERE trustlog_status = $1 AND created_at > $2 ORDER BY created_at ASC LIMIT $3 FOR UPDATE SKIP LOCKED ` rows, err := tx.QueryContext(ctx, query, StatusNotTrustlogged, cursor, w.config.BatchSize) if err != nil { return nil, nil, fmt.Errorf("failed to query operations with lock: %w", err) } defer rows.Close() var operations []*OperationRecord var opIDs []string for rows.Next() { op := &OperationRecord{} var clientIP, serverIP sql.NullString var createdAt time.Time err := rows.Scan( &op.OpID, &op.OpActor, &op.DOID, &op.ProducerID, &op.RequestBodyHash, &op.ResponseBodyHash, &op.OpHash, &op.Sign, &op.OpSource, &op.OpType, &op.DOPrefix, &op.DORepository, &clientIP, &serverIP, &op.TrustlogStatus, &createdAt, ) if err != nil { return nil, nil, fmt.Errorf("failed to scan operation: %w", err) } // 处理可空字段 if clientIP.Valid { op.ClientIP = &clientIP.String } if serverIP.Valid { op.ServerIP = &serverIP.String } op.CreatedAt = createdAt operations = append(operations, op) opIDs = append(opIDs, op.OpID) } return operations, opIDs, nil } // getStringOrEmpty 辅助函数:从指针获取字符串或空字符串 func getStringOrEmpty(s *string) string { if s == nil { return "" } return *s } // findNewOperations 查找新的待存证记录(旧版本,保留用于兼容) func (w *CursorWorker) findNewOperations(ctx context.Context, cursor string) ([]*OperationRecord, error) { db := w.manager.db // 查询未存证的记录(created_at > cursor) rows, err := db.QueryContext(ctx, ` SELECT op_id, op_actor, doid, producer_id, request_body_hash, response_body_hash, op_hash, sign, op_source, op_type, do_prefix, do_repository, client_ip, server_ip, trustlog_status, created_at FROM operation WHERE trustlog_status = $1 AND created_at > $2 ORDER BY created_at ASC LIMIT $3 `, StatusNotTrustlogged, cursor, w.config.BatchSize) if err != nil { return nil, fmt.Errorf("failed to query operations: %w", err) } defer rows.Close() var operations []*OperationRecord for rows.Next() { op := &OperationRecord{} var clientIP, serverIP sql.NullString var createdAt time.Time err := rows.Scan( &op.OpID, &op.OpActor, &op.DOID, &op.ProducerID, &op.RequestBodyHash, &op.ResponseBodyHash, &op.OpHash, &op.Sign, &op.OpSource, &op.OpType, &op.DOPrefix, &op.DORepository, &clientIP, &serverIP, &op.TrustlogStatus, &createdAt, ) if err != nil { return nil, fmt.Errorf("failed to scan operation: %w", err) } // 处理可空字段 if clientIP.Valid { op.ClientIP = &clientIP.String } if serverIP.Valid { op.ServerIP = &serverIP.String } op.CreatedAt = createdAt operations = append(operations, op) } return operations, nil } // processOperationInTx 在事务中处理单条记录(集群安全版本) // 返回 true 表示处理成功,false 表示失败 func (w *CursorWorker) processOperationInTx(ctx context.Context, tx *sql.Tx, op *OperationRecord) bool { w.logger.DebugContext(ctx, "processing operation in transaction", "opID", op.OpID, ) // 尝试存证 err := w.tryTrustlog(ctx, op) if err != nil { w.logger.WarnContext(ctx, "failed to trustlog operation", "opID", op.OpID, "error", err, ) // 失败:加入重试表 retryRepo := w.manager.GetRetryRepo() nextRetryAt := time.Now().Add(1 * time.Minute) if retryErr := retryRepo.AddRetryTx(ctx, tx, op.OpID, err.Error(), nextRetryAt); retryErr != nil { w.logger.ErrorContext(ctx, "failed to add to retry queue", "opID", op.OpID, "error", retryErr, ) } return false } // 成功:使用 CAS 更新状态 opRepo := w.manager.GetOperationRepo() updated, err := opRepo.UpdateStatusWithCAS(ctx, tx, op.OpID, StatusNotTrustlogged, StatusTrustlogged) if err != nil { w.logger.ErrorContext(ctx, "failed to update operation status with CAS", "opID", op.OpID, "error", err, ) return false } if !updated { // CAS 失败,说明状态已被其他 worker 修改 w.logger.WarnContext(ctx, "operation already processed by another worker", "opID", op.OpID, ) return false } w.logger.InfoContext(ctx, "operation trustlogged successfully", "opID", op.OpID, ) // 更新cursor w.updateCursor(ctx, op.CreatedAt.Format(time.RFC3339Nano)) return true } // processOperation 处理单条记录(旧版本,保留用于兼容) func (w *CursorWorker) processOperation(ctx context.Context, op *OperationRecord) { w.logger.DebugContext(ctx, "processing operation", "opID", op.OpID, ) // 尝试存证(最多重试 MaxRetryAttempt 次) var lastErr error for attempt := 0; attempt <= w.config.MaxRetryAttempt; attempt++ { if attempt > 0 { w.logger.DebugContext(ctx, "retrying trustlog", "opID", op.OpID, "attempt", attempt, ) } err := w.tryTrustlog(ctx, op) if err == nil { // 成功:更新状态 if err := w.updateOperationStatus(ctx, op.OpID, StatusTrustlogged); err != nil { w.logger.ErrorContext(ctx, "failed to update operation status", "opID", op.OpID, "error", err, ) } else { w.logger.InfoContext(ctx, "operation trustlogged successfully", "opID", op.OpID, ) } // 更新cursor w.updateCursor(ctx, op.CreatedAt.Format(time.RFC3339Nano)) return } lastErr = err if attempt < w.config.MaxRetryAttempt { time.Sleep(time.Second) // 简单的重试延迟 } } // 失败:加入重试表 w.logger.WarnContext(ctx, "failed to trustlog in cursor worker, adding to retry queue", "opID", op.OpID, "error", lastErr, ) retryRepo := w.manager.GetRetryRepo() nextRetryAt := time.Now().Add(1 * time.Minute) // 1分钟后重试 if err := retryRepo.AddRetry(ctx, op.OpID, lastErr.Error(), nextRetryAt); err != nil { w.logger.ErrorContext(ctx, "failed to add to retry queue", "opID", op.OpID, "error", err, ) } // 即使失败也更新cursor(避免卡在同一条记录) w.updateCursor(ctx, op.CreatedAt.Format(time.RFC3339Nano)) } // tryTrustlog 尝试存证(调用存证系统) func (w *CursorWorker) tryTrustlog(ctx context.Context, op *OperationRecord) error { publisher := w.manager.GetPublisher() if publisher == nil { return fmt.Errorf("publisher not available") } // 转换为 Operation 模型 modelOp := op.ToModel() // 调用存证 if err := publisher.Publish(ctx, modelOp); err != nil { return fmt.Errorf("failed to publish to trustlog: %w", err) } return nil } // updateOperationStatus 更新操作状态 func (w *CursorWorker) updateOperationStatus(ctx context.Context, opID string, status TrustlogStatus) error { opRepo := w.manager.GetOperationRepo() return opRepo.UpdateStatus(ctx, opID, status) }