package persistence

import (
	"context"
	"fmt"
	"time"

	"github.com/ThreeDotsLabs/watermill/message"

	"go.yandata.net/iod/iod/go-trustlog/api/logger"
	"go.yandata.net/iod/iod/go-trustlog/api/model"
)

// RetryWorkerConfig configures the retry worker.
type RetryWorkerConfig struct {
	// RetryInterval is the interval between retry sweeps.
	RetryInterval time.Duration
	// MaxRetryCount is the maximum number of retry attempts.
	MaxRetryCount int
	// BatchSize is the number of records processed per batch.
	BatchSize int
	// BackoffMultiplier is the backoff multiplier (each retry interval is multiplied by this factor).
	BackoffMultiplier float64
	// InitialBackoff is the initial backoff duration.
	InitialBackoff time.Duration
}

// DefaultRetryWorkerConfig returns the default retry worker configuration.
func DefaultRetryWorkerConfig() RetryWorkerConfig {
	return RetryWorkerConfig{
		RetryInterval:     30 * time.Second,
		MaxRetryCount:     5,
		BatchSize:         100,
		BackoffMultiplier: 2.0,
		InitialBackoff:    1 * time.Minute,
	}
}

// RetryWorker is the retry worker responsible for handling failed trustlog operations.
type RetryWorker struct {
	config      RetryWorkerConfig
	manager     *PersistenceManager
	publisher   message.Publisher
	logger      logger.Logger
	stopChan    chan struct{}
	stoppedChan chan struct{}
}

// NewRetryWorker creates a retry worker.
func NewRetryWorker(
	config RetryWorkerConfig,
	manager *PersistenceManager,
	publisher message.Publisher,
	log logger.Logger,
) *RetryWorker {
	return &RetryWorker{
		config:      config,
		manager:     manager,
		publisher:   publisher,
		logger:      log,
		stopChan:    make(chan struct{}),
		stoppedChan: make(chan struct{}),
	}
}

// Start runs the retry worker. It blocks until the context is cancelled or
// Stop is called, so it is usually run in its own goroutine.
func (w *RetryWorker) Start(ctx context.Context) {
	w.logger.InfoContext(ctx, "starting retry worker",
		"retryInterval", w.config.RetryInterval,
		"maxRetryCount", w.config.MaxRetryCount,
		"batchSize", w.config.BatchSize,
	)

	ticker := time.NewTicker(w.config.RetryInterval)
	defer ticker.Stop()
	defer close(w.stoppedChan)

	for {
		select {
		case <-ctx.Done():
			w.logger.InfoContext(ctx, "retry worker stopped by context")
			return
		case <-w.stopChan:
			w.logger.InfoContext(ctx, "retry worker stopped by signal")
			return
		case <-ticker.C:
			w.processRetries(ctx)
		}
	}
}

// Stop stops the retry worker and waits for the Start loop to exit.
func (w *RetryWorker) Stop() {
	w.logger.Info("stopping retry worker")
	close(w.stopChan)
	<-w.stoppedChan
	w.logger.Info("retry worker stopped")
}
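// The retry loop below relies on the following repository methods, listed here
// as they are used in this file; the actual interface definitions live elsewhere
// in the persistence package:
//
//	RetryRepository:
//	  FindPendingRetries(ctx, limit)            - records due for retry
//	  MarkAsDeadLetter(ctx, opID, reason)       - give up on a record permanently
//	  IncrementRetry(ctx, opID, errMsg, nextAt) - schedule the next attempt
//	  DeleteRetry(ctx, opID)                    - remove a finished record
//
//	OperationRepository:
//	  FindByID(ctx, opID)                       - operation record and its status
//	  UpdateStatus(ctx, opID, status)           - mark the operation as trustlogged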
// processRetries processes records that are pending retry.
// It reads pending records directly from the retry table, so no cursor scan of
// the operation table is needed.
func (w *RetryWorker) processRetries(ctx context.Context) {
	w.logger.DebugContext(ctx, "processing retries from retry table")

	retryRepo := w.manager.GetRetryRepo()
	opRepo := w.manager.GetOperationRepo()

	// Look up pending retries directly from the retry table (records whose
	// retry time has arrived).
	records, err := retryRepo.FindPendingRetries(ctx, w.config.BatchSize)
	if err != nil {
		w.logger.ErrorContext(ctx, "failed to find pending retries",
			"error", err,
		)
		return
	}

	if len(records) == 0 {
		w.logger.DebugContext(ctx, "no pending retries found")
		return
	}

	w.logger.InfoContext(ctx, "found pending retries from retry table",
		"count", len(records),
		"batchSize", w.config.BatchSize,
	)

	// Process each retry record.
	for _, record := range records {
		w.processRetry(ctx, record, retryRepo, opRepo)
	}
}

// processRetry handles a single retry record.
func (w *RetryWorker) processRetry(
	ctx context.Context,
	record RetryRecord,
	retryRepo RetryRepository,
	opRepo OperationRepository,
) {
	w.logger.DebugContext(ctx, "processing retry",
		"opID", record.OpID,
		"retryCount", record.RetryCount,
	)

	// Check whether the maximum retry count has been reached.
	if record.RetryCount >= w.config.MaxRetryCount {
		w.logger.WarnContext(ctx, "max retry count exceeded, marking as dead letter",
			"opID", record.OpID,
			"retryCount", record.RetryCount,
		)
		if err := retryRepo.MarkAsDeadLetter(ctx, record.OpID,
			fmt.Sprintf("exceeded max retry count (%d)", w.config.MaxRetryCount)); err != nil {
			w.logger.ErrorContext(ctx, "failed to mark as dead letter",
				"opID", record.OpID,
				"error", err,
			)
		}
		return
	}

	// Look up the operation record.
	op, status, err := opRepo.FindByID(ctx, record.OpID)
	if err != nil {
		w.logger.ErrorContext(ctx, "failed to find operation for retry",
			"opID", record.OpID,
			"error", err,
		)
		nextRetry := w.calculateNextRetry(record.RetryCount)
		if incErr := retryRepo.IncrementRetry(ctx, record.OpID, err.Error(), nextRetry); incErr != nil {
			w.logger.ErrorContext(ctx, "failed to increment retry",
				"opID", record.OpID,
				"error", incErr,
			)
		}
		return
	}

	// If the operation has already been trustlogged, just delete the retry record.
	if status == StatusTrustlogged {
		w.logger.InfoContext(ctx, "operation already trustlogged, removing retry record",
			"opID", record.OpID,
		)
		if err := retryRepo.DeleteRetry(ctx, record.OpID); err != nil {
			w.logger.ErrorContext(ctx, "failed to delete retry record",
				"opID", record.OpID,
				"error", err,
			)
		}
		return
	}

	// Try to republish the operation to the trustlog system. This has to follow
	// the actual trustlog publishing logic; for example, sending the operation
	// to the message queue.
	if err := w.republishOperation(ctx, op); err != nil {
		w.logger.ErrorContext(ctx, "failed to republish operation",
			"opID", record.OpID,
			"error", err,
		)
		nextRetry := w.calculateNextRetry(record.RetryCount)
		if incErr := retryRepo.IncrementRetry(ctx, record.OpID, err.Error(), nextRetry); incErr != nil {
			w.logger.ErrorContext(ctx, "failed to increment retry",
				"opID", record.OpID,
				"error", incErr,
			)
		}
		return
	}

	// Publish succeeded: update the status to trustlogged.
	if err := opRepo.UpdateStatus(ctx, record.OpID, StatusTrustlogged); err != nil {
		w.logger.ErrorContext(ctx, "failed to update operation status",
			"opID", record.OpID,
			"error", err,
		)
		return
	}

	// Delete the retry record.
	if err := retryRepo.DeleteRetry(ctx, record.OpID); err != nil {
		w.logger.ErrorContext(ctx, "failed to delete retry record",
			"opID", record.OpID,
			"error", err,
		)
		return
	}

	w.logger.InfoContext(ctx, "operation retry successful",
		"opID", record.OpID,
		"retryCount", record.RetryCount,
	)
}

// republishOperation republishes the operation to the trustlog system.
// Note: the operation must be serialized into the Envelope format.
func (w *RetryWorker) republishOperation(ctx context.Context, op *model.Operation) error {
	// This must follow the actual publishing logic.
	// Simplified implementation: assume the publisher is already configured.
	if w.publisher == nil {
		return fmt.Errorf("publisher not configured")
	}

	// Note: real usage requires Envelope serialization. This is only a
	// placeholder; the concrete implementation needs to be integrated in
	// HighClient.
	w.logger.WarnContext(ctx, "republish not implemented yet, needs Envelope serialization",
		"opID", op.OpID,
	)

	return nil
}

// calculateNextRetry computes the next retry time using exponential backoff.
// With the defaults (InitialBackoff=1m, BackoffMultiplier=2.0) the schedule is
// 1m, 2m, 4m, 8m and 16m for retry counts 0 through 4.
func (w *RetryWorker) calculateNextRetry(retryCount int) time.Time {
	backoff := float64(w.config.InitialBackoff)
	for i := 0; i < retryCount; i++ {
		backoff *= w.config.BackoffMultiplier
	}
	return time.Now().Add(time.Duration(backoff))
}
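// runRetryWorkerSketch is an illustrative usage sketch, not part of the worker
// itself: it shows how a caller might wire up and run the RetryWorker using the
// API above. The function name and the way the manager, publisher, and log
// values are obtained are assumptions; they would normally come from the
// surrounding application setup.
func runRetryWorkerSketch(
	ctx context.Context,
	manager *PersistenceManager,
	publisher message.Publisher,
	log logger.Logger,
) {
	worker := NewRetryWorker(DefaultRetryWorkerConfig(), manager, publisher, log)

	// Start blocks until the context is cancelled or Stop is called, so it is
	// typically run in its own goroutine.
	go worker.Start(ctx)

	// ... application runs ...

	// Stop signals the worker and waits for the Start loop to exit.
	worker.Stop()
}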