go-trustlog/api/persistence/retry_worker.go

package persistence

import (
	"context"
	"fmt"
	"time"

	"github.com/ThreeDotsLabs/watermill/message"

	"go.yandata.net/iod/iod/go-trustlog/api/logger"
	"go.yandata.net/iod/iod/go-trustlog/api/model"
)

// RetryWorkerConfig 重试工作器配置
type RetryWorkerConfig struct {
	// RetryInterval 重试检查间隔
	RetryInterval time.Duration
	// MaxRetryCount 最大重试次数
	MaxRetryCount int
	// BatchSize 每批处理的记录数
	BatchSize int
	// BackoffMultiplier 退避乘数（每次重试间隔翻倍）
	BackoffMultiplier float64
	// InitialBackoff 初始退避时间
	InitialBackoff time.Duration
}

// DefaultRetryWorkerConfig 返回默认重试工作器配置
func DefaultRetryWorkerConfig() RetryWorkerConfig {
	return RetryWorkerConfig{
		RetryInterval:     30 * time.Second,
		MaxRetryCount:     5,
		BatchSize:         100,
		BackoffMultiplier: 2.0,
		InitialBackoff:    1 * time.Minute,
	}
}

// RetryWorker 重试工作器，负责处理失败的存证操作
type RetryWorker struct {
	config      RetryWorkerConfig
	manager     *PersistenceManager
	publisher   message.Publisher
	logger      logger.Logger
	stopChan    chan struct{}
	stoppedChan chan struct{}
}

// NewRetryWorker 创建重试工作器
func NewRetryWorker(
	config RetryWorkerConfig,
	manager *PersistenceManager,
	publisher message.Publisher,
	log logger.Logger,
) *RetryWorker {
	return &RetryWorker{
		config:      config,
		manager:     manager,
		publisher:   publisher,
		logger:      log,
		stopChan:    make(chan struct{}),
		stoppedChan: make(chan struct{}),
	}
}

// Start 启动重试工作器
func (w *RetryWorker) Start(ctx context.Context) {
	w.logger.InfoContext(ctx, "starting retry worker",
		"retryInterval", w.config.RetryInterval,
		"maxRetryCount", w.config.MaxRetryCount,
		"batchSize", w.config.BatchSize,
	)

	ticker := time.NewTicker(w.config.RetryInterval)
	defer ticker.Stop()
	defer close(w.stoppedChan)

	for {
		select {
		case <-ctx.Done():
			w.logger.InfoContext(ctx, "retry worker stopped by context")
			return
		case <-w.stopChan:
			w.logger.InfoContext(ctx, "retry worker stopped by signal")
			return
		case <-ticker.C:
			w.processRetries(ctx)
		}
	}
}

// Stop 停止重试工作器
func (w *RetryWorker) Stop() {
	w.logger.Info("stopping retry worker")
	close(w.stopChan)
	<-w.stoppedChan
	w.logger.Info("retry worker stopped")
}

// processRetries 处理待重试的记录
// 从重试表中读取待处理的记录，无需游标扫描 operation 表
func (w *RetryWorker) processRetries(ctx context.Context) {
	w.logger.DebugContext(ctx, "processing retries from retry table")

	retryRepo := w.manager.GetRetryRepo()
	opRepo := w.manager.GetOperationRepo()

	// 直接从重试表查找待重试的记录（已到重试时间的记录）
	records, err := retryRepo.FindPendingRetries(ctx, w.config.BatchSize)
	if err != nil {
		w.logger.ErrorContext(ctx, "failed to find pending retries",
			"error", err,
		)
		return
	}

	if len(records) == 0 {
		w.logger.DebugContext(ctx, "no pending retries found")
		return
	}

	w.logger.InfoContext(ctx, "found pending retries from retry table",
		"count", len(records),
		"batchSize", w.config.BatchSize,
	)

	// 处理每条重试记录
	for _, record := range records {
		w.processRetry(ctx, record, retryRepo, opRepo)
	}
}

// processRetry 处理单个重试记录
func (w *RetryWorker) processRetry(
	ctx context.Context,
	record RetryRecord,
	retryRepo RetryRepository,
	opRepo OperationRepository,
) {
	w.logger.DebugContext(ctx, "processing retry",
		"opID", record.OpID,
		"retryCount", record.RetryCount,
	)

	// 检查是否超过最大重试次数
	if record.RetryCount >= w.config.MaxRetryCount {
		w.logger.WarnContext(ctx, "max retry count exceeded, marking as dead letter",
			"opID", record.OpID,
			"retryCount", record.RetryCount,
		)
		if err := retryRepo.MarkAsDeadLetter(ctx, record.OpID,
			fmt.Sprintf("exceeded max retry count (%d)", w.config.MaxRetryCount)); err != nil {
			w.logger.ErrorContext(ctx, "failed to mark as dead letter",
				"opID", record.OpID,
				"error", err,
			)
		}
		return
	}

	// 查找操作记录
	op, status, err := opRepo.FindByID(ctx, record.OpID)
	if err != nil {
		w.logger.ErrorContext(ctx, "failed to find operation for retry",
			"opID", record.OpID,
			"error", err,
		)
		nextRetry := w.calculateNextRetry(record.RetryCount)
		retryRepo.IncrementRetry(ctx, record.OpID, err.Error(), nextRetry)
		return
	}

	// 如果已经存证，删除重试记录
	if status == StatusTrustlogged {
		w.logger.InfoContext(ctx, "operation already trustlogged, removing retry record",
			"opID", record.OpID,
		)
		if err := retryRepo.DeleteRetry(ctx, record.OpID); err != nil {
			w.logger.ErrorContext(ctx, "failed to delete retry record",
				"opID", record.OpID,
				"error", err,
			)
		}
		return
	}

	// 尝试重新发布到存证系统
	// 这里需要根据实际的存证逻辑来实现
	// 示例：将操作发送到消息队列
	if err := w.republishOperation(ctx, op); err != nil {
		w.logger.ErrorContext(ctx, "failed to republish operation",
			"opID", record.OpID,
			"error", err,
		)
		nextRetry := w.calculateNextRetry(record.RetryCount)
		retryRepo.IncrementRetry(ctx, record.OpID, err.Error(), nextRetry)
		return
	}

	// 发布成功，更新状态为已存证
	if err := opRepo.UpdateStatus(ctx, record.OpID, StatusTrustlogged); err != nil {
		w.logger.ErrorContext(ctx, "failed to update operation status",
			"opID", record.OpID,
			"error", err,
		)
		return
	}

	// 删除重试记录
	if err := retryRepo.DeleteRetry(ctx, record.OpID); err != nil {
		w.logger.ErrorContext(ctx, "failed to delete retry record",
			"opID", record.OpID,
			"error", err,
		)
		return
	}

	w.logger.InfoContext(ctx, "operation retry successful",
		"opID", record.OpID,
		"retryCount", record.RetryCount,
	)
}

// republishOperation 重新发布操作到存证系统
// 注意：这里需要序列化为 Envelope 格式
func (w *RetryWorker) republishOperation(ctx context.Context, op *model.Operation) error {
	// 这里需要根据实际的发布逻辑来实现
	// 简化实现：假设 publisher 已经配置好
	if w.publisher == nil {
		return fmt.Errorf("publisher not configured")
	}

	// 注意：实际使用时需要使用 Envelope 序列化
	// 这里只是示例，具体实现需要在 HighClient 中集成
	w.logger.WarnContext(ctx, "republish not implemented yet, needs Envelope serialization",
		"opID", op.OpID,
	)
	return nil
}

// calculateNextRetry 计算下次重试时间（指数退避）
func (w *RetryWorker) calculateNextRetry(retryCount int) time.Time {
	backoff := float64(w.config.InitialBackoff)
	for i := 0; i < retryCount; i++ {
		backoff *= w.config.BackoffMultiplier
	}
	return time.Now().Add(time.Duration(backoff))
}