01907562-f5f0-40b6-8908-451d758b6138

Committed 02 Jul 2024 09:37PM UTC coverage: 71.512% (+0.003%) from 71.509%

Build # 01907562-f5f0-40b6-8908-451d758b6138

Build Type

Pull #6155

buildkite

Committed by

Groxx

Commit Message

Stop the ratelimiter collections when stopping the service

Pull Request Pull Request #6155: Stop the ratelimiter collections when stopping the service

Run Details

9 of 17 new or added lines in 1 file covered. (52.94%)

22 existing lines in 7 files now uncovered.

105315 of 147269 relevant lines covered (71.51%)

2600.5 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

78.13

/service/history/shard/context.go

// Copyright (c) 2020 Uber Technologies, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

//go:generate mockgen -package $GOPACKAGE -source $GOFILE -destination context_mock.go -package shard github.com/uber/cadence/history/shard/context Context

package shard

import (
        "context"
        "errors"
        "fmt"
        "math"
        "sync"
        "sync/atomic"
        "time"

        "github.com/uber/cadence/common"
        "github.com/uber/cadence/common/backoff"
        "github.com/uber/cadence/common/cache"
        "github.com/uber/cadence/common/clock"
        "github.com/uber/cadence/common/cluster"
        "github.com/uber/cadence/common/log"
        "github.com/uber/cadence/common/log/tag"
        "github.com/uber/cadence/common/metrics"
        "github.com/uber/cadence/common/persistence"
        "github.com/uber/cadence/common/types"
        "github.com/uber/cadence/service/history/config"
        "github.com/uber/cadence/service/history/engine"
        "github.com/uber/cadence/service/history/events"
        "github.com/uber/cadence/service/history/resource"
)

type (
        // Context represents a history engine shard
        Context interface {
                GetShardID() int
                GetService() resource.Resource
                GetExecutionManager() persistence.ExecutionManager
                GetHistoryManager() persistence.HistoryManager
                GetDomainCache() cache.DomainCache
                GetClusterMetadata() cluster.Metadata
                GetConfig() *config.Config
                GetEventsCache() events.Cache
                GetLogger() log.Logger
                GetThrottledLogger() log.Logger
                GetMetricsClient() metrics.Client
                GetTimeSource() clock.TimeSource
                PreviousShardOwnerWasDifferent() bool

                GetEngine() engine.Engine
                SetEngine(engine.Engine)

                GenerateTransferTaskID() (int64, error)
                GenerateTransferTaskIDs(number int) ([]int64, error)

                GetTransferMaxReadLevel() int64
                UpdateTimerMaxReadLevel(cluster string) time.Time

                SetCurrentTime(cluster string, currentTime time.Time)
                GetCurrentTime(cluster string) time.Time
                GetLastUpdatedTime() time.Time
                GetTimerMaxReadLevel(cluster string) time.Time

                GetTransferAckLevel() int64
                UpdateTransferAckLevel(ackLevel int64) error
                GetTransferClusterAckLevel(cluster string) int64
                UpdateTransferClusterAckLevel(cluster string, ackLevel int64) error
                GetTransferProcessingQueueStates(cluster string) []*types.ProcessingQueueState
                UpdateTransferProcessingQueueStates(cluster string, states []*types.ProcessingQueueState) error

                GetClusterReplicationLevel(cluster string) int64
                UpdateClusterReplicationLevel(cluster string, lastTaskID int64) error

                GetTimerAckLevel() time.Time
                UpdateTimerAckLevel(ackLevel time.Time) error
                GetTimerClusterAckLevel(cluster string) time.Time
                UpdateTimerClusterAckLevel(cluster string, ackLevel time.Time) error
                GetTimerProcessingQueueStates(cluster string) []*types.ProcessingQueueState
                UpdateTimerProcessingQueueStates(cluster string, states []*types.ProcessingQueueState) error

                UpdateTransferFailoverLevel(failoverID string, level TransferFailoverLevel) error
                DeleteTransferFailoverLevel(failoverID string) error
                GetAllTransferFailoverLevels() map[string]TransferFailoverLevel

                UpdateTimerFailoverLevel(failoverID string, level TimerFailoverLevel) error
                DeleteTimerFailoverLevel(failoverID string) error
                GetAllTimerFailoverLevels() map[string]TimerFailoverLevel

                GetDomainNotificationVersion() int64
                UpdateDomainNotificationVersion(domainNotificationVersion int64) error

                GetWorkflowExecution(ctx context.Context, request *persistence.GetWorkflowExecutionRequest) (*persistence.GetWorkflowExecutionResponse, error)
                CreateWorkflowExecution(ctx context.Context, request *persistence.CreateWorkflowExecutionRequest) (*persistence.CreateWorkflowExecutionResponse, error)
                UpdateWorkflowExecution(ctx context.Context, request *persistence.UpdateWorkflowExecutionRequest) (*persistence.UpdateWorkflowExecutionResponse, error)
                ConflictResolveWorkflowExecution(ctx context.Context, request *persistence.ConflictResolveWorkflowExecutionRequest) (*persistence.ConflictResolveWorkflowExecutionResponse, error)
                AppendHistoryV2Events(ctx context.Context, request *persistence.AppendHistoryNodesRequest, domainID string, execution types.WorkflowExecution) (*persistence.AppendHistoryNodesResponse, error)

                ReplicateFailoverMarkers(ctx context.Context, markers []*persistence.FailoverMarkerTask) error
                AddingPendingFailoverMarker(*types.FailoverMarkerAttributes) error
                ValidateAndUpdateFailoverMarkers() ([]*types.FailoverMarkerAttributes, error)
        }

        // TransferFailoverLevel contains corresponding start / end level
        TransferFailoverLevel struct {
                StartTime    time.Time
                MinLevel     int64
                CurrentLevel int64
                MaxLevel     int64
                DomainIDs    map[string]struct{}
        }

        // TimerFailoverLevel contains domain IDs and corresponding start / end level
        TimerFailoverLevel struct {
                StartTime    time.Time
                MinLevel     time.Time
                CurrentLevel time.Time
                MaxLevel     time.Time
                DomainIDs    map[string]struct{}
        }

        contextImpl struct {
                resource.Resource

                shardItem        *historyShardsItem
                shardID          int
                rangeID          int64
                executionManager persistence.ExecutionManager
                eventsCache      events.Cache
                closeCallback    func(int, *historyShardsItem)
                closedAt         atomic.Pointer[time.Time]
                config           *config.Config
                logger           log.Logger
                throttledLogger  log.Logger
                engine           engine.Engine

                sync.RWMutex
                lastUpdated               time.Time
                shardInfo                 *persistence.ShardInfo
                transferSequenceNumber    int64
                maxTransferSequenceNumber int64
                transferMaxReadLevel      int64
                timerMaxReadLevelMap      map[string]time.Time             // cluster -> timerMaxReadLevel
                transferFailoverLevels    map[string]TransferFailoverLevel // uuid -> TransferFailoverLevel
                timerFailoverLevels       map[string]TimerFailoverLevel    // uuid -> TimerFailoverLevel

                // exist only in memory
                remoteClusterCurrentTime map[string]time.Time

                // true if previous owner was different from the acquirer's identity.
                previousShardOwnerWasDifferent bool
        }
)

var _ Context = (*contextImpl)(nil)

type ErrShardClosed struct {
        Msg      string
        ClosedAt time.Time
}

var _ error = (*ErrShardClosed)(nil)

func (e *ErrShardClosed) Error() string {
        return e.Msg
}

const (
        TimeBeforeShardClosedIsError = 10 * time.Second
)

const (
        // transfer/cross cluster diff/lag is in terms of taskID, which is calculated based on shard rangeID
        // on shard movement, taskID will increase by around 1 million
        logWarnTransferLevelDiff    = 3000000 // 3 million
        logWarnCrossClusterLevelLag = 3000000 // 3 million
        logWarnTimerLevelDiff       = time.Duration(30 * time.Minute)
        historySizeLogThreshold     = 10 * 1024 * 1024
        minContextTimeout           = 1 * time.Second
)

func (s *contextImpl) GetShardID() int {
        return s.shardID
}

func (s *contextImpl) GetService() resource.Resource {
        return s.Resource
}

func (s *contextImpl) GetExecutionManager() persistence.ExecutionManager {
        return s.executionManager
}

func (s *contextImpl) GetEngine() engine.Engine {
        return s.engine
}

func (s *contextImpl) SetEngine(engine engine.Engine) {
        s.engine = engine
}

func (s *contextImpl) GenerateTransferTaskID() (int64, error) {
        s.Lock()
        defer s.Unlock()

        return s.generateTransferTaskIDLocked()
}

func (s *contextImpl) GenerateTransferTaskIDs(number int) ([]int64, error) {
        s.Lock()
        defer s.Unlock()

        result := []int64{}
        for i := 0; i < number; i++ {
                id, err := s.generateTransferTaskIDLocked()
                if err != nil {
                        return nil, err
                }
                result = append(result, id)
        }
        return result, nil
}

func (s *contextImpl) GetTransferMaxReadLevel() int64 {
        s.RLock()
        defer s.RUnlock()
        return s.transferMaxReadLevel
}

func (s *contextImpl) GetTransferAckLevel() int64 {
        s.RLock()
        defer s.RUnlock()

        return s.shardInfo.TransferAckLevel
}

func (s *contextImpl) UpdateTransferAckLevel(ackLevel int64) error {
        s.Lock()
        defer s.Unlock()

        s.shardInfo.TransferAckLevel = ackLevel
        s.shardInfo.StolenSinceRenew = 0
        return s.updateShardInfoLocked()
}

func (s *contextImpl) GetTransferClusterAckLevel(cluster string) int64 {
        s.RLock()
        defer s.RUnlock()

        // if we can find corresponding ack level
        if ackLevel, ok := s.shardInfo.ClusterTransferAckLevel[cluster]; ok {
                return ackLevel
        }
        // otherwise, default to existing ack level, which belongs to local cluster
        // this can happen if you add more cluster
        return s.shardInfo.TransferAckLevel
}

func (s *contextImpl) UpdateTransferClusterAckLevel(cluster string, ackLevel int64) error {
        s.Lock()
        defer s.Unlock()

        s.shardInfo.ClusterTransferAckLevel[cluster] = ackLevel
        s.shardInfo.StolenSinceRenew = 0
        return s.updateShardInfoLocked()
}

func (s *contextImpl) GetTransferProcessingQueueStates(cluster string) []*types.ProcessingQueueState {
        s.RLock()
        defer s.RUnlock()

        // if we can find corresponding processing queue states
        if states, ok := s.shardInfo.TransferProcessingQueueStates.StatesByCluster[cluster]; ok {
                return states
        }

        // check if we can find corresponding ack level
        var ackLevel int64
        var ok bool
        if ackLevel, ok = s.shardInfo.ClusterTransferAckLevel[cluster]; !ok {
                // otherwise, default to existing ack level, which belongs to local cluster
                // this can happen if you add more cluster
                ackLevel = s.shardInfo.TransferAckLevel
        }

        // otherwise, create default queue state based on existing ack level,
        // which belongs to local cluster. this can happen if you add more cluster
        return []*types.ProcessingQueueState{
                {
                        Level:    common.Int32Ptr(0),
                        AckLevel: common.Int64Ptr(ackLevel),
                        MaxLevel: common.Int64Ptr(math.MaxInt64),
                        DomainFilter: &types.DomainFilter{
                                ReverseMatch: true,
                        },
                },
        }
}

func (s *contextImpl) UpdateTransferProcessingQueueStates(cluster string, states []*types.ProcessingQueueState) error {
        s.Lock()
        defer s.Unlock()

        if len(states) == 0 {
                return errors.New("empty transfer processing queue states")
        }

        if s.shardInfo.TransferProcessingQueueStates.StatesByCluster == nil {
                s.shardInfo.TransferProcessingQueueStates.StatesByCluster = make(map[string][]*types.ProcessingQueueState)
        }
        s.shardInfo.TransferProcessingQueueStates.StatesByCluster[cluster] = states

        // for backward compatibility
        ackLevel := states[0].GetAckLevel()
        for _, state := range states {
                ackLevel = common.MinInt64(ackLevel, state.GetAckLevel())
        }
        s.shardInfo.ClusterTransferAckLevel[cluster] = ackLevel

        s.shardInfo.StolenSinceRenew = 0
        return s.updateShardInfoLocked()
}

func (s *contextImpl) GetClusterReplicationLevel(cluster string) int64 {
        s.RLock()
        defer s.RUnlock()

        // if we can find corresponding replication level
        if replicationLevel, ok := s.shardInfo.ClusterReplicationLevel[cluster]; ok {
                return replicationLevel
        }

        // New cluster always starts from -1
        return -1
}

func (s *contextImpl) UpdateClusterReplicationLevel(cluster string, lastTaskID int64) error {
        s.Lock()
        defer s.Unlock()

        s.shardInfo.ClusterReplicationLevel[cluster] = lastTaskID
        s.shardInfo.StolenSinceRenew = 0
        return s.updateShardInfoLocked()
}

func (s *contextImpl) GetTimerAckLevel() time.Time {
        s.RLock()
        defer s.RUnlock()

        return s.shardInfo.TimerAckLevel
}

func (s *contextImpl) UpdateTimerAckLevel(ackLevel time.Time) error {
        s.Lock()
        defer s.Unlock()

        s.shardInfo.TimerAckLevel = ackLevel
        s.shardInfo.StolenSinceRenew = 0
        return s.updateShardInfoLocked()
}

func (s *contextImpl) GetTimerClusterAckLevel(cluster string) time.Time {
        s.RLock()
        defer s.RUnlock()

        // if we can find corresponding ack level
        if ackLevel, ok := s.shardInfo.ClusterTimerAckLevel[cluster]; ok {
                return ackLevel
        }
        // otherwise, default to existing ack level, which belongs to local cluster
        // this can happen if you add more cluster
        return s.shardInfo.TimerAckLevel
}

func (s *contextImpl) UpdateTimerClusterAckLevel(cluster string, ackLevel time.Time) error {
        s.Lock()
        defer s.Unlock()

        s.shardInfo.ClusterTimerAckLevel[cluster] = ackLevel
        s.shardInfo.StolenSinceRenew = 0
        return s.updateShardInfoLocked()
}

func (s *contextImpl) GetTimerProcessingQueueStates(cluster string) []*types.ProcessingQueueState {
        s.RLock()
        defer s.RUnlock()

        // if we can find corresponding processing queue states
        if states, ok := s.shardInfo.TimerProcessingQueueStates.StatesByCluster[cluster]; ok {
                return states
        }

        // check if we can find corresponding ack level
        var ackLevel time.Time
        var ok bool
        if ackLevel, ok = s.shardInfo.ClusterTimerAckLevel[cluster]; !ok {
                // otherwise, default to existing ack level, which belongs to local cluster
                // this can happen if you add more cluster
                ackLevel = s.shardInfo.TimerAckLevel
        }

        // otherwise, create default queue state based on existing ack level,
        // which belongs to local cluster. this can happen if you add more cluster
        return []*types.ProcessingQueueState{
                {
                        Level:    common.Int32Ptr(0),
                        AckLevel: common.Int64Ptr(ackLevel.UnixNano()),
                        MaxLevel: common.Int64Ptr(math.MaxInt64),
                        DomainFilter: &types.DomainFilter{
                                ReverseMatch: true,
                        },
                },
        }
}

func (s *contextImpl) UpdateTimerProcessingQueueStates(cluster string, states []*types.ProcessingQueueState) error {
        s.Lock()
        defer s.Unlock()

        if len(states) == 0 {
                return errors.New("empty transfer processing queue states")
        }

        if s.shardInfo.TimerProcessingQueueStates.StatesByCluster == nil {
                s.shardInfo.TimerProcessingQueueStates.StatesByCluster = make(map[string][]*types.ProcessingQueueState)
        }
        s.shardInfo.TimerProcessingQueueStates.StatesByCluster[cluster] = states

        // for backward compatibility
        ackLevel := states[0].GetAckLevel()
        for _, state := range states {
                ackLevel = common.MinInt64(ackLevel, state.GetAckLevel())
        }
        s.shardInfo.ClusterTimerAckLevel[cluster] = time.Unix(0, ackLevel)

        s.shardInfo.StolenSinceRenew = 0
        return s.updateShardInfoLocked()
}

func (s *contextImpl) UpdateTransferFailoverLevel(failoverID string, level TransferFailoverLevel) error {
        s.Lock()
        defer s.Unlock()

        s.transferFailoverLevels[failoverID] = level
        return nil
}

func (s *contextImpl) DeleteTransferFailoverLevel(failoverID string) error {
        s.Lock()
        defer s.Unlock()

        if level, ok := s.transferFailoverLevels[failoverID]; ok {
                s.GetMetricsClient().RecordTimer(metrics.ShardInfoScope, metrics.ShardInfoTransferFailoverLatencyTimer, time.Since(level.StartTime))
                delete(s.transferFailoverLevels, failoverID)
        }
        return nil
}

func (s *contextImpl) GetAllTransferFailoverLevels() map[string]TransferFailoverLevel {
        s.RLock()
        defer s.RUnlock()

        ret := map[string]TransferFailoverLevel{}
        for k, v := range s.transferFailoverLevels {
                ret[k] = v
        }
        return ret
}

func (s *contextImpl) UpdateTimerFailoverLevel(failoverID string, level TimerFailoverLevel) error {
        s.Lock()
        defer s.Unlock()

        s.timerFailoverLevels[failoverID] = level
        return nil
}

func (s *contextImpl) DeleteTimerFailoverLevel(failoverID string) error {
        s.Lock()
        defer s.Unlock()

        if level, ok := s.timerFailoverLevels[failoverID]; ok {
                s.GetMetricsClient().RecordTimer(metrics.ShardInfoScope, metrics.ShardInfoTimerFailoverLatencyTimer, time.Since(level.StartTime))
                delete(s.timerFailoverLevels, failoverID)
        }
        return nil
}

func (s *contextImpl) GetAllTimerFailoverLevels() map[string]TimerFailoverLevel {
        s.RLock()
        defer s.RUnlock()

        ret := map[string]TimerFailoverLevel{}
        for k, v := range s.timerFailoverLevels {
                ret[k] = v
        }
        return ret
}

func (s *contextImpl) GetDomainNotificationVersion() int64 {
        s.RLock()
        defer s.RUnlock()

        return s.shardInfo.DomainNotificationVersion
}

func (s *contextImpl) UpdateDomainNotificationVersion(domainNotificationVersion int64) error {
        s.Lock()
        defer s.Unlock()

        s.shardInfo.DomainNotificationVersion = domainNotificationVersion
        return s.updateShardInfoLocked()
}

func (s *contextImpl) GetTimerMaxReadLevel(cluster string) time.Time {
        s.RLock()
        defer s.RUnlock()

        return s.timerMaxReadLevelMap[cluster]
}

func (s *contextImpl) UpdateTimerMaxReadLevel(cluster string) time.Time {
        s.Lock()
        defer s.Unlock()

        currentTime := s.GetTimeSource().Now()
        if cluster != "" && cluster != s.GetClusterMetadata().GetCurrentClusterName() {
                currentTime = s.remoteClusterCurrentTime[cluster]
        }

        s.timerMaxReadLevelMap[cluster] = currentTime.Add(s.config.TimerProcessorMaxTimeShift()).Truncate(time.Millisecond)
        return s.timerMaxReadLevelMap[cluster]
}

func (s *contextImpl) GetWorkflowExecution(
        ctx context.Context,
        request *persistence.GetWorkflowExecutionRequest,
) (*persistence.GetWorkflowExecutionResponse, error) {
        request.RangeID = atomic.LoadInt64(&s.rangeID) // This is to make sure read is not blocked by write, s.rangeID is synced with s.shardInfo.RangeID
        if err := s.closedError(); err != nil {
                return nil, err
        }
        return s.executionManager.GetWorkflowExecution(ctx, request)
}

func (s *contextImpl) CreateWorkflowExecution(
        ctx context.Context,
        request *persistence.CreateWorkflowExecutionRequest,
) (*persistence.CreateWorkflowExecutionResponse, error) {
        if err := s.closedError(); err != nil {
                return nil, err
        }

        ctx, cancel, err := s.ensureMinContextTimeout(ctx)
        if err != nil {
                return nil, err
        }
        if cancel != nil {
                defer cancel()
        }

        domainID := request.NewWorkflowSnapshot.ExecutionInfo.DomainID
        workflowID := request.NewWorkflowSnapshot.ExecutionInfo.WorkflowID

        // do not try to get domain cache within shard lock
        domainEntry, err := s.GetDomainCache().GetDomainByID(domainID)
        if err != nil {
                return nil, err
        }

        s.Lock()
        defer s.Unlock()

        transferMaxReadLevel := int64(0)
        if err := s.allocateTaskIDsLocked(
                domainEntry,
                workflowID,
                request.NewWorkflowSnapshot.TransferTasks,
                request.NewWorkflowSnapshot.CrossClusterTasks,
                request.NewWorkflowSnapshot.ReplicationTasks,
                request.NewWorkflowSnapshot.TimerTasks,
                &transferMaxReadLevel,
        ); err != nil {
                return nil, err
        }

        if err := s.closedError(); err != nil {
                return nil, err
        }
        currentRangeID := s.getRangeID()
        request.RangeID = currentRangeID

        response, err := s.executionManager.CreateWorkflowExecution(ctx, request)
        switch err.(type) {
        case nil:
                // Update MaxReadLevel if write to DB succeeds
                s.updateMaxReadLevelLocked(transferMaxReadLevel)
                return response, nil
        case *types.WorkflowExecutionAlreadyStartedError,
                *persistence.WorkflowExecutionAlreadyStartedError,
                *persistence.CurrentWorkflowConditionFailedError,
                *persistence.DuplicateRequestError,
                *types.ServiceBusyError:
                // No special handling required for these errors
                // We know write to DB fails if these errors are returned
                return nil, err
        case *persistence.ShardOwnershipLostError:
                {
                        // Shard is stolen, trigger shutdown of history engine
                        s.logger.Warn(
                                "Closing shard: CreateWorkflowExecution failed due to stolen shard.",
                                tag.Error(err),
                        )
                        s.closeShard()
                        return nil, err
                }
        default:
                {
                        // We have no idea if the write failed or will eventually make it to
                        // persistence. Increment RangeID to guarantee that subsequent reads
                        // will either see that write, or know for certain that it failed.
                        // This allows the callers to reliably check the outcome by performing
                        // a read.
                        err1 := s.renewRangeLocked(false)
                        if err1 != nil {
                                // At this point we have no choice but to unload the shard, so that it
                                // gets a new RangeID when it's reloaded.
                                s.logger.Warn(
                                        "Closing shard: CreateWorkflowExecution failed due to unknown error.",
                                        tag.Error(err),
                                )
                                s.closeShard()
                        }
                        return nil, err
                }
        }
}

func (s *contextImpl) getDefaultEncoding(domainName string) common.EncodingType {
        return common.EncodingType(s.config.EventEncodingType(domainName))
}

func (s *contextImpl) UpdateWorkflowExecution(
        ctx context.Context,
        request *persistence.UpdateWorkflowExecutionRequest,
) (*persistence.UpdateWorkflowExecutionResponse, error) {
        if err := s.closedError(); err != nil {
                return nil, err
        }
        ctx, cancel, err := s.ensureMinContextTimeout(ctx)
        if err != nil {
                return nil, err
        }
        if cancel != nil {
                defer cancel()
        }

        domainID := request.UpdateWorkflowMutation.ExecutionInfo.DomainID
        workflowID := request.UpdateWorkflowMutation.ExecutionInfo.WorkflowID

        // do not try to get domain cache within shard lock
        domainEntry, err := s.GetDomainCache().GetDomainByID(domainID)
        if err != nil {
                return nil, err
        }
        request.Encoding = s.getDefaultEncoding(domainEntry.GetInfo().Name)

        s.Lock()
        defer s.Unlock()

        transferMaxReadLevel := int64(0)
        if err := s.allocateTaskIDsLocked(
                domainEntry,
                workflowID,
                request.UpdateWorkflowMutation.TransferTasks,
                request.UpdateWorkflowMutation.CrossClusterTasks,
                request.UpdateWorkflowMutation.ReplicationTasks,
                request.UpdateWorkflowMutation.TimerTasks,
                &transferMaxReadLevel,
        ); err != nil {
                return nil, err
        }
        if request.NewWorkflowSnapshot != nil {
                if err := s.allocateTaskIDsLocked(
                        domainEntry,
                        workflowID,
                        request.NewWorkflowSnapshot.TransferTasks,
                        request.NewWorkflowSnapshot.CrossClusterTasks,
                        request.NewWorkflowSnapshot.ReplicationTasks,
                        request.NewWorkflowSnapshot.TimerTasks,
                        &transferMaxReadLevel,
                ); err != nil {
                        return nil, err
                }
        }

        if err := s.closedError(); err != nil {
                return nil, err
        }
        currentRangeID := s.getRangeID()
        request.RangeID = currentRangeID

        resp, err := s.executionManager.UpdateWorkflowExecution(ctx, request)
        switch err.(type) {
        case nil:
                // Update MaxReadLevel if write to DB succeeds
                s.updateMaxReadLevelLocked(transferMaxReadLevel)
                return resp, nil
        case *persistence.ConditionFailedError,
                *persistence.DuplicateRequestError,
                *types.ServiceBusyError:
                // No special handling required for these errors
                // We know write to DB fails if these errors are returned
                return nil, err
        case *persistence.ShardOwnershipLostError:
                {
                        // Shard is stolen, trigger shutdown of history engine
                        s.logger.Warn(
                                "Closing shard: UpdateWorkflowExecution failed due to stolen shard.",
                                tag.Error(err),
                        )
                        s.closeShard()
                        return nil, err
                }
        default:
                {
                        // We have no idea if the write failed or will eventually make it to
                        // persistence. Increment RangeID to guarantee that subsequent reads
                        // will either see that write, or know for certain that it failed.
                        // This allows the callers to reliably check the outcome by performing
                        // a read.
                        err1 := s.renewRangeLocked(false)
                        if err1 != nil {
                                // At this point we have no choice but to unload the shard, so that it
                                // gets a new RangeID when it's reloaded.
                                s.logger.Warn(
                                        "Closing shard: UpdateWorkflowExecution failed due to unknown error.",
                                        tag.Error(err),
                                )
                                s.closeShard()
                        }
                        return nil, err
                }
        }
}

func (s *contextImpl) ConflictResolveWorkflowExecution(
        ctx context.Context,
        request *persistence.ConflictResolveWorkflowExecutionRequest,
) (*persistence.ConflictResolveWorkflowExecutionResponse, error) {
        if err := s.closedError(); err != nil {
                return nil, err
        }

        ctx, cancel, err := s.ensureMinContextTimeout(ctx)
        if err != nil {
                return nil, err
        }
        if cancel != nil {
                defer cancel()
        }

        domainID := request.ResetWorkflowSnapshot.ExecutionInfo.DomainID
        workflowID := request.ResetWorkflowSnapshot.ExecutionInfo.WorkflowID

        // do not try to get domain cache within shard lock
        domainEntry, err := s.GetDomainCache().GetDomainByID(domainID)
        if err != nil {
                return nil, err
        }
        request.Encoding = s.getDefaultEncoding(domainEntry.GetInfo().Name)

        s.Lock()
        defer s.Unlock()

        transferMaxReadLevel := int64(0)
        if request.CurrentWorkflowMutation != nil {
                if err := s.allocateTaskIDsLocked(
                        domainEntry,
                        workflowID,
                        request.CurrentWorkflowMutation.TransferTasks,
                        request.CurrentWorkflowMutation.CrossClusterTasks,
                        request.CurrentWorkflowMutation.ReplicationTasks,
                        request.CurrentWorkflowMutation.TimerTasks,
                        &transferMaxReadLevel,
                ); err != nil {
                        return nil, err
                }
        }
        if err := s.allocateTaskIDsLocked(
                domainEntry,
                workflowID,
                request.ResetWorkflowSnapshot.TransferTasks,
                request.ResetWorkflowSnapshot.CrossClusterTasks,
                request.ResetWorkflowSnapshot.ReplicationTasks,
                request.ResetWorkflowSnapshot.TimerTasks,
                &transferMaxReadLevel,
        ); err != nil {
                return nil, err
        }
        if request.NewWorkflowSnapshot != nil {
                if err := s.allocateTaskIDsLocked(
                        domainEntry,
                        workflowID,
                        request.NewWorkflowSnapshot.TransferTasks,
                        request.NewWorkflowSnapshot.CrossClusterTasks,
                        request.NewWorkflowSnapshot.ReplicationTasks,
                        request.NewWorkflowSnapshot.TimerTasks,
                        &transferMaxReadLevel,
                ); err != nil {
                        return nil, err
                }
        }

        if err := s.closedError(); err != nil {
                return nil, err
        }
        currentRangeID := s.getRangeID()
        request.RangeID = currentRangeID
        resp, err := s.executionManager.ConflictResolveWorkflowExecution(ctx, request)
        switch err.(type) {
        case nil:
                // Update MaxReadLevel if write to DB succeeds
                s.updateMaxReadLevelLocked(transferMaxReadLevel)
                return resp, nil
        case *persistence.ConditionFailedError,
                *types.ServiceBusyError:
                // No special handling required for these errors
                // We know write to DB fails if these errors are returned
                return nil, err
        case *persistence.ShardOwnershipLostError:
                {
                        // RangeID might have been renewed by the same host while this update was in flight
                        // Retry the operation if we still have the shard ownership
                        // Shard is stolen, trigger shutdown of history engine
                        s.logger.Warn(
                                "Closing shard: ConflictResolveWorkflowExecution failed due to stolen shard.",
                                tag.Error(err),
                        )
                        s.closeShard()
                        return nil, err
                }
        default:
                {
                        // We have no idea if the write failed or will eventually make it to
                        // persistence. Increment RangeID to guarantee that subsequent reads
                        // will either see that write, or know for certain that it failed.
                        // This allows the callers to reliably check the outcome by performing
                        // a read.
                        err1 := s.renewRangeLocked(false)
                        if err1 != nil {
                                // At this point we have no choice but to unload the shard, so that it
                                // gets a new RangeID when it's reloaded.
                                s.logger.Warn(
                                        "Closing shard: ConflictResolveWorkflowExecution failed due to unknown error.",
                                        tag.Error(err),
                                )
                                s.closeShard()
                        }
                        return nil, err
                }
        }
}

func (s *contextImpl) ensureMinContextTimeout(
        parent context.Context,
) (context.Context, context.CancelFunc, error) {
        if err := parent.Err(); err != nil {
                return nil, nil, err
        }

        deadline, ok := parent.Deadline()
        if !ok || deadline.Sub(s.GetTimeSource().Now()) >= minContextTimeout {
                return parent, nil, nil
        }

        childCtx, cancel := context.WithTimeout(context.Background(), minContextTimeout)
        return childCtx, cancel, nil
}

func (s *contextImpl) AppendHistoryV2Events(
        ctx context.Context,
        request *persistence.AppendHistoryNodesRequest,
        domainID string,
        execution types.WorkflowExecution,
) (*persistence.AppendHistoryNodesResponse, error) {
        if err := s.closedError(); err != nil {
                return nil, err
        }

        domainName, err := s.GetDomainCache().GetDomainName(domainID)
        if err != nil {
                return nil, err
        }

        // NOTE: do not use generateNextTransferTaskIDLocked since
        // generateNextTransferTaskIDLocked is not guarded by lock
        transactionID, err := s.GenerateTransferTaskID()
        if err != nil {
                return nil, err
        }

        request.Encoding = s.getDefaultEncoding(domainName)
        request.ShardID = common.IntPtr(s.shardID)
        request.TransactionID = transactionID

        size := 0
        defer func() {
                s.GetMetricsClient().Scope(metrics.SessionSizeStatsScope, metrics.DomainTag(domainName)).
                        RecordTimer(metrics.HistorySize, time.Duration(size))
                if size >= historySizeLogThreshold {
                        s.throttledLogger.Warn("history size threshold breached",
                                tag.WorkflowID(execution.GetWorkflowID()),
                                tag.WorkflowRunID(execution.GetRunID()),
                                tag.WorkflowDomainID(domainID),
                                tag.WorkflowHistorySizeBytes(size))
                }
        }()
        resp, err0 := s.GetHistoryManager().AppendHistoryNodes(ctx, request)
        if resp != nil {
                size = len(resp.DataBlob.Data)
        }
        return resp, err0
}

func (s *contextImpl) GetConfig() *config.Config {
        return s.config
}

func (s *contextImpl) PreviousShardOwnerWasDifferent() bool {
        return s.previousShardOwnerWasDifferent
}

func (s *contextImpl) GetEventsCache() events.Cache {
        // the shard needs to be restarted to release the shard cache once global mode is on.
        if s.config.EventsCacheGlobalEnable() {
                return s.GetEventCache()
        }
        return s.eventsCache
}

func (s *contextImpl) GetLogger() log.Logger {
        return s.logger
}

func (s *contextImpl) GetThrottledLogger() log.Logger {
        return s.throttledLogger
}

func (s *contextImpl) getRangeID() int64 {
        return s.shardInfo.RangeID
}

func (s *contextImpl) closedError() error {
        closedAt := s.closedAt.Load()
        if closedAt == nil {
                return nil
        }

        return &ErrShardClosed{
                Msg:      "shard closed",
                ClosedAt: *closedAt,
        }
}

func (s *contextImpl) closeShard() {
        if !s.closedAt.CompareAndSwap(nil, common.TimePtr(time.Now())) {
                return
        }

        go func() {
                s.closeCallback(s.shardID, s.shardItem)
        }()

        // fails any writes that may start after this point.
        s.shardInfo.RangeID = -1
        atomic.StoreInt64(&s.rangeID, s.shardInfo.RangeID)
}

func (s *contextImpl) generateTransferTaskIDLocked() (int64, error) {
        if err := s.updateRangeIfNeededLocked(); err != nil {
                return -1, err
        }

        taskID := s.transferSequenceNumber
        s.transferSequenceNumber++

        return taskID, nil
}

func (s *contextImpl) updateRangeIfNeededLocked() error {
        if s.transferSequenceNumber < s.maxTransferSequenceNumber {
                return nil
        }

        return s.renewRangeLocked(false)
}

func (s *contextImpl) renewRangeLocked(isStealing bool) error {
        updatedShardInfo := s.shardInfo.Copy()
        updatedShardInfo.RangeID++
        if isStealing {
                updatedShardInfo.StolenSinceRenew++
        }

        var err error
        if err := s.closedError(); err != nil {
                return err
        }
        err = s.GetShardManager().UpdateShard(context.Background(), &persistence.UpdateShardRequest{
                ShardInfo:       updatedShardInfo,
                PreviousRangeID: s.shardInfo.RangeID})
        switch err.(type) {
        case nil:
        case *persistence.ShardOwnershipLostError:
                // Shard is stolen, trigger history engine shutdown
                s.logger.Warn(
                        "Closing shard: renewRangeLocked failed due to stolen shard.",
                        tag.Error(err),
                )
                s.closeShard()
        default:
                s.logger.Warn("UpdateShard failed with an unknown error.",
                        tag.Error(err),
                        tag.ShardRangeID(updatedShardInfo.RangeID),
                        tag.PreviousShardRangeID(s.shardInfo.RangeID))
        }
        if err != nil {
                // Failure in updating shard to grab new RangeID
                s.logger.Error("renewRangeLocked failed.",
                        tag.StoreOperationUpdateShard,
                        tag.Error(err),
                        tag.ShardRangeID(updatedShardInfo.RangeID),
                        tag.PreviousShardRangeID(s.shardInfo.RangeID))
                return err
        }

        // Range is successfully updated in cassandra now update shard context to reflect new range
        s.transferSequenceNumber = updatedShardInfo.RangeID << s.config.RangeSizeBits
        s.maxTransferSequenceNumber = (updatedShardInfo.RangeID + 1) << s.config.RangeSizeBits
        s.transferMaxReadLevel = s.transferSequenceNumber - 1
        atomic.StoreInt64(&s.rangeID, updatedShardInfo.RangeID)
        s.shardInfo = updatedShardInfo

        s.logger.Info("Range updated for shardID",
                tag.ShardRangeID(s.shardInfo.RangeID),
                tag.Number(s.transferSequenceNumber),
                tag.NextNumber(s.maxTransferSequenceNumber))
        return nil
}

func (s *contextImpl) updateMaxReadLevelLocked(rl int64) {
        if rl > s.transferMaxReadLevel {
                s.logger.Debug(fmt.Sprintf("Updating MaxReadLevel: %v", rl))
                s.transferMaxReadLevel = rl
        }
}

func (s *contextImpl) updateShardInfoLocked() error {
        return s.persistShardInfoLocked(false)
}

func (s *contextImpl) forceUpdateShardInfoLocked() error {
        return s.persistShardInfoLocked(true)
}

func (s *contextImpl) persistShardInfoLocked(
        isForced bool,
) error {

        if err := s.closedError(); err != nil {
                return err
        }

        var err error
        now := clock.NewRealTimeSource().Now()
        if !isForced && s.lastUpdated.Add(s.config.ShardUpdateMinInterval()).After(now) {
                return nil
        }
        updatedShardInfo := s.shardInfo.Copy()
        s.emitShardInfoMetricsLogsLocked()

        err = s.GetShardManager().UpdateShard(context.Background(), &persistence.UpdateShardRequest{
                ShardInfo:       updatedShardInfo,
                PreviousRangeID: s.shardInfo.RangeID,
        })

        if err != nil {
                // Shard is stolen, trigger history engine shutdown
                if _, ok := err.(*persistence.ShardOwnershipLostError); ok {
                        s.logger.Warn(
                                "Closing shard: updateShardInfoLocked failed due to stolen shard.",
                                tag.Error(err),
                        )
                        s.closeShard()
                }
        } else {
                s.lastUpdated = now
        }

        return err
}

func (s *contextImpl) emitShardInfoMetricsLogsLocked() {
        currentCluster := s.GetClusterMetadata().GetCurrentClusterName()
        clusterInfo := s.GetClusterMetadata().GetAllClusterInfo()

        minTransferLevel := s.shardInfo.ClusterTransferAckLevel[currentCluster]
        maxTransferLevel := s.shardInfo.ClusterTransferAckLevel[currentCluster]
        for clusterName, v := range s.shardInfo.ClusterTransferAckLevel {
                if !clusterInfo[clusterName].Enabled {
                        continue
                }

                if v < minTransferLevel {
                        minTransferLevel = v
                }
                if v > maxTransferLevel {
                        maxTransferLevel = v
                }
        }
        diffTransferLevel := maxTransferLevel - minTransferLevel

        minTimerLevel := s.shardInfo.ClusterTimerAckLevel[currentCluster]
        maxTimerLevel := s.shardInfo.ClusterTimerAckLevel[currentCluster]
        for clusterName, v := range s.shardInfo.ClusterTimerAckLevel {
                if !clusterInfo[clusterName].Enabled {
                        continue
                }

                if v.Before(minTimerLevel) {
                        minTimerLevel = v
                }
                if v.After(maxTimerLevel) {
                        maxTimerLevel = v
                }
        }
        diffTimerLevel := maxTimerLevel.Sub(minTimerLevel)

        replicationLag := s.transferMaxReadLevel - s.shardInfo.ReplicationAckLevel
        transferLag := s.transferMaxReadLevel - s.shardInfo.TransferAckLevel
        timerLag := time.Since(s.shardInfo.TimerAckLevel)

        transferFailoverInProgress := len(s.transferFailoverLevels)
        timerFailoverInProgress := len(s.timerFailoverLevels)

        if s.config.EmitShardDiffLog() &&
                (logWarnTransferLevelDiff < diffTransferLevel ||
                        logWarnTimerLevelDiff < diffTimerLevel ||
                        logWarnTransferLevelDiff < transferLag ||
                        logWarnTimerLevelDiff < timerLag) {

                logger := s.logger.WithTags(
                        tag.ShardTime(s.remoteClusterCurrentTime),
                        tag.ShardReplicationAck(s.shardInfo.ReplicationAckLevel),
                        tag.ShardTimerAcks(s.shardInfo.ClusterTimerAckLevel),
                        tag.ShardTransferAcks(s.shardInfo.ClusterTransferAckLevel),
                )

                logger.Warn("Shard ack levels diff exceeds warn threshold.")
        }

        metricsScope := s.GetMetricsClient().Scope(metrics.ShardInfoScope)
        metricsScope.RecordTimer(metrics.ShardInfoTransferDiffTimer, time.Duration(diffTransferLevel))
        metricsScope.RecordTimer(metrics.ShardInfoTimerDiffTimer, diffTimerLevel)

        metricsScope.RecordTimer(metrics.ShardInfoReplicationLagTimer, time.Duration(replicationLag))
        metricsScope.RecordTimer(metrics.ShardInfoTransferLagTimer, time.Duration(transferLag))
        metricsScope.RecordTimer(metrics.ShardInfoTimerLagTimer, timerLag)

        metricsScope.RecordTimer(metrics.ShardInfoTransferFailoverInProgressTimer, time.Duration(transferFailoverInProgress))
        metricsScope.RecordTimer(metrics.ShardInfoTimerFailoverInProgressTimer, time.Duration(timerFailoverInProgress))
}

func (s *contextImpl) allocateTaskIDsLocked(
        domainEntry *cache.DomainCacheEntry,
        workflowID string,
        transferTasks []persistence.Task,
        crossClusterTasks []persistence.Task,
        replicationTasks []persistence.Task,
        timerTasks []persistence.Task,
        transferMaxReadLevel *int64,
) error {

        if err := s.allocateTransferIDsLocked(
                transferTasks,
                transferMaxReadLevel,
        ); err != nil {
                return err
        }
        if err := s.allocateTransferIDsLocked(
                crossClusterTasks,
                transferMaxReadLevel,
        ); err != nil {
                return err
        }
        if err := s.allocateTimerIDsLocked(
                domainEntry,
                workflowID,
                timerTasks,
        ); err != nil {
                return err
        }

        // Ensure that task IDs for replication tasks are generated last.
        // This allows optimizing replication by checking whether there no potential tasks to read.
        return s.allocateTransferIDsLocked(
                replicationTasks,
                transferMaxReadLevel,
        )
}

func (s *contextImpl) allocateTransferIDsLocked(
        tasks []persistence.Task,
        transferMaxReadLevel *int64,
) error {
        now := s.GetTimeSource().Now()

        for _, task := range tasks {
                id, err := s.generateTransferTaskIDLocked()
                if err != nil {
                        return err
                }
                s.logger.Debug(fmt.Sprintf("Assigning task ID: %v", id))
                task.SetTaskID(id)
                // only set task visibility timestamp if it's not set
                if task.GetVisibilityTimestamp().IsZero() {
                        task.SetVisibilityTimestamp(now)
                }
                *transferMaxReadLevel = id
        }
        return nil
}

// NOTE: allocateTimerIDsLocked should always been called after assigning taskID for transferTasks when assigning taskID together,
// because Cadence Indexer assume timer taskID of deleteWorkflowExecution is larger than transfer taskID of closeWorkflowExecution
// for a given workflow.
func (s *contextImpl) allocateTimerIDsLocked(
        domainEntry *cache.DomainCacheEntry,
        workflowID string,
        timerTasks []persistence.Task,
) error {

        // assign IDs for the timer tasks. They need to be assigned under shard lock.
        currentCluster := s.GetClusterMetadata().GetCurrentClusterName()
        for _, task := range timerTasks {
                ts := task.GetVisibilityTimestamp()
                if task.GetVersion() != common.EmptyVersion {
                        // cannot use version to determine the corresponding cluster for timer task
                        // this is because during failover, timer task should be created as active
                        // or otherwise, failover + active processing logic may not pick up the task.
                        currentCluster = domainEntry.GetReplicationConfig().ActiveClusterName
                }
                readCursorTS := s.timerMaxReadLevelMap[currentCluster]
                if ts.Before(readCursorTS) {
                        // This can happen if shard move and new host have a time SKU, or there is db write delay.
                        // We generate a new timer ID using timerMaxReadLevel.
                        s.logger.Warn("New timer generated is less than read level",
                                tag.WorkflowDomainID(domainEntry.GetInfo().ID),
                                tag.WorkflowID(workflowID),
                                tag.Timestamp(ts),
                                tag.CursorTimestamp(readCursorTS),
                                tag.ValueShardAllocateTimerBeforeRead)
                        task.SetVisibilityTimestamp(s.timerMaxReadLevelMap[currentCluster].Add(time.Millisecond))
                }

                seqNum, err := s.generateTransferTaskIDLocked()
                if err != nil {
                        return err
                }
                task.SetTaskID(seqNum)
                visibilityTs := task.GetVisibilityTimestamp()
                s.logger.Debug(fmt.Sprintf("Assigning new timer (timestamp: %v, seq: %v)) ackLeveL: %v",
                        visibilityTs, task.GetTaskID(), s.shardInfo.TimerAckLevel))
        }
        return nil
}

func (s *contextImpl) SetCurrentTime(cluster string, currentTime time.Time) {
        s.Lock()
        defer s.Unlock()
        if cluster != s.GetClusterMetadata().GetCurrentClusterName() {
                prevTime := s.remoteClusterCurrentTime[cluster]
                if prevTime.Before(currentTime) {
                        s.remoteClusterCurrentTime[cluster] = currentTime
                }
        } else {
                panic("Cannot set current time for current cluster")
        }
}

func (s *contextImpl) GetCurrentTime(cluster string) time.Time {
        s.RLock()
        defer s.RUnlock()
        if cluster != s.GetClusterMetadata().GetCurrentClusterName() {
                return s.remoteClusterCurrentTime[cluster]
        }
        return s.GetTimeSource().Now()
}

func (s *contextImpl) GetLastUpdatedTime() time.Time {
        s.RLock()
        defer s.RUnlock()
        return s.lastUpdated
}

func (s *contextImpl) ReplicateFailoverMarkers(
        ctx context.Context,
        markers []*persistence.FailoverMarkerTask,
) error {
        if err := s.closedError(); err != nil {
                return err
        }

        tasks := make([]persistence.Task, 0, len(markers))
        for _, marker := range markers {
                tasks = append(tasks, marker)
        }

        s.Lock()
        defer s.Unlock()

        transferMaxReadLevel := int64(0)
        if err := s.allocateTransferIDsLocked(
                tasks,
                &transferMaxReadLevel,
        ); err != nil {
                return err
        }

        var err error
        if err := s.closedError(); err != nil {
                return err
        }
        err = s.executionManager.CreateFailoverMarkerTasks(
                ctx,
                &persistence.CreateFailoverMarkersRequest{
                        RangeID: s.getRangeID(),
                        Markers: markers,
                },
        )
        switch err.(type) {
        case nil:
                // Update MaxReadLevel if write to DB succeeds
                s.updateMaxReadLevelLocked(transferMaxReadLevel)
        case *persistence.ShardOwnershipLostError:
                // do not retry on ShardOwnershipLostError
                s.logger.Warn(
                        "Closing shard: ReplicateFailoverMarkers failed due to stolen shard.",
                        tag.Error(err),
                )
                s.closeShard()
        default:
                s.logger.Error(
                        "Failed to insert the failover marker into replication queue.",
                        tag.Error(err),
                )
        }
        return err
}

func (s *contextImpl) AddingPendingFailoverMarker(
        marker *types.FailoverMarkerAttributes,
) error {

        domainEntry, err := s.GetDomainCache().GetDomainByID(marker.GetDomainID())
        if err != nil {
                return err
        }
        // domain is active, the marker is expired
        isActive, _ := domainEntry.IsActiveIn(s.GetClusterMetadata().GetCurrentClusterName())
        if isActive || domainEntry.GetFailoverVersion() > marker.GetFailoverVersion() {
                s.logger.Info("Skipped out-of-date failover marker", tag.WorkflowDomainName(domainEntry.GetInfo().Name))
                return nil
        }

        s.Lock()
        defer s.Unlock()

        s.shardInfo.PendingFailoverMarkers = append(s.shardInfo.PendingFailoverMarkers, marker)
        return s.forceUpdateShardInfoLocked()
}

func (s *contextImpl) ValidateAndUpdateFailoverMarkers() ([]*types.FailoverMarkerAttributes, error) {

        completedFailoverMarkers := make(map[*types.FailoverMarkerAttributes]struct{})
        s.RLock()
        for _, marker := range s.shardInfo.PendingFailoverMarkers {
                domainEntry, err := s.GetDomainCache().GetDomainByID(marker.GetDomainID())
                if err != nil {
                        s.RUnlock()
                        return nil, err
                }
                isActive, _ := domainEntry.IsActiveIn(s.GetClusterMetadata().GetCurrentClusterName())
                if isActive || domainEntry.GetFailoverVersion() > marker.GetFailoverVersion() {
                        completedFailoverMarkers[marker] = struct{}{}
                }
        }

        if len(completedFailoverMarkers) == 0 {
                s.RUnlock()
                return s.shardInfo.PendingFailoverMarkers, nil
        }
        s.RUnlock()

        // clean up all pending failover tasks
        s.Lock()
        defer s.Unlock()

        for idx, marker := range s.shardInfo.PendingFailoverMarkers {
                if _, ok := completedFailoverMarkers[marker]; ok {
                        s.shardInfo.PendingFailoverMarkers[idx] = s.shardInfo.PendingFailoverMarkers[len(s.shardInfo.PendingFailoverMarkers)-1]
                        s.shardInfo.PendingFailoverMarkers[len(s.shardInfo.PendingFailoverMarkers)-1] = nil
                        s.shardInfo.PendingFailoverMarkers = s.shardInfo.PendingFailoverMarkers[:len(s.shardInfo.PendingFailoverMarkers)-1]
                }
        }
        if err := s.updateShardInfoLocked(); err != nil {
                return nil, err
        }

        return s.shardInfo.PendingFailoverMarkers, nil
}

func acquireShard(
        shardItem *historyShardsItem,
        closeCallback func(int, *historyShardsItem),
) (Context, error) {

        var shardInfo *persistence.ShardInfo

        retryPolicy := backoff.NewExponentialRetryPolicy(50 * time.Millisecond)
        retryPolicy.SetMaximumInterval(time.Second)
        retryPolicy.SetExpirationInterval(5 * time.Second)

        retryPredicate := func(err error) bool {
                if persistence.IsTransientError(err) {
                        return true
                }
                _, ok := err.(*persistence.ShardAlreadyExistError)
                return ok
        }

        getShard := func() error {
                resp, err := shardItem.GetShardManager().GetShard(context.Background(), &persistence.GetShardRequest{
                        ShardID: shardItem.shardID,
                })
                if err == nil {
                        shardInfo = resp.ShardInfo
                        return nil
                }
                if _, ok := err.(*types.EntityNotExistsError); !ok {
                        return err
                }

                // EntityNotExistsError error
                shardInfo = &persistence.ShardInfo{
                        ShardID:          shardItem.shardID,
                        RangeID:          0,
                        TransferAckLevel: 0,
                }
                return shardItem.GetShardManager().CreateShard(context.Background(), &persistence.CreateShardRequest{ShardInfo: shardInfo})
        }

        throttleRetry := backoff.NewThrottleRetry(
                backoff.WithRetryPolicy(retryPolicy),
                backoff.WithRetryableError(retryPredicate),
        )
        err := throttleRetry.Do(context.Background(), getShard)
        if err != nil {
                shardItem.logger.Error("Fail to acquire shard.", tag.Error(err))
                return nil, err
        }

        updatedShardInfo := shardInfo.Copy()
        ownershipChanged := shardInfo.Owner != shardItem.GetHostInfo().Identity()
        updatedShardInfo.Owner = shardItem.GetHostInfo().Identity()

        // initialize the cluster current time to be the same as ack level
        remoteClusterCurrentTime := make(map[string]time.Time)
        timerMaxReadLevelMap := make(map[string]time.Time)
        for clusterName := range shardItem.GetClusterMetadata().GetEnabledClusterInfo() {
                if clusterName != shardItem.GetClusterMetadata().GetCurrentClusterName() {
                        if currentTime, ok := shardInfo.ClusterTimerAckLevel[clusterName]; ok {
                                remoteClusterCurrentTime[clusterName] = currentTime
                                timerMaxReadLevelMap[clusterName] = currentTime
                        } else {
                                remoteClusterCurrentTime[clusterName] = shardInfo.TimerAckLevel
                                timerMaxReadLevelMap[clusterName] = shardInfo.TimerAckLevel
                        }
                } else { // active cluster
                        timerMaxReadLevelMap[clusterName] = shardInfo.TimerAckLevel
                }

                timerMaxReadLevelMap[clusterName] = timerMaxReadLevelMap[clusterName].Truncate(time.Millisecond)
        }

        if updatedShardInfo.TransferProcessingQueueStates == nil {
                updatedShardInfo.TransferProcessingQueueStates = &types.ProcessingQueueStates{
                        StatesByCluster: make(map[string][]*types.ProcessingQueueState),
                }
        }
        if updatedShardInfo.TimerProcessingQueueStates == nil {
                updatedShardInfo.TimerProcessingQueueStates = &types.ProcessingQueueStates{
                        StatesByCluster: make(map[string][]*types.ProcessingQueueState),
                }
        }

        executionMgr, err := shardItem.GetExecutionManager(shardItem.shardID)
        if err != nil {
                return nil, err
        }

        context := &contextImpl{
                Resource:                       shardItem.Resource,
                shardItem:                      shardItem,
                shardID:                        shardItem.shardID,
                executionManager:               executionMgr,
                shardInfo:                      updatedShardInfo,
                closeCallback:                  closeCallback,
                config:                         shardItem.config,
                remoteClusterCurrentTime:       remoteClusterCurrentTime,
                timerMaxReadLevelMap:           timerMaxReadLevelMap, // use ack to init read level
                transferFailoverLevels:         make(map[string]TransferFailoverLevel),
                timerFailoverLevels:            make(map[string]TimerFailoverLevel),
                logger:                         shardItem.logger,
                throttledLogger:                shardItem.throttledLogger,
                previousShardOwnerWasDifferent: ownershipChanged,
        }

        // TODO remove once migrated to global event cache
        context.eventsCache = events.NewCache(
                context.shardID,
                context.Resource.GetHistoryManager(),
                context.config,
                context.logger,
                context.Resource.GetMetricsClient(),
                shardItem.GetDomainCache(),
        )

        context.logger.Debug(fmt.Sprintf("Global event cache mode: %v", context.config.EventsCacheGlobalEnable()))

        err1 := context.renewRangeLocked(true)
        if err1 != nil {
                return nil, err1
        }

        return context, nil
}

uber / cadence / 01907562-f5f0-40b6-8908-451d758b6138

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous