• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

ochaton / switchover / #96

pending completion
#96

push

ochaton
fix: Fixes ETCD endpoint selection and ignore_tarantool_quorum

28 of 28 new or added lines in 6 files covered. (100.0%)

5210 of 8029 relevant lines covered (64.89%)

963.0 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

59.55
/switchover/heal.lua
1
local M = {}
17✔
2
local fun = require 'fun'
17✔
3
local log = require 'log'
17✔
4
local json = require 'json'
17✔
5
local panic = require 'switchover._error'.panic
17✔
6
local e = require 'switchover._error'
17✔
7
local etcd = require 'switchover._global'.etcd
17✔
8
local Mutex = require 'switchover._mutex'
17✔
9

10
---@class etcdswitchargs
11
---@field public etcd_master ExtendedInstanceInfo etcd instance info
12
---@field public candidate_uuid string UUID of the candidate
13
---@field public etcd_master_name string name of the current master
14
---@field public etcd_shard Cluster etcd description of the shard
15

16
---Rewrites the shard's `<cluster_path>/master` key in ETCD via compare-and-swap.
---
---Result codes visible in this function:
---  * 0 - the candidate already is the ETCD master, or ETCD answered with
---        action `compareAndSwap`;
---  * 1 - the candidate is not found in the shard, or candidate and current
---        master report different `cluster` values.
---Any other ETCD answer is handed to `panic`.
---@param args etcdSwitchArgs
---@return integer code
function M.etcd_switch(args)
        ---@type ExtendedInstanceInfo
        local current = assert(args.etcd_master, "etcd_switch: .etcd_master required")
        ---@type string
        local new_uuid = assert(args.candidate_uuid, "etcd_switch: .candidate_uuid required")
        ---@type string
        local current_name = assert(args.etcd_master_name, "etcd_switch: .etcd_master_name required")
        ---@type Cluster
        local shard = assert(args.etcd_shard, "etcd_switch: .etcd_shard required")

        -- Nothing to change when ETCD already points at the candidate.
        if new_uuid == current.box.instance_uuid then
                log.info("%s already registered as master", new_uuid)
                return 0
        end

        ---@type ExtendedInstanceInfo
        local candidate = shard:instance_by_uuid(new_uuid)
        if not candidate then
                log.error("Instance %s wasn't found in etcd (at %s)", new_uuid, shard.name)
                return 1
        end

        -- Refuse to move the master record across clusters.
        if candidate.cluster ~= current.cluster then
                log.error("Candidate (%s/cluster:%s) and master (%s/cluster:%s) are from different clusters. Fix ETCD by yourself!",
                        candidate.name, candidate.cluster, current_name, current.cluster)
                return 1
        end

        log.warn("Changing master in ETCD: %s -> %s", current_name, candidate.name)

        -- CaS: the write is conditioned on the key still holding the old master name.
        local master_key = shard.cluster_path .. "/master"
        local cas_params = { prevValue = current_name, quorum = true }
        local request_opts = { leader = true }
        local reply = etcd:set(master_key, candidate.name, cas_params, request_opts)
        log.info("ETCD response: %s", json.encode(reply))

        if reply.action ~= 'compareAndSwap' then
                panic("ETCD CaS failed: %s", json.encode(reply))
                return 1
        end

        return 0
end
62

63
---@class etcdSwitchArgs
64
---@field public etcd_master ExtendedInstanceInfo etcd instance info
65
---@field public candidate_uuid string UUID of the candidate
66
---@field public etcd_master_name string name of the current master
67
---@field public etcd_shard Cluster etcd description of the shard
68
---@field public autofailover_timeout number Mutex timeout
69

70
---Runs `M.etcd_switch(args)` through `Mutex:atomic` on the shard's switchover path.
---
---Returns 0 when the candidate already is the ETCD master or when
---`Mutex:atomic` reports no error, and 1 when it reports one.
---@param args etcdSwitchArgs
---@return integer code
function M.safe_switch(args)
        ---@type ExtendedInstanceInfo
        local current = assert(args.etcd_master, "safe_switch: .etcd_master required")
        ---@type string
        local new_uuid = assert(args.candidate_uuid, "safe_switch: .candidate_uuid required")
        -- Only validated here; the value itself is consumed by M.etcd_switch.
        assert(args.etcd_master_name, "safe_switch: .etcd_master_name required")
        ---@type Cluster
        local shard = assert(args.etcd_shard, "safe_switch: .etcd_shard required")

        if new_uuid == current.box.instance_uuid then
                log.info("%s already registered as master", new_uuid)
                return 0
        end

        local lock_path = shard:switchover_path()
        log.info("heal: Taking mutex key: %s", lock_path)

        local lock_owner = string.format('switchover:heal:%s', new_uuid)
        -- The timeout never drops below 2.5; the mutex ttl is twice that value.
        local timeout = math.max(args.autofailover_timeout or 0, 2.5)

        local mutex = Mutex:new(lock_path)
        local ok, err = mutex:atomic({ key = lock_owner, ttl = 2*timeout }, M.etcd_switch, args)

        if not err then
                -- Everything is fine:
                log.info("Heal finished successfully")
                return 0
        end

        if ok then
                log.warn("Heal failed but replicaset is consistent. Reason: %s", err)
        else
                log.error("ALERT: Switchover ruined your replicaset. Restore it by yourself. Reason: %s", err)
        end

        return 1
end
110

111

112
---Entry point of the `heal` command.
---
---Resolves `args.shard`, discovers the replicaset and, when the replicaset
---master differs from the master recorded in ETCD, calls `M.etcd_switch` to
---record the replicaset master in ETCD.
---
---Returns 1 when the replicaset has no master or the master has fewer
---upstreams/downstreams than the computed quorum, 0 when ETCD already matches
---the replicaset master, otherwise whatever `M.etcd_switch` returns.
---@param args table expects `command == "heal"`, `shard` and optional `discovery_timeout`
---@return integer code
function M.run(args)
        assert(args.command == "heal")

        local resolve = require 'switchover._resolve'
        local how, _, shard = resolve({args.shard}, 'shard')
        if how ~= 'etcd' then
                e.panic("ETCD was not used")
        end

        local discovery = require 'switchover.discovery'.discovery
        local repl = discovery({
                endpoints = shard:endpoints(),
                discovery_timeout = args.discovery_timeout,
        })

        local etcd_master, etcd_master_name = shard:master()
        local master = repl:master()
        if not master then
                local ro_master = repl.replicas[etcd_master.box.instance_uuid]
                log.error("Master in replicaset not found. Should be: %s (%s)", etcd_master_name, ro_master)
                log.warn("Try switchover pr / switchover rr / switchover promote to bring master back")
                return 1
        end

        if etcd_master.box.instance_uuid == master:uuid() then
                log.info("ETCD master %s is actual replicaset master %s (nothing to do)", etcd_master_name, master.endpoint)
                return 0
        end

        -- Joins the endpoints of the given replicas into a comma-separated string.
        local function endpoints_csv(replicas)
                local endpoints = fun.iter(replicas):map(function(r) return r.endpoint end):totable()
                return table.concat(endpoints, ",")
        end

        -- Check liveness of replication of master
        local ups, downs = repl:score(master)

        -- Majority of the replica list, not counting the master itself.
        local quorum = math.ceil((#repl.replica_list+1)/2)-1
        if #ups < quorum then
                log.error("Master %s has too little upstreams: %s (required >= %s)", master.endpoint,
                        endpoints_csv(ups), quorum)
                return 1
        end
        if #downs < quorum then
                log.error("Master %s has too little downstreams: %s (required >= %s)", master.endpoint,
                        endpoints_csv(downs), quorum)
                return 1
        end

        local switch_args = {
                etcd_master = etcd_master,
                etcd_master_name = etcd_master_name,
                candidate_uuid = master:uuid(),
                etcd_shard = shard,
        }
        return M.etcd_switch(switch_args)
end
165

166
return M
17✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc