-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathliveliness.lua
312 lines (232 loc) · 7.43 KB
/
liveliness.lua
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
-- liveliness.lua
-- (C) 2009 Michael Meier
--
-- SECURITY: please note that the current mechanism for dealing with
-- incomplete contacts (missing unique) is very primitive and not
-- robust. it does handle the case in which a known socket may refer
-- to multiple uniques.
--
-- the current handling of active probes is very shaky and probably a
-- source of many funny and interesting race conditions. you have been
-- warned.
-- Prototype table for liveliness managers. Instances keep two lookup
-- tables of watchdogs: one keyed by a contact's unique, one keyed by
-- its "addr|port" socket string.
LivelinessManager = {}

-- Builds a new manager bound to `node`.
--   node: the owning node object; must not be nil.
-- Returns a table with empty `byunique` / `bysocket` indexes, the given
-- `node` and `maxretry` set to 3, using the receiver as its metatable.
-- Raises an error if `node` is nil.
function LivelinessManager:new(node)
    if node == nil then
        error("LivelinessManager needs a node")
    end
    -- method lookups on the instance fall back to the receiver
    self.__index = self
    return setmetatable({
        byunique = {},
        bysocket = {},
        node = node,
        maxretry = 3,
    }, self)
end
-- Looks up the watchdog registered for `contact`.
--   contact: table with `addr`, `port` and optionally `unique`.
-- Without a unique, only the "addr|port" index is consulted. With a
-- unique, the unique index is tried first and the socket index second;
-- whichever index produced the hit, the other one is updated to point
-- at the same watchdog.
-- Returns the watchdog, or nil when none is registered.
function LivelinessManager:findwatchdog(contact)
    local key = contact.addr .. "|" .. tostring(contact.port)
    local unique = contact.unique

    if not unique then
        return self.bysocket[key]
    end

    local found = self.byunique[unique]
    if found then
        -- known by unique: remember the socket it was seen on
        self.bysocket[key] = found
        return found
    end

    found = self.bysocket[key]
    if found then
        -- known by socket only: learn its unique
        self.byunique[unique] = found
    end
    return found
end
-- Creates, starts and registers a watchdog for `contact`.
--   contact: table with `addr`, `port`, `id` and optionally `unique`.
-- The watchdog is a loop started via srun() that reads events from its
-- own event pipe. Events handled (first value received from the pipe):
--   "out"          an RPC `rpcid` was sent to the contact
--   "in"           a packet from the contact arrived; all waiting
--                  "isstrong" clients are answered with true
--   "timeout"      RPC `rpcid` timed out
--   "isweak"       third value is {timediff=, retpipe=}; answers whether
--                  a packet arrived within the last `timediff`
--   "isstrong"     third value is {retpipe=}; the client is parked and
--                  an active probe is started unless one is running
--   "probetimeout" sent by the probe; third value is the step count
--   "die"          unregisters the watchdog and ends the loop
-- Any other event (including the "probereply" the probe sends on
-- success) matches no branch and is ignored.
-- Returns the watchdog struct {eventpipe=, lastcomm=}, which is also
-- stored in self.bysocket and, if the contact has a unique, in
-- self.byunique.
function LivelinessManager:newwatchdog(contact)
    local eventpipe = Channel:new()
    -- private copy of the contact (intentionally shadows the parameter);
    -- only its `unique` field is ever updated afterwards
    local contact = {
        addr = contact.addr,
        port = contact.port,
        id = contact.id,
        unique = contact.unique,
    }
    local socket = contact.addr .. "|" .. tostring(contact.port)
    local rpcsrunning = {}
    local waitingclients = {}
    local numwaitingclients = 0
    local probing = false
    local probingstep = 0
    local lastin = 0
    -- serves as a stop signal to probe(). probe() does however not
    -- stop if there are still clients waiting for their answers
    local stopprobing = false

    -- Adopts the unique from `newcontact` if ours is still unknown.
    local function updatecontact(newcontact)
        if not contact.unique and newcontact.unique then
            contact.unique = newcontact.unique
        end
    end

    -- Delay before the next probe attempt: timeout * 2^step plus a
    -- random extra of up to half that value.
    -- NOTE(review): `timeout` is not defined in this block; presumably a
    -- global set elsewhere -- confirm.
    local function probebackoff(step)
        local base = timeout * (2 ^ step)
        local jitter = (math.random() * base) / 2
        return base + jitter
    end

    -- Actively pings the contact up to self.maxretry times, reporting
    -- each failed attempt to the watchdog as "probetimeout".
    local function probe()
        probingstep = 0
        probing = true
        print("LIVELINESS: starting probe for " .. socket)
        while probingstep < self.maxretry do
            if stopprobing and numwaitingclients == 0 then
                stopprobing = false
                probing = false
                return
            end
            print("LIVELINESS: probing step " .. probingstep .. " @ " .. socket)
            local errorfree = self.node:ping(contact)
            -- the stop signal may have been raised while the ping ran
            if stopprobing and numwaitingclients == 0 then
                probing = false
                stopprobing = false
                return
            end
            if errorfree then
                print("LIVELINESS: probing step " .. probingstep .. " @ " .. socket .. " OK")
                probing = false
                eventpipe:send("probereply", contact)
                -- NOTE(review): flags are reset again after the send, as
                -- in the original; whether send() can let other events
                -- run in between is not visible here.
                probing = false
                stopprobing = false
                return
            else
                probingstep = probingstep + 1
                eventpipe:send("probetimeout", contact, probingstep)
                print("LIVELINESS: probing step " .. (probingstep - 1) .. " @ " .. socket .. " TIMEOUT")
                -- Only back off when another attempt follows. Sleeping
                -- after the final attempt would keep `probing` true for
                -- no reason, so an "isstrong" request arriving in that
                -- window would neither start a probe nor get an answer.
                if probingstep < self.maxretry then
                    ssleep(probebackoff(probingstep))
                end
            end
        end
        probing = false
        stopprobing = false
    end

    -- Event loop; see the function header for the event protocol.
    local function watchdog()
        print("LIVELINESS: starting watchdog for " .. socket)
        while true do
            local msg, rcontact, rpcid = eventpipe:receive()
            local now = ec.time()
            if msg == "out" then
                print("LIVELINESS: outgoing packet to " .. socket .. " >")
                updatecontact(rcontact)
                -- a nil key cannot be stored in a table (runtime error)
                if rpcid ~= nil then
                    rpcsrunning[rpcid] = {rpcid = rpcid, when = now}
                end
            elseif msg == "in" then
                updatecontact(rcontact)
                print("LIVELINESS: updating lastin of " .. socket .. " <")
                lastin = now
                if rpcid ~= nil then
                    rpcsrunning[rpcid] = nil
                end
                stopprobing = true
                -- notify all the waiting clients that the contact is alive
                for _, client in pairs(waitingclients) do
                    client.retpipe:sendasync(true, contact)
                end
                waitingclients = {}
                numwaitingclients = 0
            elseif msg == "timeout" then
                -- timeout should not yield information about unique
                if not rpcsrunning[rpcid] then
                    print("LIVELINESS: WARNING: timeout with no corresponding entry in rpcsrunning")
                end
                if rpcid ~= nil then
                    rpcsrunning[rpcid] = nil
                end
            elseif msg == "isweak" then
                -- for queries the third value carries the arguments
                local args = rpcid
                local timediff = args.timediff
                local retpipe = args.retpipe
                if (now - timediff) <= lastin then
                    retpipe:sendasync(true, contact)
                else
                    retpipe:sendasync(false, contact)
                end
            elseif msg == "isstrong" then
                local args = rpcid
                numwaitingclients = numwaitingclients + 1
                waitingclients[args] = args
                stopprobing = false
                if not probing then
                    srun(probe)
                end
            elseif msg == "probetimeout" then
                local stepattimeout = rpcid
                if stepattimeout >= self.maxretry then
                    -- all attempts failed: tell every waiting client
                    for _, client in pairs(waitingclients) do
                        client.retpipe:sendasync(false, contact)
                        -- TODO: should probably die after that
                    end
                    waitingclients = {}
                    numwaitingclients = 0
                end
            elseif msg == "die" then
                self.bysocket[socket] = nil
                if contact.unique then
                    self.byunique[contact.unique] = nil
                end
                return
            end
        end
    end

    srun(watchdog)

    local watchdogstruct = {
        eventpipe = eventpipe,
        lastcomm = 0,
    }
    if contact.unique then
        self.byunique[contact.unique] = watchdogstruct
    end
    self.bysocket[socket] = watchdogstruct
    return watchdogstruct
end
-- Reports that an RPC identified by `rpcid` is being sent to `contact`.
-- Uses the contact's existing watchdog or creates one, passes it an
-- "out" event via sendasync and stamps its `lastcomm` with ec.time().
function LivelinessManager:outcomm(contact, rpcid)
    local wd = self:findwatchdog(contact)
    if not wd then
        wd = self:newwatchdog(contact)
    end
    wd.eventpipe:sendasync("out", contact, rpcid)
    wd.lastcomm = ec.time()
end
-- Reports that a packet belonging to `rpcid` arrived from `contact`.
-- Uses the contact's existing watchdog or creates one, passes it an
-- "in" event via sendasync and stamps its `lastcomm` with ec.time().
function LivelinessManager:incomm(contact, rpcid)
    local wd = self:findwatchdog(contact)
    if not wd then
        wd = self:newwatchdog(contact)
    end
    wd.eventpipe:sendasync("in", contact, rpcid)
    wd.lastcomm = ec.time()
end
-- Reports that the RPC `rpcid` sent to `contact` timed out.
-- Unlike outcomm/incomm this never creates a watchdog: if none is
-- registered for the contact the call silently does nothing.
function LivelinessManager:timeout(contact, rpcid)
    local wd = self:findwatchdog(contact)
    if not wd then
        -- no watchdog for this contact; deliberately not an error
        return
    end
    wd.eventpipe:sendasync("timeout", contact, rpcid)
    wd.lastcomm = ec.time()
end
-- Passive liveness check: asks the contact's watchdog whether a packet
-- from the contact was seen within the last `timeframe` (default 10,
-- in the units of ec.time()).
-- Returns false when no watchdog exists for the contact; otherwise
-- returns whatever the watchdog sends back on the reply channel.
function LivelinessManager:isweaklyalive(contact, timeframe)
    local wd = self:findwatchdog(contact)
    if not wd then
        return false
    end
    local query = {
        timediff = timeframe or 10,
        retpipe = Channel:new(),
    }
    wd.eventpipe:sendasync("isweak", contact, query)
    return query.retpipe:receive()
end
-- Active liveness check: sends an "isstrong" request to the contact's
-- watchdog (creating the watchdog if necessary) and blocks on the reply
-- channel until an answer arrives.
-- If the first reply value is exactly false, the contact returned by
-- the watchdog is reported to the node's routing table via nodedown().
-- Returns all values received on the reply channel.
function LivelinessManager:isstronglyalive(contact)
    local wd = self:findwatchdog(contact)
    if not wd then
        wd = self:newwatchdog(contact)
    end

    local request = {retpipe = Channel:new()}
    wd.eventpipe:sendasync("isstrong", contact, request)

    local reply = {request.retpipe:receive()}
    if reply[1] == false then
        -- reply[2] is the contact as known to the watchdog
        self.node.routingtable:nodedown(reply[2])
    end
    return unpack(reply)
end