4. ...except it does
C bugs, untrapped exceptions, infinite loops,
blocking calls, thread deadlocks, inconsistent
resident state
5. Recovery is important
"[S]ystem failure can usually be considered to
be the result of two program errors[...] the
second, in the recovery routine[...]"
10. Example: Counting
def update_counter():
    """Increment the integer stored in ``counter.txt`` by one.

    The new value is written in full to ``counter.txt.tmp`` and only then
    renamed over ``counter.txt``, so a crash at any point leaves either the
    old or the new value in ``counter.txt``.

    Raises:
        IOError/OSError: if ``counter.txt`` cannot be read or the temporary
            file cannot be written or renamed.
        ValueError: if ``counter.txt`` does not contain an integer.
    """
    # ``with`` closes the read handle; the original left it open.
    with open("counter.txt") as fp:
        counter = int(fp.read().strip())
    counter += 1
    # If there is a crash before this point,
    # no changes have been done.
    with open("counter.txt.tmp", 'w') as fp:
        fp.write('%d\n' % counter)
    # If there is a crash before this point,
    # only a temp file has been modified
    # The following is an atomic operation
    os.rename("counter.txt.tmp", "counter.txt")
19. Watchdog: Heartbeats
## In a Twisted process
def beat():
    """Record a heartbeat by refreshing the mtime of ``beats/my-name``.

    The file is created if it does not exist yet. The ``beats`` directory
    must already exist.
    """
    # Local import: this snippet has no import block of its own.
    import os

    path = 'beats/my-name'
    # Opening in append mode creates the file when missing...
    open(path, 'a').close()
    # ...but does not touch the mtime of an existing file, so set it
    # explicitly to "now".
    os.utime(path, None)
# Have Twisted call beat() repeatedly, with an interval of 30 between calls.
task.LoopingCall(beat).start(30)
20. Watchdog: Get time-outs
def getTimeout():
    """Return a dict mapping every ``hearts/*`` path to ``now - value``.

    Each file under ``hearts/`` must contain a single integer. The mapped
    value is the current time minus that integer; markProblems() compares
    heartbeat mtimes against it.
    NOTE(review): presumably the integer is the allowed silence in seconds,
    which makes the result the oldest acceptable heartbeat time -- confirm.

    Raises:
        ValueError: if a file does not contain an integer.
    """
    now = time.time()
    timeout = dict()
    for heart in glob.glob('hearts/*'):
        # ``with`` closes each file promptly instead of leaking the handle.
        with open(heart) as fp:
            beat = int(fp.read().strip())
        timeout[heart] = now - beat
    return timeout
21. Watchdog: Mark problems
def markProblems():
    """Create ``problems/<name>`` for every heart whose beat is too old.

    A heart is flagged when the mtime of ``beats/<name>`` is older than the
    cut-off getTimeout() reports for the same name and no problem file
    exists yet. Beat files without a matching cut-off are skipped.
    """
    # Key the cut-offs by bare file name: getTimeout() keys are paths under
    # another directory, so the 'beats/...' path itself never matches them.
    cutoffs = dict((os.path.basename(path), cutoff)
                   for path, cutoff in getTimeout().items())
    for heart in glob.glob('beats/*'):
        name = os.path.basename(heart)
        if name not in cutoffs:
            continue
        # Problem files live directly in problems/, which is where
        # checkSolutions() looks for them.
        problem = 'problems/' + name
        if (os.path.getmtime(heart) < cutoffs[name] and
                not os.path.isfile(problem)):
            with open(problem, 'w') as fp:
                fp.write('watchdog')
22. Watchdog: check solutions
def checkSolutions(grace=30):
    """Run ``restart-system`` for each problem file older than ``grace``.

    Args:
        grace: age in seconds (by mtime) a file under ``problems/`` may
            reach before the restart command is invoked. Defaults to 30,
            the previously hard-coded value.

    Raises:
        OSError: if the ``restart-system`` command cannot be executed.
    """
    problemTimeout = time.time() - grace
    for problem in glob.glob('problems/*'):
        if os.path.getmtime(problem) < problemTimeout:
            # One invocation per stale problem, as before.
            subprocess.call(['restart-system'])
30. Example: Counting on Windows
def update_counter():
    """Increment the integer in ``counter.txt`` without renaming over it.

    The existing file is removed before the temporary file is renamed into
    place, so there is a short window in which ``counter.txt`` does not
    exist; recover() repairs that state after a crash.

    Raises:
        IOError/OSError: if a file cannot be read, written, removed or
            renamed.
        ValueError: if ``counter.txt`` does not contain an integer.
    """
    # ``with`` closes the read handle before the file is removed below.
    with open("counter.txt") as fp:
        counter = int(fp.read().strip())
    counter += 1
    # If there is a crash before this point,
    # no changes have been done.
    with open("counter.txt.tmp", 'w') as fp:
        fp.write('%d\n' % counter)
    # If there is a crash before this point,
    # only a temp file has been modified
    os.remove("counter.txt")
    # At this point, the state is inconsistent*
    # The following is an atomic operation
    # (this rename was missing: without it the counter file stays deleted)
    os.rename("counter.txt.tmp", "counter.txt")
32. Example: Counting on Windows
(Recovery)
def recover():
    """Restore ``counter.txt`` from ``counter.txt.tmp`` after a crash.

    Does nothing when ``counter.txt`` exists, or when there is no temporary
    file to restore from (for example on a fresh installation).
    """
    if os.path.exists("counter.txt"):
        return
    # The permanent file has been removed
    # Therefore, the temp file is valid
    if os.path.exists("counter.txt.tmp"):
        os.rename("counter.txt.tmp",
                  "counter.txt")
33. Example: Counting with versions
def update_counter():
    """Write the incremented counter as a new ``counter.<N+1>`` file.

    The current value is read from the ``counter.<N>`` file with the
    highest N in the working directory. Existing version files are never
    modified; cleanup() removes superseded ones.

    Raises:
        ValueError: if there is no ``counter.<N>`` file, or a suffix or
            file content is not an integer.
    """
    versions = [int(name.split('.')[-1])
                for name in os.listdir('.')
                if name.startswith('counter.')]
    last = max(versions)
    with open('counter.%s' % last) as fp:
        counter = int(fp.read().strip())
    counter += 1
    # If there is a crash before this point,
    # no changes have been done.
    with open("tmp.counter", 'w') as fp:
        fp.write('%d\n' % counter)
    # If there is a crash before this point,
    # only a temp file has been modified
    # Publish the new value under the next version number; without this
    # step the increment would never become visible to readers.
    os.rename("tmp.counter", 'counter.%s' % (last + 1))
35. Example: Counting with versions
(cleanup)
# This is not a recovery routine, but a cleanup
# routine.
# Even in its absence, the state is consistent
def cleanup():
    """Delete superseded ``counter.<N>`` files and any ``tmp.counter``.

    Only the highest-numbered version file is kept. With no version files
    present, only the temporary file (if any) is removed.
    """
    versions = sorted(int(name.split('.')[-1])
                      for name in os.listdir('.')
                      if name.startswith('counter.'))
    # The slice drops the newest version and is simply empty when there are
    # no versions at all (the original ``pop()`` raised IndexError then).
    for n in versions[:-1]:
        os.remove('counter.%d' % n)
    if os.path.exists('tmp.counter'):
        os.remove('tmp.counter')
36. Correct ordering
def activate_due():
    """Move every scheduled element whose due time has passed to 'activated'.

    For each member of the 'scheduled' set, the integer stored under
    ``<el>:due`` is compared with the current time. Due elements are added
    to 'activated', their due key is deleted, and they are removed from
    'scheduled', in that order.
    """
    now = time.time()
    for el in rs.smembers('scheduled'):
        due = int(rs.get(el+':due'))
        if now < due:
            continue
        # Add to 'activated' first: a crash part-way through then leaves the
        # element in both sets, which recover() detects and finishes.
        rs.sadd('activated', el)
        rs.delete(el+':due')
        # redis-py names the set-removal command ``srem``.
        rs.srem('scheduled', el)
37. Correct ordering (recovery)
def recover():
    """Finish activations that were interrupted part-way through.

    Every element present in both 'activated' and 'scheduled' has its due
    key deleted and is removed from 'scheduled'.
    """
    inconsistent = rs.sinter('activated',
                             'scheduled')
    for el in inconsistent:
        rs.delete(el+':due') #*
        # redis-py names the set-removal command ``srem``.
        rs.srem('scheduled', el)
40. Example: Key/value stores (utility
functions)
## Get the level of a file
def getLevel(s):
    """Return the integer before the first '.' in file name ``s``.

    Raises:
        ValueError: if that prefix is not an integer.
    """
    return int(s.split('.')[0])
## Get all files of a given type
def getType(tp, names=None):
    """Return ``(level, name)`` pairs for the names ending in ``tp``.

    Args:
        tp: file-name suffix to select, e.g. ``'.log'``.
        names: iterable of file names to filter. When omitted, the
            module-level ``files`` listing is used, as before.
    """
    if names is None:
        names = files
    return [(getLevel(s), s)
            for s in names if s.endswith(tp)]
41. Example: Key/value stores
(classifying files)
## Get all relevant files
def relevant(d):
    """Return the newest master in ``d`` followed by the logs after it.

    The result is a list of bare file names: the ``.master`` file with the
    highest level first, then every ``.log`` file whose level is greater
    than that master's, in ascending level order.

    Raises:
        ValueError: if ``d`` contains no ``.master`` file.
    """
    names = os.listdir(d)
    # Classify the listing locally: a local variable here is not visible to
    # a helper that reads a module-level ``files``.
    masters = [(getLevel(s), s) for s in names if s.endswith('.master')]
    mlevel, master = max(masters)
    logs = sorted((getLevel(s), s) for s in names if s.endswith('.log'))
    # ``master`` is a single name, so wrap it before concatenating lists.
    return [master] + [log for llevel, log in logs
                       if llevel > mlevel]
42. Example: Key/value stores (reading)
## Read in a single file
def update(result, fp):
    """Replay the JSON records in ``fp`` into the dict ``result``.

    Each line is a JSON list. ``["add", key, value]`` stores the value;
    any other record deletes ``key`` (its second item) if it is present.

    Args:
        result: dict updated in place.
        fp: iterable of text lines, e.g. an open file.

    Raises:
        ValueError: if a line is not valid JSON.
    """
    for line in fp:
        val = json.loads(line)
        if val[0] == 'add':
            result[val[1]] = val[2]
        else:
            # Tolerate removals of absent keys: Store.remove() logs them
            # without checking, and replay must not fail on such a record.
            result.pop(val[1], None)
## Read in several files
# Starts from an empty dict and applies update() to each file in `files`,
# in the order given.
# NOTE(review): this snippet stops inside the `try:` block. The exception
# handler and whatever follows the loop are not shown, so the error handling
# cannot be documented from here. compress() iterates the value returned by
# read(files) as a dict, so `result` is presumably returned -- confirm
# against the full source.
def read(files):
result = dict()
for fname in files:
try:
update(result, file(fname))
46. Example: Key/value stores (storage
class)
## The actual data store abstraction.
class Store(object):
    """In-memory key/value view backed by master and log files on disk."""

    def __init__(self, d='.'):
        """Load the current state from directory ``d`` and open a writer.

        Args:
            d: directory holding the store's files. Defaults to the current
                directory (the original body used an undefined ``d``).
        """
        files = relevant(d)
        # Contents replayed from the master file and the logs after it.
        self.result = read(files)
        level = getLevel(files[-1])
        # Receives every change made through add() and remove().
        self.writer = Writer(level)

    def get(self, key):
        """Return the value stored under ``key``; raises KeyError if absent."""
        return self.result[key]

    def add(self, key, value):
        """Store ``value`` under ``key``."""
        # Log first, then apply in memory, so get() sees the change.
        self.writer.write(['add', key, value])
        self.result[key] = value

    def remove(self, key):
        """Remove ``key``; removing an absent key is still logged."""
        self.writer.write(['remove', key])
        self.result.pop(key, None)
47. Example: Key/value stores
(compression code)
## This should be run periodically
# from a different thread
def compress(d):
    """Write a snapshot of all files in ``d`` except the newest one.

    The snapshot contains one ``["add", key, value]`` JSON line per key and
    goes to ``<level>.master.tmp`` in the current directory, where
    ``<level>`` is one more than the level of the last file included.
    Nothing is written when fewer than two files qualify.

    NOTE(review): the temporary file is not renamed to a ``.master`` name
    here; confirm where the snapshot is meant to be promoted.
    """
    # Everything but the last relevant file.
    files = relevant(d)[:-1]
    if len(files) < 2:
        return
    result = read(files)
    master = getLevel(files[-1]) + 1
    # ``with`` closes the file even if serialization fails part-way.
    with open('%3d.master.tmp' % master, 'w') as fp:
        for key, value in result.items():
            towrite = ['add', key, value]
            fp.write(json.dumps(towrite) + '\n')
48. Vertical splitting: Example
def forking_server():
    """Serve a fixed timestamp on TCP port 8080, one child per connection.

    The parent accepts connections forever. Each connection is handled in a
    forked child that writes the reply and exits. This function does not
    return.
    """
    s = socket.socket()
    s.bind(('', 8080))
    s.listen(5)
    while True:
        # accept() returns a (connection, address) pair.
        client, _addr = s.accept()
        newpid = os.fork()
        if newpid == 0:
            # fork() returns 0 in the child, which serves this one client.
            status = 1
            try:
                s.close()
                f = client.makefile('w')
                f.write("Sunday, May 22, 1983 "
                        "18:45:59-PST")
                f.close()
                client.close()
                status = 0
            finally:
                # Never fall back into the accept loop; os._exit() needs an
                # explicit status.
                os._exit(status)
        # Parent: drop its copy of the connection so the client sees EOF
        # once the child is done.
        client.close()
        # Collect children that have already exited, without blocking.
        try:
            while os.waitpid(-1, os.WNOHANG)[0]:
                pass
        except OSError:
            pass
49. Horizontal splitting: front-end
## Process one
class SchedulerResource(resource.Resource):
    """Leaf resource that saves each PUT body as a file named by the URL."""

    # Handle every sub-path here instead of looking up child resources.
    isLeaf = True

    def __init__(self, filepath):
        """Remember ``filepath``, the directory that receives the files."""
        resource.Resource.__init__(self)
        self.filepath = filepath

    def render_PUT(self, request):
        """Write the request body to ``<filepath>/<uuid>``.

        ``uuid`` is the single path segment after this resource. Any other
        number of segments is answered with status 400 and nothing is
        written. Returns an empty response body.
        """
        if len(request.postpath) != 1:
            request.setResponseCode(400)
            return b''
        uuid, = request.postpath
        content = request.content.read()
        child = self.filepath.child(uuid)
        child.setContent(content)
        # A render method has to return the response body; None is not one.
        return b''
# Serve SchedulerResource, rooted at the "things" directory, on TCP port 8080.
# NOTE(review): no reactor.run() appears in this snippet -- presumably the
# reactor is started elsewhere; confirm.
fp = filepath.FilePath("things")
r = SchedulerResource(fp)
s = server.Site(r)
reactor.listenTCP(8080, s)
50. Horizontal splitting: scheduler
## Process two
# Polls the "things" directory once a second. Each file is named after an
# item and holds its due time as an integer; the item is registered in Redis
# and the file is then deleted.
rs = redis.Redis(host='localhost',
                 port=6379, db=9)
while True:
    for fname in os.listdir("things"):
        # listdir() yields bare names; they must be joined to the directory.
        path = os.path.join("things", fname)
        with open(path) as fp:
            when = int(fp.read().strip())
        # The file name is the item's identifier (``uuid`` was undefined).
        # Store the due time before the set membership, so that a member of
        # 'scheduled' always has its ':due' key.
        rs.set(fname+':due', when)
        rs.sadd('scheduled', fname)
        # Delete last: after a crash the file is simply processed again.
        os.remove(path)
    time.sleep(1)
51. Horizontal splitting: runner
## Process three
# Repair any half-finished activation once at start-up, then activate due
# items once per second, forever.
rs = redis.Redis(host='localhost', port=6379, db=9)

recover()

while True:
    activate_due()
    time.sleep(1)
53. Horizontal splitting: message
queues: sender
## Process four
# Connects to Redis (db 9 on localhost) and, through pika, to a broker on
# localhost, where it declares the queue 'active'. The loop reads the
# 'activated' and 'finished' sets and skips every activated element that is
# already finished.
# NOTE(review): the snippet ends right after `continue`; what happens to the
# remaining elements is not shown here -- confirm against the full source.
rs = redis.Redis(host='localhost',
port=6379, db=9)
params = pika.ConnectionParameters('localhost')
conn = pika.BlockingConnection(params)
channel = conn.channel()
channel.queue_declare(queue='active')
while True:
activated = rs.smembers('activated')
finished = set(rs.smembers('finished'))
for el in activated:
if el in finished:
continue
55. Horizontal splitting: message
queues: receiver
## Process five
# It is possible to get "dups" of bodies.
# Application logic should deal with that
params = pika.ConnectionParameters('localhost')
conn = pika.BlockingConnection(params)
channel = conn.channel()
channel.queue_declare(queue='active')


def callback(ch, method, properties, el):
    """Write the received message body ``el`` to syslog.

    ``ch``, ``method`` and ``properties`` are supplied by pika and unused.
    """
    syslog.syslog('Activated %s' % el)


# Consume from the queue declared above; the original subscribed to 'hello',
# which this process never declares.
channel.basic_consume(callback, queue='active', no_ack=True)
channel.start_consuming()