-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclv.py
1466 lines (1233 loc) · 52.3 KB
/
clv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
import collections, getopt, heapq, itertools, random, sys
# how many records in the database?
NRECS = 100
# Simulate an open system rather than a closed one. Doing so adds
# realism and makes it harder to hide poor response times. clients
# don't wait for a response before sending a new request, but rather
# track all transactions they have submitted which are still in
# flight. If the in-flight list for a client grows too long, it will
# stall with a warning.
NCLIENTS = 10
# we model a transaction consisting of 1+ reads followed by 0+
# writes. A read-modify-write is the same as a write from a
# concurrency control perspective, but we allow the possibility for
# later (seemingly independent) accesses to previously-read
# values. This maximizes the potential for Bad Things to occur
# (because the reads are vulnerable to updates while we wait for the
# writes to complete). There is little point to appending reads after
# the last write, because those can be served from a snapshot after
# releasing all locks. We expose three knobs: total number of
# accesses, percentage of those accesses which are to be writes (on
# average; R/W ratios for individual transactions vary), and the
# average length of the delay (if any) between the read and write phase of the
# transaction, measured in arbitrary time units we'll call
# "ticks". All data accesses are assumed to take one between one and
# two ticks, in the absence of lock conflicts.
NACCESS = 10
PWRITE = 10
RWDELAY = 10
# NOTE(review): LOGDELAY is not referenced in the visible portion of the
# file -- presumably a commit-log write delay in ticks; confirm against
# the rest of the file before relying on it.
LOGDELAY = 100
# TODO: Another tuning knob: how likely is a transaction to access a value a
# second time? This percentage is in addition to of any random
# re-access that might be chosen accidentally.
PAGAIN = 0
# TODO; to avoid floating point weirdness, we should use a fixed-point
# representation for all timestamps, accurate to one part in 64k.
# inspired by http://www.dabeaz.com/coroutines/Coroutines.pdf
# Global scratch state.  NOTE(review): these three globals appear unused
# in the visible portion of the file; the simulator and database
# factories below keep their own locals with the same names.
todos = []
now = [0]
db = dict()
# All simulated times are integers; one "tick" equals ONE_TICK time
# units (coarse fixed-point, per the TODO above).
ONE_TICK = 1000
def errlog(msg, *args):
    """Write one line to stderr; msg is %-formatted with args when any are given."""
    line = msg % args if args else msg
    sys.stderr.write('%s\n' % line)
def log_devnull():
    """A log sink: a coroutine that silently discards everything sent to it."""
    while True:
        yield None
# SVGPan is BSD-licensed code downloaded from
# http://www.cyberz.org/projects/SVGPan/SVGPan.js
class svg_printer(object):
    """Context manager that brackets its body in a standalone SVG document.

    __enter__ prints the XML prolog, the <svg> root (referencing the
    SVGPan pan/zoom script) and an opening <g> group; __exit__ prints
    the matching close tags.  __exit__ returns None, so exceptions
    raised inside the with-block propagate normally.
    """
    def __enter__(self):
        head = '''<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1">
<script xlink:href="SVGPan.js"/>
<g transform='translate(100,0)'><title>Simulation output</title>'''
        print(head)
    def __exit__(self, *exc_info):
        # FIX: removed a dead assignment -- an HTML-flavored footer
        # (</div></body></html>) was assigned to `foot` and immediately
        # overwritten by the SVG-only footer actually printed below.
        foot = '''</g></svg>'''
        print(foot)
def log_svg_task():
    """Log sink that renders the simulation as SVG: one horizontal band
    per pid, one rectangle per sys_busy event, wrapped in an
    svg_printer document.  Events arrive via send() as
    (t, cpu, pid, work, kwargs) tuples."""
    row_height = 10
    labeled = set()
    text_fmt = "<text font-size='{h}pt' text-anchor='right' x='-100' y='{y}' width='100'>{s}</text>"
    rect_fmt = "<rect x='{x}' y='{y}' width='{w}' height='{h}' stroke='{stroke}' fill='{color}' stroke-width='1'/>"
    with svg_printer():
        while True:
            t, cpu, pid, work, kwargs = (yield)
            y = row_height * pid
            if pid not in labeled:
                # first sighting of this pid: emit its row label
                labeled.add(pid)
                print(text_fmt.format(h=row_height, y=y, s=pid))
            title = kwargs.get('title', None)
            if title:
                # wrap the rect in a <g> so the tooltip shows time + title
                print('<g><title>t=%.2f: %s</title>' % (t/float(ONE_TICK), title))
            color = kwargs.get('color', None)
            print(rect_fmt.format(x=t*10./ONE_TICK, y=y, w=work*10./ONE_TICK,
                                  h=row_height, color=color,
                                  stroke=color or 'black'))
            if title:
                print('</g>')
# Syscall numbers: each value indexes the simulator's `syscalls`
# dispatch table, and is yielded by the matching sys_* wrapper below.
_sys_cpu = 0
_sys_now = 1
_sys_sleep = 2
_sys_busy = 3
_sys_getpid = 4
_sys_park = 5
_sys_unpark = 6
_sys_spawn = 7
_sys_exit = 8
# simulator syscalls
#
# Each sys_* helper is a tiny generator: it yields a
# (syscall-number, args...) tuple to the simulator loop and returns
# whatever the loop sends back.  Tasks invoke them via "yield from".
def sys_cpu():
    # Which CPU the task runs on (currently a no-op in simulator()).
    return (yield (_sys_cpu,))
def sys_now():
    # Current simulated time.
    return (yield (_sys_now,))
def sys_sleep(delay):
    # Advance this task's wakeup time without consuming CPU (not logged).
    yield _sys_sleep, delay
def sys_busy(delay, **kwargs):
    # Consume simulated CPU for `delay`; kwargs (e.g. title/color) are
    # forwarded to the log sink.
    yield _sys_busy, delay, kwargs
def sys_getpid():
    # This task's simulator-assigned pid.
    return (yield (_sys_getpid,))
def sys_park():
    # Block until another task calls sys_unpark(our pid); returns the
    # time spent parked (0 if the unpark already happened).
    return (yield (_sys_park,))
def sys_unpark(pid):
    # Wake a parked task, or pre-grant its next sys_park.
    yield _sys_unpark, pid
def sys_spawn(task):
    # Schedule a new task (a generator); returns its pid.
    return (yield _sys_spawn, task)
def sys_exit(rval=0):
    # Terminate the entire simulation, making it return rval.
    yield _sys_exit, rval
def simulator(init_task, log=log_devnull()):
    '''A discrete event-based simulator allowing any degree of
    concurrency. All tasks run exactly when scheduled.

    init_task is a generator (a "task") scheduled at t=0; it may spawn
    further tasks via sys_spawn. Tasks interact with the simulator
    exclusively through the sys_* wrappers above: each value a task
    yields is a syscall request, and the simulator sends back the
    result (tasks only consume simulated time via syscalls, usually
    sys_busy).

    log is a coroutine; it is primed here and receives one
    (now, cpu, pid, duration, kwargs) tuple per sys_busy call
    (see log_devnull / log_svg_task).

    Returns the value passed to sys_exit, or 0 if the TODO list
    simply drains.

    NOTE(review): the default log=log_devnull() is evaluated once at
    definition time, so all default-using calls share one sink --
    harmless here because it discards everything.
    '''
    # simulator loop variables (shared with the nested syscall closures)
    task, pid, fn, args, delay, reschedule = (None,)*6
    # initialize the log (prime the coroutine to its first yield)
    next(log)
    def log_send(t, kwargs):
        log.send((now, None, pid, t, kwargs))
    # time-related syscalls
    now = 0
    def sys_now():
        return now
    def sys_sleep(t):
        # accumulate into `delay`; applied when the task is rescheduled
        nonlocal delay
        delay += t
    def sys_busy(t, kwargs):
        # busy time is sleep time that also gets logged
        log_send(t, kwargs)
        sys_sleep(t)
    # three different ways to give control to a task
    task_start = lambda _: task.send(None)
    task_send = lambda args: task.send(args)
    task_throw = lambda args: task.throw(*args)
    # create and identify tasks; pids are handed out lazily on first use
    todo = []
    pid_maker = itertools.count(1)
    active_tasks = collections.defaultdict(lambda: next(pid_maker))
    def task_activate(task, pid=None, fn=task_start, args=None):
        # A minimal activation delay prevents a new task with a low
        # PID from displacing the current task at todo[0].
        pid = pid or active_tasks[task]
        old = todo[0] if todo else None
        heapq.heappush(todo, (now+1, pid, task, fn, args))
        assert not old or (old == todo[0])
        return pid
    def sys_spawn(task):
        return task_activate(task)
    def sys_getpid():
        return pid
    # park, and unpark tasks
    class TaskParked(Exception):
        pass
    parked_tasks,unpark_events = {},set()
    def sys_park():
        try:
            # an unpark already arrived: consume it and return immediately
            unpark_events.remove(pid)
        except KeyError:
            # really block: remember (task, park time), unwind to the loop
            parked_tasks[pid] = (task,now)
            raise TaskParked from None
        return 0
    def sys_unpark(pid):
        try:
            task,then = parked_tasks.pop(pid)
        except KeyError:
            # target not parked yet: pre-grant its next sys_park
            unpark_events.add(pid)
        else:
            # wake it, sending back how long it was parked
            task_activate(task, pid, fn=task_send, args=now-then)
    # terminate simulation
    def sys_exit(rval):
        raise StopIteration(rval)
    # initialize simulator's TODO list
    syscalls = [None]*100
    syscalls[_sys_cpu] = lambda:None
    syscalls[_sys_now] = sys_now
    syscalls[_sys_sleep] = sys_sleep
    syscalls[_sys_busy] = sys_busy
    syscalls[_sys_getpid] = sys_getpid
    syscalls[_sys_park] = sys_park
    syscalls[_sys_unpark] = sys_unpark
    syscalls[_sys_spawn] = sys_spawn
    syscalls[_sys_exit] = sys_exit
    # go!
    rval = 0
    task_activate(init_task)
    while todo:
        delay,reschedule = 0,False
        t, pid, task, fn, args = todo[0]
        assert now <= t
        now = t
        try:
            # Pass the task the results of its last syscall, and let
            # it continue to its next syscall (tasks only consume
            # simulated CPU time during syscalls, often sys_busy). If
            # an exception escapes from a task, terminate the
            # simulation on the assumption that we've hit a bug.
            syscall, *args = fn(args)
        except StopIteration:
            # task exited; remove from active set and continue
            #errlog('pid %d exited', pid)
            del active_tasks[task]
        else:
            try:
                # execute syscall, schedule delayed return
                args = syscalls[syscall](*args)
            except TaskParked:
                # somebody else will unpark the task later
                #errlog('pid %d parked', pid)
                pass
            except StopIteration as si:
                #errlog('pid %d called sys_exit(%s)', pid, si.value)
                rval = si.value
                break
            except:
                # deliberately broad: forward any syscall failure into
                # the offending task via generator.throw
                err = sys.exc_info()
                #errlog('pid %d: %s(%s) threw %s (%s)', pid, fn, args, err[0], err[1])
                fn, args = task_throw, err
                sys_busy(ONE_TICK, dict(title='Exception', color='red'))
                reschedule = True
            else:
                fn, reschedule = task_send, True
        # now what?
        if not reschedule:
            heapq.heappop(todo)
        elif delay > 0:
            heapq.heapreplace(todo, (now+delay, pid, task, fn, args))
        else:
            # in-place update is heap-safe: the (time, pid) key is unchanged
            todo[0] = (now, pid, task, fn, args)
    if len(parked_tasks):
        errlog('Oops: %d parked tasks at exit:', len(parked_tasks))
        for pid,(task,then) in parked_tasks.items():
            errlog('\tpid=%d since %.2f (%d ticks ago)'
                   % (pid, then/float(ONE_TICK), (now-then)//ONE_TICK))
    return rval
def cpu_simulator(ncpu=1):
    # NOTE(review): dead code. The assert below always fails (a
    # non-empty string is truthy, so `not 'code rot'` is False), so this
    # body never executes; it is retained as a sketch of an earlier
    # CPU-bound design.  Visible rot: Py2 `xrange`, misspelled
    # `heapq.heapfiy`, undefined `init_fn`, and `cpu` vs `cpus`
    # confusion in the main loop.
    assert not 'code rot'
    '''A discrete event simulator representing a system with cooperative
    multitasking over some fixed number of processors (ncpu).
    '''
    todo = []
    # one (next-free-time, cpu-id) entry per processor, kept as a heap
    cpus = [(0,i) for i in xrange(ncpu)]
    task_idle, cpu_idle = 0,0
    # accessors
    def time():
        return cpus[0][0]
    def task_cpu():
        return cpus[0][1]
    def enqueue(when, task, args=()):
        heapq.heappush(todo, (when, task, args))
    def task_busy(t):
        assert t > 0
        nt = time()+t
        heapq.heapreplace(cpus, (nt, task_cpu()))
        # NOTE(review): this call is truncated/garbled rot -- it passes
        # the None result of todo.extend(...) as enqueue's task argument.
        enqueue(nt,
                todo.extend((0, fn, None) for fn in init_fn(time=time,
                                                            enqueue=enqueue,
                                                            task_cpu=task_cpu,
                                                            task_busy=task_busy
                                                            )))
    heapq.heapify(todo)
    heapq.heapfiy(cpu)
    while todo:
        t, fn, args = heapq.heappop(todo)
        now,cpu = cpu[0]
        if now < t:
            cpu_idle += t - now
            cpu[0][0] = t
        elif t < now:
            task_idle += now - t
        fn(args)
def tarjan_vanilla(tx_deps, include_trivial=False):
    '''A vanilla implementation of Tarjan's algorithm.

    This variant is not used directly, but rather serves as a
    reference point for specialized variants we do use.

    tx_deps maps each node to an iterable of successor nodes; nodes
    appearing only as successors are treated as having no outgoing
    edges.  Returns a list of strongly connected components (each a
    list of nodes); singleton components are included only when
    include_trivial is True.
    '''
    # Monotonic visit counter; the defaultdict hands each node the next
    # index on first touch.
    counter = 0
    def next_index():
        nonlocal counter
        counter += 1
        return counter
    node_index = collections.defaultdict(next_index)
    def lookup(node):
        # Return (index, first_visit).  The counter only advances when
        # the defaultdict inserts, so comparing snapshots detects a
        # first visit without a separate membership test.
        before = counter
        idx = node_index[node]
        return idx, (idx == counter and before != counter)
    lowlink = {}
    on_stack = set()
    path = []
    components = []
    def strongconnect(node, edges):
        # invariant: called immediately after `node` was first indexed,
        # so its index is the current counter value
        my_index = counter
        my_low = lowlink[my_index] = my_index
        path.append(node)
        on_stack.add(my_index)
        for succ in edges:
            succ_index, first_visit = lookup(succ)
            if first_visit:
                succ_low = strongconnect(succ, tx_deps.get(succ, ()))
                if succ_low < my_low:
                    my_low = lowlink[my_index] = succ_low
            elif succ_index in on_stack and succ_index < my_low:
                my_low = lowlink[my_index] = succ_index
        if my_low == my_index:
            # this node roots an SCC: pop the path back down to it
            members, popped = [], None
            while node != popped:
                popped = path.pop()
                on_stack.remove(node_index[popped])
                members.append(popped)
            if len(members) > 1 or include_trivial:
                components.append(members)
        return my_low
    for node, edges in tx_deps.items():
        _, first_visit = lookup(node)
        if first_visit:
            strongconnect(node, edges)
    return components
def test_tarjan():
    """Smoke-test tarjan_vanilla on a small fixed graph, printing every
    SCC (singletons included) to stderr, one sorted component per line."""
    edges = {
        1: {2, 9},
        2: {3},
        3: {1},
        4: {3, 5},
        5: {4, 6},
        6: {3, 7},
        7: {6},
        8: {5, 8},
        10: {8},
    }
    deps = collections.defaultdict(set)
    for src, dsts in edges.items():
        deps[src].update(dsts)
    for component in tarjan_vanilla(deps, True):
        component.sort()
        errlog('%s', ' '.join(str(node) for node in component))
def tarjan_serialize(tx_deps, old_safe_point, new_safe_point, show_all=False):
    '''A variant of Tarjan's algorithm specialized for finding
    serialization failures in the partitioned dependency graph we
    generate. It focuses only on strongly connected components
    "anchored" between old and new safe points, and usually only
    reports SCC of size 2 or more.

    Returns (new_deps, scs): new_deps retains the dependency entries
    for transactions at or after old_safe_point (carried forward to
    the next round of checking), and scs lists the SCCs found that
    contain at least one member at or after old_safe_point (size >= 2
    unless show_all).
    '''
    last_index = 0
    def new_index():
        nonlocal last_index
        last_index += 1
        return last_index
    index = collections.defaultdict(new_index)
    def index_seen(dep):
        # (index, unseen): the defaultdict only bumps last_index when it
        # inserts, so comparing snapshots detects a first visit.
        tmp = last_index
        j = index[dep]
        unseen = (j == last_index and tmp != last_index)
        return j,unseen
    low,stack,s,scs = {}, set(), [], []
    def connect(pid, deps):
        # invariant: pid was just indexed, so its index is last_index
        i = last_index
        ilow = low[i] = i
        s.append(pid)
        stack.add(i)
        for dep in deps:
            # safe to skip, see long comment in dependency_tracker()
            if new_safe_point <= dep:
                continue
            j,unseen = index_seen(dep)
            if unseen:
                jlow = connect(dep, tx_deps.get(dep, ()))
                if jlow < ilow:
                    ilow = low[i] = jlow
            elif j in stack and j < ilow:
                ilow = low[i] = j
        if ilow == i:
            # pid roots an SCC; report it only if some member is new
            # (at/after old_safe_point), i.e. not reported last round
            sc,dep,unreported = [],None,False
            while pid != dep:
                dep = s.pop()
                unreported |= old_safe_point <= dep
                stack.remove(index[dep])
                sc.append(dep)
            if unreported and (len(sc) > 1 or show_all):
                scs.append(sc)
        return ilow
    new_deps = collections.defaultdict(dict)
    for pid,deps in tx_deps.items():
        if pid < new_safe_point:
            i,unseen = index_seen(pid)
            if unseen:
                connect(pid, deps)
        if old_safe_point <= pid:
            # keep this entry for the next round of checking
            new_deps[pid] = deps
    return new_deps, scs
def tarjan_incycle(tx_deps, who):
    '''A variant of Tarjan's algorithm that only cares about the SCC that
    a given node belongs to. If "who" belongs to a non-trivial SCC,
    return the cluster. Otherwise, return None.

    "who" must be a key of tx_deps and must not have been visited yet.
    '''
    counter = 0
    def next_index():
        nonlocal counter
        counter += 1
        return counter
    node_index = collections.defaultdict(next_index)
    def lookup(node):
        # (index, first_visit) -- the counter only advances when the
        # defaultdict inserts a new node.
        before = counter
        idx = node_index[node]
        return idx, (idx == counter and before != counter)
    lowlink, on_stack, path = {}, set(), []
    answer = None
    def strongconnect(node, edges):
        nonlocal answer
        # invariant: node was just indexed, so its index is the counter
        my_index = counter
        my_low = lowlink[my_index] = my_index
        path.append(node)
        on_stack.add(my_index)
        for succ in edges:
            succ_index, first_visit = lookup(succ)
            if first_visit:
                succ_low = strongconnect(succ, tx_deps.get(succ, ()))
                if succ_low < my_low:
                    my_low = lowlink[my_index] = succ_low
            elif succ_index in on_stack and succ_index < my_low:
                my_low = lowlink[my_index] = succ_index
        if my_low == my_index:
            # pop the SCC rooted here; remember it only if it is
            # non-trivial and contains the node we care about
            members, popped, contains_who = [], None, False
            while node != popped:
                popped = path.pop()
                contains_who |= popped == who
                on_stack.remove(node_index[popped])
                members.append(popped)
            if contains_who and len(members) > 1:
                answer = members
        return my_low
    _, first_visit = lookup(who)
    assert first_visit
    strongconnect(who, tx_deps[who])
    return answer
class NamedTuple(object):
    """Lightweight record type: every keyword argument passed to the
    constructor becomes an instance attribute of the same name."""
    def __init__(self, **fields):
        for name, value in fields.items():
            setattr(self, name, value)
def dependency_tracker():
    '''Create a dependency tracker.

    The tracker exports two hook functions: on_access and on_finish.

    on_access(pid, rid, dep) notifies the tracker that transaction
    "pid" has accessed (read or overwritten) the value which
    transaction "dep" wrote to record "rid"; it returns two values:
    the value of the read (larger of the dependency and the current
    safe point, see below), and a boolean indicating isolation status
    (False means there was an isolation failure, where the value read
    was written by a transaction that is still in flight).

    on_finish(pid, committed) notifies the tracker that transaction
    "pid" has finished. If "committed" its dependencies should be
    checked for serialization errors. The checker will perform such
    checks during this or some future invocation of on_finish(), and
    will return a list of failures whenever such a check fails. Checks
    are allowed to occur during any call to on_finish, including when
    reporting a failed transaction.

    Internally, the tracker maintains data structures that protect old
    transactions from repeated checking, so that the cost of checking
    does not grow over time.

    The central concept used is a "safe point" -- the pid of some
    transaction known to have committed before any active transaction
    began. Transactions incur no dependencies when accessing values
    written before the current safe point began. An optimal safe point
    could be maintained using a heap, but would be expensive---heap
    maintenance is O(n lg n)---and would force us to check each
    transaction individually for failures over an ever-growing
    dependency graph. Instead, the system maintains a pending safe
    point as well as a set of "pending" transactions. The pending safe
    point will be installed once all pending transactions have
    finished. The last such transaction to complete is chosen to
    become the next pending safe point, and all currently active
    transactions are added to the pending set.

    There are six kinds of transactions in the dependency graph, based
    on how their lifetime compares with the selection and installation
    of safe points:

                              now
                               |
                               v
    -------------- A --------------- B --------------- C --------------
      |--T1--|        |--T3--|          |--T5--|
         |--T2--|          |--T4--|          |--T6--|

    In the above figure, time flows from left to right and is measured
    in terms of transaction ids, which increase
    monotonically. Installation of safe point A must wait until all
    transactions that coexisted with A complete; B is the last to do
    so, and is chosen to become the next safe point. As before, it
    cannot be installed until all coexisting transactions complete,
    with C being the last to do so. Note that the definition of a safe
    point means that every transaction will see at most one safe point
    installed during its lifetime.

    We choose to perform serialization checks whenever a new safe
    point is installed. In the above figure, suppose that C has just
    committed, allowing us to install safe point B. There is no point
    in checking live transactions (e.g. T6) yet because they could
    still enter a cycle after the check. It's also unpalatable to
    check the set of T4 and T5, because differentiating between T3 and
    T4 depends on commit "time" which is messy in our formulation
    based on transaction starts. Instead, at the time C commits (and B
    is installed) we check all transactions that began between A and
    B. All such are guaranteed to have committed, and most SCC we find
    will not have been reported before.

    At each round, the checker will report failures involving T3, T4,
    T2/T3, T2/T4, T3/T4, and T2/T3/T4. We distinguish T3 from T4 by
    noting that all deps for T3 occur *before* new_safe_point, while
    one or more deps occur *after* new_safe_point in T4.

    NOTE: While running Tarjan, we could encounter strongly connected
    components involving T3/T4/T5. However, our formulation of safe
    points disallows any direct dependencies between T3 and T5,
    meaning that T4 must be the common member of 2+ unrelated cycles
    (a figure-eight with T4 at the crossing). Each cycle is a
    different (unrelated) serialization failure, so we are content to
    partition the SCC, reporting the T3/T4 subcomponent now and the
    T4/T5 subcomponent next time; by a similar argument, the T1/T2
    subcomponent of a T1/T2/T3 failure will have been reported last
    time, and we must now report the T2/T3 subcomponent.

    NOTE: The first transaction to commit after system startup also
    discovers an empty pending set, and will do the right thing by
    becoming the new safe point and populating the pending set.
    '''
    active = set()
    safe_point = 0
    pending = set()
    pending_safe_point = 0
    # track the set of dependencies for each transaction
    deps = collections.defaultdict(dict)
    #all_deps = {}
    def on_access(pid, rid, dep):
        active.add(pid)
        if dep == pid:
            # reading our own write: trivially isolated, no new edge
            isolated = True
        elif dep > safe_point:
            # record the edge (first rid wins); in-flight writer means
            # an isolation failure
            deps[pid].setdefault(dep, rid)
            isolated = dep not in active
        else:
            # writer predates the safe point: clamp and skip tracking
            dep = safe_point
            isolated = True
        return dep,isolated
    def on_finish(pid, is_commit):
        nonlocal deps, safe_point, pending, pending_safe_point
        active.remove(pid)
        # only remember committed transactions
        if not is_commit:
            deps.pop(pid, None)
        else:
            #all_deps[pid] = deps[pid]
            pass
        pending.discard(pid)
        if pending:
            # pending safe point not yet installable; nothing to report
            return ()
        # any serialization failures since last safe point?
        deps, problems = tarjan_serialize(deps, safe_point, pending_safe_point)
        #_, all_problems = tarjan(all_deps, 0, 1000**3)
        #for p in all_problems:
        #    sys.stderr.write('scc: %s\n' % (' '.join(map(str, p))))
        # advance the safe point
        safe_point = pending_safe_point
        pending_safe_point = pid
        pending = set(active)
        return problems
    return NamedTuple(on_access=on_access, on_finish=on_finish)
def make_nocc_db(nrecs=100):
    '''A database model implementing no concurrency control
    whatsoever. All accesses are processed in the order they arrive,
    without blocking, and transactions will experience isolation and
    serialization failures on a regular basis.

    Returns a NamedTuple of task-generator hooks (tx_read, tx_write,
    tx_commit -- all driven via "yield from" inside simulator tasks)
    plus fini(), which dumps the accumulated statistics to stderr.
    '''
    # each record holds the pid of its last writer (0 = initial value)
    db = [0]*nrecs
    tracker = dependency_tracker()
    tx_count, acc_count, iso_failures, ser_failures = 0,0,0,0
    def db_access(rid):
        # common read/write path: report the access to the dependency
        # tracker and count isolation failures
        nonlocal acc_count, iso_failures
        pid = yield from sys_getpid()
        val,isolated = tracker.on_access(pid, rid, db[rid])
        if not isolated:
            iso_failures += 1
        acc_count += 1
        return pid, val
    def tx_read(rid, for_update=False):
        # for_update is accepted but unused here -- presumably for
        # interface compatibility with the locking database models; confirm.
        pid, val = yield from db_access(rid)
        yield from sys_busy(random.randint(ONE_TICK, 2*ONE_TICK),
                            color='green', title='%s=db[%s]' % (val, rid))
        return val
    def tx_write(rid):
        pid, _ = yield from db_access(rid)
        db[rid] = pid
        yield from sys_busy(random.randint(ONE_TICK, 2*ONE_TICK),
                            color='blue', title='db[%s]=%s' % (rid, pid))
    def tx_commit():
        nonlocal tx_count, ser_failures
        # commit work, a log-write style sleep, then post-commit work
        yield from sys_busy(random.randint(ONE_TICK, 2*ONE_TICK), color='yellow')
        yield from sys_sleep(random.randint(5*ONE_TICK, 10*ONE_TICK))
        yield from sys_busy(random.randint(ONE_TICK, 2*ONE_TICK), color='orange')
        tx_count += 1
        pid = yield from sys_getpid()
        cycles = tracker.on_finish(pid, True)
        for cycle in cycles:
            ser_failures += len(cycle)
            errlog('Serialization failure found: %s', ' '.join(map(str, cycle)))
    def fini():
        # dump accumulated statistics to stderr
        errlog('''
Stats:
Total transactions: %d (%d serialization failures)
Total accesses: %d (%d isolation failures)''',
               tx_count, ser_failures, acc_count, iso_failures)
    return NamedTuple(nrecs=nrecs, tx_read=tx_read, tx_write=tx_write, tx_commit=tx_commit, fini=fini)
class AbortTransaction(Exception):
    """Raised whenever a transaction fails.

    The user is responsible for calling tx_abort on their database so
    that any outstanding changes can be rolled back (otherwise the
    transaction is left hanging, in its failed state, forever).
    """
class DeadlockDetected(AbortTransaction):
    """AbortTransaction subtype raised when acquiring or upgrading a
    lock would deadlock."""
def make_2pl_db(nrecs=100, verbose=False):
'''A database model implementing strict two phase locking
(2PL). Transactions must acquire appropriate locks before
accessing data, and may block (or even deadlock) when requesting
locks already held by other transactions. Neither isolation nor
serialization failures are possible.
'''
db = [0]*nrecs
tracker = dependency_tracker()
tx_count, tx_failures, ser_failures = 0,0,0
acc_count, lock_waits, iso_failures = 0,0,0
wait_histo = collections.defaultdict(lambda:0)
resp_histo = collections.defaultdict(lambda:0)
dlock_histo = collections.defaultdict(lambda:0)
in_flight = {}
tx_logs = collections.defaultdict(dict)
q,e = not verbose, errlog
def histo_add(h, delay):
h[delay.bit_length()] += 1
def histo_print(h, title, xtime=True, logx=True, logy=True):
b = 1./ONE_TICK if xtime else 1
errlog('\n\tLog-log distribution of %s:', title)
fmt = '\t\t%8.2f: %5d %s' if xtime else '\t\t%8d: %5d %s'
for k in sorted(h.keys()):
n = h[k]
x = b*(2.**k if logx else k) if k else 0
y = n.bit_length() if logy else n
errlog(fmt, x, n, '*'*y)
'''We support the following lock modes:
N - no lock (compat: R, U, X)
R - read lock (compat: R, U; non-blocking upgrade)
S - multiple read locks held (compat: R, U)
U - read lock with intent to upgrade (compat: R; non-blocking upgrade)
V - one U and at least one R (compat: R)
X - exclusive write lock (compat: None)
P - upgrade to X in progress (compat: None)
W - lock blocked (compat: None)
States N, S, V, P and W are synthetic lock modes used internally
by the lock manager. A lock may be in such a mode, but no
transaction requests it specifically. S (V) is identical to R (U),
except that the former signals the lock manager that a lock holder
will block on attempting to upgrade, while the latter allows the
(single) holder to upgrade to X without blocking. P indicates that
some request is blocked on upgrade (see below). These extra modes
allow upgraders to make decisions based only on the lock's current
mode, without examining the request list.
To keep lock tables small, we assign the smallest ordinals to lock
modes that transactions are allowed to request.
'''
R, U, X, P, N, S, V, W = 0, 1, 2, 3, 4, 5, 6, 7
mode_names = dict(N=N, R=R, U=U, X=X, P=P, S=S, V=V, W=W)
mode_names = dict((v,k) for (k,v) in mode_names.items())
mode_name = lambda m: mode_names.get(m, m)
'''The supremum table is used to determine compatibility efficiently,
and new lock requests rely heavily on it. The value returned by
supr[current_mode][requested_mode] indicates the effective lock
mode caused by the arrival of the new mode. W indicates an
incompatible request that must block.
'''
_,supr = W,[None]*8
# R U X P N
supr[N] = ( R, U, X, P, N )
supr[R] = ( S, V, _, _, R )
supr[S] = ( S, V, _, _, S )
supr[U] = ( V, _, _, _, U )
supr[V] = ( V, _, _, _, V )
supr[P] = ( _, _, _, _, P )
supr[X] = ( _, _, _, _, X )
supr[_] = ( _, _, _, _, _ )
'''Lock upgrades are a pain, because they put the transaction in a
position of both holding and needing to acquire a lock... we also
need to be able to process every possible type of upgrade, because
users could accidentally (or maliciously) do things like read,
read-for-update, then write (for example).
Many upgrade requests either succeed or deadlock immediately;
those that remain (R->X and U->X, with readers present) must
block. We implement the blocking as follows. First, the request
moves from the holders list to the upgraders list and its mode
changes from U to P (lock mode changes to P as well). P is an
asymmetric mode, where supr[*][P] = supr[*][U] and supr[P][*] =
supr[X][*]. So, while R-U-R is possible, R-P-R is not (ensuring
the upgrade request does not starve); similarly, R-X is not
possible, but R-P is (allowing the upgrade to occur). Once the
upgrader unblocks, it can safely change its mode (and that of the
lock) from P to X without affecting its successors.
The logic that decides whether to succeed/block/deadlock is
encoded into a three-dimensional lookup:
upgr[lock.m, req.m, mode] = (rmode, gmode, smode)
Where rmode is the mode to request, gmode is the mode to change to
after the request is granted, and smode is the new lock supremum
to use once the request is granted. An empty entry means deadlock,
while a missing one means an illegal upgrade has been requested
(e.g. upgr[X][R][U]). If rmode != gmode, the upgrade request must
block (going to the "upgrader" position) and the lock goes into
rmode for the interim. With six lock modes, three of which that
can be requested and upgraded to by transactions, we have 6*3*3 =
54 possible situations:
'''
upgr = {}
upgr[R, R, R] = (R, R, R) # trivial
upgr[R, R, U] = (U, U, U)
upgr[R, R, X] = (X, X, X)
#upgr[R, U, *] = illegal
#upgr[R, X, *] = illegal
upgr[S, R, R] = (R, R, S) # trivial
upgr[S, R, U] = (U, U, V)
upgr[S, R, X] = (P, X, X)
#upgr[S, U, *] = illegal
#upgr[S, X, *] = illegal
upgr[U, R, R] = (R, R, U) # trivial
upgr[U, R, U] = None
upgr[U, R, X] = None
upgr[U, U, R] = (U, U, U) # trivial
upgr[U, U, U] = (U, U, U) # trivial
upgr[U, U, X] = (X, X, X)
#upgr[U, X, *] = illegal
upgr[V, R, R] = (R, R, V) # trivial
upgr[V, R, U] = None
upgr[V, R, X] = None
upgr[V, U, R] = (U, U, V) # trivial
upgr[V, U, U] = (U, U, V) # trivial
upgr[V, U, X] = (P, X, X)
#upgr[V, X, *] = illegal
upgr[P, R, R] = (R, R, P) # trivial
upgr[P, R, U] = None
upgr[P, R, X] = None
#upgr[P, U, *] = illegal
#upgr[P, X, *] = illegal
#upgr[X, R, *] = illegal
#upgr[X, U, *] = illegal
upgr[X, X, R] = (X, X, X) # trivial
upgr[X, X, U] = (X, X, X) # trivial
upgr[X, X, X] = (X, X, X) # trivial
'''
30 of the cases are illegal, 12 are trivial (lock strength
unchanged), six deadlock, four succeed immediately, and two block.
If rmode != gmode, then the request must block and the lock state
is set to rmode for the interim. Otherwise, the request is either
trivial or succeeds immediately (we don't actually care which).
NOTE: If we add intent lock modes to the mix, then many more kinds of
upgrade become possible (e.g. IR -> {R, IX} -> RIX -> UIX -> X)
but we should be able to handle the blocking cases in the same
way, by introducing more lock modes. The main issue is that intent
modes allow multiple upgrades to proceed (and succeed) at the same
time (e.g. IR IR IX -> R R), which might complicate setting of the
final lock mode (IR -> R implies R, but IR IR -> R R implies
S). We expect that asymmetric transient modes should cover this,
but that has not been pondered heavily, let alone proven.
'''
class LockRequest(object):
def __init__(self, pid, mode):
self.pid, self.m = pid,mode
def __repr__(self):
return 'LockRequest(%s, %s)' % (self.pid, mode_name(self.m))
def __str__(self):
return '%s:%s' % (self.pid, mode_name(self.m))
'''We partition a lock's request list into three segments: holders,
upgraders (those who already hold the lock but who wait to acquire
a stronger mode), and waiters (those who wish to acquire the lock
but are currently blocked from doing so). The lock also maintains
two modes: its "wait" mode (for new requests), and its "held" mode
(for upgrades).
'''
class Lock(object):
def __init__(self):
self.m, self.wm, self.holders, self.upgrader = N,N,[],None
self.waitlist = collections.deque()
def __repr__(self):
return ('Lock(m=%s/%s, h=[%s], u=%s, w=[%s])'
% (mode_name(self.m), mode_name(self.wm),
' '.join(map(str, self.holders)),
str(self.upgrader), ' '.join(map(str, self.waitlist))))
locks = collections.defaultdict(Lock)
tx_locks = collections.defaultdict(dict)
tx_deps = {}
empty_set, empty_dict = set(), {}
def lock_acquire(pid, rid, mode):
now = yield from sys_now()
must_wait,is_upgrade = False,False
lock = locks[rid]
q or e('lock_acquire: t=%.2f pid=%d rid=%d lock=%s/%s mode=%s',
now/float(ONE_TICK), pid, rid,
mode_name(lock.m), mode_name(lock.wm), mode_name(mode))
my_locks = tx_locks[pid]
req = my_locks.get(rid, None)
if req:
# existing request, may need upgrade
try:
rmode,gmode,smode = upgr[lock.m, req.m, mode]
except TypeError:
q or e('\tUpgrade deadlock detected: lock.m=%s/%s req.m=%s/%s',
mode_name(lock.m), mode_name(lock.wm),
mode_name(req.m), mode_name(mode))
histo_add(dlock_histo, 1)
raise DeadlockDetected() from None
assert not lock.upgrader
if rmode == gmode:
q or e('\tUpgrade from %s to %s (%s) succeeds immediately',
mode_name(req.m), mode_name(mode), mode_name(gmode))
lock.m,req.m = smode,gmode
return
# abandon current request and block with a new one
req.m = N
req = my_locks[rid] = LockRequest(pid, rmode)
# mark the lock as upgrading
if lock.wm == lock.m:
lock.wm = rmode
lock.m = rmode
lock.upgrader = req
# block, then patch things up
yield from sys_busy(ONE_TICK//10, color='pink',
title='Upgrade: rid=%s lock.m=%s req.m=%s rmode=%s'
% (rid, lock.m, req.m, rmode))
try:
yield from lock_block(lock, req, set())
except DeadlockDetected:
lock.holders.append(req)
lock.upgrader = None
raise
lock.m,req.m = smode,gmode
else:
req = my_locks[rid] = LockRequest(pid, mode)
lock.wm = supr[lock.wm][req.m]
if lock.wm != W:
q or e('\tRequest granted: lock.m=%s', mode_name(lock.wm))
assert not lock.upgrader and not lock.waitlist
lock.holders.append(req)