Skip to content

Commit 477da40

Browse files
authored
output disassembly (#46)
* output disassembly * update readme
1 parent 3defd73 commit 477da40

File tree

3 files changed

+270
-1
lines changed

3 files changed

+270
-1
lines changed

BitFaster.Caching.Benchmarks/Lru/LruJustGet.cs renamed to BitFaster.Caching.Benchmarks/Lru/LruJustGetOrAdd.cs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,9 @@
1212

1313
namespace BitFaster.Caching.Benchmarks
1414
{
15+
[DisassemblyDiagnoser(printSource: true)]
1516
[MemoryDiagnoser]
16-
public class LruJustGet
17+
public class LruJustGetOrAdd
1718
{
1819
private static readonly ConcurrentDictionary<int, int> dictionary = new ConcurrentDictionary<int, int>(8, 9, EqualityComparer<int>.Default);
1920

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
using System;
2+
using System.Collections.Concurrent;
3+
using System.Collections.Generic;
4+
using System.Text;
5+
using BenchmarkDotNet.Attributes;
6+
using BitFaster.Caching.Lru;
7+
8+
namespace BitFaster.Caching.Benchmarks.Lru
9+
{
10+
[DisassemblyDiagnoser(printSource: true)]
11+
[MemoryDiagnoser]
12+
public class LruJustTryGet
13+
{
14+
private static readonly ConcurrentDictionary<int, int> dictionary = new ConcurrentDictionary<int, int>(8, 9, EqualityComparer<int>.Default);
15+
16+
private static readonly FastConcurrentLru<int, int> fastConcurrentLru = new FastConcurrentLru<int, int>(8, 9, EqualityComparer<int>.Default);
17+
private static readonly FastConcurrentTLru<int, int> fastConcurrentTLru = new FastConcurrentTLru<int, int>(8, 9, EqualityComparer<int>.Default, TimeSpan.FromMinutes(1));
18+
19+
20+
[GlobalSetup]
21+
public void GlobalSetup()
22+
{
23+
dictionary.TryAdd(1, 1);
24+
fastConcurrentLru.GetOrAdd(1, k => k);
25+
fastConcurrentTLru.GetOrAdd(1, k => k);
26+
}
27+
28+
[Benchmark(Baseline = true)]
29+
public int ConcurrentDictionary()
30+
{
31+
dictionary.TryGetValue(1, out var value);
32+
return value;
33+
}
34+
35+
[Benchmark()]
36+
public int FastConcurrentLru()
37+
{
38+
fastConcurrentLru.TryGet(1, out var value);
39+
return value;
40+
}
41+
42+
[Benchmark()]
43+
public int FastConcurrentTLru()
44+
{
45+
fastConcurrentTLru.TryGet(1, out var value);
46+
return value;
47+
}
48+
}
49+
}

README.md

Lines changed: 219 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -204,3 +204,222 @@ FastConcurrentLru does not allocate and is approximately 10x faster than System.
204204
TemplateConcurrentLru features injectable policies defined as structs. Since structs are subject to special JIT optimizations, the implementation is much faster than if these policies were defined as classes. Using this technique, 'Fast' versions without hit counting are within 30% of the speed of a ConcurrentDictionary.
205205

206206
Since DateTime.UtcNow is around 4x slower than a ConcurrentDictionary lookup, policies that involve time-based expiry are significantly slower. Since these are injected as structs and the slow code is optimized away, it is possible to maintain the fastest possible speed for the non-TTL policy.
207+
208+
### TemplateConcurrentLru.TryGet
209+
210+
This is the source code for the TryGet method. It calls into two members whose types are value-type generic arguments: the policy (1) and the hit counter (2).
211+
212+
```csharp
213+
public bool TryGet(K key, out V value)
214+
{
215+
I item;
216+
if (dictionary.TryGetValue(key, out item))
217+
{
218+
if (this.policy.ShouldDiscard(item)) // 1
219+
{
220+
this.Move(item, ItemDestination.Remove);
221+
value = default(V);
222+
return false;
223+
}
224+
225+
value = item.Value;
226+
this.policy.Touch(item);
227+
this.hitCounter.IncrementHit(); // 2
228+
return true;
229+
}
230+
231+
value = default(V);
232+
this.hitCounter.IncrementMiss(); // 2
233+
return false;
234+
}
235+
```
236+
237+
### FastConcurrentLru (LruPolicy & NullHitCounter)
238+
239+
LruPolicy is hardcoded to never discard items, so the branch and subsequent code for ShouldDiscard are completely eliminated by the JIT (1).
240+
241+
```csharp
242+
public readonly struct LruPolicy<K, V> : IPolicy<K, V, LruItem<K, V>>
243+
{
244+
...
245+
[MethodImpl(MethodImplOptions.AggressiveInlining)]
246+
public bool ShouldDiscard(LruItem<K, V> item)
247+
{
248+
return false;
249+
}
250+
...
251+
}
252+
```
253+
254+
The hit counter methods are no-ops, so they are completely eliminated by the JIT (2).
255+
256+
```csharp
257+
public struct NullHitCounter : IHitCounter
258+
{
259+
public double HitRatio => 0.0;
260+
261+
[MethodImpl(MethodImplOptions.AggressiveInlining)]
262+
public void IncrementMiss()
263+
{
264+
}
265+
266+
[MethodImpl(MethodImplOptions.AggressiveInlining)]
267+
public void IncrementHit()
268+
{
269+
}
270+
}
271+
```
272+
273+
The JITted assembly code for the TryGet method with these value type implementations is 76 bytes:
274+
275+
```assembly
276+
; BitFaster.Caching.Lru.TemplateConcurrentLru`5[[System.Int32, System.Private.CoreLib],[System.Int32, System.Private.CoreLib],[System.__Canon, System.Private.CoreLib],[BitFaster.Caching.Lru.LruPolicy`2[[System.Int32, System.Private.CoreLib],[System.Int32, System.Private.CoreLib]], BitFaster.Caching],[BitFaster.Caching.Lru.NullHitCounter, BitFaster.Caching]].TryGet(Int32, Int32 ByRef)
277+
push rsi
278+
sub rsp,30
279+
xor eax,eax
280+
mov [rsp+28],rax
281+
mov rsi,r8
282+
mov rcx,[rcx+8]
283+
lea r8,[rsp+28]
284+
cmp [rcx],ecx
285+
call qword ptr [7FFED15190A0]
286+
test eax,eax
287+
je short M01_L00
288+
mov rax,[rsp+28]
289+
mov eax,[rax+0C]
290+
mov [rsi],eax
291+
mov rax,[rsp+28]
292+
mov byte ptr [rax+10],1
293+
mov eax,1
294+
add rsp,30
295+
pop rsi
296+
ret
297+
M01_L00:
298+
xor eax,eax
299+
mov [rsi],eax
300+
add rsp,30
301+
pop rsi
302+
ret
303+
; Total bytes of code 76
304+
```
305+
306+
### FastConcurrentTLru (TLruLongTicksPolicy & NullHitCounter)
307+
308+
The policy for TLru can expire items, so the ShouldDiscard branch (1) is not eliminated.
309+
310+
```csharp
311+
public readonly struct TLruLongTicksPolicy<K, V> : IPolicy<K, V, LongTickCountLruItem<K, V>>
312+
{
313+
...
314+
[MethodImpl(MethodImplOptions.AggressiveInlining)]
315+
public bool ShouldDiscard(LongTickCountLruItem<K, V> item)
316+
{
317+
if (Stopwatch.GetTimestamp() - item.TickCount > this.timeToLive)
318+
{
319+
return true;
320+
}
321+
322+
return false;
323+
}
324+
...
325+
}
326+
```
327+
328+
As a result, the JITted code now includes the ShouldDiscard branch (1), and the assembly code grows considerably to 312 bytes.
329+
330+
```assembly
331+
; BitFaster.Caching.Lru.TemplateConcurrentLru`5[[System.Int32, System.Private.CoreLib],[System.Int32, System.Private.CoreLib],[System.__Canon, System.Private.CoreLib],[BitFaster.Caching.Lru.TLruLongTicksPolicy`2[[System.Int32, System.Private.CoreLib],[System.Int32, System.Private.CoreLib]], BitFaster.Caching],[BitFaster.Caching.Lru.NullHitCounter, BitFaster.Caching]].TryGet(Int32, Int32 ByRef)
332+
push rbp
333+
push r15
334+
push r14
335+
push r13
336+
push r12
337+
push rdi
338+
push rsi
339+
push rbx
340+
sub rsp,88
341+
lea rbp,[rsp+0C0]
342+
xor ebx,ebx
343+
mov [rbp+0FFC0],rbx
344+
mov [rbp+0FFB8],rbx
345+
mov [rbp+20],r8
346+
mov rsi,rcx
347+
mov ebx,edx
348+
lea rcx,[rbp+0FF70]
349+
mov rdx,r10
350+
call CORINFO_HELP_INIT_PINVOKE_FRAME
351+
mov r14,rax
352+
mov rcx,rsp
353+
mov [rbp+0FF90],rcx
354+
mov rcx,rbp
355+
mov [rbp+0FFA0],rcx
356+
mov rcx,[rsi+8]
357+
lea r8,[rbp+0FFC0]
358+
mov edx,ebx
359+
cmp [rcx],ecx
360+
call qword ptr [7FFED15290A0]
361+
test eax,eax
362+
je near ptr M01_L03
363+
mov [rbp+10],rsi
364+
mov rbx,[rsi+40]
365+
mov r15,[rbp+0FFC0]
366+
mov [rbp+0FFB0],r15
367+
lea rcx,[rbp+0FFB8]
368+
xor r11d,r11d
369+
mov rax,offset MD_Interop+Kernel32.QueryPerformanceCounter(Int64*)
370+
mov [rbp+0FF80],rax
371+
lea rax,[M01_L00]
372+
mov [rbp+0FF98],rax
373+
lea rax,[rbp+0FF70]
374+
mov [r14+10],rax
375+
mov byte ptr [r14+0C],0
376+
call qword ptr [7FFED151D7D0]
377+
M01_L00:
378+
mov byte ptr [r14+0C],1
379+
cmp dword ptr [7FFED1524BD8],0
380+
je short M01_L01
381+
call qword ptr [7FFED1528278]
382+
M01_L01:
383+
mov rcx,[rbp+0FF78]
384+
mov [r14+10],rcx
385+
mov rcx,[rbp+0FFB8]
386+
mov r15,[rbp+0FFB0]
387+
sub rcx,[r15+18]
388+
cmp rcx,rbx
389+
jle short M01_L02
390+
mov rcx,[rbp+10]
391+
mov rdx,[rbp+0FFC0]
392+
mov r8d,2
393+
call BitFaster.Caching.Lru.TemplateConcurrentLru`5[[System.Int32, System.Private.CoreLib],[System.Int32, System.Private.CoreLib],[System.__Canon, System.Private.CoreLib],[BitFaster.Caching.Lru.TLruLongTicksPolicy`2[[System.Int32, System.Private.CoreLib],[System.Int32, System.Private.CoreLib]], BitFaster.Caching],[BitFaster.Caching.Lru.NullHitCounter, BitFaster.Caching]].Move(System.__Canon, BitFaster.Caching.Lru.ItemDestination)
394+
xor eax,eax
395+
mov rdi,[rbp+20]
396+
mov [rdi],eax
397+
jmp short M01_L04
398+
M01_L02:
399+
mov rax,[rbp+0FFC0]
400+
mov eax,[rax+0C]
401+
mov rdi,[rbp+20]
402+
mov [rdi],eax
403+
mov rax,[rbp+0FFC0]
404+
mov byte ptr [rax+10],1
405+
mov eax,1
406+
jmp short M01_L04
407+
M01_L03:
408+
xor eax,eax
409+
mov rdi,[rbp+20]
410+
mov [rdi],eax
411+
M01_L04:
412+
movzx eax,al
413+
mov byte ptr [r14+0C],1
414+
lea rsp,[rbp+0FFC8]
415+
pop rbx
416+
pop rsi
417+
pop rdi
418+
pop r12
419+
pop r13
420+
pop r14
421+
pop r15
422+
pop rbp
423+
ret
424+
; Total bytes of code 312
425+
```

0 commit comments

Comments
 (0)