Bloom filter x64

Share your advanced PureBasic knowledge/code with the community.
User avatar
idle
Always Here
Always Here
Posts: 5039
Joined: Fri Sep 21, 2007 5:52 am
Location: New Zealand

Bloom filter x64

Post by idle »

A high throughput bloomfilter.
https://en.wikipedia.org/wiki/Bloom_filter

Added original c fasthash for c backend
bloom items = 1,048,576 memory = 7.87 mb
Compressed size 1903041 b1858 kb 1.81 mb
Bloom get = 1048576
Bloom2 get = 1048576
Bloom3 get = 1048576
max false positives expected = 1024 actual 1033
Bloom set 26 ms
Bloom get 21 ms
map set 353 ms
map get 280 ms

Code: Select all

;High speed 64 bit Bloomfilter 
;Author Idle Andrew Ferguson
;licence MIT   
;v1.1 X64 ASM & C Backends, ARM64 PI4 and M1 C backends  

Structure bloom 
  *vt
  *filter 
  size.q
  elements.q
  hashes.q[2]
EndStructure 

Interface Ibloom 
  Set(*key,len)
  Get(*key,len) 
  GetSize()
  Or(*other.bloom)
  Compress(*len.long=0)
  Save(file.s)
  Free() 
EndInterface

Declare Bloom_Set(*this.bloom,*key,len) 
Declare Bloom_Get(*this.bloom,*key,len) 
Declare Bloom_Free(*this.bloom) 
Declare Bloom_GetSize(*this.bloom)
Declare Bloom_OR(*this.Bloom,*other.bloom) 
Declare Bloom_Compress(*this.bloom,*len.long=0) 
Declare Bloom_Save(*this.bloom,file.s) 

Declare Bloom_Decompress(*buf) 
Declare Bloom_Load(file.s) 
Declare Bloom_New(Items.i,MaxErrors.d=0.001)

DataSection : vt_bloom: 
  Data.i @Bloom_Set() 
  Data.i @Bloom_Get()
  Data.i @Bloom_GetSize()
  Data.i @Bloom_OR()
  Data.i @Bloom_Compress()
  Data.i @Bloom_Save()
  Data.i @Bloom_Free() 
EndDataSection 

CompilerIf #PB_Compiler_Backend = #PB_Backend_C  
    
  Procedure.q FastHash64(*buf,len,Seed.q=0)
    Protected result.q  
    ;FastHash64 algorithm by Zilong Tan
    !typedef unsigned long long uint64_t; 
    
    !#define mix(h) ({				      	  \
    !			(h) ^= (h) >> 23;		          \
    !			(h) *= 0x2127599bf4325c37ULL;	\
    !			(h) ^= (h) >> 47; })
    !
    
    !	const uint64_t m = 0x880355f21e6d1965ULL;
    !	const uint64_t *pos = (const uint64_t *)p_buf;
    !	const uint64_t *end = pos + (v_len / 8);
    !	const unsigned char *pos2;
    !	uint64_t h = v_seed ^ (v_len * m);
    !	uint64_t v;
    ! uint64_t result; 
    
    !	while (pos != end) {
    !		v  = *pos++;
    !		h ^= mix(v);
    !		h *= m;
    !	}
    
    !	pos2 = (const unsigned char*)pos;
    !	v = 0;
    
    !	switch (v_len & 7) {
    !	case 7: v ^= (uint64_t)pos2[6] << 48;
    !	case 6: v ^= (uint64_t)pos2[5] << 40;
    !	case 5: v ^= (uint64_t)pos2[4] << 32;
    !	case 4: v ^= (uint64_t)pos2[3] << 24;
    !	case 3: v ^= (uint64_t)pos2[2] << 16;
    !	case 2: v ^= (uint64_t)pos2[1] << 8;
    !	case 1: v ^= (uint64_t)pos2[0];
    !		h ^= mix(v);
    !		h *= m;
    !	}
    !
    !	v_result = mix(h);
    
    ProcedureReturn result 
  EndProcedure  
  
  
CompilerElse 
  
  Procedure.q FastHash64(*Buffer, Len, Seed.q=0)
    ; FastHash64 algorithm by Zilong Tan ported by wilbert
    !mov r10, 0x2127599bf4325c37
    !mov r11, 0x880355f21e6d1965
    !mov rdx, [p.p_Buffer]
    !mov rcx, [p.v_Len]
    !mov rax, rcx         ; h = seed ^ (len * m);
    !imul rax, r11
    !xor rax, [p.v_Seed]
    !sub rcx, 8
    !jc .l1
    ; 8 byte loop  
    !.l0:
    !mov r8, [rdx]        ; v = *pos++;
    !add rdx, 8
    ; -- mix(v) start --
    !mov r9, r8
    !shr r9, 23
    !xor r8, r9
    !imul r8, r10
    !mov r9, r8
    !shr r9, 47
    !xor r8, r9
    ; -- mix end --
    !xor rax, r8          ; h ^= mix(v);
    !imul rax, r11        ; h *= m;
    !sub rcx, 8
    !jnc .l0
    ; remaining bytes
    !.l1:
    !add rcx, 8
    !jz .l5
    !xor r8, r8
    !test rcx, 4
    !jz .l2
    ; get 4 bytes
    !mov r8d, [rdx]
    !add rdx, 4
    !ror r8, 32
    !.l2:
    !test rcx, 2
    !jz .l3
    ; get 2 bytes
    !movzx r9d, word [rdx]
    !add rdx, 2
    !xor r8, r9
    !ror r8, 16
    !.l3:
    !test rcx, 1
    !jz .l4
    ; get 1 byte
    !movzx r9d, byte [rdx]
    !xor r8, r9
    !ror r8, 8
    !.l4:
    !and rcx, 7
    !shl rcx, 3
    !rol r8, cl
    ; -- mix(v) start --
    !mov r9, r8
    !shr r9, 23
    !xor r8, r9
    !imul r8, r10
    !mov r9, r8
    !shr r9, 47
    !xor r8, r9
    ; -- mix end --
    !xor rax, r8          ; h ^= mix(v);
    !imul rax, r11        ; h *= m;
    ; -- mix(h) start --
    !.l5:
    !mov r9, rax
    !shr r9, 23
    !xor rax, r9
    !imul rax, r10
    !mov r9, rax
    !shr r9, 47
    !xor rax, r9
    ; -- mix end --
    ProcedureReturn       ; return mix(h);
  EndProcedure
  
CompilerEndIf 

Procedure Bloom_New(Items.i,MaxErrors.d=0.001)
  Protected *this.bloom,MaxError.d,size.i,pow.i  
  
  *this = AllocateMemory(SizeOf(bloom))
  
  If *this
    *this\vt = ?vt_bloom
    *this\elements = (-(2.0 * (items))) / (Log(1.0 - Pow(MaxErrors,0.5))) 
    *this\size = *this\elements / 8 
    *this\filter = AllocateMemory(*this\size) 
    *this\hashes[0] = 11400714819323198485
    *this\hashes[1] = 11400714819323198485 << 1
    
    ProcedureReturn *this 
  EndIf 
  
EndProcedure 

Procedure Bloom_Free(*this.bloom) 
  If *this 
    FreeMemory(*this\filter) 
    FreeMemory(*this)
    *this = 0 
  EndIf 
EndProcedure  

Procedure Bloom_GetSize(*this.bloom) 
  ProcedureReturn *this\size 
EndProcedure     

Procedure  Bloom_Set(*this.Bloom,*key,len)
  Protected hash.q,thash.q,a.i,*ta.Ascii
  Protected thash1.q
  thash1 =  FastHash64(*key,len) 
  hash =  (thash1 ! *this\hashes[0]) & $fffffffffffffff 
  thash = (thash1 ! *this\hashes[1]) & $fffffffffffffff
  hash % *this\elements
  thash % *this\elements
  *ta = *this\filter+(hash>>3)
  *ta\a | (1 << (hash & $07))
  *ta = *this\filter+(thash>>3)
  *ta\a | (1 << (thash & $07))
  ProcedureReturn thash1 
EndProcedure

Procedure Bloom_Get(*this.Bloom,*Key,len)
  Protected hash.q,thash.q,tret,retrn,a,*ta.Ascii
  Protected thash1.q,t1,t2
  thash1 =  FastHash64(*key,len) 
  hash =  (thash1 ! *this\hashes[0]) & $fffffffffffffff 
  thash = (thash1 ! *this\hashes[1]) & $fffffffffffffff 
  hash % *this\elements
  thash % *this\elements
  *ta = *this\filter+(hash>>3)
  t1 = (*ta\a & (1 << (hash & $07))) 
  *ta = *this\filter+(thash>>3)
  t2 = (*ta\a & (1 << (thash & $07)))
  If (t1 <> 0 And t2 <> 0) 
    ProcedureReturn #True
  Else
    ProcedureReturn #False
  EndIf
EndProcedure

UseLZMAPacker() 

#BLOOM_COMPPRESS = #PB_PackerPlugin_Lzma

Procedure Bloom_OR(*a.Bloom,*b.bloom)
  Protected *pa.Ascii,*pb.Ascii,a 
  
  If (*a And *b)
    
    If *a\Size <= *b\Size 
      *a\filter = ReAllocateMemory(*a\filter,*b\size)
      *a\size = *b\size  
    EndIf    
    
    *pa = *a\filter 
    *pb = *b\filter 
    
    For a = 0 To *b\size-1 
      *pa\a | *pb\a 
      *pa+1
      *pb+1
    Next 
  EndIf 
  
EndProcedure   

Procedure Bloom_Decompress(*buf) 
  Protected *this.bloom,size,size1,len  
  *this = AllocateMemory(SizeOf(bloom))
  
  size = PeekI(*buf) 
  size1 = PeekI(*buf+8)
  len = PeekI(*buf+16) 
  
  If *this
    *this\vt = ?vt_bloom
    *this\elements = len   
    *this\size = size1  
    *this\filter = AllocateMemory(*this\size) 
    *this\hashes[0] = 11400714819323198485
    *this\hashes[1] = 11400714819323198485 << 1
    
    len = UncompressMemory(*buf+24,size,*this\filter,size1,#BLOOM_COMPPRESS) 
    
    Debug *this\elements
    Debug *this\size 
    
    If len = size1 
      ProcedureReturn *this 
    EndIf 
  EndIf    
EndProcedure 

Procedure Bloom_Compress(*this.bloom,*len.long=0) 
  
  ;Debug *this\size 
  Debug "mem size filter " + Str(MemorySize(*this\filter)) 
  Debug "num elements " + Str(*this\elements) 
  Debug "size " +Str(*this\size) 
  
  Protected outsize ,*buf,len  
  outsize = *this\size + 24
  *buf = AllocateMemory(outsize) 
  len = CompressMemory(*this\filter,*this\size,*buf+24,outsize,#BLOOM_COMPPRESS,9) 
  
  Debug "Compressed Len : " + Str(len) 
  
  If len  
    PokeL(*buf,len) 
    PokeL(*buf+8,*this\size)
    PokeL(*buf+16,*this\elements) 
    If *len 
      *len\l = len+24 
    EndIf   
    ProcedureReturn *buf   
  EndIf   
  
EndProcedure 

Procedure Bloom_load(file.s) 
  
  Protected fn,bloom,*mem,len  
  fn = OpenFile(-1,file) 
  If fn 
    len = Lof(fn)
    If len 
      *mem = AllocateMemory(len) 
      ReadData(fn,*mem,len) 
      bloom = Bloom_Decompress(*mem) 
      FreeMemory(*mem) 
      CloseFile(fn) 
      ProcedureReturn bloom 
      
    EndIf 
  EndIf   
  
EndProcedure   

Procedure Bloom_Save(*this.bloom,file.s)
  Protected *buf,fn,res,len.l 
  fn = CreateFile(-1,file) 
  If fn 
    
    *buf = Bloom_Compress(*this,@len) 
    If (*buf And len > 0)  
      res = WriteData(fn,*buf,len)
    EndIf 
    FreeMemory(*buf) 
    CloseFile(fn) 
    ProcedureReturn res  
  EndIf 
  
EndProcedure    

CompilerIf #PB_Compiler_IsMainFile
  
  EnableExplicit
  
  Procedure MakeRandom(*buf.Ascii,seed=0) 
    Protected a, len,c  
    If seed <> 0 
      RandomSeed(seed)
    EndIf 
    len = (MemorySize(*buf)-1) 
    For a = 0 To Len 
      c = Random(3) 
      Select c 
        Case 0 
          *buf\a = 'A' 
        Case 1 
          *buf\a = 'C' 
        Case 2 
          *buf\a = 'G' 
        Case 3 
          *buf\a = 'T' 
      EndSelect 
      *buf+1 
    Next   
    
  EndProcedure  
  
  
  Global bloom.ibloom,bloom2,bloom3  
  Global k=31
  Global ers.d = 1.0 / 1024    ;max errors   
  Global size = 1024*1024
  Global size2 = 2 * size 
  Global size3
  Global out.s 
  Global *buf,*pc
  Global st,et,et1,stm,etm,etm1,a,ct,ct1,ct2,ct3,x
  
  *buf = AllocateMemory(size2)
  MakeRandom(*buf) 
  
  bloom = Bloom_new(size,ers) 
  
  Global NewMap mp(size) 
  
  st= ElapsedMilliseconds() 
  For a = 1 To size
    bloom_Set(bloom,*buf+a,k)
  Next 
  et = ElapsedMilliseconds()  
  For a = 1 To size
    ct + bloom_get(bloom,*buf+a,k)
  Next 
  et1 = ElapsedMilliseconds()  
  
  stm= ElapsedMilliseconds() 
  For a = 1 To size
    mp(PeekS(*buf+a,k,#PB_Ascii))=1
  Next 
  etm = ElapsedMilliseconds()  
  For a = 1 To size
    x=mp(PeekS(*buf+a,k,#PB_Ascii))
  Next 
  etm1 = ElapsedMilliseconds()  
  
  ;test for false positives, items that haven't been added to the set. 
  For a = size To size2
    ct1 + bloom_get(bloom,*buf+a,k) 
  Next 
  
  *pc = bloom\Compress()  ;compress bloom to buffer 
  If *pc 
    size3 = PeekL(*pc) 
  EndIf   
  
  bloom2 = Bloom_Decompress(*pc) ;returns a new bloom 
  FreeMemory(*pc) 
  
  For a = 1 To size
    ct2 + bloom_get(bloom2,*buf+a,k)
  Next 
  
  bloom3 = Bloom_new(size,ers) 
  Bloom_OR(bloom3,bloom2) 
  
  If  Bloom_Save(bloom3,"testbloom.blm") 
    Bloom_Free(bloom3)
    bloom3 = Bloom_load("testbloom.blm")
  EndIf    
  
  For a = 1 To size
    ct3 + bloom_get(bloom3,*buf+a,k)
  Next 
  
  
  out = "bloom items = " + FormatNumber(size,0,".",",") + " memory = " + StrF(bloom_GetSize(bloom) / 1024 / 1024,2) + " mb" + #CRLF$
  out + "Compressed size " + Str(size3) + " b" +  Str(size3 / 1024) + " kb  " + StrF(size3 / 1024 / 1024,2) + " mb" + #CRLF$
  out + "Bloom get = " + Str(ct) +#CRLF$
  out + "Bloom2 get = " + Str(ct2) +#CRLF$
  out + "Bloom3 get = " + Str(ct3) +#CRLF$
  
  out + "max false positives expected = " + Str(size*ers) + " actual " + Str(ct1) + #CRLF$ 
  out + "Bloom set " + Str(et-st) + " ms" + #CRLF$  
  out + "Bloom get " + Str(et1-et) + " ms" + #CRLF$  
  out + "map set " + Str(etm-stm) + " ms" + #CRLF$  
  out + "map get " + Str(etm1-etm) + " ms" + #CRLF$  
  
  
  SetClipboardText(out) 
  
  MessageRequester("bloom",out) 
  
  bloom_Free(bloom) 
  
CompilerEndIf   

User avatar
Kwai chang caine
Always Here
Always Here
Posts: 5342
Joined: Sun Nov 05, 2006 11:42 pm
Location: Lyon - France

Re: Bloom filter x64

Post by Kwai chang caine »

Hello IDLE :D

Too much hard to understand for my little head :oops:
But i still got to test your code plane, and apparently that works :wink:
Compiler wrote:bloom items = 1,000,000 memory = 7.42 mb
Bloom get = 1000000
max false positives expected = 1000 actual 1021
Bloom set 1.73 seconds
Bloom get 1.79 seconds
map set 1.16 seconds
map get 0.91 seconds
Thanks for sharing 8)
ImageThe happiness is a road...
Not a destination
User avatar
idle
Always Here
Always Here
Posts: 5039
Joined: Fri Sep 21, 2007 5:52 am
Location: New Zealand

Re: Bloom filter x64

Post by idle »

Thanks kcc, try it without the debugger

As an explanation for those that don't mind TLDR
A bloom filter basically provides a fast test to see if an item exists in a set. Compered to a map the bloom uses at least 4 times less memory and the longer the key the more efficient the bloom is as the map also stores the key. So the bloom saves memory and runs ~2 to 10x faster than a map depending on key length.

Say you wanted to build a cache for 100,000,000 items and the key is a md5 32 bytes long. The bloom uses 742mb, same map would use ~9000mb, which is ((number of items * 32 * key length * 2) / 1048576 ) now what if only 1% of the items were actually repeats you'd only need to store 1,000,000 items in your map ~90mb and still have knowledge of 100,000,000 items in your bloom in 742 mb. So while the bloom doesn't store any data it provides a quick way to say if you've seen some key before which can then be used to fetch the data from secondary storage rather than exhausting all the memory of the system as you still need room for the payload data in ram for it to be of any use.

This one is made for a kmer counter (counts sequences of dna k letters long), focus is on the speed for large kmer keys into 100's counters are often limited to k=31 as that's what can fit into an integer when converting ATCG to 2 bit representation.
If the goal was to count the human genome which is ~3.2gb in ASCII and you wanted to count the frequency of kmers in ram and you tried to use a pb map to count them it's not going to happen, there's not enough ram! 3,435,973,836 items * 32 * k=31 * size of char=2 or ~6348gb even a bloom for that would be 200gb so you still need to chop it up into smaller more manageable chunks and collate the results. Despite the simplicity of the task kmer counters are really hard to scale efficiently. but bloom filter comes in very handy for filtering out unique items so you only store and count repetitions in secondary storage.
User avatar
Kwai chang caine
Always Here
Always Here
Posts: 5342
Joined: Sun Nov 05, 2006 11:42 pm
Location: Lyon - France

Re: Bloom filter x64

Post by Kwai chang caine »

I have tested a new time your code witthout debuger on the same machine
Compiler wrote:bloom items = 1,000,000 memory = 7.42 mb
Bloom get = 1000000
max false positives expected = 1000 actual 1026
Bloom set 0.20 seconds
Bloom get 0.20 seconds
map set 0.91 seconds
map get 0.64 seconds
I understand a little bit better what is the BLOOM
I think it's perhaps used in database for find more quickly data :idea:
Sometimes... we wonder, where you are going to look for such complicated subjects :shock:
A little bit the Einstein of PB :wink:

Thanks a lot, for your full explanation, and for trying to make KCC a little less ignorant 8)
Have a good day MASTER... or night ...i never know :lol:
ImageThe happiness is a road...
Not a destination
AZJIO
Addict
Addict
Posts: 1312
Joined: Sun May 14, 2017 1:48 am

Re: Bloom filter x64

Post by AZJIO »

Linux Arch x64 (Anarhy)

bloom items = 1,000,000 memory = 7.42 mb
Bloom get = 1000000
max false positives expected = 1000 actual 1012
Bloom set 0.67 seconds
Bloom get 0.68 seconds
map set 0.45 seconds
map get 0.36 seconds

bloom items = 10,000,000 memory = 74.20 mb
Bloom get = 10000000
max false positives expected = 10000 actual 10265
Bloom set 7.05 seconds
Bloom get 7.20 seconds
map set 4.95 seconds
map get 4.01 seconds
User avatar
idle
Always Here
Always Here
Posts: 5039
Joined: Fri Sep 21, 2007 5:52 am
Location: New Zealand

Re: Bloom filter x64

Post by idle »

Turn off the debugger.
AZJIO
Addict
Addict
Posts: 1312
Joined: Sun May 14, 2017 1:48 am

Re: Bloom filter x64

Post by AZJIO »

+DisableDebugger

bloom items = 1,000,000 memory = 7.42 mb
Bloom get = 1000000
max false positives expected = 1000 actual 976
Bloom set 0.12 seconds
Bloom get 0.11 seconds
map set 0.42 seconds
map get 0.30 seconds
User avatar
jacdelad
Addict
Addict
Posts: 1431
Joined: Wed Feb 03, 2021 12:46 pm
Location: Planet Riesa
Contact:

Re: Bloom filter x64

Post by jacdelad »

Wait, the debugger influences the count of false positives?
PureBasic 6.04/XProfan X4a/Embarcadero RAD Studio 11/Perl 5.2/Python 3.10
Windows 11/Ryzen 5800X/32GB RAM/Radeon 7770 OC/3TB SSD/11TB HDD
Synology DS1821+/36GB RAM/130TB
Synology DS920+/20GB RAM/54TB
Synology DS916+ii/8GB RAM/12TB
User avatar
StarBootics
Addict
Addict
Posts: 984
Joined: Sun Jul 07, 2013 11:35 am
Location: Canada

Re: Bloom filter x64

Post by StarBootics »

Hello everyone,

Any chance to have this code C backend compatible ? I think it's just the FastHash64() procedure than need to be changed.

Interesting piece of code by the way. I'm currently experimenting with HashTable but my Hash function kind a work but it's not very efficient.

Best regards
StarBootics
The Stone Age did not end due to a shortage of stones !
AZJIO
Addict
Addict
Posts: 1312
Joined: Sun May 14, 2017 1:48 am

Re: Bloom filter x64

Post by AZJIO »

I don't understand how to use it. Is it possible to make an example where there are several lines (10pcs), and check that a certain word is already in the list.

Happened

Code: Select all

CompilerIf #PB_Compiler_IsMainFile
	
	EnableExplicit
	
	Global bloom.Ibloom
	Global ers.d = 0.001
	Global cnt = 4
	
	bloom.Ibloom = Bloom_new(cnt, ers)

	bloom\Set(@"sdfg", Len("sdfg"))
	bloom\Set(@"iuop", Len("iuop"))
	bloom\Set(@"reterty", Len("reterty"))
	bloom\Set(@"fuf", Len("fuf"))
	
	Debug bloom\get(@"qwqw", Len("qwqw"))
	Debug bloom\get(@"reterty", Len("reterty"))
	Debug bloom\get(@"fuf", Len("fuf"))
	
	
	bloom\Free()
	
CompilerEndIf

doesn't work like that

Code: Select all

CompilerIf #PB_Compiler_IsMainFile
	
	EnableExplicit
	
	Global bloom.Ibloom
	Global ers.d = 0.001
	Global cnt, *p

	
	NewList word.s()
	AddElement(word())
	word() = "sdfg"
	AddElement(word())
	word() = "iuop"
	AddElement(word())
	word() = "reterty"
	cnt = ListSize(word())

	
	bloom.Ibloom = Bloom_new(cnt, ers)
	
	ResetList(word())

	Repeat
		*p = NextElement(word())
		If *p
; 			Debug PeekS(*p, 4, #PB_Ascii)
			bloom\Set(*p, Len(word()))
; 			bloom\Set(*p, StringByteLength(word(), #PB_Ascii))
		Else
			Break
		EndIf
	ForEver

	Debug bloom\get(@"reterty", Len("reterty"))
	Debug bloom\get(@"fuf", Len("fuf"))
	
	
	bloom\Free()
	
CompilerEndIf
User avatar
idle
Always Here
Always Here
Posts: 5039
Joined: Fri Sep 21, 2007 5:52 am
Location: New Zealand

Re: Bloom filter x64

Post by idle »

Code: Select all

EnableExplicit

Global NewList word.s()
Global *word.string,len   

AddElement(word())
word() = "sdfg"
AddElement(word())
word() = "iuop"
AddElement(word())
word() = "reterty"

Global cnt = ListSize(word())
Global ers.d = 0.001
Global bloom.Ibloom = Bloom_new(cnt,ers)

ForEach word()
  *word = @word() 
  Debug PeekS(@*word\s,-1,#PB_Unicode)
  len = StringByteLength(word())
  If *word 
    bloom\Set(@*word\s,len)
  EndIf   
Next   

Debug bloom\get(@"reterty", StringByteLength("reterty"))
Debug bloom\get(@"fuf", StringByteLength("fuf"))


jacdelad wrote: Wed Jul 06, 2022 12:25 am Wait, the debugger influences the count of false positives?
No but it effects the run time.


@StarBootics

The hash function is quite important for performance, I can have a go poring it to c backend tomorrow but it's a bit late for me tonight and I got stuck on movzx.
User avatar
idle
Always Here
Always Here
Posts: 5039
Joined: Fri Sep 21, 2007 5:52 am
Location: New Zealand

Re: Bloom filter x64

Post by idle »

StarBootics wrote: Wed Jul 06, 2022 12:32 am Hello everyone,

Any chance to have this code C backend compatible ? I think it's just the FastHash64() procedure than need to be changed.

Interesting piece of code by the way. I'm currently experimenting with HashTable but my Hash function kind a work but it's not very efficient.

Best regards
StarBootics
I redid the asm for c backend see 1st post for code , no difference in speed though
c backend
bloom items = 10,000,000 memory = 74.20 mb
Bloom get = 10000000
max false positives expected = 10000 actual 10041
Bloom set 0.55 seconds
Bloom get 0.52 seconds
map set 3.52 seconds
map get 2.93 seconds

asm backend
bloom items = 10,000,000 memory = 74.20 mb
Bloom get = 10000000
max false positives expected = 10000 actual 10044
Bloom set 0.63 seconds
Bloom get 0.56 seconds
map set 3.21 seconds
map get 2.79 seconds
User avatar
StarBootics
Addict
Addict
Posts: 984
Joined: Sun Jul 07, 2013 11:35 am
Location: Canada

Re: Bloom filter x64

Post by StarBootics »

Thanks,

Something strange the number of false positive on my computer appear to be lower. Time for the Bloom set/get is comparable to the result you have.
bloom items = 10,000,000 memory = 74.20 mb
Bloom get = 10000000
max false positives expected = 10000 actual 9909
Bloom set 0.62 seconds
Bloom get 0.60 seconds
map set 3.31 seconds
map get 2.86 seconds
Something to keep in mind that the PeekS() function is little bit slow and that might push the result in favor of the Bloom filter data structure.

Best regards
StarBootics
Last edited by StarBootics on Thu Jul 07, 2022 5:42 am, edited 1 time in total.
The Stone Age did not end due to a shortage of stones !
User avatar
idle
Always Here
Always Here
Posts: 5039
Joined: Fri Sep 21, 2007 5:52 am
Location: New Zealand

Re: Bloom filter x64

Post by idle »

StarBootics wrote: Thu Jul 07, 2022 4:41 am Thanks,

Something strange the number of false positive on my computer appear to be lower. Time for the Bloom set/get is comparable to the result you have.
bloom items = 10,000,000 memory = 74.20 mb
Bloom get = 10000000
max false positives expected = 10000 actual 9909
Bloom set 0.62 seconds
Bloom get 0.60 seconds
map set 3.31 seconds
map get 2.86 seconds
Best regards
StarBootics
No it's normal the test is with random data, so the number of collisions will vary but should be lower than the % you specify.
User avatar
idle
Always Here
Always Here
Posts: 5039
Joined: Fri Sep 21, 2007 5:52 am
Location: New Zealand

Re: Bloom filter x64

Post by idle »

updated with inline c for macOS or clang
works on windows linux macOS asm and c backends
Post Reply