Page 1 of 3

Population count

Posted: Thu May 14, 2015 11:57 am
by Little John
Hi all,

I want to count the number of bits that are set to 1 in a 32 bit variable.

The following code seems to work correctly on Windows, with PB 5.31 x86 and x64.
Are there any mistakes in the code, or is there any other room for improvement?
Thanks in advance!

Code: Select all

#HasSSE42 = #False  ; Does the processor support the SSE 4.2 instruction set?
                    ; Can be checked with idle's fine CPUInfo module:
                    ; http://www.purebasic.fr/english/viewtopic.php?f=12&t=61060


Procedure.l PopCount (k.l)
   ; -- Population count:
   ; get number of bits that are set to 1 in 'k'
   
   CompilerIf #HasSSE42
      ! mov    eax, [p.v_k]
      ! popcnt eax, eax
      
   CompilerElse   
      ! mov  edx, [p.v_k]
      ! xor  eax, eax
      ! mov  ecx, 32
      
      ! popcount_again:  
      ! dec  ecx
      ! bt   edx, ecx
      ! jnc  popcount_notset
      ! inc  eax
      ! popcount_notset:   
      ! or   ecx, ecx
      ! jnz  popcount_again
   CompilerEndIf   
   
   ProcedureReturn           ; return the value of eax
EndProcedure


Debug PopCount(%0)                                 ;  0
Debug PopCount(%11)                                ;  2
Debug PopCount(%10000000000000000000000000000000)  ;  1
Debug PopCount(%11111111111111111111111111111111)  ; 32

Re: Population count

Posted: Thu May 14, 2015 12:29 pm
by wilbert
Here's a second method (Brian Kernighan's way)
Especially faster when not all bits are set.

Code: Select all

 Procedure.l PopCount_m2(l.l)
  !xor eax, eax
  !mov edx, [p.v_l]
  !and edx, edx
  !jz popcount_m2_done
  !popcount_m2_loop:
  !inc eax
  !lea ecx, [edx - 1]
  !and edx, ecx
  !jnz popcount_m2_loop
  !popcount_m2_done:
  ProcedureReturn
EndProcedure

Re: Population count

Posted: Thu May 14, 2015 12:30 pm
by luis
I don't remember much about ASM, but I would simplify the second part like this by using the carry:

Code: Select all

 !mov edx, [p.v_k] 
 !xor eax, eax
 !mov ecx, 32 
  
 !popcount:  
 !shl edx, 1
 !adc eax, 0
 !loop popcount
If you have something against loop (I don't) you can replace it with

Code: Select all

 
 !dec ecx
 !jnz popcount 

Re: Population count

Posted: Thu May 14, 2015 3:39 pm
by Little John
Cool. 8) Many thanks, wilbert and luis!

These are typical results that I get for speed testing (PB 5.31 x86):

Code: Select all

---------------------------
PopCount speed test
---------------------------
t0 =   29 ms   (11%)  ; 'popcnt' asm instruction
t1 =  259 ms  (100%)  ; wilbert
t2 =  334 ms  (128%)  ; luis
t3 =  737 ms  (284%)  ; luis (using 'loop')
t4 = 1316 ms  (508%)  ; LJ
t5 = 1240 ms  (478%)  ; Trond (recursive)
t6 =  854 ms  (329%)  ; Trond
t7 =   40 ms   (15%)  ; idle
---------------------------
For the record, here is the code that I used for testing. Thanks again!

Code: Select all

; PB 5.31

EnableExplicit

Procedure.l PopCount0 (k.l)
   ! mov    eax, [p.v_k]
   ! popcnt eax, eax
   
   ProcedureReturn
EndProcedure


Procedure.l PopCount1 (k.l)
   ; -- wilbert
   
   !xor eax, eax
   !mov edx, [p.v_k]
   !and edx, edx
   !jz popcount_m2_done
   
   !popcount_m2_loop:
   !inc eax
   !lea ecx, [edx - 1]
   !and edx, ecx
   !jnz popcount_m2_loop
   
   !popcount_m2_done:
   ProcedureReturn
EndProcedure


Procedure.l PopCount2 (k.l)
   ; -- luis
   
   !mov edx, [p.v_k]
   !xor eax, eax
   !mov ecx, 32
   
   !popcount2:
   !shl edx, 1
   !adc eax, 0
   !dec ecx
   !jnz popcount2
   
   ProcedureReturn
EndProcedure


Procedure.l PopCount3 (k.l)
   ; -- luis
   
   !mov edx, [p.v_k]
   !xor eax, eax
   !mov ecx, 32
   
   !popcount3:
   !shl edx, 1
   !adc eax, 0
   !loop popcount3
   
   ProcedureReturn
EndProcedure


Procedure.l PopCount4 (k.l)
   ; -- LJ
   
   ! mov  edx, [p.v_k]
   ! xor  eax, eax
   ! mov  ecx, 32
   
   ! popcount_again:
   ! dec  ecx
   ! bt   edx, ecx
   ! jnc  popcount_notset
   ! inc  eax
   ! popcount_notset:   
   ! or   ecx, ecx
   ! jnz  popcount_again
   
   ProcedureReturn
EndProcedure


Procedure.l PopCount5 (v.l, i = 0)
   ; -- Trond
   
   If i < 32
      ProcedureReturn (v & 1 << i) >> i + PopCount5(v, i+1)
   EndIf
EndProcedure


Procedure.l PopCount6 (v.l)
   ; -- Trond
   
   Protected i.i, Agg.l
   
   For i = 0 To 31
      Agg + (v & 1 << i) >> i
   Next
   ProcedureReturn Agg
EndProcedure


Procedure.l Popcount7 (x.l)
   ; -- idle (adapted for 32 bit)
   
   ;x - (x >> 1) &  $55555555           
   !mov eax, [p.v_x]
   !mov edx, eax
   !shr edx, 1
   !and edx, 0x55555555
   !sub eax, edx
   ;x = (x & $33333333) + ((x >> 2) & $33333333)
   !mov edx, eax       ;x
   !and eax, 0x33333333
   !shr edx, 2
   !and edx, 0x33333333
   !add eax, edx
   ;x = (x + (x >> 4)) & $0f0f0f0f0f
   !mov edx, eax
   !shr edx, 4
   !add eax, edx
   !and eax, 0x0f0f0f0f
   ;x * 0x01010101 >> 24
   !imul eax, 0x01010101
   !shr eax, 24
   ProcedureReturn 
EndProcedure 


; ---- Initialisation ----
Define.i i, x, rep=10000000
Define.i t0, t1, t2, t3, t4, t5, t6, t7

Dim Rnd.l(rep)
For i = 1 To rep
   Rnd(i) = Random(2147483647)
Next


; ---- Small check whether all procedures return the same results ----
For i = 1 To 100
   x = PopCount0(Rnd(i))
   If x <> PopCount1(Rnd(i)) Or
      x <> PopCount2(Rnd(i)) Or
      x <> PopCount3(Rnd(i)) Or
      x <> PopCount4(Rnd(i)) Or
      x <> PopCount5(Rnd(i)) Or
      x <> PopCount6(Rnd(i)) Or
      x <> PopCount7(Rnd(i))
      MessageRequester("Error",
                       "Different results for PopCount(" + Rnd(i) + ")")
      End
   EndIf
Next   


; ---- Speed test ----
t0 = ElapsedMilliseconds()
For i = 1 To rep
   x = PopCount0(Rnd(i))
Next   
t0 = ElapsedMilliseconds() - t0

t1 = ElapsedMilliseconds()
For i = 1 To rep
   x = PopCount1(Rnd(i))
Next   
t1 = ElapsedMilliseconds() - t1

t2 = ElapsedMilliseconds()
For i = 1 To rep
   x = PopCount2(Rnd(i))
Next   
t2 = ElapsedMilliseconds() - t2

t3 = ElapsedMilliseconds()
For i = 1 To rep
   x = PopCount3(Rnd(i))
Next   
t3 = ElapsedMilliseconds() - t3

t4 = ElapsedMilliseconds()
For i = 1 To rep
   x = PopCount4(Rnd(i))
Next   
t4 = ElapsedMilliseconds() - t4

t5 = ElapsedMilliseconds()
For i = 1 To rep
   x = PopCount5(Rnd(i))
Next   
t5 = ElapsedMilliseconds() - t5

t6 = ElapsedMilliseconds()
For i = 1 To rep
   x = PopCount6(Rnd(i))
Next   
t6 = ElapsedMilliseconds() - t6

t7 = ElapsedMilliseconds()
For i = 1 To rep
   x = PopCount7(Rnd(i))
Next   
t7 = ElapsedMilliseconds() - t7


MessageRequester("PopCount speed test",
                 "t0 = " + t0 + " ms     (" + Int(100*t0/t1) + "%)" + #LF$ +
                 "t1 = " + t1 + " ms   (100%)" + #LF$ +
                 "t2 = " + t2 + " ms   (" + Int(100*t2/t1) + "%)" + #LF$ +
                 "t3 = " + t3 + " ms   (" + Int(100*t3/t1) + "%)" + #LF$ +
                 "t4 = " + t4 + " ms  (" + Int(100*t4/t1) + "%)" + #LF$ +
                 "t5 = " + t5 + " ms  (" + Int(100*t5/t1) + "%)" + #LF$ +
                 "t6 = " + t6 + " ms   (" + Int(100*t6/t1) + "%)" + #LF$ +
                 "t7 = " + t7 + " ms     (" + Int(100*t7/t1) + "%)")

Re: Population count

Posted: Thu May 14, 2015 5:24 pm
by Trond
On linux x64 it says popcnt: invalid instruction.

I made two non-asm version to compare. The second is actually faster than PopCount4().

Code: Select all

Procedure PopCount5(v.l, i = 0)
  If i < 32
    ProcedureReturn (v & 1 << i) >> i + PopCount5(v, i+1)
  EndIf
EndProcedure

Procedure PopCount6(v.l, i = 0)
  For I = 0 To 31
    Agg + (v & 1 << i) >> i
  Next
  ProcedureReturn Agg
EndProcedure

Re: Population count

Posted: Thu May 14, 2015 11:14 pm
by idle
swar method no branching

Code: Select all

Procedure Popcount32(x.l)
  ;x - (x >> 1) &  $55555555           
  !mov rax, [p.v_x]
  !mov rdx, rax
  !shr rdx, 1
  !and rdx, 0x55555555 
  !sub rax,rdx 
  ;x = (x & $33333333) + ((x >> 2) & $33333333)
  !mov rdx, rax       ;x 
  !and rax, 0x33333333 
  !shr rdx, 2 
  !and rdx, 0x33333333
  !add rax,rdx
  ;x = (x + (x >> 4)) & $0f0f0f0f0f
  !mov rdx, rax 
  !shr rdx, 4 
  !add rax,rdx 
  !and rax, 0x0f0f0f0f
  ;x * 0x01010101 >> 24
  !imul eax, 0x01010101
  !shr eax, 24
  ProcedureReturn  
EndProcedure 

x = Random($FFFFFFFF)
Debug Bin(x)
Debug Popcount32(x) 


Re: Population count

Posted: Thu May 14, 2015 11:57 pm
by luis
@idle, wow that's insane !

How it works explained (same code in C) -> http://www.playingwithpointers.com/swar.html

Re: Population count

Posted: Fri May 15, 2015 4:57 am
by wilbert
Very nice Idle.
By far the fastest method after the popcnt instruction. :)

Re: Population count

Posted: Fri May 15, 2015 5:26 am
by idle
If you need a 64 bit popcount, it's the same but it can't be done with immediate's
(which makes me wonder if the previous one would need to be done like this for 32 bit x86)

Code: Select all

Procedure Popcount64(x.i)
  ;x - (x >> 1) &  $5555555555555555           
  !mov rax, [p.v_x]
  !mov rdx, rax
  !shr rdx, 1
  !mov r15, 0x5555555555555555
  !and rdx, r15
  !sub rax,rdx 
  ;x = (x & $3333333333333333) + ((x >> 2) & $3333333333333333)
  !mov rdx, rax       ;x 
  !mov r15, 0x3333333333333333 
  !and rax, r15  
  !shr rdx, 2 
  !and rdx, r15
  !add rax,rdx
  ;x = (x + (x >> 4)) & $0f0f0f0f0f0f0f0f
  !mov rdx, rax 
  !shr rdx, 4 
  !add rax,rdx 
  !mov r15, 0x0f0f0f0f0f0f0f0f
  !and rax, r15 
  ;x * 0101010101010101 >> 56
  !mov r15, 0x0101010101010101
  !imul rax, r15
  !shr rax, 56 
  ProcedureReturn  
EndProcedure 

Re: Population count

Posted: Fri May 15, 2015 5:27 am
by Little John
Trond and idle, many thanks for your posts!
I've included your code in the speed test above.
Trond wrote:On linux x64 it says popcnt: invalid instruction.
Same problem here (using Linux Mint 17.1).
On the same computer, the code works fine on Windows 7.
So it's actually a problem of (only some distributions of?) Linux.
Trond wrote:I made two non-asm version to compare. The second is actually faster than PopCount4().
This is a good example that smart PB code can be faster than badly written ASM code.
In my tests, even your recursive version is faster than my PopCount4(). :-)

Idle, your code looks like a magic trick to me. :-)
But it works. It's almost as fast as the 'popcnt' instruction.
Incredible!
(I took your first code and adapted it for 32 bit.)

Re: Population count

Posted: Fri May 15, 2015 5:54 am
by idle
The problem with popcnt on linux is the fasm version
I just updated it to the latest version and then it works

Code: Select all

Procedure PopCountSSE4(x.i)
  !popcnt rax,[p.v_x]   
  ProcedureReturn 
EndProcedure   

x = Random($FFFFFFFF)
Debug Bin(x)
Debug PopCountSSE4(x) 

Re: Population count

Posted: Fri May 15, 2015 6:21 am
by said
For fun, i took Trond's PopCount6 and removed the loop for, with plain silly repetition block see in PopCount8() the result is interesting (72% vs 340% on my PC)

Code: Select all


EnableExplicit

Procedure.l PopCount0 (k.l)
   ! mov    eax, [p.v_k]
   ! popcnt eax, eax
   
   ProcedureReturn
EndProcedure


Procedure.l PopCount1 (k.l)
   ; -- wilbert
   
   !xor eax, eax
   !mov edx, [p.v_k]
   !and edx, edx
   !jz popcount_m2_done
   
   !popcount_m2_loop:
   !inc eax
   !lea ecx, [edx - 1]
   !and edx, ecx
   !jnz popcount_m2_loop
   
   !popcount_m2_done:
   ProcedureReturn
EndProcedure


Procedure.l PopCount2 (k.l)
   ; -- luis
   
   !mov edx, [p.v_k]
   !xor eax, eax
   !mov ecx, 32
   
   !popcount2:
   !shl edx, 1
   !adc eax, 0
   !dec ecx
   !jnz popcount2
   
   ProcedureReturn
EndProcedure


Procedure.l PopCount3 (k.l)
   ; -- luis
   
   !mov edx, [p.v_k]
   !xor eax, eax
   !mov ecx, 32
   
   !popcount3:
   !shl edx, 1
   !adc eax, 0
   !loop popcount3
   
   ProcedureReturn
EndProcedure


Procedure.l PopCount4 (k.l)
   ; -- LJ
   
   ! mov  edx, [p.v_k]
   ! xor  eax, eax
   ! mov  ecx, 32
   
   ! popcount_again:
   ! dec  ecx
   ! bt   edx, ecx
   ! jnc  popcount_notset
   ! inc  eax
   ! popcount_notset:   
   ! or   ecx, ecx
   ! jnz  popcount_again
   
   ProcedureReturn
EndProcedure


Procedure.l PopCount5 (v.l, i = 0)
   ; -- Trond
   
   If i < 32
      ProcedureReturn (v & 1 << i) >> i + PopCount5(v, i+1)
   EndIf
EndProcedure


Procedure.l PopCount6 (v.l)
   ; -- Trond
   
   Protected i.i, Agg.l
   
   For i = 0 To 31
      Agg + (v & 1 << i) >> i
   Next
   ProcedureReturn Agg
EndProcedure


Procedure.l Popcount7 (x.l)
   ; -- idle (adapted for 32 bit)
   
   ;x - (x >> 1) &  $55555555           
   !mov eax, [p.v_x]
   !mov edx, eax
   !shr edx, 1
   !and edx, 0x55555555
   !sub eax, edx
   ;x = (x & $33333333) + ((x >> 2) & $33333333)
   !mov edx, eax       ;x
   !and eax, 0x33333333
   !shr edx, 2
   !and edx, 0x33333333
   !add eax, edx
   ;x = (x + (x >> 4)) & $0f0f0f0f0f
   !mov edx, eax
   !shr edx, 4
   !add eax, edx
   !and eax, 0x0f0f0f0f
   ;x * 0x01010101 >> 24
   !imul eax, 0x01010101
   !shr eax, 24
   ProcedureReturn 
EndProcedure 

Procedure.l Popcount8(v.l)
    ; --- Trond without a loop
    Protected y.l
    
    y = (v & 1 <<  0) >>  0 +
        (v & 1 <<  1) >>  1 +
        (v & 1 <<  2) >>  2 +
        (v & 1 <<  3) >>  3 +
        (v & 1 <<  4) >>  4 +
        (v & 1 <<  5) >>  5 +
        (v & 1 <<  6) >>  6 +
        (v & 1 <<  7) >>  7 +
        (v & 1 <<  8) >>  8 +
        (v & 1 <<  9) >>  9 +
        (v & 1 << 10) >> 10 +
        (v & 1 << 11) >> 11 +
        (v & 1 << 12) >> 12 +
        (v & 1 << 13) >> 13 +
        (v & 1 << 14) >> 14 +
        (v & 1 << 15) >> 15 +
        (v & 1 << 16) >> 16 +
        (v & 1 << 17) >> 17 +
        (v & 1 << 18) >> 18 +
        (v & 1 << 19) >> 19 +
        (v & 1 << 20) >> 20 +
        (v & 1 << 21) >> 21 +
        (v & 1 << 22) >> 22 +
        (v & 1 << 23) >> 23 +
        (v & 1 << 24) >> 24 +
        (v & 1 << 25) >> 25 +
        (v & 1 << 26) >> 26 +
        (v & 1 << 27) >> 27 +
        (v & 1 << 28) >> 28 +
        (v & 1 << 29) >> 29 +
        (v & 1 << 30) >> 30 +
        (v & 1 << 31) >> 31 
    
    ProcedureReturn y
EndProcedure

; ---- Initialisation ----
Define.i i, x, rep=10000000
Define.i t0, t1, t2, t3, t4, t5, t6, t7, t8

Dim Rnd.l(rep)
For i = 1 To rep
   Rnd(i) = Random(2147483647)
Next


; ---- Small check whether all procedures return the same results ----
For i = 1 To 100
   x = PopCount0(Rnd(i))
   If x <> PopCount1(Rnd(i)) Or
      x <> PopCount2(Rnd(i)) Or
      x <> PopCount3(Rnd(i)) Or
      x <> PopCount4(Rnd(i)) Or
      x <> PopCount5(Rnd(i)) Or
      x <> PopCount6(Rnd(i)) Or
      x <> PopCount7(Rnd(i)) Or
      x <> PopCount8(Rnd(i)) 
      
      MessageRequester("Error",
                       "Different results for PopCount(" + Rnd(i) + ")")
      End
   EndIf
Next   


; ---- Speed test ----
t0 = ElapsedMilliseconds()
For i = 1 To rep
   x = PopCount0(Rnd(i))
Next   
t0 = ElapsedMilliseconds() - t0

t1 = ElapsedMilliseconds()
For i = 1 To rep
   x = PopCount1(Rnd(i))
Next   
t1 = ElapsedMilliseconds() - t1

t2 = ElapsedMilliseconds()
For i = 1 To rep
   x = PopCount2(Rnd(i))
Next   
t2 = ElapsedMilliseconds() - t2

t3 = ElapsedMilliseconds()
For i = 1 To rep
   x = PopCount3(Rnd(i))
Next   
t3 = ElapsedMilliseconds() - t3

t4 = ElapsedMilliseconds()
For i = 1 To rep
   x = PopCount4(Rnd(i))
Next   
t4 = ElapsedMilliseconds() - t4

t5 = ElapsedMilliseconds()
For i = 1 To rep
   x = PopCount5(Rnd(i))
Next   
t5 = ElapsedMilliseconds() - t5

t6 = ElapsedMilliseconds()
For i = 1 To rep
   x = PopCount6(Rnd(i))
Next   
t6 = ElapsedMilliseconds() - t6

t7 = ElapsedMilliseconds()
For i = 1 To rep
   x = PopCount7(Rnd(i))
Next   
t7 = ElapsedMilliseconds() - t7

t8 = ElapsedMilliseconds()
For i = 1 To rep
   x = PopCount8(Rnd(i))
Next   
t8 = ElapsedMilliseconds() - t8

MessageRequester("PopCount speed test",
                 "t0 = " + t0 + " ms     (" + Int(100*t0/t1) + "%)" + #LF$ +
                 "t1 = " + t1 + " ms   (100%)" + #LF$ +
                 "t2 = " + t2 + " ms   (" + Int(100*t2/t1) + "%)" + #LF$ +
                 "t3 = " + t3 + " ms   (" + Int(100*t3/t1) + "%)" + #LF$ +
                 "t4 = " + t4 + " ms   (" + Int(100*t4/t1) + "%)" + #LF$ +
                 "t5 = " + t5 + " ms   (" + Int(100*t5/t1) + "%)" + #LF$ +
                 "t6 = " + t6 + " ms   (" + Int(100*t6/t1) + "%)" + #LF$ +
                 "t7 = " + t7 + " ms   (" + Int(100*t7/t1) + "%)" + #LF$ +
                 "t8 = " + t8 + " ms   (" + Int(100*t8/t1) + "%)")

Re: Population count

Posted: Fri May 15, 2015 6:31 am
by wilbert
idle wrote:The problem with popcnt on linux is the fasm version
I just updated it to the latest version and then it works
If for some reason you wouldn't want to do that you can define the opcode with some bytes

Code: Select all

Procedure.l PopCount(x.l)
  !mov eax, [p.v_x]
  !dd 0xc0b80ff3; popcnt eax, eax
  ProcedureReturn 
EndProcedure
Strange that the Linux version of PureBasic doesn't have the same assembler version.
I thought both Linux and Windows used Fasm.
On the other hand, on OS X we have Nasm for x86 and Yasm for x64. :shock: (no idea why not the same is used for both).

Re: Population count

Posted: Fri May 15, 2015 6:49 am
by idle
wilbert wrote: Strange that the Linux version of PureBasic doesn't have the same assembler version.
I thought both Linux and Windows used Fasm.
On the other hand, on OS X we have Nasm for x86 and Yasm for x64. :shock: (no idea why not the same is used for both).

I guess Fred forgot to update fasm for 5.22 linux x64 or maybe there was an unresolved issue with fasm around the time of 5.22 release?

The different assemblers kind of preclude using preprocessor logic in the assembler. it's a shame fasm
hasn't built a mac version yet.

Re: Population count

Posted: Fri May 15, 2015 9:57 am
by davido
@Little John,
Thank you for bringing this subject up.
I had written a PB version by the checking each bit. It runs at 600%.
I've chosen the PB version of the method described by idle; it runs at 210%.
I present it, just in case it can be improved upon:

Code: Select all


EnableExplicit

Procedure.l PopCount0 (k.l)
   ! mov    eax, [p.v_k]
   ! popcnt eax, eax
   
   ProcedureReturn
EndProcedure


Procedure.l PopCount1 (k.l)
   ; -- wilbert
   
   !xor eax, eax
   !mov edx, [p.v_k]
   !and edx, edx
   !jz popcount_m2_done
   
   !popcount_m2_loop:
   !inc eax
   !lea ecx, [edx - 1]
   !and edx, ecx
   !jnz popcount_m2_loop
   
   !popcount_m2_done:
   ProcedureReturn
EndProcedure


Procedure.l PopCount2 (k.l)
   ; -- luis
   
   !mov edx, [p.v_k]
   !xor eax, eax
   !mov ecx, 32
   
   !popcount2:
   !shl edx, 1
   !adc eax, 0
   !dec ecx
   !jnz popcount2
   
   ProcedureReturn
EndProcedure


Procedure.l PopCount3 (k.l)
   ; -- luis
   
   !mov edx, [p.v_k]
   !xor eax, eax
   !mov ecx, 32
   
   !popcount3:
   !shl edx, 1
   !adc eax, 0
   !loop popcount3
   
   ProcedureReturn
EndProcedure


Procedure.l PopCount4 (k.l)
   ; -- LJ
   
   ! mov  edx, [p.v_k]
   ! xor  eax, eax
   ! mov  ecx, 32
   
   ! popcount_again:
   ! dec  ecx
   ! bt   edx, ecx
   ! jnc  popcount_notset
   ! inc  eax
   ! popcount_notset:   
   ! or   ecx, ecx
   ! jnz  popcount_again
   
   ProcedureReturn
EndProcedure


Procedure.l PopCount5 (v.l, i = 0)
   ; -- Trond
   
   If i < 32
      ProcedureReturn (v & 1 << i) >> i + PopCount5(v, i+1)
   EndIf
EndProcedure


Procedure.l PopCount6 (v.l)
   ; -- Trond
   
   Protected i.i, Agg.l
   
   For i = 0 To 31
      Agg + (v & 1 << i) >> i
   Next
   ProcedureReturn Agg
EndProcedure


Procedure.l Popcount7 (x.l)
   ; -- idle (adapted for 32 bit)
   
   ;x - (x >> 1) &  $55555555           
   !mov eax, [p.v_x]
   !mov edx, eax
   !shr edx, 1
   !and edx, 0x55555555
   !sub eax, edx
   ;x = (x & $33333333) + ((x >> 2) & $33333333)
   !mov edx, eax       ;x
   !and eax, 0x33333333
   !shr edx, 2
   !and edx, 0x33333333
   !add eax, edx
   ;x = (x + (x >> 4)) & $0f0f0f0f0f
   !mov edx, eax
   !shr edx, 4
   !add eax, edx
   !and eax, 0x0f0f0f0f
   ;x * 0x01010101 >> 24
   !imul eax, 0x01010101
   !shr eax, 24
   ProcedureReturn 
EndProcedure 

Procedure.l Popcount8(v.l)
    ; --- Trond without a loop
    Protected y.l
    
    y = (v & 1 <<  0) >>  0 +
        (v & 1 <<  1) >>  1 +
        (v & 1 <<  2) >>  2 +
        (v & 1 <<  3) >>  3 +
        (v & 1 <<  4) >>  4 +
        (v & 1 <<  5) >>  5 +
        (v & 1 <<  6) >>  6 +
        (v & 1 <<  7) >>  7 +
        (v & 1 <<  8) >>  8 +
        (v & 1 <<  9) >>  9 +
        (v & 1 << 10) >> 10 +
        (v & 1 << 11) >> 11 +
        (v & 1 << 12) >> 12 +
        (v & 1 << 13) >> 13 +
        (v & 1 << 14) >> 14 +
        (v & 1 << 15) >> 15 +
        (v & 1 << 16) >> 16 +
        (v & 1 << 17) >> 17 +
        (v & 1 << 18) >> 18 +
        (v & 1 << 19) >> 19 +
        (v & 1 << 20) >> 20 +
        (v & 1 << 21) >> 21 +
        (v & 1 << 22) >> 22 +
        (v & 1 << 23) >> 23 +
        (v & 1 << 24) >> 24 +
        (v & 1 << 25) >> 25 +
        (v & 1 << 26) >> 26 +
        (v & 1 << 27) >> 27 +
        (v & 1 << 28) >> 28 +
        (v & 1 << 29) >> 29 +
        (v & 1 << 30) >> 30 +
        (v & 1 << 31) >> 31 
    
    ProcedureReturn y
EndProcedure


Procedure popcount9(i.l)
  Protected j.i
  j = (i >> 1) & $55555555;
  i = i - j; // (A)
  i = (i & $33333333) + ((i >> 2) & $33333333); // (B)
  i = (i & $0F0F0F0F) + ((i >> 4) & $0F0F0F0F); // (C)
  i = i * $01010101; // (D)
  ProcedureReturn i >> 24;
EndProcedure

Procedure NumOfBitsSet(N.q)
  ;=======================================================
  ;Counts the number of bits set in N and returns the
  ;count as an integer.
  ;=======================================================
  Protected Count.i
  If N < 0
    ProcedureReturn -1
  Else
    While N > 0
      If N & 1 = 1
        Count + 1
      EndIf
      N >> 1
    Wend 
    ProcedureReturn Count
  EndIf
EndProcedure

; ---- Initialisation ----
Define.i i, x, rep=10000000
Define.i t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, ta

Dim Rnd.l(rep)
For i = 1 To rep
   Rnd(i) = Random(2147483647)
Next


; ---- Small check whether all procedures return the same results ----
For i = 1 To 100
   x = PopCount0(Rnd(i))
   If x <> PopCount1(Rnd(i)) Or
      x <> PopCount2(Rnd(i)) Or
      x <> PopCount3(Rnd(i)) Or
      x <> PopCount4(Rnd(i)) Or
      x <> PopCount5(Rnd(i)) Or
      x <> PopCount6(Rnd(i)) Or
      x <> PopCount7(Rnd(i)) Or
      x <> PopCount8(Rnd(i)) Or 
      x <> PopCount9(Rnd(i)) Or
      x <> NumOfBitsSet(Rnd(i)) 
      
      MessageRequester("Error",
                       "Different results for PopCount(" + Rnd(i) + ")")
      End
   EndIf
Next   


; ---- Speed test ----
t0 = ElapsedMilliseconds()
For i = 1 To rep
   x = PopCount0(Rnd(i))
Next   
t0 = ElapsedMilliseconds() - t0

t1 = ElapsedMilliseconds()
For i = 1 To rep
   x = PopCount1(Rnd(i))
Next   
t1 = ElapsedMilliseconds() - t1

t2 = ElapsedMilliseconds()
For i = 1 To rep
   x = PopCount2(Rnd(i))
Next   
t2 = ElapsedMilliseconds() - t2

t3 = ElapsedMilliseconds()
For i = 1 To rep
   x = PopCount3(Rnd(i))
Next   
t3 = ElapsedMilliseconds() - t3

t4 = ElapsedMilliseconds()
For i = 1 To rep
   x = PopCount4(Rnd(i))
Next   
t4 = ElapsedMilliseconds() - t4

t5 = ElapsedMilliseconds()
For i = 1 To rep
   x = PopCount5(Rnd(i))
Next   
t5 = ElapsedMilliseconds() - t5

t6 = ElapsedMilliseconds()
For i = 1 To rep
   x = PopCount6(Rnd(i))
Next   
t6 = ElapsedMilliseconds() - t6

t7 = ElapsedMilliseconds()
For i = 1 To rep
   x = PopCount7(Rnd(i))
Next   
t7 = ElapsedMilliseconds() - t7

t8 = ElapsedMilliseconds()
For i = 1 To rep
   x = PopCount8(Rnd(i))
Next   
t8 = ElapsedMilliseconds() - t8

t9 = ElapsedMilliseconds()
For i = 1 To rep
   x = PopCount9(Rnd(i))
Next   
t9 = ElapsedMilliseconds() - t9

ta = ElapsedMilliseconds()
For i = 1 To rep
   x = NumOfBitsSet(Rnd(i))
Next   
ta = ElapsedMilliseconds() - ta

MessageRequester("PopCount speed test",
                 "t0 = " + t0 + " ms     (" + Int(100*t0/t1) + "%) (Intel)" + #LF$ +
                 "t1 = " + t1 + " ms   (100%) (wilbert)" + #LF$ +
                 "t2 = " + t2 + " ms   (" + Int(100*t2/t1) + "%) (luis)" + #LF$ +
                 "t3 = " + t3 + " ms   (" + Int(100*t3/t1) + "%) (luis2)" + #LF$ +
                 "t4 = " + t4 + " ms   (" + Int(100*t4/t1) + "%) Little John)" + #LF$ +
                 "t5 = " + t5 + " ms   (" + Int(100*t5/t1) + "%) (Trond)" + #LF$ +
                 "t6 = " + t6 + " ms   (" + Int(100*t6/t1) + "%) (Trond2)" + #LF$ +
                 "t7 = " + t7 + " ms   (" + Int(100*t7/t1) + "%) (idle)" + #LF$ +
                 "t8 = " + t8 + " ms   (" + Int(100*t8/t1) + "%) (said version of trond unrolled)" + #LF$ +
                 "t9 = " + t9 + " ms   (" + Int(100*t9/t1) + "%) (idle-PB)" + #LF$ +
                 "ta = " + ta + " ms   (" + Int(100*ta/t1) + "%) (davido)")