It is currently Wed Jan 20, 2021 4:09 am

All times are UTC + 1 hour




Post new topic Reply to topic  [ 26 posts ]  Go to page Previous  1, 2
Author Message
 Post subject: Re: Cube root
PostPosted: Mon Dec 16, 2013 9:53 am 
Offline
PureBasic Expert
PureBasic Expert

Joined: Sun Aug 08, 2004 5:21 am
Posts: 3710
Location: Netherlands
Here's a modified version of the procedure Helle created that works on both x86 and x64
Code:
Procedure.d CubeRoot(Number.d)
  !movsd xmm0, [p.v_Number]
  !pcmpeqb xmm4, xmm4         ; create mask 0x7fffffffffffffff
  !psrlq xmm4, 1
  !movsd xmm1, xmm0
  !pand xmm1, xmm4            ; without sign
  !pandn xmm4, xmm0           ; sign only
  ;now find a good approximation for the start-value; http://metamerist.com/cbrt/cbrt.htm - Kahan´s bit hack
  !pshufd xmm0, xmm1, 1
  !movq xmm2, [CubeRootConst]
  !pmuludq xmm0, xmm2
  !paddd xmm0, xmm2
 
  !movsd xmm2, xmm0
  !mulsd xmm2, xmm2
  !mulsd xmm2, xmm0           ; xmm2=x*x*x
  !movsd xmm3, xmm2
  !addsd xmm3, xmm3           ; 2*x*x*x
  !addsd xmm3, xmm1           ; xmm1=a
  !addsd xmm2, xmm1
  !addsd xmm2, xmm1
  !divsd xmm2, xmm3
  !mulsd xmm0, xmm2
 
  !movsd xmm2, xmm0
  !mulsd xmm2, xmm2
  !mulsd xmm2, xmm0           ; xmm2=x*x*x
  !movsd xmm3, xmm2
  !addsd xmm3, xmm3           ; 2*x*x*x
  !addsd xmm3, xmm1           ; xmm1=a
  !addsd xmm2, xmm1
  !addsd xmm2, xmm1
  !divsd xmm2, xmm3
  !mulsd xmm0, xmm2
 
  !por xmm0, xmm4
  !movsd [p.v_Number], xmm0
  !fld qword [p.v_Number]
  ProcedureReturn
  !CubeRootConst: dd 0x55555555, 0x2a9f7893
EndProcedure


Debug CubeRoot(-12345.6789)

_________________
macOS 10.15 Catalina, Windows 10


Top
 Profile  
Reply with quote  
 Post subject: Re: Cube root
PostPosted: Mon Dec 16, 2013 7:04 pm 
Offline
Addict
Addict

Joined: Fri Nov 09, 2012 11:04 pm
Posts: 1807
Location: Uttoxeter, UK
Thanks Guys.

Very nice examples. 8)

_________________
DE AA EB


Top
 Profile  
Reply with quote  
 Post subject: Re: Cube root
PostPosted: Mon Dec 16, 2013 9:29 pm 
Offline
Enthusiast
Enthusiast
User avatar

Joined: Wed Apr 12, 2006 7:59 pm
Posts: 174
Location: Germany
Yeah, wilbert, superfine optimizing !
And now for float...
Helle


Top
 Profile  
Reply with quote  
 Post subject: Re: Cube root
PostPosted: Mon Dec 16, 2013 11:07 pm 
Offline
Addict
Addict
User avatar

Joined: Wed Jun 11, 2003 9:33 pm
Posts: 4655
Location: Spa, relaxing and thinking, and learning...
Nice! 8)

_________________
http://www.zeitgeistmovie.com

While world=business:world+mafia:Wend


Top
 Profile  
Reply with quote  
 Post subject: Re: Cube root
PostPosted: Tue Dec 17, 2013 8:06 am 
Offline
PureBasic Expert
PureBasic Expert

Joined: Sun Aug 08, 2004 5:21 am
Posts: 3710
Location: Netherlands
Helle wrote:
Yeah, wilbert, superfine optimizing !
And now for float...
Helle

Here's float based on your routine
Code:
Procedure.f CubeRoot(Number.f)
  !mov eax, [p.v_Number]
  !mov ecx, eax
  !and eax, 0x7fffffff        ; without sign
  !and ecx, 0x80000000        ; sign only
  ;now find a good approximation for the start-value; http://metamerist.com/cbrt/cbrt.htm - Kahan´s bit hack
  !movd xmm0, eax
  !movq xmm1, xmm0
  !mov edx, 0x55555555
  !mul edx
  !add edx, 0x2a508935
  !movd xmm2, edx
  !addss xmm1, xmm1           ; xmm1=2*a=constant
 
  !movq xmm3, xmm2
  !mulss xmm3, xmm3
  !mulss xmm3, xmm2           ; xmm3=x*x*x
  !movq xmm4, xmm3
  !addss xmm4, xmm4           ; 2*x*x*x
  !addss xmm4, xmm0           ; xmm0=a
  !rcpss xmm4, xmm4
  !addss xmm3, xmm1
  !mulss xmm3, xmm4
  !mulss xmm2, xmm3
 
  !movq xmm3, xmm2
  !mulss xmm3, xmm3
  !mulss xmm3, xmm2           ; xmm3=x*x*x
  !movq xmm4, xmm3
  !addss xmm4, xmm4           ; 2*x*x*x
  !addss xmm4, xmm0           ; xmm0=a
  !addss xmm3, xmm1
  !divss xmm3, xmm4
  !mulss xmm2, xmm3
 
  !movd xmm3, ecx
  !por xmm2, xmm3            ; restore sign
  !movd [p.v_Number], xmm2
  !fld dword [p.v_Number]
 ProcedureReturn
EndProcedure

_________________
macOS 10.15 Catalina, Windows 10


Last edited by wilbert on Tue Dec 17, 2013 9:18 am, edited 2 times in total.

Top
 Profile  
Reply with quote  
 Post subject: Re: Cube root
PostPosted: Tue Dec 17, 2013 8:06 am 
Offline
PureBasic Expert
PureBasic Expert

Joined: Sun Aug 08, 2004 5:21 am
Posts: 3710
Location: Netherlands
... accidentally double posted my previous post ...

_________________
macOS 10.15 Catalina, Windows 10


Last edited by wilbert on Tue Dec 17, 2013 9:17 am, edited 1 time in total.

Top
 Profile  
Reply with quote  
 Post subject: Re: Cube root
PostPosted: Tue Dec 17, 2013 8:43 am 
Offline
PureBasic Expert
PureBasic Expert

Joined: Sun Aug 08, 2004 5:21 am
Posts: 3710
Location: Netherlands
Here's also the updated benchmark code with the optimized versions in it
Code:
;For 32-Bit-Windows
;PB 5.21 LTS (x86)

Test_Value_D.d = -12345.6789 ;Cube Root = -23,112042408247961097779983746659
Appr_D.d                     ;for all...
Test_Value_F.f = -12345.6789
Appr_F.f


Procedure.f CubeRoot_FM1()
  !mov eax, [v_Test_Value_F]
  !mov ecx, eax
  !and eax, 0x7fffffff        ; without sign
  !and ecx, 0x80000000        ; sign only
  ;now find a good approximation for the start-value; http://metamerist.com/cbrt/cbrt.htm - Kahan´s bit hack
  !movd xmm0, eax
  !movq xmm1, xmm0
  !mov edx, 0x55555555
  !mul edx
  !add edx, 0x2a508935
  !movd xmm2, edx
  !addss xmm1, xmm1           ; xmm1=2*a=constant
 
  !movq xmm3, xmm2
  !mulss xmm3, xmm3
  !mulss xmm3, xmm2           ; xmm3=x*x*x
  !movq xmm4, xmm3
  !addss xmm4, xmm4           ; 2*x*x*x
  !addss xmm4, xmm0           ; xmm0=a
  !rcpss xmm4, xmm4
  !addss xmm3, xmm1
  !mulss xmm3, xmm4
  !mulss xmm2, xmm3
 
  !movq xmm3, xmm2
  !mulss xmm3, xmm3
  !mulss xmm3, xmm2           ; xmm3=x*x*x
  !movq xmm4, xmm3
  !addss xmm4, xmm4           ; 2*x*x*x
  !addss xmm4, xmm0           ; xmm0=a
  !addss xmm3, xmm1
  !divss xmm3, xmm4
  !mulss xmm2, xmm3
 
  !movd xmm3, ecx
  !por xmm2, xmm3            ; restore sign
  !movd [v_Appr_F], xmm2
  !fld dword [v_Appr_F]
 ProcedureReturn
EndProcedure

Procedure.f CubeRoot_FM2()
  ; based on both Kahan's bit hack and this routine http://www.musicdsp.org/showone.php?id=206
  !mov eax, [v_Test_Value_F]
  !mov ecx, eax
  !and eax, 0x7fffffff        ; without sign
  !and ecx, 0x80000000        ; sign only
  ;now find a good approximation for the start-value; http://metamerist.com/cbrt/cbrt.htm - Kahan´s bit hack
  !movd xmm0, eax
  !movq xmm1, xmm0
  !mov edx, 0x55555555
  !mul edx
  !add edx, 0x2a508935
  !movd xmm0, edx
 
  !movq xmm2, xmm0            ; x2: z
  !mulss xmm2, xmm0           ; x2: z*z
  !movq xmm3, xmm2            ; x3: z*z
  !addss xmm2, xmm2           ; x2: 2*z*z
  !addss xmm2, xmm3           ; x2: 3*z*z
  !rcpss xmm2, xmm2           ; x2: ~ 1/(3*z*z)
  !mulss xmm3, xmm0           ; x3: z*z*z
  !subss xmm3, xmm1           ; x3: z*z*z-x
  !mulss xmm3, xmm2           ; x3: (z*z*z-x) / (3*z*z)
  !subss xmm0, xmm3           ; x0: z'
 
  !movq xmm2, xmm0            ; x2: z
  !mulss xmm2, xmm0           ; x2: z*z
  !movq xmm3, xmm2            ; x3: z*z
  !addss xmm2, xmm2           ; x2: 2*z*z
  !addss xmm2, xmm3           ; x2: 3*z*z
  !rcpss xmm2, xmm2           ; x2: ~ 1/(3*z*z)
  !mulss xmm3, xmm0           ; x3: z*z*z
  !subss xmm3, xmm1           ; x3: z*z*z-x
  !mulss xmm3, xmm2           ; x3: (z*z*z-x) / (3*z*z)
  !subss xmm0, xmm3           ; x0: z'
 
  !movd xmm1, ecx
  !por xmm0, xmm1             ; restore sign
  !movd [v_Appr_F], xmm0
  !fld dword [v_Appr_F]
 ProcedureReturn
EndProcedure

Procedure.d CubeRoot_DM1()
  !movsd xmm0, [v_Test_Value_D]
  !pcmpeqb xmm4, xmm4         ; create mask 0x7fffffffffffffff
  !psrlq xmm4, 1
  !movsd xmm1, xmm0
  !pand xmm1, xmm4            ; without sign
  !pandn xmm4, xmm0           ; sign only
  ;now find a good approximation for the start-value; http://metamerist.com/cbrt/cbrt.htm - Kahan´s bit hack
  !pshufd xmm0, xmm1, 1
  !movq xmm2, [CubeRootConst]
  !pmuludq xmm0, xmm2
  !paddd xmm0, xmm2
 
  !movsd xmm2, xmm0
  !mulsd xmm2, xmm2
  !mulsd xmm2, xmm0           ; xmm2=x*x*x
  !movsd xmm3, xmm2
  !addsd xmm3, xmm3           ; 2*x*x*x
  !addsd xmm3, xmm1           ; xmm1=a
  !addsd xmm2, xmm1
  !addsd xmm2, xmm1
  !divsd xmm2, xmm3
  !mulsd xmm0, xmm2
 
  !movsd xmm2, xmm0
  !mulsd xmm2, xmm2
  !mulsd xmm2, xmm0           ; xmm2=x*x*x
  !movsd xmm3, xmm2
  !addsd xmm3, xmm3           ; 2*x*x*x
  !addsd xmm3, xmm1           ; xmm1=a
  !addsd xmm2, xmm1
  !addsd xmm2, xmm1
  !divsd xmm2, xmm3
  !mulsd xmm0, xmm2
 
  !por xmm0, xmm4
  !movsd [v_Appr_D], xmm0
  !fld qword [v_Appr_D]
  ProcedureReturn
  !CubeRootConst: dd 0x55555555, 0x2a9f7893
EndProcedure


Procedure.f Cube_Root_F_H_x86()   ;only for x86
  !mov eax,[v_Test_Value_F]
  !and eax,7FFFFFFFh         ;without sign
  ;now find a good approximation for the start-value; http://metamerist.com/cbrt/cbrt.htm - Kahan´s bit hack
  !mov [v_Appr_F],eax
  !movss xmm0,[v_Appr_F] 
  !movss xmm1,xmm0
  !lea edx,[v_Appr_F]
  !mov eax,[edx]
  !xor edx,edx
  !div dword[Value3_F]
  !add eax,[BitHack_F]
  !mov [v_Appr_F],eax
  !movss xmm2,[v_Appr_F]

  !addss xmm1,xmm1           ;xmm1=2*a=constant
  !mov ecx,2                 ;higher=more precision (if possible)
 !@@: 
  !movss xmm3,xmm2
  !mulss xmm3,xmm3
  !mulss xmm3,xmm2           ;xmm3=x*x*x
  !movss xmm4,xmm3
  !addss xmm4,xmm4           ;2*x*x*x
  !addss xmm4,xmm0           ;xmm0=a
  !addss xmm3,xmm1
  !divss xmm3,xmm4
  !mulss xmm2,xmm3
  !dec ecx 
 !jnz @b 
  ;set sign (if Test_Value negativ)
  !test byte[v_Test_Value_F+3],80h
 !jz @f
  !mulss xmm2,[Minus1_F]     ;restore sign
 !@@:
  !movss dword[v_Appr_F],xmm2
  !fld dword[v_Appr_F]
 ProcedureReturn
  !Minus1_F:  dd -1.0
  !BitHack_F: dd 709921077   ;for Kahan´s bit hack (Float); http://metamerist.com/cbrt/cbrt.htm
  !Value3_F:  dd 3
EndProcedure

Procedure.d Cube_Root_D_H_x86()   ;only for x86
  !mov eax,dword[v_Test_Value_D+4]
  !mov edx,dword[v_Test_Value_D]
  !mov dword[v_Appr_D],edx
  !and eax,7FFFFFFFh         ;without sign
  ;now find a good approximation for the start-value; http://metamerist.com/cbrt/cbrt.htm - Kahan´s bit hack
  !mov dword[v_Appr_D+4],eax
  !movsd xmm0,[v_Appr_D] 
  !movsd xmm1,xmm0
  !lea edx,[v_Appr_D]
  !mov eax,[edx+4]
  !xor edx,edx
  !div dword[Value3_D]
  !add eax,[BitHack_D+4]
  !mov dword[v_Appr_D+4],eax
  !movsd xmm2,[v_Appr_D]

  !addsd xmm1,xmm1           ;xmm1=2*a=constant
  !mov ecx,2                 ;higher=more precision (if possible)
 !@@: 
  !movsd xmm3,xmm2
  !mulsd xmm3,xmm3
  !mulsd xmm3,xmm2           ;xmm3=x*x*x
  !movsd xmm4,xmm3
  !addsd xmm4,xmm4           ;2*x*x*x
  !addsd xmm4,xmm0           ;xmm0=a
  !addsd xmm3,xmm1
  !divsd xmm3,xmm4
  !mulsd xmm2,xmm3
  !dec ecx 
 !jnz @b 
  ;set sign (if Test_Value negativ)
  !test byte[v_Test_Value_D+7],80h
 !jz @f
  !mulsd xmm2,[Minus1_D]     ;restore sign
 !@@:
  !movsd qword[v_Appr_D],xmm2
  !fld qword[v_Appr_D]
 ProcedureReturn
  !Minus1_D:  dq -1.0
  !BitHack_D: dq 3071306043645493248   ;715094163<<32, for Kahan´s bit hack (Double); http://metamerist.com/cbrt/cbrt.htm
  !Value3_D:  dd 3
EndProcedure

Procedure.f Cube_Root_F_P()  ;for x86 and x64
  !fld dword[Dat_F]
  !fld dword[v_Test_Value_F]
  !fabs
  !fyl2x                     ;->log2(Src1)*exponent
  !fld st0                   ;copy the logarithm
  !frndint                   ;keep only the characteristic
  !fsub st1,st0              ;keeps only the mantissa
  !fxch                      ;get the mantissa on top
  !f2xm1                     ;->2^(mantissa)-1
  !fld1
  !faddp                     ;add 1 back
  !fscale                    ;scale it with the characteristic
  !fstp st1                  ;copy result over and "pop" it
  !test byte[v_Test_Value_F+3],$80
 !jz @f
  !fchs                      ;restore sign
 !@@:
 ProcedureReturn
  !Dat_F:     dd 0.33333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333 ;:-)
EndProcedure

Procedure.d Cube_Root_D_P()  ;for x86 and x64
  !fld qword[Dat_D]
  !fld qword[v_Test_Value_D]
  !fabs
  !fyl2x                     ;->log2(Src1)*exponent
  !fld st0                   ;copy the logarithm
  !frndint                   ;keep only the characteristic
  !fsub st1,st0              ;keeps only the mantissa
  !fxch                      ;get the mantissa on top
  !f2xm1                     ;->2^(mantissa)-1
  !fld1
  !faddp                     ;add 1 back
  !fscale                    ;scale it with the characteristic
  !fstp st1                  ;copy result over and "pop" it
  !test byte[v_Test_Value_D+7],$80
 !jz @f
  !fchs                      ;restore sign
 !@@:
 ProcedureReturn
  !Dat_D:     dq 0.33333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333333 ;:-)
EndProcedure

Result$ = "Test-Value : -12345.6789" + #LFCR$

TA_FM1 = ElapsedMilliseconds()
For i = 1 To 10000000
  Cbrt_FM1.f = CubeRoot_FM1()
Next
TE_FM1 = ElapsedMilliseconds() - TA_FM1
Result$ + "Float M1:" + #LFCR$ + StrF(Cbrt_FM1, 7) + " / " + Str(TE_FM1) + " ms " + #LFCR$

TA_FM2 = ElapsedMilliseconds()
For i = 1 To 10000000
  Cbrt_FM2.f = CubeRoot_FM2()
Next
TE_FM2 = ElapsedMilliseconds() - TA_FM2
Result$ + "Float M2:" + #LFCR$ + StrF(Cbrt_FM2, 7) + " / " + Str(TE_FM2) + " ms " + #LFCR$

TA_DM1 = ElapsedMilliseconds()
For i = 1 To 10000000
  Cbrt_DM1.d = CubeRoot_DM1()
Next
TE_DM1 = ElapsedMilliseconds() - TA_DM1
Result$ + "Double M1:" + #LFCR$ + StrD(Cbrt_DM1, 15) + " / " + Str(TE_DM1) + " ms " + #LFCR$

TA_F_H = ElapsedMilliseconds()
For i = 1 To 10000000
  Cbrt_F_H_x86.f = Cube_Root_F_H_x86()
Next
TE_F_H = ElapsedMilliseconds() - TA_F_H
Result$ + "Float Helle:" + #LFCR$ + StrF(Cbrt_F_H_x86, 7) + " / " + Str(TE_F_H) + " ms " + #LFCR$

TA_D_H = ElapsedMilliseconds()
For i = 1 To 10000000
  Cbrt_D_H_x86.d = Cube_Root_D_H_x86()
Next
TE_D_H = ElapsedMilliseconds() - TA_D_H
Result$ + "Double Helle:" + #LFCR$ + StrD(Cbrt_D_H_x86, 15) + " / " + Str(TE_D_H) + " ms " + #LFCR$

TA_F_P = ElapsedMilliseconds()
For i = 1 To 10000000
  Cbrt_F_P.f = Cube_Root_F_P()
Next
TE_F_P = ElapsedMilliseconds() - TA_F_P
Result$ + "Float Psychophanta:" + #LFCR$ + StrF(Cbrt_F_P, 7) + " / " + Str(TE_F_P) + " ms " + #LFCR$

TA_D_P = ElapsedMilliseconds()
For i = 1 To 10000000
  Cbrt_D_P.d = Cube_Root_D_P()
Next
TE_D_P = ElapsedMilliseconds() - TA_D_P
Result$ + "Double Psychophanta:" + #LFCR$ + StrD(Cbrt_D_P, 15) + " / " + Str(TE_D_P) + " ms " + #LFCR$ + #LFCR$

Result$ + "CPU: Intel i7-4770K@4.0GHz" + #LFCR$

;SetClipboardText(Result$)

MessageRequester("Cube Root x86, 10000000 Loops", Result$)

_________________
macOS 10.15 Catalina, Windows 10


Top
 Profile  
Reply with quote  
 Post subject: Re: Cube root
PostPosted: Tue Dec 17, 2013 12:15 pm 
Offline
Enthusiast
Enthusiast

Joined: Tue May 26, 2009 2:11 pm
Posts: 672
wilbert wrote:
Here's also the updated benchmark code with the optimized versions in it
...

And here are my results:
Benchmark wrote:
Test-Value : -12345.6789

Float M1:
-23.1120415 / 110 ms

Float M2:
-23.1120434 / 100 ms

Double M1:
-23.112042408247962 / 229 ms

Float Helle:
-23.1120415 / 219 ms

Double Helle:
-23.112042408247962 / 482 ms

Float Psychophanta:
-23.1120453 / 586 ms

Double Psychophanta:
-23.112042408247959 / 568 ms

CPU: Intel i5-760@2.8GHz

Windows Calculator wrote:
cuberoot(12345,6789) = 23,112042408247961097779983746659

_________________
Image


Top
 Profile  
Reply with quote  
 Post subject: Re: Cube root
PostPosted: Tue Dec 17, 2013 4:34 pm 
Offline
Enthusiast
Enthusiast
User avatar

Joined: Wed Apr 12, 2006 7:59 pm
Posts: 174
Location: Germany
A tip for the approximate reciprocals (e.g. rcpss): AMD and Intel gives different results (AMD is with more precision)! See the values Intel/AMD in FM1 and FM2.
2 results:
;---------------------------------
Test-Value : -12345.6789

Float M1:
-23.1120434 / 132 ms
Float M2:
-23.1120434 / 121 ms
Double M1:
-23.112042408247962 / 164 ms
Float Helle:
-23.1120415 / 322 ms
Double Helle:
-23.112042408247962 / 745 ms
Float Psychophanta:
-23.1120453 / 1189 ms
Double Psychophanta:
-23.112042408247959 / 1177 ms

CPU: AMD FX-8120@3.6GHz
;---------------------------------
and
;---------------------------------
Test-Value : -12345.6789

Float M1:
-23.1120415 / 130 ms
Float M2:
-23.1120434 / 110 ms
Double M1:
-23.112042408247962 / 251 ms
Float Helle:
-23.1120415 / 240 ms
Double Helle:
-23.112042408247962 / 511 ms
Float Psychophanta:
-23.1120453 / 641 ms
Double Psychophanta:
-23.112042408247959 / 651 ms

CPU: Intel i7-920@2.67GHz
;---------------------------------


Top
 Profile  
Reply with quote  
 Post subject: Re: Cube root
PostPosted: Tue Dec 17, 2013 4:45 pm 
Offline
PureBasic Expert
PureBasic Expert

Joined: Sun Aug 08, 2004 5:21 am
Posts: 3710
Location: Netherlands
Helle wrote:
A tip for the approximate reciprocals (e.g. rcpss): AMD and Intel gives different results (AMD is with more precision)! See the values Intel/AMD in FM1 and FM2.

That's good to know Helle, I didn't know that.
I know the routine that didn't use your computation method was less accurate but on my Core2 is appeared the version of your routine where I used a reciprocal the first time and a div the second time seemed accurate enough on my computer.
Anyway, for myself I would probably use the optimized double precision routine if I needed one.

_________________
macOS 10.15 Catalina, Windows 10


Top
 Profile  
Reply with quote  
 Post subject: Re: Cube root
PostPosted: Tue Dec 17, 2013 7:19 pm 
Offline
Addict
Addict

Joined: Fri Apr 25, 2003 11:10 pm
Posts: 1232
made some small changes to make it work on OS X
Code:
;For 32-Bit-Windows and OS X
;PB 5.21 LTS (x86)

Test_Value_D.d = -12345.6789 ;Cube Root = -23,112042408247961097779983746659
Appr_D.d                     ;for all...
Test_Value_F.f = -12345.6789
Appr_F.f


Procedure.f CubeRoot_FM1()
  !mov eax, [v_Test_Value_F]
  !mov ecx, eax
  !and eax, 0x7fffffff        ; without sign
  !and ecx, 0x80000000        ; sign only
  ;now find a good approximation for the start-value; http://metamerist.com/cbrt/cbrt.htm - Kahan´s bit hack
  !movd xmm0, eax
  !movq xmm1, xmm0
  !mov edx, 0x55555555
  !mul edx
  !add edx, 0x2a508935
  !movd xmm2, edx
  !addss xmm1, xmm1           ; xmm1=2*a=constant
 
  !movq xmm3, xmm2
  !mulss xmm3, xmm3
  !mulss xmm3, xmm2           ; xmm3=x*x*x
  !movq xmm4, xmm3
  !addss xmm4, xmm4           ; 2*x*x*x
  !addss xmm4, xmm0           ; xmm0=a
  !rcpss xmm4, xmm4
  !addss xmm3, xmm1
  !mulss xmm3, xmm4
  !mulss xmm2, xmm3
 
  !movq xmm3, xmm2
  !mulss xmm3, xmm3
  !mulss xmm3, xmm2           ; xmm3=x*x*x
  !movq xmm4, xmm3
  !addss xmm4, xmm4           ; 2*x*x*x
  !addss xmm4, xmm0           ; xmm0=a
  !addss xmm3, xmm1
  !divss xmm3, xmm4
  !mulss xmm2, xmm3
 
  !movd xmm3, ecx
  !por xmm2, xmm3            ; restore sign
  !movd [v_Appr_F], xmm2
  !fld dword [v_Appr_F]
 ProcedureReturn
EndProcedure

Procedure.f CubeRoot_FM2()
  ; based on both Kahan's bit hack and this routine http://www.musicdsp.org/showone.php?id=206
  !mov eax, [v_Test_Value_F]
  !mov ecx, eax
  !and eax, 0x7fffffff        ; without sign
  !and ecx, 0x80000000        ; sign only
  ;now find a good approximation for the start-value; http://metamerist.com/cbrt/cbrt.htm - Kahan´s bit hack
  !movd xmm0, eax
  !movq xmm1, xmm0
  !mov edx, 0x55555555
  !mul edx
  !add edx, 0x2a508935
  !movd xmm0, edx
 
  !movq xmm2, xmm0            ; x2: z
  !mulss xmm2, xmm0           ; x2: z*z
  !movq xmm3, xmm2            ; x3: z*z
  !addss xmm2, xmm2           ; x2: 2*z*z
  !addss xmm2, xmm3           ; x2: 3*z*z
  !rcpss xmm2, xmm2           ; x2: ~ 1/(3*z*z)
  !mulss xmm3, xmm0           ; x3: z*z*z
  !subss xmm3, xmm1           ; x3: z*z*z-x
  !mulss xmm3, xmm2           ; x3: (z*z*z-x) / (3*z*z)
  !subss xmm0, xmm3           ; x0: z'
 
  !movq xmm2, xmm0            ; x2: z
  !mulss xmm2, xmm0           ; x2: z*z
  !movq xmm3, xmm2            ; x3: z*z
  !addss xmm2, xmm2           ; x2: 2*z*z
  !addss xmm2, xmm3           ; x2: 3*z*z
  !rcpss xmm2, xmm2           ; x2: ~ 1/(3*z*z)
  !mulss xmm3, xmm0           ; x3: z*z*z
  !subss xmm3, xmm1           ; x3: z*z*z-x
  !mulss xmm3, xmm2           ; x3: (z*z*z-x) / (3*z*z)
  !subss xmm0, xmm3           ; x0: z'
 
  !movd xmm1, ecx
  !por xmm0, xmm1             ; restore sign
  !movd [v_Appr_F], xmm0
  !fld dword [v_Appr_F]
 ProcedureReturn
EndProcedure

Procedure.d CubeRoot_DM1()
  !movsd xmm0, [v_Test_Value_D]
  !pcmpeqb xmm4, xmm4         ; create mask 0x7fffffffffffffff
  !psrlq xmm4, 1
  !movsd xmm1, xmm0
  !pand xmm1, xmm4            ; without sign
  !pandn xmm4, xmm0           ; sign only
  ;now find a good approximation for the start-value; http://metamerist.com/cbrt/cbrt.htm - Kahan´s bit hack
  !pshufd xmm0, xmm1, 1
  !movq xmm2, [CubeRootConst]
  !pmuludq xmm0, xmm2
  !paddd xmm0, xmm2
 
  !movsd xmm2, xmm0
  !mulsd xmm2, xmm2
  !mulsd xmm2, xmm0           ; xmm2=x*x*x
  !movsd xmm3, xmm2
  !addsd xmm3, xmm3           ; 2*x*x*x
  !addsd xmm3, xmm1           ; xmm1=a
  !addsd xmm2, xmm1
  !addsd xmm2, xmm1
  !divsd xmm2, xmm3
  !mulsd xmm0, xmm2
 
  !movsd xmm2, xmm0
  !mulsd xmm2, xmm2
  !mulsd xmm2, xmm0           ; xmm2=x*x*x
  !movsd xmm3, xmm2
  !addsd xmm3, xmm3           ; 2*x*x*x
  !addsd xmm3, xmm1           ; xmm1=a
  !addsd xmm2, xmm1
  !addsd xmm2, xmm1
  !divsd xmm2, xmm3
  !mulsd xmm0, xmm2
 
  !por xmm0, xmm4
  !movsd [v_Appr_D], xmm0
  !fld qword [v_Appr_D]
  ProcedureReturn
  !CubeRootConst: dd 0x55555555, 0x2a9f7893
EndProcedure

CompilerIf #PB_Compiler_Processor = #PB_Processor_x86
 
  Procedure.f Cube_Root_F_H_x86()   ;only for x86
    !mov eax,[v_Test_Value_F]
    !and eax,7FFFFFFFh         ;without sign
    ;now find a good approximation for the start-value; http://metamerist.com/cbrt/cbrt.htm - Kahan´s bit hack
    !mov [v_Appr_F],eax
    !movss xmm0,[v_Appr_F] 
    !movss xmm1,xmm0
    !lea edx,[v_Appr_F]
    !mov eax,[edx]
    !xor edx,edx
    !div dword[Value3_F]
    !add eax,[BitHack_F]
    !mov [v_Appr_F],eax
    !movss xmm2,[v_Appr_F]
   
    !addss xmm1,xmm1           ;xmm1=2*a=constant
    !mov ecx,2                 ;higher=more precision (if possible)
   !@@b: 
    !movss xmm3,xmm2
    !mulss xmm3,xmm3
    !mulss xmm3,xmm2           ;xmm3=x*x*x
    !movss xmm4,xmm3
    !addss xmm4,xmm4           ;2*x*x*x
    !addss xmm4,xmm0           ;xmm0=a
    !addss xmm3,xmm1
    !divss xmm3,xmm4
    !mulss xmm2,xmm3
    !dec ecx 
    !jnz @@b
    ;set sign (if Test_Value negativ)
    !test byte[v_Test_Value_F+3],80h
   !jz @@f
    !mulss xmm2,[Minus1_F]     ;restore sign
    !@@f:
    !movss [v_Appr_F],xmm2
    !fld dword[v_Appr_F]
 
   ProcedureReturn
    !Minus1_F:  dd -1.0
    !BitHack_F: dd 709921077   ;for Kahan´s bit hack (Float); http://metamerist.com/cbrt/cbrt.htm
    !Value3_F:  dd 3
  EndProcedure
 
  Procedure.d Cube_Root_D_H_x86()   ;only for x86
    !mov eax,dword[v_Test_Value_D+4]
    !mov edx,dword[v_Test_Value_D]
    !mov dword[v_Appr_D],edx
    !and eax,7FFFFFFFh         ;without sign
    ;now find a good approximation for the start-value; http://metamerist.com/cbrt/cbrt.htm - Kahan´s bit hack
    !mov dword[v_Appr_D+4],eax
    !movsd xmm0,[v_Appr_D] 
    !movsd xmm1,xmm0
    !lea edx,[v_Appr_D]
    !mov eax,[edx+4]
    !xor edx,edx
    !div dword[Value3_D]
    !add eax,[BitHack_D+4]
    !mov dword[v_Appr_D+4],eax
    !movsd xmm2,[v_Appr_D]
 
    !addsd xmm1,xmm1           ;xmm1=2*a=constant
    !mov ecx,2                 ;higher=more precision (if possible)
   !@@b1: 
    !movsd xmm3,xmm2
    !mulsd xmm3,xmm3
    !mulsd xmm3,xmm2           ;xmm3=x*x*x
    !movsd xmm4,xmm3
    !addsd xmm4,xmm4           ;2*x*x*x
    !addsd xmm4,xmm0           ;xmm0=a
    !addsd xmm3,xmm1
    !divsd xmm3,xmm4
    !mulsd xmm2,xmm3
    !dec ecx 
   !jnz @@b1 
    ;set sign (if Test_Value negativ)
    !test byte[v_Test_Value_D+7],80h
   !jz @@f1
    !mulsd xmm2,[Minus1_D]     ;restore sign
   !@@f1:
    !movsd [v_Appr_D],xmm2
    !fld qword[v_Appr_D]
   ProcedureReturn
    !Minus1_D:  dq -1.0
    !BitHack_D: dq 3071306043645493248   ;715094163<<32, for Kahan´s bit hack (Double); http://metamerist.com/cbrt/cbrt.htm
    !Value3_D:  dd 3
  EndProcedure
 
CompilerEndIf

Procedure.f Cube_Root_F_P()  ;for x86 and x64
  !fld dword[Dat_F]
  !fld dword[v_Test_Value_F]
  !fabs
  !fyl2x                     ;->log2(Src1)*exponent
  !fld st0                   ;copy the logarithm
  !frndint                   ;keep only the characteristic
  !fsub st1,st0              ;keeps only the mantissa
  !fxch                      ;get the mantissa on top
  !f2xm1                     ;->2^(mantissa)-1
  !fld1
  !faddp                     ;add 1 back
  !fscale                    ;scale it with the characteristic
  !fstp st1                  ;copy result over and "pop" it
  !test byte[v_Test_Value_F+3],$80
 !jz @@f2
  !fchs                      ;restore sign
 !@@f2:
 ProcedureReturn
  !Dat_F:     dd 0.33333333333333333333 ;:-)
EndProcedure


Procedure.d Cube_Root_D_P()  ;for x86 and x64
  !fld qword[Dat_D]
  !fld qword[v_Test_Value_D]
  !fabs
  !fyl2x                     ;->log2(Src1)*exponent
  !fld st0                   ;copy the logarithm
  !frndint                   ;keep only the characteristic
  !fsub st1,st0              ;keeps only the mantissa
  !fxch                      ;get the mantissa on top
  !f2xm1                     ;->2^(mantissa)-1
  !fld1
  !faddp                     ;add 1 back
  !fscale                    ;scale it with the characteristic
  !fstp st1                  ;copy result over and "pop" it
  !test byte[v_Test_Value_D+7],$80
 !jz @@f3
  !fchs                      ;restore sign
 !@@f3:
 ProcedureReturn
  !Dat_D:     dq 0.33333333333333333333 ;:-)
EndProcedure

Result$ = "Test-Value : -12345.6789" + #LFCR$

TA_FM1 = ElapsedMilliseconds()
For i = 1 To 10000000
  Cbrt_FM1.f = CubeRoot_FM1()
Next
TE_FM1 = ElapsedMilliseconds() - TA_FM1
Result$ + "Float M1:" + #LFCR$ + StrF(Cbrt_FM1, 7) + " / " + Str(TE_FM1) + " ms " + #LFCR$

TA_FM2 = ElapsedMilliseconds()
For i = 1 To 10000000
  Cbrt_FM2.f = CubeRoot_FM2()
Next
TE_FM2 = ElapsedMilliseconds() - TA_FM2
Result$ + "Float M2:" + #LFCR$ + StrF(Cbrt_FM2, 7) + " / " + Str(TE_FM2) + " ms " + #LFCR$

TA_DM1 = ElapsedMilliseconds()
For i = 1 To 10000000
  Cbrt_DM1.d = CubeRoot_DM1()
Next
TE_DM1 = ElapsedMilliseconds() - TA_DM1
Result$ + "Double M1:" + #LFCR$ + StrD(Cbrt_DM1, 15) + " / " + Str(TE_DM1) + " ms " + #LFCR$

CompilerIf #PB_Compiler_Processor = #PB_Processor_x86
 
  TA_F_H = ElapsedMilliseconds()
  For i = 1 To 10000000
    Cbrt_F_H_x86.f = Cube_Root_F_H_x86()
  Next
  TE_F_H = ElapsedMilliseconds() - TA_F_H
  Result$ + "Float Helle:" + #LFCR$ + StrF(Cbrt_F_H_x86, 7) + " / " + Str(TE_F_H) + " ms " + #LFCR$
 
  TA_D_H = ElapsedMilliseconds()
  For i = 1 To 10000000
    Cbrt_D_H_x86.d = Cube_Root_D_H_x86()
  Next
  TE_D_H = ElapsedMilliseconds() - TA_D_H
  Result$ + "Double Helle:" + #LFCR$ + StrD(Cbrt_D_H_x86, 15) + " / " + Str(TE_D_H) + " ms " + #LFCR$
 
CompilerEndIf

TA_F_P = ElapsedMilliseconds()
For i = 1 To 10000000
  Cbrt_F_P.f = Cube_Root_F_P()
Next
TE_F_P = ElapsedMilliseconds() - TA_F_P
Result$ + "Float Psychophanta:" + #LFCR$ + StrF(Cbrt_F_P, 7) + " / " + Str(TE_F_P) + " ms " + #LFCR$

TA_D_P = ElapsedMilliseconds()
For i = 1 To 10000000
  Cbrt_D_P.d = Cube_Root_D_P()
Next
TE_D_P = ElapsedMilliseconds() - TA_D_P
Result$ + "Double Psychophanta:" + #LFCR$ + StrD(Cbrt_D_P, 15) + " / " + Str(TE_D_P) + " ms " + #LFCR$ + #LFCR$

;Result$ + "CPU: Intel i7-4770K@4.0GHz" + #LFCR$

CompilerIf #PB_OS_MacOS
  ImportC ""
    sysctlbyname.l(s.s,*buffer,*length,*null,*null2)
  EndImport
  *buffer = AllocateMemory(128)
  length.l=MemorySize(*buffer)
  err.l=sysctlbyname("machdep.cpu.brand_string", *buffer, @length, 0, 0)
 
  If (Not err)
    Result$ + PeekS(*buffer) + #LFCR$
  EndIf
CompilerEndIf

;SetClipboardText(Result$)

MessageRequester("Cube Root x86, 10000000 Loops", Result$)


Top
 Profile  
Reply with quote  
Display posts from previous:  Sort by  
Post new topic Reply to topic  [ 26 posts ]  Go to page Previous  1, 2

All times are UTC + 1 hour


Who is online

Users browsing this forum: No registered users and 1 guest


You cannot post new topics in this forum
You cannot reply to topics in this forum
You cannot edit your posts in this forum
You cannot delete your posts in this forum

Search for:
Jump to:  

 


Powered by phpBB © 2008 phpBB Group
subSilver+ theme by Canver Software, sponsor Sanal Modifiye