Updated my Split() code with nco2k's suggestions in later posts.
I ran speed tests normalized to nco2k's SplitString() renamed as SplitC(). Added Wilbert's Split() code from another post and SplitPB() using PB's functions for comparison.
My Split() has options for; case sensitivity, trimming, UCase/LCase, Ascii/Unicode. (Not engaged in comparison.)
The results are appended to the code.
Code: Select all
CompilerIf #PB_Compiler_Debugger
MessageRequester("FYI", "Disable debugger for speed tests.")
CompilerEndIf
EnableExplicit
#CMA = ',' : #CMA$ = ","
Macro Min(x, y)
(Bool((x) <= (y)) * (x) + Bool((y) < (x)) * (y))
EndMacro
ImportC ""
CompilerIf #PB_Compiler_Processor = #PB_Processor_x64
__StrLen_(*String) As "wcslen"
__StrStr_(*String1, *String2) As "wcsstr"
CompilerElse
__StrLen_(*String) As "_wcslen"
__StrStr_(*String1, *String2) As "_wcsstr"
CompilerEndIf
EndImport
Procedure.i SplitC(*String.Character, Array A$(1), *Delimiter, ArrayGrowth=1000)
;nco2k, http://www.purebasic.fr/english/viewtopic.php?p=516082#p516082
;skywalk; modified to catch trailing null's.
; Ex. "1,2,3,," Splits with ',' -> [1][2][3][][].
Protected.i *StringEnd, DelimiterLength, ArrayIndex
Protected.i Resize = 511
Protected.i nStrings = ArraySize(A$())
If *String ; Avoid 'Pointer is null' if pointing to an array element = Empty$
If *String\c
DelimiterLength = _wcslen_(*Delimiter) * SizeOf(Character)
If DelimiterLength
Repeat
*StringEnd = _wcsstr_(*String, *Delimiter)
If *StringEnd
A$(ArrayIndex) = PeekS(*String, (*StringEnd - *String) / SizeOf(Character))
ArrayIndex + 1
If ArrayIndex > nStrings
nStrings + Resize
ReDim A$(nStrings)
EndIf
*String = *StringEnd + DelimiterLength
Else
Break
EndIf
ForEver
EndIf
ReDim A$(ArrayIndex)
A$(ArrayIndex) = PeekS(*String) ; Catch final trailing null's if any.
ArrayIndex + 1
EndIf
EndIf
ProcedureReturn ArrayIndex
EndProcedure
Procedure.i SplitPB(s$, Array A$(1), Delm$=#CMA$, Trimsp.i=0, SetCase.i=#PB_String_Equal, UseCase.i=#PB_String_CaseSensitive)
;FYI; Do NOT use. Very slow for large strings. Shown for comparison only.
; Return: Count of elements or nStrings found.
; If none, then the entire string is assigned to A$(0)
; s$ = normal null terminated string
Protected.i i, nStrings
Protected.i DelmLen = Len(Delm$)
If s$ = #Empty$
Dim A$(0)
A$(0) = #Empty$
nStrings = 0
ElseIf Delm$ = #Empty$
Dim A$(0)
A$(0) = s$
nStrings = 1
Else ; OK to Split
nStrings = CountString(s$, Delm$)
Dim A$(nStrings)
While i <= nStrings
A$(i) = StringField(s$, i+1, Delm$)
i + 1
Wend
If trimsp
For i = 0 To nStrings
A$(i) = Trim(A$(i))
Next i
If trimsp = #TAB
For i = 0 To nStrings
A$(i) = Trim(A$(i), #TAB$)
Next i
; Repeat Trim Spaces to catch any created by dropping of TAB's.
For i = 0 To nStrings
A$(i) = Trim(A$(i))
Next i
EndIf
EndIf
If SetCase = #PB_String_LowerCase
For i = 0 To nStrings
A$(i) = LCase(A$(i))
Next i
ElseIf SetCase = #PB_String_UpperCase
For i = 0 To nStrings
A$(i) = UCase(A$(i))
Next i
EndIf
nStrings + 1 ; Catch #Empty$ due to dangling Delm$, "1,2,"
EndIf
ProcedureReturn nStrings
EndProcedure
Procedure.i SplitWilbert(s$, Array StringArray.s(1), Delm$)
;Wilbert, http://www.purebasic.fr/english/viewtopic.php?p=486360#p486360
Protected S.String, *S.Integer = @S
Protected.i asize, i, p, slen
asize = CountString(s$, Delm$)
slen = Len(Delm$)
ReDim StringArray(asize)
*S\i = @s$
While i < asize
p = FindString(S\s, Delm$)
StringArray(i) = PeekS(*S\i, p - 1)
*S\i + (p + slen - 1) << #PB_Compiler_Unicode
i + 1
Wend
StringArray(i) = S\s
*S\i = 0
ProcedureReturn asize + 1
EndProcedure
Procedure.i Split(*s.Character, Array A$(1), Delm$=#CMA$, Trimsp.i=0, SetCase.i=#PB_String_Equal, UseCase.i=#PB_String_CaseSensitive, Enc.i=#PB_Unicode)
;skywalk
; RETURN: Count of elements or nStrings found. A$(nStrings-1)
; If none, then the entire string is assigned to A$(0)
; IN: *s = pointer to normal null terminated string
; While searching *s, Split() does NOT inserts [Nulls] for each delimiter found.
Protected.i i, nStrings ;x86; use Quad for very large files. Change For..Next's to While..Wend's.
Protected.i lenChar, iLen, DelmLen, Resize
Protected.i *s_prev = *s
DelmLen = MemoryStringLength(@Delm$) ;, Enc) ; Len in char's of Delm$, (Unicode native format).
If DelmLen = 0
Dim A$(0)
A$(0) = PeekS(*s, -1, Enc)
nStrings = 1
ElseIf *s = 0 ; Avoid 'Pointer is null' if pointing to an array element = Empty$
Dim A$(0)
A$(0) = #Empty$
nStrings = 0
ElseIf *s\c = 0
Dim A$(0)
A$(0) = #Empty$
nStrings = 0
Else ; OK to Split
If Enc = #PB_Unicode ; 2 bytes/char
lenChar = 2
Else ; #PB_Ascii = 1 bytes/char, #PB_UTF8 = variable bytes/char
lenChar = 1
EndIf
Resize = 511;Min(99, (sLen / DelmLen))
Dim A$(Resize)
nStrings = Resize
If (UseCase = #PB_String_CaseSensitive) And (DelmLen = 1)
; Use faster single character Split routine.
Protected.c Delmc = Asc(Delm$)
While *s\c ; > 0 means valid Character, = 0 means String terminated.
If *s\c <> Delmc
*s + lenChar
iLen + 1
Else
;*s\c = 0 ; Old method terminates string with Chr(0) here. Using iLen instead of altering source string.
A$(i) = PeekS(*s_prev, iLen, Enc)
i + 1
*s_prev = *s + lenChar ; remember last pointer
*s = *s_prev
iLen = 0
If i > nStrings
nStrings + Resize
ReDim A$(nStrings)
EndIf
EndIf
Wend
Else ; Use slower multi-char split routine.
; Convert Delm$ to appropriate Encoding before comparisons.
Protected.i *d
If Enc = #PB_Unicode
*d = @Delm$
ElseIf Enc = #PB_Ascii
*d = Ascii(Delm$)
Else
*d = UTF8(Delm$)
EndIf
While *s\c ; > 0 means valid Character, = 0 means String terminated.
If CompareMemoryString(*s, *d, UseCase, DelmLen, Enc) ; <> 0 means different memory
*s + lenChar
iLen + 1
Else ; = 0 means identical memory
;*s\c = 0 ; Old method terminates string with Chr(0) here. Using iLen instead of altering source string.
A$(i) = PeekS(*s_prev, iLen, Enc)
i + 1
*s_prev = *s + DelmLen * lenChar ; Store previous pointer
*s = *s_prev
iLen = 0
If i > nStrings
nStrings + Resize
ReDim A$(nStrings)
EndIf
EndIf
Wend
If Enc <> #PB_Unicode
FreeMemory(*d)
EndIf
EndIf
If *s <> *s_prev ; Reached last valid element
A$(i) = PeekS(*s_prev, iLen, Enc) ; Catch #Empty$ due to dangling Delm$, "1,2,"
EndIf
nStrings = i
ReDim A$(nStrings)
If trimsp
For i = 0 To nStrings
A$(i) = Trim(A$(i))
Next i
If trimsp = #TAB
For i = 0 To nStrings
A$(i) = Trim(A$(i), #TAB$)
Next i
; Repeat Trim Spaces to catch any created by dropping of TAB's.
For i = 0 To nStrings
A$(i) = Trim(A$(i))
Next i
EndIf
EndIf
; Consider UCase(s$)/LCase(s$) prior to running Split() operation to save loop time.
; But, this limits the casing of the Delm$.
If SetCase = #PB_String_LowerCase
For i = 0 To nStrings
A$(i) = LCase(A$(i))
Next i
ElseIf SetCase = #PB_String_UpperCase
For i = 0 To nStrings
A$(i) = UCase(A$(i))
Next i
EndIf
nStrings + 1
EndIf
ProcedureReturn nStrings
EndProcedure
;-{ TEST
Dim a$(0)
Define.i i, nPts
Define.s d$ = ","
Define.s s$, r$ = ",,,, ,,A,,,B,,C,,ZZ,"
For i = 1 To 10000
s$ + r$; + #CRLF$
Next i
If 0
nPts = Split(@s$, a$(), d$)
Debug "Split(@'" + s$ + "', '" + d$ + "') = " + Str(nPts)
For i = 0 To nPts - 1
Debug Str(i) + ",{" + a$(i) + "}"
Next i
EndIf
If 0
nPts = SplitWilbert(s$, a$(), d$)
Debug "SplitWil('" + s$ + "', '" + d$ + "') = " + Str(nPts)
For i = 0 To nPts - 1
Debug Str(i) + ",{" + a$(i) + "}"
Next i
EndIf
If 0
nPts = SplitC(@s$, a$(), @d$)
Debug "SplitC(@'" + s$ + "', @'" + d$ + "') = " + Str(nPts)
For i = 0 To nPts - 1
Debug Str(i) + ",{" + a$(i) + "}"
Next i
EndIf
;-} TEST
;-{ TEST SPEED
CompilerIf #PB_Compiler_Debugger = 0
Macro ML_pcChange(y1, y2)
; Compute % change from y1 to y2.
100.0 * (y2 - (y1)) / (y1 + 1e-16)
EndMacro
SetPriorityClass_(GetCurrentProcess_(), #REALTIME_PRIORITY_CLASS)
#Tries = 1 ;-! SET #TRIES
Define.i u,time,t1,t2,t3,t4
Define.i tw = 32
Define.s r$
Define.s code1$ = "SplitC"
Define.s code2$ = "SplitWil" ;http://www.purebasic.fr/english/viewtopic.php?p=486360#p486360
Define.s code3$ = "Split"
Define.s code4$ = "SplitPB"
time = ElapsedMilliseconds()
For u = 1 To #Tries
;-> INSERT CODE 1 HERE...
Dim a$(0)
nPts = SplitC(@s$, a$(), @d$)
Debug "SplitC(@'" + s$ + "', @'" + d$ + "') = " + Str(nPts)
For i = 0 To nPts - 1
Debug Str(i) + ",{" + a$(i) + "}"
Next i
Next u
t1 = ElapsedMilliseconds()-time
time = ElapsedMilliseconds()
For u = 1 To #Tries
;-> INSERT CODE 2 HERE...
Dim a$(0)
nPts = SplitWilbert(s$, a$(), d$)
Debug "SplitWil('" + s$ + "','" + d$ + "') = " + Str(nPts)
For i = 0 To nPts - 1
Debug Str(i) + ",{" + a$(i) + "}"
Next i
Next u
t2 = ElapsedMilliseconds()-time
time = ElapsedMilliseconds()
For u = 1 To #Tries
;-> INSERT CODE 3 HERE...
Dim a$(0)
nPts = Split(@s$, a$(), d$)
Debug "Split(@'" + s$ + "', '" + d$ + "') = " + Str(nPts)
For i = 0 To nPts - 1
Debug Str(i) + ",{" + a$(i) + "}"
Next i
Next u
t3 = ElapsedMilliseconds()-time
time = ElapsedMilliseconds()
For u = 1 To #Tries
;-> INSERT CODE 4 HERE...
nPts = SplitPB(s$, a$(), d$)
Debug "SplitPB('" + s$ + "','" + d$ + "') = " + Str(nPts)
For i = 0 To nPts - 1
Debug Str(i) + ",{" + a$(i) + "}"
Next i
Next u
t4 = ElapsedMilliseconds()-time
r$ = LSet("; Count(n),",tw) + Str(#Tries) + #CRLF$
r$ + LSet("; "+code1$+"(ms),",tw) + Str(t1) + #CRLF$
r$ + LSet("; "+code2$+"(ms),",tw) + Str(t2) + #CRLF$
r$ + LSet("; "+code3$+"(ms),",tw) + Str(t3) + #CRLF$
r$ + LSet("; "+code4$+"(ms),",tw) + Str(t4) + #CRLF$
r$ + LSet("; "+code1$+" : "+code2$+"(%),",tw) + StrD(ML_pcChange(t1,t2),2) + #CRLF$
r$ + LSet("; "+code1$+" : "+code3$+"(%),",tw) + StrD(ML_pcChange(t1,t3),2) + #CRLF$
r$ + LSet("; "+code1$+" : "+code4$+"(%),",tw) + StrD(ML_pcChange(t1,t4),2) + #CRLF$
If MessageRequester("Speed Test - Copy To Clipboard?",r$,#PB_MessageRequester_YesNo) = #PB_MessageRequester_Yes
SetClipboardText(r$)
EndIf
SetPriorityClass_(GetCurrentProcess_(), #NORMAL_PRIORITY_CLASS)
CompilerEndIf
Debug "--END--"
Debug s$
;-} TEST SPEED
;/////////////////////////////////////////////
; d$ = ",,"
; s$ = 10 x r$ = ",,,, ,,A,,,B,,C,,ZZ,"
; Count(n), 100000
; SplitC(ms), 1373
; SplitWil(ms), 1380
; Split(ms), 1050
; SplitPB(ms), 3116
; SplitC : SplitWil(%), 0.51
; SplitC : Split(%), -23.53
; SplitC : SplitPB(%), 126.95
;/////////////////////////////////////////////
; d$ = ",,"
; s$ = 10000 x r$ = ",,,, ,,A,,,B,,C,,ZZ,"
; Count(n), 1
; SplitC(ms), 8
; SplitWil(ms), 2816
; Split(ms), 17
; SplitPB(ms), 21749
; SplitC : SplitWil(%), 35100.00
; SplitC : Split(%), 112.50
; SplitC : SplitPB(%), 271762.50
;/////////////////////////////////////////////
; d$ = ","
; s$ = 10000 x r$ = ",,,, ,,A,,,B,,C,,ZZ,"
; Count(n), 1
; SplitC(ms), 22
; SplitWil(ms), 6441
; Split(ms), 35
; SplitPB(ms), 13602
; SplitC : SplitWil(%), 29177.27
; SplitC : Split(%), 59.09
; SplitC : SplitPB(%), 61727.27
;/////////////////////////////////////////////
EDIT: Edits made per nco2k suggestions and more optimizations.