Code: Select all
; -- lexical parser for a simplified C Language
; Tested in UTF8
; PBx64 v5.71
;
; Based on https://rosettacode.org/wiki/Compiler/lexical_analyzer
; And
; https://rosettacode.org/wiki/Compiler/syntax_analyzer
; Distribute and use freely
;
; Kingwolf71 Feb/2020
; Bugfix: 2/3/2020
;
DeclareModule LJLang
#WithEOL = 1
Structure stTree
NodeType.i
value.s
*left.stTree
*right.stTree
EndStructure
Declare.s Error( *error.Integer )
Declare Parse()
Declare LoadLJ( file.s )
EndDeclareModule
Module LJLang
EnableExplicit
; ======================================================================================================
;- Constants
; ======================================================================================================
#LJTOKENS = 36
#LJATTRS = 26
Enumeration
#ljIDENT
#ljINT
#ljIF
#ljPRTS
#ljWHILE
#ljNEGATE
#ljMULTIPLY
#ljMOD
#ljSUBTRACT
#ljLESSEQUAL
#ljGreaterEqual
#ljNotEqual
#ljOr
#ljSTR
#ljSEQ
#ljPRTC
#ljPRTI
#ljASSIGN
#ljNOT
#ljDIVIDE
#ljADD
#ljLESS
#ljGREATER
#ljEQUAL
#ljAND
#ljXOR
#ljLeftBrace
#ljRightBrace
#ljLeftParent
#ljRightParent
#ljSemi
#ljComma
#ljElse
#ljOP
#ljKeyword
#ljEOF
EndEnumeration
; ======================================================================================================
;- Structures
; ======================================================================================================
Structure stSymbols
name.s
TokenType.i
EndStructure
Structure stToken
name.s
TokenType.i
TokenExtra.i
value.s
row.i
col.i
EndStructure
Structure stPrec
bRightAssociation.i
bBinary.i
bUnary.i
Precedence.i
NodeType.i
EndStructure
; ======================================================================================================
;- Macros
; ======================================================================================================
Macro NextToken()
;Debug "---[ " + #PB_Compiler_Procedure + " ]---"
;Debug Str( ListIndex( llTokenList() ) ) + " " + llTokenList()\name
NextElement( llTokenList() )
EndMacro
Macro Install( symbolname, id )
AddElement( llSymbols() )
llSymbols()\name = symbolname
llSymbols()\TokenType = id
EndMacro
Macro AddTokenSimple( tkentype )
AddElement( llTokenList() )
llTokenList()\TokenType = tkentype
llTokenList()\TokenExtra= tkentype
llTokenList()\name = gszATR( tkentype )
llTokenList()\row = gLineNumber
llTokenList()\col = gCol
EndMacro
Macro AddToken( tkentype, tkenextra, text, info )
AddElement( llTokenList() )
llTokenList()\TokenType = tkentype
llTokenList()\TokenExtra= tkenextra
llTokenList()\row = gLineNumber
llTokenList()\col = gCol
If text = ""
llTokenList()\name = gszATR( tkenextra )
Else
llTokenList()\name = text
EndIf
llTokenList()\value = info
EndMacro
Macro SetError( text, err )
If err > 0 And err < 10
gszlastError = text + " on line " + Str( gLineNumber ) + ", col = " + Str( gCol )
ElseIf err > 10
gszlastError = text + " on line " + Str( llTokenList()\row ) + ", col = " + Str( llTokenList()\col )
Else
gszlastError = text
EndIf
gLastError = err
ProcedureReturn err
EndMacro
Macro NextCharacter()
gNextChar = Mid( gszFileText, gPos, 1 )
gPos + 1 : gCol + 1
If gNextChar = #LF$
gCol = 1
gLineNumber + 1
gNextChar = Mid( gszFileText, gPos, 1 )
gPos + 1
EndIf
EndMacro
Macro SetPre( id, right, bi, un, prec )
gPreTable( id )\NodeType = id
gPreTable( id )\bRightAssociation = right
gPreTable( id )\bBinary = bi
gPreTable( id )\bUnary = un
gPreTable( id )\Precedence = prec
EndMacro
Macro SetPre2( id, op )
gPreTable( id )\bRightAssociation = 0
gPreTable( id )\bBinary = 0
gPreTable( id )\bUnary = 0
gPreTable( id )\Precedence = -1
gPreTable( id )\NodeType = op
EndMacro
Macro DebugParser()
Debug "---[ Parser ]--------------"
ForEach llTokenList()
temp = RSet( Str(llTokenList()\row), 6 ) + " " + RSet( Str(llTokenList()\col), 6 ) + " "
If llTokenList()\TokenExtra <> llTokenList()\TokenType
temp + LSet( gszATR( llTokenList()\TokenType ) + "_" + llTokenList()\name, 34 ) + llTokenList()\value
Else
temp + LSet( llTokenList()\name, 34 ) + llTokenList()\value
EndIf
Debug temp
Next
EndMacro
; ======================================================================================================
;- Globals
; ======================================================================================================
Global Dim gszATR.s( #LJTOKENS )
Global Dim gPreTable.stPrec( #LJTOKENS )
Global NewList llSymbols.stSymbols()
Global NewList llTokenList.stToken()
Global gLineNumber
Global gStack
Global gCol
Global gMemSize
Global gPos
Global gExit
Global gszFileText.s
Global gszlastError.s
Global gNextChar.s
Global gLastError
Global gszInv.s = Chr( 34 )
Global gszEOF.s = Chr( 255 )
; ======================================================================================================
;- Procedure Definitions
; ======================================================================================================
Declare paren_expr()
; ======================================================================================================
;- Module Function
; ======================================================================================================
Procedure Init()
gszATR( 0 ) = "Identifier"
gszATR( 1 ) = "Integer"
gszATR( 2 ) = "If"
gszATR( 3 ) = "Prts"
gszATR( 4 ) = "While"
gszATR( 5 ) = "Negate"
gszATR( 6 ) = "Multiply"
gszATR( 7 ) = "Mod"
gszATR( 8 ) = "Subtract"
gszATR( 9 ) = "LessEqual"
gszATR( 10 ) = "GreaterEqual"
gszATR( 11 ) = "NotEqual"
gszATR( 12 ) = "Or"
gszATR( 13 ) = "String"
gszATR( 14 ) = "Sequence"
gszATR( 15 ) = "Prtc"
gszATR( 16 ) = "Prti"
gszATR( 17 ) = "Assign"
gszATR( 18 ) = "Not"
gszATR( 19 ) = "Divide"
gszATR( 20 ) = "Add"
gszATR( 21 ) = "Less"
gszATR( 22 ) = "Greater"
gszATR( 23 ) = "Equal"
gszATR( 24 ) = "And"
gszATR( 25 ) = "Xor"
gszATR( 26 ) = "LeftBrace"
gszATR( 27 ) = "RightBrace"
gszATR( 28 ) = "LeftParent"
gszATR( 29 ) = "RightParent"
gszATR( 30 ) = "SemiColon"
gszATR( 31 ) = "Comma"
gszATR( 32 ) = "Else"
gszATR( 33 ) = "Op"
gszATR( 34 ) = "Keyword"
gszATR( 35 ) = "End of file"
SetPre2( #ljEOF, -1 )
SetPre( #ljMULTIPLY, 0, 1, 0, 13 )
SetPre( #ljDIVIDE, 0, 1, 0, 13 )
SetPre( #ljMOD, 0, 1, 0, 13 )
SetPre( #ljADD, 0, 1, 0, 12 )
SetPre( #ljSUBTRACT, 0, 1, 0, 12 )
SetPre( #ljNEGATE, 0, 0, 1, 14 )
SetPre( #ljNOT, 0, 0, 1, 14 )
SetPre( #ljLESS, 0, 1, 0, 10 )
SetPre( #ljLESSEQUAL, 0, 1, 0, 10 )
SetPre( #ljGREATER, 0, 1, 0, 10 )
SetPre( #ljGreaterEqual, 0, 1, 0, 10 )
SetPre( #ljEQUAL, 0, 1, 0, 9 )
SetPre( #ljNotEqual, 0, 1, 0, 9 )
SetPre2( #ljASSIGN, #ljASSIGN )
SetPre( #ljAND, 0, 1, 0, 5 )
SetPre( #ljOr, 0, 1, 0, 4 )
SetPre2( #ljIF, #ljIF )
SetPre2( #ljElse, -1 )
SetPre2( #ljWHILE, #ljWHILE )
SetPre2( #ljPRTS, -1 )
SetPre2( #ljPRTI, -1 )
SetPre2( #ljPRTC, -1 )
SetPre2( #ljLeftParent, -1 )
SetPre2( #ljLeftBrace, -1 )
SetPre2( #ljRightParent, -1 )
SetPre2( #ljRightBrace, -1 )
SetPre2( #ljComma, -1 )
SetPre2( #ljSemi, -1 )
SetPre2( #ljIDENT, #ljIDENT )
SetPre2( #ljINT, #ljINT )
SetPre2( #ljSTR, #ljSTR )
ClearList( llTokenList() )
ClearList( llSymbols() )
gLineNumber = 1
gCol = 1
gStack = 0
gExit = 0
Install( "else", #ljElse )
install( "if", #ljIF )
install( "print", #ljPRTS )
install( "putc", #ljPRTC )
install( "while", #ljWHILE )
EndProcedure
Procedure LoadLJ( filename.s )
Protected f, *mem
gMemSize = FileSize( filename )
If gMemSize > 0
f = ReadFile( #PB_Any, filename, #PB_UTF8 )
If Not f
SetError( "Could not open file", -3 )
EndIf
*Mem = AllocateMemory( gMemSize + 16, #PB_Memory_NoClear )
ReadData( f, *Mem, gMemSize )
CloseFile( f )
CompilerIf( #WithEOL = 1 )
gszFileText = PeekS( *mem, -1, #PB_UTF8 ) + gszEOF
CompilerElse
gszFileText = PeekS( *mem, -1, #PB_UTF8 )
CompilerEndIf
gMemSize = Len( gszFileText )
FreeMemory( *mem )
gpos = 1
ProcedureReturn 0
EndIf
SetError( "Invalid file", -2 )
EndProcedure
Procedure.s Error( *error.Integer )
Protected szerror.s
If gLastError
*error\i = gLastError
szerror = gszlastError
gLastError = 0
gszlastError = ""
ProcedureReturn szerror
EndIf
*error\i = 0
ProcedureReturn ""
EndProcedure
Procedure IsNumber( init.i = 0 )
Static flag
If init
flag = 0
Else
If gNextChar >= "0" And gNextChar <= "9"
ProcedureReturn 1
ElseIf Not flag And gNextChar = ".'"
flag + 1
ProcedureReturn 1
EndIf
EndIf
ProcedureReturn 0
EndProcedure
Procedure IsAlpha()
If ( gNextChar >= "a" And gNextChar <= "z" ) Or (gNextChar >= "A" And gNextChar <= "Z" ) Or (gNextChar >= "0" And gNextChar <= "9" )
ProcedureReturn 1
EndIf
ProcedureReturn 0
EndProcedure
Procedure Follow( expect.s, ifyes.i, ifno.i, *err.Integer )
NextCharacter()
If gNextChar = expect
AddToken( #ljOP, ifyes, "", "" )
Else
If ifno = -1
*err\i = 5
SetError( "follow unrecognized character", 5 )
Else
AddToken( #ljOP, ifno, "", "" )
gPos - 1
EndIf
EndIf
ProcedureReturn 0
EndProcedure
Procedure Scanner()
Protected err, first, i
Protected.s text, temp
While gPos <= gMemSize
NextCharacter()
Select gNextChar
Case gszEOF
AddTokenSimple( #ljEOF )
Break
Case " ", #CR$, #TAB$, ""
Continue
Case "{"
AddTokenSimple( #ljLeftBrace )
Case "}"
AddTokenSimple( #ljRightBrace )
Case "("
AddTokenSimple( #ljLeftParent )
Case ")"
AddTokenSimple( #ljRightParent )
Case "+"
AddToken( #ljOP, #ljADD, "", "" )
Case "-"
AddToken( #ljOP, #ljSUBTRACT, "", "" )
Case "*"
AddToken( #ljOP, #ljMULTIPLY, "", "" )
Case "%"
AddToken( #ljOP, #ljMOD, "", "" )
Case ";"
AddTokenSimple( #ljSemi )
Case ","
AddTokenSimple( #ljComma )
Case "/"
NextCharacter()
If gNextChar <> "*"
AddToken( #ljOP, #ljDIVIDE, "", "" )
gPos - 1
Else ;Must be a comment
NextCharacter()
text = ""
Repeat
NextCharacter()
text + gNextChar
If gNextChar = "*"
NextCharacter()
If gNextChar = "/"
Break
EndIf
ElseIf gNextChar = #CR$
gLineNumber + 1
EndIf
Until gPos >= gMemSize
If gPos >= gMemSize
SetError( "EOF in comment", 1 )
EndIf
EndIf
Case "'"
NextCharacter()
If gNextChar = "'"
SetError( "Empty character literal", 2 )
ElseIf gNextChar = "\"
NextCharacter()
Select gNextChar
Case "'"
SetError( "Empty escape character literal", 2 )
Case "n"
first = 10
Case "r"
first = 13
Case "\"
first = 92
Default
SetError( "Invalid escape character", 3 )
EndSelect
Else
first = Asc( gNextChar )
EndIf
NextCharacter()
If gNextChar <> "'"
SetError( "Multi-Character literal", 4 )
Else
AddToken( #ljINT, #ljINT, "", Str(first) )
EndIf
Case "<"
If Follow( "=", #ljLESSEQUAL, #ljLESS, @err ) : ProcedureReturn err : EndIf
Case ">"
If Follow( "=", #ljGreaterEqual, #ljGREATER, @err ) : ProcedureReturn err : EndIf
Case "!"
If Follow( "=", #ljNotEqual, #ljNOT, @err ) : ProcedureReturn err : EndIf
Case "="
If Follow( "=", #ljEQUAL, #ljASSIGN, @err ) : ProcedureReturn err : EndIf
Case "&"
If Follow( "&", #ljAND, -1, @err ) : ProcedureReturn err : EndIf
Case "|"
If Follow( "|", #ljOr, -1, @err ) : ProcedureReturn err : EndIf
Case "%"
If Follow( "%%", #ljxOr, -1, @err ) : ProcedureReturn err : EndIf
Case gszInv
text = gNextChar
Repeat
NextCharacter()
If gNextChar = gszInv
text + gNextChar
AddToken( #ljStr, #ljSTR, "", text )
Break
ElseIf gNextChar = #LF$ Or gNextChar = #CR$
SetError( "EOL in string", 7 )
Else
text + gNextChar
EndIf
Until gPos >= gMemSize
If gPos >= gMemSize
SetError( "EOF in string", 6 )
EndIf
Default
IsNumber( 1 ) ; reset digit flag
first = IsNumber()
text = ""
While gPos < gMemSize And ( IsAlpha() Or gNextChar = "_" )
If Not IsNumber() : first = 0 : EndIf
text + gNextChar
NextCharacter()
Wend
If gPos >= gMemSize
SetError( "EOL in number or variable '" + text + "'", 8 )
EndIf
If Len( text ) < 1
SetError( "Unknown sequence number or variable '" + text + "'", 9 )
EndIf
gPos - 1
i = 0
temp = LCase( text )
ForEach llSymbols()
i + 1
If llSymbols()\name = temp
AddToken( #ljKeyword, llSymbols()\TokenType, "", text )
i = -1
Break
EndIf
Next
If i > 0
If first
AddToken( #ljINT, #ljINT, "", text )
Else
AddToken( #ljIDENT, #ljIDENT, "", text )
EndIf
EndIf
EndSelect
Wend
ProcedureReturn 0
EndProcedure
Procedure Expect( function.s, TokenType )
;Debug "--Expect--"
;Debug llTokenList()\name + " --> " + gszATR( TokenType )
If llTokenList()\TokenExtra = TokenType
NextToken()
ProcedureReturn 0
EndIf
SetError( "Expecting " + gszATR( TokenType ) + " but found " + gszATR( llTokenList()\TokenExtra ) + " for " + function, 11 )
EndProcedure
Procedure MakeNode( NodeType, *left.stTree, *right.stTree )
Protected *p.stTree = AllocateStructure( stTree )
*p\NodeType = NodeType
*p\left = *left
*p\right = *right
ProcedureReturn *p
EndProcedure
Procedure Makeleaf( NodeType, value.s )
Protected *p.stTree = AllocateStructure( stTree )
*p\NodeType = NodeType
*p\value = value
ProcedureReturn *p
EndProcedure
Procedure expr( var )
Protected.stTree *x, *node
Protected op, q
Select llTokenList()\TokenExtra
Case #ljLeftParent
*x = paren_expr()
Case #ljSUBTRACT, #ljADD
op = llTokenList()\TokenExtra
NextToken()
*node = expr( gPreTable( #ljNEGATE )\Precedence )
If op = #ljSUBTRACT
*x = MakeNode( #ljNEGATE, *node, 0 )
Else
*x = *Node
EndIf
Case #ljNOT
NextToken()
*x = MakeNode( #ljNOT, expr( gPreTable( #ljNOT )\Precedence ), 0 )
Case #ljIDENT
*x = Makeleaf( #ljIDENT, llTokenList()\value )
NextToken()
Case #ljINT
*x = Makeleaf( #ljINT, llTokenList()\value )
NextToken()
Default
SetError( "Expecting a primary, found " + llTokenList()\name, 12 )
EndSelect
While gPreTable( llTokenList()\TokenExtra )\bBinary And gPreTable( llTokenList()\TokenExtra )\Precedence >= var
op = llTokenList()\TokenExtra
NextToken()
q = gPreTable( op )\Precedence
If Not gPreTable( op )\bRightAssociation
q + 1
EndIf
*node = expr( q )
*x = MakeNode( gPreTable( op )\NodeType, *x, *node )
Wend
ProcedureReturn *x
EndProcedure
Procedure paren_expr()
Protected *p.stTree
Expect( "paren_expr", #ljLeftParent )
*p = expr( 0 )
Expect( "paren_expr", #ljRightParent )
ProcedureReturn *p
EndProcedure
Procedure stmt()
Protected.stTree *p, *v, *e, *s, *s2
gStack + 1
If gStack > 32
NextToken()
SetError( "Stack overflow", 15 )
EndIf
;Debug llTokenList()\name + " --> " + gszATR( llTokenList()\TokenExtra )
Select llTokenList()\TokenExtra
Case #ljIF
NextToken()
*e = paren_expr()
*s = stmt()
*s2 = 0
If llTokenList()\TokenType = #ljElse
NextToken()
*s2 = stmt()
EndIf
*p = MakeNode( #ljIF, *e, MakeNode( #ljIF, *s, *s2 ) )
Case #ljPRTC
NextToken()
*e = paren_expr()
*p = MakeNode( #ljPRTC, *e, 0 )
expect( "putc", #ljSemi )
Case #ljPRTS
NextToken()
expect( "print", #ljLeftParent )
Repeat
If llTokenList()\TokenExtra = #ljSTR
*e = MakeNode( #ljPRTS, Makeleaf( #ljSTR, llTokenList()\value ), 0 )
NextToken()
Else
*e = MakeNode( #ljPRTI, expr( 0 ), 0 )
EndIf
*p = MakeNode( #ljSEQ, *p, *e )
If llTokenList()\TokenExtra <> #ljComma
Break
EndIf
expect( "print", #ljComma )
Until llTokenList()\TokenType = #ljEOF
Expect( "Print", #ljRightParent )
Expect( "Print", #ljSemi )
Case #ljSemi
NextToken()
Case #ljIDENT
*v = Makeleaf( #ljIDENT, llTokenList()\value )
NextToken()
Expect( "Assign", #ljASSIGN )
*e = expr( 0 )
*p = MakeNode( #ljASSIGN, *v, *e )
Expect( "Assign", #ljSemi )
Case #ljWHILE
NextToken()
*e = paren_expr()
*s = stmt()
*p = MakeNode( #ljWHILE, *e, *s )
Case #ljLeftBrace
Expect( "Left Bracket", #ljLeftBrace )
While llTokenList()\TokenExtra <> #ljRightBrace And llTokenList()\TokenExtra <> #ljEOF
*p = MakeNode( #ljSEQ, *p, stmt() )
Wend
Expect( "Left Bracket", #ljRightBrace )
Case #ljEOF
gExit + 1
Default
SetError( "Expecting beginning of a statament, found " + llTokenList()\name, 14 )
EndSelect
ProcedureReturn *p
EndProcedure
Procedure DisplayNode( *p.stTree )
If *p
If *p\NodeType = #ljIDENT Or *p\NodeType = #ljINT Or *p\NodeType = #ljSTR
Debug LSet( gszATR( *p\NodeType ), 30 ) + *p\value
Else
Debug LSet( gszATR( *p\NodeType ), 30 )
DisplayNode( *p\left )
DisplayNode( *p\right )
EndIf
Else
Debug ";"
EndIf
EndProcedure
Procedure Parse()
Protected err
Protected *p.stTree
Protected total
Protected.s temp
Init()
If Scanner()
ProcedureReturn gLastError
EndIf
DebugParser()
Debug "---[ Syntax Analysis ]--------------"
FirstElement( llTokenList() )
total = ListSize( llTokenList() ) - 1
Repeat
gStack = 0
*p = MakeNode( #ljSEQ, *p, stmt() )
If gLastError
Debug gszlastError
Break
EndIf
Until ListIndex( llTokenList() ) >= total Or gExit
DisplayNode( *p )
EndProcedure
EndModule
CompilerIf #PB_Compiler_IsMainFile
; -- Module demo
Define err
Define *t.LJLang::stTree
If LJLang::LoadLJ( "demo4.lj" )
Debug "Error: " + LJLang::Error( @err )
EndIf
If LJLang::Parse()
Debug "Error: " + LJLang::Error( @err )
EndIf
CompilerEndIf
Example 1
Code: Select all
count = 1;
while (count < 10) {
print("count is: ", count, "\n");
count = count + 1;
}
- Some bug fixes
- Made output in line with other examples
- Integrated the syntax analysis
EDIT3: 2/Mar/2020
- Bugfix in syntax generator
Personal note
I am impressed how easy it was to translate the C code into Purebasic.