#
# python for parser interpretation
# Copyright Aaron Robert Watters, 1994
#
# BUGS:
#  Lexical error handling is not nice
#  Parse error handling is not nice
#
#  Lex analysis may be slow for big grammars
#  Setting case sensitivity for keywords MUST happen BEFORE
#    declaration of keywords.

import kjSet
import string
import re

# set this flag for regression testing at each load
RUNTESTS = 0
# set this flag to enable warning for default reductions
WARNONDEFAULTS = 0

# some local constants
NOMATCHFLAG = -2  # FLAG FOR NO MATCH IN FSM
MOVETOFLAG = -3   # FLAG FOR "SHIFT" IN AN FSM
REDUCEFLAG = -4   # FLAG FOR REDUCTION IN FSM
TRANSFLAG = -5    # FLAG FOR TRANSIENT STATE IN FSM
KEYFLAG = -6      # FLAG FOR KEYWORD
NONTERMFLAG = -7  # FLAG FOR NONTERMINAL
TERMFLAG = -8     # FLAG FOR TERMINAL
EOFFLAG = "*"     # FLAG for End of file

# set this string to the Module name (filename)
# used for dumping reconstructable objects
THISMODULE = "kjParser"

# regular expression for matching whitespace
WHITERE = "[" + string.whitespace + "]+"
WHITEREGEX = re.compile(WHITERE)

# local errors
LexTokenError = "LexTokenError"    # may happen on bad string
UnkTermError = "UnkTermError"      # ditto
BadPunctError = "BadPunctError"    # if try to make whitespace a punct
ParseInitError = "ParseInitError"  # shouldn't happen?
#EOFError                          # may happen on bad string
FlowError = "FlowError"            # shouldn't happen!!! (bug)
#SyntaxError                       # may happen on bad string
#TypeError
ReductError = "ReductError"        # shouldn't happen?
NondetError = "NondetError"        # shouldn't happen?

# the end of file is interpreted in the lexical stream as
# a terminal...
#   this should be appended to the lexical stream:
ENDOFFILETOKEN = (TERMFLAG, EOFFLAG)
#   in the FSM use the following terminal to indicate eof
ENDOFFILETERM = (ENDOFFILETOKEN, EOFFLAG)

# utility function for match conversion from regex to re
# (returns the length of the match, or -1 on no match)
def RMATCH(regexpr, key, start=0):
    #print "RMATCH: %s -> %s <- start=%s" % (regexpr.pattern, key, start)
    group = regexpr.match(key, start)
    if group is None:
        #print "RMATCH: -1"
        return -1
    length = group.end() - group.start()
    #print "RMATCH: %s (%s)" % (length, group.group())
    return length

# utility function for error diagnostics
def DumpStringWindow(Str, Pos, Offset=15):
    L = []
    L.append("near ::")
    start = Pos - Offset
    end = Pos + Offset
    if start < 0: start = 0
    if end > len(Str): end = len(Str)
    L.append(`Str[start:Pos]` + "*" + `Str[Pos:end]`)
    from string import join
    return join(L, "\n")

# lexical dictionary class
#   this data structure is used by the lexical parser below.
#
# basic operations:
#   LD.punctuation(string)
#      registers a string as a punctuation
#      EG: LD.punctuation(":")
#   Punctuations are treated as a special kind of keyword
#   that is recognized even when not surrounded by whitespace.
#   IE, "xend" will not be recognized as "x end", but "x;" will be
#   recognized as "x ;" if "end" is a regular keyword but
#   ";" is a punctuation.  Only single character punctuations
#   are supported (now), ie, ":=" must be recognized as
#   ":" "=" above the lexical level.
#
#   LD.comment(reg_expression_string)
#      registers a comment pattern
#      EG: LD.comment("--.*\n")
#      asks to recognize ansi/sql comments like "-- correct?\n"
#
#   LD[keyword_string] = canonicalstring # assignment!
#      specifies a keyword string that should map to the canonicalstring
#      when translated to the lexical stream.
#      EG: LD["begin"] = "BEGIN"; LD["BEGIN"] = "BEGIN"
#      will recognize upcase or downcase begins, not mixed case.
#      (automatic upcasing is allowed below at parser level).
#   LD[compiled_reg_expression] = (TerminalFlag, Function) # assignment!
#      specifies a regular expression that should be associated
#      with the lexical terminal marker TerminalFlag
#      EG: LD[re.compile("[0-9]+")] = ("integer", string.atoi)
#      the Function should be a function on one string argument
#      that interprets the matching string as a value.  if None is
#      given, just the string itself will be used as the
#      interpretation.  (a better choice above would be a function
#      which "tries" atoi first and uses atol on overflow).
#      NOTE: ambiguity among regular expressions will be decided
#      arbitrarily (fix?).
#
#   LD[string] # retrieval!
#      returns ((KEYFLAG, Keywordstring), Keywordstring)
#      if the (entire) string matches a keyword or a
#      punctuation Keywordstring.
#      otherwise returns ((TERMFLAG, Terminalname), value)
#      if the (entire) string matches the regular expression for
#      a terminal flagged by Terminalname; value is the interpreted
#      value.  TerminalFlag had better be something other than
#      KEYFLAG!
#      otherwise raises an error!
#      comments are not filtered here!
#
# the following additional functions are used for autodocumentation
# in declaring rules, etcetera.
#   begin = LD.keyword("begin")
#      sets variable "begin" to (KEYFLAG, "BEGIN") if
#      "begin" maps to keyword "BEGIN" in LD
#   integer = LD.terminal("integer")
#      sets variable integer to the token (TERMFLAG, "integer")
#      if "integer" is a registered terminal.
#
class LexDictionary:

    def __init__(self):
        # commentpatterns is simply a list of compiled regular expressions
        # that represent comments
        self.commentpatterns = []
        # commentstrings is used for debugging/dumping/reconstruction etc.
        self.commentstrings = []
        # punctuationlist is a string of punctuations
        self.punctuationlist = ""
        # keywordmap is a dictionary mapping recognized keyword strings
        # and punctuations to their constant representations.
        self.keywordmap = KeywordDict()
        # regexprlist is a list of triples (regexpr, Flag, Function) mapping
        # regular expressions to their flag and interpreter function.
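        # (illustrative entry, as built by __setitem__ below:
        #  [(re.compile("[0-9]+"), (TERMFLAG, "integer"), string.atoi)] )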
        self.regexprlist = []

    def Dump(self):
        print "comments = ", self.commentstrings
        print "punctuations = ", self.punctuationlist
        print "keywordmap ="
        self.keywordmap.Dump()
        print "regexprlist =", self.regexprlist

    def __getitem__(self, key):
        # try to match string to a keyword
        try:
            return self.keywordmap[key]
        except KeyError:
            # try to match a regular expression
            length = len(key)
            for triple in self.regexprlist:
                (regexpr, Flag, Function) = triple
                index = RMATCH(regexpr, key)
                if index == length:
                    # use the function to interpret the string, if given
                    if Function != None:
                        value = Function(key)
                    else:
                        value = key
                    # NONLOCAL RETURN
                    return (Flag, value)
            #endfor
            raise LexTokenError, "no match for string: " + `key`
    #enddef __getitem__

    # LD.keyword("this") will make a new keyword "this" if not found
    #
    def keyword(self, str):
        # upcase the string, if needed
        if self.keywordmap.caseInsensitive:
            str = string.upper(str)
        if not self.keywordmap.has_key(str):
            # redundancy to avoid excess construction during parsing
            token = (KEYFLAG, str)
            self.keywordmap[str] = (token, str)
        else:
            (token, str2) = self.keywordmap[str]
        return token

    # LD.terminal("this") will just look for "this"
    # LD.terminal("this", RE, F) will register a new terminal
    #   RE must be a compiled regular expression or string reg ex
    #   F must be an interpretation function
    #   (or None, to use the matched string itself)
    #
    def terminal(self, name, RegExpr=None, Function=None):
        # register the terminal if a regular expression is given
        # (Function may legitimately be None)
        if RegExpr != None:
            if type(RegExpr) == type(""):
                RegExpr = re.compile(RegExpr)
            self[RegExpr] = (name, Function)
        # look up the terminal, whether just registered or not
        for triple in self.regexprlist:
            (regexpr, token, fn) = triple
            if token[1] == name:
                # nonlocal exit
                return token
        #endfor
        # error if no exit by now
        raise UnkTermError, "no such terminal"

    def __setitem__(self, key, value):
        if type(key) == type(''):
            # if it's a string it must be a keyword
            if self.keywordmap.caseInsensitive:
                value = string.upper(value)
                key = string.upper(key)
            self.keywordmap[key] = ((KEYFLAG, value), value)
        else:
            # otherwise it had better be a compiled regular expression
            # (not verified)
            (Name, Function) = value
            Flag = (TERMFLAG, Name)
            regexpr = key
            self.regexprlist = self.regexprlist + \
                [(regexpr, Flag, Function)]

    # register a regular expression as a comment
    def comment(self, str):
        # str had better be an uncompiled string regular expression!
        # (not verified)
        regexpr = re.compile(str)
        self.commentpatterns = self.commentpatterns + [regexpr]
        self.commentstrings = self.commentstrings + [str]

    # register a string as a punctuation
    def punctuation(self, Instring):
        if type(Instring) != type("") or len(Instring) != 1:
            raise BadPunctError, "punctuation must be string of length 1"
        if Instring in string.whitespace:
            raise BadPunctError, "punctuation may not be whitespace"
        self.punctuationlist = self.punctuationlist + Instring
        return self.keyword(Instring)

    # testing and altering case sensitivity behavior
    def isCaseSensitive(self):
        return not self.keywordmap.caseInsensitive

    # setting case sensitivity MUST happen before keyword
    # declarations!
    def SetCaseSensitivity(self, Boolean):
        self.keywordmap.caseInsensitive = not Boolean

    # function to do same as __getitem__ above but looking _inside_ a
    # string instead of at the whole string
    #   returns (token, skip)
    #   where token is of form
    #     ((KEYFLAG, name), name) or ((TERMFLAG, termname), value)
    #   and skip is the length of the substring that matches the token
    def Token(self, String, StartPosition):
        finished = 0 # dummy, exit should be nonlocal
        totalOffset = 0
        while not finished:
            # flag EOF if past end of string
            if len(String) <= StartPosition:
                return (ENDOFFILETERM, 0)
            # skip whitespace
            whitespacefound = 0
            skip = RMATCH(WHITEREGEX, String, StartPosition)
            if skip > 0:
                StartPosition = StartPosition + skip
                totalOffset = totalOffset + skip
                whitespacefound = 1
            # try to find comment, keyword, term in that order:
            # looking for comment
            commentfound = 0
            for commentexpr in self.commentpatterns:
                offset = RMATCH(commentexpr, String, StartPosition)
                if offset != -1:
                    if offset < 1:
                        info = DumpStringWindow(String, StartPosition)
                        raise LexTokenError, "zero length comment " + info
                    commentfound = 1
                    StartPosition = StartPosition + offset
                    totalOffset = totalOffset + offset
            # looking for a keyword
            keypair = self.keywordmap.hasPrefix(String, StartPosition,
                                                self.punctuationlist)
            if keypair != 0:
                return (keypair[0], keypair[1] + totalOffset)
            # looking for a terminal
            for (regexpr, Flag, Function) in self.regexprlist:
                offset = RMATCH(regexpr, String, StartPosition)
                if offset != -1:
                    matchstring = String[StartPosition : offset + StartPosition]
                    if Function != None:
                        value = Function(matchstring)
                    else:
                        value = matchstring
                    return ((Flag, value), offset + totalOffset)
            if not (commentfound or whitespacefound):
                info = DumpStringWindow(String, StartPosition)
                raise LexTokenError, "Lexical parse failure " + info
        #endwhile
    #enddef
#endclass LexDictionary

# alternate, experimental implementation
class lexdictionary:

    def __init__(self):
        self.skip = ""
        self.commentstrings = []
        self.punctuationlist = ""
        self.keywordmap = KeywordDict()
        self.termlist = []  # list of (term, regex, flag, interpret_fn)
        self.uncompiled = 1 # only compile after full initialization.
        self.laststring = self.lastindex = self.lastresult = None

    def Dump(self, *k):
        raise "sorry", "not implemented"

    __getitem__ = Dump

    def keyword(self, str):
        kwm = self.keywordmap
        if kwm.caseInsensitive:
            str = string.upper(str)
        try:
            (token, str2) = kwm[str]
        except:
            token = (KEYFLAG, str)
            self.keywordmap[str] = (token, str)
        return token

    def terminal(self, str, regexstr=None, Function=None):
        if regexstr is not None:
            flag = (TERMFLAG, str)
            self.termlist.append((str, regexstr, flag, Function))
            return flag
        else:
            # termlist holds 4-tuples (term, regex, flag, interpret_fn)
            for (term, rgex, fl, fn) in self.termlist:
                if fl[1] == str:
                    return fl
            else:
                raise UnkTermError, "no such terminal"

    __setitem__ = Dump

    def comment(self, str):
        self.commentstrings.append(str)

    def punctuation(self, Instring):
        if type(Instring) != type("") or len(Instring) != 1:
            raise BadPunctError, "punctuation must be string of length 1"
        if Instring in string.whitespace:
            raise BadPunctError, "punctuation may not be whitespace"
        self.punctuationlist = self.punctuationlist + Instring
        return self.keyword(Instring)

    def SetCaseSensitivity(self, Boolean):
        self.keywordmap.caseInsensitive = not Boolean

    def Token(self, String, StartPosition):
        # shortcut for reductions: the same token is typically requested
        # again after a reduction, so cache the last result.
        if self.laststring is String and self.lastindex == StartPosition:
            #print "lastresult", self.lastresult
            return self.lastresult
        self.lastindex = StartPosition
        self.laststring = String
        #print `String[StartPosition: StartPosition+60]`
        if self.uncompiled:
            self.compile()
            self.uncompiled = None
        finished = 0
        totalOffset = 0
        skipprog = self.skipprog
        keypairfn = self.keywordmap.hasPrefix
        punctlist = self.punctuationlist
        termregex = self.termregex
        while not finished:
            if len(String) <= StartPosition:
                result = self.lastresult = (ENDOFFILETERM, 0)
                return result
            # skip ws and comments
            #skip = skipprog.match(String, StartPosition)
            skip = RMATCH(skipprog, String, StartPosition)
            if skip >= 0:
                if skip == 0:
                    info = DumpStringWindow(String, StartPosition)
                    raise LexTokenError, \
                        "zero length whitespace or comment " + info
                StartPosition = StartPosition + skip
                totalOffset = totalOffset + skip
                continue
            # look for a keyword
            keypair = keypairfn(String, StartPosition, punctlist)
            if keypair != 0:
                #print "keyword", keypair
                result = self.lastresult = (keypair[0],
                                            keypair[1] + totalOffset)
                return result
            # look for a terminal
            #print "Termregex: %s --> %s <-- start=%s" % (termregex.pattern, String, StartPosition)
            offset = termregex.match(String, StartPosition)
            if offset is not None:
                g = offset.group
                for (term, regex, flag, fn) in self.termlist:
                    test = g(term)
                    if test:
                        #print "terminal", test
                        if fn is not None:
                            value = fn(test)
                        else:
                            value = test
                        result = self.lastresult = (
                            (flag, value),
                            offset.end() - offset.start() + totalOffset)
                        return result
            # error if we get here
            info = DumpStringWindow(String, StartPosition)
            raise LexTokenError, "Lexical token not found " + info

    def isCaseSensitive(self):
        return not self.keywordmap.caseInsensitive

    def compile(self):
        from string import joinfields
        skipregexen = self.commentstrings + [WHITERE]
        skipregex = "(" + joinfields(skipregexen, ")|(") + ")"
        #print skipregex; import sys; sys.exit(1)
        self.skipprog = re.compile(skipregex)
        termregexen = []
        termnames = []
        for (term, rgex, flag, fn) in self.termlist:
            fragment = "(?P<%s>%s)" % (term, rgex)
            termregexen.append(fragment)
            termnames.append(term)
        termregex = joinfields(termregexen, "|")
        self.termregex = re.compile(termregex)
        self.termnames = termnames

# use the experimental implementation
LexDictionary = lexdictionary ##### test!
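# A minimal usage sketch of the lexical dictionary (illustrative only;
# the terminal name "integer" and the sample text are made up):
if RUNTESTS:
    _LD = LexDictionary()
    _begin = _LD.keyword("begin")     # token (KEYFLAG, "begin")
    _semi = _LD.punctuation(";")
    _LD.comment("--.*\n")             # ansi/sql style comments
    _integer = _LD.terminal("integer", "[0-9]+", string.atoi)
    # Token returns (token, skip) where skip counts all characters
    # consumed, including leading whitespace and comments:
    print _LD.Token("begin 42; -- done\n", 0)
    #   prints (((KEYFLAG, 'begin'), 'begin'), 5)
    print _LD.Token("begin 42; -- done\n", 5)
    #   prints (((TERMFLAG, 'integer'), 42), 3)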
# a utility class: dictionary of prefixes
#   should be generalized to allow upcasing of keyword matches
class KeywordDict:

    def __init__(self, caseInsensitive=0):
        self.FirstcharDict = {}
        self.KeyDict = {}
        self.caseInsensitive = caseInsensitive

    def Dump(self):
        if self.caseInsensitive:
            print " case insensitive"
        else:
            print " case sensitive"
        keys = self.KeyDict.keys()
        print " keyDict has ", len(keys), " elts"
        for key in keys:
            print "  ", key, " maps to ", self.KeyDict[key]
        firstchars = self.FirstcharDict.keys()
        print " firstcharDict has ", len(firstchars), " elts"
        for char in firstchars:
            print "  ", char, " maps to ", self.FirstcharDict[char]

    # setitem assumes value has correct case already, if case sensitive
    def __setitem__(self, key, value):
        if len(key) < 1:
            raise LexTokenError, "Keyword of length 0"
        if self.caseInsensitive:
            KEY = string.upper(key)
        else:
            KEY = key
        firstchar = KEY[0:1]
        if self.FirstcharDict.has_key(firstchar):
            self.FirstcharDict[firstchar] = \
                self.FirstcharDict[firstchar] + [(KEY, value)]
        else:
            self.FirstcharDict[firstchar] = [(KEY, value)]
        self.KeyDict[KEY] = value

    # if String has a registered keyword at start position
    # return its canonical representation and offset, else 0.
    # keywords that are not punctuations should be
    # recognized only if followed
    # by a punctuation or whitespace char
    #
    def hasPrefix(self, String, StartPosition, punctuationlist):
        First = String[StartPosition:StartPosition+1]
        fcd = self.FirstcharDict
        caseins = self.caseInsensitive
        if caseins:
            First = string.upper(First)
        if fcd.has_key(First):
            Keylist = fcd[First]
        else:
            return 0
        for (key, value) in Keylist:
            offset = len(key)
            EndPosition = StartPosition + offset
            match = String[StartPosition : EndPosition]
            if caseins:
                match = string.upper(match)
            if key == match:
                if len(key) == 1 and key in punctuationlist:
                    # punctuations are recognized regardless of nextchar
                    return (value, offset)
                else:
                    # nonpuncts must have punct or whitespace following
                    # (uses punct as single char convention)
                    if EndPosition == len(String):
                        return (value, offset)
                    else:
                        nextchar = String[EndPosition]
                        if nextchar in string.whitespace \
                           or nextchar in punctuationlist:
                            return (value, offset)
        return 0 # if no exit inside for loop, fail

    def __getitem__(self, key):
        if self.caseInsensitive:
            key = string.upper(key)
        return self.KeyDict[key]

    def has_key(self, key):
        if self.caseInsensitive:
            key = string.upper(key)
        return self.KeyDict.has_key(key)
#endclass KeywordDict

# LexStringWalker walks through a string looking for
# substrings recognized by a lexical dictionary
#
# ERROR REPORTING NEEDS IMPROVEMENT
class LexStringWalker:

    def __init__(self, String, LexDict):
        self.Position = 0
        self.NextPosition = 0
        self.String = String
        self.LexDict = LexDict
        self.PastEOF = 0
        self.Done = 0

    def DUMP(self):
        return DumpStringWindow(self.String, self.Position)

    #reset not defined

    def more(self):
        return not self.PastEOF

    def getmember(self):
        (Token, skip) = self.LexDict.Token(self.String, self.Position)
        self.NextPosition = self.Position + skip
        if Token == ENDOFFILETERM:
            self.PastEOF = 1
        return Token

    def next(self):
        if self.Done:
            data = self.DUMP()
            raise LexTokenError, "no next past end of file " + data
        elif self.PastEOF:
            self.Done = 1
        elif self.NextPosition > self.Position:
            self.Position = self.NextPosition
        else:
            dummy = self.getmember()
            if self.NextPosition <= self.Position:
                data = self.DUMP()
                raise LexTokenError, "Lexical walker not advancing " + data
            self.Position = self.NextPosition
#endclass LexStringWalker

# the parse class:
#   Based loosely on Aho+Ullman, Principles of Compiler Design, Ch.6,
#   except that they don't describe how to handle boundary
#   conditions, so I made them up myself.
#
# Note: This could be implemented using just functions; it's implemented
#   as a class to facilitate diagnostics and debugging in case of
#   failures of various sorts.
#
# a parse accepts
#   a rule list
#
#   a lexically analysed stream with methods
#     stream.getmember() returns the current token on the stream
#     stream.next() moves on to next token
#     stream.more() returns false if current token is the last token
#
#   and a FSM (finite state machine) with methods
#     FSM.root_nonTerminal
#       the nonterminal at which to start parsing
#     FSM.initial_state
#       the initial state to start at
#     FSM.successful_final_state
#       the final state to go to upon successful parse
#     FSM.map(Current_State, Current_Token)
#       returns either
#         (TERMFLAG, 0)
#           if Current_State is terminal (final or reduction).
#         (NOMATCHFLAG, 0)
#           if Current_State is nonterminal, but the Current_Token
#           and Next_Token do not lead to a valid state in the FSM
#         (MOVETOFLAG, Next_State)
#           if Current_State is nonterminal and Current_Token,
#           Next_token map to Next_State from Current_State.
#         (REDUCEFLAG, Rulenum)
#           if Current_State indicates a reduction at Current_Token
#           for rule number Rulenum
#
#   and a Stack with methods (the parser now uses a plain Python
#     list; the old dictionary representation is shown in parens)
#    (init: {-1:0} )
#     Stack.Top() returns top of stack (no pop)
#        ( Stack[Stack[-1]] )
#     Stack.Push(Object)
#        ( Stack[-1]=Stack[-1]+1; Stack[Stack[-1]]=Object )
#     Stack.MakeEmpty()
#        ( Stack[-1]=0 )
#     Stack.IsEmpty()
#        ( Stack[-1] == 0 )
#     Stack.Pop()
#        ( Stack[-1] = Stack[-1]-1 )
#     stack contents created by Parser will be of form (State, Value)
#     where Value was inserted at FSM state State.
#     Value is of form either (KEYFLAG, Name)
#                          or (NontermName, reductionvalue)
#                          or (TerminalName, value)
#
#   and an optional parameter Evaluate which if false indicates that
#     rules should not be evaluated, just reduced, and the reduction
#     structure should be used as the result of the rule
#
# rule objects must support methods
#   Rule.reduce(Stack)
#     pops off the elements corresponding to the body of the Rule
#     from the stack and returns (NewStack, Red) where NewStack is
#     the stack minus the body and Red is the result of evaluating the
#     reduction function on this instance of the rule.
#   Rule.Nonterm
#     the nonterminal at the head of the rule

class ParserObj:

    # Evaluate determines whether rules should be evaluated
    # after reductions.  Context is an argument passed to the
    # rule reduction functions
    #
    def __init__(self, Rulelist, Stream, FSM, Stack,
                 Evaluate=1,
                 Context=None):
        self.Rules = Rulelist
        self.LexStream = Stream
        self.FSM = FSM
        self.Stack = Stack
        self.Context = Context
        # start with empty stack, initial_state, no nonterminal
        #self.Stack[-1] = 0 # self.Stack.MakeEmpty()
        self.Stack[:] = []
        self.State = FSM.initial_state
        self.currentNonterm = None
        self.Evaluate = Evaluate

    # DoOneReduction accepts tokens from the stream and pushes
    # them onto the stack until a reduction state is reached,
    # then resolves the reduction.
    #
    def DoOneReduction(self):
        current = self.State
        FSM = self.FSM
        Stack = self.Stack
        Context = self.Context
        Stream = self.LexStream
        # the internal FSM.StateTokenMap dictionary is used directly here.
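        # (each StateTokenMap entry is a tuple of (flag, argument)
        #  alternatives; as in FSM.map, only the first one is used)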
        STMap = FSM.StateTokenMap
        #if FSM.final_state(current):
        #   raise ParseInitError, 'trying to reduce starting at final state'
        tokenVal = Stream.getmember()
        #print "tokenVal", tokenVal
        token = tokenVal[0]
        # push the token and traverse FSM until terminal state is reached
        #(flag, nextThing) = FSM.map(current, token)
        key = (current, token)
        try:
            (flag, nextThing) = STMap[key][0]
        except KeyError:
            flag = NOMATCHFLAG
        while flag == MOVETOFLAG:
            nextState = nextThing
            #print current, " shift ", token,
            # no sanity check, possible infinite loop
            # push current token and next state
            ThingToPush = (nextState, tokenVal)
            #print "pushing ", ThingToPush
            #Stack[-1]=Stack[-1]+1; Stack[Stack[-1]]=ThingToPush
            Stack.append(ThingToPush)
            #Stack.Push( ThingToPush )
            # move to next token, next state
            Stream.next()
            # error if end of stream
            if not Stream.more(): # optimized Stream.PastEOF (?)
                data = Stream.DUMP()
                raise EOFError, 'end of stream during parse ' + data
            current = nextState
            tokenVal = Stream.getmember()
            token = tokenVal[0]
            #MAP = FSM.map(current,token)
            key = (current, token)
            try:
                (flag, nextThing) = STMap[key][0]
            except KeyError:
                flag = NOMATCHFLAG
        # at end of while loop we should be at a reduction state
        if flag == REDUCEFLAG:
            rulenum = nextThing
            #print current, " reduce ", token, self.Rules[rulenum]
            # normal case: perform the reduction
            rule = self.Rules[rulenum]
            Nonterm = rule.Nonterm
            self.currentNonterm = Nonterm
            (Stack, reduct) = rule.reduce(Stack, Context)
            # self.Stack = Stack # not needed, unless stack rep changes
            GotoState = self.GotoState(rule)
            # push the Gotostate and result of rule reduction on stack
            ThingToPush = (GotoState, (Nonterm, reduct))
            # push the result of the reduction and exit normally
            #print "pushing ", ThingToPush
            #Stack[-1]=Stack[-1]+1; Stack[Stack[-1]]=ThingToPush
            Stack.append(ThingToPush)
            #Stack.Push(ThingToPush)
            self.State = GotoState
            return 1 # normal successful completion
        # some error cases
        elif flag == NOMATCHFLAG:
            self.ParseError(current, tokenVal, "nomatch1")
        #elif FSM.final_state(current):
        #   raise BadFinalError, 'unexpected final state reached in reduction'
        else:
            data = Stream.DUMP()
            s = """
            flag = %s
            map = %s
            """ % (flag, FSM.map(current, token))
            data = data + s
            raise FlowError, 'unexpected else ' + data
    #enddef DoOneReduction

    # compute the state to goto after a reduction is performed
    # on a rule.
    # Algorithm: determine the state at beginning of reduction
    #   and the next state indicated by the head nonterminal of the rule.
    #   special case: empty stack and root nonterminal --> success.
    #
    def GotoState(self, rule):
        FSM = self.FSM
        Stack = self.Stack
        Head = rule.Nonterm
        if len(Stack) == 0: #Stack[-1]==0: #Stack.IsEmpty():
            BeforeState = FSM.initial_state
        else:
            BeforeState = Stack[-1][0] #Stack[Stack[-1]][0] #Stack.Top()[0]
        # is this right? if the stack is empty and the Head
        # is the root nonterm, then goto is final state
        if len(Stack) == 0 and Head == FSM.root_nonTerminal: #Stack.isEmpty()
            Result = FSM.successful_final_state
        else:
            # consider eliminating the call to .map here? (efficiency)
            (flag, Result) = FSM.map(BeforeState, Head)
            if flag != MOVETOFLAG:
                #FSM.DUMP()
                self.ParseError(BeforeState, Head, "notmoveto")
        return Result

    def ParseError(self, State, Token, *rest):
        # make this parse error nicer (add diagnostic methods?)
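        # build a message listing the tokens expected at this state
        # and the offending token, then raise SyntaxError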
L = [""] L.append("*******************************") L.append("current state = "+`State`) L.append("expects: ") expects = "" for (flag,name) in self.FSM.Expects(State): if flag in (TERMFLAG, KEYFLAG): expects = expects + `name`+ ", " L.append(expects) L.append(`rest`) L.append("current token = " + `Token`) #print "Stack =", #self.StackDump(5) #print from string import join data = self.LexStream.DUMP() + join(L, "\n") raise SyntaxError, 'unexpected token sequence.' + data def StackDump(self, N): Stack = self.Stack Topkey = len(Stack) if Topkey>N: Start = Topkey - N else: Start = 1 for i in range(Start,Topkey+1): print " :: ", Stack[i], # execute parsing until done: def GO(self): while self.State != self.FSM.successful_final_state: #self.FSM.final_state(self.State): self.DoOneReduction() # should I check that stack has only one elt here? # return result of last reduction return self.Stack[-1][1] #self.Stack.Top()[1] #endclass ParserObj # function for declaring a variable to represent a nonterminal: # eg Program = nonterminal("program") # included for convenient autodocumentation # def nonterminal(string): return (NONTERMFLAG, string) # declaring a terminal WITHOUT INSTALLING IT IN A LexDict def termrep(string): return (TERMFLAG, string) # the rule class # a rule is defined by a goal nonterminal marker of form # (NONTERMFLAG, Name) # and a list defining the body which must contain elts of form # (KEYFLAG, Name) or (NONTERMFLAG, Name) of (TERMFLAG, Name) # and a reduction function which takes a list of the same size # as the BodyList (consisting of the results of the evaluations of # the previous reductions) # and returns an interpretation for the body # the following function is used as a default reduction function # for rules def DefaultReductFun( RuleResultsList, Context ): if WARNONDEFAULTS: print "warn: default reduction." print " ", RuleResultsList return RuleResultsList class ParseRule: def __init__(self, goalNonTerm, BodyList, \ ReductFunction = DefaultReductFun): #print BodyList # check some of the arguments (very limited!) if len(goalNonTerm) != 2 or goalNonTerm[0] != NONTERMFLAG: raise TypeError, "goal of rule must be nonterminal" for m in BodyList: #print m if len(m) != 2: raise TypeError, "invalid body form for rule" self.Nonterm = goalNonTerm self.Body = BodyList self.ReductFun = ReductFunction # for dumping/reconstruction: LOSES THE INTERPRETATION FUNCTION! def __repr__(self): return THISMODULE + ".ParseRule" + `self.components()` # marshal-able components of a rule def components(self): return (self.Nonterm, self.Body) # rule.reduce(Stack) pops of the stack elements corresponding # to the body of the rule and prepares the appropriate reduction # object for evaluation (or not) at higher levels # def reduce(self, Stack, Context=None): #print "reducing", Stack Blength = len(self.Body) #print Blength, len(self.Body) # pop off previous results from stack corresponding to body BodyResults = [None] * Blength #BodyNames = [None] * Blength # for debug #print "popping: " for i in range(1,Blength+1): Bindex = Blength - i # stack contents pop off in reverse order # get and destructure the rule body entry RuleEntry = self.Body[Bindex] ( REkind , REname ) = RuleEntry # get and destructure the stack entry PoppedValue = Stack[-i] #Stack.Top() #print PoppedValue, #del Stack[-1]# = Stack[-1]-1 #Stack.Pop() SETokVal = PoppedValue[1] SEvalue = SETokVal[1] SEname = SETokVal[0][1] # the names from rule and stack must match (?) 
            if SEname != REname:
                print SEname, REname
                print self
                raise ReductError, " token names don't match"
            # store the values for the reduction
            BodyResults[Bindex] = SEvalue
            #BodyNames[Bindex] = SEname # debug
        #endfor
        del Stack[len(Stack) - Blength:]
        #print "reduced", Stack
        #print
        # evaluate the reduction, in context
        reduct = self.ReductFun(BodyResults, Context)
        if WARNONDEFAULTS and self.ReductFun is DefaultReductFun:
            # should check whether name is defined before this...
            print "  default used on ", self.Name
        #Reduction( self.ReductFun, BodyResults, BodyNames )
        return (Stack, reduct)
    #enddef ParseRule.reduce
#endclass ParseRule

# for debugging: look through a rule list
# and print names of rules that have default binding
#
def PrintDefaultBindings(rulelist):
    for r in rulelist:
        if r.ReductFun is DefaultReductFun:
            print r.Name

# the FSM class
#
class FSMachine:

    def __init__(self, rootNonTerm):
        # start and success state conventions
        startState = 1
        successState = 0
        self.root_nonTerminal = rootNonTerm
        self.initial_state = startState
        self.successful_final_state = successState
        # the list of states of the FSM, implemented as a dictionary
        #   entries are identified by their index
        #   content is
        #     a list whose first elt is either TRANSFLAG, or TERMFLAG
        #     other list elts may be added by other layers (parse generator)
        #     indicating the kind of the state.
        self.States = {}
        # allocate start and success states
        self.States[startState] = [TRANSFLAG]
        self.States[successState] = [TERMFLAG]
        # the most recently allocated state
        self.maxState = startState
        # the map of current state+token to next state,
        #   with entries of form (state, tokenname): nextstate_sequence
        #
        self.StateTokenMap = {}
    #enddef FSM()

    # ForbiddenMark is for filtering out maps to an error state
    def DUMP(self, DumpMapData=1, DumpStateData=1, ForbiddenMark={}):
        print "root nonterminal is ", self.root_nonTerminal
        print "start at ", self.initial_state
        print "end at ", self.successful_final_state
        print "number of states: ", self.maxState
        if DumpStateData:
            print
            for State in range(0, self.maxState + 1):
                Data = self.States[State]
                print State, ": ", Data
        if DumpMapData:
            print
            for key in self.StateTokenMap.keys():
                map = self.StateTokenMap[key]
                if map[0][0] == MOVETOFLAG:
                    ToStateData = self.States[map[0][1]]
                    if len(ToStateData) < 2:
                        Mark = None
                    else:
                        Mark = ToStateData[1]
                    if Mark != ForbiddenMark:
                        print key, " --> ", map, " = ", ToStateData
                else:
                    print key, " --> reduction to rule number ", map[0][1]

    # what tokens does a state expect?
    def Expects(self, State):
        keys = self.StateTokenMap.keys()
        Tokens = kjSet.NewSet([])
        for (state1, token) in keys:
            if State == state1:
                kjSet.addMember(token, Tokens)
        return kjSet.get_elts(Tokens)

    # "allocate" a new state of specified kind
    #   kind must be TRANSFLAG, TERMFLAG or REDUCEFLAG
    # returns the number of the new state
    def NewState(self, kind, AdditionalInfo=[]):
        if not kind in (TRANSFLAG, TERMFLAG, REDUCEFLAG):
            raise TypeError, "unknown state kind"
        available = self.maxState + 1
        self.States[available] = [kind] + AdditionalInfo
        self.maxState = available
        return available

    # Install a reduction transition in the FSM:
    # a reduction is represented by mapping to a rule index
    # no nondeterminism is allowed.
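    # (e.g. SetReduction(5, t, 2) installs
    #    StateTokenMap[(5, t)] = ((REDUCEFLAG, 2),)
    #  for a hypothetical state 5, token representation t
    #  and rule number 2)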
    def SetReduction(self, fromState, TokenRep, Rulenum):
        key = (fromState, TokenRep)
        if not self.StateTokenMap.has_key(key):
            self.StateTokenMap[key] = ((REDUCEFLAG, Rulenum),)
        else:
            raise ReductError, "attempt to set ambiguous reduction"

    # Install a "shift" or "goto" transition in the FSM:
    # supports nondeterminism by storing a sequence of possible transitions
    #
    def SetMap(self, fromState, TokenRep, toState):
        key = (fromState, TokenRep)
        if self.StateTokenMap.has_key(key):
            Old = self.StateTokenMap[key]
            if Old[0][0] != MOVETOFLAG:
                # if the old value was not a "shift" to a normal state
                # (eg a reduction), complain:
                raise NondetError, \
                    "attempt to make inappropriate transition ambiguous"
            self.StateTokenMap[key] = Old + ((MOVETOFLAG, toState),)
        else:
            self.StateTokenMap[key] = ((MOVETOFLAG, toState),)

    # Find the action indicated by fsm on
    #   (current_state, current_token) input.
    #
    # note: in the event of nondeterministic choice this chooses
    #   the first possibility listed.
    # ParseObj.DoOneReduction() currently uses the internal structure
    #   of StateTokenMap directly, rather than using this function.
    #
    def map(self, current_state, current_token):
        StateEntry = self.States[current_state][0]
        if StateEntry == TERMFLAG:
            return (TERMFLAG, 0)
        elif StateEntry == TRANSFLAG:
            # try to find a transition for this token and state
            key = (current_state, current_token)
            try:
                TMap = self.StateTokenMap[key]
                #print "TMap ", TMap
                #print "key ", key
                #print
                return TMap[0]
            except KeyError:
                return (NOMATCHFLAG, 0)
        else:
            raise FlowError, "unexpected else (2)"
    #enddef map
#endclass FSMachine

# the grammar class:
#   a grammar consists of
#    - a LexDict lexical dictionary;
#    - a deterministic FSMachine;
#    - a Rulelist
#   and optionally a dictionary that maps Rulenames
#   to Rulelist indices (used for dumping and externally)
#
class Grammar:

    def __init__(self, LexD, DFA, RuleL, RuleNameDict=None):
        # for auto initialization set LexD, DFA, RuleL to None
        if LexD == None and DFA == None and RuleL == None:
            self.LexD = LexDictionary()
            # use a dummy root nonterminal -- must fix elsewhere!
            self.DFA = FSMachine("ERROR")
            self.RuleL = []
        else:
            self.LexD = LexD
            self.DFA = DFA
            self.RuleL = RuleL
        if RuleNameDict != None:
            self.AddNameDict(RuleNameDict)
        self.CleanUp()
    #enddef __init__

    # look for default bindings
    def PrintDefaults(self):
        print "Default bindings on:"
        PrintDefaultBindings(self.RuleL)

    # setting case sensitivity: must happen before keyword installation
    # in LexD.
    def SetCaseSensitivity(self, Boolean):
        self.LexD.SetCaseSensitivity(Boolean)

    # this may be silly, but to save some space in construction
    # a token dictionary may be used that facilitates sharing of
    # token representations.  This method either initializes
    # the dictionary or disposes of it if it exists
    def CleanUp(self):
        self.IndexToToken = {}
        # this dictionary is used by automatically
        # generated grammars to determine whether
        # a string represents a nonterminal
        self.NonTermDict = {}
        # similarly for terminals
        self.TermDict = {}
        # this string may be used to keep a printable
        # representation of the rules of the grammar
        # (usually in automatic grammar generation)
        self.RuleString = ""
        # to associate a token to an integer use
        #   self.IndexToToken[int] = tokenrep

    # this method associates rules to names using a
    # RuleNameDict dictionary which maps names to rule indices.
    # after invocation
    #   self.RuleNameToIndex[name] gives the index
    #   in self.RuleL for the rule associated with name, and
    #   self.RuleL[index].Name gives the name associated
    #   with the rule self.RuleL[index]
    #
    def AddNameDict(self, RuleNameDict):
        self.RuleNameToIndex = RuleNameDict
        # add a Name attribute to the rules of the rule list
        for ruleName in RuleNameDict.keys():
            index = RuleNameDict[ruleName]
            self.RuleL[index].Name = ruleName

    # parse a string using the grammar, return result and context
    def DoParse(self, String, Context=None, DoReductions=1):
        # construct the ParserObj
        Stream = LexStringWalker(String, self.LexD)
        Stack = [] # {-1:0} #Walkers.SimpleStack()
        ParseOb = ParserObj(self.RuleL, Stream, self.DFA, Stack,
                            DoReductions, Context)
        # do the parse
        ParseResult = ParseOb.GO()
        # return final result of reduction and the context
        return (ParseResult[1], Context)
    #enddef DoParse

    # parse a string using the grammar, but only return
    # the result of the last reduction, without the context
    def DoParse1(self, String, Context=None, DoReductions=1):
        return self.DoParse(String, Context, DoReductions)[0]

    # if the Name dictionary has been initialized
    # this method will (re)bind a reduction function to
    # a rule associated with Rulename
    #
    def Bind(self, Rulename, NewFunction):
        ruleindex = self.RuleNameToIndex[Rulename]
        rule = self.RuleL[ruleindex]
        rule.ReductFun = NewFunction
    #enddef Bind

    # bind a terminal to a regular expression and interp function
    # in the lexical dictionary (convenience)
    def Addterm(self, termname, regexpstr, funct):
        self.TermDict[termname] = \
            self.LexD.terminal(termname, regexpstr, funct)
#endclass Grammar

# function to create a "null grammar"
def NullGrammar():
    return Grammar(None, None, None, {})

# unmarshalling a marshalled grammar created by
#   buildmodule.CGrammar.MarshalDump(Tofile)
# tightly coupled with buildmodule code...
# file should be open and "pointing to" the marshalled rep.
#
# warning: doesn't bind semantics!
#
def UnMarshalGram(file):
    Grammar = NullGrammar()
    UnMarshal = UnMarshaller(file, Grammar)
    UnMarshal.MakeLex()
    UnMarshal.MakeRules()
    UnMarshal.MakeTransitions()
    UnMarshal.Cleanup()
    return UnMarshal.Gram

# unmarshalling object for unmarshalling grammar from a file
#
class UnMarshaller:

    def __init__(self, file, Grammar):
        import marshal
        self.Gram = Grammar
        BigList = marshal.load(file)
        if type(BigList) != type([]):
            raise FlowError, "bad type for unmarshalled list"
        if len(BigList) != 9:
            raise FlowError, "unmarshalled list of wrong size"
        self.tokens = BigList[0]
        self.punct = BigList[1]
        self.comments = BigList[2]
        self.RuleTups = BigList[3]
        self.MaxStates = BigList[4]
        self.reducts = BigList[5]
        self.moveTos = BigList[6]
        self.Root = BigList[7]
        self.CaseSensitivity = BigList[8]
        Grammar.SetCaseSensitivity(self.CaseSensitivity)

    def MakeLex(self):
        Grammar = self.Gram
        LexD = Grammar.LexD
        # punctuations
        LexD.punctuationlist = self.punct
        # comments
        for commentregex in self.comments:
            LexD.comment(commentregex)
        #LexD.commentstring = self.comments
        # keywords, terminals, nonterms
        #   rewrite the tokens list for sharing and extra safety
        LexTokens = {}
        tokens = self.tokens
        for tokenindex in range(len(tokens)):
            (kind, name) = tokens[tokenindex]
            if kind == KEYFLAG:
                tokens[tokenindex] = LexD.keyword(name)
            elif not kind in [TERMFLAG, NONTERMFLAG]:
                raise FlowError, "unknown token type"
        # not needed
        self.tokens = tokens

    def MakeRules(self):
        Grammar = self.Gram
        Grammar.DFA.root_nonTerminal = self.Root
        NameIndex = Grammar.RuleNameToIndex
        RuleTuples = self.RuleTups
        nRules = len(RuleTuples)
        RuleList = [None] * nRules
        for index in range(nRules):
            (Name, Components) = RuleTuples[index]
            rule = apply(ParseRule, Components)
            rule.Name = Name
            RuleList[index] = rule
            NameIndex[Name] = index
        Grammar.RuleL = RuleList

    def MakeTransitions(self):
        Grammar = self.Gram
        DFA = Grammar.DFA
        StateTokenMap = DFA.StateTokenMap
        tokens = self.tokens
        # record the state number
        DFA.maxState = self.MaxStates
        # this is historical, unfortunately... CLEAN IT UP SOMEDAY!
        # THE DFA.States DICT IS NOT NEEDED (?) (here)
        for state in range(1, self.MaxStates + 1):
            DFA.States[state] = [TRANSFLAG]
        # record the reductions
        for (fromState, TokenIndex, rulenum) in self.reducts:
            DFA.SetReduction(fromState, tokens[TokenIndex], rulenum)
        # record the transitions
        for (fromState, TokenIndex, ToState) in self.moveTos:
            DFA.SetMap(fromState, tokens[TokenIndex], ToState)

    def Cleanup(self):
        Grammar = self.Gram
        Grammar.CleanUp()

################# FOLLOWING CODE IS FOR REGRESSION TESTING ONLY
################# DELETE IT IF YOU WANT/NEED
#### All tests for this module deleted, since
#### ParseBuild module tests are sufficient.
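# A typical use of this module (an illustrative sketch only: the
# filename "grammar.mar", the rule name "statement" and the function
# MyReduction are hypothetical, and the file must contain a grammar
# marshalled by buildmodule.CGrammar.MarshalDump):
#
#   f = open("grammar.mar", "rb")
#   GRAM = UnMarshalGram(f)
#   f.close()
#   GRAM.Bind("statement", MyReduction)  # rebind semantics by rule name
#   (result, context) = GRAM.DoParse("some input string")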