"""implement the parser for pyexpander.
"""
import bisect
import re
__version__= "2.1.2" #VERSION#
# pylint: disable= invalid-name consider-using-f-string
# We use '\n' as the line separator and rely on Python's built-in line-end
# conversion to '\n' on all platforms.
# You may, however, use change_linesep() to change the line separator.
LINESEP = "\n"
LINESEP_LEN = len(LINESEP)


def change_linesep(sep):
    """Rebind the module-wide line separator to `sep`.

    LINESEP_LEN is refreshed together with LINESEP. This hook is here
    just for tests.
    """
    # pylint: disable= global-statement
    global LINESEP, LINESEP_LEN
    LINESEP = sep
    LINESEP_LEN = len(LINESEP)
class IndexedString:
    """Wrap a string so that positions can be mapped to (row, column).

    Rows and columns are both counted from 1. The table of line starts is
    built lazily, on the first call of rowcol().

    Here is an example:

    >>> txt='''01234
    ... 67
    ... 9abcd'''
    >>> l=IndexedString(txt)
    >>> l.rowcol(0)
    (1, 1)
    >>> l.rowcol(1)
    (1, 2)
    >>> l.rowcol(4)
    (1, 5)
    >>> l.rowcol(5)
    (1, 6)
    >>> l.rowcol(6)
    (2, 1)
    >>> l.rowcol(7)
    (2, 2)
    >>> l.rowcol(8)
    (2, 3)
    >>> l.rowcol(9)
    (3, 1)
    >>> l.rowcol(13)
    (3, 5)
    >>> l.rowcol(14)
    (3, 6)
    >>> l.rowcol(16)
    (3, 8)
    """
    # pylint: disable= too-few-public-methods

    def __init__(self, st):
        self._st = st
        # both tables are filled by _list() when rowcol() is first used
        self._lines = None
        self._positions = None

    def _list(self):
        """Build the parallel tables of line numbers and line start offsets."""
        text = self._st
        total = len(text)
        self._lines = [1]
        self._positions = [0]
        hit = text.find(LINESEP)
        while hit >= 0:
            line_start = hit + LINESEP_LEN
            if line_start >= total:
                # the separator ends the string, no further line starts
                break
            self._lines.append(len(self._lines) + 1)
            self._positions.append(line_start)
            hit = text.find(LINESEP, line_start)

    def rowcol(self, pos):
        """Return the 1-based (row, column) tuple for string index `pos`."""
        if self._lines is None:
            self._list()
        # the rightmost line start that is <= pos owns this position
        line_idx = bisect.bisect_right(self._positions, pos) - 1
        line_start = self._positions[line_idx]
        return (self._lines[line_idx], pos - line_start + 1)

    def st(self):
        """Return the wrapped raw string."""
        return self._st

    def __str__(self):
        return "IndexedString(...)"

    def __repr__(self):
        # Note: this prints the whole wrapped string, which may be a
        # complete input file. Return "IndexedString(...)" here instead
        # if a shorter repr is wanted.
        wrapped = repr(self._st)
        return "IndexedString(%s)" % wrapped
class ParseException(Exception):
    """Exception type raised by the parsing functions of this module.

    `value` is the message text. `pos` is an optional string position and
    `rowcol` an optional (row, column) pair; both are kept as attributes
    and are used by __str__ to locate the error.
    """

    def __init__(self, value, pos=None, rowcol=None):
        super().__init__(value, pos, rowcol)
        self.value = value
        self.pos = pos
        self.rowcol = rowcol

    def __str__(self):
        """Return the message, followed by the most precise location known."""
        if self.rowcol is None and self.pos is None:
            return "%s" % self.value
        if self.rowcol is None:
            return "%s position: %d" % (self.value, self.pos)
        # a row/column pair takes precedence over a plain position
        row = self.rowcol[0]
        col = self.rowcol[1]
        return "%s line %d, col %d" % (self.value, row, col)
rx_pyIdent = re.compile(r'([A-Za-z_][\w\.]*)$')
rx_csv = re.compile(r'\s*,\s*')


def scanPyIdentList(st):
    """Split a comma separated list of python identifiers.

    Whitespace around the commas is dropped. Each element must start with
    a letter or underscore; the remaining characters may be word
    characters or dots. The list of elements is returned, a
    ParseException is raised when an element does not qualify.

    Here are some examples:

    >>> scanPyIdentList("a,b")
    ['a', 'b']
    >>> scanPyIdentList("a,b.d, c")
    ['a', 'b.d', 'c']
    >>> scanPyIdentList("a,b.d, c&")
    Traceback (most recent call last):
        ...
    ParseException: list of python identifiers expected
    """
    names = rx_csv.split(st)
    if not all(rx_pyIdent.match(name) for name in names):
        raise ParseException("list of python identifiers expected")
    return names
rx_py_in = re.compile(r'^\s*(.*?)\s*\b(in)\b\s*(.*?)\s*$')


def scanPyIn(st):
    """Split a python "in" expression into its three parts.

    Returns a tuple of the text left of the keyword, the keyword "in" and
    the text right of it, each without surrounding whitespace. A
    ParseException is raised when `st` contains no "in" keyword.

    Here are some examples:

    >>> scanPyIn(" (a,b) in k.items() ")
    ('(a,b)', 'in', 'k.items()')
    """
    found = rx_py_in.match(st)
    if found is not None:
        return found.groups()
    raise ParseException("python \"in\" expression expected")
rx_bracketed = re.compile(r'\{[A-Za-z_]\w*\}')


def parseBracketed(idxst,pos):
    """Parse an identifier enclosed in curly brackets starting at `pos`.

    Returns the tuple (pos, end) where end is the index of the first
    character after the closing bracket. A ParseException carrying the
    row and column of `pos` is raised when no bracketed identifier starts
    there.

    Here are some examples:

    >>> def test(st,pos):
    ...     idxst= IndexedString(st)
    ...     (a,b)= parseBracketed(idxst,pos)
    ...     print(st[a:b])
    ...
    >>> test(r'{abc}',0)
    {abc}
    >>> test(r'{ab8c}',0)
    {ab8c}
    >>> test(r'{c}',0)
    {c}
    >>> test(r'{}',0)
    Traceback (most recent call last):
        ...
    ParseException: command enclosed in curly brackets at line 1, col 1
    >>> test(r'{abc',0)
    Traceback (most recent call last):
        ...
    ParseException: command enclosed in curly brackets at line 1, col 1
    >>> test(r'x{ab8c}',1)
    {ab8c}
    """
    if not isinstance(idxst, IndexedString):
        raise TypeError("idxst par wrong: %s" % repr(idxst))
    found = rx_bracketed.match(idxst.st(), pos)
    if found is not None:
        return (pos, found.end())
    raise ParseException("command enclosed in curly brackets at",
                         rowcol=idxst.rowcol(pos))
# from python 3 documentation:
# stringprefix    ::=  "r" | "u" | "R" | "U" | "f" | "F"
#                      | "fr" | "Fr" | "fR" | "FR" | "rf" | "rF" | "Rf" | "RF"
rx_StringLiteralStart= re.compile(r'''(fr|fR|Fr|FR|rf|rF|Rf|RF|f|F|r|R|u|U|)("""|''' + \
                                  """'''""" + \
                                  r'''|'|")''')


def parseStringLiteral(idxst,pos):
    r"""parse a python string literal.

    The literal must start exactly at `pos`, with an optional string
    prefix followed by one of the four quote forms.

    returns 2 numbers, the index where the string starts and
    the index of the first character *after* the string

    A closing quote candidate is skipped when it is preceded by an odd
    number of backslashes, since then the last backslash escapes the
    quote. With an even number the backslashes escape each other and the
    quote ends the literal.

    Raises TypeError when `idxst` is not an IndexedString and
    ParseException when no literal starts at `pos` or when its end is not
    found.

    Here are some examples:

    >>> def test(st,pos):
    ...     idxst= IndexedString(st)
    ...     (a,b)= parseStringLiteral(idxst,pos)
    ...     print(st[a:b])
    ...
    >>> test(r'''"abc"''',0)
    "abc"
    >>> test("'''ab'c'd'''",0)
    '''ab'c'd'''
    >>> test("'''ab'cd''''",0)
    '''ab'cd'''
    >>> test(r'''F"abc"''',0)
    F"abc"
    >>> test(r'''xF"abc"''',1)
    F"abc"
    >>> test(r'''xFr"abc"''',1)
    Fr"abc"
    >>> test(r'''xFr"ab\\"c"''',1)
    Fr"ab\\"
    >>> test(r'''xFr"ab\"c"''',1)
    Fr"ab\"c"
    >>> test(r'''"ab\\\"c"''',0)
    "ab\\\"c"
    >>> test(r'''xFr"ab\"c"''',0)
    Traceback (most recent call last):
        ...
    ParseException: start of string expected at line 1, col 1
    >>> test(r'''"ab''',0)
    Traceback (most recent call last):
        ...
    ParseException: end of string not found at line 1, col 1
    >>> test(r"'''ab'",0)
    Traceback (most recent call last):
        ...
    ParseException: end of string not found at line 1, col 1
    >>> test(r'''"ab\"''',0)
    Traceback (most recent call last):
        ...
    ParseException: end of string not found at line 1, col 1
    """
    if not isinstance(idxst, IndexedString):
        raise TypeError("idxst par wrong: %s" % repr(idxst))
    st= idxst.st()
    m= rx_StringLiteralStart.match(st,pos)
    if m is None:
        raise ParseException("start of string expected at",
                             rowcol= idxst.rowcol(pos))
    starter= m.group(2) # """ or ''' or " or '
    # index of the first character behind the opening quote(s)
    content_start= m.end()
    search_from= content_start
    while True:
        idx= st.find(starter, search_from)
        # if search_from>len(st), idx is also -1
        if idx==-1:
            raise ParseException("end of string not found at",
                                 rowcol= idxst.rowcol(pos))
        # walk back over the run of backslashes directly before the quote,
        # never leaving the string contents
        run_start= idx
        while run_start>content_start and st[run_start-1]=="\\":
            run_start-= 1
        if (idx-run_start) % 2 == 1:
            # odd number of backslashes: the quote char is escaped
            search_from= idx+1
            continue
        # len(starter) is 1 for simple quotes and 3 for triple quotes
        return (pos, idx+len(starter))
rx_CodePart= re.compile(r'''((?:UR|Ur|uR|ur|r|u|R|U|)(?:"""|''' + """'''""" + \
                        r'''|'|")|#|\(|\))''')


def parseCode(idxst,pos):
    r"""Parse a bracketed piece of python code; `pos` MUST point to a '('.

    String literals, comments and nested brackets inside the code are
    skipped as a whole, so a ')' inside them does not end the expression.
    Returns the tuple (pos, end) where end is the index of the first
    character after the matching ')'. A ParseException is raised when
    `pos` is not at a '(' or when the closing bracket is missing.

    Here are some examples:

    >>> def test(st,pos):
    ...     idxst= IndexedString(st)
    ...     (a,b)= parseCode(idxst,pos)
    ...     print(st[a:b])
    ...
    >>> test(r'(a+b)',0)
    (a+b)
    >>> test(r'(a+(b*c))',0)
    (a+(b*c))
    >>> test(r'(a+(b*c)+")")',0)
    (a+(b*c)+")")
    >>> test(r"(a+(b*c)+''')''')",0)
    (a+(b*c)+''')''')
    >>> test(r"(a+(b*c)+''')'''+# comment )\n)",0)
    Traceback (most recent call last):
        ...
    ParseException: end of bracket expression not found at line 1, col 1
    >>>
    >>> test("(a+(b*c)+''')'''+# comment )\n)",0)
    (a+(b*c)+''')'''+# comment )
    )
    """
    if not isinstance(idxst, IndexedString):
        raise TypeError("idxst par wrong: %s" % repr(idxst))
    text = idxst.st()
    if text[pos] != "(":
        raise ParseException("start of bracket expression not found at",
                             rowcol=idxst.rowcol(pos))
    cursor = pos + 1
    while True:
        hit = rx_CodePart.search(text, cursor)
        if hit is None:
            raise ParseException("end of bracket expression not found at",
                                 rowcol=idxst.rowcol(pos))
        token = hit.group(1)
        token_at = hit.start()
        if token == ")":
            # the bracket that matches the one at pos
            return (pos, token_at + 1)
        # everything else is skipped as a unit by a dedicated parser
        if token == "#":
            skipper = parseComment
        elif token == "(":
            skipper = parseCode
        else:
            # from here it must be a string literal
            skipper = parseStringLiteral
        (_, cursor) = skipper(idxst, token_at)
class ParsedItem:
    """Base class of all parsed items.

    An item refers to a slice of an IndexedString by its first and its
    last index, both inclusive.
    """

    def __init__(self, idxst, start, end):
        if not isinstance(idxst, IndexedString):
            raise TypeError("idxst par wrong: %s" % repr(idxst))
        self._idxst = idxst
        self._start = start
        self._end = end

    def string(self):
        """Return the part of the source string this item covers."""
        source = self._idxst.st()
        # _end is inclusive, hence the +1
        return source[self._start:self._end + 1]

    def start(self):
        """Return the index of the first character of the item."""
        return self._start

    def end(self):
        """Return the index of the last character of the item."""
        return self._end

    def rowcol(self, pos= None):
        """Return (row, column) of `pos`, by default of the item's start."""
        if pos is None:
            pos = self.start()
        return self._idxst.rowcol(pos)

    def positions(self):
        """Return start and end of the item formatted as "(start, end)"."""
        return "(%d, %d)" % (self._start, self._end)

    def __str__(self):
        parts = (self.__class__.__name__,
                 self.positions(),
                 repr(self.string()))
        return "('%s', %s, %s)" % parts

    def __repr__(self):
        parts = (self.__class__.__name__,
                 repr(self._idxst),
                 repr(self._start),
                 repr(self._end))
        return "%s(%s, %s, %s)" % parts
class ParsedLiteral(ParsedItem):
    """A parsed literal.

    A literal is a substring of the input that pyexpander shouldn't
    modify.
    """

    def __init__(self, idxst, start, end):
        super().__init__(idxst, start, end)
class ParsedVar(ParsedItem):
    """A parsed variable.

    In pyexpander a variable is written as "$(identifier)".
    """

    def __init__(self, idxst, start, end):
        super().__init__(idxst, start, end)
class ParsedEval(ParsedItem):
    """A parsed pyexpander expression.

    An expression is written as "$(expression)", e.g. "$(a+1)". In
    contrast to ParsedVar, the text within the brackets is not just a
    simple identifier.
    """

    def __init__(self, idxst, start, end):
        super().__init__(idxst, start, end)
class ParsedPureCommand(ParsedItem):
    """A parsed pyexpander command that has no arguments.

    A pure command is written as "$name". No arguments in round brackets
    immediately follow the name.
    """

    def __init__(self, idxst, start, end):
        super().__init__(idxst, start, end)
class ParsedCommand(ParsedItem):
    """A parsed pyexpander command with arguments.

    A command is written as "$name(argument1, argument2, ...)". The item
    positions refer to the argument text, the command name is kept in the
    attribute `ident`.
    """

    def __init__(self, idxst, start, end, ident):
        super().__init__(idxst, start, end)
        self.ident = ident

    def args(self):
        """Return the argument text of the command."""
        return self.string()

    def __str__(self):
        parts = (self.__class__.__name__,
                 self.positions(),
                 repr(self.string()),
                 repr(self.ident))
        return "('%s', %s, %s, %s)" % parts

    def __repr__(self):
        parts = (self.__class__.__name__,
                 repr(self._idxst),
                 repr(self._start),
                 repr(self._end),
                 repr(self.ident))
        return "%s(%s, %s, %s, %s)" % parts
rx_DollarFollows = re.compile(r'\s*([A-Za-z_]\w*|\(|\{|#)')
rx_Bracket = re.compile(r'\s*\(')


def parseDollar(idxst, pos):
    r"""Parse the construct introduced by the '$' at `pos`.

    Returns the tuple (next, item) where next is the index of the first
    character after the construct and item is the parsed item. A
    ParseException is raised when there is no '$' at `pos` or when the
    characters after it do not start a known construct.

    Here are some examples:

    >>> def test(st,pos):
    ...     idxst= IndexedString(st)
    ...     (p,elm)= parseDollar(idxst,pos)
    ...     print("Parsed: %s" % elm)
    ...     print("rest of string:%s" % st[p:])
    ...
    >>> test("$abc",0)
    Parsed: ('ParsedPureCommand', (1, 3), 'abc')
    rest of string:
    >>> test("$abc%&/",0)
    Parsed: ('ParsedPureCommand', (1, 3), 'abc')
    rest of string:%&/
    >>> test("$abc(2*3)",0)
    Parsed: ('ParsedCommand', (5, 7), '2*3', 'abc')
    rest of string:
    >>> test(" $abc(2*sin(x))",1)
    Parsed: ('ParsedCommand', (6, 13), '2*sin(x)', 'abc')
    rest of string:
    >>> test(" $abc(2*sin(x))bn",1)
    Parsed: ('ParsedCommand', (6, 13), '2*sin(x)', 'abc')
    rest of string:bn
    >>> test(" $# a comment\nnew line",1)
    Parsed: ('ParsedComment', (3, 13), ' a comment\n')
    rest of string:new line
    >>> test("$(abc)",0)
    Parsed: ('ParsedVar', (2, 4), 'abc')
    rest of string:
    >>> test("$(abc*2)",0)
    Parsed: ('ParsedEval', (2, 6), 'abc*2')
    rest of string:
    >>> test(" $(2*x(y))abc",1)
    Parsed: ('ParsedEval', (3, 8), '2*x(y)')
    rest of string:abc
    >>> test(" $  (2*x(y))abc",1)
    Parsed: ('ParsedEval', (6, 11), '2*x(y)')
    rest of string:abc
    """
    if not isinstance(idxst, IndexedString):
        raise TypeError("idxst par wrong: %s" % repr(idxst))
    text = idxst.st()
    if text[pos] != "$":
        raise ParseException("'$' expected at",
                             rowcol=idxst.rowcol(pos))
    follow = rx_DollarFollows.match(text, pos + 1)
    if follow is None:
        raise ParseException("unexpected characters after '$' at",
                             rowcol=idxst.rowcol(pos))
    token = follow.group(1)
    # start of group 1, i.e. behind any whitespace after the dollar
    token_at = follow.start(1)

    if token == "#":
        # an expander comment
        (first, after) = parseComment(idxst, token_at)
        return (after, ParsedComment(idxst, first + 1, after - 1))

    if token == "(":
        (first, after) = parseCode(idxst, token_at)
        # a lone identifier between the brackets is a variable,
        # everything else is an expression
        if rx_pyIdent.match(text, first + 1, after - 1) is None:
            item_class = ParsedEval
        else:
            item_class = ParsedVar
        return (after, item_class(idxst, first + 1, after - 2))

    if token == "{":
        # a purecommand enclosed in "{}" brackets
        (first, after) = parseBracketed(idxst, token_at)
        return (after, ParsedPureCommand(idxst, first + 1, after - 2))

    # from here: a purecommand or a command
    name_end = follow.end()
    # skip spaces and look for an opening bracket:
    bracket = rx_Bracket.match(text, name_end)
    if bracket is None:
        return (name_end, ParsedPureCommand(idxst, token_at, name_end - 1))
    # the bracket was found, parse the python code enclosed in brackets:
    (first, after) = parseCode(idxst, bracket.end() - 1)
    return (after, ParsedCommand(idxst, first + 1, after - 2, token))
def parseBackslash(idxst, pos):
    r"""Parse the construct introduced by the backslash at `pos`.

    Returns the tuple (next, item) where next is the index at which
    parsing continues. A backslash followed by another backslash gives a
    literal backslash, a backslash followed by '$' gives a literal '$';
    in both cases two characters are consumed. A backslash directly
    followed by the line separator consumes both and gives None as item.
    In every other case the backslash alone is returned as a literal.

    >>> import os
    >>> def test(st,pos,sep=None):
    ...     if sep:
    ...         change_linesep(sep)
    ...     idxst= IndexedString(st)
    ...     (p,elm)= parseBackslash(idxst,pos)
    ...     print("Parsed: %s" % elm)
    ...     print("rest of string:%s" % repr(st[p:]))
    ...     change_linesep(os.linesep)
    ...
    >>> test(r"\abc",0)
    Parsed: ('ParsedLiteral', (0, 0), '\\')
    rest of string:'abc'
    >>> test("\\",0)
    Parsed: ('ParsedLiteral', (0, 0), '\\')
    rest of string:''
    >>> test("\\\rab",0,"\r")
    Parsed: None
    rest of string:'ab'
    >>> test("\\\rab",0,"\n")
    Parsed: ('ParsedLiteral', (0, 0), '\\')
    rest of string:'\rab'
    >>> test("\\\rab",0,"\r\n")
    Parsed: ('ParsedLiteral', (0, 0), '\\')
    rest of string:'\rab'
    >>> test("\\\nab",0,"\r")
    Parsed: ('ParsedLiteral', (0, 0), '\\')
    rest of string:'\nab'
    >>> test("\\\nab",0,"\n")
    Parsed: None
    rest of string:'ab'
    >>> test("\\\nab",0,"\r\n")
    Parsed: ('ParsedLiteral', (0, 0), '\\')
    rest of string:'\nab'
    >>> test("\\\r\nab",0,"\r")
    Parsed: None
    rest of string:'\nab'
    >>> test("\\\r\nab",0,"\n")
    Parsed: ('ParsedLiteral', (0, 0), '\\')
    rest of string:'\r\nab'
    >>> test("\\\r\nab",0,"\r\n")
    Parsed: None
    rest of string:'ab'
    """
    if not isinstance(idxst, IndexedString):
        raise TypeError("idxst par wrong: %s" % repr(idxst))
    text = idxst.st()
    if text[pos] != "\\":
        raise ParseException("backslash expected at",
                             rowcol=idxst.rowcol(pos))
    after = pos + 1
    if after < len(text):
        # at least one more character
        follower = text[after]
        if follower == "\\":
            return (pos + 2, ParsedLiteral(idxst, pos, pos))
        if follower == "$":
            return (pos + 2, ParsedLiteral(idxst, after, after))
        if text.startswith(LINESEP, after):
            # the line separator follows, nothing is emitted for it
            return (after + LINESEP_LEN, None)
    # no more characters or no line separator follows
    return (after, ParsedLiteral(idxst, pos, pos))
rx_top = re.compile(r'(\$|\\)')


def parseAll(idxst, pos):
    r"""Parse the whole string from `pos` on and return the list of items.

    Text between dollar and backslash constructs is returned as
    ParsedLiteral items. Each '$' is handed to parseDollar and each
    backslash to parseBackslash; a None item from parseBackslash is not
    put into the list.

    >>> def test(st,pos):
    ...     idxst= IndexedString(st)
    ...     pprint(parseAll(idxst,pos))
    ...
    >>> test("abc",0)
    ('ParsedLiteral', (0, 2), 'abc')
    >>> test("abc$xyz",0)
    ('ParsedLiteral', (0, 2), 'abc')
    ('ParsedPureCommand', (4, 6), 'xyz')
    >>> test("abc${xyz}efg",0)
    ('ParsedLiteral', (0, 2), 'abc')
    ('ParsedPureCommand', (5, 7), 'xyz')
    ('ParsedLiteral', (9, 11), 'efg')
    >>> test("abc$xyz(2*4)",0)
    ('ParsedLiteral', (0, 2), 'abc')
    ('ParsedCommand', (8, 10), '2*4', 'xyz')
    >>> test("abc$(2*4)ab",0)
    ('ParsedLiteral', (0, 2), 'abc')
    ('ParsedEval', (5, 7), '2*4')
    ('ParsedLiteral', (9, 10), 'ab')
    >>> test("abc\\$(2*4)ab",0)
    ('ParsedLiteral', (0, 2), 'abc')
    ('ParsedLiteral', (4, 4), '$')
    ('ParsedLiteral', (5, 11), '(2*4)ab')
    >>> test("ab$func(1+2)\\\nnew line",0)
    ('ParsedLiteral', (0, 1), 'ab')
    ('ParsedCommand', (8, 10), '1+2', 'func')
    ('ParsedLiteral', (14, 21), 'new line')
    >>> test("ab$func(1+2)\nnew line",0)
    ('ParsedLiteral', (0, 1), 'ab')
    ('ParsedCommand', (8, 10), '1+2', 'func')
    ('ParsedLiteral', (12, 20), '\nnew line')
    >>> test("ab$(xyz)(56)",0)
    ('ParsedLiteral', (0, 1), 'ab')
    ('ParsedVar', (4, 6), 'xyz')
    ('ParsedLiteral', (8, 11), '(56)')
    >>> test(r'''
    ... Some text with a macro: $(xy)
    ... an escaped dollar: \$(xy)
    ... a macro within letters: abc${xy}def
    ... a pyexpander command structure:
    ... $if(a=1)
    ... here
    ... $else
    ... there
    ... $endif
    ... now a continued\
    ... line
    ... from here:$# the rest is a comment
    ... now an escaped continued\\
    ... line
    ... ''',0)
    ('ParsedLiteral', (0, 24), '\nSome text with a macro: ')
    ('ParsedVar', (27, 28), 'xy')
    ('ParsedLiteral', (30, 49), '\nan escaped dollar: ')
    ('ParsedLiteral', (51, 51), '$')
    ('ParsedLiteral', (52, 83), '(xy)\na macro within letters: abc')
    ('ParsedPureCommand', (86, 87), 'xy')
    ('ParsedLiteral', (89, 124), 'def\na pyexpander command structure:\n')
    ('ParsedCommand', (129, 131), 'a=1', 'if')
    ('ParsedLiteral', (133, 138), '\nhere\n')
    ('ParsedPureCommand', (140, 143), 'else')
    ('ParsedLiteral', (144, 150), '\nthere\n')
    ('ParsedPureCommand', (152, 156), 'endif')
    ('ParsedLiteral', (157, 172), '\nnow a continued')
    ('ParsedLiteral', (175, 189), 'line\nfrom here:')
    ('ParsedComment', (192, 214), ' the rest is a comment\n')
    ('ParsedLiteral', (215, 238), 'now an escaped continued')
    ('ParsedLiteral', (239, 239), '\\')
    ('ParsedLiteral', (241, 246), '\nline\n')
    """
    if not isinstance(idxst, IndexedString):
        raise TypeError("idxst par wrong: %s" % repr(idxst))
    text = idxst.st()
    total = len(text)
    items = []
    while pos < total:
        hit = rx_top.search(text, pos)
        if hit is None:
            # no further construct, the rest is one literal
            items.append(ParsedLiteral(idxst, pos, total - 1))
            break
        marker_at = hit.start()
        if marker_at > pos:
            # plain text in front of the construct
            items.append(ParsedLiteral(idxst, pos, marker_at - 1))
        if hit.group(1) == "\\":
            (pos, item) = parseBackslash(idxst, marker_at)
            if item is not None:
                items.append(item)
        else:
            # from here it must be a dollar sign
            (pos, item) = parseDollar(idxst, marker_at)
            items.append(item)
    return items
def pprint(parselist):
    """Print the string form of each element of `parselist` on its own line."""
    for item in parselist:
        print(item)
def _test():
    """Run the doctests of this module, printing a message before and after."""
    # pylint: disable= import-outside-toplevel
    from doctest import testmod
    print("testing...")
    testmod()
    print("done")


if __name__ == "__main__":
    _test()