odoo/bin/reportlab/tools/docco/t_parse.py

247 lines
9.8 KiB
Python

#Copyright ReportLab Europe Ltd. 2000-2004
#see license.txt for license details
#history http://www.reportlab.co.uk/cgi-bin/viewcvs.cgi/public/reportlab/trunk/reportlab/tools/docco/t_parse.py
"""
Template parsing module inspired by REXX (with thanks to Donn Cave for discussion).
Template initialization has the form:
T = Template(template_string, wild_card_marker, single_char_marker,
x = regex_x, y = regex_y, ...)
Parsing has the form
([match1, match2, ..., matchn], lastindex) = T.PARSE(string)
Only the first argument is mandatory.
The resultant object efficiently parses strings that match the template_string,
giving a list of substrings that correspond to each "directive" of the template.
Template directives:
Wildcard:
The template may be initialized with a wildcard that matches any string
up to the string matching the next directive (which may not be a wild
card or single character marker) or the next literal sequence of characters
of the template. The character that represents a wildcard is specified
by the wild_card_marker parameter, which has no default.
For example, using X as the wildcard:
>>> T = Template("prefixXinteriorX", "X")
>>> T.PARSE("prefix this is before interior and this is after")
([' this is before ', ' and this is after'], 47)
>>> T = Template("<X>X<X>", "X")
>>> T.PARSE('<A HREF="index.html">go to index</A>')
(['A HREF="index.html"', 'go to index', '/A'], 36)
Obviously the character used to represent the wildcard must be distinct
from the characters used to represent literals or other directives.
Fixed length character sequences:
The template may have a marker character which indicates a fixed
length field. All adjacent instances of this marker will be matched
by a substring of the same length in the parsed string. For example:
>>> T = Template("NNN-NN-NNNN", single_char_marker="N")
>>> T.PARSE("1-2-34-5-12")
(['1-2', '34', '5-12'], 11)
>>> T.PARSE("111-22-3333")
(['111', '22', '3333'], 11)
>>> T.PARSE("1111-22-3333")
ValueError: literal not found at (3, '-')
A template may have multiple fixed length markers, which allows fixed
length fields to be adjacent, but recognized separately. For example:
>>> T = Template("MMDDYYX", "X", "MDY")
>>> T.PARSE("112489 Somebody's birthday!")
(['11', '24', '89', " Somebody's birthday!"], 27)
Regular expression markers:
The template may have markers associated with regular expressions.
The regular expressions may be given either as pattern strings or as compiled pattern objects.
For example:
>>> T = Template("v: s i", v=id, s=str, i=int)
>>> T.PARSE("this_is_an_identifier: 'a string' 12344")
(['this_is_an_identifier', "'a string'", '12344'], 39)
>>>
Here id, str, and int are regular expression conveniences provided by
this module.
Directive markers may be mixed and matched, except that wildcards cannot precede
wildcards or single character markers.
Example:
>>> T = Template("ssnum: NNN-NN-NNNN, fn=X, ln=X, age=I, quote=Q", "X", "N", I=int, Q=str)
>>> T.PARSE("ssnum: 123-45-6789, fn=Aaron, ln=Watters, age=13, quote='do be do be do'")
(['123', '45', '6789', 'Aaron', 'Watters', '13', "'do be do be do'"], 72)
>>>
"""
import re, string
from types import StringType
from string import find
#
# template parsing
#
# EG: T = Template("(NNN)NNN-NNNN X X", "X", "N")
# ([area, exch, ext, fn, ln], index) = T.PARSE("(908)949-2726 Aaron Watters")
#
class Template:
    """A parse template compiled from a REXX-style template string.

    The template string is split once, at construction time, into a
    "parse sequence": a list of (indicator, data) pairs where

    * (None, literal_text)          -- text that must appear verbatim,
    * (wild_card_marker, None)      -- a wildcard directive,
    * (primary_single_char, length) -- a fixed length field,
    * (marker, compiled_regex)      -- a regular expression directive.

    Attributes set by the constructor: template, wild_card, char (the
    first single character marker, or None), markers, marker_dict,
    parse_seq and ndirectives.
    """

    def __init__(self,
                 template,
                 wild_card_marker=None,
                 single_char_marker=None,
                 **marker_to_regex_dict):
        """Compile template into a parse sequence.

        template -- the template string.
        wild_card_marker -- optional single character standing for a
            wildcard.
        single_char_marker -- optional string; each of its characters
            marks a fixed length field (a run of the same marker makes
            one field of that length).
        marker_to_regex_dict -- single character markers mapped to regular
            expressions, given as pattern strings or as objects that
            already provide match/search.

        Raises ValueError when a marker is longer than one character, when
        two wildcards are adjacent, or when a wildcard directly precedes a
        fixed length field.
        """
        self.template = template
        self.wild_card = wild_card_marker
        self.char = single_char_marker
        # determine the set of markers for this template
        # (list(...) so that append works even where keys() is a view)
        markers = list(marker_to_regex_dict.keys())
        if wild_card_marker:
            markers.append(wild_card_marker)
        single_char_primary = None
        if single_char_marker:
            for ch in single_char_marker:  # allow multiple scm's
                markers.append(ch)
            # every fixed length field is tagged with the first marker so
            # that PARSE only has one indicator to test against
            self.char = single_char_primary = single_char_marker[0]
        self.markers = markers
        for mark in markers:
            if len(mark) > 1:
                raise ValueError(
                    "Marks must be single characters: " + repr(mark))
        # compile the regular expressions if needed; anything that already
        # offers a match method is taken to be a compiled pattern
        self.marker_dict = marker_dict = {}
        for (mark, rgex) in marker_to_regex_dict.items():
            if not hasattr(rgex, "match"):
                rgex = re.compile(rgex)
            marker_dict[mark] = rgex
        # determine the parse sequence
        parse_seq = []
        # True only when the previous item was a wildcard.  (Comparing the
        # previous character with wild_card_marker is wrong when both are
        # still None: a template that starts with a fixed length field and
        # has no wildcard at all would be rejected.)
        after_wild_card = False
        index = 0
        last = len(template)
        # count the number of directives encountered
        ndirectives = 0
        while index < last:
            start = index
            thischar = template[index]
            # is it a wildcard?
            if thischar == wild_card_marker:
                if after_wild_card:
                    raise ValueError(
                        "two wild cards in sequence is not allowed")
                parse_seq.append((wild_card_marker, None))
                index = index + 1
                ndirectives = ndirectives + 1
                after_wild_card = True
            # is it a sequence of single character markers?
            elif single_char_marker and thischar in single_char_marker:
                if after_wild_card:
                    raise ValueError(
                        "wild card cannot precede single char marker")
                while index < last and template[index] == thischar:
                    index = index + 1
                parse_seq.append((single_char_primary, index - start))
                ndirectives = ndirectives + 1
                after_wild_card = False
            # is it a literal sequence?
            elif thischar not in markers:
                while index < last and template[index] not in markers:
                    index = index + 1
                parse_seq.append((None, template[start:index]))
                after_wild_card = False
            # otherwise it must be a re marker
            else:
                rgex = marker_dict[thischar]
                parse_seq.append((thischar, rgex))
                ndirectives = ndirectives + 1
                index = index + 1
                after_wild_card = False
        self.parse_seq = parse_seq
        self.ndirectives = ndirectives

    def PARSE(self, str, start=0):
        """Match str against the template, beginning at offset start.

        Returns (matches, lastindex): matches holds one substring per
        directive, in template order, and lastindex is the offset just
        past the last character consumed.

        Raises ValueError when a literal is not found at the expected
        position, when a wildcard cannot be terminated, or when a regular
        expression directive does not match.

        Note that the parameter name shadows the builtin str inside this
        method; it is kept because it is part of the public signature.
        """
        ndirectives = self.ndirectives
        wild_card = self.wild_card
        single_char = self.char
        parse_seq = self.parse_seq
        lparse_seq = len(parse_seq) - 1
        # make a list long enough for substitutions for directives
        result = [None] * ndirectives
        current_directive_index = 0
        currentindex = start
        # scan through the parse sequence, recognizing
        for parse_index, (indicator, data) in enumerate(parse_seq):
            # is it a literal indicator?
            if indicator is None:
                if str.find(data, currentindex) != currentindex:
                    raise ValueError(
                        "literal not found at " + repr((currentindex, data)))
                currentindex = currentindex + len(data)
                continue
            # anything else is a directive
            if indicator == wild_card:
                if parse_index == lparse_seq:
                    # last item of the template: match the rest of the string
                    last = len(str)
                else:
                    # the next item (a literal or a re; the constructor
                    # rejects anything else) tells where the wildcard ends
                    (nextindicator, nextdata) = parse_seq[parse_index + 1]
                    if nextindicator is None:
                        # search for literal
                        last = str.find(nextdata, currentindex)
                        if last < currentindex:
                            raise ValueError(
                                "couldn't terminate wild with lit "
                                + repr(currentindex))
                    else:
                        # data is a re, search for it; search returns a
                        # match object or None, not an index
                        found = nextdata.search(str, currentindex)
                        if found is None:
                            raise ValueError(
                                "couldn't terminate wild with re "
                                + repr(currentindex))
                        last = found.start()
            elif indicator == single_char:
                # data is length to eat (no length check is made here: a
                # too short input yields a truncated field)
                last = currentindex + data
            else:
                # other directives are always regular expressions, which
                # must match exactly at the current position
                matched = data.match(str, currentindex)
                if matched is None:
                    raise ValueError(
                        "couldn't match re at " + repr(currentindex))
                last = matched.end()
            result[current_directive_index] = str[currentindex:last]
            current_directive_index = current_directive_index + 1
            currentindex = last
        # sanity check
        if current_directive_index != ndirectives:
            raise SystemError("not enough directives found?")
        return (result, currentindex)
# some useful regular expressions
# string.ascii_letters is used rather than string.letters: the latter is
# locale dependent and is not available on every Python version.
USERNAMEREGEX = (
    "[" + string.ascii_letters + "]"
    "[" + string.ascii_letters + string.digits + "_]*"
)
STRINGLITREGEX = "'[^\n']*'"
SIMPLEINTREGEX = "[" + string.digits + "]+"
# Compiled conveniences for use as Template regex markers.  These names
# intentionally shadow the builtins id, str and int at module level: they
# are the public names referred to by the module documentation.
id = re.compile(USERNAMEREGEX)
str = re.compile(STRINGLITREGEX)
int = re.compile(SIMPLEINTREGEX)
def test():
    """Smoke test: build a handful of templates and print what they parse.

    The first four templates are stored in the module globals T, T1, T2
    and T3; the last one is only a local.
    """
    global T, T1, T2, T3

    # fixed length fields followed by two wildcards
    T = Template("(NNN)NNN-NNNN X X", "X", "N")
    print(T.PARSE("(908)949-2726 Aaron Watters"))

    # regular expression markers separated by literals
    T1 = Template("s --> s blah", s=str)
    sample = "' <-- a string --> ' --> 'blah blah another string blah' blah"
    print(T1.PARSE(sample))

    # regex, fixed length field, regex and a trailing wildcard
    T2 = Template("s --> NNNiX", "X", "N", s=str, i=int)
    print(T2.PARSE("'A STRING' --> 15964653alpha beta gamma"))

    # wildcards terminated by regular expressions
    T3 = Template("XsXi", "X", "N", s=str, i=int)
    print(T3.PARSE("prefix'string'interior1234junk not parsed"))

    # several distinct fixed length markers side by side
    T4 = Template("MMDDYYX", "X", "MDY")
    print(T4.PARSE("122961 Somebody's birthday!"))


if __name__ == "__main__":
    test()