# A parser for XML, using the derived class as static DTD.
# Author: Sjoerd Mullender.

# sgmlop support added by fredrik@pythonware.com (May 19, 1998)

import re
import string

try:
    from _xmlplus.parsers import sgmlop
    #import sgmlop   # this works for both builtin on the path or relative
except ImportError:
    sgmlop = None

# standard entity defs

ENTITYDEFS = {
    'lt': '<',
    'gt': '>',
    'amp': '&',
    'quot': '"',
    'apos': '\''
    }

# XML parser base class -- find tags and call handler functions.
# Usage: p = XMLParser(); p.feed(data); ...; p.close().
# The dtd is defined by deriving a class which defines methods with
# special names to handle tags: start_foo and end_foo to handle <foo>
# and </foo>, respectively.  The data between tags is passed to the
# parser by calling self.handle_data() with some data as argument (the
# data may be split up in arbitrary chunks).  Entity references are
# passed by calling self.handle_entityref() with the entity reference
# as argument.

# --------------------------------------------------------------------
# original re-based XML parser

_S = '[ \t\r\n]+'                       # mandatory white space
_opS = '[ \t\r\n]*'                     # optional white space
_Name = '[a-zA-Z_:][-a-zA-Z0-9._:]*'    # element/attribute/entity name

# Start of anything that is not plain character data.
interesting = re.compile('[&<]')
# Prefix of a reference or of markup that may still be completed by
# data fed later.
incomplete = re.compile('&(' + _Name + '|#[0-9]*|#x[0-9a-fA-F]*)?|'
                        '<([a-zA-Z_:][^<>]*|'
                        '/([a-zA-Z_:][^<>]*)?|'
                        '![^<>]*|'
                        r'\?[^<>]*)?')

# Entity or character reference; the trailing ';' is optional so that
# the caller can report it as missing.
ref = re.compile('&(' + _Name + '|#[0-9]+|#x[0-9a-fA-F]+);?')
# Both patterns below include one terminating character (normally ';')
# in the match; group 'char' even includes it in the group itself.
entityref = re.compile('&(?P<name>' + _Name + ')[^-a-zA-Z0-9._:]')
charref = re.compile('&#(?P<char>[0-9]+[^0-9]|x[0-9a-fA-F]+[^0-9a-fA-F])')
space = re.compile(_S)
newline = re.compile('\n')

starttagopen = re.compile('<' + _Name)
endtagopen = re.compile('</')
starttagend = re.compile(_opS + '(?P<slash>/?)>')
endbracket = re.compile('>')
tagfind = re.compile(_Name)
cdataopen = re.compile(r'<!\[CDATA\[')
cdataclose = re.compile(r'\]\]>')
special = re.compile('<!(?P<special>[^<>]*)>')
procopen = re.compile(r'<\?(?P<proc>' + _Name + ')' + _S)
procclose = re.compile(r'\?>')
commentopen = re.compile('<!--')
commentclose = re.compile('-->')
doubledash = re.compile('--')
attrfind = re.compile(
    _opS + '(?P<name>' + _Name + ')'
    '(' + _opS + '=' + _opS +
    '(?P<value>\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9.:+*%?!()_#=~]+))')

class SlowXMLParser:

    # Interface -- initialize and reset this instance.
    # 'verbose' is only stored on the instance here.
    def __init__(self, verbose=0):
        self.verbose = verbose
        self.reset()

    # Interface -- reset this instance.  Loses all unprocessed data
    def reset(self):
        self.rawdata = ''       # text fed but not yet processed
        self.stack = []
        self.lasttag = '???'
        self.nomoretags = 0     # true: everything that follows is data
        self.literal = 0        # true: markup is passed through as data
        self.lineno = 1         # line number used in error reports

    # For derived classes only -- enter literal mode (CDATA) till EOF
    def setnomoretags(self):
        self.nomoretags = self.literal = 1

    # For derived classes only -- enter literal mode (CDATA)
    # Any positional arguments are accepted and ignored.
    def setliteral(self, *args):
        self.literal = 1

    # Interface -- feed some data to the parser.  Call this as
    # often as you want, with as little or as much text as you
    # want (may include '\n').  (This just saves the text, all the
    # processing is done by goahead().)
    def feed(self, data):
        self.rawdata = self.rawdata + data
        self.goahead(0)

    # Interface -- handle the remaining data
    def close(self):
        self.goahead(1)

    # Interface -- translate references
    # Returns 'data' with every character reference (&#NN; / &#xHH;)
    # replaced by the character it names and every entity reference
    # found in self.entitydefs replaced by its definition.  Unknown
    # entity references are left in place.  A reference without the
    # closing ';' is reported through self.syntax_error() and then
    # handled like a terminated one.
    # NOTE(review): self.entitydefs and self.syntax_error are not set in
    # this part of the class; presumably provided elsewhere -- confirm.
    def translate_references(self, data):
        newdata = []
        i = 0
        while 1:
            res = ref.search(data, i)
            if res is None:
                # no more references: keep the tail and finish
                newdata.append(data[i:])
                return ''.join(newdata)
            if data[res.end(0) - 1] != ';':
                self.syntax_error(self.lineno,
                                  '; missing after entity/char reference')
            newdata.append(data[i:res.start(0)])
            name = res.group(1)
            if name[0] == '#':
                # character reference, hexadecimal or decimal
                if name[1] == 'x':
                    newdata.append(chr(int(name[2:], 16)))
                else:
                    newdata.append(chr(int(name[1:])))
            else:
                try:
                    newdata.append(self.entitydefs[name])
                except KeyError:
                    # can't do it, so keep the entity ref in
                    newdata.append('&' + name + ';')
            i = res.end(0)

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
def goahead(self, end): rawdata = self.rawdata i = 0 n = len(rawdata) while i < n: if self.nomoretags: data = rawdata[i:n] self.handle_data(data) self.lineno = self.lineno + string.count(data, '\n') i = n break res = interesting.search(rawdata, i) if res: j = res.start(0) else: j = n if i < j: data = rawdata[i:j] self.handle_data(data) self.lineno = self.lineno + string.count(data, '\n') i = j if i == n: break if rawdata[i] == '<': if starttagopen.match(rawdata, i): if self.literal: data = rawdata[i] self.handle_data(data) self.lineno = self.lineno + string.count(data, '\n') i = i+1 continue k = self.parse_starttag(i) if k < 0: break self.lineno = self.lineno + string.count(rawdata[i:k], '\n') i = k continue if endtagopen.match(rawdata, i): k = self.parse_endtag(i) if k < 0: break self.lineno = self.lineno + string.count(rawdata[i:k], '\n') i = k self.literal = 0 continue if commentopen.match(rawdata, i): if self.literal: data = rawdata[i] self.handle_data(data) self.lineno = self.lineno + string.count(data, '\n') i = i+1 continue k = self.parse_comment(i) if k < 0: break self.lineno = self.lineno + string.count(rawdata[i:k], '\n') i = k continue if cdataopen.match(rawdata, i): k = self.parse_cdata(i) if k < 0: break self.lineno = self.lineno + string.count(rawdata[i:i], '\n') i = k continue res = procopen.match(rawdata, i) if res: k = self.parse_proc(i, res) if k < 0: break self.lineno = self.lineno + string.count(rawdata[i:k], '\n') i = k continue res = special.match(rawdata, i) if res: if self.literal: data = rawdata[i] self.handle_data(data) self.lineno = self.lineno + string.count(data, '\n') i = i+1 continue self.handle_special(res.group('special')) self.lineno = self.lineno + string.count(res.group(0), '\n') i = res.end(0) continue elif rawdata[i] == '&': res = charref.match(rawdata, i) if res is not None: i = res.end(0) if rawdata[i-1] != ';': self.syntax_error(self.lineno, '; missing in charref') i = i-1 self.handle_charref(res.group('char')[:-1]) 
self.lineno = self.lineno + string.count(res.group(0), '\n') continue res = entityref.match(rawdata, i) if res is not None: i = res.end(0) if rawdata[i-1] != ';': self.syntax_error(self.lineno, '; missing in entityref') i = i-1 self.handle_entityref(res.group('name')) self.lineno = self.lineno + string.count(res.group(0), '\n') continue else: raise RuntimeError, 'neither < nor & ??' # We get here only if incomplete matches but # nothing else res = incomplete.match(rawdata, i) if not res: data = rawdata[i] self.handle_data(data) self.lineno = self.lineno + string.count(data, '\n') i = i+1 continue j = res.end(0) if j == n: break # Really incomplete self.syntax_error(self.lineno, 'bogus < or &') data = res.group(0) self.handle_data(data) self.lineno = self.lineno + string.count(data, '\n') i = j # end while if end and i < n: data = rawdata[i:n] self.handle_data(data) self.lineno = self.lineno + string.count(data, '\n') i = n self.rawdata = rawdata[i:] # XXX if end: check for empty stack # Internal -- parse comment, return length or -1 if not terminated def parse_comment(self, i): rawdata = self.rawdata if rawdata[i:i+4] <> '