"""Radically simple xml parsing Example parse text in xml ( "this", {"type": "xml"}, [ "text ", ("b", None, ["in"], None), " xml" ] None ) { 0: "this" "type": "xml" 1: ["text ", {0: "b", 1:["in"]}, " xml"] } Ie, xml tag translates to a tuple: (name, dictofattributes, contentlist, miscellaneousinfo) where miscellaneousinfo can be anything, (but defaults to None) (with the intention of adding, eg, line number information) special cases: name of "" means "top level, no containing tag". Top level parse always looks like this ("", list, None, None) contained text of None means In order to support stuff like AT THE MOMENT & ETCETERA ARE IGNORED. THEY MUST BE PROCESSED IN A POST-PROCESSING STEP. PROLOGUES ARE NOT UNDERSTOOD. OTHER STUFF IS PROBABLY MISSING. """ RequirePyRXP = 0 # set this to 1 to disable the nonvalidating fallback parser. import string try: #raise ImportError, "dummy error" simpleparse = 0 import pyRXPU def warnCB(s): print s pyRXP_parser = pyRXPU.Parser( ErrorOnValidityErrors=1, NoNoDTDWarning=1, ExpandCharacterEntities=1, ExpandGeneralEntities=1, warnCB = warnCB, srcName='string input', ReturnUTF8 = 1, ) def parsexml(xmlText, oneOutermostTag=0,eoCB=None,entityReplacer=None): pyRXP_parser.eoCB = eoCB p = pyRXP_parser.parse(xmlText) return oneOutermostTag and p or ('',None,[p],None) except ImportError: simpleparse = 1 NONAME = "" NAMEKEY = 0 CONTENTSKEY = 1 CDATAMARKER = "" replacelist = [("<", "<"), (">", ">"), ("&", "&")] # amp must be last #replacelist = [] def unEscapeContentList(contentList): result = [] from string import replace for e in contentList: if "&" in e: for (old, new) in replacelist: e = replace(e, old, new) result.append(e) return result def parsexmlSimple(xmltext, oneOutermostTag=0,eoCB=None,entityReplacer=unEscapeContentList): """official interface: discard unused cursor info""" if RequirePyRXP: raise ImportError, "pyRXP not found, fallback parser disabled" (result, cursor) = parsexml0(xmltext,entityReplacer=entityReplacer) if oneOutermostTag: return result[2][0] else: return result if simpleparse: parsexml = parsexmlSimple def parseFile(filename): raw = open(filename, 'r').read() return parsexml(raw) verbose = 0 def skip_prologue(text, cursor): """skip any prologue found after cursor, return index of rest of text""" ### NOT AT ALL COMPLETE!!! definitely can be confused!!! from string import find prologue_elements = ("!DOCTYPE", "?xml", "!--") done = None while done is None: #print "trying to skip:", repr(text[cursor:cursor+20]) openbracket = find(text, "<", cursor) if openbracket<0: break past = openbracket+1 found = None for e in prologue_elements: le = len(e) if text[past:past+le]==e: found = 1 cursor = find(text, ">", past) if cursor<0: raise ValueError, "can't close prologue %s" % `e` cursor = cursor+1 if found is None: done=1 #print "done skipping" return cursor def parsexml0(xmltext, startingat=0, toplevel=1, # snarf in some globals strip=string.strip, split=string.split, find=string.find, entityReplacer=unEscapeContentList, #len=len, None=None #LENCDATAMARKER=LENCDATAMARKER, CDATAMARKER=CDATAMARKER ): """simple recursive descent xml parser... return (dictionary, endcharacter) special case: comment returns (None, endcharacter)""" #from string import strip, split, find #print "parsexml0", `xmltext[startingat: startingat+10]` # DEFAULTS NameString = NONAME ContentList = AttDict = ExtraStuff = None if toplevel is not None: #if verbose: print "at top level" #if startingat!=0: # raise ValueError, "have to start at 0 for top level!" xmltext = strip(xmltext) cursor = startingat #look for interesting starting points firstbracket = find(xmltext, "<", cursor) afterbracket2char = xmltext[firstbracket+1:firstbracket+3] #print "a", `afterbracket2char` #firstampersand = find(xmltext, "&", cursor) #if firstampersand>0 and firstampersand0: #afterbracket2char = xmltext[firstbracket:firstbracket+2] if toplevel is not None: #print "toplevel with no outer tag" NameString = name = NONAME cursor = skip_prologue(xmltext, cursor) #break elif firstbracket<0: raise ValueError, "non top level entry should be at start tag: %s" % repr(xmltext[:10]) # special case: CDATA elif afterbracket2char=="![" and xmltext[firstbracket:firstbracket+9]=="": raise ValueError, "invalid comment: contains double dashes %s" % repr(xmltext[cursor:cursor+20]) return (None, endcomment+1) # shortcut exit else: # get the rest of the tag #if verbose: print "parsing start tag" # make sure the tag isn't in doublequote pairs closebracket = find(xmltext, ">", firstbracket) noclose = closebracket<0 startsearch = closebracket+1 pastfirstbracket = firstbracket+1 tagcontent = xmltext[pastfirstbracket:closebracket] # shortcut, no equal means nothing but name in the tag content if '=' not in tagcontent: if tagcontent[-1]=="/": # simple case #print "simple case", tagcontent tagcontent = tagcontent[:-1] docontents = None name = strip(tagcontent) NameString = name cursor = startsearch else: if '"' in tagcontent: # check double quotes stop = None # not inside double quotes! (the split should have odd length) if noclose or len(split(tagcontent+".", '"'))% 2: stop=1 while stop is None: closebracket = find(xmltext, ">", startsearch) startsearch = closebracket+1 noclose = closebracket<0 tagcontent = xmltext[pastfirstbracket:closebracket] # not inside double quotes! (the split should have odd length) if noclose or len(split(tagcontent+".", '"'))% 2: stop=1 if noclose: raise ValueError, "unclosed start tag %s" % repr(xmltext[firstbracket:firstbracket+20]) cursor = startsearch #cursor = closebracket+1 # handle simple tag /> syntax if xmltext[closebracket-1]=="/": #if verbose: print "it's a simple tag" closebracket = closebracket-1 tagcontent = tagcontent[:-1] docontents = None #tagcontent = xmltext[firstbracket+1:closebracket] tagcontent = strip(tagcontent) taglist = split(tagcontent, "=") #if not taglist: # raise ValueError, "tag with no name %s" % repr(xmltext[firstbracket:firstbracket+20]) taglist0 = taglist[0] taglist0list = split(taglist0) #if len(taglist0list)>2: # raise ValueError, "bad tag head %s" % repr(taglist0) name = taglist0list[0] #print "tag name is", name NameString = name # now parse the attributes attributename = taglist0list[-1] # put a fake att name at end of last taglist entry for consistent parsing taglist[-1] = taglist[-1]+" f" AttDict = D = {} taglistindex = 1 lasttaglistindex = len(taglist) #for attentry in taglist[1:]: while taglistindexlasttaglistindex: raise ValueError, "unclosed value " + repr(attentry) nextattentry = taglist[taglistindex] taglistindex = taglistindex+1 attentry = "%s=%s" % (attentry, nextattentry) attentry = strip(attentry) # only needed for while loop... attlist = split(attentry) nextattname = attlist[-1] attvalue = attentry[:-len(nextattname)] attvalue = strip(attvalue) try: first = attvalue[0]; last=attvalue[-1] except: raise ValueError, "attvalue,attentry,attlist="+repr((attvalue, attentry,attlist)) if first==last=='"' or first==last=="'": attvalue = attvalue[1:-1] #print attributename, "=", attvalue D[attributename] = attvalue attributename = nextattname # pass over other tags and content looking for end tag if docontents is not None: #print "now looking for end tag" ContentList = L while docontents is not None: nextopenbracket = find(xmltext, "<", cursor) if nextopenbracket", nextopenbracket) if nextclosebracket\n%s\n" % (name, attributes, textpprint, name) # otherwise must be a simple tag return "<%s %s/>" % (name, attributes) dump = 0 def testparse(s): from time import time from pprint import pprint now = time() D = parsexmlSimple(s) print "DONE", time()-now if dump&4: pprint(D) #pprint(D) if dump&1: print "============== reformatting" p = pprettyprint(D) print p def test(): testparse("""text <>in xml text in xml ]]> just testing brackets feature """) filenames = [ #"../../reportlab/demos/pythonpoint/pythonpoint.xml", "samples/hamlet.xml"] #filenames = ["moa.xml"] dump=1 if __name__=="__main__": test() from time import time now = time() for f in filenames: t = open(f).read() print "parsing", f testparse(t) print "elapsed", time()-now