Module python xml.py, Extensible Markup Language Scanner, Checker, and Utilities From: http://www.w3.org/XML/9705/xml.py ---------------------------------------------- #!/usr/local/bin/python """Extensible Markup Language Scanner, Checker, and Utilities This xml module includes the following classes: Scanner -- handles XML syntax only: no well-formedness nor validity constraints WellFormed -- scanner client that checks well-formedness constraints InferEndTags -- a WellFormed subclass that overrides start tag processing to do a limited form of end-tag inference that is sufficient for HTML 3.2 See: Extensible Markup Language http://www.w3.org/TR/WD-xml-lang This implementation is not quite complete. There are also some differences from the spec. @@ marks things I intend to fix. The other differences are things I probably want to change in the XML spec. @# marks things that I'm still not comfortable with. This module also includes a test harness. Usage: python xml.py [-emit start|end|empty|comment|eref|cref|...]+ [-all] < entity Copyright (c) 1997 by Massachusetts Institute of Technology (MIT), INRIA, Keio Permission to use, copy, modify, and distribute this software and its documentation for any purpose and without fee or royalty is hereby granted, as per . written by Dan Connolly http://www.w3.org/People/Connolly/ $Id: xml.py,v 1.8 1997/08/11 02:45:25 connolly Exp $ """ import regex from string import lower, find, atoi, index, count ############################################################# # The Client class and supporting utilities, exceptions, etc. # Result = 'xml.Done' Lit = 'lit' # quoted literals: "dsj" or 'sdfsd' Symbol = 'symbol' # symbol, e.g. text in Number = 'number' # numbers, e.g. 10 in #@# hmm... float, cardinal, int, units? Enum = 'enum' # enumerated/booleans, e.g. ismap in SpacePat = '[ \t\r\n]+' #@@ unicode space? 
class Client: def __init__(self): pass ########### # Methods to override to handle parsed data # # raise Result to tell the scanner to pause # and return a result # def text(self, str): pass def openStart(self, name): pass def attribute(self, name, type, value): pass def closeStart(self): pass def closeEmpty(self): pass def endTag(self, name=None): pass def comment(self, stuff): pass def pi(self, stuff): pass def decl(self, name, parts): pass def cref(self, numeral): pass def eref(self, name): pass def eof(self): pass def fixcase(str): """Normalize the case of names and symbols.""" return lower(str) Entities = {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': "'"} def evalLit(str, replacements = Entities): """Evaluate an XML literal. raises index error if ; missing, key error if undefined entity""" return unescape(str[1:-1], replacements) MarkupDelimChars = (('<', '<'), ('>', '>'), ('&', '&')) LitDelimChars = (('"', '"'), ("'", '''), ('&', '&')) def escape(str, replacements = MarkupDelimChars): """remember to put & last!""" for pat, repl in replacements: str = regsub.gsub(pat, repl, str) return str def unescape(str, replacements = Entities): """raises index error if ; missing, key error if undefined entity""" out = '' while 1: i = find(str, '&') if i < 0: return out + str else: out = out + str[:i] str = str[i+1:] j = index(str, ';') out = out + deref(str[:j], replacements) str=str[j+1:] def deref(str, replacements = Entities): """raises index error if ; missing, key error if undefined entity""" if str[0] == '#': return chr(atoi(str[1:])) #@@ x, hex if str[:2] == 'U-': return chr(atoi(str[2:], 16)) else: return replacements[str] def ScanErrorMessage(outfp, info): message, line, pending, offending = info outfp.write(("ERROR: %d: %s\n" + \ "context: %s{!!!}%s\n" ) % \ (line, message, pending, offending)) def NotWellFormedMessage(outfp, info, wfc): message, line, offending = info s = wfc.stack if s: name, ln = s[-1].name, s[-1].line ref = "(see <%s> line: %d)" % 
(name, ln) else: ref = '' outfp.write(("%d: %s %s\n" + \ "bad text: %s\n") % \ (line, message, ref, offending)) for elm in wfc.stack: outfp.write(" line %d: <%s[%d] %s>\n" % (elm.line, elm.name, elm.idx, `elm.attrs`)) ScanError = 'xml.ScanError' class Scanner: ########### # Exported Scanner methods def __init__(self): self._line = 1 self.done = -1 self.bb = self.at = 0 self._buf = '' # #@# this scanner doesn't signal an error when ]]> occurs # outside a marked section self.RE = union(('[^&<]+', ' Section = 'section' # Decl = 'decl' # PI = 'pi' # Events = {Text:Text, Start:Start, End:End, CloseStart:CloseStart, EndTag:EndTag, CloseEmpty: CloseEmpty, ERef:ERef, CRef:CRef, Comment:Comment, Decl:Decl, PI:PI, Section:Section} class Tester(InferEndTags): def __init__(self): InferEndTags.__init__(self) self.mask = [] def showEvents(self, mask): self.mask = mask def process(self, type, info): text = self.p.raw() if self.mask is None or type in self.mask: print "==>%s: %s [%s]\n" % (type, `info`, text) def text(self, text): InferEndTags.text(self, text) self.process(Text, None) def closeStart(self): InferEndTags.closeStart(self) elm = self.elm self.process(CloseStart, (elm.name, elm.idx, elm.line, elm.attrs)) def startElement(self, elm): names = [] for e in self.stack: names.append(e.name) self.process(Start, (names, elm.idx, elm.line, elm.attrs)) def closeEmpty(self): InferEndTags.closeEmpty(self) elm = self.elm self.process(CloseEmpty, (elm.name, elm.idx, elm.line, elm.attrs)) def endTag(self, name = None): InferEndTags.endTag(self, name) self.process(EndTag, name) def endElement(self, elm): self.process(End, elm.name) def cref(self, num): InferEndTags.cref(self, num) self.process(CRef, num) def eref(self, num): InferEndTags.eref(self, num) self.process(ERef, num) def comment(self, str): InferEndTags.comment(self, str) self.process(Comment, str) def pi(self, str): InferEndTags.pi(self, str) self.process(PI, str) def decl(self, name, parts): InferEndTags.decl(self, name, 
parts) self.process(Decl, (name, parts)) def section(self): InferEndTags.section(self, str) self.process(Section, None) def main(infp): p = Scanner() cl = Tester() cl.scanner(p) getOpts(cl) try: p.feed(infp.read()) p.next(cl) cl.eof() except ScanError, info: ScanErrorMessage(sys.stderr, info) sys.exit(1) except NotWellFormed, info: NotWellFormedMessage(sys.stderr, info, cl) sys.exit(1) def getOpts(x): mask = [] while len(sys.argv) > 1: opt = sys.argv[1] if opt[0] != '-': break del sys.argv[1] if opt == '-emit': mask.append(Events[sys.argv[1]]) del sys.argv[1] elif opt == '-all': mask = None elif opt == '-html': x.html32() else: sys.stderr.write("unkonwn option: %s\n" % opt) sys.exit(1) x.showEvents(mask) if __name__ == '__main__': main(sys.stdin)