diff --git a/src/searchengine/nova3/sgmllib3.py b/src/searchengine/nova3/sgmllib3.py new file mode 100644 index 000000000..88a02a307 --- /dev/null +++ b/src/searchengine/nova3/sgmllib3.py @@ -0,0 +1,547 @@ +"""A parser for SGML, using the derived class as a static DTD.""" + +# XXX This only supports those SGML features used by HTML. + +# XXX There should be a way to distinguish between PCDATA (parsed +# character data -- the normal case), RCDATA (replaceable character +# data -- only char and entity references and end tags are special) +# and CDATA (character data -- only end tags are special). RCDATA is +# not supported at all. + +import _markupbase +import re + +__all__ = ["SGMLParser", "SGMLParseError"] + +# Regular expressions used for parsing + +interesting = re.compile('[&<]') +incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|' + '<([a-zA-Z][^<>]*|' + '/([a-zA-Z][^<>]*)?|' + '![^<>]*)?') + +entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]') +charref = re.compile('&#([0-9]+)[^0-9]') + +starttagopen = re.compile('<[>a-zA-Z]') +shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/') +shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/') +piclose = re.compile('>') +endbracket = re.compile('[<>]') +tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*') +attrfind = re.compile( + r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*' + r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?') + + +class SGMLParseError(RuntimeError): + """Exception raised for all parse errors.""" + pass + + +# SGML parser base class -- find tags and call handler functions. +# Usage: p = SGMLParser(); p.feed(data); ...; p.close(). +# The dtd is defined by deriving a class which defines methods +# with special names to handle tags: start_foo and end_foo to handle +# and , respectively, or do_foo to handle by itself. +# (Tags are converted to lower case for this purpose.) The data +# between tags is passed to the parser by calling self.handle_data() +# with some data as argument (the data may be split up in arbitrary +# chunks). Entity references are passed by calling +# self.handle_entityref() with the entity reference as argument. + +class SGMLParser(_markupbase.ParserBase): + # Definition of entities -- derived classes may override + entity_or_charref = re.compile('&(?:' + '([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)' + ')(;?)') + + def __init__(self, verbose=0): + """Initialize and reset this instance.""" + self.verbose = verbose + self.reset() + + def reset(self): + """Reset this instance. Loses all unprocessed data.""" + self.__starttag_text = None + self.rawdata = '' + self.stack = [] + self.lasttag = '???' + self.nomoretags = 0 + self.literal = 0 + _markupbase.ParserBase.reset(self) + + def setnomoretags(self): + """Enter literal mode (CDATA) till EOF. + + Intended for derived classes only. + """ + self.nomoretags = self.literal = 1 + + def setliteral(self, *args): + """Enter literal mode (CDATA). + + Intended for derived classes only. + """ + self.literal = 1 + + def feed(self, data): + """Feed some data to the parser. + + Call this as often as you want, with as little or as much text + as you want (may include '\n'). (This just saves the text, + all the processing is done by goahead().) + """ + + self.rawdata = self.rawdata + data + self.goahead(0) + + def close(self): + """Handle the remaining data.""" + self.goahead(1) + + def error(self, message): + raise SGMLParseError(message) + + # Internal -- handle data as far as reasonable. May leave state + # and data to be processed by a subsequent call. If 'end' is + # true, force handling all data as if followed by EOF marker. + def goahead(self, end): + rawdata = self.rawdata + i = 0 + n = len(rawdata) + while i < n: + if self.nomoretags: + self.handle_data(rawdata[i:n]) + i = n + break + match = interesting.search(rawdata, i) + if match: j = match.start() + else: j = n + if i < j: + self.handle_data(rawdata[i:j]) + i = j + if i == n: break + if rawdata[i] == '<': + if starttagopen.match(rawdata, i): + if self.literal: + self.handle_data(rawdata[i]) + i = i+1 + continue + k = self.parse_starttag(i) + if k < 0: break + i = k + continue + if rawdata.startswith(" (i + 1): + self.handle_data("<") + i = i+1 + else: + # incomplete + break + continue + if rawdata.startswith("