diff --git a/src/searchengine/nova3/sgmllib3.py b/src/searchengine/nova3/sgmllib3.py
new file mode 100644
index 000000000..88a02a307
--- /dev/null
+++ b/src/searchengine/nova3/sgmllib3.py
@@ -0,0 +1,547 @@
+"""A parser for SGML, using the derived class as a static DTD."""
+
+# XXX This only supports those SGML features used by HTML.
+
+# XXX There should be a way to distinguish between PCDATA (parsed
+# character data -- the normal case), RCDATA (replaceable character
+# data -- only char and entity references and end tags are special)
+# and CDATA (character data -- only end tags are special). RCDATA is
+# not supported at all.
+
+import _markupbase
+import re
+
+__all__ = ["SGMLParser", "SGMLParseError"]
+
+# Regular expressions used for parsing
+
+interesting = re.compile('[&<]')
+incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
+ '<([a-zA-Z][^<>]*|'
+ '/([a-zA-Z][^<>]*)?|'
+ '![^<>]*)?')
+
+entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
+charref = re.compile('([0-9]+)[^0-9]')
+
+starttagopen = re.compile('<[>a-zA-Z]')
+shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
+shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
+piclose = re.compile('>')
+endbracket = re.compile('[<>]')
+tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
+attrfind = re.compile(
+ r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
+ r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')
+
+
+class SGMLParseError(RuntimeError):
+ """Exception raised for all parse errors."""
+ pass
+
+
+# SGML parser base class -- find tags and call handler functions.
+# Usage: p = SGMLParser(); p.feed(data); ...; p.close().
+# The dtd is defined by deriving a class which defines methods
+# with special names to handle tags: start_foo and end_foo to handle
+# and , respectively, or do_foo to handle by itself.
+# (Tags are converted to lower case for this purpose.) The data
+# between tags is passed to the parser by calling self.handle_data()
+# with some data as argument (the data may be split up in arbitrary
+# chunks). Entity references are passed by calling
+# self.handle_entityref() with the entity reference as argument.
+
+class SGMLParser(_markupbase.ParserBase):
+ # Definition of entities -- derived classes may override
+ entity_or_charref = re.compile('&(?:'
+ '([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)'
+ ')(;?)')
+
+ def __init__(self, verbose=0):
+ """Initialize and reset this instance."""
+ self.verbose = verbose
+ self.reset()
+
+ def reset(self):
+ """Reset this instance. Loses all unprocessed data."""
+ self.__starttag_text = None
+ self.rawdata = ''
+ self.stack = []
+ self.lasttag = '???'
+ self.nomoretags = 0
+ self.literal = 0
+ _markupbase.ParserBase.reset(self)
+
+ def setnomoretags(self):
+ """Enter literal mode (CDATA) till EOF.
+
+ Intended for derived classes only.
+ """
+ self.nomoretags = self.literal = 1
+
+ def setliteral(self, *args):
+ """Enter literal mode (CDATA).
+
+ Intended for derived classes only.
+ """
+ self.literal = 1
+
+ def feed(self, data):
+ """Feed some data to the parser.
+
+ Call this as often as you want, with as little or as much text
+ as you want (may include '\n'). (This just saves the text,
+ all the processing is done by goahead().)
+ """
+
+ self.rawdata = self.rawdata + data
+ self.goahead(0)
+
+ def close(self):
+ """Handle the remaining data."""
+ self.goahead(1)
+
+ def error(self, message):
+ raise SGMLParseError(message)
+
+ # Internal -- handle data as far as reasonable. May leave state
+ # and data to be processed by a subsequent call. If 'end' is
+ # true, force handling all data as if followed by EOF marker.
+ def goahead(self, end):
+ rawdata = self.rawdata
+ i = 0
+ n = len(rawdata)
+ while i < n:
+ if self.nomoretags:
+ self.handle_data(rawdata[i:n])
+ i = n
+ break
+ match = interesting.search(rawdata, i)
+ if match: j = match.start()
+ else: j = n
+ if i < j:
+ self.handle_data(rawdata[i:j])
+ i = j
+ if i == n: break
+ if rawdata[i] == '<':
+ if starttagopen.match(rawdata, i):
+ if self.literal:
+ self.handle_data(rawdata[i])
+ i = i+1
+ continue
+ k = self.parse_starttag(i)
+ if k < 0: break
+ i = k
+ continue
+ if rawdata.startswith("", i):
+ k = self.parse_endtag(i)
+ if k < 0: break
+ i = k
+ self.literal = 0
+ continue
+ if self.literal:
+ if n > (i + 1):
+ self.handle_data("<")
+ i = i+1
+ else:
+ # incomplete
+ break
+ continue
+ if rawdata.startswith("