From d2103e17599f4751a5a90df8c96ac3fb61de6893 Mon Sep 17 00:00:00 2001
From: imDMG
Date: Thu, 17 Jan 2019 18:04:19 +0500
Subject: [PATCH] Initial commit

---
 kinozal.py             | 244 ++++++++++++++++++
 modules/helpers.py     | 123 +++++++++
 modules/nova2.py       | 190 ++++++++++++++
 modules/nova2dl.py     |  63 +++++
 modules/novaprinter.py |  67 +++++
 modules/sgmllib3.py    | 547 +++++++++++++++++++++++++++++++++++++++++
 modules/socks.py       | 391 +++++++++++++++++++++++++++++
 7 files changed, 1625 insertions(+)
 create mode 100644 kinozal.py
 create mode 100644 modules/helpers.py
 create mode 100644 modules/nova2.py
 create mode 100644 modules/nova2dl.py
 create mode 100644 modules/novaprinter.py
 create mode 100644 modules/sgmllib3.py
 create mode 100644 modules/socks.py

diff --git a/kinozal.py b/kinozal.py
new file mode 100644
index 0000000..1f6a551
--- /dev/null
+++ b/kinozal.py
@@ -0,0 +1,244 @@
+# VERSION: 1.0
+# AUTHORS: imDMG
+
+# LICENSING INFORMATION
+
+import tempfile
+import os
+import logging
+import time
+
+from urllib.request import build_opener, HTTPCookieProcessor, ProxyHandler
+from urllib.parse import urlencode, quote, unquote
+from urllib.error import URLError, HTTPError
+from http.cookiejar import CookieJar
+from html.parser import HTMLParser
+from novaprinter import prettyPrinter
+
+# setup logging into qBittorrent/logs
+logging.basicConfig(level=logging.DEBUG,
+                    format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
+                    datefmt='%m-%d %H:%M',
+                    filename=os.path.abspath(
+                        os.path.join(os.path.dirname(__file__), '../..', 'logs')) + "/kinozal_se.log",
+                    # filename="kinozal_se.log",
+                    filemode='w')
+
+# benchmark
+# start_time = time.time()
+
+
+class kinozal(object):
+    name = 'Kinozal'
+    url = 'http://kinozal.tv'
+    supported_categories = {'all': '0',
+                            'movies': '1002',
+                            'tv': '1001',
+                            'music': '1004',
+                            'games': '23',
+                            'anime': '20',
+                            'software': '32'}
+
+    # Set proxies (default false)
+    # make sure the proxies values aren't empty
+    proxy = True
+    proxies = {
+        'http': '',
+        'https': '',
+    }
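+
+    # Editorial sketch: urllib's ProxyHandler expects full proxy URLs as the
+    # dict values; a filled-in config might look like this (the addresses
+    # below are illustrative assumptions, not part of this patch):
+    #
+    #     proxies = {
+    #         'http': 'http://127.0.0.1:8118',
+    #         'https': 'http://127.0.0.1:8118',
+    #     }
+    #
+    # With both values left empty, no ProxyHandler is installed in __init__
+    # and a direct connection is used.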
+
+    # credentials
+    username = "USERNAME"
+    password = "PASSWORD"
+    ua = 'Mozilla/5.0 (X11; Linux i686; rv:38.0) Gecko/20100101 Firefox/38.0'
+
+    def __init__(self):
+        # establish connection
+        #
+        # make cookie
+        cj = CookieJar()
+        self.session = build_opener(HTTPCookieProcessor(cj))
+
+        # add proxy handler if needed
+        # (check the values, not the keys: the keys are always present)
+        if self.proxy and any(self.proxies.values()):
+            self.session.add_handler(ProxyHandler(self.proxies))
+
+        # change user-agent
+        self.session.addheaders.pop()
+        self.session.addheaders.append(('User-Agent', self.ua))
+
+        form_data = {"username": self.username, "password": self.password}
+        data_encoded = urlencode(form_data).encode('cp1251')
+
+        try:
+            response = self.session.open(self.url + '/takelogin.php', data_encoded)
+            # Only continue if response status is OK.
+            if response.getcode() != 200:
+                raise HTTPError(response.geturl(), response.getcode(),
+                                "HTTP request to {} failed with status: {}".format(self.url, response.getcode()),
+                                response.info(), None)
+        except (URLError, HTTPError) as e:
+            logging.error(e)
+            raise e
+
+        # a successful login sets the 'uid' cookie
+        if 'uid' not in [cookie.name for cookie in cj]:
+            logging.warning("Login failed: no 'uid' cookie set, check your credentials")
+            logging.debug(cj)
+
+    class WorstParser(HTMLParser):
+        def __init__(self, url=''):
+            HTMLParser.__init__(self)
+            self.url = url
+            self.torrent = {'link': '',
+                            'name': '',
+                            'size': '',
+                            'seeds': '',
+                            'leech': '',
+                            'desc_link': '', }
+
+            # we need page markup to know when to stop and collect data,
+            # because the available methods in this class do not communicate
+            # with each other; as a result, we set markers to pass information
+            # from one method to another, along a chain
+            #
+            # markup of the result table
+            self.result_table = False  # table with results is found
+            self.torrent_row = False   # torrent row found, collect its data
+            self.index_td = 0          # td counter in torrent row
+            self.write = None          # trigger detecting when to collect data
+
+            # markup of pagination
+            self.paginator = False  # more pages found in result
+            self.pages = 0          # page counter
+
+            self.found_torrents = 0
+
+        def handle_starttag(self, tag, attrs):
+            # search for the result table by class t_peer
+            if tag == 'table':
+                for name, value in attrs:
+                    if name == 'class' and 't_peer' in value:
+                        self.result_table = True
+
+            # search for a torrent row by class bg
+            if self.result_table and tag == 'tr':
+                for name, value in attrs:
+                    if name == 'class' and 'bg' in value:
+                        self.torrent_row = True
+
+            # count td elements to find the right ones
+            if self.torrent_row and tag == 'td':
+                if self.index_td == 3:
+                    self.write = "size"
+                elif self.index_td == 4:
+                    self.write = "seeds"
+                elif self.index_td == 5:
+                    self.write = "leech"
+
+                self.index_td += 1
+
+            # search for the torrent link by classes r0 or r1
+            if self.torrent_row and tag == 'a':
+                for name, value in attrs:
+                    if name == 'class' and 'r' in value:
+                        self.torrent['link'] = 'http://dl.kinozal.tv/download.php?id=' + attrs[0][1].split('=')[1]
+                        self.torrent['desc_link'] = self.url + attrs[0][1]
+                        self.write = "name"
+
+            # search for the right div with class paginator
+            if self.found_torrents == 50 and tag == 'div':
+                for name, value in attrs:
+                    if name == 'class' and value == 'paginator':
+                        self.paginator = True
+
+            # search for the block with page numbers
+            if self.paginator and tag == 'li':
+                self.pages += 1
+
+        def handle_endtag(self, tag):
+            # detect that the torrent row is closed and print all collected data
+            if self.torrent_row and tag == 'tr':
+                self.torrent["engine_url"] = self.url
+                logging.debug('tr: ' + str(self.torrent))
+                prettyPrinter(self.torrent)
+                self.torrent = {key: '' for key in self.torrent}
+                self.index_td = 0
+                self.torrent_row = False
+                self.found_torrents += 1
+
+            # detect that the table with results is closed
+            if self.result_table and tag == 'table':
+                self.result_table = False
+
+            # detect that we have seen all of the pagination
+            if self.paginator and tag == 'ul':
+                self.paginator = False
+
+        def handle_data(self, data: str):
+            # detect whether we need to write data at this moment
+            if self.write and self.result_table:
+                if self.write == 'size':
+                    data = self.units_convert(data)
+                self.torrent[self.write] = data.strip()
+                self.write = None
+
+        @staticmethod
+        def units_convert(unit):
+            # replace size units
+            table = {'ТБ': 'TB', 'ГБ': 'GB', 'МБ': 'MB', 'КБ': 'KB'}
+            x = unit.split(" ")
+            x[1] = table[x[1]]
+
+            return " ".join(x)
+
+        def error(self, message):
+            pass
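+
+    # Editorial sketch: units_convert() above only swaps the Cyrillic unit
+    # suffix; the numeric part is parsed later by anySizeToBytes() in
+    # novaprinter.py. For example:
+    #
+    #     kinozal.WorstParser.units_convert("500 КБ")   # -> "500 KB"
+    #     kinozal.WorstParser.units_convert("1.46 ГБ")  # -> "1.46 GB"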
+
+    def download_torrent(self, url):
+        # Create a torrent file
+        file, path = tempfile.mkstemp('.torrent')
+        file = os.fdopen(file, "wb")
+
+        # Download url
+        try:
+            response = self.session.open(url)
+            # Only continue if response status is OK.
+            if response.getcode() != 200:
+                raise HTTPError(response.geturl(), response.getcode(),
+                                "HTTP request to {} failed with status: {}".format(url, response.getcode()),
+                                response.info(), None)
+        except (URLError, HTTPError) as e:
+            logging.error(e)
+            raise e
+
+        # Write it to a file
+        file.write(response.read())
+        file.close()
+
+        # print the file path and url, as qBittorrent expects
+        logging.debug(path + " " + url)
+        print(path + " " + url)
+
+    def search(self, what, cat='all'):
+        query = '%s/browse.php?s=%s&c=%s' % (self.url, unquote(quote(what)), self.supported_categories[cat])
+        response = self.session.open(query)
+        parser = self.WorstParser(self.url)
+        parser.feed(response.read().decode('cp1251'))
+        parser.close()
+
+        # if the first request reports more pages, iterate over them too
+        if parser.pages:
+            for x in range(1, parser.pages):
+                response = self.session.open('%s&page=%s' % (query, x))
+                parser.feed(response.read().decode('cp1251'))
+                parser.close()
+
+
+# logging.debug("--- %s seconds ---" % (time.time() - start_time))
+if __name__ == "__main__":
+    kinozal_se = kinozal()
+    # print(os.path.abspath(os.path.join(os.path.dirname(__file__), '../..', 'logs')))
+    # print(kinozal_se.WorstParser.units_convert("500 КБ"))
+    # kinozal_se.search('terror lostfilm', 'tv')
+    # kinozal_se._handle_connection(True)
+    # kinozal_se.download_torrent('http://dl.kinozal.tv/download.php?id=1609776')
+    # print("--- %s seconds ---" % (time.time() - start_time))
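
The commented-out lines under `if __name__ == "__main__"` show how the author exercised the engine by hand. A minimal standalone smoke test could look like the sketch below; the query and category are placeholders, and real credentials must first be filled in on the class:

    # hypothetical smoke test, not part of this patch
    from kinozal import kinozal

    engine = kinozal()                       # logs in; raises URLError/HTTPError on failure
    engine.search('terror lostfilm', 'tv')   # prints one pipe-separated line per result
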
diff --git a/modules/helpers.py b/modules/helpers.py
new file mode 100644
index 0000000..f17dad5
--- /dev/null
+++ b/modules/helpers.py
@@ -0,0 +1,123 @@
+#VERSION: 1.42
+
+# Author:
+#  Christophe DUMEZ (chris@qbittorrent.org)
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+#    * Redistributions of source code must retain the above copyright notice,
+#      this list of conditions and the following disclaimer.
+#    * Redistributions in binary form must reproduce the above copyright
+#      notice, this list of conditions and the following disclaimer in the
+#      documentation and/or other materials provided with the distribution.
+#    * Neither the name of the author nor the names of its contributors may be
+#      used to endorse or promote products derived from this software without
+#      specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+import gzip
+import html.entities
+import io
+import os
+import re
+import socket
+import socks
+import tempfile
+import urllib.error
+import urllib.parse
+import urllib.request
+
+# Some sites block the default python User-agent
+user_agent = 'Mozilla/5.0 (X11; Linux i686; rv:38.0) Gecko/20100101 Firefox/38.0'
+headers = {'User-Agent': user_agent}
+# SOCKS5 Proxy support
+if "sock_proxy" in os.environ and len(os.environ["sock_proxy"].strip()) > 0:
+    proxy_str = os.environ["sock_proxy"].strip()
+    m = re.match(r"^(?:(?P<username>[^:]+):(?P<password>[^@]+)@)?(?P<host>[^:]+):(?P<port>\w+)$",
+                 proxy_str)
+    if m is not None:
+        socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, m.group('host'),
+                              int(m.group('port')), True, m.group('username'), m.group('password'))
+        socket.socket = socks.socksocket
+
+
+def htmlentitydecode(s):
+    # First convert alpha entities (such as &eacute;)
+    # (Inspired from http://mail.python.org/pipermail/python-list/2007-June/443813.html)
+    def entity2char(m):
+        entity = m.group(1)
+        if entity in html.entities.name2codepoint:
+            return chr(html.entities.name2codepoint[entity])
+        return " "  # Unknown entity: We replace with a space.
+    t = re.sub('&(%s);' % '|'.join(html.entities.name2codepoint), entity2char, s)
+
+    # Then convert numerical entities (such as &#233;)
+    t = re.sub(r'&#(\d+);', lambda x: chr(int(x.group(1))), t)
+
+    # Then convert hexa entities (such as &#x00E9;)
+    return re.sub(r'&#x(\w+);', lambda x: chr(int(x.group(1), 16)), t)
+
+
+def retrieve_url(url):
+    """ Return the content of the url page as a string """
+    req = urllib.request.Request(url, headers=headers)
+    try:
+        response = urllib.request.urlopen(req)
+    except urllib.error.URLError as errno:
+        print(" ".join(("Connection error:", str(errno.reason))))
+        return ""
+    dat = response.read()
+    # Check if it is gzipped
+    if dat[:2] == b'\x1f\x8b':
+        # Data is gzip encoded, decode it
+        compressedstream = io.BytesIO(dat)
+        gzipper = gzip.GzipFile(fileobj=compressedstream)
+        extracted_data = gzipper.read()
+        dat = extracted_data
+    info = response.info()
+    charset = 'utf-8'
+    try:
+        ignore, charset = info['Content-Type'].split('charset=')
+    except Exception:
+        pass
+    dat = dat.decode(charset, 'replace')
+    dat = htmlentitydecode(dat)
+    # return dat.encode('utf-8', 'replace')
+    return dat
+
+
+def download_file(url, referer=None):
+    """ Download file at url and write it to a file, return the path to the file and the url """
+    file, path = tempfile.mkstemp()
+    file = os.fdopen(file, "wb")
+    # Download url
+    req = urllib.request.Request(url, headers=headers)
+    if referer is not None:
+        req.add_header('referer', referer)
+    response = urllib.request.urlopen(req)
+    dat = response.read()
+    # Check if it is gzipped
+    if dat[:2] == b'\x1f\x8b':
+        # Data is gzip encoded, decode it
+        compressedstream = io.BytesIO(dat)
+        gzipper = gzip.GzipFile(fileobj=compressedstream)
+        extracted_data = gzipper.read()
+        dat = extracted_data
+
+    # Write it to a file
+    file.write(dat)
+    file.close()
+    # return file path
+    return (path + " " + url)
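
For reference, the `sock_proxy` branch above accepts `user:pass@host:port` as well as a bare `host:port`. A hedged usage sketch (the address is an assumption):

    import os
    os.environ["sock_proxy"] = "127.0.0.1:9050"  # must be set before helpers is imported

    import helpers
    page = helpers.retrieve_url("http://example.com/")  # now tunnelled through SOCKS5
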
diff --git a/modules/nova2.py b/modules/nova2.py
new file mode 100644
index 0000000..0b11e3c
--- /dev/null
+++ b/modules/nova2.py
@@ -0,0 +1,190 @@
+#VERSION: 1.43
+
+# Author:
+#  Fabien Devaux <fab@gnux.info>
+# Contributors:
+#  Christophe Dumez <chris@qbittorrent.org> (qbittorrent integration)
+#  Thanks to gab #gcu @ irc.freenode.net (multipage support on PirateBay)
+#  Thanks to Elias <gekko04@users.sourceforge.net> (torrentreactor and isohunt search engines)
+#
+# Licence: BSD
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+#    * Redistributions of source code must retain the above copyright notice,
+#      this list of conditions and the following disclaimer.
+#    * Redistributions in binary form must reproduce the above copyright
+#      notice, this list of conditions and the following disclaimer in the
+#      documentation and/or other materials provided with the distribution.
+#    * Neither the name of the author nor the names of its contributors may be
+#      used to endorse or promote products derived from this software without
+#      specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+import urllib.parse
+from os import path
+from glob import glob
+from sys import argv
+from multiprocessing import Pool, cpu_count
+
+THREADED = True
+try:
+    MAX_THREADS = cpu_count()
+except NotImplementedError:
+    MAX_THREADS = 1
+
+CATEGORIES = {'all', 'movies', 'tv', 'music', 'games', 'anime', 'software', 'pictures', 'books'}
+
+################################################################################
+# Every engine should have a "search" method taking
+# a space-free string as parameter (ex. "family+guy")
+# it should call prettyPrinter() with a dict as parameter.
+# The keys in the dict must be: link,name,size,seeds,leech,engine_url
+# As a convention, try to list results by decreasing number of seeds or similar
+################################################################################
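+
+# Editorial sketch: the smallest engine honouring this contract would look
+# roughly like the following (hypothetical example; nova2 also expects the
+# class name to match its file name under engines/):
+#
+#     from novaprinter import prettyPrinter
+#
+#     class example(object):
+#         url = 'http://example.com'
+#         name = 'Example'
+#         supported_categories = {'all': '0'}
+#
+#         def search(self, what, cat='all'):
+#             prettyPrinter({'link': self.url + '/dl/1', 'name': what,
+#                            'size': '1 KB', 'seeds': 0, 'leech': 0,
+#                            'engine_url': self.url})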
+
+
+def initialize_engines():
+    """ Import available engines
+
+    Return list of available engines
+    """
+    supported_engines = []
+
+    engines = glob(path.join(path.dirname(__file__), 'engines', '*.py'))
+    for engine in engines:
+        engi = path.basename(engine).split('.')[0].strip()
+        if len(engi) == 0 or engi.startswith('_'):
+            continue
+        try:
+            # import engines.[engine]
+            engine_module = __import__(".".join(("engines", engi)))
+            # get low-level module
+            engine_module = getattr(engine_module, engi)
+            # bind class name
+            globals()[engi] = getattr(engine_module, engi)
+            supported_engines.append(engi)
+        except Exception:
+            pass
+
+    return supported_engines
+
+
+def engines_to_xml(supported_engines):
+    """ Generates xml for supported engines """
+    tab = " " * 4
+
+    for short_name in supported_engines:
+        search_engine = globals()[short_name]()
+
+        supported_categories = ""
+        if hasattr(search_engine, "supported_categories"):
+            supported_categories = " ".join((key
+                                             for key in search_engine.supported_categories.keys()
+                                             if key != "all"))
+
+        yield "".join((tab, "<", short_name, ">\n",
+                       tab, tab, "<name>", search_engine.name, "</name>\n",
+                       tab, tab, "<url>", search_engine.url, "</url>\n",
+                       tab, tab, "<categories>", supported_categories, "</categories>\n",
+                       tab, "</", short_name, ">\n"))
+
+
+def displayCapabilities(supported_engines):
+    """
+    Display capabilities in XML format
+    <capabilities>
+      <engine_short_name>
+        <name>long name</name>
+        <url>http://example.com</url>
+        <categories>movies music games</categories>
+      </engine_short_name>
+    </capabilities>
+    """
+    xml = "".join(("<capabilities>\n",
+                   "".join(engines_to_xml(supported_engines)),
+                   "</capabilities>"))
+    print(xml)
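+
+# Editorial note: with the kinozal engine from this patch installed under
+# engines/, `./nova2.py --capabilities` would print roughly:
+#
+#     <capabilities>
+#         <kinozal>
+#             <name>Kinozal</name>
+#             <url>http://kinozal.tv</url>
+#             <categories>movies tv music games anime software</categories>
+#         </kinozal>
+#     </capabilities>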
+
+
+def run_search(engine_list):
+    """ Run search in engine
+
+    @param engine_list List with engine, query and category
+
+    @retval False if any exceptions occurred
+    @retval True  otherwise
+    """
+    engine, what, cat = engine_list
+    try:
+        engine = engine()
+        # avoid exceptions due to invalid category
+        if hasattr(engine, 'supported_categories'):
+            if cat in engine.supported_categories:
+                engine.search(what, cat)
+        else:
+            engine.search(what)
+
+        return True
+    except Exception:
+        return False
+
+
+def main(args):
+    supported_engines = initialize_engines()
+
+    if not args:
+        raise SystemExit("./nova2.py [all|engine1[,engine2]*] <category> <keywords>\n"
+                         "available engines: %s" % (','.join(supported_engines)))
+
+    elif args[0] == "--capabilities":
+        displayCapabilities(supported_engines)
+        return
+
+    elif len(args) < 3:
+        raise SystemExit("./nova2.py [all|engine1[,engine2]*] <category> <keywords>\n"
+                         "available engines: %s" % (','.join(supported_engines)))
+
+    # get only unique engines with set
+    engines_list = set(e.lower() for e in args[0].strip().split(','))
+
+    if 'all' in engines_list:
+        engines_list = supported_engines
+    else:
+        # discard un-supported engines
+        engines_list = [engine for engine in engines_list
+                        if engine in supported_engines]
+
+    if not engines_list:
+        # engine list is empty. Nothing to do here
+        return
+
+    cat = args[1].lower()
+
+    if cat not in CATEGORIES:
+        raise SystemExit(" - ".join(('Invalid category', cat)))
+
+    what = urllib.parse.quote(' '.join(args[2:]))
+    if THREADED:
+        # child process spawning is capped at min(number of searches, number of CPUs)
+        with Pool(min(len(engines_list), MAX_THREADS)) as pool:
+            pool.map(run_search, ([globals()[engine], what, cat] for engine in engines_list))
+    else:
+        # py3 note: map must be evaluated for its content to be executed
+        all(map(run_search, ([globals()[engine], what, cat] for engine in engines_list)))
+
+
+if __name__ == "__main__":
+    main(argv[1:])
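
main() can also be driven from Python, which is convenient for testing a single engine without qBittorrent. A sketch (engine name and keywords are placeholders; kinozal.py must sit in the engines/ directory for initialize_engines() to find it):

    from nova2 import main

    # equivalent to: ./nova2.py kinozal tv terror lostfilm
    main(["kinozal", "tv", "terror", "lostfilm"])
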
diff --git a/modules/nova2dl.py b/modules/nova2dl.py
new file mode 100644
index 0000000..c04a437
--- /dev/null
+++ b/modules/nova2dl.py
@@ -0,0 +1,63 @@
+#VERSION: 1.22
+
+# Author:
+#  Christophe DUMEZ (chris@qbittorrent.org)
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+#    * Redistributions of source code must retain the above copyright notice,
+#      this list of conditions and the following disclaimer.
+#    * Redistributions in binary form must reproduce the above copyright
+#      notice, this list of conditions and the following disclaimer in the
+#      documentation and/or other materials provided with the distribution.
+#    * Neither the name of the author nor the names of its contributors may be
+#      used to endorse or promote products derived from this software without
+#      specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+import sys
+import os
+import glob
+from helpers import download_file
+
+supported_engines = dict()
+
+engines = glob.glob(os.path.join(os.path.dirname(__file__), 'engines', '*.py'))
+for engine in engines:
+    e = engine.split(os.sep)[-1][:-3]
+    if len(e.strip()) == 0:
+        continue
+    if e.startswith('_'):
+        continue
+    try:
+        exec("from engines.%s import %s" % (e, e))
+        exec("engine_url = %s.url" % e)
+        supported_engines[engine_url] = e
+    except Exception:
+        pass
+
+if __name__ == '__main__':
+    if len(sys.argv) < 3:
+        raise SystemExit('./nova2dl.py engine_url download_parameter')
+    engine_url = sys.argv[1].strip()
+    download_param = sys.argv[2].strip()
+    if engine_url not in list(supported_engines.keys()):
+        raise SystemExit('./nova2dl.py: this engine_url was not recognized')
+    exec("engine = %s()" % supported_engines[engine_url])
+    if hasattr(engine, 'download_torrent'):
+        engine.download_torrent(download_param)
+    else:
+        print(download_file(download_param))
+    sys.exit(0)
diff --git a/modules/novaprinter.py b/modules/novaprinter.py
new file mode 100644
index 0000000..09250dc
--- /dev/null
+++ b/modules/novaprinter.py
@@ -0,0 +1,67 @@
+#VERSION: 1.46
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+#    * Redistributions of source code must retain the above copyright notice,
+#      this list of conditions and the following disclaimer.
+#    * Redistributions in binary form must reproduce the above copyright
+#      notice, this list of conditions and the following disclaimer in the
+#      documentation and/or other materials provided with the distribution.
+#    * Neither the name of the author nor the names of its contributors may be
+#      used to endorse or promote products derived from this software without
+#      specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+
+def prettyPrinter(dictionary):
+    dictionary['size'] = anySizeToBytes(dictionary['size'])
+    outtext = "|".join((dictionary["link"], dictionary["name"].replace("|", " "),
+                        str(dictionary["size"]), str(dictionary["seeds"]),
+                        str(dictionary["leech"]), dictionary["engine_url"]))
+    if 'desc_link' in dictionary:
+        outtext = "|".join((outtext, dictionary["desc_link"]))
+
+    # fd 1 is stdout
+    with open(1, 'w', encoding='utf-8', closefd=False) as utf8stdout:
+        print(outtext, file=utf8stdout)
+
+
+def anySizeToBytes(size_string):
+    """
+    Convert a string like '1 KB' to 1024 (bytes)
+    """
+    # separate integer from unit
+    try:
+        size, unit = size_string.split()
+    except Exception:
+        try:
+            size = size_string.strip()
+            unit = ''.join([c for c in size if c.isalpha()])
+            if len(unit) > 0:
+                size = size[:-len(unit)]
+        except Exception:
+            return -1
+    if len(size) == 0:
+        return -1
+    size = float(size)
+    if len(unit) == 0:
+        return int(size)
+    short_unit = unit.upper()[0]
+
+    # convert
+    units_dict = {'T': 40, 'G': 30, 'M': 20, 'K': 10}
+    if short_unit in units_dict:
+        size = size * 2**units_dict[short_unit]
+    return int(size)
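
anySizeToBytes() treats units as powers of two and truncates the result to an int; a few worked values:

    from novaprinter import anySizeToBytes

    anySizeToBytes("1 KB")    # 1024
    anySizeToBytes("500 KB")  # 512000
    anySizeToBytes("1.5 GB")  # 1610612736 == int(1.5 * 2**30)
    anySizeToBytes("oops")    # -1 (unparseable)
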
diff --git a/modules/sgmllib3.py b/modules/sgmllib3.py
new file mode 100644
index 0000000..88a02a3
--- /dev/null
+++ b/modules/sgmllib3.py
@@ -0,0 +1,547 @@
+"""A parser for SGML, using the derived class as a static DTD."""
+
+# XXX This only supports those SGML features used by HTML.
+
+# XXX There should be a way to distinguish between PCDATA (parsed
+# character data -- the normal case), RCDATA (replaceable character
+# data -- only char and entity references and end tags are special)
+# and CDATA (character data -- only end tags are special).  RCDATA is
+# not supported at all.
+
+import _markupbase
+import re
+
+__all__ = ["SGMLParser", "SGMLParseError"]
+
+# Regular expressions used for parsing
+
+interesting = re.compile('[&<]')
+incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
+                        '<([a-zA-Z][^<>]*|'
+                        '/([a-zA-Z][^<>]*)?|'
+                        '![^<>]*)?')
+
+entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
+charref = re.compile('&#([0-9]+)[^0-9]')
+
+starttagopen = re.compile('<[>a-zA-Z]')
+shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
+shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
+piclose = re.compile('>')
+endbracket = re.compile('[<>]')
+tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
+attrfind = re.compile(
+    r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
+    r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')
+
+
+class SGMLParseError(RuntimeError):
+    """Exception raised for all parse errors."""
+    pass
+
+
+# SGML parser base class -- find tags and call handler functions.
+# Usage: p = SGMLParser(); p.feed(data); ...; p.close().
+# The dtd is defined by deriving a class which defines methods
+# with special names to handle tags: start_foo and end_foo to handle
+# <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
+# (Tags are converted to lower case for this purpose.)  The data
+# between tags is passed to the parser by calling self.handle_data()
+# with some data as argument (the data may be split up in arbitrary
+# chunks).  Entity references are passed by calling
+# self.handle_entityref() with the entity reference as argument.
+
+class SGMLParser(_markupbase.ParserBase):
+    # Definition of entities -- derived classes may override
+    entity_or_charref = re.compile('&(?:'
+                                   '([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)'
+                                   ')(;?)')
+
+    def __init__(self, verbose=0):
+        """Initialize and reset this instance."""
+        self.verbose = verbose
+        self.reset()
+
+    def reset(self):
+        """Reset this instance. Loses all unprocessed data."""
+        self.__starttag_text = None
+        self.rawdata = ''
+        self.stack = []
+        self.lasttag = '???'
+        self.nomoretags = 0
+        self.literal = 0
+        _markupbase.ParserBase.reset(self)
+
+    def setnomoretags(self):
+        """Enter literal mode (CDATA) till EOF.
+
+        Intended for derived classes only.
+        """
+        self.nomoretags = self.literal = 1
+
+    def setliteral(self, *args):
+        """Enter literal mode (CDATA).
+
+        Intended for derived classes only.
+        """
+        self.literal = 1
+
+    def feed(self, data):
+        """Feed some data to the parser.
+
+        Call this as often as you want, with as little or as much text
+        as you want (may include '\n').  (This just saves the text,
+        all the processing is done by goahead().)
+        """
+
+        self.rawdata = self.rawdata + data
+        self.goahead(0)
+
+    def close(self):
+        """Handle the remaining data."""
+        self.goahead(1)
+
+    def error(self, message):
+        raise SGMLParseError(message)
+
+    # Internal -- handle data as far as reasonable.  May leave state
+    # and data to be processed by a subsequent call.  If 'end' is
+    # true, force handling all data as if followed by EOF marker.
+    def goahead(self, end):
+        rawdata = self.rawdata
+        i = 0
+        n = len(rawdata)
+        while i < n:
+            if self.nomoretags:
+                self.handle_data(rawdata[i:n])
+                i = n
+                break
+            match = interesting.search(rawdata, i)
+            if match: j = match.start()
+            else: j = n
+            if i < j:
+                self.handle_data(rawdata[i:j])
+            i = j
+            if i == n: break
+            if rawdata[i] == '<':
+                if starttagopen.match(rawdata, i):
+                    if self.literal:
+                        self.handle_data(rawdata[i])
+                        i = i+1
+                        continue
+                    k = self.parse_starttag(i)
+                    if k < 0: break
+                    i = k
+                    continue
+                if rawdata.startswith("</", i):
+                    k = self.parse_endtag(i)
+                    if k < 0: break
+                    i = k
+                    self.literal = 0
+                    continue
+                if self.literal:
+                    if n > (i + 1):
+                        self.handle_data("<")
+                        i = i+1
+                    else:
+                        # incomplete
+                        break
+                    continue
+                if rawdata.startswith("