From f81e7610954ca74b8c71672940ef9b65aab1f8a7 Mon Sep 17 00:00:00 2001 From: imDMG Date: Tue, 29 Jan 2019 21:08:08 +0500 Subject: [PATCH] v2:regex, threading --- kinozal.py | 209 ++++++++++++++++++++--------------------------------- 1 file changed, 80 insertions(+), 129 deletions(-) diff --git a/kinozal.py b/kinozal.py index cc96880..b3a7300 100644 --- a/kinozal.py +++ b/kinozal.py @@ -1,20 +1,21 @@ -# VERSION: 1.3 +# VERSION: 2.0 # AUTHORS: imDMG [imdmgg@gmail.com] # Kinozal.tv search engine plugin for qBittorrent -import tempfile -import os -import logging import json -# import re +import logging +import math +import os +import re +import tempfile +import threading import time from urllib.request import build_opener, HTTPCookieProcessor, ProxyHandler from urllib.parse import urlencode from urllib.error import URLError, HTTPError from http.cookiejar import CookieJar -from html.parser import HTMLParser from novaprinter import prettyPrinter # setup logging into qBittorrent/logs @@ -24,7 +25,6 @@ logging.basicConfig(level=logging.INFO, filename=os.path.abspath(os.path.join(os.path.dirname(__file__), '../../logs', 'kinozal.log')), filemode='w') -# benchmark start_time = time.time() @@ -51,6 +51,8 @@ class kinozal(object): raise e def __init__(self): + logging.info('Initialisation') + self.result = [] # establish connection # # make cookie @@ -79,113 +81,30 @@ class kinozal(object): else: logging.info('We successfully authorized') - class WorstParser(HTMLParser): - def __init__(self, url=''): - HTMLParser.__init__(self) - self.url = url - self.torrent = {'link': '', - 'name': '', - 'size': '', - 'seeds': '', - 'leech': '', - 'desc_link': '', } - - # we need a page markup to know when stop and collect data, - # because available methods, in this class, do not communicate each other - # as a result, we make markup to transfer information - # from one method to another, along a chain - # - # markup on result table - self.result_table = False # table with results is found - self.torrent_row = False # found torrent row for collect data - self.index_td = 0 # td counter in torrent row - self.write = None # trigger to detecting when to collect data - - # markup pagination - self.paginator = False # found more pages in result - self.pages = 0 # page counter - - self.found_torrents = 0 - - def handle_starttag(self, tag, attrs): - # search result table by class t_peer - if tag == 'table': - for name, value in attrs: - if name == 'class' and 't_peer' in value: - self.result_table = True - - # search for torrent row by class bg - if self.result_table and tag == 'tr': - for name, value in attrs: - if name == 'class' and 'bg' in value: - self.torrent_row = True - - # count td for find right td - if self.torrent_row and tag == 'td': - if self.index_td == 3: - self.write = "size" - elif self.index_td == 4: - self.write = "seeds" - elif self.index_td == 5: - self.write = "leech" - - self.index_td += 1 - - # search for torrent link by classes r0 or r1 - if self.torrent_row and tag == 'a': - for name, value in attrs: - if name == 'class' and 'r' in value: - self.torrent['link'] = 'http://dl.kinozal.tv/download.php?id=' + attrs[0][1].split('=')[1] - self.torrent['desc_link'] = self.url + attrs[0][1] - self.write = "name" - - # search for right div with class paginator - if self.found_torrents == 50 and tag == 'div': - for name, value in attrs: - if name == 'class' and value == 'paginator': - self.paginator = True - - # search for block with page numbers - if self.paginator and tag == 'li': - self.pages += 1 - - def handle_endtag(self, tag): - # detecting that torrent row is closed and print all collected data - if self.torrent_row and tag == 'tr': - self.torrent["engine_url"] = self.url - logging.debug('self.torrent: ' + str(self.torrent)) - prettyPrinter(self.torrent) - self.torrent = {key: '' for key in self.torrent} - self.index_td = 0 - self.torrent_row = False - self.found_torrents += 1 - - # detecting that table with result is close - if self.result_table and tag == 'table': - self.result_table = False - - # detecting that we found all pagination - if self.paginator and tag == 'ul': - self.paginator = False - - def handle_data(self, data: str): - # detecting that we need write data at this moment - if self.write and self.result_table: - if self.write == 'size': - data = self.units_convert(data) - self.torrent[self.write] = data.strip() - self.write = None - - @staticmethod - def units_convert(unit): - # replace size units - find = unit.split()[1] - replace = {'ТБ': 'TB', 'ГБ': 'GB', 'МБ': 'MB', 'КБ': 'KB'}[find] - - return unit.replace(find, replace) - - def error(self, message): - pass + def draw(self, html: str): + torrents = re.findall(r'nam">(.*?)' + r'.+?s\'>.+?s\'>(.*?)<.+?sl_s\'>(\d+)<.+?sl_p\'>(\d+)<', html, re.S) + + for tor in torrents: + torrent = {"engine_url": self.url, + "desc_link": tor[0], + "name": tor[1], + "link": 'http://dl.kinozal.tv/download.php?id=' + tor[0].split('=')[1], + "size": self.units_convert(tor[2]), + "seeds": tor[3], + "leech": tor[4]} + + prettyPrinter(torrent) + del torrents + # return len(torrents) + + @staticmethod + def units_convert(unit): + # replace size units + find = unit.split()[1] + replace = {'ТБ': 'TB', 'ГБ': 'GB', 'МБ': 'MB', 'КБ': 'KB'}[find] + + return unit.replace(find, replace) def download_torrent(self, url: str): if self.blocked: @@ -214,24 +133,54 @@ class kinozal(object): logging.debug(path + " " + url) print(path + " " + url) - def search(self, what, cat='all'): + def searching(self, query, first=False): + response = self._catch_error_request(query) + page = response.read().decode('cp1251') + self.draw(page) + total = int(re.search(r'Найдено\s+?(\d+)\s+?раздач', page)[1]) if first else -1 + + return total + + def search_old(self, what, cat='all'): if self.blocked: return - query = '{}/browse.php?s={}&c={}'.format(self.url, what.replace(" ", "+"), self.supported_categories[cat]) - response = self._catch_error_request(query) - parser = self.WorstParser(self.url) - parser.feed(response.read().decode('cp1251')) - parser.close() + total, current = -1, 0 + while total != current: + query = '{}/browse.php?s={}&c={}&page={}'.format(self.url, what.replace(" ", "+"), + self.supported_categories[cat], + math.ceil(current / 50)) + response = self._catch_error_request(query) + page = response.read().decode('cp1251') + if total == -1: + total = int(re.search(r'Найдено\s+?(\d+)\s+?раздач', page)[1]) + current += self.draw(page) - # if first request return that we have pages, we do cycle - if parser.pages: - for x in range(1, parser.pages): - response = self._catch_error_request('{}&page={}'.format(query, x)) - parser.feed(response.read().decode('cp1251')) - parser.close() + logging.debug("--- {} seconds ---".format(time.time() - start_time)) + logging.info("Found torrents: {}".format(total)) + + def search(self, what, cat='all'): + if self.blocked: + return + query = '{}/browse.php?s={}&c={}'.format(self.url, what.replace(" ", "+"), + self.supported_categories[cat]) + + # make first request (maybe it enough) + total = self.searching(query, True) + # do async requests + if total > 50: + tasks = [] + for x in range(1, math.ceil(total / 50)): + task = threading.Thread(target=self.searching, args=(query + "&page={}".format(x),)) + tasks.append(task) + task.start() + + # wait slower request in stack + for task in tasks: + task.join() + del tasks logging.debug("--- {} seconds ---".format(time.time() - start_time)) - logging.info("Found torrents: {}".format(parser.found_torrents)) + logging.info("Found torrents: {}".format(total)) def _catch_error_request(self, url='', data=None): url = url if url else self.url @@ -258,6 +207,8 @@ class kinozal(object): if __name__ == "__main__": + # f = open("result.html", "r") kinozal_se = kinozal() - # kinozal_se.download_torrent("http://kinozal.tv/details.php?id=1263407") - # kinozal_se.search('supernatural') + # kinozal_se.draw(f.read()) + kinozal_se.search('doctor') + print("--- %s seconds ---" % (time.time() - start_time))