From 66270f4ee43c15f25dd99c89c8e857642ff58d06 Mon Sep 17 00:00:00 2001
From: imDMG <imdmgg@gmail.com>
Date: Tue, 29 Jan 2019 23:16:00 +0500
Subject: [PATCH] v2: regex, threading - speeding up

---
 kinozal.py |   2 +-
 nnmclub.py | 191 ++++++++++++++---------------------------------
 2 files changed, 51 insertions(+), 142 deletions(-)

diff --git a/kinozal.py b/kinozal.py
index b3a7300..d422020 100644
--- a/kinozal.py
+++ b/kinozal.py
@@ -87,7 +87,7 @@ class kinozal(object):
 
         for tor in torrents:
             torrent = {"engine_url": self.url,
-                       "desc_link": tor[0],
+                       "desc_link": self.url + tor[0],
                        "name": tor[1],
                        "link": 'http://dl.kinozal.tv/download.php?id=' + tor[0].split('=')[1],
                        "size": self.units_convert(tor[2]),
diff --git a/nnmclub.py b/nnmclub.py
index b3f9716..ce04e15 100644
--- a/nnmclub.py
+++ b/nnmclub.py
@@ -1,19 +1,21 @@
-# VERSION: 1.2
+# VERSION: 2.0
 # AUTHORS: imDMG [imdmgg@gmail.com]
 
 # NoNaMe-Club search engine plugin for qBittorrent
 
-import tempfile
-import os
-import logging
 import json
+import logging
+import math
+import os
+import re
+import tempfile
+import threading
 import time
 
 from urllib.request import build_opener, HTTPCookieProcessor, ProxyHandler
 from urllib.parse import urlencode  # , parse_qs
 from urllib.error import URLError, HTTPError
 from http.cookiejar import Cookie, CookieJar
-from html.parser import HTMLParser
 from novaprinter import prettyPrinter
 
 # setup logging into qBittorrent/logs
@@ -43,7 +45,7 @@ class nnmclub(object):
     try:
         # try to load user data from file
         with open(os.path.abspath(os.path.join(os.path.dirname(__file__), 'nnmclub.json'))) as f:
-            config = json.load(f)
+            config: dict = json.load(f)
     except OSError as e:
         # file not found
         logging.error(e)
@@ -73,14 +75,11 @@ class nnmclub(object):
         response = self._catch_error_request(self.url + 'login.php')
 
         if not self.blocked:
-            parser = self.WorstParser(self.url, True)
-            parser.feed(response.read().decode('cp1251'))
-            parser.close()
-
+            code = re.search(r'code"\svalue="(.+?)"', response.read().decode('cp1251'))[1]
             form_data = {"username": self.config['username'],
                          "password": self.config['password'],
                          "autologin": "on",
-                         "code": parser.login_code,
+                         "code": code,
                          "login": "Вход"}
             # so we first encode keys to cp1251 then do default decode whole string
             data_encoded = urlencode({k: v.encode('cp1251') for k, v in form_data.items()}).encode()
@@ -92,122 +91,22 @@ class nnmclub(object):
         else:
             logging.info('We successfully authorized')
 
-    class WorstParser(HTMLParser):
-        def __init__(self, url='', login=False):
-            HTMLParser.__init__(self)
-            self.url = url
-            self.login = login
-            self.torrent = {'link': '',
-                            'name': '',
-                            'size': '',
-                            'seeds': '',
-                            'leech': '',
-                            'desc_link': '', }
-
-            self.login_code = None
-
-            # we need a page markup to know when stop and collect data,
-            # because available methods, in this class, do not communicate each other
-            # as a result, we make markup to transfer information
-            # from one method to another, along a chain
-            #
-            # markup on result table
-            self.result_table = False  # table with results is found
-            self.torrent_row = False  # found torrent row for collect data
-            self.index_td = 0  # td counter in torrent row
-            self.write = None  # trigger to detecting when to collect data
-
-            # markup pagination
-            self.paginator = False  # found more pages in result
-            self.pages = 0  # page counter
-
-            self.search_id = 0
-            self.found_torrents = 0
-
-        def handle_starttag(self, tag, attrs):
-            # login
-            if self.login and tag == 'input':
-                tmp = dict(attrs)
-                if tmp.get('name') == 'code':
-                    self.login_code = tmp['value']
-                    return
-
-            # search result table by class tablesorter
-            if tag == 'table':
-                for name, value in attrs:
-                    if name == 'class' and 'tablesorter' in value:
-                        self.result_table = True
-
-            # search for torrent row by class prow
-            if self.result_table and tag == 'tr':
-                for name, value in attrs:
-                    if name == 'class' and 'prow' in value:
-                        self.torrent_row = True
-
-            # count td for find right td
-            if self.torrent_row and tag == 'td':
-                if self.index_td == 5:
-                    self.write = "size"
-                elif self.index_td == 7:
-                    self.write = "seeds"
-                elif self.index_td == 8:
-                    self.write = "leech"
-
-                self.index_td += 1
-
-            # search for torrent link by classes r0 or r1
-            if self.torrent_row and tag == 'a':
-                if self.index_td == 3:
-                    self.torrent['desc_link'] = self.url + attrs[1][1]
-                    self.write = "name"
-
-                if self.index_td == 5:
-                    self.torrent['link'] = self.url + attrs[0][1]
-
-            # search for right div with class paginator
-            if self.found_torrents == 50 and tag == 'span':
-                for name, value in attrs:
-                    if name == 'class' and value == 'nav':
-                        self.paginator = True
-
-            # search for block with page numbers
-            if self.paginator and tag == 'a':
-                # if not self.pages:
-                #     parsing for search_id
-                #     self.search_id = parse_qs(attrs[0][1].split('?')[1])['search_id']
-                self.pages += 1
-
-        def handle_endtag(self, tag):
-            # detecting that torrent row is closed and print all collected data
-            if self.torrent_row and tag == 'tr':
-                self.torrent["engine_url"] = self.url
-                logging.debug('torrent row: ' + str(self.torrent))
-                prettyPrinter(self.torrent)
-                self.torrent = {key: '' for key in self.torrent}
-                self.index_td = 0
-                self.torrent_row = False
-                self.found_torrents += 1
-
-            # detecting that table with result is close
-            if self.result_table and tag == 'table':
-                self.result_table = False
-
-            # detecting that we found all pagination
-            if self.paginator and tag == 'span':
-                self.paginator = False
-
-        def handle_data(self, data: str):
-            # detecting that we need write data at this moment
-            if self.write and self.result_table:
-                if data.startswith('<u>'):
-                    data = data[3:-5]
-                if self.index_td == 5:
-                    data = data.split('</u>')[1].strip()
-                self.torrent[self.write] = data.strip()
-                self.write = None
-
-        def error(self, message):
-            pass
+    def draw(self, html: str):
+        torrents = re.findall(r'd\stopic.+?href="(.+?)".+?<b>(.+?)</b>.+?href="(d.+?)"'
+                              r'.+?/u>\s(.+?)<.+?b>(\d+)</b><.+?b>(\d+)<', html, re.S)
+
+        for tor in torrents:
+            torrent = {"engine_url": self.url,
+                       "desc_link": self.url + tor[0],
+                       "name": tor[1],
+                       "link": self.url + tor[2],
+                       "size": tor[3].replace(',', '.'),
+                       "seeds": tor[4],
+                       "leech": tor[5]}
+
+            prettyPrinter(torrent)
+        del torrents
+        # return len(torrents)
 
     def download_torrent(self, url):
         if self.blocked:
@@ -227,27 +126,37 @@ class nnmclub(object):
         logging.debug(path + " " + url)
         print(path + " " + url)
 
+    def searching(self, query, first=False):
+        response = self._catch_error_request(query)
+        page = response.read().decode('cp1251')
+        self.draw(page)
+        total = int(re.search(r'\(max:\s(\d{1,3})\)', page)[1]) if first else -1
+
+        return total
+
     def search(self, what, cat='all'):
         if self.blocked:
            return
         c = self.supported_categories[cat]
         query = '{}tracker.php?nm={}&{}'.format(self.url, what.replace(" ", "+"), "f=-1" if c == '-1' else "c=" + c)
-        response = self._catch_error_request(query)
-        parser = self.WorstParser(self.url)
-        parser.feed(response.read().decode('cp1251'))
-        parser.close()
-
-        # if first request return that we have pages, we do cycle
-        if parser.pages:
-            for x in range(1, parser.pages):
-                response = self._catch_error_request('{}&start={}'.format(query,  # &search_id=
-                                                                           # parser.search_id,
-                                                                           parser.found_torrents))
-                parser.feed(response.read().decode('cp1251'))
-                parser.close()
+
+        # make the first request (it may already be enough)
+        total = self.searching(query, True)
+        # run the remaining page requests in parallel threads
+        if total > 50:
+            tasks = []
+            for x in range(1, math.ceil(total / 50)):
+                task = threading.Thread(target=self.searching, args=(query + "&start={}".format(x * 50),))
+                tasks.append(task)
+                task.start()
+
+            # wait for the slowest request to finish
+            for task in tasks:
+                task.join()
+            del tasks
 
         logging.debug("--- {} seconds ---".format(time.time() - start_time))
-        logging.info("Found torrents: {}".format(parser.found_torrents))
+        logging.info("Found torrents: {}".format(total))
 
     def _catch_error_request(self, url='', data=None):
         url = url if url else self.url
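
Reviewer note, not part of the patch: two techniques carry the speed-up, so two small
standalone sketches follow. Names such as fetch_page, fetch_remaining, BASE_QUERY,
PER_PAGE, the example URL, and the toy HTML are hypothetical, for illustration only;
they are not identifiers from the plugin.

First, the regex half. draw() now pulls all six torrent fields out of the raw cp1251
page in a single re.findall pass with re.S (so .+? can cross line breaks), instead of
walking tags with HTMLParser. A toy version of the idea over simplified markup:

import re

# hypothetical, simplified markup; not the tracker's real HTML
html = ('<tr><a href="viewtopic.php?t=1"><b>Name</b></a>'
        '<a href="download.php?id=1">DL</a></tr>')
# one pass over the whole page; re.S lets .+? span multiple lines
rows = re.findall(r'href="(viewtopic.+?)".+?<b>(.+?)</b>.+?href="(download.+?)"',
                  html, re.S)
print(rows)  # [('viewtopic.php?t=1', 'Name', 'download.php?id=1')]

Second, the threading half. search() fetches the first page, reads the total hit
count, then fans out one thread per remaining 50-result page and joins them all, so
the overall wait is roughly the slowest single request rather than the sum of all of
them. Plain threads suit this workload because each request is network-bound and the
GIL is released during I/O. A minimal sketch of that fan-out/join pattern, under the
assumptions stated above:

import math
import threading
from urllib.request import urlopen

BASE_QUERY = "https://example.org/tracker.php?nm=test"  # hypothetical endpoint
PER_PAGE = 50  # the tracker serves at most 50 results per page

def fetch_page(query):
    # fetch and decode one result page; the plugin would hand it to draw()
    with urlopen(query, timeout=10) as response:
        return response.read().decode('cp1251')

def fetch_remaining(total):
    # start one thread per page after the first, then wait for the slowest one
    tasks = []
    for x in range(1, math.ceil(total / PER_PAGE)):
        task = threading.Thread(target=fetch_page,
                                args=("{}&start={}".format(BASE_QUERY, x * PER_PAGE),))
        tasks.append(task)
        task.start()
    for task in tasks:
        task.join()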