v2: regex, threading - speeding up

6 years ago · 66270f4ee4
2 changed files with 51 additions and 142 deletions
--- a/kinozal.py
+++ b/kinozal.py
@ -87,7 +87,7 @@ class kinozal(object):
        for tor in torrents:
            torrent = {"engine_url": self.url,
-                       "desc_link": tor[0],
+                       "desc_link": self.url + tor[0],
                       "name": tor[1],
                       "link": 'http://dl.kinozal.tv/download.php?id=' + tor[0].split('=')[1],
                       "size": self.units_convert(tor[2]),
--- a/nnmclub.py
+++ b/nnmclub.py
@ -1,19 +1,21 @@
-# VERSION: 1.2
+# VERSION: 2.0
 # AUTHORS: imDMG [imdmgg@gmail.com]
 # NoNaMe-Club search engine plugin for qBittorrent
 import tempfile
 import os
 import logging
 import json
 import logging
 import math
 import os
 import re
 import tempfile
 import threading
 import time
 from urllib.request import build_opener, HTTPCookieProcessor, ProxyHandler
 from urllib.parse import urlencode  # , parse_qs
 from urllib.error import URLError, HTTPError
 from http.cookiejar import Cookie, CookieJar
 from html.parser import HTMLParser
 from novaprinter import prettyPrinter
 # setup logging into qBittorrent/logs
@ -43,7 +45,7 @@ class nnmclub(object):
    try:
        # try to load user data from file
        with open(os.path.abspath(os.path.join(os.path.dirname(__file__), 'nnmclub.json'))) as f:
-            config = json.load(f)
+            config: dict = json.load(f)
    except OSError as e:
        # file not found
        logging.error(e)
@ -73,14 +75,11 @@ class nnmclub(object):
        response = self._catch_error_request(self.url + 'login.php')
        if not self.blocked:
-            parser = self.WorstParser(self.url, True)
+            code = re.search(r'code"\svalue="(.+?)"', response.read().decode('cp1251'))[1]
            parser.feed(response.read().decode('cp1251'))
            parser.close()
            form_data = {"username": self.config['username'],
                         "password": self.config['password'],
                         "autologin": "on",
-                         "code": parser.login_code,
+                         "code": code,
                         "login": "Вход"}
            # so we first encode the values to cp1251, then apply the default encoding to the whole urlencoded string
            data_encoded = urlencode({k: v.encode('cp1251') for k, v in form_data.items()}).encode()
@ -92,122 +91,22 @@ class nnmclub(object):
            else:
                logging.info('We successfully authorized')
-    class WorstParser(HTMLParser):
+    def draw(self, html: str):
-        def __init__(self, url='', login=False):
+        torrents = re.findall(r'd\stopic.+?href="(.+?)".+?<b>(.+?)</b>.+?href="(d.+?)"'
-            HTMLParser.__init__(self)
+                              r'.+?/u>\s(.+?)<.+?b>(\d+)</.+?b>(\d+)<', html, re.S)
-            self.url = url
+
-            self.login = login
+        for tor in torrents:
-            self.torrent = {'link': '',
+            torrent = {"engine_url": self.url,
-                            'name': '',
+                       "desc_link": self.url + tor[0],
-                            'size': '',
+                       "name": tor[1],
-                            'seeds': '',
+                       "link": self.url + tor[2],
-                            'leech': '',
+                       "size": tor[3].replace(',', '.'),
-                            'desc_link': '', }
+                       "seeds": tor[4],
-
+                       "leech": tor[5]}
            self.login_code = None
            # we need a page markup to know when stop and collect data,
            # because available methods, in this class, do not communicate each other
            # as a result, we make markup to transfer information
            # from one method to another, along a chain
            #
            # markup on result table
            self.result_table = False  # table with results is found
            self.torrent_row = False  # found torrent row for collect data
            self.index_td = 0  # td counter in torrent row
            self.write = None  # trigger to detecting when to collect data
            # markup pagination
            self.paginator = False  # found more pages in result
            self.pages = 0  # page counter
            self.search_id = 0
            self.found_torrents = 0
        def handle_starttag(self, tag, attrs):
            # login
            if self.login and tag == 'input':
                tmp = dict(attrs)
                if tmp.get('name') == 'code':
                    self.login_code = tmp['value']
                    return
-            # search result table by class tablesorter
+            prettyPrinter(torrent)
-            if tag == 'table':
+        del torrents
-                for name, value in attrs:
+        # return len(torrents)
                    if name == 'class' and 'tablesorter' in value:
                        self.result_table = True
            # search for torrent row by class prow
            if self.result_table and tag == 'tr':
                for name, value in attrs:
                    if name == 'class' and 'prow' in value:
                        self.torrent_row = True
            # count td for find right td
            if self.torrent_row and tag == 'td':
                if self.index_td == 5:
                    self.write = "size"
                elif self.index_td == 7:
                    self.write = "seeds"
                elif self.index_td == 8:
                    self.write = "leech"
                self.index_td += 1
            # search for torrent link by classes r0 or r1
            if self.torrent_row and tag == 'a':
                if self.index_td == 3:
                    self.torrent['desc_link'] = self.url + attrs[1][1]
                    self.write = "name"
                if self.index_td == 5:
                    self.torrent['link'] = self.url + attrs[0][1]
            # search for right div with class paginator
            if self.found_torrents == 50 and tag == 'span':
                for name, value in attrs:
                    if name == 'class' and value == 'nav':
                        self.paginator = True
            # search for block with page numbers
            if self.paginator and tag == 'a':
                # if not self.pages:
                    # parsing for search_id
                    # self.search_id = parse_qs(attrs[0][1].split('?')[1])['search_id']
                self.pages += 1
        def handle_endtag(self, tag):
            # detecting that torrent row is closed and print all collected data
            if self.torrent_row and tag == 'tr':
                self.torrent["engine_url"] = self.url
                logging.debug('torrent row: ' + str(self.torrent))
                prettyPrinter(self.torrent)
                self.torrent = {key: '' for key in self.torrent}
                self.index_td = 0
                self.torrent_row = False
                self.found_torrents += 1
            # detecting that table with result is close
            if self.result_table and tag == 'table':
                self.result_table = False
            # detecting that we found all pagination
            if self.paginator and tag == 'span':
                self.paginator = False
        def handle_data(self, data: str):
            # detecting that we need write data at this moment
            if self.write and self.result_table:
                if data.startswith('<b>'):
                    data = data[3:-5]
                if self.index_td == 5:
                    data = data.split('</u>')[1].strip()
                self.torrent[self.write] = data.strip()
                self.write = None
        def error(self, message):
            pass
    def download_torrent(self, url):
        if self.blocked:
@ -227,27 +126,37 @@ class nnmclub(object):
        logging.debug(path + " " + url)
        print(path + " " + url)
    def searching(self, query, first=False):
        """Request one results page, print its torrents and report the total.

        :param query: URL handed to ``self._catch_error_request``.
        :param first: when True, the "(max: N)" counter is parsed from the page.
        :return: the parsed counter when ``first`` is True (0 if the page has
                 no counter), otherwise -1.
        """
        response = self._catch_error_request(query)
        page = response.read().decode('cp1251')
        self.draw(page)
        if not first:
            return -1
        # re.search() returns None when the counter is missing; subscripting
        # that result directly raised TypeError, so treat it as zero results
        found = re.search(r'\(max:\s(\d{1,3})\)', page)
        return int(found[1]) if found else 0
    def search(self, what, cat='all'):
        if self.blocked:
            return
        c = self.supported_categories[cat]
        query = '{}tracker.php?nm={}&{}'.format(self.url, what.replace(" ", "+"), "f=-1" if c == '-1' else "c=" + c)
-        response = self._catch_error_request(query)
+
-        parser = self.WorstParser(self.url)
+        # make the first request (maybe it is enough)
-        parser.feed(response.read().decode('cp1251'))
+        total = self.searching(query, True)
-        parser.close()
+        # do async requests
-
+        if total > 50:
-        # if first request return that we have pages, we do cycle
+            tasks = []
-        if parser.pages:
+            for x in range(1, math.ceil(total / 50)):
-            for x in range(1, parser.pages):
+                task = threading.Thread(target=self.searching, args=(query + "&start={}".format(x * 50),))
-                response = self._catch_error_request('{}&start={}'.format(query,      # &search_id=
+                tasks.append(task)
-                                                                                      # parser.search_id,
+                task.start()
-                                                                                      parser.found_torrents))
+
-                parser.feed(response.read().decode('cp1251'))
+            # wait for the slowest request in the stack
-                parser.close()
+            for task in tasks:
                task.join()
            del tasks
        logging.debug("--- {} seconds ---".format(time.time() - start_time))
-        logging.info("Found torrents: {}".format(parser.found_torrents))
+        logging.info("Found torrents: {}".format(total))
    def _catch_error_request(self, url='', data=None):
        url = url if url else self.url