v2: regex, threading - speeding up

2025-03-13 05:41:30 +00:00 · 2019-01-29 23:16:00 +05:00 · 2019-01-29 23:16:00 +05:00 · 66270f4ee4
commit 66270f4ee4
parent f81e761095
2 changed files with 48 additions and 139 deletions
--- a/kinozal.py
+++ b/kinozal.py
@ -87,7 +87,7 @@ class kinozal(object):

        for tor in torrents:
            torrent = {"engine_url": self.url,
-                       "desc_link": tor[0],
+                       "desc_link": self.url + tor[0],
                       "name": tor[1],
                       "link": 'http://dl.kinozal.tv/download.php?id=' + tor[0].split('=')[1],
                       "size": self.units_convert(tor[2]),
--- a/nnmclub.py
+++ b/nnmclub.py
@ -1,19 +1,21 @@
-# VERSION: 1.2
+# VERSION: 2.0
 # AUTHORS: imDMG [imdmgg@gmail.com]

 # NoNaMe-Club search engine plugin for qBittorrent

-import tempfile
-import os
-import logging
 import json
+import logging
+import math
+import os
+import re
+import tempfile
+import threading
 import time

 from urllib.request import build_opener, HTTPCookieProcessor, ProxyHandler
 from urllib.parse import urlencode  # , parse_qs
 from urllib.error import URLError, HTTPError
 from http.cookiejar import Cookie, CookieJar
-from html.parser import HTMLParser
 from novaprinter import prettyPrinter

 # setup logging into qBittorrent/logs
@ -43,7 +45,7 @@ class nnmclub(object):
    try:
        # try to load user data from file
        with open(os.path.abspath(os.path.join(os.path.dirname(__file__), 'nnmclub.json'))) as f:
-            config = json.load(f)
+            config: dict = json.load(f)
    except OSError as e:
        # file not found
        logging.error(e)
@ -73,14 +75,11 @@ class nnmclub(object):

        response = self._catch_error_request(self.url + 'login.php')
        if not self.blocked:
-            parser = self.WorstParser(self.url, True)
-            parser.feed(response.read().decode('cp1251'))
-            parser.close()
-
+            code = re.search(r'code"\svalue="(.+?)"', response.read().decode('cp1251'))[1]
            form_data = {"username": self.config['username'],
                         "password": self.config['password'],
                         "autologin": "on",
-                         "code": parser.login_code,
+                         "code": code,
                         "login": "Вход"}
            # so we first encode the values to cp1251, then default-encode the whole urlencoded string
            data_encoded = urlencode({k: v.encode('cp1251') for k, v in form_data.items()}).encode()
@ -92,122 +91,22 @@ class nnmclub(object):
            else:
                logging.info('We successfully authorized')

-    class WorstParser(HTMLParser):
-        def __init__(self, url='', login=False):
-            HTMLParser.__init__(self)
-            self.url = url
-            self.login = login
-            self.torrent = {'link': '',
-                            'name': '',
-                            'size': '',
-                            'seeds': '',
-                            'leech': '',
-                            'desc_link': '', }
+    def draw(self, html: str):
+        torrents = re.findall(r'd\stopic.+?href="(.+?)".+?<b>(.+?)</b>.+?href="(d.+?)"'
+                              r'.+?/u>\s(.+?)<.+?b>(\d+)</.+?b>(\d+)<', html, re.S)

-            self.login_code = None
+        for tor in torrents:
+            torrent = {"engine_url": self.url,
+                       "desc_link": self.url + tor[0],
+                       "name": tor[1],
+                       "link": self.url + tor[2],
+                       "size": tor[3].replace(',', '.'),
+                       "seeds": tor[4],
+                       "leech": tor[5]}

-            # we need a page markup to know when stop and collect data,
-            # because available methods, in this class, do not communicate each other
-            # as a result, we make markup to transfer information
-            # from one method to another, along a chain
-            #
-            # markup on result table
-            self.result_table = False  # table with results is found
-            self.torrent_row = False  # found torrent row for collect data
-            self.index_td = 0  # td counter in torrent row
-            self.write = None  # trigger to detecting when to collect data
-
-            # markup pagination
-            self.paginator = False  # found more pages in result
-            self.pages = 0  # page counter
-
-            self.search_id = 0
-            self.found_torrents = 0
-
-        def handle_starttag(self, tag, attrs):
-            # login
-            if self.login and tag == 'input':
-                tmp = dict(attrs)
-                if tmp.get('name') == 'code':
-                    self.login_code = tmp['value']
-                    return
-
-            # search result table by class tablesorter
-            if tag == 'table':
-                for name, value in attrs:
-                    if name == 'class' and 'tablesorter' in value:
-                        self.result_table = True
-
-            # search for torrent row by class prow
-            if self.result_table and tag == 'tr':
-                for name, value in attrs:
-                    if name == 'class' and 'prow' in value:
-                        self.torrent_row = True
-
-            # count td for find right td
-            if self.torrent_row and tag == 'td':
-                if self.index_td == 5:
-                    self.write = "size"
-                elif self.index_td == 7:
-                    self.write = "seeds"
-                elif self.index_td == 8:
-                    self.write = "leech"
-
-                self.index_td += 1
-
-            # search for torrent link by classes r0 or r1
-            if self.torrent_row and tag == 'a':
-                if self.index_td == 3:
-                    self.torrent['desc_link'] = self.url + attrs[1][1]
-                    self.write = "name"
-
-                if self.index_td == 5:
-                    self.torrent['link'] = self.url + attrs[0][1]
-
-            # search for right div with class paginator
-            if self.found_torrents == 50 and tag == 'span':
-                for name, value in attrs:
-                    if name == 'class' and value == 'nav':
-                        self.paginator = True
-
-            # search for block with page numbers
-            if self.paginator and tag == 'a':
-                # if not self.pages:
-                    # parsing for search_id
-                    # self.search_id = parse_qs(attrs[0][1].split('?')[1])['search_id']
-                self.pages += 1
-
-        def handle_endtag(self, tag):
-            # detecting that torrent row is closed and print all collected data
-            if self.torrent_row and tag == 'tr':
-                self.torrent["engine_url"] = self.url
-                logging.debug('torrent row: ' + str(self.torrent))
-                prettyPrinter(self.torrent)
-                self.torrent = {key: '' for key in self.torrent}
-                self.index_td = 0
-                self.torrent_row = False
-                self.found_torrents += 1
-
-            # detecting that table with result is close
-            if self.result_table and tag == 'table':
-                self.result_table = False
-
-            # detecting that we found all pagination
-            if self.paginator and tag == 'span':
-                self.paginator = False
-
-        def handle_data(self, data: str):
-            # detecting that we need write data at this moment
-            if self.write and self.result_table:
-                if data.startswith('<b>'):
-                    data = data[3:-5]
-                if self.index_td == 5:
-                    data = data.split('</u>')[1].strip()
-                self.torrent[self.write] = data.strip()
-                self.write = None
-
-        def error(self, message):
-            pass
+            prettyPrinter(torrent)
+        del torrents
+        # return len(torrents)

    def download_torrent(self, url):
        if self.blocked:
@ -227,27 +126,37 @@ class nnmclub(object):
        logging.debug(path + " " + url)
        print(path + " " + url)

+    def searching(self, query, first=False):
+        """Request one result page, hand it to draw() and report the total.
+
+        :param query: full URL of the result page to request
+        :param first: when True, also read the number from the page's
+                      "(max: N)" marker, which search() uses as the total
+        :return: the marker value for the first page (0 when the page has
+                 no such marker), otherwise -1
+        """
+        response = self._catch_error_request(query)
+        page = response.read().decode('cp1251')
+        self.draw(page)
+        if not first:
+            return -1
+
+        # re.search returns None when the marker is absent; subscripting it
+        # directly would raise TypeError and abort the whole search
+        found = re.search(r'\(max:\s(\d{1,3})\)', page)
+        return int(found[1]) if found else 0
+
+
    def search(self, what, cat='all'):
        if self.blocked:
            return
        c = self.supported_categories[cat]
        query = '{}tracker.php?nm={}&{}'.format(self.url, what.replace(" ", "+"), "f=-1" if c == '-1' else "c=" + c)
-        response = self._catch_error_request(query)
-        parser = self.WorstParser(self.url)
-        parser.feed(response.read().decode('cp1251'))
-        parser.close()

-        # if first request return that we have pages, we do cycle
-        if parser.pages:
-            for x in range(1, parser.pages):
-                response = self._catch_error_request('{}&start={}'.format(query,      # &search_id=
-                                                                                      # parser.search_id,
-                                                                                      parser.found_torrents))
-                parser.feed(response.read().decode('cp1251'))
-                parser.close()
+        # make the first request (it may be enough)
+        total = self.searching(query, True)
+        # do async requests
+        if total > 50:
+            tasks = []
+            for x in range(1, math.ceil(total / 50)):
+                task = threading.Thread(target=self.searching, args=(query + "&start={}".format(x * 50),))
+                tasks.append(task)
+                task.start()
+
+            # wait until every started request has finished
+            for task in tasks:
+                task.join()
+            del tasks

        logging.debug("--- {} seconds ---".format(time.time() - start_time))
-        logging.info("Found torrents: {}".format(parser.found_torrents))
+        logging.info("Found torrents: {}".format(total))

    def _catch_error_request(self, url='', data=None):
        url = url if url else self.url