v2:regex, threading

2025-01-30 08:24:14 +00:00 · 2019-01-29 21:08:08 +05:00 · 2019-01-29 21:08:08 +05:00 · f81e761095
commit f81e761095
parent 3733b2d614
1 changed files with 75 additions and 124 deletions
--- a/kinozal.py
+++ b/kinozal.py
@ -1,20 +1,21 @@
-# VERSION: 1.3
+# VERSION: 2.0
 # AUTHORS: imDMG [imdmgg@gmail.com]
 # Kinozal.tv search engine plugin for qBittorrent
 import tempfile
 import os
 import logging
 import json
-# import re
+import logging
 import math
 import os
 import re
 import tempfile
 import threading
 import time
 from urllib.request import build_opener, HTTPCookieProcessor, ProxyHandler
 from urllib.parse import urlencode
 from urllib.error import URLError, HTTPError
 from http.cookiejar import CookieJar
 from html.parser import HTMLParser
 from novaprinter import prettyPrinter
 # setup logging into qBittorrent/logs
@ -24,7 +25,6 @@ logging.basicConfig(level=logging.INFO,
                    filename=os.path.abspath(os.path.join(os.path.dirname(__file__), '../../logs', 'kinozal.log')),
                    filemode='w')
 # benchmark
 start_time = time.time()
@ -51,6 +51,8 @@ class kinozal(object):
        raise e
    def __init__(self):
        logging.info('Initialisation')
        self.result = []
        # establish connection
        #
        # make cookie
@ -79,113 +81,30 @@ class kinozal(object):
            else:
                logging.info('We successfully authorized')
-    class WorstParser(HTMLParser):
+    def draw(self, html: str):
-        def __init__(self, url=''):
+        torrents = re.findall(r'nam"><a\s+?href="(.+?)"\s+?class="r\d">(.*?)</a>'
-            HTMLParser.__init__(self)
+                              r'.+?s\'>.+?s\'>(.*?)<.+?sl_s\'>(\d+)<.+?sl_p\'>(\d+)<', html, re.S)
            self.url = url
            self.torrent = {'link': '',
                            'name': '',
                            'size': '',
                            'seeds': '',
                            'leech': '',
                            'desc_link': '', }
-            # we need a page markup to know when stop and collect data,
+        for tor in torrents:
-            # because available methods, in this class, do not communicate each other
+            torrent = {"engine_url": self.url,
-            # as a result, we make markup to transfer information
+                       "desc_link": tor[0],
-            # from one method to another, along a chain
+                       "name": tor[1],
-            #
+                       "link": 'http://dl.kinozal.tv/download.php?id=' + tor[0].split('=')[1],
-            # markup on result table
+                       "size": self.units_convert(tor[2]),
-            self.result_table = False  # table with results is found
+                       "seeds": tor[3],
-            self.torrent_row = False  # found torrent row for collect data
+                       "leech": tor[4]}
            self.index_td = 0  # td counter in torrent row
            self.write = None  # trigger to detecting when to collect data
-            # markup pagination
+            prettyPrinter(torrent)
-            self.paginator = False  # found more pages in result
+        del torrents
-            self.pages = 0  # page counter
+        # return len(torrents)
-            self.found_torrents = 0
+    @staticmethod
    def units_convert(unit):
        # replace size units
        find = unit.split()[1]
        replace = {'ТБ': 'TB', 'ГБ': 'GB', 'МБ': 'MB', 'КБ': 'KB'}[find]
-        def handle_starttag(self, tag, attrs):
+        return unit.replace(find, replace)
            # search result table by class t_peer
            if tag == 'table':
                for name, value in attrs:
                    if name == 'class' and 't_peer' in value:
                        self.result_table = True
            # search for torrent row by class bg
            if self.result_table and tag == 'tr':
                for name, value in attrs:
                    if name == 'class' and 'bg' in value:
                        self.torrent_row = True
            # count td for find right td
            if self.torrent_row and tag == 'td':
                if self.index_td == 3:
                    self.write = "size"
                elif self.index_td == 4:
                    self.write = "seeds"
                elif self.index_td == 5:
                    self.write = "leech"
                self.index_td += 1
            # search for torrent link by classes r0 or r1
            if self.torrent_row and tag == 'a':
                for name, value in attrs:
                    if name == 'class' and 'r' in value:
                        self.torrent['link'] = 'http://dl.kinozal.tv/download.php?id=' + attrs[0][1].split('=')[1]
                        self.torrent['desc_link'] = self.url + attrs[0][1]
                        self.write = "name"
            # search for right div with class paginator
            if self.found_torrents == 50 and tag == 'div':
                for name, value in attrs:
                    if name == 'class' and value == 'paginator':
                        self.paginator = True
            # search for block with page numbers
            if self.paginator and tag == 'li':
                self.pages += 1
        def handle_endtag(self, tag):
            # detecting that torrent row is closed and print all collected data
            if self.torrent_row and tag == 'tr':
                self.torrent["engine_url"] = self.url
                logging.debug('self.torrent: ' + str(self.torrent))
                prettyPrinter(self.torrent)
                self.torrent = {key: '' for key in self.torrent}
                self.index_td = 0
                self.torrent_row = False
                self.found_torrents += 1
            # detecting that table with result is close
            if self.result_table and tag == 'table':
                self.result_table = False
            # detecting that we found all pagination
            if self.paginator and tag == 'ul':
                self.paginator = False
        def handle_data(self, data: str):
            # detecting that we need write data at this moment
            if self.write and self.result_table:
                if self.write == 'size':
                    data = self.units_convert(data)
                self.torrent[self.write] = data.strip()
                self.write = None
        @staticmethod
        def units_convert(unit):
            # replace size units
            find = unit.split()[1]
            replace = {'ТБ': 'TB', 'ГБ': 'GB', 'МБ': 'MB', 'КБ': 'KB'}[find]
            return unit.replace(find, replace)
        def error(self, message):
            pass
    def download_torrent(self, url: str):
        if self.blocked:
@ -214,24 +133,54 @@ class kinozal(object):
            logging.debug(path + " " + url)
            print(path + " " + url)
    def searching(self, query, first=False):
        response = self._catch_error_request(query)
        page = response.read().decode('cp1251')
        self.draw(page)
        total = int(re.search(r'</span>Найдено\s+?(\d+)\s+?раздач', page)[1]) if first else -1
        return total
    def search_old(self, what, cat='all'):
        if self.blocked:
            return
        total, current = -1, 0
        while total != current:
            query = '{}/browse.php?s={}&c={}&page={}'.format(self.url, what.replace(" ", "+"),
                                                             self.supported_categories[cat],
                                                             math.ceil(current / 50))
            response = self._catch_error_request(query)
            page = response.read().decode('cp1251')
            if total == -1:
                total = int(re.search(r'</span>Найдено\s+?(\d+)\s+?раздач</td>', page)[1])
            current += self.draw(page)
        logging.debug("--- {} seconds ---".format(time.time() - start_time))
        logging.info("Found torrents: {}".format(total))
    def search(self, what, cat='all'):
        if self.blocked:
            return
-        query = '{}/browse.php?s={}&c={}'.format(self.url, what.replace(" ", "+"), self.supported_categories[cat])
+        query = '{}/browse.php?s={}&c={}'.format(self.url, what.replace(" ", "+"),
-        response = self._catch_error_request(query)
+                                                 self.supported_categories[cat])
        parser = self.WorstParser(self.url)
        parser.feed(response.read().decode('cp1251'))
        parser.close()
-        # if first request return that we have pages, we do cycle
+        # make first request (maybe it enough)
-        if parser.pages:
+        total = self.searching(query, True)
-            for x in range(1, parser.pages):
+        # do async requests
-                response = self._catch_error_request('{}&page={}'.format(query, x))
+        if total > 50:
-                parser.feed(response.read().decode('cp1251'))
+            tasks = []
-                parser.close()
+            for x in range(1, math.ceil(total / 50)):
                task = threading.Thread(target=self.searching, args=(query + "&page={}".format(x),))
                tasks.append(task)
                task.start()
            # wait slower request in stack
            for task in tasks:
                task.join()
            del tasks
        logging.debug("--- {} seconds ---".format(time.time() - start_time))
-        logging.info("Found torrents: {}".format(parser.found_torrents))
+        logging.info("Found torrents: {}".format(total))
    def _catch_error_request(self, url='', data=None):
        url = url if url else self.url
@ -258,6 +207,8 @@ class kinozal(object):
 if __name__ == "__main__":
    # f = open("result.html", "r")
    kinozal_se = kinozal()
-    # kinozal_se.download_torrent("http://kinozal.tv/details.php?id=1263407")
+    # kinozal_se.draw(f.read())
-    # kinozal_se.search('supernatural')
+    kinozal_se.search('doctor')
    print("--- %s seconds ---" % (time.time() - start_time))