v2:regex, threading

This commit is contained in:
imDMG 2019-01-29 21:08:08 +05:00
parent 3733b2d614
commit f81e761095

View File

@ -1,20 +1,21 @@
# VERSION: 1.3 # VERSION: 2.0
# AUTHORS: imDMG [imdmgg@gmail.com] # AUTHORS: imDMG [imdmgg@gmail.com]
# Kinozal.tv search engine plugin for qBittorrent # Kinozal.tv search engine plugin for qBittorrent
import tempfile
import os
import logging
import json import json
# import re import logging
import math
import os
import re
import tempfile
import threading
import time import time
from urllib.request import build_opener, HTTPCookieProcessor, ProxyHandler from urllib.request import build_opener, HTTPCookieProcessor, ProxyHandler
from urllib.parse import urlencode from urllib.parse import urlencode
from urllib.error import URLError, HTTPError from urllib.error import URLError, HTTPError
from http.cookiejar import CookieJar from http.cookiejar import CookieJar
from html.parser import HTMLParser
from novaprinter import prettyPrinter from novaprinter import prettyPrinter
# setup logging into qBittorrent/logs # setup logging into qBittorrent/logs
@ -24,7 +25,6 @@ logging.basicConfig(level=logging.INFO,
filename=os.path.abspath(os.path.join(os.path.dirname(__file__), '../../logs', 'kinozal.log')), filename=os.path.abspath(os.path.join(os.path.dirname(__file__), '../../logs', 'kinozal.log')),
filemode='w') filemode='w')
# benchmark
start_time = time.time() start_time = time.time()
@ -51,6 +51,8 @@ class kinozal(object):
raise e raise e
def __init__(self): def __init__(self):
logging.info('Initialisation')
self.result = []
# establish connection # establish connection
# #
# make cookie # make cookie
@ -79,113 +81,30 @@ class kinozal(object):
else: else:
logging.info('We successfully authorized') logging.info('We successfully authorized')
def draw(self, html: str):
    """Extract every torrent row from a results page and print it.

    Rows are located with a single regular expression over the raw markup
    (``re.S`` lets ``.`` span line breaks). Each match yields, in order:
    the details href, the torrent name, the size text, the seed count and
    the leech count. Every row is handed to ``prettyPrinter``.

    :param html: decoded HTML of a results page
    :return: number of torrent rows found on the page
    """
    # local import: the file header only imports urlencode from urllib.parse
    from urllib.parse import urljoin

    torrents = re.findall(r'nam"><a\s+?href="(.+?)"\s+?class="r\d">(.*?)</a>'
                          r'.+?s\'>.+?s\'>(.*?)<.+?sl_s\'>(\d+)<.+?sl_p\'>(\d+)<', html, re.S)

    for href, name, size, seeds, leech in torrents:
        prettyPrinter({"engine_url": self.url,
                       # the href is site-relative; resolve it against the engine
                       # url so the description link is usable on its own
                       "desc_link": urljoin(self.url, href),
                       "name": name,
                       # the download host takes the same id as the details page
                       "link": 'http://dl.kinozal.tv/download.php?id=' + href.split('=')[1],
                       "size": self.units_convert(size),
                       "seeds": seeds,
                       "leech": leech})

    # callers that page through results by row count need this number
    return len(torrents)
@staticmethod
def units_convert(unit):
    """Replace the Cyrillic size unit in a size string with its Latin form.

    :param unit: size text made of a number, whitespace and a unit,
                 e.g. ``'1.37 ГБ'``
    :return: the same text with the unit translated; the input is returned
             unchanged when it has no second token or the unit is not one
             of the known Cyrillic units
    """
    units = {'ТБ': 'TB', 'ГБ': 'GB', 'МБ': 'MB', 'КБ': 'KB'}
    parts = unit.split()
    # one odd size cell must not abort printing of the whole result page
    if len(parts) < 2 or parts[1] not in units:
        return unit
    return unit.replace(parts[1], units[parts[1]])
# search result table by class t_peer
if tag == 'table':
for name, value in attrs:
if name == 'class' and 't_peer' in value:
self.result_table = True
# search for torrent row by class bg
if self.result_table and tag == 'tr':
for name, value in attrs:
if name == 'class' and 'bg' in value:
self.torrent_row = True
# count td for find right td
if self.torrent_row and tag == 'td':
if self.index_td == 3:
self.write = "size"
elif self.index_td == 4:
self.write = "seeds"
elif self.index_td == 5:
self.write = "leech"
self.index_td += 1
# search for torrent link by classes r0 or r1
if self.torrent_row and tag == 'a':
for name, value in attrs:
if name == 'class' and 'r' in value:
self.torrent['link'] = 'http://dl.kinozal.tv/download.php?id=' + attrs[0][1].split('=')[1]
self.torrent['desc_link'] = self.url + attrs[0][1]
self.write = "name"
# search for right div with class paginator
if self.found_torrents == 50 and tag == 'div':
for name, value in attrs:
if name == 'class' and value == 'paginator':
self.paginator = True
# search for block with page numbers
if self.paginator and tag == 'li':
self.pages += 1
def handle_endtag(self, tag):
# detecting that torrent row is closed and print all collected data
if self.torrent_row and tag == 'tr':
self.torrent["engine_url"] = self.url
logging.debug('self.torrent: ' + str(self.torrent))
prettyPrinter(self.torrent)
self.torrent = {key: '' for key in self.torrent}
self.index_td = 0
self.torrent_row = False
self.found_torrents += 1
# detecting that table with result is close
if self.result_table and tag == 'table':
self.result_table = False
# detecting that we found all pagination
if self.paginator and tag == 'ul':
self.paginator = False
def handle_data(self, data: str):
# detecting that we need write data at this moment
if self.write and self.result_table:
if self.write == 'size':
data = self.units_convert(data)
self.torrent[self.write] = data.strip()
self.write = None
@staticmethod
def units_convert(unit):
# replace size units
find = unit.split()[1]
replace = {'ТБ': 'TB', 'ГБ': 'GB', 'МБ': 'MB', 'КБ': 'KB'}[find]
return unit.replace(find, replace)
def error(self, message):
pass
def download_torrent(self, url: str): def download_torrent(self, url: str):
if self.blocked: if self.blocked:
@ -214,24 +133,54 @@ class kinozal(object):
logging.debug(path + " " + url) logging.debug(path + " " + url)
print(path + " " + url) print(path + " " + url)
def searching(self, query, first=False):
    """Fetch one results page, print its torrents and optionally read the total.

    :param query: URL of the results page to request
    :param first: when True, also parse the "Найдено N раздач" counter
                  from the page
    :return: the total reported by the page when ``first`` is True
             (0 when the counter is absent), otherwise -1
    """
    response = self._catch_error_request(query)
    page = response.read().decode('cp1251')
    self.draw(page)
    if not first:
        return -1

    counter = re.search(r'</span>Найдено\s+?(\d+)\s+?раздач', page)
    if counter is None:
        # a page without the counter used to crash with TypeError on None[1]
        logging.warning('Result counter not found on page: %s', query)
        return 0
    return int(counter[1])
def search_old(self, what, cat='all'):
    """Sequential search: request result pages one after another.

    The first page supplies the total number of results; the remaining
    pages are then requested in order, 50 results per page. Every page is
    passed to ``self.draw``.

    :param what: search string; spaces are replaced with ``+``
    :param cat: key into ``self.supported_categories``
    :return: None
    """
    if self.blocked:
        return

    base = '{}/browse.php?s={}&c={}'.format(self.url, what.replace(" ", "+"),
                                            self.supported_categories[cat])
    total, pages, page_no = -1, 1, 0
    # Iterate by page index instead of comparing a running row count with
    # the site total: the loop then always terminates, even when fewer rows
    # are parsed than reported, and does not depend on draw()'s return value.
    while page_no < pages:
        response = self._catch_error_request('{}&page={}'.format(base, page_no))
        page = response.read().decode('cp1251')
        if total == -1:
            counter = re.search(r'</span>Найдено\s+?(\d+)\s+?раздач</td>', page)
            # a page without the counter is treated as "nothing found"
            total = int(counter[1]) if counter else 0
            pages = math.ceil(total / 50)
        self.draw(page)
        page_no += 1

    logging.debug("--- {} seconds ---".format(time.time() - start_time))
    logging.info("Found torrents: {}".format(total))
def search(self, what, cat='all'):
    """Search the site and print every result found.

    The first page is fetched synchronously to learn the total number of
    results. When there are more than 50, each further page is fetched by
    its own thread running ``self.searching``, and all threads are joined
    before the summary is logged.

    :param what: search string; spaces are replaced with ``+``
    :param cat: key into ``self.supported_categories``
    :return: None
    """
    if self.blocked:
        return

    category = self.supported_categories[cat]
    query = '{}/browse.php?s={}&c={}'.format(self.url, what.replace(" ", "+"), category)

    # the first request tells us how many results exist (it may be the only one)
    total = self.searching(query, True)

    # one worker thread per additional page; none when everything fit on page 0
    extra_pages = range(1, math.ceil(total / 50)) if total > 50 else ()
    workers = []
    for page_no in extra_pages:
        worker = threading.Thread(target=self.searching,
                                  args=(query + "&page={}".format(page_no),))
        workers.append(worker)
        worker.start()

    # block until the slowest request has finished
    for worker in workers:
        worker.join()

    elapsed = time.time() - start_time
    logging.debug("--- {} seconds ---".format(elapsed))
    logging.info("Found torrents: {}".format(total))
def _catch_error_request(self, url='', data=None): def _catch_error_request(self, url='', data=None):
url = url if url else self.url url = url if url else self.url
@ -258,6 +207,8 @@ class kinozal(object):
# Manual smoke test: runs only when the plugin file is executed directly.
if __name__ == "__main__":
    # offline alternative: parse a saved results page instead of hitting the site
    # f = open("result.html", "r")
    kinozal_se = kinozal()
    # kinozal_se.draw(f.read())
    kinozal_se.search('doctor')
    # wall-clock time since the module-level start_time was taken
    print("--- %s seconds ---" % (time.time() - start_time))