Browse Source

v2:regex, threading

master
imDMG 6 years ago
parent
commit
f81e761095
  1. 191
      kinozal.py

191
kinozal.py

@ -1,20 +1,21 @@
# VERSION: 1.3 # VERSION: 2.0
# AUTHORS: imDMG [imdmgg@gmail.com] # AUTHORS: imDMG [imdmgg@gmail.com]
# Kinozal.tv search engine plugin for qBittorrent # Kinozal.tv search engine plugin for qBittorrent
import tempfile
import os
import logging
import json import json
# import re import logging
import math
import os
import re
import tempfile
import threading
import time import time
from urllib.request import build_opener, HTTPCookieProcessor, ProxyHandler from urllib.request import build_opener, HTTPCookieProcessor, ProxyHandler
from urllib.parse import urlencode from urllib.parse import urlencode
from urllib.error import URLError, HTTPError from urllib.error import URLError, HTTPError
from http.cookiejar import CookieJar from http.cookiejar import CookieJar
from html.parser import HTMLParser
from novaprinter import prettyPrinter from novaprinter import prettyPrinter
# setup logging into qBittorrent/logs # setup logging into qBittorrent/logs
@ -24,7 +25,6 @@ logging.basicConfig(level=logging.INFO,
filename=os.path.abspath(os.path.join(os.path.dirname(__file__), '../../logs', 'kinozal.log')), filename=os.path.abspath(os.path.join(os.path.dirname(__file__), '../../logs', 'kinozal.log')),
filemode='w') filemode='w')
# benchmark
start_time = time.time() start_time = time.time()
@ -51,6 +51,8 @@ class kinozal(object):
raise e raise e
def __init__(self): def __init__(self):
logging.info('Initialisation')
self.result = []
# establish connection # establish connection
# #
# make cookie # make cookie
@ -79,102 +81,22 @@ class kinozal(object):
else: else:
logging.info('We successfully authorized') logging.info('We successfully authorized')
class WorstParser(HTMLParser): def draw(self, html: str):
def __init__(self, url=''): torrents = re.findall(r'nam"><a\s+?href="(.+?)"\s+?class="r\d">(.*?)</a>'
HTMLParser.__init__(self) r'.+?s\'>.+?s\'>(.*?)<.+?sl_s\'>(\d+)<.+?sl_p\'>(\d+)<', html, re.S)
self.url = url
self.torrent = {'link': '', for tor in torrents:
'name': '', torrent = {"engine_url": self.url,
'size': '', "desc_link": tor[0],
'seeds': '', "name": tor[1],
'leech': '', "link": 'http://dl.kinozal.tv/download.php?id=' + tor[0].split('=')[1],
'desc_link': '', } "size": self.units_convert(tor[2]),
"seeds": tor[3],
# we need a page markup to know when stop and collect data, "leech": tor[4]}
# because available methods, in this class, do not communicate each other
# as a result, we make markup to transfer information prettyPrinter(torrent)
# from one method to another, along a chain del torrents
# # return len(torrents)
# markup on result table
self.result_table = False # table with results is found
self.torrent_row = False # found torrent row for collect data
self.index_td = 0 # td counter in torrent row
self.write = None # trigger to detecting when to collect data
# markup pagination
self.paginator = False # found more pages in result
self.pages = 0 # page counter
self.found_torrents = 0
def handle_starttag(self, tag, attrs):
# search result table by class t_peer
if tag == 'table':
for name, value in attrs:
if name == 'class' and 't_peer' in value:
self.result_table = True
# search for torrent row by class bg
if self.result_table and tag == 'tr':
for name, value in attrs:
if name == 'class' and 'bg' in value:
self.torrent_row = True
# count td for find right td
if self.torrent_row and tag == 'td':
if self.index_td == 3:
self.write = "size"
elif self.index_td == 4:
self.write = "seeds"
elif self.index_td == 5:
self.write = "leech"
self.index_td += 1
# search for torrent link by classes r0 or r1
if self.torrent_row and tag == 'a':
for name, value in attrs:
if name == 'class' and 'r' in value:
self.torrent['link'] = 'http://dl.kinozal.tv/download.php?id=' + attrs[0][1].split('=')[1]
self.torrent['desc_link'] = self.url + attrs[0][1]
self.write = "name"
# search for right div with class paginator
if self.found_torrents == 50 and tag == 'div':
for name, value in attrs:
if name == 'class' and value == 'paginator':
self.paginator = True
# search for block with page numbers
if self.paginator and tag == 'li':
self.pages += 1
def handle_endtag(self, tag):
# detecting that torrent row is closed and print all collected data
if self.torrent_row and tag == 'tr':
self.torrent["engine_url"] = self.url
logging.debug('self.torrent: ' + str(self.torrent))
prettyPrinter(self.torrent)
self.torrent = {key: '' for key in self.torrent}
self.index_td = 0
self.torrent_row = False
self.found_torrents += 1
# detecting that table with result is close
if self.result_table and tag == 'table':
self.result_table = False
# detecting that we found all pagination
if self.paginator and tag == 'ul':
self.paginator = False
def handle_data(self, data: str):
# detecting that we need write data at this moment
if self.write and self.result_table:
if self.write == 'size':
data = self.units_convert(data)
self.torrent[self.write] = data.strip()
self.write = None
@staticmethod @staticmethod
def units_convert(unit): def units_convert(unit):
@ -184,9 +106,6 @@ class kinozal(object):
return unit.replace(find, replace) return unit.replace(find, replace)
def error(self, message):
pass
def download_torrent(self, url: str): def download_torrent(self, url: str):
if self.blocked: if self.blocked:
return return
@ -214,24 +133,54 @@ class kinozal(object):
logging.debug(path + " " + url) logging.debug(path + " " + url)
print(path + " " + url) print(path + " " + url)
def search(self, what, cat='all'): def searching(self, query, first=False):
response = self._catch_error_request(query)
page = response.read().decode('cp1251')
self.draw(page)
total = int(re.search(r'</span>Найдено\s+?(\d+)\s+?раздач', page)[1]) if first else -1
return total
def search_old(self, what, cat='all'):
if self.blocked: if self.blocked:
return return
query = '{}/browse.php?s={}&c={}'.format(self.url, what.replace(" ", "+"), self.supported_categories[cat]) total, current = -1, 0
while total != current:
query = '{}/browse.php?s={}&c={}&page={}'.format(self.url, what.replace(" ", "+"),
self.supported_categories[cat],
math.ceil(current / 50))
response = self._catch_error_request(query) response = self._catch_error_request(query)
parser = self.WorstParser(self.url) page = response.read().decode('cp1251')
parser.feed(response.read().decode('cp1251')) if total == -1:
parser.close() total = int(re.search(r'</span>Найдено\s+?(\d+)\s+?раздач</td>', page)[1])
current += self.draw(page)
# if first request return that we have pages, we do cycle logging.debug("--- {} seconds ---".format(time.time() - start_time))
if parser.pages: logging.info("Found torrents: {}".format(total))
for x in range(1, parser.pages):
response = self._catch_error_request('{}&page={}'.format(query, x)) def search(self, what, cat='all'):
parser.feed(response.read().decode('cp1251')) if self.blocked:
parser.close() return
query = '{}/browse.php?s={}&c={}'.format(self.url, what.replace(" ", "+"),
self.supported_categories[cat])
# make first request (maybe it enough)
total = self.searching(query, True)
# do async requests
if total > 50:
tasks = []
for x in range(1, math.ceil(total / 50)):
task = threading.Thread(target=self.searching, args=(query + "&page={}".format(x),))
tasks.append(task)
task.start()
# wait slower request in stack
for task in tasks:
task.join()
del tasks
logging.debug("--- {} seconds ---".format(time.time() - start_time)) logging.debug("--- {} seconds ---".format(time.time() - start_time))
logging.info("Found torrents: {}".format(parser.found_torrents)) logging.info("Found torrents: {}".format(total))
def _catch_error_request(self, url='', data=None): def _catch_error_request(self, url='', data=None):
url = url if url else self.url url = url if url else self.url
@ -258,6 +207,8 @@ class kinozal(object):
if __name__ == "__main__": if __name__ == "__main__":
# f = open("result.html", "r")
kinozal_se = kinozal() kinozal_se = kinozal()
# kinozal_se.download_torrent("http://kinozal.tv/details.php?id=1263407") # kinozal_se.draw(f.read())
# kinozal_se.search('supernatural') kinozal_se.search('doctor')
print("--- %s seconds ---" % (time.time() - start_time))

Loading…
Cancel
Save