Browse Source

v2:regex, threading

master
imDMG 6 years ago
parent
commit
f81e761095
  1. 209
      kinozal.py

209
kinozal.py

@ -1,20 +1,21 @@ @@ -1,20 +1,21 @@
# VERSION: 1.3
# VERSION: 2.0
# AUTHORS: imDMG [imdmgg@gmail.com]
# Kinozal.tv search engine plugin for qBittorrent
import tempfile
import os
import logging
import json
# import re
import logging
import math
import os
import re
import tempfile
import threading
import time
from urllib.request import build_opener, HTTPCookieProcessor, ProxyHandler
from urllib.parse import urlencode
from urllib.error import URLError, HTTPError
from http.cookiejar import CookieJar
from html.parser import HTMLParser
from novaprinter import prettyPrinter
# setup logging into qBittorrent/logs
@ -24,7 +25,6 @@ logging.basicConfig(level=logging.INFO, @@ -24,7 +25,6 @@ logging.basicConfig(level=logging.INFO,
filename=os.path.abspath(os.path.join(os.path.dirname(__file__), '../../logs', 'kinozal.log')),
filemode='w')
# benchmark
start_time = time.time()
@ -51,6 +51,8 @@ class kinozal(object): @@ -51,6 +51,8 @@ class kinozal(object):
raise e
def __init__(self):
logging.info('Initialisation')
self.result = []
# establish connection
#
# make cookie
@ -79,113 +81,30 @@ class kinozal(object): @@ -79,113 +81,30 @@ class kinozal(object):
else:
logging.info('We successfully authorized')
class WorstParser(HTMLParser):
def __init__(self, url=''):
HTMLParser.__init__(self)
self.url = url
self.torrent = {'link': '',
'name': '',
'size': '',
'seeds': '',
'leech': '',
'desc_link': '', }
# we need a page markup to know when stop and collect data,
# because available methods, in this class, do not communicate each other
# as a result, we make markup to transfer information
# from one method to another, along a chain
#
# markup on result table
self.result_table = False # table with results is found
self.torrent_row = False # found torrent row for collect data
self.index_td = 0 # td counter in torrent row
self.write = None # trigger to detecting when to collect data
# markup pagination
self.paginator = False # found more pages in result
self.pages = 0 # page counter
self.found_torrents = 0
def handle_starttag(self, tag, attrs):
# search result table by class t_peer
if tag == 'table':
for name, value in attrs:
if name == 'class' and 't_peer' in value:
self.result_table = True
# search for torrent row by class bg
if self.result_table and tag == 'tr':
for name, value in attrs:
if name == 'class' and 'bg' in value:
self.torrent_row = True
# count td for find right td
if self.torrent_row and tag == 'td':
if self.index_td == 3:
self.write = "size"
elif self.index_td == 4:
self.write = "seeds"
elif self.index_td == 5:
self.write = "leech"
self.index_td += 1
# search for torrent link by classes r0 or r1
if self.torrent_row and tag == 'a':
for name, value in attrs:
if name == 'class' and 'r' in value:
self.torrent['link'] = 'http://dl.kinozal.tv/download.php?id=' + attrs[0][1].split('=')[1]
self.torrent['desc_link'] = self.url + attrs[0][1]
self.write = "name"
# search for right div with class paginator
if self.found_torrents == 50 and tag == 'div':
for name, value in attrs:
if name == 'class' and value == 'paginator':
self.paginator = True
# search for block with page numbers
if self.paginator and tag == 'li':
self.pages += 1
def handle_endtag(self, tag):
# detecting that torrent row is closed and print all collected data
if self.torrent_row and tag == 'tr':
self.torrent["engine_url"] = self.url
logging.debug('self.torrent: ' + str(self.torrent))
prettyPrinter(self.torrent)
self.torrent = {key: '' for key in self.torrent}
self.index_td = 0
self.torrent_row = False
self.found_torrents += 1
# detecting that table with result is close
if self.result_table and tag == 'table':
self.result_table = False
# detecting that we found all pagination
if self.paginator and tag == 'ul':
self.paginator = False
def handle_data(self, data: str):
# detecting that we need write data at this moment
if self.write and self.result_table:
if self.write == 'size':
data = self.units_convert(data)
self.torrent[self.write] = data.strip()
self.write = None
@staticmethod
def units_convert(unit):
# replace size units
find = unit.split()[1]
replace = {'ТБ': 'TB', 'ГБ': 'GB', 'МБ': 'MB', 'КБ': 'KB'}[find]
return unit.replace(find, replace)
def error(self, message):
pass
def draw(self, html: str):
torrents = re.findall(r'nam"><a\s+?href="(.+?)"\s+?class="r\d">(.*?)</a>'
r'.+?s\'>.+?s\'>(.*?)<.+?sl_s\'>(\d+)<.+?sl_p\'>(\d+)<', html, re.S)
for tor in torrents:
torrent = {"engine_url": self.url,
"desc_link": tor[0],
"name": tor[1],
"link": 'http://dl.kinozal.tv/download.php?id=' + tor[0].split('=')[1],
"size": self.units_convert(tor[2]),
"seeds": tor[3],
"leech": tor[4]}
prettyPrinter(torrent)
del torrents
# return len(torrents)
@staticmethod
def units_convert(unit):
# replace size units
find = unit.split()[1]
replace = {'ТБ': 'TB', 'ГБ': 'GB', 'МБ': 'MB', 'КБ': 'KB'}[find]
return unit.replace(find, replace)
def download_torrent(self, url: str):
if self.blocked:
@ -214,24 +133,54 @@ class kinozal(object): @@ -214,24 +133,54 @@ class kinozal(object):
logging.debug(path + " " + url)
print(path + " " + url)
def search(self, what, cat='all'):
def searching(self, query, first=False):
response = self._catch_error_request(query)
page = response.read().decode('cp1251')
self.draw(page)
total = int(re.search(r'</span>Найдено\s+?(\d+)\s+?раздач', page)[1]) if first else -1
return total
def search_old(self, what, cat='all'):
if self.blocked:
return
query = '{}/browse.php?s={}&c={}'.format(self.url, what.replace(" ", "+"), self.supported_categories[cat])
response = self._catch_error_request(query)
parser = self.WorstParser(self.url)
parser.feed(response.read().decode('cp1251'))
parser.close()
total, current = -1, 0
while total != current:
query = '{}/browse.php?s={}&c={}&page={}'.format(self.url, what.replace(" ", "+"),
self.supported_categories[cat],
math.ceil(current / 50))
response = self._catch_error_request(query)
page = response.read().decode('cp1251')
if total == -1:
total = int(re.search(r'</span>Найдено\s+?(\d+)\s+?раздач</td>', page)[1])
current += self.draw(page)
# if first request return that we have pages, we do cycle
if parser.pages:
for x in range(1, parser.pages):
response = self._catch_error_request('{}&page={}'.format(query, x))
parser.feed(response.read().decode('cp1251'))
parser.close()
logging.debug("--- {} seconds ---".format(time.time() - start_time))
logging.info("Found torrents: {}".format(total))
def search(self, what, cat='all'):
if self.blocked:
return
query = '{}/browse.php?s={}&c={}'.format(self.url, what.replace(" ", "+"),
self.supported_categories[cat])
# make first request (maybe it enough)
total = self.searching(query, True)
# do async requests
if total > 50:
tasks = []
for x in range(1, math.ceil(total / 50)):
task = threading.Thread(target=self.searching, args=(query + "&page={}".format(x),))
tasks.append(task)
task.start()
# wait slower request in stack
for task in tasks:
task.join()
del tasks
logging.debug("--- {} seconds ---".format(time.time() - start_time))
logging.info("Found torrents: {}".format(parser.found_torrents))
logging.info("Found torrents: {}".format(total))
def _catch_error_request(self, url='', data=None):
url = url if url else self.url
@ -258,6 +207,8 @@ class kinozal(object): @@ -258,6 +207,8 @@ class kinozal(object):
if __name__ == "__main__":
# f = open("result.html", "r")
kinozal_se = kinozal()
# kinozal_se.download_torrent("http://kinozal.tv/details.php?id=1263407")
# kinozal_se.search('supernatural')
# kinozal_se.draw(f.read())
kinozal_se.search('doctor')
print("--- %s seconds ---" % (time.time() - start_time))

Loading…
Cancel
Save