1
0
mirror of https://github.com/d47081/qBittorrent.git synced 2025-01-25 22:14:32 +00:00

[searchengine] Fix piratebay. Closes #2270

This commit is contained in:
DoumanAsh 2015-02-09 09:30:44 +03:00
parent b7898cccd0
commit 503626bde8
4 changed files with 243 additions and 171 deletions

View File

@ -1,4 +1,4 @@
#VERSION: 2.01 #VERSION: 2.10
#AUTHORS: Fabien Devaux (fab@gnux.info) #AUTHORS: Fabien Devaux (fab@gnux.info)
#CONTRIBUTORS: Christophe Dumez (chris@qbittorrent.org) #CONTRIBUTORS: Christophe Dumez (chris@qbittorrent.org)
# Arthur (custparasite@gmx.se) # Arthur (custparasite@gmx.se)
@ -27,113 +27,149 @@
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE. # POSSIBILITY OF SUCH DAMAGE.
from novaprinter import prettyPrinter
from HTMLParser import HTMLParser from HTMLParser import HTMLParser
from httplib import HTTPSConnection as https
#qBt
from novaprinter import prettyPrinter
from helpers import download_file from helpers import download_file
import urllib2
PREVIOUS_IDS = set()
class piratebay(object):
    """ Search engine class for The Pirate Bay.

    search() requests the first result page, hands every complete result
    row to prettyPrinter and then requests the additional result pages
    whose links were collected from the first page.
    """
    url = 'https://thepiratebay.se'
    name = 'The Pirate Bay'
    supported_categories = {'all': '0', 'music': '100', 'movies': '200', 'games': '400', 'software': '300'}

    def download_torrent(self, info):
        """ Downloader: prints whatever download_file returns for info """
        print(download_file(info))

    class MyHtmlParseWithBlackJack(HTMLParser):
        """ Parser class

        Small state machine over one search page:
        - anchors inside a <div> carrying the value "center" are stored in
          list_searches (presumably the pagination links - verify against
          the live page) while add_query is set;
        - once the <thead> of the table carrying the value "searchResult"
          is closed, every <tr> is collected into a dict and printed with
          prettyPrinter when the row ends.
        """
        #keys a row must have collected before it is printed
        #NOTE(review): prettyPrinter may need more keys than these - confirm
        required_keys = ("link", "name")
        #maximum number of additional page links taken from the first page
        max_queries = 10

        def __init__(self, list_searches, url):
            """ list_searches: list that receives the collected page links
                url: site prefix used for desc_link and engine_url
            """
            HTMLParser.__init__(self)
            self.list_searches = list_searches
            self.url = url
            self.current_item = None #row being collected, None outside rows
            self.save_item = None #key the next text chunk is stored under
            self.result_table = False #table with results is found
            self.result_tbody = False #header of the result table is closed
            self.add_query = True #page links are still being collected
            self.result_query = False #inside the <div> holding page links

        def _has_attr_value(self, attrs, wanted):
            """ True when any attribute of the tag has the value wanted """
            for attr in attrs:
                if attr[1] == wanted:
                    return True
            return False

        def handle_start_tag_default(self, attrs):
            """ Default handler for start tag dispatcher """
            pass

        def handle_start_tag_a(self, attrs):
            """ Handler for start tag a """
            link = dict(attrs).get("href")
            if not link:
                #anchors without href (e.g. named anchors) carry no data
                return
            if link.startswith("/torrent"):
                self.current_item["desc_link"] = "".join((self.url, link))
                #the text of this anchor is stored as the name
                self.save_item = "name"
            elif link.startswith("magnet"):
                self.current_item["link"] = link

        def handle_start_tag_font(self, attrs):
            """ Handler for start tag font """
            if self._has_attr_value(attrs, "detDesc"):
                self.save_item = "size"

        def handle_start_tag_td(self, attrs):
            """ Handler for start tag td """
            if self._has_attr_value(attrs, "right"):
                #first matching cell of a row is seeds, the next one leech
                if "seeds" in self.current_item:
                    self.save_item = "leech"
                else:
                    self.save_item = "seeds"

        def handle_starttag(self, tag, attrs):
            """ Parser's start tag handler """
            if self.current_item is not None:
                #inside a row: dispatch to handle_start_tag_<tag> if defined
                handler = getattr(self, "_".join(("handle_start_tag", tag)), self.handle_start_tag_default)
                handler(attrs)
            elif self.result_tbody:
                if tag == "tr":
                    self.current_item = {"engine_url" : self.url}
            elif tag == "table":
                #attribute-less tables simply do not match
                self.result_table = self._has_attr_value(attrs, "searchResult")
            elif self.add_query:
                if self.result_query and tag == "a":
                    if len(self.list_searches) < self.max_queries:
                        href = dict(attrs).get("href")
                        if href:
                            self.list_searches.append(href)
                    else:
                        self.add_query = False
                        self.result_query = False
                elif tag == "div":
                    self.result_query = self._has_attr_value(attrs, "center")

        def handle_endtag(self, tag):
            """ Parser's end tag handler """
            if self.result_tbody:
                if tag == "tr":
                    item = self.current_item
                    #rows that produced no torrent (no link or name) are dropped
                    if item is not None and all(key in item for key in self.required_keys):
                        prettyPrinter(item)
                    self.current_item = None
                    #a pending key must not leak into text after the row
                    self.save_item = None
                elif tag == "font":
                    self.save_item = None
                elif tag == "table":
                    self.result_table = self.result_tbody = False
                    self.current_item = None
                    self.save_item = None
            elif self.result_table:
                if tag == "thead":
                    self.result_tbody = True
                elif tag == "table":
                    self.result_table = self.result_tbody = False
            elif self.add_query and self.result_query:
                if tag == "div":
                    self.add_query = self.result_query = False

        def handle_data(self, data):
            """ Parser's data handler """
            if self.current_item is None or not self.save_item:
                return
            if self.save_item == "size":
                #the description may arrive as one chunk
                #('... Size 240.34 MiB, ULed by') or split at the entities
                #('15:31, Size 240.34' followed by 'MiB, ULed by')
                tokens = data.split()
                if "Size" in tokens:
                    start = tokens.index("Size") + 1
                    value = [token.rstrip(",") for token in tokens[start:start + 2]]
                    if value:
                        self.current_item["size"] = " ".join(value)
                elif "ULed" in tokens and "size" in self.current_item:
                    if tokens[0] != "ULed":
                        unit = tokens[0].rstrip(",")
                        self.current_item["size"] = " ".join((self.current_item["size"], unit))
            else:
                self.current_item[self.save_item] = data
                self.save_item = None

    def search(self, what, cat='all'):
        """ Performs search

        what: search pattern, inserted into the request path as is
        cat: category name, a key of supported_categories (any case)

        Returns None. Results are printed by the parser. Nothing is printed
        when the first request does not answer with status 200.
        Raises KeyError for an unknown category.
        """
        connection = https("thepiratebay.se")
        try:
            #prepare query. 7 is filtering by seeders
            cat = cat.lower()
            query = "/".join(("/search", what, "0", "7", self.supported_categories[cat]))

            connection.request("GET", query)
            response = connection.getresponse()
            if response.status != 200:
                return

            list_searches = []
            parser = self.MyHtmlParseWithBlackJack(list_searches, self.url)
            parser.feed(response.read().decode('utf-8'))
            parser.close()

            #links are collected from the first page only
            parser.add_query = False
            for search_query in list_searches:
                connection.request("GET", search_query)
                response = connection.getresponse()
                #always read the body so the connection can be reused
                page = response.read()
                if response.status != 200:
                    continue
                parser.feed(page.decode('utf-8'))
                parser.close()
        finally:
            #close the connection on early return and on errors as well
            connection.close()
        return

View File

@ -1,6 +1,6 @@
torrentreactor: 1.33 torrentreactor: 1.33
mininova: 1.51 mininova: 1.51
piratebay: 2.01 piratebay: 2.10
extratorrent: 1.2 extratorrent: 1.2
kickasstorrents: 1.25 kickasstorrents: 1.25
btdigg: 1.23 btdigg: 1.23

View File

@ -1,4 +1,4 @@
#VERSION: 2.01 #VERSION: 2.10
#AUTHORS: Fabien Devaux (fab@gnux.info) #AUTHORS: Fabien Devaux (fab@gnux.info)
#CONTRIBUTORS: Christophe Dumez (chris@qbittorrent.org) #CONTRIBUTORS: Christophe Dumez (chris@qbittorrent.org)
# Arthur (custparasite@gmx.se) # Arthur (custparasite@gmx.se)
@ -27,113 +27,149 @@
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE. # POSSIBILITY OF SUCH DAMAGE.
from novaprinter import prettyPrinter
from html.parser import HTMLParser from html.parser import HTMLParser
from http.client import HTTPSConnection as https
#qBt
from novaprinter import prettyPrinter
from helpers import download_file from helpers import download_file
import urllib.request
PREVIOUS_IDS = set()
class piratebay(object):
    """ Search engine class for The Pirate Bay.

    search() requests the first result page, hands every complete result
    row to prettyPrinter and then requests the additional result pages
    whose links were collected from the first page.
    """
    url = 'https://thepiratebay.se'
    name = 'The Pirate Bay'
    supported_categories = {'all': '0', 'music': '100', 'movies': '200', 'games': '400', 'software': '300'}

    def download_torrent(self, info):
        """ Downloader: prints whatever download_file returns for info """
        print(download_file(info))

    class MyHtmlParseWithBlackJack(HTMLParser):
        """ Parser class

        Small state machine over one search page:
        - anchors inside a <div> carrying the value "center" are stored in
          list_searches (presumably the pagination links - verify against
          the live page) while add_query is set;
        - once the <thead> of the table carrying the value "searchResult"
          is closed, every <tr> is collected into a dict and printed with
          prettyPrinter when the row ends.
        """
        #keys a row must have collected before it is printed
        #NOTE(review): prettyPrinter may need more keys than these - confirm
        required_keys = ("link", "name")
        #maximum number of additional page links taken from the first page
        max_queries = 10

        def __init__(self, list_searches, url):
            """ list_searches: list that receives the collected page links
                url: site prefix used for desc_link and engine_url
            """
            HTMLParser.__init__(self)
            self.list_searches = list_searches
            self.url = url
            self.current_item = None #row being collected, None outside rows
            self.save_item = None #key the next text chunk is stored under
            self.result_table = False #table with results is found
            self.result_tbody = False #header of the result table is closed
            self.add_query = True #page links are still being collected
            self.result_query = False #inside the <div> holding page links

        def _has_attr_value(self, attrs, wanted):
            """ True when any attribute of the tag has the value wanted """
            for attr in attrs:
                if attr[1] == wanted:
                    return True
            return False

        def handle_start_tag_default(self, attrs):
            """ Default handler for start tag dispatcher """
            pass

        def handle_start_tag_a(self, attrs):
            """ Handler for start tag a """
            link = dict(attrs).get("href")
            if not link:
                #anchors without href (e.g. named anchors) carry no data
                return
            if link.startswith("/torrent"):
                self.current_item["desc_link"] = "".join((self.url, link))
                #the text of this anchor is stored as the name
                self.save_item = "name"
            elif link.startswith("magnet"):
                self.current_item["link"] = link

        def handle_start_tag_font(self, attrs):
            """ Handler for start tag font """
            if self._has_attr_value(attrs, "detDesc"):
                self.save_item = "size"

        def handle_start_tag_td(self, attrs):
            """ Handler for start tag td """
            if self._has_attr_value(attrs, "right"):
                #first matching cell of a row is seeds, the next one leech
                if "seeds" in self.current_item:
                    self.save_item = "leech"
                else:
                    self.save_item = "seeds"

        def handle_starttag(self, tag, attrs):
            """ Parser's start tag handler """
            if self.current_item is not None:
                #inside a row: dispatch to handle_start_tag_<tag> if defined
                handler = getattr(self, "_".join(("handle_start_tag", tag)), self.handle_start_tag_default)
                handler(attrs)
            elif self.result_tbody:
                if tag == "tr":
                    self.current_item = {"engine_url" : self.url}
            elif tag == "table":
                #attribute-less tables simply do not match
                self.result_table = self._has_attr_value(attrs, "searchResult")
            elif self.add_query:
                if self.result_query and tag == "a":
                    if len(self.list_searches) < self.max_queries:
                        href = dict(attrs).get("href")
                        if href:
                            self.list_searches.append(href)
                    else:
                        self.add_query = False
                        self.result_query = False
                elif tag == "div":
                    self.result_query = self._has_attr_value(attrs, "center")

        def handle_endtag(self, tag):
            """ Parser's end tag handler """
            if self.result_tbody:
                if tag == "tr":
                    item = self.current_item
                    #rows that produced no torrent (no link or name) are dropped
                    if item is not None and all(key in item for key in self.required_keys):
                        prettyPrinter(item)
                    self.current_item = None
                    #a pending key must not leak into text after the row
                    self.save_item = None
                elif tag == "font":
                    self.save_item = None
                elif tag == "table":
                    self.result_table = self.result_tbody = False
                    self.current_item = None
                    self.save_item = None
            elif self.result_table:
                if tag == "thead":
                    self.result_tbody = True
                elif tag == "table":
                    self.result_table = self.result_tbody = False
            elif self.add_query and self.result_query:
                if tag == "div":
                    self.add_query = self.result_query = False

        def handle_data(self, data):
            """ Parser's data handler """
            if self.current_item is None or not self.save_item:
                return
            if self.save_item == "size":
                #with convert_charrefs the description arrives as one chunk
                #('... Size 240.34 MiB, ULed by'), without it the chunks are
                #split at the entities ('15:31, Size 240.34', 'MiB, ULed by')
                tokens = data.split()
                if "Size" in tokens:
                    start = tokens.index("Size") + 1
                    value = [token.rstrip(",") for token in tokens[start:start + 2]]
                    if value:
                        self.current_item["size"] = " ".join(value)
                elif "ULed" in tokens and "size" in self.current_item:
                    if tokens[0] != "ULed":
                        unit = tokens[0].rstrip(",")
                        self.current_item["size"] = " ".join((self.current_item["size"], unit))
            else:
                self.current_item[self.save_item] = data
                self.save_item = None

    def search(self, what, cat='all'):
        """ Performs search

        what: search pattern, inserted into the request path as is
        cat: category name, a key of supported_categories (any case)

        Returns None. Results are printed by the parser. Nothing is printed
        when the first request does not answer with status 200.
        Raises KeyError for an unknown category.
        """
        connection = https("thepiratebay.se")
        try:
            #prepare query. 7 is filtering by seeders
            cat = cat.lower()
            query = "/".join(("/search", what, "0", "7", self.supported_categories[cat]))

            connection.request("GET", query)
            response = connection.getresponse()
            if response.status != 200:
                return

            list_searches = []
            parser = self.MyHtmlParseWithBlackJack(list_searches, self.url)
            parser.feed(response.read().decode('utf-8'))
            parser.close()

            #links are collected from the first page only
            parser.add_query = False
            for search_query in list_searches:
                connection.request("GET", search_query)
                response = connection.getresponse()
                #always read the body so the connection can be reused
                page = response.read()
                if response.status != 200:
                    continue
                parser.feed(page.decode('utf-8'))
                parser.close()
        finally:
            #close the connection on early return and on errors as well
            connection.close()
        return

View File

@ -1,6 +1,6 @@
torrentreactor: 1.33 torrentreactor: 1.33
mininova: 1.51 mininova: 1.51
piratebay: 2.01 piratebay: 2.10
extratorrent: 1.2 extratorrent: 1.2
kickasstorrents: 1.25 kickasstorrents: 1.25
btdigg: 1.23 btdigg: 1.23