From 503626bde829c87b080e89b75fdc7fcf7311f484 Mon Sep 17 00:00:00 2001
From: DoumanAsh
Date: Mon, 9 Feb 2015 09:30:44 +0300
Subject: [PATCH] [searchengine] Fix piratebay. Closes #2270

---
 .../searchengine/nova/engines/piratebay.py  | 208 ++++++++++-------
 .../searchengine/nova/engines/versions.txt  |   2 +-
 .../searchengine/nova3/engines/piratebay.py | 210 ++++++++++--------
 .../searchengine/nova3/engines/versions.txt |   2 +-
 4 files changed, 247 insertions(+), 175 deletions(-)

diff --git a/src/gui/searchengine/nova/engines/piratebay.py b/src/gui/searchengine/nova/engines/piratebay.py
index 8c5e93d23..070714d50 100644
--- a/src/gui/searchengine/nova/engines/piratebay.py
+++ b/src/gui/searchengine/nova/engines/piratebay.py
@@ -1,4 +1,4 @@
-#VERSION: 2.01
+#VERSION: 2.10
 #AUTHORS: Fabien Devaux (fab@gnux.info)
 #CONTRIBUTORS: Christophe Dumez (chris@qbittorrent.org)
 #              Arthur (custparasite@gmx.se)
@@ -27,113 +27,149 @@
 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 # POSSIBILITY OF SUCH DAMAGE.
 
-from novaprinter import prettyPrinter
 from HTMLParser import HTMLParser
+from httplib import HTTPSConnection as https
+#qBt
+from novaprinter import prettyPrinter
 from helpers import download_file
-import urllib2
-
-PREVIOUS_IDS = set()
 
 class piratebay(object):
+    """ Search engine class """
     url = 'https://thepiratebay.se'
     name = 'The Pirate Bay'
     supported_categories = {'all': '0', 'music': '100', 'movies': '200', 'games': '400', 'software': '300'}
 
     def download_torrent(self, info):
+        """ Downloader """
         print(download_file(info))
 
     class MyHtmlParseWithBlackJack(HTMLParser):
-        def __init__(self, results, url):
+        """ Parser class """
+        def __init__(self, list_searches, url):
             HTMLParser.__init__(self)
+            self.list_searches = list_searches
             self.url = url
-            self.results = results
             self.current_item = None
-            self.size_found = False
-            self.unit_found = False
-            self.seed_found = False
-            self.skip_td = False
-            self.leech_found = False
-            self.dispatcher = {'a' : self.handle_tag_a_ref,
-                               'font' : self.handle_tag_font_size,
-                               'td' : self.handle_tag_td_sl }
-
-        def handle_tag_a_ref(self, attrs):
+            self.save_item = None
+            self.result_table = False #table with results is found
+            self.result_tbody = False
+            self.add_query = True
+            self.result_query = False
+
+        def handle_start_tag_default(self, attrs):
+            """ Default handler for start tag dispatcher """
+            pass
+
+        def handle_start_tag_a(self, attrs):
+            """ Handler for start tag a """
             params = dict(attrs)
-            #1
-            if params['href'].startswith('/torrent/'):
-                get_id = params['href'].split('/')[2]
-                if not get_id in PREVIOUS_IDS:
-                    self.current_item = {}
-                    self.current_item['desc_link'] = self.url + params['href'].strip()
-                    self.current_item['name'] = params['title'][12:].strip()
-                    self.current_item['id'] = get_id
-            #2
-            elif (not self.current_item is None) and (params['href'].startswith('magnet:')):
-                self.current_item['link'] = params['href'].strip()
-
-        def handle_tag_font_size(self, attrs):
-            if not self.current_item is None:
-                params = dict(attrs)
-                #3
-                if params['class'] == "detDesc":
-                    self.size_found = True
-
-        def handle_tag_td_sl(self, attrs):
-            if not self.current_item is None:
-                params = dict(attrs)
-                if not self.current_item is None:
-                    if self.seed_found:
-                        #5
-                        self.current_item['leech'] = ''
-                        self.leech_found = True
-                        self.seed_found = False
+            link = params["href"]
+            if link.startswith("/torrent"):
+                self.current_item["desc_link"] = "".join((self.url, link))
+                self.save_item = "name"
+            elif link.startswith("magnet"):
+                self.current_item["link"] = link
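
The new parser replaces the 2.01 flag set (`size_found`, `seed_found`, and so on) with a single `save_item` marker: a start-tag handler decides which key of `current_item` the next text node should fill, and `handle_data` stores it there. A minimal standalone sketch of that hand-off (Python 3; the class name and markup are illustrative, not part of the patch):

```python
# Minimal sketch of the save_item hand-off used by MyHtmlParseWithBlackJack:
# a start tag marks the destination key, the following data node fills it.
from html.parser import HTMLParser

class FieldGrabber(HTMLParser):
    def __init__(self):
        super().__init__()
        self.item = {}
        self.save_item = None          # key that the next text node should fill

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            self.save_item = "name"    # like handle_start_tag_a() above

    def handle_data(self, data):
        if self.save_item and data.strip():
            self.item[self.save_item] = data.strip()
            self.save_item = None

parser = FieldGrabber()
parser.feed('<a href="/torrent/123/demo">Some torrent</a>')
print(parser.item)                     # {'name': 'Some torrent'}
```
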
self.current_item["link"] = link + + def handle_start_tag_font(self, attrs): + """ Handler for start tag font """ + for attr in attrs: + if attr[1] == "detDesc": + self.save_item = "size" + break + + def handle_start_tag_td(self, attrs): + """ Handler for start tag td """ + for attr in attrs: + if attr[1] == "right": + if "seeds" in self.current_item.keys(): + self.save_item = "leech" else: - #4 - self.current_item['seeds'] = '' - self.seed_found = True + self.save_item = "seeds" + break def handle_starttag(self, tag, attrs): - if tag in self.dispatcher: - self.dispatcher[tag](attrs) + """ Parser's start tag handler """ + if self.current_item: + dispatcher = getattr(self, "_".join(("handle_start_tag", tag)), self.handle_start_tag_default) + dispatcher(attrs) - def handle_data(self, data): - if not self.current_item is None: - if self.size_found: - #with utf-8 you're going to have something like that: ['Uploaded', '10-02'], ['15:31,', 'Size', '240.34'], ['MiB,', 'ULed', 'by'] - temp = data.split() - if 'Size' in temp: - sizeIn = temp.index('Size') - self.current_item['size'] = temp[sizeIn + 1] - self.size_found = False - self.unit_found = True - elif self.unit_found: - temp = data.split() - self.current_item['size'] = ' '.join((self.current_item['size'], temp[0])) - self.unit_found = False - elif self.seed_found: - self.current_item['seeds'] += data.rstrip() - elif self.leech_found: - self.current_item['leech'] += data.rstrip() - self.current_item['engine_url'] = self.url + elif self.result_tbody: + if tag == "tr": + self.current_item = {"engine_url" : self.url} + + elif tag == "table": + self.result_table = "searchResult" == attrs[0][1] + + elif self.add_query: + if self.result_query and tag == "a": + if len(self.list_searches) < 10: + self.list_searches.append(attrs[0][1]) + else: + self.add_query = False + self.result_query = False + elif tag == "div": + self.result_query = "center" == attrs[0][1] + + + def handle_endtag(self, tag): + """ Parser's end tag handler """ + if self.result_tbody: + if tag == "tr": prettyPrinter(self.current_item) - PREVIOUS_IDS.add(self.current_item['id']) - self.results.append('a') self.current_item = None - self.size_found = False - self.unit_found = False - self.seed_found = False - self.leech_found = False + elif tag == "font": + self.save_item = None + elif tag == "table": + self.result_table = self.result_tbody = False + + elif self.result_table: + if tag == "thead": + self.result_tbody = True + elif tag == "table": + self.result_table = self.result_tbody = False + + elif self.add_query and self.result_query: + if tag == "div": + self.add_query = self.result_query = False + + def handle_data(self, data): + """ Parser's data handler """ + if self.save_item == "size": + temp_data = data.split() + if "Size" in temp_data: + self.current_item[self.save_item] = temp_data[2] + elif "ULed" in temp_data: + temp_string = self.current_item[self.save_item] + self.current_item[self.save_item] = " ".join((temp_string, temp_data[0][:-1])) + elif self.save_item: + self.current_item[self.save_item] = data + self.save_item = None + def search(self, what, cat='all'): - ret = [] - i = 0 - while i < 11: - results = [] - parser = self.MyHtmlParseWithBlackJack(results, self.url) - query = '%s/search/%s/%d/99/%s' % (self.url, what, i, self.supported_categories[cat]) - dat = urllib2.urlopen(query) - parser.feed(dat.read().decode('utf-8')) + """ Performs search """ + connection = https("thepiratebay.se") + + #prepare query. 
     def search(self, what, cat='all'):
-        ret = []
-        i = 0
-        while i < 11:
-            results = []
-            parser = self.MyHtmlParseWithBlackJack(results, self.url)
-            query = '%s/search/%s/%d/99/%s' % (self.url, what, i, self.supported_categories[cat])
-            dat = urllib2.urlopen(query)
-            parser.feed(dat.read().decode('utf-8'))
+        """ Performs search """
+        connection = https("thepiratebay.se")
+
+        #prepare query. The "7" in the path sorts results by seeders
+        cat = cat.lower()
+        query = "/".join(("/search", what, "0", "7", self.supported_categories[cat]))
+
+        connection.request("GET", query)
+        response = connection.getresponse()
+        if response.status != 200:
+            return
+
+        list_searches = []
+        parser = self.MyHtmlParseWithBlackJack(list_searches, self.url)
+        parser.feed(response.read().decode('utf-8'))
+        parser.close()
+
+        parser.add_query = False
+        for search_query in list_searches:
+            connection.request("GET", search_query)
+            response = connection.getresponse()
+            parser.feed(response.read().decode('utf-8'))
             parser.close()
-            if len(results) <= 0:
-                break
-            i += 1
+
+        connection.close()
+        return
diff --git a/src/gui/searchengine/nova/engines/versions.txt b/src/gui/searchengine/nova/engines/versions.txt
index fba6a35f7..5effbc74c 100644
--- a/src/gui/searchengine/nova/engines/versions.txt
+++ b/src/gui/searchengine/nova/engines/versions.txt
@@ -1,6 +1,6 @@
 torrentreactor: 1.33
 mininova: 1.51
-piratebay: 2.01
+piratebay: 2.10
 extratorrent: 1.2
 kickasstorrents: 1.25
 btdigg: 1.23
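
The nova3 file below carries the same rewrite for Python 3. The request flow in `search()` is identical in both versions: one `HTTPSConnection` serves the first results page, the parser collects up to ten pagination links from it into `list_searches`, and the same connection then fetches each of those pages. A hedged sketch of that flow (Python 3; the query paths are placeholders, only the host comes from the patch):

```python
# Sketch of search()'s request flow: one reused connection, first page parsed
# for pagination links, then each collected link fetched in turn.
from http.client import HTTPSConnection

connection = HTTPSConnection("thepiratebay.se")
connection.request("GET", "/search/ubuntu/0/7/0")  # /search/<what>/<page>/<order>/<cat>
response = connection.getresponse()
if response.status == 200:
    first_page = response.read().decode("utf-8")
    # ... parser.feed(first_page) fills list_searches here ...
    for link in ["/search/ubuntu/1/7/0"]:          # stand-in for list_searches
        connection.request("GET", link)            # next request, same connection
        page = connection.getresponse().read().decode("utf-8")
        # ... parser.feed(page) prints further rows ...
connection.close()
```

Note that `http.client` requires each response to be read to completion before the next `request()` on the same connection; `search()` satisfies this because it always calls `response.read()` before looping.
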
diff --git a/src/gui/searchengine/nova3/engines/piratebay.py b/src/gui/searchengine/nova3/engines/piratebay.py
index 346d814a6..752c06f07 100644
--- a/src/gui/searchengine/nova3/engines/piratebay.py
+++ b/src/gui/searchengine/nova3/engines/piratebay.py
@@ -1,4 +1,4 @@
-#VERSION: 2.01
+#VERSION: 2.10
 #AUTHORS: Fabien Devaux (fab@gnux.info)
 #CONTRIBUTORS: Christophe Dumez (chris@qbittorrent.org)
 #              Arthur (custparasite@gmx.se)
@@ -27,113 +27,149 @@
 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 # POSSIBILITY OF SUCH DAMAGE.
 
-from novaprinter import prettyPrinter
 from html.parser import HTMLParser
+from http.client import HTTPSConnection as https
+#qBt
+from novaprinter import prettyPrinter
 from helpers import download_file
-import urllib.request
-
-PREVIOUS_IDS = set()
 
 class piratebay(object):
+    """ Search engine class """
     url = 'https://thepiratebay.se'
     name = 'The Pirate Bay'
     supported_categories = {'all': '0', 'music': '100', 'movies': '200', 'games': '400', 'software': '300'}
 
     def download_torrent(self, info):
+        """ Downloader """
         print(download_file(info))
 
     class MyHtmlParseWithBlackJack(HTMLParser):
-        def __init__(self, results, url):
-            super().__init__()
+        """ Parser class """
+        def __init__(self, list_searches, url):
+            HTMLParser.__init__(self)
+            self.list_searches = list_searches
             self.url = url
-            self.results = results
             self.current_item = None
-            self.size_found = False
-            self.unit_found = False
-            self.seed_found = False
-            self.skip_td = False
-            self.leech_found = False
-            self.dispatcher = {'a' : self.handle_tag_a_ref,
-                               'font' : self.handle_tag_font_size,
-                               'td' : self.handle_tag_td_sl }
-
-        def handle_tag_a_ref(self, attrs):
+            self.save_item = None
+            self.result_table = False #table with results is found
+            self.result_tbody = False
+            self.add_query = True
+            self.result_query = False
+
+        def handle_start_tag_default(self, attrs):
+            """ Default handler for start tag dispatcher """
+            pass
+
+        def handle_start_tag_a(self, attrs):
+            """ Handler for start tag a """
             params = dict(attrs)
-            #1
-            if params['href'].startswith('/torrent/'):
-                get_id = params['href'].split('/')[2]
-                if not get_id in PREVIOUS_IDS:
-                    self.current_item = {}
-                    self.current_item['desc_link'] = self.url + params['href'].strip()
-                    self.current_item['name'] = params['title'][12:].strip()
-                    self.current_item['id'] = get_id
-            #2
-            elif (not self.current_item is None) and (params['href'].startswith('magnet:')):
-                self.current_item['link'] = params['href'].strip()
-
-        def handle_tag_font_size(self, attrs):
-            if not self.current_item is None:
-                params = dict(attrs)
-                #3
-                if params['class'] == "detDesc":
-                    self.size_found = True
-
-        def handle_tag_td_sl(self, attrs):
-            if not self.current_item is None:
-                params = dict(attrs)
-                if not self.current_item is None:
-                    if self.seed_found:
-                        #5
-                        self.current_item['leech'] = ''
-                        self.leech_found = True
-                        self.seed_found = False
+            link = params["href"]
+            if link.startswith("/torrent"):
+                self.current_item["desc_link"] = "".join((self.url, link))
+                self.save_item = "name"
+            elif link.startswith("magnet"):
+                self.current_item["link"] = link
+
+        def handle_start_tag_font(self, attrs):
+            """ Handler for start tag font """
+            for attr in attrs:
+                if attr[1] == "detDesc":
+                    self.save_item = "size"
+                    break
+
+        def handle_start_tag_td(self, attrs):
+            """ Handler for start tag td """
+            for attr in attrs:
+                if attr[1] == "right":
+                    if "seeds" in self.current_item.keys():
+                        self.save_item = "leech"
                     else:
-                        #4
-                        self.current_item['seeds'] = ''
-                        self.seed_found = True
+                        self.save_item = "seeds"
+                    break
 
         def handle_starttag(self, tag, attrs):
-            if tag in self.dispatcher:
-                self.dispatcher[tag](attrs)
+            """ Parser's start tag handler """
+            if self.current_item:
+                dispatcher = getattr(self, "_".join(("handle_start_tag", tag)), self.handle_start_tag_default)
+                dispatcher(attrs)
 
-        def handle_data(self, data):
-            if not self.current_item is None:
-                if self.size_found:
-                    #with utf-8 you're going to have something like that: ['Uploaded', '10-02'], ['15:31,', 'Size', '240.34'], ['MiB,', 'ULed', 'by']
-                    temp = data.split()
-                    if 'Size' in temp:
-                        sizeIn = temp.index('Size')
-                        self.current_item['size'] = temp[sizeIn + 1]
-                        self.size_found = False
-                        self.unit_found = True
-                elif self.unit_found:
-                    temp = data.split()
-                    self.current_item['size'] = ' '.join((self.current_item['size'], temp[0]))
-                    self.unit_found = False
-                elif self.seed_found:
-                    self.current_item['seeds'] += data.rstrip()
-                elif self.leech_found:
-                    self.current_item['leech'] += data.rstrip()
-                    self.current_item['engine_url'] = self.url
+            elif self.result_tbody:
+                if tag == "tr":
+                    self.current_item = {"engine_url" : self.url}
+
+            elif tag == "table":
+                self.result_table = "searchResult" == attrs[0][1]
+
+            elif self.add_query:
+                if self.result_query and tag == "a":
+                    if len(self.list_searches) < 10:
+                        self.list_searches.append(attrs[0][1])
+                    else:
+                        self.add_query = False
+                        self.result_query = False
+                elif tag == "div":
+                    self.result_query = "center" == attrs[0][1]
+
+
+        def handle_endtag(self, tag):
+            """ Parser's end tag handler """
+            if self.result_tbody:
+                if tag == "tr":
                     prettyPrinter(self.current_item)
-                    PREVIOUS_IDS.add(self.current_item['id'])
-                    self.results.append('a')
                     self.current_item = None
-                    self.size_found = False
-                    self.unit_found = False
-                    self.seed_found = False
-                    self.leech_found = False
+                elif tag == "font":
+                    self.save_item = None
+                elif tag == "table":
+                    self.result_table = self.result_tbody = False
+
+            elif self.result_table:
+                if tag == "thead":
+                    self.result_tbody = True
+                elif tag == "table":
+                    self.result_table = self.result_tbody = False
+
+            elif self.add_query and self.result_query:
+                if tag == "div":
+                    self.add_query = self.result_query = False
+
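
The end-tag handler above is the other half of a small state machine: `result_table` is raised when the `searchResult` table opens, `result_tbody` once its `</thead>` closes, and every `</tr>` after that flushes one finished row through `prettyPrinter`. A simplified standalone sketch of the row lifecycle (Python 3; it enters the state on the table open rather than on `</thead>`, and prints instead of calling prettyPrinter):

```python
# Simplified row state machine: enter the results table, gather text per row,
# flush the row when </tr> closes, leave the state when the table closes.
from html.parser import HTMLParser

class RowMachine(HTMLParser):
    def __init__(self):
        super().__init__()
        self.in_results = False            # plays the role of result_tbody
        self.row = None

    def handle_starttag(self, tag, attrs):
        if tag == "table" and ("id", "searchResult") in attrs:
            self.in_results = True
        elif self.in_results and tag == "tr":
            self.row = []                  # like current_item = {...}

    def handle_data(self, data):
        if self.row is not None and data.strip():
            self.row.append(data.strip())

    def handle_endtag(self, tag):
        if self.in_results and tag == "tr" and self.row is not None:
            print("row:", self.row)        # the engine calls prettyPrinter() here
            self.row = None
        elif tag == "table":
            self.in_results = False

RowMachine().feed('<table id="searchResult"><tr><td>demo</td></tr></table>')
```
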
+        def handle_data(self, data):
+            """ Parser's data handler """
+            if self.save_item == "size":
+                temp_data = data.split()
+                if "Size" in temp_data:
+                    self.current_item[self.save_item] = temp_data[2]
+                elif "ULed" in temp_data:
+                    temp_string = self.current_item[self.save_item]
+                    self.current_item[self.save_item] = " ".join((temp_string, temp_data[0][:-1]))
+            elif self.save_item:
+                self.current_item[self.save_item] = data
+                self.save_item = None
+
     def search(self, what, cat='all'):
-        ret = []
-        i = 0
-        while i < 11:
-            results = []
-            parser = self.MyHtmlParseWithBlackJack(results, self.url)
-            query = '%s/search/%s/%d/99/%s' % (self.url, what, i, self.supported_categories[cat])
-            dat = urllib.request.urlopen(query)
-            parser.feed(dat.read().decode('utf-8'))
+        """ Performs search """
+        connection = https("thepiratebay.se")
+
+        #prepare query. The "7" in the path sorts results by seeders
+        cat = cat.lower()
+        query = "/".join(("/search", what, "0", "7", self.supported_categories[cat]))
+
+        connection.request("GET", query)
+        response = connection.getresponse()
+        if response.status != 200:
+            return
+
+        list_searches = []
+        parser = self.MyHtmlParseWithBlackJack(list_searches, self.url)
+        parser.feed(response.read().decode('utf-8'))
+        parser.close()
+
+        parser.add_query = False
+        for search_query in list_searches:
+            connection.request("GET", search_query)
+            response = connection.getresponse()
+            parser.feed(response.read().decode('utf-8'))
             parser.close()
-            if len(results) <= 0:
-                break
-            i += 1
+
+        connection.close()
+        return
diff --git a/src/gui/searchengine/nova3/engines/versions.txt b/src/gui/searchengine/nova3/engines/versions.txt
index fee1372af..a3906bd5b 100644
--- a/src/gui/searchengine/nova3/engines/versions.txt
+++ b/src/gui/searchengine/nova3/engines/versions.txt
@@ -1,6 +1,6 @@
 torrentreactor: 1.33
 mininova: 1.51
-piratebay: 2.01
+piratebay: 2.10
 extratorrent: 1.2
 kickasstorrents: 1.25
 btdigg: 1.23
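
One detail of `handle_data()` above that is easy to misread: the uploaded/size/ULed description reaches the parser split across several data nodes (the removed 2.01 comment showed the chunks as `['Uploaded', '10-02']`, `['15:31,', 'Size', '240.34']`, `['MiB,', 'ULed', 'by']`), so the numeric value and its unit arrive in two separate calls, which is why `temp_data[2]` and `temp_data[0][:-1]` are stitched together. A worked example using those sample chunks:

```python
# Worked example of the size assembly in handle_data(), fed with the sample
# chunks quoted in the removed 2.01 comment.
item = {}
for chunk in ['Uploaded 10-02', '15:31, Size 240.34', 'MiB, ULed by']:
    parts = chunk.split()
    if "Size" in parts:
        item["size"] = parts[2]                                # '240.34'
    elif "ULed" in parts:
        item["size"] = " ".join((item["size"], parts[0][:-1])) # append 'MiB'

print(item["size"])  # -> 240.34 MiB
```

Note the fixed `parts[2]` index only works because "Size" sits at index 1 of its chunk; the 2.01 code located it with `temp.index('Size') + 1` instead.
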