From a62e30ea88dcba5bd013c6581515452b149df927 Mon Sep 17 00:00:00 2001 From: DoumanAsh Date: Sun, 12 Oct 2014 20:15:18 +0400 Subject: [PATCH] Pirate bay search engine update --- src/searchengine/nova/engines/piratebay.py | 183 +++++++++++--------- src/searchengine/nova/engines/versions.txt | 2 +- src/searchengine/nova3/engines/piratebay.py | 183 +++++++++++--------- src/searchengine/nova3/engines/versions.txt | 2 +- 4 files changed, 204 insertions(+), 166 deletions(-) diff --git a/src/searchengine/nova/engines/piratebay.py b/src/searchengine/nova/engines/piratebay.py index 94896be7b..ff1477845 100644 --- a/src/searchengine/nova/engines/piratebay.py +++ b/src/searchengine/nova/engines/piratebay.py @@ -1,6 +1,7 @@ -#VERSION: 1.53 +#VERSION: 2.00 #AUTHORS: Fabien Devaux (fab@gnux.info) #CONTRIBUTORS: Christophe Dumez (chris@qbittorrent.org) +# Arthur (custparasite@gmx.se) # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: @@ -27,94 +28,112 @@ # POSSIBILITY OF SUCH DAMAGE. from novaprinter import prettyPrinter -import sgmllib -from helpers import retrieve_url, download_file +from HTMLParser import HTMLParser +from helpers import download_file +import urllib2 PREVIOUS_IDS = set() class piratebay(object): - url = 'https://thepiratebay.se' - name = 'The Pirate Bay' - supported_categories = {'all': '0', 'movies': '200', 'music': '100', 'games': '400', 'software': '300'} + url = 'http://thepiratebay.se' + name = 'The Pirate Bay' + supported_categories = {'all': '0', 'music': '100', 'movies': '200', 'games': '400', 'software': '300'} - def __init__(self): - self.results = [] - self.parser = self.SimpleSGMLParser(self.results, self.url) + def download_torrent(self, info): + print(download_file(info)) - def download_torrent(self, info): - print download_file(info) + class MyHtmlParseWithBlackJack(HTMLParser): + def __init__(self, results, url): + HTMLParser.__init__(self) + self.url = url + self.results = results + self.current_item = None + self.size_found = False + self.unit_found = False + self.seed_found = False + self.skip_td = False + self.leech_found = False + self.dispatcher = {'a' : self.handle_tag_a_ref, + 'font' : self.handle_tag_font_size, + 'td' : self.handle_tag_td_sl } - class SimpleSGMLParser(sgmllib.SGMLParser): - def __init__(self, results, url, *args): - sgmllib.SGMLParser.__init__(self) - self.td_counter = None - self.current_item = None - self.results = results - self.url = url - self.code = 0 - self.in_name = None + def handle_tag_a_ref(self, attrs): + params = dict(attrs) + #1 + if params['href'].startswith('/torrent/'): + get_id = params['href'].split('/')[2] + if not get_id in PREVIOUS_IDS: + self.current_item = {} + self.current_item['desc_link'] = self.url + params['href'].strip() + self.current_item['name'] = params['title'][12:].strip() + self.current_item['id'] = get_id + #2 + elif (not self.current_item is None) and (params['href'].startswith('magnet:')): + self.current_item['link'] = params['href'].strip() - def start_a(self, attr): - params = dict(attr) - if params['href'].startswith('/torrent/'): - self.current_item = {} - self.td_counter = 0 - self.current_item['desc_link'] = self.url + params['href'].strip() - self.in_name = True - self.current_item['id'] = params['href'].split('/')[2] - elif params['href'].startswith('magnet:'): - self.current_item['link']=params['href'].strip() - self.in_name = False + def handle_tag_font_size(self, attrs): + if not self.current_item is None: + params = dict(attrs) + #3 + if params['class'] == "detDesc": + self.size_found = True - def handle_data(self, data): - if self.td_counter == 0: - if self.in_name: - if not self.current_item.has_key('name'): - self.current_item['name'] = '' - self.current_item['name']+= data.strip() - else: - #Parse size - if 'Size' in data: - self.current_item['size'] = data[data.index("Size")+5:] - self.current_item['size'] = self.current_item['size'][:self.current_item['size'].index(',')] - elif self.td_counter == 1: - if not self.current_item.has_key('seeds'): - self.current_item['seeds'] = '' - self.current_item['seeds']+= data.strip() - elif self.td_counter == 2: - if not self.current_item.has_key('leech'): - self.current_item['leech'] = '' - self.current_item['leech']+= data.strip() + def handle_tag_td_sl(self, attrs): + if not self.current_item is None: + params = dict(attrs) + if not self.current_item is None: + if self.seed_found: + #5 + self.current_item['leech'] = '' + self.leech_found = True + self.seed_found = False + else: + #4 + self.current_item['seeds'] = '' + self.seed_found = True - def start_td(self,attr): - if isinstance(self.td_counter,int): - self.td_counter += 1 - if self.td_counter > 3: - self.td_counter = None - # Display item - if self.current_item: - if self.current_item['id'] in PREVIOUS_IDS: - self.results = [] - self.reset() - return - self.current_item['engine_url'] = self.url - if not self.current_item['seeds'].isdigit(): - self.current_item['seeds'] = 0 - if not self.current_item['leech'].isdigit(): - self.current_item['leech'] = 0 - prettyPrinter(self.current_item) - PREVIOUS_IDS.add(self.current_item['id']) - self.results.append('a') - def search(self, what, cat='all'): - ret = [] - i = 0 - order = 'se' - while True and i<11: - results = [] - parser = self.SimpleSGMLParser(results, self.url) - dat = retrieve_url(self.url+'/search/%s/%d/7/%s' % (what, i, self.supported_categories[cat])) - parser.feed(dat) - parser.close() - if len(results) <= 0: - break - i += 1 + def handle_starttag(self, tag, attrs): + if tag in self.dispatcher: + self.dispatcher[tag](attrs) + + def handle_data(self, data): + if not self.current_item is None: + if self.size_found: + #with utf-8 you're going to have something like that: ['Uploaded', '10-02'], ['15:31,', 'Size', '240.34'], ['MiB,', 'ULed', 'by'] + temp = data.split() + if 'Size' in temp: + sizeIn = temp.index('Size') + self.current_item['size'] = temp[sizeIn + 1] + self.size_found = False + self.unit_found = True + elif self.unit_found: + temp = data.split() + self.current_item['size'] = ' '.join((self.current_item['size'], temp[0])) + self.unit_found = False + elif self.seed_found: + self.current_item['seeds'] += data.rstrip() + elif self.leech_found: + self.current_item['leech'] += data.rstrip() + self.current_item['engine_url'] = self.url + prettyPrinter(self.current_item) + PREVIOUS_IDS.add(self.current_item['id']) + self.results.append('a') + self.current_item = None + self.size_found = False + self.unit_found = False + self.seed_found = False + self.leech_found = False + + def search(self, what, cat='all'): + ret = [] + i = 0 + while i < 11: + results = [] + parser = self.MyHtmlParseWithBlackJack(results, self.url) + query = '%s/search/%s/%d/99/%s' % (self.url, what, i, self.supported_categories[cat]) + dat = urllib2.urlopen(query) + parser.feed(dat.read().decode('utf-8')) + parser.close() + if len(results) <= 0: + break + i += 1 diff --git a/src/searchengine/nova/engines/versions.txt b/src/searchengine/nova/engines/versions.txt index c6dedad80..237827571 100644 --- a/src/searchengine/nova/engines/versions.txt +++ b/src/searchengine/nova/engines/versions.txt @@ -1,6 +1,6 @@ torrentreactor: 1.33 mininova: 1.50 -piratebay: 1.53 +piratebay: 2.00 vertor: 1.3 extratorrent: 1.2 kickasstorrents: 1.24 diff --git a/src/searchengine/nova3/engines/piratebay.py b/src/searchengine/nova3/engines/piratebay.py index 5dcc4e019..00f2c6d30 100644 --- a/src/searchengine/nova3/engines/piratebay.py +++ b/src/searchengine/nova3/engines/piratebay.py @@ -1,6 +1,7 @@ -#VERSION: 1.53 +#VERSION: 2.00 #AUTHORS: Fabien Devaux (fab@gnux.info) #CONTRIBUTORS: Christophe Dumez (chris@qbittorrent.org) +# Arthur (custparasite@gmx.se) # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: @@ -27,94 +28,112 @@ # POSSIBILITY OF SUCH DAMAGE. from novaprinter import prettyPrinter -import sgmllib3 -from helpers import retrieve_url, download_file +from html.parser import HTMLParser +from helpers import download_file +import urllib.request PREVIOUS_IDS = set() class piratebay(object): - url = 'https://thepiratebay.se' - name = 'The Pirate Bay' - supported_categories = {'all': '0', 'movies': '200', 'music': '100', 'games': '400', 'software': '300'} + url = 'http://thepiratebay.se' + name = 'The Pirate Bay' + supported_categories = {'all': '0', 'music': '100', 'movies': '200', 'games': '400', 'software': '300'} - def __init__(self): - self.results = [] - self.parser = self.SimpleSGMLParser(self.results, self.url) + def download_torrent(self, info): + print(download_file(info)) - def download_torrent(self, info): - print(download_file(info)) + class MyHtmlParseWithBlackJack(HTMLParser): + def __init__(self, results, url): + super().__init__() + self.url = url + self.results = results + self.current_item = None + self.size_found = False + self.unit_found = False + self.seed_found = False + self.skip_td = False + self.leech_found = False + self.dispatcher = {'a' : self.handle_tag_a_ref, + 'font' : self.handle_tag_font_size, + 'td' : self.handle_tag_td_sl } - class SimpleSGMLParser(sgmllib3.SGMLParser): - def __init__(self, results, url, *args): - sgmllib3.SGMLParser.__init__(self) - self.td_counter = None - self.current_item = None - self.results = results - self.url = url - self.code = 0 - self.in_name = None + def handle_tag_a_ref(self, attrs): + params = dict(attrs) + #1 + if params['href'].startswith('/torrent/'): + get_id = params['href'].split('/')[2] + if not get_id in PREVIOUS_IDS: + self.current_item = {} + self.current_item['desc_link'] = self.url + params['href'].strip() + self.current_item['name'] = params['title'][12:].strip() + self.current_item['id'] = get_id + #2 + elif (not self.current_item is None) and (params['href'].startswith('magnet:')): + self.current_item['link'] = params['href'].strip() - def start_a(self, attr): - params = dict(attr) - if params['href'].startswith('/torrent/'): - self.current_item = {} - self.td_counter = 0 - self.current_item['desc_link'] = self.url + params['href'].strip() - self.in_name = True - self.current_item['id'] = params['href'].split('/')[2] - elif params['href'].startswith('magnet:'): - self.current_item['link']=params['href'].strip() - self.in_name = False + def handle_tag_font_size(self, attrs): + if not self.current_item is None: + params = dict(attrs) + #3 + if params['class'] == "detDesc": + self.size_found = True - def handle_data(self, data): - if self.td_counter == 0: - if self.in_name: - if 'name' not in self.current_item: - self.current_item['name'] = '' - self.current_item['name']+= data.strip() - else: - #Parse size - if 'Size' in data: - self.current_item['size'] = data[data.index("Size")+5:] - self.current_item['size'] = self.current_item['size'][:self.current_item['size'].index(',')] - elif self.td_counter == 1: - if 'seeds' not in self.current_item: - self.current_item['seeds'] = '' - self.current_item['seeds']+= data.strip() - elif self.td_counter == 2: - if 'leech' not in self.current_item: - self.current_item['leech'] = '' - self.current_item['leech']+= data.strip() + def handle_tag_td_sl(self, attrs): + if not self.current_item is None: + params = dict(attrs) + if not self.current_item is None: + if self.seed_found: + #5 + self.current_item['leech'] = '' + self.leech_found = True + self.seed_found = False + else: + #4 + self.current_item['seeds'] = '' + self.seed_found = True - def start_td(self,attr): - if isinstance(self.td_counter,int): - self.td_counter += 1 - if self.td_counter > 3: - self.td_counter = None - # Display item - if self.current_item: - if self.current_item['id'] in PREVIOUS_IDS: - self.results = [] - self.reset() - return - self.current_item['engine_url'] = self.url - if not self.current_item['seeds'].isdigit(): - self.current_item['seeds'] = 0 - if not self.current_item['leech'].isdigit(): - self.current_item['leech'] = 0 - prettyPrinter(self.current_item) - PREVIOUS_IDS.add(self.current_item['id']) - self.results.append('a') - def search(self, what, cat='all'): - ret = [] - i = 0 - order = 'se' - while True and i<11: - results = [] - parser = self.SimpleSGMLParser(results, self.url) - dat = retrieve_url(self.url+'/search/%s/%d/7/%s' % (what, i, self.supported_categories[cat])) - parser.feed(dat) - parser.close() - if len(results) <= 0: - break - i += 1 + def handle_starttag(self, tag, attrs): + if tag in self.dispatcher: + self.dispatcher[tag](attrs) + + def handle_data(self, data): + if not self.current_item is None: + if self.size_found: + #with utf-8 you're going to have something like that: ['Uploaded', '10-02'], ['15:31,', 'Size', '240.34'], ['MiB,', 'ULed', 'by'] + temp = data.split() + if 'Size' in temp: + sizeIn = temp.index('Size') + self.current_item['size'] = temp[sizeIn + 1] + self.size_found = False + self.unit_found = True + elif self.unit_found: + temp = data.split() + self.current_item['size'] = ' '.join((self.current_item['size'], temp[0])) + self.unit_found = False + elif self.seed_found: + self.current_item['seeds'] += data.rstrip() + elif self.leech_found: + self.current_item['leech'] += data.rstrip() + self.current_item['engine_url'] = self.url + prettyPrinter(self.current_item) + PREVIOUS_IDS.add(self.current_item['id']) + self.results.append('a') + self.current_item = None + self.size_found = False + self.unit_found = False + self.seed_found = False + self.leech_found = False + + def search(self, what, cat='all'): + ret = [] + i = 0 + while i < 11: + results = [] + parser = self.MyHtmlParseWithBlackJack(results, self.url) + query = '%s/search/%s/%d/99/%s' % (self.url, what, i, self.supported_categories[cat]) + dat = urllib.request.urlopen(query) + parser.feed(dat.read().decode('utf-8')) + parser.close() + if len(results) <= 0: + break + i += 1 diff --git a/src/searchengine/nova3/engines/versions.txt b/src/searchengine/nova3/engines/versions.txt index 2ea509604..5237b738f 100644 --- a/src/searchengine/nova3/engines/versions.txt +++ b/src/searchengine/nova3/engines/versions.txt @@ -1,6 +1,6 @@ torrentreactor: 1.33 mininova: 1.50 -piratebay: 1.53 +piratebay: 2.00 vertor: 1.3 extratorrent: 1.2 kickasstorrents: 1.24