From 033817f70b8593bd8a2b6267f2aa35f284eccdf4 Mon Sep 17 00:00:00 2001
From: DoumanAsh <custparasite@gmx.se>
Date: Fri, 12 Jun 2015 01:16:37 +0300
Subject: [PATCH] [search engine] Update Legit Torrent to remove sgmllib

---
 .../nova/engines/legittorrents.py           | 128 +++++++++---------
 src/searchengine/nova/engines/versions.txt  |   2 +-
 .../nova3/engines/legittorrents.py          | 128 +++++++++---------
 src/searchengine/nova3/engines/versions.txt |   2 +-
 4 files changed, 126 insertions(+), 134 deletions(-)

diff --git a/src/searchengine/nova/engines/legittorrents.py b/src/searchengine/nova/engines/legittorrents.py
index 6ae66e070..59c9f2d15 100644
--- a/src/searchengine/nova/engines/legittorrents.py
+++ b/src/searchengine/nova/engines/legittorrents.py
@@ -1,5 +1,6 @@
-#VERSION: 1.05
+#VERSION: 2.00
 #AUTHORS: Christophe Dumez (chris@qbittorrent.org)
+#         Douman (custparasite@gmx.se)

 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions are met:
@@ -28,78 +29,73 @@
 from novaprinter import prettyPrinter
 from helpers import retrieve_url, download_file
-import sgmllib
-import re
+from HTMLParser import HTMLParser
+from re import compile as re_compile

 class legittorrents(object):
-    url = 'http://www.legittorrents.info'
-    name = 'Legit Torrents'
-    supported_categories = {'all': '', 'movies': '1', 'tv': '13', 'music': '2', 'games': '3', 'anime': '5', 'books': '6'}
+    url = 'http://www.legittorrents.info'
+    name = 'Legit Torrents'
+    supported_categories = {'all': '0', 'movies': '1', 'tv': '13', 'music': '2', 'games': '3', 'anime': '5', 'books': '6'}

-    def download_torrent(self, info):
-        print download_file(info)
+    def download_torrent(self, info):
+        print(download_file(info))

-    class SimpleSGMLParser(sgmllib.SGMLParser):
-        def __init__(self, results, url, *args):
-            sgmllib.SGMLParser.__init__(self)
-            self.url = url
-            self.td_counter = None
-            self.current_item = None
-            self.start_name = False
-            self.results = results
+    class MyHtmlParseWithBlackJack(HTMLParser):
+        """ Parser class """
+        def __init__(self, url):
+            HTMLParser.__init__(self)
+            self.url = url
+            self.current_item = None
+            self.save_item_key = None

-        def start_a(self, attr):
-            params = dict(attr)
-            if params.has_key('href') and params['href'].startswith('download.php?'):
-                self.current_item['link'] = self.url + '/' + params['href'].strip()
-            elif params.has_key('href') and params['href'].startswith('index.php?page=torrent-details'):
-                self.current_item = {}
-                self.td_counter = 0
-                self.current_item['desc_link'] = self.url + '/' + params['href'].strip()
+        def handle_starttag(self, tag, attrs):
+            """ Parser's start tag handler """
+            if self.current_item:
+                params = dict(attrs)
+                if tag == "a":
+                    link = params["href"]
+                    if link.startswith("index") and "title" in params:
+                        #description link
+                        self.current_item["name"] = params["title"][14:]
+                        self.current_item["desc_link"] = "/".join((self.url, link))
+                    elif link.startswith("download"):
+                        self.current_item["link"] = "/".join((self.url, link))
+                elif tag == "td":
+                    if "class" in params and params["class"].startswith("#FF"):
+                        self.save_item_key = "leech" if "seeds" in self.current_item else "seeds"

-        def handle_data(self, data):
-            if self.td_counter == 0:
-                if not self.current_item.has_key('name'):
-                    self.current_item['name'] = data.strip()
-            elif self.td_counter == 3:
-                if not self.current_item.has_key('seeds'):
-                    self.current_item['seeds'] = ''
-                self.current_item['seeds']+= data.strip()
-            elif self.td_counter == 4:
-                if not self.current_item.has_key('leech'):
-                    self.current_item['leech'] = ''
-                self.current_item['leech']+= data.strip()
+            elif tag == "tr":
+                self.current_item = {}
+                self.current_item["size"] = ""
+                self.current_item["engine_url"] = self.url

-        def start_td(self,attr):
-            if isinstance(self.td_counter,int):
-                self.td_counter += 1
-                if self.td_counter > 5:
-                    self.td_counter = None
-                    # Display item
-                    if self.current_item:
-                        self.current_item['engine_url'] = self.url
-                        if not self.current_item['seeds'].isdigit():
-                            self.current_item['seeds'] = 0
-                        if not self.current_item['leech'].isdigit():
-                            self.current_item['leech'] = 0
-                        self.current_item['size'] = ''
-                        prettyPrinter(self.current_item)
-                        self.results.append('a')
+        def handle_endtag(self, tag):
+            """ Parser's end tag handler """
+            if self.current_item and tag == "tr":
+                if len(self.current_item) > 4:
+                    prettyPrinter(self.current_item)
+                self.current_item = None
+
+        def handle_data(self, data):
+            """ Parser's data handler """
+            if self.save_item_key:
+                self.current_item[self.save_item_key] = data.strip()
+                self.save_item_key = None
+
+    def search(self, what, cat='all'):
+        """ Performs search """
+        query = "".join((self.url, "/index.php?page=torrents&search=", what, "&category=", self.supported_categories.get(cat, '0'), "&active=1"))
+
+        get_table = re_compile('(?s)(.*)')
+        data = get_table.search(retrieve_url(query)).group(0)
+        #extract first ten pages of next results
+        next_pages = re_compile('(?m)')
+        next_pages = ["".join((self.url, page)) for page in next_pages.findall(data)[:10]]

-    def search(self, what, cat='all'):
-        ret = []
-        i = 1
-        while True and i<11:
-            results = []
-            parser = self.SimpleSGMLParser(results, self.url)
-            dat = retrieve_url(self.url+'/index.php?page=torrents&search=%s&category=%s&active=1&order=3&by=2&pages=%d'%(what, self.supported_categories[cat], i))
-            results_re = re.compile('(?s).*')
-            for match in results_re.finditer(dat):
-                res_tab = match.group(0)
-                parser.feed(res_tab)
+        parser = self.MyHtmlParseWithBlackJack(self.url)
+        parser.feed(data)
         parser.close()
-                break
-            if len(results) <= 0:
-                break
-            i += 1
+
+        for page in next_pages:
+            parser.feed(get_table.search(retrieve_url(page)).group(0))
+            parser.close()
diff --git a/src/searchengine/nova/engines/versions.txt b/src/searchengine/nova/engines/versions.txt
index a1170497f..25b2da2a0 100644
--- a/src/searchengine/nova/engines/versions.txt
+++ b/src/searchengine/nova/engines/versions.txt
@@ -2,7 +2,7 @@ btdigg: 1.25
 demonoid: 1.1
 extratorrent: 2.0
 kickasstorrents: 1.27
-legittorrents: 1.05
+legittorrents: 2.00
 mininova: 2.00
 piratebay: 2.11
 torrentreactor: 1.36
diff --git a/src/searchengine/nova3/engines/legittorrents.py b/src/searchengine/nova3/engines/legittorrents.py
index 40c40decc..c3e9bf20a 100644
--- a/src/searchengine/nova3/engines/legittorrents.py
+++ b/src/searchengine/nova3/engines/legittorrents.py
@@ -1,5 +1,6 @@
-#VERSION: 1.05
+#VERSION: 2.00
 #AUTHORS: Christophe Dumez (chris@qbittorrent.org)
+#         Douman (custparasite@gmx.se)

 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions are met:
@@ -28,78 +29,73 @@
 from novaprinter import prettyPrinter
 from helpers import retrieve_url, download_file
-import sgmllib3 as sgmllib
-import re
+from html.parser import HTMLParser
+from re import compile as re_compile

 class legittorrents(object):
-    url = 'http://www.legittorrents.info'
-    name = 'Legit Torrents'
-    supported_categories = {'all': '', 'movies': '1', 'tv': '13', 'music': '2', 'games': '3', 'anime': '5', 'books': '6'}
+    url = 'http://www.legittorrents.info'
+    name = 'Legit Torrents'
+    supported_categories = {'all': '0', 'movies': '1', 'tv': '13', 'music': '2', 'games': '3', 'anime': '5', 'books': '6'}

-    def download_torrent(self, info):
-        print(download_file(info))
+    def download_torrent(self, info):
+        print(download_file(info))

-    class SimpleSGMLParser(sgmllib.SGMLParser):
-        def __init__(self, results, url, *args):
-            sgmllib.SGMLParser.__init__(self)
-            self.url = url
-            self.td_counter = None
-            self.current_item = None
-            self.start_name = False
-            self.results = results
+    class MyHtmlParseWithBlackJack(HTMLParser):
+        """ Parser class """
+        def __init__(self, url):
+            HTMLParser.__init__(self)
+            self.url = url
+            self.current_item = None
+            self.save_item_key = None

-        def start_a(self, attr):
-            params = dict(attr)
-            if 'href' in params and params['href'].startswith('download.php?'):
-                self.current_item['link'] = self.url + '/' + params['href'].strip()
-            elif 'href' in params and params['href'].startswith('index.php?page=torrent-details'):
-                self.current_item = {}
-                self.td_counter = 0
-                self.current_item['desc_link'] = self.url + '/' + params['href'].strip()
+        def handle_starttag(self, tag, attrs):
+            """ Parser's start tag handler """
+            if self.current_item:
+                params = dict(attrs)
+                if tag == "a":
+                    link = params["href"]
+                    if link.startswith("index") and "title" in params:
+                        #description link
+                        self.current_item["name"] = params["title"][14:]
+                        self.current_item["desc_link"] = "/".join((self.url, link))
+                    elif link.startswith("download"):
+                        self.current_item["link"] = "/".join((self.url, link))
+                elif tag == "td":
+                    if "class" in params and params["class"].startswith("#FF"):
+                        self.save_item_key = "leech" if "seeds" in self.current_item else "seeds"

-        def handle_data(self, data):
-            if self.td_counter == 0:
-                if 'name' not in self.current_item:
-                    self.current_item['name'] = data.strip()
-            elif self.td_counter == 3:
-                if 'seeds' not in self.current_item:
-                    self.current_item['seeds'] = ''
-                self.current_item['seeds']+= data.strip()
-            elif self.td_counter == 4:
-                if 'leech' not in self.current_item:
-                    self.current_item['leech'] = ''
-                self.current_item['leech']+= data.strip()
+            elif tag == "tr":
+                self.current_item = {}
+                self.current_item["size"] = ""
+                self.current_item["engine_url"] = self.url

-        def start_td(self,attr):
-            if isinstance(self.td_counter,int):
-                self.td_counter += 1
-                if self.td_counter > 5:
-                    self.td_counter = None
-                    # Display item
-                    if self.current_item:
-                        self.current_item['engine_url'] = self.url
-                        if not self.current_item['seeds'].isdigit():
-                            self.current_item['seeds'] = 0
-                        if not self.current_item['leech'].isdigit():
-                            self.current_item['leech'] = 0
-                        self.current_item['size'] = ''
-                        prettyPrinter(self.current_item)
-                        self.results.append('a')
+        def handle_endtag(self, tag):
+            """ Parser's end tag handler """
+            if self.current_item and tag == "tr":
+                if len(self.current_item) > 4:
+                    prettyPrinter(self.current_item)
+                self.current_item = None
+
+        def handle_data(self, data):
+            """ Parser's data handler """
+            if self.save_item_key:
+                self.current_item[self.save_item_key] = data.strip()
+                self.save_item_key = None
+
+    def search(self, what, cat='all'):
+        """ Performs search """
+        query = "".join((self.url, "/index.php?page=torrents&search=", what, "&category=", self.supported_categories.get(cat, '0'), "&active=1"))
+
+        get_table = re_compile('(?s)(.*)')
+        data = get_table.search(retrieve_url(query)).group(0)
+        #extract first ten pages of next results
+        next_pages = re_compile('(?m)')
+        next_pages = ["".join((self.url, page)) for page in next_pages.findall(data)[:10]]

-    def search(self, what, cat='all'):
-        ret = []
-        i = 1
-        while True and i<11:
-            results = []
-            parser = self.SimpleSGMLParser(results, self.url)
-            dat = retrieve_url(self.url+'/index.php?page=torrents&search=%s&category=%s&active=1&order=3&by=2&pages=%d'%(what, self.supported_categories[cat], i))
-            results_re = re.compile('(?s).*')
-            for match in results_re.finditer(dat):
-                res_tab = match.group(0)
-                parser.feed(res_tab)
+        parser = self.MyHtmlParseWithBlackJack(self.url)
+        parser.feed(data)
         parser.close()
-                break
-            if len(results) <= 0:
-                break
-            i += 1
+
+        for page in next_pages:
+            parser.feed(get_table.search(retrieve_url(page)).group(0))
+            parser.close()
diff --git a/src/searchengine/nova3/engines/versions.txt b/src/searchengine/nova3/engines/versions.txt
index a1170497f..25b2da2a0 100644
--- a/src/searchengine/nova3/engines/versions.txt
+++ b/src/searchengine/nova3/engines/versions.txt
@@ -2,7 +2,7 @@ btdigg: 1.25
 demonoid: 1.1
 extratorrent: 2.0
 kickasstorrents: 1.27
-legittorrents: 1.05
+legittorrents: 2.00
 mininova: 2.00
 piratebay: 2.11
 torrentreactor: 1.36
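
The rewrite swaps the removed sgmllib.SGMLParser callbacks (start_a, start_td, and a handle_data driven by a td counter) for the standard HTMLParser callbacks (handle_starttag, handle_data, handle_endtag), accumulating one result dict per <tr> and printing it with prettyPrinter once the row has collected enough fields. For anyone unfamiliar with html.parser, the snippet below is a minimal, self-contained sketch of that callback pattern; the sample markup, the RowParser name, and the attribute checks are illustrative assumptions only and do not reproduce the real legittorrents.info page structure or the plugin's helpers.

from html.parser import HTMLParser

class RowParser(HTMLParser):
    """Builds one dict per <tr>, mirroring the approach of MyHtmlParseWithBlackJack."""

    def __init__(self, url):
        HTMLParser.__init__(self)
        self.url = url
        self.current_item = None   # dict being filled for the current row
        self.save_item_key = None  # key whose value comes from the next text node
        self.results = []

    def handle_starttag(self, tag, attrs):
        params = dict(attrs)
        if self.current_item is None:
            if tag == "tr":
                # a new table row starts a new result entry
                self.current_item = {"engine_url": self.url}
            return
        if tag == "a" and "href" in params:
            link = params["href"]
            if link.startswith("details"):
                self.current_item["name"] = params.get("title", "")
                self.current_item["desc_link"] = "/".join((self.url, link))
            elif link.startswith("download"):
                self.current_item["link"] = "/".join((self.url, link))
        elif tag == "td" and params.get("class") == "peers":
            # seeds arrive first, then leechers, via the next data callback
            self.save_item_key = "leech" if "seeds" in self.current_item else "seeds"

    def handle_data(self, data):
        if self.save_item_key and self.current_item is not None:
            self.current_item[self.save_item_key] = data.strip()
            self.save_item_key = None

    def handle_endtag(self, tag):
        if tag == "tr" and self.current_item is not None:
            if len(self.current_item) > 4:  # keep only rows that filled every field
                self.results.append(self.current_item)
            self.current_item = None

if __name__ == "__main__":
    sample = ('<table><tr>'
              '<td><a href="details.php?id=1" title="Example torrent">Example torrent</a></td>'
              '<td><a href="download.php?id=1">get</a></td>'
              '<td class="peers">12</td><td class="peers">3</td>'
              '</tr></table>')
    parser = RowParser("http://www.example.org")
    parser.feed(sample)
    parser.close()
    print(parser.results)

Keeping row state in two attributes (current_item and save_item_key) instead of a positional td counter lets the parser key off tag attributes rather than column order, which is the main robustness gain over the sgmllib version.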