From 920aefddde0cd385b59c7cdf5d15969e5df633db Mon Sep 17 00:00:00 2001
From: DoumanAsh
Date: Sat, 11 Apr 2015 10:28:17 +0300
Subject: [PATCH] [search engine] Final enhancements.

---
 src/searchengine/nova/engines/extratorrent.py   |  6 +++++-
 src/searchengine/nova/engines/mininova.py       | 11 +++++------
 src/searchengine/nova/engines/torrentreactor.py | 17 +++--------------
 src/searchengine/nova/engines/torrentz.py       |  4 ++--
 src/searchengine/nova/engines/versions.txt      |  2 +-
 src/searchengine/nova/nova2.py                  |  6 +++++-
 src/searchengine/nova/novaprinter.py            |  2 +-
 src/searchengine/nova3/engines/extratorrent.py  |  6 +++++-
 src/searchengine/nova3/engines/mininova.py      | 11 +++++------
 .../nova3/engines/torrentreactor.py             | 17 +++--------------
 src/searchengine/nova3/engines/torrentz.py      |  2 +-
 src/searchengine/nova3/engines/versions.txt     |  2 +-
 src/searchengine/nova3/nova2.py                 |  9 +++++++--
 13 files changed, 44 insertions(+), 51 deletions(-)

diff --git a/src/searchengine/nova/engines/extratorrent.py b/src/searchengine/nova/engines/extratorrent.py
index 19fce553c..4fe940b24 100644
--- a/src/searchengine/nova/engines/extratorrent.py
+++ b/src/searchengine/nova/engines/extratorrent.py
@@ -60,6 +60,7 @@ class extratorrent(object):
         self.pending_size = False
         self.next_queries = True
         self.pending_next_queries = False
+        self.next_queries_set = set()
 
     def handle_starttag(self, tag, attrs):
         if self.current_item:
@@ -74,7 +75,7 @@ class extratorrent(object):
                     #description
                     self.current_item["desc_link"] = "".join((self.url, link))
                     #remove view at the beginning
-                    self.current_item["name"] = params["title"][5:]
+                    self.current_item["name"] = params["title"][5:].replace("&amp;", "&")
                     self.pending_size = True
                 elif link[8] == "_":
                     #download link
@@ -108,7 +109,10 @@ class extratorrent(object):
         elif self.pending_next_queries:
             if tag == "a":
                 params = dict(attrs)
+                if params["title"] in self.next_queries_set:
+                    return
                 self.list_searches.append(params['href'])
+                self.next_queries_set.add(params["title"])
                 if params["title"] == "10":
                     self.pending_next_queries = False
             else:
diff --git a/src/searchengine/nova/engines/mininova.py b/src/searchengine/nova/engines/mininova.py
index dc132cd6c..e105a4f3b 100644
--- a/src/searchengine/nova/engines/mininova.py
+++ b/src/searchengine/nova/engines/mininova.py
@@ -68,12 +68,11 @@ class mininova(object):
         params = dict(attrs)
         link = params["href"]
 
-        if link.startswith("/get/"):
-            #download link
-            self.current_item["link"] = "".join((self.url, link))
-        elif link.startswith("/tor/"):
+        if link.startswith("/tor/"):
             #description
             self.current_item["desc_link"] = "".join((self.url, link))
+            #get download link from description by id
+            self.current_item["link"] = "".join((self.url, "/get/", link[5:-2]))
             self.cur_item_name = "name"
             self.current_item["name"] = ""
         elif self.next_queries and link.startswith("/search"):
@@ -83,7 +82,7 @@ class mininova(object):
     def handle_starttag_td(self, attrs):
         """ Handler of td start tag """
         if ("align", "right") in attrs:
-            if not "size" in self.current_item.keys():
+            if not "size" in self.current_item:
                 self.cur_item_name = "size"
                 self.current_item["size"] = ""
 
@@ -113,7 +112,7 @@ class mininova(object):
                 prettyPrinter(self.current_item)
             self.current_item = None
         elif self.cur_item_name:
-            if tag == "a" or tag == "span":
+            if tag == "a" or tag == "td":
                 self.cur_item_name = None
 
     def handle_data(self, data):
diff --git a/src/searchengine/nova/engines/torrentreactor.py b/src/searchengine/nova/engines/torrentreactor.py
index bff138f91..3f0ef7f65 100644
--- a/src/searchengine/nova/engines/torrentreactor.py
+++ b/src/searchengine/nova/engines/torrentreactor.py
@@ -28,10 +28,9 @@
 # POSSIBILITY OF SUCH DAMAGE.
 
 from novaprinter import prettyPrinter
-from helpers import download_file
+from helpers import download_file, retrieve_url
 import urllib
 from HTMLParser import HTMLParser
-from httplib import HTTPConnection as http
 from re import compile as re_compile
 
 class torrentreactor(object):
@@ -100,23 +99,13 @@ class torrentreactor(object):
     def search(self, what, cat='all'):
         i = 0
         dat = ''
-        connection = http("www.torrentreactor.net")
 
-        while True and i<11:
+        while i < 11:
             results = []
             parser = self.SimpleHTMLParser(results, self.url)
-            query = '/torrents-search/%s/%d?sort=seeders.desc&type=all&period=none&categories=%s'%(what, (i*35), self.supported_categories[cat])
-            connection.request("GET", query)
-            response = connection.getresponse()
-            if response.status != 200:
-                break
-
-            dat = response.read().decode('utf-8')
-
+            dat = retrieve_url('%s/torrent-search/%s/%s?sort=seeders.desc&type=all&period=none&categories=%s'%(self.url, what, (i*35), self.supported_categories[cat]))
             parser.feed(dat)
             parser.close()
             if len(results) <= 0:
                 break
             i += 1
-
-        connection.close()
diff --git a/src/searchengine/nova/engines/torrentz.py b/src/searchengine/nova/engines/torrentz.py
index 2d7b5eef7..20a2b3be8 100644
--- a/src/searchengine/nova/engines/torrentz.py
+++ b/src/searchengine/nova/engines/torrentz.py
@@ -1,4 +1,4 @@
-#VERSION: 2.13
+#VERSION: 2.14
 #AUTHORS: Diego de las Heras (diegodelasheras@gmail.com)
 
 # Redistribution and use in source and binary forms, with or without
@@ -105,7 +105,7 @@ class torrentz(object):
         while i < 6:
             results_list = []
             # "what" is already urlencoded
-            html = retrieve_url(self.url + '/any?f=%s&p=%d' % (what, i))
+            html = retrieve_url('%s/any?f=%s&p=%d' % (self.url, what, i))
             parser = self.MyHtmlParser(results_list, self.url, trackers)
             parser.feed(html)
             parser.close()
diff --git a/src/searchengine/nova/engines/versions.txt b/src/searchengine/nova/engines/versions.txt
index bbff49afd..c24143cfb 100644
--- a/src/searchengine/nova/engines/versions.txt
+++ b/src/searchengine/nova/engines/versions.txt
@@ -5,5 +5,5 @@ piratebay: 2.11
 extratorrent: 2.0
 kickasstorrents: 1.26
 btdigg: 1.24
-torrentz: 2.13
+torrentz: 2.14
 legittorrents: 1.03
diff --git a/src/searchengine/nova/nova2.py b/src/searchengine/nova/nova2.py
index cef9681b7..d54f5c16d 100644
--- a/src/searchengine/nova/nova2.py
+++ b/src/searchengine/nova/nova2.py
@@ -117,6 +117,8 @@ def displayCapabilities(supported_engines):
 def run_search(engine_list):
     """ Run search in engine
 
+    @param engine_list List with engine, query and category
+
     @retval False if any exceptions occured
     @retval True otherwise
     """
@@ -149,6 +151,7 @@ def main(args):
         raise SystemExit("./nova2.py [all|engine1[,engine2]*] <category> <keywords>\n"
                          "available engines: %s" % (','.join(supported_engines)))
 
+    #get only unique engines with a set
     engines_list = set(e.lower() for e in args[0].strip().split(','))
 
     if 'all' in engines_list:
@@ -170,10 +173,11 @@
     what = urllib.quote(' '.join(args[2:]))
 
     if THREADED:
+        #child process spawning is controlled by min(number of searches, number of CPUs)
         pool = Pool(min(len(engines_list), cpu_count()))
         pool.map(run_search, ([globals()[engine], what, cat] for engine in engines_list))
     else:
-        _ = [run_search([globals()[engine], what, cat]) for engine in engines_list]
+        map(run_search, ([globals()[engine], what, cat] for engine in engines_list))
 
 if __name__ == "__main__":
     main(argv[1:])
diff --git a/src/searchengine/nova/novaprinter.py b/src/searchengine/nova/novaprinter.py
index fc16949e6..9cc598f14 100644
--- a/src/searchengine/nova/novaprinter.py
+++ b/src/searchengine/nova/novaprinter.py
@@ -37,7 +37,7 @@ def prettyPrinter(dictionary):
         outtext = "|".join((outtext, dictionary["desc_link"]))
 
     with open(1, 'w', encoding='utf-8', closefd=False) as utf8_stdout:
-        utf8_stdout.write("".join((outtext, "\n")))
+        utf8_stdout.write(unicode("".join((outtext, "\n"))))
 
 def anySizeToBytes(size_string):
     """
diff --git a/src/searchengine/nova3/engines/extratorrent.py b/src/searchengine/nova3/engines/extratorrent.py
index de3dcb9a2..d0bd10bb8 100644
--- a/src/searchengine/nova3/engines/extratorrent.py
+++ b/src/searchengine/nova3/engines/extratorrent.py
@@ -60,6 +60,7 @@ class extratorrent(object):
         self.pending_size = False
         self.next_queries = True
         self.pending_next_queries = False
+        self.next_queries_set = set()
 
     def handle_starttag(self, tag, attrs):
         if self.current_item:
@@ -74,7 +75,7 @@ class extratorrent(object):
                     #description
                     self.current_item["desc_link"] = "".join((self.url, link))
                     #remove view at the beginning
-                    self.current_item["name"] = params["title"][5:]
+                    self.current_item["name"] = params["title"][5:].replace("&amp;", "&")
                     self.pending_size = True
                 elif link[8] == "_":
                     #download link
@@ -108,7 +109,10 @@ class extratorrent(object):
         elif self.pending_next_queries:
             if tag == "a":
                 params = dict(attrs)
+                if params["title"] in self.next_queries_set:
+                    return
                 self.list_searches.append(params['href'])
+                self.next_queries_set.add(params["title"])
                 if params["title"] == "10":
                     self.pending_next_queries = False
             else:
diff --git a/src/searchengine/nova3/engines/mininova.py b/src/searchengine/nova3/engines/mininova.py
index 12544db09..b402c70c1 100644
--- a/src/searchengine/nova3/engines/mininova.py
+++ b/src/searchengine/nova3/engines/mininova.py
@@ -68,12 +68,11 @@ class mininova(object):
         params = dict(attrs)
         link = params["href"]
 
-        if link.startswith("/get/"):
-            #download link
-            self.current_item["link"] = "".join((self.url, link))
-        elif link.startswith("/tor/"):
+        if link.startswith("/tor/"):
             #description
             self.current_item["desc_link"] = "".join((self.url, link))
+            #get download link from description by id
+            self.current_item["link"] = "".join((self.url, "/get/", link[5:-2]))
             self.cur_item_name = "name"
             self.current_item["name"] = ""
         elif self.next_queries and link.startswith("/search"):
@@ -83,7 +82,7 @@ class mininova(object):
     def handle_starttag_td(self, attrs):
         """ Handler of td start tag """
         if ("align", "right") in attrs:
-            if not "size" in self.current_item.keys():
+            if not "size" in self.current_item:
                 self.cur_item_name = "size"
                 self.current_item["size"] = ""
 
@@ -113,7 +112,7 @@ class mininova(object):
                 prettyPrinter(self.current_item)
             self.current_item = None
         elif self.cur_item_name:
-            if tag == "a" or tag == "span":
+            if tag == "a" or tag == "td":
                 self.cur_item_name = None
 
     def handle_data(self, data):
diff --git a/src/searchengine/nova3/engines/torrentreactor.py b/src/searchengine/nova3/engines/torrentreactor.py
index e4005663c..6782ae450 100644
--- a/src/searchengine/nova3/engines/torrentreactor.py
+++ b/src/searchengine/nova3/engines/torrentreactor.py
@@ -28,10 +28,9 @@
 # POSSIBILITY OF SUCH DAMAGE.
 
 from novaprinter import prettyPrinter
-from helpers import download_file
+from helpers import download_file, retrieve_url
 from urllib import parse
 from html.parser import HTMLParser
-from http.client import HTTPConnection as http
 from re import compile as re_compile
 
 class torrentreactor(object):
@@ -100,23 +99,13 @@ class torrentreactor(object):
     def search(self, what, cat='all'):
         i = 0
         dat = ''
-        connection = http("www.torrentreactor.net")
 
-        while True and i<11:
+        while i < 11:
             results = []
             parser = self.SimpleHTMLParser(results, self.url)
-            query = '/torrents-search/%s/%d?sort=seeders.desc&type=all&period=none&categories=%s'%(what, (i*35), self.supported_categories[cat])
-            connection.request("GET", query)
-            response = connection.getresponse()
-            if response.status != 200:
-                break
-
-            dat = response.read().decode('utf-8')
-
+            dat = retrieve_url('%s/torrent-search/%s/%s?sort=seeders.desc&type=all&period=none&categories=%s'%(self.url, what, (i*35), self.supported_categories[cat]))
             parser.feed(dat)
             parser.close()
             if len(results) <= 0:
                 break
             i += 1
-
-        connection.close()
diff --git a/src/searchengine/nova3/engines/torrentz.py b/src/searchengine/nova3/engines/torrentz.py
index 9ced90268..d6c117f42 100644
--- a/src/searchengine/nova3/engines/torrentz.py
+++ b/src/searchengine/nova3/engines/torrentz.py
@@ -1,4 +1,4 @@
-#VERSION: 2.13
+#VERSION: 2.14
 #AUTHORS: Diego de las Heras (diegodelasheras@gmail.com)
 
 # Redistribution and use in source and binary forms, with or without
diff --git a/src/searchengine/nova3/engines/versions.txt b/src/searchengine/nova3/engines/versions.txt
index 479082923..b5300792a 100644
--- a/src/searchengine/nova3/engines/versions.txt
+++ b/src/searchengine/nova3/engines/versions.txt
@@ -5,5 +5,5 @@ piratebay: 2.11
 extratorrent: 2.0
 kickasstorrents: 1.26
 btdigg: 1.23
-torrentz: 2.13
+torrentz: 2.14
 legittorrents: 1.04
diff --git a/src/searchengine/nova3/nova2.py b/src/searchengine/nova3/nova2.py
index c67852db3..3b483fdd7 100644
--- a/src/searchengine/nova3/nova2.py
+++ b/src/searchengine/nova3/nova2.py
@@ -116,6 +116,8 @@ def displayCapabilities(supported_engines):
 def run_search(engine_list):
     """ Run search in engine
 
+    @param engine_list List with engine, query and category
+
     @retval False if any exceptions occured
     @retval True otherwise
     """
@@ -128,6 +130,7 @@ def run_search(engine_list):
                 engine.search(what, cat)
         else:
             engine.search(what)
+        return True
 
     except:
         return False
@@ -147,6 +150,7 @@ def main(args):
         raise SystemExit("./nova2.py [all|engine1[,engine2]*] <category> <keywords>\n"
                          "available engines: %s" % (','.join(supported_engines)))
 
+    #get only unique engines with a set
     engines_list = set(e.lower() for e in args[0].strip().split(','))
 
     if 'all' in engines_list:
@@ -166,12 +170,13 @@
         raise SystemExit(" - ".join(('Invalid category', cat)))
 
     what = urllib.parse.quote(' '.join(args[2:]))
-
     if THREADED:
+        #child process spawning is controlled by min(number of searches, number of CPUs)
         with Pool(min(len(engines_list), cpu_count())) as pool:
             pool.map(run_search, ([globals()[engine], what, cat] for engine in engines_list))
     else:
+        #py3 note: map() is lazy, so it must be consumed for the searches to actually run
-        _ = [run_search([globals()[engine], what, cat]) for engine in engines_list]
+        all(map(run_search, ([globals()[engine], what, cat] for engine in engines_list)))
 
 if __name__ == "__main__":
     main(argv[1:])
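
Note on the torrentreactor rework above: the manual httplib/http.client
connection handling is replaced by the helpers module's retrieve_url(), and
search() now simply requests result pages until one comes back empty. The
snippet below is a minimal, self-contained sketch of that pagination pattern,
not the plugin code itself: fetch_page(), parse_results() and
paginated_search() are hypothetical names, and retrieve_url() is assumed to
return the page body as a decoded string.

# Sketch of the "fetch pages until one is empty" loop used by search() above.
def paginated_search(fetch_page, parse_results, max_pages=11):
    """Collect results page by page; stop early on the first empty page."""
    collected = []
    for page in range(max_pages):
        html = fetch_page(page)           # stands in for helpers.retrieve_url()
        results = parse_results(html)     # stands in for SimpleHTMLParser filling `results`
        if not results:                   # an empty page means no further queries are needed
            break
        collected.extend(results)
    return collected

if __name__ == "__main__":
    # Canned pages so the sketch runs without network access.
    pages = ["result-a result-b", "result-c", ""]
    fetch = lambda i: pages[i] if i < len(pages) else ""
    parse = str.split
    print(paginated_search(fetch, parse))   # ['result-a', 'result-b', 'result-c']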
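
Note on the nova3/nova2.py change above: in Python 3, map() returns a lazy
iterator, so wrapping it in all() is what forces each run_search() call to
execute. A tiny illustration of that behaviour (run() and the task names are
illustrative only, not part of the patch):

# Python 3: map() builds an iterator and runs nothing until it is consumed.
def run(task):
    print("running", task)
    return True

tasks = ["engine-a", "engine-b"]

lazy = map(run, tasks)   # nothing is printed yet
all(lazy)                # consuming the iterator actually calls run() for each task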