[search engine] Final enhancements.

2025-08-30 07:32:06 +00:00 · 2015-04-11 10:28:17 +03:00 · 2015-04-11 10:28:17 +03:00 · 920aefddde
commit 920aefddde
parent 7dafb384e9
13 changed files with 44 additions and 51 deletions
--- a/src/searchengine/nova/engines/extratorrent.py
+++ b/src/searchengine/nova/engines/extratorrent.py
@ -60,6 +60,7 @@ class extratorrent(object):
            self.pending_size = False
            self.next_queries = True
            self.pending_next_queries = False
+            self.next_queries_set = set()

        def handle_starttag(self, tag, attrs):
            if self.current_item:
@ -74,7 +75,7 @@ class extratorrent(object):
                        #description
                        self.current_item["desc_link"] = "".join((self.url, link))
                        #remove view at the beginning
-                        self.current_item["name"] = params["title"][5:]
+                        self.current_item["name"] = params["title"][5:].replace("&amp;", "&")
                        self.pending_size = True
                    elif link[8] == "_":
                        #download link
@ -108,7 +109,10 @@ class extratorrent(object):
            elif self.pending_next_queries:
                if tag == "a":
                    params = dict(attrs)
+                    if params["title"] in self.next_queries_set:
+                        return
                    self.list_searches.append(params['href'])
+                    self.next_queries_set.add(params["title"])
                    if params["title"] == "10":
                        self.pending_next_queries = False
                else:
--- a/src/searchengine/nova/engines/mininova.py
+++ b/src/searchengine/nova/engines/mininova.py
@ -68,12 +68,11 @@ class mininova(object):
            params = dict(attrs)
            link = params["href"]

-            if link.startswith("/get/"):
-                #download link
-                self.current_item["link"] = "".join((self.url, link))
-            elif link.startswith("/tor/"):
+            if link.startswith("/tor/"):
                #description
                self.current_item["desc_link"] = "".join((self.url, link))
+                #build the download link from the torrent id found in the description link
+                self.current_item["link"] = "".join((self.url, "/get/", link[5:-2]))
                self.cur_item_name = "name"
                self.current_item["name"] = ""
            elif self.next_queries and link.startswith("/search"):
@ -83,7 +82,7 @@ class mininova(object):
        def handle_starttag_td(self, attrs):
            """ Handler of td start tag """
            if ("align", "right") in attrs:
-                if not "size" in self.current_item.keys():
+                if not "size" in self.current_item:
                    self.cur_item_name = "size"
                    self.current_item["size"] = ""

@ -113,7 +112,7 @@ class mininova(object):
                prettyPrinter(self.current_item)
                self.current_item = None
            elif self.cur_item_name:
-                if tag == "a" or tag == "span":
+                if tag == "a" or tag == "td":
                    self.cur_item_name = None

        def handle_data(self, data):
--- a/src/searchengine/nova/engines/torrentreactor.py
+++ b/src/searchengine/nova/engines/torrentreactor.py
@ -28,10 +28,9 @@
 # POSSIBILITY OF SUCH DAMAGE.

 from novaprinter import prettyPrinter
-from helpers import download_file
+from helpers import download_file, retrieve_url
 import urllib
 from HTMLParser import HTMLParser
-from httplib import HTTPConnection as http
 from re import compile as re_compile

 class torrentreactor(object):
@ -100,23 +99,13 @@ class torrentreactor(object):
    def search(self, what, cat='all'):
        i = 0
        dat = ''
-        connection = http("www.torrentreactor.net")

-        while True and i<11:
+        while i < 11:
            results = []
            parser = self.SimpleHTMLParser(results, self.url)
-            query = '/torrents-search/%s/%d?sort=seeders.desc&type=all&period=none&categories=%s'%(what, (i*35), self.supported_categories[cat])
-            connection.request("GET", query)
-            response = connection.getresponse()
-            if response.status != 200:
-                break
-
-            dat = response.read().decode('utf-8')
-
+            dat = retrieve_url('%s/torrent-search/%s/%s?sort=seeders.desc&type=all&period=none&categories=%s'%(self.url, what, (i*35), self.supported_categories[cat]))
            parser.feed(dat)
            parser.close()
            if len(results) <= 0:
                break
            i += 1
-
-        connection.close()
--- a/src/searchengine/nova/engines/torrentz.py
+++ b/src/searchengine/nova/engines/torrentz.py
@ -1,4 +1,4 @@
-#VERSION: 2.13
+#VERSION: 2.14
 #AUTHORS: Diego de las Heras (diegodelasheras@gmail.com)

 # Redistribution and use in source and binary forms, with or without
@ -105,7 +105,7 @@ class torrentz(object):
        while i < 6:
            results_list = []
            # "what" is already urlencoded
-            html = retrieve_url(self.url + '/any?f=%s&p=%d' % (what, i))
+            html = retrieve_url('%s/any?f=%s&p=%d' % (self.url, what, i))
            parser = self.MyHtmlParser(results_list, self.url, trackers)
            parser.feed(html)
            parser.close()
--- a/src/searchengine/nova/engines/versions.txt
+++ b/src/searchengine/nova/engines/versions.txt
@ -5,5 +5,5 @@ piratebay: 2.11
 extratorrent: 2.0
 kickasstorrents: 1.26
 btdigg: 1.24
-torrentz: 2.13
+torrentz: 2.14
 legittorrents: 1.03
--- a/src/searchengine/nova/nova2.py
+++ b/src/searchengine/nova/nova2.py
@ -117,6 +117,8 @@ def displayCapabilities(supported_engines):
 def run_search(engine_list):
    """ Run search in engine

+        @param engine_list List with engine, query and category
+
        @retval False if any exceptions occured
        @retval True  otherwise
    """
@ -149,6 +151,7 @@ def main(args):
        raise SystemExit("./nova2.py [all|engine1[,engine2]*] <category> <keywords>\n"
                         "available engines: %s" % (','.join(supported_engines)))

+    #use a set so that each requested engine appears only once
    engines_list = set(e.lower() for e in args[0].strip().split(','))

    if 'all' in engines_list:
@ -170,10 +173,11 @@ def main(args):
    what = urllib.quote(' '.join(args[2:]))

    if THREADED:
+        #number of child processes is limited to min(number of searches, number of cpus)
        pool = Pool(min(len(engines_list), cpu_count()))
        pool.map(run_search, ([globals()[engine], what, cat] for engine in engines_list))
    else:
-        _ = [run_search([globals()[engine], what, cat]) for engine in engines_list]
+        map(run_search, ([globals()[engine], what, cat] for engine in engines_list))

 if __name__ == "__main__":
    main(argv[1:])
--- a/src/searchengine/nova/novaprinter.py
+++ b/src/searchengine/nova/novaprinter.py
@ -37,7 +37,7 @@ def prettyPrinter(dictionary):
        outtext = "|".join((outtext, dictionary["desc_link"]))

    with open(1, 'w', encoding='utf-8', closefd=False) as utf8_stdout:
-        utf8_stdout.write("".join((outtext, "\n")))
+        utf8_stdout.write(unicode("".join((outtext, "\n"))))

 def anySizeToBytes(size_string):
    """
--- a/src/searchengine/nova3/engines/extratorrent.py
+++ b/src/searchengine/nova3/engines/extratorrent.py
@ -60,6 +60,7 @@ class extratorrent(object):
            self.pending_size = False
            self.next_queries = True
            self.pending_next_queries = False
+            self.next_queries_set = set()

        def handle_starttag(self, tag, attrs):
            if self.current_item:
@ -74,7 +75,7 @@ class extratorrent(object):
                        #description
                        self.current_item["desc_link"] = "".join((self.url, link))
                        #remove view at the beginning
-                        self.current_item["name"] = params["title"][5:]
+                        self.current_item["name"] = params["title"][5:].replace("&amp;", "&")
                        self.pending_size = True
                    elif link[8] == "_":
                        #download link
@ -108,7 +109,10 @@ class extratorrent(object):
            elif self.pending_next_queries:
                if tag == "a":
                    params = dict(attrs)
+                    if params["title"] in self.next_queries_set:
+                        return
                    self.list_searches.append(params['href'])
+                    self.next_queries_set.add(params["title"])
                    if params["title"] == "10":
                        self.pending_next_queries = False
                else:
--- a/src/searchengine/nova3/engines/mininova.py
+++ b/src/searchengine/nova3/engines/mininova.py
@ -68,12 +68,11 @@ class mininova(object):
            params = dict(attrs)
            link = params["href"]

-            if link.startswith("/get/"):
-                #download link
-                self.current_item["link"] = "".join((self.url, link))
-            elif link.startswith("/tor/"):
+            if link.startswith("/tor/"):
                #description
                self.current_item["desc_link"] = "".join((self.url, link))
+                #build the download link from the torrent id found in the description link
+                self.current_item["link"] = "".join((self.url, "/get/", link[5:-2]))
                self.cur_item_name = "name"
                self.current_item["name"] = ""
            elif self.next_queries and link.startswith("/search"):
@ -83,7 +82,7 @@ class mininova(object):
        def handle_starttag_td(self, attrs):
            """ Handler of td start tag """
            if ("align", "right") in attrs:
-                if not "size" in self.current_item.keys():
+                if not "size" in self.current_item:
                    self.cur_item_name = "size"
                    self.current_item["size"] = ""

@ -113,7 +112,7 @@ class mininova(object):
                prettyPrinter(self.current_item)
                self.current_item = None
            elif self.cur_item_name:
-                if tag == "a" or tag == "span":
+                if tag == "a" or tag == "td":
                    self.cur_item_name = None

        def handle_data(self, data):
--- a/src/searchengine/nova3/engines/torrentreactor.py
+++ b/src/searchengine/nova3/engines/torrentreactor.py
@ -28,10 +28,9 @@
 # POSSIBILITY OF SUCH DAMAGE.

 from novaprinter import prettyPrinter
-from helpers import download_file
+from helpers import download_file, retrieve_url
 from urllib import parse
 from html.parser import HTMLParser
-from http.client import HTTPConnection as http
 from re import compile as re_compile

 class torrentreactor(object):
@ -100,23 +99,13 @@ class torrentreactor(object):
    def search(self, what, cat='all'):
        i = 0
        dat = ''
-        connection = http("www.torrentreactor.net")

-        while True and i<11:
+        while i < 11:
            results = []
            parser = self.SimpleHTMLParser(results, self.url)
-            query = '/torrents-search/%s/%d?sort=seeders.desc&type=all&period=none&categories=%s'%(what, (i*35), self.supported_categories[cat])
-            connection.request("GET", query)
-            response = connection.getresponse()
-            if response.status != 200:
-                break
-
-            dat = response.read().decode('utf-8')
-
+            dat = retrieve_url('%s/torrent-search/%s/%s?sort=seeders.desc&type=all&period=none&categories=%s'%(self.url, what, (i*35), self.supported_categories[cat]))
            parser.feed(dat)
            parser.close()
            if len(results) <= 0:
                break
            i += 1
-
-        connection.close()
--- a/src/searchengine/nova3/engines/torrentz.py
+++ b/src/searchengine/nova3/engines/torrentz.py
@ -1,4 +1,4 @@
-#VERSION: 2.13
+#VERSION: 2.14
 #AUTHORS: Diego de las Heras (diegodelasheras@gmail.com)

 # Redistribution and use in source and binary forms, with or without
--- a/src/searchengine/nova3/engines/versions.txt
+++ b/src/searchengine/nova3/engines/versions.txt
@ -5,5 +5,5 @@ piratebay: 2.11
 extratorrent: 2.0
 kickasstorrents: 1.26
 btdigg: 1.23
-torrentz: 2.13
+torrentz: 2.14
 legittorrents: 1.04
--- a/src/searchengine/nova3/nova2.py
+++ b/src/searchengine/nova3/nova2.py
@ -116,6 +116,8 @@ def displayCapabilities(supported_engines):
 def run_search(engine_list):
    """ Run search in engine

+        @param engine_list List with engine, query and category
+
        @retval False if any exceptions occured
        @retval True  otherwise
    """
@ -128,6 +130,7 @@ def run_search(engine_list):
            engine.search(what, cat)
        else:
            engine.search(what)
+
        return True
    except:
        return False
@ -147,6 +150,7 @@ def main(args):
        raise SystemExit("./nova2.py [all|engine1[,engine2]*] <category> <keywords>\n"
                         "available engines: %s" % (','.join(supported_engines)))

+    #use a set so that each requested engine appears only once
    engines_list = set(e.lower() for e in args[0].strip().split(','))

    if 'all' in engines_list:
@ -166,12 +170,13 @@ def main(args):
        raise SystemExit(" - ".join(('Invalid category', cat)))

    what = urllib.parse.quote(' '.join(args[2:]))
-
    if THREADED:
+        #number of child processes is limited to min(number of searches, number of cpus)
        with Pool(min(len(engines_list), cpu_count())) as pool:
            pool.map(run_search, ([globals()[engine], what, cat] for engine in engines_list))
    else:
-        _ = [run_search([globals()[engine], what, cat]) for engine in engines_list]
+        #py3 note: map() is lazy, so it must be consumed for the searches to run (NOTE: all() stops at the first False result)
+        all(map(run_search, ([globals()[engine], what, cat] for engine in engines_list)))

 if __name__ == "__main__":
    main(argv[1:])