Browse Source

[search engine] Final enhancements.

adaptive-webui-19844
DoumanAsh 10 years ago
parent
commit
920aefddde
  1. 6
      src/searchengine/nova/engines/extratorrent.py
  2. 11
      src/searchengine/nova/engines/mininova.py
  3. 17
      src/searchengine/nova/engines/torrentreactor.py
  4. 4
      src/searchengine/nova/engines/torrentz.py
  5. 2
      src/searchengine/nova/engines/versions.txt
  6. 6
      src/searchengine/nova/nova2.py
  7. 2
      src/searchengine/nova/novaprinter.py
  8. 6
      src/searchengine/nova3/engines/extratorrent.py
  9. 11
      src/searchengine/nova3/engines/mininova.py
  10. 17
      src/searchengine/nova3/engines/torrentreactor.py
  11. 2
      src/searchengine/nova3/engines/torrentz.py
  12. 2
      src/searchengine/nova3/engines/versions.txt
  13. 9
      src/searchengine/nova3/nova2.py

6
src/searchengine/nova/engines/extratorrent.py

@ -60,6 +60,7 @@ class extratorrent(object):
self.pending_size = False self.pending_size = False
self.next_queries = True self.next_queries = True
self.pending_next_queries = False self.pending_next_queries = False
self.next_queries_set = set()
def handle_starttag(self, tag, attrs): def handle_starttag(self, tag, attrs):
if self.current_item: if self.current_item:
@ -74,7 +75,7 @@ class extratorrent(object):
#description #description
self.current_item["desc_link"] = "".join((self.url, link)) self.current_item["desc_link"] = "".join((self.url, link))
#remove view at the beginning #remove view at the beginning
self.current_item["name"] = params["title"][5:] self.current_item["name"] = params["title"][5:].replace("&amp;", "&")
self.pending_size = True self.pending_size = True
elif link[8] == "_": elif link[8] == "_":
#download link #download link
@ -108,7 +109,10 @@ class extratorrent(object):
elif self.pending_next_queries: elif self.pending_next_queries:
if tag == "a": if tag == "a":
params = dict(attrs) params = dict(attrs)
if params["title"] in self.next_queries_set:
return
self.list_searches.append(params['href']) self.list_searches.append(params['href'])
self.next_queries_set.add(params["title"])
if params["title"] == "10": if params["title"] == "10":
self.pending_next_queries = False self.pending_next_queries = False
else: else:

11
src/searchengine/nova/engines/mininova.py

@ -68,12 +68,11 @@ class mininova(object):
params = dict(attrs) params = dict(attrs)
link = params["href"] link = params["href"]
if link.startswith("/get/"): if link.startswith("/tor/"):
#download link
self.current_item["link"] = "".join((self.url, link))
elif link.startswith("/tor/"):
#description #description
self.current_item["desc_link"] = "".join((self.url, link)) self.current_item["desc_link"] = "".join((self.url, link))
#get download link from description by id
self.current_item["link"] = "".join((self.url, "/get/", link[5:-2]))
self.cur_item_name = "name" self.cur_item_name = "name"
self.current_item["name"] = "" self.current_item["name"] = ""
elif self.next_queries and link.startswith("/search"): elif self.next_queries and link.startswith("/search"):
@ -83,7 +82,7 @@ class mininova(object):
def handle_starttag_td(self, attrs): def handle_starttag_td(self, attrs):
""" Handler of td start tag """ """ Handler of td start tag """
if ("align", "right") in attrs: if ("align", "right") in attrs:
if not "size" in self.current_item.keys(): if not "size" in self.current_item:
self.cur_item_name = "size" self.cur_item_name = "size"
self.current_item["size"] = "" self.current_item["size"] = ""
@ -113,7 +112,7 @@ class mininova(object):
prettyPrinter(self.current_item) prettyPrinter(self.current_item)
self.current_item = None self.current_item = None
elif self.cur_item_name: elif self.cur_item_name:
if tag == "a" or tag == "span": if tag == "a" or tag == "td":
self.cur_item_name = None self.cur_item_name = None
def handle_data(self, data): def handle_data(self, data):

17
src/searchengine/nova/engines/torrentreactor.py

@ -28,10 +28,9 @@
# POSSIBILITY OF SUCH DAMAGE. # POSSIBILITY OF SUCH DAMAGE.
from novaprinter import prettyPrinter from novaprinter import prettyPrinter
from helpers import download_file from helpers import download_file, retrieve_url
import urllib import urllib
from HTMLParser import HTMLParser from HTMLParser import HTMLParser
from httplib import HTTPConnection as http
from re import compile as re_compile from re import compile as re_compile
class torrentreactor(object): class torrentreactor(object):
@ -100,23 +99,13 @@ class torrentreactor(object):
def search(self, what, cat='all'): def search(self, what, cat='all'):
i = 0 i = 0
dat = '' dat = ''
connection = http("www.torrentreactor.net")
while True and i<11: while i < 11:
results = [] results = []
parser = self.SimpleHTMLParser(results, self.url) parser = self.SimpleHTMLParser(results, self.url)
query = '/torrents-search/%s/%d?sort=seeders.desc&type=all&period=none&categories=%s'%(what, (i*35), self.supported_categories[cat]) dat = retrieve_url('%s/torrent-search/%s/%s?sort=seeders.desc&type=all&period=none&categories=%s'%(self.url, what, (i*35), self.supported_categories[cat]))
connection.request("GET", query)
response = connection.getresponse()
if response.status != 200:
break
dat = response.read().decode('utf-8')
parser.feed(dat) parser.feed(dat)
parser.close() parser.close()
if len(results) <= 0: if len(results) <= 0:
break break
i += 1 i += 1
connection.close()

4
src/searchengine/nova/engines/torrentz.py

@ -1,4 +1,4 @@
#VERSION: 2.13 #VERSION: 2.14
#AUTHORS: Diego de las Heras (diegodelasheras@gmail.com) #AUTHORS: Diego de las Heras (diegodelasheras@gmail.com)
# Redistribution and use in source and binary forms, with or without # Redistribution and use in source and binary forms, with or without
@ -105,7 +105,7 @@ class torrentz(object):
while i < 6: while i < 6:
results_list = [] results_list = []
# "what" is already urlencoded # "what" is already urlencoded
html = retrieve_url(self.url + '/any?f=%s&p=%d' % (what, i)) html = retrieve_url('%s/any?f=%s&p=%d' % (self.url, what, i))
parser = self.MyHtmlParser(results_list, self.url, trackers) parser = self.MyHtmlParser(results_list, self.url, trackers)
parser.feed(html) parser.feed(html)
parser.close() parser.close()

2
src/searchengine/nova/engines/versions.txt

@ -5,5 +5,5 @@ piratebay: 2.11
extratorrent: 2.0 extratorrent: 2.0
kickasstorrents: 1.26 kickasstorrents: 1.26
btdigg: 1.24 btdigg: 1.24
torrentz: 2.13 torrentz: 2.14
legittorrents: 1.03 legittorrents: 1.03

6
src/searchengine/nova/nova2.py

@ -117,6 +117,8 @@ def displayCapabilities(supported_engines):
def run_search(engine_list): def run_search(engine_list):
""" Run search in engine """ Run search in engine
@param engine_list List with engine, query and category
@retval False if any exceptions occured @retval False if any exceptions occured
@retval True otherwise @retval True otherwise
""" """
@ -149,6 +151,7 @@ def main(args):
raise SystemExit("./nova2.py [all|engine1[,engine2]*] <category> <keywords>\n" raise SystemExit("./nova2.py [all|engine1[,engine2]*] <category> <keywords>\n"
"available engines: %s" % (','.join(supported_engines))) "available engines: %s" % (','.join(supported_engines)))
#get only unique engines with set
engines_list = set(e.lower() for e in args[0].strip().split(',')) engines_list = set(e.lower() for e in args[0].strip().split(','))
if 'all' in engines_list: if 'all' in engines_list:
@ -170,10 +173,11 @@ def main(args):
what = urllib.quote(' '.join(args[2:])) what = urllib.quote(' '.join(args[2:]))
if THREADED: if THREADED:
#child process spawning is controlled min(number of searches, number of cpu)
pool = Pool(min(len(engines_list), cpu_count())) pool = Pool(min(len(engines_list), cpu_count()))
pool.map(run_search, ([globals()[engine], what, cat] for engine in engines_list)) pool.map(run_search, ([globals()[engine], what, cat] for engine in engines_list))
else: else:
_ = [run_search([globals()[engine], what, cat]) for engine in engines_list] map(run_search, ([globals()[engine], what, cat] for engine in engines_list))
if __name__ == "__main__": if __name__ == "__main__":
main(argv[1:]) main(argv[1:])

2
src/searchengine/nova/novaprinter.py

@ -37,7 +37,7 @@ def prettyPrinter(dictionary):
outtext = "|".join((outtext, dictionary["desc_link"])) outtext = "|".join((outtext, dictionary["desc_link"]))
with open(1, 'w', encoding='utf-8', closefd=False) as utf8_stdout: with open(1, 'w', encoding='utf-8', closefd=False) as utf8_stdout:
utf8_stdout.write("".join((outtext, "\n"))) utf8_stdout.write(unicode("".join((outtext, "\n"))))
def anySizeToBytes(size_string): def anySizeToBytes(size_string):
""" """

6
src/searchengine/nova3/engines/extratorrent.py

@ -60,6 +60,7 @@ class extratorrent(object):
self.pending_size = False self.pending_size = False
self.next_queries = True self.next_queries = True
self.pending_next_queries = False self.pending_next_queries = False
self.next_queries_set = set()
def handle_starttag(self, tag, attrs): def handle_starttag(self, tag, attrs):
if self.current_item: if self.current_item:
@ -74,7 +75,7 @@ class extratorrent(object):
#description #description
self.current_item["desc_link"] = "".join((self.url, link)) self.current_item["desc_link"] = "".join((self.url, link))
#remove view at the beginning #remove view at the beginning
self.current_item["name"] = params["title"][5:] self.current_item["name"] = params["title"][5:].replace("&amp;", "&")
self.pending_size = True self.pending_size = True
elif link[8] == "_": elif link[8] == "_":
#download link #download link
@ -108,7 +109,10 @@ class extratorrent(object):
elif self.pending_next_queries: elif self.pending_next_queries:
if tag == "a": if tag == "a":
params = dict(attrs) params = dict(attrs)
if params["title"] in self.next_queries_set:
return
self.list_searches.append(params['href']) self.list_searches.append(params['href'])
self.next_queries_set.add(params["title"])
if params["title"] == "10": if params["title"] == "10":
self.pending_next_queries = False self.pending_next_queries = False
else: else:

11
src/searchengine/nova3/engines/mininova.py

@ -68,12 +68,11 @@ class mininova(object):
params = dict(attrs) params = dict(attrs)
link = params["href"] link = params["href"]
if link.startswith("/get/"): if link.startswith("/tor/"):
#download link
self.current_item["link"] = "".join((self.url, link))
elif link.startswith("/tor/"):
#description #description
self.current_item["desc_link"] = "".join((self.url, link)) self.current_item["desc_link"] = "".join((self.url, link))
#get download link from description by id
self.current_item["link"] = "".join((self.url, "/get/", link[5:-2]))
self.cur_item_name = "name" self.cur_item_name = "name"
self.current_item["name"] = "" self.current_item["name"] = ""
elif self.next_queries and link.startswith("/search"): elif self.next_queries and link.startswith("/search"):
@ -83,7 +82,7 @@ class mininova(object):
def handle_starttag_td(self, attrs): def handle_starttag_td(self, attrs):
""" Handler of td start tag """ """ Handler of td start tag """
if ("align", "right") in attrs: if ("align", "right") in attrs:
if not "size" in self.current_item.keys(): if not "size" in self.current_item:
self.cur_item_name = "size" self.cur_item_name = "size"
self.current_item["size"] = "" self.current_item["size"] = ""
@ -113,7 +112,7 @@ class mininova(object):
prettyPrinter(self.current_item) prettyPrinter(self.current_item)
self.current_item = None self.current_item = None
elif self.cur_item_name: elif self.cur_item_name:
if tag == "a" or tag == "span": if tag == "a" or tag == "td":
self.cur_item_name = None self.cur_item_name = None
def handle_data(self, data): def handle_data(self, data):

17
src/searchengine/nova3/engines/torrentreactor.py

@ -28,10 +28,9 @@
# POSSIBILITY OF SUCH DAMAGE. # POSSIBILITY OF SUCH DAMAGE.
from novaprinter import prettyPrinter from novaprinter import prettyPrinter
from helpers import download_file from helpers import download_file, retrieve_url
from urllib import parse from urllib import parse
from html.parser import HTMLParser from html.parser import HTMLParser
from http.client import HTTPConnection as http
from re import compile as re_compile from re import compile as re_compile
class torrentreactor(object): class torrentreactor(object):
@ -100,23 +99,13 @@ class torrentreactor(object):
def search(self, what, cat='all'): def search(self, what, cat='all'):
i = 0 i = 0
dat = '' dat = ''
connection = http("www.torrentreactor.net")
while True and i<11: while i < 11:
results = [] results = []
parser = self.SimpleHTMLParser(results, self.url) parser = self.SimpleHTMLParser(results, self.url)
query = '/torrents-search/%s/%d?sort=seeders.desc&type=all&period=none&categories=%s'%(what, (i*35), self.supported_categories[cat]) dat = retrieve_url('%s/torrent-search/%s/%s?sort=seeders.desc&type=all&period=none&categories=%s'%(self.url, what, (i*35), self.supported_categories[cat]))
connection.request("GET", query)
response = connection.getresponse()
if response.status != 200:
break
dat = response.read().decode('utf-8')
parser.feed(dat) parser.feed(dat)
parser.close() parser.close()
if len(results) <= 0: if len(results) <= 0:
break break
i += 1 i += 1
connection.close()

2
src/searchengine/nova3/engines/torrentz.py

@ -1,4 +1,4 @@
#VERSION: 2.13 #VERSION: 2.14
#AUTHORS: Diego de las Heras (diegodelasheras@gmail.com) #AUTHORS: Diego de las Heras (diegodelasheras@gmail.com)
# Redistribution and use in source and binary forms, with or without # Redistribution and use in source and binary forms, with or without

2
src/searchengine/nova3/engines/versions.txt

@ -5,5 +5,5 @@ piratebay: 2.11
extratorrent: 2.0 extratorrent: 2.0
kickasstorrents: 1.26 kickasstorrents: 1.26
btdigg: 1.23 btdigg: 1.23
torrentz: 2.13 torrentz: 2.14
legittorrents: 1.04 legittorrents: 1.04

9
src/searchengine/nova3/nova2.py

@ -116,6 +116,8 @@ def displayCapabilities(supported_engines):
def run_search(engine_list): def run_search(engine_list):
""" Run search in engine """ Run search in engine
@param engine_list List with engine, query and category
@retval False if any exceptions occured @retval False if any exceptions occured
@retval True otherwise @retval True otherwise
""" """
@ -128,6 +130,7 @@ def run_search(engine_list):
engine.search(what, cat) engine.search(what, cat)
else: else:
engine.search(what) engine.search(what)
return True return True
except: except:
return False return False
@ -147,6 +150,7 @@ def main(args):
raise SystemExit("./nova2.py [all|engine1[,engine2]*] <category> <keywords>\n" raise SystemExit("./nova2.py [all|engine1[,engine2]*] <category> <keywords>\n"
"available engines: %s" % (','.join(supported_engines))) "available engines: %s" % (','.join(supported_engines)))
#get only unique engines with set
engines_list = set(e.lower() for e in args[0].strip().split(',')) engines_list = set(e.lower() for e in args[0].strip().split(','))
if 'all' in engines_list: if 'all' in engines_list:
@ -166,12 +170,13 @@ def main(args):
raise SystemExit(" - ".join(('Invalid category', cat))) raise SystemExit(" - ".join(('Invalid category', cat)))
what = urllib.parse.quote(' '.join(args[2:])) what = urllib.parse.quote(' '.join(args[2:]))
if THREADED: if THREADED:
#child process spawning is controlled min(number of searches, number of cpu)
with Pool(min(len(engines_list), cpu_count())) as pool: with Pool(min(len(engines_list), cpu_count())) as pool:
pool.map(run_search, ([globals()[engine], what, cat] for engine in engines_list)) pool.map(run_search, ([globals()[engine], what, cat] for engine in engines_list))
else: else:
_ = [run_search([globals()[engine], what, cat]) for engine in engines_list] #py3 note: map is needed to be evaluated for content to be executed
all(map(run_search, ([globals()[engine], what, cat] for engine in engines_list)))
if __name__ == "__main__": if __name__ == "__main__":
main(argv[1:]) main(argv[1:])

Loading…
Cancel
Save