- Should completely fix Unicode problems for all search engines

2025-03-13 05:41:17 +00:00 · 2009-03-26 20:14:05 +00:00 · 2009-03-26 20:14:05 +00:00 · 9e46c6c047
commit 9e46c6c047
parent a2e9210665
7 changed files with 18 additions and 19 deletions
--- a/src/search_engine/engines/isohunt.py
+++ b/src/search_engine/engines/isohunt.py
@ -1,4 +1,4 @@
-#VERSION: 1.1
+#VERSION: 1.2
 #AUTHORS: Christophe Dumez (chris@qbittorrent.org)

 # Redistribution and use in source and binary forms, with or without
@ -27,7 +27,7 @@

 from novaprinter import prettyPrinter
 import re
-import urllib
+from helpers import retrieve_url

 class isohunt(object):
 	url = 'http://isohunt.com'
@ -37,7 +37,7 @@ class isohunt(object):
 		i = 1
 		while True and i<11:
 			res = 0
-			dat = urllib.urlopen(self.url+'/torrents.php?ihq=%s&ihp=%s&ihs1=2&iho1=d'%(what,i)).read().decode('utf8', 'replace')
+			dat = retrieve_url(self.url+'/torrents.php?ihq=%s&ihp=%s&ihs1=2&iho1=d'%(what,i))
 			# I know it's not very readable, but the SGML parser feels in pain
 			section_re = re.compile('(?s)id=link.*?</tr><tr')
 			torrent_re = re.compile('(?s)torrent_details/(?P<link>.*?[^/]+).*?'
--- a/src/search_engine/engines/mininova.py
+++ b/src/search_engine/engines/mininova.py
@ -1,4 +1,4 @@
-#VERSION: 1.2
+#VERSION: 1.21
 #AUTHORS: Fabien Devaux (fab@gnux.info)

 # Redistribution and use in source and binary forms, with or without
@ -64,14 +64,12 @@ class mininova(object):
 				return ''.join([ get_text(n) for n in txt.childNodes])
 		page = 1
 		while True and page<11:
-			file = open('/home/chris/mytest.txt', 'w')
-			file.write(self.url+'/search/%s/seeds/%d'%(what, page))
-			file.close()
 			res = 0
 			dat = retrieve_url(self.url+'/search/%s/seeds/%d'%(what, page))
 			dat = re.sub("<a href=\"http://www.boardreader.com/index.php.*\"", "<a href=\"plop\"", dat)
 			dat = re.sub("<=", "&lt;=", dat)
 			dat = re.sub("&\s", "&amp; ", dat)
+			dat = re.sub("&(?!amp)", "&amp;", dat)
 			x = minidom.parseString(dat)
 			table = x.getElementsByTagName('table').item(0)
 			if not table: return
--- a/src/search_engine/engines/piratebay.py
+++ b/src/search_engine/engines/piratebay.py
@ -1,4 +1,4 @@
-#VERSION: 1.04
+#VERSION: 1.1
 #AUTHORS: Fabien Devaux (fab@gnux.info)

 # Redistribution and use in source and binary forms, with or without
@ -27,7 +27,7 @@

 from novaprinter import prettyPrinter
 import sgmllib
-import urllib
+from helpers import retrieve_url

 class piratebay(object):
 	url = 'http://thepiratebay.org'
@ -96,7 +96,7 @@ class piratebay(object):
 		while True and i<11:
 			results = []
 			parser = self.SimpleSGMLParser(results, self.url)
-			dat = urllib.urlopen(self.url+'/search/%s/%u/7' % (what, i)).read()
+			dat = retrieve_url(self.url+'/search/%s/%u/7' % (what, i))
 			parser.feed(dat)
 			parser.close()
 			if len(results) <= 0:
--- a/src/search_engine/engines/torrentreactor.py
+++ b/src/search_engine/engines/torrentreactor.py
@ -1,4 +1,4 @@
-#VERSION: 1.02
+#VERSION: 1.1
 #AUTHORS: Gekko Dam Beer (gekko04@users.sourceforge.net)

 # Redistribution and use in source and binary forms, with or without
@ -27,7 +27,7 @@

 from novaprinter import prettyPrinter
 import sgmllib
-import urllib
+from helpers import retrieve_url

 class torrentreactor(object):
 	url = 'http://www.torrentreactor.net'
@ -92,8 +92,7 @@ class torrentreactor(object):
 		while True and i<11:
 			results = []
 			parser = self.SimpleSGMLParser(results, self.url)
-			dat = urllib.urlopen(self.url+'/search.php?search=&words=%s&cid=&sid=&type=2&orderby=a.seeds&asc=0&skip=%s'%(what,(i*35))).read().decode('utf-8', 'replace')
-			#print "loading page: "+self.url+'/search.php?search=&words=%s&cid=&sid=&type=2&orderby=a.seeds&asc=0&skip=%s'%(what,(i*35))
+			dat = retrieve_url(self.url+'/search.php?search=&words=%s&cid=&sid=&type=2&orderby=a.seeds&asc=0&skip=%s'%(what,(i*35)))
 			parser.feed(dat)
 			parser.close()
 			if len(results) <= 0:
--- a/src/search_engine/engines/versions.txt
+++ b/src/search_engine/engines/versions.txt
@ -1,5 +1,5 @@
-isohunt: 1.1
-torrentreactor: 1.02
+isohunt: 1.2
+torrentreactor: 1.1
 btjunkie: 2.1
-mininova: 1.2
-piratebay: 1.04
+mininova: 1.21
+piratebay: 1.1
--- a/src/search_engine/helpers.py
+++ b/src/search_engine/helpers.py
@ -54,6 +54,6 @@ def retrieve_url(url):
        ignore, charset = info['Content-Type'].split('charset=')
    except:
        pass
-    dat = dat.decode(charset)
+    dat = dat.decode(charset, 'replace')
    dat = htmlentitydecode(dat)
    return dat.encode('utf-8', 'replace')
--- a/src/search_engine/novaprinter.py
+++ b/src/search_engine/novaprinter.py
@ -25,6 +25,8 @@
 # POSSIBILITY OF SUCH DAMAGE.

 def prettyPrinter(dictionnary):
+	if isinstance(dictionnary['size'], str):
+		dictionnary['size'] = dictionnary['size'].decode('utf-8')
 	dictionnary['size'] = anySizeToBytes(dictionnary['size'])
 	if isinstance(dictionnary['name'], unicode):
 		dictionnary['name'] = dictionnary['name'].encode('utf-8')