Browse Source

- Should completely fix Unicode problems for all search engines

adaptive-webui-19844
Christophe Dumez 16 years ago
parent
commit
9e46c6c047
  1. 6
      src/search_engine/engines/isohunt.py
  2. 6
      src/search_engine/engines/mininova.py
  3. 6
      src/search_engine/engines/piratebay.py
  4. 7
      src/search_engine/engines/torrentreactor.py
  5. 8
      src/search_engine/engines/versions.txt
  6. 2
      src/search_engine/helpers.py
  7. 2
      src/search_engine/novaprinter.py

6
src/search_engine/engines/isohunt.py

@ -1,4 +1,4 @@
#VERSION: 1.1 #VERSION: 1.2
#AUTHORS: Christophe Dumez (chris@qbittorrent.org) #AUTHORS: Christophe Dumez (chris@qbittorrent.org)
# Redistribution and use in source and binary forms, with or without # Redistribution and use in source and binary forms, with or without
@ -27,7 +27,7 @@
from novaprinter import prettyPrinter from novaprinter import prettyPrinter
import re import re
import urllib from helpers import retrieve_url
class isohunt(object): class isohunt(object):
url = 'http://isohunt.com' url = 'http://isohunt.com'
@ -37,7 +37,7 @@ class isohunt(object):
i = 1 i = 1
while True and i<11: while True and i<11:
res = 0 res = 0
dat = urllib.urlopen(self.url+'/torrents.php?ihq=%s&ihp=%s&ihs1=2&iho1=d'%(what,i)).read().decode('utf8', 'replace') dat = retrieve_url(self.url+'/torrents.php?ihq=%s&ihp=%s&ihs1=2&iho1=d'%(what,i))
# I know it's not very readable, but the SGML parser feels in pain # I know it's not very readable, but the SGML parser feels in pain
section_re = re.compile('(?s)id=link.*?</tr><tr') section_re = re.compile('(?s)id=link.*?</tr><tr')
torrent_re = re.compile('(?s)torrent_details/(?P<link>.*?[^/]+).*?' torrent_re = re.compile('(?s)torrent_details/(?P<link>.*?[^/]+).*?'

6
src/search_engine/engines/mininova.py

@ -1,4 +1,4 @@
#VERSION: 1.2 #VERSION: 1.21
#AUTHORS: Fabien Devaux (fab@gnux.info) #AUTHORS: Fabien Devaux (fab@gnux.info)
# Redistribution and use in source and binary forms, with or without # Redistribution and use in source and binary forms, with or without
@ -64,14 +64,12 @@ class mininova(object):
return ''.join([ get_text(n) for n in txt.childNodes]) return ''.join([ get_text(n) for n in txt.childNodes])
page = 1 page = 1
while True and page<11: while True and page<11:
file = open('/home/chris/mytest.txt', 'w')
file.write(self.url+'/search/%s/seeds/%d'%(what, page))
file.close()
res = 0 res = 0
dat = retrieve_url(self.url+'/search/%s/seeds/%d'%(what, page)) dat = retrieve_url(self.url+'/search/%s/seeds/%d'%(what, page))
dat = re.sub("<a href=\"http://www.boardreader.com/index.php.*\"", "<a href=\"plop\"", dat) dat = re.sub("<a href=\"http://www.boardreader.com/index.php.*\"", "<a href=\"plop\"", dat)
dat = re.sub("<=", "&lt;=", dat) dat = re.sub("<=", "&lt;=", dat)
dat = re.sub("&\s", "&amp; ", dat) dat = re.sub("&\s", "&amp; ", dat)
dat = re.sub("&(?!amp)", "&amp;", dat)
x = minidom.parseString(dat) x = minidom.parseString(dat)
table = x.getElementsByTagName('table').item(0) table = x.getElementsByTagName('table').item(0)
if not table: return if not table: return

6
src/search_engine/engines/piratebay.py

@ -1,4 +1,4 @@
#VERSION: 1.04 #VERSION: 1.1
#AUTHORS: Fabien Devaux (fab@gnux.info) #AUTHORS: Fabien Devaux (fab@gnux.info)
# Redistribution and use in source and binary forms, with or without # Redistribution and use in source and binary forms, with or without
@ -27,7 +27,7 @@
from novaprinter import prettyPrinter from novaprinter import prettyPrinter
import sgmllib import sgmllib
import urllib from helpers import retrieve_url
class piratebay(object): class piratebay(object):
url = 'http://thepiratebay.org' url = 'http://thepiratebay.org'
@ -96,7 +96,7 @@ class piratebay(object):
while True and i<11: while True and i<11:
results = [] results = []
parser = self.SimpleSGMLParser(results, self.url) parser = self.SimpleSGMLParser(results, self.url)
dat = urllib.urlopen(self.url+'/search/%s/%u/7' % (what, i)).read() dat = retrieve_url(self.url+'/search/%s/%u/7' % (what, i))
parser.feed(dat) parser.feed(dat)
parser.close() parser.close()
if len(results) <= 0: if len(results) <= 0:

7
src/search_engine/engines/torrentreactor.py

@ -1,4 +1,4 @@
#VERSION: 1.02 #VERSION: 1.1
#AUTHORS: Gekko Dam Beer (gekko04@users.sourceforge.net) #AUTHORS: Gekko Dam Beer (gekko04@users.sourceforge.net)
# Redistribution and use in source and binary forms, with or without # Redistribution and use in source and binary forms, with or without
@ -27,7 +27,7 @@
from novaprinter import prettyPrinter from novaprinter import prettyPrinter
import sgmllib import sgmllib
import urllib from helpers import retrieve_url
class torrentreactor(object): class torrentreactor(object):
url = 'http://www.torrentreactor.net' url = 'http://www.torrentreactor.net'
@ -92,8 +92,7 @@ class torrentreactor(object):
while True and i<11: while True and i<11:
results = [] results = []
parser = self.SimpleSGMLParser(results, self.url) parser = self.SimpleSGMLParser(results, self.url)
dat = urllib.urlopen(self.url+'/search.php?search=&words=%s&cid=&sid=&type=2&orderby=a.seeds&asc=0&skip=%s'%(what,(i*35))).read().decode('utf-8', 'replace') dat = retrieve_url(self.url+'/search.php?search=&words=%s&cid=&sid=&type=2&orderby=a.seeds&asc=0&skip=%s'%(what,(i*35)))
#print "loading page: "+self.url+'/search.php?search=&words=%s&cid=&sid=&type=2&orderby=a.seeds&asc=0&skip=%s'%(what,(i*35))
parser.feed(dat) parser.feed(dat)
parser.close() parser.close()
if len(results) <= 0: if len(results) <= 0:

8
src/search_engine/engines/versions.txt

@ -1,5 +1,5 @@
isohunt: 1.1 isohunt: 1.2
torrentreactor: 1.02 torrentreactor: 1.1
btjunkie: 2.1 btjunkie: 2.1
mininova: 1.2 mininova: 1.21
piratebay: 1.04 piratebay: 1.1

2
src/search_engine/helpers.py

@ -54,6 +54,6 @@ def retrieve_url(url):
ignore, charset = info['Content-Type'].split('charset=') ignore, charset = info['Content-Type'].split('charset=')
except: except:
pass pass
dat = dat.decode(charset) dat = dat.decode(charset, 'replace')
dat = htmlentitydecode(dat) dat = htmlentitydecode(dat)
return dat.encode('utf-8', 'replace') return dat.encode('utf-8', 'replace')

2
src/search_engine/novaprinter.py

@ -25,6 +25,8 @@
# POSSIBILITY OF SUCH DAMAGE. # POSSIBILITY OF SUCH DAMAGE.
def prettyPrinter(dictionnary): def prettyPrinter(dictionnary):
if isinstance(dictionnary['size'], str):
dictionnary['size'] = dictionnary['size'].decode('utf-8')
dictionnary['size'] = anySizeToBytes(dictionnary['size']) dictionnary['size'] = anySizeToBytes(dictionnary['size'])
if isinstance(dictionnary['name'], unicode): if isinstance(dictionnary['name'], unicode):
dictionnary['name'] = dictionnary['name'].encode('utf-8') dictionnary['name'] = dictionnary['name'].encode('utf-8')

Loading…
Cancel
Save