Christophe Dumez
16 years ago
8 changed files with 95 additions and 22 deletions
@ -1,5 +1,5 @@
@@ -1,5 +1,5 @@
|
||||
isohunt: 1.1 |
||||
torrentreactor: 1.02 |
||||
btjunkie: 2.0 |
||||
mininova: 1.13 |
||||
btjunkie: 2.1 |
||||
mininova: 1.2 |
||||
piratebay: 1.04 |
||||
|
@ -0,0 +1,59 @@
@@ -0,0 +1,59 @@
|
||||
#VERSION: 1.0 |
||||
|
||||
# Redistribution and use in source and binary forms, with or without |
||||
# modification, are permitted provided that the following conditions are met: |
||||
# |
||||
# * Redistributions of source code must retain the above copyright notice, |
||||
# this list of conditions and the following disclaimer. |
||||
# * Redistributions in binary form must reproduce the above copyright |
||||
# notice, this list of conditions and the following disclaimer in the |
||||
# documentation and/or other materials provided with the distribution. |
||||
# * Neither the name of the author nor the names of its contributors may be |
||||
# used to endorse or promote products derived from this software without |
||||
# specific prior written permission. |
||||
# |
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
||||
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
||||
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
||||
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
||||
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
||||
# POSSIBILITY OF SUCH DAMAGE. |
||||
|
||||
import re, htmlentitydefs |
||||
import urllib2 |
||||
|
||||
# Matches a single HTML character reference. Exactly one group is set:
#   group 1 -> hexadecimal digits of a reference such as &#xe9; / &#XE9;
#   group 2 -> decimal digits of a reference such as &#233;
#   group 3 -> the name of a reference such as &eacute;
_ENTITY_RE = re.compile(
    r'&(?:#[xX]([0-9a-fA-F]+)|#([0-9]+)|([A-Za-z][A-Za-z0-9]*));')


def htmlentitydecode(s):
    """Return *s* with its HTML character references replaced by characters.

    Three kinds of references are decoded:

    * named references, e.g. ``&eacute;`` (looked up in
      ``htmlentitydefs.name2codepoint``),
    * decimal references, e.g. ``&#233;``,
    * hexadecimal references, e.g. ``&#xe9;``.

    (Inspired from
    http://mail.python.org/pipermail/python-list/2007-June/443813.html)

    The text is scanned in a single pass, so the output of one replacement
    is never decoded a second time: ``&amp;#233;`` yields the literal text
    ``&#233;``, not the accented character.

    References that cannot be decoded (unknown name, code point outside the
    range supported by ``unichr``) are left in the text unchanged instead of
    raising.
    """
    def entity2char(m):
        hexa, decimal, name = m.groups()
        try:
            if hexa is not None:
                return unichr(int(hexa, 16))
            if decimal is not None:
                return unichr(int(decimal))
            return unichr(htmlentitydefs.name2codepoint[name])
        except (KeyError, ValueError, OverflowError):
            # KeyError: unknown entity name.
            # ValueError / OverflowError: code point rejected by unichr().
            # Either way keep the original text rather than failing the
            # whole page.
            return m.group(0)

    return _ENTITY_RE.sub(entity2char, s)
||||
|
||||
def retrieve_url(url):
    """Return the content of the page at *url* as a UTF-8 encoded string.

    The body is decoded using the ``charset`` parameter of the response's
    ``Content-Type`` header (UTF-8 when the header or the parameter is
    missing, or when the announced charset is unknown to Python), HTML
    character references are resolved with ``htmlentitydecode``, and the
    result is re-encoded as UTF-8.

    Bytes that are invalid in the selected charset are replaced instead of
    raising, so a page with a wrong charset declaration is still returned.
    Errors raised by ``urllib2.urlopen`` itself are propagated to the caller.
    """
    response = urllib2.urlopen(urllib2.Request(url))
    try:
        dat = response.read()
        content_type = response.info().get('Content-Type') or ''
    finally:
        # Release the connection even if reading the body fails.
        response.close()

    # Look for the charset among the header parameters, e.g.
    #   text/html; charset="ISO-8859-1"; other=param
    # The parameter name is case-insensitive and the value may be quoted.
    charset = 'utf-8'
    for param in content_type.split(';')[1:]:
        name, sep, value = param.partition('=')
        if sep and name.strip().lower() == 'charset':
            value = value.strip().strip('"\'')
            if value:
                charset = value
            break

    try:
        dat = dat.decode(charset, 'replace')
    except LookupError:
        # The server announced a charset Python has no codec for.
        dat = dat.decode('utf-8', 'replace')
    dat = htmlentitydecode(dat)
    return dat.encode('utf-8', 'replace')
Loading…
Reference in new issue