#VERSION: 1.0

# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#    * Redistributions of source code must retain the above copyright notice,
#      this list of conditions and the following disclaimer.
#    * Redistributions in binary form must reproduce the above copyright
#      notice, this list of conditions and the following disclaimer in the
#      documentation and/or other materials provided with the distribution.
#    * Neither the name of the author nor the names of its contributors may be
#      used to endorse or promote products derived from this software without
#      specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

import re

try:  # Python 2 module names
    import htmlentitydefs
    import urllib2
except ImportError:  # Python 3: same APIs, bound to the names used below
    import html.entities as htmlentitydefs
    import urllib.request as urllib2

try:
    _unichr = unichr  # Python 2
except NameError:
    _unichr = chr  # Python 3

# One entity reference: hexadecimal (group 1), decimal (group 2) or
# named (group 3). Compiled once instead of on every call.
_ENTITY_RE = re.compile(
    u'&(?:#[xX]([0-9a-fA-F]+)|#([0-9]+)|([A-Za-z][A-Za-z0-9]*));')

# The charset parameter of a Content-Type header, optionally quoted.
_CHARSET_RE = re.compile(r'charset\s*=\s*["\']?([^\s;"\']+)', re.IGNORECASE)


def _entity2char(match):
    """Return the character for one matched entity, or the entity unchanged."""
    hexa, decimal, name = match.groups()
    try:
        if hexa is not None:
            return _unichr(int(hexa, 16))
        if decimal is not None:
            return _unichr(int(decimal))
        return _unichr(htmlentitydefs.name2codepoint[name])
    except (KeyError, ValueError, OverflowError):
        # Unknown name or code point outside the supported range:
        # keep the original text rather than failing the whole page.
        return match.group(0)


def htmlentitydecode(s):
    """Replace HTML entity references in s with the characters they denote.

    Handles named (&eacute;), decimal (&#233;) and hexadecimal (&#xE9;)
    references. References that cannot be decoded (unknown name, code point
    out of range) are left untouched.

    The text is scanned in a single pass so that the output of one
    substitution is never decoded again: "&amp;#233;" yields the literal
    text "&#233;", not the character it would denote.
    (Single-pass idea inspired from
    http://mail.python.org/pipermail/python-list/2007-June/443813.html)

    s is expected to be a unicode string; the decoded string is returned.
    """
    return _ENTITY_RE.sub(_entity2char, s)


def _charset_from_content_type(content_type, default='utf-8'):
    """Return the charset named in a Content-Type header value, or default."""
    found = _CHARSET_RE.search(content_type or '')
    if found is None:
        return default
    return found.group(1)


def retrieve_url(url):
    """ Return the content of the url page as a string

    The body is decoded with the charset announced in the Content-Type
    header (UTF-8 when none is given or the announced one is unusable),
    HTML entities are expanded, and the result is returned encoded as UTF-8.
    Errors raised while opening or reading the url are propagated.
    """
    response = urllib2.urlopen(urllib2.Request(url))
    try:
        raw = response.read()
        content_type = response.info().get('Content-Type')
    finally:
        # Release the connection even if reading fails.
        response.close()
    charset = _charset_from_content_type(content_type)
    try:
        text = raw.decode(charset, 'replace')
    except (LookupError, ValueError):
        # The server announced a charset Python cannot use as a text codec.
        text = raw.decode('utf-8', 'replace')
    return htmlentitydecode(text).encode('utf-8', 'replace')