Browse Source

extracted template, implemented GeoLocationService with OpenStreetMap

master
toyg 11 years ago
parent
commit
46ef9c8f47
  1. 12
      Twistmapper.py
  2. 4
      map.html
  3. 159
      twistscraper.py

12
Twistmapper.py

@ -11,12 +11,12 @@ from twistscraper import TwisterScraper
TEMPLATE = None TEMPLATE = None
with open("map.html", "rb") as mapTemplate: with open("map.html", "rb") as mapTemplate:
TEMPLATE = Template(mapTemplate.read()) TEMPLATE = Template(mapTemplate.read().decode('utf-8'))
def generate_map(userdb): def generate_map(userdb):
ts = TwisterScraper(userdb) ts = TwisterScraper(userdb)
loc_users = [u for u in ts.db.users.values() if u.location != ''] loc_users = [u for u in ts.db.users.values() if hasattr(u, 'location') and u.location != '']
noLoc_user_num = len(ts.db.users) - len(loc_users) noLoc_user_num = len(ts.db.users) - len(loc_users)
loc_users_fake_num = 0 loc_users_fake_num = 0
locDb = {} locDb = {}
@ -32,7 +32,7 @@ def generate_map(userdb):
locDb[u.location]['users'] = [u.username] locDb[u.location]['users'] = [u.username]
else: else:
loc_users_fake_num += 1 loc_users_fake_num += 1
# second pass to aggregate misspellings # second pass to aggregate misspellings
done = [] done = []
newLocDb = {} newLocDb = {}
for loc, locDict in locDb.items(): for loc, locDict in locDb.items():
@ -48,7 +48,7 @@ def generate_map(userdb):
# find the most popular name # find the most popular name
locMax = max(sameCoord, key=lambda x: len(x[1])) locMax = max(sameCoord, key=lambda x: len(x[1]))
location = locMax[0] location = locMax[0]
coordHash = '/'.join([str(locDict['coordinates']['lat']), str(locDict['coordinates']['lng'])]) coordHash = '/'.join([str(locDict['coordinates'][0]), str(locDict['coordinates'][1])])
# if we haven't seen this set of coordinates yet... # if we haven't seen this set of coordinates yet...
if coordHash not in done: if coordHash not in done:
@ -70,8 +70,8 @@ def generate_map(userdb):
for k in newLocDb.keys(): for k in newLocDb.keys():
locStrings.append("['<h4>{name} - {numusers}</h4><small>{users}</small>', {lat}, {lng}]".format( locStrings.append("['<h4>{name} - {numusers}</h4><small>{users}</small>', {lat}, {lng}]".format(
name=k.replace("'", "&apos;"), name=k.replace("'", "&apos;"),
lat=newLocDb[k]['coordinates']['lat'], lat=newLocDb[k]['coordinates'][0],
lng=newLocDb[k]['coordinates']['lng'], lng=newLocDb[k]['coordinates'][1],
users=',<br />'.join(newLocDb[k]['users']), users=',<br />'.join(newLocDb[k]['users']),
numusers=len(newLocDb[k]['users']))) numusers=len(newLocDb[k]['users'])))
locStrings.sort() locStrings.sort()

4
map.html

@ -30,7 +30,9 @@
<li>Users with unrealistic location: $users_fake_loc</li> <li>Users with unrealistic location: $users_fake_loc</li>
<li>Users without location: $users_no_loc</li> <li>Users without location: $users_no_loc</li>
</ul> </ul>
<div id="bottom">For any feedback, ping @toyg on Twister.</div> <div id="bottom">For any feedback, ping @toyg on Twister. <br/>Location coordinates provided by <a
href="http://nominatim.openstreetmap.org">OpenStreetMap Nominatim</a>.
</div>
<script type="text/javascript"> <script type="text/javascript">
// Define your locations: HTML content for the info window, latitude, longitude // Define your locations: HTML content for the info window, latitude, longitude
var locations = [$locations]; var locations = [$locations];

159
twistscraper.py

@ -1,17 +1,16 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import json
from http.client import HTTPException from http.client import HTTPException
from urllib.parse import urlencode from urllib.parse import urlencode
from urllib.request import urlopen from urllib.request import urlopen, Request
from genericpath import exists import datetime
from os.path import expanduser import json
__author__ = 'Giacomo Lacava'
import time, datetime
import pickle import pickle
import time
import sys import sys
from os.path import expanduser, exists
cacheTimeout = 24 * 3600 cacheTimeout = 24 * 3600
try: try:
@ -21,43 +20,102 @@ except ImportError as exc:
sys.exit(-1) sys.exit(-1)
class GeoLocationService: class MaxGeoRequestsException(Exception):
def __init__(self, since):
super(Exception, self).__init__()
self.lastReset = since
print(self.__str__())
def __str__(self):
return "Reached max amounts of requests per hour ({} since {})".format(GeoLocationService.MAXREQUESTS,
self.lastReset.isoformat())
class Borg:
    """Base class whose instances all share one attribute dictionary.

    Instances stay distinct objects, but an attribute set through one
    of them is visible through every other one.
    """

    # The single dictionary backing the attributes of every instance.
    _shared_state = {}

    def __init__(self):
        # Swap the per-instance namespace for the shared one.
        shared = self._shared_state
        self.__dict__ = shared
class GeoLocationService(Borg):
MAXREQUESTS = 60 * 60 # 1 req per second
CACHEFILE = expanduser('~/.twister/_localusers_geolocation.db') CACHEFILE = expanduser('~/.twister/_localusers_geolocation.db')
_GMAP_URL = "https://maps.googleapis.com/maps/api/geocode/json?sensor=false&{query}" NOMINATIM_URL = "http://nominatim.openstreetmap.org/search?format=jsonv2&{query}"
def __init__(self): def __init__(self):
self.db = {} super(GeoLocationService, self).__init__()
if exists(GeoLocationService.CACHEFILE): if len(self.__dict__) == 0: # set up only if it's the first instance
with open(GeoLocationService.CACHEFILE, 'rb') as gcache: self.db = {}
self.db = pickle.load(gcache) self._counter = 0
self._lastCounterReset = None
self._resetCounter()
if exists(GeoLocationService.CACHEFILE):
with open(GeoLocationService.CACHEFILE, 'rb') as gcache:
self.db = pickle.load(gcache)
def _resetCounter(self):
self._counter = 0
self._lastCounterReset = datetime.datetime.now()
def canWeAsk(self):
""" Check if we can make a lookup.
:return: boolean
"""
if self._counter <= (GeoLocationService.MAXREQUESTS - 1):
return True
now = datetime.datetime.now()
delta = now - self._lastCounterReset
if delta.total_seconds() > (60 * 60):
self._resetCounter()
return True
return False
def locate(self, location): def locate(self, location):
""" """
Query Google API and save coordinates. Should work until we start having more than 50 new locatable Query Google API and save coordinates. Max 50 requests per hour
users per hour.
:return: dict with coordinates { 'lat':12345, 'lng':13245 } :return: dict with coordinates { 'lat':12345, 'lng':13245 }
:raises: MaxGeoRequestsException when geolocation threshold has been reached
""" """
# if in cache, return that # if in cache, return that
if location in self.db: if location in self.db:
# this harmonization is due to old data
if type(self.db[location]) == dict:
coordTuple = (self.db[location]['lat'], self.db[location]['lng'])
self.db[location] = coordTuple
return self.db[location] return self.db[location]
# ok, let's look it up
loc = urlencode({'address': location}) # not in cache? ok, let's look it up
urldoc = urlopen(GeoLocationService._GMAP_URL.format(query=loc))
jsObj = json.loads(urldoc.readall().decode('utf-8')) if not self.canWeAsk():
if len(jsObj['results']) > 0: # sorry, can't do it now
# discard commercial results raise MaxGeoRequestsException(self._lastCounterReset)
locTypes = jsObj['results'][0]['address_components'][0]['types']
if not 'premise' in locTypes and not 'route' in locTypes and not 'establishment' in locTypes and not 'subpremise' in locTypes: print("Looking up \"{}\"".format(location))
coords = jsObj['results'][0]['geometry']['location'] loc = urlencode({'q': location})
# let's cache it and save db print(GeoLocationService.NOMINATIM_URL.format(query=loc))
self.db[location] = coords request = Request(GeoLocationService.NOMINATIM_URL.format(query=loc))
self.saveDb() request.add_header('User-Agent', 'Twister User-Mapper script http://static.pythonaro.com/twistmap/')
return coords urldoc = urlopen(request)
# still here? it's all rubbish self._counter += 1
jsonText = urldoc.readall().decode('utf-8')
jsObj = json.loads(jsonText)
if len(jsObj) > 0:
coords = jsObj[0]['lat'], jsObj[0]['lon']
# let's cache it and save db
self.db[location] = coords
self.saveDb()
time.sleep(1) # to follow nominatim usage policy: http://wiki.openstreetmap.org/wiki/Nominatim_usage_policy
return coords
# still here? it's all rubbish
return None return None
def saveDb(self): def saveDb(self):
""" Save db to file """
with open(GeoLocationService.CACHEFILE, 'wb') as gfile: with open(GeoLocationService.CACHEFILE, 'wb') as gfile:
pickle.dump(self.db, gfile) pickle.dump(self.db, gfile)
@ -78,14 +136,31 @@ class User:
def locate(self): def locate(self):
# OO wrapper for GeoLocationService.locate() # OO wrapper for GeoLocationService.locate()
if self.location == '': if hasattr(self, 'location') and self.location == '':
return None return None
if self.coords is not None: if hasattr(self, 'coords') and self.coords is not None:
return self.coords return self.coords
if not hasattr(self, 'locService'):
self.__dict__['locService'] = GeoLocationService()
self.coords = self.locService.locate(self.location) self.coords = self.locService.locate(self.location)
return self.coords return self.coords
def __setstate__(self, data):
    """ Unpickling hook: restore the saved attributes, then attach a location service.

    :param data: state dictionary handed over by pickle when loading
    """
    # The service is never stored in the pickle, so build one for this object.
    service = GeoLocationService()
    self.__dict__ = data
    self.locService = service
def __getstate__(self):
""" Custom pickler to drop references to the location service
:return: dict containing the object state
"""
self.locService = None
return self.__dict__
class TwisterDb: class TwisterDb:
def __init__(self): def __init__(self):
@ -104,6 +179,7 @@ class TwisterScraper:
passwd=password) passwd=password)
self.twister = AuthServiceProxy(self.serverUrl) self.twister = AuthServiceProxy(self.serverUrl)
self.dbFile = dbPath self.dbFile = dbPath
self.locService = GeoLocationService()
try: try:
with open(self.dbFile, 'rb') as dbFile: with open(self.dbFile, 'rb') as dbFile:
@ -120,9 +196,6 @@ class TwisterScraper:
def scrape_users(self): def scrape_users(self):
nextHash = 0 nextHash = 0
#if self.db.lastBlockHash is not None and len(self.db.users) != 0:
# nextHash = self.db.lastBlockHash
#else:
nextHash = self.twister.getblockhash(0) nextHash = self.twister.getblockhash(0)
usernames = set() usernames = set()
@ -142,7 +215,7 @@ class TwisterScraper:
if len(self.db.users) == 0: if len(self.db.users) == 0:
# first run # first run
for u in usernames: for u in usernames:
blankUser = User() blankUser = User(self.locService)
blankUser.username = u blankUser.username = u
blankUser.updateTime = datetime.datetime.now() - self.CACHE_MAX_DURATION blankUser.updateTime = datetime.datetime.now() - self.CACHE_MAX_DURATION
self.saveDb() self.saveDb()
@ -157,13 +230,20 @@ class TwisterScraper:
for n, u in enumerate(to_fetch): for n, u in enumerate(to_fetch):
try: try:
user = self._fetch_user_details(u) user = self._fetch_user_details(u)
if hasattr(u, 'location'):
try:
u.locate()
except MaxGeoRequestsException:
print("Could not locate '' because of max request limit reached")
self.db.users[user.username] = user self.db.users[user.username] = user
self.saveDb() if n % 5 == 0:
self.saveDb()
print("({line} of {total}) Fetched {user} ...".format(user=u, line=n, total=total_to_fetch)) print("({line} of {total}) Fetched {user} ...".format(user=u, line=n, total=total_to_fetch))
except HTTPException as e: except HTTPException as e:
print("Connection error retrieving user {0}: {1}".format(u, str(e))) print("Connection error retrieving user {0}: {1}".format(u, str(e)))
def saveDb(self): def saveDb(self):
print("Saving db")
try: try:
with open(self.dbFile, 'wb') as dbFile: with open(self.dbFile, 'wb') as dbFile:
pickle.dump(self.db, dbFile) pickle.dump(self.db, dbFile)
@ -177,7 +257,8 @@ class TwisterScraper:
pass pass
with open(self.dbFile, 'wb') as dbFile: with open(self.dbFile, 'wb') as dbFile:
pickle.dump(self.db, dbFile) pickle.dump(self.db, dbFile)
# once clean, re-raise
raise
def get_posts_since(self, username, dateObj, maxNum=1000): def get_posts_since(self, username, dateObj, maxNum=1000):
since_epoch = time.mktime(dateObj.timetuple()) since_epoch = time.mktime(dateObj.timetuple())
@ -201,7 +282,7 @@ class TwisterScraper:
return all_posts[index:] return all_posts[index:]
def _fetch_user_details(self, username): def _fetch_user_details(self, username):
user = User() user = User(self.locService)
user.username = username user.username = username
avatarData = self.twister.dhtget(username, "avatar", "s") avatarData = self.twister.dhtget(username, "avatar", "s")

Loading…
Cancel
Save