
extracted template, implemented GeoLocationService with OpenStreetMap

commit 46ef9c8f47 on master, by toyg, 11 years ago

Changed files:
  1. Twistmapper.py   (12 lines changed)
  2. map.html         (4 lines changed)
  3. twistscraper.py  (159 lines changed)
Twistmapper.py (12 changed lines)

@@ -11,12 +11,12 @@ from twistscraper import TwisterScraper
 TEMPLATE = None
 with open("map.html", "rb") as mapTemplate:
-    TEMPLATE = Template(mapTemplate.read())
+    TEMPLATE = Template(mapTemplate.read().decode('utf-8'))


 def generate_map(userdb):
     ts = TwisterScraper(userdb)
-    loc_users = [u for u in ts.db.users.values() if u.location != '']
+    loc_users = [u for u in ts.db.users.values() if hasattr(u, 'location') and u.location != '']
     noLoc_user_num = len(ts.db.users) - len(loc_users)
     loc_users_fake_num = 0
     locDb = {}
@@ -32,7 +32,7 @@ def generate_map(userdb):
             locDb[u.location]['users'] = [u.username]
         else:
             loc_users_fake_num += 1
-        # second pass to aggregate misspellings
+    # second pass to aggregate misspellings
     done = []
     newLocDb = {}
     for loc, locDict in locDb.items():
@@ -48,7 +48,7 @@ def generate_map(userdb):
         # find the most popular name
         locMax = max(sameCoord, key=lambda x: len(x[1]))
         location = locMax[0]
-        coordHash = '/'.join([str(locDict['coordinates']['lat']), str(locDict['coordinates']['lng'])])
+        coordHash = '/'.join([str(locDict['coordinates'][0]), str(locDict['coordinates'][1])])

         # if we haven't seen this set of coordinates yet...
         if coordHash not in done:
@@ -70,8 +70,8 @@ def generate_map(userdb):
     for k in newLocDb.keys():
         locStrings.append("['<h4>{name} - {numusers}</h4><small>{users}</small>', {lat}, {lng}]".format(
             name=k.replace("'", "&apos;"),
-            lat=newLocDb[k]['coordinates']['lat'],
-            lng=newLocDb[k]['coordinates']['lng'],
+            lat=newLocDb[k]['coordinates'][0],
+            lng=newLocDb[k]['coordinates'][1],
             users=',<br />'.join(newLocDb[k]['users']),
             numusers=len(newLocDb[k]['users'])))
     locStrings.sort()
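
Note: the coordinate payload changes shape in this commit, from a {'lat': ..., 'lng': ...} dict to a (lat, lng) tuple, which is why Twistmapper.py now indexes ['coordinates'][0] and ['coordinates'][1]. A minimal sketch of that conversion, mirroring the cache harmonization done in twistscraper.py (the helper name is illustrative only):

    # hypothetical helper showing the old-dict -> new-tuple normalisation
    def harmonise(entry):
        if isinstance(entry, dict):  # old cached format: {'lat': ..., 'lng': ...}
            return (entry['lat'], entry['lng'])
        return entry                 # already a (lat, lng) tuple

    print(harmonise({'lat': '51.50', 'lng': '-0.12'}))  # ('51.50', '-0.12')
    print(harmonise(('51.50', '-0.12')))                # unchanged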

map.html (4 changed lines)

@@ -30,7 +30,9 @@
     <li>Users with unrealistic location: $users_fake_loc</li>
     <li>Users without location: $users_no_loc</li>
 </ul>
-<div id="bottom">For any feedback, ping @toyg on Twister.</div>
+<div id="bottom">For any feedback, ping @toyg on Twister. <br/>Location coordinates provided by <a
+        href="http://nominatim.openstreetmap.org">OpenStreetMap Nominatim</a>.
+</div>
 <script type="text/javascript">
     // Define your locations: HTML content for the info window, latitude, longitude
     var locations = [$locations];

twistscraper.py (159 changed lines)

@@ -1,17 +1,16 @@
 # -*- coding: utf-8 -*-
-import json
 from http.client import HTTPException
 from urllib.parse import urlencode
-from urllib.request import urlopen
-from genericpath import exists
-from os.path import expanduser

 __author__ = 'Giacomo Lacava'

-import time, datetime
+from urllib.request import urlopen, Request
+
+import datetime
+import json
 import pickle
+import time
 import sys
+from os.path import expanduser, exists

 cacheTimeout = 24 * 3600

 try:
@@ -21,43 +20,102 @@ except ImportError as exc:
     sys.exit(-1)


-class GeoLocationService:
+class MaxGeoRequestsException(Exception):
+    def __init__(self, since):
+        super(Exception, self).__init__()
+        self.lastReset = since
+        print(self.__str__())
+
+    def __str__(self):
+        return "Reached max amount of requests per hour ({} since {})".format(GeoLocationService.MAXREQUESTS,
+                                                                              self.lastReset.isoformat())
+
+
+class Borg:
+    _shared_state = {}
+
+    def __init__(self):
+        self.__dict__ = self._shared_state
+
+
+class GeoLocationService(Borg):
+    MAXREQUESTS = 60 * 60  # 1 req per second
     CACHEFILE = expanduser('~/.twister/_localusers_geolocation.db')
-    _GMAP_URL = "https://maps.googleapis.com/maps/api/geocode/json?sensor=false&{query}"
+    NOMINATIM_URL = "http://nominatim.openstreetmap.org/search?format=jsonv2&{query}"

     def __init__(self):
-        self.db = {}
-        if exists(GeoLocationService.CACHEFILE):
-            with open(GeoLocationService.CACHEFILE, 'rb') as gcache:
-                self.db = pickle.load(gcache)
+        super(GeoLocationService, self).__init__()
+        if len(self.__dict__) == 0:  # set up only if it's the first instance
+            self.db = {}
+            self._counter = 0
+            self._lastCounterReset = None
+            self._resetCounter()
+            if exists(GeoLocationService.CACHEFILE):
+                with open(GeoLocationService.CACHEFILE, 'rb') as gcache:
+                    self.db = pickle.load(gcache)
+
+    def _resetCounter(self):
+        self._counter = 0
+        self._lastCounterReset = datetime.datetime.now()
+
+    def canWeAsk(self):
+        """ Check if we can make a lookup.
+        :return: boolean
+        """
+        if self._counter <= (GeoLocationService.MAXREQUESTS - 1):
+            return True
+        now = datetime.datetime.now()
+        delta = now - self._lastCounterReset
+        if delta.total_seconds() > (60 * 60):
+            self._resetCounter()
+            return True
+        return False

     def locate(self, location):
         """
-        Query Google API and save coordinates. Should work until we start having more than 50 new locatable
-        users per hour.
-        :return: dict with coordinates { 'lat':12345, 'lng':13245 }
+        Query the OpenStreetMap Nominatim API and save coordinates. Rate-limited to one
+        request per second (MAXREQUESTS per hour).
+        :return: tuple with coordinates (lat, lon)
+        :raises: MaxGeoRequestsException when geolocation threshold has been reached
         """
         # if in cache, return that
         if location in self.db:
+            # this harmonization is due to old data
+            if type(self.db[location]) == dict:
+                coordTuple = (self.db[location]['lat'], self.db[location]['lng'])
+                self.db[location] = coordTuple
             return self.db[location]

-        # ok, let's look it up
-        loc = urlencode({'address': location})
-        urldoc = urlopen(GeoLocationService._GMAP_URL.format(query=loc))
-        jsObj = json.loads(urldoc.readall().decode('utf-8'))
-        if len(jsObj['results']) > 0:
-            # discard commercial results
-            locTypes = jsObj['results'][0]['address_components'][0]['types']
-            if not 'premise' in locTypes and not 'route' in locTypes and not 'establishment' in locTypes and not 'subpremise' in locTypes:
-                coords = jsObj['results'][0]['geometry']['location']
-                # let's cache it and save db
-                self.db[location] = coords
-                self.saveDb()
-                return coords
-        # still here? it's all rubbish
+        # not in cache? ok, let's look it up
+        if not self.canWeAsk():
+            # sorry, can't do it now
+            raise MaxGeoRequestsException(self._lastCounterReset)
+        print("Looking up \"{}\"".format(location))
+        loc = urlencode({'q': location})
+        print(GeoLocationService.NOMINATIM_URL.format(query=loc))
+        request = Request(GeoLocationService.NOMINATIM_URL.format(query=loc))
+        request.add_header('User-Agent', 'Twister User-Mapper script http://static.pythonaro.com/twistmap/')
+        urldoc = urlopen(request)
+        self._counter += 1
+        jsonText = urldoc.read().decode('utf-8')
+        jsObj = json.loads(jsonText)
+        if len(jsObj) > 0:
+            coords = jsObj[0]['lat'], jsObj[0]['lon']
+            # let's cache it and save db
+            self.db[location] = coords
+            self.saveDb()
+            time.sleep(1)  # to follow nominatim usage policy: http://wiki.openstreetmap.org/wiki/Nominatim_usage_policy
+            return coords
+        # still here? it's all rubbish
         return None

     def saveDb(self):
         """ Save db to file """
         with open(GeoLocationService.CACHEFILE, 'wb') as gfile:
             pickle.dump(self.db, gfile)
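
The new GeoLocationService inherits from Borg, so every instance shares a single __dict__: the cache, the request counter, and the reset timestamp are effectively process-wide, no matter where the service is instantiated. A self-contained sketch of the pattern in isolation:

    class Borg:
        _shared_state = {}

        def __init__(self):
            # every instance rebinds its __dict__ to the same shared dict
            self.__dict__ = self._shared_state

    a = Borg()
    b = Borg()
    a.counter = 1
    print(b.counter)  # 1 -- distinct objects, shared state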
@@ -78,14 +136,31 @@ class User:
     def locate(self):
         # OO wrapper for GeoLocationService.locate()
-        if self.location == '':
+        if not hasattr(self, 'location') or self.location == '':
             return None
-        if self.coords is not None:
+        if hasattr(self, 'coords') and self.coords is not None:
             return self.coords
+        if not hasattr(self, 'locService'):
+            self.__dict__['locService'] = GeoLocationService()
         self.coords = self.locService.locate(self.location)
         return self.coords

+    def __setstate__(self, data):
+        """ Custom unpickling function to re-instantiate the location service
+        :param data: dictionary passed by pickle.load()
+        """
+        self.__dict__ = data
+        self.locService = GeoLocationService()
+
+    def __getstate__(self):
+        """ Custom pickler to drop references to the location service
+        :return: dict containing the object state
+        """
+        self.locService = None
+        return self.__dict__
+

 class TwisterDb:
     def __init__(self):
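
The __getstate__/__setstate__ pair above keeps the shared service out of the pickle and re-attaches it on load. A self-contained sketch of that round-trip, with a stand-in Service class instead of GeoLocationService:

    import pickle

    class Service:  # stand-in for GeoLocationService
        pass

    class User:
        def __init__(self):
            self.username = 'alice'
            self.locService = Service()

        def __getstate__(self):
            self.locService = None  # drop the service before pickling
            return self.__dict__

        def __setstate__(self, data):
            self.__dict__ = data
            self.locService = Service()  # re-attach a fresh service on load

    u = pickle.loads(pickle.dumps(User()))
    print(u.username, type(u.locService).__name__)  # alice Service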
@@ -104,6 +179,7 @@ class TwisterScraper:
                                           passwd=password)
         self.twister = AuthServiceProxy(self.serverUrl)
         self.dbFile = dbPath
+        self.locService = GeoLocationService()
         try:
             with open(self.dbFile, 'rb') as dbFile:
@@ -120,9 +196,6 @@ class TwisterScraper:
     def scrape_users(self):
         nextHash = 0
-        #if self.db.lastBlockHash is not None and len(self.db.users) != 0:
-        #    nextHash = self.db.lastBlockHash
-        #else:
         nextHash = self.twister.getblockhash(0)

         usernames = set()
@@ -142,7 +215,7 @@ class TwisterScraper:
         if len(self.db.users) == 0:
             # first run
             for u in usernames:
-                blankUser = User()
+                blankUser = User(self.locService)
                 blankUser.username = u
                 blankUser.updateTime = datetime.datetime.now() - self.CACHE_MAX_DURATION
             self.saveDb()
@@ -157,13 +230,20 @@ class TwisterScraper:
         for n, u in enumerate(to_fetch):
             try:
                 user = self._fetch_user_details(u)
+                if hasattr(user, 'location'):
+                    try:
+                        user.locate()
+                    except MaxGeoRequestsException:
+                        print("Could not locate '{}' because max request limit was reached".format(u))
                 self.db.users[user.username] = user
-                self.saveDb()
+                if n % 5 == 0:
+                    self.saveDb()
                 print("({line} of {total}) Fetched {user} ...".format(user=u, line=n, total=total_to_fetch))
             except HTTPException as e:
                 print("Connection error retrieving user {0}: {1}".format(u, str(e)))

     def saveDb(self):
+        print("Saving db")
         try:
             with open(self.dbFile, 'wb') as dbFile:
                 pickle.dump(self.db, dbFile)
@@ -177,7 +257,8 @@ class TwisterScraper:
             pass
         with open(self.dbFile, 'wb') as dbFile:
             pickle.dump(self.db, dbFile)
+        # once clean, re-raise
+        raise

     def get_posts_since(self, username, dateObj, maxNum=1000):
         since_epoch = time.mktime(dateObj.timetuple())
@@ -201,7 +282,7 @@ class TwisterScraper:
         return all_posts[index:]

     def _fetch_user_details(self, username):
-        user = User()
+        user = User(self.locService)
         user.username = username

         avatarData = self.twister.dhtget(username, "avatar", "s")
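
Taken together, callers are now expected to handle the rate-limit exception that locate() can raise. A hedged usage sketch (assuming twistscraper.py is importable from the working directory):

    from twistscraper import GeoLocationService, MaxGeoRequestsException

    geo = GeoLocationService()  # Borg: shares cache/counter with every other instance
    try:
        coords = geo.locate('Berlin')  # (lat, lon) tuple, or None if nothing matched
        print(coords)
    except MaxGeoRequestsException:
        print('Hourly Nominatim request budget exhausted; try again later.')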
