|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
from http.client import HTTPException
|
|
|
|
from urllib.parse import urlencode
|
|
|
|
from urllib.request import urlopen, Request
|
|
|
|
import datetime
|
|
|
|
import json
|
|
|
|
import pickle
|
|
|
|
import time
|
|
|
|
import sys
|
|
|
|
|
|
|
|
from os.path import expanduser, exists
|
|
|
|
|
|
|
|
|
|
|
|
# Seconds before cached data is considered stale (appears unused in this
# file — TODO confirm against other consumers before removing).
cacheTimeout = 24 * 3600

try:
    from bitcoinrpc.authproxy import AuthServiceProxy
except ImportError:
    # python-bitcoinrpc is required to talk to the twister daemon; bail out
    # with an actionable message.  (Dropped the unused `as exc` binding.)
    sys.stderr.write("Error: install python-bitcoinrpc (https://github.com/jgarzik/python-bitcoinrpc)\n")
    sys.exit(-1)
|
|
|
|
|
|
|
|
|
|
|
|
class MaxGeoRequestsException(Exception):
    """Raised when the hourly geolocation lookup quota is exhausted.

    Carries ``lastReset``, the datetime of the last quota-counter reset,
    so callers can tell when the current window began.
    """

    def __init__(self, since):
        # Fix: the original called super(Exception, self).__init__(), which
        # starts the MRO walk *above* Exception and skips its initializer.
        super(MaxGeoRequestsException, self).__init__()
        self.lastReset = since  # datetime of the last counter reset
        # Echo the condition immediately (original behavior).
        print(self.__str__())

    def __str__(self):
        return "Reached max amounts of requests per hour ({} since {})".format(GeoLocationService.MAXREQUESTS,
                                                                               self.lastReset.isoformat())
|
|
|
|
|
|
|
|
|
|
|
|
class Borg:
    """Alex Martelli's Borg pattern: every instance aliases one class-wide
    state dict, so all instances behave as one object without a singleton.
    """

    _shared_state = {}

    def __init__(self):
        # Point this instance's attribute dict at the shared mapping;
        # any attribute set on one Borg is visible on all of them.
        self.__dict__ = Borg._shared_state
|
|
|
|
|
|
|
|
|
|
|
|
class GeoLocationService(Borg):
    """Borg-shared service resolving free-text locations to coordinates via
    OpenStreetMap's Nominatim API, with an on-disk pickle cache and an
    hourly request throttle.
    """

    MAXREQUESTS = 60 * 60  # 1 req per second -> at most 3600 lookups/hour
    CACHEFILE = expanduser('~/.twister/_localusers_geolocation.db')
    NOMINATIM_URL = "http://nominatim.openstreetmap.org/search?format=jsonv2&{query}"

    def __init__(self):
        super(GeoLocationService, self).__init__()
        if len(self.__dict__) == 0:  # set up only if it's the first instance
            self.db = {}              # location string -> (lat, lon) tuple
            self._counter = 0         # lookups performed in current window
            self._lastCounterReset = None
            self._resetCounter()

            if exists(GeoLocationService.CACHEFILE):
                with open(GeoLocationService.CACHEFILE, 'rb') as gcache:
                    self.db = pickle.load(gcache)

    def _resetCounter(self):
        """Zero the request counter and stamp the reset time."""
        self._counter = 0
        self._lastCounterReset = datetime.datetime.now()

    def canWeAsk(self):
        """ Check if we can make a lookup.

        :return: boolean
        """
        if self._counter <= (GeoLocationService.MAXREQUESTS - 1):
            return True
        # Over quota: allow (and reset) once a full hour has elapsed.
        now = datetime.datetime.now()
        delta = now - self._lastCounterReset
        if delta.total_seconds() > (60 * 60):
            self._resetCounter()
            return True
        return False

    def locate(self, location):
        """
        Query the Nominatim API and cache the coordinates.
        (Docstring fix: this never used Google, and the cap is MAXREQUESTS.)

        :return: tuple of coordinates (lat, lon), or None when nothing found
        :raises: MaxGeoRequestsException when geolocation threshold has been reached
        """

        # if in cache, return that
        if location in self.db:
            # this harmonization is due to old data stored as dicts
            if isinstance(self.db[location], dict):
                coordTuple = (self.db[location]['lat'], self.db[location]['lng'])
                self.db[location] = coordTuple
            return self.db[location]

        # not in cache? ok, let's look it up
        if not self.canWeAsk():
            # sorry, can't do it now
            raise MaxGeoRequestsException(self._lastCounterReset)

        print("Looking up \"{}\"".format(location))
        loc = urlencode({'q': location})
        print(GeoLocationService.NOMINATIM_URL.format(query=loc))
        request = Request(GeoLocationService.NOMINATIM_URL.format(query=loc))
        request.add_header('User-Agent', 'Twister User-Mapper script http://static.pythonaro.com/twistmap/')
        # Fix: HTTPResponse has read(), not readall(); also close the
        # response deterministically via the context manager.
        with urlopen(request) as urldoc:
            self._counter += 1
            jsonText = urldoc.read().decode('utf-8')
        jsObj = json.loads(jsonText)
        if len(jsObj) > 0:
            coords = jsObj[0]['lat'], jsObj[0]['lon']
            # let's cache it and save db
            self.db[location] = coords
            self.saveDb()
            time.sleep(1)  # to follow nominatim usage policy: http://wiki.openstreetmap.org/wiki/Nominatim_usage_policy
            return coords

        # still here? it's all rubbish
        return None

    def saveDb(self):
        """ Save db to file """
        with open(GeoLocationService.CACHEFILE, 'wb') as gfile:
            pickle.dump(self.db, gfile)
|
|
|
|
|
|
|
|
|
|
|
|
class User:
    """A twister user's profile data plus lazily-resolved geolocation."""

    def __init__(self, locService=None):
        """
        :param locService: optional GeoLocationService to share.  Fix:
            call sites in this file do ``User(self.locService)``, which the
            old zero-argument signature rejected with a TypeError; the
            default keeps ``User()`` working as before.
        """
        self.locService = locService if locService is not None else GeoLocationService()
        self.username = ""
        self.avatar = ""
        self.fullname = ""
        self.location = ""   # free-text location from the profile
        self.coords = None   # coordinate tuple once located
        self.bio = ""
        self.url = ""
        self.updateTime = 0  # datetime of last profile refresh (0 = never)
        self.following = []

    def locate(self):
        """OO wrapper for GeoLocationService.locate(); caches the result.

        :return: coordinate tuple, or None when no location is set
        """
        # hasattr guards cope with instances unpickled from older versions
        # that may lack these attributes.
        if hasattr(self, 'location') and self.location == '':
            return None
        if hasattr(self, 'coords') and self.coords is not None:
            return self.coords

        if not hasattr(self, 'locService'):
            self.__dict__['locService'] = GeoLocationService()

        self.coords = self.locService.locate(self.location)
        return self.coords

    def __setstate__(self, data):
        """ Custom unpickling function to re-instantiate the location service

        :param data: dictionary passed by pickle.load()
        """
        self.__dict__ = data
        self.locService = GeoLocationService()

    def __getstate__(self):
        """ Custom pickler to drop references to the location service

        :return: dict containing the object state
        """
        self.locService = None
        return self.__dict__
|
|
|
|
|
|
|
|
|
|
|
|
class TwisterDb:
    """Pickle-friendly container for the scraper's persistent state."""

    def __init__(self):
        # Hash of the most recently scanned block; None before first scrape.
        self.lastBlockHash = None
        # Maps username -> User record.
        self.users = {}
|
|
|
|
|
|
|
class TwisterScraper:
    """Walks the twister block chain for usernames, fetches each user's
    profile/avatar from the DHT, geolocates them, and persists everything
    in a pickled TwisterDb.
    """

    CACHE_MAX_DURATION = datetime.timedelta(7)  # refresh profiles older than 7 days

    def __init__(self, dbPath, server='localhost', port=28332, user='user', password='pwd', protocol='http'):
        """Connect to a twister daemon and load (or create) the local db.

        :param dbPath: path of the pickled TwisterDb file
        """
        self.serverUrl = '{protocol}://{user}:{passwd}@{server}:{port}'.format(protocol=protocol,
                                                                               server=server,
                                                                               port=port,
                                                                               user=user,
                                                                               passwd=password)
        self.twister = AuthServiceProxy(self.serverUrl)
        self.dbFile = dbPath
        self.locService = GeoLocationService()

        try:
            with open(self.dbFile, 'rb') as dbFile:
                self.db = pickle.load(dbFile)
        except FileNotFoundError:
            self.db = TwisterDb()
            self.saveDb()

    def get_user(self, username):
        """Return the cached User record for username, or None if unknown."""
        return self.db.users.get(username)

    def scrape_users(self):
        """Walk the whole block chain, collect usernames, and (re)fetch
        profiles that are new or older than CACHE_MAX_DURATION.
        """
        nextHash = self.twister.getblockhash(0)  # start at the genesis block

        usernames = set()
        index = 0
        while True:
            block = self.twister.getblock(nextHash)
            self.db.lastBlockHash = block['hash']
            usernames = usernames.union(set(block['usernames']))
            if len(usernames) > index:
                index = len(usernames)
                print('Found {0} usernames'.format(index))
            if "nextblockhash" in block:
                nextHash = block["nextblockhash"]
            else:
                break

        if len(self.db.users) == 0:
            # first run: seed the db with stub users whose updateTime is
            # already stale so the refresh pass below fetches all of them.
            for u in usernames:
                blankUser = User()
                blankUser.username = u
                blankUser.updateTime = datetime.datetime.now() - self.CACHE_MAX_DURATION
                # fix: the original built the stub but never stored it
                self.db.users[u] = blankUser
            self.saveDb()

        now = datetime.datetime.now()
        old_users = self.db.users.keys()
        need_refresh = [u for u in old_users if (self.db.users[u].updateTime + self.CACHE_MAX_DURATION) < now]
        new_users = usernames.difference(set(old_users))
        to_fetch = new_users.union(set(need_refresh))

        total_to_fetch = len(to_fetch)
        for n, u in enumerate(to_fetch):
            try:
                user = self._fetch_user_details(u)
                # fix: the original inspected/located the username string
                # `u` instead of the fetched User object
                if hasattr(user, 'location'):
                    try:
                        user.locate()
                    except MaxGeoRequestsException:
                        # fix: the original printed an empty '' placeholder
                        print("Could not locate '{0}' because of max request limit reached".format(user.location))
                self.db.users[user.username] = user
                if n % 5 == 0:
                    self.saveDb()
                print("({line} of {total}) Fetched {user} ...".format(user=u, line=n, total=total_to_fetch))
            except HTTPException as e:
                print("Connection error retrieving user {0}: {1}".format(u, str(e)))

    def saveDb(self):
        """ Save db to file """
        print("Saving db")
        dbFile = None  # fix: unbound if open() itself raised below
        try:
            with open(self.dbFile, 'wb') as dbFile:
                pickle.dump(self.db, dbFile)
        except (KeyboardInterrupt, Exception):
            print("Closing db before quitting...")
            if dbFile:
                # close the hung descriptor and re-try the dumping
                try:
                    dbFile.close()
                except Exception:
                    pass
                with open(self.dbFile, 'wb') as dbFile:
                    pickle.dump(self.db, dbFile)
            # once clean, re-raise
            raise

    def get_posts_since(self, username, dateObj, maxNum=1000):
        """Return username's posts at/after dateObj, oldest first.

        :param maxNum: max posts to request (fix: the original ignored it
            and hard-coded 1000; the default preserves old behavior)
        """
        since_epoch = time.mktime(dateObj.timetuple())
        all_posts = self.twister.getposts(maxNum, [{'username': username}])
        all_posts = sorted(all_posts, key=lambda x: x['userpost']['time'])

        # fix: the original hand-rolled search looped on
        # `0 > index > len(all_posts)` (always false), so it just returned
        # from the midpoint; do a standard lower-bound binary search.
        lo, hi = 0, len(all_posts)
        while lo < hi:
            mid = (lo + hi) // 2
            if all_posts[mid]['userpost']['time'] < since_epoch:
                lo = mid + 1
            else:
                hi = mid
        return all_posts[lo:]

    def _fetch_user_details(self, username):
        """Build a User from the DHT avatar/profile records + follow list."""
        # fix: the original passed self.locService, but User.__init__ takes
        # no arguments and raised TypeError
        user = User()
        user.username = username

        avatarData = self.twister.dhtget(username, "avatar", "s")
        if len(avatarData) == 1:
            if 'p' in avatarData[0]:
                if 'v' in avatarData[0]['p']:
                    user.avatar = avatarData[0]['p']['v']

        profileData = self.twister.dhtget(username, 'profile', 's')
        if len(profileData) == 1:
            if 'p' in profileData[0]:
                if 'v' in profileData[0]['p']:
                    profile = profileData[0]['p']['v']
                    for key in ['location', 'url', 'bio', 'fullname']:
                        if key in profile:
                            setattr(user, key, profile[key])

        user.following = self.twister.getfollowing(username)

        user.updateTime = datetime.datetime.now()
        return user
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Scrape the local twister node and report how many users we know about.
    scraper = TwisterScraper(expanduser('~/.twister/_localusersdb'), 'localhost')
    scraper.scrape_users()
    print("Total users in db: {0}".format(len(scraper.db.users)))
|
|
|
|
|