Various Python scripts for Twister
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 

189 lines
6.6 KiB

# -*- coding: utf-8 -*-
import json
from http.client import HTTPException
from urllib.parse import urlencode
from urllib.request import urlopen
from os.path import expanduser
__author__ = 'Giacomo Lacava'
import time, datetime
import pickle
import sys
cacheTimeout = 24 * 3600
try:
    from bitcoinrpc.authproxy import AuthServiceProxy
except ImportError:
    # The JSON-RPC client is a hard requirement: tell the user where to get
    # it on stderr and abort with the same exit status as before.
    sys.stderr.write("Error: install python-bitcoinrpc (https://github.com/jgarzik/python-bitcoinrpc)\n")
    raise SystemExit(-1)
class User:
    """Profile data for one Twister user, as stored in the scraper database.

    The class-level attributes are defaults: an instance only carries its own
    value for a field once that field has been assigned. They are kept so
    that instances unpickled from an older database (``__init__`` is not run
    on unpickling) still resolve every attribute.
    """

    username = ""
    avatar = ""
    fullname = ""
    location = ""
    coords = None  # geocoding result cached by locate(), or None
    bio = ""
    url = ""
    updateTime = 0
    following = []  # fallback default only; __init__ gives each instance its own list

    _GMAP_URL = "https://maps.googleapis.com/maps/api/geocode/json?sensor=false&{query}"
    # Address-component types treated as "commercial"/too specific and discarded.
    _REJECTED_TYPES = frozenset(('premise', 'route', 'establishment', 'subpremise'))

    def __init__(self):
        # A mutable class attribute would be shared by every User; make sure
        # in-place changes to one user's list never leak into another's.
        self.following = []

    def locate(self):
        """
        Query Google API and save coordinates. Should work until we start having more than 50 new locatable
        users per hour.

        The result is cached in ``self.coords``; once set, no further request
        is made. Network errors raised by ``urlopen`` propagate to the caller.

        :return: dict with coordinates { 'lat':12345, 'lng':13245 }, or None
                 when the user has no location or the lookup yields nothing usable
        """
        if not self.location:
            return None
        if self.coords is not None:
            return self.coords

        query = urlencode({'address': self.location})
        # `with` closes the HTTP response; read() replaces readall(), which
        # current http.client responses do not provide.
        with urlopen(User._GMAP_URL.format(query=query)) as response:
            reply = json.loads(response.read().decode('utf-8'))

        found = User._pick_coords(reply)
        if found is not None:
            self.coords = found
        return found

    @staticmethod
    def _pick_coords(reply):
        """Return the location of the first geocoding result, or None.

        Only the first result is considered. It is rejected when the types of
        its first address component include any of ``_REJECTED_TYPES``, and
        also when the reply lacks the expected keys or is empty.
        """
        try:
            top = reply['results'][0]
            component_types = top['address_components'][0]['types']
            if User._REJECTED_TYPES.isdisjoint(component_types):
                return top['geometry']['location']
        except (KeyError, IndexError):
            # no results, or a result without the expected structure
            return None
        # still here? it's all rubbish
        return None
class TwisterDb:
    """Container for the state the scraper pickles to disk between runs."""

    def __init__(self):
        # lastBlockHash: hash of the last block visited while walking the chain
        # users: mapping of username -> user record, empty on a fresh database
        self.lastBlockHash, self.users = None, dict()
class TwisterScraper:
    """Collect Twister usernames and profiles over JSON-RPC into a pickled db.

    The database (a ``TwisterDb``) is loaded from ``dbPath`` on construction
    and rewritten by ``saveDb``.
    """

    # A stored profile at least this old is fetched again by scrape_users().
    CACHE_MAX_DURATION = datetime.timedelta(7)  # ([days [, seconds [,microseconds]]])

    def __init__(self, dbPath, server='localhost', port=28332, user='user', password='pwd', protocol='http'):
        """Connect to the Twister daemon and load (or create) the database.

        :param dbPath: path of the pickle file holding the ``TwisterDb``
        :param server: RPC host name
        :param port: RPC port
        :param user: RPC user name
        :param password: RPC password
        :param protocol: URL scheme used to reach the daemon
        """
        self.serverUrl = '{protocol}://{user}:{passwd}@{server}:{port}'.format(
            protocol=protocol, server=server, port=port, user=user, passwd=password)
        self.twister = AuthServiceProxy(self.serverUrl)
        self.dbFile = dbPath
        try:
            # NOTE(review): pickle.load can execute arbitrary code; only point
            # dbPath at files this program wrote itself.
            with open(self.dbFile, 'rb') as dbFile:
                self.db = pickle.load(dbFile)
        except FileNotFoundError:
            # first start: create an empty database file right away
            self.db = TwisterDb()
            self.saveDb()

    def get_user(self, username):
        """Return the stored record for ``username``, or None if unknown."""
        return self.db.users.get(username)

    def scrape_users(self):
        """Discover every username on the chain and refresh stale profiles.

        Profiles are fetched for usernames not yet in the database and for
        stored users whose ``updateTime`` is at least ``CACHE_MAX_DURATION``
        old. The database is saved after every successful fetch, so an
        interrupted run keeps what it already retrieved. A user whose fetch
        raises ``HTTPException`` is reported and skipped.
        """
        usernames = self._collect_usernames()
        now = datetime.datetime.now()

        if not self.db.users:
            # first run: register every name with a timestamp that is already
            # expired, so all of them are picked up by the refresh pass below
            # and a failed fetch still leaves the user known to the database
            expired = now - self.CACHE_MAX_DURATION
            for u in usernames:
                blankUser = User()
                blankUser.username = u
                blankUser.updateTime = expired
                self.db.users[u] = blankUser
            self.saveDb()

        known = set(self.db.users)
        # `<=` so that the placeholders stamped above count as expired
        need_refresh = {u for u in known
                        if self.db.users[u].updateTime + self.CACHE_MAX_DURATION <= now}
        to_fetch = (usernames - known) | need_refresh
        total_to_fetch = len(to_fetch)

        for n, u in enumerate(to_fetch, start=1):
            try:
                user = self._fetch_user_details(u)
            except HTTPException as e:
                print("Connection error retrieving user {0}: {1}".format(u, str(e)))
                continue
            self.db.users[user.username] = user
            self.saveDb()
            print("({line} of {total}) Fetched {user} ...".format(user=u, line=n, total=total_to_fetch))

    def _collect_usernames(self):
        """Walk the chain from block 0 and return the set of all usernames seen.

        Always starts at the first block; resuming from ``db.lastBlockHash``
        is not implemented. ``db.lastBlockHash`` is updated as blocks are read.
        """
        usernames = set()
        reported = 0
        nextHash = self.twister.getblockhash(0)
        while True:
            block = self.twister.getblock(nextHash)
            self.db.lastBlockHash = block['hash']
            usernames.update(block['usernames'])
            if len(usernames) > reported:
                # only print when this block actually added new names
                reported = len(usernames)
                print('Found {0} usernames'.format(reported))
            if "nextblockhash" not in block:
                break
            nextHash = block["nextblockhash"]
        return usernames

    def saveDb(self):
        """Pickle the whole database to ``self.dbFile``, replacing its content."""
        with open(self.dbFile, 'wb') as dbFile:
            pickle.dump(self.db, dbFile)

    def get_posts_since(self, username, dateObj, maxNum=1000):
        """Return ``username``'s posts made at or after ``dateObj``, oldest first.

        :param username: user whose posts are requested
        :param dateObj: datetime lower bound; converted with ``time.mktime``
                        (i.e. read as local time) and compared against each
                        post's ``userpost.time``
        :param maxNum: maximum number of posts requested from the daemon
        :return: list of post dicts sorted by ascending ``userpost.time``
        """
        since_epoch = time.mktime(dateObj.timetuple())
        all_posts = self.twister.getposts(maxNum, [{'username': username}])
        recent = [post for post in all_posts if post['userpost']['time'] >= since_epoch]
        recent.sort(key=lambda post: post['userpost']['time'])
        return recent

    def _dht_value(self, username, resource):
        """Return the ``p.v`` payload of a single-entry DHT reply, else None."""
        reply = self.twister.dhtget(username, resource, "s")
        if len(reply) == 1 and 'p' in reply[0] and 'v' in reply[0]['p']:
            return reply[0]['p']['v']
        return None

    def _fetch_user_details(self, username):
        """Build a fresh ``User`` for ``username`` from the daemon's data.

        Reads the "avatar" and "profile" DHT resources and the following
        list, and stamps the record with the current time. Fields missing
        from the replies keep the ``User`` defaults.
        """
        user = User()
        user.username = username

        avatar = self._dht_value(username, "avatar")
        if avatar is not None:
            user.avatar = avatar

        profile = self._dht_value(username, 'profile')
        if isinstance(profile, dict):
            for key in ('location', 'url', 'bio', 'fullname'):
                if key in profile:
                    setattr(user, key, profile[key])

        user.following = self.twister.getfollowing(username)
        user.updateTime = datetime.datetime.now()
        return user
if __name__ == '__main__':
    # Scrape into the database file under the user's home directory, then
    # report how many users it holds.
    db_path = expanduser('~/.twister/_localusersdb')
    scraper = TwisterScraper(db_path, 'localhost')
    scraper.scrape_users()
    user_count = len(scraper.db.users)
    print("Total users in db: {0}".format(user_count))