# -*- coding: utf-8 -*-
"""Build an HTML map of Twister users from the locations in their profiles."""
from datetime import datetime
from html import escape
from string import Template

from os.path import expanduser


__author__ = 'Giacomo Lacava'

from twistscraper import TwisterScraper

# The page skeleton is read as text: string.Template substitutes with a str
# regular expression, so a template loaded as bytes fails inside substitute().
TEMPLATE = None
with open("map.html", "r", encoding="utf-8") as mapTemplate:
    TEMPLATE = Template(mapTemplate.read())


def _js_safe(text):
    """Escape profile text for HTML markup held in a single-quoted JS string."""
    # escape() turns & < > " ' into entities, so the text can neither close
    # the JS string nor inject markup. Backslashes and line breaks would still
    # corrupt the string literal, so they are neutralised separately.
    cleaned = escape(text, quote=True).replace('\\', '\\\\')
    return ' '.join(cleaned.splitlines())


def _locate_users(users):
    """
    Group usernames by their raw location string.

    :param users: iterable of user objects with a non-empty ``location``
    :return: tuple ``(located, unresolved)``; ``located`` maps each location
             string to ``{'coordinates': ..., 'users': [usernames]}`` and
             ``unresolved`` counts the users whose ``locate()`` returned None
    """
    located = {}
    unresolved = 0
    for user in users:
        known = located.get(user.location)
        if known is not None:
            # location string already resolved: no new lookup needed
            known['users'].append(user.username)
            continue
        coords = user.locate()
        if coords is None:
            unresolved += 1
        else:
            located[user.location] = {'coordinates': coords,
                                      'users': [user.username]}
    return located, unresolved


def _merge_same_coordinates(located):
    """
    Fold location strings that resolved to the same coordinates into one entry.

    Different spellings of a place end up on the same point; the merged entry
    is filed under the spelling used by most users and lists all their
    usernames in sorted order. Entries with unique coordinates are kept as is.

    :param located: dict as produced by ``_locate_users``
    :return: new dict with one entry per distinct coordinate pair
    """
    by_point = {}
    for name, entry in located.items():
        point = (entry['coordinates']['lat'], entry['coordinates']['lng'])
        by_point.setdefault(point, []).append((name, entry))

    merged = {}
    for members in by_point.values():
        if len(members) == 1:
            name, entry = members[0]
            merged[name] = entry
            continue
        # max() keeps the first of equally popular spellings
        name, entry = max(members, key=lambda member: len(member[1]['users']))
        usernames = sorted(u for _, other in members for u in other['users'])
        merged[name] = {'coordinates': entry['coordinates'],
                        'users': usernames}
    return merged


def _format_location(name, entry):
    """Render one location as a JS array literal ``['<html>', lat, lng]``."""
    # NOTE(review): the tag names in this row markup are a reconstruction
    # (the markup was not legible in the reviewed copy) - confirm that they
    # match what map.html expects before merging.
    return "['<h4>{name} - {numusers}</h4><small>{users}</small>', {lat}, {lng}]".format(
        name=_js_safe(name),
        lat=entry['coordinates']['lat'],
        lng=entry['coordinates']['lng'],
        users=',<br />'.join(_js_safe(u) for u in entry['users']),
        numusers=len(entry['users']))


def generate_map(userdb):
    """
    Produce the map page for the users stored in a scraper database.

    :param userdb: path of the pickled user database, passed to TwisterScraper
    :return: the filled-in template as a str; the template receives the
             placeholders ``locations``, ``users_real_loc``, ``users_fake_loc``,
             ``users_no_loc`` and ``timestamp``
    """
    ts = TwisterScraper(userdb)
    loc_users = [u for u in ts.db.users.values() if u.location != '']
    noLoc_user_num = len(ts.db.users) - len(loc_users)

    located, loc_users_fake_num = _locate_users(loc_users)
    # second pass to aggregate misspellings (same coordinates, different text)
    merged = _merge_same_coordinates(located)

    locStrings = sorted(_format_location(name, entry)
                        for name, entry in merged.items())
    return TEMPLATE.substitute(locations=',\n'.join(locStrings),
                               users_real_loc=len(loc_users),
                               users_fake_loc=loc_users_fake_num,
                               users_no_loc=noLoc_user_num,
                               timestamp=datetime.now().isoformat())


if __name__ == '__main__':
    html = generate_map(expanduser('~/.twister/_localusersdb'))
    with open(expanduser('~/twistermap.html'), 'wb') as tmf:
        tmf.write(html.encode('utf-8'))

Map of Twister Users

+ +

+ (as self-reported in profile) +

+
+ +
For any feedback, ping @toyg on Twister.
# -*- coding: utf-8 -*-
"""Bot threads that announce new GitHub commits of the Twister repos on Twister."""
import pickle
from operator import attrgetter
from threading import Lock, Thread
from time import sleep
from os.path import expanduser

import feedparser

from twistscraper import TwisterScraper

__author__ = 'Giacomo Lacava'

GITHUB_REPO_URL = 'https://github.com/{user}/{repo}'
GITHUB_COMMIT_FEED_TEMPLATE = GITHUB_REPO_URL + '/commits/master.atom'

CORE_COMMIT_FEED = GITHUB_COMMIT_FEED_TEMPLATE.format(user='miguelfreitas', repo='twister-core')
HTML_COMMIT_FEED = GITHUB_COMMIT_FEED_TEMPLATE.format(user='miguelfreitas', repo='twister-html')
SEED_COMMIT_FEED = GITHUB_COMMIT_FEED_TEMPLATE.format(user='miguelfreitas', repo='twister-seeder')
CORE_REPO_URL = GITHUB_REPO_URL.format(user='miguelfreitas', repo='twister-core')
HTML_REPO_URL = GITHUB_REPO_URL.format(user='miguelfreitas', repo='twister-html')
SEED_REPO_URL = GITHUB_REPO_URL.format(user='miguelfreitas', repo='twister-seeder')


class TwisterMonitor(Thread):
    """
    Thread that polls one commit feed and posts every unseen entry.

    The ids of posted entries are remembered, per feed URL, in a pickle file
    that all monitor instances share.
    """
    MESSAGE = 'Twister update: {msg} - Pull it now: {url}'
    # A post has to stay shorter than 140 characters.
    MAX_MESSAGE_LENGTH = 139

    # All monitors write the same cache file and post through the same
    # account, so both operations are serialised across threads.
    _cache_lock = Lock()
    _post_lock = Lock()

    def __init__(self, twister_monitor, username, repo_feed=CORE_COMMIT_FEED, repo_url=CORE_REPO_URL):
        """
        :param twister_monitor: object whose ``twister`` attribute is the RPC
                                proxy used for ``getposts`` / ``newpostmsg``
        :param username: Twister account the updates are posted from
        :param repo_feed: URL of the commit feed to watch
        :param repo_url: repository URL advertised in every post
        """
        Thread.__init__(self)
        self.ts = twister_monitor
        self.cacheFile = expanduser('~/.twister/_twm_cache')
        self.cache = {}
        self.username = username
        self.feed = repo_feed
        self.repo = repo_url
        self.loadCache()

    def _read_cache_file(self):
        """Return the cache stored on disk, or an empty dict if there is none."""
        # NOTE: pickle executes code embedded in the file; the cache must only
        # ever be written by this program.
        try:
            with open(self.cacheFile, 'rb') as f:
                return pickle.load(f)
        except FileNotFoundError:
            return {}

    def loadCache(self):
        """Replace the in-memory cache with the content of the cache file."""
        with TwisterMonitor._cache_lock:
            self.cache = self._read_cache_file()

    @classmethod
    def _build_message(cls, title, url):
        """
        Format the post text, shortening the title when the post is too long.

        :param title: commit title taken from the feed entry
        :param url: repository URL to advertise
        :return: the message; when shortening is needed the title is cut and
                 suffixed with '...' so the message is MAX_MESSAGE_LENGTH long
                 (longer only if the fixed text and URL alone exceed it)
        """
        message = cls.MESSAGE.format(msg=title, url=url)
        if len(message) <= cls.MAX_MESSAGE_LENGTH:
            return message
        fixed_part = len(message) - len(title)
        # room left for title characters once the ellipsis is accounted for;
        # clamped so an over-long URL cannot make the shortening loop forever
        room = max(cls.MAX_MESSAGE_LENGTH - fixed_part - len('...'), 0)
        return cls.MESSAGE.format(msg=title[:room] + '...', url=url)

    def _post(self, message):
        """Publish ``message`` under the key following the account's last post."""
        # The key lookup and the post must not interleave with another
        # monitor thread, or both would pick the same key.
        with TwisterMonitor._post_lock:
            print("Checking last post key...")
            key = 1
            lastpost = self.ts.twister.getposts(1, [{"username": self.username}])
            if len(lastpost) == 1:
                key = lastpost[0]['userpost']['k'] + 1
            print("Posting '{0}' with key {1}...".format(message, key))
            self.ts.twister.newpostmsg(self.username, key, message)
            print("Posted!")

    def get_commits(self):
        """
        Fetch the feed and post every entry that has not been posted yet.

        Entries are handled oldest first; after each post the cache is saved
        and the thread pauses for ten minutes.

        :raises Exception: when feedparser flags the feed as malformed
        """
        print("Fetching {0}".format(self.feed))
        f = feedparser.parse(self.feed)
        if f['bozo'] == 1:
            # 'status' is absent when no HTTP response was received, so it is
            # read with get() to keep the real error visible
            raise Exception('Bad feed! Status: {status} - Error {err}'.format(
                status=f.get('status', 'n/a'), err=f.get('bozo_exception')))

        seen = self.cache.setdefault(self.feed, [])

        f.entries.sort(key=attrgetter('updated_parsed'))
        for entry in f.entries:
            print("Checking {0}".format(entry.id))
            if entry.id in seen:
                continue
            self._post(self._build_message(entry.title, self.repo))
            seen.append(entry.id)
            self.saveCache()
            sleep(10 * 60)

    def saveCache(self):
        """Write this monitor's feed entries into the shared cache file."""
        with TwisterMonitor._cache_lock:
            # Start from what is on disk: other monitors keep their own feeds
            # in the same file and dumping a private copy would erase them.
            merged = self._read_cache_file()
            merged[self.feed] = self.cache.get(self.feed, [])
            with open(self.cacheFile, 'wb') as f:
                pickle.dump(merged, f)
            self.cache = merged

    def run(self):
        """Poll the feed forever, once per hour, reporting errors to stdout."""
        while True:
            try:
                self.get_commits()
            except Exception as e:
                # top-level boundary of the thread: report and retry next cycle
                print("Exception following!")
                print(e)
            sleep(60 * 60)  # in seconds


if __name__ == '__main__':
    botID = 'twmonitor'
    ts = TwisterScraper(expanduser('~/.twister/_localusersdb'))
    monitor = TwisterMonitor(ts, botID, CORE_COMMIT_FEED, CORE_REPO_URL)
    monitor.start()
    sleep(4 * 60)
    monitor_ui = TwisterMonitor(ts, botID, HTML_COMMIT_FEED, HTML_REPO_URL)
    monitor_ui.start()
    sleep(6 * 60)
    monitor_seed = TwisterMonitor(ts, botID, SEED_COMMIT_FEED, SEED_REPO_URL)
    monitor_seed.start()
# -*- coding: utf-8 -*-
"""Scrape Twister user profiles over RPC into a local pickled database."""
import datetime
import json
import pickle
import sys
import time
from bisect import bisect_left
from http.client import HTTPException
from urllib.parse import urlencode
from urllib.request import urlopen
from os.path import expanduser

__author__ = 'Giacomo Lacava'

cacheTimeout = 24 * 3600

try:
    from bitcoinrpc.authproxy import AuthServiceProxy
except ImportError as exc:
    sys.stderr.write("Error: install python-bitcoinrpc (https://github.com/jgarzik/python-bitcoinrpc)\n")
    sys.exit(-1)


class User:
    """Profile data of one Twister user, as stored in the local database."""
    # Class-level defaults are kept so that instances unpickled from an older
    # database, which may lack some attributes, still resolve every field.
    username = ""
    avatar = ""
    fullname = ""
    location = ""
    coords = None
    bio = ""
    url = ""
    updateTime = 0
    following = []

    _GMAP_URL = "https://maps.googleapis.com/maps/api/geocode/json?sensor=false&{query}"
    # Result types that point at a building or street rather than a place.
    _REJECTED_TYPES = frozenset(('premise', 'route', 'establishment', 'subpremise'))

    def __init__(self):
        # give every instance its own list instead of the shared class default
        self.following = []

    def locate(self):
        """
        Query Google API and save coordinates. Should work until we start having more than 50 new locatable
        users per hour.
        :return: dict with coordinates { 'lat':12345, 'lng':13245 }, or None
                 when the location is empty, unknown or not a usable place
        """
        if self.location == '':
            return None
        if self.coords is not None:
            return self.coords

        loc = urlencode({'address': self.location})
        # read() replaces readall(), which HTTPResponse no longer provides;
        # the with-block closes the connection
        with urlopen(User._GMAP_URL.format(query=loc)) as urldoc:
            jsObj = json.loads(urldoc.read().decode('utf-8'))

        results = jsObj.get('results') or []
        if results:
            # discard commercial results
            components = results[0].get('address_components') or []
            locTypes = components[0].get('types', []) if components else []
            if self._REJECTED_TYPES.isdisjoint(locTypes):
                self.coords = results[0]['geometry']['location']
                return self.coords
        # still here? it's all rubbish
        return None


class TwisterDb:
    """Picklable container: known users by name plus the last block visited."""

    def __init__(self):
        self.lastBlockHash = None
        self.users = {}


class TwisterScraper:
    """Collects user profiles through the RPC proxy and persists them."""
    CACHE_MAX_DURATION = datetime.timedelta(7)  # ([days [, seconds [,microseconds]]])

    def __init__(self, dbPath, server='localhost', port=28332, user='user', password='pwd', protocol='http'):
        """
        :param dbPath: path of the pickled TwisterDb; created when missing
        :param server: RPC host
        :param port: RPC port
        :param user: RPC user name
        :param password: RPC password (embedded in ``serverUrl``)
        :param protocol: URL scheme of the RPC endpoint
        """
        self.serverUrl = '{protocol}://{user}:{passwd}@{server}:{port}'.format(protocol=protocol,
                                                                               server=server,
                                                                               port=port,
                                                                               user=user,
                                                                               passwd=password)
        self.twister = AuthServiceProxy(self.serverUrl)
        self.dbFile = dbPath

        # NOTE: pickle executes code embedded in the file; dbPath must only
        # ever point at a database written by this program.
        try:
            with open(self.dbFile, 'rb') as dbFile:
                self.db = pickle.load(dbFile)
        except FileNotFoundError:
            self.db = TwisterDb()
            self.saveDb()

    def get_user(self, username):
        """Return the stored User called ``username``, or None if unknown."""
        return self.db.users.get(username)

    def scrape_users(self):
        """
        Walk the block chain for usernames and (re)fetch their profiles.

        Profiles are fetched for users not yet in the database and for users
        whose entry is older than CACHE_MAX_DURATION. The database is saved
        after every fetched user; a user whose fetch raises HTTPException is
        reported and skipped.
        """
        # The walk always starts at block 0 (resuming from lastBlockHash is
        # not implemented).
        nextHash = self.twister.getblockhash(0)

        usernames = set()
        index = 0
        while True:
            block = self.twister.getblock(nextHash)
            self.db.lastBlockHash = block['hash']
            usernames.update(block['usernames'])
            if len(usernames) > index:
                index = len(usernames)
                print('Found {0} usernames'.format(index))
            if "nextblockhash" not in block:
                break
            nextHash = block["nextblockhash"]

        now = datetime.datetime.now()
        if len(self.db.users) == 0:
            # first run: register every name with an already expired entry,
            # so that names whose fetch fails are still known to the database
            for u in usernames:
                blankUser = User()
                blankUser.username = u
                blankUser.updateTime = now - self.CACHE_MAX_DURATION
                self.db.users[u] = blankUser
            self.saveDb()

        old_users = set(self.db.users)
        # an entry is due once a full CACHE_MAX_DURATION has elapsed
        need_refresh = {u for u in old_users
                        if (self.db.users[u].updateTime + self.CACHE_MAX_DURATION) <= now}
        to_fetch = (usernames - old_users) | need_refresh

        total_to_fetch = len(to_fetch)
        for n, u in enumerate(to_fetch, start=1):
            try:
                user = self._fetch_user_details(u)
                self.db.users[user.username] = user
                self.saveDb()
                print("({line} of {total}) Fetched {user} ...".format(user=u, line=n, total=total_to_fetch))
            except HTTPException as e:
                print("Connection error retrieving user {0}: {1}".format(u, str(e)))

    def saveDb(self):
        """Pickle the whole database to ``dbFile``."""
        with open(self.dbFile, 'wb') as dbFile:
            pickle.dump(self.db, dbFile)

    def get_posts_since(self, username, dateObj, maxNum=1000):
        """
        Return a user's posts made at or after a given moment.

        :param username: author of the posts
        :param dateObj: datetime (local time) marking the earliest post wanted
        :param maxNum: number of posts requested from the RPC server
        :return: list of posts sorted by ascending time
        """
        since_epoch = time.mktime(dateObj.timetuple())
        all_posts = self.twister.getposts(maxNum, [{'username': username}])
        all_posts = sorted(all_posts, key=lambda x: x['userpost']['time'])
        # binary search for the first post that is not older than the cut-off
        post_times = [post['userpost']['time'] for post in all_posts]
        return all_posts[bisect_left(post_times, since_epoch):]

    def _dht_value(self, username, resource):
        """Return the single 'p'/'v' payload of a DHT resource, or None."""
        data = self.twister.dhtget(username, resource, "s")
        if len(data) != 1:
            return None
        return data[0].get('p', {}).get('v')

    def _fetch_user_details(self, username):
        """
        Build a fresh User from the avatar, profile and following list.

        :param username: name of the user to fetch
        :return: User stamped with the current time as ``updateTime``
        """
        user = User()
        user.username = username

        avatar = self._dht_value(username, "avatar")
        if avatar is not None:
            user.avatar = avatar

        profile = self._dht_value(username, 'profile')
        if isinstance(profile, dict):
            for key in ['location', 'url', 'bio', 'fullname']:
                if key in profile:
                    setattr(user, key, profile[key])

        user.following = self.twister.getfollowing(username)

        user.updateTime = datetime.datetime.now()
        return user


if __name__ == '__main__':
    ts = TwisterScraper(expanduser('~/.twister/_localusersdb'), 'localhost')
    ts.scrape_users()
    print("Total users in db: {0}".format(len(ts.db.users)))