Browse Source

Initial checkin

master
toyg 11 years ago
parent
commit
014452c226
  1. 88
      Twistmapper.py
  2. 111
      map.html
  3. 100
      twistmonitor.py
  4. 189
      twistscraper.py

88
Twistmapper.py

@@ -0,0 +1,88 @@
# -*- coding: utf-8 -*-
from datetime import datetime
from string import Template
from os.path import expanduser
__author__ = 'Giacomo Lacava'
from twistscraper import TwisterScraper
# Load the Google Maps page template once at import time; generate_map()
# later fills in its $placeholders via string.Template.substitute().
TEMPLATE = None
# Open in text mode: string.Template needs a str pattern. With the original
# "rb" mode the template was bytes, and substitute() with str values raises
# TypeError under Python 3.
with open("map.html", "r", encoding="utf-8") as mapTemplate:
    TEMPLATE = Template(mapTemplate.read())
def generate_map(userdb):
    """Render the map.html template with one marker per distinct location.

    :param userdb: path to the pickled user database (passed to TwisterScraper)
    :return: the complete HTML page as a string
    """
    ts = TwisterScraper(userdb)
    # Users who filled in the free-text location field of their profile.
    loc_users = [u for u in ts.db.users.values() if u.location != '']
    noLoc_user_num = len(ts.db.users) - len(loc_users)
    loc_users_fake_num = 0  # counts locations Google could not geocode
    # location string -> {'coordinates': {'lat':.., 'lng':..}, 'users': [names]}
    locDb = {}
    for u in loc_users:
        if u.location in locDb:
            # Location string already geocoded in this run: just add the user.
            locDb[u.location]['users'].append(u.username)
        else:
            locData = u.locate()  # Google geocoding, cached on the User object
            if locData is not None:
                locDb[u.location] = {}
                locDb[u.location]['coordinates'] = locData
                locDb[u.location]['users'] = [u.username]
            else:
                loc_users_fake_num += 1
    # second pass to aggregate misspellings: different spellings that geocode
    # to the same point are merged under the most popular spelling
    done = []       # "lat/lng" hashes of coordinate sets already merged
    newLocDb = {}   # aggregated version of locDb
    for loc, locDict in locDb.items():
        # find all elements with same coordinates
        sameCoord = [(l, lObj['users']) for l, lObj in locDb.items() if lObj['coordinates'] == locDict['coordinates']]
        if len(sameCoord) == 1:
            # if only one element, copy it straight to the new dict
            newLocDb[loc] = locDict
        elif len(sameCoord) > 1:
            # if we're here, multiple location strings share these coordinates;
            # find the most popular name (the one with the most users)
            locMax = max(sameCoord, key=lambda x: len(x[1]))
            location = locMax[0]
            coordHash = '/'.join([str(locDict['coordinates']['lat']), str(locDict['coordinates']['lng'])])
            # if we haven't seen this set of coordinates yet...
            if coordHash not in done:
                # ... collect all users ...
                users = []
                for l, us in sameCoord:
                    for u in us:
                        users.append(u)
                users.sort()
                # ... and add the aggregated result
                if location not in newLocDb:
                    newLocDb[location] = {}
                newLocDb[location]['users'] = users
                newLocDb[location]['coordinates'] = locDict['coordinates']
                done.append(coordHash)
    # Build the javascript marker entries: ['<html>', lat, lng].
    # NOTE(review): only the location name is &apos;-escaped; usernames are
    # injected verbatim into the HTML -- confirm they cannot contain quotes.
    locStrings = []
    for k in newLocDb.keys():
        locStrings.append("['<h4>{name} - {numusers}</h4><small>{users}</small>', {lat}, {lng}]".format(
            name=k.replace("'", "&apos;"),
            lat=newLocDb[k]['coordinates']['lat'],
            lng=newLocDb[k]['coordinates']['lng'],
            users=',<br />'.join(newLocDb[k]['users']),
            numusers=len(newLocDb[k]['users'])))
    locStrings.sort()
    return TEMPLATE.substitute(locations=',\n'.join(locStrings),
                               users_real_loc=len(loc_users),
                               users_fake_loc=loc_users_fake_num,
                               users_no_loc=noLoc_user_num,
                               timestamp=datetime.now().isoformat())
if __name__ == '__main__':
    # Build the map from the default local user database and drop the
    # resulting page into the user's home directory.
    page = generate_map(expanduser('~/.twister/_localusersdb'))
    out_path = expanduser('~/twistermap.html')
    with open(out_path, 'wb') as out_file:
        out_file.write(page.encode('utf-8'))

111
map.html

@@ -0,0 +1,111 @@
<!DOCTYPE html>
<!-- Python string.Template source consumed by Twistmapper.generate_map():
     $locations, $timestamp, $users_real_loc, $users_fake_loc and
     $users_no_loc are placeholders, and "$$" below is the Template escape
     for a literal "$". This file is not served as-is. -->
<html>
<head>
<meta http-equiv="content-type" content="text/html; charset=UTF-8"/>
<title>Map of Twister Users</title>
<script src="http://maps.google.com/maps/api/js?sensor=false"></script>
<script src="http://ajax.aspnetcdn.com/ajax/jQuery/jquery-1.10.1.min.js"></script>
<style>
body {
font-family: Helvetica, Verdana, Arial, sans-serif;
}
h1, h4 {
padding-bottom: 0;
margin-bottom: 0;
}
</style>
</head>
<body>
<h1>Map of Twister Users</h1>
<p>
<small>(as self-reported in profile)</small>
</p>
<div id="map" style="width: 600px; height: 400px;"></div>
<!-- Statistics filled in by generate_map() -->
<ul>
<li>Updated at: $timestamp</li>
<li>Users with realistic location: $users_real_loc</li>
<li>Users with unrealistic location: $users_fake_loc</li>
<li>Users without location: $users_no_loc</li>
</ul>
<div id="bottom">For any feedback, ping @toyg on Twister.</div>
<script type="text/javascript">
// Define your locations: HTML content for the info window, latitude, longitude
var locations = [$locations];
// Setup the different icons and shadows
var iconURLPrefix = 'http://maps.google.com/mapfiles/ms/icons/';
// Marker colours are cycled so neighbouring markers are distinguishable.
var icons = [
iconURLPrefix + 'red-dot.png',
iconURLPrefix + 'green-dot.png',
iconURLPrefix + 'blue-dot.png',
iconURLPrefix + 'orange-dot.png',
iconURLPrefix + 'purple-dot.png',
iconURLPrefix + 'pink-dot.png',
iconURLPrefix + 'yellow-dot.png'
]
var icons_length = icons.length;
var shadow = {
anchor: new google.maps.Point(15, 33),
url: iconURLPrefix + 'msmarker.shadow.png'
};
var map = new google.maps.Map(document.getElementById('map'), {
zoom: 10,
center: new google.maps.LatLng(-37.92, 151.25),
mapTypeId: google.maps.MapTypeId.ROADMAP,
mapTypeControl: false,
streetViewControl: false,
panControl: false,
zoomControlOptions: {
position: google.maps.ControlPosition.LEFT_BOTTOM
}
});
var infowindow = new google.maps.InfoWindow({
maxWidth: 160
});
var marker;
var markers = new Array();
var iconCounter = 0;
// Add the markers and infowindows to the map
for (var i = 0; i < locations.length; i++) {
marker = new google.maps.Marker({
position: new google.maps.LatLng(locations[i][1], locations[i][2]),
map: map,
icon: icons[iconCounter],
shadow: shadow
});
markers.push(marker);
// IIFE captures marker and i per iteration (vars are function-scoped).
google.maps.event.addListener(marker, 'click', (function (marker, i) {
return function () {
infowindow.setContent(locations[i][0]);
infowindow.open(map, marker);
}
})(marker, i));
iconCounter++;
if (iconCounter >= icons_length) {
iconCounter = 0;
}
}
// Fit the viewport to all markers. "$$" becomes "$" after Template
// substitution, i.e. this is a plain jQuery.each call.
function AutoCenter() {
var bounds = new google.maps.LatLngBounds();
$$.each(markers, function (index, marker) {
bounds.extend(marker.position);
});
map.fitBounds(bounds);
}
AutoCenter();
</script>
</body>
</html>

100
twistmonitor.py

@@ -0,0 +1,100 @@
# -*- coding: utf-8 -*-
import pickle
from operator import attrgetter
from threading import Thread
from time import sleep
from os.path import expanduser
import feedparser
from twistscraper import TwisterScraper
__author__ = 'Giacomo Lacava'
# URL templates for GitHub repositories and their master-branch commit feeds.
GITHUB_REPO_URL = 'https://github.com/{user}/{repo}'
# Atom feed of the master-branch commit log for a repository.
GITHUB_COMMIT_FEED_TEMPLATE = GITHUB_REPO_URL + '/commits/master.atom'
# The three official Twister repositories monitored by this bot.
CORE_COMMIT_FEED = GITHUB_COMMIT_FEED_TEMPLATE.format(user='miguelfreitas', repo='twister-core')
HTML_COMMIT_FEED = GITHUB_COMMIT_FEED_TEMPLATE.format(user='miguelfreitas', repo='twister-html')
SEED_COMMIT_FEED = GITHUB_COMMIT_FEED_TEMPLATE.format(user='miguelfreitas', repo='twister-seeder')
CORE_REPO_URL = GITHUB_REPO_URL.format(user='miguelfreitas', repo='twister-core')
HTML_REPO_URL = GITHUB_REPO_URL.format(user='miguelfreitas', repo='twister-html')
SEED_REPO_URL = GITHUB_REPO_URL.format(user='miguelfreitas', repo='twister-seeder')
class TwisterMonitor(Thread):
    """Bot thread that announces new GitHub commits as Twister posts.

    Each monitor polls one repository's Atom commit feed and posts a short
    message for every commit it has not announced before. Announced entry
    ids are pickled to ~/.twister/_twm_cache so restarts do not repost.
    """

    # Post template; trimmed below to fit Twister's 140-character limit.
    MESSAGE = 'Twister update: {msg} - Pull it now: {url}'

    def __init__(self, twister_monitor, username, repo_feed=CORE_COMMIT_FEED, repo_url=CORE_REPO_URL):
        """
        :param twister_monitor: a TwisterScraper instance -- only its
            .twister RPC proxy is used (the parameter name is misleading)
        :param username: Twister account the bot posts as
        :param repo_feed: Atom commit feed URL to poll
        :param repo_url: repository URL appended to every post
        """
        Thread.__init__(self)
        self.ts = twister_monitor
        self.cacheFile = expanduser('~/.twister/_twm_cache')
        self.cache = {}  # feed URL -> list of already-announced entry ids
        self.username = username
        self.feed = repo_feed
        self.repo = repo_url
        self.loadCache()

    def loadCache(self):
        """Restore the announced-commit cache; start empty if absent."""
        try:
            with open(self.cacheFile, 'rb') as f:
                self.cache = pickle.load(f)
        except FileNotFoundError:
            self.cache = {}

    def get_commits(self):
        """Poll the feed once and post every not-yet-announced commit.

        Raises Exception when feedparser flags the feed as malformed or
        unreachable ("bozo"); run() catches that and retries later.
        """
        print("Fetching {0}".format(self.feed))
        f = feedparser.parse(self.feed)
        if f['bozo'] == 1:
            raise Exception('Bad feed! Status: {status} - Error {err}'.format(status=f.status, err=f.bozo_exception))
        if self.feed not in self.cache:
            self.cache[self.feed] = []
        # Oldest first, so commits are announced in chronological order.
        f.entries.sort(key=attrgetter('updated_parsed'))
        for entry in f.entries:
            print("Checking {0}".format(entry.id))
            if entry.id not in self.cache[self.feed]:
                message = TwisterMonitor.MESSAGE.format(msg=entry.title, url=self.repo)
                # Trim the commit title one character at a time until the
                # whole message fits under 140 characters.
                cut = 1
                while len(message) >= 140:
                    message = TwisterMonitor.MESSAGE.format(msg=(entry.title[:-cut] + '...'), url=self.repo)
                    cut += 1
                print("Checking last post key...")
                # Twister posts need a sequential key: last post's k + 1
                # (1 when the account has no posts yet).
                key = 1
                lastpost = self.ts.twister.getposts(1, [{"username": self.username}])
                if len(lastpost) == 1:
                    key = lastpost[0]['userpost']['k'] + 1
                print("Posting '{0}' with key {1}...".format(message, key))
                self.ts.twister.newpostmsg(self.username, key, message)
                print("Posted!")
                # Record the entry before sleeping so a crash cannot repost it.
                self.cache[self.feed].append(entry.id)
                self.saveCache()
                # Rate-limit: wait ten minutes between consecutive posts.
                sleep(10 * 60)

    def saveCache(self):
        """Persist the announced-commit cache with pickle."""
        with open(self.cacheFile, 'wb') as f:
            pickle.dump(self.cache, f)

    def run(self):
        """Poll forever; survive any error and retry on the next round."""
        while True:
            try:
                self.get_commits()
            except Exception as e:
                # Keep the bot alive on any error (bad feed, RPC hiccup, ...).
                print("Exception following!")
                print(e)
            sleep(60 * 60)  # in seconds
if __name__ == '__main__':
    # Start one bot thread per repository, staggering the start-ups so the
    # three monitors do not hit GitHub (and post) at the same time.
    bot_id = 'twmonitor'
    scraper = TwisterScraper(expanduser('~/.twister/_localusersdb'))
    schedule = (
        (0, CORE_COMMIT_FEED, CORE_REPO_URL),
        (4 * 60, HTML_COMMIT_FEED, HTML_REPO_URL),
        (6 * 60, SEED_COMMIT_FEED, SEED_REPO_URL),
    )
    for delay, feed_url, repo_url in schedule:
        if delay:
            sleep(delay)
        TwisterMonitor(scraper, bot_id, feed_url, repo_url).start()

189
twistscraper.py

@@ -0,0 +1,189 @@
# -*- coding: utf-8 -*-
import json
from http.client import HTTPException
from urllib.parse import urlencode
from urllib.request import urlopen
from os.path import expanduser
__author__ = 'Giacomo Lacava'
import time, datetime
import pickle
import sys
# Seconds before a cached geocoding result is considered stale.
# NOTE(review): appears unused in this module -- confirm before removing.
cacheTimeout = 24 * 3600
try:
    from bitcoinrpc.authproxy import AuthServiceProxy
except ImportError as exc:
    # The Twister daemon speaks the Bitcoin JSON-RPC protocol, so the
    # python-bitcoinrpc client library is a hard requirement.
    sys.stderr.write("Error: install python-bitcoinrpc (https://github.com/jgarzik/python-bitcoinrpc)\n")
    sys.exit(-1)
class User:
    """A single Twister user profile, as scraped from the network's DHT.

    Instances are pickled inside TwisterDb; the class-level defaults are kept
    as a fallback for objects serialized before __init__ existed.
    """

    # Class-level defaults (also the fallback for previously pickled objects).
    username = ""
    avatar = ""
    fullname = ""
    location = ""   # free-text location string from the profile
    coords = None   # cached geocoding result: {'lat': ..., 'lng': ...}
    bio = ""
    url = ""
    updateTime = 0  # datetime of the last profile fetch
    following = []
    _GMAP_URL = "https://maps.googleapis.com/maps/api/geocode/json?sensor=false&{query}"

    def __init__(self):
        # Shadow the mutable class attribute: without this, every instance
        # created via User() shared one and the same `following` list, so
        # appending to one user's list changed them all.
        self.following = []

    def locate(self):
        """
        Query Google API and save coordinates. Should work until we start having more than 50 new locatable
        users per hour.
        :return: dict with coordinates { 'lat':12345, 'lng':13245 }
        """
        if self.location == '':
            # No self-reported location: nothing to geocode.
            return None
        if self.coords is not None:
            # Geocode each location only once per User instance.
            return self.coords
        loc = urlencode({'address': self.location})
        urldoc = urlopen(User._GMAP_URL.format(query=loc))
        # read(), not readall(): read() is the documented HTTPResponse API;
        # readall() is not guaranteed on the object urlopen returns.
        jsObj = json.loads(urldoc.read().decode('utf-8'))
        if len(jsObj['results']) > 0:
            # discard commercial results
            locTypes = jsObj['results'][0]['address_components'][0]['types']
            if not 'premise' in locTypes and not 'route' in locTypes and not 'establishment' in locTypes and not 'subpremise' in locTypes:
                self.coords = jsObj['results'][0]['geometry']['location']
                return self.coords
        # still here? it's all rubbish
        return None
class TwisterDb:
    """On-disk state for the scraper (serialized with pickle)."""

    def __init__(self):
        # Hash of the last blockchain block processed by scrape_users.
        self.lastBlockHash = None
        # Map of username -> User object.
        self.users = {}
class TwisterScraper:
    """Crawls a local Twister daemon for usernames and profile details.

    The scraper walks the whole blockchain for registered usernames, then
    fetches each user's profile from the DHT, persisting everything in a
    pickled TwisterDb at `dbPath`.
    """

    # Profiles older than this are considered stale and re-fetched.
    CACHE_MAX_DURATION = datetime.timedelta(7)  # ([days [, seconds [,microseconds]]])

    def __init__(self, dbPath, server='localhost', port=28332, user='user', password='pwd', protocol='http'):
        """Connect to the Twister JSON-RPC daemon and load (or create) the db.

        :param dbPath: path of the pickled TwisterDb file
        :param server, port, user, password, protocol: JSON-RPC endpoint pieces
        """
        self.serverUrl = '{protocol}://{user}:{passwd}@{server}:{port}'.format(protocol=protocol,
                                                                               server=server,
                                                                               port=port,
                                                                               user=user,
                                                                               passwd=password)
        self.twister = AuthServiceProxy(self.serverUrl)
        self.dbFile = dbPath
        try:
            with open(self.dbFile, 'rb') as dbFile:
                self.db = pickle.load(dbFile)
        except FileNotFoundError:
            # First run: start with an empty database.
            self.db = TwisterDb()
            self.saveDb()

    def get_user(self, username):
        """Return the cached User for `username`, or None if unknown."""
        return self.db.users.get(username)

    def scrape_users(self):
        """Walk the blockchain for usernames, then fetch new/stale profiles."""
        #if self.db.lastBlockHash is not None and len(self.db.users) != 0:
        #    nextHash = self.db.lastBlockHash
        #else:
        nextHash = self.twister.getblockhash(0)
        usernames = set()
        index = 0
        while True:
            block = self.twister.getblock(nextHash)
            self.db.lastBlockHash = block['hash']
            usernames = usernames.union(set(block['usernames']))
            if len(usernames) > index:
                # Progress output only when the count actually grew.
                index = len(usernames)
                print('Found {0} usernames'.format(index))
            if "nextblockhash" in block:
                nextHash = block["nextblockhash"]
            else:
                break
        if len(self.db.users) == 0:
            # First run: register every username as a placeholder whose
            # updateTime is already past the cache window, so the refresh
            # pass below fetches all of them.
            for u in usernames:
                blankUser = User()
                blankUser.username = u
                blankUser.updateTime = datetime.datetime.now() - self.CACHE_MAX_DURATION
                # Fix: the placeholder was built but never stored in the db.
                self.db.users[u] = blankUser
            self.saveDb()
        now = datetime.datetime.now()
        old_users = self.db.users.keys()
        need_refresh = [u for u in old_users if (self.db.users[u].updateTime + self.CACHE_MAX_DURATION) < now]
        new_users = usernames.difference(set(old_users))
        to_fetch = new_users.union(set(need_refresh))
        total_to_fetch = len(to_fetch)
        for n, u in enumerate(to_fetch, start=1):  # 1-based for the progress line
            try:
                user = self._fetch_user_details(u)
                self.db.users[user.username] = user
                self.saveDb()  # save after every user so a crash loses nothing
                print("({line} of {total}) Fetched {user} ...".format(user=u, line=n, total=total_to_fetch))
            except HTTPException as e:
                # A network hiccup on one profile should not abort the scrape.
                print("Connection error retrieving user {0}: {1}".format(u, str(e)))

    def saveDb(self):
        """Persist the in-memory database to dbFile with pickle."""
        with open(self.dbFile, 'wb') as dbFile:
            pickle.dump(self.db, dbFile)

    def get_posts_since(self, username, dateObj, maxNum=1000):
        """Return `username`'s posts at/after `dateObj`, oldest first.

        :param username: Twister username to query
        :param dateObj: datetime cutoff
        :param maxNum: maximum number of posts fetched from the daemon
        :return: chronologically sorted list of post dicts
        """
        since_epoch = time.mktime(dateObj.timetuple())
        # Fix: honour maxNum (the original hard-coded 1000, ignoring it).
        all_posts = self.twister.getposts(maxNum, [{'username': username}])
        all_posts = sorted(all_posts, key=lambda x: x['userpost']['time'])
        # Fix: the original hand-rolled binary search was guarded by
        # `while 0 > index > len(all_posts)`, a condition that is always
        # false, so the search never ran and a midpoint slice was returned
        # regardless of dateObj. A linear scan over at most maxNum sorted
        # posts is simple and plenty fast.
        for index, post in enumerate(all_posts):
            if post['userpost']['time'] >= since_epoch:
                return all_posts[index:]
        return []

    def _fetch_user_details(self, username):
        """Fetch avatar, profile fields and following list for one user."""
        user = User()
        user.username = username
        # dhtget returns a (possibly empty) list of DHT records; the payload
        # of interest sits under ['p']['v'] when present.
        avatarData = self.twister.dhtget(username, "avatar", "s")
        if len(avatarData) == 1:
            if 'p' in avatarData[0]:
                if 'v' in avatarData[0]['p']:
                    user.avatar = avatarData[0]['p']['v']
        profileData = self.twister.dhtget(username, 'profile', 's')
        if len(profileData) == 1:
            if 'p' in profileData[0]:
                if 'v' in profileData[0]['p']:
                    profile = profileData[0]['p']['v']
                    for key in ['location', 'url', 'bio', 'fullname']:
                        if key in profile:
                            setattr(user, key, profile[key])
        user.following = self.twister.getfollowing(username)
        user.updateTime = datetime.datetime.now()
        return user
if __name__ == '__main__':
    # Refresh the local user database and report how many users it now holds.
    scraper = TwisterScraper(expanduser('~/.twister/_localusersdb'), 'localhost')
    scraper.scrape_users()
    print("Total users in db: {0}".format(len(scraper.db.users)))
Loading…
Cancel
Save