|
|
|
@ -2,11 +2,12 @@
@@ -2,11 +2,12 @@
|
|
|
|
|
# |
|
|
|
|
# linearize-data.py: Construct a linear, no-fork version of the chain. |
|
|
|
|
# |
|
|
|
|
# Copyright (c) 2013 The Bitcoin developers |
|
|
|
|
# Copyright (c) 2013-2014 The Bitcoin developers |
|
|
|
|
# Distributed under the MIT/X11 software license, see the accompanying |
|
|
|
|
# file COPYING or http://www.opensource.org/licenses/mit-license.php. |
|
|
|
|
# |
|
|
|
|
|
|
|
|
|
from __future__ import print_function, division |
|
|
|
|
import json |
|
|
|
|
import struct |
|
|
|
|
import re |
|
|
|
@ -17,10 +18,10 @@ import sys
@@ -17,10 +18,10 @@ import sys
|
|
|
|
|
import hashlib |
|
|
|
|
import datetime |
|
|
|
|
import time |
|
|
|
|
from collections import namedtuple |
|
|
|
|
|
|
|
|
|
settings = {} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def uint32(x): |
|
|
|
|
return x & 0xffffffffL |
|
|
|
|
|
|
|
|
@ -78,116 +79,174 @@ def get_block_hashes(settings):
@@ -78,116 +79,174 @@ def get_block_hashes(settings):
|
|
|
|
|
|
|
|
|
|
return blkindex |
|
|
|
|
|
|
|
|
|
def mkblockset(blkindex): |
|
|
|
|
def mkblockmap(blkindex): |
|
|
|
|
blkmap = {} |
|
|
|
|
for hash in blkindex: |
|
|
|
|
blkmap[hash] = True |
|
|
|
|
for height,hash in enumerate(blkindex): |
|
|
|
|
blkmap[hash] = height |
|
|
|
|
return blkmap |
|
|
|
|
|
|
|
|
|
def copydata(settings, blkindex, blkset): |
|
|
|
|
inFn = 0 |
|
|
|
|
inF = None |
|
|
|
|
outFn = 0 |
|
|
|
|
outsz = 0 |
|
|
|
|
outF = None |
|
|
|
|
outFname = None |
|
|
|
|
blkCount = 0 |
|
|
|
|
|
|
|
|
|
lastDate = datetime.datetime(2000, 1, 1) |
|
|
|
|
highTS = 1408893517 - 315360000 |
|
|
|
|
timestampSplit = False |
|
|
|
|
fileOutput = True |
|
|
|
|
setFileTime = False |
|
|
|
|
maxOutSz = settings['max_out_sz'] |
|
|
|
|
if 'output' in settings: |
|
|
|
|
fileOutput = False |
|
|
|
|
if settings['file_timestamp'] != 0: |
|
|
|
|
setFileTime = True |
|
|
|
|
if settings['split_timestamp'] != 0: |
|
|
|
|
timestampSplit = True |
|
|
|
|
|
|
|
|
|
while True: |
|
|
|
|
if not inF: |
|
|
|
|
fname = "%s/blk%05d.dat" % (settings['input'], inFn) |
|
|
|
|
print("Input file" + fname) |
|
|
|
|
try: |
|
|
|
|
inF = open(fname, "rb") |
|
|
|
|
except IOError: |
|
|
|
|
print "Done" |
|
|
|
|
return |
|
|
|
|
|
|
|
|
|
inhdr = inF.read(8) |
|
|
|
|
if (not inhdr or (inhdr[0] == "\0")): |
|
|
|
|
inF.close() |
|
|
|
|
inF = None |
|
|
|
|
inFn = inFn + 1 |
|
|
|
|
continue |
|
|
|
|
|
|
|
|
|
inMagic = inhdr[:4] |
|
|
|
|
if (inMagic != settings['netmagic']): |
|
|
|
|
print("Invalid magic:" + inMagic) |
|
|
|
|
return |
|
|
|
|
inLenLE = inhdr[4:] |
|
|
|
|
su = struct.unpack("<I", inLenLE) |
|
|
|
|
inLen = su[0] |
|
|
|
|
rawblock = inF.read(inLen) |
|
|
|
|
blk_hdr = rawblock[:80] |
|
|
|
|
|
|
|
|
|
hash_str = calc_hash_str(blk_hdr) |
|
|
|
|
if not hash_str in blkset: |
|
|
|
|
print("Skipping unknown block " + hash_str) |
|
|
|
|
continue |
|
|
|
|
|
|
|
|
|
if blkindex[blkCount] != hash_str: |
|
|
|
|
print("Out of order block.") |
|
|
|
|
print("Expected " + blkindex[blkCount]) |
|
|
|
|
print("Got " + hash_str) |
|
|
|
|
sys.exit(1) |
|
|
|
|
|
|
|
|
|
if not fileOutput and ((outsz + inLen) > maxOutSz): |
|
|
|
|
outF.close() |
|
|
|
|
if setFileTime: |
|
|
|
|
# Block header and extent on disk |
|
|
|
|
BlockExtent = namedtuple('BlockExtent', ['fn', 'offset', 'inhdr', 'blkhdr', 'size']) |
|
|
|
|
|
|
|
|
|
class BlockDataCopier: |
|
|
|
|
def __init__(self, settings, blkindex, blkmap): |
|
|
|
|
self.settings = settings |
|
|
|
|
self.blkindex = blkindex |
|
|
|
|
self.blkmap = blkmap |
|
|
|
|
|
|
|
|
|
self.inFn = 0 |
|
|
|
|
self.inF = None |
|
|
|
|
self.outFn = 0 |
|
|
|
|
self.outsz = 0 |
|
|
|
|
self.outF = None |
|
|
|
|
self.outFname = None |
|
|
|
|
self.blkCountIn = 0 |
|
|
|
|
self.blkCountOut = 0 |
|
|
|
|
|
|
|
|
|
self.lastDate = datetime.datetime(2000, 1, 1) |
|
|
|
|
self.highTS = 1408893517 - 315360000 |
|
|
|
|
self.timestampSplit = False |
|
|
|
|
self.fileOutput = True |
|
|
|
|
self.setFileTime = False |
|
|
|
|
self.maxOutSz = settings['max_out_sz'] |
|
|
|
|
if 'output' in settings: |
|
|
|
|
self.fileOutput = False |
|
|
|
|
if settings['file_timestamp'] != 0: |
|
|
|
|
self.setFileTime = True |
|
|
|
|
if settings['split_timestamp'] != 0: |
|
|
|
|
self.timestampSplit = True |
|
|
|
|
# Extents and cache for out-of-order blocks |
|
|
|
|
self.blockExtents = {} |
|
|
|
|
self.outOfOrderData = {} |
|
|
|
|
self.outOfOrderSize = 0 # running total size for items in outOfOrderData |
|
|
|
|
|
|
|
|
|
def writeBlock(self, inhdr, blk_hdr, rawblock): |
|
|
|
|
if not self.fileOutput and ((self.outsz + self.inLen) > self.maxOutSz): |
|
|
|
|
self.outF.close() |
|
|
|
|
if self.setFileTime: |
|
|
|
|
os.utime(outFname, (int(time.time()), highTS)) |
|
|
|
|
outF = None |
|
|
|
|
outFname = None |
|
|
|
|
outFn = outFn + 1 |
|
|
|
|
outsz = 0 |
|
|
|
|
self.outF = None |
|
|
|
|
self.outFname = None |
|
|
|
|
self.outFn = outFn + 1 |
|
|
|
|
self.outsz = 0 |
|
|
|
|
|
|
|
|
|
(blkDate, blkTS) = get_blk_dt(blk_hdr) |
|
|
|
|
if timestampSplit and (blkDate > lastDate): |
|
|
|
|
if self.timestampSplit and (blkDate > self.lastDate): |
|
|
|
|
print("New month " + blkDate.strftime("%Y-%m") + " @ " + hash_str) |
|
|
|
|
lastDate = blkDate |
|
|
|
|
if outF: |
|
|
|
|
outF.close() |
|
|
|
|
if setFileTime: |
|
|
|
|
os.utime(outFname, (int(time.time()), highTS)) |
|
|
|
|
outF = None |
|
|
|
|
outFname = None |
|
|
|
|
outFn = outFn + 1 |
|
|
|
|
outsz = 0 |
|
|
|
|
|
|
|
|
|
if not outF: |
|
|
|
|
if fileOutput: |
|
|
|
|
outFname = settings['output_file'] |
|
|
|
|
self.outF = None |
|
|
|
|
self.outFname = None |
|
|
|
|
self.outFn = self.outFn + 1 |
|
|
|
|
self.outsz = 0 |
|
|
|
|
|
|
|
|
|
if not self.outF: |
|
|
|
|
if self.fileOutput: |
|
|
|
|
outFname = self.settings['output_file'] |
|
|
|
|
else: |
|
|
|
|
outFname = "%s/blk%05d.dat" % (settings['output'], outFn) |
|
|
|
|
outFname = "%s/blk%05d.dat" % (self.settings['output'], outFn) |
|
|
|
|
print("Output file" + outFname) |
|
|
|
|
outF = open(outFname, "wb") |
|
|
|
|
|
|
|
|
|
outF.write(inhdr) |
|
|
|
|
outF.write(rawblock) |
|
|
|
|
outsz = outsz + inLen + 8 |
|
|
|
|
|
|
|
|
|
blkCount = blkCount + 1 |
|
|
|
|
if blkTS > highTS: |
|
|
|
|
highTS = blkTS |
|
|
|
|
|
|
|
|
|
if (blkCount % 1000) == 0: |
|
|
|
|
print("Wrote " + str(blkCount) + " blocks") |
|
|
|
|
self.outF = open(outFname, "wb") |
|
|
|
|
|
|
|
|
|
self.outF.write(inhdr) |
|
|
|
|
self.outF.write(blk_hdr) |
|
|
|
|
self.outF.write(rawblock) |
|
|
|
|
self.outsz = self.outsz + len(inhdr) + len(blk_hdr) + len(rawblock) |
|
|
|
|
|
|
|
|
|
self.blkCountOut = self.blkCountOut + 1 |
|
|
|
|
if blkTS > self.highTS: |
|
|
|
|
self.highTS = blkTS |
|
|
|
|
|
|
|
|
|
if (self.blkCountOut % 1000) == 0: |
|
|
|
|
print('%i blocks scanned, %i blocks written (of %i, %.1f%% complete)' % |
|
|
|
|
(self.blkCountIn, self.blkCountOut, len(self.blkindex), 100.0 * self.blkCountOut / len(self.blkindex))) |
|
|
|
|
|
|
|
|
|
def inFileName(self, fn): |
|
|
|
|
return "%s/blk%05d.dat" % (self.settings['input'], fn) |
|
|
|
|
|
|
|
|
|
def fetchBlock(self, extent): |
|
|
|
|
'''Fetch block contents from disk given extents''' |
|
|
|
|
with open(self.inFileName(extent.fn), "rb") as f: |
|
|
|
|
f.seek(extent.offset) |
|
|
|
|
return f.read(extent.size) |
|
|
|
|
|
|
|
|
|
def copyOneBlock(self): |
|
|
|
|
'''Find the next block to be written in the input, and copy it to the output.''' |
|
|
|
|
extent = self.blockExtents.pop(self.blkCountOut) |
|
|
|
|
if self.blkCountOut in self.outOfOrderData: |
|
|
|
|
# If the data is cached, use it from memory and remove from the cache |
|
|
|
|
rawblock = self.outOfOrderData.pop(self.blkCountOut) |
|
|
|
|
self.outOfOrderSize -= len(rawblock) |
|
|
|
|
else: # Otherwise look up data on disk |
|
|
|
|
rawblock = self.fetchBlock(extent) |
|
|
|
|
|
|
|
|
|
self.writeBlock(extent.inhdr, extent.blkhdr, rawblock) |
|
|
|
|
|
|
|
|
|
def run(self): |
|
|
|
|
while self.blkCountOut < len(self.blkindex): |
|
|
|
|
if not self.inF: |
|
|
|
|
fname = self.inFileName(self.inFn) |
|
|
|
|
print("Input file" + fname) |
|
|
|
|
try: |
|
|
|
|
self.inF = open(fname, "rb") |
|
|
|
|
except IOError: |
|
|
|
|
print("Premature end of block data") |
|
|
|
|
return |
|
|
|
|
|
|
|
|
|
inhdr = self.inF.read(8) |
|
|
|
|
if (not inhdr or (inhdr[0] == "\0")): |
|
|
|
|
self.inF.close() |
|
|
|
|
self.inF = None |
|
|
|
|
self.inFn = self.inFn + 1 |
|
|
|
|
continue |
|
|
|
|
|
|
|
|
|
inMagic = inhdr[:4] |
|
|
|
|
if (inMagic != self.settings['netmagic']): |
|
|
|
|
print("Invalid magic:" + inMagic) |
|
|
|
|
return |
|
|
|
|
inLenLE = inhdr[4:] |
|
|
|
|
su = struct.unpack("<I", inLenLE) |
|
|
|
|
inLen = su[0] - 80 # length without header |
|
|
|
|
blk_hdr = self.inF.read(80) |
|
|
|
|
inExtent = BlockExtent(self.inFn, self.inF.tell(), inhdr, blk_hdr, inLen) |
|
|
|
|
|
|
|
|
|
hash_str = calc_hash_str(blk_hdr) |
|
|
|
|
if not hash_str in blkmap: |
|
|
|
|
print("Skipping unknown block " + hash_str) |
|
|
|
|
self.inF.seek(inLen, os.SEEK_CUR) |
|
|
|
|
continue |
|
|
|
|
|
|
|
|
|
blkHeight = self.blkmap[hash_str] |
|
|
|
|
self.blkCountIn += 1 |
|
|
|
|
|
|
|
|
|
if self.blkCountOut == blkHeight: |
|
|
|
|
# If in-order block, just copy |
|
|
|
|
rawblock = self.inF.read(inLen) |
|
|
|
|
self.writeBlock(inhdr, blk_hdr, rawblock) |
|
|
|
|
|
|
|
|
|
# See if we can catch up to prior out-of-order blocks |
|
|
|
|
while self.blkCountOut in self.blockExtents: |
|
|
|
|
self.copyOneBlock() |
|
|
|
|
|
|
|
|
|
else: # If out-of-order, skip over block data for now |
|
|
|
|
self.blockExtents[blkHeight] = inExtent |
|
|
|
|
if self.outOfOrderSize < self.settings['out_of_order_cache_sz']: |
|
|
|
|
# If there is space in the cache, read the data |
|
|
|
|
# Reading the data in file sequence instead of seeking and fetching it later is preferred, |
|
|
|
|
# but we don't want to fill up memory |
|
|
|
|
self.outOfOrderData[blkHeight] = self.inF.read(inLen) |
|
|
|
|
self.outOfOrderSize += inLen |
|
|
|
|
else: # If no space in cache, seek forward |
|
|
|
|
self.inF.seek(inLen, os.SEEK_CUR) |
|
|
|
|
|
|
|
|
|
print("Done (%i blocks written)" % (self.blkCountOut)) |
|
|
|
|
|
|
|
|
|
if __name__ == '__main__': |
|
|
|
|
if len(sys.argv) != 2: |
|
|
|
|
print "Usage: linearize-data.py CONFIG-FILE" |
|
|
|
|
print("Usage: linearize-data.py CONFIG-FILE") |
|
|
|
|
sys.exit(1) |
|
|
|
|
|
|
|
|
|
f = open(sys.argv[1]) |
|
|
|
@ -216,22 +275,25 @@ if __name__ == '__main__':
@@ -216,22 +275,25 @@ if __name__ == '__main__':
|
|
|
|
|
settings['split_timestamp'] = 0 |
|
|
|
|
if 'max_out_sz' not in settings: |
|
|
|
|
settings['max_out_sz'] = 1000L * 1000 * 1000 |
|
|
|
|
if 'out_of_order_cache_sz' not in settings: |
|
|
|
|
settings['out_of_order_cache_sz'] = 100 * 1000 * 1000 |
|
|
|
|
|
|
|
|
|
settings['max_out_sz'] = long(settings['max_out_sz']) |
|
|
|
|
settings['split_timestamp'] = int(settings['split_timestamp']) |
|
|
|
|
settings['file_timestamp'] = int(settings['file_timestamp']) |
|
|
|
|
settings['netmagic'] = settings['netmagic'].decode('hex') |
|
|
|
|
settings['out_of_order_cache_sz'] = int(settings['out_of_order_cache_sz']) |
|
|
|
|
|
|
|
|
|
if 'output_file' not in settings and 'output' not in settings: |
|
|
|
|
print("Missing output file / directory") |
|
|
|
|
sys.exit(1) |
|
|
|
|
|
|
|
|
|
blkindex = get_block_hashes(settings) |
|
|
|
|
blkset = mkblockset(blkindex) |
|
|
|
|
blkmap = mkblockmap(blkindex) |
|
|
|
|
|
|
|
|
|
if not "000000000019d6689c085ae165831e934ff763ae46a2a6c172b3f1b60a8ce26f" in blkset: |
|
|
|
|
if not "000000000019d6689c085ae165831e934ff763ae46a2a6c172b3f1b60a8ce26f" in blkmap: |
|
|
|
|
print("not found") |
|
|
|
|
else: |
|
|
|
|
copydata(settings, blkindex, blkset) |
|
|
|
|
BlockDataCopier(settings, blkindex, blkmap).run() |
|
|
|
|
|
|
|
|
|
|
|
|
|
|