Browse Source

Merge pull request #5051

aedc74d contrib: make linearize-data.py cope with out-of-order blocks (Wladimir J. van der Laan)
0.10
Wladimir J. van der Laan 10 years ago
parent
commit
97a34c28d5
No known key found for this signature in database
GPG Key ID: 74810B012346C9A6
  1. 2
      contrib/linearize/example-linearize.cfg
  2. 228
      contrib/linearize/linearize-data.py

2
contrib/linearize/example-linearize.cfg

@ -15,3 +15,5 @@ output_file=/home/example/Downloads/bootstrap.dat
hashlist=hashlist.txt hashlist=hashlist.txt
split_year=1 split_year=1
# Maximum size in bytes of out-of-order blocks cache in memory
out_of_order_cache_sz = 100000000

228
contrib/linearize/linearize-data.py

@ -2,11 +2,12 @@
# #
# linearize-data.py: Construct a linear, no-fork version of the chain. # linearize-data.py: Construct a linear, no-fork version of the chain.
# #
# Copyright (c) 2013 The Bitcoin developers # Copyright (c) 2013-2014 The Bitcoin developers
# Distributed under the MIT/X11 software license, see the accompanying # Distributed under the MIT/X11 software license, see the accompanying
# file COPYING or http://www.opensource.org/licenses/mit-license.php. # file COPYING or http://www.opensource.org/licenses/mit-license.php.
# #
from __future__ import print_function, division
import json import json
import struct import struct
import re import re
@ -17,10 +18,10 @@ import sys
import hashlib import hashlib
import datetime import datetime
import time import time
from collections import namedtuple
settings = {} settings = {}
def uint32(x): def uint32(x):
return x & 0xffffffffL return x & 0xffffffffL
@ -78,116 +79,174 @@ def get_block_hashes(settings):
return blkindex return blkindex
def mkblockset(blkindex): def mkblockmap(blkindex):
blkmap = {} blkmap = {}
for hash in blkindex: for height,hash in enumerate(blkindex):
blkmap[hash] = True blkmap[hash] = height
return blkmap return blkmap
def copydata(settings, blkindex, blkset): # Block header and extent on disk
inFn = 0 BlockExtent = namedtuple('BlockExtent', ['fn', 'offset', 'inhdr', 'blkhdr', 'size'])
inF = None
outFn = 0 class BlockDataCopier:
outsz = 0 def __init__(self, settings, blkindex, blkmap):
outF = None self.settings = settings
outFname = None self.blkindex = blkindex
blkCount = 0 self.blkmap = blkmap
lastDate = datetime.datetime(2000, 1, 1) self.inFn = 0
highTS = 1408893517 - 315360000 self.inF = None
timestampSplit = False self.outFn = 0
fileOutput = True self.outsz = 0
setFileTime = False self.outF = None
maxOutSz = settings['max_out_sz'] self.outFname = None
self.blkCountIn = 0
self.blkCountOut = 0
self.lastDate = datetime.datetime(2000, 1, 1)
self.highTS = 1408893517 - 315360000
self.timestampSplit = False
self.fileOutput = True
self.setFileTime = False
self.maxOutSz = settings['max_out_sz']
if 'output' in settings: if 'output' in settings:
fileOutput = False self.fileOutput = False
if settings['file_timestamp'] != 0: if settings['file_timestamp'] != 0:
setFileTime = True self.setFileTime = True
if settings['split_timestamp'] != 0: if settings['split_timestamp'] != 0:
timestampSplit = True self.timestampSplit = True
# Extents and cache for out-of-order blocks
self.blockExtents = {}
self.outOfOrderData = {}
self.outOfOrderSize = 0 # running total size for items in outOfOrderData
def writeBlock(self, inhdr, blk_hdr, rawblock):
    '''Append one block to the current output file, rotating files when needed.

    inhdr    -- the 8-byte record prefix read from the input file
                (network magic followed by the little-endian length)
    blk_hdr  -- the 80-byte block header
    rawblock -- the remainder of the block (everything after the header)

    A new output file is started when writing to an output directory and
    the block would push the current file past max_out_sz, or when
    timestamp splitting is enabled and the block's date is later than the
    last date seen. Updates outsz, blkCountOut and highTS.
    '''
    # Size is derived from the actual bytes being written; the block
    # length is not stored on the instance.
    blockSizeOnDisk = len(inhdr) + len(blk_hdr) + len(rawblock)

    # Size-based rotation only applies in directory mode, and only when a
    # file is actually open (nothing to rotate before the first write).
    if self.outF and not self.fileOutput and ((self.outsz + blockSizeOnDisk) > self.maxOutSz):
        self._finishOutputFile()

    (blkDate, blkTS) = get_blk_dt(blk_hdr)
    if self.timestampSplit and (blkDate > self.lastDate):
        # The hash is only needed for this message, so compute it here
        # from the header instead of relying on a caller's local variable.
        print("New month " + blkDate.strftime("%Y-%m") + " @ " + calc_hash_str(blk_hdr))
        # Must be stored on the instance, otherwise every block would
        # compare against the initial date and force a split.
        self.lastDate = blkDate
        if self.outF:
            self._finishOutputFile()

    if not self.outF:
        if self.fileOutput:
            self.outFname = self.settings['output_file']
        else:
            self.outFname = "%s/blk%05d.dat" % (self.settings['output'], self.outFn)
        print("Output file" + self.outFname)
        self.outF = open(self.outFname, "wb")

    self.outF.write(inhdr)
    self.outF.write(blk_hdr)
    self.outF.write(rawblock)
    self.outsz = self.outsz + blockSizeOnDisk

    self.blkCountOut = self.blkCountOut + 1
    if blkTS > self.highTS:
        self.highTS = blkTS

    if (self.blkCountOut % 1000) == 0:
        print('%i blocks scanned, %i blocks written (of %i, %.1f%% complete)' %
              (self.blkCountIn, self.blkCountOut, len(self.blkindex),
               100.0 * self.blkCountOut / len(self.blkindex)))

def _finishOutputFile(self):
    '''Close the open output file and reset state for the next output file.'''
    self.outF.close()
    if self.setFileTime:
        # Access time is set to "now"; modification time is set to the
        # highest block timestamp recorded so far.
        os.utime(self.outFname, (int(time.time()), self.highTS))
    self.outF = None
    self.outFname = None
    self.outFn = self.outFn + 1
    self.outsz = 0
def inFileName(self, fn):
    '''Return the path of input block file number fn inside the configured input directory.'''
    input_dir = self.settings['input']
    return "%s/blk%05d.dat" % (input_dir, fn)
def fetchBlock(self, extent):
    '''Read and return extent.size bytes at extent.offset from input file number extent.fn.'''
    path = self.inFileName(extent.fn)
    with open(path, "rb") as blk_file:
        blk_file.seek(extent.offset)
        data = blk_file.read(extent.size)
    return data
def copyOneBlock(self):
    '''Write the block whose height equals blkCountOut, taking it from the cache or from disk.'''
    height = self.blkCountOut
    extent = self.blockExtents.pop(height)
    if height not in self.outOfOrderData:
        # Not held in memory: read it back using the recorded extent
        payload = self.fetchBlock(extent)
    else:
        # Held in memory: take it out of the cache and release its share of the size budget
        payload = self.outOfOrderData.pop(height)
        self.outOfOrderSize -= len(payload)
    self.writeBlock(extent.inhdr, extent.blkhdr, payload)
def run(self):
while self.blkCountOut < len(self.blkindex):
if not self.inF:
fname = self.inFileName(self.inFn)
print("Input file" + fname) print("Input file" + fname)
try: try:
inF = open(fname, "rb") self.inF = open(fname, "rb")
except IOError: except IOError:
print "Done" print("Premature end of block data")
return return
inhdr = inF.read(8) inhdr = self.inF.read(8)
if (not inhdr or (inhdr[0] == "\0")): if (not inhdr or (inhdr[0] == "\0")):
inF.close() self.inF.close()
inF = None self.inF = None
inFn = inFn + 1 self.inFn = self.inFn + 1
continue continue
inMagic = inhdr[:4] inMagic = inhdr[:4]
if (inMagic != settings['netmagic']): if (inMagic != self.settings['netmagic']):
print("Invalid magic:" + inMagic) print("Invalid magic:" + inMagic)
return return
inLenLE = inhdr[4:] inLenLE = inhdr[4:]
su = struct.unpack("<I", inLenLE) su = struct.unpack("<I", inLenLE)
inLen = su[0] inLen = su[0] - 80 # length without header
rawblock = inF.read(inLen) blk_hdr = self.inF.read(80)
blk_hdr = rawblock[:80] inExtent = BlockExtent(self.inFn, self.inF.tell(), inhdr, blk_hdr, inLen)
hash_str = calc_hash_str(blk_hdr) hash_str = calc_hash_str(blk_hdr)
if not hash_str in blkset: if not hash_str in blkmap:
print("Skipping unknown block " + hash_str) print("Skipping unknown block " + hash_str)
self.inF.seek(inLen, os.SEEK_CUR)
continue continue
if blkindex[blkCount] != hash_str: blkHeight = self.blkmap[hash_str]
print("Out of order block.") self.blkCountIn += 1
print("Expected " + blkindex[blkCount])
print("Got " + hash_str)
sys.exit(1)
if not fileOutput and ((outsz + inLen) > maxOutSz): if self.blkCountOut == blkHeight:
outF.close() # If in-order block, just copy
if setFileTime: rawblock = self.inF.read(inLen)
os.utime(outFname, (int(time.time()), highTS)) self.writeBlock(inhdr, blk_hdr, rawblock)
outF = None
outFname = None
outFn = outFn + 1
outsz = 0
(blkDate, blkTS) = get_blk_dt(blk_hdr)
if timestampSplit and (blkDate > lastDate):
print("New month " + blkDate.strftime("%Y-%m") + " @ " + hash_str)
lastDate = blkDate
if outF:
outF.close()
if setFileTime:
os.utime(outFname, (int(time.time()), highTS))
outF = None
outFname = None
outFn = outFn + 1
outsz = 0
if not outF:
if fileOutput:
outFname = settings['output_file']
else:
outFname = "%s/blk%05d.dat" % (settings['output'], outFn)
print("Output file" + outFname)
outF = open(outFname, "wb")
outF.write(inhdr) # See if we can catch up to prior out-of-order blocks
outF.write(rawblock) while self.blkCountOut in self.blockExtents:
outsz = outsz + inLen + 8 self.copyOneBlock()
blkCount = blkCount + 1 else: # If out-of-order, skip over block data for now
if blkTS > highTS: self.blockExtents[blkHeight] = inExtent
highTS = blkTS if self.outOfOrderSize < self.settings['out_of_order_cache_sz']:
# If there is space in the cache, read the data
# Reading the data in file sequence instead of seeking and fetching it later is preferred,
# but we don't want to fill up memory
self.outOfOrderData[blkHeight] = self.inF.read(inLen)
self.outOfOrderSize += inLen
else: # If no space in cache, seek forward
self.inF.seek(inLen, os.SEEK_CUR)
if (blkCount % 1000) == 0: print("Done (%i blocks written)" % (self.blkCountOut))
print("Wrote " + str(blkCount) + " blocks")
if __name__ == '__main__': if __name__ == '__main__':
if len(sys.argv) != 2: if len(sys.argv) != 2:
print "Usage: linearize-data.py CONFIG-FILE" print("Usage: linearize-data.py CONFIG-FILE")
sys.exit(1) sys.exit(1)
f = open(sys.argv[1]) f = open(sys.argv[1])
@ -216,22 +275,25 @@ if __name__ == '__main__':
settings['split_timestamp'] = 0 settings['split_timestamp'] = 0
if 'max_out_sz' not in settings: if 'max_out_sz' not in settings:
settings['max_out_sz'] = 1000L * 1000 * 1000 settings['max_out_sz'] = 1000L * 1000 * 1000
if 'out_of_order_cache_sz' not in settings:
settings['out_of_order_cache_sz'] = 100 * 1000 * 1000
settings['max_out_sz'] = long(settings['max_out_sz']) settings['max_out_sz'] = long(settings['max_out_sz'])
settings['split_timestamp'] = int(settings['split_timestamp']) settings['split_timestamp'] = int(settings['split_timestamp'])
settings['file_timestamp'] = int(settings['file_timestamp']) settings['file_timestamp'] = int(settings['file_timestamp'])
settings['netmagic'] = settings['netmagic'].decode('hex') settings['netmagic'] = settings['netmagic'].decode('hex')
settings['out_of_order_cache_sz'] = int(settings['out_of_order_cache_sz'])
if 'output_file' not in settings and 'output' not in settings: if 'output_file' not in settings and 'output' not in settings:
print("Missing output file / directory") print("Missing output file / directory")
sys.exit(1) sys.exit(1)
blkindex = get_block_hashes(settings) blkindex = get_block_hashes(settings)
blkset = mkblockset(blkindex) blkmap = mkblockmap(blkindex)
if not "000000000019d6689c085ae165831e934ff763ae46a2a6c172b3f1b60a8ce26f" in blkset: if not "000000000019d6689c085ae165831e934ff763ae46a2a6c172b3f1b60a8ce26f" in blkmap:
print("not found") print("not found")
else: else:
copydata(settings, blkindex, blkset) BlockDataCopier(settings, blkindex, blkmap).run()

Loading…
Cancel
Save