@@ -2,11 +2,12 @@
 #
 # linearize-data.py: Construct a linear, no-fork version of the chain.
 #
-# Copyright (c) 2013 The Bitcoin developers
+# Copyright (c) 2013-2014 The Bitcoin developers
 # Distributed under the MIT/X11 software license, see the accompanying
 # file COPYING or http://www.opensource.org/licenses/mit-license.php.
 #
 
+from __future__ import print_function, division
 import json
 import struct
 import re
@@ -17,10 +18,10 @@ import sys
 import hashlib
 import datetime
 import time
+from collections import namedtuple
 
 settings = {}
 
 def uint32(x):
 	return x & 0xffffffffL
@@ -78,116 +79,174 @@ def get_block_hashes(settings):
 	return blkindex
 
-def mkblockset(blkindex):
+def mkblockmap(blkindex):
 	blkmap = {}
-	for hash in blkindex:
-		blkmap[hash] = True
+	for height,hash in enumerate(blkindex):
+		blkmap[hash] = height
 	return blkmap
 
-def copydata(settings, blkindex, blkset):
-	inFn = 0
-	inF = None
-	outFn = 0
-	outsz = 0
-	outF = None
-	outFname = None
-	blkCount = 0
-
-	lastDate = datetime.datetime(2000, 1, 1)
-	highTS = 1408893517 - 315360000
-	timestampSplit = False
-	fileOutput = True
-	setFileTime = False
-	maxOutSz = settings['max_out_sz']
-	if 'output' in settings:
-		fileOutput = False
-	if settings['file_timestamp'] != 0:
-		setFileTime = True
-	if settings['split_timestamp'] != 0:
-		timestampSplit = True
-
-	while True:
-		if not inF:
-			fname = "%s/blk%05d.dat" % (settings['input'], inFn)
-			print("Input file " + fname)
-			try:
-				inF = open(fname, "rb")
-			except IOError:
-				print "Done"
-				return
-
-		inhdr = inF.read(8)
-		if (not inhdr or (inhdr[0] == "\0")):
-			inF.close()
-			inF = None
-			inFn = inFn + 1
-			continue
-
-		inMagic = inhdr[:4]
-		if (inMagic != settings['netmagic']):
-			print("Invalid magic: " + inMagic)
-			return
-		inLenLE = inhdr[4:]
-		su = struct.unpack("<I", inLenLE)
-		inLen = su[0]
-		rawblock = inF.read(inLen)
-		blk_hdr = rawblock[:80]
-
-		hash_str = calc_hash_str(blk_hdr)
-		if not hash_str in blkset:
-			print("Skipping unknown block " + hash_str)
-			continue
-
-		if blkindex[blkCount] != hash_str:
-			print("Out of order block.")
-			print("Expected " + blkindex[blkCount])
-			print("Got " + hash_str)
-			sys.exit(1)
-
-		if not fileOutput and ((outsz + inLen) > maxOutSz):
-			outF.close()
-			if setFileTime:
-				os.utime(outFname, (int(time.time()), highTS))
-			outF = None
-			outFname = None
-			outFn = outFn + 1
-			outsz = 0
-
-		(blkDate, blkTS) = get_blk_dt(blk_hdr)
-		if timestampSplit and (blkDate > lastDate):
-			print("New month " + blkDate.strftime("%Y-%m") + " @ " + hash_str)
-			lastDate = blkDate
-			if outF:
-				outF.close()
-				if setFileTime:
-					os.utime(outFname, (int(time.time()), highTS))
-				outF = None
-				outFname = None
-				outFn = outFn + 1
-				outsz = 0
-
-		if not outF:
-			if fileOutput:
-				outFname = settings['output_file']
-			else:
-				outFname = "%s/blk%05d.dat" % (settings['output'], outFn)
-			print("Output file " + outFname)
-			outF = open(outFname, "wb")
-
-		outF.write(inhdr)
-		outF.write(rawblock)
-		outsz = outsz + inLen + 8
-
-		blkCount = blkCount + 1
-		if blkTS > highTS:
-			highTS = blkTS
-
-		if (blkCount % 1000) == 0:
-			print("Wrote " + str(blkCount) + " blocks")
+# Block header and extent on disk
+BlockExtent = namedtuple('BlockExtent', ['fn', 'offset', 'inhdr', 'blkhdr', 'size'])
+
+class BlockDataCopier:
+	def __init__(self, settings, blkindex, blkmap):
+		self.settings = settings
+		self.blkindex = blkindex
+		self.blkmap = blkmap
+
+		self.inFn = 0
+		self.inF = None
+		self.outFn = 0
+		self.outsz = 0
+		self.outF = None
+		self.outFname = None
+		self.blkCountIn = 0
+		self.blkCountOut = 0
+
+		self.lastDate = datetime.datetime(2000, 1, 1)
+		self.highTS = 1408893517 - 315360000
+		self.timestampSplit = False
+		self.fileOutput = True
+		self.setFileTime = False
+		self.maxOutSz = settings['max_out_sz']
+		if 'output' in settings:
+			self.fileOutput = False
+		if settings['file_timestamp'] != 0:
+			self.setFileTime = True
+		if settings['split_timestamp'] != 0:
+			self.timestampSplit = True
+
+		# Extents and cache for out-of-order blocks
+		self.blockExtents = {}
+		self.outOfOrderData = {}
+		self.outOfOrderSize = 0 # running total size for items in outOfOrderData
+
+	def writeBlock(self, inhdr, blk_hdr, rawblock):
+		if not self.fileOutput and ((self.outsz + len(inhdr) + len(blk_hdr) + len(rawblock)) > self.maxOutSz):
+			self.outF.close()
+			if self.setFileTime:
+				os.utime(self.outFname, (int(time.time()), self.highTS))
+			self.outF = None
+			self.outFname = None
+			self.outFn = self.outFn + 1
+			self.outsz = 0
+
+		(blkDate, blkTS) = get_blk_dt(blk_hdr)
+		if self.timestampSplit and (blkDate > self.lastDate):
+			print("New month " + blkDate.strftime("%Y-%m") + " @ " + calc_hash_str(blk_hdr))
+			self.lastDate = blkDate
+			if self.outF:
+				self.outF.close()
+				if self.setFileTime:
+					os.utime(self.outFname, (int(time.time()), self.highTS))
+				self.outF = None
+				self.outFname = None
+				self.outFn = self.outFn + 1
+				self.outsz = 0
+
+		if not self.outF:
+			if self.fileOutput:
+				self.outFname = self.settings['output_file']
+			else:
+				self.outFname = "%s/blk%05d.dat" % (self.settings['output'], self.outFn)
+			print("Output file " + self.outFname)
+			self.outF = open(self.outFname, "wb")
+
+		self.outF.write(inhdr)
+		self.outF.write(blk_hdr)
+		self.outF.write(rawblock)
+		self.outsz = self.outsz + len(inhdr) + len(blk_hdr) + len(rawblock)
+
+		self.blkCountOut = self.blkCountOut + 1
+		if blkTS > self.highTS:
+			self.highTS = blkTS
+
+		if (self.blkCountOut % 1000) == 0:
+			print('%i blocks scanned, %i blocks written (of %i, %.1f%% complete)' %
+					(self.blkCountIn, self.blkCountOut, len(self.blkindex), 100.0 * self.blkCountOut / len(self.blkindex)))
+
+	def inFileName(self, fn):
+		return "%s/blk%05d.dat" % (self.settings['input'], fn)
+
+	def fetchBlock(self, extent):
+		'''Fetch block contents from disk given extents'''
+		with open(self.inFileName(extent.fn), "rb") as f:
+			f.seek(extent.offset)
+			return f.read(extent.size)
+
+	def copyOneBlock(self):
+		'''Find the next block to be written in the input, and copy it to the output.'''
+		extent = self.blockExtents.pop(self.blkCountOut)
+		if self.blkCountOut in self.outOfOrderData:
+			# If the data is cached, use it from memory and remove from the cache
+			rawblock = self.outOfOrderData.pop(self.blkCountOut)
+			self.outOfOrderSize -= len(rawblock)
+		else: # Otherwise look up data on disk
+			rawblock = self.fetchBlock(extent)
+
+		self.writeBlock(extent.inhdr, extent.blkhdr, rawblock)
+
+	def run(self):
+		while self.blkCountOut < len(self.blkindex):
+			if not self.inF:
+				fname = self.inFileName(self.inFn)
+				print("Input file " + fname)
+				try:
+					self.inF = open(fname, "rb")
+				except IOError:
+					print("Premature end of block data")
+					return
+
+			inhdr = self.inF.read(8)
+			if (not inhdr or (inhdr[0] == "\0")):
+				self.inF.close()
+				self.inF = None
+				self.inFn = self.inFn + 1
+				continue
+
+			inMagic = inhdr[:4]
+			if (inMagic != self.settings['netmagic']):
+				print("Invalid magic: " + inMagic)
+				return
+			inLenLE = inhdr[4:]
+			su = struct.unpack("<I", inLenLE)
+			inLen = su[0] - 80 # length without header
+			blk_hdr = self.inF.read(80)
+			inExtent = BlockExtent(self.inFn, self.inF.tell(), inhdr, blk_hdr, inLen)
+
+			hash_str = calc_hash_str(blk_hdr)
+			if not hash_str in self.blkmap:
+				print("Skipping unknown block " + hash_str)
+				self.inF.seek(inLen, os.SEEK_CUR)
+				continue
+
+			blkHeight = self.blkmap[hash_str]
+			self.blkCountIn += 1
+
+			if self.blkCountOut == blkHeight:
+				# If in-order block, just copy
+				rawblock = self.inF.read(inLen)
+				self.writeBlock(inhdr, blk_hdr, rawblock)
+
+				# See if we can catch up to prior out-of-order blocks
+				while self.blkCountOut in self.blockExtents:
+					self.copyOneBlock()
+
+			else: # If out-of-order, skip over block data for now
+				self.blockExtents[blkHeight] = inExtent
+				if self.outOfOrderSize < self.settings['out_of_order_cache_sz']:
+					# If there is space in the cache, read the data
+					# Reading the data in file sequence instead of seeking and fetching it later is preferred,
+					# but we don't want to fill up memory
+					self.outOfOrderData[blkHeight] = self.inF.read(inLen)
+					self.outOfOrderSize += inLen
+				else: # If no space in cache, seek forward
+					self.inF.seek(inLen, os.SEEK_CUR)
+
+		print("Done (%i blocks written)" % (self.blkCountOut))
 
 if __name__ == '__main__':
 	if len(sys.argv) != 2:
-		print "Usage: linearize-data.py CONFIG-FILE"
+		print("Usage: linearize-data.py CONFIG-FILE")
 		sys.exit(1)
 
 	f = open(sys.argv[1])
@@ -216,22 +275,25 @@ if __name__ == '__main__':
 		settings['split_timestamp'] = 0
 	if 'max_out_sz' not in settings:
 		settings['max_out_sz'] = 1000L * 1000 * 1000
+	if 'out_of_order_cache_sz' not in settings:
+		settings['out_of_order_cache_sz'] = 100 * 1000 * 1000
 
 	settings['max_out_sz'] = long(settings['max_out_sz'])
 	settings['split_timestamp'] = int(settings['split_timestamp'])
 	settings['file_timestamp'] = int(settings['file_timestamp'])
 	settings['netmagic'] = settings['netmagic'].decode('hex')
+	settings['out_of_order_cache_sz'] = int(settings['out_of_order_cache_sz'])
 
 	if 'output_file' not in settings and 'output' not in settings:
 		print("Missing output file / directory")
 		sys.exit(1)
 
 	blkindex = get_block_hashes(settings)
-	blkset = mkblockset(blkindex)
+	blkmap = mkblockmap(blkindex)
 
-	if not "000000000019d6689c085ae165831e934ff763ae46a2a6c172b3f1b60a8ce26f" in blkset:
+	if not "000000000019d6689c085ae165831e934ff763ae46a2a6c172b3f1b60a8ce26f" in blkmap:
 		print("not found")
 	else:
-		copydata(settings, blkindex, blkset)
+		BlockDataCopier(settings, blkindex, blkmap).run()
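
Note on the reordering scheme (illustration only, not part of the patch): BlockDataCopier.run() maps each block read from the blk*.dat files to its height via blkmap, writes a block immediately when it arrives at the expected height, and otherwise parks it -- as raw data in outOfOrderData while the cache is below out_of_order_cache_sz, or as a BlockExtent to be fetched from disk later -- until the writer catches up. The sketch below is a minimal, self-contained Python rendering of that idea; the names reorder and height_of are hypothetical, and the extent/cache-size handling is reduced to a plain in-memory dictionary.

	def reorder(records, height_of):
		'''records: iterable of (key, payload) pairs arriving in arbitrary order;
		height_of: dict mapping key -> height. Yields payloads in height order,
		parking out-of-order arrivals until the writer catches up.'''
		parked = {}        # height -> payload, analogous to outOfOrderData above
		next_height = 0
		for key, payload in records:
			height = height_of[key]
			if height == next_height:
				yield payload
				next_height += 1
				# Flush any previously parked payloads that are now in order
				while next_height in parked:
					yield parked.pop(next_height)
					next_height += 1
			else:
				parked[height] = payload

	if __name__ == '__main__':
		heights = {'a': 0, 'b': 1, 'c': 2, 'd': 3}
		arrivals = [('b', 'B1'), ('a', 'A0'), ('d', 'D3'), ('c', 'C2')]
		print(list(reorder(arrivals, heights)))   # ['A0', 'B1', 'C2', 'D3']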