Browse Source

Merge #9373: Linearize script update (hash byte reversal and Python 3 support)

3c8f63b Make linearize scripts Python 3-compatible. (Doug)
d5aa198 Allow linearization scripts to support hash byte reversal (Doug)
0.14
Wladimir J. van der Laan 8 years ago
parent
commit
406f35d99d
No known key found for this signature in database
GPG Key ID: 74810B012346C9A6
  1. 45
      contrib/linearize/README.md
  2. 4
      contrib/linearize/example-linearize.cfg
  3. 43
      contrib/linearize/linearize-data.py
  4. 36
      contrib/linearize/linearize-hashes.py

45
contrib/linearize/README.md

@ -1,33 +1,48 @@
# Linearize # Linearize
Construct a linear, no-fork, best version of the blockchain. Construct a linear, no-fork, best version of the Bitcoin blockchain. The scripts
run using Python 3 but are compatible with Python 2.
## Step 1: Download hash list ## Step 1: Download hash list
$ ./linearize-hashes.py linearize.cfg > hashlist.txt $ ./linearize-hashes.py linearize.cfg > hashlist.txt
Required configuration file settings for linearize-hashes: Required configuration file settings for linearize-hashes:
* RPC: rpcuser, rpcpassword * RPC: `rpcuser`, `rpcpassword`
Optional config file setting for linearize-hashes: Optional config file setting for linearize-hashes:
* RPC: host, port * RPC: `host` (Default: `127.0.0.1`)
* Block chain: min_height, max_height * RPC: `port` (Default: `8332`)
* Blockchain: `min_height`, `max_height`
* `rev_hash_bytes`: If true, the written block hash list will be
byte-reversed. (In other words, the hash returned by getblockhash will have its
bytes reversed.) False by default. Intended for generation of
standalone hash lists but safe to use with linearize-data.py, which will output
the same data no matter which byte format is chosen.
The `linearize-hashes` script requires a connection, local or remote, to a
JSON-RPC server. Running `bitcoind` or `bitcoin-qt -server` will be sufficient.
## Step 2: Copy local block data ## Step 2: Copy local block data
$ ./linearize-data.py linearize.cfg $ ./linearize-data.py linearize.cfg
Required configuration file settings: Required configuration file settings:
* "input": bitcoind blocks/ directory containing blkNNNNN.dat * `output_file`: The file that will contain the final blockchain.
* "hashlist": text file containing list of block hashes, linearized-hashes.py
output.
* "output_file": bootstrap.dat
or or
* "output": output directory for linearized blocks/blkNNNNN.dat output * `output`: Output directory for linearized `blocks/blkNNNNN.dat` output.
Optional config file setting for linearize-data: Optional config file setting for linearize-data:
* "netmagic": network magic number * `file_timestamp`: Set each file's last-modified time to that of the most
* "max_out_sz": maximum output file size (default `1000*1000*1000`) recent block in that file.
* "split_timestamp": Split files when a new month is first seen, in addition to * `genesis`: The hash of the genesis block in the blockchain.
reaching a maximum file size. * `input`: bitcoind blocks/ directory containing blkNNNNN.dat
* "file_timestamp": Set each file's last-modified time to that of the * `hashlist`: text file containing list of block hashes created by
most recent block in that file. linearize-hashes.py.
* `max_out_sz`: Maximum size for files created by the `output_file` option.
(Default: `1000*1000*1000` bytes)
* `netmagic`: Network magic number.
* `rev_hash_bytes`: If true, the block hash list written by linearize-hashes.py
will be byte-reversed when read by linearize-data.py. See the linearize-hashes
entry for more information.
* `split_timestamp`: Split blockchain files when a new month is first seen, in
addition to reaching a maximum file size (`max_out_sz`).

4
contrib/linearize/example-linearize.cfg

@ -23,7 +23,9 @@ input=/home/example/.bitcoin/blocks
output_file=/home/example/Downloads/bootstrap.dat output_file=/home/example/Downloads/bootstrap.dat
hashlist=hashlist.txt hashlist=hashlist.txt
split_year=1
# Maximum size in bytes of out-of-order blocks cache in memory # Maximum size in bytes of out-of-order blocks cache in memory
out_of_order_cache_sz = 100000000 out_of_order_cache_sz = 100000000
# Do we want to reverse the hash bytes coming from getblockhash?
rev_hash_bytes = False

43
contrib/linearize/linearize-data.py

@ -1,4 +1,4 @@
#!/usr/bin/env python #!/usr/bin/env python3
# #
# linearize-data.py: Construct a linear, no-fork version of the chain. # linearize-data.py: Construct a linear, no-fork version of the chain.
# #
@ -8,23 +8,33 @@
# #
from __future__ import print_function, division from __future__ import print_function, division
try: # Python 3
import http.client as httplib
except ImportError: # Python 2
import httplib
import json import json
import struct import struct
import re import re
import os import os
import os.path import os.path
import base64 import base64
import httplib
import sys import sys
import hashlib import hashlib
import datetime import datetime
import time import time
from collections import namedtuple from collections import namedtuple
from binascii import hexlify, unhexlify
settings = {} settings = {}
##### Switch endian-ness #####
def hex_switchEndian(s):
    """Switch the endianness of a hex string.

    The string is split into consecutive two-character groups (one group
    per byte) and the order of the groups is reversed; the characters
    inside each group keep their order.

    s: hex string, e.g. a block hash as text.

    Returns a new string of the same length. An empty string yields an
    empty string. If the length is odd, the trailing single character
    forms its own group and therefore ends up first in the result.
    """
    # Work on text directly: encoding every pair to bytes and decoding the
    # joined result is a no-op round-trip for string input.
    pairs = [s[i:i + 2] for i in range(0, len(s), 2)]
    return ''.join(reversed(pairs))
def uint32(x): def uint32(x):
return x & 0xffffffffL return x & 0xffffffff
def bytereverse(x): def bytereverse(x):
return uint32(( ((x) << 24) | (((x) << 8) & 0x00ff0000) | return uint32(( ((x) << 24) | (((x) << 8) & 0x00ff0000) |
@ -35,14 +45,14 @@ def bufreverse(in_buf):
for i in range(0, len(in_buf), 4): for i in range(0, len(in_buf), 4):
word = struct.unpack('@I', in_buf[i:i+4])[0] word = struct.unpack('@I', in_buf[i:i+4])[0]
out_words.append(struct.pack('@I', bytereverse(word))) out_words.append(struct.pack('@I', bytereverse(word)))
return ''.join(out_words) return b''.join(out_words)
def wordreverse(in_buf): def wordreverse(in_buf):
out_words = [] out_words = []
for i in range(0, len(in_buf), 4): for i in range(0, len(in_buf), 4):
out_words.append(in_buf[i:i+4]) out_words.append(in_buf[i:i+4])
out_words.reverse() out_words.reverse()
return ''.join(out_words) return b''.join(out_words)
def calc_hdr_hash(blk_hdr): def calc_hdr_hash(blk_hdr):
hash1 = hashlib.sha256() hash1 = hashlib.sha256()
@ -59,7 +69,7 @@ def calc_hash_str(blk_hdr):
hash = calc_hdr_hash(blk_hdr) hash = calc_hdr_hash(blk_hdr)
hash = bufreverse(hash) hash = bufreverse(hash)
hash = wordreverse(hash) hash = wordreverse(hash)
hash_str = hash.encode('hex') hash_str = hexlify(hash).decode('utf-8')
return hash_str return hash_str
def get_blk_dt(blk_hdr): def get_blk_dt(blk_hdr):
@ -69,17 +79,21 @@ def get_blk_dt(blk_hdr):
dt_ym = datetime.datetime(dt.year, dt.month, 1) dt_ym = datetime.datetime(dt.year, dt.month, 1)
return (dt_ym, nTime) return (dt_ym, nTime)
# When getting the list of block hashes, undo any byte reversals.
def get_block_hashes(settings): def get_block_hashes(settings):
blkindex = [] blkindex = []
f = open(settings['hashlist'], "r") f = open(settings['hashlist'], "r")
for line in f: for line in f:
line = line.rstrip() line = line.rstrip()
if settings['rev_hash_bytes'] == 'true':
line = hex_switchEndian(line)
blkindex.append(line) blkindex.append(line)
print("Read " + str(len(blkindex)) + " hashes") print("Read " + str(len(blkindex)) + " hashes")
return blkindex return blkindex
# The block map shouldn't give or receive byte-reversed hashes.
def mkblockmap(blkindex): def mkblockmap(blkindex):
blkmap = {} blkmap = {}
for height,hash in enumerate(blkindex): for height,hash in enumerate(blkindex):
@ -207,7 +221,7 @@ class BlockDataCopier:
inMagic = inhdr[:4] inMagic = inhdr[:4]
if (inMagic != self.settings['netmagic']): if (inMagic != self.settings['netmagic']):
print("Invalid magic: " + inMagic.encode('hex')) print("Invalid magic: " + hexlify(inMagic).decode('utf-8'))
return return
inLenLE = inhdr[4:] inLenLE = inhdr[4:]
su = struct.unpack("<I", inLenLE) su = struct.unpack("<I", inLenLE)
@ -265,6 +279,12 @@ if __name__ == '__main__':
settings[m.group(1)] = m.group(2) settings[m.group(1)] = m.group(2)
f.close() f.close()
# Force hash byte format setting to be lowercase to make comparisons easier.
# Also place upfront in case any settings need to know about it.
if 'rev_hash_bytes' not in settings:
settings['rev_hash_bytes'] = 'false'
settings['rev_hash_bytes'] = settings['rev_hash_bytes'].lower()
if 'netmagic' not in settings: if 'netmagic' not in settings:
settings['netmagic'] = 'f9beb4d9' settings['netmagic'] = 'f9beb4d9'
if 'genesis' not in settings: if 'genesis' not in settings:
@ -278,14 +298,14 @@ if __name__ == '__main__':
if 'split_timestamp' not in settings: if 'split_timestamp' not in settings:
settings['split_timestamp'] = 0 settings['split_timestamp'] = 0
if 'max_out_sz' not in settings: if 'max_out_sz' not in settings:
settings['max_out_sz'] = 1000L * 1000 * 1000 settings['max_out_sz'] = 1000 * 1000 * 1000
if 'out_of_order_cache_sz' not in settings: if 'out_of_order_cache_sz' not in settings:
settings['out_of_order_cache_sz'] = 100 * 1000 * 1000 settings['out_of_order_cache_sz'] = 100 * 1000 * 1000
settings['max_out_sz'] = long(settings['max_out_sz']) settings['max_out_sz'] = int(settings['max_out_sz'])
settings['split_timestamp'] = int(settings['split_timestamp']) settings['split_timestamp'] = int(settings['split_timestamp'])
settings['file_timestamp'] = int(settings['file_timestamp']) settings['file_timestamp'] = int(settings['file_timestamp'])
settings['netmagic'] = settings['netmagic'].decode('hex') settings['netmagic'] = unhexlify(settings['netmagic'].encode('utf-8'))
settings['out_of_order_cache_sz'] = int(settings['out_of_order_cache_sz']) settings['out_of_order_cache_sz'] = int(settings['out_of_order_cache_sz'])
if 'output_file' not in settings and 'output' not in settings: if 'output_file' not in settings and 'output' not in settings:
@ -295,9 +315,8 @@ if __name__ == '__main__':
blkindex = get_block_hashes(settings) blkindex = get_block_hashes(settings)
blkmap = mkblockmap(blkindex) blkmap = mkblockmap(blkindex)
# Block hash map won't be byte-reversed. Neither should the genesis hash.
if not settings['genesis'] in blkmap: if not settings['genesis'] in blkmap:
print("Genesis block not found in hashlist") print("Genesis block not found in hashlist")
else: else:
BlockDataCopier(settings, blkindex, blkmap).run() BlockDataCopier(settings, blkindex, blkmap).run()

36
contrib/linearize/linearize-hashes.py

@ -1,4 +1,4 @@
#!/usr/bin/env python #!/usr/bin/env python3
# #
# linearize-hashes.py: List blocks in a linear, no-fork version of the chain. # linearize-hashes.py: List blocks in a linear, no-fork version of the chain.
# #
@ -8,32 +8,47 @@
# #
from __future__ import print_function from __future__ import print_function
try: # Python 3
import http.client as httplib
except ImportError: # Python 2
import httplib
import json import json
import struct import struct
import re import re
import base64 import base64
import httplib
import sys import sys
settings = {} settings = {}
##### Switch endian-ness #####
def hex_switchEndian(s):
    """Switch the endianness of a hex string.

    The string is split into consecutive two-character groups (one group
    per byte) and the order of the groups is reversed; the characters
    inside each group keep their order.

    s: hex string, e.g. the 'result' of a getblockhash reply.

    Returns a new string of the same length. An empty string yields an
    empty string. If the length is odd, the trailing single character
    forms its own group and therefore ends up first in the result.
    """
    # Work on text directly: encoding every pair to bytes and decoding the
    # joined result is a no-op round-trip for string input.
    pairs = [s[i:i + 2] for i in range(0, len(s), 2)]
    return ''.join(reversed(pairs))
class BitcoinRPC: class BitcoinRPC:
def __init__(self, host, port, username, password): def __init__(self, host, port, username, password):
authpair = "%s:%s" % (username, password) authpair = "%s:%s" % (username, password)
self.authhdr = "Basic %s" % (base64.b64encode(authpair)) authpair = authpair.encode('utf-8')
self.conn = httplib.HTTPConnection(host, port, False, 30) self.authhdr = b"Basic " + base64.b64encode(authpair)
self.conn = httplib.HTTPConnection(host, port=port, timeout=30)
def execute(self, obj): def execute(self, obj):
try:
self.conn.request('POST', '/', json.dumps(obj), self.conn.request('POST', '/', json.dumps(obj),
{ 'Authorization' : self.authhdr, { 'Authorization' : self.authhdr,
'Content-type' : 'application/json' }) 'Content-type' : 'application/json' })
except ConnectionRefusedError:
print('RPC connection refused. Check RPC settings and the server status.',
file=sys.stderr)
return None
resp = self.conn.getresponse() resp = self.conn.getresponse()
if resp is None: if resp is None:
print("JSON-RPC: no response", file=sys.stderr) print("JSON-RPC: no response", file=sys.stderr)
return None return None
body = resp.read() body = resp.read().decode('utf-8')
resp_obj = json.loads(body) resp_obj = json.loads(body)
return resp_obj return resp_obj
@ -64,12 +79,17 @@ def get_block_hashes(settings, max_blocks_per_call=10000):
batch.append(rpc.build_request(x, 'getblockhash', [height + x])) batch.append(rpc.build_request(x, 'getblockhash', [height + x]))
reply = rpc.execute(batch) reply = rpc.execute(batch)
if reply is None:
print('Cannot continue. Program will halt.')
return None
for x,resp_obj in enumerate(reply): for x,resp_obj in enumerate(reply):
if rpc.response_is_error(resp_obj): if rpc.response_is_error(resp_obj):
print('JSON-RPC: error at height', height+x, ': ', resp_obj['error'], file=sys.stderr) print('JSON-RPC: error at height', height+x, ': ', resp_obj['error'], file=sys.stderr)
exit(1) exit(1)
assert(resp_obj['id'] == x) # assume replies are in-sequence assert(resp_obj['id'] == x) # assume replies are in-sequence
if settings['rev_hash_bytes'] == 'true':
resp_obj['result'] = hex_switchEndian(resp_obj['result'])
print(resp_obj['result']) print(resp_obj['result'])
height += num_blocks height += num_blocks
@ -101,6 +121,8 @@ if __name__ == '__main__':
settings['min_height'] = 0 settings['min_height'] = 0
if 'max_height' not in settings: if 'max_height' not in settings:
settings['max_height'] = 313000 settings['max_height'] = 313000
if 'rev_hash_bytes' not in settings:
settings['rev_hash_bytes'] = 'false'
if 'rpcuser' not in settings or 'rpcpassword' not in settings: if 'rpcuser' not in settings or 'rpcpassword' not in settings:
print("Missing username and/or password in cfg file", file=stderr) print("Missing username and/or password in cfg file", file=stderr)
sys.exit(1) sys.exit(1)
@ -109,5 +131,7 @@ if __name__ == '__main__':
settings['min_height'] = int(settings['min_height']) settings['min_height'] = int(settings['min_height'])
settings['max_height'] = int(settings['max_height']) settings['max_height'] = int(settings['max_height'])
get_block_hashes(settings) # Force hash byte format setting to be lowercase to make comparisons easier.
settings['rev_hash_bytes'] = settings['rev_hash_bytes'].lower()
get_block_hashes(settings)

Loading…
Cancel
Save