Merge #9202: bench: Add support for measuring CPU cycles

3532818 bench: Add support for measuring CPU cycles (Wladimir J. van der Laan)
8 years ago · e56cf67e6b
5 changed files with 121 additions and 5 deletions
--- a/src/Makefile.bench.include
+++ b/src/Makefile.bench.include
@ -22,7 +22,9 @@ bench_bench_bitcoin_SOURCES = \
  bench/mempool_eviction.cpp \
  bench/verify_script.cpp \
  bench/base58.cpp \
-  bench/lockedpool.cpp
+  bench/lockedpool.cpp \
  bench/perf.cpp \
  bench/perf.h
 nodist_bench_bench_bitcoin_SOURCES = $(GENERATED_TEST_FILES)
--- a/src/bench/bench.cpp
+++ b/src/bench/bench.cpp
@ -3,6 +3,7 @@
 // file COPYING or http://www.opensource.org/licenses/mit-license.php.
 #include "bench.h"
 #include "perf.h"
 #include <iostream>
 #include <iomanip>
@ -26,7 +27,9 @@ BenchRunner::BenchRunner(std::string name, BenchFunction func)
 void
 BenchRunner::RunAll(double elapsedTimeForOne)
 {
-    std::cout << "#Benchmark" << "," << "count" << "," << "min" << "," << "max" << "," << "average" << "\n";
+    perf_init();
    std::cout << "#Benchmark" << "," << "count" << "," << "min" << "," << "max" << "," << "average" << ","
              << "min_cycles" << "," << "max_cycles" << "," << "average_cycles" << "\n";
    for (std::map<std::string,BenchFunction>::iterator it = benchmarks.begin();
         it != benchmarks.end(); ++it) {
@ -35,6 +38,7 @@ BenchRunner::RunAll(double elapsedTimeForOne)
        BenchFunction& func = it->second;
        func(state);
    }
    perf_fini();
 }
 bool State::KeepRunning()
@ -44,8 +48,10 @@ bool State::KeepRunning()
      return true;
    }
    double now;
    uint64_t nowCycles;
    if (count == 0) {
        lastTime = beginTime = now = gettimedouble();
        lastCycles = beginCycles = nowCycles = perf_cpucycles();
    }
    else {
        now = gettimedouble();
@ -53,6 +59,13 @@ bool State::KeepRunning()
        double elapsedOne = elapsed * countMaskInv;
        if (elapsedOne < minTime) minTime = elapsedOne;
        if (elapsedOne > maxTime) maxTime = elapsedOne;
        // We only use relative values, so don't have to handle 64-bit wrap-around specially
        nowCycles = perf_cpucycles();
        uint64_t elapsedOneCycles = (nowCycles - lastCycles) * countMaskInv;
        if (elapsedOneCycles < minCycles) minCycles = elapsedOneCycles;
        if (elapsedOneCycles > maxCycles) maxCycles = elapsedOneCycles;
        if (elapsed*128 < maxElapsed) {
          // If the execution was much too fast (1/128th of maxElapsed), increase the count mask by 8x and restart timing.
          // The restart avoids including the overhead of this code in the measurement.
@ -61,6 +74,8 @@ bool State::KeepRunning()
          count = 0;
          minTime = std::numeric_limits<double>::max();
          maxTime = std::numeric_limits<double>::min();
          minCycles = std::numeric_limits<uint64_t>::max();
          maxCycles = std::numeric_limits<uint64_t>::min();
          return true;
        }
        if (elapsed*16 < maxElapsed) {
@ -72,6 +87,7 @@ bool State::KeepRunning()
        }
    }
    lastTime = now;
    lastCycles = nowCycles;
    ++count;
    if (now - beginTime < maxElapsed) return true; // Keep going
@ -80,7 +96,9 @@ bool State::KeepRunning()
    // Output results
    double average = (now-beginTime)/count;
-    std::cout << std::fixed << std::setprecision(15) << name << "," << count << "," << minTime << "," << maxTime << "," << average << "\n";
+    int64_t averageCycles = (nowCycles-beginCycles)/count;
    std::cout << std::fixed << std::setprecision(15) << name << "," << count << "," << minTime << "," << maxTime << "," << average << ","
              << minCycles << "," << maxCycles << "," << averageCycles << "\n";
    return false;
 }
--- a/src/bench/bench.h
+++ b/src/bench/bench.h
@ -41,12 +41,18 @@ namespace benchmark {
        double maxElapsed;
        double beginTime;
        double lastTime, minTime, maxTime, countMaskInv;
-        int64_t count;
+        uint64_t count;
-        int64_t countMask;
+        uint64_t countMask;
        uint64_t beginCycles;
        uint64_t lastCycles;
        uint64_t minCycles;
        uint64_t maxCycles;
    public:
        State(std::string _name, double _maxElapsed) : name(_name), maxElapsed(_maxElapsed), count(0) {
            minTime = std::numeric_limits<double>::max();
            maxTime = std::numeric_limits<double>::min();
            minCycles = std::numeric_limits<uint64_t>::max();
            maxCycles = std::numeric_limits<uint64_t>::min();
            countMask = 1;
            countMaskInv = 1./(countMask + 1);
        }
--- a/src/bench/perf.cpp
+++ b/src/bench/perf.cpp
@ -0,0 +1,53 @@
 // Copyright (c) 2016 The Bitcoin Core developers
 // Distributed under the MIT software license, see the accompanying
 // file COPYING or http://www.opensource.org/licenses/mit-license.php.
 #include "perf.h"
 #if defined(__i386__) || defined(__x86_64__)
 /* These architectures support querying the cycle counter
 * from user space, no need for any syscall overhead.
 */
 /* No setup is needed here: perf.h reads the counter inline on x86. */
 void perf_init(void)
 {
 }
 /* perf_init() acquired nothing, so there is nothing to release. */
 void perf_fini(void)
 {
 }
 #elif defined(__linux__)
 #include <unistd.h>
 #include <sys/syscall.h>
 #include <linux/perf_event.h>
 /* Descriptor of the opened cycle counter event; -1 while no counter is open. */
 static int fd = -1;
 /* Event description passed to perf_event_open(); static storage, so every
  * field that perf_init() does not set is zero.
  */
 static struct perf_event_attr attr;
 /* Open a hardware CPU-cycle counter through the perf_event_open syscall.
  *
  * On failure the syscall returns -1, which leaves fd at -1; perf_cpucycles()
  * then reports 0 instead of a cycle count.
  */
 void perf_init(void)
 {
    attr.type = PERF_TYPE_HARDWARE;
    // Tell the kernel which layout of the structure we were compiled against,
    // as perf_event_open(2) asks callers to do.
    attr.size = sizeof(attr);
    attr.config = PERF_COUNT_HW_CPU_CYCLES;
    // Arguments after &attr: pid 0, cpu -1, group_fd -1, flags 0.
    // syscall() returns long; a file descriptor (or -1) always fits in int.
    fd = (int)syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
 }
 /* Close the cycle counter descriptor, if one is open.
  *
  * fd is reset to -1 afterwards so that a repeated perf_fini() does not close
  * the same descriptor number twice, and so that a later perf_cpucycles() takes
  * its "no counter" path instead of reading a closed (or reused) descriptor.
  */
 void perf_fini(void)
 {
    if (fd != -1) {
        close(fd);
        fd = -1;
    }
 }
 /* Read the current value of the cycle counter from fd.
  *
  * Returns 0 when no counter is open or when the read does not deliver a
  * complete 64-bit value.
  */
 uint64_t perf_cpucycles(void)
 {
    if (fd == -1) {
        return 0;
    }
    uint64_t cycles = 0;
    const ssize_t got = read(fd, &cycles, sizeof(cycles));
    if (got < (ssize_t)sizeof(cycles)) {
        return 0;
    }
    return cycles;
 }
 #else /* Unhandled platform */
 /* No cycle counter is available on this platform: setup and teardown do
  * nothing, and the reported cycle count is always 0.
  */
 void perf_init(void)
 {
 }
 void perf_fini(void)
 {
 }
 uint64_t perf_cpucycles(void)
 {
    return 0;
 }
 #endif
--- a/src/bench/perf.h
+++ b/src/bench/perf.h
@ -0,0 +1,37 @@
 // Copyright (c) 2016 The Bitcoin Core developers
 // Distributed under the MIT software license, see the accompanying
 // file COPYING or http://www.opensource.org/licenses/mit-license.php.
 /** Functions for measurement of CPU cycles */
 #ifndef H_PERF
 #define H_PERF
 #include <stdint.h>
 #if defined(__i386__)
 /* Read the processor's counter directly from user space on 32-bit x86.
  * Returns the 64-bit value produced by the instruction below.
  */
 static inline uint64_t perf_cpucycles(void)
 {
    uint64_t x;
    // 0x0f 0x31 is the encoding of RDTSC, emitted as raw bytes rather than by
    // mnemonic. The "A" constraint binds the 64-bit output to the edx:eax
    // register pair, which is where the instruction leaves its result.
    __asm__ volatile (".byte 0x0f, 0x31" : "=A" (x));
    return x;
 }
 #elif defined(__x86_64__)
 /* Read the processor's counter with RDTSC on x86-64.
  * The instruction delivers two 32-bit halves (eax = low, edx = high); they are
  * joined into a single 64-bit return value.
  */
 static inline uint64_t perf_cpucycles(void)
 {
    uint32_t high, low;
    __asm__ __volatile__ ("rdtsc" : "=a"(low), "=d"(high));
    uint64_t cycles = high;
    cycles <<= 32;
    cycles |= low;
    return cycles;
 }
 #else
 uint64_t perf_cpucycles(void);
 #endif
 /** Prepare cycle measurement. A no-op where perf_cpucycles() is defined
  *  inline above; other platforms may acquire a counter here.
  */
 void perf_init(void);
 /** Release whatever perf_init() acquired. */
 void perf_fini(void);
 #endif // H_PERF