In this article we write two equivalent programs in C++ and in Java, in exactly the same way to do exactly the same thing: the (in)famous bubble sort algorithm. Then we proceed to measure the latency. On this experiment, Java was faster than C++ even with the -O3
compiler option.
Note: We used the same isolated cpu core for all tests through thread pinning.
Java Version
java version "17.0.1" 2021-10-19 LTS Java(TM) SE Runtime Environment (build 17.0.1+12-LTS-39) Java HotSpot(TM) 64-Bit Server VM (build 17.0.1+12-LTS-39, mixed mode, sharing)
Java
Iterations: 9,000,000 Avg Time: 825.97 nanos StDev: 156.4 nanos Min Time: 781 nanos Max Time: 107335 nanos 75% (6,750,000) = [avg: 801, stdev: 5.2, max: 876] - 25% (2,250,000) = [avg: 899, stdev: 301.03, min: 876] 90% (8,100,000) = [avg: 816, stdev: 32.78, max: 893] - 10% (900,000) = [avg: 915, stdev: 475.5, min: 893] 99% (8,910,000) = [avg: 823, stdev: 39.0, max: 904] - 1% (90,000) = [avg: 1077, stdev: 1493.9, min: 904] 99.9% (8,991,000) = [avg: 824, stdev: 39.62, max: 915] - 0.1% (9,000) = [avg: 2610, stdev: 4439.05, min: 915] 99.99% (8,999,100) = [avg: 824, stdev: 47.73, max: 13448] - 0.01% (900) = [avg: 15264, stdev: 3653.04, min: 13484] 99.999% (8,999,910) = [avg: 825, stdev: 141.2, max: 16186] - 0.001% (90) = [avg: 19503, stdev: 10169.29, min: 16187] 99.9999% (8,999,991) = [avg: 825, stdev: 149.83, max: 25498] - 0.0001% (9) = [avg: 38328, stdev: 24606.03, min: 25787] 99.99999% (8,999,999) = [avg: 825, stdev: 152.32, max: 34315] - 0.00001% (1) = [avg: 107335, stdev: 0.0, min: 107335]
C++ (-O3)
Iterations: 9,000,000 Avg Time: 1229 nanos Stdev: 157.94 nanos Min Time: 1141 nanos Max Time: 35318 nanos 75% (6,750,000) = [avg: 1183, stdev: 7.59, max: 1197] - 25% (2,250,000) = [avg: 1364, stdev: 273.80, min: 1197] 90% (8,100,000) = [avg: 1203, stdev: 63.25, max: 1431] - 10% (900,000) = [avg: 1458, stdev: 393.68, min: 1431] 99% (8,910,000) = [avg: 1225, stdev: 91.58, max: 1462] - 1% (90,000) = [avg: 1595, stdev: 1236.27, min: 1462] 99.9% (8,991,000) = [avg: 1227, stdev: 94.04, max: 1484] - 0.1% (9,000) = [avg: 2732, stdev: 3721.22, min: 1484] 99.99% (8,999,100) = [avg: 1227, stdev: 95.25, max: 2305] - 0.01% (900) = [avg: 12314, stdev: 5985.26, min: 2305] 99.999% (8,999,910) = [avg: 1228, stdev: 146.17, max: 16629] - 0.001% (90) = [avg: 19763, stdev: 3783.30, min: 16631] 99.9999% (8,999,991) = [avg: 1229, stdev: 155.39, max: 23087] - 0.0001% (9) = [avg: 29166, stdev: 4156.72, min: 24677] 99.99999% (8,999,999) = [avg: 1229, stdev: 157.53, max: 34189] - 0.00001% (1) = [avg: 35318, stdev: 0.00, min: 35318]
Java Source Code
package com.coralblocks.coralthreads.sample; import java.util.Arrays; import com.coralblocks.coralbits.bench.Benchmarker; import com.coralblocks.coralbits.util.OSUtils; import com.coralblocks.coralbits.util.SystemUtils; import com.coralblocks.coralthreads.Affinity; public class TestPerformance { // java -server -verbose:gc -cp ./target/classes:./target/coralthreads-all.jar:coralthreads-all.jar -DcoralThreadsVerbose=false -DbenchWorstPercs=true -DbenchTotals=true -DbenchStdev=true -DbenchMorePercs=true -DdetailedBenchmarker=true -DprocToBind=1 -DexcludeNanoTimeCost=true com.coralblocks.coralthreads.sample.TestPerformance 10000000 1000000 60 private static int[] HEAP_ARRAY; public static void main(String[] args) { int iterations = Integer.parseInt(args[0]); int warmup = Integer.parseInt(args[1]); int arraySize = Integer.parseInt(args[2]); int procToBind = SystemUtils.getInt("procToBind", -1); if (procToBind != -1 && OSUtils.isLinux()) { Affinity.set(procToBind); } HEAP_ARRAY = new int[arraySize]; Benchmarker bench = Benchmarker.create(warmup); long x = 0; for(int i = 0; i < iterations; i++) { bench.mark(); doSomething(HEAP_ARRAY, HEAP_ARRAY.length); bench.measure(); for(int j = 0; j < HEAP_ARRAY.length; j++) { x += HEAP_ARRAY[j]; } } System.out.println("Value computed: " + x); System.out.println("Array: " + Arrays.toString(HEAP_ARRAY)); bench.printResults(); } private static void swapping(int[] array, int x, int y) { int temp = array[x]; array[x] = array[y]; array[y] = temp; } private static void bubbleSort(int[] array, int size) { for(int i = 0; i < size; i++) { int swaps = 0; // flag to detect any swap is there or not for(int j = 0; j < size - i - 1; j++) { if (array[j] > array[j + 1]) { // when the current item is bigger than next swapping(array, j, j + 1); swaps = 1; } } if (swaps == 0) break; // No swap in this pass, so array is sorted } } /* * For speed, it is important to extract the hot code (i.e. the code executed in a loop) to its own method so the JIT can inline/optimize/compile. * * Note that the main() method above is executed only once. */ private final static void doSomething(int[] array, int size) { for(int z = 0; z < size; z++) { array[z] = size - z; } bubbleSort(array, size); } }
C++ Source Code
#include <iostream> #include <string> #include <random> #include <cmath> #include <algorithm> #include <limits> #include <sys/time.h> #include <map> #include <sched.h> #include <sstream> #include <iomanip> using namespace std; // TO COMPILE: g++ TestPerformance.cpp -o TestPerformance -std=c++11 -O3 // TO EXECUTE: ./TestPerformance 10000000 1000000 60 1 static const bool MORE_PERCS = true; static const bool INCLUDE_WORST_PERCS = true; static const bool INCLUDE_TOTALS = true; static const bool INCLUDE_RATIOS = false; static const bool INCLUDE_STDEV = true; static const bool EXCLUDE_NANO_TS_COST = true; long get_nano_ts(timespec* ts) { clock_gettime(CLOCK_MONOTONIC, ts); return ts->tv_sec * 1000000000 + ts->tv_nsec; } static const long NANO_COST_ITERATIONS = 10000000; static long calc_nano_ts_cost() { struct timespec ts; long start = get_nano_ts(&ts); long finish = start; for (long i = 0; i < NANO_COST_ITERATIONS; i++) { finish = get_nano_ts(&ts); } finish = get_nano_ts(&ts); return (finish - start) / NANO_COST_ITERATIONS; } struct mi { long value; }; void add_perc(stringstream& ss, int size, double perc, map<int, mi*>* map) { if (map->empty()) return; int max = -1; int minBottom = -1; long x = round(perc * size); long i = 0; long iBottom = 0; long sum = 0; long sumBottom = 0; bool trueForTopFalseForBottom = true; bool flag = false; const int arraySize = 1024 * 1024 * 10; int* tempData = new int[arraySize]; double stdevTop = -1; for(auto iter = map->begin(); iter != map->end(); iter++) { if (flag) break; int time = iter->first; long count = (iter->second)->value; for(int a = 0; a < count; a++) { if (trueForTopFalseForBottom) { tempData[i] = time; i++; sum += time; if (i == x) { max = time; if (INCLUDE_STDEV) { double avg = (double) sum / (double) i; double temp = 0; for(int b = 0; b < i; b++) { int t = tempData[b]; temp += (avg - t) * (avg - t); } stdevTop = sqrt(((double) temp / (double) i)); } if (INCLUDE_WORST_PERCS) { trueForTopFalseForBottom = false; } else { flag = true; break; } } } else { tempData[iBottom] = time; iBottom++; sumBottom += time; if (minBottom == -1) { minBottom = time; } } } } ss << " | " << fixed << setprecision(5) << (perc * 100) << "%"; if (INCLUDE_TOTALS) ss << " (" << i << ")"; ss << " = [avg: " << (sum / i); if (INCLUDE_STDEV) ss << ", stdev: " << fixed << setprecision(2) << stdevTop; ss << ", max: " << max << "]"; if (INCLUDE_WORST_PERCS) { ss << " - " << fixed << setprecision(5) << ((1 - perc) * 100) << "%"; if (INCLUDE_TOTALS) ss << " (" << (iBottom > 0 ? iBottom : 0) << ")"; ss << " = [avg: " << (iBottom > 0 ? (sumBottom / iBottom) : -1); if (INCLUDE_STDEV) { ss << ", stdev: "; if (iBottom <= 0) { ss << "?"; } else { double avgBottom = (sumBottom / iBottom); double temp = 0; for(int b = 0; b < iBottom; b++) { long t = tempData[b]; temp += (avgBottom - t) * (avgBottom - t); } double stdevBottom = sqrt((double) temp / (double) iBottom); ss << fixed << setprecision(2) << stdevBottom; } } ss << ", min: " << (minBottom != -1 ? minBottom : -1) << "]"; if (INCLUDE_RATIOS) ss << " R: " << fixed << setprecision(2) << (iBottom > 0 ? ( ((double) (sumBottom / iBottom) / (double) (sum / i) ) - 1) * 100 : -1) << "%"; } delete[] tempData; } void swapping(int &a, int &b) { //swap the content of a and b int temp; temp = a; a = b; b = temp; } void display(int *array, int size) { for(int i = 0; i<size; i++) cout << array[i] << " "; cout << endl; } void bubbleSort(int *array, int size) { for(int i = 0; i<size; i++) { int swaps = 0; //flag to detect any swap is there or not for(int j = 0; j<size-i-1; j++) { if(array[j] > array[j+1]) { //when the current item is bigger than next swapping(array[j], array[j+1]); swaps = 1; //set swap flag } } if(!swaps) break; // No swap in this pass, so array is sorted } } void doSomething(int *array, int size) { for(int z = 0; z < size; z++) { array[z] = size - z; } bubbleSort(array, size); } int main(int argc, char* argv[]) { int iterations = stoi(argv[1]); int warmup = stoi(argv[2]); int arraySize = stoi(argv[3]); int proc = stoi(argv[4]); cpu_set_t my_set; CPU_ZERO(&my_set); CPU_SET(proc, &my_set); sched_setaffinity(0, sizeof(cpu_set_t), &my_set); long nanoTimeCost = EXCLUDE_NANO_TS_COST ? calc_nano_ts_cost() : 0; struct timespec ts; long long x = 0; long long totalTime = 0; int minTime = numeric_limits<int>::max(); int maxTime = numeric_limits<int>::min(); map<int, mi*>* results = new map<int, mi*>(); int * array = (int*) malloc(arraySize * sizeof(int)); for(int i = 0; i < iterations; i++) { long start = get_nano_ts(&ts); doSomething(array, arraySize); long end = get_nano_ts(&ts); for(int j = 0; j < arraySize; j++) { x += array[j]; } int res = end - start - nanoTimeCost; if (res <= 0) res = 1; if (i >= warmup) { totalTime += res; minTime = min(minTime, res); maxTime = max(maxTime, res); auto iter = results->find(res); if (iter != results->end()) { (iter->second)->value = (iter->second)->value + 1; } else { mi* elem = new mi(); elem->value = 1; (*results)[res] = elem; } } } int count = iterations - warmup; double avg = totalTime / count; cout << "Value computed: " << x << endl; display(array, arraySize); cout << "Nano timestamp cost: " << nanoTimeCost << endl; free(array); stringstream ss; ss << "Iterations: " << count << " | Avg Time: " << avg; if (INCLUDE_STDEV) { long temp = 0; long x = 0; for(auto iter = results->begin(); iter != results->end(); iter++) { int time = iter->first; long count = (iter->second)->value; for(int a = 0; a < count; a++) { temp += (avg - time) * (avg - time); x++; } } double stdev = sqrt( temp / x ); ss << " | Stdev: " << fixed << setprecision(2) << stdev; } if (count > 0) { ss << " | Min Time: " << minTime << " | Max Time: " << maxTime; } add_perc(ss, count, 0.75, results); add_perc(ss, count, 0.90, results); add_perc(ss, count, 0.99, results); add_perc(ss, count, 0.999, results); add_perc(ss, count, 0.9999, results); add_perc(ss, count, 0.99999, results); if (MORE_PERCS) { add_perc(ss, count, 0.999999, results); add_perc(ss, count, 0.9999999, results); } cout << ss.str() << endl << endl; delete results; return 0; }