Skip to content
Snippets Groups Projects
test.cpp 4.66 KiB
Newer Older
  • Learn to ignore specific revisions
  • Daniel Müller's avatar
    Daniel Müller committed
    #include <chrono>
    #include <cmath>
    #include <cstdint>
    #include <exception>
    #include <fstream>
    #include <iomanip>
    #include <iostream>
    #include <mutex>
    #include <set>
    #include <sstream>
    #include <string>
    #include <thread>
    #include <vector>
    
    extern "C"
    {
    #include <sched.h>
    #include <unistd.h>
    }
    
    // Total number of integration steps; the worker loop visits the indices 1..n.
    const long n = 42L * 1024 * 1024 * 100;
    // Width of a single integration step (1 / n).
    const double h = 1.0 / static_cast<double>(n);
    
    // Serialises insertions into used_cpu_ids from concurrently running threads.
    std::mutex used_cpu_ids_mtx{};
    // Ordered, duplicate-free collection of the CPU core IDs that have been recorded so far.
    std::set<int> used_cpu_ids{};
    
    
    // A mutex and macro to sync the output to stdout.
    //
    // IO_SYNC(statements...) executes the given statements inside their own scope while
    // holding iomtx; the lock is released when that scope ends. Anything declared inside
    // the statements is therefore local to the macro invocation.
    //
    // The macro is variadic on purpose: the preprocessor splits macro arguments at every
    // top-level comma, so statements such as "int a, b;" or "std::pair<int, int> p;" would
    // not compile with a single-parameter macro. __VA_ARGS__ re-joins them unchanged.
    std::mutex iomtx;
    #define IO_SYNC(...)                               \
        {                                              \
            std::lock_guard<std::mutex> iolock(iomtx); \
            __VA_ARGS__                                \
        }
    
    // Returns the current reading of std::chrono::high_resolution_clock as the number of
    // whole microseconds elapsed since that clock's epoch.
    uint64_t now_micro()
    {
        namespace chr = std::chrono;

        const auto since_epoch = chr::high_resolution_clock::now().time_since_epoch();
        return chr::duration_cast<chr::microseconds>(since_epoch).count();
    }
    
    // Print stats about the calling thread, including the currently executing CPU Core ID.
    //
    // Writes "pid = <pid>, thread_id = <id>, cpu_id = <cpu>" to std::cout without a trailing
    // newline and records the CPU ID in used_cpu_ids. This function does not lock stdout
    // itself, so callers that may run concurrently have to synchronise the call.
    void print_thread_stats()
    {
        // Get the CPU Core ID on which this thread is currently being executed.
        // sched_getcpu() signals failure with -1; such a value is still printed below, but it
        // is not recorded as a utilized CPU because it does not name a real core.
        const int cpu = sched_getcpu();
        if (cpu >= 0)
        {
            std::lock_guard<std::mutex> lock(used_cpu_ids_mtx);
            used_cpu_ids.insert(cpu);
        }

        std::cout
            // Process ID of this process
            << "pid = "
            << getpid()
            // Thread ID of this thread
            << ", thread_id = "
            << std::this_thread::get_id()
            // CPU Core ID on which this is currently being executed
            << ", cpu_id = "
            << cpu;
    }
    
    // Looks up which logical CPUs share a physical core with the given logical CPU.
    //
    // On Linux, a core with SMT/Hyperthreading shows up as several logical CPUs. A machine with
    // 4 physical cores and SMT may therefore list the CPUs 0, 1, 2, 3, 4, 5, 6, 7, where pairs
    // such as 0,4  1,5  2,6  3,7 (so called siblings) are backed by the same physical core.
    //
    // The sibling list is read from
    //   /sys/devices/system/cpu/cpu<cpu_id>/topology/thread_siblings_list
    // and its first whitespace-delimited token is returned as a string. If the file cannot be
    // opened or read, the returned string is empty.
    std::string cpu_siblings(int cpu_id)
    {
        std::ostringstream sysfs_path;
        sysfs_path << "/sys/devices/system/cpu/cpu" << cpu_id << "/topology/thread_siblings_list";

        std::string sibling_list;
        std::ifstream topology(sysfs_path.str());
        topology >> sibling_list;
        return sibling_list;
    }
    
    // Worker routine: computes one thread's share of the midpoint-rule sum of 4 / (1 + x^2)
    // over the n steps of width h and stores it, scaled by h, in *partial_pi.
    //
    // thread_num  zero-based index of this worker; it handles the steps thread_num + 1,
    //             thread_num + 1 + numThreads, ... up to n
    // numThreads  stride between the steps handled by this worker
    // partial_pi  destination for this worker's partial result
    //
    // Thread stats are printed before the calculation, and again afterwards together with the
    // time this worker spent calculating; both outputs are serialised through IO_SYNC.
    void pi_thread(int thread_num, int numThreads, double *partial_pi)
    {
        IO_SYNC(
            print_thread_stats();
            std::cout << '\n';
        );

        const auto started_at = now_micro();

        double acc = 0.0;
        long step = thread_num + 1;
        while (step <= n)
        {
            // Midpoint of the current step
            const double mid = h * (static_cast<double>(step) - 0.5);
            acc += 4.0 / (1.0 + mid * mid);
            step += numThreads;
        }
        *partial_pi = h * acc;

        const auto spent_us = now_micro() - started_at;

        IO_SYNC(
            print_thread_stats();
            std::cout
                // Microseconds converted to milliseconds for display
                << ", thread_calc_time = "
                << (spent_us / 1000)
                << " ms"
                << '\n';
        );
    }
    
    // Program entry point.
    //
    // Usage: <program> <number-of-threads>
    //
    // Starts the requested number of pi_thread workers, adds up their partial results and
    // prints the error against M_PI, the wall-clock time of the calculation and the CPU cores
    // (with their SMT siblings) that were recorded while the program ran.
    //
    // Returns 0 on success and -1 if the thread count is missing, is not a whole number or is
    // smaller than 1.
    int main(int argc, char *argv[])
    {
        if (argc < 2)
        {
            std::cerr << "Usage: " << argv[0] << " <number-of-threads>" << std::endl;
            return -1;
        }

        // Parse the thread count strictly: std::stoi throws on non-numeric or out-of-range input
        // and silently stops at trailing garbage ("4abc"), so both cases are mapped to "invalid".
        int numThreads = 0;
        try
        {
            const std::string arg = argv[1];
            std::size_t consumed = 0;
            const int parsed = std::stoi(arg, &consumed);
            if (consumed == arg.size())
            {
                numThreads = parsed;
            }
        }
        catch (const std::exception &)
        {
            numThreads = 0;
        }

        if (numThreads < 1)
        {
            std::cerr << "Invalid number of threads: '" << argv[1] << "'\n"
                      << "Usage: " << argv[0] << " <number-of-threads>" << std::endl;
            return -1;
        }

        // gethostname() may leave the buffer unterminated when the name is truncated. Only
        // size - 1 bytes are offered, so the zero-initialised last byte always terminates it.
        char hostname[256] = {};
        const std::string node =
            (gethostname(hostname, sizeof(hostname) - 1) == 0) ? hostname : "<unknown>";

        auto hwc = std::thread::hardware_concurrency();

        std::cout << "Running on node: " << node << '\n';
        std::cout << "CPP detected hardware concurrency: " << hwc << '\n';
        std::cout << "Main thread: ";
        print_thread_stats();
        std::cout << "\n--------------------\n";

        auto tstart = now_micro();

        // Heap-backed containers instead of variable-length arrays (a non-standard extension).
        // partials is fully sized up front, so the element addresses handed to the workers stay
        // valid for the whole run.
        const auto count = static_cast<std::size_t>(numThreads);
        std::vector<double> partials(count, 0.0);
        std::vector<std::thread> threads;
        threads.reserve(count);

        for (int thread_num = 0; thread_num < numThreads; thread_num++)
        {
            threads.emplace_back(
                pi_thread, thread_num, numThreads, partials.data() + thread_num
            );
        }

        // Join in launch order and accumulate each partial result once its worker has finished
        double pi = 0;
        for (std::size_t i = 0; i < count; ++i)
        {
            threads[i].join();
            pi += partials[i];
        }

        auto elapsed_ms = (now_micro() - tstart) / 1000;

        std::cout << "--------------------\n";
        std::cout << std::setprecision(16) << "Error is " << std::fabs(pi - M_PI) << '\n';
        std::cout << "Calculation took " << elapsed_ms << " ms\n";
        std::cout << "Num Threads = " << numThreads << '\n';

        // Print which cores have been utilized over the runtime of the program. Since those are
        // only sampled at specific points, it is theoretically possible that cores are missing.
        std::cout << "Utilized CPU ids: \n";
        for (auto cpu : used_cpu_ids)
        {
            // For each core, also print the siblings
            std::cout << "  " << cpu << ", siblings: " << cpu_siblings(cpu) << '\n';
        }

        return 0;
    }