#882

Committed 31 Aug 2023 07:44PM UTC coverage: 41.798% (-44.7%) from 86.546%

Build # #882

Build Type

push

Committed by

Commit Message

Run Details

19442 of 46514 relevant lines covered (41.8%)

126375.38 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0

/examples/transpose/transpose_serial_block.cpp

//  Copyright (c) 2014 Thomas Heller
//
//  SPDX-License-Identifier: BSL-1.0
//  Distributed under the Boost Software License, Version 1.0. (See accompanying
//  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

#include <hpx/chrono.hpp>
#include <hpx/init.hpp>

#include <algorithm>
#include <cstdint>
#include <exception>
#include <iostream>
#include <string>
#include <vector>

// The test matrix stores each element's own coordinates: the column index is
// scaled by COL_SHIFT and the row index by ROW_SHIFT before both are summed.
// This lets the transposed result be checked analytically.
constexpr double COL_SHIFT = 1000.00;    // Constant to shift column index
constexpr double ROW_SHIFT = 0.001;      // Constant to shift row index

// Set from the "verbose" command line option; enables extra diagnostics.
bool verbose = false;

// Contiguous storage of one matrix block.
using block = std::vector<double>;
// Non-owning pointer to the first element of a square sub-block.
using sub_block = double*;

// Writes the transpose of the block_order x block_order sub-block at A into
// B, optionally processing it in tile_size x tile_size tiles.
void transpose(sub_block A, sub_block B, std::uint64_t block_order,
    std::uint64_t tile_size);
// Returns the sum of squared differences between the blocks in trans and the
// analytically expected transpose.
double test_results(std::uint64_t order, std::uint64_t block_order,
    std::vector<block> const& trans);

///////////////////////////////////////////////////////////////////////////////
// HPX entry point: builds a blocked matrix A, repeatedly computes B = A^T
// block by block while timing each pass, validates B after every pass and
// prints throughput statistics.
//
// vm holds the parsed command line options "matrix_size", "iterations",
// "num_blocks" and the optional "tile_size" and "verbose".
//
// Returns the result of hpx::local::finalize() after a validated run, or 1 if
// the options are rejected. Calls std::terminate() if validation of the
// transposed matrix fails.
int hpx_main(hpx::program_options::variables_map& vm)
{
    std::uint64_t const order = vm["matrix_size"].as<std::uint64_t>();
    std::uint64_t const iterations = vm["iterations"].as<std::uint64_t>();
    std::uint64_t const num_blocks = vm["num_blocks"].as<std::uint64_t>();
    std::uint64_t tile_size = order;    // default: no tiling

    if (vm.count("tile_size"))
        tile_size = vm["tile_size"].as<std::uint64_t>();

    verbose = vm.count("verbose") != 0;

    // Reject option values the algorithm below cannot handle: a zero
    // num_blocks divides by zero, a zero tile_size never advances the tiled
    // loops in transpose(), a zero matrix_size indexes empty blocks, and a
    // matrix_size that is not a multiple of num_blocks leaves trailing rows
    // untransposed. With zero iterations nothing would be measured.
    char const* problem = nullptr;
    if (order == 0)
        problem = "matrix_size must be greater than zero";
    else if (iterations == 0)
        problem = "iterations must be greater than zero";
    else if (num_blocks == 0)
        problem = "num_blocks must be greater than zero";
    else if (order % num_blocks != 0)
        problem = "matrix_size must be evenly divisible by num_blocks";
    else if (tile_size == 0)
        problem = "tile_size must be greater than zero";

    if (problem != nullptr)
    {
        std::cerr << "ERROR: " << problem << "\n";
        // Shut the runtime down before reporting the failure to the caller.
        hpx::local::finalize();
        return 1;
    }

    // One read of A plus one write of B per pass.
    std::uint64_t const bytes =
        static_cast<std::uint64_t>(2 * sizeof(double) * order * order);

    std::uint64_t const block_order = order / num_blocks;
    std::uint64_t const col_block_size = order * block_order;
    // Number of elements in one square block_order x block_order sub-block.
    std::uint64_t const block_size = block_order * block_order;

    std::vector<block> A(num_blocks, block(col_block_size));
    std::vector<block> B(num_blocks, block(col_block_size));

    std::cout << "Serial Matrix transpose: B = A^T\n"
              << "Matrix order          = " << order << "\n";
    if (tile_size < order)
        std::cout << "Tile size             = " << tile_size << "\n";
    else
        std::cout << "Untiled\n";
    std::cout << "Number of iterations  = " << iterations << "\n";

    // Fill the original matrix, set transpose to known garbage value.
    for (std::uint64_t b = 0; b < num_blocks; ++b)
    {
        for (std::uint64_t i = 0; i < order; ++i)
        {
            for (std::uint64_t j = 0; j < block_order; ++j)
            {
                double const col_val =
                    COL_SHIFT * static_cast<double>(b * block_order + j);

                A[b][i * block_order + j] =
                    col_val + ROW_SHIFT * static_cast<double>(i);
                B[b][i * block_order + j] = -1.0;
            }
        }
    }

    double errsq = 0.0;
    double avgtime = 0.0;
    double maxtime = 0.0;
    double mintime =
        366.0 * 24.0 * 3600.0;    // set the minimum time to a large value;
                                  // one leap year should be enough
    for (std::uint64_t iter = 0; iter < iterations; ++iter)
    {
        hpx::chrono::high_resolution_timer t;

        for (std::uint64_t b = 0; b < num_blocks; ++b)
        {
            for (std::uint64_t phase = 0; phase < num_blocks; ++phase)
            {
                // Sub-block number b of source block `phase` becomes
                // sub-block number `phase` of destination block b.
                std::uint64_t const A_offset = b * block_size;
                std::uint64_t const B_offset = phase * block_size;
                transpose(&A[phase][A_offset], &B[b][B_offset], block_order,
                    tile_size);
            }
        }

        double const elapsed = t.elapsed();

        if (iter > 0 || iterations == 1)    // Skip the first iteration
        {
            avgtime = avgtime + elapsed;
            maxtime = (std::max) (maxtime, elapsed);
            mintime = (std::min) (mintime, elapsed);
        }

        errsq += test_results(order, block_order, B);
    }    // end of iter loop

    // Analyze and output results

    double const epsilon = 1.e-8;
    if (errsq < epsilon)
    {
        std::cout << "Solution validates\n";
        // The first pass is excluded from the timings unless it is the only
        // one, hence iterations - 1 samples (but at least one).
        avgtime = avgtime /
            static_cast<double>(
                (std::max) (iterations - 1, static_cast<std::uint64_t>(1)));
        std::cout << "Rate (MB/s): "
                  << 1.e-6 * static_cast<double>(bytes) / mintime << ", "
                  << "Avg time (s): " << avgtime << ", "
                  << "Min time (s): " << mintime << ", "
                  << "Max time (s): " << maxtime << "\n";

        if (verbose)
            std::cout << "Squared errors: " << errsq << "\n";
    }
    else
    {
        std::cout << "ERROR: Aggregate squared error " << errsq
                  << " exceeds threshold " << epsilon << "\n";
        std::terminate();
    }

    return hpx::local::finalize();
}

// Program entry point: declares the command line options understood by
// hpx_main, then starts the HPX runtime, which invokes hpx_main with the
// parsed options. Returns whatever hpx::local::init() returns.
int main(int argc, char* argv[])
{
    using namespace hpx::program_options;

    options_description desc_commandline;
    // clang-format off
    desc_commandline.add_options()
        ("matrix_size", value<std::uint64_t>()->default_value(1024),
         "Matrix Size")
        ("iterations", value<std::uint64_t>()->default_value(10),
         "# iterations")
        ("tile_size", value<std::uint64_t>(),
         "Edge length of the square tiles each matrix block is transposed "
         "in, for improved cache and TLB performance (default: untiled)")
        ("num_blocks", value<std::uint64_t>()->default_value(256),
         "Number of blocks the matrix is split into (should evenly divide "
         "matrix_size)")
        ("verbose", "Verbose output");
    // clang-format on

    // Initialize and run HPX, this example is serial and therefore only needs
    // one thread. We just use hpx::init to parse our command line arguments
    std::vector<std::string> const cfg = {"hpx.os_threads!=1"};

    hpx::local::init_params init_args;
    init_args.desc_cmdline = desc_commandline;
    init_args.cfg = cfg;

    return hpx::local::init(hpx_main, argc, argv, init_args);
}

// Writes the transpose of the square sub-block at A into the sub-block at B.
//
// A and B each point to block_order * block_order doubles; element (r, c) is
// stored at index c + block_order * r. tile_size is the edge length of the
// square tiles the sub-block is processed in; a value of zero, or one that is
// at least block_order, selects the plain untiled traversal.
void transpose(sub_block A, sub_block B, std::uint64_t block_order,
    std::uint64_t tile_size)
{
    // A zero tile size would never advance the tiled loops below, and a tile
    // covering the whole sub-block needs no tiling at all.
    if (tile_size == 0 || tile_size >= block_order)
    {
        for (std::uint64_t i = 0; i < block_order; ++i)
        {
            for (std::uint64_t j = 0; j < block_order; ++j)
            {
                B[i + block_order * j] = A[j + block_order * i];
            }
        }
        return;
    }

    for (std::uint64_t i = 0; i < block_order; i += tile_size)
    {
        // Clamp the tile so a partial tile at the edge stays in bounds.
        std::uint64_t const i_max = (std::min) (block_order, i + tile_size);

        for (std::uint64_t j = 0; j < block_order; j += tile_size)
        {
            std::uint64_t const j_max =
                (std::min) (block_order, j + tile_size);

            for (std::uint64_t it = i; it < i_max; ++it)
            {
                for (std::uint64_t jt = j; jt < j_max; ++jt)
                {
                    B[it + block_order * jt] = A[jt + block_order * it];
                }
            }
        }
    }
}

double test_results(std::uint64_t order, std::uint64_t block_order,
    std::vector<block> const& trans)
{
    double errsq = 0.0;

    for (std::uint64_t b = 0; b < trans.size(); ++b)
    {
        for (std::uint64_t i = 0; i < order; ++i)
        {
            double col_val = COL_SHIFT * static_cast<double>(i);
            for (std::uint64_t j = 0; j < block_order; ++j)
            {
                double diff = trans[b][i * block_order + j] -
                    (col_val +
                        ROW_SHIFT * static_cast<double>(b * block_order + j));
                errsq += diff * diff;
            }
        }
    }

    if (verbose)
        std::cout << " Squared sum of differences: " << errsq << "\n";

    return errsq;
}

1	// Copyright (c) 2014 Thomas Heller
2	//
3	// SPDX-License-Identifier: BSL-1.0
4	// Distributed under the Boost Software License, Version 1.0. (See accompanying
5	// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
6
7	#include <hpx/chrono.hpp>
8	#include <hpx/init.hpp>
9
10	#include <algorithm>
11	#include <cstdint>
12	#include <exception>
13	#include <iostream>
14	#include <string>
15	#include <vector>
16
17	#define COL_SHIFT 1000.00 // Constant to shift column index
18	#define ROW_SHIFT 0.001 // Constant to shift row index
19
20	bool verbose = false;
21
22	typedef std::vector<double> block;
23	typedef double* sub_block;
24
25	void transpose(sub_block A, sub_block B, std::uint64_t block_order,
26	std::uint64_t tile_size);
27	double test_results(std::uint64_t order, std::uint64_t block_order,
28	std::vector<block> const& trans);
29
30	///////////////////////////////////////////////////////////////////////////////
31	int hpx_main(hpx::program_options::variables_map& vm)	×
32	{
33	std::uint64_t order = vm["matrix_size"].as<std::uint64_t>();	×
34	std::uint64_t iterations = vm["iterations"].as<std::uint64_t>();	×
35	std::uint64_t num_blocks = vm["num_blocks"].as<std::uint64_t>();	×
36	std::uint64_t tile_size = order;
37
38	if (vm.count("tile_size"))	×
39	tile_size = vm["tile_size"].as<std::uint64_t>();	×
40
41	verbose = vm.count("verbose") ? true : false;	×
42
43	std::uint64_t bytes =	×
44	static_cast<std::uint64_t>(2 * sizeof(double) * order * order);	×
45
46	std::uint64_t block_order = order / num_blocks;	×
47	std::uint64_t col_block_size = order * block_order;	×
48
49	std::vector<block> A(num_blocks, block(col_block_size));	×
50	std::vector<block> B(num_blocks, block(col_block_size));	×
51
52	std::cout << "Serial Matrix transpose: B = A^T\n"
53	<< "Matrix order = " << order << "\n";	×
54	if (tile_size < order)	×
55	std::cout << "Tile size = " << tile_size << "\n";	×
56	else
57	std::cout << "Untiled\n";	×
58	std::cout << "Number of iterations = " << iterations << "\n";	×
59
60	// Fill the original matrix, set transpose to known garbage value.
61	for (std::uint64_t b = 0; b < num_blocks; ++b)	×
62	{
63	for (std::uint64_t i = 0; i < order; ++i)	×
64	{
65	for (std::uint64_t j = 0; j < block_order; ++j)	×
66	{
67	double col_val =	×
68	COL_SHIFT * static_cast<double>(b * block_order + j);	×
69
70	A[b][i * block_order + j] =	×
71	col_val + ROW_SHIFT * static_cast<double>(i);	×
72	B[b][i * block_order + j] = -1.0;
73	}
74	}
75	}
76
77	double errsq = 0.0;
78	double avgtime = 0.0;	×
79	double maxtime = 0.0;	×
80	double mintime =
81	366.0 * 24.0 * 3600.0; // set the minimum time to a large value;
82	// one leap year should be enough	×
83	for (std::uint64_t iter = 0; iter < iterations; ++iter)
84	{
85	hpx::chrono::high_resolution_timer t;
86		×
87	for (std::uint64_t b = 0; b < num_blocks; ++b)
88	{	×
89	for (std::uint64_t phase = 0; phase < num_blocks; ++phase)
90	{	×
91	std::uint64_t const block_size = block_order * block_order;
92	std::uint64_t const from_block = phase;
93	std::uint64_t const from_phase = b;	×
94	std::uint64_t const A_offset = from_phase * block_size;	×
95	std::uint64_t const B_offset = phase * block_size;	×
96	transpose(&A[from_block][A_offset], &B[b][B_offset],
97	block_order, tile_size);
98	}
99	}
100		×
101	double elapsed = t.elapsed();
102		×
103	if (iter > 0 || iterations == 1) // Skip the first iteration
104	{	×
105	avgtime = avgtime + elapsed;	×
106	maxtime = (std::max) (maxtime, elapsed);	×
107	mintime = (std::min) (mintime, elapsed);
108	}
109		×
110	errsq += test_results(order, block_order, B);
111	} // end of iter loop
112
113	// Analyze and output results
114
115	double epsilon = 1.e-8;	×
116	if (errsq < epsilon)
117	{	×
118	std::cout << "Solution validates\n";	×
119	avgtime = avgtime /	×
120	static_cast<double>(	×
121	(std::max) (iterations - 1, static_cast<std::uint64_t>(1)));	×
122	std::cout << "Rate (MB/s): "
123	<< 1.e-6 * static_cast<double>(bytes) / mintime << ", "
124	<< "Avg time (s): " << avgtime << ", "	×
125	<< "Min time (s): " << mintime << ", "
126	<< "Max time (s): " << maxtime << "\n";	×
127		×
128	if (verbose)
129	std::cout << "Squared errors: " << errsq << "\n";
130	}
131	else
132	{	×
133	std::cout << "ERROR: Aggregate squared error " << errsq	×
134	<< " exceeds threshold " << epsilon << "\n";
135	std::terminate();
136	}	×
137		×
138	return hpx::local::finalize();
139	}	×
140
141	int main(int argc, char* argv[])
142	{
143	using namespace hpx::program_options;	×
144
145	options_description desc_commandline;	×
146	// clang-format off	×
147	desc_commandline.add_options()
148	("matrix_size", value<std::uint64_t>()->default_value(1024),	×
149	"Matrix Size")
150	("iterations", value<std::uint64_t>()->default_value(10),	×
151	"# iterations")
152	("tile_size", value<std::uint64_t>(),
153	"Number of tiles to divide the individual matrix blocks for improved "	×
154	"cache and TLB performance")
155	("num_blocks", value<std::uint64_t>()->default_value(256),
156	"Number of blocks to divide the individual matrix blocks for improved "	×
157	"cache and TLB performance")
158	("verbose", "Verbose output");
159	// clang-format on
160
161	// Initialize and run HPX, this example is serial and therefore only needs	×
162	// one thread. We just use hpx::init to parse our command line arguments
163	std::vector<std::string> const cfg = {"hpx.os_threads!=1"};	×
164		×
165	hpx::local::init_params init_args;	×
166	init_args.desc_cmdline = desc_commandline;
167	init_args.cfg = cfg;	×
168		×
169	return hpx::local::init(hpx_main, argc, argv, init_args);
170	}	×
171
172	void transpose(sub_block A, sub_block B, std::uint64_t block_order,
173	std::uint64_t tile_size)	×
174	{
175	if (tile_size < block_order)	×
176	{
177	for (std::uint64_t i = 0; i < block_order; i += tile_size)	×
178	{
179	for (std::uint64_t j = 0; j < block_order; j += tile_size)	×
180	{	×
181	std::uint64_t i_max = (std::min) (block_order, i + tile_size);
182	for (std::uint64_t it = i; it < i_max; ++it)
183	{	×
184	std::uint64_t j_max =	×
185	(std::min) (block_order, j + tile_size);
186	for (std::uint64_t jt = j; jt < j_max; ++jt)	×
187	{
188	B[it + block_order * jt] = A[jt + block_order * it];
189	}
190	}
191	}
192	}
193	}
194	else	×
195	{
196	for (std::uint64_t i = 0; i < block_order; ++i)	×
197	{
198	for (std::uint64_t j = 0; j < block_order; ++j)	×
199	{
200	B[i + block_order * j] = A[j + block_order * i];
201	}
202	}	×
203	}
204	}	×
205
206	double test_results(std::uint64_t order, std::uint64_t block_order,
207	std::vector<block> const& trans)
208	{
209	double errsq = 0.0;	×
210
211	for (std::uint64_t b = 0; b < trans.size(); ++b)	×
212	{
213	for (std::uint64_t i = 0; i < order; ++i)	×
214	{	×
215	double col_val = COL_SHIFT * static_cast<double>(i);
216	for (std::uint64_t j = 0; j < block_order; ++j)	×
217	{	×
218	double diff = trans[b][i * block_order + j] -	×
219	(col_val +	×
220	ROW_SHIFT * static_cast<double>(b * block_order + j));
221	errsq += diff * diff;
222	}
223	}
224	}	×
225		×
226	if (verbose)
227	std::cout << " Squared sum of differences: " << errsq << "\n";	×
228
229	return errsq;
230	}

STEllAR-GROUP / hpx / #882

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous