#882

Committed 31 Aug 2023 07:44PM UTC coverage: 41.798% (-44.7%) from 86.546%

Build # #882

Build Type

push

Committed by

Commit Message

Run Details

19442 of 46514 relevant lines covered (41.8%)

126375.38 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0

/examples/transpose/transpose_smp_block.cpp

//  Copyright (c) 2014 Thomas Heller
//
//  SPDX-License-Identifier: BSL-1.0
//  Distributed under the Boost Software License, Version 1.0. (See accompanying
//  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

#include <hpx/algorithm.hpp>
#include <hpx/init.hpp>
#include <hpx/modules/iterator_support.hpp>
#include <hpx/numeric.hpp>

#include <algorithm>
#include <cstdint>
#include <exception>
#include <iostream>
#include <vector>

#define COL_SHIFT 1000.00    // Constant to shift column index
#define ROW_SHIFT 0.001      // Constant to shift row index

bool verbose = false;

typedef std::vector<double> block;
typedef double* sub_block;

void transpose(sub_block A, sub_block B, std::uint64_t block_order,
    std::uint64_t tile_size);
double test_results(std::uint64_t order, std::uint64_t block_order,
    std::vector<block> const& trans);

///////////////////////////////////////////////////////////////////////////////
// Benchmark driver that runs on the HPX runtime (passed to hpx::local::init).
//
// Reads the command-line options from vm, fills the column-blocked matrix A,
// transposes it into B `iterations` times while timing every pass, validates
// B after each pass with test_results() and prints the timing statistics.
//
// Returns the result of hpx::local::finalize() after a validated run, or 1
// (after finalizing the runtime) when the matrix_size / num_blocks
// combination is rejected. Calls std::terminate() when the accumulated
// squared error is not below the threshold.
int hpx_main(hpx::program_options::variables_map& vm)
{
    std::uint64_t order = vm["matrix_size"].as<std::uint64_t>();
    std::uint64_t iterations = vm["iterations"].as<std::uint64_t>();
    std::uint64_t num_blocks = vm["num_blocks"].as<std::uint64_t>();

    // Without an explicit tile size the tile spans the whole matrix, which
    // selects the untiled code path.
    std::uint64_t tile_size = order;

    if (vm.count("tile_size"))
        tile_size = vm["tile_size"].as<std::uint64_t>();

    verbose = vm.count("verbose") ? true : false;

    // The matrix is split into num_blocks column blocks of equal width. A
    // zero block count would divide by zero below, and a block count that
    // does not divide the order leaves the tail of every block of B
    // untouched, so the validation could never succeed. Reject both up front.
    if (num_blocks == 0 || order % num_blocks != 0)
    {
        std::cout << "ERROR: matrix_size (" << order
                  << ") must be a multiple of a non-zero num_blocks ("
                  << num_blocks << ")\n";
        hpx::local::finalize();
        return 1;
    }

    // Bytes moved per transpose: every element is read once and written once.
    std::uint64_t bytes =
        static_cast<std::uint64_t>(2 * sizeof(double) * order * order);

    // Each block stores `order` rows of `block_order` columns.
    std::uint64_t block_order = order / num_blocks;
    std::uint64_t col_block_size = order * block_order;

    std::vector<block> A(num_blocks, block(col_block_size));
    std::vector<block> B(num_blocks, block(col_block_size));

    std::cout << "Serial Matrix transpose: B = A^T\n"
              << "Matrix order          = " << order << "\n";
    if (tile_size < order)
        std::cout << "Tile size             = " << tile_size << "\n";
    else
        std::cout << "Untiled\n";
    std::cout << "Number of iterations  = " << iterations << "\n";

    using hpx::execution::par;
    using hpx::execution::task;
    using hpx::ranges::for_each;

    std::uint64_t const start = 0;

    // Index range [0, num_blocks) shared by the initialization and by every
    // transpose pass.
    auto range = hpx::util::counting_shape(start, num_blocks);

    // Fill the original matrix, set transpose to known garbage value.
    for_each(par, range, [&](std::uint64_t b) {
        for (std::uint64_t i = 0; i < order; ++i)
        {
            for (std::uint64_t j = 0; j < block_order; ++j)
            {
                double col_val =
                    COL_SHIFT * static_cast<double>(b * block_order + j);

                A[b][i * block_order + j] =
                    col_val + ROW_SHIFT * static_cast<double>(i);
                B[b][i * block_order + j] = -1.0;
            }
        }
    });

    double errsq = 0.0;
    double avgtime = 0.0;
    double maxtime = 0.0;
    double mintime =
        366.0 * 24.0 * 3600.0;    // set the minimum time to a large value;
                                  // one leap year should be enough
    for (std::uint64_t iter = 0; iter < iterations; ++iter)
    {
        hpx::chrono::high_resolution_timer t;

        // One future per destination block; each slot is written by exactly
        // one iteration of the outer loop below.
        std::vector<hpx::shared_future<void>> transpose_futures;
        transpose_futures.resize(num_blocks);

        for_each(par, range, [&](std::uint64_t b) {
            // Asynchronously transpose every sub-block that ends up in
            // destination block b; `phase` selects the source block.
            transpose_futures[b] =
                for_each(par(task), range, [&, b](std::uint64_t phase) {
                    std::uint64_t const block_size = block_order * block_order;
                    std::uint64_t const from_block = phase;
                    std::uint64_t const from_phase = b;
                    std::uint64_t const A_offset = from_phase * block_size;
                    std::uint64_t const B_offset = phase * block_size;

                    transpose(&A[from_block][A_offset], &B[b][B_offset],
                        block_order, tile_size);
                }).share();
        });

        // `range`, A and B are referenced by the tasks above, so all of them
        // must finish before the timer is read and the next pass starts.
        hpx::wait_all(transpose_futures);

        double elapsed = t.elapsed();

        if (iter > 0 || iterations == 1)    // Skip the first iteration
        {
            avgtime = avgtime + elapsed;
            maxtime = (std::max) (maxtime, elapsed);
            mintime = (std::min) (mintime, elapsed);
        }

        errsq += test_results(order, block_order, B);
    }    // end of iter loop

    // Analyze and output results

    double epsilon = 1.e-8;
    if (errsq < epsilon)
    {
        std::cout << "Solution validates\n";

        // The first pass is excluded from the timings whenever more than one
        // pass runs. Compute the divisor without the unsigned subtraction
        // wrapping around when iterations == 0.
        std::uint64_t const timed_iterations =
            iterations > 1 ? iterations - 1 : 1;
        avgtime = avgtime / static_cast<double>(timed_iterations);

        std::cout << "Rate (MB/s): "
                  << 1.e-6 * static_cast<double>(bytes) / mintime << ", "
                  << "Avg time (s): " << avgtime << ", "
                  << "Min time (s): " << mintime << ", "
                  << "Max time (s): " << maxtime << "\n";

        if (verbose)
            std::cout << "Squared errors: " << errsq << "\n";
    }
    else
    {
        std::cout << "ERROR: Aggregate squared error " << errsq
                  << " exceeds threshold " << epsilon << "\n";
        std::terminate();
    }

    return hpx::local::finalize();
}

// Program entry point: declares the benchmark's command-line options and
// starts the HPX runtime with hpx_main as the function it runs.
int main(int argc, char* argv[])
{
    using namespace hpx::program_options;

    options_description desc_commandline;
    // clang-format off
    desc_commandline.add_options()
        ("matrix_size", value<std::uint64_t>()->default_value(1024),
         "Matrix Size")
        ("iterations", value<std::uint64_t>()->default_value(10),
         "# iterations")
        // No default: when the option is absent hpx_main uses the matrix
        // order, which selects the untiled transpose.
        ("tile_size", value<std::uint64_t>(),
         "Edge length of the square tiles each matrix block is transposed in, "
         "for improved cache and TLB performance (default: untiled)")
        ("num_blocks", value<std::uint64_t>()->default_value(256),
         "Number of column blocks the matrix is divided into "
         "(matrix_size should be a multiple of this)")
        ( "verbose", "Verbose output")
    ;
    // clang-format on

    hpx::local::init_params init_args;
    init_args.desc_cmdline = desc_commandline;

    return hpx::local::init(hpx_main, argc, argv, init_args);
}

// Transposes one square sub-block of block_order x block_order doubles.
//
// A points to the first element of the source sub-block and B to the first
// element of the destination sub-block (both are plain double*, the type the
// file aliases as sub_block). Both are indexed with a stride of block_order:
// for all i, j in [0, block_order) the result is
//     B[i + block_order * j] == A[j + block_order * i].
//
// tile_size is the edge length of the square tiles the block is walked in.
// A tile size of zero, or one that is at least block_order, selects the plain
// untiled double loop.
void transpose(double* A, double* B, std::uint64_t block_order,
    std::uint64_t tile_size)
{
    // A zero tile size would never advance the tiled loops below (i += 0),
    // so it is handled like a tile that covers the whole block.
    if (tile_size == 0 || tile_size >= block_order)
    {
        for (std::uint64_t i = 0; i < block_order; ++i)
        {
            for (std::uint64_t j = 0; j < block_order; ++j)
            {
                B[i + block_order * j] = A[j + block_order * i];
            }
        }
        return;
    }

    for (std::uint64_t i = 0; i < block_order; i += tile_size)
    {
        // Clamp the tile to the block edge. Using the remaining extent
        // avoids forming i + tile_size beyond the edge of the block.
        std::uint64_t const i_max =
            i + (std::min) (tile_size, block_order - i);

        for (std::uint64_t j = 0; j < block_order; j += tile_size)
        {
            std::uint64_t const j_max =
                j + (std::min) (tile_size, block_order - j);

            for (std::uint64_t it = i; it < i_max; ++it)
            {
                for (std::uint64_t jt = j; jt < j_max; ++jt)
                {
                    B[it + block_order * jt] = A[jt + block_order * it];
                }
            }
        }
    }
}

double test_results(std::uint64_t order, std::uint64_t block_order,
    std::vector<block> const& trans)
{
    using hpx::transform_reduce;
    using hpx::execution::par;

    std::uint64_t const start = 0;
    std::uint64_t const end = trans.size();

    // Fill the original matrix, set transpose to known garbage value.
    auto range = hpx::util::counting_shape(start, end);
    double errsq = transform_reduce(
        par, std::begin(range), std::end(range), 0.0,
        [](double lhs, double rhs) { return lhs + rhs; },
        [&](std::uint64_t b) -> double {
            double errsq = 0.0;
            for (std::uint64_t i = 0; i < order; ++i)
            {
                double col_val = COL_SHIFT * static_cast<double>(i);
                for (std::uint64_t j = 0; j < block_order; ++j)
                {
                    double diff = trans[b][i * block_order + j] -
                        (col_val +
                            ROW_SHIFT *
                                static_cast<double>(b * block_order + j));
                    errsq += diff * diff;
                }
            }
            return errsq;
        });

    if (verbose)
        std::cout << " Squared sum of differences: " << errsq << "\n";

    return errsq;
}

1	// Copyright (c) 2014 Thomas Heller
2	//
3	// SPDX-License-Identifier: BSL-1.0
4	// Distributed under the Boost Software License, Version 1.0. (See accompanying
5	// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
6
7	#include <hpx/algorithm.hpp>
8	#include <hpx/init.hpp>
9	#include <hpx/modules/iterator_support.hpp>
10	#include <hpx/numeric.hpp>
11
12	#include <algorithm>
13	#include <cstdint>
14	#include <exception>
15	#include <iostream>
16	#include <vector>
17
18	#define COL_SHIFT 1000.00 // Constant to shift column index
19	#define ROW_SHIFT 0.001 // Constant to shift row index
20
21	bool verbose = false;
22
23	typedef std::vector<double> block;
24	typedef double* sub_block;
25
26	void transpose(sub_block A, sub_block B, std::uint64_t block_order,
27	std::uint64_t tile_size);
28	double test_results(std::uint64_t order, std::uint64_t block_order,
29	std::vector<block> const& trans);
30
31	///////////////////////////////////////////////////////////////////////////////
32	int hpx_main(hpx::program_options::variables_map& vm)	×
33	{
34	std::uint64_t order = vm["matrix_size"].as<std::uint64_t>();	×
35	std::uint64_t iterations = vm["iterations"].as<std::uint64_t>();	×
36	std::uint64_t num_blocks = vm["num_blocks"].as<std::uint64_t>();	×
37	std::uint64_t tile_size = order;	×
38
39	if (vm.count("tile_size"))	×
40	tile_size = vm["tile_size"].as<std::uint64_t>();	×
41
42	verbose = vm.count("verbose") ? true : false;	×
43
44	std::uint64_t bytes =	×
45	static_cast<std::uint64_t>(2 * sizeof(double) * order * order);	×
46
47	std::uint64_t block_order = order / num_blocks;	×
48	std::uint64_t col_block_size = order * block_order;	×
49
50	std::vector<block> A(num_blocks, block(col_block_size));	×
51	std::vector<block> B(num_blocks, block(col_block_size));	×
52
53	std::cout << "Serial Matrix transpose: B = A^T\n"
54	<< "Matrix order = " << order << "\n";	×
55	if (tile_size < order)	×
56	std::cout << "Tile size = " << tile_size << "\n";	×
57	else
58	std::cout << "Untiled\n";	×
59	std::cout << "Number of iterations = " << iterations << "\n";	×
60
61	using hpx::execution::par;
62	using hpx::execution::task;
63	using hpx::ranges::for_each;
64
65	std::uint64_t const start = 0;
66
67	// Fill the original matrix, set transpose to known garbage value.
68	auto range = hpx::util::counting_shape(start, num_blocks);
69	for_each(par, range, [&](std::uint64_t b) {	×
70	for (std::uint64_t i = 0; i < order; ++i)	×
71	{
72	for (std::uint64_t j = 0; j < block_order; ++j)	×
73	{
74	double col_val =	×
75	COL_SHIFT * static_cast<double>(b * block_order + j);	×
76
77	A[b][i * block_order + j] =	×
78	col_val + ROW_SHIFT * static_cast<double>(i);	×
79	B[b][i * block_order + j] = -1.0;
80	}
81	}	×
82	});
83
84	double errsq = 0.0;
85	double avgtime = 0.0;	×
86	double maxtime = 0.0;	×
87	double mintime =
88	366.0 * 24.0 * 3600.0; // set the minimum time to a large value;
89	// one leap year should be enough	×
90	for (std::uint64_t iter = 0; iter < iterations; ++iter)
91	{
92	hpx::chrono::high_resolution_timer t;
93
94	auto range = hpx::util::counting_shape(start, num_blocks);
95		×
96	std::vector<hpx::shared_future<void>> transpose_futures;	×
97	transpose_futures.resize(num_blocks);
98		×
99	for_each(par, range, [&](std::uint64_t b) {	×
100	transpose_futures[b] =	×
101	for_each(par(task), range, [&, b](std::uint64_t phase) {	×
102	std::uint64_t const block_size = block_order * block_order;
103	std::uint64_t const from_block = phase;	×
104	std::uint64_t const from_phase = b;	×
105	std::uint64_t const A_offset = from_phase * block_size;	×
106	std::uint64_t const B_offset = phase * block_size;
107		×
108	transpose(&A[from_block][A_offset], &B[b][B_offset],	×
109	block_order, tile_size);	×
110	}).share();	×
111	});
112
113	hpx::wait_all(transpose_futures);
114		×
115	double elapsed = t.elapsed();
116		×
117	if (iter > 0 \|\| iterations == 1) // Skip the first iteration
118	{	×
119	avgtime = avgtime + elapsed;	×
120	maxtime = (std::max) (maxtime, elapsed);	×
121	mintime = (std::min) (mintime, elapsed);
122	}
123		×
124	errsq += test_results(order, block_order, B);	×
125	} // end of iter loop
126
127	// Analyze and output results
128
129	double epsilon = 1.e-8;	×
130	if (errsq < epsilon)
131	{	×
132	std::cout << "Solution validates\n";	×
133	avgtime = avgtime /	×
134	static_cast<double>(	×
135	(std::max) (iterations - 1, static_cast<std::uint64_t>(1)));	×
136	std::cout << "Rate (MB/s): "
137	<< 1.e-6 * static_cast<double>(bytes) / mintime << ", "
138	<< "Avg time (s): " << avgtime << ", "	×
139	<< "Min time (s): " << mintime << ", "
140	<< "Max time (s): " << maxtime << "\n";	×
141		×
142	if (verbose)
143	std::cout << "Squared errors: " << errsq << "\n";
144	}
145	else
146	{	×
147	std::cout << "ERROR: Aggregate squared error " << errsq	×
148	<< " exceeds threshold " << epsilon << "\n";
149	std::terminate();
150	}	×
151		×
152	return hpx::local::finalize();
153	}	×
154
155	int main(int argc, char* argv[])
156	{
157	using namespace hpx::program_options;	×
158
159	options_description desc_commandline;	×
160	// clang-format off	×
161	desc_commandline.add_options()
162	("matrix_size", value<std::uint64_t>()->default_value(1024),	×
163	"Matrix Size")
164	("iterations", value<std::uint64_t>()->default_value(10),	×
165	"# iterations")
166	("tile_size", value<std::uint64_t>(),
167	"Number of tiles to divide the individual matrix blocks for improved "	×
168	"cache and TLB performance")
169	("num_blocks", value<std::uint64_t>()->default_value(256),
170	"Number of blocks to divide the individual matrix blocks for improved "	×
171	"cache and TLB performance")
172	( "verbose", "Verbose output")
173	;
174	// clang-format on	×
175		×
176	hpx::local::init_params init_args;
177	init_args.desc_cmdline = desc_commandline;	×
178		×
179	return hpx::local::init(hpx_main, argc, argv, init_args);
180	}	×
181
182	void transpose(sub_block A, sub_block B, std::uint64_t block_order,
183	std::uint64_t tile_size)	×
184	{
185	if (tile_size < block_order)	×
186	{
187	for (std::uint64_t i = 0; i < block_order; i += tile_size)	×
188	{
189	for (std::uint64_t j = 0; j < block_order; j += tile_size)	×
190	{	×
191	std::uint64_t i_max = (std::min) (block_order, i + tile_size);
192	std::uint64_t j_max = (std::min) (block_order, j + tile_size);	×
193
194	for (std::uint64_t it = i; it < i_max; ++it)	×
195	{
196	for (std::uint64_t jt = j; jt < j_max; ++jt)	×
197	{
198	B[it + block_order * jt] = A[jt + block_order * it];
199	}
200	}
201	}
202	}
203	}
204	else	×
205	{
206	for (std::uint64_t i = 0; i < block_order; ++i)	×
207	{
208	for (std::uint64_t j = 0; j < block_order; ++j)	×
209	{
210	B[i + block_order * j] = A[j + block_order * i];
211	}
212	}	×
213	}
214	}	×
215
216	double test_results(std::uint64_t order, std::uint64_t block_order,
217	std::vector<block> const& trans)
218	{
219	using hpx::transform_reduce;
220	using hpx::execution::par;
221
222	std::uint64_t const start = 0;
223	std::uint64_t const end = trans.size();
224
225	// Fill the original matrix, set transpose to known garbage value.
226	auto range = hpx::util::counting_shape(start, end);
227	double errsq = transform_reduce(	×
228	par, std::begin(range), std::end(range), 0.0,	×
229	[](double lhs, double rhs) { return lhs + rhs; },
230	[&](std::uint64_t b) -> double {	×
231	double errsq = 0.0;
232	for (std::uint64_t i = 0; i < order; ++i)	×
233	{	×
234	double col_val = COL_SHIFT * static_cast<double>(i);
235	for (std::uint64_t j = 0; j < block_order; ++j)	×
236	{	×
237	double diff = trans[b][i * block_order + j] -	×
238	(col_val +	×
239	ROW_SHIFT *	×
240	static_cast<double>(b * block_order + j));
241	errsq += diff * diff;
242	}	×
243	}
244	return errsq;
245	});	×
246		×
247	if (verbose)
248	std::cout << " Squared sum of differences: " << errsq << "\n";	×
249
250	return errsq;
251	}

STEllAR-GROUP / hpx / #882

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous