#882

Committed 31 Aug 2023 07:44PM UTC coverage: 41.798% (-44.7%) from 86.546%

Build # #882

Build Type

push

Committed by

Commit Message

Run Details

19442 of 46514 relevant lines covered (41.8%)

126375.38 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0

/examples/transpose/transpose_block.cpp

//  Copyright (c) 2014 Thomas Heller
//
//  SPDX-License-Identifier: BSL-1.0
//  Distributed under the Boost Software License, Version 1.0. (See accompanying
//  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

#include <hpx/config.hpp>
#if !defined(HPX_COMPUTE_DEVICE_CODE)
#include <hpx/hpx.hpp>
#include <hpx/hpx_init.hpp>

#include <hpx/algorithm.hpp>
#include <hpx/modules/iterator_support.hpp>
#include <hpx/numeric.hpp>

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <memory>
#include <string>
#include <utility>
#include <vector>

#define COL_SHIFT 1000.00    // Constant to shift column index
#define ROW_SHIFT 0.001      // Constant to shift row index

bool verbose = false;

char const* A_block_basename = "/transpose/block/A";
char const* B_block_basename = "/transpose/block/B";

struct sub_block
{
    enum mode
    {
        reference,
        owning
    };

    sub_block()
      : size_(0)
      , data_(nullptr)
      , mode_(reference)
    {
    }

    sub_block(double* data, std::uint64_t size)
      : size_(size)
      , data_(data)
      , mode_(reference)
    {
    }

    ~sub_block()
    {
        if (data_ && mode_ == owning)
        {
            delete[] data_;
        }
    }

    sub_block(sub_block&& other)
      : size_(other.size_)
      , data_(other.data_)
      , mode_(other.mode_)
    {
        if (mode_ == owning)
        {
            other.data_ = nullptr;
            other.size_ = 0;
        }
    }

    sub_block& operator=(sub_block&& other)
    {
        size_ = other.size_;
        data_ = other.data_;
        mode_ = other.mode_;
        if (mode_ == owning)
        {
            other.data_ = nullptr;
            other.size_ = 0;
        }

        return *this;
    }

    double operator[](std::size_t i) const
    {
        HPX_ASSERT(data_);
        return data_[i];
    }

    double& operator[](std::size_t i)
    {
        HPX_ASSERT(data_);
        HPX_ASSERT(mode_ == reference);
        return data_[i];
    }

    void load(hpx::serialization::input_archive& ar, unsigned)
    {
        // clang-format off
        ar & size_;
        // clang-format on
        if (size_ > 0)
        {
            data_ = new double[size_];
            hpx::serialization::array<double> arr(data_, size_);
            ar >> arr;
            mode_ = owning;
        }
    }

    void save(hpx::serialization::output_archive& ar, unsigned) const
    {
        // clang-format off
        ar & size_;
        // clang-format on
        if (size_ > 0)
        {
            hpx::serialization::array<double> arr(data_, size_);
            ar << arr;
        }
    }

    HPX_SERIALIZATION_SPLIT_MEMBER()

    std::uint64_t size_;
    double* data_;
    mode mode_;
};

struct block_component : hpx::components::component_base<block_component>
{
    block_component() {}

    block_component(std::uint64_t size)
      : data_(size)
    {
    }

    sub_block get_sub_block(std::uint64_t offset, std::uint64_t size)
    {
        HPX_ASSERT(!data_.empty());
        return sub_block(&data_[offset], size);
    }

    HPX_DEFINE_COMPONENT_DIRECT_ACTION(block_component, get_sub_block)

    std::vector<double> data_;
};

struct block : hpx::components::client_base<block, block_component>
{
    typedef hpx::components::client_base<block, block_component> base_type;
    block() {}

    block(hpx::future<hpx::id_type>&& id)
      : base_type(std::move(id))
    {
    }

    block(
        std::uint64_t /* id */, std::uint64_t size, char const* /* base_name */)
      : base_type(hpx::new_<block_component>(hpx::find_here(), size))
    {
    }

    hpx::future<sub_block> get_sub_block(
        std::uint64_t offset, std::uint64_t size)
    {
        block_component::get_sub_block_action act;
        return hpx::async(act, get_id(), offset, size);
    }
};

// The macros below are necessary to generate the code required for exposing
// our partition type remotely.
//
// HPX_REGISTER_COMPONENT() exposes the component creation
// through hpx::new_<>().
typedef hpx::components::component<block_component> block_component_type;
HPX_REGISTER_COMPONENT(block_component_type, block_component)

// HPX_REGISTER_ACTION() exposes the component member function for remote
// invocation.
typedef block_component::get_sub_block_action get_sub_block_action;
HPX_REGISTER_ACTION(get_sub_block_action)

void transpose(hpx::future<sub_block> A, hpx::future<sub_block> B,
    std::uint64_t block_order, std::uint64_t tile_size);
double test_results(std::uint64_t order, std::uint64_t block_order,
    std::vector<block>& trans, std::uint64_t blocks_start,
    std::uint64_t blocks_end);

///////////////////////////////////////////////////////////////////////////////
int hpx_main(hpx::program_options::variables_map& vm)
{
    {
        hpx::id_type here = hpx::find_here();
        bool root = here == hpx::find_root_locality();

        std::uint64_t num_localities = hpx::get_num_localities().get();

        std::uint64_t order = vm["matrix_size"].as<std::uint64_t>();
        std::uint64_t iterations = vm["iterations"].as<std::uint64_t>();
        std::uint64_t num_local_blocks = vm["num_blocks"].as<std::uint64_t>();
        std::uint64_t tile_size = order;

        if (vm.count("tile_size"))
            tile_size = vm["tile_size"].as<std::uint64_t>();

        verbose = vm.count("verbose") ? true : false;

        std::uint64_t bytes =
            static_cast<std::uint64_t>(2 * sizeof(double) * order * order);

        std::uint64_t num_blocks = num_localities * num_local_blocks;

        std::uint64_t block_order = order / num_blocks;
        std::uint64_t col_block_size = order * block_order;

        std::uint64_t id = hpx::get_locality_id();

        std::vector<block> A(num_blocks);
        std::vector<block> B(num_blocks);

        std::uint64_t blocks_start = id * num_local_blocks;
        std::uint64_t blocks_end = (id + 1) * num_local_blocks;

        // First allocate and create our local blocks
        for (std::uint64_t b = 0; b != num_local_blocks; ++b)
        {
            std::uint64_t block_idx = b + blocks_start;
            A[block_idx] = block(block_idx, col_block_size, A_block_basename);
            B[block_idx] = block(block_idx, col_block_size, B_block_basename);
        }

        // Find all blocks ...
        std::cout << "Finding blocks ...\n";
        std::vector<hpx::future<hpx::id_type>> A_ids =
            hpx::find_all_from_basename(A_block_basename, num_blocks);
        std::vector<hpx::future<hpx::id_type>> B_ids =
            hpx::find_all_from_basename(B_block_basename, num_blocks);

        if (root)
        {
            std::cout << "Serial Matrix transpose: B = A^T\n"
                      << "Matrix order          = " << order << "\n"
                      << "Matrix local columns  = " << block_order << "\n"
                      << "Number of blocks      = " << num_blocks << "\n"
                      << "Number of localities  = " << num_localities << "\n";
            if (tile_size < order)
                std::cout << "Tile size             = " << tile_size << "\n";
            else
                std::cout << "Untiled\n";
            std::cout << "Number of iterations  = " << iterations << "\n";
        }
        using hpx::execution::par;
        using hpx::ranges::for_each;

        // Fill the original matrix, set transpose to known garbage value.
        auto range = hpx::util::counting_shape(blocks_start, blocks_end);
        for_each(par, range, [&](std::uint64_t b) {
            std::shared_ptr<block_component> A_ptr =
                hpx::get_ptr<block_component>(A[b].get_id()).get();
            std::shared_ptr<block_component> B_ptr =
                hpx::get_ptr<block_component>(B[b].get_id()).get();

            for (std::uint64_t i = 0; i != order; ++i)
            {
                for (std::uint64_t j = 0; j != block_order; ++j)
                {
                    double col_val =
                        COL_SHIFT * static_cast<double>(b * block_order + j);
                    A_ptr->data_[i * block_order + j] =
                        col_val + ROW_SHIFT * static_cast<double>(i);
                    B_ptr->data_[i * block_order + j] = -1.0;
                }
            }
            hpx::register_with_basename(A_block_basename, A[b].get_id(), b);
            hpx::register_with_basename(B_block_basename, B[b].get_id(), b);
        });

        hpx::wait_all(A_ids);
        std::cout << "Finding blocks A ... done\n";
        hpx::wait_all(B_ids);
        std::cout << "Finding blocks B ... done\n";
        for (std::uint64_t b = 0; b != num_blocks; ++b)
        {
            // Convert the gids to clients
            if (b < blocks_start || b >= blocks_end)
            {
                A[b] = block(std::move(A_ids[b]));
                B[b] = block(std::move(B_ids[b]));
            }
        }

        double errsq = 0.0;
        double avgtime = 0.0;
        double maxtime = 0.0;
        double mintime =
            366.0 * 24.0 * 3600.0;    // set the minimum time to a large value;
                                      // one leap year should be enough
        for (std::uint64_t iter = 0; iter < iterations; ++iter)
        {
            hpx::chrono::high_resolution_timer t;

            auto range = hpx::util::counting_shape(blocks_start, blocks_end);

            std::vector<hpx::future<void>> block_futures;
            block_futures.resize(num_local_blocks);

            for_each(par, range, [&](std::uint64_t b) {
                std::vector<hpx::future<void>> phase_futures;
                phase_futures.reserve(num_blocks);

                auto phase_range = hpx::util::counting_shape(
                    static_cast<std::uint64_t>(0), num_blocks);
                for (std::uint64_t phase : phase_range)
                {
                    std::uint64_t const block_size = block_order * block_order;
                    std::uint64_t const from_block = phase;
                    std::uint64_t const from_phase = b;
                    std::uint64_t const A_offset = from_phase * block_size;
                    std::uint64_t const B_offset = phase * block_size;

                    phase_futures.push_back(hpx::dataflow(&transpose,
                        A[from_block].get_sub_block(A_offset, block_size),
                        B[b].get_sub_block(B_offset, block_size), block_order,
                        tile_size));
                }

                block_futures[b - blocks_start] = hpx::when_all(phase_futures);
            });

            hpx::wait_all(block_futures);

            double elapsed = t.elapsed();

            if (iter > 0 || iterations == 1)    // Skip the first iteration
            {
                avgtime = avgtime + elapsed;
                maxtime = (std::max) (maxtime, elapsed);
                mintime = (std::min) (mintime, elapsed);
            }

            if (root)
                errsq += test_results(
                    order, block_order, B, blocks_start, blocks_end);
        }    // end of iter loop

        // Analyze and output results

        double epsilon = 1.e-8;
        if (root)
        {
            if (errsq < epsilon)
            {
                std::cout << "Solution validates\n";
                avgtime = avgtime /
                    static_cast<double>((std::max) (iterations - 1,
                        static_cast<std::uint64_t>(1)));
                std::cout << "Rate (MB/s): "
                          << 1.e-6 * static_cast<double>(bytes) / mintime
                          << ", "
                          << "Avg time (s): " << avgtime << ", "
                          << "Min time (s): " << mintime << ", "
                          << "Max time (s): " << maxtime << "\n";

                if (verbose)
                    std::cout << "Squared errors: " << errsq << "\n";
            }
            else
            {
                std::cout << "ERROR: Aggregate squared error " << errsq
                          << " exceeds threshold " << epsilon << "\n";
                hpx::terminate();
            }
        }
        {
            // unregister names...
            auto range = hpx::util::counting_shape(blocks_start, blocks_end);
            for (auto b : range)
            {
                hpx::unregister_with_basename(A_block_basename, b);
                hpx::unregister_with_basename(B_block_basename, b);
            }
        }
    }

    return hpx::finalize();
}

int main(int argc, char* argv[])
{
    using namespace hpx::program_options;

    options_description desc_commandline;
    // clang-format off
    desc_commandline.add_options()
        ("matrix_size", value<std::uint64_t>()->default_value(1024),
         "Matrix Size")
        ("iterations", value<std::uint64_t>()->default_value(10),
         "# iterations")
        ("tile_size", value<std::uint64_t>(),
         "Number of tiles to divide the individual matrix blocks for improved "
         "cache and TLB performance")
        ("num_blocks", value<std::uint64_t>()->default_value(1),
         "Number of blocks to divide the individual matrix blocks for "
         "improved cache and TLB performance")
        ( "verbose", "Verbose output")
    ;
    // clang-format on

    // Initialize and run HPX, this example requires to run hpx_main on all
    // localities
    std::vector<std::string> const cfg = {"hpx.run_hpx_main!=1"};

    hpx::init_params init_args;
    init_args.desc_cmdline = desc_commandline;
    init_args.cfg = cfg;

    return hpx::init(argc, argv, init_args);
}

void transpose(hpx::future<sub_block> Af, hpx::future<sub_block> Bf,
    std::uint64_t block_order, std::uint64_t tile_size)
{
    sub_block const A(Af.get());
    sub_block B(Bf.get());

    if (tile_size < block_order)
    {
        for (std::uint64_t i = 0; i < block_order; i += tile_size)
        {
            for (std::uint64_t j = 0; j < block_order; j += tile_size)
            {
                std::uint64_t max_i = (std::min) (block_order, i + tile_size);
                std::uint64_t max_j = (std::min) (block_order, j + tile_size);

                for (std::uint64_t it = i; it != max_i; ++it)
                {
                    for (std::uint64_t jt = j; jt != max_j; ++jt)
                    {
                        B[it + block_order * jt] = A[jt + block_order * it];
                    }
                }
            }
        }
    }
    else
    {
        for (std::uint64_t i = 0; i != block_order; ++i)
        {
            for (std::uint64_t j = 0; j != block_order; ++j)
            {
                B[i + block_order * j] = A[j + block_order * i];
            }
        }
    }
}

double test_results(std::uint64_t order, std::uint64_t block_order,
    std::vector<block>& trans, std::uint64_t blocks_start,
    std::uint64_t blocks_end)
{
    using hpx::transform_reduce;
    using hpx::execution::par;

    // Fill the original matrix, set transpose to known garbage value.
    auto range = hpx::util::counting_shape(blocks_start, blocks_end);
    double errsq = transform_reduce(
        par, std::begin(range), std::end(range), 0.0,
        [](double lhs, double rhs) { return lhs + rhs; },
        [&](std::uint64_t b) -> double {
            sub_block trans_block =
                trans[b].get_sub_block(0, order * block_order).get();
            double errsq = 0.0;
            for (std::uint64_t i = 0; i < order; ++i)
            {
                double col_val = COL_SHIFT * static_cast<double>(i);
                for (std::uint64_t j = 0; j < block_order; ++j)
                {
                    double diff = trans_block[i * block_order + j] -
                        (col_val +
                            ROW_SHIFT *
                                static_cast<double>(b * block_order + j));
                    errsq += diff * diff;
                }
            }
            return errsq;
        });

    if (verbose)
        std::cout << " Squared sum of differences: " << errsq << "\n";

    return errsq;
}
#endif

1	// Copyright (c) 2014 Thomas Heller
2	//
3	// SPDX-License-Identifier: BSL-1.0
4	// Distributed under the Boost Software License, Version 1.0. (See accompanying
5	// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
6
7	#include <hpx/config.hpp>
8	#if !defined(HPX_COMPUTE_DEVICE_CODE)
9	#include <hpx/hpx.hpp>
10	#include <hpx/hpx_init.hpp>
11
12	#include <hpx/algorithm.hpp>
13	#include <hpx/modules/iterator_support.hpp>
14	#include <hpx/numeric.hpp>
15
16	#include <algorithm>
17	#include <cstddef>
18	#include <cstdint>
19	#include <iostream>
20	#include <memory>
21	#include <string>
22	#include <utility>
23	#include <vector>
24
25	#define COL_SHIFT 1000.00 // Constant to shift column index
26	#define ROW_SHIFT 0.001 // Constant to shift row index
27
28	bool verbose = false;
29
30	char const* A_block_basename = "/transpose/block/A";
31	char const* B_block_basename = "/transpose/block/B";
32
33	struct sub_block
34	{
35	enum mode
36	{
37	reference,
38	owning
39	};
40
41	sub_block()
42	: size_(0)	×
43	, data_(nullptr)	×
44	, mode_(reference)	×
45	{
46	}
47
48	sub_block(double* data, std::uint64_t size)
49	: size_(size)	×
50	, data_(data)	×
51	, mode_(reference)	×
52	{
53	}
54
55	~sub_block()	×
56	{
57	if (data_ && mode_ == owning)	×
58	{
59	delete[] data_;	×
60	}
61	}	×
62
63	sub_block(sub_block&& other)
64	: size_(other.size_)	×
65	, data_(other.data_)	×
66	, mode_(other.mode_)	×
67	{
68	if (mode_ == owning)	×
69	{
70	other.data_ = nullptr;	×
71	other.size_ = 0;	×
72	}
73	}
74
75	sub_block& operator=(sub_block&& other)
76	{
77	size_ = other.size_;
78	data_ = other.data_;
79	mode_ = other.mode_;
80	if (mode_ == owning)
81	{
82	other.data_ = nullptr;
83	other.size_ = 0;
84	}
85
86	return *this;
87	}
88
89	double operator[](std::size_t i) const
90	{
91	HPX_ASSERT(data_);
92	return data_[i];	×
93	}
94
95	double& operator[](std::size_t i)
96	{
97	HPX_ASSERT(data_);
98	HPX_ASSERT(mode_ == reference);
99	return data_[i];	×
100	}
101
102	void load(hpx::serialization::input_archive& ar, unsigned)	×
103	{
104	// clang-format off
105	ar & size_;	×
106	// clang-format on
107	if (size_ > 0)	×
108	{
109	data_ = new double[size_];	×
110	hpx::serialization::array<double> arr(data_, size_);	×
111	ar >> arr;	×
112	mode_ = owning;	×
113	}
114	}	×
115
116	void save(hpx::serialization::output_archive& ar, unsigned) const	×
117	{
118	// clang-format off
119	ar & size_;	×
120	// clang-format on
121	if (size_ > 0)	×
122	{
123	hpx::serialization::array<double> arr(data_, size_);	×
124	ar << arr;	×
125	}
126	}	×
127
128	HPX_SERIALIZATION_SPLIT_MEMBER()	×
129
130	std::uint64_t size_;
131	double* data_;
132	mode mode_;
133	};
134
135	struct block_component : hpx::components::component_base<block_component>
136	{
137	block_component() {}
138
139	block_component(std::uint64_t size)	×
140	: data_(size)	×
141	{
142	}	×
143
144	sub_block get_sub_block(std::uint64_t offset, std::uint64_t size)	×
145	{
146	HPX_ASSERT(!data_.empty());
147	return sub_block(&data_[offset], size);	×
148	}
149
150	HPX_DEFINE_COMPONENT_DIRECT_ACTION(block_component, get_sub_block)
151
152	std::vector<double> data_;
153	};
154
155	struct block : hpx::components::client_base<block, block_component>	×
156	{
157	typedef hpx::components::client_base<block, block_component> base_type;
158	block() {}	×
159
160	block(hpx::future<hpx::id_type>&& id)
161	: base_type(std::move(id))	×
162	{
163	}
164
165	block(	×
166	std::uint64_t /* id /, std::uint64_t size, char const /* base_name */)
167	: base_type(hpx::new_<block_component>(hpx::find_here(), size))	×
168	{
169	}	×
170
171	hpx::future<sub_block> get_sub_block(	×
172	std::uint64_t offset, std::uint64_t size)
173	{
174	block_component::get_sub_block_action act;
175	return hpx::async(act, get_id(), offset, size);	×
176	}
177	};
178
179	// The macros below are necessary to generate the code required for exposing
180	// our partition type remotely.
181	//
182	// HPX_REGISTER_COMPONENT() exposes the component creation
183	// through hpx::new_<>().
184	typedef hpx::components::component<block_component> block_component_type;
185	HPX_REGISTER_COMPONENT(block_component_type, block_component)	×
186
187	// HPX_REGISTER_ACTION() exposes the component member function for remote
188	// invocation.
189	typedef block_component::get_sub_block_action get_sub_block_action;
190	HPX_REGISTER_ACTION(get_sub_block_action)	×
191
192	void transpose(hpx::future<sub_block> A, hpx::future<sub_block> B,
193	std::uint64_t block_order, std::uint64_t tile_size);
194	double test_results(std::uint64_t order, std::uint64_t block_order,
195	std::vector<block>& trans, std::uint64_t blocks_start,
196	std::uint64_t blocks_end);
197
198	///////////////////////////////////////////////////////////////////////////////
199	int hpx_main(hpx::program_options::variables_map& vm)	×
200	{
201	{
202	hpx::id_type here = hpx::find_here();	×
203	bool root = here == hpx::find_root_locality();	×
204
205	std::uint64_t num_localities = hpx::get_num_localities().get();	×
206
207	std::uint64_t order = vm["matrix_size"].as<std::uint64_t>();	×
208	std::uint64_t iterations = vm["iterations"].as<std::uint64_t>();	×
209	std::uint64_t num_local_blocks = vm["num_blocks"].as<std::uint64_t>();	×
210	std::uint64_t tile_size = order;	×
211
212	if (vm.count("tile_size"))	×
213	tile_size = vm["tile_size"].as<std::uint64_t>();	×
214
215	verbose = vm.count("verbose") ? true : false;	×
216
217	std::uint64_t bytes =	×
218	static_cast<std::uint64_t>(2 * sizeof(double) * order * order);	×
219
220	std::uint64_t num_blocks = num_localities * num_local_blocks;	×
221
222	std::uint64_t block_order = order / num_blocks;	×
223	std::uint64_t col_block_size = order * block_order;	×
224
225	std::uint64_t id = hpx::get_locality_id();	×
226
227	std::vector<block> A(num_blocks);	×
228	std::vector<block> B(num_blocks);	×
229
230	std::uint64_t blocks_start = id * num_local_blocks;	×
231	std::uint64_t blocks_end = (id + 1) * num_local_blocks;	×
232
233	// First allocate and create our local blocks
234	for (std::uint64_t b = 0; b != num_local_blocks; ++b)	×
235	{
236	std::uint64_t block_idx = b + blocks_start;	×
237	A[block_idx] = block(block_idx, col_block_size, A_block_basename);	×
238	B[block_idx] = block(block_idx, col_block_size, B_block_basename);	×
239	}
240
241	// Find all blocks ...
242	std::cout << "Finding blocks ...\n";	×
243	std::vector<hpx::future<hpx::id_type>> A_ids =
244	hpx::find_all_from_basename(A_block_basename, num_blocks);	×
245	std::vector<hpx::future<hpx::id_type>> B_ids =
246	hpx::find_all_from_basename(B_block_basename, num_blocks);	×
247
248	if (root)	×
249	{
250	std::cout << "Serial Matrix transpose: B = A^T\n"
251	<< "Matrix order = " << order << "\n"
252	<< "Matrix local columns = " << block_order << "\n"	×
253	<< "Number of blocks = " << num_blocks << "\n"	×
254	<< "Number of localities = " << num_localities << "\n";	×
255	if (tile_size < order)	×
256	std::cout << "Tile size = " << tile_size << "\n";	×
257	else
258	std::cout << "Untiled\n";	×
259	std::cout << "Number of iterations = " << iterations << "\n";	×
260	}
261	using hpx::execution::par;
262	using hpx::ranges::for_each;
263
264	// Fill the original matrix, set transpose to known garbage value.
265	auto range = hpx::util::counting_shape(blocks_start, blocks_end);	×
266	for_each(par, range, [&](std::uint64_t b) {	×
267	std::shared_ptr<block_component> A_ptr =
268	hpx::get_ptr<block_component>(A[b].get_id()).get();	×
269	std::shared_ptr<block_component> B_ptr =
270	hpx::get_ptr<block_component>(B[b].get_id()).get();	×
271
272	for (std::uint64_t i = 0; i != order; ++i)	×
273	{
274	for (std::uint64_t j = 0; j != block_order; ++j)	×
275	{
276	double col_val =	×
277	COL_SHIFT * static_cast<double>(b * block_order + j);	×
278	A_ptr->data_[i * block_order + j] =	×
279	col_val + ROW_SHIFT * static_cast<double>(i);	×
280	B_ptr->data_[i * block_order + j] = -1.0;
281	}
282	}	×
283	hpx::register_with_basename(A_block_basename, A[b].get_id(), b);	×
284	hpx::register_with_basename(B_block_basename, B[b].get_id(), b);	×
285	});
286
287	hpx::wait_all(A_ids);	×
288	std::cout << "Finding blocks A ... done\n";
289	hpx::wait_all(B_ids);	×
290	std::cout << "Finding blocks B ... done\n";	×
291	for (std::uint64_t b = 0; b != num_blocks; ++b)
292	{
293	// Convert the gids to clients	×
294	if (b < blocks_start \|\| b >= blocks_end)
295	{	×
296	A[b] = block(std::move(A_ids[b]));	×
297	B[b] = block(std::move(B_ids[b]));
298	}
299	}
300
301	double errsq = 0.0;
302	double avgtime = 0.0;	×
303	double maxtime = 0.0;	×
304	double mintime =
305	366.0 * 24.0 * 3600.0; // set the minimum time to a large value;
306	// one leap year should be enough	×
307	for (std::uint64_t iter = 0; iter < iterations; ++iter)
308	{
309	hpx::chrono::high_resolution_timer t;
310		×
311	auto range = hpx::util::counting_shape(blocks_start, blocks_end);
312		×
313	std::vector<hpx::future<void>> block_futures;	×
314	block_futures.resize(num_local_blocks);
315		×
316	for_each(par, range, [&](std::uint64_t b) {	×
317	std::vector<hpx::future<void>> phase_futures;	×
318	phase_futures.reserve(num_blocks);
319
320	auto phase_range = hpx::util::counting_shape(	×
321	static_cast<std::uint64_t>(0), num_blocks);	×
322	for (std::uint64_t phase : phase_range)
323	{	×
324	std::uint64_t const block_size = block_order * block_order;
325	std::uint64_t const from_block = phase;
326	std::uint64_t const from_phase = b;	×
327	std::uint64_t const A_offset = from_phase * block_size;	×
328	std::uint64_t const B_offset = phase * block_size;
329		×
330	phase_futures.push_back(hpx::dataflow(&transpose,	×
331	A[from_block].get_sub_block(A_offset, block_size),	×
332	B[b].get_sub_block(B_offset, block_size), block_order,	×
333	tile_size));
334	}
335		×
336	block_futures[b - blocks_start] = hpx::when_all(phase_futures);	×
337	});
338
339	hpx::wait_all(block_futures);
340		×
341	double elapsed = t.elapsed();
342		×
343	if (iter > 0 \|\| iterations == 1) // Skip the first iteration
344	{	×
345	avgtime = avgtime + elapsed;	×
346	maxtime = (std::max) (maxtime, elapsed);	×
347	mintime = (std::min) (mintime, elapsed);
348	}
349		×
350	if (root)	×
351	errsq += test_results(
352	order, block_order, B, blocks_start, blocks_end);	×
353	} // end of iter loop
354
355	// Analyze and output results
356
357	double epsilon = 1.e-8;	×
358	if (root)
359	{	×
360	if (errsq < epsilon)
361	{	×
362	std::cout << "Solution validates\n";	×
363	avgtime = avgtime /	×
364	static_cast<double>((std::max) (iterations - 1,	×
365	static_cast<std::uint64_t>(1)));	×
366	std::cout << "Rate (MB/s): "
367	<< 1.e-6 * static_cast<double>(bytes) / mintime
368	<< ", "	×
369	<< "Avg time (s): " << avgtime << ", "
370	<< "Min time (s): " << mintime << ", "	×
371	<< "Max time (s): " << maxtime << "\n";	×
372
373	if (verbose)
374	std::cout << "Squared errors: " << errsq << "\n";
375	}
376	else	×
377	{	×
378	std::cout << "ERROR: Aggregate squared error " << errsq
379	<< " exceeds threshold " << epsilon << "\n";
380	hpx::terminate();
381	}
382	}	×
383	{	×
384	// unregister names...
385	auto range = hpx::util::counting_shape(blocks_start, blocks_end);	×
386	for (auto b : range)	×
387	{
388	hpx::unregister_with_basename(A_block_basename, b);
389	hpx::unregister_with_basename(B_block_basename, b);	×
390	}
391	}	×
392	}
393
394	return hpx::finalize();	×
395	}
396
397	int main(int argc, char* argv[])
398	{	×
399	using namespace hpx::program_options;
400		×
401	options_description desc_commandline;	×
402	// clang-format off
403	desc_commandline.add_options()	×
404	("matrix_size", value<std::uint64_t>()->default_value(1024),
405	"Matrix Size")	×
406	("iterations", value<std::uint64_t>()->default_value(10),
407	"# iterations")
408	("tile_size", value<std::uint64_t>(),	×
409	"Number of tiles to divide the individual matrix blocks for improved "
410	"cache and TLB performance")
411	("num_blocks", value<std::uint64_t>()->default_value(1),	×
412	"Number of blocks to divide the individual matrix blocks for "
413	"improved cache and TLB performance")
414	( "verbose", "Verbose output")
415	;
416	// clang-format on
417		×
418	// Initialize and run HPX, this example requires to run hpx_main on all
419	// localities	×
420	std::vector<std::string> const cfg = {"hpx.run_hpx_main!=1"};	×
421		×
422	hpx::init_params init_args;
423	init_args.desc_cmdline = desc_commandline;	×
424	init_args.cfg = cfg;	×
425
426	return hpx::init(argc, argv, init_args);	×
427	}
428
429	void transpose(hpx::future<sub_block> Af, hpx::future<sub_block> Bf,	×
430	std::uint64_t block_order, std::uint64_t tile_size)	×
431	{
432	sub_block const A(Af.get());	×
433	sub_block B(Bf.get());
434		×
435	if (tile_size < block_order)
436	{	×
437	for (std::uint64_t i = 0; i < block_order; i += tile_size)
438	{	×
439	for (std::uint64_t j = 0; j < block_order; j += tile_size)	×
440	{
441	std::uint64_t max_i = (std::min) (block_order, i + tile_size);	×
442	std::uint64_t max_j = (std::min) (block_order, j + tile_size);
443		×
444	for (std::uint64_t it = i; it != max_i; ++it)
445	{	×
446	for (std::uint64_t jt = j; jt != max_j; ++jt)
447	{
448	B[it + block_order * jt] = A[jt + block_order * it];
449	}
450	}
451	}
452	}
453	}	×
454	else
455	{	×
456	for (std::uint64_t i = 0; i != block_order; ++i)
457	{	×
458	for (std::uint64_t j = 0; j != block_order; ++j)
459	{
460	B[i + block_order * j] = A[j + block_order * i];
461	}	×
462	}
463	}	×
464	}
465
466	double test_results(std::uint64_t order, std::uint64_t block_order,
467	std::vector<block>& trans, std::uint64_t blocks_start,
468	std::uint64_t blocks_end)
469	{
470	using hpx::transform_reduce;
471	using hpx::execution::par;
472
473	// Fill the original matrix, set transpose to known garbage value.
474	auto range = hpx::util::counting_shape(blocks_start, blocks_end);	×
475	double errsq = transform_reduce(	×
476	par, std::begin(range), std::end(range), 0.0,
477	[](double lhs, double rhs) { return lhs + rhs; },	×
478	[&](std::uint64_t b) -> double {
479	sub_block trans_block =	×
480	trans[b].get_sub_block(0, order * block_order).get();
481	double errsq = 0.0;	×
482	for (std::uint64_t i = 0; i < order; ++i)	×
483	{
484	double col_val = COL_SHIFT * static_cast<double>(i);	×
485	for (std::uint64_t j = 0; j < block_order; ++j)	×
486	{	×
487	double diff = trans_block[i * block_order + j] -	×
488	(col_val +	×
489	ROW_SHIFT *
490	static_cast<double>(b * block_order + j));
491	errsq += diff * diff;	×
492	}
493	}
494	return errsq;	×
495	});	×
496
497	if (verbose)	×
498	std::cout << " Squared sum of differences: " << errsq << "\n";
499
500	return errsq;
501	}
502	#endif

STEllAR-GROUP / hpx / #882

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous