16617093228

Committed 30 Jul 2025 08:10AM UTC coverage: 82.68% (-0.004%) from 82.684%

Build # 16617093228

Build Type

Pull #4019

github

Committed by

web-flow

Commit Message

Merge d115e260b into 5f86536fe

Pull Request Pull Request #4019: Refactor read I/O

Coverage Stats

415 of 470 new or added lines in 21 files covered. (88.3%)

96 existing lines in 6 files now uncovered.

45097 of 54544 relevant lines covered (82.68%)

185077.55 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0

/vortex-scan/src/multi_thread.rs

// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

use std::iter;
use std::sync::{Arc, LazyLock};

use futures::{StreamExt, stream};
use tokio::runtime::{Builder, Runtime};
use vortex_array::ArrayRef;
use vortex_array::iter::{ArrayIterator, ArrayIteratorAdapter};
use vortex_error::{VortexExpect, VortexResult, vortex_err};

use crate::ScanBuilder;

/// Process-wide Tokio runtime dedicated to driving the CPU-bound work of multithreaded scans.
///
/// The runtime is built lazily on first use with the multi-threaded (work-stealing) scheduler.
/// It is deliberately private and not configurable, and neither the I/O driver nor the timer
/// driver is enabled on it.
///
/// Initialization panics (via `vortex_expect`) if the runtime cannot be built.
// TODO(ngates): we may want to create an abstract CPU dispatcher (similar to the dispatcher in
//  vortex-io) that can be used to spawn CPU-bound tasks on a thread pool, e.g. using WASM threads.
static CPU_RUNTIME: LazyLock<Runtime> = LazyLock::new(|| {
    let mut runtime_builder = Builder::new_multi_thread();
    // Name the worker threads so they are easy to identify in debuggers and profilers.
    runtime_builder.thread_name("vortex-multithread-scan");
    runtime_builder
        .build()
        .vortex_expect("Failed to create a new Tokio runtime")
});

impl ScanBuilder<ArrayRef> {
    /// Execute the scan on multiple worker threads, yielding the resulting arrays.
    ///
    /// This forwards to [`Self::into_iter_multithread`] with an identity `map_fn` and wraps the
    /// resulting iterator in an [`ArrayIteratorAdapter`] that carries the scan's dtype.
    ///
    /// # Errors
    ///
    /// Returns an error if `self.dtype()` fails or if the underlying call to
    /// [`Self::into_iter_multithread`] fails.
    ///
    /// # Panics
    ///
    /// See [`Self::into_iter_multithread`]; the returned iterator has the same panic behavior.
    pub fn into_array_iter_multithread(self) -> VortexResult<impl ArrayIterator + Send + 'static> {
        let dtype = self.dtype()?;
        Ok(ArrayIteratorAdapter::new(
            dtype,
            self.into_iter_multithread(|a| a)?,
        ))
    }

    /// Execute the scan on multiple worker threads.
    ///
    /// A `map_fn` can be passed to further transform the results of the scan while still running
    /// on the thread pool. Each scan task is spawned onto the internal CPU runtime, and results
    /// are yielded in the order the tasks were produced (the tasks are driven through an ordered
    /// `buffered` stream). Tasks that complete without producing an array are skipped.
    ///
    /// # Errors
    ///
    /// Returns an error if `self.build()` fails. Errors produced by individual scan tasks are not
    /// returned here; they are handed to `map_fn` as `Err` values.
    ///
    /// # Panics
    ///
    /// The returned iterator panics from `next()` if a spawned scan task cannot be joined (for
    /// example because the task itself panicked).
    ///
    /// NOTE(review): `next()` blocks the calling thread via `tokio::task::block_in_place`, which
    /// Tokio documents as panicking when invoked from inside a `current_thread` runtime; confirm
    /// that callers only drive this iterator from a multi-threaded runtime or a plain thread.
    pub fn into_iter_multithread<T, F>(
        self,
        map_fn: F,
    ) -> VortexResult<impl Iterator<Item = T> + Send + 'static>
    where
        T: 'static + Send,
        F: Fn(VortexResult<ArrayRef>) -> T + Send + Sync + 'static,
    {
        let concurrency = self.concurrency;
        let num_workers = CPU_RUNTIME.metrics().num_workers();
        // A buffer limit of zero would stop the `buffered` adapter from ever pulling a task off
        // the underlying stream, so the consumer would block forever. Always keep at least one
        // task in flight, and saturate rather than overflow for very large concurrency values.
        let max_in_flight = num_workers.saturating_mul(concurrency).max(1);

        let tasks = self.build()?;
        // We need to clone and send the map_fn into each task.
        let map_fn = Arc::new(map_fn);
        let handle = CPU_RUNTIME.handle().clone();

        let mut stream = stream::iter(tasks)
            .map(move |task| {
                let map_fn = Arc::clone(&map_fn);
                // We don't _need_ to spawn the work here. But it allows Tokio to make progress on
                // the tasks in the background, even if the consumer thread is not calling
                // poll_next.
                handle.spawn(async move { task.await.transpose().map(|t| map_fn(t)) })
            })
            // TODO(ngates): this is very crude indeed. This buffered call essentially controls how
            //  many splits we have in-flight at any given time. We multiply workers by concurrency
            //  to configure per-thread concurrency, which essentially means each thread can make
            //  progress on one split while waiting for the I/O of another split to complete.
            //  In an ideal world, the number of in-flight tasks would be dynamically adjusted
            //  based on how much I/O the tasks _actually_ require. For example, all pruning tasks
            //  could be spawned immediately since they all use a single segment, this would allow
            //  head-room to run ahead and figure out the I/O demands of subsequent tasks.
            .buffered(max_in_flight);

        // Bridge the async stream into a blocking iterator: each `next()` call blocks the calling
        // thread until the next in-order task result is available.
        Ok(iter::from_fn(move || {
            tokio::task::block_in_place(|| CPU_RUNTIME.handle().block_on(stream.next()))
        })
        // Each item is the join result of a spawned task. A join failure is fatal; a successful
        // join yields `Some(T)` for a produced value or `None` for a task with no output, which
        // `filter_map` drops.
        .filter_map(|result| {
            result
                .map_err(|e| vortex_err!("Failed to join on a spawned scan task {e}"))
                .vortex_expect("Failed to join on a spawned scan task")
        }))
    }
}

1	// SPDX-License-Identifier: Apache-2.0
2	// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4	use std::iter;
5	use std::sync::{Arc, LazyLock};
6
7	use futures::{StreamExt, stream};
8	use tokio::runtime::{Builder, Runtime};
9	use vortex_array::ArrayRef;
10	use vortex_array::iter::{ArrayIterator, ArrayIteratorAdapter};
11	use vortex_error::{VortexExpect, VortexResult, vortex_err};
12
13	use crate::ScanBuilder;
14
15	/// We create an internal Tokio runtime used exclusively for orchestrating work-stealing
16	/// of CPU-bound work for multithreaded scans.
17	///
18	/// It is intentionally not exposed to the user, not configurable, and does not enable I/O or
19	/// timers.
20	///
21	// TODO(ngates): we may want to create an abstract CPU dispatcher (similar to the dispatcher in
22	// vortex-io) that can be used to spawn CPU-bound tasks on a thread pool, e.g. using WASM threads.
NEW 23	static CPU_RUNTIME: LazyLock<Runtime> = LazyLock::new(\|\| {	×
24	Builder::new_multi_thread()	×
25	.thread_name("vortex-multithread-scan")	×
26	.build()	×
27	.vortex_expect("Failed to create a new Tokio runtime")	×
28	});	×
29
30	impl ScanBuilder<ArrayRef> {
31	/// Execute the scan on multiple worker threads.
UNCOV 32	pub fn into_array_iter_multithread(self) -> VortexResult<impl ArrayIterator + Send + 'static> {	×
33	let dtype = self.dtype()?;	×
34	Ok(ArrayIteratorAdapter::new(	×
35	dtype,	×
36	self.into_iter_multithread(\|a\| a)?,	×
37	))
UNCOV 38	}	×
39
40	/// Execute the scan on multiple worker threads.
41	///
42	/// A `map_fn` can be passed to further transform the results of the scan while still running
43	/// on the thread pool.
UNCOV 44	pub fn into_iter_multithread<T, F>(	×
45	self,	×
46	map_fn: F,	×
47	) -> VortexResult<impl Iterator<Item = T> + Send + 'static>	×
48	where	×
49	T: 'static + Send,	×
50	F: Fn(VortexResult<ArrayRef>) -> T + Send + Sync + 'static,	×
51	{
UNCOV 52	let concurrency = self.concurrency;	×
53	let num_workers = CPU_RUNTIME.metrics().num_workers();	×
54
UNCOV 55	let tasks = self.build()?;	×
56	// We need to clone and send the map_fn into each task.
UNCOV 57	let map_fn = Arc::new(map_fn);	×
58	let handle = CPU_RUNTIME.handle().clone();	×
59
UNCOV 60	let mut stream = stream::iter(tasks)	×
61	.map(move \|task\| {	×
62	let map_fn = map_fn.clone();	×
63	// We don't _need_ to spawn the work here. But it allows Tokio to make progress on
64	// the tasks in the background, even if the consumer thread is not calling
65	// poll_next.
66
UNCOV 67	handle.spawn(async move { task.await.transpose().map(\|t\| map_fn(t)) })	×
68	})	×
69	// TODO(ngates): this is very crude indeed. This buffered call essentially controls how
70	// many splits we have in-flight at any given time. We multiply workers by concurrency
71	// to configure per-thread concurrency, which essentially means each thread can make
72	// progress on one split while waiting for the I/O of another split to complete.
73	// In an ideal world, the number of in-flight tasks would be dynamically adjusted
74	// based on how much I/O the tasks _actually_ require. For example, all pruning tasks
75	// could be spawned immediately since they all use a single segment, this would allow
76	// head-room to run ahead and figure out the I/O demands of subsequent tasks.
UNCOV 77	.buffered(num_workers * concurrency);	×
78
UNCOV 79	Ok(iter::from_fn(move \|\| {	×
80	tokio::task::block_in_place(\|\| CPU_RUNTIME.handle().block_on(stream.next()))	×
UNCOV 81	})	×
82	.filter_map(\|result\| {	×
83	result	×
UNCOV 84	.map_err(\|e\| vortex_err!("Failed to join on a spawned scan task {e}"))	×
85	.vortex_expect("Failed to join on a spawned scan task")	×
UNCOV 86	}))	×
87	}	×
88	}

vortex-data / vortex / 16617093228

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous