• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

llnl / dftracer-utils / 28693295402

04 Jul 2026 03:17AM UTC coverage: 52.408% (+0.1%) from 52.278%
28693295402

push

github

hariharan-devarajan
feat: silence noisy warnings on aarch64

37318 of 92666 branches covered (40.27%)

Branch coverage included in aggregate %.

33462 of 42389 relevant lines covered (78.94%)

20557.64 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

40.34
/src/dftracer/utils/python/schema_reconcile.cpp
1
#include <dftracer/utils/core/common/config.h>
2
#ifdef DFTRACER_UTILS_ENABLE_ARROW
3

4
#include <dftracer/utils/python/schema_reconcile.h>
5

6
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>
9

10
namespace dftracer::utils::python {
11

12
namespace {
13

14
// Null-tolerant C-string equality. Two null pointers compare equal, a null
// and a non-null pointer compare unequal, and two non-null pointers are equal
// when they alias each other or point at identical characters.
bool cstr_eq(const char *a, const char *b) {
    if (a == nullptr || b == nullptr) return a == b;
    return a == b || std::strcmp(a, b) == 0;
}
372✔
19

20
// Unknown formats fall back to NA so we can still emit a safe null column.
21
ArrowType type_from_format(const ArrowSchema *s) {
72✔
22
    if (!s || !s->format) return NANOARROW_TYPE_NA;
72!
23
    const char *f = s->format;
72✔
24
    if (cstr_eq(f, "n")) return NANOARROW_TYPE_NA;
72!
25
    if (cstr_eq(f, "b")) return NANOARROW_TYPE_BOOL;
72!
26
    if (cstr_eq(f, "c")) return NANOARROW_TYPE_INT8;
72!
27
    if (cstr_eq(f, "s")) return NANOARROW_TYPE_INT16;
72!
28
    if (cstr_eq(f, "i")) return NANOARROW_TYPE_INT32;
72!
29
    if (cstr_eq(f, "l")) return NANOARROW_TYPE_INT64;
72✔
30
    if (cstr_eq(f, "C")) return NANOARROW_TYPE_UINT8;
48!
31
    if (cstr_eq(f, "S")) return NANOARROW_TYPE_UINT16;
48!
32
    if (cstr_eq(f, "I")) return NANOARROW_TYPE_UINT32;
48!
33
    if (cstr_eq(f, "L")) return NANOARROW_TYPE_UINT64;
48!
34
    if (cstr_eq(f, "f")) return NANOARROW_TYPE_FLOAT;
48!
35
    if (cstr_eq(f, "g")) return NANOARROW_TYPE_DOUBLE;
48✔
36
    if (cstr_eq(f, "u")) return NANOARROW_TYPE_STRING;
24!
37
    if (cstr_eq(f, "z")) return NANOARROW_TYPE_BINARY;
×
38
    if (cstr_eq(f, "U")) return NANOARROW_TYPE_LARGE_STRING;
×
39
    if (cstr_eq(f, "Z")) return NANOARROW_TYPE_LARGE_BINARY;
×
40
    return NANOARROW_TYPE_NA;
×
41
}
36✔
42

43
int build_null_array(const ArrowSchema *child_schema, int64_t length,
72✔
44
                     ArrowArray *out) {
45
    ArrowError err;
46
    ArrowErrorInit(&err);
72✔
47
    ArrowType t = type_from_format(child_schema);
72✔
48
    if (ArrowArrayInitFromType(out, t) != NANOARROW_OK) return -1;
72!
49
    if (ArrowArrayStartAppending(out) != NANOARROW_OK) return -1;
72!
50
    if (ArrowArrayAppendNull(out, length) != NANOARROW_OK) return -1;
72!
51
    if (ArrowArrayFinishBuildingDefault(out, &err) != NANOARROW_OK) return -1;
72!
52
    return 0;
72✔
53
}
36✔
54

55
// Appends `in` to `out` with JSON string escaping applied. The surrounding
// double quotes are NOT added. Quote, backslash, newline, carriage return and
// tab use their two-character escapes; any other byte below 0x20 becomes a
// \u00XX sequence; every remaining byte is copied through unchanged.
void json_escape(std::string_view in, std::string &out) {
    for (const char ch : in) {
        const char *short_escape = nullptr;
        if (ch == '"') {
            short_escape = "\\\"";
        } else if (ch == '\\') {
            short_escape = "\\\\";
        } else if (ch == '\n') {
            short_escape = "\\n";
        } else if (ch == '\r') {
            short_escape = "\\r";
        } else if (ch == '\t') {
            short_escape = "\\t";
        }
        if (short_escape != nullptr) {
            out.append(short_escape);
            continue;
        }

        const auto byte = static_cast<unsigned char>(ch);
        if (byte >= 0x20) {
            out.push_back(ch);
            continue;
        }

        // Remaining control characters have no short form in JSON.
        char hex[8];
        std::snprintf(hex, sizeof(hex), "\\u%04x", static_cast<int>(byte));
        out.append(hex);
    }
}
×
86

87
// `view` is prepared once per column (init + SetArray) by the caller; a null
88
// view signals a column that failed to bind and always emits JSON null.
89
void append_json_scalar(const ArrowArrayView *view, ArrowType t, int64_t row,
×
90
                        std::string &out) {
91
    if (!view || ArrowArrayViewIsNull(view, row)) {
×
92
        out.append("null");
×
93
        return;
×
94
    }
95
    switch (t) {
×
96
        case NANOARROW_TYPE_BOOL:
97
            out.append(ArrowArrayViewGetIntUnsafe(view, row) ? "true"
×
98
                                                             : "false");
99
            break;
×
100
        case NANOARROW_TYPE_INT8:
101
        case NANOARROW_TYPE_INT16:
102
        case NANOARROW_TYPE_INT32:
103
        case NANOARROW_TYPE_INT64: {
104
            char buf[32];
105
            std::snprintf(
×
106
                buf, sizeof(buf), "%lld",
107
                static_cast<long long>(ArrowArrayViewGetIntUnsafe(view, row)));
×
108
            out.append(buf);
×
109
            break;
×
110
        }
111
        case NANOARROW_TYPE_UINT8:
112
        case NANOARROW_TYPE_UINT16:
113
        case NANOARROW_TYPE_UINT32:
114
        case NANOARROW_TYPE_UINT64: {
115
            char buf[32];
116
            std::snprintf(buf, sizeof(buf), "%llu",
×
117
                          static_cast<unsigned long long>(
118
                              ArrowArrayViewGetUIntUnsafe(view, row)));
×
119
            out.append(buf);
×
120
            break;
×
121
        }
122
        case NANOARROW_TYPE_FLOAT:
123
        case NANOARROW_TYPE_DOUBLE: {
124
            char buf[32];
125
            std::snprintf(buf, sizeof(buf), "%g",
×
126
                          ArrowArrayViewGetDoubleUnsafe(view, row));
127
            out.append(buf);
×
128
            break;
×
129
        }
130
        case NANOARROW_TYPE_STRING:
131
        case NANOARROW_TYPE_LARGE_STRING: {
132
            auto sv = ArrowArrayViewGetStringUnsafe(view, row);
×
133
            out.push_back('"');
×
134
            json_escape(std::string_view(sv.data, sv.size_bytes), out);
×
135
            out.push_back('"');
×
136
            break;
×
137
        }
138
        default:
139
            out.append("null");
×
140
    }
141
}
142

143
}  // namespace
144

145
// Out-of-line defaulted default constructor.
SchemaReconciler::SchemaReconciler() = default;
78!
146

147
bool SchemaReconciler::merge(const ArrowSchema *incoming) {
132✔
148
    if (finalized_ || !incoming) return false;
132!
149
    bool added = false;
132✔
150
    for (int64_t i = 0; i < incoming->n_children; ++i) {
1,212✔
151
        const ArrowSchema *child = incoming->children[i];
1,080✔
152
        if (!child || !child->name) continue;
1,418!
153
        std::string name(child->name);
1,080!
154
        if (name == EXTRA_COLUMN_NAME) continue;  // reserved
1,080!
155
        if (name_to_idx_.count(name)) continue;
1,080!
156
        nanoarrow::UniqueSchema copy;
404!
157
        if (ArrowSchemaDeepCopy(child, copy.get()) != NANOARROW_OK) {
404!
158
            last_error_ = "schema deep-copy failed while merging";
×
159
            return added;
×
160
        }
161
        int64_t idx = static_cast<int64_t>(names_.size());
404✔
162
        names_.push_back(name);
404!
163
        child_schemas_.push_back(std::move(copy));
404!
164
        name_to_idx_.emplace(std::move(name), idx);
404!
165
        added = true;
404✔
166
    }
1,080!
167
    return added;
132✔
168
}
66✔
169

170
int SchemaReconciler::finalize() {
50✔
171
    if (finalized_) return 0;
50!
172
    int64_t n = static_cast<int64_t>(child_schemas_.size()) + 1;
50✔
173
    ArrowSchemaInit(locked_schema_.get());
50✔
174
    if (ArrowSchemaSetTypeStruct(locked_schema_.get(), n) != NANOARROW_OK) {
50!
175
        last_error_ = "failed to initialize union struct schema";
×
176
        return -1;
×
177
    }
178
    for (size_t i = 0; i < child_schemas_.size(); ++i) {
454✔
179
        nanoarrow::UniqueSchema tmp;
404✔
180
        if (ArrowSchemaDeepCopy(child_schemas_[i].get(), tmp.get()) !=
404!
181
            NANOARROW_OK) {
182
            last_error_ = "failed to deep-copy union child";
×
183
            return -1;
×
184
        }
185
        ArrowSchemaMove(tmp.get(), locked_schema_->children[i]);
404!
186
    }
404!
187
    ArrowSchema *extra = locked_schema_->children[child_schemas_.size()];
50✔
188
    if (ArrowSchemaSetType(extra, NANOARROW_TYPE_STRING) != NANOARROW_OK) {
50!
189
        last_error_ = "failed to set _extra column type";
×
190
        return -1;
×
191
    }
192
    if (ArrowSchemaSetName(extra, EXTRA_COLUMN_NAME) != NANOARROW_OK) {
50✔
193
        last_error_ = "failed to name _extra column";
×
194
        return -1;
×
195
    }
196
    finalized_ = true;
50✔
197
    return 0;
50✔
198
}
25✔
199

200
// Writes an independent deep copy of the locked union schema into `out`.
// Returns 0 on success. Returns -1, with last_error_ set, when called before
// finalize() or when the deep copy fails; `out` is not written in either
// failure case.
int SchemaReconciler::copy_schema(ArrowSchema *out) const {
    if (finalized_) {
        // Copy into a scratch schema first so a failed copy never reaches
        // `out`.
        nanoarrow::UniqueSchema clone;
        if (ArrowSchemaDeepCopy(locked_schema_.get(), clone.get()) ==
            NANOARROW_OK) {
            ArrowSchemaMove(clone.get(), out);
            return 0;
        }
        last_error_ = "failed to deep-copy locked schema";
        return -1;
    }
    last_error_ = "copy_schema called before finalize";
    return -1;
}
50✔
213

214
int SchemaReconciler::reconcile(const ArrowSchema *in_schema,
128✔
215
                                ArrowArray *in_array, ArrowArray *out) const {
216
    if (!finalized_) {
128!
217
        last_error_ = "reconcile called before finalize";
×
218
        return -1;
×
219
    }
220
    if (!in_schema || !in_array || !out) return -1;
128!
221

222
    int64_t num_rows = in_array->length;
128✔
223

224
    // Initialize out as a struct matching the locked schema. This allocates
225
    // children of the right types; we'll populate them below.
226
    ArrowError err;
227
    ArrowErrorInit(&err);
128✔
228
    if (ArrowArrayInitFromSchema(out, locked_schema_.get(), &err) !=
128!
229
        NANOARROW_OK) {
230
        last_error_ = "ArrowArrayInitFromSchema failed for reconciled array";
×
231
        return -1;
×
232
    }
233

234
    // Build: input-name -> input-child-index
235
    std::unordered_map<std::string, int64_t> in_idx;
128✔
236
    in_idx.reserve(static_cast<size_t>(in_schema->n_children));
128!
237
    for (int64_t i = 0; i < in_schema->n_children; ++i) {
1,176✔
238
        const ArrowSchema *c = in_schema->children[i];
1,048✔
239
        if (c && c->name) in_idx.emplace(c->name, i);
1,048!
240
    }
524✔
241

242
    // For each known union column (all except the final _extra), try to take
243
    // it from the input batch. If missing, null-pad.
244
    int64_t n_known = num_known_columns();
128!
245
    for (int64_t i = 0; i < n_known; ++i) {
1,248✔
246
        const std::string &name = names_[static_cast<size_t>(i)];
1,120✔
247
        auto it = in_idx.find(name);
1,120!
248
        if (it != in_idx.end()) {
1,120✔
249
            // Release the pre-initialized placeholder child and move the
250
            // input child into its slot (zero copy; release of the input
251
            // goes null after the move).
252
            ArrowArray *slot = out->children[i];
1,048✔
253
            if (slot->release) slot->release(slot);
1,048!
254
            ArrowArrayMove(in_array->children[it->second], slot);
1,048!
255
        } else {
524✔
256
            ArrowArray *slot = out->children[i];
72✔
257
            if (slot->release) slot->release(slot);
72!
258
            if (build_null_array(locked_schema_->children[i], num_rows, slot) !=
72!
259
                0) {
260
                last_error_ = "failed to build null column for missing field";
×
261
                return -1;
×
262
            }
263
        }
264
    }
560✔
265

266
    // Find input children whose names aren't in the union: these feed _extra.
267
    std::vector<int64_t> unknown_in;
128✔
268
    for (int64_t i = 0; i < in_schema->n_children; ++i) {
1,176✔
269
        const ArrowSchema *c = in_schema->children[i];
1,048✔
270
        if (!c || !c->name) continue;
1,048!
271
        if (!name_to_idx_.count(c->name)) unknown_in.push_back(i);
1,048!
272
    }
524✔
273

274
    // Build the _extra column. Fast path: no unknowns -> all nulls.
275
    ArrowArray *extra_slot = out->children[n_known];
128✔
276
    if (extra_slot->release) extra_slot->release(extra_slot);
128!
277
    if (unknown_in.empty()) {
128!
278
        if (ArrowArrayInitFromType(extra_slot, NANOARROW_TYPE_STRING) !=
128!
279
            NANOARROW_OK) {
280
            last_error_ = "failed to init null _extra column";
×
281
            return -1;
×
282
        }
283
        if (ArrowArrayStartAppending(extra_slot) != NANOARROW_OK ||
128!
284
            ArrowArrayAppendNull(extra_slot, num_rows) != NANOARROW_OK ||
256!
285
            ArrowArrayFinishBuildingDefault(extra_slot, &err) != NANOARROW_OK) {
128!
286
            last_error_ = "failed to append nulls to _extra";
×
287
            return -1;
×
288
        }
289
    } else {
64✔
290
        // Slow path: JSON-encode unknown fields per row.
291
        if (ArrowArrayInitFromType(extra_slot, NANOARROW_TYPE_STRING) !=
×
292
            NANOARROW_OK) {
293
            last_error_ = "failed to init string _extra column";
×
294
            return -1;
×
295
        }
296
        if (ArrowArrayStartAppending(extra_slot) != NANOARROW_OK) {
×
297
            last_error_ = "failed to start appending to _extra";
×
298
            return -1;
×
299
        }
300
        // Bind one ArrowArrayView per unknown column once, then index by row.
301
        // A column that fails to init/bind is marked invalid (emits null).
302
        struct UnknownColumn {
303
            ArrowArrayView view;
304
            ArrowType type = NANOARROW_TYPE_NA;
305
            bool valid = false;
306
        };
307
        std::vector<UnknownColumn> unknown_views(unknown_in.size());
×
308
        for (size_t k = 0; k < unknown_in.size(); ++k) {
×
309
            int64_t u = unknown_in[k];
×
310
            const ArrowSchema *cs = in_schema->children[u];
×
311
            const ArrowArray *ca = in_array->children[u];
×
312
            if (!cs || !ca || !cs->name) continue;
×
313
            UnknownColumn &uc = unknown_views[k];
×
314
            uc.type = type_from_format(cs);
×
315
            ArrowArrayViewInitFromType(&uc.view, uc.type);
×
316
            ArrowError verr;
317
            ArrowErrorInit(&verr);
×
318
            if (ArrowArrayViewSetArray(&uc.view, ca, &verr) == NANOARROW_OK) {
×
319
                uc.valid = true;
×
320
            } else {
321
                ArrowArrayViewReset(&uc.view);
×
322
            }
323
        }
324

325
        std::string buf;
×
326
        for (int64_t row = 0; row < num_rows; ++row) {
×
327
            buf.clear();
×
328
            buf.push_back('{');
×
329
            bool first = true;
×
330
            for (size_t k = 0; k < unknown_in.size(); ++k) {
×
331
                int64_t u = unknown_in[k];
×
332
                const ArrowSchema *cs = in_schema->children[u];
×
333
                const ArrowArray *ca = in_array->children[u];
×
334
                if (!cs || !ca || !cs->name) continue;
×
335
                if (!first) buf.push_back(',');
×
336
                first = false;
×
337
                buf.push_back('"');
×
338
                json_escape(cs->name, buf);
×
339
                buf.append("\":");
×
340
                const UnknownColumn &uc = unknown_views[k];
×
341
                append_json_scalar(uc.valid ? &uc.view : nullptr, uc.type, row,
×
342
                                   buf);
343
            }
344
            buf.push_back('}');
×
345
            ArrowStringView sv{buf.data(), static_cast<int64_t>(buf.size())};
×
346
            if (ArrowArrayAppendString(extra_slot, sv) != NANOARROW_OK) {
×
347
                last_error_ = "failed to append _extra row";
×
348
                for (auto &uc : unknown_views) {
×
349
                    if (uc.valid) ArrowArrayViewReset(&uc.view);
×
350
                }
351
                return -1;
×
352
            }
353
        }
354
        for (auto &uc : unknown_views) {
×
355
            if (uc.valid) ArrowArrayViewReset(&uc.view);
×
356
        }
357
        if (ArrowArrayFinishBuildingDefault(extra_slot, &err) != NANOARROW_OK) {
×
358
            last_error_ = "failed to finish _extra column";
×
359
            return -1;
×
360
        }
361
    }
×
362

363
    out->length = num_rows;
128✔
364
    out->null_count = 0;
128✔
365
    return 0;
128✔
366
}
128✔
367

368
}  // namespace dftracer::utils::python
369

370
#endif  // DFTRACER_UTILS_ENABLE_ARROW
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc