• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

roskakori / pimdb / 9077880084

14 May 2024 10:32AM UTC coverage: 81.363% (-0.2%) from 81.601%
9077880084

push

github

web-flow
Merge pull request #45 from roskakori/39-fix-column-length-to-big

#39 Fix column length too big

18 of 21 new or added lines in 6 files covered. (85.71%)

1 existing line in 1 file now uncovered.

943 of 1159 relevant lines covered (81.36%)

0.81 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0
/tests/build_test_data.py
1
# Copyright (c) 2020, Thomas Aglassinger.
2
# All rights reserved. Distributed under the BSD License.
3
import argparse
×
4
import logging
×
5
import os
×
NEW
6
from typing import Any, Optional
×
7

8
from pimdb import __version__
×
9
from pimdb.common import IMDB_DATASET_TO_KEY_COLUMNS_MAP, GzippedTsvReader, ImdbDataset, TsvDictWriter
×
10

11
# IMDb name ids ("nconst") that seed the filtered test data. The commented-out
# entries are kept as ready-made alternatives that can be enabled on demand.
TEST_NCONSTS = [
    # "nm0000616",  # Eric Roberts
    # "nm0001376",  # Isabelle Huppert
    # "nm0233757",  # Jaco Van Dormael
    # "nm0567408",  # Hattie McDaniel
    # "nm0707425",  # Rajinikanth
    # "nm1382571",  # Michael Ostrowski
    # "nm1801453",  # Achita Sikamana
    "nm3658287",  # Bianca Bradey
    # "nm5148470",  # Terry DeCastro
]

# Unless "--out" says otherwise, the filtered TSV files end up in the "data"
# folder located in the same folder as this script.
_DEFAULT_TARGET_FOLDER = os.path.join(os.path.dirname(__file__), "data")

# Logger named after this script without its suffix, below "pimdb.tests".
log = logging.getLogger(f"pimdb.tests.{os.path.splitext(os.path.basename(__file__))[0]}")
×
26

27

NEW
28
def _parsed_arguments(args: Optional[list[str]]) -> argparse.Namespace:
    """
    Parse the command line options of the test data builder.

    ``args`` are the command line arguments without the program name; with
    ``None``, argparse falls back to the arguments of the running process.

    The resulting namespace provides ``dataset_folder``, ``target_folder``
    and ``quick``.
    """
    description = (
        "create filtered IMDb datasets that contain only a selected few names, "
        "the titles they contributed to and all other names being part of these titles"
    )
    parser = argparse.ArgumentParser(description=description)

    # Optional positional argument: an empty folder name resolves relative to
    # the current folder once joined with a dataset filename.
    parser.add_argument(
        "dataset_folder",
        nargs="?",
        default="",
        metavar="FOLDER",
        help="folder containing gzipped complete IMDb datasets to be used as source; default: current folder",
    )
    parser.add_argument(
        "--out",
        "-o",
        default=_DEFAULT_TARGET_FOLDER,
        dest="target_folder",
        metavar="FOLDER",
        help="folder where to store the filtered TSV files; default: %(default)s",
    )
    parser.add_argument(
        "--quick",
        "-q",
        action="store_true",
        help="use a hardcoded minimal set of source names for quick testing",
    )
    parser.add_argument("--version", action="version", version=f"%(prog)s {__version__}")

    return parser.parse_args(args)
×
55

56

57
def gzipped_tsv_reader(
    folder: str, imdb_dataset: ImdbDataset, filtered_names_to_values_map: dict[str, Any]
) -> GzippedTsvReader:
    """
    Create a reader for the gzipped TSV file of ``imdb_dataset`` in ``folder``.

    The reader reports progress through ``_log_progress`` and receives
    ``filtered_names_to_values_map`` as its ``filtered_name_to_values_map``.
    """
    source_path = os.path.join(folder, imdb_dataset.filename)
    key_columns = IMDB_DATASET_TO_KEY_COLUMNS_MAP[imdb_dataset]
    # NOTE(review): the meaning of the positional 10 is defined by
    # GzippedTsvReader; presumably it controls how often progress is
    # reported - confirm against pimdb.common.
    return GzippedTsvReader(
        source_path,
        key_columns,
        _log_progress,
        10,
        filtered_name_to_values_map=filtered_names_to_values_map,
    )
67

68

69
def _log_progress(processed_count: int, _: Any) -> None:
    """Log the number of rows processed so far; the second argument is ignored."""
    log.info("  processed %d rows", processed_count)
×
71

72

73
def extracted_tconsts(
    gzipped_tsv_folder: str,
    dataset: ImdbDataset,
    result_column_name: str,
    filtered_column_name: str,
    filtered_values: set[str],
) -> set[str]:
    """
    Collect the distinct values of ``result_column_name`` from ``dataset``.

    The dataset is read from ``gzipped_tsv_folder`` with a filter on
    ``filtered_column_name`` and ``filtered_values``. Despite the function
    name, the result column does not have to be "tconst".
    """
    column_filter = {filtered_column_name: filtered_values}
    reader = gzipped_tsv_reader(gzipped_tsv_folder, dataset, filtered_names_to_values_map=column_filter)
    return {row[result_column_name] for row in reader.column_names_to_value_maps()}
×
87

88

NEW
89
def _dataset_filter(imdb_dataset: "ImdbDataset", tconsts: set[str], nconsts: set[str]) -> dict[str, set[str]]:
    """Build the map from column name to allowed values used to filter ``imdb_dataset``."""
    if imdb_dataset == ImdbDataset.TITLE_AKAS:
        # This dataset is filtered on "titleId" rather than "tconst".
        return {"titleId": tconsts}
    result = {}
    if imdb_dataset != ImdbDataset.NAME_BASICS:
        result["tconst"] = tconsts
    if imdb_dataset in (ImdbDataset.NAME_BASICS, ImdbDataset.TITLE_PRINCIPALS):
        result["nconst"] = nconsts
    return result


def main(args: Optional[list[str]] = None) -> None:
    """
    Build filtered TSV test data from the complete gzipped IMDb datasets.

    First collect the titles the names in ``TEST_NCONSTS`` contributed to,
    the parent titles of those that are episodes, and all names that are
    principals of any of these titles. Then write one filtered TSV file per
    IMDb dataset to the target folder. With ``--quick``, hardcoded ids are
    used instead of scanning the source datasets for them.

    ``args`` are the command line arguments without the program name;
    ``None`` uses the arguments of the running process.
    """
    arguments = _parsed_arguments(args)
    source_folder = arguments.dataset_folder
    is_quick = arguments.quick

    log.info("collecting principals tconsts to filter for")
    if is_quick:
        principal_tconsts = {"tt2535470", "tt3471694", "tt5635850"}
    else:
        # Convert to a set to match the declared parameter type.
        principal_tconsts = extracted_tconsts(
            source_folder, ImdbDataset.TITLE_PRINCIPALS, "tconst", "nconst", set(TEST_NCONSTS)
        )
    log.info("  found %d titles", len(principal_tconsts))

    log.info("collecting episode tconsts to filter for")
    if is_quick:
        episode_tconsts = {"tt3456370"}
    else:
        # Filter the episodes by the titles found above. Filtering the
        # "tconst" column by name ids can never match, which would leave
        # the parent titles of episodes out of the test data.
        episode_tconsts = extracted_tconsts(
            source_folder, ImdbDataset.TITLE_EPISODE, "parentTconst", "tconst", principal_tconsts
        )
    log.info("  found %d titles", len(episode_tconsts))

    tconsts = principal_tconsts | episode_tconsts

    log.info("collecting nconsts to filter for")
    if is_quick:
        nconsts = {"nm3658287", "nm3737504", "nm5713118"}
    else:
        nconsts = extracted_tconsts(source_folder, ImdbDataset.TITLE_PRINCIPALS, "nconst", "tconst", tconsts)
    log.info("  found %d names", len(nconsts))

    # Ensure a custom "--out" folder exists before writing to it; an empty
    # folder name refers to the current folder and needs no creation.
    if arguments.target_folder:
        os.makedirs(arguments.target_folder, exist_ok=True)

    for imdb_dataset in ImdbDataset:
        # Drop the last 3 characters of the source filename, presumably its
        # ".gz" suffix, to get the name of the uncompressed target file.
        target_path = os.path.join(arguments.target_folder, imdb_dataset.filename[:-3])
        log.info("writing %s", target_path)
        line_count = 0
        with open(target_path, "w", newline="", encoding="utf-8") as target_file:
            tsv_writer = TsvDictWriter(target_file)
            filtered_name_to_values_map = _dataset_filter(imdb_dataset, tconsts, nconsts)
            reader = gzipped_tsv_reader(source_folder, imdb_dataset, filtered_name_to_values_map)
            for name_to_value_map in reader.column_names_to_value_maps():
                tsv_writer.write(name_to_value_map)
                line_count += 1
        log.info("  lines written: %d", line_count)
×
132

133

134
if __name__ == "__main__":
    # Make the INFO level progress messages visible when run as a script.
    logging.basicConfig(level=logging.INFO)
    main()
×
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc