9077880084

Committed 14 May 2024 10:32AM UTC coverage: 81.363% (-0.2%) from 81.601%

Build # 9077880084

Build Type

push

github

Committed by

web-flow

Commit Message

Merge pull request #45 from roskakori/39-fix-column-length-to-big

#39 Fix column length to big

Run Details

18 of 21 new or added lines in 6 files covered. (85.71%)

1 existing line in 1 file now uncovered.

943 of 1159 relevant lines covered (81.36%)

0.81 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0

/tests/build_test_data.py

# Copyright (c) 2020, Thomas Aglassinger.
# All rights reserved. Distributed under the BSD License.
import argparse
import logging
import os
from typing import Any, Optional

from pimdb import __version__
from pimdb.common import IMDB_DATASET_TO_KEY_COLUMNS_MAP, GzippedTsvReader, ImdbDataset, TsvDictWriter

# IMDb name ids ("nconst") whose titles seed the filtered test data.
# Entries that are commented out are alternative seeds that can be enabled
# to produce a larger test data set.
TEST_NCONSTS = [
    # "nm0000616",  # Eric Roberts
    # "nm0001376",  # Isabelle Huppert
    # "nm0233757",  # Jaco Van Dormael
    # "nm0567408",  # Hattie McDaniel
    # "nm0707425",  # Rajinikanth
    # "nm1382571",  # Michael Ostrowski
    # "nm1801453",  # Achita Sikamana
    "nm3658287",  # Bianca Bradey
    # "nm5148470",  # Terry DeCastro
]

# Filtered TSV files are written to the "data" folder next to this script
# unless --out says otherwise.
_DEFAULT_TARGET_FOLDER = os.path.join(os.path.split(__file__)[0], "data")

# Logger named after this script (file name without extension).
log = logging.getLogger(f"pimdb.tests.{os.path.splitext(os.path.split(__file__)[1])[0]}")


def _parsed_arguments(args: Optional[list[str]]) -> argparse.Namespace:
    """
    Parse the command line options of this script.

    :param args: the arguments to parse; ``None`` makes argparse fall back to
        the arguments of the current process
    :return: namespace with the attributes ``dataset_folder``,
        ``target_folder`` and ``quick``
    """
    description = (
        "create filtered IMDb datasets that contain only a selected few names, "
        "the titles they contributed to and all other names being part of these titles"
    )
    parser = argparse.ArgumentParser(description=description)

    # Optional positional argument: where the complete gzipped datasets are.
    parser.add_argument(
        "dataset_folder",
        default="",
        help="folder containing gzipped complete IMDb datasets to be used as source; default: current folder",
        metavar="FOLDER",
        nargs="?",
    )
    # Where the filtered TSV files end up.
    parser.add_argument(
        "--out",
        "-o",
        default=_DEFAULT_TARGET_FOLDER,
        dest="target_folder",
        help="folder where to store the filtered TSV files; default: %(default)s",
        metavar="FOLDER",
    )
    parser.add_argument(
        "--quick",
        "-q",
        action="store_true",
        help="use a hardcoded minimal set of source names for quick testing",
    )
    parser.add_argument("--version", action="version", version=f"%(prog)s {__version__}")

    parsed = parser.parse_args(args)
    return parsed


def gzipped_tsv_reader(
    folder: str, imdb_dataset: ImdbDataset, filtered_names_to_values_map: dict[str, Any]
) -> GzippedTsvReader:
    """
    Create a reader for the gzipped TSV file of ``imdb_dataset`` in ``folder``.

    :param folder: folder that contains the file named ``imdb_dataset.filename``
    :param imdb_dataset: the dataset to read
    :param filtered_names_to_values_map: passed on to the reader as
        ``filtered_name_to_values_map``; the callers in this script use it to
        map a column name to the values to keep
    :return: a reader that reports its progress using :func:`_log_progress`
    """
    source_path = os.path.join(folder, imdb_dataset.filename)
    key_columns = IMDB_DATASET_TO_KEY_COLUMNS_MAP[imdb_dataset]
    # NOTE(review): the positional 10 is presumably a progress reporting
    # setting of GzippedTsvReader; confirm against its signature.
    reader = GzippedTsvReader(
        source_path,
        key_columns,
        _log_progress,
        10,
        filtered_name_to_values_map=filtered_names_to_values_map,
    )
    return reader


def _log_progress(processed_count: int, _) -> None:
    """
    Log the number of rows processed so far.

    Passed to ``GzippedTsvReader`` as progress callback; the second argument
    is ignored.
    """
    log.info("  processed %d rows", processed_count)


def extracted_tconsts(
    gzipped_tsv_folder: str,
    dataset: ImdbDataset,
    result_column_name: str,
    filtered_column_name: str,
    filtered_values: set[str],
) -> set[str]:
    """
    Collect the distinct values of one column from the filtered rows of a dataset.

    Despite the name, the result is not limited to tconsts: it holds whatever
    the column ``result_column_name`` contains.

    :param gzipped_tsv_folder: folder containing the gzipped source datasets
    :param dataset: the dataset to read
    :param result_column_name: name of the column whose values are collected
    :param filtered_column_name: name of the column the rows are filtered by
    :param filtered_values: values handed to the reader as filter for
        ``filtered_column_name``
    :return: the set of values found in ``result_column_name``
    """
    column_filter = {filtered_column_name: filtered_values}
    reader = gzipped_tsv_reader(gzipped_tsv_folder, dataset, filtered_names_to_values_map=column_filter)
    collected = set()
    for row in reader.column_names_to_value_maps():
        collected.add(row[result_column_name])
    return collected


def main(args: Optional[list[str]] = None):
    """
    Write filtered copies of all IMDb datasets to the target folder.

    The filter is built in three steps:

    1. collect the titles the names in ``TEST_NCONSTS`` are principals of,
    2. add the parent titles (series) of those titles that are episodes,
    3. collect all names that are principals of any of these titles.

    With ``--quick`` all three steps use hardcoded ids instead of scanning the
    source datasets. After that each dataset in ``ImdbDataset`` is read from
    the source folder and the matching rows are written as uncompressed TSV.

    :param args: command line arguments; ``None`` uses the arguments of the
        current process
    """
    arguments = _parsed_arguments(args)
    source_folder = arguments.dataset_folder

    log.info("collecting principals tconsts to filter for")
    if arguments.quick:
        principal_tconsts = {"tt2535470", "tt3471694", "tt5635850"}
    else:
        # Pass a set to honor the annotation of extracted_tconsts().
        principal_tconsts = extracted_tconsts(
            source_folder, ImdbDataset.TITLE_PRINCIPALS, "tconst", "nconst", set(TEST_NCONSTS)
        )
    log.info("  found %d titles", len(principal_tconsts))

    log.info("collecting episode tconsts to filter for")
    if arguments.quick:
        episode_tconsts = {"tt3456370"}
    else:
        # The "tconst" column of title.episode holds title ids ("tt..."), so
        # it has to be filtered by the titles found above. Filtering it by
        # TEST_NCONSTS (name ids, "nm...") could never match any row and
        # consequently never found any parent title.
        episode_tconsts = extracted_tconsts(
            source_folder, ImdbDataset.TITLE_EPISODE, "parentTconst", "tconst", principal_tconsts
        )
    log.info("  found %d titles", len(episode_tconsts))

    tconsts = principal_tconsts | episode_tconsts

    log.info("collecting nconsts to filter for")
    if arguments.quick:
        nconsts = {"nm3658287", "nm3737504", "nm5713118"}
    else:
        nconsts = extracted_tconsts(source_folder, ImdbDataset.TITLE_PRINCIPALS, "nconst", "tconst", tconsts)
    log.info("  found %d names", len(nconsts))

    for imdb_dataset in ImdbDataset:
        # Drop the last 3 characters of the source file name, presumably the
        # ".gz" suffix, because the target is written uncompressed.
        target_path = os.path.join(arguments.target_folder, imdb_dataset.filename[:-3])
        log.info("writing %s", target_path)
        line_count = 0
        with open(target_path, "w", newline="", encoding="utf-8") as target_file:
            tsv_writer = TsvDictWriter(target_file)
            # Pick the columns to filter by depending on the dataset:
            # title.akas names its title column "titleId", name.basics has
            # only a name filter, title.principals is filtered by both.
            filtered_name_to_values_map = {}
            if imdb_dataset == ImdbDataset.TITLE_AKAS:
                filtered_name_to_values_map["titleId"] = tconsts
            else:
                if imdb_dataset != ImdbDataset.NAME_BASICS:
                    filtered_name_to_values_map["tconst"] = tconsts
                if imdb_dataset in [ImdbDataset.NAME_BASICS, ImdbDataset.TITLE_PRINCIPALS]:
                    filtered_name_to_values_map["nconst"] = nconsts
            reader = gzipped_tsv_reader(source_folder, imdb_dataset, filtered_name_to_values_map)
            for name_to_value_map in reader.column_names_to_value_maps():
                tsv_writer.write(name_to_value_map)
                line_count += 1
        log.info("  lines written: %d", line_count)


# Run as a script: show INFO messages (the progress output of this module)
# and build the test data using the arguments of the current process.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    main()

1	# Copyright (c) 2020, Thomas Aglassinger.
2	# All rights reserved. Distributed under the BSD License.
3	import argparse	×
4	import logging	×
5	import os	×
NEW 6	from typing import Any, Optional	×
7
8	from pimdb import __version__	×
9	from pimdb.common import IMDB_DATASET_TO_KEY_COLUMNS_MAP, GzippedTsvReader, ImdbDataset, TsvDictWriter	×
10
11	TEST_NCONSTS = [	×
12	# "nm0000616", # Eric Roberts
13	# "nm0001376", # Isabelle Huppert
14	# "nm0233757", # Jaco Van Dormael
15	# "nm0567408", # Hattie McDaniel
16	# "nm0707425", # Rajinikanth
17	# "nm1382571", # Michael Ostrowski
18	# "nm1801453", # Achita Sikamana
19	"nm3658287", # Bianca Bradey
20	# "nm5148470", # Terry DeCastro
21	]
22
23	_DEFAULT_TARGET_FOLDER = os.path.join(os.path.dirname(__file__), "data")	×
24
25	log = logging.getLogger("pimdb.tests." + os.path.splitext(os.path.basename(__file__))[0])	×
26
27
NEW 28	def _parsed_arguments(args: Optional[list[str]]) -> argparse.Namespace:	×
29	parser = argparse.ArgumentParser(	×
30	description=(
31	"create filtered IMDb datasets that contain only a selected few names, "
32	"the titles they contributed to and all other names being part of these titles"
33	)
34	)
35	parser.add_argument(	×
36	"dataset_folder",
37	metavar="FOLDER",
38	nargs="?",
39	default="",
40	help="folder containing gzipped complete IMDb datasets to be used as source; default: current folder",
41	)
42	parser.add_argument(	×
43	"--out",
44	"-o",
45	dest="target_folder",
46	metavar="FOLDER",
47	default=_DEFAULT_TARGET_FOLDER,
48	help="folder where to store the filtered TSV files; default: %(default)s",
49	)
50	parser.add_argument(	×
51	"--quick", "-q", action="store_true", help="use a hardcoded minimal set of source names for quick testing"
52	)
53	parser.add_argument("--version", action="version", version=f"%(prog)s {__version__}")	×
54	return parser.parse_args(args)	×
55
56
57	def gzipped_tsv_reader(	×
58	folder: str, imdb_dataset: ImdbDataset, filtered_names_to_values_map: dict[str, Any]
59	) -> GzippedTsvReader:
60	return GzippedTsvReader(	×
61	os.path.join(folder, imdb_dataset.filename),
62	IMDB_DATASET_TO_KEY_COLUMNS_MAP[imdb_dataset],
63	_log_progress,
64	10,
65	filtered_name_to_values_map=filtered_names_to_values_map,
66	)
67
68
69	def _log_progress(processed_count, _):	×
70	log.info(" processed %d rows", processed_count)	×
71
72
73	def extracted_tconsts(	×
74	gzipped_tsv_folder: str,
75	dataset: ImdbDataset,
76	result_column_name: str,
77	filtered_column_name: str,
78	filtered_values: set[str],
79	) -> set[str]:
UNCOV 80	tsv_reader = gzipped_tsv_reader(	×
81	gzipped_tsv_folder,
82	dataset,
83	filtered_names_to_values_map={filtered_column_name: filtered_values},
84	)
85	result = {name_to_value_map[result_column_name] for name_to_value_map in tsv_reader.column_names_to_value_maps()}	×
86	return result	×
87
88
NEW 89	def main(args: Optional[list[str]] = None):	×
90	arguments = _parsed_arguments(args)	×
91	log.info("collecting principals tconsts to filter for")	×
92	principal_tconsts = (	×
93	extracted_tconsts(arguments.dataset_folder, ImdbDataset.TITLE_PRINCIPALS, "tconst", "nconst", TEST_NCONSTS)
94	if not arguments.quick
95	else {"tt2535470", "tt3471694", "tt5635850"}
96	)
97	log.info(" found %d titles", len(principal_tconsts))	×
98	log.info("collecting episode tconsts to filter for")	×
99	episode_tconsts = (	×
100	extracted_tconsts(arguments.dataset_folder, ImdbDataset.TITLE_EPISODE, "parentTconst", "tconst", TEST_NCONSTS)
101	if not arguments.quick
102	else {"tt3456370"}
103	)
104	log.info(" found %d titles", len(episode_tconsts))	×
105	tconsts = principal_tconsts | episode_tconsts	×
106	log.info("collecting nconsts to filter for")	×
107	nconsts = (	×
108	extracted_tconsts(arguments.dataset_folder, ImdbDataset.TITLE_PRINCIPALS, "nconst", "tconst", tconsts)
109	if not arguments.quick
110	else {"nm3658287", "nm3737504", "nm5713118"}
111	)
112	log.info(" found %d names", len(nconsts))	×
113	for imdb_dataset in ImdbDataset:	×
114	target_path = os.path.join(arguments.target_folder, imdb_dataset.filename[:-3])	×
115	log.info("writing %s", target_path)	×
116	line_count = 0	×
117	with open(target_path, "w", newline="", encoding="utf-8") as target_file:	×
118	tsv_writer = TsvDictWriter(target_file)	×
119	filtered_name_to_values_map = {}	×
120	if imdb_dataset == ImdbDataset.TITLE_AKAS:	×
121	filtered_name_to_values_map["titleId"] = tconsts	×
122	else:
123	if imdb_dataset != ImdbDataset.NAME_BASICS:	×
124	filtered_name_to_values_map["tconst"] = tconsts	×
125	if imdb_dataset in [ImdbDataset.NAME_BASICS, ImdbDataset.TITLE_PRINCIPALS]:	×
126	filtered_name_to_values_map["nconst"] = nconsts	×
127	reader = gzipped_tsv_reader(arguments.dataset_folder, imdb_dataset, filtered_name_to_values_map)	×
128	for name_to_value_map in reader.column_names_to_value_maps():	×
129	tsv_writer.write(name_to_value_map)	×
130	line_count += 1	×
131	log.info(" lines written: %d", line_count)	×
132
133
134	if __name__ == "__main__":	×
135	logging.basicConfig(level=logging.INFO)	×
136	main()	×

roskakori / pimdb / 9077880084

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous