• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

alan-turing-institute / ReadabiliPy / 357

31 Mar 2021 - 16:57 coverage: 94.266% (+0.5%) from 93.735%
357

Pull #91

travis-ci

web-flow
Merge 30d915684 into 554327240
Pull Request #91: Improve the check for Node

8 of 9 new or added lines in 3 files covered. (88.89%)

411 of 436 relevant lines covered (94.27%)

2.83 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0
/readabilipy/__main__.py
1
# -*- coding: utf-8 -*-
2

3
"""Command line interface
!
4

5
"""
6

7
import argparse
!
8
import json
!
9

10
from .__version__ import __version__
!
NEW
11
from .simple_json import simple_json_from_html_string, have_node
!
12

13

14
def main():
    """Run the ReadabiliPy command line interface.

    Parses the command line arguments, reads the HTML document named by
    ``--input-file``, passes it to ``simple_json_from_html_string`` together
    with the selected options, and writes the returned article data as JSON
    to ``--output-file``.

    Both files are opened as UTF-8 explicitly: the JSON is dumped with
    ``ensure_ascii=False``, so relying on the platform's default encoding
    could raise ``UnicodeEncodeError`` (or produce non-UTF-8 JSON) on systems
    whose locale encoding is not UTF-8.

    Raises:
        SystemExit: raised by ``argparse`` on invalid arguments, ``--help``
            or ``--version``.
        OSError: if the input file cannot be read or the output file cannot
            be written.
    """
    parser = argparse.ArgumentParser(
        description="Extract article data from a HTML file using either Mozilla's Readability.js package or a simplified python-only alternative."
    )
    parser.add_argument(
        "-i",
        "--input-file",
        required=True,
        help="Path to input file containing HTML.",
    )
    parser.add_argument(
        "-o",
        "--output-file",
        required=True,
        help="Path to file to output the article data to as JSON.",
    )
    parser.add_argument(
        "-c",
        "--content-digests",
        action="store_true",
        help="Add a 'data-content-digest' attribute containing a SHA256-based digest of the element's contents to each HTML element in the plain_content output.",
    )
    parser.add_argument(
        "-n",
        "--node-indexes",
        action="store_true",
        help="Add a 'data-node-index' attribute containing a hierarchical representation of the element's position in the HTML structure to each HTML element in the plain_content output.",
    )
    parser.add_argument(
        "-p",
        "--use-python-parser",
        action="store_true",
        help="Use the pure-python 'plain_html' parser included in this project rather than Mozilla's Readability.js.",
    )
    # NOTE(review): the f-string below is evaluated while the parser is being
    # built, so have_node() runs on every invocation, not only for -V.
    # Confirm that the check is cheap enough for that.
    parser.add_argument(
        "-V",
        "--version",
        help="Show version and exit",
        action="version",
        version=f"{__version__} (Readability.js supported: {'yes' if have_node() else 'no'})",
    )

    args = parser.parse_args()

    # Explicit encoding so decoding does not depend on the platform locale.
    with open(args.input_file, encoding="utf-8") as h:
        html = h.read()

    article = simple_json_from_html_string(
        html,
        content_digests=args.content_digests,
        node_indexes=args.node_indexes,
        # Readability.js is the default; -p opts into the pure-python parser.
        use_readability=(not args.use_python_parser),
    )

    # ensure_ascii=False emits non-ASCII characters verbatim, so the output
    # file must be opened with an encoding that can represent them.
    with open(args.output_file, "w", encoding="utf-8") as j:
        json.dump(article, j, ensure_ascii=False)
!
70

71

72
# Script entry point: invoke the command line interface only when this module
# is executed directly (e.g. via ``python -m``), not when it is imported.
if __name__ == "__main__":
    main()
!
Troubleshooting · Open an Issue · Sales · Support · ENTERPRISE · CAREERS · STATUS
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2023 Coveralls, Inc