• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

dimagi / commcare-export / 8559166587

04 Apr 2024 05:58PM UTC coverage: 80.158%. First build
8559166587

Pull #239

github

web-flow
Merge a4fcf3fb1 into f48e7a9a8
Pull Request #239: Improve DET errors

1759 of 2244 branches covered (78.39%)

11 of 16 new or added lines in 1 file covered. (68.75%)

1931 of 2409 relevant lines covered (80.16%)

4.01 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

53.45
/commcare_export/cli.py
1
import argparse
5✔
2
import getpass
5✔
3
import io
5✔
4
import json
5✔
5
import os.path
5✔
6
import sys
5✔
7
import logging
5✔
8
import dateutil.parser
5✔
9
import requests
5✔
10
import sqlalchemy
5✔
11

12
from commcare_export import builtin_queries, excel_query, writers
5✔
13
from commcare_export.checkpoint import CheckpointManagerProvider
5✔
14
from commcare_export.commcare_hq_client import (
5✔
15
    LATEST_KNOWN_VERSION,
16
    CommCareHqClient,
17
    ResourceRepeatException,
18
)
19
from commcare_export.commcare_minilinq import CommCareHqEnv
5✔
20
from commcare_export.env import BuiltInEnv, EmitterEnv, JsonPathEnv
5✔
21
from commcare_export.exceptions import (
5✔
22
    DataExportException,
23
    MissingQueryFileException,
24
)
25
from commcare_export.location_info_provider import LocationInfoProvider
5✔
26
from commcare_export.minilinq import List, MiniLinq
5✔
27
from commcare_export.misc import default_to_json
5✔
28
from commcare_export.repeatable_iterator import RepeatableIterator
5✔
29
from commcare_export.utils import get_checkpoint_manager
5✔
30
from commcare_export.version import __version__
5✔
31
from commcare_export import get_logger, get_error_logger
5✔
32

33
EXIT_STATUS_ERROR = 1
5✔
34
logger = get_logger(__file__)
5✔
35

36
commcare_hq_aliases = {
5✔
37
    'local': 'http://localhost:8000',
38
    'prod': 'https://www.commcarehq.org'
39
}
40

41

42
class Argument(object):
5✔
43

44
    def __init__(self, name, *args, **kwargs):
5✔
45
        self.name = name.replace('-', '_')
5✔
46
        self._args = ['--{}'.format(name)] + list(args)
5✔
47
        self._kwargs = kwargs
5✔
48

49
    @property
5✔
50
    def default(self):
5✔
51
        return self._kwargs.get('default')
5✔
52

53
    def add_to_parser(self, parser, **additional_kwargs):
5✔
54
        additional_kwargs.update(self._kwargs)
×
55
        parser.add_argument(*self._args, **additional_kwargs)
×
56

57

58
CLI_ARGS = [
5✔
59
    Argument(
60
        'version',
61
        default=False,
62
        action='store_true',
63
        help='Print the current version of the commcare-export tool.'
64
    ),
65
    Argument('query', required=False, help='JSON or Excel query file'),
66
    Argument('dump-query', default=False, action='store_true'),
67
    Argument(
68
        'commcare-hq',
69
        default='prod',
70
        help='Base url for the CommCare HQ instance e.g. '
71
        'https://www.commcarehq.org'
72
    ),
73
    Argument('api-version', default=LATEST_KNOWN_VERSION),
74
    Argument('project'),
75
    Argument('username'),
76
    Argument(
77
        'password',
78
        help='Enter password, or if using apikey auth-mode, enter the api key.'
79
    ),
80
    Argument(
81
        'auth-mode',
82
        default='password',
83
        choices=['password', 'apikey'],
84
        help='Use "digest" auth, or "apikey" auth (for two factor enabled '
85
        'domains).'
86
    ),
87
    Argument(
88
        'since',
89
        help='Export all data after this date. Format YYYY-MM-DD or '
90
        'YYYY-MM-DDTHH:mm:SS'
91
    ),
92
    Argument(
93
        'until',
94
        help='Export all data up until this date. Format YYYY-MM-DD or '
95
        'YYYY-MM-DDTHH:mm:SS'
96
    ),
97
    Argument(
98
        'start-over',
99
        default=False,
100
        action='store_true',
101
        help='When saving to a SQL database; the default is to pick up '
102
        'since the last success. This disables that.'
103
    ),
104
    Argument('profile'),
105
    Argument('verbose', default=False, action='store_true'),
106
    Argument(
107
        'output-format',
108
        default='json',
109
        choices=['json', 'csv', 'xls', 'xlsx', 'sql', 'markdown'],
110
        help='Output format'
111
    ),
112
    Argument(
113
        'output',
114
        metavar='PATH',
115
        default='reports.zip',
116
        help='Path to output; defaults to `reports.zip`.'
117
    ),
118
    Argument(
119
        'strict-types',
120
        default=False,
121
        action='store_true',
122
        help="When saving to a SQL database don't allow changing column types "
123
        "once they are created."
124
    ),
125
    Argument(
126
        'missing-value',
127
        default=None,
128
        help="Value to use when a field is missing from the form / case."
129
    ),
130
    Argument(
131
        'batch-size',
132
        default=200,
133
        help="Number of records to process per batch."
134
    ),
135
    Argument(
136
        'checkpoint-key',
137
        help="Use this key for all checkpoints instead of the query file MD5 "
138
        "hash in order to prevent table rebuilds after a query file has "
139
        "been edited."
140
    ),
141
    Argument(
142
        'users',
143
        default=False,
144
        action='store_true',
145
        help="Export a table containing data about this project's mobile "
146
        "workers"
147
    ),
148
    Argument(
149
        'locations',
150
        default=False,
151
        action='store_true',
152
        help="Export a table containing data about this project's locations"
153
    ),
154
    Argument(
155
        'with-organization',
156
        default=False,
157
        action='store_true',
158
        help="Export tables containing mobile worker data and location data "
159
        "and add a commcare_userid field to any exported form or case"
160
    ),
161
    Argument(
162
        'export-root-if-no-subdocument',
163
        default=False,
164
        action='store_true',
165
        help="Use this when you are exporting a nested document e.g. "
166
        "form.form..case, messaging-event.messages.[*] And you want to "
167
        "have a record exported even if the nested document does not "
168
        "exist or is empty.",
169
    ),
170
    Argument(
171
        'no-logfile',
172
        default=False,
173
        help="Specify in order to prevent information being logged to the log file and"
174
             " show all output in the console.",
175
        action='store_true',
176
    ),
177
]
178

179

180
def main(argv):
5✔
181
    parser = argparse.ArgumentParser(
×
182
        'commcare-export', 'Output a customized export of CommCareHQ data.'
183
    )
184
    for arg in CLI_ARGS:
×
185
        arg.add_to_parser(parser)
×
186

187
    args = parser.parse_args(argv)
×
188

NEW
189
    if args.output_format and args.output:
×
NEW
190
        errors = []
×
NEW
191
        errors.extend(validate_output_filename(args.output_format, args.output))
×
NEW
192
        if errors:
×
NEW
193
            raise Exception(f"Could not proceed. Following issues were found: {', '.join(errors)}.")
×
194

195
    if not args.no_logfile:
×
196
        exe_dir = os.path.dirname(sys.executable)
×
197
        log_file = os.path.join(exe_dir, "commcare_export.log")
×
198
        print(f"Printing logs to {log_file}")
×
199
        logging.basicConfig(
×
200
            filename=log_file,
201
            format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
202
            filemode='w',
203
        )
204
        sys.stderr = get_error_logger()
×
205

206
    if args.verbose:
×
207
        logging.basicConfig(
×
208
            level=logging.DEBUG,
209
            format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
210
        )
211
    else:
212
        logging.basicConfig(
×
213
            level=logging.WARN,
214
            format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
215
        )
216

217
    logging.getLogger('alembic').setLevel(logging.WARN)
×
218
    logging.getLogger('backoff').setLevel(logging.FATAL)
×
219
    logging.getLogger('urllib3').setLevel(logging.WARN)
×
220

221
    if args.version:
×
222
        print('commcare-export version {}'.format(__version__))
×
223
        sys.exit(0)
×
224

225
    if not args.project:
×
226
        error_msg = "commcare-export: error: argument --project is required"
×
227
        # output to log file through sys.stderr
228
        print(
×
229
            error_msg,
230
            file=sys.stderr
231
        )
232
        # Output to console for debugging
233
        print(error_msg)
×
234
        sys.exit(1)
×
235

236
    if args.profile:
×
237
        # hotshot is gone in Python 3
238
        import hotshot
×
239
        import hotshot.stats
×
240
        profile = hotshot.Profile(args.profile)
×
241
        profile.start()
×
242

243
    try:
×
244
        print("Running export...")
×
245
        try:
×
246
            exit_code = main_with_args(args)
×
247
            if exit_code > 0:
×
248
                print("Error occurred! See log file for error.")
×
249
            sys.exit(exit_code)
×
250
        except Exception:
×
251
            print("Error occurred! See log file for error.")
×
252
            raise
×
253
    finally:
254
        print("Export finished!")
×
255
        if args.profile:
×
256
            profile.close()
×
257
            stats = hotshot.stats.load(args.profile)
×
258
            stats.strip_dirs()
×
259
            stats.sort_stats('cumulative', 'calls')
×
260
            stats.print_stats(100)
×
261

262

263
def validate_output_filename(output_format, output_filename):
5✔
264
    """
265
    Validate file extensions for csv, xls and xlsx output formats.
266
    Ensure extension unless using sql output_format.
267
    """
268
    errors = []
5✔
269
    if output_format == 'csv' and not output_filename.endswith('.zip'):
5✔
270
        errors.append("For output format as csv, output file name should have extension zip")
5✔
271
    elif output_format == 'xls' and not output_filename.endswith('.xls'):
5✔
272
        errors.append("For output format as xls, output file name should have extension xls")
5✔
273
    elif output_format == 'xlsx' and not output_filename.endswith('.xlsx'):
5✔
274
        errors.append("For output format as xlsx, output file name should have extension xlsx")
5✔
275
    elif output_format != 'sql' and "." not in output_filename:
5✔
276
        errors.append("Missing extension in output file name")
5✔
277
    return errors
5✔
278

279

280
def _get_query(args, writer, column_enforcer=None):
5✔
281
    return _get_query_from_file(
5✔
282
        args.query,
283
        args.missing_value,
284
        writer.supports_multi_table_write,
285
        writer.max_column_length,
286
        writer.required_columns,
287
        column_enforcer,
288
        args.export_root_if_no_subdocument
289
    )
290

291

292
def _get_query_from_file(
5✔
293
    query_arg,
294
    missing_value,
295
    combine_emits,
296
    max_column_length,
297
    required_columns,
298
    column_enforcer,
299
    value_or_root
300
):
301
    if os.path.exists(query_arg):
5✔
302
        if os.path.splitext(query_arg)[1] in ['.xls', '.xlsx']:
5✔
303
            import openpyxl
5✔
304
            workbook = openpyxl.load_workbook(query_arg)
5✔
305
            return excel_query.get_queries_from_excel(
5✔
306
                workbook,
307
                missing_value,
308
                combine_emits,
309
                max_column_length,
310
                required_columns,
311
                column_enforcer,
312
                value_or_root
313
            )
314
        else:
315
            with io.open(query_arg, encoding='utf-8') as fh:
×
316
                return MiniLinq.from_jvalue(json.loads(fh.read()))
×
317

318

319
def get_queries(args, writer, lp, column_enforcer=None):
5✔
320
    query_list = []
5✔
321
    if args.query is not None:
5✔
322
        query = _get_query(args, writer, column_enforcer=column_enforcer)
5✔
323

324
        if not query:
5✔
325
            raise MissingQueryFileException(args.query)
×
326
        query_list.append(query)
5✔
327

328
    if args.users or args.with_organization:
5✔
329
        # Add user data to query
330
        query_list.append(builtin_queries.users_query)
5✔
331

332
    if args.locations or args.with_organization:
5✔
333
        # Add location data to query
334
        query_list.append(builtin_queries.get_locations_query(lp))
5✔
335

336
    return List(query_list) if len(query_list) > 1 else query_list[0]
5✔
337

338

339
def _get_writer(output_format, output, strict_types):
5✔
340
    if output_format == 'xlsx':
×
341
        return writers.Excel2007TableWriter(output)
×
342
    elif output_format == 'xls':
×
343
        return writers.Excel2003TableWriter(output)
×
344
    elif output_format == 'csv':
×
345
        if not output.endswith(".zip"):
×
346
            print(
×
347
                "WARNING: csv output is a zip file, but "
348
                "will be written to %s" % output
349
            )
350
            print(
×
351
                "Consider appending .zip to the file name to avoid confusion."
352
            )
353
        return writers.CsvTableWriter(output)
×
354
    elif output_format == 'json':
×
355
        return writers.JValueTableWriter()
×
356
    elif output_format == 'markdown':
×
357
        return writers.StreamingMarkdownTableWriter(sys.stdout)
×
358
    elif output_format == 'sql':
×
359
        # Output should be a connection URL. Writer had bizarre issues
360
        # so we use a full connection instead of passing in a URL or
361
        # engine.
362
        if output.startswith('mysql'):
×
363
            charset_split = output.split('charset=')
×
364
            if len(charset_split) > 1 and charset_split[1] != 'utf8mb4':
×
365
                raise Exception(
×
366
                    f"The charset '{charset_split[1]}' might cause problems with the export. "
367
                    f"It is recommended that you use 'utf8mb4' instead."
368
                )
369

370
        return writers.SqlTableWriter(output, strict_types)
×
371
    else:
372
        raise Exception("Unknown output format: {}".format(output_format))
×
373

374

375
def get_date_params(args):
5✔
376
    since = dateutil.parser.parse(args.since) if args.since else None
5✔
377
    until = dateutil.parser.parse(args.until) if args.until else None
5✔
378
    return since, until
5✔
379

380

381
def _get_api_client(args, commcarehq_base_url):
5✔
382
    return CommCareHqClient(
5✔
383
        url=commcarehq_base_url,
384
        project=args.project,
385
        username=args.username,
386
        password=args.password,
387
        auth_mode=args.auth_mode,
388
        version=args.api_version
389
    )
390

391

392
def _get_checkpoint_manager(args):
5✔
393
    if not args.users and not args.locations and not os.path.exists(
×
394
        args.query
395
    ):
396
        logger.warning(
×
397
            "Checkpointing disabled for non builtin, "
398
            "non file-based query"
399
        )
400
    elif args.since or args.until:
×
401
        logger.warning(
×
402
            "Checkpointing disabled when using '--since' or '--until'"
403
        )
404
    else:
405
        checkpoint_manager = get_checkpoint_manager(args)
×
406
        checkpoint_manager.create_checkpoint_table()
×
407
        return checkpoint_manager
×
408

409

410
def force_lazy_result(lazy_result):
5✔
411
    if lazy_result is not None:
5✔
412
        if isinstance(lazy_result, RepeatableIterator):
5✔
413
            list(lazy_result) if lazy_result else lazy_result
5✔
414
        else:
415
            for nested_result in lazy_result:
5✔
416
                force_lazy_result(nested_result)
5✔
417

418

419
def evaluate_query(env, query):
5✔
420
    with env:
5✔
421
        try:
5✔
422
            lazy_result = query.eval(env)
5✔
423
            force_lazy_result(lazy_result)
5✔
424
            return 0
5✔
425
        except requests.exceptions.RequestException as e:
5✔
426
            if e.response and e.response.status_code == 401:
×
427
                print(
×
428
                    "\nAuthentication failed. Please check your credentials.",
429
                    file=sys.stderr
430
                )
431
                return EXIT_STATUS_ERROR
×
432
            else:
433
                raise
×
434
        except ResourceRepeatException as e:
5✔
435
            print('Stopping because the export is stuck')
×
436
            print(e.message)
×
437
            print('Try increasing --batch-size to overcome the error')
×
438
            return EXIT_STATUS_ERROR
×
439
        except (
5✔
440
            sqlalchemy.exc.DataError,
441
            sqlalchemy.exc.InternalError,
442
            sqlalchemy.exc.ProgrammingError
443
        ) as e:
444
            print('Stopping because of database error:\n', e)
5✔
445
            return EXIT_STATUS_ERROR
5✔
446
        except KeyboardInterrupt:
×
447
            print('\nExport aborted', file=sys.stderr)
×
448
            return EXIT_STATUS_ERROR
×
449

450

451
def main_with_args(args):
5✔
452
    logger.info("CommCare Export Version {}".format(__version__))
5✔
453
    writer = _get_writer(args.output_format, args.output, args.strict_types)
5✔
454

455
    if args.query is None and args.users is False and args.locations is False:
5✔
456
        print(
×
457
            'At least one the following arguments is required: '
458
            '--query, --users, --locations',
459
            file=sys.stderr
460
        )
461
        return EXIT_STATUS_ERROR
×
462

463
    if not args.username:
5✔
464
        logger.warn("Username not provided")
×
465
        args.username = input('Please provide a username: ')
×
466

467
    if not args.password:
5✔
468
        logger.warn("Password not provided")
×
469
        # Windows getpass does not accept unicode
470
        args.password = getpass.getpass()
×
471

472
    column_enforcer = None
5✔
473
    if args.with_organization:
5✔
474
        column_enforcer = builtin_queries.ColumnEnforcer()
×
475

476
    commcarehq_base_url = commcare_hq_aliases.get(
5✔
477
        args.commcare_hq, args.commcare_hq
478
    )
479
    api_client = _get_api_client(args, commcarehq_base_url)
5✔
480
    lp = LocationInfoProvider(api_client, page_size=args.batch_size)
5✔
481
    try:
5✔
482
        query = get_queries(args, writer, lp, column_enforcer)
5✔
483
    except DataExportException as e:
×
484
        print(e.message, file=sys.stderr)
×
485
        return EXIT_STATUS_ERROR
×
486

487
    if args.dump_query:
5✔
488
        print(json.dumps(query.to_jvalue(), indent=4))
×
489
        return
×
490

491
    checkpoint_manager = None
5✔
492
    if writer.support_checkpoints:
5✔
493
        checkpoint_manager = _get_checkpoint_manager(args)
5✔
494

495
    since, until = get_date_params(args)
5✔
496
    if args.start_over:
5✔
497
        if checkpoint_manager:
5✔
498
            logger.warning(
5✔
499
                'Ignoring all checkpoints and re-fetching all data from '
500
                'CommCare.'
501
            )
502
    elif since:
5✔
503
        logger.debug('Starting from %s', args.since)
5✔
504

505
    cm = CheckpointManagerProvider(checkpoint_manager, since, args.start_over)
5!
506
    static_env = {
5✔
507
        'commcarehq_base_url': commcarehq_base_url,
508
        'get_checkpoint_manager': cm.get_checkpoint_manager,
509
        'get_location_info': lp.get_location_info,
510
        'get_location_ancestor': lp.get_location_ancestor
511
    }
512
    env = (
5✔
513
        BuiltInEnv(static_env)
514
        | CommCareHqEnv(api_client, until=until, page_size=args.batch_size)
515
        | JsonPathEnv({})
516
        | EmitterEnv(writer)
517
    )
518

519
    exit_status = evaluate_query(env, query)
5✔
520

521
    if args.output_format == 'json':
5✔
522
        print(
5✔
523
            json.dumps(
524
                list(writer.tables.values()),
525
                indent=4,
526
                default=default_to_json
527
            )
528
        )
529

530
    return exit_status
5✔
531

532

533
def entry_point():
5✔
534
    main(sys.argv[1:])
×
535

536

537
if __name__ == '__main__':
5✔
538
    entry_point()
×
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc