
my8100 / logparser, build 1006 (push via circleci; committer: web-flow)
Commit: Release v0.8.3 and support Python 3.13 (#30)
01 Jan 2025 01:45PM UTC coverage: 80.811% (-6.6%) from 87.405%

1 of 1 new or added line in 1 file covered (100.0%).
57 existing lines in 4 files now uncovered.
737 of 912 relevant lines covered (80.81%).
0.81 hits per line.

Source file: /logparser/common.py (98.98% covered)

# coding: utf-8
from collections import OrderedDict
import json
from datetime import datetime
import os
import platform
import sys
import re
import time
import traceback


CWD = os.path.dirname(os.path.abspath(__file__))
SETTINGS_PY_PATH = os.path.join(CWD, 'settings.py')

# LINESEP_PATTERN = re.compile(r'%s' % os.linesep)
LINESEP_PATTERN = re.compile(r'\r\n|\n|\r')
LINESEP_BULK_PATTERN = re.compile(r'(?:\r\n|\n|\r)\s*')  # \s includes <space>\t\r\n\f\v

# 2019-01-01 00:00:01
DATETIME_PATTERN = r'\d{4}-\d{2}-\d{2}[ ]\d{2}:\d{2}:\d{2}'  # <space> would be ignored with re.VERBOSE, use [ ] instead

# 2019-01-01 00:00:01 [scrapy.extensions.logstats] INFO:
# Crawled 2318 pages (at 2 pages/min), scraped 68438 items (at 60 items/min)
DATAS_PATTERN = re.compile(r"""\n
                            (?P<time_>%s)[ ].+?
                            Crawled[ ](?P<pages>\d+)[ ]pages[ ]\(at[ ](?P<pages_min>\d+)[ ]pages/min\)
                            ,[ ]scraped[ ](?P<items>\d+)[ ]items[ ]\(at[ ](?P<items_min>\d+)[ ]items/min\)
                            """ % DATETIME_PATTERN, re.VERBOSE)

LOG_CATEGORIES_PATTERN_DICT = dict(
    critical_logs=r'\][ ]CRITICAL:',            # [test] CRITICAL:
    error_logs=r'\][ ]ERROR:',                  # [test] ERROR:
    warning_logs=r'\][ ]WARNING:',              # [test] WARNING:
    redirect_logs=r':[ ]Redirecting[ ]\(',      # DEBUG: Redirecting (302) to <GET
    retry_logs=r'[ ][Rr]etrying[ ]<',           # DEBUG: Retrying <GET      DEBUG: Gave up retrying <GET
    ignore_logs=r':[ ]Ignoring[ ]response[ ]<'  # INFO: Ignoring response <404
)
for k, v in LOG_CATEGORIES_PATTERN_DICT.items():
    p = re.compile(r"""\n
                    ({time_}[ ][^\n]+?{pattern}.*?)                                  # first line (and its details)
                    (?=\r?\n{time_}[ ][^\n]+?(?:DEBUG|INFO|WARNING|ERROR|CRITICAL))  # ?=: would not consume strings
                   """.format(time_=DATETIME_PATTERN, pattern=v), re.VERBOSE | re.DOTALL)
    LOG_CATEGORIES_PATTERN_DICT[k] = p
_odict = OrderedDict()
for k in ['critical_logs', 'error_logs', 'warning_logs', 'redirect_logs', 'retry_logs', 'ignore_logs']:
    _odict.update({k: LOG_CATEGORIES_PATTERN_DICT[k]})
LOG_CATEGORIES_PATTERN_DICT = _odict

# 2019-01-01 00:00:01 [scrapy.extensions.telnet] DEBUG: Telnet console listening on 127.0.0.1:6023
# 2019-01-01 00:00:01 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
# {'downloader/exception_count': 3,
LATEST_MATCHES_PATTERN_DICT = dict(
    scrapy_version=r'Scrapy[ ]\d+\.\d+\.\d+[ ]started',    # Scrapy 1.5.1 started (bot: demo)
    telnet_console=r'Telnet[ ]console[ ]listening[ ]on',   # Telnet console listening on 127.0.0.1:6023
    # Default: 'scrapy' | Overridden settings: {'TELNETCONSOLE_USERNAME': 'usr'}
    telnet_username=r'TELNETCONSOLE_USERNAME\W:.+',
    # Telnet Password: 865bba341ef25552 | Overridden settings: {'TELNETCONSOLE_PASSWORD': 'psw'}
    telnet_password=r'TELNETCONSOLE_PASSWORD\W:.+|Telnet[ ]Password:[ ].+',
    resuming_crawl=r'Resuming[ ]crawl',          # Resuming crawl (675840 requests scheduled)
    latest_offsite=r'Filtered[ ]offsite',        # Filtered offsite request to 'www.baidu.com'
    latest_duplicate=r'Filtered[ ]duplicate',    # Filtered duplicate request: <GET http://httpbin.org/headers>
    latest_crawl=r'Crawled[ ]\(\d+\)',           # Crawled (200) <GET http://httpbin.org/headers> (referer: None)
    # latest_scrape=r'Scraped[ ]from[ ]<',         # Scraped from <200 http://httpbin.org/headers>
    # latest_item=r'^\{.+\}',                      # {'item': 1}  TODO: multilines item
    latest_stat=r'Crawled[ ]\d+[ ]pages[ ]\(at'  # Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min)
)
_odict = OrderedDict()
for k in ['scrapy_version', 'telnet_console', 'telnet_username', 'telnet_password', 'resuming_crawl',
          'latest_offsite', 'latest_duplicate', 'latest_crawl', 'latest_stat']:
    _odict.update({k: LATEST_MATCHES_PATTERN_DICT[k]})
LATEST_MATCHES_PATTERN_DICT = _odict
for k, v in LATEST_MATCHES_PATTERN_DICT.items():
    if k not in ['telnet_username', 'telnet_password']:
        LATEST_MATCHES_PATTERN_DICT[k] = r'^%s[ ].+?%s' % (DATETIME_PATTERN, v)

# 2019-01-01 00:00:01 [scrapy.core.scraper] DEBUG: Scraped from <200 http://httpbin.org/headers>
LATEST_SCRAPE_ITEM_PATTERN = re.compile(r"""\n
                                         ({time_}[ ][^\n]+?{pattern}[^\n]+?)\r?\n({{.*?)
                                         (?=\r?\n{time_}[ ][^\n]+?(?:DEBUG|INFO|WARNING|ERROR|CRITICAL))  # ?=:
                                         """.format(time_=DATETIME_PATTERN, pattern=r':[ ]Scraped[ ]from[ ]<'),
                                         re.VERBOSE | re.DOTALL)

# 2019-01-01 00:00:01 [scrapy.crawler] INFO: Received SIGTERM, shutting down gracefully. Send again to force
# 2019-01-01 00:00:01 [scrapy.core.engine] INFO: Closing spider (shutdown)
# 2019-01-01 00:00:01 [scrapy.crawler] INFO: Received SIGTERM twice, forcing unclean shutdown
SIGTERM_PATTERN = re.compile(r'^%s[ ].+?:[ ](Received[ ]SIG(?:BREAK|INT|TERM)([ ]twice)?),' % DATETIME_PATTERN)

# 'downloader/response_status_count/200': 2,
# 200 301 302 401 403 404 500 503
RESPONSE_STATUS_PATTERN = re.compile(r"'downloader/response_status_count/\d{3}':[ ](?P<count>\d+),")
RESPONSE_STATUS_REDIRECT_PATTERN = re.compile(r"'downloader/response_status_count/3\d{2}':[ ](?P<count>\d+),")

STATS_DUMPED_CATEGORIES_DICT = dict(
    critical_logs='log_count/CRITICAL',
    error_logs='log_count/ERROR',
    warning_logs='log_count/WARNING',
    # redirect_logs= ,
    retry_logs='retry/count',
    ignore_logs='httperror/response_ignored_count',
)

# https://github.com/stummjr/scrapy-fieldstats -> fields_coverage in stats
# 2019-01-01 00:00:01 [scrapy_fieldstats.fieldstats] INFO: Field stats:
# {u'Chinese \u6c49\u5b57 1': '50%', u'Chinese \u6c49\u5b57 2': '50%'}
# 2019-01-01 00:00:01 [scrapy_fieldstats.fieldstats] INFO: Field stats:
# {
    # 'author': {
        # 'name': '100.0%',
        # 'age':  '52.0%'
    # },
    # 'image':  '97.0%',
    # 'title':  '100.0%',
    # 'price':  '92.0%',
    # 'stars':  '47.5%'
# }
# 2019-01-01 00:00:01 [scrapy.core.engine] INFO: Closing spider (finished)
# 2019-01-01 00:00:01 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
# {'downloader/exception_count': 3,
# 'dupefilter/filtered': 1,
# 'fields_coverage': {u'Chinese \u6c49\u5b57 1': '50%',
                    # u'Chinese \u6c49\u5b57 2': '50%'},
# 'finish_reason': 'finished',
# }
# 2019-01-01 00:00:01 [scrapy.core.engine] INFO: Spider closed (finished)
PATTERN_LOG_ENDING = re.compile(r"""
                                (%s)[ ][^\n]+?
                                (Dumping[ ]Scrapy[ ]stats:.*?(\{.+\}).*?
                                |INFO:[ ]Spider[ ]closed.*)
                                """ % DATETIME_PATTERN, re.VERBOSE | re.DOTALL)


class Common(object):
    NA = 'N/A'

    LINESEP_PATTERN = LINESEP_PATTERN
    LINESEP_BULK_PATTERN = LINESEP_BULK_PATTERN

    DATETIME_PATTERN = DATETIME_PATTERN
    DATAS_PATTERN = DATAS_PATTERN
    LOG_CATEGORIES_PATTERN_DICT = LOG_CATEGORIES_PATTERN_DICT
    LATEST_MATCHES_PATTERN_DICT = LATEST_MATCHES_PATTERN_DICT
    LATEST_SCRAPE_ITEM_PATTERN = LATEST_SCRAPE_ITEM_PATTERN

    SIGTERM_PATTERN = SIGTERM_PATTERN
    RESPONSE_STATUS_PATTERN = RESPONSE_STATUS_PATTERN
    RESPONSE_STATUS_REDIRECT_PATTERN = RESPONSE_STATUS_REDIRECT_PATTERN
    STATS_DUMPED_CATEGORIES_DICT = STATS_DUMPED_CATEGORIES_DICT
    PATTERN_LOG_ENDING = PATTERN_LOG_ENDING

    CWD = CWD
    ON_WINDOWS = platform.system() == 'Windows'
    PY2 = sys.version_info.major < 3
    SETTINGS_PY_PATH = SETTINGS_PY_PATH

    @staticmethod
    def get_current_time_timestamp():
        current_timestamp = int(time.time())
        current_time = datetime.fromtimestamp(current_timestamp).strftime('%Y-%m-%d %H:%M:%S')
        return current_time, current_timestamp

    @staticmethod
    def parse_log_path(log_path):
        project, spider, _job = log_path.split(os.sep)[-3:]
        job, ext = os.path.splitext(_job)  # ('job', '') or ('job', '.log')
        return project, spider, job, ext

    def get_ordered_dict(self, adict, source):
        odict = OrderedDict(source=source)
        odict['last_update_time'], odict['last_update_timestamp'] = self.get_current_time_timestamp()
        for key in sorted(adict.keys()):
            odict[key] = adict[key]
        return odict

    @staticmethod
    def parse_crawler_stats(text):
        # 'start_time': datetime.datetime(2019, 3, 9, 13, 55, 24, 601697)
        # "robotstxt/exception_count/<class 'twisted.internet.error.TCPTimedOutError'>": 1,
        # scrapy-crawlera/scrapy_crawlera/middleware.py:
            # self.crawler.stats.inc_value(
                # 'crawlera/response/error/%s' % crawlera_error.decode('utf8'))
        # u"crawlera/response/error/timeout": 1
        # 'items_per_minute': None,
        # 'responses_per_minute': None,
        backup = text
        text = re.sub(r'(datetime.datetime\(.+?\))', r'"\1"', text)
        text = re.sub(r'(".*?)\'(.*?)\'(.*?")', r'\1_\2_\3', text)
        text = re.sub(r"'(.+?)'", r'"\1"', text)
        text = re.sub(r'[bu]"(.+?)"', r'"\1"', text)
        text = re.sub(r': None([,}])', r': null\1', text)
        try:
            return json.loads(text)
        except ValueError as err:
            print(text)
            print(traceback.format_exc())
            # str(err) to avoid TypeError: Object of type JSONDecodeError is not JSON serializable
            return dict(json_loads_error=str(err), stats=backup)

    def update_data_with_crawler_stats(self, data, crawler_stats, update_log_count):
        # 'downloader/response_count': 4,
        # 'downloader/response_status_count/200': 2,
        # 'downloader/response_status_count/302': 1,
        # 'downloader/response_status_count/404': 1,
        # 'finish_reason': 'closespider_timeout',
        # 'item_scraped_count': 2,
        # 'response_received_count': 3,
        data['finish_reason'] = crawler_stats.get('finish_reason', data['finish_reason'])
        data['pages'] = crawler_stats.get('response_received_count', data['pages'])
        data['items'] = crawler_stats.get('item_scraped_count', data['items'])

        if not update_log_count:
            return  # uncovered line in this coverage report
        redirect_count = 0
        for key, value in crawler_stats.items():
            if key.startswith('downloader/response_status_count/3'):
                redirect_count += value
        if redirect_count > 0:
            data['log_categories']['redirect_logs']['count'] = redirect_count

        for level, key in self.STATS_DUMPED_CATEGORIES_DICT.items():
            count = crawler_stats.get(key, 0)
            if count > 0:
                data['log_categories'][level]['count'] = count
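
The example log lines quoted in the comments above can be used to exercise the module-level patterns. A minimal, illustrative sketch (not part of common.py), assuming the package is importable as logparser and using a fabricated log excerpt:

# Illustrative only: fabricated Scrapy log excerpt based on the examples
# quoted in the comments of common.py.
from logparser.common import DATAS_PATTERN, LOG_CATEGORIES_PATTERN_DICT

sample_log = (
    "2019-01-01 00:00:00 [scrapy.core.engine] INFO: Spider opened\n"
    "2019-01-01 00:00:01 [scrapy.extensions.logstats] INFO: Crawled 2318 pages "
    "(at 2 pages/min), scraped 68438 items (at 60 items/min)\n"
    "2019-01-01 00:00:02 [test] ERROR: Something went wrong\n"
    "2019-01-01 00:00:03 [scrapy.core.engine] INFO: Closing spider (finished)\n"
)

# DATAS_PATTERN captures the periodic logstats line into named groups.
for m in DATAS_PATTERN.finditer(sample_log):
    print(m.group('time_'), m.group('pages'), m.group('pages_min'),
          m.group('items'), m.group('items_min'))
# 2019-01-01 00:00:01 2318 2 68438 60

# Each entry of LOG_CATEGORIES_PATTERN_DICT extracts one category of log lines.
print(LOG_CATEGORIES_PATTERN_DICT['error_logs'].findall(sample_log))
# ['2019-01-01 00:00:02 [test] ERROR: Something went wrong']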
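
Similarly, a small sketch of the two parsing helpers on Common, with fabricated inputs; the log path layout and the stats snippet follow the formats shown in the comments above, and since parse_log_path splits on os.sep the separator must match the platform (POSIX assumed here):

# Illustrative only: fabricated inputs for the two Common helpers.
from logparser.common import Common

# <project>/<spider>/<job>.log, as expected by parse_log_path.
print(Common.parse_log_path('/scrapyd/logs/demo/test_spider/2019-01-01_000001.log'))
# ('demo', 'test_spider', '2019-01-01_000001', '.log')

# A trimmed-down "Dumping Scrapy stats" block in Python-repr form.
stats_text = """{'downloader/response_status_count/200': 2,
 'finish_reason': 'finished',
 'item_scraped_count': 2,
 'items_per_minute': None,
 'start_time': datetime.datetime(2019, 3, 9, 13, 55, 24, 601697)}"""
print(Common.parse_crawler_stats(stats_text))
# {'downloader/response_status_count/200': 2, 'finish_reason': 'finished',
#  'item_scraped_count': 2, 'items_per_minute': None,
#  'start_time': 'datetime.datetime(2019, 3, 9, 13, 55, 24, 601697)'}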