chaoss / grimoirelab-elk · build 7714832074 (push, github, web-flow)

30 Jan 2024 05:08PM UTC · coverage: 85.539% · Remained the same
Merge branch 'ci-install-pkg' of 'https://github.com/jjmerchante/grimoirelab-elk'
Merges #1133 · Closes #1133

7459 of 8720 relevant lines covered (85.54%) · 5.07 hits per line

Source file: /grimoire_elk/enriched/githubql.py (89.3% covered)

# -*- coding: utf-8 -*-
#
# Copyright (C) 2015-2023 Bitergia
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Authors:
#   Valerio Cosentino <valcos@bitergia.com>
#   Miguel Ángel Fernández <mafesan@bitergia.com>
#   Quan Zhou <quan@bitergia.com>
#

import logging
import re

from elasticsearch import Elasticsearch as ES, RequestsHttpConnection

from .enrich import Enrich, metadata
from .utils import anonymize_url, get_time_diff_days
from ..elastic_mapping import Mapping as BaseMapping

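# Event type groups handled by this enricher; each group is processed by a
# dedicated branch of __get_rich_event() below.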
GITHUB = 'https://github.com/'
LABEL_EVENTS = ['LabeledEvent', 'UnlabeledEvent']
PROJECT_EVENTS = ['AddedToProjectEvent', 'MovedColumnsInProjectEvent', 'RemovedFromProjectEvent']
REFERENCE_EVENTS = ['CrossReferencedEvent']
CLOSED_EVENTS = ['ClosedEvent']
MERGED_EVENTS = ['MergedEvent']
PULL_REQUEST_REVIEW_EVENTS = ['PullRequestReview']

logger = logging.getLogger(__name__)


class Mapping(BaseMapping):

    @staticmethod
    def get_elastic_mappings(es_major):
        """Get Elasticsearch mapping.

        geopoints type is not created in dynamic mapping

        :param es_major: major version of Elasticsearch, as string
        :returns:        dictionary with a key, 'items', with the mapping
        """

        mapping = """
        {
            "properties": {
               "issue_state": {
                   "type": "keyword"
               },
               "title_analyzed": {
                 "type": "text",
                 "index": true
               }
            }
        }
        """

        return {"items": mapping}


class GitHubQLEnrich(Enrich):

    mapping = Mapping

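    # Roles resolved to SortingHat identities for every enriched event:
    # 'actor' performed the event, 'reporter' opened the issue, and
    # 'submitter' authored the pull request that closed it.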
    event_roles = ['actor', 'reporter', 'submitter']

    def __init__(self, db_sortinghat=None, json_projects_map=None,
                 db_user='', db_password='', db_host='', db_path=None,
                 db_port=None, db_ssl=False, db_verify_ssl=True, db_tenant=None):
        super().__init__(db_sortinghat=db_sortinghat, json_projects_map=json_projects_map,
                         db_user=db_user, db_password=db_password, db_host=db_host,
                         db_port=db_port, db_path=db_path, db_ssl=db_ssl, db_verify_ssl=db_verify_ssl,
                         db_tenant=db_tenant)

        self.studies = [self.enrich_duration_analysis, self.enrich_reference_analysis]

    def set_elastic(self, elastic):
        self.elastic = elastic

    def get_field_author(self):
        return "actor"

    def get_field_date(self):
        """ Field with the date in the JSON enriched items """
        return "grimoire_creation_date"

    def get_identities(self, item):
        """Return the identities from an item"""

        event = item['data']
        event_actor = event.get("actor", event.get("author", None))
        if event_actor:
            identity = self.get_sh_identity(event_actor)
            if identity:
                yield identity

        issue = event['issue']
        issue_reporter = issue.get("user", None)
        if issue_reporter:
            identity = self.get_sh_identity(issue_reporter)
            if identity:
                yield identity

        closer = event.get('closer', None)
        if closer and closer['type'] == 'PullRequest':
            pull_submitter = closer.get('author', None)
            identity = self.get_sh_identity(pull_submitter)
            if identity:
                yield identity

    def get_sh_identity(self, item, identity_field=None):
        identity = {}

        user = item  # by default a specific user dict is expected
        if isinstance(item, dict) and 'data' in item:
            if identity_field == 'actor':
                user = item['data'][identity_field]
            elif identity_field == 'reporter':
                user = item['data']['issue']['user']
            elif identity_field == 'submitter':
                closer = item['data'].get('closer', None)
                if closer:
                    user = closer['author']

        if not user:
            return identity

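        # Event payloads only expose the GitHub login; email and name are not
        # available at this point, so they are left as None.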
        identity['username'] = user['login']
        identity['email'] = None
        identity['name'] = None

        return identity

    def get_project_repository(self, eitem):
        repo = eitem['origin']
        return repo

    @metadata
    def get_rich_item(self, item):

        rich_item = self.__get_rich_event(item)

        self.add_repository_labels(rich_item)
        self.add_metadata_filter_raw(rich_item)

        return rich_item

    def __get_rich_event(self, item):
        rich_event = {}

        self.copy_raw_fields(self.RAW_FIELDS_COPY, item, rich_event)

        event = item['data']
        issue = item['data']['issue']
        actor = item['data'].get('actor', item['data'].get('author', None))

        # move the issue reporter to level of actor. This is needed to
        # allow `get_item_sh` adding SortingHat identities
        reporter = issue['user']
        item['data']['reporter'] = reporter

        rich_event['event_type'] = event['eventType']
        rich_event['created_at'] = event['createdAt']
        rich_event['actor_username'] = actor['login'] if actor else None
        rich_event['repository'] = self.get_project_repository(rich_event)
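        # In the GitHub API an issue payload carries a 'head' or 'pull_request'
        # attribute only when the item is actually a pull request, so default to
        # pull request and flip the flag when both are absent.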
        rich_event['pull_request'] = True
        rich_event['item_type'] = 'pull request'
        if 'head' not in issue.keys() and 'pull_request' not in issue.keys():
            rich_event['pull_request'] = False
            rich_event['item_type'] = 'issue'

        rich_event['issue_id'] = issue['id']
        rich_event['issue_id_in_repo'] = issue['html_url'].split("/")[-1]
        rich_event['title'] = issue['title']
        rich_event['title_analyzed'] = issue['title']
        rich_event['issue_state'] = issue['state']
        rich_event['issue_created_at'] = issue['created_at']
        rich_event['issue_updated_at'] = issue['updated_at']
        rich_event['issue_closed_at'] = issue['closed_at']
        rich_event['issue_url'] = issue['html_url']
        rich_event['issue_labels'] = [label['name'] for label in issue.get('labels', [])]

        rich_event['github_repo'] = rich_event['repository'].replace(GITHUB, '')
        rich_event['github_repo'] = re.sub(r'\.git$', '', rich_event['github_repo'])
        rich_event["issue_url_id"] = rich_event['github_repo'] + "/issues/" + rich_event['issue_id_in_repo']

        if rich_event['event_type'] in LABEL_EVENTS:
            label = event['label']
            rich_event['label'] = label['name']
            rich_event['label_description'] = label['description']
            rich_event['label_is_default'] = label['isDefault']
            rich_event['label_created_at'] = label['createdAt']
            rich_event['label_updated_at'] = label['updatedAt']
        elif rich_event['event_type'] in CLOSED_EVENTS:
            closer = event['closer']
            rich_event['label'] = rich_event['issue_labels']
            # In GitHub every pull request is an issue. When retrieving closed events for
            # issues using the GraphQL API, the closed events related to pull requests are
            # collected too. Since the attribute closer is not defined for pull requests,
            # the condition below makes sure to prevent NPE errors.
            # The test data at tests/data/githubql.json contains two examples with the
            # attribute closer being null and not null.
            if closer and closer['type'] == 'PullRequest':
                rich_event['closer_event_url'] = event['url']
                rich_event['closer_type'] = closer['type']
                rich_event['closer_number'] = closer['number']
                rich_event['closer_url'] = closer['url']
                rich_event['closer_repo'] = '/'.join(closer['url'].replace(GITHUB, '').split('/')[:-2])
                rich_event['closer_created_at'] = closer['createdAt']
                rich_event['closer_updated_at'] = closer['updatedAt']
                rich_event['closer_closed_at'] = closer['closedAt']
                rich_event['closer_closed'] = closer['closed']
                rich_event['closer_merged'] = closer.get('merged', None)
                submitter = closer['author']
                rich_event['closer_pull_submitter'] = submitter.get('login', None) if submitter else None
                # move the pull request submitter to level of actor. This is needed to
                # allow `get_item_sh` adding SortingHat identities
                item['data']['submitter'] = submitter
        elif rich_event['event_type'] in REFERENCE_EVENTS:
            source = event['source']
            rich_event['reference_cross_repo'] = event['isCrossRepository']
            rich_event['reference_will_close_target'] = event['willCloseTarget']
            rich_event['reference_event_url'] = event['url']
            rich_event['reference_source_type'] = source['type']
            rich_event['reference_source_number'] = source['number']
            rich_event['reference_source_url'] = source['url']
            rich_event['reference_source_repo'] = '/'.join(source['url'].replace(GITHUB, '').split('/')[:-2])
            rich_event['reference_source_created_at'] = source['createdAt']
            rich_event['reference_source_updated_at'] = source['updatedAt']
            rich_event['reference_source_closed_at'] = source['closedAt']
            rich_event['reference_source_closed'] = source['closed']
            rich_event['reference_source_merged'] = source.get('merged', None)
        elif rich_event['event_type'] in PROJECT_EVENTS:
            project = event['project']
            rich_event['board_column'] = event['projectColumnName']
            rich_event['board_name'] = project['name']
            rich_event['board_url'] = project['url']
            rich_event['board_created_at'] = project['createdAt']
            rich_event['board_updated_at'] = project['updatedAt']
            rich_event['board_closed_at'] = project['closedAt']
            rich_event['board_state'] = project['state'].lower()

            # only for events of type MovedColumnsInProjectEvent
            if 'previousProjectColumnName' in event:
                rich_event['board_previous_column'] = event['previousProjectColumnName']
        elif rich_event['event_type'] in MERGED_EVENTS:
            merge = event['pullRequest']
            rich_event['merge_closed'] = merge['closed']
            rich_event['merge_closed_at'] = merge['closedAt']
            rich_event['merge_created_at'] = merge['createdAt']
            rich_event['merge_merged'] = merge['merged']
            rich_event['merge_merged_at'] = merge['mergedAt']
            rich_event['merge_updated_at'] = merge['updatedAt']
            rich_event['merge_url'] = merge['url']
        elif rich_event['event_type'] in PULL_REQUEST_REVIEW_EVENTS:
            review = event['pullRequest']
            rich_event['merge_state'] = event['state']
            rich_event['merge_approved'] = int(rich_event['merge_state'] == 'APPROVED')
            rich_event['merge_closed'] = review['closed']
            rich_event['merge_closed_at'] = review['closedAt']
            rich_event['merge_created_at'] = review['createdAt']
            rich_event['merge_merged'] = review['merged']
            rich_event['merge_merged_at'] = review['mergedAt']
            rich_event['merge_updated_at'] = review['updatedAt']
            rich_event['merge_url'] = review['url']
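            # PullRequestReview items identify the reviewer under 'author'; expose
            # it as 'actor' so `get_item_sh` below resolves the reviewer identity.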
            item['data']['actor'] = item['data']['author']
        else:
            logger.warning("[github] event {} not processed".format(rich_event['event_type']))

        if self.prjs_map:
            rich_event.update(self.get_item_project(rich_event))

        rich_event.update(self.get_grimoire_fields(event['createdAt'], "issue"))
        item[self.get_field_date()] = rich_event[self.get_field_date()]
        rich_event.update(self.get_item_sh(item, self.event_roles))

        # Copy SH actor info to author equivalent attributes
        rich_event['author_id'] = rich_event.get('actor_id', None)
        rich_event['author_uuid'] = rich_event.get('actor_uuid', None)
        rich_event['author_name'] = rich_event.get('actor_name', None)
        rich_event['author_user_name'] = rich_event.get('actor_user_name', None)
        rich_event['author_domain'] = rich_event.get('actor_domain', None)
        rich_event['author_gender'] = rich_event.get('actor_gender', None)
        rich_event['author_gender_acc'] = rich_event.get('actor_gender_acc', None)
        rich_event['author_org_name'] = rich_event.get('actor_org_name', None)
        rich_event['author_bot'] = rich_event.get('actor_bot', None)
        rich_event['author_multi_org_names'] = rich_event.get('actor_multi_org_names', None)

        return rich_event

    def enrich_duration_analysis(self, ocean_backend, enrich_backend, start_event_type, target_attr,
                                 fltr_event_types, fltr_attr=None, page_size=200):
        """The purpose of this study is to calculate the duration between two GitHub events. It requires
        a start event type (e.g., UnlabeledEvent or MovedColumnsInProjectEvent), which is used to
        retrieve for each issue all events of that type. For each issue event obtained, the first
        previous event of one of the types defined at `fltr_event_types` is returned, and used to
        calculate the duration (in days) between the two events. Optionally, an additional filter
        can be defined to retain the events that share a given property (e.g., a specific label,
        the name of a project board). Finally, the duration and the previous event uuid are added to
        the start event via the attributes `duration_from_previous_event` and `previous_event_uuid`.

        This study is executed in an incremental way, thus only the start events that don't
        include the attribute `duration_from_previous_event` are retrieved and processed.

        The examples below show how to activate the study by modifying the setup.cfg. The first example
        calculates the duration between Unlabeled and Labeled events per label. The second example
        calculates the duration between the MovedColumnsInProject and AddedToProject events per
        column in each board.

        ```
        [githubql]
        ...
        studies = [enrich_duration_analysis:label, enrich_duration_analysis:kanban]

        [enrich_duration_analysis:kanban]
        start_event_type = MovedColumnsInProjectEvent
        fltr_attr = board_name
        target_attr = board_column
        fltr_event_types = [MovedColumnsInProjectEvent, AddedToProjectEvent]

        [enrich_duration_analysis:label]
        start_event_type = UnlabeledEvent
        target_attr = label
        fltr_attr = label
        fltr_event_types = [LabeledEvent]
        ```

        :param ocean_backend: backend from which to read the raw items
        :param enrich_backend: backend from which to read the enriched items
        :param start_event_type: the type of the start event (e.g., UnlabeledEvent)
        :param target_attr: the attribute returned from the events (e.g., label)
        :param fltr_event_types: a list of event types to select the previous events (e.g., LabeledEvent)
        :param fltr_attr: an optional attribute to filter in the events with a given property (e.g., label)
        :param page_size: number of events without `duration_from_previous_event` per page
        """
        data_source = enrich_backend.__class__.__name__.split("Enrich")[0].lower()
        log_prefix = "[{}] Duration analysis".format(data_source)
        logger.info("{} starting study {}".format(log_prefix, anonymize_url(self.elastic.index_url)))

        es_in = ES([enrich_backend.elastic_url], retry_on_timeout=True, timeout=100,
                   verify_certs=self.elastic.requests.verify, connection_class=RequestsHttpConnection)
        in_index = enrich_backend.elastic.index

        # get all start events that don't have the attribute `duration_from_previous_event`
        query_start_event_type = {
            "query": {
                "bool": {
                    "filter": {
                        "term": {
                            "event_type": start_event_type
                        }
                    },
                    "must_not": {
                        "exists": {
                            "field": "duration_from_previous_event"
                        }
                    }
                }
            },
            "_source": [
                "uuid", "issue_url_id", "grimoire_creation_date", target_attr
            ],
            "sort": [
                {
                    "grimoire_creation_date": {
                        "order": "asc"
                    }
                }
            ],
            "size": page_size
        }

        if fltr_attr:
            query_start_event_type['_source'].append(fltr_attr)

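        # Page through the matching start events with the ES scroll API; each
        # page holds at most `page_size` events.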
        start_event_types = es_in.search(index=in_index, body=query_start_event_type, scroll='5m')

        sid = start_event_types['_scroll_id']
        scroll_size = len(start_event_types['hits']['hits'])

        while scroll_size > 0:

            # for each event, retrieve the previous event included in `fltr_event_types`
            for start_event in start_event_types['hits']['hits']:
                start_event = start_event['_source']
                start_uuid = start_event['uuid']
                start_issue_url_id = start_event['issue_url_id']
                start_date_event = start_event['grimoire_creation_date']

                query_previous_events = {
                    "size": 1,
                    "query": {
                        "bool": {
                            "filter": [
                                {
                                    "term": {
                                        "issue_url_id": start_issue_url_id
                                    }
                                },
                                {
                                    "terms": {
                                        "event_type": fltr_event_types
                                    }
                                },
                                {
                                    "range": {
                                        "grimoire_creation_date": {
                                            "lt": start_date_event
                                        }
                                    }
                                }
                            ]
                        }
                    },
                    "_source": [
                        "uuid", "grimoire_creation_date", target_attr
                    ],
                    "sort": [
                        {
                            "grimoire_creation_date": {
                                "order": "desc"
                            }
                        }
                    ]
                }

                if fltr_attr:
                    _fltr = {
                        "term": {
                            fltr_attr: start_event[fltr_attr]
                        }
                    }

                    query_previous_events['query']['bool']['filter'].append(_fltr)

                previous_events = es_in.search(index=in_index, body=query_previous_events)['hits']['hits']
                if not previous_events:
                    continue

                previous_event = previous_events[0]['_source']
                previous_event_date = previous_event['grimoire_creation_date']
                previous_event_uuid = previous_event['uuid']
                duration = get_time_diff_days(previous_event_date, start_date_event)

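                # Write the two study attributes onto the start event in place via a
                # Painless script, so the enriched item does not need to be reindexed.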
                painless_code = "ctx._source.duration_from_previous_event=params.duration;" \
                                "ctx._source.previous_event_uuid=params.uuid"

                add_previous_event_query = {
                    "script": {
                        "source": painless_code,
                        "lang": "painless",
                        "params": {
                            "duration": duration,
                            "uuid": previous_event_uuid
                        }
                    },
                    "query": {
                        "bool": {
                            "filter": {
                                "term": {
                                    "uuid": start_uuid
                                }
                            }
                        }
                    }
                }
                r = es_in.update_by_query(index=in_index, body=add_previous_event_query, conflicts='proceed')
                if r['failures']:
                    logger.error("{} Error while executing study {}".format(log_prefix,
                                                                            anonymize_url(self.elastic.index_url)))
                    logger.error(str(r['failures'][0]))
                    return

            start_event_types = es_in.scroll(scroll_id=sid, scroll='2m')
            # update the scroll ID
            sid = start_event_types['_scroll_id']
            # get the number of results returned in the last scroll
            scroll_size = len(start_event_types['hits']['hits'])

        logger.info("{} ending study {}".format(log_prefix, anonymize_url(self.elastic.index_url)))

    def enrich_reference_analysis(self, ocean_backend, enrich_backend, aliases_update=None):
        """
        The purpose of this study is to gather all the issues and pull requests which are
        mutually referenced. Once these references are obtained, all of the events for the given issue
        or pull request are updated with the corresponding list of URLs from the referenced items.

        This study is not executed in an incremental way, as it only takes the `CrossReferencedEvent`
        items to build a dictionary containing all the mutual references per each Issue or Pull Request,
        identified by `issue_url`. Then, it updates all the events belonging to the same `issue_url`
        adding the following fields:
        * `referenced_by_issues`: List of issues referenced by a given Issue or Pull Request,
            from the same repository.
        * `referenced_by_prs`: List of pull requests referenced by a given Issue or Pull Request,
            from the same repository.
        * `referenced_by_merged_prs`: List of merged pull requests referenced by a given Issue or Pull
            Request, from the same repository.
        * `referenced_by_external_issues`: List of issues referenced by a given Issue or Pull Request,
            from different (external) repositories.
        * `referenced_by_external_prs`: List of pull requests referenced by a given Issue or Pull Request,
            from different (external) repositories.
        * `referenced_by_external_merged_prs`: List of merged pull requests referenced by a given Issue
            or Pull Request, from different (external) repositories.

        To classify the merged Pull Requests, the study asks for the list of the URLs from `MergedEvents`.

        The method accepts a list of ES aliases or indices where these new fields will be updated too,
        for the referenced elements identified by a given `issue_url`. An example would be the `github_issues`
        alias, which points to the corresponding enriched indexes from GitHub issues. Note that if the
        referenced aliases or indexes do not contain the `issue_url` field, the affected elements can't
        be updated.

        The example below shows how to activate the study by modifying the setup.cfg.

        ```
        [githubql]
        ...
        studies = [..., enrich_reference_analysis]

        [enrich_reference_analysis]
        aliases_update = [github_issues, github2_issues, github2_pull_requests]
        ```

        :param ocean_backend: backend from which to read the raw items
        :param enrich_backend: backend from which to read the enriched items
        :param aliases_update: list of aliases where to update the referenced items
        """
        def _is_pull_request(issue_url):
            """Return True if `issue_url` belongs to a Pull Request"""

            return '/pull/' in issue_url

        def _get_github_repo(issue_url):
            """Return GitHub repository path using `owner/repository` format"""
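            # e.g., 'https://github.com/chaoss/grimoirelab-elk/pull/1133' splits into
            # ['https:', '', 'github.com', 'chaoss', 'grimoirelab-elk', ...] and yields
            # 'chaoss/grimoirelab-elk'; URLs outside github.com yield ''.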

            repo = ''
            url_info = issue_url.split('/')
            if url_info[2] == 'github.com':
                repo = '/'.join([url_info[3], url_info[4]])
            return repo

        def _get_merged_prs(es_input):
            """Return a list of merged Pull Requests based on MergedEvent items"""

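            # NOTE: the terms aggregation below caps results at 30000 buckets; indexes
            # with more distinct merged PR URLs than that would be silently truncated.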
            # Ask for the URL from `MergedEvent` items, filtering by merged PRs
            es_query = {
                "size": 0,
                "query": {
                    "bool": {
                        "must": [
                            {
                                "term": {"event_type": "MergedEvent"}
                            },
                            {
                                "term": {"pull_request": True}
                            },
                            {
                                "term": {"merge_merged": True}
                            }
                        ]
                    }
                },
                "aggs": {
                    "merge_url": {
                        "terms": {
                            "field": "merge_url",
                            "size": 30000
                        }
                    }
                }
            }

            merged_prs = es_input.search(index=in_index, body=es_query)
            buckets = merged_prs['aggregations']['merge_url']['buckets']

            merged_prs_list = [item['key'] for item in buckets]

            return merged_prs_list

        data_source = enrich_backend.__class__.__name__.split("Enrich")[0].lower()
        log_prefix = "[{}] Cross reference analysis".format(data_source)
        logger.info("{} starting study {}".format(log_prefix, anonymize_url(self.elastic.index_url)))

        es_in = ES([enrich_backend.elastic_url], retry_on_timeout=True, timeout=100,
                   verify_certs=self.elastic.requests.verify, connection_class=RequestsHttpConnection)
        in_index = enrich_backend.elastic.index

        # Get all the merged pull requests from MergedEvents
        logger.info("{} Retrieving the merged PRs from MergedEvents".format(log_prefix))
        merged_prs = _get_merged_prs(es_in)

        # Get all CrossReferencedEvent items and their referenced issues and pull requests
        es_query = {
            "size": 0,
            "query": {
                "bool": {
                    "must": {
                        "term": {
                            "event_type": "CrossReferencedEvent"
                        }
                    }
                }
            },
            "aggs": {
                "issue_url": {
                    "terms": {
                        "field": "issue_url",
                        "size": 30000
                    },
                    "aggs": {
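                        # The sub-aggregation name 'uniq_gender' looks like a leftover
                        # from another study; it simply collects the distinct
                        # reference_source_url values per issue_url.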
                        "uniq_gender": {
628
                            "terms": {"field": "reference_source_url"}
629
                        }
630
                    }
631
                }
632
            }
633
        }
634

635
        cross_references = es_in.search(index=in_index, body=es_query)
636
        buckets = cross_references['aggregations']['issue_url']['buckets']
637

638
        reference_dict = {}
639
        for item in buckets:
640
            issue_url = item['key']
641
            references = [ref['key'] for ref in item['uniq_gender']['buckets']]
642

643
            # Update reference dictionary
644
            if issue_url not in reference_dict.keys():
645
                reference_dict[issue_url] = references
646
            else:
647
                prev_references = reference_dict[issue_url]
648
                prev_references.append(references)
649
                reference_dict[issue_url] = list(set(prev_references))
650

        # Adding list entries from reversed references
        for issue_url in reference_dict.keys():
            reference_list = reference_dict[issue_url]
            if not reference_list:
                continue
            for ref in reference_list:
                try:
                    ref_entry_list = reference_dict[ref]
                except KeyError:
                    continue
                if ref_entry_list:
                    ref_entry_list.append(issue_url)
                else:
                    ref_entry_list = [issue_url]
                reference_dict[ref] = list(set(ref_entry_list))

        # Update affected issues and pull requests
        painless_code = """
            ctx._source.referenced_by_issues = params.referenced_by_issues;
            ctx._source.referenced_by_prs = params.referenced_by_prs;
            ctx._source.referenced_by_merged_prs = params.referenced_by_merged_prs;
            ctx._source.referenced_by_external_issues = params.referenced_by_external_issues;
            ctx._source.referenced_by_external_prs = params.referenced_by_external_prs;
            ctx._source.referenced_by_external_merged_prs = params.referenced_by_external_merged_prs;
        """
        for issue_url in reference_dict.keys():
            ref_issues_repo = []
            ref_prs_repo = []
            ref_prs_merged_repo = []
            ref_issues_ext = []
            ref_prs_ext = []
            ref_prs_merged_ext = []

            issue_repo = _get_github_repo(issue_url)

            # Classify references internal/external repo + issues/pull-requests
            reference_list = reference_dict[issue_url]
            for ref in reference_list:
                ref_repo = _get_github_repo(ref)
                ref_is_pr = _is_pull_request(ref)

                # Classify references
                if ref_repo == issue_repo:
                    if ref_is_pr:
                        ref_prs_repo.append(ref)
                        if ref in merged_prs:
                            ref_prs_merged_repo.append(ref)
                    else:
                        ref_issues_repo.append(ref)
                else:
                    if ref_is_pr:
                        ref_prs_ext.append(ref)
                        if ref in merged_prs:
                            ref_prs_merged_ext.append(ref)
                    else:
                        ref_issues_ext.append(ref)

            # Update items with the corresponding fields
            update_query = {
                "script": {
                    "source": painless_code,
                    "lang": "painless",
                    "params": {
                        "referenced_by_issues": ref_issues_repo,
                        "referenced_by_prs": ref_prs_repo,
                        "referenced_by_merged_prs": ref_prs_merged_repo,
                        "referenced_by_external_issues": ref_issues_ext,
                        "referenced_by_external_prs": ref_prs_ext,
                        "referenced_by_external_merged_prs": ref_prs_merged_ext,
                    }
                },
                "query": {
                    "term": {
                        "issue_url": issue_url
                    }
                }
            }

            update_indexes = [in_index]
            # Update data in the additional related indexes (if any) identified by their aliases
            if aliases_update:
                update_indexes += aliases_update

            for update_index in update_indexes:
                logger.info('{} - Updating fields from items with issue_url: {} from index {}'.format(log_prefix,
                                                                                                      issue_url,
                                                                                                      update_index))

                r = es_in.update_by_query(index=update_index, body=update_query, conflicts='proceed')
                if r['failures']:
                    logger.error("{} Error while executing study {}".format(log_prefix,
                                                                            anonymize_url(self.elastic.index_url)))
                    logger.error(str(r['failures'][0]))
                    return

        logger.info("{} ending study {}".format(log_prefix, anonymize_url(self.elastic.index_url)))