• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

adsabs / ADSCitationCapture / 15909488557

26 Jun 2025 06:20PM UTC coverage: 70.171% (-0.04%) from 70.21%
15909488557

Pull #73

github

web-flow
Merge 1270afe42 into 57448c851
Pull Request #73: Allow reparsing of metadata without re-harvesting.

1 of 5 new or added lines in 1 file covered. (20.0%)

26 existing lines in 3 files now uncovered.

2510 of 3577 relevant lines covered (70.17%)

0.7 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

87.5
/ADSCitationCapture/forward.py
1

2
import os
1✔
3
import itertools
1✔
4
import datetime
1✔
5
from adsputils import get_date, date2solrstamp
1✔
6
from dateutil.tz import tzutc
1✔
7
from adsmsg import DenormalizedRecord, NonBibRecord, Status, CitationChangeContentType
1✔
8
from bs4 import BeautifulSoup
1✔
9
from adsputils import setup_logging
1✔
10

11
# ============================= INITIALIZATION ==================================== #
12
# - Use app logger:
13
#import logging
14
#logger = logging.getLogger('ads-citation-capture')
15
# - Or individual logger for this file:
16
from adsputils import setup_logging, load_config
1✔
17
# Project home: the parent of the directory that contains this module.
_module_dir = os.path.dirname(__file__)
proj_home = os.path.realpath(os.path.join(_module_dir, '../'))

# Configuration is obtained from adsputils for that project home
# (presumably read from the project's config files -- confirm in adsputils).
config = load_config(proj_home=proj_home)

# Logger dedicated to this file; level and stdout attachment come from the
# configuration, defaulting to 'INFO' and no stdout output.
_logging_options = {
    'level': config.get('LOGGING_LEVEL', 'INFO'),
    'attach_stdout': config.get('LOG_STDOUT', False),
}
logger = setup_logging(__name__, proj_home=proj_home, **_logging_options)
22

23

24
# =============================== FUNCTIONS ======================================= #
25
def build_record(app, citation_change, parsed_metadata, citations, db_versions, readers=None, entry_date=None):
    """
    Build the bibliographic and non-bibliographic records for a DOI citation
    target so they can be forwarded to master.

    :param app: application object; only `app.conf['DOI_URL']` is read here.
    :param citation_change: citation change message; its `content_type` must
        be DOI, `content` is used as the DOI, `status` selects the record
        status and `timestamp` is the default entry date.
    :param parsed_metadata: dict with the parsed metadata. 'bibcode' is
        mandatory; every other key read below falls back to a default.
    :param citations: list of citations; stored as given and counted.
    :param db_versions: dict of associated versions (title -> url as consumed
        by `_build_nonbib_record`); `None`, `{}` and `{"": ""}` mean that
        there are no associated records.
    :param readers: optional list of readers; `None` is treated as empty.
    :param entry_date: optional datetime used as entry date; defaults to
        `citation_change.timestamp.ToDatetime()`.
    :return: tuple `(DenormalizedRecord, NonBibRecord)`.
    :raises ValueError: if the citation change is not a DOI or the metadata
        has no bibcode.
    """
    if citation_change.content_type != CitationChangeContentType.doi:
        raise ValueError("Only DOI records can be forwarded to master")
    # Extract required values
    bibcode = parsed_metadata.get('bibcode')
    if bibcode is None:
        raise ValueError("Only records with a valid bibcode can be forwarded to master")
    if readers is None:
        # Fresh list per call instead of a shared mutable default argument
        readers = []
    if entry_date is None:
        entry_date = citation_change.timestamp.ToDatetime()
    # Check if doi points to a concept record or to a specific version
    is_release = parsed_metadata.get('version_of', None) not in (None, "", [])
    alternate_bibcode = parsed_metadata.get('alternate_bibcode', [])
    abstract = parsed_metadata.get('abstract', "")
    title = parsed_metadata.get('title', "")
    keywords = parsed_metadata.get('keywords', [])
    authors = parsed_metadata.get('authors', [])
    normalized_authors = parsed_metadata.get('normalized_authors', [])
    affiliations = parsed_metadata.get('affiliations', ['-']*len(authors))
    pubdate = parsed_metadata.get('pubdate', get_date().strftime("%Y-%m-%d"))

    # Solr date: publication date shifted by 30 minutes (see TODO on 'date' below)
    solr_date_format = '%Y-%m-%dT%H:%M:%S.%fZ'
    solr_date_offset = datetime.timedelta(minutes=30)
    try:
        solr_date = (datetime.datetime.strptime(pubdate, "%Y-%m-%d") + solr_date_offset).strftime(solr_date_format)
    except ValueError:
        # In the event only a year is specified, the date is assumed to be January 1st of the given year.
        logger.warning("Publication date does not conform to Y-m-d format. Assuming only year is specified.")
        padded_pubdate = pubdate + "-01" + "-01"
        try:
            solr_date = (datetime.datetime.strptime(padded_pubdate, "%Y-%m-%d") + solr_date_offset).strftime(solr_date_format)
        except ValueError:
            # If above fails, fall back to the entry date. Running maintenance_metadata could fix the
            # bad publication date in the future if it is updated upstream.
            # The original pubdate is left untouched instead of forwarding a mangled "<value>-01-01".
            logger.warning("Cannot parse publication date. Falling back to the entry date.")
            solr_date = date2solrstamp(entry_date)
        else:
            # Only adopt the padded value once it is known to be a valid date
            pubdate = padded_pubdate

    source = parsed_metadata.get('source', "Unknown")
    version = parsed_metadata.get('version', "")
    doctype = parsed_metadata.get('doctype', "software")
    # Clean abstract and title: keep only the text nodes and flatten line breaks
    abstract = ''.join(BeautifulSoup(abstract, features="lxml").findAll(text=True)).replace('\n', ' ').replace('\r', '')
    title = ''.join(BeautifulSoup(title, features="lxml").findAll(text=True)).replace('\n', ' ').replace('\r', '')
    # Extract year
    year = pubdate.split("-")[0]
    # Build an author_facet_hier list with the following structure:
    #   "0/Blanco-Cuaresma, S",
    #   "1/Blanco-Cuaresma, S/Blanco-Cuaresma, S",
    #   "0/Soubiran, C",
    #   "1/Soubiran, C/Soubiran, C",
    author_facet_hier = list(itertools.chain.from_iterable(
        zip(
            ["0/" + norm for norm in normalized_authors],
            ["1/" + norm + "/" + author for norm, author in zip(normalized_authors, authors)],
        )
    ))

    # Count
    n_keywords = len(keywords)
    n_authors = len(authors)
    n_citations = len(citations)
    doi = citation_change.content
    record_dict = {
        'abstract': abstract,
        'ack': '',
        'aff': ["-" if aff == "" else aff for aff in affiliations],
        'alternate_bibcode': alternate_bibcode,
        'alternate_title': [],
        'arxiv_class': [],
        'author': authors,
        'author_count': n_authors,
        'author_facet': normalized_authors,
        'author_facet_hier': author_facet_hier,
        'author_norm': normalized_authors,
        'bibcode': bibcode,
        'bibstem': ['zndo'],
        'bibstem_facet': 'zndo',
        'copyright': [],
        'comment': [],
        'database': ['general', 'astronomy'],
        'entry_date': date2solrstamp(entry_date), # date2solrstamp(get_date()),
        'year': year,
        'date': solr_date, # TODO: Why this date has to be 30 minutes in advance? This is based on ADSImportPipeline SolrAdapter
        'doctype': doctype,
        'doctype_facet_hier': ["0/Non-Article", "1/Non-Article/Software"],
        'doi': [doi],
        'eid': doi,
        'email': ['-']*n_authors,
        'first_author': authors[0] if n_authors > 0 else '',
        'first_author_facet_hier': author_facet_hier[:2],
        # Guard on the list that is actually indexed: normalized_authors may be
        # empty even when authors is not
        'first_author_norm': normalized_authors[0] if normalized_authors else '',
        'links_data': ['{{"access": "", "instances": "", "title": "", "type": "electr", "url": "{}"}}'.format(app.conf['DOI_URL'] + doi)], # TODO: How is it different from nonbib?
        'identifier': [bibcode, doi] + alternate_bibcode,
        'esources': ["PUB_HTML"],
        'citation': citations,
        'citation_count': n_citations,
        'citation_count_norm': n_citations/n_authors if n_authors > 0 else 0,
        'data_count': 1, # Number of elements in `links_data`
        'keyword': keywords,
        'keyword_facet': keywords,
        'keyword_norm': ["-"]*n_keywords,
        'keyword_schema': ["-"]*n_keywords,
        'property': ["ESOURCE", "NONARTICLE", "NOT REFEREED", "PUB_OPENACCESS", "OPENACCESS"],
        'pub': source,
        'pub_raw': source,
        'pubdate': pubdate,
        'pubnote': [],
        'read_count': len(readers),
        'title': [title],
        'publisher': source,
        'version': version
    }
    if version is None: # Concept DOIs may not contain version
        del record_dict['version']
    # Status
    if citation_change.status == Status.new:
        status = 2
    elif citation_change.status == Status.updated:
        status = 3
    elif citation_change.status == Status.deleted:
        status = 1
        # Only use this field for deletions, otherwise Solr will complain the field does not exist
        # and if this key does not exist in the dict/protobuf, the message will be
        # treated as new/update by MasterPipeline
        record_dict['status'] = status
    else:
        status = 0 # active
    if db_versions not in [{"":""}, {}, None]:
        record_dict['property'].append('ASSOCIATED')
    if is_release:
        record_dict['property'].append('RELEASE')

    record = DenormalizedRecord(**record_dict)
    nonbib_record = _build_nonbib_record(app, citation_change, record, db_versions, status, readers=readers)
    return record, nonbib_record
1✔
153

154

155
def _build_nonbib_record(app, citation_change, record, db_versions, status, readers=None):
    """
    Build the non-bibliographic record that accompanies `record`.

    :param app: application object; only `app.conf['DOI_URL']` is read here.
    :param citation_change: citation change message; `content` is used as the DOI.
    :param record: bibliographic record already built for this DOI; its
        bibcode, citation/read counts, esources, reference and property
        values are copied into the result.
    :param db_versions: dict of associated versions, keys used as link titles
        and values as link urls; `None`, `{}` and `{"": ""}` mean that there
        are no associated records.
    :param status: numeric status stored in the record.
    :param readers: optional list of readers; `None` is treated as empty.
    :return: the `NonBibRecord` instance.
    """
    if readers is None:
        # Fresh list per call instead of a shared mutable default argument
        readers = []
    doi = citation_change.content
    # Link to the DOI landing page. `item_count` only used for DATA and not ESOURCES
    data_links_rows = [
        {'link_type': 'ESOURCE', 'link_sub_type': 'PUB_HTML',
         'url': [app.conf['DOI_URL'] + doi], 'title': [''], 'item_count': 0},
    ]
    if db_versions not in [{"":""}, {}, None]:
        # Materialise the dict views so that plain lists are handed to the message constructor
        data_links_rows.append({'link_type': 'ASSOCIATED', 'link_sub_type': '',
                                'url': list(db_versions.values()), 'title': list(db_versions.keys()),
                                'item_count': 0})
    nonbib_record_dict = {
        'status': status,
        'bibcode': record.bibcode,
        'boost': 0.5, # Value between 0 and 1
        'citation_count': record.citation_count,
        'data': [],
        'data_links_rows': data_links_rows,
        'citation_count_norm': record.citation_count_norm,
        'grants': [],
        'ned_objects': [],
        'norm_cites': 0, # log10-normalized count of citations computed on the classic site but not currently used
        'read_count': record.read_count,
        'readers': readers,
        'simbad_objects': [],
        'total_link_counts': 0 # Only used for DATA and not for ESOURCES
    }
    nonbib_record = NonBibRecord(**nonbib_record_dict)
    # These repeated fields are copied over from the bibliographic record
    nonbib_record.esource.extend(record.esources)
    nonbib_record.reference.extend(record.reference)
    nonbib_record.property.extend(record.property)
    return nonbib_record
1✔
184

STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc