• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

benwbrum / fromthepage / 17387282326

01 Sep 2025 09:13PM UTC coverage: 64.405%. Remained the same
17387282326

push

github

web-flow
4857 - Require rubocop step in CI (#4858)

* 4857 - Require rubocop step in CI

* 4865 - Organize gemfiles

1790 of 3303 branches covered (54.19%)

Branch coverage included in aggregate %.

839 of 1497 new or added lines in 133 files covered. (56.05%)

43 existing lines in 29 files now uncovered.

7928 of 11786 relevant lines covered (67.27%)

103.82 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

36.07
/lib/contentdm_translator.rb
1
module ContentdmTranslator
1✔
2
  # Refreshes every page of +work+ from CONTENTdm and records the
  # +ocr_correction+ flag on the work.
  #
  # Does nothing (returns nil) when the work has no manifest or the manifest
  # is not recognised by iiif_manifest_is_cdm?.
  #
  # @param work            the work whose pages are refreshed
  # @param ocr_correction  when truthy, the collection's full-text search
  #                        field is looked up and passed on to each page
  def self.update_work_from_cdm(work, ocr_correction = false)
    # bail out unless there is a manifest and it points at CONTENTdm
    return unless work.sc_manifest
    return unless iiif_manifest_is_cdm?(work.sc_manifest.at_id)

    fts_field = nil
    if ocr_correction
      # a lookup failure is reported but does not abort the update;
      # fts_field is simply nil in that case
      lookup_error, fts_field = fts_field_for_collection(work.collection)
      puts "Error retrieving Full-Text Search field: #{lookup_error}\n" if lookup_error
    end

    work.pages.each { |page| update_page_from_cdm(page, ocr_correction, fts_field) }

    work.ocr_correction = ocr_correction
    work.save!
  end
22

23
  # Fetches the CONTENTdm item info for +page+ and writes it to the page with
  # update_columns.
  #
  # The pruned item info is always stored as +metadata+. When +ocr_correction+
  # is truthy and the item info holds a String under +fts_field+, that text is
  # XML-text-encoded and stored as +source_text+ as well.
  def self.update_page_from_cdm(page, ocr_correction, fts_field)
    cdm_info = fetch_cdm_info(page)
    attributes = { metadata: metadata_from_cdm_info(cdm_info) }

    if ocr_correction
      ocr_text = ocr_from_cdm_info(cdm_info, fts_field)
      attributes[:source_text] = ocr_text.encode(xml: :text) if ocr_text
    end

    page.update_columns(attributes)
  end
38

39
  # Downloads and parses the CONTENTdm item-info JSON for +page+.
  #
  # The URL is derived from the page's canvas id by
  # page_at_id_to_cdm_item_info.
  #
  # @return the parsed JSON document
  # @raise [OpenURI::HTTPError] when the request fails
  # @raise [JSON::ParserError] when the response is not valid JSON
  def self.fetch_cdm_info(page)
    cdm_url = page_at_id_to_cdm_item_info(page.sc_canvas.sc_canvas_id)
    # block form so the handle returned by open-uri is closed once read
    cdm_response = URI.open(cdm_url, &:read)
    JSON.parse(cdm_response)
  end
44

45
  # Downloads and parses the CONTENTdm field configuration JSON for
  # +collection+.
  #
  # The URL is produced by collection_to_cdm_field_config.
  #
  # @return the parsed JSON document
  # @raise [OpenURI::HTTPError] when the request fails
  # @raise [JSON::ParserError] when the response is not valid JSON
  def self.fetch_cdm_field_config(collection)
    cdm_url = collection_to_cdm_field_config(collection)
    # block form so the handle returned by open-uri is closed once read
    cdm_response = URI.open(cdm_url, &:read)
    JSON.parse(cdm_response)
  end
50

51
  # Item-info keys that metadata_from_cdm_info removes before the remaining
  # entries are stored as page metadata. Frozen so the shared list cannot be
  # mutated at runtime.
  ITEM_INFO_DENYLIST = %w[
    descri
    date
    creato
    subjec
    relate
    type
    publis
    langua
    rights
    transc
    contac
    fullrs
    find
    dmaccess
    dmimage
    dmcreated
    dmmodified
    dmoclcno
    restrictionCode
    cdmfilesize
    cdmfilesizeformatted
    cdmprintpdf
    cdmhasocr
    cdmisnewspaper
  ].freeze
76

77
  # Returns a copy of +info+ without the keys listed in ITEM_INFO_DENYLIST.
  def self.metadata_from_cdm_info(info)
    denied_keys = ITEM_INFO_DENYLIST
    info.except(*denied_keys)
  end
81

82
  # Looks up +fts_field+ in +info+ and returns the value only when it is a
  # String; any other value (including a missing key) yields nil.
  def self.ocr_from_cdm_info(info, fts_field)
    candidate = info[fts_field]
    candidate.is_a?(String) ? candidate : nil
  end
90

91
  # Rewrites a IIIF canvas id into the matching dmGetItemInfo web-service URL.
  #
  # Each substitution touches only the first match and leaves +at_id+ itself
  # unmodified.
  def self.page_at_id_to_cdm_item_info(at_id)
    at_id
      .sub(/cdm/, 'server')
      .sub(/(digital\/)?iiif/, 'dmwebservices/index.php?q=dmGetItemInfo')
      .sub(/\/canvas\/c\d*/, '/json')
      .sub(/:(\d+)/, '/\1') # handle collection:id format instead of old collection/id
  end
99

100
  # Builds the dmGetCollectionFieldInfo web-service URL for +collection+.
  #
  # The URL is derived from the canvas id of the last page (ordered by
  # pages.created_on) that has a canvas.
  def self.collection_to_cdm_field_config(collection)
    newest_page = collection.pages.joins(:sc_canvas).reorder('pages.created_on').last
    canvas_id = newest_page.sc_canvas.sc_canvas_id

    canvas_id
      .sub(/cdm/, 'server')
      .sub(/(digital\/)?iiif/, 'dmwebservices/index.php?q=dmGetCollectionFieldInfo')
      .sub(/(:\d+)?\/canvas\/c\d*/, '/json')
  end
108

109

110

111
  # Tells whether a manifest id looks like it comes from CONTENTdm.
  #
  # True when +at_id+ contains the literal host suffix "contentdm.oclc.org" or
  # a path of the form iiif/info/<collection>/<number>/manifest.json.
  # Dots are escaped so they only match a literal ".".
  #
  # @param at_id [String] the manifest id
  # @return [Boolean]
  def self.iiif_manifest_is_cdm?(at_id)
    at_id.match?(/contentdm\.oclc\.org/) ||
      at_id.match?(/iiif\/info\/\w+\/\d+\/manifest\.json/)
  end
114

115
  # Rewrites a "digital/iiif-info" id into a dmGetItemInfo web-service URL.
  #
  # Always returns a String. Non-bang +sub+ is used because +sub!+ returns nil
  # when nothing is replaced, which would make the whole method return nil for
  # ids that lack "digital/iiif-info".
  #
  # @param at_id [String]
  # @return [String] the rewritten URL (+at_id+ itself is not modified)
  def self.cdm_item_info_from_iiif(at_id)
    at_id
      .sub(/cdm/, 'server')
      .sub(/digital\/iiif-info/, 'dmwebservices/index.php?q=dmGetItemInfo')
  end
119

120
  # Checks whether the last work in +collection+ that has a manifest was
  # imported from CONTENTdm.
  #
  # Returns nil when the collection has no such work; otherwise the result of
  # iiif_manifest_is_cdm? for that work's manifest id.
  def self.collection_is_cdm?(collection)
    latest_import = collection.works.joins(:sc_manifest).last
    return latest_import unless latest_import

    iiif_manifest_is_cdm?(latest_import.sc_manifest.at_id)
  end
124

125
  # Finds the full-text search (FTS) field of the CONTENTdm collection behind
  # +collection+.
  #
  # @return [Array] a two-element [error, nick] pair: [nil, nick] when the
  #   field configuration has an entry whose 'type' is 'FTS', otherwise
  #   [message, nil]
  def self.fts_field_for_collection(collection)
    fts_entry = fetch_cdm_field_config(collection).detect { |entry| entry['type'] == 'FTS' }
    return [ nil, fts_entry['nick'] ] if fts_entry

    [ 'No full-text search (FTS) fields were configured on CONTENTdm collection!', nil ]
  end
137

138
  # Runs export_work_to_cdm, retrying with a growing delay whenever the call
  # raises Net::ReadTimeout.
  #
  # The first wait is 300 seconds and each following wait is 1.5x longer.
  # Once the next wait would exceed 21_600 seconds the method prints a
  # give-up message and returns nil without re-raising.
  def self.export_work_to_cdm_with_retry(work, username, password, license)
    delay = 300
    max_delay = 21_600

    begin
      ContentdmTranslator.export_work_to_cdm(work, username, password, license)
    rescue Net::ReadTimeout => e
      pause = [ delay, max_delay ].min
      print "Net::ReadTimeout: Retrying in #{pause} seconds... (#{e.message})"
      sleep(pause)

      # grow the delay for the next attempt; stop once it passes the cap
      delay = (delay * 1.5).round
      retry unless delay > max_delay

      print "Net::ReadTimeout: Max retry delay reached, giving up. (#{e.message})"
    end
  end
158

159
  # Sends the verbatim plaintext transcription of every page of +work+ to the
  # CONTENTdm Catcher SOAP service, one 'edit' request per page, and prints
  # each response.
  #
  # NOTE(review): when the full-text search field cannot be determined this
  # method prints the error and calls +exit+, which ends the whole process --
  # confirm that every caller expects that.
  def self.export_work_to_cdm(work, username, password, license)
    error, fieldname = fts_field_for_collection(work.collection)
    if error
      puts "Error retrieving Full-Text Search field: #{error}\n"
      exit
    end

    catcher = Savon.client(
      log: true,
      filters: [ :password ],
      wsdl: 'https://worldcat.org/webservices/contentdm/catcher?wsdl',
      follow_redirects: true
    )

    work.pages.each do |page|
      canvas_id = page.sc_canvas.sc_canvas_id
      manifest_id = work.sc_manifest.at_id
      collection_alias = cdm_collection(manifest_id)
      record_number = cdm_record(canvas_id)

      puts "\nUpdating #{collection_alias}\trecord #{record_number}\tfrom #{page.title}\t#{page.id}\t#{work.title} at #{Time.current.strftime('%Y-%m-%d %I:%M %p')}.  CONTENTdm response:"

      # dmrecord identifies the record; the second entry carries the new text
      metadata_entries = [
        { field: 'dmrecord', value: record_number },
        { field: fieldname, value: page.verbatim_transcription_plaintext }
      ]

      request = {
        cdmurl: "http://#{cdm_server(manifest_id)}:8888",
        username: username,
        password: password,
        license: license,
        collection: collection_alias,
        metadata: { 'metadataList' => { 'metadata' => metadata_entries } },
        action: 'edit'
      }

      response = catcher.call(:process_conten_tdm, message: request)
      puts response.to_hash[:process_conten_tdm_response][:return]
    end
  end
194

195
  # Path of the CONTENTdm sync log for +collection+:
  # <Rails.root>/public/imports/cdm_sync_<collection id>.log
  def self.log_file(collection)
    log_name = "cdm_sync_#{collection.id}.log"
    File.join(Rails.root, 'public', 'imports', log_name)
  end
198

199
  # Flushes STDOUT, then returns the full text of the sync log for
  # +collection+ (see log_file for the path).
  def self.log_contents(collection)
    STDOUT.flush
    log_path = log_file(collection)
    File.read(log_path)
  end
203

204
  # NOTE(review): a bare `private` only changes the visibility of instance
  # methods defined after it. Every method below is defined with `def self.`,
  # so this modifier does not make any of them private.
  private
1✔
205

206
  # Extracts the CONTENTdm server host name from a manifest id, with the
  # leading "cdm" replaced by "server"
  # (https://cdm17168.contentdm.oclc.org/... => server17168.contentdm.oclc.org).
  #
  # Accepts both http and https ids. With only "https://" stripped, an
  # "http://" id would be cut at its first slash and come back as "http:".
  #
  # @param at_id [String] the manifest id
  # @return [String] the host name without scheme or path
  def self.cdm_server(at_id)
    at_id.sub(/https?:\/\/cdm/, 'server').sub(/\/.*/, '')
  end
209

210
  # Extracts the collection alias from a manifest id. Three layouts are
  # handled:
  #   .../iiif/info/<collection>/<id>/manifest.json
  #   .../iiif/2/<collection>:<id>/manifest.json
  #   .../iiif/<collection>:<id>/manifest.json
  def self.cdm_collection(at_id)
    case at_id
    when /.*iiif\/info\//
      at_id.sub(/.*iiif\/info\//, '').sub(/\/\d+\/manifest.json/, '')
    when /.*iiif\/2\//
      at_id.sub(/.*iiif\/2\//, '').sub(/:.*/, '')
    else
      # e.g. https://cdm17168.contentdm.oclc.org/iiif/WFP:997/manifest.json
      at_id.sub(/.*iiif\//, '').sub(/:.*/, '')
    end
  end
219

220
  # Extracts the record number from a canvas id: drops the "/canvas/..." tail,
  # keeps the last path segment, then keeps what follows the last ":" in it.
  def self.cdm_record(at_id)
    without_canvas = at_id.sub(/\/canvas\/.*/, '')
    last_segment = without_canvas.sub(/^.*\//, '')
    last_segment.sub(/^.*:/, '')
  end
223

224
  # Resolves the "cdmNNNNN" server name for +host+.
  #
  # When +host+ itself has the form http(s)://cdmNNNNN... the name is taken
  # from it directly. Otherwise <host>/iiif/info/manifest.json is downloaded
  # and the name is taken from that document's '@id'.
  #
  # @param host [String] scheme and host, e.g. "https://digital.example.edu"
  # @return [String, nil] the server name, or nil when none can be found
  # @raise [OpenURI::HTTPError] when the manifest request fails
  # @raise [JSON::ParserError] when the manifest is not valid JSON
  def self.get_cdm_host_from_url(host)
    cdm_host_pattern = /https?:\/\/(cdm\d+)/

    direct = host.match(cdm_host_pattern)
    return direct[1] if direct

    # block form so the handle returned by open-uri is closed once read
    res = URI.open("#{host}/iiif/info/manifest.json", &:read)
    manifest_id = JSON.parse(res)['@id']
    return nil unless manifest_id

    resolved = manifest_id.match(cdm_host_pattern)
    resolved && resolved[1]
  end
238

239
  # Converts a CONTENTdm page URL into a IIIF manifest URL.
  #
  # The collection alias and optional record number are read from a
  # ".../collection/<alias>[/id/<number>]" path. The older "iiif/info" manifest
  # URL is probed first; when that request fails with an HTTP error the
  # "iiif/2" form is returned instead (that form is not probed).
  #
  # @param url [String] a CONTENTdm URL
  # @return [String] the manifest URL
  # @raise [RuntimeError] when no cdmNNNNN server can be determined for the host
  def self.cdm_url_to_iiif(url)
    uri = URI(url)

    server = get_cdm_host_from_url("#{uri.scheme}://#{uri.host}")
    raise 'ContentDM URLs must be of the form http://cdmNNNNN.contentdm.oclc.org/...' if server.nil?

    matches = uri.path.match(/.*collection\/(\w+)(?:\/id\/(\d+))?/)
    collection = matches && matches[1]
    record = matches && matches[2]

    iiif_root = "https://#{server}.contentdm.oclc.org/iiif"

    # support back-level CONTENTdm IIIF presentation implementation
    new_uri = cdm_manifest_uri("#{iiif_root}/info", collection, record, '/')

    begin
      # only reachability matters here, so close the handle straight away
      URI.open(new_uri, &:close)
    rescue OpenURI::HTTPError
      # e.g. https://cdm17217.contentdm.oclc.org/iiif/2/voter1867:4764/manifest.json
      new_uri = cdm_manifest_uri("#{iiif_root}/2", collection, record, ':')
    end

    new_uri
  end

  # Joins +base+, the optional collection/record part and "manifest.json".
  # +separator+ is placed between the collection alias and the record number.
  def self.cdm_manifest_uri(base, collection, record, separator)
    segments = [ base ]
    segments << (record ? "#{collection}#{separator}#{record}" : collection) if collection
    segments << 'manifest.json'
    segments.join('/')
  end
281

282
  # Returns the manifest of the last work in +collection+ that has one, or nil
  # when the collection has no such work.
  def self.sample_manifest(collection)
    latest_import = collection.works.joins(:sc_manifest).last
    return latest_import unless latest_import

    latest_import.sc_manifest
  end
287
end
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc