• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

benwbrum / fromthepage / 13271621929

11 Feb 2025 08:10PM UTC coverage: 60.749% (+0.4%) from 60.338%
13271621929

Pull #4500

github

web-flow
Merge 11c39dd63 into 05051b96d
Pull Request #4500: changed cdm manifest url to use https

1494 of 2963 branches covered (50.42%)

Branch coverage included in aggregate %.

0 of 1 new or added line in 1 file covered. (0.0%)

115 existing lines in 9 files now uncovered.

6859 of 10787 relevant lines covered (63.59%)

78.09 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

38.24
/lib/contentdm_translator.rb
1
module ContentdmTranslator
1✔
2

3
  # Refreshes every page of a work from CONTENTdm and records the
  # OCR-correction flag on the work.
  #
  # Returns nil without touching anything when the work has no manifest or the
  # manifest @id is rejected by iiif_manifest_is_cdm?.
  #
  # work           - object responding to sc_manifest, collection, pages,
  #                  ocr_correction= and save!
  # ocr_correction - when truthy, the collection's full-text-search field is
  #                  looked up and handed to update_page_from_cdm
  #
  # Raises whatever work.save! raises.
  def self.update_work_from_cdm(work, ocr_correction=false)
    # find the work manifest -- bail out if there is none
    return unless work.sc_manifest
    # make sure the manifest is cdm
    return unless iiif_manifest_is_cdm?(work.sc_manifest.at_id)

    fts_field = nil
    if ocr_correction
      # get the fts field for this collection
      fts_error, fts_field = fts_field_for_collection(work.collection)
      # Report the failed lookup (the message previously interpolated an
      # undefined local and raised NameError); pages are still processed,
      # with a nil field name.
      puts "Error retrieving Full-Text Search field: #{fts_error}\n" if fts_error
    end

    # for each page
    work.pages.each do |page|
      update_page_from_cdm(page, ocr_correction, fts_field)
    end
    work.ocr_correction = ocr_correction
    work.save!
  end
23

24
  # Copies one page's CONTENTdm item record onto the page.
  #
  # page           - object handed to fetch_cdm_info; must respond to
  #                  update_columns
  # ocr_correction - when truthy, the page's source_text is also replaced
  # fts_field      - key of the item-info entry holding the page text
  #
  # The metadata column always receives the pruned item info. source_text is
  # only written when OCR correction is on and a text value was found; it is
  # XML-text-escaped first.
  def self.update_page_from_cdm(page, ocr_correction, fts_field)
    item_info = fetch_cdm_info(page)
    attributes = { metadata: metadata_from_cdm_info(item_info) }

    if ocr_correction
      page_text = ocr_from_cdm_info(item_info, fts_field)
      attributes[:source_text] = page_text.encode(:xml => :text) if page_text
    end

    page.update_columns(attributes)
  end
39

40
  # Downloads and parses the CONTENTdm item-info JSON for a page.
  #
  # page - object whose sc_canvas.sc_canvas_id is converted to the web-service
  #        URL by page_at_id_to_cdm_item_info
  #
  # Returns the parsed JSON document. Errors from URI.open and JSON.parse
  # propagate to the caller.
  def self.fetch_cdm_info(page)
    cdm_url = page_at_id_to_cdm_item_info(page.sc_canvas.sc_canvas_id)
    # Block form: the response handle is closed as soon as the body is read
    # instead of lingering until garbage collection.
    cdm_response = URI.open(cdm_url, &:read)
    JSON.parse(cdm_response)
  end
45

46
  # Downloads and parses the CONTENTdm field configuration for a collection.
  #
  # collection - passed to collection_to_cdm_field_config to build the
  #              web-service URL
  #
  # Returns the parsed JSON document. Errors from URI.open and JSON.parse
  # propagate to the caller.
  def self.fetch_cdm_field_config(collection)
    cdm_url = collection_to_cdm_field_config(collection)
    # Block form: the response handle is closed as soon as the body is read
    # instead of lingering until garbage collection.
    cdm_response = URI.open(cdm_url, &:read)
    JSON.parse(cdm_response)
  end
51

52
  # Item-info keys that metadata_from_cdm_info strips before the record is
  # stored as page metadata. Frozen so the shared list cannot be modified at
  # runtime.
  ITEM_INFO_DENYLIST = %w[
    descri
    date
    creato
    subjec
    relate
    type
    publis
    langua
    rights
    transc
    contac
    fullrs
    find
    dmaccess
    dmimage
    dmcreated
    dmmodified
    dmoclcno
    restrictionCode
    cdmfilesize
    cdmfilesizeformatted
    cdmprintpdf
    cdmhasocr
    cdmisnewspaper
  ].freeze
77

78
  # Returns a copy of the item-info hash without the keys listed in
  # ITEM_INFO_DENYLIST, leaving only the useful and unique entries.
  #
  # info - hash of CONTENTdm item info
  def self.metadata_from_cdm_info(info)
    boilerplate_keys = ITEM_INFO_DENYLIST
    info.except(*boilerplate_keys)
  end
82

83
  # Looks up the full-text value in an item-info hash.
  #
  # info      - hash of CONTENTdm item info
  # fts_field - key under which the text is stored
  #
  # Returns the value when it is a String, otherwise nil (missing key or any
  # non-string value).
  def self.ocr_from_cdm_info(info, fts_field)
    candidate = info[fts_field]
    candidate.is_a?(String) ? candidate : nil
  end
91

92
  # Rewrites a IIIF canvas @id into the dmGetItemInfo web-service URL.
  #
  # Each substitution touches only the first occurrence of its pattern:
  # "cdm" becomes "server", the (digital/)iiif segment becomes the
  # web-service path, the trailing /canvas/cN becomes /json, and a
  # ":<digits>" separator becomes "/<digits>".
  #
  # at_id - canvas @id string
  def self.page_at_id_to_cdm_item_info(at_id)
    at_id
      .sub(/cdm/, 'server')
      .sub(/(digital\/)?iiif/, 'dmwebservices/index.php?q=dmGetItemInfo')
      .sub(/\/canvas\/c\d*/, '/json')
      .sub(/:(\d+)/, '/\1') # handle collection:id format instead of old collection/id
  end
100

101
  # Builds the dmGetCollectionFieldInfo web-service URL for a collection.
  #
  # The URL is derived from the canvas @id of the last page (ordered by
  # pages.created_on) that has a canvas: "cdm" becomes "server", the
  # (digital/)iiif segment becomes the web-service path, and the optional
  # ":<digits>" plus /canvas/cN tail becomes /json.
  #
  # collection - object whose pages relation supports joins/reorder/last
  def self.collection_to_cdm_field_config(collection)
    newest_page = collection.pages.joins(:sc_canvas).reorder('pages.created_on').last
    canvas_id = newest_page.sc_canvas.sc_canvas_id

    canvas_id
      .sub(/cdm/, 'server')
      .sub(/(digital\/)?iiif/, 'dmwebservices/index.php?q=dmGetCollectionFieldInfo')
      .sub(/(:\d+)?\/canvas\/c\d*/, '/json')
  end
109

110

111

112
  # Reports whether a manifest @id looks like it is served by CONTENTdm.
  #
  # True when the id contains the literal host suffix "contentdm.oclc.org" or
  # has the shape iiif/info/<collection>/<record>/manifest.json.
  #
  # The dots are escaped so they match only a literal "." (they previously
  # matched any character), and match? is used so the predicate returns
  # true/false rather than a MatchData object.
  #
  # at_id - manifest @id string
  def self.iiif_manifest_is_cdm?(at_id)
    at_id.match?(/contentdm\.oclc\.org/) || at_id.match?(/iiif\/info\/\w+\/\d+\/manifest\.json/)
  end
115

116
  # Rewrites a digital/iiif-info style @id into the dmGetItemInfo web-service
  # URL: the first "cdm" becomes "server" and the digital/iiif-info segment
  # becomes the web-service path.
  #
  # Always returns a String. The previous implementation returned the result
  # of sub!, which is nil whenever the iiif-info segment is absent.
  #
  # at_id - @id string
  def self.cdm_item_info_from_iiif(at_id)
    at_id
      .sub(/cdm/, 'server')
      .sub(/digital\/iiif-info/, 'dmwebservices/index.php?q=dmGetItemInfo')
  end
120

121
  # Reports whether a collection was imported from CONTENTdm, judged by the
  # manifest of the last work that has one.
  #
  # Returns nil when no work in the collection has a manifest; otherwise the
  # result of iiif_manifest_is_cdm? for that work's manifest @id.
  #
  # collection - object whose works relation supports joins/last
  def self.collection_is_cdm?(collection)
    latest_import = collection.works.joins(:sc_manifest).last
    return latest_import unless latest_import

    iiif_manifest_is_cdm?(latest_import.sc_manifest.at_id)
  end
125

126
  # Finds the nickname of the collection's full-text-search (FTS) field.
  #
  # Returns a two-element array [error, nick]:
  #   [nil, nick]     when the field configuration has an entry of type "FTS"
  #   [message, nil]  when it has none
  #
  # collection - passed to fetch_cdm_field_config
  def self.fts_field_for_collection(collection)
    fts_entry = fetch_cdm_field_config(collection).find { |entry| entry["type"] == "FTS" }
    return [nil, fts_entry['nick']] if fts_entry

    ["No full-text search (FTS) fields were configured on CONTENTdm collection!", nil]
  end
138

139

140
  # Pushes each page's verbatim transcription to CONTENTdm through the
  # Catcher SOAP service, printing progress and the service's reply.
  #
  # work     - work whose pages are exported; its manifest @id supplies the
  #            server and collection alias
  # username - CONTENTdm user name sent with every request
  # password - CONTENTdm password (filtered from the Savon log)
  # license  - CONTENTdm license code sent with every request
  #
  # When the collection has no full-text-search field the error is printed
  # and the process exits via Kernel#exit.
  def self.export_work_to_cdm(work, username, password, license)
    lookup_error, fts_nick = fts_field_for_collection(work.collection)
    if lookup_error
      puts "Error retrieving Full-Text Search field: #{lookup_error}\n"
      exit
    end

    catcher = Savon.client(
      log: true,
      filters: [:password],
      wsdl: 'https://worldcat.org/webservices/contentdm/catcher?wsdl',
      follow_redirects: true
    )

    work.pages.each do |page|
      canvas_at_id = page.sc_canvas.sc_canvas_id
      manifest_at_id = work.sc_manifest.at_id
      puts "\nUpdating #{cdm_collection(manifest_at_id)}\trecord #{cdm_record(canvas_at_id)}\tfrom #{page.title}\t#{page.id}\t#{work.title}.  CONTENTdm response:"

      # The record number identifies the item; the FTS field carries the text.
      field_updates = [
        { field: 'dmrecord', value: cdm_record(canvas_at_id) },
        { field: fts_nick, value: page.verbatim_transcription_plaintext }
      ]

      request = {
        cdmurl: "http://#{cdm_server(manifest_at_id)}:8888",
        username: username,
        password: password,
        license: license,
        collection: cdm_collection(manifest_at_id),
        metadata: { 'metadataList' => { 'metadata' => field_updates } },
        action: 'edit'
      }

      reply = catcher.call(:process_conten_tdm, message: request)
      puts reply.to_hash[:process_conten_tdm_response][:return]
    end
  end
176

177
  # Path of the sync log for a collection:
  # <Rails.root>/public/imports/cdm_sync_<collection id>.log
  #
  # collection - object responding to id
  def self.log_file(collection)
    log_name = "cdm_sync_#{collection.id}.log"
    File.join(Rails.root, 'public', 'imports', log_name)
  end
180

181
  # Returns the full text of the collection's sync log.
  #
  # STDOUT is flushed before the file is read.
  #
  # collection - passed to log_file to locate the log
  def self.log_contents(collection)
    STDOUT.flush
    path = log_file(collection)
    File.read(path)
  end
185

186
  # NOTE: a bare `private` only affects instance methods defined after it. It
  # has no effect on methods defined with `def self.`, so the module methods
  # below remain publicly callable. Hiding them would require
  # `private_class_method`.
  private
1✔
187

188
  # Derives the CONTENTdm server host name from a manifest @id: the leading
  # "http(s)://cdm" is replaced by "server" and everything from the first
  # remaining "/" onward is dropped.
  #
  # Both schemes are accepted; with only "https" recognised, an "http://" id
  # collapsed to the string "http:".
  #
  # at_id - manifest @id string
  def self.cdm_server(at_id)
    at_id.sub(/https?:\/\/cdm/, 'server').sub(/\/.*/, '')
  end
191

192
  # Extracts the collection alias from a manifest @id. Three layouts are
  # handled:
  #   .../iiif/info/<alias>/<record>/manifest.json
  #   .../iiif/2/<alias>:<record>/manifest.json
  #   .../iiif/<alias>:<record>/manifest.json
  #     (e.g. https://cdm17168.contentdm.oclc.org/iiif/WFP:997/manifest.json)
  #
  # at_id - manifest @id string
  def self.cdm_collection(at_id)
    if at_id.match(/.*iiif\/info\//)
      return at_id.sub(/.*iiif\/info\//, '').sub(/\/\d+\/manifest.json/, '')
    end

    # Both remaining layouts separate alias and record with a colon; they only
    # differ in the prefix that has to be removed first.
    remainder =
      if at_id.match(/.*iiif\/2\//)
        at_id.sub(/.*iiif\/2\//, '')
      else
        at_id.sub(/.*iiif\//, '')
      end
    remainder.sub(/:.*/, '')
  end
201

202
  # Extracts the record number from a canvas @id: the /canvas/... tail is
  # removed, then everything up to the last "/", then everything up to the
  # last ":" (if any).
  #
  # at_id - canvas @id string
  def self.cdm_record(at_id)
    without_canvas = at_id.sub(/\/canvas\/.*/, '')
    last_segment = without_canvas.sub(/^.*\//, '')
    last_segment.sub(/^.*:/, '')
  end
205

206
  # Resolves the "cdmNNNNN" server name for a CONTENTdm site.
  #
  # When the given URL already contains http(s)://cdm<digits>, that name is
  # returned directly. Otherwise <host>/iiif/info/manifest.json is fetched and
  # the name is taken from the "@id" of the returned JSON.
  #
  # host - scheme and host of the site, e.g. "https://digital.example.org"
  #
  # Returns the server name, or nil when it cannot be determined. Errors from
  # URI.open and JSON.parse propagate to the caller.
  def self.get_cdm_host_from_url(host)
    host_pattern = /https?:\/\/(cdm\d+)/

    matches = host.match(host_pattern)
    return matches[1] if matches

    # Block form closes the response handle as soon as the body is read.
    res = URI.open("#{host}/iiif/info/manifest.json", &:read)
    url = JSON.parse(res)['@id']
    return nil unless url

    matches = url.match(host_pattern)
    matches ? matches[1] : nil
  end
220

221
  # Converts a CONTENTdm site URL into the URL of the matching IIIF manifest.
  #
  # The collection alias and record number are taken from a
  # .../collection/<alias>[/id/<record>] path when present. The older
  # /iiif/info/... manifest location is tried first; if requesting it raises
  # OpenURI::HTTPError, the /iiif/2/... location is returned instead (without
  # being requested).
  #
  # url - CONTENTdm URL string
  #
  # Returns the manifest URL string.
  # Raises RuntimeError when no cdmNNNNN server name can be determined; other
  # errors from URI.open propagate.
  def self.cdm_url_to_iiif(url)
    uri = URI(url)

    server = get_cdm_host_from_url("#{uri.scheme}://#{uri.host}")
    raise "ContentDM URLs must be of the form http://cdmNNNNN.contentdm.oclc.org/..." if server.nil?

    matches = uri.path.match(/.*collection\/(\w+)(?:\/id\/(\d+))?/)
    collection = matches && matches[1]
    record = matches && matches[2]
    iiif_root = "https://#{server}.contentdm.oclc.org/iiif"

    # support back-level CONTENTdm IIIF presentation implementation
    new_uri =
      if collection && record
        "#{iiif_root}/info/#{collection}/#{record}/manifest.json"
      elsif collection
        "#{iiif_root}/info/#{collection}/manifest.json"
      else
        "#{iiif_root}/info/manifest.json"
      end

    begin
      # Existence probe only: the block form makes open-uri close the response
      # handle straight away instead of leaving it open.
      URI.open(new_uri) { |_response| nil }
    rescue OpenURI::HTTPError
      new_uri =
        if collection && record
          # https://cdm17217.contentdm.oclc.org/iiif/2/voter1867:4764/manifest.json
          "#{iiif_root}/2/#{collection}:#{record}/manifest.json"
        elsif collection
          # https://cdm17217.contentdm.oclc.org/iiif/2/voter1867/manifest.json
          "#{iiif_root}/2/#{collection}/manifest.json"
        else
          # https://cdm17217.contentdm.oclc.org/iiif/2/manifest.json
          "#{iiif_root}/2/manifest.json"
        end
    end

    new_uri
  end
263

264
  # Returns the manifest of the last work in the collection that has one, or
  # nil when no such work exists.
  #
  # collection - object whose works relation supports joins/last
  def self.sample_manifest(collection)
    latest_import = collection.works.joins(:sc_manifest).last
    return latest_import unless latest_import

    latest_import.sc_manifest
  end
269
end
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc