25507167079

Committed 07 May 2026 03:59PM UTC coverage: 78.298%. First build

Build # 25507167079

Build Type

Pull #379

github

Committed by

cdoyle-temple

Commit Message

workaround for different db adaptors

Pull Request #379: IMT-208 match-isilon-assets-to-content-dm

Coverage Stats

4 of 5 new or added lines in 1 file covered. (80.0%)

920 of 1175 relevant lines covered (78.3%)

15.32 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

97.14

/app/services/sync_service/contentdm_filename_sync.rb

# frozen_string_literal: true

require "csv"
require "fileutils"

module SyncService
  class ContentdmFilenameSync
    # Text written into an asset's notes column when its filename matches a CSV row.
    CONTENTDM_FILENAME_MATCH_NOTE = "Filename exists in CONTENTdm"

    # Summary counters returned by #sync.
    SyncResult = Struct.new(
      :updated_count,
      :rows_touched,
      :rows_matched,
      :rows_unmatched,
      :rows_discarded,
      keyword_init: true
    )

    # Default folder for .call. It is nil, so callers must pass csv_folder: explicitly
    # (a nil folder fails validation with "csv_folder is required").
    CSV_FOLDER = nil

    # Column headers read from the input CSVs and written to the non-matches CSV.
    FILENAME_HEADER = "File Name"
    COLLECTION_HEADER = "Collection"

    # Report of CSV rows that matched no asset; rewritten on every #sync run.
    NON_MATCHES_CSV_PATH = Rails.root.join("tmp", "contentdm_filename_non_matches.csv")

    # Number of filenames sent to the database per query.
    BATCH_SIZE = 500

    # Precedence rules for a filename claimed by two different CSV files.
    # Each key is the pair of CSV basenames in sorted order (the lookup sorts the pair
    # before indexing); the value is the basename whose row wins.
    # Frozen so the shared rule table cannot be mutated at runtime.
    CONFLICT_WINNERS = {
      [ "ambler_filenames.csv", "scrc_photographs_filenames.csv" ] => "ambler_filenames.csv",
      [ "bulletin_photos_filenames.csv", "bulletin_photos_restricted_filenames.csv" ] => "bulletin_photos_filenames.csv",
      [ "bulletin_photos_filenames.csv", "inquirer_filenames.csv" ] => "bulletin_photos_filenames.csv",
      [ "bulletin_photos_restricted_filenames.csv", "inquirer_filenames.csv" ] => "bulletin_photos_restricted_filenames.csv",
      [ "cityparks_filenames.csv", "hadv_filenames.csv" ] => "cityparks_filenames.csv",
      [ "inquirer_filenames.csv", "scrc_photographs_filenames.csv" ] => "inquirer_filenames.csv"
    }.freeze

    # Convenience entry point: builds a service for +csv_folder+ and runs #sync on it.
    #
    # @param csv_folder [String, nil] directory holding the CSV files (defaults to CSV_FOLDER)
    # @return [SyncResult] whatever #sync returns
    def self.call(csv_folder: CSV_FOLDER)
      service = new(csv_folder: csv_folder)
      service.sync
    end

    # Stores the folder path as a stripped string, or nil when it is blank.
    #
    # @param csv_folder [#to_s, nil] directory holding the CSV files
    def initialize(csv_folder:)
      folder = csv_folder.to_s.strip
      @csv_folder = folder.presence
    end

    # Runs the whole sync: validates the folder, reads every "*.csv" inside it (in sorted
    # path order), updates matching assets collection by collection and returns the totals.
    #
    # The non-matches CSV is rewritten on every call, including when an error is raised
    # part-way through; in that case it holds whatever was collected up to that point.
    #
    # @return [SyncResult] counters for the run
    # @raise [ArgumentError] if the folder is invalid or contains no CSV files
    def sync
      unmatched_rows = []
      validate_csv_folder!

      csv_paths = Dir.glob(File.join(@csv_folder, "*.csv")).sort
      raise ArgumentError, "No CSV files found in #{@csv_folder}" if csv_paths.empty?

      loaded = load_filename_map(csv_paths)
      totals = {
        updated_count: 0,
        rows_touched: loaded[:rows_touched],
        rows_matched: 0,
        rows_unmatched: 0,
        rows_discarded: loaded[:rows_discarded]
      }

      # An empty map simply skips this loop, leaving the zeroed counters in place.
      loaded[:filename_map].each do |collection_name, filenames|
        outcome = update_matching_assets(collection_name, filenames, unmatched_rows)
        %i[updated_count rows_matched rows_unmatched].each { |key| totals[key] += outcome[key] }
      end

      SyncResult.new(**totals)
    ensure
      write_non_matches_csv(unmatched_rows || [])
    end

    private

    # Reads every CSV and builds the per-collection list of filenames to sync.
    #
    # Each row contributes its FILENAME_HEADER and COLLECTION_HEADER cells. Rows where
    # either value is blank after normalization are skipped and not counted. When the
    # same normalized filename occurs more than once, #preferred_entry decides which row
    # survives and the remaining rows are counted as discarded.
    #
    # @param csv_files [Array<String>] paths of the CSV files to read
    # @return [Hash] +:filename_map+ (collection name => array of
    #   +{ normalized_filename:, original_filename: }+ hashes), +:rows_touched+ (rows
    #   accepted from the CSVs) and +:rows_discarded+ (duplicate rows dropped)
    # @raise [ArgumentError] if a path does not exist, or if #preferred_entry rejects a conflict
    def load_filename_map(csv_files)
      rows_by_filename = Hash.new { |store, name| store[name] = [] }
      accepted_rows = 0

      csv_files.each do |csv_path|
        raise ArgumentError, "CSV file not found: #{csv_path}" unless File.exist?(csv_path)

        basename = File.basename(csv_path)
        CSV.foreach(csv_path, headers: true, liberal_parsing: true) do |row|
          raw_name = row[FILENAME_HEADER].to_s.strip
          normalized_name = normalize_filename(raw_name)
          collection = normalize_collection_name(row[COLLECTION_HEADER])
          next if normalized_name.blank? || collection.blank?

          accepted_rows += 1
          rows_by_filename[normalized_name] << {
            collection_name: collection,
            original_filename: raw_name,
            source_file: basename
          }
        end
      end

      dropped_rows = 0
      by_collection = Hash.new { |store, name| store[name] = [] }

      rows_by_filename.each do |normalized_name, candidates|
        # Fold the candidates left to right; the first row is the initial winner.
        survivor = candidates.reduce do |current, challenger|
          preferred_entry(normalized_name, current, challenger)
        end
        dropped_rows += candidates.size - 1

        by_collection[survivor[:collection_name]] << {
          normalized_filename: normalized_name,
          original_filename: survivor[:original_filename]
        }
      end

      { filename_map: by_collection, rows_touched: accepted_rows, rows_discarded: dropped_rows }
    end

    # Ensures a folder was supplied and that it exists on disk.
    #
    # @return [nil]
    # @raise [ArgumentError] if the folder is blank or is not an existing directory
    def validate_csv_folder!
      if @csv_folder.blank?
        raise ArgumentError, "csv_folder is required"
      elsif !Dir.exist?(@csv_folder)
        raise ArgumentError, "CSV folder not found: #{@csv_folder}"
      end
    end

    # Canonical form of a filename used as the matching key: stripped and lower-cased.
    #
    # @param filename [#to_s, nil] raw filename value
    # @return [String, nil] the normalized name, or nil when nothing is left
    def normalize_filename(filename)
      cleaned = filename.to_s.strip.downcase
      cleaned.presence
    end

    # Strips surrounding whitespace from a collection name; case is left untouched.
    #
    # @param collection_name [#to_s, nil] raw collection value
    # @return [String, nil] the stripped name, or nil when nothing is left
    def normalize_collection_name(collection_name)
      cleaned = collection_name.to_s.strip
      cleaned.presence
    end

    # Chooses which of two CSV rows claiming the same filename should be kept.
    #
    # Rows naming the same collection are not a conflict: the existing row stays.
    # Otherwise the winner is looked up in CONFLICT_WINNERS by the sorted pair of
    # source file names.
    #
    # @param filename [String] normalized filename (used in error messages only)
    # @param existing_entry [Hash] current winner (+:collection_name+, +:source_file+)
    # @param candidate_entry [Hash] row being compared against it
    # @return [Hash] either +existing_entry+ or +candidate_entry+
    # @raise [ArgumentError] if both rows come from the same file, or no precedence
    #   rule exists for the two files
    def preferred_entry(filename, existing_entry, candidate_entry)
      existing_collection = existing_entry[:collection_name]
      candidate_collection = candidate_entry[:collection_name]
      return existing_entry if existing_collection == candidate_collection

      existing_source = existing_entry[:source_file]
      candidate_source = candidate_entry[:source_file]

      if existing_source == candidate_source
        raise ArgumentError,
              "Conflicting collections for filename '#{filename}' within #{existing_source}: " \
              "'#{existing_collection}' and '#{candidate_collection}'"
      end

      precedence_key = [ existing_source, candidate_source ].sort
      winning_file = CONFLICT_WINNERS[precedence_key]
      if winning_file.blank?
        raise ArgumentError,
              "Conflicting collections for filename '#{filename}': '#{existing_collection}' " \
              "(#{existing_source}) and '#{candidate_collection}' " \
              "(#{candidate_source})"
      end

      return candidate_entry if winning_file == candidate_source

      existing_entry
    end

    # Assigns the named collection to every asset whose name matches one of +filenames+,
    # working in batches of BATCH_SIZE, and records the entries that matched nothing.
    #
    # @param collection_name [String] name of the collection to assign
    # @param filenames [Array<Hash>] entries with +:normalized_filename+ and +:original_filename+
    # @param non_matches [Array<Hash>] accumulator, appended to in place with
    #   +{ original_filename:, collection_name: }+ for each unmatched entry
    # @return [Hash] +:updated_count+ (sum of the update_all results), +:rows_matched+
    #   (distinct matched names) and +:rows_unmatched+ (entries with no matching asset)
    # @raise whatever #find_collection! raises when the collection cannot be found
    def update_matching_assets(collection_name, filenames, non_matches)
      collection = find_collection!(collection_name)
      # The notes expression does not depend on the batch, so build it once up front.
      notes_sql = notes_update_sql
      updated_count = 0
      rows_matched = 0
      rows_unmatched = 0

      filenames.each_slice(BATCH_SIZE) do |filename_batch|
        normalized_filenames = filename_batch.map { |entry| entry[:normalized_filename] }
        matched_filenames = matching_assets_scope(normalized_filenames)
          .distinct
          .pluck(Arel.sql("LOWER(TRIM(isilon_name))"))
        rows_matched += matched_filenames.size

        # Index the matched names so each membership check is a hash lookup rather
        # than a scan of the whole array.
        matched_index = matched_filenames.to_h { |name| [ name, true ] }
        unmatched_entries = filename_batch.reject { |entry| matched_index.key?(entry[:normalized_filename]) }
        rows_unmatched += unmatched_entries.size
        non_matches.concat(
          unmatched_entries.map do |entry|
            {
              original_filename: entry[:original_filename],
              collection_name: collection_name
            }
          end
        )

        updated_count += matching_assets_scope(normalized_filenames).update_all(
          contentdm_collection_id: collection.id,
          notes: notes_sql,
          updated_at: Time.current
        )
      end

      {
        updated_count: updated_count,
        rows_matched: rows_matched,
        rows_unmatched: rows_unmatched
      }
    end

    # Looks up the collection record by exact name via +find_by!+.
    #
    # @param collection_name [String] collection name taken from the CSV
    # @return [ContentdmCollection] the matching record
    def find_collection!(collection_name)
      lookup = { name: collection_name }
      ContentdmCollection.find_by!(lookup)
    end

    # Scope of assets whose trimmed, lower-cased +isilon_name+ is in the given batch.
    #
    # NOTE(review): the CSV side is lower-cased in Ruby while this side uses SQL LOWER();
    # confirm both agree for non-ASCII names on the adapters in use.
    #
    # @param filename_batch [Array<String>] normalized filenames to match
    # @return the relation produced by +IsilonAsset.where+
    def matching_assets_scope(filename_batch)
      name_condition = "LOWER(TRIM(isilon_name)) IN (?)"
      IsilonAsset.where(name_condition, filename_batch)
    end

    # Rewrites the non-matches report at NON_MATCHES_CSV_PATH, creating its directory if
    # needed. The header row is always written, so an empty list yields a header-only file.
    #
    # @param non_matches [Array<Hash>] entries with +:original_filename+ and +:collection_name+
    def write_non_matches_csv(non_matches)
      report_dir = NON_MATCHES_CSV_PATH.dirname
      FileUtils.mkdir_p(report_dir)

      header_row = [ FILENAME_HEADER, COLLECTION_HEADER ]
      CSV.open(NON_MATCHES_CSV_PATH, "w") do |report|
        report << header_row
        non_matches.each { |miss| report << [ miss[:original_filename], miss[:collection_name] ] }
      end
    end

    # Builds the SQL CASE expression assigned to +notes+ during the bulk update:
    # empty notes become the match note, notes that lack it get it appended after "; ",
    # and notes that already contain it are left unchanged.
    #
    # @return the value of +Arel.sql+ wrapping the single-line expression
    def notes_update_sql
      note_literal = ActiveRecord::Base.connection.quote(CONTENTDM_FILENAME_MATCH_NOTE)
      note_position = note_contains_sql(note_literal)

      clauses = [
        "CASE",
        "WHEN notes IS NULL OR TRIM(notes) = '' THEN #{note_literal}",
        "WHEN #{note_position} = 0 THEN notes || '; ' || #{note_literal}",
        "ELSE notes",
        "END"
      ]
      Arel.sql(clauses.join(" ").squish)
    end

    # SQL fragment giving the position of the note inside +notes+, using +strpos+ when
    # the adapter name contains "postgres" and +instr+ for every other adapter.
    #
    # @param quoted_note [String] the note, already quoted as a SQL literal
    # @return [String] e.g. "instr(notes, 'text')"
    def note_contains_sql(quoted_note)
      adapter = ActiveRecord::Base.connection.adapter_name.downcase
      position_function = adapter.include?("postgres") ? "strpos" : "instr"
      "#{position_function}(notes, #{quoted_note})"
    end
  end
end

1	# frozen_string_literal: true
2
3	require "csv"	1✔
4	require "fileutils"	1✔
5
6	module SyncService	1✔
7	class ContentdmFilenameSync	1✔
8	CONTENTDM_FILENAME_MATCH_NOTE = "Filename exists in CONTENTdm"	1✔
9	SyncResult = Struct.new(	1✔
10	:updated_count,
11	:rows_touched,
12	:rows_matched,
13	:rows_unmatched,
14	:rows_discarded,
15	keyword_init: true
16	)
17	CSV_FOLDER = nil	1✔
18	FILENAME_HEADER = "File Name"	1✔
19	COLLECTION_HEADER = "Collection"	1✔
20	NON_MATCHES_CSV_PATH = Rails.root.join("tmp", "contentdm_filename_non_matches.csv")	1✔
21	BATCH_SIZE = 500	1✔
22	CONFLICT_WINNERS = {
23	[ "ambler_filenames.csv", "scrc_photographs_filenames.csv" ] => "ambler_filenames.csv",	1✔
24	[ "bulletin_photos_filenames.csv", "bulletin_photos_restricted_filenames.csv" ] => "bulletin_photos_filenames.csv",
25	[ "bulletin_photos_filenames.csv", "inquirer_filenames.csv" ] => "bulletin_photos_filenames.csv",
26	[ "bulletin_photos_restricted_filenames.csv", "inquirer_filenames.csv" ] => "bulletin_photos_restricted_filenames.csv",
27	[ "cityparks_filenames.csv", "hadv_filenames.csv" ] => "cityparks_filenames.csv",
28	[ "inquirer_filenames.csv", "scrc_photographs_filenames.csv" ] => "inquirer_filenames.csv"
29	}
30
31	def self.call(csv_folder: CSV_FOLDER)	1✔
32	new(csv_folder: csv_folder).sync	10✔
33	end
34
35	def initialize(csv_folder:)	1✔
36	@csv_folder = csv_folder.to_s.strip.presence	11✔
37	end
38
39	def sync	1✔
40	validate_csv_folder!	10✔
41	csv_files = Dir.glob(File.join(@csv_folder, "*.csv")).sort	10✔
42	raise ArgumentError, "No CSV files found in #{@csv_folder}" if csv_files.empty?	10✔
43
44	load_result = load_filename_map(csv_files)	9✔
45	filename_map = load_result[:filename_map]	8✔
46	non_matches = []	8✔
47	summary = {	8✔
48	updated_count: 0,
49	rows_touched: load_result[:rows_touched],
50	rows_matched: 0,
51	rows_unmatched: 0,
52	rows_discarded: load_result[:rows_discarded]
53	}
54
55	if filename_map.empty?	8✔
56	return SyncResult.new(**summary)	×
57	end
58
59	filename_map.each do \|collection_name, filenames\|	8✔
60	result = update_matching_assets(collection_name, filenames, non_matches)	11✔
61	summary[:updated_count] += result[:updated_count]	10✔
62	summary[:rows_matched] += result[:rows_matched]	10✔
63	summary[:rows_unmatched] += result[:rows_unmatched]	10✔
64	end
65	SyncResult.new(**summary)	7✔
66	ensure
67	write_non_matches_csv(non_matches \|\| [])	10✔
68	end
69
70	private	1✔
71
72	def load_filename_map(csv_files)	1✔
73	filename_entries = Hash.new { \|hash, key\| hash[key] = [] }	23✔
74	rows_touched = 0	9✔
75
76	csv_files.each do \|csv_path\|	9✔
77	raise ArgumentError, "CSV file not found: #{csv_path}" unless File.exist?(csv_path)	14✔
78
79	source_file = File.basename(csv_path)	14✔
80	CSV.foreach(csv_path, headers: true, liberal_parsing: true) do \|row\|	14✔
81	original_filename = row[FILENAME_HEADER].to_s.strip	18✔
82	filename = normalize_filename(original_filename)	18✔
83	collection_name = normalize_collection_name(row[COLLECTION_HEADER])	18✔
84	next if filename.blank? \|\| collection_name.blank?	18✔
85	rows_touched += 1	18✔
86
87	filename_entries[filename] << {	18✔
88	collection_name: collection_name,
89	original_filename: original_filename,
90	source_file: source_file
91	}
92	end
93	end
94
95	rows_discarded = 0	9✔
96	filename_to_collection = filename_entries.each_with_object({}) do \|(filename, entries), resolved\|	9✔
97	winner = entries.first	14✔
98	entries.drop(1).each do \|candidate_entry\|	14✔
99	winner = preferred_entry(filename, winner, candidate_entry)	4✔
100	end
101	rows_discarded += entries.size - 1	13✔
102	resolved[filename] = winner	13✔
103	end
104
105	filename_map = filename_to_collection.each_with_object(Hash.new { \|hash, key\| hash[key] = [] }) do \|(filename, entry), grouped\|	19✔
106	grouped[entry[:collection_name]] << {	13✔
107	normalized_filename: filename,
108	original_filename: entry[:original_filename]
109	}
110	end
111
112	{
113	filename_map: filename_map,	8✔
114	rows_touched: rows_touched,
115	rows_discarded: rows_discarded
116	}
117	end
118
119	def validate_csv_folder!	1✔
120	raise ArgumentError, "csv_folder is required" if @csv_folder.blank?	10✔
121
122	raise ArgumentError, "CSV folder not found: #{@csv_folder}" unless Dir.exist?(@csv_folder)	10✔
123	end
124
125	def normalize_filename(filename)	1✔
126	filename.to_s.strip.downcase.presence	18✔
127	end
128
129	def normalize_collection_name(collection_name)	1✔
130	collection_name.to_s.strip.presence	18✔
131	end
132
133	def preferred_entry(filename, existing_entry, candidate_entry)	1✔
134	return existing_entry if existing_entry[:collection_name] == candidate_entry[:collection_name]	4✔
135
136	if existing_entry[:source_file] == candidate_entry[:source_file]	3✔
137	raise ArgumentError,	×
138	"Conflicting collections for filename '#{filename}' within #{existing_entry[:source_file]}: " \
139	"'#{existing_entry[:collection_name]}' and '#{candidate_entry[:collection_name]}'"
140	end
141
142	winning_file = CONFLICT_WINNERS[[ existing_entry[:source_file], candidate_entry[:source_file] ].sort]	3✔
143	if winning_file.blank?	3✔
144	raise ArgumentError,	1✔
145	"Conflicting collections for filename '#{filename}': '#{existing_entry[:collection_name]}' " \
146	"(#{existing_entry[:source_file]}) and '#{candidate_entry[:collection_name]}' " \
147	"(#{candidate_entry[:source_file]})"
148	end
149
150	winning_file == candidate_entry[:source_file] ? candidate_entry : existing_entry	2✔
151	end
152
153	def update_matching_assets(collection_name, filenames, non_matches)	1✔
154	collection = find_collection!(collection_name)	11✔
155	updated_count = 0	10✔
156	rows_matched = 0	10✔
157	rows_unmatched = 0	10✔
158
159	filenames.each_slice(BATCH_SIZE) do \|filename_batch\|	10✔
160	normalized_filenames = filename_batch.map { \|entry\| entry[:normalized_filename] }	22✔
161	matched_filenames = matching_assets_scope(normalized_filenames)	10✔
162	.distinct
163	.pluck(Arel.sql("LOWER(TRIM(isilon_name))"))
164	rows_matched += matched_filenames.size	10✔
165
166	unmatched_entries = filename_batch.reject { \|entry\| matched_filenames.include?(entry[:normalized_filename]) }	22✔
167	rows_unmatched += unmatched_entries.size	10✔
168	non_matches.concat(	10✔
169	unmatched_entries.map do \|entry\|
170	{
171	original_filename: entry[:original_filename],	2✔
172	collection_name: collection_name
173	}
174	end
175	)
176
177	updated_count += matching_assets_scope(normalized_filenames).update_all(	10✔
178	contentdm_collection_id: collection.id,
179	notes: notes_update_sql,
180	updated_at: Time.current
181	)
182	end
183
184	{
185	updated_count: updated_count,	10✔
186	rows_matched: rows_matched,
187	rows_unmatched: rows_unmatched
188	}
189	end
190
191	def find_collection!(collection_name)	1✔
192	ContentdmCollection.find_by!(name: collection_name)	11✔
193	end
194
195	def matching_assets_scope(filename_batch)	1✔
196	IsilonAsset.where("LOWER(TRIM(isilon_name)) IN (?)", filename_batch)	20✔
197	end
198
199	def write_non_matches_csv(non_matches)	1✔
200	FileUtils.mkdir_p(NON_MATCHES_CSV_PATH.dirname)	10✔
201
202	CSV.open(NON_MATCHES_CSV_PATH, "w") do \|csv\|	10✔
203	csv << [ FILENAME_HEADER, COLLECTION_HEADER ]	10✔
204
205	non_matches.each do \|entry\|	10✔
206	csv << [ entry[:original_filename], entry[:collection_name] ]	2✔
207	end
208	end
209	end
210
211	def notes_update_sql	1✔
212	quoted_note = ActiveRecord::Base.connection.quote(CONTENTDM_FILENAME_MATCH_NOTE)	11✔
213	contains_note_sql = note_contains_sql(quoted_note)	11✔
214
215	Arel.sql(<<~SQL.squish)	11✔
216	CASE
217	WHEN notes IS NULL OR TRIM(notes) = '' THEN #{quoted_note}
218	WHEN #{contains_note_sql} = 0 THEN notes \|\| '; ' \|\| #{quoted_note}
219	ELSE notes
220	END
221	SQL
222	end
223
224	def note_contains_sql(quoted_note)	1✔
225	if ActiveRecord::Base.connection.adapter_name.downcase.include?("postgres")	11✔
NEW 226	"strpos(notes, #{quoted_note})"	×
227	else
228	"instr(notes, #{quoted_note})"	11✔
229	end
230	end
231	end
232	end

tulibraries / isilon-tracker / 25507167079

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous