• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

benwbrum / fromthepage / 17387282326

01 Sep 2025 09:13PM UTC coverage: 64.405%. Remained the same
17387282326

push

github

web-flow
4857 - Require rubocop step in CI (#4858)

* 4857 - Require rubocop step in CI

* 4865 - Organize gemfiles

1790 of 3303 branches covered (54.19%)

Branch coverage included in aggregate %.

839 of 1497 new or added lines in 133 files covered. (56.05%)

43 existing lines in 29 files now uncovered.

7928 of 11786 relevant lines covered (67.27%)

103.82 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

71.85
/app/models/xml_source_processor.rb
1
module XmlSourceProcessor
1✔
2
  # Validates the subject links found in the transcription source.
  # Nothing is checked when +source_text+ is blank; otherwise the text is
  # handed to #validate_links.
  def validate_source
    return if self.source_text.blank?

    validate_links(self.source_text)
  end
8

9
  # Validates the subject links found in the translation source.
  # Nothing is checked when +source_translation+ is blank; otherwise the text
  # is handed to #validate_links.
  def validate_source_translation
    return if self.source_translation.blank?

    validate_links(self.source_translation)
  end
15

16
  # Checks +text+ for problems or typos in subject links, which are written
  # as [[Subject]] or [[Subject|display text]]. Every problem found is added
  # to errors[:base].
  #
  # Problems detected: a tag closed with three brackets (validation stops at
  # the first one), a tag with no closing "]]", a blank tag, an unclosed
  # single "[" inside a tag, and a blank subject or blank display text
  # around a "|".
  #
  # @param text [String] transcription or translation source
  # @return [Array<String>, nil] the tag fragments examined, or nil after the
  #   early exit for triple brackets
  def validate_links(text)
    error_scope = [ :activerecord, :errors, :models, :xml_source_processor ]
    # \A/\z anchor the whole string; ^/$ would match any empty line inside it
    blank = /\A\s*\z/
    add_error = lambda do |key, **options|
      message = I18n.t('subject_linking_error', scope: error_scope) +
                I18n.t(key, **options, scope: error_scope)
      errors.add(:base, message)
    end

    # split on all begin-braces
    tags = text.split('[[')
    debug("validate_source: tags to process are #{tags.inspect}")
    # Remove only the leading fragment (the text before the first tag).
    # Subtracting it by value would also discard later tags with equal content.
    tags = tags.drop(1)
    debug("validate_source: massaged tags to process are #{tags.inspect}")

    tags.each do |tag|
      debug(tag)

      if tag.include?(']]]')
        add_error.call('tags_should_not_use_3_brackets')
        return
      end

      unless tag.include?(']]')
        tag = tag.strip
        add_error.call('wrong_number_of_closing_braces', tag: '"[[' + tag + '"')
      end

      # just pull the pieces between the braces; partition yields "" (never
      # nil) when nothing precedes the closing braces, e.g. for "[[]]"
      inner_tag = tag.partition(']]').first
      add_error.call('blank_tag_in', tag: '"[[' + tag + '"') if inner_tag.match?(blank)

      # check for unclosed single bracket
      if inner_tag.include?('[') && !inner_tag.include?(']')
        add_error.call('unclosed_bracket_within', tag: '"' + inner_tag + '"')
      end

      # check for blank title or display name with pipes
      next unless inner_tag.include?('|')

      # limit -1 keeps a trailing empty part, so "[[Subject|]]" is detected
      tag_parts = inner_tag.split('|', -1)
      debug("validate_source: inner tag parts are #{tag_parts.inspect}")
      quoted = '"[[' + inner_tag + ']]"'
      add_error.call('blank_subject_in', tag: quoted) if tag_parts[0].match?(blank)
      add_error.call('blank_text_in', tag: quoted) if tag_parts[1].match?(blank)
    end
  end
63

64
# Writer for +source_text+: flags the attribute as changed through
# source_text_will_change! and then delegates to the inherited writer.
# NOTE(review): presumably the explicit flag exists so that in-place edits of
# the string are still picked up by dirty tracking -- confirm.
def source_text=(text)
  source_text_will_change!
  super(text)
end
68

69
# Writer for +source_translation+: flags the attribute as changed through
# source_translation_will_change! and then delegates to the inherited writer.
# NOTE(review): presumably the explicit flag exists so that in-place edits of
# the string are still picked up by dirty tracking -- confirm.
def source_translation=(text)
  source_translation_will_change!
  super(text)
end
73

74
  ##############################################
  # All code to convert transcriptions from source
  # format to canonical xml format belongs here.
  ##############################################

  # Regenerates the XML for whichever source attributes have changed:
  # +xml_text+ from the transcription and, for objects that respond to
  # +source_translation+, +xml_translation+ from the translation.
  def process_source
    self.xml_text = wiki_to_xml(self, Page::TEXT_TYPE::TRANSCRIPTION) if source_text_changed?

    translation_changed = self.respond_to?(:source_translation) && source_translation_changed?
    self.xml_translation = wiki_to_xml(self, Page::TEXT_TYPE::TRANSLATION) if translation_changed
  end
87

88
  # Converts a page's wiki-style source into the canonical XML string.
  #
  # The source is chosen by +text_type+ (transcription or translation; any
  # other value yields an empty source) and then run through the processing
  # steps in a fixed order. Subject links are only expanded when the page's
  # collection has not disabled subjects.
  #
  # @param page [#collection, #source_text, #source_translation]
  # @param text_type [Object] one of the Page::TEXT_TYPE constants
  # @return [String] the generated XML
  def wiki_to_xml(page, text_type)
    subjects_disabled = page.collection.subjects_disabled

    source_text =
      case text_type
      when Page::TEXT_TYPE::TRANSCRIPTION then page.source_text
      when Page::TEXT_TYPE::TRANSLATION then page.source_translation
      else ''
      end

    # Work on a private mutable copy because later steps edit the string in
    # place. to_s keeps a nil attribute from raising TypeError in String.new.
    xml_string = String.new(source_text.to_s)
    xml_string = process_latex_snippets(xml_string)
    xml_string = clean_bad_braces(xml_string)
    xml_string = clean_script_tags(xml_string)
    xml_string = process_square_braces(xml_string) unless subjects_disabled
    xml_string = process_linewise_markup(xml_string)
    xml_string = process_line_breaks(xml_string)
    xml_string = valid_xml_from_source(xml_string)
    xml_string = update_links_and_xml(xml_string, false, text_type)
    xml_string = postprocess_xml_markup(xml_string)
    postprocess_sections
    xml_string
  end
113

114

115
  # Removes opening and closing script tags from +text+ to prevent javascript
  # injection. Only the tags are removed; whatever sat between them is kept.
  #
  # Matching is case-insensitive, and removal is repeated until no tag is
  # left, because deleting an inner tag can splice a new one together
  # (e.g. "<scr<script>ipt>").
  #
  # @param text [String]
  # @return [String] a new string without script tags
  def clean_script_tags(text)
    script_tag = /<\/?script.*?>/mi
    cleaned = text.gsub(script_tag, '')
    # every pass shortens the string, so this terminates
    cleaned = cleaned.gsub(script_tag, '') while cleaned.match?(script_tag)
    cleaned
  end
120

121
  BAD_SHIFT_REGEX = /\[\[([[[:alpha:]][[:blank:]]|,\(\)\-[[:digit:]]]+)\}\}/
1✔
122
  def clean_bad_braces(text)
1✔
123
    text.gsub BAD_SHIFT_REGEX, '[[\\1]]'
77✔
124
  end
125

126
  # A complete [[...]] subject link; may span several lines.
  BRACE_REGEX = /\[\[.*?\]\]/m

  # Replaces every [[Title]] or [[Title|display]] link in +text+ with
  # <link target_title="Title">display</link>. The title is passed through
  # #canonicalize_title; the display text is used verbatim.
  #
  # +text+ is modified in place and also returned.
  #
  # @param text [String] mutable source text
  # @return [String] the same string object, with links converted
  def process_square_braces(text)
    # leave the receiver untouched (even if frozen) when there is nothing to do
    return text unless text.match?(BRACE_REGEX)

    # The block form inserts the markup literally; a replacement *string*
    # would treat backslashes in the user's text as backreferences.
    text.gsub!(BRACE_REGEX) do |wikilink|
      # strip braces
      munged = wikilink.sub('[[', '').sub(']]', '')

      # extract the title and display
      if munged.include?('|')
        # limit -1 keeps empty parts, so "[[|]]" gives two empty strings
        # instead of a nil title
        parts = munged.split('|', -1)
        title = parts[0]
        verbatim = parts[1]
      else
        title = munged
        verbatim = munged
      end

      "<link target_title=\"#{canonicalize_title(title)}\">#{verbatim}</link>"
    end

    text
  end
153

154
  # Strips subject-link markup from +text+, keeping only what a reader sees:
  # [[Title|display]] becomes "display" and [[Title]] becomes "Title".
  #
  # +text+ is modified in place and also returned.
  #
  # @param text [String] mutable source text
  # @return [String] the same string object, without link markup
  def remove_square_braces(text)
    # leave the receiver untouched (even if frozen) when there is nothing to do
    return text unless text.match?(BRACE_REGEX)

    # The block form inserts the text literally; a replacement *string* would
    # treat backslashes in the user's text as backreferences.
    text.gsub!(BRACE_REGEX) do |wikilink|
      # remove title (multiline, because a link may span several lines)
      visible = wikilink.include?('|') ? wikilink.sub(/\[\[.*?\|/m, '') : wikilink
      visible.sub('[[', '').sub(']]', '')
    end

    text
  end
169

170
  # A LaTeX snippet written as {{tex: ... :tex}}; group 2 is the LaTeX source.
  LATEX_SNIPPET = /(\{\{tex:?(.*?):?tex\}\})/m

  # Replaces each LaTeX snippet in +text+ with a <texFigure position="n"/>
  # placeholder and stores the snippet source in the n-th entry of
  # +tex_figures+ (reusing an existing figure or building a new TexFigure).
  #
  # Objects that do not respond to +tex_figures+ get +text+ back unchanged.
  # Otherwise +text+ is modified in place and also returned.
  #
  # Snippets are numbered by occurrence, so two identical snippets each get
  # their own placeholder and figure.
  #
  # @param text [String] mutable source text
  # @return [String] the same string object
  def process_latex_snippets(text)
    return text unless self.respond_to? :tex_figures

    figures = self.tex_figures.to_a
    position = 0

    # only touch the string when there is something to replace
    if text.match?(LATEX_SNIPPET)
      text.gsub!(LATEX_SNIPPET) do
        contents = Regexp.last_match(2)

        figure = figures[position] || TexFigure.new
        figure.source = contents unless figure.source == contents
        figures[position] = figure

        position += 1 # position attribute in acts as list starts with 1
        "<texFigure position=\"#{position}\"/>"
      end
    end

    self.tex_figures = figures
    text
  end
194

195
  # A table header or row: a line containing a pipe with whitespace around it.
  HEADER = /\s\|\s/
  # The dashed line separating a table header from its body.
  SEPARATOR = /---.*\|/
  ROW = HEADER

  # Processes +text+ line by line: section headings are converted by
  # #process_any_sections and pipe-delimited tables are turned into
  # <table class="tabular"> markup.
  #
  # Side effects: resets @sections and @tables. Every table with at least one
  # body row is appended to @tables as
  # { header: [...], rows: [[...], ...], section: <last section seen> }.
  #
  # A table starts at the first line matching HEADER and ends at the first
  # line that is neither a separator nor a row, or at the end of the text.
  #
  # @param text [String]
  # @return [String] the processed lines joined with single spaces
  def process_linewise_markup(text)
    @tables = []
    @sections = []
    new_lines = []
    current_table = nil

    # Emits the closing markup for the open table and records the table only
    # when it has a body. Shared by the in-text and end-of-text cases so a
    # table is never recorded twice.
    close_table = lambda do
      unless current_table[:rows].empty? # only process tables with bodies
        @tables << current_table
        new_lines << '</tbody>'
      end
      new_lines << '</table><lb/>'
      current_table = nil
    end

    text.lines.each do |line|
      # first deal with any sections
      line = process_any_sections(line)

      if current_table.nil?
        # look for a header
        if line.match?(HEADER)
          # drop the line ending so it does not end up inside the last cell
          header_line = line.chomp
          cells = header_line.split(/\s*\|\s*/)
          # remove the empty cell produced by a leading pipe (same rule as rows)
          cells.shift if header_line.match?(/\A\s*\|/)

          current_table = {
            header: cells.map { |cell_title| cell_title.sub(/^!\s*/, '') },
            rows: [],
            section: @sections.last
          }

          heading = cells.map do |cell|
            if cell.match?(/^!/)
              "<th class=\"bang\">#{cell.sub(/^!\s*/, '')}</th>"
            else
              "<th>#{cell}</th>"
            end
          end.join(' ')
          new_lines << "<table class=\"tabular\">\n<thead>\n<tr>#{heading}</tr></thead>"
        else
          # no current table, no table contents -- NO-OP
          new_lines << line
        end
      elsif line.match?(SEPARATOR)
        # separator line carries no content -- NO-OP
      elsif line.match?(ROW)
        # remove leading and trailing delimiters
        clean_line = line.chomp.sub(/^\s*\|/, '').sub(/\|\s*$/, '')
        # fill the row
        cells = clean_line.split(/\s*\|\s*/, -1) # -1 means "don't prune empty values at the end"
        current_table[:rows] << cells
        rowline = cells.map { |cell| "<td>#{cell}</td> " }.join

        new_lines << '<tbody>' if current_table[:rows].size == 1
        new_lines << "<tr>#{rowline}</tr>"
      else
        # finished the last row
        close_table.call
        # keep the terminating line when it carries text, rather than
        # silently dropping it
        new_lines << line unless line.strip.empty?
      end
    end

    # unclosed table
    close_table.call if current_table

    new_lines.join(' ')
  end
271

272
  # Converts wiki headings in +line+ (a title wrapped in 2 to 6 equals signs,
  # e.g. "== Title ==") into <entryHeading> elements and appends a Section for
  # each one to @sections. Deeper headings are handled first so "=== x ==="
  # is not mistaken for a depth-2 heading.
  #
  # Headings whose title is blank after stripping are left untouched.
  #
  # @param line [String] one line of source text
  # @return [String] the line with headings converted
  def process_any_sections(line)
    6.downto(2) do |depth|
      line.scan(/(={#{depth}}([^=]+)={#{depth}})/).each do |markup, raw_title|
        wiki_title = raw_title.strip
        next if wiki_title.empty?

        verbatim = XmlSourceProcessor.cell_to_plaintext(wiki_title)
        safe_verbatim = verbatim.gsub(/"/, '&quot;')
        heading = "<entryHeading title=\"#{safe_verbatim}\" depth=\"#{depth}\" >#{wiki_title}</entryHeading>"
        # block form: the heading is inserted literally, so backslashes in the
        # title are not read as backreferences
        line = line.sub(markup) { heading }
        @sections << Section.new(title: wiki_title, depth: depth)
      end
    end

    line
  end
287

288
  # Rewrites the title of every section in @sections: each <link> inside the
  # title gets a target_id attribute when the collection has an article whose
  # title equals the link's target_title.
  def postprocess_sections
    @sections.each do |section|
      title_doc = XmlSourceProcessor.cell_to_xml(section.title)

      title_doc.elements.each('//link') do |link|
        target_title = link.attributes['target_title']
        article = collection.articles.where(title: target_title).first
        link.add_attribute('target_id', article.id.to_s) if article
      end

      section.title = XmlSourceProcessor.xml_to_cell(title_doc)
    end
  end
301

302

303
  # Normalizes a link title for use as a target_title attribute: removes
  # tags, turns linebreaks into spaces, collapses whitespace runs into single
  # spaces, and encodes double quotes as &quot;.
  #
  # @param title [String]
  # @return [String] the normalized title
  def canonicalize_title(title)
    title
      .gsub(/<.*?>/, '')      # kill all tags
      .gsub(/\n/, ' ')        # linebreaks -> spaces
      .gsub(/\s+/, ' ')       # multiple spaces -> single spaces
      .gsub(/\"/, '&quot;')   # change double quotes to proper xml
  end
314

315
  # transformations converting source mode transcription to xml
  #
  # Wraps +text+ in a paragraph, turns blank lines into paragraph boundaries
  # and every remaining line ending (\r\n, \n or \r) into <lb/>. A word that
  # ends in a hyphen right before a line ending loses the hyphen and gets
  # <lb break="no" /> instead.
  #
  # @param text [String]
  # @return [String] the text with paragraph and line-break markup
  def process_line_breaks(text)
    hyphenated_break = '\1<lb break="no" />'
    line_break = '<lb/>'

    # order matters: paragraph breaks first, then each line-ending style with
    # its hyphenated form ahead of the plain one
    rules = [
      [ /\s*\n\s*\n\s*/, '</p><p>' ],
      [ /([[:word:]]+)-\r\n\s*/, hyphenated_break ],
      [ /\r\n\s*/, line_break ],
      [ /([[:word:]]+)-\n\s*/, hyphenated_break ],
      [ /\n\s*/, line_break ],
      [ /([[:word:]]+)-\r\s*/, hyphenated_break ],
      [ /\r\s*/, line_break ]
    ]

    rules.reduce("<p>#{text}</p>") do |markup, (pattern, replacement)|
      markup.gsub(pattern, replacement)
    end
  end
327

328
  # Wraps +source+ in an XML declaration and a <page> root element after
  # making it safe to parse: ampersands are escaped (an already escaped
  # "&amp;" is left as is) and characters that are not allowed in XML 1.0 are
  # replaced with spaces.
  #
  # @param source [String, nil] nil is treated as an empty string
  # @return [String] the XML document text
  def valid_xml_from_source(source)
    source = source || ''
    safe = source.gsub(/\&/, '&amp;')
    safe.gsub!(/\&amp;amp;/, '&amp;')
    # \u{...} is required for code points above U+FFFF; "\u10000" would be
    # read as U+1000 followed by "0", which made every astral-plane character
    # (emoji, historic scripts, ...) count as invalid.
    safe.gsub!(/[^\u0009\u000A\u000D\u0020-\uD7FF\uE000-\uFFFD\u{10000}-\u{10FFFF}]/, ' ')

    # the leading whitespace on each line is part of the established output
    "    <?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" \
    "      <page>\n" \
    "        #{safe}\n" \
    "      </page>\n"
  end
341

342
  # Resolves every <link> element in +xml_string+ against the collection's
  # articles and returns the rewritten XML.
  #
  # For each link the target title is the target_title attribute or, when
  # that is missing, the element text. An article with that title is looked
  # up in the collection and built if none exists. Each link element is
  # replaced by a fresh one holding the same children plus target_title and,
  # outside preview mode, target_id and link_id.
  #
  # In preview mode nothing is persisted: existing links are not cleared, new
  # articles are not saved and create_link is not called.
  #
  # @param xml_string [String] XML document text
  # @param preview_mode [Boolean]
  # @param text_type [Object] passed on to clear_links and create_link
  # @return [String] the serialized document
  def update_links_and_xml(xml_string, preview_mode = false, text_type)
    # first clear out the existing links
    clear_links(text_type) unless preview_mode
    processed = +''

    doc = REXML::Document.new(xml_string)
    doc.elements.each('//link') do |element|
      # default the title to the text if it's not specified
      title = element.attributes['target_title'] || element.text
      display_text = element.children.map(&:to_s).join
      debug("link display_text = #{display_text}")
      # change the xml version of quotes back to double quotes for article title
      title = title.gsub('&quot;', '"')

      # create new blank articles if they don't exist already
      article = collection.articles.where(title: title).first
      unless article
        article = Article.new
        article.title = title
        article.collection = collection
        article.created_by_id = Current.user.id if Current.user.present?
        article.save! unless preview_mode
      end
      link_id = create_link(article, display_text, text_type) unless preview_mode

      # now update the attribute
      link_element = REXML::Element.new('link')
      element.children.each { |child| link_element.add(child) }
      link_element.add_attribute('target_title', title)
      debug('element='+link_element.inspect)
      debug('article='+article.inspect)
      unless preview_mode
        link_element.add_attribute('target_id', article.id.to_s)
        link_element.add_attribute('link_id', link_id.to_s)
      end
      element.replace_with(link_element)
    end

    doc.write(processed)
    processed
  end
385

386

387
  # handle XML-dependent post-processing
  #
  # Collapses runs of line breaks: an <lb> element is removed when the node
  # immediately before it is also an <lb> element. Breaks separated by text
  # are kept.
  #
  # @param xml_string [String] XML document text
  # @return [String] the serialized document
  def postprocess_xml_markup(xml_string)
    doc = REXML::Document.new(xml_string)

    doc.elements.each('//lb') do |element|
      previous = element.previous_element
      # previous_sibling must itself be an element, otherwise text sits
      # between the two breaks
      next unless previous && element.previous_sibling.node_type == :element && previous.name == 'lb'

      element.parent.elements.delete(element)
    end

    processed = +''
    doc.write(processed)
    processed
  end
400

401

402
  # Wrapper placed around a cell's markup so it parses as an XML document.
  CELL_PREFIX = "<?xml version='1.0' encoding='UTF-8'?><cell>"
  CELL_SUFFIX = '</cell>'

  # Parses the markup of a table cell (or section title) into an XML document
  # rooted at <cell>. Every ampersand in +cell+ is escaped before parsing.
  #
  # @param cell [String]
  # @return [REXML::Document]
  def self.cell_to_xml(cell)
    escaped = cell.gsub('&', '&amp;')
    REXML::Document.new("#{CELL_PREFIX}#{escaped}#{CELL_SUFFIX}")
  end
408

409
  # Serializes a cell document back to cell markup by writing it out and
  # removing the first occurrence of CELL_PREFIX and of CELL_SUFFIX.
  #
  # @param doc [#write] document to serialize
  # @return [String] the cell markup
  def self.xml_to_cell(doc)
    serialized = +''
    doc.write(serialized)
    without_prefix = serialized.sub(CELL_PREFIX, '')
    without_prefix.sub(CELL_SUFFIX, '')
  end
414

415
  # Returns the text content of a cell with all element markup removed: the
  # cell's text nodes, joined in document order.
  #
  # @param cell [String] cell markup
  # @return [String]
  def self.cell_to_plaintext(cell)
    doc = cell_to_xml(cell)
    # select the text nodes directly; the previous block only held a leftover
    # debugging print
    REXML::XPath.match(doc, './/text()').join
  end
419

420
  # Lists the subjects linked from a cell: the target_title of every <link>
  # element, each followed by a newline.
  #
  # @param cell [String] cell markup
  # @return [String] newline-terminated titles, or "" when there are no links
  def self.cell_to_subject(cell)
    subjects = +''
    cell_to_xml(cell).elements.each('//link') do |link|
      subjects << link.attributes['target_title'] << "\n"
    end
    subjects
  end
430

431
  # Lists the categories of the articles linked from a cell. For every <link>
  # element that has a target_id, the article is loaded with Article.find and
  # the title of each of its categories is emitted, followed by a newline.
  # Links without a target_id are skipped.
  #
  # @param cell [String] cell markup
  # @return [String] newline-terminated category titles
  def self.cell_to_category(cell)
    categories = +''
    cell_to_xml(cell).elements.each('//link') do |link|
      article_id = link.attributes['target_id']
      next unless article_id

      Article.find(article_id).categories.each do |category|
        categories << category.title << "\n"
      end
    end
    categories
  end
446

447
  ##############################################
  # Code to rename links within the text.
  # This assumes that the name change has already
  # taken place within the article table in the DB
  ##############################################

  # Rewrites every link to +old_title+ in +source_text+ and, when the record
  # has that attribute, in +source_translation+. Matching ignores case and
  # treats any run of whitespace in the title as equivalent. A nil attribute
  # is left alone.
  #
  # @param old_title [String] title currently used in the links
  # @param new_title [String] replacement title; '' removes the link markup
  def rename_article_links(old_title, new_title)
    title_regex =
      Regexp.escape(old_title)
        .gsub('\\ ', ' ') # Regexp.escape converts ' ' to '\\ ' for some reason -- undo this
        .gsub(/\s+/, '\s+') # convert multiple whitespaces into 1+n space characters

    # guard against nil exactly as for the translation below
    unless source_text.nil?
      self.source_text = rename_link_in_text(source_text, title_regex, new_title)
    end

    # Articles don't have translations, but we still need to update pages.source_translation
    if has_attribute?(:source_translation) && !source_translation.nil?
      self.source_translation = rename_link_in_text(source_translation, title_regex, new_title)
    end
  end
465

466
  # Rewrites the links in +text+ whose title matches +title_regex+
  # (case-insensitively).
  #
  # With an empty +new_title+ the link is deleted and only its visible text
  # is kept: [[Old|shown]] becomes "shown" and [[Old]] becomes "Old".
  # Otherwise the title is swapped and the old wording is preserved as the
  # display text: [[Old|shown]] becomes [[New|shown]] and [[Old]] becomes
  # [[New|Old]].
  #
  # @param text [String]
  # @param title_regex [String] regex source matching the old title
  # @param new_title [String]
  # @return [String] a new string with the links rewritten
  def rename_link_in_text(text, title_regex, new_title)
    bare_link = /\[\[(#{title_regex})\]\]/i

    if new_title == ''
      # Link deleted, remove [[ ]] but keep the original title text

      # Handle links of the form [[Old Title|Display Text]] => Display Text
      text = text.gsub(/\[\[#{title_regex}\|([^\]]+)\]\]/i, '\1')
      # Handle links of the form [[Old Title]] => Old Title
      text = text.gsub(bare_link, '\1')
    else
      # Block replacements insert new_title literally; as a replacement
      # string, a title containing backslashes would be read as backreferences.

      # Replace the title part in [[Old Title|Display Text]]
      text = text.gsub(/\[\[#{title_regex}\|/i) { "[[#{new_title}|" }
      # Replace [[Old Title]] with [[New Title|Old Title]]
      text = text.gsub(bare_link) { "[[#{new_title}|#{Regexp.last_match(1)}]]" }
    end

    text
  end
483

484

485
  # Wraps every line of +text+ in pipe characters, because the Pandoc Pipe
  # Tables extension requires one at the beginning and end of each line.
  #
  # @param text [String] table text, one row per line
  # @return [String] the rows joined with newlines (no trailing newline)
  def pipe_tables_formatting(text)
    wrapped_rows = text.split("\n").map { |row| '|' + row + '|' }
    wrapped_rows.join("\n")
  end
490

491
  # Renders an XML table as a markdown-style text table: a header line, a
  # dashed separator line, then one line per <tr>, with cells right-aligned
  # to the width of their column and joined with " | ".
  #
  # In-cell <lb> elements are replaced by spaces in the given document before
  # rendering.
  # NOTE(review): every XPath here starts with "//", which looks like it
  # searches the whole document rather than only +table_element+ -- confirm
  # callers always pass an isolated table.
  #
  # @param table_element [#xpath] the table node
  # @param pandoc_format [Boolean] wrap each line in pipes for Pandoc
  # @param plaintext_export [Boolean] use the plain cell text instead of
  #   converting each cell with xml_to_pandoc_md
  # @return [String] the table text followed by two newlines
  def xml_table_to_markdown_table(table_element, pandoc_format = false, plaintext_export = false)
    text_table = +''

    # clean up in-cell line-breaks
    table_element.xpath('//lb').each { |line_break| line_break.replace(' ') }

    headers = table_element.xpath('//th')
    rows = table_element.xpath('//tr')

    # calculate the widths of each column based on max(header, cell[0...end])
    column_count = ([ headers.count ] + rows.map { |row| row.xpath('td').count }).max
    column_widths = {}
    1.upto(column_count) do |column_index|
      cells_in_column = table_element.xpath("//tr/td[position()=#{column_index}]")
      longest_cell = cells_in_column.map { |cell| cell.text.length }.max || 0
      heading = table_element.xpath("//th[position()=#{column_index}]").first
      heading_length = heading.nil? ? 0 : heading.text.length
      column_widths[column_index] = [ longest_cell, heading_length ].max
    end

    # print the header as markdown
    header_cells = headers.each_with_index.map do |header, i|
      header.text.rjust(column_widths[i + 1], ' ')
    end
    text_table << header_cells.join(' | ') << "\n"

    # print the separator
    dashes = (1..column_count).map { |column_index| '-' * column_widths[column_index] }
    text_table << dashes.join(' | ') << "\n"

    # print each row as markdown
    rows.each do |row_element|
      cells = row_element.xpath('td').map do |cell|
        # the cell's position within its row, taken from its node path
        position = cell.path.match(/.*td\[(\d+)\]/)
        width =
          if position
            column_widths[position[1].to_i] || 80 # 80 is the default for hand-coded tables
          else
            column_widths.values.first
          end

        content =
          if plaintext_export
            cell.text
          else
            xml_to_pandoc_md(cell.to_s, false, false, nil, false).gsub("\n", '')
          end
        content.rjust(width, ' ')
      end
      text_table << cells.join(' | ') << "\n"
    end

    text_table = pipe_tables_formatting(text_table) if pandoc_format

    "#{text_table}\n\n"
  end
542

543

544

545
  # Writes +msg+ to the logger at debug level, prefixed with "DEBUG: ".
  #
  # @param msg [#to_s]
  def debug(msg)
    entry = "DEBUG: #{msg}"
    logger.debug(entry)
  end
548
end
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc