• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

benwbrum / fromthepage / 13402333050

18 Feb 2025 11:48PM UTC coverage: 61.822% (+1.0%) from 60.846%
13402333050

push

github

web-flow
Merge pull request #4532 from benwbrum/4528-add-linebreak-after-table

4528 - Add linebreak after table

1543 of 2994 branches covered (51.54%)

Branch coverage included in aggregate %.

2 of 16 new or added lines in 1 file covered. (12.5%)

3 existing lines in 2 files now uncovered.

6994 of 10815 relevant lines covered (64.67%)

81.79 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

67.57
/app/models/xml_source_processor.rb
1
module XmlSourceProcessor
1✔
2

3
  # NOTE(review): these assignments execute in the module body, so they set
  # instance variables on the XmlSourceProcessor module object itself. Objects
  # that include the module do not receive these values; their @text_dirty /
  # @translation_dirty stay unset (nil) until one of the writers below runs.
  @text_dirty = false
  @translation_dirty = false
  #@fields = false
6

7
  # Writer hook for the transcription source.
  #
  # Flags the transcription as modified (process_source checks @text_dirty
  # before regenerating xml_text) and then delegates to the writer supplied
  # further up the ancestor chain.
  def source_text=(text)
    @text_dirty = true
    super(text)
  end
11

12
  # Writer hook for the translation source.
  #
  # Flags the translation as modified (process_source checks
  # @translation_dirty before regenerating xml_translation) and then delegates
  # to the writer supplied further up the ancestor chain.
  def source_translation=(translation)
    @translation_dirty = true
    super(translation)
  end
16

17
  # Validates the subject links in the transcription source.
  # Does nothing when source_text is blank.
  def validate_source
    return if source_text.blank?

    validate_links(source_text)
  end
23

24
  # Validates the subject links in the translation source.
  # Does nothing when source_translation is blank.
  def validate_source_translation
    return if source_translation.blank?

    validate_links(source_translation)
  end
30

31
  # Checks +text+ for malformed subject links ([[Title]] or [[Title|display]])
  # and records one error on errors[:base] per problem found:
  #
  # * a tag closed with three brackets (validation stops at the first one)
  # * a tag without a closing ]]
  # * a tag with nothing (or only whitespace) between the braces
  # * a single [ inside a tag that is never closed
  # * a piped tag whose subject or display text is blank
  #
  # text - String containing wiki markup.
  def validate_links(text)
    error_scope = [:activerecord, :errors, :models, :xml_source_processor]
    blank = /\A\s*\z/ # whole-string anchors: ^/$ would match any blank *line*
    report = lambda do |key, **options|
      errors.add(:base, I18n.t('subject_linking_error', scope: error_scope) + I18n.t(key, scope: error_scope, **options))
    end

    # split on all begin-braces
    tags = text.split('[[')
    debug("validate_source: tags to process are #{tags.inspect}")
    # Remove the initial string which occurs before the first tag. drop(1)
    # removes only that element; array subtraction would also remove every
    # later tag that happens to equal it.
    tags = tags.drop(1)
    debug("validate_source: massaged tags to process are #{tags.inspect}")

    tags.each do |tag|
      debug(tag)

      if tag.include?(']]]')
        report.call('tags_should_not_use_3_brackets')
        return
      end

      unless tag.include?(']]')
        tag = tag.strip
        report.call('wrong_number_of_closing_braces', tag: "\"[[#{tag}\"")
      end

      # Just pull the piece between the braces. The limit keeps the leading
      # empty field for "]]" and to_s covers an empty tag, so inner_tag is
      # always a String.
      inner_tag = tag.split(']]', 2).first.to_s
      report.call('blank_tag_in', tag: "\"[[#{tag}\"") if inner_tag.match?(blank)

      # check for unclosed single bracket
      if inner_tag.include?('[') && !inner_tag.include?(']')
        report.call('unclosed_bracket_within', tag: "\"#{inner_tag}\"")
      end

      # check for blank title or display name with pipes
      next unless inner_tag.include?('|')

      # -1 keeps trailing empty fields so "[[Title|]]" is seen as blank text
      tag_parts = inner_tag.split('|', -1)
      debug("validate_source: inner tag parts are #{tag_parts.inspect}")
      report.call('blank_subject_in', tag: "\"[[#{inner_tag}]]\"") if tag_parts[0].match?(blank)
      report.call('blank_text_in', tag: "\"[[#{inner_tag}]]\"") if tag_parts[1].match?(blank)
    end
  end
78

79
  ##############################################
80
  # All code to convert transcriptions from source
81
  # format to canonical xml format belongs here.
82
  ##############################################
83
  # Regenerates the canonical XML for whichever source fields were written
  # since the dirty flags were last unset: xml_text when @text_dirty is set,
  # xml_translation when @translation_dirty is set.
  def process_source
    self.xml_text = wiki_to_xml(self, Page::TEXT_TYPE::TRANSCRIPTION) if @text_dirty
    self.xml_translation = wiki_to_xml(self, Page::TEXT_TYPE::TRANSLATION) if @translation_dirty
  end
92

93
  # Converts one source field of +page+ from wiki markup to the XML string
  # stored alongside it.
  #
  # page      - object responding to collection, source_text and
  #             source_translation.
  # text_type - Page::TEXT_TYPE::TRANSCRIPTION or ::TRANSLATION; any other
  #             value is processed as an empty source.
  #
  # The text runs through the processing steps below in order. Subject-link
  # expansion is skipped when the page's collection has subjects disabled.
  # Returns the resulting XML String.
  def wiki_to_xml(page, text_type)
    subjects_disabled = page.collection.subjects_disabled

    source_text = case text_type
                  when Page::TEXT_TYPE::TRANSCRIPTION then page.source_text
                  when Page::TEXT_TYPE::TRANSLATION then page.source_translation
                  else ''
                  end

    # Work on a private copy (later steps modify the string in place). to_s
    # covers a nil source, for which String.new would raise TypeError.
    xml_string = String.new(source_text.to_s)
    xml_string = process_latex_snippets(xml_string)
    xml_string = clean_bad_braces(xml_string)
    xml_string = clean_script_tags(xml_string)
    xml_string = process_square_braces(xml_string) unless subjects_disabled
    xml_string = process_linewise_markup(xml_string)
    xml_string = process_line_breaks(xml_string)
    xml_string = valid_xml_from_source(xml_string)
    xml_string = update_links_and_xml(xml_string, false, text_type)
    xml_string = postprocess_xml_markup(xml_string)
    postprocess_sections
    xml_string
  end
119

120

121
  # Removes opening and closing script tags from +text+ to prevent javascript
  # injection. Only the tags are removed; whatever was between them stays in
  # the text. Matching is case-insensitive so <SCRIPT> is removed as well.
  #
  # Returns a new String.
  def clean_script_tags(text)
    text.gsub(/<\/?script.*?>/mi, '')
  end
126

127
  # A subject link opened with [[ but closed with }} instead of ]].
  BAD_SHIFT_REGEX = /\[\[([[[:alpha:]][[:blank:]]|,\(\)\-[[:digit:]]]+)\}\}/

  # Rewrites links matching BAD_SHIFT_REGEX so they close with ]].
  # Returns a new String.
  def clean_bad_braces(text)
    text.gsub(BAD_SHIFT_REGEX) { "[[#{Regexp.last_match(1)}]]" }
  end
131

132
  # A [[...]] subject link; may span several lines.
  BRACE_REGEX = /\[\[.*?\]\]/m

  # Replaces every [[Title]] / [[Title|display]] link in +text+ with
  # <link target_title="Title">display</link>, where the title is passed
  # through canonicalize_title and an unpiped link uses its title as the
  # display text.
  #
  # text is modified in place and also returned.
  def process_square_braces(text)
    # The block form inserts the replacement literally; a String replacement
    # would interpret backslash sequences (\0, \1, ...) found in user text.
    text.gsub!(BRACE_REGEX) do |wikilink|
      # strip braces
      munged = wikilink.sub('[[', '').sub(']]', '')

      # extract the title and display
      if munged.include?('|')
        title, verbatim = munged.split('|')
      else
        title = verbatim = munged
      end

      # to_s: "[[|]]" splits to an empty array, leaving title nil
      "<link target_title=\"#{canonicalize_title(title.to_s)}\">#{verbatim}</link>"
    end

    text
  end
159

160
  # Replaces every [[Title]] / [[Title|display]] link in +text+ with its plain
  # display text (the title itself when there is no pipe).
  #
  # text is modified in place and also returned.
  def remove_square_braces(text)
    # The block form inserts the replacement literally (no backslash
    # interpretation of user text).
    text.gsub!(BRACE_REGEX) do |link|
      # remove title; /m lets the title span a line break, as BRACE_REGEX does
      display = link.include?('|') ? link.sub(/\[\[.*?\|/m, '') : link
      display.sub('[[', '').sub(']]', '')
    end
    text
  end
175

176
  # A {{tex: ... :tex}} snippet; group 2 captures the LaTeX source.
  LATEX_SNIPPET = /(\{\{tex:?(.*?):?tex\}\})/m

  # Replaces each LaTeX snippet in +text+ with <texFigure position="N"/>
  # (N counts from 1 in order of appearance) and stores the snippet source in
  # the figure at the same index of tex_figures, creating a TexFigure where
  # none exists yet.
  #
  # Objects that do not respond to tex_figures get +text+ back untouched.
  # Otherwise text is modified in place and returned.
  def process_latex_snippets(text)
    return text unless respond_to?(:tex_figures)

    figures = tex_figures.to_a
    index = 0

    # Replace occurrence by occurrence. Keying replacements by snippet text
    # would merge identical snippets into one entry, leaving the later
    # occurrences unreplaced and the first with the wrong position.
    text.gsub!(LATEX_SNIPPET) do
      contents = Regexp.last_match(2)

      figure = figures[index] || TexFigure.new
      figure.source = contents unless figure.source == contents
      figures[index] = figure

      index += 1
      "<texFigure position=\"#{index}\"/>" # position attribute in acts as list starts with 1
    end

    self.tex_figures = figures
    text
  end
200

201
  HEADER = /\s\|\s/
  SEPARATOR = /---.*\|/
  ROW = HEADER

  # Processes +text+ line by line, converting section headings (through
  # process_any_sections) and pipe-delimited tables into markup.
  #
  # A line containing " | " while no table is open starts a table and becomes
  # its header; following lines are separators (ignored), rows, or - for
  # anything else - the end of the table. A non-blank line that ends a table
  # is kept in the output after the closing markup.
  #
  # Side effects: resets @sections and @tables. Tables that have at least one
  # row are appended to @tables once, as { header:, rows:, section: }.
  #
  # Returns the processed lines joined with single spaces.
  def process_linewise_markup(text)
    @tables = []
    @sections = []
    new_lines = []
    current_table = nil

    text.lines.each do |line|
      # first deal with any sections
      line = process_any_sections(line)

      if current_table.nil?
        if line.match?(HEADER)
          current_table = { header: [], rows: [], section: @sections.last }
          # chomp so the last header cell (and the role attribute derived from
          # it) does not carry the line terminator
          new_lines << table_header_markup(line.chomp, current_table)
        else
          # no current table, no table contents -- NO-OP
          new_lines << line
        end
      elsif line.match?(SEPARATOR)
        # separator line -- NO-OP
      elsif line.match?(ROW)
        new_lines.concat(table_row_markup(line, current_table))
      else
        # finished the last row
        close_table(current_table, new_lines)
        current_table = nil
        # keep the content of the line that ended the table
        new_lines << line unless line.strip.empty?
      end
    end

    # unclosed table at the end of the text
    close_table(current_table, new_lines) if current_table

    new_lines.join(' ')
  end

  # Fills table[:header] from a chomped header line and returns the opening
  # <table>/<thead> markup. Cells starting with "!" get class="bang".
  private def table_header_markup(line, table)
    cells = line.split(/\s*\|\s*/)
    cells.shift if line.match?(/^\s*\|/) # remove the empty cell before a leading pipe
    table[:header] = cells.map { |cell_title| cell_title.sub(/^!\s*/, '') }

    heading = cells.map do |cell|
      if cell.start_with?('!')
        "<th class=\"bang\">#{cell.sub(/^!\s*/, '')}</th>"
      else
        "<th>#{cell}</th>"
      end
    end.join(' ')

    "<table class=\"tabular\">\n<thead>\n<tr>#{heading}</tr></thead>"
  end

  # Records one row line in table[:rows] and returns the markup lines for it
  # (preceded by <tbody> for the first row of the table).
  private def table_row_markup(line, table)
    # remove leading and trailing delimiters
    clean_line = line.chomp.sub(/^\s*\|/, '').sub(/\|\s*$/, '')
    cells = clean_line.split(/\s*\|\s*/, -1) # -1 means "don't prune empty values at the end"
    table[:rows] << cells

    rowline = cells.each_with_index.map do |cell, i|
      "<td role=\"#{table[:header][i]}\">#{cell}</td> "
    end.join

    markup = []
    markup << '<tbody>' if table[:rows].size == 1
    markup << "<tr>#{rowline}</tr>"
  end

  # Appends the closing markup for +table+ to +new_lines+ and records the
  # table in @tables when it has at least one row.
  private def close_table(table, new_lines)
    unless table[:rows].empty? # only process tables with bodies
      @tables << table
      new_lines << '</tbody>'
    end
    new_lines << '</table><lb/>'
  end
279

280
  # Converts wiki headings of the form ==Title== (two to six "=" on each side)
  # found in +line+ into
  # <entryHeading title="..." depth="N" >Title</entryHeading>, trying the
  # deepest level first. Each heading with a non-empty title also appends a
  # Section (title, depth) to @sections.
  #
  # Returns the line with the headings replaced.
  def process_any_sections(line)
    6.downto(2) do |depth|
      line.scan(/(={#{depth}}([^=]+)={#{depth}})/).each do |whole, inner|
        wiki_title = inner.strip
        next if wiki_title.empty?

        verbatim = XmlSourceProcessor.cell_to_plaintext(wiki_title)
        safe_verbatim = verbatim.gsub(/"/, "&quot;")
        heading = "<entryHeading title=\"#{safe_verbatim}\" depth=\"#{depth}\" >#{wiki_title}</entryHeading>"
        # block form: the heading is inserted literally, so backslashes in the
        # title are not treated as back-references
        line = line.sub(whole) { heading }
        @sections << Section.new(:title => wiki_title, :depth => depth)
      end
    end

    line
  end
295

296
  # For every section in @sections, parses the section title as a cell and
  # adds a target_id attribute to each <link> whose target_title matches the
  # title of an article in the collection, then stores the re-serialized
  # title back on the section.
  def postprocess_sections
    @sections.each do |section|
      doc = XmlSourceProcessor.cell_to_xml(section.title)
      doc.elements.each("//link") do |link|
        article = collection.articles.where(:title => link.attributes['target_title']).first
        link.add_attribute('target_id', article.id.to_s) if article
      end
      section.title = XmlSourceProcessor.xml_to_cell(doc)
    end
  end
309

310

311
  # Normalizes a link title: removes tags, turns line breaks into spaces,
  # collapses whitespace runs to a single space and escapes double quotes as
  # &quot;. Returns a new String.
  def canonicalize_title(title)
    title
      .gsub(/<.*?>/, '')       # kill all tags
      .gsub(/\n/, ' ')         # linebreaks -> spaces
      .gsub(/\s+/, ' ')        # multiple spaces -> single spaces
      .gsub(/\"/, '&quot;')    # change double quotes to proper xml
  end
322

323
  # transformations converting source mode transcription to xml
324
  # Wraps +text+ in <p>...</p>, turns blank lines into paragraph boundaries
  # and every remaining line break into <lb/>. A word hyphenated directly
  # before a line break loses the hyphen and gets <lb break="no" /> instead.
  # The three line-ending styles are handled in the order \r\n, \n, \r.
  def process_line_breaks(text)
    rules = [
      [/\s*\n\s*\n\s*/, "</p><p>"],
      [/([[:word:]]+)-\r\n\s*/, '\1<lb break="no" />'],
      [/\r\n\s*/, "<lb/>"],
      [/([[:word:]]+)-\n\s*/, '\1<lb break="no" />'],
      [/\n\s*/, "<lb/>"],
      [/([[:word:]]+)-\r\s*/, '\1<lb break="no" />'],
      [/\r\s*/, "<lb/>"]
    ]

    rules.reduce("<p>#{text}</p>") do |result, (pattern, replacement)|
      result.gsub(pattern, replacement)
    end
  end
335

336
  # Wraps +source+ (nil is treated as empty) in an XML declaration and a
  # <page> element after making the text safe to parse:
  #
  # * every & becomes &amp; (an already escaped &amp; is left as it was)
  # * characters outside the XML 1.0 Char range are replaced by a space
  #
  # Returns the XML document as a String.
  def valid_xml_from_source(source)
    safe = (source || '').gsub(/&/, '&amp;')
    # undo the double escape produced above for input that already had &amp;
    safe.gsub!(/&amp;amp;/, '&amp;')
    # \u{...} is required for code points above U+FFFF: "\u10000" is read as
    # \u1000 followed by "0", which made the class reject every
    # supplementary-plane character
    safe.gsub!(/[^\u0009\u000A\u000D\u0020-\uD7FF\uE000-\uFFFD\u{10000}-\u{10FFFF}]/, ' ')

    <<EOF
    <?xml version="1.0" encoding="UTF-8"?>
      <page>
        #{safe}
      </page>
EOF
  end
349

350
  # Walks every <link> element of +xml_string+, makes sure an Article exists
  # for its title and rewrites the element with the resolved attributes.
  #
  # xml_string   - XML document as a String.
  # preview_mode - when true nothing is written: existing links are not
  #                cleared, new articles are not saved, no link rows are
  #                created and target_id / link_id are not added.
  # text_type    - passed through to clear_links and create_link.
  #
  # Returns the rewritten document serialized to a String.
  def update_links_and_xml(xml_string, preview_mode=false, text_type)
    # first clear out the existing links
    # log the count of articles before and after
    old_article_count = collection.articles.count
    logger.info("ISSUE4269 old_article_count = #{old_article_count}")
    clear_links(text_type) unless preview_mode
    processed = ""
    # process it
    doc = REXML::Document.new xml_string
    doc.elements.each("//link") do |element|
      # default the title to the text if it's not specified
      if !(title=element.attributes['target_title'])
        title = element.text
      end
      #display_text = element.text
      # display text is the serialized form of all child nodes, so nested
      # markup inside the link is kept
      display_text = ""
      element.children.each do |e|
        display_text += e.to_s
      end
      debug("link display_text = #{display_text}")
      #change the xml version of quotes back to double quotes for article title
      title = title.gsub('&quot;', '"')

      # create new blank articles if they don't exist already
      if !(article = collection.articles.where(:title => title).first)
        article = Article.new
        article.title = title
        article.collection = collection
        article.created_by_id = User.current_user.id if User.current_user.present?
        article.save! unless preview_mode
      end
      link_id = create_link(article, display_text, text_type) unless preview_mode
      # now update the attribute
      # build a fresh <link>, move the children over and swap it in for the
      # original element
      link_element = REXML::Element.new("link")
      element.children.each { |c| link_element.add(c) }
      link_element.add_attribute('target_title', title)
      debug("element="+link_element.inspect)
      debug("article="+article.inspect)
      link_element.add_attribute('target_id', article.id.to_s) unless preview_mode
      link_element.add_attribute('link_id', link_id.to_s) unless preview_mode
      element.replace_with(link_element)
    end
    # sanity check: this method only adds articles, so a lower count afterwards
    # is logged as an error
    new_article_count = collection.articles.count
    logger.info("ISSUE4269 new_article_count = #{new_article_count}")
    if new_article_count < old_article_count
      logger.error("ISSUE4269 ERROR new_article_count #{new_article_count} < old_article_count #{old_article_count}!")
    end
    doc.write(processed)
    return processed
  end
400

401

402
  # handle XML-dependent post-processing
403
  # Removes every <lb/> element that directly follows another <lb/> element
  # (no text or other node in between), so runs of line breaks collapse to
  # one.
  #
  # xml_string - XML document as a String.
  #
  # Returns the document serialized to a String.
  def postprocess_xml_markup(xml_string)
    doc = REXML::Document.new xml_string

    doc.elements.each("//lb") do |element|
      previous = element.previous_element
      # previous_sibling must itself be the element: text between two <lb/>
      # means they are not adjacent
      next unless previous && element.previous_sibling.node_type == :element && previous.name == 'lb'

      element.parent.elements.delete(element)
    end

    processed = ''
    doc.write(processed)
    processed
  end
415

416

417
  # Wrapper put around a cell's markup so it can be parsed as a document.
  CELL_PREFIX = "<?xml version='1.0' encoding='UTF-8'?><cell>"
  CELL_SUFFIX = '</cell>'

  # Parses the markup of a cell into a REXML::Document whose root is <cell>.
  # Every & in the cell is escaped to &amp; before parsing.
  def self.cell_to_xml(cell)
    REXML::Document.new(CELL_PREFIX + cell.gsub('&','&amp;') + CELL_SUFFIX)
  end

  # Serializes a document produced by cell_to_xml and removes the first
  # occurrence of the wrapper prefix and suffix from the output.
  def self.xml_to_cell(doc)
    text = ""
    doc.write(text)
    text.sub(CELL_PREFIX,'').sub(CELL_SUFFIX,'')
  end

  # Returns the text content of a cell with all element markup removed: the
  # text nodes of the parsed cell, concatenated in document order.
  def self.cell_to_plaintext(cell)
    # Collect the text nodes directly instead of iterating with a block that
    # printed each node through a leftover debugging `p`.
    REXML::XPath.match(cell_to_xml(cell), './/text()').join
  end
434

435
  # Returns the target_title of every <link> in the cell, one per line (each
  # title is followed by "\n"). Returns "" when the cell has no links.
  def self.cell_to_subject(cell)
    links = cell_to_xml(cell).elements.to_a("//link")
    links.each_with_object(+'') do |link, subjects|
      subjects << link.attributes['target_title']
      subjects << "\n"
    end
  end
445

446
  # Returns the titles of all categories of the articles linked from the
  # cell, one per line (each title is followed by "\n"). Links without a
  # target_id attribute are skipped; articles are looked up by that id.
  def self.cell_to_category(cell)
    categories = ""
    cell_to_xml(cell).elements.each("//link") do |link|
      id = link.attributes['target_id']
      next unless id

      Article.find(id).categories.each do |category|
        categories << category.title
        categories << "\n"
      end
    end
    categories
  end
461

462
  ##############################################
463
  # Code to rename links within the text.
464
  # This assumes that the name change has already
465
  # taken place within the article table in the DB
466
  ##############################################
467
  # Rewrites links to +old_title+ in source_text (and in source_translation,
  # when the including object has that attribute and it is not nil) so they
  # point to +new_title+. Any run of whitespace in the old title matches any
  # run of whitespace in the text.
  def rename_article_links(old_title, new_title)
    escaped = Regexp.escape(old_title)
    # Regexp.escape converts ' ' to '\\ ' -- undo this, then let every
    # whitespace run match 1+n whitespace characters
    title_regex = escaped.gsub('\\ ', ' ').gsub(/\s+/, '\s+')

    self.source_text = rename_link_in_text(source_text, title_regex, new_title)

    # Articles don't have translations, but we still need to update pages.source_translation
    return unless has_attribute?(:source_translation) && !source_translation.nil?

    self.source_translation = rename_link_in_text(source_translation, title_regex, new_title)
  end
480

481
  # Rewrites subject links in +text+ from an old title to +new_title+.
  #
  # text        - String of wiki markup.
  # title_regex - regular-expression source (String) matching the old title.
  # new_title   - replacement title, inserted literally.
  #
  # [[Old Title|Display]] becomes [[New Title|Display]];
  # [[Old Title]] becomes [[New Title|Old Title]] so the displayed text stays
  # the same. Returns a new String.
  def rename_link_in_text(text, title_regex, new_title)
    # Block form: a backslash in new_title must not be interpreted as a
    # back-reference, which is what a replacement String would do.

    # handle links of the format [[Old Title|Display Text]]
    text = text.gsub(/\[\[#{title_regex}\|/) { "[[#{new_title}|" }
    # handle links of the format [[Old Title]]
    text.gsub(/\[\[(#{title_regex})\]\]/) { "[[#{new_title}|#{Regexp.last_match(1)}]]" }
  end
489

490

491
  # Adds a pipe character to the beginning and end of each line of +text+,
  # as the Pandoc Pipe Tables extension requires. Lines are split on "\n" and
  # joined back with "\n".
  def pipe_tables_formatting(text)
    rows = text.split("\n")
    piped = rows.map { |row| '|' + row + '|' }
    piped.join("\n")
  end
496

497
  # Renders +table_element+ as a plain-text table: a header line, a dashed
  # separator line and one line per <tr>, with cells right-justified to the
  # column width and joined by " | ". The result ends with two extra newlines.
  #
  # table_element - node responding to xpath (presumably a Nokogiri node --
  #                 TODO confirm against callers). Its <lb> nodes are replaced
  #                 by spaces, i.e. the element is modified.
  # pandoc_format - when true the table is passed through
  #                 pipe_tables_formatting, which wraps each line in pipes.
  #
  # NOTE(review): the expressions starting with "//" are absolute XPath, which
  # selects from the whole document rather than from table_element only --
  # confirm callers pass a table that is alone in its document.
  def xml_table_to_markdown_table(table_element, pandoc_format=false)
    text_table = ""

    # clean up in-cell line-breaks
    table_element.xpath('//lb').each { |n| n.replace(' ')}

    # calculate the widths of each column based on max(header, cell[0...end])
    column_count = ([table_element.xpath("//th").count] + table_element.xpath('//tr').map{|e| e.xpath('td').count }).max
    column_widths = {}
    # column_widths is keyed by 1-based column position, matching XPath position()
    1.upto(column_count) do |column_index|
      longest_cell = (table_element.xpath("//tr/td[position()=#{column_index}]").map{|e| e.text().length}.max || 0)
      # heading_length is assigned the node here and immediately recomputed as
      # a length on the next line
      corresponding_heading = heading_length = table_element.xpath("//th[position()=#{column_index}]").first
      heading_length = corresponding_heading.nil? ? 0 : corresponding_heading.text().length
      column_widths[column_index] = [longest_cell, heading_length].max
    end

    # print the header as markdown
    cell_strings = []
    table_element.xpath("//th").each_with_index do |e,i|
      cell_strings << e.text.rjust(column_widths[i+1], ' ')
    end
    text_table << cell_strings.join(' | ') << "\n"

    # print the separator
    text_table << column_count.times.map{|i| ''.rjust(column_widths[i+1], '-')}.join(' | ') << "\n"

    # print each row as markdown
    table_element.xpath('//tr').each do |row_element|
      text_table << row_element.xpath('td').map do |e|
        width = 80 #default for hand-coded tables
        # the column position is read from the trailing td[N] index of the
        # node's path; when the path carries no index the first column's
        # width is used
        index = e.path.match(/.*td\[(\d+)\]/)
        if index
          width = column_widths[index[1].to_i] || 80
        else
          width = column_widths.values.first
        end
        e.text.rjust(width, ' ')
      end.join(' | ') << "\n"
    end
    if pandoc_format
      text_table = pipe_tables_formatting(text_table)
    end

    "#{text_table}\n\n"
  end
542

543

544

545
  # Writes +msg+ to the logger at debug level, prefixed with "DEBUG: ".
  def debug(msg)
    prefixed = "DEBUG: #{msg}"
    logger.debug(prefixed)
  end
548

549
end
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc