18107281180

Committed 29 Sep 2025 06:45PM UTC coverage: 65.024% (+0.2%) from 64.777%

Build # 18107281180

Build Type

push

github

Committed by

web-flow

Commit Message

Merge pull request #4937 from benwbrum/4901-extract-head-from-p-in-tei

Address #4901 by pulling head elements out of paragraph elements

Coverage Stats

1848 of 3359 branches covered (55.02%)

Branch coverage included in aggregate %.

30 of 32 new or added lines in 1 file covered. (93.75%)

30 existing lines in 2 files now uncovered.

8085 of 11917 relevant lines covered (67.84%)

109.59 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

81.59

/app/models/xml_source_processor.rb

module XmlSourceProcessor
  # Validates the subject links found in the transcription (source_text).
  # Nothing is validated when the text is blank, or when the collection is
  # field-based or has subjects disabled.
  def validate_source
    return if self.source_text.blank?

    # Skip subject linking validation for field-based collections
    # and collections with subjects disabled
    linking_skipped = self.collection&.field_based || self.collection&.subjects_disabled
    return if linking_skipped

    validate_links(self.source_text)
  end

  # Validates the subject links found in the translation (source_translation).
  # Nothing is validated when the translation is blank, or when the collection
  # is field-based or has subjects disabled.
  def validate_source_translation
    return if self.source_translation.blank?

    # Skip subject linking validation for field-based collections
    # and collections with subjects disabled
    linking_skipped = self.collection&.field_based || self.collection&.subjects_disabled
    return if linking_skipped

    validate_links(self.source_translation)
  end

  # Check the text for problems or typos with the subject links
  # (`[[Subject]]` or `[[Subject|display text]]`).
  #
  # Every problem found is recorded with errors.add(:base, ...):
  #   * a tag closed with three brackets (validation stops at the first one)
  #   * a tag without a closing `]]`
  #   * a blank tag
  #   * a single `[` inside a tag that is never closed
  #   * a blank subject or blank display text around a `|`
  #
  # text - the wiki source to check; callers in this module only pass non-blank text.
  def validate_links(text)
    error_scope = [ :activerecord, :errors, :models, :xml_source_processor ]
    report = lambda do |key, **details|
      errors.add(:base, I18n.t('subject_linking_error', scope: error_scope) + I18n.t(key, **details, scope: error_scope))
    end

    # split on all begin-braces
    tags = text.split('[[')
    debug("validate_source: tags to process are #{tags.inspect}")
    # Remove only the initial string which occurs before the first tag.
    # (Array difference would also discard every later chunk that happens to
    # equal it, silently skipping those tags.)
    tags = tags.drop(1)
    debug("validate_source: massaged tags to process are #{tags.inspect}")
    tags.each do |tag|
      debug(tag)

      if tag.include?(']]]')
        report.call('tags_should_not_use_3_brackets')
        return
      end
      unless tag.include?(']]')
        tag = tag.strip
        report.call('wrong_number_of_closing_braces', tag: '"[['+tag+'"')
      end

      # just pull the pieces between the braces; a tag that starts with the
      # closing braces (`[[]]`) has an empty inner part rather than nil
      inner_tag = tag.split(']]', 2).first.to_s
      if inner_tag =~ /^\s*$/
        report.call('blank_tag_in', tag: '"[['+tag+'"')
      end

      # check for unclosed single bracket
      if inner_tag.include?('[') && !inner_tag.include?(']')
        report.call('unclosed_bracket_within', tag: '"'+inner_tag+'"')
      end

      # check for blank title or display name with pipes
      next unless inner_tag.include?('|')

      # limit -1 keeps a trailing empty part so `[[Subject|]]` is detected
      tag_parts = inner_tag.split('|', -1)
      debug("validate_source: inner tag parts are #{tag_parts.inspect}")
      if tag_parts[0] =~ /^\s*$/
        report.call('blank_subject_in', tag: '"[['+inner_tag+']]"')
      end
      if tag_parts[1] =~ /^\s*$/
        report.call('blank_text_in', tag: '"[['+inner_tag+']]"')
      end
    end
    #    return errors.size > 0
  end

# Writer for source_text: calls source_text_will_change! before delegating to
# the inherited writer via super, so that source_text_changed? (consulted by
# process_source) sees this assignment.
# NOTE(review): presumably this forces the attribute dirty even when the new
# value equals the old one -- confirm against the dirty-tracking API in use.
def source_text=(text)
    self.source_text_will_change!
    super
end

# Writer for source_translation: calls source_translation_will_change! before
# delegating to the inherited writer via super, so that
# source_translation_changed? (consulted by process_source) sees this assignment.
def source_translation=(text)
  self.source_translation_will_change!
  super
end

  ##############################################
  # All code to convert transcriptions from source
  # format to canonical xml format belongs here.
  ##############################################
  # Regenerates the canonical XML for whichever source fields have changed:
  # xml_text from the transcription, and xml_translation from the translation
  # (only for objects that respond to source_translation).
  def process_source
    self.xml_text = wiki_to_xml(self, Page::TEXT_TYPE::TRANSCRIPTION) if source_text_changed?

    translation_dirty = self.respond_to?(:source_translation) && source_translation_changed?
    self.xml_translation = wiki_to_xml(self, Page::TEXT_TYPE::TRANSLATION) if translation_dirty
  end

  # Converts a page's wiki-style source into the XML string stored for it.
  #
  # page      - object providing collection, source_text and source_translation
  # text_type - Page::TEXT_TYPE::TRANSCRIPTION or Page::TEXT_TYPE::TRANSLATION;
  #             any other value converts an empty string
  #
  # Returns the processed XML string. The steps below run in a fixed order;
  # each one consumes the output of the previous one.
  def wiki_to_xml(page, text_type)
    subjects_disabled = page.collection.subjects_disabled

    # pick the field to convert
    source_text = case text_type
    when Page::TEXT_TYPE::TRANSCRIPTION
                    page.source_text
    when Page::TEXT_TYPE::TRANSLATION
                    page.source_translation
    else
                    ''
    end

    # work on a copy: several of the steps below modify their argument in place
    xml_string = String.new(source_text)
    xml_string = process_latex_snippets(xml_string)
    xml_string = clean_bad_braces(xml_string)
    xml_string = clean_script_tags(xml_string)
    # [[...]] links are only turned into <link> elements when subjects are enabled
    xml_string = process_square_braces(xml_string) unless subjects_disabled
    xml_string = process_linewise_markup(xml_string)
    xml_string = process_line_breaks(xml_string)
    xml_string = valid_xml_from_source(xml_string)
    xml_string = update_links_and_xml(xml_string, false, text_type)
    xml_string = postprocess_xml_markup(xml_string)
    # operates on @sections (filled by process_linewise_markup); its return value is unused
    postprocess_sections
    xml_string
  end


  # Remove script tags from HTML to prevent javascript injection.
  #
  # Only the opening and closing tags themselves are deleted; whatever sat
  # between them stays in the text. Matching is case-insensitive so that
  # variants such as <SCRIPT> or <Script> are removed as well.
  #
  # text - the source string
  #
  # Returns a new string; +text+ itself is not modified.
  def clean_script_tags(text)
    # text.gsub(/<script.*?<\/script>/m, '')
    text.gsub(/<\/?script.*?>/mi, '')
  end

  # A link opened with `[[` but closed with `}}`, whose contents consist only of
  # letters, blanks, digits and the characters | , ( ) -
  BAD_SHIFT_REGEX = /\[\[([[[:alpha:]][[:blank:]]|,\(\)\-[[:digit:]]]+)\}\}/

  # Rewrites every `[[...}}` matched by BAD_SHIFT_REGEX so that it is closed
  # with `]]` instead. Returns a new string.
  def clean_bad_braces(text)
    text.gsub(BAD_SHIFT_REGEX) { "[[#{Regexp.last_match(1)}]]" }
  end

  # A complete wiki link: `[[` up to the nearest `]]`, possibly spanning lines.
  BRACE_REGEX = /\[\[.*?\]\]/m

  # Converts every `[[Title]]` / `[[Title|display text]]` link in +text+ into
  # `<link target_title="Title">display text</link>`.
  #
  # The title goes through canonicalize_title; the display text is inserted
  # verbatim. Substitution happens in a single pass using the block form of
  # gsub, so backslash sequences inside the display text are kept literally
  # instead of being interpreted as back-references.
  #
  # text - the source string; it is updated in place when it contains links
  #
  # Returns +text+.
  def process_square_braces(text)
    converted = text.gsub(BRACE_REGEX) do |wikilink|
      # strip braces
      munged = wikilink.sub('[[', '').sub(']]', '')

      # extract the title and display
      if munged.include?('|')
        # limit -1 keeps empty parts, so `[[|]]` gives an empty title, not nil
        parts = munged.split('|', -1)
        title = parts[0]
        verbatim = parts[1]
      else
        title = munged
        verbatim = munged
      end

      "<link target_title=\"#{canonicalize_title(title)}\">#{verbatim}</link>"
    end

    # keep the in-place update callers of the previous implementation got
    text.replace(converted) unless converted == text
    text
  end

  # Replaces every `[[Title]]` / `[[Title|display text]]` link in +text+ with
  # its plain display text (the title itself when there is no `|`).
  #
  # Substitution happens in a single pass using the block form of gsub, so
  # backslash sequences inside the display text are kept literally instead of
  # being interpreted as back-references.
  #
  # text - the source string; it is updated in place when it contains links
  #
  # Returns +text+.
  def remove_square_braces(text)
    stripped = text.gsub(BRACE_REGEX) do |link|
      # remove title
      shown = link.include?('|') ? link.sub(/\[\[.*?\|/, '') : link
      shown.sub('[[', '').sub(']]', '')
    end

    # keep the in-place update callers of the previous implementation got
    text.replace(stripped) unless stripped == text
    text
  end

  # A LaTeX snippet: `{{tex: ... :tex}}` (both colons optional), possibly
  # spanning lines. Group 1 is the whole snippet, group 2 its contents.
  LATEX_SNIPPET = /(\{\{tex:?(.*?):?tex\}\})/m

  # Replaces each LaTeX snippet in +text+ with `<texFigure position="N"/>`
  # (N counts from 1) and stores the snippet contents as the source of the
  # N-th entry of tex_figures, creating a TexFigure where none exists yet.
  #
  # Snippets are numbered by occurrence, so two snippets with identical
  # contents each get their own placeholder and figure.
  #
  # Objects that do not respond to tex_figures get +text+ back unchanged.
  #
  # text - the source string; it is updated in place when it contains snippets
  #
  # Returns +text+.
  def process_latex_snippets(text)
    return text unless self.respond_to? :tex_figures

    figures = self.tex_figures.to_a
    position = 0

    converted = text.gsub(LATEX_SNIPPET) do
      contents = Regexp.last_match(2)

      figure = figures[position] || TexFigure.new
      figure.source = contents unless figure.source == contents
      figures[position] = figure

      position += 1 # position attribute in acts as list starts with 1
      "<texFigure position=\"#{position}\"/>"
    end

    self.tex_figures = figures

    # keep the in-place update callers of the previous implementation got
    text.replace(converted) unless converted == text
    text
  end

  # A table header or row: a pipe with whitespace on both sides.
  HEADER = /\s\|\s/
  # A header/body separator line: dashes followed somewhere by a pipe.
  SEPARATOR = /---.*\|/
  ROW = HEADER

  # Processes +text+ line by line, converting section headings (through
  # process_any_sections) and pipe-delimited tables into markup.
  #
  # A line matching HEADER opens a table; following lines are separators
  # (ignored), rows, or -- anything else -- the end of the table.
  # NOTE(review): the line that ends a table is not copied to the output;
  # confirm that dropping it is intended when it is not blank.
  #
  # Side effects: resets @sections and @tables. @tables receives one hash
  # ({ header:, rows:, section: }) per table that has at least one body row,
  # whether the table was closed by a following line or ran to the end of
  # the text. Each such table is recorded exactly once.
  #
  # Returns the processed lines joined with a single space.
  def process_linewise_markup(text)
    @tables = []
    @sections = []
    new_lines = []
    current_table = nil
    text.lines.each do |line|
      # first deal with any sections
      line = process_any_sections(line)
      if !current_table
        # look for a header
        if line.match(HEADER)
          line = line.chomp
          current_table = { header: [], rows: [], section: @sections.last }
          # fill the header
          cells = line.split(/\s*\|\s*/)
          cells.shift if line.match(/^\|/) # remove leading pipe

          # trim whitespace from each header cell
          cells = cells.map(&:strip)

          current_table[:header] = cells.map { |cell_title| cell_title.sub(/^!\s*/, '') }
          heading = cells.map do |cell|
            if cell.match(/^!/)
              "<th class=\"bang\">#{cell.sub(/^!\s*/, '')}</th>"
            else
              "<th>#{cell}</th>"
            end
          end.join(' ')
          new_lines << "<table class=\"tabular\">\n<thead>\n<tr>#{heading}</tr></thead>"
        else
          # no current table, no table contents -- NO-OP
          new_lines << line
        end
      elsif line.match(SEPARATOR)
        # separator line inside a table -- NO-OP
      elsif line.match(ROW)
        # handle initial blank cells - if line starts with whitespace followed by pipe, preserve empty cell
        line_chomp = line.chomp
        has_initial_empty_cell = line_chomp.match(/^\s+\|/)

        # remove leading and trailing delimiters
        clean_line = line_chomp.sub(/^\s*\|/, '').sub(/\|\s*$/, '')
        # fill the row
        cells = clean_line.split(/\s*\|\s*/, -1) # -1 means "don't prune empty values at the end"

        # trim whitespace from each cell
        cells = cells.map(&:strip)

        # if there was initial whitespace before pipe, add empty cell at beginning
        cells.unshift('') if has_initial_empty_cell
        current_table[:rows] << cells
        rowline = cells.map { |cell| "<td>#{cell}</td> " }.join

        new_lines << '<tbody>' if current_table[:rows].size == 1
        new_lines << "<tr>#{rowline}</tr>"
      else
        # finished the last row
        unless current_table[:rows].empty? # only process tables with bodies
          @tables << current_table
          new_lines << '</tbody>'
        end
        new_lines << '</table><lb/>'
        current_table = nil
      end
    end

    if current_table
      # unclosed table: treat the end of the text like the end of the table,
      # recording it once and only when it has a body
      unless current_table[:rows].empty? # only process tables with bodies
        @tables << current_table
        new_lines << '</tbody>'
      end
      new_lines << '</table><lb/>'
    end
    # do something with the table data
    new_lines.join(' ')
  end

  # Converts wiki headings (`== Title ==` through `====== Title ======`) found
  # in +line+ into <entryHeading> elements, trying the deepest level first.
  # Each converted heading is also appended to @sections as a Section.
  #
  # Returns the (possibly rewritten) line.
  def process_any_sections(line)
    6.downto(2) do |depth|
      heading_pattern = /(={#{depth}}([^=]+)={#{depth}})/
      line.scan(heading_pattern).each do |whole_match, inner|
        wiki_title = inner.strip
        next unless wiki_title.length > 0

        # the title attribute holds the plain text of the heading, with quotes escaped
        plain_title = XmlSourceProcessor.cell_to_plaintext(wiki_title).gsub(/"/, '&quot;')
        heading = "<entryHeading title=\"#{plain_title}\" depth=\"#{depth}\" >#{wiki_title}</entryHeading>"
        line = line.sub(whole_match, heading)
        @sections << Section.new(title: wiki_title, depth: depth)
      end
    end

    line
  end

  # For every section collected in @sections, looks up the article named by
  # each <link> in the section title and, when one exists in the collection,
  # records its id on the link as target_id. The section title is then
  # replaced with the re-serialised markup.
  def postprocess_sections
    @sections.each do |section|
      doc = XmlSourceProcessor.cell_to_xml(section.title)
      doc.elements.each('//link') do |link|
        article = collection.articles.where(title: link.attributes['target_title']).first
        link.add_attribute('target_id', article.id.to_s) if article
      end
      section.title = XmlSourceProcessor.xml_to_cell(doc)
    end
  end


  # Normalises a link title: tags are removed, line breaks and runs of
  # whitespace become a single space, and double quotes become &quot;.
  # Returns a new string.
  def canonicalize_title(title)
    title
      .gsub(/<.*?>/, '')      # kill all tags
      .gsub(/\n/, ' ')        # linebreaks -> spaces
      .gsub(/\s+/, ' ')       # multiple spaces -> single spaces
      .gsub(/\"/, '&quot;')   # change double quotes to proper xml
  end

  # Transformations converting source mode transcription to xml.
  #
  # Wraps +text+ in <p>...</p>, turns blank-line gaps into paragraph
  # boundaries, and turns the remaining line endings (CRLF, then LF, then
  # bare CR) into <lb/>. A word hyphenated across a line ending loses the
  # hyphen and gets <lb break="no" /> instead.
  def process_line_breaks(text)
    paragraphs = "<p>#{text}</p>".gsub(/\s*\n\s*\n\s*/, '</p><p>')

    # applied in order; for each line-ending style the hyphenated form goes first
    line_break_rules = [
      [ /([[:word:]]+)-\r\n\s*/, '\1<lb break="no" />' ],
      [ /\r\n\s*/, '<lb/>' ],
      [ /([[:word:]]+)-\n\s*/, '\1<lb break="no" />' ],
      [ /\n\s*/, '<lb/>' ],
      [ /([[:word:]]+)-\r\s*/, '\1<lb break="no" />' ],
      [ /\r\s*/, '<lb/>' ]
    ]

    line_break_rules.reduce(paragraphs) do |result, (pattern, replacement)|
      result.gsub(pattern, replacement)
    end
  end

  # Wraps +source+ in an XML declaration and a <page> element after making
  # it safe to parse:
  #   * every `&` becomes `&amp;` (an `&amp;` already in the source is left
  #     as a single `&amp;`)
  #   * every character outside the ranges XML 1.0 allows is replaced with a
  #     space. Characters above U+FFFF (U+10000..U+10FFFF) are allowed; they
  #     have to be written as \u{...} because \u takes exactly four hex digits.
  #
  # source - the text to wrap; nil is treated as an empty string
  #
  # Returns the XML document as a string.
  def valid_xml_from_source(source)
    source = source || ''
    safe = source.gsub(/\&/, '&amp;')
    safe.gsub!(/\&amp;amp;/, '&amp;')
    safe.gsub!(/[^\u0009\u000A\u000D\u0020-\uD7FF\uE000-\uFFFD\u{10000}-\u{10FFFF}]/, ' ')

    <<EOF
    <?xml version="1.0" encoding="UTF-8"?>
      <page>
        #{safe}
      </page>
EOF
  end

  # Resolves every <link> element in +xml_string+ against the collection's
  # articles and rewrites the element with the resolved title and ids.
  #
  # xml_string   - XML produced by the earlier conversion steps
  # preview_mode - when true nothing is persisted: existing links are not
  #                cleared, new articles are not saved, no link records are
  #                created and no target_id/link_id attributes are written
  # text_type    - passed through to clear_links and create_link
  #
  # Returns the rewritten XML as a string.
  def update_links_and_xml(xml_string, preview_mode = false, text_type)
    # first clear out the existing links
    # log the count of articles before and after
    clear_links(text_type) unless preview_mode

    candidate_articles = collection.articles.left_joins(:article_versions)
    # cut-off for the fallback lookup by article-version title below
    page_update_timestamp = 1.hour.ago

    processed = ''
    # process it
    doc = REXML::Document.new xml_string
    doc.elements.each('//link') do |element|
      # default the title to the text if it's not specified
      if !(title = element.attributes['target_title'])
        title = element.text
      end
      # display_text = element.text
      # the display text is the serialised form of all child nodes, markup included
      display_text = ''
      element.children.each do |e|
        display_text += e.to_s
      end
      debug("link display_text = #{display_text}")
      # change the xml version of quotes back to double quotes for article title
      title = title.gsub('&quot;', '"')

      article = candidate_articles.find_by(title: title)

      if article.nil?
        # no article has this title now: look for an article with a version
        # carrying this title that was created within the last hour, and if
        # found use the article's current title for both title and display text
        article = candidate_articles.where('article_versions.title': title)
                                    .where('article_versions.created_on > ?', page_update_timestamp)
                                    .first
        if article.present?
          display_text = article.title
          title = article.title
        end
      end

      # create new blank articles if they don't exist already
      if article.nil?
        article = Article.new
        article.title = title
        article.collection = collection
        article.created_by_id = Current.user.id if Current.user.present?
        article.save! unless preview_mode
      end

      link_id = create_link(article, display_text, text_type) unless preview_mode
      # now update the attribute
      # build a fresh <link>, move the children across, then swap it in for the original
      link_element = REXML::Element.new('link')
      element.children.each { |c| link_element.add(c) }
      link_element.add_attribute('target_title', title)
      debug('element='+link_element.inspect)
      debug('article='+article.inspect)
      link_element.add_attribute('target_id', article.id.to_s) unless preview_mode
      link_element.add_attribute('link_id', link_id.to_s) unless preview_mode
      element.replace_with(link_element)
    end
    doc.write(processed)
    processed
  end

  # Handle XML-dependent post-processing.
  #
  # Removes every <lb> element whose immediately preceding sibling node is
  # itself an <lb> element, so a run of adjacent line breaks collapses to the
  # first one. Line breaks separated by text are left alone.
  #
  # xml_string - the XML document as a string
  #
  # Returns the re-serialised document as a string.
  def postprocess_xml_markup(xml_string)
    doc = REXML::Document.new xml_string
    doc.elements.each('//lb') do |element|
      previous = element.previous_element
      # previous_element skips over text nodes, so also require that the
      # node directly before this one is an element
      next unless previous && element.previous_sibling.node_type == :element && previous.name == 'lb'

      element.parent.elements.delete(element)
    end

    processed = +''
    doc.write(processed)
    processed
  end


  # Wrapper placed around a cell's markup so it can be parsed as a document.
  CELL_PREFIX = "<?xml version='1.0' encoding='UTF-8'?><cell>"
  CELL_SUFFIX = '</cell>'

  # Parses the markup of a single cell into a REXML document rooted at <cell>.
  # Every `&` in +cell+ is escaped to `&amp;` before parsing.
  def self.cell_to_xml(cell)
    escaped = cell.gsub('&', '&amp;')
    REXML::Document.new("#{CELL_PREFIX}#{escaped}#{CELL_SUFFIX}")
  end

  # Serialises +doc+ and removes the first CELL_PREFIX and the first
  # CELL_SUFFIX from the result, leaving the cell's own markup.
  def self.xml_to_cell(doc)
    serialized = ''
    doc.write(serialized)
    without_prefix = serialized.sub(CELL_PREFIX, '')
    without_prefix.sub(CELL_SUFFIX, '')
  end

  # Parses +cell+ with cell_to_xml and returns the result of
  # each_element('.//text()') joined into one string.
  # NOTE(review): presumably this yields the text content of the cell with
  # the markup removed -- confirm against REXML's each_element behaviour.
  # NOTE(review): the block calls `p`, which prints to standard output; this
  # looks like leftover debugging -- confirm whether the block is ever invoked.
  def self.cell_to_plaintext(cell)
    doc = cell_to_xml(cell)
    doc.each_element('.//text()') { |e| p e.text }.join
  end

  # Lists the target_title of every <link> in +cell+, one per line
  # (each followed by a newline). Returns '' when the cell has no links.
  def self.cell_to_subject(cell)
    titles = ''
    cell_to_xml(cell).elements.each('//link') do |link|
      titles << link.attributes['target_title'] << "\n"
    end
    titles
  end

  # Lists the category titles of every article linked from +cell+, one per
  # line (each followed by a newline). Links without a target_id are skipped;
  # linked articles are loaded with Article.find.
  def self.cell_to_category(cell)
    names = ''
    cell_to_xml(cell).elements.each('//link') do |link|
      article_id = link.attributes['target_id']
      next unless article_id

      Article.find(article_id).categories.each do |category|
        names << category.title
        names << "\n"
      end
    end
    names
  end

  ##############################################
  # Code to rename links within the text.
  # This assumes that the name change has already
  # taken place within the article table in the DB
  ##############################################
  # Rewrites the links to +old_title+ in source_text (and in
  # source_translation, when the record has that attribute and it is not nil)
  # so that they point to +new_title+.
  def rename_article_links(old_title, new_title)
    escaped_title = Regexp.escape(old_title)
    # Regexp.escape converts ' ' to '\\ ' for some reason -- undo this
    unescaped_spaces = escaped_title.gsub('\\ ', ' ')
    # convert multiple whitespaces into 1+n space characters
    title_regex = unescaped_spaces.gsub(/\s+/, '\s+')

    self.source_text = rename_link_in_text(source_text, title_regex, new_title)

    # Articles don't have translations, but we still need to update pages.source_translation
    return unless has_attribute?(:source_translation) && !source_translation.nil?

    self.source_translation = rename_link_in_text(source_translation, title_regex, new_title)
  end

  # Rewrites the wiki links in +text+ that point at an article being renamed
  # or deleted. Matching is case-insensitive.
  #
  # text        - the wiki source; nil is returned unchanged
  # title_regex - regular-expression source matching the old title
  # new_title   - the new title, or '' when the link is being deleted
  #
  # With an empty +new_title+ the brackets are removed and the display text
  # (or the old title, for links without one) is kept. Otherwise
  # [[Old|Text]] becomes [[New|Text]] and [[Old]] becomes [[New|Old]].
  #
  # The new title is inserted through the block form of gsub, so backslashes
  # in it are kept literally instead of being read as back-references.
  #
  # Returns the rewritten text.
  def rename_link_in_text(text, title_regex, new_title)
    return text if text.nil?

    bare_link = /\[\[(#{title_regex})\]\]/i

    if new_title == ''
      # Link deleted, remove [[ ]] but keep the original title text

      # Handle links of the form [[Old Title|Display Text]] => Display Text
      text = text.gsub(/\[\[#{title_regex}\|([^\]]+)\]\]/i, '\1')
      # Handle links of the form [[Old Title]] => Old Title
      text = text.gsub(bare_link, '\1')
    else
      # Replace the title part in [[Old Title|Display Text]]
      text = text.gsub(/\[\[#{title_regex}\|/i) { "[[#{new_title}|" }
      # Replace [[Old Title]] with [[New Title|Old Title]]
      text = text.gsub(bare_link) { "[[#{new_title}|#{Regexp.last_match(1)}]]" }
    end

    text
  end


  # Pandoc's Pipe Tables extension requires a pipe character at the beginning
  # and end of each line, so wrap every line of +text+ in pipes.
  def pipe_tables_formatting(text)
    piped_rows = text.split("\n").map { |row| '|' + row + '|' }
    piped_rows.join("\n")
  end

  # Renders an XML table as a markdown-style text table: a header line, a
  # dashed separator line, then one line per <tr>, with cells right-justified
  # to the column width and joined with ' | '.
  #
  # table_element    - node responding to xpath (its matches respond to text,
  #                    path, replace); presumably a Nokogiri node -- confirm
  # pandoc_format    - when true, every line is additionally wrapped in pipes
  #                    (see pipe_tables_formatting)
  # plaintext_export - when true, cells are emitted as their plain text;
  #                    otherwise each cell goes through xml_to_pandoc_md
  #
  # Returns the table text followed by two newlines.
  #
  # NOTE(review): the XPath expressions below start with `//`; confirm whether
  # they are meant to search beyond +table_element+ itself.
  def xml_table_to_markdown_table(table_element, pandoc_format = false, plaintext_export = false)
    text_table = ''

    # clean up in-cell line-breaks
    table_element.xpath('//lb').each { |n| n.replace(' ') }

    # Sanitize single quotes with backticks
    # table_element.xpath('//*').each { |n| n.content.gsub("'", '`') }

    # calculate the widths of each column based on max(header, cell[0...end])
    column_count = ([ table_element.xpath('//th').count ] + table_element.xpath('//tr').map { |e| e.xpath('td').count }).max
    column_widths = {}
    1.upto(column_count) do |column_index|
      longest_cell = (table_element.xpath("//tr/td[position()=#{column_index}]").map { |e| e.text().length }.max || 0)
      # heading_length is assigned the node here only as part of the chained
      # assignment; the next line overwrites it with the actual length
      corresponding_heading = heading_length = table_element.xpath("//th[position()=#{column_index}]").first
      heading_length = corresponding_heading.nil? ? 0 : corresponding_heading.text().length
      column_widths[column_index] = [ longest_cell, heading_length ].max
    end

    # print the header as markdown
    cell_strings = []
    table_element.xpath('//th').each_with_index do |e, i|
      cell_strings << e.text.rjust(column_widths[i+1], ' ')
    end
    text_table << cell_strings.join(' | ') << "\n"

    # print the separator
    text_table << column_count.times.map { |i| ''.rjust(column_widths[i+1], '-') }.join(' | ') << "\n"

    # print each row as markdown
    table_element.xpath('//tr').each do |row_element|
      text_table << row_element.xpath('td').map do |e|
        width = 80 # default for hand-coded tables
        # the column number is read from the trailing td[N] index of the node's path
        index = e.path.match(/.*td\[(\d+)\]/)
        if index
          width = column_widths[index[1].to_i] || 80
        else
          # no index in the path: fall back to the first column's width
          width = column_widths.values.first
        end

        if plaintext_export
          e.text.rjust(width, ' ')
        else
          inner_html = xml_to_pandoc_md(e.to_s.gsub("'", '&#39;'), false, false, nil, false).gsub("\n", '')
          inner_html.rjust(width, ' ')
        end
      end.join(' | ') << "\n"
    end
    if pandoc_format
      text_table = pipe_tables_formatting(text_table)
    end

    "#{text_table}\n\n"
  end



  # Writes +msg+ to the logger at debug level, prefixed with "DEBUG: ".
  def debug(msg)
    logger.debug("DEBUG: #{msg}")
  end
end

1	module XmlSourceProcessor	1✔
2	def validate_source	1✔
3	if self.source_text.blank?	3,682✔
4	return	3,538✔
5	end
6	# Skip subject linking validation for field-based collections
7	# and collections with subjects disabled
8	if self.collection&.field_based \|\| self.collection&.subjects_disabled	144✔
9	return	8✔
10	end
11	validate_links(self.source_text)	136✔
12	end
13
14	def validate_source_translation	1✔
15	if self.source_translation.blank?	3,679✔
16	return	3,628✔
17	end
18	# Skip subject linking validation for field-based collections
19	# and collections with subjects disabled
20	if self.collection&.field_based \|\| self.collection&.subjects_disabled	51!
21	return	1✔
22	end
23	validate_links(self.source_translation)	50✔
24	end
25
26	# check the text for problems or typos with the subject links
27	def validate_links(text)	1✔
28	error_scope = [ :activerecord, :errors, :models, :xml_source_processor ]	186✔
29	# split on all begin-braces
30	tags = text.split('[[')	186✔
31	# remove the initial string which occurs before the first tag
32	debug("validate_source: tags to process are #{tags.inspect}")	186✔
33	tags = tags - [ tags[0] ]	186✔
34	debug("validate_source: massaged tags to process are #{tags.inspect}")	186✔
35	for tag in tags	186✔
36	debug(tag)	114✔
37
38	if tag.include?(']]]')	114✔
39	errors.add(:base, I18n.t('subject_linking_error', scope: error_scope) + I18n.t('tags_should_not_use_3_brackets', scope: error_scope))	1✔
40	return	1✔
41	end
42	unless tag.include?(']]')	113✔
43	tag = tag.strip	3✔
44	errors.add(:base, I18n.t('subject_linking_error', scope: error_scope) + I18n.t('wrong_number_of_closing_braces', tag: '"[['+tag+'"', scope: error_scope))	3✔
45	end
46
47	# just pull the pieces between the braces
48	inner_tag = tag.split(']]')[0]	113✔
49	if inner_tag =~ /^\s*$/	113✔
50	errors.add(:base, I18n.t('subject_linking_error', scope: error_scope) + I18n.t('blank_tag_in', tag: '"[['+tag+'"', scope: error_scope))	1✔
51	end
52
53	# check for unclosed single bracket
54	if inner_tag.include?('[')	113✔
55	unless inner_tag.include?(']')	1!
56	errors.add(:base, I18n.t('subject_linking_error', scope: error_scope) + I18n.t('unclosed_bracket_within', tag: '"'+inner_tag+'"', scope: error_scope))	1✔
57	end
58	end
59	# check for blank title or display name with pipes
60	if inner_tag.include?('\|')	113✔
61	tag_parts = inner_tag.split('\|')	10✔
62	debug("validate_source: inner tag parts are #{tag_parts.inspect}")	10✔
63	if tag_parts[0] =~ /^\s*$/	10✔
64	errors.add(:base, I18n.t('subject_linking_error', scope: error_scope) + I18n.t('blank_subject_in', tag: '"[['+inner_tag+']]"', scope: error_scope))	1✔
65	end
66	if tag_parts[1] =~ /^\s*$/	10✔
67	errors.add(:base, I18n.t('subject_linking_error', scope: error_scope) + I18n.t('blank_text_in', tag: '"[['+inner_tag+']]"', scope: error_scope))	1✔
68	end
69	end
70	end
71	# return errors.size > 0
72	end
73
74	def source_text=(text)	1✔
75	self.source_text_will_change!	179✔
76	super	179✔
77	end
78
79	def source_translation=(text)	1✔
80	self.source_translation_will_change!	55✔
81	super	55✔
82	end
83
84	##############################################
85	# All code to convert transcriptions from source
86	# format to canonical xml format belongs here.
87	##############################################
88	def process_source	1✔
89	if source_text_changed?	259✔
90	self.xml_text = wiki_to_xml(self, Page::TEXT_TYPE::TRANSCRIPTION)	61✔
91	end
92
93	if self.respond_to?(:source_translation) && source_translation_changed?	259✔
94	self.xml_translation = wiki_to_xml(self, Page::TEXT_TYPE::TRANSLATION)	12✔
95	end
96	end
97
98	def wiki_to_xml(page, text_type)	1✔
99	subjects_disabled = page.collection.subjects_disabled	79✔
100
101	source_text = case text_type	79✔
102	when Page::TEXT_TYPE::TRANSCRIPTION	66✔
103	page.source_text	66✔
104	when Page::TEXT_TYPE::TRANSLATION	13✔
105	page.source_translation	13✔
106	else	×
107	''	×
108	end
109
110	xml_string = String.new(source_text)	79✔
111	xml_string = process_latex_snippets(xml_string)	79✔
112	xml_string = clean_bad_braces(xml_string)	79✔
113	xml_string = clean_script_tags(xml_string)	79✔
114	xml_string = process_square_braces(xml_string) unless subjects_disabled	79✔
115	xml_string = process_linewise_markup(xml_string)	79✔
116	xml_string = process_line_breaks(xml_string)	79✔
117	xml_string = valid_xml_from_source(xml_string)	79✔
118	xml_string = update_links_and_xml(xml_string, false, text_type)	79✔
119	xml_string = postprocess_xml_markup(xml_string)	79✔
120	postprocess_sections	79✔
121	xml_string	79✔
122	end
123
124
125	# remove script tags from HTML to prevent javascript injection
126	def clean_script_tags(text)	1✔
127	# text.gsub(/<script.*?<\/script>/m, '')
128	text.gsub(/<\/?script.*?>/m, '')	79✔
129	end
130
131	BAD_SHIFT_REGEX = /\[\[([[[:alpha:]][[:blank:]]\|,\(\)\-[[:digit:]]]+)\}\}/	1✔
132	def clean_bad_braces(text)	1✔
133	text.gsub BAD_SHIFT_REGEX, '[[\\1]]'	79✔
134	end
135
136	BRACE_REGEX = /\[\[.*?\]\]/m	1✔
137	def process_square_braces(text)	1✔
138	# find all the links
139	wikilinks = text.scan(BRACE_REGEX)	76✔
140	wikilinks.each do \|wikilink_contents\|	76✔
141	# strip braces
142	munged = wikilink_contents.sub('[[', '')	30✔
143	munged = munged.sub(']]', '')	30✔
144
145	# extract the title and display
146	if munged.include? '\|'	30✔
147	parts = munged.split '\|'	10✔
148	title = parts[0]	10✔
149	verbatim = parts[1]	10✔
150	else	20✔
151	title = munged	20✔
152	verbatim = munged	20✔
153	end
154
155	title = canonicalize_title(title)	30✔
156
157	replacement = "<link target_title=\"#{title}\">#{verbatim}</link>"	30✔
158	text.sub!(wikilink_contents, replacement)	30✔
159	end
160
161	text	76✔
162	end
163
164	def remove_square_braces(text)	1✔
165	new_text = text.scan(BRACE_REGEX)	3✔
166	new_text.each do \|results\|	3✔
167	changed = results	3✔
168	# remove title
169	if results.include?('\|')	3!
170	changed = results.sub(/\[\[.*?\\|/, '')	×
171	end
172	changed = changed.sub('[[', '')	3✔
173	changed = changed.sub(']]', '')	3✔
174
175	text.sub!(results, changed)	3✔
176	end
177	text	3✔
178	end
179
180	LATEX_SNIPPET = /(\{\{tex:?(.*?):?tex\}\})/m	1✔
181	def process_latex_snippets(text)	1✔
182	return text unless self.respond_to? :tex_figures	79✔
183	replacements = {}	64✔
184	figures = self.tex_figures.to_a	64✔
185
186	text.scan(LATEX_SNIPPET).each_with_index do \|pair, i\|	64✔
187	with_tags = pair[0]	×
188	contents = pair[1]	×
189
190	replacements[with_tags] = "<texFigure position=\"#{i+1}\"/>" # position attribute in acts as list starts with 1	×
191
192	figure = figures[i] \|\| TexFigure.new	×
193	figure.source = contents unless figure.source == contents	×
194	figures[i] = figure	×
195	end
196
197	self.tex_figures = figures	64✔
198	replacements.each_pair do \|s, r\|	64✔
199	text.sub!(s, r)	×
200	end
201
202	text	64✔
203	end
204
205	HEADER = /\s\\|\s/	1✔
206	SEPARATOR = /---.*\\|/	1✔
207	ROW = HEADER	1✔
208
209	def process_linewise_markup(text)	1✔
210	@tables = []	83✔
211	@sections = []	83✔
212	new_lines = []	83✔
213	current_table = nil	83✔
214	text.lines.each do \|line\|	83✔
215	# first deal with any sections
216	line = process_any_sections(line)	107✔
217	# look for a header
218	if !current_table	107✔
219	if line.match(HEADER)	104✔
220	line = line.chomp	4✔
221	current_table = { header: [], rows: [], section: @sections.last }	4✔
222	# fill the header
223	cells = line.split(/\s\\|\s/)	4✔
224	cells.shift if line.match(/^\\|/) # remove leading pipe	4✔
225
226	# trim whitespace from each header cell
227	cells = cells.map(&:strip)	4✔
228
229	current_table[:header] = cells.map { \|cell_title\| cell_title.sub(/^!\s*/, '') }	16✔
230	heading = cells.map do \|cell\|	4✔
231	if cell.match(/^!/)	12!
UNCOV 232	"<th class=\"bang\">#{cell.sub(/^!\s*/, '')}</th>"	×
233	else	12✔
234	"<th>#{cell}</th>"	12✔
235	end
236	end.join(' ')
237	new_lines << "<table class=\"tabular\">\n<thead>\n<tr>#{heading}</tr></thead>"	4✔
238	else
239	# no current table, no table contents -- NO-OP	100✔
240	new_lines << line	100✔
241	end
242	else
243	# this is either an end or a separator	3✔
244	if line.match(SEPARATOR)	3✔
245	# NO-OP	2✔
246	elsif line.match(ROW)	2✔
247	# handle initial blank cells - if line starts with whitespace followed by pipe, preserve empty cell	2✔
248	line_chomp = line.chomp	2✔
249	has_initial_empty_cell = line_chomp.match(/^\s+\\|/)	2✔
250
251	# remove leading and trailing delimiters
252	clean_line = line_chomp.sub(/^\s\\|/, '').sub(/\\|\s$/, '')	2✔
253	# fill the row
254	cells = clean_line.split(/\s\\|\s/, -1) # -1 means "don't prune empty values at the end"	2✔
255
256	# trim whitespace from each cell
257	cells = cells.map(&:strip)	2✔
258
259	# if there was initial whitespace before pipe, add empty cell at beginning
260	cells.unshift('') if has_initial_empty_cell	2✔
261	current_table[:rows] << cells	2✔
262	rowline = ''	2✔
263	cells.each_with_index do \|cell, _i\|	2✔
264	rowline += "<td>#{cell}</td> "	6✔
265	end
266
267	if current_table[:rows].size == 1	2✔
268	new_lines << '<tbody>'	1✔
269	end
270	new_lines << "<tr>#{rowline}</tr>"	2✔
271	else
272	# finished the last row	×
273	unless current_table[:rows].empty? # only process tables with bodies	×
274	@tables << current_table	×
UNCOV 275	new_lines << '</tbody>'	×
276	end
UNCOV 277	new_lines << '</table><lb/>'	×
UNCOV 278	current_table = nil	×
279	end
280	end
281	end
282
283	if current_table	83✔
284	# unclosed table	4✔
285	@tables << current_table	4✔
286	unless current_table[:rows].empty? # only process tables with bodies	4✔
287	@tables << current_table	1✔
288	new_lines << '</tbody>'	1✔
289	end
290	new_lines << '</table><lb/>'	4✔
291	end
292	# do something with the table data
293	new_lines.join(' ')	83✔
294	end
295
296	def process_any_sections(line)	1✔
297	6.downto(2) do \|depth\|	107✔
298	line.scan(/(={#{depth}}([^=]+)={#{depth}})/).each do \|section_match\|	535✔
UNCOV 299	wiki_title = section_match[1].strip	×
300	if wiki_title.length > 0	×
301	verbatim = XmlSourceProcessor.cell_to_plaintext(wiki_title)	×
302	safe_verbatim = verbatim.gsub(/"/, '"')	×
303	line = line.sub(section_match.first, "<entryHeading title=\"#{safe_verbatim}\" depth=\"#{depth}\" >#{wiki_title}</entryHeading>")	×
304	@sections << Section.new(title: wiki_title, depth: depth)	×
305	end
306	end
307	end
308
309	line	107✔
310	end
311
312	def postprocess_sections	1✔
313	@sections.each do \|section\|	79✔
UNCOV 314	doc = XmlSourceProcessor.cell_to_xml(section.title)	×
UNCOV 315	doc.elements.each('//link') do \|e\|	×
UNCOV 316	title = e.attributes['target_title']	×
UNCOV 317	article = collection.articles.where(title: title).first	×
UNCOV 318	if article	×
UNCOV 319	e.add_attribute('target_id', article.id.to_s)	×
320	end
321	end
UNCOV 322	section.title = XmlSourceProcessor.xml_to_cell(doc)	×
323	end
324	end
325
326
327	def canonicalize_title(title)	1✔
328	# kill all tags
329	title = title.gsub(/<.*?>/, '')	30✔
330	# linebreaks -> spaces
331	title = title.gsub(/\n/, ' ')	30✔
332	# multiple spaces -> single spaces
333	title = title.gsub(/\s+/, ' ')	30✔
334	# change double quotes to proper xml
335	title = title.gsub(/\"/, '"')	30✔
336	title	30✔
337	end
338
339	# transformations converting source mode transcription to xml
340	def process_line_breaks(text)	1✔
341	text="<p>#{text}</p>"	79✔
342	text = text.gsub(/\s\n\s\n\s*/, '</p><p>')	79✔
343	text = text.gsub(/([[:word:]]+)-\r\n\s*/, '\1<lb break="no" />')	79✔
344	text = text.gsub(/\r\n\s*/, '<lb/>')	79✔
345	text = text.gsub(/([[:word:]]+)-\n\s*/, '\1<lb break="no" />')	79✔
346	text = text.gsub(/\n\s*/, '<lb/>')	79✔
347	text = text.gsub(/([[:word:]]+)-\r\s*/, '\1<lb break="no" />')	79✔
348	text = text.gsub(/\r\s*/, '<lb/>')	79✔
349	text	79✔
350	end
351
# Wraps transcription source in a <page> document that an XML parser accepts.
#
# Bare ampersands are escaped as &amp;, ampersands that were already written
# as &amp; are left single-escaped, and any character outside the XML 1.0
# character range is replaced with a space.
#
# @param source [String, nil] raw source text; nil is treated as empty
# @return [String] an XML document string with the source inside <page>
def valid_xml_from_source(source)
  safe = (source || '').gsub('&', '&amp;')
  # undo the double escaping of ampersands that were already entities
  safe.gsub!('&amp;amp;', '&amp;')
  # \u{...} is required for code points above U+FFFF; a bare \u10000 is read
  # as \u1000 followed by "0", which dropped supplementary-plane characters.
  safe.gsub!(/[^\u0009\u000A\u000D\u0020-\uD7FF\uE000-\uFFFD\u{10000}-\u{10FFFF}]/, ' ')

  "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<page>\n#{safe}\n</page>\n"
end
365
# Resolves every <link> element in the page XML against the collection's
# articles and rewrites the element with the resolved attributes.
#
# For each link: the target title defaults to the link text; the article is
# looked up by current title, then by a title recorded in article_versions
# within the last hour; if none is found a new Article is built. Outside
# preview mode the existing links are cleared first, new articles are saved,
# a link record is created through create_link, and target_id / link_id are
# written onto the element.
#
# @param xml_string [String] XML document text
# @param preview_mode [Boolean] when true nothing is saved and no ids are added
# @param text_type passed through to clear_links and create_link
# @return [String] the rewritten XML document
def update_links_and_xml(xml_string, preview_mode = false, text_type)
  # first clear out the existing links
  clear_links(text_type) unless preview_mode

  candidate_articles = collection.articles.left_joins(:article_versions)
  page_update_timestamp = 1.hour.ago

  doc = REXML::Document.new xml_string
  doc.elements.each('//link') do |element|
    # default the title to the text if it's not specified
    title = element.attributes['target_title'] || element.text
    display_text = element.children.map(&:to_s).join
    debug("link display_text = #{display_text}")
    # change the xml version of quotes back to double quotes for article title
    title = title.gsub('&quot;', '"')

    article = candidate_articles.find_by(title: title)

    if article.nil?
      # fall back to a title the article carried in a recent version
      article = candidate_articles.where('article_versions.title': title)
                                  .where('article_versions.created_on > ?', page_update_timestamp)
                                  .first
      if article.present?
        display_text = article.title
        title = article.title
      end
    end

    # create new blank articles if they don't exist already
    if article.nil?
      article = Article.new
      article.title = title
      article.collection = collection
      article.created_by_id = Current.user.id if Current.user.present?
      article.save! unless preview_mode
    end

    link_id = create_link(article, display_text, text_type) unless preview_mode

    # now update the attribute
    link_element = REXML::Element.new('link')
    element.children.each { |child| link_element.add(child) }
    link_element.add_attribute('target_title', title)
    debug('element=' + link_element.inspect)
    debug('article=' + article.inspect)
    link_element.add_attribute('target_id', article.id.to_s) unless preview_mode
    link_element.add_attribute('link_id', link_id.to_s) unless preview_mode
    element.replace_with(link_element)
  end

  processed = ''
  doc.write(processed)
  processed
end
426
427	# handle XML-dependent post-processing
# handle XML-dependent post-processing
#
# Removes an <lb> element when the node immediately before it is also an
# <lb> element, so directly adjacent line breaks collapse into one. Breaks
# separated by text (even whitespace) are left alone.
#
# @param xml_string [String] XML document text
# @return [String] the serialized document after the cleanup
def postprocess_xml_markup(xml_string)
  doc = REXML::Document.new(xml_string)

  doc.elements.each('//lb') do |element|
    preceding = element.previous_element
    next unless preceding && preceding.name == 'lb'
    # previous_element skips text nodes; require true adjacency
    next unless element.previous_sibling.node_type == :element

    element.parent.elements.delete(element)
  end

  processed = ''
  doc.write(processed)
  processed
end
440
441
# Wrapper placed around cell markup so it can be parsed as a document.
CELL_PREFIX = "<?xml version='1.0' encoding='UTF-8'?><cell>".freeze
CELL_SUFFIX = '</cell>'.freeze

# Parses a fragment of cell markup into a REXML document rooted at <cell>.
# Ampersands are escaped first so that a bare "&" in the cell text does not
# break the parse.
#
# @param cell [String] cell markup
# @return [REXML::Document]
def self.cell_to_xml(cell)
  REXML::Document.new(CELL_PREFIX + cell.gsub('&', '&amp;') + CELL_SUFFIX)
end
448
# Serializes a cell document back to bare cell markup by writing it out and
# removing the first occurrence of the wrapper prefix and suffix.
def self.xml_to_cell(doc)
  serialized = ''
  doc.write(serialized)

  without_prefix = serialized.sub(CELL_PREFIX, '')
  without_prefix.sub(CELL_SUFFIX, '')
end
454
# Returns the text content of a cell with all element markup removed.
#
# Every text node in the parsed cell is collected in document order and the
# nodes' string forms are concatenated.
#
# @param cell [String] cell markup
# @return [String] concatenated text of the cell
def self.cell_to_plaintext(cell)
  doc = cell_to_xml(cell)
  REXML::XPath.match(doc, './/text()').join
end
459
# Lists the target_title of every <link> in the cell, one per line
# (each title is followed by a newline).
def self.cell_to_subject(cell)
  subject_lines = ''

  cell_to_xml(cell).elements.each('//link') do |link|
    subject_lines << link.attributes['target_title'] << "\n"
  end

  subject_lines
end
470
# Lists the category titles of every article linked from the cell, one per
# line. Links that carry no target_id attribute are skipped; the rest are
# loaded with Article.find.
def self.cell_to_category(cell)
  category_lines = ''

  cell_to_xml(cell).elements.each('//link') do |link|
    article_id = link.attributes['target_id']
    next unless article_id

    Article.find(article_id).categories.each do |category|
      category_lines << category.title
      category_lines << "\n"
    end
  end

  category_lines
end
486
487	##############################################
488	# Code to rename links within the text.
489	# This assumes that the name change has already
490	# taken place within the article table in the DB
491	##############################################
# Rewrites wiki links in this record's text after an article was renamed.
#
# The old title is turned into a regex source in which any run of whitespace
# matches one or more whitespace characters, then source_text is rewritten
# through rename_link_in_text. source_translation gets the same treatment
# when the record has that attribute and it is not nil.
def rename_article_links(old_title, new_title)
  escaped_title = Regexp.escape(old_title)
  # Regexp.escape turns ' ' into '\\ ' -- undo that before widening whitespace
  unescaped_spaces = escaped_title.gsub('\\ ', ' ')
  # any whitespace run in the title matches 1+n whitespace characters
  title_regex = unescaped_spaces.gsub(/\s+/, '\s+')

  self.source_text = rename_link_in_text(source_text, title_regex, new_title)

  # Articles don't have translations, but pages.source_translation still needs updating
  return unless has_attribute?(:source_translation)
  return if source_translation.nil?

  self.source_translation = rename_link_in_text(source_translation, title_regex, new_title)
end
505
# Renames or removes wiki links to one article inside a block of text.
#
# With an empty new_title the link is deleted: [[Old Title|Display Text]]
# becomes Display Text and [[Old Title]] becomes Old Title. Otherwise
# [[Old Title|Display Text]] becomes [[New Title|Display Text]] and
# [[Old Title]] becomes [[New Title|Old Title]]. Matching is
# case-insensitive.
#
# @param text [String, nil] text containing wiki links; nil is returned as is
# @param title_regex [String] regex source matching the old title
# @param new_title [String] replacement title, or '' to unlink
# @return [String, nil] the rewritten text
def rename_link_in_text(text, title_regex, new_title)
  return text if text.nil?

  bare_link = /\[\[(#{title_regex})\]\]/i

  if new_title == ''
    # Link deleted, remove [[ ]] but keep the original title text
    text
      .gsub(/\[\[#{title_regex}\|([^\]]+)\]\]/i, '\1')
      .gsub(bare_link, '\1')
  else
    # Block form keeps new_title literal: in a replacement string a title
    # containing "\1" or "\\" would be read as a backreference or escape.
    text
      .gsub(/\[\[#{title_regex}\|/i) { "[[#{new_title}|" }
      .gsub(bare_link) { "[[#{new_title}|#{Regexp.last_match(1)}]]" }
  end
end
523
524
# The Pandoc pipe_tables extension needs a pipe character at the start and
# end of every table line, so each line of the text is wrapped in pipes.
def pipe_tables_formatting(text)
  rows = text.split("\n")
  piped_rows = rows.map { |row| ['|', row, '|'].join }
  piped_rows.join("\n")
end
530
# Renders an XML table as a markdown-style text table.
#
# <lb> nodes are replaced with spaces, each column is sized to the longer of
# its heading and its widest cell, and the output is a header line, a dashed
# separator line and one line per <tr>, with cells right-justified and joined
# by ' | '. The result always ends with two extra newlines.
#
# NOTE(review): every XPath here starts with '//', which selects from the
# document root rather than from table_element -- confirm callers pass a
# document holding a single table.
#
# @param table_element node responding to #xpath (presumably Nokogiri -- confirm)
# @param pandoc_format [Boolean] wrap each line in pipes via pipe_tables_formatting
# @param plaintext_export [Boolean] use cell text as is instead of converting
#   the cell markup through xml_to_pandoc_md
# @return [String] the text table
def xml_table_to_markdown_table(table_element, pandoc_format = false, plaintext_export = false)
  text_table = ''

  # clean up in-cell line-breaks
  table_element.xpath('//lb').each { |n| n.replace(' ') }

  # calculate the widths of each column based on max(header, cell[0...end])
  cells_per_row = table_element.xpath('//tr').map { |row| row.xpath('td').count }
  column_count = ([ table_element.xpath('//th').count ] + cells_per_row).max
  column_widths = {}
  1.upto(column_count) do |column_index|
    longest_cell = table_element.xpath("//tr/td[position()=#{column_index}]").map { |e| e.text.length }.max || 0
    corresponding_heading = table_element.xpath("//th[position()=#{column_index}]").first
    heading_length = corresponding_heading.nil? ? 0 : corresponding_heading.text.length
    column_widths[column_index] = [ longest_cell, heading_length ].max
  end

  # print the header as markdown
  cell_strings = []
  table_element.xpath('//th').each_with_index do |e, i|
    cell_strings << e.text.rjust(column_widths[i + 1], ' ')
  end
  text_table << cell_strings.join(' | ') << "\n"

  # print the separator
  text_table << column_count.times.map { |i| ''.rjust(column_widths[i + 1], '-') }.join(' | ') << "\n"

  # print each row as markdown
  table_element.xpath('//tr').each do |row_element|
    row_cells = row_element.xpath('td').map do |e|
      # the cell's position is read from its XPath, e.g. ".../td[2]"
      index = e.path.match(/.*td\[(\d+)\]/)
      width =
        if index
          column_widths[index[1].to_i] || 80 # default for hand-coded tables
        else
          column_widths.values.first
        end

      if plaintext_export
        e.text.rjust(width, ' ')
      else
        # apostrophes are passed as a character reference to the converter
        inner_html = xml_to_pandoc_md(e.to_s.gsub("'", '&#39;'), false, false, nil, false).gsub("\n", '')
        inner_html.rjust(width, ' ')
      end
    end
    text_table << row_cells.join(' | ') << "\n"
  end

  text_table = pipe_tables_formatting(text_table) if pandoc_format

  "#{text_table}\n\n"
end
585
586
587
# Sends a message to the logger at debug level, prefixed with "DEBUG: ".
def debug(msg)
  prefixed = "DEBUG: #{msg}"
  logger.debug(prefixed)
end
591	end

benwbrum / fromthepage / 18107281180

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous