17387282326

Committed 01 Sep 2025 09:13PM UTC coverage: 64.405%. Remained the same

Build # 17387282326

Build Type

push

github

Committed by

web-flow

Commit Message

4857 - Require rubocop step in CI (#4858)

* 4857 - Require rubocop step in CI

* 4865 - Organize gemfiles

Run Details

1790 of 3303 branches covered (54.19%)

Branch coverage included in aggregate %.

839 of 1497 new or added lines in 133 files covered. (56.05%)

43 existing lines in 29 files now uncovered.

7928 of 11786 relevant lines covered (67.27%)

103.82 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

71.85

/app/models/xml_source_processor.rb

module XmlSourceProcessor
  # Runs link validation over the transcription text. Does nothing (and
  # returns nil) when source_text is blank.
  def validate_source
    validate_links(self.source_text) unless self.source_text.blank?
  end

  # Runs link validation over the translation text. Does nothing (and
  # returns nil) when source_translation is blank.
  def validate_source_translation
    validate_links(self.source_translation) unless self.source_translation.blank?
  end

  # Checks +text+ for problems or typos in subject links (tags of the form
  # [[Title]] or [[Title|display text]]) and adds one message to
  # errors[:base] per problem found.
  #
  # Problems reported: three closing brackets, missing closing braces, a
  # blank tag, an unclosed single bracket inside a tag, and a blank subject
  # or blank display text around a pipe. Validation stops at the first tag
  # containing "]]]".
  #
  # text:: the source text to inspect (must be a String)
  def validate_links(text)
    error_scope = [ :activerecord, :errors, :models, :xml_source_processor ]
    add_link_error = lambda do |key, **options|
      errors.add(:base, I18n.t('subject_linking_error', scope: error_scope) + I18n.t(key, scope: error_scope, **options))
    end
    blank = /\A\s*\z/ # whole-string match; ^/$ would match any blank line inside a multi-line tag

    # split on all begin-braces; the -1 limit keeps a trailing empty chunk so
    # that an unclosed "[[" at the very end of the text is still examined
    tags = text.split('[[', -1)
    debug("validate_source: tags to process are #{tags.inspect}")
    # remove the initial string which occurs before the first tag. drop(1)
    # removes only that element, whereas Array#- would also discard every
    # later tag that happens to equal it.
    tags = tags.drop(1)
    debug("validate_source: massaged tags to process are #{tags.inspect}")
    tags.each do |tag|
      debug(tag)

      if tag.include?(']]]')
        add_link_error.call('tags_should_not_use_3_brackets')
        return
      end
      unless tag.include?(']]')
        tag = tag.strip
        add_link_error.call('wrong_number_of_closing_braces', tag: '"[[' + tag + '"')
      end

      # just pull the piece before the closing braces; partition always
      # yields a String, even for "[[]]" where split(']]')[0] would be nil
      inner_tag = tag.partition(']]').first
      add_link_error.call('blank_tag_in', tag: '"[[' + tag + '"') if inner_tag.match?(blank)

      # check for unclosed single bracket
      if inner_tag.include?('[') && !inner_tag.include?(']')
        add_link_error.call('unclosed_bracket_within', tag: '"' + inner_tag + '"')
      end

      # check for blank title or display name with pipes
      next unless inner_tag.include?('|')

      # the -1 limit keeps empty parts, so "a|" and "|" are detected as blank
      tag_parts = inner_tag.split('|', -1)
      debug("validate_source: inner tag parts are #{tag_parts.inspect}")
      add_link_error.call('blank_subject_in', tag: '"[[' + inner_tag + ']]"') if tag_parts[0].match?(blank)
      add_link_error.call('blank_text_in', tag: '"[[' + inner_tag + ']]"') if tag_parts[1].match?(blank)
    end
  end

# Writer for source_text. Flags the attribute as changed via
# source_text_will_change! before delegating to the inherited writer, so the
# source_text_changed? check in process_source sees the assignment.
# NOTE(review): presumably this forces dirty tracking even for in-place
# edits of the string -- confirm against the including model.
def source_text=(text)
    self.source_text_will_change!
    super
end

# Writer for source_translation. Flags the attribute as changed via
# source_translation_will_change! before delegating to the inherited writer,
# so the source_translation_changed? check in process_source sees the
# assignment.
def source_translation=(text)
  self.source_translation_will_change!
  super
end

  ##############################################
  # All code to convert transcriptions from source
  # format to canonical xml format belongs here.
  ##############################################
  # Regenerates the canonical XML for each source field that has changed:
  # xml_text from the transcription, and xml_translation from the translation
  # when the including object responds to source_translation.
  def process_source
    self.xml_text = wiki_to_xml(self, Page::TEXT_TYPE::TRANSCRIPTION) if source_text_changed?

    translation_changed = self.respond_to?(:source_translation) && source_translation_changed?
    self.xml_translation = wiki_to_xml(self, Page::TEXT_TYPE::TRANSLATION) if translation_changed
  end

  # Converts one source field of +page+ from wiki markup to XML.
  #
  # page::      object providing collection, source_text and source_translation
  # text_type:: Page::TEXT_TYPE::TRANSCRIPTION or Page::TEXT_TYPE::TRANSLATION;
  #             any other value is processed as an empty string
  #
  # Returns the XML string. Subject-link expansion is skipped when the page's
  # collection has subjects_disabled set. The individual steps are invoked on
  # self, and their order matters: each one consumes the previous output.
  def wiki_to_xml(page, text_type)
    subjects_disabled = page.collection.subjects_disabled

    raw_source =
      case text_type
      when Page::TEXT_TYPE::TRANSCRIPTION then page.source_text
      when Page::TEXT_TYPE::TRANSLATION then page.source_translation
      else ''
      end

    # work on a copy: several steps below mutate the string in place
    markup = String.new(raw_source)
    markup = process_latex_snippets(markup)
    markup = clean_bad_braces(markup)
    markup = clean_script_tags(markup)
    markup = process_square_braces(markup) unless subjects_disabled
    markup = process_linewise_markup(markup)
    markup = process_line_breaks(markup)
    markup = valid_xml_from_source(markup)
    markup = update_links_and_xml(markup, false, text_type)
    markup = postprocess_xml_markup(markup)
    postprocess_sections
    markup
  end


  # Removes opening and closing script tags from +text+ to prevent javascript
  # injection. Matching is case-insensitive, because HTML tag names are, so
  # <SCRIPT> and <Script> are stripped as well as <script>.
  #
  # Only the tags themselves are removed; the text between them is kept.
  # Returns a new string.
  def clean_script_tags(text)
    text.gsub(/<\/?script.*?>/mi, '')
  end

  BAD_SHIFT_REGEX = /\[\[([[[:alpha:]][[:blank:]]|,\(\)\-[[:digit:]]]+)\}\}/
  # Repairs links that were opened with "[[" but closed with "}}" by
  # rewriting the closing braces to "]]". Returns a new string.
  def clean_bad_braces(text)
    text.gsub(BAD_SHIFT_REGEX) { "[[#{Regexp.last_match(1)}]]" }
  end

  BRACE_REGEX = /\[\[.*?\]\]/m
  # Replaces every [[Title]] or [[Title|display text]] wikilink in +text+
  # with a <link target_title="..."> element. The title is normalized with
  # canonicalize_title; the display text is emitted verbatim.
  #
  # +text+ is modified in place and also returned.
  def process_square_braces(text)
    # The block form of gsub! inserts the replacement literally. A string
    # replacement would interpret backslash sequences (\0, \1, \\) that may
    # occur in the transcribed text.
    text.gsub!(BRACE_REGEX) do |wikilink|
      # strip braces
      munged = wikilink.sub('[[', '').sub(']]', '')

      # extract the title and display
      if munged.include?('|')
        title, verbatim = munged.split('|')
      else
        title = verbatim = munged
      end

      # to_s guards against a tag such as "[[|]]", whose split yields no parts
      title = canonicalize_title(title.to_s)

      "<link target_title=\"#{title}\">#{verbatim}</link>"
    end

    text
  end

  # Replaces every wikilink in +text+ with its plain display text:
  # [[Title|display]] becomes "display" and [[Title]] becomes "Title".
  #
  # +text+ is modified in place and also returned.
  def remove_square_braces(text)
    # block form: the replacement is inserted literally, so backslashes in
    # the display text are not treated as back-references
    text.gsub!(BRACE_REGEX) do |wikilink|
      display = wikilink
      # remove title; /m because BRACE_REGEX lets a link span several lines
      display = wikilink.sub(/\[\[.*?\|/m, '') if wikilink.include?('|')
      display.sub('[[', '').sub(']]', '')
    end
    text
  end

  LATEX_SNIPPET = /(\{\{tex:?(.*?):?tex\}\})/m
  # Replaces each {{tex: ... :tex}} snippet in +text+ with a
  # <texFigure position="N"/> placeholder (N counts from 1 in document order)
  # and stores the snippet contents in the matching entry of tex_figures,
  # creating TexFigure objects as needed.
  #
  # Returns +text+ untouched when the including object has no tex_figures.
  # Otherwise +text+ is modified in place and returned.
  def process_latex_snippets(text)
    return text unless self.respond_to? :tex_figures
    figures = self.tex_figures.to_a
    index = 0

    # A single left-to-right pass gives every occurrence its own position,
    # including snippets whose text is identical to an earlier one.
    text.gsub!(LATEX_SNIPPET) do
      contents = Regexp.last_match(2)

      figure = figures[index] || TexFigure.new
      figure.source = contents unless figure.source == contents
      figures[index] = figure

      index += 1
      "<texFigure position=\"#{index}\"/>" # position attribute in acts as list starts with 1
    end

    self.tex_figures = figures
    text
  end

  HEADER = /\s\|\s/
  SEPARATOR = /---.*\|/
  ROW = HEADER

  # Converts line-oriented wiki markup in +text+ to XML: section headings
  # (delegated to process_any_sections) and pipe-delimited tables.
  #
  # A line containing " | " opens a table and becomes its header row. Within
  # a table, separator lines ("---|") are skipped, further " | " lines become
  # body rows, and any other line closes the table.
  #
  # Side effects: resets @sections and @tables. Every table that has at least
  # one body row is appended to @tables exactly once, as a hash with :header,
  # :rows and :section keys.
  #
  # Returns the processed lines joined with single spaces.
  def process_linewise_markup(text)
    @tables = []
    @sections = []
    new_lines = []
    current_table = nil

    # Emits closing markup; only tables with bodies are recorded.
    close_table = lambda do |table|
      unless table[:rows].empty?
        @tables << table
        new_lines << '</tbody>'
      end
      new_lines << '</table><lb/>'
    end

    text.each_line do |line|
      # first deal with any sections
      line = process_any_sections(line)

      if current_table.nil?
        # look for a header
        unless line.match?(HEADER)
          # no current table, no table contents -- pass the line through
          new_lines << line
          next
        end

        # drop the line terminator so it does not end up in the last cell
        line = line.chomp
        current_table = { header: [], rows: [], section: @sections.last }
        # fill the header
        cells = line.split(/\s*\|\s*/)
        cells.shift if line.match?(/^\|/) # remove leading pipe
        current_table[:header] = cells.map { |cell_title| cell_title.sub(/^!\s*/, '') }
        heading = cells.map do |cell|
          if cell.match?(/^!/)
            "<th class=\"bang\">#{cell.sub(/^!\s*/, '')}</th>"
          else
            "<th>#{cell}</th>"
          end
        end.join(' ')
        new_lines << "<table class=\"tabular\">\n<thead>\n<tr>#{heading}</tr></thead>"
      elsif line.match?(SEPARATOR)
        # separator between header and body -- nothing to emit
      elsif line.match?(ROW)
        # remove leading and trailing delimiters
        clean_line = line.chomp.sub(/^\s*\|/, '').sub(/\|\s*$/, '')
        # fill the row
        cells = clean_line.split(/\s*\|\s*/, -1) # -1 means "don't prune empty values at the end"
        current_table[:rows] << cells
        rowline = cells.map { |cell| "<td>#{cell}</td> " }.join

        new_lines << '<tbody>' if current_table[:rows].size == 1
        new_lines << "<tr>#{rowline}</tr>"
      else
        # finished the last row
        # NOTE(review): the line that terminates the table is not emitted --
        # confirm that only blank lines are expected here.
        close_table.call(current_table)
        current_table = nil
      end
    end

    # unclosed table at end of text
    close_table.call(current_table) if current_table

    new_lines.join(' ')
  end

  # Rewrites wiki headings in +line+ (== Title == through ====== Title ======)
  # as <entryHeading> elements, checking the deepest level first, and appends
  # a Section to @sections for each non-blank heading found.
  #
  # Returns the rewritten line, or the line unchanged when it has no headings.
  def process_any_sections(line)
    6.downto(2) do |depth|
      heading_pattern = /(={#{depth}}([^=]+)={#{depth}})/
      line.scan(heading_pattern).each do |whole_match, raw_title|
        wiki_title = raw_title.strip
        next if wiki_title.empty?

        # the title attribute holds plain text with double quotes escaped
        plain_title = XmlSourceProcessor.cell_to_plaintext(wiki_title)
        safe_verbatim = plain_title.gsub(/"/, '&quot;')
        heading_xml = "<entryHeading title=\"#{safe_verbatim}\" depth=\"#{depth}\" >#{wiki_title}</entryHeading>"
        line = line.sub(whole_match, heading_xml)
        @sections << Section.new(title: wiki_title, depth: depth)
      end
    end

    line
  end

  # For every section collected in @sections, looks up the article named by
  # each <link> in the section title (within this object's collection) and,
  # when one exists, records its id as the link's target_id attribute. The
  # section title is then replaced with the re-serialized markup.
  def postprocess_sections
    @sections.each do |section|
      title_doc = XmlSourceProcessor.cell_to_xml(section.title)
      title_doc.elements.each('//link') do |link|
        target_title = link.attributes['target_title']
        article = collection.articles.where(title: target_title).first
        link.add_attribute('target_id', article.id.to_s) if article
      end
      section.title = XmlSourceProcessor.xml_to_cell(title_doc)
    end
  end


  # Normalizes a link title: strips tags, turns line breaks into spaces,
  # collapses whitespace runs to a single space and escapes double quotes as
  # &quot;. Returns a new string.
  def canonicalize_title(title)
    title
      .gsub(/<.*?>/, '')       # kill all tags
      .gsub(/\n/, ' ')         # linebreaks -> spaces
      .gsub(/\s+/, ' ')        # multiple spaces -> single spaces
      .gsub(/\"/, '&quot;')    # change double quotes to proper xml
  end

  # Converts the line structure of source-mode text to XML: the text is
  # wrapped in <p>, blank lines become paragraph boundaries, a word hyphenated
  # across a line end becomes <lb break="no" />, and every remaining line end
  # becomes <lb/>. CRLF, LF and bare CR endings are handled in that order.
  def process_line_breaks(text)
    paragraphs = "<p>#{text}</p>".gsub(/\s*\n\s*\n\s*/, '</p><p>')

    # [hyphenated line end, plain line end] for each line-terminator style
    line_endings = [
      [ /([[:word:]]+)-\r\n\s*/, /\r\n\s*/ ],
      [ /([[:word:]]+)-\n\s*/, /\n\s*/ ],
      [ /([[:word:]]+)-\r\s*/, /\r\s*/ ]
    ]

    line_endings.reduce(paragraphs) do |markup, (hyphenated, plain)|
      markup.gsub(hyphenated, '\1<lb break="no" />').gsub(plain, '<lb/>')
    end
  end

  # Wraps +source+ (nil is treated as empty) in an XML declaration and a
  # <page> element. Ampersands are escaped as &amp; (an already-escaped &amp;
  # is left as a single &amp;) and characters not permitted in XML 1.0 are
  # replaced by a space.
  #
  # Returns the document text as a String.
  def valid_xml_from_source(source)
    safe = (source || '').gsub(/\&/, '&amp;')
    safe.gsub!(/\&amp;amp;/, '&amp;')
    # Allowed: tab, LF, CR, U+0020-U+D7FF, U+E000-U+FFFD, U+10000-U+10FFFF.
    # The braced \u{...} form is required for code points above U+FFFF; the
    # unbraced \u10000 would be read as \u1000 followed by a literal "0".
    safe.gsub!(/[^\u0009\u000A\u000D\u0020-\uD7FF\uE000-\uFFFD\u{10000}-\u{10FFFF}]/, ' ')

    <<EOF
    <?xml version="1.0" encoding="UTF-8"?>
      <page>
        #{safe}
      </page>
EOF
  end

  # Rebuilds every <link> element in +xml_string+ so that it carries the
  # target article's title and, outside preview mode, its target_id and the
  # link_id returned by create_link.
  #
  # xml_string::   XML document text
  # preview_mode:: when true nothing is persisted: existing links are not
  #                cleared, new articles are not saved, no link is created and
  #                no id attributes are written
  # text_type::    passed through to clear_links and create_link
  #
  # Articles that do not yet exist in the collection are created on the fly.
  # Returns the re-serialized document.
  def update_links_and_xml(xml_string, preview_mode = false, text_type)
    # first clear out the existing links
    clear_links(text_type) unless preview_mode

    doc = REXML::Document.new(xml_string)
    doc.elements.each('//link') do |element|
      # default the title to the text if it's not specified
      title = element.attributes['target_title'] || element.text
      display_text = element.children.inject('') { |markup, child| markup + child.to_s }
      debug("link display_text = #{display_text}")
      # change the xml version of quotes back to double quotes for article title
      title = title.gsub('&quot;', '"')

      # create new blank articles if they don't exist already
      article = collection.articles.where(title: title).first
      unless article
        article = Article.new
        article.title = title
        article.collection = collection
        article.created_by_id = Current.user.id if Current.user.present?
        article.save! unless preview_mode
      end
      link_id = preview_mode ? nil : create_link(article, display_text, text_type)

      # now build the replacement element, carrying over the original children
      link_element = REXML::Element.new('link')
      element.children.each { |child| link_element.add(child) }
      link_element.add_attribute('target_title', title)
      debug('element='+link_element.inspect)
      debug('article='+article.inspect)
      unless preview_mode
        link_element.add_attribute('target_id', article.id.to_s)
        link_element.add_attribute('link_id', link_id.to_s)
      end
      element.replace_with(link_element)
    end

    processed = ''
    doc.write(processed)
    processed
  end


  # XML-dependent post-processing: collapses runs of directly adjacent <lb/>
  # elements into a single <lb/>. Line breaks separated by text or by any
  # other node are kept.
  #
  # Returns the re-serialized document.
  def postprocess_xml_markup(xml_string)
    doc = REXML::Document.new(xml_string)
    doc.elements.each('//lb') do |element|
      # only an <lb/> whose immediate previous sibling is itself an <lb/>
      # element is redundant
      previous = element.previous_sibling
      next unless previous && previous.node_type == :element && previous.name == 'lb'

      element.parent.elements.delete(element)
    end

    processed = ''
    doc.write(processed)
    processed
  end


  CELL_PREFIX = "<?xml version='1.0' encoding='UTF-8'?><cell>"
  CELL_SUFFIX = '</cell>'

  # Parses a cell's markup into a REXML document rooted at <cell>. Every
  # ampersand in +cell+ is escaped as &amp; before parsing.
  def self.cell_to_xml(cell)
    escaped = cell.gsub('&', '&amp;')
    REXML::Document.new("#{CELL_PREFIX}#{escaped}#{CELL_SUFFIX}")
  end

  # Serializes a document built by cell_to_xml back to cell markup by writing
  # it out and removing the first occurrence of the <cell> wrapper prefix and
  # suffix.
  def self.xml_to_cell(doc)
    serialized = ''
    doc.write(serialized)
    [ CELL_PREFIX, CELL_SUFFIX ].reduce(serialized) { |markup, wrapper| markup.sub(wrapper, '') }
  end

  # Returns the text content of a cell with all markup removed: the
  # serialized text nodes of the parsed cell, concatenated.
  def self.cell_to_plaintext(cell)
    text_nodes = REXML::XPath.match(cell_to_xml(cell), './/text()')
    text_nodes.join
  end

  # Lists the target_title of every <link> in the cell, one per line (each
  # title is followed by a newline).
  def self.cell_to_subject(cell)
    subjects = ''
    cell_to_xml(cell).elements.each('//link') do |link|
      subjects << link.attributes['target_title'] << "\n"
    end
    subjects
  end

  # Lists the category titles of every article linked from the cell, one per
  # line. Links without a target_id attribute are skipped; each remaining
  # link's article is loaded with Article.find.
  def self.cell_to_category(cell)
    categories = ''
    cell_to_xml(cell).elements.each('//link') do |link|
      article_id = link.attributes['target_id']
      next unless article_id

      Article.find(article_id).categories.each do |category|
        categories << category.title << "\n"
      end
    end
    categories
  end

  ##############################################
  # Code to rename links within the text.
  # This assumes that the name change has already
  # taken place within the article table in the DB
  ##############################################
  # Rewrites links to +old_title+ in source_text, and in source_translation
  # when that attribute exists and is not nil, so they point at +new_title+.
  # Whitespace in the old title matches any run of whitespace in the text.
  def rename_article_links(old_title, new_title)
    # Regexp.escape turns ' ' into '\\ '; undo that, then let each run of
    # whitespace match one or more whitespace characters
    escaped_title = Regexp.escape(old_title).gsub('\\ ', ' ')
    title_regex = escaped_title.gsub(/\s+/, '\s+')

    self.source_text = rename_link_in_text(source_text, title_regex, new_title)

    # Articles don't have translations, but we still need to update pages.source_translation
    return unless has_attribute?(:source_translation) && !source_translation.nil?

    self.source_translation = rename_link_in_text(source_translation, title_regex, new_title)
  end

  # Rewrites wikilinks in +text+ whose title matches +title_regex+
  # (case-insensitively).
  #
  # text::        the source text
  # title_regex:: regular-expression source (a String) matching the old title
  # new_title::   the replacement title; an empty string means the link was
  #               deleted, in which case the braces are removed and only the
  #               display text (or the old title) is kept
  #
  # Returns the rewritten text.
  def rename_link_in_text(text, title_regex, new_title)
    if new_title == ''
      # Link deleted, remove [[ ]] but keep the original title text

      # Handle links of the form [[Old Title|Display Text]] => Display Text
      text = text.gsub(/\[\[#{title_regex}\|([^\]]+)\]\]/i, '\1')
      # Handle links of the form [[Old Title]] => Old Title
      text = text.gsub(/\[\[(#{title_regex})\]\]/i, '\1')
    else
      # Block form: new_title is inserted literally, so a title containing
      # backslash sequences (\0, \1, \\) is not mangled by gsub.

      # Replace the title part in [[Old Title|Display Text]]
      text = text.gsub(/\[\[#{title_regex}\|/i) { "[[#{new_title}|" }
      # Replace [[Old Title]] with [[New Title|Old Title]]
      text = text.gsub(/\[\[(#{title_regex})\]\]/i) { "[[#{new_title}|#{Regexp.last_match(1)}]]" }
    end

    text
  end


  # Wraps every line of +text+ in pipe characters, because the Pandoc Pipe
  # Tables extension requires a pipe at the beginning and end of each line.
  def pipe_tables_formatting(text)
    rows = text.split("\n")
    rows.map! { |row| "|#{row}|" }
    rows.join("\n")
  end

  # Renders an XML table as a markdown-style text table with right-justified,
  # width-aligned columns.
  #
  # table_element::    the table node; it must respond to xpath, and its cells
  #                    to text, path and to_s (presumably a Nokogiri node --
  #                    confirm against callers). It is modified: every <lb>
  #                    found is replaced by a space.
  # pandoc_format::    when true, each output line is wrapped in pipes via
  #                    pipe_tables_formatting
  # plaintext_export:: when true, cells are rendered from their text only;
  #                    otherwise each cell is converted with xml_to_pandoc_md
  #
  # Returns the table text followed by two newlines.
  #
  # NOTE(review): the xpath expressions start with "//". If that selects from
  # the document root rather than from table_element, widths and rows would
  # be gathered from every table in the document -- confirm.
  def xml_table_to_markdown_table(table_element, pandoc_format = false, plaintext_export = false)
    text_table = ''

    # clean up in-cell line-breaks
    table_element.xpath('//lb').each { |n| n.replace(' ') }

    # calculate the widths of each column based on max(header, cell[0...end])
    column_count = ([ table_element.xpath('//th').count ] + table_element.xpath('//tr').map { |e| e.xpath('td').count }).max
    column_widths = {}
    1.upto(column_count) do |column_index|
      longest_cell = (table_element.xpath("//tr/td[position()=#{column_index}]").map { |e| e.text().length }.max || 0)
      # heading_length is only a placeholder here; it is reassigned on the next line
      corresponding_heading = heading_length = table_element.xpath("//th[position()=#{column_index}]").first
      heading_length = corresponding_heading.nil? ? 0 : corresponding_heading.text().length
      column_widths[column_index] = [ longest_cell, heading_length ].max
    end

    # print the header as markdown
    cell_strings = []
    table_element.xpath('//th').each_with_index do |e, i|
      # column_widths is keyed from 1, each_with_index counts from 0
      cell_strings << e.text.rjust(column_widths[i+1], ' ')
    end
    text_table << cell_strings.join(' | ') << "\n"

    # print the separator
    text_table << column_count.times.map { |i| ''.rjust(column_widths[i+1], '-') }.join(' | ') << "\n"

    # print each row as markdown
    table_element.xpath('//tr').each do |row_element|
      text_table << row_element.xpath('td').map do |e|
        width = 80 # default for hand-coded tables
        # the cell's column number is read from the trailing td[N] index of its path
        index = e.path.match(/.*td\[(\d+)\]/)
        if index
          width = column_widths[index[1].to_i] || 80
        else
          # no index in the path: fall back to the first column's width
          width = column_widths.values.first
        end

        if plaintext_export
          e.text.rjust(width, ' ')
        else
          inner_html = xml_to_pandoc_md(e.to_s, false, false, nil, false).gsub("\n", '')
          inner_html.rjust(width, ' ')
        end
      end.join(' | ') << "\n"
    end
    if pandoc_format
      text_table = pipe_tables_formatting(text_table)
    end

    "#{text_table}\n\n"
  end



  # Writes +msg+ to the logger at debug level, prefixed with "DEBUG: ".
  def debug(msg)
    prefixed = "DEBUG: #{msg}"
    logger.debug(prefixed)
  end
end

1	module XmlSourceProcessor	1✔
2	def validate_source	1✔
3	if self.source_text.blank?	3,364✔
4	return	3,230✔
5	end
6	validate_links(self.source_text)	134✔
7	end
8
9	def validate_source_translation	1✔
10	if self.source_translation.blank?	3,364✔
11	return	3,317✔
12	end
13	validate_links(self.source_translation)	47✔
14	end
15
16	# check the text for problems or typos with the subject links
17	def validate_links(text)	1✔
18	error_scope = [ :activerecord, :errors, :models, :xml_source_processor ]	181✔
19	# split on all begin-braces
20	tags = text.split('[[')	181✔
21	# remove the initial string which occurs before the first tag
22	debug("validate_source: tags to process are #{tags.inspect}")	181✔
23	tags = tags - [ tags[0] ]	181✔
24	debug("validate_source: massaged tags to process are #{tags.inspect}")	181✔
25	for tag in tags	181✔
26	debug(tag)	112✔
27
28	if tag.include?(']]]')	112✔
29	errors.add(:base, I18n.t('subject_linking_error', scope: error_scope) + I18n.t('tags_should_not_use_3_brackets', scope: error_scope))	1✔
30	return	1✔
31	end
32	unless tag.include?(']]')	111✔
33	tag = tag.strip	1✔
34	errors.add(:base, I18n.t('subject_linking_error', scope: error_scope) + I18n.t('wrong_number_of_closing_braces', tag: '"[['+tag+'"', scope: error_scope))	1✔
35	end
36
37	# just pull the pieces between the braces
38	inner_tag = tag.split(']]')[0]	111✔
39	if inner_tag =~ /^\s*$/	111✔
40	errors.add(:base, I18n.t('subject_linking_error', scope: error_scope) + I18n.t('blank_tag_in', tag: '"[['+tag+'"', scope: error_scope))	1✔
41	end
42
43	# check for unclosed single bracket
44	if inner_tag.include?('[')	111✔
45	unless inner_tag.include?(']')	1!
46	errors.add(:base, I18n.t('subject_linking_error', scope: error_scope) + I18n.t('unclosed_bracket_within', tag: '"'+inner_tag+'"', scope: error_scope))	1✔
47	end
48	end
49	# check for blank title or display name with pipes
50	if inner_tag.include?('\|')	111✔
51	tag_parts = inner_tag.split('\|')	13✔
52	debug("validate_source: inner tag parts are #{tag_parts.inspect}")	13✔
53	if tag_parts[0] =~ /^\s*$/	13✔
54	errors.add(:base, I18n.t('subject_linking_error', scope: error_scope) + I18n.t('blank_subject_in', tag: '"[['+inner_tag+']]"', scope: error_scope))	1✔
55	end
56	if tag_parts[1] =~ /^\s*$/	13✔
57	errors.add(:base, I18n.t('subject_linking_error', scope: error_scope) + I18n.t('blank_text_in', tag: '"[['+inner_tag+']]"', scope: error_scope))	1✔
58	end
59	end
60	end
61	# return errors.size > 0
62	end
63
64	def source_text=(text)	1✔
65	self.source_text_will_change!	166✔
66	super	166✔
67	end
68
69	def source_translation=(text)	1✔
70	self.source_translation_will_change!	53✔
71	super	53✔
72	end
73
74	##############################################
75	# All code to convert transcriptions from source
76	# format to canonical xml format belongs here.
77	##############################################
78	def process_source	1✔
79	if source_text_changed?	242✔
80	self.xml_text = wiki_to_xml(self, Page::TEXT_TYPE::TRANSCRIPTION)	59✔
81	end
82
83	if self.respond_to?(:source_translation) && source_translation_changed?	242✔
84	self.xml_translation = wiki_to_xml(self, Page::TEXT_TYPE::TRANSLATION)	12✔
85	end
86	end
87
88	def wiki_to_xml(page, text_type)	1✔
89	subjects_disabled = page.collection.subjects_disabled	77✔
90
91	source_text = case text_type	77✔
92	when Page::TEXT_TYPE::TRANSCRIPTION	64✔
93	page.source_text	64✔
94	when Page::TEXT_TYPE::TRANSLATION	13✔
95	page.source_translation	13✔
96	else	×
NEW 97	''	×
98	end
99
100	xml_string = String.new(source_text)	77✔
101	xml_string = process_latex_snippets(xml_string)	77✔
102	xml_string = clean_bad_braces(xml_string)	77✔
103	xml_string = clean_script_tags(xml_string)	77✔
104	xml_string = process_square_braces(xml_string) unless subjects_disabled	77✔
105	xml_string = process_linewise_markup(xml_string)	77✔
106	xml_string = process_line_breaks(xml_string)	77✔
107	xml_string = valid_xml_from_source(xml_string)	77✔
108	xml_string = update_links_and_xml(xml_string, false, text_type)	77✔
109	xml_string = postprocess_xml_markup(xml_string)	77✔
110	postprocess_sections	77✔
111	xml_string	77✔
112	end
113
114
115	# remove script tags from HTML to prevent javascript injection
116	def clean_script_tags(text)	1✔
117	# text.gsub(/<script.*?<\/script>/m, '')
118	text.gsub(/<\/?script.*?>/m, '')	77✔
119	end
120
121	BAD_SHIFT_REGEX = /\[\[([[[:alpha:]][[:blank:]]\|,\(\)\-[[:digit:]]]+)\}\}/	1✔
122	def clean_bad_braces(text)	1✔
123	text.gsub BAD_SHIFT_REGEX, '[[\\1]]'	77✔
124	end
125
126	BRACE_REGEX = /\[\[.*?\]\]/m	1✔
127	def process_square_braces(text)	1✔
128	# find all the links
129	wikilinks = text.scan(BRACE_REGEX)	74✔
130	wikilinks.each do \|wikilink_contents\|	74✔
131	# strip braces
132	munged = wikilink_contents.sub('[[', '')	29✔
133	munged = munged.sub(']]', '')	29✔
134
135	# extract the title and display
136	if munged.include? '\|'	29✔
137	parts = munged.split '\|'	10✔
138	title = parts[0]	10✔
139	verbatim = parts[1]	10✔
140	else	19✔
141	title = munged	19✔
142	verbatim = munged	19✔
143	end
144
145	title = canonicalize_title(title)	29✔
146
147	replacement = "<link target_title=\"#{title}\">#{verbatim}</link>"	29✔
148	text.sub!(wikilink_contents, replacement)	29✔
149	end
150
151	text	74✔
152	end
153
154	def remove_square_braces(text)	1✔
155	new_text = text.scan(BRACE_REGEX)	3✔
156	new_text.each do \|results\|	3✔
157	changed = results	3✔
158	# remove title
159	if results.include?('\|')	3!
160	changed = results.sub(/\[\[.*?\\|/, '')	×
161	end
162	changed = changed.sub('[[', '')	3✔
163	changed = changed.sub(']]', '')	3✔
164
165	text.sub!(results, changed)	3✔
166	end
167	text	3✔
168	end
169
170	LATEX_SNIPPET = /(\{\{tex:?(.*?):?tex\}\})/m	1✔
171	def process_latex_snippets(text)	1✔
172	return text unless self.respond_to? :tex_figures	77✔
173	replacements = {}	62✔
174	figures = self.tex_figures.to_a	62✔
175
176	text.scan(LATEX_SNIPPET).each_with_index do \|pair, i\|	62✔
177	with_tags = pair[0]	×
178	contents = pair[1]	×
179
180	replacements[with_tags] = "<texFigure position=\"#{i+1}\"/>" # position attribute in acts as list starts with 1	×
181
182	figure = figures[i] \|\| TexFigure.new	×
183	figure.source = contents unless figure.source == contents	×
184	figures[i] = figure	×
185	end
186
187	self.tex_figures = figures	62✔
188	replacements.each_pair do \|s, r\|	62✔
NEW 189	text.sub!(s, r)	×
190	end
191
192	text	62✔
193	end
194
195	HEADER = /\s\\|\s/	1✔
196	SEPARATOR = /---.*\\|/	1✔
197	ROW = HEADER	1✔
198
199	def process_linewise_markup(text)	1✔
200	@tables = []	77✔
201	@sections = []	77✔
202	new_lines = []	77✔
203	current_table = nil	77✔
204	text.lines.each do \|line\|	77✔
205	# first deal with any sections
206	line = process_any_sections(line)	93✔
207	# look for a header
208	if !current_table	93✔
209	if line.match(HEADER)	93!
210	line.chomp	×
211	current_table = { header: [], rows: [], section: @sections.last }	×
212	# fill the header
213	cells = line.split(/\s\\|\s/)	×
214	cells.shift if line.match(/^\\|/) # remove leading pipe	×
NEW 215	current_table[:header] = cells.map { \|cell_title\| cell_title.sub(/^!\s*/, '') }	×
216	heading = cells.map do \|cell\|	×
217	if cell.match(/^!/)	×
NEW 218	"<th class=\"bang\">#{cell.sub(/^!\s*/, '')}</th>"	×
219	else	×
220	"<th>#{cell}</th>"	×
221	end
222	end.join(' ')
223	new_lines << "<table class=\"tabular\">\n<thead>\n<tr>#{heading}</tr></thead>"	×
224	else
225	# no current table, no table contents -- NO-OP	93✔
226	new_lines << line	93✔
227	end
228	else
229	# this is either an end or a separator	×
230	if line.match(SEPARATOR)	×
231	# NO-OP	×
232	elsif line.match(ROW)	×
233	# remove leading and trailing delimiters	×
234	clean_line=line.chomp.sub(/^\s\\|/, '').sub(/\\|\s$/, '')	×
235	# fill the row
236	cells = clean_line.split(/\s\\|\s/, -1) # -1 means "don't prune empty values at the end"	×
237	current_table[:rows] << cells	×
238	rowline = ''	×
239	cells.each_with_index do \|cell, _i\|	×
240	rowline += "<td>#{cell}</td> "	×
241	end
242
243	if current_table[:rows].size == 1	×
244	new_lines << '<tbody>'	×
245	end
246	new_lines << "<tr>#{rowline}</tr>"	×
247	else
248	# finished the last row	×
249	unless current_table[:rows].empty? # only process tables with bodies	×
250	@tables << current_table	×
251	new_lines << '</tbody>'	×
252	end
253	new_lines << '</table><lb/>'	×
254	current_table = nil	×
255	end
256	end
257	end
258
259	if current_table	77✔
260	# unclosed table	×
261	@tables << current_table	×
262	unless current_table[:rows].empty? # only process tables with bodies	×
263	@tables << current_table	×
264	new_lines << '</tbody>'	×
265	end
266	new_lines << '</table><lb/>'	×
267	end
268	# do something with the table data
269	new_lines.join(' ')	77✔
270	end
271
272	def process_any_sections(line)	1✔
273	6.downto(2) do \|depth\|	93✔
274	line.scan(/(={#{depth}}([^=]+)={#{depth}})/).each do \|section_match\|	465✔
275	wiki_title = section_match[1].strip	×
276	if wiki_title.length > 0	×
277	verbatim = XmlSourceProcessor.cell_to_plaintext(wiki_title)	×
NEW 278	safe_verbatim = verbatim.gsub(/"/, '"')	×
279	line = line.sub(section_match.first, "<entryHeading title=\"#{safe_verbatim}\" depth=\"#{depth}\" >#{wiki_title}</entryHeading>")	×
NEW 280	@sections << Section.new(title: wiki_title, depth: depth)	×
281	end
282	end
283	end
284
285	line	93✔
286	end
287
288	def postprocess_sections	1✔
289	@sections.each do \|section\|	77✔
290	doc = XmlSourceProcessor.cell_to_xml(section.title)	×
NEW 291	doc.elements.each('//link') do \|e\|	×
292	title = e.attributes['target_title']	×
NEW 293	article = collection.articles.where(title: title).first	×
294	if article	×
295	e.add_attribute('target_id', article.id.to_s)	×
296	end
297	end
298	section.title = XmlSourceProcessor.xml_to_cell(doc)	×
299	end
300	end
301
302
303	def canonicalize_title(title)	1✔
304	# kill all tags
305	title = title.gsub(/<.*?>/, '')	29✔
306	# linebreaks -> spaces
307	title = title.gsub(/\n/, ' ')	29✔
308	# multiple spaces -> single spaces
309	title = title.gsub(/\s+/, ' ')	29✔
310	# change double quotes to proper xml
311	title = title.gsub(/\"/, '"')	29✔
312	title	29✔
313	end
314
315	# transformations converting source mode transcription to xml
316	def process_line_breaks(text)	1✔
317	text="<p>#{text}</p>"	77✔
318	text = text.gsub(/\s\n\s\n\s*/, '</p><p>')	77✔
319	text = text.gsub(/([[:word:]]+)-\r\n\s*/, '\1<lb break="no" />')	77✔
320	text = text.gsub(/\r\n\s*/, '<lb/>')	77✔
321	text = text.gsub(/([[:word:]]+)-\n\s*/, '\1<lb break="no" />')	77✔
322	text = text.gsub(/\n\s*/, '<lb/>')	77✔
323	text = text.gsub(/([[:word:]]+)-\r\s*/, '\1<lb break="no" />')	77✔
324	text = text.gsub(/\r\s*/, '<lb/>')	77✔
325	text	77✔
326	end
327
328	def valid_xml_from_source(source)	1✔
329	source = source \|\| ''	77✔
330	safe = source.gsub /\&/, '&'	77✔
331	safe.gsub! /\&amp;/, '&'	77✔
332	safe.gsub! /[^\u0009\u000A\u000D\u0020-\uD7FF\uE000-\uFFFD\u10000-\u10FFFF]/, ' '	77✔
333
334	string = <<EOF	77✔
335	<?xml version="1.0" encoding="UTF-8"?>
336	<page>
337	#{safe}
338	</page>
339	EOF
340	end
341
342	def update_links_and_xml(xml_string, preview_mode = false, text_type)	1✔
343	# first clear out the existing links
344	# log the count of articles before and after
345	clear_links(text_type) unless preview_mode	77!
346	processed = ''	77✔
347	# process it
348	doc = REXML::Document.new xml_string	77✔
349	doc.elements.each('//link') do \|element\|	77✔
350	# default the title to the text if it's not specified
351	if !(title=element.attributes['target_title'])	29!
352	title = element.text	×
353	end
354	# display_text = element.text
355	display_text = ''	29✔
356	element.children.each do \|e\|	29✔
357	display_text += e.to_s	29✔
358	end
359	debug("link display_text = #{display_text}")	29✔
360	# change the xml version of quotes back to double quotes for article title
361	title = title.gsub('"', '"')	29✔
362
363	# create new blank articles if they don't exist already
364	if !(article = collection.articles.where(title: title).first)	29✔
365	article = Article.new	8✔
366	article.title = title	8✔
367	article.collection = collection	8✔
368	article.created_by_id = Current.user.id if Current.user.present?	8✔
369	article.save! unless preview_mode	8!
370	end
371	link_id = create_link(article, display_text, text_type) unless preview_mode	29!
372	# now update the attribute
373	link_element = REXML::Element.new('link')	29✔
374	element.children.each { \|c\| link_element.add(c) }	58✔
375	link_element.add_attribute('target_title', title)	29✔
376	debug('element='+link_element.inspect)	29✔
377	debug('article='+article.inspect)	29✔
378	link_element.add_attribute('target_id', article.id.to_s) unless preview_mode	29!
379	link_element.add_attribute('link_id', link_id.to_s) unless preview_mode	29!
380	element.replace_with(link_element)	29✔
381	end
382	doc.write(processed)	77✔
383	processed	77✔
384	end
385
386
387	# handle XML-dependent post-processing
388	def postprocess_xml_markup(xml_string)	1✔
389	doc = REXML::Document.new xml_string	77✔
390	processed = ''	77✔
391	doc.elements.each('//lb') do \|element\|	77✔
392	if element.previous_element && element.previous_sibling.node_type == :element && element.previous_element.name == 'lb'	6!
393	pre = doc.to_s	×
394	element.parent.elements.delete(element)	×
395	end
396	end
397	doc.write(processed)	77✔
398	processed	77✔
399	end
400
401
402	CELL_PREFIX = "<?xml version='1.0' encoding='UTF-8'?><cell>"	1✔
403	CELL_SUFFIX = '</cell>'	1✔
404
# Parses a table cell's markup into a REXML document rooted at <cell>.
#
# Every ampersand in the cell is escaped to &amp; before parsing, because a
# bare "&" is not well-formed XML and makes REXML raise. Ampersands that
# already start an entity are escaped as well, so "&lt;" in the cell is read
# back as the literal text "&lt;".
#
# @param cell [String] cell markup (text plus inline XML elements)
# @return [REXML::Document]
def self.cell_to_xml(cell)
  REXML::Document.new(CELL_PREFIX + cell.gsub('&', '&amp;') + CELL_SUFFIX)
end
408
# Serializes a cell document back to cell markup by writing it out and
# stripping the first occurrence of the CELL_PREFIX / CELL_SUFFIX wrapper.
#
# @param doc [REXML::Document] document produced from a wrapped cell
# @return [String] the markup that sat between <cell> and </cell>
def self.xml_to_cell(doc)
  serialized = +''
  doc.write(serialized)
  without_prefix = serialized.sub(CELL_PREFIX, '')
  without_prefix.sub(CELL_SUFFIX, '')
end
414
# Returns the text content of a cell with all element markup removed.
#
# The text nodes are collected in document order and joined using their
# string form.
# NOTE(review): REXML text nodes stringify to their serialized form, so
# entities presumably stay escaped (e.g. "&amp;") - confirm callers expect that.
#
# @param cell [String] cell markup
# @return [String] concatenated text nodes
def self.cell_to_plaintext(cell)
  doc = cell_to_xml(cell)
  # XPath.match returns the matched text nodes directly; each_element only
  # yields Element nodes, so a block passed to it never sees the text.
  REXML::XPath.match(doc, './/text()').join
end
419
# Lists the subjects linked from a cell: the target_title attribute of every
# <link> element, each followed by a newline.
#
# @param cell [String] cell markup
# @return [String] newline-terminated titles, or "" when there are no links
def self.cell_to_subject(cell)
  links = cell_to_xml(cell).elements.to_a('//link')
  links.each_with_object(+'') do |link, listing|
    listing << link.attributes['target_title']
    listing << "\n"
  end
end
430
# Lists the category titles of every article linked from a cell.
#
# For each <link> carrying a target_id, the article is loaded with
# Article.find and each of its categories' titles is appended, one per line.
# Links without a target_id are skipped.
#
# @param cell [String] cell markup
# @return [String] newline-terminated category titles
def self.cell_to_category(cell)
  listing = +''
  cell_to_xml(cell).elements.each('//link') do |link|
    article_id = link.attributes['target_id']
    next unless article_id

    Article.find(article_id).categories.each do |category|
      listing << category.title << "\n"
    end
  end
  listing
end
446
447	##############################################
448	# Code to rename links within the text.
449	# This assumes that the name change has already
450	# taken place within the article table in the DB
451	##############################################
# Rewrites [[...]] links to old_title in this record's source_text and, when
# the record has a non-nil source_translation attribute, in that text too.
#
# The old title is turned into a regex fragment: it is regex-escaped, the
# escaped spaces are restored, and every run of whitespace becomes \s+ so the
# link still matches when its spacing differs.
#
# @param old_title [String] title currently used in the links
# @param new_title [String] replacement title ('' removes the link markup)
def rename_article_links(old_title, new_title)
  # Regexp.escape turns ' ' into '\\ '; undo that before generalising spaces
  unescaped_spaces = Regexp.escape(old_title).gsub('\\ ', ' ')
  title_regex = unescaped_spaces.gsub(/\s+/, '\s+')

  self.source_text = rename_link_in_text(source_text, title_regex, new_title)

  # Articles don't have translations, but pages.source_translation still needs updating
  return unless has_attribute?(:source_translation) && !source_translation.nil?

  self.source_translation = rename_link_in_text(source_translation, title_regex, new_title)
end
465
# Renames or removes wiki-style links to one title inside text.
#
# With an empty new_title the link markup is dropped and the visible text is
# kept:
#   [[Old Title|Display Text]] => Display Text
#   [[Old Title]]              => Old Title
# Otherwise the target is replaced and the visible text is preserved:
#   [[Old Title|Display Text]] => [[New Title|Display Text]]
#   [[Old Title]]              => [[New Title|Old Title]]
# Matching is case-insensitive.
#
# @param text [String] source text containing [[...]] links
# @param title_regex [String] regex source matching the old title
# @param new_title [String] replacement title, or '' to delete the link
# @return [String] the rewritten text
def rename_link_in_text(text, title_regex, new_title)
  bare_link = /\[\[(#{title_regex})\]\]/i

  if new_title == ''
    # Link deleted, remove [[ ]] but keep the original title text
    text = text.gsub(/\[\[#{title_regex}\|([^\]]+)\]\]/i, '\1')
    text.gsub(bare_link, '\1')
  else
    # Block form keeps new_title literal: in a replacement *string*, backslash
    # sequences inside the title would be read as back-references.
    text = text.gsub(/\[\[#{title_regex}\|/i) { "[[#{new_title}|" }
    text.gsub(bare_link) { "[[#{new_title}|#{Regexp.last_match(1)}]]" }
  end
end
483
484
# Wraps every line of text in leading and trailing pipe characters, as the
# Pandoc pipe-tables extension requires for each table row.
#
# @param text [String] newline-separated table rows
# @return [String] the rows joined by "\n", each wrapped in "|"
def pipe_tables_formatting(text)
  wrapped_rows = text.split("\n").map { |row| '|' + row + '|' }
  wrapped_rows.join("\n")
end
490
# Renders an XML table as a right-aligned markdown table.
#
# Output is a header row, a dashed separator row, then one line per <tr>,
# with cells joined by " | " and the whole table followed by two newlines.
#
# NOTE(review): the queries use absolute '//' XPaths, which presumably search
# the whole document rather than just table_element - confirm this is
# intended when a document holds more than one table.
#
# @param table_element the table node (responds to #xpath)
# @param pandoc_format [Boolean] wrap each line in pipes for Pandoc pipe tables
# @param plaintext_export [Boolean] use the cells' plain text instead of
#   converting their markup with xml_to_pandoc_md
# @return [String] the markdown table
def xml_table_to_markdown_table(table_element, pandoc_format = false, plaintext_export = false)
  text_table = +''

  # clean up in-cell line-breaks
  table_element.xpath('//lb').each { |line_break| line_break.replace(' ') }

  # column count is the larger of the header count and the widest row
  cells_per_row = table_element.xpath('//tr').map { |row| row.xpath('td').count }
  column_count = ([ table_element.xpath('//th').count ] + cells_per_row).max

  # width of each column (1-based) is max(header length, longest cell)
  column_widths = (1..column_count).each_with_object({}) do |column_index, widths|
    cell_lengths = table_element.xpath("//tr/td[position()=#{column_index}]").map { |cell| cell.text.length }
    longest_cell = cell_lengths.max || 0
    heading = table_element.xpath("//th[position()=#{column_index}]").first
    heading_length = heading ? heading.text.length : 0
    widths[column_index] = [ longest_cell, heading_length ].max
  end

  # print the header as markdown
  header_cells = table_element.xpath('//th').each_with_index.map do |heading, i|
    heading.text.rjust(column_widths[i + 1], ' ')
  end
  text_table << header_cells.join(' | ') << "\n"

  # print the separator
  text_table << (1..column_count).map { |column_index| '-' * column_widths[column_index] }.join(' | ') << "\n"

  # print each row as markdown
  table_element.xpath('//tr').each do |row_element|
    row_cells = row_element.xpath('td').map do |cell|
      # the cell's position is read from its node path, e.g. ".../td[3]"
      position = cell.path.match(/.*td\[(\d+)\]/)
      width =
        if position
          column_widths[position[1].to_i] || 80 # 80 is the default for hand-coded tables
        else
          column_widths.values.first
        end

      content =
        if plaintext_export
          cell.text
        else
          xml_to_pandoc_md(cell.to_s, false, false, nil, false).gsub("\n", '')
        end
      content.rjust(width, ' ')
    end
    text_table << row_cells.join(' | ') << "\n"
  end

  text_table = pipe_tables_formatting(text_table) if pandoc_format

  "#{text_table}\n\n"
end
542
543
544
# Writes msg to the logger at debug level, prefixed with "DEBUG: ".
#
# @param msg [#to_s] message to log
def debug(msg)
  prefixed = "DEBUG: #{msg}"
  logger.debug(prefixed)
end
548	end

benwbrum / fromthepage / 17387282326

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous