13402333050

Committed 18 Feb 2025 11:48PM UTC coverage: 61.822% (+1.0%) from 60.846%

Build # 13402333050

Build Type

push

github

Committed by

web-flow

Commit Message

Merge pull request #4532 from benwbrum/4528-add-linebreak-after-table

4528 - Add linebreak after table

Run Details

1543 of 2994 branches covered (51.54%)

Branch coverage included in aggregate %.

2 of 16 new or added lines in 1 file covered. (12.5%)

3 existing lines in 2 files now uncovered.

6994 of 10815 relevant lines covered (64.67%)

81.79 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

67.57

/app/models/xml_source_processor.rb

module XmlSourceProcessor

  # NOTE: these assignments execute in the module body, so they set instance
  # variables on the XmlSourceProcessor module object itself. They do not
  # initialize the flags on objects that include this module; on those
  # objects @text_dirty / @translation_dirty stay unset (nil) until the
  # source_text= / source_translation= writers below run.
  @text_dirty = false
  @translation_dirty = false
  #@fields = false

  # Marks the transcription text as changed, then hands the assignment on
  # via +super+ (bare +super+ forwards +text+ unchanged to the next
  # definition of this writer in the ancestor chain -- presumably the
  # attribute writer of the including model; confirm against the includer).
  # process_source reads @text_dirty to decide whether to rebuild xml_text.
  def source_text=(text)
    @text_dirty = true
    super
  end

  # Marks the translation text as changed, then hands the assignment on via
  # +super+ (bare +super+ forwards +translation+ unchanged to the next
  # definition of this writer in the ancestor chain -- presumably the
  # attribute writer of the including model; confirm against the includer).
  # process_source reads @translation_dirty to decide whether to rebuild
  # xml_translation.
  def source_translation=(translation)
    @translation_dirty = true
    super
  end

  # Runs the subject-link checks over the transcription text.
  # Does nothing (returns nil) when source_text is blank.
  def validate_source
    return if source_text.blank?

    validate_links(source_text)
  end

  # Runs the subject-link checks over the translation text.
  # Does nothing (returns nil) when source_translation is blank.
  def validate_source_translation
    return if source_translation.blank?

    validate_links(source_translation)
  end

  # Checks +text+ for malformed subject links (the [[Title|display text]]
  # syntax) and records every problem found via errors.add(:base, ...).
  #
  # Problems reported (message keys under the xml_source_processor scope):
  # * tags_should_not_use_3_brackets -- a tag closed with "]]]"; checking
  #   stops at the first such tag
  # * wrong_number_of_closing_braces -- a "[[" with no "]]" before the next tag
  # * blank_tag_in                   -- nothing (or only whitespace) between the braces
  # * unclosed_bracket_within        -- a "[" inside the tag with no "]"
  # * blank_subject_in / blank_text_in -- an empty side of a "|" separator
  #
  # text - String to check; callers in this module only pass non-blank text.
  def validate_links(text)
    error_scope = [:activerecord, :errors, :models, :xml_source_processor]
    report = lambda do |key, **options|
      errors.add(:base, I18n.t('subject_linking_error', scope: error_scope) + I18n.t(key, **options, scope: error_scope))
    end

    # split on all begin-braces
    tags = text.split('[[')
    debug("validate_source: tags to process are #{tags.inspect}")
    # Drop only the leading fragment (the text before the first tag).
    # Array#- would also discard every later fragment that happens to be
    # equal to it, silently skipping those tags.
    tags = tags.drop(1)
    debug("validate_source: massaged tags to process are #{tags.inspect}")

    tags.each do |tag|
      debug(tag)

      if tag.include?(']]]')
        report.call('tags_should_not_use_3_brackets')
        return
      end
      unless tag.include?(']]')
        tag = tag.strip
        report.call('wrong_number_of_closing_braces', tag: '"[[' + tag + '"')
      end

      # just pull the piece between the braces; partition always yields a
      # String, whereas split(']]')[0] is nil for an empty tag such as "[[]]"
      inner_tag = tag.partition(']]').first
      report.call('blank_tag_in', tag: '"[[' + tag + '"') if inner_tag =~ /^\s*$/

      # check for unclosed single bracket
      if inner_tag.include?('[') && !inner_tag.include?(']')
        report.call('unclosed_bracket_within', tag: '"' + inner_tag + '"')
      end

      # check for blank title or display name with pipes
      next unless inner_tag.include?("|")

      # limit -1 keeps trailing empty parts so "[[Title|]]" and "[[|]]" are caught
      tag_parts = inner_tag.split('|', -1)
      debug("validate_source: inner tag parts are #{tag_parts.inspect}")
      report.call('blank_subject_in', tag: '"[[' + inner_tag + ']]"') if tag_parts[0] =~ /^\s*$/
      report.call('blank_text_in', tag: '"[[' + inner_tag + ']]"') if tag_parts[1] =~ /^\s*$/
    end
  end

  ##############################################
  # All code to convert transcriptions from source
  # format to canonical xml format belongs here.
  ##############################################
  # Regenerates the canonical XML for whichever source fields were assigned
  # since the flags were last set: xml_text when @text_dirty is truthy,
  # xml_translation when @translation_dirty is truthy.
  def process_source
    self.xml_text = wiki_to_xml(self, Page::TEXT_TYPE::TRANSCRIPTION) if @text_dirty

    self.xml_translation = wiki_to_xml(self, Page::TEXT_TYPE::TRANSLATION) if @translation_dirty
  end

  # Converts one of +page+'s wiki-markup source fields into the XML string
  # stored alongside it.
  #
  # page      - object providing collection, source_text and source_translation
  # text_type - Page::TEXT_TYPE::TRANSCRIPTION or Page::TEXT_TYPE::TRANSLATION;
  #             any other value is treated as empty source text
  #
  # Returns the processed XML String. The helper calls below are made on
  # self, not on +page+ (process_source passes self for +page+).
  def wiki_to_xml(page, text_type)

    subjects_disabled = page.collection.subjects_disabled

    source_text = case text_type
                  when Page::TEXT_TYPE::TRANSCRIPTION
                    page.source_text
                  when Page::TEXT_TYPE::TRANSLATION
                    page.source_translation
                  else
                    ""
                  end

    # work on a copy: several of the steps below modify the string in place
    xml_string = String.new(source_text)
    # each step consumes the output of the previous one, so order matters
    xml_string = process_latex_snippets(xml_string)
    xml_string = clean_bad_braces(xml_string)
    xml_string = clean_script_tags(xml_string)
    # [[subject links]] are only converted when the collection has subjects enabled
    xml_string = process_square_braces(xml_string) unless subjects_disabled
    xml_string = process_linewise_markup(xml_string)
    xml_string = process_line_breaks(xml_string)
    xml_string = valid_xml_from_source(xml_string)
    xml_string = update_links_and_xml(xml_string, false, text_type)
    xml_string = postprocess_xml_markup(xml_string)
    # updates the titles of the sections collected by process_linewise_markup;
    # its return value is not used here
    postprocess_sections
    xml_string
  end


  # Removes opening and closing <script> tags to prevent javascript
  # injection. Only the tags themselves are removed; any text between them
  # is kept.
  #
  # The match is case-insensitive so <SCRIPT> cannot slip through, and the
  # word boundary keeps unrelated tags whose name merely starts with
  # "script" (e.g. <scripture>) intact.
  #
  # text - String to clean (not modified).
  #
  # Returns a new String.
  def clean_script_tags(text)
    # text.gsub(/<script.*?<\/script>/m, '')
    text.gsub(/<\/?script\b.*?>/mi, '')
  end

  # Matches a link opened with "[[" but closed with "}}", capturing the
  # contents (letters, digits, blanks and | , ( ) - only).
  BAD_SHIFT_REGEX = /\[\[([[[:alpha:]][[:blank:]]|,\(\)\-[[:digit:]]]+)\}\}/

  # Rewrites links closed with "}}" instead of "]]" so they use the proper
  # closing braces. Returns a new String; +text+ is not modified.
  def clean_bad_braces(text)
    text.gsub(BAD_SHIFT_REGEX) { "[[#{Regexp.last_match(1)}]]" }
  end

  # Matches one [[...]] wiki link, shortest match, across line breaks.
  BRACE_REGEX = /\[\[.*?\]\]/m

  # Converts every [[Title]] / [[Title|display text]] wiki link in +text+
  # into <link target_title="Title">display text</link>.
  #
  # The replacement is produced by a block rather than passed as a
  # replacement string, so backslash sequences in the transcribed text
  # (e.g. "\0", "\1") are kept literally instead of being expanded as
  # back-references.
  #
  # text - String to convert; it is modified in place.
  #
  # Returns +text+.
  def process_square_braces(text)
    text.gsub!(BRACE_REGEX) do |wikilink|
      # strip braces
      inner = wikilink.sub('[[', '').sub(']]', '')

      # extract the title and display text
      if inner.include?('|')
        # limit -1 keeps empty parts, so "[[|]]" yields two empty strings
        # instead of a nil title
        title, verbatim = inner.split('|', -1)
      else
        title = verbatim = inner
      end

      "<link target_title=\"#{canonicalize_title(title)}\">#{verbatim}</link>"
    end

    text
  end

  # Replaces every [[Title]] / [[Title|display text]] wiki link in +text+
  # with just its display text (the title itself when there is no pipe).
  #
  # The replacement is produced by a block rather than passed as a
  # replacement string, so backslash sequences in the display text are kept
  # literally instead of being expanded as back-references.
  #
  # text - String to convert; it is modified in place.
  #
  # Returns +text+.
  def remove_square_braces(text)
    text.gsub!(BRACE_REGEX) do |wikilink|
      # remove title
      display = wikilink.include?('|') ? wikilink.sub(/\[\[.*?\|/, '') : wikilink
      display.sub('[[', '').sub(']]', '')
    end
    text
  end

  # Matches one {{tex: ... :tex}} snippet; group 1 is the whole snippet,
  # group 2 the text between the markers.
  LATEX_SNIPPET = /(\{\{tex:?(.*?):?tex\}\})/m

  # Replaces each {{tex: ... :tex}} snippet in +text+ with a
  # <texFigure position="N"/> placeholder (N counts from 1 in order of
  # appearance) and stores the snippet contents in the N-th entry of
  # tex_figures, reusing an existing entry at that index when there is one.
  #
  # Every occurrence is handled in a single left-to-right pass, so two
  # snippets with identical text each get their own placeholder and
  # position (a lookup keyed by snippet text would merge them).
  #
  # Does nothing when the receiver has no tex_figures.
  #
  # text - String to convert; it is modified in place.
  #
  # Returns +text+.
  def process_latex_snippets(text)
    return text unless respond_to?(:tex_figures)

    figures = tex_figures.to_a
    position = 0

    text.gsub!(LATEX_SNIPPET) do
      contents = Regexp.last_match(2)

      figure = figures[position] || TexFigure.new
      figure.source = contents unless figure.source == contents
      figures[position] = figure

      position += 1 # position attribute in acts as list starts with 1
      "<texFigure position=\"#{position}\"/>"
    end

    self.tex_figures = figures
    text
  end

  HEADER = /\s\|\s/
  SEPARATOR = /---.*\|/
  ROW = HEADER

  def process_linewise_markup(text)
    @tables = []
    @sections = []
    new_lines = []
    current_table = nil
    text.lines.each do |line|
      # first deal with any sections
      line = process_any_sections(line)
      # look for a header
      if !current_table
        if line.match(HEADER)
          line.chomp
          current_table = { header: [], rows: [], section: @sections.last }
          # fill the header
          cells = line.split(/\s*\|\s*/)
          cells.shift if line.match(/^\|/) # remove leading pipe
          current_table[:header] = cells.map{ |cell_title| cell_title.sub(/^!\s*/,'') }
          heading = cells.map do |cell|
            if cell.match(/^!/)
              "<th class=\"bang\">#{cell.sub(/^!\s*/,'')}</th>"
            else
              "<th>#{cell}</th>"
            end
          end.join(' ')
          new_lines << "<table class=\"tabular\">\n<thead>\n<tr>#{heading}</tr></thead>"
        else
          # no current table, no table contents -- NO-OP
          new_lines << line
        end
      else
        # this is either an end or a separator
        if line.match(SEPARATOR)
          # NO-OP
        elsif line.match(ROW)
          # remove leading and trailing delimiters
          clean_line=line.chomp.sub(/^\s*\|/, '').sub(/\|\s*$/, '')
          # fill the row
          cells = clean_line.split(/\s*\|\s*/, -1) # -1 means "don't prune empty values at the end"
          current_table[:rows] << cells
          rowline = ''
          cells.each_with_index do |cell, i|
            head = current_table[:header][i]
            role_string = " role=\"#{head}\""
            rowline += "<td#{role_string}>#{cell}</td> "
          end

          if current_table[:rows].size == 1
            new_lines << '<tbody>'
          end
          new_lines << "<tr>#{rowline}</tr>"
        else
          # finished the last row
          unless current_table[:rows].empty? # only process tables with bodies
            @tables << current_table
            new_lines << '</tbody>'
          end
          new_lines << '</table><lb/>'
          current_table = nil
        end
      end
    end

    if current_table
      # unclosed table
      @tables << current_table
      unless current_table[:rows].empty? # only process tables with bodies
        @tables << current_table
        new_lines << '</tbody>'
      end
      new_lines << '</table><lb/>'
    end
    # do something with the table data
    new_lines.join(' ')
  end

  # Converts wiki headings in +line+ ("== Title ==" through
  # "====== Title ======") into <entryHeading> elements and appends a
  # Section (title and depth) to @sections for each heading found. Deeper
  # headings are handled first so their "=" runs are gone before the
  # shallower patterns are tried.
  #
  # The replacement is produced by a block rather than passed as a
  # replacement string, so backslash sequences in a heading are kept
  # literally instead of being expanded as back-references.
  #
  # line - one line of source text (not modified).
  #
  # Returns the converted line.
  def process_any_sections(line)
    6.downto(2) do |depth|
      line.scan(/(={#{depth}}([^=]+)={#{depth}})/).each do |whole_match, raw_title|
        wiki_title = raw_title.strip
        next if wiki_title.empty?

        verbatim = XmlSourceProcessor.cell_to_plaintext(wiki_title)
        safe_verbatim = verbatim.gsub(/"/, "&quot;")
        heading = "<entryHeading title=\"#{safe_verbatim}\" depth=\"#{depth}\" >#{wiki_title}</entryHeading>"
        line = line.sub(whole_match) { heading }
        @sections << Section.new(:title => wiki_title, :depth => depth)
      end
    end

    line
  end

  # For every section collected in @sections, adds a target_id attribute to
  # each <link> in the section title whose target_title is the title of an
  # article in this collection, then stores the updated title back on the
  # section. Links with no matching article are left unchanged.
  def postprocess_sections
    @sections.each do |section|
      title_doc = XmlSourceProcessor.cell_to_xml(section.title)
      title_doc.elements.each("//link") do |link|
        target = link.attributes['target_title']
        found = collection.articles.where(:title => target).first
        link.add_attribute('target_id', found.id.to_s) if found
      end
      section.title = XmlSourceProcessor.xml_to_cell(title_doc)
    end
  end


  # Normalizes a link title for use as a target_title attribute value.
  # Applied in order: markup tags are removed, line breaks become spaces,
  # runs of whitespace collapse to a single space, and double quotes become
  # &quot;. Returns a new String.
  def canonicalize_title(title)
    substitutions = [
      [/<.*?>/, ''],      # kill all tags
      [/\n/, ' '],        # linebreaks -> spaces
      [/\s+/, ' '],       # multiple spaces -> single spaces
      [/\"/, '&quot;']    # change double quotes to proper xml
    ]
    substitutions.inject(title) { |result, (pattern, replacement)| result.gsub(pattern, replacement) }
  end

  # Converts the line structure of source-mode text to XML: the text is
  # wrapped in <p>..</p>, blank lines become paragraph boundaries, a word
  # hyphenated across a line break is joined with <lb break="no" />, and
  # every remaining line break (\r\n, \n or \r, plus following whitespace)
  # becomes <lb/>. Returns a new String.
  def process_line_breaks(text)
    "<p>#{text}</p>"
      .gsub(/\s*\n\s*\n\s*/, "</p><p>")
      .gsub(/([[:word:]]+)-\r\n\s*/, '\1<lb break="no" />')
      .gsub(/\r\n\s*/, "<lb/>")
      .gsub(/([[:word:]]+)-\n\s*/, '\1<lb break="no" />')
      .gsub(/\n\s*/, "<lb/>")
      .gsub(/([[:word:]]+)-\r\s*/, '\1<lb break="no" />')
      .gsub(/\r\s*/, "<lb/>")
  end

  def valid_xml_from_source(source)
    source = source || ""
    safe = source.gsub /\&/, '&amp;'
    safe.gsub! /\&amp;amp;/, '&amp;'
    safe.gsub! /[^\u0009\u000A\u000D\u0020-\uD7FF\uE000-\uFFFD\u10000-\u10FFFF]/, ' '

    string = <<EOF
    <?xml version="1.0" encoding="UTF-8"?>
      <page>
        #{safe}
      </page>
EOF
  end

  # Resolves every <link> element in +xml_string+ against the collection's
  # articles and returns the resulting XML as a String.
  #
  # xml_string   - XML document text (parsed with REXML)
  # preview_mode - when true nothing is persisted: existing links are not
  #                cleared, new articles are not saved, no link records are
  #                created and no target_id / link_id attributes are written
  # text_type    - passed through to clear_links and create_link
  #
  # Outside preview mode this clears the existing links for +text_type+,
  # creates (and saves) an Article for every link title that has none yet,
  # and calls create_link for every link element.
  def update_links_and_xml(xml_string, preview_mode=false, text_type)
    # first clear out the existing links
    # log the count of articles before and after
    old_article_count = collection.articles.count
    logger.info("ISSUE4269 old_article_count = #{old_article_count}")
    clear_links(text_type) unless preview_mode
    processed = ""
    # process it
    doc = REXML::Document.new xml_string
    doc.elements.each("//link") do |element|
      # default the title to the text if it's not specified
      if !(title=element.attributes['target_title'])
        title = element.text
      end
      #display_text = element.text
      # the display text is the serialized form of all child nodes, so
      # markup nested inside the link is included
      display_text = ""
      element.children.each do |e|
        display_text += e.to_s
      end
      debug("link display_text = #{display_text}")
      #change the xml version of quotes back to double quotes for article title
      title = title.gsub('&quot;', '"')

      # create new blank articles if they don't exist already
      if !(article = collection.articles.where(:title => title).first)
        article = Article.new
        article.title = title
        article.collection = collection
        article.created_by_id = User.current_user.id if User.current_user.present?
        article.save! unless preview_mode
      end
      # link_id stays nil in preview mode
      link_id = create_link(article, display_text, text_type) unless preview_mode
      # now update the attribute
      # a fresh <link> element is built and swapped in for the original one
      link_element = REXML::Element.new("link")
      element.children.each { |c| link_element.add(c) }
      link_element.add_attribute('target_title', title)
      debug("element="+link_element.inspect)
      debug("article="+article.inspect)
      link_element.add_attribute('target_id', article.id.to_s) unless preview_mode
      link_element.add_attribute('link_id', link_id.to_s) unless preview_mode
      element.replace_with(link_element)
    end
    # the article count is expected never to shrink here; a drop is logged as an error
    new_article_count = collection.articles.count
    logger.info("ISSUE4269 new_article_count = #{new_article_count}")
    if new_article_count < old_article_count
      logger.error("ISSUE4269 ERROR new_article_count #{new_article_count} < old_article_count #{old_article_count}!")
    end
    doc.write(processed)
    return processed
  end


  # Handles XML-dependent post-processing: removes every <lb/> element that
  # directly follows another <lb/> (no text or other node in between), so a
  # run of line breaks collapses to a single one.
  #
  # xml_string - XML document text (parsed with REXML).
  #
  # Returns the processed document as a String.
  def postprocess_xml_markup(xml_string)
    doc = REXML::Document.new(xml_string)
    doc.elements.each("//lb") do |element|
      previous = element.previous_element
      # previous_element skips text nodes, so also require the immediately
      # preceding sibling to be an element
      next unless previous && element.previous_sibling.node_type == :element && previous.name == 'lb'

      element.parent.elements.delete(element)
    end

    processed = ''
    doc.write(processed)
    processed
  end


  # Wrapper placed around a cell's markup so it can be parsed as a
  # standalone XML document (and stripped again by xml_to_cell).
  CELL_PREFIX = "<?xml version='1.0' encoding='UTF-8'?><cell>"
  CELL_SUFFIX = '</cell>'

  # Parses the markup of one cell into a REXML::Document whose root element
  # is <cell>. Every "&" in +cell+ is escaped to "&amp;" before parsing.
  def self.cell_to_xml(cell)
    escaped = cell.gsub('&','&amp;')
    REXML::Document.new("#{CELL_PREFIX}#{escaped}#{CELL_SUFFIX}")
  end

  # Serializes +doc+ (a document built by cell_to_xml) back to cell markup
  # by writing it out and removing the first CELL_PREFIX and the first
  # CELL_SUFFIX found in the result.
  def self.xml_to_cell(doc)
    serialized = ""
    doc.write(serialized)
    without_prefix = serialized.sub(CELL_PREFIX,'')
    without_prefix.sub(CELL_SUFFIX,'')
  end

  # Returns the text content of +cell+ as a String, without its markup
  # (process_any_sections uses the result as a heading's title attribute).
  #
  # NOTE(review): the result is the +join+ of whatever each_element returns
  # for the './/text()' XPath, not anything built inside the block -- so the
  # exact output depends on REXML's each_element return value; confirm
  # against the REXML version in use before changing this line.
  # NOTE(review): the block calls Kernel#p, which writes to stdout; this
  # looks like leftover debugging output -- confirm whether the block is
  # ever invoked for text nodes and whether it can be removed.
  def self.cell_to_plaintext(cell)
    doc = cell_to_xml(cell)
    doc.each_element('.//text()') { |e| p e.text }.join
  end

  # Lists the subjects linked from +cell+: the target_title of every <link>
  # element, in document order, each followed by a newline.
  # Returns "" when the cell contains no links.
  def self.cell_to_subject(cell)
    subjects = ""
    cell_to_xml(cell).elements.each("//link") do |link|
      subjects << link.attributes['target_title'] << "\n"
    end
    subjects
  end

  # Lists the categories of the articles linked from +cell+: for every
  # <link> element that has a target_id, the title of each category of the
  # article with that id, each followed by a newline. Links without a
  # target_id are skipped. Returns "" when nothing matches.
  def self.cell_to_category(cell)
    categories = ""
    cell_to_xml(cell).elements.each("//link") do |link|
      article_id = link.attributes['target_id']
      next unless article_id

      Article.find(article_id).categories.each do |category|
        categories << category.title << "\n"
      end
    end
    categories
  end

  ##############################################
  # Code to rename links within the text.
  # This assumes that the name change has already
  # taken place within the article table in the DB
  ##############################################
  # Rewrites the wiki links in source_text (and in source_translation, when
  # the record has that attribute and it is not nil) so links to
  # +old_title+ point at +new_title+.
  #
  # The old title is matched literally except that any run of whitespace in
  # it matches one or more whitespace characters in the text.
  def rename_article_links(old_title, new_title)
    escaped_title = Regexp.escape(old_title)
    # Regexp.escape converts ' ' to '\\ ' for some reason -- undo this
    plain_spaces = escaped_title.gsub('\\ ',' ')
    # convert multiple whitespaces into 1+n space characters
    title_regex = plain_spaces.gsub(/\s+/, '\s+')

    self.source_text = rename_link_in_text(source_text, title_regex, new_title)

    # Articles don't have translations, but we still need to update pages.source_translation
    return unless has_attribute?(:source_translation) && !source_translation.nil?

    self.source_translation = rename_link_in_text(source_translation, title_regex, new_title)
  end

  # Points every wiki link in +text+ whose title matches +title_regex+ at
  # +new_title+, keeping what the reader sees unchanged:
  # * [[Old Title|Display Text]] becomes [[New Title|Display Text]]
  # * [[Old Title]]              becomes [[New Title|Old Title]]
  #
  # Replacements are produced by blocks rather than passed as replacement
  # strings, so a backslash in +new_title+ is inserted literally instead of
  # being expanded as a back-reference.
  #
  # text        - String to rewrite (not modified)
  # title_regex - regular-expression source (String) matching the old title
  # new_title   - String
  #
  # Returns the rewritten String.
  def rename_link_in_text(text, title_regex, new_title)
    # handle links of the format [[Old Title|Display Text]]
    text = text.gsub(/\[\[#{title_regex}\|/) { "[[#{new_title}|" }
    # handle links of the format [[Old Title]]
    text = text.gsub(/\[\[(#{title_regex})\]\]/) { "[[#{new_title}|#{Regexp.last_match(1)}]]" }

    text
  end


  # Adds a pipe character to the beginning and end of every line of +text+,
  # as the Pandoc Pipe Tables extension requires. Lines are split on "\n"
  # and re-joined with "\n"; trailing empty lines are not kept.
  def pipe_tables_formatting(text)
    piped_rows = text.split("\n").map { |row| '|' + row + '|' }
    piped_rows.join("\n")
  end

  # Renders an XML table as a plain-text, column-aligned markdown table.
  #
  # table_element - node responding to xpath (presumably a Nokogiri node
  #                 for a <table> element -- confirm against callers)
  # pandoc_format - when true, every output line is additionally wrapped in
  #                 pipes via pipe_tables_formatting
  #
  # Returns the table text followed by two newlines.
  #
  # Side effect: every <lb> node matched below is replaced by a space in
  # the underlying document.
  #
  # NOTE(review): most XPath expressions here start with '//'. In XPath
  # that form searches from the document root, so if table_element belongs
  # to a document holding several tables they may match cells of the other
  # tables too -- confirm how callers isolate the table.
  def xml_table_to_markdown_table(table_element, pandoc_format=false)
    text_table = ""

    # clean up in-cell line-breaks
    table_element.xpath('//lb').each { |n| n.replace(' ')}

    # calculate the widths of each column based on max(header, cell[0...end])
    column_count = ([table_element.xpath("//th").count] + table_element.xpath('//tr').map{|e| e.xpath('td').count }).max
    # maps 1-based column index => width in characters
    column_widths = {}
    1.upto(column_count) do |column_index|
      longest_cell = (table_element.xpath("//tr/td[position()=#{column_index}]").map{|e| e.text().length}.max || 0)
      # heading_length is assigned the node here only as a by-product of the
      # chained assignment; the next line sets its real value
      corresponding_heading = heading_length = table_element.xpath("//th[position()=#{column_index}]").first
      heading_length = corresponding_heading.nil? ? 0 : corresponding_heading.text().length
      column_widths[column_index] = [longest_cell, heading_length].max
    end

    # print the header as markdown
    cell_strings = []
    table_element.xpath("//th").each_with_index do |e,i|
      cell_strings << e.text.rjust(column_widths[i+1], ' ')
    end
    text_table << cell_strings.join(' | ') << "\n"

    # print the separator
    text_table << column_count.times.map{|i| ''.rjust(column_widths[i+1], '-')}.join(' | ') << "\n"

    # print each row as markdown
    table_element.xpath('//tr').each do |row_element|
      text_table << row_element.xpath('td').map do |e|
        width = 80 #default for hand-coded tables
        # the column number is taken from the "td[N]" index in the node's path
        index = e.path.match(/.*td\[(\d+)\]/)
        if index
          width = column_widths[index[1].to_i] || 80
        else
          # no index in the path: fall back to the first column's width
          width = column_widths.values.first
        end
        e.text.rjust(width, ' ')
      end.join(' | ') << "\n"
    end
    if pandoc_format
      text_table = pipe_tables_formatting(text_table)
    end

    "#{text_table}\n\n"
  end



  # Writes +msg+ to the logger at debug level, prefixed with "DEBUG: ".
  def debug(msg)
    prefixed = "DEBUG: #{msg}"
    logger.debug(prefixed)
  end

end

1	module XmlSourceProcessor	1✔
2
3	@text_dirty = false	1✔
4	@translation_dirty = false	1✔
5	#@fields = false
6
7	def source_text=(text)	1✔
8	@text_dirty = true	149✔
9	super	149✔
10	end
11
12	def source_translation=(translation)	1✔
13	@translation_dirty = true	40✔
14	super	40✔
15	end
16
17	def validate_source	1✔
18	if self.source_text.blank?	2,159✔
19	return	2,046✔
20	end
21	validate_links(self.source_text)	113✔
22	end
23
24	def validate_source_translation	1✔
25	if self.source_translation.blank?	2,159✔
26	return	2,126✔
27	end
28	validate_links(self.source_translation)	33✔
29	end
30
31	#check the text for problems or typos with the subject links
32	def validate_links(text)	1✔
33	error_scope = [:activerecord, :errors, :models, :xml_source_processor]	146✔
34	# split on all begin-braces
35	tags = text.split('[[')	146✔
36	# remove the initial string which occurs before the first tag
37	debug("validate_source: tags to process are #{tags.inspect}")	146✔
38	tags = tags - [tags[0]]	146✔
39	debug("validate_source: massaged tags to process are #{tags.inspect}")	146✔
40	for tag in tags	146✔
41	debug(tag)	85✔
42
43	if tag.include?(']]]')	85✔
44	errors.add(:base, I18n.t('subject_linking_error', scope: error_scope) + I18n.t('tags_should_not_use_3_brackets', scope: error_scope))	1✔
45	return	1✔
46	end
47	unless tag.include?(']]')	84✔
48	tag = tag.strip	1✔
49	errors.add(:base, I18n.t('subject_linking_error', scope: error_scope) + I18n.t('wrong_number_of_closing_braces', tag: '"[['+tag+'"', scope: error_scope))	1✔
50	end
51
52	# just pull the pieces between the braces
53	inner_tag = tag.split(']]')[0]	84✔
54	if inner_tag =~ /^\s*$/	84✔
55	errors.add(:base, I18n.t('subject_linking_error', scope: error_scope) + I18n.t('blank_tag_in', tag: '"[['+tag+'"', scope: error_scope))	1✔
56	end
57
58	#check for unclosed single bracket
59	if inner_tag.include?('[')	84✔
60	unless inner_tag.include?(']')	1!
61	errors.add(:base, I18n.t('subject_linking_error', scope: error_scope) + I18n.t('unclosed_bracket_within', tag: '"'+inner_tag+'"', scope: error_scope))	1✔
62	end
63	end
64	# check for blank title or display name with pipes
65	if inner_tag.include?("\|")	84✔
66	tag_parts = inner_tag.split('\|')	17✔
67	debug("validate_source: inner tag parts are #{tag_parts.inspect}")	17✔
68	if tag_parts[0] =~ /^\s*$/	17✔
69	errors.add(:base, I18n.t('subject_linking_error', scope: error_scope) + I18n.t('blank_subject_in', tag: '"[['+inner_tag+']]"', scope: error_scope))	1✔
70	end
71	if tag_parts[1] =~ /^\s*$/	17✔
72	errors.add(:base, I18n.t('subject_linking_error', scope: error_scope) + I18n.t('blank_text_in', tag: '"[['+inner_tag+']]"', scope: error_scope))	1✔
73	end
74	end
75	end
76	# return errors.size > 0
77	end
78
79	##############################################
80	# All code to convert transcriptions from source
81	# format to canonical xml format belongs here.
82	##############################################
83	def process_source	1✔
84	if @text_dirty	209✔
85	self.xml_text = wiki_to_xml(self, Page::TEXT_TYPE::TRANSCRIPTION)	65✔
86	end
87
88	if @translation_dirty	209✔
89	self.xml_translation = wiki_to_xml(self, Page::TEXT_TYPE::TRANSLATION)	15✔
90	end
91	end
92
93	def wiki_to_xml(page, text_type)	1✔
94
95	subjects_disabled = page.collection.subjects_disabled	86✔
96
97	source_text = case text_type	86✔
98	when Page::TEXT_TYPE::TRANSCRIPTION	70✔
99	page.source_text	70✔
100	when Page::TEXT_TYPE::TRANSLATION	16✔
101	page.source_translation	16✔
102	else	×
103	""	×
104	end
105
106	xml_string = String.new(source_text)	86✔
107	xml_string = process_latex_snippets(xml_string)	86✔
108	xml_string = clean_bad_braces(xml_string)	86✔
109	xml_string = clean_script_tags(xml_string)	86✔
110	xml_string = process_square_braces(xml_string) unless subjects_disabled	86✔
111	xml_string = process_linewise_markup(xml_string)	86✔
112	xml_string = process_line_breaks(xml_string)	86✔
113	xml_string = valid_xml_from_source(xml_string)	86✔
114	xml_string = update_links_and_xml(xml_string, false, text_type)	86✔
115	xml_string = postprocess_xml_markup(xml_string)	86✔
116	postprocess_sections	86✔
117	xml_string	86✔
118	end
119
120
121	# remove script tags from HTML to prevent javascript injection
122	def clean_script_tags(text)	1✔
123	# text.gsub(/<script.*?<\/script>/m, '')
124	text.gsub(/<\/?script.*?>/m, '')	86✔
125	end
126
127	BAD_SHIFT_REGEX = /\[\[([[[:alpha:]][[:blank:]]\|,\(\)\-[[:digit:]]]+)\}\}/	1✔
128	def clean_bad_braces(text)	1✔
129	text.gsub BAD_SHIFT_REGEX, "[[\\1]]"	86✔
130	end
131
132	BRACE_REGEX = /\[\[.*?\]\]/m	1✔
133	def process_square_braces(text)	1✔
134	# find all the links
135	wikilinks = text.scan(BRACE_REGEX)	83✔
136	wikilinks.each do \|wikilink_contents\|	83✔
137	# strip braces
138	munged = wikilink_contents.sub('[[','')	32✔
139	munged = munged.sub(']]','')	32✔
140
141	# extract the title and display
142	if munged.include? '\|'	32✔
143	parts = munged.split '\|'	14✔
144	title = parts[0]	14✔
145	verbatim = parts[1]	14✔
146	else	18✔
147	title = munged	18✔
148	verbatim = munged	18✔
149	end
150
151	title = canonicalize_title(title)	32✔
152
153	replacement = "<link target_title=\"#{title}\">#{verbatim}</link>"	32✔
154	text.sub!(wikilink_contents, replacement)	32✔
155	end
156
157	text	83✔
158	end
159
160	def remove_square_braces(text)	1✔
161	new_text = text.scan(BRACE_REGEX)	3✔
162	new_text.each do \|results\|	3✔
163	changed = results	3✔
164	#remove title
165	if results.include?('\|')	3!
166	changed = results.sub(/\[\[.*?\\|/, '')	×
167	end
168	changed = changed.sub('[[', '')	3✔
169	changed = changed.sub(']]', '')	3✔
170
171	text.sub!(results, changed)	3✔
172	end
173	text	3✔
174	end
175
176	LATEX_SNIPPET = /(\{\{tex:?(.*?):?tex\}\})/m	1✔
177	def process_latex_snippets(text)	1✔
178	return text unless self.respond_to? :tex_figures	86✔
179	replacements = {}	72✔
180	figures = self.tex_figures.to_a	72✔
181
182	text.scan(LATEX_SNIPPET).each_with_index do \|pair, i\|	72✔
183	with_tags = pair[0]	×
184	contents = pair[1]	×
185
186	replacements[with_tags] = "<texFigure position=\"#{i+1}\"/>" # position attribute in acts as list starts with 1	×
187
188	figure = figures[i] \|\| TexFigure.new	×
189	figure.source = contents unless figure.source == contents	×
190	figures[i] = figure	×
191	end
192
193	self.tex_figures = figures	72✔
194	replacements.each_pair do \|s,r\|	72✔
195	text.sub!(s,r)	×
196	end
197
198	text	72✔
199	end
200
201	HEADER = /\s\\|\s/	1✔
202	SEPARATOR = /---.*\\|/	1✔
203	ROW = HEADER	1✔
204
205	def process_linewise_markup(text)	1✔
206	@tables = []	86✔
207	@sections = []	86✔
208	new_lines = []	86✔
209	current_table = nil	86✔
210	text.lines.each do \|line\|	86✔
211	# first deal with any sections
212	line = process_any_sections(line)	98✔
213	# look for a header
214	if !current_table	98✔
215	if line.match(HEADER)	98!
216	line.chomp	×
NEW 217	current_table = { header: [], rows: [], section: @sections.last }	×
218	# fill the header
219	cells = line.split(/\s\\|\s/)	×
220	cells.shift if line.match(/^\\|/) # remove leading pipe	×
221	current_table[:header] = cells.map{ \|cell_title\| cell_title.sub(/^!\s*/,'') }	×
222	heading = cells.map do \|cell\|	×
223	if cell.match(/^!/)	×
224	"<th class=\"bang\">#{cell.sub(/^!\s*/,'')}</th>"	×
225	else	×
226	"<th>#{cell}</th>"	×
227	end
228	end.join(' ')
229	new_lines << "<table class=\"tabular\">\n<thead>\n<tr>#{heading}</tr></thead>"	×
230	else
231	# no current table, no table contents -- NO-OP	98✔
232	new_lines << line	98✔
233	end
234	else
235	# this is either an end or a separator	×
236	if line.match(SEPARATOR)	×
237	# NO-OP	×
238	elsif line.match(ROW)	×
239	# remove leading and trailing delimiters	×
NEW 240	clean_line=line.chomp.sub(/^\s\\|/, '').sub(/\\|\s$/, '')	×
241	# fill the row
NEW 242	cells = clean_line.split(/\s\\|\s/, -1) # -1 means "don't prune empty values at the end"	×
243	current_table[:rows] << cells	×
NEW 244	rowline = ''	×
245	cells.each_with_index do \|cell, i\|	×
246	head = current_table[:header][i]	×
247	role_string = " role=\"#{head}\""	×
NEW 248	rowline += "<td#{role_string}>#{cell}</td> "	×
249	end
250
251	if current_table[:rows].size == 1	×
NEW 252	new_lines << '<tbody>'	×
253	end
254	new_lines << "<tr>#{rowline}</tr>"	×
255	else
256	# finished the last row	×
NEW 257	unless current_table[:rows].empty? # only process tables with bodies	×
258	@tables << current_table	×
NEW 259	new_lines << '</tbody>'	×
260	end
NEW 261	new_lines << '</table><lb/>'	×
262	current_table = nil	×
263	end
264	end
265	end
266
267	if current_table	86✔
268	# unclosed table	×
269	@tables << current_table	×
NEW 270	unless current_table[:rows].empty? # only process tables with bodies	×
271	@tables << current_table	×
NEW 272	new_lines << '</tbody>'	×
273	end
NEW 274	new_lines << '</table><lb/>'	×
275	end
276	# do something with the table data
277	new_lines.join(' ')	86✔
278	end
279
280	def process_any_sections(line)	1✔
281	6.downto(2) do \|depth\|	98✔
282	line.scan(/(={#{depth}}([^=]+)={#{depth}})/).each do \|section_match\|	490✔
283	wiki_title = section_match[1].strip	×
284	if wiki_title.length > 0	×
285	verbatim = XmlSourceProcessor.cell_to_plaintext(wiki_title)	×
286	safe_verbatim = verbatim.gsub(/"/, """)	×
287	line = line.sub(section_match.first, "<entryHeading title=\"#{safe_verbatim}\" depth=\"#{depth}\" >#{wiki_title}</entryHeading>")	×
288	@sections << Section.new(:title => wiki_title, :depth => depth)	×
289	end
290	end
291	end
292
293	line	98✔
294	end
295
296	def postprocess_sections	1✔
297	@sections.each do \|section\|	86✔
298	doc = XmlSourceProcessor.cell_to_xml(section.title)	×
299	doc.elements.each("//link") do \|e\|	×
300	title = e.attributes['target_title']	×
301	article = collection.articles.where(:title => title).first	×
302	if article	×
303	e.add_attribute('target_id', article.id.to_s)	×
304	end
305	end
306	section.title = XmlSourceProcessor.xml_to_cell(doc)	×
307	end
308	end
309
310
311	def canonicalize_title(title)	1✔
312	# kill all tags
313	title = title.gsub(/<.*?>/, '')	32✔
314	# linebreaks -> spaces
315	title = title.gsub(/\n/, ' ')	32✔
316	# multiple spaces -> single spaces
317	title = title.gsub(/\s+/, ' ')	32✔
318	# change double quotes to proper xml
319	title = title.gsub(/\"/, '"')	32✔
320	title	32✔
321	end
322
323	# transformations converting source mode transcription to xml
324	def process_line_breaks(text)	1✔
325	text="<p>#{text}</p>"	86✔
326	text = text.gsub(/\s\n\s\n\s*/, "</p><p>")	86✔
327	text = text.gsub(/([[:word:]]+)-\r\n\s*/, '\1<lb break="no" />')	86✔
328	text = text.gsub(/\r\n\s*/, "<lb/>")	86✔
329	text = text.gsub(/([[:word:]]+)-\n\s*/, '\1<lb break="no" />')	86✔
330	text = text.gsub(/\n\s*/, "<lb/>")	86✔
331	text = text.gsub(/([[:word:]]+)-\r\s*/, '\1<lb break="no" />')	86✔
332	text = text.gsub(/\r\s*/, "<lb/>")	86✔
333	return text	86✔
334	end
335
336	def valid_xml_from_source(source)	1✔
337	source = source \|\| ""	86✔
338	safe = source.gsub /\&/, '&'	86✔
339	safe.gsub! /\&amp;/, '&'	86✔
340	safe.gsub! /[^\u0009\u000A\u000D\u0020-\uD7FF\uE000-\uFFFD\u10000-\u10FFFF]/, ' '	86✔
341
342	string = <<EOF	86✔
343	<?xml version="1.0" encoding="UTF-8"?>
344	<page>
345	#{safe}
346	</page>
347	EOF
348	end
349
350	def update_links_and_xml(xml_string, preview_mode=false, text_type)	1✔
351	# first clear out the existing links
352	# log the count of articles before and after
353	old_article_count = collection.articles.count	86✔
354	logger.info("ISSUE4269 old_article_count = #{old_article_count}")	86✔
355	clear_links(text_type) unless preview_mode	86!
356	processed = ""	86✔
357	# process it
358	doc = REXML::Document.new xml_string	86✔
359	doc.elements.each("//link") do \|element\|	86✔
360	# default the title to the text if it's not specified
361	if !(title=element.attributes['target_title'])	32!
362	title = element.text	×
363	end
364	#display_text = element.text
365	display_text = ""	32✔
366	element.children.each do \|e\|	32✔
367	display_text += e.to_s	32✔
368	end
369	debug("link display_text = #{display_text}")	32✔
370	#change the xml version of quotes back to double quotes for article title
371	title = title.gsub('"', '"')	32✔
372
373	# create new blank articles if they don't exist already
374	if !(article = collection.articles.where(:title => title).first)	32✔
375	article = Article.new	7✔
376	article.title = title	7✔
377	article.collection = collection	7✔
378	article.created_by_id = User.current_user.id if User.current_user.present?	7!
379	article.save! unless preview_mode	7!
380	end
381	link_id = create_link(article, display_text, text_type) unless preview_mode	32!
382	# now update the attribute
383	link_element = REXML::Element.new("link")	32✔
384	element.children.each { \|c\| link_element.add(c) }	64✔
385	link_element.add_attribute('target_title', title)	32✔
386	debug("element="+link_element.inspect)	32✔
387	debug("article="+article.inspect)	32✔
388	link_element.add_attribute('target_id', article.id.to_s) unless preview_mode	32!
389	link_element.add_attribute('link_id', link_id.to_s) unless preview_mode	32!
390	element.replace_with(link_element)	32✔
391	end
392	new_article_count = collection.articles.count	86✔
393	logger.info("ISSUE4269 new_article_count = #{new_article_count}")	86✔
394	if new_article_count < old_article_count	86!
395	logger.error("ISSUE4269 ERROR new_article_count #{new_article_count} < old_article_count #{old_article_count}!")	×
396	end
397	doc.write(processed)	86✔
398	return processed	86✔
399	end
400
401
402	# handle XML-dependent post-processing
# Collapses runs of adjacent <lb/> elements so that only the first line break
# of each run survives, then re-serialises the document.
#
# An <lb/> is removed only when the node immediately before it is itself an
# <lb/> element; line breaks separated by text (or any other node type) stay.
#
# @param xml_string [String] XML to be parsed by REXML
# @return [String] the document written back out after the duplicates are gone
def postprocess_xml_markup(xml_string)
  doc = REXML::Document.new(xml_string)

  doc.elements.each('//lb') do |line_break|
    # previous_sibling (not previous_element) is checked so that intervening
    # text between two line breaks keeps both of them.
    preceding = line_break.previous_sibling
    next unless preceding && preceding.node_type == :element && preceding.name == 'lb'

    line_break.parent.elements.delete(line_break)
  end

  processed = +''
  doc.write(processed)
  processed
end
415
416
# XML envelope that lets the markup of a single cell be parsed as a document.
CELL_PREFIX = "<?xml version='1.0' encoding='UTF-8'?><cell>"
CELL_SUFFIX = '</cell>'

# Wraps the markup of one cell in the <cell> envelope and parses it.
#
# A bare '&' is not well-formed XML, so ampersands are escaped before parsing.
# Ampersands that already start an entity or character reference (&amp;,
# &#38;, &#x26;, ...) are left alone so already-escaped input is not
# double-escaped.
#
# @param cell [String] cell content, possibly containing inline XML markup
# @return [REXML::Document] parsed document whose root element is <cell>
def self.cell_to_xml(cell)
  escaped = cell.gsub(/&(?!(?:[A-Za-z_:][\w.:-]*|#\d+|#x\h+);)/, '&amp;')
  REXML::Document.new("#{CELL_PREFIX}#{escaped}#{CELL_SUFFIX}")
end
423
# Serialises a cell document and strips the <cell> envelope again.
#
# Only the first occurrence of CELL_PREFIX and the first occurrence of
# CELL_SUFFIX are removed from the serialised text.
#
# @param doc [REXML::Document] a document to be written out
# @return [String] the serialised markup without the envelope
def self.xml_to_cell(doc)
  serialized = +''
  doc.write(serialized)

  without_prefix = serialized.sub(CELL_PREFIX, '')
  without_prefix.sub(CELL_SUFFIX, '')
end
429
# Returns the text of a cell with its markup removed.
#
# Every text node matched by './/text()' is collected and the nodes are
# joined (Array#join stringifies each node).
# NOTE(review): the stringified form of a REXML text node is presumably the
# escaped one (e.g. "&amp;") -- confirm that callers expect that.
#
# @param cell [String] cell content, passed through cell_to_xml
# @return [String] concatenated text nodes
def self.cell_to_plaintext(cell)
  text_nodes = REXML::XPath.match(cell_to_xml(cell), './/text()')
  text_nodes.join
end
434
# Lists the subjects linked from a cell, one per line.
#
# For each <link> element found in the parsed cell, its target_title
# attribute is appended followed by a newline.
#
# @param cell [String] cell content, passed through cell_to_xml
# @return [String] newline-terminated titles ("" when the cell has no links)
def self.cell_to_subject(cell)
  links = cell_to_xml(cell).elements.to_a("//link")

  links.each_with_object(+'') do |link, subjects|
    subjects << link.attributes['target_title']
    subjects << "\n"
  end
end
445
# Lists the category titles of every article linked from a cell, one per line.
#
# Links without a target_id attribute are skipped; for the others the article
# is loaded with Article.find and each of its categories contributes its
# title followed by a newline.
#
# @param cell [String] cell content, passed through cell_to_xml
# @return [String] newline-terminated category titles ("" when none)
def self.cell_to_category(cell)
  categories = +''

  cell_to_xml(cell).elements.each("//link") do |link|
    target_id = link.attributes['target_id']
    next unless target_id

    Article.find(target_id).categories.each do |category|
      categories << category.title
      categories << "\n"
    end
  end

  categories
end
461
462	##############################################
463	# Code to rename links within the text.
464	# This assumes that the name change has already
465	# taken place within the article table in the DB
466	##############################################
# Rewrites subject links in this record's text after an article was renamed.
#
# The old title is turned into a regex fragment: it is regex-escaped, the
# escaping of plain spaces is undone, and every run of whitespace is replaced
# by '\s+' so the title matches regardless of how much whitespace separates
# its words. source_text is always rewritten; source_translation only when
# the record has that attribute and it is not nil.
#
# @param old_title [String] title the links currently point at
# @param new_title [String] title the links should point at
def rename_article_links(old_title, new_title)
  escaped_title = Regexp.escape(old_title)
  # Regexp.escape turns ' ' into '\\ ' -- undo this
  plain_spaces = escaped_title.gsub('\\ ', ' ')
  # convert multiple whitespaces into 1+n space characters
  title_regex = plain_spaces.gsub(/\s+/, '\s+')

  self.source_text = rename_link_in_text(source_text, title_regex, new_title)

  # Articles don't have translations, but we still need to update pages.source_translation
  return unless has_attribute?(:source_translation) && !source_translation.nil?

  self.source_translation = rename_link_in_text(source_translation, title_regex, new_title)
end
480
# Points every subject link for the old title at the new title.
#
# Two link forms are handled:
#   [[Old Title|Display Text]] -> [[New Title|Display Text]]
#   [[Old Title]]              -> [[New Title|Old Title]]
#
# The block form of gsub is used so that new_title is inserted literally;
# in a replacement *string*, sequences such as "\1" or "\0" inside the title
# would be interpreted as back-references.
#
# @param text [String] wiki-style source text
# @param title_regex [String] regex fragment matching the old title
# @param new_title [String] replacement title
# @return [String] a new string with the links rewritten
def rename_link_in_text(text, title_regex, new_title)
  # handle links of the format [[Old Title|Display Text]]
  piped_renamed = text.gsub(/\[\[#{title_regex}\|/) { "[[#{new_title}|" }

  # handle links of the format [[Old Title]]; the old wording becomes the display text
  piped_renamed.gsub(/\[\[(#{title_regex})\]\]/) do
    "[[#{new_title}|#{Regexp.last_match(1)}]]"
  end
end
489
490
# Wraps every line of the text in a leading and a trailing pipe character,
# as required by the Pandoc Pipe Tables extension.
#
# The text is split on "\n", so trailing empty lines are dropped and the
# result carries no final newline.
#
# @param text [String] table text, one row per line
# @return [String] the rows, each surrounded by '|', joined with "\n"
def pipe_tables_formatting(text)
  wrapped_rows = text.split("\n").map { |row| '|' + row + '|' }
  wrapped_rows.join("\n")
end
496
# Renders an XML table as a markdown table with right-aligned, padded cells.
#
# Steps: <lb> nodes are replaced by a space, each column's width is taken as
# the longer of its heading and its longest cell, then the header line, a
# dashed separator line and one line per <tr> are emitted. With
# pandoc_format the lines are additionally wrapped in pipes via
# pipe_tables_formatting. The result always ends with two extra newlines.
#
# NOTE(review): table_element is presumably a Nokogiri node (it must respond
# to xpath; its nodes to replace, text and path). The '//' XPath prefix
# presumably searches the whole document rather than just this table --
# confirm the caller passes a document containing a single table.
#
# @param table_element [#xpath] the table to render
# @param pandoc_format [Boolean] wrap each line in pipes when true
# @return [String] the markdown table followed by "\n\n"
def xml_table_to_markdown_table(table_element, pandoc_format=false)
  # clean up in-cell line-breaks
  table_element.xpath('//lb').each { |line_break| line_break.replace(' ') }

  # the column count is the larger of the heading count and the widest row
  heading_count = table_element.xpath("//th").count
  cells_per_row = table_element.xpath('//tr').map { |row| row.xpath('td').count }
  column_count = ([heading_count] + cells_per_row).max

  # width of each column (1-based) = max(heading length, longest cell length)
  column_widths = {}
  (1..column_count).each do |position|
    cell_lengths = table_element.xpath("//tr/td[position()=#{position}]").map { |cell| cell.text.length }
    heading = table_element.xpath("//th[position()=#{position}]").first
    heading_length = heading.nil? ? 0 : heading.text.length
    column_widths[position] = [cell_lengths.max || 0, heading_length].max
  end

  markdown = +''

  # header line
  headings = table_element.xpath("//th").each_with_index.map do |heading, offset|
    heading.text.rjust(column_widths[offset + 1], ' ')
  end
  markdown << headings.join(' | ') << "\n"

  # separator line
  dashes = (1..column_count).map { |position| '-' * column_widths[position] }
  markdown << dashes.join(' | ') << "\n"

  # one line per row
  table_element.xpath('//tr').each do |row|
    padded_cells = row.xpath('td').map do |cell|
      # the cell's position is read from the index in its path; a path
      # without an index falls back to the first column's width
      position = cell.path.match(/.*td\[(\d+)\]/)
      width = position ? (column_widths[position[1].to_i] || 80) : column_widths.values.first
      cell.text.rjust(width, ' ')
    end
    markdown << padded_cells.join(' | ') << "\n"
  end

  markdown = pipe_tables_formatting(markdown) if pandoc_format

  "#{markdown}\n\n"
end
542
543
544
# Writes a message to the logger at debug level, prefixed with "DEBUG: ".
#
# @param msg [Object] message to log (interpolated into the log line)
def debug(msg)
  line = "DEBUG: #{msg}"
  logger.debug(line)
end
548
549	end

benwbrum / fromthepage / 13402333050

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous