7861513225

Committed 11 Feb 2024 11:05AM UTC coverage: 62.678% (-0.05%) from 62.731%

Build # 7861513225

Build Type

Pull #14169

github

Committed by

web-flow

Commit Message

Merge 13f2acde4 into 6010244ba

Pull Request #14169: Fixed a bug with models that has 'onnx_data' file not working in dbfs/hdfs

Run Details

8951 of 14281 relevant lines covered (62.68%)

0.63 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

98.18

/src/main/scala/com/johnsnowlabs/nlp/training/CoNLLHelper.scala

/*
 * Copyright 2017-2022 John Snow Labs
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.johnsnowlabs.nlp.training

import com.johnsnowlabs.nlp.annotators.common.{IndexedTaggedWord, TaggedSentence}

import scala.collection.mutable.ArrayBuffer

/** Helpers that turn the raw lines of a CoNLL-U file into [[CoNLLUDocument]]s. */
object CoNLLHelper {

  /** The three tagged views (UPOS, XPOS, lemma) built from a single token row. */
  case class CoNLLTokenCols(
      uPosTokens: IndexedTaggedWord,
      xPosTokens: IndexedTaggedWord,
      lemma: IndexedTaggedWord,
      sentenceIndex: Int)

  /** The three tagged views (UPOS, XPOS, lemma) of one whole sentence. */
  case class CoNLLSentenceCols(uPos: TaggedSentence, xPos: TaggedSentence, lemma: TaggedSentence)

  /** Parses CoNLL-U lines into documents.
    *
    * Each line is trimmed, split on tabs and classified by its first column: an empty line ends
    * the current sentence, a line starting with `#` is a comment, anything else is a token row.
    * Token forms are appended to a running document text and every token records its inclusive
    * `begin`/`end` offsets into that text.
    *
    * @param lines
    *   the raw lines of the CoNLL-U input, in file order
    * @param explodeSentences
    *   when `true`, every blank line closes the current document, so each sentence becomes its own
    *   document; when `false`, sentences are separated by two line separators inside one document
    *   and a document is only closed by a `newdoc` comment or the end of the input
    * @return
    *   one [[CoNLLUDocument]] per closed document that has non-empty text
    */
  def readLines(lines: Array[String], explodeSentences: Boolean): Seq[CoNLLUDocument] = {

    type ClosedDoc = (String, List[CoNLLSentenceCols])

    val lineSep = System.lineSeparator()
    val doc = new StringBuilder()
    val lastSentence = ArrayBuffer.empty[CoNLLTokenCols]
    val sentences = ArrayBuffer.empty[CoNLLSentenceCols]

    // Moves the pending tokens into `sentences` as one sentence. Note that a sentence is
    // appended even when no tokens are pending.
    def addSentence(): Unit = {
      val uPos = TaggedSentence(clearTokens(lastSentence.map(_.uPosTokens).toArray))
      val xPos = TaggedSentence(clearTokens(lastSentence.map(_.xPosTokens).toArray))
      val lemma = TaggedSentence(clearTokens(lastSentence.map(_.lemma).toArray))

      sentences.append(CoNLLSentenceCols(uPos, xPos, lemma))
      lastSentence.clear()
    }

    // Emits the accumulated text and sentences (if there is any text) and resets both buffers.
    def closeDocument(): Option[ClosedDoc] = {
      val text = doc.toString
      val closed = if (text.nonEmpty) Some((text, sentences.toList)) else None
      doc.clear()
      sentences.clear()
      closed
    }

    // A document boundary is a comment whose body *starts* with "newdoc" (e.g. "# newdoc" or
    // "# newdoc id = x"). Matching "newdoc" anywhere in the comment would also split on
    // comments such as "# text = ... newdoc ...".
    def isNewDocComment(comment: String): Boolean =
      comment.dropWhile(_ == '#').trim.startsWith("newdoc")

    def processCoNLLRow(items: Array[String]): Option[ClosedDoc] = {
      // Separate tokens with a space, except at the start of a sentence/document and before
      // punctuation.
      if (doc.nonEmpty && !doc.endsWith(lineSep) && items(CoNLLUCols.UPOS.id) != "PUNCT")
        doc.append(" ")
      lastSentence.append(getIndexedTaggedCoNLL(items, doc))
      None
    }

    def processNewLine(): Option[ClosedDoc] = {
      val needsSeparator = doc.nonEmpty && !doc.endsWith(lineSep) && lastSentence.nonEmpty
      if (!explodeSentences && needsSeparator) {
        doc.append(lineSep * 2)
      }
      addSentence()
      if (explodeSentences) closeDocument() else None
    }

    def processComment(comment: String): Option[ClosedDoc] = {
      if (isNewDocComment(comment)) {
        // If the previous sentence was not terminated by a blank line, flush it first so its
        // tokens stay with the document whose text their offsets refer to.
        if (lastSentence.nonEmpty) addSentence()
        closeDocument()
      } else None
    }

    val docs = lines
      .flatMap { line =>
        val items = line.trim.split("\\t")
        val idColumn = items(CoNLLUCols.ID.id)

        val closed: Option[ClosedDoc] = idColumn.headOption match {
          case None => processNewLine()
          case Some('#') => processComment(idColumn)
          case Some(_) => processCoNLLRow(items)
        }
        closed
      }

    // Flush whatever is still pending when the input does not end with a blank line.
    addSentence()

    val last = if (doc.nonEmpty) Seq((doc.toString, sentences.toList)) else Seq.empty

    (docs ++ last).map { case (text, textSentence) =>
      val uPos = textSentence.map(_.uPos)
      val xPos = textSentence.map(_.xPos)
      val lemma = textSentence.map(_.lemma)
      CoNLLUDocument(text, uPos, xPos, lemma)
    }
  }

  /** Drops tokens whose word is empty or whitespace-only. */
  private def clearTokens(tokens: Array[IndexedTaggedWord]): Array[IndexedTaggedWord] = {
    tokens.filter(_.word.trim.nonEmpty)
  }

  /** Appends the row's FORM to `doc` and builds the tagged views of that token.
    *
    * @param items
    *   the tab-separated columns of one token row
    * @param doc
    *   the running document text; mutated by appending the token's FORM
    * @param sentenceIndex
    *   stored as-is in the returned [[CoNLLTokenCols]]
    * @return
    *   the UPOS, XPOS and lemma views, all sharing the same offsets into `doc`
    */
  private def getIndexedTaggedCoNLL(
      items: Array[String],
      doc: StringBuilder,
      sentenceIndex: Int = 0): CoNLLTokenCols = {
    val word = items(CoNLLUCols.FORM.id)

    val begin = doc.length
    doc.append(word)
    // `end` is inclusive: the index of the last character of the appended form.
    val end = doc.length - 1

    val uPosTag = items(CoNLLUCols.UPOS.id)
    val xPosTag = items(CoNLLUCols.XPOS.id)
    val lemmaValue = items(CoNLLUCols.LEMMA.id)

    val uPos = IndexedTaggedWord(word, uPosTag, begin, end)
    val xPos = IndexedTaggedWord(word, xPosTag, begin, end)
    val lemma = IndexedTaggedWord(lemmaValue, "", begin, end)

    CoNLLTokenCols(uPos, xPos, lemma, sentenceIndex)
  }

}

1	/*
2	* Copyright 2017-2022 John Snow Labs
3	*
4	* Licensed under the Apache License, Version 2.0 (the "License");
5	* you may not use this file except in compliance with the License.
6	* You may obtain a copy of the License at
7	*
8	* http://www.apache.org/licenses/LICENSE-2.0
9	*
10	* Unless required by applicable law or agreed to in writing, software
11	* distributed under the License is distributed on an "AS IS" BASIS,
12	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13	* See the License for the specific language governing permissions and
14	* limitations under the License.
15	*/
16
17	package com.johnsnowlabs.nlp.training
18
19	import com.johnsnowlabs.nlp.annotators.common.{IndexedTaggedWord, TaggedSentence}
20
21	import scala.collection.mutable.ArrayBuffer
22
23	object CoNLLHelper {
24
25	case class CoNLLTokenCols(
26	uPosTokens: IndexedTaggedWord,
27	xPosTokens: IndexedTaggedWord,
28	lemma: IndexedTaggedWord,
29	sentenceIndex: Int)
30	case class CoNLLSentenceCols(uPos: TaggedSentence, xPos: TaggedSentence, lemma: TaggedSentence)
31
32	def readLines(lines: Array[String], explodeSentences: Boolean): Seq[CoNLLUDocument] = {
33
34	val doc = new StringBuilder()	1✔
35	val lastSentence = ArrayBuffer.empty[CoNLLTokenCols]	1✔
36	val sentences = ArrayBuffer.empty[CoNLLSentenceCols]	1✔
37
38	def addSentence(): Unit = {
39	val uPosTokens = clearTokens(lastSentence.map(t => t.uPosTokens).toArray)	1✔
40	val xPosTokens = clearTokens(lastSentence.map(t => t.xPosTokens).toArray)	1✔
41	val lemmaTokens = clearTokens(lastSentence.map(t => t.lemma).toArray)	1✔
42	val uPos = TaggedSentence(uPosTokens)	1✔
43	val xPos = TaggedSentence(xPosTokens)	1✔
44	val lemma = TaggedSentence(lemmaTokens)	1✔
45	val taggedCoNLLSentence = CoNLLSentenceCols(uPos, xPos, lemma)	1✔
46
47	sentences.append(taggedCoNLLSentence)	1✔
48	lastSentence.clear()	1✔
49	}
50
51	def closeDocument: Option[(String, List[CoNLLSentenceCols])] = {
52
53	val result = (doc.toString, sentences.toList)	1✔
54	doc.clear()	1✔
55	sentences.clear()	1✔
56
57	if (result._1.nonEmpty) Some(result._1, result._2) else None	1✔
58	}
59
60	def processCoNLLRow(items: Array[String]): Option[(String, List[CoNLLSentenceCols])] = {
61	if (doc.nonEmpty && !doc.endsWith(System.lineSeparator()) && items(3) != "PUNCT")	1✔
62	doc.append(" ")	1✔
63	val indexedTaggedCoNLL = getIndexedTaggedCoNLL(items, doc)	1✔
64	lastSentence.append(indexedTaggedCoNLL)	1✔
65	None	1✔
66	}
67
68	def processNewLine(): Option[(String, List[CoNLLSentenceCols])] = {
69	if (!explodeSentences && (doc.nonEmpty && !doc.endsWith(	1✔
70	System.lineSeparator) && lastSentence.nonEmpty)) {	1✔
71	doc.append(System.lineSeparator * 2)	1✔
72	}
73	addSentence()	1✔
74	if (explodeSentences) closeDocument else None	1✔
75	}
76
77	def processComment(items: Array[String]): Option[(String, List[CoNLLSentenceCols])] = {
78	if (items(CoNLLUCols.ID.id).contains("newdoc")) {	1✔
79	closeDocument	1✔
80	} else None	1✔
81	}
82
83	val docs = lines
84	.flatMap { line =>	1✔
85	val items = line.trim.split("\\t")	1✔
86	val id =
87	if (items(CoNLLUCols.ID.id).isEmpty) "" else items(CoNLLUCols.ID.id).head.toString	1✔
88
89	val coNLLRow = id match {
90	case "#" => processComment(items)	1✔
91	case "" => processNewLine()	1✔
92	case _ => processCoNLLRow(items)	1✔
93	}
94	coNLLRow	1✔
95	}
96
97	addSentence()	1✔
98
99	val last = if (doc.nonEmpty) Seq((doc.toString, sentences.toList)) else Seq.empty	×
100
101	(docs ++ last).map { case (text, textSentence) =>	1✔
102	val uPos = textSentence.map(t => t.uPos)	1✔
103	val xPos = textSentence.map(t => t.xPos)	1✔
104	val lemma = textSentence.map(t => t.lemma)	1✔
105	CoNLLUDocument(text, uPos, xPos, lemma)	1✔
106	}
107	}
108
109	private def clearTokens(tokens: Array[IndexedTaggedWord]): Array[IndexedTaggedWord] = {
110	tokens.filter(t => t.word.trim().nonEmpty)	1✔
111	}
112
113	private def getIndexedTaggedCoNLL(
114	items: Array[String],
115	doc: StringBuilder,
116	sentenceIndex: Int = 0): CoNLLTokenCols = {
117	val begin = doc.length	1✔
118	doc.append(items(CoNLLUCols.FORM.id))	1✔
119	val end = doc.length - 1	1✔
120	val word = items(CoNLLUCols.FORM.id)	1✔
121	val uPosTag = items(CoNLLUCols.UPOS.id)	1✔
122	val xPosTag = items(CoNLLUCols.XPOS.id)	1✔
123	val lemmaValue = items(CoNLLUCols.LEMMA.id)	1✔
124
125	val uPos = IndexedTaggedWord(word, uPosTag, begin, end)	1✔
126	val xPos = IndexedTaggedWord(word, xPosTag, begin, end)	1✔
127	val lemma = IndexedTaggedWord(lemmaValue, "", begin, end)	1✔
128
129	CoNLLTokenCols(uPos, xPos, lemma, sentenceIndex)	1✔
130	}
131
132	}

JohnSnowLabs / spark-nlp / 7861513225

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous