• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

JohnSnowLabs / spark-nlp / 7861513225

11 Feb 2024 11:05AM UTC coverage: 62.678% (-0.05%) from 62.731%
7861513225

Pull #14169

github

web-flow
Merge 13f2acde4 into 6010244ba
Pull Request #14169: Fixed a bug with models that has 'onnx_data' file not working in dbfs/hdfs

8951 of 14281 relevant lines covered (62.68%)

0.63 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

98.18
/src/main/scala/com/johnsnowlabs/nlp/training/CoNLLHelper.scala
1
/*
2
 * Copyright 2017-2022 John Snow Labs
3
 *
4
 * Licensed under the Apache License, Version 2.0 (the "License");
5
 * you may not use this file except in compliance with the License.
6
 * You may obtain a copy of the License at
7
 *
8
 *    http://www.apache.org/licenses/LICENSE-2.0
9
 *
10
 * Unless required by applicable law or agreed to in writing, software
11
 * distributed under the License is distributed on an "AS IS" BASIS,
12
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
 * See the License for the specific language governing permissions and
14
 * limitations under the License.
15
 */
16

17
package com.johnsnowlabs.nlp.training
18

19
import com.johnsnowlabs.nlp.annotators.common.{IndexedTaggedWord, TaggedSentence}
20

21
import scala.collection.mutable.ArrayBuffer
22

23
object CoNLLHelper {
24

25
  /** The three per-token views extracted from a single CoNLL-U row.
    *
    * @param uPosTokens
    *   the token's surface form tagged with its UPOS column value
    * @param xPosTokens
    *   the token's surface form tagged with its XPOS column value
    * @param lemma
    *   the token's LEMMA column value, carried with an empty tag
    * @param sentenceIndex
    *   index of the sentence the token belongs to
    */
  case class CoNLLTokenCols(
      uPosTokens: IndexedTaggedWord,
      xPosTokens: IndexedTaggedWord,
      lemma: IndexedTaggedWord,
      sentenceIndex: Int)

  /** The three per-sentence views (UPOS, XPOS and lemma) built from the tokens of one sentence.
    *
    * @param uPos
    *   the sentence tagged with UPOS values
    * @param xPos
    *   the sentence tagged with XPOS values
    * @param lemma
    *   the sentence holding lemma values
    */
  case class CoNLLSentenceCols(
      uPos: TaggedSentence,
      xPos: TaggedSentence,
      lemma: TaggedSentence)
31

32
  /** Parses the lines of a CoNLL-U file into a sequence of documents.
    *
    * Lines are classified by the first character of their first tab-separated column:
    *   - a line starting with `#` is a comment; when its first column contains `newdoc` the
    *     document accumulated so far is closed,
    *   - an empty line ends the current sentence,
    *   - any other line is treated as a token row.
    *
    * Token forms are appended to the document text separated by a single space, except that no
    * space is inserted at the start of the text, right after a line separator, or before a token
    * whose UPOS tag is `PUNCT`.
    *
    * @param lines
    *   raw lines of the CoNLL-U input
    * @param explodeSentences
    *   when `true` every sentence is closed into its own document at the blank line that ends
    *   it; when `false` sentences stay in the same document and their text is separated by two
    *   line separators
    * @return
    *   one [[CoNLLUDocument]] per closed, non-empty document text, followed by the trailing
    *   document if any text is still pending after the last line
    */
  def readLines(lines: Array[String], explodeSentences: Boolean): Seq[CoNLLUDocument] = {

    type ClosedDocument = (String, List[CoNLLSentenceCols])

    val lineBreak = System.lineSeparator()

    // Mutable parsing state: text of the current document, tokens of the sentence being read,
    // and the sentences already completed for the current document.
    val doc = new StringBuilder()
    val lastSentence = ArrayBuffer.empty[CoNLLTokenCols]
    val sentences = ArrayBuffer.empty[CoNLLSentenceCols]

    // Moves the pending tokens into a finished sentence and resets the token buffer.
    // NOTE(review): a sentence is recorded even when no tokens are pending (e.g. consecutive
    // blank lines); behaviour kept as-is, confirm callers expect possibly empty sentences.
    def addSentence(): Unit = {
      val uPosTokens = clearTokens(lastSentence.map(_.uPosTokens).toArray)
      val xPosTokens = clearTokens(lastSentence.map(_.xPosTokens).toArray)
      val lemmaTokens = clearTokens(lastSentence.map(_.lemma).toArray)

      sentences.append(
        CoNLLSentenceCols(
          TaggedSentence(uPosTokens),
          TaggedSentence(xPosTokens),
          TaggedSentence(lemmaTokens)))
      lastSentence.clear()
    }

    // Snapshots and resets the current document; yields it only when its text is non-empty.
    def closeDocument(): Option[ClosedDocument] = {
      val text = doc.toString
      val closedSentences = sentences.toList
      doc.clear()
      sentences.clear()

      if (text.nonEmpty) Some((text, closedSentences)) else None
    }

    // Appends one token row to the current sentence; never closes a document.
    def processCoNLLRow(items: Array[String]): Option[ClosedDocument] = {
      // The UPOS column is addressed through CoNLLUCols, like every other column access.
      val isPunctuation = items(CoNLLUCols.UPOS.id) == "PUNCT"
      if (doc.nonEmpty && !doc.endsWith(lineBreak) && !isPunctuation)
        doc.append(" ")
      // The separator must be appended first: token offsets are derived from doc.length.
      lastSentence.append(getIndexedTaggedCoNLL(items, doc))
      None
    }

    // Ends the current sentence; closes the document too when sentences are exploded.
    def processNewLine(): Option[ClosedDocument] = {
      val needsSentenceBreak =
        !explodeSentences && doc.nonEmpty && !doc.endsWith(lineBreak) && lastSentence.nonEmpty
      if (needsSentenceBreak) doc.append(lineBreak * 2)

      addSentence()
      if (explodeSentences) closeDocument() else None
    }

    // A comment whose first column mentions "newdoc" closes the current document.
    def processComment(items: Array[String]): Option[ClosedDocument] =
      if (items(CoNLLUCols.ID.id).contains("newdoc")) closeDocument() else None

    val docs = lines.flatMap { line =>
      val items = line.trim.split("\\t")
      val closed: Option[ClosedDocument] = items(CoNLLUCols.ID.id).headOption match {
        case Some('#') => processComment(items)
        case None => processNewLine()
        case Some(_) => processCoNLLRow(items)
      }
      closed
    }

    // Flush whatever tokens remain after the last line.
    addSentence()

    val last = if (doc.nonEmpty) Seq((doc.toString, sentences.toList)) else Seq.empty

    (docs ++ last).map { case (text, textSentence) =>
      CoNLLUDocument(
        text,
        textSentence.map(_.uPos),
        textSentence.map(_.xPos),
        textSentence.map(_.lemma))
    }
  }
108

109
  /** Drops every token whose word is empty once surrounding whitespace is trimmed.
    *
    * @param tokens
    *   tokens of one sentence
    * @return
    *   the tokens with a non-blank word, in their original order
    */
  private def clearTokens(tokens: Array[IndexedTaggedWord]): Array[IndexedTaggedWord] =
    tokens.filterNot(token => token.word.trim.isEmpty)
112

113
  /** Appends the FORM column of a CoNLL-U row to the document text and builds the token's UPOS,
    * XPOS and lemma views, all sharing the character span the form occupies in `doc`.
    *
    * @param items
    *   tab-separated columns of one token row
    * @param doc
    *   document text built so far; the token form is appended to it
    * @param sentenceIndex
    *   sentence index stored on the resulting token
    * @return
    *   the token views with `begin` at the offset where the form starts and `end` at the offset
    *   of its last character (inclusive)
    */
  private def getIndexedTaggedCoNLL(
      items: Array[String],
      doc: StringBuilder,
      sentenceIndex: Int = 0): CoNLLTokenCols = {
    val form = items(CoNLLUCols.FORM.id)

    // Offsets are taken around the append so they describe the form's position in the text.
    val start = doc.length
    doc.append(form)
    val stop = doc.length - 1

    def spanned(word: String, tag: String): IndexedTaggedWord =
      IndexedTaggedWord(word, tag, start, stop)

    CoNLLTokenCols(
      uPosTokens = spanned(form, items(CoNLLUCols.UPOS.id)),
      xPosTokens = spanned(form, items(CoNLLUCols.XPOS.id)),
      lemma = spanned(items(CoNLLUCols.LEMMA.id), ""),
      sentenceIndex = sentenceIndex)
  }
131

132
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc