JohnSnowLabs / spark-nlp, build 10675586955

03 Sep 2024 02:30AM UTC coverage: 61.821% (-0.06%) from 61.884%

Pull Request #14379: SPARKNLP Introducing LLAMA 3
Merge 1f222af49 into 9285df8c6 (github, web-flow)

0 of 27 new or added lines in 3 files covered (0.0%).
15 existing lines in 11 files now uncovered.
8982 of 14529 relevant lines covered (61.82%).
0.62 hits per line.

Source File: /src/main/scala/com/johnsnowlabs/nlp/training/POS.scala (94.34% covered)
/*
 * Copyright 2017-2022 John Snow Labs
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.johnsnowlabs.nlp.training

import com.johnsnowlabs.nlp.util.io.OutputHelper
import com.johnsnowlabs.nlp.{Annotation, AnnotatorType}
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.functions.{col, concat_ws, udf}
import org.apache.spark.sql.types.MetadataBuilder
import org.apache.spark.sql.{Column, DataFrame, SparkSession}

import scala.collection.mutable.ArrayBuffer

private case class TaggedToken(token: String, tag: String)
private case class TaggedDocument(sentence: String, taggedTokens: Array[TaggedToken])
private case class Annotations(text: String, document: Array[Annotation], pos: Array[Annotation])

/** Helper class for creating DataFrames for training a part-of-speech tagger.
  *
  * The dataset needs to consist of sentences on each line, where each word is delimited with its
  * respective tag:
  *
  * {{{
  * Pierre|NNP Vinken|NNP ,|, 61|CD years|NNS old|JJ ,|, will|MD join|VB the|DT board|NN as|IN a|DT nonexecutive|JJ director|NN Nov.|NNP 29|CD .|.
  * }}}
  *
  * The sentence can then be parsed with [[readDataset]] into a column with annotations of type
  * `POS`.
  *
  * ==Example==
  * In this example, the file `test-training.txt` has the content of the sentence above.
  * {{{
  * import com.johnsnowlabs.nlp.training.POS
  *
  * val pos = POS()
  * val path = "src/test/resources/anc-pos-corpus-small/test-training.txt"
  * val posDf = pos.readDataset(spark, path, "|", "tags")
  *
  * posDf.selectExpr("explode(tags) as tags").show(false)
  * +---------------------------------------------+
  * |tags                                         |
  * +---------------------------------------------+
  * |[pos, 0, 5, NNP, [word -> Pierre], []]       |
  * |[pos, 7, 12, NNP, [word -> Vinken], []]      |
  * |[pos, 14, 14, ,, [word -> ,], []]            |
  * |[pos, 16, 17, CD, [word -> 61], []]          |
  * |[pos, 19, 23, NNS, [word -> years], []]      |
  * |[pos, 25, 27, JJ, [word -> old], []]         |
  * |[pos, 29, 29, ,, [word -> ,], []]            |
  * |[pos, 31, 34, MD, [word -> will], []]        |
  * |[pos, 36, 39, VB, [word -> join], []]        |
  * |[pos, 41, 43, DT, [word -> the], []]         |
  * |[pos, 45, 49, NN, [word -> board], []]       |
  * |[pos, 51, 52, IN, [word -> as], []]          |
  * |[pos, 54, 54, DT, [word -> a], []]           |
  * |[pos, 56, 67, JJ, [word -> nonexecutive], []]|
  * |[pos, 69, 76, NN, [word -> director], []]    |
  * |[pos, 78, 81, NNP, [word -> Nov.], []]       |
  * |[pos, 83, 84, CD, [word -> 29], []]          |
  * |[pos, 86, 86, ., [word -> .], []]            |
  * +---------------------------------------------+
  * }}}
  */
case class POS() {

  /*
   * Add Metadata annotationType to output DataFrame
   * NOTE: This should be replaced by an existing function once it's accessible in the next release
   * */

  def wrapColumnMetadata(col: Column, annotatorType: String, outPutColName: String): Column = {
    val metadataBuilder: MetadataBuilder = new MetadataBuilder()
    metadataBuilder.putString("annotatorType", annotatorType)
    col.as(outPutColName, metadataBuilder.build)
  }

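  /* Illustration (added note, not in the original file): downstream Spark NLP
   * annotators identify input columns by this metadata key, so e.g.
   *   wrapColumnMetadata(col("pos"), AnnotatorType.POS, "tags")
   * would return the column renamed to "tags" with {"annotatorType": "pos"}
   * in its schema metadata.
   */
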
  /*
   * This section helps users convert text files in token|tag format into a DataFrame
   * with POS annotations for training PerceptronApproach
   * */

  private def createDocumentAnnotation(sentence: String) = {
    Array(
      Annotation(
        AnnotatorType.DOCUMENT,
        0,
        sentence.length - 1,
        sentence,
        Map.empty[String, String]))
  }

  private def createPosAnnotation(sentence: String, taggedTokens: Array[TaggedToken]) = {
    var lastBegin = 0
    taggedTokens.map { case TaggedToken(token, tag) =>
      val tokenBegin = sentence.indexOf(token, lastBegin)
      val a = Annotation(
        AnnotatorType.POS,
        tokenBegin,
        tokenBegin + token.length - 1,
        tag,
        Map("word" -> token))
      lastBegin += token.length
      a
    }
  }

  private def lineToTaggedDocument(line: String, delimiter: String) = {

    /*
    TODO: improve the performance of regex group
    val splitted = line.replaceAll(s"(?:${delimiter.head}\\w+)+(\\s)", "$0##$1").split("##").map(_.trim)
     */
    val splitted = line.split(" ").map(_.trim)

    val tokenTags = splitted.flatMap(token => {
      val tokenTag = token.split(delimiter.head).map(_.trim)
      if (tokenTag.exists(_.isEmpty) || tokenTag.length != 2)
        // Ignore broken pairs or pairs with delimiter char
        None
      else
        Some(TaggedToken(tokenTag.head, tokenTag.last))
    })
    TaggedDocument(tokenTags.map(_.token).mkString(" "), tokenTags)
  }

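  /* Illustration (added note, not in the original file): with delimiter "|", the line
   *   Pierre|NNP Vinken|NNP ,|,
   * parses to
   *   TaggedDocument("Pierre Vinken ,",
   *     Array(TaggedToken("Pierre", "NNP"), TaggedToken("Vinken", "NNP"), TaggedToken(",", ",")))
   * while malformed pairs such as "Pierre|" or "Pierre" are silently dropped.
   */
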
  /** Reads the provided dataset file with given parameters and returns a DataFrame ready for
    * training a part-of-speech tagger.
    *
    * @param sparkSession
    *   Current Spark session
    * @param path
    *   Path to the resource
    * @param delimiter
    *   Delimiter used to separate words from their tags in the text
    * @param outputPosCol
    *   Name for the output column of the part-of-speech tags
    * @param outputDocumentCol
    *   Name for the [[com.johnsnowlabs.nlp.base.DocumentAssembler DocumentAssembler]] column
    * @param outputTextCol
    *   Name for the column of the raw text
    * @return
    *   DataFrame of parsed text
    */
  def readDataset(
      sparkSession: SparkSession,
      path: String,
      delimiter: String = "|",
      outputPosCol: String = "tags",
      outputDocumentCol: String = "document",
      outputTextCol: String = "text"): DataFrame = {
    import sparkSession.implicits._

    require(delimiter.length == 1, s"Delimiter must be one character long. Received $delimiter")

    val dataset = sparkSession.read
      .textFile(OutputHelper.parsePath(path))
      .filter(_.nonEmpty)
      .map(line => lineToTaggedDocument(line, delimiter))
      .map { case TaggedDocument(sentence, taggedTokens) =>
        Annotations(
          sentence,
          createDocumentAnnotation(sentence),
          createPosAnnotation(sentence, taggedTokens))
      }

    dataset
      .withColumnRenamed("text", outputTextCol)
      .withColumn(
        outputDocumentCol,
        wrapColumnMetadata(dataset("document"), AnnotatorType.DOCUMENT, outputDocumentCol))
      .withColumn(
        outputPosCol,
        wrapColumnMetadata(dataset("pos"), AnnotatorType.POS, outputPosCol))
      .select(outputTextCol, outputDocumentCol, outputPosCol)
  }

  // For testing purposes when there is an array of tokens and an array of labels
  def readFromDataframe(
      posDataframe: DataFrame,
      tokensCol: String = "tokens",
      labelsCol: String = "labels",
      outPutDocColName: String = "text",
      outPutPosColName: String = "tags"): DataFrame = {
    def annotatorType: String = AnnotatorType.POS

    def annotateTokensTags: UserDefinedFunction = udf {
      (tokens: Seq[String], tags: Seq[String], text: String) =>
        lazy val strTokens = tokens.mkString("#")
        lazy val strPosTags = tags.mkString("#")

        require(
          tokens.length == tags.length,
          s"Cannot train from DataFrame since there" +
            s" is a row with a different number of tags and tokens:\n$strTokens\n$strPosTags")

        val tokenTagAnnotation: ArrayBuffer[Annotation] = ArrayBuffer()
        def annotatorType: String = AnnotatorType.POS
        var lastIndex = 0

        for ((e, i) <- tokens.zipWithIndex) {

          val beginOfToken = text.indexOfSlice(e, lastIndex)
          val endOfToken = (beginOfToken + e.length) - 1

          val fullPOSAnnotatorStruct = new Annotation(
            annotatorType = annotatorType,
            begin = beginOfToken,
            end = endOfToken,
            result = tags(i),
            metadata = Map("word" -> e))
          tokenTagAnnotation += fullPOSAnnotatorStruct
          // Advance past the matched token so repeated tokens resolve to later offsets
          lastIndex = beginOfToken + e.length
        }
        tokenTagAnnotation
    }

    val tempDataFrame = posDataframe
      .withColumn(outPutDocColName, concat_ws(" ", col(tokensCol)))
      .withColumn(
        outPutPosColName,
        annotateTokensTags(col(tokensCol), col(labelsCol), col(outPutDocColName)))
      .drop(tokensCol, labelsCol)

    tempDataFrame.withColumn(
      outPutPosColName,
      wrapColumnMetadata(tempDataFrame(outPutPosColName), annotatorType, outPutPosColName))
  }
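
  /* Usage sketch (illustrative, not part of the original file), assuming a
   * DataFrame with array columns matching the default names:
   *   import spark.implicits._
   *   val df = Seq((Seq("Pierre", "Vinken"), Seq("NNP", "NNP"))).toDF("tokens", "labels")
   *   val posDf = POS().readFromDataframe(df)
   *   posDf.selectExpr("explode(tags)").show(false)
   */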

}
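
A minimal end-to-end training sketch (illustrative, not part of the file above), assuming the spark-nlp DocumentAssembler, SentenceDetector, Tokenizer, and PerceptronApproach annotators together with a Spark ML Pipeline:

import com.johnsnowlabs.nlp.DocumentAssembler
import com.johnsnowlabs.nlp.annotator.{SentenceDetector, Tokenizer, PerceptronApproach}
import com.johnsnowlabs.nlp.training.POS
import org.apache.spark.ml.Pipeline

// Read the token|tag corpus into text/document/tags columns
val trainingDf = POS().readDataset(spark, "src/test/resources/anc-pos-corpus-small/test-training.txt", "|", "tags")

val documentAssembler = new DocumentAssembler().setInputCol("text").setOutputCol("document")
val sentence = new SentenceDetector().setInputCols("document").setOutputCol("sentence")
val tokenizer = new Tokenizer().setInputCols("sentence").setOutputCol("token")

// PerceptronApproach trains on the "tags" POS column produced by readDataset
val posTagger = new PerceptronApproach()
  .setInputCols("document", "token")
  .setOutputCol("pos")
  .setPosColumn("tags")

val model = new Pipeline()
  .setStages(Array(documentAssembler, sentence, tokenizer, posTagger))
  .fit(trainingDf)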