JohnSnowLabs / spark-nlp, build 10675586955

03 Sep 2024 02:30AM UTC coverage: 61.821% (-0.06%) from 61.884%

Pull Request #14379: SPARKNLP Introducing LLAMA 3
Merge 1f222af49 into 9285df8c6 (github, web-flow)

0 of 27 new or added lines in 3 files covered (0.0%).
15 existing lines in 11 files now uncovered.
8982 of 14529 relevant lines covered (61.82%).
0.62 hits per line.

Source File: /src/main/scala/com/johnsnowlabs/nlp/training/POS.scala (94.34% covered)
/*
 * Copyright 2017-2022 John Snow Labs
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.johnsnowlabs.nlp.training

import com.johnsnowlabs.nlp.util.io.OutputHelper
import com.johnsnowlabs.nlp.{Annotation, AnnotatorType}
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.functions.{col, concat_ws, udf}
import org.apache.spark.sql.types.MetadataBuilder
import org.apache.spark.sql.{Column, DataFrame, SparkSession}

import scala.collection.mutable.ArrayBuffer

private case class TaggedToken(token: String, tag: String)
private case class TaggedDocument(sentence: String, taggedTokens: Array[TaggedToken])
private case class Annotations(text: String, document: Array[Annotation], pos: Array[Annotation])

/** Helper class for creating DataFrames for training a part-of-speech tagger.
  *
  * The dataset needs to consist of sentences on each line, where each word is delimited with its
  * respective tag:
  *
  * {{{
  * Pierre|NNP Vinken|NNP ,|, 61|CD years|NNS old|JJ ,|, will|MD join|VB the|DT board|NN as|IN a|DT nonexecutive|JJ director|NN Nov.|NNP 29|CD .|.
  * }}}
  *
  * The sentence can then be parsed with [[readDataset]] into a column with annotations of type
  * `POS`.
  *
  * ==Example==
  * In this example, the file `test-training.txt` has the content of the sentence above.
  * {{{
  * import com.johnsnowlabs.nlp.training.POS
  *
  * val pos = POS()
  * val path = "src/test/resources/anc-pos-corpus-small/test-training.txt"
  * val posDf = pos.readDataset(spark, path, "|", "tags")
  *
  * posDf.selectExpr("explode(tags) as tags").show(false)
  * +---------------------------------------------+
  * |tags                                         |
  * +---------------------------------------------+
  * |[pos, 0, 5, NNP, [word -> Pierre], []]       |
  * |[pos, 7, 12, NNP, [word -> Vinken], []]      |
  * |[pos, 14, 14, ,, [word -> ,], []]            |
  * |[pos, 16, 17, CD, [word -> 61], []]          |
  * |[pos, 19, 23, NNS, [word -> years], []]      |
  * |[pos, 25, 27, JJ, [word -> old], []]         |
  * |[pos, 29, 29, ,, [word -> ,], []]            |
  * |[pos, 31, 34, MD, [word -> will], []]        |
  * |[pos, 36, 39, VB, [word -> join], []]        |
  * |[pos, 41, 43, DT, [word -> the], []]         |
  * |[pos, 45, 49, NN, [word -> board], []]       |
  * |[pos, 51, 52, IN, [word -> as], []]          |
  * |[pos, 54, 54, DT, [word -> a], []]           |
  * |[pos, 56, 67, JJ, [word -> nonexecutive], []]|
  * |[pos, 69, 76, NN, [word -> director], []]    |
  * |[pos, 78, 81, NNP, [word -> Nov.], []]       |
  * |[pos, 83, 84, CD, [word -> 29], []]          |
  * |[pos, 86, 86, ., [word -> .], []]            |
  * +---------------------------------------------+
  * }}}
  */
case class POS() {

  /*
   * Add Metadata annotationType to output DataFrame
   * NOTE: This should be replaced by an existing function once it's accessible in the next release
   * */

  def wrapColumnMetadata(col: Column, annotatorType: String, outPutColName: String): Column = {
    val metadataBuilder: MetadataBuilder = new MetadataBuilder()
    metadataBuilder.putString("annotatorType", annotatorType)
    col.as(outPutColName, metadataBuilder.build)
  }

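  /* Illustration (added note, not in the original file): downstream Spark NLP
   * annotators identify input columns by this metadata key, so e.g.
   *   wrapColumnMetadata(col("pos"), AnnotatorType.POS, "tags")
   * would return the column renamed to "tags" with {"annotatorType": "pos"}
   * in its schema metadata.
   */
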
  /*
   * This section helps users convert text files in token|tag format into a DataFrame
   * with POS annotations for training PerceptronApproach
   * */

  private def createDocumentAnnotation(sentence: String) = {
    Array(
      Annotation(
        AnnotatorType.DOCUMENT,
        0,
        sentence.length - 1,
        sentence,
        Map.empty[String, String]))
  }

  private def createPosAnnotation(sentence: String, taggedTokens: Array[TaggedToken]) = {
    var lastBegin = 0
    taggedTokens.map { case TaggedToken(token, tag) =>
      val tokenBegin = sentence.indexOf(token, lastBegin)
      val a = Annotation(
        AnnotatorType.POS,
        tokenBegin,
        tokenBegin + token.length - 1,
        tag,
        Map("word" -> token))
      lastBegin += token.length
      a
    }
  }

  private def lineToTaggedDocument(line: String, delimiter: String) = {

    /*
    TODO: improve the performance of regex group
    val splitted = line.replaceAll(s"(?:${delimiter.head}\\w+)+(\\s)", "$0##$1").split("##").map(_.trim)
     */
    val splitted = line.split(" ").map(_.trim)

    val tokenTags = splitted.flatMap(token => {
      val tokenTag = token.split(delimiter.head).map(_.trim)
      if (tokenTag.exists(_.isEmpty) || tokenTag.length != 2)
        // Ignore broken pairs or pairs with delimiter char
        None
      else
        Some(TaggedToken(tokenTag.head, tokenTag.last))
    })
    TaggedDocument(tokenTags.map(_.token).mkString(" "), tokenTags)
  }

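  /* Illustration (added note, not in the original file): with delimiter "|", the line
   *   Pierre|NNP Vinken|NNP ,|,
   * parses to
   *   TaggedDocument("Pierre Vinken ,",
   *     Array(TaggedToken("Pierre", "NNP"), TaggedToken("Vinken", "NNP"), TaggedToken(",", ",")))
   * while malformed pairs such as "Pierre|" or "Pierre" are silently dropped.
   */
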
  /** Reads the provided dataset file with given parameters and returns a DataFrame ready for
    * training a part-of-speech tagger.
    *
    * @param sparkSession
    *   Current Spark session
    * @param path
    *   Path to the resource
    * @param delimiter
    *   Delimiter used to separate words from their tags in the text
    * @param outputPosCol
    *   Name for the output column of the part-of-speech tags
    * @param outputDocumentCol
    *   Name for the [[com.johnsnowlabs.nlp.base.DocumentAssembler DocumentAssembler]] column
    * @param outputTextCol
    *   Name for the column of the raw text
    * @return
    *   DataFrame of parsed text
    */
  def readDataset(
      sparkSession: SparkSession,
      path: String,
      delimiter: String = "|",
      outputPosCol: String = "tags",
      outputDocumentCol: String = "document",
      outputTextCol: String = "text"): DataFrame = {
    import sparkSession.implicits._

    require(delimiter.length == 1, s"Delimiter must be one character long. Received $delimiter")

    val dataset = sparkSession.read
      .textFile(OutputHelper.parsePath(path))
      .filter(_.nonEmpty)
      .map(line => lineToTaggedDocument(line, delimiter))
      .map { case TaggedDocument(sentence, taggedTokens) =>
        Annotations(
          sentence,
          createDocumentAnnotation(sentence),
          createPosAnnotation(sentence, taggedTokens))
      }

    dataset
      .withColumnRenamed("text", outputTextCol)
      .withColumn(
        outputDocumentCol,
        wrapColumnMetadata(dataset("document"), AnnotatorType.DOCUMENT, outputDocumentCol))
      .withColumn(
        outputPosCol,
        wrapColumnMetadata(dataset("pos"), AnnotatorType.POS, outputPosCol))
      .select(outputTextCol, outputDocumentCol, outputPosCol)
  }

  // For testing purposes when there is an array of tokens and an array of labels
  def readFromDataframe(
      posDataframe: DataFrame,
      tokensCol: String = "tokens",
      labelsCol: String = "labels",
      outPutDocColName: String = "text",
      outPutPosColName: String = "tags"): DataFrame = {
    def annotatorType: String = AnnotatorType.POS

    def annotateTokensTags: UserDefinedFunction = udf {
      (tokens: Seq[String], tags: Seq[String], text: String) =>
        lazy val strTokens = tokens.mkString("#")
        lazy val strPosTags = tags.mkString("#")

        require(
          tokens.length == tags.length,
          s"Cannot train from DataFrame since there" +
            s" is a row with a different number of tags and tokens:\n$strTokens\n$strPosTags")

        val tokenTagAnnotation: ArrayBuffer[Annotation] = ArrayBuffer()
        def annotatorType: String = AnnotatorType.POS
        var lastIndex = 0

        for ((e, i) <- tokens.zipWithIndex) {

          val beginOfToken = text.indexOfSlice(e, lastIndex)
          val endOfToken = (beginOfToken + e.length) - 1

          val fullPOSAnnotatorStruct = new Annotation(
            annotatorType = annotatorType,
            begin = beginOfToken,
            end = endOfToken,
            result = tags(i),
            metadata = Map("word" -> e))
          tokenTagAnnotation += fullPOSAnnotatorStruct
          // Advance past the matched token so repeated tokens resolve to later offsets
          lastIndex = beginOfToken + e.length
        }
        tokenTagAnnotation
    }

    val tempDataFrame = posDataframe
      .withColumn(outPutDocColName, concat_ws(" ", col(tokensCol)))
      .withColumn(
        outPutPosColName,
        annotateTokensTags(col(tokensCol), col(labelsCol), col(outPutDocColName)))
      .drop(tokensCol, labelsCol)

    tempDataFrame.withColumn(
      outPutPosColName,
      wrapColumnMetadata(tempDataFrame(outPutPosColName), annotatorType, outPutPosColName))
  }
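
  /* Usage sketch (illustrative, not part of the original file), assuming a
   * DataFrame with array columns matching the default names:
   *   import spark.implicits._
   *   val df = Seq((Seq("Pierre", "Vinken"), Seq("NNP", "NNP"))).toDF("tokens", "labels")
   *   val posDf = POS().readFromDataframe(df)
   *   posDf.selectExpr("explode(tags)").show(false)
   */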

}
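
A minimal end-to-end training sketch (illustrative, not part of the file above), assuming the spark-nlp DocumentAssembler, SentenceDetector, Tokenizer, and PerceptronApproach annotators together with a Spark ML Pipeline:

import com.johnsnowlabs.nlp.DocumentAssembler
import com.johnsnowlabs.nlp.annotator.{SentenceDetector, Tokenizer, PerceptronApproach}
import com.johnsnowlabs.nlp.training.POS
import org.apache.spark.ml.Pipeline

// Read the token|tag corpus into text/document/tags columns
val trainingDf = POS().readDataset(spark, "src/test/resources/anc-pos-corpus-small/test-training.txt", "|", "tags")

val documentAssembler = new DocumentAssembler().setInputCol("text").setOutputCol("document")
val sentence = new SentenceDetector().setInputCols("document").setOutputCol("sentence")
val tokenizer = new Tokenizer().setInputCols("sentence").setOutputCol("token")

// PerceptronApproach trains on the "tags" POS column produced by readDataset
val posTagger = new PerceptronApproach()
  .setInputCols("document", "token")
  .setOutputCol("pos")
  .setPosColumn("tags")

val model = new Pipeline()
  .setStages(Array(documentAssembler, sentence, tokenizer, posTagger))
  .fit(trainingDf)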