JohnSnowLabs / spark-nlp / 7340940136

27 Dec 2023 06:28PM UTC · coverage: 62.876% · first build (7340940136)

Pull Request #14112: Release/521 release candidate (via github / web-flow)
Merge 64ecc94ab into e9099b0f1

20 of 29 new or added lines in 6 files covered (68.97%)
8958 of 14247 relevant lines covered (62.88%)
0.63 hits per line

Source File: /src/main/scala/com/johnsnowlabs/nlp/HasBatchedAnnotate.scala (76.47% of relevant lines covered)

Per-line coverage markers: 1✔ = covered (1 hit), × = not covered, NEW = line added in this pull request.
/*
 * Copyright 2017-2022 John Snow Labs
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.johnsnowlabs.nlp

import org.apache.spark.ml.Model
import org.apache.spark.ml.param.IntParam
import org.apache.spark.sql.Row

trait HasBatchedAnnotate[M <: Model[M]] {

  this: RawAnnotator[M] =>

  /** Size of every batch (Default depends on model).
    *
    * @group param
    */
  val batchSize = new IntParam(this, "batchSize", "Size of every batch.")  // 1✔

  /** Size of every batch.
    *
    * @group setParam
    */
  def setBatchSize(size: Int): this.type = {
    val recommended = size
    require(recommended > 0, "batchSize must be greater than 0")  // ×
    set(this.batchSize, recommended)  // ×
  }

  /** Size of every batch.
    *
    * @group getParam
    */
  def getBatchSize: Int = $(batchSize)  // 1✔

  def batchProcess(rows: Iterator[_]): Iterator[Row] = {
    val groupedRows = rows.grouped(getBatchSize)  // 1✔

    groupedRows.flatMap {  // 1✔
      case batchRow: Seq[Row] => processBatchRows(batchRow)  // 1✔ NEW
      case singleRow: Row => processBatchRows(Seq(singleRow))  // × NEW
      case _ => Seq(Row.empty)  // ×
    }
  }

  private def processBatchRows(batchedRows: Seq[Row]): Seq[Row] = {
    val inputAnnotations = batchedRows.map(row => {  // 1✔
      getInputCols.flatMap(inputCol => {  // 1✔
        row.getAs[Seq[Row]](inputCol).map(Annotation(_))  // 1✔
      })
    })
    val outputAnnotations = batchAnnotate(inputAnnotations)  // 1✔
    batchedRows
      .zip(outputAnnotations)  // 1✔
      .map { case (row, annotations) =>  // 1✔
        row.toSeq ++ Array(annotations.map(a => Row(a.productIterator.toSeq: _*)))  // 1✔
      }
      .map(Row.fromSeq)  // 1✔
  }

  /** Takes a document and annotations and produces new annotations of this annotator's annotation
    * type
    *
    * @param batchedAnnotations
    *   Annotations in batches that correspond to inputAnnotationCols generated by previous
    *   annotators if any
    * @return
    *   any number of annotations processed for every batch of input annotations. Not necessarily
    *   a one-to-one relationship
    *
    * IMPORTANT: !MUST! return sequences of equal lengths !! IMPORTANT: !MUST! return sentences
    * that belong to the same original row !! (challenging)
    */
  def batchAnnotate(batchedAnnotations: Seq[Array[Annotation]]): Seq[Seq[Annotation]]

}
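
To make the batchAnnotate contract concrete (exactly one output Seq[Annotation] per input row, in the original order, so processBatchRows can zip the results back onto their source rows), here is a minimal sketch of an implementer. ToyUppercaseAnnotator is a hypothetical name, not part of this file or of spark-nlp; the sketch assumes spark-nlp's AnnotatorModel base class supplies the remaining plumbing (transform, transformSchema, copy), which may vary between versions.

import com.johnsnowlabs.nlp.{Annotation, AnnotatorModel, AnnotatorType, HasBatchedAnnotate}
import org.apache.spark.ml.util.Identifiable

// Hypothetical example: upper-cases every DOCUMENT annotation, processing partition
// rows in batches of `batchSize` via the inherited batchProcess machinery.
class ToyUppercaseAnnotator(override val uid: String)
    extends AnnotatorModel[ToyUppercaseAnnotator]
    with HasBatchedAnnotate[ToyUppercaseAnnotator] {

  def this() = this(Identifiable.randomUID("TOY_UPPERCASE"))

  override val inputAnnotatorTypes: Array[String] = Array(AnnotatorType.DOCUMENT)
  override val outputAnnotatorType: String = AnnotatorType.DOCUMENT

  setDefault(batchSize -> 4)

  // The outer Seq must line up element-for-element with batchedAnnotations: one result
  // sequence per input row, in the same order, so each row keeps its own annotations.
  override def batchAnnotate(
      batchedAnnotations: Seq[Array[Annotation]]): Seq[Seq[Annotation]] =
    batchedAnnotations.map { rowAnnotations =>
      rowAnnotations.toSeq.map(ann => ann.copy(result = ann.result.toUpperCase))
    }
}

In a Pipeline such an annotator would be configured like any other, e.g. new ToyUppercaseAnnotator().setInputCols("document").setOutputCol("upper").setBatchSize(8); batchProcess then groups each partition's rows into batches of getBatchSize before handing them to batchAnnotate.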