• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

JohnSnowLabs / spark-nlp / 4413868535

pending completion
4413868535

push

github

GitHub
SPARKNLP-746: Handle empty validation sets (#13615)

8597 of 12936 relevant lines covered (66.46%)

0.66 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0
/src/main/scala/com/johnsnowlabs/nlp/annotators/common/WordpieceTokenized.scala
1
/*
2
 * Copyright 2017-2022 John Snow Labs
3
 *
4
 * Licensed under the Apache License, Version 2.0 (the "License");
5
 * you may not use this file except in compliance with the License.
6
 * You may obtain a copy of the License at
7
 *
8
 *    http://www.apache.org/licenses/LICENSE-2.0
9
 *
10
 * Unless required by applicable law or agreed to in writing, software
11
 * distributed under the License is distributed on an "AS IS" BASIS,
12
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
 * See the License for the specific language governing permissions and
14
 * limitations under the License.
15
 */
16

17
package com.johnsnowlabs.nlp.annotators.common
18

19
import com.johnsnowlabs.nlp.{Annotation, AnnotatorType}
20

21
import scala.collection.Map
22

23
/** Converts between flat WORDPIECE [[Annotation]]s and per-sentence
  * [[WordpieceTokenizedSentence]] groupings.
  */
object WordpieceTokenized extends Annotated[WordpieceTokenizedSentence] {

  override def annotatorType: String = AnnotatorType.WORDPIECE

  /** Regroups wordpiece annotations by the sentence that contains them.
    *
    * @param annotations
    *   mixed annotations; only those with `annotatorType == WORDPIECE` are used,
    *   sentence boundaries are recovered via [[SentenceSplit.unpack]]
    * @return
    *   one [[WordpieceTokenizedSentence]] per sentence that contains at least one
    *   wordpiece token (sentences with no tokens are dropped)
    */
  override def unpack(annotations: Seq[Annotation]): Seq[WordpieceTokenizedSentence] = {
    val tokens = annotations
      .filter(_.annotatorType == annotatorType)
      .toArray

    SentenceSplit
      .unpack(annotations)
      .map { sentence =>
        tokens
          // && (short-circuit) instead of the original non-short-circuiting `&`;
          // token must fall entirely within the sentence's character span
          .filter(token => token.begin >= sentence.start && token.end <= sentence.end)
          .map(token =>
            TokenPiece(
              wordpiece = token.result,
              // NOTE(review): assumes metadata always carries these keys with
              // parseable values — a missing key or bad number will throw here
              token = token.metadata("token"),
              pieceId = token.metadata("pieceId").toInt,
              isWordStart = token.metadata("isWordStart").toBoolean,
              begin = token.begin,
              end = token.end))
      }
      .filter(_.nonEmpty)
      .map(WordpieceTokenizedSentence(_))
  }

  /** Flattens sentences back into WORDPIECE [[Annotation]]s.
    *
    * Each token becomes one annotation carrying `sentence`, `isWordStart`,
    * `pieceId` and `token` metadata. Sentence numbering is 1-based, matching
    * the original mutable-counter implementation.
    */
  override def pack(sentences: Seq[WordpieceTokenizedSentence]): Seq[Annotation] = {
    sentences.zipWithIndex.flatMap { case (sentence, idx) =>
      val sentenceIndex = idx + 1 // original counter was incremented before use
      sentence.tokens.map { token =>
        Annotation(
          annotatorType,
          token.begin,
          token.end,
          token.wordpiece,
          Map(
            "sentence" -> sentenceIndex.toString,
            "isWordStart" -> token.isWordStart.toString,
            "pieceId" -> token.pieceId.toString,
            "token" -> token.token))
      }
    }
  }
}
71

72
/** A sentence represented as the array of its wordpiece sub-token pieces. */
case class WordpieceTokenizedSentence(tokens: Array[TokenPiece])
73
/** A single wordpiece sub-token.
  *
  * @param wordpiece
  *   the wordpiece text (e.g. a "##"-prefixed continuation piece)
  * @param token
  *   the original full token this piece was split from
  * @param pieceId
  *   vocabulary id of the wordpiece
  * @param isWordStart
  *   true when this piece begins a new word rather than continuing one
  * @param begin
  *   start character offset in the source text
  * @param end
  *   end character offset in the source text
  */
case class TokenPiece(
    wordpiece: String,
    token: String,
    pieceId: Int,
    isWordStart: Boolean,
    begin: Int,
    end: Int)
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc