• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

JohnSnowLabs / spark-nlp / 15252839065

26 May 2025 11:30AM UTC coverage: 52.115% (-0.6%) from 52.715%
15252839065

Pull #14585

github

web-flow
Merge 625e5c10f into 56512b006
Pull Request #14585: SparkNLP 1131 - Introducing Florence-2

0 of 199 new or added lines in 4 files covered. (0.0%)

50 existing lines in 33 files now uncovered.

9931 of 19056 relevant lines covered (52.11%)

0.52 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0
/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/Florence2Tokenizer.scala
1
/*
2
 * Copyright 2017-2025 John Snow Labs
3
 *
4
 * Licensed under the Apache License, Version 2.0 (the "License");
5
 * you may not use this file except in compliance with the License.
6
 * You may obtain a copy of the License at
7
 *
8
 *    http://www.apache.org/licenses/LICENSE-2.0
9
 *
10
 * Unless required by applicable law or agreed to in writing, software
11
 * distributed under the License is distributed on an "AS IS" BASIS,
12
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
 * See the License for the specific language governing permissions and
14
 * limitations under the License.
15
 */
16

17
package com.johnsnowlabs.nlp.annotators.tokenizer.bpe
18

19
import com.johnsnowlabs.nlp.annotators.common.IndexedToken
20

21
import java.nio.charset.Charset
22
import scala.collection.mutable.ListBuffer
23
import scala.util.matching.Regex
24

25
/** Byte-level BPE tokenizer for the Florence-2 model, following GPT-2's scheme.
  *
  * Text is first split with a GPT-2-style regex, each piece is mapped byte-by-byte to a
  * printable unicode alphabet (see [[bytesToUnicodeMapping]]), and BPE merges are then applied
  * by the parent [[BpeTokenizer]].
  *
  * @param merges
  *   BPE merge ranks: pair of sub-tokens -> priority (lower merges first)
  * @param vocab
  *   token string -> token id
  * @param specialTokens
  *   special tokens (bos/eos/pad/...) handled outside of BPE
  * @param padWithSequenceTokens
  *   whether to surround sentences with sequence start/end tokens
  * @param prependString
  *   marker prepended to word-initial pieces ("Ġ" encodes a leading space in GPT-2 vocabs)
  * @param addPrefixSpaceToSentence
  *   whether a space is added in front of the whole sentence before tokenizing
  * @param alwaysAddPrefix
  *   whether the prefix marker is always added to the first piece
  * @param splitPatternRegex
  *   pre-tokenization split pattern (default is GPT-2's pattern)
  */
class Florence2Tokenizer(
    merges: Map[(String, String), Int],
    vocab: Map[String, Int],
    specialTokens: SpecialTokens,
    padWithSequenceTokens: Boolean = true,
    prependString: String = "Ġ",
    addPrefixSpaceToSentence: Boolean = false,
    alwaysAddPrefix: Boolean = true,
    splitPatternRegex: Regex =
      raw"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""".r)
    extends BpeTokenizer(
      merges,
      vocab,
      specialTokens,
      padWithSequenceTokens,
      addPrefixSpaceToSentence,
      alwaysAddPrefix) {

  /** Mapping for bytes to a different set of unicode characters (especially white spaces). This
    * improved model performance for gpt-2.
    *
    * Printable bytes map to themselves; the remaining byte values are assigned consecutive code
    * points starting at 256, so every byte has a visible, non-whitespace representation.
    */
  protected val bytesToUnicodeMapping: Map[Int, String] = {
    val bytes: ListBuffer[Int] =
      ListBuffer.range('!', '~' + 1) ++ ListBuffer.range('¡', '¬' + 1) ++ ListBuffer
        .range('®', 'ÿ' + 1)
    val characters: ListBuffer[Int] = bytes.clone
    var n = 0
    // FIX: byte values are 0..255, so iterate `0 until 256`. The previous `0 to 256`
    // added a spurious entry for the impossible byte value 256 (the reference GPT-2
    // bytes_to_unicode iterates range(2**8)). Entries for real bytes are unchanged.
    for (b <- 0 until 256) {
      if (!bytes.contains(b)) {
        bytes += b
        characters += (256 + n)
        n += 1
      }
    }
    (bytes zip characters.map(_.toChar.toString)).toMap
  }

  // Differs from Transformers, space is always prepended.
  // FIX: Space should not be prepended to all tokens, but to the beginning of the text only. Otherwise token
  // such as '.' get space prepended and they should not.
  override val prefixForPieceId: Option[String] =
    if (prependString.nonEmpty) Some(prependString) else None

  /** Inverse of `vocab`: token id -> token string, used when decoding. */
  protected val decoderVocab: Map[Int, String] = vocab.map(x => (x._2, x._1))

  /** Inverse of [[bytesToUnicodeMapping]]: unicode representation -> original byte value. */
  protected val unicodeToByteMapping: Map[String, Int] =
    bytesToUnicodeMapping.map(x => (x._2, x._1))

  /** Encodes a raw token into its byte-level unicode representation before BPE is applied.
    * JVM bytes are signed, so negative values are shifted by 256 to recover the 0..255 range.
    */
  override def preProcessTokenForBpe(token: String): String = {
    token
      .getBytes("UTF-8")
      .map { b => if (b < 0) 256 + b else b }
      .foldLeft("")(_ + bytesToUnicodeMapping(_))
  }

  val splitPattern: Regex = splitPatternRegex

  /** Splits a sentence chunk into indexed tokens using the GPT-2 split pattern.
    *
    * A leading space is prepended only when no prefix marker is configured and the text does
    * not already start with one, so word-initial pieces pick up the "Ġ" marker consistently.
    *
    * @param indexOffset
    *   offset of `text` within the full document, added to each token's begin/end index
    */
  override def tokenizeSubText(text: String, indexOffset: Int): Array[IndexedToken] = {
    // split pattern based on gpt2's bpe tokenizer
    splitPattern
      .findAllMatchIn(if (prefixForPieceId.isDefined || text.startsWith(" ")) text
      else " " + text) // Prepend space to the beginning of text
      .map(tok => IndexedToken(tok.matched, tok.start + indexOffset, tok.end + indexOffset - 1))
      .toArray
  }

  /** Decodes token ids back into text: looks each id up in the vocabulary, then reverses the
    * byte-level unicode mapping and re-assembles the UTF-8 bytes.
    *
    * NOTE(review): special tokens are currently NOT filtered out (the filter is commented
    * out), so they appear verbatim in the decoded string — confirm this is intended.
    */
  def decodeTokens(tokens: Array[Int]): String = {
    val text = tokens
      .map(token => decoderVocab(token))
//      .filter(x => !specialTokens.contains(x))
      .mkString("")
    val bytes = text.map(x => unicodeToByteMapping(x.toString)).map(x => x.toByte).toArray
    new String(bytes, Charset.forName("UTF-8"))
  }
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc