• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

JohnSnowLabs / spark-nlp / 15252839065

26 May 2025 11:30AM UTC coverage: 52.115% (-0.6%) from 52.715%
15252839065

Pull #14585

github

web-flow
Merge 625e5c10f into 56512b006
Pull Request #14585: SparkNLP 1131 - Introducing Florence-2

0 of 199 new or added lines in 4 files covered. (0.0%)

50 existing lines in 33 files now uncovered.

9931 of 19056 relevant lines covered (52.11%)

0.52 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0
/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/Florence2Tokenizer.scala
1
/*
2
 * Copyright 2017-2025 John Snow Labs
3
 *
4
 * Licensed under the Apache License, Version 2.0 (the "License");
5
 * you may not use this file except in compliance with the License.
6
 * You may obtain a copy of the License at
7
 *
8
 *    http://www.apache.org/licenses/LICENSE-2.0
9
 *
10
 * Unless required by applicable law or agreed to in writing, software
11
 * distributed under the License is distributed on an "AS IS" BASIS,
12
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
 * See the License for the specific language governing permissions and
14
 * limitations under the License.
15
 */
16

17
package com.johnsnowlabs.nlp.annotators.tokenizer.bpe
18

19
import com.johnsnowlabs.nlp.annotators.common.IndexedToken
20

21
import java.nio.charset.Charset
22
import scala.collection.mutable.ListBuffer
23
import scala.util.matching.Regex
24

25
/** Byte-level BPE tokenizer for the Florence-2 model, following GPT-2's scheme.
  *
  * Text is first split with a GPT-2-style regex, each piece is mapped byte-by-byte to a
  * printable unicode alphabet (see [[bytesToUnicodeMapping]]), and BPE merges are then applied
  * by the parent [[BpeTokenizer]].
  *
  * @param merges
  *   BPE merge ranks: pair of sub-tokens -> priority (lower merges first)
  * @param vocab
  *   token string -> token id
  * @param specialTokens
  *   special tokens (bos/eos/pad/...) handled outside of BPE
  * @param padWithSequenceTokens
  *   whether to surround sentences with sequence start/end tokens
  * @param prependString
  *   marker prepended to word-initial pieces ("Ġ" encodes a leading space in GPT-2 vocabs)
  * @param addPrefixSpaceToSentence
  *   whether a space is added in front of the whole sentence before tokenizing
  * @param alwaysAddPrefix
  *   whether the prefix marker is always added to the first piece
  * @param splitPatternRegex
  *   pre-tokenization split pattern (default is GPT-2's pattern)
  */
class Florence2Tokenizer(
    merges: Map[(String, String), Int],
    vocab: Map[String, Int],
    specialTokens: SpecialTokens,
    padWithSequenceTokens: Boolean = true,
    prependString: String = "Ġ",
    addPrefixSpaceToSentence: Boolean = false,
    alwaysAddPrefix: Boolean = true,
    splitPatternRegex: Regex =
      raw"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""".r)
    extends BpeTokenizer(
      merges,
      vocab,
      specialTokens,
      padWithSequenceTokens,
      addPrefixSpaceToSentence,
      alwaysAddPrefix) {

  /** Mapping for bytes to a different set of unicode characters (especially white spaces). This
    * improved model performance for gpt-2.
    *
    * Printable bytes map to themselves; the remaining byte values are assigned consecutive code
    * points starting at 256, so every byte has a visible, non-whitespace representation.
    */
  protected val bytesToUnicodeMapping: Map[Int, String] = {
    val bytes: ListBuffer[Int] =
      ListBuffer.range('!', '~' + 1) ++ ListBuffer.range('¡', '¬' + 1) ++ ListBuffer
        .range('®', 'ÿ' + 1)
    val characters: ListBuffer[Int] = bytes.clone
    var n = 0
    // FIX: byte values are 0..255, so iterate `0 until 256`. The previous `0 to 256`
    // added a spurious entry for the impossible byte value 256 (the reference GPT-2
    // bytes_to_unicode iterates range(2**8)). Entries for real bytes are unchanged.
    for (b <- 0 until 256) {
      if (!bytes.contains(b)) {
        bytes += b
        characters += (256 + n)
        n += 1
      }
    }
    (bytes zip characters.map(_.toChar.toString)).toMap
  }

  // Differs from Transformers, space is always prepended.
  // FIX: Space should not be prepended to all tokens, but to the beginning of the text only. Otherwise token
  // such as '.' get space prepended and they should not.
  override val prefixForPieceId: Option[String] =
    if (prependString.nonEmpty) Some(prependString) else None

  /** Inverse of `vocab`: token id -> token string, used when decoding. */
  protected val decoderVocab: Map[Int, String] = vocab.map(x => (x._2, x._1))

  /** Inverse of [[bytesToUnicodeMapping]]: unicode representation -> original byte value. */
  protected val unicodeToByteMapping: Map[String, Int] =
    bytesToUnicodeMapping.map(x => (x._2, x._1))

  /** Encodes a raw token into its byte-level unicode representation before BPE is applied.
    * JVM bytes are signed, so negative values are shifted by 256 to recover the 0..255 range.
    */
  override def preProcessTokenForBpe(token: String): String = {
    token
      .getBytes("UTF-8")
      .map { b => if (b < 0) 256 + b else b }
      .foldLeft("")(_ + bytesToUnicodeMapping(_))
  }

  val splitPattern: Regex = splitPatternRegex

  /** Splits a sentence chunk into indexed tokens using the GPT-2 split pattern.
    *
    * A leading space is prepended only when no prefix marker is configured and the text does
    * not already start with one, so word-initial pieces pick up the "Ġ" marker consistently.
    *
    * @param indexOffset
    *   offset of `text` within the full document, added to each token's begin/end index
    */
  override def tokenizeSubText(text: String, indexOffset: Int): Array[IndexedToken] = {
    // split pattern based on gpt2's bpe tokenizer
    splitPattern
      .findAllMatchIn(if (prefixForPieceId.isDefined || text.startsWith(" ")) text
      else " " + text) // Prepend space to the beginning of text
      .map(tok => IndexedToken(tok.matched, tok.start + indexOffset, tok.end + indexOffset - 1))
      .toArray
  }

  /** Decodes token ids back into text: looks each id up in the vocabulary, then reverses the
    * byte-level unicode mapping and re-assembles the UTF-8 bytes.
    *
    * NOTE(review): special tokens are currently NOT filtered out (the filter is commented
    * out), so they appear verbatim in the decoded string — confirm this is intended.
    */
  def decodeTokens(tokens: Array[Int]): String = {
    val text = tokens
      .map(token => decoderVocab(token))
//      .filter(x => !specialTokens.contains(x))
      .mkString("")
    val bytes = text.map(x => unicodeToByteMapping(x.toString)).map(x => x.toByte).toArray
    new String(bytes, Charset.forName("UTF-8"))
  }
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc