13883000244

Committed 16 Mar 2025 11:44AM UTC coverage: 59.034% (-1.0%) from 60.072%

Build # 13883000244

Build Type

Pull #14444

github

Committed by

web-flow

Commit Message

Merge 6d717703b into 05000ab4a

Pull Request Pull Request #14444: Sparknlp 1060 implement phi 3.5 vision

Run Details

0 of 292 new or added lines in 5 files covered. (0.0%)

20 existing lines in 14 files now uncovered.

9413 of 15945 relevant lines covered (59.03%)

0.59 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0

/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/Phi3VisionTokenizer.scala

/*
 * Copyright 2017-2022 John Snow Labs
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.johnsnowlabs.nlp.annotators.tokenizer.bpe

import com.johnsnowlabs.nlp.annotators.common.IndexedToken

import java.nio.charset.Charset
import scala.collection.mutable.ListBuffer
import scala.util.matching.Regex
import scala.collection.mutable

/** Byte-level BPE tokenizer used for Phi-3 Vision.
  *
  * Text is split into pre-tokens with `splitPatternRegex`; every pre-token is then rewritten
  * byte by byte into the printable alphabet defined by `bytesToUnicodeMapping` before the BPE
  * merges of the parent [[BpeTokenizer]] are applied. `decodeTokens` performs the reverse
  * direction for generated token ids.
  *
  * @param merges
  *   BPE merge table, forwarded to [[BpeTokenizer]]
  * @param vocab
  *   token string to token id mapping, forwarded to [[BpeTokenizer]] and inverted for decoding
  * @param specialTokens
  *   special tokens, forwarded to [[BpeTokenizer]]; tokens contained in it are dropped by
  *   `decodeTokens`
  * @param padWithSequenceTokens
  *   forwarded unchanged to [[BpeTokenizer]]
  * @param prependString
  *   when non-empty it becomes `prefixForPieceId`, and `tokenizeSubText` no longer prepends a
  *   space to the text
  * @param addPrefixSpaceToSentence
  *   forwarded unchanged to [[BpeTokenizer]]
  * @param alwaysAddPrefix
  *   forwarded unchanged to [[BpeTokenizer]]
  * @param splitPatternRegex
  *   regular expression whose matches are the pre-tokens produced by `tokenizeSubText`
  */
class Phi3VisionTokenizer(
    merges: Map[(String, String), Int],
    vocab: Map[String, Int],
    specialTokens: SpecialTokens,
    padWithSequenceTokens: Boolean = true,
    prependString: String = "",
    addPrefixSpaceToSentence: Boolean = false,
    alwaysAddPrefix: Boolean = true,
    splitPatternRegex: Regex =
      raw"""(?i)(?:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""".r)
    extends BpeTokenizer(
      merges,
      vocab,
      specialTokens,
      padWithSequenceTokens,
      addPrefixSpaceToSentence,
      alwaysAddPrefix) {

  /** Mapping for bytes to a different set of unicode characters (especially white spaces). This
    * improved model performance for gpt-2
    *
    * The three printable ranges map to themselves; every other byte value in `0..255` is
    * assigned, in ascending order, the next code point starting at 256. Only the 256 real byte
    * values are mapped (the value 256 itself is not a byte and gets no entry).
    */
  protected val bytesToUnicodeMapping: Map[Int, String] = {
    val printable: Seq[Int] =
      (('!' to '~') ++ ('¡' to '¬') ++ ('®' to 'ÿ')).map(_.toInt)
    val printableSet: Set[Int] = printable.toSet

    // Bytes without a printable representation are shifted above the byte range.
    val shifted: Seq[(Int, Int)] =
      (0 until 256).filterNot(printableSet.contains).zipWithIndex.map { case (byte, n) =>
        byte -> (256 + n)
      }

    (printable.map(byte => byte -> byte) ++ shifted).map { case (byte, codePoint) =>
      byte -> codePoint.toChar.toString
    }.toMap
  }

  // Differs from Transformers, space is always prepended.
  // FIX: Space should not be prepended to all tokens, but to the beginning of the text only. Otherwise token
  // such as '.' get space prepended and they should not.
  override val prefixForPieceId: Option[String] =
    if (prependString.nonEmpty) Some(prependString) else None

  /** Inverse of `vocab`: token id to token string. */
  protected val decoderVocab: Map[Int, String] = vocab.map { case (piece, id) => (id, piece) }

  /** Inverse of `bytesToUnicodeMapping`: mapped character to byte value. */
  protected val unicodeToByteMapping: Map[String, Int] =
    bytesToUnicodeMapping.map { case (byte, char) => (char, byte) }

  /** Matches a byte-fallback piece such as `<0x0A>` and captures its two hex digits. */
  private val byteFallbackToken: Regex = "<0x([0-9A-Fa-f]{2})>".r

  /** Id of the vocabulary piece that `decodeTokens` turns back into a plain space.
    * NOTE(review): presumably the SentencePiece word-boundary marker of the Phi-3 vocabulary —
    * confirm against the model's tokenizer files.
    */
  private val spaceMarkerTokenId: Int = 29871

  /** Rewrites a pre-token into the byte-level alphabet: the token is encoded as UTF-8 and each
    * byte is replaced by its character from `bytesToUnicodeMapping`.
    *
    * @param token
    *   pre-token as produced by `tokenizeSubText`
    * @return
    *   the token with one mapped character per UTF-8 byte
    */
  override def preProcessTokenForBpe(token: String): String =
    token
      .getBytes("UTF-8")
      .map(b => bytesToUnicodeMapping(b & 0xff)) // JVM bytes are signed; mask to 0..255
      .mkString

  val splitPattern: Regex = splitPatternRegex

  /** Splits `text` into pre-tokens using `splitPattern`.
    *
    * A single space is prepended to the text unless a `prefixForPieceId` is defined or the text
    * already starts with a space.
    *
    * @param text
    *   text to split
    * @param indexOffset
    *   value added to the begin and end index of every produced token
    * @return
    *   the matches of `splitPattern` as indexed tokens (end index inclusive)
    */
  override def tokenizeSubText(text: String, indexOffset: Int): Array[IndexedToken] = {
    // split pattern based on gpt2's bpe tokenizer
    // NOTE(review): when the space is prepended, match positions refer to the padded string and
    // are therefore shifted by one relative to `text` — confirm that callers expect this.
    val input =
      if (prefixForPieceId.isDefined || text.startsWith(" ")) text
      else " " + text // Prepend space to the beginning of text

    splitPattern
      .findAllMatchIn(input)
      .map(tok => IndexedToken(tok.matched, tok.start + indexOffset, tok.end + indexOffset - 1))
      .toArray
  }

  /** Converts token ids back into text.
    *
    * Special tokens are dropped. Consecutive byte-fallback pieces (`<0xNN>`) are collected and
    * decoded together as UTF-8, so characters that span several bytes are restored correctly.
    * Finally the piece with id `spaceMarkerTokenId` (if the vocabulary has one) is replaced
    * literally by a space and the result is trimmed.
    *
    * @param tokens
    *   token ids to decode
    * @return
    *   the decoded, trimmed text
    * @throws NoSuchElementException
    *   if an id in `tokens` is not part of the vocabulary
    */
  def decodeTokens(tokens: Array[Int]): String = {
    val utf8 = Charset.forName("UTF-8")
    val decoded = new mutable.StringBuilder()
    val pendingBytes = mutable.ArrayBuffer.empty[Byte]

    // Emits the bytes gathered so far as one UTF-8 decoded string.
    def flushPendingBytes(): Unit =
      if (pendingBytes.nonEmpty) {
        decoded.append(new String(pendingBytes.toArray, utf8))
        pendingBytes.clear()
      }

    tokens.foreach { token =>
      decoderVocab(token) match {
        case special if specialTokens.contains(special) =>
          flushPendingBytes() // a special token ends any running byte sequence
        case byteFallbackToken(hex) =>
          pendingBytes += Integer.parseInt(hex, 16).toByte
        case piece =>
          flushPendingBytes()
          decoded.append(piece)
      }
    }
    flushPendingBytes()

    val text = decoded.toString()
    // Literal replacement: the marker is vocabulary data, not a regular expression.
    decoderVocab
      .get(spaceMarkerTokenId)
      .fold(text)(marker => text.replace(marker, " "))
      .trim()
  }
}

1	/*
2	* Copyright 2017-2022 John Snow Labs
3	*
4	* Licensed under the Apache License, Version 2.0 (the "License");
5	* you may not use this file except in compliance with the License.
6	* You may obtain a copy of the License at
7	*
8	* http://www.apache.org/licenses/LICENSE-2.0
9	*
10	* Unless required by applicable law or agreed to in writing, software
11	* distributed under the License is distributed on an "AS IS" BASIS,
12	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13	* See the License for the specific language governing permissions and
14	* limitations under the License.
15	*/
16
17	package com.johnsnowlabs.nlp.annotators.tokenizer.bpe
18
19	import com.johnsnowlabs.nlp.annotators.common.IndexedToken
20
21	import java.nio.charset.Charset
22	import scala.collection.mutable.ListBuffer
23	import scala.util.matching.Regex
24	import scala.collection.mutable
25
26	class Phi3VisionTokenizer(
27	merges: Map[(String, String), Int],
28	vocab: Map[String, Int],
29	specialTokens: SpecialTokens,
30	padWithSequenceTokens: Boolean = true,
31	prependString: String = "",
32	addPrefixSpaceToSentence: Boolean = false,
33	alwaysAddPrefix: Boolean = true,
34	splitPatternRegex: Regex =
35	raw"""(?i)(?:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""".r)
36	extends BpeTokenizer(
37	merges,
38	vocab,
39	specialTokens,
40	padWithSequenceTokens,
41	addPrefixSpaceToSentence,
42	alwaysAddPrefix) {
43
44	/** Mapping for bytes to a different set of unicode characters (especially white spaces). This
45	* improved model performance for gpt-2
46	*/
47	protected val bytesToUnicodeMapping: Map[Int, String] = {
48	val bytes: ListBuffer[Int] =
NEW 49	ListBuffer.range('!', '~' + 1) ++ ListBuffer.range('¡', '¬' + 1) ++ ListBuffer	×
NEW 50	.range('®', 'ÿ' + 1)	×
NEW 51	val characters: ListBuffer[Int] = bytes.clone	×
NEW 52	var n = 0	×
NEW 53	for (b <- 0 to 256) {	×
NEW 54	if (!bytes.contains(b)) {	×
NEW 55	bytes += b	×
NEW 56	characters += (256 + n)	×
NEW 57	n += 1	×
58	}
59	}
NEW 60	(bytes zip characters.map(_.toChar.toString)).toMap	×
61	}
62
63	// Differs from Transformers, space is always prepended.
64	// FIX: Space should not be prepended to all tokens, but to the beginning of the text only. Otherwise token
65	// such as '.' get space prepended and they should not.
66	override val prefixForPieceId: Option[String] =
NEW 67	if (prependString.nonEmpty) Some(prependString) else None	×
68
NEW 69	protected val decoderVocab: Map[Int, String] = vocab.map(x => (x._2, x._1))	×
70
71	protected val unicodeToByteMapping: Map[String, Int] =
NEW 72	bytesToUnicodeMapping.map(x => (x._2, x._1))	×
73
74	override def preProcessTokenForBpe(token: String): String = {
75	token
NEW 76	.getBytes("UTF-8")	×
NEW 77	.map { b => if (b < 0) 256 + b else b }	×
NEW 78	.foldLeft("")(_ + bytesToUnicodeMapping(_))	×
79	}
80
NEW 81	val splitPattern: Regex = splitPatternRegex	×
82
83	override def tokenizeSubText(text: String, indexOffset: Int): Array[IndexedToken] = {
84	// split pattern based on gpt2's bpe tokenizer
85	splitPattern
NEW 86	.findAllMatchIn(if (prefixForPieceId.isDefined || text.startsWith(" ")) text	×
NEW 87	else " " + text) // Prepend space to the beginning of text	×
NEW 88	.map(tok => IndexedToken(tok.matched, tok.start + indexOffset, tok.end + indexOffset - 1))	×
NEW 89	.toArray	×
90	}
91
92	def decodeTokens(tokens: Array[Int]): String = {
NEW 93	val decoded = new mutable.StringBuilder()	×
NEW 94	tokens.foreach { token =>	×
95	{
NEW 96	val decodedToken = decoderVocab(token)	×
NEW 97	if (!specialTokens.contains(decodedToken)) {	×
NEW 98	if (decodedToken.startsWith("<0x") && decodedToken.endsWith(">")) {	×
NEW 99	val strippedHex = decodedToken.replaceAll("<0x|>", "")	×
NEW 100	val byteValue = Integer.parseInt(strippedHex, 16)	×
NEW 101	decoded.append(byteValue.toChar)	×
102	} else {
NEW 103	decoded.append(decodedToken)	×
104	}
105	}
106	}
107
108	}
NEW 109	decoded.toString().replaceAll(decoderVocab(29871), " ").trim()	×
110	}
111	}

JohnSnowLabs / spark-nlp / 13883000244

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous