• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

JohnSnowLabs / spark-nlp / 13883000244

16 Mar 2025 11:44AM UTC coverage: 59.034% (-1.0%) from 60.072%
13883000244

Pull #14444

github

web-flow
Merge 6d717703b into 05000ab4a
Pull Request #14444: Sparknlp 1060 implement phi 3.5 vision

0 of 292 new or added lines in 5 files covered. (0.0%)

20 existing lines in 14 files now uncovered.

9413 of 15945 relevant lines covered (59.03%)

0.59 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0
/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/Phi3VisionTokenizer.scala
1
/*
2
 * Copyright 2017-2022 John Snow Labs
3
 *
4
 * Licensed under the Apache License, Version 2.0 (the "License");
5
 * you may not use this file except in compliance with the License.
6
 * You may obtain a copy of the License at
7
 *
8
 *    http://www.apache.org/licenses/LICENSE-2.0
9
 *
10
 * Unless required by applicable law or agreed to in writing, software
11
 * distributed under the License is distributed on an "AS IS" BASIS,
12
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
 * See the License for the specific language governing permissions and
14
 * limitations under the License.
15
 */
16

17
package com.johnsnowlabs.nlp.annotators.tokenizer.bpe
18

19
import com.johnsnowlabs.nlp.annotators.common.IndexedToken
20

21
import java.nio.charset.Charset
22
import scala.collection.mutable.ListBuffer
23
import scala.util.matching.Regex
24
import scala.collection.mutable
25

26
/** Byte-level BPE tokenizer for the Phi-3 Vision models.
  *
  * Text is split with [[splitPattern]], every UTF-8 byte of a token is mapped to a printable
  * unicode character before BPE is applied, and [[decodeTokens]] turns vocabulary ids back into
  * text.
  *
  * @param merges
  *   BPE merge ranks, forwarded to [[BpeTokenizer]]
  * @param vocab
  *   piece-to-id vocabulary, forwarded to [[BpeTokenizer]] and inverted for decoding
  * @param specialTokens
  *   special tokens, forwarded to [[BpeTokenizer]]; pieces contained in it are skipped when
  *   decoding
  * @param padWithSequenceTokens
  *   forwarded to [[BpeTokenizer]]
  * @param prependString
  *   when non-empty it becomes [[prefixForPieceId]]
  * @param addPrefixSpaceToSentence
  *   forwarded to [[BpeTokenizer]]
  * @param alwaysAddPrefix
  *   forwarded to [[BpeTokenizer]]
  * @param splitPatternRegex
  *   regular expression used to pre-split text into tokens before BPE
  */
class Phi3VisionTokenizer(
    merges: Map[(String, String), Int],
    vocab: Map[String, Int],
    specialTokens: SpecialTokens,
    padWithSequenceTokens: Boolean = true,
    prependString: String = "",
    addPrefixSpaceToSentence: Boolean = false,
    alwaysAddPrefix: Boolean = true,
    splitPatternRegex: Regex =
      raw"""(?i)(?:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""".r)
    extends BpeTokenizer(
      merges,
      vocab,
      specialTokens,
      padWithSequenceTokens,
      addPrefixSpaceToSentence,
      alwaysAddPrefix) {

  /** Mapping from each byte value (0-255) to a single-character string.
    *
    * Bytes in the ranges '!'..'~', '¡'..'¬' and '®'..'ÿ' map to the character with the same
    * code point. Every other byte (white space, control characters, ...) is mapped, in ascending
    * order, to the code points starting at 256. This is the byte remapping scheme that improved
    * model performance for gpt-2.
    */
  protected val bytesToUnicodeMapping: Map[Int, String] = {
    val printable: Seq[Int] =
      ('!'.toInt to '~'.toInt) ++ ('¡'.toInt to '¬'.toInt) ++ ('®'.toInt to 'ÿ'.toInt)
    val printableSet: Set[Int] = printable.toSet

    // Only real byte values are remapped: 0 until 256 (a closed range up to 256 would add a
    // spurious entry for the non-byte value 256).
    val remapped: Seq[(Int, String)] =
      (0 until 256).filterNot(printableSet).zipWithIndex.map { case (byte, offset) =>
        byte -> (256 + offset).toChar.toString
      }

    (printable.map(byte => byte -> byte.toChar.toString) ++ remapped).toMap
  }

  // Differs from Transformers, space is always prepended.
  // FIX: Space should not be prepended to all tokens, but to the beginning of the text only.
  // Otherwise tokens such as '.' get a space prepended and they should not.
  override val prefixForPieceId: Option[String] =
    if (prependString.nonEmpty) Some(prependString) else None

  /** Inverse of `vocab`: vocabulary id to piece. */
  protected val decoderVocab: Map[Int, String] = vocab.map { case (piece, id) => (id, piece) }

  /** Inverse of [[bytesToUnicodeMapping]]: single-character string to byte value. */
  protected val unicodeToByteMapping: Map[String, Int] =
    bytesToUnicodeMapping.map { case (byte, char) => (char, byte) }

  /** Re-writes a token so that every UTF-8 byte is represented by the character assigned to it
    * in [[bytesToUnicodeMapping]].
    *
    * @param token
    *   raw token text
    * @return
    *   the token with one mapped character per UTF-8 byte
    */
  override def preProcessTokenForBpe(token: String): String =
    token
      .getBytes("UTF-8")
      // JVM bytes are signed; mask to get the unsigned value 0-255 used as the map key.
      .map(byte => bytesToUnicodeMapping(byte & 0xff))
      .mkString

  /** Pattern used by [[tokenizeSubText]] to split text into tokens. */
  val splitPattern: Regex = splitPatternRegex

  /** Charset used to decode byte-fallback pieces. */
  private val utf8: Charset = Charset.forName("UTF-8")

  /** Matches byte-fallback pieces of the form `<0xNN>` and captures the two hex digits. */
  private val ByteFallbackToken: Regex = "<0x([0-9A-Fa-f]{2})>".r

  /** Id whose piece is replaced by a plain space after decoding.
    *
    * NOTE(review): presumably the SentencePiece word-boundary piece of the Phi-3 vocabulary --
    * confirm against the model's vocabulary file.
    */
  private val WordBoundaryTokenId: Int = 29871

  /** Splits `text` with [[splitPattern]] (based on gpt2's bpe tokenizer).
    *
    * A single space is prepended to the text unless [[prefixForPieceId]] is defined or the text
    * already starts with a space.
    *
    * NOTE(review): when the space is prepended, match positions are computed on the padded
    * string, so the reported offsets are shifted by one relative to `text` -- confirm that
    * callers expect this.
    *
    * @param text
    *   text to split
    * @param indexOffset
    *   value added to the begin and end position of every match
    * @return
    *   one [[IndexedToken]] per match, with an inclusive end index
    */
  override def tokenizeSubText(text: String, indexOffset: Int): Array[IndexedToken] = {
    val needsLeadingSpace = prefixForPieceId.isEmpty && !text.startsWith(" ")
    val input = if (needsLeadingSpace) " " + text else text

    splitPattern
      .findAllMatchIn(input)
      .map(tok => IndexedToken(tok.matched, tok.start + indexOffset, tok.end + indexOffset - 1))
      .toArray
  }

  /** Converts vocabulary ids back into text.
    *
    * Pieces contained in `specialTokens` are skipped. Consecutive byte-fallback pieces
    * (`<0xNN>`) are collected and decoded together as UTF-8, so characters that span several
    * bytes are restored correctly; malformed byte sequences decode to the unicode replacement
    * character. Finally, every occurrence of the piece with id 29871 (when present in the
    * vocabulary) is replaced by a space and the result is trimmed.
    *
    * @param tokens
    *   vocabulary ids to decode
    * @return
    *   the decoded, trimmed text
    * @throws java.util.NoSuchElementException
    *   if an id is not part of the vocabulary
    */
  def decodeTokens(tokens: Array[Int]): String = {
    val decoded = new mutable.StringBuilder()
    val pendingBytes = new mutable.ArrayBuffer[Byte]()

    // Decodes the bytes collected so far as one UTF-8 sequence and appends the result.
    def flushPendingBytes(): Unit =
      if (pendingBytes.nonEmpty) {
        decoded.append(new String(pendingBytes.toArray, utf8))
        pendingBytes.clear()
      }

    tokens.foreach { tokenId =>
      val piece = decoderVocab(tokenId)
      if (!specialTokens.contains(piece)) {
        piece match {
          case ByteFallbackToken(hex) =>
            pendingBytes += Integer.parseInt(hex, 16).toByte
          case _ =>
            flushPendingBytes()
            decoded.append(piece)
        }
      }
    }
    flushPendingBytes()

    val text = decoded.toString()
    // Literal replacement: the piece is vocabulary data, not a regular expression.
    decoderVocab
      .get(WordBoundaryTokenId)
      .filter(_.nonEmpty)
      .fold(text)(marker => text.replace(marker, " "))
      .trim
  }
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc