15252839065

Committed 26 May 2025 11:30AM UTC coverage: 52.115% (-0.6%) from 52.715%

Build # 15252839065

Build Type

Pull #14585

github

Committed by

web-flow

Commit Message

Merge 625e5c10f into 56512b006

Pull Request Pull Request #14585: SparkNLP 1131 - Introducing Florance-2

Run Details

0 of 199 new or added lines in 4 files covered. (0.0%)

50 existing lines in 33 files now uncovered.

9931 of 19056 relevant lines covered (52.11%)

0.52 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0

/src/main/scala/com/johnsnowlabs/nlp/annotators/cv/Florence2Transformer.scala

/*
 * Copyright 2017-2024 John Snow Labs
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.johnsnowlabs.nlp.annotators.cv

import com.johnsnowlabs.ml.ai.util.Generation.GenerationConfig
import com.johnsnowlabs.ml.ai.Florence2
import com.johnsnowlabs.ml.onnx.OnnxWrapper.DecoderWrappers
import com.johnsnowlabs.ml.openvino.OpenvinoWrapper.Florence2Wrappers
import com.johnsnowlabs.ml.openvino.{OpenvinoWrapper, ReadOpenvinoModel, WriteOpenvinoModel}
import com.johnsnowlabs.ml.util.LoadExternalModel.{
  loadJsonStringAsset,
  loadTextAsset,
  modelSanityCheck,
  notSupportedEngineError
}
import com.johnsnowlabs.ml.util.Openvino
import com.johnsnowlabs.nlp.AnnotatorType.{DOCUMENT, IMAGE}
import com.johnsnowlabs.nlp._
import com.johnsnowlabs.nlp.annotators.cv.feature_extractor.Preprocessor
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.ml.param._
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.SparkSession
import com.johnsnowlabs.nlp.serialization.{MapFeature, StructFeature}
import org.json4s._
import org.json4s.jackson.JsonMethods._

/** Florence2: Advancing a Unified Representation for a Variety of Vision Tasks
  *
  * Florence-2 is an advanced vision foundation model from Microsoft that uses a prompt-based
  * approach to handle a wide range of vision and vision-language tasks. It can interpret simple
  * text prompts to perform tasks like captioning, object detection, segmentation, OCR, and more.
  * The model leverages the FLD-5B dataset, containing 5.4 billion annotations across 126 million
  * images, to master multi-task learning. Its sequence-to-sequence architecture enables it to
  * excel in both zero-shot and fine-tuned settings.
  *
  * Pretrained and finetuned models can be loaded with `pretrained` of the companion object:
  * {{{
  * val florence2 = Florence2Transformer.pretrained()
  *   .setInputCols("image")
  *   .setOutputCol("generation")
  * }}}
  * The default model is `"florence2_base_ft_int4"`, if no name is provided.
  *
  * For available pretrained models please see the
  * [[https://sparknlp.org/models?task=Vision+Tasks Models Hub]].
  *
  * ==Supported Tasks==
  *
  * Florence-2 supports a variety of tasks through prompt engineering. The following prompt tokens
  * can be used:
  *
  *   - `<CAPTION>`: Image captioning
  *   - `<DETAILED_CAPTION>`: Detailed image captioning
  *   - `<MORE_DETAILED_CAPTION>`: Paragraph-level captioning
  *   - `<CAPTION_TO_PHRASE_GROUNDING>`: Phrase grounding from caption (requires additional text
  *     input)
  *   - `<OD>`: Object detection
  *   - `<DENSE_REGION_CAPTION>`: Dense region captioning
  *   - `<REGION_PROPOSAL>`: Region proposal
  *   - `<OCR>`: Optical Character Recognition (plain text extraction)
  *   - `<OCR_WITH_REGION>`: OCR with region information
  *   - `<REFERRING_EXPRESSION_SEGMENTATION>`: Segmentation for a referred phrase (requires
  *     additional text input)
  *   - `<REGION_TO_SEGMENTATION>`: Polygon mask for a region (requires additional text input)
  *   - `<OPEN_VOCABULARY_DETECTION>`: Open vocabulary detection for a phrase (requires additional
  *     text input)
  *   - `<REGION_TO_CATEGORY>`: Category of a region (requires additional text input)
  *   - `<REGION_TO_DESCRIPTION>`: Description of a region (requires additional text input)
  *   - `<REGION_TO_OCR>`: OCR for a region (requires additional text input)
  *
  * ==Example Usage==
  *
  * {{{
  * import com.johnsnowlabs.nlp.base.ImageAssembler
  * import com.johnsnowlabs.nlp.annotators.cv.Florence2Transformer
  * import org.apache.spark.ml.Pipeline
  *
  * val imageAssembler = new ImageAssembler()
  *   .setInputCol("image")
  *   .setOutputCol("image_assembler")
  *
  * val florence2 = Florence2Transformer.pretrained("florence2_base_ft_int4")
  *   .setInputCols("image_assembler")
  *   .setOutputCol("answer")
  *   .setMaxOutputLength(50)
  *
  * val pipeline = new Pipeline().setStages(Array(imageAssembler, florence2))
  *
  * val data = Seq("/path/to/image.jpg").toDF("image")
  * val result = pipeline.fit(data).transform(data)
  * result.select("answer.result").show(truncate = false)
  * }}}
  *
  * ==References==
  *
  *   - Florence-2 technical report: https://arxiv.org/abs/2311.06242
  *   - Hugging Face model card: https://huggingface.co/microsoft/Florence-2-base-ft
  *   - Official sample notebook:
  *     https://huggingface.co/microsoft/Florence-2-large/blob/main/sample_inference.ipynb
  *
  * For more details and advanced usage, see the official documentation and sample notebooks.
  */
class Florence2Transformer(override val uid: String)
    extends AnnotatorModel[Florence2Transformer]
    with HasBatchedAnnotateImage[Florence2Transformer]
    with HasImageFeatureProperties
    with WriteOpenvinoModel
    with HasGeneratorProperties
    with HasEngine {

  /** Creates an instance with a randomly generated uid prefixed by `Florence2TRANSFORMER`. */
  def this() = this(Identifiable.randomUID("Florence2TRANSFORMER"))

  /** Input annotator type : IMAGE
    *
    * @group param
    */
  override val inputAnnotatorTypes: Array[AnnotatorType] = Array(IMAGE)

  /** Output annotator type : DOCUMENT
    *
    * @group param
    */
  override val outputAnnotatorType: AnnotatorType = DOCUMENT

  /** Sets the random seed, but only if no seed has been set yet; later calls are ignored.
    *
    * @group setParam
    */
  def setRandomSeed(value: Int): Florence2Transformer.this.type = {
    if (randomSeed.isEmpty) {
      this.randomSeed = Some(value)
    }
    this
  }

  /** A list of token ids which are ignored in the decoder's output (Default: `Array()`)
    *
    * @group param
    */
  var ignoreTokenIds = new IntArrayParam(
    this,
    "ignoreTokenIds",
    "A list of token ids which are ignored in the decoder's output")

  /** @group setParam */
  def setIgnoreTokenIds(tokenIds: Array[Int]): Florence2Transformer.this.type = {
    set(ignoreTokenIds, tokenIds)
  }

  /** @group getParam */
  def getIgnoreTokenIds: Array[Int] = $(ignoreTokenIds)

  // Broadcast handle to the inference backend; stays None until setModelIfNotSet is called.
  private var _model: Option[Broadcast[Florence2]] = None

  /** Vocabulary used to encode the words to ids with bpeTokenizer.encode
    *
    * @group param
    */
  val vocabulary: MapFeature[String, Int] = new MapFeature(this, "vocabulary").setProtected()

  /** @group setParam */
  def setVocabulary(value: Map[String, Int]): this.type = set(vocabulary, value)

  /** Generation configuration handed to the [[Florence2]] backend when the model is created.
    *
    * @group param
    */
  val generationConfig: StructFeature[GenerationConfig] =
    new StructFeature(this, "generationConfig").setProtected()

  /** @group setParam */
  def setGenerationConfig(value: GenerationConfig): this.type =
    set(generationConfig, value)

  /** @group getParam */
  def getGenerationConfig: GenerationConfig = $$(generationConfig)

  /** Holding merges.txt coming from RoBERTa model
    *
    * @group param
    */
  val merges: MapFeature[(String, String), Int] = new MapFeature(this, "merges").setProtected()

  /** @group setParam */
  def setMerges(value: Map[(String, String), Int]): this.type = set(merges, value)

  /** Additional tokens to be added to the vocabulary
    *
    * @group param
    */
  val addedTokens: MapFeature[String, Int] = new MapFeature(this, "addedTokens").setProtected()

  /** @group setParam */
  def setAddedTokens(value: Map[String, Int]): this.type = set(addedTokens, value)

  /** Stop tokens to terminate the generation
    *
    * @group param
    */
  override val stopTokenIds =
    new IntArrayParam(this, "stopTokenIds", "Stop tokens to terminate the generation")

  /** @group setParam */
  override def setStopTokenIds(value: Array[Int]): this.type = {
    set(stopTokenIds, value)
  }

  /** @group getParam */
  override def getStopTokenIds: Array[Int] = $(stopTokenIds)

  /** Builds the [[Florence2]] backend from the given wrappers plus the `merges`, `vocabulary`,
    * `addedTokens` and `generationConfig` features, and broadcasts it. Does nothing when a model
    * has already been set.
    *
    * @param spark
    *   session whose SparkContext is used for the broadcast
    * @param preprocessor
    *   image preprocessor passed to the backend
    * @param onnxWrappers
    *   optional ONNX wrappers, passed through to the backend
    * @param openvinoWrapper
    *   optional OpenVINO wrappers, passed through to the backend
    * @group setParam
    */
  def setModelIfNotSet(
      spark: SparkSession,
      preprocessor: Preprocessor,
      onnxWrappers: Option[DecoderWrappers],
      openvinoWrapper: Option[Florence2Wrappers]): this.type = {
    if (_model.isEmpty) {
      _model = Some(
        spark.sparkContext.broadcast(
          new Florence2(
            onnxWrappers,
            openvinoWrapper,
            $$(merges),
            $$(vocabulary),
            $$(addedTokens),
            preprocessor,
            generationConfig = getGenerationConfig)))
    }
    this
  }

  /** Returns the broadcast backend.
    *
    * @throws java.util.NoSuchElementException
    *   if no model has been set via `setModelIfNotSet`
    * @group getParam
    */
  def getModelIfNotSet: Florence2 =
    _model
      .getOrElse(throw new NoSuchElementException(
        "Florence2Transformer: the model has not been set. Call setModelIfNotSet first."))
      .value

  // Must stay below the param definitions above: the defaults reference those vals.
  setDefault(
    minOutputLength -> 10,
    maxOutputLength -> 200,
    doSample -> false,
    temperature -> 1.0,
    topK -> 50,
    topP -> 1.0,
    repetitionPenalty -> 1.0,
    noRepeatNgramSize -> 3,
    ignoreTokenIds -> Array(),
    batchSize -> 1,
    beamSize -> 1,
    maxInputLength -> 1024,
    stopTokenIds -> Array(2))

  /** takes a document and annotations and produces new annotations of this annotator's annotation
    * type
    *
    * Images whose `result` is empty are dropped before prediction; the prompt for each remaining
    * image is taken from its `text` field, with a default prompt used when that field is empty.
    *
    * @param batchedAnnotations
    *   Annotations that correspond to inputAnnotationCols generated by previous annotators if any
    * @return
    *   any number of annotations processed for every input annotation. Not necessary one to one
    *   relationship
    */
  override def batchAnnotate(
      batchedAnnotations: Seq[Array[AnnotationImage]]): Seq[Seq[Annotation]] = {

    batchedAnnotations.map { cleanAnnotationImages =>
      val validImages = cleanAnnotationImages.filter(_.result.nonEmpty)
      val questionAnnotations = extractInputAnnotation(validImages)

      getModelIfNotSet.predict(
        questionAnnotations,
        validImages.toSeq,
        batchSize = $(batchSize),
        minOutputLength = $(minOutputLength),
        maxOutputLength = $(maxOutputLength),
        doSample = $(doSample),
        temperature = $(temperature),
        topK = $(topK),
        topP = $(topP),
        repetitionPenalty = $(repetitionPenalty),
        noRepeatNgramSize = $(noRepeatNgramSize),
        randomSeed = this.randomSeed,
        ignoreTokenIds = $(ignoreTokenIds),
        beamSize = $(beamSize),
        maxInputLength = $(maxInputLength))
    }

  }

  /** Turns every image into a prompt annotation: the image's own `text` when it is non-empty,
    * otherwise the default question.
    */
  private def extractInputAnnotation(
      annotationImages: Array[AnnotationImage]): Seq[Annotation] =
    annotationImages.map { annotationImage =>
      val imageText =
        if (annotationImage.text.nonEmpty) annotationImage.text
        else
          "<s>Locate the objects with category name in the image.</s>" // default question
      Annotation(imageText)
    }

  /** Persists the five OpenVINO sub-models next to the annotator's params and features.
    *
    * @throws java.util.NoSuchElementException
    *   if the engine is OpenVINO but the backend holds no OpenVINO wrappers
    * @throws java.lang.Exception
    *   if the engine is anything other than OpenVINO
    */
  override def onWrite(path: String, spark: SparkSession): Unit = {
    super.onWrite(path, spark)
    getEngine match {
      case Openvino.name =>
        val wrappers = getModelIfNotSet.openvinoWrapper.getOrElse(
          throw new NoSuchElementException(
            "Florence2Transformer: engine is OpenVINO but no OpenVINO wrappers are loaded."))

        // One writeOpenvinoModels call per sub-model, in the same order as before. The suffix
        // now comes from this annotator's own companion instead of PaliGemmaForMultiModal.
        Seq(
          wrappers.encoderModel -> "encoder.xml",
          wrappers.decoderModel -> "decoder.xml",
          wrappers.textEmbeddingsModel -> "text_embedding.xml",
          wrappers.imageEmbedModel -> "image_embedding.xml",
          wrappers.modelMergerModel -> "merger_model.xml").foreach { modelWithFileName =>
          writeOpenvinoModels(path, spark, Seq(modelWithFileName), Florence2Transformer.suffix)
        }
      case _ =>
        throw new Exception(notSupportedEngineError)
    }
  }
}

/** Adds the `pretrained` loaders for [[Florence2Transformer]] and fixes the model name that is
  * used when none is given.
  */
trait ReadablePretrainedFlorence2TransformerModel
    extends ParamsAndFeaturesReadable[Florence2Transformer]
    with HasPretrained[Florence2Transformer] {

  /** Model name used when `pretrained()` is called without a name. */
  override val defaultModelName: Some[String] = Some("florence2_base_ft_int4")

  // Java compliant-overrides: every overload below simply forwards to HasPretrained.

  /** Loads the default pretrained model. */
  override def pretrained(): Florence2Transformer = {
    super.pretrained()
  }

  /** Loads the pretrained model with the given name. */
  override def pretrained(name: String): Florence2Transformer = {
    super.pretrained(name)
  }

  /** Loads the pretrained model with the given name and language. */
  override def pretrained(name: String, lang: String): Florence2Transformer = {
    super.pretrained(name, lang)
  }

  /** Loads the pretrained model with the given name and language from `remoteLoc`. */
  override def pretrained(
      name: String,
      lang: String,
      remoteLoc: String): Florence2Transformer = {
    super.pretrained(name, lang, remoteLoc)
  }
}

/** Reading support for [[Florence2Transformer]]: restores a saved annotator's OpenVINO sub-models
  * (`readModel`) and imports an exported model directory (`loadSavedModel`).
  */
trait ReadFlorence2TransformerDLModel extends ReadOpenvinoModel {
  this: ParamsAndFeaturesReadable[Florence2Transformer] =>

  val suffix: String = "_Florence2"
  override val openvinoFile: String = "Florence2_openvino"

  /** Reads the five OpenVINO sub-models stored under `path` and attaches them, together with a
    * preprocessor built from the instance's image params, to `instance`.
    *
    * @throws java.lang.Exception
    *   if the instance's engine is not OpenVINO
    */
  def readModel(instance: Florence2Transformer, path: String, spark: SparkSession): Unit = {
    instance.getEngine match {
      case Openvino.name =>
        // Each sub-model lives in its own file and is read with its own call.
        def readWrapper(fileName: String) = {
          val wrappersByName = readOpenvinoModels(path, spark, Seq(fileName), suffix)
          wrappersByName(fileName)
        }

        // Read order kept from the original implementation.
        val decoderWrapper = readWrapper("decoder.xml")
        val encoderWrapper = readWrapper("encoder.xml")
        val textEmbeddingsWrapper = readWrapper("text_embedding.xml")
        val imageEmbeddingsWrapper = readWrapper("image_embedding.xml")
        val modelMergerWrapper = readWrapper("merger_model.xml")

        val ovWrapper = Florence2Wrappers(
          encoderModel = encoderWrapper,
          decoderModel = decoderWrapper,
          textEmbeddingsModel = textEmbeddingsWrapper,
          imageEmbedModel = imageEmbeddingsWrapper,
          modelMergerModel = modelMergerWrapper)

        val preprocessor = Preprocessor(
          do_normalize = true,
          do_resize = true,
          "FlorenceFeatureExtractor",
          instance.getImageMean,
          instance.getImageStd,
          instance.getResample,
          instance.getSize)
        instance.setModelIfNotSet(spark, preprocessor, None, Some(ovWrapper))
      case _ =>
        throw new Exception(notSupportedEngineError)
    }
  }

  addReader(readModel)

  /** Loads vocabulary, added tokens and BPE merges from the model's assets.
    *
    * Uses `assets/tokenizer.json` when that file exists; otherwise falls back to `vocab.txt`,
    * `added_tokens.txt` and `merges.txt`. Merge entries that do not consist of exactly two parts
    * are skipped; the rank of a merge is its position among the kept entries.
    *
    * @return
    *   `(vocabulary including added tokens, added tokens, merges)` for the tokenizer.json case;
    *   in the fallback case the vocabulary is the content of `vocab.txt` only
    */
  private def loadTokenizerAssets(localModelPath: String)
      : (Map[String, Int], Map[String, Int], Map[(String, String), Int]) = {
    implicit val formats: DefaultFormats.type = DefaultFormats // for json4s

    val tokenizerPath = s"$localModelPath/assets/tokenizer.json"
    if (new java.io.File(tokenizerPath).exists()) {
      val tokenizerConfig: JValue = parse(loadJsonStringAsset(localModelPath, "tokenizer.json"))
      val baseVocab: Map[String, Int] =
        (tokenizerConfig \ "model" \ "vocab").extract[Map[String, Int]]

      val bytePairs = (tokenizerConfig \ "model" \ "merges")
        .extract[List[Array[String]]]
        .collect { case Array(first, second) => (first, second) }
        .zipWithIndex
        .toMap

      // Typed extraction per field instead of casting values out of a Map[String, Any].
      val addedTokens: Map[String, Int] = (tokenizerConfig \ "added_tokens").children.map {
        token => (token \ "content").extract[String] -> (token \ "id").extract[Int]
      }.toMap

      // Added tokens take precedence over base vocabulary entries with the same content.
      (baseVocab ++ addedTokens, addedTokens, bytePairs)
    } else {
      val vocabs = loadTextAsset(localModelPath, "vocab.txt").zipWithIndex.toMap
      val addedTokens = loadTextAsset(localModelPath, "added_tokens.txt").zipWithIndex.toMap
      val bytePairs = loadTextAsset(localModelPath, "merges.txt")
        .map(_.split(" "))
        .collect { case Array(first, second) => (first, second) }
        .zipWithIndex
        .toMap
      (vocabs, addedTokens, bytePairs)
    }
  }

  /** Imports an exported Florence-2 model directory and returns a ready-to-use annotator.
    *
    * Reads `config.json` and `preprocessor_config.json`, loads the tokenizer assets, and reads the
    * `encoder`, `decoder`, `text_embedding`, `image_embedding` and `merger_model` OpenVINO models.
    *
    * @param modelPath
    *   path of the exported model
    * @param spark
    *   active Spark session
    * @param useOpenvino
    *   when true the annotator's engine param is set to OpenVINO regardless of the detected engine
    * @throws java.lang.Exception
    *   if the detected engine is not OpenVINO, or if `forced_decoder_ids` contains an entry that
    *   is not a pair
    */
  def loadSavedModel(
      modelPath: String,
      spark: SparkSession,
      useOpenvino: Boolean = false): Florence2Transformer = {
    implicit val formats: DefaultFormats.type = DefaultFormats // for json4s
    val (localModelPath, detectedEngine) =
      modelSanityCheck(
        modelPath,
        isDecoder = false,
        custom =
          Some(List("encoder", "decoder", "text_embedding", "merger_model", "image_embedding")))
    val modelConfig: JValue =
      parse(loadJsonStringAsset(localModelPath, "config.json"))

    val preprocessorConfigJsonContent =
      loadJsonStringAsset(localModelPath, "preprocessor_config.json")
    val preprocessorConfig = Preprocessor.loadPreprocessorConfig(preprocessorConfigJsonContent)
    val beginSuppressTokens: Array[Int] =
      (modelConfig \ "begin_suppress_tokens").extract[Array[Int]]

    val suppressTokenIds: Array[Int] =
      (modelConfig \ "suppress_tokens").extract[Array[Int]]

    val forcedDecoderIds: Array[(Int, Int)] =
      (modelConfig \ "forced_decoder_ids").extract[Array[Array[Int]]].map {
        case Array(index, tokenId) => (index, tokenId)
        case _ =>
          throw new Exception(
            "Could not extract forced_decoder_ids. Should be a list of tuples with 2 entries.")
      }

    // Empty arrays are passed to GenerationConfig as None.
    def arrayOrNone[T](array: Array[T]): Option[Array[T]] =
      if (array.nonEmpty) Some(array) else None

    val bosTokenId = (modelConfig \ "bos_token_id").extract[Int]
    val eosTokenId = (modelConfig \ "eos_token_id").extract[Int]
    val padTokenId = (modelConfig \ "pad_token_id").extract[Int]
    val vocabSize = (modelConfig \ "text_config" \ "vocab_size").extract[Int]

    val (vocabs, addedTokens, bytePairs) = loadTokenizerAssets(localModelPath)

    val annotatorModel = new Florence2Transformer()
      .setGenerationConfig(
        GenerationConfig(
          bosTokenId,
          padTokenId,
          eosTokenId,
          vocabSize,
          arrayOrNone(beginSuppressTokens),
          arrayOrNone(suppressTokenIds),
          arrayOrNone(forcedDecoderIds)))
      .setVocabulary(vocabs)
      .setMerges(bytePairs)
      .setAddedTokens(addedTokens)
      .setSize(preprocessorConfig.size)
      .setImageMean(preprocessorConfig.image_mean)
      .setImageStd(preprocessorConfig.image_std)
      .setResample(preprocessorConfig.resample)

    val modelEngine =
      if (useOpenvino)
        Openvino.name
      else
        detectedEngine
    annotatorModel.set(annotatorModel.engine, modelEngine)

    // NOTE(review): the dispatch below is on detectedEngine, not modelEngine, so useOpenvino = true
    // with a non-OpenVINO export still ends in notSupportedEngineError — confirm this is intended.
    detectedEngine match {
      case Openvino.name =>
        // All five sub-models are read with identical settings; only the model name differs.
        def readComponent(componentName: String) =
          OpenvinoWrapper.read(
            spark,
            localModelPath,
            zipped = false,
            useBundle = true,
            detectedEngine = detectedEngine,
            modelName = componentName)

        // Read order kept from the original implementation.
        val openvinoEncoderWrapper = readComponent("encoder")
        val openvinoDecoderWrapper = readComponent("decoder")
        val openvinoTextEmbeddingsWrapper = readComponent("text_embedding")
        val openvinoImageEmbeddingsWrapper = readComponent("image_embedding")
        val openvinoModelMergerWrapper = readComponent("merger_model")

        val openvinoWrapper =
          Florence2Wrappers(
            encoderModel = openvinoEncoderWrapper,
            decoderModel = openvinoDecoderWrapper,
            textEmbeddingsModel = openvinoTextEmbeddingsWrapper,
            imageEmbedModel = openvinoImageEmbeddingsWrapper,
            modelMergerModel = openvinoModelMergerWrapper)
        annotatorModel.setModelIfNotSet(spark, preprocessorConfig, None, Some(openvinoWrapper))

      case _ =>
        throw new Exception(notSupportedEngineError)
    }

    annotatorModel
  }

}

/** Companion object of the [[Florence2Transformer]] class. It mixes in the `pretrained` overloads
  * from [[ReadablePretrainedFlorence2TransformerModel]] and the `readModel` / `loadSavedModel`
  * support from [[ReadFlorence2TransformerDLModel]].
  */
object Florence2Transformer
    extends ReadablePretrainedFlorence2TransformerModel
    with ReadFlorence2TransformerDLModel

1	/*
2	* Copyright 2017-2024 John Snow Labs
3	*
4	* Licensed under the Apache License, Version 2.0 (the "License");
5	* you may not use this file except in compliance with the License.
6	* You may obtain a copy of the License at
7	*
8	* http://www.apache.org/licenses/LICENSE-2.0
9	*
10	* Unless required by applicable law or agreed to in writing, software
11	* distributed under the License is distributed on an "AS IS" BASIS,
12	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13	* See the License for the specific language governing permissions and
14	* limitations under the License.
15	*/
16
17	package com.johnsnowlabs.nlp.annotators.cv
18
19	import com.johnsnowlabs.ml.ai.util.Generation.GenerationConfig
20	import com.johnsnowlabs.ml.ai.Florence2
21	import com.johnsnowlabs.ml.onnx.OnnxWrapper.DecoderWrappers
22	import com.johnsnowlabs.ml.openvino.OpenvinoWrapper.Florence2Wrappers
23	import com.johnsnowlabs.ml.openvino.{OpenvinoWrapper, ReadOpenvinoModel, WriteOpenvinoModel}
24	import com.johnsnowlabs.ml.util.LoadExternalModel.{
25	loadJsonStringAsset,
26	loadTextAsset,
27	modelSanityCheck,
28	notSupportedEngineError
29	}
30	import com.johnsnowlabs.ml.util.Openvino
31	import com.johnsnowlabs.nlp.AnnotatorType.{DOCUMENT, IMAGE}
32	import com.johnsnowlabs.nlp._
33	import com.johnsnowlabs.nlp.annotators.cv.feature_extractor.Preprocessor
34	import org.apache.spark.broadcast.Broadcast
35	import org.apache.spark.ml.param._
36	import org.apache.spark.ml.util.Identifiable
37	import org.apache.spark.sql.SparkSession
38	import com.johnsnowlabs.nlp.serialization.{MapFeature, StructFeature}
39	import org.json4s._
40	import org.json4s.jackson.JsonMethods._
41
42	/** Florence2: Advancing a Unified Representation for a Variety of Vision Tasks
43	*
44	* Florence-2 is an advanced vision foundation model from Microsoft that uses a prompt-based
45	* approach to handle a wide range of vision and vision-language tasks. It can interpret simple
46	* text prompts to perform tasks like captioning, object detection, segmentation, OCR, and more.
47	* The model leverages the FLD-5B dataset, containing 5.4 billion annotations across 126 million
48	* images, to master multi-task learning. Its sequence-to-sequence architecture enables it to
49	* excel in both zero-shot and fine-tuned settings.
50	*
51	* Pretrained and finetuned models can be loaded with `pretrained` of the companion object: {{ {
52	* val florence2 = Florence2Transformer.pretrained() .setInputCols("image")
53	* .setOutputCol("generation") }} } The default model is `"florence2_base_ft_int4"`, if no name
54	* is provided.
55	*
56	* For available pretrained models please see the
57	* [[https://sparknlp.org/models?task=Vision+Tasks Models Hub]].
58	*
59	* ==Supported Tasks==
60	*
61	* Florence-2 supports a variety of tasks through prompt engineering. The following prompt tokens
62	* can be used:
63	*
64	* - <CAPTION>: Image captioning
65	* - <DETAILED_CAPTION>: Detailed image captioning
66	* - <MORE_DETAILED_CAPTION>: Paragraph-level captioning
67	* - <CAPTION_TO_PHRASE_GROUNDING>: Phrase grounding from caption (requires additional text
68	* input)
69	* - <OD>: Object detection
70	* - <DENSE_REGION_CAPTION>: Dense region captioning
71	* - <REGION_PROPOSAL>: Region proposal
72	* - <OCR>: Optical Character Recognition (plain text extraction)
73	* - <OCR_WITH_REGION>: OCR with region information
74	* - <REFERRING_EXPRESSION_SEGMENTATION>: Segmentation for a referred phrase (requires
75	* additional text input)
76	* - <REGION_TO_SEGMENTATION>: Polygon mask for a region (requires additional text input)
77	* - <OPEN_VOCABULARY_DETECTION>: Open vocabulary detection for a phrase (requires additional
78	* text input)
79	* - <REGION_TO_CATEGORY>: Category of a region (requires additional text input)
80	* - <REGION_TO_DESCRIPTION>: Description of a region (requires additional text input)
81	* - <REGION_TO_OCR>: OCR for a region (requires additional text input)
82	*
83	* ==Example Usage==
84	*
85	* {{ { import com.johnsnowlabs.nlp.base.ImageAssembler import
86	* com.johnsnowlabs.nlp.annotators.cv.Florence2Transformer import org.apache.spark.ml.Pipeline
87	*
88	* val imageAssembler = new ImageAssembler() .setInputCol("image")
89	* .setOutputCol("image_assembler")
90	*
91	* val florence2 = Florence2Transformer.pretrained("florence2_base_ft_int4")
92	* .setInputCols("image_assembler") .setOutputCol("answer") .setMaxOutputLength(50)
93	*
94	* val pipeline = new Pipeline().setStages(Array(imageAssembler, florence2))
95	*
96	* val data = Seq("/path/to/image.jpg").toDF("image") val result =
97	* pipeline.fit(data).transform(data) result.select("answer.result").show(truncate = false) }} }
98	*
99	* ==References==
100	*
101	* - Florence-2 technical report: https://arxiv.org/abs/2311.06242
102	* - Hugging Face model card: https://huggingface.co/microsoft/Florence-2-base-ft
103	* - Official sample notebook:
104	* https://huggingface.co/microsoft/Florence-2-large/blob/main/sample_inference.ipynb
105	*
106	* For more details and advanced usage, see the official documentation and sample notebooks.
107	*/
108	class Florence2Transformer(override val uid: String)
109	extends AnnotatorModel[Florence2Transformer]
110	with HasBatchedAnnotateImage[Florence2Transformer]
111	with HasImageFeatureProperties
112	with WriteOpenvinoModel
113	with HasGeneratorProperties
114	with HasEngine {
115
NEW 116	def this() = this(Identifiable.randomUID("Florence2TRANSFORMER"))	×
117
118	/** Input annotator type : DOCUMENT
119	*
120	* @group param
121	*/
NEW 122	override val inputAnnotatorTypes: Array[AnnotatorType] = Array(IMAGE)	×
NEW 123	override val outputAnnotatorType: AnnotatorType = DOCUMENT	×
124
125	/** @group setParam */
126	def setRandomSeed(value: Int): Florence2Transformer.this.type = {
NEW 127	if (randomSeed.isEmpty) {	×
NEW 128	this.randomSeed = Some(value)	×
129	}
130	this
131	}
132
133	/** A list of token ids which are ignored in the decoder's output (Default: `Array()`)
134	*
135	* @group param
136	*/
NEW 137	var ignoreTokenIds = new IntArrayParam(	×
138	this,
NEW 139	"ignoreTokenIds",	×
NEW 140	"A list of token ids which are ignored in the decoder's output")	×
141
142	/** @group setParam */
143	def setIgnoreTokenIds(tokenIds: Array[Int]): Florence2Transformer.this.type = {
NEW 144	set(ignoreTokenIds, tokenIds)	×
145	}
146
147	/** @group getParam */
NEW 148	def getIgnoreTokenIds: Array[Int] = $(ignoreTokenIds)	×
149
NEW 150	private var _model: Option[Broadcast[Florence2]] = None	×
151
152	/** Vocabulary used to encode the words to ids with bpeTokenizer.encode
153	*
154	* @group param
155	*/
NEW 156	val vocabulary: MapFeature[String, Int] = new MapFeature(this, "vocabulary").setProtected()	×
157
158	/** @group setParam */
NEW 159	def setVocabulary(value: Map[String, Int]): this.type = set(vocabulary, value)	×
160
161	val generationConfig: StructFeature[GenerationConfig] =
NEW 162	new StructFeature(this, "generationConfig").setProtected()	×
163
164	def setGenerationConfig(value: GenerationConfig): this.type =
NEW 165	set(generationConfig, value)	×
166
NEW 167	def getGenerationConfig: GenerationConfig = $$(generationConfig)	×
168
169	/** Holding merges.txt coming from RoBERTa model
170	*
171	* @group param
172	*/
NEW 173	val merges: MapFeature[(String, String), Int] = new MapFeature(this, "merges").setProtected()	×
174
175	/** @group setParam */
NEW 176	def setMerges(value: Map[(String, String), Int]): this.type = set(merges, value)	×
177
178	/** Additional tokens to be added to the vocabulary
179	*
180	* @group param
181	*/
NEW 182	val addedTokens: MapFeature[String, Int] = new MapFeature(this, "addedTokens").setProtected()	×
183
184	/** @group setParam */
NEW 185	def setAddedTokens(value: Map[String, Int]): this.type = set(addedTokens, value)	×
186
187	/** Stop tokens to terminate the generation
188	*
189	* @group param
190	*/
191	override val stopTokenIds =
NEW 192	new IntArrayParam(this, "stopTokenIds", "Stop tokens to terminate the generation")	×
193
194	/** @group setParam */
195	override def setStopTokenIds(value: Array[Int]): this.type = {
NEW 196	set(stopTokenIds, value)	×
197	}
198
199	/** @group getParam */
NEW 200	override def getStopTokenIds: Array[Int] = $(stopTokenIds)	×
201
202	/** @group setParam */
203	def setModelIfNotSet(
204	spark: SparkSession,
205	preprocessor: Preprocessor,
206	onnxWrappers: Option[DecoderWrappers],
207	openvinoWrapper: Option[Florence2Wrappers]): this.type = {
NEW 208	if (_model.isEmpty) {	×
NEW 209	_model = Some(	×
NEW 210	spark.sparkContext.broadcast(	×
NEW 211	new Florence2(	×
212	onnxWrappers,
213	openvinoWrapper,
NEW 214	$$(merges),	×
NEW 215	$$(vocabulary),	×
NEW 216	$$(addedTokens),	×
217	preprocessor,
NEW 218	generationConfig = getGenerationConfig)))	×
219	}
220	this
221	}
222
223	/** @group getParam */
NEW 224	def getModelIfNotSet: Florence2 = _model.get.value	×
225
NEW 226	setDefault(	×
NEW 227	minOutputLength -> 10,	×
NEW 228	maxOutputLength -> 200,	×
NEW 229	doSample -> false,	×
NEW 230	temperature -> 1.0,	×
NEW 231	topK -> 50,	×
NEW 232	topP -> 1.0,	×
NEW 233	repetitionPenalty -> 1.0,	×
NEW 234	noRepeatNgramSize -> 3,	×
NEW 235	ignoreTokenIds -> Array(),	×
NEW 236	batchSize -> 1,	×
NEW 237	beamSize -> 1,	×
NEW 238	maxInputLength -> 1024,	×
NEW 239	stopTokenIds -> Array(2))	×
240
241	/** takes a document and annotations and produces new annotations of this annotator's annotation
242	* type
243	*
244	* @param batchedAnnotations
245	* Annotations that correspond to inputAnnotationCols generated by previous annotators if any
246	* @return
247	* any number of annotations processed for every input annotation. Not necessary one to one
248	* relationship
249	*/
250	override def batchAnnotate(
251	batchedAnnotations: Seq[Array[AnnotationImage]]): Seq[Seq[Annotation]] = {
252
NEW 253	batchedAnnotations.map { cleanAnnotationImages =>	×
NEW 254	val validImages = cleanAnnotationImages.filter(_.result.nonEmpty)	×
NEW 255	val questionAnnotations = extractInputAnnotation(validImages)	×
256
NEW 257	getModelIfNotSet.predict(	×
258	questionAnnotations,
NEW 259	validImages.toSeq,	×
NEW 260	batchSize = $(batchSize),	×
NEW 261	minOutputLength = $(minOutputLength),	×
NEW 262	maxOutputLength = $(maxOutputLength),	×
NEW 263	doSample = $(doSample),	×
NEW 264	temperature = $(temperature),	×
NEW 265	topK = $(topK),	×
NEW 266	topP = $(topP),	×
NEW 267	repetitionPenalty = $(repetitionPenalty),	×
NEW 268	noRepeatNgramSize = $(noRepeatNgramSize),	×
NEW 269	randomSeed = this.randomSeed,	×
NEW 270	ignoreTokenIds = $(ignoreTokenIds),	×
NEW 271	beamSize = $(beamSize),	×
NEW 272	maxInputLength = $(maxInputLength))	×
273	}
274
275	}
276
277	private def extractInputAnnotation(
278	annotationImages: Array[AnnotationImage]): Seq[Annotation] = {
NEW 279	val questions = annotationImages.map(annotationImage => {	×
280	val imageText =
NEW 281	if (annotationImage.text.nonEmpty) annotationImage.text	×
282	else
NEW 283	"<s>Locate the objects with category name in the image.</s>" // default question	×
NEW 284	Annotation(imageText)	×
285	})
286
NEW 287	questions	×
288	}
289
/** Persists the annotator and, for the OpenVINO engine, each of its five model parts.
  *
  * The parts are written one at a time under the file names `encoder.xml`, `decoder.xml`,
  * `text_embedding.xml`, `image_embedding.xml` and `merger_model.xml`, which are the names the
  * companion reader looks up when loading.
  *
  * @param path
  *   destination path of the saved model
  * @param spark
  *   active Spark session
  * @throws NoSuchElementException
  *   if the engine is OpenVINO but no OpenVINO wrappers are attached to the model
  * @throws Exception
  *   if the engine is anything other than OpenVINO
  */
override def onWrite(path: String, spark: SparkSession): Unit = {
  super.onWrite(path, spark)
  getEngine match {
    case Openvino.name =>
      // Resolve the wrappers once instead of calling Option.get for every part.
      val wrappers = getModelIfNotSet.openvinoWrapper.getOrElse(
        throw new NoSuchElementException(
          "Florence2Transformer has no OpenVINO wrappers to write"))

      val modelParts = Seq(
        (wrappers.encoderModel, "encoder.xml"),
        (wrappers.decoderModel, "decoder.xml"),
        (wrappers.textEmbeddingsModel, "text_embedding.xml"),
        (wrappers.imageEmbedModel, "image_embedding.xml"),
        (wrappers.modelMergerModel, "merger_model.xml"))

      // Use this annotator's own suffix (previously PaliGemma's was passed by mistake) so the
      // write side agrees with the reader defined on the Florence2Transformer companion.
      modelParts.foreach { part =>
        writeOpenvinoModels(path, spark, Seq(part), Florence2Transformer.suffix)
      }
    case _ =>
      throw new Exception(notSupportedEngineError)
  }
}
328	}
329
/** Pretrained-model entry points for [[Florence2Transformer]].
  *
  * Sets the default pretrained model name and re-declares every `pretrained` overload with the
  * concrete return type, each one simply delegating to the parent implementation.
  */
trait ReadablePretrainedFlorence2TransformerModel
    extends ParamsAndFeaturesReadable[Florence2Transformer]
    with HasPretrained[Florence2Transformer] {

  /** Model name used when `pretrained()` is called without arguments. */
  override val defaultModelName: Some[String] = Some("florence2_base_ft_int4")

  /** Java compliant-overrides */
  override def pretrained(): Florence2Transformer =
    super.pretrained()

  override def pretrained(name: String): Florence2Transformer =
    super.pretrained(name)

  override def pretrained(name: String, lang: String): Florence2Transformer =
    super.pretrained(name, lang)

  override def pretrained(name: String, lang: String, remoteLoc: String): Florence2Transformer =
    super.pretrained(name, lang, remoteLoc)
}
346
/** Loading logic for [[Florence2Transformer]]: restoring a previously saved annotator and
  * importing an exported model directory.
  *
  * Only the OpenVINO engine is handled; any other engine results in an exception.
  */
trait ReadFlorence2TransformerDLModel extends ReadOpenvinoModel {
  this: ParamsAndFeaturesReadable[Florence2Transformer] =>

  val suffix: String = "_Florence2"
  override val openvinoFile: String = "Florence2_openvino"

  /** Restores the OpenVINO model parts of a saved annotator and attaches them to `instance`.
    *
    * @param instance
    *   annotator whose parameters have already been loaded
    * @param path
    *   location of the saved model
    * @param spark
    *   active Spark session
    * @throws Exception
    *   if the engine stored on `instance` is not OpenVINO
    */
  def readModel(instance: Florence2Transformer, path: String, spark: SparkSession): Unit = {
    // Reads a single model part and returns the wrapper registered under its file name.
    def readPart(fileName: String): OpenvinoWrapper = {
      val loaded = readOpenvinoModels(path, spark, Seq(fileName), suffix)
      loaded(fileName)
    }

    instance.getEngine match {
      case Openvino.name =>
        val decoder = readPart("decoder.xml")
        val encoder = readPart("encoder.xml")
        val textEmbeddings = readPart("text_embedding.xml")
        val imageEmbeddings = readPart("image_embedding.xml")
        val modelMerger = readPart("merger_model.xml")

        val ovWrapper = Florence2Wrappers(
          encoderModel = encoder,
          decoderModel = decoder,
          textEmbeddingsModel = textEmbeddings,
          imageEmbedModel = imageEmbeddings,
          modelMergerModel = modelMerger)

        // The image preprocessor is rebuilt from the parameters stored on the instance.
        val preprocessor = Preprocessor(
          do_normalize = true,
          do_resize = true,
          "FlorenceFeatureExtractor",
          instance.getImageMean,
          instance.getImageStd,
          instance.getResample,
          instance.getSize)

        instance.setModelIfNotSet(spark, preprocessor, None, Some(ovWrapper))
      case _ =>
        throw new Exception(notSupportedEngineError)
    }
  }

  addReader(readModel)

  /** Builds a [[Florence2Transformer]] from an exported model directory.
    *
    * Reads `config.json` and `preprocessor_config.json`, then the tokenizer data: either from
    * `assets/tokenizer.json` when that file exists, or from `vocab.txt`, `added_tokens.txt` and
    * `merges.txt` otherwise. Finally the five OpenVINO model parts are loaded and attached.
    *
    * @param modelPath
    *   path to the exported model
    * @param spark
    *   active Spark session
    * @param useOpenvino
    *   when true, the engine recorded on the annotator is OpenVINO regardless of what was
    *   detected; the model parts are still loaded according to the detected engine
    * @return
    *   the configured annotator
    * @throws Exception
    *   if `forced_decoder_ids` contains an entry that is not a pair, or if the detected engine
    *   is not OpenVINO
    */
  def loadSavedModel(
      modelPath: String,
      spark: SparkSession,
      useOpenvino: Boolean = false): Florence2Transformer = {
    implicit val formats: DefaultFormats.type = DefaultFormats // for json4s

    val (localModelPath, detectedEngine) =
      modelSanityCheck(
        modelPath,
        isDecoder = false,
        custom =
          Some(List("encoder", "decoder", "text_embedding", "merger_model", "image_embedding")))

    val modelConfig: JValue = parse(loadJsonStringAsset(localModelPath, "config.json"))

    val preprocessorConfig =
      Preprocessor.loadPreprocessorConfig(
        loadJsonStringAsset(localModelPath, "preprocessor_config.json"))

    val beginSuppressTokens: Array[Int] =
      (modelConfig \ "begin_suppress_tokens").extract[Array[Int]]
    val suppressTokenIds: Array[Int] =
      (modelConfig \ "suppress_tokens").extract[Array[Int]]

    // Each entry must be an (index, tokenId) pair; anything else is rejected.
    val forcedDecoderIds: Array[(Int, Int)] =
      (modelConfig \ "forced_decoder_ids").extract[Array[Array[Int]]].map { entry =>
        if (entry != null && entry.length == 2) (entry(0), entry(1))
        else
          throw new Exception(
            "Could not extract forced_decoder_ids. Should be a list of tuples with 2 entries.")
      }

    // Empty arrays are passed to the generation config as None.
    def arrayOrNone[T](array: Array[T]): Option[Array[T]] =
      if (array.isEmpty) None else Some(array)

    val bosTokenId = (modelConfig \ "bos_token_id").extract[Int]
    val eosTokenId = (modelConfig \ "eos_token_id").extract[Int]
    val padTokenId = (modelConfig \ "pad_token_id").extract[Int]
    val vocabSize = (modelConfig \ "text_config" \ "vocab_size").extract[Int]

    // Prefer the single-file tokenizer description when it is present.
    val hasTokenizerJson = new java.io.File(s"$localModelPath/assets/tokenizer.json").exists()

    val (vocabs, addedTokens, bytePairs) =
      if (hasTokenizerJson) {
        val tokenizerJson: JValue = parse(loadJsonStringAsset(localModelPath, "tokenizer.json"))

        val baseVocab: Map[String, Int] =
          (tokenizerJson \ "model" \ "vocab").extract[Map[String, Int]]

        val mergeRanks = (tokenizerJson \ "model" \ "merges")
          .extract[List[Array[String]]]
          .filter(_.length == 2)
          .map(pair => (pair(0), pair(1)))
          .zipWithIndex
          .toMap

        val extraTokens = (tokenizerJson \ "added_tokens")
          .extract[List[Map[String, Any]]]
          .map { entry =>
            val tokenId = entry("id").asInstanceOf[BigInt].intValue()
            val tokenText = entry("content").asInstanceOf[String]
            tokenText -> tokenId
          }
          .toMap

        // Added tokens are merged into the vocabulary, overriding ids of existing entries.
        val fullVocab = extraTokens.foldLeft(baseVocab) { case (acc, (tokenText, tokenId)) =>
          acc + (tokenText -> tokenId)
        }

        (fullVocab, extraTokens, mergeRanks)
      } else {
        val baseVocab = loadTextAsset(localModelPath, "vocab.txt").zipWithIndex.toMap
        val extraTokens = loadTextAsset(localModelPath, "added_tokens.txt").zipWithIndex.toMap
        val mergeRanks = loadTextAsset(localModelPath, "merges.txt")
          .map(line => line.split(" "))
          .filter(_.length == 2)
          .map(pair => (pair(0), pair(1)))
          .zipWithIndex
          .toMap

        (baseVocab, extraTokens, mergeRanks)
      }

    val generationConfig = GenerationConfig(
      bosTokenId,
      padTokenId,
      eosTokenId,
      vocabSize,
      arrayOrNone(beginSuppressTokens),
      arrayOrNone(suppressTokenIds),
      arrayOrNone(forcedDecoderIds))

    val annotatorModel = new Florence2Transformer()
      .setGenerationConfig(generationConfig)
      .setVocabulary(vocabs)
      .setMerges(bytePairs)
      .setAddedTokens(addedTokens)
      .setSize(preprocessorConfig.size)
      .setImageMean(preprocessorConfig.image_mean)
      .setImageStd(preprocessorConfig.image_std)
      .setResample(preprocessorConfig.resample)

    val modelEngine = if (useOpenvino) Openvino.name else detectedEngine
    annotatorModel.set(annotatorModel.engine, modelEngine)

    // Reads one unzipped, bundled model part by name.
    def readPart(partName: String): OpenvinoWrapper =
      OpenvinoWrapper.read(
        spark,
        localModelPath,
        zipped = false,
        useBundle = true,
        detectedEngine = detectedEngine,
        modelName = partName)

    // NOTE(review): dispatch is on the detected engine, not on `modelEngine` set above, so
    // `useOpenvino = true` with a non-OpenVINO export ends in the exception below; confirm
    // this is intended.
    detectedEngine match {
      case Openvino.name =>
        val encoder = readPart("encoder")
        val decoder = readPart("decoder")
        val textEmbeddings = readPart("text_embedding")
        val imageEmbeddings = readPart("image_embedding")
        val modelMerger = readPart("merger_model")

        val openvinoWrapper = Florence2Wrappers(
          encoderModel = encoder,
          decoderModel = decoder,
          textEmbeddingsModel = textEmbeddings,
          imageEmbedModel = imageEmbeddings,
          modelMergerModel = modelMerger)

        annotatorModel.setModelIfNotSet(spark, preprocessorConfig, None, Some(openvinoWrapper))

      case _ =>
        throw new Exception(notSupportedEngineError)
    }

    annotatorModel
  }

}
554
/** Companion object of [[Florence2Transformer]], combining the pretrained-model entry points
  * with the saved/exported model loaders.
  */
object Florence2Transformer
    extends ReadablePretrainedFlorence2TransformerModel
    with ReadFlorence2TransformerDLModel

JohnSnowLabs / spark-nlp / 15252839065

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous