• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

JohnSnowLabs / spark-nlp / 15252839065

26 May 2025 11:30AM CUT coverage: 52.115% (-0.6%) from 52.715%
15252839065

Pull #14585

github

web-flow
Merge 625e5c10f into 56512b006
Pull Request #14585: SparkNLP 1131 - Introducing Florance-2

0 of 199 new or added lines in 4 files covered. (0.0%)

50 existing lines in 33 files now uncovered.

9931 of 19056 relevant lines covered (52.11%)

0.52 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

47.54
/src/main/scala/com/johnsnowlabs/nlp/annotators/parser/dep/Tagger.scala
1
/*
2
 * Copyright 2017-2022 John Snow Labs
3
 *
4
 * Licensed under the Apache License, Version 2.0 (the "License");
5
 * you may not use this file except in compliance with the License.
6
 * You may obtain a copy of the License at
7
 *
8
 *    http://www.apache.org/licenses/LICENSE-2.0
9
 *
10
 * Unless required by applicable law or agreed to in writing, software
11
 * distributed under the License is distributed on an "AS IS" BASIS,
12
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
 * See the License for the specific language governing permissions and
14
 * limitations under the License.
15
 */
16

17
package com.johnsnowlabs.nlp.annotators.parser.dep
18

19
import com.johnsnowlabs.nlp.annotators.parser.dep.GreedyTransition._
20

21
import scala.collection.mutable
22

23
/** Greedy part-of-speech tagger.
  *
  * Each word receives the class recorded for it in `tagDict`; words that are not listed there
  * are classified by a [[Perceptron]] from features of the surrounding words and of the tags
  * already assigned to the two preceding positions.
  *
  * @param classes
  *   all tag names; the position of a name in this vector is its class number
  * @param tagDict
  *   normalised words whose class number is taken as known, bypassing the perceptron
  */
class Tagger(classes: Vector[ClassName], tagDict: Map[Word, ClassNum]) extends Serializable {
  private val getClassNum =
    classes.zipWithIndex.toMap.withDefaultValue(-1) // -1 => "CLASS-NOT-FOUND"

  private val perceptron = new Perceptron(classes.length)

  /** Builds the feature map for position `i` of a padded sentence.
    *
    * @param word
    *   padded word list; indices `i - 2` to `i + 2` must exist
    * @param pos
    *   tags assigned so far; indices `i - 2` and `i - 1` must exist
    * @param i
    *   index of the word being tagged
    * @return
    *   every feature mapped to the score 1
    */
  def getFeatures(word: List[Word], pos: List[ClassName], i: Int): Map[Feature, Score] =
    featuresAt(word.toVector, pos.toVector, i)

  /** Same contract as [[getFeatures]], but on indexed sequences so that lookups do not walk a
    * linked list on every access.
    */
  private def featuresAt(
      word: IndexedSeq[Word],
      pos: IndexedSeq[ClassName],
      i: Int): Map[Feature, Score] = {
    val featureSet = Set(
      Feature(
        "bias",
        ""
      ), // It's useful to have a constant feature, which acts sort of like a prior
      Feature("word", word(i)),
      Feature("w suffix", word(i).takeRight(3)),
      Feature("w pref1", word(i).take(1)),
      Feature("tag-1", pos(i - 1)),
      Feature("tag-2", pos(i - 2)),
      Feature("tag-1-2", s"${pos(i - 1)} ${pos(i - 2)}"),
      Feature("w,tag-1", s"${word(i)} ${pos(i - 1)}"),
      Feature("w-1", word(i - 1)),
      Feature("w-1 suffix", word(i - 1).takeRight(3)),
      Feature("w-2", word(i - 2)),
      Feature("w+1", word(i + 1)),
      Feature("w+1 suffix", word(i + 1).takeRight(3)),
      Feature("w+2", word(i + 2)))

    // All weights on this set of features are ==1
    featureSet.map(f => (f, 1: Score)).toMap
  }

  /** Runs one training pass over `sentences` in an order shuffled with `seed`.
    *
    * @return
    *   the mean of [[trainSentence]] over all sentences, or 0 when `sentences` is empty (the
    *   division would otherwise yield NaN)
    */
  def train(sentences: List[Sentence], seed: Int): Float =
    if (sentences.isEmpty) 0f
    else {
      val rand = new scala.util.Random(seed)
      rand.shuffle(sentences).map(trainSentence).sum / sentences.length
    }

  /** Tags `sentence` in training mode and returns the [[goodness]] of the tags produced. */
  def trainSentence(sentence: Sentence): Float =
    goodness(sentence, process(sentence, train = true))

  /** Tags `sentence` without updating the perceptron. */
  def tag(sentence: Sentence): List[ClassName] = process(sentence, train = false)

  /** Tags every word of `sentence` from left to right.
    *
    * @param sentence
    *   words to tag; their `pos` is only read when `train` is true
    * @param train
    *   when true, score with `perceptron.current` and update the perceptron after each
    *   prediction; when false, score with `perceptron.average` and leave it untouched
    * @return
    *   one tag name per word of `sentence`, in order
    */
  def process(sentence: Sentence, train: Boolean): List[ClassName] = {
    val wordsNorm = sentence.map(_.norm)
    // Two padding entries on each side keep the i-2 .. i+2 feature window in range.
    val words: Vector[Word] =
      (List("%START%", "%PAD%") ::: wordsNorm ::: List("%ROOT%", "%END%")).toVector
    val goldTags: Vector[ClassNum] =
      if (train) {
        (List(-1, -1) ::: sentence.map(wd => getClassNum(wd.pos)) ::: List(-1, -1)).toVector
      } else Vector.empty

    val allTags =
      wordsNorm.foldLeft(Vector[ClassName]("%START%", "%PAD%")) { (tags, wordNorm) =>
        // One tag is appended per word, so the tag count is also the index of the current word.
        val i = tags.length
        val guess = tagDict.getOrElse(
          wordNorm, { // Don't do the feature scoring if we already 'know' the right PoS
            val features = featuresAt(words, tags, i)
            val score =
              perceptron.score(features, if (train) perceptron.current else perceptron.average)
            val guessed = perceptron.predict(score)

            if (train) {
              perceptron.update(goldTags(i), guessed, features.keys)
            }
            guessed // Use the guessed value for next prediction/learning step (rather than the truth...)
          })
        tags :+ classes(guess)
      }
    allTags.drop(2).toList
  }

  /** Fraction of positions where `fit` equals the `pos` of the corresponding word.
    *
    * @return
    *   matches divided by the sentence length, or 0 for an empty sentence (the division would
    *   otherwise yield NaN)
    */
  def goodness(sentence: Sentence, fit: List[ClassName]): Float = {
    val gold = sentence.map(_.pos)
    if (gold.isEmpty) 0f
    else fit.zip(gold).count { case (guessed, truth) => guessed == truth } / gold.length.toFloat
  }

  /** Text form of the tagger: a `tagger.classes=[...]` line, a `tagger.tag_dict=[...]` line, an
    * empty line, then `perceptron.toString`. Entries inside the brackets are joined with `|`.
    */
  override def toString: String = {
    val eol = System.lineSeparator()
    val classesLine = classes.mkString("tagger.classes=[", "|", "]")
    val tagDictLine = tagDict
      .map({ case (norm, classnum) => s"$norm=$classnum" })
      .mkString("tagger.tag_dict=[", "|", "]")
    classesLine + eol + tagDictLine + eol + eol + perceptron.toString
  }

  /** The lines of `perceptron.toString`. */
  def getPerceptronAsIterator: Iterator[String] = {
    perceptron.toString().split(System.lineSeparator()).iterator
  }

  /** The lines of this tagger's `toString`. */
  def getTaggerAsIterator: Iterator[String] = {
    this.toString().split(System.lineSeparator()).iterator
  }

}
114
object Tagger { // Here, tag == Part-of-Speech

  /** Rebuilds a [[Tagger]] from the line format written by `Tagger.toString`.
    *
    * Header lines of the form `tagger.classes=[a|b|...]` and `tagger.tag_dict=[word=num|...]`
    * are consumed from `lines` until the first line that is neither (that line is consumed too)
    * or until the input ends. Whatever is left in `lines` is then handed to the perceptron's
    * `load`.
    *
    * An empty section (`[]`) leaves the corresponding value empty and parsing continues, so the
    * remaining input stays aligned for the perceptron.
    *
    * @param lines
    *   serialized tagger, one line per element; consumed by this call
    * @return
    *   a tagger built from the parsed classes and tag dictionary
    * @throws IllegalArgumentException
    *   if a tag dictionary entry contains no `=`
    * @throws NumberFormatException
    *   if the text after the last `=` of a tag dictionary entry is not an integer
    */
  def load(lines: Iterator[String]): Tagger = {
    val taggerClasses = """tagger.classes=\[(.*)\]""".r
    val taggerTagDict = """tagger.tag_dict=\[(.*)\]""".r

    var classes = Vector.empty[ClassName]
    val tagDict = mutable.Map[Word, ClassNum]()

    // Split on the LAST '=' so that a word which itself contains '=' still parses.
    def parseEntry(entry: String): (Word, ClassNum) = {
      val sep = entry.lastIndexOf('=')
      require(sep >= 0, s"Malformed tagger.tag_dict entry (no '='): $entry")
      (entry.substring(0, sep), entry.substring(sep + 1).toInt)
    }

    @scala.annotation.tailrec
    def parse(): Unit =
      if (lines.hasNext) lines.next() match {
        case taggerClasses(data) =>
          if (data.nonEmpty) classes = data.split('|').toVector
          parse()
        case taggerTagDict(data) =>
          if (data.nonEmpty) tagDict ++= data.split('|').map(parseEntry)
          parse()
        case _ => () // line not understood : Finish
      }
    parse()

    val t = new Tagger(classes, tagDict.toMap)
    t.perceptron.load(lines)
    t
  }

}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc