• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

JohnSnowLabs / spark-nlp / 11429325160

20 Oct 2024 08:18PM UTC coverage: 60.052% (-0.2%) from 60.216%
11429325160

Pull #14439

github

web-flow
Merge 1c191569d into 9db33328b
Pull Request #14439: [SPARKNLP-1067] PromptAssembler

0 of 50 new or added lines in 2 files covered. (0.0%)

48 existing lines in 26 files now uncovered.

8985 of 14962 relevant lines covered (60.05%)

0.6 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

99.07
/src/main/scala/com/johnsnowlabs/nlp/annotators/MultiDateMatcher.scala
1
/*
2
 * Copyright 2017-2022 John Snow Labs
3
 *
4
 * Licensed under the Apache License, Version 2.0 (the "License");
5
 * you may not use this file except in compliance with the License.
6
 * You may obtain a copy of the License at
7
 *
8
 *    http://www.apache.org/licenses/LICENSE-2.0
9
 *
10
 * Unless required by applicable law or agreed to in writing, software
11
 * distributed under the License is distributed on an "AS IS" BASIS,
12
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
 * See the License for the specific language governing permissions and
14
 * limitations under the License.
15
 */
16

17
package com.johnsnowlabs.nlp.annotators
18

19
import com.johnsnowlabs.nlp.util.regex.RuleFactory
20
import com.johnsnowlabs.nlp.{Annotation, AnnotatorModel, HasSimpleAnnotate}
21
import org.apache.commons.lang3.time.DateUtils
22
import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}
23

24
import java.text.SimpleDateFormat
25
import java.util.Calendar
26
import scala.collection.mutable.ListBuffer
27
import scala.util.matching.Regex
28

29
/** Matches standard date formats into a provided format.
30
  *
31
  * Reads the following kind of dates:
32
  * {{{
33
  * "1978-01-28", "1984/04/02", "1/02/1980", "2/28/79", "The 31st of April in the year 2008",
34
  * "Fri, 21 Nov 1997", "Jan 21, ‘97", "Sun", "Nov 21", "jan 1st", "next thursday",
35
  * "last wednesday", "today", "tomorrow", "yesterday", "next week", "next month",
36
  * "next year", "day after", "the day before", "0600h", "06:00 hours", "6pm", "5:30 a.m.",
37
  * "at 5", "12:59", "23:59", "1988/11/23 6pm", "next week at 7.30", "5 am tomorrow"
38
  * }}}
39
  *
40
  * For example `"The 31st of April in the year 2008"` will be converted into `2008/04/31`.
41
  *
42
  * For extended examples of usage, see the
43
  * [[https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/MultiDateMatcherMultiLanguage_en.ipynb Examples]]
44
  * and the
45
  * [[https://github.com/JohnSnowLabs/spark-nlp/blob/master/src/test/scala/com/johnsnowlabs/nlp/annotators/MultiDateMatcherTestSpec.scala MultiDateMatcherTestSpec]].
46
  *
47
  * ==Example==
48
  * {{{
49
  * import spark.implicits._
50
  * import com.johnsnowlabs.nlp.base.DocumentAssembler
51
  * import com.johnsnowlabs.nlp.annotators.MultiDateMatcher
52
  * import org.apache.spark.ml.Pipeline
53
  *
54
  * val documentAssembler = new DocumentAssembler()
55
  *   .setInputCol("text")
56
  *   .setOutputCol("document")
57
  *
58
  * val date = new MultiDateMatcher()
59
  *   .setInputCols("document")
60
  *   .setOutputCol("date")
61
  *   .setAnchorDateYear(2020)
62
  *   .setAnchorDateMonth(1)
63
  *   .setAnchorDateDay(11)
64
  *
65
  * val pipeline = new Pipeline().setStages(Array(
66
  *   documentAssembler,
67
  *   date
68
  * ))
69
  *
70
  * val data = Seq("I saw him yesterday and he told me that he will visit us next week")
71
  *   .toDF("text")
72
  * val result = pipeline.fit(data).transform(data)
73
  *
74
  * result.selectExpr("explode(date) as dates").show(false)
75
  * +-----------------------------------------------+
76
  * |dates                                          |
77
  * +-----------------------------------------------+
78
  * |[date, 57, 65, 2020/01/18, [sentence -> 0], []]|
79
  * |[date, 10, 18, 2020/01/10, [sentence -> 0], []]|
80
  * +-----------------------------------------------+
81
  * }}}
82
  *
83
  * @param uid
84
  *   internal uid required to generate writable annotators
85
  * @groupname anno Annotator types
86
  * @groupdesc anno
87
  *   Required input and expected output annotator types
88
  * @groupname Ungrouped Members
89
  * @groupname param Parameters
90
  * @groupname setParam Parameter setters
91
  * @groupname getParam Parameter getters
92
  * @groupname Ungrouped Members
93
  * @groupprio param  1
94
  * @groupprio anno  2
95
  * @groupprio Ungrouped 3
96
  * @groupprio setParam  4
97
  * @groupprio getParam  5
98
  * @groupdesc param
99
  *   A list of (hyper-)parameter keys this annotator can take. Users can set and get the
100
  *   parameter values through setters and getters, respectively.
101
  */
102
class MultiDateMatcher(override val uid: String)
    extends AnnotatorModel[MultiDateMatcher]
    with HasSimpleAnnotate[MultiDateMatcher]
    with DateMatcherUtils {

  import com.johnsnowlabs.nlp.AnnotatorType._

  /** Output Annotator Type : DATE
    *
    * @group anno
    */
  override val outputAnnotatorType: AnnotatorType = DATE

  /** Input Annotator Type : DOCUMENT
    *
    * @group anno
    */
  override val inputAnnotatorTypes: Array[AnnotatorType] = Array(DOCUMENT)

  /** Internal constructor to submit a random UID */
  def this() = this(Identifiable.randomUID("MULTI_DATE"))

  /** Pre-check applied before the relative-future rules are run: the text has to contain "in"
    * followed (optionally after whitespace) by a digit. Compiled once per instance instead of on
    * every call.
    */
  private val relativeFutureGuard: Regex = "(.*)\\s*in\\s*[0-9](.*)".r

  /** Pre-check applied before the relative-past rules are run: the text has to contain a digit
    * that is eventually followed by "ago". Compiled once per instance instead of on every call.
    */
  private val relativePastGuard: Regex = "(.*)\\s*[0-9]\\s*(.*)\\s*(ago)(.*)".r

  /** Lower-cases `text` with a fixed locale so that the ASCII keywords the rules look for are not
    * altered by the JVM default locale (e.g. "IN" must become "in", never a dotless "ın").
    */
  private def toLowerRoot(text: String): String = text.toLowerCase(java.util.Locale.ROOT)

  /** Translates `text` with a [[DateMatcherTranslator]] when the configured source language is a
    * two-character code other than "en"; otherwise returns `text` unchanged.
    */
  private def runTranslation(text: String): String = {
    val sourceLanguage = getSourceLanguage
    val needsTranslation = sourceLanguage.length == 2 && !sourceLanguage.equals("en")

    if (needsTranslation)
      new DateMatcherTranslator(MultiDatePolicy).translate(text, sourceLanguage)
    else
      text
  }

  /** Runs `factory` over `text`, parses every match as a formal date and keeps a single match
    * (the first one found) per distinct calendar value.
    */
  private def findByInputFormatsRules(text: String, factory: RuleFactory): Seq[MatchedDateTime] =
    factory
      .findMatch(text)
      .map(formalDateContentParse(_))
      .groupBy(_.calendar)
      .map { case (_, sameCalendar) => sameCalendar.head }
      .toSeq

  /** Searches `text` using only the user supplied input formats that have an entry in
    * `formalInputFormats`.
    *
    * @param text
    *   text to search
    * @return
    *   the matches found, one per distinct calendar value
    */
  def runInputFormatsSearch(text: String): Seq[MatchedDateTime] = {
    val regexes: Array[Regex] = getInputFormats
      .filter(formalInputFormats.contains(_))
      .map(formalInputFormats(_))

    // NOTE(review): the rules are added to the shared `formalFactoryInputFormats` on every call.
    // Whether RuleFactory de-duplicates repeated rules is not visible from this file - confirm,
    // otherwise the rule set grows with every processed document.
    regexes.foreach(regex =>
      formalFactoryInputFormats.addRule(regex, "formal rule from input formats"))

    findByInputFormatsRules(text, formalFactoryInputFormats)
  }

  /** Applies every extraction strategy in order, from formal dates to the more relaxed ones, and
    * merges their results.
    *
    * A match is only added when no match collected so far begins at the same start offset, so
    * strategies that run earlier take precedence.
    *
    * @param _text
    *   text to search (already translated when translation applies)
    * @return
    *   the merged matches, in the order in which they were accepted
    */
  def runDateExtractorChain(_text: String): Seq[MatchedDateTime] = {
    val strategies: Seq[() => Seq[MatchedDateTime]] = Seq(
      () => extractFormalDate(_text),
      () => extractRelativeDatePast(_text),
      () => extractRelativeDateFuture(_text),
      () => extractRelaxedDate(_text),
      () => extractRelativeDate(_text),
      () => extractTomorrowYesterday(_text),
      () => extractRelativeExactDay(_text))

    strategies.foldLeft(Seq.empty[MatchedDateTime]) { (collected, strategy) =>
      strategy().foldLeft(collected) { (accepted, candidate) =>
        // Only the start offset is compared: a candidate that starts where an accepted match
        // starts is dropped, any other candidate is appended.
        if (accepted.exists(_.start == candidate.start)) accepted
        else accepted :+ candidate
      }
    }
  }

  /** Finds the dates contained in `text`.
    *
    * When input formats are configured only those formats are searched; otherwise the full
    * extractor chain runs. With aggressive matching enabled, the full chain is used as a fallback
    * whenever the first attempt finds nothing.
    *
    * @param text
    *   input text coming from target document
    * @return
    *   every date match found, empty if there is none
    */
  private[annotators] def extractDate(text: String): Seq[MatchedDateTime] = {
    val _text: String = runTranslation(text)

    val inputFormatsAreDefined = !getInputFormats.sameElements(EMPTY_INIT_ARRAY)

    val possibleDates: Seq[MatchedDateTime] =
      if (inputFormatsAreDefined) runInputFormatsSearch(_text)
      else runDateExtractorChain(_text)

    if (getAggressiveMatching && possibleDates.isEmpty) runDateExtractorChain(_text)
    else possibleDates
  }

  /** Extracts relative dates in the future.
    *
    * The guard and the rules both run on the lower-cased text, so the keyword is recognised
    * regardless of its casing.
    */
  private def extractRelativeDateFuture(text: String): Seq[MatchedDateTime] = {
    val lowered = toLowerRoot(text)
    if (relativeFutureGuard.findFirstMatchIn(lowered).isDefined)
      relativeFutureFactory
        .findMatch(lowered)
        .map(possibleDate => relativeDateFutureContentParse(possibleDate))
    else
      Seq.empty
  }

  /** Extracts relative dates in the past.
    *
    * The guard and the rules both run on the lower-cased text, so the keyword is recognised
    * regardless of its casing.
    */
  private def extractRelativeDatePast(text: String): Seq[MatchedDateTime] = {
    val lowered = toLowerRoot(text)
    if (relativePastGuard.findFirstMatchIn(lowered).isDefined)
      relativePastFactory
        .findMatch(lowered)
        .map(possibleDate => relativeDatePastContentParse(possibleDate))
    else
      Seq.empty
  }

  /** Extracts formal dates and removes the matches that merely duplicate the month-truncated
    * value of another match.
    */
  private def extractFormalDate(text: String): Seq[MatchedDateTime] = {
    val allFormalDateMatches =
      formalFactory.findMatch(text).map(possibleDate => formalDateContentParse(possibleDate))

    regularizeFormalDateMatches(allFormalDateMatches)
  }

  /** Drops every match whose calendar equals another match's calendar truncated to
    * `Calendar.MONTH`; the remaining matches keep their original order.
    */
  private def regularizeFormalDateMatches: Seq[MatchedDateTime] => Seq[MatchedDateTime] =
    allFormalDateMatches => {
      val indexedMatches: Seq[(MatchedDateTime, Int)] = allFormalDateMatches.zipWithIndex

      // NOTE(review): two distinct matches that both fall on the first instant of the same month
      // remove each other here - confirm this is intended.
      val indexesToRemove: Set[Int] = indexedMatches.flatMap { case (existing, existingIdx) =>
        // The truncated value does not depend on the candidate, so compute it once per match.
        val truncated = DateUtils.truncate(existing.calendar, Calendar.MONTH)
        indexedMatches.collect {
          case (candidate, candidateIdx)
              if candidateIdx != existingIdx && truncated.equals(candidate.calendar) =>
            candidateIdx
        }
      }.toSet

      indexedMatches.collect { case (kept, idx) if !indexesToRemove.contains(idx) => kept }
    }

  /** Extracts relaxed dates: the relaxed rule matches are grouped by their `indexMatch` and every
    * group is combined into at most one date.
    */
  private def extractRelaxedDate(text: String): Seq[MatchedDateTime] =
    relaxedFactory
      .findMatch(text)
      .groupBy(_.indexMatch)
      .flatMap { case (_, sameIndexMatches) => computePossibleDates(sameIndexMatches) }
      .toSeq

  /** Combines the day, month and year fragments of one group of relaxed matches into a date.
    *
    * A date is only produced when at least two fragments were recognised; components that are
    * missing fall back to the configured defaults. When a component is matched more than once
    * the last occurrence wins.
    *
    * @param possibleDates
    *   relaxed rule matches belonging to the same group
    * @return
    *   a single date spanning from the smallest start to the largest end offset of the group, or
    *   empty
    */
  private def computePossibleDates(
      possibleDates: Seq[RuleFactory.RuleMatch]): Seq[MatchedDateTime] = {

    val days = possibleDates.collect {
      case m if m.identifier == "relaxed days" && m.content.matched.exists(_.isDigit) =>
        m.content.matched.filter(_.isDigit).toInt
    }

    val monthTokens = possibleDates.collect {
      case m
          if m.identifier == "relaxed months exclusive" && m.content.matched.length > 2 =>
        toLowerRoot(m.content.matched).take(3)
    }

    val years = possibleDates.collect {
      case m
          if m.identifier == "relaxed year" &&
            m.content.matched.exists(_.isDigit) &&
            m.content.matched.length > 2 =>
        val year = m.content.matched.filter(_.isDigit).toInt
        // Values below 1000 are treated as an offset from 1900
        if (year > 999) year else year + 1900
    }

    // Every recognised fragment counts, including a month token that is not a known short month
    val changes = days.size + monthTokens.size + years.size

    if (changes > 1) {
      val dayMatch = days.lastOption.getOrElse($(defaultDayWhenMissing))
      val monthMatch = monthTokens
        .filter(token => shortMonths.contains(token))
        .lastOption
        .map(token => shortMonths.indexOf(token))
        .getOrElse(defaultMonthWhenMissing)
      val yearMatch = years.lastOption.getOrElse(defaultYearWhenMissing)

      val calendar = new Calendar.Builder()
      calendar.setDate(yearMatch, monthMatch, dayMatch)
      Seq(
        MatchedDateTime(
          calendar.build(),
          possibleDates.map(_.content.start).min,
          possibleDates.map(_.content.end).max))
    } else Seq.empty
  }

  /** Extracts relative dates using the rules of `relativeFactory`. */
  private def extractRelativeDate(text: String): Seq[MatchedDateTime] =
    relativeFactory.findMatch(text).map(possibleDate => relativeDateContentParse(possibleDate))

  /** Extracts dates using the rules of `tyFactory`. */
  private def extractTomorrowYesterday(text: String): Seq[MatchedDateTime] =
    tyFactory
      .findMatch(text)
      .map(possibleDate => tomorrowYesterdayContentParse(possibleDate))

  /** Extracts dates using the rules of `relativeExactFactory`, applied to the lower-cased text. */
  private def extractRelativeExactDay(text: String): Seq[MatchedDateTime] =
    relativeExactFactory
      .findMatch(toLowerRoot(text))
      .map(possibleDate => relativeExactContentParse(possibleDate))

  /** Produces one output annotation per date found in each input annotation.
    *
    * @return
    *   Any found date, empty if not. Final format is [[outputFormat]] or default yyyy/MM/dd
    */
  override def annotate(annotations: Seq[Annotation]): Seq[Annotation] = {
    // SimpleDateFormat is not thread-safe, hence one instance per call
    val simpleDateFormat = new SimpleDateFormat(getOutputFormat)

    for {
      annotation <- annotations
      matchedDate <- extractDate(annotation.result)
    } yield Annotation(
      outputAnnotatorType,
      matchedDate.start,
      // the match end is turned into an inclusive index
      matchedDate.end - 1,
      simpleDateFormat.format(matchedDate.calendar.getTime),
      annotation.metadata)
  }

}
335

336
/** This is the companion object of [[MultiDateMatcher]]. Please refer to that class for the
337
  * documentation.
338
  */
339
// Mixing in DefaultParamsReadable supplies the default reader used to load a saved MultiDateMatcher.
object MultiDateMatcher extends DefaultParamsReadable[MultiDateMatcher]
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc