• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

JohnSnowLabs / spark-nlp / 7961647790

19 Feb 2024 03:10PM UTC coverage: 62.872% (+0.2%) from 62.707%
7961647790

Pull #14156

github

web-flow
Merge 1bf922044 into 033847426
Pull Request #14156: fixed all sbt warnings

8958 of 14248 relevant lines covered (62.87%)

0.63 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

99.04
/src/main/scala/com/johnsnowlabs/nlp/annotators/MultiDateMatcher.scala
1
/*
2
 * Copyright 2017-2022 John Snow Labs
3
 *
4
 * Licensed under the Apache License, Version 2.0 (the "License");
5
 * you may not use this file except in compliance with the License.
6
 * You may obtain a copy of the License at
7
 *
8
 *    http://www.apache.org/licenses/LICENSE-2.0
9
 *
10
 * Unless required by applicable law or agreed to in writing, software
11
 * distributed under the License is distributed on an "AS IS" BASIS,
12
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
 * See the License for the specific language governing permissions and
14
 * limitations under the License.
15
 */
16

17
package com.johnsnowlabs.nlp.annotators
18

19
import com.johnsnowlabs.nlp.util.regex.RuleFactory
20
import com.johnsnowlabs.nlp.{Annotation, AnnotatorModel, HasSimpleAnnotate}
21
import org.apache.commons.lang3.time.DateUtils
22
import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}
23

24
import java.text.SimpleDateFormat
25
import java.util.Calendar
26
import scala.collection.mutable.ListBuffer
27
import scala.util.matching.Regex
28

29
/** Matches standard date formats into a provided format.
30
  *
31
  * Reads the following kind of dates:
32
  * {{{
33
  * "1978-01-28", "1984/04/02,1/02/1980", "2/28/79", "The 31st of April in the year 2008",
34
  * "Fri, 21 Nov 1997", "Jan 21, ‘97", "Sun", "Nov 21", "jan 1st", "next thursday",
35
  * "last wednesday", "today", "tomorrow", "yesterday", "next week", "next month",
36
  * "next year", "day after", "the day before", "0600h", "06:00 hours", "6pm", "5:30 a.m.",
37
  * "at 5", "12:59", "23:59", "1988/11/23 6pm", "next week at 7.30", "5 am tomorrow"
38
  * }}}
39
  *
40
  * For example `"The 31st of April in the year 2008"` will be converted into `2008/04/31`.
41
  *
42
  * For extended examples of usage, see the
43
  * [[https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/MultiDateMatcherMultiLanguage_en.ipynb Examples]]
44
  * and the
45
  * [[https://github.com/JohnSnowLabs/spark-nlp/blob/master/src/test/scala/com/johnsnowlabs/nlp/annotators/MultiDateMatcherTestSpec.scala MultiDateMatcherTestSpec]].
46
  *
47
  * ==Example==
48
  * {{{
49
  * import spark.implicits._
50
  * import com.johnsnowlabs.nlp.base.DocumentAssembler
51
  * import com.johnsnowlabs.nlp.annotators.MultiDateMatcher
52
  * import org.apache.spark.ml.Pipeline
53
  *
54
  * val documentAssembler = new DocumentAssembler()
55
  *   .setInputCol("text")
56
  *   .setOutputCol("document")
57
  *
58
  * val date = new MultiDateMatcher()
59
  *   .setInputCols("document")
60
  *   .setOutputCol("date")
61
  *   .setAnchorDateYear(2020)
62
  *   .setAnchorDateMonth(1)
63
  *   .setAnchorDateDay(11)
64
  *
65
  * val pipeline = new Pipeline().setStages(Array(
66
  *   documentAssembler,
67
  *   date
68
  * ))
69
  *
70
  * val data = Seq("I saw him yesterday and he told me that he will visit us next week")
71
  *   .toDF("text")
72
  * val result = pipeline.fit(data).transform(data)
73
  *
74
  * result.selectExpr("explode(date) as dates").show(false)
75
  * +-----------------------------------------------+
76
  * |dates                                          |
77
  * +-----------------------------------------------+
78
  * |[date, 57, 65, 2020/01/18, [sentence -> 0], []]|
79
  * |[date, 10, 18, 2020/01/10, [sentence -> 0], []]|
80
  * +-----------------------------------------------+
81
  * }}}
82
  *
83
  * @param uid
84
  *   internal uid required to generate writable annotators
85
  * @groupname anno Annotator types
86
  * @groupdesc anno
87
  *   Required input and expected output annotator types
88
  * @groupname Ungrouped Members
89
  * @groupname param Parameters
90
  * @groupname setParam Parameter setters
91
  * @groupname getParam Parameter getters
92
  * @groupname Ungrouped Members
93
  * @groupprio param  1
94
  * @groupprio anno  2
95
  * @groupprio Ungrouped 3
96
  * @groupprio setParam  4
97
  * @groupprio getParam  5
98
  * @groupdesc param
99
  *   A list of (hyper-)parameter keys this annotator can take. Users can set and get the
100
  *   parameter values through setters and getters, respectively.
101
  */
102
class MultiDateMatcher(override val uid: String)
103
    extends AnnotatorModel[MultiDateMatcher]
104
    with HasSimpleAnnotate[MultiDateMatcher]
105
    with DateMatcherUtils {
106

107
  import com.johnsnowlabs.nlp.AnnotatorType._
108

109
  /** Output Annotator Type : DATE
110
    *
111
    * @group anno
112
    */
113
  override val outputAnnotatorType: AnnotatorType = DATE
1✔
114

115
  /** Input Annotator Type : DOCUMENT
116
    *
117
    * @group anno
118
    */
119
  override val inputAnnotatorTypes: Array[AnnotatorType] = Array(DOCUMENT)
1✔
120

121
  /** Internal constructor to submit a random UID */
122
  def this() = this(Identifiable.randomUID("MULTI_DATE"))
1✔
123

124
  /** Runs the text through [[DateMatcherTranslator]] when the configured source language
    * calls for it.
    *
    * Translation happens only if `getSourceLanguage` is exactly two characters long and is not
    * `"en"`; in every other case the text is returned untouched.
    *
    * @param text
    *   text of the document being annotated
    * @return
    *   the translator output, or `text` itself when no translation applies
    */
  private def runTranslation(text: String) = {
    val language = getSourceLanguage
    val needsTranslation = language.length == 2 && !language.equals("en")

    if (needsTranslation) new DateMatcherTranslator(MultiDatePolicy).translate(text, language)
    else text
  }
133

134
  /** Applies the rules held by `factory` to `text` and parses each hit as a formal date.
    *
    * Hits that resolve to the same calendar are grouped and only the first one of each group
    * is returned. The grouping goes through a map, so the order of the returned matches is
    * whatever order that map iterates in.
    *
    * @param text
    *   text to search
    * @param factory
    *   rule factory whose rules are matched against `text`
    * @return
    *   one parsed match per distinct calendar
    */
  private def findByInputFormatsRules(text: String, factory: RuleFactory): Seq[MatchedDateTime] = {
    val parsedMatches =
      factory.findMatch(text).map(ruleMatch => formalDateContentParse(ruleMatch))

    parsedMatches.groupBy(_.calendar).values.map(_.head).toSeq
  }
1✔
141

142
  /** Searches `text` using only the user supplied input formats.
    *
    * Each entry of `getInputFormats` that is a key of `formalInputFormats` is turned into its
    * regex and registered on `formalFactoryInputFormats`; entries without a mapping are
    * ignored. The factory is then applied to the text.
    *
    * NOTE(review): the rules are added to the shared `formalFactoryInputFormats` on every
    * call. If `addRule` appends rather than replaces, the rule list grows with each annotated
    * document — confirm against `RuleFactory`.
    *
    * @param text
    *   text to search
    * @return
    *   the dates found with the configured input formats
    */
  def runInputFormatsSearch(text: String): Seq[MatchedDateTime] = {
    val knownFormatRegexes: Array[Regex] = getInputFormats.collect {
      case format if formalInputFormats.contains(format) => formalInputFormats(format)
    }

    knownFormatRegexes.foreach { regex =>
      formalFactoryInputFormats.addRule(regex, "formal rule from input formats")
    }

    findByInputFormatsRules(text, formalFactoryInputFormats)
  }
153

154
  /** Runs every date extractor over the text and merges their results.
    *
    * Extractors are applied in the order listed below. A match is appended to the result only
    * when no match kept so far has the same `start`, so earlier extractors win over later
    * ones for a given start offset.
    *
    * @param _text
    *   text to search
    * @return
    *   the merged matches, in the order in which they were accepted
    */
  def runDateExtractorChain(_text: String): Seq[MatchedDateTime] = {
    val extractors: Seq[String => Seq[MatchedDateTime]] = Seq(
      input => extractFormalDate(input),
      input => extractRelativeDatePast(input),
      input => extractRelativeDateFuture(input),
      input => extractRelaxedDate(input),
      input => extractRelativeDate(input),
      input => extractTomorrowYesterday(input),
      input => extractRelativeExactDay(input))

    extractors.foldLeft(Seq.empty[MatchedDateTime]) { (accepted, extract) =>
      extract(_text).foldLeft(accepted) { (kept, candidate) =>
        // A start offset that is already taken belongs to an earlier match; drop the newcomer.
        val startTaken = kept.exists(_.start == candidate.start)
        if (startTaken) kept else kept :+ candidate
      }
    }
  }
176

177
  /** Extracts the dates mentioned in a document text.
    *
    * The text first goes through `runTranslation`. When input formats have been configured
    * (`getInputFormats` differs from `EMPTY_INIT_ARRAY`) only those formats are searched;
    * otherwise the full extractor chain is run.
    *
    * @param text
    *   input text coming from target document
    * @return
    *   every date match found, empty if there is none
    */
  private[annotators] def extractDate(text: String): Seq[MatchedDateTime] = {
    val translated: String = runTranslation(text)
    val noInputFormats = getInputFormats.sameElements(EMPTY_INIT_ARRAY)

    if (noInputFormats) runDateExtractorChain(translated)
    else runInputFormatsSearch(translated)
  }
199

200
  /** Extracts future relative dates, gated by a cheap "in <digit>" pre-check.
    *
    * The pre-check and the rule factory both work on the lower-cased text, so a capitalised
    * keyword (e.g. a sentence starting with "In 3 ...") is not rejected by the guard before
    * the factory gets a chance to match it.
    *
    * @param text
    *   text to search
    * @return
    *   the parsed matches of `relativeFutureFactory`, or an empty sequence when the pre-check
    *   fails
    */
  private def extractRelativeDateFuture(text: String): Seq[MatchedDateTime] = {
    // Lower-case once and use the same value for both the guard and the factory.
    val lowered = text.toLowerCase()
    if ("(.*)\\s*in\\s*[0-9](.*)".r.findFirstMatchIn(lowered).isDefined)
      relativeFutureFactory
        .findMatch(lowered)
        .map(possibleDate => relativeDateFutureContentParse(possibleDate))
    else
      Seq.empty
  }
208

209
  /** Extracts past relative dates, gated by a cheap "<digit> ... ago" pre-check.
    *
    * The pre-check and the rule factory both work on the lower-cased text, so an upper-case
    * keyword (e.g. "2 days AGO") is not rejected by the guard before the factory gets a
    * chance to match it.
    *
    * @param text
    *   text to search
    * @return
    *   the parsed matches of `relativePastFactory`, or an empty sequence when the pre-check
    *   fails
    */
  private def extractRelativeDatePast(text: String): Seq[MatchedDateTime] = {
    // Lower-case once and use the same value for both the guard and the factory.
    val lowered = text.toLowerCase()
    if ("(.*)\\s*[0-9]\\s*(.*)\\s*(ago)(.*)".r.findFirstMatchIn(lowered).isDefined)
      relativePastFactory
        .findMatch(lowered)
        .map(possibleDate => relativeDatePastContentParse(possibleDate))
    else
      Seq.empty
  }
217

218
  /** Extracts formal dates: every hit of `formalFactory` is parsed and the parsed list is then
    * passed through `regularizeFormalDateMatches`.
    *
    * @param text
    *   text to search
    * @return
    *   the regularized formal date matches
    */
  private def extractFormalDate(text: String): Seq[MatchedDateTime] = {
    val parsedMatches =
      for (ruleMatch <- formalFactory.findMatch(text)) yield formalDateContentParse(ruleMatch)

    regularizeFormalDateMatches(parsedMatches)
  }
225

226
  /** Drops formal matches that are merely the month-truncated version of another match.
    *
    * For every match (the "reference") its calendar is truncated to the month with
    * `DateUtils.truncate`; any other match whose calendar equals that truncated value is
    * removed from the result.
    *
    * A reference whose calendar already equals its own truncation is skipped. Without that
    * guard, two matches carrying the same already-truncated calendar would each be treated as
    * the truncation of the other and both would be removed.
    *
    * @return
    *   a function from all formal matches to the matches that survive, in their original order
    */
  private def regularizeFormalDateMatches: Seq[MatchedDateTime] => Seq[MatchedDateTime] =
    allFormalDateMatches => {
      val indexedMatches: Seq[(MatchedDateTime, Int)] = allFormalDateMatches.zipWithIndex

      val indexesToRemove: Set[Int] = indexedMatches.flatMap { case (reference, _) =>
        // Truncate once per reference instead of once per (reference, candidate) pair.
        val truncated = DateUtils.truncate(reference.calendar, Calendar.MONTH)

        if (truncated.equals(reference.calendar))
          // Nothing was cut off, so an equal candidate is a duplicate, not a truncation.
          Seq.empty[Int]
        else
          // The reference differs from `truncated`, so it can never select itself here.
          indexedMatches.collect {
            case (candidate, candidateIdx) if truncated.equals(candidate.calendar) =>
              candidateIdx
          }
      }.toSet

      indexedMatches.collect {
        case (matched, idx) if !indexesToRemove.contains(idx) => matched
      }
    }
248

249
  /** Extracts relaxed dates: the hits of `relaxedFactory` are grouped by their `indexMatch`
    * and each group is handed to `computePossibleDates`.
    *
    * @param text
    *   text to search
    * @return
    *   the dates produced by all groups, in the iteration order of the grouping map
    */
  private def extractRelaxedDate(text: String): Seq[MatchedDateTime] =
    relaxedFactory
      .findMatch(text)
      .groupBy(_.indexMatch)
      .values
      .flatMap(group => computePossibleDates(group))
      .toSeq
256

257
  /** Assembles a single date out of a group of relaxed rule matches.
    *
    * Day, month and year start from `$(defaultDayWhenMissing)`, `defaultMonthWhenMissing` and
    * `defaultYearWhenMissing`. The matches are then visited in order and, depending on the
    * rule identifier, overwrite one of the three fields:
    *   - `"relaxed days"` containing a digit: the digits of the match become the day
    *   - `"relaxed months exclusive"` longer than two characters: the lower-cased first three
    *     letters are looked up in `shortMonths` and, when present, their index becomes the
    *     month; an unknown prefix leaves the month unchanged but still counts as a hit
    *   - `"relaxed year"` containing a digit and longer than two characters: the digits
    *     become the year, with values up to 999 shifted by 1900
    *
    * A date is produced only when more than one such hit occurred.
    *
    * @param possibleDates
    *   rule matches belonging to one group
    * @return
    *   a single match spanning from the smallest start to the largest end of the group, or an
    *   empty sequence when fewer than two hits occurred
    */
  private def computePossibleDates(
      possibleDates: Seq[RuleFactory.RuleMatch]): Seq[MatchedDateTime] = {
    // Accumulator layout: (day, month, year, number of hits).
    val defaults =
      ($(defaultDayWhenMissing), defaultMonthWhenMissing, defaultYearWhenMissing, 0)

    val (day, month, year, hits) = possibleDates.foldLeft(defaults) {
      case ((currentDay, currentMonth, currentYear, count), ruleMatch) =>
        def matchedText: String = ruleMatch.content.matched

        ruleMatch.identifier match {
          case "relaxed days" if matchedText.exists(_.isDigit) =>
            (matchedText.filter(_.isDigit).toInt, currentMonth, currentYear, count + 1)

          case "relaxed months exclusive" if matchedText.length > 2 =>
            val prefix = matchedText.toLowerCase().take(3)
            val resolvedMonth =
              if (shortMonths.contains(prefix)) shortMonths.indexOf(prefix) else currentMonth
            (currentDay, resolvedMonth, currentYear, count + 1)

          case "relaxed year" if matchedText.exists(_.isDigit) && matchedText.length > 2 =>
            val digits = matchedText.filter(_.isDigit).toInt
            val resolvedYear = if (digits > 999) digits else digits + 1900
            (currentDay, currentMonth, resolvedYear, count + 1)

          case _ =>
            (currentDay, currentMonth, currentYear, count)
        }
    }

    // More than one hit implies the group is non-empty, so min/max below are safe.
    if (hits > 1) {
      val calendar = new Calendar.Builder().setDate(year, month, day).build()
      val spanStart = possibleDates.map(_.content.start).min
      val spanEnd = possibleDates.map(_.content.end).max
      Seq(MatchedDateTime(calendar, spanStart, spanEnd))
    } else Seq.empty
  }
297

298
  /** Parses every hit of `relativeFactory` on the text with `relativeDateContentParse`.
    *
    * @param text
    *   text to search
    * @return
    *   one parsed match per factory hit
    */
  private def extractRelativeDate(text: String): Seq[MatchedDateTime] =
    for (ruleMatch <- relativeFactory.findMatch(text)) yield relativeDateContentParse(ruleMatch)
301

302
  /** Parses every hit of `tyFactory` on the text with `tomorrowYesterdayContentParse`.
    *
    * @param text
    *   text to search
    * @return
    *   one parsed match per factory hit
    */
  private def extractTomorrowYesterday(text: String): Seq[MatchedDateTime] =
    for (ruleMatch <- tyFactory.findMatch(text)) yield tomorrowYesterdayContentParse(ruleMatch)
307

308
  /** Parses every hit of `relativeExactFactory` on the lower-cased text with
    * `relativeExactContentParse`.
    *
    * @param text
    *   text to search; it is lower-cased before matching
    * @return
    *   one parsed match per factory hit
    */
  private def extractRelativeExactDay(text: String): Seq[MatchedDateTime] = {
    val lowered = text.toLowerCase
    for (ruleMatch <- relativeExactFactory.findMatch(lowered))
      yield relativeExactContentParse(ruleMatch)
  }
313

314
  /** Turns every date found in the incoming annotations into a DATE annotation.
    *
    * Each input annotation may yield zero, one or several outputs: one per match returned by
    * `extractDate` on its `result`. The date is rendered with a `SimpleDateFormat` built from
    * `getOutputFormat`, and the metadata of the originating annotation is carried over.
    *
    * @param annotations
    *   annotations whose `result` text is searched for dates
    * @return
    *   the date annotations found, empty if there are none
    */
  override def annotate(annotations: Seq[Annotation]): Seq[Annotation] = {
    // A fresh formatter per call; it is only used within this invocation.
    val formatter = new SimpleDateFormat(getOutputFormat)

    for {
      annotation <- annotations
      matchedDate <- extractDate(annotation.result)
    } yield Annotation(
      outputAnnotatorType,
      matchedDate.start,
      // presumably `end` is exclusive and the annotation end is inclusive — confirm
      matchedDate.end - 1,
      formatter.format(matchedDate.calendar.getTime),
      annotation.metadata)
  }
331

332
}
333

334
/** This is the companion object of [[MultiDateMatcher]]. Please refer to that class for the
335
  * documentation.
336
  */
337
object MultiDateMatcher extends DefaultParamsReadable[MultiDateMatcher]
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc