• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

JohnSnowLabs / spark-nlp / 4992350528

pending completion
4992350528

Pull #13797

github

GitHub
Merge 424c7ff18 into ef7906c5e
Pull Request #13797: SPARKNLP-835: ProtectedParam and ProtectedFeature

24 of 24 new or added lines in 6 files covered. (100.0%)

8643 of 13129 relevant lines covered (65.83%)

0.66 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

97.2
/src/main/scala/com/johnsnowlabs/nlp/annotators/DateMatcher.scala
1
/*
2
 * Copyright 2017-2022 John Snow Labs
3
 *
4
 * Licensed under the Apache License, Version 2.0 (the "License");
5
 * you may not use this file except in compliance with the License.
6
 * You may obtain a copy of the License at
7
 *
8
 *    http://www.apache.org/licenses/LICENSE-2.0
9
 *
10
 * Unless required by applicable law or agreed to in writing, software
11
 * distributed under the License is distributed on an "AS IS" BASIS,
12
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
 * See the License for the specific language governing permissions and
14
 * limitations under the License.
15
 */
16

17
package com.johnsnowlabs.nlp.annotators
18

19
import com.johnsnowlabs.nlp.util.regex.RuleFactory
20
import com.johnsnowlabs.nlp.util.regex.RuleFactory.RuleMatch
21
import com.johnsnowlabs.nlp.{Annotation, AnnotatorModel, HasSimpleAnnotate}
22
import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}
23

24
import java.text.SimpleDateFormat
25
import java.util.Calendar
26
import scala.util.matching.Regex
27

28
/** Matches standard date formats into a provided format. Reads from different forms of date and
29
  * time expressions and converts them to a provided date format.
30
  *
31
  * Extracts only '''one''' date per document. Use with sentence detector to find matches in each
32
  * sentence. To extract multiple dates from a document, please use the [[MultiDateMatcher]].
33
  *
34
  * Reads the following kind of dates:
35
  * {{{
36
  * "1978-01-28", "1984/04/02,1/02/1980", "2/28/79", "The 31st of April in the year 2008",
37
  * "Fri, 21 Nov 1997", "Jan 21, ‘97", "Sun", "Nov 21", "jan 1st", "next thursday",
38
  * "last wednesday", "today", "tomorrow", "yesterday", "next week", "next month",
39
  * "next year", "day after", "the day before", "0600h", "06:00 hours", "6pm", "5:30 a.m.",
40
  * "at 5", "12:59", "23:59", "1988/11/23 6pm", "next week at 7.30", "5 am tomorrow"
41
  * }}}
42
  *
43
  * For example `"The 31st of April in the year 2008"` will be converted into `2008/04/31`.
44
  *
45
  * Pretrained pipelines are available for this module, see
46
  * [[https://sparknlp.org/docs/en/pipelines Pipelines]].
47
  *
48
  * For extended examples of usage, see the
49
  * [[https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/MultiDateMatcherMultiLanguage_en.ipynb Examples]]
50
  * and the
51
  * [[https://github.com/JohnSnowLabs/spark-nlp/blob/master/src/test/scala/com/johnsnowlabs/nlp/annotators/DateMatcherTestSpec.scala DateMatcherTestSpec]].
52
  *
53
  * ==Example==
54
  * {{{
55
  * import spark.implicits._
56
  * import com.johnsnowlabs.nlp.base.DocumentAssembler
57
  * import com.johnsnowlabs.nlp.annotators.DateMatcher
58
  * import org.apache.spark.ml.Pipeline
59
  *
60
  * val documentAssembler = new DocumentAssembler()
61
  *   .setInputCol("text")
62
  *   .setOutputCol("document")
63
  *
64
  * val date = new DateMatcher()
65
  *   .setInputCols("document")
66
  *   .setOutputCol("date")
67
  *   .setAnchorDateYear(2020)
68
  *   .setAnchorDateMonth(1)
69
  *   .setAnchorDateDay(11)
70
  *
71
  * val pipeline = new Pipeline().setStages(Array(
72
  *   documentAssembler,
73
  *   date
74
  * ))
75
  *
76
  * val data = Seq("Fri, 21 Nov 1997", "next week at 7.30", "see you a day after").toDF("text")
77
  * val result = pipeline.fit(data).transform(data)
78
  *
79
  * result.selectExpr("date").show(false)
80
  * +-------------------------------------------------+
81
  * |date                                             |
82
  * +-------------------------------------------------+
83
  * |[[date, 5, 15, 1997/11/21, [sentence -> 0], []]] |
84
  * |[[date, 0, 8, 2020/01/18, [sentence -> 0], []]]  |
85
  * |[[date, 10, 18, 2020/01/12, [sentence -> 0], []]]|
86
  * +-------------------------------------------------+
87
  * }}}
88
  *
89
  * @see
90
  *   [[MultiDateMatcher]] for matching multiple dates in a document
91
  * @param uid
92
  *   internal uid required to generate writable annotators
93
  * @groupname anno Annotator types
94
  * @groupdesc anno
95
  *   Required input and expected output annotator types
96
  * @groupname Ungrouped Members
97
  * @groupname param Parameters
98
  * @groupname setParam Parameter setters
99
  * @groupname getParam Parameter getters
100
  * @groupname Ungrouped Members
101
  * @groupprio param  1
102
  * @groupprio anno  2
103
  * @groupprio Ungrouped 3
104
  * @groupprio setParam  4
105
  * @groupprio getParam  5
106
  * @groupdesc param
107
  *   A list of (hyper-)parameter keys this annotator can take. Users can set and get the
108
  *   parameter values through setters and getters, respectively.
109
  */
110
class DateMatcher(override val uid: String)
111
    extends AnnotatorModel[DateMatcher]
112
    with HasSimpleAnnotate[DateMatcher]
113
    with DateMatcherUtils {
114

115
  import com.johnsnowlabs.nlp.AnnotatorType._
116

117
  /** Annotator type produced by this stage: DATE
    *
    * @group anno
    */
  override val outputAnnotatorType: AnnotatorType = DATE

  /** Annotator types consumed by this stage: DOCUMENT
    *
    * @group anno
    */
  override val inputAnnotatorTypes: Array[AnnotatorType] = Array(DOCUMENT)

  /** Auxiliary constructor that supplies a random UID built from the "DATE" prefix */
  def this() = this(Identifiable.randomUID("DATE"))
1✔
131

132
  /** Takes the first match that `factory` finds in `text` and parses it with
    * `formalDateContentParse`.
    *
    * @param text
    *   text to search
    * @param factory
    *   rule factory used for the lookup
    * @return
    *   the parsed match, or None when the factory finds nothing
    */
  private def runFormalFactoryForInputFormats(
      text: String,
      factory: RuleFactory): Option[MatchedDateTime] =
    for (firstHit <- factory.findMatchFirstOnly(text)) yield formalDateContentParse(firstHit)
139

140
  /** Searches `text` using only the configured input formats.
    *
    * Every value returned by `getInputFormats` that is a key of `formalInputFormats` is resolved
    * to its regex, registered on `formalFactoryInputFormats`, and the factory is then run on
    * `text`. Formats without an entry in `formalInputFormats` are ignored.
    *
    * @param text
    *   text to search
    * @return
    *   the first parsed match, or None
    */
  def runInputFormatsSearch(text: String): Option[MatchedDateTime] = {
    val knownFormatRegexes: Array[Regex] =
      getInputFormats
        .filter(format => formalInputFormats.contains(format))
        .map(format => formalInputFormats(format))

    // NOTE(review): rules are registered on the shared factory on every call; whether `addRule`
    // de-duplicates repeated registrations is not visible from here - confirm.
    knownFormatRegexes.foreach { regex =>
      formalFactoryInputFormats.addRule(regex, "formal rule from input formats")
    }

    runFormalFactoryForInputFormats(text, formalFactoryInputFormats)
  }
151

152
  /** Finds a date in the (possibly translated) text. When input formats are configured only those
    * are searched; otherwise the extractor chain is run, from formal to more relaxed. When no
    * date is found, a stand-alone time expression is looked up instead.
    *
    * @param text
    *   input text coming from target document
    * @return
    *   a possible date-time match
    */
  private[annotators] def extractDate(text: String): Option[MatchedDateTime] = {
    val translated: String = runTranslation(text)

    val hasInputFormats = !getInputFormats.sameElements(EMPTY_INIT_ARRAY)

    val dateMatch: Option[MatchedDateTime] =
      if (hasInputFormats) runInputFormatsSearch(translated)
      else runDateExtractorChain(translated)

    // NOTE(review): the time lookup only runs when no date matched, so a time is never merged
    // into an already matched date - confirm this is the intended behaviour.
    dateMatch match {
      case found @ Some(_) => found
      case None => setTimeIfAny(dateMatch, translated)
    }
  }
174

175
  /** Runs the date extractors one after another and returns the first result that is defined.
    * Extractors after the first successful one are not evaluated.
    */
  private def runDateExtractorChain(_text: String) = {
    // Order matters: it defines which kind of expression wins when several could match.
    val extractors = Iterator[String => Option[MatchedDateTime]](
      extractFormalDate,
      extractRelativeDatePast,
      extractRelativeDateFuture,
      extractRelaxedDate,
      extractRelativeDate,
      extractTomorrowYesterday,
      extractRelativeExactDay)

    // Iterator.map is lazy, so `find` stops at the first extractor that yields a match.
    extractors.map(extract => extract(_text)).find(_.isDefined).flatten
  }
184

185
  /** Translates `text` with a [[DateMatcherTranslator]] when the source language code has exactly
    * two characters and is not "en"; otherwise returns `text` unchanged.
    */
  private def runTranslation(text: String) = {
    val language = getSourceLanguage
    val isTwoLetterCode = language.length == 2
    val isNotEnglish = !language.equals("en")

    if (isTwoLetterCode && isNotEnglish)
      new DateMatcherTranslator(SingleDatePolicy).translate(text, language)
    else
      text
  }
194

195
  /** Parses the first match of `formalFactory` in `text` with `formalDateContentParse`.
    *
    * @return
    *   the parsed match, or None when nothing matches
    */
  private def extractFormalDate(text: String): Option[MatchedDateTime] =
    for (firstHit <- formalFactory.findMatchFirstOnly(text)) yield formalDateContentParse(firstHit)
200

201
  /** Checks that the matched month text appears as exactly one whole word of `text`.
    *
    * Before splitting on `SPACE_CHAR`, the characters `?.!:` and stand-alone words of one or two
    * letters are removed (together with trailing whitespace). A word counts when, lower-cased, it
    * contains the lower-cased matched text and is not longer than the matched text.
    *
    * @param text
    *   text in which the match was found
    * @param d
    *   the month rule match to verify
    * @return
    *   true when exactly one word qualifies, false otherwise
    */
  private def isNotMonthSubwordMatch(text: String, d: RuleMatch): Boolean = {
    val cleaned = text.replaceAll("""([?.!:]|\b\p{IsLetter}{1,2}\b)\s*""", "")

    val wholeWordHits = cleaned.split(SPACE_CHAR).count { word =>
      val lowered = word.toLowerCase
      lowered.contains(d.content.matched.toLowerCase) &&
      lowered.length <= d.content.matched.length
    }

    wholeWordHits == 1
  }
213

214
  /** Assembles a date from the relaxed day / month / year rule matches found in `text`.
    *
    * A date is only produced when `relaxedFactory` returns more than one match. Each component
    * that cannot be read from its match falls back to the configured default
    * (`defaultDayWhenMissing`, `defaultMonthWhenMissing`, `defaultYearWhenMissing`). The reported
    * span runs from the smallest match start to the largest match end.
    *
    * @param text
    *   text to search
    * @return
    *   the assembled date, or None when fewer than two relaxed matches are found
    */
  private def extractRelaxedDate(text: String): Option[MatchedDateTime] = {
    val hits: Seq[RuleFactory.RuleMatch] = relaxedFactory.findMatch(text)

    if (hits.length <= 1) None
    else {
      // Defaults are resolved up front, in the same order as they are used below.
      val fallbackDay = $(defaultDayWhenMissing)
      val fallbackMonth = defaultMonthWhenMissing
      val fallbackYear = defaultYearWhenMissing

      // Day: the digits of the "relaxed days" match, if it has any.
      val day = hits
        .find(_.identifier == "relaxed days")
        .map(hit => hit.content.matched)
        .filter(matched => matched.exists(_.isDigit))
        .map(matched => matched.filter(_.isDigit).toInt)
        .getOrElse(fallbackDay)

      // Month: position in `shortMonths` of the first three letters of a whole-word month match
      // that is longer than two characters.
      val month = hits
        .find(_.identifier == "relaxed months exclusive")
        .filter(hit => isNotMonthSubwordMatch(text, hit))
        .map(hit => hit.content.matched)
        .filter(matched => matched.length > 2)
        .map(matched => matched.toLowerCase().take(3))
        .filter(abbreviation => shortMonths.contains(abbreviation))
        .map(abbreviation => shortMonths.indexOf(abbreviation))
        .getOrElse(fallbackMonth)

      // Year: the digits of a "relaxed year" match longer than two characters; values up to 999
      // are shifted by 1900.
      val year = hits
        .find(_.identifier == "relaxed year")
        .map(hit => hit.content.matched)
        .filter(matched => matched.exists(_.isDigit) && matched.length > 2)
        .map(matched => matched.filter(_.isDigit).toInt)
        .map(parsed => if (parsed > 999) parsed else parsed + 1900)
        .getOrElse(fallbackYear)

      val calendar = new Calendar.Builder().setDate(year, month, day).build()
      val spanStart = hits.map(_.content.start).min
      val spanEnd = hits.map(_.content.end).max
      Some(MatchedDateTime(calendar, spanStart, spanEnd))
    }
  }
251

252
  /** Runs `relativeFutureFactory` on the lower-cased text, but only when `text` contains "in"
    * followed by one whitespace character and a digit, and does not contain
    * `relativePastPattern`.
    *
    * @return
    *   the first match parsed with `relativeDateFutureContentParse`, or None
    */
  private def extractRelativeDateFuture(text: String): Option[MatchedDateTime] = {
    val hasInDigits = "in\\s[0-9]".r.findFirstMatchIn(text).isDefined

    if (!hasInDigits || text.contains(relativePastPattern)) None
    else
      for (firstHit <- relativeFutureFactory.findMatchFirstOnly(text.toLowerCase()))
        yield relativeDateFutureContentParse(firstHit)
  }
260

261
  /** Runs `relativePastFactory` on the lower-cased text, but only when `text` contains
    * `relativePastPattern` and has no whitespace-delimited "in" followed by a digit.
    *
    * @return
    *   the first match parsed with `relativeDatePastContentParse`, or None
    */
  private def extractRelativeDatePast(text: String): Option[MatchedDateTime] = {
    val hasInDigits = "(.*)\\s+(in)\\s+[0-9]".r.findFirstMatchIn(text).isDefined

    if (hasInDigits || !text.contains(relativePastPattern)) None
    else
      for (firstHit <- relativePastFactory.findMatchFirstOnly(text.toLowerCase()))
        yield relativeDatePastContentParse(firstHit)
  }
270

271
  /** Runs `relativeFactory` on the lower-cased text, but only when `text` contains neither "in"
    * followed by whitespace and a digit, nor `relativePastPattern`.
    *
    * @return
    *   the first match parsed with `relativeDateContentParse`, or None
    */
  private def extractRelativeDate(text: String): Option[MatchedDateTime] = {
    val hasInDigits = "in\\s+[0-9]".r.findFirstMatchIn(text).isDefined

    if (hasInDigits || text.contains(relativePastPattern)) None
    else
      for (firstHit <- relativeFactory.findMatchFirstOnly(text.toLowerCase))
        yield relativeDateContentParse(firstHit)
  }
279

280
  /** Parses the first match of `tyFactory` in the lower-cased text with
    * `tomorrowYesterdayContentParse`.
    *
    * @return
    *   the parsed match, or None when nothing matches
    */
  private def extractTomorrowYesterday(text: String): Option[MatchedDateTime] =
    for (firstHit <- tyFactory.findMatchFirstOnly(text.toLowerCase()))
      yield tomorrowYesterdayContentParse(firstHit)
285

286
  /** Parses the first match of `relativeExactFactory` in the lower-cased text with
    * `relativeExactContentParse`.
    *
    * @return
    *   the parsed match, or None when nothing matches
    */
  private def extractRelativeExactDay(text: String): Option[MatchedDateTime] =
    for (firstHit <- relativeExactFactory.findMatchFirstOnly(text.toLowerCase()))
      yield relativeExactContentParse(firstHit)
291

292
  /** Builds a date-time from the first time expression that `timeFactory` finds in `text`.
    *
    * The calendar date (year, month, day of month) is copied from `dateTime` when it is defined,
    * otherwise from `Calendar.getInstance`. Hour, minutes and seconds are read from the first
    * three regex subgroups of the time match; a component whose group is absent or null, or whose
    * value is out of range, is set to 0.
    *
    * @param dateTime
    *   an already matched date whose calendar date should be kept, if any
    * @param text
    *   text to search for a time expression
    * @return
    *   a match spanning the time expression, or None when no time expression is found
    */
  private def setTimeIfAny(
      dateTime: Option[MatchedDateTime],
      text: String): Option[MatchedDateTime] = {
    timeFactory.findMatchFirstOnly(text).map { possibleTime =>
      val currentCalendar = dateTime.map(_.calendar).getOrElse(Calendar.getInstance)
      val times = possibleTime.content.subgroups

      // A regex group that did not take part in the match is reported as null. Wrapping it keeps
      // a missing component from failing inside `toInt`; previously only the first hour branch
      // was guarded, so a null hour group raised an exception in the fallback branch.
      def component(index: Int): Option[Int] =
        times.lift(index).flatMap(Option(_)).map(_.toInt)

      // Hour: shifted by 12 when it is below 12 and `amDefinition` matches the text; otherwise
      // accepted as-is when below 25; otherwise 0.
      // NOTE(review): the previous inline comment on the `amDefinition` check read "no explicit
      // am" (assume PM when the hour is defined, is not AM and is less than 12, e.g. "meet you at
      // 5"), yet the condition requires `amDefinition` to match - confirm against its pattern.
      val hour = component(0) match {
        case Some(h) if h < 12 && amDefinition.findFirstIn(text).isDefined => h + 12
        case Some(h) if h < 25 => h
        case _ => 0
      }

      // Minutes and seconds are valid if the regex subgroup matched and is less than 60.
      val minutes = component(1).filter(_ < 60).getOrElse(0)
      val seconds = component(2).filter(_ < 60).getOrElse(0)

      val calendarBuild = new Calendar.Builder()
        .setDate(
          currentCalendar.get(Calendar.YEAR),
          currentCalendar.get(Calendar.MONTH),
          currentCalendar.get(Calendar.DAY_OF_MONTH))
        .setTimeOfDay(hour, minutes, seconds)

      MatchedDateTime(calendarBuild.build(), possibleTime.content.start, possibleTime.content.end)
    }
  }
333

334
  /** Produces at most one DATE annotation per input annotation.
    *
    * The date is extracted from each annotation's `result`, formatted with the pattern returned
    * by `getOutputFormat`, and emitted with the input annotation's metadata. The annotation ends
    * at the match end minus one.
    *
    * @param annotations
    *   input annotations whose `result` text is searched
    * @return
    *   Any found date, empty if not
    */
  override def annotate(annotations: Seq[Annotation]): Seq[Annotation] = {
    val formatter = new SimpleDateFormat(getOutputFormat)

    for {
      annotation <- annotations
      matchedDate <- extractDate(annotation.result)
    } yield Annotation(
      outputAnnotatorType,
      matchedDate.start,
      matchedDate.end - 1,
      formatter.format(matchedDate.calendar.getTime),
      annotation.metadata)
  }
350

351
}
352

353
/** Companion object of [[DateMatcher]]; see that class for the documentation. */
object DateMatcher extends DefaultParamsReadable[DateMatcher]
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc