• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

msakai / haskell-sequitur / 58

28 Jul 2024 01:09PM UTC coverage: 57.006% (-2.2%) from 59.236%
58

push

github

msakai
improve haddock

179 of 314 relevant lines covered (57.01%)

0.57 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

57.01
/src/Language/Grammar/Sequitur.hs
1
{-# OPTIONS_GHC -Wall #-}
2
{-# LANGUAGE ConstraintKinds #-}
3
{-# LANGUAGE CPP #-}
4
{-# LANGUAGE DeriveGeneric #-}
5
{-# LANGUAGE FlexibleInstances #-}
6
{-# LANGUAGE LambdaCase #-}
7
{-# LANGUAGE ScopedTypeVariables #-}
8
{-# LANGUAGE TypeFamilies #-}
9
-----------------------------------------------------------------------------
10
-- |
11
-- Module      :  Language.Grammar.Sequitur
12
-- Copyright   :  (c) Masahiro Sakai 2024
13
-- License     :  BSD-style
14
--
15
-- Maintainer  :  masahiro.sakai@gmail.com
16
-- Stability   :  provisional
17
-- Portability :  non-portable
18
--
19
-- /SEQUITUR/ is a linear-time, online algorithm for producing a context-free
20
-- grammar from an input sequence. The resulting grammar is a compact representation
21
-- of the original sequence and can be used for data compression.
22
--
23
-- Example:
24
--
25
--   - Input string: @abcabcabcabcabc@
26
--
27
--   - Resulting grammar
28
--
29
--       - @S@ → @AAB@
30
--
31
--       - @A@ → @BB@
32
--
33
--       - @B@ → @abc@
34
--
35
-- /SEQUITUR/ consumes input symbols one-by-one and appends each symbol at the end of the
36
-- grammar's start production (@S@ in the above example), then substitutes repeating
37
-- patterns in the given sequence with new rules. /SEQUITUR/ maintains two invariants:
38
--
39
--   [/Digram Uniqueness/]: /SEQUITUR/ ensures that no digram
40
--   (a.k.a. bigram) occurs more than once in the grammar. If a digram
41
--   (e.g. @ab@) occurs twice, SEQUITUR introduces a fresh non-terminal
42
--   symbol (e.g. @M@) and a rule (e.g. @M@ → @ab@) and replaces the
43
--   original occurrences with the newly introduced non-terminal symbol.
44
--   One exception is the cases where two occurrences overlap.
45
--
46
--   [/Rule Utility/]: If a non-terminal symbol occurs only once,
47
--   /SEQUITUR/ removes the associated rule and substitutes the occurrence
48
--   with the right-hand side of the rule.
49
--
50
-- References:
51
--
52
--   - [Sequitur algorithm - Wikipedia](https://en.m.wikipedia.org/wiki/Sequitur_algorithm)
53
--
54
--   - [sequitur.info](http://www.sequitur.info/)
55
--
56
--   - Nevill-Manning, C.G. and Witten, I.H. (1997) "[Identifying
57
--     Hierarchical Structure in Sequences: A linear-time
58
--     algorithm](https://doi.org/10.1613/jair.374)," Journal of
59
--     Artificial Intelligence Research, 7, 67-82.
60
--
61
-----------------------------------------------------------------------------
62
module Language.Grammar.Sequitur
63
  (
64
  -- * Basic type definition
65
    Grammar (..)
66
  , Symbol (..)
67
  , NonTerminalSymbol
68
  , IsTerminalSymbol
69

70
  -- * Construction
71

72
  -- ** High-level API
73
  --
74
  -- | Use 'encode' if the entire sequence is given at once and you
75
  -- only need to create a single grammar from it.
76
  , encode
77

78
  -- ** Low-level monadic API
79
  --
80
  -- | Use these low-level monadic API if the input sequence is given
81
  -- incrementally, or you want to repeatedly construct grammars with
82
  -- newly added inputs.
83
  , Builder
84
  , newBuilder
85
  , add
86
  , build
87

88
  -- * Conversion to other types
89
  , decode
90
  , decodeToSeq
91
  , decodeToMonoid
92
  ) where
93

94
import Control.Exception
95
import Control.Monad
96
import Control.Monad.Primitive
97
import Control.Monad.ST
98
import Data.Either
99
import qualified Data.Foldable as F
100
import Data.Function (on)
101
import Data.Hashable
102
import Data.IntMap.Strict (IntMap)
103
import qualified Data.IntMap.Strict as IntMap
104
import Data.Primitive.MutVar
105
#if MIN_VERSION_primitive(0,8,0)
106
import Data.Primitive.PrimVar
107
#endif
108
import qualified Data.HashTable.Class as H (toList)
109
import qualified Data.HashTable.ST.Cuckoo as H
110
import Data.Maybe
111
import Data.Semigroup (Endo (..))
112
import Data.Sequence (Seq)
113
import qualified Data.Sequence as Seq
114
import Data.String (IsString (..))
115
import GHC.Generics (Generic)
116
#if MIN_VERSION_base(4,17,0)
117
import qualified GHC.IsList as IsList (IsList (..))
118
#else
119
import qualified GHC.Exts as IsList (IsList (..))
120
#endif
121
import GHC.Stack
122

123
#if !MIN_VERSION_primitive(0,8,0)
124

125
type PrimVar s a = MutVar s a
126

127
{-# INLINE newPrimVar #-}
128
newPrimVar :: PrimMonad m => a -> m (PrimVar (PrimState m) a)
129
newPrimVar = newMutVar
130

131
{-# INLINE readPrimVar #-}
132
readPrimVar :: PrimMonad m => PrimVar (PrimState m) a -> m a
133
readPrimVar = readMutVar
134

135
{-# INLINE writePrimVar #-}
136
writePrimVar :: PrimMonad m => PrimVar (PrimState m) a -> a -> m ()
137
writePrimVar = writeMutVar
138

139
{-# INLINE modifyPrimVar #-}
140
modifyPrimVar :: PrimMonad m => PrimVar (PrimState m) a -> (a -> a) -> m ()
141
modifyPrimVar = modifyMutVar'
142

143
#endif
144

145
-- -------------------------------------------------------------------
146

147
sanityCheck :: Bool
148
sanityCheck = False
1✔
149

150
-- -------------------------------------------------------------------
151

152
-- | Non-terminal symbols are represented by 'Int'.
153
--
154
-- The number @0@ is reserved for the start symbol of the grammar.
155
type NonTerminalSymbol = Int
156

157
-- | Internal alias of 'NonTerminalSymbol'
158
type RuleId = NonTerminalSymbol
159

160
-- | A symbol is either a terminal symbol (from a user-specified type)
161
-- or a non-terminal symbol.
162
data Symbol a
163
  = NonTerminal !NonTerminalSymbol
164
  | Terminal a
165
  deriving (Eq, Ord, Show, Generic)
×
166

167
instance (Hashable a) => Hashable (Symbol a)
×
168

169
-- | @since 0.2.0.0
170
instance Functor Symbol where
×
171
  fmap _ (NonTerminal rid) = NonTerminal rid
×
172
  fmap f (Terminal a) = Terminal (f a)
×
173

174
type Digram a = (Symbol a, Symbol a)
175

176
-- | Since a grammar generated by /SEQUITUR/ has exactly one rule for
177
-- each non-terminal symbol, a grammar is represented as a mapping
178
-- from non-terminal symbols (left-hand sides of the rules) to
179
-- sequences of symbols (right-hand side of the rules).
180
--
181
-- For example, a grammar
182
--
183
--   - @0@ → @1 1 2@
184
--
185
--   - @1@ → @2 2@
186
--
187
--   - @2@ → @a b c@
188
--
189
-- is represented as
190
--
191
-- @
192
-- Grammar (fromList
193
--   [ (0, [NonTerminal 1, NonTerminal 1, NonTerminal 2])
194
--   , (1, [NonTerminal 2, NonTerminal 2])
195
--   , (2, [Terminal \'a\', Terminal \'b\', Terminal \'c\'])
196
--   ])
197
-- @
198
--
199
-- Since a grammar generated by /SEQUITUR/ produces exactly one
200
-- sequence, we can identify the grammar with the produced
201
-- sequence. Therefore, 'Grammar' type is an instance of 'Foldable',
202
-- 'IsList.IsList', and 'IsString'.
203
newtype Grammar a = Grammar {unGrammar :: IntMap [Symbol a]}
×
204
  deriving (Eq, Show)
×
205

206
-- | @since 0.2.0.0
207
instance Functor Grammar where
×
208
  fmap f (Grammar m) = Grammar (fmap (map (fmap f)) m)
×
209

210
-- | @since 0.2.0.0
211
instance Foldable Grammar where
×
212
  foldMap = decodeToMonoid
×
213

214
-- | @since 0.2.0.0
215
instance IsTerminalSymbol a => IsList.IsList (Grammar a) where
×
216
  type Item (Grammar a) = a
217
  fromList = encode
×
218
  toList = decode
×
219

220
-- | @since 0.2.0.0
221
instance  IsString (Grammar Char) where
222
  fromString = encode
×
223

224
-- | @IsTerminalSymbol@ is a class synonym for absorbing the difference
225
-- between @hashable@ @<1.4.0.0@ and @>=1.4.0.0@.
226
--
227
-- @hashable-1.4.0.0@ makes 'Eq' be a superclass of 'Hashable'.
228
-- Therefore we define
229
--
230
-- @
231
-- type IsTerminalSymbol a = Hashable a
232
-- @
233
--
234
-- on @hashable >=1.4.0.0@, while we define
235
--
236
-- @
237
-- type IsTerminalSymbol a = (Eq a, Hashable a)
238
-- @
239
--
240
-- on @hashable <1.4.0.0@.
241
--
242
-- Also, developers can temporarily add other classes (e.g. 'Show') to
243
-- ease debugging.
244
#if MIN_VERSION_hashable(1,4,0)
245
type IsTerminalSymbol a = Hashable a
246
#else
247
type IsTerminalSymbol a = (Eq a, Hashable a)
248
#endif
249

250
-- -------------------------------------------------------------------
251

252
data Node s a
253
  = Node
254
  { nodePrev :: {-# UNPACK #-} !(MutVar s (Node s a))
1✔
255
  , nodeNext :: {-# UNPACK #-} !(MutVar s (Node s a))
1✔
256
  , nodeData :: Either RuleId (Symbol a)
1✔
257
  } deriving (Generic)
×
258

259
instance Eq (Node s a) where
1✔
260
  (==) = (==) `on` nodePrev
1✔
261

262
isGuardNode :: Node s a -> Bool
263
isGuardNode s = isLeft $ nodeData s
1✔
264

265
nodeSymbolMaybe :: Node s a -> Maybe (Symbol a)
266
nodeSymbolMaybe node =
1✔
267
  case nodeData node of
1✔
268
    Left _ -> Nothing
1✔
269
    Right sym -> Just sym
1✔
270

271
nodeSymbol :: HasCallStack => Node s a -> Symbol a
272
nodeSymbol node =
1✔
273
  case nodeSymbolMaybe node of
1✔
274
    Nothing -> error "nodeSymbol is called for guard node"
×
275
    Just sym -> sym
1✔
276

277
ruleOfGuardNode :: Node s a -> Maybe RuleId
278
ruleOfGuardNode node =
1✔
279
  case nodeData node of
1✔
280
    Left rule -> Just rule
×
281
    Right _ -> Nothing
1✔
282

283
getPrev :: Node s a -> ST s (Node s a)
284
getPrev node = readMutVar (nodePrev node)
1✔
285

286
getNext :: Node s a -> ST s (Node s a)
287
getNext node = readMutVar (nodeNext node)
1✔
288

289
setPrev :: Node s a -> Node s a -> ST s ()
290
setPrev node prev = writeMutVar (nodePrev node) prev
1✔
291

292
setNext :: Node s a -> Node s a -> ST s ()
293
setNext node next = writeMutVar (nodeNext node) next
1✔
294

295
mkGuardNode :: RuleId -> ST s (Node s a)
296
mkGuardNode rid = do
1✔
297
  prevRef <- newMutVar undefined
×
298
  nextRef <- newMutVar undefined
×
299
  let node = Node prevRef nextRef (Left rid)
×
300
  writeMutVar prevRef node
1✔
301
  writeMutVar nextRef node
1✔
302
  return node
1✔
303

304
-- -------------------------------------------------------------------
305

306
data Rule s a
307
  = Rule
308
  { ruleId :: {-# UNPACK #-} !RuleId
1✔
309
  , ruleGuardNode :: !(Node s a)
1✔
310
  , ruleRefCounter :: {-# UNPACK #-} !(PrimVar s Int)
1✔
311
  }
312

313
instance Eq (Rule s a) where
×
314
  (==) = (==) `on` ruleId
×
315

316
instance Hashable (Rule s a) where
×
317
  hashWithSalt salt rule = hashWithSalt salt (ruleId rule)
×
318

319
getFirstNodeOfRule :: Rule s a -> ST s (Node s a)
320
getFirstNodeOfRule rule = getNext (ruleGuardNode rule)
1✔
321

322
getLastNodeOfRule :: Rule s a -> ST s (Node s a)
323
getLastNodeOfRule rule = getPrev (ruleGuardNode rule)
1✔
324

325
mkRule :: RuleId -> ST s (Rule s a)
326
mkRule rid = do
1✔
327
  g <- mkGuardNode rid
×
328
  refCounter <- newPrimVar 0
1✔
329
  return $ Rule rid g refCounter
1✔
330

331
newRule :: Builder s a -> ST s (Rule s a)
332
newRule s = do
1✔
333
  rid <- readPrimVar (sRuleIdCounter s)
1✔
334
  modifyPrimVar (sRuleIdCounter s) (+ 1)
1✔
335
  rule <- mkRule rid
1✔
336
  H.insert (sRules s) rid rule
1✔
337
  return rule
1✔
338

339
-- -------------------------------------------------------------------
340

341
-- | 'Builder' denotes an internal state of the /SEQUITUR/ algorithm.
342
data Builder s a
343
  = Builder
344
  { sRoot :: !(Rule s a)
1✔
345
  , sDigrams :: !(H.HashTable s (Digram a) (Node s a))
1✔
346
  , sRules :: !(H.HashTable s RuleId (Rule s a))
1✔
347
  , sRuleIdCounter :: {-# UNPACK #-} !(PrimVar s Int)
1✔
348
  , sDummyNode :: !(Node s a)
1✔
349
  }
350

351
-- | Create a new 'Builder'.
352
newBuilder :: PrimMonad m => m (Builder (PrimState m) a)
353
newBuilder = stToPrim $ do
1✔
354
  root <- mkRule 0
1✔
355
  digrams <- H.new
1✔
356
  rules <- H.new
1✔
357
  counter <- newPrimVar 1
1✔
358

359
  prevRef <- newMutVar undefined
×
360
  nextRef <- newMutVar undefined
×
361
  let dummyNode = Node prevRef nextRef (Left 0)
×
362
  writeMutVar prevRef dummyNode
×
363
  writeMutVar nextRef dummyNode
×
364

365
  return $ Builder root digrams rules counter dummyNode
1✔
366

367
getRule :: HasCallStack => Builder s a -> RuleId -> ST s (Rule s a)
368
getRule s rid = do
1✔
369
  ret <- H.lookup (sRules s) rid
1✔
370
  case ret of
1✔
371
    Nothing -> error "getRule: invalid rule id"
×
372
    Just rule -> return rule
1✔
373

374
-- | Add a new symbol to the end of grammar's start production,
375
-- and perform normalization to keep the invariants of the /SEQUITUR/ algorithm.
376
add :: (PrimMonad m, IsTerminalSymbol a) => Builder (PrimState m) a -> a -> m ()
377
add s a = stToPrim $ do
1✔
378
  lastNode <- getLastNodeOfRule (sRoot s)
1✔
379
  _ <- insertAfter s lastNode (Terminal a)
1✔
380
  _ <- check s lastNode
1✔
381
  when sanityCheck $ do
×
382
    checkDigramTable s
×
383
    checkRefCount s
×
384

385
-- | Retrieve a grammar (as a persistent data structure) from the 'Builder'\'s internal state.
386
build :: (PrimMonad m) => Builder (PrimState m) a -> m (Grammar a)
387
build s = stToPrim $ do
1✔
388
  root <- freezeGuardNode $ ruleGuardNode (sRoot s)
1✔
389
  xs <- H.toList (sRules s)
1✔
390
  m <- forM xs $ \(rid, rule) -> do
1✔
391
    ys <- freezeGuardNode (ruleGuardNode rule)
1✔
392
    return (rid, ys)
1✔
393
  return $ Grammar $ IntMap.insert 0 root $ IntMap.fromList m
1✔
394

395
freezeGuardNode :: forall a s. Node s a -> ST s [Symbol a]
396
freezeGuardNode g = f [] =<< getPrev g
1✔
397
  where
398
    f :: [Symbol a] -> Node s a -> ST s [Symbol a]
399
    f ret node = do
1✔
400
      if isGuardNode node then
1✔
401
        return ret
1✔
402
      else do
1✔
403
        node' <- getPrev node
1✔
404
        f (nodeSymbol node : ret) node'
1✔
405

406
-- -------------------------------------------------------------------
407

408
link :: IsTerminalSymbol a => Builder s a -> Node s a -> Node s a -> ST s ()
409
link s left right = do
1✔
410
  leftPrev <- getPrev left
1✔
411
  leftNext <- getNext left
1✔
412
  rightPrev <- getPrev right
1✔
413
  rightNext <- getNext right
1✔
414

415
  unless (isGuardNode leftNext) $ do
1✔
416
    deleteDigram s left
1✔
417

418
    -- これが不要なのは何故?
419
    -- unless (isGuardNode rightPrev) $ deleteDigram s rightPrev
420

421
    case (nodeSymbolMaybe rightPrev, nodeSymbolMaybe right, nodeSymbolMaybe rightNext) of
1✔
422
      (Just sym1, Just sym2, Just sym3) | sym1 == sym2 && sym2 == sym3 ->
1✔
423
        H.insert (sDigrams s) (sym2, sym3) right
1✔
424
      _ -> return ()
×
425

426
    case (nodeSymbolMaybe leftPrev, nodeSymbolMaybe left, nodeSymbolMaybe leftNext) of
1✔
427
      (Just sym1, Just sym2, Just sym3) | sym1 == sym2 && sym2 == sym3 ->
1✔
428
        H.insert (sDigrams s) (sym1, sym2) leftPrev
1✔
429
      _ -> return ()
×
430

431
  setNext left right
1✔
432
  setPrev right left
1✔
433

434
insertAfter :: (IsTerminalSymbol a, HasCallStack) => Builder s a -> Node s a -> Symbol a -> ST s (Node s a)
435
insertAfter s node sym = do
1✔
436
  prevRef <- newMutVar (sDummyNode s)
1✔
437
  nextRef <- newMutVar (sDummyNode s)
1✔
438
  let newNode = Node prevRef nextRef (Right sym)
1✔
439

440
  next <- getNext node
1✔
441
  link s newNode next
×
442
  link s node newNode
1✔
443

444
  case sym of
1✔
445
    Terminal _ -> return ()
×
446
    NonTerminal rid -> do
1✔
447
      rule <- getRule s rid
1✔
448
      modifyPrimVar (ruleRefCounter rule) (+ 1)
1✔
449

450
  return newNode
1✔
451

452
deleteDigram :: IsTerminalSymbol a => Builder s a -> Node s a -> ST s ()
453
deleteDigram s n
1✔
454
  | isGuardNode n = return ()
×
455
  | otherwise = do
×
456
      next <- getNext n
1✔
457
      unless (isGuardNode next) $ do
1✔
458
        _ <- H.mutate (sDigrams s) (nodeSymbol n, nodeSymbol next) $ \case
1✔
459
          Just n' | n /= n' -> (Just n', ())
×
460
          _ -> (Nothing, ())
×
461
        return ()
×
462

463
check :: IsTerminalSymbol a => Builder s a -> Node s a -> ST s Bool
464
check s node
1✔
465
  | isGuardNode node = return False
1✔
466
  | otherwise = do
×
467
      next <- getNext node
1✔
468
      if isGuardNode next then
1✔
469
        return False
×
470
      else do
1✔
471
        ret <- H.mutate (sDigrams s) (nodeSymbol node, nodeSymbol next) $ \case
1✔
472
          Nothing -> (Just node, Nothing)
1✔
473
          Just node' -> (Just node', Just node')
1✔
474
        case ret of
1✔
475
          Nothing -> return False
1✔
476
          Just node' -> do
1✔
477
             next' <- getNext node'
1✔
478
             if node == next' then
1✔
479
               return False
×
480
             else do
1✔
481
               match s node node'
1✔
482
               return True
×
483

484
match :: (IsTerminalSymbol a, HasCallStack) => Builder s a -> Node s a -> Node s a -> ST s ()
485
match s ss m = do
1✔
486
  mPrev <- getPrev m
1✔
487
  mNext <- getNext m
1✔
488
  mNextNext <- getNext mNext
1✔
489

490
  rule <- case ruleOfGuardNode mPrev of
1✔
491
    Just rid | isGuardNode mNextNext -> do
×
492
      rule <- getRule s rid
×
493
      substitute s ss rule
×
494
      return rule
×
495
    _ -> do
1✔
496
      rule <- newRule  s
1✔
497
      ss2 <- getNext ss
1✔
498
      lastNode <- getLastNodeOfRule rule
1✔
499
      node1 <- insertAfter s lastNode (nodeSymbol ss)
1✔
500
      node2 <- insertAfter s node1 (nodeSymbol ss2)
1✔
501
      substitute s m rule
1✔
502
      substitute s ss rule
1✔
503
      H.insert (sDigrams s) (nodeSymbol node1, nodeSymbol node2) node1
1✔
504
      return rule
1✔
505

506
  firstNode <- getFirstNodeOfRule rule
1✔
507
  case nodeSymbol firstNode of
1✔
508
    Terminal _ -> return ()
×
509
    NonTerminal rid -> do
1✔
510
      rule2 <- getRule s rid
1✔
511
      freq <- readPrimVar (ruleRefCounter rule2)
1✔
512
      when (freq == 1) $ expand s firstNode rule2
1✔
513

514
  when sanityCheck $ do
×
515
    let loop node
×
516
          | isGuardNode node = return ()
×
517
          | otherwise = do
×
518
              case nodeSymbol node of
×
519
                Terminal _ -> return ()
×
520
                NonTerminal rid -> do
×
521
                  rule2 <- getRule s rid
×
522
                  freq <- readPrimVar (ruleRefCounter rule2)
×
523
                  when (freq <= 1) $ error "Sequitur.match: non-first node with refCount <= 1"
×
524
    loop =<< getNext firstNode
×
525

526
deleteNode :: (IsTerminalSymbol a, HasCallStack) => Builder s a -> Node s a -> ST s ()
527
deleteNode s node = do
1✔
528
  assert (not (isGuardNode node)) $ return ()
×
529
  prev <- getPrev node
1✔
530
  next <- getNext node
1✔
531
  link s prev next
1✔
532
  deleteDigram s node
1✔
533
  case nodeSymbol node of
1✔
534
    Terminal _ -> return ()
×
535
    NonTerminal rid -> do
1✔
536
      rule <- getRule s rid
1✔
537
      modifyPrimVar (ruleRefCounter rule) (subtract 1)
1✔
538

539
substitute :: (IsTerminalSymbol a, HasCallStack) => Builder s a -> Node s a -> Rule s a -> ST s ()
540
substitute s node rule = do
1✔
541
  prev <- getPrev node
1✔
542
  deleteNode s =<< getNext prev
1✔
543
  deleteNode s =<< getNext prev
1✔
544
  _ <- insertAfter s prev (NonTerminal (ruleId rule))
1✔
545
  ret <- check s prev
1✔
546
  unless ret $ do
1✔
547
    next <- getNext prev
1✔
548
    _ <- check s next
1✔
549
    return ()
×
550

551
expand :: IsTerminalSymbol a => Builder s a -> Node s a -> Rule s a -> ST s ()
552
expand s node rule = do
1✔
553
  left <- getPrev node
1✔
554
  right <- getNext node
1✔
555
  deleteNode s node
1✔
556

557
  f <- getFirstNodeOfRule rule
1✔
558
  l <- getLastNodeOfRule rule
1✔
559
  link s left f
×
560
  link s l right
×
561

562
  n <- getNext l
1✔
563
  let key = (nodeSymbol l, nodeSymbol n)
1✔
564
  when sanityCheck $ do
×
565
    ret <- H.lookup (sDigrams s) key
×
566
    when (isJust ret) $ error "Sequitur.expand: the digram is already in the table"
×
567
  H.insert (sDigrams s) key l
1✔
568
  H.delete (sRules s) (ruleId rule)
1✔
569

570
-- -------------------------------------------------------------------
571

572
-- | Construct a grammar from a given sequence of symbols using /SEQUITUR/.
573
--
574
-- 'IsList.fromList' and 'fromString' can also be used.
575
encode :: IsTerminalSymbol a => [a] -> Grammar a
576
encode xs = runST $ do
1✔
577
  e <- newBuilder
1✔
578
  mapM_ (add e) xs
1✔
579
  build e
1✔
580

581
-- | Reconstruct an input sequence from a grammar.
582
--
583
-- It is lazy in the sense that you can consume from the beginning
584
-- before constructing the entire sequence. This function is suitable
585
-- if you just need to access the resulting sequence only once and
586
-- from beginning to end. If you need to use the resulting sequence in
587
-- a more complex way, 'decodeToSeq' would be more suitable.
588
--
589
-- This is a left-inverse of 'encode', and is equivalent to 'F.toList'
590
-- of 'Foldable' class and 'IsList.toList' of 'IsList.IsList'.
591
decode :: HasCallStack => Grammar a -> [a]
592
decode g = appEndo (decodeToMonoid (\a -> Endo (a :)) g) []
1✔
593

594
-- | A variant of 'decode' in which the result type is 'Seq'.
595
decodeToSeq :: HasCallStack => Grammar a -> Seq a
596
decodeToSeq = decodeToMonoid Seq.singleton
1✔
597

598
-- | 'Monoid'-based folding over the decoded sequence.
599
--
600
-- This function is equivalent to the following definition but is more
601
-- efficient due to the utilization of sharing.
602
--
603
-- @
604
-- decodeToMonoid f = 'mconcat' . 'map' f . 'decode'
605
-- @
606
--
607
-- This is equivalent to 'F.foldMap' of 'Foldable' class.
608
decodeToMonoid :: (Monoid m, HasCallStack) => (a -> m) -> Grammar a -> m
609
decodeToMonoid e (Grammar m) = get 0 table
1✔
610
  where
611
    -- depends on the fact that fmap of IntMap is lazy
612
    table = fmap (mconcat . map f) m
1✔
613

614
    f (Terminal a) = e a
1✔
615
    f (NonTerminal r) = get r table
1✔
616

617
    get r tbl =
1✔
618
      case IntMap.lookup r tbl of
1✔
619
        Nothing -> error ("rule " ++ show r ++ " is missing")
×
620
        Just x -> x
1✔
621

622
-- -------------------------------------------------------------------
623

624
checkDigramTable :: IsTerminalSymbol a => Builder s a -> ST s ()
625
checkDigramTable s = do
×
626
  checkDigramTable1 s
×
627
  checkDigramTable2 s
×
628

629
checkDigramTable1 :: IsTerminalSymbol a => Builder s a -> ST s ()
630
checkDigramTable1 s = do
×
631
  ds <- H.toList (sDigrams s)
×
632
  forM_ ds $ \((sym1, sym2), node1) -> do
×
633
    node2 <- getNext node1
×
634
    unless ((nodeData node1, nodeData node2) == (Right sym1, Right sym2)) $ do
×
635
      error "checkDigramTable1: an entry points to a different digram"
×
636
    let f n =
×
637
          case nodeData n of
×
638
            Right _ -> f =<< getPrev n
×
639
            Left rid -> do
×
640
              rule <- if rid == 0 then
×
641
                        return (sRoot s)
×
642
                      else do
×
643
                        ret <- H.lookup (sRules s) rid
×
644
                        case ret of
×
645
                          Nothing -> error "checkDigramTable1: an entry points to a digram in an invalid rule"
×
646
                          Just rule -> return rule
×
647
              unless (ruleGuardNode rule == n) $ do
×
648
                error "checkDigramTable1: an entry points to a digram in a inconsistent rule"
×
649
    f node1
×
650

651
checkDigramTable2 :: IsTerminalSymbol a => Builder s a -> ST s ()
652
checkDigramTable2 s = do
×
653
  rules <- H.toList (sRules s)
×
654
  forM_ (sRoot s : map snd rules) $ \rule -> do
×
655
    let f node1 = do
×
656
          node2 <- getNext node1
×
657
          unless (isGuardNode node2) $ do
×
658
            let sym1 = nodeSymbol node1
×
659
                sym2 = nodeSymbol node2
×
660
                normalCase = do
×
661
                  ret <- H.lookup (sDigrams s) (sym1, sym2)
×
662
                  case ret of
×
663
                    Nothing -> error "checkDigramTable2: digram does not in the digram table"
×
664
                    Just node | node1 /= node -> error "checkDigramTable2: digram entry points to a different node"
×
665
                    Just _ -> return ()
×
666
                  f node2
×
667
            if sym1 == sym2 then do
×
668
              node3 <- getNext node2
×
669
              case nodeData node3 of
×
670
                Right sym3 | sym1 == sym3 -> do
×
671
                  ret <- H.lookup (sDigrams s) (sym1, sym2)
×
672
                  case ret of
×
673
                    Nothing -> error "checkDigramTable2: digram does not in the digram table"
×
674
                    Just node | node1 /= node && node2 /= node -> error "checkDigramTable2: digram entry points to a different node"
×
675
                    Just _ -> return ()
×
676
                  f node3
×
677
                _ -> normalCase
×
678
            else do
×
679
              normalCase
×
680
    f =<< getFirstNodeOfRule rule
×
681

682
checkRefCount :: forall s a. Builder s a -> ST s ()
683
checkRefCount s = do
×
684
  Grammar m <- build s
×
685
  let occurences = IntMap.fromListWith (+) [(rid, 1) | body <- IntMap.elems m, NonTerminal rid <- body]
×
686
      f :: (RuleId, Rule s a) -> ST s ()
687
      f (_r, rule) = do
×
688
        actual <- readPrimVar (ruleRefCounter rule)
×
689
        let expected = IntMap.findWithDefault 0 (ruleId rule) occurences
×
690
        unless (actual == expected) $
×
691
          error ("rule " ++ show (ruleId rule) ++ " occurs " ++ show expected ++ " times,"
×
692
                 ++ " but its reference counter is " ++ show actual)
×
693
  H.mapM_ f (sRules s)
×
694

695
-- -------------------------------------------------------------------
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc