c1af5ae6-6501-463e-8e2b-3f62f7a4d663

Committed 07 Nov 2025 07:09PM UTC coverage: 55.158% (+0.04%) from 55.118%

Build # c1af5ae6-6501-463e-8e2b-3f62f7a4d663

Build Type

Pull #1607

circleci

Committed by

akarki2005

Commit Message

Merge branch 'master' of https://github.com/Courseography/courseography into rename-post-table-to-program

Pull Request Pull Request #1607: Rename post table to program

Run Details

487 of 958 branches covered (50.84%)

Branch coverage included in aggregate %.

16 of 32 new or added lines in 6 files covered. (50.0%)

16 existing lines in 1 file now uncovered.

2224 of 3957 relevant lines covered (56.2%)

159.45 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

38.24

/app/WebParsing/PostParser.hs

module WebParsing.PostParser
    ( addPostToDatabase
    , postInfoParser
    , pruneHtml
    , getPostType
    ) where

import Control.Monad.Trans (liftIO)
import Data.Either (fromRight)
import Data.Functor (void)
import Data.List (find)
import Data.List.Split (keepDelimsL, split, splitWhen, whenElt)
import Data.Text (strip)
import qualified Data.Text as T
import Data.Time.Clock (getCurrentTime)
import Database.DataType (ProgramType (..))
import Database.Persist (insertUnique)
import Database.Persist.Sqlite (SqlPersistM, insert_)
import Database.Tables
import Text.HTML.TagSoup
import Text.HTML.TagSoup.Match
import qualified Text.Parsec as P
import Text.Parsec.Text (Parser)
import WebParsing.ParsecCombinators (parseUntil)
import WebParsing.ReqParser (parseReqs)


addPostToDatabase :: [Tag T.Text] -> SqlPersistM ()
addPostToDatabase programElements = do
    let fullPostName = maybe "" (strip . fromTagText) $ find isTagText programElements
        postDescHtml = partitions isDescriptionSection programElements
        descriptionText = case postDescHtml of
            [] -> T.empty
            (x:_) -> renderTags x
        postReqHtml = sections isRequirementSection programElements
        requirementLines = if null postReqHtml then [] else pruneHtml $ last postReqHtml
        requirements = concatMap parseRequirement $ reqHtmlToLines requirementLines
    liftIO $ print fullPostName

    case P.parse postInfoParser "POSt information" fullPostName of
        Left _ -> return ()
        Right (department, code) -> do
            currTime <- liftIO getCurrentTime
            programExists <- insertUnique Program {
                programName = getPostType code department,
                programDepartment = department,
                programCode = code,
                programDescription = descriptionText,
                programRequirements = renderTags requirementLines,
                programCreated = currTime,
                programModified = currTime
                }
            case programExists of
                Just key ->
                    mapM_ (insert_ . ProgramCategory key) requirements
                Nothing -> return ()
    where
        isDescriptionSection tag = tagOpenAttrNameLit "div" "class" (T.isInfixOf "views-field-body") tag || isRequirementSection tag
        isRequirementSection tag = tagOpenAttrNameLit "div" "class" (T.isInfixOf "views-field-field-enrolment-requirements") tag || tagOpenAttrNameLit "div" "class" (T.isInfixOf "views-field-field-completion-requirements") tag


-- | Parse a Post value from its title.
-- Titles are usually of the form "Actuarial Science Major (Science Program)".
postInfoParser :: Parser (T.Text, T.Text)
postInfoParser = do
    deptName <- P.manyTill P.anyChar $ P.choice $ map (P.try . P.lookAhead) [
        void postCodeParser,
        P.eof
        ]
    code <- postCodeParser P.<|> return T.empty

    return (T.pack deptName, code)

-- | Extracts the post type (eg. major) from a post code if it is non-empty,
-- | or from a dept name otherwise
getPostType :: T.Text -> T.Text -> ProgramType
getPostType "" deptName = getPostTypeFromName deptName
getPostType code _ = getPostTypeFromCode code

-- | Extracts the post type (eg. major) from a post name (eg. "Biology Specialist")
getPostTypeFromName :: T.Text -> ProgramType
getPostTypeFromName deptName
    | T.isInfixOf "Specialist" deptName = Specialist
    | T.isInfixOf "Major" deptName = Major
    | T.isInfixOf "Minor" deptName = Minor
    | T.isInfixOf "Focus" deptName = Focus
    | T.isInfixOf "Certificate" deptName = Certificate
    | otherwise = Other

-- | Extracts the program type (eg. major) from a program code (eg. ASMAJ1689)
getPostTypeFromCode :: T.Text -> ProgramType
getPostTypeFromCode = abbrevToPost . T.take 3 . T.drop 2

-- | Maps the post type abbreviations to their corresponding ProgramType
abbrevToPost :: T.Text -> ProgramType
abbrevToPost "SPE" = Specialist
abbrevToPost "MAJ" = Major
abbrevToPost "MIN" = Minor
abbrevToPost "FOC" = Focus
abbrevToPost "CER" = Certificate
abbrevToPost _ = Other

-- | Parser for a post code (eg. ASFOC1689A)
postCodeParser :: Parser T.Text
postCodeParser = do
    _ <- P.many1 P.space >> P.char '-' >> P.many1 P.space
    code <- P.count 5 P.letter
    num <- P.count 4 P.digit
    variant <- P.many P.letter
    return $ T.pack $ code ++ num ++ variant

-- | Prunes all the attributes (eg. class, href) in the html except for the style.
-- | Removes all <a></a> tags
pruneHtml :: [Tag T.Text] -> [Tag T.Text]
pruneHtml [] = []
pruneHtml ((TagOpen "a" _):xs) = pruneHtml xs
pruneHtml ((TagClose "a"):xs) = pruneHtml xs
pruneHtml ((TagOpen tag attrs):xs) = (TagOpen tag [style | style@("style", _) <- attrs]) : pruneHtml xs
pruneHtml (x:xs) = x : pruneHtml xs

-- | Split requirements HTML into individual lines.
reqHtmlToLines :: [Tag T.Text] -> [[T.Text]]
reqHtmlToLines tags =
    let sects = split (keepDelimsL $ whenElt isSectionSplit) tags
        sectionsNoNotes = filter (not . isNoteSection) sects
        paragraphs = concatMap (splitWhen (isTagOpenName "p")) sectionsNoNotes
        lines' = map (map (T.strip . convertLine) . splitLines) paragraphs
    in
        lines'

    where
        isSectionSplit :: Tag T.Text -> Bool
        isSectionSplit tag =
            isTagText tag &&
            any (flip T.isInfixOf $ fromTagText tag)
                ["First", "Second", "Third", "Higher", "Recommended Courses:", "Notes", "NOTES"]

        isNoteSection :: [Tag T.Text] -> Bool
        isNoteSection (sectionTitleTag:_) =
            isTagText sectionTitleTag && any (flip T.isInfixOf $ fromTagText sectionTitleTag) ["Notes", "NOTES"]
        isNoteSection [] = False

        splitLines :: [Tag T.Text] -> [[Tag T.Text]]
        splitLines = splitWhen (\tag -> isTagOpenName "br" tag || isTagOpenName "li" tag)

        convertLine :: [Tag T.Text] -> T.Text
        convertLine [] = ""
        convertLine (t:ts)
            | isTagOpenName "li" t = T.append "0." (innerText ts)
            | otherwise = innerText (t:ts)


parseRequirement :: [T.Text] -> [T.Text]
parseRequirement requirement = map parseSingleReq $ filter isReq requirement
    where
        isReq t = T.length t >= 7 &&
            not (any (`T.isInfixOf` t) ["First", "Second", "Third", "Higher"])

        parseSingleReq =
            T.pack . show .
            parseReqs .      -- Using parser for new Req type
            T.unpack .
            fromRight "" .
            P.parse getLineText "Reading a requirement line" .
            T.strip

        -- Strips the optional leading numbering (#.) from a line.
        getLineText :: Parser T.Text
        getLineText = do
            P.optional $ P.try (P.digit >> P.char '.' >> P.space)
            parseUntil P.eof

1	module WebParsing.PostParser
2	( addPostToDatabase
3	, postInfoParser
4	, pruneHtml
5	, getPostType
6	) where
7
8	import Control.Monad.Trans (liftIO)
9	import Data.Either (fromRight)
10	import Data.Functor (void)
11	import Data.List (find)
12	import Data.List.Split (keepDelimsL, split, splitWhen, whenElt)
13	import Data.Text (strip)
14	import qualified Data.Text as T
15	import Data.Time.Clock (getCurrentTime)
16	import Database.DataType (ProgramType (..))
17	import Database.Persist (insertUnique)
18	import Database.Persist.Sqlite (SqlPersistM, insert_)
19	import Database.Tables
20	import Text.HTML.TagSoup
21	import Text.HTML.TagSoup.Match
22	import qualified Text.Parsec as P
23	import Text.Parsec.Text (Parser)
24	import WebParsing.ParsecCombinators (parseUntil)
25	import WebParsing.ReqParser (parseReqs)
26
27
28	addPostToDatabase :: [Tag T.Text] -> SqlPersistM ()
29	addPostToDatabase programElements = do	×
30	let fullPostName = maybe "" (strip . fromTagText) $ find isTagText programElements	×
31	postDescHtml = partitions isDescriptionSection programElements	×
32	descriptionText = case postDescHtml of	×
33	[] -> T.empty	×
34	(x:_) -> renderTags x	×
35	postReqHtml = sections isRequirementSection programElements	×
36	requirementLines = if null postReqHtml then [] else pruneHtml $ last postReqHtml	×
37	requirements = concatMap parseRequirement $ reqHtmlToLines requirementLines	×
38	liftIO $ print fullPostName	×
39
40	case P.parse postInfoParser "POSt information" fullPostName of	×
41	Left _ -> return ()	×
42	Right (department, code) -> do	×
43	currTime <- liftIO getCurrentTime	×
NEW 44	programExists <- insertUnique Program {	×
NEW 45	programName = getPostType code department,	×
NEW 46	programDepartment = department,	×
NEW 47	programCode = code,	×
NEW 48	programDescription = descriptionText,	×
NEW 49	programRequirements = renderTags requirementLines,	×
NEW 50	programCreated = currTime,	×
NEW 51	programModified = currTime	×
52	}
NEW 53	case programExists of	×
54	Just key ->
NEW 55	mapM_ (insert_ . ProgramCategory key) requirements	×
56	Nothing -> return ()	×
57	where
58	isDescriptionSection tag = tagOpenAttrNameLit "div" "class" (T.isInfixOf "views-field-body") tag \|\| isRequirementSection tag	×
59	isRequirementSection tag = tagOpenAttrNameLit "div" "class" (T.isInfixOf "views-field-field-enrolment-requirements") tag \|\| tagOpenAttrNameLit "div" "class" (T.isInfixOf "views-field-field-completion-requirements") tag	×
60
61
62	-- \| Parse a Post value from its title.
63	-- Titles are usually of the form "Actuarial Science Major (Science Program)".
64	postInfoParser :: Parser (T.Text, T.Text)
65	postInfoParser = do	1✔
66	deptName <- P.manyTill P.anyChar $ P.choice $ map (P.try . P.lookAhead) [	1✔
67	void postCodeParser,	1✔
68	P.eof	1✔
69	]
70	code <- postCodeParser P.<\|> return T.empty	1✔
71
72	return (T.pack deptName, code)	14✔
73
74	-- \| Extracts the post type (eg. major) from a post code if it is non-empty,
75	-- \| or from a dept name otherwise
76	getPostType :: T.Text -> T.Text -> ProgramType
77	getPostType "" deptName = getPostTypeFromName deptName	5✔
78	getPostType code _ = getPostTypeFromCode code	5✔
79
80	-- \| Extracts the post type (eg. major) from a post name (eg. "Biology Specialist")
81	getPostTypeFromName :: T.Text -> ProgramType
82	getPostTypeFromName deptName
83	\| T.isInfixOf "Specialist" deptName = Specialist	5✔
84	\| T.isInfixOf "Major" deptName = Major	4✔
85	\| T.isInfixOf "Minor" deptName = Minor	3✔
86	\| T.isInfixOf "Focus" deptName = Focus	2✔
87	\| T.isInfixOf "Certificate" deptName = Certificate	1!
88	\| otherwise = Other	×
89
90	-- \| Extracts the program type (eg. major) from a program code (eg. ASMAJ1689)
91	getPostTypeFromCode :: T.Text -> ProgramType
92	getPostTypeFromCode = abbrevToPost . T.take 3 . T.drop 2	1✔
93
94	-- \| Maps the post type abbreviations to their corresponding ProgramType
95	abbrevToPost :: T.Text -> ProgramType
96	abbrevToPost "SPE" = Specialist	1✔
97	abbrevToPost "MAJ" = Major	1✔
98	abbrevToPost "MIN" = Minor	1✔
99	abbrevToPost "FOC" = Focus	1✔
100	abbrevToPost "CER" = Certificate	1✔
UNCOV 101	abbrevToPost _ = Other	×
102
103	-- \| Parser for a post code (eg. ASFOC1689A)
104	postCodeParser :: Parser T.Text
105	postCodeParser = do	1✔
106	_ <- P.many1 P.space >> P.char '-' >> P.many1 P.space	1✔
107	code <- P.count 5 P.letter	1✔
108	num <- P.count 4 P.digit	1✔
109	variant <- P.many P.letter	1✔
110	return $ T.pack $ code ++ num ++ variant	24✔
111
112	-- \| Prunes all the attributes (eg. class, href) in the html except for the style.
113	-- \| Removes all <a></a> tags
114	pruneHtml :: [Tag T.Text] -> [Tag T.Text]
115	pruneHtml [] = []	1✔
116	pruneHtml ((TagOpen "a" _):xs) = pruneHtml xs	1✔
117	pruneHtml ((TagClose "a"):xs) = pruneHtml xs	1✔
118	pruneHtml ((TagOpen tag attrs):xs) = (TagOpen tag [style \| style@("style", _) <- attrs]) : pruneHtml xs	4✔
119	pruneHtml (x:xs) = x : pruneHtml xs	8✔
120
121	-- \| Split requirements HTML into individual lines.
122	reqHtmlToLines :: [Tag T.Text] -> [[T.Text]]
123	reqHtmlToLines tags =
UNCOV 124	let sects = split (keepDelimsL $ whenElt isSectionSplit) tags	×
UNCOV 125	sectionsNoNotes = filter (not . isNoteSection) sects	×
126	paragraphs = concatMap (splitWhen (isTagOpenName "p")) sectionsNoNotes	×
127	lines' = map (map (T.strip . convertLine) . splitLines) paragraphs	×
128	in
129	lines'
130
131	where
132	isSectionSplit :: Tag T.Text -> Bool
133	isSectionSplit tag =
UNCOV 134	isTagText tag &&	×
UNCOV 135	any (flip T.isInfixOf $ fromTagText tag)	×
136	["First", "Second", "Third", "Higher", "Recommended Courses:", "Notes", "NOTES"]	×
137
138	isNoteSection :: [Tag T.Text] -> Bool
139	isNoteSection (sectionTitleTag:_) =
UNCOV 140	isTagText sectionTitleTag && any (flip T.isInfixOf $ fromTagText sectionTitleTag) ["Notes", "NOTES"]	×
UNCOV 141	isNoteSection [] = False	×
142
143	splitLines :: [Tag T.Text] -> [[Tag T.Text]]
UNCOV 144	splitLines = splitWhen (\tag -> isTagOpenName "br" tag \|\| isTagOpenName "li" tag)	×
145
146	convertLine :: [Tag T.Text] -> T.Text
UNCOV 147	convertLine [] = ""	×
148	convertLine (t:ts)
149	\| isTagOpenName "li" t = T.append "0." (innerText ts)	×
UNCOV 150	\| otherwise = innerText (t:ts)	×
151
152
153	parseRequirement :: [T.Text] -> [T.Text]
UNCOV 154	parseRequirement requirement = map parseSingleReq $ filter isReq requirement	×
155	where
156	isReq t = T.length t >= 7 &&	×
UNCOV 157	not (any (`T.isInfixOf` t) ["First", "Second", "Third", "Higher"])	×
158
159	parseSingleReq =
UNCOV 160	T.pack . show .	×
UNCOV 161	parseReqs . -- Using parser for new Req type	×
162	T.unpack .	×
163	fromRight "" .	×
164	P.parse getLineText "Reading a requirement line" .	×
165	T.strip	×
166
167	-- Strips the optional leading numbering (#.) from a line.
168	getLineText :: Parser T.Text
UNCOV 169	getLineText = do	×
UNCOV 170	P.optional $ P.try (P.digit >> P.char '.' >> P.space)	×
171	parseUntil P.eof	×

Courseography / courseography / c1af5ae6-6501-463e-8e2b-3f62f7a4d663

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous