• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

rdf-elixir / rdf-ex / e13c6dddfa3dcff42f19061d70f6a1c7c0caafb8-PR-17

pending completion
e13c6dddfa3dcff42f19061d70f6a1c7c0caafb8-PR-17

Pull #17

github

marcelotto
Pull Request #17: Add implementation of the RDF Dataset canonicalization algorithm

166 of 166 new or added lines in 9 files covered. (100.0%)

5349 of 7224 relevant lines covered (74.04%)

567.51 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

95.51
/lib/rdf/canonicalization/canonicalization.ex
1
defmodule RDF.Canonicalization do
2
  @moduledoc """
3
  An implementation of the standard RDF Dataset Canonicalization Algorithm.
4

5
  See <https://w3c-ccg.github.io/rdf-dataset-canonicalization/spec/>.
6
  """
7

8
  use RDF
9
  alias RDF.Canonicalization.{IdentifierIssuer, State}
10
  alias RDF.{BlankNode, Dataset, Quad, Statement, Utils}
11

12
  def normalize(input) do
13
    urdna2015(input)
62✔
14
  end
15

16
  defp urdna2015(input) do
17
    input
18
    |> State.new()
19
    |> create_canonical_identifiers_for_single_node_hashes()
20
    |> create_canonical_identifiers_for_multiple_node_hashes()
21
    |> apply_canonicalization(input)
62✔
22
  end
23

24
  # 3)
25
  defp create_canonical_identifiers_for_single_node_hashes(state) do
26
    non_normalized_identifiers = Map.keys(state.bnode_to_statements)
62✔
27
    do_create_canonical_identifiers_for_single_node_hashes(state, non_normalized_identifiers)
62✔
28
  end
29

30
  # 4)
31
  defp do_create_canonical_identifiers_for_single_node_hashes(
32
         state,
33
         non_normalized_identifiers,
34
         simple \\ true
62✔
35
       )
36

37
  # 5)
38
  defp do_create_canonical_identifiers_for_single_node_hashes(
39
         state,
40
         non_normalized_identifiers,
41
         true
42
       ) do
43
    # 5.2)
44
    state = State.clear_hash_to_bnodes(state)
84✔
45

46
    # 5.3) Calculate hashes for first degree nodes
47
    state =
84✔
48
      Enum.reduce(non_normalized_identifiers, state, fn identifier, state ->
49
        State.add_bnode_hash(state, identifier, hash_first_degree_quads(state, identifier))
234✔
50
      end)
51

52
    # 5.4) Create canonical replacements for hashes mapping to a single node
53
    {non_normalized_identifiers, state, simple} =
84✔
54
      state.hash_to_bnodes
84✔
55
      |> Enum.sort()
56
      |> Enum.reduce({non_normalized_identifiers, state, false}, fn
57
        {hash, identifier_list}, {non_normalized_identifiers, state, simple} ->
58
          case MapSet.to_list(identifier_list) do
97✔
59
            [identifier] ->
60
              state = State.issue_canonical_identifier(state, identifier)
41✔
61

62
              {
41✔
63
                List.delete(non_normalized_identifiers, identifier),
64
                State.delete_bnode_hash(state, hash),
65
                true
66
              }
67

68
            [] ->
69
              raise "unexpected empty identifier list"
×
70

71
            _ ->
72
              {non_normalized_identifiers, state, simple}
56✔
73
          end
74
      end)
75

76
    do_create_canonical_identifiers_for_single_node_hashes(
84✔
77
      state,
78
      non_normalized_identifiers,
79
      simple
80
    )
81
  end
82

83
  defp do_create_canonical_identifiers_for_single_node_hashes(state, _, false), do: state
62✔
84

85
  # 6)
86
  defp create_canonical_identifiers_for_multiple_node_hashes(state) do
87
    state.hash_to_bnodes
62✔
88
    |> Enum.sort()
89
    |> Enum.reduce(state, fn {_hash, identifier_list}, state ->
62✔
90
      # 6.1-2) Create a hash_path_list for all bnodes using a temporary identifier used to create canonical replacements
91
      identifier_list
92
      |> Enum.reduce([], fn identifier, hash_path_list ->
93
        if IdentifierIssuer.issued?(state.canonical_issuer, identifier) do
147✔
94
          hash_path_list
49✔
95
        else
96
          {_issued_identifier, temporary_issuer} =
98✔
97
            "_:b"
98
            |> IdentifierIssuer.new()
99
            |> IdentifierIssuer.issue_identifier(identifier)
100

101
          [
102
            hash_n_degree_quads(state, identifier, temporary_issuer)
103
            | hash_path_list
104
          ]
105
        end
106
      end)
107
      |> Enum.sort()
108
      # 6.3) Create canonical replacements for nodes
109
      |> Enum.reduce(state, fn {_hash, issuer}, state ->
44✔
110
        issuer
111
        |> IdentifierIssuer.issued_identifiers()
112
        |> Enum.reduce(state, &State.issue_canonical_identifier(&2, &1))
98✔
113
      end)
114
    end)
115
  end
116

117
  # 7)
118
  defp apply_canonicalization(state, data) do
119
    Enum.reduce(data, Dataset.new(), fn statement, canonicalized_data ->
62✔
120
      Dataset.add(
337✔
121
        canonicalized_data,
122
        if Statement.has_bnode?(statement) do
123
          Statement.map(statement, fn
277✔
124
            {_, %BlankNode{} = bnode} ->
125
              state.canonical_issuer
514✔
126
              |> IdentifierIssuer.identifier(bnode)
127
              |> BlankNode.new()
514✔
128

129
            {_, node} ->
130
              node
328✔
131
          end)
132
        else
133
          statement
60✔
134
        end
135
      )
136
    end)
137
  end
138

139
  # see https://www.w3.org/community/reports/credentials/CG-FINAL-rdf-dataset-canonicalization-20221009/#hash-first-degree-quads
140
  defp hash_first_degree_quads(state, ref_bnode_id) do
141
    state.bnode_to_statements
1,498✔
142
    |> Map.get(ref_bnode_id, [])
143
    |> Enum.map(fn statement ->
144
      statement
145
      |> Quad.new()
146
      |> Statement.map(fn
147
        {_, ^ref_bnode_id} -> ~B<a>
7,624✔
148
        {_, %BlankNode{}} -> ~B<z>
7,620✔
149
        {_, node} -> node
15,240✔
150
      end)
151
      |> RDF.dataset()
152
      |> NQuads.write_string!()
7,621✔
153
    end)
154
    |> Enum.sort()
155
    |> Enum.join()
156
    |> hash()
1,498✔
157

158
    # |> IO.inspect(label: "1deg: node: #{inspect(ref_bnode_id)}, hash_first_degree_quads")
159
  end
160

161
  # see https://www.w3.org/community/reports/credentials/CG-FINAL-rdf-dataset-canonicalization-20221009/#hash-related-blank-node
162
  defp hash_related_bnode(state, related, statement, issuer, position) do
163
    identifier =
8,806✔
164
      IdentifierIssuer.identifier(state.canonical_issuer, related) ||
8,806✔
165
        IdentifierIssuer.identifier(issuer, related) ||
8,792✔
166
        hash_first_degree_quads(state, related)
1,264✔
167

168
    input = to_string(position)
8,806✔
169

170
    input =
8,806✔
171
      if position != :g do
172
        "#{input}<#{Statement.predicate(statement)}>"
8,792✔
173
      else
174
        input
14✔
175
      end <> identifier
176

177
    hash(input)
8,806✔
178
    # |> IO.inspect(label: "hrel: input: #{inspect(input)}, hash_related_bnode")
179
  end
180

181
  # see https://www.w3.org/community/reports/credentials/CG-FINAL-rdf-dataset-canonicalization-20221009/#hash-n-degree-quads
182
  def hash_n_degree_quads(state, identifier, issuer) do
183
    # IO.inspect(identifier, label: "ndeg: identifier")
184

185
    # 1-3)
186
    hash_to_related_bnodes =
1,561✔
187
      Enum.reduce(state.bnode_to_statements[identifier], %{}, fn statement, map ->
1,561✔
188
        Map.merge(
8,796✔
189
          map,
190
          hash_related_statement(state, identifier, statement, issuer),
191
          fn _, terms, new -> terms ++ new end
432✔
192
        )
193
      end)
194

195
    # |> IO.inspect(label: "ndeg: hash_to_related_bnodes")
196

197
    {data_to_hash, issuer} =
1,561✔
198
      hash_to_related_bnodes
199
      |> Enum.sort()
200
      |> Enum.reduce({"", issuer}, fn
8,374✔
201
        {related_hash, bnode_list}, {data_to_hash, issuer} ->
202
          # 5.1)
203
          data_to_hash = data_to_hash <> related_hash
8,374✔
204
          chosen_path = ""
8,374✔
205
          chosen_issuer = nil
8,374✔
206

207
          # 5.2-4)
208
          {chosen_path, chosen_issuer} =
8,374✔
209
            bnode_list
210
            |> Utils.permutations()
211
            |> Enum.reduce({chosen_path, chosen_issuer}, fn
212
              permutation, {chosen_path, chosen_issuer} ->
213
                # IO.inspect(permutation, label: "ndeg: perm")
214

215
                issuer_copy = IdentifierIssuer.copy(issuer)
9,022✔
216
                chosen_path_length = String.length(chosen_path)
9,022✔
217

218
                # 5.4.4)
219
                {path, recursion_list, issuer_copy} =
9,022✔
220
                  Enum.reduce_while(permutation, {"", [], issuer_copy}, fn
221
                    related, {path, recursion_list, issuer_copy} ->
222
                      {path, recursion_list, issuer_copy} =
10,462✔
223
                        if issued_identifier =
224
                             IdentifierIssuer.identifier(state.canonical_issuer, related) do
10,462✔
225
                          {path <> issued_identifier, recursion_list, issuer_copy}
14✔
226
                        else
227
                          if issued_identifier = IdentifierIssuer.identifier(issuer_copy, related) do
10,448✔
228
                            {path <> issued_identifier, recursion_list, issuer_copy}
8,985✔
229
                          else
230
                            {issued_identifier, issuer_copy} =
1,463✔
231
                              IdentifierIssuer.issue_identifier(issuer_copy, related)
232

233
                            {
1,463✔
234
                              path <> issued_identifier,
235
                              [related | recursion_list],
236
                              issuer_copy
237
                            }
238
                          end
239
                        end
240

241
                      if chosen_path_length != 0 and
10,462✔
242
                           String.length(path) >= chosen_path_length and
1,656✔
243
                           path > chosen_path do
324✔
244
                        {:halt, {path, recursion_list, issuer_copy}}
245
                      else
246
                        {:cont, {path, recursion_list, issuer_copy}}
247
                      end
248
                  end)
249

250
                # IO.puts("ndeg: related_hash: #{related_hash}, path: #{path}, recursion: #{inspect(recursion_list)}")
251

252
                # 5.4.5)
253
                {issuer_copy, path} =
9,022✔
254
                  recursion_list
255
                  |> Enum.reverse()
256
                  |> Enum.reduce_while({issuer_copy, path}, fn related, {issuer_copy, path} ->
257
                    # Note: The following steps seem to be the only steps in the whole algorithm
258
                    # which really rely on global state.
259

260
                    # 5.4.5.1)
261
                    {result_hash, result_issuer} =
1,463✔
262
                      hash_n_degree_quads(state, related, issuer_copy)
263

264
                    # This step was added to circumvent the need for global state.
265
                    # It's unclear whether it is actually required, since all test
266
                    # of the test suite pass without it.
267
                    # see https://github.com/w3c-ccg/rdf-dataset-canonicalization/issues/31
268
                    result_issuer =
1,463✔
269
                      if result_issuer.id == issuer_copy.id do
1,463✔
270
                        {_, issuer} = IdentifierIssuer.issue_identifier(result_issuer, related)
×
271
                        issuer
×
272
                      else
273
                        result_issuer
1,463✔
274
                      end
275

276
                    # 5.4.5.2)
277
                    {issued_identifier, _issuer_copy} =
1,463✔
278
                      IdentifierIssuer.issue_identifier(issuer_copy, related)
279

280
                    path = path <> issued_identifier <> "<#{result_hash}>"
1,463✔
281

282
                    if chosen_path_length != 0 and
1,463✔
283
                         String.length(path) >= chosen_path_length and
828✔
284
                         path > chosen_path do
324✔
285
                      {:halt, {result_issuer, path}}
286
                    else
287
                      {:cont, {result_issuer, path}}
288
                    end
289
                  end)
290

291
                if chosen_path_length == 0 or path < chosen_path do
9,022✔
292
                  {path, issuer_copy}
293
                else
294
                  {chosen_path, chosen_issuer}
295
                end
296
            end)
297

298
          # 5.5)
299
          {data_to_hash <> chosen_path, chosen_issuer}
300
      end)
301

302
    # IO.puts("ndeg: datatohash: #{data_to_hash}, hash: #{hash(data_to_hash)}")
303

304
    {hash(data_to_hash), issuer}
305
  end
306

307
  # 4.8.2.3.1) Group adjacent bnodes by hash
308
  defp hash_related_statement(state, identifier, statement, issuer) do
309
    [
310
      s: Statement.subject(statement),
311
      o: Statement.object(statement),
312
      g: Statement.graph_name(statement)
313
    ]
314
    |> Enum.reduce(%{}, fn
8,796✔
315
      {_, ^identifier}, map ->
316
        map
8,798✔
317

318
      {pos, %BlankNode{} = term}, map ->
319
        hash = hash_related_bnode(state, term, statement, issuer, pos)
8,806✔
320

321
        Map.update(map, hash, [term], fn terms ->
8,806✔
322
          if term in terms, do: terms, else: [term | terms]
×
323
        end)
324

325
      _, map ->
326
        map
8,784✔
327
    end)
328
  end
329

330
  defp hash(data) do
331
    :crypto.hash(:sha256, data) |> Base.encode16(case: :lower)
11,865✔
332
  end
333
end
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc