e13c6dddfa3dcff42f19061d70f6a1c7c0caafb8-PR-17

pending completion

Build # e13c6dddfa3dcff42f19061d70f6a1c7c0caafb8-PR-17

Build Type

Pull #17

github

Committed by marcelotto

Commit Message

Pull Request Pull Request #17: Add implementation of the RDF Dataset canonicalization algorithm

Run Details

166 of 166 new or added lines in 9 files covered. (100.0%)

5349 of 7224 relevant lines covered (74.04%)

567.51 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

95.51

/lib/rdf/canonicalization/canonicalization.ex

defmodule RDF.Canonicalization do
  @moduledoc """
  An implementation of the standard RDF Dataset Canonicalization Algorithm.

  See <https://w3c-ccg.github.io/rdf-dataset-canonicalization/spec/>.
  """

  use RDF
  alias RDF.Canonicalization.{IdentifierIssuer, State}
  alias RDF.{BlankNode, Dataset, Quad, Statement, Utils}

  def normalize(input) do
    urdna2015(input)
  end

  defp urdna2015(input) do
    input
    |> State.new()
    |> create_canonical_identifiers_for_single_node_hashes()
    |> create_canonical_identifiers_for_multiple_node_hashes()
    |> apply_canonicalization(input)
  end

  # 3)
  defp create_canonical_identifiers_for_single_node_hashes(state) do
    non_normalized_identifiers = Map.keys(state.bnode_to_statements)
    do_create_canonical_identifiers_for_single_node_hashes(state, non_normalized_identifiers)
  end

  # 4)
  defp do_create_canonical_identifiers_for_single_node_hashes(
         state,
         non_normalized_identifiers,
         simple \\ true
       )

  # 5)
  defp do_create_canonical_identifiers_for_single_node_hashes(
         state,
         non_normalized_identifiers,
         true
       ) do
    # 5.2)
    state = State.clear_hash_to_bnodes(state)

    # 5.3) Calculate hashes for first degree nodes
    state =
      Enum.reduce(non_normalized_identifiers, state, fn identifier, state ->
        State.add_bnode_hash(state, identifier, hash_first_degree_quads(state, identifier))
      end)

    # 5.4) Create canonical replacements for hashes mapping to a single node
    {non_normalized_identifiers, state, simple} =
      state.hash_to_bnodes
      |> Enum.sort()
      |> Enum.reduce({non_normalized_identifiers, state, false}, fn
        {hash, identifier_list}, {non_normalized_identifiers, state, simple} ->
          case MapSet.to_list(identifier_list) do
            [identifier] ->
              state = State.issue_canonical_identifier(state, identifier)

              {
                List.delete(non_normalized_identifiers, identifier),
                State.delete_bnode_hash(state, hash),
                true
              }

            [] ->
              raise "unexpected empty identifier list"

            _ ->
              {non_normalized_identifiers, state, simple}
          end
      end)

    do_create_canonical_identifiers_for_single_node_hashes(
      state,
      non_normalized_identifiers,
      simple
    )
  end

  defp do_create_canonical_identifiers_for_single_node_hashes(state, _, false), do: state

  # 6)
  defp create_canonical_identifiers_for_multiple_node_hashes(state) do
    state.hash_to_bnodes
    |> Enum.sort()
    |> Enum.reduce(state, fn {_hash, identifier_list}, state ->
      # 6.1-2) Create a hash_path_list for all bnodes using a temporary identifier used to create canonical replacements
      identifier_list
      |> Enum.reduce([], fn identifier, hash_path_list ->
        if IdentifierIssuer.issued?(state.canonical_issuer, identifier) do
          hash_path_list
        else
          {_issued_identifier, temporary_issuer} =
            "_:b"
            |> IdentifierIssuer.new()
            |> IdentifierIssuer.issue_identifier(identifier)

          [
            hash_n_degree_quads(state, identifier, temporary_issuer)
            | hash_path_list
          ]
        end
      end)
      |> Enum.sort()
      # 6.3) Create canonical replacements for nodes
      |> Enum.reduce(state, fn {_hash, issuer}, state ->
        issuer
        |> IdentifierIssuer.issued_identifiers()
        |> Enum.reduce(state, &State.issue_canonical_identifier(&2, &1))
      end)
    end)
  end

  # 7)
  defp apply_canonicalization(state, data) do
    Enum.reduce(data, Dataset.new(), fn statement, canonicalized_data ->
      Dataset.add(
        canonicalized_data,
        if Statement.has_bnode?(statement) do
          Statement.map(statement, fn
            {_, %BlankNode{} = bnode} ->
              state.canonical_issuer
              |> IdentifierIssuer.identifier(bnode)
              |> BlankNode.new()

            {_, node} ->
              node
          end)
        else
          statement
        end
      )
    end)
  end

  # see https://www.w3.org/community/reports/credentials/CG-FINAL-rdf-dataset-canonicalization-20221009/#hash-first-degree-quads
  defp hash_first_degree_quads(state, ref_bnode_id) do
    state.bnode_to_statements
    |> Map.get(ref_bnode_id, [])
    |> Enum.map(fn statement ->
      statement
      |> Quad.new()
      |> Statement.map(fn
        {_, ^ref_bnode_id} -> ~B<a>
        {_, %BlankNode{}} -> ~B<z>
        {_, node} -> node
      end)
      |> RDF.dataset()
      |> NQuads.write_string!()
    end)
    |> Enum.sort()
    |> Enum.join()
    |> hash()

    # |> IO.inspect(label: "1deg: node: #{inspect(ref_bnode_id)}, hash_first_degree_quads")
  end

  # see https://www.w3.org/community/reports/credentials/CG-FINAL-rdf-dataset-canonicalization-20221009/#hash-related-blank-node
  defp hash_related_bnode(state, related, statement, issuer, position) do
    identifier =
      IdentifierIssuer.identifier(state.canonical_issuer, related) ||
        IdentifierIssuer.identifier(issuer, related) ||
        hash_first_degree_quads(state, related)

    input = to_string(position)

    input =
      if position != :g do
        "#{input}<#{Statement.predicate(statement)}>"
      else
        input
      end <> identifier

    hash(input)
    # |> IO.inspect(label: "hrel: input: #{inspect(input)}, hash_related_bnode")
  end

  # see https://www.w3.org/community/reports/credentials/CG-FINAL-rdf-dataset-canonicalization-20221009/#hash-n-degree-quads
  def hash_n_degree_quads(state, identifier, issuer) do
    # IO.inspect(identifier, label: "ndeg: identifier")

    # 1-3)
    hash_to_related_bnodes =
      Enum.reduce(state.bnode_to_statements[identifier], %{}, fn statement, map ->
        Map.merge(
          map,
          hash_related_statement(state, identifier, statement, issuer),
          fn _, terms, new -> terms ++ new end
        )
      end)

    # |> IO.inspect(label: "ndeg: hash_to_related_bnodes")

    {data_to_hash, issuer} =
      hash_to_related_bnodes
      |> Enum.sort()
      |> Enum.reduce({"", issuer}, fn
        {related_hash, bnode_list}, {data_to_hash, issuer} ->
          # 5.1)
          data_to_hash = data_to_hash <> related_hash
          chosen_path = ""
          chosen_issuer = nil

          # 5.2-4)
          {chosen_path, chosen_issuer} =
            bnode_list
            |> Utils.permutations()
            |> Enum.reduce({chosen_path, chosen_issuer}, fn
              permutation, {chosen_path, chosen_issuer} ->
                # IO.inspect(permutation, label: "ndeg: perm")

                issuer_copy = IdentifierIssuer.copy(issuer)
                chosen_path_length = String.length(chosen_path)

                # 5.4.4)
                {path, recursion_list, issuer_copy} =
                  Enum.reduce_while(permutation, {"", [], issuer_copy}, fn
                    related, {path, recursion_list, issuer_copy} ->
                      {path, recursion_list, issuer_copy} =
                        if issued_identifier =
                             IdentifierIssuer.identifier(state.canonical_issuer, related) do
                          {path <> issued_identifier, recursion_list, issuer_copy}
                        else
                          if issued_identifier = IdentifierIssuer.identifier(issuer_copy, related) do
                            {path <> issued_identifier, recursion_list, issuer_copy}
                          else
                            {issued_identifier, issuer_copy} =
                              IdentifierIssuer.issue_identifier(issuer_copy, related)

                            {
                              path <> issued_identifier,
                              [related | recursion_list],
                              issuer_copy
                            }
                          end
                        end

                      if chosen_path_length != 0 and
                           String.length(path) >= chosen_path_length and
                           path > chosen_path do
                        {:halt, {path, recursion_list, issuer_copy}}
                      else
                        {:cont, {path, recursion_list, issuer_copy}}
                      end
                  end)

                # IO.puts("ndeg: related_hash: #{related_hash}, path: #{path}, recursion: #{inspect(recursion_list)}")

                # 5.4.5)
                {issuer_copy, path} =
                  recursion_list
                  |> Enum.reverse()
                  |> Enum.reduce_while({issuer_copy, path}, fn related, {issuer_copy, path} ->
                    # Note: The following steps seem to be the only steps in the whole algorithm
                    # which really rely on global state.

                    # 5.4.5.1)
                    {result_hash, result_issuer} =
                      hash_n_degree_quads(state, related, issuer_copy)

                    # This step was added to circumvent the need for global state.
                    # It's unclear whether it is actually required, since all test
                    # of the test suite pass without it.
                    # see https://github.com/w3c-ccg/rdf-dataset-canonicalization/issues/31
                    result_issuer =
                      if result_issuer.id == issuer_copy.id do
                        {_, issuer} = IdentifierIssuer.issue_identifier(result_issuer, related)
                        issuer
                      else
                        result_issuer
                      end

                    # 5.4.5.2)
                    {issued_identifier, _issuer_copy} =
                      IdentifierIssuer.issue_identifier(issuer_copy, related)

                    path = path <> issued_identifier <> "<#{result_hash}>"

                    if chosen_path_length != 0 and
                         String.length(path) >= chosen_path_length and
                         path > chosen_path do
                      {:halt, {result_issuer, path}}
                    else
                      {:cont, {result_issuer, path}}
                    end
                  end)

                if chosen_path_length == 0 or path < chosen_path do
                  {path, issuer_copy}
                else
                  {chosen_path, chosen_issuer}
                end
            end)

          # 5.5)
          {data_to_hash <> chosen_path, chosen_issuer}
      end)

    # IO.puts("ndeg: datatohash: #{data_to_hash}, hash: #{hash(data_to_hash)}")

    {hash(data_to_hash), issuer}
  end

  # 4.8.2.3.1) Group adjacent bnodes by hash
  defp hash_related_statement(state, identifier, statement, issuer) do
    [
      s: Statement.subject(statement),
      o: Statement.object(statement),
      g: Statement.graph_name(statement)
    ]
    |> Enum.reduce(%{}, fn
      {_, ^identifier}, map ->
        map

      {pos, %BlankNode{} = term}, map ->
        hash = hash_related_bnode(state, term, statement, issuer, pos)

        Map.update(map, hash, [term], fn terms ->
          if term in terms, do: terms, else: [term | terms]
        end)

      _, map ->
        map
    end)
  end

  defp hash(data) do
    :crypto.hash(:sha256, data) |> Base.encode16(case: :lower)
  end
end

1	defmodule RDF.Canonicalization do
2	@moduledoc """
3	An implementation of the standard RDF Dataset Canonicalization Algorithm.
4
5	See <https://w3c-ccg.github.io/rdf-dataset-canonicalization/spec/>.
6	"""
7
8	use RDF
9	alias RDF.Canonicalization.{IdentifierIssuer, State}
10	alias RDF.{BlankNode, Dataset, Quad, Statement, Utils}
11
12	def normalize(input) do
13	urdna2015(input)	62✔
14	end
15
16	defp urdna2015(input) do
17	input
18	\|> State.new()
19	\|> create_canonical_identifiers_for_single_node_hashes()
20	\|> create_canonical_identifiers_for_multiple_node_hashes()
21	\|> apply_canonicalization(input)	62✔
22	end
23
24	# 3)
25	defp create_canonical_identifiers_for_single_node_hashes(state) do
26	non_normalized_identifiers = Map.keys(state.bnode_to_statements)	62✔
27	do_create_canonical_identifiers_for_single_node_hashes(state, non_normalized_identifiers)	62✔
28	end
29
30	# 4)
31	defp do_create_canonical_identifiers_for_single_node_hashes(
32	state,
33	non_normalized_identifiers,
34	simple \\ true	62✔
35	)
36
37	# 5)
38	defp do_create_canonical_identifiers_for_single_node_hashes(
39	state,
40	non_normalized_identifiers,
41	true
42	) do
43	# 5.2)
44	state = State.clear_hash_to_bnodes(state)	84✔
45
46	# 5.3) Calculate hashes for first degree nodes
47	state =	84✔
48	Enum.reduce(non_normalized_identifiers, state, fn identifier, state ->
49	State.add_bnode_hash(state, identifier, hash_first_degree_quads(state, identifier))	234✔
50	end)
51
52	# 5.4) Create canonical replacements for hashes mapping to a single node
53	{non_normalized_identifiers, state, simple} =	84✔
54	state.hash_to_bnodes	84✔
55	\|> Enum.sort()
56	\|> Enum.reduce({non_normalized_identifiers, state, false}, fn
57	{hash, identifier_list}, {non_normalized_identifiers, state, simple} ->
58	case MapSet.to_list(identifier_list) do	97✔
59	[identifier] ->
60	state = State.issue_canonical_identifier(state, identifier)	41✔
61
62	{	41✔
63	List.delete(non_normalized_identifiers, identifier),
64	State.delete_bnode_hash(state, hash),
65	true
66	}
67
68	[] ->
69	raise "unexpected empty identifier list"	×
70
71	_ ->
72	{non_normalized_identifiers, state, simple}	56✔
73	end
74	end)
75
76	do_create_canonical_identifiers_for_single_node_hashes(	84✔
77	state,
78	non_normalized_identifiers,
79	simple
80	)
81	end
82
83	defp do_create_canonical_identifiers_for_single_node_hashes(state, _, false), do: state	62✔
84
85	# 6)
86	defp create_canonical_identifiers_for_multiple_node_hashes(state) do
87	state.hash_to_bnodes	62✔
88	\|> Enum.sort()
89	\|> Enum.reduce(state, fn {_hash, identifier_list}, state ->	62✔
90	# 6.1-2) Create a hash_path_list for all bnodes using a temporary identifier used to create canonical replacements
91	identifier_list
92	\|> Enum.reduce([], fn identifier, hash_path_list ->
93	if IdentifierIssuer.issued?(state.canonical_issuer, identifier) do	147✔
94	hash_path_list	49✔
95	else
96	{_issued_identifier, temporary_issuer} =	98✔
97	"_:b"
98	\|> IdentifierIssuer.new()
99	\|> IdentifierIssuer.issue_identifier(identifier)
100
101	[
102	hash_n_degree_quads(state, identifier, temporary_issuer)
103	\| hash_path_list
104	]
105	end
106	end)
107	\|> Enum.sort()
108	# 6.3) Create canonical replacements for nodes
109	\|> Enum.reduce(state, fn {_hash, issuer}, state ->	44✔
110	issuer
111	\|> IdentifierIssuer.issued_identifiers()
112	\|> Enum.reduce(state, &State.issue_canonical_identifier(&2, &1))	98✔
113	end)
114	end)
115	end
116
117	# 7)
118	defp apply_canonicalization(state, data) do
119	Enum.reduce(data, Dataset.new(), fn statement, canonicalized_data ->	62✔
120	Dataset.add(	337✔
121	canonicalized_data,
122	if Statement.has_bnode?(statement) do
123	Statement.map(statement, fn	277✔
124	{_, %BlankNode{} = bnode} ->
125	state.canonical_issuer	514✔
126	\|> IdentifierIssuer.identifier(bnode)
127	\|> BlankNode.new()	514✔
128
129	{_, node} ->
130	node	328✔
131	end)
132	else
133	statement	60✔
134	end
135	)
136	end)
137	end
138
139	# see https://www.w3.org/community/reports/credentials/CG-FINAL-rdf-dataset-canonicalization-20221009/#hash-first-degree-quads
140	defp hash_first_degree_quads(state, ref_bnode_id) do
141	state.bnode_to_statements	1,498✔
142	\|> Map.get(ref_bnode_id, [])
143	\|> Enum.map(fn statement ->
144	statement
145	\|> Quad.new()
146	\|> Statement.map(fn
147	{_, ^ref_bnode_id} -> ~B<a>	7,624✔
148	{_, %BlankNode{}} -> ~B<z>	7,620✔
149	{_, node} -> node	15,240✔
150	end)
151	\|> RDF.dataset()
152	\|> NQuads.write_string!()	7,621✔
153	end)
154	\|> Enum.sort()
155	\|> Enum.join()
156	\|> hash()	1,498✔
157
158	# \|> IO.inspect(label: "1deg: node: #{inspect(ref_bnode_id)}, hash_first_degree_quads")
159	end
160
161	# see https://www.w3.org/community/reports/credentials/CG-FINAL-rdf-dataset-canonicalization-20221009/#hash-related-blank-node
162	defp hash_related_bnode(state, related, statement, issuer, position) do
163	identifier =	8,806✔
164	IdentifierIssuer.identifier(state.canonical_issuer, related) \|\|	8,806✔
165	IdentifierIssuer.identifier(issuer, related) \|\|	8,792✔
166	hash_first_degree_quads(state, related)	1,264✔
167
168	input = to_string(position)	8,806✔
169
170	input =	8,806✔
171	if position != :g do
172	"#{input}<#{Statement.predicate(statement)}>"	8,792✔
173	else
174	input	14✔
175	end <> identifier
176
177	hash(input)	8,806✔
178	# \|> IO.inspect(label: "hrel: input: #{inspect(input)}, hash_related_bnode")
179	end
180
181	# see https://www.w3.org/community/reports/credentials/CG-FINAL-rdf-dataset-canonicalization-20221009/#hash-n-degree-quads
182	def hash_n_degree_quads(state, identifier, issuer) do
183	# IO.inspect(identifier, label: "ndeg: identifier")
184
185	# 1-3)
186	hash_to_related_bnodes =	1,561✔
187	Enum.reduce(state.bnode_to_statements[identifier], %{}, fn statement, map ->	1,561✔
188	Map.merge(	8,796✔
189	map,
190	hash_related_statement(state, identifier, statement, issuer),
191	fn _, terms, new -> terms ++ new end	432✔
192	)
193	end)
194
195	# \|> IO.inspect(label: "ndeg: hash_to_related_bnodes")
196
197	{data_to_hash, issuer} =	1,561✔
198	hash_to_related_bnodes
199	\|> Enum.sort()
200	\|> Enum.reduce({"", issuer}, fn	8,374✔
201	{related_hash, bnode_list}, {data_to_hash, issuer} ->
202	# 5.1)
203	data_to_hash = data_to_hash <> related_hash	8,374✔
204	chosen_path = ""	8,374✔
205	chosen_issuer = nil	8,374✔
206
207	# 5.2-4)
208	{chosen_path, chosen_issuer} =	8,374✔
209	bnode_list
210	\|> Utils.permutations()
211	\|> Enum.reduce({chosen_path, chosen_issuer}, fn
212	permutation, {chosen_path, chosen_issuer} ->
213	# IO.inspect(permutation, label: "ndeg: perm")
214
215	issuer_copy = IdentifierIssuer.copy(issuer)	9,022✔
216	chosen_path_length = String.length(chosen_path)	9,022✔
217
218	# 5.4.4)
219	{path, recursion_list, issuer_copy} =	9,022✔
220	Enum.reduce_while(permutation, {"", [], issuer_copy}, fn
221	related, {path, recursion_list, issuer_copy} ->
222	{path, recursion_list, issuer_copy} =	10,462✔
223	if issued_identifier =
224	IdentifierIssuer.identifier(state.canonical_issuer, related) do	10,462✔
225	{path <> issued_identifier, recursion_list, issuer_copy}	14✔
226	else
227	if issued_identifier = IdentifierIssuer.identifier(issuer_copy, related) do	10,448✔
228	{path <> issued_identifier, recursion_list, issuer_copy}	8,985✔
229	else
230	{issued_identifier, issuer_copy} =	1,463✔
231	IdentifierIssuer.issue_identifier(issuer_copy, related)
232
233	{	1,463✔
234	path <> issued_identifier,
235	[related \| recursion_list],
236	issuer_copy
237	}
238	end
239	end
240
241	if chosen_path_length != 0 and	10,462✔
242	String.length(path) >= chosen_path_length and	1,656✔
243	path > chosen_path do	324✔
244	{:halt, {path, recursion_list, issuer_copy}}
245	else
246	{:cont, {path, recursion_list, issuer_copy}}
247	end
248	end)
249
250	# IO.puts("ndeg: related_hash: #{related_hash}, path: #{path}, recursion: #{inspect(recursion_list)}")
251
252	# 5.4.5)
253	{issuer_copy, path} =	9,022✔
254	recursion_list
255	\|> Enum.reverse()
256	\|> Enum.reduce_while({issuer_copy, path}, fn related, {issuer_copy, path} ->
257	# Note: The following steps seem to be the only steps in the whole algorithm
258	# which really rely on global state.
259
260	# 5.4.5.1)
261	{result_hash, result_issuer} =	1,463✔
262	hash_n_degree_quads(state, related, issuer_copy)
263
264	# This step was added to circumvent the need for global state.
265	# It's unclear whether it is actually required, since all test
266	# of the test suite pass without it.
267	# see https://github.com/w3c-ccg/rdf-dataset-canonicalization/issues/31
268	result_issuer =	1,463✔
269	if result_issuer.id == issuer_copy.id do	1,463✔
270	{_, issuer} = IdentifierIssuer.issue_identifier(result_issuer, related)	×
271	issuer	×
272	else
273	result_issuer	1,463✔
274	end
275
276	# 5.4.5.2)
277	{issued_identifier, _issuer_copy} =	1,463✔
278	IdentifierIssuer.issue_identifier(issuer_copy, related)
279
280	path = path <> issued_identifier <> "<#{result_hash}>"	1,463✔
281
282	if chosen_path_length != 0 and	1,463✔
283	String.length(path) >= chosen_path_length and	828✔
284	path > chosen_path do	324✔
285	{:halt, {result_issuer, path}}
286	else
287	{:cont, {result_issuer, path}}
288	end
289	end)
290
291	if chosen_path_length == 0 or path < chosen_path do	9,022✔
292	{path, issuer_copy}
293	else
294	{chosen_path, chosen_issuer}
295	end
296	end)
297
298	# 5.5)
299	{data_to_hash <> chosen_path, chosen_issuer}
300	end)
301
302	# IO.puts("ndeg: datatohash: #{data_to_hash}, hash: #{hash(data_to_hash)}")
303
304	{hash(data_to_hash), issuer}
305	end
306
307	# 4.8.2.3.1) Group adjacent bnodes by hash
308	defp hash_related_statement(state, identifier, statement, issuer) do
309	[
310	s: Statement.subject(statement),
311	o: Statement.object(statement),
312	g: Statement.graph_name(statement)
313	]
314	\|> Enum.reduce(%{}, fn	8,796✔
315	{_, ^identifier}, map ->
316	map	8,798✔
317
318	{pos, %BlankNode{} = term}, map ->
319	hash = hash_related_bnode(state, term, statement, issuer, pos)	8,806✔
320
321	Map.update(map, hash, [term], fn terms ->	8,806✔
322	if term in terms, do: terms, else: [term \| terms]	×
323	end)
324
325	_, map ->
326	map	8,784✔
327	end)
328	end
329
330	defp hash(data) do
331	:crypto.hash(:sha256, data) \|> Base.encode16(case: :lower)	11,865✔
332	end
333	end

rdf-elixir / rdf-ex / e13c6dddfa3dcff42f19061d70f6a1c7c0caafb8-PR-17

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous