From e865f6f807d3c4f9c2c9dfd6bc0ca9664ba101f0 Mon Sep 17 00:00:00 2001 From: supermaxiste Date: Wed, 26 Jun 2024 12:04:48 +0200 Subject: [PATCH 01/12] draft: hashing test --- Cargo.lock | 40 +++++++++++++++++++++++++++++++++++++++- Cargo.toml | 40 ++++++++++++++++++++-------------------- src/crypto.rs | 28 ++++++++++++++++++++++++++-- src/model.rs | 7 +++++-- 4 files changed, 90 insertions(+), 25 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3a22d02..04cda11 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -51,12 +51,43 @@ dependencies = [ "windows-sys", ] +[[package]] +name = "arrayref" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b4930d2cb77ce62f89ee5d5289b4ac049559b1c45539271f5ed4fdc7db34545" + +[[package]] +name = "arrayvec" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711" + [[package]] name = "bitflags" version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1" +[[package]] +name = "blake3" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30cca6d3674597c30ddf2c587bf8d9d65c9a84d2326d941cc79c9842dfe0ef52" +dependencies = [ + "arrayref", + "arrayvec", + "cc", + "cfg-if", + "constant_time_eq", +] + +[[package]] +name = "cc" +version = "1.0.101" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac367972e516d45567c7eafc73d24e1c193dcf200a8d94e9db7b3d38b349572d" + [[package]] name = "cfg-if" version = "1.0.0" @@ -109,6 +140,12 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b6a852b24ab71dffc585bcb46eaf7959d175cb865a7152e35b348d1b2960422" +[[package]] +name = "constant_time_eq" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7144d30dcf0fafbce74250a3963025d8d52177934239851c917d29f1df280c2" + [[package]] name = "crossbeam-channel" version = "0.5.13" @@ -351,9 +388,10 @@ dependencies = [ [[package]] name = "rdf-protect" -version = "0.0.1" +version = "0.1.0" dependencies = [ "bitflags", + "blake3", "clap", "io-enum", "rio_api", diff --git a/Cargo.toml b/Cargo.toml index 9218930..8214711 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,23 +1,23 @@ -[dependencies] - bitflags = '2.5.0' - io-enum = '1.1.3' - rio_api = '0.8.4' - rio_turtle = '0.8.4' - serde_yml = '0.0.10' - slog = '2.7.0' - slog-async = '2.8.0' - slog-term = '2.9.0' - tempfile = '3.10.1' +[package] +name = "rdf-protect" +version = "0.1.0" +edition = "2021" - [dependencies.clap] - features = ['derive'] - version = '4.5.7' +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - [dependencies.serde] - features = ['derive'] - version = '1.0' +[dependencies] +# Good logging library. +slog = "2.7.0" +slog-term = "2.9.0" +slog-async = "2.8.0" -[package] - edition = '2021' - name = 'rdf-protect' - version = '0.0.1' +# Popular serialization library. +serde = { version = "1.0", features = ['derive']} +clap = { version = "4.5.7", features = ["derive"] } +rio_turtle = "0.8.4" +rio_api = "0.8.4" +bitflags = "2.5.0" +io-enum = "1.1.3" +serde_yml = "0.0.10" +tempfile = "3.10.1" +blake3 = "1.5.1" diff --git a/src/crypto.rs b/src/crypto.rs index a8417bc..1fe8277 100644 --- a/src/crypto.rs +++ b/src/crypto.rs @@ -1,4 +1,28 @@ +use blake3; +use rio_api::model::{Literal, Term}; +enum Pseudonymizer { + Hasher, + Encrypter, +} + +pub struct Hasher { + algo: HashingAlgo, + salt: Option, +} + +enum HashingAlgo { + BLAKE3, + SHA256, +} + +struct Encrypter { + secret_key: Option, +} + // Computes the hash of string `s`. -pub fn hash(s: &str) -> String { - return s.to_string(); +pub fn hash_literal(s: Literal) -> Literal { + return Literal::Simple { + value: blake3::hash(&s.to_string().as_bytes()) + .to_hex().as_str() + }; } diff --git a/src/model.rs b/src/model.rs index 72b09ca..953641a 100644 --- a/src/model.rs +++ b/src/model.rs @@ -1,5 +1,8 @@ +use std::hash::Hash; + +use crate::crypto::{hash_literal, Hasher}; use bitflags; -use rio_api::model::{Subject, Term, Triple}; +use rio_api::model::{Literal, Subject, Term, Triple}; pub trait Pseudonymize { fn pseudo(&self) -> Self; @@ -47,7 +50,7 @@ pub fn pseudonymize_triple<'a>(triple: &Triple<'a>, mask: TripleMask) -> Triple< impl Pseudonymize for Term<'_> { fn pseudo(&self) -> Self { match self { - Term::Literal(val) => Term::Literal(*val), + Term::Literal(val) => Term::Literal(hash_literal(*val)), Term::NamedNode(val) => Term::NamedNode(*val), Term::BlankNode(val) => Term::BlankNode(*val), Term::Triple(_) => panic!("RDF-star not supported (triple as object)"), From cbbfa6cb5be41ba9e34cb7737fcd7ecb446738c4 Mon Sep 17 00:00:00 2001 From: supermaxiste Date: Thu, 27 Jun 2024 16:47:23 +0200 Subject: [PATCH 02/12] fix: from function to convert pointer to boxed string --- src/crypto.rs | 61 ++++++++++++++++++++++++++++++++++++++++++++++----- src/model.rs | 5 ++++- 2 files changed, 59 insertions(+), 7 deletions(-) diff --git a/src/crypto.rs b/src/crypto.rs index 1fe8277..db85f10 100644 --- a/src/crypto.rs +++ b/src/crypto.rs @@ -1,5 +1,44 @@ +use std::rc::Rc; + use blake3; -use rio_api::model::{Literal, Term}; +use rio_api::model::{Literal, NamedNode, Term}; + +enum BoxLiteral<'a> { + Simple { + value: Box, + }, + LanguageTaggedString { + value: Box, + language: Box, + }, + Typed { + value: Box, + datatype: Box>, + }, +} + +impl<'a> From> for BoxLiteral<'a> { + fn from(l: Literal<'a>) -> Self { + match l { + Literal::Simple { value } => BoxLiteral::Simple { + value: Box::new(value.to_string()), + }, + Literal::LanguageTaggedString { value, language } => BoxLiteral::LanguageTaggedString { + value: Box::new(value.to_string()), + language: Box::new(language.to_string()), + }, + Literal::Typed { value, datatype } => BoxLiteral::Typed { + value: Box::new(value.to_string()), + datatype: Box::new(datatype), + }, + } + } +} + +pub fn hash_literal(l: &Literal) -> BoxLiteral { + let bl = BoxLiteral::from(l) + return BoxLiteral.from(l) +} enum Pseudonymizer { Hasher, Encrypter, @@ -19,10 +58,20 @@ struct Encrypter { secret_key: Option, } +// Define a struct to hold the string and the literal +pub struct HashedLiteral<'a> { + pub literal: Literal<'a>, + pub _storage: Rc, // Store the string to ensure it lives long enough +} + // Computes the hash of string `s`. -pub fn hash_literal(s: Literal) -> Literal { - return Literal::Simple { - value: blake3::hash(&s.to_string().as_bytes()) - .to_hex().as_str() - }; +pub fn hash_literal<'a>(s: &[u8]) -> HashedLiteral<'a> { + let hashed_literal = blake3::hash(s).to_hex().to_string(); + let storage = Rc::new(hashed_literal); + let value = Rc::as_ref(&storage).as_str(); + let literal_return = Literal::Simple { value }; + HashedLiteral { + literal: literal_return, + _storage: storage, + } } diff --git a/src/model.rs b/src/model.rs index 953641a..3d4bf4a 100644 --- a/src/model.rs +++ b/src/model.rs @@ -50,7 +50,10 @@ pub fn pseudonymize_triple<'a>(triple: &Triple<'a>, mask: TripleMask) -> Triple< impl Pseudonymize for Term<'_> { fn pseudo(&self) -> Self { match self { - Term::Literal(val) => Term::Literal(hash_literal(*val)), + Term::Literal(val) => { + let hashed = hash_literal(val); + Term::Literal(hashed.literal) // Use the literal part of the struct + } Term::NamedNode(val) => Term::NamedNode(*val), Term::BlankNode(val) => Term::BlankNode(*val), Term::Triple(_) => panic!("RDF-star not supported (triple as object)"), From 12944e73f658f94769d0b945e869eb1746269ff5 Mon Sep 17 00:00:00 2001 From: supermaxiste Date: Thu, 27 Jun 2024 16:57:45 +0200 Subject: [PATCH 03/12] fix: clean crypto --- src/crypto.rs | 46 ++++++++-------------------------------------- src/model.rs | 4 ++-- 2 files changed, 10 insertions(+), 40 deletions(-) diff --git a/src/crypto.rs b/src/crypto.rs index db85f10..6744bb4 100644 --- a/src/crypto.rs +++ b/src/crypto.rs @@ -17,6 +17,12 @@ enum BoxLiteral<'a> { }, } +impl BoxLiteral<'_> { + fn to_literal<'a>(&'a self) -> Literal<'a> { + Literal::Simple { value: self } + } +} + impl<'a> From> for BoxLiteral<'a> { fn from(l: Literal<'a>) -> Self { match l { @@ -35,43 +41,7 @@ impl<'a> From> for BoxLiteral<'a> { } } -pub fn hash_literal(l: &Literal) -> BoxLiteral { - let bl = BoxLiteral::from(l) +pub fn hash_literal(l: Literal) -> BoxLiteral { + let bl = BoxLiteral::from(l); return BoxLiteral.from(l) } -enum Pseudonymizer { - Hasher, - Encrypter, -} - -pub struct Hasher { - algo: HashingAlgo, - salt: Option, -} - -enum HashingAlgo { - BLAKE3, - SHA256, -} - -struct Encrypter { - secret_key: Option, -} - -// Define a struct to hold the string and the literal -pub struct HashedLiteral<'a> { - pub literal: Literal<'a>, - pub _storage: Rc, // Store the string to ensure it lives long enough -} - -// Computes the hash of string `s`. -pub fn hash_literal<'a>(s: &[u8]) -> HashedLiteral<'a> { - let hashed_literal = blake3::hash(s).to_hex().to_string(); - let storage = Rc::new(hashed_literal); - let value = Rc::as_ref(&storage).as_str(); - let literal_return = Literal::Simple { value }; - HashedLiteral { - literal: literal_return, - _storage: storage, - } -} diff --git a/src/model.rs b/src/model.rs index 3d4bf4a..a6d2144 100644 --- a/src/model.rs +++ b/src/model.rs @@ -51,8 +51,8 @@ impl Pseudonymize for Term<'_> { fn pseudo(&self) -> Self { match self { Term::Literal(val) => { - let hashed = hash_literal(val); - Term::Literal(hashed.literal) // Use the literal part of the struct + let hashed = hash_literal(*val); + Term::Literal(hashed) // Use the literal part of the struct } Term::NamedNode(val) => Term::NamedNode(*val), Term::BlankNode(val) => Term::BlankNode(*val), From 4bcf2f1321a687a2cd5d7e5cb77a8f5e59642f1e Mon Sep 17 00:00:00 2001 From: supermaxiste Date: Thu, 27 Jun 2024 17:13:56 +0200 Subject: [PATCH 04/12] fix: managed hashing? --- src/crypto.rs | 20 ++++++++++++++------ src/model.rs | 2 +- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/src/crypto.rs b/src/crypto.rs index 6744bb4..967ee0f 100644 --- a/src/crypto.rs +++ b/src/crypto.rs @@ -1,7 +1,5 @@ -use std::rc::Rc; - use blake3; -use rio_api::model::{Literal, NamedNode, Term}; +use rio_api::model::{Literal, NamedNode}; enum BoxLiteral<'a> { Simple { @@ -18,8 +16,15 @@ enum BoxLiteral<'a> { } impl BoxLiteral<'_> { - fn to_literal<'a>(&'a self) -> Literal<'a> { - Literal::Simple { value: self } + pub fn to_literal<'a>(&'a self) -> Literal<'a> { + return Literal::Simple { value: self } + } + pub fn as_bytes(&self) -> &[u8] { + match self { + BoxLiteral::Simple { value } => value.as_bytes(), + BoxLiteral::LanguageTaggedString { value, language } => value.as_bytes(), + BoxLiteral::Typed { value, datatype } => value.as_bytes(), + } } } @@ -43,5 +48,8 @@ impl<'a> From> for BoxLiteral<'a> { pub fn hash_literal(l: Literal) -> BoxLiteral { let bl = BoxLiteral::from(l); - return BoxLiteral.from(l) + let hashed_l = blake3::hash(&bl.as_bytes()); + &hashed_l.to_hex().to_string(); + + return bl } diff --git a/src/model.rs b/src/model.rs index a6d2144..cf8f4a8 100644 --- a/src/model.rs +++ b/src/model.rs @@ -52,7 +52,7 @@ impl Pseudonymize for Term<'_> { match self { Term::Literal(val) => { let hashed = hash_literal(*val); - Term::Literal(hashed) // Use the literal part of the struct + Term::Literal(hashed.to_literal()) // Use the literal part of the struct } Term::NamedNode(val) => Term::NamedNode(*val), Term::BlankNode(val) => Term::BlankNode(*val), From 31026a88cf5635bcf2599760fa0a64b451403348 Mon Sep 17 00:00:00 2001 From: supermaxiste Date: Mon, 1 Jul 2024 17:07:14 +0200 Subject: [PATCH 05/12] refactor: types for rdf protect to rio types --- src/crypto.rs | 55 +----------- src/main.rs | 1 + src/model.rs | 81 ++++++----------- src/pass_second.rs | 10 +-- src/rdf_types.rs | 220 ++++++++++++++++++++++++++++++++++++++++++++- 5 files changed, 254 insertions(+), 113 deletions(-) diff --git a/src/crypto.rs b/src/crypto.rs index 967ee0f..4946acb 100644 --- a/src/crypto.rs +++ b/src/crypto.rs @@ -1,55 +1,2 @@ +use crate::rdf_types; use blake3; -use rio_api::model::{Literal, NamedNode}; - -enum BoxLiteral<'a> { - Simple { - value: Box, - }, - LanguageTaggedString { - value: Box, - language: Box, - }, - Typed { - value: Box, - datatype: Box>, - }, -} - -impl BoxLiteral<'_> { - pub fn to_literal<'a>(&'a self) -> Literal<'a> { - return Literal::Simple { value: self } - } - pub fn as_bytes(&self) -> &[u8] { - match self { - BoxLiteral::Simple { value } => value.as_bytes(), - BoxLiteral::LanguageTaggedString { value, language } => value.as_bytes(), - BoxLiteral::Typed { value, datatype } => value.as_bytes(), - } - } -} - -impl<'a> From> for BoxLiteral<'a> { - fn from(l: Literal<'a>) -> Self { - match l { - Literal::Simple { value } => BoxLiteral::Simple { - value: Box::new(value.to_string()), - }, - Literal::LanguageTaggedString { value, language } => BoxLiteral::LanguageTaggedString { - value: Box::new(value.to_string()), - language: Box::new(language.to_string()), - }, - Literal::Typed { value, datatype } => BoxLiteral::Typed { - value: Box::new(value.to_string()), - datatype: Box::new(datatype), - }, - } - } -} - -pub fn hash_literal(l: Literal) -> BoxLiteral { - let bl = BoxLiteral::from(l); - let hashed_l = blake3::hash(&bl.as_bytes()); - &hashed_l.to_hex().to_string(); - - return bl -} diff --git a/src/main.rs b/src/main.rs index a24b009..f43e307 100644 --- a/src/main.rs +++ b/src/main.rs @@ -5,6 +5,7 @@ mod log; mod model; mod pass_first; mod pass_second; +mod rdf_types; mod rules; // Define the imports. diff --git a/src/model.rs b/src/model.rs index cf8f4a8..17c495a 100644 --- a/src/model.rs +++ b/src/model.rs @@ -1,11 +1,36 @@ use std::hash::Hash; -use crate::crypto::{hash_literal, Hasher}; +use crate::rdf_types::*; use bitflags; -use rio_api::model::{Literal, Subject, Term, Triple}; + +pub enum Entity { + Literal(Literal), + Subject(Subject), + NamedNode(NamedNode), + Triple(Triple), + BlankNode(BlankNode), +} pub trait Pseudonymize { - fn pseudo(&self) -> Self; + fn pseudo(&self, e: Entity) -> Entity { + match e { + Literal => self.pseudo_literal(e), + Subject => self.pseudo_uri(e), + NamedNode => self.pseudo_uri(e), + Triple => self.pseudo_triple(e), + BlankNode => e, + } + } + // private methods? Blanket implementations + fn pseudo_triple(&self, t: Entity) -> Entity { + return t; + } + fn pseudo_literal(&self, l: Entity) -> Entity { + return l; + } + fn pseudo_uri(&self, u: Entity) -> Entity { + return u; + } } // Used to select any combination of fields in a triple @@ -24,53 +49,3 @@ impl TripleMask { return (*other - *self).bits() != 0; } } - -// Pseudonymize parts of a triple set by its mask -pub fn pseudonymize_triple<'a>(triple: &Triple<'a>, mask: TripleMask) -> Triple<'a> { - let pseudo_subject = if mask.is_set(&TripleMask::SUBJECT) { - &triple.subject.pseudo() - } else { - &triple.subject.clone() - }; - - let pseudo_object = if mask.is_set(&TripleMask::OBJECT) { - triple.object.pseudo() - } else { - triple.object.clone() - }; - - return Triple { - subject: *pseudo_subject, - predicate: triple.predicate, - object: pseudo_object, - }; -} - -// Pseudonymization of objects (Nodes or literals) -impl Pseudonymize for Term<'_> { - fn pseudo(&self) -> Self { - match self { - Term::Literal(val) => { - let hashed = hash_literal(*val); - Term::Literal(hashed.to_literal()) // Use the literal part of the struct - } - Term::NamedNode(val) => Term::NamedNode(*val), - Term::BlankNode(val) => Term::BlankNode(*val), - Term::Triple(_) => panic!("RDF-star not supported (triple as object)"), - } - } -} - -// Pseudonymization of subjects (always a URI / blank node) -impl Pseudonymize for Subject<'_> { - fn pseudo(&self) -> Self { - match self { - Subject::NamedNode(val) => Subject::NamedNode(*val), - Subject::BlankNode(val) => Subject::BlankNode(*val), - Subject::Triple(_) => panic!("RDF-star not supported (triple as subject)"), - } - } -} - -// TODO: implement for blanknodes -// NOTE: Support for RDF-star? diff --git a/src/pass_second.rs b/src/pass_second.rs index 54afdbd..aca3cc7 100644 --- a/src/pass_second.rs +++ b/src/pass_second.rs @@ -13,21 +13,21 @@ use crate::{ rules::Config, }; -fn mask_triple(triple: &Triple) -> TripleMask { +fn mask_triple(triple: Triple) -> TripleMask { return TripleMask::SUBJECT; } // mask and encode input triple // NOTE: This will need the type-map to perform masking fn process_triple( - triple: &Triple, + triple: Triple, rules_config: &Config, node_to_type: &HashMap, out: &mut impl Write, ) -> Result<(), TurtleError> { let mask = mask_triple(triple); - let pseudo_triple = pseudonymize_triple(&triple, mask); - let _ = out.write(&format!("{} .\n", &pseudo_triple.to_string()).into_bytes()); + let pseudo_triple = pseudonymize_triple(triple.into(), mask); + let _ = out.write(&format!("{} .\n", pseudo_triple.to_string()).into_bytes()); Ok(()) } @@ -57,7 +57,7 @@ pub fn pseudonymize_graph(log: &Logger, input: &Path, config: &Path, output: &Pa let mut triples = io::parse_ntriples(buf_input); while !triples.is_end() { triples - .parse_step(&mut |t| process_triple(&t, &rules_config, &node_to_type, &mut buf_output)) + .parse_step(&mut |t| process_triple(t, &rules_config, &node_to_type, &mut buf_output)) .unwrap(); } } diff --git a/src/rdf_types.rs b/src/rdf_types.rs index 0dfa4de..aa0db27 100644 --- a/src/rdf_types.rs +++ b/src/rdf_types.rs @@ -1,9 +1,227 @@ use rio_api; +use std::{fmt, fmt::Write, ops::Sub}; -#[derive(Eq, PartialEq, Ord, PartialOrd, Debug, Clone, Copy, Hash)] +// Rewrite all the rio types to be able to instanciate triples +// Rename rio types as XXXView to distinguish them from our types +// Use rio types for parsing and serializing +// Define mappers between the two types + +#[derive(Eq, PartialEq, Ord, PartialOrd, Debug, Clone, Hash)] pub struct NamedNode { /// The [IRI](https://www.w3.org/TR/rdf11-concepts/#dfn-iri) itself. pub iri: String, } +impl fmt::Display for NamedNode { + #[inline] + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "<{}>", self.iri) + } +} + type NamedNodeView<'a> = rio_api::model::NamedNode<'a>; + +#[derive(Eq, PartialEq, Debug, Clone, Hash)] +pub enum Literal { + /// A [simple literal](https://www.w3.org/TR/rdf11-concepts/#dfn-simple-literal) without datatype or language form. + Simple { + /// The [lexical form](https://www.w3.org/TR/rdf11-concepts/#dfn-lexical-form). + value: String, + }, + /// A [language-tagged string](https://www.w3.org/TR/rdf11-concepts/#dfn-language-tagged-string) + LanguageTaggedString { + /// The [lexical form](https://www.w3.org/TR/rdf11-concepts/#dfn-lexical-form). + value: String, + /// The [language tag](https://www.w3.org/TR/rdf11-concepts/#dfn-language-tag). + language: String, + }, + /// A literal with an explicit datatype + Typed { + /// The [lexical form](https://www.w3.org/TR/rdf11-concepts/#dfn-lexical-form). + value: String, + /// The [datatype IRI](https://www.w3.org/TR/rdf11-concepts/#dfn-datatype-iri). + datatype: NamedNode, + }, +} + +impl fmt::Display for Literal { + #[inline] + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Literal::Simple { value } => fmt_quoted_str(value, f), + Literal::LanguageTaggedString { value, language } => { + fmt_quoted_str(value, f)?; + write!(f, "@{}", language) + } + Literal::Typed { value, datatype } => { + fmt_quoted_str(value, f)?; + write!(f, "^^{}", datatype) + } + } + } +} + +type LiteralView<'a> = rio_api::model::Literal<'a>; + +#[derive(Eq, PartialEq, Debug, Clone, Hash)] +pub enum Term { + NamedNode(NamedNode), + BlankNode(BlankNode), + Literal(Literal), +} + +impl fmt::Display for Term { + #[inline] + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Term::NamedNode(node) => node.fmt(f), + Term::BlankNode(node) => node.fmt(f), + Term::Literal(literal) => literal.fmt(f), + } + } +} + +type TermView<'a> = rio_api::model::Term<'a>; + +#[derive(Eq, PartialEq, Debug, Clone, Hash)] +pub struct Triple { + pub subject: Subject, + pub predicate: NamedNode, + pub object: Term, +} + +impl fmt::Display for Triple { + #[inline] + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{} {} {}", self.subject, self.predicate, self.object) + } +} + +impl<'a> From> for Triple { + fn from(t: TripleView<'a>) -> Self { + match t { + TripleView { + subject, + predicate, + object, + } => Triple { + subject: subject.into(), + predicate: predicate.into(), + object: object.into(), + }, + } + } +} + +type TripleView<'a> = rio_api::model::Triple<'a>; + +#[derive(Eq, PartialEq, Debug, Clone, Hash)] +pub enum Subject { + NamedNode(NamedNode), + BlankNode(BlankNode), +} + +impl fmt::Display for Subject { + #[inline] + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Subject::NamedNode(node) => node.fmt(f), + Subject::BlankNode(node) => node.fmt(f), + } + } +} + +impl<'a> From> for Subject { + #[inline] + fn from(resource: SubjectView) -> Self { + match resource { + SubjectView::NamedNode(node) => Subject::NamedNode(node.into()), + SubjectView::BlankNode(node) => Subject::BlankNode(node.into()), + _ => panic!("Unexpected subject type"), + } + } +} + +impl<'a> From> for Term { + #[inline] + fn from(term: TermView<'a>) -> Self { + match term { + TermView::NamedNode(node) => Term::NamedNode(node.into()), + TermView::BlankNode(node) => Term::BlankNode(node.into()), + TermView::Literal(literal) => Term::Literal(literal.into()), + _ => panic!("Unexpected term type"), + } + } +} + +impl<'a> From> for NamedNode { + #[inline] + fn from(node: NamedNodeView<'a>) -> Self { + NamedNode { + iri: node.iri.to_string(), + } + } +} + +impl<'a> From> for BlankNode { + #[inline] + fn from(node: BlankNodeView<'a>) -> Self { + BlankNode { + id: node.id.to_string(), + } + } +} + +type SubjectView<'a> = rio_api::model::Subject<'a>; + +#[derive(Eq, PartialEq, Debug, Clone, Hash)] +pub struct BlankNode { + /// The [blank node identifier](https://www.w3.org/TR/rdf11-concepts/#dfn-blank-node-identifier). + pub id: String, +} + +impl fmt::Display for BlankNode { + #[inline] + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "_:{}", self.id) + } +} + +type BlankNodeView<'a> = rio_api::model::BlankNode<'a>; + +impl<'a> From> for Literal { + fn from(l: LiteralView<'a>) -> Self { + match l { + LiteralView::Simple { value } => Literal::Simple { + value: value.to_string(), + }, + LiteralView::LanguageTaggedString { value, language } => { + Literal::LanguageTaggedString { + value: value.to_string(), + language: language.to_string(), + } + } + LiteralView::Typed { value, datatype } => Literal::Typed { + value: value.to_string(), + datatype: NamedNode { + iri: datatype.iri.to_string(), + }, + }, + } + } +} + +#[inline] +fn fmt_quoted_str(string: &String, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_char('"')?; + for c in string.chars() { + match c { + '\n' => f.write_str("\\n"), + '\r' => f.write_str("\\r"), + '"' => f.write_str("\\\""), + '\\' => f.write_str("\\\\"), + c => f.write_char(c), + }?; + } + f.write_char('"') +} From 3ddbb82c5858704523f0af2d7e3ed844692b1f9d Mon Sep 17 00:00:00 2001 From: supermaxiste Date: Tue, 2 Jul 2024 17:43:18 +0200 Subject: [PATCH 06/12] fix: add new pseudo structure --- src/crypto.rs | 77 +++++++++++++++++++++++++++++++++++++++++++++- src/model.rs | 24 --------------- src/pass_second.rs | 7 +---- src/rdf_types.rs | 40 ++++++++++++++++++++++++ 4 files changed, 117 insertions(+), 31 deletions(-) diff --git a/src/crypto.rs b/src/crypto.rs index 4946acb..81fdb01 100644 --- a/src/crypto.rs +++ b/src/crypto.rs @@ -1,2 +1,77 @@ -use crate::rdf_types; +use super::model::Entity; +use crate::{model::TripleMask, rdf_types::*}; use blake3; + +pub trait Pseudonymize { + // Pseudonymize parts of a triple set by its mask + fn pseudo_triple(&self, triple: &Triple, mask: TripleMask) -> Triple { + let pseudo_subject = if mask.is_set(&TripleMask::SUBJECT) { + &self.pseudo_entity(&triple.subject.into()) + } else { + &triple.subject.into() + }; + + let pseudo_object = if mask.is_set(&TripleMask::OBJECT) { + &self.pseudo_entity(&triple.object.into()) + } else { + &triple.object.into() + }; + + return Triple { + subject: Subject::from(pseudo_subject.clone()), + predicate: triple.predicate.clone(), + object: Term::from(pseudo_object.clone()), + }; + } + + fn pseudo_entity(&self, e: &Entity) -> Entity { + match e { + Entity::Literal(l) => Entity::Literal(self.pseudo_literal(&l)), + Entity::NamedNode(n) => Entity::NamedNode(self.pseudo_named_node(&n)), + Entity::BlankNode(b) => Entity::BlankNode(self.pseudo_blank_node(&b)), + } + } + // private methods? Blanket implementations + fn pseudo_named_node(&self, t: &NamedNode) -> NamedNode; + //return t.clone(); + + fn pseudo_literal(&self, l: &Literal) -> Literal; + //return l.clone(); + + fn pseudo_blank_node(&self, u: &BlankNode) -> BlankNode; + // return u.clone() +} + +pub struct Hasher { + hasher: blake3::Hasher, +} + +impl Hasher { + pub fn new() -> Self { + return Hasher { + hasher: blake3::Hasher::new(), + }; + } +} + +impl Pseudonymize for Hasher { + fn pseudo_named_node(&self, t: &NamedNode) -> NamedNode { + return t.clone(); + } + + fn pseudo_literal(&self, l: &Literal) -> Literal { + let value = match l { + Literal::Typed { value, datatype: _ } => value, + Literal::LanguageTaggedString { value, language: _ } => value, + Literal::Simple { value } => value, + }; + let hash = blake3::hash(value.as_bytes()); + return Literal::Simple { + value: hash.to_string(), + }; + } + + fn pseudo_blank_node(&self, u: &BlankNode) -> BlankNode { + return u.clone(); + } +} diff --git a/src/model.rs b/src/model.rs index 17c495a..8400ab7 100644 --- a/src/model.rs +++ b/src/model.rs @@ -5,34 +5,10 @@ use bitflags; pub enum Entity { Literal(Literal), - Subject(Subject), NamedNode(NamedNode), - Triple(Triple), BlankNode(BlankNode), } -pub trait Pseudonymize { - fn pseudo(&self, e: Entity) -> Entity { - match e { - Literal => self.pseudo_literal(e), - Subject => self.pseudo_uri(e), - NamedNode => self.pseudo_uri(e), - Triple => self.pseudo_triple(e), - BlankNode => e, - } - } - // private methods? Blanket implementations - fn pseudo_triple(&self, t: Entity) -> Entity { - return t; - } - fn pseudo_literal(&self, l: Entity) -> Entity { - return l; - } - fn pseudo_uri(&self, u: Entity) -> Entity { - return u; - } -} - // Used to select any combination of fields in a triple bitflags::bitflags! { #[derive(Debug, Copy, Clone)] diff --git a/src/pass_second.rs b/src/pass_second.rs index aca3cc7..70a1a0e 100644 --- a/src/pass_second.rs +++ b/src/pass_second.rs @@ -6,12 +6,7 @@ use std::{ path::Path, }; -use crate::{ - io, - log::Logger, - model::{pseudonymize_triple, TripleMask}, - rules::Config, -}; +use crate::{io, log::Logger, model::TripleMask, rules::Config}; fn mask_triple(triple: Triple) -> TripleMask { return TripleMask::SUBJECT; diff --git a/src/rdf_types.rs b/src/rdf_types.rs index aa0db27..3ee8140 100644 --- a/src/rdf_types.rs +++ b/src/rdf_types.rs @@ -1,5 +1,6 @@ use rio_api; use std::{fmt, fmt::Write, ops::Sub}; +use super::model::Entity; // Rewrite all the rio types to be able to instanciate triples // Rename rio types as XXXView to distinguish them from our types @@ -211,6 +212,45 @@ impl<'a> From> for Literal { } } +impl From for Entity { + fn from(subject: Subject) -> Entity { + match subject { + Subject::NamedNode(node) => Entity::NamedNode(node), + Subject::BlankNode(node) => Entity::BlankNode(node), + } + } +} + +impl From for Entity { + fn from(term: Term) -> Entity { + match term { + Term::NamedNode(node) => Entity::NamedNode(node), + Term::BlankNode(node) => Entity::BlankNode(node), + Term::Literal(literal) => Entity::Literal(literal), + } + } +} + +impl From for Subject { + fn from(entity: Entity) -> Subject { + match entity { + Entity::NamedNode(node) => Subject::NamedNode(node), + Entity::BlankNode(node) => Subject::BlankNode(node), + _ => panic!("Unexpected entity type"), + } + } +} + +impl From for Term { + fn from(entity: Entity) -> Term { + match entity { + Entity::NamedNode(node) => Term::NamedNode(node), + Entity::BlankNode(node) => Term::BlankNode(node), + Entity::Literal(literal) => Term::Literal(literal), + } + } +} + #[inline] fn fmt_quoted_str(string: &String, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.write_char('"')?; From 9cd0b7d95cdca6de61fdbae7f043b6a0dc0738a2 Mon Sep 17 00:00:00 2001 From: supermaxiste Date: Wed, 3 Jul 2024 13:35:07 +0200 Subject: [PATCH 07/12] fix: match types and restructure pseudonymization call --- src/crypto.rs | 16 ++++++++-------- src/model.rs | 1 + src/pass_second.rs | 12 ++++++------ 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/src/crypto.rs b/src/crypto.rs index 81fdb01..6e1e4df 100644 --- a/src/crypto.rs +++ b/src/crypto.rs @@ -6,15 +6,15 @@ pub trait Pseudonymize { // Pseudonymize parts of a triple set by its mask fn pseudo_triple(&self, triple: &Triple, mask: TripleMask) -> Triple { let pseudo_subject = if mask.is_set(&TripleMask::SUBJECT) { - &self.pseudo_entity(&triple.subject.into()) + &self.pseudo_entity(&triple.subject.clone().into()) } else { - &triple.subject.into() + &triple.subject.clone().into() }; let pseudo_object = if mask.is_set(&TripleMask::OBJECT) { - &self.pseudo_entity(&triple.object.into()) + &self.pseudo_entity(&triple.object.clone().into()) } else { - &triple.object.into() + &triple.object.clone().into() }; return Triple { @@ -42,19 +42,19 @@ pub trait Pseudonymize { // return u.clone() } -pub struct Hasher { +pub struct DefaultHasher { hasher: blake3::Hasher, } -impl Hasher { +impl DefaultHasher { pub fn new() -> Self { - return Hasher { + return DefaultHasher { hasher: blake3::Hasher::new(), }; } } -impl Pseudonymize for Hasher { +impl Pseudonymize for DefaultHasher { fn pseudo_named_node(&self, t: &NamedNode) -> NamedNode { return t.clone(); } diff --git a/src/model.rs b/src/model.rs index 8400ab7..d0880a9 100644 --- a/src/model.rs +++ b/src/model.rs @@ -3,6 +3,7 @@ use std::hash::Hash; use crate::rdf_types::*; use bitflags; +#[derive(Eq, PartialEq, Debug, Clone, Hash)] pub enum Entity { Literal(Literal), NamedNode(NamedNode), diff --git a/src/pass_second.rs b/src/pass_second.rs index 70a1a0e..1c2ff25 100644 --- a/src/pass_second.rs +++ b/src/pass_second.rs @@ -1,4 +1,4 @@ -use rio_api::{model::Triple, parser::TriplesParser}; +use rio_api::parser::TriplesParser; use rio_turtle::TurtleError; use std::{ collections::HashMap, @@ -6,7 +6,7 @@ use std::{ path::Path, }; -use crate::{io, log::Logger, model::TripleMask, rules::Config}; +use crate::{io, log::Logger, model::TripleMask, rules::Config, crypto::{DefaultHasher, Pseudonymize}, rdf_types::{Triple, Term}}; fn mask_triple(triple: Triple) -> TripleMask { return TripleMask::SUBJECT; @@ -20,9 +20,9 @@ fn process_triple( node_to_type: &HashMap, out: &mut impl Write, ) -> Result<(), TurtleError> { - let mask = mask_triple(triple); - let pseudo_triple = pseudonymize_triple(triple.into(), mask); - let _ = out.write(&format!("{} .\n", pseudo_triple.to_string()).into_bytes()); + let mask = mask_triple(triple.clone()); + let hasher = DefaultHasher::new(); + let _ = out.write(&format!("{} .\n", hasher.pseudo_triple(&triple, mask).to_string()).into_bytes()); Ok(()) } @@ -52,7 +52,7 @@ pub fn pseudonymize_graph(log: &Logger, input: &Path, config: &Path, output: &Pa let mut triples = io::parse_ntriples(buf_input); while !triples.is_end() { triples - .parse_step(&mut |t| process_triple(t, &rules_config, &node_to_type, &mut buf_output)) + .parse_step(&mut |t| process_triple(t.into(), &rules_config, &node_to_type, &mut buf_output)) .unwrap(); } } From d66942af1b1e664a60a5929130ac6bdcc1313a03 Mon Sep 17 00:00:00 2001 From: supermaxiste Date: Wed, 3 Jul 2024 17:27:15 +0200 Subject: [PATCH 08/12] fix: implemented subject rules --- src/crypto.rs | 7 ++++++- src/model.rs | 16 ++++++++++++++- src/pass_second.rs | 49 ++++++++++++++++++++++++++++++++++++---------- src/rdf_types.rs | 10 +++++----- 4 files changed, 65 insertions(+), 17 deletions(-) diff --git a/src/crypto.rs b/src/crypto.rs index 6e1e4df..3e9f677 100644 --- a/src/crypto.rs +++ b/src/crypto.rs @@ -56,7 +56,12 @@ impl DefaultHasher { impl Pseudonymize for DefaultHasher { fn pseudo_named_node(&self, t: &NamedNode) -> NamedNode { - return t.clone(); + // We check for the last backslash in the IRI and add 1 to include the backslash + let prefix = &t.iri[0..t.iri.rfind('/').unwrap()+1]; + let hash = blake3::hash(t.iri.as_bytes()).to_string(); + return NamedNode { + iri: format!("{prefix}{hash}"), + }; } fn pseudo_literal(&self, l: &Literal) -> Literal { diff --git a/src/model.rs b/src/model.rs index d0880a9..3e6049e 100644 --- a/src/model.rs +++ b/src/model.rs @@ -23,6 +23,20 @@ bitflags::bitflags! { impl TripleMask { // Checks if bit from another mask are all set in this mask pub fn is_set(&self, other: &TripleMask) -> bool { - return (*other - *self).bits() != 0; + return (*other - *self).bits() == 0; + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + // Test the parsing of a triple. + fn is_subject_set() { + let mask_s = TripleMask::SUBJECT; + let mask_so = TripleMask::SUBJECT | TripleMask::OBJECT; + assert!(mask_s.is_set(&TripleMask::SUBJECT)); + assert!(mask_so.is_set(&TripleMask::SUBJECT)); } } diff --git a/src/pass_second.rs b/src/pass_second.rs index 1c2ff25..d2b3fa8 100644 --- a/src/pass_second.rs +++ b/src/pass_second.rs @@ -1,15 +1,40 @@ use rio_api::parser::TriplesParser; use rio_turtle::TurtleError; use std::{ - collections::HashMap, - io::{BufRead, Write}, - path::Path, + collections::HashMap, fmt::{Debug, Display}, io::{BufRead, Write}, path::Path, }; -use crate::{io, log::Logger, model::TripleMask, rules::Config, crypto::{DefaultHasher, Pseudonymize}, rdf_types::{Triple, Term}}; +use crate::{ + crypto::{DefaultHasher, Pseudonymize}, + io, + log::Logger, + model::TripleMask, + rdf_types::*, + rules::Config, +}; -fn mask_triple(triple: Triple) -> TripleMask { - return TripleMask::SUBJECT; +fn mask_triple(triple: Triple, rules: &Config, type_map: &HashMap) -> TripleMask { + // Check each field of the triple against the rules + // rules.replace_uri_of_nodes_with_type has to to with OBJECT and SUBJECT + // rules.replace_value_of_predicate has to do with OBJECT + // rules.replace_value_of_object has to do with OBJECT + // We will start by checking the subject and then the object + match triple.subject { + Subject::NamedNode(n) => { + // First check if the iri is in any of the types + let type_check = type_map.contains_key(&n.iri); + // If the iri is in the type map, check if it is in the rules + if type_check { + let iri_type = type_map.get(&n.iri).unwrap(); + let config_check = rules.replace_uri_of_nodes_with_type.contains(iri_type); + if config_check { + return TripleMask::SUBJECT; + } + } + } + Subject::BlankNode(_) => {} + } + return TripleMask::from_bits_truncate(0b0) } // mask and encode input triple @@ -20,9 +45,11 @@ fn process_triple( node_to_type: &HashMap, out: &mut impl Write, ) -> Result<(), TurtleError> { - let mask = mask_triple(triple.clone()); + let mask = mask_triple(triple.clone(), &rules_config, &node_to_type); + println!("Mask: {:?}", mask.bits()); let hasher = DefaultHasher::new(); - let _ = out.write(&format!("{} .\n", hasher.pseudo_triple(&triple, mask).to_string()).into_bytes()); + let _ = + out.write(&format!("{} .\n", hasher.pseudo_triple(&triple, mask).to_string()).into_bytes()); Ok(()) } @@ -34,7 +61,7 @@ fn load_type_map(input: impl BufRead) -> HashMap { while !triples.is_end() { let _: Result<(), TurtleError> = triples.parse_step(&mut |t| { - node_to_type.insert(t.subject.to_string(), t.object.to_string()); + node_to_type.insert(t.subject.to_string().replace(&['<','>'][..], ""), t.object.to_string().replace(&['<','>'][..], "")); Ok(()) }); } @@ -52,7 +79,9 @@ pub fn pseudonymize_graph(log: &Logger, input: &Path, config: &Path, output: &Pa let mut triples = io::parse_ntriples(buf_input); while !triples.is_end() { triples - .parse_step(&mut |t| process_triple(t.into(), &rules_config, &node_to_type, &mut buf_output)) + .parse_step(&mut |t| { + process_triple(t.into(), &rules_config, &node_to_type, &mut buf_output) + }) .unwrap(); } } diff --git a/src/rdf_types.rs b/src/rdf_types.rs index 3ee8140..38f653f 100644 --- a/src/rdf_types.rs +++ b/src/rdf_types.rs @@ -1,6 +1,6 @@ +use super::model::Entity; use rio_api; use std::{fmt, fmt::Write, ops::Sub}; -use super::model::Entity; // Rewrite all the rio types to be able to instanciate triples // Rename rio types as XXXView to distinguish them from our types @@ -212,7 +212,7 @@ impl<'a> From> for Literal { } } -impl From for Entity { +impl From for Entity { fn from(subject: Subject) -> Entity { match subject { Subject::NamedNode(node) => Entity::NamedNode(node), @@ -221,7 +221,7 @@ impl From for Entity { } } -impl From for Entity { +impl From for Entity { fn from(term: Term) -> Entity { match term { Term::NamedNode(node) => Entity::NamedNode(node), @@ -231,7 +231,7 @@ impl From for Entity { } } -impl From for Subject { +impl From for Subject { fn from(entity: Entity) -> Subject { match entity { Entity::NamedNode(node) => Subject::NamedNode(node), @@ -241,7 +241,7 @@ impl From for Subject { } } -impl From for Term { +impl From for Term { fn from(entity: Entity) -> Term { match entity { Entity::NamedNode(node) => Term::NamedNode(node), From 144f895b63055b223122f556846103d62bc654f1 Mon Sep 17 00:00:00 2001 From: supermaxiste Date: Mon, 8 Jul 2024 11:47:19 +0200 Subject: [PATCH 09/12] fix: implemented rules with tests --- src/crypto.rs | 2 +- src/pass_second.rs | 39 +++--- src/rdf_types.rs | 8 +- src/rules.rs | 291 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 314 insertions(+), 26 deletions(-) diff --git a/src/crypto.rs b/src/crypto.rs index 3e9f677..500ad17 100644 --- a/src/crypto.rs +++ b/src/crypto.rs @@ -57,7 +57,7 @@ impl DefaultHasher { impl Pseudonymize for DefaultHasher { fn pseudo_named_node(&self, t: &NamedNode) -> NamedNode { // We check for the last backslash in the IRI and add 1 to include the backslash - let prefix = &t.iri[0..t.iri.rfind('/').unwrap()+1]; + let prefix = &t.iri[0..t.iri.rfind('/').unwrap() + 1]; let hash = blake3::hash(t.iri.as_bytes()).to_string(); return NamedNode { iri: format!("{prefix}{hash}"), diff --git a/src/pass_second.rs b/src/pass_second.rs index d2b3fa8..ead03ff 100644 --- a/src/pass_second.rs +++ b/src/pass_second.rs @@ -1,7 +1,10 @@ use rio_api::parser::TriplesParser; use rio_turtle::TurtleError; use std::{ - collections::HashMap, fmt::{Debug, Display}, io::{BufRead, Write}, path::Path, + collections::HashMap, + fmt::{Debug, Display}, + io::{BufRead, Write}, + path::Path, }; use crate::{ @@ -10,31 +13,17 @@ use crate::{ log::Logger, model::TripleMask, rdf_types::*, - rules::Config, + rules::{eval_predicate_rule, eval_type_rule_object, eval_type_rule_subject, eval_subject_predicate_rule, Config}, }; fn mask_triple(triple: Triple, rules: &Config, type_map: &HashMap) -> TripleMask { // Check each field of the triple against the rules - // rules.replace_uri_of_nodes_with_type has to to with OBJECT and SUBJECT - // rules.replace_value_of_predicate has to do with OBJECT - // rules.replace_value_of_object has to do with OBJECT - // We will start by checking the subject and then the object - match triple.subject { - Subject::NamedNode(n) => { - // First check if the iri is in any of the types - let type_check = type_map.contains_key(&n.iri); - // If the iri is in the type map, check if it is in the rules - if type_check { - let iri_type = type_map.get(&n.iri).unwrap(); - let config_check = rules.replace_uri_of_nodes_with_type.contains(iri_type); - if config_check { - return TripleMask::SUBJECT; - } - } - } - Subject::BlankNode(_) => {} - } - return TripleMask::from_bits_truncate(0b0) + let mut mask = TripleMask::new(); + mask = eval_type_rule_subject(&triple.subject, mask, type_map, rules); + mask = eval_type_rule_object(&triple.object, mask, type_map, rules); + mask = eval_predicate_rule(&triple.predicate, mask, rules); + mask = eval_subject_predicate_rule(&triple.subject, &triple.predicate, mask, type_map, rules); + return mask; } // mask and encode input triple @@ -46,7 +35,6 @@ fn process_triple( out: &mut impl Write, ) -> Result<(), TurtleError> { let mask = mask_triple(triple.clone(), &rules_config, &node_to_type); - println!("Mask: {:?}", mask.bits()); let hasher = DefaultHasher::new(); let _ = out.write(&format!("{} .\n", hasher.pseudo_triple(&triple, mask).to_string()).into_bytes()); @@ -61,7 +49,10 @@ fn load_type_map(input: impl BufRead) -> HashMap { while !triples.is_end() { let _: Result<(), TurtleError> = triples.parse_step(&mut |t| { - node_to_type.insert(t.subject.to_string().replace(&['<','>'][..], ""), t.object.to_string().replace(&['<','>'][..], "")); + node_to_type.insert( + t.subject.to_string().replace(&['<', '>'][..], ""), + t.object.to_string().replace(&['<', '>'][..], ""), + ); Ok(()) }); } diff --git a/src/rdf_types.rs b/src/rdf_types.rs index 38f653f..093a7ea 100644 --- a/src/rdf_types.rs +++ b/src/rdf_types.rs @@ -1,4 +1,4 @@ -use super::model::Entity; +use super::model::{Entity, TripleMask}; use rio_api; use std::{fmt, fmt::Write, ops::Sub}; @@ -98,6 +98,12 @@ impl fmt::Display for Triple { } } +impl TripleMask { + pub fn new() -> Self { + return TripleMask::from_bits_truncate(0b0); + } +} + impl<'a> From> for Triple { fn from(t: TripleView<'a>) -> Self { match t { diff --git a/src/rules.rs b/src/rules.rs index c00452e..d2bfbe3 100644 --- a/src/rules.rs +++ b/src/rules.rs @@ -1,6 +1,9 @@ +use crate::rdf_types::*; use ::std::collections::{HashMap, HashSet}; use serde::{Deserialize, Serialize}; +use crate::model::TripleMask; + #[derive(Serialize, Deserialize, Debug)] pub struct Config { // Replace values of nodes with a certain type. @@ -12,3 +15,291 @@ pub struct Config { // Replace values in matched `predicates`. pub replace_value_of_predicate: HashSet, } + +pub fn eval_type_rule_named_node( + is_subject: bool, + n: NamedNode, + mut mask: TripleMask, + rules: &Config, + type_map: &HashMap, +) -> TripleMask { + // First check if the iri is in any of the types in the type map + let iri_type = if type_map.contains_key(&n.iri) { + type_map.get(&n.iri).unwrap() + } else { + return mask; + }; + // If the iri is in the type map, check if it is in the rules + if rules.replace_uri_of_nodes_with_type.contains(iri_type) { + if is_subject { + mask |= TripleMask::SUBJECT; + return mask; + } else { + mask |= TripleMask::OBJECT; + return mask; + }; + } else { + return mask; + } +} + +pub fn eval_type_rule_subject( + subject: &Subject, + mut mask: TripleMask, + type_map: &HashMap, + rules: &Config, +) -> TripleMask { + match subject { + Subject::NamedNode(n) => { + mask = eval_type_rule_named_node(true, n.clone(), mask, rules, type_map); + return mask; + } + Subject::BlankNode(_) => return mask, + } +} + +pub fn eval_type_rule_object( + object: &Term, + mut mask: TripleMask, + type_map: &HashMap, + rules: &Config, +) -> TripleMask { + match object { + Term::NamedNode(n) => { + mask = eval_type_rule_named_node(false, n.clone(), mask, rules, type_map); + return mask; + } + _ => return mask, + } +} + +pub fn eval_predicate_rule( + predicate: &NamedNode, + mut mask: TripleMask, + rules: &Config, +) -> TripleMask { + match predicate { + NamedNode { iri: n } => { + // check if rule contains iri is in replace_value_of_predicate + if rules.replace_value_of_predicate.contains(n) { + mask |= TripleMask::OBJECT; + return mask; + } else { + return mask; + } + } + } +} + +pub fn eval_subject_predicate_rule( + subject: &Subject, + predicate: &NamedNode, + mut mask: TripleMask, + type_map: &HashMap, + rules: &Config, +) -> TripleMask { + match subject { + Subject::NamedNode(n) => { + // check if rule contains iri is in replace_value_of_subject_predicate + let subject_type = if type_map.contains_key(&n.iri) { + type_map.get(&n.iri).unwrap() + } else { + return mask; + }; + if rules.replace_values_of_subject_predicate.contains_key(subject_type) { + let is_in_config = rules.replace_values_of_subject_predicate[subject_type] + .contains(&predicate.iri); + if is_in_config { + mask |= TripleMask::OBJECT; + return mask; + } else { + return mask; + } + } else { + return mask; + } + } + Subject::BlankNode(_) => return mask, + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn set_type_rule(t: &str) -> Config { + let mut rules = Config { + replace_uri_of_nodes_with_type: HashSet::new(), + replace_values_of_subject_predicate: HashMap::new(), + replace_value_of_predicate: HashSet::new(), + }; + rules.replace_uri_of_nodes_with_type.insert(t.to_string()); + return rules; + } + + fn set_predicate_rule(p: &str) -> Config { + let mut rules = Config { + replace_uri_of_nodes_with_type: HashSet::new(), + replace_values_of_subject_predicate: HashMap::new(), + replace_value_of_predicate: HashSet::new(), + }; + rules.replace_value_of_predicate.insert(p.to_string()); + return rules; + } + + fn set_subject_predicate_rule(s: &str, p: &str) -> Config { + let mut rules = Config { + replace_uri_of_nodes_with_type: HashSet::new(), + replace_values_of_subject_predicate: HashMap::new(), + replace_value_of_predicate: HashSet::new(), + }; + let mut set = HashSet::new(); + set.insert(p.to_string()); + rules.replace_values_of_subject_predicate.insert(s.to_string(), set); + return rules; + } + + #[test] + // Test the type rule for a subject that is in the rules & type map + fn is_subject_set() { + let subject = Subject::NamedNode(NamedNode { + iri: "http://example.org/Alice".to_string(), + }); + let rules = set_type_rule("Person"); + let mut type_map = HashMap::new(); + type_map.insert("http://example.org/Alice".to_string(), "Person".to_string()); + let mut mask = TripleMask::new(); + mask = eval_type_rule_subject(&subject, mask, &type_map, &rules); + assert!(mask.is_set(&TripleMask::SUBJECT)); + assert!(!mask.is_set(&TripleMask::OBJECT)); + } + #[test] + // Test the type rule for a subject that is not in the rules but in the type map + fn subject_not_in_rules() { + let subject = Subject::NamedNode(NamedNode { + iri: "http://example.org/Alice".to_string(), + }); + let rules = set_type_rule("Bank"); + let mut type_map = HashMap::new(); + type_map.insert("http://example.org/Alice".to_string(), "Person".to_string()); + let mut mask = TripleMask::new(); + mask = eval_type_rule_subject(&subject, mask, &type_map, &rules); + assert!(!mask.is_set(&TripleMask::SUBJECT)); + assert!(!mask.is_set(&TripleMask::OBJECT)); + } + #[test] + // Test the type rule for a subject neither in the rules nor in the type map + fn subject_not_in_types() { + let subject = Subject::NamedNode(NamedNode { + iri: "http://example.org/Alice".to_string(), + }); + let rules = set_type_rule("Bank"); + let mut type_map = HashMap::new(); + type_map.insert("http://example.org/Bank".to_string(), "Bank".to_string()); + let mut mask = TripleMask::new(); + mask = eval_type_rule_subject(&subject, mask, &type_map, &rules); + assert!(!mask.is_set(&TripleMask::SUBJECT)); + assert!(!mask.is_set(&TripleMask::OBJECT)); + } + #[test] + // Test the type rule for an object that is in the rules & type map + fn is_object_set() { + let object = Term::NamedNode(NamedNode { + iri: "http://example.org/Alice".to_string(), + }); + let rules = set_type_rule("Person"); + let mut type_map = HashMap::new(); + type_map.insert("http://example.org/Alice".to_string(), "Person".to_string()); + let mut mask = TripleMask::new(); + mask = eval_type_rule_object(&object, mask, &type_map, &rules); + assert!(mask.is_set(&TripleMask::OBJECT)); + assert!(!mask.is_set(&TripleMask::SUBJECT)); + } + #[test] + fn predicate_in_config() { + let predicate = NamedNode { + iri: "http://example.org/hasName".to_string(), + }; + let rules = set_predicate_rule("http://example.org/hasName"); + let mut mask = TripleMask::new(); + mask = eval_predicate_rule(&predicate, mask, &rules); + assert!(mask.is_set(&TripleMask::OBJECT)); + assert!(!mask.is_set(&TripleMask::SUBJECT)); + } + #[test] + fn predicate_not_in_config() { + let predicate = NamedNode { + iri: "http://example.org/hasName".to_string(), + }; + let rules = set_predicate_rule("http://example.org/hasAge"); + let mut mask = TripleMask::new(); + mask = eval_predicate_rule(&predicate, mask, &rules); + assert!(!mask.is_set(&TripleMask::OBJECT)); + assert!(!mask.is_set(&TripleMask::SUBJECT)); + } + #[test] + fn subject_predicate_in_config() { + let subject = Subject::NamedNode(NamedNode { + iri: "http://example.org/Alice".to_string(), + }); + let predicate = NamedNode { + iri: "http://example.org/hasName".to_string(), + }; + let rules = set_subject_predicate_rule("http://example.org/Person", "http://example.org/hasName"); + let mut mask = TripleMask::new(); + let mut type_map = HashMap::new(); + type_map.insert("http://example.org/Alice".to_string(), "http://example.org/Person".to_string()); + mask = eval_subject_predicate_rule(&subject, &predicate, mask, &type_map, &rules); + assert!(mask.is_set(&TripleMask::OBJECT)); + assert!(!mask.is_set(&TripleMask::SUBJECT)); + } + #[test] + fn subject_in_config_predicate_not_() { + let subject = Subject::NamedNode(NamedNode { + iri: "http://example.org/Alice".to_string(), + }); + let predicate = NamedNode { + iri: "http://example.org/hasName".to_string(), + }; + let rules = set_subject_predicate_rule("http://example.org/Alice", "http://example.org/hasAge"); + let mut mask = TripleMask::new(); + let mut type_map = HashMap::new(); + type_map.insert("http://example.org/Alice".to_string(), "http://example.org/Person".to_string()); + mask = eval_subject_predicate_rule(&subject, &predicate, mask, &type_map, &rules); + assert!(!mask.is_set(&TripleMask::OBJECT)); + assert!(!mask.is_set(&TripleMask::SUBJECT)); + } + #[test] + fn subject_predicate_not_in_config() { + let subject = Subject::NamedNode(NamedNode { + iri: "http://example.org/Alice".to_string(), + }); + let predicate = NamedNode { + iri: "http://example.org/hasName".to_string(), + }; + let rules = set_subject_predicate_rule("http://example.org/Bob", "http://example.org/hasAge"); + let mut mask = TripleMask::new(); + let mut type_map = HashMap::new(); + type_map.insert("http://example.org/Alice".to_string(), "http://example.org/Person".to_string()); + mask = eval_subject_predicate_rule(&subject, &predicate, mask, &type_map, &rules); + assert!(!mask.is_set(&TripleMask::OBJECT)); + assert!(!mask.is_set(&TripleMask::SUBJECT)); + } + #[test] + // Rule subject predicate where subject is not in type list + fn subject_predicate_not_in_types() { + let subject = Subject::NamedNode(NamedNode { + iri: "http://example.org/Alice".to_string(), + }); + let predicate = NamedNode { + iri: "http://example.org/hasName".to_string(), + }; + let rules = set_subject_predicate_rule("http://example.org/Bob", "http://example.org/hasAge"); + let mut mask = TripleMask::new(); + let mut type_map = HashMap::new(); + type_map.insert("http://example.org/Bob".to_string(), "http://example.org/Person".to_string()); + mask = eval_subject_predicate_rule(&subject, &predicate, mask, &type_map, &rules); + assert!(!mask.is_set(&TripleMask::OBJECT)); + assert!(!mask.is_set(&TripleMask::SUBJECT)); + } +} From f731093e96d374a4677f61879fd5dd0e88709aaf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gabriel=20N=C3=BCtzi?= Date: Tue, 9 Jul 2024 10:52:36 +0200 Subject: [PATCH 10/12] fix: review changes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove nesting and group types and impls together Signed-off-by: Gabriel Nützi --- src/model.rs | 9 ++- src/pass_second.rs | 24 ++++--- src/rdf_types.rs | 113 ++++++++++++++----------------- src/rules.rs | 163 ++++++++++++++++++++++++--------------------- 4 files changed, 162 insertions(+), 147 deletions(-) diff --git a/src/model.rs b/src/model.rs index 3e6049e..8b164fb 100644 --- a/src/model.rs +++ b/src/model.rs @@ -12,7 +12,7 @@ pub enum Entity { // Used to select any combination of fields in a triple bitflags::bitflags! { - #[derive(Debug, Copy, Clone)] + #[derive(Debug, Copy, Clone, Default)] pub struct TripleMask: u8 { const SUBJECT = 1 << 2 ; const PREDICATE = 1 << 1; @@ -31,6 +31,13 @@ impl TripleMask { mod tests { use super::*; + #[test] + // Test for default constructor. + fn test_default() { + let mask = TripleMask::default(); + assert!(mask.is_empty()); + } + #[test] // Test the parsing of a triple. fn is_subject_set() { diff --git a/src/pass_second.rs b/src/pass_second.rs index ead03ff..b5e0d0e 100644 --- a/src/pass_second.rs +++ b/src/pass_second.rs @@ -13,16 +13,20 @@ use crate::{ log::Logger, model::TripleMask, rdf_types::*, - rules::{eval_predicate_rule, eval_type_rule_object, eval_type_rule_subject, eval_subject_predicate_rule, Config}, + rules::{ + match_predicate_rule, match_subject_predicate_rule, match_type_rule_object, + match_type_rule_subject, Config, + }, }; fn mask_triple(triple: Triple, rules: &Config, type_map: &HashMap) -> TripleMask { // Check each field of the triple against the rules - let mut mask = TripleMask::new(); - mask = eval_type_rule_subject(&triple.subject, mask, type_map, rules); - mask = eval_type_rule_object(&triple.object, mask, type_map, rules); - mask = eval_predicate_rule(&triple.predicate, mask, rules); - mask = eval_subject_predicate_rule(&triple.subject, &triple.predicate, mask, type_map, rules); + let mut mask = TripleMask::default(); + mask = match_type_rule_subject(&triple.subject, mask, type_map, rules); + mask = match_type_rule_object(&triple.object, mask, type_map, rules); + mask = match_predicate_rule(&triple.predicate, mask, rules); + mask = match_subject_predicate_rule(&triple.subject, &triple.predicate, mask, type_map, rules); + return mask; } @@ -64,10 +68,12 @@ pub fn pseudonymize_graph(log: &Logger, input: &Path, config: &Path, output: &Pa let buf_input = io::get_reader(input); let buf_index = io::get_reader(index); let mut buf_output = io::get_writer(output); - let rules_config = io::parse_config(config); + let rules_config = io::parse_config(config); let node_to_type: HashMap = load_type_map(buf_index); + let mut triples = io::parse_ntriples(buf_input); + while !triples.is_end() { triples .parse_step(&mut |t| { @@ -86,12 +92,14 @@ mod tests { #[test] // Test the parsing of a triple. fn encrypt_nt_file() { + let logger = log::create_logger(true); + let dir = tempdir().unwrap(); let input_path = Path::new("tests/data/test.nt"); let config_path = Path::new("tests/data/config.yaml"); let output_path = dir.path().join("output.nt"); let type_map_path = Path::new("tests/data/type_map.nt"); - let logger = log::create_logger(true); + pseudonymize_graph( &logger, &input_path, diff --git a/src/rdf_types.rs b/src/rdf_types.rs index 093a7ea..1f659a6 100644 --- a/src/rdf_types.rs +++ b/src/rdf_types.rs @@ -6,6 +6,26 @@ use std::{fmt, fmt::Write, ops::Sub}; // Rename rio types as XXXView to distinguish them from our types // Use rio types for parsing and serializing // Define mappers between the two types +// +type NamedNodeView<'a> = rio_api::model::NamedNode<'a>; +type LiteralView<'a> = rio_api::model::Literal<'a>; +type TermView<'a> = rio_api::model::Term<'a>; +type TripleView<'a> = rio_api::model::Triple<'a>; +type BlankNodeView<'a> = rio_api::model::BlankNode<'a>; +type SubjectView<'a> = rio_api::model::Subject<'a>; + +#[derive(Eq, PartialEq, Debug, Clone, Hash)] +pub struct Triple { + pub subject: Subject, + pub predicate: NamedNode, + pub object: Term, +} + +#[derive(Eq, PartialEq, Debug, Clone, Hash)] +pub enum Subject { + NamedNode(NamedNode), + BlankNode(BlankNode), +} #[derive(Eq, PartialEq, Ord, PartialOrd, Debug, Clone, Hash)] pub struct NamedNode { @@ -13,14 +33,18 @@ pub struct NamedNode { pub iri: String, } -impl fmt::Display for NamedNode { - #[inline] - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "<{}>", self.iri) - } +#[derive(Eq, PartialEq, Debug, Clone, Hash)] +pub enum Term { + NamedNode(NamedNode), + BlankNode(BlankNode), + Literal(Literal), } -type NamedNodeView<'a> = rio_api::model::NamedNode<'a>; +#[derive(Eq, PartialEq, Debug, Clone, Hash)] +pub struct BlankNode { + /// The [blank node identifier](https://www.w3.org/TR/rdf11-concepts/#dfn-blank-node-identifier). + pub id: String, +} #[derive(Eq, PartialEq, Debug, Clone, Hash)] pub enum Literal { @@ -45,6 +69,13 @@ pub enum Literal { }, } +impl fmt::Display for NamedNode { + #[inline] + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "<{}>", self.iri) + } +} + impl fmt::Display for Literal { #[inline] fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { @@ -62,15 +93,6 @@ impl fmt::Display for Literal { } } -type LiteralView<'a> = rio_api::model::Literal<'a>; - -#[derive(Eq, PartialEq, Debug, Clone, Hash)] -pub enum Term { - NamedNode(NamedNode), - BlankNode(BlankNode), - Literal(Literal), -} - impl fmt::Display for Term { #[inline] fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { @@ -82,15 +104,6 @@ impl fmt::Display for Term { } } -type TermView<'a> = rio_api::model::Term<'a>; - -#[derive(Eq, PartialEq, Debug, Clone, Hash)] -pub struct Triple { - pub subject: Subject, - pub predicate: NamedNode, - pub object: Term, -} - impl fmt::Display for Triple { #[inline] fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { @@ -98,9 +111,20 @@ impl fmt::Display for Triple { } } -impl TripleMask { - pub fn new() -> Self { - return TripleMask::from_bits_truncate(0b0); +impl fmt::Display for Subject { + #[inline] + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Subject::NamedNode(node) => node.fmt(f), + Subject::BlankNode(node) => node.fmt(f), + } + } +} + +impl fmt::Display for BlankNode { + #[inline] + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "_:{}", self.id) } } @@ -120,24 +144,6 @@ impl<'a> From> for Triple { } } -type TripleView<'a> = rio_api::model::Triple<'a>; - -#[derive(Eq, PartialEq, Debug, Clone, Hash)] -pub enum Subject { - NamedNode(NamedNode), - BlankNode(BlankNode), -} - -impl fmt::Display for Subject { - #[inline] - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - Subject::NamedNode(node) => node.fmt(f), - Subject::BlankNode(node) => node.fmt(f), - } - } -} - impl<'a> From> for Subject { #[inline] fn from(resource: SubjectView) -> Self { @@ -179,23 +185,6 @@ impl<'a> From> for BlankNode { } } -type SubjectView<'a> = rio_api::model::Subject<'a>; - -#[derive(Eq, PartialEq, Debug, Clone, Hash)] -pub struct BlankNode { - /// The [blank node identifier](https://www.w3.org/TR/rdf11-concepts/#dfn-blank-node-identifier). - pub id: String, -} - -impl fmt::Display for BlankNode { - #[inline] - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "_:{}", self.id) - } -} - -type BlankNodeView<'a> = rio_api::model::BlankNode<'a>; - impl<'a> From> for Literal { fn from(l: LiteralView<'a>) -> Self { match l { diff --git a/src/rules.rs b/src/rules.rs index d2bfbe3..d3d45d1 100644 --- a/src/rules.rs +++ b/src/rules.rs @@ -16,74 +16,65 @@ pub struct Config { pub replace_value_of_predicate: HashSet, } -pub fn eval_type_rule_named_node( +pub fn match_type_rule_named_node( is_subject: bool, - n: NamedNode, - mut mask: TripleMask, + n: &NamedNode, + mask: TripleMask, rules: &Config, type_map: &HashMap, ) -> TripleMask { - // First check if the iri is in any of the types in the type map - let iri_type = if type_map.contains_key(&n.iri) { - type_map.get(&n.iri).unwrap() + let iri_type = if let Some(v) = type_map.get(&n.iri) { + v } else { + // Not in the type map. return mask; }; - // If the iri is in the type map, check if it is in the rules - if rules.replace_uri_of_nodes_with_type.contains(iri_type) { - if is_subject { - mask |= TripleMask::SUBJECT; - return mask; - } else { - mask |= TripleMask::OBJECT; - return mask; - }; - } else { + + if !rules.replace_uri_of_nodes_with_type.contains(iri_type) { + // Not in the rules. return mask; } + + return if is_subject { + mask | TripleMask::SUBJECT + } else { + mask | TripleMask::OBJECT + }; } -pub fn eval_type_rule_subject( +pub fn match_type_rule_subject( subject: &Subject, - mut mask: TripleMask, + mask: TripleMask, type_map: &HashMap, rules: &Config, ) -> TripleMask { match subject { Subject::NamedNode(n) => { - mask = eval_type_rule_named_node(true, n.clone(), mask, rules, type_map); - return mask; + return mask | match_type_rule_named_node(true, &n, mask, rules, type_map); } Subject::BlankNode(_) => return mask, } } -pub fn eval_type_rule_object( +pub fn match_type_rule_object( object: &Term, - mut mask: TripleMask, + mask: TripleMask, type_map: &HashMap, rules: &Config, ) -> TripleMask { match object { Term::NamedNode(n) => { - mask = eval_type_rule_named_node(false, n.clone(), mask, rules, type_map); - return mask; + return mask | match_type_rule_named_node(false, &n, mask, rules, type_map); } _ => return mask, } } -pub fn eval_predicate_rule( - predicate: &NamedNode, - mut mask: TripleMask, - rules: &Config, -) -> TripleMask { +pub fn match_predicate_rule(predicate: &NamedNode, mask: TripleMask, rules: &Config) -> TripleMask { match predicate { NamedNode { iri: n } => { - // check if rule contains iri is in replace_value_of_predicate if rules.replace_value_of_predicate.contains(n) { - mask |= TripleMask::OBJECT; - return mask; + return mask | TripleMask::OBJECT; } else { return mask; } @@ -91,33 +82,29 @@ pub fn eval_predicate_rule( } } -pub fn eval_subject_predicate_rule( +pub fn match_subject_predicate_rule( subject: &Subject, predicate: &NamedNode, - mut mask: TripleMask, + mask: TripleMask, type_map: &HashMap, rules: &Config, ) -> TripleMask { match subject { Subject::NamedNode(n) => { - // check if rule contains iri is in replace_value_of_subject_predicate - let subject_type = if type_map.contains_key(&n.iri) { - type_map.get(&n.iri).unwrap() + let subject_type = if let Some(v) = type_map.get(&n.iri) { + v } else { + // Not in the type map. return mask; }; - if rules.replace_values_of_subject_predicate.contains_key(subject_type) { - let is_in_config = rules.replace_values_of_subject_predicate[subject_type] - .contains(&predicate.iri); - if is_in_config { - mask |= TripleMask::OBJECT; - return mask; - } else { - return mask; - } - } else { + + let preds = rules.replace_values_of_subject_predicate.get(subject_type); + if preds.is_none() || !preds.unwrap().contains(&predicate.iri) { + // Not in the rules. return mask; } + + return mask | TripleMask::OBJECT; } Subject::BlankNode(_) => return mask, } @@ -155,7 +142,9 @@ mod tests { }; let mut set = HashSet::new(); set.insert(p.to_string()); - rules.replace_values_of_subject_predicate.insert(s.to_string(), set); + rules + .replace_values_of_subject_predicate + .insert(s.to_string(), set); return rules; } @@ -168,8 +157,8 @@ mod tests { let rules = set_type_rule("Person"); let mut type_map = HashMap::new(); type_map.insert("http://example.org/Alice".to_string(), "Person".to_string()); - let mut mask = TripleMask::new(); - mask = eval_type_rule_subject(&subject, mask, &type_map, &rules); + let mut mask = TripleMask::default(); + mask = match_type_rule_subject(&subject, mask, &type_map, &rules); assert!(mask.is_set(&TripleMask::SUBJECT)); assert!(!mask.is_set(&TripleMask::OBJECT)); } @@ -182,11 +171,12 @@ mod tests { let rules = set_type_rule("Bank"); let mut type_map = HashMap::new(); type_map.insert("http://example.org/Alice".to_string(), "Person".to_string()); - let mut mask = TripleMask::new(); - mask = eval_type_rule_subject(&subject, mask, &type_map, &rules); + let mut mask = TripleMask::default(); + mask = match_type_rule_subject(&subject, mask, &type_map, &rules); assert!(!mask.is_set(&TripleMask::SUBJECT)); assert!(!mask.is_set(&TripleMask::OBJECT)); } + #[test] // Test the type rule for a subject neither in the rules nor in the type map fn subject_not_in_types() { @@ -196,8 +186,8 @@ mod tests { let rules = set_type_rule("Bank"); let mut type_map = HashMap::new(); type_map.insert("http://example.org/Bank".to_string(), "Bank".to_string()); - let mut mask = TripleMask::new(); - mask = eval_type_rule_subject(&subject, mask, &type_map, &rules); + let mut mask = TripleMask::default(); + mask = match_type_rule_subject(&subject, mask, &type_map, &rules); assert!(!mask.is_set(&TripleMask::SUBJECT)); assert!(!mask.is_set(&TripleMask::OBJECT)); } @@ -210,33 +200,36 @@ mod tests { let rules = set_type_rule("Person"); let mut type_map = HashMap::new(); type_map.insert("http://example.org/Alice".to_string(), "Person".to_string()); - let mut mask = TripleMask::new(); - mask = eval_type_rule_object(&object, mask, &type_map, &rules); + let mut mask = TripleMask::default(); + mask = match_type_rule_object(&object, mask, &type_map, &rules); assert!(mask.is_set(&TripleMask::OBJECT)); assert!(!mask.is_set(&TripleMask::SUBJECT)); } + #[test] fn predicate_in_config() { let predicate = NamedNode { iri: "http://example.org/hasName".to_string(), }; let rules = set_predicate_rule("http://example.org/hasName"); - let mut mask = TripleMask::new(); - mask = eval_predicate_rule(&predicate, mask, &rules); + let mut mask = TripleMask::default(); + mask = match_predicate_rule(&predicate, mask, &rules); assert!(mask.is_set(&TripleMask::OBJECT)); assert!(!mask.is_set(&TripleMask::SUBJECT)); } + #[test] fn predicate_not_in_config() { let predicate = NamedNode { iri: "http://example.org/hasName".to_string(), }; let rules = set_predicate_rule("http://example.org/hasAge"); - let mut mask = TripleMask::new(); - mask = eval_predicate_rule(&predicate, mask, &rules); + let mut mask = TripleMask::default(); + mask = match_predicate_rule(&predicate, mask, &rules); assert!(!mask.is_set(&TripleMask::OBJECT)); assert!(!mask.is_set(&TripleMask::SUBJECT)); } + #[test] fn subject_predicate_in_config() { let subject = Subject::NamedNode(NamedNode { @@ -245,11 +238,15 @@ mod tests { let predicate = NamedNode { iri: "http://example.org/hasName".to_string(), }; - let rules = set_subject_predicate_rule("http://example.org/Person", "http://example.org/hasName"); - let mut mask = TripleMask::new(); + let rules = + set_subject_predicate_rule("http://example.org/Person", "http://example.org/hasName"); + let mut mask = TripleMask::default(); let mut type_map = HashMap::new(); - type_map.insert("http://example.org/Alice".to_string(), "http://example.org/Person".to_string()); - mask = eval_subject_predicate_rule(&subject, &predicate, mask, &type_map, &rules); + type_map.insert( + "http://example.org/Alice".to_string(), + "http://example.org/Person".to_string(), + ); + mask = match_subject_predicate_rule(&subject, &predicate, mask, &type_map, &rules); assert!(mask.is_set(&TripleMask::OBJECT)); assert!(!mask.is_set(&TripleMask::SUBJECT)); } @@ -261,14 +258,19 @@ mod tests { let predicate = NamedNode { iri: "http://example.org/hasName".to_string(), }; - let rules = set_subject_predicate_rule("http://example.org/Alice", "http://example.org/hasAge"); - let mut mask = TripleMask::new(); + let rules = + set_subject_predicate_rule("http://example.org/Alice", "http://example.org/hasAge"); + let mut mask = TripleMask::default(); let mut type_map = HashMap::new(); - type_map.insert("http://example.org/Alice".to_string(), "http://example.org/Person".to_string()); - mask = eval_subject_predicate_rule(&subject, &predicate, mask, &type_map, &rules); + type_map.insert( + "http://example.org/Alice".to_string(), + "http://example.org/Person".to_string(), + ); + mask = match_subject_predicate_rule(&subject, &predicate, mask, &type_map, &rules); assert!(!mask.is_set(&TripleMask::OBJECT)); assert!(!mask.is_set(&TripleMask::SUBJECT)); } + #[test] fn subject_predicate_not_in_config() { let subject = Subject::NamedNode(NamedNode { @@ -277,14 +279,19 @@ mod tests { let predicate = NamedNode { iri: "http://example.org/hasName".to_string(), }; - let rules = set_subject_predicate_rule("http://example.org/Bob", "http://example.org/hasAge"); - let mut mask = TripleMask::new(); + let rules = + set_subject_predicate_rule("http://example.org/Bob", "http://example.org/hasAge"); + let mut mask = TripleMask::default(); let mut type_map = HashMap::new(); - type_map.insert("http://example.org/Alice".to_string(), "http://example.org/Person".to_string()); - mask = eval_subject_predicate_rule(&subject, &predicate, mask, &type_map, &rules); + type_map.insert( + "http://example.org/Alice".to_string(), + "http://example.org/Person".to_string(), + ); + mask = match_subject_predicate_rule(&subject, &predicate, mask, &type_map, &rules); assert!(!mask.is_set(&TripleMask::OBJECT)); assert!(!mask.is_set(&TripleMask::SUBJECT)); } + #[test] // Rule subject predicate where subject is not in type list fn subject_predicate_not_in_types() { @@ -294,11 +301,15 @@ mod tests { let predicate = NamedNode { iri: "http://example.org/hasName".to_string(), }; - let rules = set_subject_predicate_rule("http://example.org/Bob", "http://example.org/hasAge"); - let mut mask = TripleMask::new(); + let rules = + set_subject_predicate_rule("http://example.org/Bob", "http://example.org/hasAge"); + let mut mask = TripleMask::default(); let mut type_map = HashMap::new(); - type_map.insert("http://example.org/Bob".to_string(), "http://example.org/Person".to_string()); - mask = eval_subject_predicate_rule(&subject, &predicate, mask, &type_map, &rules); + type_map.insert( + "http://example.org/Bob".to_string(), + "http://example.org/Person".to_string(), + ); + mask = match_subject_predicate_rule(&subject, &predicate, mask, &type_map, &rules); assert!(!mask.is_set(&TripleMask::OBJECT)); assert!(!mask.is_set(&TripleMask::SUBJECT)); } From aa8e6dfa31fc7e119e1650bdc707c80ab2deb445 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gabriel=20N=C3=BCtzi?= Date: Wed, 10 Jul 2024 10:37:22 +0200 Subject: [PATCH 11/12] fix: review changes - added rstest for table driven tests. --- Cargo.lock | 253 ++++++++++++++++++++++++++++++++++++++++++ Cargo.toml | 1 + src/io.rs | 4 +- src/pass_second.rs | 13 ++- src/rdf_types.rs.orig | 240 +++++++++++++++++++++++++++++++++++++++ src/rules.rs | 70 ++++++++---- 6 files changed, 552 insertions(+), 29 deletions(-) create mode 100644 src/rdf_types.rs.orig diff --git a/Cargo.lock b/Cargo.lock index 04cda11..e927bb7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,15 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "aho-corasick" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" +dependencies = [ + "memchr", +] + [[package]] name = "anstream" version = "0.6.14" @@ -63,6 +72,12 @@ version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711" +[[package]] +name = "autocfg" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0" + [[package]] name = "bitflags" version = "2.5.0" @@ -224,6 +239,101 @@ version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a" +[[package]] +name = "futures" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "645c6916888f6cb6350d2550b80fb63e734897a8498abe35cfb732b6487804b0" +dependencies = [ + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-channel" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78" +dependencies = [ + "futures-core", + "futures-sink", +] + +[[package]] +name = "futures-core" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d" + +[[package]] +name = "futures-executor" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a576fc72ae164fca6b9db127eaa9a9dda0d61316034f33a0a0d4eda41f02b01d" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-io" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1" + +[[package]] +name = "futures-macro" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "futures-sink" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fb8e00e87438d937621c1c6269e53f536c14d3fbd6a042bb24879e57d474fb5" + +[[package]] +name = "futures-task" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38d84fa142264698cdce1a9f9172cf383a0c82de1bddcf3092901442c4097004" + +[[package]] +name = "futures-timer" +version = "3.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f288b0a4f20f9a56b5d1da57e2227c661b7b16168e2f72365f57b63326e29b24" + +[[package]] +name = "futures-util" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48" +dependencies = [ + "futures-channel", + "futures-core", + "futures-io", + "futures-macro", + "futures-sink", + "futures-task", + "memchr", + "pin-project-lite", + "pin-utils", + "slab", +] + [[package]] name = "getrandom" version = "0.2.15" @@ -235,6 +345,12 @@ dependencies = [ "wasi", ] +[[package]] +name = "glob" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" + [[package]] name = "hashbrown" version = "0.14.5" @@ -362,12 +478,33 @@ version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d05417ee46e2eb40dd9d590b4d67fc2408208b3a48a6b7f71d2bc1d7ce12a3e0" +[[package]] +name = "pin-project-lite" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bda66fc9667c18cb2758a2ac84d1167245054bcf85d5d1aaa6923f45801bdd02" + +[[package]] +name = "pin-utils" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + [[package]] name = "powerfmt" version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" +[[package]] +name = "proc-macro-crate" +version = "3.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d37c51ca738a55da99dc0c4a34860fd675453b8b36209178c2249bb13651284" +dependencies = [ + "toml_edit", +] + [[package]] name = "proc-macro2" version = "1.0.85" @@ -396,6 +533,7 @@ dependencies = [ "io-enum", "rio_api", "rio_turtle", + "rstest", "serde", "serde_yml", "slog", @@ -415,6 +553,41 @@ dependencies = [ "thiserror", ] +[[package]] +name = "regex" +version = "1.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b91213439dad192326a0d7c6ee3955910425f441d7038e0d6933b0aec5c4517f" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b" + +[[package]] +name = "relative-path" +version = "1.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba39f3699c378cd8970968dcbff9c43159ea4cfbd88d43c00b22f2ef10a435d2" + [[package]] name = "rio_api" version = "0.8.4" @@ -432,6 +605,45 @@ dependencies = [ "rio_api", ] +[[package]] +name = "rstest" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9afd55a67069d6e434a95161415f5beeada95a01c7b815508a82dcb0e1593682" +dependencies = [ + "futures", + "futures-timer", + "rstest_macros", + "rustc_version", +] + +[[package]] +name = "rstest_macros" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4165dfae59a39dd41d8dec720d3cbfbc71f69744efb480a3920f5d4e0cc6798d" +dependencies = [ + "cfg-if", + "glob", + "proc-macro-crate", + "proc-macro2", + "quote", + "regex", + "relative-path", + "rustc_version", + "syn", + "unicode-ident", +] + +[[package]] +name = "rustc_version" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" +dependencies = [ + "semver", +] + [[package]] name = "rustix" version = "0.38.34" @@ -457,6 +669,12 @@ version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" +[[package]] +name = "semver" +version = "1.0.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61697e0a1c7e512e84a621326239844a24d8207b4669b41bc18b32ea5cbf988b" + [[package]] name = "serde" version = "1.0.203" @@ -505,6 +723,15 @@ dependencies = [ "tempfile", ] +[[package]] +name = "slab" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f92a496fb766b417c996b9c5e57daf2f7ad3b0bebe1ccfca4856390e3d3bb67" +dependencies = [ + "autocfg", +] + [[package]] name = "slog" version = "2.7.0" @@ -643,6 +870,23 @@ dependencies = [ "time-core", ] +[[package]] +name = "toml_datetime" +version = "0.6.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4badfd56924ae69bcc9039335b2e017639ce3f9b001c393c1b2d1ef846ce2cbf" + +[[package]] +name = "toml_edit" +version = "0.21.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a8534fd7f78b5405e860340ad6575217ce99f38d4d5c8f2442cb5ecb50090e1" +dependencies = [ + "indexmap", + "toml_datetime", + "winnow", +] + [[package]] name = "unicode-ident" version = "1.0.12" @@ -755,3 +999,12 @@ name = "windows_x86_64_msvc" version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bec47e5bfd1bff0eeaf6d8b485cc1074891a197ab4225d504cb7a1ab88b02bf0" + +[[package]] +name = "winnow" +version = "0.5.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f593a95398737aeed53e489c785df13f3618e41dbcd6718c6addbf1395aa6876" +dependencies = [ + "memchr", +] diff --git a/Cargo.toml b/Cargo.toml index 8214711..f9bcffe 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,3 +21,4 @@ io-enum = "1.1.3" serde_yml = "0.0.10" tempfile = "3.10.1" blake3 = "1.5.1" +rstest = "0.21.0" diff --git a/src/io.rs b/src/io.rs index 3983da1..36e1cca 100644 --- a/src/io.rs +++ b/src/io.rs @@ -1,4 +1,4 @@ -use crate::rules::Config; +use crate::rules::Rules; use rio_turtle::NTriplesParser; use serde_yml; use std::{ @@ -45,7 +45,7 @@ pub fn parse_ntriples(reader: impl BufRead) -> NTriplesParser { } // Parse yaml configuration file. -pub fn parse_config(path: &Path) -> Config { +pub fn parse_config(path: &Path) -> Rules { return match File::open(&path) { Ok(file) => serde_yml::from_reader(file).expect("Error parsing config file."), Err(e) => panic!("Cannot open file '{:?}': '{}'.", path, e), diff --git a/src/pass_second.rs b/src/pass_second.rs index b5e0d0e..e538755 100644 --- a/src/pass_second.rs +++ b/src/pass_second.rs @@ -15,13 +15,14 @@ use crate::{ rdf_types::*, rules::{ match_predicate_rule, match_subject_predicate_rule, match_type_rule_object, - match_type_rule_subject, Config, + match_type_rule_subject, Rules, }, }; -fn mask_triple(triple: Triple, rules: &Config, type_map: &HashMap) -> TripleMask { +fn match_rules(triple: Triple, rules: &Rules, type_map: &HashMap) -> TripleMask { // Check each field of the triple against the rules let mut mask = TripleMask::default(); + mask = match_type_rule_subject(&triple.subject, mask, type_map, rules); mask = match_type_rule_object(&triple.object, mask, type_map, rules); mask = match_predicate_rule(&triple.predicate, mask, rules); @@ -34,11 +35,11 @@ fn mask_triple(triple: Triple, rules: &Config, type_map: &HashMap, out: &mut impl Write, ) -> Result<(), TurtleError> { - let mask = mask_triple(triple.clone(), &rules_config, &node_to_type); + let mask = match_rules(triple.clone(), &rules_config, &node_to_type); let hasher = DefaultHasher::new(); let _ = out.write(&format!("{} .\n", hasher.pseudo_triple(&triple, mask).to_string()).into_bytes()); @@ -74,6 +75,9 @@ pub fn pseudonymize_graph(log: &Logger, input: &Path, config: &Path, output: &Pa let mut triples = io::parse_ntriples(buf_input); + // TODO: Try to make this into an iterator loop to leverage rayons parallelization feature over + // iterators. + while !triples.is_end() { triples .parse_step(&mut |t| { @@ -82,6 +86,7 @@ pub fn pseudonymize_graph(log: &Logger, input: &Path, config: &Path, output: &Pa .unwrap(); } } + #[cfg(test)] mod tests { use super::pseudonymize_graph; diff --git a/src/rdf_types.rs.orig b/src/rdf_types.rs.orig new file mode 100644 index 0000000..c67f9ef --- /dev/null +++ b/src/rdf_types.rs.orig @@ -0,0 +1,240 @@ +<<<<<<< HEAD +use rio_api; + +#[derive(Eq, PartialEq, Ord, PartialOrd, Debug, Clone, Copy, Hash)] +pub struct NamedNode { + /// The [IRI](https://www.w3.org/TR/rdf11-concepts/#dfn-iri) itself. + pub iri: String, +} + +type NamedNodeView<'a> = rio_api::model::NamedNode<'a>; +||||||| parent of 83413f86f3 (refactor: types for rdf protect to rio types) +======= +use rio_api; +use std::{fmt, fmt::Write, ops::Sub}; + +// Rewrite all the rio types to be able to instanciate triples +// Rename rio types as XXXView to distinguish them from our types +// Use rio types for parsing and serializing +// Define mappers between the two types + +#[derive(Eq, PartialEq, Ord, PartialOrd, Debug, Clone, Hash)] +pub struct NamedNode { + /// The [IRI](https://www.w3.org/TR/rdf11-concepts/#dfn-iri) itself. + pub iri: String, +} + +impl fmt::Display for NamedNode { + #[inline] + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "<{}>", self.iri) + } +} + +type NamedNodeView<'a> = rio_api::model::NamedNode<'a>; + +#[derive(Eq, PartialEq, Debug, Clone, Hash)] +pub enum Literal { + /// A [simple literal](https://www.w3.org/TR/rdf11-concepts/#dfn-simple-literal) without datatype or language form. + Simple { + /// The [lexical form](https://www.w3.org/TR/rdf11-concepts/#dfn-lexical-form). + value: String, + }, + /// A [language-tagged string](https://www.w3.org/TR/rdf11-concepts/#dfn-language-tagged-string) + LanguageTaggedString { + /// The [lexical form](https://www.w3.org/TR/rdf11-concepts/#dfn-lexical-form). + value: String, + /// The [language tag](https://www.w3.org/TR/rdf11-concepts/#dfn-language-tag). + language: String, + }, + /// A literal with an explicit datatype + Typed { + /// The [lexical form](https://www.w3.org/TR/rdf11-concepts/#dfn-lexical-form). + value: String, + /// The [datatype IRI](https://www.w3.org/TR/rdf11-concepts/#dfn-datatype-iri). + datatype: NamedNode, + }, +} + +impl fmt::Display for Literal { + #[inline] + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Literal::Simple { value } => fmt_quoted_str(value, f), + Literal::LanguageTaggedString { value, language } => { + fmt_quoted_str(value, f)?; + write!(f, "@{}", language) + } + Literal::Typed { value, datatype } => { + fmt_quoted_str(value, f)?; + write!(f, "^^{}", datatype) + } + } + } +} + +type LiteralView<'a> = rio_api::model::Literal<'a>; + +#[derive(Eq, PartialEq, Debug, Clone, Hash)] +pub enum Term { + NamedNode(NamedNode), + BlankNode(BlankNode), + Literal(Literal), +} + +impl fmt::Display for Term { + #[inline] + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Term::NamedNode(node) => node.fmt(f), + Term::BlankNode(node) => node.fmt(f), + Term::Literal(literal) => literal.fmt(f), + } + } +} + +type TermView<'a> = rio_api::model::Term<'a>; + +#[derive(Eq, PartialEq, Debug, Clone, Hash)] +pub struct Triple { + pub subject: Subject, + pub predicate: NamedNode, + pub object: Term, +} + +impl fmt::Display for Triple { + #[inline] + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{} {} {}", self.subject, self.predicate, self.object) + } +} + +impl<'a> From> for Triple { + fn from(t: TripleView<'a>) -> Self { + match t { + TripleView { + subject, + predicate, + object, + } => Triple { + subject: subject.into(), + predicate: predicate.into(), + object: object.into(), + }, + } + } +} + +type TripleView<'a> = rio_api::model::Triple<'a>; + +#[derive(Eq, PartialEq, Debug, Clone, Hash)] +pub enum Subject { + NamedNode(NamedNode), + BlankNode(BlankNode), +} + +impl fmt::Display for Subject { + #[inline] + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Subject::NamedNode(node) => node.fmt(f), + Subject::BlankNode(node) => node.fmt(f), + } + } +} + +impl<'a> From> for Subject { + #[inline] + fn from(resource: SubjectView) -> Self { + match resource { + SubjectView::NamedNode(node) => Subject::NamedNode(node.into()), + SubjectView::BlankNode(node) => Subject::BlankNode(node.into()), + _ => panic!("Unexpected subject type"), + } + } +} + +impl<'a> From> for Term { + #[inline] + fn from(term: TermView<'a>) -> Self { + match term { + TermView::NamedNode(node) => Term::NamedNode(node.into()), + TermView::BlankNode(node) => Term::BlankNode(node.into()), + TermView::Literal(literal) => Term::Literal(literal.into()), + _ => panic!("Unexpected term type"), + } + } +} + +impl<'a> From> for NamedNode { + #[inline] + fn from(node: NamedNodeView<'a>) -> Self { + NamedNode { + iri: node.iri.to_string(), + } + } +} + +impl<'a> From> for BlankNode { + #[inline] + fn from(node: BlankNodeView<'a>) -> Self { + BlankNode { + id: node.id.to_string(), + } + } +} + +type SubjectView<'a> = rio_api::model::Subject<'a>; + +#[derive(Eq, PartialEq, Debug, Clone, Hash)] +pub struct BlankNode { + /// The [blank node identifier](https://www.w3.org/TR/rdf11-concepts/#dfn-blank-node-identifier). + pub id: String, +} + +impl fmt::Display for BlankNode { + #[inline] + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "_:{}", self.id) + } +} + +type BlankNodeView<'a> = rio_api::model::BlankNode<'a>; + +impl<'a> From> for Literal { + fn from(l: LiteralView<'a>) -> Self { + match l { + LiteralView::Simple { value } => Literal::Simple { + value: value.to_string(), + }, + LiteralView::LanguageTaggedString { value, language } => { + Literal::LanguageTaggedString { + value: value.to_string(), + language: language.to_string(), + } + } + LiteralView::Typed { value, datatype } => Literal::Typed { + value: value.to_string(), + datatype: NamedNode { + iri: datatype.iri.to_string(), + }, + }, + } + } +} + +#[inline] +fn fmt_quoted_str(string: &String, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_char('"')?; + for c in string.chars() { + match c { + '\n' => f.write_str("\\n"), + '\r' => f.write_str("\\r"), + '"' => f.write_str("\\\""), + '\\' => f.write_str("\\\\"), + c => f.write_char(c), + }?; + } + f.write_char('"') +} +>>>>>>> 83413f86f3 (refactor: types for rdf protect to rio types) diff --git a/src/rules.rs b/src/rules.rs index d3d45d1..134f0a6 100644 --- a/src/rules.rs +++ b/src/rules.rs @@ -4,8 +4,8 @@ use serde::{Deserialize, Serialize}; use crate::model::TripleMask; -#[derive(Serialize, Deserialize, Debug)] -pub struct Config { +#[derive(Serialize, Deserialize, Debug, Default)] +pub struct Rules { // Replace values of nodes with a certain type. pub replace_uri_of_nodes_with_type: HashSet, @@ -20,7 +20,7 @@ pub fn match_type_rule_named_node( is_subject: bool, n: &NamedNode, mask: TripleMask, - rules: &Config, + rules: &Rules, type_map: &HashMap, ) -> TripleMask { let iri_type = if let Some(v) = type_map.get(&n.iri) { @@ -46,7 +46,7 @@ pub fn match_type_rule_subject( subject: &Subject, mask: TripleMask, type_map: &HashMap, - rules: &Config, + rules: &Rules, ) -> TripleMask { match subject { Subject::NamedNode(n) => { @@ -60,7 +60,7 @@ pub fn match_type_rule_object( object: &Term, mask: TripleMask, type_map: &HashMap, - rules: &Config, + rules: &Rules, ) -> TripleMask { match object { Term::NamedNode(n) => { @@ -70,7 +70,7 @@ pub fn match_type_rule_object( } } -pub fn match_predicate_rule(predicate: &NamedNode, mask: TripleMask, rules: &Config) -> TripleMask { +pub fn match_predicate_rule(predicate: &NamedNode, mask: TripleMask, rules: &Rules) -> TripleMask { match predicate { NamedNode { iri: n } => { if rules.replace_value_of_predicate.contains(n) { @@ -87,7 +87,7 @@ pub fn match_subject_predicate_rule( predicate: &NamedNode, mask: TripleMask, type_map: &HashMap, - rules: &Config, + rules: &Rules, ) -> TripleMask { match subject { Subject::NamedNode(n) => { @@ -114,18 +114,15 @@ pub fn match_subject_predicate_rule( mod tests { use super::*; - fn set_type_rule(t: &str) -> Config { - let mut rules = Config { - replace_uri_of_nodes_with_type: HashSet::new(), - replace_values_of_subject_predicate: HashMap::new(), - replace_value_of_predicate: HashSet::new(), - }; + fn set_type_rule(t: &str) -> Rules { + let mut rules = Rules::default(); + rules.replace_uri_of_nodes_with_type.insert(t.to_string()); return rules; } - fn set_predicate_rule(p: &str) -> Config { - let mut rules = Config { + fn set_predicate_rule(p: &str) -> Rules { + let mut rules = Rules { replace_uri_of_nodes_with_type: HashSet::new(), replace_values_of_subject_predicate: HashMap::new(), replace_value_of_predicate: HashSet::new(), @@ -134,17 +131,20 @@ mod tests { return rules; } - fn set_subject_predicate_rule(s: &str, p: &str) -> Config { - let mut rules = Config { + fn set_subject_predicate_rule(s: &str, p: &str) -> Rules { + let mut rules = Rules { replace_uri_of_nodes_with_type: HashSet::new(), replace_values_of_subject_predicate: HashMap::new(), replace_value_of_predicate: HashSet::new(), }; + let mut set = HashSet::new(); set.insert(p.to_string()); + rules .replace_values_of_subject_predicate .insert(s.to_string(), set); + return rules; } @@ -172,7 +172,9 @@ mod tests { let mut type_map = HashMap::new(); type_map.insert("http://example.org/Alice".to_string(), "Person".to_string()); let mut mask = TripleMask::default(); + mask = match_type_rule_subject(&subject, mask, &type_map, &rules); + assert!(!mask.is_set(&TripleMask::SUBJECT)); assert!(!mask.is_set(&TripleMask::OBJECT)); } @@ -187,7 +189,9 @@ mod tests { let mut type_map = HashMap::new(); type_map.insert("http://example.org/Bank".to_string(), "Bank".to_string()); let mut mask = TripleMask::default(); + mask = match_type_rule_subject(&subject, mask, &type_map, &rules); + assert!(!mask.is_set(&TripleMask::SUBJECT)); assert!(!mask.is_set(&TripleMask::OBJECT)); } @@ -201,7 +205,9 @@ mod tests { let mut type_map = HashMap::new(); type_map.insert("http://example.org/Alice".to_string(), "Person".to_string()); let mut mask = TripleMask::default(); + mask = match_type_rule_object(&object, mask, &type_map, &rules); + assert!(mask.is_set(&TripleMask::OBJECT)); assert!(!mask.is_set(&TripleMask::SUBJECT)); } @@ -213,7 +219,9 @@ mod tests { }; let rules = set_predicate_rule("http://example.org/hasName"); let mut mask = TripleMask::default(); + mask = match_predicate_rule(&predicate, mask, &rules); + assert!(mask.is_set(&TripleMask::OBJECT)); assert!(!mask.is_set(&TripleMask::SUBJECT)); } @@ -225,31 +233,39 @@ mod tests { }; let rules = set_predicate_rule("http://example.org/hasAge"); let mut mask = TripleMask::default(); + mask = match_predicate_rule(&predicate, mask, &rules); + assert!(!mask.is_set(&TripleMask::OBJECT)); assert!(!mask.is_set(&TripleMask::SUBJECT)); } #[test] fn subject_predicate_in_config() { + let alice_iri = "http://example.org/Alice"; + let person_iri = "http://example.org/Person"; + let pred_hn = "http://example.org/hasName"; + let subject = Subject::NamedNode(NamedNode { - iri: "http://example.org/Alice".to_string(), + iri: alice_iri.to_string(), }); let predicate = NamedNode { - iri: "http://example.org/hasName".to_string(), + iri: pred_hn.to_string(), }; + let rules = set_subject_predicate_rule("http://example.org/Person", "http://example.org/hasName"); + let mut mask = TripleMask::default(); let mut type_map = HashMap::new(); - type_map.insert( - "http://example.org/Alice".to_string(), - "http://example.org/Person".to_string(), - ); + type_map.insert(alice_iri.to_string(), person_iri.to_string()); + mask = match_subject_predicate_rule(&subject, &predicate, mask, &type_map, &rules); + assert!(mask.is_set(&TripleMask::OBJECT)); assert!(!mask.is_set(&TripleMask::SUBJECT)); } + #[test] fn subject_in_config_predicate_not_() { let subject = Subject::NamedNode(NamedNode { @@ -266,7 +282,9 @@ mod tests { "http://example.org/Alice".to_string(), "http://example.org/Person".to_string(), ); + mask = match_subject_predicate_rule(&subject, &predicate, mask, &type_map, &rules); + assert!(!mask.is_set(&TripleMask::OBJECT)); assert!(!mask.is_set(&TripleMask::SUBJECT)); } @@ -287,7 +305,9 @@ mod tests { "http://example.org/Alice".to_string(), "http://example.org/Person".to_string(), ); + mask = match_subject_predicate_rule(&subject, &predicate, mask, &type_map, &rules); + assert!(!mask.is_set(&TripleMask::OBJECT)); assert!(!mask.is_set(&TripleMask::SUBJECT)); } @@ -303,13 +323,17 @@ mod tests { }; let rules = set_subject_predicate_rule("http://example.org/Bob", "http://example.org/hasAge"); + let mut mask = TripleMask::default(); let mut type_map = HashMap::new(); + type_map.insert( "http://example.org/Bob".to_string(), "http://example.org/Person".to_string(), ); + mask = match_subject_predicate_rule(&subject, &predicate, mask, &type_map, &rules); + assert!(!mask.is_set(&TripleMask::OBJECT)); assert!(!mask.is_set(&TripleMask::SUBJECT)); } From 61f162c9604df03f379566da523988db1761ac8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gabriel=20N=C3=BCtzi?= Date: Wed, 10 Jul 2024 10:41:19 +0200 Subject: [PATCH 12/12] fix: remove obsolete brackets --- src/pass_second.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pass_second.rs b/src/pass_second.rs index e538755..7843933 100644 --- a/src/pass_second.rs +++ b/src/pass_second.rs @@ -55,8 +55,8 @@ fn load_type_map(input: impl BufRead) -> HashMap { while !triples.is_end() { let _: Result<(), TurtleError> = triples.parse_step(&mut |t| { node_to_type.insert( - t.subject.to_string().replace(&['<', '>'][..], ""), - t.object.to_string().replace(&['<', '>'][..], ""), + t.subject.to_string().replace(&['<', '>'], ""), + t.object.to_string().replace(&['<', '>'], ""), ); Ok(()) });