-
Notifications
You must be signed in to change notification settings - Fork 0
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
perf(index): lightweight structure #57
Changes from 27 commits
4cdbe79
b001c5d
fe101a2
16ab28d
9811272
af6fe3f
5f2af98
cc31ae6
4c823fe
4032b3b
2fab215
8cca4c1
32e711c
f612fe6
e69a9d1
d487733
f7d60f4
28e3615
95d6563
cad8af7
4f9146c
afd0844
7d5ebeb
0fc975c
132f688
c496272
d38acb7
5b7d68f
0284072
cfb1616
08b62ee
5f7ad25
4642153
a2c3092
c4e2c77
f1284fd
c80493c
9ea47fb
42cbdcb
56fc66b
b3545d1
8331759
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,38 +1,141 @@ | ||
use rio_api::parser::TriplesParser; | ||
use rio_turtle::TurtleError; | ||
use std::{io::Write, path::Path}; | ||
use serde::{Deserialize, Serialize}; | ||
use smallvec::{smallvec, SmallVec}; | ||
use std::{ | ||
collections::HashMap, | ||
hash::{DefaultHasher, Hash, Hasher}, | ||
path::Path, | ||
}; | ||
|
||
use crate::{ | ||
io, | ||
rdf_types::{Triple, TripleView}, | ||
}; | ||
|
||
fn index_triple(t: Triple, out: &mut impl Write) { | ||
/// Stores a mapping from hashed instance uri to their types | ||
#[derive(Serialize, Deserialize)] | ||
pub struct TypeIndex { | ||
pub types: Vec<String>, | ||
map: HashMap<u64, SmallVec<[usize; 1]>>, | ||
} | ||
|
||
impl TypeIndex { | ||
fn hash(&mut self, s: &str) -> u64 { | ||
let mut hasher = DefaultHasher::new(); | ||
s.hash(&mut hasher); | ||
hasher.finish().to_le() | ||
} | ||
|
||
pub fn from_iter(type_map: impl Iterator<Item = (String, String)>) -> Self { | ||
let vec: Vec<(String, String)> = type_map.collect(); | ||
cmdoret marked this conversation as resolved.
Show resolved
Hide resolved
|
||
let mut idx = TypeIndex::new(); | ||
idx.types = vec | ||
.clone() | ||
.iter() | ||
.map(|(_, t)| t.clone()) | ||
.collect::<std::collections::HashSet<String>>() | ||
.into_iter() | ||
.collect(); | ||
|
||
vec.iter().for_each(|(subject, type_uri)| { | ||
idx.insert(&subject.to_string(), &type_uri.to_string()) | ||
.unwrap() | ||
}); | ||
|
||
return idx; | ||
} | ||
|
||
pub fn new() -> Self { | ||
TypeIndex { | ||
types: Vec::new(), | ||
map: HashMap::new(), | ||
} | ||
} | ||
|
||
// Insert input subject-type mapping into the index. | ||
// The index will store the hash of the subject. | ||
pub fn insert(&mut self, subject_uri: &str, type_uri: &str) -> Result<(), std::io::Error> { | ||
let key = self.hash(subject_uri); | ||
let type_idx: usize; | ||
|
||
// Get type index or add a new one | ||
cmdoret marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if self.types.contains(&type_uri.to_string()) { | ||
cmdoret marked this conversation as resolved.
Show resolved
Hide resolved
|
||
type_idx = self.types.iter().position(|x| *x == type_uri).unwrap(); | ||
} else { | ||
type_idx = self.types.len(); | ||
self.types.push(type_uri.to_string()); | ||
} | ||
// Insert mapping into the index | ||
cmdoret marked this conversation as resolved.
Show resolved
Hide resolved
|
||
match self.map.get_mut(&key) { | ||
Some(v) => { | ||
v.push(type_idx); | ||
} | ||
None => { | ||
self.map.insert(key, smallvec![type_idx]); | ||
} | ||
} | ||
|
||
Ok(()) | ||
} | ||
|
||
pub fn get(&mut self, subject_key: &str) -> Option<Vec<&String>> { | ||
cmdoret marked this conversation as resolved.
Show resolved
Hide resolved
|
||
let key = self.hash(subject_key); | ||
self.map | ||
.get(&key) | ||
.map(|v| v.iter().map(|i| &self.types[*i]).collect()) | ||
cmdoret marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
} | ||
|
||
fn index_triple(t: Triple, index: &mut TypeIndex) { | ||
if t.predicate.iri.as_str() == "http://www.w3.org/1999/02/22-rdf-syntax-ns#type" { | ||
let r = || -> std::io::Result<()> { | ||
out.write_all(t.to_string().as_bytes())?; | ||
out.write_all(b" .\n") | ||
}(); | ||
let r = { index.insert(&t.subject.to_string(), &t.object.to_string()) }; | ||
|
||
if let Err(e) = r { | ||
panic!("Error writting to out buffer: {e}"); | ||
} | ||
} | ||
} | ||
|
||
pub fn create_type_map(input: &Path, output: &Path) { | ||
pub fn create_type_index(input: &Path, output: &Path) { | ||
let buf_in = io::get_reader(input); | ||
let mut buf_out = io::get_writer(output); | ||
let buf_out = io::get_writer(output); | ||
let mut triples = io::parse_ntriples(buf_in); | ||
let mut index = TypeIndex::new(); | ||
|
||
while !triples.is_end() { | ||
let _ = triples | ||
.parse_step(&mut |t: TripleView| { | ||
index_triple(t.into(), &mut buf_out); | ||
index_triple(t.into(), &mut index); | ||
Result::<(), TurtleError>::Ok(()) | ||
}) | ||
.inspect_err(|e| { | ||
panic!("Parsing error occured: {e}"); | ||
}); | ||
} | ||
let _ = serde_json::to_writer(buf_out, &index); | ||
} | ||
|
||
#[cfg(test)] | ||
mod tests { | ||
use super::*; | ||
#[test] | ||
// Test building an index from an iterator and looking up a subject's types. | ||
fn index_from_iter() { | ||
let vals = vec![ | ||
("urn:Alice", "urn:Person"), | ||
("urn:Alice", "urn:Employee"), | ||
("urn:ACME", "urn:Organization"), | ||
] | ||
.into_iter() | ||
.map(|(a, b)| (a.to_string(), b.to_string())); | ||
|
||
let mut idx = TypeIndex::from_iter(vals); | ||
|
||
assert_eq!( | ||
idx.get("urn:Alice").unwrap(), | ||
vec!["urn:Person", "urn:Employee"] | ||
); | ||
println!("{}", serde_json::to_string(&idx).unwrap()); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. suggestion: maybe just |
||
} | ||
} |
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
@@ -1,4 +1,4 @@ | ||||||
use crate::rules::Rules; | ||||||
use crate::{index::TypeIndex, rules::Rules}; | ||||||
use rio_turtle::NTriplesParser; | ||||||
use std::{ | ||||||
fs::File, | ||||||
|
@@ -46,7 +46,15 @@ pub fn parse_ntriples(reader: impl BufRead) -> NTriplesParser<impl BufRead> { | |||||
pub fn parse_rules(path: &Path) -> Rules { | ||||||
return match File::open(path) { | ||||||
Ok(file) => serde_yml::from_reader(file).expect("Error parsing rules file."), | ||||||
Err(e) => panic!("Cannot open file '{:?}': '{}'.", path, e), | ||||||
Err(e) => panic!("Cannot open rules file '{:?}': '{}'.", path, e), | ||||||
}; | ||||||
} | ||||||
|
||||||
// Parse JSON type index | ||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
pub fn parse_index(path: &Path) -> TypeIndex { | ||||||
return match File::open(path) { | ||||||
Ok(file) => serde_json::from_reader(file).expect("Error parsing index file."), | ||||||
Err(e) => panic!("Cannot open index file '{:?}': '{}'.", path, e), | ||||||
}; | ||||||
} | ||||||
|
||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -20,7 +20,7 @@ pub fn create_logger(use_stdout: bool) -> Arc<Logger> { | |
.fuse(); | ||
|
||
let drain = slog_async::Async::new(drain) | ||
.chan_size(5_000_000) | ||
.chan_size(1_000) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. nice! |
||
.build() | ||
.fuse(); | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,13 +1,13 @@ | ||
use rio_api::parser::TriplesParser; | ||
use rio_turtle::TurtleError; | ||
use std::{ | ||
collections::HashMap, | ||
io::{BufRead, Write}, | ||
io::Write, | ||
path::{Path, PathBuf}, | ||
}; | ||
|
||
use crate::{ | ||
crypto::{new_pseudonymizer, Pseudonymize}, | ||
index::TypeIndex, | ||
io, | ||
log::Logger, | ||
rdf_types::*, | ||
|
@@ -19,7 +19,7 @@ use crate::{ | |
fn process_triple( | ||
triple: Triple, | ||
rules_config: &Rules, | ||
node_to_type: &HashMap<String, String>, | ||
node_to_type: &mut TypeIndex, | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. comment: I love types. This now looks so much better! |
||
out: &mut impl Write, | ||
hasher: &dyn Pseudonymize, | ||
) { | ||
|
@@ -35,24 +35,6 @@ fn process_triple( | |
} | ||
} | ||
|
||
// Create a index mapping node -> type from an input ntriples buffer | ||
fn load_type_map(input: impl BufRead) -> HashMap<String, String> { | ||
let mut node_to_type: HashMap<String, String> = HashMap::new(); | ||
let mut triples = io::parse_ntriples(input); | ||
|
||
while !triples.is_end() { | ||
let _: Result<(), TurtleError> = triples.parse_step(&mut |t| { | ||
node_to_type.insert( | ||
t.subject.to_string().replace(['<', '>'], ""), | ||
t.object.to_string().replace(['<', '>'], ""), | ||
); | ||
Ok(()) | ||
}); | ||
} | ||
|
||
return node_to_type; | ||
} | ||
|
||
pub fn pseudonymize_graph( | ||
_: &Logger, | ||
input: &Path, | ||
|
@@ -62,11 +44,10 @@ pub fn pseudonymize_graph( | |
secret_path: &Option<PathBuf>, | ||
) { | ||
let buf_input = io::get_reader(input); | ||
let buf_index = io::get_reader(index_path); | ||
let mut buf_output = io::get_writer(output); | ||
|
||
let rules = io::parse_rules(rules_path); | ||
let node_to_type: HashMap<String, String> = load_type_map(buf_index); | ||
let mut type_index = io::parse_index(index_path); | ||
|
||
let secret = secret_path.as_ref().map(io::read_bytes); | ||
let pseudonymizer = new_pseudonymizer(None, secret); | ||
|
@@ -80,7 +61,7 @@ pub fn pseudonymize_graph( | |
process_triple( | ||
t.into(), | ||
&rules, | ||
&node_to_type, | ||
&mut type_index, | ||
&mut buf_output, | ||
&pseudonymizer, | ||
); | ||
|
@@ -102,14 +83,14 @@ mod tests { | |
|
||
#[test] | ||
// Test the parsing of a triple. | ||
fn encrypt_nt_file() { | ||
fn pseudo_nt_file() { | ||
let logger = log::create_logger(true); | ||
|
||
let dir = tempdir().unwrap(); | ||
let input_path = Path::new("tests/data/test.nt"); | ||
let rules_path = Path::new("tests/data/rules.yaml"); | ||
let output_path = dir.path().join("output.nt"); | ||
let type_map_path = Path::new("tests/data/type_map.nt"); | ||
let type_map_path = Path::new("tests/data/type_index.json"); | ||
let key = None; | ||
pseudonymize_graph( | ||
&logger, | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
suggestion: It's not clear from reading the doc string what `types` contains and what the `SmallVec` contains (be specific).