Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

perf(index): lightweight structure #57

Merged
merged 42 commits into from
Aug 27, 2024
Merged
Show file tree
Hide file tree
Changes from 27 commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
4cdbe79
feat(index): add dedicated struct+impl
cmdoret Aug 20, 2024
b001c5d
feat: serde-based index creation
cmdoret Aug 20, 2024
fe101a2
refactor(rules): use index struct [WIP]
cmdoret Aug 20, 2024
16ab28d
feat(rules): type pred matching
cmdoret Aug 20, 2024
9811272
style: formatting
cmdoret Aug 20, 2024
af6fe3f
refactor(index): make hasher short-lived
cmdoret Aug 21, 2024
5f2af98
feat(io): index parser
cmdoret Aug 21, 2024
cc31ae6
feat(pseudo): use new index struct
cmdoret Aug 21, 2024
4c823fe
refactor: better index function naming
cmdoret Aug 21, 2024
4032b3b
test(rules): use new index in rules tests
cmdoret Aug 21, 2024
2fab215
test(rules): update index macro to use TypeIndex
cmdoret Aug 21, 2024
8cca4c1
refactor(index): from_map -> from_iter
cmdoret Aug 21, 2024
32e711c
chore: rm unused import
cmdoret Aug 21, 2024
f612fe6
style: formatting
cmdoret Aug 21, 2024
e69a9d1
test(pseudo): update test index
cmdoret Aug 21, 2024
d487733
chore(deps): add smallvec
cmdoret Aug 22, 2024
f7d60f4
refactor(index): use smallvec
cmdoret Aug 22, 2024
28e3615
perf(log): reduce reserved mem
cmdoret Aug 22, 2024
95d6563
test(rules): fix default for index macro
cmdoret Aug 22, 2024
cad8af7
perf(bench): add benchmark script
cmdoret Aug 22, 2024
4f9146c
perf(bench): can select binary profile
cmdoret Aug 25, 2024
afd0844
chore(bench): release as default profile
cmdoret Aug 25, 2024
7d5ebeb
perf(index): hashes as u64 instead of String
cmdoret Aug 26, 2024
0fc975c
docs(bench): comment script
cmdoret Aug 26, 2024
132f688
chore(deps): add serde_json
cmdoret Aug 26, 2024
c496272
refactor(index): yaml -> json
cmdoret Aug 26, 2024
d38acb7
style: format+clippy recommendations
cmdoret Aug 26, 2024
5b7d68f
refactor(index): simplify impl, always use nt uri serialization
cmdoret Aug 26, 2024
0284072
test(rules): simplify macro, use uri.to_string()
cmdoret Aug 26, 2024
cfb1616
tests(data): uri -> <uri>
cmdoret Aug 26, 2024
08b62ee
test(index): use <uri> in cases
cmdoret Aug 26, 2024
5f7ad25
refactor(rules): use uri.to_string()
cmdoret Aug 26, 2024
4642153
test(rules): use <uri> in cases
cmdoret Aug 26, 2024
a2c3092
style: lint+format
cmdoret Aug 26, 2024
c4e2c77
docs(index): better comments
cmdoret Aug 26, 2024
f1284fd
refactor(index): return &str from TypeIndex.get
cmdoret Aug 26, 2024
c80493c
style(bench): shellcheck suggestions
cmdoret Aug 26, 2024
9ea47fb
chore(bench): add nix devshell
cmdoret Aug 26, 2024
42cbdcb
fix(bench): git clone in nix
cmdoret Aug 26, 2024
56fc66b
refactor(bench): function-based benchmark script
cmdoret Aug 26, 2024
b3545d1
docs(bench): document benchmark execution
cmdoret Aug 26, 2024
8331759
fix(bench): syntax
cmdoret Aug 27, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 15 additions & 3 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,12 @@
rio_api = '0.8.4'
rio_turtle = '0.8.4'
rstest = '0.21.0'
serde_json = "1.0.127"
serde_yml = '0.0.10'
slog = '2.7.0'
slog-async = '2.8.0'
slog-term = '2.9.0'
smallvec = { version = "1.13.2", features = ["serde"] }
tempfile = '3.10.1'

[dependencies.clap]
Expand Down
121 changes: 112 additions & 9 deletions src/index.rs
Original file line number Diff line number Diff line change
@@ -1,38 +1,141 @@
use rio_api::parser::TriplesParser;
use rio_turtle::TurtleError;
use std::{io::Write, path::Path};
use serde::{Deserialize, Serialize};
use smallvec::{smallvec, SmallVec};
use std::{
collections::HashMap,
hash::{DefaultHasher, Hash, Hasher},
path::Path,
};

use crate::{
io,
rdf_types::{Triple, TripleView},
};

fn index_triple(t: Triple, out: &mut impl Write) {
/// Stores a mapping from hashed instance URI to the instance's type URIs.
#[derive(Serialize, Deserialize)]
pub struct TypeIndex {
    // Deduplicated list of type URIs; values in `map` are indices into
    // this vector rather than repeated owned strings.
    pub types: Vec<String>,
    // Maps the 64-bit hash of a subject URI to the indices (into `types`)
    // of every type recorded for that subject. A SmallVec of inline
    // capacity 1 keeps the common single-type case heap-free.
    map: HashMap<u64, SmallVec<[usize; 1]>>,
}

impl TypeIndex {
fn hash(&mut self, s: &str) -> u64 {
let mut hasher = DefaultHasher::new();
s.hash(&mut hasher);
hasher.finish().to_le()
}

pub fn from_iter(type_map: impl Iterator<Item = (String, String)>) -> Self {
let vec: Vec<(String, String)> = type_map.collect();
cmdoret marked this conversation as resolved.
Show resolved Hide resolved
let mut idx = TypeIndex::new();
idx.types = vec
.clone()
.iter()
.map(|(_, t)| t.clone())
.collect::<std::collections::HashSet<String>>()
.into_iter()
.collect();

vec.iter().for_each(|(subject, type_uri)| {
idx.insert(&subject.to_string(), &type_uri.to_string())
.unwrap()
});

return idx;
}

pub fn new() -> Self {
TypeIndex {
types: Vec::new(),
map: HashMap::new(),
}
}

// Insert input subject-type mapping into the index.
// The index will store the hash of the subject.
pub fn insert(&mut self, subject_uri: &str, type_uri: &str) -> Result<(), std::io::Error> {
let key = self.hash(subject_uri);
let type_idx: usize;

// Get type index or add a new one
cmdoret marked this conversation as resolved.
Show resolved Hide resolved
if self.types.contains(&type_uri.to_string()) {
cmdoret marked this conversation as resolved.
Show resolved Hide resolved
type_idx = self.types.iter().position(|x| *x == type_uri).unwrap();
} else {
type_idx = self.types.len();
self.types.push(type_uri.to_string());
}
// Insert mapping into the index
cmdoret marked this conversation as resolved.
Show resolved Hide resolved
match self.map.get_mut(&key) {
Some(v) => {
v.push(type_idx);
}
None => {
self.map.insert(key, smallvec![type_idx]);
}
}

Ok(())
}

pub fn get(&mut self, subject_key: &str) -> Option<Vec<&String>> {
cmdoret marked this conversation as resolved.
Show resolved Hide resolved
let key = self.hash(subject_key);
self.map
.get(&key)
.map(|v| v.iter().map(|i| &self.types[*i]).collect())
cmdoret marked this conversation as resolved.
Show resolved Hide resolved
}
}

fn index_triple(t: Triple, index: &mut TypeIndex) {
if t.predicate.iri.as_str() == "http://www.w3.org/1999/02/22-rdf-syntax-ns#type" {
let r = || -> std::io::Result<()> {
out.write_all(t.to_string().as_bytes())?;
out.write_all(b" .\n")
}();
let r = { index.insert(&t.subject.to_string(), &t.object.to_string()) };

if let Err(e) = r {
panic!("Error writting to out buffer: {e}");
}
}
}

pub fn create_type_map(input: &Path, output: &Path) {
pub fn create_type_index(input: &Path, output: &Path) {
let buf_in = io::get_reader(input);
let mut buf_out = io::get_writer(output);
let buf_out = io::get_writer(output);
let mut triples = io::parse_ntriples(buf_in);
let mut index = TypeIndex::new();

while !triples.is_end() {
let _ = triples
.parse_step(&mut |t: TripleView| {
index_triple(t.into(), &mut buf_out);
index_triple(t.into(), &mut index);
Result::<(), TurtleError>::Ok(())
})
.inspect_err(|e| {
panic!("Parsing error occured: {e}");
});
}
let _ = serde_json::to_writer(buf_out, &index);
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    // Builds an index from subject-type pairs and checks per-subject lookups
    // as well as JSON serializability of the resulting index.
    fn index_from_iter() {
        let vals = vec![
            ("urn:Alice", "urn:Person"),
            ("urn:Alice", "urn:Employee"),
            ("urn:ACME", "urn:Organization"),
        ]
        .into_iter()
        .map(|(a, b)| (a.to_string(), b.to_string()));

        let mut idx = TypeIndex::from_iter(vals);

        // Types for a subject come back in insertion order.
        assert_eq!(
            idx.get("urn:Alice").unwrap(),
            vec!["urn:Person", "urn:Employee"]
        );
        assert_eq!(idx.get("urn:ACME").unwrap(), vec!["urn:Organization"]);
        // Assert serialization succeeds instead of printing it for
        // manual inspection.
        assert!(serde_json::to_string(&idx).is_ok());
    }
}
12 changes: 10 additions & 2 deletions src/io.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use crate::rules::Rules;
use crate::{index::TypeIndex, rules::Rules};
use rio_turtle::NTriplesParser;
use std::{
fs::File,
Expand Down Expand Up @@ -46,7 +46,15 @@ pub fn parse_ntriples(reader: impl BufRead) -> NTriplesParser<impl BufRead> {
pub fn parse_rules(path: &Path) -> Rules {
return match File::open(path) {
Ok(file) => serde_yml::from_reader(file).expect("Error parsing rules file."),
Err(e) => panic!("Cannot open file '{:?}': '{}'.", path, e),
Err(e) => panic!("Cannot open rules file '{:?}': '{}'.", path, e),
};
}

// Parse the JSON type index file at `path` into a TypeIndex.
// Panics if the file cannot be opened or parsed.
pub fn parse_index(path: &Path) -> TypeIndex {
    return match File::open(path) {
        Ok(file) => serde_json::from_reader(file).expect("Error parsing index file."),
        Err(e) => panic!("Cannot open index file '{:?}': '{}'.", path, e),
    };
}

Expand Down
2 changes: 1 addition & 1 deletion src/log.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ pub fn create_logger(use_stdout: bool) -> Arc<Logger> {
.fuse();

let drain = slog_async::Async::new(drain)
.chan_size(5_000_000)
.chan_size(1_000)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nice!

.build()
.fuse();

Expand Down
4 changes: 2 additions & 2 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ mod rules;

// Define the imports.
use crate::{
index::create_type_map,
index::create_type_index,
log::{create_logger, info},
pseudo::pseudonymize_graph,
};
Expand Down Expand Up @@ -87,7 +87,7 @@ fn main() {
match cli.command {
Subcommands::Index(args) => {
info!(log, "Args: {:?}", args);
create_type_map(&args.input, &args.output)
create_type_index(&args.input, &args.output)
}
Subcommands::Pseudo(args) => {
info!(log, "Args: {:?}", args);
Expand Down
33 changes: 7 additions & 26 deletions src/pseudo.rs
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
use rio_api::parser::TriplesParser;
use rio_turtle::TurtleError;
use std::{
collections::HashMap,
io::{BufRead, Write},
io::Write,
path::{Path, PathBuf},
};

use crate::{
crypto::{new_pseudonymizer, Pseudonymize},
index::TypeIndex,
io,
log::Logger,
rdf_types::*,
Expand All @@ -19,7 +19,7 @@ use crate::{
fn process_triple(
triple: Triple,
rules_config: &Rules,
node_to_type: &HashMap<String, String>,
node_to_type: &mut TypeIndex,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

comment: I love types. this looks now soo much better!

out: &mut impl Write,
hasher: &dyn Pseudonymize,
) {
Expand All @@ -35,24 +35,6 @@ fn process_triple(
}
}

// Create a index mapping node -> type from an input ntriples buffer
// Builds a node -> type mapping from an input N-Triples buffer.
// Angle brackets are stripped from subject and object URIs before storage.
fn load_type_map(input: impl BufRead) -> HashMap<String, String> {
    let mut parser = io::parse_ntriples(input);
    let mut mapping: HashMap<String, String> = HashMap::new();

    while !parser.is_end() {
        let _: Result<(), TurtleError> = parser.parse_step(&mut |t| {
            let subject = t.subject.to_string().replace(['<', '>'], "");
            let object = t.object.to_string().replace(['<', '>'], "");
            mapping.insert(subject, object);
            Ok(())
        });
    }

    mapping
}

pub fn pseudonymize_graph(
_: &Logger,
input: &Path,
Expand All @@ -62,11 +44,10 @@ pub fn pseudonymize_graph(
secret_path: &Option<PathBuf>,
) {
let buf_input = io::get_reader(input);
let buf_index = io::get_reader(index_path);
let mut buf_output = io::get_writer(output);

let rules = io::parse_rules(rules_path);
let node_to_type: HashMap<String, String> = load_type_map(buf_index);
let mut type_index = io::parse_index(index_path);

let secret = secret_path.as_ref().map(io::read_bytes);
let pseudonymizer = new_pseudonymizer(None, secret);
Expand All @@ -80,7 +61,7 @@ pub fn pseudonymize_graph(
process_triple(
t.into(),
&rules,
&node_to_type,
&mut type_index,
&mut buf_output,
&pseudonymizer,
);
Expand All @@ -102,14 +83,14 @@ mod tests {

#[test]
// Test the parsing of a triple.
fn encrypt_nt_file() {
fn pseudo_nt_file() {
let logger = log::create_logger(true);

let dir = tempdir().unwrap();
let input_path = Path::new("tests/data/test.nt");
let rules_path = Path::new("tests/data/rules.yaml");
let output_path = dir.path().join("output.nt");
let type_map_path = Path::new("tests/data/type_map.nt");
let type_map_path = Path::new("tests/data/type_index.json");
let key = None;
pseudonymize_graph(
&logger,
Expand Down
Loading