Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add runtime dataset fetch and parse in-place #186

Merged
merged 14 commits into from
Feb 9, 2022
86 changes: 59 additions & 27 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,66 +19,98 @@ jobs:
- beta
- nightly
- 1.56.1
env:
RUSTFLAGS: "-C target-cpu=native -C opt-level=3"
Kerollmops marked this conversation as resolved.
Show resolved Hide resolved
ROARINGRS_BENCH_OFFLINE: "true"

steps:
- uses: actions/checkout@v2
- name: Checkout roaring-rs
uses: actions/checkout@v2

- uses: actions-rs/toolchain@v1
- name: Checkout benchmark datasets
uses: actions/checkout@v2
with:
repository: "RoaringBitmap/real-roaring-datasets"
path: "benchmarks/real-roaring-datasets"
Kerollmops marked this conversation as resolved.
Show resolved Hide resolved

- name: Initialize rust toolchain
uses: actions-rs/toolchain@v1
with:
profile: minimal
toolchain: ${{ matrix.rust }}
override: true
components: rustfmt, clippy

- uses: actions-rs/cargo@v1
- name: Fetch
uses: actions-rs/cargo@v1
with:
command: fetch

- name: Fetch benchmarks
uses: actions-rs/cargo@v1
with:
command: fetch
args: --manifest-path benchmarks/Cargo.toml

- name: Build
uses: actions-rs/cargo@v1
with:
command: build
args: --all-targets

- uses: actions-rs/cargo@v1
- name: Build benchmarks
uses: actions-rs/cargo@v1
with:
command: test
command: build
args: --manifest-path benchmarks/Cargo.toml --all-targets
Kerollmops marked this conversation as resolved.
Show resolved Hide resolved

- uses: actions-rs/cargo@v1
- name: Check
uses: actions-rs/cargo@v1
with:
command: test
args: --benches --manifest-path benchmarks/Cargo.toml
command: clippy
args: --all-targets -- -D warnings

- uses: actions-rs/cargo@v1
- name: Check benchmarks
uses: actions-rs/cargo@v1
with:
command: clippy
args: --manifest-path benchmarks/Cargo.toml --all-targets -- -D warnings

- name: Check formatting
uses: actions-rs/cargo@v1
with:
command: fmt
args: -- --check

- uses: actions-rs/cargo@v1
- name: Check benchmark formatting
Kerollmops marked this conversation as resolved.
Show resolved Hide resolved
uses: actions-rs/cargo@v1
with:
command: fmt
args: --manifest-path benchmarks/Cargo.toml -- --check

- uses: actions-rs/cargo@v1
- name: Test
uses: actions-rs/cargo@v1
with:
command: clippy
args: --all-targets -- -D warnings
simd:
name: SIMD Feature
runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v2
command: test

- uses: actions-rs/toolchain@v1
- name: Test benchmarks
uses: actions-rs/cargo@v1
with:
profile: minimal
toolchain: nightly
override: true
components: rustfmt, clippy
command: test
args: --manifest-path benchmarks/Cargo.toml --benches

- uses: actions-rs/cargo@v1
- name: SIMD test
if: matrix.rust == 'nightly'
uses: actions-rs/cargo@v1
with:
toolchain: nightly
command: test
args: --features "simd"

- uses: actions-rs/cargo@v1
- name: SIMD test benchmarks
if: matrix.rust == 'nightly'
uses: actions-rs/cargo@v1
with:
toolchain: nightly
command: test
args: --features "simd" --benches --manifest-path benchmarks/Cargo.toml
args: --manifest-path benchmarks/Cargo.toml --features "simd" --benches
5 changes: 0 additions & 5 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,7 +1,2 @@
/target
/Cargo.lock

# This is generated by the benchmarks crate build script, do not version with git.
/benchmarks/benches/datasets_paths.rs
/benchmarks/target
/benchmarks/Cargo.lock
3 changes: 3 additions & 0 deletions benchmarks/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
/target
/Cargo.lock
/real-roaring-datasets
13 changes: 4 additions & 9 deletions benchmarks/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,11 @@ publish = false
roaring = { path = ".." }

[dev-dependencies]
once_cell = "1.9"
git2 = { version = "0.13", default-features = false, features = ["vendored-openssl"] }
zip = { version = "0.5", default-features = false, features = ["deflate"] }
indicatif = "0.16"
criterion = { version = "0.3", features = ["html_reports"] }
quickcheck = "0.9"
quickcheck_macros = "0.9"
Kerollmops marked this conversation as resolved.
Show resolved Hide resolved

[build-dependencies]
anyhow = "1.0"
bytes = "1.0"
convert_case = "0.4"
reqwest = { version = "0.11.3", features = ["blocking", "rustls-tls"], default-features = false }
zip = "0.5.12"

[features]
simd = ["roaring/simd"]
Expand Down
203 changes: 203 additions & 0 deletions benchmarks/benches/datasets.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
use std::env;
use std::fs::File;
use std::io::BufReader;
use std::path::{Path, PathBuf};

use git2::FetchOptions;
use once_cell::sync::OnceCell as SyncOnceCell;

use roaring::RoaringBitmap;

// Shared storage for the parsed datasets. It starts empty and is filled on the first
// `Datasets::into_iter()` call through `get_or_init`; later calls reuse the stored `Vec`.
static INSTANCE: SyncOnceCell<Vec<Dataset>> = SyncOnceCell::new();

/// Field-less handle to the benchmark datasets.
///
/// Its `IntoIterator` implementation yields `&'static Dataset` items backed by the
/// `INSTANCE` cell, so `for dataset in Datasets { .. }` walks every loaded dataset.
pub struct Datasets;

/// Iterator returned by `Datasets::into_iter()`; yields `&'static Dataset` items.
pub struct DatasetsIter {
// Slice iterator over the `Vec<Dataset>` stored in `INSTANCE`.
iter: std::slice::Iter<'static, Dataset>,
}

/// Forwards iteration to the wrapped slice iterator.
impl Iterator for DatasetsIter {
    type Item = &'static Dataset;

    /// Returns the next dataset, or `None` once every dataset has been yielded.
    fn next(&mut self) -> Option<Self::Item> {
        self.iter.next()
    }

    /// Reports the exact number of remaining datasets, as known by the underlying
    /// slice iterator, so consumers such as `collect()` can size their buffers up front.
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.iter.size_hint()
    }
}

/// Lets `Datasets` be used directly in a `for` loop.
impl IntoIterator for Datasets {
    type Item = &'static Dataset;
    type IntoIter = DatasetsIter;

    /// Returns an iterator over the datasets stored in `INSTANCE`.
    ///
    /// The first call locates the dataset directory (`init_datasets`) and parses it
    /// (`parse_datasets`); the result is stored in `INSTANCE` and reused afterwards.
    ///
    /// # Panics
    ///
    /// Panics with the message "a collection of datasets" if either step returns an error.
    fn into_iter(self) -> Self::IntoIter {
        let loaded = INSTANCE.get_or_init(|| {
            let repo_dir = init_datasets();
            repo_dir.and_then(parse_datasets).expect("a collection of datasets")
        });

        DatasetsIter { iter: loaded.iter() }
    }
}

/// A named group of bitmaps built by `parse_datasets` from one whitelisted zip archive.
pub struct Dataset {
/// Archive file name without its trailing `.zip`.
pub name: String,
/// One bitmap per archive entry, in archive index order.
pub bitmaps: Vec<RoaringBitmap>,
}

fn init_datasets() -> Result<PathBuf, Box<dyn std::error::Error>> {
let out_dir = env::var_os("CARGO_MANIFEST_DIR").ok_or(env::VarError::NotPresent)?;
Kerollmops marked this conversation as resolved.
Show resolved Hide resolved

let out_path = Path::new(&out_dir);
let repo_path = out_path.join("real-roaring-datasets");
Kerollmops marked this conversation as resolved.
Show resolved Hide resolved

// Check if in offline mode

let offline = env::var("ROARINGRS_BENCH_OFFLINE");
match offline {
Ok(value) => {
if value.parse::<bool>()? {
return Ok(repo_path);
}
}
Err(ref err) => match err {
env::VarError::NotPresent => (),
_ => {
offline?;
}
},
};

// Setup progress callbacks

let pb_cell = once_cell::unsync::OnceCell::new();
let mut cb = git2::RemoteCallbacks::new();

cb.transfer_progress(|progress| {
let pb = pb_cell.get_or_init(|| {
indicatif::ProgressBar::new(progress.total_objects() as u64)
.with_style(
indicatif::ProgressStyle::default_bar()
.template(&format!(
"{{prefix}}{{msg:.cyan/blue}} [{{bar}}] {{pos}}/{}",
progress.total_objects()
))
.progress_chars("#> "),
)
.with_prefix(" ")
.with_message("Receiving objects")
});

pb.set_position((progress.local_objects() + progress.received_objects()) as u64);
true
});

let mut fetch_opts = FetchOptions::new();
fetch_opts.remote_callbacks(cb);

// Do update

if !Path::new(&repo_path).exists() {
git2::build::RepoBuilder::new()
.fetch_options(fetch_opts)
.clone("git://github.com/RoaringBitmap/real-roaring-datasets.git", &repo_path)?;
} else {
let repo = git2::Repository::open(&repo_path)?;
repo.find_remote("origin")?.fetch(&["master"], Some(&mut fetch_opts), None)?;

let head = repo.head()?.peel_to_commit()?;
let origin_master_head = repo
.find_branch("origin/master", git2::BranchType::Remote)?
.into_reference()
.peel_to_commit()?;

if head.id() != origin_master_head.id() {
repo.reset(origin_master_head.as_object(), git2::ResetType::Hard, None)?;
}
}

if let Some(pb) = pb_cell.get() {
pb.finish()
}

Ok(repo_path)
}

/// Parses every whitelisted dataset archive found directly inside `path`.
///
/// Each regular file whose name appears in `DATASET_FILENAME_WHITELIST` is opened as a zip
/// archive. Every archive entry is read as a comma-separated list of `u32` values
/// (surrounding whitespace is trimmed) and turned into one `RoaringBitmap` via
/// `RoaringBitmap::from_sorted_iter`. A progress bar sized by the archive's total
/// uncompressed size is shown while parsing.
///
/// Directory entries that are not regular files, are not whitelisted, or whose names are not
/// valid UTF-8 are skipped.
///
/// # Returns
///
/// The datasets sorted by name, where the name is the archive file name without `.zip`.
///
/// # Errors
///
/// Returns an error if the directory or an archive cannot be read, if an entry contains
/// bytes that are not valid UTF-8 or a token that is not a `u32`, or if
/// `RoaringBitmap::from_sorted_iter` rejects the parsed sequence.
fn parse_datasets<P: AsRef<Path>>(path: P) -> Result<Vec<Dataset>, Box<dyn std::error::Error>> {
    const DATASET_FILENAME_WHITELIST: &[&str] = &[
        "census-income.zip",
        "census-income_srt.zip",
        "census1881.zip",
        "census1881_srt.zip",
        "weather_sept_85.zip",
        "weather_sept_85_srt.zip",
        "wikileaks-noquotes.zip",
        "wikileaks-noquotes_srt.zip",
    ];

    use indicatif::{ProgressBar, ProgressStyle};
    use std::io::BufRead;
    use zip::ZipArchive;

    let dir = path.as_ref().read_dir()?;

    let mut datasets = Vec::new();

    // Future work: Reuse this buffer to parse croaring bitmaps for comparison
    let mut numbers = Vec::new();

    for dir_entry_result in dir {
        let dir_entry = dir_entry_result?;
        let metadata = dir_entry.metadata()?;
        let file_name = dir_entry.file_name();

        // A name that is not valid UTF-8 can never equal a whitelist entry, so skip it
        // instead of panicking.
        let file_name_str = match file_name.to_str() {
            Some(name) => name,
            None => continue,
        };

        if !metadata.is_file() || !DATASET_FILENAME_WHITELIST.contains(&file_name_str) {
            continue;
        }

        let file = File::open(dir_entry.path())?;
        // Every whitelist entry ends in ".zip"; the fallback only guards that invariant.
        let name = file_name_str.strip_suffix(".zip").unwrap_or(file_name_str).to_string();

        let mut zip = ZipArchive::new(file)?;

        // First pass: total uncompressed size, used as the progress bar length.
        let mut total_size = 0;
        for i in 0..zip.len() {
            total_size += zip.by_index(i)?.size();
        }

        let pb = ProgressBar::new(total_size)
            .with_style(
                ProgressStyle::default_bar()
                    .template("    {prefix:.green} [{bar}] {msg}")
                    .progress_chars("#> "),
            )
            .with_prefix("Parsing")
            .with_message(name.clone());

        // Second pass: one bitmap per archive entry.
        let mut bitmaps = Vec::with_capacity(zip.len());
        for i in 0..zip.len() {
            let file = zip.by_index(i)?;
            let size = file.size();
            let buf = BufReader::new(file);

            for bytes in buf.split(b',') {
                let bytes = bytes?;
                let str = String::from_utf8(bytes)?;
                let n = str.trim().parse::<u32>()?;
                numbers.push(n);
            }

            let bitmap = RoaringBitmap::from_sorted_iter(numbers.iter().copied())?;
            // Keep the allocation around for the next entry.
            numbers.clear();
            bitmaps.push(bitmap);

            pb.inc(size);
        }

        pb.finish();
        datasets.push(Dataset { name, bitmaps });
    }

    // `read_dir` order is platform dependent; sort so the result is deterministic.
    datasets.sort_unstable_by(|a, b| a.name.cmp(&b.name));
    println!();
    Ok(datasets)
}
Loading