diff --git a/.gitignore b/.gitignore index 2afabdb..acec195 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,7 @@ flamegraph.svg perf.data perf.data.old + +# sometimes i make this file when i pipe benchmark results to a file, +# don't wanna accidentally commit it +benchmark_result.txt diff --git a/simdnbt-derive/Cargo.toml b/simdnbt-derive/Cargo.toml index aac2785..528efa6 100644 --- a/simdnbt-derive/Cargo.toml +++ b/simdnbt-derive/Cargo.toml @@ -9,9 +9,9 @@ repository = "https://github.com/azalea-rs/simdnbt" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -proc-macro2 = "1.0.78" -quote = "1.0.35" -syn = "2.0.48" +proc-macro2 = "1.0.82" +quote = "1.0.36" +syn = "2.0.63" [lib] proc-macro = true diff --git a/simdnbt/Cargo.toml b/simdnbt/Cargo.toml index 39aced1..bffe789 100644 --- a/simdnbt/Cargo.toml +++ b/simdnbt/Cargo.toml @@ -10,10 +10,10 @@ repository = "https://github.com/azalea-rs/simdnbt" [dependencies] byteorder = "1.5.0" -flate2 = "^1.0.28" +flate2 = "^1.0.30" residua-mutf8 = "2.0.0" simdnbt-derive = { version = "0.4.0", path = "../simdnbt-derive", optional = true } -thiserror = "1.0.56" +thiserror = "1.0.60" valence_nbt = { version = "0.8.0", features = ["binary"] } [dev-dependencies] @@ -21,12 +21,12 @@ criterion = { version = "0.5.1", features = ["html_reports"] } graphite_binary = "0.1.0" valence_nbt = { version = "0.8.0", features = ["binary"] } -fastnbt = "2.4.4" +fastnbt = "2.5.0" azalea-nbt = { git = "https://github.com/azalea-rs/azalea", rev = "84e036ce3752ecf57904b0f5aff1f33d43e95a32" } hematite-nbt = { version = "0.5.2", default-features = false } shen-nbt5 = "0.4.4" -mimalloc = "0.1.39" +mimalloc = "0.1.41" [features] default = ["derive"] diff --git a/simdnbt/README.md b/simdnbt/README.md index 275ec86..7b4bbe3 100644 --- a/simdnbt/README.md +++ b/simdnbt/README.md @@ -77,23 +77,23 @@ Here's a benchmark comparing Simdnbt against a few of the other fastest NBT crat | Library | 
Throughput | | --------------------------------------------------------------------------- | ------------ | -| [simdnbt::borrow](https://docs.rs/simdnbt/latest/simdnbt/borrow/index.html) | 1.7619 GiB/s | -| [simdnbt::owned](https://docs.rs/simdnbt/latest/simdnbt/owned/index.html) | 329.10 MiB/s | -| [shen_nbt5](https://docs.rs/shen-nbt5/latest/shen_nbt5/) | 306.58 MiB/s | -| [azalea_nbt](https://docs.rs/azalea-nbt/latest/azalea_nbt/) | 297.28 MiB/s | -| [valence_nbt](https://docs.rs/valence_nbt/latest/valence_nbt/) | 236.42 MiB/s | -| [graphite_binary](https://docs.rs/graphite_binary/latest/graphite_binary/) | 210.51 MiB/s | -| [fastnbt](https://docs.rs/fastnbt/latest/fastnbt/) | 115.54 MiB/s | -| [hematite_nbt](https://docs.rs/hematite-nbt/latest/nbt/) | 108.91 MiB/s | +| [simdnbt::borrow](https://docs.rs/simdnbt/latest/simdnbt/borrow/index.html) | 1.6795 GiB/s | +| [simdnbt::owned](https://docs.rs/simdnbt/latest/simdnbt/owned/index.html) | 811.08 MiB/s | +| [shen_nbt5](https://docs.rs/shen-nbt5/latest/shen_nbt5/) | 606.68 MiB/s | +| [graphite_binary](https://docs.rs/graphite_binary/latest/graphite_binary/) | 363.94 MiB/s | +| [azalea_nbt](https://docs.rs/azalea-nbt/latest/azalea_nbt/) | 330.46 MiB/s | +| [valence_nbt](https://docs.rs/valence_nbt/latest/valence_nbt/) | 279.58 MiB/s | +| [hematite_nbt](https://docs.rs/hematite-nbt/latest/nbt/) | 180.22 MiB/s | +| [fastnbt](https://docs.rs/fastnbt/latest/fastnbt/) | 162.92 MiB/s | And for writing `complex_player.dat`: | Library | Throughput | | --------------- | ------------ | -| simdnbt::borrow | 2.5914 GiB/s | -| azalea_nbt | 2.1096 GiB/s | -| simdnbt::owned | 1.9508 GiB/s | -| graphite_binary | 1.7745 GiB/s | +| simdnbt::borrow | 2.4670 GiB/s | +| azalea_nbt | 2.4152 GiB/s | +| simdnbt::owned | 1.9660 GiB/s | +| graphite_binary | 1.8804 GiB/s | The tables above were made from the [compare benchmark](https://github.com/azalea-rs/simdnbt/tree/master/simdnbt/benches) in this repo. 
Note that the benchmark is somewhat unfair, since `simdnbt::borrow` doesn't fully decode some things like strings and integer arrays until they're used. diff --git a/simdnbt/benches/compare.rs b/simdnbt/benches/compare.rs index 198dd0c..206983c 100644 --- a/simdnbt/benches/compare.rs +++ b/simdnbt/benches/compare.rs @@ -6,56 +6,85 @@ use std::{ use criterion::{black_box, criterion_group, criterion_main, Criterion, Throughput}; use flate2::read::GzDecoder; -pub fn bench_read_file(filename: &str, c: &mut Criterion) { +fn bench_read_file(filename: &str, c: &mut Criterion) { let mut file = File::open(format!("tests/{filename}")).unwrap(); let mut contents = Vec::new(); file.read_to_end(&mut contents).unwrap(); let mut src = &contents[..]; // decode the original src so most of the time isn't spent on unzipping - let mut decoded_src_decoder = GzDecoder::new(&mut src); + let mut src_decoder = GzDecoder::new(&mut src); let mut input = Vec::new(); - if decoded_src_decoder.read_to_end(&mut input).is_err() { + if src_decoder.read_to_end(&mut input).is_err() { // oh probably wasn't gzipped then input = contents; } - let input = input.as_slice(); + + let mut input_stream = Cursor::new(&input[..]); let mut group = c.benchmark_group(format!("compare/{filename}")); group.throughput(Throughput::Bytes(input.len() as u64)); group.bench_function("simdnbt_borrow_parse", |b| { b.iter(|| { - let input = black_box(input); - let nbt = simdnbt::borrow::Nbt::read(&mut Cursor::new(input)) - .unwrap() - .unwrap(); - // let _ = black_box(nbt.list("").unwrap().ints()); - black_box(nbt); + black_box(simdnbt::borrow::Nbt::read(&mut input_stream).unwrap()); + input_stream.set_position(0); }) }); - group.bench_function("simdnbt_owned_parse", |b| { b.iter(|| { - let input = black_box(input); - let nbt = simdnbt::owned::Nbt::read(&mut Cursor::new(input)) - .unwrap() - .unwrap(); - // let _ = black_box(nbt.list("").unwrap().ints()); - black_box(nbt); + black_box(simdnbt::owned::Nbt::read(&mut 
input_stream).unwrap()); + input_stream.set_position(0); }) }); - group.bench_function("shen_parse", |b| { - let mut input = black_box(input.to_vec()); + let mut input = input.to_vec(); b.iter(|| { let nbt = shen_nbt5::NbtValue::from_binary::(&mut input) .unwrap(); black_box(nbt); }) }); + group.bench_function("azalea_parse", |b| { + b.iter(|| { + black_box(azalea_nbt::Nbt::read(&mut input_stream).unwrap()); + input_stream.set_position(0); + }) + }); + group.bench_function("graphite_parse", |b| { + b.iter(|| { + black_box(graphite_binary::nbt::decode::read(&mut &input[..]).unwrap()); + }) + }); + group.bench_function("valence_parse", |b| { + b.iter(|| { + let nbt = valence_nbt::from_binary::(&mut &input[..]).unwrap(); + black_box(nbt); + }) + }); + group.bench_function("fastnbt_parse", |b| { + b.iter(|| { + let nbt: fastnbt::Value = fastnbt::from_bytes(&input).unwrap(); + black_box(nbt); + }) + }); + group.bench_function("hematite_parse", |b| { + b.iter(|| { + black_box(nbt::Blob::from_reader(&mut input_stream).unwrap()); + input_stream.set_position(0); + }) + }); - let nbt = simdnbt::borrow::Nbt::read(&mut Cursor::new(input)) + let nbt = azalea_nbt::Nbt::read(&mut Cursor::new(&input)).unwrap(); + group.bench_function("azalea_write", |b| { + b.iter(|| { + let mut out = Vec::new(); + nbt.write(&mut out); + black_box(out); + }) + }); + + let nbt = simdnbt::borrow::Nbt::read(&mut Cursor::new(&input)) .unwrap() .unwrap(); group.bench_function("simdnbt_borrow_write", |b| { @@ -66,7 +95,7 @@ pub fn bench_read_file(filename: &str, c: &mut Criterion) { }) }); - let nbt = simdnbt::owned::Nbt::read(&mut Cursor::new(input)) + let nbt = simdnbt::owned::Nbt::read(&mut Cursor::new(&input)) .unwrap() .unwrap(); group.bench_function("simdnbt_owned_write", |b| { @@ -77,30 +106,6 @@ pub fn bench_read_file(filename: &str, c: &mut Criterion) { }) }); - group.bench_function("azalea_parse", |b| { - b.iter(|| { - let input = black_box(input); - let nbt = azalea_nbt::Nbt::read(&mut 
Cursor::new(input)).unwrap(); - black_box(nbt); - }) - }); - - let nbt = azalea_nbt::Nbt::read(&mut Cursor::new(input)).unwrap(); - group.bench_function("azalea_write", |b| { - b.iter(|| { - let mut out = Vec::new(); - nbt.write(&mut out); - black_box(out); - }) - }); - - group.bench_function("graphite_parse", |b| { - b.iter(|| { - let input = black_box(input); - let nbt = graphite_binary::nbt::decode::read(&mut &input[..]).unwrap(); - black_box(nbt); - }) - }); let nbt = graphite_binary::nbt::decode::read(&mut &input[..]).unwrap(); group.bench_function("graphite_write", |b| { b.iter(|| { @@ -108,32 +113,11 @@ pub fn bench_read_file(filename: &str, c: &mut Criterion) { black_box(out); }) }); - - group.bench_function("valence_parse", |b| { - b.iter(|| { - let input = black_box(input); - let nbt = valence_nbt::from_binary::(&mut &input[..]).unwrap(); - black_box(nbt); - }) - }); - - group.bench_function("fastnbt_parse", |b| { - b.iter(|| { - let input = black_box(input); - let nbt: fastnbt::Value = fastnbt::from_bytes(input).unwrap(); - black_box(nbt); - }) - }); - - group.bench_function("hematite_parse", |b| { - b.iter(|| { - let input = black_box(input); - let nbt = nbt::Blob::from_reader(&mut Cursor::new(input)).unwrap(); - black_box(nbt); - }) - }); } +#[global_allocator] +static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; + fn bench(c: &mut Criterion) { // bench_read_file("hello_world.nbt", c); // bench_read_file("bigtest.nbt", c); diff --git a/simdnbt/benches/nbt_borrow.rs b/simdnbt/benches/nbt_borrow.rs index e8b1232..ed506e0 100644 --- a/simdnbt/benches/nbt_borrow.rs +++ b/simdnbt/benches/nbt_borrow.rs @@ -1,10 +1,11 @@ -use criterion::{black_box, criterion_group, criterion_main, Criterion, Throughput}; -use flate2::read::GzDecoder; use std::{ fs::File, io::{Cursor, Read}, }; +use criterion::{black_box, criterion_group, criterion_main, Criterion, Throughput}; +use flate2::read::GzDecoder; + fn bench_file(filename: &str, c: &mut Criterion) { let mut file 
= File::open(format!("tests/{filename}")).unwrap(); let mut contents = Vec::new(); @@ -12,27 +13,26 @@ fn bench_file(filename: &str, c: &mut Criterion) { let mut src = &contents[..]; // decode the original src so most of the time isn't spent on unzipping - let mut decoded_src_decoder = GzDecoder::new(&mut src); - let mut decoded_src = Vec::new(); - if decoded_src_decoder.read_to_end(&mut decoded_src).is_err() { + let mut src_decoder = GzDecoder::new(&mut src); + let mut input = Vec::new(); + if src_decoder.read_to_end(&mut input).is_err() { // oh probably wasn't gzipped then - decoded_src = contents; + input = contents; } - let mut decoded_src_stream = Cursor::new(&decoded_src[..]); + let mut input_stream = Cursor::new(&input[..]); let mut group = c.benchmark_group(format!("nbt_borrow/{filename}")); - - group.throughput(Throughput::Bytes(decoded_src.len() as u64)); + group.throughput(Throughput::Bytes(input.len() as u64)); group.bench_function("Decode", |b| { b.iter(|| { - black_box(simdnbt::borrow::Nbt::read(&mut decoded_src_stream).unwrap()); - decoded_src_stream.set_position(0); + black_box(simdnbt::borrow::Nbt::read(&mut input_stream).unwrap()); + input_stream.set_position(0); }) }); - let nbt = simdnbt::borrow::Nbt::read(&mut decoded_src_stream) + let nbt = simdnbt::borrow::Nbt::read(&mut input_stream) .unwrap() .unwrap(); group.bench_function("Get", |b| { diff --git a/simdnbt/src/mutf8.rs b/simdnbt/src/mutf8.rs index ec3f582..c620959 100644 --- a/simdnbt/src/mutf8.rs +++ b/simdnbt/src/mutf8.rs @@ -92,14 +92,14 @@ impl Mutf8Str { #[inline] pub fn from_str(s: &str) -> Cow { match mutf8::encode(s) { - Cow::Borrowed(b) => Cow::Borrowed(Mutf8Str::from_slice(b)), - Cow::Owned(o) => Cow::Owned(Mutf8String { vec: o }), + Cow::Borrowed(slice) => Cow::Borrowed(Mutf8Str::from_slice(slice)), + Cow::Owned(vec) => Cow::Owned(Mutf8String { vec }), } } #[inline] pub fn to_str(&self) -> Cow { - // fast check to skip if none of the bytes have the top bit set or are null + // 
fast check to skip if none of the bytes have the top bit set if is_plain_ascii(&self.slice) { // SAFETY: &[u8] and &str are the same layout. unsafe { Cow::Borrowed(std::str::from_utf8_unchecked(&self.slice)) } diff --git a/simdnbt/src/owned/compound.rs b/simdnbt/src/owned/compound.rs index 39337b1..ec6c547 100644 --- a/simdnbt/src/owned/compound.rs +++ b/simdnbt/src/owned/compound.rs @@ -1,4 +1,7 @@ -use std::{io::Cursor, mem}; +use std::{ + io::Cursor, + mem::{self, MaybeUninit}, +}; use byteorder::ReadBytesExt; @@ -33,6 +36,12 @@ impl NbtCompound { if depth > MAX_DEPTH { return Err(Error::MaxDepthExceeded); } + + let mut tags_buffer = unsafe { + MaybeUninit::<[MaybeUninit<(Mutf8String, NbtTag)>; 8]>::uninit().assume_init() + }; + let mut tags_buffer_len: usize = 0; + let mut values = Vec::with_capacity(8); loop { let tag_type = data.read_u8().map_err(|_| Error::UnexpectedEof)?; @@ -40,9 +49,23 @@ impl NbtCompound { break; } let tag_name = read_string(data)?.to_owned(); + let tag = NbtTag::read_with_type(data, tag_type, depth)?; - values.push((tag_name, NbtTag::read_with_type(data, tag_type, depth)?)); + tags_buffer[tags_buffer_len] = MaybeUninit::new((tag_name, tag)); + tags_buffer_len += 1; + if tags_buffer_len == tags_buffer.len() { + // writing the tags in groups like this is slightly faster + for i in 0..tags_buffer_len { + values.push(unsafe { tags_buffer.get_unchecked(i).assume_init_read() }); + } + tags_buffer_len = 0; + } } + + for i in 0..tags_buffer_len { + values.push(unsafe { tags_buffer.get_unchecked(i).assume_init_read() }); + } + Ok(Self { values }) } diff --git a/simdnbt/src/owned/mod.rs b/simdnbt/src/owned/mod.rs index 251f494..d7760b9 100644 --- a/simdnbt/src/owned/mod.rs +++ b/simdnbt/src/owned/mod.rs @@ -226,6 +226,7 @@ impl NbtTag { unsafe { *<*const _>::from(self).cast::() } } + #[inline(always)] fn read_with_type(data: &mut Cursor<&[u8]>, tag_type: u8, depth: usize) -> Result { match tag_type { BYTE_ID => Ok(NbtTag::Byte(