calling it simdnbt is no longer a lie

2025-08-02 15:36:03 +00:00 · 2023-08-29 19:52:01 -05:00 · 2023-08-29 19:52:01 -05:00 · bb2d8d9fe8
commit bb2d8d9fe8
parent d0ccb6f406
3 changed files with 101 additions and 10 deletions
--- a/README.md
+++ b/README.md
@ -2,7 +2,7 @@

 an unnecessarily fast nbt decoder. like seriously you probably don't need this unless you're trying to win benchmarks.

-at the moment, simdnbt does not actually make use of simd instructions (the name is a play on simdjson). there's one place where i know i could take advantage of simd but it just hasn't been implemented yet (swapping the endianness of integer arrays).
+simdnbt currently only makes use of simd instructions for swapping the endianness of arrays, and tbh that's really only there so i can call it "simdnbt" without lying. the name is mostly a play on simdjson.

 simdnbt might be the fastest nbt decoder currently in existence. however to achieve this silly speed, it takes a couple of shortcuts:
 1. it requires a reference to the original data (to avoid cloning)
@ -11,5 +11,4 @@ simdnbt might be the fastest nbt decoder currently in existence. however to achi
 here's a benchmark with the two other fastest nbt crates (azalea-nbt was also made by me):
 ![simdnbt is ~3x faster than the second fastest nbt crate](https://github.com/mat-1/simdnbt/assets/27899617/4c252b98-628c-4d81-92cd-3c8e1a7bd023)

-
 take this with a grain of salt as they're not all doing the same work. regardless, you can still see it's very fast.
--- a/benches/nbt.rs
+++ b/benches/nbt.rs
@ -52,12 +52,12 @@ fn bench_file(filename: &str, c: &mut Criterion) {
 fn bench(c: &mut Criterion) {
    // bench_file("bigtest.nbt", c);
    // bench_file("simple_player.dat", c);
-    bench_file("complex_player.dat", c);
+    // bench_file("complex_player.dat", c);
    // bench_file("level.dat", c);
    // bench_file("stringtest.nbt", c);
    // bench_file("inttest16.nbt", c);

-    // bench_file("inttest1023.nbt", c);
+    bench_file("inttest1023.nbt", c);
    // bench_file("inttest3.nbt", c);
 }

--- a/src/lib.rs
+++ b/src/lib.rs
@ -11,10 +11,12 @@
 //! assert_eq!(nbt.string("name").unwrap().to_str(), "Bananrama");
 //! ```

+#![feature(portable_simd)]
+
 mod error;
 mod mutf8;

-use std::{io::Cursor, ops::Deref, slice};
+use std::{io::Cursor, ops::Deref, simd::prelude::*, slice};

 use byteorder::{ReadBytesExt, BE};
 pub use error::Error;
@ -70,7 +72,7 @@ fn read_u16(data: &mut Cursor<&[u8]>) -> Result<u16, Error> {

 #[inline(always)]
 fn read_with_u16_length<'a>(data: &mut Cursor<&'a [u8]>, width: usize) -> Result<&'a [u8], Error> {
-    let length: u16 = read_u16(data)?;
+    let length = read_u16(data)?;
    let length_in_bytes = length as usize * width;
    // make sure we don't read more than the length
    if data.get_ref().len() < data.position() as usize + length_in_bytes {
@ -294,12 +296,88 @@ fn read_short_array(data: &mut Cursor<&[u8]>) -> Result<Vec<i16>, Error> {
 }
 fn read_int_array(data: &mut Cursor<&[u8]>) -> Result<Vec<i32>, Error> {
    let array_bytes = read_with_u32_length(data, 4)?;
-    let mut array_bytes_cursor = Cursor::new(array_bytes);
+    assert!(array_bytes.len() % 4 == 0);
    let length = array_bytes.len() / 4;
-    let mut ints = Vec::with_capacity(length);
-    for _ in 0..length {
-        ints.push(array_bytes_cursor.read_i32::<BE>()?);
+    let mut ints = array_bytes.to_vec();
+
+    for i in 0..length / 16 {
+        let simd: u8x64 = Simd::from_slice(&ints[i * 16 * 4..(i + 1) * 16 * 4].as_ref());
+        #[rustfmt::skip]
+        let simd = simd_swizzle!(simd, [
+            3, 2, 1, 0,
+            7, 6, 5, 4,
+            11, 10, 9, 8,
+            15, 14, 13, 12,
+            19, 18, 17, 16,
+            23, 22, 21, 20,
+            27, 26, 25, 24,
+            31, 30, 29, 28,
+            35, 34, 33, 32,
+            39, 38, 37, 36,
+            43, 42, 41, 40,
+            47, 46, 45, 44,
+            51, 50, 49, 48,
+            55, 54, 53, 52,
+            59, 58, 57, 56,
+            63, 62, 61, 60,
+        ]);
+        ints[i * 16 * 4..(i + 1) * 16 * 4].copy_from_slice(simd.as_array());
    }
+
+    let mut i = length / 16 * 16;
+    if i >= 8 {
+        let simd: u8x32 = Simd::from_slice(array_bytes[i * 4..i * 4 + 32].as_ref());
+        #[rustfmt::skip]
+        let simd = simd_swizzle!(simd, [
+            3, 2, 1, 0,
+            7, 6, 5, 4,
+            11, 10, 9, 8,
+            15, 14, 13, 12,
+            19, 18, 17, 16,
+            23, 22, 21, 20,
+            27, 26, 25, 24,
+            31, 30, 29, 28,
+        ]);
+        ints[i * 4..i * 4 + 32].copy_from_slice(simd.as_array());
+        i += 8;
+    }
+    if i >= 4 {
+        let simd: u8x16 = Simd::from_slice(array_bytes[i * 4..i * 4 + 16].as_ref());
+        #[rustfmt::skip]
+        let simd = simd_swizzle!(simd, [
+            3, 2, 1, 0,
+            7, 6, 5, 4,
+            11, 10, 9, 8,
+            15, 14, 13, 12,
+        ]);
+        ints[i * 4..i * 4 + 16].copy_from_slice(simd.as_array());
+        i += 4;
+    }
+    if i >= 2 {
+        let simd: u8x8 = Simd::from_slice(array_bytes[i * 4..i * 4 + 8].as_ref());
+        #[rustfmt::skip]
+        let simd = simd_swizzle!(simd, [
+            3, 2, 1, 0,
+            7, 6, 5, 4,
+        ]);
+        ints[i * 4..i * 4 + 8].copy_from_slice(simd.as_array());
+        i += 2;
+    }
+    if i >= 1 {
+        let simd: u8x4 = Simd::from_slice(array_bytes[i * 4..i * 4 + 4].as_ref());
+        #[rustfmt::skip]
+        let simd = simd_swizzle!(simd, [
+            3, 2, 1, 0,
+        ]);
+        ints[i * 4..i * 4 + 4].copy_from_slice(simd.as_array());
+    }
+
+    let ints = {
+        let ptr = ints.as_ptr() as *const i32;
+        std::mem::forget(ints);
+        unsafe { Vec::from_raw_parts(ptr as *mut i32, length, length) }
+    };
+
    Ok(ints)
 }
 fn read_long_array(data: &mut Cursor<&[u8]>) -> Result<Vec<i64>, Error> {
@ -633,6 +711,20 @@ mod tests {
        assert_eq!(nbt.list("Rotation").unwrap().floats().unwrap().len(), 2);
    }

+    #[test]
+    fn inttest() {
+        let nbt = Nbt::new(&mut Cursor::new(include_bytes!("../tests/inttest1023.nbt")))
+            .unwrap()
+            .unwrap();
+
+        let ints = nbt.list("").unwrap().ints().unwrap();
+
+        for (i, &item) in ints.iter().enumerate() {
+            assert_eq!(i as i32, item);
+        }
+        assert_eq!(ints.len(), 1023);
+    }
+
    // #[test]
    // fn generate_inttest() {
    //     use byteorder::WriteBytesExt;