Skip to content

Commit 8fd3707

Browse files
authored
blobby: add number of entries to the file header (#1207)
This change allows the use of simpler one-pass decoding at the cost of adding one or two bytes to each blb file. Previously, we had to perform a separate pass to find the number of entries in the file, which could make compilation for large blb files noticeably slower.
1 parent 303c9e8 commit 8fd3707

File tree

6 files changed

+109
-82
lines changed

6 files changed

+109
-82
lines changed

blobby/CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
77
## 0.4.0 (unreleased)
88
### Changed
99
- Edition changed to 2024 and MSRV bumped to 1.85 ([#1149])
10+
- Replaced iterators with `const fn` parsing ([#1187])
11+
- Changed the file format: the file header now contains the total number of stored blobs ([#1207])
1012

1113
[#1149]: https://github.com/RustCrypto/utils/pull/1149
14+
[#1187]: https://github.com/RustCrypto/utils/pull/1187
15+
[#1207]: https://github.com/RustCrypto/utils/pull/1207
1216

1317
## 0.3.1 (2021-12-07)
1418
### Added

blobby/README.md

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,14 @@
77
![Rust Version][rustc-image]
88
[![Project Chat][chat-image]][chat-link]
99

10-
Iterators over a simple binary blob storage.
10+
An encoding and decoding library for the Blobby (`blb`) file format, which serves as a simple,
11+
deduplicated storage format for a sequence of binary blobs.
1112

1213
## Examples
1314
```
1415
// We recommend saving blobby data into separate files and
1516
// using the `include_bytes!` macro
16-
static BLOBBY_DATA: &[u8] = b"\x02\x05hello\x06world!\x01\x02 \x00\x03\x06:::\x03\x01\x00";
17+
static BLOBBY_DATA: &[u8; 27] = b"\x08\x02\x05hello\x06world!\x01\x02 \x00\x03\x06:::\x03\x01\x00";
1718
1819
static SLICE: &[&[u8]] = blobby::parse_into_slice!(BLOBBY_DATA);
1920
@@ -54,7 +55,7 @@ assert_eq!(
5455
assert_eq!(ITEMS.len(), 2);
5556
```
5657

57-
## Encoding and decoding
58+
## Encoding and decoding utilities
5859

5960
This crate provides encoding and decoding utilities for converting between
6061
the blobby format and text file with hex-encoded strings.
@@ -97,9 +98,7 @@ This file can be converted to the Blobby format by running the following command
9798
cargo run --release --features alloc --bin encode -- /path/to/input.txt /path/to/output.blb
9899
```
99100

100-
This will create a file which can be read using `blobby::Blob2Iterator`.
101-
102-
To see contents of an existing Blobby file you can use the following command:
101+
To inspect contents of an existing Blobby file you can use the following command:
103102
```sh
104103
cargo run --release --features alloc --bin decode -- /path/to/input.blb /path/to/output.txt
105104
```
@@ -109,20 +108,22 @@ in the input file.
109108
## Storage format
110109

111110
Storage format represents a sequence of binary blobs. The format uses
112-
git-flavored [variable-length quantity][0] (VLQ) for encoding unsigned
111+
git-flavored [variable-length quantity][VLQ] (VLQ) for encoding unsigned
113112
numbers.
114113

115-
File starts with a number of de-duplicated blobs `d`. It followed by `d`
116-
entries. Each entry starts with an integer `m`, immediately followed by `m`
114+
Blobby files start with two numbers: total number of blobs in the file `n` and
115+
number of de-duplicated blobs `d`. The numbers are followed by `d` entries.
116+
Each entry starts with an integer `m`, immediately followed by `m`
117117
bytes representing de-duplicated binary blob.
118118

119-
Next follows unspecified number of entries representing sequence of stored
120-
blobs. Each entry starts with an unsigned integer `n`. The least significant
119+
Next follow `n` entries representing the sequence of stored blobs.
120+
Each entry starts with an unsigned integer `l`. The least significant
121121
bit of this integer is used as a flag. If the flag is equal to 0, then the
122122
number is followed by `l >> 1` bytes, representing a stored binary blob.
123-
Otherwise the entry references a de-duplicated entry number `n >> 1`.
123+
Otherwise the entry references a de-duplicated entry number `l >> 1`
124+
which should be smaller than `d`.
124125

125-
[0]: https://en.wikipedia.org/wiki/Variable-length_quantity
126+
[VLQ]: https://en.wikipedia.org/wiki/Variable-length_quantity
126127

127128
## License
128129

blobby/src/decode.rs

Lines changed: 34 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -50,51 +50,38 @@ macro_rules! try_read_vlq {
5050
};
5151
}
5252

53-
pub const fn parse_dedup_len(mut data: &[u8]) -> Result<usize, Error> {
54-
read_vlq(&mut data)
53+
/// Blobby file header
54+
pub struct Header {
55+
/// Number of blobs stored in the file
56+
pub items_len: usize,
57+
/// Number of deduplicated blobs
58+
pub dedup_len: usize,
5559
}
5660

57-
pub const fn parse_items_len(mut data: &[u8]) -> Result<usize, Error> {
58-
let dedup_index_len = try_read_vlq!(data);
59-
60-
let mut i = 0;
61-
while i < dedup_index_len {
62-
let m = try_read_vlq!(data);
63-
let split = data.split_at(m);
64-
data = split.1;
65-
i += 1;
66-
}
67-
68-
let mut i = 0;
69-
loop {
70-
if data.is_empty() {
71-
return Ok(i);
61+
impl Header {
62+
/// Parse blobby header
63+
pub const fn parse(data: &mut &[u8]) -> Result<Self, Error> {
64+
match (read_vlq(data), read_vlq(data)) {
65+
(Ok(items_len), Ok(dedup_len)) => Ok(Header {
66+
items_len,
67+
dedup_len,
68+
}),
69+
(Err(err), _) | (Ok(_), Err(err)) => Err(err),
7270
}
73-
let val = try_read_vlq!(data);
74-
// the least significant bit is used as a flag
75-
let is_ref = (val & 1) != 0;
76-
let val = val >> 1;
77-
if is_ref {
78-
if val >= dedup_index_len {
79-
return Err(Error::InvalidIndex);
80-
}
81-
} else {
82-
if val > data.len() {
83-
return Err(Error::UnexpectedEnd);
84-
}
85-
let split = data.split_at(val);
86-
data = split.1;
87-
};
88-
i += 1;
8971
}
9072
}
9173

9274
/// Parse blobby data into an array.
93-
pub const fn parse_into_array<const ITEMS: usize, const DEDUP_LEN: usize>(
75+
pub const fn parse_into_array<const ITEMS_LEN: usize, const DEDUP_LEN: usize>(
9476
mut data: &[u8],
95-
) -> Result<[&[u8]; ITEMS], Error> {
96-
if try_read_vlq!(data) != DEDUP_LEN {
97-
return Err(Error::BadArrayLen);
77+
) -> Result<[&[u8]; ITEMS_LEN], Error> {
78+
match Header::parse(&mut data) {
79+
Ok(header) => {
80+
if header.items_len != ITEMS_LEN || header.dedup_len != DEDUP_LEN {
81+
return Err(Error::BadArrayLen);
82+
}
83+
}
84+
Err(err) => return Err(err),
9885
}
9986

10087
let mut dedup_index: [&[u8]; DEDUP_LEN] = [&[]; DEDUP_LEN];
@@ -108,7 +95,7 @@ pub const fn parse_into_array<const ITEMS: usize, const DEDUP_LEN: usize>(
10895
i += 1;
10996
}
11097

111-
let mut res: [&[u8]; ITEMS] = [&[]; ITEMS];
98+
let mut res: [&[u8]; ITEMS_LEN] = [&[]; ITEMS_LEN];
11299

113100
let mut i = 0;
114101
while i < res.len() {
@@ -144,7 +131,10 @@ pub const fn parse_into_array<const ITEMS: usize, const DEDUP_LEN: usize>(
144131
pub fn parse_into_vec(mut data: &[u8]) -> Result<alloc::vec::Vec<&[u8]>, Error> {
145132
use alloc::{vec, vec::Vec};
146133

147-
let dedup_len = try_read_vlq!(data);
134+
let Header {
135+
items_len,
136+
dedup_len,
137+
} = Header::parse(&mut data)?;
148138

149139
let mut dedup_index: Vec<&[u8]> = vec![&[]; dedup_len];
150140

@@ -157,7 +147,6 @@ pub fn parse_into_vec(mut data: &[u8]) -> Result<alloc::vec::Vec<&[u8]>, Error>
157147
i += 1;
158148
}
159149

160-
let items_len = parse_items_len(data)?;
161150
let mut res: Vec<&[u8]> = vec![&[]; items_len];
162151

163152
let mut i = 0;
@@ -189,20 +178,15 @@ pub fn parse_into_vec(mut data: &[u8]) -> Result<alloc::vec::Vec<&[u8]>, Error>
189178
#[macro_export]
190179
macro_rules! parse_into_slice {
191180
($data:expr) => {{
192-
const ITEMS_LEN: usize = {
193-
match $crate::parse_items_len($data) {
181+
const HEADER: $crate::Header = {
182+
let mut data: &[u8] = $data;
183+
match $crate::Header::parse(&mut data) {
194184
Ok(v) => v,
195185
Err(_) => panic!("Failed to parse items len"),
196186
}
197187
};
198-
const DEDUP_LEN: usize = {
199-
match $crate::parse_dedup_len($data) {
200-
Ok(v) => v,
201-
Err(_) => panic!("Failed to parse dedup len"),
202-
}
203-
};
204-
const ITEMS: [&[u8]; ITEMS_LEN] = {
205-
match $crate::parse_into_array::<ITEMS_LEN, DEDUP_LEN>($data) {
188+
const ITEMS: [&[u8]; { HEADER.items_len }] = {
189+
match $crate::parse_into_array::<{ HEADER.items_len }, { HEADER.dedup_len }>($data) {
206190
Ok(v) => v,
207191
Err(_) => panic!("Failed to parse items"),
208192
}

blobby/src/encode.rs

Lines changed: 25 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -30,59 +30,66 @@ fn encode_vlq(mut val: usize, buf: &mut [u8; 4]) -> &[u8] {
3030
/// Returns the encoded data together with a count of the number of blobs included in the index.
3131
///
3232
/// The encoded file format is:
33-
/// - count of index entries=N
34-
/// - N x index entries, each encoded as:
33+
/// - number of blobs in the file = N
34+
/// - number of deduplicated index entries = M
35+
/// - M x index entries encoded as:
3536
/// - size L of index entry (VLQ)
3637
/// - index blob contents (L bytes)
37-
/// - repeating encoded blobs, each encoded as:
38+
/// - N x blobs encoded as:
3839
/// - VLQ value that is either:
3940
/// - (J << 1) | 0x01: indicates this blob is index entry J
4041
/// - (L << 1) | 0x00: indicates an explicit blob of len L
4142
/// - (in the latter case) explicit blob contents (L bytes)
42-
pub fn encode_blobs<'a, I, T>(blobs: &'a I) -> (alloc::vec::Vec<u8>, usize)
43+
pub fn encode_blobs<T>(blobs: &[T]) -> (alloc::vec::Vec<u8>, usize)
4344
where
44-
&'a I: IntoIterator<Item = &'a T>,
45-
T: AsRef<[u8]> + 'a,
45+
T: AsRef<[u8]>,
4646
{
4747
use alloc::{collections::BTreeMap, vec::Vec};
4848

49-
let mut idx_map = BTreeMap::new();
49+
let mut dedup_map = BTreeMap::new();
5050
blobs
51-
.into_iter()
51+
.iter()
5252
.map(|v| v.as_ref())
5353
.filter(|blob| !blob.is_empty())
5454
.for_each(|blob| {
55-
let v = idx_map.entry(blob.as_ref()).or_insert(0);
55+
let v = dedup_map.entry(blob.as_ref()).or_insert(0);
5656
*v += 1;
5757
});
5858

59-
let mut idx: Vec<&[u8]> = idx_map
59+
let mut dedup_list: Vec<&[u8]> = dedup_map
6060
.iter()
6161
.filter(|&(_, &v)| v > 1)
6262
.map(|(&k, _)| k)
6363
.collect();
64-
idx.sort_by_key(|e| {
64+
dedup_list.sort_by_key(|e| {
6565
let k = match e {
6666
[0] => 2,
6767
[1] => 1,
6868
_ => 0,
6969
};
70-
(k, idx_map.get(e).unwrap())
70+
(k, dedup_map.get(e).unwrap())
7171
});
72-
idx.reverse();
73-
let idx_len = idx.len();
72+
dedup_list.reverse();
73+
let idx_len = dedup_list.len();
7474

75-
let rev_idx: BTreeMap<&[u8], usize> = idx.iter().enumerate().map(|(i, &e)| (e, i)).collect();
75+
let rev_idx: BTreeMap<&[u8], usize> = dedup_list
76+
.iter()
77+
.enumerate()
78+
.map(|(i, &e)| (e, i))
79+
.collect();
7680

7781
let mut out_buf = Vec::new();
7882
let mut buf = [0u8; 4];
79-
out_buf.extend_from_slice(encode_vlq(idx.len(), &mut buf));
80-
for e in idx {
83+
84+
out_buf.extend_from_slice(encode_vlq(blobs.len(), &mut buf));
85+
out_buf.extend_from_slice(encode_vlq(dedup_list.len(), &mut buf));
86+
87+
for e in dedup_list {
8188
out_buf.extend_from_slice(encode_vlq(e.len(), &mut buf));
8289
out_buf.extend_from_slice(e);
8390
}
8491

85-
for blob in blobs.into_iter().map(|v| v.as_ref()) {
92+
for blob in blobs.iter().map(|v| v.as_ref()) {
8693
if let Some(dup_pos) = rev_idx.get(blob) {
8794
let n = (dup_pos << 1) + 1usize;
8895
out_buf.extend_from_slice(encode_vlq(n, &mut buf));

blobby/src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ extern crate alloc;
1313
pub(crate) mod decode;
1414
#[cfg(feature = "alloc")]
1515
pub use decode::parse_into_vec;
16-
pub use decode::{parse_dedup_len, parse_into_array, parse_items_len};
16+
pub use decode::{Header, parse_into_array};
1717

1818
#[cfg(feature = "alloc")]
1919
mod encode;

blobby/tests/mod.rs

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
#![cfg(feature = "alloc")]
2+
3+
const ITEMS_LEN: usize = 10;
4+
const DEDUP_LEN: usize = 3;
5+
const TEST_BLOBS: &[&[u8]; ITEMS_LEN] = &[
6+
b"1",
7+
b"12",
8+
b"1",
9+
b"1",
10+
b"123",
11+
&[42; 100_000],
12+
&[42; 100_000],
13+
&[13; 7_000],
14+
&[13; 7_000],
15+
&[13; 5_000],
16+
];
17+
18+
#[test]
19+
fn blobby_rondtrip_test() -> Result<(), blobby::Error> {
20+
let (blobby_data, dedup_len) = blobby::encode_blobs(TEST_BLOBS);
21+
assert_eq!(dedup_len, DEDUP_LEN);
22+
assert_eq!(blobby_data.len(), 112_025);
23+
24+
let decoded_blobs = blobby::parse_into_array::<ITEMS_LEN, DEDUP_LEN>(&blobby_data)?;
25+
assert_eq!(decoded_blobs, TEST_BLOBS[..]);
26+
27+
let decoded_blobs = blobby::parse_into_vec(&blobby_data)?;
28+
assert_eq!(decoded_blobs, TEST_BLOBS[..]);
29+
30+
Ok(())
31+
}

0 commit comments

Comments
 (0)