diff --git a/Cargo.toml b/Cargo.toml index a365d8e..b4f77c3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,20 +15,21 @@ categories = ["caching", "data-structures"] travis-ci = { repository = "anderslanglands/ustr", branch = "master" } [dependencies] +byteorder = "1.5" +lazy_static = "1.5" +parking_lot = "0.12" +serde = { version = "1", optional = true } ahash = { version = "0.8.3", default-features = false } -byteorder = "1.4.3" -lazy_static = "1.4.0" -parking_lot = "0.12.1" -serde = { version = "1.0", optional = true } + [dev-dependencies] -criterion = "0.4.0" -crossbeam-channel = "0.5.0" -crossbeam-utils = "0.8.1" -libc = "0.2.62" -serde_json = "1.0" -string-interner = "0.13.0" -string_cache = "0.8.1" +criterion = "0.4" +crossbeam-channel = "0.5" +crossbeam-utils = "0.8" +libc = "0.2" +serde_json = "1" +string-interner = "0.13" +string_cache = "0.8" [[bench]] name = "creation" diff --git a/README.md b/README.md index 2075586..bd2d59a 100644 --- a/README.md +++ b/README.md @@ -39,15 +39,15 @@ is directy ported from OIIO. use ustr::{Ustr, ustr}; // Creation is quick and easy using either `Ustr::from` or the `ustr` short -// function and only one copy of any string is stored +// function and only one copy of any string is stored. let h1 = Ustr::from("hello"); let h2 = ustr("hello"); -// Comparisons and copies are extremely cheap +// Comparisons and copies are extremely cheap. let h3 = h1; assert_eq!(h2, h3); -// You can pass straight to FFI +// You can pass straight to FFI. let len = unsafe { libc::strlen(h1.as_char_ptr()) }; @@ -58,7 +58,7 @@ assert_eq!(len, 5); // the UstrMap and UstrSet exports: use ustr::UstrMap; -// Key type is always Ustr +// Key type is always Ustr. 
let mut map: UstrMap = UstrMap::default(); map.insert(u1, 17); assert_eq!(*map.get(&u1).unwrap(), 17); @@ -100,7 +100,8 @@ If you are writing a library that uses ustr and want users to be able to create ## Changelog ### Changes since 0.10 -* Actually renamed use of "serialization" feature to "serde" + +* Actually renamed `serialization` feature to `serde` ### Changes since 0.9 @@ -334,11 +335,11 @@ I use it regularly on 64-bit systems, and it has passed Miri on a 32-bit system as well, bit 32-bit is not checked regularly. If you want to use it on 32-bit, please make sure to run Miri and open and issue if you find any problems. -## Licence +## License BSD+ License -Copyright © 2019—2020 Anders Langlands +Copyright © 2019—2024 Anders Langlands Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -387,9 +388,10 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. Contains code ported from [OpenImageIO](https://github.com/OpenImageIO/oiio), -BSD 3-clause licence. +BSD 3-clause license. -Contains a copy of Max Woolf's [Big List of Naughty Strings](https://github.com/minimaxir/big-list-of-naughty-strings), MIT licence. +Contains a copy of Max Woolf's [Big List of Naughty Strings](https://github.com/minimaxir/big-list-of-naughty-strings), +MIT license. Contains some strings from -[SecLists](https://github.com/danielmiessler/SecLists), MIT licence. +[SecLists](https://github.com/danielmiessler/SecLists), MIT license. 
diff --git a/benches/creation.rs b/benches/creation.rs index 93d8e0a..3e59aa1 100644 --- a/benches/creation.rs +++ b/benches/creation.rs @@ -10,10 +10,7 @@ use string_interner::StringInterner; use ustr::*; -#[cfg(not(feature = "spinlock"))] use parking_lot::Mutex; -#[cfg(feature = "spinlock")] -use spin::Mutex; fn criterion_benchmark(c: &mut Criterion) { let path = diff --git a/src/bumpalloc.rs b/src/bumpalloc.rs index 09e750d..81d731f 100644 --- a/src/bumpalloc.rs +++ b/src/bumpalloc.rs @@ -39,12 +39,12 @@ impl LeakyBumpAlloc { // Allocates a new chunk. Aborts if out of memory. pub unsafe fn allocate(&mut self, num_bytes: usize) -> *mut u8 { - // our new ptr will be offset down the heap by num_bytes bytes + // Our new ptr will be offset down the heap by num_bytes bytes. let ptr = self.ptr as usize; let new_ptr = ptr.checked_sub(num_bytes).expect("ptr sub overflowed"); - // round down to alignment + // Round down to alignment. let new_ptr = new_ptr & !(self.layout.align() - 1); - //check we have enough capacity + // Check we have enough capacity. let start = self.start as usize; if new_ptr < start { eprintln!( @@ -52,7 +52,8 @@ impl LeakyBumpAlloc { self.end as usize - new_ptr, self.capacity() ); - // we have to abort here rather than panic or the mutex may deadlock + // We have to abort here rather than panic or the mutex may + // deadlock. std::process::abort(); } diff --git a/src/hash.rs b/src/hash.rs index 5048be5..fbffe6f 100644 --- a/src/hash.rs +++ b/src/hash.rs @@ -1,16 +1,19 @@ use super::Ustr; use byteorder::{ByteOrder, NativeEndian}; -use std::collections::{HashMap, HashSet}; -use std::hash::{BuildHasherDefault, Hasher}; +use std::{ + collections::{HashMap, HashSet}, + hash::{BuildHasherDefault, Hasher}, +}; /// A standard `HashMap` using `Ustr` as the key type with a custom `Hasher` -/// that just uses the precomputed hash for speed instead of calculating it +/// that just uses the precomputed hash for speed instead of calculating it. 
pub type UstrMap = HashMap>; + /// A standard `HashSet` using `Ustr` as the key type with a custom `Hasher` -/// that just uses the precomputed hash for speed instead of calculating it +/// that just uses the precomputed hash for speed instead of calculating it. pub type UstrSet = HashSet>; -/// The worst hasher in the world - the identity hasher. +/// The worst hasher in the world -- the identity hasher. #[doc(hidden)] #[derive(Default)] pub struct IdentityHasher { diff --git a/src/lib.rs b/src/lib.rs index fe277ba..c573a5b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,19 +1,26 @@ //! Fast, FFI-friendly string interning. A [`Ustr`] (**U**nique **Str**) is a //! lightweight handle representing a static, immutable entry in a global string //! cache, allowing for: +//! //! * Extremely fast string assignment and comparisons -- it's just a pointer //! comparison. +//! //! * Efficient storage -- only one copy of the string is held in memory, and //! getting access to it is just a pointer indirection. -//! * Fast hashing -- the precomputed hash is stored with the string +//! +//! * Fast hashing -- the precomputed hash is stored with the string. +//! //! * Fast FFI -- the string is stored with a terminating null byte so can be -//! passed to C directly without doing the CString dance. +//! passed to C directly without doing the `CString` dance. //! //! The downside is no strings are ever freed, so if you're creating lots and //! lots of strings, you might run out of memory. On the other hand, War and //! Peace is only 3MB, so it's probably fine. //! -//! This crate is based on [OpenImageIO's ustring](https://github.com/OpenImageIO/oiio/blob/master/src/include/OpenImageIO/ustring.h) but it is NOT binary-compatible (yet). The underlying hash map implementation is directy ported from OIIO. +//! This crate is based on [OpenImageIO's](https://openimageio.readthedocs.io/en/v2.4.10.0/) +//! 
(OIIO) [`ustring`](https://github.com/OpenImageIO/oiio/blob/master/src/include/OpenImageIO/ustring.h) +//! but it is *not* binary-compatible (yet). The underlying hash map +//! implementation is directy ported from OIIO. //! //! # Usage //! @@ -22,21 +29,21 @@ //! //! # unsafe { ustr::_clear_cache() }; //! // Creation is quick and easy using either `Ustr::from` or the ustr function -//! // and only one copy of any string is stored +//! // and only one copy of any string is stored. //! let u1 = Ustr::from("the quick brown fox"); //! let u2 = ustr("the quick brown fox"); //! -//! // Comparisons and copies are extremely cheap +//! // Comparisons and copies are extremely cheap. //! let u3 = u1; //! assert_eq!(u2, u3); //! -//! // You can pass straight to FFI +//! // You can pass straight to FFI. //! let len = unsafe { //! libc::strlen(u1.as_char_ptr()) //! }; //! assert_eq!(len, 19); //! -//! // Use as_str() to get a &str +//! // Use as_str() to get a `str`. //! let words: Vec<&str> = u1.as_str().split_whitespace().collect(); //! assert_eq!(words, ["the", "quick", "brown", "fox"]); //! @@ -45,7 +52,7 @@ //! // the UstrMap and UstrSet exports: //! use ustr::UstrMap; //! -//! // Key type is always Ustr +//! // Key type is always `Ustr`. //! let mut map: UstrMap = UstrMap::default(); //! map.insert(u1, 17); //! assert_eq!(*map.get(&u1).unwrap(), 17); @@ -89,12 +96,16 @@ //! //! - Each individual `Ustr` is very small -- in fact, we guarantee that a //! `Ustr` is the same size and memory layout as an ordinary `*u8`. +//! //! - Storage is frugal, since there is only one allocated copy of each unique //! character sequence, throughout the lifetime of the program. +//! //! - Assignment from one `Ustr` to another is just copy of the pointer; no //! allocation, no character copying, no reference counting. +//! //! - Equality testing (do the strings contain the same characters) is a //! single operation, the comparison of the pointer. +//! //! 
- Memory allocation only occurs when a new `Ustr` is constructed from raw //! characters the FIRST time -- subsequent constructions of the same string //! just finds it in the canonial string set, but doesn't need to allocate @@ -108,25 +119,33 @@ //! across. //! //! On the whole, `Ustr`s are a really great string representation +//! //! - if you tend to have (relatively) few unique strings, but many copies of //! those strings; +//! //! - if the creation of strings from raw characters is relatively rare //! compared to copying or comparing to existing strings; +//! //! - if you tend to make the same strings over and over again, and if it's //! relatively rare that a single unique character sequence is used only //! once in the entire lifetime of the program; +//! //! - if your most common string operations are assignment and equality //! testing and you want them to be as fast as possible; +//! //! - if you are doing relatively little character-by-character assembly of //! strings, string concatenation, or other "string manipulation" (other //! than equality testing). //! //! `Ustr`s are not so hot +//! //! - if your program tends to have very few copies of each character sequence //! over the entire lifetime of the program; +//! //! - if your program tends to generate a huge variety of unique strings over //! its lifetime, each of which is used only a short time and then //! discarded, never to be needed again; +//! //! - if you don't need to do a lot of string assignment or equality testing, //! but lots of more complex string manipulation. //! @@ -139,15 +158,26 @@ //! use it on 32-bit, please make sure to run Miri and open and issue if you //! find any problems. 
use parking_lot::Mutex; -use std::borrow::Cow; -use std::ffi::OsStr; -use std::fmt; +use std::{ + borrow::Cow, + cmp::Ordering, + ffi::{CStr, OsStr}, + fmt, + hash::{Hash, Hasher}, + mem::size_of, + ops::Deref, + os::raw::c_char, + path::Path, + ptr::NonNull, + rc::Rc, + slice, str, + str::FromStr, + sync::Arc, +}; -use std::mem::size_of; -use std::path::Path; -use std::rc::Rc; -use std::str::FromStr; -use std::sync::Arc; +mod hash; +pub use hash::*; +mod bumpalloc; mod stringcache; pub use stringcache::*; @@ -156,37 +186,32 @@ pub mod serialization; #[cfg(feature = "serde")] pub use serialization::DeserializedCache; -mod bumpalloc; - -mod hash; -pub use hash::*; -use std::cmp::Ordering; -use std::hash::{Hash, Hasher}; -use std::ptr::NonNull; - /// A handle representing a string in the global string cache. /// -/// To use, create one using `Ustr::from` or the `ustr` function. You can freely -/// copy, destroy or send Ustrs to other threads: the underlying string is -/// always valid in memory (and is never destroyed). +/// To use, create one using [`Ustr::from`] or the [`ustr`] function. You can +/// freely copy, destroy or send `Ustr`s to other threads: the underlying string +/// is always valid in memory (and is never destroyed). #[derive(Copy, Clone, PartialEq)] #[repr(transparent)] pub struct Ustr { char_ptr: NonNull, } -/// Defer to &str for equality - lexicographic ordering will be slower than -/// pointer comparison, but much less surprising if you use Ustrs as keys in -/// e.g. a BTreeMap +/// Defer to `str` for equality. +/// +/// Lexicographic ordering will be slower than pointer comparison, but much less +/// surprising if you use `Ustr`s as keys in e.g. a `BTreeMap`. impl Ord for Ustr { fn cmp(&self, other: &Self) -> Ordering { self.as_str().cmp(other.as_str()) } } -/// Defer to &str for equality - lexicographic ordering will be slower than -/// pointer comparison, but much less surprising if you use Ustrs as keys in -/// e.g. 
a BTreeMap +/// Defer to `str` for equality. +/// +/// Lexicographic ordering will be slower than pointer comparison, but much less +/// surprising if you use `Ustr`s as keys in e.g. a `BTreeMap`. +#[allow(clippy::non_canonical_partial_ord_impl)] impl PartialOrd for Ustr { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) @@ -194,9 +219,12 @@ impl PartialOrd for Ustr { } } impl Ustr { - /// Create a new Ustr from the given &str. + /// Create a new `Ustr` from the given `str`. + /// + /// You can also use the [`ustr`] function. + /// + /// # Examples /// - /// You can also use the ustr function /// ``` /// use ustr::{Ustr, ustr as u}; /// # unsafe { ustr::_clear_cache() }; @@ -233,7 +261,10 @@ impl Ustr { }) } - /// Get the cached string as a &str + /// Get the cached `Ustr` as a `str`. + /// + /// # Examples + /// /// ``` /// use ustr::ustr as u; /// # unsafe { ustr::_clear_cache() }; @@ -250,17 +281,19 @@ impl Ustr { // All these are guaranteed by StringCache::insert() and by the fact // we can only construct a Ustr from a valid &str. unsafe { - std::str::from_utf8_unchecked(std::slice::from_raw_parts( + str::from_utf8_unchecked(slice::from_raw_parts( self.char_ptr.as_ptr(), self.len(), )) } } - /// Get the cached string as a C char*. + /// Get the cached string as a C `char*`. /// /// This includes the null terminator so is safe to pass straight to FFI. /// + /// # Examples + /// /// ``` /// use ustr::ustr as u; /// # unsafe { ustr::_clear_cache() }; @@ -273,32 +306,36 @@ impl Ustr { /// ``` /// /// # Safety - /// This is just passing a raw byte array with a null terminator to C. - /// If your source string contains non-ascii bytes then this will pass them + /// + /// This is just passing a raw byte array with a null terminator to C. If + /// your source string contains non-ascii bytes then this will pass them /// straight along with no checking. + /// /// The string is **immutable**. 
That means that if you modify it across the /// FFI boundary then all sorts of terrible things will happen. - pub fn as_char_ptr(&self) -> *const std::os::raw::c_char { - self.char_ptr.as_ptr() as *const std::os::raw::c_char + pub fn as_char_ptr(&self) -> *const c_char { + self.char_ptr.as_ptr() as *const c_char } - /// Get this ustr as a CStr + /// Get this `Ustr` as a [`CStr`] /// - /// This is useful for passing to APIs (like ash) that use CStr + /// This is useful for passing to APIs (like ash) that use `CStr`. /// /// # Safety - /// This function by itself is safe as the pointer and length are - /// guaranteed to be valid. All the same caveats for the use of the CStr - /// as given in the CSstr docs apply - pub fn as_cstr(&self) -> &std::ffi::CStr { + /// + /// This function by itself is safe as the pointer and length are guaranteed + /// to be valid. All the same caveats for the use of the `CStr` as given in + /// the `CStr` docs apply. + pub fn as_cstr(&self) -> &CStr { unsafe { - std::ffi::CStr::from_bytes_with_nul_unchecked( - std::slice::from_raw_parts(self.as_ptr(), self.len() + 1), - ) + CStr::from_bytes_with_nul_unchecked(slice::from_raw_parts( + self.as_ptr(), + self.len() + 1, + )) } } - // Get a raw pointer to the StringCacheEntry + /// Get a raw pointer to the `StringCacheEntry`. #[inline] fn as_string_cache_entry(&self) -> &StringCacheEntry { // The allocator guarantees that the alignment is correct and that @@ -317,7 +354,7 @@ impl Ustr { self.len() == 0 } - /// Get the precomputed hash for this string + /// Get the precomputed hash for this string. #[inline] pub fn precomputed_hash(&self) -> u64 { self.as_string_cache_entry().hash @@ -566,7 +603,7 @@ impl Default for Ustr { } } -impl std::ops::Deref for Ustr { +impl Deref for Ustr { type Target = str; fn deref(&self) -> &Self::Target { self.as_str() @@ -595,11 +632,12 @@ impl Hash for Ustr { /// DO NOT CALL THIS. 
/// -/// Clears the cache - used for benchmarking and testing purposes to clear the +/// Clears the cache -- used for benchmarking and testing purposes to clear the /// cache. Calling this will invalidate any previously created `UStr`s and /// probably cause your house to burn down. DO NOT CALL THIS. /// /// # Safety +/// /// DO NOT CALL THIS. #[doc(hidden)] pub unsafe fn _clear_cache() { @@ -609,7 +647,7 @@ pub unsafe fn _clear_cache() { } /// Returns the total amount of memory allocated and in use by the cache in -/// bytes +/// bytes. pub fn total_allocated() -> usize { STRING_CACHE .0 @@ -622,7 +660,7 @@ pub fn total_allocated() -> usize { .sum() } -/// Returns the total amount of memory reserved by the cache in bytes +/// Returns the total amount of memory reserved by the cache in bytes. pub fn total_capacity() -> usize { STRING_CACHE .0 @@ -634,7 +672,9 @@ pub fn total_capacity() -> usize { .sum() } -/// Create a new Ustr from the given &str. +/// Create a new `Ustr` from the given `str`. +/// +/// # Examples /// /// ``` /// use ustr::ustr; @@ -650,8 +690,10 @@ pub fn ustr(s: &str) -> Ustr { Ustr::from(s) } -/// Create a new Ustr from the given &str but only if it already exists in the -/// string cache. +/// Create a new `Ustr` from the given `str` but only if it already exists in +/// the string cache. +/// +/// # Examples /// /// ``` /// use ustr::{ustr, existing_ustr}; @@ -672,6 +714,7 @@ pub fn existing_ustr(s: &str) -> Option { /// serialization. /// /// # Examples +/// /// ``` /// # use ustr::{Ustr, ustr, ustr as u}; /// # #[cfg(feature="serde")] @@ -684,11 +727,13 @@ pub fn cache() -> &'static Bins { &STRING_CACHE } -/// Returns the number of unique strings in the cache +/// Returns the number of unique strings in the cache. /// /// This may be an underestimate if other threads are writing to the cache /// concurrently. 
/// +/// # Examples +/// /// ``` /// use ustr::ustr as u; /// @@ -725,6 +770,7 @@ pub fn num_entries_per_bin() -> Vec { /// might not show up in the view of the cache presented by this iterator. /// /// # Safety +/// /// This returns an iterator to the state of the cache at the time when /// `string_cache_iter()` was called. It is of course possible that another /// thread will add more strings to the cache after this, but since we never @@ -769,6 +815,10 @@ pub fn string_cache_iter() -> StringCacheIterator { } } +/// The type used for the global string cache. +/// +/// This is exposed to allow e.g. serialization of the data returned by the +/// [`cache()`] function. #[repr(transparent)] pub struct Bins(pub(crate) [Mutex; NUM_BINS]); @@ -1007,7 +1057,7 @@ mod tests { assert_eq!(diff.len(), 0); } - #[cfg(feature = "serde")] + #[cfg(all(feature = "serde", not(miri)))] #[test] fn serialization_ustr() { let _t = TEST_LOCK.lock(); diff --git a/src/serialization.rs b/src/serialization.rs index e6cf935..bbe175e 100644 --- a/src/serialization.rs +++ b/src/serialization.rs @@ -1,5 +1,4 @@ use super::*; - use serde::{ de::{Deserialize, Deserializer, Error, SeqAccess, Visitor}, ser::{Serialize, SerializeSeq, Serializer}, diff --git a/src/stringcache.rs b/src/stringcache.rs index 45cad27..f82c49c 100644 --- a/src/stringcache.rs +++ b/src/stringcache.rs @@ -1,12 +1,12 @@ use super::bumpalloc::LeakyBumpAlloc; -// StringCache stores a Vec of pointers to the StringCacheEntry structs. The -// actual memory for the StringCacheEntry is stored in the LeakyBumpAlloc, and -// each Alloc is rotated out when it's full and a new one twice its size is -// allocated. The Allocator memory is never freed so our strings essentialy have -// a 'static lifetime. +// `StringCache` stores a `Vec` of pointers to the `StringCacheEntry` structs. 
+// The actual memory for the `StringCacheEntry` is stored in the LeakyBumpAlloc, +// and each `Alloc` is rotated out when it's full and a new one twice its size +// is allocated. The Allocator memory is never freed so our strings essentialy +// have a 'static lifetime. // -// The actual memory representation is as follows. Each StringCacheEntry is +// The actual memory representation is as follows. Each `StringCacheEntry` is // aligned to 8 bytes on a 64-bit system. The 64-bit memoized hash of the string // is stored first, then a usize length, then the u8 characters, followed by a // null terminator (not included in len), then x<8 bytes of uninitialized memory @@ -18,14 +18,14 @@ use super::bumpalloc::LeakyBumpAlloc; // ^ StringCacheEntry ^ u8 chars ^ null ^ Next entry // // Proper alignment is guaranteed when allocating each entry as the alignment -// is baked into the allocator. StringCache is responsible for monitoring the -// Allocator and creating a new one when it would overflow - the Alloc itself -// will just abort() if it runs out of memory. Note that we abort() rather than -// panic because the behaviour of the spinlock in case of a panic while holding -// the lock is undefined. +// is baked into the allocator. `StringCache` is responsible for monitoring the +// Allocator and creating a new one when it would overflow -- the `Alloc` itself +// will just `abort()` if it runs out of memory. Note that we abort() rather +// than panic because the behaviour of the spinlock in case of a panic while +// holding the lock is undefined. // -// Thread safety is ensured because we can only access the StringCache through -// the spinlock in the lazy_static ref. The initial capacity of the cache is +// Thread safety is ensured because we can only access the `StringCache` through +// the spinlock in the `lazy_static` ref. The initial capacity of the cache is // divided evenly among a number of 'bins' or shards each with their own lock, // in order to reduce contention. 
#[repr(align(128))] @@ -36,9 +36,9 @@ pub(crate) struct StringCache { num_entries: usize, mask: usize, total_allocated: usize, - // padding and aligning to 128 bytes gives up to 20% performance + // Padding and aligning to 128 bytes gives up to 20% performance // improvement this actually aligns to 256 bytes because of the Mutex - // around it + // around it. _pad: [u32; 3], } @@ -63,13 +63,13 @@ impl StringCache { std::mem::align_of::(), ); StringCache { - // current allocator + // Current allocator. alloc, - // old allocators we'll keep around for iteration purposes. + // Old allocators we'll keep around for iteration purposes. // 16 would mean we've allocated 128GB of string storage since we // double each time. old_allocs: Vec::with_capacity(16), - // Vector of pointers to the StringCacheEntry headers + // Vector of pointers to the `StringCacheEntry` headers. entries: vec![std::ptr::null_mut(); capacity], num_entries: 0, mask: capacity - 1, @@ -91,14 +91,14 @@ impl StringCache { return None; } // This is safe as long as entry points to a valid address and the - // layout described in the StringCache doc comment holds. + // layout described in the `StringCache` doc comment holds. unsafe { - // entry is a *StringCacheEntry so offseting by 1 gives us a + // entry is a `*StringCacheEntry` so offseting by 1 gives us a // pointer to the end of the entry, aka the beginning of the // chars. // As long as the memory is valid and the layout is correct, // we're safe to create a string slice from the chars since - // they were copied directly from a valid &str. + // they were copied directly from a valid `str`. let entry_chars = entry.add(1) as *const u8; // if entry is non-null then it must point to a valid // StringCacheEntry @@ -114,14 +114,14 @@ impl StringCache { } } - // keep looking + // Keep looking. 
dist += 1; debug_assert!(dist <= self.mask); pos = (pos + dist) & self.mask; } } - // Insert the given string with its given hash into the cache + // Insert the given string with its given hash into the cache. pub(crate) fn insert(&mut self, string: &str, hash: u64) -> *const u8 { let mut pos = self.mask & hash as usize; let mut dist = 0; @@ -133,17 +133,17 @@ } // This is safe as long as entry points to a valid address and the - // layout described in the StringCache doc comment holds. + // layout described in the `StringCache` doc comment holds. unsafe { - // entry is a *StringCacheEntry so offseting by 1 gives us a + // entry is a `*StringCacheEntry` so offsetting by 1 gives us a // pointer to the end of the entry, aka the beginning of the // chars. // As long as the memory is valid and the layout is correct, // we're safe to create a string slice from the chars since - // they were copied directly from a valid &str. + // they were copied directly from a valid `str`. let entry_chars = entry.add(1) as *const u8; - // if entry is non-null then it must point to a valid - // StringCacheEntry + // If entry is non-null then it must point to a valid + // `StringCacheEntry`. let sce = &**entry; if sce.hash == hash && sce.len == string.len() @@ -163,16 +163,16 @@ } // - // insert the new string + // Insert the new string. // - // we know pos is in bounds as it's &ed with the mask above + // We know pos is in bounds as it's &ed with the mask above. let entry_ptr = unsafe { self.entries.get_unchecked_mut(pos) }; - // add one to length for null byte - // there's no way we could overflow here in practice since that would - // require having allocated a u64::MAX-length string, by which time + // Add one to length for null byte. 
+ // There's no way we could overflow here in practice since that would + // require having allocated a `u64::MAX`-length string, by which time // we'll be using 128-bit pointers and we'll need to rewrite this - // crate anyway + // crate anyway. let byte_len = string.len() + 1; let alloc_size = std::mem::size_of::() + byte_len; @@ -201,18 +201,18 @@ impl StringCache { } // This is safe as long as: - // 1. alloc_size is calculated correctly + // 1. `alloc_size` is calculated correctly. // 2. there is enough space in the allocator (checked in the block - // above) - // 3. The StringCacheEntry layout descibed above holds and the memory + // above). + // 3. The `StringCacheEntry` layout descibed above holds and the memory // returned by allocate() is prooperly aligned. unsafe { *entry_ptr = self.alloc.allocate(alloc_size) as *mut StringCacheEntry; - // write the header - // entry_ptr is guaranteed to point to a valid StringCacheEntry, or - // alloc.allocate() would have aborted + // Write the header. + // `entry_ptr` is guaranteed to point to a valid `StringCacheEntry`, + // or `alloc.allocate()` would have aborted. std::ptr::write( *entry_ptr, StringCacheEntry { @@ -220,20 +220,20 @@ impl StringCache { len: string.len(), }, ); - // write the characters after the StringCacheEntry + // Write the characters after the `StringCacheEntry`. let char_ptr = entry_ptr.add(1) as *mut u8; std::ptr::copy_nonoverlapping( string.as_bytes().as_ptr(), char_ptr, string.len(), ); - // write the trailing null + // Write the trailing null. let write_ptr = char_ptr.add(string.len()); std::ptr::write(write_ptr, 0u8); self.num_entries += 1; - // we want to keep an 0.5 load factor for the map, so grow if we've - // exceeded that + // We want to keep an 0.5 load factor for the map, so grow if we've + // exceeded that. 
if self.num_entries * 2 > self.mask { self.grow(); } @@ -242,14 +242,18 @@ impl StringCache { } } - // Double the size of the map storage - // This is safe as long as - // - The in-memory layout of the StringCacheEntry is correct + // Double the size of the map storage. + // + // This is safe as long as: + // - The in-memory layout of the `StringCacheEntry` is correct. + // // If there's not enough memory for the new entry table, it will just abort pub(crate) unsafe fn grow(&mut self) { let new_mask = self.mask * 2 + 1; - let mut new_entries = - vec![std::ptr::null_mut::(); new_mask + 1]; + + let mut new_entries: std::vec::Vec<*mut StringCacheEntry> = + vec![std::ptr::null_mut(); new_mask + 1]; + // copy the existing map into the new map let mut to_copy = self.num_entries; for e in self.entries.iter_mut() { @@ -257,19 +261,19 @@ impl StringCache { continue; } - // start of the entry is the hash + // Start of the entry is the hash. let hash = *(*e as *const u64); let mut pos = (hash as usize) & new_mask; let mut dist = 0; loop { if new_entries[pos].is_null() { - // here's an empty slot to put the pointer in + // Here's an empty slot to put the pointer in. break; } dist += 1; // This should be impossble as we've allocated twice as many - // slots as we have entries + // slots as we have entries. debug_assert!(dist <= new_mask, "Probing wrapped around"); pos = pos.wrapping_add(dist) & new_mask; } @@ -285,8 +289,8 @@ impl StringCache { self.mask = new_mask; } - // This is only called by clear() during tests to clear the cache between - // runs. DO NOT CALL THIS. + // This is only called by `clear()` during tests to clear the cache between + // runs. **DO NOT CALL THIS**. 
pub(crate) unsafe fn clear(&mut self) { // just zero all the pointers that have already been set std::ptr::write_bytes(self.entries.as_mut_ptr(), 0, self.mask + 1); @@ -324,7 +328,7 @@ impl Default for StringCache { } } -// We are safe to be Send but not Sync (we get Sync by wrapping in a mutex) +// We are safe to be `Send` but not `Sync` (we get Sync by wrapping in a mutex). unsafe impl Send for StringCache {} #[doc(hidden)] @@ -342,35 +346,34 @@ fn round_up_to(n: usize, align: usize) -> usize { impl Iterator for StringCacheIterator { type Item = &'static str; fn next(&mut self) -> Option { - if self.allocs.is_empty() { - return None; - } - - let mut s = self.allocs[self.current_alloc]; - if self.current_ptr >= unsafe { s.as_ptr().add(s.len()) } { - // we've reached the end of the current alloc + let (_, end) = self.allocs[self.current_alloc]; + if self.current_ptr >= end { + // We've reached the end of the current alloc. if self.current_alloc == self.allocs.len() - 1 { - // we've reached the end + // We've reached the end. return None; } else { - // advance to the next alloc + // Advance to the next alloc. self.current_alloc += 1; - s = self.allocs[self.current_alloc]; - self.current_ptr = s.as_ptr(); + let (current_ptr, _) = self.allocs[self.current_alloc]; + self.current_ptr = current_ptr; } } - // Cast the current ptr to a StringCacheEntry and create the next string - // from it. + // Cast the current ptr to a `StringCacheEntry` and create the next + // string from it. unsafe { let sce = &*(self.current_ptr as *const StringCacheEntry); - // the next entry will be the size of the number of bytes in the - // string, +1 for the null byte, rounded up to the alignment (8) + // The next entry will be the size of the number of bytes in the + // string, +1 for the null byte, rounded up to the alignment (8). 
self.current_ptr = sce.next_entry(); - // we know we're safe not to check here since we put valid UTF-8 in - let pos = sce.char_ptr() as usize - s.as_ptr() as usize; - Some(std::str::from_utf8_unchecked(&s[pos..(pos + sce.len)])) + // We know we're safe not to check here since we put valid UTF-8 in. + let s = std::str::from_utf8_unchecked(std::slice::from_raw_parts( + sce.char_ptr(), + sce.len, + )); + Some(s) } } } @@ -383,15 +386,15 @@ pub(crate) struct StringCacheEntry { } impl StringCacheEntry { - // get the pointer to the characters + // Get the pointer to the characters. pub(crate) fn char_ptr(&self) -> *const u8 { - // we know the chars are always directly after this struct in memory - // because that's the way they're laid out on initialization + // We know the chars are always directly after this struct in memory + // because that's the way they're laid out on initialization. unsafe { (self as *const StringCacheEntry).add(1) as *const u8 } } // Calcualte the address of the next entry in the cache. This is a utility - // function to hide the pointer arithmetic in iterators + // function to hide the pointer arithmetic in iterators. pub(crate) unsafe fn next_entry(&self) -> *const u8 { self.char_ptr().add(round_up_to( self.len + 1,