feat(event cache): allow using the bloom filter OR the store to deduplicate events

This commit is contained in:
Benjamin Bouvier
2025-02-12 17:13:54 +01:00
parent b95cf79a6d
commit faa0e6e554
3 changed files with 120 additions and 39 deletions

View File

@@ -18,12 +18,109 @@
use std::{collections::BTreeSet, fmt, sync::Mutex};
use growable_bloom_filter::{GrowableBloom, GrowableBloomBuilder};
use ruma::OwnedEventId;
use matrix_sdk_base::event_cache::store::EventCacheStoreLock;
use ruma::{OwnedEventId, OwnedRoomId};
use tracing::{debug, warn};
use super::room::events::{Event, RoomEvents};
use super::{
room::events::{Event, RoomEvents},
EventCacheError,
};
/// `Deduplicator` is an efficient type to find duplicated events.
pub enum Deduplicator {
InMemory(BloomFilterDeduplicator),
PersistentStore(StoreDeduplicator),
}
impl Deduplicator {
/// Create an empty deduplicator instance that uses an internal Bloom
/// filter.
///
/// Such a deduplicator is stateful, with no initial known events, and it
/// will learn over time by using a Bloom filter which events are
/// duplicates or not.
///
/// When the persistent storage is enabled by default, this constructor
/// (and the associated variant) will be removed.
pub fn new_memory_based() -> Self {
Self::InMemory(BloomFilterDeduplicator::new())
}
/// Create new store-based deduplicator that will run queries against the
/// store to find if any event is deduplicated or not.
///
/// This deduplicator is stateless.
///
/// When the persistent storage is enabled by default, this will become the
/// default, and [`Deduplicator`] will be replaced with
/// [`StoreDeduplicator`].
pub fn new_store_based(room_id: OwnedRoomId, store: EventCacheStoreLock) -> Self {
Self::PersistentStore(StoreDeduplicator { room_id, store })
}
/// Find duplicates in the given collection of events, and return both
/// valid events (those with an event id) as well as the event ids of
/// duplicate events.
pub async fn filter_duplicate_events<I>(
&self,
events: I,
room_events: &RoomEvents,
) -> Result<(Vec<Event>, Vec<OwnedEventId>), EventCacheError>
where
I: Iterator<Item = Event>,
{
match self {
Deduplicator::InMemory(dedup) => Ok(dedup.filter_duplicate_events(events, room_events)),
Deduplicator::PersistentStore(dedup) => dedup.filter_duplicate_events(events).await,
}
}
}
/// A deduplication mechanism based on the persistent storage associated to the
/// event cache.
///
/// It will use queries to the persistent storage to figure where events are
/// duplicates or not, making it entirely stateless.
pub struct StoreDeduplicator {
/// The room this deduplicator applies to.
room_id: OwnedRoomId,
/// The actual event cache store implementation used to query events.
store: EventCacheStoreLock,
}
impl StoreDeduplicator {
async fn filter_duplicate_events<I>(
&self,
events: I,
) -> Result<(Vec<Event>, Vec<OwnedEventId>), EventCacheError>
where
I: Iterator<Item = Event>,
{
let store = self.store.lock().await?;
// Collect event ids as we "validate" events (i.e. check they have a valid event
// id.)
let mut event_ids = Vec::new();
let events = events
.filter_map(|event| {
if let Some(event_id) = event.event_id() {
event_ids.push(event_id);
Some(event)
} else {
None
}
})
.collect::<Vec<_>>();
// Let the store do its magic ✨
let duplicates = store.filter_duplicated_events(&self.room_id, event_ids).await?;
Ok((events, duplicates))
}
}
/// `BloomFilterDeduplicator` is an efficient type to find duplicated events,
/// using an in-memory cache.
///
/// It uses a [bloom filter] to provide a memory efficient probabilistic answer
/// to: “has event E been seen already?”. False positives are possible, while
@@ -49,34 +146,18 @@ impl BloomFilterDeduplicator {
const DESIRED_FALSE_POSITIVE_RATE: f64 = 0.01;
/// Create a new `Deduplicator` with no prior knowledge of known events.
#[cfg(test)]
pub fn new() -> Self {
Self::with_initial_events(std::iter::empty())
}
/// Create a new `Deduplicator` filled with initial events.
///
/// This won't detect duplicates in the initial events, only learn about
/// those events.
pub fn with_initial_events<'a>(events: impl Iterator<Item = &'a Event>) -> Self {
let mut bloom_filter = GrowableBloomBuilder::new()
fn new() -> Self {
let bloom_filter = GrowableBloomBuilder::new()
.estimated_insertions(Self::APPROXIMATED_MAXIMUM_NUMBER_OF_EVENTS)
.desired_error_ratio(Self::DESIRED_FALSE_POSITIVE_RATE)
.build();
for e in events {
let Some(event_id) = e.event_id() else {
warn!("initial event in deduplicator had no event id");
continue;
};
bloom_filter.insert(event_id);
}
Self { bloom_filter: Mutex::new(bloom_filter) }
}
/// Find duplicates in the given collection of events, and return both
/// valid events (those with an event id) as well as the event ids of
/// duplicate events.
pub fn filter_duplicate_events<'a, I>(
fn filter_duplicate_events<'a, I>(
&'a self,
events: I,
room_events: &'a RoomEvents,
@@ -184,7 +265,7 @@ impl BloomFilterDeduplicator {
/// Information about the scanned collection of events.
#[derive(Debug)]
pub enum Decoration<I> {
enum Decoration<I> {
/// This event is not duplicated.
Unique(I),

View File

@@ -181,7 +181,7 @@ impl RoomPagination {
.collect::<Vec<_>>();
let (new_events, duplicated_event_ids, all_deduplicated) =
state.collect_valid_and_duplicated_events(sync_events.clone().into_iter());
state.collect_valid_and_duplicated_events(sync_events.clone().into_iter()).await?;
let (backpagination_outcome, sync_timeline_events_diffs) = state
.with_events_mut(move |room_events| {

View File

@@ -424,8 +424,9 @@ impl RoomEventCacheInner {
return Ok(());
}
let (events, duplicated_event_ids, all_duplicates) =
state.collect_valid_and_duplicated_events(sync_timeline_events.clone().into_iter());
let (events, duplicated_event_ids, all_duplicates) = state
.collect_valid_and_duplicated_events(sync_timeline_events.clone().into_iter())
.await?;
let sync_timeline_events_diffs = if all_duplicates {
// No new events, thus no need to change the room events.
@@ -543,7 +544,7 @@ mod private {
use tracing::{error, instrument, trace};
use super::{chunk_debug_string, events::RoomEvents};
use crate::event_cache::{deduplicator::BloomFilterDeduplicator, EventCacheError};
use crate::event_cache::{deduplicator::Deduplicator, EventCacheError};
/// State for a single room's event cache.
///
@@ -563,7 +564,7 @@ mod private {
events: RoomEvents,
/// The events deduplicator instance to help finding duplicates.
deduplicator: BloomFilterDeduplicator,
deduplicator: Deduplicator,
/// Have we ever waited for a previous-batch-token to come from sync, in
/// the context of pagination? We do this at most once per room,
@@ -602,7 +603,7 @@ mod private {
room: OwnedRoomId,
store: Arc<OnceCell<EventCacheStoreLock>>,
) -> Result<Self, EventCacheError> {
let events = if let Some(store) = store.get() {
let (events, deduplicator) = if let Some(store) = store.get() {
let locked = store.lock().await?;
// Try to reload a linked chunk from storage. If it fails, log the error and
@@ -620,15 +621,14 @@ mod private {
}
};
RoomEvents::with_initial_chunks(linked_chunk)
(
RoomEvents::with_initial_chunks(linked_chunk),
Deduplicator::new_store_based(room.clone(), store.clone()),
)
} else {
RoomEvents::default()
(RoomEvents::default(), Deduplicator::new_memory_based())
};
let deduplicator = BloomFilterDeduplicator::with_initial_events(
events.events().map(|(_pos, event)| event),
);
Ok(Self { room, store, events, deduplicator, waited_for_initial_prev_token: false })
}
@@ -658,19 +658,19 @@ mod private {
/// possibly misplace them. And we should not be missing
/// events either: the already-known events would have their own
/// previous-batch token (it might already be consumed).
pub fn collect_valid_and_duplicated_events<'a, I>(
pub async fn collect_valid_and_duplicated_events<'a, I>(
&'a mut self,
events: I,
) -> (Vec<Event>, Vec<OwnedEventId>, bool)
) -> Result<(Vec<Event>, Vec<OwnedEventId>, bool), EventCacheError>
where
I: Iterator<Item = Event> + 'a,
{
let (events, duplicated_event_ids) =
self.deduplicator.filter_duplicate_events(events, &self.events);
self.deduplicator.filter_duplicate_events(events, &self.events).await?;
let all_duplicates = !events.is_empty() && events.len() == duplicated_event_ids.len();
(events, duplicated_event_ids, all_duplicates)
Ok((events, duplicated_event_ids, all_duplicates))
}
/// Removes the bundled relations from an event, if they were present.