From 3e49f1de107fbe8d1c67fae8eded3b25eeaa212a Mon Sep 17 00:00:00 2001 From: Jamie Pine Date: Mon, 8 Dec 2025 16:45:39 -0800 Subject: [PATCH] comments --- .../ops/indexing/change_detection/handler.rs | 8 - core/src/ops/indexing/db_writer.rs | 17 - core/src/ops/indexing/ephemeral/cache.rs | 3 - core/src/ops/indexing/ephemeral/writer.rs | 23 +- core/src/ops/indexing/job.rs | 35 +- docs/core/indexing.mdx | 357 ++++++++++-------- 6 files changed, 212 insertions(+), 231 deletions(-) diff --git a/core/src/ops/indexing/change_detection/handler.rs b/core/src/ops/indexing/change_detection/handler.rs index ab6dc614f..23c1411fc 100644 --- a/core/src/ops/indexing/change_detection/handler.rs +++ b/core/src/ops/indexing/change_detection/handler.rs @@ -54,10 +54,6 @@ pub trait ChangeHandler: Send + Sync { async fn handle_new_directory(&self, path: &Path) -> Result<()>; } -// ============================================================================ -// Shared Logic - Used by both handlers -// ============================================================================ - /// Check if a path exists, distinguishing between "doesn't exist" and "can't access". /// /// Critical for preventing false deletions when volumes go offline. @@ -191,10 +187,6 @@ pub async fn build_dir_entry( }) } -// ============================================================================ -// Generic Change Application -// ============================================================================ - /// Apply a batch of filesystem changes using the provided handler. 
/// /// Processes events in the correct order: removes first, then renames, diff --git a/core/src/ops/indexing/db_writer.rs b/core/src/ops/indexing/db_writer.rs index 12cb37809..83d481af8 100644 --- a/core/src/ops/indexing/db_writer.rs +++ b/core/src/ops/indexing/db_writer.rs @@ -566,7 +566,6 @@ impl DBWriter { new_path: &Path, location_root_path: &Path, ) -> Result<(), JobError> { - // Begin transaction for atomic move operation let txn = db .begin() .await @@ -605,7 +604,6 @@ impl DBWriter { location_root_path: &Path, txn: &DatabaseTransaction, ) -> Result<(), JobError> { - // Get the entry let db_entry = entities::entry::Entity::find_by_id(entry_id) .one(txn) .await @@ -918,27 +916,22 @@ impl DBWriter { let mut entry_active: entities::entry::ActiveModel = db_entry.into(); - // Find new parent entry ID let new_parent_id = if let Some(parent_path) = new_path.parent() { state.entry_id_cache.get(parent_path).copied() } else { None }; - // Update entry fields entry_active.parent_id = Set(new_parent_id); - // Extract new name and extension for files match new_path.extension() { Some(ext) => { - // File with extension if let Some(stem) = new_path.file_stem() { entry_active.name = Set(stem.to_string_lossy().to_string()); entry_active.extension = Set(Some(ext.to_string_lossy().to_lowercase())); } } None => { - // File without extension or directory if let Some(name) = new_path.file_name() { entry_active.name = Set(name.to_string_lossy().to_string()); entry_active.extension = Set(None); @@ -946,13 +939,11 @@ impl DBWriter { } } - // Save the updated entry entry_active .update(txn) .await .map_err(|e| JobError::execution(format!("Failed to update entry: {}", e)))?; - // Update cache state.entry_id_cache.remove(old_path); state .entry_id_cache @@ -1030,10 +1021,6 @@ impl DBWriter { Ok(()) } - // ======================================================================== - // Subtree Deletion - // ======================================================================== - /// 
Deletes an entry and all its descendants from the database. /// /// This is a raw database operation that does NOT: @@ -1070,7 +1057,6 @@ impl DBWriter { { use sea_orm::{ColumnTrait, EntityTrait, QueryFilter}; - // Collect all descendants via closure table let mut to_delete_ids: Vec = vec![entry_id]; if let Ok(rows) = entities::entry_closure::Entity::find() .filter(entities::entry_closure::Column::AncestorId.eq(entry_id)) @@ -1083,7 +1069,6 @@ impl DBWriter { to_delete_ids.dedup(); if !to_delete_ids.is_empty() { - // Delete closure links (both directions) let _ = entities::entry_closure::Entity::delete_many() .filter(entities::entry_closure::Column::DescendantId.is_in(to_delete_ids.clone())) .exec(db) @@ -1093,13 +1078,11 @@ impl DBWriter { .exec(db) .await; - // Delete directory paths let _ = entities::directory_paths::Entity::delete_many() .filter(entities::directory_paths::Column::EntryId.is_in(to_delete_ids.clone())) .exec(db) .await; - // Delete entries let _ = entities::entry::Entity::delete_many() .filter(entities::entry::Column::Id.is_in(to_delete_ids)) .exec(db) diff --git a/core/src/ops/indexing/ephemeral/cache.rs b/core/src/ops/indexing/ephemeral/cache.rs index 77a7aaae9..2de7997b1 100644 --- a/core/src/ops/indexing/ephemeral/cache.rs +++ b/core/src/ops/indexing/ephemeral/cache.rs @@ -106,7 +106,6 @@ impl EphemeralIndexCache { let mut index = self.index.write().await; let (cleared, deleted_browsed_dirs) = index.clear_directory_children(path, &indexed); - // Remove deleted browsed directories from indexed_paths if !deleted_browsed_dirs.is_empty() { let mut indexed_paths = self.indexed_paths.write(); for deleted_path in deleted_browsed_dirs { @@ -197,7 +196,6 @@ impl EphemeralIndexCache { pub fn find_watched_root(&self, path: &Path) -> Option { let watched = self.watched_paths.read(); - // Find the longest matching watched path that is an ancestor of `path` let mut best_match: Option<&PathBuf> = None; let mut best_len = 0; @@ -269,7 +267,6 @@ impl 
EphemeralIndexCache { /// Legacy: Insert (no-op, entries are added directly to global index) #[deprecated(note = "Entries should be added directly to the global index")] pub fn insert(&self, path: PathBuf, _index: Arc>) { - // Mark the path as indexed let mut indexed = self.indexed_paths.write(); indexed.insert(path); } diff --git a/core/src/ops/indexing/ephemeral/writer.rs b/core/src/ops/indexing/ephemeral/writer.rs index e3ff10d98..ebc65ca91 100644 --- a/core/src/ops/indexing/ephemeral/writer.rs +++ b/core/src/ops/indexing/ephemeral/writer.rs @@ -53,14 +53,11 @@ impl EphemeralWriter { } } - /// Generate the next entry ID. fn next_id(&self) -> i32 { self.next_id.fetch_add(1, Ordering::SeqCst) } - /// Add an entry to the index and emit a ResourceChanged event. - /// - /// This is the core write operation used by both pipelines. + /// Core write operation shared by both watcher and indexer pipelines. async fn add_entry_internal( &self, path: &Path, @@ -78,7 +75,6 @@ impl EphemeralWriter { Ok((entry_id, content_kind)) } - /// Emit a ResourceChanged event for UI updates. async fn emit_resource_changed( &self, uuid: Uuid, @@ -142,7 +138,7 @@ impl ChangeHandler for EphemeralWriter { } async fn find_by_inode(&self, _inode: u64) -> Result> { - // Ephemeral index doesn't track inodes for move detection + // Inode tracking is skipped to minimize memory overhead; fall back to path-only detection. 
Ok(None) } @@ -154,7 +150,6 @@ impl ChangeHandler for EphemeralWriter { .add_entry_internal(&metadata.path, entry_uuid, entry_metadata.clone()) .await?; - // Emit event if entry was actually added (not a duplicate) if let Some(content_kind) = content_kind { self.emit_resource_changed(entry_uuid, &metadata.path, &entry_metadata, content_kind) .await; @@ -216,7 +211,7 @@ impl ChangeHandler for EphemeralWriter { } async fn run_processors(&self, _entry: &EntryRef, _is_new: bool) -> Result<()> { - // Ephemeral indexing skips processor pipeline (no thumbnails/content hash) + // File processors (thumbnails, content hash) are disabled to ensure responsive, low-overhead browsing. Ok(()) } @@ -295,10 +290,6 @@ impl ChangeHandler for EphemeralWriter { } } -// ============================================================================ -// IndexPersistence Implementation (Job Pipeline) -// ============================================================================ - #[async_trait::async_trait] impl IndexPersistence for EphemeralWriter { async fn store_entry( @@ -336,7 +327,6 @@ impl IndexPersistence for EphemeralWriter { (self.next_id(), content_kind) }; - // Emit event if entry was actually added (not a duplicate) if let Some(content_kind) = content_kind { self.emit_resource_changed(entry_uuid, &entry.path, &metadata, content_kind) .await; @@ -351,7 +341,6 @@ impl IndexPersistence for EphemeralWriter { _path: &Path, _cas_id: String, ) -> JobResult<()> { - // Ephemeral indexing doesn't track content identities Ok(()) } @@ -359,12 +348,10 @@ impl IndexPersistence for EphemeralWriter { &self, _indexing_path: &Path, ) -> JobResult, Option, u64)>> { - // Ephemeral indexing doesn't support incremental indexing Ok(HashMap::new()) } async fn update_entry(&self, _entry_id: i32, _entry: &DirEntry) -> JobResult<()> { - // Updates are handled via add_entry (overwrites existing) Ok(()) } @@ -393,7 +380,6 @@ mod tests { let mut writer = EphemeralWriter::new(index.clone(), event_bus, 
temp_dir.path().to_path_buf()); - // Test create let dir_entry = DirEntry { path: test_file.clone(), kind: EntryKind::File, @@ -411,7 +397,6 @@ mod tests { assert_eq!(entry_ref.path, test_file); assert_eq!(entry_ref.kind, EntryKind::File); - // Verify entry exists let found = writer .find_by_path(&test_file) .await @@ -448,7 +433,6 @@ mod tests { assert!(entry_id > 0); assert!(!writer.is_persistent()); - // Verify index was updated let idx = index.read().await; assert!(idx.has_entry(&test_file)); } @@ -481,7 +465,6 @@ mod tests { .await .expect("store_entry should succeed"); - // Try to receive the event let event = tokio::time::timeout(tokio::time::Duration::from_millis(100), subscriber.recv()).await; diff --git a/core/src/ops/indexing/job.rs b/core/src/ops/indexing/job.rs index 197f7d8f1..02c46b8ab 100644 --- a/core/src/ops/indexing/job.rs +++ b/core/src/ops/indexing/job.rs @@ -224,29 +224,18 @@ impl IndexerJob { self.state = Some(IndexerState::new(&self.config.path)); } else { ctx.log("Resuming indexer from saved state"); - let state = self.state.as_ref().unwrap(); info!("INDEXER_STATE: Job resuming with saved state - phase: {:?}, entry_batches: {}, entries_for_content: {}, seen_paths: {}", - state.phase, - state.entry_batches.len(), - state.entries_for_content.len(), - state.seen_paths.len()); - warn!( - "DEBUG: Resumed state - phase: {:?}, entry_batches: {}, entries_for_content: {}", - state.phase, - state.entry_batches.len(), - state.entries_for_content.len() - ); + self.state.as_ref().unwrap().phase, + self.state.as_ref().unwrap().entry_batches.len(), + self.state.as_ref().unwrap().entries_for_content.len(), + self.state.as_ref().unwrap().seen_paths.len()); } let state = self.state.as_mut().unwrap(); - // For cloud volumes, we use the path component from the SdPath (e.g., "/" or "folder/") - // since discovery operates through the volume backend (not direct filesystem access). 
let root_path_buf = if let Some(p) = self.config.path.as_local_path() { p.to_path_buf() } else if let Some(cloud_path) = self.config.path.cloud_path() { - // Cloud path - use the path component within the cloud volume - // The actual I/O will go through the volume backend PathBuf::from(cloud_path) } else if !self.config.is_ephemeral() { let loc_uuid = self @@ -326,7 +315,6 @@ impl IndexerJob { ctx.check_interrupt().await?; let current_phase = state.phase.clone(); - warn!("DEBUG: IndexerJob entering phase: {:?}", current_phase); match current_phase { Phase::Discovery => { let cloud_url_base = @@ -359,7 +347,6 @@ impl IndexerJob { } Phase::Processing => { - warn!("DEBUG: IndexerJob starting Processing phase"); if self.config.is_ephemeral() { let ephemeral_index = self.ephemeral_index.clone().ok_or_else(|| { JobError::execution("Ephemeral index not initialized".to_string()) @@ -435,11 +422,6 @@ impl IndexerJob { Phase::Complete => break, } - - warn!( - "DEBUG: IndexerJob completed phase: {:?}, next phase will be: {:?}", - current_phase, state.phase - ); } let final_progress = IndexerProgress { @@ -545,12 +527,7 @@ impl JobHandler for IndexerJob { } async fn on_resume(&mut self, ctx: &JobContext<'_>) -> JobResult { - warn!("DEBUG: IndexerJob on_resume called"); if let Some(state) = &self.state { - warn!( - "DEBUG: IndexerJob has state, resuming in {:?} phase", - state.phase - ); ctx.log(format!("Resuming indexer in {:?} phase", state.phase)); ctx.log(format!( "Progress: {} files, {} dirs, {} errors so far", @@ -559,7 +536,6 @@ impl JobHandler for IndexerJob { self.timer = Some(PhaseTimer::new()); } else { - warn!("DEBUG: IndexerJob has no state during resume - creating new state!"); self.state = Some(IndexerState::new(&self.config.path)); } Ok(()) @@ -641,8 +617,6 @@ impl IndexerJob { use super::state::{DirEntry, EntryKind}; use tokio::fs; - ctx.log("Starting current scope discovery (single level)"); - let mut entries = fs::read_dir(root_path) .await .map_err(|e| 
JobError::execution(format!("Failed to read directory: {}", e)))?; @@ -677,7 +651,6 @@ impl IndexerJob { state.pending_entries.push(dir_entry); state.items_since_last_update += 1; - // Update stats match entry_kind { EntryKind::File => state.stats.files += 1, EntryKind::Directory => state.stats.dirs += 1, diff --git a/docs/core/indexing.mdx b/docs/core/indexing.mdx index f61b18cdb..2b27aea95 100644 --- a/docs/core/indexing.mdx +++ b/docs/core/indexing.mdx @@ -3,19 +3,25 @@ title: Indexing sidebarTitle: Indexing --- -The indexing system discovers and analyzes your files through a sophisticated multi-phase process. Built on Spacedrive's job system, it provides resumable operations, real-time progress tracking, and supports both persistent library indexing and ephemeral browsing of external drives. +The indexing system discovers and analyzes your files through a multi-phase pipeline. Built on Spacedrive's job system, it provides resumable operations, real-time progress tracking, and supports both persistent library indexing and ephemeral browsing of external drives. ## Architecture Overview -The indexing system consists of several key components working together: +The indexing system consists of specialized components working together: **IndexerJob** orchestrates the entire indexing process as a resumable job. It maintains state across application restarts and provides detailed progress reporting. -**IndexerState** preserves all necessary information to resume indexing from any interruption point. This includes the current phase, directories to process, and accumulated statistics. +**IndexerState** preserves all necessary information to resume indexing from any interruption point. This includes the current phase, directories to process, accumulated statistics, and ephemeral UUID mappings for preserving user metadata across browsing-to-persistent transitions. 
-**EntryProcessor** handles the complex task of creating and updating database records while maintaining referential integrity through materialized paths. +**DBWriter** provides the low-level database CRUD layer. All database operations (create, update, move, delete) flow through this module for consistency. -**FileTypeRegistry** identifies files through a combination of extensions, magic bytes, and content analysis to provide accurate type detection. +**PersistentWriter** implements both `ChangeHandler` (for filesystem watcher events) and `IndexPersistence` (for indexer job batches). Both pipelines use the same code to write entries to the database via `DBWriter`. + +**EphemeralWriter** implements both `ChangeHandler` (for filesystem watcher events) and `IndexPersistence` (for indexer job batches). Both pipelines use the same code to write entries to the in-memory `EphemeralIndex`. + +This dual-implementation architecture unifies watcher and job pipelines, eliminating code duplication between real-time filesystem monitoring and batch indexing operations. + +**FileTypeRegistry** identifies files through extensions, magic bytes, and content analysis. The system integrates deeply with Spacedrive's job infrastructure, which provides automatic state persistence through MessagePack serialization. When you pause an indexing operation, the entire job state is saved to a dedicated jobs database, allowing seamless resumption even after application restarts. @@ -24,63 +30,153 @@ The system integrates deeply with Spacedrive's job infrastructure, which provide architecture ensures no work is lost if interrupted. +## Database Architecture + +The indexing system uses a closure table for hierarchy management instead of recursive queries: + +### Closure Table + +Parent-child relationships are stored in the `entry_closure` table with precomputed ancestor-descendant pairs. 
This makes "find all descendants" a single indexed lookup with no recursive traversal, regardless of nesting depth, at the cost of additional storage (worst-case O(N²) rows for deeply nested trees). + +```sql +CREATE TABLE entry_closure ( + ancestor_id INTEGER, + descendant_id INTEGER, + depth INTEGER +); +``` + +The closure table stores all transitive relationships. For a file at `/home/user/docs/report.pdf`, entries exist for: +- (home_id, report_id, depth=3) +- (user_id, report_id, depth=2) +- (docs_id, report_id, depth=1) +- (report_id, report_id, depth=0) + +Move operations require rebuilding closures for the entire moved subtree, which can affect thousands of rows when moving large directories. + +### Directory Paths Cache + +The `directory_paths` table provides O(1) absolute path lookups for directories: + +```sql +CREATE TABLE directory_paths ( + entry_id INTEGER PRIMARY KEY, + path TEXT UNIQUE +); +``` + +This eliminates recursive parent traversal when building file paths. Each directory stores its complete absolute path, enabling instant resolution for child entries. + +### Entries Table + +```sql +CREATE TABLE entry ( + id INTEGER PRIMARY KEY, + uuid UUID UNIQUE, + parent_id INTEGER, + name TEXT, + extension TEXT, + kind INTEGER, + size BIGINT, + inode BIGINT, + content_id INTEGER, + aggregate_size BIGINT, + child_count INTEGER, + file_count INTEGER +); +``` + ## Indexing Phases -The indexer operates through four distinct phases, each designed to be interruptible and resumable: +The indexer operates through five distinct phases, each designed to be interruptible and resumable: ### Phase 1: Discovery -The discovery phase walks your filesystem to build a list of all files and directories. This phase is optimized for speed, collecting just enough information to plan the work ahead: +Discovery walks the filesystem using parallel workers with a work-stealing model. On systems with 8+ cores, multiple threads scan directories concurrently, communicating via channels to maximize disk throughput. 
-```rust -// Discovery maintains a queue of directories to process -pub struct DiscoveryPhase { - dirs_to_walk: VecDeque, - seen_paths: HashSet, // Cycle detection -} -``` +Discovered entries are filtered through `IndexerRuler`, which applies toggleable system rules (like `NO_HIDDEN`, `NO_DEV_DIRS`) and dynamically loaded `.gitignore` patterns when inside a Git repository. -The phase uses a breadth-first traversal to ensure shallow directories are processed first, providing quicker initial results. Progress is measured by directories discovered versus total estimated. +Progress is measured by directories discovered. Entries are collected into batches of 1,000 items before moving to processing. ### Phase 2: Processing -Processing creates or updates database entries for each discovered item. This is where Spacedrive builds its understanding of your file structure: +Processing converts discovered entries into database records. Entries are sorted by depth (parents before children) to maintain referential integrity during batch insertion. -```rust -// Batch processing for efficiency -const BATCH_SIZE: usize = 1000; +**Change Detection** runs during this phase. The `ChangeDetector` loads existing database entries for the indexing path, then compares against filesystem state to identify: -// Process entries in parent-first order -let sorted_batch = batch.sort_by_depth(); -persistence.process_batch(sorted_batch, &mut entry_cache)?; -``` +- **New**: Paths not in database +- **Modified**: Size or mtime differs +- **Moved**: Same inode at different path +- **Deleted**: In database but missing from filesystem -The system uses materialized paths instead of parent IDs, making queries faster and eliminating complex recursive lookups. Each entry stores its full path prefix, enabling instant directory listings. +Changes are processed in batch transactions. Each batch inserts closure table rows, updates the directory paths cache, and syncs entries across devices. 
+ +**Ephemeral UUID Preservation** happens here. When a browsed folder is promoted to a managed location, UUIDs assigned during ephemeral indexing are preserved (`state.ephemeral_uuids`). This prevents orphaning user metadata like tags and notes attached during browsing sessions. + +The processing phase validates that the indexing path stays within location boundaries, preventing catastrophic cross-location deletion if watcher routing bugs send events for the wrong path. ### Phase 3: Aggregation -Aggregation calculates sizes and counts for directories by traversing the tree bottom-up. This phase provides the statistics you see in the UI: +Aggregation walks the entry tree bottom-up, computing directory statistics: -- Total size including subdirectories -- Direct child count -- Recursive file count -- Aggregate content types +- `aggregate_size`: Total bytes including subdirectories +- `child_count`: Direct children only +- `file_count`: Recursive file count + +These aggregates are stored in the entry table and enable instant directory size display without traversing descendants. ### Phase 4: Content Identification -The final phase generates content-addressed storage (CAS) identifiers and performs deep file analysis: +Content identification generates BLAKE3 hashes for files, linking entries to `content_identity` records for deduplication. + +Content identities use deterministic v5 UUIDs (namespace hash of `content_hash + library_id`) so different devices can independently identify identical files and merge metadata without coordination. This enables offline duplicate detection across library peers. + +**Sync Order**: Content identities must be synced before entries to avoid foreign key violations on receiving devices. The job system enforces this ordering. + +For new content, file type identification runs via `FileTypeRegistry` to populate `kind_id` and `mime_type_id` fields. 
+ +### Phase 5: Finalizing + +Finalizing handles post-processing tasks like directory aggregation updates and potential processor dispatch (thumbnail generation for Deep Mode). + +## Change Detection System + +The indexing system includes both batch and real-time change detection: + +### Batch Change Detection + +`ChangeDetector` compares database state against filesystem during indexer job scans: ```rust -// Sampled hashing for large files -let cas_id = cas_generator - .generate_cas_id(path, file_size) - .await?; +let mut detector = ChangeDetector::new(); +detector.load_existing_entries(ctx, location_id, indexing_path).await?; -// Link to content identity for deduplication -content_processor.link_or_create(entry_id, cas_id)?; +for entry in discovered_entries { + if let Some(change) = detector.check_path(&path, &metadata, inode) { + // Process New, Modified, or Moved change + } +} + +let deleted = detector.find_deleted(&seen_paths); ``` -This phase enables deduplication, content-based search, and file tracking across renames. +The detector tracks paths by inode to identify moves. On Unix systems, inodes provide stable file identity across renames. Windows falls back to path-only matching since file indices are unstable across reboots. 
+ +### Real-Time Change Detection + +Both `PersistentWriter` and `EphemeralWriter` implement the `ChangeHandler` trait, which defines the interface for responding to filesystem watcher events: + +```rust +pub trait ChangeHandler { + async fn find_by_path(&self, path: &Path) -> Result>; + async fn create(&mut self, metadata: &DirEntry, parent_path: &Path) -> Result; + async fn update(&mut self, entry: &EntryRef, metadata: &DirEntry) -> Result<()>; + async fn move_entry(&mut self, entry: &EntryRef, old_path: &Path, new_path: &Path) -> Result<()>; + async fn delete(&mut self, entry: &EntryRef) -> Result<()>; +} +``` + +The watcher routes events to the appropriate handler based on whether the path belongs to a persistent location (`PersistentWriter` → database) or ephemeral session (`EphemeralWriter` → memory). ## Indexing Modes and Scopes @@ -88,29 +184,21 @@ The system provides flexible configuration through modes and scopes: ### Index Modes -**Shallow Mode** extracts only filesystem metadata (name, size, dates). Completes in under 500ms for typical directories. Perfect for responsive UI navigation. +**Shallow Mode** extracts only filesystem metadata (name, size, dates). Completes in under 500ms for typical directories. -**Content Mode** adds cryptographic hashing to identify files by content. Enables deduplication and content tracking. Moderate performance impact. +**Content Mode** adds BLAKE3 hashing to identify files by content. Enables deduplication and content tracking. -**Deep Mode** performs full analysis including thumbnails and media metadata extraction. Best for photo and video libraries. +**Deep Mode** performs full analysis including file type identification and metadata extraction. Triggers thumbnail generation for images and videos. ### Index Scopes -**Current Scope** indexes only the immediate directory contents: +**Current Scope** indexes only immediate directory contents. Used for responsive UI navigation. 
-```rust -IndexerJobConfig::ui_navigation(location_id, path) -``` - -**Recursive Scope** indexes the entire directory tree: - -```rust -IndexerJobConfig::new(location_id, path, IndexMode::Deep) -``` +**Recursive Scope** indexes the entire directory tree. Used for full location indexing. ## Persistence and Ephemeral Indexing -One of Spacedrive's key innovations is supporting both persistent and ephemeral indexing modes. +Spacedrive supports both persistent and ephemeral indexing modes: ### Persistent Indexing @@ -123,27 +211,63 @@ Persistent indexing stores all data in the database permanently. This is the def ### Ephemeral Indexing -Ephemeral indexing keeps data in memory only, perfect for browsing external drives: +Ephemeral indexing keeps data in memory only, perfect for browsing external drives without permanent storage. -```rust -let config = IndexerJobConfig::ephemeral_browse( - usb_path, - IndexScope::Current -); -``` +The ephemeral system uses highly memory-optimized structures: -The ephemeral index uses an LRU cache with automatic cleanup: +**NodeArena**: Slab allocator for `FileNode` entries with pointer-sized entry IDs. Provides contiguous memory layout for cache efficiency. -- No database writes -- Session-based lifetime -- Memory-efficient storage -- Automatic expiration +**NameCache**: Global string interning pool. One copy of "index.js" serves thousands of node_modules files. + +**NameRegistry**: BTreeMap for fast name-based lookups without full-text indexing overhead. + +Memory usage is around 50 bytes per entry vs 200+ bytes with naive `HashMap` approach. This 4-6x reduction enables browsing hundreds of thousands of files without database overhead. + +Multiple directory trees can coexist in the same `EphemeralIndex` (browsing both `/mnt/nas` and `/media/usb` simultaneously), sharing the string interning pool for maximum deduplication. 
+ +The `EphemeralIndexCache` tracks which paths have been indexed, are currently being indexed, or are registered for filesystem watching. When a watched path receives filesystem events, `EphemeralWriter` updates the in-memory index in real-time. Ephemeral mode lets you explore USB drives or network shares without permanently adding them to your library. +## Indexer Rules + +The `IndexerRuler` applies filtering rules during discovery to skip unwanted files: + +**System Rules** are toggleable patterns like: +- `NO_HIDDEN`: Skip dotfiles (`.git`, `.DS_Store`) +- `NO_DEV_DIRS`: Skip `node_modules`, `target`, `dist` +- `NO_SYSTEM`: Skip OS folders (`System32`, `Windows`) + +**Git Integration**: When indexing inside a Git repository, rules are dynamically loaded from `.gitignore` files. This automatically excludes build artifacts and local configuration. + +Rules return a `RulerDecision` (Accept/Reject) for each path during discovery, preventing unwanted entries from ever reaching the processing phase. 
+ +## Index Integrity Verification + +The `IndexVerifyAction` checks integrity by running a fresh ephemeral scan and comparing metadata against the existing persistent index: + +```rust +let verify = IndexVerifyAction::from_input(IndexVerifyInput { path }).await?; +let output = verify.execute(library, context).await?; + +// output.report contains: +// - missing_from_index: Files on disk but not in database +// - stale_in_index: Entries in database but missing from filesystem +// - metadata_mismatches: Size, mtime, or inode differences +``` + +The verification system detects: +- **MissingFromIndex**: Files created outside Spacedrive +- **StaleInIndex**: Deleted files not yet purged from database +- **SizeMismatch**: Files modified externally +- **ModifiedTimeMismatch**: Timestamp drift (with 1-second tolerance) +- **InodeMismatch**: File replacement or filesystem corruption + +Verification runs as a library action and returns a detailed `IntegrityReport` with per-file diagnostics. + ## Job System Integration The indexing system leverages Spacedrive's job infrastructure for reliability and monitoring. @@ -159,8 +283,8 @@ pub struct IndexerState { dirs_to_walk: VecDeque, entry_batches: Vec>, entry_id_cache: HashMap, + ephemeral_uuids: HashMap, stats: IndexerStats, - // ... checkpoint data } ``` @@ -172,25 +296,17 @@ Real-time progress flows through multiple channels: ```rust pub struct IndexerProgress { - phase: String, - items_done: u64, - total_items: u64, - bytes_per_second: f64, - eta_seconds: Option, + pub phase: IndexPhase, + pub total_found: IndexerStats, + pub processing_rate: f32, + pub estimated_remaining: Option, } ``` -Progress updates are: - -- Sent to UI via channels -- Persisted to database -- Available through job queries -- Used for time estimates +Progress updates are sent to the UI via channels, persisted to the database, and available through job queries for time estimates. 
### Error Handling -The job system provides structured error handling: - **Non-critical errors** are accumulated but don't stop indexing: - Permission denied on individual files @@ -203,47 +319,6 @@ The job system provides structured error handling: - Filesystem unmounted - Out of disk space -## Database Schema - -The indexer populates several key tables designed for query performance. - -### Entries Table - -The core table uses materialized paths for efficient queries: - -```sql -CREATE TABLE entries ( - id INTEGER PRIMARY KEY, - uuid UUID UNIQUE, - location_id INTEGER, - relative_path TEXT, -- Parent path (materialized) - name TEXT, -- Without extension - extension TEXT, - kind INTEGER, -- 0=File, 1=Directory - size BIGINT, - inode BIGINT, -- Change detection - content_id INTEGER -); - --- Key indexes for performance -CREATE INDEX idx_entries_location_path - ON entries(location_id, relative_path); -``` - -### Content Identities Table - -Enables deduplication across your library: - -```sql -CREATE TABLE content_identities ( - id INTEGER PRIMARY KEY, - cas_id TEXT UNIQUE, - kind_id INTEGER, - total_size BIGINT, - entry_count INTEGER -); -``` - ## Performance Characteristics Indexing performance varies by mode and scope: @@ -259,32 +334,12 @@ Indexing performance varies by mode and scope: **Batch Processing**: Groups operations into transactions of 1,000 items, reducing database overhead by 30x. -**Parallel I/O**: Content identification runs on multiple threads, saturating disk bandwidth on fast storage. +**Parallel Discovery**: Work-stealing model with atomic counters for directory traversal, using half of available CPU cores by default. -**Smart Caching**: The entry ID cache eliminates redundant parent lookups, critical for deep directory trees. +**Entry ID Cache**: Eliminates redundant parent lookups during hierarchy construction, critical for deep directory trees. 
**Checkpoint Strategy**: Checkpoints occur every 5,000 items or 30 seconds, balancing durability with performance. -## Change Detection - -The indexer efficiently detects changes without full rescans: - -```rust -// Platform-specific change detection -#[cfg(unix)] -let file_id = metadata.ino(); // inode - -#[cfg(windows)] -let file_id = get_file_index(path)?; // File index -``` - -Detection capabilities: - -- New files: Appear with unknown inodes -- Modified files: Same inode, different size/mtime -- Moved files: Known inode at new path -- Deleted files: Missing from filesystem walk - ## Usage Examples ### Quick UI Navigation @@ -310,7 +365,7 @@ let job = IndexerJob::new(config); ### Full Library Location -Comprehensive indexing with all features: +Full indexing with content identification: ```rust let config = IndexerJobConfig::new( @@ -318,8 +373,6 @@ let config = IndexerJobConfig::new( path, IndexMode::Deep ); -config.with_checkpointing(true) - .with_filters(indexer_rules); ``` ## CLI Commands @@ -342,9 +395,9 @@ spacedrive job monitor # Watch progress ### Common Issues -**Slow Indexing**: Check for large node_modules or build directories. Use `.spacedriveignore` files to exclude them. +**Slow Indexing**: Check for large `node_modules` or build directories. System rules automatically skip common patterns, or use `.gitignore` to exclude project-specific artifacts. -**High Memory Usage**: Reduce batch size or avoid ephemeral mode for very large directories. +**High Memory Usage**: Reduce batch size for directories over 1M files. Ephemeral mode uses around 50 bytes per entry, so 100K files requires roughly 5MB. **Resume Not Working**: Ensure the jobs database isn't corrupted. Check logs for serialization errors. @@ -364,7 +417,7 @@ spacedrive job info --detailed ## Platform Notes -**Windows**: Uses file indices for change detection. Supports long paths transparently. Network drives may require polling. 
+**Windows**: Uses file indices for change detection where available, falling back to path-only matching. Supports long paths transparently. Network drives may require polling. **macOS**: Leverages FSEvents and native inodes. Integrates with Time Machine exclusions. APFS provides efficient cloning. @@ -372,15 +425,15 @@ spacedrive job info --detailed ## Best Practices -1. **Start shallow** for new locations to verify configuration -2. **Use filters** to exclude build artifacts and caches -3. **Monitor progress** through the job system instead of polling -4. **Schedule deep scans** during low-usage periods -5. **Enable checkpointing** for locations over 100K files +1. **Start shallow** for new locations to verify configuration before deep scans +2. **Use Git repositories** to automatically inherit `.gitignore` exclusions +3. **Monitor progress** through the job system instead of polling the database +4. **Schedule deep scans** during low-usage periods for large photo/video libraries +5. **Enable checkpointing** for locations over 100K files to survive interruptions Always let indexing jobs complete or pause them properly. Force-killing can - corrupt the job state. + corrupt the job state and require reindexing from scratch. ## Related Documentation