This commit is contained in:
Jamie Pine
2025-12-08 16:45:39 -08:00
parent 2641c335ff
commit 3e49f1de10
6 changed files with 212 additions and 231 deletions

View File

@@ -54,10 +54,6 @@ pub trait ChangeHandler: Send + Sync {
async fn handle_new_directory(&self, path: &Path) -> Result<()>;
}
// ============================================================================
// Shared Logic - Used by both handlers
// ============================================================================
/// Check if a path exists, distinguishing between "doesn't exist" and "can't access".
///
/// Critical for preventing false deletions when volumes go offline.
@@ -191,10 +187,6 @@ pub async fn build_dir_entry(
})
}
// ============================================================================
// Generic Change Application
// ============================================================================
/// Apply a batch of filesystem changes using the provided handler.
///
/// Processes events in the correct order: removes first, then renames,

View File

@@ -566,7 +566,6 @@ impl DBWriter {
new_path: &Path,
location_root_path: &Path,
) -> Result<(), JobError> {
// Begin transaction for atomic move operation
let txn = db
.begin()
.await
@@ -605,7 +604,6 @@ impl DBWriter {
location_root_path: &Path,
txn: &DatabaseTransaction,
) -> Result<(), JobError> {
// Get the entry
let db_entry = entities::entry::Entity::find_by_id(entry_id)
.one(txn)
.await
@@ -918,27 +916,22 @@ impl DBWriter {
let mut entry_active: entities::entry::ActiveModel = db_entry.into();
// Find new parent entry ID
let new_parent_id = if let Some(parent_path) = new_path.parent() {
state.entry_id_cache.get(parent_path).copied()
} else {
None
};
// Update entry fields
entry_active.parent_id = Set(new_parent_id);
// Extract new name and extension for files
match new_path.extension() {
Some(ext) => {
// File with extension
if let Some(stem) = new_path.file_stem() {
entry_active.name = Set(stem.to_string_lossy().to_string());
entry_active.extension = Set(Some(ext.to_string_lossy().to_lowercase()));
}
}
None => {
// File without extension or directory
if let Some(name) = new_path.file_name() {
entry_active.name = Set(name.to_string_lossy().to_string());
entry_active.extension = Set(None);
@@ -946,13 +939,11 @@ impl DBWriter {
}
}
// Save the updated entry
entry_active
.update(txn)
.await
.map_err(|e| JobError::execution(format!("Failed to update entry: {}", e)))?;
// Update cache
state.entry_id_cache.remove(old_path);
state
.entry_id_cache
@@ -1030,10 +1021,6 @@ impl DBWriter {
Ok(())
}
// ========================================================================
// Subtree Deletion
// ========================================================================
/// Deletes an entry and all its descendants from the database.
///
/// This is a raw database operation that does NOT:
@@ -1070,7 +1057,6 @@ impl DBWriter {
{
use sea_orm::{ColumnTrait, EntityTrait, QueryFilter};
// Collect all descendants via closure table
let mut to_delete_ids: Vec<i32> = vec![entry_id];
if let Ok(rows) = entities::entry_closure::Entity::find()
.filter(entities::entry_closure::Column::AncestorId.eq(entry_id))
@@ -1083,7 +1069,6 @@ impl DBWriter {
to_delete_ids.dedup();
if !to_delete_ids.is_empty() {
// Delete closure links (both directions)
let _ = entities::entry_closure::Entity::delete_many()
.filter(entities::entry_closure::Column::DescendantId.is_in(to_delete_ids.clone()))
.exec(db)
@@ -1093,13 +1078,11 @@ impl DBWriter {
.exec(db)
.await;
// Delete directory paths
let _ = entities::directory_paths::Entity::delete_many()
.filter(entities::directory_paths::Column::EntryId.is_in(to_delete_ids.clone()))
.exec(db)
.await;
// Delete entries
let _ = entities::entry::Entity::delete_many()
.filter(entities::entry::Column::Id.is_in(to_delete_ids))
.exec(db)

View File

@@ -106,7 +106,6 @@ impl EphemeralIndexCache {
let mut index = self.index.write().await;
let (cleared, deleted_browsed_dirs) = index.clear_directory_children(path, &indexed);
// Remove deleted browsed directories from indexed_paths
if !deleted_browsed_dirs.is_empty() {
let mut indexed_paths = self.indexed_paths.write();
for deleted_path in deleted_browsed_dirs {
@@ -197,7 +196,6 @@ impl EphemeralIndexCache {
pub fn find_watched_root(&self, path: &Path) -> Option<PathBuf> {
let watched = self.watched_paths.read();
// Find the longest matching watched path that is an ancestor of `path`
let mut best_match: Option<&PathBuf> = None;
let mut best_len = 0;
@@ -269,7 +267,6 @@ impl EphemeralIndexCache {
/// Legacy insertion hook retained for backward compatibility.
///
/// The `_index` handle is ignored (entries are added directly to the global
/// index); the only effect of this call is recording `path` in
/// `indexed_paths`.
#[deprecated(note = "Entries should be added directly to the global index")]
pub fn insert(&self, path: PathBuf, _index: Arc<TokioRwLock<EphemeralIndex>>) {
    // The write guard is a temporary and is released at the end of this statement.
    self.indexed_paths.write().insert(path);
}

View File

@@ -53,14 +53,11 @@ impl EphemeralWriter {
}
}
/// Generate the next entry ID.
///
/// Atomically adds one to the `next_id` counter (sequentially consistent
/// ordering) and returns the value the counter held before the increment.
fn next_id(&self) -> i32 {
self.next_id.fetch_add(1, Ordering::SeqCst)
}
/// Add an entry to the index and emit a ResourceChanged event.
///
/// This is the core write operation used by both pipelines.
/// Core write operation shared by both watcher and indexer pipelines.
async fn add_entry_internal(
&self,
path: &Path,
@@ -78,7 +75,6 @@ impl EphemeralWriter {
Ok((entry_id, content_kind))
}
/// Emit a ResourceChanged event for UI updates.
async fn emit_resource_changed(
&self,
uuid: Uuid,
@@ -142,7 +138,7 @@ impl ChangeHandler for EphemeralWriter {
}
/// Inode lookup used for move detection; this implementation ignores
/// `_inode` and always returns `Ok(None)`.
async fn find_by_inode(&self, _inode: u64) -> Result<Option<EntryRef>> {
// Ephemeral index doesn't track inodes for move detection.
// Inode tracking is skipped to minimize memory overhead; fall back to path-only detection.
Ok(None)
}
@@ -154,7 +150,6 @@ impl ChangeHandler for EphemeralWriter {
.add_entry_internal(&metadata.path, entry_uuid, entry_metadata.clone())
.await?;
// Emit event if entry was actually added (not a duplicate)
if let Some(content_kind) = content_kind {
self.emit_resource_changed(entry_uuid, &metadata.path, &entry_metadata, content_kind)
.await;
@@ -216,7 +211,7 @@ impl ChangeHandler for EphemeralWriter {
}
/// Processor hook; a no-op here. Both arguments are ignored and `Ok(())`
/// is returned unconditionally.
async fn run_processors(&self, _entry: &EntryRef, _is_new: bool) -> Result<()> {
// Ephemeral indexing skips the processor pipeline (no thumbnails/content hash).
// File processors (thumbnails, content hash) are disabled to ensure responsive, low-overhead browsing.
Ok(())
}
@@ -295,10 +290,6 @@ impl ChangeHandler for EphemeralWriter {
}
}
// ============================================================================
// IndexPersistence Implementation (Job Pipeline)
// ============================================================================
#[async_trait::async_trait]
impl IndexPersistence for EphemeralWriter {
async fn store_entry(
@@ -336,7 +327,6 @@ impl IndexPersistence for EphemeralWriter {
(self.next_id(), content_kind)
};
// Emit event if entry was actually added (not a duplicate)
if let Some(content_kind) = content_kind {
self.emit_resource_changed(entry_uuid, &entry.path, &metadata, content_kind)
.await;
@@ -351,7 +341,6 @@ impl IndexPersistence for EphemeralWriter {
_path: &Path,
_cas_id: String,
) -> JobResult<()> {
// Ephemeral indexing doesn't track content identities
Ok(())
}
@@ -359,12 +348,10 @@ impl IndexPersistence for EphemeralWriter {
&self,
_indexing_path: &Path,
) -> JobResult<HashMap<PathBuf, (i32, Option<u64>, Option<SystemTime>, u64)>> {
// Ephemeral indexing doesn't support incremental indexing
Ok(HashMap::new())
}
/// No-op update: `_entry_id` and `_entry` are ignored and `Ok(())` is
/// returned unconditionally.
async fn update_entry(&self, _entry_id: i32, _entry: &DirEntry) -> JobResult<()> {
// Updates are handled via add_entry (overwrites existing)
Ok(())
}
@@ -393,7 +380,6 @@ mod tests {
let mut writer =
EphemeralWriter::new(index.clone(), event_bus, temp_dir.path().to_path_buf());
// Test create
let dir_entry = DirEntry {
path: test_file.clone(),
kind: EntryKind::File,
@@ -411,7 +397,6 @@ mod tests {
assert_eq!(entry_ref.path, test_file);
assert_eq!(entry_ref.kind, EntryKind::File);
// Verify entry exists
let found = writer
.find_by_path(&test_file)
.await
@@ -448,7 +433,6 @@ mod tests {
assert!(entry_id > 0);
assert!(!writer.is_persistent());
// Verify index was updated
let idx = index.read().await;
assert!(idx.has_entry(&test_file));
}
@@ -481,7 +465,6 @@ mod tests {
.await
.expect("store_entry should succeed");
// Try to receive the event
let event =
tokio::time::timeout(tokio::time::Duration::from_millis(100), subscriber.recv()).await;

View File

@@ -224,29 +224,18 @@ impl IndexerJob {
self.state = Some(IndexerState::new(&self.config.path));
} else {
ctx.log("Resuming indexer from saved state");
let state = self.state.as_ref().unwrap();
info!("INDEXER_STATE: Job resuming with saved state - phase: {:?}, entry_batches: {}, entries_for_content: {}, seen_paths: {}",
state.phase,
state.entry_batches.len(),
state.entries_for_content.len(),
state.seen_paths.len());
warn!(
"DEBUG: Resumed state - phase: {:?}, entry_batches: {}, entries_for_content: {}",
state.phase,
state.entry_batches.len(),
state.entries_for_content.len()
);
self.state.as_ref().unwrap().phase,
self.state.as_ref().unwrap().entry_batches.len(),
self.state.as_ref().unwrap().entries_for_content.len(),
self.state.as_ref().unwrap().seen_paths.len());
}
let state = self.state.as_mut().unwrap();
// For cloud volumes, we use the path component from the SdPath (e.g., "/" or "folder/")
// since discovery operates through the volume backend (not direct filesystem access).
let root_path_buf = if let Some(p) = self.config.path.as_local_path() {
p.to_path_buf()
} else if let Some(cloud_path) = self.config.path.cloud_path() {
// Cloud path - use the path component within the cloud volume
// The actual I/O will go through the volume backend
PathBuf::from(cloud_path)
} else if !self.config.is_ephemeral() {
let loc_uuid = self
@@ -326,7 +315,6 @@ impl IndexerJob {
ctx.check_interrupt().await?;
let current_phase = state.phase.clone();
warn!("DEBUG: IndexerJob entering phase: {:?}", current_phase);
match current_phase {
Phase::Discovery => {
let cloud_url_base =
@@ -359,7 +347,6 @@ impl IndexerJob {
}
Phase::Processing => {
warn!("DEBUG: IndexerJob starting Processing phase");
if self.config.is_ephemeral() {
let ephemeral_index = self.ephemeral_index.clone().ok_or_else(|| {
JobError::execution("Ephemeral index not initialized".to_string())
@@ -435,11 +422,6 @@ impl IndexerJob {
Phase::Complete => break,
}
warn!(
"DEBUG: IndexerJob completed phase: {:?}, next phase will be: {:?}",
current_phase, state.phase
);
}
let final_progress = IndexerProgress {
@@ -545,12 +527,7 @@ impl JobHandler for IndexerJob {
}
async fn on_resume(&mut self, ctx: &JobContext<'_>) -> JobResult {
warn!("DEBUG: IndexerJob on_resume called");
if let Some(state) = &self.state {
warn!(
"DEBUG: IndexerJob has state, resuming in {:?} phase",
state.phase
);
ctx.log(format!("Resuming indexer in {:?} phase", state.phase));
ctx.log(format!(
"Progress: {} files, {} dirs, {} errors so far",
@@ -559,7 +536,6 @@ impl JobHandler for IndexerJob {
self.timer = Some(PhaseTimer::new());
} else {
warn!("DEBUG: IndexerJob has no state during resume - creating new state!");
self.state = Some(IndexerState::new(&self.config.path));
}
Ok(())
@@ -641,8 +617,6 @@ impl IndexerJob {
use super::state::{DirEntry, EntryKind};
use tokio::fs;
ctx.log("Starting current scope discovery (single level)");
let mut entries = fs::read_dir(root_path)
.await
.map_err(|e| JobError::execution(format!("Failed to read directory: {}", e)))?;
@@ -677,7 +651,6 @@ impl IndexerJob {
state.pending_entries.push(dir_entry);
state.items_since_last_update += 1;
// Update stats
match entry_kind {
EntryKind::File => state.stats.files += 1,
EntryKind::Directory => state.stats.dirs += 1,

View File

@@ -3,19 +3,25 @@ title: Indexing
sidebarTitle: Indexing
---
The indexing system discovers and analyzes your files through a sophisticated multi-phase process. Built on Spacedrive's job system, it provides resumable operations, real-time progress tracking, and supports both persistent library indexing and ephemeral browsing of external drives.
The indexing system discovers and analyzes your files through a multi-phase pipeline. Built on Spacedrive's job system, it provides resumable operations, real-time progress tracking, and supports both persistent library indexing and ephemeral browsing of external drives.
## Architecture Overview
The indexing system consists of several key components working together:
The indexing system consists of specialized components working together:
**IndexerJob** orchestrates the entire indexing process as a resumable job. It maintains state across application restarts and provides detailed progress reporting.
**IndexerState** preserves all necessary information to resume indexing from any interruption point. This includes the current phase, directories to process, and accumulated statistics.
**IndexerState** preserves all necessary information to resume indexing from any interruption point. This includes the current phase, directories to process, accumulated statistics, and ephemeral UUID mappings for preserving user metadata across browsing-to-persistent transitions.
**EntryProcessor** handles the complex task of creating and updating database records while maintaining referential integrity through materialized paths.
**DBWriter** provides the low-level database CRUD layer. All database operations (create, update, move, delete) flow through this module for consistency.
**FileTypeRegistry** identifies files through a combination of extensions, magic bytes, and content analysis to provide accurate type detection.
**PersistentWriter** implements both `ChangeHandler` (for filesystem watcher events) and `IndexPersistence` (for indexer job batches). Both pipelines use the same code to write entries to the database via `DBWriter`.
**EphemeralWriter** implements both `ChangeHandler` (for filesystem watcher events) and `IndexPersistence` (for indexer job batches). Both pipelines use the same code to write entries to the in-memory `EphemeralIndex`.
This dual-implementation architecture unifies watcher and job pipelines, eliminating code duplication between real-time filesystem monitoring and batch indexing operations.
**FileTypeRegistry** identifies files through extensions, magic bytes, and content analysis.
The system integrates deeply with Spacedrive's job infrastructure, which provides automatic state persistence through MessagePack serialization. When you pause an indexing operation, the entire job state is saved to a dedicated jobs database, allowing seamless resumption even after application restarts.
@@ -24,63 +30,153 @@ The system integrates deeply with Spacedrive's job infrastructure, which provide
architecture ensures no work is lost if interrupted.
</Note>
## Database Architecture
The indexing system uses a closure table for hierarchy management instead of recursive queries:
### Closure Table
Parent-child relationships are stored in the `entry_closure` table with precomputed ancestor-descendant pairs. This turns "find all descendants" into a single indexed lookup with no recursion, regardless of nesting depth, at the cost of additional storage (worst case O(N²) rows for deeply nested trees).
```sql
CREATE TABLE entry_closure (
ancestor_id INTEGER,
descendant_id INTEGER,
depth INTEGER
);
```
The closure table stores all transitive relationships. For a file at `/home/user/docs/report.pdf`, entries exist for:
- (home_id, report_id, depth=3)
- (user_id, report_id, depth=2)
- (docs_id, report_id, depth=1)
- (report_id, report_id, depth=0)
Move operations require rebuilding closures for the entire moved subtree, which can affect thousands of rows when moving large directories.
### Directory Paths Cache
The `directory_paths` table provides O(1) absolute path lookups for directories:
```sql
CREATE TABLE directory_paths (
entry_id INTEGER PRIMARY KEY,
path TEXT UNIQUE
);
```
This eliminates recursive parent traversal when building file paths. Each directory stores its complete absolute path, enabling instant resolution for child entries.
### Entries Table
```sql
CREATE TABLE entry (
id INTEGER PRIMARY KEY,
uuid UUID UNIQUE,
parent_id INTEGER,
name TEXT,
extension TEXT,
kind INTEGER,
size BIGINT,
inode BIGINT,
content_id INTEGER,
aggregate_size BIGINT,
child_count INTEGER,
file_count INTEGER
);
```
## Indexing Phases
The indexer operates through four distinct phases, each designed to be interruptible and resumable:
The indexer operates through five distinct phases, each designed to be interruptible and resumable:
### Phase 1: Discovery
The discovery phase walks your filesystem to build a list of all files and directories. This phase is optimized for speed, collecting just enough information to plan the work ahead:
Discovery walks the filesystem using parallel workers with a work-stealing model. On systems with 8+ cores, multiple threads scan directories concurrently, communicating via channels to maximize disk throughput.
```rust
// Discovery maintains a queue of directories to process
pub struct DiscoveryPhase {
dirs_to_walk: VecDeque<PathBuf>,
seen_paths: HashSet<PathBuf>, // Cycle detection
}
```
Discovered entries are filtered through `IndexerRuler`, which applies toggleable system rules (like `NO_HIDDEN`, `NO_DEV_DIRS`) and dynamically loaded `.gitignore` patterns when inside a Git repository.
The phase uses a breadth-first traversal to ensure shallow directories are processed first, providing quicker initial results. Progress is measured by directories discovered versus total estimated.
Progress is measured by directories discovered. Entries are collected into batches of 1,000 items before moving to processing.
### Phase 2: Processing
Processing creates or updates database entries for each discovered item. This is where Spacedrive builds its understanding of your file structure:
Processing converts discovered entries into database records. Entries are sorted by depth (parents before children) to maintain referential integrity during batch insertion.
```rust
// Batch processing for efficiency
const BATCH_SIZE: usize = 1000;
**Change Detection** runs during this phase. The `ChangeDetector` loads existing database entries for the indexing path, then compares against filesystem state to identify:
// Process entries in parent-first order
let sorted_batch = batch.sort_by_depth();
persistence.process_batch(sorted_batch, &mut entry_cache)?;
```
- **New**: Paths not in database
- **Modified**: Size or mtime differs
- **Moved**: Same inode at different path
- **Deleted**: In database but missing from filesystem
The system uses materialized paths instead of parent IDs, making queries faster and eliminating complex recursive lookups. Each entry stores its full path prefix, enabling instant directory listings.
Changes are processed in batch transactions. Each batch inserts closure table rows, updates the directory paths cache, and syncs entries across devices.
**Ephemeral UUID Preservation** happens here. When a browsed folder is promoted to a managed location, UUIDs assigned during ephemeral indexing are preserved (`state.ephemeral_uuids`). This prevents orphaning user metadata like tags and notes attached during browsing sessions.
The processing phase validates that the indexing path stays within location boundaries, preventing catastrophic cross-location deletion if watcher routing bugs send events for the wrong path.
### Phase 3: Aggregation
Aggregation calculates sizes and counts for directories by traversing the tree bottom-up. This phase provides the statistics you see in the UI:
Aggregation walks the entry tree bottom-up, computing directory statistics:
- Total size including subdirectories
- Direct child count
- Recursive file count
- Aggregate content types
- `aggregate_size`: Total bytes including subdirectories
- `child_count`: Direct children only
- `file_count`: Recursive file count
These aggregates are stored in the entry table and enable instant directory size display without traversing descendants.
### Phase 4: Content Identification
The final phase generates content-addressed storage (CAS) identifiers and performs deep file analysis:
Content identification generates BLAKE3 hashes for files, linking entries to `content_identity` records for deduplication.
Content identities use deterministic v5 UUIDs (namespace hash of `content_hash + library_id`) so different devices can independently identify identical files and merge metadata without coordination. This enables offline duplicate detection across library peers.
**Sync Order**: Content identities must be synced before entries to avoid foreign key violations on receiving devices. The job system enforces this ordering.
For new content, file type identification runs via `FileTypeRegistry` to populate `kind_id` and `mime_type_id` fields.
### Phase 5: Finalizing
Finalizing handles post-processing tasks like directory aggregation updates and potential processor dispatch (thumbnail generation for Deep Mode).
## Change Detection System
The indexing system includes both batch and real-time change detection:
### Batch Change Detection
`ChangeDetector` compares database state against filesystem during indexer job scans:
```rust
// Sampled hashing for large files
let cas_id = cas_generator
.generate_cas_id(path, file_size)
.await?;
let mut detector = ChangeDetector::new();
detector.load_existing_entries(ctx, location_id, indexing_path).await?;
// Link to content identity for deduplication
content_processor.link_or_create(entry_id, cas_id)?;
for entry in discovered_entries {
if let Some(change) = detector.check_path(&path, &metadata, inode) {
// Process New, Modified, or Moved change
}
}
let deleted = detector.find_deleted(&seen_paths);
```
This phase enables deduplication, content-based search, and file tracking across renames.
The detector tracks paths by inode to identify moves. On Unix systems, inodes provide stable file identity across renames. Windows falls back to path-only matching since file indices are unstable across reboots.
### Real-Time Change Detection
Both `PersistentWriter` and `EphemeralWriter` implement the `ChangeHandler` trait, which defines the interface for responding to filesystem watcher events:
```rust
pub trait ChangeHandler {
async fn find_by_path(&self, path: &Path) -> Result<Option<EntryRef>>;
async fn create(&mut self, metadata: &DirEntry, parent_path: &Path) -> Result<EntryRef>;
async fn update(&mut self, entry: &EntryRef, metadata: &DirEntry) -> Result<()>;
async fn move_entry(&mut self, entry: &EntryRef, old_path: &Path, new_path: &Path) -> Result<()>;
async fn delete(&mut self, entry: &EntryRef) -> Result<()>;
}
```
The watcher routes events to the appropriate handler based on whether the path belongs to a persistent location (`PersistentWriter` → database) or ephemeral session (`EphemeralWriter` → memory).
## Indexing Modes and Scopes
@@ -88,29 +184,21 @@ The system provides flexible configuration through modes and scopes:
### Index Modes
**Shallow Mode** extracts only filesystem metadata (name, size, dates). Completes in under 500ms for typical directories. Perfect for responsive UI navigation.
**Shallow Mode** extracts only filesystem metadata (name, size, dates). Completes in under 500ms for typical directories.
**Content Mode** adds cryptographic hashing to identify files by content. Enables deduplication and content tracking. Moderate performance impact.
**Content Mode** adds BLAKE3 hashing to identify files by content. Enables deduplication and content tracking.
**Deep Mode** performs full analysis including thumbnails and media metadata extraction. Best for photo and video libraries.
**Deep Mode** performs full analysis including file type identification and metadata extraction. Triggers thumbnail generation for images and videos.
### Index Scopes
**Current Scope** indexes only the immediate directory contents:
**Current Scope** indexes only immediate directory contents. Used for responsive UI navigation.
```rust
IndexerJobConfig::ui_navigation(location_id, path)
```
**Recursive Scope** indexes the entire directory tree:
```rust
IndexerJobConfig::new(location_id, path, IndexMode::Deep)
```
**Recursive Scope** indexes the entire directory tree. Used for full location indexing.
## Persistence and Ephemeral Indexing
One of Spacedrive's key innovations is supporting both persistent and ephemeral indexing modes.
Spacedrive supports both persistent and ephemeral indexing modes:
### Persistent Indexing
@@ -123,27 +211,63 @@ Persistent indexing stores all data in the database permanently. This is the def
### Ephemeral Indexing
Ephemeral indexing keeps data in memory only, perfect for browsing external drives:
Ephemeral indexing keeps data in memory only, perfect for browsing external drives without permanent storage.
```rust
let config = IndexerJobConfig::ephemeral_browse(
usb_path,
IndexScope::Current
);
```
The ephemeral system uses highly memory-optimized structures:
The ephemeral index uses an LRU cache with automatic cleanup:
**NodeArena**: Slab allocator for `FileNode` entries with pointer-sized entry IDs. Provides contiguous memory layout for cache efficiency.
- No database writes
- Session-based lifetime
- Memory-efficient storage
- Automatic expiration
**NameCache**: Global string interning pool. One copy of "index.js" serves thousands of node_modules files.
**NameRegistry**: BTreeMap for fast name-based lookups without full-text indexing overhead.
Memory usage is around 50 bytes per entry, versus 200+ bytes with a naive `HashMap<PathBuf, Entry>` approach. This 4-6x reduction enables browsing hundreds of thousands of files without database overhead.
Multiple directory trees can coexist in the same `EphemeralIndex` (browsing both `/mnt/nas` and `/media/usb` simultaneously), sharing the string interning pool for maximum deduplication.
The `EphemeralIndexCache` tracks which paths have been indexed, are currently being indexed, or are registered for filesystem watching. When a watched path receives filesystem events, `EphemeralWriter` updates the in-memory index in real-time.
<Info>
Ephemeral mode lets you explore USB drives or network shares without
permanently adding them to your library.
</Info>
## Indexer Rules
The `IndexerRuler` applies filtering rules during discovery to skip unwanted files:
**System Rules** are toggleable patterns like:
- `NO_HIDDEN`: Skip dotfiles (`.git`, `.DS_Store`)
- `NO_DEV_DIRS`: Skip `node_modules`, `target`, `dist`
- `NO_SYSTEM`: Skip OS folders (`System32`, `Windows`)
**Git Integration**: When indexing inside a Git repository, rules are dynamically loaded from `.gitignore` files. This automatically excludes build artifacts and local configuration.
Rules return a `RulerDecision` (Accept/Reject) for each path during discovery, preventing unwanted entries from ever reaching the processing phase.
## Index Integrity Verification
The `IndexVerifyAction` checks integrity by running a fresh ephemeral scan and comparing metadata against the existing persistent index:
```rust
let verify = IndexVerifyAction::from_input(IndexVerifyInput { path }).await?;
let output = verify.execute(library, context).await?;
// output.report contains:
// - missing_from_index: Files on disk but not in database
// - stale_in_index: Entries in database but missing from filesystem
// - metadata_mismatches: Size, mtime, or inode differences
```
The verification system detects:
- **MissingFromIndex**: Files created outside Spacedrive
- **StaleInIndex**: Deleted files not yet purged from database
- **SizeMismatch**: Files modified externally
- **ModifiedTimeMismatch**: Timestamp drift (with 1-second tolerance)
- **InodeMismatch**: File replacement or filesystem corruption
Verification runs as a library action and returns a detailed `IntegrityReport` with per-file diagnostics.
## Job System Integration
The indexing system leverages Spacedrive's job infrastructure for reliability and monitoring.
@@ -159,8 +283,8 @@ pub struct IndexerState {
dirs_to_walk: VecDeque<PathBuf>,
entry_batches: Vec<Vec<DirEntry>>,
entry_id_cache: HashMap<PathBuf, i32>,
ephemeral_uuids: HashMap<PathBuf, Uuid>,
stats: IndexerStats,
// ... checkpoint data
}
```
@@ -172,25 +296,17 @@ Real-time progress flows through multiple channels:
```rust
pub struct IndexerProgress {
phase: String,
items_done: u64,
total_items: u64,
bytes_per_second: f64,
eta_seconds: Option<u32>,
pub phase: IndexPhase,
pub total_found: IndexerStats,
pub processing_rate: f32,
pub estimated_remaining: Option<Duration>,
}
```
Progress updates are:
- Sent to UI via channels
- Persisted to database
- Available through job queries
- Used for time estimates
Progress updates are sent to the UI via channels, persisted to the database, exposed through job queries, and used to compute time estimates.
### Error Handling
The job system provides structured error handling:
**Non-critical errors** are accumulated but don't stop indexing:
- Permission denied on individual files
@@ -203,47 +319,6 @@ The job system provides structured error handling:
- Filesystem unmounted
- Out of disk space
## Database Schema
The indexer populates several key tables designed for query performance.
### Entries Table
The core table uses materialized paths for efficient queries:
```sql
CREATE TABLE entries (
id INTEGER PRIMARY KEY,
uuid UUID UNIQUE,
location_id INTEGER,
relative_path TEXT, -- Parent path (materialized)
name TEXT, -- Without extension
extension TEXT,
kind INTEGER, -- 0=File, 1=Directory
size BIGINT,
inode BIGINT, -- Change detection
content_id INTEGER
);
-- Key indexes for performance
CREATE INDEX idx_entries_location_path
ON entries(location_id, relative_path);
```
### Content Identities Table
Enables deduplication across your library:
```sql
CREATE TABLE content_identities (
id INTEGER PRIMARY KEY,
cas_id TEXT UNIQUE,
kind_id INTEGER,
total_size BIGINT,
entry_count INTEGER
);
```
## Performance Characteristics
Indexing performance varies by mode and scope:
@@ -259,32 +334,12 @@ Indexing performance varies by mode and scope:
**Batch Processing**: Groups operations into transactions of 1,000 items, reducing database overhead by 30x.
**Parallel I/O**: Content identification runs on multiple threads, saturating disk bandwidth on fast storage.
**Parallel Discovery**: Work-stealing model with atomic counters for directory traversal, using half of available CPU cores by default.
**Smart Caching**: The entry ID cache eliminates redundant parent lookups, critical for deep directory trees.
**Entry ID Cache**: Eliminates redundant parent lookups during hierarchy construction, critical for deep directory trees.
**Checkpoint Strategy**: Checkpoints occur every 5,000 items or 30 seconds, balancing durability with performance.
## Change Detection
The indexer efficiently detects changes without full rescans:
```rust
// Platform-specific change detection
#[cfg(unix)]
let file_id = metadata.ino(); // inode
#[cfg(windows)]
let file_id = get_file_index(path)?; // File index
```
Detection capabilities:
- New files: Appear with unknown inodes
- Modified files: Same inode, different size/mtime
- Moved files: Known inode at new path
- Deleted files: Missing from filesystem walk
## Usage Examples
### Quick UI Navigation
@@ -310,7 +365,7 @@ let job = IndexerJob::new(config);
### Full Library Location
Comprehensive indexing with all features:
Full indexing with content identification:
```rust
let config = IndexerJobConfig::new(
@@ -318,8 +373,6 @@ let config = IndexerJobConfig::new(
path,
IndexMode::Deep
);
config.with_checkpointing(true)
.with_filters(indexer_rules);
```
## CLI Commands
@@ -342,9 +395,9 @@ spacedrive job monitor # Watch progress
### Common Issues
**Slow Indexing**: Check for large node_modules or build directories. Use `.spacedriveignore` files to exclude them.
**Slow Indexing**: Check for large `node_modules` or build directories. System rules automatically skip common patterns, or use `.gitignore` to exclude project-specific artifacts.
**High Memory Usage**: Reduce batch size or avoid ephemeral mode for very large directories.
**High Memory Usage**: Reduce batch size for directories over 1M files. Ephemeral mode uses around 50 bytes per entry, so 100K files require roughly 5 MB.
**Resume Not Working**: Ensure the jobs database isn't corrupted. Check logs for serialization errors.
@@ -364,7 +417,7 @@ spacedrive job info <job-id> --detailed
## Platform Notes
**Windows**: Uses file indices for change detection. Supports long paths transparently. Network drives may require polling.
**Windows**: Uses file indices for change detection where available, falling back to path-only matching. Supports long paths transparently. Network drives may require polling.
**macOS**: Leverages FSEvents and native inodes. Integrates with Time Machine exclusions. APFS provides efficient cloning.
@@ -372,15 +425,15 @@ spacedrive job info <job-id> --detailed
## Best Practices
1. **Start shallow** for new locations to verify configuration
2. **Use filters** to exclude build artifacts and caches
3. **Monitor progress** through the job system instead of polling
4. **Schedule deep scans** during low-usage periods
5. **Enable checkpointing** for locations over 100K files
1. **Start shallow** for new locations to verify configuration before deep scans
2. **Use Git repositories** to automatically inherit `.gitignore` exclusions
3. **Monitor progress** through the job system instead of polling the database
4. **Schedule deep scans** during low-usage periods for large photo/video libraries
5. **Enable checkpointing** for locations over 100K files to survive interruptions
<Warning>
Always let indexing jobs complete or pause them properly. Force-killing can
corrupt the job state.
corrupt the job state and require reindexing from scratch.
</Warning>
## Related Documentation