From 8fc01634f6a6789d2b34bbe203cd047bd22e1df1 Mon Sep 17 00:00:00 2001
From: Jamie Pine
Date: Sun, 7 Dec 2025 15:45:26 -0800
Subject: [PATCH 01/20] indexing: Rayon-style parallel discovery without
 Rayon, due to async requirements

---
 core/Cargo.toml                           |   7 +-
 core/src/ops/indexing/phases/discovery.rs | 384 ++++++++++++++++++++--
 core/src/ops/indexing/state.rs            |   7 +-
 core/tests/indexing_test.rs               |  18 +-
 4 files changed, 383 insertions(+), 33 deletions(-)

diff --git a/core/Cargo.toml b/core/Cargo.toml
index 4a66f6b5b..209103542 100644
--- a/core/Cargo.toml
+++ b/core/Cargo.toml
@@ -19,9 +19,10 @@ cli = []
 
 [dependencies]
 # Async runtime
-async-trait = "0.1"
-futures = "0.3"
-tokio = { version = "1.40", features = ["full"] }
+async-channel = { workspace = true }
+async-trait = "0.1"
+futures = "0.3"
+tokio = { version = "1.40", features = ["full"] }
 
 # Database
 sea-orm = { version = "1.1", features = [
diff --git a/core/src/ops/indexing/phases/discovery.rs b/core/src/ops/indexing/phases/discovery.rs
index 549537ac8..b1d2fb08a 100644
--- a/core/src/ops/indexing/phases/discovery.rs
+++ b/core/src/ops/indexing/phases/discovery.rs
@@ -9,7 +9,9 @@ use crate::{
 		state::{DirEntry, EntryKind, IndexError, IndexPhase, IndexerProgress, IndexerState},
 	},
 };
+use async_channel as chan;
 use std::path::PathBuf;
+use std::sync::atomic::{AtomicBool, AtomicU64, AtomicUsize, Ordering};
 use std::time::Instant;
 use std::{path::Path, sync::Arc};
 
@@ -22,7 +24,7 @@ impl crate::ops::indexing::rules::MetadataForIndexerRules for SimpleMetadata {
 	}
 }
 
-/// Run the discovery phase of indexing
+/// Run the discovery phase of indexing with parallel directory walking
 pub async fn run_discovery_phase(
 	state: &mut IndexerState,
 	ctx: &JobContext<'_>,
@@ -31,33 +33,382 @@ pub async fn run_discovery_phase(
 	volume_backend: Option<&Arc<dyn VolumeBackend>>,
 	cloud_url_base: Option<String>,
 ) -> Result<(), JobError> {
+	let concurrency = state.discovery_concurrency;
+
+	if concurrency <= 1 {
+		// Fall back to sequential discovery for concurrency = 1
+		return run_discovery_phase_sequential(
+			state,
+			ctx,
+			root_path,
+			rule_toggles,
+			volume_backend,
+			cloud_url_base,
+		)
+		.await;
+	}
+
 	ctx.log(format!(
-		"Discovery phase starting from: {}",
-		root_path.display()
+		"Discovery phase starting from: {} (concurrency: {})",
+		root_path.display(),
+		concurrency
 	));
 	ctx.log(format!(
 		"Initial directories to walk: {}",
 		state.dirs_to_walk.len()
 	));
 
-	let mut skipped_count = 0u64;
+	run_parallel_discovery(state, ctx, root_path, rule_toggles, volume_backend, cloud_url_base)
+		.await
+}
 
-	let toggles = rule_toggles;
+/// Parallel discovery implementation: Rayon-style work distribution over a
+/// shared MPMC queue (not true work-stealing)
+async fn run_parallel_discovery(
+	state: &mut IndexerState,
+	ctx: &JobContext<'_>,
+	root_path: &Path,
+	rule_toggles: RuleToggles,
+	volume_backend: Option<&Arc<dyn VolumeBackend>>,
+	cloud_url_base: Option<String>,
+) -> Result<(), JobError> {
+	let concurrency = state.discovery_concurrency;
+
+	// Use unbounded channels to avoid backpressure/deadlock issues
+	let (work_tx, work_rx) = chan::unbounded::<PathBuf>();
+	let (result_tx, result_rx) = chan::unbounded::<DiscoveryResult>();
+
+	// Atomic counter tracking work in progress + shutdown signal
+	// INVARIANT: incremented BEFORE sending to work channel, decremented AFTER processing
+	let pending_work = Arc::new(AtomicUsize::new(0));
+	let skipped_count = Arc::new(AtomicU64::new(0));
+	let shutdown = Arc::new(AtomicBool::new(false));
+
+	// Seed initial work
+	while let Some(dir) = state.dirs_to_walk.pop_front() {
+		pending_work.fetch_add(1, Ordering::Release);
+		work_tx
+			.send(dir)
+			.await
+			.map_err(|_| 
JobError::execution("Work channel closed"))?; + } + + // Spawn worker tasks + let mut workers = Vec::new(); + for worker_id in 0..concurrency { + let work_rx = work_rx.clone(); + let work_tx = work_tx.clone(); + let result_tx = result_tx.clone(); + let pending_work = Arc::clone(&pending_work); + let skipped_count = Arc::clone(&skipped_count); + let shutdown = Arc::clone(&shutdown); + let root_path = root_path.to_path_buf(); + let volume_backend = volume_backend.cloned(); + let cloud_url_base = cloud_url_base.clone(); + + let worker = tokio::spawn(async move { + discovery_worker_rayon( + worker_id, + work_rx, + work_tx, + result_tx, + pending_work, + skipped_count, + shutdown, + root_path, + rule_toggles, + volume_backend, + cloud_url_base, + ) + .await + }); + + workers.push(worker); + } + + // Monitor task: signals shutdown when all work is done + let monitor = tokio::spawn({ + let shutdown = Arc::clone(&shutdown); + let pending_work = Arc::clone(&pending_work); + async move { + loop { + tokio::time::sleep(tokio::time::Duration::from_millis(10)).await; + if pending_work.load(Ordering::Acquire) == 0 { + shutdown.store(true, Ordering::Release); + break; + } + } + } + }); + + // Drop our copies so channels close when workers are done + drop(work_tx); + drop(result_tx); + + // Collect results + let mut total_processed = 0u64; + while let Ok(result) = result_rx.recv().await { + match result { + DiscoveryResult::Entry(entry) => { + state.pending_entries.push(entry); + total_processed += 1; + + if state.should_create_batch() { + let batch = state.create_batch(); + state.entry_batches.push(batch); + } + } + DiscoveryResult::Stats { + files, + dirs, + symlinks, + bytes, + } => { + state.stats.files += files; + state.stats.dirs += dirs; + state.stats.symlinks += symlinks; + state.stats.bytes += bytes; + } + DiscoveryResult::Error(error) => { + state.add_error(error); + } + DiscoveryResult::Progress { dirs_queued } => { + let indexer_progress = IndexerProgress { + phase: IndexPhase::Discovery { dirs_queued }, + current_path: root_path.display().to_string(), + total_found: state.stats, + processing_rate: state.calculate_rate(), + estimated_remaining: state.estimate_remaining(), + scope: None, + persistence: None, + is_ephemeral: false, + action_context: None, + }; + ctx.progress(Progress::generic(indexer_progress.to_generic_progress())); + state.items_since_last_update += 1; + } + DiscoveryResult::QueueDirectories(_) => { + // Workers queue directly, this shouldn't happen + unreachable!("Workers should not send QueueDirectories in Rayon-style mode"); + } + } + + ctx.check_interrupt().await?; + } + + // Wait for monitor and workers + monitor + .await + .map_err(|e| JobError::execution(format!("Monitor task failed: {}", e)))?; + + for worker in workers { + worker + .await + .map_err(|e| JobError::execution(format!("Worker task failed: {}", e)))?; + } + + // Final batch + if !state.pending_entries.is_empty() { + let final_batch_size = state.pending_entries.len(); + ctx.log(format!( + "Creating final batch with {} entries", + final_batch_size + )); + let batch = state.create_batch(); + state.entry_batches.push(batch); + } + + let skipped = skipped_count.load(Ordering::SeqCst); + state.stats.skipped = skipped; + + ctx.log(format!( + "Parallel discovery complete: {} files, {} dirs, {} symlinks, {} skipped, {} batches created", + state.stats.files, + state.stats.dirs, + state.stats.symlinks, + skipped, + state.entry_batches.len() + )); + + state.phase = crate::ops::indexing::state::Phase::Processing; + 
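+	// A minimal trace of the termination protocol above, assuming a root with
+	// one subdirectory: seed root -> pending_work = 1; a worker queues the
+	// subdir (fetch_add BEFORE send) -> 2; the worker finishes the root
+	// (fetch_sub AFTER processing) -> 1; a worker finishes the subdir -> 0;
+	// the monitor observes zero and signals shutdown. Incrementing before
+	// sending means the counter cannot reach zero while work is still queued,
+	// so the monitor can never fire early.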
Ok(())
+}
+
+/// Result types sent from workers back to coordinator
+enum DiscoveryResult {
+	Entry(DirEntry),
+	QueueDirectories(Vec<PathBuf>),
+	Stats {
+		files: u64,
+		dirs: u64,
+		symlinks: u64,
+		bytes: u64,
+	},
+	Error(IndexError),
+	Progress { dirs_queued: usize },
+}
+
+/// Rayon-style worker: processes directories and directly enqueues new work
+async fn discovery_worker_rayon(
+	_worker_id: usize,
+	work_rx: chan::Receiver<PathBuf>,
+	work_tx: chan::Sender<PathBuf>,
+	result_tx: chan::Sender<DiscoveryResult>,
+	pending_work: Arc<AtomicUsize>,
+	skipped_count: Arc<AtomicU64>,
+	shutdown: Arc<AtomicBool>,
+	root_path: PathBuf,
+	rule_toggles: RuleToggles,
+	volume_backend: Option<Arc<dyn VolumeBackend>>,
+	cloud_url_base: Option<String>,
+) {
+	let mut seen_paths = std::collections::HashSet::new();
+
+	loop {
+		// Check shutdown signal
+		if shutdown.load(Ordering::Acquire) {
+			break;
+		}
+
+		// Try to get work with a timeout to periodically check shutdown
+		let dir_path = match tokio::time::timeout(
+			tokio::time::Duration::from_millis(50),
+			work_rx.recv(),
+		)
+		.await
+		{
+			Ok(Ok(path)) => path,
+			Ok(Err(_)) => break, // Channel closed
+			Err(_) => continue,  // Timeout, check shutdown flag again
+		};
+
+		// Skip if already seen (handles symlink loops)
+		if !seen_paths.insert(dir_path.clone()) {
+			pending_work.fetch_sub(1, Ordering::Release);
+			continue;
+		}
+
+		// Build rules for this directory
+		let dir_ruler = build_default_ruler(rule_toggles, &root_path, &dir_path).await;
+
+		// Read directory
+		match read_directory(&dir_path, volume_backend.as_ref(), cloud_url_base.as_deref()).await
+		{
+			Ok(entries) => {
+				let mut local_stats = LocalStats::default();
+
+				for entry in entries {
+					// Apply rules
+					let decision = dir_ruler
+						.evaluate_path(
+							&entry.path,
+							&SimpleMetadata {
+								is_dir: matches!(entry.kind, EntryKind::Directory),
+							},
+						)
+						.await;
+
+					if matches!(decision, Ok(RulerDecision::Reject)) {
+						skipped_count.fetch_add(1, Ordering::Relaxed);
+						continue;
+					}
+
+					if let Err(err) = decision {
+						let _ = result_tx
+							.send(DiscoveryResult::Error(IndexError::FilterCheck {
+								path: entry.path.to_string_lossy().to_string(),
+								error: err.to_string(),
+							}))
+							.await;
+						continue;
+					}
+
+					match entry.kind {
+						EntryKind::Directory => {
+							local_stats.dirs += 1;
+							// Rayon-style: increment BEFORE queueing, worker directly enqueues
+							pending_work.fetch_add(1, Ordering::Release);
+							if work_tx.send(entry.path.clone()).await.is_err() {
+								// Channel closed, decrement and continue
+								pending_work.fetch_sub(1, Ordering::Release);
+							}
+							let _ = result_tx.send(DiscoveryResult::Entry(entry)).await;
+						}
+						EntryKind::File => {
+							local_stats.files += 1;
+							local_stats.bytes += entry.size;
+							let _ = result_tx.send(DiscoveryResult::Entry(entry)).await;
+						}
+						EntryKind::Symlink => {
+							local_stats.symlinks += 1;
+							let _ = result_tx.send(DiscoveryResult::Entry(entry)).await;
+						}
+					}
+				}
+
+				// Send stats update
+				let _ = result_tx
+					.send(DiscoveryResult::Stats {
+						files: local_stats.files,
+						dirs: local_stats.dirs,
+						symlinks: local_stats.symlinks,
+						bytes: local_stats.bytes,
+					})
+					.await;
+
+				// Send progress update
+				let dirs_queued = pending_work.load(Ordering::Acquire);
+				let _ = result_tx
+					.send(DiscoveryResult::Progress { dirs_queued })
+					.await;
+			}
+			Err(e) => {
+				let _ = result_tx
+					.send(DiscoveryResult::Error(IndexError::ReadDir {
+						path: dir_path.to_string_lossy().to_string(),
+						error: e.to_string(),
+					}))
+					.await;
+			}
+		}
+
+		// Decrement AFTER processing complete
+		pending_work.fetch_sub(1, Ordering::Release);
+	}
+}
+
+#[derive(Default)]
+struct LocalStats {
+	files: u64,
+	dirs: u64,
+	symlinks: 
u64, + bytes: u64, +} + +/// Sequential discovery fallback (original implementation) +async fn run_discovery_phase_sequential( + state: &mut IndexerState, + ctx: &JobContext<'_>, + root_path: &Path, + rule_toggles: RuleToggles, + volume_backend: Option<&Arc>, + cloud_url_base: Option, +) -> Result<(), JobError> { + ctx.log(format!( + "Discovery phase starting from: {} (sequential mode)", + root_path.display() + )); + + let mut skipped_count = 0u64; while let Some(dir_path) = state.dirs_to_walk.pop_front() { ctx.check_interrupt().await?; - // Skip if already seen (handles symlink loops) if !state.seen_paths.insert(dir_path.clone()) { continue; } - // Build rules in the context of the current directory for gitignore behavior - let dir_ruler = build_default_ruler(toggles, root_path, &dir_path).await; + let dir_ruler = build_default_ruler(rule_toggles, root_path, &dir_path).await; - // Do not skip the directory itself by rules; only apply rules to its entries - - // Update progress let indexer_progress = IndexerProgress { phase: IndexPhase::Discovery { dirs_queued: state.dirs_to_walk.len(), @@ -69,21 +420,18 @@ pub async fn run_discovery_phase( scope: None, persistence: None, is_ephemeral: false, - action_context: None, // TODO: Pass action context from job state + action_context: None, }; ctx.progress(Progress::generic(indexer_progress.to_generic_progress())); - // Read directory entries with per-dir FS timing match read_directory(&dir_path, volume_backend, cloud_url_base.as_deref()).await { Ok(entries) => { let entry_count = entries.len(); let mut added_count = 0; for entry in entries { - // Check for interruption during entry processing ctx.check_interrupt().await?; - // Skip filtered entries via rules engine let decision = dir_ruler .evaluate_path( &entry.path, @@ -95,7 +443,6 @@ pub async fn run_discovery_phase( if matches!(decision, Ok(RulerDecision::Reject)) { state.stats.skipped += 1; skipped_count += 1; - eprintln!("[discovery] Filtered entry: {}", entry.path.display()); continue; } if let Err(err) = decision { @@ -135,7 +482,6 @@ pub async fn run_discovery_phase( )); } - // Batch entries if state.should_create_batch() { let batch = state.create_batch(); state.entry_batches.push(batch); @@ -151,13 +497,9 @@ pub async fn run_discovery_phase( } } - // Update rate tracking state.items_since_last_update += 1; - - // State is automatically saved during job serialization on shutdown } - // Final batch if !state.pending_entries.is_empty() { let final_batch_size = state.pending_entries.len(); ctx.log(format!( diff --git a/core/src/ops/indexing/state.rs b/core/src/ops/indexing/state.rs index 93a8913ad..4e769f8ee 100644 --- a/core/src/ops/indexing/state.rs +++ b/core/src/ops/indexing/state.rs @@ -135,6 +135,11 @@ impl IndexerState { dirs_to_walk.push_back(path.to_path_buf()); } + // Use half of available CPU cores for parallel discovery (Rayon-style) + let discovery_concurrency = std::thread::available_parallelism() + .map(|n| usize::max(n.get() / 2, 1)) + .unwrap_or(4); + Self { phase: Phase::Discovery, started_at: Instant::now(), @@ -150,7 +155,7 @@ impl IndexerState { last_progress_time: Instant::now(), items_since_last_update: 0, batch_size: 1000, - discovery_concurrency: 1, + discovery_concurrency, dirs_channel_capacity: 4096, entries_channel_capacity: 16384, } diff --git a/core/tests/indexing_test.rs b/core/tests/indexing_test.rs index d403a9c37..2a525c215 100644 --- a/core/tests/indexing_test.rs +++ b/core/tests/indexing_test.rs @@ -173,15 +173,16 @@ async fn test_location_indexing() -> 
Result<(), Box> { // 8. Verify indexed entries in database // Helper to get all entry IDs under the location let get_location_entry_ids = || async { - let descendant_ids = entry_closure::Entity::find() - .filter(entry_closure::Column::AncestorId.eq(location_entry_id)) + let location_id = location_entry_id.expect("Location should have entry_id"); + let descendant_ids: Vec = entry_closure::Entity::find() + .filter(entry_closure::Column::AncestorId.eq(location_id)) .all(db.conn()) .await? .into_iter() .map(|ec| ec.descendant_id) - .collect::>(); + .collect(); - let mut all_ids = vec![location_entry_id]; + let mut all_ids = vec![location_id]; all_ids.extend(descendant_ids); Ok::, anyhow::Error>(all_ids) }; @@ -337,15 +338,16 @@ async fn test_incremental_indexing() -> Result<(), Box> { } // Get all entry IDs under this location - let descendant_ids = entry_closure::Entity::find() - .filter(entry_closure::Column::AncestorId.eq(location_entry_id)) + let location_id = location_entry_id.expect("Location should have entry_id"); + let descendant_ids: Vec = entry_closure::Entity::find() + .filter(entry_closure::Column::AncestorId.eq(location_id)) .all(db.conn()) .await? .into_iter() .map(|ec| ec.descendant_id) - .collect::>(); + .collect(); - let mut all_entry_ids = vec![location_entry_id]; + let mut all_entry_ids = vec![location_id]; all_entry_ids.extend(descendant_ids); let initial_file_count = entities::entry::Entity::find() From cf400865f4d0333c7216ddcb464943e1ad877cf9 Mon Sep 17 00:00:00 2001 From: Jamie Pine Date: Sun, 7 Dec 2025 19:55:34 -0800 Subject: [PATCH 02/20] Introduce ephemeral index cache and status API - Add a complete ephemeral indexing subsystem - core/src/ops/core/ephemeral_status with input/output and query types - core/src/ops/indexing/ephemeral with arena, cache, registry, index_cache, types - expose EphemeralIndexCache and EphemeralIndex through core modules - EphemeralIndexCache supports get/insert/create_for_indexing/mark_indexing_complete eviction and stats - Implement EphemeralIndex data structures for memory-efficient storage - NodeArena, NameCache, NameRegistry, and related types - Add EphemeralIndex status API - EphemeralCacheStatusInput and EphemeralCacheStatusQuery - EphemeralCacheStatus with per-index details - Wire ephemeral indexing into the indexing flow - Change default Ephemeral Indexer behavior to shallow mode - Align code to EphemeralIndex usage across the codebase - Enhance content kind detection in UI - Add getContentKind(file) helper (prefers content_identity.kind, then content_kind) - Use getContentKind in Explorer utilities and UI components - Invalidate directory listings when location index_mode changes - Add useLocationChangeInvalidation to trigger refetches for ephemeral vs persistent indexing transitions - Misc refactors and formatting to accommodate the new modules and APIs --- Cargo.lock | Bin 325215 -> 325293 bytes apps/cli/src/domains/index/args.rs | 17 + apps/cli/src/domains/index/mod.rs | 94 + apps/cli/src/domains/update/mod.rs | 2 +- .../modules/sd-mobile-core/package.json | 5 +- .../modules/sd-mobile-core/src/index.ts | 12 +- apps/tauri/src-tauri/src/main.rs | 56 +- core/Cargo.toml | 6 + core/src/context.rs | 14 +- core/src/crypto/cloud_credentials.rs | 3 +- core/src/domain/file.rs | 4 +- core/src/filetype/registry.rs | 29 + core/src/infra/action/manager.rs | 4 +- core/src/infra/action/mod.rs | 20 +- core/src/infra/sync/event_log/aggregator.rs | 14 +- core/src/infra/sync/event_log/logger.rs | 20 +- core/src/infra/sync/event_log/query.rs | 3 +- 
core/src/infra/sync/event_log/types.rs | 6 +- core/src/lib.rs | 6 +- core/src/library/manager.rs | 5 +- core/src/ops/core/ephemeral_status/mod.rs | 9 + core/src/ops/core/ephemeral_status/output.rs | 58 + core/src/ops/core/ephemeral_status/query.rs | 100 + core/src/ops/core/mod.rs | 1 + core/src/ops/files/query/directory_listing.rs | 194 +- core/src/ops/indexing/entry.rs | 1723 +++++++------- core/src/ops/indexing/ephemeral/arena.rs | 169 ++ core/src/ops/indexing/ephemeral/cache.rs | 206 ++ .../src/ops/indexing/ephemeral/index_cache.rs | 300 +++ core/src/ops/indexing/ephemeral/mod.rs | 50 + core/src/ops/indexing/ephemeral/registry.rs | 224 ++ core/src/ops/indexing/ephemeral/types.rs | 470 ++++ core/src/ops/indexing/job.rs | 527 +++-- core/src/ops/indexing/mod.rs | 4 +- core/src/ops/indexing/persistence.rs | 143 +- core/src/ops/indexing/phases/discovery.rs | 22 +- core/src/ops/indexing/verify/action.rs | 2 +- core/src/ops/libraries/open/action.rs | 5 +- .../ops/locations/enable_indexing/action.rs | 15 +- .../ops/locations/enable_indexing/output.rs | 5 +- core/src/ops/media/thumbnail/job.rs | 99 +- core/src/ops/network/sync_setup/action.rs | 5 +- core/src/service/file_sharing.rs | 14 +- .../src/service/network/device/persistence.rs | 29 +- .../service/network/protocol/sync/handler.rs | 20 +- core/src/service/sync/backfill.rs | 17 +- core/src/service/sync/mod.rs | 7 +- core/src/service/sync/peer.rs | 5 +- core/src/service/sync/state.rs | 4 +- core/src/volume/backend/local.rs | 2 +- core/src/volume/fs/ntfs.rs | 558 ++--- core/src/volume/fs/refs.rs | 8 +- core/src/volume/manager.rs | 8 +- core/src/volume/platform/windows.rs | 4 +- core/tests/sync_event_log_test.rs | 44 +- docs/workbench | 2 +- packages/interface/src/Explorer.tsx | 671 +++--- .../src/components/Explorer/File/Thumb.tsx | 355 +-- .../src/components/Explorer/utils.ts | 94 +- .../Explorer/views/GridView/FileCard.tsx | 1036 +++++---- .../Explorer/views/KnowledgeView.tsx | 710 +++--- .../views/MediaView/MediaViewItem.tsx | 40 +- .../QuickPreview/ContentRenderer.tsx | 569 ++--- .../components/QuickPreview/QuickPreview.tsx | 264 ++- .../QuickPreview/QuickPreviewFullscreen.tsx | 97 +- .../hooks/useLocationChangeInvalidation.ts | 94 + .../src/inspectors/FileInspector.tsx | 2048 +++++++++-------- 67 files changed, 6961 insertions(+), 4390 deletions(-) create mode 100644 core/src/ops/core/ephemeral_status/mod.rs create mode 100644 core/src/ops/core/ephemeral_status/output.rs create mode 100644 core/src/ops/core/ephemeral_status/query.rs create mode 100644 core/src/ops/indexing/ephemeral/arena.rs create mode 100644 core/src/ops/indexing/ephemeral/cache.rs create mode 100644 core/src/ops/indexing/ephemeral/index_cache.rs create mode 100644 core/src/ops/indexing/ephemeral/mod.rs create mode 100644 core/src/ops/indexing/ephemeral/registry.rs create mode 100644 core/src/ops/indexing/ephemeral/types.rs create mode 100644 packages/interface/src/hooks/useLocationChangeInvalidation.ts diff --git a/Cargo.lock b/Cargo.lock index 84c98a427ae341e237109f1e4d90ee59198ae5ce..dd0229718646d5d00c655c14f32af3bdf90d2fe9 100644 GIT binary patch delta 98 zcmccrMtJR8;f5B*7N#xCyG*7ZGGG?kzRQI9Eu(5~YHn^~fsulNo~53Jk`9-GQeJ*Z zW?ChLS(+Q4Tu@p(`6Gw$^aEzhGLtp7@^8Ow#;nCSU2h39Z#%06^LADXmZ*II)BzzV delta 63 zcmV-F0KosP?GxYZ6M%#PgaWh!x-^$9Is*&0*E9p-0S0bwbZKUJlko@>m+>|O9Jd`e V11SNQca#Gshk7^zw|Y1PGrk)H7byS$ diff --git a/apps/cli/src/domains/index/args.rs b/apps/cli/src/domains/index/args.rs index 4b805d12f..0a3a199cb 100644 --- a/apps/cli/src/domains/index/args.rs +++ 
b/apps/cli/src/domains/index/args.rs @@ -4,6 +4,7 @@ use uuid::Uuid; use sd_core::{ domain::addressing::SdPath, + ops::core::ephemeral_status::EphemeralCacheStatusInput, ops::indexing::{ input::IndexInput, job::{IndexMode, IndexPersistence, IndexScope}, @@ -169,3 +170,19 @@ impl IndexVerifyArgs { } } } + +/// Arguments for ephemeral cache status +#[derive(Args, Debug, Clone)] +pub struct EphemeralCacheArgs { + /// Filter by path substring + #[arg(long)] + pub filter: Option, +} + +impl EphemeralCacheArgs { + pub fn to_input(&self) -> EphemeralCacheStatusInput { + EphemeralCacheStatusInput { + path_filter: self.filter.clone(), + } + } +} diff --git a/apps/cli/src/domains/index/mod.rs b/apps/cli/src/domains/index/mod.rs index 2face4f1f..6581d4129 100644 --- a/apps/cli/src/domains/index/mod.rs +++ b/apps/cli/src/domains/index/mod.rs @@ -2,6 +2,7 @@ pub mod args; use anyhow::Result; use clap::Subcommand; +use comfy_table::{presets::UTF8_BORDERS_ONLY, Attribute, Cell, Table}; use crate::util::prelude::*; @@ -20,6 +21,8 @@ pub enum IndexCmd { Browse(BrowseArgs), /// Verify index integrity for a path Verify(IndexVerifyArgs), + /// Show ephemeral index cache status + EphemeralCache(EphemeralCacheArgs), } pub async fn run(ctx: &Context, cmd: IndexCmd) -> Result<()> { @@ -232,6 +235,97 @@ pub async fn run(ctx: &Context, cmd: IndexCmd) -> Result<()> { } ); } + IndexCmd::EphemeralCache(args) => { + let input = args.to_input(); + let out: sd_core::ops::core::ephemeral_status::EphemeralCacheStatus = + execute_core_query!(ctx, input); + + print_output!( + ctx, + &out, + |status: &sd_core::ops::core::ephemeral_status::EphemeralCacheStatus| { + println!(); + println!("╔══════════════════════════════════════════════════════════════╗"); + println!("║ EPHEMERAL INDEX CACHE STATUS ║"); + println!("╠══════════════════════════════════════════════════════════════╣"); + println!( + "║ Total Indexes: {:3} In Progress: {:3} Stale: {:3} ║", + status.total_indexes, status.indexing_in_progress, status.stale_count + ); + println!("╚══════════════════════════════════════════════════════════════╝"); + + if status.indexes.is_empty() { + println!("\n No ephemeral indexes cached."); + } else { + for idx in &status.indexes { + println!(); + let mut table = Table::new(); + table.load_preset(UTF8_BORDERS_ONLY); + + let status_indicator = if idx.indexing_in_progress { + "● INDEXING" + } else { + "○ Ready" + }; + + table.set_header(vec![ + Cell::new(format!("{}", idx.root_path.display())) + .add_attribute(Attribute::Bold), + Cell::new(status_indicator), + ]); + + table.add_row(vec!["Entries (arena)", &idx.total_entries.to_string()]); + table.add_row(vec![ + "Path index count", + &idx.path_index_count.to_string(), + ]); + table.add_row(vec!["Unique names", &idx.unique_names.to_string()]); + table.add_row(vec![ + "Interned strings", + &idx.interned_strings.to_string(), + ]); + table.add_row(vec!["Content kinds", &idx.content_kinds.to_string()]); + table.add_row(vec![ + "Memory usage", + &format_bytes(idx.memory_bytes as u64), + ]); + table.add_row(vec!["Age", &format!("{:.1}s", idx.age_seconds)]); + table.add_row(vec!["Idle time", &format!("{:.1}s", idx.idle_seconds)]); + table.add_row(vec![ + "Job stats", + &format!( + "{} files, {} dirs, {} symlinks, {}", + idx.job_stats.files, + idx.job_stats.dirs, + idx.job_stats.symlinks, + format_bytes(idx.job_stats.bytes) + ), + ]); + + println!("{}", table); + } + } + println!(); + } + ); + } } Ok(()) } + +fn format_bytes(bytes: u64) -> String { + const UNITS: &[&str] = &["B", "KB", "MB", 
"GB", "TB"]; + let mut size = bytes as f64; + let mut unit_index = 0; + + while size >= 1024.0 && unit_index < UNITS.len() - 1 { + size /= 1024.0; + unit_index += 1; + } + + if unit_index == 0 { + format!("{} {}", bytes, UNITS[unit_index]) + } else { + format!("{:.1} {}", size, UNITS[unit_index]) + } +} diff --git a/apps/cli/src/domains/update/mod.rs b/apps/cli/src/domains/update/mod.rs index 71430a26a..586fe2388 100644 --- a/apps/cli/src/domains/update/mod.rs +++ b/apps/cli/src/domains/update/mod.rs @@ -268,4 +268,4 @@ async fn start_daemon(data_dir: &PathBuf) -> Result<()> { tokio::time::sleep(tokio::time::Duration::from_millis(500)).await; Ok(()) -} \ No newline at end of file +} diff --git a/apps/mobile/modules/sd-mobile-core/package.json b/apps/mobile/modules/sd-mobile-core/package.json index 882679647..eb4b285bb 100644 --- a/apps/mobile/modules/sd-mobile-core/package.json +++ b/apps/mobile/modules/sd-mobile-core/package.json @@ -2,5 +2,8 @@ "name": "sd-mobile-core", "version": "1.0.0", "main": "./src/index.ts", - "types": "./src/index.ts" + "types": "./src/index.ts", + "peerDependencies": { + "expo-modules-core": "*" + } } \ No newline at end of file diff --git a/apps/mobile/modules/sd-mobile-core/src/index.ts b/apps/mobile/modules/sd-mobile-core/src/index.ts index 0a74856f3..7d52bf893 100644 --- a/apps/mobile/modules/sd-mobile-core/src/index.ts +++ b/apps/mobile/modules/sd-mobile-core/src/index.ts @@ -1,13 +1,7 @@ -// @ts-ignore - Expo modules types may not be available in all environments -const { EventEmitter, NativeModulesProxy } = require("expo-modules-core"); +// TODO: Test if we can rely on Expo's autolinking instead of manually requiring the module +import { requireNativeModule, EventEmitter } from "expo-modules-core"; -const SDMobileCoreModule = NativeModulesProxy?.SDMobileCore; - -if (!SDMobileCoreModule) { - throw new Error( - "SDMobileCore native module not found. 
Did you run 'cargo xtask build-mobile' and rebuild the app?", - ); -} +const SDMobileCoreModule = requireNativeModule("SDMobileCore"); const emitter = new EventEmitter(SDMobileCoreModule); diff --git a/apps/tauri/src-tauri/src/main.rs b/apps/tauri/src-tauri/src/main.rs index 77221eb4e..ccefc0044 100644 --- a/apps/tauri/src-tauri/src/main.rs +++ b/apps/tauri/src-tauri/src/main.rs @@ -800,17 +800,25 @@ async fn stop_daemon_process( async fn check_daemon_installed() -> Result { #[cfg(target_os = "macos")] { - let home = std::env::var("HOME").map_err(|_| "Could not determine home directory".to_string())?; - let plist_path = std::path::PathBuf::from(home).join("Library/LaunchAgents/com.spacedrive.daemon.plist"); + let home = + std::env::var("HOME").map_err(|_| "Could not determine home directory".to_string())?; + let plist_path = + std::path::PathBuf::from(home).join("Library/LaunchAgents/com.spacedrive.daemon.plist"); let exists = plist_path.exists(); - tracing::info!("Checking daemon installation at {}: {}", plist_path.display(), exists); + tracing::info!( + "Checking daemon installation at {}: {}", + plist_path.display(), + exists + ); Ok(exists) } #[cfg(target_os = "linux")] { - let home = std::env::var("HOME").map_err(|_| "Could not determine home directory".to_string())?; - let service_path = std::path::PathBuf::from(home).join(".config/systemd/user/spacedrive-daemon.service"); + let home = + std::env::var("HOME").map_err(|_| "Could not determine home directory".to_string())?; + let service_path = + std::path::PathBuf::from(home).join(".config/systemd/user/spacedrive-daemon.service"); Ok(service_path.exists()) } @@ -865,7 +873,8 @@ async fn install_daemon_service( { use std::io::Write; - let home = std::env::var("HOME").map_err(|_| "Could not determine home directory".to_string())?; + let home = + std::env::var("HOME").map_err(|_| "Could not determine home directory".to_string())?; let launch_agents_dir = std::path::PathBuf::from(&home).join("Library/LaunchAgents"); std::fs::create_dir_all(&launch_agents_dir) @@ -881,7 +890,10 @@ async fn install_daemon_service( .join("sd-daemon"); if !daemon_path.exists() { - return Err(format!("Daemon binary not found at {}", daemon_path.display())); + return Err(format!( + "Daemon binary not found at {}", + daemon_path.display() + )); } let log_dir = data_dir.join("logs"); @@ -938,7 +950,10 @@ async fn install_daemon_service( .output() .map_err(|e| format!("Failed to load service: {}", e))?; - tracing::info!("launchctl load output: {:?}", String::from_utf8_lossy(&output.stdout)); + tracing::info!( + "launchctl load output: {:?}", + String::from_utf8_lossy(&output.stdout) + ); if !output.status.success() { let stderr = String::from_utf8_lossy(&output.stderr); tracing::error!("launchctl load failed: {:?}", stderr); @@ -980,7 +995,8 @@ async fn install_daemon_service( { use std::io::Write; - let home = std::env::var("HOME").map_err(|_| "Could not determine home directory".to_string())?; + let home = + std::env::var("HOME").map_err(|_| "Could not determine home directory".to_string())?; let systemd_dir = std::path::PathBuf::from(&home).join(".config/systemd/user"); std::fs::create_dir_all(&systemd_dir) @@ -995,7 +1011,10 @@ async fn install_daemon_service( .join("sd-daemon"); if !daemon_path.exists() { - return Err(format!("Daemon binary not found at {}", daemon_path.display())); + return Err(format!( + "Daemon binary not found at {}", + daemon_path.display() + )); } let service_content = format!( @@ -1112,7 +1131,10 @@ WantedBy=default.target 
.join("sd-daemon.exe"); if !daemon_path.exists() { - return Err(format!("Daemon binary not found at {}", daemon_path.display())); + return Err(format!( + "Daemon binary not found at {}", + daemon_path.display() + )); } // Delete existing task if it exists @@ -1248,8 +1270,10 @@ WantedBy=default.target async fn uninstall_daemon_service() -> Result<(), String> { #[cfg(target_os = "macos")] { - let home = std::env::var("HOME").map_err(|_| "Could not determine home directory".to_string())?; - let plist_path = std::path::PathBuf::from(&home).join("Library/LaunchAgents/com.spacedrive.daemon.plist"); + let home = + std::env::var("HOME").map_err(|_| "Could not determine home directory".to_string())?; + let plist_path = std::path::PathBuf::from(&home) + .join("Library/LaunchAgents/com.spacedrive.daemon.plist"); if plist_path.exists() { // Unload the service @@ -1266,8 +1290,10 @@ async fn uninstall_daemon_service() -> Result<(), String> { #[cfg(target_os = "linux")] { - let home = std::env::var("HOME").map_err(|_| "Could not determine home directory".to_string())?; - let service_path = std::path::PathBuf::from(&home).join(".config/systemd/user/spacedrive-daemon.service"); + let home = + std::env::var("HOME").map_err(|_| "Could not determine home directory".to_string())?; + let service_path = + std::path::PathBuf::from(&home).join(".config/systemd/user/spacedrive-daemon.service"); if service_path.exists() { // Stop and disable the service diff --git a/core/Cargo.toml b/core/Cargo.toml index 209103542..adf44d036 100644 --- a/core/Cargo.toml +++ b/core/Cargo.toml @@ -167,6 +167,12 @@ once_cell = "1.20" rand = "0.8" # Random number generation for secure delete tempfile = "3.14" # Temporary directories for testing uuid = { version = "1.11", features = ["serde", "v4", "v5", "v7"] } + +# High-performance ephemeral index +memmap2 = "0.9" # Memory-mapped file support for arena storage +smallvec = "1.13" # Small vector optimization for children arrays +parking_lot = "0.12" # Fast mutex for name cache +num_cpus = "1.16" # CPU count for parallel walker whoami = "1.5" # Secure storage diff --git a/core/src/context.rs b/core/src/context.rs index 4c38be5d8..e19f1915d 100644 --- a/core/src/context.rs +++ b/core/src/context.rs @@ -3,9 +3,9 @@ use crate::{ config::JobLoggingConfig, crypto::key_manager::KeyManager, device::DeviceManager, infra::action::manager::ActionManager, infra::event::EventBus, infra::sync::TransactionManager, - library::LibraryManager, service::network::NetworkingService, - service::session::SessionStateService, service::sidecar_manager::SidecarManager, - volume::VolumeManager, + library::LibraryManager, ops::indexing::ephemeral::EphemeralIndexCache, + service::network::NetworkingService, service::session::SessionStateService, + service::sidecar_manager::SidecarManager, volume::VolumeManager, }; use std::{path::PathBuf, sync::Arc}; use tokio::sync::{Mutex, RwLock}; @@ -22,6 +22,8 @@ pub struct CoreContext { pub action_manager: Arc>>>, pub networking: Arc>>>, pub plugin_manager: Arc>>>>, + // Ephemeral index cache for unmanaged paths + pub ephemeral_index_cache: Arc, // Job logging configuration pub job_logging_config: Option, pub job_logs_dir: Option, @@ -47,11 +49,17 @@ impl CoreContext { action_manager: Arc::new(RwLock::new(None)), networking: Arc::new(RwLock::new(None)), plugin_manager: Arc::new(RwLock::new(None)), + ephemeral_index_cache: Arc::new(EphemeralIndexCache::new()), job_logging_config: None, job_logs_dir: None, } } + /// Get the ephemeral index cache + pub fn 
ephemeral_cache(&self) -> &Arc { + &self.ephemeral_index_cache + } + /// Get the library manager pub async fn libraries(&self) -> Arc { self.library_manager.read().await.clone().unwrap() diff --git a/core/src/crypto/cloud_credentials.rs b/core/src/crypto/cloud_credentials.rs index 97c5cc41e..86afc24ab 100644 --- a/core/src/crypto/cloud_credentials.rs +++ b/core/src/crypto/cloud_credentials.rs @@ -138,7 +138,8 @@ impl CloudCredentialManager { // Decrypt let library_key = self.key_manager.get_library_key(library_id).await?; - let decrypted = self.decrypt_credential(&credential_model.encrypted_credential, &library_key)?; + let decrypted = + self.decrypt_credential(&credential_model.encrypted_credential, &library_key)?; // Deserialize let credential: CloudCredential = serde_json::from_slice(&decrypted)?; diff --git a/core/src/domain/file.rs b/core/src/domain/file.rs index 6e68884b5..52cd76373 100644 --- a/core/src/domain/file.rs +++ b/core/src/domain/file.rs @@ -78,8 +78,8 @@ pub struct File { pub accessed_at: Option>, /// Additional computed fields - pub content_kind: ContentKind, // This is redundant with ContentIdentity, it lives inside - pub is_local: bool, // this is also redundant with SdPath + pub content_kind: ContentKind, // Populated by the ephemeral indexer, for when a File does not have a ContentIdentity + pub is_local: bool, // this is redundant with SdPath /// Video duration (for grid display optimization) pub duration_seconds: Option, diff --git a/core/src/filetype/registry.rs b/core/src/filetype/registry.rs index 98653c9ce..4d1477755 100644 --- a/core/src/filetype/registry.rs +++ b/core/src/filetype/registry.rs @@ -145,6 +145,35 @@ impl FileTypeRegistry { .collect() } + /// Fast identification by extension only (no file I/O) + /// + /// This is useful for quick file type detection during indexing where + /// we don't need high-confidence identification. Returns the content kind + /// based purely on extension matching. + /// + /// Returns `ContentKind::Unknown` if the extension is not recognized. 
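+	///
+	/// Illustrative call (the `Image` variant is assumed here for the example):
+	/// `registry.identify_by_extension(Path::new("photo.jpg"))` would yield
+	/// `ContentKind::Image` when exactly one registered type claims `jpg`.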
+ pub fn identify_by_extension(&self, path: &Path) -> ContentKind { + let extension = match path.extension().and_then(|s| s.to_str()) { + Some(ext) => ext, + None => return ContentKind::Unknown, + }; + + let candidates = self.get_by_extension(extension); + + match candidates.len() { + 0 => ContentKind::Unknown, + 1 => candidates[0].category, + _ => { + // Multiple matches - pick highest priority + candidates + .iter() + .max_by_key(|ft| ft.priority) + .map(|ft| ft.category) + .unwrap_or(ContentKind::Unknown) + } + } + } + /// Identify a file type from a path pub async fn identify(&self, path: &Path) -> Result { // Get extension diff --git a/core/src/infra/action/manager.rs b/core/src/infra/action/manager.rs index 097dae5ed..79fe7b826 100644 --- a/core/src/infra/action/manager.rs +++ b/core/src/infra/action/manager.rs @@ -91,9 +91,7 @@ impl ActionManager { .await?; // Validate the action first - let validation_result = action - .validate(&library, self.context.clone()) - .await?; + let validation_result = action.validate(&library, self.context.clone()).await?; // Check if confirmation is required match validation_result { diff --git a/core/src/infra/action/mod.rs b/core/src/infra/action/mod.rs index 65c1da6c3..fa95d02fb 100644 --- a/core/src/infra/action/mod.rs +++ b/core/src/infra/action/mod.rs @@ -58,14 +58,18 @@ pub trait CoreAction: Send + Sync + 'static { fn validate( &self, _context: std::sync::Arc, - ) -> impl std::future::Future> + Send - { + ) -> impl std::future::Future< + Output = Result, + > + Send { async { Ok(ValidationResult::Success) } } /// Resolve a user confirmation choice (optional) /// Called when the action previously returned RequiresConfirmation - fn resolve_confirmation(&mut self, _choice_index: usize) -> Result<(), crate::infra::action::error::ActionError> { + fn resolve_confirmation( + &mut self, + _choice_index: usize, + ) -> Result<(), crate::infra::action::error::ActionError> { Ok(()) } @@ -102,14 +106,18 @@ pub trait LibraryAction: Send + Sync + 'static { &self, _library: &std::sync::Arc, _context: std::sync::Arc, - ) -> impl std::future::Future> + Send - { + ) -> impl std::future::Future< + Output = Result, + > + Send { async { Ok(ValidationResult::Success) } } /// Resolve a user confirmation choice (optional) /// Called when the action previously returned RequiresConfirmation - fn resolve_confirmation(&mut self, _choice_index: usize) -> Result<(), crate::infra::action::error::ActionError> { + fn resolve_confirmation( + &mut self, + _choice_index: usize, + ) -> Result<(), crate::infra::action::error::ActionError> { Ok(()) } diff --git a/core/src/infra/sync/event_log/aggregator.rs b/core/src/infra/sync/event_log/aggregator.rs index d0572daaf..35e236727 100644 --- a/core/src/infra/sync/event_log/aggregator.rs +++ b/core/src/infra/sync/event_log/aggregator.rs @@ -94,16 +94,13 @@ impl BatchAggregator { } /// Add records to the batch - pub async fn add_records( - &self, - model_type: String, - count: u64, - peer_id: Option, - ) { + pub async fn add_records(&self, model_type: String, count: u64, peer_id: Option) { let key = BatchKey { peer_id }; let mut batches = self.pending_batches.write().await; - let batch = batches.entry(key.clone()).or_insert_with(|| PendingBatch::new(peer_id)); + let batch = batches + .entry(key.clone()) + .or_insert_with(|| PendingBatch::new(peer_id)); batch.add(model_type, count); @@ -195,7 +192,8 @@ impl BatchAggregator { let keys_to_flush: Vec = batches .iter() .filter(|(_, batch)| { - now.signed_duration_since(batch.started_at) >= 
chrono::Duration::from_std(self.config.flush_interval).unwrap() + now.signed_duration_since(batch.started_at) + >= chrono::Duration::from_std(self.config.flush_interval).unwrap() }) .map(|(k, _)| k.clone()) .collect(); diff --git a/core/src/infra/sync/event_log/logger.rs b/core/src/infra/sync/event_log/logger.rs index b6d1368e0..31c5e1986 100644 --- a/core/src/infra/sync/event_log/logger.rs +++ b/core/src/infra/sync/event_log/logger.rs @@ -25,11 +25,7 @@ pub struct SyncEventLogger { impl SyncEventLogger { /// Create a new event logger - pub fn new( - library_id: Uuid, - device_id: Uuid, - conn: Arc, - ) -> Self { + pub fn new(library_id: Uuid, device_id: Uuid, conn: Arc) -> Self { Self { library_id, device_id, @@ -55,10 +51,7 @@ impl SyncEventLogger { .map(|d| serde_json::to_string(d)) .transpose()?; - let model_types_str = event - .model_types - .as_ref() - .map(|types| types.join(",")); + let model_types_str = event.model_types.as_ref().map(|types| types.join(",")); self.conn .execute(Statement::from_sql_and_values( @@ -161,8 +154,7 @@ impl SyncEventLogger { where_clause, limit, offset ); - let param_values: Vec = - params.into_iter().map(|p| p.into()).collect(); + let param_values: Vec = params.into_iter().map(|p| p.into()).collect(); let stmt = Statement::from_sql_and_values(DbBackend::Sqlite, &sql, param_values); @@ -194,8 +186,7 @@ impl SyncEventLogger { Ok(SyncEventLog { id: Some(id), - timestamp: DateTime::parse_from_rfc3339(×tamp_str)? - .with_timezone(&Utc), + timestamp: DateTime::parse_from_rfc3339(×tamp_str)?.with_timezone(&Utc), device_id: Uuid::parse_str(&device_id_str)?, event_type: SyncEventType::from_str(&event_type_str) .ok_or_else(|| anyhow::anyhow!("Invalid event type: {}", event_type_str))?, @@ -213,8 +204,7 @@ impl SyncEventLogger { peer_device_id: peer_device_id_str .as_ref() .and_then(|s| Uuid::parse_str(s).ok()), - model_types: model_types_str - .map(|s| s.split(',').map(|t| t.to_string()).collect()), + model_types: model_types_str.map(|s| s.split(',').map(|t| t.to_string()).collect()), record_count: record_count.map(|c| c as u64), duration_ms: duration_ms.map(|d| d as u64), }) diff --git a/core/src/infra/sync/event_log/query.rs b/core/src/infra/sync/event_log/query.rs index 059076df1..c5baee3ea 100644 --- a/core/src/infra/sync/event_log/query.rs +++ b/core/src/infra/sync/event_log/query.rs @@ -182,8 +182,7 @@ impl QueryBuilder { } pub fn add_model_type_filter(&mut self, model_type: &str) { - self.where_clauses - .push("model_types LIKE ?".to_string()); + self.where_clauses.push("model_types LIKE ?".to_string()); self.params.push(format!("%{}%", model_type)); } diff --git a/core/src/infra/sync/event_log/types.rs b/core/src/infra/sync/event_log/types.rs index 52307d4fc..0ef7f6ece 100644 --- a/core/src/infra/sync/event_log/types.rs +++ b/core/src/infra/sync/event_log/types.rs @@ -33,11 +33,7 @@ pub struct SyncEventLog { impl SyncEventLog { /// Create a new event with common fields pre-filled - pub fn new( - device_id: Uuid, - event_type: SyncEventType, - summary: impl Into, - ) -> Self { + pub fn new(device_id: Uuid, event_type: SyncEventType, summary: impl Into) -> Self { let (category, severity) = event_type.default_category_and_severity(); Self { diff --git a/core/src/lib.rs b/core/src/lib.rs index c36b4faa6..fe931c07a 100644 --- a/core/src/lib.rs +++ b/core/src/lib.rs @@ -110,7 +110,11 @@ impl Core { )?); // Initialize device manager - let device = Arc::new(DeviceManager::init(&data_dir, key_manager.clone(), system_device_name)?); + let device = 
Arc::new(DeviceManager::init( + &data_dir, + key_manager.clone(), + system_device_name, + )?); // Set a global device ID and slug for convenience crate::device::set_current_device_id(device.device_id()?); diff --git a/core/src/library/manager.rs b/core/src/library/manager.rs index b2436b59f..b9faab7d7 100644 --- a/core/src/library/manager.rs +++ b/core/src/library/manager.rs @@ -1245,7 +1245,10 @@ impl LibraryManager { .await { Ok((location_id, _)) => { - info!("Created default location '{}' at {:?} ({})", name, path, location_id); + info!( + "Created default location '{}' at {:?} ({})", + name, path, location_id + ); } Err(e) => { warn!("Failed to create default location '{}': {}", name, e); diff --git a/core/src/ops/core/ephemeral_status/mod.rs b/core/src/ops/core/ephemeral_status/mod.rs new file mode 100644 index 000000000..e17a7c4b2 --- /dev/null +++ b/core/src/ops/core/ephemeral_status/mod.rs @@ -0,0 +1,9 @@ +//! Ephemeral index cache status query +//! +//! Provides debugging information about the ephemeral index cache. + +pub mod output; +pub mod query; + +pub use output::*; +pub use query::*; diff --git a/core/src/ops/core/ephemeral_status/output.rs b/core/src/ops/core/ephemeral_status/output.rs new file mode 100644 index 000000000..789652b63 --- /dev/null +++ b/core/src/ops/core/ephemeral_status/output.rs @@ -0,0 +1,58 @@ +//! Ephemeral index cache status output types + +use serde::{Deserialize, Serialize}; +use specta::Type; +use std::path::PathBuf; + +/// Status of the entire ephemeral index cache +#[derive(Debug, Clone, Serialize, Deserialize, Type)] +pub struct EphemeralCacheStatus { + /// Total number of cached indexes + pub total_indexes: usize, + /// Number of indexes currently being populated + pub indexing_in_progress: usize, + /// Number of stale indexes (past TTL) + pub stale_count: usize, + /// Details for each cached index + pub indexes: Vec, +} + +/// Information about a single ephemeral index +#[derive(Debug, Clone, Serialize, Deserialize, Type)] +pub struct EphemeralIndexInfo { + /// Root path this index covers + pub root_path: PathBuf, + /// Whether indexing is currently in progress + pub indexing_in_progress: bool, + /// Total entries in the arena + pub total_entries: usize, + /// Number of entries indexed by path + pub path_index_count: usize, + /// Number of unique interned names + pub unique_names: usize, + /// Number of interned strings in cache + pub interned_strings: usize, + /// Number of content kinds stored + pub content_kinds: usize, + /// Estimated memory usage in bytes + pub memory_bytes: usize, + /// Age of the index in seconds + pub age_seconds: f64, + /// Seconds since last access + pub idle_seconds: f64, + /// Indexer job statistics (files/dirs/bytes counted) + pub job_stats: JobStats, +} + +/// Statistics from the indexer job +#[derive(Debug, Clone, Serialize, Deserialize, Type)] +pub struct JobStats { + /// Number of files indexed + pub files: u64, + /// Number of directories indexed + pub dirs: u64, + /// Number of symlinks indexed + pub symlinks: u64, + /// Total bytes indexed + pub bytes: u64, +} diff --git a/core/src/ops/core/ephemeral_status/query.rs b/core/src/ops/core/ephemeral_status/query.rs new file mode 100644 index 000000000..df03245a1 --- /dev/null +++ b/core/src/ops/core/ephemeral_status/query.rs @@ -0,0 +1,100 @@ +//! Ephemeral index cache status query +//! +//! Provides a snapshot of all cached ephemeral indexes for debugging. 
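+//!
+//! Illustrative usage, a sketch assuming a `CoreContext` and session handle
+//! are already available (see `from_input`/`execute` below):
+//!
+//! ```ignore
+//! let query = EphemeralCacheStatusQuery::from_input(EphemeralCacheStatusInput {
+//!     path_filter: None,
+//! })?;
+//! let status = query.execute(context, session).await?;
+//! println!("{} cached ephemeral indexes", status.total_indexes);
+//! ```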
+ +use super::output::*; +use crate::{ + context::CoreContext, + infra::query::{CoreQuery, QueryResult}, +}; +use serde::{Deserialize, Serialize}; +use specta::Type; +use std::sync::Arc; + +/// Input for the ephemeral cache status query +#[derive(Debug, Clone, Serialize, Deserialize, Type, Default)] +pub struct EphemeralCacheStatusInput { + /// Optional: only include indexes for paths containing this substring + #[serde(default)] + pub path_filter: Option, +} + +#[derive(Debug, Clone, Serialize, Deserialize, Type)] +pub struct EphemeralCacheStatusQuery { + input: EphemeralCacheStatusInput, +} + +impl CoreQuery for EphemeralCacheStatusQuery { + type Input = EphemeralCacheStatusInput; + type Output = EphemeralCacheStatus; + + fn from_input(input: Self::Input) -> QueryResult { + Ok(Self { input }) + } + + async fn execute( + self, + context: Arc, + _session: crate::infra::api::SessionContext, + ) -> QueryResult { + let cache = context.ephemeral_cache(); + + // Get basic cache stats + let cache_stats = cache.stats(); + let cached_paths = cache.cached_paths(); + + // Gather detailed info for each index + let mut indexes = Vec::new(); + + for path in cached_paths { + // Apply path filter if provided + if let Some(ref filter) = self.input.path_filter { + if !path.to_string_lossy().contains(filter) { + continue; + } + } + + // Check if indexing is in progress + let indexing_in_progress = cache.is_indexing(&path); + + // Try to get the index to read its internal stats + if let Some(index_arc) = cache.get(&path) { + let index = index_arc.read().await; + let stats = index.get_stats(); + + let info = EphemeralIndexInfo { + root_path: index.root_path.clone(), + indexing_in_progress, + total_entries: stats.total_entries, + path_index_count: index.path_index_count(), + unique_names: stats.unique_names, + interned_strings: stats.interned_strings, + content_kinds: index.content_kinds_count(), + memory_bytes: stats.memory_bytes, + age_seconds: index.age().as_secs_f64(), + idle_seconds: index.idle_time().as_secs_f64(), + job_stats: JobStats { + files: index.stats.files, + dirs: index.stats.dirs, + symlinks: index.stats.symlinks, + bytes: index.stats.bytes, + }, + }; + + indexes.push(info); + } + } + + // Sort by root path for consistent output + indexes.sort_by(|a, b| a.root_path.cmp(&b.root_path)); + + Ok(EphemeralCacheStatus { + total_indexes: cache_stats.total_entries, + indexing_in_progress: cache_stats.indexing_count, + stale_count: cache_stats.stale_count, + indexes, + }) + } +} + +crate::register_core_query!(EphemeralCacheStatusQuery, "core.ephemeral_status"); diff --git a/core/src/ops/core/mod.rs b/core/src/ops/core/mod.rs index 0f3c70891..fe553c4d3 100644 --- a/core/src/ops/core/mod.rs +++ b/core/src/ops/core/mod.rs @@ -1,2 +1,3 @@ +pub mod ephemeral_status; pub mod events; pub mod status; diff --git a/core/src/ops/files/query/directory_listing.rs b/core/src/ops/files/query/directory_listing.rs index df39ac365..becedfdfa 100644 --- a/core/src/ops/files/query/directory_listing.rs +++ b/core/src/ops/files/query/directory_listing.rs @@ -140,7 +140,9 @@ impl LibraryQuery for DirectoryListingQuery { if let Some(should_use_ephemeral) = self.check_location_index_mode(db.conn()).await { if should_use_ephemeral { tracing::info!("Location has IndexMode::None, using ephemeral indexing"); - return self.query_ephemeral_directory_impl(context, library_id).await; + return self + .query_ephemeral_directory_impl(context, library_id) + .await; } } @@ -615,39 +617,163 @@ impl DirectoryListingQuery { }) } - /// Query 
ephemeral directory (not indexed) - trigger on-demand indexing + /// Query ephemeral directory (not indexed) - check cache first, then trigger on-demand indexing async fn query_ephemeral_directory_impl( &self, context: Arc, library_id: Uuid, ) -> QueryResult { - use crate::ops::indexing::{IndexMode, IndexScope, IndexerJob, IndexerJobConfig}; + use crate::domain::file::File; + use crate::ops::indexing::{IndexScope, IndexerJob, IndexerJobConfig}; + + // Get the local path for cache lookup + let local_path = match &self.input.path { + SdPath::Physical { path, .. } => path.clone(), + _ => { + tracing::warn!( + "Ephemeral indexing only supported for physical paths: {:?}", + self.input.path + ); + return Ok(DirectoryListingOutput { + files: Vec::new(), + total_count: 0, + has_more: false, + }); + } + }; + + let cache = context.ephemeral_cache(); + + // Check if we have a cached index that covers this path + if let Some(index) = cache.get_for_path(&local_path) { + tracing::info!( + "Found cached ephemeral index for path: {}", + local_path.display() + ); + + // Try to get directory listing from cached index + let index_guard = index.read().await; + + // Check if the index actually has entries for this directory + if let Some(children) = index_guard.list_directory(&local_path) { + tracing::debug!( + "Cached index has {} children for {}", + children.len(), + local_path.display() + ); + + // Convert cached entries to File objects + let mut files = Vec::new(); + for child_path in children { + if let Some(metadata) = index_guard.get_entry_ref(&child_path) { + // Apply hidden file filter + if !self.input.include_hidden.unwrap_or(false) && metadata.is_hidden { + continue; + } + + // Get UUID from index + let entry_uuid = index_guard + .get_entry_uuid(&child_path) + .unwrap_or_else(Uuid::new_v4); + + // Build SdPath for this entry + let entry_sd_path = SdPath::Physical { + device_slug: match &self.input.path { + SdPath::Physical { device_slug, .. 
} => device_slug.clone(), + _ => String::new(), + }, + path: child_path.clone(), + }; + + // Get content kind from index (identified by extension) + let content_kind = index_guard.get_content_kind(&child_path); + + // Convert to File + let mut file = File::from_ephemeral(entry_uuid, &metadata, entry_sd_path); + file.content_kind = content_kind; + files.push(file); + } + } + + // Apply sorting + self.sort_files(&mut files); + + // Apply limit + let total_count = files.len() as u32; + let has_more = if let Some(limit) = self.input.limit { + if files.len() > limit as usize { + files.truncate(limit as usize); + true + } else { + false + } + } else { + false + }; + + return Ok(DirectoryListingOutput { + files, + total_count, + has_more, + }); + } + + // Index exists but doesn't have this directory yet + // Fall through to spawn indexer job + tracing::debug!( + "Cached index doesn't contain directory: {}", + local_path.display() + ); + } + + // No cached index or index doesn't cover this path + // Check if indexing is already in progress + if cache.is_indexing(&local_path) { + tracing::info!("Indexing already in progress for: {}", local_path.display()); + // Return empty, UI will get updates via events + return Ok(DirectoryListingOutput { + files: Vec::new(), + total_count: 0, + has_more: false, + }); + } tracing::info!( - "Path not indexed, triggering ephemeral indexing for: {:?}", + "No cached index, triggering ephemeral indexing for: {:?}", self.input.path ); // Get library to dispatch indexer job if let Some(library) = context.get_library(library_id).await { + // Create cache entry and get the index to share with the job + let ephemeral_index = cache.create_for_indexing(local_path.clone()); + // Create ephemeral indexer job for this directory (shallow, current scope only) let config = IndexerJobConfig::ephemeral_browse( self.input.path.clone(), IndexScope::Current, // Only current directory, not recursive ); - let indexer_job = IndexerJob::new(config); + let mut indexer_job = IndexerJob::new(config); - // Dispatch job asynchronously (fire and forget) + // Share the cached index with the job + indexer_job.set_ephemeral_index(ephemeral_index); + + // Dispatch job asynchronously // The job will emit ResourceChanged events as files are discovered - if let Err(e) = library.jobs().dispatch(indexer_job).await { - tracing::warn!( - "Failed to dispatch ephemeral indexer for {:?}: {}", - self.input.path, - e - ); - } else { - tracing::info!("Dispatched ephemeral indexer for {:?}", self.input.path); + match library.jobs().dispatch(indexer_job).await { + Ok(_) => { + tracing::info!("Dispatched ephemeral indexer for {:?}", self.input.path); + } + Err(e) => { + tracing::warn!( + "Failed to dispatch ephemeral indexer for {:?}: {}", + self.input.path, + e + ); + // Mark indexing as not in progress since job failed + cache.mark_indexing_complete(&local_path); + } } } @@ -659,6 +785,42 @@ impl DirectoryListingQuery { has_more: false, }) } + + /// Sort files according to the input options + fn sort_files(&self, files: &mut Vec) { + use crate::domain::file::EntryKind; + + let folders_first = self.input.folders_first.unwrap_or(false); + + files.sort_by(|a, b| { + // Folders first if enabled + if folders_first { + let a_is_dir = matches!(a.kind, EntryKind::Directory); + let b_is_dir = matches!(b.kind, EntryKind::Directory); + if a_is_dir != b_is_dir { + return b_is_dir.cmp(&a_is_dir); // Directories first + } + } + + // Then apply sort order + match self.input.sort_by { + DirectorySortBy::Name => 
a.name.to_lowercase().cmp(&b.name.to_lowercase()), + DirectorySortBy::Modified => b.modified_at.cmp(&a.modified_at), + DirectorySortBy::Size => b.size.cmp(&a.size), + DirectorySortBy::Type => { + // Sort by kind (directories first), then name + if !folders_first { + let a_is_dir = matches!(a.kind, EntryKind::Directory); + let b_is_dir = matches!(b.kind, EntryKind::Directory); + if a_is_dir != b_is_dir { + return b_is_dir.cmp(&a_is_dir); + } + } + a.name.to_lowercase().cmp(&b.name.to_lowercase()) + } + } + }); + } } impl DirectoryListingQuery { @@ -677,7 +839,9 @@ impl DirectoryListingQuery { for loc in locations { // Get the location's root path if let Some(entry_id) = loc.entry_id { - if let Ok(Some(dir_path)) = directory_paths::Entity::find_by_id(entry_id).one(db).await { + if let Ok(Some(dir_path)) = + directory_paths::Entity::find_by_id(entry_id).one(db).await + { // Check if this location's path is a parent of the requested path if path_str.starts_with(&dir_path.path) { // Check if index_mode is "none" diff --git a/core/src/ops/indexing/entry.rs b/core/src/ops/indexing/entry.rs index f0b5ac328..592f25885 100644 --- a/core/src/ops/indexing/entry.rs +++ b/core/src/ops/indexing/entry.rs @@ -5,12 +5,12 @@ use super::path_resolver::PathResolver; use super::state::{DirEntry, EntryKind, IndexerState}; use crate::infra::job::prelude::{JobContext, JobError}; use crate::{ - filetype::FileTypeRegistry, - infra::db::entities::{self, directory_paths, entry_closure}, + filetype::FileTypeRegistry, + infra::db::entities::{self, directory_paths, entry_closure}, }; use sea_orm::{ - ActiveModelTrait, ActiveValue::Set, ColumnTrait, ConnectionTrait, DatabaseTransaction, - DbBackend, EntityTrait, IntoActiveModel, QueryFilter, QuerySelect, Statement, TransactionTrait, + ActiveModelTrait, ActiveValue::Set, ColumnTrait, ConnectionTrait, DatabaseTransaction, + DbBackend, EntityTrait, IntoActiveModel, QueryFilter, QuerySelect, Statement, TransactionTrait, }; use std::path::{Path, PathBuf}; use uuid::Uuid; @@ -18,47 +18,47 @@ use uuid::Uuid; /// Normalize cloud directory path for consistent lookups /// Cloud paths stored with trailing slashes don't match PathBuf::parent() results fn normalize_cloud_dir_path(path: &Path) -> PathBuf { - let path_str = path.to_string_lossy(); - if path_str.contains("://") && path_str.ends_with('/') { - PathBuf::from(path_str.trim_end_matches('/')) - } else { - path.to_path_buf() - } + let path_str = path.to_string_lossy(); + if path_str.contains("://") && path_str.ends_with('/') { + PathBuf::from(path_str.trim_end_matches('/')) + } else { + path.to_path_buf() + } } /// Metadata about a file system entry #[derive(Debug, Clone)] pub struct EntryMetadata { - pub path: PathBuf, - pub kind: EntryKind, - pub size: u64, - pub modified: Option, - pub accessed: Option, - pub created: Option, - pub inode: Option, - pub permissions: Option, - pub is_hidden: bool, + pub path: PathBuf, + pub kind: EntryKind, + pub size: u64, + pub modified: Option, + pub accessed: Option, + pub created: Option, + pub inode: Option, + pub permissions: Option, + pub is_hidden: bool, } impl From for EntryMetadata { - fn from(entry: DirEntry) -> Self { - Self { - path: entry.path.clone(), - kind: entry.kind, - size: entry.size, - modified: entry.modified, - accessed: None, - created: None, - inode: entry.inode, - permissions: None, - is_hidden: entry - .path - .file_name() - .and_then(|n| n.to_str()) - .map(|n| n.starts_with('.')) - .unwrap_or(false), - } - } + fn from(entry: DirEntry) -> Self { + Self { + path: 
entry.path.clone(), + kind: entry.kind, + size: entry.size, + modified: entry.modified, + accessed: None, + created: None, + inode: entry.inode, + permissions: None, + is_hidden: entry + .path + .file_name() + .and_then(|n| n.to_str()) + .map(|n| n.starts_with('.')) + .unwrap_or(false), + } + } } /// Handles entry creation and updates in the database @@ -66,537 +66,540 @@ pub struct EntryProcessor; /// Result of content identity linking (for batch sync) pub struct ContentLinkResult { - pub content_identity: entities::content_identity::Model, - pub entry: entities::entry::Model, - pub is_new_content: bool, + pub content_identity: entities::content_identity::Model, + pub entry: entities::entry::Model, + pub is_new_content: bool, } impl EntryProcessor { - /// Get platform-specific inode - #[cfg(unix)] - pub fn get_inode(metadata: &std::fs::Metadata) -> Option { - use std::os::unix::fs::MetadataExt; - Some(metadata.ino()) - } + /// Get platform-specific inode + #[cfg(unix)] + pub fn get_inode(metadata: &std::fs::Metadata) -> Option { + use std::os::unix::fs::MetadataExt; + Some(metadata.ino()) + } - #[cfg(windows)] - pub fn get_inode(_metadata: &std::fs::Metadata) -> Option { - // Windows doesn't have inodes. - // The method `file_index()` from `std::os::windows::fs::MetadataExt` is unstable (issue #63010). - // Returning None is safe as the field is Optional. - None - } + #[cfg(windows)] + pub fn get_inode(_metadata: &std::fs::Metadata) -> Option { + // Windows doesn't have inodes. + // The method `file_index()` from `std::os::windows::fs::MetadataExt` is unstable (issue #63010). + // Returning None is safe as the field is Optional. + None + } - #[cfg(not(any(unix, windows)))] - pub fn get_inode(_metadata: &std::fs::Metadata) -> Option { - None - } + #[cfg(not(any(unix, windows)))] + pub fn get_inode(_metadata: &std::fs::Metadata) -> Option { + None + } - /// Extract detailed metadata from a path - /// - /// Uses the provided VolumeBackend if available, otherwise falls back to direct filesystem access. - pub async fn extract_metadata( - path: &Path, - backend: Option<&std::sync::Arc>, - ) -> Result { - // Use backend if available, otherwise fall back to local filesystem - if let Some(backend) = backend { - let raw = backend - .metadata(path) - .await - .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?; + /// Extract detailed metadata from a path + /// + /// Uses the provided VolumeBackend if available, otherwise falls back to direct filesystem access. 
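+	///
+	/// A minimal usage sketch (illustrative only; the path is hypothetical and
+	/// passing `None` exercises the direct-filesystem fallback):
+	///
+	/// ```ignore
+	/// let meta = EntryProcessor::extract_metadata(Path::new("/tmp/example.txt"), None).await?;
+	/// assert_eq!(meta.kind, EntryKind::File);
+	/// ```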
+ pub async fn extract_metadata( + path: &Path, + backend: Option<&std::sync::Arc>, + ) -> Result { + // Use backend if available, otherwise fall back to local filesystem + if let Some(backend) = backend { + let raw = backend + .metadata(path) + .await + .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?; - Ok(EntryMetadata { - path: path.to_path_buf(), - kind: raw.kind, - size: raw.size, - modified: raw.modified, - accessed: raw.accessed, - created: raw.created, - inode: raw.inode, - permissions: raw.permissions, - is_hidden: path - .file_name() - .and_then(|n| n.to_str()) - .map(|n| n.starts_with('.')) - .unwrap_or(false), - }) - } else { - // Fallback to direct filesystem access - let metadata = tokio::fs::symlink_metadata(path).await?; + Ok(EntryMetadata { + path: path.to_path_buf(), + kind: raw.kind, + size: raw.size, + modified: raw.modified, + accessed: raw.accessed, + created: raw.created, + inode: raw.inode, + permissions: raw.permissions, + is_hidden: path + .file_name() + .and_then(|n| n.to_str()) + .map(|n| n.starts_with('.')) + .unwrap_or(false), + }) + } else { + // Fallback to direct filesystem access + let metadata = tokio::fs::symlink_metadata(path).await?; - let kind = if metadata.is_dir() { - EntryKind::Directory - } else if metadata.is_symlink() { - EntryKind::Symlink - } else { - EntryKind::File - }; + let kind = if metadata.is_dir() { + EntryKind::Directory + } else if metadata.is_symlink() { + EntryKind::Symlink + } else { + EntryKind::File + }; - let inode = Self::get_inode(&metadata); + let inode = Self::get_inode(&metadata); - #[cfg(unix)] - let permissions = { - use std::os::unix::fs::MetadataExt; - Some(metadata.mode()) - }; + #[cfg(unix)] + let permissions = { + use std::os::unix::fs::MetadataExt; + Some(metadata.mode()) + }; - #[cfg(not(unix))] - let permissions = None; + #[cfg(not(unix))] + let permissions = None; - Ok(EntryMetadata { - path: path.to_path_buf(), - kind, - size: metadata.len(), - modified: metadata.modified().ok(), - accessed: metadata.accessed().ok(), - created: metadata.created().ok(), - inode, - permissions, - is_hidden: path - .file_name() - .and_then(|n| n.to_str()) - .map(|n| n.starts_with('.')) - .unwrap_or(false), - }) - } - } + Ok(EntryMetadata { + path: path.to_path_buf(), + kind, + size: metadata.len(), + modified: metadata.modified().ok(), + accessed: metadata.accessed().ok(), + created: metadata.created().ok(), + inode, + permissions, + is_hidden: path + .file_name() + .and_then(|n| n.to_str()) + .map(|n| n.starts_with('.')) + .unwrap_or(false), + }) + } + } - /// Create an entry record in the database using a provided connection/transaction - /// and collect related rows for bulk insertion by the caller. - pub async fn create_entry_in_conn( - state: &mut IndexerState, - ctx: &impl IndexingCtx, - entry: &DirEntry, - device_id: i32, - location_root_path: &Path, - conn: &C, - out_self_closures: &mut Vec, - out_dir_paths: &mut Vec, - ) -> Result { - // Extract file extension (without dot) for files, None for directories - let extension = match entry.kind { - EntryKind::File => entry - .path - .extension() - .and_then(|ext| ext.to_str()) - .map(|ext| ext.to_lowercase()), - EntryKind::Directory | EntryKind::Symlink => None, - }; + /// Create an entry record in the database using a provided connection/transaction + /// and collect related rows for bulk insertion by the caller. 
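+	///
+	/// The `out_self_closures` and `out_dir_paths` vectors let the caller defer
+	/// the closure-table self rows and `directory_paths` rows, then bulk-insert
+	/// them once per batch. A hedged sketch of the calling pattern (mirroring
+	/// `create_entry` below; empty-vec guards elided):
+	///
+	/// ```ignore
+	/// let mut self_closures = Vec::new();
+	/// let mut dir_paths = Vec::new();
+	/// let model = EntryProcessor::create_entry_in_conn(
+	///     state, ctx, &entry, device_id, root, &txn,
+	///     &mut self_closures, &mut dir_paths,
+	/// ).await?;
+	/// entry_closure::Entity::insert_many(self_closures).exec(&txn).await?;
+	/// directory_paths::Entity::insert_many(dir_paths).exec(&txn).await?;
+	/// ```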
+ pub async fn create_entry_in_conn( + state: &mut IndexerState, + ctx: &impl IndexingCtx, + entry: &DirEntry, + device_id: i32, + location_root_path: &Path, + conn: &C, + out_self_closures: &mut Vec, + out_dir_paths: &mut Vec, + ) -> Result { + // Extract file extension (without dot) for files, None for directories + let extension = match entry.kind { + EntryKind::File => entry + .path + .extension() + .and_then(|ext| ext.to_str()) + .map(|ext| ext.to_lowercase()), + EntryKind::Directory | EntryKind::Symlink => None, + }; - // Get file/directory name - // For files: use stem (name without extension) - // For directories: use full name (including .app, etc.) - let name = match entry.kind { - EntryKind::File => { - // For files, use stem (without extension) - entry - .path - .file_stem() - .map(|stem| stem.to_string_lossy().to_string()) - .unwrap_or_else(|| { - entry - .path - .file_name() - .map(|n| n.to_string_lossy().to_string()) - .unwrap_or_else(|| "unknown".to_string()) - }) - } - EntryKind::Directory | EntryKind::Symlink => { - // For directories and symlinks, use full name - entry - .path - .file_name() - .map(|n| n.to_string_lossy().to_string()) - .unwrap_or_else(|| "unknown".to_string()) - } - }; + // Get file/directory name + // For files: use stem (name without extension) + // For directories: use full name (including .app, etc.) + let name = match entry.kind { + EntryKind::File => { + // For files, use stem (without extension) + entry + .path + .file_stem() + .map(|stem| stem.to_string_lossy().to_string()) + .unwrap_or_else(|| { + entry + .path + .file_name() + .map(|n| n.to_string_lossy().to_string()) + .unwrap_or_else(|| "unknown".to_string()) + }) + } + EntryKind::Directory | EntryKind::Symlink => { + // For directories and symlinks, use full name + entry + .path + .file_name() + .map(|n| n.to_string_lossy().to_string()) + .unwrap_or_else(|| "unknown".to_string()) + } + }; - // Convert timestamps - let modified_at = entry - .modified - .and_then(|t| { - chrono::DateTime::from_timestamp( - t.duration_since(std::time::UNIX_EPOCH).ok()?.as_secs() as i64, - 0, - ) - }) - .unwrap_or_else(|| chrono::Utc::now()); + // Convert timestamps + let modified_at = entry + .modified + .and_then(|t| { + chrono::DateTime::from_timestamp( + t.duration_since(std::time::UNIX_EPOCH).ok()?.as_secs() as i64, + 0, + ) + }) + .unwrap_or_else(|| chrono::Utc::now()); - // All entries get UUIDs immediately for UI normalized caching compatibility. - // Sync readiness is now determined by content_id presence (for regular files) - // or by entry kind (for directories/empty files). - let entry_uuid = Some(Uuid::new_v4()); + // All entries get UUIDs immediately for UI normalized caching compatibility. + // Sync readiness is now determined by content_id presence (for regular files) + // or by entry kind (for directories/empty files). 
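+		// (Content identities, by contrast, get deterministic v5 UUIDs derived
+		// from content hash + library id in `link_to_content_identity` below, so
+		// the same bytes resolve to the same UUID across devices; entries
+		// themselves only need a random v4.)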
+ let entry_uuid = Some(Uuid::new_v4()); - // Find parent entry ID - let parent_id = if let Some(parent_path) = entry.path.parent() { - ctx.log(format!( - "Looking up parent for {}: parent_path = {}", - entry.path.display(), - parent_path.display() - )); + // Find parent entry ID + let parent_id = if let Some(parent_path) = entry.path.parent() { + ctx.log(format!( + "Looking up parent for {}: parent_path = {}", + entry.path.display(), + parent_path.display() + )); - // First check the cache - if let Some(id) = state.entry_id_cache.get(parent_path).copied() { - ctx.log(format!("Found parent in cache: id = {}", id)); - Some(id) - } else { - // If not in cache, try to find it in the database - // For cloud paths, try both with and without trailing slash - let parent_path_str = parent_path.to_string_lossy().to_string(); - let is_cloud = parent_path_str.contains("://"); + // First check the cache + if let Some(id) = state.entry_id_cache.get(parent_path).copied() { + ctx.log(format!("Found parent in cache: id = {}", id)); + Some(id) + } else { + // If not in cache, try to find it in the database + // For cloud paths, try both with and without trailing slash + let parent_path_str = parent_path.to_string_lossy().to_string(); + let is_cloud = parent_path_str.contains("://"); - let parent_variants = if is_cloud && !parent_path_str.ends_with('/') { - vec![parent_path_str.clone(), format!("{}/", parent_path_str)] - } else { - vec![parent_path_str.clone()] - }; + let parent_variants = if is_cloud && !parent_path_str.ends_with('/') { + vec![parent_path_str.clone(), format!("{}/", parent_path_str)] + } else { + vec![parent_path_str.clone()] + }; - let query = entities::directory_paths::Entity::find() - .filter(entities::directory_paths::Column::Path.is_in(parent_variants.clone())); + let query = entities::directory_paths::Entity::find() + .filter(entities::directory_paths::Column::Path.is_in(parent_variants.clone())); - if let Ok(Some(dir_path_record)) = query.one(ctx.library_db()).await { - // Found parent in database, cache it - ctx.log(format!("Found parent in database: id = {}", dir_path_record.entry_id)); - state - .entry_id_cache - .insert(parent_path.to_path_buf(), dir_path_record.entry_id); - Some(dir_path_record.entry_id) - } else { - // Parent not found - this shouldn't happen with proper sorting - ctx.log(format!( - "WARNING: Parent not found for {}: {} (tried: {:?})", - entry.path.display(), - parent_path.display(), - parent_variants - )); - None - } - } - } else { - None - }; + if let Ok(Some(dir_path_record)) = query.one(ctx.library_db()).await { + // Found parent in database, cache it + ctx.log(format!( + "Found parent in database: id = {}", + dir_path_record.entry_id + )); + state + .entry_id_cache + .insert(parent_path.to_path_buf(), dir_path_record.entry_id); + Some(dir_path_record.entry_id) + } else { + // Parent not found - this shouldn't happen with proper sorting + ctx.log(format!( + "WARNING: Parent not found for {}: {} (tried: {:?})", + entry.path.display(), + parent_path.display(), + parent_variants + )); + None + } + } + } else { + None + }; - // Create entry - let now = chrono::Utc::now(); - tracing::debug!( - "Creating entry: name={}, path={}, inode={:?}, parent_id={:?}", - name, - entry.path.display(), - entry.inode, - parent_id - ); - let new_entry = entities::entry::ActiveModel { - uuid: Set(entry_uuid), - name: Set(name.clone()), - kind: Set(Self::entry_kind_to_int(entry.kind)), - extension: Set(extension), - metadata_id: Set(None), // User metadata only created when user 
adds metadata - content_id: Set(None), // Will be set later during content identification phase - size: Set(entry.size as i64), - aggregate_size: Set(0), // Will be calculated in aggregation phase - child_count: Set(0), // Will be calculated in aggregation phase - file_count: Set(0), // Will be calculated in aggregation phase - created_at: Set(now), - modified_at: Set(modified_at), - accessed_at: Set(None), - indexed_at: Set(Some(now)), // Record when we indexed this entry - permissions: Set(None), // TODO: Could extract from metadata - inode: Set(entry.inode.map(|i| i as i64)), - parent_id: Set(parent_id), - ..Default::default() - }; + // Create entry + let now = chrono::Utc::now(); + tracing::debug!( + "Creating entry: name={}, path={}, inode={:?}, parent_id={:?}", + name, + entry.path.display(), + entry.inode, + parent_id + ); + let new_entry = entities::entry::ActiveModel { + uuid: Set(entry_uuid), + name: Set(name.clone()), + kind: Set(Self::entry_kind_to_int(entry.kind)), + extension: Set(extension), + metadata_id: Set(None), // User metadata only created when user adds metadata + content_id: Set(None), // Will be set later during content identification phase + size: Set(entry.size as i64), + aggregate_size: Set(0), // Will be calculated in aggregation phase + child_count: Set(0), // Will be calculated in aggregation phase + file_count: Set(0), // Will be calculated in aggregation phase + created_at: Set(now), + modified_at: Set(modified_at), + accessed_at: Set(None), + indexed_at: Set(Some(now)), // Record when we indexed this entry + permissions: Set(None), // TODO: Could extract from metadata + inode: Set(entry.inode.map(|i| i as i64)), + parent_id: Set(parent_id), + ..Default::default() + }; - // Insert the entry - let result = new_entry - .insert(conn) - .await - .map_err(|e| JobError::execution(format!("Failed to create entry: {}", e)))?; + // Insert the entry + let result = new_entry + .insert(conn) + .await + .map_err(|e| JobError::execution(format!("Failed to create entry: {}", e)))?; - tracing::debug!( - "Entry inserted in DB: id={}, name={}, inode={:?}", - result.id, - result.name, - result.inode - ); + tracing::debug!( + "Entry inserted in DB: id={}, name={}, inode={:?}", + result.id, + result.name, + result.inode + ); - // Populate closure table - // First, insert self-reference - let self_closure = entry_closure::ActiveModel { - ancestor_id: Set(result.id), - descendant_id: Set(result.id), - depth: Set(0), - ..Default::default() - }; - out_self_closures.push(self_closure); + // Populate closure table + // First, insert self-reference + let self_closure = entry_closure::ActiveModel { + ancestor_id: Set(result.id), + descendant_id: Set(result.id), + depth: Set(0), + ..Default::default() + }; + out_self_closures.push(self_closure); - // If there's a parent, copy all parent's ancestors - if let Some(parent_id) = parent_id { - // Insert closure entries for all ancestors - conn.execute_unprepared(&format!( - "INSERT INTO entry_closure (ancestor_id, descendant_id, depth) \ + // If there's a parent, copy all parent's ancestors + if let Some(parent_id) = parent_id { + // Insert closure entries for all ancestors + conn.execute_unprepared(&format!( + "INSERT INTO entry_closure (ancestor_id, descendant_id, depth) \ SELECT ancestor_id, {}, depth + 1 \ FROM entry_closure \ WHERE descendant_id = {}", - result.id, parent_id - )) - .await - .map_err(|e| { - JobError::execution(format!("Failed to populate ancestor closures: {}", e)) - })?; - } + result.id, parent_id + )) + .await + 
.map_err(|e| { + JobError::execution(format!("Failed to populate ancestor closures: {}", e)) + })?; + } - // If this is a directory, populate the directory_paths table - if entry.kind == EntryKind::Directory { - // Use the absolute path from the DirEntry which contains the full filesystem path - let absolute_path = entry.path.to_string_lossy().to_string(); + // If this is a directory, populate the directory_paths table + if entry.kind == EntryKind::Directory { + // Use the absolute path from the DirEntry which contains the full filesystem path + let absolute_path = entry.path.to_string_lossy().to_string(); - // Insert into directory_paths table - let dir_path_entry = directory_paths::ActiveModel { - entry_id: Set(result.id), - path: Set(absolute_path), - ..Default::default() - }; - out_dir_paths.push(dir_path_entry); - } + // Insert into directory_paths table + let dir_path_entry = directory_paths::ActiveModel { + entry_id: Set(result.id), + path: Set(absolute_path), + ..Default::default() + }; + out_dir_paths.push(dir_path_entry); + } - // Cache the entry ID for potential children - // Normalize cloud directory paths to match what parent() returns - let cache_key = if entry.kind == EntryKind::Directory { - normalize_cloud_dir_path(&entry.path) - } else { - entry.path.clone() - }; - state.entry_id_cache.insert(cache_key, result.id); + // Cache the entry ID for potential children + // Normalize cloud directory paths to match what parent() returns + let cache_key = if entry.kind == EntryKind::Directory { + normalize_cloud_dir_path(&entry.path) + } else { + entry.path.clone() + }; + state.entry_id_cache.insert(cache_key, result.id); - Ok(result) - } + Ok(result) + } - /// Create an entry, starting and committing its own transaction (single insert) - pub async fn create_entry( - state: &mut IndexerState, - ctx: &impl IndexingCtx, - entry: &DirEntry, - device_id: i32, - location_root_path: &Path, - ) -> Result { - let txn = ctx - .library_db() - .begin() - .await - .map_err(|e| JobError::execution(format!("Failed to begin transaction: {}", e)))?; + /// Create an entry, starting and committing its own transaction (single insert) + pub async fn create_entry( + state: &mut IndexerState, + ctx: &impl IndexingCtx, + entry: &DirEntry, + device_id: i32, + location_root_path: &Path, + ) -> Result { + let txn = ctx + .library_db() + .begin() + .await + .map_err(|e| JobError::execution(format!("Failed to begin transaction: {}", e)))?; - let mut self_closures: Vec = Vec::new(); - let mut dir_paths: Vec = Vec::new(); - let result = Self::create_entry_in_conn( - state, - ctx, - entry, - device_id, - location_root_path, - &txn, - &mut self_closures, - &mut dir_paths, - ) - .await; + let mut self_closures: Vec = Vec::new(); + let mut dir_paths: Vec = Vec::new(); + let result = Self::create_entry_in_conn( + state, + ctx, + entry, + device_id, + location_root_path, + &txn, + &mut self_closures, + &mut dir_paths, + ) + .await; - let entry_model = match result { - Ok(model) => model, - Err(e) => { - let _ = txn.rollback().await; - return Err(e); - } - }; + let entry_model = match result { + Ok(model) => model, + Err(e) => { + let _ = txn.rollback().await; + return Err(e); + } + }; - if !self_closures.is_empty() { - entry_closure::Entity::insert_many(self_closures) - .exec(&txn) - .await - .map_err(|e| { - JobError::execution(format!("Failed to bulk insert self-closures: {}", e)) - })?; - } - if !dir_paths.is_empty() { - directory_paths::Entity::insert_many(dir_paths) - .exec(&txn) - .await - .map_err(|e| { - 
JobError::execution(format!("Failed to bulk insert directory paths: {}", e)) - })?; - } - txn.commit() - .await - .map_err(|e| JobError::execution(format!("Failed to commit transaction: {}", e)))?; + if !self_closures.is_empty() { + entry_closure::Entity::insert_many(self_closures) + .exec(&txn) + .await + .map_err(|e| { + JobError::execution(format!("Failed to bulk insert self-closures: {}", e)) + })?; + } + if !dir_paths.is_empty() { + directory_paths::Entity::insert_many(dir_paths) + .exec(&txn) + .await + .map_err(|e| { + JobError::execution(format!("Failed to bulk insert directory paths: {}", e)) + })?; + } + txn.commit() + .await + .map_err(|e| JobError::execution(format!("Failed to commit transaction: {}", e)))?; - // Sync entry to other devices - if let Some(library) = ctx.library() { - tracing::info!( - "ENTRY_SYNC: About to sync entry name={} uuid={:?}", - entry_model.name, - entry_model.uuid - ); - if let Err(e) = library - .sync_model_with_db( - &entry_model, - crate::infra::sync::ChangeType::Insert, - ctx.library_db(), - ) - .await - { - tracing::warn!( - "ENTRY_SYNC: Failed to sync entry {}: {}", - entry_model - .uuid - .map(|u| u.to_string()) - .unwrap_or_else(|| "no-uuid".to_string()), - e - ); - } else { - tracing::info!( - "ENTRY_SYNC: Successfully synced entry name={} uuid={:?}", - entry_model.name, - entry_model.uuid - ); - } - } + // Sync entry to other devices + if let Some(library) = ctx.library() { + tracing::info!( + "ENTRY_SYNC: About to sync entry name={} uuid={:?}", + entry_model.name, + entry_model.uuid + ); + if let Err(e) = library + .sync_model_with_db( + &entry_model, + crate::infra::sync::ChangeType::Insert, + ctx.library_db(), + ) + .await + { + tracing::warn!( + "ENTRY_SYNC: Failed to sync entry {}: {}", + entry_model + .uuid + .map(|u| u.to_string()) + .unwrap_or_else(|| "no-uuid".to_string()), + e + ); + } else { + tracing::info!( + "ENTRY_SYNC: Successfully synced entry name={} uuid={:?}", + entry_model.name, + entry_model.uuid + ); + } + } - Ok(entry_model.id) - } + Ok(entry_model.id) + } - /// Update an existing entry - pub async fn update_entry( - ctx: &impl IndexingCtx, - entry_id: i32, - entry: &DirEntry, - ) -> Result<(), JobError> { - let db_entry = entities::entry::Entity::find_by_id(entry_id) - .one(ctx.library_db()) - .await - .map_err(|e| JobError::execution(format!("Failed to find entry: {}", e)))? - .ok_or_else(|| JobError::execution("Entry not found for update".to_string()))?; + /// Update an existing entry + pub async fn update_entry( + ctx: &impl IndexingCtx, + entry_id: i32, + entry: &DirEntry, + ) -> Result<(), JobError> { + let db_entry = entities::entry::Entity::find_by_id(entry_id) + .one(ctx.library_db()) + .await + .map_err(|e| JobError::execution(format!("Failed to find entry: {}", e)))? 
+ .ok_or_else(|| JobError::execution("Entry not found for update".to_string()))?; - let mut entry_active: entities::entry::ActiveModel = db_entry.into(); + let mut entry_active: entities::entry::ActiveModel = db_entry.into(); - // Update modifiable fields - entry_active.size = Set(entry.size as i64); - if let Some(modified) = entry.modified { - if let Some(timestamp) = chrono::DateTime::from_timestamp( - modified - .duration_since(std::time::UNIX_EPOCH) - .ok() - .map(|d| d.as_secs() as i64) - .unwrap_or(0), - 0, - ) { - entry_active.modified_at = Set(timestamp); - } - } + // Update modifiable fields + entry_active.size = Set(entry.size as i64); + if let Some(modified) = entry.modified { + if let Some(timestamp) = chrono::DateTime::from_timestamp( + modified + .duration_since(std::time::UNIX_EPOCH) + .ok() + .map(|d| d.as_secs() as i64) + .unwrap_or(0), + 0, + ) { + entry_active.modified_at = Set(timestamp); + } + } - if let Some(inode) = entry.inode { - entry_active.inode = Set(Some(inode as i64)); - } + if let Some(inode) = entry.inode { + entry_active.inode = Set(Some(inode as i64)); + } - // TODO: Rename indexed_at to last_indexed_at to better reflect its purpose - // Update indexed_at so incremental sync picks up this change - // Without this, modified entries would be skipped by watermark-based queries - entry_active.indexed_at = Set(Some(chrono::Utc::now())); + // TODO: Rename indexed_at to last_indexed_at to better reflect its purpose + // Update indexed_at so incremental sync picks up this change + // Without this, modified entries would be skipped by watermark-based queries + entry_active.indexed_at = Set(Some(chrono::Utc::now())); - entry_active - .update(ctx.library_db()) - .await - .map_err(|e| JobError::execution(format!("Failed to update entry: {}", e)))?; + entry_active + .update(ctx.library_db()) + .await + .map_err(|e| JobError::execution(format!("Failed to update entry: {}", e)))?; - Ok(()) - } + Ok(()) + } - /// Handle entry move operation with closure table updates (creates own transaction) - pub async fn move_entry( - state: &mut IndexerState, - ctx: &impl IndexingCtx, - entry_id: i32, - old_path: &Path, - new_path: &Path, - location_root_path: &Path, - ) -> Result<(), JobError> { - // Begin transaction for atomic move operation - let txn = ctx - .library_db() - .begin() - .await - .map_err(|e| JobError::execution(format!("Failed to begin transaction: {}", e)))?; + /// Handle entry move operation with closure table updates (creates own transaction) + pub async fn move_entry( + state: &mut IndexerState, + ctx: &impl IndexingCtx, + entry_id: i32, + old_path: &Path, + new_path: &Path, + location_root_path: &Path, + ) -> Result<(), JobError> { + // Begin transaction for atomic move operation + let txn = ctx + .library_db() + .begin() + .await + .map_err(|e| JobError::execution(format!("Failed to begin transaction: {}", e)))?; - let result = Self::move_entry_in_conn( - state, - ctx, - entry_id, - old_path, - new_path, - location_root_path, - &txn, - ) - .await; + let result = Self::move_entry_in_conn( + state, + ctx, + entry_id, + old_path, + new_path, + location_root_path, + &txn, + ) + .await; - match result { - Ok(()) => { - txn.commit().await.map_err(|e| { - JobError::execution(format!("Failed to commit move transaction: {}", e)) - })?; - Ok(()) - } - Err(e) => { - let _ = txn.rollback().await; - Err(e) - } - } - } + match result { + Ok(()) => { + txn.commit().await.map_err(|e| { + JobError::execution(format!("Failed to commit move transaction: {}", e)) + })?; + Ok(()) 
+ } + Err(e) => { + let _ = txn.rollback().await; + Err(e) + } + } + } - /// Handle entry move operation within existing transaction - pub async fn move_entry_in_conn( - state: &mut IndexerState, - ctx: &impl IndexingCtx, - entry_id: i32, - old_path: &Path, - new_path: &Path, - location_root_path: &Path, - txn: &DatabaseTransaction, - ) -> Result<(), JobError> { - // Get the entry - let db_entry = entities::entry::Entity::find_by_id(entry_id) - .one(txn) - .await - .map_err(|e| JobError::execution(format!("Failed to find entry: {}", e)))? - .ok_or_else(|| JobError::execution("Entry not found for move".to_string()))?; + /// Handle entry move operation within existing transaction + pub async fn move_entry_in_conn( + state: &mut IndexerState, + ctx: &impl IndexingCtx, + entry_id: i32, + old_path: &Path, + new_path: &Path, + location_root_path: &Path, + txn: &DatabaseTransaction, + ) -> Result<(), JobError> { + // Get the entry + let db_entry = entities::entry::Entity::find_by_id(entry_id) + .one(txn) + .await + .map_err(|e| JobError::execution(format!("Failed to find entry: {}", e)))? + .ok_or_else(|| JobError::execution("Entry not found for move".to_string()))?; - let is_directory = db_entry.kind == Self::entry_kind_to_int(EntryKind::Directory); - let mut entry_active: entities::entry::ActiveModel = db_entry.into(); + let is_directory = db_entry.kind == Self::entry_kind_to_int(EntryKind::Directory); + let mut entry_active: entities::entry::ActiveModel = db_entry.into(); - // Find new parent entry ID - let new_parent_id = if let Some(parent_path) = new_path.parent() { - state.entry_id_cache.get(parent_path).copied() - } else { - None - }; + // Find new parent entry ID + let new_parent_id = if let Some(parent_path) = new_path.parent() { + state.entry_id_cache.get(parent_path).copied() + } else { + None + }; - // Update entry fields - entry_active.parent_id = Set(new_parent_id); + // Update entry fields + entry_active.parent_id = Set(new_parent_id); - // Extract new name if it changed - let mut new_name_value = None; - if let Some(new_name) = new_path.file_stem() { - let name_string = new_name.to_string_lossy().to_string(); - new_name_value = Some(name_string.clone()); - entry_active.name = Set(name_string); - } + // Extract new name if it changed + let mut new_name_value = None; + if let Some(new_name) = new_path.file_stem() { + let name_string = new_name.to_string_lossy().to_string(); + new_name_value = Some(name_string.clone()); + entry_active.name = Set(name_string); + } - // Save the updated entry - entry_active - .update(txn) - .await - .map_err(|e| JobError::execution(format!("Failed to update entry: {}", e)))?; + // Save the updated entry + entry_active + .update(txn) + .await + .map_err(|e| JobError::execution(format!("Failed to update entry: {}", e)))?; - // Update closure table for the move operation - // Step 1: Delete all ancestor relationships for the moved subtree (except internal relationships) - txn.execute_unprepared(&format!( + // Update closure table for the move operation + // Step 1: Delete all ancestor relationships for the moved subtree (except internal relationships) + txn.execute_unprepared(&format!( "DELETE FROM entry_closure \ WHERE descendant_id IN (SELECT descendant_id FROM entry_closure WHERE ancestor_id = {}) \ AND ancestor_id NOT IN (SELECT descendant_id FROM entry_closure WHERE ancestor_id = {})", @@ -605,406 +608,406 @@ impl EntryProcessor { .await .map_err(|e| JobError::execution(format!("Failed to disconnect subtree: {}", e)))?; - // Step 2: If there's a new 
parent, reconnect the subtree - if let Some(new_parent_id) = new_parent_id { - // Connect moved subtree to new parent - txn.execute_unprepared(&format!( - "INSERT INTO entry_closure (ancestor_id, descendant_id, depth) \ + // Step 2: If there's a new parent, reconnect the subtree + if let Some(new_parent_id) = new_parent_id { + // Connect moved subtree to new parent + txn.execute_unprepared(&format!( + "INSERT INTO entry_closure (ancestor_id, descendant_id, depth) \ SELECT p.ancestor_id, c.descendant_id, p.depth + c.depth + 1 \ FROM entry_closure p, entry_closure c \ WHERE p.descendant_id = {} AND c.ancestor_id = {}", - new_parent_id, entry_id - )) - .await - .map_err(|e| JobError::execution(format!("Failed to reconnect subtree: {}", e)))?; - } + new_parent_id, entry_id + )) + .await + .map_err(|e| JobError::execution(format!("Failed to reconnect subtree: {}", e)))?; + } - // If this is a directory, update its path in directory_paths table - if is_directory { - // Get the new name from what we saved earlier - let new_name = new_name_value.unwrap_or_else(|| { - // If name didn't change, get it from the path - new_path - .file_name() - .and_then(|n| n.to_str()) - .unwrap_or("unknown") - .to_string() - }); + // If this is a directory, update its path in directory_paths table + if is_directory { + // Get the new name from what we saved earlier + let new_name = new_name_value.unwrap_or_else(|| { + // If name didn't change, get it from the path + new_path + .file_name() + .and_then(|n| n.to_str()) + .unwrap_or("unknown") + .to_string() + }); - // Build the new path - let new_directory_path = - PathResolver::build_directory_path(txn, new_parent_id, &new_name) - .await - .map_err(|e| { - JobError::execution(format!("Failed to build new directory path: {}", e)) - })?; + // Build the new path + let new_directory_path = + PathResolver::build_directory_path(txn, new_parent_id, &new_name) + .await + .map_err(|e| { + JobError::execution(format!("Failed to build new directory path: {}", e)) + })?; - // Get the old path for descendant updates - let old_directory_path = PathResolver::get_directory_path(txn, entry_id) - .await - .map_err(|e| { - JobError::execution(format!("Failed to get old directory path: {}", e)) - })?; + // Get the old path for descendant updates + let old_directory_path = PathResolver::get_directory_path(txn, entry_id) + .await + .map_err(|e| { + JobError::execution(format!("Failed to get old directory path: {}", e)) + })?; - // Update the directory's own path - let mut dir_path_active = directory_paths::Entity::find_by_id(entry_id) - .one(txn) - .await - .map_err(|e| JobError::execution(format!("Failed to find directory path: {}", e)))? - .ok_or_else(|| JobError::execution("Directory path not found".to_string()))? - .into_active_model(); - dir_path_active.path = Set(new_directory_path.clone()); - dir_path_active.update(txn).await.map_err(|e| { - JobError::execution(format!("Failed to update directory path: {}", e)) - })?; + // Update the directory's own path + let mut dir_path_active = directory_paths::Entity::find_by_id(entry_id) + .one(txn) + .await + .map_err(|e| JobError::execution(format!("Failed to find directory path: {}", e)))? + .ok_or_else(|| JobError::execution("Directory path not found".to_string()))? 
+ .into_active_model(); + dir_path_active.path = Set(new_directory_path.clone()); + dir_path_active.update(txn).await.map_err(|e| { + JobError::execution(format!("Failed to update directory path: {}", e)) + })?; - // Update descendant directory paths within the same transaction - // Note: This is done synchronously within the batch transaction for consistency - if let Err(e) = PathResolver::update_descendant_paths( - txn, - entry_id, - &old_directory_path, - &new_directory_path, - ) - .await - { - tracing::error!("Failed to update descendant paths: {}", e); - } - } + // Update descendant directory paths within the same transaction + // Note: This is done synchronously within the batch transaction for consistency + if let Err(e) = PathResolver::update_descendant_paths( + txn, + entry_id, + &old_directory_path, + &new_directory_path, + ) + .await + { + tracing::error!("Failed to update descendant paths: {}", e); + } + } - // Update cache - state.entry_id_cache.remove(old_path); - state - .entry_id_cache - .insert(new_path.to_path_buf(), entry_id); + // Update cache + state.entry_id_cache.remove(old_path); + state + .entry_id_cache + .insert(new_path.to_path_buf(), entry_id); - Ok(()) - } + Ok(()) + } - /// Convert EntryKind to integer for database storage - pub fn entry_kind_to_int(kind: EntryKind) -> i32 { - match kind { - EntryKind::File => 0, - EntryKind::Directory => 1, - EntryKind::Symlink => 2, - } - } + /// Convert EntryKind to integer for database storage + pub fn entry_kind_to_int(kind: EntryKind) -> i32 { + match kind { + EntryKind::File => 0, + EntryKind::Directory => 1, + EntryKind::Symlink => 2, + } + } - /// Create or find content identity and link to entry with deterministic UUID - /// This method implements the content identification phase logic - /// Returns models for batch syncing (caller responsible for sync) - pub async fn link_to_content_identity( - ctx: &impl IndexingCtx, - entry_id: i32, - path: &Path, - content_hash: String, - library_id: Uuid, - ) -> Result { - // Check if content identity already exists by content_hash - let existing = entities::content_identity::Entity::find() - .filter(entities::content_identity::Column::ContentHash.eq(&content_hash)) - .one(ctx.library_db()) - .await - .map_err(|e| JobError::execution(format!("Failed to query content identity: {}", e)))?; + /// Create or find content identity and link to entry with deterministic UUID + /// This method implements the content identification phase logic + /// Returns models for batch syncing (caller responsible for sync) + pub async fn link_to_content_identity( + ctx: &impl IndexingCtx, + entry_id: i32, + path: &Path, + content_hash: String, + library_id: Uuid, + ) -> Result { + // Check if content identity already exists by content_hash + let existing = entities::content_identity::Entity::find() + .filter(entities::content_identity::Column::ContentHash.eq(&content_hash)) + .one(ctx.library_db()) + .await + .map_err(|e| JobError::execution(format!("Failed to query content identity: {}", e)))?; - let (content_model, is_new_content) = if let Some(existing) = existing { - // Increment entry count for existing content - let mut existing_active: entities::content_identity::ActiveModel = existing.into(); - existing_active.entry_count = Set(existing_active.entry_count.unwrap() + 1); - existing_active.last_verified_at = Set(chrono::Utc::now()); + let (content_model, is_new_content) = if let Some(existing) = existing { + // Increment entry count for existing content + let mut existing_active: 
entities::content_identity::ActiveModel = existing.into(); + existing_active.entry_count = Set(existing_active.entry_count.unwrap() + 1); + existing_active.last_verified_at = Set(chrono::Utc::now()); - let updated = existing_active - .update(ctx.library_db()) - .await - .map_err(|e| { - JobError::execution(format!("Failed to update content identity: {}", e)) - })?; + let updated = existing_active + .update(ctx.library_db()) + .await + .map_err(|e| { + JobError::execution(format!("Failed to update content identity: {}", e)) + })?; - (updated, false) - } else { - // Create new content identity with deterministic UUID (ready for sync) - let file_size = tokio::fs::symlink_metadata(path) - .await - .map(|m| m.len() as i64) - .unwrap_or(0); + (updated, false) + } else { + // Create new content identity with deterministic UUID (ready for sync) + let file_size = tokio::fs::symlink_metadata(path) + .await + .map(|m| m.len() as i64) + .unwrap_or(0); - // Generate deterministic UUID from content_hash + library_id - let deterministic_uuid = { - const LIBRARY_NAMESPACE: uuid::Uuid = uuid::Uuid::from_bytes([ - 0x6b, 0xa7, 0xb8, 0x10, 0x9d, 0xad, 0x11, 0xd1, 0x80, 0xb4, 0x00, 0xc0, 0x4f, - 0xd4, 0x30, 0xc8, - ]); - // We use v5 to ensure the UUID is deterministic and unique within the library - let namespace = uuid::Uuid::new_v5(&LIBRARY_NAMESPACE, library_id.as_bytes()); - uuid::Uuid::new_v5(&namespace, content_hash.as_bytes()) - }; + // Generate deterministic UUID from content_hash + library_id + let deterministic_uuid = { + const LIBRARY_NAMESPACE: uuid::Uuid = uuid::Uuid::from_bytes([ + 0x6b, 0xa7, 0xb8, 0x10, 0x9d, 0xad, 0x11, 0xd1, 0x80, 0xb4, 0x00, 0xc0, 0x4f, + 0xd4, 0x30, 0xc8, + ]); + // We use v5 to ensure the UUID is deterministic and unique within the library + let namespace = uuid::Uuid::new_v5(&LIBRARY_NAMESPACE, library_id.as_bytes()); + uuid::Uuid::new_v5(&namespace, content_hash.as_bytes()) + }; - // Detect file type using the file type registry - let registry = FileTypeRegistry::default(); - let file_type_result = registry.identify(path).await; + // Detect file type using the file type registry + let registry = FileTypeRegistry::default(); + let file_type_result = registry.identify(path).await; - let (kind_id, mime_type_id) = match file_type_result { - Ok(result) => { - // Get content kind ID directly from the enum - let kind_id = result.file_type.category as i32; + let (kind_id, mime_type_id) = match file_type_result { + Ok(result) => { + // Get content kind ID directly from the enum + let kind_id = result.file_type.category as i32; - // Handle MIME type - upsert if found - let mime_type_id = if let Some(mime_str) = result.file_type.primary_mime_type() - { - // Check if MIME type already exists - let existing = entities::mime_type::Entity::find() - .filter(entities::mime_type::Column::MimeType.eq(mime_str)) - .one(ctx.library_db()) - .await - .map_err(|e| { - JobError::execution(format!("Failed to query mime type: {}", e)) - })?; + // Handle MIME type - upsert if found + let mime_type_id = if let Some(mime_str) = result.file_type.primary_mime_type() + { + // Check if MIME type already exists + let existing = entities::mime_type::Entity::find() + .filter(entities::mime_type::Column::MimeType.eq(mime_str)) + .one(ctx.library_db()) + .await + .map_err(|e| { + JobError::execution(format!("Failed to query mime type: {}", e)) + })?; - match existing { - Some(mime_record) => Some(mime_record.id), - None => { - // Create new MIME type entry - let new_mime = entities::mime_type::ActiveModel { 
- uuid: Set(Uuid::new_v4()), - mime_type: Set(mime_str.to_string()), - created_at: Set(chrono::Utc::now()), - ..Default::default() - }; + match existing { + Some(mime_record) => Some(mime_record.id), + None => { + // Create new MIME type entry + let new_mime = entities::mime_type::ActiveModel { + uuid: Set(Uuid::new_v4()), + mime_type: Set(mime_str.to_string()), + created_at: Set(chrono::Utc::now()), + ..Default::default() + }; - let mime_result = - new_mime.insert(ctx.library_db()).await.map_err(|e| { - JobError::execution(format!( - "Failed to create mime type: {}", - e - )) - })?; + let mime_result = + new_mime.insert(ctx.library_db()).await.map_err(|e| { + JobError::execution(format!( + "Failed to create mime type: {}", + e + )) + })?; - Some(mime_result.id) - } - } - } else { - None - }; + Some(mime_result.id) + } + } + } else { + None + }; - (kind_id, mime_type_id) - } - Err(_) => { - // If identification fails, fall back to "unknown" (0) - (0, None) - } - }; + (kind_id, mime_type_id) + } + Err(_) => { + // If identification fails, fall back to "unknown" (0) + (0, None) + } + }; - let new_content = entities::content_identity::ActiveModel { - uuid: Set(Some(deterministic_uuid)), // Deterministic UUID for sync - integrity_hash: Set(None), // Generated later by validate job - content_hash: Set(content_hash.clone()), - mime_type_id: Set(mime_type_id), - kind_id: Set(kind_id), - text_content: Set(None), // TODO: Extract text content for indexing - total_size: Set(file_size), - entry_count: Set(1), - first_seen_at: Set(chrono::Utc::now()), - last_verified_at: Set(chrono::Utc::now()), - ..Default::default() - }; + let new_content = entities::content_identity::ActiveModel { + uuid: Set(Some(deterministic_uuid)), // Deterministic UUID for sync + integrity_hash: Set(None), // Generated later by validate job + content_hash: Set(content_hash.clone()), + mime_type_id: Set(mime_type_id), + kind_id: Set(kind_id), + text_content: Set(None), // TODO: Extract text content for indexing + total_size: Set(file_size), + entry_count: Set(1), + first_seen_at: Set(chrono::Utc::now()), + last_verified_at: Set(chrono::Utc::now()), + ..Default::default() + }; - // Try to insert, but handle unique constraint violations - let result = match new_content.insert(ctx.library_db()).await { - Ok(model) => (model, true), - Err(e) => { - // Check if it's a unique constraint violation - if e.to_string().contains("UNIQUE constraint failed") { - // Another job created it - find and use the existing one - let existing = entities::content_identity::Entity::find() + // Try to insert, but handle unique constraint violations + let result = match new_content.insert(ctx.library_db()).await { + Ok(model) => (model, true), + Err(e) => { + // Check if it's a unique constraint violation + if e.to_string().contains("UNIQUE constraint failed") { + // Another job created it - find and use the existing one + let existing = entities::content_identity::Entity::find() .filter(entities::content_identity::Column::ContentHash.eq(&content_hash)) .one(ctx.library_db()) .await .map_err(|e| JobError::execution(format!("Failed to find existing content identity: {}", e)))? 
.ok_or_else(|| JobError::execution("Content identity should exist after unique constraint violation".to_string()))?; - // Update entry count - let mut existing_active: entities::content_identity::ActiveModel = - existing.clone().into(); - existing_active.entry_count = Set(existing.entry_count + 1); - existing_active.last_verified_at = Set(chrono::Utc::now()); + // Update entry count + let mut existing_active: entities::content_identity::ActiveModel = + existing.clone().into(); + existing_active.entry_count = Set(existing.entry_count + 1); + existing_active.last_verified_at = Set(chrono::Utc::now()); - let updated = - existing_active - .update(ctx.library_db()) - .await - .map_err(|e| { - JobError::execution(format!( - "Failed to update content identity: {}", - e - )) - })?; + let updated = + existing_active + .update(ctx.library_db()) + .await + .map_err(|e| { + JobError::execution(format!( + "Failed to update content identity: {}", + e + )) + })?; - (updated, false) - } else { - return Err(JobError::execution(format!( - "Failed to create content identity: {}", - e - ))); - } - } - }; + (updated, false) + } else { + return Err(JobError::execution(format!( + "Failed to create content identity: {}", + e + ))); + } + } + }; - result - }; + result + }; - // Update Entry with content_id (now sync-ready for regular files) - let entry = entities::entry::Entity::find_by_id(entry_id) - .one(ctx.library_db()) - .await - .map_err(|e| JobError::execution(format!("Failed to find entry: {}", e)))? - .ok_or_else(|| JobError::execution("Entry not found after creation".to_string()))?; + // Update Entry with content_id (now sync-ready for regular files) + let entry = entities::entry::Entity::find_by_id(entry_id) + .one(ctx.library_db()) + .await + .map_err(|e| JobError::execution(format!("Failed to find entry: {}", e)))? + .ok_or_else(|| JobError::execution("Entry not found after creation".to_string()))?; - let mut entry_active: entities::entry::ActiveModel = entry.into(); - entry_active.content_id = Set(Some(content_model.id)); + let mut entry_active: entities::entry::ActiveModel = entry.into(); + entry_active.content_id = Set(Some(content_model.id)); - let updated_entry = entry_active.update(ctx.library_db()).await.map_err(|e| { - JobError::execution(format!("Failed to link content identity to entry: {}", e)) - })?; + let updated_entry = entry_active.update(ctx.library_db()).await.map_err(|e| { + JobError::execution(format!("Failed to link content identity to entry: {}", e)) + })?; - Ok(ContentLinkResult { - content_identity: content_model, - entry: updated_entry, - is_new_content, - }) - } + Ok(ContentLinkResult { + content_identity: content_model, + entry: updated_entry, + is_new_content, + }) + } - /// Simple move entry within existing transaction (no directory path cascade updates) - pub async fn simple_move_entry_in_conn( - state: &mut IndexerState, - ctx: &impl IndexingCtx, - entry_id: i32, - old_path: &Path, - new_path: &Path, - txn: &DatabaseTransaction, - ) -> Result<(), JobError> { - // Get the entry - let db_entry = entities::entry::Entity::find_by_id(entry_id) - .one(txn) - .await - .map_err(|e| JobError::execution(format!("Failed to find entry: {}", e)))? 
- .ok_or_else(|| JobError::execution("Entry not found for move".to_string()))?; + /// Simple move entry within existing transaction (no directory path cascade updates) + pub async fn simple_move_entry_in_conn( + state: &mut IndexerState, + ctx: &impl IndexingCtx, + entry_id: i32, + old_path: &Path, + new_path: &Path, + txn: &DatabaseTransaction, + ) -> Result<(), JobError> { + // Get the entry + let db_entry = entities::entry::Entity::find_by_id(entry_id) + .one(txn) + .await + .map_err(|e| JobError::execution(format!("Failed to find entry: {}", e)))? + .ok_or_else(|| JobError::execution("Entry not found for move".to_string()))?; - let mut entry_active: entities::entry::ActiveModel = db_entry.into(); + let mut entry_active: entities::entry::ActiveModel = db_entry.into(); - // Find new parent entry ID - let new_parent_id = if let Some(parent_path) = new_path.parent() { - state.entry_id_cache.get(parent_path).copied() - } else { - None - }; + // Find new parent entry ID + let new_parent_id = if let Some(parent_path) = new_path.parent() { + state.entry_id_cache.get(parent_path).copied() + } else { + None + }; - // Update entry fields - entry_active.parent_id = Set(new_parent_id); + // Update entry fields + entry_active.parent_id = Set(new_parent_id); - // Extract new name and extension for files - match new_path.extension() { - Some(ext) => { - // File with extension - if let Some(stem) = new_path.file_stem() { - entry_active.name = Set(stem.to_string_lossy().to_string()); - entry_active.extension = Set(Some(ext.to_string_lossy().to_lowercase())); - } - } - None => { - // File without extension or directory - if let Some(name) = new_path.file_name() { - entry_active.name = Set(name.to_string_lossy().to_string()); - entry_active.extension = Set(None); - } - } - } + // Extract new name and extension for files + match new_path.extension() { + Some(ext) => { + // File with extension + if let Some(stem) = new_path.file_stem() { + entry_active.name = Set(stem.to_string_lossy().to_string()); + entry_active.extension = Set(Some(ext.to_string_lossy().to_lowercase())); + } + } + None => { + // File without extension or directory + if let Some(name) = new_path.file_name() { + entry_active.name = Set(name.to_string_lossy().to_string()); + entry_active.extension = Set(None); + } + } + } - // Save the updated entry - entry_active - .update(txn) - .await - .map_err(|e| JobError::execution(format!("Failed to update entry: {}", e)))?; + // Save the updated entry + entry_active + .update(txn) + .await + .map_err(|e| JobError::execution(format!("Failed to update entry: {}", e)))?; - // Update cache - state.entry_id_cache.remove(old_path); - state - .entry_id_cache - .insert(new_path.to_path_buf(), entry_id); + // Update cache + state.entry_id_cache.remove(old_path); + state + .entry_id_cache + .insert(new_path.to_path_buf(), entry_id); - Ok(()) - } + Ok(()) + } - /// Bulk move entries within a single transaction for better performance - pub async fn bulk_move_entries( - state: &mut IndexerState, - ctx: &impl IndexingCtx, - moves: &[(i32, PathBuf, PathBuf, super::state::DirEntry)], - _location_root_path: &Path, - txn: &DatabaseTransaction, - ) -> Result { - let mut moved_count = 0; + /// Bulk move entries within a single transaction for better performance + pub async fn bulk_move_entries( + state: &mut IndexerState, + ctx: &impl IndexingCtx, + moves: &[(i32, PathBuf, PathBuf, super::state::DirEntry)], + _location_root_path: &Path, + txn: &DatabaseTransaction, + ) -> Result { + let mut moved_count = 0; - for 
(entry_id, old_path, new_path, _) in moves { - match Self::simple_move_entry_in_conn(state, ctx, *entry_id, old_path, new_path, txn) - .await - { - Ok(()) => { - moved_count += 1; - } - Err(e) => { - // Log error but continue with other moves - ctx.log(format!( - "Failed to move entry {} from {} to {}: {}", - entry_id, - old_path.display(), - new_path.display(), - e - )); - } - } - } + for (entry_id, old_path, new_path, _) in moves { + match Self::simple_move_entry_in_conn(state, ctx, *entry_id, old_path, new_path, txn) + .await + { + Ok(()) => { + moved_count += 1; + } + Err(e) => { + // Log error but continue with other moves + ctx.log(format!( + "Failed to move entry {} from {} to {}: {}", + entry_id, + old_path.display(), + new_path.display(), + e + )); + } + } + } - Ok(moved_count) - } + Ok(moved_count) + } - /// Update entry within existing transaction - pub async fn update_entry_in_conn( - ctx: &impl IndexingCtx, - entry_id: i32, - entry: &super::state::DirEntry, - txn: &DatabaseTransaction, - ) -> Result<(), JobError> { - // Get the existing entry - let db_entry = entities::entry::Entity::find_by_id(entry_id) - .one(txn) - .await - .map_err(|e| JobError::execution(format!("Failed to find entry: {}", e)))? - .ok_or_else(|| JobError::execution("Entry not found for update".to_string()))?; + /// Update entry within existing transaction + pub async fn update_entry_in_conn( + ctx: &impl IndexingCtx, + entry_id: i32, + entry: &super::state::DirEntry, + txn: &DatabaseTransaction, + ) -> Result<(), JobError> { + // Get the existing entry + let db_entry = entities::entry::Entity::find_by_id(entry_id) + .one(txn) + .await + .map_err(|e| JobError::execution(format!("Failed to find entry: {}", e)))? + .ok_or_else(|| JobError::execution("Entry not found for update".to_string()))?; - let mut entry_active: entities::entry::ActiveModel = db_entry.into(); + let mut entry_active: entities::entry::ActiveModel = db_entry.into(); - // Update size if it changed - if let Ok(metadata) = std::fs::symlink_metadata(&entry.path) { - entry_active.size = Set(metadata.len() as i64); + // Update size if it changed + if let Ok(metadata) = std::fs::symlink_metadata(&entry.path) { + entry_active.size = Set(metadata.len() as i64); - // Update modified time - if let Ok(modified) = metadata.modified() { - if let Ok(duration) = modified.duration_since(std::time::UNIX_EPOCH) { - entry_active.modified_at = Set(chrono::DateTime::from_timestamp( - duration.as_secs() as i64, - 0, - ) - .unwrap_or_default()); - } - } - } + // Update modified time + if let Ok(modified) = metadata.modified() { + if let Ok(duration) = modified.duration_since(std::time::UNIX_EPOCH) { + entry_active.modified_at = Set(chrono::DateTime::from_timestamp( + duration.as_secs() as i64, + 0, + ) + .unwrap_or_default()); + } + } + } - // Save the updated entry - entry_active - .update(txn) - .await - .map_err(|e| JobError::execution(format!("Failed to update entry: {}", e)))?; + // Save the updated entry + entry_active + .update(txn) + .await + .map_err(|e| JobError::execution(format!("Failed to update entry: {}", e)))?; - Ok(()) - } -} \ No newline at end of file + Ok(()) + } +} diff --git a/core/src/ops/indexing/ephemeral/arena.rs b/core/src/ops/indexing/ephemeral/arena.rs new file mode 100644 index 000000000..198e5b58b --- /dev/null +++ b/core/src/ops/indexing/ephemeral/arena.rs @@ -0,0 +1,169 @@ +//! Vec-based arena storage for file nodes +//! +//! The NodeArena provides efficient, contiguous storage for FileNodes. +//! Key features: +//! 
- O(1) insertion and lookup by EntryId
+//! - Cache-friendly contiguous memory layout
+//! - Iteration over all nodes
+//!
+//! For very large indexes (10M+ files), this could be upgraded to use
+//! memory-mapped storage, but Vec is sufficient for most use cases.
+
+use super::types::{EntryId, FileNode};
+
+/// Arena storage for file nodes using a simple Vec
+///
+/// Nodes are stored contiguously in memory for cache-friendly access.
+/// EntryIds are stable indexes into this Vec.
+pub struct NodeArena {
+	/// Vector of nodes
+	nodes: Vec<FileNode>,
+}
+
+impl NodeArena {
+	/// Create a new empty arena
+	pub fn new() -> Self {
+		Self { nodes: Vec::new() }
+	}
+
+	/// Create an arena with pre-allocated capacity
+	pub fn with_capacity(capacity: usize) -> Self {
+		Self {
+			nodes: Vec::with_capacity(capacity),
+		}
+	}
+
+	/// Insert a node and return its ID
+	pub fn insert(&mut self, node: FileNode) -> EntryId {
+		let id = EntryId::from_usize(self.nodes.len());
+		self.nodes.push(node);
+		id
+	}
+
+	/// Get node by ID
+	pub fn get(&self, id: EntryId) -> Option<&FileNode> {
+		self.nodes.get(id.as_usize())
+	}
+
+	/// Get mutable node by ID
+	pub fn get_mut(&mut self, id: EntryId) -> Option<&mut FileNode> {
+		self.nodes.get_mut(id.as_usize())
+	}
+
+	/// Get the number of nodes
+	pub fn len(&self) -> usize {
+		self.nodes.len()
+	}
+
+	/// Check if the arena is empty
+	pub fn is_empty(&self) -> bool {
+		self.nodes.is_empty()
+	}
+
+	/// Shrink capacity to fit current size
+	pub fn shrink_to_fit(&mut self) {
+		self.nodes.shrink_to_fit();
+	}
+
+	/// Get current capacity
+	pub fn capacity(&self) -> usize {
+		self.nodes.capacity()
+	}
+
+	/// Reserve additional capacity
+	pub fn reserve(&mut self, additional: usize) {
+		self.nodes.reserve(additional);
+	}
+
+	/// Iterate over all nodes
+	pub fn iter(&self) -> impl Iterator<Item = (EntryId, &FileNode)> {
+		self.nodes
+			.iter()
+			.enumerate()
+			.map(|(i, node)| (EntryId::from_usize(i), node))
+	}
+
+	/// Iterate over all nodes mutably
+	pub fn iter_mut(&mut self) -> impl Iterator<Item = (EntryId, &mut FileNode)> {
+		self.nodes
+			.iter_mut()
+			.enumerate()
+			.map(|(i, node)| (EntryId::from_usize(i), node))
+	}
+
+	/// Get approximate memory usage in bytes
+	pub fn memory_usage(&self) -> usize {
+		// Base struct size + Vec allocation
+		std::mem::size_of::<Self>()
+			+ self.nodes.capacity() * std::mem::size_of::<FileNode>()
+			+ self
+				.nodes
+				.iter()
+				.map(|n| n.children.capacity() * std::mem::size_of::<EntryId>())
+				.sum::<usize>()
+	}
+}
+
+impl Default for NodeArena {
+	fn default() -> Self {
+		Self::new()
+	}
+}
+
+#[cfg(test)]
+mod tests {
+	use super::*;
+	use crate::ops::indexing::ephemeral::types::{
+		FileType, MaybeEntryId, NameRef, NodeState, PackedMetadata,
+	};
+
+	fn make_test_node(name: &'static str) -> FileNode {
+		let meta = PackedMetadata::new(NodeState::Accessible, FileType::File, 100);
+		FileNode::new(NameRef::new(name, MaybeEntryId::NONE), meta)
+	}
+
+	#[test]
+	fn test_insert_and_get() {
+		let mut arena = NodeArena::new();
+
+		let id1 = arena.insert(make_test_node("file1.txt"));
+		let id2 = arena.insert(make_test_node("file2.txt"));
+
+		assert_eq!(arena.len(), 2);
+		assert_eq!(arena.get(id1).unwrap().name(), "file1.txt");
+		assert_eq!(arena.get(id2).unwrap().name(), "file2.txt");
+	}
+
+	#[test]
+	fn test_get_nonexistent() {
+		let arena = NodeArena::new();
+		assert!(arena.get(EntryId::from_usize(0)).is_none());
+	}
+
+	#[test]
+	fn test_iteration() {
+		let mut arena = NodeArena::new();
+
+		arena.insert(make_test_node("a"));
+		arena.insert(make_test_node("b"));
+		arena.insert(make_test_node("c"));
+
+		let names: Vec<&str> =
+			arena.iter().map(|(_, node)| node.name()).collect();
+		assert_eq!(names, vec!["a", "b", "c"]);
+	}
+
+	#[test]
+	fn test_with_capacity() {
+		let arena = NodeArena::with_capacity(1000);
+		assert!(arena.capacity() >= 1000);
+		assert!(arena.is_empty());
+	}
+
+	#[test]
+	fn test_shrink_to_fit() {
+		let mut arena = NodeArena::with_capacity(1000);
+		arena.insert(make_test_node("a"));
+		arena.shrink_to_fit();
+		assert!(arena.capacity() < 1000);
+	}
+}
diff --git a/core/src/ops/indexing/ephemeral/cache.rs b/core/src/ops/indexing/ephemeral/cache.rs
new file mode 100644
index 000000000..b6d66b73f
--- /dev/null
+++ b/core/src/ops/indexing/ephemeral/cache.rs
@@ -0,0 +1,206 @@
+//! String interning cache for deduplicating filenames
+//!
+//! The NameCache provides global string interning to reduce memory usage.
+//! Common filenames like `.git`, `node_modules`, `target`, `README.md` etc.
+//! are stored only once and referenced via pointers.
+//!
+//! Benefits:
+//! - 30-40% memory reduction on typical filesystems
+//! - Pointer-based equality (faster comparisons)
+//! - Stable references for NameRef
+
+use parking_lot::Mutex;
+use std::collections::BTreeSet;
+
+/// Global string interning pool for deduplicating filenames
+///
+/// Strings are stored in a BTreeSet for ordered iteration and fast lookup.
+/// The Mutex ensures thread-safe access for concurrent indexing.
+pub struct NameCache {
+	inner: Mutex<BTreeSet<Box<str>>>,
+}
+
+impl NameCache {
+	/// Create a new empty cache
+	pub fn new() -> Self {
+		Self {
+			inner: Mutex::new(BTreeSet::new()),
+		}
+	}
+
+	/// Intern a string and return a stable reference
+	///
+	/// If the string already exists, returns a reference to the existing copy.
+	/// If not, inserts a new copy and returns a reference to it.
+	///
+	/// # Safety
+	/// The returned reference is valid as long as the NameCache exists.
+	/// NameCache never removes strings, so references remain stable.
+	pub fn intern<'cache>(&'cache self, name: &str) -> &'cache str {
+		let mut inner = self.inner.lock();
+
+		// Check if already interned
+		if let Some(existing) = inner.get(name) {
+			// SAFETY: BTreeSet owns the Box, which lives as long as NameCache.
+			// We return a reference with lifetime tied to &self.
+			return unsafe { &*(existing.as_ref() as *const str) };
+		}
+
+		// Insert new string
+		let boxed: Box<str> = name.into();
+		let ptr = boxed.as_ref() as *const str;
+		inner.insert(boxed);
+
+		// SAFETY: We just inserted the string, and NameCache never removes strings.
+		// The pointer remains valid as long as NameCache exists.
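+		// Illustrative effect of interning, assuming typical usage: a name like
+		// ".git" seen in thousands of directories is stored once, and every
+		// NameRef to it can be compared by pointer rather than by contents
+		// (see `test_intern_returns_same_pointer` below).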
+ unsafe { &*ptr } + } + + /// Get the number of interned strings + pub fn len(&self) -> usize { + self.inner.lock().len() + } + + /// Check if the cache is empty + pub fn is_empty(&self) -> bool { + self.inner.lock().is_empty() + } + + /// Check if a string is already interned + pub fn contains(&self, name: &str) -> bool { + self.inner.lock().contains(name) + } + + /// Get approximate memory usage in bytes + pub fn memory_usage(&self) -> usize { + let inner = self.inner.lock(); + // Base struct size + BTreeSet overhead + string contents + std::mem::size_of::() + + inner.len() * std::mem::size_of::>() + + inner.iter().map(|s| s.len()).sum::() + } + + /// Iterate over all interned strings + pub fn iter(&self) -> impl Iterator { + let inner = self.inner.lock(); + inner + .iter() + .map(|s| s.to_string()) + .collect::>() + .into_iter() + } +} + +impl Default for NameCache { + fn default() -> Self { + Self::new() + } +} + +// SAFETY: NameCache uses Mutex for thread-safe access +unsafe impl Send for NameCache {} +unsafe impl Sync for NameCache {} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_intern_returns_same_pointer() { + let cache = NameCache::new(); + + let s1 = cache.intern("hello"); + let s2 = cache.intern("hello"); + + // Same pointer means same interned string + assert!(std::ptr::eq(s1, s2)); + assert_eq!(s1, "hello"); + } + + #[test] + fn test_intern_different_strings() { + let cache = NameCache::new(); + + let s1 = cache.intern("hello"); + let s2 = cache.intern("world"); + + assert!(!std::ptr::eq(s1, s2)); + assert_eq!(s1, "hello"); + assert_eq!(s2, "world"); + } + + #[test] + fn test_len_and_contains() { + let cache = NameCache::new(); + + assert_eq!(cache.len(), 0); + assert!(!cache.contains("test")); + + cache.intern("test"); + assert_eq!(cache.len(), 1); + assert!(cache.contains("test")); + + // Interning same string doesn't increase count + cache.intern("test"); + assert_eq!(cache.len(), 1); + } + + #[test] + fn test_common_filenames() { + let cache = NameCache::new(); + + // Simulate common filesystem patterns + let common_names = [ + ".git", + ".gitignore", + "node_modules", + "target", + "Cargo.toml", + "README.md", + "package.json", + "src", + "lib", + "main.rs", + ]; + + for name in &common_names { + cache.intern(name); + } + + // All unique, so length equals count + assert_eq!(cache.len(), common_names.len()); + + // Interning again returns same references + for name in &common_names { + let ptr1 = cache.intern(name); + let ptr2 = cache.intern(name); + assert!(std::ptr::eq(ptr1, ptr2)); + } + } + + #[test] + fn test_thread_safety() { + use std::sync::Arc; + use std::thread; + + let cache = Arc::new(NameCache::new()); + let mut handles = vec![]; + + for i in 0..10 { + let cache = Arc::clone(&cache); + handles.push(thread::spawn(move || { + for j in 0..100 { + let name = format!("file_{}_{}", i, j); + cache.intern(&name); + } + })); + } + + for handle in handles { + handle.join().unwrap(); + } + + // Should have 1000 unique strings + assert_eq!(cache.len(), 1000); + } +} diff --git a/core/src/ops/indexing/ephemeral/index_cache.rs b/core/src/ops/indexing/ephemeral/index_cache.rs new file mode 100644 index 000000000..52932cf35 --- /dev/null +++ b/core/src/ops/indexing/ephemeral/index_cache.rs @@ -0,0 +1,300 @@ +//! Global cache for ephemeral indexes +//! +//! This module provides a thread-safe cache for storing ephemeral indexes +//! by their root path. This allows directory listing queries to reuse +//! 
existing indexes instead of spawning new indexer jobs.
+
+use crate::ops::indexing::EphemeralIndex;
+use parking_lot::RwLock;
+use std::{
+	collections::HashMap,
+	path::{Path, PathBuf},
+	sync::Arc,
+	time::{Duration, Instant},
+};
+use tokio::sync::RwLock as TokioRwLock;
+
+/// Default TTL for ephemeral indexes (5 minutes)
+const DEFAULT_TTL: Duration = Duration::from_secs(5 * 60);
+
+/// Maximum idle time before an index is considered stale (2 minutes).
+/// Currently informational: eviction is TTL-based and `is_stale` does not
+/// yet consult idle time.
+const MAX_IDLE_TIME: Duration = Duration::from_secs(2 * 60);
+
+/// Cache entry wrapping an ephemeral index with metadata
+struct CacheEntry {
+	/// The ephemeral index
+	index: Arc<TokioRwLock<EphemeralIndex>>,
+	/// When this entry was created
+	created_at: Instant,
+	/// Whether an indexer job is currently running for this path
+	indexing_in_progress: bool,
+}
+
+impl CacheEntry {
+	fn new(index: Arc<TokioRwLock<EphemeralIndex>>) -> Self {
+		Self {
+			index,
+			created_at: Instant::now(),
+			indexing_in_progress: false,
+		}
+	}
+
+	fn is_stale(&self, ttl: Duration) -> bool {
+		self.created_at.elapsed() > ttl
+	}
+}
+
+/// Global cache for ephemeral indexes
+///
+/// Stores ephemeral indexes by their root path for reuse across queries.
+/// Entries are evicted once they outlive the configured TTL.
+pub struct EphemeralIndexCache {
+	/// Map of root path to cache entry
+	entries: RwLock<HashMap<PathBuf, CacheEntry>>,
+	/// Time-to-live for cache entries
+	ttl: Duration,
+}
+
+impl EphemeralIndexCache {
+	/// Create a new cache with the default TTL
+	pub fn new() -> Self {
+		Self {
+			entries: RwLock::new(HashMap::new()),
+			ttl: DEFAULT_TTL,
+		}
+	}
+
+	/// Create a new cache with a custom TTL
+	pub fn with_ttl(ttl: Duration) -> Self {
+		Self {
+			entries: RwLock::new(HashMap::new()),
+			ttl,
+		}
+	}
+
+	/// Get an existing index for a path, or None if not cached or stale
+	///
+	/// This does not report whether the index is still being populated;
+	/// use [`Self::is_indexing`] for that.
+	pub fn get(&self, path: &Path) -> Option<Arc<TokioRwLock<EphemeralIndex>>> {
+		let entries = self.entries.read();
+		entries
+			.get(path)
+			.filter(|entry| !entry.is_stale(self.ttl))
+			.map(|entry| entry.index.clone())
+	}
+
+	/// Get an existing index for a path (exact match only)
+	///
+	/// Returns the index if:
+	/// 1. An index exists for this exact path
+	/// 2. The index is not stale
+	///
+	/// Note: We only use exact matches because ephemeral indexing uses
+	/// IndexScope::Current (single level), so an ancestor index doesn't
+	/// contain the contents of subdirectories.
+	pub fn get_for_path(&self, path: &Path) -> Option<Arc<TokioRwLock<EphemeralIndex>>> {
+		let entries = self.entries.read();
+
+		// Only exact match - ancestor indexes don't contain subdirectory contents
+		// because ephemeral indexing uses IndexScope::Current (single level)
+		if let Some(entry) = entries.get(path) {
+			if !entry.is_stale(self.ttl) {
+				return Some(entry.index.clone());
+			}
+		}
+
+		None
+	}
+
+	/// Check if indexing is in progress for a path
+	pub fn is_indexing(&self, path: &Path) -> bool {
+		let entries = self.entries.read();
+		entries
+			.get(path)
+			.map(|e| e.indexing_in_progress)
+			.unwrap_or(false)
+	}
+
+	/// Insert or update an index in the cache
+	pub fn insert(&self, path: PathBuf, index: Arc<TokioRwLock<EphemeralIndex>>) {
+		let mut entries = self.entries.write();
+		entries.insert(path, CacheEntry::new(index));
+	}
+
+	/// Create a new index for a path and mark it as indexing in progress
+	///
+	/// Returns the index to be used by the indexer job.
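+	///
+	/// A minimal usage sketch (hypothetical caller; assumes `cache` and
+	/// `path` are in scope):
+	///
+	/// ```rust,ignore
+	/// let index = cache.create_for_indexing(path.clone());
+	/// // ... the indexer job populates `index` ...
+	/// cache.mark_indexing_complete(&path);
+	/// assert!(!cache.is_indexing(&path));
+	/// ```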
+ pub fn create_for_indexing(&self, path: PathBuf) -> Arc> { + let mut entries = self.entries.write(); + + // Check if entry already exists + if let Some(entry) = entries.get_mut(&path) { + entry.indexing_in_progress = true; + return entry.index.clone(); + } + + // Create new entry + let index = Arc::new(TokioRwLock::new(EphemeralIndex::new(path.clone()))); + let mut entry = CacheEntry::new(index.clone()); + entry.indexing_in_progress = true; + entries.insert(path, entry); + index + } + + /// Mark indexing as complete for a path + /// + /// This also refreshes the entry's `created_at` timestamp so it's no longer + /// considered stale. This is important because `create_for_indexing()` may + /// have reused an existing stale entry, and without this refresh the entry + /// would remain stale even after being freshly populated. + pub fn mark_indexing_complete(&self, path: &Path) { + let mut entries = self.entries.write(); + if let Some(entry) = entries.get_mut(path) { + entry.indexing_in_progress = false; + // Reset created_at so the freshly-populated index is no longer stale + entry.created_at = Instant::now(); + } + } + + /// Remove an index from the cache + pub fn remove(&self, path: &Path) { + let mut entries = self.entries.write(); + entries.remove(path); + } + + /// Remove stale entries from the cache + pub fn evict_stale(&self) { + let mut entries = self.entries.write(); + entries.retain(|_, entry| !entry.is_stale(self.ttl)); + } + + /// Get the number of cached indexes + pub fn len(&self) -> usize { + self.entries.read().len() + } + + /// Check if the cache is empty + pub fn is_empty(&self) -> bool { + self.entries.read().is_empty() + } + + /// Get all cached root paths + pub fn cached_paths(&self) -> Vec { + self.entries.read().keys().cloned().collect() + } + + /// Get cache statistics + pub fn stats(&self) -> EphemeralIndexCacheStats { + let entries = self.entries.read(); + let total_entries = entries.len(); + let indexing_count = entries.values().filter(|e| e.indexing_in_progress).count(); + let stale_count = entries.values().filter(|e| e.is_stale(self.ttl)).count(); + + EphemeralIndexCacheStats { + total_entries, + indexing_count, + stale_count, + } + } +} + +impl Default for EphemeralIndexCache { + fn default() -> Self { + Self::new() + } +} + +/// Statistics about the ephemeral index cache +#[derive(Debug, Clone)] +pub struct EphemeralIndexCacheStats { + pub total_entries: usize, + pub indexing_count: usize, + pub stale_count: usize, +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_insert_and_get() { + let cache = EphemeralIndexCache::new(); + let path = PathBuf::from("/test/path"); + let index = Arc::new(TokioRwLock::new(EphemeralIndex::new(path.clone()))); + + cache.insert(path.clone(), index.clone()); + + assert!(cache.get(&path).is_some()); + assert_eq!(cache.len(), 1); + } + + #[test] + fn test_get_nonexistent() { + let cache = EphemeralIndexCache::new(); + assert!(cache.get(Path::new("/nonexistent")).is_none()); + } + + #[test] + fn test_create_for_indexing() { + let cache = EphemeralIndexCache::new(); + let path = PathBuf::from("/test/path"); + + let index = cache.create_for_indexing(path.clone()); + + assert!(cache.is_indexing(&path)); + + cache.mark_indexing_complete(&path); + + assert!(!cache.is_indexing(&path)); + } + + #[test] + fn test_remove() { + let cache = EphemeralIndexCache::new(); + let path = PathBuf::from("/test/path"); + let index = Arc::new(TokioRwLock::new(EphemeralIndex::new(path.clone()))); + + cache.insert(path.clone(), index); + 
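+		// Added illustration: within the TTL, the freshly inserted entry is
+		// still retrievable until it is removed.
+		assert!(cache.get(&path).is_some());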
assert_eq!(cache.len(), 1); + + cache.remove(&path); + assert_eq!(cache.len(), 0); + } + + #[test] + fn test_get_for_path_exact_match_only() { + let cache = EphemeralIndexCache::new(); + let root = PathBuf::from("/test"); + let child = PathBuf::from("/test/subdir/file.txt"); + let index = Arc::new(TokioRwLock::new(EphemeralIndex::new(root.clone()))); + + cache.insert(root.clone(), index); + + // Should NOT find ancestor index - we only use exact matches + // because ephemeral indexing is single-level (IndexScope::Current) + assert!(cache.get_for_path(&child).is_none()); + + // Should find exact match + assert!(cache.get_for_path(&root).is_some()); + } + + #[test] + fn test_stale_detection() { + let cache = EphemeralIndexCache::with_ttl(Duration::from_millis(1)); + let path = PathBuf::from("/test/path"); + let index = Arc::new(TokioRwLock::new(EphemeralIndex::new(path.clone()))); + + cache.insert(path.clone(), index); + + // Wait for TTL to expire + std::thread::sleep(Duration::from_millis(10)); + + // Should be stale now + assert!(cache.get(&path).is_none()); + } +} diff --git a/core/src/ops/indexing/ephemeral/mod.rs b/core/src/ops/indexing/ephemeral/mod.rs new file mode 100644 index 000000000..94960ece2 --- /dev/null +++ b/core/src/ops/indexing/ephemeral/mod.rs @@ -0,0 +1,50 @@ +//! High-performance ephemeral index storage backend +//! +//! This module provides memory-efficient storage for ephemeral file indexes, +//! achieving 3-4x memory reduction compared to HashMap. +//! +//! ## Architecture +//! +//! ```text +//! EphemeralIndex +//! ├── NodeArena: Vec - Contiguous node storage +//! ├── NameCache: BTreeSet> - String interning pool +//! ├── NameRegistry: BTreeMap - Fast name lookups +//! └── path_index: HashMap - Path to node mapping +//! ``` +//! +//! ## Memory Comparison +//! +//! | Files | HashMap Approach | This Module | Reduction | +//! |-------|------------------|-------------|-----------| +//! | 10K | 2-3 MB | 0.5 MB | 4-6x | +//! | 100K | 20-30 MB | 5 MB | 4-6x | +//! | 1M | 200-300 MB | 50 MB | 4-6x | +//! +//! ## Usage +//! +//! ```rust,ignore +//! use sd_core::ops::indexing::ephemeral::EphemeralIndex; +//! +//! let mut index = EphemeralIndex::new("/path/to/root".into()); +//! +//! // Add entries +//! index.add_entry(path, uuid, metadata); +//! +//! // Query +//! let entry = index.get_entry(&path); +//! let children = index.list_directory(&parent); +//! ``` + +pub mod arena; +pub mod cache; +pub mod index_cache; +pub mod registry; +pub mod types; + +// Re-export public types +pub use arena::NodeArena; +pub use cache::NameCache; +pub use index_cache::EphemeralIndexCache; +pub use registry::NameRegistry; +pub use types::{EntryId, FileNode, FileType, MaybeEntryId, NameRef, NodeState, PackedMetadata}; diff --git a/core/src/ops/indexing/ephemeral/registry.rs b/core/src/ops/indexing/ephemeral/registry.rs new file mode 100644 index 000000000..ad70a2c07 --- /dev/null +++ b/core/src/ops/indexing/ephemeral/registry.rs @@ -0,0 +1,224 @@ +//! Name-based lookup registry for fast queries +//! +//! The NameRegistry provides O(log k) lookups by filename across the entire index. +//! This enables efficient queries like "find all files named 'package.json'". +//! +//! Features: +//! - Fast exact name lookup: O(log k) where k = unique filenames +//! - Prefix search for autocomplete +//! 
- Multiple entries per name (common for files like 'index.js', 'README.md')
+
+use super::types::EntryId;
+use std::collections::BTreeMap;
+
+/// Maps filenames to node IDs for fast name-based queries
+///
+/// Uses BTreeMap for ordered iteration and efficient prefix searches.
+/// Each name can map to multiple EntryIds (e.g., many 'index.js' files).
+pub struct NameRegistry {
+	/// Maps interned name pointers to entry IDs
+	/// Using *const str as key since we use interned strings from NameCache
+	map: BTreeMap<NameKey, Vec<EntryId>>,
+}
+
+/// Key type for the registry that wraps an interned string pointer
+#[derive(Clone, Copy)]
+struct NameKey(*const str);
+
+impl NameKey {
+	fn as_str(&self) -> &str {
+		// SAFETY: The pointer comes from NameCache and remains valid
+		unsafe { &*self.0 }
+	}
+}
+
+// PartialEq must agree with Ord for BTreeMap to behave correctly, so both
+// compare string content. A derived PartialEq would compare raw pointers
+// and could disagree with `cmp` for equal strings at different addresses.
+impl PartialEq for NameKey {
+	fn eq(&self, other: &Self) -> bool {
+		self.as_str() == other.as_str()
+	}
+}
+
+impl Eq for NameKey {}
+
+impl PartialOrd for NameKey {
+	fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+		Some(self.cmp(other))
+	}
+}
+
+impl Ord for NameKey {
+	fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+		self.as_str().cmp(other.as_str())
+	}
+}
+
+// SAFETY: NameKey contains a pointer to an interned string that lives
+// as long as the NameCache. Since NameCache is thread-safe and never
+// deallocates, NameKey is safe to use across threads.
+unsafe impl Send for NameKey {}
+unsafe impl Sync for NameKey {}
+
+impl NameRegistry {
+	/// Create a new empty registry
+	pub fn new() -> Self {
+		Self {
+			map: BTreeMap::new(),
+		}
+	}
+
+	/// Register a name-to-entry mapping
+	///
+	/// # Arguments
+	/// * `name` - An interned string reference from NameCache
+	/// * `id` - The EntryId to associate with this name
+	pub fn insert(&mut self, name: &str, id: EntryId) {
+		let key = NameKey(name as *const str);
+		self.map.entry(key).or_default().push(id);
+	}
+
+	/// Get all entries with the exact name
+	///
+	/// `NameKey`'s ordering compares string content rather than pointer
+	/// identity, so the O(log k) BTreeMap lookup works for any `&str`,
+	/// interned or not.
+	pub fn get(&self, name: &str) -> Option<&[EntryId]> {
+		let key = NameKey(name as *const str);
+		self.map.get(&key).map(|v| v.as_slice())
+	}
+
+	/// Get all entries with the exact name (using interned pointer)
+	///
+	/// Equivalent to [`Self::get`]; kept for callers that already hold an
+	/// interned string.
+	pub fn get_interned(&self, name: &str) -> Option<&[EntryId]> {
+		let key = NameKey(name as *const str);
+		self.map.get(&key).map(|v| v.as_slice())
+	}
+
+	/// Find all entries with names starting with the given prefix
+	///
+	/// Useful for autocomplete and directory listings
+	pub fn find_prefix(&self, prefix: &str) -> Vec<EntryId> {
+		self.map
+			.iter()
+			.filter(|(k, _)| k.as_str().starts_with(prefix))
+			.flat_map(|(_, ids)| ids.iter().copied())
+			.collect()
+	}
+
+	/// Find all entries with names containing the given substring
+	pub fn find_containing(&self, substring: &str) -> Vec<EntryId> {
+		self.map
+			.iter()
+			.filter(|(k, _)| k.as_str().contains(substring))
+			.flat_map(|(_, ids)| ids.iter().copied())
+			.collect()
+	}
+
+	/// Get the number of unique names
+	pub fn unique_names(&self) -> usize {
+		self.map.len()
+	}
+
+	/// Get the total number of entries
+	pub fn total_entries(&self) -> usize {
+		self.map.values().map(|v| v.len()).sum()
+	}
+
+	/// Check if a name exists in the registry
+	pub fn contains(&self, name: &str) -> bool {
+		self.get(name).is_some()
+	}
+
+	/// Get approximate memory usage in bytes
+	pub fn memory_usage(&self) -> usize {
+		std::mem::size_of::<Self>()
+			+ self.map.len() * std::mem::size_of::<(NameKey, Vec<EntryId>)>()
+			+ self
+				.map
+				.values()
+				.map(|v| v.capacity() * std::mem::size_of::<EntryId>())
+				.sum::<usize>()
+	}
+
+	/// 
Iterate over all (name, entry_ids) pairs + pub fn iter(&self) -> impl Iterator { + self.map.iter().map(|(k, v)| (k.as_str(), v.as_slice())) + } +} + +impl Default for NameRegistry { + fn default() -> Self { + Self::new() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_insert_and_get() { + let mut registry = NameRegistry::new(); + + let name = "test.txt"; + let id = EntryId::from_usize(42); + + registry.insert(name, id); + + let result = registry.get("test.txt"); + assert!(result.is_some()); + assert_eq!(result.unwrap(), &[id]); + } + + #[test] + fn test_multiple_entries_same_name() { + let mut registry = NameRegistry::new(); + + // Many projects have multiple index.js files + let name = "index.js"; + let ids: Vec = (0..5).map(|i| EntryId::from_usize(i)).collect(); + + for &id in &ids { + registry.insert(name, id); + } + + let result = registry.get("index.js").unwrap(); + assert_eq!(result.len(), 5); + } + + #[test] + fn test_find_prefix() { + let mut registry = NameRegistry::new(); + + registry.insert("README.md", EntryId::from_usize(1)); + registry.insert("README.txt", EntryId::from_usize(2)); + registry.insert("README", EntryId::from_usize(3)); + registry.insert("Rakefile", EntryId::from_usize(4)); + + let results = registry.find_prefix("README"); + assert_eq!(results.len(), 3); + } + + #[test] + fn test_find_containing() { + let mut registry = NameRegistry::new(); + + registry.insert("my_test.rs", EntryId::from_usize(1)); + registry.insert("test_utils.rs", EntryId::from_usize(2)); + registry.insert("integration_test.rs", EntryId::from_usize(3)); + registry.insert("main.rs", EntryId::from_usize(4)); + + let results = registry.find_containing("test"); + assert_eq!(results.len(), 3); + } + + #[test] + fn test_unique_names_vs_total() { + let mut registry = NameRegistry::new(); + + // 3 unique names, 6 total entries + registry.insert("a.txt", EntryId::from_usize(1)); + registry.insert("a.txt", EntryId::from_usize(2)); + registry.insert("b.txt", EntryId::from_usize(3)); + registry.insert("b.txt", EntryId::from_usize(4)); + registry.insert("c.txt", EntryId::from_usize(5)); + registry.insert("c.txt", EntryId::from_usize(6)); + + assert_eq!(registry.unique_names(), 3); + assert_eq!(registry.total_entries(), 6); + } +} diff --git a/core/src/ops/indexing/ephemeral/types.rs b/core/src/ops/indexing/ephemeral/types.rs new file mode 100644 index 000000000..8b0467233 --- /dev/null +++ b/core/src/ops/indexing/ephemeral/types.rs @@ -0,0 +1,470 @@ +//! Core types for efficient ephemeral index storage +//! +//! This module provides compact data structures for storing file system entries +//! with minimal memory overhead. Key optimizations: +//! - 32-bit node IDs (4 bytes vs 8 bytes for u64) +//! - Bit-packed metadata (16 bytes for state, type, size, mtime, ctime) +//! - String interning via NameRef pointers +//! +//! Memory per node: ~48 bytes vs ~200 bytes with HashMap + +use smallvec::SmallVec; +use std::time::{SystemTime, UNIX_EPOCH}; + +/// Identifies a node in the arena. Uses u32 to halve memory vs u64 +/// while supporting up to 4.3 billion nodes. 
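+///
+/// A short round-trip sketch (using the methods defined below):
+///
+/// ```rust,ignore
+/// let id = EntryId::from_usize(7);
+/// assert_eq!(id.as_usize(), 7);
+/// assert_eq!(std::mem::size_of::<EntryId>(), 4); // vs 8 for a u64 index
+/// ```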
+#[repr(transparent)]
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
+pub struct EntryId(u32);
+
+impl EntryId {
+	/// Create an EntryId from a usize index
+	///
+	/// # Panics
+	/// Panics if index >= u32::MAX - 1 (reserved for NONE sentinel)
+	pub fn from_usize(index: usize) -> Self {
+		assert!(
+			index < u32::MAX as usize - 1,
+			"EntryId overflow: index {} exceeds maximum",
+			index
+		);
+		Self(index as u32)
+	}
+
+	/// Get the underlying index as usize
+	pub fn as_usize(self) -> usize {
+		self.0 as usize
+	}
+
+	/// Get the raw u32 value
+	pub fn as_u32(self) -> u32 {
+		self.0
+	}
+}
+
+/// Optional EntryId using u32::MAX as the None sentinel
+///
+/// A plain `u32` has no spare niche, so `Option<EntryId>` is 8 bytes;
+/// the sentinel halves that to 4 bytes per optional reference.
+#[repr(transparent)]
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+pub struct MaybeEntryId(u32);
+
+impl MaybeEntryId {
+	/// The sentinel value representing None
+	pub const NONE: Self = Self(u32::MAX);
+
+	/// Create a Some variant
+	pub fn some(id: EntryId) -> Self {
+		debug_assert!(id.0 != u32::MAX, "EntryId cannot use reserved NONE value");
+		Self(id.0)
+	}
+
+	/// Convert to Option<EntryId>
+	pub fn as_option(self) -> Option<EntryId> {
+		if self.0 == u32::MAX {
+			None
+		} else {
+			Some(EntryId(self.0))
+		}
+	}
+
+	/// Check if this is None
+	pub fn is_none(self) -> bool {
+		self.0 == u32::MAX
+	}
+
+	/// Check if this is Some
+	pub fn is_some(self) -> bool {
+		self.0 != u32::MAX
+	}
+}
+
+impl Default for MaybeEntryId {
+	fn default() -> Self {
+		Self::NONE
+	}
+}
+
+impl From<Option<EntryId>> for MaybeEntryId {
+	fn from(opt: Option<EntryId>) -> Self {
+		match opt {
+			Some(id) => Self::some(id),
+			None => Self::NONE,
+		}
+	}
+}
+
+/// Node state indicating accessibility
+#[repr(u8)]
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Default)]
+pub enum NodeState {
+	#[default]
+	Unknown = 0,
+	Accessible = 1,
+	Inaccessible = 2,
+}
+
+impl NodeState {
+	pub fn from_u8(value: u8) -> Self {
+		match value {
+			0 => Self::Unknown,
+			1 => Self::Accessible,
+			2 => Self::Inaccessible,
+			_ => Self::Unknown,
+		}
+	}
+}
+
+/// File type classification
+#[repr(u8)]
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Default)]
+pub enum FileType {
+	#[default]
+	Unknown = 0,
+	File = 1,
+	Directory = 2,
+	Symlink = 3,
+}
+
+impl FileType {
+	pub fn from_u8(value: u8) -> Self {
+		match value {
+			0 => Self::Unknown,
+			1 => Self::File,
+			2 => Self::Directory,
+			3 => Self::Symlink,
+			_ => Self::Unknown,
+		}
+	}
+}
+
+/// Convert from state::EntryKind to FileType
+impl From<super::super::state::EntryKind> for FileType {
+	fn from(kind: super::super::state::EntryKind) -> Self {
+		match kind {
+			super::super::state::EntryKind::File => FileType::File,
+			super::super::state::EntryKind::Directory => FileType::Directory,
+			super::super::state::EntryKind::Symlink => FileType::Symlink,
+		}
+	}
+}
+
+/// Convert from FileType to state::EntryKind
+impl From<FileType> for super::super::state::EntryKind {
+	fn from(ft: FileType) -> Self {
+		match ft {
+			FileType::File => super::super::state::EntryKind::File,
+			FileType::Directory => super::super::state::EntryKind::Directory,
+			FileType::Symlink => super::super::state::EntryKind::Symlink,
+			FileType::Unknown => super::super::state::EntryKind::File, // Default to file
+		}
+	}
+}
+
+/// Compact metadata packed into 16 bytes
+///
+/// Layout:
+/// - Bits 62-63: state (2 bits)
+/// - Bits 60-61: type (2 bits)
+/// - Bits 0-59: size (60 bits, max ~1 exabyte)
+/// - mtime: seconds since epoch (32 bits)
+/// - ctime: seconds since epoch (32 bits)
+#[repr(C)]
+#[derive(Copy, Clone, Debug)]
+pub struct PackedMetadata {
+	/// Bits 62-63: state, 60-61: type, 0-59: size
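+	/// For example (illustrative): an accessible (state = 1) regular file
+	/// (type = 1) of 100 bytes packs as `(1 << 62) | (1 << 60) | 100`.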
+ state_type_size: u64, + /// Modified time (seconds since epoch, 0 = None) + mtime: u32, + /// Created time (seconds since epoch, 0 = None) + ctime: u32, +} + +impl PackedMetadata { + const SIZE_MASK: u64 = (1u64 << 60) - 1; + const TYPE_SHIFT: u32 = 60; + const STATE_SHIFT: u32 = 62; + + /// Create new packed metadata + pub fn new(state: NodeState, file_type: FileType, size: u64) -> Self { + // Clamp size to 60 bits (max ~1 exabyte) + let size = size.min(Self::SIZE_MASK); + let packed = + size | ((file_type as u64) << Self::TYPE_SHIFT) | ((state as u64) << Self::STATE_SHIFT); + + Self { + state_type_size: packed, + mtime: 0, + ctime: 0, + } + } + + /// Get the file size + pub fn size(&self) -> u64 { + self.state_type_size & Self::SIZE_MASK + } + + /// Get the file type + pub fn file_type(&self) -> FileType { + FileType::from_u8(((self.state_type_size >> Self::TYPE_SHIFT) & 0b11) as u8) + } + + /// Get the node state + pub fn state(&self) -> NodeState { + NodeState::from_u8(((self.state_type_size >> Self::STATE_SHIFT) & 0b11) as u8) + } + + /// Set timestamps + pub fn with_times(mut self, mtime: Option, ctime: Option) -> Self { + self.mtime = mtime + .and_then(|t| t.duration_since(UNIX_EPOCH).ok()) + .map(|d| d.as_secs() as u32) + .unwrap_or(0); + self.ctime = ctime + .and_then(|t| t.duration_since(UNIX_EPOCH).ok()) + .map(|d| d.as_secs() as u32) + .unwrap_or(0); + self + } + + /// Get modified time as SystemTime + pub fn mtime_as_system_time(&self) -> Option { + if self.mtime == 0 { + None + } else { + Some(UNIX_EPOCH + std::time::Duration::from_secs(self.mtime as u64)) + } + } + + /// Get created time as SystemTime + pub fn ctime_as_system_time(&self) -> Option { + if self.ctime == 0 { + None + } else { + Some(UNIX_EPOCH + std::time::Duration::from_secs(self.ctime as u64)) + } + } + + /// Get raw mtime value + pub fn mtime_secs(&self) -> u32 { + self.mtime + } + + /// Get raw ctime value + pub fn ctime_secs(&self) -> u32 { + self.ctime + } +} + +impl Default for PackedMetadata { + fn default() -> Self { + Self::new(NodeState::Unknown, FileType::Unknown, 0) + } +} + +/// Reference to an interned string with parent link +/// +/// Memory layout: 16 bytes total +/// - ptr: 8 bytes (pointer to string in NameCache) +/// - len: 4 bytes (string length) +/// - parent: 4 bytes (parent EntryId or NONE) +#[repr(C)] +pub struct NameRef { + /// Pointer to string in NameCache (stable reference) + ptr: *const u8, + /// String length + len: u32, + /// Parent node ID (u32::MAX if root) + parent: MaybeEntryId, +} + +// SAFETY: NameRef contains a raw pointer to an interned string that lives +// as long as the NameCache. The NameCache is owned by EphemeralIndex and +// never deallocates strings. This makes NameRef safe to send between threads. +unsafe impl Send for NameRef {} +unsafe impl Sync for NameRef {} + +impl NameRef { + /// Create a new NameRef from an interned string + /// + /// # Safety + /// The interned string must live as long as any NameRef referencing it. + /// This is guaranteed when used with NameCache. + pub fn new(interned: &str, parent: MaybeEntryId) -> Self { + Self { + ptr: interned.as_ptr(), + len: interned.len() as u32, + parent, + } + } + + /// Get the filename + /// + /// # Safety + /// Assumes the interned string is still valid. This is guaranteed + /// when NameCache is not dropped before NameRef. 
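+	///
+	/// A minimal sketch (assumes a `cache: NameCache` in scope):
+	///
+	/// ```rust,ignore
+	/// let name_ref = NameRef::new(cache.intern("a.txt"), MaybeEntryId::NONE);
+	/// assert_eq!(name_ref.name(), "a.txt");
+	/// ```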
+ pub fn name(&self) -> &str { + unsafe { + std::str::from_utf8_unchecked(std::slice::from_raw_parts(self.ptr, self.len as usize)) + } + } + + /// Get the parent entry ID + pub fn parent(&self) -> Option { + self.parent.as_option() + } +} + +impl std::fmt::Debug for NameRef { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("NameRef") + .field("name", &self.name()) + .field("parent", &self.parent.as_option()) + .finish() + } +} + +/// Single node in the file tree +/// +/// Memory: ~48 bytes total +/// - name_ref: 16 bytes +/// - children: 8-24 bytes (SmallVec with inline storage) +/// - meta: 16 bytes +pub struct FileNode { + /// Interned filename + parent reference + pub name_ref: NameRef, + /// Child node IDs (directories only) + /// SmallVec stores 0 elements inline (8 bytes), grows on heap when needed + pub children: SmallVec<[EntryId; 0]>, + /// Packed metadata + pub meta: PackedMetadata, +} + +impl FileNode { + /// Create a new file node + pub fn new(name_ref: NameRef, meta: PackedMetadata) -> Self { + Self { + name_ref, + children: SmallVec::new(), + meta, + } + } + + /// Get the filename + pub fn name(&self) -> &str { + self.name_ref.name() + } + + /// Get the parent entry ID + pub fn parent(&self) -> Option { + self.name_ref.parent() + } + + /// Check if this is a directory + pub fn is_directory(&self) -> bool { + self.meta.file_type() == FileType::Directory + } + + /// Add a child (for directories) - checks for duplicates + pub fn add_child(&mut self, child_id: EntryId) { + // Prevent duplicate children + if !self.children.contains(&child_id) { + self.children.push(child_id); + } + } +} + +impl std::fmt::Debug for FileNode { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("FileNode") + .field("name", &self.name()) + .field("type", &self.meta.file_type()) + .field("size", &self.meta.size()) + .field("children", &self.children.len()) + .finish() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_entry_id_roundtrip() { + let id = EntryId::from_usize(42); + assert_eq!(id.as_usize(), 42); + assert_eq!(id.as_u32(), 42); + } + + #[test] + fn test_maybe_entry_id() { + let none = MaybeEntryId::NONE; + assert!(none.is_none()); + assert!(!none.is_some()); + assert_eq!(none.as_option(), None); + + let some = MaybeEntryId::some(EntryId::from_usize(42)); + assert!(!some.is_none()); + assert!(some.is_some()); + assert_eq!(some.as_option(), Some(EntryId::from_usize(42))); + } + + #[test] + fn test_packed_metadata_size() { + // Verify struct size is 16 bytes + assert_eq!(std::mem::size_of::(), 16); + } + + #[test] + fn test_packed_metadata_roundtrip() { + let meta = PackedMetadata::new(NodeState::Accessible, FileType::File, 12345); + + assert_eq!(meta.state(), NodeState::Accessible); + assert_eq!(meta.file_type(), FileType::File); + assert_eq!(meta.size(), 12345); + } + + #[test] + fn test_packed_metadata_max_size() { + // Test that large sizes are clamped + let meta = PackedMetadata::new(NodeState::Accessible, FileType::File, u64::MAX); + + // Size should be clamped to 60-bit max + assert_eq!(meta.size(), (1u64 << 60) - 1); + assert_eq!(meta.file_type(), FileType::File); + } + + #[test] + fn test_packed_metadata_times() { + use std::time::Duration; + + let mtime = UNIX_EPOCH + Duration::from_secs(1700000000); + let ctime = UNIX_EPOCH + Duration::from_secs(1600000000); + + let meta = PackedMetadata::new(NodeState::Accessible, FileType::File, 1000) + .with_times(Some(mtime), Some(ctime)); + + 
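+		// Added illustration: converting back through SystemTime recovers the
+		// original second-precision timestamps.
+		assert_eq!(meta.mtime_as_system_time(), Some(mtime));
+		assert_eq!(meta.ctime_as_system_time(), Some(ctime));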
assert_eq!(meta.mtime_secs(), 1700000000); + assert_eq!(meta.ctime_secs(), 1600000000); + assert!(meta.mtime_as_system_time().is_some()); + assert!(meta.ctime_as_system_time().is_some()); + } + + #[test] + fn test_name_ref_size() { + // Verify NameRef is 16 bytes + assert_eq!(std::mem::size_of::(), 16); + } + + #[test] + fn test_file_type_conversion() { + use crate::ops::indexing::state::EntryKind; + + assert_eq!(FileType::from(EntryKind::File), FileType::File); + assert_eq!(FileType::from(EntryKind::Directory), FileType::Directory); + assert_eq!(FileType::from(EntryKind::Symlink), FileType::Symlink); + + assert_eq!(EntryKind::from(FileType::File), EntryKind::File); + assert_eq!(EntryKind::from(FileType::Directory), EntryKind::Directory); + assert_eq!(EntryKind::from(FileType::Symlink), EntryKind::Symlink); + } +} diff --git a/core/src/ops/indexing/job.rs b/core/src/ops/indexing/job.rs index 29ba0c6e2..d41da969e 100644 --- a/core/src/ops/indexing/job.rs +++ b/core/src/ops/indexing/job.rs @@ -128,7 +128,7 @@ impl IndexerJobConfig { Self { location_id: None, path, - mode: IndexMode::Content, // Enable content identification for ephemeral browsing + mode: IndexMode::Shallow, // Ephemeral jobs identify content kind by extension, no hashing needed scope, persistence: IndexPersistence::Ephemeral, max_depth: if scope == IndexScope::Current { @@ -152,33 +152,93 @@ impl IndexerJobConfig { } /// In-memory storage for ephemeral indexing results -#[derive(Debug)] +/// +/// This implementation uses efficient data structures for memory optimization: +/// - NodeArena: Contiguous storage for file nodes (~48 bytes per node) +/// - NameCache: String interning for common filenames +/// - NameRegistry: Fast name-based lookups +/// +/// Memory usage: ~50 bytes per entry vs ~200 bytes with HashMap pub struct EphemeralIndex { - pub entries: HashMap, - pub entry_uuids: HashMap, - pub content_identities: HashMap, - pub created_at: std::time::Instant, - pub last_accessed: std::time::Instant, + /// Efficient tree storage + arena: super::ephemeral::NodeArena, + + /// Root node + root: super::ephemeral::EntryId, + + /// String interning + cache: std::sync::Arc, + + /// Fast name lookups + registry: super::ephemeral::NameRegistry, + + /// Path → EntryId mapping (for lookups by path) + path_index: HashMap, + + /// UUID mapping (for API compatibility) + entry_uuids: HashMap, + + /// Content kinds by path (fast extension-based identification) + content_kinds: HashMap, + + /// Metadata + created_at: std::time::Instant, + last_accessed: std::time::Instant, pub root_path: PathBuf, pub stats: IndexerStats, } -/// Simplified content identity for ephemeral storage -#[derive(Debug, Clone)] -pub struct EphemeralContentIdentity { - pub cas_id: String, - pub mime_type: Option, - pub file_size: u64, - pub entry_count: u32, +impl std::fmt::Debug for EphemeralIndex { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("EphemeralIndex") + .field("root_path", &self.root_path) + .field("entry_count", &self.arena.len()) + .field("interned_names", &self.cache.len()) + .finish() + } } impl EphemeralIndex { pub fn new(root_path: PathBuf) -> Self { + use super::ephemeral::{ + FileNode, FileType, MaybeEntryId, NameCache, NameRef, NameRegistry, NodeArena, + NodeState, PackedMetadata, + }; + + let cache = std::sync::Arc::new(NameCache::new()); + let mut arena = NodeArena::new(); + let registry = NameRegistry::new(); + + // Create root node + let root_name = cache.intern( + root_path + .file_name() + .map(|s| 
s.to_string_lossy()) + .as_deref() + .unwrap_or("/"), + ); + + let root_node = FileNode::new( + NameRef::new(root_name, MaybeEntryId::NONE), + PackedMetadata::new(NodeState::Accessible, FileType::Directory, 0), + ); + + let root = arena.insert(root_node); + let now = std::time::Instant::now(); + + // Add root path to path_index so list_directory works for the root + let mut path_index = HashMap::new(); + path_index.insert(root_path.clone(), root); + Self { - entries: HashMap::new(), + arena, + root, + cache, + registry, + path_index, entry_uuids: HashMap::new(), - content_identities: HashMap::new(), + content_kinds: HashMap::new(), created_at: now, last_accessed: now, root_path, @@ -186,24 +246,193 @@ impl EphemeralIndex { } } - pub fn add_entry(&mut self, path: PathBuf, uuid: Uuid, metadata: EntryMetadata) { - self.entries.insert(path.clone(), metadata); - self.entry_uuids.insert(path, uuid); + /// Add an entry to the index. Returns Some(content_kind) if added, None if duplicate. + pub fn add_entry( + &mut self, + path: PathBuf, + uuid: Uuid, + metadata: EntryMetadata, + ) -> Option { + use super::ephemeral::{ + FileNode, FileType, MaybeEntryId, NameRef, NodeState, PackedMetadata, + }; + use crate::domain::ContentKind; + use crate::filetype::FileTypeRegistry; + + // Check if entry already exists for this path - skip if so to prevent duplicates + if self.path_index.contains_key(&path) { + tracing::trace!("Skipping duplicate entry: {}", path.display()); + return None; + } + + // Intern the filename + let name = self.cache.intern( + path.file_name() + .map(|s| s.to_string_lossy()) + .as_deref() + .unwrap_or("unknown"), + ); + + // Find parent + let parent_id = path + .parent() + .and_then(|p| self.path_index.get(p).copied()) + .unwrap_or(self.root); + + // Create metadata + let file_type = FileType::from(metadata.kind); + + let meta = PackedMetadata::new(NodeState::Accessible, file_type, metadata.size) + .with_times(metadata.modified, metadata.created); + + // Create node + let node = FileNode::new(NameRef::new(name, MaybeEntryId::some(parent_id)), meta); + + let id = self.arena.insert(node); + + // Add to parent's children + if let Some(parent) = self.arena.get_mut(parent_id) { + parent.add_child(id); + } + + // Detect content kind by extension (fast, no I/O) + let content_kind = if metadata.kind == super::state::EntryKind::File { + let registry = FileTypeRegistry::default(); + registry.identify_by_extension(&path) + } else if metadata.kind == super::state::EntryKind::Directory { + ContentKind::Unknown // Directories don't have content kind + } else { + ContentKind::Unknown + }; + + // Index by path and name + self.path_index.insert(path.clone(), id); + self.registry.insert(name, id); + self.entry_uuids.insert(path.clone(), uuid); + self.content_kinds.insert(path, content_kind); + self.last_accessed = std::time::Instant::now(); + Some(content_kind) } - pub fn get_entry(&mut self, path: &PathBuf) -> Option<&EntryMetadata> { + pub fn get_entry(&mut self, path: &PathBuf) -> Option { + use super::state::EntryKind; + + let id = self.path_index.get(path)?; + let node = self.arena.get(*id)?; + self.last_accessed = std::time::Instant::now(); - self.entries.get(path) + + Some(EntryMetadata { + path: path.clone(), + kind: EntryKind::from(node.meta.file_type()), + size: node.meta.size(), + modified: node.meta.mtime_as_system_time(), + accessed: None, + created: node.meta.ctime_as_system_time(), + inode: None, + permissions: None, + is_hidden: path + .file_name() + .and_then(|n| n.to_str()) + .map(|n| 
n.starts_with('.')) + .unwrap_or(false), + }) + } + + /// Get entry reference for read-only access (doesn't update last_accessed) + pub fn get_entry_ref(&self, path: &PathBuf) -> Option { + use super::state::EntryKind; + + let id = self.path_index.get(path)?; + let node = self.arena.get(*id)?; + + Some(EntryMetadata { + path: path.clone(), + kind: EntryKind::from(node.meta.file_type()), + size: node.meta.size(), + modified: node.meta.mtime_as_system_time(), + accessed: None, + created: node.meta.ctime_as_system_time(), + inode: None, + permissions: None, + is_hidden: path + .file_name() + .and_then(|n| n.to_str()) + .map(|n| n.starts_with('.')) + .unwrap_or(false), + }) } pub fn get_entry_uuid(&self, path: &PathBuf) -> Option { self.entry_uuids.get(path).copied() } - pub fn add_content_identity(&mut self, cas_id: String, content: EphemeralContentIdentity) { - self.content_identities.insert(cas_id, content); - self.last_accessed = std::time::Instant::now(); + /// Get the content kind for an entry (identified by extension) + pub fn get_content_kind(&self, path: &PathBuf) -> crate::domain::ContentKind { + self.content_kinds + .get(path) + .copied() + .unwrap_or(crate::domain::ContentKind::Unknown) + } + + /// List directory children + pub fn list_directory(&self, path: &std::path::Path) -> Option> { + let id = self.path_index.get(path)?; + let node = self.arena.get(*id)?; + + Some( + node.children + .iter() + .filter_map(|&child_id| self.reconstruct_path(child_id)) + .collect(), + ) + } + + /// Reconstruct full path for a node + fn reconstruct_path(&self, id: super::ephemeral::EntryId) -> Option { + let mut segments = Vec::new(); + let mut current = id; + + while let Some(node) = self.arena.get(current) { + if let Some(parent) = node.parent() { + segments.push(node.name().to_owned()); + current = parent; + } else { + break; + } + } + + if segments.is_empty() { + return Some(self.root_path.clone()); + } + + let mut path = self.root_path.clone(); + for segment in segments.into_iter().rev() { + path.push(segment); + } + Some(path) + } + + /// Find all entries with the given filename + pub fn find_by_name(&self, name: &str) -> Vec { + self.registry + .get(name) + .map(|ids| { + ids.iter() + .filter_map(|&id| self.reconstruct_path(id)) + .collect() + }) + .unwrap_or_default() + } + + /// Find all entries with names starting with the given prefix + pub fn find_by_prefix(&self, prefix: &str) -> Vec { + self.registry + .find_prefix(prefix) + .iter() + .filter_map(|&id| self.reconstruct_path(id)) + .collect() } pub fn age(&self) -> Duration { @@ -213,6 +442,90 @@ impl EphemeralIndex { pub fn idle_time(&self) -> Duration { self.last_accessed.elapsed() } + + /// Get the total number of entries + pub fn len(&self) -> usize { + self.arena.len() + } + + /// Check if the index is empty + pub fn is_empty(&self) -> bool { + self.arena.is_empty() + } + + /// Get approximate memory usage in bytes + pub fn memory_usage(&self) -> usize { + self.arena.memory_usage() + + self.cache.memory_usage() + + self.registry.memory_usage() + + self.path_index.capacity() + * (std::mem::size_of::() + + std::mem::size_of::()) + + self.entry_uuids.capacity() + * (std::mem::size_of::() + std::mem::size_of::()) + } + + /// Get statistics about the index + pub fn get_stats(&self) -> EphemeralIndexStats { + EphemeralIndexStats { + total_entries: self.arena.len(), + unique_names: self.registry.unique_names(), + interned_strings: self.cache.len(), + memory_bytes: self.memory_usage(), + } + } + + /// Get the number of content kinds 
stored + pub fn content_kinds_count(&self) -> usize { + self.content_kinds.len() + } + + /// Get the number of entries in the path index + pub fn path_index_count(&self) -> usize { + self.path_index.len() + } + + /// Get all entries as a HashMap (for backward compatibility) + /// + /// This method reconstructs paths for all entries. For large indexes, + /// consider using iterators or specific queries instead. + pub fn entries(&self) -> HashMap { + use super::state::EntryKind; + + let mut result = HashMap::with_capacity(self.path_index.len()); + + for (path, &id) in &self.path_index { + if let Some(node) = self.arena.get(id) { + let metadata = EntryMetadata { + path: path.clone(), + kind: EntryKind::from(node.meta.file_type()), + size: node.meta.size(), + modified: node.meta.mtime_as_system_time(), + accessed: None, + created: node.meta.ctime_as_system_time(), + inode: None, + permissions: None, + is_hidden: path + .file_name() + .and_then(|n| n.to_str()) + .map(|n| n.starts_with('.')) + .unwrap_or(false), + }; + result.insert(path.clone(), metadata); + } + } + + result + } +} + +/// Statistics about an ephemeral index +#[derive(Debug, Clone)] +pub struct EphemeralIndexStats { + pub total_entries: usize, + pub unique_names: usize, + pub interned_strings: usize, + pub memory_bytes: usize, } /// Indexer job - discovers and indexes files in a location @@ -395,11 +708,12 @@ impl JobHandler for IndexerJob { match current_phase { Phase::Discovery => { // For cloud volumes, construct the base URL for building absolute paths - let cloud_url_base = if let Some((service, identifier, _)) = self.config.path.as_cloud() { - Some(format!("{}://{}/", service.scheme(), identifier)) - } else { - None - }; + let cloud_url_base = + if let Some((service, identifier, _)) = self.config.path.as_cloud() { + Some(format!("{}://{}/", service.scheme(), identifier)) + } else { + None + }; // Use scope-aware discovery if self.config.is_current_scope() { @@ -468,9 +782,10 @@ impl JobHandler for IndexerJob { ) .await?; } else { - // Skip aggregation for ephemeral jobs - ctx.log("Skipping aggregation phase for ephemeral job"); - state.phase = Phase::ContentIdentification; + // Skip aggregation and content phases for ephemeral jobs + // Content kind is already identified by extension during add_entry + ctx.log("Skipping aggregation and content phases for ephemeral job (content kind identified by extension)"); + state.phase = Phase::Complete; continue; } @@ -483,14 +798,10 @@ impl JobHandler for IndexerJob { Phase::ContentIdentification => { if self.config.mode >= IndexMode::Content { if self.config.is_ephemeral() { - let ephemeral_index = - self.ephemeral_index.clone().ok_or_else(|| { - JobError::execution( - "Ephemeral index not initialized".to_string(), - ) - })?; - Self::run_ephemeral_content_phase_static(state, &ctx, ephemeral_index) - .await?; + // Skip content phase for ephemeral jobs - content kind already identified + ctx.log("Skipping content identification for ephemeral job"); + state.phase = Phase::Complete; + continue; } else { let library_id = ctx.library().id(); phases::run_content_phase( @@ -566,6 +877,21 @@ impl JobHandler for IndexerJob { } } + // Mark ephemeral indexing as complete in the cache + if self.config.is_ephemeral() { + if let Some(ephemeral_index) = &self.ephemeral_index { + let root_path = ephemeral_index.read().await.root_path.clone(); + ctx.library() + .core_context() + .ephemeral_cache() + .mark_indexing_complete(&root_path); + ctx.log(format!( + "Marked ephemeral indexing complete for: 
{}", + root_path.display() + )); + } + } + // Generate final output Ok(IndexerOutput { location_id: self.config.location_id, @@ -773,130 +1099,17 @@ impl IndexerJob { while let Some(batch) = state.entry_batches.pop() { for entry in batch { // Store entry (this will emit ResourceChanged events) - let entry_id = persistence.store_entry(&entry, None, &root_path).await?; - - // Queue files for content identification - if entry.kind == super::state::EntryKind::File && entry.size > 0 { - state - .entries_for_content - .push((entry_id, entry.path.clone())); - } + // Content kind is identified by extension during add_entry, no hashing needed + let _entry_id = persistence.store_entry(&entry, None, &root_path).await?; } } - state.phase = Phase::ContentIdentification; + // Skip content identification for ephemeral jobs - go directly to complete + state.phase = Phase::Complete; ctx.log("Ephemeral processing complete"); Ok(()) } - - /// Run ephemeral content identification - async fn run_ephemeral_content_phase_static( - state: &mut IndexerState, - ctx: &JobContext<'_>, - ephemeral_index: Arc>, - ) -> JobResult<()> { - use crate::domain::content_identity::ContentHashGenerator; - use crate::ops::indexing::persistence::PersistenceFactory; - - ctx.log(format!( - "Starting ephemeral content identification for {} files", - state.entries_for_content.len() - )); - - if state.entries_for_content.is_empty() { - state.phase = Phase::Complete; - return Ok(()); - } - - // Get root path and event bus - let (root_path, event_bus) = { - let index = ephemeral_index.read().await; - ( - index.root_path.clone(), - Some(ctx.library().event_bus().clone()), - ) - }; - - // Create ephemeral persistence for event emission - let persistence = - PersistenceFactory::ephemeral(ephemeral_index.clone(), event_bus, root_path); - - // Process files for content identification - let mut success_count = 0; - let mut error_count = 0; - - // Process in chunks to emit progress - const CHUNK_SIZE: usize = 50; - let total = state.entries_for_content.len(); - - while !state.entries_for_content.is_empty() { - ctx.check_interrupt().await?; - - let chunk_size = CHUNK_SIZE.min(state.entries_for_content.len()); - let chunk: Vec<_> = state.entries_for_content.drain(..chunk_size).collect(); - - // Process chunk in parallel - let hash_futures: Vec<_> = chunk - .iter() - .map(|(entry_id, path)| async move { - let hash_result = ContentHashGenerator::generate_content_hash(path).await; - (*entry_id, path.clone(), hash_result) - }) - .collect(); - - let results = futures::future::join_all(hash_futures).await; - - // Store results and emit events - for (entry_id, path, hash_result) in results { - match hash_result { - Ok(cas_id) => { - // Store via persistence (this emits ResourceChanged event with content_identity) - if let Err(e) = persistence - .store_content_identity(entry_id, &path, cas_id.clone()) - .await - { - ctx.add_non_critical_error(format!( - "Failed to store content identity for {}: {}", - path.display(), - e - )); - error_count += 1; - } else { - success_count += 1; - } - } - Err(e) => { - // Skip empty files or errors - if !matches!(e, crate::domain::ContentHashError::EmptyFile) { - ctx.add_non_critical_error(format!( - "Failed to hash {}: {}", - path.display(), - e - )); - error_count += 1; - } - } - } - } - - ctx.log(format!( - "Content identification progress: {}/{} (success: {}, errors: {})", - total - state.entries_for_content.len(), - total, - success_count, - error_count - )); - } - - state.phase = Phase::Complete; - ctx.log(format!( 
- "Ephemeral content identification complete: {} files processed, {} errors", - success_count, error_count - )); - - Ok(()) - } } /// Job output with comprehensive results diff --git a/core/src/ops/indexing/mod.rs b/core/src/ops/indexing/mod.rs index 336176173..f8f87face 100644 --- a/core/src/ops/indexing/mod.rs +++ b/core/src/ops/indexing/mod.rs @@ -12,6 +12,7 @@ pub mod action; pub mod change_detection; pub mod ctx; pub mod entry; +pub mod ephemeral; pub mod hierarchy; pub mod input; pub mod job; @@ -30,10 +31,11 @@ pub mod verify; pub use action::IndexingAction; pub use ctx::{IndexingCtx, ResponderCtx}; pub use entry::{EntryMetadata, EntryProcessor}; +pub use ephemeral::EphemeralIndexCache; pub use hierarchy::HierarchyQuery; pub use input::IndexInput; pub use job::{ - EphemeralContentIdentity, EphemeralIndex, IndexMode, IndexPersistence, IndexScope, IndexerJob, + EphemeralIndex, EphemeralIndexStats, IndexMode, IndexPersistence, IndexScope, IndexerJob, IndexerJobConfig, IndexerOutput, }; pub use metrics::IndexerMetrics; diff --git a/core/src/ops/indexing/persistence.rs b/core/src/ops/indexing/persistence.rs index a9bac5be4..cc89b6bdf 100644 --- a/core/src/ops/indexing/persistence.rs +++ b/core/src/ops/indexing/persistence.rs @@ -23,7 +23,7 @@ use tokio::sync::RwLock; use uuid::Uuid; use super::{ - job::{EphemeralContentIdentity, EphemeralIndex}, + job::EphemeralIndex, state::{DirEntry, EntryKind}, PathResolver, }; @@ -478,18 +478,27 @@ impl IndexPersistence for EphemeralPersistence { let entry_uuid = Uuid::new_v4(); // Store in ephemeral index with UUID - { + // add_entry returns Some(content_kind) if added, None if duplicate + let content_kind = { let mut index = self.index.write().await; - index.add_entry(entry.path.clone(), entry_uuid, metadata.clone()); + let result = index.add_entry(entry.path.clone(), entry_uuid, metadata.clone()); - // Update stats - match entry.kind { - EntryKind::File => index.stats.files += 1, - EntryKind::Directory => index.stats.dirs += 1, - EntryKind::Symlink => index.stats.symlinks += 1, + // Only update stats if the entry was actually added (not a duplicate) + if result.is_some() { + match entry.kind { + EntryKind::File => index.stats.files += 1, + EntryKind::Directory => index.stats.dirs += 1, + EntryKind::Symlink => index.stats.symlinks += 1, + } + index.stats.bytes += entry.size; } - index.stats.bytes += entry.size; - } + result + }; + + // Only emit event if entry was actually added + let Some(content_kind) = content_kind else { + return Ok(entry_id); + }; // Emit ResourceChanged event for UI if let Some(event_bus) = &self.event_bus { @@ -507,7 +516,8 @@ impl IndexPersistence for EphemeralPersistence { }; // Build File domain object from ephemeral data - let file = File::from_ephemeral(entry_uuid, &metadata, sd_path); + let mut file = File::from_ephemeral(entry_uuid, &metadata, sd_path); + file.content_kind = content_kind; // Emit event with path metadata for filtering let parent_path = entry.path.parent().map(|p| SdPath::Physical { @@ -539,106 +549,11 @@ impl IndexPersistence for EphemeralPersistence { async fn store_content_identity( &self, - entry_id: i32, - path: &Path, - cas_id: String, + _entry_id: i32, + _path: &Path, + _cas_id: String, ) -> JobResult<()> { - // Get file size - let file_size = tokio::fs::symlink_metadata(path) - .await - .map(|m| m.len()) - .unwrap_or(0); - - // Detect file type using the file type registry - let registry = FileTypeRegistry::default(); - let (mime_type, content_kind) = if let Ok(result) = 
registry.identify(path).await { - ( - result.file_type.primary_mime_type().map(|s| s.to_string()), - result.file_type.category, - ) - } else { - (None, crate::domain::ContentKind::Unknown) - }; - - let content_identity = EphemeralContentIdentity { - cas_id: cas_id.clone(), - mime_type: mime_type.clone(), - file_size, - entry_count: 1, - }; - - // Store in ephemeral index - { - let mut index = self.index.write().await; - index.add_content_identity(cas_id.clone(), content_identity); - } - - // Emit ResourceChanged event with updated content_identity - if let Some(event_bus) = &self.event_bus { - use crate::device::get_current_device_slug; - use crate::domain::addressing::SdPath; - use crate::domain::content_identity::ContentIdentity; - use crate::domain::file::File; - use crate::infra::event::{Event, ResourceMetadata}; - - // Get the stored metadata and UUID for this entry - let (metadata_opt, entry_uuid_opt) = { - let index = self.index.read().await; - (index.entries.get(path).cloned(), index.get_entry_uuid(&path.to_path_buf())) - }; - - if let (Some(metadata), Some(entry_uuid)) = (metadata_opt, entry_uuid_opt) { - // Build SdPath - let device_slug = get_current_device_slug(); - let sd_path = SdPath::Physical { - device_slug: device_slug.clone(), - path: path.to_path_buf(), - }; - - // Build File with content_identity - let mut file = File::from_ephemeral(entry_uuid, &metadata, sd_path); - - // Add content identity - file.content_identity = Some(ContentIdentity { - uuid: uuid::Uuid::new_v4(), - kind: content_kind, - content_hash: cas_id.clone(), - integrity_hash: None, - mime_type_id: None, - text_content: None, - total_size: file_size as i64, - entry_count: 1, - first_seen_at: chrono::Utc::now(), - last_verified_at: chrono::Utc::now(), - }); - file.content_kind = content_kind; - - // Emit event with updated file - let parent_path = path.parent().map(|p| SdPath::Physical { - device_slug, - path: p.to_path_buf(), - }); - - let affected_paths = if let Some(parent) = parent_path { - vec![parent] - } else { - vec![] - }; - - if let Ok(resource_json) = serde_json::to_value(&file) { - event_bus.emit(Event::ResourceChanged { - resource_type: "file".to_string(), - resource: resource_json, - metadata: Some(ResourceMetadata { - no_merge_fields: vec!["sd_path".to_string()], - alternate_ids: vec![], - affected_paths, - }), - }); - } - } - } - + // Ephemeral indexes do not store content identities Ok(()) } @@ -763,8 +678,12 @@ mod tests { ); // Extract UUIDs from both events - let uuid1 = events[0]["id"].as_str().expect("First event should have UUID"); - let uuid2 = events[1]["id"].as_str().expect("Second event should have UUID"); + let uuid1 = events[0]["id"] + .as_str() + .expect("First event should have UUID"); + let uuid2 = events[1]["id"] + .as_str() + .expect("Second event should have UUID"); // CRITICAL: Both events must have the same UUID for the same file assert_eq!( diff --git a/core/src/ops/indexing/phases/discovery.rs b/core/src/ops/indexing/phases/discovery.rs index b1d2fb08a..c9cf09a57 100644 --- a/core/src/ops/indexing/phases/discovery.rs +++ b/core/src/ops/indexing/phases/discovery.rs @@ -58,8 +58,15 @@ pub async fn run_discovery_phase( state.dirs_to_walk.len() )); - run_parallel_discovery(state, ctx, root_path, rule_toggles, volume_backend, cloud_url_base) - .await + run_parallel_discovery( + state, + ctx, + root_path, + rule_toggles, + volume_backend, + cloud_url_base, + ) + .await } /// Parallel discovery implementation using Rayon-style work-stealing @@ -244,7 +251,9 @@ enum 
DiscoveryResult { bytes: u64, }, Error(IndexError), - Progress { dirs_queued: usize }, + Progress { + dirs_queued: usize, + }, } /// Rayon-style worker: processes directories and directly enqueues new work @@ -291,7 +300,12 @@ async fn discovery_worker_rayon( let dir_ruler = build_default_ruler(rule_toggles, &root_path, &dir_path).await; // Read directory - match read_directory(&dir_path, volume_backend.as_ref(), cloud_url_base.as_deref()).await + match read_directory( + &dir_path, + volume_backend.as_ref(), + cloud_url_base.as_deref(), + ) + .await { Ok(entries) => { let mut local_stats = LocalStats::default(); diff --git a/core/src/ops/indexing/verify/action.rs b/core/src/ops/indexing/verify/action.rs index 667bf0291..d9418d920 100644 --- a/core/src/ops/indexing/verify/action.rs +++ b/core/src/ops/indexing/verify/action.rs @@ -183,7 +183,7 @@ impl IndexVerifyAction { // Extract the results from our shared ephemeral index let entries = { let index = ephemeral_index.read().await; - index.entries.clone() + index.entries() }; tracing::debug!( diff --git a/core/src/ops/libraries/open/action.rs b/core/src/ops/libraries/open/action.rs index 73c060abe..d61783631 100644 --- a/core/src/ops/libraries/open/action.rs +++ b/core/src/ops/libraries/open/action.rs @@ -79,7 +79,10 @@ impl CoreAction for LibraryOpenAction { "library.open" } - async fn validate(&self, _context: Arc) -> Result { + async fn validate( + &self, + _context: Arc, + ) -> Result { // Check if the path exists if !self.input.path.exists() { return Err(ActionError::Validation { diff --git a/core/src/ops/locations/enable_indexing/action.rs b/core/src/ops/locations/enable_indexing/action.rs index 7b31fe420..3d7ac43df 100644 --- a/core/src/ops/locations/enable_indexing/action.rs +++ b/core/src/ops/locations/enable_indexing/action.rs @@ -66,12 +66,15 @@ impl LibraryAction for EnableIndexingAction { .ok_or_else(|| ActionError::LocationNotFound(self.input.id))?; // Parse the index mode - let index_mode: IndexMode = self.input.index_mode.as_str().parse().map_err(|e| { - ActionError::Validation { - field: "index_mode".to_string(), - message: format!("Invalid index mode: {}", e), - } - })?; + let index_mode: IndexMode = + self.input + .index_mode + .as_str() + .parse() + .map_err(|e| ActionError::Validation { + field: "index_mode".to_string(), + message: format!("Invalid index mode: {}", e), + })?; // Don't allow setting to None if index_mode == IndexMode::None { diff --git a/core/src/ops/locations/enable_indexing/output.rs b/core/src/ops/locations/enable_indexing/output.rs index a8c456922..eb8a6cb00 100644 --- a/core/src/ops/locations/enable_indexing/output.rs +++ b/core/src/ops/locations/enable_indexing/output.rs @@ -13,6 +13,9 @@ pub struct EnableIndexingOutput { impl EnableIndexingOutput { pub fn new(location_id: Uuid, job_id: String) -> Self { - Self { location_id, job_id } + Self { + location_id, + job_id, + } } } diff --git a/core/src/ops/media/thumbnail/job.rs b/core/src/ops/media/thumbnail/job.rs index df50d6d69..be1a825fe 100644 --- a/core/src/ops/media/thumbnail/job.rs +++ b/core/src/ops/media/thumbnail/job.rs @@ -595,61 +595,72 @@ impl ThumbnailJob { let is_cloud = Self::is_cloud_path(&entry.relative_path); // For cloud files, download to temp file. 
For local files, use direct path - let (source_path, temp_file): (std::path::PathBuf, Option) = if is_cloud { - // Cloud path - need to download via volume backend - let volume_manager = ctx.volume_manager() - .ok_or_else(|| ThumbnailError::other("VolumeManager not available for cloud file"))?; + let (source_path, temp_file): (std::path::PathBuf, Option) = + if is_cloud { + // Cloud path - need to download via volume backend + let volume_manager = ctx.volume_manager().ok_or_else(|| { + ThumbnailError::other("VolumeManager not available for cloud file") + })?; - // Parse the cloud path to get an SdPath - use crate::domain::addressing::SdPath; - let sdpath = SdPath::from_uri_with_context(&entry.relative_path, &library.core_context()) - .await - .map_err(|e| ThumbnailError::other(format!("Failed to parse cloud path: {}", e)))?; + // Parse the cloud path to get an SdPath + use crate::domain::addressing::SdPath; + let sdpath = + SdPath::from_uri_with_context(&entry.relative_path, &library.core_context()) + .await + .map_err(|e| { + ThumbnailError::other(format!("Failed to parse cloud path: {}", e)) + })?; - // Resolve the volume backend for this path - let volume = volume_manager - .resolve_volume_for_sdpath(&sdpath, &library) - .await - .map_err(|e| ThumbnailError::other(format!("Failed to resolve volume: {}", e)))? - .ok_or_else(|| ThumbnailError::other("No volume found for cloud path"))?; + // Resolve the volume backend for this path + let volume = volume_manager + .resolve_volume_for_sdpath(&sdpath, &library) + .await + .map_err(|e| ThumbnailError::other(format!("Failed to resolve volume: {}", e)))? + .ok_or_else(|| ThumbnailError::other("No volume found for cloud path"))?; - let backend = volume.backend - .as_ref() - .ok_or_else(|| ThumbnailError::other("Volume has no backend"))?; + let backend = volume + .backend + .as_ref() + .ok_or_else(|| ThumbnailError::other("Volume has no backend"))?; - // Get the backend-relative path (strip s3://bucket/ prefix) - let backend_path = Self::to_backend_path(&entry.relative_path); + // Get the backend-relative path (strip s3://bucket/ prefix) + let backend_path = Self::to_backend_path(&entry.relative_path); - // Download file content from cloud - let file_data = backend - .read(&backend_path) - .await - .map_err(|e| ThumbnailError::other(format!("Failed to read cloud file: {}", e)))?; + // Download file content from cloud + let file_data = backend.read(&backend_path).await.map_err(|e| { + ThumbnailError::other(format!("Failed to read cloud file: {}", e)) + })?; - // Write to temporary file - let mut temp = tempfile::NamedTempFile::new() - .map_err(|e| ThumbnailError::other(format!("Failed to create temp file: {}", e)))?; + // Write to temporary file + let mut temp = tempfile::NamedTempFile::new().map_err(|e| { + ThumbnailError::other(format!("Failed to create temp file: {}", e)) + })?; - use std::io::Write; - temp.write_all(&file_data) - .map_err(|e| ThumbnailError::other(format!("Failed to write temp file: {}", e)))?; - temp.flush() - .map_err(|e| ThumbnailError::other(format!("Failed to flush temp file: {}", e)))?; + use std::io::Write; + temp.write_all(&file_data).map_err(|e| { + ThumbnailError::other(format!("Failed to write temp file: {}", e)) + })?; + temp.flush().map_err(|e| { + ThumbnailError::other(format!("Failed to flush temp file: {}", e)) + })?; - let temp_path = temp.path().to_path_buf(); - ctx.log(format!("Downloaded cloud file {} to temp location", entry.relative_path)); + let temp_path = temp.path().to_path_buf(); + ctx.log(format!( + 
"Downloaded cloud file {} to temp location", + entry.relative_path + )); - (temp_path, Some(temp)) - } else { - // Local path - use direct filesystem access - let source_path = library.path().join(&entry.relative_path); + (temp_path, Some(temp)) + } else { + // Local path - use direct filesystem access + let source_path = library.path().join(&entry.relative_path); - if !source_path.exists() { - return Err(ThumbnailError::FileNotFound(entry.relative_path.clone())); - } + if !source_path.exists() { + return Err(ThumbnailError::FileNotFound(entry.relative_path.clone())); + } - (source_path, None) - }; + (source_path, None) + }; let mut total_thumbnail_size = 0u64; diff --git a/core/src/ops/network/sync_setup/action.rs b/core/src/ops/network/sync_setup/action.rs index a6c54453f..157e1930d 100644 --- a/core/src/ops/network/sync_setup/action.rs +++ b/core/src/ops/network/sync_setup/action.rs @@ -95,7 +95,10 @@ impl CoreAction for LibrarySyncSetupAction { } // DEPRICATED: Sync no longer requires a leader device - async fn validate(&self, context: Arc) -> Result { + async fn validate( + &self, + context: Arc, + ) -> Result { // Validate leader device is one of the two devices if self.input.leader_device_id != self.input.local_device_id && self.input.leader_device_id != self.input.remote_device_id diff --git a/core/src/service/file_sharing.rs b/core/src/service/file_sharing.rs index 49ade8cdd..33acfed08 100644 --- a/core/src/service/file_sharing.rs +++ b/core/src/service/file_sharing.rs @@ -519,11 +519,8 @@ mod tests { ); let events = Arc::new(EventBus::default()); - let device_manager = Arc::new(DeviceManager::init( - temp_dir.path(), - key_manager.clone(), - None, - ).unwrap()); + let device_manager = + Arc::new(DeviceManager::init(temp_dir.path(), key_manager.clone(), None).unwrap()); let volume_manager = Arc::new(crate::volume::VolumeManager::new( uuid::Uuid::new_v4(), // Test device ID crate::volume::VolumeDetectionConfig::default(), @@ -570,11 +567,8 @@ mod tests { ); let events = Arc::new(EventBus::default()); - let device_manager = Arc::new(DeviceManager::init( - temp_dir.path(), - key_manager.clone(), - None, - ).unwrap()); + let device_manager = + Arc::new(DeviceManager::init(temp_dir.path(), key_manager.clone(), None).unwrap()); let volume_manager = Arc::new(crate::volume::VolumeManager::new( uuid::Uuid::new_v4(), // Test device ID crate::volume::VolumeDetectionConfig::default(), diff --git a/core/src/service/network/device/persistence.rs b/core/src/service/network/device/persistence.rs index fee28a25c..98a96f7c2 100644 --- a/core/src/service/network/device/persistence.rs +++ b/core/src/service/network/device/persistence.rs @@ -73,8 +73,7 @@ impl DevicePersistence { /// Save list of paired device IDs async fn save_device_list(&self, device_ids: &[Uuid]) -> Result<()> { - let data = - serde_json::to_vec(device_ids).map_err(|e| NetworkingError::Serialization(e))?; + let data = serde_json::to_vec(device_ids).map_err(|e| NetworkingError::Serialization(e))?; self.key_manager .set_secret(Self::DEVICE_LIST_KEY, &data) .await @@ -92,8 +91,7 @@ impl DevicePersistence { for (device_id, device) in devices { let key = Self::device_key(*device_id); - let data = - serde_json::to_vec(device).map_err(|e| NetworkingError::Serialization(e))?; + let data = serde_json::to_vec(device).map_err(|e| NetworkingError::Serialization(e))?; self.key_manager .set_secret(&key, &data) .await @@ -114,18 +112,16 @@ impl DevicePersistence { for device_id in device_ids { let key = Self::device_key(device_id); match 
self.key_manager.get_secret(&key).await { - Ok(data) => { - match serde_json::from_slice::(&data) { - Ok(device) => { - if !device.session_keys.is_expired() { - devices.insert(device_id, device); - } - } - Err(e) => { - eprintln!("Failed to deserialize device {}: {}", device_id, e); + Ok(data) => match serde_json::from_slice::(&data) { + Ok(device) => { + if !device.session_keys.is_expired() { + devices.insert(device_id, device); } } - } + Err(e) => { + eprintln!("Failed to deserialize device {}: {}", device_id, e); + } + }, Err(e) => { eprintln!("Failed to load device {}: {}", device_id, e); } @@ -279,7 +275,9 @@ impl DevicePersistence { self.key_manager .delete_secret(Self::DEVICE_LIST_KEY) .await - .map_err(|e| NetworkingError::Protocol(format!("Failed to clear device list: {}", e)))?; + .map_err(|e| { + NetworkingError::Protocol(format!("Failed to clear device list: {}", e)) + })?; Ok(()) } @@ -413,5 +411,4 @@ mod tests { session_keys.shared_secret ); } - } diff --git a/core/src/service/network/protocol/sync/handler.rs b/core/src/service/network/protocol/sync/handler.rs index 8095d08ab..e842f0790 100644 --- a/core/src/service/network/protocol/sync/handler.rs +++ b/core/src/service/network/protocol/sync/handler.rs @@ -497,10 +497,19 @@ impl SyncProtocolHandler { let log_handler = backfill_manager.log_handler(); let response = log_handler - .handle_event_log_request(requesting_device, since, event_types, correlation_id, limit) + .handle_event_log_request( + requesting_device, + since, + event_types, + correlation_id, + limit, + ) .await .map_err(|e| { - NetworkingError::Protocol(format!("Failed to handle event log request: {}", e)) + NetworkingError::Protocol(format!( + "Failed to handle event log request: {}", + e + )) })?; Ok(Some(response)) @@ -725,11 +734,8 @@ mod tests { KeyManager::new_with_fallback(temp_dir.path().to_path_buf(), Some(device_key_fallback)) .unwrap(), ); - let device_manager = Arc::new(DeviceManager::init( - temp_dir.path(), - key_manager.clone(), - None, - ).unwrap()); + let device_manager = + Arc::new(DeviceManager::init(temp_dir.path(), key_manager.clone(), None).unwrap()); let logger = Arc::new(crate::service::network::utils::SilentLogger); let registry = DeviceRegistry::new(device_manager, key_manager, logger); let device_registry = Arc::new(tokio::sync::RwLock::new(registry)); diff --git a/core/src/service/sync/backfill.rs b/core/src/service/sync/backfill.rs index 13ce07081..3f063a5b8 100644 --- a/core/src/service/sync/backfill.rs +++ b/core/src/service/sync/backfill.rs @@ -126,7 +126,10 @@ impl BackfillManager { let event = SyncEventLog::new( self.device_id, SyncEventType::BackfillSessionStarted, - format!("Backfill session started with {} available peers", available_peers.len()), + format!( + "Backfill session started with {} available peers", + available_peers.len() + ), ) .with_correlation_id(session_id) .with_details(json!({ @@ -175,7 +178,11 @@ impl BackfillManager { let event = SyncEventLog::new( self.device_id, SyncEventType::BackfillSessionStarted, - format!("Selected peer {} from {} candidates", selected_peer, available_peers.len()), + format!( + "Selected peer {} from {} candidates", + selected_peer, + available_peers.len() + ), ) .with_correlation_id(session_id) .with_peer(selected_peer) @@ -1077,7 +1084,11 @@ impl BackfillManager { // Feed batch aggregator for event logging self.batch_aggregator - .add_records("shared_resources".to_string(), batch_size as u64, Some(peer)) + .add_records( + "shared_resources".to_string(), + batch_size as u64, + 
Some(peer), + ) .await; // Log progress every 10,000 records for large backfills diff --git a/core/src/service/sync/mod.rs b/core/src/service/sync/mod.rs index 910fd342e..4c70fcedb 100644 --- a/core/src/service/sync/mod.rs +++ b/core/src/service/sync/mod.rs @@ -201,11 +201,8 @@ impl SyncService { ); // Create protocol handlers - let mut log_handler = LogSyncHandler::new( - library_id, - library.db().clone(), - peer_sync.clone(), - ); + let mut log_handler = + LogSyncHandler::new(library_id, library.db().clone(), peer_sync.clone()); log_handler.set_event_logger(event_logger.clone()); let log_handler = Arc::new(log_handler); diff --git a/core/src/service/sync/peer.rs b/core/src/service/sync/peer.rs index 8d7bfc8c3..4e8c011f5 100644 --- a/core/src/service/sync/peer.rs +++ b/core/src/service/sync/peer.rs @@ -2669,7 +2669,10 @@ impl PeerSync { let event = SyncEventLog::new( self.device_id, SyncEventType::SyncError, - format!("Buffer overflow: {} updates dropped during backfill", dropped_count), + format!( + "Buffer overflow: {} updates dropped during backfill", + dropped_count + ), ) .with_severity(EventSeverity::Error) .with_details(json!({ diff --git a/core/src/service/sync/state.rs b/core/src/service/sync/state.rs index cd0717f5b..a046b1744 100644 --- a/core/src/service/sync/state.rs +++ b/core/src/service/sync/state.rs @@ -173,7 +173,9 @@ impl BufferQueue { warn!( current_size = queue.len(), max_size = self.max_size, - total_dropped = self.dropped_count.load(std::sync::atomic::Ordering::Relaxed), + total_dropped = self + .dropped_count + .load(std::sync::atomic::Ordering::Relaxed), "Buffer queue at capacity, dropping new update" ); return; diff --git a/core/src/volume/backend/local.rs b/core/src/volume/backend/local.rs index f5e092f67..6984dada6 100644 --- a/core/src/volume/backend/local.rs +++ b/core/src/volume/backend/local.rs @@ -319,4 +319,4 @@ mod tests { assert!(backend.exists(test_path).await.unwrap()); } -} \ No newline at end of file +} diff --git a/core/src/volume/fs/ntfs.rs b/core/src/volume/fs/ntfs.rs index 1971eee24..925389dc0 100644 --- a/core/src/volume/fs/ntfs.rs +++ b/core/src/volume/fs/ntfs.rs @@ -13,32 +13,32 @@ use tracing::{debug, warn}; pub struct NtfsHandler; impl NtfsHandler { - pub fn new() -> Self { - Self - } + pub fn new() -> Self { + Self + } - /// Check if two paths are on the same NTFS volume - pub async fn same_physical_storage(&self, path1: &Path, path2: &Path) -> bool { - // Check if both paths are on the same NTFS volume - if let (Ok(vol1), Ok(vol2)) = ( - self.get_volume_info(path1).await, - self.get_volume_info(path2).await, - ) { - // Same volume GUID = same physical storage - return vol1.volume_guid == vol2.volume_guid; - } + /// Check if two paths are on the same NTFS volume + pub async fn same_physical_storage(&self, path1: &Path, path2: &Path) -> bool { + // Check if both paths are on the same NTFS volume + if let (Ok(vol1), Ok(vol2)) = ( + self.get_volume_info(path1).await, + self.get_volume_info(path2).await, + ) { + // Same volume GUID = same physical storage + return vol1.volume_guid == vol2.volume_guid; + } - false - } + false + } - /// Get NTFS volume information for a path - async fn get_volume_info(&self, path: &Path) -> VolumeResult { - let path = path.to_path_buf(); + /// Get NTFS volume information for a path + async fn get_volume_info(&self, path: &Path) -> VolumeResult { + let path = path.to_path_buf(); - task::spawn_blocking(move || { - // Use PowerShell to get volume information - let script = format!( - r#" + 
task::spawn_blocking(move || { + // Use PowerShell to get volume information + let script = format!( + r#" $volume = Get-Volume -FilePath '{}' $partition = Get-Partition -DriveLetter $volume.DriveLetter $disk = Get-Disk -Number $partition.DiskNumber @@ -55,66 +55,66 @@ impl NtfsHandler { MediaType = $disk.MediaType }} | ConvertTo-Json "#, - path.display() - ); + path.display() + ); - let output = std::process::Command::new("powershell") - .args(["-Command", &script]) - .output() - .map_err(|e| { - crate::volume::error::VolumeError::platform(format!( - "Failed to run PowerShell: {}", - e - )) - })?; + let output = std::process::Command::new("powershell") + .args(["-Command", &script]) + .output() + .map_err(|e| { + crate::volume::error::VolumeError::platform(format!( + "Failed to run PowerShell: {}", + e + )) + })?; - if !output.status.success() { - return Err(crate::volume::error::VolumeError::platform( - "PowerShell command failed".to_string(), - )); - } + if !output.status.success() { + return Err(crate::volume::error::VolumeError::platform( + "PowerShell command failed".to_string(), + )); + } - let output_text = String::from_utf8_lossy(&output.stdout); - parse_volume_info(&output_text) - }) - .await - .map_err(|e| { - crate::volume::error::VolumeError::platform(format!("Task join error: {}", e)) - })? - } + let output_text = String::from_utf8_lossy(&output.stdout); + parse_volume_info(&output_text) + }) + .await + .map_err(|e| { + crate::volume::error::VolumeError::platform(format!("Task join error: {}", e)) + })? + } - /// Check if NTFS hardlinks are supported (they always are on NTFS) - pub async fn supports_hardlinks(&self, path: &Path) -> bool { - // NTFS always supports hardlinks - if let Ok(vol_info) = self.get_volume_info(path).await { - return vol_info.file_system == "NTFS"; - } - false - } + /// Check if NTFS hardlinks are supported (they always are on NTFS) + pub async fn supports_hardlinks(&self, path: &Path) -> bool { + // NTFS always supports hardlinks + if let Ok(vol_info) = self.get_volume_info(path).await { + return vol_info.file_system == "NTFS"; + } + false + } - /// Check if NTFS junction points are supported - pub async fn supports_junctions(&self, path: &Path) -> bool { - // NTFS supports junction points (directory symbolic links) - if let Ok(vol_info) = self.get_volume_info(path).await { - return vol_info.file_system == "NTFS"; - } - false - } + /// Check if NTFS junction points are supported + pub async fn supports_junctions(&self, path: &Path) -> bool { + // NTFS supports junction points (directory symbolic links) + if let Ok(vol_info) = self.get_volume_info(path).await { + return vol_info.file_system == "NTFS"; + } + false + } - /// Resolve junction points and symbolic links - pub async fn resolve_ntfs_path(&self, path: &Path) -> PathBuf { - let path = path.to_path_buf(); - // Clone the path so we have an owned copy to move into the closure - // while keeping the original 'path' available for the fallback (unwrap_or) - let path_clone = path.clone(); + /// Resolve junction points and symbolic links + pub async fn resolve_ntfs_path(&self, path: &Path) -> PathBuf { + let path = path.to_path_buf(); + // Clone the path so we have an owned copy to move into the closure + // while keeping the original 'path' available for the fallback (unwrap_or) + let path_clone = path.clone(); - let result = task::spawn_blocking(move || { - // Use the cloned path inside the closure - let path = path_clone; - - // Use PowerShell to resolve the path - let script = format!( - r#" + let 
result = task::spawn_blocking(move || { + // Use the cloned path inside the closure + let path = path_clone; + + // Use PowerShell to resolve the path + let script = format!( + r#" try {{ $resolvedPath = Resolve-Path -Path '{}' -ErrorAction Stop Write-Output $resolvedPath.Path @@ -122,40 +122,40 @@ impl NtfsHandler { Write-Output '{}' }} "#, - path.display(), - path.display() - ); + path.display(), + path.display() + ); - let output = std::process::Command::new("powershell") - .args(["-Command", &script]) - .output(); + let output = std::process::Command::new("powershell") + .args(["-Command", &script]) + .output(); - match output { - Ok(output) if output.status.success() => { - let resolved = String::from_utf8_lossy(&output.stdout).trim().to_string(); - if !resolved.is_empty() { - PathBuf::from(resolved) - } else { - path - } - } - _ => path, - } - }) - .await; + match output { + Ok(output) if output.status.success() => { + let resolved = String::from_utf8_lossy(&output.stdout).trim().to_string(); + if !resolved.is_empty() { + PathBuf::from(resolved) + } else { + path + } + } + _ => path, + } + }) + .await; - // If the task fails (e.g. panic), return the original path - result.unwrap_or(path) - } + // If the task fails (e.g. panic), return the original path + result.unwrap_or(path) + } - /// Get NTFS file system features - pub async fn get_ntfs_features(&self, path: &Path) -> VolumeResult { - let path = path.to_path_buf(); + /// Get NTFS file system features + pub async fn get_ntfs_features(&self, path: &Path) -> VolumeResult { + let path = path.to_path_buf(); - task::spawn_blocking(move || { - // Use fsutil to get NTFS features - let script = format!( - r#" + task::spawn_blocking(move || { + // Use fsutil to get NTFS features + let script = format!( + r#" $driveLetter = Split-Path -Path '{}' -Qualifier $features = @{{}} @@ -183,237 +183,237 @@ impl NtfsHandler { $features | ConvertTo-Json "#, - path.display() - ); + path.display() + ); - let output = std::process::Command::new("powershell") - .args(["-Command", &script]) - .output() - .map_err(|e| { - crate::volume::error::VolumeError::platform(format!( - "Failed to run PowerShell: {}", - e - )) - })?; + let output = std::process::Command::new("powershell") + .args(["-Command", &script]) + .output() + .map_err(|e| { + crate::volume::error::VolumeError::platform(format!( + "Failed to run PowerShell: {}", + e + )) + })?; - if !output.status.success() { - // Return default NTFS features - return Ok(NtfsFeatures { - supports_hardlinks: true, - supports_junctions: true, - supports_symlinks: true, - supports_streams: true, - supports_compression: true, - supports_encryption: true, - }); - } + if !output.status.success() { + // Return default NTFS features + return Ok(NtfsFeatures { + supports_hardlinks: true, + supports_junctions: true, + supports_symlinks: true, + supports_streams: true, + supports_compression: true, + supports_encryption: true, + }); + } - let output_text = String::from_utf8_lossy(&output.stdout); - parse_ntfs_features(&output_text) - }) - .await - .map_err(|e| { - crate::volume::error::VolumeError::platform(format!("Task join error: {}", e)) - })? - } + let output_text = String::from_utf8_lossy(&output.stdout); + parse_ntfs_features(&output_text) + }) + .await + .map_err(|e| { + crate::volume::error::VolumeError::platform(format!("Task join error: {}", e)) + })? 
+ } } #[async_trait] impl super::FilesystemHandler for NtfsHandler { - async fn enhance_volume(&self, volume: &mut Volume) -> VolumeResult<()> { - // Add NTFS-specific information like feature support - if let Some(mount_point) = volume.mount_point.to_str() { - if let Ok(features) = self.get_ntfs_features(Path::new(mount_point)).await { - debug!("Enhanced NTFS volume with features: {:?}", features); - // Could store NTFS features in volume metadata - } - } - Ok(()) - } + async fn enhance_volume(&self, volume: &mut Volume) -> VolumeResult<()> { + // Add NTFS-specific information like feature support + if let Some(mount_point) = volume.mount_point.to_str() { + if let Ok(features) = self.get_ntfs_features(Path::new(mount_point)).await { + debug!("Enhanced NTFS volume with features: {:?}", features); + // Could store NTFS features in volume metadata + } + } + Ok(()) + } - async fn same_physical_storage(&self, path1: &Path, path2: &Path) -> bool { - self.same_physical_storage(path1, path2).await - } + async fn same_physical_storage(&self, path1: &Path, path2: &Path) -> bool { + self.same_physical_storage(path1, path2).await + } - fn get_copy_strategy(&self) -> Box { - // Use streaming copy for NTFS (no built-in CoW like APFS/ReFS) - // Could potentially use hardlinks for same-volume copies - Box::new(crate::ops::files::copy::strategy::LocalStreamCopyStrategy) - } + fn get_copy_strategy(&self) -> Box { + // Use streaming copy for NTFS (no built-in CoW like APFS/ReFS) + // Could potentially use hardlinks for same-volume copies + Box::new(crate::ops::files::copy::strategy::LocalStreamCopyStrategy) + } - fn contains_path(&self, volume: &Volume, path: &std::path::Path) -> bool { - // Check primary mount point - if path.starts_with(&volume.mount_point) { - return true; - } + fn contains_path(&self, volume: &Volume, path: &std::path::Path) -> bool { + // Check primary mount point + if path.starts_with(&volume.mount_point) { + return true; + } - // Check additional mount points - if volume.mount_points.iter().any(|mp| path.starts_with(mp)) { - return true; - } + // Check additional mount points + if volume.mount_points.iter().any(|mp| path.starts_with(mp)) { + return true; + } - // TODO: NTFS-specific logic for junction points and mount points - // Windows can have volumes mounted as folders (mount points) within other volumes - // NTFS also supports junction points and symbolic links that may need resolution + // TODO: NTFS-specific logic for junction points and mount points + // Windows can have volumes mounted as folders (mount points) within other volumes + // NTFS also supports junction points and symbolic links that may need resolution - false - } + false + } } /// NTFS volume information #[derive(Debug, Clone)] pub struct NtfsVolumeInfo { - pub volume_guid: String, - pub file_system: String, - pub drive_letter: Option, - pub label: Option, - pub size_bytes: u64, - pub available_bytes: u64, - pub disk_number: Option, - pub partition_number: Option, - pub media_type: Option, + pub volume_guid: String, + pub file_system: String, + pub drive_letter: Option, + pub label: Option, + pub size_bytes: u64, + pub available_bytes: u64, + pub disk_number: Option, + pub partition_number: Option, + pub media_type: Option, } /// NTFS filesystem features #[derive(Debug, Clone)] pub struct NtfsFeatures { - pub supports_hardlinks: bool, - pub supports_junctions: bool, - pub supports_symlinks: bool, - pub supports_streams: bool, - pub supports_compression: bool, - pub supports_encryption: bool, + pub 
supports_hardlinks: bool, + pub supports_junctions: bool, + pub supports_symlinks: bool, + pub supports_streams: bool, + pub supports_compression: bool, + pub supports_encryption: bool, } /// Parse PowerShell volume info JSON output fn parse_volume_info(json_output: &str) -> VolumeResult { - // Simple JSON parsing - in production, you'd use serde_json - let json_output = json_output.trim(); + // Simple JSON parsing - in production, you'd use serde_json + let json_output = json_output.trim(); - let volume_guid = extract_json_string(json_output, "VolumeGuid").unwrap_or_default(); - let file_system = extract_json_string(json_output, "FileSystem").unwrap_or_default(); - let drive_letter_str = extract_json_string(json_output, "DriveLetter"); - let label = extract_json_string(json_output, "Label"); - let size_bytes = extract_json_number(json_output, "Size").unwrap_or(0); - let available_bytes = extract_json_number(json_output, "SizeRemaining").unwrap_or(0); - let disk_number = extract_json_number(json_output, "DiskNumber").map(|n| n as u32); - let partition_number = extract_json_number(json_output, "PartitionNumber").map(|n| n as u32); - let media_type = extract_json_string(json_output, "MediaType"); + let volume_guid = extract_json_string(json_output, "VolumeGuid").unwrap_or_default(); + let file_system = extract_json_string(json_output, "FileSystem").unwrap_or_default(); + let drive_letter_str = extract_json_string(json_output, "DriveLetter"); + let label = extract_json_string(json_output, "Label"); + let size_bytes = extract_json_number(json_output, "Size").unwrap_or(0); + let available_bytes = extract_json_number(json_output, "SizeRemaining").unwrap_or(0); + let disk_number = extract_json_number(json_output, "DiskNumber").map(|n| n as u32); + let partition_number = extract_json_number(json_output, "PartitionNumber").map(|n| n as u32); + let media_type = extract_json_string(json_output, "MediaType"); - let drive_letter = drive_letter_str.and_then(|s| s.chars().next()); + let drive_letter = drive_letter_str.and_then(|s| s.chars().next()); - Ok(NtfsVolumeInfo { - volume_guid, - file_system, - drive_letter, - label, - size_bytes, - available_bytes, - disk_number, - partition_number, - media_type, - }) + Ok(NtfsVolumeInfo { + volume_guid, + file_system, + drive_letter, + label, + size_bytes, + available_bytes, + disk_number, + partition_number, + media_type, + }) } /// Parse NTFS features JSON output fn parse_ntfs_features(json_output: &str) -> VolumeResult { - // Simple parsing - in production, use proper JSON parser - let json_output = json_output.trim(); + // Simple parsing - in production, use proper JSON parser + let json_output = json_output.trim(); - let supports_compression = - extract_json_bool(json_output, "SupportsCompression").unwrap_or(true); - let supports_encryption = extract_json_bool(json_output, "SupportsEncryption").unwrap_or(true); + let supports_compression = + extract_json_bool(json_output, "SupportsCompression").unwrap_or(true); + let supports_encryption = extract_json_bool(json_output, "SupportsEncryption").unwrap_or(true); - Ok(NtfsFeatures { - supports_hardlinks: true, // NTFS always supports these - supports_junctions: true, - supports_symlinks: true, - supports_streams: true, - supports_compression, - supports_encryption, - }) + Ok(NtfsFeatures { + supports_hardlinks: true, // NTFS always supports these + supports_junctions: true, + supports_symlinks: true, + supports_streams: true, + supports_compression, + supports_encryption, + }) } /// Extract string value from 
JSON (simple implementation) fn extract_json_string(json: &str, key: &str) -> Option { - let pattern = format!("\"{}\":", key); - if let Some(start) = json.find(&pattern) { - let start = start + pattern.len(); - if let Some(value_start) = json[start..].find('"') { - let value_start = start + value_start + 1; - if let Some(value_end) = json[value_start..].find('"') { - let value = &json[value_start..value_start + value_end]; - if value != "null" && !value.is_empty() { - return Some(value.to_string()); - } - } - } - } - None + let pattern = format!("\"{}\":", key); + if let Some(start) = json.find(&pattern) { + let start = start + pattern.len(); + if let Some(value_start) = json[start..].find('"') { + let value_start = start + value_start + 1; + if let Some(value_end) = json[value_start..].find('"') { + let value = &json[value_start..value_start + value_end]; + if value != "null" && !value.is_empty() { + return Some(value.to_string()); + } + } + } + } + None } /// Extract number value from JSON (simple implementation) fn extract_json_number(json: &str, key: &str) -> Option { - let pattern = format!("\"{}\":", key); - if let Some(start) = json.find(&pattern) { - let start = start + pattern.len(); - let remaining = json[start..].trim_start(); - if let Some(end) = remaining.find(|c: char| !c.is_ascii_digit()) { - let number_str = &remaining[..end]; - return number_str.parse().ok(); - } - } - None + let pattern = format!("\"{}\":", key); + if let Some(start) = json.find(&pattern) { + let start = start + pattern.len(); + let remaining = json[start..].trim_start(); + if let Some(end) = remaining.find(|c: char| !c.is_ascii_digit()) { + let number_str = &remaining[..end]; + return number_str.parse().ok(); + } + } + None } /// Extract boolean value from JSON (simple implementation) fn extract_json_bool(json: &str, key: &str) -> Option { - let pattern = format!("\"{}\":", key); - if let Some(start) = json.find(&pattern) { - let start = start + pattern.len(); - let remaining = json[start..].trim_start(); - if remaining.starts_with("true") { - return Some(true); - } else if remaining.starts_with("false") { - return Some(false); - } - } - None + let pattern = format!("\"{}\":", key); + if let Some(start) = json.find(&pattern) { + let start = start + pattern.len(); + let remaining = json[start..].trim_start(); + if remaining.starts_with("true") { + return Some(true); + } else if remaining.starts_with("false") { + return Some(false); + } + } + None } /// Enhance volume with NTFS-specific information from Windows pub async fn enhance_volume_from_windows(volume: &mut Volume) -> VolumeResult<()> { - // FIX: Import the trait from the correct module - use crate::volume::fs::FilesystemHandler; + // FIX: Import the trait from the correct module + use crate::volume::fs::FilesystemHandler; - let handler = NtfsHandler::new(); - handler.enhance_volume(volume).await + let handler = NtfsHandler::new(); + handler.enhance_volume(volume).await } #[cfg(test)] mod tests { - use super::*; + use super::*; - #[test] - fn test_extract_json_string() { - let json = - r#"{"VolumeGuid": "12345678-1234-1234-1234-123456789abc", "FileSystem": "NTFS"}"#; - assert_eq!( - extract_json_string(json, "VolumeGuid"), - Some("12345678-1234-1234-1234-123456789abc".to_string()) - ); - assert_eq!( - extract_json_string(json, "FileSystem"), - Some("NTFS".to_string()) - ); - assert_eq!(extract_json_string(json, "NonExistent"), None); - } + #[test] + fn test_extract_json_string() { + let json = + r#"{"VolumeGuid": 
"12345678-1234-1234-1234-123456789abc", "FileSystem": "NTFS"}"#; + assert_eq!( + extract_json_string(json, "VolumeGuid"), + Some("12345678-1234-1234-1234-123456789abc".to_string()) + ); + assert_eq!( + extract_json_string(json, "FileSystem"), + Some("NTFS".to_string()) + ); + assert_eq!(extract_json_string(json, "NonExistent"), None); + } - #[test] - fn test_extract_json_bool() { - let json = r#"{"SupportsCompression": true, "SupportsEncryption": false}"#; - assert_eq!(extract_json_bool(json, "SupportsCompression"), Some(true)); - assert_eq!(extract_json_bool(json, "SupportsEncryption"), Some(false)); - assert_eq!(extract_json_bool(json, "NonExistent"), None); - } -} \ No newline at end of file + #[test] + fn test_extract_json_bool() { + let json = r#"{"SupportsCompression": true, "SupportsEncryption": false}"#; + assert_eq!(extract_json_bool(json, "SupportsCompression"), Some(true)); + assert_eq!(extract_json_bool(json, "SupportsEncryption"), Some(false)); + assert_eq!(extract_json_bool(json, "NonExistent"), None); + } +} diff --git a/core/src/volume/fs/refs.rs b/core/src/volume/fs/refs.rs index b132eebf5..048178404 100644 --- a/core/src/volume/fs/refs.rs +++ b/core/src/volume/fs/refs.rs @@ -312,11 +312,11 @@ fn extract_json_number(json: &str, key: &str) -> Option { /// Enhance volume with ReFS-specific information from Windows pub async fn enhance_volume_from_windows(volume: &mut Volume) -> VolumeResult<()> { - // Import the trait from the parent module so the enhance_volume method is available - use super::FilesystemHandler; + // Import the trait from the parent module so the enhance_volume method is available + use super::FilesystemHandler; - let handler = RefsHandler::new(); - handler.enhance_volume(volume).await + let handler = RefsHandler::new(); + handler.enhance_volume(volume).await } #[cfg(test)] diff --git a/core/src/volume/manager.rs b/core/src/volume/manager.rs index f4207111b..7037b85d4 100644 --- a/core/src/volume/manager.rs +++ b/core/src/volume/manager.rs @@ -167,10 +167,10 @@ impl VolumeManager { // Try to load credentials and recreate the backend let credential_manager = CloudCredentialManager::new( - key_manager.clone(), - library.db().clone(), - library.id(), - ); + key_manager.clone(), + library.db().clone(), + library.id(), + ); match credential_manager .get_credential(library.id(), &db_volume.fingerprint) diff --git a/core/src/volume/platform/windows.rs b/core/src/volume/platform/windows.rs index 752492d5b..b94aff4fb 100644 --- a/core/src/volume/platform/windows.rs +++ b/core/src/volume/platform/windows.rs @@ -238,10 +238,10 @@ pub fn should_include_volume(volume: &Volume, config: &VolumeDetectionConfig) -> return false; } - // FIX: Use parentheses to call the method + // FIX: Use parentheses to call the method if !config.include_virtual && volume.total_bytes_capacity() == 0 { return false; } true -} \ No newline at end of file +} diff --git a/core/tests/sync_event_log_test.rs b/core/tests/sync_event_log_test.rs index 456d2a19d..e0b1fce6c 100644 --- a/core/tests/sync_event_log_test.rs +++ b/core/tests/sync_event_log_test.rs @@ -77,7 +77,10 @@ impl EventLogTestHarness { // Initialize sync service library_alice - .init_sync_service(device_alice_id, transport_alice.clone() as Arc) + .init_sync_service( + device_alice_id, + transport_alice.clone() as Arc, + ) .await?; // Start sync service @@ -105,7 +108,8 @@ impl EventLogTestHarness { let stmt = Statement::from_string( DatabaseBackend::Sqlite, - "SELECT event_type, summary, correlation_id FROM sync_event_log ORDER BY 
timestamp".to_string(), + "SELECT event_type, summary, correlation_id FROM sync_event_log ORDER BY timestamp" + .to_string(), ); let rows = event_logger.conn().query_all(stmt).await?; @@ -195,10 +199,7 @@ async fn test_backfill_session_correlation() -> anyhow::Result<()> { let events = harness.query_events_api(query).await?; - tracing::info!( - event_count = events.len(), - "Events retrieved via query API" - ); + tracing::info!(event_count = events.len(), "Events retrieved via query API"); // Verify query API works (even if no events yet) assert!( @@ -240,8 +241,8 @@ async fn test_event_query_filtering() -> anyhow::Result<()> { tokio::time::sleep(Duration::from_millis(200)).await; // Test filtering by event type - let query = SyncEventQuery::new(library_id) - .with_event_types(vec![SyncEventType::StateTransition]); + let query = + SyncEventQuery::new(library_id).with_event_types(vec![SyncEventType::StateTransition]); let events = harness.query_events_api(query).await?; @@ -260,8 +261,8 @@ async fn test_event_query_filtering() -> anyhow::Result<()> { } // Test filtering by category - let query_category = SyncEventQuery::new(library_id) - .with_categories(vec![EventCategory::Lifecycle]); + let query_category = + SyncEventQuery::new(library_id).with_categories(vec![EventCategory::Lifecycle]); let lifecycle_events = harness.query_events_api(query_category).await?; @@ -353,15 +354,12 @@ async fn test_batch_aggregation() -> anyhow::Result<()> { // Query batch ingestion events let library_id = harness.library_alice.id(); - let query = SyncEventQuery::new(library_id) - .with_event_types(vec![SyncEventType::BatchIngestion]); + let query = + SyncEventQuery::new(library_id).with_event_types(vec![SyncEventType::BatchIngestion]); let events = harness.query_events_api(query).await?; - tracing::info!( - batch_events = events.len(), - "Batch ingestion events logged" - ); + tracing::info!(batch_events = events.len(), "Batch ingestion events logged"); // Should have one batch event aggregating all the adds assert!( @@ -417,9 +415,7 @@ async fn test_buffer_overflow_logging() -> anyhow::Result<()> { // For testing, we'll simulate by tracking drops manually // Transition to Ready (this checks for dropped count) - peer_sync - .set_state_for_test(DeviceSyncState::Ready) - .await; + peer_sync.set_state_for_test(DeviceSyncState::Ready).await; tokio::time::sleep(Duration::from_millis(200)).await; @@ -431,17 +427,11 @@ async fn test_buffer_overflow_logging() -> anyhow::Result<()> { let error_events = harness.query_events_api(query).await?; - tracing::info!( - error_count = error_events.len(), - "Error events logged" - ); + tracing::info!(error_count = error_events.len(), "Error events logged"); // Note: Buffer overflow only logs if drops actually occurred // This test verifies the infrastructure exists, even if no drops happened - assert!( - error_events.len() >= 0, - "Error event query should work" - ); + assert!(error_events.len() >= 0, "Error event query should work"); Ok(()) } diff --git a/docs/workbench b/docs/workbench index 10256fd2d..cab1f9e49 160000 --- a/docs/workbench +++ b/docs/workbench @@ -1 +1 @@ -Subproject commit 10256fd2d66f3a125a939bade2eebd4d2f1e5b5d +Subproject commit cab1f9e49e81f8622f2c77f8c1162f7cbd2b1b1d diff --git a/packages/interface/src/Explorer.tsx b/packages/interface/src/Explorer.tsx index 38d0a0223..3d44d0cac 100644 --- a/packages/interface/src/Explorer.tsx +++ b/packages/interface/src/Explorer.tsx @@ -1,38 +1,47 @@ import { SpacedriveProvider, type SpacedriveClient } from 
"./context"; import { ReactQueryDevtools } from "@tanstack/react-query-devtools"; import { - RouterProvider, - Outlet, - useLocation, - useParams, + RouterProvider, + Outlet, + useLocation, + useParams, } from "react-router-dom"; import { useEffect, useMemo } from "react"; +import { useLocationChangeInvalidation } from "./hooks/useLocationChangeInvalidation"; import { Dialogs } from "@sd/ui"; import { Inspector, type InspectorVariant } from "./Inspector"; import { TopBarProvider, TopBar } from "./TopBar"; import { motion, AnimatePresence } from "framer-motion"; import { - ExplorerProvider, - useExplorer, - Sidebar, - getSpaceItemKeyFromRoute, + ExplorerProvider, + useExplorer, + Sidebar, + getSpaceItemKeyFromRoute, } from "./components/Explorer"; import { - SelectionProvider, - useSelection, + SelectionProvider, + useSelection, } from "./components/Explorer/SelectionContext"; import { KeyboardHandler } from "./components/Explorer/KeyboardHandler"; import { TagAssignmentMode } from "./components/Explorer/TagAssignmentMode"; import { SpacesSidebar } from "./components/SpacesSidebar"; import { - QuickPreviewFullscreen, - PREVIEW_LAYER_ID, + QuickPreviewFullscreen, + PREVIEW_LAYER_ID, } from "./components/QuickPreview"; import { createExplorerRouter } from "./router"; import { useNormalizedQuery, useLibraryMutation } from "./context"; import { usePlatform } from "./platform"; import type { LocationInfo } from "@sd/ts-client"; -import { DndContext, DragOverlay, PointerSensor, useSensor, useSensors, pointerWithin, rectIntersection } from "@dnd-kit/core"; +import { + DndContext, + DragOverlay, + PointerSensor, + useSensor, + useSensors, + pointerWithin, + rectIntersection, +} from "@dnd-kit/core"; import type { CollisionDetection } from "@dnd-kit/core"; import { useState } from "react"; import type { File } from "@sd/ts-client"; @@ -40,214 +49,239 @@ import { File as FileComponent } from "./components/Explorer/File"; import { DaemonDisconnectedOverlay } from "./components/DaemonDisconnectedOverlay"; interface AppProps { - client: SpacedriveClient; + client: SpacedriveClient; } export function ExplorerLayout() { - const location = useLocation(); - const params = useParams(); - const platform = usePlatform(); - const { - sidebarVisible, - inspectorVisible, - setInspectorVisible, - quickPreviewFileId, - setQuickPreviewFileId, - closeQuickPreview, - currentFiles, - tagModeActive, - setTagModeActive, - viewMode, - setSpaceItemId, - } = useExplorer(); - const { selectedFiles, selectFile } = useSelection(); + const location = useLocation(); + const params = useParams(); + const platform = usePlatform(); + const { + sidebarVisible, + inspectorVisible, + setInspectorVisible, + quickPreviewFileId, + setQuickPreviewFileId, + closeQuickPreview, + currentFiles, + tagModeActive, + setTagModeActive, + viewMode, + setSpaceItemId, + } = useExplorer(); + const { selectedFiles, selectFile } = useSelection(); - // Sync route with explorer context for view preferences - useEffect(() => { - const spaceItemKey = getSpaceItemKeyFromRoute( - location.pathname, - location.search, - ); - setSpaceItemId(spaceItemKey); - }, [location.pathname, location.search, setSpaceItemId]); + // Listen for location index_mode changes and invalidate directory listing queries + useLocationChangeInvalidation(); - // Sync QuickPreview with selection - Explorer is source of truth - useEffect(() => { - if (!quickPreviewFileId) return; + // Sync route with explorer context for view preferences + useEffect(() => { + const spaceItemKey = 
getSpaceItemKeyFromRoute( + location.pathname, + location.search, + ); + setSpaceItemId(spaceItemKey); + }, [location.pathname, location.search, setSpaceItemId]); - // When selection changes and QuickPreview is open, update preview to match selection - if (selectedFiles.length === 1 && selectedFiles[0].id !== quickPreviewFileId) { - setQuickPreviewFileId(selectedFiles[0].id); - } - }, [selectedFiles, quickPreviewFileId, setQuickPreviewFileId]); + // Sync QuickPreview with selection - Explorer is source of truth + useEffect(() => { + if (!quickPreviewFileId) return; - // Check if we're on Overview (hide inspector) or in Knowledge view (has its own inspector) - const isOverview = location.pathname === "/"; - const isKnowledgeView = viewMode === "knowledge"; + // When selection changes and QuickPreview is open, update preview to match selection + if ( + selectedFiles.length === 1 && + selectedFiles[0].id !== quickPreviewFileId + ) { + setQuickPreviewFileId(selectedFiles[0].id); + } + }, [selectedFiles, quickPreviewFileId, setQuickPreviewFileId]); - // Fetch locations to get current location info - const locationsQuery = useNormalizedQuery< - null, - { locations: LocationInfo[] } - >({ - wireMethod: "query:locations.list", - input: null, - resourceType: "location", - }); + // Check if we're on Overview (hide inspector) or in Knowledge view (has its own inspector) + const isOverview = location.pathname === "/"; + const isKnowledgeView = viewMode === "knowledge"; - // Get current location if we're on a location route - const currentLocation = useMemo(() => { - if (!params.locationId || !locationsQuery.data?.locations) return null; - return ( - locationsQuery.data.locations.find( - (loc) => loc.id === params.locationId, - ) || null - ); - }, [params.locationId, locationsQuery.data]); + // Fetch locations to get current location info + const locationsQuery = useNormalizedQuery< + null, + { locations: LocationInfo[] } + >({ + wireMethod: "query:locations.list", + input: null, + resourceType: "location", + }); - useEffect(() => { - // Listen for inspector window close events - if (!platform.onWindowEvent) return; + // Get current location if we're on a location route + const currentLocation = useMemo(() => { + if (!params.locationId || !locationsQuery.data?.locations) return null; + return ( + locationsQuery.data.locations.find( + (loc) => loc.id === params.locationId, + ) || null + ); + }, [params.locationId, locationsQuery.data]); - let unlisten: (() => void) | undefined; + useEffect(() => { + // Listen for inspector window close events + if (!platform.onWindowEvent) return; - (async () => { - try { - unlisten = await platform.onWindowEvent( - "inspector-window-closed", - () => { - // Show embedded inspector when floating window closes - setInspectorVisible(true); - }, - ); - } catch (err) { - console.error("Failed to setup inspector close listener:", err); - } - })(); + let unlisten: (() => void) | undefined; - return () => { - unlisten?.(); - }; - }, [platform, setInspectorVisible]); + (async () => { + try { + unlisten = await platform.onWindowEvent( + "inspector-window-closed", + () => { + // Show embedded inspector when floating window closes + setInspectorVisible(true); + }, + ); + } catch (err) { + console.error("Failed to setup inspector close listener:", err); + } + })(); - const handlePopOutInspector = async () => { - if (!platform.showWindow) return; + return () => { + unlisten?.(); + }; + }, [platform, setInspectorVisible]); - try { - await platform.showWindow({ - type: "Inspector", - 
item_id: null, - }); - // Hide the embedded inspector when popped out - setInspectorVisible(false); - } catch (err) { - console.error("Failed to pop out inspector:", err); - } - }; + const handlePopOutInspector = async () => { + if (!platform.showWindow) return; - const isPreviewActive = !!quickPreviewFileId; + try { + await platform.showWindow({ + type: "Inspector", + item_id: null, + }); + // Hide the embedded inspector when popped out + setInspectorVisible(false); + } catch (err) { + console.error("Failed to pop out inspector:", err); + } + }; - return ( -
- {/* Preview layer - portal target for fullscreen preview, sits between content and sidebar/inspector */} -
+ const isPreviewActive = !!quickPreviewFileId; - + return ( +
+ {/* Preview layer - portal target for fullscreen preview, sits between content and sidebar/inspector */} +
- - {sidebarVisible && ( - - - - )} - + -
- {/* Router content renders here */} - + + {sidebarVisible && ( + + + + )} + - {/* Tag Assignment Mode - positioned at bottom of main content area */} - setTagModeActive(false)} - /> -
+
+ {/* Router content renders here */} + - {/* Keyboard handler (invisible, doesn't cause parent rerenders) */} - + {/* Tag Assignment Mode - positioned at bottom of main content area */} + setTagModeActive(false)} + /> +
- - {/* Hide inspector on Overview screen and Knowledge view (has its own) */} - {inspectorVisible && !isOverview && !isKnowledgeView && ( - -
- -
-
- )} -
+ {/* Keyboard handler (invisible, doesn't cause parent rerenders) */} + - {/* Quick Preview - renders via portal into preview layer */} - {quickPreviewFileId && (() => { - const currentIndex = currentFiles.findIndex(f => f.id === quickPreviewFileId); - const hasPrevious = currentIndex > 0; - const hasNext = currentIndex < currentFiles.length - 1; + + {/* Hide inspector on Overview screen and Knowledge view (has its own) */} + {inspectorVisible && !isOverview && !isKnowledgeView && ( + +
+ +
+
+ )} +
- const handleNext = () => { - if (hasNext && currentFiles[currentIndex + 1]) { - selectFile(currentFiles[currentIndex + 1], currentFiles, false, false); - } - }; + {/* Quick Preview - renders via portal into preview layer */} + {quickPreviewFileId && + (() => { + const currentIndex = currentFiles.findIndex( + (f) => f.id === quickPreviewFileId, + ); + const hasPrevious = currentIndex > 0; + const hasNext = currentIndex < currentFiles.length - 1; - const handlePrevious = () => { - if (hasPrevious && currentFiles[currentIndex - 1]) { - selectFile(currentFiles[currentIndex - 1], currentFiles, false, false); - } - }; + const handleNext = () => { + if (hasNext && currentFiles[currentIndex + 1]) { + selectFile( + currentFiles[currentIndex + 1], + currentFiles, + false, + false, + ); + } + }; - return ( - - ); - })()} -
- ); + const handlePrevious = () => { + if (hasPrevious && currentFiles[currentIndex - 1]) { + selectFile( + currentFiles[currentIndex - 1], + currentFiles, + false, + false, + ); + } + }; + + return ( + + ); + })()} +
+ ); } /** @@ -275,142 +309,153 @@ export function ExplorerLayout() { * - Data: { type, spaceId, groupId? } */ function DndWrapper({ children }: { children: React.ReactNode }) { - const sensors = useSensors( - useSensor(PointerSensor, { - activationConstraint: { - distance: 8, // Require 8px movement before activating drag - }, - }) - ); - const addItem = useLibraryMutation("spaces.add_item"); - const [activeItem, setActiveItem] = useState(null); + const sensors = useSensors( + useSensor(PointerSensor, { + activationConstraint: { + distance: 8, // Require 8px movement before activating drag + }, + }), + ); + const addItem = useLibraryMutation("spaces.add_item"); + const [activeItem, setActiveItem] = useState(null); - // Custom collision detection: prefer -top zones over -bottom zones to avoid double lines - const customCollision: CollisionDetection = (args) => { - const collisions = pointerWithin(args); - if (!collisions || collisions.length === 0) return collisions; + // Custom collision detection: prefer -top zones over -bottom zones to avoid double lines + const customCollision: CollisionDetection = (args) => { + const collisions = pointerWithin(args); + if (!collisions || collisions.length === 0) return collisions; - // If we have multiple collisions, prefer -top over -bottom - const hasTop = collisions.find(c => String(c.id).endsWith('-top')); - const hasMiddle = collisions.find(c => String(c.id).endsWith('-middle')); + // If we have multiple collisions, prefer -top over -bottom + const hasTop = collisions.find((c) => String(c.id).endsWith("-top")); + const hasMiddle = collisions.find((c) => + String(c.id).endsWith("-middle"), + ); - if (hasMiddle) return [hasMiddle]; // Middle zone takes priority - if (hasTop) return [hasTop]; // Top zone over bottom - return [collisions[0]]; // First collision - }; + if (hasMiddle) return [hasMiddle]; // Middle zone takes priority + if (hasTop) return [hasTop]; // Top zone over bottom + return [collisions[0]]; // First collision + }; - const handleDragStart = (event: any) => { - setActiveItem(event.active.data.current); - }; + const handleDragStart = (event: any) => { + setActiveItem(event.active.data.current); + }; - const handleDragEnd = async (event: any) => { - const { active, over } = event; + const handleDragEnd = async (event: any) => { + const { active, over } = event; - setActiveItem(null); + setActiveItem(null); - if (!over || !active.data.current) return; + if (!over || !active.data.current) return; - const dragData = active.data.current; - const dropData = over.data.current; + const dragData = active.data.current; + const dropData = over.data.current; - if (!dragData || dragData.type !== "explorer-file") return; + if (!dragData || dragData.type !== "explorer-file") return; - // Insert before/after sidebar items (adds item to space/group) - if (dropData?.action === "insert-before" || dropData?.action === "insert-after") { - if (!dropData.spaceId) return; + // Insert before/after sidebar items (adds item to space/group) + if ( + dropData?.action === "insert-before" || + dropData?.action === "insert-after" + ) { + if (!dropData.spaceId) return; - try { - await addItem.mutateAsync({ - space_id: dropData.spaceId, - group_id: dropData.groupId || null, - item_type: { Path: { sd_path: dragData.sdPath } }, - }); - // TODO: Implement proper ordering relative to itemId - } catch (err) { - console.error("Failed to add item:", err); - } - return; - } + try { + await addItem.mutateAsync({ + space_id: dropData.spaceId, + group_id: dropData.groupId || 
null, + item_type: { Path: { sd_path: dragData.sdPath } }, + }); + // TODO: Implement proper ordering relative to itemId + } catch (err) { + console.error("Failed to add item:", err); + } + return; + } - // Move file into location/volume/folder - if (dropData?.action === "move-into") { - // TODO: Implement with files.move mutation based on targetType - // - location: Use targetPath - // - volume: Look up volume root path - // - folder: Use targetPath from Path item - return; - } + // Move file into location/volume/folder + if (dropData?.action === "move-into") { + // TODO: Implement with files.move mutation based on targetType + // - location: Use targetPath + // - volume: Look up volume root path + // - folder: Use targetPath from Path item + return; + } - // Drop on space root area (adds to space) - if (dropData?.type === "space" && dragData.type === "explorer-file") { - try { - await addItem.mutateAsync({ - space_id: dropData.spaceId, - group_id: null, - item_type: { Path: { sd_path: dragData.sdPath } }, - }); - } catch (err) { - console.error("Failed to add item:", err); - } - } + // Drop on space root area (adds to space) + if (dropData?.type === "space" && dragData.type === "explorer-file") { + try { + await addItem.mutateAsync({ + space_id: dropData.spaceId, + group_id: null, + item_type: { Path: { sd_path: dragData.sdPath } }, + }); + } catch (err) { + console.error("Failed to add item:", err); + } + } - // Drop on group area (adds to group) - if (dropData?.type === "group" && dragData.type === "explorer-file") { - try { - await addItem.mutateAsync({ - space_id: dropData.spaceId, - group_id: dropData.groupId, - item_type: { Path: { sd_path: dragData.sdPath } }, - }); - } catch (err) { - console.error("Failed to add item to group:", err); - } - } - }; + // Drop on group area (adds to group) + if (dropData?.type === "group" && dragData.type === "explorer-file") { + try { + await addItem.mutateAsync({ + space_id: dropData.spaceId, + group_id: dropData.groupId, + item_type: { Path: { sd_path: dragData.sdPath } }, + }); + } catch (err) { + console.error("Failed to add item to group:", err); + } + } + }; - return ( - - {children} - - {activeItem?.file && activeItem.gridSize ? ( -
-
-
- -
-
- {activeItem.name} -
-
-
- ) : null} -
-
- ); + return ( + + {children} + + {activeItem?.file && activeItem.gridSize ? ( +
+
+
+ +
+
+ {activeItem.name} +
+
+
+ ) : null} +
+
+ ); } export function Explorer({ client }: AppProps) { - const router = createExplorerRouter(); + const router = createExplorerRouter(); - return ( - - - - - - - - - - - - - - - ); + return ( + + + + + + + + + + + + + + + ); } diff --git a/packages/interface/src/components/Explorer/File/Thumb.tsx b/packages/interface/src/components/Explorer/File/Thumb.tsx index 16182cb15..0f0c816c8 100644 --- a/packages/interface/src/components/Explorer/File/Thumb.tsx +++ b/packages/interface/src/components/Explorer/File/Thumb.tsx @@ -3,14 +3,15 @@ import clsx from "clsx"; import { getIcon } from "@sd/assets/util"; import type { File } from "@sd/ts-client"; import { ThumbstripScrubber } from "./ThumbstripScrubber"; +import { getContentKind } from "../utils"; interface ThumbProps { - file: File; - size?: number; - className?: string; - frameClassName?: string; // Custom frame styling (border, radius, bg) - iconScale?: number; // Scale factor for fallback icon (0-1, default 1) - squareMode?: boolean; // Whether thumbnail is cropped to square (media view) or maintains aspect ratio + file: File; + size?: number; + className?: string; + frameClassName?: string; // Custom frame styling (border, radius, bg) + iconScale?: number; // Scale factor for fallback icon (0-1, default 1) + squareMode?: boolean; // Whether thumbnail is cropped to square (media view) or maintains aspect ratio } // Global cache for thumbnail loaded states (survives component unmount/remount) @@ -18,203 +19,209 @@ const thumbLoadedCache = new Map(); const thumbErrorCache = new Map(); export const Thumb = memo(function Thumb({ - file, - size = 100, - className, - frameClassName, - iconScale = 1, - squareMode = false, + file, + size = 100, + className, + frameClassName, + iconScale = 1, + squareMode = false, }: ThumbProps) { - const cacheKey = `${file.id}-${size}`; + const cacheKey = `${file.id}-${size}`; - const [thumbLoaded, setThumbLoaded] = useState( - () => thumbLoadedCache.get(cacheKey) || false, - ); - const [thumbError, setThumbError] = useState( - () => thumbErrorCache.get(cacheKey) || false, - ); + const [thumbLoaded, setThumbLoaded] = useState( + () => thumbLoadedCache.get(cacheKey) || false, + ); + const [thumbError, setThumbError] = useState( + () => thumbErrorCache.get(cacheKey) || false, + ); - // Update cache when state changes - useEffect(() => { - if (thumbLoaded) thumbLoadedCache.set(cacheKey, true); - }, [thumbLoaded, cacheKey]); + // Update cache when state changes + useEffect(() => { + if (thumbLoaded) thumbLoadedCache.set(cacheKey, true); + }, [thumbLoaded, cacheKey]); - useEffect(() => { - if (thumbError) thumbErrorCache.set(cacheKey, true); - }, [thumbError, cacheKey]); + useEffect(() => { + if (thumbError) thumbErrorCache.set(cacheKey, true); + }, [thumbError, cacheKey]); - const iconSize = size * iconScale; + const iconSize = size * iconScale; - // Check if this is a video with thumbstrip sidecar - const isVideo = file.content_identity?.kind === "video"; - const hasThumbstrip = file.sidecars?.some((s) => s.kind === "thumbstrip"); + // Check if this is a video with thumbstrip sidecar + const isVideo = getContentKind(file) === "video"; + const hasThumbstrip = file.sidecars?.some((s) => s.kind === "thumbstrip"); - // Get appropriate thumbnail URL from sidecars based on size - const getThumbnailUrl = (targetSize: number) => { - const serverUrl = (window as any).__SPACEDRIVE_SERVER_URL__; - const libraryId = (window as any).__SPACEDRIVE_LIBRARY_ID__; + // Get appropriate thumbnail URL from sidecars based on size + const 
getThumbnailUrl = (targetSize: number) => { + const serverUrl = (window as any).__SPACEDRIVE_SERVER_URL__; + const libraryId = (window as any).__SPACEDRIVE_LIBRARY_ID__; - if (!serverUrl || !libraryId) { - return null; - } + if (!serverUrl || !libraryId) { + return null; + } - // Need content_identity to build sidecar URL - if (!file.content_identity?.uuid) { - return null; - } + // Need content_identity to build sidecar URL + if (!file.content_identity?.uuid) { + return null; + } - // Find thumbnail sidecar closest to requested size - const thumbnails = file.sidecars.filter((s) => s.kind === "thumb"); + // Find thumbnail sidecar closest to requested size + const thumbnails = file.sidecars.filter((s) => s.kind === "thumb"); - if (thumbnails.length === 0) { - return null; - } + if (thumbnails.length === 0) { + return null; + } - // Prefer 1x (lower resolution) variants for better performance - // Only use higher resolution for very large sizes (>400px) - const preferredSize = targetSize <= 400 ? targetSize * 0.6 : targetSize; + // Prefer 1x (lower resolution) variants for better performance + // Only use higher resolution for very large sizes (>400px) + const preferredSize = targetSize <= 400 ? targetSize * 0.6 : targetSize; - const thumbnail = thumbnails.sort((a, b) => { - // Parse variant (e.g., "grid@1x", "detail@1x") to get size and scale - const aSize = parseInt( - a.variant.split("x")[0]?.replace(/\D/g, "") || "0", - ); - const bSize = parseInt( - b.variant.split("x")[0]?.replace(/\D/g, "") || "0", - ); + const thumbnail = thumbnails.sort((a, b) => { + // Parse variant (e.g., "grid@1x", "detail@1x") to get size and scale + const aSize = parseInt( + a.variant.split("x")[0]?.replace(/\D/g, "") || "0", + ); + const bSize = parseInt( + b.variant.split("x")[0]?.replace(/\D/g, "") || "0", + ); - // Extract scale factor (1x, 2x, 3x) from variants like "grid@1x" or "detail@2x" - const aScaleMatch = a.variant.match(/@(\d+)x/); - const bScaleMatch = b.variant.match(/@(\d+)x/); - const aScale = aScaleMatch ? parseInt(aScaleMatch[1]) : 1; - const bScale = bScaleMatch ? parseInt(bScaleMatch[1]) : 1; + // Extract scale factor (1x, 2x, 3x) from variants like "grid@1x" or "detail@2x" + const aScaleMatch = a.variant.match(/@(\d+)x/); + const bScaleMatch = b.variant.match(/@(\d+)x/); + const aScale = aScaleMatch ? parseInt(aScaleMatch[1]) : 1; + const bScale = bScaleMatch ? 
parseInt(bScaleMatch[1]) : 1; - // Strongly prefer 1x variants (add penalty for higher scales) - const aPenalty = (aScale - 1) * 100; - const bPenalty = (bScale - 1) * 100; + // Strongly prefer 1x variants (add penalty for higher scales) + const aPenalty = (aScale - 1) * 100; + const bPenalty = (bScale - 1) * 100; - // Find closest match to preferred size, with scale penalty - return ( - Math.abs(aSize - preferredSize) + - aPenalty - - (Math.abs(bSize - preferredSize) + bPenalty) - ); - })[0]; + // Find closest match to preferred size, with scale penalty + return ( + Math.abs(aSize - preferredSize) + + aPenalty - + (Math.abs(bSize - preferredSize) + bPenalty) + ); + })[0]; - const contentUuid = file.content_identity.uuid; - const url = `${serverUrl}/sidecar/${libraryId}/${contentUuid}/${thumbnail.kind}/${thumbnail.variant}.${thumbnail.format}`; + const contentUuid = file.content_identity.uuid; + const url = `${serverUrl}/sidecar/${libraryId}/${contentUuid}/${thumbnail.kind}/${thumbnail.variant}.${thumbnail.format}`; - return url; - }; + return url; + }; - const thumbnailSrc = getThumbnailUrl(size); + const thumbnailSrc = getThumbnailUrl(size); - // This is jank and has to be done in several places. Ideally a util function. - const fileKind = - file?.content_identity?.kind && file.content_identity.kind !== "unknown" - ? file.content_identity.kind - : file.kind === "File" - ? file.extension || "File" - : file.kind; - // this too - const kindCapitalized = fileKind.charAt(0).toUpperCase() + fileKind.slice(1); + // Get content kind (prefers content_identity.kind, falls back to content_kind) + const contentKind = getContentKind(file); + const fileKind = + contentKind && contentKind !== "unknown" + ? contentKind + : file.kind === "File" + ? file.extension || "File" + : file.kind; + const kindCapitalized = + fileKind.charAt(0).toUpperCase() + fileKind.slice(1); - const icon = getIcon( - kindCapitalized, - true, // Dark theme - file.extension, - file.kind === "Directory", - ); + const icon = getIcon( + kindCapitalized, + true, // Dark theme + file.extension, + file.kind === "Directory", + ); - return ( -
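// editor's sketch — the variant-selection heuristic from getThumbnailUrl above,
// extracted so it can be read (and unit-tested) in isolation. The sidecar shape
// { kind, variant, format } is assumed from usage in this diff, not the exact
// type exported by @sd/ts-client.
interface SidecarLike {
	kind: string;
	variant: string; // e.g. "grid@1x", "detail@2x"
	format: string;
}

function pickThumbVariant(
	sidecars: SidecarLike[],
	targetSize: number,
): SidecarLike | null {
	const thumbs = sidecars.filter((s) => s.kind === "thumb");
	if (thumbs.length === 0) return null;

	// Below 400px, aim ~40% smaller than requested — same heuristic as above.
	const preferred = targetSize <= 400 ? targetSize * 0.6 : targetSize;

	// Digits before the first "x" in the variant name (matches the parse above).
	const sizeOf = (v: string) =>
		parseInt(v.split("x")[0]?.replace(/\D/g, "") || "0");
	// Scale factor from "@1x" / "@2x" / "@3x"; defaults to 1x.
	const scaleOf = (v: string) => {
		const m = v.match(/@(\d+)x/);
		return m ? parseInt(m[1]) : 1;
	};

	// Closest size wins; each scale step above 1x pays a flat 100-point penalty,
	// strongly preferring 1x variants.
	const cost = (s: SidecarLike) =>
		Math.abs(sizeOf(s.variant) - preferred) + (scaleOf(s.variant) - 1) * 100;
	return [...thumbs].sort((a, b) => cost(a) - cost(b))[0];
}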
- {/* Always show icon first (instant), then thumbnail loads over it */} - + return ( +
+ {/* Always show icon first (instant), then thumbnail loads over it */} + - {/* Load thumbnail if available */} - {thumbnailSrc && !thumbError && ( - {file.name} setThumbLoaded(true)} - onError={() => setThumbError(true)} - /> - )} + {/* Load thumbnail if available */} + {thumbnailSrc && !thumbError && ( + {file.name} setThumbLoaded(true)} + onError={() => setThumbError(true)} + /> + )} - {/* Thumbstrip scrubber overlay (for videos with thumbstrips) */} - {isVideo && hasThumbstrip && thumbLoaded && ( - - )} -
- ); + {/* Thumbstrip scrubber overlay (for videos with thumbstrips) */} + {isVideo && hasThumbstrip && thumbLoaded && ( + + )} +
+ ); }); export function Icon({ - file, - size = 24, - className, + file, + size = 24, + className, }: { - file: File; - size?: number; - className?: string; + file: File; + size?: number; + className?: string; }) { - // This is jank and has to be done in several places. Ideally a util function. - const fileKind = - file?.content_identity?.kind && file.content_identity.kind !== "unknown" - ? file.content_identity.kind - : file.kind === "File" - ? file.extension || "File" - : file.kind; - // this too - const kindCapitalized = fileKind.charAt(0).toUpperCase() + fileKind.slice(1); + // Get content kind (prefers content_identity.kind, falls back to content_kind) + const contentKind = getContentKind(file); + const fileKind = + contentKind && contentKind !== "unknown" + ? contentKind + : file.kind === "File" + ? file.extension || "File" + : file.kind; + const kindCapitalized = + fileKind.charAt(0).toUpperCase() + fileKind.slice(1); - const icon = getIcon( - kindCapitalized, - true, // Dark theme - file.extension, - file.kind === "Directory", - ); + const icon = getIcon( + kindCapitalized, + true, // Dark theme + file.extension, + file.kind === "Directory", + ); - return ( - - ); + return ( + + ); } diff --git a/packages/interface/src/components/Explorer/utils.ts b/packages/interface/src/components/Explorer/utils.ts index afc4a2197..6554916f1 100644 --- a/packages/interface/src/components/Explorer/utils.ts +++ b/packages/interface/src/components/Explorer/utils.ts @@ -2,66 +2,74 @@ import LaptopIcon from "@sd/assets/icons/Laptop.png"; import MobileIcon from "@sd/assets/icons/Mobile.png"; import ServerIcon from "@sd/assets/icons/Server.png"; import PCIcon from "@sd/assets/icons/PC.png"; -import type { SdPath } from "@sd/ts-client"; +import type { ContentKind, File, SdPath } from "@sd/ts-client"; + +/** + * Get the content kind for a file, preferring content_identity.kind if available, + * falling back to content_kind (identified by extension during ephemeral indexing). + */ +export function getContentKind(file: File | null | undefined): ContentKind { + return file?.content_identity?.kind ?? file?.content_kind ?? "unknown"; +} export function formatBytes(bytes: number): string { - if (bytes === 0) return "0 B"; - const k = 1024; - const sizes = ["B", "KB", "MB", "GB", "TB"]; - const i = Math.floor(Math.log(bytes) / Math.log(k)); - return Math.round(bytes / Math.pow(k, i)) + " " + sizes[i]; + if (bytes === 0) return "0 B"; + const k = 1024; + const sizes = ["B", "KB", "MB", "GB", "TB"]; + const i = Math.floor(Math.log(bytes) / Math.log(k)); + return Math.round(bytes / Math.pow(k, i)) + " " + sizes[i]; } export function formatRelativeTime(date: Date | string): string { - const d = typeof date === "string" ? new Date(date) : date; - const now = new Date(); - const diff = now.getTime() - d.getTime(); - const seconds = Math.floor(diff / 1000); - const minutes = Math.floor(seconds / 60); - const hours = Math.floor(minutes / 60); - const days = Math.floor(hours / 24); + const d = typeof date === "string" ? 
new Date(date) : date; + const now = new Date(); + const diff = now.getTime() - d.getTime(); + const seconds = Math.floor(diff / 1000); + const minutes = Math.floor(seconds / 60); + const hours = Math.floor(minutes / 60); + const days = Math.floor(hours / 24); - if (days > 7) return d.toLocaleDateString(); - if (days > 0) return `${days}d ago`; - if (hours > 0) return `${hours}h ago`; - if (minutes > 0) return `${minutes}m ago`; - return "Just now"; + if (days > 7) return d.toLocaleDateString(); + if (days > 0) return `${days}d ago`; + if (hours > 0) return `${hours}h ago`; + if (minutes > 0) return `${minutes}m ago`; + return "Just now"; } export function getDeviceIcon(os: string, model?: string): string { - const osLower = os.toLowerCase(); + const osLower = os.toLowerCase(); - if (osLower.includes("ios") || osLower.includes("android")) { - return MobileIcon; - } + if (osLower.includes("ios") || osLower.includes("android")) { + return MobileIcon; + } - if (osLower.includes("windows")) { - return PCIcon; - } + if (osLower.includes("windows")) { + return PCIcon; + } - if (osLower.includes("server") || model?.toLowerCase().includes("server")) { - return ServerIcon; - } + if (osLower.includes("server") || model?.toLowerCase().includes("server")) { + return ServerIcon; + } - return LaptopIcon; + return LaptopIcon; } export function sdPathToUri(sdPath: SdPath): string { - if ("Physical" in sdPath) { - const { device_slug, path } = sdPath.Physical; - return `local://${device_slug}${path}`; - } + if ("Physical" in sdPath) { + const { device_slug, path } = sdPath.Physical; + return `local://${device_slug}${path}`; + } - if ("Cloud" in sdPath) { - const { service, identifier, path } = sdPath.Cloud; - const scheme = service.toLowerCase(); - return `${scheme}://${identifier}/${path}`; - } + if ("Cloud" in sdPath) { + const { service, identifier, path } = sdPath.Cloud; + const scheme = service.toLowerCase(); + return `${scheme}://${identifier}/${path}`; + } - if ("Content" in sdPath) { - const { content_id } = sdPath.Content; - return `content://${content_id}`; - } + if ("Content" in sdPath) { + const { content_id } = sdPath.Content; + return `content://${content_id}`; + } - return ""; + return ""; } diff --git a/packages/interface/src/components/Explorer/views/GridView/FileCard.tsx b/packages/interface/src/components/Explorer/views/GridView/FileCard.tsx index 8cc3802c2..04a2d09d0 100644 --- a/packages/interface/src/components/Explorer/views/GridView/FileCard.tsx +++ b/packages/interface/src/components/Explorer/views/GridView/FileCard.tsx @@ -27,504 +27,592 @@ import { useContextMenu } from "../../../../hooks/useContextMenu"; import { useJobDispatch } from "../../../../hooks/useJobDispatch"; import { useLibraryMutation } from "../../../../context"; import { usePlatform } from "../../../../platform"; -import { formatBytes } from "../../utils"; +import { formatBytes, getContentKind } from "../../utils"; import { TagDot } from "../../../Tags"; import { useDraggable } from "@dnd-kit/core"; interface FileCardProps { - file: File; - fileIndex: number; - allFiles: File[]; - selected: boolean; - focused: boolean; - selectedFiles: File[]; - selectFile: (file: File, files: File[], multi?: boolean, range?: boolean) => void; + file: File; + fileIndex: number; + allFiles: File[]; + selected: boolean; + focused: boolean; + selectedFiles: File[]; + selectFile: ( + file: File, + files: File[], + multi?: boolean, + range?: boolean, + ) => void; } -export const FileCard = memo(function FileCard({ file, fileIndex, 
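// editor's note — getContentKind, added to utils.ts above, centralizes the
// fallback chain that the rest of this patch swaps in everywhere. A minimal
// illustration of the behavior it encodes (File is reduced to the two fields
// involved; the real type comes from @sd/ts-client):
type KindSource = {
	content_identity?: { kind?: string } | null;
	content_kind?: string | null;
};

function kindOf(file: KindSource | null | undefined): string {
	// 1) full content identification, 2) extension-based guess recorded during
	// ephemeral indexing, 3) "unknown" as the terminal fallback.
	return file?.content_identity?.kind ?? file?.content_kind ?? "unknown";
}

kindOf({ content_identity: { kind: "video" } }); // "video"
kindOf({ content_kind: "image" }); // "image" — not yet fully identified
kindOf(null); // "unknown"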
allFiles, selected, focused, selectedFiles, selectFile }: FileCardProps) { - const { setCurrentPath, viewSettings, currentPath } = useExplorer(); - const { gridSize, showFileSize } = viewSettings; - const platform = usePlatform(); - const copyFiles = useLibraryMutation("files.copy"); - const deleteFiles = useLibraryMutation("files.delete"); - const { runJob } = useJobDispatch(); +export const FileCard = memo( + function FileCard({ + file, + fileIndex, + allFiles, + selected, + focused, + selectedFiles, + selectFile, + }: FileCardProps) { + const { setCurrentPath, viewSettings, currentPath } = useExplorer(); + const { gridSize, showFileSize } = viewSettings; + const platform = usePlatform(); + const copyFiles = useLibraryMutation("files.copy"); + const deleteFiles = useLibraryMutation("files.delete"); + const { runJob } = useJobDispatch(); - // Get the files to operate on (multi-select or just this file) - const getTargetFiles = () => { - if (selected && selectedFiles.length > 0) { - return selectedFiles; - } - return [file]; - }; + // Get the files to operate on (multi-select or just this file) + const getTargetFiles = () => { + if (selected && selectedFiles.length > 0) { + return selectedFiles; + } + return [file]; + }; - const contextMenu = useContextMenu({ - items: [ - { - icon: Eye, - label: "Quick Look", - onClick: () => { - console.log("Quick Look:", file.name); - // TODO: Implement quick look - }, - keybind: "Space", - }, - { - icon: FolderOpen, - label: "Open", - onClick: () => { - if (file.kind === "Directory") { - setCurrentPath(file.sd_path); - } else { - console.log("Open file:", file.name); - // TODO: Implement file opening - } - }, - keybind: "⌘O", - condition: () => file.kind === "Directory" || file.kind === "File", - }, - { - icon: MagnifyingGlass, - label: "Show in Finder", - onClick: async () => { - // Extract the physical path from SdPath - if ("Physical" in file.sd_path) { - const physicalPath = file.sd_path.Physical.path; - if (platform.revealFile) { - try { - await platform.revealFile(physicalPath); - } catch (err) { - console.error("Failed to reveal file:", err); - alert(`Failed to reveal file: ${err}`); - } - } else { - console.log("revealFile not supported on this platform"); - } - } else { - console.log("Cannot reveal non-physical file"); - } - }, - keybind: "⌘⇧R", - condition: () => "Physical" in file.sd_path && !!platform.revealFile, - }, - { type: "separator" }, - { - icon: Copy, - label: selected && selectedFiles.length > 1 ? 
`Copy ${selectedFiles.length} items` : "Copy", - onClick: async () => { - const targets = getTargetFiles(); - const sdPaths = targets.map(f => f.sd_path); + const contextMenu = useContextMenu({ + items: [ + { + icon: Eye, + label: "Quick Look", + onClick: () => { + console.log("Quick Look:", file.name); + // TODO: Implement quick look + }, + keybind: "Space", + }, + { + icon: FolderOpen, + label: "Open", + onClick: () => { + if (file.kind === "Directory") { + setCurrentPath(file.sd_path); + } else { + console.log("Open file:", file.name); + // TODO: Implement file opening + } + }, + keybind: "⌘O", + condition: () => + file.kind === "Directory" || file.kind === "File", + }, + { + icon: MagnifyingGlass, + label: "Show in Finder", + onClick: async () => { + // Extract the physical path from SdPath + if ("Physical" in file.sd_path) { + const physicalPath = file.sd_path.Physical.path; + if (platform.revealFile) { + try { + await platform.revealFile(physicalPath); + } catch (err) { + console.error( + "Failed to reveal file:", + err, + ); + alert(`Failed to reveal file: ${err}`); + } + } else { + console.log( + "revealFile not supported on this platform", + ); + } + } else { + console.log("Cannot reveal non-physical file"); + } + }, + keybind: "⌘⇧R", + condition: () => + "Physical" in file.sd_path && !!platform.revealFile, + }, + { type: "separator" }, + { + icon: Copy, + label: + selected && selectedFiles.length > 1 + ? `Copy ${selectedFiles.length} items` + : "Copy", + onClick: async () => { + const targets = getTargetFiles(); + const sdPaths = targets.map((f) => f.sd_path); - console.log("Copying files:", targets.map(f => f.name)); + console.log( + "Copying files:", + targets.map((f) => f.name), + ); - // Store the file paths for paste - window.__SPACEDRIVE__ = window.__SPACEDRIVE__ || {}; - window.__SPACEDRIVE__.clipboard = { - operation: 'copy', - files: sdPaths, - sourcePath: currentPath, - }; + // Store the file paths for paste + window.__SPACEDRIVE__ = window.__SPACEDRIVE__ || {}; + window.__SPACEDRIVE__.clipboard = { + operation: "copy", + files: sdPaths, + sourcePath: currentPath, + }; - console.log(`Copied ${sdPaths.length} files to clipboard`); - }, - keybind: "⌘C", - }, - { - icon: Copy, - label: "Paste", - onClick: async () => { - const clipboard = window.__SPACEDRIVE__?.clipboard; - if (!clipboard || !clipboard.files || !currentPath) { - console.log("Nothing to paste or no destination"); - return; - } + console.log( + `Copied ${sdPaths.length} files to clipboard`, + ); + }, + keybind: "⌘C", + }, + { + icon: Copy, + label: "Paste", + onClick: async () => { + const clipboard = window.__SPACEDRIVE__?.clipboard; + if (!clipboard || !clipboard.files || !currentPath) { + console.log("Nothing to paste or no destination"); + return; + } - console.log(`Pasting ${clipboard.files.length} files to:`, currentPath); + console.log( + `Pasting ${clipboard.files.length} files to:`, + currentPath, + ); - try { - console.log("Paste params:", { - sources: clipboard.files, - destination: currentPath, - }); + try { + console.log("Paste params:", { + sources: clipboard.files, + destination: currentPath, + }); - const result = await copyFiles.mutateAsync({ - sources: { paths: clipboard.files }, - destination: currentPath, - overwrite: false, - verify_checksum: false, - preserve_timestamps: true, - move_files: false, - copy_method: "Auto" as const, - }); + const result = await copyFiles.mutateAsync({ + sources: { paths: clipboard.files }, + destination: currentPath, + overwrite: false, + verify_checksum: 
false, + preserve_timestamps: true, + move_files: false, + copy_method: "Auto" as const, + }); - console.log("Paste operation result:", result); - console.log("Result type:", typeof result, result); + console.log("Paste operation result:", result); + console.log("Result type:", typeof result, result); - // Check if it's a confirmation request - if (result && typeof result === 'object' && 'NeedsConfirmation' in result) { - console.log("Action needs confirmation:", result); - alert("File conflict detected - confirmation UI not implemented yet"); - } else if (result && typeof result === 'object' && 'job_id' in result) { - console.log("Job started with ID:", result.job_id); - } - } catch (err) { - console.error("Failed to paste:", err); - alert(`Failed to paste: ${err}`); - } - }, - keybind: "⌘V", - condition: () => { - const clipboard = window.__SPACEDRIVE__?.clipboard; - return !!clipboard && !!clipboard.files && clipboard.files.length > 0; - }, - }, - // Media Processing submenu - { - type: "submenu", - icon: Image, - label: "Image Processing", - condition: () => file.content_identity?.kind === "image", - submenu: [ - { - icon: Sparkle, - label: "Generate Blurhash", - onClick: async () => { - const targets = getTargetFiles(); - await runJob("thumbnail", { - file_ids: targets.map((f) => f.id), - generate_blurhash: true, - }); - }, - condition: () => !file.image_media_data?.blurhash, - }, - { - icon: Crop, - label: "Regenerate Thumbnail", - onClick: async () => { - const targets = getTargetFiles(); - await runJob("thumbnail", { - file_ids: targets.map((f) => f.id), - force: true, - }); - }, - }, - { - icon: TextAa, - label: "Extract Text (OCR)", - onClick: async () => { - const targets = getTargetFiles(); - await runJob("ocr", { - file_ids: targets.map((f) => f.id), - }); - }, - keybind: "⌘⇧T", - }, - ], - }, - { - type: "submenu", - icon: Video, - label: "Video Processing", - condition: () => file.content_identity?.kind === "video", - submenu: [ - { - icon: FilmStrip, - label: "Generate Thumbstrip", - onClick: async () => { - const targets = getTargetFiles(); - await runJob("thumbstrip", { - file_ids: targets.map((f) => f.id), - frame_count: 10, - }); - }, - condition: () => !file.sidecars?.some((s) => s.kind === "thumbstrip"), - }, - { - icon: Sparkle, - label: "Generate Blurhash", - onClick: async () => { - const targets = getTargetFiles(); - await runJob("thumbnail", { - file_ids: targets.map((f) => f.id), - generate_blurhash: true, - }); - }, - condition: () => !file.video_media_data?.blurhash, - }, - { - icon: Crop, - label: "Regenerate Thumbnail", - onClick: async () => { - const targets = getTargetFiles(); - await runJob("thumbnail", { - file_ids: targets.map((f) => f.id), - force: true, - }); - }, - }, - { - icon: Waveform, - label: "Extract Subtitles", - onClick: async () => { - const targets = getTargetFiles(); - await runJob("speech_to_text", { - file_ids: targets.map((f) => f.id), - output_format: "srt", - }); - }, - }, - { - icon: FileVideo, - label: "Generate Proxy", - onClick: async () => { - const targets = getTargetFiles(); - await runJob("proxy", { - file_ids: targets.map((f) => f.id), - quality: "720p", - }); - }, - keybind: "⌘⇧P", - }, - ], - }, - { - type: "submenu", - icon: Microphone, - label: "Audio Processing", - condition: () => file.content_identity?.kind === "audio", - submenu: [ - { - icon: TextAa, - label: "Transcribe Audio", - onClick: async () => { - const targets = getTargetFiles(); - await runJob("speech_to_text", { - file_ids: targets.map((f) => f.id), - 
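// editor's sketch — the copy handler above stashes clipboard state on a window
// global. Declaring that shape once would type-check every access site; the
// fields mirror exactly what the handlers above write (sourcePath's nullability
// is an assumption based on currentPath's usage):
import type { SdPath } from "@sd/ts-client";

declare global {
	interface Window {
		__SPACEDRIVE__?: {
			clipboard?: {
				operation: "copy";
				files: SdPath[];
				sourcePath: SdPath | null;
			};
		};
	}
}

export {}; // keeps this snippet a module so the global augmentation applies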
model: "whisper-base", - }); - }, - keybind: "⌘⇧T", - }, - ], - }, - { - type: "submenu", - icon: FileText, - label: "Document Processing", - condition: () => file.kind === "File" && ["pdf", "doc", "docx"].includes(file.extension || ""), - submenu: [ - { - icon: TextAa, - label: "Extract Text (OCR)", - onClick: async () => { - const targets = getTargetFiles(); - await runJob("ocr", { - file_ids: targets.map((f) => f.id), - }); - }, - keybind: "⌘⇧T", - }, - { - icon: Crop, - label: "Regenerate Thumbnail", - onClick: async () => { - const targets = getTargetFiles(); - await runJob("thumbnail", { - file_ids: targets.map((f) => f.id), - force: true, - }); - }, - }, - ], - }, - // Batch operations submenu - { - type: "submenu", - icon: Stack, - label: `Process ${selectedFiles.length} Items`, - condition: () => selected && selectedFiles.length > 1, - submenu: [ - { - icon: Crop, - label: "Regenerate All Thumbnails", - onClick: async () => { - await runJob("thumbnail", { - file_ids: selectedFiles.map((f) => f.id), - force: true, - }); - }, - }, - { - icon: Sparkle, - label: "Generate Blurhashes", - onClick: async () => { - await runJob("thumbnail", { - file_ids: selectedFiles.map((f) => f.id), - generate_blurhash: true, - }); - }, - keybind: "⌘⇧B", - }, - { - icon: TextAa, - label: "Extract Text from All", - onClick: async () => { - await runJob("ocr", { - file_ids: selectedFiles.map((f) => f.id), - }); - }, - }, - { - icon: FilmStrip, - label: "Generate Thumbstrips (Videos)", - onClick: async () => { - const videos = selectedFiles.filter((f) => f.content_identity?.kind === "video"); - if (videos.length > 0) { - await runJob("thumbstrip", { - file_ids: videos.map((f) => f.id), - }); - } - }, - condition: () => selectedFiles.some((f) => f.content_identity?.kind === "video"), - }, - ], - }, - { type: "separator" }, - { - icon: Trash, - label: selected && selectedFiles.length > 1 ? `Delete ${selectedFiles.length} items` : "Delete", - onClick: async () => { - const targets = getTargetFiles(); - const message = targets.length > 1 - ? 
`Delete ${targets.length} items?` - : `Delete "${file.name}"?`; + // Check if it's a confirmation request + if ( + result && + typeof result === "object" && + "NeedsConfirmation" in result + ) { + console.log( + "Action needs confirmation:", + result, + ); + alert( + "File conflict detected - confirmation UI not implemented yet", + ); + } else if ( + result && + typeof result === "object" && + "job_id" in result + ) { + console.log( + "Job started with ID:", + result.job_id, + ); + } + } catch (err) { + console.error("Failed to paste:", err); + alert(`Failed to paste: ${err}`); + } + }, + keybind: "⌘V", + condition: () => { + const clipboard = window.__SPACEDRIVE__?.clipboard; + return ( + !!clipboard && + !!clipboard.files && + clipboard.files.length > 0 + ); + }, + }, + // Media Processing submenu + { + type: "submenu", + icon: Image, + label: "Image Processing", + condition: () => getContentKind(file) === "image", + submenu: [ + { + icon: Sparkle, + label: "Generate Blurhash", + onClick: async () => { + const targets = getTargetFiles(); + await runJob("thumbnail", { + file_ids: targets.map((f) => f.id), + generate_blurhash: true, + }); + }, + condition: () => !file.image_media_data?.blurhash, + }, + { + icon: Crop, + label: "Regenerate Thumbnail", + onClick: async () => { + const targets = getTargetFiles(); + await runJob("thumbnail", { + file_ids: targets.map((f) => f.id), + force: true, + }); + }, + }, + { + icon: TextAa, + label: "Extract Text (OCR)", + onClick: async () => { + const targets = getTargetFiles(); + await runJob("ocr", { + file_ids: targets.map((f) => f.id), + }); + }, + keybind: "⌘⇧T", + }, + ], + }, + { + type: "submenu", + icon: Video, + label: "Video Processing", + condition: () => getContentKind(file) === "video", + submenu: [ + { + icon: FilmStrip, + label: "Generate Thumbstrip", + onClick: async () => { + const targets = getTargetFiles(); + await runJob("thumbstrip", { + file_ids: targets.map((f) => f.id), + frame_count: 10, + }); + }, + condition: () => + !file.sidecars?.some( + (s) => s.kind === "thumbstrip", + ), + }, + { + icon: Sparkle, + label: "Generate Blurhash", + onClick: async () => { + const targets = getTargetFiles(); + await runJob("thumbnail", { + file_ids: targets.map((f) => f.id), + generate_blurhash: true, + }); + }, + condition: () => !file.video_media_data?.blurhash, + }, + { + icon: Crop, + label: "Regenerate Thumbnail", + onClick: async () => { + const targets = getTargetFiles(); + await runJob("thumbnail", { + file_ids: targets.map((f) => f.id), + force: true, + }); + }, + }, + { + icon: Waveform, + label: "Extract Subtitles", + onClick: async () => { + const targets = getTargetFiles(); + await runJob("speech_to_text", { + file_ids: targets.map((f) => f.id), + output_format: "srt", + }); + }, + }, + { + icon: FileVideo, + label: "Generate Proxy", + onClick: async () => { + const targets = getTargetFiles(); + await runJob("proxy", { + file_ids: targets.map((f) => f.id), + quality: "720p", + }); + }, + keybind: "⌘⇧P", + }, + ], + }, + { + type: "submenu", + icon: Microphone, + label: "Audio Processing", + condition: () => getContentKind(file) === "audio", + submenu: [ + { + icon: TextAa, + label: "Transcribe Audio", + onClick: async () => { + const targets = getTargetFiles(); + await runJob("speech_to_text", { + file_ids: targets.map((f) => f.id), + model: "whisper-base", + }); + }, + keybind: "⌘⇧T", + }, + ], + }, + { + type: "submenu", + icon: FileText, + label: "Document Processing", + condition: () => + file.kind === "File" && + ["pdf", 
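// editor's sketch — the paste handler above distinguishes its result with `in`
// checks. The union those checks imply looks like this (variant names are taken
// verbatim from the checks; the payload types are assumptions):
type CopyDispatchResult =
	| { job_id: string }
	| { NeedsConfirmation: unknown };

function describeResult(result: CopyDispatchResult): string {
	if ("NeedsConfirmation" in result) {
		// Conflict path — the UI above currently just alert()s.
		return "needs confirmation";
	}
	return `job started: ${result.job_id}`;
}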
"doc", "docx"].includes(file.extension || ""), + submenu: [ + { + icon: TextAa, + label: "Extract Text (OCR)", + onClick: async () => { + const targets = getTargetFiles(); + await runJob("ocr", { + file_ids: targets.map((f) => f.id), + }); + }, + keybind: "⌘⇧T", + }, + { + icon: Crop, + label: "Regenerate Thumbnail", + onClick: async () => { + const targets = getTargetFiles(); + await runJob("thumbnail", { + file_ids: targets.map((f) => f.id), + force: true, + }); + }, + }, + ], + }, + // Batch operations submenu + { + type: "submenu", + icon: Stack, + label: `Process ${selectedFiles.length} Items`, + condition: () => selected && selectedFiles.length > 1, + submenu: [ + { + icon: Crop, + label: "Regenerate All Thumbnails", + onClick: async () => { + await runJob("thumbnail", { + file_ids: selectedFiles.map((f) => f.id), + force: true, + }); + }, + }, + { + icon: Sparkle, + label: "Generate Blurhashes", + onClick: async () => { + await runJob("thumbnail", { + file_ids: selectedFiles.map((f) => f.id), + generate_blurhash: true, + }); + }, + keybind: "⌘⇧B", + }, + { + icon: TextAa, + label: "Extract Text from All", + onClick: async () => { + await runJob("ocr", { + file_ids: selectedFiles.map((f) => f.id), + }); + }, + }, + { + icon: FilmStrip, + label: "Generate Thumbstrips (Videos)", + onClick: async () => { + const videos = selectedFiles.filter( + (f) => getContentKind(f) === "video", + ); + if (videos.length > 0) { + await runJob("thumbstrip", { + file_ids: videos.map((f) => f.id), + }); + } + }, + condition: () => + selectedFiles.some( + (f) => getContentKind(f) === "video", + ), + }, + ], + }, + { type: "separator" }, + { + icon: Trash, + label: + selected && selectedFiles.length > 1 + ? `Delete ${selectedFiles.length} items` + : "Delete", + onClick: async () => { + const targets = getTargetFiles(); + const message = + targets.length > 1 + ? 
`Delete ${targets.length} items?` + : `Delete "${file.name}"?`; - if (confirm(message)) { - console.log("Deleting files:", targets.map(f => f.name)); + if (confirm(message)) { + console.log( + "Deleting files:", + targets.map((f) => f.name), + ); - try { - const result = await deleteFiles.mutateAsync({ - targets: { paths: targets.map(f => f.sd_path) }, - permanent: false, // Move to trash, not permanent delete - recursive: true, // Allow deleting non-empty directories - }); - console.log("Delete operation started:", result); - } catch (err) { - console.error("Failed to delete:", err); - alert(`Failed to delete: ${err}`); - } - } - }, - keybind: "⌘⌫", - variant: "danger" as const, - }, - ], - }); + try { + const result = await deleteFiles.mutateAsync({ + targets: { + paths: targets.map((f) => f.sd_path), + }, + permanent: false, // Move to trash, not permanent delete + recursive: true, // Allow deleting non-empty directories + }); + console.log( + "Delete operation started:", + result, + ); + } catch (err) { + console.error("Failed to delete:", err); + alert(`Failed to delete: ${err}`); + } + } + }, + keybind: "⌘⌫", + variant: "danger" as const, + }, + ], + }); - const handleClick = (e: React.MouseEvent) => { - const multi = e.metaKey || e.ctrlKey; - const range = e.shiftKey; - selectFile(file, allFiles, multi, range); - }; + const handleClick = (e: React.MouseEvent) => { + const multi = e.metaKey || e.ctrlKey; + const range = e.shiftKey; + selectFile(file, allFiles, multi, range); + }; - const handleDoubleClick = () => { - if (file.kind === "Directory") { - setCurrentPath(file.sd_path); - } - }; + const handleDoubleClick = () => { + if (file.kind === "Directory") { + setCurrentPath(file.sd_path); + } + }; - const handleContextMenu = async (e: React.MouseEvent) => { - e.preventDefault(); - e.stopPropagation(); + const handleContextMenu = async (e: React.MouseEvent) => { + e.preventDefault(); + e.stopPropagation(); - if (!selected) { - selectFile(file, allFiles, false, false); - } + if (!selected) { + selectFile(file, allFiles, false, false); + } - await contextMenu.show(e); - }; + await contextMenu.show(e); + }; - const { attributes, listeners, setNodeRef, isDragging: dndIsDragging } = useDraggable({ - id: file.id, - data: { - type: "explorer-file", - sdPath: file.sd_path, - name: file.name, - file: file, - gridSize: gridSize, // Pass grid size for overlay - }, - }); + const { + attributes, + listeners, + setNodeRef, + isDragging: dndIsDragging, + } = useDraggable({ + id: file.id, + data: { + type: "explorer-file", + sdPath: file.sd_path, + name: file.name, + file: file, + gridSize: gridSize, // Pass grid size for overlay + }, + }); - const thumbSize = Math.max(gridSize * 0.6, 60); + const thumbSize = Math.max(gridSize * 0.6, 60); - return ( -
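// editor's sketch — useDraggable above ships an untyped data payload. The shape
// actually passed in this diff, written down for the matching drop handler
// (the field list and the "explorer-file" tag are taken from the call above):
import type { File, SdPath } from "@sd/ts-client";

interface ExplorerFileDragData {
	type: "explorer-file";
	sdPath: SdPath;
	name: string;
	file: File;
	gridSize: number; // forwarded so the drag overlay can match the grid cell size
}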
- -
- -
-
-
- {file.name} -
- {showFileSize && file.size > 0 && ( -
- {formatBytes(file.size)} -
- )} + return ( +
+ +
+ +
+
+
+ {file.name} +
+ {showFileSize && file.size > 0 && ( +
+ {formatBytes(file.size)} +
+ )} - {/* Tag Indicators */} - {file.tags && file.tags.length > 0 && ( -
t.canonical_name).join(', ')} - > - {file.tags.slice(0, 3).map((tag) => ( - - ))} - {file.tags.length > 3 && ( - - +{file.tags.length - 3} - - )} -
- )} -
-
-
- ); -}, (prev, next) => { - // Custom comparison - rerender if file object, selection, or focus changed - // Ignore selectedFiles and selectFile function reference changes - if (prev.file !== next.file) return false; // File object reference changed - if (prev.selected !== next.selected) return false; // Selection state changed - if (prev.focused !== next.focused) return false; // Focus state changed - if (prev.fileIndex !== next.fileIndex) return false; // Index changed - // Ignore: allFiles, selectedFiles, selectFile (passed through to handlers) - return true; // Props are equal, skip rerender -}); + {/* Tag Indicators */} + {file.tags && file.tags.length > 0 && ( +
t.canonical_name) + .join(", ")} + > + {file.tags.slice(0, 3).map((tag) => ( + + ))} + {file.tags.length > 3 && ( + + +{file.tags.length - 3} + + )} +
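// editor's sketch — the tag indicator block here caps the dots at three and
// renders a "+N" badge for the rest. The counting logic, extracted (Tag reduced
// to the fields used in this diff):
interface TagLike {
	id: string;
	canonical_name: string;
	color?: string | null;
}

function tagOverflow(tags: TagLike[], max = 3) {
	return {
		visible: tags.slice(0, max), // dots actually rendered
		hiddenCount: Math.max(0, tags.length - max), // the "+N" badge; 0 = hidden
		tooltip: tags.map((t) => t.canonical_name).join(", "), // title attribute
	};
}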
+ )} +
+
+
+ ); + }, + (prev, next) => { + // Custom comparison - rerender if file object, selection, or focus changed + // Ignore selectedFiles and selectFile function reference changes + if (prev.file !== next.file) return false; // File object reference changed + if (prev.selected !== next.selected) return false; // Selection state changed + if (prev.focused !== next.focused) return false; // Focus state changed + if (prev.fileIndex !== next.fileIndex) return false; // Index changed + // Ignore: allFiles, selectedFiles, selectFile (passed through to handlers) + return true; // Props are equal, skip rerender + }, +); diff --git a/packages/interface/src/components/Explorer/views/KnowledgeView.tsx b/packages/interface/src/components/Explorer/views/KnowledgeView.tsx index 086918393..38f76dc43 100644 --- a/packages/interface/src/components/Explorer/views/KnowledgeView.tsx +++ b/packages/interface/src/components/Explorer/views/KnowledgeView.tsx @@ -1,395 +1,425 @@ import { - Sparkle, - Tag as TagIcon, - Chat, - Database, - FilmStrip, - Image, - MusicNote, - File as FileIcon, - Folder, - FileText, + Sparkle, + Tag as TagIcon, + Chat, + Database, + FilmStrip, + Image, + MusicNote, + File as FileIcon, + Folder, + FileText, } from "@phosphor-icons/react"; import { KnowledgeInspector } from "../../../inspectors/KnowledgeInspector"; import { useExplorer } from "../context"; import { useNormalizedQuery } from "../../../context"; import type { File, ContentKind } from "@sd/ts-client"; +import { getContentKind } from "../utils"; import { useMemo } from "react"; import clsx from "clsx"; import { File as FileComponent } from "../File"; const CONTENT_KIND_ICONS: Record = { - image: Image, - video: FilmStrip, - audio: MusicNote, - document: FileText, - archive: Folder, - code: FileText, - text: FileText, - database: Database, - book: FileText, - font: FileText, - mesh: FileIcon, - config: FileText, - encrypted: FileIcon, - key: FileIcon, - executable: FileIcon, - binary: FileIcon, - spreadsheet: FileText, - presentation: FileText, - email: FileText, - calendar: FileText, - contact: FileText, - web: FileText, - shortcut: FileIcon, - package: Folder, - model_entry: FileIcon, - unknown: FileIcon, + image: Image, + video: FilmStrip, + audio: MusicNote, + document: FileText, + archive: Folder, + code: FileText, + text: FileText, + database: Database, + book: FileText, + font: FileText, + mesh: FileIcon, + config: FileText, + encrypted: FileIcon, + key: FileIcon, + executable: FileIcon, + binary: FileIcon, + spreadsheet: FileText, + presentation: FileText, + email: FileText, + calendar: FileText, + contact: FileText, + web: FileText, + shortcut: FileIcon, + package: Folder, + model_entry: FileIcon, + unknown: FileIcon, }; const CONTENT_KIND_LABELS: Record = { - image: "Images", - video: "Videos", - audio: "Audio", - document: "Documents", - archive: "Archives", - code: "Code", - text: "Text", - database: "Databases", - book: "Books", - font: "Fonts", - mesh: "3D Models", - config: "Config", - encrypted: "Encrypted", - key: "Keys", - executable: "Apps", - binary: "Binary", - spreadsheet: "Spreadsheets", - presentation: "Presentations", - email: "Emails", - calendar: "Calendar", - contact: "Contacts", - web: "Web", - shortcut: "Shortcuts", - package: "Packages", - model_entry: "Models", - unknown: "Other", + image: "Images", + video: "Videos", + audio: "Audio", + document: "Documents", + archive: "Archives", + code: "Code", + text: "Text", + database: "Databases", + book: "Books", + font: "Fonts", + mesh: "3D Models", + 
config: "Config", + encrypted: "Encrypted", + key: "Keys", + executable: "Apps", + binary: "Binary", + spreadsheet: "Spreadsheets", + presentation: "Presentations", + email: "Emails", + calendar: "Calendar", + contact: "Contacts", + web: "Web", + shortcut: "Shortcuts", + package: "Packages", + model_entry: "Models", + unknown: "Other", }; export function KnowledgeView() { - const { inspectorVisible, currentPath, sortBy, viewSettings } = useExplorer(); + const { inspectorVisible, currentPath, sortBy, viewSettings } = + useExplorer(); - const directoryQuery = useNormalizedQuery({ - wireMethod: "query:files.directory_listing", - input: currentPath - ? { - path: currentPath, - limit: null, - include_hidden: false, - sort_by: sortBy, - folders_first: viewSettings.foldersFirst, - } - : null, - resourceType: "file", - enabled: !!currentPath, - }); + const directoryQuery = useNormalizedQuery({ + wireMethod: "query:files.directory_listing", + input: currentPath + ? { + path: currentPath, + limit: null, + include_hidden: false, + sort_by: sortBy, + folders_first: viewSettings.foldersFirst, + } + : null, + resourceType: "file", + enabled: !!currentPath, + }); - const files = (directoryQuery.data?.files || []) as File[]; + const files = (directoryQuery.data?.files || []) as File[]; - // Group files by content kind - const filesByKind = useMemo(() => { - const groups = new Map(); + // Group files by content kind + const filesByKind = useMemo(() => { + const groups = new Map(); - files.forEach((file) => { - const kind = file.content_identity?.kind || "unknown"; - if (!groups.has(kind)) { - groups.set(kind, []); - } - groups.get(kind)!.push(file); - }); + files.forEach((file) => { + const kind = getContentKind(file) || "unknown"; + if (!groups.has(kind)) { + groups.set(kind, []); + } + groups.get(kind)!.push(file); + }); - // Sort by count and return top categories - return Array.from(groups.entries()) - .sort((a, b) => b[1].length - a[1].length) - .slice(0, 6); - }, [files]); + // Sort by count and return top categories + return Array.from(groups.entries()) + .sort((a, b) => b[1].length - a[1].length) + .slice(0, 6); + }, [files]); - // Collect all unique tags - const allTags = useMemo(() => { - const tagMap = new Map(); + // Collect all unique tags + const allTags = useMemo(() => { + const tagMap = new Map< + string, + { id: string; name: string; color: string; count: number } + >(); - files.forEach((file) => { - file.tags?.forEach((tag) => { - if (tagMap.has(tag.id)) { - tagMap.get(tag.id)!.count++; - } else { - tagMap.set(tag.id, { - id: tag.id, - name: tag.canonical_name, - color: tag.color || "#3B82F6", - count: 1, - }); - } - }); - }); + files.forEach((file) => { + file.tags?.forEach((tag) => { + if (tagMap.has(tag.id)) { + tagMap.get(tag.id)!.count++; + } else { + tagMap.set(tag.id, { + id: tag.id, + name: tag.canonical_name, + color: tag.color || "#3B82F6", + count: 1, + }); + } + }); + }); - return Array.from(tagMap.values()).sort((a, b) => b.count - a.count); - }, [files]); + return Array.from(tagMap.values()).sort((a, b) => b.count - a.count); + }, [files]); - return ( -
- {/* Main content area */} -
-
- {/* Header */} -
- -
-

Knowledge View

-

- AI-powered insights for {files.length} items -

-
-
+ return ( +
+ {/* Main content area */} +
+
+ {/* Header */} +
+ +
+

+ Knowledge View +

+

+ AI-powered insights for {files.length} items +

+
+
- {/* Content Piles */} -
-
- {filesByKind.map(([kind, kindFiles]) => ( - - ))} -
-
+ {/* Content Piles */} +
+
+ {filesByKind.map(([kind, kindFiles]) => ( + + ))} +
+
- {/* Tags */} - {allTags.length > 0 && ( -
-
- {allTags.map((tag) => ( - - ))} -
-
- )} + {/* Tags */} + {allTags.length > 0 && ( +
+
+ {allTags.map((tag) => ( + + ))} +
+
+ )} - {/* Summary & Conversations */} -
- {/* Summary */} -
-
-

- This directory contains {files.length} items across{" "} - {filesByKind.length} content types. -

- {filesByKind.length > 0 && ( -

- Most common type: {CONTENT_KIND_LABELS[filesByKind[0][0]]} ( - {filesByKind[0][1].length} items) -

- )} - {allTags.length > 0 && ( -

Tagged items: {allTags.reduce((sum, tag) => sum + tag.count, 0)}

- )} -
-
+ {/* Summary & Conversations */} +
+ {/* Summary */} +
+
+

+ This directory contains {files.length} items + across {filesByKind.length} content types. +

+ {filesByKind.length > 0 && ( +

+ Most common type:{" "} + {CONTENT_KIND_LABELS[filesByKind[0][0]]}{" "} + ({filesByKind[0][1].length} items) +

+ )} + {allTags.length > 0 && ( +

+ Tagged items:{" "} + {allTags.reduce( + (sum, tag) => sum + tag.count, + 0, + )} +

+ )} +
+
- {/* Conversations */} -
-
- - -
-
-
+ {/* Conversations */} +
+
+ + +
+
+
- {/* Intelligence Sidecars */} -
-
- - - - -
-
-
-
+ {/* Intelligence Sidecars */} +
+
+ + + + +
+
+
+
- {/* Dedicated Knowledge Inspector */} - {inspectorVisible && ( -
-
- -
-
- )} -
- ); + {/* Dedicated Knowledge Inspector */} + {inspectorVisible && ( +
+
+ +
+
+ )} +
+ ); } function Section({ - title, - icon: Icon, - children, + title, + icon: Icon, + children, }: { - title: string; - icon: React.ElementType; - children: React.ReactNode; + title: string; + icon: React.ElementType; + children: React.ReactNode; }) { - return ( -
-
- -

{title}

-
- {children} -
- ); + return ( +
+
+ +

{title}

+
+ {children} +
+ ); } function ContentPile({ - kind, - files, - totalCount, + kind, + files, + totalCount, }: { - kind: ContentKind; - files: File[]; - totalCount: number; + kind: ContentKind; + files: File[]; + totalCount: number; }) { - const Icon = CONTENT_KIND_ICONS[kind]; - const label = CONTENT_KIND_LABELS[kind]; + const Icon = CONTENT_KIND_ICONS[kind]; + const label = CONTENT_KIND_LABELS[kind]; - return ( - - ); + {/* Label */} +
+
{label}
+
+ {totalCount} items +
+
+ + ); } function ConversationCard({ - title, - preview, - time, + title, + preview, + time, }: { - title: string; - preview: string; - time: string; + title: string; + preview: string; + time: string; }) { - return ( - - ); + return ( + + ); } function SidecarItem({ - kind, - variant, - status, - size, + kind, + variant, + status, + size, }: { - kind: string; - variant: string; - status: "ready" | "pending"; - size: string; + kind: string; + variant: string; + status: "ready" | "pending"; + size: string; }) { - return ( -
-
- -
-
-
{kind}
-
{variant}
-
{size}
-
- - {status} - -
- ); + return ( +
+
+ +
+
+
{kind}
+
{variant}
+
{size}
+
+ + {status} + +
+ ); } diff --git a/packages/interface/src/components/Explorer/views/MediaView/MediaViewItem.tsx b/packages/interface/src/components/Explorer/views/MediaView/MediaViewItem.tsx index cd619c95d..d44c5f126 100644 --- a/packages/interface/src/components/Explorer/views/MediaView/MediaViewItem.tsx +++ b/packages/interface/src/components/Explorer/views/MediaView/MediaViewItem.tsx @@ -21,6 +21,7 @@ import type { File } from "@sd/ts-client"; import { File as FileComponent } from "../../File"; import { useExplorer } from "../../context"; import { useSelection } from "../../SelectionContext"; +import { getContentKind } from "../../utils"; import { useContextMenu } from "../../../../hooks/useContextMenu"; import { useJobDispatch } from "../../../../hooks/useJobDispatch"; import { useLibraryMutation } from "../../../../context"; @@ -29,7 +30,7 @@ import { usePlatform } from "../../../../platform"; function formatDuration(seconds: number): string { const mins = Math.floor(seconds / 60); const secs = Math.floor(seconds % 60); - return `${mins}:${String(secs).padStart(2, '0')}`; + return `${mins}:${String(secs).padStart(2, "0")}`; } interface MediaViewItemProps { @@ -37,7 +38,12 @@ interface MediaViewItemProps { allFiles: File[]; selected: boolean; focused: boolean; - onSelect: (file: File, files: File[], multi?: boolean, range?: boolean) => void; + onSelect: ( + file: File, + files: File[], + multi?: boolean, + range?: boolean, + ) => void; size: number; } @@ -90,7 +96,8 @@ export const MediaViewItem = memo(function MediaViewItem({ } }, keybind: "⌘⇧R", - condition: () => "Physical" in file.sd_path && !!platform.revealFile, + condition: () => + "Physical" in file.sd_path && !!platform.revealFile, }, { type: "separator" }, { @@ -140,7 +147,11 @@ export const MediaViewItem = memo(function MediaViewItem({ keybind: "⌘V", condition: () => { const clipboard = window.__SPACEDRIVE__?.clipboard; - return !!clipboard && !!clipboard.files && clipboard.files.length > 0; + return ( + !!clipboard && + !!clipboard.files && + clipboard.files.length > 0 + ); }, }, // Media Processing submenus @@ -148,7 +159,7 @@ export const MediaViewItem = memo(function MediaViewItem({ type: "submenu", icon: Image, label: "Image Processing", - condition: () => file.content_identity?.kind === "image", + condition: () => getContentKind(file) === "image", submenu: [ { icon: Sparkle, @@ -190,7 +201,7 @@ export const MediaViewItem = memo(function MediaViewItem({ type: "submenu", icon: Video, label: "Video Processing", - condition: () => file.content_identity?.kind === "video", + condition: () => getContentKind(file) === "video", submenu: [ { icon: FilmStrip, @@ -202,7 +213,10 @@ export const MediaViewItem = memo(function MediaViewItem({ frame_count: 10, }); }, - condition: () => !file.sidecars?.some((s) => s.kind === "thumbstrip"), + condition: () => + !file.sidecars?.some( + (s) => s.kind === "thumbstrip", + ), }, { icon: Sparkle, @@ -256,7 +270,7 @@ export const MediaViewItem = memo(function MediaViewItem({ type: "submenu", icon: Microphone, label: "Audio Processing", - condition: () => file.content_identity?.kind === "audio", + condition: () => getContentKind(file) === "audio", submenu: [ { icon: TextAa, @@ -314,7 +328,7 @@ export const MediaViewItem = memo(function MediaViewItem({ label: "Generate Thumbstrips (Videos)", onClick: async () => { const videos = selectedFiles.filter( - (f) => f.content_identity?.kind === "video", + (f) => getContentKind(f) === "video", ); if (videos.length > 0) { await runJob("thumbstrip", { @@ -323,7 +337,9 
@@ export const MediaViewItem = memo(function MediaViewItem({ } }, condition: () => - selectedFiles.some((f) => f.content_identity?.kind === "video"), + selectedFiles.some( + (f) => getContentKind(f) === "video", + ), }, ], }, @@ -344,7 +360,9 @@ export const MediaViewItem = memo(function MediaViewItem({ if (confirm(message)) { try { await deleteFiles.mutateAsync({ - targets: { paths: targets.map((f) => f.sd_path) }, + targets: { + paths: targets.map((f) => f.sd_path), + }, permanent: false, recursive: true, }); diff --git a/packages/interface/src/components/QuickPreview/ContentRenderer.tsx b/packages/interface/src/components/QuickPreview/ContentRenderer.tsx index a300fd08d..279d78fcb 100644 --- a/packages/interface/src/components/QuickPreview/ContentRenderer.tsx +++ b/packages/interface/src/components/QuickPreview/ContentRenderer.tsx @@ -1,12 +1,12 @@ import type { File, ContentKind } from "@sd/ts-client"; import { File as FileComponent } from "../Explorer/File"; -import { formatBytes } from "../Explorer/utils"; +import { formatBytes, getContentKind } from "../Explorer/utils"; import { usePlatform } from "../../platform"; import { useState, useEffect, useRef } from "react"; import { - MagnifyingGlassPlus, - MagnifyingGlassMinus, - ArrowCounterClockwise, + MagnifyingGlassPlus, + MagnifyingGlassMinus, + ArrowCounterClockwise, } from "@phosphor-icons/react"; import { VideoPlayer } from "./VideoPlayer"; import { AudioPlayer } from "./AudioPlayer"; @@ -14,325 +14,344 @@ import { useZoomPan } from "./useZoomPan"; import { Folder } from "@sd/assets/icons"; interface ContentRendererProps { - file: File; - onZoomChange?: (isZoomed: boolean) => void; + file: File; + onZoomChange?: (isZoomed: boolean) => void; } function ImageRenderer({ file, onZoomChange }: ContentRendererProps) { - const platform = usePlatform(); - const containerRef = useRef(null); - const [originalLoaded, setOriginalLoaded] = useState(false); - const [originalUrl, setOriginalUrl] = useState(null); - const { zoom, zoomIn, zoomOut, reset, isZoomed, transform } = useZoomPan(containerRef); + const platform = usePlatform(); + const containerRef = useRef(null); + const [originalLoaded, setOriginalLoaded] = useState(false); + const [originalUrl, setOriginalUrl] = useState(null); + const { zoom, zoomIn, zoomOut, reset, isZoomed, transform } = + useZoomPan(containerRef); - // Notify parent of zoom state changes - useEffect(() => { - onZoomChange?.(isZoomed); - }, [isZoomed, onZoomChange]); + // Notify parent of zoom state changes + useEffect(() => { + onZoomChange?.(isZoomed); + }, [isZoomed, onZoomChange]); - useEffect(() => { - if (!platform.convertFileSrc) { - return; - } + useEffect(() => { + if (!platform.convertFileSrc) { + return; + } - const sdPath = file.sd_path as any; - const physicalPath = sdPath?.Physical?.path; + const sdPath = file.sd_path as any; + const physicalPath = sdPath?.Physical?.path; - if (!physicalPath) { - console.log( - "[ImageRenderer] No physical path available, sd_path:", - file.sd_path, - ); - return; - } + if (!physicalPath) { + console.log( + "[ImageRenderer] No physical path available, sd_path:", + file.sd_path, + ); + return; + } - const url = platform.convertFileSrc(physicalPath); - console.log( - "[ImageRenderer] Loading original from:", - physicalPath, - "-> URL:", - url, - ); - setOriginalUrl(url); - }, [file, platform]); + const url = platform.convertFileSrc(physicalPath); + console.log( + "[ImageRenderer] Loading original from:", + physicalPath, + "-> URL:", + url, + ); + setOriginalUrl(url); + 
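// editor's note — the recurring change in this file (as in FileCard and Thumb
// above): menu conditions stop reading content_identity directly and route
// through the shared helper, so files identified only by extension still match.
import type { File } from "@sd/ts-client";
import { getContentKind } from "../../utils"; // relative path as imported in this file

const isVideo = (file: File) => getContentKind(file) === "video";
// previously: file.content_identity?.kind === "video" — undefined (and thus
// false) for ephemeral entries that only carry content_kind.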
}, [file, platform]); - // Get highest resolution thumbnail first - const getHighestResThumbnail = () => { - const thumbnails = file.sidecars?.filter((s) => s.kind === "thumb") || []; - if (thumbnails.length === 0) return null; + // Get highest resolution thumbnail first + const getHighestResThumbnail = () => { + const thumbnails = + file.sidecars?.filter((s) => s.kind === "thumb") || []; + if (thumbnails.length === 0) return null; - const highest = thumbnails.sort((a, b) => { - const aSize = parseInt( - a.variant.split("x")[0]?.replace(/\D/g, "") || "0", - ); - const bSize = parseInt( - b.variant.split("x")[0]?.replace(/\D/g, "") || "0", - ); - return bSize - aSize; - })[0]; + const highest = thumbnails.sort((a, b) => { + const aSize = parseInt( + a.variant.split("x")[0]?.replace(/\D/g, "") || "0", + ); + const bSize = parseInt( + b.variant.split("x")[0]?.replace(/\D/g, "") || "0", + ); + return bSize - aSize; + })[0]; - const serverUrl = (window as any).__SPACEDRIVE_SERVER_URL__; - const libraryId = (window as any).__SPACEDRIVE_LIBRARY_ID__; - const contentUuid = file.content_identity?.uuid; + const serverUrl = (window as any).__SPACEDRIVE_SERVER_URL__; + const libraryId = (window as any).__SPACEDRIVE_LIBRARY_ID__; + const contentUuid = file.content_identity?.uuid; - if (!serverUrl || !libraryId || !contentUuid) return null; + if (!serverUrl || !libraryId || !contentUuid) return null; - return `${serverUrl}/sidecar/${libraryId}/${contentUuid}/${highest.kind}/${highest.variant}.${highest.format}`; - }; + return `${serverUrl}/sidecar/${libraryId}/${contentUuid}/${highest.kind}/${highest.variant}.${highest.format}`; + }; - const thumbnailUrl = getHighestResThumbnail(); + const thumbnailUrl = getHighestResThumbnail(); - return ( -
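// editor's sketch — ImageRenderer's effect above (and the Video/Audio renderers
// below) repeat the same steps: unwrap a Physical sd_path, convert it with
// platform.convertFileSrc, stash the URL in state. One way the repetition could
// be factored — not part of this patch, just a hedged refactor sketch reusing
// the imports these renderers already have:
import { useEffect, useState } from "react";
import type { File } from "@sd/ts-client";
import { usePlatform } from "../../platform";

function usePhysicalFileUrl(file: File): string | null {
	const platform = usePlatform();
	const [url, setUrl] = useState<string | null>(null);

	useEffect(() => {
		if (!platform.convertFileSrc) return;
		// sd_path is a tagged union; only Physical entries carry a local path.
		const physicalPath = (file.sd_path as any)?.Physical?.path;
		if (!physicalPath) return;
		setUrl(platform.convertFileSrc(physicalPath));
	}, [file, platform]);

	return url;
}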
- {/* Zoom Controls */} -
- - - {zoom > 1 && ( - - )} -
+ return ( +
+ {/* Zoom Controls */} +
+ + + {zoom > 1 && ( + + )} +
- {/* Zoom level indicator */} - {zoom > 1 && ( -
- {Math.round(zoom * 100)}% -
- )} + {/* Zoom level indicator */} + {zoom > 1 && ( +
+ {Math.round(zoom * 100)}% +
+ )} - {/* Image container with zoom/pan transform */} -
- {/* High-res thumbnail (loads fast, shows immediately) */} - {thumbnailUrl && ( - {file.name} - )} + {/* Image container with zoom/pan transform */} +
+ {/* High-res thumbnail (loads fast, shows immediately) */} + {thumbnailUrl && ( + {file.name} + )} - {/* Original image (loads async, fades in when ready) */} - {originalUrl && ( - {file.name} setOriginalLoaded(true)} - onError={(e) => - console.error("[ImageRenderer] Original failed to load:", e) - } - draggable={false} - /> - )} -
-
- ); + {/* Original image (loads async, fades in when ready) */} + {originalUrl && ( + {file.name} setOriginalLoaded(true)} + onError={(e) => + console.error( + "[ImageRenderer] Original failed to load:", + e, + ) + } + draggable={false} + /> + )} +
+
+ ); } function VideoRenderer({ file, onZoomChange }: ContentRendererProps) { - const platform = usePlatform(); - const [videoUrl, setVideoUrl] = useState(null); + const platform = usePlatform(); + const [videoUrl, setVideoUrl] = useState(null); - useEffect(() => { - if (!platform.convertFileSrc) { - return; - } + useEffect(() => { + if (!platform.convertFileSrc) { + return; + } - const sdPath = file.sd_path as any; - const physicalPath = sdPath?.Physical?.path; + const sdPath = file.sd_path as any; + const physicalPath = sdPath?.Physical?.path; - if (!physicalPath) { - console.log("[VideoRenderer] No physical path available"); - return; - } + if (!physicalPath) { + console.log("[VideoRenderer] No physical path available"); + return; + } - const url = platform.convertFileSrc(physicalPath); - console.log( - "[VideoRenderer] Loading video from:", - physicalPath, - "-> URL:", - url, - ); - setVideoUrl(url); - }, [file, platform]); + const url = platform.convertFileSrc(physicalPath); + console.log( + "[VideoRenderer] Loading video from:", + physicalPath, + "-> URL:", + url, + ); + setVideoUrl(url); + }, [file, platform]); - if (!videoUrl) { - return ( -
- -
- ); - } + if (!videoUrl) { + return ( +
+ +
+ ); + } - return ; + return ( + + ); } function AudioRenderer({ file }: ContentRendererProps) { - const platform = usePlatform(); - const [audioUrl, setAudioUrl] = useState(null); + const platform = usePlatform(); + const [audioUrl, setAudioUrl] = useState(null); - useEffect(() => { - if (!platform.convertFileSrc) { - return; - } + useEffect(() => { + if (!platform.convertFileSrc) { + return; + } - const sdPath = file.sd_path as any; - const physicalPath = sdPath?.Physical?.path; + const sdPath = file.sd_path as any; + const physicalPath = sdPath?.Physical?.path; - if (!physicalPath) { - console.log("[AudioRenderer] No physical path available"); - return; - } + if (!physicalPath) { + console.log("[AudioRenderer] No physical path available"); + return; + } - const url = platform.convertFileSrc(physicalPath); - console.log( - "[AudioRenderer] Loading audio from:", - physicalPath, - "-> URL:", - url, - ); - setAudioUrl(url); - }, [file, platform]); + const url = platform.convertFileSrc(physicalPath); + console.log( + "[AudioRenderer] Loading audio from:", + physicalPath, + "-> URL:", + url, + ); + setAudioUrl(url); + }, [file, platform]); - if (!audioUrl) { - return ( -
-
- -
{file.name}
-
Loading...
-
-
- ); - } + if (!audioUrl) { + return ( +
+
+ +
+ {file.name} +
+
Loading...
+
+
+ ); + } - return ; + return ; } function DocumentRenderer({ file }: ContentRendererProps) { - return ( -
-
- -
{file.name}
-
- {file.content_identity?.kind ?? "unknown"} -
-
- {formatBytes(file.size || 0)} -
-
-
- ); + return ( +
+
+ +
+ {file.name} +
+
+ {getContentKind(file) ?? "unknown"} +
+
+ {formatBytes(file.size || 0)} +
+
+
+ ); } function TextRenderer({ file }: ContentRendererProps) { - // TODO: Load actual text content - return ( -
-
- -
{file.name}
-
Text File
-
- {formatBytes(file.size || 0)} -
-
- Full text preview coming soon -
-
-
- ); + // TODO: Load actual text content + return ( +
+
+ +
+ {file.name} +
+
Text File
+
+ {formatBytes(file.size || 0)} +
+
+ Full text preview coming soon +
+
+
+ ); } function DefaultRenderer({ file }: ContentRendererProps) { - return ( -
-
- -
{file.name}
-
- {file.content_identity?.kind ?? "unknown"} -
-
- {formatBytes(file.size || 0)} -
-
-
- ); + return ( +
+
+ +
+ {file.name} +
+
+ {getContentKind(file) ?? "unknown"} +
+
+ {formatBytes(file.size || 0)} +
+
+
+ ); } export function ContentRenderer({ file, onZoomChange }: ContentRendererProps) { - // Handle directories first - if (file.kind.type === "Directory") { - return ( -
- Folder Icon -
{file.name}
-
Folder
- {file.size > 0 && ( -
{formatBytes(file.size)}
- )} -
- ); - } + // Handle directories first + if (file.kind.type === "Directory") { + return ( +
+ Folder Icon +
{file.name}
+
Folder
+ {file.size > 0 && ( +
{formatBytes(file.size)}
+ )} +
+ ); + } - const kind = file.content_identity?.kind; + const kind = getContentKind(file); - switch (kind) { - case "image": - return ; - case "video": - return ; - case "audio": - return ; - case "document": - case "book": - case "spreadsheet": - case "presentation": - return ; - case "text": - case "code": - case "config": - return ; - default: - return ; - } + switch (kind) { + case "image": + return ; + case "video": + return ; + case "audio": + return ; + case "document": + case "book": + case "spreadsheet": + case "presentation": + return ; + case "text": + case "code": + case "config": + return ; + default: + return ; + } } diff --git a/packages/interface/src/components/QuickPreview/QuickPreview.tsx b/packages/interface/src/components/QuickPreview/QuickPreview.tsx index 3b8cfbec6..aa0e28211 100644 --- a/packages/interface/src/components/QuickPreview/QuickPreview.tsx +++ b/packages/interface/src/components/QuickPreview/QuickPreview.tsx @@ -2,150 +2,170 @@ import { useNormalizedQuery } from "../../context"; import { usePlatform } from "../../platform"; import type { File } from "@sd/ts-client"; import { useEffect, useState } from "react"; -import { formatBytes } from "../Explorer/utils"; +import { formatBytes, getContentKind } from "../Explorer/utils"; import { X } from "@phosphor-icons/react"; import { ContentRenderer } from "./ContentRenderer"; function MetadataPanel({ file }: { file: File }) { - return ( -
-
-
-
Name
-
{file.name}
-
+ return ( +
+
+
+
Name
+
+ {file.name} +
+
-
-
Kind
-
{file.content_identity?.kind ?? "unknown"}
-
+
+
Kind
+
+ {getContentKind(file) ?? "unknown"}
+
-
-
Size
-
{formatBytes(file.size || 0)}
-
+
+
Size
+
+ {formatBytes(file.size || 0)} +
+
- {file.extension && ( -
-
Extension
-
{file.extension}
-
- )} + {file.extension && ( +
+
+ Extension +
+
{file.extension}
+
+ )} - {file.created_at && ( -
-
Created
-
- {new Date(file.created_at).toLocaleString()} -
-
- )} + {file.created_at && ( +
+
+ Created +
+
+ {new Date(file.created_at).toLocaleString()} +
+
+ )} - {file.modified_at && ( -
-
Modified
-
- {new Date(file.modified_at).toLocaleString()} -
-
- )} -
-
- ); + {file.modified_at && ( +
+
+ Modified +
+
+ {new Date(file.modified_at).toLocaleString()} +
+
+ )} +
+
+ ); } export function QuickPreview() { - const platform = usePlatform(); - const [fileId, setFileId] = useState(null); + const platform = usePlatform(); + const [fileId, setFileId] = useState(null); - useEffect(() => { - // Extract file_id from window label - if (platform.getCurrentWindowLabel) { - const label = platform.getCurrentWindowLabel(); + useEffect(() => { + // Extract file_id from window label + if (platform.getCurrentWindowLabel) { + const label = platform.getCurrentWindowLabel(); - // Label format: "quick-preview-{file_id}" - const match = label.match(/^quick-preview-(.+)$/); - if (match) { - setFileId(match[1]); - } - } - }, [platform]); + // Label format: "quick-preview-{file_id}" + const match = label.match(/^quick-preview-(.+)$/); + if (match) { + setFileId(match[1]); + } + } + }, [platform]); - const { data: file, isLoading, error } = useNormalizedQuery<{ file_id: string }, File>({ - wireMethod: "query:files.by_id", - input: { file_id: fileId! }, - resourceType: "file", - resourceId: fileId!, - enabled: !!fileId, - }); + const { + data: file, + isLoading, + error, + } = useNormalizedQuery<{ file_id: string }, File>({ + wireMethod: "query:files.by_id", + input: { file_id: fileId! }, + resourceType: "file", + resourceId: fileId!, + enabled: !!fileId, + }); - const handleClose = () => { - if (platform.closeCurrentWindow) { - platform.closeCurrentWindow(); - } - }; + const handleClose = () => { + if (platform.closeCurrentWindow) { + platform.closeCurrentWindow(); + } + }; - // Keyboard shortcuts - useEffect(() => { - const handleKeyDown = (e: KeyboardEvent) => { - if (e.code === "Escape") { - handleClose(); - } - }; + // Keyboard shortcuts + useEffect(() => { + const handleKeyDown = (e: KeyboardEvent) => { + if (e.code === "Escape") { + handleClose(); + } + }; - window.addEventListener("keydown", handleKeyDown); - return () => window.removeEventListener("keydown", handleKeyDown); - }, []); + window.addEventListener("keydown", handleKeyDown); + return () => window.removeEventListener("keydown", handleKeyDown); + }, []); - if (isLoading || !file) { - return ( -
-
Loading...
-
- ); - } + if (isLoading || !file) { + return ( +
+
Loading...
+
+ ); + } - if (error) { - return ( -
-
-
Error loading file
-
{error.message}
-
-
- ); - } + if (error) { + return ( +
+
+
+ Error loading file +
+
{error.message}
+
+
+ ); + } - return ( -
- {/* Header */} -
-
{file.name}
- -
+ return ( +
+ {/* Header */} +
+
+ {file.name} +
+ +
- {/* Content Area */} -
- {/* File Content */} -
- -
+ {/* Content Area */} +
+ {/* File Content */} +
+ +
- {/* Metadata Sidebar */} - -
+ {/* Metadata Sidebar */} + +
- {/* Footer with keyboard hints */} -
-
- Press ESC to close -
-
-
- ); + {/* Footer with keyboard hints */} +
+
+ Press ESC to close +
+
+
+ ); } diff --git a/packages/interface/src/components/QuickPreview/QuickPreviewFullscreen.tsx b/packages/interface/src/components/QuickPreview/QuickPreviewFullscreen.tsx index 0cfd14037..3d46235ac 100644 --- a/packages/interface/src/components/QuickPreview/QuickPreviewFullscreen.tsx +++ b/packages/interface/src/components/QuickPreview/QuickPreviewFullscreen.tsx @@ -1,11 +1,12 @@ -import { createPortal } from 'react-dom'; -import { motion, AnimatePresence } from 'framer-motion'; -import { X, ArrowLeft, ArrowRight } from '@phosphor-icons/react'; -import { useEffect, useState } from 'react'; -import type { File } from '@sd/ts-client'; -import { useNormalizedQuery } from '../../context'; -import { ContentRenderer } from './ContentRenderer'; -import { TopBarPortal } from '../../TopBar'; +import { createPortal } from "react-dom"; +import { motion, AnimatePresence } from "framer-motion"; +import { X, ArrowLeft, ArrowRight } from "@phosphor-icons/react"; +import { useEffect, useState } from "react"; +import type { File } from "@sd/ts-client"; +import { useNormalizedQuery } from "../../context"; +import { ContentRenderer } from "./ContentRenderer"; +import { TopBarPortal } from "../../TopBar"; +import { getContentKind } from "../Explorer/utils"; interface QuickPreviewFullscreenProps { fileId: string; @@ -19,7 +20,7 @@ interface QuickPreviewFullscreenProps { inspectorWidth?: number; } -const PREVIEW_LAYER_ID = 'quick-preview-layer'; +const PREVIEW_LAYER_ID = "quick-preview-layer"; export function QuickPreviewFullscreen({ fileId, @@ -30,7 +31,7 @@ export function QuickPreviewFullscreen({ hasPrevious, hasNext, sidebarWidth = 0, - inspectorWidth = 0 + inspectorWidth = 0, }: QuickPreviewFullscreenProps) { const [portalTarget, setPortalTarget] = useState(null); const [isZoomed, setIsZoomed] = useState(false); @@ -40,10 +41,14 @@ export function QuickPreviewFullscreen({ setIsZoomed(false); }, [fileId]); - const { data: file, isLoading, error } = useNormalizedQuery<{ file_id: string }, File>({ - wireMethod: 'query:files.by_id', + const { + data: file, + isLoading, + error, + } = useNormalizedQuery<{ file_id: string }, File>({ + wireMethod: "query:files.by_id", input: { file_id: fileId }, - resourceType: 'file', + resourceType: "file", resourceId: fileId, enabled: !!fileId && isOpen, }); @@ -59,30 +64,33 @@ export function QuickPreviewFullscreen({ const handleKeyDown = (e: KeyboardEvent) => { // Only handle close events - let Explorer handle navigation - if (e.code === 'Escape' || e.code === 'Space') { + if (e.code === "Escape" || e.code === "Space") { e.preventDefault(); e.stopImmediatePropagation(); onClose(); } }; - window.addEventListener('keydown', handleKeyDown, { capture: true }); - return () => window.removeEventListener('keydown', handleKeyDown, { capture: true }); + window.addEventListener("keydown", handleKeyDown, { capture: true }); + return () => + window.removeEventListener("keydown", handleKeyDown, { + capture: true, + }); }, [isOpen, onClose]); // Get background style based on content type const getBackgroundClass = () => { - if (!file) return 'bg-black/90'; + if (!file) return "bg-black/90"; - switch (file.content_identity?.kind) { - case 'video': - return 'bg-black'; - case 'audio': - return 'audio-gradient'; - case 'image': - return 'bg-black/95'; + switch (getContentKind(file)) { + case "video": + return "bg-black"; + case "audio": + return "audio-gradient"; + case "image": + return "bg-black/95"; default: - return 'bg-black/90'; + return "bg-black/90"; } }; @@ -106,7 +114,9 @@ export 
function QuickPreviewFullscreen({ ) : error ? (
-
Error loading file
+
+ Error loading file +
{error.message}
@@ -123,14 +133,20 @@ export function QuickPreviewFullscreen({ disabled={!hasPrevious} className="rounded-md p-1.5 text-white/70 transition-colors hover:bg-white/10 hover:text-white disabled:opacity-30" > - +
@@ -154,25 +170,36 @@ export function QuickPreviewFullscreen({ {/* Content Area - padded to fit between sidebar/inspector, expands on zoom */}
- +
{/* Footer with keyboard hints */}
- ESC or{' '} - Space to close + ESC{" "} + or{" "} + Space{" "} + to close {(hasPrevious || hasNext) && ( <> - {' · '} - /{' '} - to navigate + {" · "} + + ← + {" "} + /{" "} + + → + {" "} + to navigate )}
diff --git a/packages/interface/src/hooks/useLocationChangeInvalidation.ts b/packages/interface/src/hooks/useLocationChangeInvalidation.ts new file mode 100644 index 000000000..ab9dffc89 --- /dev/null +++ b/packages/interface/src/hooks/useLocationChangeInvalidation.ts @@ -0,0 +1,94 @@ +/** + * useLocationChangeInvalidation - Invalidates directory listing queries when location index_mode changes + * + * When a user enables indexing for a location (index_mode changes from "none" to something else), + * we need to refetch directory listings because: + * - Before: Data came from ephemeral in-memory index + * - After: Data comes from persistent database + * + * This hook subscribes to location events and invalidates affected queries. + */ + +import { useEffect, useRef } from "react"; +import { useQueryClient } from "@tanstack/react-query"; +import { useSpacedriveClient } from "@sd/ts-client/hooks"; +import type { Event, LocationInfo } from "@sd/ts-client"; + +export function useLocationChangeInvalidation() { + const client = useSpacedriveClient(); + const queryClient = useQueryClient(); + const libraryId = client.getCurrentLibraryId(); + + // Track previous index_mode for each location to detect changes + const prevIndexModes = useRef>(new Map()); + + useEffect(() => { + if (!libraryId) return; + + let unsubscribe: (() => void) | undefined; + let isCancelled = false; + + const handleEvent = (event: Event) => { + // Only handle ResourceChanged events for locations + if (typeof event === "string" || !("ResourceChanged" in event)) { + return; + } + + const { resource_type, resource } = event.ResourceChanged; + if (resource_type !== "location") { + return; + } + + const location = resource as LocationInfo; + const locationId = location.id; + const newIndexMode = location.index_mode; + + // Get previous index_mode + const prevIndexMode = prevIndexModes.current.get(locationId); + + // Update tracked index_mode + prevIndexModes.current.set(locationId, newIndexMode); + + // Check if index_mode changed from "none" to something else + // This means the user just enabled indexing + if (prevIndexMode === "none" && newIndexMode !== "none") { + console.log( + `[useLocationChangeInvalidation] Location ${locationId} indexing enabled (${prevIndexMode} -> ${newIndexMode}), invalidating directory_listing queries`, + ); + + // Invalidate all directory_listing queries + // They will refetch and get data from the persistent index instead of ephemeral + queryClient.invalidateQueries({ + predicate: (query) => { + const key = query.queryKey; + return ( + Array.isArray(key) && + key[0] === "query:files.directory_listing" + ); + }, + }); + } + }; + + client + .subscribeFiltered( + { + resource_type: "location", + library_id: libraryId, + }, + handleEvent, + ) + .then((unsub) => { + if (isCancelled) { + unsub(); + } else { + unsubscribe = unsub; + } + }); + + return () => { + isCancelled = true; + unsubscribe?.(); + }; + }, [client, queryClient, libraryId]); +} diff --git a/packages/interface/src/inspectors/FileInspector.tsx b/packages/interface/src/inspectors/FileInspector.tsx index df949e9ba..11be52745 100644 --- a/packages/interface/src/inspectors/FileInspector.tsx +++ b/packages/interface/src/inspectors/FileInspector.tsx @@ -1,35 +1,36 @@ import { - Info, - Tag as TagIcon, - Calendar, - HardDrive, - Hash, - Fingerprint, - Palette, - Image, - ClockCounterClockwise, - DotsThree, - MapPin, - ChatCircle, - PaperPlaneRight, - Paperclip, - Sparkle, - TextAa, - Microphone, - ArrowsClockwise, - MagnifyingGlass, - Trash, - 
FilmStrip, - VideoCamera, + Info, + Tag as TagIcon, + Calendar, + HardDrive, + Hash, + Fingerprint, + Palette, + Image, + ClockCounterClockwise, + DotsThree, + MapPin, + ChatCircle, + PaperPlaneRight, + Paperclip, + Sparkle, + TextAa, + Microphone, + ArrowsClockwise, + MagnifyingGlass, + Trash, + FilmStrip, + VideoCamera, } from "@phosphor-icons/react"; import { useState } from "react"; +import { getContentKind } from "../components/Explorer/utils"; import { - InfoRow, - Tag, - Section, - Divider, - Tabs, - TabContent, + InfoRow, + Tag, + Section, + Divider, + Tabs, + TabContent, } from "../components/Inspector"; import { TagSelectorButton } from "../components/Tags"; import clsx from "clsx"; @@ -41,1024 +42,1139 @@ import { useContextMenu } from "../hooks/useContextMenu"; import { usePlatform } from "../platform"; interface FileInspectorProps { - file: File; + file: File; } export function FileInspector({ file }: FileInspectorProps) { - const [activeTab, setActiveTab] = useState("overview"); + const [activeTab, setActiveTab] = useState("overview"); - const fileQuery = useNormalizedQuery<{ file_id: string }, File>({ - wireMethod: "query:files.by_id", - input: { file_id: file?.id || "" }, - resourceType: "file", - resourceId: file?.id, // Filter batch events to only this file - enabled: !!file?.id, - }); + const fileQuery = useNormalizedQuery<{ file_id: string }, File>({ + wireMethod: "query:files.by_id", + input: { file_id: file?.id || "" }, + resourceType: "file", + resourceId: file?.id, // Filter batch events to only this file + enabled: !!file?.id, + }); - const fileData = fileQuery.data || file; + const fileData = fileQuery.data || file; - const tabs = [ - { id: "overview", label: "Overview", icon: Info }, - { id: "sidecars", label: "Sidecars", icon: Image }, - { id: "instances", label: "Instances", icon: MapPin }, - { id: "chat", label: "Chat", icon: ChatCircle, badge: 3 }, - { id: "activity", label: "Activity", icon: ClockCounterClockwise }, - { id: "details", label: "More", icon: DotsThree }, - ]; + const tabs = [ + { id: "overview", label: "Overview", icon: Info }, + { id: "sidecars", label: "Sidecars", icon: Image }, + { id: "instances", label: "Instances", icon: MapPin }, + { id: "chat", label: "Chat", icon: ChatCircle, badge: 3 }, + { id: "activity", label: "Activity", icon: ClockCounterClockwise }, + { id: "details", label: "More", icon: DotsThree }, + ]; - return ( - <> - {/* Tabs */} - + return ( + <> + {/* Tabs */} + - {/* Tab Content */} -
- - - + {/* Tab Content */} +
+ + + - - - + + + - - - + + + - - - + + + - - - + + + - - - -
- - ); + + + +
+ + ); } function OverviewTab({ file }: { file: File }) { - const formatDate = (dateStr: string) => { - const date = new Date(dateStr); - return date.toLocaleDateString("en-US", { - month: "short", - day: "numeric", - year: "numeric", - }); - }; + const formatDate = (dateStr: string) => { + const date = new Date(dateStr); + return date.toLocaleDateString("en-US", { + month: "short", + day: "numeric", + year: "numeric", + }); + }; - // Tag mutations - const applyTag = useLibraryMutation("tags.apply"); + // Tag mutations + const applyTag = useLibraryMutation("tags.apply"); - // AI Processing mutations - const extractText = useLibraryMutation("media.ocr.extract"); - const transcribeAudio = useLibraryMutation("media.speech.transcribe"); - const regenerateThumbnail = useLibraryMutation("media.thumbnail.regenerate"); - const generateThumbstrip = useLibraryMutation("media.thumbstrip.generate"); - const generateProxy = useLibraryMutation("media.proxy.generate"); + // AI Processing mutations + const extractText = useLibraryMutation("media.ocr.extract"); + const transcribeAudio = useLibraryMutation("media.speech.transcribe"); + const regenerateThumbnail = useLibraryMutation( + "media.thumbnail.regenerate", + ); + const generateThumbstrip = useLibraryMutation("media.thumbstrip.generate"); + const generateProxy = useLibraryMutation("media.proxy.generate"); - // Check content kind for available actions - const isImage = file?.content_identity?.kind === "image"; - const isVideo = file?.content_identity?.kind === "video"; - const isAudio = file?.content_identity?.kind === "audio"; - const hasText = file?.content_identity?.text_content; + // Check content kind for available actions + const isImage = getContentKind(file) === "image"; + const isVideo = getContentKind(file) === "video"; + const isAudio = getContentKind(file) === "audio"; + const hasText = file?.content_identity?.text_content; - const fileKind = - file?.content_identity?.kind && file.content_identity.kind !== "unknown" - ? file.content_identity.kind - : file.kind === "File" - ? file.extension || "File" - : file.kind; + const contentKind = getContentKind(file); + const fileKind = + contentKind && contentKind !== "unknown" + ? contentKind + : file.kind === "File" + ? file.extension || "File" + : file.kind; - return ( -
- {/* Thumbnail */} -
- -
+ return ( +
+ {/* Thumbnail */} +
+ +
- {/* File name */} -
-

- {file.name} - {file.extension ? `.${file.extension}` : ""} -

-

{fileKind}

-
+ {/* File name */} +
+

+ {file.name} + {file.extension ? `.${file.extension}` : ""} +

+

{fileKind}

+
- + - {/* Details */} -
- - {file.kind === "File" && file.extension && ( - - )} - -
+ {/* Details */} +
+ + {file.kind === "File" && file.extension && ( + + )} + +
- {/* Dates */} -
- {/* Show capture date for media files */} - {file.video_media_data?.date_captured && ( - - )} - {file.image_media_data?.date_taken && ( - - )} - - - {file.accessed_at && ( - - )} -
+ {/* Dates */} +
+ {/* Show capture date for media files */} + {file.video_media_data?.date_captured && ( + + )} + {file.image_media_data?.date_taken && ( + + )} + + + {file.accessed_at && ( + + )} +
- {/* Image Metadata */} - {file.image_media_data && ( -
- - {file.image_media_data.camera_make && ( - - )} - {file.image_media_data.lens_model && ( - - )} - {file.image_media_data.iso && ( - - )} - {file.image_media_data.focal_length && ( - - )} - {file.image_media_data.aperture && ( - - )} - {file.image_media_data.shutter_speed && ( - - )} -
- )} + {/* Image Metadata */} + {file.image_media_data && ( +
+ + {file.image_media_data.camera_make && ( + + )} + {file.image_media_data.lens_model && ( + + )} + {file.image_media_data.iso && ( + + )} + {file.image_media_data.focal_length && ( + + )} + {file.image_media_data.aperture && ( + + )} + {file.image_media_data.shutter_speed && ( + + )} +
+ )} - {/* Video Metadata */} - {file.video_media_data && ( -
- - {file.video_media_data.duration_seconds && ( - - )} - {file.video_media_data.codec && ( - - )} - {file.video_media_data.bit_rate && ( - - )} - {file.video_media_data.fps_num && - file.video_media_data.fps_den && - file.video_media_data.fps_den !== 0 && ( - - )} - {file.video_media_data.audio_codec && ( - - )} -
- )} + {/* Video Metadata */} + {file.video_media_data && ( +
+ + {file.video_media_data.duration_seconds && ( + + )} + {file.video_media_data.codec && ( + + )} + {file.video_media_data.bit_rate && ( + + )} + {file.video_media_data.fps_num && + file.video_media_data.fps_den && + file.video_media_data.fps_den !== 0 && ( + + )} + {file.video_media_data.audio_codec && ( + + )} +
+ )} - {/* Audio Metadata */} - {file.audio_media_data && ( -
- {file.audio_media_data.artist && ( - - )} - {file.audio_media_data.album && ( - - )} - {file.audio_media_data.title && ( - - )} - {file.audio_media_data.duration_seconds && ( - - )} - {file.audio_media_data.codec && ( - - )} - {file.audio_media_data.bit_rate && ( - - )} - {file.audio_media_data.genre && ( - - )} - {file.audio_media_data.year && ( - - )} -
- )} + {/* Audio Metadata */} + {file.audio_media_data && ( +
+ {file.audio_media_data.artist && ( + + )} + {file.audio_media_data.album && ( + + )} + {file.audio_media_data.title && ( + + )} + {file.audio_media_data.duration_seconds && ( + + )} + {file.audio_media_data.codec && ( + + )} + {file.audio_media_data.bit_rate && ( + + )} + {file.audio_media_data.genre && ( + + )} + {file.audio_media_data.year && ( + + )} +
+ )} - {/* Storage */} -
- - -
+ {/* Storage */} +
+ + +
- {/* Tags */} -
-
- {file.tags && - file.tags.length > 0 && - file.tags.map((tag) => ( - - {tag.canonical_name} - - ))} + {/* Tags */} +
+
+ {file.tags && + file.tags.length > 0 && + file.tags.map((tag) => ( + + {tag.canonical_name} + + ))} - {/* Add Tag Button */} - { - // Use content-based tagging by default (tags all instances) - // Fall back to entry-based if no content identity - await applyTag.mutateAsync({ - targets: file.content_identity?.uuid - ? { type: "Content", ids: [file.content_identity.uuid] } - : { type: "Entry", ids: [parseInt(file.id)] }, - tag_ids: [tag.id], - source: "User", - confidence: 1.0, - }); - }} - contextTags={file.tags || []} - fileId={file.id} - contentId={file.content_identity?.uuid} - trigger={ - - } - /> -
-
+ {/* Add Tag Button */} + { + // Use content-based tagging by default (tags all instances) + // Fall back to entry-based if no content identity + await applyTag.mutateAsync({ + targets: file.content_identity?.uuid + ? { + type: "Content", + ids: [file.content_identity.uuid], + } + : { + type: "Entry", + ids: [parseInt(file.id)], + }, + tag_ids: [tag.id], + source: "User", + confidence: 1.0, + }); + }} + contextTags={file.tags || []} + fileId={file.id} + contentId={file.content_identity?.uuid} + trigger={ + + } + /> +
+
- {/* AI Processing */} - {(isImage || isVideo || isAudio) && ( -
-
- {/* OCR for images */} - {isImage && ( - - )} + {/* AI Processing */} + {(isImage || isVideo || isAudio) && ( +
+
+ {/* OCR for images */} + {isImage && ( + + )} - {/* Speech-to-text for audio/video */} - {(isVideo || isAudio) && ( - - )} + {/* Speech-to-text for audio/video */} + {(isVideo || isAudio) && ( + + )} - {/* Regenerate thumbnails */} - {(isImage || isVideo) && ( - - )} + {/* Regenerate thumbnails */} + {(isImage || isVideo) && ( + + )} - {/* Generate thumbstrip (for videos) */} - {isVideo && ( - - )} + {/* Generate thumbstrip (for videos) */} + {isVideo && ( + + )} - {/* Generate proxy (for videos) */} - {isVideo && ( - - )} + {/* Generate proxy (for videos) */} + {isVideo && ( + + )} - {/* Show extracted text if available */} - {hasText && ( -
-
- - - - - Extracted Text - -
-
-                  {file.content_identity.text_content}
-                
-
- )} -
-
- )} -
- ); + {/* Show extracted text if available */} + {hasText && ( +
+
+ + + + + Extracted Text + +
+
+									{file.content_identity.text_content}
+								
+
+ )} +
+ + )} +
+ ); } function SidecarsTab({ file }: { file: File }) { - const sidecars = file.sidecars || []; - const platform = usePlatform(); + const sidecars = file.sidecars || []; + const platform = usePlatform(); - // Helper to get sidecar URL - const getSidecarUrl = (sidecar: any) => { - if (typeof window === "undefined") return null; - const serverUrl = (window as any).__SPACEDRIVE_SERVER_URL__; - const libraryId = (window as any).__SPACEDRIVE_LIBRARY_ID__; + // Helper to get sidecar URL + const getSidecarUrl = (sidecar: any) => { + if (typeof window === "undefined") return null; + const serverUrl = (window as any).__SPACEDRIVE_SERVER_URL__; + const libraryId = (window as any).__SPACEDRIVE_LIBRARY_ID__; - if (!serverUrl || !libraryId || !file.content_identity) return null; + if (!serverUrl || !libraryId || !file.content_identity) return null; - const contentUuid = file.content_identity.uuid; - return `${serverUrl}/sidecar/${libraryId}/${contentUuid}/${sidecar.kind}/${sidecar.variant}.${sidecar.format}`; - }; + const contentUuid = file.content_identity.uuid; + return `${serverUrl}/sidecar/${libraryId}/${contentUuid}/${sidecar.kind}/${sidecar.variant}.${sidecar.format}`; + }; - return ( -
-

- Derivative files and associated content generated by Spacedrive -

+ return ( +
+

+ Derivative files and associated content generated by Spacedrive +

- {sidecars.length === 0 ? ( -
- No sidecars generated yet -
- ) : ( -
- {sidecars.map((sidecar, i) => ( - - ))} -
- )} -
- ); + {sidecars.length === 0 ? ( +
+ No sidecars generated yet +
+ ) : ( +
+ {sidecars.map((sidecar, i) => ( + + ))} +
+ )} +
+ ); } function SidecarItem({ - sidecar, - file, - sidecarUrl, - platform, + sidecar, + file, + sidecarUrl, + platform, }: { - sidecar: any; - file: File; - sidecarUrl: string | null; - platform: ReturnType; + sidecar: any; + file: File; + sidecarUrl: string | null; + platform: ReturnType; }) { - const isImage = - (sidecar.kind === "thumb" || sidecar.kind === "thumbstrip") && - (sidecar.format === "webp" || - sidecar.format === "jpg" || - sidecar.format === "png"); + const isImage = + (sidecar.kind === "thumb" || sidecar.kind === "thumbstrip") && + (sidecar.format === "webp" || + sidecar.format === "jpg" || + sidecar.format === "png"); - const contextMenu = useContextMenu({ - items: [ - { - icon: MagnifyingGlass, - label: "Show in Finder", - onClick: async () => { - if ( - platform.getSidecarPath && - platform.revealFile && - file.content_identity - ) { - try { - const libraryId = (window as any).__SPACEDRIVE_LIBRARY_ID__; - if (!libraryId) { - console.error("Library ID not found"); - return; - } + const contextMenu = useContextMenu({ + items: [ + { + icon: MagnifyingGlass, + label: "Show in Finder", + onClick: async () => { + if ( + platform.getSidecarPath && + platform.revealFile && + file.content_identity + ) { + try { + const libraryId = (window as any) + .__SPACEDRIVE_LIBRARY_ID__; + if (!libraryId) { + console.error("Library ID not found"); + return; + } - // Convert "text" format to "txt" extension (matches actual file on disk) - const format = sidecar.format === "text" ? "txt" : sidecar.format; - const sidecarPath = await platform.getSidecarPath( - libraryId, - file.content_identity.uuid, - sidecar.kind, - sidecar.variant, - format, - ); + // Convert "text" format to "txt" extension (matches actual file on disk) + const format = + sidecar.format === "text" + ? "txt" + : sidecar.format; + const sidecarPath = await platform.getSidecarPath( + libraryId, + file.content_identity.uuid, + sidecar.kind, + sidecar.variant, + format, + ); - await platform.revealFile(sidecarPath); - } catch (err) { - console.error("Failed to reveal sidecar:", err); - } - } - }, - condition: () => - !!platform.getSidecarPath && - !!platform.revealFile && - !!file.content_identity, - }, - { - icon: Trash, - label: "Delete Sidecar", - onClick: () => { - console.log("Delete sidecar:", sidecar); - // TODO: Implement sidecar deletion - }, - variant: "danger" as const, - }, - ], - }); + await platform.revealFile(sidecarPath); + } catch (err) { + console.error("Failed to reveal sidecar:", err); + } + } + }, + condition: () => + !!platform.getSidecarPath && + !!platform.revealFile && + !!file.content_identity, + }, + { + icon: Trash, + label: "Delete Sidecar", + onClick: () => { + console.log("Delete sidecar:", sidecar); + // TODO: Implement sidecar deletion + }, + variant: "danger" as const, + }, + ], + }); - const handleContextMenu = async (e: React.MouseEvent) => { - e.preventDefault(); - e.stopPropagation(); - await contextMenu.show(e); - }; + const handleContextMenu = async (e: React.MouseEvent) => { + e.preventDefault(); + e.stopPropagation(); + await contextMenu.show(e); + }; - return ( -
- {/* Preview thumbnail for image sidecars */} - {isImage && sidecarUrl ? ( -
- {`${sidecar.variant} { - // Fallback to icon on error - e.currentTarget.style.display = "none"; - if (e.currentTarget.nextElementSibling) { - ( - e.currentTarget.nextElementSibling as HTMLElement - ).style.display = "flex"; - } - }} - /> -
- -
-
- ) : ( -
- -
- )} + return ( +
+ {/* Preview thumbnail for image sidecars */} + {isImage && sidecarUrl ? ( +
+ {`${sidecar.variant} { + // Fallback to icon on error + e.currentTarget.style.display = "none"; + if (e.currentTarget.nextElementSibling) { + ( + e.currentTarget + .nextElementSibling as HTMLElement + ).style.display = "flex"; + } + }} + /> +
+ +
+
+ ) : ( +
+ +
+ )} -
-
- {String(sidecar.kind)} -
-
- {String(sidecar.variant)} · {formatBytes(sidecar.size)} -
-
- {String(sidecar.format).toUpperCase()} -
-
- - {String(sidecar.status)} - -
- ); +
+
+ {String(sidecar.kind)} +
+
+ {String(sidecar.variant)} · {formatBytes(sidecar.size)} +
+
+ {String(sidecar.format).toUpperCase()} +
+
+ + {String(sidecar.status)} + +
+ ); } function InstancesTab({ file }: { file: File }) { - const alternatePaths = file.alternate_paths || []; - const allPaths = [file.sd_path, ...alternatePaths]; + const alternatePaths = file.alternate_paths || []; + const allPaths = [file.sd_path, ...alternatePaths]; - const getPathDisplay = (sdPath: typeof file.sd_path) => { - if ("Physical" in sdPath) { - return sdPath.Physical.path; - } else if ("Cloud" in sdPath) { - return sdPath.Cloud.path; - } else { - return "Content"; - } - }; + const getPathDisplay = (sdPath: typeof file.sd_path) => { + if ("Physical" in sdPath) { + return sdPath.Physical.path; + } else if ("Cloud" in sdPath) { + return sdPath.Cloud.path; + } else { + return "Content"; + } + }; - return ( -
-

- All copies of this file across your devices and locations -

+ return ( +
+

+ All copies of this file across your devices and locations +

- {allPaths.length === 1 ? ( -
- No alternate instances found -
- ) : ( -
- {allPaths.map((sdPath, i) => ( -
-
- - - -
-
- {getPathDisplay(sdPath)} -
-
- {"Physical" in sdPath && "Local Device"} - {"Cloud" in sdPath && "Cloud Storage"} - {"Content" in sdPath && "Content Addressed"} -
-
-
-
-
- ))} -
- )} -
- ); + {allPaths.length === 1 ? ( +
+ No alternate instances found +
+ ) : ( +
+ {allPaths.map((sdPath, i) => ( +
+
+ + + +
+
+ {getPathDisplay(sdPath)} +
+
+ {"Physical" in sdPath && "Local Device"} + {"Cloud" in sdPath && "Cloud Storage"} + {"Content" in sdPath && + "Content Addressed"} +
+
+
+
+
+ ))} +
+ )} +
+ ); } function ChatTab() { - const [message, setMessage] = useState(""); + const [message, setMessage] = useState(""); - const messages = [ - { - id: 1, - sender: "Sarah", - avatar: "S", - content: "Can you check if this photo is also on the NAS?", - time: "2:34 PM", - isUser: false, - }, - { - id: 2, - sender: "You", - avatar: "J", - content: "Yeah, it's synced. Shows 3 instances across devices.", - time: "2:35 PM", - isUser: true, - }, - { - id: 3, - sender: "AI Assistant", - avatar: "", - content: - "I found 2 similar photos in your library from the same location. Would you like me to create a collection?", - time: "2:36 PM", - isUser: false, - isAI: true, - unread: true, - }, - { - id: 4, - sender: "Sarah", - avatar: "S", - content: "Perfect, thanks! Can you share the collection with me?", - time: "2:37 PM", - isUser: false, - unread: true, - }, - { - id: 5, - sender: "Alex", - avatar: "A", - content: "I just tagged this as Summer 2025 btw", - time: "2:38 PM", - isUser: false, - unread: true, - }, - ]; + const messages = [ + { + id: 1, + sender: "Sarah", + avatar: "S", + content: "Can you check if this photo is also on the NAS?", + time: "2:34 PM", + isUser: false, + }, + { + id: 2, + sender: "You", + avatar: "J", + content: "Yeah, it's synced. Shows 3 instances across devices.", + time: "2:35 PM", + isUser: true, + }, + { + id: 3, + sender: "AI Assistant", + avatar: "", + content: + "I found 2 similar photos in your library from the same location. Would you like me to create a collection?", + time: "2:36 PM", + isUser: false, + isAI: true, + unread: true, + }, + { + id: 4, + sender: "Sarah", + avatar: "S", + content: "Perfect, thanks! Can you share the collection with me?", + time: "2:37 PM", + isUser: false, + unread: true, + }, + { + id: 5, + sender: "Alex", + avatar: "A", + content: "I just tagged this as Summer 2025 btw", + time: "2:38 PM", + isUser: false, + unread: true, + }, + ]; - return ( -
- {/* Messages */} -
- {messages.map((msg) => ( -
- {/* Avatar */} -
- {msg.avatar} -
+ return ( +
+ {/* Messages */} +
+ {messages.map((msg) => ( +
+ {/* Avatar */} +
+ {msg.avatar} +
- {/* Message bubble */} -
-
- {!msg.isUser && ( -
- {msg.sender} -
- )} -

- {msg.content} -

-
- - {msg.time} - -
-
- ))} -
+ {/* Message bubble */} +
+
+ {!msg.isUser && ( +
+ {msg.sender} +
+ )} +

+ {msg.content} +

+
+ + {msg.time} + +
+
+ ))} +
- {/* Input */} -
-
- + {/* Input */} +
+
+ -
- setMessage(e.target.value)} - placeholder="Type a message..." - className="flex-1 bg-transparent text-xs text-sidebar-ink placeholder:text-sidebar-inkDull outline-none" - /> -
+
+ setMessage(e.target.value)} + placeholder="Type a message..." + className="flex-1 bg-transparent text-xs text-sidebar-ink placeholder:text-sidebar-inkDull outline-none" + /> +
- -
+ +
-
- - - -
-
-
- ); +
+ + + +
+
+
+ ); } function ActivityTab() { - const activity = [ - { action: "Synced to NAS", time: "2 min ago", device: "MacBook Pro" }, - { action: "Uploaded to S3", time: "1 hour ago", device: "MacBook Pro" }, - { - action: "Thumbnail generated", - time: "2 hours ago", - device: "MacBook Pro", - }, - { action: "Tagged as 'Travel'", time: "3 hours ago", device: "iPhone" }, - { action: "Created", time: "Jan 15, 2025", device: "iPhone" }, - ]; + const activity = [ + { action: "Synced to NAS", time: "2 min ago", device: "MacBook Pro" }, + { action: "Uploaded to S3", time: "1 hour ago", device: "MacBook Pro" }, + { + action: "Thumbnail generated", + time: "2 hours ago", + device: "MacBook Pro", + }, + { action: "Tagged as 'Travel'", time: "3 hours ago", device: "iPhone" }, + { action: "Created", time: "Jan 15, 2025", device: "iPhone" }, + ]; - return ( -
-

- History of changes and sync operations -

+ return ( +
+

+ History of changes and sync operations +

-
- {activity.map((item, i) => ( -
- - - -
-
{item.action}
-
- {item.time} · {item.device} -
-
-
- ))} -
-
- ); +
+ {activity.map((item, i) => ( +
+ + + +
+
+ {item.action} +
+
+ {item.time} · {item.device} +
+
+
+ ))} +
+
+ ); } function DetailsTab({ file }: { file: File }) { - return ( -
- {/* Content Identity */} - {file.content_identity && ( -
- - {file.content_identity.integrity_hash && ( - - )} - {file.content_identity.mime_type_id !== null && ( - - )} -
- )} + return ( +
+ {/* Content Identity */} + {file.content_identity && ( +
+ + {file.content_identity.integrity_hash && ( + + )} + {file.content_identity.mime_type_id !== null && ( + + )} +
+ )} - {/* Metadata */} -
- - - {file.extension && ( - - )} -
+ {/* Metadata */} +
+ + + {file.extension && ( + + )} +
- {/* System */} -
- - - -
-
- ); + {/* System */} +
+ + + +
+
+ ); } From c3517a554e76def817c22eca9db38af4886b242a Mon Sep 17 00:00:00 2001 From: Jamie Pine Date: Sun, 7 Dec 2025 21:03:51 -0800 Subject: [PATCH 03/20] Preserve ephemeral UUIDs during indexing - Remove TTL-based ephemeral cache and switch to a permanent in-memory cache. - Reuse ephemeral UUIDs when creating persistent entries to preserve continuity of user data. - Populate ephemeral UUIDs during the processing phase and expose get_ephemeral_uuid in the indexer state. - Remove the location invalidation hook and related UI usage. --- apps/cli/src/domains/index/mod.rs | 4 +- core/src/ops/core/ephemeral_status/output.rs | 2 - core/src/ops/core/ephemeral_status/query.rs | 1 - core/src/ops/indexing/entry.rs | 19 ++- .../src/ops/indexing/ephemeral/index_cache.rs | 100 ++++---------- core/src/ops/indexing/phases/processing.rs | 14 ++ core/src/ops/indexing/state.rs | 122 ++++++++++++++++++ packages/interface/src/Explorer.tsx | 4 - .../hooks/useLocationChangeInvalidation.ts | 94 -------------- 9 files changed, 180 insertions(+), 180 deletions(-) delete mode 100644 packages/interface/src/hooks/useLocationChangeInvalidation.ts diff --git a/apps/cli/src/domains/index/mod.rs b/apps/cli/src/domains/index/mod.rs index 6581d4129..60fe3d9cf 100644 --- a/apps/cli/src/domains/index/mod.rs +++ b/apps/cli/src/domains/index/mod.rs @@ -249,8 +249,8 @@ pub async fn run(ctx: &Context, cmd: IndexCmd) -> Result<()> { println!("║ EPHEMERAL INDEX CACHE STATUS ║"); println!("╠══════════════════════════════════════════════════════════════╣"); println!( - "║ Total Indexes: {:3} In Progress: {:3} Stale: {:3} ║", - status.total_indexes, status.indexing_in_progress, status.stale_count + "║ Total Indexes: {:3} In Progress: {:3} ║", + status.total_indexes, status.indexing_in_progress ); println!("╚══════════════════════════════════════════════════════════════╝"); diff --git a/core/src/ops/core/ephemeral_status/output.rs b/core/src/ops/core/ephemeral_status/output.rs index 789652b63..b092da1a8 100644 --- a/core/src/ops/core/ephemeral_status/output.rs +++ b/core/src/ops/core/ephemeral_status/output.rs @@ -11,8 +11,6 @@ pub struct EphemeralCacheStatus { pub total_indexes: usize, /// Number of indexes currently being populated pub indexing_in_progress: usize, - /// Number of stale indexes (past TTL) - pub stale_count: usize, /// Details for each cached index pub indexes: Vec, } diff --git a/core/src/ops/core/ephemeral_status/query.rs b/core/src/ops/core/ephemeral_status/query.rs index df03245a1..8e2747efc 100644 --- a/core/src/ops/core/ephemeral_status/query.rs +++ b/core/src/ops/core/ephemeral_status/query.rs @@ -91,7 +91,6 @@ impl CoreQuery for EphemeralCacheStatusQuery { Ok(EphemeralCacheStatus { total_indexes: cache_stats.total_entries, indexing_in_progress: cache_stats.indexing_count, - stale_count: cache_stats.stale_count, indexes, }) } diff --git a/core/src/ops/indexing/entry.rs b/core/src/ops/indexing/entry.rs index 592f25885..16c32cb1a 100644 --- a/core/src/ops/indexing/entry.rs +++ b/core/src/ops/indexing/entry.rs @@ -223,10 +223,21 @@ impl EntryProcessor { }) .unwrap_or_else(|| chrono::Utc::now()); - // All entries get UUIDs immediately for UI normalized caching compatibility. - // Sync readiness is now determined by content_id presence (for regular files) - // or by entry kind (for directories/empty files). - let entry_uuid = Some(Uuid::new_v4()); + // UUID assignment strategy: + // 1. First check if there's an ephemeral UUID to preserve (from previous browsing) + // 2. 
If not, generate a new UUID + // + // This ensures that files browsed before enabling indexing keep the same UUID + let entry_uuid = if let Some(ephemeral_uuid) = state.get_ephemeral_uuid(&entry.path) { + tracing::debug!( + "Preserving ephemeral UUID {} for {}", + ephemeral_uuid, + entry.path.display() + ); + Some(ephemeral_uuid) + } else { + Some(Uuid::new_v4()) + }; // Find parent entry ID let parent_id = if let Some(parent_path) = entry.path.parent() { diff --git a/core/src/ops/indexing/ephemeral/index_cache.rs b/core/src/ops/indexing/ephemeral/index_cache.rs index 52932cf35..f464dbf16 100644 --- a/core/src/ops/indexing/ephemeral/index_cache.rs +++ b/core/src/ops/indexing/ephemeral/index_cache.rs @@ -3,6 +3,10 @@ //! This module provides a thread-safe cache for storing ephemeral indexes //! by their root path. This allows directory listing queries to reuse //! existing indexes instead of spawning new indexer jobs. +//! +//! The cache is permanent in memory (no TTL or expiration). Entries persist +//! until the daemon restarts or they are explicitly removed. This ensures +//! UUIDs from ephemeral indexing can be preserved when regular indexing is enabled. use crate::ops::indexing::EphemeralIndex; use parking_lot::RwLock; @@ -10,16 +14,10 @@ use std::{ collections::HashMap, path::{Path, PathBuf}, sync::Arc, - time::{Duration, Instant}, + time::Instant, }; use tokio::sync::RwLock as TokioRwLock; -/// Default TTL for ephemeral indexes (5 minutes) -const DEFAULT_TTL: Duration = Duration::from_secs(5 * 60); - -/// Maximum idle time before an index is considered stale (2 minutes) -const MAX_IDLE_TIME: Duration = Duration::from_secs(2 * 60); - /// Cache entry wrapping an ephemeral index with metadata struct CacheEntry { /// The ephemeral index @@ -38,77 +36,40 @@ impl CacheEntry { indexing_in_progress: false, } } - - fn is_stale(&self, ttl: Duration) -> bool { - self.created_at.elapsed() > ttl - } } /// Global cache for ephemeral indexes /// /// Stores ephemeral indexes by their root path for reuse across queries. -/// Indexes are automatically evicted based on TTL and idle time. +/// Indexes persist in memory until the daemon restarts or they are explicitly removed. pub struct EphemeralIndexCache { /// Map of root path to cache entry entries: RwLock>, - /// Time-to-live for cache entries - ttl: Duration, } impl EphemeralIndexCache { - /// Create a new cache with default TTL + /// Create a new cache pub fn new() -> Self { Self { entries: RwLock::new(HashMap::new()), - ttl: DEFAULT_TTL, } } - /// Create a new cache with custom TTL - pub fn with_ttl(ttl: Duration) -> Self { - Self { - entries: RwLock::new(HashMap::new()), - ttl, - } - } - - /// Get an existing index for a path, or None if not cached or stale - /// - /// Also checks if the index is still being populated (indexing in progress). + /// Get an existing index for a path, or None if not cached pub fn get(&self, path: &Path) -> Option>> { let entries = self.entries.read(); - if let Some(entry) = entries.get(path) { - // Check if stale - if entry.is_stale(self.ttl) { - return None; - } - Some(entry.index.clone()) - } else { - None - } + entries.get(path).map(|entry| entry.index.clone()) } /// Get an existing index for a path (exact match only) /// - /// Returns the index if: - /// 1. An index exists for this exact path - /// 2. The index is not stale + /// Returns the index if an index exists for this exact path. 
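+ /// Unlike the old TTL-based lookup, a returned index never expires out
+ /// from under the caller; the entry lives until the daemon restarts or
+ /// it is explicitly removed.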
/// /// Note: We only use exact matches because ephemeral indexing uses /// IndexScope::Current (single level), so an ancestor index doesn't /// contain the contents of subdirectories. pub fn get_for_path(&self, path: &Path) -> Option>> { - let entries = self.entries.read(); - - // Only exact match - ancestor indexes don't contain subdirectory contents - // because ephemeral indexing uses IndexScope::Current (single level) - if let Some(entry) = entries.get(path) { - if !entry.is_stale(self.ttl) { - return Some(entry.index.clone()); - } - } - - None + self.get(path) } /// Check if indexing is in progress for a path @@ -147,17 +108,10 @@ impl EphemeralIndexCache { } /// Mark indexing as complete for a path - /// - /// This also refreshes the entry's `created_at` timestamp so it's no longer - /// considered stale. This is important because `create_for_indexing()` may - /// have reused an existing stale entry, and without this refresh the entry - /// would remain stale even after being freshly populated. pub fn mark_indexing_complete(&self, path: &Path) { let mut entries = self.entries.write(); if let Some(entry) = entries.get_mut(path) { entry.indexing_in_progress = false; - // Reset created_at so the freshly-populated index is no longer stale - entry.created_at = Instant::now(); } } @@ -167,12 +121,6 @@ impl EphemeralIndexCache { entries.remove(path); } - /// Remove stale entries from the cache - pub fn evict_stale(&self) { - let mut entries = self.entries.write(); - entries.retain(|_, entry| !entry.is_stale(self.ttl)); - } - /// Get the number of cached indexes pub fn len(&self) -> usize { self.entries.read().len() @@ -193,14 +141,20 @@ impl EphemeralIndexCache { let entries = self.entries.read(); let total_entries = entries.len(); let indexing_count = entries.values().filter(|e| e.indexing_in_progress).count(); - let stale_count = entries.values().filter(|e| e.is_stale(self.ttl)).count(); EphemeralIndexCacheStats { total_entries, indexing_count, - stale_count, } } + + /// Get the age of a cached index in seconds + pub fn get_age(&self, path: &Path) -> Option { + let entries = self.entries.read(); + entries + .get(path) + .map(|e| e.created_at.elapsed().as_secs_f64()) + } } impl Default for EphemeralIndexCache { @@ -214,7 +168,6 @@ impl Default for EphemeralIndexCache { pub struct EphemeralIndexCacheStats { pub total_entries: usize, pub indexing_count: usize, - pub stale_count: usize, } #[cfg(test)] @@ -244,7 +197,7 @@ mod tests { let cache = EphemeralIndexCache::new(); let path = PathBuf::from("/test/path"); - let index = cache.create_for_indexing(path.clone()); + let _index = cache.create_for_indexing(path.clone()); assert!(cache.is_indexing(&path)); @@ -284,17 +237,18 @@ mod tests { } #[test] - fn test_stale_detection() { - let cache = EphemeralIndexCache::with_ttl(Duration::from_millis(1)); + fn test_cache_persists() { + // Test that cache entries persist (no TTL expiration) + let cache = EphemeralIndexCache::new(); let path = PathBuf::from("/test/path"); let index = Arc::new(TokioRwLock::new(EphemeralIndex::new(path.clone()))); cache.insert(path.clone(), index); - // Wait for TTL to expire - std::thread::sleep(Duration::from_millis(10)); + // Wait a bit + std::thread::sleep(std::time::Duration::from_millis(100)); - // Should be stale now - assert!(cache.get(&path).is_none()); + // Should still be available (no expiration) + assert!(cache.get(&path).is_some()); } } diff --git a/core/src/ops/indexing/phases/processing.rs b/core/src/ops/indexing/phases/processing.rs index 
a5b474fca..68de9d604 100644 --- a/core/src/ops/indexing/phases/processing.rs +++ b/core/src/ops/indexing/phases/processing.rs @@ -42,6 +42,20 @@ pub async fn run_processing_phase( total_batches )); + // Populate ephemeral UUIDs for preservation before processing + // This allows entries that were browsed before enabling indexing to keep + // the same UUID, preserving any user data associated with them + let ephemeral_cache = ctx.library().core_context().ephemeral_cache(); + let preserved_count = state + .populate_ephemeral_uuids(ephemeral_cache, location_root_path) + .await; + if preserved_count > 0 { + ctx.log(format!( + "Found {} ephemeral UUIDs to preserve from previous browsing", + preserved_count + )); + } + if total_batches == 0 { ctx.log("No batches to process - transitioning to Aggregation phase"); state.phase = crate::ops::indexing::state::Phase::Aggregation; diff --git a/core/src/ops/indexing/state.rs b/core/src/ops/indexing/state.rs index 4e769f8ee..b1ac7c58a 100644 --- a/core/src/ops/indexing/state.rs +++ b/core/src/ops/indexing/state.rs @@ -9,6 +9,7 @@ use std::{ path::PathBuf, time::{Duration, Instant}, }; +use uuid::Uuid; /// Indexer progress information #[derive(Debug, Clone, Serialize, Deserialize)] @@ -107,6 +108,12 @@ pub struct IndexerState { // Database operations pub(crate) entry_id_cache: HashMap, // path -> entry_id for parent lookups + // Ephemeral UUID preservation + // UUIDs from ephemeral indexing that should be reused when creating persistent entries + // This ensures files browsed before enabling indexing keep the same UUID + #[serde(skip, default)] + pub(crate) ephemeral_uuids: HashMap, + // Change detection pub(crate) existing_entries: HashMap, Option)>, // path -> (id, inode, modified) @@ -149,6 +156,7 @@ impl IndexerState { entry_batches: Vec::new(), entries_for_content: Vec::new(), entry_id_cache: HashMap::new(), + ephemeral_uuids: HashMap::new(), existing_entries: HashMap::new(), stats: Default::default(), errors: Vec::new(), @@ -161,6 +169,51 @@ impl IndexerState { } } + /// Populate ephemeral UUIDs from the ephemeral cache for UUID preservation + /// + /// When a directory is browsed before being added as a managed location, + /// ephemeral indexing assigns UUIDs to each entry. This method extracts + /// those UUIDs so they can be reused when creating persistent database entries, + /// ensuring continuity for any user data (tags, notes, etc.) associated with + /// the ephemeral UUIDs. + pub async fn populate_ephemeral_uuids( + &mut self, + ephemeral_cache: &super::ephemeral::EphemeralIndexCache, + root_path: &std::path::Path, + ) -> usize { + // Try to get an ephemeral index that covers this path + if let Some(index) = ephemeral_cache.get_for_path(root_path) { + let index_read = index.read().await; + + // Get all paths from the entries and look up their UUIDs + let entries = index_read.entries(); + for path in entries.keys() { + if let Some(entry_uuid) = index_read.get_entry_uuid(path) { + self.ephemeral_uuids.insert(path.clone(), entry_uuid); + } + } + + let count = self.ephemeral_uuids.len(); + tracing::info!( + "Populated {} ephemeral UUIDs for preservation from cache covering {}", + count, + root_path.display() + ); + count + } else { + tracing::debug!("No ephemeral index found for path: {}", root_path.display()); + 0 + } + } + + /// Get an ephemeral UUID for a path if one exists + /// + /// Returns the UUID that was assigned during ephemeral indexing, + /// allowing it to be reused for the persistent database entry. 
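+ ///
+ /// Illustrative sketch of the intended call pattern (the path literal is
+ /// hypothetical; `state` is an `IndexerState` whose ephemeral UUIDs have
+ /// already been populated):
+ ///
+ /// ```ignore
+ /// let uuid = state
+ ///     .get_ephemeral_uuid(std::path::Path::new("/photos/img.jpg"))
+ ///     .unwrap_or_else(Uuid::new_v4);
+ /// ```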
+ pub fn get_ephemeral_uuid(&self, path: &std::path::Path) -> Option { + self.ephemeral_uuids.get(path).copied() + } + pub fn calculate_rate(&mut self) -> f32 { let elapsed = self.last_progress_time.elapsed(); if elapsed.as_secs() > 0 { @@ -232,3 +285,72 @@ impl IndexerState { Ok(()) } } + +#[cfg(test)] +mod tests { + use super::*; + use crate::domain::addressing::SdPath; + + #[test] + fn test_ephemeral_uuid_lookup() { + let sd_path = SdPath::Physical { + device_slug: "local".to_string(), + path: PathBuf::from("/test"), + }; + let mut state = IndexerState::new(&sd_path); + + // Initially no ephemeral UUIDs + assert!(state + .get_ephemeral_uuid(std::path::Path::new("/test/file.txt")) + .is_none()); + + // Add an ephemeral UUID + let test_uuid = Uuid::new_v4(); + state + .ephemeral_uuids + .insert(PathBuf::from("/test/file.txt"), test_uuid); + + // Now we can retrieve it + assert_eq!( + state.get_ephemeral_uuid(std::path::Path::new("/test/file.txt")), + Some(test_uuid) + ); + + // Non-existent path still returns None + assert!(state + .get_ephemeral_uuid(std::path::Path::new("/test/other.txt")) + .is_none()); + } + + #[test] + fn test_ephemeral_uuid_preservation_concept() { + // This test demonstrates the UUID preservation concept: + // When ephemeral_uuids is populated, the same UUID should be used + // instead of generating a new one + + let sd_path = SdPath::Physical { + device_slug: "local".to_string(), + path: PathBuf::from("/test"), + }; + let mut state = IndexerState::new(&sd_path); + + // Simulate an ephemeral UUID from previous browsing + let preserved_uuid = Uuid::new_v4(); + let test_path = PathBuf::from("/test/document.pdf"); + state + .ephemeral_uuids + .insert(test_path.clone(), preserved_uuid); + + // When creating an entry, the code should check get_ephemeral_uuid first + let entry_uuid = if let Some(ephemeral_uuid) = state.get_ephemeral_uuid(&test_path) { + // Preserve the ephemeral UUID + ephemeral_uuid + } else { + // Generate a new UUID + Uuid::new_v4() + }; + + // The preserved UUID should be used + assert_eq!(entry_uuid, preserved_uuid); + } +} diff --git a/packages/interface/src/Explorer.tsx b/packages/interface/src/Explorer.tsx index 3d44d0cac..b3bdde815 100644 --- a/packages/interface/src/Explorer.tsx +++ b/packages/interface/src/Explorer.tsx @@ -7,7 +7,6 @@ import { useParams, } from "react-router-dom"; import { useEffect, useMemo } from "react"; -import { useLocationChangeInvalidation } from "./hooks/useLocationChangeInvalidation"; import { Dialogs } from "@sd/ui"; import { Inspector, type InspectorVariant } from "./Inspector"; import { TopBarProvider, TopBar } from "./TopBar"; @@ -71,9 +70,6 @@ export function ExplorerLayout() { } = useExplorer(); const { selectedFiles, selectFile } = useSelection(); - // Listen for location index_mode changes and invalidate directory listing queries - useLocationChangeInvalidation(); - // Sync route with explorer context for view preferences useEffect(() => { const spaceItemKey = getSpaceItemKeyFromRoute( diff --git a/packages/interface/src/hooks/useLocationChangeInvalidation.ts b/packages/interface/src/hooks/useLocationChangeInvalidation.ts deleted file mode 100644 index ab9dffc89..000000000 --- a/packages/interface/src/hooks/useLocationChangeInvalidation.ts +++ /dev/null @@ -1,94 +0,0 @@ -/** - * useLocationChangeInvalidation - Invalidates directory listing queries when location index_mode changes - * - * When a user enables indexing for a location (index_mode changes from "none" to something else), - * we need to refetch 
directory listings because: - * - Before: Data came from ephemeral in-memory index - * - After: Data comes from persistent database - * - * This hook subscribes to location events and invalidates affected queries. - */ - -import { useEffect, useRef } from "react"; -import { useQueryClient } from "@tanstack/react-query"; -import { useSpacedriveClient } from "@sd/ts-client/hooks"; -import type { Event, LocationInfo } from "@sd/ts-client"; - -export function useLocationChangeInvalidation() { - const client = useSpacedriveClient(); - const queryClient = useQueryClient(); - const libraryId = client.getCurrentLibraryId(); - - // Track previous index_mode for each location to detect changes - const prevIndexModes = useRef>(new Map()); - - useEffect(() => { - if (!libraryId) return; - - let unsubscribe: (() => void) | undefined; - let isCancelled = false; - - const handleEvent = (event: Event) => { - // Only handle ResourceChanged events for locations - if (typeof event === "string" || !("ResourceChanged" in event)) { - return; - } - - const { resource_type, resource } = event.ResourceChanged; - if (resource_type !== "location") { - return; - } - - const location = resource as LocationInfo; - const locationId = location.id; - const newIndexMode = location.index_mode; - - // Get previous index_mode - const prevIndexMode = prevIndexModes.current.get(locationId); - - // Update tracked index_mode - prevIndexModes.current.set(locationId, newIndexMode); - - // Check if index_mode changed from "none" to something else - // This means the user just enabled indexing - if (prevIndexMode === "none" && newIndexMode !== "none") { - console.log( - `[useLocationChangeInvalidation] Location ${locationId} indexing enabled (${prevIndexMode} -> ${newIndexMode}), invalidating directory_listing queries`, - ); - - // Invalidate all directory_listing queries - // They will refetch and get data from the persistent index instead of ephemeral - queryClient.invalidateQueries({ - predicate: (query) => { - const key = query.queryKey; - return ( - Array.isArray(key) && - key[0] === "query:files.directory_listing" - ); - }, - }); - } - }; - - client - .subscribeFiltered( - { - resource_type: "location", - library_id: libraryId, - }, - handleEvent, - ) - .then((unsub) => { - if (isCancelled) { - unsub(); - } else { - unsubscribe = unsub; - } - }); - - return () => { - isCancelled = true; - unsubscribe?.(); - }; - }, [client, queryClient, libraryId]); -} From 191c7f7ef033b8ea2d6b2971ab491e69037d8ada Mon Sep 17 00:00:00 2001 From: Jamie Pine Date: Sun, 7 Dec 2025 21:31:15 -0800 Subject: [PATCH 04/20] Refactor to a single unified ephemeral index cache --- apps/cli/src/domains/index/mod.rs | 124 ++++--- core/src/ops/core/ephemeral_status/output.rs | 56 ++- core/src/ops/core/ephemeral_status/query.rs | 88 +++-- .../src/ops/indexing/ephemeral/index_cache.rs | 346 ++++++++++-------- core/src/ops/indexing/ephemeral/mod.rs | 5 +- core/src/ops/indexing/job.rs | 211 +++++++---- core/src/ops/indexing/persistence.rs | 4 +- core/src/ops/indexing/verify/action.rs | 2 +- 8 files changed, 511 insertions(+), 325 deletions(-) diff --git a/apps/cli/src/domains/index/mod.rs b/apps/cli/src/domains/index/mod.rs index 60fe3d9cf..e3cb7add9 100644 --- a/apps/cli/src/domains/index/mod.rs +++ b/apps/cli/src/domains/index/mod.rs @@ -246,63 +246,91 @@ pub async fn run(ctx: &Context, cmd: IndexCmd) -> Result<()> { |status: &sd_core::ops::core::ephemeral_status::EphemeralCacheStatus| { println!(); 
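+ // Summary box for the unified cache; the detailed stats and path tables follow below.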
println!("╔══════════════════════════════════════════════════════════════╗"); - println!("║ EPHEMERAL INDEX CACHE STATUS ║"); + println!("║ UNIFIED EPHEMERAL INDEX CACHE ║"); println!("╠══════════════════════════════════════════════════════════════╣"); println!( - "║ Total Indexes: {:3} In Progress: {:3} ║", - status.total_indexes, status.indexing_in_progress + "║ Indexed Paths: {:3} In Progress: {:3} ║", + status.indexed_paths_count, status.indexing_in_progress_count ); println!("╚══════════════════════════════════════════════════════════════╝"); - if status.indexes.is_empty() { - println!("\n No ephemeral indexes cached."); + // Show unified index stats + let stats = &status.index_stats; + println!(); + let mut stats_table = Table::new(); + stats_table.load_preset(UTF8_BORDERS_ONLY); + stats_table.set_header(vec![ + Cell::new("SHARED INDEX STATS").add_attribute(Attribute::Bold), + Cell::new(""), + ]); + + stats_table.add_row(vec![ + "Total entries (shared arena)", + &stats.total_entries.to_string(), + ]); + stats_table.add_row(vec![ + "Path index count", + &stats.path_index_count.to_string(), + ]); + stats_table.add_row(vec![ + "Unique names (shared)", + &stats.unique_names.to_string(), + ]); + stats_table.add_row(vec![ + "Interned strings (shared)", + &stats.interned_strings.to_string(), + ]); + stats_table.add_row(vec![ + "Content kinds", + &stats.content_kinds.to_string(), + ]); + stats_table.add_row(vec![ + "Memory usage", + &format_bytes(stats.memory_bytes as u64), + ]); + stats_table.add_row(vec!["Cache age", &format!("{:.1}s", stats.age_seconds)]); + stats_table + .add_row(vec!["Idle time", &format!("{:.1}s", stats.idle_seconds)]); + + println!("{}", stats_table); + + // Show indexed paths + if status.indexed_paths.is_empty() && status.paths_in_progress.is_empty() { + println!("\n No paths indexed yet."); } else { - for idx in &status.indexes { + // Paths in progress + if !status.paths_in_progress.is_empty() { println!(); - let mut table = Table::new(); - table.load_preset(UTF8_BORDERS_ONLY); + let mut progress_table = Table::new(); + progress_table.load_preset(UTF8_BORDERS_ONLY); + progress_table.set_header(vec![ + Cell::new("INDEXING IN PROGRESS").add_attribute(Attribute::Bold), + ]); + for path in &status.paths_in_progress { + progress_table.add_row(vec![format!( + "● {}", + path.display() + )]); + } + println!("{}", progress_table); + } - let status_indicator = if idx.indexing_in_progress { - "● INDEXING" - } else { - "○ Ready" - }; - - table.set_header(vec![ - Cell::new(format!("{}", idx.root_path.display())) - .add_attribute(Attribute::Bold), - Cell::new(status_indicator), + // Indexed paths + if !status.indexed_paths.is_empty() { + println!(); + let mut paths_table = Table::new(); + paths_table.load_preset(UTF8_BORDERS_ONLY); + paths_table.set_header(vec![ + Cell::new("INDEXED PATHS").add_attribute(Attribute::Bold), + Cell::new("Children"), ]); - - table.add_row(vec!["Entries (arena)", &idx.total_entries.to_string()]); - table.add_row(vec![ - "Path index count", - &idx.path_index_count.to_string(), - ]); - table.add_row(vec!["Unique names", &idx.unique_names.to_string()]); - table.add_row(vec![ - "Interned strings", - &idx.interned_strings.to_string(), - ]); - table.add_row(vec!["Content kinds", &idx.content_kinds.to_string()]); - table.add_row(vec![ - "Memory usage", - &format_bytes(idx.memory_bytes as u64), - ]); - table.add_row(vec!["Age", &format!("{:.1}s", idx.age_seconds)]); - table.add_row(vec!["Idle time", &format!("{:.1}s", idx.idle_seconds)]); - 
-						"Job stats",
-						&format!(
-							"{} files, {} dirs, {} symlinks, {}",
-							idx.job_stats.files,
-							idx.job_stats.dirs,
-							idx.job_stats.symlinks,
-							format_bytes(idx.job_stats.bytes)
-						),
-					]);
-
-					println!("{}", table);
+					for info in &status.indexed_paths {
+						paths_table.add_row(vec![
+							format!("○ {}", info.path.display()),
+							info.child_count.to_string(),
+						]);
+					}
+					println!("{}", paths_table);
 				}
 			}
 			println!();
diff --git a/core/src/ops/core/ephemeral_status/output.rs b/core/src/ops/core/ephemeral_status/output.rs
index b092da1a8..b7bbce0b0 100644
--- a/core/src/ops/core/ephemeral_status/output.rs
+++ b/core/src/ops/core/ephemeral_status/output.rs
@@ -4,18 +4,60 @@ use serde::{Deserialize, Serialize};
 use specta::Type;
 use std::path::PathBuf;
 
-/// Status of the entire ephemeral index cache
+/// Status of the unified ephemeral index cache
 #[derive(Debug, Clone, Serialize, Deserialize, Type)]
 pub struct EphemeralCacheStatus {
-	/// Total number of cached indexes
-	pub total_indexes: usize,
-	/// Number of indexes currently being populated
-	pub indexing_in_progress: usize,
-	/// Details for each cached index
+	/// Number of paths that have been indexed
+	pub indexed_paths_count: usize,
+	/// Number of paths currently being indexed
+	pub indexing_in_progress_count: usize,
+	/// Unified index statistics (shared arena and string interning)
+	pub index_stats: UnifiedIndexStats,
+	/// List of indexed paths (directories whose contents are ready)
+	pub indexed_paths: Vec<IndexedPathInfo>,
+	/// List of paths currently being indexed
+	pub paths_in_progress: Vec<PathBuf>,
+
+	// Legacy fields for backward compatibility
+	#[serde(skip_serializing_if = "Option::is_none")]
+	pub total_indexes: Option<usize>,
+	#[serde(skip_serializing_if = "Option::is_none")]
+	pub indexing_in_progress: Option<usize>,
+	#[serde(skip_serializing_if = "Vec::is_empty", default)]
 	pub indexes: Vec<EphemeralIndexInfo>,
 }
 
-/// Information about a single ephemeral index
+/// Statistics for the unified ephemeral index
+#[derive(Debug, Clone, Serialize, Deserialize, Type)]
+pub struct UnifiedIndexStats {
+	/// Total entries in the shared arena
+	pub total_entries: usize,
+	/// Number of entries indexed by path
+	pub path_index_count: usize,
+	/// Number of unique interned names (shared across all paths)
+	pub unique_names: usize,
+	/// Number of interned strings in shared cache
+	pub interned_strings: usize,
+	/// Number of content kinds stored
+	pub content_kinds: usize,
+	/// Estimated memory usage in bytes
+	pub memory_bytes: usize,
+	/// Age of the cache in seconds
+	pub age_seconds: f64,
+	/// Seconds since last access
+	pub idle_seconds: f64,
+}
+
+/// Information about an indexed path
+#[derive(Debug, Clone, Serialize, Deserialize, Type)]
+pub struct IndexedPathInfo {
+	/// The directory path that was indexed
+	pub path: PathBuf,
+	/// Number of direct children in this directory
+	pub child_count: usize,
+}
+
+/// Legacy: Information about a single ephemeral index (for backward compatibility)
 #[derive(Debug, Clone, Serialize, Deserialize, Type)]
 pub struct EphemeralIndexInfo {
 	/// Root path this index covers
diff --git a/core/src/ops/core/ephemeral_status/query.rs b/core/src/ops/core/ephemeral_status/query.rs
index 8e2747efc..9d2d96685 100644
--- a/core/src/ops/core/ephemeral_status/query.rs
+++ b/core/src/ops/core/ephemeral_status/query.rs
@@ -1,6 +1,6 @@
 //! Ephemeral index cache status query
 //!
-//! Provides a snapshot of all cached ephemeral indexes for debugging.
+//! Provides a snapshot of the unified ephemeral index for debugging.
 use super::output::*;
 use crate::{
@@ -14,7 +14,7 @@ use std::sync::Arc;
 
 /// Input for the ephemeral cache status query
 #[derive(Debug, Clone, Serialize, Deserialize, Type, Default)]
 pub struct EphemeralCacheStatusInput {
-	/// Optional: only include indexes for paths containing this substring
+	/// Optional: only include indexed paths containing this substring
 	#[serde(default)]
 	pub path_filter: Option<String>,
 }
@@ -39,14 +39,31 @@ impl CoreQuery for EphemeralCacheStatusQuery {
 	) -> QueryResult<Self::Output> {
 		let cache = context.ephemeral_cache();
 
-		// Get basic cache stats
+		// Get cache stats
 		let cache_stats = cache.stats();
-		let cached_paths = cache.cached_paths();
+		let all_indexed_paths = cache.indexed_paths();
+		let paths_in_progress = cache.paths_in_progress();
 
-		// Gather detailed info for each index
-		let mut indexes = Vec::new();
+		// Get the global index for detailed stats
+		let global_index = cache.get_global_index();
+		let index = global_index.read().await;
+		let stats = index.get_stats();
 
-		for path in cached_paths {
+		// Build unified index stats
+		let index_stats = UnifiedIndexStats {
+			total_entries: stats.total_entries,
+			path_index_count: index.path_index_count(),
+			unique_names: stats.unique_names,
+			interned_strings: stats.interned_strings,
+			content_kinds: index.content_kinds_count(),
+			memory_bytes: stats.memory_bytes,
+			age_seconds: cache.age().as_secs_f64(),
+			idle_seconds: index.idle_time().as_secs_f64(),
+		};
+
+		// Build indexed paths info with child counts
+		let mut indexed_paths = Vec::new();
+		for path in all_indexed_paths {
 			// Apply path filter if provided
 			if let Some(ref filter) = self.input.path_filter {
 				if !path.to_string_lossy().contains(filter) {
@@ -54,44 +71,35 @@ impl CoreQuery for EphemeralCacheStatusQuery {
 				}
 			}
 
-			// Check if indexing is in progress
-			let indexing_in_progress = cache.is_indexing(&path);
+			// Get child count for this directory
+			let child_count = index.list_directory(&path).map(|c| c.len()).unwrap_or(0);
 
-			// Try to get the index to read its internal stats
-			if let Some(index_arc) = cache.get(&path) {
-				let index = index_arc.read().await;
-				let stats = index.get_stats();
-
-				let info = EphemeralIndexInfo {
-					root_path: index.root_path.clone(),
-					indexing_in_progress,
-					total_entries: stats.total_entries,
-					path_index_count: index.path_index_count(),
-					unique_names: stats.unique_names,
-					interned_strings: stats.interned_strings,
-					content_kinds: index.content_kinds_count(),
-					memory_bytes: stats.memory_bytes,
-					age_seconds: index.age().as_secs_f64(),
-					idle_seconds: index.idle_time().as_secs_f64(),
-					job_stats: JobStats {
-						files: index.stats.files,
-						dirs: index.stats.dirs,
-						symlinks: index.stats.symlinks,
-						bytes: index.stats.bytes,
-					},
-				};
-
-				indexes.push(info);
-			}
+			indexed_paths.push(IndexedPathInfo { path, child_count });
 		}
 
-		// Sort by root path for consistent output
-		indexes.sort_by(|a, b| a.root_path.cmp(&b.root_path));
+		// Sort by path for consistent output
+		indexed_paths.sort_by(|a, b| a.path.cmp(&b.path));
+
+		// Filter paths in progress
+		let filtered_in_progress: Vec<_> = if let Some(ref filter) = self.input.path_filter {
+			paths_in_progress
+				.into_iter()
+				.filter(|p| p.to_string_lossy().contains(filter))
+				.collect()
+		} else {
+			paths_in_progress
+		};
 
 		Ok(EphemeralCacheStatus {
-			total_indexes: cache_stats.total_entries,
-			indexing_in_progress: cache_stats.indexing_count,
-			indexes,
+			indexed_paths_count: cache_stats.indexed_paths,
+			indexing_in_progress_count: cache_stats.indexing_in_progress,
+			index_stats,
+			indexed_paths,
+			paths_in_progress: filtered_in_progress,
+			// Legacy fields
+			total_indexes: None,
+			indexing_in_progress: None,
+			indexes: Vec::new(),
 		})
 	}
 }
diff --git a/core/src/ops/indexing/ephemeral/index_cache.rs b/core/src/ops/indexing/ephemeral/index_cache.rs
index f464dbf16..1aef082a8 100644
--- a/core/src/ops/indexing/ephemeral/index_cache.rs
+++ b/core/src/ops/indexing/ephemeral/index_cache.rs
@@ -1,159 +1,181 @@
 //! Global cache for ephemeral indexes
 //!
-//! This module provides a thread-safe cache for storing ephemeral indexes
-//! by their root path. This allows directory listing queries to reuse
-//! existing indexes instead of spawning new indexer jobs.
+//! This module provides a thread-safe cache with a SINGLE global ephemeral index.
+//! All browsed directories share the same arena and string interning pool,
+//! providing efficient memory usage through deduplication.
 //!
-//! The cache is permanent in memory (no TTL or expiration). Entries persist
-//! until the daemon restarts or they are explicitly removed. This ensures
-//! UUIDs from ephemeral indexing can be preserved when regular indexing is enabled.
+//! Key benefits of unified index:
+//! - String interning shared across all paths (common names like .git, README.md)
+//! - Single arena for all entries (~50 bytes per entry vs ~200 with HashMap)
+//! - Hierarchical structure preserved for efficient directory listings
+//!
+//! The cache tracks which paths have been indexed (ready) vs are currently
+//! being indexed (in progress).
 
 use crate::ops::indexing::EphemeralIndex;
 use parking_lot::RwLock;
 use std::{
-	collections::HashMap,
+	collections::HashSet,
 	path::{Path, PathBuf},
 	sync::Arc,
 	time::Instant,
 };
 use tokio::sync::RwLock as TokioRwLock;
 
-/// Cache entry wrapping an ephemeral index with metadata
-struct CacheEntry {
-	/// The ephemeral index
-	index: Arc<TokioRwLock<EphemeralIndex>>,
-	/// When this entry was created
-	created_at: Instant,
-	/// Whether an indexer job is currently running for this path
-	indexing_in_progress: bool,
-}
-
-impl CacheEntry {
-	fn new(index: Arc<TokioRwLock<EphemeralIndex>>) -> Self {
-		Self {
-			index,
-			created_at: Instant::now(),
-			indexing_in_progress: false,
-		}
-	}
-}
-
-/// Global cache for ephemeral indexes
+/// Global cache with a single unified ephemeral index
 ///
-/// Stores ephemeral indexes by their root path for reuse across queries.
-/// Indexes persist in memory until the daemon restarts or they are explicitly removed.
+/// Instead of separate indexes per path, all entries live in one shared index.
+/// This maximizes memory efficiency through shared string interning and arena.
 pub struct EphemeralIndexCache {
-	/// Map of root path to cache entry
-	entries: RwLock<HashMap<PathBuf, CacheEntry>>,
+	/// Single global index containing all browsed entries
+	index: Arc<TokioRwLock<EphemeralIndex>>,
+
+	/// Paths whose immediate children have been indexed (ready for queries)
+	indexed_paths: RwLock<HashSet<PathBuf>>,
+
+	/// Paths currently being indexed
+	indexing_in_progress: RwLock<HashSet<PathBuf>>,
+
+	/// When the cache was created
+	created_at: Instant,
 }
 
 impl EphemeralIndexCache {
-	/// Create a new cache
+	/// Create a new cache with an empty global index
 	pub fn new() -> Self {
 		Self {
-			entries: RwLock::new(HashMap::new()),
+			index: Arc::new(TokioRwLock::new(EphemeralIndex::new())),
+			indexed_paths: RwLock::new(HashSet::new()),
+			indexing_in_progress: RwLock::new(HashSet::new()),
+			created_at: Instant::now(),
 		}
 	}
 
-	/// Get an existing index for a path, or None if not cached
-	pub fn get(&self, path: &Path) -> Option<Arc<TokioRwLock<EphemeralIndex>>> {
-		let entries = self.entries.read();
-		entries.get(path).map(|entry| entry.index.clone())
+	/// Get the global index if the given path has been indexed
+	///
+	/// Returns Some(index) if this path's contents are available,
+	/// None if the path hasn't been browsed yet.
+	pub fn get_for_path(&self, path: &Path) -> Option<Arc<TokioRwLock<EphemeralIndex>>> {
+		let indexed = self.indexed_paths.read();
+		if indexed.contains(path) {
+			Some(self.index.clone())
+		} else {
+			None
+		}
 	}
 
-	/// Get an existing index for a path (exact match only)
-	///
-	/// Returns the index if an index exists for this exact path.
-	///
-	/// Note: We only use exact matches because ephemeral indexing uses
-	/// IndexScope::Current (single level), so an ancestor index doesn't
-	/// contain the contents of subdirectories.
-	pub fn get_for_path(&self, path: &Path) -> Option<Arc<TokioRwLock<EphemeralIndex>>> {
-		self.get(path)
+	/// Get the global index unconditionally (for internal use)
+	pub fn get_global_index(&self) -> Arc<TokioRwLock<EphemeralIndex>> {
+		self.index.clone()
+	}
+
+	/// Check if a path has been fully indexed
+	pub fn is_indexed(&self, path: &Path) -> bool {
+		self.indexed_paths.read().contains(path)
 	}
 
 	/// Check if indexing is in progress for a path
 	pub fn is_indexing(&self, path: &Path) -> bool {
-		let entries = self.entries.read();
-		entries
-			.get(path)
-			.map(|e| e.indexing_in_progress)
-			.unwrap_or(false)
+		self.indexing_in_progress.read().contains(path)
 	}
 
-	/// Insert or update an index in the cache
-	pub fn insert(&self, path: PathBuf, index: Arc<TokioRwLock<EphemeralIndex>>) {
-		let mut entries = self.entries.write();
-		entries.insert(path, CacheEntry::new(index));
-	}
-
-	/// Create a new index for a path and mark it as indexing in progress
+	/// Prepare the global index for indexing a new path
 	///
-	/// Returns the index to be used by the indexer job.
+	/// Marks the path as indexing-in-progress and returns the global index.
+	/// The indexer job should add entries to this shared index.
 	pub fn create_for_indexing(&self, path: PathBuf) -> Arc<TokioRwLock<EphemeralIndex>> {
-		let mut entries = self.entries.write();
-
-		// Check if entry already exists
-		if let Some(entry) = entries.get_mut(&path) {
-			entry.indexing_in_progress = true;
-			return entry.index.clone();
-		}
-
-		// Create new entry
-		let index = Arc::new(TokioRwLock::new(EphemeralIndex::new(path.clone())));
-		let mut entry = CacheEntry::new(index.clone());
-		entry.indexing_in_progress = true;
-		entries.insert(path, entry);
-		index
+		let mut in_progress = self.indexing_in_progress.write();
+		in_progress.insert(path);
+		self.index.clone()
 	}
 
 	/// Mark indexing as complete for a path
+	///
+	/// Moves the path from "in progress" to "indexed" state.
 	pub fn mark_indexing_complete(&self, path: &Path) {
-		let mut entries = self.entries.write();
-		if let Some(entry) = entries.get_mut(path) {
-			entry.indexing_in_progress = false;
-		}
+		let mut in_progress = self.indexing_in_progress.write();
+		let mut indexed = self.indexed_paths.write();
+
+		in_progress.remove(path);
+		indexed.insert(path.to_path_buf());
 	}
 
-	/// Remove an index from the cache
-	pub fn remove(&self, path: &Path) {
-		let mut entries = self.entries.write();
-		entries.remove(path);
+	/// Remove a path from the indexed set (e.g., on invalidation)
+	///
+	/// Note: This doesn't remove entries from the index itself,
+	/// just marks the path as needing re-indexing.
+	pub fn invalidate_path(&self, path: &Path) {
+		let mut indexed = self.indexed_paths.write();
+		indexed.remove(path);
 	}
 
-	/// Get the number of cached indexes
+	/// Get the number of indexed paths
 	pub fn len(&self) -> usize {
-		self.entries.read().len()
+		self.indexed_paths.read().len()
 	}
 
-	/// Check if the cache is empty
+	/// Check if no paths have been indexed
 	pub fn is_empty(&self) -> bool {
-		self.entries.read().is_empty()
+		self.indexed_paths.read().is_empty()
 	}
 
-	/// Get all cached root paths
-	pub fn cached_paths(&self) -> Vec<PathBuf> {
-		self.entries.read().keys().cloned().collect()
+	/// Get all indexed paths
+	pub fn indexed_paths(&self) -> Vec<PathBuf> {
+		self.indexed_paths.read().iter().cloned().collect()
+	}
+
+	/// Get all paths currently being indexed
+	pub fn paths_in_progress(&self) -> Vec<PathBuf> {
+		self.indexing_in_progress.read().iter().cloned().collect()
 	}
 
 	/// Get cache statistics
 	pub fn stats(&self) -> EphemeralIndexCacheStats {
-		let entries = self.entries.read();
-		let total_entries = entries.len();
-		let indexing_count = entries.values().filter(|e| e.indexing_in_progress).count();
+		let indexed = self.indexed_paths.read();
+		let in_progress = self.indexing_in_progress.read();
 
 		EphemeralIndexCacheStats {
-			total_entries,
-			indexing_count,
+			indexed_paths: indexed.len(),
+			indexing_in_progress: in_progress.len(),
 		}
 	}
 
-	/// Get the age of a cached index in seconds
-	pub fn get_age(&self, path: &Path) -> Option<f64> {
-		let entries = self.entries.read();
-		entries
-			.get(path)
-			.map(|e| e.created_at.elapsed().as_secs_f64())
+	/// Get how long the cache has existed
+	pub fn age(&self) -> std::time::Duration {
+		self.created_at.elapsed()
+	}
+
+	/// Legacy: Get age for a specific path (returns cache age since all share one index)
+	pub fn get_age(&self, _path: &Path) -> Option<f64> {
+		Some(self.created_at.elapsed().as_secs_f64())
+	}
+
+	// Legacy compatibility methods
+
+	/// Legacy: Get an index by exact path (for backward compatibility)
+	#[deprecated(note = "Use get_for_path instead")]
+	pub fn get(&self, path: &Path) -> Option<Arc<TokioRwLock<EphemeralIndex>>> {
+		self.get_for_path(path)
+	}
+
+	/// Legacy: Get all cached paths (returns indexed paths)
+	#[deprecated(note = "Use indexed_paths instead")]
+	pub fn cached_paths(&self) -> Vec<PathBuf> {
+		self.indexed_paths()
+	}
+
+	/// Legacy: Insert (no-op, entries are added directly to global index)
+	#[deprecated(note = "Entries should be added directly to the global index")]
+	pub fn insert(&self, path: PathBuf, _index: Arc<TokioRwLock<EphemeralIndex>>) {
+		// Mark the path as indexed
+		let mut indexed = self.indexed_paths.write();
+		indexed.insert(path);
+	}
+
+	/// Legacy: Remove (just invalidates the path)
+	#[deprecated(note = "Use invalidate_path instead")]
+	pub fn remove(&self, path: &Path) {
+		self.invalidate_path(path);
 	}
 }
 
@@ -166,8 +188,24 @@ impl Default for EphemeralIndexCache {
 
 /// Statistics about the ephemeral index cache
#[derive(Debug, Clone)] pub struct EphemeralIndexCacheStats { - pub total_entries: usize, - pub indexing_count: usize, + /// Number of paths that have been indexed + pub indexed_paths: usize, + /// Number of paths currently being indexed + pub indexing_in_progress: usize, + + // Legacy field names for compatibility +} + +impl EphemeralIndexCacheStats { + /// Legacy: total_entries now means indexed_paths + pub fn total_entries(&self) -> usize { + self.indexed_paths + } + + /// Legacy: indexing_count now means indexing_in_progress + pub fn indexing_count(&self) -> usize { + self.indexing_in_progress + } } #[cfg(test)] @@ -175,80 +213,90 @@ mod tests { use super::*; #[test] - fn test_insert_and_get() { + fn test_single_global_index() { let cache = EphemeralIndexCache::new(); - let path = PathBuf::from("/test/path"); - let index = Arc::new(TokioRwLock::new(EphemeralIndex::new(path.clone()))); - cache.insert(path.clone(), index.clone()); - - assert!(cache.get(&path).is_some()); - assert_eq!(cache.len(), 1); + // Initially no paths are indexed + assert!(cache.is_empty()); + assert!(cache.get_for_path(Path::new("/test")).is_none()); } #[test] - fn test_get_nonexistent() { - let cache = EphemeralIndexCache::new(); - assert!(cache.get(Path::new("/nonexistent")).is_none()); - } - - #[test] - fn test_create_for_indexing() { + fn test_indexing_workflow() { let cache = EphemeralIndexCache::new(); let path = PathBuf::from("/test/path"); + // Start indexing let _index = cache.create_for_indexing(path.clone()); - assert!(cache.is_indexing(&path)); + assert!(!cache.is_indexed(&path)); + // Complete indexing cache.mark_indexing_complete(&path); - assert!(!cache.is_indexing(&path)); + assert!(cache.is_indexed(&path)); + + // Now get_for_path returns the index + assert!(cache.get_for_path(&path).is_some()); } #[test] - fn test_remove() { + fn test_shared_index_across_paths() { + let cache = EphemeralIndexCache::new(); + + let path1 = PathBuf::from("/test/path1"); + let path2 = PathBuf::from("/test/path2"); + + // Start indexing both paths + let index1 = cache.create_for_indexing(path1.clone()); + let index2 = cache.create_for_indexing(path2.clone()); + + // They should be the same index + assert!(Arc::ptr_eq(&index1, &index2)); + + // Complete both + cache.mark_indexing_complete(&path1); + cache.mark_indexing_complete(&path2); + + // Both paths now indexed + assert!(cache.is_indexed(&path1)); + assert!(cache.is_indexed(&path2)); + assert_eq!(cache.len(), 2); + } + + #[test] + fn test_invalidate_path() { let cache = EphemeralIndexCache::new(); let path = PathBuf::from("/test/path"); - let index = Arc::new(TokioRwLock::new(EphemeralIndex::new(path.clone()))); - cache.insert(path.clone(), index); - assert_eq!(cache.len(), 1); + // Index the path + let _index = cache.create_for_indexing(path.clone()); + cache.mark_indexing_complete(&path); + assert!(cache.is_indexed(&path)); - cache.remove(&path); - assert_eq!(cache.len(), 0); + // Invalidate it + cache.invalidate_path(&path); + assert!(!cache.is_indexed(&path)); + + // get_for_path now returns None + assert!(cache.get_for_path(&path).is_none()); } #[test] - fn test_get_for_path_exact_match_only() { + fn test_stats() { let cache = EphemeralIndexCache::new(); - let root = PathBuf::from("/test"); - let child = PathBuf::from("/test/subdir/file.txt"); - let index = Arc::new(TokioRwLock::new(EphemeralIndex::new(root.clone()))); - cache.insert(root.clone(), index); + let path1 = PathBuf::from("/ready"); + let path2 = PathBuf::from("/in_progress"); - // Should NOT find 
ancestor index - we only use exact matches - // because ephemeral indexing is single-level (IndexScope::Current) - assert!(cache.get_for_path(&child).is_none()); + // One indexed, one in progress + let _index = cache.create_for_indexing(path1.clone()); + cache.mark_indexing_complete(&path1); - // Should find exact match - assert!(cache.get_for_path(&root).is_some()); - } + let _index = cache.create_for_indexing(path2.clone()); - #[test] - fn test_cache_persists() { - // Test that cache entries persist (no TTL expiration) - let cache = EphemeralIndexCache::new(); - let path = PathBuf::from("/test/path"); - let index = Arc::new(TokioRwLock::new(EphemeralIndex::new(path.clone()))); - - cache.insert(path.clone(), index); - - // Wait a bit - std::thread::sleep(std::time::Duration::from_millis(100)); - - // Should still be available (no expiration) - assert!(cache.get(&path).is_some()); + let stats = cache.stats(); + assert_eq!(stats.indexed_paths, 1); + assert_eq!(stats.indexing_in_progress, 1); } } diff --git a/core/src/ops/indexing/ephemeral/mod.rs b/core/src/ops/indexing/ephemeral/mod.rs index 94960ece2..147e0082f 100644 --- a/core/src/ops/indexing/ephemeral/mod.rs +++ b/core/src/ops/indexing/ephemeral/mod.rs @@ -26,9 +26,10 @@ //! ```rust,ignore //! use sd_core::ops::indexing::ephemeral::EphemeralIndex; //! -//! let mut index = EphemeralIndex::new("/path/to/root".into()); +//! // Create a unified index (supports multiple directory trees) +//! let mut index = EphemeralIndex::new(); //! -//! // Add entries +//! // Add entries with full paths - parent chain is created automatically //! index.add_entry(path, uuid, metadata); //! //! // Query diff --git a/core/src/ops/indexing/job.rs b/core/src/ops/indexing/job.rs index d41da969e..b29183ed7 100644 --- a/core/src/ops/indexing/job.rs +++ b/core/src/ops/indexing/job.rs @@ -8,7 +8,12 @@ use crate::{ use sea_orm::{ColumnTrait, EntityTrait, QueryFilter}; use serde::{Deserialize, Serialize}; use specta::Type; -use std::{collections::HashMap, path::PathBuf, sync::Arc, time::Duration}; +use std::{ + collections::HashMap, + path::{Path, PathBuf}, + sync::Arc, + time::Duration, +}; use tokio::sync::RwLock; use tracing::{info, warn}; use uuid::Uuid; @@ -155,18 +160,19 @@ impl IndexerJobConfig { /// /// This implementation uses efficient data structures for memory optimization: /// - NodeArena: Contiguous storage for file nodes (~48 bytes per node) -/// - NameCache: String interning for common filenames +/// - NameCache: String interning for common filenames (shared across all entries) /// - NameRegistry: Fast name-based lookups /// +/// All browsed paths share a single index, maximizing string deduplication +/// and memory efficiency. Parent-child relationships are established based +/// on path hierarchy. 
+/// /// Memory usage: ~50 bytes per entry vs ~200 bytes with HashMap pub struct EphemeralIndex { /// Efficient tree storage arena: super::ephemeral::NodeArena, - /// Root node - root: super::ephemeral::EntryId, - - /// String interning + /// String interning (shared across all paths) cache: std::sync::Arc, /// Fast name lookups @@ -184,66 +190,104 @@ pub struct EphemeralIndex { /// Metadata created_at: std::time::Instant, last_accessed: std::time::Instant, - pub root_path: PathBuf, pub stats: IndexerStats, } impl std::fmt::Debug for EphemeralIndex { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("EphemeralIndex") - .field("root_path", &self.root_path) .field("entry_count", &self.arena.len()) .field("interned_names", &self.cache.len()) + .field("path_count", &self.path_index.len()) .finish() } } impl EphemeralIndex { - pub fn new(root_path: PathBuf) -> Self { - use super::ephemeral::{ - FileNode, FileType, MaybeEntryId, NameCache, NameRef, NameRegistry, NodeArena, - NodeState, PackedMetadata, - }; + /// Create a new empty ephemeral index + /// + /// The index stores entries with their full paths and builds parent-child + /// relationships based on path hierarchy. Multiple directory trees can + /// coexist in the same index, sharing the arena and string interning pool. + pub fn new() -> Self { + use super::ephemeral::{NameCache, NameRegistry, NodeArena}; let cache = std::sync::Arc::new(NameCache::new()); - let mut arena = NodeArena::new(); + let arena = NodeArena::new(); let registry = NameRegistry::new(); - // Create root node - let root_name = cache.intern( - root_path - .file_name() + let now = std::time::Instant::now(); + + Self { + arena, + cache, + registry, + path_index: HashMap::new(), + entry_uuids: HashMap::new(), + content_kinds: HashMap::new(), + created_at: now, + last_accessed: now, + stats: IndexerStats::default(), + } + } + + /// Ensure a directory exists in the index, creating ancestor chain if needed + /// + /// Returns the EntryId of the directory. + pub fn ensure_directory(&mut self, path: &Path) -> super::ephemeral::EntryId { + use super::ephemeral::{ + FileNode, FileType, MaybeEntryId, NameRef, NodeState, PackedMetadata, + }; + use super::state::EntryKind; + + // Already exists? 
+ if let Some(&id) = self.path_index.get(path) { + return id; + } + + // Ensure parent exists first (recursive) + let parent_id = if let Some(parent_path) = path.parent() { + if parent_path.as_os_str().is_empty() { + None + } else { + Some(self.ensure_directory(parent_path)) + } + } else { + None + }; + + // Create this directory + let name = self.cache.intern( + path.file_name() .map(|s| s.to_string_lossy()) .as_deref() .unwrap_or("/"), ); - let root_node = FileNode::new( - NameRef::new(root_name, MaybeEntryId::NONE), - PackedMetadata::new(NodeState::Accessible, FileType::Directory, 0), - ); + let parent_ref = parent_id + .map(MaybeEntryId::some) + .unwrap_or(MaybeEntryId::NONE); + let meta = PackedMetadata::new(NodeState::Accessible, FileType::Directory, 0); + let node = FileNode::new(NameRef::new(name, parent_ref), meta); - let root = arena.insert(root_node); + let id = self.arena.insert(node); - let now = std::time::Instant::now(); - - // Add root path to path_index so list_directory works for the root - let mut path_index = HashMap::new(); - path_index.insert(root_path.clone(), root); - - Self { - arena, - root, - cache, - registry, - path_index, - entry_uuids: HashMap::new(), - content_kinds: HashMap::new(), - created_at: now, - last_accessed: now, - root_path, - stats: IndexerStats::default(), + // Add to parent's children + if let Some(parent_id) = parent_id { + if let Some(parent) = self.arena.get_mut(parent_id) { + parent.add_child(id); + } } + + // Index by path and name + self.path_index.insert(path.to_path_buf(), id); + self.registry.insert(name, id); + + // Generate UUID for directory + let uuid = uuid::Uuid::new_v4(); + self.entry_uuids.insert(path.to_path_buf(), uuid); + + id } /// Add an entry to the index. Returns Some(content_kind) if added, None if duplicate. 
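The `ensure_directory` helper above is what makes the unified index path-agnostic: any entry can be added under an absolute path and the missing ancestor chain is materialized on demand. A minimal sketch of the resulting behavior, using `add_entry` and `list_directory` as documented elsewhere in this series (the `metadata` value is a hypothetical stand-in for the real metadata argument, so this is illustrative rather than part of the patch):

```rust,ignore
// Sketch: one shared index, arbitrary absolute paths.
let mut index = EphemeralIndex::new();

// Adding a deeply nested file implicitly creates /home, /home/user, and
// /home/user/docs via ensure_directory(), wiring up the parent links.
index.add_entry(
	PathBuf::from("/home/user/docs/report.pdf"),
	Uuid::new_v4(),
	metadata, // hypothetical metadata value for the file
);

// The parent chain now resolves, so a directory listing sees the file.
assert_eq!(
	index.list_directory(Path::new("/home/user/docs")).map(|c| c.len()),
	Some(1)
);
```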
@@ -265,7 +309,24 @@ impl EphemeralIndex { return None; } - // Intern the filename + // Ensure parent directory exists in the index FIRST (requires &mut self) + // This must happen before interning the name to avoid borrow conflicts + let parent_id = if let Some(parent_path) = path.parent() { + if parent_path.as_os_str().is_empty() { + // Root of filesystem, no parent + None + } else if let Some(&existing_id) = self.path_index.get(parent_path) { + // Parent already exists + Some(existing_id) + } else { + // Parent doesn't exist - ensure it (and ancestors) are created + Some(self.ensure_directory(parent_path)) + } + } else { + None + }; + + // Now intern the filename (borrows self.cache immutably) let name = self.cache.intern( path.file_name() .map(|s| s.to_string_lossy()) @@ -273,12 +334,6 @@ impl EphemeralIndex { .unwrap_or("unknown"), ); - // Find parent - let parent_id = path - .parent() - .and_then(|p| self.path_index.get(p).copied()) - .unwrap_or(self.root); - // Create metadata let file_type = FileType::from(metadata.kind); @@ -286,13 +341,18 @@ impl EphemeralIndex { .with_times(metadata.modified, metadata.created); // Create node - let node = FileNode::new(NameRef::new(name, MaybeEntryId::some(parent_id)), meta); + let parent_ref = parent_id + .map(MaybeEntryId::some) + .unwrap_or(MaybeEntryId::NONE); + let node = FileNode::new(NameRef::new(name, parent_ref), meta); let id = self.arena.insert(node); // Add to parent's children - if let Some(parent) = self.arena.get_mut(parent_id) { - parent.add_child(id); + if let Some(parent_id) = parent_id { + if let Some(parent) = self.arena.get_mut(parent_id) { + parent.add_child(id); + } } // Detect content kind by extension (fast, no I/O) @@ -394,20 +454,23 @@ impl EphemeralIndex { let mut segments = Vec::new(); let mut current = id; + // Walk up the tree collecting path segments while let Some(node) = self.arena.get(current) { + segments.push(node.name().to_owned()); if let Some(parent) = node.parent() { - segments.push(node.name().to_owned()); current = parent; } else { + // Reached a root node (no parent) break; } } if segments.is_empty() { - return Some(self.root_path.clone()); + return None; } - let mut path = self.root_path.clone(); + // Build absolute path from segments (root to leaf) + let mut path = PathBuf::from("/"); for segment in segments.into_iter().rev() { path.push(segment); } @@ -519,6 +582,12 @@ impl EphemeralIndex { } } +impl Default for EphemeralIndex { + fn default() -> Self { + Self::new() + } +} + /// Statistics about an ephemeral index #[derive(Debug, Clone)] pub struct EphemeralIndexStats { @@ -575,13 +644,7 @@ impl JobHandler for IndexerJob { // Initialize ephemeral index if needed if self.config.is_ephemeral() && self.ephemeral_index.is_none() { - let root_path = - self.config.path.as_local_path().ok_or_else(|| { - JobError::execution("Path not accessible locally".to_string()) - })?; - self.ephemeral_index = Some(Arc::new(RwLock::new(EphemeralIndex::new( - root_path.to_path_buf(), - )))); + self.ephemeral_index = Some(Arc::new(RwLock::new(EphemeralIndex::new()))); ctx.log("Initialized ephemeral index for non-persistent job"); } @@ -750,6 +813,7 @@ impl JobHandler for IndexerJob { state, &ctx, ephemeral_index, + root_path, volume_backend.as_ref(), ) .await?; @@ -879,15 +943,14 @@ impl JobHandler for IndexerJob { // Mark ephemeral indexing as complete in the cache if self.config.is_ephemeral() { - if let Some(ephemeral_index) = &self.ephemeral_index { - let root_path = ephemeral_index.read().await.root_path.clone(); + 
if let Some(local_path) = self.config.path.as_local_path() { ctx.library() .core_context() .ephemeral_cache() - .mark_indexing_complete(&root_path); + .mark_indexing_complete(local_path); ctx.log(format!( "Marked ephemeral indexing complete for: {}", - root_path.display() + local_path.display() )); } } @@ -1076,31 +1139,29 @@ impl IndexerJob { state: &mut IndexerState, ctx: &JobContext<'_>, ephemeral_index: Arc>, - volume_backend: Option<&Arc>, + root_path: &Path, + _volume_backend: Option<&Arc>, ) -> JobResult<()> { use super::persistence::PersistenceFactory; ctx.log("Starting ephemeral processing"); - // Get root path from ephemeral index - let root_path = { - let index = ephemeral_index.read().await; - index.root_path.clone() - }; - // Get event bus from library let event_bus = Some(ctx.library().event_bus().clone()); // Create ephemeral persistence layer (emits events as entries are stored) - let persistence = - PersistenceFactory::ephemeral(ephemeral_index.clone(), event_bus, root_path.clone()); + let persistence = PersistenceFactory::ephemeral( + ephemeral_index.clone(), + event_bus, + root_path.to_path_buf(), + ); // Process all batches through persistence layer while let Some(batch) = state.entry_batches.pop() { for entry in batch { // Store entry (this will emit ResourceChanged events) // Content kind is identified by extension during add_entry, no hashing needed - let _entry_id = persistence.store_entry(&entry, None, &root_path).await?; + let _entry_id = persistence.store_entry(&entry, None, root_path).await?; } } diff --git a/core/src/ops/indexing/persistence.rs b/core/src/ops/indexing/persistence.rs index cc89b6bdf..30d647bc3 100644 --- a/core/src/ops/indexing/persistence.rs +++ b/core/src/ops/indexing/persistence.rs @@ -620,9 +620,7 @@ mod tests { std::fs::write(&test_file, b"test content").unwrap(); // Create ephemeral index - let index = Arc::new(RwLock::new(EphemeralIndex::new( - temp_dir.path().to_path_buf(), - ))); + let index = Arc::new(RwLock::new(EphemeralIndex::new())); // Create event collector let collected_events = Arc::new(Mutex::new(Vec::new())); diff --git a/core/src/ops/indexing/verify/action.rs b/core/src/ops/indexing/verify/action.rs index d9418d920..130f754f3 100644 --- a/core/src/ops/indexing/verify/action.rs +++ b/core/src/ops/indexing/verify/action.rs @@ -104,7 +104,7 @@ impl IndexVerifyAction { tracing::debug!("Running ephemeral indexer job on {}", path.display()); // Create ephemeral index storage that we'll share with the job - let ephemeral_index = Arc::new(RwLock::new(EphemeralIndex::new(path.to_path_buf()))); + let ephemeral_index = Arc::new(RwLock::new(EphemeralIndex::new())); // Subscribe to job events before dispatching let mut event_subscriber = context.events.subscribe(); From 456da8a9240ec0dbd2e90f202b677b5b1d98187f Mon Sep 17 00:00:00 2001 From: Jamie Pine Date: Sun, 7 Dec 2025 21:35:19 -0800 Subject: [PATCH 05/20] Enhance ephemeral indexing by clearing stale entries before re-indexing - Implemented a mechanism to clear stale entries for a directory's children during re-indexing to prevent ghost files. - Updated the `create_for_indexing` method to remove previously indexed paths and ensure a clean slate for new indexing operations. - Added logging for the number of cleared entries to aid in debugging and monitoring. 
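Taken together with the cache changes, the intended call sequence on the directory-listing path is roughly the following (a sketch assembled from the methods introduced in this series; error handling and job dispatch omitted):

```rust,ignore
// 1. Hand out the shared global index and flag the path as in-progress.
let index = cache.create_for_indexing(local_path.clone());

// 2. Drop stale children so files deleted since the last browse
//    don't linger as ghost entries.
let cleared = cache.clear_for_reindex(&local_path).await;

// 3. Run the ephemeral indexer job against `index` ...

// 4. The job flips the path from in-progress to indexed on completion.
cache.mark_indexing_complete(&local_path);
```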
--- core/src/ops/files/query/directory_listing.rs | 10 +++++ .../src/ops/indexing/ephemeral/index_cache.rs | 19 ++++++++- core/src/ops/indexing/job.rs | 42 +++++++++++++++++++ 3 files changed, 70 insertions(+), 1 deletion(-) diff --git a/core/src/ops/files/query/directory_listing.rs b/core/src/ops/files/query/directory_listing.rs index becedfdfa..514d8e606 100644 --- a/core/src/ops/files/query/directory_listing.rs +++ b/core/src/ops/files/query/directory_listing.rs @@ -748,6 +748,16 @@ impl DirectoryListingQuery { // Create cache entry and get the index to share with the job let ephemeral_index = cache.create_for_indexing(local_path.clone()); + // Clear any stale entries from previous indexing (prevents ghost files) + let cleared = cache.clear_for_reindex(&local_path).await; + if cleared > 0 { + tracing::debug!( + "Cleared {} stale entries for re-indexing: {}", + cleared, + local_path.display() + ); + } + // Create ephemeral indexer job for this directory (shallow, current scope only) let config = IndexerJobConfig::ephemeral_browse( self.input.path.clone(), diff --git a/core/src/ops/indexing/ephemeral/index_cache.rs b/core/src/ops/indexing/ephemeral/index_cache.rs index 1aef082a8..660bf8570 100644 --- a/core/src/ops/indexing/ephemeral/index_cache.rs +++ b/core/src/ops/indexing/ephemeral/index_cache.rs @@ -83,12 +83,30 @@ impl EphemeralIndexCache { /// /// Marks the path as indexing-in-progress and returns the global index. /// The indexer job should add entries to this shared index. + /// + /// If the path was previously indexed, clears its children first to + /// prevent ghost entries from deleted files. pub fn create_for_indexing(&self, path: PathBuf) -> Arc> { let mut in_progress = self.indexing_in_progress.write(); + let mut indexed = self.indexed_paths.write(); + + // If this path was previously indexed, remove it from indexed set + // The actual clearing of stale entries happens asynchronously via clear_for_reindex + indexed.remove(&path); in_progress.insert(path); + self.index.clone() } + /// Clear stale entries for a path before re-indexing (async version) + /// + /// Call this after create_for_indexing to remove old children entries. + /// This prevents ghost entries when files are deleted between index runs. + pub async fn clear_for_reindex(&self, path: &Path) -> usize { + let mut index = self.index.write().await; + index.clear_directory_children(path) + } + /// Mark indexing as complete for a path /// /// Moves the path from "in progress" to "indexed" state. @@ -192,7 +210,6 @@ pub struct EphemeralIndexCacheStats { pub indexed_paths: usize, /// Number of paths currently being indexed pub indexing_in_progress: usize, - // Legacy field names for compatibility } diff --git a/core/src/ops/indexing/job.rs b/core/src/ops/indexing/job.rs index b29183ed7..6929da27d 100644 --- a/core/src/ops/indexing/job.rs +++ b/core/src/ops/indexing/job.rs @@ -449,6 +449,48 @@ impl EphemeralIndex { ) } + /// Clear all direct children of a directory (for re-indexing) + /// + /// This removes entries for the immediate children of the given directory, + /// preventing ghost entries when files are deleted between index runs. + /// Note: Does not recursively clear subdirectories. 
+	pub fn clear_directory_children(&mut self, dir_path: &Path) -> usize {
+		// Get the directory's children paths first
+		let children_paths: Vec<PathBuf> = if let Some(dir_id) = self.path_index.get(dir_path) {
+			if let Some(dir_node) = self.arena.get(*dir_id) {
+				dir_node
+					.children
+					.iter()
+					.filter_map(|&child_id| self.reconstruct_path(child_id))
+					.collect()
+			} else {
+				return 0;
+			}
+		} else {
+			return 0;
+		};
+
+		let mut cleared = 0;
+
+		// Remove each child from indexes (arena nodes are left as orphans - acceptable for ephemeral)
+		for child_path in &children_paths {
+			if self.path_index.remove(child_path).is_some() {
+				cleared += 1;
+			}
+			self.entry_uuids.remove(child_path);
+			self.content_kinds.remove(child_path);
+		}
+
+		// Clear the parent's children list
+		if let Some(dir_id) = self.path_index.get(dir_path) {
+			if let Some(dir_node) = self.arena.get_mut(*dir_id) {
+				dir_node.children.clear();
+			}
+		}
+
+		cleared
+	}
+
 	/// Reconstruct full path for a node
 	fn reconstruct_path(&self, id: super::ephemeral::EntryId) -> Option<PathBuf> {
 		let mut segments = Vec::new();

From aff2398563942375c148164fcd4e4b2f47e05abf Mon Sep 17 00:00:00 2001
From: Jamie Pine
Date: Sun, 7 Dec 2025 21:37:28 -0800
Subject: [PATCH 06/20] Implement shared path tracking in parallel discovery
 to prevent duplicate processing

- Introduced a shared `seen_paths` structure using `RwLock` to manage paths
  across all workers, addressing symlink loops and duplicate directory
  processing.
- Updated the `discovery_worker_rayon` function to utilize the shared
  `seen_paths`, enhancing efficiency and correctness in the discovery phase.
---
 core/src/ops/indexing/phases/discovery.rs | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/core/src/ops/indexing/phases/discovery.rs b/core/src/ops/indexing/phases/discovery.rs
index c9cf09a57..4db7f7ce3 100644
--- a/core/src/ops/indexing/phases/discovery.rs
+++ b/core/src/ops/indexing/phases/discovery.rs
@@ -90,6 +90,10 @@ async fn run_parallel_discovery(
 	let skipped_count = Arc::new(AtomicU64::new(0));
 	let shutdown = Arc::new(AtomicBool::new(false));
 
+	// Shared seen_paths across all workers to prevent duplicate processing
+	// (handles symlink loops and same directory reached via different paths)
+	let seen_paths = Arc::new(parking_lot::RwLock::new(std::collections::HashSet::new()));
+
 	// Seed initial work
 	while let Some(dir) = state.dirs_to_walk.pop_front() {
 		pending_work.fetch_add(1, Ordering::Release);
@@ -108,6 +112,7 @@ async fn run_parallel_discovery(
 		let pending_work = Arc::clone(&pending_work);
 		let skipped_count = Arc::clone(&skipped_count);
 		let shutdown = Arc::clone(&shutdown);
+		let seen_paths = Arc::clone(&seen_paths);
 		let root_path = root_path.to_path_buf();
 		let volume_backend = volume_backend.cloned();
 		let cloud_url_base = cloud_url_base.clone();
@@ -121,6 +126,7 @@ async fn run_parallel_discovery(
 				pending_work,
 				skipped_count,
 				shutdown,
+				seen_paths,
 				root_path,
 				rule_toggles,
 				volume_backend,
@@ -265,13 +271,12 @@ async fn discovery_worker_rayon(
 	pending_work: Arc<AtomicUsize>,
 	skipped_count: Arc<AtomicU64>,
 	shutdown: Arc<AtomicBool>,
+	seen_paths: Arc<parking_lot::RwLock<std::collections::HashSet<PathBuf>>>,
 	root_path: PathBuf,
 	rule_toggles: RuleToggles,
 	volume_backend: Option<Arc<dyn VolumeBackend>>,
 	cloud_url_base: Option<String>,
 ) {
-	let mut seen_paths = std::collections::HashSet::new();
-
 	loop {
 		// Check shutdown signal
 		if shutdown.load(Ordering::Acquire) {
@@ -290,10 +295,13 @@ async fn discovery_worker_rayon(
 			Err(_) => continue, // Timeout, check shutdown flag again
 		};
 
-		// Skip if already seen (handles symlink loops)
-		if !seen_paths.insert(dir_path.clone()) {
-
pending_work.fetch_sub(1, Ordering::Release); - continue; + // Skip if already seen (handles symlink loops across ALL workers) + { + let mut seen = seen_paths.write(); + if !seen.insert(dir_path.clone()) { + pending_work.fetch_sub(1, Ordering::Release); + continue; + } } // Build rules for this directory From 4a2590d418b49672bdacd3136c3343322902b49b Mon Sep 17 00:00:00 2001 From: Jamie Pine Date: Sun, 7 Dec 2025 21:46:21 -0800 Subject: [PATCH 07/20] Refactor IndexerJob to separate job phase execution and ensure proper ephemeral indexing cleanup - Moved the job phase logic into a new `run_job_phases` method for better organization and clarity. - Updated the `run` method to always mark ephemeral indexing as complete, even on failure, preventing stuck indexing flags. - Enhanced logging to provide feedback on the completion status of ephemeral indexing. --- core/src/ops/indexing/job.rs | 81 ++++++++++++++++++++++-------------- 1 file changed, 50 insertions(+), 31 deletions(-) diff --git a/core/src/ops/indexing/job.rs b/core/src/ops/indexing/job.rs index 6929da27d..bffab0ef6 100644 --- a/core/src/ops/indexing/job.rs +++ b/core/src/ops/indexing/job.rs @@ -674,22 +674,9 @@ impl DynJob for IndexerJob { impl JobProgress for IndexerProgress {} -#[async_trait::async_trait] -impl JobHandler for IndexerJob { - type Output = IndexerOutput; - - async fn run(&mut self, ctx: JobContext<'_>) -> JobResult { - // Initialize timer - if self.timer.is_none() { - self.timer = Some(PhaseTimer::new()); - } - - // Initialize ephemeral index if needed - if self.config.is_ephemeral() && self.ephemeral_index.is_none() { - self.ephemeral_index = Some(Arc::new(RwLock::new(EphemeralIndex::new()))); - ctx.log("Initialized ephemeral index for non-persistent job"); - } - +impl IndexerJob { + /// Inner implementation of the job phases (separated for cleanup guarantee) + async fn run_job_phases(&mut self, ctx: &JobContext<'_>) -> JobResult { // Initialize or restore state // Ensure state is always created early to avoid serialization issues if self.state.is_none() { @@ -983,21 +970,7 @@ impl JobHandler for IndexerJob { } } - // Mark ephemeral indexing as complete in the cache - if self.config.is_ephemeral() { - if let Some(local_path) = self.config.path.as_local_path() { - ctx.library() - .core_context() - .ephemeral_cache() - .mark_indexing_complete(local_path); - ctx.log(format!( - "Marked ephemeral indexing complete for: {}", - local_path.display() - )); - } - } - - // Generate final output + // Generate final output (cleanup happens in outer run() method) Ok(IndexerOutput { location_id: self.config.location_id, stats: state.stats, @@ -1011,6 +984,52 @@ impl JobHandler for IndexerJob { }, }) } +} + +// JobHandler trait implementation +#[async_trait::async_trait] +impl JobHandler for IndexerJob { + type Output = IndexerOutput; + + async fn run(&mut self, ctx: JobContext<'_>) -> JobResult { + // Initialize timer + if self.timer.is_none() { + self.timer = Some(PhaseTimer::new()); + } + + // Initialize ephemeral index if needed + if self.config.is_ephemeral() && self.ephemeral_index.is_none() { + self.ephemeral_index = Some(Arc::new(RwLock::new(EphemeralIndex::new()))); + ctx.log("Initialized ephemeral index for non-persistent job"); + } + + // Run the actual job, ensuring ephemeral cleanup happens on both success and failure + let result = self.run_job_phases(&ctx).await; + + // ALWAYS mark ephemeral indexing complete, even on failure + // This prevents the indexing flag from being stuck forever + if self.config.is_ephemeral() 
{ + if let Some(local_path) = self.config.path.as_local_path() { + ctx.library() + .core_context() + .ephemeral_cache() + .mark_indexing_complete(local_path); + match &result { + Ok(_) => ctx.log(format!( + "Marked ephemeral indexing complete for: {}", + local_path.display() + )), + Err(e) => ctx.log(format!( + "Marked ephemeral indexing complete (job failed: {}) for: {}", + e, + local_path.display() + )), + } + } + } + + result + } async fn on_resume(&mut self, ctx: &JobContext<'_>) -> JobResult { // State is already loaded from serialization From ed0fa209b4ee546362539d9b04b7326c4be0bb05 Mon Sep 17 00:00:00 2001 From: Jamie Pine Date: Sun, 7 Dec 2025 22:23:57 -0800 Subject: [PATCH 08/20] Improve comments --- apps/cli/src/domains/index/mod.rs | 19 +- core/src/ops/indexing/ctx.rs | 32 +- core/src/ops/indexing/entry.rs | 281 ++++++++++-------- core/src/ops/indexing/job.rs | 243 ++++++--------- core/src/ops/indexing/mod.rs | 32 +- core/src/ops/indexing/persistence.rs | 106 +++---- core/src/ops/indexing/phases/content.rs | 51 ++-- core/src/ops/indexing/phases/discovery.rs | 106 ++++--- core/src/ops/indexing/phases/processing.rs | 89 +++--- core/src/ops/indexing/state.rs | 103 +++---- .../JobManager/hooks/useJobCount.ts | 90 +++--- 11 files changed, 561 insertions(+), 591 deletions(-) diff --git a/apps/cli/src/domains/index/mod.rs b/apps/cli/src/domains/index/mod.rs index e3cb7add9..d205ba77b 100644 --- a/apps/cli/src/domains/index/mod.rs +++ b/apps/cli/src/domains/index/mod.rs @@ -280,17 +280,13 @@ pub async fn run(ctx: &Context, cmd: IndexCmd) -> Result<()> { "Interned strings (shared)", &stats.interned_strings.to_string(), ]); - stats_table.add_row(vec![ - "Content kinds", - &stats.content_kinds.to_string(), - ]); + stats_table.add_row(vec!["Content kinds", &stats.content_kinds.to_string()]); stats_table.add_row(vec![ "Memory usage", &format_bytes(stats.memory_bytes as u64), ]); stats_table.add_row(vec!["Cache age", &format!("{:.1}s", stats.age_seconds)]); - stats_table - .add_row(vec!["Idle time", &format!("{:.1}s", stats.idle_seconds)]); + stats_table.add_row(vec!["Idle time", &format!("{:.1}s", stats.idle_seconds)]); println!("{}", stats_table); @@ -303,14 +299,11 @@ pub async fn run(ctx: &Context, cmd: IndexCmd) -> Result<()> { println!(); let mut progress_table = Table::new(); progress_table.load_preset(UTF8_BORDERS_ONLY); - progress_table.set_header(vec![ - Cell::new("INDEXING IN PROGRESS").add_attribute(Attribute::Bold), - ]); + progress_table + .set_header(vec![Cell::new("INDEXING IN PROGRESS") + .add_attribute(Attribute::Bold)]); for path in &status.paths_in_progress { - progress_table.add_row(vec![format!( - "● {}", - path.display() - )]); + progress_table.add_row(vec![format!("● {}", path.display())]); } println!("{}", progress_table); } diff --git a/core/src/ops/indexing/ctx.rs b/core/src/ops/indexing/ctx.rs index ed8aa82ed..4ff1088a0 100644 --- a/core/src/ops/indexing/ctx.rs +++ b/core/src/ops/indexing/ctx.rs @@ -1,8 +1,9 @@ -//! Lightweight context abstraction for indexing operations +//! Context abstraction for indexing operations. //! -//! Provides a minimal interface required by indexing code paths so they can run -//! either inside the job system (with `JobContext`) or outside of it (watcher -//! responder) without duplicating logic. +//! The `IndexingCtx` trait provides a minimal interface that indexing code paths +//! need to function. This allows the same indexing logic to run both inside the +//! 
job system (with `JobContext`) and outside of it (watcher responder), avoiding +//! code duplication between job-based and event-driven indexing. use sea_orm::DatabaseConnection; use std::sync::Arc; @@ -10,17 +11,23 @@ use uuid::Uuid; use crate::{context::CoreContext, infra::job::prelude::JobContext, library::Library}; -/// Minimal capabilities needed by indexing operations +/// Minimal interface required by indexing operations. +/// +/// This trait abstracts away the difference between job-based indexing and +/// event-driven indexing (file watcher responders). Both execution contexts +/// provide database access and logging, but only the job context has full +/// library access for sync operations. pub trait IndexingCtx { - /// Access to the library database connection fn library_db(&self) -> &DatabaseConnection; - /// Access to the library for sync operations (optional - only available in job context) + /// Returns the library reference when running in job context, None otherwise. + /// + /// This is only available for job-based indexing since responder contexts + /// don't have direct library access (they operate through the event bus). fn library(&self) -> Option<&Library> { None } - /// Lightweight logging hook fn log(&self, message: impl AsRef) { tracing::debug!(message = %message.as_ref()); } @@ -36,14 +43,17 @@ impl<'a> IndexingCtx for JobContext<'a> { } } -/// Context for responder paths running outside the job system +/// Context for file watcher responders that run outside the job system. +/// +/// Responders handle filesystem events (file created, moved, deleted) by +/// performing incremental indexing updates. They operate independently of +/// the job system and communicate results through the event bus rather than +/// job completion. pub struct ResponderCtx { - /// Cloned DB connection for the target library db: DatabaseConnection, } impl ResponderCtx { - /// Build a responder context for a specific library pub async fn new(context: &Arc, library_id: Uuid) -> anyhow::Result { let library: Arc = context .get_library(library_id) diff --git a/core/src/ops/indexing/entry.rs b/core/src/ops/indexing/entry.rs index 16c32cb1a..5f0da14bd 100644 --- a/core/src/ops/indexing/entry.rs +++ b/core/src/ops/indexing/entry.rs @@ -1,4 +1,40 @@ -//! Entry processing and metadata extraction +//! # Entry Processing and Persistence +//! +//! `core::ops::indexing::entry` handles the translation of discovered filesystem +//! entries into database records, managing the full lifecycle from metadata extraction +//! to content identification and move operations. +//! +//! ## Key Design Decisions +//! +//! **Closure Table Hierarchy:** Parent-child relationships use a closure table +//! (`entry_closure`) instead of recursive Common Table Expressions (CTEs). This makes +//! "find all descendants" queries O(1) regardless of nesting depth, at the cost of +//! additional storage (~N² in worst case for deeply nested trees). Move operations +//! require rebuilding closures for the entire moved subtree. +//! +//! **Ephemeral UUID Preservation:** When converting ephemeral browsing sessions to +//! persistent indexed locations, entries retain their original UUIDs. This prevents +//! orphaning user metadata (tags, notes, colors) that were attached during browsing. +//! Without preservation, promoting `/mnt/nas` to a managed location would generate new +//! UUIDs and break all existing tag associations. +//! +//! **Deterministic Content UUIDs:** Content identities use v5 UUIDs (namespace hash of +//! 
`content_hash + library_id`) so different devices can independently identify identical +//! files and merge metadata without coordination. This enables offline duplicate detection. +//! +//! ## Example +//! ```rust,no_run +//! use spacedrive_core::ops::indexing::{EntryProcessor, state::DirEntry}; +//! +//! let entry = DirEntry { /* ... */ }; +//! let entry_id = EntryProcessor::create_entry( +//! &mut state, +//! &ctx, +//! &entry, +//! device_id, +//! &location_root, +//! ).await?; +//! ``` use super::ctx::IndexingCtx; use super::path_resolver::PathResolver; @@ -15,8 +51,12 @@ use sea_orm::{ use std::path::{Path, PathBuf}; use uuid::Uuid; -/// Normalize cloud directory path for consistent lookups -/// Cloud paths stored with trailing slashes don't match PathBuf::parent() results +/// Normalizes cloud storage paths to match PathBuf::parent() semantics. +/// +/// Cloud backends (S3, Dropbox) store directory paths with trailing slashes +/// ("s3://bucket/folder/"), but Rust's PathBuf::parent() strips the trailing slash. +/// This mismatch breaks cache lookups when creating child entries. We normalize by +/// removing the trailing slash for cloud paths so cached parent IDs can be found. fn normalize_cloud_dir_path(path: &Path) -> PathBuf { let path_str = path.to_string_lossy(); if path_str.contains("://") && path_str.ends_with('/') { @@ -26,7 +66,17 @@ fn normalize_cloud_dir_path(path: &Path) -> PathBuf { } } -/// Metadata about a file system entry +/// Snapshot of filesystem metadata for a single entry. +/// +/// This struct is deliberately separate from the database `entry::Model` to +/// decouple discovery (filesystem operations) from persistence (database writes). +/// During ephemeral browsing, thousands of these are created in memory without +/// touching the database, while persistent indexing converts them to ActiveModels +/// in batch transactions. +/// +/// The `inode` field is populated on Unix systems but remains `None` on Windows, +/// where file indices are unstable across reboots. Change detection uses +/// (inode, mtime, size) tuples when available, falling back to path-only matching. #[derive(Debug, Clone)] pub struct EntryMetadata { pub path: PathBuf, @@ -61,10 +111,20 @@ impl From for EntryMetadata { } } -/// Handles entry creation and updates in the database +/// Entry persistence operations for the indexing system. +/// +/// EntryProcessor provides methods for creating, updating, and moving database entries, +/// handling the complexity of closure table updates and directory path cascades. All +/// methods come in both standalone (creates own transaction) and `_in_conn` variants +/// (uses existing transaction) for flexible batch operations. pub struct EntryProcessor; -/// Result of content identity linking (for batch sync) +/// Result of linking an entry to its content identity. +/// +/// Returned by `link_to_content_identity` to provide both models for sync operations. +/// The caller must sync both the content_identity and entry if running outside the +/// job system (e.g., file watcher). The `is_new_content` flag indicates whether this +/// is the first entry with this content hash, which triggers thumbnail generation. pub struct ContentLinkResult { pub content_identity: entities::content_identity::Model, pub entry: entities::entry::Model, @@ -81,9 +141,10 @@ impl EntryProcessor { #[cfg(windows)] pub fn get_inode(_metadata: &std::fs::Metadata) -> Option { - // Windows doesn't have inodes. 
- // The method `file_index()` from `std::os::windows::fs::MetadataExt` is unstable (issue #63010). - // Returning None is safe as the field is Optional. + // Windows file indices exist but are unstable across reboots and volume operations, + // making them unsuitable for change detection. We return None and fall back to + // path-only matching, which is sufficient since Windows NTFS doesn't support hard + // links for directories (the main inode use case on Unix). None } @@ -92,14 +153,20 @@ impl EntryProcessor { None } - /// Extract detailed metadata from a path + /// Extracts filesystem metadata through either a volume backend or direct I/O. /// - /// Uses the provided VolumeBackend if available, otherwise falls back to direct filesystem access. + /// Volume backends abstract cloud storage (S3, Dropbox) and local filesystems + /// behind a unified interface. When a backend is provided, metadata comes from + /// the volume's cache or API; otherwise this falls back to `tokio::fs` for local + /// paths. Cloud volumes MUST provide a backend since there's no local file to read. + /// + /// Returns `Err` if the path doesn't exist or lacks read permissions. On permission + /// errors, the entry should still be indexed as inaccessible rather than skipped + /// entirely - this preserves the directory tree structure for UI navigation. pub async fn extract_metadata( path: &Path, backend: Option<&std::sync::Arc>, ) -> Result { - // Use backend if available, otherwise fall back to local filesystem if let Some(backend) = backend { let raw = backend .metadata(path) @@ -122,7 +189,6 @@ impl EntryProcessor { .unwrap_or(false), }) } else { - // Fallback to direct filesystem access let metadata = tokio::fs::symlink_metadata(path).await?; let kind = if metadata.is_dir() { @@ -174,7 +240,10 @@ impl EntryProcessor { out_self_closures: &mut Vec, out_dir_paths: &mut Vec, ) -> Result { - // Extract file extension (without dot) for files, None for directories + // Extensions are normalized to lowercase and stored without the leading dot + // because search queries are case-insensitive ("JPG" should match "*.jpg"). + // Directories never have extensions even if named "folder.app" since macOS + // treats .app bundles as atomic units, not files with extensions. let extension = match entry.kind { EntryKind::File => entry .path @@ -184,35 +253,25 @@ impl EntryProcessor { EntryKind::Directory | EntryKind::Symlink => None, }; - // Get file/directory name - // For files: use stem (name without extension) - // For directories: use full name (including .app, etc.) 
let name = match entry.kind { - EntryKind::File => { - // For files, use stem (without extension) - entry - .path - .file_stem() - .map(|stem| stem.to_string_lossy().to_string()) - .unwrap_or_else(|| { - entry - .path - .file_name() - .map(|n| n.to_string_lossy().to_string()) - .unwrap_or_else(|| "unknown".to_string()) - }) - } - EntryKind::Directory | EntryKind::Symlink => { - // For directories and symlinks, use full name - entry - .path - .file_name() - .map(|n| n.to_string_lossy().to_string()) - .unwrap_or_else(|| "unknown".to_string()) - } + EntryKind::File => entry + .path + .file_stem() + .map(|stem| stem.to_string_lossy().to_string()) + .unwrap_or_else(|| { + entry + .path + .file_name() + .map(|n| n.to_string_lossy().to_string()) + .unwrap_or_else(|| "unknown".to_string()) + }), + EntryKind::Directory | EntryKind::Symlink => entry + .path + .file_name() + .map(|n| n.to_string_lossy().to_string()) + .unwrap_or_else(|| "unknown".to_string()), }; - // Convert timestamps let modified_at = entry .modified .and_then(|t| { @@ -223,11 +282,11 @@ impl EntryProcessor { }) .unwrap_or_else(|| chrono::Utc::now()); - // UUID assignment strategy: - // 1. First check if there's an ephemeral UUID to preserve (from previous browsing) - // 2. If not, generate a new UUID - // - // This ensures that files browsed before enabling indexing keep the same UUID + // UUID assignment strategy: preserve ephemeral UUIDs from prior browsing sessions + // so user metadata (tags, notes) attached during ephemeral mode survives the + // transition to persistent indexing. Without preservation, adding a browsed folder + // as a managed location would orphan all existing tags and make Quick Look previews + // flash as UUIDs change. The ephemeral cache is populated during state initialization. let entry_uuid = if let Some(ephemeral_uuid) = state.get_ephemeral_uuid(&entry.path) { tracing::debug!( "Preserving ephemeral UUID {} for {}", @@ -239,7 +298,6 @@ impl EntryProcessor { Some(Uuid::new_v4()) }; - // Find parent entry ID let parent_id = if let Some(parent_path) = entry.path.parent() { ctx.log(format!( "Looking up parent for {}: parent_path = {}", @@ -247,13 +305,12 @@ impl EntryProcessor { parent_path.display() )); - // First check the cache if let Some(id) = state.entry_id_cache.get(parent_path).copied() { ctx.log(format!("Found parent in cache: id = {}", id)); Some(id) } else { - // If not in cache, try to find it in the database - // For cloud paths, try both with and without trailing slash + // For cloud paths, try both with and without trailing slash since cloud backends + // may store paths inconsistently depending on API responses. let parent_path_str = parent_path.to_string_lossy().to_string(); let is_cloud = parent_path_str.contains("://"); @@ -277,7 +334,9 @@ impl EntryProcessor { .insert(parent_path.to_path_buf(), dir_path_record.entry_id); Some(dir_path_record.entry_id) } else { - // Parent not found - this shouldn't happen with proper sorting + // Parent not found indicates entries arrived out of order, possibly from + // concurrent file watchers or interrupted batch processing. The entry will + // be orphaned (parent_id = NULL) until the next full reindex repairs the hierarchy. 
ctx.log(format!( "WARNING: Parent not found for {}: {} (tried: {:?})", entry.path.display(), @@ -291,7 +350,6 @@ impl EntryProcessor { None }; - // Create entry let now = chrono::Utc::now(); tracing::debug!( "Creating entry: name={}, path={}, inode={:?}, parent_id={:?}", @@ -305,23 +363,22 @@ impl EntryProcessor { name: Set(name.clone()), kind: Set(Self::entry_kind_to_int(entry.kind)), extension: Set(extension), - metadata_id: Set(None), // User metadata only created when user adds metadata - content_id: Set(None), // Will be set later during content identification phase + metadata_id: Set(None), + content_id: Set(None), size: Set(entry.size as i64), - aggregate_size: Set(0), // Will be calculated in aggregation phase - child_count: Set(0), // Will be calculated in aggregation phase - file_count: Set(0), // Will be calculated in aggregation phase + aggregate_size: Set(0), + child_count: Set(0), + file_count: Set(0), created_at: Set(now), modified_at: Set(modified_at), accessed_at: Set(None), - indexed_at: Set(Some(now)), // Record when we indexed this entry - permissions: Set(None), // TODO: Could extract from metadata + indexed_at: Set(Some(now)), + permissions: Set(None), inode: Set(entry.inode.map(|i| i as i64)), parent_id: Set(parent_id), ..Default::default() }; - // Insert the entry let result = new_entry .insert(conn) .await @@ -334,8 +391,6 @@ impl EntryProcessor { result.inode ); - // Populate closure table - // First, insert self-reference let self_closure = entry_closure::ActiveModel { ancestor_id: Set(result.id), descendant_id: Set(result.id), @@ -344,9 +399,9 @@ impl EntryProcessor { }; out_self_closures.push(self_closure); - // If there's a parent, copy all parent's ancestors + // Copy all parent's ancestor relationships to build the transitive closure for this entry. + // This allows "find all descendants" queries to run in O(1) without recursive traversal. if let Some(parent_id) = parent_id { - // Insert closure entries for all ancestors conn.execute_unprepared(&format!( "INSERT INTO entry_closure (ancestor_id, descendant_id, depth) \ SELECT ancestor_id, {}, depth + 1 \ @@ -360,12 +415,8 @@ impl EntryProcessor { })?; } - // If this is a directory, populate the directory_paths table if entry.kind == EntryKind::Directory { - // Use the absolute path from the DirEntry which contains the full filesystem path let absolute_path = entry.path.to_string_lossy().to_string(); - - // Insert into directory_paths table let dir_path_entry = directory_paths::ActiveModel { entry_id: Set(result.id), path: Set(absolute_path), @@ -374,8 +425,9 @@ impl EntryProcessor { out_dir_paths.push(dir_path_entry); } - // Cache the entry ID for potential children - // Normalize cloud directory paths to match what parent() returns + // Normalize cloud directory paths (remove trailing slash) so child entries can find + // their parent in the cache. PathBuf::parent() doesn't include trailing slashes, but + // cloud backends may store "s3://bucket/folder/" with the slash. let cache_key = if entry.kind == EntryKind::Directory { normalize_cloud_dir_path(&entry.path) } else { @@ -510,9 +562,9 @@ impl EntryProcessor { entry_active.inode = Set(Some(inode as i64)); } - // TODO: Rename indexed_at to last_indexed_at to better reflect its purpose - // Update indexed_at so incremental sync picks up this change - // Without this, modified entries would be skipped by watermark-based queries + // Update indexed_at so incremental sync picks up this change. 
+	// The watermark-based query filters on indexed_at, so skipping this would
+	// cause modified entries to be ignored on subsequent scans.
 		entry_active.indexed_at = Set(Some(chrono::Utc::now()));
 
 		entry_active
@@ -584,17 +636,14 @@ impl EntryProcessor {
 		let is_directory = db_entry.kind == Self::entry_kind_to_int(EntryKind::Directory);
 		let mut entry_active: entities::entry::ActiveModel = db_entry.into();
 
-		// Find new parent entry ID
 		let new_parent_id = if let Some(parent_path) = new_path.parent() {
 			state.entry_id_cache.get(parent_path).copied()
 		} else {
 			None
 		};
 
-		// Update entry fields
 		entry_active.parent_id = Set(new_parent_id);
 
-		// Extract new name if it changed
 		let mut new_name_value = None;
 		if let Some(new_name) = new_path.file_stem() {
 			let name_string = new_name.to_string_lossy().to_string();
@@ -602,14 +651,16 @@ impl EntryProcessor {
 			entry_active.name = Set(name_string);
 		}
 
-		// Save the updated entry
 		entry_active
 			.update(txn)
 			.await
 			.map_err(|e| JobError::execution(format!("Failed to update entry: {}", e)))?;
 
-		// Update closure table for the move operation
-		// Step 1: Delete all ancestor relationships for the moved subtree (except internal relationships)
+		// Rebuild closure table for the moved subtree. The row count scales with subtree
+		// size times ancestor depth: every entry in the subtree has its link to each
+		// outside ancestor deleted and recreated, so moving a 10,000-entry subtree in a
+		// deep hierarchy can rewrite hundreds of thousands of closure rows. We do this
+		// in two steps: (1) disconnect the subtree from old ancestors, (2) reconnect to new parent.
+		// Step 1: Delete all ancestor relationships for the moved subtree, but preserve internal
+		// relationships (entries within the subtree can still find their descendants).
 		txn.execute_unprepared(&format!(
 			"DELETE FROM entry_closure \
 			WHERE descendant_id IN (SELECT descendant_id FROM entry_closure WHERE ancestor_id = {}) \
 		.await
 		.map_err(|e| JobError::execution(format!("Failed to disconnect subtree: {}", e)))?;
 
-		// Step 2: If there's a new parent, reconnect the subtree
+		// Step 2: Reconnect the subtree under the new parent by creating closure rows for all
+		// (ancestor, descendant) pairs where ancestor is in the new parent chain and descendant
+		// is in the moved subtree. The depth is calculated as parent_depth + child_depth + 1.
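+		// Worked example (hypothetical ids): subtree root 7 moves under parent 3, where
+		// 3 sits at depth 1 below root 1. For a descendant 9 at depth 2 inside the
+		// subtree, the INSERT below produces rows (3, 9, 0 + 2 + 1) and (1, 9, 1 + 2 + 1),
+		// reconnecting 9 to every new ancestor in a single statement.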
if let Some(new_parent_id) = new_parent_id { - // Connect moved subtree to new parent txn.execute_unprepared(&format!( "INSERT INTO entry_closure (ancestor_id, descendant_id, depth) \ SELECT p.ancestor_id, c.descendant_id, p.depth + c.depth + 1 \ @@ -633,11 +685,8 @@ impl EntryProcessor { .map_err(|e| JobError::execution(format!("Failed to reconnect subtree: {}", e)))?; } - // If this is a directory, update its path in directory_paths table if is_directory { - // Get the new name from what we saved earlier let new_name = new_name_value.unwrap_or_else(|| { - // If name didn't change, get it from the path new_path .file_name() .and_then(|n| n.to_str()) @@ -645,7 +694,6 @@ impl EntryProcessor { .to_string() }); - // Build the new path let new_directory_path = PathResolver::build_directory_path(txn, new_parent_id, &new_name) .await @@ -653,14 +701,12 @@ impl EntryProcessor { JobError::execution(format!("Failed to build new directory path: {}", e)) })?; - // Get the old path for descendant updates let old_directory_path = PathResolver::get_directory_path(txn, entry_id) .await .map_err(|e| { JobError::execution(format!("Failed to get old directory path: {}", e)) })?; - // Update the directory's own path let mut dir_path_active = directory_paths::Entity::find_by_id(entry_id) .one(txn) .await @@ -672,8 +718,9 @@ impl EntryProcessor { JobError::execution(format!("Failed to update directory path: {}", e)) })?; - // Update descendant directory paths within the same transaction - // Note: This is done synchronously within the batch transaction for consistency + // Cascade path updates to all descendant directories. Moving "/home/user/docs" to + // "/backup/docs" requires rewriting paths for every child, which can be thousands + // of directories. This runs in the same transaction to maintain consistency. if let Err(e) = PathResolver::update_descendant_paths( txn, entry_id, @@ -704,9 +751,20 @@ impl EntryProcessor { } } - /// Create or find content identity and link to entry with deterministic UUID - /// This method implements the content identification phase logic - /// Returns models for batch syncing (caller responsible for sync) + /// Links an entry to its content identity, deduplicating files with identical hashes. + /// + /// Content identities are shared across all entries with the same content hash + /// (computed via BLAKE3). When two files have identical content, they reference + /// the same `content_identity` row, enabling "find all duplicates" queries and + /// reducing thumbnail storage (one thumbnail per content, not per entry). + /// + /// Each content identity gets a deterministic UUID (v5 hash of content_hash + library_id) + /// so other devices can independently identify the same content and merge their + /// metadata without coordination. This enables offline duplicate detection across + /// library peers. + /// + /// Returns both the content identity and the updated entry for batch sync operations. + /// The caller must sync both models if running outside the job system (e.g., watcher). 
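+	///
+	/// A minimal call sketch (caller bindings like `ctx`, `entry_id`, and `hash` are
+	/// illustrative placeholders, not values defined in this module):
+	/// ```rust,no_run
+	/// let link = EntryProcessor::link_to_content_identity(
+	///     &ctx, entry_id, &path, hash, library_id,
+	/// ).await?;
+	/// if link.is_new_content {
+	///     // first entry with this hash: a good trigger for thumbnail generation
+	/// }
+	/// ```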
pub async fn link_to_content_identity( ctx: &impl IndexingCtx, entry_id: i32, @@ -714,7 +772,6 @@ impl EntryProcessor { content_hash: String, library_id: Uuid, ) -> Result { - // Check if content identity already exists by content_hash let existing = entities::content_identity::Entity::find() .filter(entities::content_identity::Column::ContentHash.eq(&content_hash)) .one(ctx.library_db()) @@ -722,7 +779,6 @@ impl EntryProcessor { .map_err(|e| JobError::execution(format!("Failed to query content identity: {}", e)))?; let (content_model, is_new_content) = if let Some(existing) = existing { - // Increment entry count for existing content let mut existing_active: entities::content_identity::ActiveModel = existing.into(); existing_active.entry_count = Set(existing_active.entry_count.unwrap() + 1); existing_active.last_verified_at = Set(chrono::Utc::now()); @@ -736,36 +792,32 @@ impl EntryProcessor { (updated, false) } else { - // Create new content identity with deterministic UUID (ready for sync) let file_size = tokio::fs::symlink_metadata(path) .await .map(|m| m.len() as i64) .unwrap_or(0); - // Generate deterministic UUID from content_hash + library_id + // Generate deterministic v5 UUID (namespace hash) so different devices can independently + // create the same content identity UUID for duplicate files. The namespace is derived from + // the library ID, ensuring content UUIDs are unique per library while still being deterministic. let deterministic_uuid = { const LIBRARY_NAMESPACE: uuid::Uuid = uuid::Uuid::from_bytes([ 0x6b, 0xa7, 0xb8, 0x10, 0x9d, 0xad, 0x11, 0xd1, 0x80, 0xb4, 0x00, 0xc0, 0x4f, 0xd4, 0x30, 0xc8, ]); - // We use v5 to ensure the UUID is deterministic and unique within the library let namespace = uuid::Uuid::new_v5(&LIBRARY_NAMESPACE, library_id.as_bytes()); uuid::Uuid::new_v5(&namespace, content_hash.as_bytes()) }; - // Detect file type using the file type registry let registry = FileTypeRegistry::default(); let file_type_result = registry.identify(path).await; let (kind_id, mime_type_id) = match file_type_result { Ok(result) => { - // Get content kind ID directly from the enum let kind_id = result.file_type.category as i32; - // Handle MIME type - upsert if found let mime_type_id = if let Some(mime_str) = result.file_type.primary_mime_type() { - // Check if MIME type already exists let existing = entities::mime_type::Entity::find() .filter(entities::mime_type::Column::MimeType.eq(mime_str)) .one(ctx.library_db()) @@ -777,7 +829,6 @@ impl EntryProcessor { match existing { Some(mime_record) => Some(mime_record.id), None => { - // Create new MIME type entry let new_mime = entities::mime_type::ActiveModel { uuid: Set(Uuid::new_v4()), mime_type: Set(mime_str.to_string()), @@ -802,19 +853,16 @@ impl EntryProcessor { (kind_id, mime_type_id) } - Err(_) => { - // If identification fails, fall back to "unknown" (0) - (0, None) - } + Err(_) => (0, None), }; let new_content = entities::content_identity::ActiveModel { - uuid: Set(Some(deterministic_uuid)), // Deterministic UUID for sync - integrity_hash: Set(None), // Generated later by validate job + uuid: Set(Some(deterministic_uuid)), + integrity_hash: Set(None), content_hash: Set(content_hash.clone()), mime_type_id: Set(mime_type_id), kind_id: Set(kind_id), - text_content: Set(None), // TODO: Extract text content for indexing + text_content: Set(None), total_size: Set(file_size), entry_count: Set(1), first_seen_at: Set(chrono::Utc::now()), @@ -822,13 +870,13 @@ impl EntryProcessor { ..Default::default() }; - // Try to insert, but 
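+		// e.g. two content jobs hashing identical files concurrently: the loser's
+		// insert fails with an error containing "UNIQUE constraint failed" (SQLite's
+		// wording, matched below), the branch re-queries by content_hash, and the
+		// loser adopts the winner's row by bumping its entry_count instead.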
handle unique constraint violations + // Handle race condition: another job (or device sync) may have created this + // content identity between our check and insert. Catch UNIQUE constraint violations + // and use the existing record instead of failing. let result = match new_content.insert(ctx.library_db()).await { Ok(model) => (model, true), Err(e) => { - // Check if it's a unique constraint violation if e.to_string().contains("UNIQUE constraint failed") { - // Another job created it - find and use the existing one let existing = entities::content_identity::Entity::find() .filter(entities::content_identity::Column::ContentHash.eq(&content_hash)) .one(ctx.library_db()) @@ -836,7 +884,6 @@ impl EntryProcessor { .map_err(|e| JobError::execution(format!("Failed to find existing content identity: {}", e)))? .ok_or_else(|| JobError::execution("Content identity should exist after unique constraint violation".to_string()))?; - // Update entry count let mut existing_active: entities::content_identity::ActiveModel = existing.clone().into(); existing_active.entry_count = Set(existing.entry_count + 1); @@ -866,7 +913,6 @@ impl EntryProcessor { result }; - // Update Entry with content_id (now sync-ready for regular files) let entry = entities::entry::Entity::find_by_id(entry_id) .one(ctx.library_db()) .await @@ -966,7 +1012,10 @@ impl EntryProcessor { moved_count += 1; } Err(e) => { - // Log error but continue with other moves + // Bulk move operations are best-effort: one failure shouldn't roll back + // the entire batch. Parent directory renames succeed even if a child fails + // due to file locks, though the child will have a stale path until the next + // reindex cleans it up. ctx.log(format!( "Failed to move entry {} from {} to {}: {}", entry_id, @@ -988,7 +1037,6 @@ impl EntryProcessor { entry: &super::state::DirEntry, txn: &DatabaseTransaction, ) -> Result<(), JobError> { - // Get the existing entry let db_entry = entities::entry::Entity::find_by_id(entry_id) .one(txn) .await @@ -997,11 +1045,9 @@ impl EntryProcessor { let mut entry_active: entities::entry::ActiveModel = db_entry.into(); - // Update size if it changed if let Ok(metadata) = std::fs::symlink_metadata(&entry.path) { entry_active.size = Set(metadata.len() as i64); - // Update modified time if let Ok(modified) = metadata.modified() { if let Ok(duration) = modified.duration_since(std::time::UNIX_EPOCH) { entry_active.modified_at = Set(chrono::DateTime::from_timestamp( @@ -1013,7 +1059,6 @@ impl EntryProcessor { } } - // Save the updated entry entry_active .update(txn) .await diff --git a/core/src/ops/indexing/job.rs b/core/src/ops/indexing/job.rs index bffab0ef6..a98797927 100644 --- a/core/src/ops/indexing/job.rs +++ b/core/src/ops/indexing/job.rs @@ -1,4 +1,9 @@ -//! Main indexer job implementation +//! Indexer job implementation and ephemeral index storage. +//! +//! This module contains the main `IndexerJob` struct that orchestrates the multi-phase +//! indexing pipeline, as well as the `EphemeralIndex` used for browsing unmanaged paths +//! without database writes. The job supports both persistent indexing (for managed locations) +//! and ephemeral indexing (for external drives, network shares, and temporary browsing). use crate::{ domain::addressing::SdPath, @@ -26,20 +31,30 @@ use super::{ PathResolver, }; -/// Indexing mode determines the depth of indexing +/// How deeply to index files, from metadata-only to full processing. +/// +/// IndexMode controls the trade-off between indexing speed and feature completeness. 
+/// Shallow mode is fast enough for ephemeral browsing, while Deep mode enables +/// duplicate detection, thumbnail generation, and full-text search at the cost of +/// significantly longer indexing time. #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize, Type)] pub enum IndexMode { /// Location exists but is not indexed None, /// Just filesystem metadata (fastest) Shallow, - /// Generate content identities (moderate) + /// Generate content identities via BLAKE3 hashing (enables duplicate detection) Content, /// Full indexing with thumbnails and text extraction (slowest) Deep, } -/// Indexing scope determines how much of the directory tree to process +/// Whether to index just one directory level or recurse through subdirectories. +/// +/// Current scope is used for UI navigation where users expand folders on-demand, +/// while Recursive scope is used for full location indexing. Current scope with +/// persistent storage enables progressive indexing where the UI drives which +/// directories get indexed based on user interaction. #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Type)] pub enum IndexScope { /// Index only the current directory (single level) @@ -73,7 +88,14 @@ impl std::fmt::Display for IndexScope { } } -/// Determines whether indexing results are persisted to database or kept in memory +/// Whether to write indexing results to the database or keep them in memory. +/// +/// Ephemeral persistence allows users to browse external drives and network shares +/// without adding them as managed locations. The in-memory index survives for the +/// session duration and provides the same API surface as persistent entries, enabling +/// features like search and navigation to work identically for both modes. If an +/// ephemeral path is later promoted to a managed location, UUIDs are preserved to +/// maintain continuity for user metadata. #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Type)] pub enum IndexPersistence { /// Write all results to database (normal operation) @@ -88,21 +110,24 @@ impl Default for IndexPersistence { } } -/// Enhanced configuration for indexer jobs +/// Configuration for an indexer job, supporting both persistent and ephemeral indexing. +/// +/// Persistent jobs require a location_id to identify which managed location they're +/// indexing. Ephemeral jobs (browsing unmanaged paths) use location_id = None and +/// store results in memory instead of the database. 
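+///
+/// A construction sketch (ids and paths are illustrative):
+/// ```rust,no_run
+/// // Persistent: full recursive index of a managed location
+/// let full = IndexerJobConfig::new(location_id, path, IndexMode::Content);
+/// // Ephemeral: browse an unmanaged folder one level deep, memory-only
+/// let browse = IndexerJobConfig::ephemeral_browse(usb_path, IndexScope::Current);
+/// ```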
#[derive(Debug, Clone, Serialize, Deserialize, Type)] pub struct IndexerJobConfig { - pub location_id: Option, // None for ephemeral indexing + pub location_id: Option, pub path: SdPath, pub mode: IndexMode, pub scope: IndexScope, pub persistence: IndexPersistence, - pub max_depth: Option, // Override for Current scope or depth limiting + pub max_depth: Option, #[serde(default)] pub rule_toggles: super::rules::RuleToggles, } impl IndexerJobConfig { - /// Create a new configuration for persistent recursive indexing (traditional) pub fn new(location_id: Uuid, path: SdPath, mode: IndexMode) -> Self { Self { location_id: Some(location_id), @@ -115,7 +140,6 @@ impl IndexerJobConfig { } } - /// Create configuration for UI directory navigation (quick current scan) pub fn ui_navigation(location_id: Uuid, path: SdPath) -> Self { Self { location_id: Some(location_id), @@ -128,12 +152,11 @@ impl IndexerJobConfig { } } - /// Create configuration for ephemeral path browsing (outside managed locations) pub fn ephemeral_browse(path: SdPath, scope: IndexScope) -> Self { Self { location_id: None, path, - mode: IndexMode::Shallow, // Ephemeral jobs identify content kind by extension, no hashing needed + mode: IndexMode::Shallow, scope, persistence: IndexPersistence::Ephemeral, max_depth: if scope == IndexScope::Current { @@ -156,38 +179,29 @@ impl IndexerJobConfig { } } -/// In-memory storage for ephemeral indexing results +/// Memory-efficient index for browsing paths outside managed locations. /// -/// This implementation uses efficient data structures for memory optimization: -/// - NodeArena: Contiguous storage for file nodes (~48 bytes per node) -/// - NameCache: String interning for common filenames (shared across all entries) -/// - NameRegistry: Fast name-based lookups +/// Ephemeral indexing lets users navigate unmanaged directories (network shares, +/// external drives) without adding them as permanent locations. Instead of writing +/// to the database, entries live in this memory-only structure until the session +/// ends or the path is promoted to a managed location. /// -/// All browsed paths share a single index, maximizing string deduplication -/// and memory efficiency. Parent-child relationships are established based -/// on path hierarchy. +/// Memory usage is ~50 bytes per entry vs ~200 bytes with a naive `HashMap` +/// approach. The optimization comes from: +/// - **NodeArena:** Contiguous slab allocation with pointer-sized entry IDs +/// - **NameCache:** String interning (one copy of "index.js" for thousands of node_modules files) +/// - **NameRegistry:** Trie-based prefix search without full-text indexing overhead /// -/// Memory usage: ~50 bytes per entry vs ~200 bytes with HashMap +/// Multiple directory trees can coexist in the same index (e.g., browsing both +/// `/mnt/nas` and `/media/usb` simultaneously), sharing the string interning pool +/// for maximum deduplication. 
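+///
+/// A browsing sketch (paths are illustrative):
+/// ```rust,no_run
+/// let mut index = EphemeralIndex::new();
+/// let dir_id = index.ensure_directory(Path::new("/mnt/nas/projects"));
+/// // entries added under this directory share one interned copy of each
+/// // repeated filename, e.g. thousands of "index.js" files
+/// ```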
pub struct EphemeralIndex { - /// Efficient tree storage arena: super::ephemeral::NodeArena, - - /// String interning (shared across all paths) cache: std::sync::Arc, - - /// Fast name lookups registry: super::ephemeral::NameRegistry, - - /// Path → EntryId mapping (for lookups by path) path_index: HashMap, - - /// UUID mapping (for API compatibility) entry_uuids: HashMap, - - /// Content kinds by path (fast extension-based identification) content_kinds: HashMap, - - /// Metadata created_at: std::time::Instant, last_accessed: std::time::Instant, pub stats: IndexerStats, @@ -204,11 +218,6 @@ impl std::fmt::Debug for EphemeralIndex { } impl EphemeralIndex { - /// Create a new empty ephemeral index - /// - /// The index stores entries with their full paths and builds parent-child - /// relationships based on path hierarchy. Multiple directory trees can - /// coexist in the same index, sharing the arena and string interning pool. pub fn new() -> Self { use super::ephemeral::{NameCache, NameRegistry, NodeArena}; @@ -231,21 +240,22 @@ impl EphemeralIndex { } } - /// Ensure a directory exists in the index, creating ancestor chain if needed + /// Ensures a directory exists, creating all missing ancestors recursively. /// - /// Returns the EntryId of the directory. + /// This method guarantees that `list_directory()` works immediately after + /// `add_entry()` without a separate tree-building pass. Parent directories + /// are created from root to leaf, so the full ancestor chain exists before + /// any child is added. pub fn ensure_directory(&mut self, path: &Path) -> super::ephemeral::EntryId { use super::ephemeral::{ FileNode, FileType, MaybeEntryId, NameRef, NodeState, PackedMetadata, }; use super::state::EntryKind; - // Already exists? if let Some(&id) = self.path_index.get(path) { return id; } - // Ensure parent exists first (recursive) let parent_id = if let Some(parent_path) = path.parent() { if parent_path.as_os_str().is_empty() { None @@ -256,7 +266,6 @@ impl EphemeralIndex { None }; - // Create this directory let name = self.cache.intern( path.file_name() .map(|s| s.to_string_lossy()) @@ -279,18 +288,21 @@ impl EphemeralIndex { } } - // Index by path and name self.path_index.insert(path.to_path_buf(), id); self.registry.insert(name, id); - // Generate UUID for directory let uuid = uuid::Uuid::new_v4(); self.entry_uuids.insert(path.to_path_buf(), uuid); id } - /// Add an entry to the index. Returns Some(content_kind) if added, None if duplicate. + /// Adds an entry to the index, returning its content kind if successful. + /// + /// Content kind is identified by file extension (no I/O needed), which is + /// sufficient for ephemeral browsing where speed is critical. Returns None + /// if the entry already exists (prevents duplicate entries when re-indexing + /// a directory). pub fn add_entry( &mut self, path: PathBuf, @@ -303,30 +315,26 @@ impl EphemeralIndex { use crate::domain::ContentKind; use crate::filetype::FileTypeRegistry; - // Check if entry already exists for this path - skip if so to prevent duplicates if self.path_index.contains_key(&path) { tracing::trace!("Skipping duplicate entry: {}", path.display()); return None; } - // Ensure parent directory exists in the index FIRST (requires &mut self) - // This must happen before interning the name to avoid borrow conflicts + // Ensure parent directories exist before adding this entry, building the ancestor + // chain from root to leaf. 
The &mut borrow happens before name interning to avoid + // holding the cache lock while recursing. let parent_id = if let Some(parent_path) = path.parent() { if parent_path.as_os_str().is_empty() { - // Root of filesystem, no parent None } else if let Some(&existing_id) = self.path_index.get(parent_path) { - // Parent already exists Some(existing_id) } else { - // Parent doesn't exist - ensure it (and ancestors) are created Some(self.ensure_directory(parent_path)) } } else { None }; - // Now intern the filename (borrows self.cache immutably) let name = self.cache.intern( path.file_name() .map(|s| s.to_string_lossy()) @@ -334,13 +342,11 @@ impl EphemeralIndex { .unwrap_or("unknown"), ); - // Create metadata let file_type = FileType::from(metadata.kind); let meta = PackedMetadata::new(NodeState::Accessible, file_type, metadata.size) .with_times(metadata.modified, metadata.created); - // Create node let parent_ref = parent_id .map(MaybeEntryId::some) .unwrap_or(MaybeEntryId::NONE); @@ -355,17 +361,15 @@ impl EphemeralIndex { } } - // Detect content kind by extension (fast, no I/O) let content_kind = if metadata.kind == super::state::EntryKind::File { let registry = FileTypeRegistry::default(); registry.identify_by_extension(&path) } else if metadata.kind == super::state::EntryKind::Directory { - ContentKind::Unknown // Directories don't have content kind + ContentKind::Unknown } else { ContentKind::Unknown }; - // Index by path and name self.path_index.insert(path.clone(), id); self.registry.insert(name, id); self.entry_uuids.insert(path.clone(), uuid); @@ -428,7 +432,6 @@ impl EphemeralIndex { self.entry_uuids.get(path).copied() } - /// Get the content kind for an entry (identified by extension) pub fn get_content_kind(&self, path: &PathBuf) -> crate::domain::ContentKind { self.content_kinds .get(path) @@ -436,7 +439,6 @@ impl EphemeralIndex { .unwrap_or(crate::domain::ContentKind::Unknown) } - /// List directory children pub fn list_directory(&self, path: &std::path::Path) -> Option> { let id = self.path_index.get(path)?; let node = self.arena.get(*id)?; @@ -449,13 +451,13 @@ impl EphemeralIndex { ) } - /// Clear all direct children of a directory (for re-indexing) + /// Clears immediate children of a directory to prepare for re-indexing. /// - /// This removes entries for the immediate children of the given directory, - /// preventing ghost entries when files are deleted between index runs. - /// Note: Does not recursively clear subdirectories. + /// This prevents ghost entries when files are deleted between index runs. + /// The arena nodes become orphaned but remain allocated, which is acceptable + /// for ephemeral indexes since memory pressure triggers full eviction anyway. + /// Only clears the direct children (non-recursive). 
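+	///
+	/// Re-index sketch (path is illustrative):
+	/// ```rust,no_run
+	/// let dropped = index.clear_directory_children(Path::new("/media/usb/photos"));
+	/// // re-add the directory's current children; deleted files no longer linger
+	/// ```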
pub fn clear_directory_children(&mut self, dir_path: &Path) -> usize { - // Get the directory's children paths first let children_paths: Vec = if let Some(dir_id) = self.path_index.get(dir_path) { if let Some(dir_node) = self.arena.get(*dir_id) { dir_node @@ -472,7 +474,6 @@ impl EphemeralIndex { let mut cleared = 0; - // Remove each child from indexes (arena nodes are left as orphans - acceptable for ephemeral) for child_path in &children_paths { if self.path_index.remove(child_path).is_some() { cleared += 1; @@ -481,7 +482,6 @@ impl EphemeralIndex { self.content_kinds.remove(child_path); } - // Clear the parent's children list if let Some(dir_id) = self.path_index.get(dir_path) { if let Some(dir_node) = self.arena.get_mut(*dir_id) { dir_node.children.clear(); @@ -491,18 +491,15 @@ impl EphemeralIndex { cleared } - /// Reconstruct full path for a node fn reconstruct_path(&self, id: super::ephemeral::EntryId) -> Option { let mut segments = Vec::new(); let mut current = id; - // Walk up the tree collecting path segments while let Some(node) = self.arena.get(current) { segments.push(node.name().to_owned()); if let Some(parent) = node.parent() { current = parent; } else { - // Reached a root node (no parent) break; } } @@ -511,7 +508,6 @@ impl EphemeralIndex { return None; } - // Build absolute path from segments (root to leaf) let mut path = PathBuf::from("/"); for segment in segments.into_iter().rev() { path.push(segment); @@ -519,7 +515,6 @@ impl EphemeralIndex { Some(path) } - /// Find all entries with the given filename pub fn find_by_name(&self, name: &str) -> Vec { self.registry .get(name) @@ -531,7 +526,6 @@ impl EphemeralIndex { .unwrap_or_default() } - /// Find all entries with names starting with the given prefix pub fn find_by_prefix(&self, prefix: &str) -> Vec { self.registry .find_prefix(prefix) @@ -548,17 +542,14 @@ impl EphemeralIndex { self.last_accessed.elapsed() } - /// Get the total number of entries pub fn len(&self) -> usize { self.arena.len() } - /// Check if the index is empty pub fn is_empty(&self) -> bool { self.arena.is_empty() } - /// Get approximate memory usage in bytes pub fn memory_usage(&self) -> usize { self.arena.memory_usage() + self.cache.memory_usage() @@ -570,7 +561,6 @@ impl EphemeralIndex { * (std::mem::size_of::() + std::mem::size_of::()) } - /// Get statistics about the index pub fn get_stats(&self) -> EphemeralIndexStats { EphemeralIndexStats { total_entries: self.arena.len(), @@ -580,20 +570,19 @@ impl EphemeralIndex { } } - /// Get the number of content kinds stored pub fn content_kinds_count(&self) -> usize { self.content_kinds.len() } - /// Get the number of entries in the path index pub fn path_index_count(&self) -> usize { self.path_index.len() } - /// Get all entries as a HashMap (for backward compatibility) + /// Reconstructs paths for all entries and returns them as a HashMap. /// - /// This method reconstructs paths for all entries. For large indexes, - /// consider using iterators or specific queries instead. + /// For large indexes, this can be expensive since it walks the tree to rebuild + /// every path. Prefer using `list_directory()` or `find_by_name()` for targeted + /// queries when possible. pub fn entries(&self) -> HashMap { use super::state::EntryKind; @@ -639,25 +628,25 @@ pub struct EphemeralIndexStats { pub memory_bytes: usize, } -/// Indexer job - discovers and indexes files in a location +/// Orchestrates multi-phase file indexing for both persistent and ephemeral modes. 
+/// +/// The job executes as a state machine progressing through Discovery, Processing, +/// Aggregation, and ContentIdentification phases. State is automatically serialized +/// between phases, allowing the job to survive app restarts and resume from the last +/// completed phase. Ephemeral jobs (browsing unmanaged paths) skip aggregation and +/// content identification, storing results in memory via `EphemeralIndex`. #[derive(Debug, Serialize, Deserialize, Job)] pub struct IndexerJob { pub config: IndexerJobConfig, - - // Resumable state state: Option, - - // Ephemeral storage for non-persistent jobs #[serde(skip)] ephemeral_index: Option>>, - - // Performance tracking #[serde(skip)] timer: Option, #[serde(skip)] - db_operations: (u64, u64), // (reads, writes) + db_operations: (u64, u64), #[serde(skip)] - batch_info: (u64, usize), // (count, total_size) + batch_info: (u64, usize), } impl Job for IndexerJob { @@ -675,10 +664,7 @@ impl DynJob for IndexerJob { impl JobProgress for IndexerProgress {} impl IndexerJob { - /// Inner implementation of the job phases (separated for cleanup guarantee) async fn run_job_phases(&mut self, ctx: &JobContext<'_>) -> JobResult { - // Initialize or restore state - // Ensure state is always created early to avoid serialization issues if self.state.is_none() { ctx.log(format!( "Starting new indexer job (scope: {}, persistence: {:?})", @@ -704,9 +690,8 @@ impl IndexerJob { let state = self.state.as_mut().unwrap(); - // Get root path ONCE for the entire job // For cloud volumes, we use the path component from the SdPath (e.g., "/" or "folder/") - // since discovery operates through the volume backend (not direct filesystem access) + // since discovery operates through the volume backend (not direct filesystem access). 
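+		// Illustrative mapping: an SdPath addressing "s3://bucket/photos/" yields the
+		// backend-relative root "photos/", while a local SdPath yields its plain
+		// filesystem path unchanged.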
let root_path_buf = if let Some(p) = self.config.path.as_local_path() { p.to_path_buf() } else if let Some(cloud_path) = self.config.path.cloud_path() { @@ -739,7 +724,6 @@ impl IndexerJob { }; let root_path = root_path_buf.as_path(); - // Get volume backend for the entire job let volume_backend: Option> = if let Some(vm) = ctx.volume_manager() { match vm @@ -754,7 +738,6 @@ impl IndexerJob { Some(vm.backend_for_volume(&mut volume)) } Ok(None) => { - // For cloud paths, we MUST have a volume - can't fall back to local if self.config.path.is_cloud() { ctx.log(format!( "Cloud volume not found for path: {}", @@ -766,7 +749,6 @@ impl IndexerJob { ))); } - // For local paths, we can fall back to LocalBackend ctx.log(format!( "No volume found for path: {}, will use LocalBackend fallback", self.config.path @@ -786,12 +768,10 @@ impl IndexerJob { None }; - // Seed discovery queue if it wasn't initialized due to device-id timing if state.dirs_to_walk.is_empty() { state.dirs_to_walk.push_back(root_path.to_path_buf()); } - // Main state machine loop loop { ctx.check_interrupt().await?; @@ -799,7 +779,6 @@ impl IndexerJob { warn!("DEBUG: IndexerJob entering phase: {:?}", current_phase); match current_phase { Phase::Discovery => { - // For cloud volumes, construct the base URL for building absolute paths let cloud_url_base = if let Some((service, identifier, _)) = self.config.path.as_cloud() { Some(format!("{}://{}/", service.scheme(), identifier)) @@ -807,7 +786,6 @@ impl IndexerJob { None }; - // Use scope-aware discovery if self.config.is_current_scope() { Self::run_current_scope_discovery_static(state, &ctx, root_path).await?; } else { @@ -822,11 +800,9 @@ impl IndexerJob { .await?; } - // Track batch info self.batch_info.0 = state.entry_batches.len() as u64; self.batch_info.1 = state.entry_batches.iter().map(|b| b.len()).sum(); - // Start processing timer if let Some(timer) = &mut self.timer { timer.start_processing(); } @@ -859,8 +835,7 @@ impl IndexerJob { ) .await?; - // Update DB operation counts - self.db_operations.1 += state.entry_batches.len() as u64 * 100; // Estimate + self.db_operations.1 += state.entry_batches.len() as u64 * 100; } } @@ -875,14 +850,11 @@ impl IndexerJob { ) .await?; } else { - // Skip aggregation and content phases for ephemeral jobs - // Content kind is already identified by extension during add_entry ctx.log("Skipping aggregation and content phases for ephemeral job (content kind identified by extension)"); state.phase = Phase::Complete; continue; } - // Start content timer if let Some(timer) = &mut self.timer { timer.start_content(); } @@ -891,7 +863,6 @@ impl IndexerJob { Phase::ContentIdentification => { if self.config.mode >= IndexMode::Content { if self.config.is_ephemeral() { - // Skip content phase for ephemeral jobs - content kind already identified ctx.log("Skipping content identification for ephemeral job"); state.phase = Phase::Complete; continue; @@ -915,14 +886,12 @@ impl IndexerJob { Phase::Complete => break, } - // State is automatically saved during job serialization on shutdown warn!( "DEBUG: IndexerJob completed phase: {:?}, next phase will be: {:?}", current_phase, state.phase ); } - // Send final progress update let final_progress = IndexerProgress { phase: IndexPhase::Finalizing { processed: 0, @@ -935,27 +904,23 @@ impl IndexerJob { scope: None, persistence: None, is_ephemeral: false, - action_context: None, // TODO: Pass action context from job state + action_context: None, }; 
ctx.progress(Progress::generic(final_progress.to_generic_progress())); - // Calculate final metrics let metrics = if let Some(timer) = &self.timer { IndexerMetrics::calculate(&state.stats, timer, self.db_operations, self.batch_info) } else { IndexerMetrics::default() }; - // Log summary ctx.log(&metrics.format_summary()); - // If Deep mode, dispatch thumbnail generation job after indexing completes if self.config.mode == IndexMode::Deep && !self.config.is_ephemeral() { use crate::ops::media::thumbnail::{ThumbnailJob, ThumbnailJobConfig}; ctx.log("Deep mode enabled - dispatching thumbnail generation job"); - // Dispatch thumbnail job for all entries in this location let thumbnail_config = ThumbnailJobConfig::default(); let thumbnail_job = ThumbnailJob::new(thumbnail_config); @@ -965,12 +930,10 @@ impl IndexerJob { } Err(e) => { ctx.log(format!("Warning: Failed to dispatch thumbnail job: {}", e)); - // Don't fail the indexing job if thumbnail dispatch fails } } } - // Generate final output (cleanup happens in outer run() method) Ok(IndexerOutput { location_id: self.config.location_id, stats: state.stats, @@ -992,22 +955,20 @@ impl JobHandler for IndexerJob { type Output = IndexerOutput; async fn run(&mut self, ctx: JobContext<'_>) -> JobResult { - // Initialize timer if self.timer.is_none() { self.timer = Some(PhaseTimer::new()); } - // Initialize ephemeral index if needed if self.config.is_ephemeral() && self.ephemeral_index.is_none() { self.ephemeral_index = Some(Arc::new(RwLock::new(EphemeralIndex::new()))); ctx.log("Initialized ephemeral index for non-persistent job"); } - // Run the actual job, ensuring ephemeral cleanup happens on both success and failure let result = self.run_job_phases(&ctx).await; - // ALWAYS mark ephemeral indexing complete, even on failure - // This prevents the indexing flag from being stuck forever + // Mark ephemeral indexing complete even on failure to prevent the indexing + // flag from being stuck forever. Without this, a failed ephemeral job would + // block all future indexing attempts for that path until app restart. 
if self.config.is_ephemeral() { if let Some(local_path) = self.config.path.as_local_path() { ctx.library() @@ -1032,7 +993,6 @@ impl JobHandler for IndexerJob { } async fn on_resume(&mut self, ctx: &JobContext<'_>) -> JobResult { - // State is already loaded from serialization warn!("DEBUG: IndexerJob on_resume called"); if let Some(state) = &self.state { warn!( @@ -1045,18 +1005,16 @@ impl JobHandler for IndexerJob { state.stats.files, state.stats.dirs, state.stats.errors )); - // Reinitialize timer for resumed job self.timer = Some(PhaseTimer::new()); } else { warn!("DEBUG: IndexerJob has no state during resume - creating new state!"); - // If state is missing, create it now (this shouldn't happen in normal operation) self.state = Some(IndexerState::new(&self.config.path)); } Ok(()) } async fn on_pause(&mut self, ctx: &JobContext<'_>) -> JobResult { - ctx.log("Pausing indexer job - state will be preserved"); + ctx.log("Pausing indexer job"); Ok(()) } @@ -1072,13 +1030,11 @@ impl JobHandler for IndexerJob { } fn is_resuming(&self) -> bool { - // If we have existing state, we're resuming self.state.is_some() } } impl IndexerJob { - /// Create a new indexer job with enhanced configuration pub fn new(config: IndexerJobConfig) -> Self { Self { config, @@ -1090,43 +1046,40 @@ impl IndexerJob { } } - /// Create a traditional persistent recursive indexer job pub fn from_location(location_id: Uuid, root_path: SdPath, mode: IndexMode) -> Self { Self::new(IndexerJobConfig::new(location_id, root_path, mode)) } - /// Create a shallow indexer job (metadata only) pub fn shallow(location_id: Uuid, root_path: SdPath) -> Self { Self::from_location(location_id, root_path, IndexMode::Shallow) } - /// Create a content indexer job (with CAS IDs) pub fn with_content(location_id: Uuid, root_path: SdPath) -> Self { Self::from_location(location_id, root_path, IndexMode::Content) } - /// Create a deep indexer job (full processing) pub fn deep(location_id: Uuid, root_path: SdPath) -> Self { Self::from_location(location_id, root_path, IndexMode::Deep) } - /// Create a UI navigation job (current scope, quick scan) pub fn ui_navigation(location_id: Uuid, path: SdPath) -> Self { Self::new(IndexerJobConfig::ui_navigation(location_id, path)) } - /// Set the ephemeral index storage (must be called before dispatching for ephemeral jobs) - /// This allows external code to maintain a reference to the same storage the job uses + /// Sets the ephemeral index storage that the job will use. + /// + /// This must be called before dispatching ephemeral jobs. It allows external code + /// (like the ephemeral cache manager) to maintain a reference to the same storage + /// the job uses, enabling direct access to indexing results without job-to-caller + /// communication overhead. 
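+	///
+	/// Wiring sketch (the `shared` handle name is illustrative):
+	/// ```rust,no_run
+	/// let shared = Arc::new(RwLock::new(EphemeralIndex::new()));
+	/// let mut job = IndexerJob::ephemeral_browse(path, IndexScope::Current);
+	/// job.set_ephemeral_index(shared.clone());
+	/// // after the job runs, read results straight from `shared`
+	/// ```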
pub fn set_ephemeral_index(&mut self, index: Arc>) { self.ephemeral_index = Some(index); } - /// Create an ephemeral browsing job (no database writes) pub fn ephemeral_browse(path: SdPath, scope: IndexScope) -> Self { Self::new(IndexerJobConfig::ephemeral_browse(path, scope)) } - /// Run current scope discovery (single level only) async fn run_current_scope_discovery_static( state: &mut IndexerState, ctx: &JobContext<'_>, @@ -1180,7 +1133,6 @@ impl IndexerJob { } } - // Create single batch and move to processing if !state.pending_entries.is_empty() { let batch = state.create_batch(); state.entry_batches.push(batch); @@ -1195,7 +1147,6 @@ impl IndexerJob { Ok(()) } - /// Run ephemeral processing (store in memory instead of database) async fn run_ephemeral_processing_static( state: &mut IndexerState, ctx: &JobContext<'_>, @@ -1207,26 +1158,20 @@ impl IndexerJob { ctx.log("Starting ephemeral processing"); - // Get event bus from library let event_bus = Some(ctx.library().event_bus().clone()); - // Create ephemeral persistence layer (emits events as entries are stored) let persistence = PersistenceFactory::ephemeral( ephemeral_index.clone(), event_bus, root_path.to_path_buf(), ); - // Process all batches through persistence layer while let Some(batch) = state.entry_batches.pop() { for entry in batch { - // Store entry (this will emit ResourceChanged events) - // Content kind is identified by extension during add_entry, no hashing needed let _entry_id = persistence.store_entry(&entry, None, root_path).await?; } } - // Skip content identification for ephemeral jobs - go directly to complete state.phase = Phase::Complete; ctx.log("Ephemeral processing complete"); diff --git a/core/src/ops/indexing/mod.rs b/core/src/ops/indexing/mod.rs index f8f87face..be465835b 100644 --- a/core/src/ops/indexing/mod.rs +++ b/core/src/ops/indexing/mod.rs @@ -1,12 +1,24 @@ -//! Production-ready indexing system for Spacedrive +//! # Spacedrive's File Indexing System //! -//! This module implements a sophisticated file indexing system with: -//! - Multi-phase processing (discovery, processing, content identification) -//! - Full resumability with checkpoint support -//! - Incremental indexing with change detection -//! - Efficient batch processing -//! - Comprehensive error handling -//! - Performance monitoring and metrics +//! `core::ops::indexing` provides a multi-phase indexing pipeline that turns +//! raw filesystem paths into searchable database entries. The system handles +//! both persistent locations (managed directories) and ephemeral browsing sessions +//! (external drives, network shares), ensuring every file gets a stable UUID for +//! sync and user data attachment. +//! +//! ## Example +//! ```rust,no_run +//! use spacedrive_core::ops::indexing::{IndexerJob, IndexerJobConfig, IndexMode}; +//! use spacedrive_core::domain::addressing::SdPath; +//! use uuid::Uuid; +//! +//! # async fn example(library: &spacedrive_core::library::Library, location_id: Uuid, path: SdPath) -> Result<(), Box> { +//! let config = IndexerJobConfig::new(location_id, path, IndexMode::Content); +//! let job = IndexerJob::new(config); +//! library.jobs().dispatch(job).await?; +//! # Ok(()) +//! # } +//! 
``` pub mod action; pub mod change_detection; @@ -27,7 +39,6 @@ pub mod rules; pub mod state; pub mod verify; -// Re-exports for convenience pub use action::IndexingAction; pub use ctx::{IndexingCtx, ResponderCtx}; pub use entry::{EntryMetadata, EntryProcessor}; @@ -48,8 +59,5 @@ pub use rules::{ pub use state::{IndexPhase, IndexerProgress, IndexerState, IndexerStats}; pub use verify::{IndexVerifyAction, IndexVerifyInput, IndexVerifyOutput, IntegrityReport}; -// Rules system will be integrated here in the future -// pub mod rules; - #[cfg(test)] mod tests; diff --git a/core/src/ops/indexing/persistence.rs b/core/src/ops/indexing/persistence.rs index 30d647bc3..9a6b1ee0f 100644 --- a/core/src/ops/indexing/persistence.rs +++ b/core/src/ops/indexing/persistence.rs @@ -1,7 +1,10 @@ -//! Persistence abstraction layer for indexing operations +//! # Persistence Abstraction for Indexing +//! +//! `core::ops::indexing::persistence` provides a unified interface for storing +//! indexing results either persistently in the database or ephemerally in memory. +//! This abstraction allows the same indexing pipeline to work for both managed +//! locations (database-backed) and ephemeral browsing (memory-only). //! -//! This module provides a unified interface for storing indexing results -//! either persistently in the database or ephemerally in memory. use crate::{ filetype::FileTypeRegistry, @@ -28,10 +31,18 @@ use super::{ PathResolver, }; -/// Abstraction for storing indexing results +/// Unified storage interface for persistent and ephemeral indexing. +/// +/// Implementations handle either database writes (DatabasePersistence) or +/// in-memory storage (EphemeralPersistence). The indexing pipeline calls +/// these methods without knowing which backend is active. #[async_trait::async_trait] pub trait IndexPersistence: Send + Sync { - /// Store an entry and return its ID + /// Stores an entry and returns its ID for linking content identities. + /// + /// For database persistence, this creates an `entry` row and updates the closure table. + /// For ephemeral persistence, this adds the entry to the in-memory index and emits + /// a ResourceChanged event for immediate UI updates. async fn store_entry( &self, entry: &DirEntry, @@ -39,7 +50,11 @@ pub trait IndexPersistence: Send + Sync { location_root_path: &Path, ) -> JobResult; - /// Store content identity and link to entry + /// Links a content identity (hash) to an entry. + /// + /// For database persistence, this creates or finds a `content_identity` row and updates + /// the entry's `content_id` foreign key. For ephemeral persistence, this is a no-op since + /// in-memory indexes don't track content deduplication across sessions. async fn store_content_identity( &self, entry_id: i32, @@ -47,7 +62,12 @@ pub trait IndexPersistence: Send + Sync { cas_id: String, ) -> JobResult<()>; - /// Get existing entries for change detection, scoped to the indexing path + /// Retrieves existing entries under a path for change detection. + /// + /// Returns a map of path -> (entry_id, inode, modified_time, size) for all entries + /// under the indexing path. Change detection compares this snapshot against the + /// current filesystem to identify additions, modifications, and deletions. Ephemeral + /// persistence returns an empty map since it doesn't support incremental indexing. 
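+	///
+	/// Comparison sketch (bindings are illustrative; tuple layout as described above):
+	/// ```rust,no_run
+	/// let existing = persistence.get_existing_entries(&indexing_path).await?;
+	/// if let Some((entry_id, inode, mtime, size)) = existing.get(&path) {
+	///     // compare against fresh metadata to classify unchanged vs. modified
+	/// }
+	/// ```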
async fn get_existing_entries( &self, indexing_path: &Path, @@ -55,18 +75,25 @@ pub trait IndexPersistence: Send + Sync { HashMap, Option, u64)>, >; - /// Update an existing entry async fn update_entry(&self, entry_id: i32, entry: &DirEntry) -> JobResult<()>; - /// Check if this persistence layer supports operations + /// Returns true for database persistence, false for ephemeral. + /// + /// Used by the indexing pipeline to determine whether to perform expensive operations + /// like change detection (database only) or content hashing (database only). fn is_persistent(&self) -> bool; } -/// Database-backed persistence implementation +/// Database-backed persistence with RwLock-protected entry ID cache. +/// +/// This implementation writes all entries to the database and manages a cache of +/// path -> entry_id mappings for fast parent lookups during hierarchy construction. +/// The cache uses RwLock instead of clone-modify-write to prevent race conditions +/// where concurrent cache updates overwrite each other. pub struct DatabasePersistence<'a> { ctx: &'a JobContext<'a>, device_id: i32, - location_root_entry_id: Option, // The root entry ID of the location being indexed + location_root_entry_id: Option, entry_id_cache: Arc>>, } @@ -95,20 +122,8 @@ impl<'a> IndexPersistence for DatabasePersistence<'a> { ) -> JobResult { use super::entry::EntryProcessor; - // CRITICAL FIX: Do NOT clone the cache! - // The previous clone-modify-write pattern caused cache corruption: - // 1. Thread A clones cache, processes entry, writes back - // 2. Thread B clones cache (stale snapshot), processes entry, writes back - // 3. Thread B's write overwrites Thread A's updates -> lost updates - // 4. Worse: concurrent HashMap mutations could cause data corruption - // - // Instead, we manage the cache directly with proper locking. - // We look up the parent, then create the entry, then cache it. - // All cache operations are protected by the RwLock. - - // Find parent entry ID with proper locking + // Cache lookups use RwLock read/write operations instead of clone-modify-write. let parent_id = if let Some(parent_path) = entry.path.parent() { - // Try cache first (read lock) let cached_parent = { let cache = self.entry_id_cache.read().await; cache.get(parent_path).copied() @@ -117,19 +132,16 @@ impl<'a> IndexPersistence for DatabasePersistence<'a> { if let Some(id) = cached_parent { Some(id) } else { - // Not in cache, check database (no lock held during async DB query) let parent_path_str = parent_path.to_string_lossy().to_string(); if let Ok(Some(dir_path_record)) = entities::directory_paths::Entity::find() .filter(entities::directory_paths::Column::Path.eq(&parent_path_str)) .one(self.ctx.library_db()) .await { - // Found in database, cache it (write lock) let mut cache = self.entry_id_cache.write().await; cache.insert(parent_path.to_path_buf(), dir_path_record.entry_id); Some(dir_path_record.entry_id) } else { - // Parent truly not found tracing::warn!( "Parent not found for {}: {}", entry.path.display(), @@ -142,12 +154,6 @@ impl<'a> IndexPersistence for DatabasePersistence<'a> { None }; - // Now create the entry using the old implementation (not EntryProcessor) - // We can't easily use EntryProcessor without IndexerState, and creating - // IndexerState with clone causes the bug we're trying to fix. 
- // TODO: Refactor EntryProcessor to work without full IndexerState - - // For now, inline the entry creation logic with our properly-locked cache use entities::entry_closure; let extension = match entry.kind { @@ -295,7 +301,6 @@ impl<'a> IndexPersistence for DatabasePersistence<'a> { ); } - // Cache the entry ID for potential children (write lock) { let mut cache = self.entry_id_cache.write().await; cache.insert(entry.path.clone(), result.id); @@ -312,10 +317,8 @@ impl<'a> IndexPersistence for DatabasePersistence<'a> { ) -> JobResult<()> { use super::entry::EntryProcessor; - // Use the library ID from the context let library_id = self.ctx.library().id(); - // Delegate to existing implementation with the library_id EntryProcessor::link_to_content_identity(self.ctx, entry_id, path, cas_id, library_id) .await .map(|_| ()) @@ -329,27 +332,22 @@ impl<'a> IndexPersistence for DatabasePersistence<'a> { > { use sea_orm::{ColumnTrait, EntityTrait, QueryFilter}; - // If we don't have a location root entry ID, we can't find existing entries let location_root_entry_id = match self.location_root_entry_id { Some(id) => id, None => return Ok(HashMap::new()), }; - // Query descendants of the indexing path let indexing_path_str = indexing_path.to_string_lossy().to_string(); let indexing_path_entry_id = if let Ok(Some(dir_record)) = directory_paths::Entity::find() .filter(directory_paths::Column::Path.eq(&indexing_path_str)) .one(self.ctx.library_db()) .await { - // Indexing path exists in DB - use its entry ID dir_record.entry_id } else { - // This is safe because if the path doesn't exist, there are no descendants to delete location_root_entry_id }; - // Get all descendants of the indexing path let descendant_ids = entry_closure::Entity::find() .filter(entry_closure::Column::AncestorId.eq(indexing_path_entry_id)) .all(self.ctx.library_db()) @@ -359,11 +357,10 @@ impl<'a> IndexPersistence for DatabasePersistence<'a> { .map(|ec| ec.descendant_id) .collect::>(); - // Add the indexing path entry itself let mut all_entry_ids = vec![indexing_path_entry_id]; all_entry_ids.extend(descendant_ids); - // Fetch all entries (chunked to avoid SQLite variable limit) + // Chunk queries to stay under SQLite's 999 variable limit. let mut existing_entries: Vec = Vec::new(); let chunk_size: usize = 900; for chunk in all_entry_ids.chunks(chunk_size) { @@ -385,12 +382,10 @@ impl<'a> IndexPersistence for DatabasePersistence<'a> { )); for entry in existing_entries { - // Build full path for the entry using PathResolver let full_path = PathResolver::get_full_path(self.ctx.library_db(), entry.id) .await .unwrap_or_else(|_| PathBuf::from(&entry.name)); - // Convert timestamp to SystemTime for comparison let modified_time = entry .modified_at @@ -418,7 +413,6 @@ impl<'a> IndexPersistence for DatabasePersistence<'a> { async fn update_entry(&self, entry_id: i32, entry: &DirEntry) -> JobResult<()> { use super::entry::EntryProcessor; - // Delegate to existing implementation EntryProcessor::update_entry(self.ctx, entry_id, entry).await } @@ -427,7 +421,10 @@ impl<'a> IndexPersistence for DatabasePersistence<'a> { } } -/// In-memory ephemeral persistence implementation +/// In-memory ephemeral persistence for browsing unmanaged paths. +/// +/// Stores entries in an `EphemeralIndex` (memory-only) and emits ResourceChanged +/// events for immediate UI updates. 
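+///
+/// Construction sketch (mirrors the ephemeral processing path; names illustrative):
+/// ```rust,no_run
+/// let persistence = PersistenceFactory::ephemeral(index, event_bus, root.clone());
+/// let id = persistence.store_entry(&entry, None, &root).await?;
+/// ```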
pub struct EphemeralPersistence { index: Arc>, next_entry_id: Arc>, @@ -467,23 +464,18 @@ impl IndexPersistence for EphemeralPersistence { ) -> JobResult { use super::entry::EntryProcessor; - // Extract full metadata - // Note: Ephemeral persistence uses direct filesystem (None backend) let metadata = EntryProcessor::extract_metadata(&entry.path, None) .await .map_err(|e| JobError::execution(format!("Failed to extract metadata: {}", e)))?; - // Generate a stable UUID for this ephemeral entry let entry_id = self.get_next_id().await; let entry_uuid = Uuid::new_v4(); - // Store in ephemeral index with UUID - // add_entry returns Some(content_kind) if added, None if duplicate + // add_entry returns Some(content_kind) if added, None if duplicate path. let content_kind = { let mut index = self.index.write().await; let result = index.add_entry(entry.path.clone(), entry_uuid, metadata.clone()); - // Only update stats if the entry was actually added (not a duplicate) if result.is_some() { match entry.kind { EntryKind::File => index.stats.files += 1, @@ -495,19 +487,16 @@ impl IndexPersistence for EphemeralPersistence { result }; - // Only emit event if entry was actually added let Some(content_kind) = content_kind else { return Ok(entry_id); }; - // Emit ResourceChanged event for UI if let Some(event_bus) = &self.event_bus { use crate::device::get_current_device_slug; use crate::domain::addressing::SdPath; use crate::domain::file::File; use crate::infra::event::{Event, ResourceMetadata}; - // Build SdPath - for ephemeral indexing, we use Physical paths let device_slug = get_current_device_slug(); let sd_path = SdPath::Physical { @@ -515,11 +504,9 @@ impl IndexPersistence for EphemeralPersistence { path: entry.path.clone(), }; - // Build File domain object from ephemeral data let mut file = File::from_ephemeral(entry_uuid, &metadata, sd_path); file.content_kind = content_kind; - // Emit event with path metadata for filtering let parent_path = entry.path.parent().map(|p| SdPath::Physical { device_slug: file.sd_path.device_slug().unwrap_or("local").to_string(), path: p.to_path_buf(), @@ -553,7 +540,6 @@ impl IndexPersistence for EphemeralPersistence { _path: &Path, _cas_id: String, ) -> JobResult<()> { - // Ephemeral indexes do not store content identities Ok(()) } @@ -563,12 +549,10 @@ impl IndexPersistence for EphemeralPersistence { ) -> JobResult< HashMap, Option, u64)>, > { - // Ephemeral persistence doesn't support change detection Ok(HashMap::new()) } async fn update_entry(&self, _entry_id: i32, _entry: &DirEntry) -> JobResult<()> { - // Updates not needed for ephemeral storage Ok(()) } diff --git a/core/src/ops/indexing/phases/content.rs b/core/src/ops/indexing/phases/content.rs index eba743685..06142f974 100644 --- a/core/src/ops/indexing/phases/content.rs +++ b/core/src/ops/indexing/phases/content.rs @@ -1,4 +1,10 @@ -//! Content identification phase - generates CAS IDs and links content +//! # Content Identification and Hashing +//! +//! `core::ops::indexing::phases::content` generates BLAKE3 content hashes for files and +//! links entries to content_identity records for deduplication. Processes files in parallel +//! chunks, supports both local filesystem and cloud backends (S3, Dropbox), and carefully +//! orders sync operations (content identities before entries) to prevent foreign key violations +//! on receiving devices. 
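// --- Illustrative sketch (not part of the patch): expected behavior of the
// `to_backend_path` helper defined below, written as a small test module.
// The two cases mirror the examples given in its doc comment.
#[cfg(test)]
mod to_backend_path_examples {
    use super::to_backend_path;
    use std::path::{Path, PathBuf};

    #[test]
    fn strips_s3_scheme_and_bucket() {
        assert_eq!(
            to_backend_path(Path::new("s3://my-bucket/docs/report.pdf")),
            PathBuf::from("docs/report.pdf")
        );
    }

    #[test]
    fn leaves_local_paths_unchanged() {
        assert_eq!(
            to_backend_path(Path::new("/home/user/file.txt")),
            PathBuf::from("/home/user/file.txt")
        );
    }
}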
use crate::{ domain::content_identity::ContentHashGenerator, @@ -15,21 +21,27 @@ use std::path::Path; use std::sync::Arc; use tracing::warn; -/// Strip cloud URL prefix from DirEntry path to get backend-relative path +/// Strips cloud URL schemes to convert full URIs into backend-relative paths. +/// +/// Backends expect relative keys ("folder/file.txt"), not full URIs ("s3://bucket/folder/file.txt"). +/// For S3 paths like "s3://my-bucket/docs/report.pdf", this returns "docs/report.pdf". +/// Local paths pass through unchanged. fn to_backend_path(path: &Path) -> std::path::PathBuf { let path_str = path.to_string_lossy(); if let Some(after_scheme) = path_str.strip_prefix("s3://") { - // Strip s3://bucket/ prefix to get just the key if let Some(slash_pos) = after_scheme.find('/') { let key = &after_scheme[slash_pos + 1..]; return std::path::PathBuf::from(key); } } - // Return as-is for local paths path.to_path_buf() } -/// Run the content identification phase +/// Generates BLAKE3 content hashes for files and links them to content identities. +/// +/// Processes files in parallel chunks for throughput, uses volume backends for cloud files, +/// syncs content identities before entries (to prevent foreign key violations), and emits +/// ResourceChanged events for UI updates. Empty files are skipped (no content to hash). pub async fn run_content_phase( state: &mut IndexerState, ctx: &JobContext<'_>, @@ -52,7 +64,6 @@ pub async fn run_content_phase( let mut success_count = 0; let mut error_count = 0; - // Process in chunks for better performance and memory usage const CHUNK_SIZE: usize = 100; while !state.entries_for_content.is_empty() { @@ -62,7 +73,6 @@ pub async fn run_content_phase( let chunk: Vec<_> = state.entries_for_content.drain(..chunk_size).collect(); let chunk_len = chunk.len(); - // Report progress BEFORE processing (using current processed count) let indexer_progress = IndexerProgress { phase: IndexPhase::ContentIdentification { current: processed, @@ -75,22 +85,18 @@ pub async fn run_content_phase( scope: None, persistence: None, is_ephemeral: false, - action_context: None, // TODO: Pass action context from job state + action_context: None, }; ctx.progress(Progress::generic(indexer_progress.to_generic_progress())); - // Process chunk in parallel for better performance let content_hash_futures: Vec<_> = chunk .iter() .map(|(entry_id, path)| { let backend_clone = volume_backend.cloned(); async move { let hash_result = if let Some(backend) = backend_clone { - // Use backend for content hashing (supports both local and cloud) - // For cloud paths, strip the URL prefix to get backend-relative path let backend_path = to_backend_path(path); - // Get file size first match backend.metadata(&backend_path).await { Ok(meta) => { ContentHashGenerator::generate_content_hash_with_backend( @@ -105,7 +111,6 @@ pub async fn run_content_phase( )), } } else { - // No backend - use local filesystem path ContentHashGenerator::generate_content_hash(path).await }; (*entry_id, path.clone(), hash_result) @@ -113,16 +118,12 @@ pub async fn run_content_phase( }) .collect(); - // Wait for all content hash generations to complete let hash_results = futures::future::join_all(content_hash_futures).await; - // Collect results for batch syncing let mut content_identities_to_sync = Vec::new(); let mut entries_to_sync = Vec::new(); - // Process results for (entry_id, path, hash_result) in hash_results { - // Check for interruption during result processing ctx.check_interrupt().await?; match hash_result { @@ -143,7 
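// --- Illustrative sketch (not part of the patch): the fan-out/fan-in shape
// of the chunk processing above. One future per file, awaited together with
// futures::future::join_all; results come back in input order. `hash_file`
// is a hypothetical stand-in for ContentHashGenerator.
use std::path::PathBuf;

async fn hash_file(path: PathBuf) -> (PathBuf, Result<String, String>) {
    // Stand-in for BLAKE3 hashing of the file's contents.
    (path, Ok("blake3:deadbeef".to_string()))
}

async fn hash_chunk(chunk: Vec<PathBuf>) -> Vec<(PathBuf, Result<String, String>)> {
    let futures: Vec<_> = chunk.into_iter().map(hash_file).collect();
    // All hashes in the chunk are polled concurrently on the async runtime.
    futures::future::join_all(futures).await
}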
+144,6 @@ pub async fn run_content_phase( content_hash )); - // Collect for batch sync content_identities_to_sync.push(result.content_identity); entries_to_sync.push(result.entry); @@ -188,7 +188,6 @@ pub async fn run_content_phase( } } - // Batch sync content identities (shared resources) if !content_identities_to_sync.is_empty() { match IndexingCtx::library(ctx) { Some(library) => { @@ -221,12 +220,11 @@ pub async fn run_content_phase( } } - // Yield to allow content_identity events to be emitted before entry updates - // This ensures content_identities arrive on receiving devices before entries that reference them - // Prevents FK orphaning where entry UPDATE arrives before content_identity exists + // Yield to let content_identity sync messages propagate before entry updates. + // Without this, receiving devices might process entry.content_id foreign keys before + // the referenced content_identity row exists, causing foreign key constraint violations. tokio::task::yield_now().await; - // Batch sync entries (device-owned, now sync-ready with content_id assigned) if !entries_to_sync.is_empty() { match IndexingCtx::library(ctx) { Some(library) => { @@ -259,15 +257,10 @@ pub async fn run_content_phase( } } - // Update processed count AFTER processing chunk processed += chunk_len; - - // Update rate tracking state.items_since_last_update += chunk_len as u64; - // Emit ResourceChanged events for affected Files if !entries_to_sync.is_empty() { - // Collect entry UUIDs from successfully processed entries let entry_ids_for_events: Vec = entries_to_sync .iter() .filter_map(|entry_model| entry_model.uuid) @@ -288,8 +281,6 @@ pub async fn run_content_phase( } } } - - // State is automatically saved during job serialization on shutdown } ctx.log(format!( diff --git a/core/src/ops/indexing/phases/discovery.rs b/core/src/ops/indexing/phases/discovery.rs index 4db7f7ce3..21f5ede88 100644 --- a/core/src/ops/indexing/phases/discovery.rs +++ b/core/src/ops/indexing/phases/discovery.rs @@ -1,4 +1,9 @@ -//! Discovery phase - walks directories and collects entries +//! # Directory Discovery Phase +//! +//! `core::ops::indexing::phases::discovery` implements parallel directory traversal +//! using a work-stealing pattern inspired by Rayon. Workers pull directories from a +//! shared queue, read their contents, filter entries against indexing rules, and +//! directly enqueue subdirectories for other workers to process. use crate::{ infra::job::generic_progress::ToGenericProgress, @@ -24,7 +29,11 @@ impl crate::ops::indexing::rules::MetadataForIndexerRules for SimpleMetadata { } } -/// Run the discovery phase of indexing with parallel directory walking +/// Runs parallel directory discovery or falls back to sequential for concurrency = 1. +/// +/// Spawns worker tasks that walk the directory tree, apply filtering rules, and collect +/// entries into batches for the processing phase. Falls back to sequential traversal +/// when concurrency is 1 to avoid task spawning overhead for single-threaded scenarios. 
pub async fn run_discovery_phase( state: &mut IndexerState, ctx: &JobContext<'_>, @@ -36,7 +45,6 @@ pub async fn run_discovery_phase( let concurrency = state.discovery_concurrency; if concurrency <= 1 { - // Fall back to sequential discovery for concurrency = 1 return run_discovery_phase_sequential( state, ctx, @@ -69,7 +77,12 @@ pub async fn run_discovery_phase( .await } -/// Parallel discovery implementation using Rayon-style work-stealing +/// Parallel discovery using work-stealing with N worker tasks and atomic coordination. +/// +/// Workers pull directories from a shared queue, read contents, filter against rules, +/// and directly enqueue subdirectories. A monitor task watches `pending_work` (atomic +/// counter) and signals shutdown when it reaches zero, avoiding explicit work completion +/// messages that would require coordinator awareness. async fn run_parallel_discovery( state: &mut IndexerState, ctx: &JobContext<'_>, @@ -80,21 +93,20 @@ async fn run_parallel_discovery( ) -> Result<(), JobError> { let concurrency = state.discovery_concurrency; - // Use unbounded channels to avoid backpressure/deadlock issues let (work_tx, work_rx) = chan::unbounded::(); let (result_tx, result_rx) = chan::unbounded::(); - // Atomic counter tracking work in progress + shutdown signal - // INVARIANT: incremented BEFORE sending to work channel, decremented AFTER processing + // INVARIANT: `pending_work` is incremented BEFORE enqueuing work and decremented AFTER + // completing it. When it reaches zero, all work is done and shutdown can be signaled. + // This avoids coordinator bottlenecks from explicit "work done" messages. let pending_work = Arc::new(AtomicUsize::new(0)); let skipped_count = Arc::new(AtomicU64::new(0)); let shutdown = Arc::new(AtomicBool::new(false)); - // Shared seen_paths across all workers to prevent duplicate processing - // (handles symlink loops and same directory reached via different paths) + // Shared across all workers to prevent duplicate processing when symlinks create cycles + // or multiple paths (e.g., /home/user/docs and /mnt/docs) lead to the same directory. let seen_paths = Arc::new(parking_lot::RwLock::new(std::collections::HashSet::new())); - // Seed initial work while let Some(dir) = state.dirs_to_walk.pop_front() { pending_work.fetch_add(1, Ordering::Release); work_tx @@ -103,7 +115,6 @@ async fn run_parallel_discovery( .map_err(|_| JobError::execution("Work channel closed"))?; } - // Spawn worker tasks let mut workers = Vec::new(); for worker_id in 0..concurrency { let work_rx = work_rx.clone(); @@ -138,7 +149,8 @@ async fn run_parallel_discovery( workers.push(worker); } - // Monitor task: signals shutdown when all work is done + // Monitor polls `pending_work` and signals shutdown when it hits zero, allowing workers + // to exit gracefully without needing explicit "I'm done" messages to a coordinator. 
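// --- Illustrative sketch (not part of the patch): the cross-worker dedup
// performed with the shared `seen_paths` set above. HashSet::insert returns
// false when the value is already present, so a single write-lock insert
// doubles as the membership test and exactly one worker wins per path.
// parking_lot's synchronous RwLock is fine here because the critical section
// is tiny and never held across an await point.
use std::collections::HashSet;
use std::path::{Path, PathBuf};
use std::sync::Arc;

fn first_visit(seen: &Arc<parking_lot::RwLock<HashSet<PathBuf>>>, path: &Path) -> bool {
    seen.write().insert(path.to_path_buf())
}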
let monitor = tokio::spawn({ let shutdown = Arc::clone(&shutdown); let pending_work = Arc::clone(&pending_work); @@ -153,11 +165,9 @@ async fn run_parallel_discovery( } }); - // Drop our copies so channels close when workers are done drop(work_tx); drop(result_tx); - // Collect results let mut total_processed = 0u64; while let Ok(result) = result_rx.recv().await { match result { @@ -200,7 +210,6 @@ async fn run_parallel_discovery( state.items_since_last_update += 1; } DiscoveryResult::QueueDirectories(_) => { - // Workers queue directly, this shouldn't happen unreachable!("Workers should not send QueueDirectories in Rayon-style mode"); } } @@ -208,7 +217,6 @@ async fn run_parallel_discovery( ctx.check_interrupt().await?; } - // Wait for monitor and workers monitor .await .map_err(|e| JobError::execution(format!("Monitor task failed: {}", e)))?; @@ -219,7 +227,6 @@ async fn run_parallel_discovery( .map_err(|e| JobError::execution(format!("Worker task failed: {}", e)))?; } - // Final batch if !state.pending_entries.is_empty() { let final_batch_size = state.pending_entries.len(); ctx.log(format!( @@ -246,7 +253,11 @@ async fn run_parallel_discovery( Ok(()) } -/// Result types sent from workers back to coordinator +/// Messages sent from workers to the coordinator via the result channel. +/// +/// Workers send entries, stats updates, progress notifications, and errors through this +/// enum instead of directly mutating shared state. QueueDirectories is unused in the +/// work-stealing implementation (workers directly enqueue subdirectories). enum DiscoveryResult { Entry(DirEntry), QueueDirectories(Vec), @@ -262,7 +273,12 @@ enum DiscoveryResult { }, } -/// Rayon-style worker: processes directories and directly enqueues new work +/// Worker task that pulls directories, reads contents, filters entries, and enqueues subdirectories. +/// +/// Workers check the shutdown signal, pull work with a timeout to avoid blocking forever, +/// skip already-seen paths (using the shared RwLock), apply filtering rules, and directly +/// enqueue subdirectories for other workers. The atomic `pending_work` counter tracks +/// in-flight work: incremented before enqueue, decremented after processing completes. 
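// --- Illustrative sketch (not part of the patch): the counting protocol in
// miniature. Work is counted BEFORE it is enqueued and uncounted only AFTER
// processing, including any child work enqueued along the way, so the counter
// can reach zero only when nothing is queued or in flight -- exactly the
// moment the monitor above flips `shutdown`.
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;

fn enqueue(pending: &AtomicUsize, tx: &async_channel::Sender<u64>, item: u64) {
    pending.fetch_add(1, Ordering::Release); // count first...
    if tx.try_send(item).is_err() {
        pending.fetch_sub(1, Ordering::Release); // ...and roll back on failure
    }
}

async fn toy_worker(
    pending: Arc<AtomicUsize>,
    rx: async_channel::Receiver<u64>,
    tx: async_channel::Sender<u64>,
) {
    while let Ok(item) = rx.recv().await {
        if item > 0 {
            // Children are counted before this unit is marked done, so the
            // counter never dips to zero while descendants remain.
            enqueue(&pending, &tx, item - 1);
        }
        pending.fetch_sub(1, Ordering::Release);
    }
}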
async fn discovery_worker_rayon( _worker_id: usize, work_rx: chan::Receiver, @@ -278,12 +294,10 @@ async fn discovery_worker_rayon( cloud_url_base: Option, ) { loop { - // Check shutdown signal if shutdown.load(Ordering::Acquire) { break; } - // Try to get work with a timeout to periodically check shutdown let dir_path = match tokio::time::timeout( tokio::time::Duration::from_millis(50), work_rx.recv(), @@ -291,11 +305,10 @@ async fn discovery_worker_rayon( .await { Ok(Ok(path)) => path, - Ok(Err(_)) => break, // Channel closed - Err(_) => continue, // Timeout, check shutdown flag again + Ok(Err(_)) => break, + Err(_) => continue, }; - // Skip if already seen (handles symlink loops across ALL workers) { let mut seen = seen_paths.write(); if !seen.insert(dir_path.clone()) { @@ -304,10 +317,8 @@ async fn discovery_worker_rayon( } } - // Build rules for this directory let dir_ruler = build_default_ruler(rule_toggles, &root_path, &dir_path).await; - // Read directory match read_directory( &dir_path, volume_backend.as_ref(), @@ -319,7 +330,6 @@ async fn discovery_worker_rayon( let mut local_stats = LocalStats::default(); for entry in entries { - // Apply rules let decision = dir_ruler .evaluate_path( &entry.path, @@ -347,10 +357,10 @@ async fn discovery_worker_rayon( match entry.kind { EntryKind::Directory => { local_stats.dirs += 1; - // Rayon-style: increment BEFORE queueing, worker directly enqueues + // Increment BEFORE enqueuing so the monitor never sees pending_work=0 while + // work is in flight. Decrement only happens after processing completes. pending_work.fetch_add(1, Ordering::Release); if work_tx.send(entry.path.clone()).await.is_err() { - // Channel closed, decrement and continue pending_work.fetch_sub(1, Ordering::Release); } let _ = result_tx.send(DiscoveryResult::Entry(entry)).await; @@ -367,7 +377,6 @@ async fn discovery_worker_rayon( } } - // Send stats update let _ = result_tx .send(DiscoveryResult::Stats { files: local_stats.files, @@ -377,7 +386,6 @@ async fn discovery_worker_rayon( }) .await; - // Send progress update let dirs_queued = pending_work.load(Ordering::Acquire); let _ = result_tx .send(DiscoveryResult::Progress { dirs_queued }) @@ -393,7 +401,6 @@ async fn discovery_worker_rayon( } } - // Decrement AFTER processing complete pending_work.fetch_sub(1, Ordering::Release); } } @@ -406,7 +413,11 @@ struct LocalStats { bytes: u64, } -/// Sequential discovery fallback (original implementation) +/// Single-threaded directory traversal fallback for concurrency = 1. +/// +/// Uses a simple queue-based approach without task spawning overhead. Processes +/// directories one at a time, applies filters, and accumulates entries into batches. +/// Useful for debugging or when parallel overhead exceeds benefits (small directory trees). async fn run_discovery_phase_sequential( state: &mut IndexerState, ctx: &JobContext<'_>, @@ -545,31 +556,32 @@ async fn run_discovery_phase_sequential( Ok(()) } -/// Read a directory and extract metadata +/// Reads a directory through a volume backend, falling back to LocalBackend if none provided. /// -/// Uses the provided volume backend if available, otherwise creates a LocalBackend fallback. -/// The backend is typically provided once per indexer job from the root volume lookup. +/// Volume backends abstract local filesystems and cloud storage (S3, Dropbox) behind a +/// unified interface. When indexing managed locations, the backend is provided upfront from +/// volume registration. 
For ephemeral browsing or untracked paths, this creates a temporary +/// LocalBackend on demand. async fn read_directory( path: &Path, volume_backend: Option<&Arc>, cloud_url_base: Option<&str>, ) -> Result, std::io::Error> { - // Use provided backend or create LocalBackend fallback let backend: Arc = match volume_backend { Some(backend) => Arc::clone(backend), - None => { - // Fallback: create temporary LocalBackend - // This happens when no volume is tracked for the indexing path - Arc::new(crate::volume::LocalBackend::new( - path.parent().unwrap_or(path), - )) - } + None => Arc::new(crate::volume::LocalBackend::new( + path.parent().unwrap_or(path), + )), }; read_directory_with_backend(backend.as_ref(), path, cloud_url_base).await } -/// Read a directory using a volume backend (local or cloud) +/// Reads directory contents via a volume backend and converts paths for cloud vs local. +/// +/// For cloud volumes, prepends the cloud URL base (e.g., "s3://bucket/") to build proper +/// hierarchical paths. For local volumes, uses standard PathBuf joins. This ensures cloud +/// entries have full URIs like "s3://bucket/folder/file.txt" instead of relative paths. async fn read_directory_with_backend( backend: &dyn crate::volume::VolumeBackend, path: &Path, @@ -582,13 +594,10 @@ async fn read_directory_with_backend( .await .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?; - // Convert RawDirEntry to DirEntry let entries: Vec = raw_entries .into_iter() .map(|raw| { - // For cloud volumes, prepend the cloud URL base to build proper hierarchical paths let full_path = if let Some(base) = cloud_url_base { - // Cloud: s3://bucket/ + relative_path + filename let relative = path.to_string_lossy(); let joined = if relative.is_empty() { raw.name.clone() @@ -597,7 +606,6 @@ async fn read_directory_with_backend( }; PathBuf::from(format!("{}{}", base, joined)) } else { - // Local: just join normally path.join(&raw.name) }; diff --git a/core/src/ops/indexing/phases/processing.rs b/core/src/ops/indexing/phases/processing.rs index 68de9d604..3bc26f067 100644 --- a/core/src/ops/indexing/phases/processing.rs +++ b/core/src/ops/indexing/phases/processing.rs @@ -1,4 +1,10 @@ -//! Processing phase - creates/updates database entries +//! # Entry Processing and Change Detection +//! +//! `core::ops::indexing::phases::processing` converts discovered filesystem entries into +//! database records, applying change detection to identify new, modified, moved, and deleted +//! entries. Processes entries in depth-first order (parents before children) within database +//! transactions, preserving ephemeral UUIDs from prior browsing sessions and validating that +//! indexing paths stay within location boundaries to prevent cross-location data corruption. use crate::{ infra::{ @@ -18,16 +24,24 @@ use std::{path::Path, sync::Arc}; use tracing::warn; use uuid::Uuid; -/// Check if an error is a unique constraint violation +/// Detects SQLite unique constraint violations from concurrent watcher and indexer writes. +/// +/// When the file watcher creates an entry while the indexer is processing the same file, +/// both try to insert with the same (path, parent_id) combination. This is benign - the entry +/// exists, which is the desired outcome. We detect and skip these instead of failing the job. 
fn is_unique_constraint_violation(error: &JobError) -> bool { - // Check if the error contains SQLite unique constraint violation messages let error_msg = error.to_string().to_lowercase(); error_msg.contains("unique constraint") || error_msg.contains("unique index") || error_msg.contains("constraint failed") } -/// Run the processing phase of indexing +/// Processes discovered entries into database records with change detection and UUID preservation. +/// +/// Sorts all entries by depth (parents before children) to ensure hierarchy integrity, applies +/// change detection to identify new/modified/moved/deleted entries, processes changes within +/// batch transactions, preserves ephemeral UUIDs from browsing sessions, validates indexing +/// boundaries to prevent cross-location corruption, and emits sync/event batches for UI updates. pub async fn run_processing_phase( location_id: Uuid, state: &mut IndexerState, @@ -42,9 +56,9 @@ pub async fn run_processing_phase( total_batches )); - // Populate ephemeral UUIDs for preservation before processing - // This allows entries that were browsed before enabling indexing to keep - // the same UUID, preserving any user data associated with them + // Populate ephemeral UUIDs so entries browsed before enabling indexing keep the same UUID, + // preserving tags and notes attached during ephemeral mode. Without this, promoting a browsed + // folder to a managed location would orphan all existing user metadata. let ephemeral_cache = ctx.library().core_context().ephemeral_cache(); let preserved_count = state .populate_ephemeral_uuids(ephemeral_cache, location_root_path) @@ -62,7 +76,6 @@ pub async fn run_processing_phase( return Ok(()); } - // Get the actual location record from database let location_record = entities::location::Entity::find() .filter(entities::location::Column::Uuid.eq(location_id)) .one(ctx.library_db()) @@ -80,8 +93,10 @@ pub async fn run_processing_phase( device_id, location_id_i32, location_entry_id )); - // CRITICAL SAFETY CHECK: Validate that the indexing path is within this location's boundaries - // This prevents catastrophic cross-location deletion if the watcher routes events incorrectly + // SAFETY: Validate indexing path is within location boundaries to prevent catastrophic + // cross-location deletion if watcher routing bugs send events for /home/user/photos to a + // /home/user/documents location. Without this check, we'd delete all documents entries + // not present in photos, wiping the database. 
let location_actual_path = crate::ops::indexing::path_resolver::PathResolver::get_full_path( ctx.library_db(), location_entry_id, @@ -89,18 +104,14 @@ pub async fn run_processing_phase( .await .map_err(|e| JobError::execution(format!("Failed to resolve location root path: {}", e)))?; - // For cloud paths, compare strings instead of PathBuf (cloud paths have empty path component for root) let location_actual_str = location_actual_path.to_string_lossy(); let is_cloud_path = location_actual_str.contains("://") && !location_actual_str.starts_with("local://"); let is_within_boundaries = if is_cloud_path { - // For cloud paths, check if the root path matches or is a subpath let root_str = location_root_path.to_string_lossy(); - // Empty path means root of cloud location, which is always valid root_str.is_empty() || location_actual_str.starts_with(root_str.as_ref()) } else { - // For local paths, use standard PathBuf comparison location_root_path.starts_with(&location_actual_path) }; @@ -119,8 +130,9 @@ pub async fn run_processing_phase( location_actual_path.display() )); - // Seed cache with ancestor directories from location root to indexing path - // This prevents the ghost folder bug where subpath reindexing creates wrong parent_ids + // Seed entry ID cache with all ancestors between location root and indexing path. + // Without this, re-indexing /home/user/docs/photos would fail to find /home/user/docs + // in the cache and create a duplicate "docs" folder with wrong parent_id. let _ = state .seed_ancestor_cache( ctx.library_db(), @@ -130,8 +142,6 @@ pub async fn run_processing_phase( ) .await; - // Load existing entries for change detection scoped to the indexing path - // Note: location_root_path is the actual path being indexed (could be a subpath of the location) let mut change_detector = ChangeDetector::new(); if !state.existing_entries.is_empty() || mode != IndexMode::Shallow { ctx.log("Loading existing entries for change detection..."); @@ -144,22 +154,21 @@ pub async fn run_processing_phase( )); } - // Flatten all batches and sort globally by depth to ensure parents are always processed before children + // Sort all discovered entries by depth (parents before children) to ensure parent entries + // exist in the database before we try to create children with parent_id foreign keys. + // Without this, creating /a/b/c.txt before /a would fail the parent_id constraint. ctx.log("Flattening and sorting all entries by depth..."); let mut all_entries: Vec = Vec::new(); while let Some(batch) = state.entry_batches.pop() { all_entries.extend(batch); } - // Sort all entries by depth first, then by type all_entries.sort_by(|a, b| { let a_depth = a.path.components().count(); let b_depth = b.path.components().count(); - // First sort by depth (parents before children) match a_depth.cmp(&b_depth) { std::cmp::Ordering::Equal => { - // Then sort by type (directories before files at same depth) let a_priority = match a.kind { EntryKind::Directory => 0, EntryKind::Symlink => 1, @@ -181,8 +190,7 @@ pub async fn run_processing_phase( all_entries.len() )); - // Re-batch the sorted entries for processing - let batch_size = 1000; // Use a reasonable batch size + let batch_size = 1000; let mut sorted_batches: Vec> = Vec::new(); let mut current_batch = Vec::with_capacity(batch_size); @@ -199,7 +207,6 @@ pub async fn run_processing_phase( sorted_batches.push(current_batch); } - // Use pop() below to consume batches. Reverse so that the first (shallowest) batch is processed first. 
state.entry_batches = sorted_batches; state.entry_batches.reverse(); let total_batches = state.entry_batches.len(); @@ -226,29 +233,22 @@ pub async fn run_processing_phase( scope: None, persistence: None, is_ephemeral: false, - action_context: None, // TODO: Pass action context from job state + action_context: None, }; ctx.progress(Progress::generic(indexer_progress.to_generic_progress())); - // Check for interruption before starting transaction ctx.check_interrupt().await?; - // Begin a single transaction for all new entry creations in this batch let txn = ctx.library_db().begin().await.map_err(|e| { JobError::execution(format!("Failed to begin processing transaction: {}", e)) })?; - // Accumulate related rows for bulk insert let mut bulk_self_closures: Vec = Vec::new(); let mut bulk_dir_paths: Vec = Vec::new(); let mut created_entries: Vec = Vec::new(); - // Process batch - check for changes and create/update entries - // (Already sorted globally by depth) for entry in batch { - // Check for interruption during batch processing if let Err(e) = ctx.check_interrupt().await { - // Rollback transaction before propagating interruption if let Err(rollback_err) = txn.rollback().await { warn!( "Failed to rollback transaction during interruption: {}", @@ -258,19 +258,14 @@ pub async fn run_processing_phase( return Err(e); } - // Add to seen_paths for delete detection (important for resumed jobs) state.seen_paths.insert(entry.path.clone()); - // Check for changes - // Note: For cloud backends, we skip change detection for now since we can't - // access std::fs::Metadata directly. Cloud entries are always treated as "new" - // on first index. Future: implement cloud-specific change detection using - // backend metadata. + // Cloud backends can't use std::fs::Metadata for change detection since files don't + // exist locally. We treat cloud entries as always "new" for now. Future enhancement: + // use backend-provided ETag or modified_at for cloud change detection. let change = if volume_backend.is_some() && !volume_backend.unwrap().is_local() { - // Cloud backend - treat as new for now Some(Change::New(entry.path.clone())) } else { - // Local backend - use standard change detection let metadata = match std::fs::symlink_metadata(&entry.path) { Ok(m) => m, Err(e) => { @@ -287,7 +282,6 @@ pub async fn run_processing_phase( match change { Some(Change::New(_)) => { - // Create new entry within batch transaction match EntryProcessor::create_entry_in_conn( state, ctx, @@ -309,25 +303,18 @@ pub async fn run_processing_phase( )); total_processed += 1; - // Track for content identification if needed if mode >= IndexMode::Content && entry.kind == EntryKind::File { state.entries_for_content.push((entry_id, entry.path)); } - // Collect for batch sync after transaction commits created_entries.push(entry_model); - // end Some(Change::New) } Err(e) => { - // Check if this is a unique constraint violation - // This can happen when the watcher creates an entry while the indexer is running if is_unique_constraint_violation(&e) { ctx.log(format!( "Entry already exists (created by watcher): {}", entry.path.display() )); - // This is not an error - the entry exists, which is what we want - // Just skip it and continue } else { let error_msg = format!( "Failed to create entry for {}: {}", @@ -345,7 +332,6 @@ pub async fn run_processing_phase( } Some(Change::Modified { entry_id, .. 
}) => { - // Update existing entry within batch transaction match EntryProcessor::update_entry_in_conn(ctx, entry_id, &entry, &txn).await { Ok(()) => { ctx.log(format!( @@ -355,7 +341,6 @@ pub async fn run_processing_phase( )); total_processed += 1; - // Re-process content if needed if mode >= IndexMode::Content && entry.kind == EntryKind::File { state.entries_for_content.push((entry_id, entry.path)); } @@ -377,7 +362,6 @@ pub async fn run_processing_phase( entry_id, .. }) => { - // Handle move - update path in database ctx.log(format!( "Detected move: {} -> {}", old_path.display(), @@ -397,7 +381,6 @@ pub async fn run_processing_phase( )); total_processed += 1; - // Re-process content if needed for moved files if mode >= IndexMode::Content && entry.kind == EntryKind::File { state.entries_for_content.push((entry_id, new_path)); } diff --git a/core/src/ops/indexing/state.rs b/core/src/ops/indexing/state.rs index b1ac7c58a..236d583b1 100644 --- a/core/src/ops/indexing/state.rs +++ b/core/src/ops/indexing/state.rs @@ -1,4 +1,9 @@ -//! Indexer state management and progress tracking +//! State management and progress tracking for indexer jobs. +//! +//! This module defines the resumable state machine that tracks indexing progress +//! across all phases. The state is automatically serialized during job shutdowns, +//! allowing indexing to resume from the last completed phase rather than starting +//! over from scratch. use crate::domain::addressing::SdPath; @@ -11,7 +16,7 @@ use std::{ }; use uuid::Uuid; -/// Indexer progress information +/// Progress information sent to UI during indexing operations. #[derive(Debug, Clone, Serialize, Deserialize)] pub struct IndexerProgress { pub phase: IndexPhase, @@ -24,13 +29,11 @@ pub struct IndexerProgress { #[serde(skip_serializing_if = "Option::is_none")] pub persistence: Option, pub is_ephemeral: bool, - - /// Action context that spawned this job (if available) #[serde(skip_serializing_if = "Option::is_none")] pub action_context: Option, } -/// Statistics collected during indexing +/// Cumulative statistics tracked throughout the indexing process. #[derive(Debug, Clone, Copy, Default, Serialize, Deserialize, Type)] pub struct IndexerStats { pub files: u64, @@ -41,7 +44,7 @@ pub struct IndexerStats { pub errors: u64, } -/// Current phase of the indexing operation +/// Public-facing phase information exposed to the UI. #[derive(Debug, Clone, Serialize, Deserialize)] pub enum IndexPhase { Discovery { dirs_queued: usize }, @@ -50,7 +53,11 @@ pub enum IndexPhase { Finalizing { processed: usize, total: usize }, } -/// Internal phases for state machine +/// Internal phase enum used by the indexer state machine. +/// +/// The state machine progresses linearly through these phases. Each phase +/// completes atomically before transitioning to the next, ensuring the job +/// can resume from a clean checkpoint if interrupted. #[derive(Debug, Clone, Serialize, Deserialize)] pub(crate) enum Phase { Discovery, @@ -60,14 +67,17 @@ pub(crate) enum Phase { Complete, } -/// Directory entry found during discovery +/// Filesystem entry discovered during the discovery phase. +/// +/// These are lightweight representations of files and directories found on disk. +/// They're collected in batches before being processed into full database entries, +/// allowing discovery to run ahead of persistence without blocking. 
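// --- Illustrative sketch (not part of the patch): how a Serialize/Deserialize
// state makes jobs resumable. `MiniState` is a toy stand-in for IndexerState;
// runtime-only fields are skipped and re-stamped on load, exactly like
// IndexerState::started_at below. serde_json is illustrative only -- the job
// system chooses the real encoding.
use serde::{Deserialize, Serialize};
use std::collections::VecDeque;
use std::path::PathBuf;
use std::time::Instant;

#[derive(Serialize, Deserialize)]
struct MiniState {
    dirs_to_walk: VecDeque<PathBuf>,
    #[serde(skip, default = "Instant::now")]
    started_at: Instant,
}

fn snapshot(state: &MiniState) -> serde_json::Result<Vec<u8>> {
    serde_json::to_vec(state) // taken by the job system at shutdown
}

fn resume(bytes: &[u8]) -> serde_json::Result<MiniState> {
    serde_json::from_slice(bytes) // started_at re-stamped via its default
}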
#[derive(Debug, Clone, Serialize, Deserialize)] pub struct DirEntry { pub path: PathBuf, pub kind: EntryKind, pub size: u64, pub modified: Option, - #[serde(skip_serializing_if = "Option::is_none")] pub inode: Option, } @@ -78,7 +88,11 @@ pub enum EntryKind { Symlink, } -/// Errors that occur during indexing +/// Errors encountered during indexing that don't halt the entire job. +/// +/// These errors are logged and accumulated but don't cause job failure. This allows +/// indexing to continue even when individual files are inaccessible due to permissions, +/// file locks, or I/O errors. #[derive(Debug, Clone, Serialize, Deserialize)] pub enum IndexError { ReadDir { path: String, error: String }, @@ -87,49 +101,36 @@ pub enum IndexError { FilterCheck { path: String, error: String }, } -/// Resumable indexer state +/// Complete state for a resumable indexer job. +/// +/// This struct holds all data needed to resume indexing from any phase. The state +/// is automatically serialized when the job system shuts down, allowing long-running +/// indexing operations to survive app restarts without losing progress. #[derive(Debug, Serialize, Deserialize)] pub struct IndexerState { pub(crate) phase: Phase, #[serde(skip, default = "Instant::now")] pub(crate) started_at: Instant, - - // Discovery phase pub(crate) dirs_to_walk: VecDeque, pub(crate) pending_entries: Vec, pub(crate) seen_paths: HashSet, - - // Processing phase pub(crate) entry_batches: Vec>, - - // Content phase - pub(crate) entries_for_content: Vec<(i32, PathBuf)>, // (entry_id, path) - - // Database operations - pub(crate) entry_id_cache: HashMap, // path -> entry_id for parent lookups - - // Ephemeral UUID preservation - // UUIDs from ephemeral indexing that should be reused when creating persistent entries - // This ensures files browsed before enabling indexing keep the same UUID + pub(crate) entries_for_content: Vec<(i32, PathBuf)>, + pub(crate) entry_id_cache: HashMap, + // UUIDs from ephemeral indexing preserved when creating persistent entries. + // This ensures files browsed before enabling indexing keep the same UUID, + // preventing orphaned tags and flashing Quick Look previews when a browsed + // folder is later added as a managed location. #[serde(skip, default)] pub(crate) ephemeral_uuids: HashMap, - - // Change detection pub(crate) existing_entries: - HashMap, Option)>, // path -> (id, inode, modified) - - // Statistics + HashMap, Option)>, pub(crate) stats: IndexerStats, pub(crate) errors: Vec, - - // Performance tracking #[serde(skip, default = "Instant::now")] pub(crate) last_progress_time: Instant, pub(crate) items_since_last_update: u64, - - // Configuration pub(crate) batch_size: usize, - // Discovery config (Phase 2) pub(crate) discovery_concurrency: usize, pub(crate) dirs_channel_capacity: usize, pub(crate) entries_channel_capacity: usize, @@ -142,7 +143,6 @@ impl IndexerState { dirs_to_walk.push_back(path.to_path_buf()); } - // Use half of available CPU cores for parallel discovery (Rayon-style) let discovery_concurrency = std::thread::available_parallelism() .map(|n| usize::max(n.get() / 2, 1)) .unwrap_or(4); @@ -169,23 +169,22 @@ impl IndexerState { } } - /// Populate ephemeral UUIDs from the ephemeral cache for UUID preservation + /// Extracts UUIDs from the ephemeral cache for reuse during persistent indexing. /// - /// When a directory is browsed before being added as a managed location, - /// ephemeral indexing assigns UUIDs to each entry. 
This method extracts - /// those UUIDs so they can be reused when creating persistent database entries, - /// ensuring continuity for any user data (tags, notes, etc.) associated with - /// the ephemeral UUIDs. + /// When a directory is browsed before being added as a managed location, ephemeral + /// indexing assigns UUIDs to each entry. This method preserves those UUIDs so that + /// user metadata (tags, notes) attached during browsing remains valid after the + /// directory is promoted to a managed location. Without preservation, adding a + /// browsed folder as a location would orphan all existing tags and cause Quick Look + /// previews to flash as UUIDs change. pub async fn populate_ephemeral_uuids( &mut self, ephemeral_cache: &super::ephemeral::EphemeralIndexCache, root_path: &std::path::Path, ) -> usize { - // Try to get an ephemeral index that covers this path if let Some(index) = ephemeral_cache.get_for_path(root_path) { let index_read = index.read().await; - // Get all paths from the entries and look up their UUIDs let entries = index_read.entries(); for path in entries.keys() { if let Some(entry_uuid) = index_read.get_entry_uuid(path) { @@ -206,10 +205,6 @@ impl IndexerState { } } - /// Get an ephemeral UUID for a path if one exists - /// - /// Returns the UUID that was assigned during ephemeral indexing, - /// allowing it to be reused for the persistent database entry. pub fn get_ephemeral_uuid(&self, path: &std::path::Path) -> Option { self.ephemeral_uuids.get(path).copied() } @@ -227,7 +222,6 @@ impl IndexerState { } pub fn estimate_remaining(&self) -> Option { - // TODO: Implement based on current rate and remaining work None } @@ -244,8 +238,12 @@ impl IndexerState { std::mem::take(&mut self.pending_entries) } - /// Seed the entry ID cache with all ancestor directories from location root to target path - /// This prevents the ghost folder bug where subpath reindexing creates entries with wrong parent_id + /// Seeds the entry ID cache with all ancestor directories from location root to target path. + /// + /// This prevents the ghost folder bug where subpath reindexing creates entries with the + /// wrong parent_id. When indexing a subdirectory, parent lookups must find the existing + /// ancestor entries rather than creating duplicates. Seeding ensures the cache is warm + /// before processing begins. 
pub async fn seed_ancestor_cache<'a>( &mut self, db: &sea_orm::DatabaseConnection, @@ -256,18 +254,15 @@ impl IndexerState { use crate::infra::db::entities::directory_paths; use sea_orm::{ColumnTrait, EntityTrait, QueryFilter}; - // Seed location root self.entry_id_cache .insert(location_root_path.to_path_buf(), location_entry_id); - // Seed all intermediate ancestors between location root and target path if let Ok(relative_path) = target_path.strip_prefix(location_root_path) { let mut current_path = location_root_path.to_path_buf(); for component in relative_path.components() { current_path.push(component); - // Look up this ancestor in directory_paths table if let Ok(Some(dir_record)) = directory_paths::Entity::find() .filter( directory_paths::Column::Path diff --git a/packages/interface/src/components/JobManager/hooks/useJobCount.ts b/packages/interface/src/components/JobManager/hooks/useJobCount.ts index a08bbf9ee..a88ebe1cf 100644 --- a/packages/interface/src/components/JobManager/hooks/useJobCount.ts +++ b/packages/interface/src/components/JobManager/hooks/useJobCount.ts @@ -1,4 +1,4 @@ -import { useEffect } from "react"; +import { useEffect, useRef } from "react"; import { useLibraryQuery, useSpacedriveClient } from "../../../context"; /** @@ -7,52 +7,60 @@ import { useLibraryQuery, useSpacedriveClient } from "../../../context"; * Events trigger a refetch rather than incrementing/decrementing counts manually. */ export function useJobCount() { - const client = useSpacedriveClient(); + const client = useSpacedriveClient(); - const { data, refetch } = useLibraryQuery({ - type: "jobs.list", - input: { status: null }, - }); + const { data, refetch } = useLibraryQuery({ + type: "jobs.list", + input: { status: null }, + }); - // Subscribe to job state changes and refetch when they occur - useEffect(() => { - if (!client) return; + // Ref for stable refetch access (prevents effect re-runs when refetch reference changes) + const refetchRef = useRef(refetch); + useEffect(() => { + refetchRef.current = refetch; + }, [refetch]); - let unsubscribe: (() => void) | undefined; - let isCancelled = false; + // Subscribe to job state changes and refetch when they occur + useEffect(() => { + if (!client) return; - const filter = { - event_types: [ - "JobQueued", - "JobStarted", - "JobCompleted", - "JobFailed", - "JobCancelled", - "JobPaused", - "JobResumed", - ], - }; + let unsubscribe: (() => void) | undefined; + let isCancelled = false; - client.subscribeFiltered(filter, () => refetch()).then((unsub) => { - if (isCancelled) { - unsub(); - } else { - unsubscribe = unsub; - } - }); + const filter = { + event_types: [ + "JobQueued", + "JobStarted", + "JobCompleted", + "JobFailed", + "JobCancelled", + "JobPaused", + "JobResumed", + ], + }; - return () => { - isCancelled = true; - unsubscribe?.(); - }; - }, [client, refetch]); + client + .subscribeFiltered(filter, () => refetchRef.current()) + .then((unsub) => { + if (isCancelled) { + unsub(); + } else { + unsubscribe = unsub; + } + }); - const jobs = data?.jobs ?? []; - const runningCount = jobs.filter(j => j.status === "running").length; - const pausedCount = jobs.filter(j => j.status === "paused").length; + return () => { + isCancelled = true; + unsubscribe?.(); + }; + }, [client]); - return { - activeJobCount: runningCount + pausedCount, - hasRunningJobs: runningCount > 0, - }; + const jobs = data?.jobs ?? 
[]; + const runningCount = jobs.filter((j) => j.status === "running").length; + const pausedCount = jobs.filter((j) => j.status === "paused").length; + + return { + activeJobCount: runningCount + pausedCount, + hasRunningJobs: runningCount > 0, + }; } From 8c24a987568303e7e1891095fcdd08ee0688519e Mon Sep 17 00:00:00 2001 From: Jamie Pine Date: Sun, 7 Dec 2025 22:36:19 -0800 Subject: [PATCH 09/20] more comments --- core/src/ops/indexing/action.rs | 17 +++--- core/src/ops/indexing/hierarchy.rs | 59 +++++++++++-------- core/src/ops/indexing/input.rs | 11 +++- core/src/ops/indexing/metrics.rs | 14 +++-- core/src/ops/indexing/path_resolver.rs | 36 +++++------- core/src/ops/indexing/phases/aggregation.rs | 63 +++++++++++---------- core/src/ops/indexing/phases/mod.rs | 17 ++++-- core/src/ops/indexing/processor.rs | 27 +++++---- core/src/ops/indexing/progress.rs | 27 ++++----- core/src/ops/indexing/responder.rs | 35 ++++++------ 10 files changed, 163 insertions(+), 143 deletions(-) diff --git a/core/src/ops/indexing/action.rs b/core/src/ops/indexing/action.rs index be8dd85d8..428e324c5 100644 --- a/core/src/ops/indexing/action.rs +++ b/core/src/ops/indexing/action.rs @@ -1,4 +1,9 @@ -//! Indexing action handler +//! # Indexing Action Handler +//! +//! Bridges user-facing indexing requests (from CLI, API, UI) to the internal IndexerJob system. +//! Actions validate inputs, convert paths to SdPaths, dispatch jobs to the library's job queue, +//! and track execution context for observability. Each action can spawn multiple jobs (one per +//! path), but returns only the last handle for API simplicity. use super::job::{IndexMode, IndexPersistence, IndexScope, IndexerJob, IndexerJobConfig}; use super::IndexInput; @@ -64,7 +69,6 @@ impl LibraryAction for IndexingAction { _library: &std::sync::Arc, _context: std::sync::Arc, ) -> Result { - // Validate input if let Err(errors) = self.input.validate() { return Err(ActionError::Validation { field: "paths".to_string(), @@ -79,10 +83,6 @@ impl LibraryAction for IndexingAction { library: std::sync::Arc, context: Arc, ) -> Result { - // Validation is now handled by ActionManager before execute - - // For now, submit one job per path (sequentially). Could be parallelized later. - // Return the handle of the last job submitted for convenience. let mut last_handle: Option = None; for path in &self.input.paths { @@ -93,16 +93,13 @@ impl LibraryAction for IndexingAction { IndexerJobConfig::ephemeral_browse(sd_path, self.input.scope) } IndexPersistence::Persistent => { - // Persistent indexing expects a location context. For now, default to recursive path walk with selected mode. - // If we later bind paths to a location, we can set location_id properly. - // Here use ui_navigation/new with mode overridden below when possible. + // Persistent mode stores entries in the database but doesn't require a location binding yet. let mut c = IndexerJobConfig::ephemeral_browse(sd_path, self.input.scope); c.persistence = IndexPersistence::Persistent; c } }; - // Apply selected mode config.mode = self.input.mode; // TODO: Apply include_hidden via rule_toggles when available diff --git a/core/src/ops/indexing/hierarchy.rs b/core/src/ops/indexing/hierarchy.rs index 67a5eb265..61b9e4edb 100644 --- a/core/src/ops/indexing/hierarchy.rs +++ b/core/src/ops/indexing/hierarchy.rs @@ -1,4 +1,10 @@ -//! Hierarchical query helpers using closure table +//! # Closure Table Query Helpers +//! +//! Provides O(1) tree traversal operations using a precomputed closure table. +//! 
The closure table stores all ancestor-descendant relationships with their depths, +//! eliminating recursive queries for common operations like "get all children" or +//! "build full path". Each insert updates the closure table to maintain transitive +//! relationships, trading write complexity for instant read performance. use crate::infra::db::entities::{entry, entry_closure}; use sea_orm::{ @@ -7,11 +13,11 @@ use sea_orm::{ }; use std::path::PathBuf; -/// Hierarchical query helpers for efficient tree operations +/// Namespace for closure table queries that avoid recursive database operations. pub struct HierarchyQuery; impl HierarchyQuery { - /// Get direct children of an entry + /// Returns direct children only (depth 1), sorted by name. pub async fn get_children( db: &DbConn, parent_id: i32, @@ -23,12 +29,14 @@ impl HierarchyQuery { .await } - /// Get all descendants of an entry (recursive) + /// Returns all descendants at any depth using the closure table (not recursive). + /// + /// Excludes the entry itself (depth > 0). Results are ordered by depth (shallowest first). + /// Chunks queries to respect SQLite's parameter limit. pub async fn get_descendants( db: &DbConn, ancestor_id: i32, ) -> Result, sea_orm::DbErr> { - // First get all descendant IDs from closure table let descendant_ids = entry_closure::Entity::find() .filter(entry_closure::Column::AncestorId.eq(ancestor_id)) .filter(entry_closure::Column::Depth.gt(0)) @@ -39,7 +47,6 @@ impl HierarchyQuery { .map(|ec| ec.descendant_id) .collect::>(); - // Then fetch the entries if descendant_ids.is_empty() { return Ok(vec![]); } @@ -59,12 +66,14 @@ impl HierarchyQuery { } } - /// Get all ancestors of an entry (path to root) + /// Returns all ancestors from root to immediate parent, enabling breadcrumb construction. + /// + /// Excludes the entry itself (depth > 0). Results are ordered deepest-first, so reverse + /// iteration builds paths from root downward. pub async fn get_ancestors( db: &DbConn, descendant_id: i32, ) -> Result, sea_orm::DbErr> { - // First get all ancestor IDs from closure table let ancestor_ids = entry_closure::Entity::find() .filter(entry_closure::Column::DescendantId.eq(descendant_id)) .filter(entry_closure::Column::Depth.gt(0)) @@ -75,7 +84,6 @@ impl HierarchyQuery { .map(|ec| ec.ancestor_id) .collect::>(); - // Then fetch the entries if ancestor_ids.is_empty() { return Ok(vec![]); } @@ -94,13 +102,14 @@ impl HierarchyQuery { } } - /// Get entries at a specific depth below an ancestor + /// Returns entries at exactly the specified depth (e.g., all grandchildren = depth 2). + /// + /// Useful for level-by-level tree rendering without fetching the entire subtree. pub async fn get_at_depth( db: &DbConn, ancestor_id: i32, depth: i32, ) -> Result, sea_orm::DbErr> { - // First get IDs at the specific depth let entry_ids = entry_closure::Entity::find() .filter(entry_closure::Column::AncestorId.eq(ancestor_id)) .filter(entry_closure::Column::Depth.eq(depth)) @@ -110,7 +119,6 @@ impl HierarchyQuery { .map(|ec| ec.descendant_id) .collect::>(); - // Then fetch the entries if entry_ids.is_empty() { return Ok(vec![]); } @@ -130,36 +138,35 @@ impl HierarchyQuery { } } - /// Build a full path for an entry by traversing ancestors + /// Constructs the absolute filesystem path by joining location_path + ancestors + entry name. + /// + /// Used for displaying full paths in UI and for validating moves/renames don't exceed + /// filesystem limits. 
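// --- Illustrative sketch (not part of the patch): the write-side maintenance
// the module doc refers to. Inserting a node under `parent_id` copies every
// ancestor row of the parent with depth + 1 and adds a depth-0 self row. The
// SQL here is illustrative; the real schema lives in the entry_closure entity.
async fn insert_closure_rows(
    db: &sea_orm::DatabaseConnection,
    new_id: i32,
    parent_id: i32,
) -> Result<(), sea_orm::DbErr> {
    use sea_orm::{ConnectionTrait, Statement};
    let sql = r#"
        INSERT INTO entry_closure (ancestor_id, descendant_id, depth)
        SELECT ancestor_id, ?, depth + 1
          FROM entry_closure
         WHERE descendant_id = ?     -- parent's ancestors, parent included
        UNION ALL
        SELECT ?, ?, 0               -- the new node's self row
    "#;
    db.execute(Statement::from_sql_and_values(
        sea_orm::DatabaseBackend::Sqlite,
        sql,
        [new_id.into(), parent_id.into(), new_id.into(), new_id.into()],
    ))
    .await?;
    Ok(())
}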
The closure table makes this O(1) instead of recursively walking + /// parent_id links. pub async fn build_full_path( db: &DbConn, entry_id: i32, location_path: &str, ) -> Result { - // Get the entry itself let entry = entry::Entity::find_by_id(entry_id) .one(db) .await? .ok_or_else(|| sea_orm::DbErr::RecordNotFound("Entry not found".to_string()))?; - // Get all ancestors in order (root to parent) let ancestors = Self::get_ancestors(db, entry_id).await?; - // Build the path let mut path = PathBuf::from(location_path); - // Add ancestor names for ancestor in ancestors { path.push(&ancestor.name); } - // Add the entry's own name path.push(&entry.name); Ok(path) } - /// Count total descendants of an entry + /// Counts descendants at any depth without fetching full entry records. pub async fn count_descendants(db: &DbConn, ancestor_id: i32) -> Result { entry_closure::Entity::find() .filter(entry_closure::Column::AncestorId.eq(ancestor_id)) @@ -168,13 +175,16 @@ impl HierarchyQuery { .await } - /// Get subtree size (total size of all descendant files) + /// Sums the size field across all descendants (files and directories). + /// + /// Note: This is a naive sum. For accurate directory subtree sizes, use the + /// pre-aggregated aggregate_size field computed during the aggregation phase. pub async fn get_subtree_size(db: &DbConn, ancestor_id: i32) -> Result { let descendants = Self::get_descendants(db, ancestor_id).await?; Ok(descendants.iter().map(|e| e.size).sum()) } - /// Check if an entry is an ancestor of another + /// Checks if potential_ancestor_id is anywhere above potential_descendant_id in the tree. pub async fn is_ancestor_of( db: &DbConn, potential_ancestor_id: i32, @@ -190,17 +200,18 @@ impl HierarchyQuery { Ok(count > 0) } - /// Find common ancestor of two entries + /// Finds the lowest (deepest) ancestor shared by both entries, if any. + /// + /// Returns None if the entries are in different trees (different locations). + /// Useful for determining relative path operations. pub async fn find_common_ancestor( db: &DbConn, entry1_id: i32, entry2_id: i32, ) -> Result, sea_orm::DbErr> { - // Get ancestors of both entries let ancestors1 = Self::get_ancestors(db, entry1_id).await?; let ancestors2 = Self::get_ancestors(db, entry2_id).await?; - // Find the first common ancestor (starting from the deepest) for ancestor1 in ancestors1.iter().rev() { for ancestor2 in &ancestors2 { if ancestor1.id == ancestor2.id { diff --git a/core/src/ops/indexing/input.rs b/core/src/ops/indexing/input.rs index 13948e656..4f6eb8ad7 100644 --- a/core/src/ops/indexing/input.rs +++ b/core/src/ops/indexing/input.rs @@ -1,4 +1,9 @@ -//! Core input types for indexing operations +//! # Indexing Input Types +//! +//! Defines IndexInput, the canonical request shape for all indexing operations regardless +//! of origin (CLI, API, UI). This type is deserialized from external requests, validated, +//! and converted into IndexerJobConfig for internal execution. Separating input from config +//! keeps the public API stable while internal job parameters evolve. use super::job::{IndexMode, IndexPersistence, IndexScope}; use serde::{Deserialize, Serialize}; @@ -28,7 +33,7 @@ pub struct IndexInput { } impl IndexInput { - /// Create a new input with sane defaults + /// Creates an input with defaults: recursive deep indexing of ephemeral entries, excluding hidden files. 
pub fn new>(library_id: uuid::Uuid, paths: P) -> Self { Self { library_id, @@ -65,7 +70,7 @@ impl IndexInput { self } - /// Validate the input + /// Checks that at least one path is provided; other fields are structurally valid via types. pub fn validate(&self) -> Result<(), Vec> { let mut errors = Vec::new(); diff --git a/core/src/ops/indexing/metrics.rs b/core/src/ops/indexing/metrics.rs index d7936bc35..1bdf41cde 100644 --- a/core/src/ops/indexing/metrics.rs +++ b/core/src/ops/indexing/metrics.rs @@ -1,10 +1,14 @@ -//! Performance metrics and monitoring for the indexer +//! # Indexer Performance Metrics +//! +//! Tracks timing, throughput, database activity, and error counts across all indexing phases. +//! Metrics are computed at job completion and logged for performance analysis. They're also +//! serialized for API responses so clients can display progress summaries and detect bottlenecks. use serde::{Deserialize, Serialize}; use specta::Type; use std::time::{Duration, Instant}; -/// Comprehensive metrics for indexing operations +/// Complete snapshot of indexer performance after job completion. #[derive(Debug, Clone, Serialize, Deserialize, Type)] pub struct IndexerMetrics { // Timing @@ -59,7 +63,7 @@ impl Default for IndexerMetrics { } } -/// Tracks timing for different phases +/// Tracks phase transition times to compute per-phase durations without overlapping timers. #[derive(Debug)] pub struct PhaseTimer { phase_start: Instant, @@ -109,7 +113,7 @@ impl PhaseTimer { } impl IndexerMetrics { - /// Calculate final metrics from state and timer + /// Computes metrics after job completion by combining accumulated stats with elapsed timers. pub fn calculate( stats: &super::state::IndexerStats, timer: &PhaseTimer, @@ -163,7 +167,7 @@ impl IndexerMetrics { } } - /// Format metrics for logging + /// Formats metrics as a multi-line summary suitable for job completion logs. pub fn format_summary(&self) -> String { format!( "Indexing completed in {:.2}s:\n\ diff --git a/core/src/ops/indexing/path_resolver.rs b/core/src/ops/indexing/path_resolver.rs index 03056dc1d..5a8680237 100644 --- a/core/src/ops/indexing/path_resolver.rs +++ b/core/src/ops/indexing/path_resolver.rs @@ -1,6 +1,9 @@ -//! Path resolution service for the pure hierarchical model +//! # Path Resolution via directory_paths Cache //! -//! This service provides efficient path resolution by utilizing the directory_paths lookup table. +//! Resolves full filesystem paths for entries without walking parent_id chains. The directory_paths +//! table caches absolute paths for all directories, making lookups O(1) instead of O(depth). Files +//! are resolved by joining their parent's cached path with the filename. This table is updated during +//! indexing and move operations to keep paths in sync with the entry hierarchy. use std::path::PathBuf; @@ -11,12 +14,11 @@ use crate::infra::db::entities::{directory_paths, entry, DirectoryPaths, Entry}; pub struct PathResolver; impl PathResolver { - /// Get the full path for any entry (file or directory) + /// Resolves the absolute path by looking up directories in the cache or reconstructing file paths. pub async fn get_full_path( db: &C, entry_id: i32, ) -> Result { - // First, get the entry to determine if it's a file or directory let entry = Entry::find_by_id(entry_id) .one(db) .await? 
@@ -24,7 +26,6 @@ impl PathResolver { match entry.entry_kind() { crate::infra::db::entities::entry::EntryKind::Directory => { - // For directories, lookup in directory_paths table let dir_path = DirectoryPaths::find_by_id(entry_id) .one(db) .await? @@ -37,7 +38,6 @@ impl PathResolver { Ok(PathBuf::from(dir_path.path)) } _ => { - // For files, get parent directory path and append full filename (name + extension) if let Some(parent_id) = entry.parent_id { let parent_path = DirectoryPaths::find_by_id(parent_id) .one(db) @@ -49,7 +49,6 @@ impl PathResolver { )) })?; - // Reconstruct full filename: name + extension let full_filename = if let Some(ext) = &entry.extension { format!("{}.{}", entry.name, ext) } else { @@ -58,8 +57,6 @@ impl PathResolver { Ok(PathBuf::from(parent_path.path).join(full_filename)) } else { - // Root file (shouldn't normally happen) - // Still need to add extension if present let full_filename = if let Some(ext) = &entry.extension { format!("{}.{}", entry.name, ext) } else { @@ -71,7 +68,7 @@ impl PathResolver { } } - /// Get the path for a directory from the cache + /// Fetches the cached path string directly from directory_paths without entry lookup. pub async fn get_directory_path( db: &C, directory_id: i32, @@ -88,7 +85,9 @@ impl PathResolver { }) } - /// Build the full path for a new directory entry + /// Constructs the path string for a new directory by joining its parent's path with its name. + /// + /// Used during indexing to populate the directory_paths table for newly discovered directories. pub async fn build_directory_path( db: &C, parent_id: Option, @@ -98,17 +97,15 @@ impl PathResolver { let parent_path = Self::get_directory_path(db, parent_id).await?; Ok(format!("{}/{}", parent_path, name)) } else { - // Root directory Ok(name.to_string()) } } - /// Get paths for multiple entries efficiently + /// Resolves paths for multiple entries in batched queries to minimize database round-trips. pub async fn get_paths_batch( db: &C, entry_ids: Vec, ) -> Result, DbErr> { - // First, fetch all entries to determine types let mut entries: Vec = Vec::new(); let chunk_size: usize = 900; for chunk in entry_ids.chunks(chunk_size) { @@ -121,7 +118,6 @@ impl PathResolver { let mut results = Vec::with_capacity(entries.len()); - // Separate directories and files let mut directory_ids = Vec::new(); let mut file_entries = Vec::new(); @@ -136,7 +132,6 @@ impl PathResolver { } } - // Batch fetch directory paths if !directory_ids.is_empty() { let mut dir_paths: Vec = Vec::new(); for chunk in directory_ids.chunks(chunk_size) { @@ -152,7 +147,6 @@ impl PathResolver { } } - // Handle files by fetching parent paths if !file_entries.is_empty() { let parent_ids: Vec = file_entries.iter().filter_map(|e| e.parent_id).collect(); @@ -165,7 +159,6 @@ impl PathResolver { parent_paths.append(&mut batch); } - // Create a map for quick lookup let parent_map: std::collections::HashMap = parent_paths .into_iter() .map(|dp| (dp.entry_id, dp.path)) @@ -188,15 +181,16 @@ impl PathResolver { Ok(results) } - /// Update all descendant directory paths after a move operation - /// This should be called in a background job after moving a directory + /// Bulk-updates descendant directory paths after moving a directory tree. + /// + /// Uses a single SQL REPLACE to rewrite all paths under the moved directory's old prefix. + /// Should be called after updating the moved directory's entry.parent_id and directory_paths.path. 
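One caveat worth spelling out before the function body: SQL's REPLACE() substitutes every occurrence of its needle anywhere in the string, so the prefix rewrite is only safe when the UPDATE is scoped to the moved subtree, presumably with a WHERE guard on the old prefix (the hunk truncates before any guard, so that scoping is an assumption here). The intended semantics, modeled in plain Rust:

    // Pure-Rust model of the prefix rewrite. Only true descendants match: "/a/b"
    // must rewrite "/a/b/c" but leave "/a/bc" alone.
    fn rewrite_descendant_path(path: &str, old_prefix: &str, new_prefix: &str) -> Option<String> {
        if path == old_prefix {
            return Some(new_prefix.to_string());
        }
        let rest = path.strip_prefix(old_prefix)?;
        // The separator check is what a bare REPLACE() cannot enforce on its own.
        rest.starts_with('/').then(|| format!("{new_prefix}{rest}"))
    }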
pub async fn update_descendant_paths( db: &C, moved_directory_id: i32, old_path: &str, new_path: &str, ) -> Result { - // Use raw SQL for efficient bulk update let sql = r#" UPDATE directory_paths SET path = REPLACE(path, ?, ?) diff --git a/core/src/ops/indexing/phases/aggregation.rs b/core/src/ops/indexing/phases/aggregation.rs index 5fe814320..3ebde7ff9 100644 --- a/core/src/ops/indexing/phases/aggregation.rs +++ b/core/src/ops/indexing/phases/aggregation.rs @@ -1,4 +1,14 @@ -//! Directory size aggregation phase +//! # Directory Size Aggregation +//! +//! Computes total sizes and file counts for directories by traversing from deepest +//! leaves to the root. Each directory's `aggregate_size` includes all descendant files, +//! and `file_count` tracks the total number of files (not subdirectories) contained +//! within. This data powers folder size displays in the UI and enables sorting by size. +//! +//! Processing order matters: children must be aggregated before their parents, so we +//! sort directories by depth (deepest first) before computing. Without this, parent +//! totals would miss unaggregated child contributions. The closure table provides all +//! descendants in one query instead of recursive tree walks. use crate::{ infra::{ @@ -15,7 +25,12 @@ use sea_orm::{ use std::collections::HashMap; use uuid::Uuid; -/// Run the directory aggregation phase +/// Aggregates directory sizes and file counts from leaves to root. +/// +/// Queries all directories under the location using the closure table, sorts them by +/// depth (deepest first), then computes aggregate_size and file_count for each by +/// summing direct children. Updates indexed_at after each directory so sync picks up +/// the aggregated values. Skips locations without an entry_id (not yet indexed). pub async fn run_aggregation_phase( location_id: Uuid, state: &mut IndexerState, @@ -23,7 +38,6 @@ pub async fn run_aggregation_phase( ) -> Result<(), JobError> { ctx.log("Starting directory size aggregation phase"); - // Get the location record let location_record = entities::location::Entity::find() .filter(entities::location::Column::Uuid.eq(location_id)) .one(ctx.library_db()) @@ -33,8 +47,6 @@ pub async fn run_aggregation_phase( let location_id_i32 = location_record.id; - // Find all directories under this location using closure table - // First get all descendant IDs let descendant_ids = entities::entry_closure::Entity::find() .filter(entities::entry_closure::Column::AncestorId.eq(location_record.entry_id)) .all(ctx.library_db()) @@ -44,14 +56,12 @@ pub async fn run_aggregation_phase( .map(|ec| ec.descendant_id) .collect::>(); - // Add the root entry itself (skip if location has no entry_id) let Some(root_entry_id) = location_record.entry_id else { - return Ok(()); // Skip if location not yet synced + return Ok(()); }; let mut all_entry_ids = vec![root_entry_id]; all_entry_ids.extend(descendant_ids); - // Now get all directories from these entries let mut directories: Vec = Vec::new(); // SQLite has a bind parameter limit (~999). Query in safe chunks. let chunk_size: usize = 900; @@ -65,18 +75,15 @@ pub async fn run_aggregation_phase( directories.append(&mut batch); } - // Sort directories by their depth in the hierarchy (deepest first) - // We'll use a simple approach: count parents + // Count depth by following parent links up to root. 
let mut dir_depths: Vec<(entities::entry::Model, usize)> = Vec::new(); for directory in directories { let mut depth = 0; let mut current_parent_id = directory.parent_id; - // Count the depth by following parent links while let Some(parent_id) = current_parent_id { depth += 1; - // Find the parent to get its parent_id if let Ok(Some(parent)) = entities::entry::Entity::find_by_id(parent_id) .one(ctx.library_db()) .await @@ -90,7 +97,6 @@ pub async fn run_aggregation_phase( dir_depths.push((directory, depth)); } - // Sort by depth (deepest first) dir_depths.sort_by(|a, b| b.1.cmp(&a.1)); let directories: Vec = dir_depths.into_iter().map(|(dir, _)| dir).collect(); @@ -98,7 +104,6 @@ pub async fn run_aggregation_phase( let total_dirs = directories.len(); ctx.log(format!("Found {} directories to aggregate", total_dirs)); - // Process directories from leaves to root let mut processed = 0; let aggregator = DirectoryAggregator::new(ctx.library_db().clone()); @@ -125,16 +130,14 @@ pub async fn run_aggregation_phase( }; ctx.progress(Progress::generic(indexer_progress.to_generic_progress())); - // Calculate aggregate values for this directory match aggregator.aggregate_directory(&directory).await { Ok((aggregate_size, child_count, file_count)) => { - // Update the directory entry let directory_name = directory.name.clone(); let mut active_dir: entities::entry::ActiveModel = directory.into(); active_dir.aggregate_size = Set(aggregate_size); active_dir.child_count = Set(child_count); active_dir.file_count = Set(file_count); - // Update indexed_at so aggregate changes are picked up by sync + // Bump indexed_at so sync picks up aggregate changes. active_dir.indexed_at = Set(Some(chrono::Utc::now())); active_dir.update(ctx.library_db()).await.map_err(|e| { @@ -153,8 +156,6 @@ pub async fn run_aggregation_phase( )); } } - - // State is automatically saved during job serialization on shutdown } ctx.log(format!( @@ -174,12 +175,15 @@ impl DirectoryAggregator { Self { db } } - /// Calculate aggregate size, child count, and file count for a directory + /// Computes aggregate values by summing direct children only. + /// + /// Files contribute their size directly. Subdirectories contribute their already-computed + /// aggregate_size and file_count (this is why we process deepest-first). Symlinks are + /// treated as files for counting purposes. async fn aggregate_directory( &self, directory: &entities::entry::Model, ) -> Result<(i64, i32, i32), DbErr> { - // Get all direct children using parent_id only let children = entities::entry::Entity::find() .filter(entities::entry::Column::ParentId.eq(directory.id)) .all(&self.db) @@ -192,21 +196,19 @@ impl DirectoryAggregator { for child in children { match child.kind { 0 => { - // File aggregate_size += child.size; file_count += 1; } 1 => { - // Directory aggregate_size += child.aggregate_size; file_count += child.file_count; } 2 => { - // Symlink - count as file + // Symlinks count as files. aggregate_size += child.size; file_count += 1; } - _ => {} // Unknown type, skip + _ => {} } } @@ -214,9 +216,12 @@ impl DirectoryAggregator { } } -/// One-time migration to calculate all directory sizes for existing data +/// Backfills aggregate_size and file_count for all existing directories across all locations. +/// +/// This is a one-time migration for databases created before aggregation was added. +/// Safe to run multiple times (idempotent). Processes each location independently, +/// sorting directories by depth within each location tree. 
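The deepest-first contract that both this pass and the migration below depend on, reduced to a miniature with illustrative records (kind codes follow the hunk: 0 = file, 1 = directory, 2 = symlink):

    use std::collections::HashMap;

    // Illustrative records; kind codes as in the hunk: 0 = file, 1 = directory, 2 = symlink.
    struct Rec {
        id: i32,
        parent_id: Option<i32>,
        kind: u8,
        size: i64,
        aggregate_size: i64,
        file_count: i32,
    }

    fn aggregate_bottom_up(entries: &mut [Rec]) {
        // Depth = number of parent links to the root.
        let parents: HashMap<i32, Option<i32>> =
            entries.iter().map(|e| (e.id, e.parent_id)).collect();
        let depth = |mut id: i32| {
            let mut d = 0usize;
            while let Some(Some(p)) = parents.get(&id) {
                d += 1;
                id = *p;
            }
            d
        };

        let mut dir_ids: Vec<i32> = entries.iter().filter(|e| e.kind == 1).map(|e| e.id).collect();
        dir_ids.sort_by_key(|id| std::cmp::Reverse(depth(*id))); // deepest first

        // Each directory sums direct children only; subdirectories are already final.
        for dir_id in dir_ids {
            let (mut agg, mut files) = (0i64, 0i32);
            for child in entries.iter().filter(|e| e.parent_id == Some(dir_id)) {
                match child.kind {
                    1 => {
                        agg += child.aggregate_size;
                        files += child.file_count;
                    }
                    _ => {
                        // Files and symlinks both count as files.
                        agg += child.size;
                        files += 1;
                    }
                }
            }
            if let Some(dir) = entries.iter_mut().find(|e| e.id == dir_id) {
                dir.aggregate_size = agg;
                dir.file_count = files;
            }
        }
    }

Processing a parent before its children here would silently read stale (zero) aggregate_size values, which is exactly the failure mode the depth sort prevents.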
pub async fn migrate_directory_sizes(db: &DatabaseConnection) -> Result<(), DbErr> { - // Get all locations let locations = entities::location::Entity::find().all(db).await?; for location in locations { @@ -225,7 +230,6 @@ pub async fn migrate_directory_sizes(db: &DatabaseConnection) -> Result<(), DbEr location.name.as_deref().unwrap_or("Unknown") ); - // Find all directories under this location using closure table let Some(root_entry_id) = location.entry_id else { tracing::warn!( "Skipping location {} - entry_id not set (not yet synced)", @@ -256,7 +260,6 @@ pub async fn migrate_directory_sizes(db: &DatabaseConnection) -> Result<(), DbEr directories.append(&mut batch); } - // Sort by depth (deepest first) - same logic as above let mut dir_depths: Vec<(entities::entry::Model, usize)> = Vec::new(); for directory in directories { @@ -290,7 +293,7 @@ pub async fn migrate_directory_sizes(db: &DatabaseConnection) -> Result<(), DbEr active_dir.aggregate_size = Set(aggregate_size); active_dir.child_count = Set(child_count); active_dir.file_count = Set(file_count); - // Update indexed_at so aggregate changes are picked up by sync + // Bump indexed_at so sync picks up aggregate changes. active_dir.indexed_at = Set(Some(chrono::Utc::now())); active_dir.update(db).await?; diff --git a/core/src/ops/indexing/phases/mod.rs b/core/src/ops/indexing/phases/mod.rs index 4519d3175..15d473880 100644 --- a/core/src/ops/indexing/phases/mod.rs +++ b/core/src/ops/indexing/phases/mod.rs @@ -1,10 +1,15 @@ -//! Indexer phases implementation +//! # Indexer Execution Phases //! -//! The indexer operates in distinct phases for clarity and resumability: -//! 1. Discovery - Walk directories and collect entries -//! 2. Processing - Create/update database records -//! 3. Aggregation - Calculate directory sizes -//! 4. Content - Generate content identities +//! The indexer runs in four sequential phases to enable resumability and incremental +//! progress tracking. Each phase is independently checkpointed so interrupted jobs can +//! resume mid-phase without reprocessing completed work. This prevents re-walking large +//! directories or re-hashing files after crashes or cancellations. +//! +//! Discovery walks the filesystem and collects raw metadata. Processing converts those +//! entries into database records with stable UUIDs. Aggregation bubbles up directory +//! sizes from leaves to root (required for accurate folder size reporting). Content +//! identification hashes file contents for deduplication and generates deterministic +//! sync UUIDs. pub mod aggregation; pub mod content; diff --git a/core/src/ops/indexing/processor.rs b/core/src/ops/indexing/processor.rs index 624514c9d..33278a581 100644 --- a/core/src/ops/indexing/processor.rs +++ b/core/src/ops/indexing/processor.rs @@ -1,4 +1,9 @@ -//! Content hash processor - atomic operation for generating and linking content identities +//! # Content Hash Processor +//! +//! Generates BLAKE3 content hashes for files and links them to content_identity records. Each +//! processor execution is atomic: hash generation, identity creation/lookup, and entry linking +//! happen in a single transaction. This ensures entries either have valid content_id references +//! or remain unlinked if processing fails. 
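The hashing step itself is not shown in this patch; as a rough sketch, streaming a file through the blake3 crate's public API looks like the following. The 64 KiB buffer and hex encoding are choices made here, not necessarily what ContentHashGenerator does:

    use std::{
        fs::File,
        io::{self, Read},
        path::Path,
    };

    // Minimal streaming BLAKE3 content hash; constant memory regardless of file size.
    fn content_hash(path: &Path) -> io::Result<String> {
        let mut file = File::open(path)?;
        let mut hasher = blake3::Hasher::new();
        let mut buf = [0u8; 64 * 1024];
        loop {
            let n = file.read(&mut buf)?;
            if n == 0 {
                break;
            }
            hasher.update(&buf[..n]);
        }
        Ok(hasher.finalize().to_hex().to_string())
    }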
use super::{ctx::IndexingCtx, entry::EntryProcessor, state::EntryKind}; use crate::domain::content_identity::ContentHashGenerator; @@ -8,7 +13,7 @@ use std::path::PathBuf; use tracing::debug; use uuid::Uuid; -/// Entry data for processor execution +/// Minimal entry snapshot required for content processing without full database models. #[derive(Debug, Clone)] pub struct ProcessorEntry { pub id: i32, @@ -20,7 +25,7 @@ pub struct ProcessorEntry { pub mime_type: Option, } -/// Result of processor execution +/// Outcome of a single processor run: success/failure, artifacts created, and bytes processed. #[derive(Debug, Clone)] pub struct ProcessorResult { pub success: bool, @@ -49,7 +54,7 @@ impl ProcessorResult { } } -/// Processor configuration +/// Per-processor settings: type, enabled flag, and arbitrary JSON config. #[derive(Debug, Clone, Serialize, Deserialize)] pub struct ProcessorConfig { pub processor_type: String, @@ -58,7 +63,7 @@ pub struct ProcessorConfig { pub settings: serde_json::Value, } -/// Location processor configuration +/// Collection of processors that run automatically on watcher events for a location. #[derive(Debug, Clone, Serialize, Deserialize)] pub struct LocationProcessorConfig { #[serde(default)] @@ -84,7 +89,7 @@ impl Default for LocationProcessorConfig { }, ProcessorConfig { processor_type: "thumbstrip".to_string(), - enabled: true, // Fast enough for auto-generation (~6s) + enabled: true, // ~6s per video, acceptable for auto-generation. settings: serde_json::json!({ "variants": ["thumbstrip_preview"], "regenerate": false @@ -92,7 +97,7 @@ impl Default for LocationProcessorConfig { }, ProcessorConfig { processor_type: "proxy".to_string(), - enabled: false, // Disabled by default (user opt-in, ~8s per video) + enabled: false, // User opt-in required (~8s per video). settings: serde_json::json!({ "enabled": false, "max_file_size_gb": 5, @@ -101,7 +106,7 @@ impl Default for LocationProcessorConfig { }, ProcessorConfig { processor_type: "ocr".to_string(), - enabled: false, // Disabled by default (expensive) + enabled: false, // Expensive, user opt-in. settings: serde_json::json!({ "languages": ["eng"], "min_confidence": 0.6 @@ -109,7 +114,7 @@ impl Default for LocationProcessorConfig { }, ProcessorConfig { processor_type: "speech_to_text".to_string(), - enabled: false, // Disabled by default (very expensive) + enabled: false, // Very expensive, user opt-in. settings: serde_json::json!({ "model": "base", "language": null @@ -120,7 +125,7 @@ impl Default for LocationProcessorConfig { } } -/// Content hash processor +/// Generates BLAKE3 hashes and creates content_identity records for files. pub struct ContentHashProcessor { library_id: Uuid, } @@ -159,7 +164,7 @@ impl ContentHashProcessor { } } -/// Load processor configuration for a location +/// Loads processor config from the location's database record, falling back to defaults. pub async fn load_location_processor_config( _location_id: Uuid, _db: &sea_orm::DatabaseConnection, diff --git a/core/src/ops/indexing/progress.rs b/core/src/ops/indexing/progress.rs index 4cb666ede..7240e0c49 100644 --- a/core/src/ops/indexing/progress.rs +++ b/core/src/ops/indexing/progress.rs @@ -1,4 +1,8 @@ -//! IndexerProgress to GenericProgress conversion +//! # IndexerProgress to GenericProgress Conversion +//! +//! Maps indexer-specific progress (phases, stats) to the generic job progress format for UI display. +//! Each phase is assigned a percentage range to show continuous progress across all four stages. +//! 
The converter handles path filtering to distinguish between real filesystem paths and status messages. use super::state::{IndexPhase, IndexerProgress}; use crate::{ @@ -73,20 +77,16 @@ impl ToGenericProgress for IndexerProgress { } }; - // Convert current_path string to SdPath only if it's a real filesystem path - // During aggregation, current_path contains status messages like "Aggregating directory 3846/3877: info" - // During other phases, it might contain actual file paths + // Filter out status messages from current_path - only convert real filesystem paths to SdPath. let current_path = if !self.current_path.is_empty() && !self.current_path.starts_with("Aggregating directory") && !self.current_path.starts_with("Finalizing") { - // Only create SdPath if it looks like a real path (absolute or relative with separators) let path_buf = PathBuf::from(&self.current_path); if path_buf.is_absolute() || self.current_path.contains('/') || self.current_path.contains('\\') { - // Try to parse as URI first (for cloud paths), fall back to local path SdPath::from_uri(&self.current_path) .ok() .or_else(|| Some(SdPath::local(path_buf))) @@ -97,34 +97,29 @@ impl ToGenericProgress for IndexerProgress { None }; - // completion_info is already set correctly from phase matching above let final_completion = completion_info; - // Create the generic progress let mut progress = GenericProgress::new(percentage, &phase_name, &phase_message) - .with_bytes(self.total_found.bytes, self.total_found.bytes) // Total bytes found so far + .with_bytes(self.total_found.bytes, self.total_found.bytes) .with_performance( self.processing_rate, self.estimated_remaining, - None, // Could calculate elapsed time from start + None, ) - .with_errors(self.total_found.errors, 0) // No separate warning count in IndexerStats - .with_metadata(self); // Include original indexer progress as metadata + .with_errors(self.total_found.errors, 0) + .with_metadata(self); - // Set completion data - for finalizing phase, manually set to avoid auto-percentage calculation + // Finalizing phase uses manual completion to preserve custom percentage ranges. match &self.phase { IndexPhase::Finalizing { .. } => { - // Manually set completion to preserve our custom percentage calculation progress.completion.completed = final_completion.0; progress.completion.total = final_completion.1; } _ => { - // For other phases, use normal with_completion which auto-calculates percentage progress = progress.with_completion(final_completion.0, final_completion.1); } } - // Set current path if available if let Some(path) = current_path { progress = progress.with_current_path(path); } diff --git a/core/src/ops/indexing/responder.rs b/core/src/ops/indexing/responder.rs index b1981322a..11006ad13 100644 --- a/core/src/ops/indexing/responder.rs +++ b/core/src/ops/indexing/responder.rs @@ -94,7 +94,10 @@ async fn path_exists_safe( } } -/// Apply a raw FS change by resolving it to DB operations (create/modify/move/delete) +/// Translates a single filesystem event into database mutations: create, modify, rename, or remove. +/// +/// Queries the database to resolve paths to entry IDs, then delegates to specialized handlers. +/// For creates/modifies, runs the processor pipeline (content hash, thumbnails, etc.) inline. pub async fn apply( context: &Arc, library_id: Uuid, @@ -152,7 +155,11 @@ pub async fn apply( Ok(()) } -/// Apply a batch of raw FS changes with optimized processing +/// Processes multiple filesystem events as a batch, deduplicating and ordering for correctness. 
+/// +/// Groups events by type, deduplicates (macOS sends duplicate creates), then processes in order: +/// removes first, then renames, creates, modifies. This prevents conflicts like creating a file +/// that should have been deleted. pub async fn apply_batch( context: &Arc, library_id: Uuid, @@ -181,7 +188,6 @@ pub async fn apply_batch( // Lightweight indexing context for DB access let ctx = ResponderCtx::new(context, library_id).await?; - // Group events by type for potential bulk operations let mut creates = Vec::new(); let mut modifies = Vec::new(); let mut removes = Vec::new(); @@ -196,8 +202,7 @@ pub async fn apply_batch( } } - // Deduplicate events - macOS FSEvents can send duplicate Create events for the same file - // when it's written in stages (common for screenshots, large files, etc) + // macOS FSEvents sends duplicate creates when files are written incrementally. creates.sort(); creates.dedup(); modifies.sort(); @@ -205,9 +210,6 @@ pub async fn apply_batch( removes.sort(); removes.dedup(); - // Process in order: removes first, then renames, then creates, then modifies - // This ensures we don't try to create files that should be removed, etc. - debug!( "Processing batch: {} creates, {} modifies, {} removes, {} renames", creates.len(), @@ -298,7 +300,7 @@ pub async fn apply_batch( Ok(()) } -/// Get the location's root entry ID for scoping queries +/// Fetches the location's root entry_id to scope path lookups within the correct location tree. async fn get_location_root_entry_id(ctx: &impl IndexingCtx, location_id: Uuid) -> Result { let location_record = entities::location::Entity::find() .filter(entities::location::Column::Uuid.eq(location_id)) @@ -311,17 +313,15 @@ async fn get_location_root_entry_id(ctx: &impl IndexingCtx, location_id: Uuid) - .ok_or_else(|| anyhow::anyhow!("Location {} has no root entry", location_id)) } -/// Check if a path should be filtered based on indexing rules +/// Evaluates indexing rules to determine if a path should be skipped (hidden files, system dirs, etc.). async fn should_filter_path( path: &Path, rule_toggles: RuleToggles, location_root: &Path, backend: Option<&Arc>, ) -> Result { - // Build ruler for this path using the same logic as the indexer let ruler = build_default_ruler(rule_toggles, location_root, path).await; - // Get metadata for the path using backend if available let metadata = if let Some(backend) = backend { backend .metadata(path) @@ -346,7 +346,6 @@ async fn should_filter_path( } }; - // Simple metadata implementation for rule evaluation struct SimpleMetadata { is_dir: bool, } @@ -360,7 +359,6 @@ async fn should_filter_path( is_dir: metadata.kind == crate::ops::indexing::state::EntryKind::Directory, }; - // Evaluate the path against the ruler match ruler.evaluate_path(path, &simple_meta).await { Ok(RulerDecision::Reject) => { debug!("Filtered path by indexing rules: {}", path.display()); @@ -369,12 +367,16 @@ async fn should_filter_path( Ok(RulerDecision::Accept) => Ok(false), Err(e) => { tracing::warn!("Error evaluating rules for {}: {}", path.display(), e); - Ok(false) // Don't filter on error, let it through + Ok(false) } } } -/// Handle create: extract metadata and insert via EntryProcessor +/// Creates a new entry for the path, runs processors, and spawns recursive indexing for directories. +/// +/// Checks for duplicate creates (race conditions), inode-based moves, and filters based on rules. +/// For directories, dispatches an IndexerJob to index contents. 
For files, runs the processor +/// pipeline inline (content hash, thumbnails, etc.). async fn handle_create( ctx: &impl IndexingCtx, context: &Arc, @@ -387,7 +389,6 @@ async fn handle_create( ) -> Result<()> { debug!("Create: {}", path.display()); - // Verify path is accessible before processing match path_exists_safe(path, backend).await { Ok(true) => { // Path exists and is accessible, proceed From b6779d71acfc79d9a2d238840742d9c0dfcf1a34 Mon Sep 17 00:00:00 2001 From: Jamie Pine Date: Sun, 7 Dec 2025 23:17:51 -0800 Subject: [PATCH 10/20] Enhance ephemeral indexing with error handling and memory-mapped storage - Updated `EphemeralIndex` and `NodeArena` to return `std::io::Result` for better error handling during creation and insertion. - Implemented memory-mapped storage in `NodeArena` to efficiently manage large indexes, preventing out-of-memory errors. - Refactored `EphemeralIndexCache` to handle initialization errors gracefully. - Improved tests to validate new error handling and memory management features. --- core/src/context.rs | 4 +- core/src/ops/indexing/ephemeral/arena.rs | 305 +++++++++++++----- .../src/ops/indexing/ephemeral/index_cache.rs | 20 +- core/src/ops/indexing/job.rs | 36 ++- core/src/ops/indexing/persistence.rs | 13 +- core/src/ops/indexing/responder.rs | 125 ++----- core/src/ops/indexing/verify/action.rs | 5 +- 7 files changed, 307 insertions(+), 201 deletions(-) diff --git a/core/src/context.rs b/core/src/context.rs index e19f1915d..0c03a068f 100644 --- a/core/src/context.rs +++ b/core/src/context.rs @@ -49,7 +49,9 @@ impl CoreContext { action_manager: Arc::new(RwLock::new(None)), networking: Arc::new(RwLock::new(None)), plugin_manager: Arc::new(RwLock::new(None)), - ephemeral_index_cache: Arc::new(EphemeralIndexCache::new()), + ephemeral_index_cache: Arc::new( + EphemeralIndexCache::new().expect("Failed to create ephemeral index cache"), + ), job_logging_config: None, job_logs_dir: None, } diff --git a/core/src/ops/indexing/ephemeral/arena.rs b/core/src/ops/indexing/ephemeral/arena.rs index 198e5b58b..b3aeeaa78 100644 --- a/core/src/ops/indexing/ephemeral/arena.rs +++ b/core/src/ops/indexing/ephemeral/arena.rs @@ -1,112 +1,246 @@ -//! Vec-based arena storage for file nodes +//! # Memory-Mapped Arena for Ephemeral File Nodes //! -//! The NodeArena provides efficient, contiguous storage for FileNodes. -//! Key features: -//! - O(1) insertion and lookup by EntryId -//! - Cache-friendly contiguous memory layout -//! - Iteration over all nodes +//! `NodeArena` stores file nodes in memory-mapped temporary files, allowing the OS +//! to page data in and out as needed. This prevents out-of-memory errors when browsing +//! large network shares or external drives with millions of files. //! -//! For very large indexes (10M+ files), this could be upgraded to use -//! memory-mapped storage, but Vec is sufficient for most use cases. +//! Entries are stored contiguously at stable u32 indices (EntryIds), providing O(1) +//! lookup while keeping memory usage bounded. When RAM is tight, the OS pages cold +//! entries to disk automatically. The backing file is anonymous and cleaned up on drop, +//! so no manual file management is needed. +//! +//! The arena doubles capacity (1024 → 2048 → 4096 → ...) when full, minimizing +//! expensive remap operations while staying within Vec-like amortized O(1) insertion. 
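A quick sanity check on the doubling policy (the 1024-slot floor is the CAPACITY constant in the code below): remaps scale logarithmically with entry count, so even a very large index pays for only a handful.

    // Capacity after r remaps is 1024 * 2^r, so reaching n entries costs
    // ceil(log2(n / 1024)) remaps, about 10 for a million files.
    fn remaps_to_hold(n: usize, initial: usize) -> u32 {
        let mut capacity = initial;
        let mut remaps = 0;
        while capacity < n {
            capacity *= 2;
            remaps += 1;
        }
        remaps
    }

    fn main() {
        assert_eq!(remaps_to_hold(1_000_000, 1024), 10);
    }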
use super::types::{EntryId, FileNode}; +use memmap2::{MmapMut, MmapOptions}; +use std::{ + io, + mem::{self, MaybeUninit}, + num::NonZeroUsize, + slice, +}; +use tempfile::NamedTempFile; -/// Arena storage for file nodes using a simple Vec +const CAPACITY: usize = 1024; + +/// Slab allocator backed by an anonymous memory-mapped temporary file. /// -/// Nodes are stored contiguously in memory for cache-friendly access. -/// EntryIds are stable indexes into this Vec. +/// The OS manages paging, allowing large indexes to spill to disk under memory +/// pressure without crashing. EntryIds remain stable across capacity growth, +/// enabling parent-child relationships to persist through remaps. pub struct NodeArena { - /// Vector of nodes - nodes: Vec, + file: NamedTempFile, + mmap: MmapMut, + capacity: NonZeroUsize, + len: usize, } impl NodeArena { - /// Create a new empty arena - pub fn new() -> Self { - Self { nodes: Vec::new() } + pub fn new() -> io::Result { + Self::with_capacity(CAPACITY) } - /// Create an arena with pre-allocated capacity - pub fn with_capacity(capacity: usize) -> Self { - Self { - nodes: Vec::with_capacity(capacity), + pub fn with_capacity(capacity: usize) -> io::Result { + let capacity = NonZeroUsize::new(capacity.max(1)).unwrap(); + let mut file = NamedTempFile::new()?; + let mmap = Self::map_file(&mut file, capacity)?; + + Ok(Self { + file, + mmap, + capacity, + len: 0, + }) + } + + fn map_file(file: &mut NamedTempFile, slots: NonZeroUsize) -> io::Result { + let bytes = (slots.get() as u64).saturating_mul(mem::size_of::() as u64); + file.as_file_mut().set_len(bytes)?; + unsafe { MmapOptions::new().map_mut(file.as_file()) } + } + + /// Doubles capacity until min_capacity is reached. + fn ensure_capacity(&mut self, min_capacity: NonZeroUsize) -> io::Result<()> { + if min_capacity <= self.capacity { + return Ok(()); + } + + let mut new_capacity = self.capacity; + while new_capacity < min_capacity { + new_capacity = new_capacity.saturating_mul(NonZeroUsize::new(2).unwrap()); + } + + self.remap(new_capacity) + } + + /// Flushes dirty pages, expands the file, and remaps with new capacity. + fn remap(&mut self, new_capacity: NonZeroUsize) -> io::Result<()> { + assert!(new_capacity.get() >= self.len); + self.mmap.flush()?; + self.mmap = Self::map_file(&mut self.file, new_capacity)?; + self.capacity = new_capacity; + Ok(()) + } + + fn grow(&mut self) -> io::Result<()> { + let desired = self.capacity.saturating_mul(NonZeroUsize::new(2).unwrap()); + self.ensure_capacity(desired) + } + + fn entries(&self) -> &[MaybeUninit] { + unsafe { + slice::from_raw_parts( + self.mmap.as_ptr().cast::>(), + self.capacity.get(), + ) } } - /// Insert a node and return its ID - pub fn insert(&mut self, node: FileNode) -> EntryId { - let id = EntryId::from_usize(self.nodes.len()); - self.nodes.push(node); - id + fn entries_mut(&mut self) -> &mut [MaybeUninit] { + unsafe { + slice::from_raw_parts_mut( + self.mmap.as_mut_ptr().cast::>(), + self.capacity.get(), + ) + } + } + + /// Appends a node and returns its stable ID. + /// + /// The arena grows automatically when full, remapping to a larger capacity. + /// EntryIds remain valid across remaps since they're just indices. 
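Every unsafe block in the methods that follow leans on a single invariant: len counts the prefix of slots that have been write()-initialized, and assume_init_* is only ever called below it. The same pattern stand-alone, with String in place of FileNode:

    use std::mem::MaybeUninit;

    fn main() {
        // `len` counts the initialized prefix; everything below it is safe to read.
        let mut slots: [MaybeUninit<String>; 4] = std::array::from_fn(|_| MaybeUninit::uninit());
        let mut len = 0;

        slots[len].write(String::from("hello"));
        len += 1;

        // Sound: index 0 < len, so the slot was written.
        let s = unsafe { slots[0].assume_init_ref() };
        assert_eq!(s.as_str(), "hello");
        // Reading slots[1] here would be undefined behavior: it was never written.

        // Drop only the initialized prefix, mirroring the arena's Drop impl.
        for slot in &mut slots[..len] {
            unsafe { slot.assume_init_drop() };
        }
    }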
+    pub fn insert(&mut self, node: FileNode) -> io::Result<EntryId> {
+        if self.len == self.capacity.get() {
+            self.grow()?;
+        }
+
+        let index = self.len;
+        let id = EntryId::from_usize(index);
+
+        unsafe {
+            self.entries_mut().get_unchecked_mut(index).write(node);
+        }
+
+        self.len += 1;
+        Ok(id)
     }
 
-    /// Get node by ID
     pub fn get(&self, id: EntryId) -> Option<&FileNode> {
-        self.nodes.get(id.as_usize())
+        if id.as_usize() < self.len {
+            Some(unsafe {
+                self.entries()
+                    .get_unchecked(id.as_usize())
+                    .assume_init_ref()
+            })
+        } else {
+            None
+        }
     }
 
-    /// Get mutable node by ID
     pub fn get_mut(&mut self, id: EntryId) -> Option<&mut FileNode> {
-        self.nodes.get_mut(id.as_usize())
+        if id.as_usize() < self.len {
+            Some(unsafe {
+                self.entries_mut()
+                    .get_unchecked_mut(id.as_usize())
+                    .assume_init_mut()
+            })
+        } else {
+            None
+        }
     }
 
-    /// Get the number of nodes
     pub fn len(&self) -> usize {
-        self.nodes.len()
+        self.len
     }
 
-    /// Check if the arena is empty
     pub fn is_empty(&self) -> bool {
-        self.nodes.is_empty()
+        self.len == 0
     }
 
-    /// Shrink capacity to fit current size
-    pub fn shrink_to_fit(&mut self) {
-        self.nodes.shrink_to_fit();
-    }
+    /// No-op for memory-mapped arenas; the OS manages paging.
+    pub fn shrink_to_fit(&mut self) {}
 
-    /// Get current capacity
     pub fn capacity(&self) -> usize {
-        self.nodes.capacity()
+        self.capacity.get()
     }
 
-    /// Reserve additional capacity
-    pub fn reserve(&mut self, additional: usize) {
-        self.nodes.reserve(additional);
+    pub fn reserve(&mut self, additional: usize) -> io::Result<()> {
+        let new_capacity = self.len.saturating_add(additional);
+        if let Some(min_cap) = NonZeroUsize::new(new_capacity) {
+            self.ensure_capacity(min_cap)?;
+        }
+        Ok(())
     }
 
-    /// Iterate over all nodes
     pub fn iter(&self) -> impl Iterator<Item = (EntryId, &FileNode)> {
-        self.nodes
-            .iter()
-            .enumerate()
-            .map(|(i, node)| (EntryId::from_usize(i), node))
+        (0..self.len).map(move |i| {
+            let id = EntryId::from_usize(i);
+            let node = unsafe { self.entries().get_unchecked(i).assume_init_ref() };
+            (id, node)
+        })
    }
 
-    /// Iterate over all nodes mutably
-    pub fn iter_mut(&mut self) -> impl Iterator<Item = (EntryId, &mut FileNode)> {
-        self.nodes
-            .iter_mut()
-            .enumerate()
-            .map(|(i, node)| (EntryId::from_usize(i), node))
+    pub fn iter_mut(&mut self) -> ArenaIterMut<'_> {
+        let len = self.len;
+        ArenaIterMut {
+            entries: self.entries_mut(),
+            len,
+            index: 0,
+        }
     }
 
-    /// Get approximate memory usage in bytes
+    /// Reports total allocation including mmap overhead and child vectors.
pub fn memory_usage(&self) -> usize { - // Base struct size + Vec allocation - std::mem::size_of::() - + self.nodes.capacity() * std::mem::size_of::() - + self - .nodes - .iter() - .map(|n| n.children.capacity() * std::mem::size_of::()) + mem::size_of::() + + (self.capacity.get() * mem::size_of::()) + + (0..self.len) + .filter_map(|i| self.get(EntryId::from_usize(i))) + .map(|n| n.children.capacity() * mem::size_of::()) .sum::() } } +pub struct ArenaIterMut<'a> { + entries: &'a mut [MaybeUninit], + len: usize, + index: usize, +} + +impl<'a> Iterator for ArenaIterMut<'a> { + type Item = (EntryId, &'a mut FileNode); + + fn next(&mut self) -> Option { + if self.index >= self.len { + return None; + } + + let id = EntryId::from_usize(self.index); + let node = unsafe { + let ptr = self.entries.as_mut_ptr().add(self.index); + &mut *(*ptr).as_mut_ptr() + }; + + self.index += 1; + Some((id, node)) + } +} + impl Default for NodeArena { fn default() -> Self { - Self::new() + Self::new().expect("Failed to create default NodeArena") + } +} + +impl Drop for NodeArena { + fn drop(&mut self) { + for i in 0..self.len { + unsafe { + self.entries_mut().get_unchecked_mut(i).assume_init_drop(); + } + } + + let _ = self.mmap.flush(); } } @@ -124,10 +258,14 @@ mod tests { #[test] fn test_insert_and_get() { - let mut arena = NodeArena::new(); + let mut arena = NodeArena::new().expect("failed to create arena"); - let id1 = arena.insert(make_test_node("file1.txt")); - let id2 = arena.insert(make_test_node("file2.txt")); + let id1 = arena + .insert(make_test_node("file1.txt")) + .expect("insert failed"); + let id2 = arena + .insert(make_test_node("file2.txt")) + .expect("insert failed"); assert_eq!(arena.len(), 2); assert_eq!(arena.get(id1).unwrap().name(), "file1.txt"); @@ -136,17 +274,17 @@ mod tests { #[test] fn test_get_nonexistent() { - let arena = NodeArena::new(); + let arena = NodeArena::new().expect("failed to create arena"); assert!(arena.get(EntryId::from_usize(0)).is_none()); } #[test] fn test_iteration() { - let mut arena = NodeArena::new(); + let mut arena = NodeArena::new().expect("failed to create arena"); - arena.insert(make_test_node("a")); - arena.insert(make_test_node("b")); - arena.insert(make_test_node("c")); + arena.insert(make_test_node("a")).expect("insert failed"); + arena.insert(make_test_node("b")).expect("insert failed"); + arena.insert(make_test_node("c")).expect("insert failed"); let names: Vec<&str> = arena.iter().map(|(_, node)| node.name()).collect(); assert_eq!(names, vec!["a", "b", "c"]); @@ -154,16 +292,35 @@ mod tests { #[test] fn test_with_capacity() { - let arena = NodeArena::with_capacity(1000); + let arena = NodeArena::with_capacity(1000).expect("failed to create arena"); assert!(arena.capacity() >= 1000); assert!(arena.is_empty()); } #[test] fn test_shrink_to_fit() { - let mut arena = NodeArena::with_capacity(1000); - arena.insert(make_test_node("a")); + let mut arena = NodeArena::with_capacity(1000).expect("failed to create arena"); + arena.insert(make_test_node("a")).expect("insert failed"); arena.shrink_to_fit(); - assert!(arena.capacity() < 1000); + assert!(arena.capacity() >= 1000); + } + + #[test] + fn test_large_arena_growth() { + let mut arena = NodeArena::new().expect("failed to create arena"); + + for i in 0..10_000 { + let node = make_test_node(&format!("file{}.txt", i)); + arena.insert(node).expect("insert should succeed"); + } + + assert_eq!(arena.len(), 10_000); + assert!(arena.capacity() >= 10_000); + + for i in 0..10_000 { + let id = EntryId::from_usize(i); 
+ let node = arena.get(id).expect("node should exist"); + assert_eq!(node.name(), format!("file{}.txt", i)); + } } } diff --git a/core/src/ops/indexing/ephemeral/index_cache.rs b/core/src/ops/indexing/ephemeral/index_cache.rs index 660bf8570..4716b7439 100644 --- a/core/src/ops/indexing/ephemeral/index_cache.rs +++ b/core/src/ops/indexing/ephemeral/index_cache.rs @@ -42,13 +42,13 @@ pub struct EphemeralIndexCache { impl EphemeralIndexCache { /// Create a new cache with an empty global index - pub fn new() -> Self { - Self { - index: Arc::new(TokioRwLock::new(EphemeralIndex::new())), + pub fn new() -> std::io::Result { + Ok(Self { + index: Arc::new(TokioRwLock::new(EphemeralIndex::new()?)), indexed_paths: RwLock::new(HashSet::new()), indexing_in_progress: RwLock::new(HashSet::new()), created_at: Instant::now(), - } + }) } /// Get the global index if the given path has been indexed @@ -199,7 +199,7 @@ impl EphemeralIndexCache { impl Default for EphemeralIndexCache { fn default() -> Self { - Self::new() + Self::new().expect("Failed to create default EphemeralIndexCache") } } @@ -231,7 +231,7 @@ mod tests { #[test] fn test_single_global_index() { - let cache = EphemeralIndexCache::new(); + let cache = EphemeralIndexCache::new().expect("failed to create cache"); // Initially no paths are indexed assert!(cache.is_empty()); @@ -240,7 +240,7 @@ mod tests { #[test] fn test_indexing_workflow() { - let cache = EphemeralIndexCache::new(); + let cache = EphemeralIndexCache::new().expect("failed to create cache"); let path = PathBuf::from("/test/path"); // Start indexing @@ -259,7 +259,7 @@ mod tests { #[test] fn test_shared_index_across_paths() { - let cache = EphemeralIndexCache::new(); + let cache = EphemeralIndexCache::new().expect("failed to create cache"); let path1 = PathBuf::from("/test/path1"); let path2 = PathBuf::from("/test/path2"); @@ -283,7 +283,7 @@ mod tests { #[test] fn test_invalidate_path() { - let cache = EphemeralIndexCache::new(); + let cache = EphemeralIndexCache::new().expect("failed to create cache"); let path = PathBuf::from("/test/path"); // Index the path @@ -301,7 +301,7 @@ mod tests { #[test] fn test_stats() { - let cache = EphemeralIndexCache::new(); + let cache = EphemeralIndexCache::new().expect("failed to create cache"); let path1 = PathBuf::from("/ready"); let path2 = PathBuf::from("/in_progress"); diff --git a/core/src/ops/indexing/job.rs b/core/src/ops/indexing/job.rs index a98797927..fab94fd67 100644 --- a/core/src/ops/indexing/job.rs +++ b/core/src/ops/indexing/job.rs @@ -218,16 +218,16 @@ impl std::fmt::Debug for EphemeralIndex { } impl EphemeralIndex { - pub fn new() -> Self { + pub fn new() -> std::io::Result { use super::ephemeral::{NameCache, NameRegistry, NodeArena}; let cache = std::sync::Arc::new(NameCache::new()); - let arena = NodeArena::new(); + let arena = NodeArena::new()?; let registry = NameRegistry::new(); let now = std::time::Instant::now(); - Self { + Ok(Self { arena, cache, registry, @@ -237,7 +237,7 @@ impl EphemeralIndex { created_at: now, last_accessed: now, stats: IndexerStats::default(), - } + }) } /// Ensures a directory exists, creating all missing ancestors recursively. @@ -246,21 +246,21 @@ impl EphemeralIndex { /// `add_entry()` without a separate tree-building pass. Parent directories /// are created from root to leaf, so the full ancestor chain exists before /// any child is added. 
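The shape of the hunk below, reduced to essentials: fallible recursion that materializes the ancestor chain root-to-leaf and lets ? unwind if any arena insert fails. Types here are illustrative stand-ins for the real arena and path index:

    use std::{
        collections::HashMap,
        io,
        path::{Path, PathBuf},
    };

    // Minimal model of a path-indexed arena with fallible, recursive ancestor creation.
    #[derive(Default)]
    struct Index {
        nodes: Vec<String>,
        path_index: HashMap<PathBuf, usize>,
    }

    impl Index {
        fn ensure_directory(&mut self, path: &Path) -> io::Result<usize> {
            if let Some(&id) = self.path_index.get(path) {
                return Ok(id);
            }
            // Recurse root-to-leaf so the whole ancestor chain exists before the child;
            // `?` unwinds cleanly if any insert fails (e.g. an mmap growth error).
            if let Some(parent) = path.parent().filter(|p| !p.as_os_str().is_empty()) {
                self.ensure_directory(parent)?;
            }
            let id = self.nodes.len();
            self.nodes.push(path.display().to_string()); // stand-in for arena.insert(node)?
            self.path_index.insert(path.to_path_buf(), id);
            Ok(id)
        }
    }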
- pub fn ensure_directory(&mut self, path: &Path) -> super::ephemeral::EntryId { + pub fn ensure_directory(&mut self, path: &Path) -> std::io::Result { use super::ephemeral::{ FileNode, FileType, MaybeEntryId, NameRef, NodeState, PackedMetadata, }; use super::state::EntryKind; if let Some(&id) = self.path_index.get(path) { - return id; + return Ok(id); } let parent_id = if let Some(parent_path) = path.parent() { if parent_path.as_os_str().is_empty() { None } else { - Some(self.ensure_directory(parent_path)) + Some(self.ensure_directory(parent_path)?) } } else { None @@ -279,7 +279,7 @@ impl EphemeralIndex { let meta = PackedMetadata::new(NodeState::Accessible, FileType::Directory, 0); let node = FileNode::new(NameRef::new(name, parent_ref), meta); - let id = self.arena.insert(node); + let id = self.arena.insert(node)?; // Add to parent's children if let Some(parent_id) = parent_id { @@ -294,13 +294,13 @@ impl EphemeralIndex { let uuid = uuid::Uuid::new_v4(); self.entry_uuids.insert(path.to_path_buf(), uuid); - id + Ok(id) } /// Adds an entry to the index, returning its content kind if successful. /// /// Content kind is identified by file extension (no I/O needed), which is - /// sufficient for ephemeral browsing where speed is critical. Returns None + /// sufficient for ephemeral browsing where speed is critical. Returns Ok(None) /// if the entry already exists (prevents duplicate entries when re-indexing /// a directory). pub fn add_entry( @@ -308,7 +308,7 @@ impl EphemeralIndex { path: PathBuf, uuid: Uuid, metadata: EntryMetadata, - ) -> Option { + ) -> std::io::Result> { use super::ephemeral::{ FileNode, FileType, MaybeEntryId, NameRef, NodeState, PackedMetadata, }; @@ -317,7 +317,7 @@ impl EphemeralIndex { if self.path_index.contains_key(&path) { tracing::trace!("Skipping duplicate entry: {}", path.display()); - return None; + return Ok(None); } // Ensure parent directories exist before adding this entry, building the ancestor @@ -329,7 +329,7 @@ impl EphemeralIndex { } else if let Some(&existing_id) = self.path_index.get(parent_path) { Some(existing_id) } else { - Some(self.ensure_directory(parent_path)) + Some(self.ensure_directory(parent_path)?) 
} } else { None @@ -352,7 +352,7 @@ impl EphemeralIndex { .unwrap_or(MaybeEntryId::NONE); let node = FileNode::new(NameRef::new(name, parent_ref), meta); - let id = self.arena.insert(node); + let id = self.arena.insert(node)?; // Add to parent's children if let Some(parent_id) = parent_id { @@ -376,7 +376,7 @@ impl EphemeralIndex { self.content_kinds.insert(path, content_kind); self.last_accessed = std::time::Instant::now(); - Some(content_kind) + Ok(Some(content_kind)) } pub fn get_entry(&mut self, path: &PathBuf) -> Option { @@ -615,7 +615,7 @@ impl EphemeralIndex { impl Default for EphemeralIndex { fn default() -> Self { - Self::new() + Self::new().expect("Failed to create default EphemeralIndex") } } @@ -960,7 +960,9 @@ impl JobHandler for IndexerJob { } if self.config.is_ephemeral() && self.ephemeral_index.is_none() { - self.ephemeral_index = Some(Arc::new(RwLock::new(EphemeralIndex::new()))); + let index = EphemeralIndex::new() + .map_err(|e| JobError::Other(format!("Failed to create ephemeral index: {}", e)))?; + self.ephemeral_index = Some(Arc::new(RwLock::new(index))); ctx.log("Initialized ephemeral index for non-persistent job"); } diff --git a/core/src/ops/indexing/persistence.rs b/core/src/ops/indexing/persistence.rs index 9a6b1ee0f..5ba7c106f 100644 --- a/core/src/ops/indexing/persistence.rs +++ b/core/src/ops/indexing/persistence.rs @@ -471,10 +471,15 @@ impl IndexPersistence for EphemeralPersistence { let entry_id = self.get_next_id().await; let entry_uuid = Uuid::new_v4(); - // add_entry returns Some(content_kind) if added, None if duplicate path. + // add_entry returns Ok(Some(content_kind)) if added, Ok(None) if duplicate path. let content_kind = { let mut index = self.index.write().await; - let result = index.add_entry(entry.path.clone(), entry_uuid, metadata.clone()); + let result = index + .add_entry(entry.path.clone(), entry_uuid, metadata.clone()) + .map_err(|e| { + tracing::error!("Failed to add entry to ephemeral index: {}", e); + e + })?; if result.is_some() { match entry.kind { @@ -604,7 +609,9 @@ mod tests { std::fs::write(&test_file, b"test content").unwrap(); // Create ephemeral index - let index = Arc::new(RwLock::new(EphemeralIndex::new())); + let index = Arc::new(RwLock::new( + EphemeralIndex::new().expect("failed to create ephemeral index"), + )); // Create event collector let collected_events = Arc::new(Mutex::new(Vec::new())); diff --git a/core/src/ops/indexing/responder.rs b/core/src/ops/indexing/responder.rs index 11006ad13..0cb53469d 100644 --- a/core/src/ops/indexing/responder.rs +++ b/core/src/ops/indexing/responder.rs @@ -390,9 +390,7 @@ async fn handle_create( debug!("Create: {}", path.display()); match path_exists_safe(path, backend).await { - Ok(true) => { - // Path exists and is accessible, proceed - } + Ok(true) => {} Ok(false) => { debug!("Path no longer exists, skipping create: {}", path.display()); return Ok(()); @@ -407,7 +405,6 @@ async fn handle_create( } } - // Check if path should be filtered if should_filter_path(path, rule_toggles, location_root, backend).await? 
{ debug!("✗ Skipping filtered path: {}", path.display()); return Ok(()); @@ -416,7 +413,6 @@ async fn handle_create( debug!("→ Processing create for: {}", path.display()); let dir_entry = build_dir_entry(path, backend).await?; - // Check if entry already exists at this exact path (race condition from duplicate watcher events) let location_root_entry_id = get_location_root_entry_id(ctx, location_id).await?; if let Some(existing_id) = resolve_entry_id_by_path_scoped(ctx, path, location_root_entry_id).await? @@ -426,7 +422,6 @@ async fn handle_create( path.display(), existing_id ); - // Treat as a modify instead return handle_modify( ctx, context, @@ -440,15 +435,12 @@ async fn handle_create( .await; } - // If inode matches an existing entry at another path, treat this as a move if handle_move_by_inode(ctx, path, dir_entry.inode, backend).await? { return Ok(()); } - // Minimal state provides parent cache used by EntryProcessor let mut state = IndexerState::new(&crate::domain::addressing::SdPath::local(path)); - // Seed ancestor directories into cache to prevent ghost folder bug if let Ok(Some(location_record)) = entities::location::Entity::find() .filter(entities::location::Column::Uuid.eq(location_id)) .one(ctx.library_db()) @@ -461,12 +453,11 @@ async fn handle_create( } } - // Try to create the entry, handling unique constraint violations with upsert let entry_id = match EntryProcessor::create_entry( &mut state, ctx, &dir_entry, - 0, // device_id not needed here + 0, path.parent().unwrap_or_else(|| Path::new("/")), ) .await @@ -476,24 +467,20 @@ async fn handle_create( id } Err(e) if is_unique_constraint_violation(&e) => { - // Entry was created concurrently by another event, update it instead debug!( "Unique constraint violation for {}, updating existing entry (race condition)", path.display() ); - // Find the existing entry that caused the constraint violation if let Some(existing_id) = resolve_entry_id_by_path_scoped(ctx, path, location_root_entry_id).await? { - // Update the existing entry with new metadata (including potentially new inode) EntryProcessor::update_entry(ctx, existing_id, &dir_entry).await?; debug!( "✓ Updated existing entry {} with new metadata (inode: {:?})", existing_id, dir_entry.inode ); - // Treat as modify for processor pipeline return handle_modify( ctx, context, @@ -506,7 +493,6 @@ async fn handle_create( ) .await; } else { - // Shouldn't happen - we got unique constraint but can't find the entry warn!( "Unique constraint violation but entry not found for path: {}", path.display() @@ -519,7 +505,6 @@ async fn handle_create( } }; - // Get the entry UUID for event emission let entry_uuid = match entities::entry::Entity::find_by_id(entry_id) .one(ctx.library_db()) .await? 
@@ -528,16 +513,13 @@ async fn handle_create( None => None, }; - // If this is a directory, spawn a recursive indexer job to index its contents if dir_entry.kind == super::state::EntryKind::Directory { debug!( "Created directory detected, spawning recursive indexer job for: {}", path.display() ); - // Get the library to access the job manager if let Some(library) = context.get_library(library_id).await { - // Query the location to get its index_mode policy let location_record = entities::location::Entity::find() .filter(entities::location::Column::Uuid.eq(location_id)) .one(ctx.library_db()) @@ -545,7 +527,6 @@ async fn handle_create( .ok() .flatten(); - // Determine index mode from location policy (default to Content if not found) let index_mode = if let Some(loc) = location_record { match loc.index_mode.as_str() { "shallow" => super::job::IndexMode::Shallow, @@ -557,15 +538,12 @@ async fn handle_create( super::job::IndexMode::Content }; - // Create a recursive indexer job for this directory subtree - // Use the location's index_mode to respect thumbnail/thumbstrip policies let indexer_job = super::job::IndexerJob::from_location( location_id, crate::domain::addressing::SdPath::local(path), index_mode, ); - // Dispatch the job asynchronously (fire and forget) if let Err(e) = library.jobs().dispatch(indexer_job).await { warn!( "Failed to spawn indexer job for directory {}: {}", @@ -581,18 +559,14 @@ async fn handle_create( } } } else { - // For files, run processors inline (single file processing) if let Some(library) = context.get_library(library_id).await { - // Load processor configuration for this location let proc_config = processor::load_location_processor_config(location_id, ctx.library_db()) .await .unwrap_or_default(); - // Build processor entry (with MIME type after content linking) let proc_entry = build_processor_entry(ctx, entry_id, path).await?; - // Run content hash processor first if proc_config .watcher_processors .iter() @@ -604,10 +578,8 @@ async fn handle_create( } } - // Reload processor entry to get updated content_id and MIME type let proc_entry = build_processor_entry(ctx, entry_id, path).await?; - // Run thumbnail processor if proc_config .watcher_processors .iter() @@ -711,7 +683,6 @@ async fn handle_create( } } - // Emit resource event for the created entry if let Some(uuid) = entry_uuid { debug!("→ Emitting resource event for entry {}", uuid); let resource_manager = @@ -730,7 +701,10 @@ async fn handle_create( Ok(()) } -/// Handle modify: resolve entry ID by path, then update +/// Updates an existing entry's metadata and re-runs processors for files. +/// +/// Detects inode-based moves before updating. For files, regenerates content hashes and +/// thumbnails in case the file contents changed. async fn handle_modify( ctx: &impl IndexingCtx, context: &Arc, @@ -743,11 +717,8 @@ async fn handle_modify( ) -> Result<()> { debug!("Modify: {}", path.display()); - // Verify path is accessible before processing match path_exists_safe(path, backend).await { - Ok(true) => { - // Path exists and is accessible, proceed - } + Ok(true) => {} Ok(false) => { debug!("Path no longer exists, skipping modify: {}", path.display()); return Ok(()); @@ -762,7 +733,6 @@ async fn handle_modify( } } - // Check if path should be filtered if should_filter_path(path, rule_toggles, location_root, backend).await? 
{ debug!("✗ Skipping filtered path: {}", path.display()); return Ok(()); @@ -770,10 +740,8 @@ async fn handle_modify( debug!("→ Processing modify for: {}", path.display()); - // Get location root entry ID for scoped queries let location_root_entry_id = get_location_root_entry_id(ctx, location_id).await?; - // If inode indicates a move, handle as a move and skip update let meta = EntryProcessor::extract_metadata(path, backend).await?; if handle_move_by_inode(ctx, path, meta.inode, backend).await? { return Ok(()); @@ -792,7 +760,6 @@ async fn handle_modify( EntryProcessor::update_entry(ctx, entry_id, &dir_entry).await?; debug!("✓ Updated entry {} for path: {}", entry_id, path.display()); - // Get entry UUID for event emission let entry_uuid = match entities::entry::Entity::find_by_id(entry_id) .one(ctx.library_db()) .await? @@ -801,19 +768,15 @@ async fn handle_modify( None => None, }; - // For files, run processors on the modified file if dir_entry.kind == super::state::EntryKind::File { if let Some(library) = context.get_library(library_id).await { - // Load processor configuration let proc_config = processor::load_location_processor_config(location_id, ctx.library_db()) .await .unwrap_or_default(); - // Build processor entry let proc_entry = build_processor_entry(ctx, entry_id, path).await?; - // Run content hash processor first if proc_config .watcher_processors .iter() @@ -825,10 +788,8 @@ async fn handle_modify( } } - // Reload processor entry to get updated content_id and MIME type let proc_entry = build_processor_entry(ctx, entry_id, path).await?; - // Run thumbnail processor if proc_config .watcher_processors .iter() @@ -872,7 +833,6 @@ async fn handle_modify( } } - // Emit resource event for the updated entry if let Some(uuid) = entry_uuid { debug!("→ Emitting resource event for modified entry {}", uuid); let resource_manager = @@ -896,7 +856,9 @@ async fn handle_modify( Ok(()) } -/// Handle remove: resolve entry ID and delete subtree (closure table + cache) +/// Deletes an entry and its entire subtree using closure table traversal. +/// +/// Creates tombstones for all deleted entries to sync the deletion across devices. async fn handle_remove( ctx: &impl IndexingCtx, context: &Arc, @@ -905,7 +867,6 @@ async fn handle_remove( ) -> Result<()> { debug!("Remove: {}", path.display()); - // Get location root entry ID for scoped queries let location_root_entry_id = get_location_root_entry_id(ctx, location_id).await?; if let Some(entry_id) = @@ -914,7 +875,6 @@ async fn handle_remove( debug!("→ Deleting entry {} for path: {}", entry_id, path.display()); delete_subtree(ctx, context, location_id, entry_id).await?; debug!("✓ Deleted entry {} for path: {}", entry_id, path.display()); - // Note: ResourceDeleted events are emitted by sync_models_batch in delete_subtree } else { debug!( "✗ Entry not found for path, skipping remove: {}", @@ -924,7 +884,10 @@ async fn handle_remove( Ok(()) } -/// Handle rename/move: resolve source entry and move via EntryProcessor +/// Moves an entry from one path to another, updating parent relationships and directory_paths. +/// +/// Checks if the destination is filtered (treats as deletion). Updates the entry's parent_id, +/// name, and extension, then recursively fixes descendant paths in directory_paths. 
async fn handle_rename( ctx: &impl IndexingCtx, context: &Arc, @@ -937,11 +900,8 @@ async fn handle_rename( ) -> Result<()> { debug!("Rename: {} -> {}", from.display(), to.display()); - // Verify destination path is accessible before processing match path_exists_safe(to, backend).await { - Ok(true) => { - // Destination exists and is accessible, proceed - } + Ok(true) => {} Ok(false) => { debug!( "Destination path doesn't exist, skipping rename: {}", @@ -959,17 +919,13 @@ async fn handle_rename( } } - // Get location root entry ID for scoped queries let location_root_entry_id = get_location_root_entry_id(ctx, location_id).await?; - // Check if the destination path should be filtered - // If the file is being moved to a filtered location, we should remove it from the database if should_filter_path(to, rule_toggles, location_root, backend).await? { debug!( "✗ Destination path is filtered, removing entry: {}", to.display() ); - // Treat this as a removal of the source file return handle_remove(ctx, context, location_id, from).await; } @@ -1023,7 +979,7 @@ async fn handle_rename( Ok(()) } -/// Build a DirEntry from current filesystem metadata +/// Extracts filesystem metadata into a DirEntry for database insertion. async fn build_dir_entry( path: &Path, backend: Option<&Arc>, @@ -1038,7 +994,7 @@ async fn build_dir_entry( }) } -/// Build a ProcessorEntry from database entry +/// Constructs a ProcessorEntry by querying the entry and resolving MIME type via content_identity. async fn build_processor_entry( ctx: &impl IndexingCtx, entry_id: i32, @@ -1051,7 +1007,6 @@ async fn build_processor_entry( .await? .ok_or_else(|| anyhow::anyhow!("Entry not found"))?; - // Get MIME type if content exists let mime_type = if let Some(content_id) = entry.content_id { if let Ok(Some(ci)) = entities::content_identity::Entity::find_by_id(content_id) .one(ctx.library_db()) @@ -1076,7 +1031,6 @@ async fn build_processor_entry( None }; - // Convert DB entry kind to domain EntryKind let kind = match entry.kind { 0 => super::state::EntryKind::File, 1 => super::state::EntryKind::Directory, @@ -1095,7 +1049,7 @@ async fn build_processor_entry( }) } -/// Resolve an entry ID by absolute path, scoped to location's entry tree +/// Resolves an entry ID by trying directory lookup first, then file lookup. async fn resolve_entry_id_by_path_scoped( ctx: &impl IndexingCtx, abs_path: &Path, @@ -1109,7 +1063,7 @@ async fn resolve_entry_id_by_path_scoped( resolve_file_entry_id_scoped(ctx, abs_path, location_root_entry_id).await } -/// Resolve a directory entry by path, scoped to location's entry tree using entry_closure +/// Queries directory_paths joined with entry_closure to find directories scoped to this location. async fn resolve_directory_entry_id_scoped( ctx: &impl IndexingCtx, abs_path: &Path, @@ -1119,8 +1073,6 @@ async fn resolve_directory_entry_id_scoped( let path_str = abs_path.to_string_lossy().to_string(); - // Query directory_paths and JOIN with entry_closure to scope by location - // This ensures we only find entries within THIS location's tree #[derive(Debug, FromQueryResult)] struct DirectoryEntryId { entry_id: i32, @@ -1143,7 +1095,7 @@ async fn resolve_directory_entry_id_scoped( Ok(result.map(|r| r.entry_id)) } -/// Resolve a file entry by parent directory path + file name, scoped to location's tree +/// Finds a file entry by resolving its parent directory, then matching name + extension. 
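The file lookup below hinges on how a path decomposes into the stored (parent directory, name, extension) triple; std::path pins that down, including the multi-dot case:

    use std::path::Path;

    // An entry stores its name without the final extension, and the extension separately,
    // so resolution is two steps: parent via directory_paths, then parent_id + name + ext.
    fn split_for_lookup(path: &Path) -> Option<(&Path, &str, Option<&str>)> {
        let parent = path.parent()?;
        let name = path.file_stem()?.to_str()?;
        let extension = path.extension().and_then(|e| e.to_str());
        Some((parent, name, extension))
    }

    fn main() {
        let (parent, name, ext) = split_for_lookup(Path::new("/photos/cat.tar.gz")).unwrap();
        assert_eq!(parent, Path::new("/photos"));
        assert_eq!(name, "cat.tar"); // only the final extension is split off
        assert_eq!(ext, Some("gz"));
    }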
async fn resolve_file_entry_id_scoped( ctx: &impl IndexingCtx, abs_path: &Path, @@ -1154,14 +1106,12 @@ async fn resolve_file_entry_id_scoped( None => return Ok(None), }; - // First resolve parent directory using scoped lookup let parent_id = match resolve_directory_entry_id_scoped(ctx, parent, location_root_entry_id).await? { Some(id) => id, None => return Ok(None), }; - // Now find the file entry by parent + name + extension let name = abs_path .file_stem() .and_then(|s| s.to_str()) @@ -1184,19 +1134,18 @@ async fn resolve_file_entry_id_scoped( Ok(model.map(|m| m.id)) } -/// Check if an error is a unique constraint violation +/// Detects SQLite unique constraint errors by checking error message strings. fn is_unique_constraint_violation(error: &crate::infra::job::error::JobError) -> bool { - // Check if the error contains SQLite unique constraint violation messages let error_msg = error.to_string().to_lowercase(); error_msg.contains("unique constraint") || error_msg.contains("unique index") || error_msg.contains("constraint failed") } -/// Best-effort deletion of an entry and its subtree (with tombstone creation) +/// Deletes an entry tree and creates tombstones for sync. /// -/// This variant is used for local deletions (watcher, indexer) and creates -/// a tombstone for the root entry UUID to sync the deletion to other devices. +/// Used by watcher and indexer to propagate deletions to other devices. Traverses using both +/// entry_closure and parent_id (fallback) to handle partially-corrupted closure tables. async fn delete_subtree( ctx: &impl IndexingCtx, context: &Arc, @@ -1205,7 +1154,6 @@ async fn delete_subtree( ) -> Result<()> { use sea_orm::{ColumnTrait, EntityTrait, QueryFilter}; - // Step 1: Collect all entry IDs in the subtree let mut to_delete_ids: Vec = vec![entry_id]; if let Ok(rows) = entities::entry_closure::Entity::find() .filter(entities::entry_closure::Column::AncestorId.eq(entry_id)) @@ -1215,7 +1163,6 @@ async fn delete_subtree( to_delete_ids.extend(rows.into_iter().map(|r| r.descendant_id)); } - // IMPORTANT: Also find descendants by parent_id recursively as a fallback let mut queue = vec![entry_id]; let mut visited = std::collections::HashSet::from([entry_id]); @@ -1244,10 +1191,8 @@ async fn delete_subtree( to_delete_ids.len() ); - // Step 2: Fetch all entry models that will be deleted let entries_to_delete = if !to_delete_ids.is_empty() { let mut all_entries = Vec::new(); - // Chunk to avoid SQLite variable limit for chunk in to_delete_ids.chunks(900) { let batch = entities::entry::Entity::find() .filter(entities::entry::Column::Id.is_in(chunk.to_vec())) @@ -1262,8 +1207,6 @@ async fn delete_subtree( if !entries_to_delete.is_empty() { if let Some(library) = context.get_library(location_id).await { - // Use sync_models_batch for proper sync and event handling - // This will create tombstones for all entries and emit ResourceDeleted events let _ = library .sync_models_batch( &entries_to_delete, @@ -1274,7 +1217,6 @@ async fn delete_subtree( } } - // Step 4: Now perform the actual database deletion let txn = ctx.library_db().begin().await?; if !to_delete_ids.is_empty() { @@ -1300,10 +1242,7 @@ async fn delete_subtree( Ok(()) } -/// Best-effort deletion of an entry and its subtree (without tombstone creation) -/// -/// This variant is used when applying deletion tombstones from sync to avoid -/// recursion. It performs the same deletions but does not create new tombstones. 
+/// Deletes an entry tree without creating tombstones (used when applying remote tombstones). pub async fn delete_subtree_internal( entry_id: i32, db: &sea_orm::DatabaseConnection, @@ -1316,12 +1255,11 @@ pub async fn delete_subtree_internal( Ok(()) } -/// Helper to delete subtree without transaction management (for use within existing transactions) +/// Deletes a subtree within an existing transaction (no transaction management). async fn delete_subtree_no_txn(entry_id: i32, db: &C) -> Result<(), sea_orm::DbErr> where C: sea_orm::ConnectionTrait, { - // Find all descendants let mut to_delete_ids: Vec = vec![entry_id]; if let Ok(rows) = entities::entry_closure::Entity::find() .filter(entities::entry_closure::Column::AncestorId.eq(entry_id)) @@ -1333,7 +1271,6 @@ where to_delete_ids.sort_unstable(); to_delete_ids.dedup(); - // Delete entries and related data if !to_delete_ids.is_empty() { let _ = entities::entry_closure::Entity::delete_many() .filter(entities::entry_closure::Column::DescendantId.is_in(to_delete_ids.clone())) @@ -1356,8 +1293,10 @@ where Ok(()) } -/// Inode-aware move detection: if an existing entry has the same inode but a different path, -/// treat the change as a move and update the database accordingly. +/// Detects moves by matching inodes: if an entry exists with the same inode at a different path, treats as a move. +/// +/// Prevents duplicate entries when files are moved instead of deleted+created. Falls back to update +/// if the inode matches but the path is the same (macOS FSEvents quirk). async fn handle_move_by_inode( ctx: &impl IndexingCtx, new_path: &Path, @@ -1379,7 +1318,6 @@ async fn handle_move_by_inode( .one(ctx.library_db()) .await? { - // Resolve old full path let old_path = PathResolver::get_full_path(ctx.library_db(), existing.id) .await .unwrap_or_else(|_| std::path::PathBuf::from(&existing.name)); @@ -1394,7 +1332,6 @@ async fn handle_move_by_inode( ); if old_path != new_path { - // File was moved to a different path debug!( "✓ Detected inode-based move: {} → {}", old_path.display(), @@ -1413,8 +1350,6 @@ async fn handle_move_by_inode( debug!("✓ Completed inode-based move for entry {}", existing.id); return Ok(true); } else { - // Same path, same inode - this is a modification (macOS FSEvents reports as Create) - // Update the existing entry instead of creating a duplicate debug!( "Entry already exists at path with same inode {}, updating instead of creating: {}", inode_val, diff --git a/core/src/ops/indexing/verify/action.rs b/core/src/ops/indexing/verify/action.rs index 130f754f3..752151696 100644 --- a/core/src/ops/indexing/verify/action.rs +++ b/core/src/ops/indexing/verify/action.rs @@ -104,7 +104,10 @@ impl IndexVerifyAction { tracing::debug!("Running ephemeral indexer job on {}", path.display()); // Create ephemeral index storage that we'll share with the job - let ephemeral_index = Arc::new(RwLock::new(EphemeralIndex::new())); + let ephemeral_index = Arc::new(RwLock::new( + EphemeralIndex::new() + .map_err(|e| ActionError::from(std::io::Error::new(std::io::ErrorKind::Other, e)))?, + )); // Subscribe to job events before dispatching let mut event_subscriber = context.events.subscribe(); From 3739b3f34ff08e70ef79d9aba4f533204a8abd49 Mon Sep 17 00:00:00 2001 From: Jamie Pine Date: Mon, 8 Dec 2025 00:38:04 -0800 Subject: [PATCH 11/20] Enhance ephemeral indexing and add filesystem watching support - Updated `EphemeralIndex` to preserve explicitly browsed subdirectories during re-indexing, preventing loss of user navigation context. 
- Modified `clear_directory_children` to return the count of cleared entries and a list of deleted browsed directories. - Introduced `EphemeralIndexCache` enhancements to support filesystem watching, allowing paths to be monitored for changes. - Added methods for registering, unregistering, and checking watched paths, improving the responsiveness of the indexing system. - Updated documentation and tests to reflect new functionality and ensure reliability. --- .../src/ops/indexing/ephemeral/index_cache.rs | 18 +- core/src/ops/indexing/job.rs | 98 +++++-- docs/workbench | 2 +- .../Explorer/components/AddStorageModal.tsx | 169 +++++++---- .../Explorer/components/PathBar.tsx | 263 ++++++++++-------- 5 files changed, 354 insertions(+), 196 deletions(-) diff --git a/core/src/ops/indexing/ephemeral/index_cache.rs b/core/src/ops/indexing/ephemeral/index_cache.rs index 4716b7439..93fe98dcd 100644 --- a/core/src/ops/indexing/ephemeral/index_cache.rs +++ b/core/src/ops/indexing/ephemeral/index_cache.rs @@ -100,11 +100,23 @@ impl EphemeralIndexCache { /// Clear stale entries for a path before re-indexing (async version) /// - /// Call this after create_for_indexing to remove old children entries. - /// This prevents ghost entries when files are deleted between index runs. + /// Removes files and unbrowsed subdirectories, preserving subdirectories + /// that were explicitly navigated to. Verifies preserved directories still + /// exist on the filesystem and removes deleted ones from tracking. pub async fn clear_for_reindex(&self, path: &Path) -> usize { + let indexed = self.indexed_paths.read().clone(); let mut index = self.index.write().await; - index.clear_directory_children(path) + let (cleared, deleted_browsed_dirs) = index.clear_directory_children(path, &indexed); + + // Remove deleted browsed directories from indexed_paths + if !deleted_browsed_dirs.is_empty() { + let mut indexed_paths = self.indexed_paths.write(); + for deleted_path in deleted_browsed_dirs { + indexed_paths.remove(&deleted_path); + } + } + + cleared } /// Mark indexing as complete for a path diff --git a/core/src/ops/indexing/job.rs b/core/src/ops/indexing/job.rs index fab94fd67..ee301beac 100644 --- a/core/src/ops/indexing/job.rs +++ b/core/src/ops/indexing/job.rs @@ -451,44 +451,86 @@ impl EphemeralIndex { ) } - /// Clears immediate children of a directory to prepare for re-indexing. + /// Clears entries before re-indexing, preserving explicitly browsed subdirectories. /// - /// This prevents ghost entries when files are deleted between index runs. - /// The arena nodes become orphaned but remain allocated, which is acceptable - /// for ephemeral indexes since memory pressure triggers full eviction anyway. - /// Only clears the direct children (non-recursive). - pub fn clear_directory_children(&mut self, dir_path: &Path) -> usize { - let children_paths: Vec = if let Some(dir_id) = self.path_index.get(dir_path) { - if let Some(dir_node) = self.arena.get(*dir_id) { - dir_node - .children - .iter() - .filter_map(|&child_id| self.reconstruct_path(child_id)) - .collect() - } else { - return 0; - } - } else { - return 0; + /// Since ephemeral indexing is shallow, subdirectories that were explicitly + /// navigated to (in `indexed_paths`) should be preserved as separate index + /// branches. Unbrowsed subdirectories are refreshed with the parent. + /// + /// Returns (cleared_count, deleted_browsed_dirs) where deleted_browsed_dirs + /// contains paths that were in indexed_paths but no longer exist on disk. 
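+	///
+	/// A minimal caller sketch (illustrative; mirrors `clear_for_reindex` above):
+	///
+	/// ```rust,ignore
+	/// let (cleared, deleted_browsed) = index.clear_directory_children(&dir, &indexed);
+	/// for stale in deleted_browsed {
+	///     indexed_paths.remove(&stale); // forget browsed dirs that vanished on disk
+	/// }
+	/// ```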
+ pub fn clear_directory_children( + &mut self, + dir_path: &Path, + indexed_paths: &std::collections::HashSet, + ) -> (usize, Vec) { + let dir_id = match self.path_index.get(dir_path) { + Some(&id) => id, + None => return (0, Vec::new()), }; - let mut cleared = 0; + let dir_node = match self.arena.get(dir_id) { + Some(node) => node, + None => return (0, Vec::new()), + }; - for child_path in &children_paths { - if self.path_index.remove(child_path).is_some() { - cleared += 1; - } + let mut deleted_browsed_dirs = Vec::new(); + + // Collect children to remove + let mut children_to_remove: Vec<(PathBuf, super::ephemeral::EntryId)> = dir_node + .children + .iter() + .filter_map(|&child_id| { + let child_node = self.arena.get(child_id)?; + let child_path = self.reconstruct_path(child_id)?; + + // Preserve subdirectories that were explicitly browsed AND still exist + if child_node.is_directory() && indexed_paths.contains(&child_path) { + // Verify the directory still exists on the filesystem + if std::fs::metadata(&child_path).is_ok() { + return None; // Preserve - still exists and was browsed + } + // Directory was deleted - track for removal from indexed_paths + tracing::debug!( + "Removing deleted browsed directory: {}", + child_path.display() + ); + deleted_browsed_dirs.push(child_path.clone()); + } + + // Remove everything else (files, unbrowsed directories, deleted directories) + Some((child_path, child_id)) + }) + .collect(); + + let cleared = children_to_remove.len(); + + // Remove from indexes + for (child_path, _) in &children_to_remove { + self.path_index.remove(child_path); self.entry_uuids.remove(child_path); self.content_kinds.remove(child_path); } - if let Some(dir_id) = self.path_index.get(dir_path) { - if let Some(dir_node) = self.arena.get_mut(*dir_id) { - dir_node.children.clear(); - } + // Update parent's children list + if let Some(dir_node) = self.arena.get_mut(dir_id) { + let removed_ids: std::collections::HashSet<_> = + children_to_remove.iter().map(|(_, id)| id).collect(); + + dir_node + .children + .retain(|child_id| !removed_ids.contains(child_id)); } - cleared + if cleared > 0 { + tracing::debug!( + "Cleared {} entries from {} (preserved browsed subdirs)", + cleared, + dir_path.display() + ); + } + + (cleared, deleted_browsed_dirs) } fn reconstruct_path(&self, id: super::ephemeral::EntryId) -> Option { diff --git a/docs/workbench b/docs/workbench index cab1f9e49..351a8415f 160000 --- a/docs/workbench +++ b/docs/workbench @@ -1 +1 @@ -Subproject commit cab1f9e49e81f8622f2c77f8c1162f7cbd2b1b1d +Subproject commit 351a8415f43a8396c2c3370a09f72ebb2b36cd05 diff --git a/packages/interface/src/components/Explorer/components/AddStorageModal.tsx b/packages/interface/src/components/Explorer/components/AddStorageModal.tsx index bcb5dbc2b..791f721c8 100644 --- a/packages/interface/src/components/Explorer/components/AddStorageModal.tsx +++ b/packages/interface/src/components/Explorer/components/AddStorageModal.tsx @@ -349,22 +349,34 @@ const jobOptions: JobOption[] = [ export function useAddStorageDialog( onStorageAdded?: (id: string) => void, + initialPath?: string, ) { return dialogManager.create((props) => ( - + )); } function AddStorageDialog(props: { id: number; onStorageAdded?: (id: string) => void; + initialPath?: string; }) { const dialog = useDialog(props); const platform = usePlatform(); - const [step, setStep] = useState("category"); + // Derive initial folder name from path + const initialFolderName = + props.initialPath?.split("/").filter(Boolean).pop() || ""; + + 
const [step, setStep] = useState( + props.initialPath ? "local-config" : "category", + ); const [selectedCategory, setSelectedCategory] = - useState(null); + useState(props.initialPath ? "local" : null); const [selectedProvider, setSelectedProvider] = useState(null); const [tab, setTab] = useState("preset"); @@ -385,8 +397,8 @@ function AddStorageDialog(props: { const localForm = useForm({ defaultValues: { - path: "", - name: "", + path: props.initialPath || "", + name: initialFolderName, mode: "Deep", }, }); @@ -404,7 +416,9 @@ function AddStorageDialog(props: { const currentMode = localForm.watch("mode"); const [selectedJobs, setSelectedJobs] = useState>( new Set( - jobOptions.filter((j) => j.presets.includes("Deep")).map((j) => j.id), + jobOptions + .filter((j) => j.presets.includes("Deep")) + .map((j) => j.id), ), ); @@ -539,7 +553,9 @@ function AddStorageDialog(props: { localForm.setError("root", { type: "manual", message: - error instanceof Error ? error.message : "Failed to add location", + error instanceof Error + ? error.message + : "Failed to add location", }); } }); @@ -692,7 +708,11 @@ function AddStorageDialog(props: { "border-app-line bg-app-box hover:bg-app-hover hover:border-accent/50", )} > - +
{category.label} @@ -733,7 +753,11 @@ function AddStorageDialog(props: { "border-app-line bg-app-box hover:bg-app-hover hover:border-accent/50", )} > - +
{provider.name}
@@ -761,8 +785,9 @@ function AddStorageDialog(props: {
Coming Soon

- Network protocol support (SMB, NFS, SFTP, WebDAV) is currently in - development. Check back in a future update! + Network protocol support (SMB, NFS, SFTP, WebDAV) is + currently in development. Check back in a future + update!

@@ -776,7 +801,11 @@ function AddStorageDialog(props: { "border-app-line bg-app-box", )} > - +
{protocol.name} @@ -820,17 +849,27 @@ function AddStorageDialog(props: { "border-app-line bg-app-box hover:bg-app-hover hover:border-accent/50", )} > - +
{volume.name}
- {volume.mount_point} • {volume.filesystem} + {volume.mount_point} •{" "} + {volume.filesystem}
- {volume.total_capacity ? (volume.total_capacity / 1e9).toFixed(0) : '?'} GB + {volume.total_capacity + ? ( + volume.total_capacity / 1e9 + ).toFixed(0) + : "?"}{" "} + GB
))} @@ -838,8 +877,8 @@ function AddStorageDialog(props: { ) : (

- No untracked external drives found. Connect a drive and refresh - to see it here. + No untracked external drives found. Connect a + drive and refresh to see it here.

)} @@ -867,7 +906,9 @@ function AddStorageDialog(props: {
localForm.setValue("path", e.target.value)} + onChange={(e) => + localForm.setValue("path", e.target.value) + } placeholder="Select a custom folder" size="lg" className="pr-14" @@ -880,34 +921,40 @@ function AddStorageDialog(props: {
- {suggestedLocations && suggestedLocations.locations.length > 0 && ( -
- -
- {suggestedLocations.locations.map((loc) => ( -
- - ))} + + ))} +
-
- )} + )}
); @@ -939,11 +986,16 @@ function AddStorageDialog(props: { />
- setTab(v as SettingsTab)}> + setTab(v as SettingsTab)} + > Preset - Jobs {selectedJobs.size > 0 && `(${selectedJobs.size})`} + Jobs{" "} + {selectedJobs.size > 0 && + `(${selectedJobs.size})`} @@ -952,12 +1004,15 @@ function AddStorageDialog(props: {
{indexModes.map((mode) => { - const isSelected = currentMode === mode.value; + const isSelected = + currentMode === mode.value; return ( + )} + + + + ) : ( + +
+
+ Path is outside any location +
+
-
- - - - - - {!isIndexed && ( - - )} - - + + )} - ); } @@ -257,71 +302,69 @@ export function PathBar({ path, devices, onNavigate }: PathBarProps) { "focus-within:bg-sidebar-box/30 focus-within:border-sidebar-line/40", )} > - Device + Device - {showUri ? ( - - ) : isExpanded ? ( -
- {segments.map((segment, index) => { - const isLast = index === segments.length - 1; - return ( -
- - {!isLast && ( - - )} -
- ); - })} -
- ) : ( - - )} + + {!isLast && } +
+ ); + })} +
+ ) : ( + + )}
From 6bdc9a70557be56cb1ed1e4a90bbe79f6b9386db Mon Sep 17 00:00:00 2001 From: Jamie Pine Date: Mon, 8 Dec 2025 00:38:17 -0800 Subject: [PATCH 12/20] Implement unified change handling for indexing with filesystem watching support - Introduced a new `handler.rs` module to manage filesystem change events for both persistent and ephemeral indexing. - Added a trait-based `ChangeHandler` interface to abstract operations for different storage backends. - Enhanced `EphemeralIndexCache` to support filesystem watching, allowing paths to be monitored for changes. - Implemented methods for registering and unregistering watched paths, improving responsiveness to filesystem events. - Updated the `LocationWatcher` to handle ephemeral watches and process events accordingly. - Added tests and documentation to ensure reliability and clarity of the new functionality. --- .cspell/project_words.txt | 2 + .../src/ops/indexing/ephemeral/index_cache.rs | 136 +- core/src/ops/indexing/ephemeral/mod.rs | 1 + core/src/ops/indexing/ephemeral/responder.rs | 124 ++ core/src/ops/indexing/handler.rs | 1447 +++++++++++++++++ core/src/ops/indexing/job.rs | 41 + core/src/ops/indexing/mod.rs | 5 + core/src/service/watcher/mod.rs | 196 +++ 8 files changed, 1951 insertions(+), 1 deletion(-) create mode 100644 core/src/ops/indexing/ephemeral/responder.rs create mode 100644 core/src/ops/indexing/handler.rs diff --git a/.cspell/project_words.txt b/.cspell/project_words.txt index 3e9dcae85..466ca6def 100644 --- a/.cspell/project_words.txt +++ b/.cspell/project_words.txt @@ -38,6 +38,7 @@ lütke marietti mbps mehrzad +memmap Mjpeg Mmap mpscrr @@ -77,6 +78,7 @@ tobiaslutke tokio tombstoned typecheck +Uninit unwatch uuid vdfs diff --git a/core/src/ops/indexing/ephemeral/index_cache.rs b/core/src/ops/indexing/ephemeral/index_cache.rs index 93fe98dcd..57d558d65 100644 --- a/core/src/ops/indexing/ephemeral/index_cache.rs +++ b/core/src/ops/indexing/ephemeral/index_cache.rs @@ -11,6 +11,12 @@ //! //! The cache tracks which paths have been indexed (ready) vs are currently //! being indexed (in progress). +//! +//! ## File Watching Support +//! +//! The cache can optionally track which paths should be monitored for filesystem +//! changes. When a path is marked for watching, the watcher service can detect +//! changes and update the ephemeral index via `EphemeralChangeHandler`. use crate::ops::indexing::EphemeralIndex; use parking_lot::RwLock; @@ -36,6 +42,9 @@ pub struct EphemeralIndexCache { /// Paths currently being indexed indexing_in_progress: RwLock>, + /// Paths registered for filesystem watching (subset of indexed_paths) + watched_paths: RwLock>, + /// When the cache was created created_at: Instant, } @@ -47,6 +56,7 @@ impl EphemeralIndexCache { index: Arc::new(TokioRwLock::new(EphemeralIndex::new()?)), indexed_paths: RwLock::new(HashSet::new()), indexing_in_progress: RwLock::new(HashSet::new()), + watched_paths: RwLock::new(HashSet::new()), created_at: Instant::now(), }) } @@ -159,14 +169,92 @@ impl EphemeralIndexCache { self.indexing_in_progress.read().iter().cloned().collect() } + // ======================================================================== + // File Watching Support + // ======================================================================== + + /// Register a path for filesystem watching. + /// + /// When registered, the watcher service will monitor this path for changes + /// and update the ephemeral index via `EphemeralChangeHandler`. The path + /// must already be indexed. 
+ pub fn register_for_watching(&self, path: PathBuf) -> bool { + let indexed = self.indexed_paths.read(); + if !indexed.contains(&path) { + return false; + } + drop(indexed); + + let mut watched = self.watched_paths.write(); + watched.insert(path); + true + } + + /// Unregister a path from filesystem watching. + pub fn unregister_from_watching(&self, path: &Path) { + let mut watched = self.watched_paths.write(); + watched.remove(path); + } + + /// Check if a path is registered for watching. + pub fn is_watched(&self, path: &Path) -> bool { + self.watched_paths.read().contains(path) + } + + /// Get all watched paths. + pub fn watched_paths(&self) -> Vec { + self.watched_paths.read().iter().cloned().collect() + } + + /// Find the watched root path that contains the given path. + /// + /// If the given path is under a watched directory, returns that directory. + /// Used by the watcher to route events to the ephemeral handler. + pub fn find_watched_root(&self, path: &Path) -> Option { + let watched = self.watched_paths.read(); + + // Find the longest matching watched path that is an ancestor of `path` + let mut best_match: Option<&PathBuf> = None; + let mut best_len = 0; + + for watched_path in watched.iter() { + if path.starts_with(watched_path) { + let len = watched_path.as_os_str().len(); + if len > best_len { + best_len = len; + best_match = Some(watched_path); + } + } + } + + best_match.cloned() + } + + /// Check if any path in an event batch is under an ephemeral watched path. + /// + /// Returns the watched root if found. + pub fn find_watched_root_for_any<'a, I>(&self, paths: I) -> Option + where + I: IntoIterator, + { + for path in paths { + if let Some(root) = self.find_watched_root(path) { + return Some(root); + } + } + None + } + /// Get cache statistics pub fn stats(&self) -> EphemeralIndexCacheStats { let indexed = self.indexed_paths.read(); let in_progress = self.indexing_in_progress.read(); + let watched = self.watched_paths.read(); EphemeralIndexCacheStats { indexed_paths: indexed.len(), indexing_in_progress: in_progress.len(), + watched_paths: watched.len(), } } @@ -222,7 +310,8 @@ pub struct EphemeralIndexCacheStats { pub indexed_paths: usize, /// Number of paths currently being indexed pub indexing_in_progress: usize, - // Legacy field names for compatibility + /// Number of paths registered for filesystem watching + pub watched_paths: usize, } impl EphemeralIndexCacheStats { @@ -328,4 +417,49 @@ mod tests { assert_eq!(stats.indexed_paths, 1); assert_eq!(stats.indexing_in_progress, 1); } + + #[test] + fn test_watch_registration() { + let cache = EphemeralIndexCache::new().expect("failed to create cache"); + let path = PathBuf::from("/test/watched"); + + // Can't watch a path that's not indexed + assert!(!cache.register_for_watching(path.clone())); + assert!(!cache.is_watched(&path)); + + // Index the path first + let _index = cache.create_for_indexing(path.clone()); + cache.mark_indexing_complete(&path); + + // Now we can register for watching + assert!(cache.register_for_watching(path.clone())); + assert!(cache.is_watched(&path)); + + // Stats should reflect watched path + let stats = cache.stats(); + assert_eq!(stats.watched_paths, 1); + + // Unregister + cache.unregister_from_watching(&path); + assert!(!cache.is_watched(&path)); + } + + #[test] + fn test_find_watched_root() { + let cache = EphemeralIndexCache::new().expect("failed to create cache"); + + let root = PathBuf::from("/mnt/nas"); + let child = PathBuf::from("/mnt/nas/documents/report.pdf"); + + // Index and 
watch the root
+		let _index = cache.create_for_indexing(root.clone());
+		cache.mark_indexing_complete(&root);
+		cache.register_for_watching(root.clone());
+
+		// Child path should find the watched root
+		assert_eq!(cache.find_watched_root(&child), Some(root.clone()));
+
+		// Unrelated path should not find a root
+		assert_eq!(cache.find_watched_root(Path::new("/other/path")), None);
+	}
 }
diff --git a/core/src/ops/indexing/ephemeral/mod.rs b/core/src/ops/indexing/ephemeral/mod.rs
index 147e0082f..02a5f3553 100644
--- a/core/src/ops/indexing/ephemeral/mod.rs
+++ b/core/src/ops/indexing/ephemeral/mod.rs
@@ -41,6 +41,7 @@ pub mod arena;
 pub mod cache;
 pub mod index_cache;
 pub mod registry;
+pub mod responder;
 pub mod types;
 
 // Re-export public types
diff --git a/core/src/ops/indexing/ephemeral/responder.rs b/core/src/ops/indexing/ephemeral/responder.rs
new file mode 100644
index 000000000..5ebbc0feb
--- /dev/null
+++ b/core/src/ops/indexing/ephemeral/responder.rs
@@ -0,0 +1,124 @@
+//! Ephemeral responder for updating in-memory indexes on filesystem changes.
+//!
+//! This module processes filesystem events against the ephemeral index cache.
+//! When a user is browsing an ephemeral directory (external drive, network share)
+//! and files change, the responder updates the in-memory index to reflect changes.
+//!
+//! ## Usage
+//!
+//! ```rust,ignore
+//! use sd_core::ops::indexing::ephemeral::responder;
+//!
+//! // Check if an event should be handled by the ephemeral system
+//! if let Some(root) = responder::find_ephemeral_root(&path, &context) {
+//!     responder::apply(&context, &root, event, rule_toggles).await?;
+//! }
+//! ```
+
+use crate::context::CoreContext;
+use crate::infra::event::FsRawEventKind;
+use crate::ops::indexing::handler::{self, ChangeConfig, EphemeralChangeHandler};
+use crate::ops::indexing::rules::RuleToggles;
+use anyhow::Result;
+use std::path::{Path, PathBuf};
+use std::sync::Arc;
+
+/// Check if a path falls under an ephemeral watched directory.
+///
+/// Returns the watched root path if found.
+pub fn find_ephemeral_root(path: &Path, context: &CoreContext) -> Option<PathBuf> {
+	context.ephemeral_cache().find_watched_root(path)
+}
+
+/// Check if any path in a batch of events falls under an ephemeral watched directory.
+pub fn find_ephemeral_root_for_events(
+	events: &[FsRawEventKind],
+	context: &CoreContext,
+) -> Option<PathBuf> {
+	let paths: Vec<&Path> = events
+		.iter()
+		.flat_map(|e| match e {
+			FsRawEventKind::Create { path } => vec![path.as_path()],
+			FsRawEventKind::Modify { path } => vec![path.as_path()],
+			FsRawEventKind::Remove { path } => vec![path.as_path()],
+			FsRawEventKind::Rename { from, to } => vec![from.as_path(), to.as_path()],
+		})
+		.collect();
+
+	context
+		.ephemeral_cache()
+		.find_watched_root_for_any(paths.into_iter())
+}
+
+/// Process a batch of filesystem events against the ephemeral index.
+///
+/// Creates an `EphemeralChangeHandler` and processes the events using shared
+/// handler logic. The ephemeral index is updated in-place and ResourceChanged
+/// events are emitted for UI updates.
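+///
+/// A minimal caller sketch (illustrative; assumes `root` and `rule_toggles`
+/// are already in scope):
+///
+/// ```rust,ignore
+/// let events = vec![FsRawEventKind::Modify { path: root.join("report.pdf") }];
+/// responder::apply_batch(&context, &root, events, rule_toggles).await?;
+/// ```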
+pub async fn apply_batch( + context: &Arc, + root_path: &Path, + events: Vec, + rule_toggles: RuleToggles, +) -> Result<()> { + if events.is_empty() { + return Ok(()); + } + + let index = context.ephemeral_cache().get_global_index(); + let event_bus = context.events.clone(); + + let mut handler = EphemeralChangeHandler::new(index, event_bus, root_path.to_path_buf()); + + let config = ChangeConfig { + rule_toggles, + location_root: root_path, + volume_backend: None, // Ephemeral paths typically don't use volume backends + }; + + handler::apply_batch(&mut handler, events, &config).await +} + +/// Process a single filesystem event against the ephemeral index. +pub async fn apply( + context: &Arc, + root_path: &Path, + event: FsRawEventKind, + rule_toggles: RuleToggles, +) -> Result<()> { + apply_batch(context, root_path, vec![event], rule_toggles).await +} + +/// Register an ephemeral path for filesystem watching. +/// +/// After calling this, filesystem events under the path will be detectable +/// via `find_ephemeral_root`. The path must already be indexed in the +/// ephemeral cache. +/// +/// Returns true if registration succeeded, false if the path is not indexed. +pub fn register_for_watching(context: &CoreContext, path: PathBuf) -> bool { + context.ephemeral_cache().register_for_watching(path) +} + +/// Unregister an ephemeral path from filesystem watching. +pub fn unregister_from_watching(context: &CoreContext, path: &Path) { + context.ephemeral_cache().unregister_from_watching(path) +} + +/// Check if any ephemeral paths are being watched. +pub fn has_watched_paths(context: &CoreContext) -> bool { + !context.ephemeral_cache().watched_paths().is_empty() +} + +/// Get all currently watched ephemeral paths. +pub fn watched_paths(context: &CoreContext) -> Vec { + context.ephemeral_cache().watched_paths() +} + +#[cfg(test)] +mod tests { + use super::*; + + // Integration tests would require a full CoreContext setup + // Unit tests for the helper functions are covered by index_cache tests +} diff --git a/core/src/ops/indexing/handler.rs b/core/src/ops/indexing/handler.rs new file mode 100644 index 000000000..896e1d653 --- /dev/null +++ b/core/src/ops/indexing/handler.rs @@ -0,0 +1,1447 @@ +//! Unified change handling for persistent and ephemeral indexing. +//! +//! This module provides a trait-based abstraction for filesystem change handling, +//! allowing the same logic to work with both database-backed (persistent) and +//! memory-backed (ephemeral) storage. The watcher and responder use these handlers +//! to process Create/Modify/Remove/Rename events consistently. +//! +//! ## Architecture +//! +//! ```text +//! FsRawEventKind +//! │ +//! ▼ +//! ┌─────────────────────────────────────────────┐ +//! │ apply_change (shared logic) │ +//! │ - path validation │ +//! │ - rule filtering │ +//! │ - metadata extraction │ +//! │ - inode-based move detection │ +//! └──────────────────┬──────────────────────────┘ +//! │ +//! ┌─────────┴─────────┐ +//! ▼ ▼ +//! ┌───────────────┐ ┌───────────────┐ +//! │ Persistent │ │ Ephemeral │ +//! │ ChangeHandler │ │ ChangeHandler │ +//! │ (database) │ │ (in-memory) │ +//! └───────────────┘ └───────────────┘ +//! ``` + +use super::rules::{build_default_ruler, RuleToggles, RulerDecision}; +use super::state::{DirEntry, EntryKind}; +use anyhow::Result; +use std::path::{Path, PathBuf}; +use std::sync::Arc; +use uuid::Uuid; + +/// Reference to an entry in either persistent or ephemeral storage. 
+///
+/// Provides a uniform way to refer to entries regardless of storage backend.
+/// Persistent entries have database IDs; ephemeral entries have arena indices.
+#[derive(Debug, Clone)]
+pub struct EntryRef {
+	/// For persistent: database entry ID. For ephemeral: synthetic ID.
+	pub id: i32,
+	/// UUID for sync and event emission.
+	pub uuid: Option<Uuid>,
+	/// Full filesystem path.
+	pub path: PathBuf,
+	/// Entry kind (file/directory/symlink).
+	pub kind: EntryKind,
+}
+
+impl EntryRef {
+	pub fn is_directory(&self) -> bool {
+		self.kind == EntryKind::Directory
+	}
+}
+
+/// Type of change for event emission.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum ChangeType {
+	Created,
+	Modified,
+	Moved,
+	Deleted,
+}
+
+/// Configuration for change handling operations.
+pub struct ChangeConfig<'a> {
+	pub rule_toggles: RuleToggles,
+	pub location_root: &'a Path,
+	pub volume_backend: Option<&'a Arc>,
+}
+
+/// Abstracts storage operations for filesystem change handling.
+///
+/// Both persistent (database) and ephemeral (in-memory) handlers implement
+/// this trait, allowing the same change processing logic to work with both
+/// storage backends. The trait methods map to CRUD operations plus event
+/// emission and processor execution.
+#[async_trait::async_trait]
+pub trait ChangeHandler: Send + Sync {
+	/// Find an entry by its full filesystem path.
+	async fn find_by_path(&self, path: &Path) -> Result<Option<EntryRef>>;
+
+	/// Find an entry by inode (for move detection).
+	/// Returns None if inode tracking is not supported or no match found.
+	async fn find_by_inode(&self, inode: u64) -> Result<Option<EntryRef>>;
+
+	/// Create a new entry from filesystem metadata.
+	async fn create(&mut self, metadata: &DirEntry, parent_path: &Path) -> Result<EntryRef>;
+
+	/// Update an existing entry's metadata.
+	async fn update(&mut self, entry: &EntryRef, metadata: &DirEntry) -> Result<()>;
+
+	/// Move an entry from old path to new path.
+	async fn move_entry(
+		&mut self,
+		entry: &EntryRef,
+		old_path: &Path,
+		new_path: &Path,
+		new_parent_path: &Path,
+	) -> Result<()>;
+
+	/// Delete an entry and all its descendants.
+	async fn delete(&mut self, entry: &EntryRef) -> Result<()>;
+
+	/// Run post-create/modify processors (thumbnails, content hash).
+	/// No-op for ephemeral handlers.
+	async fn run_processors(&self, entry: &EntryRef, is_new: bool) -> Result<()>;
+
+	/// Emit appropriate events for UI updates.
+	async fn emit_change_event(&self, entry: &EntryRef, change_type: ChangeType) -> Result<()>;
+
+	/// Handle directory recursion after creation.
+	/// Persistent: spawns indexer job. Ephemeral: inline shallow index.
+	async fn handle_new_directory(&self, path: &Path) -> Result<()>;
+}
+
+// ============================================================================
+// Shared Logic - Used by both handlers
+// ============================================================================
+
+/// Check if a path exists, distinguishing between "doesn't exist" and "can't access".
+///
+/// This is critical for preventing false deletions when volumes go offline.
+/// Returns Ok(true) if path exists, Ok(false) if confirmed absent, Err if inaccessible.
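+///
+/// Callers should branch on all three outcomes (sketch; `backend` as in the
+/// surrounding code):
+///
+/// ```rust,ignore
+/// match path_exists_safe(&path, backend).await {
+///     Ok(true) => { /* present: safe to create or update the entry */ }
+///     Ok(false) => { /* confirmed absent: safe to delete the entry */ }
+///     Err(_) => { /* inaccessible (volume offline?): skip, do NOT delete */ }
+/// }
+/// ```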
+pub async fn path_exists_safe( + path: &Path, + backend: Option<&Arc>, +) -> Result { + use crate::volume::error::VolumeError; + + if let Some(backend) = backend { + match backend.exists(path).await { + Ok(exists) => Ok(exists), + Err(VolumeError::NotMounted(_)) => { + tracing::warn!( + "Volume not mounted when checking path existence: {}", + path.display() + ); + Err(anyhow::anyhow!( + "Volume not mounted, cannot verify path existence" + )) + } + Err(VolumeError::Io(ref e)) if e.kind() == std::io::ErrorKind::NotFound => Ok(false), + Err(VolumeError::Io(io_err)) => { + tracing::warn!( + "IO error when checking path existence for {}: {}", + path.display(), + io_err + ); + Err(anyhow::anyhow!( + "IO error, volume may be offline: {}", + io_err + )) + } + Err(e) => { + tracing::warn!( + "Volume error when checking path existence for {}: {}", + path.display(), + e + ); + Err(e.into()) + } + } + } else { + match tokio::fs::try_exists(path).await { + Ok(exists) => Ok(exists), + Err(e) => { + tracing::warn!( + "Cannot verify path existence for {} (volume may be offline): {}", + path.display(), + e + ); + Err(anyhow::anyhow!("Cannot access path: {}", e)) + } + } + } +} + +/// Evaluates indexing rules to determine if a path should be skipped. +pub async fn should_filter_path( + path: &Path, + rule_toggles: RuleToggles, + location_root: &Path, + backend: Option<&Arc>, +) -> Result { + let ruler = build_default_ruler(rule_toggles, location_root, path).await; + + let metadata = if let Some(backend) = backend { + backend + .metadata(path) + .await + .map_err(|e| anyhow::anyhow!("Failed to get metadata via backend: {}", e))? + } else { + let fs_meta = tokio::fs::metadata(path).await?; + crate::volume::backend::RawMetadata { + kind: if fs_meta.is_dir() { + EntryKind::Directory + } else if fs_meta.is_symlink() { + EntryKind::Symlink + } else { + EntryKind::File + }, + size: fs_meta.len(), + modified: fs_meta.modified().ok(), + created: fs_meta.created().ok(), + accessed: fs_meta.accessed().ok(), + inode: None, + permissions: None, + } + }; + + struct SimpleMetadata { + is_dir: bool, + } + impl super::rules::MetadataForIndexerRules for SimpleMetadata { + fn is_dir(&self) -> bool { + self.is_dir + } + } + + let simple_meta = SimpleMetadata { + is_dir: metadata.kind == EntryKind::Directory, + }; + + match ruler.evaluate_path(path, &simple_meta).await { + Ok(RulerDecision::Reject) => { + tracing::debug!("Filtered path by indexing rules: {}", path.display()); + Ok(true) + } + Ok(RulerDecision::Accept) => Ok(false), + Err(e) => { + tracing::warn!("Error evaluating rules for {}: {}", path.display(), e); + Ok(false) + } + } +} + +/// Extracts filesystem metadata into a DirEntry. +pub async fn build_dir_entry( + path: &Path, + backend: Option<&Arc>, +) -> Result { + use super::entry::EntryProcessor; + + let meta = EntryProcessor::extract_metadata(path, backend).await?; + Ok(DirEntry { + path: meta.path, + kind: meta.kind, + size: meta.size, + modified: meta.modified, + inode: meta.inode, + }) +} + +// ============================================================================ +// Generic Change Application +// ============================================================================ + +/// Apply a batch of filesystem changes using the provided handler. +/// +/// Processes events in the correct order: removes first, then renames, +/// creates, and finally modifies. This prevents conflicts like creating +/// a file that should have been deleted. 
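+///
+/// Ordering sketch (paths illustrative): a remove and a create for the same
+/// path in one batch net out to a fresh entry, because removes are applied first:
+///
+/// ```rust,ignore
+/// let events = vec![
+///     FsRawEventKind::Create { path: dir.join("a.txt") }, // applied after the remove
+///     FsRawEventKind::Remove { path: dir.join("a.txt") }, // applied first
+/// ];
+/// apply_batch(&mut handler, events, &config).await?;
+/// ```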
+pub async fn apply_batch( + handler: &mut H, + events: Vec, + config: &ChangeConfig<'_>, +) -> Result<()> { + use crate::infra::event::FsRawEventKind; + + if events.is_empty() { + return Ok(()); + } + + let mut creates = Vec::new(); + let mut modifies = Vec::new(); + let mut removes = Vec::new(); + let mut renames = Vec::new(); + + for event in events { + match event { + FsRawEventKind::Create { path } => creates.push(path), + FsRawEventKind::Modify { path } => modifies.push(path), + FsRawEventKind::Remove { path } => removes.push(path), + FsRawEventKind::Rename { from, to } => renames.push((from, to)), + } + } + + // Deduplicate (macOS sends duplicate creates) + creates.sort(); + creates.dedup(); + modifies.sort(); + modifies.dedup(); + removes.sort(); + removes.dedup(); + + tracing::debug!( + "Processing batch: {} creates, {} modifies, {} removes, {} renames", + creates.len(), + modifies.len(), + removes.len(), + renames.len() + ); + + // Process in order: removes, renames, creates, modifies + for path in removes { + if let Err(e) = handle_remove(handler, &path).await { + tracing::error!("Failed to handle remove for {}: {}", path.display(), e); + } + } + + for (from, to) in renames { + if let Err(e) = handle_rename(handler, &from, &to, config).await { + tracing::error!( + "Failed to handle rename from {} to {}: {}", + from.display(), + to.display(), + e + ); + } + } + + for path in creates { + if let Err(e) = handle_create(handler, &path, config).await { + tracing::error!("Failed to handle create for {}: {}", path.display(), e); + } + } + + for path in modifies { + if let Err(e) = handle_modify(handler, &path, config).await { + tracing::error!("Failed to handle modify for {}: {}", path.display(), e); + } + } + + Ok(()) +} + +/// Handle a create event. +/// +/// Validates path, checks rules, extracts metadata, detects inode-based moves, +/// and creates the entry. For directories, triggers recursive indexing. +pub async fn handle_create( + handler: &mut H, + path: &Path, + config: &ChangeConfig<'_>, +) -> Result<()> { + tracing::debug!("Create: {}", path.display()); + + // 1. Validate path exists + match path_exists_safe(path, config.volume_backend).await { + Ok(true) => {} + Ok(false) => { + tracing::debug!("Path no longer exists, skipping create: {}", path.display()); + return Ok(()); + } + Err(e) => { + tracing::warn!( + "Skipping create event for inaccessible path {}: {}", + path.display(), + e + ); + return Ok(()); + } + } + + // 2. Apply rule filtering + if should_filter_path( + path, + config.rule_toggles, + config.location_root, + config.volume_backend, + ) + .await? + { + tracing::debug!("Skipping filtered path: {}", path.display()); + return Ok(()); + } + + // 3. Extract metadata + let metadata = build_dir_entry(path, config.volume_backend).await?; + + // 4. Check for existing entry (treat as modify) + if handler.find_by_path(path).await?.is_some() { + tracing::debug!( + "Entry already exists at path {}, treating as modify", + path.display() + ); + return handle_modify(handler, path, config).await; + } + + // 5. Check for inode-based move + if let Some(inode) = metadata.inode { + if let Some(existing) = handler.find_by_inode(inode).await? 
{ + if existing.path != path { + tracing::debug!( + "Detected inode-based move: {} -> {}", + existing.path.display(), + path.display() + ); + let old_path = existing.path.clone(); + handler + .move_entry( + &existing, + &old_path, + path, + path.parent().unwrap_or(Path::new("/")), + ) + .await?; + handler + .emit_change_event(&existing, ChangeType::Moved) + .await?; + return Ok(()); + } + } + } + + // 6. Create entry + let parent_path = path.parent().unwrap_or(Path::new("/")); + let entry = handler.create(&metadata, parent_path).await?; + + // 7. Handle directory recursion or run processors + if entry.is_directory() { + handler.handle_new_directory(path).await?; + } else { + handler.run_processors(&entry, true).await?; + } + + // 8. Emit event + handler + .emit_change_event(&entry, ChangeType::Created) + .await?; + + Ok(()) +} + +/// Handle a modify event. +/// +/// Updates existing entry metadata and re-runs processors for files. +pub async fn handle_modify( + handler: &mut H, + path: &Path, + config: &ChangeConfig<'_>, +) -> Result<()> { + tracing::debug!("Modify: {}", path.display()); + + // 1. Validate path exists + match path_exists_safe(path, config.volume_backend).await { + Ok(true) => {} + Ok(false) => { + tracing::debug!("Path no longer exists, skipping modify: {}", path.display()); + return Ok(()); + } + Err(e) => { + tracing::warn!( + "Skipping modify event for inaccessible path {}: {}", + path.display(), + e + ); + return Ok(()); + } + } + + // 2. Apply rule filtering + if should_filter_path( + path, + config.rule_toggles, + config.location_root, + config.volume_backend, + ) + .await? + { + tracing::debug!("Skipping filtered path: {}", path.display()); + return Ok(()); + } + + // 3. Extract metadata + let metadata = build_dir_entry(path, config.volume_backend).await?; + + // 4. Check for inode-based move + if let Some(inode) = metadata.inode { + if let Some(existing) = handler.find_by_inode(inode).await? { + if existing.path != path { + tracing::debug!( + "Detected inode-based move during modify: {} -> {}", + existing.path.display(), + path.display() + ); + let old_path = existing.path.clone(); + handler + .move_entry( + &existing, + &old_path, + path, + path.parent().unwrap_or(Path::new("/")), + ) + .await?; + handler + .emit_change_event(&existing, ChangeType::Moved) + .await?; + return Ok(()); + } + } + } + + // 5. Find and update entry + if let Some(entry) = handler.find_by_path(path).await? { + handler.update(&entry, &metadata).await?; + + // 6. Re-run processors for files + if !entry.is_directory() { + handler.run_processors(&entry, false).await?; + } + + // 7. Emit event + handler + .emit_change_event(&entry, ChangeType::Modified) + .await?; + } else { + tracing::debug!( + "Entry not found for path, skipping modify: {}", + path.display() + ); + } + + Ok(()) +} + +/// Handle a remove event. +/// +/// Deletes the entry and its entire subtree. +pub async fn handle_remove(handler: &mut H, path: &Path) -> Result<()> { + tracing::debug!("Remove: {}", path.display()); + + if let Some(entry) = handler.find_by_path(path).await? { + handler.delete(&entry).await?; + handler + .emit_change_event(&entry, ChangeType::Deleted) + .await?; + tracing::debug!("Deleted entry for path: {}", path.display()); + } else { + tracing::debug!( + "Entry not found for path, skipping remove: {}", + path.display() + ); + } + + Ok(()) +} + +/// Handle a rename event. +/// +/// Moves an entry from one path to another, updating parent relationships. 
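+///
+/// Sketch (paths illustrative): a rename whose destination is rejected by the
+/// indexing rules is treated as a removal of the source entry:
+///
+/// ```rust,ignore
+/// // e.g. "/loc/a.log" -> "/loc/node_modules/a.log" with GitIgnore rules on
+/// handle_rename(&mut handler, &from, &to, &config).await?;
+/// // destination filtered => same effect as handle_remove(&mut handler, &from)
+/// ```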
+pub async fn handle_rename( + handler: &mut H, + from: &Path, + to: &Path, + config: &ChangeConfig<'_>, +) -> Result<()> { + tracing::debug!("Rename: {} -> {}", from.display(), to.display()); + + // 1. Validate destination exists + match path_exists_safe(to, config.volume_backend).await { + Ok(true) => {} + Ok(false) => { + tracing::debug!( + "Destination path doesn't exist, skipping rename: {}", + to.display() + ); + return Ok(()); + } + Err(e) => { + tracing::warn!( + "Skipping rename event for inaccessible destination {}: {}", + to.display(), + e + ); + return Ok(()); + } + } + + // 2. Check if destination is filtered (treat as deletion) + if should_filter_path( + to, + config.rule_toggles, + config.location_root, + config.volume_backend, + ) + .await? + { + tracing::debug!( + "Destination path is filtered, removing entry: {}", + to.display() + ); + return handle_remove(handler, from).await; + } + + // 3. Find source entry and move + if let Some(entry) = handler.find_by_path(from).await? { + handler + .move_entry(&entry, from, to, to.parent().unwrap_or(Path::new("/"))) + .await?; + handler.emit_change_event(&entry, ChangeType::Moved).await?; + tracing::debug!("Moved entry {} -> {}", from.display(), to.display()); + } else { + tracing::debug!( + "Entry not found for old path {}, skipping rename", + from.display() + ); + } + + Ok(()) +} + +// ============================================================================ +// Persistent Change Handler (Database-backed) +// ============================================================================ + +use crate::context::CoreContext; +use crate::infra::db::entities; +use sea_orm::{ColumnTrait, EntityTrait, QueryFilter, QuerySelect}; + +/// Database-backed change handler for managed locations. +/// +/// Uses EntryProcessor for CRUD operations and maintains closure table +/// relationships. Runs processor pipeline (thumbnails, content hash) for +/// new and modified files. +pub struct PersistentChangeHandler { + context: Arc, + library_id: Uuid, + location_id: Uuid, + location_root_entry_id: i32, + db: sea_orm::DatabaseConnection, + /// Volume backend for this location + volume_backend: Option>, + /// Entry ID cache for parent lookups + entry_id_cache: std::collections::HashMap, +} + +impl PersistentChangeHandler { + pub async fn new( + context: Arc, + library_id: Uuid, + location_id: Uuid, + location_root: &Path, + volume_backend: Option>, + ) -> Result { + let library = context + .get_library(library_id) + .await + .ok_or_else(|| anyhow::anyhow!("Library not found: {}", library_id))?; + + let db = library.db().conn().clone(); + + // Get location's root entry_id + let location_record = entities::location::Entity::find() + .filter(entities::location::Column::Uuid.eq(location_id)) + .one(&db) + .await? + .ok_or_else(|| anyhow::anyhow!("Location not found: {}", location_id))?; + + let location_root_entry_id = location_record + .entry_id + .ok_or_else(|| anyhow::anyhow!("Location {} has no root entry", location_id))?; + + Ok(Self { + context, + library_id, + location_id, + location_root_entry_id, + db, + volume_backend, + entry_id_cache: std::collections::HashMap::new(), + }) + } + + /// Resolve entry ID by path, checking directories then files. + async fn resolve_entry_id(&self, path: &Path) -> Result> { + // Try directory lookup first + if let Some(id) = self.resolve_directory_entry_id(path).await? 
{ + return Ok(Some(id)); + } + // Try file lookup + self.resolve_file_entry_id(path).await + } + + async fn resolve_directory_entry_id(&self, path: &Path) -> Result> { + use sea_orm::FromQueryResult; + + let path_str = path.to_string_lossy().to_string(); + + #[derive(Debug, FromQueryResult)] + struct DirectoryEntryId { + entry_id: i32, + } + + let result = DirectoryEntryId::find_by_statement(sea_orm::Statement::from_sql_and_values( + sea_orm::DbBackend::Sqlite, + r#" + SELECT dp.entry_id + FROM directory_paths dp + INNER JOIN entry_closure ec ON ec.descendant_id = dp.entry_id + WHERE dp.path = ? + AND ec.ancestor_id = ? + "#, + vec![path_str.into(), self.location_root_entry_id.into()], + )) + .one(&self.db) + .await?; + + Ok(result.map(|r| r.entry_id)) + } + + async fn resolve_file_entry_id(&self, path: &Path) -> Result> { + let parent = match path.parent() { + Some(p) => p, + None => return Ok(None), + }; + + let parent_id = match self.resolve_directory_entry_id(parent).await? { + Some(id) => id, + None => return Ok(None), + }; + + let name = path + .file_stem() + .and_then(|s| s.to_str()) + .unwrap_or("") + .to_string(); + let ext = path + .extension() + .and_then(|s| s.to_str()) + .map(|s| s.to_lowercase()); + + let mut q = entities::entry::Entity::find() + .filter(entities::entry::Column::ParentId.eq(parent_id)) + .filter(entities::entry::Column::Name.eq(name)); + + if let Some(e) = ext { + q = q.filter(entities::entry::Column::Extension.eq(e)); + } else { + q = q.filter(entities::entry::Column::Extension.is_null()); + } + + let model = q.one(&self.db).await?; + Ok(model.map(|m| m.id)) + } +} + +#[async_trait::async_trait] +impl ChangeHandler for PersistentChangeHandler { + async fn find_by_path(&self, path: &Path) -> Result> { + let entry_id = match self.resolve_entry_id(path).await? { + Some(id) => id, + None => return Ok(None), + }; + + let entry = entities::entry::Entity::find_by_id(entry_id) + .one(&self.db) + .await? + .ok_or_else(|| anyhow::anyhow!("Entry {} not found after ID lookup", entry_id))?; + + let kind = match entry.kind { + 0 => EntryKind::File, + 1 => EntryKind::Directory, + 2 => EntryKind::Symlink, + _ => EntryKind::File, + }; + + Ok(Some(EntryRef { + id: entry.id, + uuid: entry.uuid, + path: path.to_path_buf(), + kind, + })) + } + + async fn find_by_inode(&self, inode: u64) -> Result> { + let inode_val = inode as i64; + + let entry = entities::entry::Entity::find() + .filter(entities::entry::Column::Inode.eq(inode_val)) + .one(&self.db) + .await?; + + match entry { + Some(e) => { + let full_path = super::PathResolver::get_full_path(&self.db, e.id) + .await + .unwrap_or_else(|_| std::path::PathBuf::from(&e.name)); + + let kind = match e.kind { + 0 => EntryKind::File, + 1 => EntryKind::Directory, + 2 => EntryKind::Symlink, + _ => EntryKind::File, + }; + + Ok(Some(EntryRef { + id: e.id, + uuid: e.uuid, + path: full_path, + kind, + })) + } + None => Ok(None), + } + } + + async fn create(&mut self, metadata: &DirEntry, parent_path: &Path) -> Result { + use super::entry::EntryProcessor; + use super::state::IndexerState; + use crate::domain::addressing::SdPath; + + // Create minimal state for entry creation + let mut state = IndexerState::new(&SdPath::local(&metadata.path)); + + // Seed parent cache if we have it + if let Some(&parent_id) = self.entry_id_cache.get(parent_path) { + state + .entry_id_cache + .insert(parent_path.to_path_buf(), parent_id); + } else if let Some(parent_id) = self.resolve_directory_entry_id(parent_path).await? 
{
+            state
+                .entry_id_cache
+                .insert(parent_path.to_path_buf(), parent_id);
+            self.entry_id_cache
+                .insert(parent_path.to_path_buf(), parent_id);
+        }
+
+        // Use ResponderCtx for the IndexingCtx trait
+        let ctx = super::ctx::ResponderCtx::new(&self.context, self.library_id).await?;
+
+        let entry_id = EntryProcessor::create_entry(&mut state, &ctx, metadata, 0, parent_path)
+            .await
+            .map_err(|e| anyhow::anyhow!("Failed to create entry: {}", e))?;
+
+        // Cache the new entry
+        self.entry_id_cache.insert(metadata.path.clone(), entry_id);
+
+        // Get the created entry for the response
+        let entry = entities::entry::Entity::find_by_id(entry_id)
+            .one(&self.db)
+            .await?
+            .ok_or_else(|| anyhow::anyhow!("Entry not found after creation"))?;
+
+        Ok(EntryRef {
+            id: entry.id,
+            uuid: entry.uuid,
+            path: metadata.path.clone(),
+            kind: metadata.kind,
+        })
+    }
+
+    async fn update(&mut self, entry: &EntryRef, metadata: &DirEntry) -> Result<()> {
+        use super::entry::EntryProcessor;
+
+        let ctx = super::ctx::ResponderCtx::new(&self.context, self.library_id).await?;
+        EntryProcessor::update_entry(&ctx, entry.id, metadata)
+            .await
+            .map_err(|e| anyhow::anyhow!("Failed to update entry: {}", e))?;
+
+        Ok(())
+    }
+
+    async fn move_entry(
+        &mut self,
+        entry: &EntryRef,
+        old_path: &Path,
+        new_path: &Path,
+        new_parent_path: &Path,
+    ) -> Result<()> {
+        use super::entry::EntryProcessor;
+        use super::state::IndexerState;
+        use crate::domain::addressing::SdPath;
+
+        let mut state = IndexerState::new(&SdPath::local(old_path));
+
+        // Seed parent cache
+        if let Some(&parent_id) = self.entry_id_cache.get(new_parent_path) {
+            state
+                .entry_id_cache
+                .insert(new_parent_path.to_path_buf(), parent_id);
+        } else if let Some(parent_id) = self.resolve_directory_entry_id(new_parent_path).await? {
+            state
+                .entry_id_cache
+                .insert(new_parent_path.to_path_buf(), parent_id);
+            self.entry_id_cache
+                .insert(new_parent_path.to_path_buf(), parent_id);
+        }
+
+        let ctx = super::ctx::ResponderCtx::new(&self.context, self.library_id).await?;
+        EntryProcessor::move_entry(
+            &mut state,
+            &ctx,
+            entry.id,
+            old_path,
+            new_path,
+            new_parent_path,
+        )
+        .await
+        .map_err(|e| anyhow::anyhow!("Failed to move entry: {}", e))?;
+
+        // Update cache
+        self.entry_id_cache.remove(old_path);
+        self.entry_id_cache.insert(new_path.to_path_buf(), entry.id);
+
+        Ok(())
+    }
+
+    async fn delete(&mut self, entry: &EntryRef) -> Result<()> {
+        use sea_orm::TransactionTrait;
+
+        // Collect all descendants
+        let mut to_delete_ids: Vec<i32> = vec![entry.id];
+
+        if let Ok(rows) = entities::entry_closure::Entity::find()
+            .filter(entities::entry_closure::Column::AncestorId.eq(entry.id))
+            .all(&self.db)
+            .await
+        {
+            to_delete_ids.extend(rows.into_iter().map(|r| r.descendant_id));
+        }
+
+        // Also traverse via parent_id as fallback
+        let mut queue = vec![entry.id];
+        let mut visited = std::collections::HashSet::from([entry.id]);
+
+        while let Some(parent) = queue.pop() {
+            if let Ok(children) = entities::entry::Entity::find()
+                .filter(entities::entry::Column::ParentId.eq(parent))
+                .all(&self.db)
+                .await
+            {
+                for child in children {
+                    if visited.insert(child.id) {
+                        to_delete_ids.push(child.id);
+                        queue.push(child.id);
+                    }
+                }
+            }
+        }
+
+        to_delete_ids.sort_unstable();
+        to_delete_ids.dedup();
+
+        // Create tombstones for sync
+        let entries_to_delete = if !to_delete_ids.is_empty() {
+            let mut all_entries = Vec::new();
+            for chunk in to_delete_ids.chunks(900) {
+                let batch = entities::entry::Entity::find()
+                    .filter(entities::entry::Column::Id.is_in(chunk.to_vec()))
+                    .all(&self.db)
+                    .await?;
+                all_entries.extend(batch);
+            }
+            all_entries
+        } else {
+            Vec::new()
+        };
+
+        if !entries_to_delete.is_empty() {
+            if let Some(library) = self.context.get_library(self.library_id).await {
+                let _ = library
+                    .sync_models_batch(
+                        &entries_to_delete,
+                        crate::infra::sync::ChangeType::Delete,
+                        &self.db,
+                    )
+                    .await;
+            }
+        }
+
+        // Delete in transaction
+        let txn = self.db.begin().await?;
+
+        if !to_delete_ids.is_empty() {
+            let _ = entities::entry_closure::Entity::delete_many()
+                .filter(entities::entry_closure::Column::DescendantId.is_in(to_delete_ids.clone()))
+                .exec(&txn)
+                .await;
+            let _ = entities::entry_closure::Entity::delete_many()
+                .filter(entities::entry_closure::Column::AncestorId.is_in(to_delete_ids.clone()))
+                .exec(&txn)
+                .await;
+            let _ = entities::directory_paths::Entity::delete_many()
+                .filter(entities::directory_paths::Column::EntryId.is_in(to_delete_ids.clone()))
+                .exec(&txn)
+                .await;
+            let _ = entities::entry::Entity::delete_many()
+                .filter(entities::entry::Column::Id.is_in(to_delete_ids))
+                .exec(&txn)
+                .await;
+        }
+
+        txn.commit().await?;
+
+        // Clear from cache
+        self.entry_id_cache.remove(&entry.path);
+
+        Ok(())
+    }
+
+    async fn run_processors(&self, entry: &EntryRef, _is_new: bool) -> Result<()> {
+        use super::processor::{
+            load_location_processor_config, ContentHashProcessor, ProcessorEntry,
+        };
+        use crate::ops::media::thumbnail::ThumbnailProcessor;
+
+        if entry.is_directory() {
+            return Ok(());
+        }
+
+        let Some(library) = self.context.get_library(self.library_id).await else {
+            return Ok(());
+        };
+
+        let proc_config = load_location_processor_config(self.location_id, &self.db)
+            .await
+            .unwrap_or_default();
+
+        // Build processor entry
+        let db_entry = entities::entry::Entity::find_by_id(entry.id)
+            .one(&self.db)
+            .await?
+            .ok_or_else(|| anyhow::anyhow!("Entry not found"))?;
+
+        let mime_type = if let Some(content_id) = db_entry.content_id {
+            if let Ok(Some(ci)) = entities::content_identity::Entity::find_by_id(content_id)
+                .one(&self.db)
+                .await
+            {
+                if let Some(mime_id) = ci.mime_type_id {
+                    if let Ok(Some(mime)) = entities::mime_type::Entity::find_by_id(mime_id)
+                        .one(&self.db)
+                        .await
+                    {
+                        Some(mime.mime_type)
+                    } else {
+                        None
+                    }
+                } else {
+                    None
+                }
+            } else {
+                None
+            }
+        } else {
+            None
+        };
+
+        let proc_entry = ProcessorEntry {
+            id: entry.id,
+            uuid: entry.uuid,
+            path: entry.path.clone(),
+            kind: entry.kind,
+            size: db_entry.size as u64,
+            content_id: db_entry.content_id,
+            mime_type,
+        };
+
+        let ctx = super::ctx::ResponderCtx::new(&self.context, self.library_id).await?;
+
+        // Content hash
+        if proc_config
+            .watcher_processors
+            .iter()
+            .any(|c| c.processor_type == "content_hash" && c.enabled)
+        {
+            let content_proc = ContentHashProcessor::new(self.library_id);
+            if let Err(e) = content_proc.process(&ctx, &proc_entry).await {
+                tracing::warn!("Content hash processing failed: {}", e);
+            }
+        }
+
+        // Thumbnail
+        if proc_config
+            .watcher_processors
+            .iter()
+            .any(|c| c.processor_type == "thumbnail" && c.enabled)
+        {
+            let thumb_proc = ThumbnailProcessor::new(library.clone());
+            if thumb_proc.should_process(&proc_entry) {
+                if let Err(e) = thumb_proc.process(&self.db, &proc_entry).await {
+                    tracing::warn!("Thumbnail processing failed: {}", e);
+                }
+            }
+        }
+
+        Ok(())
+    }
+
+    async fn emit_change_event(&self, entry: &EntryRef, change_type: ChangeType) -> Result<()> {
+        use crate::domain::ResourceManager;
+
+        if let Some(uuid) = entry.uuid {
+            let resource_manager =
+                ResourceManager::new(Arc::new(self.db.clone()), self.context.events.clone());
+
+            if let Err(e) = resource_manager
+                .emit_resource_events("entry", vec![uuid])
+                .await
+            {
+                tracing::warn!(
+                    "Failed to emit resource event for {:?} entry: {}",
+                    change_type,
+                    e
+                );
+            }
+        }
+
+        Ok(())
+    }
+
+    async fn handle_new_directory(&self, path: &Path) -> Result<()> {
+        use super::job::{IndexMode, IndexerJob};
+        use crate::domain::addressing::SdPath;
+
+        let Some(library) = self.context.get_library(self.library_id).await else {
+            return Ok(());
+        };
+
+        // Get index mode from location
+        let index_mode = if let Ok(Some(loc)) = entities::location::Entity::find()
+            .filter(entities::location::Column::Uuid.eq(self.location_id))
+            .one(&self.db)
+            .await
+        {
+            match loc.index_mode.as_str() {
+                "shallow" => IndexMode::Shallow,
+                "content" => IndexMode::Content,
+                "deep" => IndexMode::Deep,
+                _ => IndexMode::Content,
+            }
+        } else {
+            IndexMode::Content
+        };
+
+        let indexer_job =
+            IndexerJob::from_location(self.location_id, SdPath::local(path), index_mode);
+
+        if let Err(e) = library.jobs().dispatch(indexer_job).await {
+            tracing::warn!(
+                "Failed to spawn indexer job for directory {}: {}",
+                path.display(),
+                e
+            );
+        } else {
+            tracing::debug!(
+                "Spawned recursive indexer job for directory: {}",
+                path.display()
+            );
+        }
+
+        Ok(())
+    }
+}
+
+// ============================================================================
+// Ephemeral Change Handler (Memory-backed)
+// ============================================================================
+
+use super::job::EphemeralIndex;
+use tokio::sync::RwLock;
+
+/// Memory-backed change handler for ephemeral browsing.
+///
+/// Updates the EphemeralIndex directly without database writes.
+/// Skips processor pipeline (no thumbnails/content hash for ephemeral).
+pub struct EphemeralChangeHandler {
+    index: Arc<RwLock<EphemeralIndex>>,
+    event_bus: Arc<EventBus>,
+    root_path: PathBuf,
+    /// Synthetic ID counter (EphemeralIndex uses arena indices internally)
+    next_id: std::sync::atomic::AtomicI32,
+}
+
+impl EphemeralChangeHandler {
+    pub fn new(
+        index: Arc<RwLock<EphemeralIndex>>,
+        event_bus: Arc<EventBus>,
+        root_path: PathBuf,
+    ) -> Self {
+        Self {
+            index,
+            event_bus,
+            root_path,
+            next_id: std::sync::atomic::AtomicI32::new(1),
+        }
+    }
+
+    fn next_id(&self) -> i32 {
+        self.next_id
+            .fetch_add(1, std::sync::atomic::Ordering::SeqCst)
+    }
+}
+
+#[async_trait::async_trait]
+impl ChangeHandler for EphemeralChangeHandler {
+    async fn find_by_path(&self, path: &Path) -> Result<Option<EntryRef>> {
+        let index = self.index.read().await;
+
+        if let Some(metadata) = index.get_entry_ref(&path.to_path_buf()) {
+            let uuid = index.get_entry_uuid(&path.to_path_buf());
+
+            Ok(Some(EntryRef {
+                id: 0, // Ephemeral entries don't have stable IDs
+                uuid,
+                path: path.to_path_buf(),
+                kind: metadata.kind,
+            }))
+        } else {
+            Ok(None)
+        }
+    }
+
+    async fn find_by_inode(&self, _inode: u64) -> Result<Option<EntryRef>> {
+        // Ephemeral index doesn't track inodes
+        Ok(None)
+    }
+
+    async fn create(&mut self, metadata: &DirEntry, _parent_path: &Path) -> Result<EntryRef> {
+        use super::entry::EntryMetadata;
+
+        let entry_uuid = Uuid::new_v4();
+        let entry_metadata = EntryMetadata::from(metadata.clone());
+
+        {
+            let mut index = self.index.write().await;
+            index
+                .add_entry(metadata.path.clone(), entry_uuid, entry_metadata)
+                .map_err(|e| anyhow::anyhow!("Failed to add entry to ephemeral index: {}", e))?;
+        }
+
+        Ok(EntryRef {
+            id: self.next_id(),
+            uuid: Some(entry_uuid),
+            path: metadata.path.clone(),
+            kind: metadata.kind,
+        })
+    }
+
+    async fn update(&mut self, entry: &EntryRef, metadata: &DirEntry) -> Result<()> {
+        use super::entry::EntryMetadata;
+
+        // Ephemeral index doesn't have a direct update method,
+        // so we re-add at the same path (preserving the UUID).
+        let uuid = entry.uuid.unwrap_or_else(Uuid::new_v4);
+        let entry_metadata = EntryMetadata::from(metadata.clone());
+
+        {
+            let mut index = self.index.write().await;
+            // add_entry tolerates an existing entry at the same path
+            // (returning Ok(None) for the duplicate), so re-adding with
+            // the preserved UUID effectively refreshes the metadata.
+            let _ = index.add_entry(metadata.path.clone(), uuid, entry_metadata);
+        }
+
+        Ok(())
+    }
+
+    async fn move_entry(
+        &mut self,
+        entry: &EntryRef,
+        old_path: &Path,
+        new_path: &Path,
+        _new_parent_path: &Path,
+    ) -> Result<()> {
+        // Ephemeral index doesn't support moves directly:
+        // we remove the entry at the old path and re-add it at the new
+        // path, carrying the UUID over so the association is preserved.
+
+        let metadata = build_dir_entry(new_path, None).await?;
+
+        {
+            let mut index = self.index.write().await;
+            // Remove old entry
+            index.remove_entry(old_path);
+
+            // Add at new path with preserved UUID
+            let uuid = entry.uuid.unwrap_or_else(Uuid::new_v4);
+            let entry_metadata = super::entry::EntryMetadata::from(metadata.clone());
+            let _ = index.add_entry(new_path.to_path_buf(), uuid, entry_metadata);
+        }
+
+        Ok(())
+    }
+
+    async fn delete(&mut self, entry: &EntryRef) -> Result<()> {
+        {
+            let mut index = self.index.write().await;
+
+            if entry.is_directory() {
+                // Remove directory and all descendants
+                index.remove_directory_tree(&entry.path);
+            } else {
+                // Remove single entry
+                index.remove_entry(&entry.path);
+            }
+        }
+
+        Ok(())
+    }
+
+    async fn run_processors(&self, _entry: &EntryRef, _is_new: bool) -> Result<()> {
+        // Ephemeral handler skips processors - no thumbnails or content hash
+        Ok(())
+    }
+
+    async fn emit_change_event(&self, entry: &EntryRef, change_type: ChangeType) -> Result<()> {
+        use crate::device::get_current_device_slug;
+        use crate::domain::addressing::SdPath;
+        use crate::domain::file::File;
+        use crate::domain::ContentKind;
+        use crate::infra::event::{Event, ResourceMetadata};
+
+        let Some(uuid) = entry.uuid else {
+            return Ok(());
+        };
+
+        let device_slug = get_current_device_slug();
+
+        let sd_path = SdPath::Physical {
+            device_slug: device_slug.clone(),
+            path: entry.path.clone(),
+        };
+
+        // Get content kind from index
+        let content_kind = {
+            let index = self.index.read().await;
+            index.get_content_kind(&entry.path)
+        };
+
+        // Build a minimal File for the event
+        let metadata = build_dir_entry(&entry.path, None).await.ok();
+
+        if let Some(meta) = metadata {
+            let entry_metadata = super::entry::EntryMetadata::from(meta);
+            let mut file = File::from_ephemeral(uuid, &entry_metadata, sd_path);
+            file.content_kind = content_kind;
+
+            let parent_path = entry.path.parent().map(|p| SdPath::Physical {
+                device_slug: file.sd_path.device_slug().unwrap_or("local").to_string(),
+                path: p.to_path_buf(),
+            });
+
+            let affected_paths = parent_path.into_iter().collect();
+
+            if let Ok(resource_json) = serde_json::to_value(&file) {
+                self.event_bus.emit(Event::ResourceChanged {
+                    resource_type: "file".to_string(),
+                    resource: resource_json,
+                    metadata: Some(ResourceMetadata {
+                        no_merge_fields: vec!["sd_path".to_string()],
+                        alternate_ids: vec![],
+                        affected_paths,
+                    }),
+                });
+            }
+        }
+
+        Ok(())
+    }
+
+    async fn handle_new_directory(&self, path: &Path) -> Result<()> {
+        // For ephemeral, we do inline shallow indexing instead of spawning a job
+        use super::entry::EntryMetadata;
+        use super::entry::EntryProcessor;
+
+        let mut entries = match tokio::fs::read_dir(path).await {
+            Ok(e) => e,
+            Err(e) => {
+                tracing::warn!(
+                    "Failed to read directory {} for ephemeral indexing: {}",
+                    path.display(),
+                    e
+                );
+                return Ok(());
+            }
+        };
+
+        let mut index = self.index.write().await;
+
+        while let Ok(Some(entry)) = entries.next_entry().await {
+            let entry_path = entry.path();
+
+            if let Ok(metadata) = entry.metadata().await {
+                let kind = if metadata.is_dir() {
+                    EntryKind::Directory
+                } else if metadata.is_symlink() {
+                    EntryKind::Symlink
+                } else {
+                    EntryKind::File
+                };
+
+                let entry_metadata = EntryMetadata {
+                    path: entry_path.clone(),
+                    kind,
+                    size: metadata.len(),
+                    modified: metadata.modified().ok(),
+                    accessed: metadata.accessed().ok(),
+                    created: metadata.created().ok(),
+                    inode: EntryProcessor::get_inode(&metadata),
+                    permissions: None,
+                    is_hidden: entry_path
+                        .file_name()
+                        .and_then(|n| n.to_str())
+                        .map(|n| n.starts_with('.'))
+                        .unwrap_or(false),
+                };
+
+                let uuid = Uuid::new_v4();
+                let _ = index.add_entry(entry_path, uuid, entry_metadata);
+            }
+        }
+
+        Ok(())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_entry_ref_is_directory() {
+        let file_ref = EntryRef {
+            id: 1,
+            uuid: Some(Uuid::new_v4()),
+            path: PathBuf::from("/test/file.txt"),
+            kind: EntryKind::File,
+        };
+        assert!(!file_ref.is_directory());
+
+        let dir_ref = EntryRef {
+            id: 2,
+            uuid: Some(Uuid::new_v4()),
+            path: PathBuf::from("/test/dir"),
+            kind: EntryKind::Directory,
+        };
+        assert!(dir_ref.is_directory());
+    }
+}
diff --git a/core/src/ops/indexing/job.rs b/core/src/ops/indexing/job.rs
index ee301beac..93b58f089 100644
--- a/core/src/ops/indexing/job.rs
+++ b/core/src/ops/indexing/job.rs
@@ -620,6 +620,47 @@ impl EphemeralIndex {
 		self.path_index.len()
 	}
 
+	/// Check if an entry exists at the given path.
+	pub fn has_entry(&self, path: &Path) -> bool {
+		self.path_index.contains_key(path)
+	}
+
+	/// Remove an entry at the given path.
+	///
+	/// Returns true if the entry was removed, false if it didn't exist.
+	/// For directories, this only removes the directory entry itself, not its children.
+	/// Use `remove_directory_tree` to remove a directory and all its descendants.
+	pub fn remove_entry(&mut self, path: &Path) -> bool {
+		let existed = self.path_index.remove(path).is_some();
+		self.entry_uuids.remove(path);
+		self.content_kinds.remove(path);
+		existed
+	}
+
+	/// Remove a directory and all its descendants.
+	///
+	/// Returns the number of entries removed.
+	pub fn remove_directory_tree(&mut self, path: &Path) -> usize {
+		let prefix = path.to_string_lossy().to_string();
+		let keys_to_remove: Vec<_> = self
+			.path_index
+			.keys()
+			.filter(|k| {
+				let k_str = k.to_string_lossy();
+				k_str == prefix || k_str.starts_with(&format!("{}/", prefix))
+			})
+			.cloned()
+			.collect();
+
+		let count = keys_to_remove.len();
+		for key in keys_to_remove {
+			self.path_index.remove(&key);
+			self.entry_uuids.remove(&key);
+			self.content_kinds.remove(&key);
+		}
+		count
+	}
+
 	/// Reconstructs paths for all entries and returns them as a HashMap.
 	///
 	/// For large indexes, this can be expensive since it walks the tree to rebuild
diff --git a/core/src/ops/indexing/mod.rs b/core/src/ops/indexing/mod.rs
index be465835b..c63a058ef 100644
--- a/core/src/ops/indexing/mod.rs
+++ b/core/src/ops/indexing/mod.rs
@@ -25,6 +25,7 @@ pub mod change_detection;
 pub mod ctx;
 pub mod entry;
 pub mod ephemeral;
+pub mod handler;
 pub mod hierarchy;
 pub mod input;
 pub mod job;
@@ -43,6 +44,10 @@ pub use action::IndexingAction;
 pub use ctx::{IndexingCtx, ResponderCtx};
 pub use entry::{EntryMetadata, EntryProcessor};
 pub use ephemeral::EphemeralIndexCache;
+pub use handler::{
+	apply_batch as apply_change_batch, ChangeConfig, ChangeHandler, ChangeType, EntryRef,
+	EphemeralChangeHandler, PersistentChangeHandler,
+};
 pub use hierarchy::HierarchyQuery;
 pub use input::IndexInput;
 pub use job::{
diff --git a/core/src/service/watcher/mod.rs b/core/src/service/watcher/mod.rs
index 39e6e15bb..47f29a60d 100644
--- a/core/src/service/watcher/mod.rs
+++ b/core/src/service/watcher/mod.rs
@@ -139,6 +139,8 @@ pub struct LocationWatcher {
 	context: Arc<CoreContext>,
 	/// Currently watched locations
 	watched_locations: Arc<RwLock<HashMap<Uuid, WatchedLocation>>>,
+	/// Ephemeral watches (shallow, non-recursive) keyed by path
+	ephemeral_watches: Arc<RwLock<HashMap<PathBuf, EphemeralWatch>>>,
 	/// File system watcher
 	watcher: Arc<RwLock<Option<RecommendedWatcher>>>,
 	/// Whether the service is running
@@ -170,6 +172,15 @@ pub struct WatchedLocation {
 	pub rule_toggles: crate::ops::indexing::rules::RuleToggles,
 }
 
+/// Information about an ephemeral watch (shallow, non-recursive)
+#[derive(Debug, Clone)]
+pub struct EphemeralWatch {
+	/// Path being watched
+	pub path: PathBuf,
+	/// Indexing rule toggles for filtering events
+	pub rule_toggles: crate::ops::indexing::rules::RuleToggles,
+}
+
 impl LocationWatcher {
 	/// Create a new location watcher
 	pub fn new(
@@ -184,6 +195,7 @@ impl LocationWatcher {
 			events,
 			context,
 			watched_locations: Arc::new(RwLock::new(HashMap::new())),
+			ephemeral_watches: Arc::new(RwLock::new(HashMap::new())),
 			watcher: Arc::new(RwLock::new(None)),
 			is_running: Arc::new(RwLock::new(false)),
 			platform_handler,
@@ -508,6 +520,135 @@ impl LocationWatcher {
 			.collect()
 	}
 
+	// ========================================================================
+	// Ephemeral Watch Support (shallow, non-recursive)
+	// ========================================================================
+
+	/// Add an ephemeral watch for a directory (shallow, immediate children only).
+	///
+	/// Unlike location watches which are recursive, ephemeral watches only monitor
+	/// immediate children of the watched directory. This is appropriate for ephemeral
+	/// browsing where only the current directory's contents are indexed.
+	///
+	/// The path should already be indexed in the ephemeral cache before calling this.
+	pub async fn add_ephemeral_watch(
+		&self,
+		path: PathBuf,
+		rule_toggles: crate::ops::indexing::rules::RuleToggles,
+	) -> Result<()> {
+		// Check if path is valid
+		if !path.exists() {
+			return Err(anyhow::anyhow!(
+				"Cannot watch non-existent path: {}",
+				path.display()
+			));
+		}
+
+		if !path.is_dir() {
+			return Err(anyhow::anyhow!(
+				"Cannot watch non-directory path: {}",
+				path.display()
+			));
+		}
+
+		// Check if already watching
+		{
+			let watches = self.ephemeral_watches.read().await;
+			if watches.contains_key(&path) {
+				debug!("Already watching ephemeral path: {}", path.display());
+				return Ok(());
+			}
+		}
+
+		// Register in ephemeral cache
+		self.context
+			.ephemeral_cache()
+			.register_for_watching(path.clone());
+
+		// Add to our tracking
+		{
+			let mut watches = self.ephemeral_watches.write().await;
+			watches.insert(
+				path.clone(),
+				EphemeralWatch {
+					path: path.clone(),
+					rule_toggles,
+				},
+			);
+		}
+
+		// Add to file system watcher with NonRecursive mode
+		if *self.is_running.read().await {
+			if let Some(watcher) = self.watcher.write().await.as_mut() {
+				watcher.watch(&path, RecursiveMode::NonRecursive)?;
+				info!("Started shallow ephemeral watch for: {}", path.display());
+			}
+		}
+
+		Ok(())
+	}
+
+	/// Remove an ephemeral watch
+	pub async fn remove_ephemeral_watch(&self, path: &Path) -> Result<()> {
+		let watch = {
+			let mut watches = self.ephemeral_watches.write().await;
+			watches.remove(path)
+		};
+
+		if let Some(watch) = watch {
+			// Unregister from ephemeral cache
+			self.context
+				.ephemeral_cache()
+				.unregister_from_watching(&watch.path);
+
+			// Remove from file system watcher
+			if *self.is_running.read().await {
+				if let Some(watcher) = self.watcher.write().await.as_mut() {
+					if let Err(e) = watcher.unwatch(&watch.path) {
+						warn!(
+							"Failed to unwatch ephemeral path {}: {}",
+							watch.path.display(),
+							e
+						);
+					} else {
+						info!("Stopped ephemeral watch for: {}", watch.path.display());
+					}
+				}
+			}
+		}
+
+		Ok(())
+	}
+
+	/// Get all ephemeral watches
+	pub async fn get_ephemeral_watches(&self) -> Vec<EphemeralWatch> {
+		self.ephemeral_watches
+			.read()
+			.await
+			.values()
+			.cloned()
+			.collect()
+	}
+
+	/// Check if a path has an ephemeral watch
+	pub async fn has_ephemeral_watch(&self, path: &Path) -> bool {
+		self.ephemeral_watches.read().await.contains_key(path)
+	}
+
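The shallow-watch API above is easiest to see end-to-end. Below is a minimal sketch of registering and releasing a watch while the user browses an unmanaged directory; the `watcher: &LocationWatcher` handle, the example path, and `RuleToggles::default()` are assumptions for illustration, and `add_ephemeral_watch` requires the directory to actually exist.

```rust
use std::path::PathBuf;

async fn browse(watcher: &LocationWatcher) -> anyhow::Result<()> {
	let dir = PathBuf::from("/Volumes/USB/photos");

	// Only immediate children of `dir` will generate handled events.
	watcher
		.add_ephemeral_watch(dir.clone(), Default::default())
		.await?;

	// An event for /Volumes/USB/photos/img.jpg matches, because its
	// parent is the watched directory...
	assert!(watcher
		.find_ephemeral_watch_for_path(&dir.join("img.jpg"))
		.await
		.is_some());

	// ...but a grandchild does not match the shallow watch.
	assert!(watcher
		.find_ephemeral_watch_for_path(&dir.join("album/img.jpg"))
		.await
		.is_none());

	// Drop the watch when the user navigates away.
	watcher.remove_ephemeral_watch(&dir).await?;
	Ok(())
}
```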
+	/// Find the ephemeral watch that covers a given path (if any).
+	///
+	/// For shallow watches, only returns a match if the path is an immediate
+	/// child of a watched directory.
+	pub async fn find_ephemeral_watch_for_path(&self, path: &Path) -> Option<EphemeralWatch> {
+		let watches = self.ephemeral_watches.read().await;
+
+		// Get the parent directory of the event path
+		let parent = path.parent()?;
+
+		// Check if the parent is being watched
+		watches.get(parent).cloned()
+	}
+
 	/// Load existing locations from the database and add them to the watcher
 	async fn load_existing_locations(&self) -> Result<()> {
 		info!("Loading existing locations from database...");
@@ -674,11 +815,13 @@ impl LocationWatcher {
 	async fn start_event_loop(&self) -> Result<()> {
 		let platform_handler = self.platform_handler.clone();
 		let watched_locations = self.watched_locations.clone();
+		let ephemeral_watches = self.ephemeral_watches.clone();
 		let workers = self.workers.clone();
 		let is_running = self.is_running.clone();
 		let debug_mode = self.config.debug_mode;
 		let metrics = self.metrics.clone();
 		let events = self.events.clone();
+		let context = self.context.clone();
 
 		let (tx, mut rx) = mpsc::channel(self.config.event_buffer_size);
 		let tx_clone = tx.clone();
@@ -731,6 +874,17 @@ impl LocationWatcher {
 		}
 		drop(locations);
 
+		// Watch all ephemeral paths (non-recursive/shallow)
+		let ephemeral = ephemeral_watches.read().await;
+		for watch in ephemeral.values() {
+			watcher.watch(&watch.path, RecursiveMode::NonRecursive)?;
+			info!(
+				"Started shallow ephemeral watch for: {}",
+				watch.path.display()
+			);
+		}
+		drop(ephemeral);
+
 		// Store watcher
 		*self.watcher.write().await = Some(watcher);
@@ -762,6 +916,46 @@ impl LocationWatcher {
 			FsRawEventKind::Rename { from, .. } => Some(from.as_path()),
 		};
 
+		// First, check if this is an ephemeral watch event
+		// For shallow watches, only process if path is immediate child
+		let mut handled_by_ephemeral = false;
+		if let Some(event_path) = event_path {
+			let parent = event_path.parent();
+			if let Some(parent_path) = parent {
+				let ephemeral = ephemeral_watches.read().await;
+				if let Some(watch) = ephemeral.get(parent_path) {
+					debug!(
+						"Ephemeral watch match for {}: parent {} is watched",
+						event_path.display(),
+						parent_path.display()
+					);
+					handled_by_ephemeral = true;
+
+					// Process via ephemeral handler
+					let ctx = context.clone();
+					let root = watch.path.clone();
+					let toggles = watch.rule_toggles;
+					let event_kind = kind.clone();
+
+					tokio::spawn(async move {
+						if let Err(e) = crate::ops::indexing::ephemeral::responder::apply(
+							&ctx,
+							&root,
+							event_kind,
+							toggles,
+						).await {
+							warn!("Failed to process ephemeral event: {}", e);
+						}
+					});
+				}
+			}
+		}
+
+		// Skip location matching if handled by ephemeral
+		if handled_by_ephemeral {
+			continue;
+		}
+
 		// Find the location for this event by matching path prefix
 		// CRITICAL: Must match by path, not just library_id, to avoid routing
 		// events to the wrong location when multiple locations exist in one library
@@ -995,6 +1189,7 @@ impl LocationWatcher {
 			events: events.clone(),
 			context: context.clone(),
 			watched_locations: watched_locations.clone(),
+			ephemeral_watches: Arc::new(RwLock::new(HashMap::new())),
 			watcher: watcher_ref.clone(),
 			is_running: is_running.clone(),
 			platform_handler: platform_handler.clone(),
@@ -1033,6 +1228,7 @@ impl LocationWatcher {
 			events: events.clone(),
 			context: context.clone(),
 			watched_locations: watched_locations.clone(),
+			ephemeral_watches: Arc::new(RwLock::new(HashMap::new())),
 			watcher: watcher_ref.clone(),
 			is_running: is_running.clone(),
 			platform_handler: platform_handler.clone(),

From 36659ac96bbdd3189a20c7fdc015c7c6b9ed7ae0 Mon Sep 17 00:00:00 2001
From: Jamie Pine
Date: Mon, 8 Dec 2025 01:25:21 -0800
Subject: [PATCH 13/20] Refactor indexing change handling and introduce
 unified change detection

- Removed the `handler.rs` module and integrated its functionality into the
  new `change_detection` module, which now handles both persistent and
  ephemeral change processing.
- Implemented a `ChangeDetector` for batch indexing scans, allowing efficient
  detection of new, modified, moved, and deleted entries.
- Introduced a `ChangeHandler` trait to abstract operations for both
  persistent and ephemeral storage, ensuring consistent behavior across
  different backends.
- Enhanced the `EphemeralChangeHandler` and `PersistentChangeHandler` to
  utilize the new change detection infrastructure.
- Updated the `apply_batch` function to streamline event processing and
  improve responsiveness to filesystem changes.
- Added comprehensive tests and documentation to validate the new structure
  and functionality.
---
 .../ops/indexing/change_detection/detector.rs |  281 ++++
 .../indexing/change_detection/ephemeral.rs    |  244 +++
 .../ops/indexing/change_detection/handler.rs  |  523 ++++++
 core/src/ops/indexing/change_detection/mod.rs |  473 +----
 .../indexing/change_detection/persistent.rs   |  638 ++++++++
 .../ops/indexing/change_detection/types.rs    |  135 ++
 core/src/ops/indexing/ephemeral/responder.rs  |    4 +-
 core/src/ops/indexing/handler.rs              | 1447 -----------------
 core/src/ops/indexing/mod.rs                  |    9 +-
 core/src/ops/indexing/responder.rs            | 1331 +--------------
 10 files changed, 1937 insertions(+), 3148 deletions(-)
 create mode 100644 core/src/ops/indexing/change_detection/detector.rs
 create mode 100644 core/src/ops/indexing/change_detection/ephemeral.rs
 create mode 100644 core/src/ops/indexing/change_detection/handler.rs
 create mode 100644 core/src/ops/indexing/change_detection/persistent.rs
 create mode 100644 core/src/ops/indexing/change_detection/types.rs
 delete mode 100644 core/src/ops/indexing/handler.rs

diff --git a/core/src/ops/indexing/change_detection/detector.rs b/core/src/ops/indexing/change_detection/detector.rs
new file mode 100644
index 000000000..5ef8090bb
--- /dev/null
+++ b/core/src/ops/indexing/change_detection/detector.rs
@@ -0,0 +1,281 @@
+//! Change detector for batch indexing scans.
+//!
+//! The `ChangeDetector` compares database state against filesystem state
+//! during indexer job scans. It identifies:
+//! - New files/directories (not in database)
+//! - Modified entries (size or mtime changed)
+//! - Moved entries (same inode, different path)
+//! - Deleted entries (in database but not on disk)
+
+use super::types::Change;
+use crate::infra::job::prelude::JobContext;
+use crate::ops::indexing::state::EntryKind;
+use std::{
+    collections::HashMap,
+    path::{Path, PathBuf},
+    time::SystemTime,
+};
+
+/// Tracks changes between database state and filesystem during batch scans.
+///
+/// Used by the indexer job to efficiently detect what needs to be created,
+/// updated, moved, or deleted. Loads existing entries from the database,
+/// then compares against filesystem walks.
+pub struct ChangeDetector {
+    /// Maps paths to their database entries
+    path_to_entry: HashMap<PathBuf, DatabaseEntry>,
+
+    /// Maps inodes to paths (for detecting moves)
+    inode_to_path: HashMap<u64, PathBuf>,
+
+    /// Precision for timestamp comparison (some filesystems have lower precision)
+    timestamp_precision_ms: i64,
+
+    /// Cache for file existence checks to avoid repeated filesystem calls
+    existence_cache: HashMap<PathBuf, bool>,
+}
+
+#[derive(Debug, Clone)]
+struct DatabaseEntry {
+    id: i32,
+    path: PathBuf,
+    kind: EntryKind,
+    size: u64,
+    modified: Option<SystemTime>,
+    inode: Option<u64>,
+}
+
+impl ChangeDetector {
+    /// Create a new change detector
+    pub fn new() -> Self {
+        Self {
+            path_to_entry: HashMap::new(),
+            inode_to_path: HashMap::new(),
+            timestamp_precision_ms: 1, // Default to 1ms precision
+            existence_cache: HashMap::new(),
+        }
+    }
+
+    /// Load existing entries from database for a location, scoped to indexing path
+    pub async fn load_existing_entries(
+        &mut self,
+        ctx: &JobContext<'_>,
+        location_id: i32,
+        indexing_path: &Path,
+    ) -> Result<(), crate::infra::job::prelude::JobError> {
+        use crate::infra::db::entities;
+        use crate::infra::job::prelude::JobError;
+        use crate::ops::indexing::persistence::{DatabasePersistence, IndexPersistence};
+        use sea_orm::{ColumnTrait, EntityTrait, QueryFilter};
+
+        let location_record = entities::location::Entity::find_by_id(location_id)
+            .one(ctx.library_db())
+            .await
+            .map_err(|e| JobError::execution(format!("Failed to find location: {}", e)))?
+            .ok_or_else(|| JobError::execution("Location not found".to_string()))?;
+
+        // Create a database persistence instance to leverage the scoped query logic
+        let persistence = DatabasePersistence::new(ctx, 0, location_record.entry_id);
+
+        // Use the scoped query method
+        let existing_entries = persistence.get_existing_entries(indexing_path).await?;
+
+        // Process the results into our internal data structures
+        for (full_path, (id, inode, modified_time, size)) in existing_entries {
+            let entry_kind = if full_path.is_dir() {
+                EntryKind::Directory
+            } else {
+                EntryKind::File
+            };
+
+            let db_entry = DatabaseEntry {
+                id,
+                path: full_path.clone(),
+                kind: entry_kind,
+                size,
+                modified: modified_time,
+                inode,
+            };
+
+            self.path_to_entry.insert(full_path.clone(), db_entry);
+
+            if let Some(inode_val) = inode {
+                self.inode_to_path.insert(inode_val, full_path);
+            }
+        }
+
+        ctx.log(format!(
+            "Loaded {} existing entries for change detection",
+            self.path_to_entry.len()
+        ));
+
+        use tracing::warn;
+        if self.path_to_entry.is_empty() {
+            warn!("ChangeDetector loaded 0 entries - database may be locked or empty");
+        } else {
+            warn!(
+                "ChangeDetector loaded {} entries successfully",
+                self.path_to_entry.len()
+            );
+        }
+
+        Ok(())
+    }
+
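A minimal sketch of how an indexer scan might drive the detector end-to-end. The walk source (`walk_entries`) and its inode extraction are assumptions; the detector calls (`load_existing_entries`, `check_path`, `find_deleted`) are the API defined in this file.

```rust
use std::collections::HashSet;
use std::path::{Path, PathBuf};

async fn detect_changes(
	detector: &mut ChangeDetector,
	ctx: &JobContext<'_>,
	location_id: i32,
	root: &Path,
	// (path, metadata, inode) tuples produced by some directory walk.
	walk_entries: Vec<(PathBuf, std::fs::Metadata, Option<u64>)>,
) -> Result<Vec<Change>, crate::infra::job::prelude::JobError> {
	// Snapshot the database state for the scanned subtree.
	detector.load_existing_entries(ctx, location_id, root).await?;

	let mut seen = HashSet::new();
	let mut changes = Vec::new();

	for (path, metadata, inode) in walk_entries {
		seen.insert(path.clone());
		// New / Modified / Moved are reported here; unchanged paths yield None.
		if let Some(change) = detector.check_path(&path, &metadata, inode) {
			changes.push(change);
		}
	}

	// Anything still in the database but never seen on disk was deleted.
	changes.extend(detector.find_deleted(&seen));
	Ok(changes)
}
```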
+    /// Check if a path represents a change.
+    ///
+    /// Returns Some(Change) if the path is new, modified, or moved.
+    /// Returns None if the path exists in database with same metadata.
+    pub fn check_path(
+        &mut self,
+        path: &Path,
+        metadata: &std::fs::Metadata,
+        inode: Option<u64>,
+    ) -> Option<Change> {
+        // Check if path exists in database
+        if let Some(db_entry) = self.path_to_entry.get(path) {
+            // Check for modifications
+            if self.is_modified(db_entry, metadata) {
+                return Some(Change::Modified {
+                    path: path.to_path_buf(),
+                    entry_id: db_entry.id,
+                    old_modified: db_entry.modified,
+                    new_modified: metadata.modified().ok(),
+                });
+            }
+
+            // No change for this path
+            return None;
+        }
+
+        // Path not in database - check if it's a move or hard link
+        if let Some(inode_val) = inode {
+            if let Some(old_path) = self.inode_to_path.get(&inode_val).cloned() {
+                if old_path != path {
+                    if let Some(db_entry) = self.path_to_entry.get(&old_path).cloned() {
+                        // Check if the old path still exists on disk (with caching)
+                        if self.path_exists_cached(&old_path) {
+                            // Hard link: Both paths exist and point to same inode
+                            use tracing::debug;
+                            debug!(
+                                "Hard link detected - existing: {:?}, new: {:?}, inode: {}",
+                                old_path, path, inode_val
+                            );
+                            // Fall through to "New" - both entries should exist
+                        } else {
+                            // Genuine move: Old path no longer exists
+                            use tracing::info;
+                            info!(
+                                "Move detected - old: {:?}, new: {:?}, inode: {}",
+                                old_path, path, inode_val
+                            );
+                            return Some(Change::Moved {
+                                old_path,
+                                new_path: path.to_path_buf(),
+                                entry_id: db_entry.id,
+                                inode: inode_val,
+                            });
+                        }
+                    }
+                }
+            }
+        }
+
+        // New file/directory
+        Some(Change::New(path.to_path_buf()))
+    }
+
+    /// Find deleted entries (in DB but not seen during scan).
+    pub fn find_deleted(&self, seen_paths: &std::collections::HashSet<PathBuf>) -> Vec<Change> {
+        self.path_to_entry
+            .iter()
+            .filter(|(path, _)| !seen_paths.contains(*path))
+            .map(|(path, entry)| Change::Deleted {
+                path: path.clone(),
+                entry_id: entry.id,
+            })
+            .collect()
+    }
+
+    /// Check if an entry has been modified
+    fn is_modified(&self, db_entry: &DatabaseEntry, metadata: &std::fs::Metadata) -> bool {
+        // Check size first (fast)
+        if db_entry.size != metadata.len() {
+            return true;
+        }
+
+        // Check modification time
+        if let (Some(db_modified), Ok(fs_modified)) = (db_entry.modified, metadata.modified()) {
+            let db_time = db_modified
+                .duration_since(SystemTime::UNIX_EPOCH)
+                .unwrap_or_default()
+                .as_millis() as i64;
+            let fs_time = fs_modified
+                .duration_since(SystemTime::UNIX_EPOCH)
+                .unwrap_or_default()
+                .as_millis() as i64;
+
+            if (db_time - fs_time).abs() > self.timestamp_precision_ms {
+                return true;
+            }
+        }
+
+        false
+    }
+
+    /// Set timestamp precision for comparison (in milliseconds)
+    pub fn set_timestamp_precision(&mut self, precision_ms: i64) {
+        self.timestamp_precision_ms = precision_ms;
+    }
+
+    /// Get the number of tracked entries
+    pub fn entry_count(&self) -> usize {
+        self.path_to_entry.len()
+    }
+
+    /// Check if a path exists with caching to reduce filesystem calls
+    fn path_exists_cached(&mut self, path: &Path) -> bool {
+        if let Some(&cached_result) = self.existence_cache.get(path) {
+            return cached_result;
+        }
+
+        let exists = path.exists();
+        self.existence_cache.insert(path.to_path_buf(), exists);
+        exists
+    }
+}
+
+impl Default for ChangeDetector {
+    fn default() -> Self {
+        Self::new()
+    }
+}
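A worked example of why the millisecond tolerance in `is_modified` matters: FAT-family filesystems round mtimes to 2-second granularity, so a detector left at the 1ms default would flag every such file as modified after a copy. The numbers below are illustrative.

```rust
#[test]
fn timestamp_tolerance_example() {
	let db_time_ms: i64 = 1_700_000_000_000;
	let fs_time_ms: i64 = 1_700_000_001_500; // 1.5s of rounding drift

	// Default 1ms precision: the drift exceeds the tolerance -> Modified.
	let default_precision = 1;
	assert!((db_time_ms - fs_time_ms).abs() > default_precision);

	// After set_timestamp_precision(2_000): drift is within tolerance -> unchanged.
	let fat_precision = 2_000;
	assert!((db_time_ms - fs_time_ms).abs() <= fat_precision);
}
```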
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_new_file_detection() {
+        let mut detector = ChangeDetector::new();
+        let new_path = PathBuf::from("/test/new_file.txt");
+
+        // Create a temporary file for testing
+        let temp_dir = tempfile::tempdir().unwrap();
+        let test_file = temp_dir.path().join("test.txt");
+        std::fs::write(&test_file, "test content").unwrap();
+        let metadata = std::fs::metadata(&test_file).unwrap();
+
+        let result = detector.check_path(&new_path, &metadata, None);
+        match result {
+            Some(Change::New(path)) => assert_eq!(path, new_path),
+            _ => panic!("Expected new file detection"),
+        }
+    }
+
+    #[test]
+    fn test_entry_count() {
+        let detector = ChangeDetector::new();
+        assert_eq!(detector.entry_count(), 0);
+    }
+}
diff --git a/core/src/ops/indexing/change_detection/ephemeral.rs b/core/src/ops/indexing/change_detection/ephemeral.rs
new file mode 100644
index 000000000..61fffb29d
--- /dev/null
+++ b/core/src/ops/indexing/change_detection/ephemeral.rs
@@ -0,0 +1,244 @@
+//! Ephemeral (memory-backed) change handler for browsing unmanaged paths.
+//!
+//! Updates the EphemeralIndex directly without database writes.
+//! Skips the processor pipeline (no thumbnails/content hash for ephemeral).
+
+use super::handler::{build_dir_entry, ChangeHandler};
+use super::types::{ChangeType, EntryRef};
+use crate::infra::event::EventBus;
+use crate::ops::indexing::entry::EntryMetadata;
+use crate::ops::indexing::job::EphemeralIndex;
+use crate::ops::indexing::state::{DirEntry, EntryKind};
+use anyhow::Result;
+use std::path::{Path, PathBuf};
+use std::sync::atomic::AtomicI32;
+use std::sync::Arc;
+use tokio::sync::RwLock;
+use uuid::Uuid;
+
+/// Memory-backed change handler for ephemeral browsing.
+pub struct EphemeralChangeHandler {
+    index: Arc<RwLock<EphemeralIndex>>,
+    event_bus: Arc<EventBus>,
+    root_path: PathBuf,
+    next_id: AtomicI32,
+}
+
+impl EphemeralChangeHandler {
+    pub fn new(
+        index: Arc<RwLock<EphemeralIndex>>,
+        event_bus: Arc<EventBus>,
+        root_path: PathBuf,
+    ) -> Self {
+        Self {
+            index,
+            event_bus,
+            root_path,
+            next_id: AtomicI32::new(1),
+        }
+    }
+
+    fn next_id(&self) -> i32 {
+        self.next_id
+            .fetch_add(1, std::sync::atomic::Ordering::SeqCst)
+    }
+}
+
+#[async_trait::async_trait]
+impl ChangeHandler for EphemeralChangeHandler {
+    async fn find_by_path(&self, path: &Path) -> Result<Option<EntryRef>> {
+        let index = self.index.read().await;
+
+        if let Some(metadata) = index.get_entry_ref(&path.to_path_buf()) {
+            let uuid = index.get_entry_uuid(&path.to_path_buf());
+
+            Ok(Some(EntryRef {
+                id: 0,
+                uuid,
+                path: path.to_path_buf(),
+                kind: metadata.kind,
+            }))
+        } else {
+            Ok(None)
+        }
+    }
+
+    async fn find_by_inode(&self, _inode: u64) -> Result<Option<EntryRef>> {
+        Ok(None)
+    }
+
+    async fn create(&mut self, metadata: &DirEntry, _parent_path: &Path) -> Result<EntryRef> {
+        let entry_uuid = Uuid::new_v4();
+        let entry_metadata = EntryMetadata::from(metadata.clone());
+
+        {
+            let mut index = self.index.write().await;
+            index
+                .add_entry(metadata.path.clone(), entry_uuid, entry_metadata)
+                .map_err(|e| anyhow::anyhow!("Failed to add entry to ephemeral index: {}", e))?;
+        }
+
+        Ok(EntryRef {
+            id: self.next_id(),
+            uuid: Some(entry_uuid),
+            path: metadata.path.clone(),
+            kind: metadata.kind,
+        })
+    }
+
+    async fn update(&mut self, entry: &EntryRef, metadata: &DirEntry) -> Result<()> {
+        let uuid = entry.uuid.unwrap_or_else(Uuid::new_v4);
+        let entry_metadata = EntryMetadata::from(metadata.clone());
+
+        {
+            let mut index = self.index.write().await;
+            let _ = index.add_entry(metadata.path.clone(), uuid, entry_metadata);
+        }
+
+        Ok(())
+    }
+
+    async fn move_entry(
+        &mut self,
+        entry: &EntryRef,
+        old_path: &Path,
+        new_path: &Path,
+        _new_parent_path: &Path,
+    ) -> Result<()> {
+        let metadata = build_dir_entry(new_path, None).await?;
+
+        {
+            let mut index = self.index.write().await;
+            index.remove_entry(old_path);
+
+            let uuid = entry.uuid.unwrap_or_else(Uuid::new_v4);
+            let entry_metadata = EntryMetadata::from(metadata.clone());
+            let _ = index.add_entry(new_path.to_path_buf(), uuid, entry_metadata);
+        }
+
+        Ok(())
+    }
+
+    async fn delete(&mut self, entry: &EntryRef) -> Result<()> {
+        {
+            let mut index = self.index.write().await;
+
+            if entry.is_directory() {
+                index.remove_directory_tree(&entry.path);
+            } else {
+                index.remove_entry(&entry.path);
+            }
+        }
+
+        Ok(())
+    }
+
+    async fn run_processors(&self, _entry: &EntryRef, _is_new: bool) -> Result<()> {
+        Ok(())
+    }
+
+    async fn emit_change_event(&self, entry: &EntryRef, _change_type: ChangeType) -> Result<()> {
+        use crate::device::get_current_device_slug;
+        use crate::domain::addressing::SdPath;
+        use crate::domain::file::File;
+        use crate::infra::event::{Event, ResourceMetadata};
+
+        let Some(uuid) = entry.uuid else {
+            return Ok(());
+        };
+
+        let device_slug = get_current_device_slug();
+
+        let sd_path = SdPath::Physical {
+            device_slug: device_slug.clone(),
+            path: entry.path.clone(),
+        };
+
+        let content_kind = {
+            let index = self.index.read().await;
+            index.get_content_kind(&entry.path)
+        };
+
+        let metadata = build_dir_entry(&entry.path, None).await.ok();
+
+        if let Some(meta) = metadata {
+            let entry_metadata = EntryMetadata::from(meta);
+            let mut file = File::from_ephemeral(uuid, &entry_metadata, sd_path);
+            file.content_kind = content_kind;
+
+            let parent_path = entry.path.parent().map(|p| SdPath::Physical {
+                device_slug: file.sd_path.device_slug().unwrap_or("local").to_string(),
+                path: p.to_path_buf(),
+            });
+
+            let affected_paths = parent_path.into_iter().collect();
+
+            if let Ok(resource_json) = serde_json::to_value(&file) {
+                self.event_bus.emit(Event::ResourceChanged {
+                    resource_type: "file".to_string(),
+                    resource: resource_json,
+                    metadata: Some(ResourceMetadata {
+                        no_merge_fields: vec!["sd_path".to_string()],
+                        alternate_ids: vec![],
+                        affected_paths,
+                    }),
+                });
+            }
+        }
+
+        Ok(())
+    }
+
+    async fn handle_new_directory(&self, path: &Path) -> Result<()> {
+        use crate::ops::indexing::entry::EntryProcessor;
+
+        let mut entries = match tokio::fs::read_dir(path).await {
+            Ok(e) => e,
+            Err(e) => {
+                tracing::warn!(
+                    "Failed to read directory {} for ephemeral indexing: {}",
+                    path.display(),
+                    e
+                );
+                return Ok(());
+            }
+        };
+
+        let mut index = self.index.write().await;
+
+        while let Ok(Some(entry)) = entries.next_entry().await {
+            let entry_path = entry.path();
+
+            if let Ok(metadata) = entry.metadata().await {
+                let kind = if metadata.is_dir() {
+                    EntryKind::Directory
+                } else if metadata.is_symlink() {
+                    EntryKind::Symlink
+                } else {
+                    EntryKind::File
+                };
+
+                let entry_metadata = EntryMetadata {
+                    path: entry_path.clone(),
+                    kind,
+                    size: metadata.len(),
+                    modified: metadata.modified().ok(),
+                    accessed: metadata.accessed().ok(),
+                    created: metadata.created().ok(),
+                    inode: EntryProcessor::get_inode(&metadata),
+                    permissions: None,
+                    is_hidden: entry_path
+                        .file_name()
+                        .and_then(|n| n.to_str())
+                        .map(|n| n.starts_with('.'))
+                        .unwrap_or(false),
+                };
+
+                let uuid = Uuid::new_v4();
+                let _ = index.add_entry(entry_path, uuid, entry_metadata);
+            }
+        }
+
+        Ok(())
+    }
+}
diff --git a/core/src/ops/indexing/change_detection/handler.rs b/core/src/ops/indexing/change_detection/handler.rs
new file mode 100644
index 000000000..03dca2c41
--- /dev/null
+++ b/core/src/ops/indexing/change_detection/handler.rs
@@ -0,0 +1,523 @@
+//! Change handler for responding to filesystem events.
+//!
+//! This module provides the `ChangeHandler` trait and shared logic for
+//! processing filesystem changes. Both persistent (database) and ephemeral
+//! (in-memory) handlers implement this trait.
+
+use super::types::{ChangeConfig, ChangeType, EntryRef};
+use crate::ops::indexing::rules::{build_default_ruler, RuleToggles, RulerDecision};
+use crate::ops::indexing::state::{DirEntry, EntryKind};
+use anyhow::Result;
+use std::path::Path;
+use std::sync::Arc;
+
+/// Abstracts storage operations for filesystem change handling.
+///
+/// Both persistent (database) and ephemeral (in-memory) handlers implement
+/// this trait, allowing the same change processing logic to work with both
+/// storage backends.
+#[async_trait::async_trait]
+pub trait ChangeHandler: Send + Sync {
+    /// Find an entry by its full filesystem path.
+    async fn find_by_path(&self, path: &Path) -> Result<Option<EntryRef>>;
+
+    /// Find an entry by inode (for move detection).
+    async fn find_by_inode(&self, inode: u64) -> Result<Option<EntryRef>>;
+
+    /// Create a new entry from filesystem metadata.
+    async fn create(&mut self, metadata: &DirEntry, parent_path: &Path) -> Result<EntryRef>;
+
+    /// Update an existing entry's metadata.
+    async fn update(&mut self, entry: &EntryRef, metadata: &DirEntry) -> Result<()>;
+
+    /// Move an entry from old path to new path.
+    async fn move_entry(
+        &mut self,
+        entry: &EntryRef,
+        old_path: &Path,
+        new_path: &Path,
+        new_parent_path: &Path,
+    ) -> Result<()>;
+
+    /// Delete an entry and all its descendants.
+    async fn delete(&mut self, entry: &EntryRef) -> Result<()>;
+
+    /// Run post-create/modify processors (thumbnails, content hash).
+    /// No-op for ephemeral handlers.
+    async fn run_processors(&self, entry: &EntryRef, is_new: bool) -> Result<()>;
+
+    /// Emit appropriate events for UI updates.
+    async fn emit_change_event(&self, entry: &EntryRef, change_type: ChangeType) -> Result<()>;
+
+    /// Handle directory recursion after creation.
+    /// Persistent: spawns indexer job. Ephemeral: inline shallow index.
+    async fn handle_new_directory(&self, path: &Path) -> Result<()>;
+}
+
+// ============================================================================
+// Shared Logic - Used by both handlers
+// ============================================================================
+
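To make the trait's contract concrete, here is a minimal do-nothing handler that only logs and keeps entries in a plain map. It is a sketch for tests, not part of the module: the in-memory map, the synthetic IDs, and the assumption that `EntryRef` derives `Clone` are all illustrative.

```rust
use std::collections::HashMap;
use std::path::PathBuf;

struct LoggingHandler {
	by_path: HashMap<PathBuf, EntryRef>,
	next_id: i32,
}

#[async_trait::async_trait]
impl ChangeHandler for LoggingHandler {
	async fn find_by_path(&self, path: &Path) -> Result<Option<EntryRef>> {
		Ok(self.by_path.get(path).cloned())
	}
	async fn find_by_inode(&self, _inode: u64) -> Result<Option<EntryRef>> {
		Ok(None) // no inode tracking in this sketch, so moves degrade to remove+create
	}
	async fn create(&mut self, metadata: &DirEntry, _parent_path: &Path) -> Result<EntryRef> {
		self.next_id += 1;
		let entry = EntryRef {
			id: self.next_id,
			uuid: None,
			path: metadata.path.clone(),
			kind: metadata.kind,
		};
		self.by_path.insert(metadata.path.clone(), entry.clone());
		Ok(entry)
	}
	async fn update(&mut self, entry: &EntryRef, _metadata: &DirEntry) -> Result<()> {
		tracing::info!("update {}", entry.path.display());
		Ok(())
	}
	async fn move_entry(
		&mut self,
		entry: &EntryRef,
		old_path: &Path,
		new_path: &Path,
		_new_parent_path: &Path,
	) -> Result<()> {
		let mut moved = self.by_path.remove(old_path).unwrap_or_else(|| entry.clone());
		moved.path = new_path.to_path_buf();
		self.by_path.insert(new_path.to_path_buf(), moved);
		Ok(())
	}
	async fn delete(&mut self, entry: &EntryRef) -> Result<()> {
		self.by_path.remove(&entry.path);
		Ok(())
	}
	async fn run_processors(&self, _entry: &EntryRef, _is_new: bool) -> Result<()> {
		Ok(()) // nothing to post-process in a logging handler
	}
	async fn emit_change_event(&self, entry: &EntryRef, change_type: ChangeType) -> Result<()> {
		tracing::info!("{:?}: {}", change_type, entry.path.display());
		Ok(())
	}
	async fn handle_new_directory(&self, _path: &Path) -> Result<()> {
		Ok(()) // no recursion in this sketch
	}
}
```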
+/// Check if a path exists, distinguishing between "doesn't exist" and "can't access".
+///
+/// Critical for preventing false deletions when volumes go offline.
+pub async fn path_exists_safe(
+    path: &Path,
+    backend: Option<&Arc<dyn crate::volume::backend::VolumeBackend>>,
+) -> Result<bool> {
+    use crate::volume::error::VolumeError;
+
+    if let Some(backend) = backend {
+        match backend.exists(path).await {
+            Ok(exists) => Ok(exists),
+            Err(VolumeError::NotMounted(_)) => {
+                tracing::warn!(
+                    "Volume not mounted when checking path existence: {}",
+                    path.display()
+                );
+                Err(anyhow::anyhow!(
+                    "Volume not mounted, cannot verify path existence"
+                ))
+            }
+            Err(VolumeError::Io(ref e)) if e.kind() == std::io::ErrorKind::NotFound => Ok(false),
+            Err(VolumeError::Io(io_err)) => {
+                tracing::warn!(
+                    "IO error when checking path existence for {}: {}",
+                    path.display(),
+                    io_err
+                );
+                Err(anyhow::anyhow!(
+                    "IO error, volume may be offline: {}",
+                    io_err
+                ))
+            }
+            Err(e) => {
+                tracing::warn!(
+                    "Volume error when checking path existence for {}: {}",
+                    path.display(),
+                    e
+                );
+                Err(e.into())
+            }
+        }
+    } else {
+        match tokio::fs::try_exists(path).await {
+            Ok(exists) => Ok(exists),
+            Err(e) => {
+                tracing::warn!(
+                    "Cannot verify path existence for {} (volume may be offline): {}",
+                    path.display(),
+                    e
+                );
+                Err(anyhow::anyhow!("Cannot access path: {}", e))
+            }
+        }
+    }
+}
+
+/// Evaluates indexing rules to determine if a path should be skipped.
+pub async fn should_filter_path(
+    path: &Path,
+    rule_toggles: RuleToggles,
+    location_root: &Path,
+    backend: Option<&Arc<dyn crate::volume::backend::VolumeBackend>>,
+) -> Result<bool> {
+    let ruler = build_default_ruler(rule_toggles, location_root, path).await;
+
+    let metadata = if let Some(backend) = backend {
+        backend
+            .metadata(path)
+            .await
+            .map_err(|e| anyhow::anyhow!("Failed to get metadata via backend: {}", e))?
+    } else {
+        let fs_meta = tokio::fs::metadata(path).await?;
+        crate::volume::backend::RawMetadata {
+            kind: if fs_meta.is_dir() {
+                EntryKind::Directory
+            } else if fs_meta.is_symlink() {
+                EntryKind::Symlink
+            } else {
+                EntryKind::File
+            },
+            size: fs_meta.len(),
+            modified: fs_meta.modified().ok(),
+            created: fs_meta.created().ok(),
+            accessed: fs_meta.accessed().ok(),
+            inode: None,
+            permissions: None,
+        }
+    };
+
+    struct SimpleMetadata {
+        is_dir: bool,
+    }
+    impl crate::ops::indexing::rules::MetadataForIndexerRules for SimpleMetadata {
+        fn is_dir(&self) -> bool {
+            self.is_dir
+        }
+    }
+
+    let simple_meta = SimpleMetadata {
+        is_dir: metadata.kind == EntryKind::Directory,
+    };
+
+    match ruler.evaluate_path(path, &simple_meta).await {
+        Ok(RulerDecision::Reject) => {
+            tracing::debug!("Filtered path by indexing rules: {}", path.display());
+            Ok(true)
+        }
+        Ok(RulerDecision::Accept) => Ok(false),
+        Err(e) => {
+            tracing::warn!("Error evaluating rules for {}: {}", path.display(), e);
+            Ok(false)
+        }
+    }
+}
+
+/// Extracts filesystem metadata into a DirEntry.
+pub async fn build_dir_entry(
+    path: &Path,
+    backend: Option<&Arc<dyn crate::volume::backend::VolumeBackend>>,
+) -> Result<DirEntry> {
+    use crate::ops::indexing::entry::EntryProcessor;
+
+    let meta = EntryProcessor::extract_metadata(path, backend).await?;
+    Ok(DirEntry {
+        path: meta.path,
+        kind: meta.kind,
+        size: meta.size,
+        modified: meta.modified,
+        inode: meta.inode,
+    })
+}
+
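The three-way outcome of `path_exists_safe` is the whole point of the function: only a confirmed `Ok(false)` may trigger deletion, while an `Err` means the volume may simply be offline and the entry must be kept. A minimal caller sketch (the `reconcile` function and its surrounding flow are assumed for illustration):

```rust
async fn reconcile(path: &Path) -> Result<()> {
	// `None` backend falls back to tokio::fs::try_exists.
	match path_exists_safe(path, None).await {
		Ok(true) => { /* entry still present, nothing to do */ }
		Ok(false) => {
			// Confirmed gone: it is safe to delete the indexed entry.
		}
		Err(e) => {
			// Unreachable volume: keep the entry and retry later.
			tracing::warn!("skipping reconcile for {}: {}", path.display(), e);
		}
	}
	Ok(())
}
```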
+// ============================================================================
+// Generic Change Application
+// ============================================================================
+
+/// Apply a batch of filesystem changes using the provided handler.
+///
+/// Processes events in the correct order: removes first, then renames,
+/// creates, and finally modifies.
+pub async fn apply_batch<H: ChangeHandler>(
+    handler: &mut H,
+    events: Vec<crate::infra::event::FsRawEventKind>,
+    config: &ChangeConfig<'_>,
+) -> Result<()> {
+    use crate::infra::event::FsRawEventKind;
+
+    if events.is_empty() {
+        return Ok(());
+    }
+
+    let mut creates = Vec::new();
+    let mut modifies = Vec::new();
+    let mut removes = Vec::new();
+    let mut renames = Vec::new();
+
+    for event in events {
+        match event {
+            FsRawEventKind::Create { path } => creates.push(path),
+            FsRawEventKind::Modify { path } => modifies.push(path),
+            FsRawEventKind::Remove { path } => removes.push(path),
+            FsRawEventKind::Rename { from, to } => renames.push((from, to)),
+        }
+    }
+
+    // Deduplicate
+    creates.sort();
+    creates.dedup();
+    modifies.sort();
+    modifies.dedup();
+    removes.sort();
+    removes.dedup();
+
+    tracing::debug!(
+        "Processing batch: {} creates, {} modifies, {} removes, {} renames",
+        creates.len(),
+        modifies.len(),
+        removes.len(),
+        renames.len()
+    );
+
+    // Process in order: removes, renames, creates, modifies
+    for path in removes {
+        if let Err(e) = handle_remove(handler, &path).await {
+            tracing::error!("Failed to handle remove for {}: {}", path.display(), e);
+        }
+    }
+
+    for (from, to) in renames {
+        if let Err(e) = handle_rename(handler, &from, &to, config).await {
+            tracing::error!(
+                "Failed to handle rename from {} to {}: {}",
+                from.display(),
+                to.display(),
+                e
+            );
+        }
+    }
+
+    for path in creates {
+        if let Err(e) = handle_create(handler, &path, config).await {
+            tracing::error!("Failed to handle create for {}: {}", path.display(), e);
+        }
+    }
+
+    for path in modifies {
+        if let Err(e) = handle_modify(handler, &path, config).await {
+            tracing::error!("Failed to handle modify for {}: {}", path.display(), e);
+        }
+    }
+
+    Ok(())
+}
+
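A sketch of draining a debounced watcher batch into a handler. The `ChangeConfig` field names match how the handlers in this module read them (`rule_toggles`, `location_root`, `volume_backend`); the struct-literal construction and the `flush_batch` wrapper are assumptions for illustration.

```rust
use crate::infra::event::FsRawEventKind;

async fn flush_batch<H: ChangeHandler>(
	handler: &mut H,
	root: &Path,
	toggles: RuleToggles,
) -> Result<()> {
	let events = vec![
		FsRawEventKind::Create { path: root.join("new.txt") },
		FsRawEventKind::Modify { path: root.join("new.txt") },
		// Duplicate events are deduplicated before processing.
		FsRawEventKind::Modify { path: root.join("new.txt") },
		FsRawEventKind::Remove { path: root.join("old.txt") },
	];

	let config = ChangeConfig {
		rule_toggles: toggles,
		location_root: root,
		volume_backend: None,
	};

	// Removes run first, then renames, creates, and modifies.
	apply_batch(handler, events, &config).await
}
```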
{ + if existing.path != path { + tracing::debug!( + "Detected inode-based move: {} -> {}", + existing.path.display(), + path.display() + ); + let old_path = existing.path.clone(); + handler + .move_entry( + &existing, + &old_path, + path, + path.parent().unwrap_or(Path::new("/")), + ) + .await?; + handler + .emit_change_event(&existing, ChangeType::Moved) + .await?; + return Ok(()); + } + } + } + + let parent_path = path.parent().unwrap_or(Path::new("/")); + let entry = handler.create(&metadata, parent_path).await?; + + if entry.is_directory() { + handler.handle_new_directory(path).await?; + } else { + handler.run_processors(&entry, true).await?; + } + + handler + .emit_change_event(&entry, ChangeType::Created) + .await?; + + Ok(()) +} + +/// Handle a modify event. +pub async fn handle_modify( + handler: &mut H, + path: &Path, + config: &ChangeConfig<'_>, +) -> Result<()> { + tracing::debug!("Modify: {}", path.display()); + + match path_exists_safe(path, config.volume_backend).await { + Ok(true) => {} + Ok(false) => { + tracing::debug!("Path no longer exists, skipping modify: {}", path.display()); + return Ok(()); + } + Err(e) => { + tracing::warn!( + "Skipping modify event for inaccessible path {}: {}", + path.display(), + e + ); + return Ok(()); + } + } + + if should_filter_path( + path, + config.rule_toggles, + config.location_root, + config.volume_backend, + ) + .await? + { + tracing::debug!("Skipping filtered path: {}", path.display()); + return Ok(()); + } + + let metadata = build_dir_entry(path, config.volume_backend).await?; + + if let Some(inode) = metadata.inode { + if let Some(existing) = handler.find_by_inode(inode).await? { + if existing.path != path { + tracing::debug!( + "Detected inode-based move during modify: {} -> {}", + existing.path.display(), + path.display() + ); + let old_path = existing.path.clone(); + handler + .move_entry( + &existing, + &old_path, + path, + path.parent().unwrap_or(Path::new("/")), + ) + .await?; + handler + .emit_change_event(&existing, ChangeType::Moved) + .await?; + return Ok(()); + } + } + } + + if let Some(entry) = handler.find_by_path(path).await? { + handler.update(&entry, &metadata).await?; + + if !entry.is_directory() { + handler.run_processors(&entry, false).await?; + } + + handler + .emit_change_event(&entry, ChangeType::Modified) + .await?; + } else { + tracing::debug!( + "Entry not found for path, skipping modify: {}", + path.display() + ); + } + + Ok(()) +} + +/// Handle a remove event. +pub async fn handle_remove(handler: &mut H, path: &Path) -> Result<()> { + tracing::debug!("Remove: {}", path.display()); + + if let Some(entry) = handler.find_by_path(path).await? { + handler.delete(&entry).await?; + handler + .emit_change_event(&entry, ChangeType::Deleted) + .await?; + tracing::debug!("Deleted entry for path: {}", path.display()); + } else { + tracing::debug!( + "Entry not found for path, skipping remove: {}", + path.display() + ); + } + + Ok(()) +} + +/// Handle a rename event. 
+pub async fn handle_rename( + handler: &mut H, + from: &Path, + to: &Path, + config: &ChangeConfig<'_>, +) -> Result<()> { + tracing::debug!("Rename: {} -> {}", from.display(), to.display()); + + match path_exists_safe(to, config.volume_backend).await { + Ok(true) => {} + Ok(false) => { + tracing::debug!( + "Destination path doesn't exist, skipping rename: {}", + to.display() + ); + return Ok(()); + } + Err(e) => { + tracing::warn!( + "Skipping rename event for inaccessible destination {}: {}", + to.display(), + e + ); + return Ok(()); + } + } + + if should_filter_path( + to, + config.rule_toggles, + config.location_root, + config.volume_backend, + ) + .await? + { + tracing::debug!( + "Destination path is filtered, removing entry: {}", + to.display() + ); + return handle_remove(handler, from).await; + } + + if let Some(entry) = handler.find_by_path(from).await? { + handler + .move_entry(&entry, from, to, to.parent().unwrap_or(Path::new("/"))) + .await?; + handler.emit_change_event(&entry, ChangeType::Moved).await?; + tracing::debug!("Moved entry {} -> {}", from.display(), to.display()); + } else { + tracing::debug!( + "Entry not found for old path {}, skipping rename", + from.display() + ); + } + + Ok(()) +} diff --git a/core/src/ops/indexing/change_detection/mod.rs b/core/src/ops/indexing/change_detection/mod.rs index 0387867dd..2fdc17829 100644 --- a/core/src/ops/indexing/change_detection/mod.rs +++ b/core/src/ops/indexing/change_detection/mod.rs @@ -1,423 +1,56 @@ -//! Change detection for incremental indexing +//! Change detection and handling for the indexing system. //! -//! This module provides efficient change detection using: -//! - Inode tracking for move/rename detection -//! - Modification time comparison -//! - Size verification -//! - Directory hierarchy tracking +//! This module provides two complementary subsystems: +//! +//! 1. **Detection** (`detector.rs`): Batch scanning during indexer jobs. +//! Compares database state against filesystem to identify changes. +//! +//! 2. **Handling** (`handler.rs`): Real-time response to watcher events. +//! Applies changes (create/modify/move/delete) to storage. +//! +//! Both systems use the same `Change` type and share concepts like +//! inode-based move detection, ensuring consistent behavior. +//! +//! ## Architecture +//! +//! ```text +//! ┌─────────────────────────────────────────────────────────────┐ +//! │ Change Detection │ +//! ├─────────────────────────────────────────────────────────────┤ +//! │ │ +//! │ ┌─────────────┐ ┌─────────────┐ │ +//! │ │ Detector │ │ Handler │ │ +//! │ │ (batch) │ │ (real-time)│ │ +//! │ └──────┬──────┘ └──────┬──────┘ │ +//! │ │ │ │ +//! │ │ ┌─────────┐ │ │ +//! │ └────►│ Change │◄─────────┘ │ +//! │ │ enum │ │ +//! │ └────┬────┘ │ +//! │ │ │ +//! │ ┌──────────┴──────────┐ │ +//! │ ▼ ▼ │ +//! │ ┌─────────────┐ ┌─────────────┐ │ +//! │ │ Persistent │ │ Ephemeral │ │ +//! │ │ Handler │ │ Handler │ │ +//! │ │ (database) │ │ (in-memory) │ │ +//! │ └─────────────┘ └─────────────┘ │ +//! │ │ +//! └─────────────────────────────────────────────────────────────┘ +//! 
``` -use super::state::EntryKind; -use crate::infra::{db::entities, job::prelude::JobContext}; -use sea_orm::{ColumnTrait, EntityTrait, QueryFilter, QuerySelect}; -use std::{ - collections::HashMap, - path::{Path, PathBuf}, - time::SystemTime, +pub mod detector; +pub mod ephemeral; +pub mod handler; +pub mod persistent; +pub mod types; + +// Re-export primary types +pub use detector::ChangeDetector; +pub use ephemeral::EphemeralChangeHandler; +pub use handler::{ + apply_batch, build_dir_entry, handle_create, handle_modify, handle_remove, handle_rename, + path_exists_safe, should_filter_path, ChangeHandler, }; - -/// Represents a change detected in the file system -#[derive(Debug, Clone)] -pub enum Change { - /// New file/directory not in database - New(PathBuf), - - /// File/directory modified (content or metadata changed) - Modified { - path: PathBuf, - entry_id: i32, - old_modified: Option, - new_modified: Option, - }, - - /// File/directory moved or renamed (same inode, different path) - Moved { - old_path: PathBuf, - new_path: PathBuf, - entry_id: i32, - inode: u64, - }, - - /// File/directory deleted (exists in DB but not on disk) - Deleted { path: PathBuf, entry_id: i32 }, -} - -/// Tracks changes between database state and file system -pub struct ChangeDetector { - /// Maps paths to their database entries - path_to_entry: HashMap, - - /// Maps inodes to paths (for detecting moves) - inode_to_path: HashMap, - - /// Precision for timestamp comparison (some filesystems have lower precision) - timestamp_precision_ms: i64, - - /// Cache for file existence checks to avoid repeated filesystem calls - existence_cache: HashMap, -} - -#[derive(Debug, Clone)] -struct DatabaseEntry { - id: i32, - path: PathBuf, - kind: EntryKind, - size: u64, - modified: Option, - inode: Option, -} - -impl ChangeDetector { - /// Create a new change detector - pub fn new() -> Self { - Self { - path_to_entry: HashMap::new(), - inode_to_path: HashMap::new(), - timestamp_precision_ms: 1, // Default to 1ms precision - existence_cache: HashMap::new(), - } - } - - /// Load existing entries from database for a location, scoped to indexing path - pub async fn load_existing_entries( - &mut self, - ctx: &JobContext<'_>, - location_id: i32, - indexing_path: &Path, - ) -> Result<(), crate::infra::job::prelude::JobError> { - use super::persistence::{DatabasePersistence, IndexPersistence}; - use crate::infra::job::prelude::JobError; - - // For change detection, we need to get the location's root entry ID - use crate::infra::db::entities; - use sea_orm::{ColumnTrait, EntityTrait, QueryFilter}; - - let location_record = entities::location::Entity::find_by_id(location_id) - .one(ctx.library_db()) - .await - .map_err(|e| JobError::execution(format!("Failed to find location: {}", e)))? 
- .ok_or_else(|| JobError::execution("Location not found".to_string()))?; - - // Create a database persistence instance to leverage the scoped query logic - let persistence = DatabasePersistence::new(ctx, 0, location_record.entry_id); // device_id not needed for query - - // Use the scoped query method - let existing_entries = persistence.get_existing_entries(indexing_path).await?; - - // Process the results into our internal data structures - for (full_path, (id, inode, modified_time, size)) in existing_entries { - // Determine entry kind from the path (we could query this, but for change detection we mainly care about existence) - // For now, we'll assume File for simplicity since change detection primarily cares about path/inode/timestamp - let entry_kind = if full_path.is_dir() { - EntryKind::Directory - } else { - EntryKind::File - }; - - // Now we have accurate size information from the database - let db_entry = DatabaseEntry { - id, - path: full_path.clone(), - kind: entry_kind, - size, - modified: modified_time, - inode, - }; - - // Track by path - self.path_to_entry.insert(full_path.clone(), db_entry); - - // Track by inode if available - if let Some(inode_val) = inode { - self.inode_to_path.insert(inode_val, full_path); - } - } - - ctx.log(format!( - "Loaded {} existing entries for change detection", - self.path_to_entry.len() - )); - - // DEBUG: Log if we failed to load entries - use tracing::warn; - if self.path_to_entry.is_empty() { - warn!("DEBUG: ChangeDetector loaded 0 entries - database may be locked or empty"); - } else { - warn!( - "DEBUG: ChangeDetector loaded {} entries successfully", - self.path_to_entry.len() - ); - } - - Ok(()) - } - - /// Check if a path represents a change - pub fn check_path( - &mut self, - path: &Path, - metadata: &std::fs::Metadata, - inode: Option, - ) -> Option { - // Check if path exists in database - if let Some(db_entry) = self.path_to_entry.get(path) { - // Check for modifications - if self.is_modified(db_entry, metadata) { - return Some(Change::Modified { - path: path.to_path_buf(), - entry_id: db_entry.id, - old_modified: db_entry.modified, - new_modified: metadata.modified().ok(), - }); - } - - // No change for this path - return None; - } - - // Path not in database - check if it's a move or hard link - if let Some(inode_val) = inode { - if let Some(old_path) = self.inode_to_path.get(&inode_val).cloned() { - if old_path != path { - if let Some(db_entry) = self.path_to_entry.get(&old_path).cloned() { - // Check if the old path still exists on disk (with caching) - // - If old path exists: This is a hard link (both paths are valid) - // - If old path doesn't exist: This is a genuine move - if self.path_exists_cached(&old_path) { - // Hard link: Both paths exist and point to same inode - // Treat current path as a new entry (don't skip it) - use tracing::debug; - debug!( - "Hard link detected - existing: {:?}, new: {:?}, inode: {}", - old_path, path, inode_val - ); - // Fall through to "New file/directory" - both entries should exist - } else { - // Genuine move: Old path no longer exists, same inode at new path - use tracing::info; - info!( - "Genuine move detected - old: {:?}, new: {:?}, inode: {}", - old_path, path, inode_val - ); - return Some(Change::Moved { - old_path, - new_path: path.to_path_buf(), - entry_id: db_entry.id, - inode: inode_val, - }); - } - } - } - } - } - - // New file/directory - Some(Change::New(path.to_path_buf())) - } - - /// Find deleted entries (in DB but not seen during scan) - pub fn find_deleted(&self, 
seen_paths: &std::collections::HashSet) -> Vec { - self.path_to_entry - .iter() - .filter(|(path, _)| !seen_paths.contains(*path)) - .map(|(path, entry)| Change::Deleted { - path: path.clone(), - entry_id: entry.id, - }) - .collect() - } - - /// Check if an entry has been modified - fn is_modified(&self, db_entry: &DatabaseEntry, metadata: &std::fs::Metadata) -> bool { - // Check size first (fast) - if db_entry.size != metadata.len() { - return true; - } - - // Check modification time - if let (Some(db_modified), Ok(fs_modified)) = (db_entry.modified, metadata.modified()) { - // Compare with precision tolerance - let db_time = db_modified - .duration_since(SystemTime::UNIX_EPOCH) - .unwrap_or_default() - .as_millis() as i64; - let fs_time = fs_modified - .duration_since(SystemTime::UNIX_EPOCH) - .unwrap_or_default() - .as_millis() as i64; - - if (db_time - fs_time).abs() > self.timestamp_precision_ms { - return true; - } - } - - false - } - - /// Set timestamp precision for comparison (in milliseconds) - pub fn set_timestamp_precision(&mut self, precision_ms: i64) { - self.timestamp_precision_ms = precision_ms; - } - - /// Get the number of tracked entries - pub fn entry_count(&self) -> usize { - self.path_to_entry.len() - } - - /// Check if a path exists with caching to reduce filesystem calls - fn path_exists_cached(&mut self, path: &Path) -> bool { - // Check cache first - if let Some(&cached_result) = self.existence_cache.get(path) { - return cached_result; - } - - // Not in cache, check filesystem and cache the result - let exists = path.exists(); - self.existence_cache.insert(path.to_path_buf(), exists); - exists - } -} - -#[cfg(test)] -mod tests { - use super::*; - use std::time::SystemTime; - - // Mock metadata struct for testing - pub struct MockMetadata { - size: u64, - modified: SystemTime, - } - - impl MockMetadata { - pub fn new(size: u64) -> Self { - Self { - size, - modified: SystemTime::now(), - } - } - - pub fn len(&self) -> u64 { - self.size - } - - pub fn modified(&self) -> Result { - Ok(self.modified) - } - } - - // Helper to test change detection with mock metadata - fn test_check_path( - detector: &mut ChangeDetector, - path: &Path, - size: u64, - inode: Option, - ) -> Option { - let mock_metadata = MockMetadata::new(size); - - // We need to manually call the logic since we can't easily mock std::fs::Metadata - // Check if path exists in database - if let Some(db_entry) = detector.path_to_entry.get(path) { - // Check for modifications (simplified for testing) - if db_entry.size != mock_metadata.len() { - return Some(Change::Modified { - path: path.to_path_buf(), - entry_id: db_entry.id, - old_modified: db_entry.modified, - new_modified: Some(mock_metadata.modified), - }); - } - return None; - } - - // Path not in database - check if it's a move or hard link - if let Some(inode_val) = inode { - if let Some(old_path) = detector.inode_to_path.get(&inode_val) { - if old_path != path { - if let Some(db_entry) = detector.path_to_entry.get(old_path) { - // In mock tests, we can't easily check file existence - // For testing purposes, assume it's a hard link (treat as new entry) - // In real scenarios, the actual file existence check would determine behavior - // Fall through to treat as new entry - } - } - } - } - - // New file/directory - Some(Change::New(path.to_path_buf())) - } - - #[test] - fn test_hard_link_detection() { - let mut detector = ChangeDetector::new(); - - // Add a test entry - let db_path = PathBuf::from("/test/dir1/file.txt"); - let db_entry = 
DatabaseEntry { - id: 1, - path: db_path.clone(), - kind: EntryKind::File, - size: 1000, - modified: Some(SystemTime::now()), - inode: Some(12345), - }; - - detector.path_to_entry.insert(db_path.clone(), db_entry); - detector.inode_to_path.insert(12345, db_path); - - // Test hard link detection (same inode, different path, both should exist) - let hard_link_path = PathBuf::from("/test/dir2/hardlink.txt"); - - // Since we can't easily mock file existence in tests, we'll test the logic - // In a real scenario, if both paths exist, it should be treated as a new entry - let result = test_check_path(&mut detector, &hard_link_path, 1000, Some(12345)); - // In our mock test, this will be treated as new since we can't check file existence - match result { - Some(Change::New(path)) => assert_eq!(path, hard_link_path), - _ => panic!("Expected hard link to be treated as new entry"), - } - } - - #[test] - fn test_consistent_behavior() { - let mut detector = ChangeDetector::new(); - - // Add a test entry - let db_path = PathBuf::from("/test/dir1/file.txt"); - let db_entry = DatabaseEntry { - id: 1, - path: db_path.clone(), - kind: EntryKind::File, - size: 1000, - modified: Some(SystemTime::now()), - inode: Some(12345), - }; - - detector.path_to_entry.insert(db_path.clone(), db_entry); - detector.inode_to_path.insert(12345, db_path.clone()); - - // Test consistent behavior: same inode at different path - // In our mock test environment, this will be treated as a new entry - // (since we can't mock file existence checks easily) - let other_path = PathBuf::from("/test/dir2/other_file.txt"); - - let result = test_check_path(&mut detector, &other_path, 1000, Some(12345)); - match result { - Some(Change::New(path)) => assert_eq!(path, other_path), - _ => panic!("Expected consistent behavior: treat as new entry"), - } - } - - #[test] - fn test_new_file_detection() { - let mut detector = ChangeDetector::new(); - - // Test new file detection - let new_path = PathBuf::from("/test/new_file.txt"); - - match test_check_path(&mut detector, &new_path, 500, None) { - Some(Change::New(p)) => assert_eq!(p, new_path), - _ => panic!("Expected new file detection"), - } - } -} +pub use persistent::PersistentChangeHandler; +pub use types::{Change, ChangeConfig, ChangeMetadata, ChangeType, EntryRef}; diff --git a/core/src/ops/indexing/change_detection/persistent.rs b/core/src/ops/indexing/change_detection/persistent.rs new file mode 100644 index 000000000..0378f8f78 --- /dev/null +++ b/core/src/ops/indexing/change_detection/persistent.rs @@ -0,0 +1,638 @@ +//! Persistent (database-backed) change handler for managed locations. +//! +//! Uses EntryProcessor for CRUD operations and maintains closure table +//! relationships. Runs the processor pipeline (thumbnails, content hash) +//! for new and modified files. + +use super::handler::ChangeHandler; +use super::types::{ChangeType, EntryRef}; +use crate::context::CoreContext; +use crate::infra::db::entities; +use crate::ops::indexing::state::{DirEntry, EntryKind}; +use anyhow::Result; +use sea_orm::{ColumnTrait, EntityTrait, QueryFilter, TransactionTrait}; +use std::collections::HashMap; +use std::path::{Path, PathBuf}; +use std::sync::Arc; +use uuid::Uuid; + +/// Database-backed change handler for managed locations. 
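+///
+/// A minimal usage sketch (editor's illustration, not part of this patch;
+/// `ctx`, `library_id`, `location_id`, `root`, `toggles`, and `events` are
+/// assumed to be in scope):
+///
+/// ```ignore
+/// use crate::ops::indexing::change_detection::{apply_batch, ChangeConfig};
+///
+/// let mut handler =
+///     PersistentChangeHandler::new(ctx, library_id, location_id, &root, None).await?;
+/// let config = ChangeConfig {
+///     rule_toggles: toggles,
+///     location_root: &root,
+///     volume_backend: None,
+/// };
+/// // apply_batch orders the events: removes, renames, creates, modifies.
+/// apply_batch(&mut handler, events, &config).await?;
+/// ```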
+pub struct PersistentChangeHandler { + context: Arc, + library_id: Uuid, + location_id: Uuid, + location_root_entry_id: i32, + db: sea_orm::DatabaseConnection, + volume_backend: Option>, + entry_id_cache: HashMap, +} + +impl PersistentChangeHandler { + pub async fn new( + context: Arc, + library_id: Uuid, + location_id: Uuid, + _location_root: &Path, + volume_backend: Option>, + ) -> Result { + let library = context + .get_library(library_id) + .await + .ok_or_else(|| anyhow::anyhow!("Library not found: {}", library_id))?; + + let db = library.db().conn().clone(); + + let location_record = entities::location::Entity::find() + .filter(entities::location::Column::Uuid.eq(location_id)) + .one(&db) + .await? + .ok_or_else(|| anyhow::anyhow!("Location not found: {}", location_id))?; + + let location_root_entry_id = location_record + .entry_id + .ok_or_else(|| anyhow::anyhow!("Location {} has no root entry", location_id))?; + + Ok(Self { + context, + library_id, + location_id, + location_root_entry_id, + db, + volume_backend, + entry_id_cache: HashMap::new(), + }) + } + + async fn resolve_entry_id(&self, path: &Path) -> Result> { + if let Some(id) = self.resolve_directory_entry_id(path).await? { + return Ok(Some(id)); + } + self.resolve_file_entry_id(path).await + } + + async fn resolve_directory_entry_id(&self, path: &Path) -> Result> { + use sea_orm::FromQueryResult; + + let path_str = path.to_string_lossy().to_string(); + + #[derive(Debug, FromQueryResult)] + struct DirectoryEntryId { + entry_id: i32, + } + + let result = DirectoryEntryId::find_by_statement(sea_orm::Statement::from_sql_and_values( + sea_orm::DbBackend::Sqlite, + r#" + SELECT dp.entry_id + FROM directory_paths dp + INNER JOIN entry_closure ec ON ec.descendant_id = dp.entry_id + WHERE dp.path = ? + AND ec.ancestor_id = ? + "#, + vec![path_str.into(), self.location_root_entry_id.into()], + )) + .one(&self.db) + .await?; + + Ok(result.map(|r| r.entry_id)) + } + + async fn resolve_file_entry_id(&self, path: &Path) -> Result> { + let parent = match path.parent() { + Some(p) => p, + None => return Ok(None), + }; + + let parent_id = match self.resolve_directory_entry_id(parent).await? { + Some(id) => id, + None => return Ok(None), + }; + + let name = path + .file_stem() + .and_then(|s| s.to_str()) + .unwrap_or("") + .to_string(); + let ext = path + .extension() + .and_then(|s| s.to_str()) + .map(|s| s.to_lowercase()); + + let mut q = entities::entry::Entity::find() + .filter(entities::entry::Column::ParentId.eq(parent_id)) + .filter(entities::entry::Column::Name.eq(name)); + + if let Some(e) = ext { + q = q.filter(entities::entry::Column::Extension.eq(e)); + } else { + q = q.filter(entities::entry::Column::Extension.is_null()); + } + + let model = q.one(&self.db).await?; + Ok(model.map(|m| m.id)) + } +} + +#[async_trait::async_trait] +impl ChangeHandler for PersistentChangeHandler { + async fn find_by_path(&self, path: &Path) -> Result> { + let entry_id = match self.resolve_entry_id(path).await? { + Some(id) => id, + None => return Ok(None), + }; + + let entry = entities::entry::Entity::find_by_id(entry_id) + .one(&self.db) + .await? 
+ .ok_or_else(|| anyhow::anyhow!("Entry {} not found after ID lookup", entry_id))?; + + let kind = match entry.kind { + 0 => EntryKind::File, + 1 => EntryKind::Directory, + 2 => EntryKind::Symlink, + _ => EntryKind::File, + }; + + Ok(Some(EntryRef { + id: entry.id, + uuid: entry.uuid, + path: path.to_path_buf(), + kind, + })) + } + + async fn find_by_inode(&self, inode: u64) -> Result> { + let inode_val = inode as i64; + + let entry = entities::entry::Entity::find() + .filter(entities::entry::Column::Inode.eq(inode_val)) + .one(&self.db) + .await?; + + match entry { + Some(e) => { + let full_path = crate::ops::indexing::PathResolver::get_full_path(&self.db, e.id) + .await + .unwrap_or_else(|_| PathBuf::from(&e.name)); + + let kind = match e.kind { + 0 => EntryKind::File, + 1 => EntryKind::Directory, + 2 => EntryKind::Symlink, + _ => EntryKind::File, + }; + + Ok(Some(EntryRef { + id: e.id, + uuid: e.uuid, + path: full_path, + kind, + })) + } + None => Ok(None), + } + } + + async fn create(&mut self, metadata: &DirEntry, parent_path: &Path) -> Result { + use crate::domain::addressing::SdPath; + use crate::ops::indexing::entry::EntryProcessor; + use crate::ops::indexing::state::IndexerState; + + let mut state = IndexerState::new(&SdPath::local(&metadata.path)); + + if let Some(&parent_id) = self.entry_id_cache.get(parent_path) { + state + .entry_id_cache + .insert(parent_path.to_path_buf(), parent_id); + } else if let Some(parent_id) = self.resolve_directory_entry_id(parent_path).await? { + state + .entry_id_cache + .insert(parent_path.to_path_buf(), parent_id); + self.entry_id_cache + .insert(parent_path.to_path_buf(), parent_id); + } + + let ctx = + crate::ops::indexing::ctx::ResponderCtx::new(&self.context, self.library_id).await?; + + let entry_id = EntryProcessor::create_entry(&mut state, &ctx, metadata, 0, parent_path) + .await + .map_err(|e| anyhow::anyhow!("Failed to create entry: {}", e))?; + + self.entry_id_cache.insert(metadata.path.clone(), entry_id); + + let entry = entities::entry::Entity::find_by_id(entry_id) + .one(&self.db) + .await? + .ok_or_else(|| anyhow::anyhow!("Entry not found after creation"))?; + + Ok(EntryRef { + id: entry.id, + uuid: entry.uuid, + path: metadata.path.clone(), + kind: metadata.kind, + }) + } + + async fn update(&mut self, entry: &EntryRef, metadata: &DirEntry) -> Result<()> { + use crate::ops::indexing::entry::EntryProcessor; + + let ctx = + crate::ops::indexing::ctx::ResponderCtx::new(&self.context, self.library_id).await?; + EntryProcessor::update_entry(&ctx, entry.id, metadata) + .await + .map_err(|e| anyhow::anyhow!("Failed to update entry: {}", e))?; + + Ok(()) + } + + async fn move_entry( + &mut self, + entry: &EntryRef, + old_path: &Path, + new_path: &Path, + new_parent_path: &Path, + ) -> Result<()> { + use crate::domain::addressing::SdPath; + use crate::ops::indexing::entry::EntryProcessor; + use crate::ops::indexing::state::IndexerState; + + let mut state = IndexerState::new(&SdPath::local(old_path)); + + if let Some(&parent_id) = self.entry_id_cache.get(new_parent_path) { + state + .entry_id_cache + .insert(new_parent_path.to_path_buf(), parent_id); + } else if let Some(parent_id) = self.resolve_directory_entry_id(new_parent_path).await? 
{ + state + .entry_id_cache + .insert(new_parent_path.to_path_buf(), parent_id); + self.entry_id_cache + .insert(new_parent_path.to_path_buf(), parent_id); + } + + let ctx = + crate::ops::indexing::ctx::ResponderCtx::new(&self.context, self.library_id).await?; + EntryProcessor::move_entry( + &mut state, + &ctx, + entry.id, + old_path, + new_path, + new_parent_path, + ) + .await + .map_err(|e| anyhow::anyhow!("Failed to move entry: {}", e))?; + + self.entry_id_cache.remove(old_path); + self.entry_id_cache.insert(new_path.to_path_buf(), entry.id); + + Ok(()) + } + + async fn delete(&mut self, entry: &EntryRef) -> Result<()> { + let mut to_delete_ids: Vec = vec![entry.id]; + + if let Ok(rows) = entities::entry_closure::Entity::find() + .filter(entities::entry_closure::Column::AncestorId.eq(entry.id)) + .all(&self.db) + .await + { + to_delete_ids.extend(rows.into_iter().map(|r| r.descendant_id)); + } + + let mut queue = vec![entry.id]; + let mut visited = std::collections::HashSet::from([entry.id]); + + while let Some(parent) = queue.pop() { + if let Ok(children) = entities::entry::Entity::find() + .filter(entities::entry::Column::ParentId.eq(parent)) + .all(&self.db) + .await + { + for child in children { + if visited.insert(child.id) { + to_delete_ids.push(child.id); + queue.push(child.id); + } + } + } + } + + to_delete_ids.sort_unstable(); + to_delete_ids.dedup(); + + let entries_to_delete = if !to_delete_ids.is_empty() { + let mut all_entries = Vec::new(); + for chunk in to_delete_ids.chunks(900) { + let batch = entities::entry::Entity::find() + .filter(entities::entry::Column::Id.is_in(chunk.to_vec())) + .all(&self.db) + .await?; + all_entries.extend(batch); + } + all_entries + } else { + Vec::new() + }; + + if !entries_to_delete.is_empty() { + if let Some(library) = self.context.get_library(self.library_id).await { + let _ = library + .sync_models_batch( + &entries_to_delete, + crate::infra::sync::ChangeType::Delete, + &self.db, + ) + .await; + } + } + + let txn = self.db.begin().await?; + + if !to_delete_ids.is_empty() { + let _ = entities::entry_closure::Entity::delete_many() + .filter(entities::entry_closure::Column::DescendantId.is_in(to_delete_ids.clone())) + .exec(&txn) + .await; + let _ = entities::entry_closure::Entity::delete_many() + .filter(entities::entry_closure::Column::AncestorId.is_in(to_delete_ids.clone())) + .exec(&txn) + .await; + let _ = entities::directory_paths::Entity::delete_many() + .filter(entities::directory_paths::Column::EntryId.is_in(to_delete_ids.clone())) + .exec(&txn) + .await; + let _ = entities::entry::Entity::delete_many() + .filter(entities::entry::Column::Id.is_in(to_delete_ids)) + .exec(&txn) + .await; + } + + txn.commit().await?; + self.entry_id_cache.remove(&entry.path); + + Ok(()) + } + + async fn run_processors(&self, entry: &EntryRef, _is_new: bool) -> Result<()> { + use crate::ops::indexing::processor::{ + load_location_processor_config, ContentHashProcessor, ProcessorEntry, + }; + use crate::ops::media::{ + ocr::OcrProcessor, proxy::ProxyProcessor, speech::SpeechToTextProcessor, + thumbnail::ThumbnailProcessor, thumbstrip::ThumbstripProcessor, + }; + + if entry.is_directory() { + return Ok(()); + } + + let Some(library) = self.context.get_library(self.library_id).await else { + return Ok(()); + }; + + let proc_config = load_location_processor_config(self.location_id, &self.db) + .await + .unwrap_or_default(); + + let ctx = + crate::ops::indexing::ctx::ResponderCtx::new(&self.context, self.library_id).await?; + + // Helper to build 
ProcessorEntry (re-queries to get latest content_id after hash) + let build_proc_entry = |db: &sea_orm::DatabaseConnection, + entry: &EntryRef| + -> std::pin::Pin< + Box> + Send + '_>, + > { + let entry = entry.clone(); + let db = db.clone(); + Box::pin(async move { + let db_entry = entities::entry::Entity::find_by_id(entry.id) + .one(&db) + .await? + .ok_or_else(|| anyhow::anyhow!("Entry not found"))?; + + let mime_type = if let Some(content_id) = db_entry.content_id { + if let Ok(Some(ci)) = entities::content_identity::Entity::find_by_id(content_id) + .one(&db) + .await + { + if let Some(mime_id) = ci.mime_type_id { + if let Ok(Some(mime)) = entities::mime_type::Entity::find_by_id(mime_id) + .one(&db) + .await + { + Some(mime.mime_type) + } else { + None + } + } else { + None + } + } else { + None + } + } else { + None + }; + + Ok(ProcessorEntry { + id: entry.id, + uuid: entry.uuid, + path: entry.path.clone(), + kind: entry.kind, + size: db_entry.size as u64, + content_id: db_entry.content_id, + mime_type, + }) + }) + }; + + // Content hash (run first - other processors may need the content_id) + if proc_config + .watcher_processors + .iter() + .any(|c| c.processor_type == "content_hash" && c.enabled) + { + let proc_entry = build_proc_entry(&self.db, entry).await?; + let content_proc = ContentHashProcessor::new(self.library_id); + if let Err(e) = content_proc.process(&ctx, &proc_entry).await { + tracing::warn!("Content hash processing failed: {}", e); + } + } + + // Thumbnail + if proc_config + .watcher_processors + .iter() + .any(|c| c.processor_type == "thumbnail" && c.enabled) + { + let proc_entry = build_proc_entry(&self.db, entry).await?; + let thumb_proc = ThumbnailProcessor::new(library.clone()); + if thumb_proc.should_process(&proc_entry) { + if let Err(e) = thumb_proc.process(&self.db, &proc_entry).await { + tracing::warn!("Thumbnail processing failed: {}", e); + } + } + } + + // Thumbstrip + if proc_config + .watcher_processors + .iter() + .any(|c| c.processor_type == "thumbstrip" && c.enabled) + { + let proc_entry = build_proc_entry(&self.db, entry).await?; + let settings = proc_config + .watcher_processors + .iter() + .find(|c| c.processor_type == "thumbstrip") + .map(|c| &c.settings); + + let thumbstrip_proc = if let Some(settings) = settings { + ThumbstripProcessor::new(library.clone()) + .with_settings(settings) + .unwrap_or_else(|e| { + tracing::warn!("Failed to parse thumbstrip settings: {}", e); + ThumbstripProcessor::new(library.clone()) + }) + } else { + ThumbstripProcessor::new(library.clone()) + }; + + if thumbstrip_proc.should_process(&proc_entry) { + if let Err(e) = thumbstrip_proc.process(&self.db, &proc_entry).await { + tracing::warn!("Thumbstrip processing failed: {}", e); + } + } + } + + // Proxy + if proc_config + .watcher_processors + .iter() + .any(|c| c.processor_type == "proxy" && c.enabled) + { + let proc_entry = build_proc_entry(&self.db, entry).await?; + let settings = proc_config + .watcher_processors + .iter() + .find(|c| c.processor_type == "proxy") + .map(|c| &c.settings); + + let proxy_proc = if let Some(settings) = settings { + ProxyProcessor::new(library.clone()) + .with_settings(settings) + .unwrap_or_else(|e| { + tracing::warn!("Failed to parse proxy settings: {}", e); + ProxyProcessor::new(library.clone()) + }) + } else { + ProxyProcessor::new(library.clone()) + }; + + if proxy_proc.should_process(&proc_entry) { + if let Err(e) = proxy_proc.process(&self.db, &proc_entry).await { + tracing::warn!("Proxy processing failed: {}", e); + } + } + } 
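+
+        // Editor's note: each gate above and below re-runs `build_proc_entry`
+        // instead of reusing one ProcessorEntry, so processors that run after
+        // the content hash observe the freshly written content_id / mime type.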
+ + // OCR + if proc_config + .watcher_processors + .iter() + .any(|c| c.processor_type == "ocr" && c.enabled) + { + let proc_entry = build_proc_entry(&self.db, entry).await?; + let ocr_proc = OcrProcessor::new(library.clone()); + if ocr_proc.should_process(&proc_entry) { + if let Err(e) = ocr_proc.process(&self.db, &proc_entry).await { + tracing::warn!("OCR processing failed: {}", e); + } + } + } + + // Speech-to-text + if proc_config + .watcher_processors + .iter() + .any(|c| c.processor_type == "speech_to_text" && c.enabled) + { + let proc_entry = build_proc_entry(&self.db, entry).await?; + let speech_proc = SpeechToTextProcessor::new(library.clone()); + if speech_proc.should_process(&proc_entry) { + if let Err(e) = speech_proc.process(&self.db, &proc_entry).await { + tracing::warn!("Speech-to-text processing failed: {}", e); + } + } + } + + Ok(()) + } + + async fn emit_change_event(&self, entry: &EntryRef, change_type: ChangeType) -> Result<()> { + use crate::domain::ResourceManager; + + if let Some(uuid) = entry.uuid { + let resource_manager = + ResourceManager::new(Arc::new(self.db.clone()), self.context.events.clone()); + + if let Err(e) = resource_manager + .emit_resource_events("entry", vec![uuid]) + .await + { + tracing::warn!( + "Failed to emit resource event for {:?} entry: {}", + change_type, + e + ); + } + } + + Ok(()) + } + + async fn handle_new_directory(&self, path: &Path) -> Result<()> { + use crate::domain::addressing::SdPath; + use crate::ops::indexing::job::{IndexMode, IndexerJob}; + + let Some(library) = self.context.get_library(self.library_id).await else { + return Ok(()); + }; + + let index_mode = if let Ok(Some(loc)) = entities::location::Entity::find() + .filter(entities::location::Column::Uuid.eq(self.location_id)) + .one(&self.db) + .await + { + match loc.index_mode.as_str() { + "shallow" => IndexMode::Shallow, + "content" => IndexMode::Content, + "deep" => IndexMode::Deep, + _ => IndexMode::Content, + } + } else { + IndexMode::Content + }; + + let indexer_job = + IndexerJob::from_location(self.location_id, SdPath::local(path), index_mode); + + if let Err(e) = library.jobs().dispatch(indexer_job).await { + tracing::warn!( + "Failed to spawn indexer job for directory {}: {}", + path.display(), + e + ); + } else { + tracing::debug!( + "Spawned recursive indexer job for directory: {}", + path.display() + ); + } + + Ok(()) + } +} diff --git a/core/src/ops/indexing/change_detection/types.rs b/core/src/ops/indexing/change_detection/types.rs new file mode 100644 index 000000000..ba1ff63bf --- /dev/null +++ b/core/src/ops/indexing/change_detection/types.rs @@ -0,0 +1,135 @@ +//! Shared types for change detection and handling. +//! +//! This module defines the common vocabulary used by both: +//! - The detector (batch scanning during indexer jobs) +//! - The handler (real-time response to watcher events) + +use crate::ops::indexing::state::EntryKind; +use std::path::PathBuf; +use std::time::SystemTime; +use uuid::Uuid; + +/// A detected or reported filesystem change. +/// +/// This enum represents changes that can come from either: +/// - The `ChangeDetector` during batch indexing scans +/// - The file watcher via `FsRawEventKind` conversion +#[derive(Debug, Clone)] +pub enum Change { + /// New file/directory (not in storage). + New(PathBuf), + + /// File/directory modified (content or metadata changed). 
+ Modified { + path: PathBuf, + entry_id: i32, + old_modified: Option, + new_modified: Option, + }, + + /// File/directory moved or renamed (same inode, different path). + Moved { + old_path: PathBuf, + new_path: PathBuf, + entry_id: i32, + inode: u64, + }, + + /// File/directory deleted (existed in storage but not on disk). + Deleted { path: PathBuf, entry_id: i32 }, +} + +impl Change { + /// Get the primary path affected by this change. + pub fn path(&self) -> &PathBuf { + match self { + Change::New(path) => path, + Change::Modified { path, .. } => path, + Change::Moved { new_path, .. } => new_path, + Change::Deleted { path, .. } => path, + } + } + + /// Get the change type for event emission. + pub fn change_type(&self) -> ChangeType { + match self { + Change::New(_) => ChangeType::Created, + Change::Modified { .. } => ChangeType::Modified, + Change::Moved { .. } => ChangeType::Moved, + Change::Deleted { .. } => ChangeType::Deleted, + } + } + + /// Create a Change from an FsRawEventKind (for watcher integration). + /// Note: These variants don't have entry_ids since they come from the watcher. + pub fn from_fs_event(event: crate::infra::event::FsRawEventKind) -> Self { + use crate::infra::event::FsRawEventKind; + + match event { + FsRawEventKind::Create { path } => Change::New(path), + FsRawEventKind::Modify { path } => Change::Modified { + path, + entry_id: 0, // Placeholder - handler will look up real ID + old_modified: None, + new_modified: None, + }, + FsRawEventKind::Remove { path } => Change::Deleted { + path, + entry_id: 0, // Placeholder - handler will look up real ID + }, + FsRawEventKind::Rename { from, to } => Change::Moved { + old_path: from, + new_path: to, + entry_id: 0, // Placeholder - handler will look up real ID + inode: 0, + }, + } + } +} + +/// Metadata about a change, populated during detection. +#[derive(Debug, Clone)] +pub struct ChangeMetadata { + pub size: u64, + pub modified: Option, + pub inode: Option, + pub kind: EntryKind, +} + +/// Type of change for event emission and logging. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ChangeType { + Created, + Modified, + Moved, + Deleted, +} + +/// Reference to an entry in either persistent or ephemeral storage. +/// +/// Provides a uniform way to refer to entries regardless of storage backend. +/// Persistent entries have database IDs; ephemeral entries have synthetic IDs. +#[derive(Debug, Clone)] +pub struct EntryRef { + /// For persistent: database entry ID. For ephemeral: synthetic ID. + pub id: i32, + /// UUID for sync and event emission. + pub uuid: Option, + /// Full filesystem path. + pub path: PathBuf, + /// Entry kind (file/directory/symlink). + pub kind: EntryKind, +} + +impl EntryRef { + pub fn is_directory(&self) -> bool { + self.kind == EntryKind::Directory + } +} + +/// Configuration for change handling operations. 
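+///
+/// Editor's note: this only borrows per-batch data, so callers rebuild it
+/// cheaply for every event batch. When `volume_backend` is `None`, the
+/// existence and metadata checks in `handler` fall back to `tokio::fs`.
+/// A sketch (`toggles` and `root` assumed in scope):
+///
+/// ```ignore
+/// let config = ChangeConfig {
+///     rule_toggles: toggles,  // the location's rule switches
+///     location_root: &root,   // borrowed for the batch
+///     volume_backend: None,   // no backend: plain tokio::fs access
+/// };
+/// ```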
+pub struct ChangeConfig<'a> { + pub rule_toggles: crate::ops::indexing::rules::RuleToggles, + pub location_root: &'a std::path::Path, + pub volume_backend: Option<&'a std::sync::Arc>, +} diff --git a/core/src/ops/indexing/ephemeral/responder.rs b/core/src/ops/indexing/ephemeral/responder.rs index 5ebbc0feb..913b105a3 100644 --- a/core/src/ops/indexing/ephemeral/responder.rs +++ b/core/src/ops/indexing/ephemeral/responder.rs @@ -17,7 +17,7 @@ use crate::context::CoreContext; use crate::infra::event::FsRawEventKind; -use crate::ops::indexing::handler::{self, ChangeConfig, EphemeralChangeHandler}; +use crate::ops::indexing::change_detection::{self, ChangeConfig, EphemeralChangeHandler}; use crate::ops::indexing::rules::RuleToggles; use anyhow::Result; use std::path::{Path, PathBuf}; @@ -76,7 +76,7 @@ pub async fn apply_batch( volume_backend: None, // Ephemeral paths typically don't use volume backends }; - handler::apply_batch(&mut handler, events, &config).await + change_detection::apply_batch(&mut handler, events, &config).await } /// Process a single filesystem event against the ephemeral index. diff --git a/core/src/ops/indexing/handler.rs b/core/src/ops/indexing/handler.rs deleted file mode 100644 index 896e1d653..000000000 --- a/core/src/ops/indexing/handler.rs +++ /dev/null @@ -1,1447 +0,0 @@ -//! Unified change handling for persistent and ephemeral indexing. -//! -//! This module provides a trait-based abstraction for filesystem change handling, -//! allowing the same logic to work with both database-backed (persistent) and -//! memory-backed (ephemeral) storage. The watcher and responder use these handlers -//! to process Create/Modify/Remove/Rename events consistently. -//! -//! ## Architecture -//! -//! ```text -//! FsRawEventKind -//! │ -//! ▼ -//! ┌─────────────────────────────────────────────┐ -//! │ apply_change (shared logic) │ -//! │ - path validation │ -//! │ - rule filtering │ -//! │ - metadata extraction │ -//! │ - inode-based move detection │ -//! └──────────────────┬──────────────────────────┘ -//! │ -//! ┌─────────┴─────────┐ -//! ▼ ▼ -//! ┌───────────────┐ ┌───────────────┐ -//! │ Persistent │ │ Ephemeral │ -//! │ ChangeHandler │ │ ChangeHandler │ -//! │ (database) │ │ (in-memory) │ -//! └───────────────┘ └───────────────┘ -//! ``` - -use super::rules::{build_default_ruler, RuleToggles, RulerDecision}; -use super::state::{DirEntry, EntryKind}; -use anyhow::Result; -use std::path::{Path, PathBuf}; -use std::sync::Arc; -use uuid::Uuid; - -/// Reference to an entry in either persistent or ephemeral storage. -/// -/// Provides a uniform way to refer to entries regardless of storage backend. -/// Persistent entries have database IDs; ephemeral entries have arena indices. -#[derive(Debug, Clone)] -pub struct EntryRef { - /// For persistent: database entry ID. For ephemeral: synthetic ID. - pub id: i32, - /// UUID for sync and event emission. - pub uuid: Option, - /// Full filesystem path. - pub path: PathBuf, - /// Entry kind (file/directory/symlink). - pub kind: EntryKind, -} - -impl EntryRef { - pub fn is_directory(&self) -> bool { - self.kind == EntryKind::Directory - } -} - -/// Type of change for event emission. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum ChangeType { - Created, - Modified, - Moved, - Deleted, -} - -/// Configuration for change handling operations. 
-pub struct ChangeConfig<'a> { - pub rule_toggles: RuleToggles, - pub location_root: &'a Path, - pub volume_backend: Option<&'a Arc>, -} - -/// Abstracts storage operations for filesystem change handling. -/// -/// Both persistent (database) and ephemeral (in-memory) handlers implement -/// this trait, allowing the same change processing logic to work with both -/// storage backends. The trait methods map to CRUD operations plus event -/// emission and processor execution. -#[async_trait::async_trait] -pub trait ChangeHandler: Send + Sync { - /// Find an entry by its full filesystem path. - async fn find_by_path(&self, path: &Path) -> Result>; - - /// Find an entry by inode (for move detection). - /// Returns None if inode tracking is not supported or no match found. - async fn find_by_inode(&self, inode: u64) -> Result>; - - /// Create a new entry from filesystem metadata. - async fn create(&mut self, metadata: &DirEntry, parent_path: &Path) -> Result; - - /// Update an existing entry's metadata. - async fn update(&mut self, entry: &EntryRef, metadata: &DirEntry) -> Result<()>; - - /// Move an entry from old path to new path. - async fn move_entry( - &mut self, - entry: &EntryRef, - old_path: &Path, - new_path: &Path, - new_parent_path: &Path, - ) -> Result<()>; - - /// Delete an entry and all its descendants. - async fn delete(&mut self, entry: &EntryRef) -> Result<()>; - - /// Run post-create/modify processors (thumbnails, content hash). - /// No-op for ephemeral handlers. - async fn run_processors(&self, entry: &EntryRef, is_new: bool) -> Result<()>; - - /// Emit appropriate events for UI updates. - async fn emit_change_event(&self, entry: &EntryRef, change_type: ChangeType) -> Result<()>; - - /// Handle directory recursion after creation. - /// Persistent: spawns indexer job. Ephemeral: inline shallow index. - async fn handle_new_directory(&self, path: &Path) -> Result<()>; -} - -// ============================================================================ -// Shared Logic - Used by both handlers -// ============================================================================ - -/// Check if a path exists, distinguishing between "doesn't exist" and "can't access". -/// -/// This is critical for preventing false deletions when volumes go offline. -/// Returns Ok(true) if path exists, Ok(false) if confirmed absent, Err if inaccessible. 
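-///
-/// Editor's sketch (mirrors the match arms in `handle_create` and
-/// `handle_modify` below); the three outcomes must not be conflated:
-///
-/// ```ignore
-/// match path_exists_safe(&path, backend).await {
-///     Ok(true) => { /* confirmed present: process the event */ }
-///     Ok(false) => { /* confirmed absent: safe to skip or delete */ }
-///     Err(_) => { /* inaccessible (volume offline?): skip, never delete */ }
-/// }
-/// ```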
-pub async fn path_exists_safe( - path: &Path, - backend: Option<&Arc>, -) -> Result { - use crate::volume::error::VolumeError; - - if let Some(backend) = backend { - match backend.exists(path).await { - Ok(exists) => Ok(exists), - Err(VolumeError::NotMounted(_)) => { - tracing::warn!( - "Volume not mounted when checking path existence: {}", - path.display() - ); - Err(anyhow::anyhow!( - "Volume not mounted, cannot verify path existence" - )) - } - Err(VolumeError::Io(ref e)) if e.kind() == std::io::ErrorKind::NotFound => Ok(false), - Err(VolumeError::Io(io_err)) => { - tracing::warn!( - "IO error when checking path existence for {}: {}", - path.display(), - io_err - ); - Err(anyhow::anyhow!( - "IO error, volume may be offline: {}", - io_err - )) - } - Err(e) => { - tracing::warn!( - "Volume error when checking path existence for {}: {}", - path.display(), - e - ); - Err(e.into()) - } - } - } else { - match tokio::fs::try_exists(path).await { - Ok(exists) => Ok(exists), - Err(e) => { - tracing::warn!( - "Cannot verify path existence for {} (volume may be offline): {}", - path.display(), - e - ); - Err(anyhow::anyhow!("Cannot access path: {}", e)) - } - } - } -} - -/// Evaluates indexing rules to determine if a path should be skipped. -pub async fn should_filter_path( - path: &Path, - rule_toggles: RuleToggles, - location_root: &Path, - backend: Option<&Arc>, -) -> Result { - let ruler = build_default_ruler(rule_toggles, location_root, path).await; - - let metadata = if let Some(backend) = backend { - backend - .metadata(path) - .await - .map_err(|e| anyhow::anyhow!("Failed to get metadata via backend: {}", e))? - } else { - let fs_meta = tokio::fs::metadata(path).await?; - crate::volume::backend::RawMetadata { - kind: if fs_meta.is_dir() { - EntryKind::Directory - } else if fs_meta.is_symlink() { - EntryKind::Symlink - } else { - EntryKind::File - }, - size: fs_meta.len(), - modified: fs_meta.modified().ok(), - created: fs_meta.created().ok(), - accessed: fs_meta.accessed().ok(), - inode: None, - permissions: None, - } - }; - - struct SimpleMetadata { - is_dir: bool, - } - impl super::rules::MetadataForIndexerRules for SimpleMetadata { - fn is_dir(&self) -> bool { - self.is_dir - } - } - - let simple_meta = SimpleMetadata { - is_dir: metadata.kind == EntryKind::Directory, - }; - - match ruler.evaluate_path(path, &simple_meta).await { - Ok(RulerDecision::Reject) => { - tracing::debug!("Filtered path by indexing rules: {}", path.display()); - Ok(true) - } - Ok(RulerDecision::Accept) => Ok(false), - Err(e) => { - tracing::warn!("Error evaluating rules for {}: {}", path.display(), e); - Ok(false) - } - } -} - -/// Extracts filesystem metadata into a DirEntry. -pub async fn build_dir_entry( - path: &Path, - backend: Option<&Arc>, -) -> Result { - use super::entry::EntryProcessor; - - let meta = EntryProcessor::extract_metadata(path, backend).await?; - Ok(DirEntry { - path: meta.path, - kind: meta.kind, - size: meta.size, - modified: meta.modified, - inode: meta.inode, - }) -} - -// ============================================================================ -// Generic Change Application -// ============================================================================ - -/// Apply a batch of filesystem changes using the provided handler. -/// -/// Processes events in the correct order: removes first, then renames, -/// creates, and finally modifies. This prevents conflicts like creating -/// a file that should have been deleted. 
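-///
-/// A worked example (editor's illustration): the raw batch
-/// `[Create(a), Remove(a), Rename(b -> c), Modify(c)]` is applied as
-/// `Remove(a)`, `Rename(b -> c)`, `Create(a)`, `Modify(c)`, so the
-/// re-created `a` survives and the modify sees `c` at its post-rename path.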
-pub async fn apply_batch( - handler: &mut H, - events: Vec, - config: &ChangeConfig<'_>, -) -> Result<()> { - use crate::infra::event::FsRawEventKind; - - if events.is_empty() { - return Ok(()); - } - - let mut creates = Vec::new(); - let mut modifies = Vec::new(); - let mut removes = Vec::new(); - let mut renames = Vec::new(); - - for event in events { - match event { - FsRawEventKind::Create { path } => creates.push(path), - FsRawEventKind::Modify { path } => modifies.push(path), - FsRawEventKind::Remove { path } => removes.push(path), - FsRawEventKind::Rename { from, to } => renames.push((from, to)), - } - } - - // Deduplicate (macOS sends duplicate creates) - creates.sort(); - creates.dedup(); - modifies.sort(); - modifies.dedup(); - removes.sort(); - removes.dedup(); - - tracing::debug!( - "Processing batch: {} creates, {} modifies, {} removes, {} renames", - creates.len(), - modifies.len(), - removes.len(), - renames.len() - ); - - // Process in order: removes, renames, creates, modifies - for path in removes { - if let Err(e) = handle_remove(handler, &path).await { - tracing::error!("Failed to handle remove for {}: {}", path.display(), e); - } - } - - for (from, to) in renames { - if let Err(e) = handle_rename(handler, &from, &to, config).await { - tracing::error!( - "Failed to handle rename from {} to {}: {}", - from.display(), - to.display(), - e - ); - } - } - - for path in creates { - if let Err(e) = handle_create(handler, &path, config).await { - tracing::error!("Failed to handle create for {}: {}", path.display(), e); - } - } - - for path in modifies { - if let Err(e) = handle_modify(handler, &path, config).await { - tracing::error!("Failed to handle modify for {}: {}", path.display(), e); - } - } - - Ok(()) -} - -/// Handle a create event. -/// -/// Validates path, checks rules, extracts metadata, detects inode-based moves, -/// and creates the entry. For directories, triggers recursive indexing. -pub async fn handle_create( - handler: &mut H, - path: &Path, - config: &ChangeConfig<'_>, -) -> Result<()> { - tracing::debug!("Create: {}", path.display()); - - // 1. Validate path exists - match path_exists_safe(path, config.volume_backend).await { - Ok(true) => {} - Ok(false) => { - tracing::debug!("Path no longer exists, skipping create: {}", path.display()); - return Ok(()); - } - Err(e) => { - tracing::warn!( - "Skipping create event for inaccessible path {}: {}", - path.display(), - e - ); - return Ok(()); - } - } - - // 2. Apply rule filtering - if should_filter_path( - path, - config.rule_toggles, - config.location_root, - config.volume_backend, - ) - .await? - { - tracing::debug!("Skipping filtered path: {}", path.display()); - return Ok(()); - } - - // 3. Extract metadata - let metadata = build_dir_entry(path, config.volume_backend).await?; - - // 4. Check for existing entry (treat as modify) - if handler.find_by_path(path).await?.is_some() { - tracing::debug!( - "Entry already exists at path {}, treating as modify", - path.display() - ); - return handle_modify(handler, path, config).await; - } - - // 5. Check for inode-based move - if let Some(inode) = metadata.inode { - if let Some(existing) = handler.find_by_inode(inode).await? 
{ - if existing.path != path { - tracing::debug!( - "Detected inode-based move: {} -> {}", - existing.path.display(), - path.display() - ); - let old_path = existing.path.clone(); - handler - .move_entry( - &existing, - &old_path, - path, - path.parent().unwrap_or(Path::new("/")), - ) - .await?; - handler - .emit_change_event(&existing, ChangeType::Moved) - .await?; - return Ok(()); - } - } - } - - // 6. Create entry - let parent_path = path.parent().unwrap_or(Path::new("/")); - let entry = handler.create(&metadata, parent_path).await?; - - // 7. Handle directory recursion or run processors - if entry.is_directory() { - handler.handle_new_directory(path).await?; - } else { - handler.run_processors(&entry, true).await?; - } - - // 8. Emit event - handler - .emit_change_event(&entry, ChangeType::Created) - .await?; - - Ok(()) -} - -/// Handle a modify event. -/// -/// Updates existing entry metadata and re-runs processors for files. -pub async fn handle_modify( - handler: &mut H, - path: &Path, - config: &ChangeConfig<'_>, -) -> Result<()> { - tracing::debug!("Modify: {}", path.display()); - - // 1. Validate path exists - match path_exists_safe(path, config.volume_backend).await { - Ok(true) => {} - Ok(false) => { - tracing::debug!("Path no longer exists, skipping modify: {}", path.display()); - return Ok(()); - } - Err(e) => { - tracing::warn!( - "Skipping modify event for inaccessible path {}: {}", - path.display(), - e - ); - return Ok(()); - } - } - - // 2. Apply rule filtering - if should_filter_path( - path, - config.rule_toggles, - config.location_root, - config.volume_backend, - ) - .await? - { - tracing::debug!("Skipping filtered path: {}", path.display()); - return Ok(()); - } - - // 3. Extract metadata - let metadata = build_dir_entry(path, config.volume_backend).await?; - - // 4. Check for inode-based move - if let Some(inode) = metadata.inode { - if let Some(existing) = handler.find_by_inode(inode).await? { - if existing.path != path { - tracing::debug!( - "Detected inode-based move during modify: {} -> {}", - existing.path.display(), - path.display() - ); - let old_path = existing.path.clone(); - handler - .move_entry( - &existing, - &old_path, - path, - path.parent().unwrap_or(Path::new("/")), - ) - .await?; - handler - .emit_change_event(&existing, ChangeType::Moved) - .await?; - return Ok(()); - } - } - } - - // 5. Find and update entry - if let Some(entry) = handler.find_by_path(path).await? { - handler.update(&entry, &metadata).await?; - - // 6. Re-run processors for files - if !entry.is_directory() { - handler.run_processors(&entry, false).await?; - } - - // 7. Emit event - handler - .emit_change_event(&entry, ChangeType::Modified) - .await?; - } else { - tracing::debug!( - "Entry not found for path, skipping modify: {}", - path.display() - ); - } - - Ok(()) -} - -/// Handle a remove event. -/// -/// Deletes the entry and its entire subtree. -pub async fn handle_remove(handler: &mut H, path: &Path) -> Result<()> { - tracing::debug!("Remove: {}", path.display()); - - if let Some(entry) = handler.find_by_path(path).await? { - handler.delete(&entry).await?; - handler - .emit_change_event(&entry, ChangeType::Deleted) - .await?; - tracing::debug!("Deleted entry for path: {}", path.display()); - } else { - tracing::debug!( - "Entry not found for path, skipping remove: {}", - path.display() - ); - } - - Ok(()) -} - -/// Handle a rename event. -/// -/// Moves an entry from one path to another, updating parent relationships. 
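-///
-/// Editor's note: two edge cases are folded in below. A destination that
-/// the indexing rules reject downgrades the rename to a remove of the
-/// source entry, and a missing source entry turns the rename into a no-op:
-///
-/// ```ignore
-/// // Hypothetical: `to` is inside a rule-filtered directory.
-/// handle_rename(&mut handler, &from, &to, &config).await?; // removes `from`
-/// ```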
-pub async fn handle_rename( - handler: &mut H, - from: &Path, - to: &Path, - config: &ChangeConfig<'_>, -) -> Result<()> { - tracing::debug!("Rename: {} -> {}", from.display(), to.display()); - - // 1. Validate destination exists - match path_exists_safe(to, config.volume_backend).await { - Ok(true) => {} - Ok(false) => { - tracing::debug!( - "Destination path doesn't exist, skipping rename: {}", - to.display() - ); - return Ok(()); - } - Err(e) => { - tracing::warn!( - "Skipping rename event for inaccessible destination {}: {}", - to.display(), - e - ); - return Ok(()); - } - } - - // 2. Check if destination is filtered (treat as deletion) - if should_filter_path( - to, - config.rule_toggles, - config.location_root, - config.volume_backend, - ) - .await? - { - tracing::debug!( - "Destination path is filtered, removing entry: {}", - to.display() - ); - return handle_remove(handler, from).await; - } - - // 3. Find source entry and move - if let Some(entry) = handler.find_by_path(from).await? { - handler - .move_entry(&entry, from, to, to.parent().unwrap_or(Path::new("/"))) - .await?; - handler.emit_change_event(&entry, ChangeType::Moved).await?; - tracing::debug!("Moved entry {} -> {}", from.display(), to.display()); - } else { - tracing::debug!( - "Entry not found for old path {}, skipping rename", - from.display() - ); - } - - Ok(()) -} - -// ============================================================================ -// Persistent Change Handler (Database-backed) -// ============================================================================ - -use crate::context::CoreContext; -use crate::infra::db::entities; -use sea_orm::{ColumnTrait, EntityTrait, QueryFilter, QuerySelect}; - -/// Database-backed change handler for managed locations. -/// -/// Uses EntryProcessor for CRUD operations and maintains closure table -/// relationships. Runs processor pipeline (thumbnails, content hash) for -/// new and modified files. -pub struct PersistentChangeHandler { - context: Arc, - library_id: Uuid, - location_id: Uuid, - location_root_entry_id: i32, - db: sea_orm::DatabaseConnection, - /// Volume backend for this location - volume_backend: Option>, - /// Entry ID cache for parent lookups - entry_id_cache: std::collections::HashMap, -} - -impl PersistentChangeHandler { - pub async fn new( - context: Arc, - library_id: Uuid, - location_id: Uuid, - location_root: &Path, - volume_backend: Option>, - ) -> Result { - let library = context - .get_library(library_id) - .await - .ok_or_else(|| anyhow::anyhow!("Library not found: {}", library_id))?; - - let db = library.db().conn().clone(); - - // Get location's root entry_id - let location_record = entities::location::Entity::find() - .filter(entities::location::Column::Uuid.eq(location_id)) - .one(&db) - .await? - .ok_or_else(|| anyhow::anyhow!("Location not found: {}", location_id))?; - - let location_root_entry_id = location_record - .entry_id - .ok_or_else(|| anyhow::anyhow!("Location {} has no root entry", location_id))?; - - Ok(Self { - context, - library_id, - location_id, - location_root_entry_id, - db, - volume_backend, - entry_id_cache: std::collections::HashMap::new(), - }) - } - - /// Resolve entry ID by path, checking directories then files. - async fn resolve_entry_id(&self, path: &Path) -> Result> { - // Try directory lookup first - if let Some(id) = self.resolve_directory_entry_id(path).await? 
{ - return Ok(Some(id)); - } - // Try file lookup - self.resolve_file_entry_id(path).await - } - - async fn resolve_directory_entry_id(&self, path: &Path) -> Result> { - use sea_orm::FromQueryResult; - - let path_str = path.to_string_lossy().to_string(); - - #[derive(Debug, FromQueryResult)] - struct DirectoryEntryId { - entry_id: i32, - } - - let result = DirectoryEntryId::find_by_statement(sea_orm::Statement::from_sql_and_values( - sea_orm::DbBackend::Sqlite, - r#" - SELECT dp.entry_id - FROM directory_paths dp - INNER JOIN entry_closure ec ON ec.descendant_id = dp.entry_id - WHERE dp.path = ? - AND ec.ancestor_id = ? - "#, - vec![path_str.into(), self.location_root_entry_id.into()], - )) - .one(&self.db) - .await?; - - Ok(result.map(|r| r.entry_id)) - } - - async fn resolve_file_entry_id(&self, path: &Path) -> Result> { - let parent = match path.parent() { - Some(p) => p, - None => return Ok(None), - }; - - let parent_id = match self.resolve_directory_entry_id(parent).await? { - Some(id) => id, - None => return Ok(None), - }; - - let name = path - .file_stem() - .and_then(|s| s.to_str()) - .unwrap_or("") - .to_string(); - let ext = path - .extension() - .and_then(|s| s.to_str()) - .map(|s| s.to_lowercase()); - - let mut q = entities::entry::Entity::find() - .filter(entities::entry::Column::ParentId.eq(parent_id)) - .filter(entities::entry::Column::Name.eq(name)); - - if let Some(e) = ext { - q = q.filter(entities::entry::Column::Extension.eq(e)); - } else { - q = q.filter(entities::entry::Column::Extension.is_null()); - } - - let model = q.one(&self.db).await?; - Ok(model.map(|m| m.id)) - } -} - -#[async_trait::async_trait] -impl ChangeHandler for PersistentChangeHandler { - async fn find_by_path(&self, path: &Path) -> Result> { - let entry_id = match self.resolve_entry_id(path).await? { - Some(id) => id, - None => return Ok(None), - }; - - let entry = entities::entry::Entity::find_by_id(entry_id) - .one(&self.db) - .await? - .ok_or_else(|| anyhow::anyhow!("Entry {} not found after ID lookup", entry_id))?; - - let kind = match entry.kind { - 0 => EntryKind::File, - 1 => EntryKind::Directory, - 2 => EntryKind::Symlink, - _ => EntryKind::File, - }; - - Ok(Some(EntryRef { - id: entry.id, - uuid: entry.uuid, - path: path.to_path_buf(), - kind, - })) - } - - async fn find_by_inode(&self, inode: u64) -> Result> { - let inode_val = inode as i64; - - let entry = entities::entry::Entity::find() - .filter(entities::entry::Column::Inode.eq(inode_val)) - .one(&self.db) - .await?; - - match entry { - Some(e) => { - let full_path = super::PathResolver::get_full_path(&self.db, e.id) - .await - .unwrap_or_else(|_| std::path::PathBuf::from(&e.name)); - - let kind = match e.kind { - 0 => EntryKind::File, - 1 => EntryKind::Directory, - 2 => EntryKind::Symlink, - _ => EntryKind::File, - }; - - Ok(Some(EntryRef { - id: e.id, - uuid: e.uuid, - path: full_path, - kind, - })) - } - None => Ok(None), - } - } - - async fn create(&mut self, metadata: &DirEntry, parent_path: &Path) -> Result { - use super::entry::EntryProcessor; - use super::state::IndexerState; - use crate::domain::addressing::SdPath; - - // Create minimal state for entry creation - let mut state = IndexerState::new(&SdPath::local(&metadata.path)); - - // Seed parent cache if we have it - if let Some(&parent_id) = self.entry_id_cache.get(parent_path) { - state - .entry_id_cache - .insert(parent_path.to_path_buf(), parent_id); - } else if let Some(parent_id) = self.resolve_directory_entry_id(parent_path).await? 
{ - state - .entry_id_cache - .insert(parent_path.to_path_buf(), parent_id); - self.entry_id_cache - .insert(parent_path.to_path_buf(), parent_id); - } - - // Use ResponderCtx for the IndexingCtx trait - let ctx = super::ctx::ResponderCtx::new(&self.context, self.library_id).await?; - - let entry_id = EntryProcessor::create_entry(&mut state, &ctx, metadata, 0, parent_path) - .await - .map_err(|e| anyhow::anyhow!("Failed to create entry: {}", e))?; - - // Cache the new entry - self.entry_id_cache.insert(metadata.path.clone(), entry_id); - - // Get the created entry for the response - let entry = entities::entry::Entity::find_by_id(entry_id) - .one(&self.db) - .await? - .ok_or_else(|| anyhow::anyhow!("Entry not found after creation"))?; - - Ok(EntryRef { - id: entry.id, - uuid: entry.uuid, - path: metadata.path.clone(), - kind: metadata.kind, - }) - } - - async fn update(&mut self, entry: &EntryRef, metadata: &DirEntry) -> Result<()> { - use super::entry::EntryProcessor; - - let ctx = super::ctx::ResponderCtx::new(&self.context, self.library_id).await?; - EntryProcessor::update_entry(&ctx, entry.id, metadata) - .await - .map_err(|e| anyhow::anyhow!("Failed to update entry: {}", e))?; - - Ok(()) - } - - async fn move_entry( - &mut self, - entry: &EntryRef, - old_path: &Path, - new_path: &Path, - new_parent_path: &Path, - ) -> Result<()> { - use super::entry::EntryProcessor; - use super::state::IndexerState; - use crate::domain::addressing::SdPath; - - let mut state = IndexerState::new(&SdPath::local(old_path)); - - // Seed parent cache - if let Some(&parent_id) = self.entry_id_cache.get(new_parent_path) { - state - .entry_id_cache - .insert(new_parent_path.to_path_buf(), parent_id); - } else if let Some(parent_id) = self.resolve_directory_entry_id(new_parent_path).await? 
{ - state - .entry_id_cache - .insert(new_parent_path.to_path_buf(), parent_id); - self.entry_id_cache - .insert(new_parent_path.to_path_buf(), parent_id); - } - - let ctx = super::ctx::ResponderCtx::new(&self.context, self.library_id).await?; - EntryProcessor::move_entry( - &mut state, - &ctx, - entry.id, - old_path, - new_path, - new_parent_path, - ) - .await - .map_err(|e| anyhow::anyhow!("Failed to move entry: {}", e))?; - - // Update cache - self.entry_id_cache.remove(old_path); - self.entry_id_cache.insert(new_path.to_path_buf(), entry.id); - - Ok(()) - } - - async fn delete(&mut self, entry: &EntryRef) -> Result<()> { - use sea_orm::TransactionTrait; - - // Collect all descendants - let mut to_delete_ids: Vec = vec![entry.id]; - - if let Ok(rows) = entities::entry_closure::Entity::find() - .filter(entities::entry_closure::Column::AncestorId.eq(entry.id)) - .all(&self.db) - .await - { - to_delete_ids.extend(rows.into_iter().map(|r| r.descendant_id)); - } - - // Also traverse via parent_id as fallback - let mut queue = vec![entry.id]; - let mut visited = std::collections::HashSet::from([entry.id]); - - while let Some(parent) = queue.pop() { - if let Ok(children) = entities::entry::Entity::find() - .filter(entities::entry::Column::ParentId.eq(parent)) - .all(&self.db) - .await - { - for child in children { - if visited.insert(child.id) { - to_delete_ids.push(child.id); - queue.push(child.id); - } - } - } - } - - to_delete_ids.sort_unstable(); - to_delete_ids.dedup(); - - // Create tombstones for sync - let entries_to_delete = if !to_delete_ids.is_empty() { - let mut all_entries = Vec::new(); - for chunk in to_delete_ids.chunks(900) { - let batch = entities::entry::Entity::find() - .filter(entities::entry::Column::Id.is_in(chunk.to_vec())) - .all(&self.db) - .await?; - all_entries.extend(batch); - } - all_entries - } else { - Vec::new() - }; - - if !entries_to_delete.is_empty() { - if let Some(library) = self.context.get_library(self.library_id).await { - let _ = library - .sync_models_batch( - &entries_to_delete, - crate::infra::sync::ChangeType::Delete, - &self.db, - ) - .await; - } - } - - // Delete in transaction - let txn = self.db.begin().await?; - - if !to_delete_ids.is_empty() { - let _ = entities::entry_closure::Entity::delete_many() - .filter(entities::entry_closure::Column::DescendantId.is_in(to_delete_ids.clone())) - .exec(&txn) - .await; - let _ = entities::entry_closure::Entity::delete_many() - .filter(entities::entry_closure::Column::AncestorId.is_in(to_delete_ids.clone())) - .exec(&txn) - .await; - let _ = entities::directory_paths::Entity::delete_many() - .filter(entities::directory_paths::Column::EntryId.is_in(to_delete_ids.clone())) - .exec(&txn) - .await; - let _ = entities::entry::Entity::delete_many() - .filter(entities::entry::Column::Id.is_in(to_delete_ids)) - .exec(&txn) - .await; - } - - txn.commit().await?; - - // Clear from cache - self.entry_id_cache.remove(&entry.path); - - Ok(()) - } - - async fn run_processors(&self, entry: &EntryRef, _is_new: bool) -> Result<()> { - use super::processor::{ - load_location_processor_config, ContentHashProcessor, ProcessorEntry, - }; - use crate::ops::media::thumbnail::ThumbnailProcessor; - - if entry.is_directory() { - return Ok(()); - } - - let Some(library) = self.context.get_library(self.library_id).await else { - return Ok(()); - }; - - let proc_config = load_location_processor_config(self.location_id, &self.db) - .await - .unwrap_or_default(); - - // Build processor entry - let db_entry = 
entities::entry::Entity::find_by_id(entry.id) - .one(&self.db) - .await? - .ok_or_else(|| anyhow::anyhow!("Entry not found"))?; - - let mime_type = if let Some(content_id) = db_entry.content_id { - if let Ok(Some(ci)) = entities::content_identity::Entity::find_by_id(content_id) - .one(&self.db) - .await - { - if let Some(mime_id) = ci.mime_type_id { - if let Ok(Some(mime)) = entities::mime_type::Entity::find_by_id(mime_id) - .one(&self.db) - .await - { - Some(mime.mime_type) - } else { - None - } - } else { - None - } - } else { - None - } - } else { - None - }; - - let proc_entry = ProcessorEntry { - id: entry.id, - uuid: entry.uuid, - path: entry.path.clone(), - kind: entry.kind, - size: db_entry.size as u64, - content_id: db_entry.content_id, - mime_type, - }; - - let ctx = super::ctx::ResponderCtx::new(&self.context, self.library_id).await?; - - // Content hash - if proc_config - .watcher_processors - .iter() - .any(|c| c.processor_type == "content_hash" && c.enabled) - { - let content_proc = ContentHashProcessor::new(self.library_id); - if let Err(e) = content_proc.process(&ctx, &proc_entry).await { - tracing::warn!("Content hash processing failed: {}", e); - } - } - - // Thumbnail - if proc_config - .watcher_processors - .iter() - .any(|c| c.processor_type == "thumbnail" && c.enabled) - { - let thumb_proc = ThumbnailProcessor::new(library.clone()); - if thumb_proc.should_process(&proc_entry) { - if let Err(e) = thumb_proc.process(&self.db, &proc_entry).await { - tracing::warn!("Thumbnail processing failed: {}", e); - } - } - } - - Ok(()) - } - - async fn emit_change_event(&self, entry: &EntryRef, change_type: ChangeType) -> Result<()> { - use crate::domain::ResourceManager; - - if let Some(uuid) = entry.uuid { - let resource_manager = - ResourceManager::new(Arc::new(self.db.clone()), self.context.events.clone()); - - if let Err(e) = resource_manager - .emit_resource_events("entry", vec![uuid]) - .await - { - tracing::warn!( - "Failed to emit resource event for {:?} entry: {}", - change_type, - e - ); - } - } - - Ok(()) - } - - async fn handle_new_directory(&self, path: &Path) -> Result<()> { - use super::job::{IndexMode, IndexerJob}; - use crate::domain::addressing::SdPath; - - let Some(library) = self.context.get_library(self.library_id).await else { - return Ok(()); - }; - - // Get index mode from location - let index_mode = if let Ok(Some(loc)) = entities::location::Entity::find() - .filter(entities::location::Column::Uuid.eq(self.location_id)) - .one(&self.db) - .await - { - match loc.index_mode.as_str() { - "shallow" => IndexMode::Shallow, - "content" => IndexMode::Content, - "deep" => IndexMode::Deep, - _ => IndexMode::Content, - } - } else { - IndexMode::Content - }; - - let indexer_job = - IndexerJob::from_location(self.location_id, SdPath::local(path), index_mode); - - if let Err(e) = library.jobs().dispatch(indexer_job).await { - tracing::warn!( - "Failed to spawn indexer job for directory {}: {}", - path.display(), - e - ); - } else { - tracing::debug!( - "Spawned recursive indexer job for directory: {}", - path.display() - ); - } - - Ok(()) - } -} - -// ============================================================================ -// Ephemeral Change Handler (Memory-backed) -// ============================================================================ - -use super::job::EphemeralIndex; -use tokio::sync::RwLock; - -/// Memory-backed change handler for ephemeral browsing. -/// -/// Updates the EphemeralIndex directly without database writes. 
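-/// Entries are keyed by path with synthetic IDs (the index uses arena
-/// indices internally), so inode-based move detection is unavailable here.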
-/// Skips processor pipeline (no thumbnails/content hash for ephemeral). -pub struct EphemeralChangeHandler { - index: Arc>, - event_bus: Arc, - root_path: PathBuf, - /// Synthetic ID counter (EphemeralIndex uses arena indices internally) - next_id: std::sync::atomic::AtomicI32, -} - -impl EphemeralChangeHandler { - pub fn new( - index: Arc>, - event_bus: Arc, - root_path: PathBuf, - ) -> Self { - Self { - index, - event_bus, - root_path, - next_id: std::sync::atomic::AtomicI32::new(1), - } - } - - fn next_id(&self) -> i32 { - self.next_id - .fetch_add(1, std::sync::atomic::Ordering::SeqCst) - } -} - -#[async_trait::async_trait] -impl ChangeHandler for EphemeralChangeHandler { - async fn find_by_path(&self, path: &Path) -> Result> { - let index = self.index.read().await; - - if let Some(metadata) = index.get_entry_ref(&path.to_path_buf()) { - let uuid = index.get_entry_uuid(&path.to_path_buf()); - - Ok(Some(EntryRef { - id: 0, // Ephemeral entries don't have stable IDs - uuid, - path: path.to_path_buf(), - kind: metadata.kind, - })) - } else { - Ok(None) - } - } - - async fn find_by_inode(&self, _inode: u64) -> Result> { - // Ephemeral index doesn't track inodes - Ok(None) - } - - async fn create(&mut self, metadata: &DirEntry, _parent_path: &Path) -> Result { - use super::entry::EntryMetadata; - - let entry_uuid = Uuid::new_v4(); - let entry_metadata = EntryMetadata::from(metadata.clone()); - - { - let mut index = self.index.write().await; - index - .add_entry(metadata.path.clone(), entry_uuid, entry_metadata) - .map_err(|e| anyhow::anyhow!("Failed to add entry to ephemeral index: {}", e))?; - } - - Ok(EntryRef { - id: self.next_id(), - uuid: Some(entry_uuid), - path: metadata.path.clone(), - kind: metadata.kind, - }) - } - - async fn update(&mut self, entry: &EntryRef, metadata: &DirEntry) -> Result<()> { - use super::entry::EntryMetadata; - - // Ephemeral index doesn't have a direct update method, - // so we remove and re-add (preserving UUID) - let uuid = entry.uuid.unwrap_or_else(Uuid::new_v4); - let entry_metadata = EntryMetadata::from(metadata.clone()); - - { - let mut index = self.index.write().await; - // The add_entry method handles duplicates by returning Ok(None) - // For updates, we need to clear first then re-add - // Since EphemeralIndex doesn't have remove_entry, we just re-add - // which effectively updates the metadata - let _ = index.add_entry(metadata.path.clone(), uuid, entry_metadata); - } - - Ok(()) - } - - async fn move_entry( - &mut self, - entry: &EntryRef, - old_path: &Path, - new_path: &Path, - _new_parent_path: &Path, - ) -> Result<()> { - // Ephemeral index doesn't support moves directly - // We delete from old path and create at new path - // Note: This loses the UUID association, but for ephemeral that's acceptable - - let metadata = build_dir_entry(new_path, None).await?; - - { - let mut index = self.index.write().await; - // Remove old entry - index.remove_entry(old_path); - - // Add at new path with preserved UUID - let uuid = entry.uuid.unwrap_or_else(Uuid::new_v4); - let entry_metadata = super::entry::EntryMetadata::from(metadata.clone()); - let _ = index.add_entry(new_path.to_path_buf(), uuid, entry_metadata); - } - - Ok(()) - } - - async fn delete(&mut self, entry: &EntryRef) -> Result<()> { - { - let mut index = self.index.write().await; - - if entry.is_directory() { - // Remove directory and all descendants - index.remove_directory_tree(&entry.path); - } else { - // Remove single entry - index.remove_entry(&entry.path); - } - } - - Ok(()) - } - - 
async fn run_processors(&self, _entry: &EntryRef, _is_new: bool) -> Result<()> { - // Ephemeral handler skips processors - no thumbnails or content hash - Ok(()) - } - - async fn emit_change_event(&self, entry: &EntryRef, change_type: ChangeType) -> Result<()> { - use crate::device::get_current_device_slug; - use crate::domain::addressing::SdPath; - use crate::domain::file::File; - use crate::domain::ContentKind; - use crate::infra::event::{Event, ResourceMetadata}; - - let Some(uuid) = entry.uuid else { - return Ok(()); - }; - - let device_slug = get_current_device_slug(); - - let sd_path = SdPath::Physical { - device_slug: device_slug.clone(), - path: entry.path.clone(), - }; - - // Get content kind from index - let content_kind = { - let index = self.index.read().await; - index.get_content_kind(&entry.path) - }; - - // Build a minimal File for the event - let metadata = build_dir_entry(&entry.path, None).await.ok(); - - if let Some(meta) = metadata { - let entry_metadata = super::entry::EntryMetadata::from(meta); - let mut file = File::from_ephemeral(uuid, &entry_metadata, sd_path); - file.content_kind = content_kind; - - let parent_path = entry.path.parent().map(|p| SdPath::Physical { - device_slug: file.sd_path.device_slug().unwrap_or("local").to_string(), - path: p.to_path_buf(), - }); - - let affected_paths = parent_path.into_iter().collect(); - - if let Ok(resource_json) = serde_json::to_value(&file) { - self.event_bus.emit(Event::ResourceChanged { - resource_type: "file".to_string(), - resource: resource_json, - metadata: Some(ResourceMetadata { - no_merge_fields: vec!["sd_path".to_string()], - alternate_ids: vec![], - affected_paths, - }), - }); - } - } - - Ok(()) - } - - async fn handle_new_directory(&self, path: &Path) -> Result<()> { - // For ephemeral, we do inline shallow indexing instead of spawning a job - use super::entry::EntryMetadata; - use super::entry::EntryProcessor; - - let mut entries = match tokio::fs::read_dir(path).await { - Ok(e) => e, - Err(e) => { - tracing::warn!( - "Failed to read directory {} for ephemeral indexing: {}", - path.display(), - e - ); - return Ok(()); - } - }; - - let mut index = self.index.write().await; - - while let Ok(Some(entry)) = entries.next_entry().await { - let entry_path = entry.path(); - - if let Ok(metadata) = entry.metadata().await { - let kind = if metadata.is_dir() { - EntryKind::Directory - } else if metadata.is_symlink() { - EntryKind::Symlink - } else { - EntryKind::File - }; - - let entry_metadata = EntryMetadata { - path: entry_path.clone(), - kind, - size: metadata.len(), - modified: metadata.modified().ok(), - accessed: metadata.accessed().ok(), - created: metadata.created().ok(), - inode: EntryProcessor::get_inode(&metadata), - permissions: None, - is_hidden: entry_path - .file_name() - .and_then(|n| n.to_str()) - .map(|n| n.starts_with('.')) - .unwrap_or(false), - }; - - let uuid = Uuid::new_v4(); - let _ = index.add_entry(entry_path, uuid, entry_metadata); - } - } - - Ok(()) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_entry_ref_is_directory() { - let file_ref = EntryRef { - id: 1, - uuid: Some(Uuid::new_v4()), - path: PathBuf::from("/test/file.txt"), - kind: EntryKind::File, - }; - assert!(!file_ref.is_directory()); - - let dir_ref = EntryRef { - id: 2, - uuid: Some(Uuid::new_v4()), - path: PathBuf::from("/test/dir"), - kind: EntryKind::Directory, - }; - assert!(dir_ref.is_directory()); - } -} diff --git a/core/src/ops/indexing/mod.rs b/core/src/ops/indexing/mod.rs index 
c63a058ef..85e2914e3 100644 --- a/core/src/ops/indexing/mod.rs +++ b/core/src/ops/indexing/mod.rs @@ -25,7 +25,6 @@ pub mod change_detection; pub mod ctx; pub mod entry; pub mod ephemeral; -pub mod handler; pub mod hierarchy; pub mod input; pub mod job; @@ -41,13 +40,13 @@ pub mod state; pub mod verify; pub use action::IndexingAction; +pub use change_detection::{ + apply_batch as apply_change_batch, Change, ChangeConfig, ChangeDetector, ChangeHandler, + ChangeType, EntryRef, EphemeralChangeHandler, PersistentChangeHandler, +}; pub use ctx::{IndexingCtx, ResponderCtx}; pub use entry::{EntryMetadata, EntryProcessor}; pub use ephemeral::EphemeralIndexCache; -pub use handler::{ - apply_batch as apply_change_batch, ChangeConfig, ChangeHandler, ChangeType, EntryRef, - EphemeralChangeHandler, PersistentChangeHandler, -}; pub use hierarchy::HierarchyQuery; pub use input::IndexInput; pub use job::{ diff --git a/core/src/ops/indexing/responder.rs b/core/src/ops/indexing/responder.rs index 0cb53469d..418e95bb7 100644 --- a/core/src/ops/indexing/responder.rs +++ b/core/src/ops/indexing/responder.rs @@ -1,103 +1,24 @@ -//! Change Detection Responder (function-style) +//! Persistent location responder. //! -//! Translates raw filesystem events into database-backed operations using the -//! indexing module. The watcher emits path-only events; this module resolves -//! real entry IDs and performs identity-preserving updates. +//! Thin wrapper over `PersistentChangeHandler` that translates raw filesystem +//! events into database mutations. The watcher calls `apply_batch` with events; +//! this module delegates to the unified change handling infrastructure. use crate::context::CoreContext; -use crate::domain::ResourceManager; use crate::infra::db::entities; use crate::infra::event::FsRawEventKind; -use crate::ops::indexing::entry::EntryProcessor; -use crate::ops::indexing::path_resolver::PathResolver; -use crate::ops::indexing::processor::{ - self, ContentHashProcessor, LocationProcessorConfig, ProcessorEntry, ProcessorResult, -}; -use crate::ops::indexing::rules::{build_default_ruler, RuleToggles, RulerDecision}; -use crate::ops::indexing::state::{DirEntry, IndexerState}; -use crate::ops::indexing::{ctx::ResponderCtx, IndexingCtx}; -use crate::ops::media::{ - ocr::OcrProcessor, proxy::ProxyProcessor, speech::SpeechToTextProcessor, - thumbnail::ThumbnailProcessor, thumbstrip::ThumbstripProcessor, -}; +use crate::ops::indexing::change_detection::{self, ChangeConfig, PersistentChangeHandler}; +use crate::ops::indexing::rules::RuleToggles; use anyhow::Result; -use sea_orm::{ColumnTrait, DbErr, EntityTrait, QueryFilter, QuerySelect, TransactionTrait}; +use sea_orm::{ColumnTrait, EntityTrait, QueryFilter, TransactionTrait}; use std::path::Path; use std::sync::Arc; -use tracing::{debug, warn}; use uuid::Uuid; -/// Check if a path exists, distinguishing between "doesn't exist" and "can't access" +/// Translates a single filesystem event into database mutations. /// -/// This is critical for preventing false deletions when volumes go offline. -/// Returns Ok(true) if path exists, Ok(false) if confirmed absent, Err if inaccessible. 
-async fn path_exists_safe(
-    path: &Path,
-    backend: Option<&Arc>,
-) -> Result<bool> {
-    use crate::volume::error::VolumeError;
-
-    if let Some(backend) = backend {
-        // Use volume backend (works for both local and cloud)
-        match backend.exists(path).await {
-            Ok(exists) => Ok(exists),
-            Err(VolumeError::NotMounted(_)) => {
-                // Volume is not mounted - don't treat as deletion
-                warn!(
-                    "Volume not mounted when checking path existence: {}",
-                    path.display()
-                );
-                Err(anyhow::anyhow!(
-                    "Volume not mounted, cannot verify path existence"
-                ))
-            }
-            Err(VolumeError::Io(ref e)) if e.kind() == std::io::ErrorKind::NotFound => {
-                // Path doesn't exist - this is OK, return false
-                Ok(false)
-            }
-            Err(VolumeError::Io(io_err)) => {
-                // Other IO errors (permissions, volume offline, etc.) - don't treat as deletion
-                warn!(
-                    "IO error when checking path existence for {}: {}",
-                    path.display(),
-                    io_err
-                );
-                Err(anyhow::anyhow!(
-                    "IO error, volume may be offline: {}",
-                    io_err
-                ))
-            }
-            Err(e) => {
-                // Other volume errors (timeout, permission denied, etc.)
-                warn!(
-                    "Volume error when checking path existence for {}: {}",
-                    path.display(),
-                    e
-                );
-                Err(e.into())
-            }
-        }
-    } else {
-        // Fallback to local filesystem
-        match tokio::fs::try_exists(path).await {
-            Ok(exists) => Ok(exists),
-            Err(e) => {
-                // IO error - can't determine existence (volume may be offline)
-                warn!(
-                    "Cannot verify path existence for {} (volume may be offline): {}",
-                    path.display(),
-                    e
-                );
-                Err(anyhow::anyhow!("Cannot access path: {}", e))
-            }
-        }
-    }
-}
-
-/// Translates a single filesystem event into database mutations: create, modify, rename, or remove.
-///
-/// Queries the database to resolve paths to entry IDs, then delegates to specialized handlers.
-/// For creates/modifies, runs the processor pipeline (content hash, thumbnails, etc.) inline.
+/// Creates a `PersistentChangeHandler` and delegates to the unified
+/// change handling infrastructure in `change_detection`.
 pub async fn apply(
     context: &Arc<CoreContext>,
     library_id: Uuid,
@@ -107,59 +28,23 @@ pub async fn apply(
     location_root: &Path,
     volume_backend: Option<&Arc>,
 ) -> Result<()> {
-    // Lightweight indexing context for DB access
-    let ctx = ResponderCtx::new(context, library_id).await?;
-
-    match kind {
-        FsRawEventKind::Create { path } => {
-            handle_create(
-                &ctx,
-                context,
-                library_id,
-                location_id,
-                &path,
-                rule_toggles,
-                location_root,
-                volume_backend,
-            )
-            .await?
-        }
-        FsRawEventKind::Modify { path } => {
-            handle_modify(
-                &ctx,
-                context,
-                library_id,
-                location_id,
-                &path,
-                rule_toggles,
-                location_root,
-                volume_backend,
-            )
-            .await?
-        }
-        FsRawEventKind::Remove { path } => handle_remove(&ctx, context, location_id, &path).await?,
-        FsRawEventKind::Rename { from, to } => {
-            handle_rename(
-                &ctx,
-                context,
-                location_id,
-                &from,
-                &to,
-                rule_toggles,
-                location_root,
-                volume_backend,
-            )
-            .await?
-        }
-    }
-    Ok(())
+    apply_batch(
+        context,
+        library_id,
+        location_id,
+        vec![kind],
+        rule_toggles,
+        location_root,
+        volume_backend,
+    )
+    .await
 }
+/// Creates a `PersistentChangeHandler` and delegates to the unified +/// `change_detection::apply_batch` which handles deduplication, ordering, +/// and correct processing sequence (removes, renames, creates, modifies). pub async fn apply_batch( context: &Arc, library_id: Uuid, @@ -173,1093 +58,59 @@ pub async fn apply_batch( return Ok(()); } - use std::sync::atomic::{AtomicU64, Ordering}; - static CALL_COUNTER: AtomicU64 = AtomicU64::new(0); - let call_id = CALL_COUNTER.fetch_add(1, Ordering::SeqCst); - - debug!( - "[BATCH #{}] Responder received batch of {} events for location {} (thread {:?})", - call_id, + tracing::debug!( + "Responder received batch of {} events for location {}", events.len(), + location_id + ); + + let mut handler = PersistentChangeHandler::new( + context.clone(), + library_id, location_id, - std::thread::current().id() - ); - - // Lightweight indexing context for DB access - let ctx = ResponderCtx::new(context, library_id).await?; - - let mut creates = Vec::new(); - let mut modifies = Vec::new(); - let mut removes = Vec::new(); - let mut renames = Vec::new(); - - for event in events { - match event { - FsRawEventKind::Create { path } => creates.push(path), - FsRawEventKind::Modify { path } => modifies.push(path), - FsRawEventKind::Remove { path } => removes.push(path), - FsRawEventKind::Rename { from, to } => renames.push((from, to)), - } - } - - // macOS FSEvents sends duplicate creates when files are written incrementally. - creates.sort(); - creates.dedup(); - modifies.sort(); - modifies.dedup(); - removes.sort(); - removes.dedup(); - - debug!( - "Processing batch: {} creates, {} modifies, {} removes, {} renames", - creates.len(), - modifies.len(), - removes.len(), - renames.len() - ); - - // Process removes - for path in removes { - if let Err(e) = handle_remove(&ctx, context, location_id, &path).await { - tracing::error!("Failed to handle remove for {}: {}", path.display(), e); - } - } - - // Process renames - for (from, to) in renames { - if let Err(e) = handle_rename( - &ctx, - context, - location_id, - &from, - &to, - rule_toggles, - location_root, - volume_backend, - ) - .await - { - tracing::error!( - "Failed to handle rename from {} to {}: {}", - from.display(), - to.display(), - e - ); - } - } - - // Process creates - for (idx, path) in creates.iter().enumerate() { - debug!( - "[BATCH #{}] Processing create {}/{}: {}", - call_id, - idx + 1, - creates.len(), - path.display() - ); - if let Err(e) = handle_create( - &ctx, - context, - library_id, - location_id, - &path, - rule_toggles, - location_root, - volume_backend, - ) - .await - { - tracing::error!("Failed to handle create for {}: {}", path.display(), e); - } - debug!( - "[BATCH #{}] Completed create {}/{}", - call_id, - idx + 1, - creates.len() - ); - } - - // Process modifies - for path in modifies { - if let Err(e) = handle_modify( - &ctx, - context, - library_id, - location_id, - &path, - rule_toggles, - location_root, - volume_backend, - ) - .await - { - tracing::error!("Failed to handle modify for {}: {}", path.display(), e); - } - } - - Ok(()) -} - -/// Fetches the location's root entry_id to scope path lookups within the correct location tree. -async fn get_location_root_entry_id(ctx: &impl IndexingCtx, location_id: Uuid) -> Result { - let location_record = entities::location::Entity::find() - .filter(entities::location::Column::Uuid.eq(location_id)) - .one(ctx.library_db()) - .await? 
- .ok_or_else(|| anyhow::anyhow!("Location not found: {}", location_id))?; - - location_record - .entry_id - .ok_or_else(|| anyhow::anyhow!("Location {} has no root entry", location_id)) -} - -/// Evaluates indexing rules to determine if a path should be skipped (hidden files, system dirs, etc.). -async fn should_filter_path( - path: &Path, - rule_toggles: RuleToggles, - location_root: &Path, - backend: Option<&Arc>, -) -> Result { - let ruler = build_default_ruler(rule_toggles, location_root, path).await; - - let metadata = if let Some(backend) = backend { - backend - .metadata(path) - .await - .map_err(|e| anyhow::anyhow!("Failed to get metadata via backend: {}", e))? - } else { - let fs_meta = tokio::fs::metadata(path).await?; - crate::volume::backend::RawMetadata { - kind: if fs_meta.is_dir() { - crate::ops::indexing::state::EntryKind::Directory - } else if fs_meta.is_symlink() { - crate::ops::indexing::state::EntryKind::Symlink - } else { - crate::ops::indexing::state::EntryKind::File - }, - size: fs_meta.len(), - modified: fs_meta.modified().ok(), - created: fs_meta.created().ok(), - accessed: fs_meta.accessed().ok(), - inode: None, - permissions: None, - } - }; - - struct SimpleMetadata { - is_dir: bool, - } - impl crate::ops::indexing::rules::MetadataForIndexerRules for SimpleMetadata { - fn is_dir(&self) -> bool { - self.is_dir - } - } - - let simple_meta = SimpleMetadata { - is_dir: metadata.kind == crate::ops::indexing::state::EntryKind::Directory, - }; - - match ruler.evaluate_path(path, &simple_meta).await { - Ok(RulerDecision::Reject) => { - debug!("Filtered path by indexing rules: {}", path.display()); - Ok(true) - } - Ok(RulerDecision::Accept) => Ok(false), - Err(e) => { - tracing::warn!("Error evaluating rules for {}: {}", path.display(), e); - Ok(false) - } - } -} - -/// Creates a new entry for the path, runs processors, and spawns recursive indexing for directories. -/// -/// Checks for duplicate creates (race conditions), inode-based moves, and filters based on rules. -/// For directories, dispatches an IndexerJob to index contents. For files, runs the processor -/// pipeline inline (content hash, thumbnails, etc.). -async fn handle_create( - ctx: &impl IndexingCtx, - context: &Arc, - library_id: Uuid, - location_id: Uuid, - path: &Path, - rule_toggles: RuleToggles, - location_root: &Path, - backend: Option<&Arc>, -) -> Result<()> { - debug!("Create: {}", path.display()); - - match path_exists_safe(path, backend).await { - Ok(true) => {} - Ok(false) => { - debug!("Path no longer exists, skipping create: {}", path.display()); - return Ok(()); - } - Err(e) => { - warn!( - "Skipping create event for inaccessible path {}: {}", - path.display(), - e - ); - return Ok(()); - } - } - - if should_filter_path(path, rule_toggles, location_root, backend).await? { - debug!("✗ Skipping filtered path: {}", path.display()); - return Ok(()); - } - - debug!("→ Processing create for: {}", path.display()); - let dir_entry = build_dir_entry(path, backend).await?; - - let location_root_entry_id = get_location_root_entry_id(ctx, location_id).await?; - if let Some(existing_id) = - resolve_entry_id_by_path_scoped(ctx, path, location_root_entry_id).await? 
- { - debug!( - "Entry already exists at path {} (entry_id={}), treating as modify instead of create", - path.display(), - existing_id - ); - return handle_modify( - ctx, - context, - library_id, - location_id, - path, - rule_toggles, - location_root, - backend, - ) - .await; - } - - if handle_move_by_inode(ctx, path, dir_entry.inode, backend).await? { - return Ok(()); - } - - let mut state = IndexerState::new(&crate::domain::addressing::SdPath::local(path)); - - if let Ok(Some(location_record)) = entities::location::Entity::find() - .filter(entities::location::Column::Uuid.eq(location_id)) - .one(ctx.library_db()) - .await - { - if let Some(location_entry_id) = location_record.entry_id { - let _ = state - .seed_ancestor_cache(ctx.library_db(), location_root, location_entry_id, path) - .await; - } - } - - let entry_id = match EntryProcessor::create_entry( - &mut state, - ctx, - &dir_entry, - 0, - path.parent().unwrap_or_else(|| Path::new("/")), + location_root, + volume_backend.cloned(), ) - .await - { - Ok(id) => { - debug!("✓ Created entry {} for path: {}", id, path.display()); - id - } - Err(e) if is_unique_constraint_violation(&e) => { - debug!( - "Unique constraint violation for {}, updating existing entry (race condition)", - path.display() - ); - - if let Some(existing_id) = - resolve_entry_id_by_path_scoped(ctx, path, location_root_entry_id).await? - { - EntryProcessor::update_entry(ctx, existing_id, &dir_entry).await?; - debug!( - "✓ Updated existing entry {} with new metadata (inode: {:?})", - existing_id, dir_entry.inode - ); - - return handle_modify( - ctx, - context, - library_id, - location_id, - path, - rule_toggles, - location_root, - backend, - ) - .await; - } else { - warn!( - "Unique constraint violation but entry not found for path: {}", - path.display() - ); - return Err(e.into()); - } - } - Err(e) => { - return Err(e.into()); - } - }; - - let entry_uuid = match entities::entry::Entity::find_by_id(entry_id) - .one(ctx.library_db()) - .await? 
- { - Some(entry) => entry.uuid, - None => None, - }; - - if dir_entry.kind == super::state::EntryKind::Directory { - debug!( - "Created directory detected, spawning recursive indexer job for: {}", - path.display() - ); - - if let Some(library) = context.get_library(library_id).await { - let location_record = entities::location::Entity::find() - .filter(entities::location::Column::Uuid.eq(location_id)) - .one(ctx.library_db()) - .await - .ok() - .flatten(); - - let index_mode = if let Some(loc) = location_record { - match loc.index_mode.as_str() { - "shallow" => super::job::IndexMode::Shallow, - "content" => super::job::IndexMode::Content, - "deep" => super::job::IndexMode::Deep, - _ => super::job::IndexMode::Content, - } - } else { - super::job::IndexMode::Content - }; - - let indexer_job = super::job::IndexerJob::from_location( - location_id, - crate::domain::addressing::SdPath::local(path), - index_mode, - ); - - if let Err(e) = library.jobs().dispatch(indexer_job).await { - warn!( - "Failed to spawn indexer job for directory {}: {}", - path.display(), - e - ); - } else { - debug!( - "✓ Spawned recursive indexer job (mode: {:?}) for directory: {}", - index_mode, - path.display() - ); - } - } - } else { - if let Some(library) = context.get_library(library_id).await { - let proc_config = - processor::load_location_processor_config(location_id, ctx.library_db()) - .await - .unwrap_or_default(); - - let proc_entry = build_processor_entry(ctx, entry_id, path).await?; - - if proc_config - .watcher_processors - .iter() - .any(|c| c.processor_type == "content_hash" && c.enabled) - { - let content_proc = ContentHashProcessor::new(library_id); - if let Err(e) = content_proc.process(ctx, &proc_entry).await { - warn!("Content hash processing failed: {}", e); - } - } - - let proc_entry = build_processor_entry(ctx, entry_id, path).await?; - - if proc_config - .watcher_processors - .iter() - .any(|c| c.processor_type == "thumbnail" && c.enabled) - { - let thumb_proc = ThumbnailProcessor::new(library.clone()); - if thumb_proc.should_process(&proc_entry) { - if let Err(e) = thumb_proc.process(ctx.library_db(), &proc_entry).await { - warn!("Thumbnail processing failed: {}", e); - } - } - } - - // Run thumbstrip processor - if proc_config - .watcher_processors - .iter() - .any(|c| c.processor_type == "thumbstrip" && c.enabled) - { - let settings = proc_config - .watcher_processors - .iter() - .find(|c| c.processor_type == "thumbstrip") - .map(|c| &c.settings); - - let thumbstrip_proc = if let Some(settings) = settings { - ThumbstripProcessor::new(library.clone()) - .with_settings(settings) - .unwrap_or_else(|e| { - warn!("Failed to parse thumbstrip settings: {}", e); - ThumbstripProcessor::new(library.clone()) - }) - } else { - ThumbstripProcessor::new(library.clone()) - }; - - if thumbstrip_proc.should_process(&proc_entry) { - if let Err(e) = thumbstrip_proc.process(ctx.library_db(), &proc_entry).await { - warn!("Thumbstrip processing failed: {}", e); - } - } - } - - // Run proxy processor - if proc_config - .watcher_processors - .iter() - .any(|c| c.processor_type == "proxy" && c.enabled) - { - let settings = proc_config - .watcher_processors - .iter() - .find(|c| c.processor_type == "proxy") - .map(|c| &c.settings); - - let proxy_proc = if let Some(settings) = settings { - ProxyProcessor::new(library.clone()) - .with_settings(settings) - .unwrap_or_else(|e| { - warn!("Failed to parse proxy settings: {}", e); - ProxyProcessor::new(library.clone()) - }) - } else { - ProxyProcessor::new(library.clone()) - 
}; - - if proxy_proc.should_process(&proc_entry) { - if let Err(e) = proxy_proc.process(ctx.library_db(), &proc_entry).await { - warn!("Proxy processing failed: {}", e); - } - } - } - - // Run OCR processor - if proc_config - .watcher_processors - .iter() - .any(|c| c.processor_type == "ocr" && c.enabled) - { - let ocr_proc = OcrProcessor::new(library.clone()); - if ocr_proc.should_process(&proc_entry) { - if let Err(e) = ocr_proc.process(ctx.library_db(), &proc_entry).await { - warn!("OCR processing failed: {}", e); - } - } - } - - // Run speech-to-text processor - if proc_config - .watcher_processors - .iter() - .any(|c| c.processor_type == "speech_to_text" && c.enabled) - { - let speech_proc = SpeechToTextProcessor::new(library.clone()); - if speech_proc.should_process(&proc_entry) { - if let Err(e) = speech_proc.process(ctx.library_db(), &proc_entry).await { - warn!("Speech-to-text processing failed: {}", e); - } - } - } - } - } - - if let Some(uuid) = entry_uuid { - debug!("→ Emitting resource event for entry {}", uuid); - let resource_manager = - ResourceManager::new(Arc::new(ctx.library_db().clone()), context.events.clone()); - - if let Err(e) = resource_manager - .emit_resource_events("entry", vec![uuid]) - .await - { - warn!("Failed to emit resource event for created entry: {}", e); - } else { - debug!("✓ Emitted resource event for entry {}", uuid); - } - } - - Ok(()) -} - -/// Updates an existing entry's metadata and re-runs processors for files. -/// -/// Detects inode-based moves before updating. For files, regenerates content hashes and -/// thumbnails in case the file contents changed. -async fn handle_modify( - ctx: &impl IndexingCtx, - context: &Arc, - library_id: Uuid, - location_id: Uuid, - path: &Path, - rule_toggles: RuleToggles, - location_root: &Path, - backend: Option<&Arc>, -) -> Result<()> { - debug!("Modify: {}", path.display()); - - match path_exists_safe(path, backend).await { - Ok(true) => {} - Ok(false) => { - debug!("Path no longer exists, skipping modify: {}", path.display()); - return Ok(()); - } - Err(e) => { - warn!( - "Skipping modify event for inaccessible path {}: {}", - path.display(), - e - ); - return Ok(()); - } - } - - if should_filter_path(path, rule_toggles, location_root, backend).await? { - debug!("✗ Skipping filtered path: {}", path.display()); - return Ok(()); - } - - debug!("→ Processing modify for: {}", path.display()); - - let location_root_entry_id = get_location_root_entry_id(ctx, location_id).await?; - - let meta = EntryProcessor::extract_metadata(path, backend).await?; - if handle_move_by_inode(ctx, path, meta.inode, backend).await? { - return Ok(()); - } - - if let Some(entry_id) = - resolve_entry_id_by_path_scoped(ctx, path, location_root_entry_id).await? - { - let dir_entry = DirEntry { - path: meta.path.clone(), - kind: meta.kind, - size: meta.size, - modified: meta.modified, - inode: meta.inode, - }; - EntryProcessor::update_entry(ctx, entry_id, &dir_entry).await?; - debug!("✓ Updated entry {} for path: {}", entry_id, path.display()); - - let entry_uuid = match entities::entry::Entity::find_by_id(entry_id) - .one(ctx.library_db()) - .await? 
- { - Some(entry) => entry.uuid, - None => None, - }; - - if dir_entry.kind == super::state::EntryKind::File { - if let Some(library) = context.get_library(library_id).await { - let proc_config = - processor::load_location_processor_config(location_id, ctx.library_db()) - .await - .unwrap_or_default(); - - let proc_entry = build_processor_entry(ctx, entry_id, path).await?; - - if proc_config - .watcher_processors - .iter() - .any(|c| c.processor_type == "content_hash" && c.enabled) - { - let content_proc = ContentHashProcessor::new(library_id); - if let Err(e) = content_proc.process(ctx, &proc_entry).await { - warn!("Content hash processing failed: {}", e); - } - } - - let proc_entry = build_processor_entry(ctx, entry_id, path).await?; - - if proc_config - .watcher_processors - .iter() - .any(|c| c.processor_type == "thumbnail" && c.enabled) - { - let thumb_proc = ThumbnailProcessor::new(library.clone()); - if thumb_proc.should_process(&proc_entry) { - if let Err(e) = thumb_proc.process(ctx.library_db(), &proc_entry).await { - warn!("Thumbnail processing failed: {}", e); - } - } - } - - // Run OCR processor - if proc_config - .watcher_processors - .iter() - .any(|c| c.processor_type == "ocr" && c.enabled) - { - let ocr_proc = OcrProcessor::new(library.clone()); - if ocr_proc.should_process(&proc_entry) { - if let Err(e) = ocr_proc.process(ctx.library_db(), &proc_entry).await { - warn!("OCR processing failed: {}", e); - } - } - } - - // Run speech-to-text processor - if proc_config - .watcher_processors - .iter() - .any(|c| c.processor_type == "speech_to_text" && c.enabled) - { - let speech_proc = SpeechToTextProcessor::new(library.clone()); - if speech_proc.should_process(&proc_entry) { - if let Err(e) = speech_proc.process(ctx.library_db(), &proc_entry).await { - warn!("Speech-to-text processing failed: {}", e); - } - } - } - } - } - - if let Some(uuid) = entry_uuid { - debug!("→ Emitting resource event for modified entry {}", uuid); - let resource_manager = - ResourceManager::new(Arc::new(ctx.library_db().clone()), context.events.clone()); - - if let Err(e) = resource_manager - .emit_resource_events("entry", vec![uuid]) - .await - { - warn!("Failed to emit resource event for modified entry: {}", e); - } else { - debug!("✓ Emitted resource event for entry {}", uuid); - } - } - } else { - debug!( - "✗ Entry not found for path, skipping modify: {}", - path.display() - ); - } - Ok(()) -} - -/// Deletes an entry and its entire subtree using closure table traversal. -/// -/// Creates tombstones for all deleted entries to sync the deletion across devices. -async fn handle_remove( - ctx: &impl IndexingCtx, - context: &Arc, - location_id: Uuid, - path: &Path, -) -> Result<()> { - debug!("Remove: {}", path.display()); - - let location_root_entry_id = get_location_root_entry_id(ctx, location_id).await?; - - if let Some(entry_id) = - resolve_entry_id_by_path_scoped(ctx, path, location_root_entry_id).await? - { - debug!("→ Deleting entry {} for path: {}", entry_id, path.display()); - delete_subtree(ctx, context, location_id, entry_id).await?; - debug!("✓ Deleted entry {} for path: {}", entry_id, path.display()); - } else { - debug!( - "✗ Entry not found for path, skipping remove: {}", - path.display() - ); - } - Ok(()) -} - -/// Moves an entry from one path to another, updating parent relationships and directory_paths. -/// -/// Checks if the destination is filtered (treats as deletion). 
Updates the entry's parent_id, -/// name, and extension, then recursively fixes descendant paths in directory_paths. -async fn handle_rename( - ctx: &impl IndexingCtx, - context: &Arc, - location_id: Uuid, - from: &Path, - to: &Path, - rule_toggles: RuleToggles, - location_root: &Path, - backend: Option<&Arc>, -) -> Result<()> { - debug!("Rename: {} -> {}", from.display(), to.display()); - - match path_exists_safe(to, backend).await { - Ok(true) => {} - Ok(false) => { - debug!( - "Destination path doesn't exist, skipping rename: {}", - to.display() - ); - return Ok(()); - } - Err(e) => { - warn!( - "Skipping rename event for inaccessible destination {}: {}", - to.display(), - e - ); - return Ok(()); - } - } - - let location_root_entry_id = get_location_root_entry_id(ctx, location_id).await?; - - if should_filter_path(to, rule_toggles, location_root, backend).await? { - debug!( - "✗ Destination path is filtered, removing entry: {}", - to.display() - ); - return handle_remove(ctx, context, location_id, from).await; - } - - debug!( - "→ Processing rename for: {} -> {}", - from.display(), - to.display() - ); - - if let Some(entry_id) = - resolve_entry_id_by_path_scoped(ctx, from, location_root_entry_id).await? - { - debug!("Found entry {} for old path, moving to new path", entry_id); - - // Create state and populate entry_id_cache with parent directories - let mut state = IndexerState::new(&crate::domain::addressing::SdPath::local(from)); - - // Populate cache with new parent directory if it exists - if let Some(new_parent_path) = to.parent() { - if let Ok(Some(parent_id)) = - resolve_directory_entry_id_scoped(ctx, new_parent_path, location_root_entry_id) - .await - { - state - .entry_id_cache - .insert(new_parent_path.to_path_buf(), parent_id); - debug!( - "Populated parent cache: {} -> {}", - new_parent_path.display(), - parent_id - ); - } - } - - EntryProcessor::move_entry( - &mut state, - ctx, - entry_id, - from, - to, - to.parent().unwrap_or_else(|| Path::new("/")), - ) - .await?; - debug!("✓ Successfully moved entry {} to new path", entry_id); - } else { - debug!( - "Entry not found for old path {}, skipping rename", - from.display() - ); - } - Ok(()) -} - -/// Extracts filesystem metadata into a DirEntry for database insertion. -async fn build_dir_entry( - path: &Path, - backend: Option<&Arc>, -) -> Result { - let meta = EntryProcessor::extract_metadata(path, backend).await?; - Ok(DirEntry { - path: meta.path, - kind: meta.kind, - size: meta.size, - modified: meta.modified, - inode: meta.inode, - }) -} - -/// Constructs a ProcessorEntry by querying the entry and resolving MIME type via content_identity. -async fn build_processor_entry( - ctx: &impl IndexingCtx, - entry_id: i32, - path: &Path, -) -> Result { - use sea_orm::EntityTrait; - - let entry = entities::entry::Entity::find_by_id(entry_id) - .one(ctx.library_db()) - .await? 
- .ok_or_else(|| anyhow::anyhow!("Entry not found"))?; - - let mime_type = if let Some(content_id) = entry.content_id { - if let Ok(Some(ci)) = entities::content_identity::Entity::find_by_id(content_id) - .one(ctx.library_db()) - .await - { - if let Some(mime_id) = ci.mime_type_id { - if let Ok(Some(mime)) = entities::mime_type::Entity::find_by_id(mime_id) - .one(ctx.library_db()) - .await - { - Some(mime.mime_type) - } else { - None - } - } else { - None - } - } else { - None - } - } else { - None - }; - - let kind = match entry.kind { - 0 => super::state::EntryKind::File, - 1 => super::state::EntryKind::Directory, - 2 => super::state::EntryKind::Symlink, - _ => super::state::EntryKind::File, - }; - - Ok(ProcessorEntry { - id: entry.id, - uuid: entry.uuid, - path: path.to_path_buf(), - kind, - size: entry.size as u64, - content_id: entry.content_id, - mime_type, - }) -} - -/// Resolves an entry ID by trying directory lookup first, then file lookup. -async fn resolve_entry_id_by_path_scoped( - ctx: &impl IndexingCtx, - abs_path: &Path, - location_root_entry_id: i32, -) -> Result> { - if let Some(id) = - resolve_directory_entry_id_scoped(ctx, abs_path, location_root_entry_id).await? - { - return Ok(Some(id)); - } - resolve_file_entry_id_scoped(ctx, abs_path, location_root_entry_id).await -} - -/// Queries directory_paths joined with entry_closure to find directories scoped to this location. -async fn resolve_directory_entry_id_scoped( - ctx: &impl IndexingCtx, - abs_path: &Path, - location_root_entry_id: i32, -) -> Result> { - use sea_orm::FromQueryResult; - - let path_str = abs_path.to_string_lossy().to_string(); - - #[derive(Debug, FromQueryResult)] - struct DirectoryEntryId { - entry_id: i32, - } - - let result = DirectoryEntryId::find_by_statement(sea_orm::Statement::from_sql_and_values( - sea_orm::DbBackend::Sqlite, - r#" - SELECT dp.entry_id - FROM directory_paths dp - INNER JOIN entry_closure ec ON ec.descendant_id = dp.entry_id - WHERE dp.path = ? - AND ec.ancestor_id = ? - "#, - vec![path_str.into(), location_root_entry_id.into()], - )) - .one(ctx.library_db()) .await?; - Ok(result.map(|r| r.entry_id)) -} - -/// Finds a file entry by resolving its parent directory, then matching name + extension. -async fn resolve_file_entry_id_scoped( - ctx: &impl IndexingCtx, - abs_path: &Path, - location_root_entry_id: i32, -) -> Result> { - let parent = match abs_path.parent() { - Some(p) => p, - None => return Ok(None), + let config = ChangeConfig { + rule_toggles, + location_root, + volume_backend, }; - let parent_id = - match resolve_directory_entry_id_scoped(ctx, parent, location_root_entry_id).await? { - Some(id) => id, - None => return Ok(None), - }; - - let name = abs_path - .file_stem() - .and_then(|s| s.to_str()) - .unwrap_or("") - .to_string(); - let ext = abs_path - .extension() - .and_then(|s| s.to_str()) - .map(|s| s.to_lowercase()); - - let mut q = entities::entry::Entity::find() - .filter(entities::entry::Column::ParentId.eq(parent_id)) - .filter(entities::entry::Column::Name.eq(name)); - if let Some(e) = ext { - q = q.filter(entities::entry::Column::Extension.eq(e)); - } else { - q = q.filter(entities::entry::Column::Extension.is_null()); - } - let model = q.one(ctx.library_db()).await?; - Ok(model.map(|m| m.id)) + change_detection::apply_batch(&mut handler, events, &config).await } -/// Detects SQLite unique constraint errors by checking error message strings. 
-fn is_unique_constraint_violation(error: &crate::infra::job::error::JobError) -> bool { - let error_msg = error.to_string().to_lowercase(); - error_msg.contains("unique constraint") - || error_msg.contains("unique index") - || error_msg.contains("constraint failed") -} +// ============================================================================ +// Subtree Deletion Utilities +// ============================================================================ +// These functions are used by sync and entity deletion code paths. +// They operate directly on the database without going through ChangeHandler. -/// Deletes an entry tree and creates tombstones for sync. +/// Deletes an entry tree without creating tombstones. /// -/// Used by watcher and indexer to propagate deletions to other devices. Traverses using both -/// entry_closure and parent_id (fallback) to handle partially-corrupted closure tables. -async fn delete_subtree( - ctx: &impl IndexingCtx, - context: &Arc, - location_id: Uuid, - entry_id: i32, -) -> Result<()> { - use sea_orm::{ColumnTrait, EntityTrait, QueryFilter}; - - let mut to_delete_ids: Vec = vec![entry_id]; - if let Ok(rows) = entities::entry_closure::Entity::find() - .filter(entities::entry_closure::Column::AncestorId.eq(entry_id)) - .all(ctx.library_db()) - .await - { - to_delete_ids.extend(rows.into_iter().map(|r| r.descendant_id)); - } - - let mut queue = vec![entry_id]; - let mut visited = std::collections::HashSet::from([entry_id]); - - while let Some(parent) = queue.pop() { - if let Ok(children) = entities::entry::Entity::find() - .filter(entities::entry::Column::ParentId.eq(parent)) - .all(ctx.library_db()) - .await - { - for child in children { - if visited.insert(child.id) { - to_delete_ids.push(child.id); - queue.push(child.id); - } - } - } - } - - to_delete_ids.sort_unstable(); - to_delete_ids.dedup(); - - tracing::debug!( - "Deleting entry {} and {} descendants (total {} entries)", - entry_id, - to_delete_ids.len() - 1, - to_delete_ids.len() - ); - - let entries_to_delete = if !to_delete_ids.is_empty() { - let mut all_entries = Vec::new(); - for chunk in to_delete_ids.chunks(900) { - let batch = entities::entry::Entity::find() - .filter(entities::entry::Column::Id.is_in(chunk.to_vec())) - .all(ctx.library_db()) - .await?; - all_entries.extend(batch); - } - all_entries - } else { - Vec::new() - }; - - if !entries_to_delete.is_empty() { - if let Some(library) = context.get_library(location_id).await { - let _ = library - .sync_models_batch( - &entries_to_delete, - crate::infra::sync::ChangeType::Delete, - ctx.library_db(), - ) - .await; - } - } - - let txn = ctx.library_db().begin().await?; - - if !to_delete_ids.is_empty() { - let _ = entities::entry_closure::Entity::delete_many() - .filter(entities::entry_closure::Column::DescendantId.is_in(to_delete_ids.clone())) - .exec(&txn) - .await; - let _ = entities::entry_closure::Entity::delete_many() - .filter(entities::entry_closure::Column::AncestorId.is_in(to_delete_ids.clone())) - .exec(&txn) - .await; - let _ = entities::directory_paths::Entity::delete_many() - .filter(entities::directory_paths::Column::EntryId.is_in(to_delete_ids.clone())) - .exec(&txn) - .await; - let _ = entities::entry::Entity::delete_many() - .filter(entities::entry::Column::Id.is_in(to_delete_ids)) - .exec(&txn) - .await; - } - - txn.commit().await?; - Ok(()) -} - -/// Deletes an entry tree without creating tombstones (used when applying remote tombstones). 
+/// Used when applying remote tombstones (the deletion was already synced,
+/// we're just applying it locally). Also used by entity cascade deletes.
 pub async fn delete_subtree_internal(
     entry_id: i32,
     db: &sea_orm::DatabaseConnection,
 ) -> Result<(), sea_orm::DbErr> {
-    use sea_orm::TransactionTrait;
-
     let txn = db.begin().await?;
     delete_subtree_no_txn(entry_id, &txn).await?;
     txn.commit().await?;
     Ok(())
 }
 
-/// Deletes a subtree within an existing transaction (no transaction management).
+/// Deletes a subtree within an existing transaction.
+///
+/// Traverses via entry_closure to find all descendants, then deletes
+/// closure links, directory_paths, and entries in the correct order.
 async fn delete_subtree_no_txn<C>(entry_id: i32, db: &C) -> Result<(), sea_orm::DbErr>
 where
     C: sea_orm::ConnectionTrait,
 {
+    // Collect all descendants via closure table
     let mut to_delete_ids: Vec<i32> = vec![entry_id];
     if let Ok(rows) = entities::entry_closure::Entity::find()
         .filter(entities::entry_closure::Column::AncestorId.eq(entry_id))
@@ -1272,6 +123,7 @@ where
     to_delete_ids.dedup();
 
     if !to_delete_ids.is_empty() {
+        // Delete closure links (both directions)
         let _ = entities::entry_closure::Entity::delete_many()
             .filter(entities::entry_closure::Column::DescendantId.is_in(to_delete_ids.clone()))
             .exec(db)
@@ -1280,10 +132,14 @@ where
             .filter(entities::entry_closure::Column::AncestorId.is_in(to_delete_ids.clone()))
             .exec(db)
             .await;
+
+        // Delete directory paths
         let _ = entities::directory_paths::Entity::delete_many()
             .filter(entities::directory_paths::Column::EntryId.is_in(to_delete_ids.clone()))
             .exec(db)
             .await;
+
+        // Delete entries
         let _ = entities::entry::Entity::delete_many()
             .filter(entities::entry::Column::Id.is_in(to_delete_ids))
             .exec(db)
             .await;
@@ -1292,76 +148,3 @@ where
 
     Ok(())
 }
-
-/// Detects moves by matching inodes: if an entry exists with the same inode at a different path, treats as a move.
-///
-/// Prevents duplicate entries when files are moved instead of deleted+created. Falls back to update
-/// if the inode matches but the path is the same (macOS FSEvents quirk).
-async fn handle_move_by_inode(
-    ctx: &impl IndexingCtx,
-    new_path: &Path,
-    inode: Option<u64>,
-    backend: Option<&Arc>,
-) -> Result<bool> {
-    let inode_val = match inode {
-        Some(i) if i != 0 => i as i64,
-        _ => return Ok(false),
-    };
-
-    debug!(
-        "→ Checking inode {} for potential move detection",
-        inode_val
-    );
-
-    if let Some(existing) = entities::entry::Entity::find()
-        .filter(entities::entry::Column::Inode.eq(inode_val))
-        .one(ctx.library_db())
-        .await?
-    {
-        let old_path = PathResolver::get_full_path(ctx.library_db(), existing.id)
-            .await
-            .unwrap_or_else(|_| std::path::PathBuf::from(&existing.name));
-
-        debug!(
-            "Found existing entry {} (uuid={:?}) with inode {}: old_path={}, new_path={}",
-            existing.id,
-            existing.uuid,
-            inode_val,
-            old_path.display(),
-            new_path.display()
-        );
-
-        if old_path != new_path {
-            debug!(
-                "✓ Detected inode-based move: {} → {}",
-                old_path.display(),
-                new_path.display()
-            );
-            let mut state = IndexerState::new(&crate::domain::addressing::SdPath::local(&old_path));
-            EntryProcessor::move_entry(
-                &mut state,
-                ctx,
-                existing.id,
-                &old_path,
-                new_path,
-                new_path.parent().unwrap_or_else(|| Path::new("/")),
-            )
-            .await?;
-            debug!("✓ Completed inode-based move for entry {}", existing.id);
-            return Ok(true);
-        } else {
-            debug!(
-                "Entry already exists at path with same inode {}, updating instead of creating: {}",
-                inode_val,
-                new_path.display()
-            );
-            let dir_entry = build_dir_entry(new_path, backend).await?;
-            EntryProcessor::update_entry(ctx, existing.id, &dir_entry).await?;
-            debug!("✓ Updated entry {} via inode match", existing.id);
-            return Ok(true);
-        }
-    } else {
-        debug!("✗ No existing entry found with inode {}", inode_val);
-    }
-    Ok(false)
-}

From ee39df74a8b50e4a7a913780c5ffa7a02f1a319d Mon Sep 17 00:00:00 2001
From: Jamie Pine 
Date: Mon, 8 Dec 2025 01:35:30 -0800
Subject: [PATCH 14/20] Refactor subtree deletion handling in indexing
 operations

- Replaced calls to `delete_subtree_internal` with `EntryProcessor::delete_subtree` in the `entry`, `location`, and `manager` modules.
- Introduced a `delete_subtree` method on `EntryProcessor` that deletes an entry and all of its descendants in a single transaction, without creating tombstones.
- Removed the now-redundant `delete_subtree_internal` function from the `responder` module, consolidating raw subtree deletion in `EntryProcessor`.
- Updated documentation and tests to reflect the new deletion entry point.
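For reviewers, the call-site migration is mechanical. A minimal sketch (`entry_id` and `db` stand in for whatever identifiers each caller already has in scope):

    // Before: free function in the responder module
    crate::ops::indexing::responder::delete_subtree_internal(entry_id, db).await?;

    // After: associated function on EntryProcessor. Semantics are unchanged:
    // a raw cascade delete that creates no tombstones and emits no events.
    crate::ops::indexing::EntryProcessor::delete_subtree(entry_id, db).await?;

Watcher-triggered deletions that must sync to other devices continue to go through `PersistentChangeHandler::delete()`.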
--- core/src/infra/db/entities/entry.rs | 2 +- core/src/infra/db/entities/location.rs | 2 +- core/src/location/manager.rs | 2 +- core/src/ops/indexing/change_detection/mod.rs | 49 +- core/src/ops/indexing/entry.rs | 79 +++ core/src/ops/indexing/ephemeral/cache.rs | 533 +++++++++++++----- .../src/ops/indexing/ephemeral/index_cache.rs | 465 --------------- core/src/ops/indexing/ephemeral/name.rs | 206 +++++++ core/src/ops/indexing/responder.rs | 71 +-- 9 files changed, 687 insertions(+), 722 deletions(-) delete mode 100644 core/src/ops/indexing/ephemeral/index_cache.rs create mode 100644 core/src/ops/indexing/ephemeral/name.rs diff --git a/core/src/infra/db/entities/entry.rs b/core/src/infra/db/entities/entry.rs index 0533e3d2b..f3417654c 100644 --- a/core/src/infra/db/entities/entry.rs +++ b/core/src/infra/db/entities/entry.rs @@ -336,7 +336,7 @@ impl crate::infra::sync::Syncable for Model { // Use delete_subtree_internal to cascade delete entire subtree // This avoids creating tombstones (we're applying a tombstone) - crate::ops::indexing::responder::delete_subtree_internal(entry.id, db).await?; + crate::ops::indexing::EntryProcessor::delete_subtree(entry.id, db).await?; Ok(()) } diff --git a/core/src/infra/db/entities/location.rs b/core/src/infra/db/entities/location.rs index 929dd7cc0..cb1c146ee 100644 --- a/core/src/infra/db/entities/location.rs +++ b/core/src/infra/db/entities/location.rs @@ -330,7 +330,7 @@ impl Syncable for Model { // Delete root entry tree first if it exists // Use delete_subtree_internal to avoid creating tombstones (we're applying a tombstone) if let Some(entry_id) = location.entry_id { - crate::ops::indexing::responder::delete_subtree_internal(entry_id, db).await?; + crate::ops::indexing::EntryProcessor::delete_subtree(entry_id, db).await?; } // Delete location record diff --git a/core/src/location/manager.rs b/core/src/location/manager.rs index 68ced1dbc..325e6f435 100644 --- a/core/src/location/manager.rs +++ b/core/src/location/manager.rs @@ -501,7 +501,7 @@ impl LocationManager { // Delete the root entry tree first if it exists // Use delete_subtree_internal to avoid creating entry tombstones (we'll tombstone the location instead) if let Some(entry_id) = location.entry_id { - crate::ops::indexing::responder::delete_subtree_internal(entry_id, library.db().conn()) + crate::ops::indexing::EntryProcessor::delete_subtree(entry_id, library.db().conn()) .await .map_err(|e| LocationError::Other(format!("Failed to delete entry tree: {}", e)))?; } diff --git a/core/src/ops/indexing/change_detection/mod.rs b/core/src/ops/indexing/change_detection/mod.rs index 2fdc17829..9ff9a76d2 100644 --- a/core/src/ops/indexing/change_detection/mod.rs +++ b/core/src/ops/indexing/change_detection/mod.rs @@ -1,43 +1,15 @@ -//! Change detection and handling for the indexing system. +//! # Change Detection //! -//! This module provides two complementary subsystems: +//! Tracks filesystem changes through two complementary subsystems: batch +//! detection during indexer jobs (`detector`) and real-time handling of watcher +//! events (`handler`). Both produce the same `Change` type and share inode-based +//! move detection, so a file moved while the indexer is running behaves +//! identically to one moved while the watcher is active. //! -//! 1. **Detection** (`detector.rs`): Batch scanning during indexer jobs. -//! Compares database state against filesystem to identify changes. -//! -//! 2. **Handling** (`handler.rs`): Real-time response to watcher events. -//! 
Applies changes (create/modify/move/delete) to storage. -//! -//! Both systems use the same `Change` type and share concepts like -//! inode-based move detection, ensuring consistent behavior. -//! -//! ## Architecture -//! -//! ```text -//! ┌─────────────────────────────────────────────────────────────┐ -//! │ Change Detection │ -//! ├─────────────────────────────────────────────────────────────┤ -//! │ │ -//! │ ┌─────────────┐ ┌─────────────┐ │ -//! │ │ Detector │ │ Handler │ │ -//! │ │ (batch) │ │ (real-time)│ │ -//! │ └──────┬──────┘ └──────┬──────┘ │ -//! │ │ │ │ -//! │ │ ┌─────────┐ │ │ -//! │ └────►│ Change │◄─────────┘ │ -//! │ │ enum │ │ -//! │ └────┬────┘ │ -//! │ │ │ -//! │ ┌──────────┴──────────┐ │ -//! │ ▼ ▼ │ -//! │ ┌─────────────┐ ┌─────────────┐ │ -//! │ │ Persistent │ │ Ephemeral │ │ -//! │ │ Handler │ │ Handler │ │ -//! │ │ (database) │ │ (in-memory) │ │ -//! │ └─────────────┘ └─────────────┘ │ -//! │ │ -//! └─────────────────────────────────────────────────────────────┘ -//! ``` +//! Changes route to either `PersistentChangeHandler` (database writes for +//! managed locations) or `EphemeralChangeHandler` (in-memory updates for +//! browsing sessions). This split keeps browsed directories responsive without +//! polluting the database with temporary entries. pub mod detector; pub mod ephemeral; @@ -45,7 +17,6 @@ pub mod handler; pub mod persistent; pub mod types; -// Re-export primary types pub use detector::ChangeDetector; pub use ephemeral::EphemeralChangeHandler; pub use handler::{ diff --git a/core/src/ops/indexing/entry.rs b/core/src/ops/indexing/entry.rs index 5f0da14bd..b7726733d 100644 --- a/core/src/ops/indexing/entry.rs +++ b/core/src/ops/indexing/entry.rs @@ -1066,4 +1066,83 @@ impl EntryProcessor { Ok(()) } + + // ======================================================================== + // Subtree Deletion + // ======================================================================== + + /// Deletes an entry and all its descendants from the database. + /// + /// This is a raw database operation that does NOT: + /// - Create tombstones for sync + /// - Emit events for UI updates + /// - Run any processors + /// + /// Use cases: + /// - Applying remote tombstones (deletion already synced) + /// - Cascade deletes from entity relationships + /// - Database cleanup operations + /// + /// For watcher-triggered deletions that need sync/events, use + /// `PersistentChangeHandler::delete()` instead. + pub async fn delete_subtree( + entry_id: i32, + db: &sea_orm::DatabaseConnection, + ) -> Result<(), sea_orm::DbErr> { + use sea_orm::TransactionTrait; + + let txn = db.begin().await?; + Self::delete_subtree_in_txn(entry_id, &txn).await?; + txn.commit().await?; + Ok(()) + } + + /// Deletes a subtree within an existing transaction. + /// + /// Traverses via entry_closure to find all descendants, then deletes + /// closure links, directory_paths, and entries in the correct order. 
+    pub async fn delete_subtree_in_txn<C>(entry_id: i32, db: &C) -> Result<(), sea_orm::DbErr>
+    where
+        C: sea_orm::ConnectionTrait,
+    {
+        use sea_orm::{ColumnTrait, EntityTrait, QueryFilter};
+
+        // Collect all descendants via closure table
+        let mut to_delete_ids: Vec<i32> = vec![entry_id];
+        if let Ok(rows) = entities::entry_closure::Entity::find()
+            .filter(entities::entry_closure::Column::AncestorId.eq(entry_id))
+            .all(db)
+            .await
+        {
+            to_delete_ids.extend(rows.into_iter().map(|r| r.descendant_id));
+        }
+        to_delete_ids.sort_unstable();
+        to_delete_ids.dedup();
+
+        if !to_delete_ids.is_empty() {
+            // Delete closure links (both directions)
+            let _ = entities::entry_closure::Entity::delete_many()
+                .filter(entities::entry_closure::Column::DescendantId.is_in(to_delete_ids.clone()))
+                .exec(db)
+                .await;
+            let _ = entities::entry_closure::Entity::delete_many()
+                .filter(entities::entry_closure::Column::AncestorId.is_in(to_delete_ids.clone()))
+                .exec(db)
+                .await;
+
+            // Delete directory paths
+            let _ = entities::directory_paths::Entity::delete_many()
+                .filter(entities::directory_paths::Column::EntryId.is_in(to_delete_ids.clone()))
+                .exec(db)
+                .await;
+
+            // Delete entries
+            let _ = entities::entry::Entity::delete_many()
+                .filter(entities::entry::Column::Id.is_in(to_delete_ids))
+                .exec(db)
+                .await;
+        }
+
+        Ok(())
+    }
 }
diff --git a/core/src/ops/indexing/ephemeral/cache.rs b/core/src/ops/indexing/ephemeral/cache.rs
index b6d66b73f..d47f0c035 100644
--- a/core/src/ops/indexing/ephemeral/cache.rs
+++ b/core/src/ops/indexing/ephemeral/cache.rs
@@ -1,206 +1,449 @@
-//! String interning cache for deduplicating filenames
+//! # Ephemeral Index Cache
 //!
-//! The NameCache provides global string interning to reduce memory usage.
-//! Common filenames like `.git`, `node_modules`, `target`, `README.md` etc.
-//! are stored only once and referenced via pointers.
-//!
-//! Benefits:
-//! - 30-40% memory reduction on typical filesystems
-//! - Pointer-based equality (faster comparisons)
-//! - Stable references for NameRef
+//! Thread-safe wrapper around a single global `EphemeralIndex`. All browsed
+//! directories share one arena and string pool, keeping memory at ~50 bytes per
+//! entry regardless of how many paths the user navigates. The cache tracks which
+//! paths are indexed (queryable), in-progress (being scanned), or watched
+//! (receiving live filesystem updates via `EphemeralChangeHandler`).
 
-use parking_lot::Mutex;
-use std::collections::BTreeSet;
+use crate::ops::indexing::EphemeralIndex;
+use parking_lot::RwLock;
+use std::{
+    collections::HashSet,
+    path::{Path, PathBuf},
+    sync::Arc,
+    time::Instant,
+};
+use tokio::sync::RwLock as TokioRwLock;
 
-/// Global string interning pool for deduplicating filenames
+/// Global cache with a single unified ephemeral index
 ///
-/// Strings are stored in a BTreeSet for ordered iteration and fast lookup.
-/// The Mutex ensures thread-safe access for concurrent indexing.
-pub struct NameCache {
-    inner: Mutex<BTreeSet<Box<str>>>,
+/// Instead of separate indexes per path, all entries live in one shared index.
+/// This maximizes memory efficiency through shared string interning and a shared arena.
+pub struct EphemeralIndexCache {
+    /// Single global index containing all browsed entries
+    index: Arc<TokioRwLock<EphemeralIndex>>,
+
+    /// Paths whose immediate children have been indexed (ready for queries)
+    indexed_paths: RwLock<HashSet<PathBuf>>,
+
+    /// Paths currently being indexed
+    indexing_in_progress: RwLock<HashSet<PathBuf>>,
+
+    /// Paths registered for filesystem watching (subset of indexed_paths)
+    watched_paths: RwLock<HashSet<PathBuf>>,
+
+    /// When the cache was created
+    created_at: Instant,
 }
 
-impl NameCache {
-    /// Create a new empty cache
-    pub fn new() -> Self {
-        Self {
-            inner: Mutex::new(BTreeSet::new()),
+impl EphemeralIndexCache {
+    /// Create a new cache with an empty global index
+    pub fn new() -> std::io::Result<Self> {
+        Ok(Self {
+            index: Arc::new(TokioRwLock::new(EphemeralIndex::new()?)),
+            indexed_paths: RwLock::new(HashSet::new()),
+            indexing_in_progress: RwLock::new(HashSet::new()),
+            watched_paths: RwLock::new(HashSet::new()),
+            created_at: Instant::now(),
+        })
+    }
+
+    /// Get the global index if the given path has been indexed
+    ///
+    /// Returns Some(index) if this path's contents are available,
+    /// None if the path hasn't been browsed yet.
+    pub fn get_for_path(&self, path: &Path) -> Option<Arc<TokioRwLock<EphemeralIndex>>> {
+        let indexed = self.indexed_paths.read();
+        if indexed.contains(path) {
+            Some(self.index.clone())
+        } else {
+            None
         }
     }
 
-    /// Intern a string and return a stable reference
-    ///
-    /// If the string already exists, returns a reference to the existing copy.
-    /// If not, inserts a new copy and returns a reference to it.
-    ///
-    /// # Safety
-    /// The returned reference is valid as long as the NameCache exists.
-    /// NameCache never removes strings, so references remain stable.
-    pub fn intern<'cache>(&'cache self, name: &str) -> &'cache str {
-        let mut inner = self.inner.lock();
-
-        // Check if already interned
-        if let Some(existing) = inner.get(name) {
-            // SAFETY: BTreeSet owns the Box<str>, which lives as long as NameCache.
-            // We return a reference with lifetime tied to &self.
-            return unsafe { &*(existing.as_ref() as *const str) };
-        }
-
-        // Insert new string
-        let boxed: Box<str> = name.into();
-        let ptr = boxed.as_ref() as *const str;
-        inner.insert(boxed);
-
-        // SAFETY: We just inserted the string, and NameCache never removes strings.
-        // The pointer remains valid as long as NameCache exists.
-        unsafe { &*ptr }
+    /// Get the global index unconditionally (for internal use)
+    pub fn get_global_index(&self) -> Arc<TokioRwLock<EphemeralIndex>> {
+        self.index.clone()
     }
 
-    /// Get the number of interned strings
+    /// Check if a path has been fully indexed
+    pub fn is_indexed(&self, path: &Path) -> bool {
+        self.indexed_paths.read().contains(path)
+    }
+
+    /// Check if indexing is in progress for a path
+    pub fn is_indexing(&self, path: &Path) -> bool {
+        self.indexing_in_progress.read().contains(path)
+    }
+
+    /// Prepare the global index for indexing a new path
+    ///
+    /// Marks the path as indexing-in-progress and returns the global index.
+    /// The indexer job should add entries to this shared index.
+    ///
+    /// If the path was previously indexed, it is dropped from the indexed set;
+    /// callers should then run `clear_for_reindex` to purge stale children and
+    /// prevent ghost entries from deleted files.
+ pub fn create_for_indexing(&self, path: PathBuf) -> Arc> { + let mut in_progress = self.indexing_in_progress.write(); + let mut indexed = self.indexed_paths.write(); + + // If this path was previously indexed, remove it from indexed set + // The actual clearing of stale entries happens asynchronously via clear_for_reindex + indexed.remove(&path); + in_progress.insert(path); + + self.index.clone() + } + + /// Clear stale entries for a path before re-indexing (async version) + /// + /// Removes files and unbrowsed subdirectories, preserving subdirectories + /// that were explicitly navigated to. Verifies preserved directories still + /// exist on the filesystem and removes deleted ones from tracking. + pub async fn clear_for_reindex(&self, path: &Path) -> usize { + let indexed = self.indexed_paths.read().clone(); + let mut index = self.index.write().await; + let (cleared, deleted_browsed_dirs) = index.clear_directory_children(path, &indexed); + + // Remove deleted browsed directories from indexed_paths + if !deleted_browsed_dirs.is_empty() { + let mut indexed_paths = self.indexed_paths.write(); + for deleted_path in deleted_browsed_dirs { + indexed_paths.remove(&deleted_path); + } + } + + cleared + } + + /// Mark indexing as complete for a path + /// + /// Moves the path from "in progress" to "indexed" state. + pub fn mark_indexing_complete(&self, path: &Path) { + let mut in_progress = self.indexing_in_progress.write(); + let mut indexed = self.indexed_paths.write(); + + in_progress.remove(path); + indexed.insert(path.to_path_buf()); + } + + /// Remove a path from the indexed set (e.g., on invalidation) + /// + /// Note: This doesn't remove entries from the index itself, + /// just marks the path as needing re-indexing. + pub fn invalidate_path(&self, path: &Path) { + let mut indexed = self.indexed_paths.write(); + indexed.remove(path); + } + + /// Get the number of indexed paths pub fn len(&self) -> usize { - self.inner.lock().len() + self.indexed_paths.read().len() } - /// Check if the cache is empty + /// Check if no paths have been indexed pub fn is_empty(&self) -> bool { - self.inner.lock().is_empty() + self.indexed_paths.read().is_empty() } - /// Check if a string is already interned - pub fn contains(&self, name: &str) -> bool { - self.inner.lock().contains(name) + /// Get all indexed paths + pub fn indexed_paths(&self) -> Vec { + self.indexed_paths.read().iter().cloned().collect() } - /// Get approximate memory usage in bytes - pub fn memory_usage(&self) -> usize { - let inner = self.inner.lock(); - // Base struct size + BTreeSet overhead + string contents - std::mem::size_of::() - + inner.len() * std::mem::size_of::>() - + inner.iter().map(|s| s.len()).sum::() + /// Get all paths currently being indexed + pub fn paths_in_progress(&self) -> Vec { + self.indexing_in_progress.read().iter().cloned().collect() } - /// Iterate over all interned strings - pub fn iter(&self) -> impl Iterator { - let inner = self.inner.lock(); - inner - .iter() - .map(|s| s.to_string()) - .collect::>() - .into_iter() + /// Register a path for filesystem watching. + /// + /// When registered, the watcher service will monitor this path for changes + /// and update the ephemeral index via `EphemeralChangeHandler`. The path + /// must already be indexed. 
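+    ///
+    /// Once registered, a watcher can route raw events back to this root via
+    /// `find_watched_root` (hypothetical event loop, sketched):
+    ///
+    /// ```ignore
+    /// if let Some(root) = cache.find_watched_root(&event_path) {
+    ///     // hand `event_path` to the EphemeralChangeHandler for `root`
+    /// }
+    /// ```
+    ///
+    /// Matching is component-wise (`Path::starts_with`), so a watched
+    /// `/mnt/nas` will not accidentally claim events under `/mnt/nas2`.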
+ pub fn register_for_watching(&self, path: PathBuf) -> bool { + let indexed = self.indexed_paths.read(); + if !indexed.contains(&path) { + return false; + } + drop(indexed); + + let mut watched = self.watched_paths.write(); + watched.insert(path); + true + } + + /// Unregister a path from filesystem watching. + pub fn unregister_from_watching(&self, path: &Path) { + let mut watched = self.watched_paths.write(); + watched.remove(path); + } + + /// Check if a path is registered for watching. + pub fn is_watched(&self, path: &Path) -> bool { + self.watched_paths.read().contains(path) + } + + /// Get all watched paths. + pub fn watched_paths(&self) -> Vec { + self.watched_paths.read().iter().cloned().collect() + } + + /// Find the watched root path that contains the given path. + /// + /// If the given path is under a watched directory, returns that directory. + /// Used by the watcher to route events to the ephemeral handler. + pub fn find_watched_root(&self, path: &Path) -> Option { + let watched = self.watched_paths.read(); + + // Find the longest matching watched path that is an ancestor of `path` + let mut best_match: Option<&PathBuf> = None; + let mut best_len = 0; + + for watched_path in watched.iter() { + if path.starts_with(watched_path) { + let len = watched_path.as_os_str().len(); + if len > best_len { + best_len = len; + best_match = Some(watched_path); + } + } + } + + best_match.cloned() + } + + /// Check if any path in an event batch is under an ephemeral watched path. + /// + /// Returns the watched root if found. + pub fn find_watched_root_for_any<'a, I>(&self, paths: I) -> Option + where + I: IntoIterator, + { + for path in paths { + if let Some(root) = self.find_watched_root(path) { + return Some(root); + } + } + None + } + + /// Get cache statistics + pub fn stats(&self) -> EphemeralIndexCacheStats { + let indexed = self.indexed_paths.read(); + let in_progress = self.indexing_in_progress.read(); + let watched = self.watched_paths.read(); + + EphemeralIndexCacheStats { + indexed_paths: indexed.len(), + indexing_in_progress: in_progress.len(), + watched_paths: watched.len(), + } + } + + /// Get how long the cache has existed + pub fn age(&self) -> std::time::Duration { + self.created_at.elapsed() + } + + /// Legacy: Get age for a specific path (returns cache age since all share one index) + pub fn get_age(&self, _path: &Path) -> Option { + Some(self.created_at.elapsed().as_secs_f64()) + } + + // Legacy compatibility methods + + /// Legacy: Get an index by exact path (for backward compatibility) + #[deprecated(note = "Use get_for_path instead")] + pub fn get(&self, path: &Path) -> Option>> { + self.get_for_path(path) + } + + /// Legacy: Get all cached paths (returns indexed paths) + #[deprecated(note = "Use indexed_paths instead")] + pub fn cached_paths(&self) -> Vec { + self.indexed_paths() + } + + /// Legacy: Insert (no-op, entries are added directly to global index) + #[deprecated(note = "Entries should be added directly to the global index")] + pub fn insert(&self, path: PathBuf, _index: Arc>) { + // Mark the path as indexed + let mut indexed = self.indexed_paths.write(); + indexed.insert(path); + } + + /// Legacy: Remove (just invalidates the path) + #[deprecated(note = "Use invalidate_path instead")] + pub fn remove(&self, path: &Path) { + self.invalidate_path(path); } } -impl Default for NameCache { +impl Default for EphemeralIndexCache { fn default() -> Self { - Self::new() + Self::new().expect("Failed to create default EphemeralIndexCache") } } -// SAFETY: NameCache 
uses Mutex for thread-safe access -unsafe impl Send for NameCache {} -unsafe impl Sync for NameCache {} +/// Statistics about the ephemeral index cache +#[derive(Debug, Clone)] +pub struct EphemeralIndexCacheStats { + /// Number of paths that have been indexed + pub indexed_paths: usize, + /// Number of paths currently being indexed + pub indexing_in_progress: usize, + /// Number of paths registered for filesystem watching + pub watched_paths: usize, +} + +impl EphemeralIndexCacheStats { + /// Legacy: total_entries now means indexed_paths + pub fn total_entries(&self) -> usize { + self.indexed_paths + } + + /// Legacy: indexing_count now means indexing_in_progress + pub fn indexing_count(&self) -> usize { + self.indexing_in_progress + } +} #[cfg(test)] mod tests { use super::*; #[test] - fn test_intern_returns_same_pointer() { - let cache = NameCache::new(); + fn test_single_global_index() { + let cache = EphemeralIndexCache::new().expect("failed to create cache"); - let s1 = cache.intern("hello"); - let s2 = cache.intern("hello"); - - // Same pointer means same interned string - assert!(std::ptr::eq(s1, s2)); - assert_eq!(s1, "hello"); + // Initially no paths are indexed + assert!(cache.is_empty()); + assert!(cache.get_for_path(Path::new("/test")).is_none()); } #[test] - fn test_intern_different_strings() { - let cache = NameCache::new(); + fn test_indexing_workflow() { + let cache = EphemeralIndexCache::new().expect("failed to create cache"); + let path = PathBuf::from("/test/path"); - let s1 = cache.intern("hello"); - let s2 = cache.intern("world"); + // Start indexing + let _index = cache.create_for_indexing(path.clone()); + assert!(cache.is_indexing(&path)); + assert!(!cache.is_indexed(&path)); - assert!(!std::ptr::eq(s1, s2)); - assert_eq!(s1, "hello"); - assert_eq!(s2, "world"); + // Complete indexing + cache.mark_indexing_complete(&path); + assert!(!cache.is_indexing(&path)); + assert!(cache.is_indexed(&path)); + + // Now get_for_path returns the index + assert!(cache.get_for_path(&path).is_some()); } #[test] - fn test_len_and_contains() { - let cache = NameCache::new(); + fn test_shared_index_across_paths() { + let cache = EphemeralIndexCache::new().expect("failed to create cache"); - assert_eq!(cache.len(), 0); - assert!(!cache.contains("test")); + let path1 = PathBuf::from("/test/path1"); + let path2 = PathBuf::from("/test/path2"); - cache.intern("test"); - assert_eq!(cache.len(), 1); - assert!(cache.contains("test")); + // Start indexing both paths + let index1 = cache.create_for_indexing(path1.clone()); + let index2 = cache.create_for_indexing(path2.clone()); - // Interning same string doesn't increase count - cache.intern("test"); - assert_eq!(cache.len(), 1); + // They should be the same index + assert!(Arc::ptr_eq(&index1, &index2)); + + // Complete both + cache.mark_indexing_complete(&path1); + cache.mark_indexing_complete(&path2); + + // Both paths now indexed + assert!(cache.is_indexed(&path1)); + assert!(cache.is_indexed(&path2)); + assert_eq!(cache.len(), 2); } #[test] - fn test_common_filenames() { - let cache = NameCache::new(); + fn test_invalidate_path() { + let cache = EphemeralIndexCache::new().expect("failed to create cache"); + let path = PathBuf::from("/test/path"); - // Simulate common filesystem patterns - let common_names = [ - ".git", - ".gitignore", - "node_modules", - "target", - "Cargo.toml", - "README.md", - "package.json", - "src", - "lib", - "main.rs", - ]; + // Index the path + let _index = cache.create_for_indexing(path.clone()); + 
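+        // (completing immediately is fine here: the test exercises the
+        // indexed/invalidated bookkeeping, not the entries themselves)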
cache.mark_indexing_complete(&path); + assert!(cache.is_indexed(&path)); - for name in &common_names { - cache.intern(name); - } + // Invalidate it + cache.invalidate_path(&path); + assert!(!cache.is_indexed(&path)); - // All unique, so length equals count - assert_eq!(cache.len(), common_names.len()); - - // Interning again returns same references - for name in &common_names { - let ptr1 = cache.intern(name); - let ptr2 = cache.intern(name); - assert!(std::ptr::eq(ptr1, ptr2)); - } + // get_for_path now returns None + assert!(cache.get_for_path(&path).is_none()); } #[test] - fn test_thread_safety() { - use std::sync::Arc; - use std::thread; + fn test_stats() { + let cache = EphemeralIndexCache::new().expect("failed to create cache"); - let cache = Arc::new(NameCache::new()); - let mut handles = vec![]; + let path1 = PathBuf::from("/ready"); + let path2 = PathBuf::from("/in_progress"); - for i in 0..10 { - let cache = Arc::clone(&cache); - handles.push(thread::spawn(move || { - for j in 0..100 { - let name = format!("file_{}_{}", i, j); - cache.intern(&name); - } - })); - } + // One indexed, one in progress + let _index = cache.create_for_indexing(path1.clone()); + cache.mark_indexing_complete(&path1); - for handle in handles { - handle.join().unwrap(); - } + let _index = cache.create_for_indexing(path2.clone()); - // Should have 1000 unique strings - assert_eq!(cache.len(), 1000); + let stats = cache.stats(); + assert_eq!(stats.indexed_paths, 1); + assert_eq!(stats.indexing_in_progress, 1); + } + + #[test] + fn test_watch_registration() { + let cache = EphemeralIndexCache::new().expect("failed to create cache"); + let path = PathBuf::from("/test/watched"); + + // Can't watch a path that's not indexed + assert!(!cache.register_for_watching(path.clone())); + assert!(!cache.is_watched(&path)); + + // Index the path first + let _index = cache.create_for_indexing(path.clone()); + cache.mark_indexing_complete(&path); + + // Now we can register for watching + assert!(cache.register_for_watching(path.clone())); + assert!(cache.is_watched(&path)); + + // Stats should reflect watched path + let stats = cache.stats(); + assert_eq!(stats.watched_paths, 1); + + // Unregister + cache.unregister_from_watching(&path); + assert!(!cache.is_watched(&path)); + } + + #[test] + fn test_find_watched_root() { + let cache = EphemeralIndexCache::new().expect("failed to create cache"); + + let root = PathBuf::from("/mnt/nas"); + let child = PathBuf::from("/mnt/nas/documents/report.pdf"); + + // Index and watch the root + let _index = cache.create_for_indexing(root.clone()); + cache.mark_indexing_complete(&root); + cache.register_for_watching(root.clone()); + + // Child path should find the watched root + assert_eq!(cache.find_watched_root(&child), Some(root.clone())); + + // Unrelated path should not find a root + assert_eq!(cache.find_watched_root(Path::new("/other/path")), None); } } diff --git a/core/src/ops/indexing/ephemeral/index_cache.rs b/core/src/ops/indexing/ephemeral/index_cache.rs deleted file mode 100644 index 57d558d65..000000000 --- a/core/src/ops/indexing/ephemeral/index_cache.rs +++ /dev/null @@ -1,465 +0,0 @@ -//! Global cache for ephemeral indexes -//! -//! This module provides a thread-safe cache with a SINGLE global ephemeral index. -//! All browsed directories share the same arena and string interning pool, -//! providing efficient memory usage through deduplication. -//! -//! Key benefits of unified index: -//! 
- String interning shared across all paths (common names like .git, README.md) -//! - Single arena for all entries (~50 bytes per entry vs ~200 with HashMap) -//! - Hierarchical structure preserved for efficient directory listings -//! -//! The cache tracks which paths have been indexed (ready) vs are currently -//! being indexed (in progress). -//! -//! ## File Watching Support -//! -//! The cache can optionally track which paths should be monitored for filesystem -//! changes. When a path is marked for watching, the watcher service can detect -//! changes and update the ephemeral index via `EphemeralChangeHandler`. - -use crate::ops::indexing::EphemeralIndex; -use parking_lot::RwLock; -use std::{ - collections::HashSet, - path::{Path, PathBuf}, - sync::Arc, - time::Instant, -}; -use tokio::sync::RwLock as TokioRwLock; - -/// Global cache with a single unified ephemeral index -/// -/// Instead of separate indexes per path, all entries live in one shared index. -/// This maximizes memory efficiency through shared string interning and arena. -pub struct EphemeralIndexCache { - /// Single global index containing all browsed entries - index: Arc>, - - /// Paths whose immediate children have been indexed (ready for queries) - indexed_paths: RwLock>, - - /// Paths currently being indexed - indexing_in_progress: RwLock>, - - /// Paths registered for filesystem watching (subset of indexed_paths) - watched_paths: RwLock>, - - /// When the cache was created - created_at: Instant, -} - -impl EphemeralIndexCache { - /// Create a new cache with an empty global index - pub fn new() -> std::io::Result { - Ok(Self { - index: Arc::new(TokioRwLock::new(EphemeralIndex::new()?)), - indexed_paths: RwLock::new(HashSet::new()), - indexing_in_progress: RwLock::new(HashSet::new()), - watched_paths: RwLock::new(HashSet::new()), - created_at: Instant::now(), - }) - } - - /// Get the global index if the given path has been indexed - /// - /// Returns Some(index) if this path's contents are available, - /// None if the path hasn't been browsed yet. - pub fn get_for_path(&self, path: &Path) -> Option>> { - let indexed = self.indexed_paths.read(); - if indexed.contains(path) { - Some(self.index.clone()) - } else { - None - } - } - - /// Get the global index unconditionally (for internal use) - pub fn get_global_index(&self) -> Arc> { - self.index.clone() - } - - /// Check if a path has been fully indexed - pub fn is_indexed(&self, path: &Path) -> bool { - self.indexed_paths.read().contains(path) - } - - /// Check if indexing is in progress for a path - pub fn is_indexing(&self, path: &Path) -> bool { - self.indexing_in_progress.read().contains(path) - } - - /// Prepare the global index for indexing a new path - /// - /// Marks the path as indexing-in-progress and returns the global index. - /// The indexer job should add entries to this shared index. - /// - /// If the path was previously indexed, clears its children first to - /// prevent ghost entries from deleted files. 
- pub fn create_for_indexing(&self, path: PathBuf) -> Arc> { - let mut in_progress = self.indexing_in_progress.write(); - let mut indexed = self.indexed_paths.write(); - - // If this path was previously indexed, remove it from indexed set - // The actual clearing of stale entries happens asynchronously via clear_for_reindex - indexed.remove(&path); - in_progress.insert(path); - - self.index.clone() - } - - /// Clear stale entries for a path before re-indexing (async version) - /// - /// Removes files and unbrowsed subdirectories, preserving subdirectories - /// that were explicitly navigated to. Verifies preserved directories still - /// exist on the filesystem and removes deleted ones from tracking. - pub async fn clear_for_reindex(&self, path: &Path) -> usize { - let indexed = self.indexed_paths.read().clone(); - let mut index = self.index.write().await; - let (cleared, deleted_browsed_dirs) = index.clear_directory_children(path, &indexed); - - // Remove deleted browsed directories from indexed_paths - if !deleted_browsed_dirs.is_empty() { - let mut indexed_paths = self.indexed_paths.write(); - for deleted_path in deleted_browsed_dirs { - indexed_paths.remove(&deleted_path); - } - } - - cleared - } - - /// Mark indexing as complete for a path - /// - /// Moves the path from "in progress" to "indexed" state. - pub fn mark_indexing_complete(&self, path: &Path) { - let mut in_progress = self.indexing_in_progress.write(); - let mut indexed = self.indexed_paths.write(); - - in_progress.remove(path); - indexed.insert(path.to_path_buf()); - } - - /// Remove a path from the indexed set (e.g., on invalidation) - /// - /// Note: This doesn't remove entries from the index itself, - /// just marks the path as needing re-indexing. - pub fn invalidate_path(&self, path: &Path) { - let mut indexed = self.indexed_paths.write(); - indexed.remove(path); - } - - /// Get the number of indexed paths - pub fn len(&self) -> usize { - self.indexed_paths.read().len() - } - - /// Check if no paths have been indexed - pub fn is_empty(&self) -> bool { - self.indexed_paths.read().is_empty() - } - - /// Get all indexed paths - pub fn indexed_paths(&self) -> Vec { - self.indexed_paths.read().iter().cloned().collect() - } - - /// Get all paths currently being indexed - pub fn paths_in_progress(&self) -> Vec { - self.indexing_in_progress.read().iter().cloned().collect() - } - - // ======================================================================== - // File Watching Support - // ======================================================================== - - /// Register a path for filesystem watching. - /// - /// When registered, the watcher service will monitor this path for changes - /// and update the ephemeral index via `EphemeralChangeHandler`. The path - /// must already be indexed. - pub fn register_for_watching(&self, path: PathBuf) -> bool { - let indexed = self.indexed_paths.read(); - if !indexed.contains(&path) { - return false; - } - drop(indexed); - - let mut watched = self.watched_paths.write(); - watched.insert(path); - true - } - - /// Unregister a path from filesystem watching. - pub fn unregister_from_watching(&self, path: &Path) { - let mut watched = self.watched_paths.write(); - watched.remove(path); - } - - /// Check if a path is registered for watching. - pub fn is_watched(&self, path: &Path) -> bool { - self.watched_paths.read().contains(path) - } - - /// Get all watched paths. 
- pub fn watched_paths(&self) -> Vec { - self.watched_paths.read().iter().cloned().collect() - } - - /// Find the watched root path that contains the given path. - /// - /// If the given path is under a watched directory, returns that directory. - /// Used by the watcher to route events to the ephemeral handler. - pub fn find_watched_root(&self, path: &Path) -> Option { - let watched = self.watched_paths.read(); - - // Find the longest matching watched path that is an ancestor of `path` - let mut best_match: Option<&PathBuf> = None; - let mut best_len = 0; - - for watched_path in watched.iter() { - if path.starts_with(watched_path) { - let len = watched_path.as_os_str().len(); - if len > best_len { - best_len = len; - best_match = Some(watched_path); - } - } - } - - best_match.cloned() - } - - /// Check if any path in an event batch is under an ephemeral watched path. - /// - /// Returns the watched root if found. - pub fn find_watched_root_for_any<'a, I>(&self, paths: I) -> Option - where - I: IntoIterator, - { - for path in paths { - if let Some(root) = self.find_watched_root(path) { - return Some(root); - } - } - None - } - - /// Get cache statistics - pub fn stats(&self) -> EphemeralIndexCacheStats { - let indexed = self.indexed_paths.read(); - let in_progress = self.indexing_in_progress.read(); - let watched = self.watched_paths.read(); - - EphemeralIndexCacheStats { - indexed_paths: indexed.len(), - indexing_in_progress: in_progress.len(), - watched_paths: watched.len(), - } - } - - /// Get how long the cache has existed - pub fn age(&self) -> std::time::Duration { - self.created_at.elapsed() - } - - /// Legacy: Get age for a specific path (returns cache age since all share one index) - pub fn get_age(&self, _path: &Path) -> Option { - Some(self.created_at.elapsed().as_secs_f64()) - } - - // Legacy compatibility methods - - /// Legacy: Get an index by exact path (for backward compatibility) - #[deprecated(note = "Use get_for_path instead")] - pub fn get(&self, path: &Path) -> Option>> { - self.get_for_path(path) - } - - /// Legacy: Get all cached paths (returns indexed paths) - #[deprecated(note = "Use indexed_paths instead")] - pub fn cached_paths(&self) -> Vec { - self.indexed_paths() - } - - /// Legacy: Insert (no-op, entries are added directly to global index) - #[deprecated(note = "Entries should be added directly to the global index")] - pub fn insert(&self, path: PathBuf, _index: Arc>) { - // Mark the path as indexed - let mut indexed = self.indexed_paths.write(); - indexed.insert(path); - } - - /// Legacy: Remove (just invalidates the path) - #[deprecated(note = "Use invalidate_path instead")] - pub fn remove(&self, path: &Path) { - self.invalidate_path(path); - } -} - -impl Default for EphemeralIndexCache { - fn default() -> Self { - Self::new().expect("Failed to create default EphemeralIndexCache") - } -} - -/// Statistics about the ephemeral index cache -#[derive(Debug, Clone)] -pub struct EphemeralIndexCacheStats { - /// Number of paths that have been indexed - pub indexed_paths: usize, - /// Number of paths currently being indexed - pub indexing_in_progress: usize, - /// Number of paths registered for filesystem watching - pub watched_paths: usize, -} - -impl EphemeralIndexCacheStats { - /// Legacy: total_entries now means indexed_paths - pub fn total_entries(&self) -> usize { - self.indexed_paths - } - - /// Legacy: indexing_count now means indexing_in_progress - pub fn indexing_count(&self) -> usize { - self.indexing_in_progress - } -} - -#[cfg(test)] -mod tests { - 
use super::*; - - #[test] - fn test_single_global_index() { - let cache = EphemeralIndexCache::new().expect("failed to create cache"); - - // Initially no paths are indexed - assert!(cache.is_empty()); - assert!(cache.get_for_path(Path::new("/test")).is_none()); - } - - #[test] - fn test_indexing_workflow() { - let cache = EphemeralIndexCache::new().expect("failed to create cache"); - let path = PathBuf::from("/test/path"); - - // Start indexing - let _index = cache.create_for_indexing(path.clone()); - assert!(cache.is_indexing(&path)); - assert!(!cache.is_indexed(&path)); - - // Complete indexing - cache.mark_indexing_complete(&path); - assert!(!cache.is_indexing(&path)); - assert!(cache.is_indexed(&path)); - - // Now get_for_path returns the index - assert!(cache.get_for_path(&path).is_some()); - } - - #[test] - fn test_shared_index_across_paths() { - let cache = EphemeralIndexCache::new().expect("failed to create cache"); - - let path1 = PathBuf::from("/test/path1"); - let path2 = PathBuf::from("/test/path2"); - - // Start indexing both paths - let index1 = cache.create_for_indexing(path1.clone()); - let index2 = cache.create_for_indexing(path2.clone()); - - // They should be the same index - assert!(Arc::ptr_eq(&index1, &index2)); - - // Complete both - cache.mark_indexing_complete(&path1); - cache.mark_indexing_complete(&path2); - - // Both paths now indexed - assert!(cache.is_indexed(&path1)); - assert!(cache.is_indexed(&path2)); - assert_eq!(cache.len(), 2); - } - - #[test] - fn test_invalidate_path() { - let cache = EphemeralIndexCache::new().expect("failed to create cache"); - let path = PathBuf::from("/test/path"); - - // Index the path - let _index = cache.create_for_indexing(path.clone()); - cache.mark_indexing_complete(&path); - assert!(cache.is_indexed(&path)); - - // Invalidate it - cache.invalidate_path(&path); - assert!(!cache.is_indexed(&path)); - - // get_for_path now returns None - assert!(cache.get_for_path(&path).is_none()); - } - - #[test] - fn test_stats() { - let cache = EphemeralIndexCache::new().expect("failed to create cache"); - - let path1 = PathBuf::from("/ready"); - let path2 = PathBuf::from("/in_progress"); - - // One indexed, one in progress - let _index = cache.create_for_indexing(path1.clone()); - cache.mark_indexing_complete(&path1); - - let _index = cache.create_for_indexing(path2.clone()); - - let stats = cache.stats(); - assert_eq!(stats.indexed_paths, 1); - assert_eq!(stats.indexing_in_progress, 1); - } - - #[test] - fn test_watch_registration() { - let cache = EphemeralIndexCache::new().expect("failed to create cache"); - let path = PathBuf::from("/test/watched"); - - // Can't watch a path that's not indexed - assert!(!cache.register_for_watching(path.clone())); - assert!(!cache.is_watched(&path)); - - // Index the path first - let _index = cache.create_for_indexing(path.clone()); - cache.mark_indexing_complete(&path); - - // Now we can register for watching - assert!(cache.register_for_watching(path.clone())); - assert!(cache.is_watched(&path)); - - // Stats should reflect watched path - let stats = cache.stats(); - assert_eq!(stats.watched_paths, 1); - - // Unregister - cache.unregister_from_watching(&path); - assert!(!cache.is_watched(&path)); - } - - #[test] - fn test_find_watched_root() { - let cache = EphemeralIndexCache::new().expect("failed to create cache"); - - let root = PathBuf::from("/mnt/nas"); - let child = PathBuf::from("/mnt/nas/documents/report.pdf"); - - // Index and watch the root - let _index = 
cache.create_for_indexing(root.clone()); - cache.mark_indexing_complete(&root); - cache.register_for_watching(root.clone()); - - // Child path should find the watched root - assert_eq!(cache.find_watched_root(&child), Some(root.clone())); - - // Unrelated path should not find a root - assert_eq!(cache.find_watched_root(Path::new("/other/path")), None); - } -} diff --git a/core/src/ops/indexing/ephemeral/name.rs b/core/src/ops/indexing/ephemeral/name.rs new file mode 100644 index 000000000..b6d66b73f --- /dev/null +++ b/core/src/ops/indexing/ephemeral/name.rs @@ -0,0 +1,206 @@ +//! String interning cache for deduplicating filenames +//! +//! The NameCache provides global string interning to reduce memory usage. +//! Common filenames like `.git`, `node_modules`, `target`, `README.md` etc. +//! are stored only once and referenced via pointers. +//! +//! Benefits: +//! - 30-40% memory reduction on typical filesystems +//! - Pointer-based equality (faster comparisons) +//! - Stable references for NameRef + +use parking_lot::Mutex; +use std::collections::BTreeSet; + +/// Global string interning pool for deduplicating filenames +/// +/// Strings are stored in a BTreeSet for ordered iteration and fast lookup. +/// The Mutex ensures thread-safe access for concurrent indexing. +pub struct NameCache { + inner: Mutex>>, +} + +impl NameCache { + /// Create a new empty cache + pub fn new() -> Self { + Self { + inner: Mutex::new(BTreeSet::new()), + } + } + + /// Intern a string and return a stable reference + /// + /// If the string already exists, returns a reference to the existing copy. + /// If not, inserts a new copy and returns a reference to it. + /// + /// # Safety + /// The returned reference is valid as long as the NameCache exists. + /// NameCache never removes strings, so references remain stable. + pub fn intern<'cache>(&'cache self, name: &str) -> &'cache str { + let mut inner = self.inner.lock(); + + // Check if already interned + if let Some(existing) = inner.get(name) { + // SAFETY: BTreeSet owns the Box, which lives as long as NameCache. + // We return a reference with lifetime tied to &self. + return unsafe { &*(existing.as_ref() as *const str) }; + } + + // Insert new string + let boxed: Box = name.into(); + let ptr = boxed.as_ref() as *const str; + inner.insert(boxed); + + // SAFETY: We just inserted the string, and NameCache never removes strings. + // The pointer remains valid as long as NameCache exists. 
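+        // (It is the Box<str> heap allocation that keeps this address stable:
+        // B-tree node splits may move the Box pointer itself, but never the
+        // boxed string data it points to.)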
+ unsafe { &*ptr } + } + + /// Get the number of interned strings + pub fn len(&self) -> usize { + self.inner.lock().len() + } + + /// Check if the cache is empty + pub fn is_empty(&self) -> bool { + self.inner.lock().is_empty() + } + + /// Check if a string is already interned + pub fn contains(&self, name: &str) -> bool { + self.inner.lock().contains(name) + } + + /// Get approximate memory usage in bytes + pub fn memory_usage(&self) -> usize { + let inner = self.inner.lock(); + // Base struct size + BTreeSet overhead + string contents + std::mem::size_of::() + + inner.len() * std::mem::size_of::>() + + inner.iter().map(|s| s.len()).sum::() + } + + /// Iterate over all interned strings + pub fn iter(&self) -> impl Iterator { + let inner = self.inner.lock(); + inner + .iter() + .map(|s| s.to_string()) + .collect::>() + .into_iter() + } +} + +impl Default for NameCache { + fn default() -> Self { + Self::new() + } +} + +// SAFETY: NameCache uses Mutex for thread-safe access +unsafe impl Send for NameCache {} +unsafe impl Sync for NameCache {} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_intern_returns_same_pointer() { + let cache = NameCache::new(); + + let s1 = cache.intern("hello"); + let s2 = cache.intern("hello"); + + // Same pointer means same interned string + assert!(std::ptr::eq(s1, s2)); + assert_eq!(s1, "hello"); + } + + #[test] + fn test_intern_different_strings() { + let cache = NameCache::new(); + + let s1 = cache.intern("hello"); + let s2 = cache.intern("world"); + + assert!(!std::ptr::eq(s1, s2)); + assert_eq!(s1, "hello"); + assert_eq!(s2, "world"); + } + + #[test] + fn test_len_and_contains() { + let cache = NameCache::new(); + + assert_eq!(cache.len(), 0); + assert!(!cache.contains("test")); + + cache.intern("test"); + assert_eq!(cache.len(), 1); + assert!(cache.contains("test")); + + // Interning same string doesn't increase count + cache.intern("test"); + assert_eq!(cache.len(), 1); + } + + #[test] + fn test_common_filenames() { + let cache = NameCache::new(); + + // Simulate common filesystem patterns + let common_names = [ + ".git", + ".gitignore", + "node_modules", + "target", + "Cargo.toml", + "README.md", + "package.json", + "src", + "lib", + "main.rs", + ]; + + for name in &common_names { + cache.intern(name); + } + + // All unique, so length equals count + assert_eq!(cache.len(), common_names.len()); + + // Interning again returns same references + for name in &common_names { + let ptr1 = cache.intern(name); + let ptr2 = cache.intern(name); + assert!(std::ptr::eq(ptr1, ptr2)); + } + } + + #[test] + fn test_thread_safety() { + use std::sync::Arc; + use std::thread; + + let cache = Arc::new(NameCache::new()); + let mut handles = vec![]; + + for i in 0..10 { + let cache = Arc::clone(&cache); + handles.push(thread::spawn(move || { + for j in 0..100 { + let name = format!("file_{}_{}", i, j); + cache.intern(&name); + } + })); + } + + for handle in handles { + handle.join().unwrap(); + } + + // Should have 1000 unique strings + assert_eq!(cache.len(), 1000); + } +} diff --git a/core/src/ops/indexing/responder.rs b/core/src/ops/indexing/responder.rs index 418e95bb7..7638be291 100644 --- a/core/src/ops/indexing/responder.rs +++ b/core/src/ops/indexing/responder.rs @@ -1,16 +1,14 @@ //! Persistent location responder. //! -//! Thin wrapper over `PersistentChangeHandler` that translates raw filesystem +//! Thin adapter over `PersistentChangeHandler` that translates raw filesystem //! events into database mutations. 
The watcher calls `apply_batch` with events; //! this module delegates to the unified change handling infrastructure. use crate::context::CoreContext; -use crate::infra::db::entities; use crate::infra::event::FsRawEventKind; use crate::ops::indexing::change_detection::{self, ChangeConfig, PersistentChangeHandler}; use crate::ops::indexing::rules::RuleToggles; use anyhow::Result; -use sea_orm::{ColumnTrait, EntityTrait, QueryFilter, TransactionTrait}; use std::path::Path; use std::sync::Arc; use uuid::Uuid; @@ -81,70 +79,3 @@ pub async fn apply_batch( change_detection::apply_batch(&mut handler, events, &config).await } - -// ============================================================================ -// Subtree Deletion Utilities -// ============================================================================ -// These functions are used by sync and entity deletion code paths. -// They operate directly on the database without going through ChangeHandler. - -/// Deletes an entry tree without creating tombstones. -/// -/// Used when applying remote tombstones (the deletion was already synced, -/// we're just applying it locally). Also used by entity cascade deletes. -pub async fn delete_subtree_internal( - entry_id: i32, - db: &sea_orm::DatabaseConnection, -) -> Result<(), sea_orm::DbErr> { - let txn = db.begin().await?; - delete_subtree_no_txn(entry_id, &txn).await?; - txn.commit().await?; - Ok(()) -} - -/// Deletes a subtree within an existing transaction. -/// -/// Traverses via entry_closure to find all descendants, then deletes -/// closure links, directory_paths, and entries in the correct order. -async fn delete_subtree_no_txn(entry_id: i32, db: &C) -> Result<(), sea_orm::DbErr> -where - C: sea_orm::ConnectionTrait, -{ - // Collect all descendants via closure table - let mut to_delete_ids: Vec = vec![entry_id]; - if let Ok(rows) = entities::entry_closure::Entity::find() - .filter(entities::entry_closure::Column::AncestorId.eq(entry_id)) - .all(db) - .await - { - to_delete_ids.extend(rows.into_iter().map(|r| r.descendant_id)); - } - to_delete_ids.sort_unstable(); - to_delete_ids.dedup(); - - if !to_delete_ids.is_empty() { - // Delete closure links (both directions) - let _ = entities::entry_closure::Entity::delete_many() - .filter(entities::entry_closure::Column::DescendantId.is_in(to_delete_ids.clone())) - .exec(db) - .await; - let _ = entities::entry_closure::Entity::delete_many() - .filter(entities::entry_closure::Column::AncestorId.is_in(to_delete_ids.clone())) - .exec(db) - .await; - - // Delete directory paths - let _ = entities::directory_paths::Entity::delete_many() - .filter(entities::directory_paths::Column::EntryId.is_in(to_delete_ids.clone())) - .exec(db) - .await; - - // Delete entries - let _ = entities::entry::Entity::delete_many() - .filter(entities::entry::Column::Id.is_in(to_delete_ids)) - .exec(db) - .await; - } - - Ok(()) -} From bc6a347b690b1d49527c1df2862d98024a73ee03 Mon Sep 17 00:00:00 2001 From: Jamie Pine Date: Mon, 8 Dec 2025 02:14:47 -0800 Subject: [PATCH 15/20] Refactor ephemeral indexing structure and improve documentation - Introduced a new `ephemeral` module to encapsulate the `EphemeralIndex` functionality, enhancing organization and clarity. - Moved `EphemeralIndex` and related types to the new module, ensuring a cleaner separation of concerns. - Updated documentation across the `hierarchy`, `job`, and `persistence` modules to reflect the new structure and improve clarity on ephemeral indexing operations. 
- Removed deprecated references to `EphemeralIndex` in favor of the new module path, streamlining code references. - Enhanced comments and documentation to provide better context and understanding of the ephemeral indexing system. --- .../indexing/change_detection/ephemeral.rs | 2 +- core/src/ops/indexing/ephemeral/index.rs | 527 +++++++++++++++++ core/src/ops/indexing/ephemeral/mod.rs | 8 +- core/src/ops/indexing/hierarchy.rs | 43 +- core/src/ops/indexing/job.rs | 547 +----------------- core/src/ops/indexing/persistence.rs | 2 +- 6 files changed, 548 insertions(+), 581 deletions(-) create mode 100644 core/src/ops/indexing/ephemeral/index.rs diff --git a/core/src/ops/indexing/change_detection/ephemeral.rs b/core/src/ops/indexing/change_detection/ephemeral.rs index 61fffb29d..7e1e9c12e 100644 --- a/core/src/ops/indexing/change_detection/ephemeral.rs +++ b/core/src/ops/indexing/change_detection/ephemeral.rs @@ -7,7 +7,7 @@ use super::handler::{build_dir_entry, ChangeHandler}; use super::types::{ChangeType, EntryRef}; use crate::infra::event::EventBus; use crate::ops::indexing::entry::EntryMetadata; -use crate::ops::indexing::job::EphemeralIndex; +use crate::ops::indexing::ephemeral::EphemeralIndex; use crate::ops::indexing::state::{DirEntry, EntryKind}; use anyhow::Result; use std::path::{Path, PathBuf}; diff --git a/core/src/ops/indexing/ephemeral/index.rs b/core/src/ops/indexing/ephemeral/index.rs new file mode 100644 index 000000000..873d83e9d --- /dev/null +++ b/core/src/ops/indexing/ephemeral/index.rs @@ -0,0 +1,527 @@ +//! Memory-efficient index for browsing paths outside managed locations. +//! +//! Ephemeral indexing lets users navigate unmanaged directories (network shares, +//! external drives) without adding them as permanent locations. Instead of writing +//! to the database, entries live in this memory-only structure until the session +//! ends or the path is promoted to a managed location. +//! +//! Memory usage is ~50 bytes per entry vs ~200 bytes with a naive `HashMap` +//! approach. The optimization comes from: +//! - **NodeArena:** Contiguous slab allocation with pointer-sized entry IDs +//! - **NameCache:** String interning (one copy of "index.js" for thousands of node_modules files) +//! - **NameRegistry:** Trie-based prefix search without full-text indexing overhead +//! +//! Multiple directory trees can coexist in the same index (e.g., browsing both +//! `/mnt/nas` and `/media/usb` simultaneously), sharing the string interning pool +//! for maximum deduplication. + +use crate::domain::ContentKind; +use crate::filetype::FileTypeRegistry; +use crate::ops::indexing::entry::EntryMetadata; +use crate::ops::indexing::state::{EntryKind, IndexerStats}; + +use super::types::{FileNode, FileType, MaybeEntryId, NameRef, NodeState, PackedMetadata}; +use super::{EntryId, NameCache, NameRegistry, NodeArena}; + +use std::collections::HashMap; +use std::path::{Path, PathBuf}; +use std::sync::Arc; +use std::time::{Duration, Instant}; +use uuid::Uuid; + +/// Memory-efficient index for browsing unmanaged paths. 
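+///
+/// Internally, `arena` owns the nodes, `cache` interns the names, `registry`
+/// answers name and prefix lookups, and `path_index` maps absolute paths to
+/// arena ids; `entry_uuids` and `content_kinds` are side tables for per-path
+/// metadata that does not fit in a packed node.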
+pub struct EphemeralIndex { + arena: NodeArena, + cache: Arc, + registry: NameRegistry, + path_index: HashMap, + entry_uuids: HashMap, + content_kinds: HashMap, + created_at: Instant, + last_accessed: Instant, + pub stats: IndexerStats, +} + +impl std::fmt::Debug for EphemeralIndex { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("EphemeralIndex") + .field("entry_count", &self.arena.len()) + .field("interned_names", &self.cache.len()) + .field("path_count", &self.path_index.len()) + .finish() + } +} + +impl EphemeralIndex { + pub fn new() -> std::io::Result { + let cache = Arc::new(NameCache::new()); + let arena = NodeArena::new()?; + let registry = NameRegistry::new(); + + let now = Instant::now(); + + Ok(Self { + arena, + cache, + registry, + path_index: HashMap::new(), + entry_uuids: HashMap::new(), + content_kinds: HashMap::new(), + created_at: now, + last_accessed: now, + stats: IndexerStats::default(), + }) + } + + /// Ensures a directory exists, creating all missing ancestors recursively. + /// + /// This method guarantees that `list_directory()` works immediately after + /// `add_entry()` without a separate tree-building pass. Parent directories + /// are created from root to leaf, so the full ancestor chain exists before + /// any child is added. + pub fn ensure_directory(&mut self, path: &Path) -> std::io::Result { + if let Some(&id) = self.path_index.get(path) { + return Ok(id); + } + + let parent_id = if let Some(parent_path) = path.parent() { + if parent_path.as_os_str().is_empty() { + None + } else { + Some(self.ensure_directory(parent_path)?) + } + } else { + None + }; + + let name = self.cache.intern( + path.file_name() + .map(|s| s.to_string_lossy()) + .as_deref() + .unwrap_or("/"), + ); + + let parent_ref = parent_id + .map(MaybeEntryId::some) + .unwrap_or(MaybeEntryId::NONE); + let meta = PackedMetadata::new(NodeState::Accessible, FileType::Directory, 0); + let node = FileNode::new(NameRef::new(name, parent_ref), meta); + + let id = self.arena.insert(node)?; + + // Add to parent's children + if let Some(parent_id) = parent_id { + if let Some(parent) = self.arena.get_mut(parent_id) { + parent.add_child(id); + } + } + + self.path_index.insert(path.to_path_buf(), id); + self.registry.insert(name, id); + + let uuid = Uuid::new_v4(); + self.entry_uuids.insert(path.to_path_buf(), uuid); + + Ok(id) + } + + /// Adds an entry to the index, returning its content kind if successful. + /// + /// Content kind is identified by file extension (no I/O needed), which is + /// sufficient for ephemeral browsing where speed is critical. Returns Ok(None) + /// if the entry already exists (prevents duplicate entries when re-indexing + /// a directory). + pub fn add_entry( + &mut self, + path: PathBuf, + uuid: Uuid, + metadata: EntryMetadata, + ) -> std::io::Result> { + if self.path_index.contains_key(&path) { + tracing::trace!("Skipping duplicate entry: {}", path.display()); + return Ok(None); + } + + // Ensure parent directories exist before adding this entry, building the ancestor + // chain from root to leaf. The &mut borrow happens before name interning to avoid + // holding the cache lock while recursing. + let parent_id = if let Some(parent_path) = path.parent() { + if parent_path.as_os_str().is_empty() { + None + } else if let Some(&existing_id) = self.path_index.get(parent_path) { + Some(existing_id) + } else { + Some(self.ensure_directory(parent_path)?) 
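+                // (ensure_directory also interns every missing ancestor name,
+                // so the full chain lands in the same NameCache pool.)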
+ } + } else { + None + }; + + let name = self.cache.intern( + path.file_name() + .map(|s| s.to_string_lossy()) + .as_deref() + .unwrap_or("unknown"), + ); + + let file_type = FileType::from(metadata.kind); + + let meta = PackedMetadata::new(NodeState::Accessible, file_type, metadata.size) + .with_times(metadata.modified, metadata.created); + + let parent_ref = parent_id + .map(MaybeEntryId::some) + .unwrap_or(MaybeEntryId::NONE); + let node = FileNode::new(NameRef::new(name, parent_ref), meta); + + let id = self.arena.insert(node)?; + + // Add to parent's children + if let Some(parent_id) = parent_id { + if let Some(parent) = self.arena.get_mut(parent_id) { + parent.add_child(id); + } + } + + let content_kind = if metadata.kind == EntryKind::File { + let registry = FileTypeRegistry::default(); + registry.identify_by_extension(&path) + } else if metadata.kind == EntryKind::Directory { + ContentKind::Unknown + } else { + ContentKind::Unknown + }; + + self.path_index.insert(path.clone(), id); + self.registry.insert(name, id); + self.entry_uuids.insert(path.clone(), uuid); + self.content_kinds.insert(path, content_kind); + + self.last_accessed = Instant::now(); + Ok(Some(content_kind)) + } + + pub fn get_entry(&mut self, path: &PathBuf) -> Option { + let id = self.path_index.get(path)?; + let node = self.arena.get(*id)?; + + self.last_accessed = Instant::now(); + + Some(EntryMetadata { + path: path.clone(), + kind: EntryKind::from(node.meta.file_type()), + size: node.meta.size(), + modified: node.meta.mtime_as_system_time(), + accessed: None, + created: node.meta.ctime_as_system_time(), + inode: None, + permissions: None, + is_hidden: path + .file_name() + .and_then(|n| n.to_str()) + .map(|n| n.starts_with('.')) + .unwrap_or(false), + }) + } + + /// Get entry reference for read-only access (doesn't update last_accessed) + pub fn get_entry_ref(&self, path: &PathBuf) -> Option { + let id = self.path_index.get(path)?; + let node = self.arena.get(*id)?; + + Some(EntryMetadata { + path: path.clone(), + kind: EntryKind::from(node.meta.file_type()), + size: node.meta.size(), + modified: node.meta.mtime_as_system_time(), + accessed: None, + created: node.meta.ctime_as_system_time(), + inode: None, + permissions: None, + is_hidden: path + .file_name() + .and_then(|n| n.to_str()) + .map(|n| n.starts_with('.')) + .unwrap_or(false), + }) + } + + pub fn get_entry_uuid(&self, path: &PathBuf) -> Option { + self.entry_uuids.get(path).copied() + } + + pub fn get_content_kind(&self, path: &PathBuf) -> ContentKind { + self.content_kinds + .get(path) + .copied() + .unwrap_or(ContentKind::Unknown) + } + + pub fn list_directory(&self, path: &Path) -> Option> { + let id = self.path_index.get(path)?; + let node = self.arena.get(*id)?; + + Some( + node.children + .iter() + .filter_map(|&child_id| self.reconstruct_path(child_id)) + .collect(), + ) + } + + /// Clears entries before re-indexing, preserving explicitly browsed subdirectories. + /// + /// Since ephemeral indexing is shallow, subdirectories that were explicitly + /// navigated to (in `indexed_paths`) should be preserved as separate index + /// branches. Unbrowsed subdirectories are refreshed with the parent. + /// + /// Returns (cleared_count, deleted_browsed_dirs) where deleted_browsed_dirs + /// contains paths that were in indexed_paths but no longer exist on disk. 
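+    ///
+    /// Expected call pattern (hypothetical `browsed` set; illustrative only):
+    ///
+    /// ```ignore
+    /// let (cleared, gone) = index.clear_directory_children(&dir, &browsed);
+    /// for dead in gone {
+    ///     browsed.remove(&dead); // stop tracking directories deleted on disk
+    /// }
+    /// ```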
+ pub fn clear_directory_children( + &mut self, + dir_path: &Path, + indexed_paths: &std::collections::HashSet, + ) -> (usize, Vec) { + let dir_id = match self.path_index.get(dir_path) { + Some(&id) => id, + None => return (0, Vec::new()), + }; + + let dir_node = match self.arena.get(dir_id) { + Some(node) => node, + None => return (0, Vec::new()), + }; + + let mut deleted_browsed_dirs = Vec::new(); + + // Collect children to remove + let mut children_to_remove: Vec<(PathBuf, EntryId)> = dir_node + .children + .iter() + .filter_map(|&child_id| { + let child_node = self.arena.get(child_id)?; + let child_path = self.reconstruct_path(child_id)?; + + // Preserve subdirectories that were explicitly browsed AND still exist + if child_node.is_directory() && indexed_paths.contains(&child_path) { + // Verify the directory still exists on the filesystem + if std::fs::metadata(&child_path).is_ok() { + return None; // Preserve - still exists and was browsed + } + // Directory was deleted - track for removal from indexed_paths + tracing::debug!( + "Removing deleted browsed directory: {}", + child_path.display() + ); + deleted_browsed_dirs.push(child_path.clone()); + } + + // Remove everything else (files, unbrowsed directories, deleted directories) + Some((child_path, child_id)) + }) + .collect(); + + let cleared = children_to_remove.len(); + + // Remove from indexes + for (child_path, _) in &children_to_remove { + self.path_index.remove(child_path); + self.entry_uuids.remove(child_path); + self.content_kinds.remove(child_path); + } + + // Update parent's children list + if let Some(dir_node) = self.arena.get_mut(dir_id) { + let removed_ids: std::collections::HashSet<_> = + children_to_remove.iter().map(|(_, id)| id).collect(); + + dir_node + .children + .retain(|child_id| !removed_ids.contains(child_id)); + } + + if cleared > 0 { + tracing::debug!( + "Cleared {} entries from {} (preserved browsed subdirs)", + cleared, + dir_path.display() + ); + } + + (cleared, deleted_browsed_dirs) + } + + fn reconstruct_path(&self, id: EntryId) -> Option { + let mut segments = Vec::new(); + let mut current = id; + + while let Some(node) = self.arena.get(current) { + segments.push(node.name().to_owned()); + if let Some(parent) = node.parent() { + current = parent; + } else { + break; + } + } + + if segments.is_empty() { + return None; + } + + let mut path = PathBuf::from("/"); + for segment in segments.into_iter().rev() { + path.push(segment); + } + Some(path) + } + + pub fn find_by_name(&self, name: &str) -> Vec { + self.registry + .get(name) + .map(|ids| { + ids.iter() + .filter_map(|&id| self.reconstruct_path(id)) + .collect() + }) + .unwrap_or_default() + } + + pub fn find_by_prefix(&self, prefix: &str) -> Vec { + self.registry + .find_prefix(prefix) + .iter() + .filter_map(|&id| self.reconstruct_path(id)) + .collect() + } + + pub fn age(&self) -> Duration { + self.created_at.elapsed() + } + + pub fn idle_time(&self) -> Duration { + self.last_accessed.elapsed() + } + + pub fn len(&self) -> usize { + self.arena.len() + } + + pub fn is_empty(&self) -> bool { + self.arena.is_empty() + } + + pub fn memory_usage(&self) -> usize { + self.arena.memory_usage() + + self.cache.memory_usage() + + self.registry.memory_usage() + + self.path_index.capacity() + * (std::mem::size_of::() + std::mem::size_of::()) + + self.entry_uuids.capacity() + * (std::mem::size_of::() + std::mem::size_of::()) + } + + pub fn get_stats(&self) -> EphemeralIndexStats { + EphemeralIndexStats { + total_entries: self.arena.len(), + unique_names: 
self.registry.unique_names(), + interned_strings: self.cache.len(), + memory_bytes: self.memory_usage(), + } + } + + pub fn content_kinds_count(&self) -> usize { + self.content_kinds.len() + } + + pub fn path_index_count(&self) -> usize { + self.path_index.len() + } + + /// Check if an entry exists at the given path. + pub fn has_entry(&self, path: &Path) -> bool { + self.path_index.contains_key(path) + } + + /// Remove an entry at the given path. + /// + /// Returns true if the entry was removed, false if it didn't exist. + /// For directories, this only removes the directory entry itself, not its children. + /// Use `remove_directory_tree` to remove a directory and all its descendants. + pub fn remove_entry(&mut self, path: &Path) -> bool { + let existed = self.path_index.remove(path).is_some(); + self.entry_uuids.remove(path); + self.content_kinds.remove(path); + existed + } + + /// Remove a directory and all its descendants. + /// + /// Returns the number of entries removed. + pub fn remove_directory_tree(&mut self, path: &Path) -> usize { + let prefix = path.to_string_lossy().to_string(); + let keys_to_remove: Vec<_> = self + .path_index + .keys() + .filter(|k| { + let k_str = k.to_string_lossy(); + k_str == prefix || k_str.starts_with(&format!("{}/", prefix)) + }) + .cloned() + .collect(); + + let count = keys_to_remove.len(); + for key in keys_to_remove { + self.path_index.remove(&key); + self.entry_uuids.remove(&key); + self.content_kinds.remove(&key); + } + count + } + + /// Reconstructs paths for all entries and returns them as a HashMap. + /// + /// For large indexes, this can be expensive since it walks the tree to rebuild + /// every path. Prefer using `list_directory()` or `find_by_name()` for targeted + /// queries when possible. + pub fn entries(&self) -> HashMap { + let mut result = HashMap::with_capacity(self.path_index.len()); + + for (path, &id) in &self.path_index { + if let Some(node) = self.arena.get(id) { + let metadata = EntryMetadata { + path: path.clone(), + kind: EntryKind::from(node.meta.file_type()), + size: node.meta.size(), + modified: node.meta.mtime_as_system_time(), + accessed: None, + created: node.meta.ctime_as_system_time(), + inode: None, + permissions: None, + is_hidden: path + .file_name() + .and_then(|n| n.to_str()) + .map(|n| n.starts_with('.')) + .unwrap_or(false), + }; + result.insert(path.clone(), metadata); + } + } + + result + } +} + +impl Default for EphemeralIndex { + fn default() -> Self { + Self::new().expect("Failed to create default EphemeralIndex") + } +} + +/// Statistics about an ephemeral index +#[derive(Debug, Clone)] +pub struct EphemeralIndexStats { + pub total_entries: usize, + pub unique_names: usize, + pub interned_strings: usize, + pub memory_bytes: usize, +} diff --git a/core/src/ops/indexing/ephemeral/mod.rs b/core/src/ops/indexing/ephemeral/mod.rs index 02a5f3553..0ca0ee082 100644 --- a/core/src/ops/indexing/ephemeral/mod.rs +++ b/core/src/ops/indexing/ephemeral/mod.rs @@ -39,14 +39,16 @@ pub mod arena; pub mod cache; -pub mod index_cache; +pub mod index; +pub mod name; pub mod registry; pub mod responder; pub mod types; // Re-export public types pub use arena::NodeArena; -pub use cache::NameCache; -pub use index_cache::EphemeralIndexCache; +pub use cache::EphemeralIndexCache; +pub use index::{EphemeralIndex, EphemeralIndexStats}; +pub use name::NameCache; pub use registry::NameRegistry; pub use types::{EntryId, FileNode, FileType, MaybeEntryId, NameRef, NodeState, PackedMetadata}; diff --git 
a/core/src/ops/indexing/hierarchy.rs b/core/src/ops/indexing/hierarchy.rs index 61b9e4edb..4c84f227e 100644 --- a/core/src/ops/indexing/hierarchy.rs +++ b/core/src/ops/indexing/hierarchy.rs @@ -2,16 +2,15 @@ //! //! Provides O(1) tree traversal operations using a precomputed closure table. //! The closure table stores all ancestor-descendant relationships with their depths, -//! eliminating recursive queries for common operations like "get all children" or -//! "build full path". Each insert updates the closure table to maintain transitive -//! relationships, trading write complexity for instant read performance. +//! eliminating recursive queries for common operations like "get all children". +//! Each insert updates the closure table to maintain transitive relationships, +//! trading write complexity for instant read performance. +//! +//! For path resolution, use [`PathResolver::get_full_path`] which provides O(1) +//! lookups via the `directory_paths` cache table. use crate::infra::db::entities::{entry, entry_closure}; -use sea_orm::{ - ColumnTrait, Condition, DbConn, EntityTrait, JoinType, PaginatorTrait, QueryFilter, QueryOrder, - QuerySelect, RelationTrait, -}; -use std::path::PathBuf; +use sea_orm::{ColumnTrait, DbConn, EntityTrait, PaginatorTrait, QueryFilter, QueryOrder}; /// Namespace for closure table queries that avoid recursive database operations. pub struct HierarchyQuery; @@ -138,34 +137,6 @@ impl HierarchyQuery { } } - /// Constructs the absolute filesystem path by joining location_path + ancestors + entry name. - /// - /// Used for displaying full paths in UI and for validating moves/renames don't exceed - /// filesystem limits. The closure table makes this O(1) instead of recursively walking - /// parent_id links. - pub async fn build_full_path( - db: &DbConn, - entry_id: i32, - location_path: &str, - ) -> Result { - let entry = entry::Entity::find_by_id(entry_id) - .one(db) - .await? - .ok_or_else(|| sea_orm::DbErr::RecordNotFound("Entry not found".to_string()))?; - - let ancestors = Self::get_ancestors(db, entry_id).await?; - - let mut path = PathBuf::from(location_path); - - for ancestor in ancestors { - path.push(&ancestor.name); - } - - path.push(&entry.name); - - Ok(path) - } - /// Counts descendants at any depth without fetching full entry records. pub async fn count_descendants(db: &DbConn, ancestor_id: i32) -> Result { entry_closure::Entity::find() diff --git a/core/src/ops/indexing/job.rs b/core/src/ops/indexing/job.rs index 93b58f089..eebf87210 100644 --- a/core/src/ops/indexing/job.rs +++ b/core/src/ops/indexing/job.rs @@ -1,9 +1,9 @@ -//! Indexer job implementation and ephemeral index storage. +//! Indexer job implementation. //! //! This module contains the main `IndexerJob` struct that orchestrates the multi-phase -//! indexing pipeline, as well as the `EphemeralIndex` used for browsing unmanaged paths -//! without database writes. The job supports both persistent indexing (for managed locations) +//! indexing pipeline. The job supports both persistent indexing (for managed locations) //! and ephemeral indexing (for external drives, network shares, and temporary browsing). +//! 
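+//! The in-memory `EphemeralIndex` itself now lives in `super::ephemeral::index`;
+//! this module imports it from there.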
use crate::{ domain::addressing::SdPath, @@ -14,7 +14,6 @@ use sea_orm::{ColumnTrait, EntityTrait, QueryFilter}; use serde::{Deserialize, Serialize}; use specta::Type; use std::{ - collections::HashMap, path::{Path, PathBuf}, sync::Arc, time::Duration, @@ -24,7 +23,7 @@ use tracing::{info, warn}; use uuid::Uuid; use super::{ - entry::EntryMetadata, + ephemeral::EphemeralIndex, metrics::{IndexerMetrics, PhaseTimer}, phases, state::{IndexError, IndexPhase, IndexerProgress, IndexerState, IndexerStats, Phase}, @@ -41,11 +40,11 @@ use super::{ pub enum IndexMode { /// Location exists but is not indexed None, - /// Just filesystem metadata (fastest) + /// Just filesystem metadata Shallow, - /// Generate content identities via BLAKE3 hashing (enables duplicate detection) + /// Generate content identities via sampled BLAKE3 hashing (enables duplicate detection) Content, - /// Full indexing with thumbnails and text extraction (slowest) + /// Full indexing with thumbnails and text extraction Deep, } @@ -179,538 +178,6 @@ impl IndexerJobConfig { } } -/// Memory-efficient index for browsing paths outside managed locations. -/// -/// Ephemeral indexing lets users navigate unmanaged directories (network shares, -/// external drives) without adding them as permanent locations. Instead of writing -/// to the database, entries live in this memory-only structure until the session -/// ends or the path is promoted to a managed location. -/// -/// Memory usage is ~50 bytes per entry vs ~200 bytes with a naive `HashMap` -/// approach. The optimization comes from: -/// - **NodeArena:** Contiguous slab allocation with pointer-sized entry IDs -/// - **NameCache:** String interning (one copy of "index.js" for thousands of node_modules files) -/// - **NameRegistry:** Trie-based prefix search without full-text indexing overhead -/// -/// Multiple directory trees can coexist in the same index (e.g., browsing both -/// `/mnt/nas` and `/media/usb` simultaneously), sharing the string interning pool -/// for maximum deduplication. -pub struct EphemeralIndex { - arena: super::ephemeral::NodeArena, - cache: std::sync::Arc, - registry: super::ephemeral::NameRegistry, - path_index: HashMap, - entry_uuids: HashMap, - content_kinds: HashMap, - created_at: std::time::Instant, - last_accessed: std::time::Instant, - pub stats: IndexerStats, -} - -impl std::fmt::Debug for EphemeralIndex { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("EphemeralIndex") - .field("entry_count", &self.arena.len()) - .field("interned_names", &self.cache.len()) - .field("path_count", &self.path_index.len()) - .finish() - } -} - -impl EphemeralIndex { - pub fn new() -> std::io::Result { - use super::ephemeral::{NameCache, NameRegistry, NodeArena}; - - let cache = std::sync::Arc::new(NameCache::new()); - let arena = NodeArena::new()?; - let registry = NameRegistry::new(); - - let now = std::time::Instant::now(); - - Ok(Self { - arena, - cache, - registry, - path_index: HashMap::new(), - entry_uuids: HashMap::new(), - content_kinds: HashMap::new(), - created_at: now, - last_accessed: now, - stats: IndexerStats::default(), - }) - } - - /// Ensures a directory exists, creating all missing ancestors recursively. - /// - /// This method guarantees that `list_directory()` works immediately after - /// `add_entry()` without a separate tree-building pass. Parent directories - /// are created from root to leaf, so the full ancestor chain exists before - /// any child is added. 
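The root-to-leaf ordering is the key invariant: a child is never inserted before its full ancestor chain exists. A standalone sketch of the same recursion over a plain map (hypothetical types, not the arena-backed index):

use std::collections::HashMap;
use std::path::{Path, PathBuf};

// Returns the id for `path`, creating it and any missing ancestors first.
fn ensure_dir(index: &mut HashMap<PathBuf, u64>, next_id: &mut u64, path: &Path) -> u64 {
    if let Some(&id) = index.get(path) {
        return id; // already present, nothing to create
    }
    // Recurse into the parent first so ancestors exist before the child.
    if let Some(parent) = path.parent().filter(|p| !p.as_os_str().is_empty()) {
        ensure_dir(index, next_id, parent);
    }
    let id = *next_id;
    *next_id += 1;
    index.insert(path.to_path_buf(), id);
    id
}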
- pub fn ensure_directory(&mut self, path: &Path) -> std::io::Result { - use super::ephemeral::{ - FileNode, FileType, MaybeEntryId, NameRef, NodeState, PackedMetadata, - }; - use super::state::EntryKind; - - if let Some(&id) = self.path_index.get(path) { - return Ok(id); - } - - let parent_id = if let Some(parent_path) = path.parent() { - if parent_path.as_os_str().is_empty() { - None - } else { - Some(self.ensure_directory(parent_path)?) - } - } else { - None - }; - - let name = self.cache.intern( - path.file_name() - .map(|s| s.to_string_lossy()) - .as_deref() - .unwrap_or("/"), - ); - - let parent_ref = parent_id - .map(MaybeEntryId::some) - .unwrap_or(MaybeEntryId::NONE); - let meta = PackedMetadata::new(NodeState::Accessible, FileType::Directory, 0); - let node = FileNode::new(NameRef::new(name, parent_ref), meta); - - let id = self.arena.insert(node)?; - - // Add to parent's children - if let Some(parent_id) = parent_id { - if let Some(parent) = self.arena.get_mut(parent_id) { - parent.add_child(id); - } - } - - self.path_index.insert(path.to_path_buf(), id); - self.registry.insert(name, id); - - let uuid = uuid::Uuid::new_v4(); - self.entry_uuids.insert(path.to_path_buf(), uuid); - - Ok(id) - } - - /// Adds an entry to the index, returning its content kind if successful. - /// - /// Content kind is identified by file extension (no I/O needed), which is - /// sufficient for ephemeral browsing where speed is critical. Returns Ok(None) - /// if the entry already exists (prevents duplicate entries when re-indexing - /// a directory). - pub fn add_entry( - &mut self, - path: PathBuf, - uuid: Uuid, - metadata: EntryMetadata, - ) -> std::io::Result> { - use super::ephemeral::{ - FileNode, FileType, MaybeEntryId, NameRef, NodeState, PackedMetadata, - }; - use crate::domain::ContentKind; - use crate::filetype::FileTypeRegistry; - - if self.path_index.contains_key(&path) { - tracing::trace!("Skipping duplicate entry: {}", path.display()); - return Ok(None); - } - - // Ensure parent directories exist before adding this entry, building the ancestor - // chain from root to leaf. The &mut borrow happens before name interning to avoid - // holding the cache lock while recursing. - let parent_id = if let Some(parent_path) = path.parent() { - if parent_path.as_os_str().is_empty() { - None - } else if let Some(&existing_id) = self.path_index.get(parent_path) { - Some(existing_id) - } else { - Some(self.ensure_directory(parent_path)?) 
- } - } else { - None - }; - - let name = self.cache.intern( - path.file_name() - .map(|s| s.to_string_lossy()) - .as_deref() - .unwrap_or("unknown"), - ); - - let file_type = FileType::from(metadata.kind); - - let meta = PackedMetadata::new(NodeState::Accessible, file_type, metadata.size) - .with_times(metadata.modified, metadata.created); - - let parent_ref = parent_id - .map(MaybeEntryId::some) - .unwrap_or(MaybeEntryId::NONE); - let node = FileNode::new(NameRef::new(name, parent_ref), meta); - - let id = self.arena.insert(node)?; - - // Add to parent's children - if let Some(parent_id) = parent_id { - if let Some(parent) = self.arena.get_mut(parent_id) { - parent.add_child(id); - } - } - - let content_kind = if metadata.kind == super::state::EntryKind::File { - let registry = FileTypeRegistry::default(); - registry.identify_by_extension(&path) - } else if metadata.kind == super::state::EntryKind::Directory { - ContentKind::Unknown - } else { - ContentKind::Unknown - }; - - self.path_index.insert(path.clone(), id); - self.registry.insert(name, id); - self.entry_uuids.insert(path.clone(), uuid); - self.content_kinds.insert(path, content_kind); - - self.last_accessed = std::time::Instant::now(); - Ok(Some(content_kind)) - } - - pub fn get_entry(&mut self, path: &PathBuf) -> Option { - use super::state::EntryKind; - - let id = self.path_index.get(path)?; - let node = self.arena.get(*id)?; - - self.last_accessed = std::time::Instant::now(); - - Some(EntryMetadata { - path: path.clone(), - kind: EntryKind::from(node.meta.file_type()), - size: node.meta.size(), - modified: node.meta.mtime_as_system_time(), - accessed: None, - created: node.meta.ctime_as_system_time(), - inode: None, - permissions: None, - is_hidden: path - .file_name() - .and_then(|n| n.to_str()) - .map(|n| n.starts_with('.')) - .unwrap_or(false), - }) - } - - /// Get entry reference for read-only access (doesn't update last_accessed) - pub fn get_entry_ref(&self, path: &PathBuf) -> Option { - use super::state::EntryKind; - - let id = self.path_index.get(path)?; - let node = self.arena.get(*id)?; - - Some(EntryMetadata { - path: path.clone(), - kind: EntryKind::from(node.meta.file_type()), - size: node.meta.size(), - modified: node.meta.mtime_as_system_time(), - accessed: None, - created: node.meta.ctime_as_system_time(), - inode: None, - permissions: None, - is_hidden: path - .file_name() - .and_then(|n| n.to_str()) - .map(|n| n.starts_with('.')) - .unwrap_or(false), - }) - } - - pub fn get_entry_uuid(&self, path: &PathBuf) -> Option { - self.entry_uuids.get(path).copied() - } - - pub fn get_content_kind(&self, path: &PathBuf) -> crate::domain::ContentKind { - self.content_kinds - .get(path) - .copied() - .unwrap_or(crate::domain::ContentKind::Unknown) - } - - pub fn list_directory(&self, path: &std::path::Path) -> Option> { - let id = self.path_index.get(path)?; - let node = self.arena.get(*id)?; - - Some( - node.children - .iter() - .filter_map(|&child_id| self.reconstruct_path(child_id)) - .collect(), - ) - } - - /// Clears entries before re-indexing, preserving explicitly browsed subdirectories. - /// - /// Since ephemeral indexing is shallow, subdirectories that were explicitly - /// navigated to (in `indexed_paths`) should be preserved as separate index - /// branches. Unbrowsed subdirectories are refreshed with the parent. - /// - /// Returns (cleared_count, deleted_browsed_dirs) where deleted_browsed_dirs - /// contains paths that were in indexed_paths but no longer exist on disk. 
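A sketch of the re-index flow this contract implies, assuming the method keeps this signature after its move into `ephemeral::index` and that the caller owns the `indexed_paths` set (the helper itself is hypothetical):

use crate::ops::indexing::ephemeral::EphemeralIndex;
use std::collections::HashSet;
use std::path::{Path, PathBuf};

fn refresh_directory(index: &mut EphemeralIndex, dir: &Path, indexed_paths: &mut HashSet<PathBuf>) {
    let (cleared, deleted_browsed) = index.clear_directory_children(dir, indexed_paths);
    // Browsed subdirectories that vanished from disk must also leave the
    // indexed set, or they would be treated as preserved forever.
    for gone in deleted_browsed {
        indexed_paths.remove(&gone);
    }
    tracing::debug!("refreshed {}: cleared {} stale entries", dir.display(), cleared);
}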
- pub fn clear_directory_children( - &mut self, - dir_path: &Path, - indexed_paths: &std::collections::HashSet, - ) -> (usize, Vec) { - let dir_id = match self.path_index.get(dir_path) { - Some(&id) => id, - None => return (0, Vec::new()), - }; - - let dir_node = match self.arena.get(dir_id) { - Some(node) => node, - None => return (0, Vec::new()), - }; - - let mut deleted_browsed_dirs = Vec::new(); - - // Collect children to remove - let mut children_to_remove: Vec<(PathBuf, super::ephemeral::EntryId)> = dir_node - .children - .iter() - .filter_map(|&child_id| { - let child_node = self.arena.get(child_id)?; - let child_path = self.reconstruct_path(child_id)?; - - // Preserve subdirectories that were explicitly browsed AND still exist - if child_node.is_directory() && indexed_paths.contains(&child_path) { - // Verify the directory still exists on the filesystem - if std::fs::metadata(&child_path).is_ok() { - return None; // Preserve - still exists and was browsed - } - // Directory was deleted - track for removal from indexed_paths - tracing::debug!( - "Removing deleted browsed directory: {}", - child_path.display() - ); - deleted_browsed_dirs.push(child_path.clone()); - } - - // Remove everything else (files, unbrowsed directories, deleted directories) - Some((child_path, child_id)) - }) - .collect(); - - let cleared = children_to_remove.len(); - - // Remove from indexes - for (child_path, _) in &children_to_remove { - self.path_index.remove(child_path); - self.entry_uuids.remove(child_path); - self.content_kinds.remove(child_path); - } - - // Update parent's children list - if let Some(dir_node) = self.arena.get_mut(dir_id) { - let removed_ids: std::collections::HashSet<_> = - children_to_remove.iter().map(|(_, id)| id).collect(); - - dir_node - .children - .retain(|child_id| !removed_ids.contains(child_id)); - } - - if cleared > 0 { - tracing::debug!( - "Cleared {} entries from {} (preserved browsed subdirs)", - cleared, - dir_path.display() - ); - } - - (cleared, deleted_browsed_dirs) - } - - fn reconstruct_path(&self, id: super::ephemeral::EntryId) -> Option { - let mut segments = Vec::new(); - let mut current = id; - - while let Some(node) = self.arena.get(current) { - segments.push(node.name().to_owned()); - if let Some(parent) = node.parent() { - current = parent; - } else { - break; - } - } - - if segments.is_empty() { - return None; - } - - let mut path = PathBuf::from("/"); - for segment in segments.into_iter().rev() { - path.push(segment); - } - Some(path) - } - - pub fn find_by_name(&self, name: &str) -> Vec { - self.registry - .get(name) - .map(|ids| { - ids.iter() - .filter_map(|&id| self.reconstruct_path(id)) - .collect() - }) - .unwrap_or_default() - } - - pub fn find_by_prefix(&self, prefix: &str) -> Vec { - self.registry - .find_prefix(prefix) - .iter() - .filter_map(|&id| self.reconstruct_path(id)) - .collect() - } - - pub fn age(&self) -> Duration { - self.created_at.elapsed() - } - - pub fn idle_time(&self) -> Duration { - self.last_accessed.elapsed() - } - - pub fn len(&self) -> usize { - self.arena.len() - } - - pub fn is_empty(&self) -> bool { - self.arena.is_empty() - } - - pub fn memory_usage(&self) -> usize { - self.arena.memory_usage() - + self.cache.memory_usage() - + self.registry.memory_usage() - + self.path_index.capacity() - * (std::mem::size_of::() - + std::mem::size_of::()) - + self.entry_uuids.capacity() - * (std::mem::size_of::() + std::mem::size_of::()) - } - - pub fn get_stats(&self) -> EphemeralIndexStats { - EphemeralIndexStats { - total_entries: 
self.arena.len(), - unique_names: self.registry.unique_names(), - interned_strings: self.cache.len(), - memory_bytes: self.memory_usage(), - } - } - - pub fn content_kinds_count(&self) -> usize { - self.content_kinds.len() - } - - pub fn path_index_count(&self) -> usize { - self.path_index.len() - } - - /// Check if an entry exists at the given path. - pub fn has_entry(&self, path: &Path) -> bool { - self.path_index.contains_key(path) - } - - /// Remove an entry at the given path. - /// - /// Returns true if the entry was removed, false if it didn't exist. - /// For directories, this only removes the directory entry itself, not its children. - /// Use `remove_directory_tree` to remove a directory and all its descendants. - pub fn remove_entry(&mut self, path: &Path) -> bool { - let existed = self.path_index.remove(path).is_some(); - self.entry_uuids.remove(path); - self.content_kinds.remove(path); - existed - } - - /// Remove a directory and all its descendants. - /// - /// Returns the number of entries removed. - pub fn remove_directory_tree(&mut self, path: &Path) -> usize { - let prefix = path.to_string_lossy().to_string(); - let keys_to_remove: Vec<_> = self - .path_index - .keys() - .filter(|k| { - let k_str = k.to_string_lossy(); - k_str == prefix || k_str.starts_with(&format!("{}/", prefix)) - }) - .cloned() - .collect(); - - let count = keys_to_remove.len(); - for key in keys_to_remove { - self.path_index.remove(&key); - self.entry_uuids.remove(&key); - self.content_kinds.remove(&key); - } - count - } - - /// Reconstructs paths for all entries and returns them as a HashMap. - /// - /// For large indexes, this can be expensive since it walks the tree to rebuild - /// every path. Prefer using `list_directory()` or `find_by_name()` for targeted - /// queries when possible. - pub fn entries(&self) -> HashMap { - use super::state::EntryKind; - - let mut result = HashMap::with_capacity(self.path_index.len()); - - for (path, &id) in &self.path_index { - if let Some(node) = self.arena.get(id) { - let metadata = EntryMetadata { - path: path.clone(), - kind: EntryKind::from(node.meta.file_type()), - size: node.meta.size(), - modified: node.meta.mtime_as_system_time(), - accessed: None, - created: node.meta.ctime_as_system_time(), - inode: None, - permissions: None, - is_hidden: path - .file_name() - .and_then(|n| n.to_str()) - .map(|n| n.starts_with('.')) - .unwrap_or(false), - }; - result.insert(path.clone(), metadata); - } - } - - result - } -} - -impl Default for EphemeralIndex { - fn default() -> Self { - Self::new().expect("Failed to create default EphemeralIndex") - } -} - -/// Statistics about an ephemeral index -#[derive(Debug, Clone)] -pub struct EphemeralIndexStats { - pub total_entries: usize, - pub unique_names: usize, - pub interned_strings: usize, - pub memory_bytes: usize, -} - /// Orchestrates multi-phase file indexing for both persistent and ephemeral modes. 
/// /// The job executes as a state machine progressing through Discovery, Processing, diff --git a/core/src/ops/indexing/persistence.rs b/core/src/ops/indexing/persistence.rs index 5ba7c106f..7becd8df1 100644 --- a/core/src/ops/indexing/persistence.rs +++ b/core/src/ops/indexing/persistence.rs @@ -26,7 +26,7 @@ use tokio::sync::RwLock; use uuid::Uuid; use super::{ - job::EphemeralIndex, + ephemeral::EphemeralIndex, state::{DirEntry, EntryKind}, PathResolver, }; From e5275c664973453687c645f63c4d327a10c5587b Mon Sep 17 00:00:00 2001 From: Jamie Pine Date: Mon, 8 Dec 2025 03:18:47 -0800 Subject: [PATCH 16/20] Rename EntryProcessor to DBWriter across indexing - Replace EntryProcessor with DBWriter across indexing - Introduce EphemeralWriter to unify ephemeral indexing logic - Update IndexPersistence to abstract over writers and adjust modules --- core/src/domain/file.rs | 2 +- core/src/infra/db/entities/entry.rs | 2 +- core/src/infra/db/entities/location.rs | 2 +- core/src/location/manager.rs | 2 +- .../ops/indexing/change_detection/detector.rs | 7 +- .../indexing/change_detection/ephemeral.rs | 244 ------- .../ops/indexing/change_detection/handler.rs | 4 +- core/src/ops/indexing/change_detection/mod.rs | 10 +- .../indexing/change_detection/persistent.rs | 232 ++++++- .../ops/indexing/{entry.rs => db_writer.rs} | 127 ++-- core/src/ops/indexing/ephemeral/cache.rs | 6 +- core/src/ops/indexing/ephemeral/index.rs | 2 +- core/src/ops/indexing/ephemeral/mod.rs | 2 + core/src/ops/indexing/ephemeral/responder.rs | 10 +- core/src/ops/indexing/ephemeral/writer.rs | 494 ++++++++++++++ core/src/ops/indexing/job.rs | 4 +- core/src/ops/indexing/mod.rs | 13 +- core/src/ops/indexing/persistence.rs | 643 ++---------------- core/src/ops/indexing/phases/content.rs | 4 +- core/src/ops/indexing/phases/discovery.rs | 2 +- core/src/ops/indexing/phases/processing.rs | 8 +- core/src/ops/indexing/processor.rs | 4 +- core/src/ops/indexing/responder.rs | 10 +- core/src/ops/indexing/verify/action.rs | 11 +- 24 files changed, 887 insertions(+), 958 deletions(-) delete mode 100644 core/src/ops/indexing/change_detection/ephemeral.rs rename core/src/ops/indexing/{entry.rs => db_writer.rs} (91%) create mode 100644 core/src/ops/indexing/ephemeral/writer.rs diff --git a/core/src/domain/file.rs b/core/src/domain/file.rs index 52cd76373..6b640acc8 100644 --- a/core/src/domain/file.rs +++ b/core/src/domain/file.rs @@ -425,7 +425,7 @@ impl File { /// This is used for ephemeral indexing where files are discovered but not persisted to the database.
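Only the import path changes here; call sites keep the same shape. A sketch of the pattern, mirroring how `EphemeralWriter::emit_resource_changed` uses it later in this patch (the "local" slug stands in for `get_current_device_slug()`):

use crate::domain::{addressing::SdPath, file::File};
use crate::ops::indexing::db_writer::EntryMetadata;
use std::path::PathBuf;
use uuid::Uuid;

// Build a File resource for an entry that exists only in the ephemeral index.
fn ephemeral_file(uuid: Uuid, metadata: &EntryMetadata, path: PathBuf) -> File {
    let sd_path = SdPath::Physical { device_slug: "local".to_string(), path };
    File::from_ephemeral(uuid, metadata, sd_path)
}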
pub fn from_ephemeral( id: Uuid, - metadata: &crate::ops::indexing::entry::EntryMetadata, + metadata: &crate::ops::indexing::db_writer::EntryMetadata, sd_path: SdPath, ) -> Self { let is_local = sd_path.is_local(); diff --git a/core/src/infra/db/entities/entry.rs b/core/src/infra/db/entities/entry.rs index f3417654c..9461a458e 100644 --- a/core/src/infra/db/entities/entry.rs +++ b/core/src/infra/db/entities/entry.rs @@ -336,7 +336,7 @@ impl crate::infra::sync::Syncable for Model { // Use delete_subtree_internal to cascade delete entire subtree // This avoids creating tombstones (we're applying a tombstone) - crate::ops::indexing::EntryProcessor::delete_subtree(entry.id, db).await?; + crate::ops::indexing::DBWriter::delete_subtree(entry.id, db).await?; Ok(()) } diff --git a/core/src/infra/db/entities/location.rs b/core/src/infra/db/entities/location.rs index cb1c146ee..dea005c93 100644 --- a/core/src/infra/db/entities/location.rs +++ b/core/src/infra/db/entities/location.rs @@ -330,7 +330,7 @@ impl Syncable for Model { // Delete root entry tree first if it exists // Use delete_subtree_internal to avoid creating tombstones (we're applying a tombstone) if let Some(entry_id) = location.entry_id { - crate::ops::indexing::EntryProcessor::delete_subtree(entry_id, db).await?; + crate::ops::indexing::DBWriter::delete_subtree(entry_id, db).await?; } // Delete location record diff --git a/core/src/location/manager.rs b/core/src/location/manager.rs index 325e6f435..bcfcb82b7 100644 --- a/core/src/location/manager.rs +++ b/core/src/location/manager.rs @@ -501,7 +501,7 @@ impl LocationManager { // Delete the root entry tree first if it exists // Use delete_subtree_internal to avoid creating entry tombstones (we'll tombstone the location instead) if let Some(entry_id) = location.entry_id { - crate::ops::indexing::EntryProcessor::delete_subtree(entry_id, library.db().conn()) + crate::ops::indexing::DBWriter::delete_subtree(entry_id, library.db().conn()) .await .map_err(|e| LocationError::Other(format!("Failed to delete entry tree: {}", e)))?; } diff --git a/core/src/ops/indexing/change_detection/detector.rs b/core/src/ops/indexing/change_detection/detector.rs index 5ef8090bb..aa7edb376 100644 --- a/core/src/ops/indexing/change_detection/detector.rs +++ b/core/src/ops/indexing/change_detection/detector.rs @@ -65,7 +65,8 @@ impl ChangeDetector { ) -> Result<(), crate::infra::job::prelude::JobError> { use crate::infra::db::entities; use crate::infra::job::prelude::JobError; - use crate::ops::indexing::persistence::{DatabasePersistence, IndexPersistence}; + use crate::ops::indexing::change_detection::PersistentWriterAdapter; + use crate::ops::indexing::persistence::IndexPersistence; use sea_orm::{ColumnTrait, EntityTrait, QueryFilter}; let location_record = entities::location::Entity::find_by_id(location_id) @@ -74,8 +75,8 @@ impl ChangeDetector { .map_err(|e| JobError::execution(format!("Failed to find location: {}", e)))? 
.ok_or_else(|| JobError::execution("Location not found".to_string()))?; - // Create a database persistence instance to leverage the scoped query logic - let persistence = DatabasePersistence::new(ctx, 0, location_record.entry_id); + // Create a persistent writer adapter to leverage the unified query logic + let persistence = PersistentWriterAdapter::new(ctx, location_record.uuid, location_record.entry_id); // Use the scoped query method let existing_entries = persistence.get_existing_entries(indexing_path).await?; diff --git a/core/src/ops/indexing/change_detection/ephemeral.rs b/core/src/ops/indexing/change_detection/ephemeral.rs deleted file mode 100644 index 7e1e9c12e..000000000 --- a/core/src/ops/indexing/change_detection/ephemeral.rs +++ /dev/null @@ -1,244 +0,0 @@ -//! Ephemeral (memory-backed) change handler for browsing unmanaged paths. -//! -//! Updates the EphemeralIndex directly without database writes. -//! Skips the processor pipeline (no thumbnails/content hash for ephemeral). - -use super::handler::{build_dir_entry, ChangeHandler}; -use super::types::{ChangeType, EntryRef}; -use crate::infra::event::EventBus; -use crate::ops::indexing::entry::EntryMetadata; -use crate::ops::indexing::ephemeral::EphemeralIndex; -use crate::ops::indexing::state::{DirEntry, EntryKind}; -use anyhow::Result; -use std::path::{Path, PathBuf}; -use std::sync::atomic::AtomicI32; -use std::sync::Arc; -use tokio::sync::RwLock; -use uuid::Uuid; - -/// Memory-backed change handler for ephemeral browsing. -pub struct EphemeralChangeHandler { - index: Arc>, - event_bus: Arc, - root_path: PathBuf, - next_id: AtomicI32, -} - -impl EphemeralChangeHandler { - pub fn new( - index: Arc>, - event_bus: Arc, - root_path: PathBuf, - ) -> Self { - Self { - index, - event_bus, - root_path, - next_id: AtomicI32::new(1), - } - } - - fn next_id(&self) -> i32 { - self.next_id - .fetch_add(1, std::sync::atomic::Ordering::SeqCst) - } -} - -#[async_trait::async_trait] -impl ChangeHandler for EphemeralChangeHandler { - async fn find_by_path(&self, path: &Path) -> Result> { - let index = self.index.read().await; - - if let Some(metadata) = index.get_entry_ref(&path.to_path_buf()) { - let uuid = index.get_entry_uuid(&path.to_path_buf()); - - Ok(Some(EntryRef { - id: 0, - uuid, - path: path.to_path_buf(), - kind: metadata.kind, - })) - } else { - Ok(None) - } - } - - async fn find_by_inode(&self, _inode: u64) -> Result> { - Ok(None) - } - - async fn create(&mut self, metadata: &DirEntry, _parent_path: &Path) -> Result { - let entry_uuid = Uuid::new_v4(); - let entry_metadata = EntryMetadata::from(metadata.clone()); - - { - let mut index = self.index.write().await; - index - .add_entry(metadata.path.clone(), entry_uuid, entry_metadata) - .map_err(|e| anyhow::anyhow!("Failed to add entry to ephemeral index: {}", e))?; - } - - Ok(EntryRef { - id: self.next_id(), - uuid: Some(entry_uuid), - path: metadata.path.clone(), - kind: metadata.kind, - }) - } - - async fn update(&mut self, entry: &EntryRef, metadata: &DirEntry) -> Result<()> { - let uuid = entry.uuid.unwrap_or_else(Uuid::new_v4); - let entry_metadata = EntryMetadata::from(metadata.clone()); - - { - let mut index = self.index.write().await; - let _ = index.add_entry(metadata.path.clone(), uuid, entry_metadata); - } - - Ok(()) - } - - async fn move_entry( - &mut self, - entry: &EntryRef, - old_path: &Path, - new_path: &Path, - _new_parent_path: &Path, - ) -> Result<()> { - let metadata = build_dir_entry(new_path, None).await?; - - { - let mut index = self.index.write().await; 
- index.remove_entry(old_path); - - let uuid = entry.uuid.unwrap_or_else(Uuid::new_v4); - let entry_metadata = EntryMetadata::from(metadata.clone()); - let _ = index.add_entry(new_path.to_path_buf(), uuid, entry_metadata); - } - - Ok(()) - } - - async fn delete(&mut self, entry: &EntryRef) -> Result<()> { - { - let mut index = self.index.write().await; - - if entry.is_directory() { - index.remove_directory_tree(&entry.path); - } else { - index.remove_entry(&entry.path); - } - } - - Ok(()) - } - - async fn run_processors(&self, _entry: &EntryRef, _is_new: bool) -> Result<()> { - Ok(()) - } - - async fn emit_change_event(&self, entry: &EntryRef, _change_type: ChangeType) -> Result<()> { - use crate::device::get_current_device_slug; - use crate::domain::addressing::SdPath; - use crate::domain::file::File; - use crate::infra::event::{Event, ResourceMetadata}; - - let Some(uuid) = entry.uuid else { - return Ok(()); - }; - - let device_slug = get_current_device_slug(); - - let sd_path = SdPath::Physical { - device_slug: device_slug.clone(), - path: entry.path.clone(), - }; - - let content_kind = { - let index = self.index.read().await; - index.get_content_kind(&entry.path) - }; - - let metadata = build_dir_entry(&entry.path, None).await.ok(); - - if let Some(meta) = metadata { - let entry_metadata = EntryMetadata::from(meta); - let mut file = File::from_ephemeral(uuid, &entry_metadata, sd_path); - file.content_kind = content_kind; - - let parent_path = entry.path.parent().map(|p| SdPath::Physical { - device_slug: file.sd_path.device_slug().unwrap_or("local").to_string(), - path: p.to_path_buf(), - }); - - let affected_paths = parent_path.into_iter().collect(); - - if let Ok(resource_json) = serde_json::to_value(&file) { - self.event_bus.emit(Event::ResourceChanged { - resource_type: "file".to_string(), - resource: resource_json, - metadata: Some(ResourceMetadata { - no_merge_fields: vec!["sd_path".to_string()], - alternate_ids: vec![], - affected_paths, - }), - }); - } - } - - Ok(()) - } - - async fn handle_new_directory(&self, path: &Path) -> Result<()> { - use crate::ops::indexing::entry::EntryProcessor; - - let mut entries = match tokio::fs::read_dir(path).await { - Ok(e) => e, - Err(e) => { - tracing::warn!( - "Failed to read directory {} for ephemeral indexing: {}", - path.display(), - e - ); - return Ok(()); - } - }; - - let mut index = self.index.write().await; - - while let Ok(Some(entry)) = entries.next_entry().await { - let entry_path = entry.path(); - - if let Ok(metadata) = entry.metadata().await { - let kind = if metadata.is_dir() { - EntryKind::Directory - } else if metadata.is_symlink() { - EntryKind::Symlink - } else { - EntryKind::File - }; - - let entry_metadata = EntryMetadata { - path: entry_path.clone(), - kind, - size: metadata.len(), - modified: metadata.modified().ok(), - accessed: metadata.accessed().ok(), - created: metadata.created().ok(), - inode: EntryProcessor::get_inode(&metadata), - permissions: None, - is_hidden: entry_path - .file_name() - .and_then(|n| n.to_str()) - .map(|n| n.starts_with('.')) - .unwrap_or(false), - }; - - let uuid = Uuid::new_v4(); - let _ = index.add_entry(entry_path, uuid, entry_metadata); - } - } - - Ok(()) - } -} diff --git a/core/src/ops/indexing/change_detection/handler.rs b/core/src/ops/indexing/change_detection/handler.rs index 03dca2c41..ab6dc614f 100644 --- a/core/src/ops/indexing/change_detection/handler.rs +++ b/core/src/ops/indexing/change_detection/handler.rs @@ -179,9 +179,9 @@ pub async fn build_dir_entry( path: &Path, 
backend: Option<&Arc>, ) -> Result { - use crate::ops::indexing::entry::EntryProcessor; + use crate::ops::indexing::db_writer::DBWriter; - let meta = EntryProcessor::extract_metadata(path, backend).await?; + let meta = DBWriter::extract_metadata(path, backend).await?; Ok(DirEntry { path: meta.path, kind: meta.kind, diff --git a/core/src/ops/indexing/change_detection/mod.rs b/core/src/ops/indexing/change_detection/mod.rs index 9ff9a76d2..caaad6874 100644 --- a/core/src/ops/indexing/change_detection/mod.rs +++ b/core/src/ops/indexing/change_detection/mod.rs @@ -6,22 +6,20 @@ //! move detection, so a file moved while the indexer is running behaves //! identically to one moved while the watcher is active. //! -//! Changes route to either `PersistentChangeHandler` (database writes for -//! managed locations) or `EphemeralChangeHandler` (in-memory updates for -//! browsing sessions). This split keeps browsed directories responsive without +//! Changes route to either `PersistentWriter` (database writes for +//! managed locations) or `EphemeralWriter` (in-memory updates for browsing +//! sessions). This split keeps browsed directories responsive without //! polluting the database with temporary entries. pub mod detector; -pub mod ephemeral; pub mod handler; pub mod persistent; pub mod types; pub use detector::ChangeDetector; -pub use ephemeral::EphemeralChangeHandler; pub use handler::{ apply_batch, build_dir_entry, handle_create, handle_modify, handle_remove, handle_rename, path_exists_safe, should_filter_path, ChangeHandler, }; -pub use persistent::PersistentChangeHandler; +pub use persistent::{PersistentWriter, PersistentWriterAdapter}; pub use types::{Change, ChangeConfig, ChangeMetadata, ChangeType, EntryRef}; diff --git a/core/src/ops/indexing/change_detection/persistent.rs b/core/src/ops/indexing/change_detection/persistent.rs index 0378f8f78..edcdc7f65 100644 --- a/core/src/ops/indexing/change_detection/persistent.rs +++ b/core/src/ops/indexing/change_detection/persistent.rs @@ -1,13 +1,16 @@ -//! Persistent (database-backed) change handler for managed locations. +//! Unified persistent (database-backed) writer for both watcher and indexer pipelines. //! -//! Uses EntryProcessor for CRUD operations and maintains closure table -//! relationships. Runs the processor pipeline (thumbnails, content hash) -//! for new and modified files. +//! This module provides `PersistentWriter`, which implements both `ChangeHandler` +//! (for the watcher pipeline) and `IndexPersistence` (for the indexer job). +//! Both pipelines share the same database write logic through `DBWriter`, +//! eliminating code duplication. use super::handler::ChangeHandler; use super::types::{ChangeType, EntryRef}; use crate::context::CoreContext; use crate::infra::db::entities; +use crate::infra::job::prelude::{JobContext, JobError, JobResult}; +use crate::ops::indexing::persistence::IndexPersistence; use crate::ops::indexing::state::{DirEntry, EntryKind}; use anyhow::Result; use sea_orm::{ColumnTrait, EntityTrait, QueryFilter, TransactionTrait}; @@ -16,8 +19,15 @@ use std::path::{Path, PathBuf}; use std::sync::Arc; use uuid::Uuid; -/// Database-backed change handler for managed locations. -pub struct PersistentChangeHandler { +/// Unified writer for persistent (database-backed) index storage. +/// +/// Implements both `ChangeHandler` (for the watcher pipeline) and `IndexPersistence` +/// (for the indexer job pipeline). 
Both pipelines share: +/// - The same `DBWriter` for CRUD operations +/// - Closure table management +/// - Directory path tracking +/// - Entry ID caching for hierarchy construction +pub struct PersistentWriter { context: Arc, library_id: Uuid, location_id: Uuid, @@ -27,7 +37,7 @@ pub struct PersistentChangeHandler { entry_id_cache: HashMap, } -impl PersistentChangeHandler { +impl PersistentWriter { pub async fn new( context: Arc, library_id: Uuid, @@ -134,7 +144,7 @@ impl PersistentChangeHandler { } #[async_trait::async_trait] -impl ChangeHandler for PersistentChangeHandler { +impl ChangeHandler for PersistentWriter { async fn find_by_path(&self, path: &Path) -> Result> { let entry_id = match self.resolve_entry_id(path).await? { Some(id) => id, @@ -195,16 +205,20 @@ impl ChangeHandler for PersistentChangeHandler { async fn create(&mut self, metadata: &DirEntry, parent_path: &Path) -> Result { use crate::domain::addressing::SdPath; - use crate::ops::indexing::entry::EntryProcessor; + use crate::ops::indexing::db_writer::DBWriter; use crate::ops::indexing::state::IndexerState; let mut state = IndexerState::new(&SdPath::local(&metadata.path)); + let ctx = + crate::ops::indexing::ctx::ResponderCtx::new(&self.context, self.library_id).await?; + // Cache Management: Check cache first, then query DB if needed if let Some(&parent_id) = self.entry_id_cache.get(parent_path) { state .entry_id_cache .insert(parent_path.to_path_buf(), parent_id); - } else if let Some(parent_id) = self.resolve_directory_entry_id(parent_path).await? { + } else if let Ok(Some(parent_id)) = DBWriter::resolve_parent_id(&ctx, parent_path).await { + // Cache the parent ID for future lookups state .entry_id_cache .insert(parent_path.to_path_buf(), parent_id); @@ -212,10 +226,7 @@ impl ChangeHandler for PersistentChangeHandler { .insert(parent_path.to_path_buf(), parent_id); } - let ctx = - crate::ops::indexing::ctx::ResponderCtx::new(&self.context, self.library_id).await?; - - let entry_id = EntryProcessor::create_entry(&mut state, &ctx, metadata, 0, parent_path) + let entry_id = DBWriter::create_entry(&mut state, &ctx, metadata, 0, parent_path) .await .map_err(|e| anyhow::anyhow!("Failed to create entry: {}", e))?; @@ -235,11 +246,11 @@ impl ChangeHandler for PersistentChangeHandler { } async fn update(&mut self, entry: &EntryRef, metadata: &DirEntry) -> Result<()> { - use crate::ops::indexing::entry::EntryProcessor; + use crate::ops::indexing::db_writer::DBWriter; let ctx = crate::ops::indexing::ctx::ResponderCtx::new(&self.context, self.library_id).await?; - EntryProcessor::update_entry(&ctx, entry.id, metadata) + DBWriter::update_entry(&ctx, entry.id, metadata) .await .map_err(|e| anyhow::anyhow!("Failed to update entry: {}", e))?; @@ -254,26 +265,27 @@ impl ChangeHandler for PersistentChangeHandler { new_parent_path: &Path, ) -> Result<()> { use crate::domain::addressing::SdPath; - use crate::ops::indexing::entry::EntryProcessor; + use crate::ops::indexing::db_writer::DBWriter; use crate::ops::indexing::state::IndexerState; let mut state = IndexerState::new(&SdPath::local(old_path)); + let ctx = + crate::ops::indexing::ctx::ResponderCtx::new(&self.context, self.library_id).await?; + // Cache Management: Check cache first, then query DB if needed if let Some(&parent_id) = self.entry_id_cache.get(new_parent_path) { state .entry_id_cache .insert(new_parent_path.to_path_buf(), parent_id); - } else if let Some(parent_id) = self.resolve_directory_entry_id(new_parent_path).await? 
{ + } else if let Ok(Some(parent_id)) = DBWriter::resolve_parent_id(&ctx, new_parent_path).await + { state .entry_id_cache .insert(new_parent_path.to_path_buf(), parent_id); self.entry_id_cache .insert(new_parent_path.to_path_buf(), parent_id); } - - let ctx = - crate::ops::indexing::ctx::ResponderCtx::new(&self.context, self.library_id).await?; - EntryProcessor::move_entry( + DBWriter::move_entry( &mut state, &ctx, entry.id, @@ -399,7 +411,6 @@ impl ChangeHandler for PersistentChangeHandler { let ctx = crate::ops::indexing::ctx::ResponderCtx::new(&self.context, self.library_id).await?; - // Helper to build ProcessorEntry (re-queries to get latest content_id after hash) let build_proc_entry = |db: &sea_orm::DatabaseConnection, entry: &EntryRef| -> std::pin::Pin< @@ -636,3 +647,178 @@ impl ChangeHandler for PersistentChangeHandler { Ok(()) } } + +// ============================================================================ +// IndexPersistence Implementation (Job Pipeline) +// ============================================================================ + +/// Adapter for using PersistentWriter in the job pipeline. +/// +/// The job system expects an `IndexPersistence` trait, but works with `JobContext` +/// instead of `CoreContext`. This adapter wraps `PersistentWriter` and delegates +/// storage operations to `DBWriter`, ensuring both pipelines use identical logic. +pub struct PersistentWriterAdapter<'a> { + ctx: &'a JobContext<'a>, + library_id: Uuid, + location_root_entry_id: Option, +} + +impl<'a> PersistentWriterAdapter<'a> { + pub fn new( + ctx: &'a JobContext<'a>, + library_id: Uuid, + location_root_entry_id: Option, + ) -> Self { + Self { + ctx, + library_id, + location_root_entry_id, + } + } +} + +#[async_trait::async_trait] +impl<'a> IndexPersistence for PersistentWriterAdapter<'a> { + async fn store_entry( + &self, + entry: &DirEntry, + _location_id: Option, + location_root_path: &Path, + ) -> JobResult { + use crate::domain::addressing::SdPath; + use crate::ops::indexing::db_writer::DBWriter; + use crate::ops::indexing::state::IndexerState; + + let mut state = IndexerState::new(&SdPath::local(&entry.path)); + + // Cache Management: Resolve parent ID if needed (for job pipeline) + // The job processes entries in hierarchy order, but we still need to ensure + // the parent ID is cached before creating this entry + if let Some(parent_path) = entry.path.parent() { + if !state.entry_id_cache.contains_key(parent_path) { + if let Ok(Some(parent_id)) = DBWriter::resolve_parent_id(self.ctx, parent_path).await + { + state + .entry_id_cache + .insert(parent_path.to_path_buf(), parent_id); + } + } + } + + let entry_id = + DBWriter::create_entry(&mut state, self.ctx, entry, 0, location_root_path) + .await?; + + Ok(entry_id) + } + + async fn store_content_identity( + &self, + entry_id: i32, + path: &Path, + cas_id: String, + ) -> JobResult<()> { + use crate::ops::indexing::db_writer::DBWriter; + + DBWriter::link_to_content_identity(self.ctx, entry_id, path, cas_id, self.library_id) + .await + .map(|_| ()) + } + + async fn get_existing_entries( + &self, + indexing_path: &Path, + ) -> JobResult< + HashMap, Option, u64)>, + > { + use crate::infra::db::entities::{directory_paths, entry_closure}; + use sea_orm::{ColumnTrait, EntityTrait, QueryFilter}; + + let location_root_entry_id = match self.location_root_entry_id { + Some(id) => id, + None => return Ok(HashMap::new()), + }; + + let indexing_path_str = indexing_path.to_string_lossy().to_string(); + let indexing_path_entry_id = if let 
Ok(Some(dir_record)) = directory_paths::Entity::find() + .filter(directory_paths::Column::Path.eq(&indexing_path_str)) + .one(self.ctx.library_db()) + .await + { + dir_record.entry_id + } else { + location_root_entry_id + }; + + let descendant_ids = entry_closure::Entity::find() + .filter(entry_closure::Column::AncestorId.eq(indexing_path_entry_id)) + .all(self.ctx.library_db()) + .await + .map_err(|e| JobError::execution(format!("Failed to query closure table: {}", e)))? + .into_iter() + .map(|ec| ec.descendant_id) + .collect::>(); + + let mut all_entry_ids = vec![indexing_path_entry_id]; + all_entry_ids.extend(descendant_ids); + + let mut existing_entries: Vec = Vec::new(); + let chunk_size: usize = 900; + for chunk in all_entry_ids.chunks(chunk_size) { + let mut batch = entities::entry::Entity::find() + .filter(entities::entry::Column::Id.is_in(chunk.to_vec())) + .all(self.ctx.library_db()) + .await + .map_err(|e| { + JobError::execution(format!("Failed to query existing entries: {}", e)) + })?; + existing_entries.append(&mut batch); + } + + let mut result = HashMap::new(); + + self.ctx.log(format!( + "Loading {} existing entries", + existing_entries.len() + )); + + for entry in existing_entries { + let full_path = + crate::ops::indexing::PathResolver::get_full_path(self.ctx.library_db(), entry.id) + .await + .unwrap_or_else(|_| PathBuf::from(&entry.name)); + + let modified_time = + entry + .modified_at + .timestamp() + .try_into() + .ok() + .and_then(|secs: u64| { + std::time::UNIX_EPOCH.checked_add(std::time::Duration::from_secs(secs)) + }); + + result.insert( + full_path, + ( + entry.id, + entry.inode.map(|i| i as u64), + modified_time, + entry.size as u64, + ), + ); + } + + Ok(result) + } + + async fn update_entry(&self, entry_id: i32, entry: &DirEntry) -> JobResult<()> { + use crate::ops::indexing::db_writer::DBWriter; + + DBWriter::update_entry(self.ctx, entry_id, entry).await + } + + fn is_persistent(&self) -> bool { + true + } +} diff --git a/core/src/ops/indexing/entry.rs b/core/src/ops/indexing/db_writer.rs similarity index 91% rename from core/src/ops/indexing/entry.rs rename to core/src/ops/indexing/db_writer.rs index b7726733d..0b8bdf920 100644 --- a/core/src/ops/indexing/entry.rs +++ b/core/src/ops/indexing/db_writer.rs @@ -1,8 +1,8 @@ -//! # Entry Processing and Persistence +//! # Core Database Writer for Indexing //! -//! `core::ops::indexing::entry` handles the translation of discovered filesystem -//! entries into database records, managing the full lifecycle from metadata extraction -//! to content identification and move operations. +//! `core::ops::indexing::db_writer` provides the foundational database operations layer +//! for the indexing system. All database writes (creates, updates, moves, deletes) flow +//! through this module, ensuring consistency across both watcher and job pipelines. //! //! ## Key Design Decisions //! @@ -24,10 +24,10 @@ //! //! ## Example //! ```rust,no_run -//! use spacedrive_core::ops::indexing::{EntryProcessor, state::DirEntry}; +//! use spacedrive_core::ops::indexing::{DBWriter, state::DirEntry}; //! //! let entry = DirEntry { /* ... */ }; -//! let entry_id = EntryProcessor::create_entry( +//! let entry_id = DBWriter::create_entry( //! &mut state, //! &ctx, //! &entry, @@ -111,13 +111,14 @@ impl From for EntryMetadata { } } -/// Entry persistence operations for the indexing system. +/// Core database operations for the indexing system. 
/// -/// EntryProcessor provides methods for creating, updating, and moving database entries, -/// handling the complexity of closure table updates and directory path cascades. All -/// methods come in both standalone (creates own transaction) and `_in_conn` variants -/// (uses existing transaction) for flexible batch operations. -pub struct EntryProcessor; +/// DBWriter provides the foundational layer for all database writes during indexing. +/// Both the watcher pipeline (`PersistentWriter`) and job pipeline (`PersistentWriterAdapter`) +/// delegate to these methods, ensuring consistent database operations. All methods come in +/// both standalone (creates own transaction) and `_in_conn` variants (uses existing transaction) +/// for flexible batch operations. +pub struct DBWriter; /// Result of linking an entry to its content identity. /// @@ -131,7 +132,7 @@ pub struct ContentLinkResult { pub is_new_content: bool, } -impl EntryProcessor { +impl DBWriter { /// Get platform-specific inode #[cfg(unix)] pub fn get_inode(metadata: &std::fs::Metadata) -> Option { @@ -153,6 +154,40 @@ impl EntryProcessor { None } + /// Resolves a parent directory path to its entry ID via pure database lookup. + /// + /// This is the foundational database query operation. Callers (writers) should + /// check their cache first, then call this method if the ID isn't cached. + /// + /// For cloud paths (containing "://"), tries both with and without trailing slashes + /// since cloud backends may store paths inconsistently. + pub async fn resolve_parent_id( + ctx: &impl IndexingCtx, + parent_path: &Path, + ) -> Result, JobError> { + let parent_path_str = parent_path.to_string_lossy().to_string(); + let is_cloud = parent_path_str.contains("://"); + + let parent_variants = if is_cloud && !parent_path_str.ends_with('/') { + vec![parent_path_str.clone(), format!("{}/", parent_path_str)] + } else { + vec![parent_path_str.clone()] + }; + + let query = entities::directory_paths::Entity::find() + .filter(entities::directory_paths::Column::Path.is_in(parent_variants)); + + match query.one(ctx.library_db()).await { + Ok(Some(dir_path_record)) => Ok(Some(dir_path_record.entry_id)), + Ok(None) => Ok(None), + Err(e) => Err(JobError::execution(format!( + "Failed to resolve parent ID for {}: {}", + parent_path.display(), + e + ))), + } + } + /// Extracts filesystem metadata through either a volume backend or direct I/O. /// /// Volume backends abstract cloud storage (S3, Dropbox) and local filesystems @@ -282,11 +317,8 @@ impl EntryProcessor { }) .unwrap_or_else(|| chrono::Utc::now()); - // UUID assignment strategy: preserve ephemeral UUIDs from prior browsing sessions - // so user metadata (tags, notes) attached during ephemeral mode survives the - // transition to persistent indexing. Without preservation, adding a browsed folder - // as a managed location would orphan all existing tags and make Quick Look previews - // flash as UUIDs change. The ephemeral cache is populated during state initialization. + // UUID assignment: preserve ephemeral UUIDs from prior browsing sessions + // so user metadata (tags, notes) survives the transition to persistent indexing. 
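In caller code the cache-first contract of `resolve_parent_id` reads as below; the return type is reconstructed as `Result<Option<i32>, JobError>` from the body above, and the module path for `IndexingCtx` is assumed (the helper is hypothetical; `PersistentWriter` and the job adapter both follow this shape):

use crate::infra::job::prelude::JobError;
use crate::ops::indexing::db_writer::DBWriter;
use std::collections::HashMap;
use std::path::{Path, PathBuf};

async fn cached_parent_id(
    ctx: &impl crate::ops::indexing::ctx::IndexingCtx,
    cache: &mut HashMap<PathBuf, i32>,
    parent_path: &Path,
) -> Result<Option<i32>, JobError> {
    if let Some(&id) = cache.get(parent_path) {
        return Ok(Some(id)); // writer-owned cache hit: no DB round trip
    }
    // Miss: fall through to the pure lookup, then populate the cache so
    // siblings under the same parent resolve without touching the database.
    let resolved = DBWriter::resolve_parent_id(ctx, parent_path).await?;
    if let Some(id) = resolved {
        cache.insert(parent_path.to_path_buf(), id);
    }
    Ok(resolved)
}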
let entry_uuid = if let Some(ephemeral_uuid) = state.get_ephemeral_uuid(&entry.path) { tracing::debug!( "Preserving ephemeral UUID {} for {}", @@ -298,57 +330,12 @@ impl EntryProcessor { Some(Uuid::new_v4()) }; - let parent_id = if let Some(parent_path) = entry.path.parent() { - ctx.log(format!( - "Looking up parent for {}: parent_path = {}", - entry.path.display(), - parent_path.display() - )); - - if let Some(id) = state.entry_id_cache.get(parent_path).copied() { - ctx.log(format!("Found parent in cache: id = {}", id)); - Some(id) - } else { - // For cloud paths, try both with and without trailing slash since cloud backends - // may store paths inconsistently depending on API responses. - let parent_path_str = parent_path.to_string_lossy().to_string(); - let is_cloud = parent_path_str.contains("://"); - - let parent_variants = if is_cloud && !parent_path_str.ends_with('/') { - vec![parent_path_str.clone(), format!("{}/", parent_path_str)] - } else { - vec![parent_path_str.clone()] - }; - - let query = entities::directory_paths::Entity::find() - .filter(entities::directory_paths::Column::Path.is_in(parent_variants.clone())); - - if let Ok(Some(dir_path_record)) = query.one(ctx.library_db()).await { - // Found parent in database, cache it - ctx.log(format!( - "Found parent in database: id = {}", - dir_path_record.entry_id - )); - state - .entry_id_cache - .insert(parent_path.to_path_buf(), dir_path_record.entry_id); - Some(dir_path_record.entry_id) - } else { - // Parent not found indicates entries arrived out of order, possibly from - // concurrent file watchers or interrupted batch processing. The entry will - // be orphaned (parent_id = NULL) until the next full reindex repairs the hierarchy. - ctx.log(format!( - "WARNING: Parent not found for {}: {} (tried: {:?})", - entry.path.display(), - parent_path.display(), - parent_variants - )); - None - } - } - } else { - None - }; + // Parent ID should already be resolved and cached by the caller (writer layer). + // This keeps DBWriter focused on pure database operations without cache management. + let parent_id = entry + .path + .parent() + .and_then(|parent_path| state.entry_id_cache.get(parent_path).copied()); let now = chrono::Utc::now(); tracing::debug!( @@ -1084,7 +1071,7 @@ impl EntryProcessor { /// - Database cleanup operations /// /// For watcher-triggered deletions that need sync/events, use - /// `PersistentChangeHandler::delete()` instead. + /// `PersistentWriter::delete()` instead. pub async fn delete_subtree( entry_id: i32, db: &sea_orm::DatabaseConnection, diff --git a/core/src/ops/indexing/ephemeral/cache.rs b/core/src/ops/indexing/ephemeral/cache.rs index d47f0c035..77a7aaae9 100644 --- a/core/src/ops/indexing/ephemeral/cache.rs +++ b/core/src/ops/indexing/ephemeral/cache.rs @@ -4,9 +4,9 @@ //! directories share one arena and string pool, keeping memory at ~50 bytes per //! entry regardless of how many paths the user navigates. The cache tracks which //! paths are indexed (queryable), in-progress (being scanned), or watched -//! (receiving live filesystem updates via `EphemeralChangeHandler`). +//! (receiving live filesystem updates via `EphemeralWriter`). -use crate::ops::indexing::EphemeralIndex; +use super::EphemeralIndex; use parking_lot::RwLock; use std::{ collections::HashSet, @@ -160,7 +160,7 @@ impl EphemeralIndexCache { /// Register a path for filesystem watching. /// /// When registered, the watcher service will monitor this path for changes - /// and update the ephemeral index via `EphemeralChangeHandler`. 
The path + /// and update the ephemeral index via `EphemeralWriter`. The path /// must already be indexed. pub fn register_for_watching(&self, path: PathBuf) -> bool { let indexed = self.indexed_paths.read(); diff --git a/core/src/ops/indexing/ephemeral/index.rs b/core/src/ops/indexing/ephemeral/index.rs index 873d83e9d..40dc68c21 100644 --- a/core/src/ops/indexing/ephemeral/index.rs +++ b/core/src/ops/indexing/ephemeral/index.rs @@ -17,7 +17,7 @@ use crate::domain::ContentKind; use crate::filetype::FileTypeRegistry; -use crate::ops::indexing::entry::EntryMetadata; +use crate::ops::indexing::db_writer::EntryMetadata; use crate::ops::indexing::state::{EntryKind, IndexerStats}; use super::types::{FileNode, FileType, MaybeEntryId, NameRef, NodeState, PackedMetadata}; diff --git a/core/src/ops/indexing/ephemeral/mod.rs b/core/src/ops/indexing/ephemeral/mod.rs index 0ca0ee082..da8c7576a 100644 --- a/core/src/ops/indexing/ephemeral/mod.rs +++ b/core/src/ops/indexing/ephemeral/mod.rs @@ -44,6 +44,7 @@ pub mod name; pub mod registry; pub mod responder; pub mod types; +pub mod writer; // Re-export public types pub use arena::NodeArena; @@ -52,3 +53,4 @@ pub use index::{EphemeralIndex, EphemeralIndexStats}; pub use name::NameCache; pub use registry::NameRegistry; pub use types::{EntryId, FileNode, FileType, MaybeEntryId, NameRef, NodeState, PackedMetadata}; +pub use writer::EphemeralWriter; diff --git a/core/src/ops/indexing/ephemeral/responder.rs b/core/src/ops/indexing/ephemeral/responder.rs index 913b105a3..33f1104a4 100644 --- a/core/src/ops/indexing/ephemeral/responder.rs +++ b/core/src/ops/indexing/ephemeral/responder.rs @@ -17,12 +17,14 @@ use crate::context::CoreContext; use crate::infra::event::FsRawEventKind; -use crate::ops::indexing::change_detection::{self, ChangeConfig, EphemeralChangeHandler}; +use crate::ops::indexing::change_detection::{self, ChangeConfig}; use crate::ops::indexing::rules::RuleToggles; use anyhow::Result; use std::path::{Path, PathBuf}; use std::sync::Arc; +use super::EphemeralWriter; + /// Check if a path falls under an ephemeral watched directory. /// /// Returns the watched root path if found. @@ -52,7 +54,7 @@ pub fn find_ephemeral_root_for_events( /// Process a batch of filesystem events against the ephemeral index. /// -/// Creates an `EphemeralChangeHandler` and processes the events using shared +/// Creates an `EphemeralWriter` and processes the events using shared /// handler logic. The ephemeral index is updated in-place and ResourceChanged /// events are emitted for UI updates. pub async fn apply_batch( @@ -68,7 +70,7 @@ pub async fn apply_batch( let index = context.ephemeral_cache().get_global_index(); let event_bus = context.events.clone(); - let mut handler = EphemeralChangeHandler::new(index, event_bus, root_path.to_path_buf()); + let mut writer = EphemeralWriter::new(index, event_bus, root_path.to_path_buf()); let config = ChangeConfig { rule_toggles, @@ -76,7 +78,7 @@ pub async fn apply_batch( volume_backend: None, // Ephemeral paths typically don't use volume backends }; - change_detection::apply_batch(&mut handler, events, &config).await + change_detection::apply_batch(&mut writer, events, &config).await } /// Process a single filesystem event against the ephemeral index. diff --git a/core/src/ops/indexing/ephemeral/writer.rs b/core/src/ops/indexing/ephemeral/writer.rs new file mode 100644 index 000000000..e3ff10d98 --- /dev/null +++ b/core/src/ops/indexing/ephemeral/writer.rs @@ -0,0 +1,494 @@ +//! 
Unified ephemeral writer for both watcher and indexer pipelines. +//! +//! This module consolidates `EphemeralChangeHandler` and `EphemeralPersistence` +//! into a single implementation that serves both the file watcher and indexer job. +//! Both pipelines share the same entry storage logic, UUID generation, and event +//! emission, eliminating code duplication. +//! +use crate::infra::event::EventBus; +use crate::infra::job::prelude::{JobError, JobResult}; +use crate::ops::indexing::change_detection::handler::{build_dir_entry, ChangeHandler}; +use crate::ops::indexing::change_detection::types::{ChangeType, EntryRef}; +use crate::ops::indexing::db_writer::EntryMetadata; +use crate::ops::indexing::persistence::IndexPersistence; +use crate::ops::indexing::state::{DirEntry, EntryKind}; + +use super::EphemeralIndex; + +use anyhow::Result; +use std::collections::HashMap; +use std::path::{Path, PathBuf}; +use std::sync::atomic::{AtomicI32, Ordering}; +use std::sync::Arc; +use std::time::SystemTime; +use tokio::sync::RwLock; +use uuid::Uuid; + +/// Unified writer for ephemeral (in-memory) index storage. +/// +/// Implements both `ChangeHandler` (for the watcher pipeline) and `IndexPersistence` +/// (for the indexer job pipeline). Both pipelines share: +/// - The same `EphemeralIndex` storage +/// - UUID generation and tracking +/// - Event emission for UI updates +/// - Entry ID generation +pub struct EphemeralWriter { + index: Arc>, + event_bus: Arc, + root_path: PathBuf, + next_id: AtomicI32, +} + +impl EphemeralWriter { + pub fn new( + index: Arc>, + event_bus: Arc, + root_path: PathBuf, + ) -> Self { + Self { + index, + event_bus, + root_path, + next_id: AtomicI32::new(1), + } + } + + /// Generate the next entry ID. + fn next_id(&self) -> i32 { + self.next_id.fetch_add(1, Ordering::SeqCst) + } + + /// Add an entry to the index and emit a ResourceChanged event. + /// + /// This is the core write operation used by both pipelines. + async fn add_entry_internal( + &self, + path: &Path, + uuid: Uuid, + metadata: EntryMetadata, + ) -> Result<(i32, Option)> { + let content_kind = { + let mut index = self.index.write().await; + index + .add_entry(path.to_path_buf(), uuid, metadata.clone()) + .map_err(|e| anyhow::anyhow!("Failed to add entry to ephemeral index: {}", e))? + }; + + let entry_id = self.next_id(); + Ok((entry_id, content_kind)) + } + + /// Emit a ResourceChanged event for UI updates. 
+ async fn emit_resource_changed( + &self, + uuid: Uuid, + path: &Path, + metadata: &EntryMetadata, + content_kind: crate::domain::ContentKind, + ) { + use crate::device::get_current_device_slug; + use crate::domain::addressing::SdPath; + use crate::domain::file::File; + use crate::infra::event::{Event, ResourceMetadata}; + + let device_slug = get_current_device_slug(); + + let sd_path = SdPath::Physical { + device_slug: device_slug.clone(), + path: path.to_path_buf(), + }; + + let mut file = File::from_ephemeral(uuid, metadata, sd_path); + file.content_kind = content_kind; + + let parent_path = path.parent().map(|p| SdPath::Physical { + device_slug: file.sd_path.device_slug().unwrap_or("local").to_string(), + path: p.to_path_buf(), + }); + + let affected_paths = parent_path.into_iter().collect(); + + if let Ok(resource_json) = serde_json::to_value(&file) { + self.event_bus.emit(Event::ResourceChanged { + resource_type: "file".to_string(), + resource: resource_json, + metadata: Some(ResourceMetadata { + no_merge_fields: vec!["sd_path".to_string()], + alternate_ids: vec![], + affected_paths, + }), + }); + } + } +} + +#[async_trait::async_trait] +impl ChangeHandler for EphemeralWriter { + async fn find_by_path(&self, path: &Path) -> Result> { + let index = self.index.read().await; + + if let Some(metadata) = index.get_entry_ref(&path.to_path_buf()) { + let uuid = index.get_entry_uuid(&path.to_path_buf()); + + Ok(Some(EntryRef { + id: 0, + uuid, + path: path.to_path_buf(), + kind: metadata.kind, + })) + } else { + Ok(None) + } + } + + async fn find_by_inode(&self, _inode: u64) -> Result> { + // Ephemeral index doesn't track inodes for move detection + Ok(None) + } + + async fn create(&mut self, metadata: &DirEntry, _parent_path: &Path) -> Result { + let entry_uuid = Uuid::new_v4(); + let entry_metadata = EntryMetadata::from(metadata.clone()); + + let (entry_id, content_kind) = self + .add_entry_internal(&metadata.path, entry_uuid, entry_metadata.clone()) + .await?; + + // Emit event if entry was actually added (not a duplicate) + if let Some(content_kind) = content_kind { + self.emit_resource_changed(entry_uuid, &metadata.path, &entry_metadata, content_kind) + .await; + } + + Ok(EntryRef { + id: entry_id, + uuid: Some(entry_uuid), + path: metadata.path.clone(), + kind: metadata.kind, + }) + } + + async fn update(&mut self, entry: &EntryRef, metadata: &DirEntry) -> Result<()> { + let uuid = entry.uuid.unwrap_or_else(Uuid::new_v4); + let entry_metadata = EntryMetadata::from(metadata.clone()); + + { + let mut index = self.index.write().await; + let _ = index.add_entry(metadata.path.clone(), uuid, entry_metadata); + } + + Ok(()) + } + + async fn move_entry( + &mut self, + entry: &EntryRef, + old_path: &Path, + new_path: &Path, + _new_parent_path: &Path, + ) -> Result<()> { + let metadata = build_dir_entry(new_path, None).await?; + + { + let mut index = self.index.write().await; + index.remove_entry(old_path); + + let uuid = entry.uuid.unwrap_or_else(Uuid::new_v4); + let entry_metadata = EntryMetadata::from(metadata.clone()); + let _ = index.add_entry(new_path.to_path_buf(), uuid, entry_metadata); + } + + Ok(()) + } + + async fn delete(&mut self, entry: &EntryRef) -> Result<()> { + { + let mut index = self.index.write().await; + + if entry.is_directory() { + index.remove_directory_tree(&entry.path); + } else { + index.remove_entry(&entry.path); + } + } + + Ok(()) + } + + async fn run_processors(&self, _entry: &EntryRef, _is_new: bool) -> Result<()> { + // Ephemeral indexing skips processor 
+    async fn emit_change_event(&self, entry: &EntryRef, _change_type: ChangeType) -> Result<()> {
+        let Some(uuid) = entry.uuid else {
+            return Ok(());
+        };
+
+        let content_kind = {
+            let index = self.index.read().await;
+            index.get_content_kind(&entry.path)
+        };
+
+        let metadata = build_dir_entry(&entry.path, None).await.ok();
+
+        if let Some(meta) = metadata {
+            let entry_metadata = EntryMetadata::from(meta);
+            self.emit_resource_changed(uuid, &entry.path, &entry_metadata, content_kind)
+                .await;
+        }
+
+        Ok(())
+    }
+
+    async fn handle_new_directory(&self, path: &Path) -> Result<()> {
+        use crate::ops::indexing::db_writer::DBWriter;
+
+        let mut entries = match tokio::fs::read_dir(path).await {
+            Ok(e) => e,
+            Err(e) => {
+                tracing::warn!(
+                    "Failed to read directory {} for ephemeral indexing: {}",
+                    path.display(),
+                    e
+                );
+                return Ok(());
+            }
+        };
+
+        let mut index = self.index.write().await;
+
+        while let Ok(Some(entry)) = entries.next_entry().await {
+            let entry_path = entry.path();
+
+            if let Ok(metadata) = entry.metadata().await {
+                let kind = if metadata.is_dir() {
+                    EntryKind::Directory
+                } else if metadata.is_symlink() {
+                    EntryKind::Symlink
+                } else {
+                    EntryKind::File
+                };
+
+                let entry_metadata = EntryMetadata {
+                    path: entry_path.clone(),
+                    kind,
+                    size: metadata.len(),
+                    modified: metadata.modified().ok(),
+                    accessed: metadata.accessed().ok(),
+                    created: metadata.created().ok(),
+                    inode: DBWriter::get_inode(&metadata),
+                    permissions: None,
+                    is_hidden: entry_path
+                        .file_name()
+                        .and_then(|n| n.to_str())
+                        .map(|n| n.starts_with('.'))
+                        .unwrap_or(false),
+                };
+
+                let uuid = Uuid::new_v4();
+                let _ = index.add_entry(entry_path, uuid, entry_metadata);
+            }
+        }
+
+        Ok(())
+    }
+}
+
+// ============================================================================
+// IndexPersistence Implementation (Job Pipeline)
+// ============================================================================
+
+#[async_trait::async_trait]
+impl IndexPersistence for EphemeralWriter {
+    async fn store_entry(
+        &self,
+        entry: &DirEntry,
+        _location_id: Option<i32>,
+        _location_root_path: &Path,
+    ) -> JobResult<i32> {
+        use crate::ops::indexing::db_writer::DBWriter;
+
+        let metadata = DBWriter::extract_metadata(&entry.path, None)
+            .await
+            .map_err(|e| JobError::execution(format!("Failed to extract metadata: {}", e)))?;
+
+        let entry_uuid = Uuid::new_v4();
+
+        let (entry_id, content_kind) = {
+            let mut index = self.index.write().await;
+            let content_kind = index
+                .add_entry(entry.path.clone(), entry_uuid, metadata.clone())
+                .map_err(|e| {
+                    tracing::error!("Failed to add entry to ephemeral index: {}", e);
+                    JobError::execution(format!("Failed to add entry: {}", e))
+                })?;
+
+            if content_kind.is_some() {
+                match entry.kind {
+                    EntryKind::File => index.stats.files += 1,
+                    EntryKind::Directory => index.stats.dirs += 1,
+                    EntryKind::Symlink => index.stats.symlinks += 1,
+                }
+                index.stats.bytes += entry.size;
+            }
+
+            (self.next_id(), content_kind)
+        };
+
+        // Emit event if entry was actually added (not a duplicate)
+        if let Some(content_kind) = content_kind {
+            self.emit_resource_changed(entry_uuid, &entry.path, &metadata, content_kind)
+                .await;
+        }
+
+        Ok(entry_id)
+    }
+
+    async fn store_content_identity(
+        &self,
+        _entry_id: i32,
+        _path: &Path,
+        _cas_id: String,
+    ) -> JobResult<()> {
+        // Ephemeral indexing doesn't track content identities
+        Ok(())
+    }
+
+    async fn get_existing_entries(
+        &self,
+        _indexing_path: &Path,
+    ) -> JobResult<HashMap<PathBuf, (i32, Option<u64>, Option<SystemTime>, u64)>>
{ + // Ephemeral indexing doesn't support incremental indexing + Ok(HashMap::new()) + } + + async fn update_entry(&self, _entry_id: i32, _entry: &DirEntry) -> JobResult<()> { + // Updates are handled via add_entry (overwrites existing) + Ok(()) + } + + fn is_persistent(&self) -> bool { + false + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::infra::event::Event; + use tempfile::TempDir; + + #[tokio::test] + async fn test_ephemeral_writer_as_change_handler() { + let temp_dir = TempDir::new().unwrap(); + let test_file = temp_dir.path().join("test.txt"); + std::fs::write(&test_file, b"test content").unwrap(); + + let index = Arc::new(RwLock::new( + EphemeralIndex::new().expect("failed to create ephemeral index"), + )); + let event_bus = Arc::new(EventBus::new(1024)); + + let mut writer = + EphemeralWriter::new(index.clone(), event_bus, temp_dir.path().to_path_buf()); + + // Test create + let dir_entry = DirEntry { + path: test_file.clone(), + kind: EntryKind::File, + size: 12, + modified: Some(std::time::SystemTime::now()), + inode: Some(12345), + }; + + let entry_ref = writer + .create(&dir_entry, temp_dir.path()) + .await + .expect("create should succeed"); + + assert!(entry_ref.uuid.is_some()); + assert_eq!(entry_ref.path, test_file); + assert_eq!(entry_ref.kind, EntryKind::File); + + // Verify entry exists + let found = writer + .find_by_path(&test_file) + .await + .expect("find should succeed"); + assert!(found.is_some()); + } + + #[tokio::test] + async fn test_ephemeral_writer_as_index_persistence() { + let temp_dir = TempDir::new().unwrap(); + let test_file = temp_dir.path().join("test.txt"); + std::fs::write(&test_file, b"test content").unwrap(); + + let index = Arc::new(RwLock::new( + EphemeralIndex::new().expect("failed to create ephemeral index"), + )); + let event_bus = Arc::new(EventBus::new(1024)); + + let writer = EphemeralWriter::new(index.clone(), event_bus, temp_dir.path().to_path_buf()); + + let dir_entry = DirEntry { + path: test_file.clone(), + kind: EntryKind::File, + size: 12, + modified: Some(std::time::SystemTime::now()), + inode: Some(12345), + }; + + let entry_id = writer + .store_entry(&dir_entry, None, temp_dir.path()) + .await + .expect("store_entry should succeed"); + + assert!(entry_id > 0); + assert!(!writer.is_persistent()); + + // Verify index was updated + let idx = index.read().await; + assert!(idx.has_entry(&test_file)); + } + + #[tokio::test] + async fn test_event_emission_consistency() { + let temp_dir = TempDir::new().unwrap(); + let test_file = temp_dir.path().join("test.txt"); + std::fs::write(&test_file, b"test content").unwrap(); + + let index = Arc::new(RwLock::new( + EphemeralIndex::new().expect("failed to create ephemeral index"), + )); + + let event_bus = Arc::new(EventBus::new(1024)); + let mut subscriber = event_bus.subscribe(); + + let writer = EphemeralWriter::new(index.clone(), event_bus, temp_dir.path().to_path_buf()); + + let dir_entry = DirEntry { + path: test_file.clone(), + kind: EntryKind::File, + size: 12, + modified: Some(std::time::SystemTime::now()), + inode: Some(12345), + }; + + writer + .store_entry(&dir_entry, None, temp_dir.path()) + .await + .expect("store_entry should succeed"); + + // Try to receive the event + let event = + tokio::time::timeout(tokio::time::Duration::from_millis(100), subscriber.recv()).await; + + assert!(event.is_ok(), "Should receive an event"); + if let Ok(Ok(Event::ResourceChanged { resource, .. 
})) = event { + let uuid = resource["id"].as_str(); + assert!(uuid.is_some(), "Event should have UUID"); + } + } +} diff --git a/core/src/ops/indexing/job.rs b/core/src/ops/indexing/job.rs index eebf87210..197f7d8f1 100644 --- a/core/src/ops/indexing/job.rs +++ b/core/src/ops/indexing/job.rs @@ -637,7 +637,7 @@ impl IndexerJob { ctx: &JobContext<'_>, root_path: &std::path::Path, ) -> JobResult<()> { - use super::entry::EntryProcessor; + use super::db_writer::DBWriter; use super::state::{DirEntry, EntryKind}; use tokio::fs; @@ -671,7 +671,7 @@ impl IndexerJob { kind: entry_kind, size: metadata.len(), modified: metadata.modified().ok(), - inode: EntryProcessor::get_inode(&metadata), + inode: DBWriter::get_inode(&metadata), }; state.pending_entries.push(dir_entry); diff --git a/core/src/ops/indexing/mod.rs b/core/src/ops/indexing/mod.rs index 85e2914e3..84a0f0f3f 100644 --- a/core/src/ops/indexing/mod.rs +++ b/core/src/ops/indexing/mod.rs @@ -23,7 +23,7 @@ pub mod action; pub mod change_detection; pub mod ctx; -pub mod entry; +pub mod db_writer; pub mod ephemeral; pub mod hierarchy; pub mod input; @@ -42,17 +42,14 @@ pub mod verify; pub use action::IndexingAction; pub use change_detection::{ apply_batch as apply_change_batch, Change, ChangeConfig, ChangeDetector, ChangeHandler, - ChangeType, EntryRef, EphemeralChangeHandler, PersistentChangeHandler, + ChangeType, EntryRef, PersistentWriter, PersistentWriterAdapter, }; pub use ctx::{IndexingCtx, ResponderCtx}; -pub use entry::{EntryMetadata, EntryProcessor}; -pub use ephemeral::EphemeralIndexCache; +pub use db_writer::{DBWriter, EntryMetadata}; +pub use ephemeral::{EphemeralIndex, EphemeralIndexCache, EphemeralIndexStats, EphemeralWriter}; pub use hierarchy::HierarchyQuery; pub use input::IndexInput; -pub use job::{ - EphemeralIndex, EphemeralIndexStats, IndexMode, IndexPersistence, IndexScope, IndexerJob, - IndexerJobConfig, IndexerOutput, -}; +pub use job::{IndexMode, IndexScope, IndexerJob, IndexerJobConfig, IndexerOutput}; pub use metrics::IndexerMetrics; pub use path_resolver::PathResolver; pub use persistence::{IndexPersistence as PersistenceTrait, PersistenceFactory}; diff --git a/core/src/ops/indexing/persistence.rs b/core/src/ops/indexing/persistence.rs index 7becd8df1..ac1903759 100644 --- a/core/src/ops/indexing/persistence.rs +++ b/core/src/ops/indexing/persistence.rs @@ -5,36 +5,24 @@ //! This abstraction allows the same indexing pipeline to work for both managed //! locations (database-backed) and ephemeral browsing (memory-only). //! +//! For ephemeral storage, use `EphemeralWriter` from `crate::ops::indexing::ephemeral` +//! which implements both `IndexPersistence` and `ChangeHandler`. +//! +//! For persistent storage, use `PersistentWriterAdapter` from `crate::ops::indexing::change_detection` +//! which implements `IndexPersistence` and delegates to `DBWriter` for database writes. 
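The module doc above describes the split between the two writers. A toy model of that pattern (invented `Persistence` trait and backend types, not this crate's real API; assumes the `async-trait` and `tokio` crates) showing how one pipeline drives either backend through a boxed trait object:

```rust
use async_trait::async_trait;

#[async_trait]
trait Persistence: Send + Sync {
    async fn store(&self, path: &str) -> i32;
    fn is_persistent(&self) -> bool;
}

struct InMemory;
struct Database;

#[async_trait]
impl Persistence for InMemory {
    async fn store(&self, _path: &str) -> i32 { 1 }
    fn is_persistent(&self) -> bool { false }
}

#[async_trait]
impl Persistence for Database {
    async fn store(&self, _path: &str) -> i32 { 42 }
    fn is_persistent(&self) -> bool { true }
}

fn make_writer(ephemeral: bool) -> Box<dyn Persistence> {
    // Mirrors the factory idea: the caller picks a backend once,
    // and the indexing pipeline only ever sees the trait object.
    if ephemeral { Box::new(InMemory) } else { Box::new(Database) }
}

#[tokio::main]
async fn main() {
    let writer = make_writer(true);
    let id = writer.store("/tmp/file.txt").await;
    println!("stored id={id}, persistent={}", writer.is_persistent());
}
```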
-use crate::{ - filetype::FileTypeRegistry, - infra::{ - db::entities::{self, directory_paths, entry_closure}, - job::prelude::{JobContext, JobError, JobResult}, - }, -}; -use sea_orm::{ - ActiveModelTrait, ActiveValue::Set, ColumnTrait, Condition, ConnectionTrait, DbBackend, - EntityTrait, JoinType, QueryFilter, QuerySelect, RelationTrait, Statement, TransactionTrait, -}; +use crate::infra::job::prelude::{JobError, JobResult}; use std::{ collections::HashMap, path::{Path, PathBuf}, - sync::Arc, }; -use tokio::sync::RwLock; -use uuid::Uuid; -use super::{ - ephemeral::EphemeralIndex, - state::{DirEntry, EntryKind}, - PathResolver, -}; +use super::{ephemeral::EphemeralIndex, state::DirEntry}; /// Unified storage interface for persistent and ephemeral indexing. /// -/// Implementations handle either database writes (DatabasePersistence) or -/// in-memory storage (EphemeralPersistence). The indexing pipeline calls +/// Implementations handle either database writes (`PersistentWriterAdapter`) or +/// in-memory storage (`EphemeralWriter`). The indexing pipeline calls /// these methods without knowing which backend is active. #[async_trait::async_trait] pub trait IndexPersistence: Send + Sync { @@ -84,512 +72,40 @@ pub trait IndexPersistence: Send + Sync { fn is_persistent(&self) -> bool; } -/// Database-backed persistence with RwLock-protected entry ID cache. -/// -/// This implementation writes all entries to the database and manages a cache of -/// path -> entry_id mappings for fast parent lookups during hierarchy construction. -/// The cache uses RwLock instead of clone-modify-write to prevent race conditions -/// where concurrent cache updates overwrite each other. -pub struct DatabasePersistence<'a> { - ctx: &'a JobContext<'a>, - device_id: i32, - location_root_entry_id: Option, - entry_id_cache: Arc>>, -} - -impl<'a> DatabasePersistence<'a> { - pub fn new( - ctx: &'a JobContext<'a>, - device_id: i32, - location_root_entry_id: Option, - ) -> Self { - Self { - ctx, - device_id, - location_root_entry_id, - entry_id_cache: Arc::new(RwLock::new(HashMap::new())), - } - } -} - -#[async_trait::async_trait] -impl<'a> IndexPersistence for DatabasePersistence<'a> { - async fn store_entry( - &self, - entry: &DirEntry, - _location_id: Option, - location_root_path: &Path, - ) -> JobResult { - use super::entry::EntryProcessor; - - // Cache lookups use RwLock read/write operations instead of clone-modify-write. 
- let parent_id = if let Some(parent_path) = entry.path.parent() { - let cached_parent = { - let cache = self.entry_id_cache.read().await; - cache.get(parent_path).copied() - }; - - if let Some(id) = cached_parent { - Some(id) - } else { - let parent_path_str = parent_path.to_string_lossy().to_string(); - if let Ok(Some(dir_path_record)) = entities::directory_paths::Entity::find() - .filter(entities::directory_paths::Column::Path.eq(&parent_path_str)) - .one(self.ctx.library_db()) - .await - { - let mut cache = self.entry_id_cache.write().await; - cache.insert(parent_path.to_path_buf(), dir_path_record.entry_id); - Some(dir_path_record.entry_id) - } else { - tracing::warn!( - "Parent not found for {}: {}", - entry.path.display(), - parent_path.display() - ); - None - } - } - } else { - None - }; - - use entities::entry_closure; - - let extension = match entry.kind { - EntryKind::File => entry - .path - .extension() - .and_then(|ext| ext.to_str()) - .map(|ext| ext.to_lowercase()), - EntryKind::Directory | EntryKind::Symlink => None, - }; - - let name = match entry.kind { - EntryKind::File => entry - .path - .file_stem() - .map(|stem| stem.to_string_lossy().to_string()) - .unwrap_or_else(|| { - entry - .path - .file_name() - .map(|n| n.to_string_lossy().to_string()) - .unwrap_or_else(|| "unknown".to_string()) - }), - EntryKind::Directory | EntryKind::Symlink => entry - .path - .file_name() - .map(|n| n.to_string_lossy().to_string()) - .unwrap_or_else(|| "unknown".to_string()), - }; - - let modified_at = entry - .modified - .and_then(|t| { - chrono::DateTime::from_timestamp( - t.duration_since(std::time::UNIX_EPOCH).ok()?.as_secs() as i64, - 0, - ) - }) - .unwrap_or_else(|| chrono::Utc::now()); - - let entry_uuid = Some(Uuid::new_v4()); - - let new_entry = entities::entry::ActiveModel { - uuid: Set(entry_uuid), - name: Set(name.clone()), - kind: Set(EntryProcessor::entry_kind_to_int(entry.kind)), - extension: Set(extension), - metadata_id: Set(None), - content_id: Set(None), - size: Set(entry.size as i64), - aggregate_size: Set(0), - child_count: Set(0), - file_count: Set(0), - created_at: Set(chrono::Utc::now()), - modified_at: Set(modified_at), - accessed_at: Set(None), - permissions: Set(None), - inode: Set(entry.inode.map(|i| i as i64)), - parent_id: Set(parent_id), - ..Default::default() - }; - - let txn = self - .ctx - .library_db() - .begin() - .await - .map_err(|e| JobError::execution(format!("Failed to begin transaction: {}", e)))?; - - let result = new_entry - .insert(&txn) - .await - .map_err(|e| JobError::execution(format!("Failed to create entry: {}", e)))?; - - let self_closure = entry_closure::ActiveModel { - ancestor_id: Set(result.id), - descendant_id: Set(result.id), - depth: Set(0), - ..Default::default() - }; - self_closure - .insert(&txn) - .await - .map_err(|e| JobError::execution(format!("Failed to insert self-closure: {}", e)))?; - - if let Some(parent_id) = parent_id { - txn.execute(Statement::from_sql_and_values( - DbBackend::Sqlite, - "INSERT INTO entry_closure (ancestor_id, descendant_id, depth) \ - SELECT ancestor_id, ?, depth + 1 \ - FROM entry_closure \ - WHERE descendant_id = ?", - vec![result.id.into(), parent_id.into()], - )) - .await - .map_err(|e| { - JobError::execution(format!("Failed to populate ancestor closures: {}", e)) - })?; - } - - if entry.kind == EntryKind::Directory { - let absolute_path = entry.path.to_string_lossy().to_string(); - let dir_path_entry = entities::directory_paths::ActiveModel { - entry_id: Set(result.id), - path: 
Set(absolute_path), - ..Default::default() - }; - dir_path_entry.insert(&txn).await.map_err(|e| { - JobError::execution(format!("Failed to insert directory path: {}", e)) - })?; - } - - txn.commit() - .await - .map_err(|e| JobError::execution(format!("Failed to commit transaction: {}", e)))?; - - tracing::info!( - "ENTRY_SYNC: About to sync entry name={} uuid={:?}", - result.name, - result.uuid - ); - if let Err(e) = self - .ctx - .library() - .sync_model_with_db( - &result, - crate::infra::sync::ChangeType::Insert, - self.ctx.library_db(), - ) - .await - { - tracing::warn!( - "ENTRY_SYNC: Failed to sync entry {}: {}", - result - .uuid - .map(|u| u.to_string()) - .unwrap_or_else(|| "no-uuid".to_string()), - e - ); - } else { - tracing::info!( - "ENTRY_SYNC: Successfully synced entry name={} uuid={:?}", - result.name, - result.uuid - ); - } - - { - let mut cache = self.entry_id_cache.write().await; - cache.insert(entry.path.clone(), result.id); - } - - Ok(result.id) - } - - async fn store_content_identity( - &self, - entry_id: i32, - path: &Path, - cas_id: String, - ) -> JobResult<()> { - use super::entry::EntryProcessor; - - let library_id = self.ctx.library().id(); - - EntryProcessor::link_to_content_identity(self.ctx, entry_id, path, cas_id, library_id) - .await - .map(|_| ()) - } - - async fn get_existing_entries( - &self, - indexing_path: &Path, - ) -> JobResult< - HashMap, Option, u64)>, - > { - use sea_orm::{ColumnTrait, EntityTrait, QueryFilter}; - - let location_root_entry_id = match self.location_root_entry_id { - Some(id) => id, - None => return Ok(HashMap::new()), - }; - - let indexing_path_str = indexing_path.to_string_lossy().to_string(); - let indexing_path_entry_id = if let Ok(Some(dir_record)) = directory_paths::Entity::find() - .filter(directory_paths::Column::Path.eq(&indexing_path_str)) - .one(self.ctx.library_db()) - .await - { - dir_record.entry_id - } else { - location_root_entry_id - }; - - let descendant_ids = entry_closure::Entity::find() - .filter(entry_closure::Column::AncestorId.eq(indexing_path_entry_id)) - .all(self.ctx.library_db()) - .await - .map_err(|e| JobError::execution(format!("Failed to query closure table: {}", e)))? - .into_iter() - .map(|ec| ec.descendant_id) - .collect::>(); - - let mut all_entry_ids = vec![indexing_path_entry_id]; - all_entry_ids.extend(descendant_ids); - - // Chunk queries to stay under SQLite's 999 variable limit. 
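The removed comment above refers to SQLite's default cap of 999 bound variables per statement, which is why the lookup is split into chunks. A dependency-free sketch of the chunked `IN (...)` pattern (the closure is a toy stand-in for the sea-orm query):

```rust
/// Run `query` over `ids` in chunks that stay under SQLite's default
/// limit of 999 bound variables per statement.
fn query_in_chunks<F>(ids: &[i32], mut query: F) -> Vec<i32>
where
    F: FnMut(&[i32]) -> Vec<i32>,
{
    const CHUNK_SIZE: usize = 900; // headroom below the 999 cap
    let mut rows = Vec::with_capacity(ids.len());
    for chunk in ids.chunks(CHUNK_SIZE) {
        rows.extend(query(chunk));
    }
    rows
}

fn main() {
    let ids: Vec<i32> = (0..2500).collect();
    // Toy "query" that echoes the chunk back.
    let rows = query_in_chunks(&ids, |chunk| chunk.to_vec());
    assert_eq!(rows.len(), 2500); // 3 statements instead of 1 oversized one
}
```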
- let mut existing_entries: Vec = Vec::new(); - let chunk_size: usize = 900; - for chunk in all_entry_ids.chunks(chunk_size) { - let mut batch = entities::entry::Entity::find() - .filter(entities::entry::Column::Id.is_in(chunk.to_vec())) - .all(self.ctx.library_db()) - .await - .map_err(|e| { - JobError::execution(format!("Failed to query existing entries: {}", e)) - })?; - existing_entries.append(&mut batch); - } - - let mut result = HashMap::new(); - - self.ctx.log(format!( - "Loading {} existing entries", - existing_entries.len() - )); - - for entry in existing_entries { - let full_path = PathResolver::get_full_path(self.ctx.library_db(), entry.id) - .await - .unwrap_or_else(|_| PathBuf::from(&entry.name)); - - let modified_time = - entry - .modified_at - .timestamp() - .try_into() - .ok() - .and_then(|secs: u64| { - std::time::UNIX_EPOCH.checked_add(std::time::Duration::from_secs(secs)) - }); - - result.insert( - full_path, - ( - entry.id, - entry.inode.map(|i| i as u64), - modified_time, - entry.size as u64, - ), - ); - } - - Ok(result) - } - - async fn update_entry(&self, entry_id: i32, entry: &DirEntry) -> JobResult<()> { - use super::entry::EntryProcessor; - - EntryProcessor::update_entry(self.ctx, entry_id, entry).await - } - - fn is_persistent(&self) -> bool { - true - } -} - -/// In-memory ephemeral persistence for browsing unmanaged paths. -/// -/// Stores entries in an `EphemeralIndex` (memory-only) and emits ResourceChanged -/// events for immediate UI updates. -pub struct EphemeralPersistence { - index: Arc>, - next_entry_id: Arc>, - event_bus: Option>, - root_path: PathBuf, -} - -impl EphemeralPersistence { - pub fn new( - index: Arc>, - event_bus: Option>, - root_path: PathBuf, - ) -> Self { - Self { - index, - next_entry_id: Arc::new(RwLock::new(1)), - event_bus, - root_path, - } - } - - async fn get_next_id(&self) -> i32 { - let mut id = self.next_entry_id.write().await; - let current = *id; - *id += 1; - current - } -} - -#[async_trait::async_trait] -impl IndexPersistence for EphemeralPersistence { - async fn store_entry( - &self, - entry: &DirEntry, - _location_id: Option, - _location_root_path: &Path, - ) -> JobResult { - use super::entry::EntryProcessor; - - let metadata = EntryProcessor::extract_metadata(&entry.path, None) - .await - .map_err(|e| JobError::execution(format!("Failed to extract metadata: {}", e)))?; - - let entry_id = self.get_next_id().await; - let entry_uuid = Uuid::new_v4(); - - // add_entry returns Ok(Some(content_kind)) if added, Ok(None) if duplicate path. 
- let content_kind = { - let mut index = self.index.write().await; - let result = index - .add_entry(entry.path.clone(), entry_uuid, metadata.clone()) - .map_err(|e| { - tracing::error!("Failed to add entry to ephemeral index: {}", e); - e - })?; - - if result.is_some() { - match entry.kind { - EntryKind::File => index.stats.files += 1, - EntryKind::Directory => index.stats.dirs += 1, - EntryKind::Symlink => index.stats.symlinks += 1, - } - index.stats.bytes += entry.size; - } - result - }; - - let Some(content_kind) = content_kind else { - return Ok(entry_id); - }; - - if let Some(event_bus) = &self.event_bus { - use crate::device::get_current_device_slug; - use crate::domain::addressing::SdPath; - use crate::domain::file::File; - use crate::infra::event::{Event, ResourceMetadata}; - - let device_slug = get_current_device_slug(); - - let sd_path = SdPath::Physical { - device_slug: device_slug.clone(), - path: entry.path.clone(), - }; - - let mut file = File::from_ephemeral(entry_uuid, &metadata, sd_path); - file.content_kind = content_kind; - - let parent_path = entry.path.parent().map(|p| SdPath::Physical { - device_slug: file.sd_path.device_slug().unwrap_or("local").to_string(), - path: p.to_path_buf(), - }); - - let affected_paths = if let Some(parent) = parent_path { - vec![parent] - } else { - vec![] - }; - - if let Ok(resource_json) = serde_json::to_value(&file) { - event_bus.emit(Event::ResourceChanged { - resource_type: "file".to_string(), - resource: resource_json, - metadata: Some(ResourceMetadata { - no_merge_fields: vec!["sd_path".to_string()], - alternate_ids: vec![], - affected_paths, - }), - }); - } - } - - Ok(entry_id) - } - - async fn store_content_identity( - &self, - _entry_id: i32, - _path: &Path, - _cas_id: String, - ) -> JobResult<()> { - Ok(()) - } - - async fn get_existing_entries( - &self, - _indexing_path: &Path, - ) -> JobResult< - HashMap, Option, u64)>, - > { - Ok(HashMap::new()) - } - - async fn update_entry(&self, _entry_id: i32, _entry: &DirEntry) -> JobResult<()> { - Ok(()) - } - - fn is_persistent(&self) -> bool { - false - } -} - /// Factory for creating appropriate persistence implementations pub struct PersistenceFactory; impl PersistenceFactory { - /// Create a database persistence instance + /// Create a database persistence instance using the unified PersistentWriterAdapter. + /// + /// This delegates to `DBWriter` for all database operations, ensuring + /// consistency between the watcher and indexer pipelines. pub fn database<'a>( - ctx: &'a JobContext<'a>, - device_id: i32, + ctx: &'a crate::infra::job::prelude::JobContext<'a>, + library_id: uuid::Uuid, location_root_entry_id: Option, ) -> Box { - Box::new(DatabasePersistence::new( + use crate::ops::indexing::change_detection::PersistentWriterAdapter; + + Box::new(PersistentWriterAdapter::new( ctx, - device_id, + library_id, location_root_entry_id, )) } - /// Create an ephemeral persistence instance + /// Create an ephemeral persistence instance using the unified EphemeralWriter. 
     pub fn ephemeral(
-        index: Arc<RwLock<EphemeralIndex>>,
-        event_bus: Option<Arc<EventBus>>,
+        index: std::sync::Arc<tokio::sync::RwLock<EphemeralIndex>>,
+        event_bus: Option<std::sync::Arc<crate::infra::event::EventBus>>,
         root_path: PathBuf,
     ) -> Box<dyn IndexPersistence> {
-        Box::new(EphemeralPersistence::new(index, event_bus, root_path))
+        use super::ephemeral::EphemeralWriter;
+
+        let event_bus =
+            event_bus.unwrap_or_else(|| std::sync::Arc::new(crate::infra::event::EventBus::new(1024)));
+
+        Box::new(EphemeralWriter::new(index, event_bus, root_path))
     }
 }
 
@@ -597,42 +113,31 @@ impl PersistenceFactory {
 mod tests {
     use super::*;
     use crate::infra::event::Event;
+    use crate::ops::indexing::ephemeral::EphemeralWriter;
     use crate::ops::indexing::state::{DirEntry, EntryKind};
-    use std::sync::Mutex;
+    use std::sync::Arc;
     use tempfile::TempDir;
+    use tokio::sync::RwLock;
 
     #[tokio::test]
-    async fn test_ephemeral_uuid_consistency() {
-        // Create temp directory for test
+    async fn test_ephemeral_writer_via_factory() {
         let temp_dir = TempDir::new().unwrap();
         let test_file = temp_dir.path().join("test.txt");
         std::fs::write(&test_file, b"test content").unwrap();
 
-        // Create ephemeral index
         let index = Arc::new(RwLock::new(
             EphemeralIndex::new().expect("failed to create ephemeral index"),
         ));
 
-        // Create event collector
-        let collected_events = Arc::new(Mutex::new(Vec::new()));
-        let events_clone = collected_events.clone();
+        let event_bus = Arc::new(crate::infra::event::EventBus::new(1024));
+        let mut subscriber = event_bus.subscribe();
 
-        // Create mock event bus that collects events
-        let event_bus = Arc::new(crate::infra::event::EventBus::new());
-        let _subscription = event_bus.subscribe(move |event| {
-            if let Event::ResourceChanged { resource, .. } = event {
-                events_clone.lock().unwrap().push(resource.clone());
-            }
-        });
-
-        // Create ephemeral persistence
-        let persistence = EphemeralPersistence::new(
+        let writer = PersistenceFactory::ephemeral(
             index.clone(),
             Some(event_bus),
             temp_dir.path().to_path_buf(),
         );
 
-        // Store entry (processing phase)
         let dir_entry = DirEntry {
             path: test_file.clone(),
             kind: EntryKind::File,
@@ -641,51 +146,53 @@ mod tests {
             inode: Some(12345),
         };
 
-        let entry_id = persistence
+        let entry_id = writer
             .store_entry(&dir_entry, None, temp_dir.path())
             .await
             .unwrap();
 
-        // Store content identity (content phase)
-        let cas_id = "test_hash_123".to_string();
-        persistence
-            .store_content_identity(entry_id, &test_file, cas_id)
+        assert!(entry_id > 0);
+        assert!(!writer.is_persistent());
+
+        let event =
+            tokio::time::timeout(tokio::time::Duration::from_millis(100), subscriber.recv()).await;
+
+        assert!(event.is_ok(), "Should receive an event");
+        if let Ok(Ok(Event::ResourceChanged { resource, ..
})) = event { + let uuid = resource["id"].as_str(); + assert!(uuid.is_some(), "Event should have UUID"); + } + } + + #[tokio::test] + async fn test_ephemeral_writer_direct() { + let temp_dir = TempDir::new().unwrap(); + let test_file = temp_dir.path().join("test.txt"); + std::fs::write(&test_file, b"test content").unwrap(); + + let index = Arc::new(RwLock::new( + EphemeralIndex::new().expect("failed to create ephemeral index"), + )); + let event_bus = Arc::new(crate::infra::event::EventBus::new(1024)); + + let writer = EphemeralWriter::new(index.clone(), event_bus, temp_dir.path().to_path_buf()); + + let dir_entry = DirEntry { + path: test_file.clone(), + kind: EntryKind::File, + size: 12, + modified: Some(std::time::SystemTime::now()), + inode: Some(12345), + }; + + let entry_id = writer + .store_entry(&dir_entry, None, temp_dir.path()) .await .unwrap(); - // Give events time to propagate - tokio::time::sleep(tokio::time::Duration::from_millis(100)).await; + assert!(entry_id > 0); - // Collect all events - let events = collected_events.lock().unwrap(); - - // Should have 2 events: one from store_entry, one from store_content_identity - assert_eq!( - events.len(), - 2, - "Expected 2 ResourceChanged events (processing + content phases)" - ); - - // Extract UUIDs from both events - let uuid1 = events[0]["id"] - .as_str() - .expect("First event should have UUID"); - let uuid2 = events[1]["id"] - .as_str() - .expect("Second event should have UUID"); - - // CRITICAL: Both events must have the same UUID for the same file - assert_eq!( - uuid1, uuid2, - "UUID mismatch! Processing phase emitted UUID {} but content phase emitted UUID {}. \ - These should be identical so the UI can match the events.", - uuid1, uuid2 - ); - - // Verify the second event has content_identity - assert!( - events[1]["content_identity"].is_object(), - "Second event should include content_identity" - ); + let idx = index.read().await; + assert!(idx.has_entry(&test_file)); } } diff --git a/core/src/ops/indexing/phases/content.rs b/core/src/ops/indexing/phases/content.rs index 06142f974..b0126728c 100644 --- a/core/src/ops/indexing/phases/content.rs +++ b/core/src/ops/indexing/phases/content.rs @@ -12,7 +12,7 @@ use crate::{ infra::job::prelude::{JobContext, JobError, Progress}, ops::indexing::{ ctx::IndexingCtx, - entry::EntryProcessor, + db_writer::DBWriter, processor::{ContentHashProcessor, ProcessorEntry}, state::{EntryKind, IndexError, IndexPhase, IndexerProgress, IndexerState}, }, @@ -128,7 +128,7 @@ pub async fn run_content_phase( match hash_result { Ok(content_hash) => { - match EntryProcessor::link_to_content_identity( + match DBWriter::link_to_content_identity( ctx, entry_id, &path, diff --git a/core/src/ops/indexing/phases/discovery.rs b/core/src/ops/indexing/phases/discovery.rs index 21f5ede88..c27461152 100644 --- a/core/src/ops/indexing/phases/discovery.rs +++ b/core/src/ops/indexing/phases/discovery.rs @@ -9,7 +9,7 @@ use crate::{ infra::job::generic_progress::ToGenericProgress, infra::job::prelude::{JobContext, JobError, Progress}, ops::indexing::{ - entry::EntryProcessor, + db_writer::DBWriter, rules::{build_default_ruler, RuleToggles, RulerDecision}, state::{DirEntry, EntryKind, IndexError, IndexPhase, IndexerProgress, IndexerState}, }, diff --git a/core/src/ops/indexing/phases/processing.rs b/core/src/ops/indexing/phases/processing.rs index 3bc26f067..5d8176ccd 100644 --- a/core/src/ops/indexing/phases/processing.rs +++ b/core/src/ops/indexing/phases/processing.rs @@ -14,7 +14,7 @@ use crate::{ }, 
ops::indexing::{ change_detection::{Change, ChangeDetector}, - entry::EntryProcessor, + db_writer::DBWriter, state::{DirEntry, EntryKind, IndexError, IndexPhase, IndexerProgress, IndexerState}, IndexMode, }, @@ -282,7 +282,7 @@ pub async fn run_processing_phase( match change { Some(Change::New(_)) => { - match EntryProcessor::create_entry_in_conn( + match DBWriter::create_entry_in_conn( state, ctx, &entry, @@ -332,7 +332,7 @@ pub async fn run_processing_phase( } Some(Change::Modified { entry_id, .. }) => { - match EntryProcessor::update_entry_in_conn(ctx, entry_id, &entry, &txn).await { + match DBWriter::update_entry_in_conn(ctx, entry_id, &entry, &txn).await { Ok(()) => { ctx.log(format!( "Updated entry {}: {}", @@ -367,7 +367,7 @@ pub async fn run_processing_phase( old_path.display(), new_path.display() )); - match EntryProcessor::simple_move_entry_in_conn( + match DBWriter::simple_move_entry_in_conn( state, ctx, entry_id, &old_path, &new_path, &txn, ) .await diff --git a/core/src/ops/indexing/processor.rs b/core/src/ops/indexing/processor.rs index 33278a581..42106bd8f 100644 --- a/core/src/ops/indexing/processor.rs +++ b/core/src/ops/indexing/processor.rs @@ -5,7 +5,7 @@ //! happen in a single transaction. This ensures entries either have valid content_id references //! or remain unlinked if processing fails. -use super::{ctx::IndexingCtx, entry::EntryProcessor, state::EntryKind}; +use super::{ctx::IndexingCtx, db_writer::DBWriter, state::EntryKind}; use crate::domain::content_identity::ContentHashGenerator; use anyhow::Result; use serde::{Deserialize, Serialize}; @@ -149,7 +149,7 @@ impl ContentHashProcessor { let content_hash = ContentHashGenerator::generate_content_hash(&entry.path).await?; debug!("✓ Generated content hash: {}", content_hash); - EntryProcessor::link_to_content_identity( + DBWriter::link_to_content_identity( ctx, entry.id, &entry.path, diff --git a/core/src/ops/indexing/responder.rs b/core/src/ops/indexing/responder.rs index 7638be291..e6fc84346 100644 --- a/core/src/ops/indexing/responder.rs +++ b/core/src/ops/indexing/responder.rs @@ -1,12 +1,12 @@ //! Persistent location responder. //! -//! Thin adapter over `PersistentChangeHandler` that translates raw filesystem +//! Thin adapter over `PersistentWriter` that translates raw filesystem //! events into database mutations. The watcher calls `apply_batch` with events; //! this module delegates to the unified change handling infrastructure. use crate::context::CoreContext; use crate::infra::event::FsRawEventKind; -use crate::ops::indexing::change_detection::{self, ChangeConfig, PersistentChangeHandler}; +use crate::ops::indexing::change_detection::{self, ChangeConfig, PersistentWriter}; use crate::ops::indexing::rules::RuleToggles; use anyhow::Result; use std::path::Path; @@ -15,7 +15,7 @@ use uuid::Uuid; /// Translates a single filesystem event into database mutations. /// -/// Creates a `PersistentChangeHandler` and delegates to the unified change +/// Creates a `PersistentWriter` and delegates to the unified change /// handling infrastructure in `change_detection`. pub async fn apply( context: &Arc, @@ -40,7 +40,7 @@ pub async fn apply( /// Processes multiple filesystem events as a batch. /// -/// Creates a `PersistentChangeHandler` and delegates to the unified +/// Creates a `PersistentWriter` and delegates to the unified /// `change_detection::apply_batch` which handles deduplication, ordering, /// and correct processing sequence (removes, renames, creates, modifies). 
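The doc comment above names a fixed processing order for batched events. A toy sketch of that dedupe-then-order contract (invented `FsEvent` enum; the real `apply_batch` operates on `FsRawEventKind` and is async):

```rust
// Events are deduplicated, then applied as removes, renames, creates, modifies.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
enum FsEvent {
    Remove(String),
    Rename(String, String),
    Create(String),
    Modify(String),
}

fn order_batch(mut events: Vec<FsEvent>) -> Vec<FsEvent> {
    // Dedup first so repeated events for the same path collapse.
    let mut seen = std::collections::HashSet::new();
    events.retain(|e| seen.insert(e.clone()));
    // Stable sort by phase preserves relative order within each phase.
    events.sort_by_key(|e| match e {
        FsEvent::Remove(_) => 0,
        FsEvent::Rename(..) => 1,
        FsEvent::Create(_) => 2,
        FsEvent::Modify(_) => 3,
    });
    events
}

fn main() {
    let batch = vec![
        FsEvent::Modify("a".into()),
        FsEvent::Create("a".into()),
        FsEvent::Remove("b".into()),
        FsEvent::Create("a".into()), // duplicate, dropped
    ];
    println!("{:?}", order_batch(batch));
    // [Remove("b"), Create("a"), Modify("a")]
}
```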
pub async fn apply_batch( @@ -62,7 +62,7 @@ pub async fn apply_batch( location_id ); - let mut handler = PersistentChangeHandler::new( + let mut handler = PersistentWriter::new( context.clone(), library_id, location_id, diff --git a/core/src/ops/indexing/verify/action.rs b/core/src/ops/indexing/verify/action.rs index 752151696..e374f9d85 100644 --- a/core/src/ops/indexing/verify/action.rs +++ b/core/src/ops/indexing/verify/action.rs @@ -9,10 +9,9 @@ use crate::{ db::entities, }, ops::indexing::{ - entry::EntryProcessor, - job::{ - EphemeralIndex, IndexMode, IndexPersistence, IndexScope, IndexerJob, IndexerJobConfig, - }, + db_writer::DBWriter, + ephemeral::EphemeralIndex, + job::{IndexMode, IndexPersistence, IndexScope, IndexerJob, IndexerJobConfig}, path_resolver::PathResolver, state::EntryKind, }, @@ -98,7 +97,7 @@ impl IndexVerifyAction { library: &Arc, context: &Arc, path: &Path, - ) -> Result, ActionError> { + ) -> Result, ActionError> { use tokio::sync::RwLock; tracing::debug!("Running ephemeral indexer job on {}", path.display()); @@ -404,7 +403,7 @@ impl IndexVerifyAction { /// Compare ephemeral index with database entries async fn compare_indexes( &self, - fs_entries: HashMap, + fs_entries: HashMap, mut db_entries: HashMap, root_path: &Path, ) -> Result { From 93c40bfb7c6d73a52e5c1bfaa38a63fa8937d1a5 Mon Sep 17 00:00:00 2001 From: Jamie Pine Date: Mon, 8 Dec 2025 03:38:13 -0800 Subject: [PATCH 17/20] Refactor indexing to remove context abstraction --- .../indexing/change_detection/persistent.rs | 58 ++++++----- core/src/ops/indexing/ctx.rs | 73 -------------- core/src/ops/indexing/db_writer.rs | 97 ++++++++----------- core/src/ops/indexing/mod.rs | 2 - core/src/ops/indexing/phases/content.rs | 73 ++++++-------- core/src/ops/indexing/phases/processing.rs | 5 +- core/src/ops/indexing/processor.rs | 15 +-- 7 files changed, 110 insertions(+), 213 deletions(-) delete mode 100644 core/src/ops/indexing/ctx.rs diff --git a/core/src/ops/indexing/change_detection/persistent.rs b/core/src/ops/indexing/change_detection/persistent.rs index edcdc7f65..ba17b5e3a 100644 --- a/core/src/ops/indexing/change_detection/persistent.rs +++ b/core/src/ops/indexing/change_detection/persistent.rs @@ -209,15 +209,15 @@ impl ChangeHandler for PersistentWriter { use crate::ops::indexing::state::IndexerState; let mut state = IndexerState::new(&SdPath::local(&metadata.path)); - let ctx = - crate::ops::indexing::ctx::ResponderCtx::new(&self.context, self.library_id).await?; + let library = self.context.get_library(self.library_id).await; // Cache Management: Check cache first, then query DB if needed if let Some(&parent_id) = self.entry_id_cache.get(parent_path) { state .entry_id_cache .insert(parent_path.to_path_buf(), parent_id); - } else if let Ok(Some(parent_id)) = DBWriter::resolve_parent_id(&ctx, parent_path).await { + } else if let Ok(Some(parent_id)) = DBWriter::resolve_parent_id(&self.db, parent_path).await + { // Cache the parent ID for future lookups state .entry_id_cache @@ -226,9 +226,10 @@ impl ChangeHandler for PersistentWriter { .insert(parent_path.to_path_buf(), parent_id); } - let entry_id = DBWriter::create_entry(&mut state, &ctx, metadata, 0, parent_path) - .await - .map_err(|e| anyhow::anyhow!("Failed to create entry: {}", e))?; + let entry_id = + DBWriter::create_entry(&mut state, &self.db, library.as_deref(), metadata, 0, parent_path) + .await + .map_err(|e| anyhow::anyhow!("Failed to create entry: {}", e))?; self.entry_id_cache.insert(metadata.path.clone(), entry_id); @@ -248,9 +249,7 @@ 
impl ChangeHandler for PersistentWriter { async fn update(&mut self, entry: &EntryRef, metadata: &DirEntry) -> Result<()> { use crate::ops::indexing::db_writer::DBWriter; - let ctx = - crate::ops::indexing::ctx::ResponderCtx::new(&self.context, self.library_id).await?; - DBWriter::update_entry(&ctx, entry.id, metadata) + DBWriter::update_entry(&self.db, entry.id, metadata) .await .map_err(|e| anyhow::anyhow!("Failed to update entry: {}", e))?; @@ -269,15 +268,14 @@ impl ChangeHandler for PersistentWriter { use crate::ops::indexing::state::IndexerState; let mut state = IndexerState::new(&SdPath::local(old_path)); - let ctx = - crate::ops::indexing::ctx::ResponderCtx::new(&self.context, self.library_id).await?; // Cache Management: Check cache first, then query DB if needed if let Some(&parent_id) = self.entry_id_cache.get(new_parent_path) { state .entry_id_cache .insert(new_parent_path.to_path_buf(), parent_id); - } else if let Ok(Some(parent_id)) = DBWriter::resolve_parent_id(&ctx, new_parent_path).await + } else if let Ok(Some(parent_id)) = + DBWriter::resolve_parent_id(&self.db, new_parent_path).await { state .entry_id_cache @@ -287,7 +285,7 @@ impl ChangeHandler for PersistentWriter { } DBWriter::move_entry( &mut state, - &ctx, + &self.db, entry.id, old_path, new_path, @@ -408,9 +406,6 @@ impl ChangeHandler for PersistentWriter { .await .unwrap_or_default(); - let ctx = - crate::ops::indexing::ctx::ResponderCtx::new(&self.context, self.library_id).await?; - let build_proc_entry = |db: &sea_orm::DatabaseConnection, entry: &EntryRef| -> std::pin::Pin< @@ -468,7 +463,7 @@ impl ChangeHandler for PersistentWriter { { let proc_entry = build_proc_entry(&self.db, entry).await?; let content_proc = ContentHashProcessor::new(self.library_id); - if let Err(e) = content_proc.process(&ctx, &proc_entry).await { + if let Err(e) = content_proc.process(&self.db, &proc_entry).await { tracing::warn!("Content hash processing failed: {}", e); } } @@ -696,7 +691,8 @@ impl<'a> IndexPersistence for PersistentWriterAdapter<'a> { // the parent ID is cached before creating this entry if let Some(parent_path) = entry.path.parent() { if !state.entry_id_cache.contains_key(parent_path) { - if let Ok(Some(parent_id)) = DBWriter::resolve_parent_id(self.ctx, parent_path).await + if let Ok(Some(parent_id)) = + DBWriter::resolve_parent_id(self.ctx.library_db(), parent_path).await { state .entry_id_cache @@ -705,9 +701,15 @@ impl<'a> IndexPersistence for PersistentWriterAdapter<'a> { } } - let entry_id = - DBWriter::create_entry(&mut state, self.ctx, entry, 0, location_root_path) - .await?; + let entry_id = DBWriter::create_entry( + &mut state, + self.ctx.library_db(), + Some(self.ctx.library()), + entry, + 0, + location_root_path, + ) + .await?; Ok(entry_id) } @@ -720,9 +722,15 @@ impl<'a> IndexPersistence for PersistentWriterAdapter<'a> { ) -> JobResult<()> { use crate::ops::indexing::db_writer::DBWriter; - DBWriter::link_to_content_identity(self.ctx, entry_id, path, cas_id, self.library_id) - .await - .map(|_| ()) + DBWriter::link_to_content_identity( + self.ctx.library_db(), + entry_id, + path, + cas_id, + self.library_id, + ) + .await + .map(|_| ()) } async fn get_existing_entries( @@ -815,7 +823,7 @@ impl<'a> IndexPersistence for PersistentWriterAdapter<'a> { async fn update_entry(&self, entry_id: i32, entry: &DirEntry) -> JobResult<()> { use crate::ops::indexing::db_writer::DBWriter; - DBWriter::update_entry(self.ctx, entry_id, entry).await + DBWriter::update_entry(self.ctx.library_db(), entry_id, entry).await } fn 
is_persistent(&self) -> bool { diff --git a/core/src/ops/indexing/ctx.rs b/core/src/ops/indexing/ctx.rs deleted file mode 100644 index 4ff1088a0..000000000 --- a/core/src/ops/indexing/ctx.rs +++ /dev/null @@ -1,73 +0,0 @@ -//! Context abstraction for indexing operations. -//! -//! The `IndexingCtx` trait provides a minimal interface that indexing code paths -//! need to function. This allows the same indexing logic to run both inside the -//! job system (with `JobContext`) and outside of it (watcher responder), avoiding -//! code duplication between job-based and event-driven indexing. - -use sea_orm::DatabaseConnection; -use std::sync::Arc; -use uuid::Uuid; - -use crate::{context::CoreContext, infra::job::prelude::JobContext, library::Library}; - -/// Minimal interface required by indexing operations. -/// -/// This trait abstracts away the difference between job-based indexing and -/// event-driven indexing (file watcher responders). Both execution contexts -/// provide database access and logging, but only the job context has full -/// library access for sync operations. -pub trait IndexingCtx { - fn library_db(&self) -> &DatabaseConnection; - - /// Returns the library reference when running in job context, None otherwise. - /// - /// This is only available for job-based indexing since responder contexts - /// don't have direct library access (they operate through the event bus). - fn library(&self) -> Option<&Library> { - None - } - - fn log(&self, message: impl AsRef) { - tracing::debug!(message = %message.as_ref()); - } -} - -impl<'a> IndexingCtx for JobContext<'a> { - fn library_db(&self) -> &DatabaseConnection { - self.library_db() - } - - fn library(&self) -> Option<&Library> { - Some(self.library()) - } -} - -/// Context for file watcher responders that run outside the job system. -/// -/// Responders handle filesystem events (file created, moved, deleted) by -/// performing incremental indexing updates. They operate independently of -/// the job system and communicate results through the event bus rather than -/// job completion. -pub struct ResponderCtx { - db: DatabaseConnection, -} - -impl ResponderCtx { - pub async fn new(context: &Arc, library_id: Uuid) -> anyhow::Result { - let library: Arc = context - .get_library(library_id) - .await - .ok_or_else(|| anyhow::anyhow!("Library not found: {}", library_id))?; - - Ok(Self { - db: library.db().conn().clone(), - }) - } -} - -impl IndexingCtx for ResponderCtx { - fn library_db(&self) -> &DatabaseConnection { - &self.db - } -} diff --git a/core/src/ops/indexing/db_writer.rs b/core/src/ops/indexing/db_writer.rs index 0b8bdf920..2388a2425 100644 --- a/core/src/ops/indexing/db_writer.rs +++ b/core/src/ops/indexing/db_writer.rs @@ -36,17 +36,18 @@ //! ).await?; //! 
``` -use super::ctx::IndexingCtx; use super::path_resolver::PathResolver; use super::state::{DirEntry, EntryKind, IndexerState}; -use crate::infra::job::prelude::{JobContext, JobError}; +use crate::infra::job::prelude::JobError; +use crate::library::Library; use crate::{ filetype::FileTypeRegistry, infra::db::entities::{self, directory_paths, entry_closure}, }; use sea_orm::{ - ActiveModelTrait, ActiveValue::Set, ColumnTrait, ConnectionTrait, DatabaseTransaction, - DbBackend, EntityTrait, IntoActiveModel, QueryFilter, QuerySelect, Statement, TransactionTrait, + ActiveModelTrait, ActiveValue::Set, ColumnTrait, ConnectionTrait, DatabaseConnection, + DatabaseTransaction, DbBackend, EntityTrait, IntoActiveModel, QueryFilter, QuerySelect, + Statement, TransactionTrait, }; use std::path::{Path, PathBuf}; use uuid::Uuid; @@ -162,7 +163,7 @@ impl DBWriter { /// For cloud paths (containing "://"), tries both with and without trailing slashes /// since cloud backends may store paths inconsistently. pub async fn resolve_parent_id( - ctx: &impl IndexingCtx, + db: &DatabaseConnection, parent_path: &Path, ) -> Result, JobError> { let parent_path_str = parent_path.to_string_lossy().to_string(); @@ -177,7 +178,7 @@ impl DBWriter { let query = entities::directory_paths::Entity::find() .filter(entities::directory_paths::Column::Path.is_in(parent_variants)); - match query.one(ctx.library_db()).await { + match query.one(db).await { Ok(Some(dir_path_record)) => Ok(Some(dir_path_record.entry_id)), Ok(None) => Ok(None), Err(e) => Err(JobError::execution(format!( @@ -267,7 +268,6 @@ impl DBWriter { /// and collect related rows for bulk insertion by the caller. pub async fn create_entry_in_conn( state: &mut IndexerState, - ctx: &impl IndexingCtx, entry: &DirEntry, device_id: i32, location_root_path: &Path, @@ -428,13 +428,13 @@ impl DBWriter { /// Create an entry, starting and committing its own transaction (single insert) pub async fn create_entry( state: &mut IndexerState, - ctx: &impl IndexingCtx, + db: &DatabaseConnection, + library: Option<&Library>, entry: &DirEntry, device_id: i32, location_root_path: &Path, ) -> Result { - let txn = ctx - .library_db() + let txn = db .begin() .await .map_err(|e| JobError::execution(format!("Failed to begin transaction: {}", e)))?; @@ -443,7 +443,6 @@ impl DBWriter { let mut dir_paths: Vec = Vec::new(); let result = Self::create_entry_in_conn( state, - ctx, entry, device_id, location_root_path, @@ -482,18 +481,14 @@ impl DBWriter { .map_err(|e| JobError::execution(format!("Failed to commit transaction: {}", e)))?; // Sync entry to other devices - if let Some(library) = ctx.library() { + if let Some(library) = library { tracing::info!( "ENTRY_SYNC: About to sync entry name={} uuid={:?}", entry_model.name, entry_model.uuid ); if let Err(e) = library - .sync_model_with_db( - &entry_model, - crate::infra::sync::ChangeType::Insert, - ctx.library_db(), - ) + .sync_model_with_db(&entry_model, crate::infra::sync::ChangeType::Insert, db) .await { tracing::warn!( @@ -518,12 +513,12 @@ impl DBWriter { /// Update an existing entry pub async fn update_entry( - ctx: &impl IndexingCtx, + db: &DatabaseConnection, entry_id: i32, entry: &DirEntry, ) -> Result<(), JobError> { let db_entry = entities::entry::Entity::find_by_id(entry_id) - .one(ctx.library_db()) + .one(db) .await .map_err(|e| JobError::execution(format!("Failed to find entry: {}", e)))? 
.ok_or_else(|| JobError::execution("Entry not found for update".to_string()))?; @@ -555,7 +550,7 @@ impl DBWriter { entry_active.indexed_at = Set(Some(chrono::Utc::now())); entry_active - .update(ctx.library_db()) + .update(db) .await .map_err(|e| JobError::execution(format!("Failed to update entry: {}", e)))?; @@ -565,22 +560,20 @@ impl DBWriter { /// Handle entry move operation with closure table updates (creates own transaction) pub async fn move_entry( state: &mut IndexerState, - ctx: &impl IndexingCtx, + db: &DatabaseConnection, entry_id: i32, old_path: &Path, new_path: &Path, location_root_path: &Path, ) -> Result<(), JobError> { // Begin transaction for atomic move operation - let txn = ctx - .library_db() + let txn = db .begin() .await .map_err(|e| JobError::execution(format!("Failed to begin transaction: {}", e)))?; let result = Self::move_entry_in_conn( state, - ctx, entry_id, old_path, new_path, @@ -606,7 +599,6 @@ impl DBWriter { /// Handle entry move operation within existing transaction pub async fn move_entry_in_conn( state: &mut IndexerState, - ctx: &impl IndexingCtx, entry_id: i32, old_path: &Path, new_path: &Path, @@ -753,7 +745,7 @@ impl DBWriter { /// Returns both the content identity and the updated entry for batch sync operations. /// The caller must sync both models if running outside the job system (e.g., watcher). pub async fn link_to_content_identity( - ctx: &impl IndexingCtx, + db: &DatabaseConnection, entry_id: i32, path: &Path, content_hash: String, @@ -761,7 +753,7 @@ impl DBWriter { ) -> Result { let existing = entities::content_identity::Entity::find() .filter(entities::content_identity::Column::ContentHash.eq(&content_hash)) - .one(ctx.library_db()) + .one(db) .await .map_err(|e| JobError::execution(format!("Failed to query content identity: {}", e)))?; @@ -771,7 +763,7 @@ impl DBWriter { existing_active.last_verified_at = Set(chrono::Utc::now()); let updated = existing_active - .update(ctx.library_db()) + .update(db) .await .map_err(|e| { JobError::execution(format!("Failed to update content identity: {}", e)) @@ -807,7 +799,7 @@ impl DBWriter { { let existing = entities::mime_type::Entity::find() .filter(entities::mime_type::Column::MimeType.eq(mime_str)) - .one(ctx.library_db()) + .one(db) .await .map_err(|e| { JobError::execution(format!("Failed to query mime type: {}", e)) @@ -823,13 +815,12 @@ impl DBWriter { ..Default::default() }; - let mime_result = - new_mime.insert(ctx.library_db()).await.map_err(|e| { - JobError::execution(format!( - "Failed to create mime type: {}", - e - )) - })?; + let mime_result = new_mime.insert(db).await.map_err(|e| { + JobError::execution(format!( + "Failed to create mime type: {}", + e + )) + })?; Some(mime_result.id) } @@ -860,13 +851,13 @@ impl DBWriter { // Handle race condition: another job (or device sync) may have created this // content identity between our check and insert. Catch UNIQUE constraint violations // and use the existing record instead of failing. - let result = match new_content.insert(ctx.library_db()).await { + let result = match new_content.insert(db).await { Ok(model) => (model, true), Err(e) => { if e.to_string().contains("UNIQUE constraint failed") { let existing = entities::content_identity::Entity::find() .filter(entities::content_identity::Column::ContentHash.eq(&content_hash)) - .one(ctx.library_db()) + .one(db) .await .map_err(|e| JobError::execution(format!("Failed to find existing content identity: {}", e)))? 
.ok_or_else(|| JobError::execution("Content identity should exist after unique constraint violation".to_string()))?; @@ -876,16 +867,12 @@ impl DBWriter { existing_active.entry_count = Set(existing.entry_count + 1); existing_active.last_verified_at = Set(chrono::Utc::now()); - let updated = - existing_active - .update(ctx.library_db()) - .await - .map_err(|e| { - JobError::execution(format!( - "Failed to update content identity: {}", - e - )) - })?; + let updated = existing_active.update(db).await.map_err(|e| { + JobError::execution(format!( + "Failed to update content identity: {}", + e + )) + })?; (updated, false) } else { @@ -901,7 +888,7 @@ impl DBWriter { }; let entry = entities::entry::Entity::find_by_id(entry_id) - .one(ctx.library_db()) + .one(db) .await .map_err(|e| JobError::execution(format!("Failed to find entry: {}", e)))? .ok_or_else(|| JobError::execution("Entry not found after creation".to_string()))?; @@ -909,7 +896,7 @@ impl DBWriter { let mut entry_active: entities::entry::ActiveModel = entry.into(); entry_active.content_id = Set(Some(content_model.id)); - let updated_entry = entry_active.update(ctx.library_db()).await.map_err(|e| { + let updated_entry = entry_active.update(db).await.map_err(|e| { JobError::execution(format!("Failed to link content identity to entry: {}", e)) })?; @@ -923,7 +910,6 @@ impl DBWriter { /// Simple move entry within existing transaction (no directory path cascade updates) pub async fn simple_move_entry_in_conn( state: &mut IndexerState, - ctx: &impl IndexingCtx, entry_id: i32, old_path: &Path, new_path: &Path, @@ -984,7 +970,6 @@ impl DBWriter { /// Bulk move entries within a single transaction for better performance pub async fn bulk_move_entries( state: &mut IndexerState, - ctx: &impl IndexingCtx, moves: &[(i32, PathBuf, PathBuf, super::state::DirEntry)], _location_root_path: &Path, txn: &DatabaseTransaction, @@ -992,8 +977,7 @@ impl DBWriter { let mut moved_count = 0; for (entry_id, old_path, new_path, _) in moves { - match Self::simple_move_entry_in_conn(state, ctx, *entry_id, old_path, new_path, txn) - .await + match Self::simple_move_entry_in_conn(state, *entry_id, old_path, new_path, txn).await { Ok(()) => { moved_count += 1; @@ -1003,13 +987,13 @@ impl DBWriter { // the entire batch. Parent directory renames succeed even if a child fails // due to file locks, though the child will have a stale path until the next // reindex cleans it up. 
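The comment above explains why `bulk_move_entries` tolerates per-entry failures instead of aborting the batch. A minimal sketch of that keep-going pattern (generic helper, not the real `DBWriter` API):

```rust
/// Apply `op` to each item, counting successes and logging failures
/// instead of aborting the whole batch on the first error.
fn apply_all<T, E: std::fmt::Display>(
    items: &[T],
    mut op: impl FnMut(&T) -> Result<(), E>,
) -> usize {
    let mut succeeded = 0;
    for item in items {
        match op(item) {
            Ok(()) => succeeded += 1,
            // A locked child shouldn't fail the parent rename; the stale
            // path gets repaired on the next reindex.
            Err(e) => eprintln!("skipping failed item: {e}"),
        }
    }
    succeeded
}

fn main() {
    let moves = [1, 2, 3, 4];
    let moved = apply_all(&moves, |n| if n % 2 == 0 { Ok(()) } else { Err("file locked") });
    assert_eq!(moved, 2);
}
```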
- ctx.log(format!( + tracing::debug!( "Failed to move entry {} from {} to {}: {}", entry_id, old_path.display(), new_path.display(), e - )); + ); } } } @@ -1019,7 +1003,6 @@ impl DBWriter { /// Update entry within existing transaction pub async fn update_entry_in_conn( - ctx: &impl IndexingCtx, entry_id: i32, entry: &super::state::DirEntry, txn: &DatabaseTransaction, diff --git a/core/src/ops/indexing/mod.rs b/core/src/ops/indexing/mod.rs index 84a0f0f3f..c1e4a53c2 100644 --- a/core/src/ops/indexing/mod.rs +++ b/core/src/ops/indexing/mod.rs @@ -22,7 +22,6 @@ pub mod action; pub mod change_detection; -pub mod ctx; pub mod db_writer; pub mod ephemeral; pub mod hierarchy; @@ -44,7 +43,6 @@ pub use change_detection::{ apply_batch as apply_change_batch, Change, ChangeConfig, ChangeDetector, ChangeHandler, ChangeType, EntryRef, PersistentWriter, PersistentWriterAdapter, }; -pub use ctx::{IndexingCtx, ResponderCtx}; pub use db_writer::{DBWriter, EntryMetadata}; pub use ephemeral::{EphemeralIndex, EphemeralIndexCache, EphemeralIndexStats, EphemeralWriter}; pub use hierarchy::HierarchyQuery; diff --git a/core/src/ops/indexing/phases/content.rs b/core/src/ops/indexing/phases/content.rs index b0126728c..055f4fa82 100644 --- a/core/src/ops/indexing/phases/content.rs +++ b/core/src/ops/indexing/phases/content.rs @@ -11,7 +11,6 @@ use crate::{ infra::job::generic_progress::ToGenericProgress, infra::job::prelude::{JobContext, JobError, Progress}, ops::indexing::{ - ctx::IndexingCtx, db_writer::DBWriter, processor::{ContentHashProcessor, ProcessorEntry}, state::{EntryKind, IndexError, IndexPhase, IndexerProgress, IndexerState}, @@ -129,7 +128,7 @@ pub async fn run_content_phase( match hash_result { Ok(content_hash) => { match DBWriter::link_to_content_identity( - ctx, + ctx.library_db(), entry_id, &path, content_hash.clone(), @@ -189,33 +188,27 @@ pub async fn run_content_phase( } if !content_identities_to_sync.is_empty() { - match IndexingCtx::library(ctx) { - Some(library) => { - match library - .sync_models_batch( - &content_identities_to_sync, - crate::infra::sync::ChangeType::Insert, - ctx.library_db(), - ) - .await - { - Ok(()) => { - ctx.log(format!( - "Batch synced {} content identities", - content_identities_to_sync.len() - )); - } - Err(e) => { - tracing::warn!( - "Failed to batch sync {} content identities: {}", - content_identities_to_sync.len(), - e - ); - } - } + let library = ctx.library(); + match library + .sync_models_batch( + &content_identities_to_sync, + crate::infra::sync::ChangeType::Insert, + ctx.library_db(), + ) + .await + { + Ok(()) => { + ctx.log(format!( + "Batch synced {} content identities", + content_identities_to_sync.len() + )); } - None => { - ctx.log("Sync disabled - content identities saved locally only"); + Err(e) => { + tracing::warn!( + "Failed to batch sync {} content identities: {}", + content_identities_to_sync.len(), + e + ); } } } @@ -226,16 +219,15 @@ pub async fn run_content_phase( tokio::task::yield_now().await; if !entries_to_sync.is_empty() { - match IndexingCtx::library(ctx) { - Some(library) => { - match library - .sync_models_batch( - &entries_to_sync, - crate::infra::sync::ChangeType::Update, - ctx.library_db(), - ) - .await - { + let library = ctx.library(); + match library + .sync_models_batch( + &entries_to_sync, + crate::infra::sync::ChangeType::Update, + ctx.library_db(), + ) + .await + { Ok(()) => { ctx.log(format!( "Batch synced {} entries with content IDs", @@ -249,11 +241,6 @@ pub async fn run_content_phase( e ); } - } - } - None => { - 
ctx.log("Sync disabled - entries saved locally only"); - } } } diff --git a/core/src/ops/indexing/phases/processing.rs b/core/src/ops/indexing/phases/processing.rs index 5d8176ccd..aace72618 100644 --- a/core/src/ops/indexing/phases/processing.rs +++ b/core/src/ops/indexing/phases/processing.rs @@ -284,7 +284,6 @@ pub async fn run_processing_phase( Some(Change::New(_)) => { match DBWriter::create_entry_in_conn( state, - ctx, &entry, device_id, location_root_path, @@ -332,7 +331,7 @@ pub async fn run_processing_phase( } Some(Change::Modified { entry_id, .. }) => { - match DBWriter::update_entry_in_conn(ctx, entry_id, &entry, &txn).await { + match DBWriter::update_entry_in_conn(entry_id, &entry, &txn).await { Ok(()) => { ctx.log(format!( "Updated entry {}: {}", @@ -368,7 +367,7 @@ pub async fn run_processing_phase( new_path.display() )); match DBWriter::simple_move_entry_in_conn( - state, ctx, entry_id, &old_path, &new_path, &txn, + state, entry_id, &old_path, &new_path, &txn, ) .await { diff --git a/core/src/ops/indexing/processor.rs b/core/src/ops/indexing/processor.rs index 42106bd8f..74eda8cf6 100644 --- a/core/src/ops/indexing/processor.rs +++ b/core/src/ops/indexing/processor.rs @@ -5,9 +5,10 @@ //! happen in a single transaction. This ensures entries either have valid content_id references //! or remain unlinked if processing fails. -use super::{ctx::IndexingCtx, db_writer::DBWriter, state::EntryKind}; +use super::{db_writer::DBWriter, state::EntryKind}; use crate::domain::content_identity::ContentHashGenerator; use anyhow::Result; +use sea_orm::DatabaseConnection; use serde::{Deserialize, Serialize}; use std::path::PathBuf; use tracing::debug; @@ -137,7 +138,7 @@ impl ContentHashProcessor { pub async fn process( &self, - ctx: &impl IndexingCtx, + db: &DatabaseConnection, entry: &ProcessorEntry, ) -> Result { if !matches!(entry.kind, EntryKind::File) || entry.content_id.is_some() { @@ -149,14 +150,8 @@ impl ContentHashProcessor { let content_hash = ContentHashGenerator::generate_content_hash(&entry.path).await?; debug!("✓ Generated content hash: {}", content_hash); - DBWriter::link_to_content_identity( - ctx, - entry.id, - &entry.path, - content_hash, - self.library_id, - ) - .await?; + DBWriter::link_to_content_identity(db, entry.id, &entry.path, content_hash, self.library_id) + .await?; debug!("✓ Linked content identity for entry {}", entry.id); From 2641c335ff355efae33feeb37743d02ddc2dbf8d Mon Sep 17 00:00:00 2001 From: Jamie Pine Date: Mon, 8 Dec 2025 12:32:04 -0800 Subject: [PATCH 18/20] cargo fmt --- .../ops/indexing/change_detection/detector.rs | 3 ++- .../indexing/change_detection/persistent.rs | 14 +++++++--- core/src/ops/indexing/db_writer.rs | 17 ++++-------- core/src/ops/indexing/persistence.rs | 4 +-- core/src/ops/indexing/phases/content.rs | 26 +++++++++---------- core/src/ops/indexing/processor.rs | 10 +++++-- core/src/ops/indexing/progress.rs | 6 +---- core/src/ops/indexing/verify/action.rs | 8 +++--- 8 files changed, 45 insertions(+), 43 deletions(-) diff --git a/core/src/ops/indexing/change_detection/detector.rs b/core/src/ops/indexing/change_detection/detector.rs index aa7edb376..b5a5d8b10 100644 --- a/core/src/ops/indexing/change_detection/detector.rs +++ b/core/src/ops/indexing/change_detection/detector.rs @@ -76,7 +76,8 @@ impl ChangeDetector { .ok_or_else(|| JobError::execution("Location not found".to_string()))?; // Create a persistent writer adapter to leverage the unified query logic - let persistence = PersistentWriterAdapter::new(ctx, location_record.uuid, 
location_record.entry_id); + let persistence = + PersistentWriterAdapter::new(ctx, location_record.uuid, location_record.entry_id); // Use the scoped query method let existing_entries = persistence.get_existing_entries(indexing_path).await?; diff --git a/core/src/ops/indexing/change_detection/persistent.rs b/core/src/ops/indexing/change_detection/persistent.rs index ba17b5e3a..c381907f5 100644 --- a/core/src/ops/indexing/change_detection/persistent.rs +++ b/core/src/ops/indexing/change_detection/persistent.rs @@ -226,10 +226,16 @@ impl ChangeHandler for PersistentWriter { .insert(parent_path.to_path_buf(), parent_id); } - let entry_id = - DBWriter::create_entry(&mut state, &self.db, library.as_deref(), metadata, 0, parent_path) - .await - .map_err(|e| anyhow::anyhow!("Failed to create entry: {}", e))?; + let entry_id = DBWriter::create_entry( + &mut state, + &self.db, + library.as_deref(), + metadata, + 0, + parent_path, + ) + .await + .map_err(|e| anyhow::anyhow!("Failed to create entry: {}", e))?; self.entry_id_cache.insert(metadata.path.clone(), entry_id); diff --git a/core/src/ops/indexing/db_writer.rs b/core/src/ops/indexing/db_writer.rs index 2388a2425..12cb37809 100644 --- a/core/src/ops/indexing/db_writer.rs +++ b/core/src/ops/indexing/db_writer.rs @@ -762,12 +762,9 @@ impl DBWriter { existing_active.entry_count = Set(existing_active.entry_count.unwrap() + 1); existing_active.last_verified_at = Set(chrono::Utc::now()); - let updated = existing_active - .update(db) - .await - .map_err(|e| { - JobError::execution(format!("Failed to update content identity: {}", e)) - })?; + let updated = existing_active.update(db).await.map_err(|e| { + JobError::execution(format!("Failed to update content identity: {}", e)) + })?; (updated, false) } else { @@ -868,10 +865,7 @@ impl DBWriter { existing_active.last_verified_at = Set(chrono::Utc::now()); let updated = existing_active.update(db).await.map_err(|e| { - JobError::execution(format!( - "Failed to update content identity: {}", - e - )) + JobError::execution(format!("Failed to update content identity: {}", e)) })?; (updated, false) @@ -977,8 +971,7 @@ impl DBWriter { let mut moved_count = 0; for (entry_id, old_path, new_path, _) in moves { - match Self::simple_move_entry_in_conn(state, *entry_id, old_path, new_path, txn).await - { + match Self::simple_move_entry_in_conn(state, *entry_id, old_path, new_path, txn).await { Ok(()) => { moved_count += 1; } diff --git a/core/src/ops/indexing/persistence.rs b/core/src/ops/indexing/persistence.rs index ac1903759..c0e56b0fe 100644 --- a/core/src/ops/indexing/persistence.rs +++ b/core/src/ops/indexing/persistence.rs @@ -102,8 +102,8 @@ impl PersistenceFactory { ) -> Box { use super::ephemeral::EphemeralWriter; - let event_bus = - event_bus.unwrap_or_else(|| std::sync::Arc::new(crate::infra::event::EventBus::new(1024))); + let event_bus = event_bus + .unwrap_or_else(|| std::sync::Arc::new(crate::infra::event::EventBus::new(1024))); Box::new(EphemeralWriter::new(index, event_bus, root_path)) } diff --git a/core/src/ops/indexing/phases/content.rs b/core/src/ops/indexing/phases/content.rs index 055f4fa82..da1de2ceb 100644 --- a/core/src/ops/indexing/phases/content.rs +++ b/core/src/ops/indexing/phases/content.rs @@ -228,19 +228,19 @@ pub async fn run_content_phase( ) .await { - Ok(()) => { - ctx.log(format!( - "Batch synced {} entries with content IDs", - entries_to_sync.len() - )); - } - Err(e) => { - tracing::warn!( - "Failed to batch sync {} entries: {}", - entries_to_sync.len(), - e - ); - } + Ok(()) => 
{ + ctx.log(format!( + "Batch synced {} entries with content IDs", + entries_to_sync.len() + )); + } + Err(e) => { + tracing::warn!( + "Failed to batch sync {} entries: {}", + entries_to_sync.len(), + e + ); + } } } diff --git a/core/src/ops/indexing/processor.rs b/core/src/ops/indexing/processor.rs index 74eda8cf6..d50c58912 100644 --- a/core/src/ops/indexing/processor.rs +++ b/core/src/ops/indexing/processor.rs @@ -150,8 +150,14 @@ impl ContentHashProcessor { let content_hash = ContentHashGenerator::generate_content_hash(&entry.path).await?; debug!("✓ Generated content hash: {}", content_hash); - DBWriter::link_to_content_identity(db, entry.id, &entry.path, content_hash, self.library_id) - .await?; + DBWriter::link_to_content_identity( + db, + entry.id, + &entry.path, + content_hash, + self.library_id, + ) + .await?; debug!("✓ Linked content identity for entry {}", entry.id); diff --git a/core/src/ops/indexing/progress.rs b/core/src/ops/indexing/progress.rs index 7240e0c49..088c166e4 100644 --- a/core/src/ops/indexing/progress.rs +++ b/core/src/ops/indexing/progress.rs @@ -101,11 +101,7 @@ impl ToGenericProgress for IndexerProgress { let mut progress = GenericProgress::new(percentage, &phase_name, &phase_message) .with_bytes(self.total_found.bytes, self.total_found.bytes) - .with_performance( - self.processing_rate, - self.estimated_remaining, - None, - ) + .with_performance(self.processing_rate, self.estimated_remaining, None) .with_errors(self.total_found.errors, 0) .with_metadata(self); diff --git a/core/src/ops/indexing/verify/action.rs b/core/src/ops/indexing/verify/action.rs index e374f9d85..876dc72a5 100644 --- a/core/src/ops/indexing/verify/action.rs +++ b/core/src/ops/indexing/verify/action.rs @@ -103,10 +103,10 @@ impl IndexVerifyAction { tracing::debug!("Running ephemeral indexer job on {}", path.display()); // Create ephemeral index storage that we'll share with the job - let ephemeral_index = Arc::new(RwLock::new( - EphemeralIndex::new() - .map_err(|e| ActionError::from(std::io::Error::new(std::io::ErrorKind::Other, e)))?, - )); + let ephemeral_index = + Arc::new(RwLock::new(EphemeralIndex::new().map_err(|e| { + ActionError::from(std::io::Error::new(std::io::ErrorKind::Other, e)) + })?)); // Subscribe to job events before dispatching let mut event_subscriber = context.events.subscribe(); From 3e49f1de107fbe8d1c67fae8eded3b25eeaa212a Mon Sep 17 00:00:00 2001 From: Jamie Pine Date: Mon, 8 Dec 2025 16:45:39 -0800 Subject: [PATCH 19/20] comments --- .../ops/indexing/change_detection/handler.rs | 8 - core/src/ops/indexing/db_writer.rs | 17 - core/src/ops/indexing/ephemeral/cache.rs | 3 - core/src/ops/indexing/ephemeral/writer.rs | 23 +- core/src/ops/indexing/job.rs | 35 +- docs/core/indexing.mdx | 357 ++++++++++-------- 6 files changed, 212 insertions(+), 231 deletions(-) diff --git a/core/src/ops/indexing/change_detection/handler.rs b/core/src/ops/indexing/change_detection/handler.rs index ab6dc614f..23c1411fc 100644 --- a/core/src/ops/indexing/change_detection/handler.rs +++ b/core/src/ops/indexing/change_detection/handler.rs @@ -54,10 +54,6 @@ pub trait ChangeHandler: Send + Sync { async fn handle_new_directory(&self, path: &Path) -> Result<()>; } -// ============================================================================ -// Shared Logic - Used by both handlers -// ============================================================================ - /// Check if a path exists, distinguishing between "doesn't exist" and "can't access". 
/// /// Critical for preventing false deletions when volumes go offline. @@ -191,10 +187,6 @@ pub async fn build_dir_entry( }) } -// ============================================================================ -// Generic Change Application -// ============================================================================ - /// Apply a batch of filesystem changes using the provided handler. /// /// Processes events in the correct order: removes first, then renames, diff --git a/core/src/ops/indexing/db_writer.rs b/core/src/ops/indexing/db_writer.rs index 12cb37809..83d481af8 100644 --- a/core/src/ops/indexing/db_writer.rs +++ b/core/src/ops/indexing/db_writer.rs @@ -566,7 +566,6 @@ impl DBWriter { new_path: &Path, location_root_path: &Path, ) -> Result<(), JobError> { - // Begin transaction for atomic move operation let txn = db .begin() .await @@ -605,7 +604,6 @@ impl DBWriter { location_root_path: &Path, txn: &DatabaseTransaction, ) -> Result<(), JobError> { - // Get the entry let db_entry = entities::entry::Entity::find_by_id(entry_id) .one(txn) .await @@ -918,27 +916,22 @@ impl DBWriter { let mut entry_active: entities::entry::ActiveModel = db_entry.into(); - // Find new parent entry ID let new_parent_id = if let Some(parent_path) = new_path.parent() { state.entry_id_cache.get(parent_path).copied() } else { None }; - // Update entry fields entry_active.parent_id = Set(new_parent_id); - // Extract new name and extension for files match new_path.extension() { Some(ext) => { - // File with extension if let Some(stem) = new_path.file_stem() { entry_active.name = Set(stem.to_string_lossy().to_string()); entry_active.extension = Set(Some(ext.to_string_lossy().to_lowercase())); } } None => { - // File without extension or directory if let Some(name) = new_path.file_name() { entry_active.name = Set(name.to_string_lossy().to_string()); entry_active.extension = Set(None); @@ -946,13 +939,11 @@ impl DBWriter { } } - // Save the updated entry entry_active .update(txn) .await .map_err(|e| JobError::execution(format!("Failed to update entry: {}", e)))?; - // Update cache state.entry_id_cache.remove(old_path); state .entry_id_cache @@ -1030,10 +1021,6 @@ impl DBWriter { Ok(()) } - // ======================================================================== - // Subtree Deletion - // ======================================================================== - /// Deletes an entry and all its descendants from the database. 
/// /// This is a raw database operation that does NOT: @@ -1070,7 +1057,6 @@ impl DBWriter { { use sea_orm::{ColumnTrait, EntityTrait, QueryFilter}; - // Collect all descendants via closure table let mut to_delete_ids: Vec = vec![entry_id]; if let Ok(rows) = entities::entry_closure::Entity::find() .filter(entities::entry_closure::Column::AncestorId.eq(entry_id)) @@ -1083,7 +1069,6 @@ impl DBWriter { to_delete_ids.dedup(); if !to_delete_ids.is_empty() { - // Delete closure links (both directions) let _ = entities::entry_closure::Entity::delete_many() .filter(entities::entry_closure::Column::DescendantId.is_in(to_delete_ids.clone())) .exec(db) @@ -1093,13 +1078,11 @@ impl DBWriter { .exec(db) .await; - // Delete directory paths let _ = entities::directory_paths::Entity::delete_many() .filter(entities::directory_paths::Column::EntryId.is_in(to_delete_ids.clone())) .exec(db) .await; - // Delete entries let _ = entities::entry::Entity::delete_many() .filter(entities::entry::Column::Id.is_in(to_delete_ids)) .exec(db) diff --git a/core/src/ops/indexing/ephemeral/cache.rs b/core/src/ops/indexing/ephemeral/cache.rs index 77a7aaae9..2de7997b1 100644 --- a/core/src/ops/indexing/ephemeral/cache.rs +++ b/core/src/ops/indexing/ephemeral/cache.rs @@ -106,7 +106,6 @@ impl EphemeralIndexCache { let mut index = self.index.write().await; let (cleared, deleted_browsed_dirs) = index.clear_directory_children(path, &indexed); - // Remove deleted browsed directories from indexed_paths if !deleted_browsed_dirs.is_empty() { let mut indexed_paths = self.indexed_paths.write(); for deleted_path in deleted_browsed_dirs { @@ -197,7 +196,6 @@ impl EphemeralIndexCache { pub fn find_watched_root(&self, path: &Path) -> Option { let watched = self.watched_paths.read(); - // Find the longest matching watched path that is an ancestor of `path` let mut best_match: Option<&PathBuf> = None; let mut best_len = 0; @@ -269,7 +267,6 @@ impl EphemeralIndexCache { /// Legacy: Insert (no-op, entries are added directly to global index) #[deprecated(note = "Entries should be added directly to the global index")] pub fn insert(&self, path: PathBuf, _index: Arc>) { - // Mark the path as indexed let mut indexed = self.indexed_paths.write(); indexed.insert(path); } diff --git a/core/src/ops/indexing/ephemeral/writer.rs b/core/src/ops/indexing/ephemeral/writer.rs index e3ff10d98..ebc65ca91 100644 --- a/core/src/ops/indexing/ephemeral/writer.rs +++ b/core/src/ops/indexing/ephemeral/writer.rs @@ -53,14 +53,11 @@ impl EphemeralWriter { } } - /// Generate the next entry ID. fn next_id(&self) -> i32 { self.next_id.fetch_add(1, Ordering::SeqCst) } - /// Add an entry to the index and emit a ResourceChanged event. - /// - /// This is the core write operation used by both pipelines. + /// Core write operation shared by both watcher and indexer pipelines. async fn add_entry_internal( &self, path: &Path, @@ -78,7 +75,6 @@ impl EphemeralWriter { Ok((entry_id, content_kind)) } - /// Emit a ResourceChanged event for UI updates. async fn emit_resource_changed( &self, uuid: Uuid, @@ -142,7 +138,7 @@ impl ChangeHandler for EphemeralWriter { } async fn find_by_inode(&self, _inode: u64) -> Result> { - // Ephemeral index doesn't track inodes for move detection + // Inode tracking is skipped to minimize memory overhead; fall back to path-only detection. 
Ok(None) } @@ -154,7 +150,6 @@ impl ChangeHandler for EphemeralWriter { .add_entry_internal(&metadata.path, entry_uuid, entry_metadata.clone()) .await?; - // Emit event if entry was actually added (not a duplicate) if let Some(content_kind) = content_kind { self.emit_resource_changed(entry_uuid, &metadata.path, &entry_metadata, content_kind) .await; @@ -216,7 +211,7 @@ impl ChangeHandler for EphemeralWriter { } async fn run_processors(&self, _entry: &EntryRef, _is_new: bool) -> Result<()> { - // Ephemeral indexing skips processor pipeline (no thumbnails/content hash) + // File processors (thumbnails, content hash) are disabled to ensure responsive, low-overhead browsing. Ok(()) } @@ -295,10 +290,6 @@ impl ChangeHandler for EphemeralWriter { } } -// ============================================================================ -// IndexPersistence Implementation (Job Pipeline) -// ============================================================================ - #[async_trait::async_trait] impl IndexPersistence for EphemeralWriter { async fn store_entry( @@ -336,7 +327,6 @@ impl IndexPersistence for EphemeralWriter { (self.next_id(), content_kind) }; - // Emit event if entry was actually added (not a duplicate) if let Some(content_kind) = content_kind { self.emit_resource_changed(entry_uuid, &entry.path, &metadata, content_kind) .await; @@ -351,7 +341,6 @@ impl IndexPersistence for EphemeralWriter { _path: &Path, _cas_id: String, ) -> JobResult<()> { - // Ephemeral indexing doesn't track content identities Ok(()) } @@ -359,12 +348,10 @@ impl IndexPersistence for EphemeralWriter { &self, _indexing_path: &Path, ) -> JobResult, Option, u64)>> { - // Ephemeral indexing doesn't support incremental indexing Ok(HashMap::new()) } async fn update_entry(&self, _entry_id: i32, _entry: &DirEntry) -> JobResult<()> { - // Updates are handled via add_entry (overwrites existing) Ok(()) } @@ -393,7 +380,6 @@ mod tests { let mut writer = EphemeralWriter::new(index.clone(), event_bus, temp_dir.path().to_path_buf()); - // Test create let dir_entry = DirEntry { path: test_file.clone(), kind: EntryKind::File, @@ -411,7 +397,6 @@ mod tests { assert_eq!(entry_ref.path, test_file); assert_eq!(entry_ref.kind, EntryKind::File); - // Verify entry exists let found = writer .find_by_path(&test_file) .await @@ -448,7 +433,6 @@ mod tests { assert!(entry_id > 0); assert!(!writer.is_persistent()); - // Verify index was updated let idx = index.read().await; assert!(idx.has_entry(&test_file)); } @@ -481,7 +465,6 @@ mod tests { .await .expect("store_entry should succeed"); - // Try to receive the event let event = tokio::time::timeout(tokio::time::Duration::from_millis(100), subscriber.recv()).await; diff --git a/core/src/ops/indexing/job.rs b/core/src/ops/indexing/job.rs index 197f7d8f1..02c46b8ab 100644 --- a/core/src/ops/indexing/job.rs +++ b/core/src/ops/indexing/job.rs @@ -224,29 +224,18 @@ impl IndexerJob { self.state = Some(IndexerState::new(&self.config.path)); } else { ctx.log("Resuming indexer from saved state"); - let state = self.state.as_ref().unwrap(); info!("INDEXER_STATE: Job resuming with saved state - phase: {:?}, entry_batches: {}, entries_for_content: {}, seen_paths: {}", - state.phase, - state.entry_batches.len(), - state.entries_for_content.len(), - state.seen_paths.len()); - warn!( - "DEBUG: Resumed state - phase: {:?}, entry_batches: {}, entries_for_content: {}", - state.phase, - state.entry_batches.len(), - state.entries_for_content.len() - ); + self.state.as_ref().unwrap().phase, + 
self.state.as_ref().unwrap().entry_batches.len(), + self.state.as_ref().unwrap().entries_for_content.len(), + self.state.as_ref().unwrap().seen_paths.len()); } let state = self.state.as_mut().unwrap(); - // For cloud volumes, we use the path component from the SdPath (e.g., "/" or "folder/") - // since discovery operates through the volume backend (not direct filesystem access). let root_path_buf = if let Some(p) = self.config.path.as_local_path() { p.to_path_buf() } else if let Some(cloud_path) = self.config.path.cloud_path() { - // Cloud path - use the path component within the cloud volume - // The actual I/O will go through the volume backend PathBuf::from(cloud_path) } else if !self.config.is_ephemeral() { let loc_uuid = self @@ -326,7 +315,6 @@ impl IndexerJob { ctx.check_interrupt().await?; let current_phase = state.phase.clone(); - warn!("DEBUG: IndexerJob entering phase: {:?}", current_phase); match current_phase { Phase::Discovery => { let cloud_url_base = @@ -359,7 +347,6 @@ impl IndexerJob { } Phase::Processing => { - warn!("DEBUG: IndexerJob starting Processing phase"); if self.config.is_ephemeral() { let ephemeral_index = self.ephemeral_index.clone().ok_or_else(|| { JobError::execution("Ephemeral index not initialized".to_string()) @@ -435,11 +422,6 @@ impl IndexerJob { Phase::Complete => break, } - - warn!( - "DEBUG: IndexerJob completed phase: {:?}, next phase will be: {:?}", - current_phase, state.phase - ); } let final_progress = IndexerProgress { @@ -545,12 +527,7 @@ impl JobHandler for IndexerJob { } async fn on_resume(&mut self, ctx: &JobContext<'_>) -> JobResult { - warn!("DEBUG: IndexerJob on_resume called"); if let Some(state) = &self.state { - warn!( - "DEBUG: IndexerJob has state, resuming in {:?} phase", - state.phase - ); ctx.log(format!("Resuming indexer in {:?} phase", state.phase)); ctx.log(format!( "Progress: {} files, {} dirs, {} errors so far", @@ -559,7 +536,6 @@ impl JobHandler for IndexerJob { self.timer = Some(PhaseTimer::new()); } else { - warn!("DEBUG: IndexerJob has no state during resume - creating new state!"); self.state = Some(IndexerState::new(&self.config.path)); } Ok(()) @@ -641,8 +617,6 @@ impl IndexerJob { use super::state::{DirEntry, EntryKind}; use tokio::fs; - ctx.log("Starting current scope discovery (single level)"); - let mut entries = fs::read_dir(root_path) .await .map_err(|e| JobError::execution(format!("Failed to read directory: {}", e)))?; @@ -677,7 +651,6 @@ impl IndexerJob { state.pending_entries.push(dir_entry); state.items_since_last_update += 1; - // Update stats match entry_kind { EntryKind::File => state.stats.files += 1, EntryKind::Directory => state.stats.dirs += 1, diff --git a/docs/core/indexing.mdx b/docs/core/indexing.mdx index f61b18cdb..2b27aea95 100644 --- a/docs/core/indexing.mdx +++ b/docs/core/indexing.mdx @@ -3,19 +3,25 @@ title: Indexing sidebarTitle: Indexing --- -The indexing system discovers and analyzes your files through a sophisticated multi-phase process. Built on Spacedrive's job system, it provides resumable operations, real-time progress tracking, and supports both persistent library indexing and ephemeral browsing of external drives. +The indexing system discovers and analyzes your files through a multi-phase pipeline. Built on Spacedrive's job system, it provides resumable operations, real-time progress tracking, and supports both persistent library indexing and ephemeral browsing of external drives. 
## Architecture Overview -The indexing system consists of several key components working together: +The indexing system consists of specialized components working together: **IndexerJob** orchestrates the entire indexing process as a resumable job. It maintains state across application restarts and provides detailed progress reporting. -**IndexerState** preserves all necessary information to resume indexing from any interruption point. This includes the current phase, directories to process, and accumulated statistics. +**IndexerState** preserves all necessary information to resume indexing from any interruption point. This includes the current phase, directories to process, accumulated statistics, and ephemeral UUID mappings for preserving user metadata across browsing-to-persistent transitions. -**EntryProcessor** handles the complex task of creating and updating database records while maintaining referential integrity through materialized paths. +**DBWriter** provides the low-level database CRUD layer. All database operations (create, update, move, delete) flow through this module for consistency. -**FileTypeRegistry** identifies files through a combination of extensions, magic bytes, and content analysis to provide accurate type detection. +**PersistentWriter** implements both `ChangeHandler` (for filesystem watcher events) and `IndexPersistence` (for indexer job batches). Both pipelines use the same code to write entries to the database via `DBWriter`. + +**EphemeralWriter** implements both `ChangeHandler` (for filesystem watcher events) and `IndexPersistence` (for indexer job batches). Both pipelines use the same code to write entries to the in-memory `EphemeralIndex`. + +This dual-implementation architecture unifies watcher and job pipelines, eliminating code duplication between real-time filesystem monitoring and batch indexing operations. + +**FileTypeRegistry** identifies files through extensions, magic bytes, and content analysis. The system integrates deeply with Spacedrive's job infrastructure, which provides automatic state persistence through MessagePack serialization. When you pause an indexing operation, the entire job state is saved to a dedicated jobs database, allowing seamless resumption even after application restarts. @@ -24,63 +30,153 @@ The system integrates deeply with Spacedrive's job infrastructure, which provide architecture ensures no work is lost if interrupted. +## Database Architecture + +The indexing system uses a closure table for hierarchy management instead of recursive queries: + +### Closure Table + +Parent-child relationships are stored in the `entry_closure` table with precomputed ancestor-descendant pairs. This makes "find all descendants" queries O(1) regardless of nesting depth, at the cost of additional storage (worst-case N² for deeply nested trees). + +```sql +CREATE TABLE entry_closure ( + ancestor_id INTEGER, + descendant_id INTEGER, + depth INTEGER +); +``` + +The closure table stores all transitive relationships. For a file at `/home/user/docs/report.pdf`, entries exist for: +- (home_id, report_id, depth=3) +- (user_id, report_id, depth=2) +- (docs_id, report_id, depth=1) +- (report_id, report_id, depth=0) + +Move operations require rebuilding closures for the entire moved subtree, which can affect thousands of rows when moving large directories. 
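+
+As a sketch of the lookup this design enables (assuming the `entry_closure` schema above and the `entry` table described below; `?1` binds the directory's entry ID), fetching an entire subtree is a single indexed scan with no recursion:
+
+```sql
+-- All descendants of a directory, at any depth, in one query.
+-- depth > 0 excludes the directory's own (id, id, 0) self-link.
+SELECT e.*
+FROM entry_closure c
+JOIN entry e ON e.id = c.descendant_id
+WHERE c.ancestor_id = ?1
+  AND c.depth > 0;
+```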
+ +### Directory Paths Cache + +The `directory_paths` table provides O(1) absolute path lookups for directories: + +```sql +CREATE TABLE directory_paths ( + entry_id INTEGER PRIMARY KEY, + path TEXT UNIQUE +); +``` + +This eliminates recursive parent traversal when building file paths. Each directory stores its complete absolute path, enabling instant resolution for child entries. + +### Entries Table + +```sql +CREATE TABLE entry ( + id INTEGER PRIMARY KEY, + uuid UUID UNIQUE, + parent_id INTEGER, + name TEXT, + extension TEXT, + kind INTEGER, + size BIGINT, + inode BIGINT, + content_id INTEGER, + aggregate_size BIGINT, + child_count INTEGER, + file_count INTEGER +); +``` + ## Indexing Phases -The indexer operates through four distinct phases, each designed to be interruptible and resumable: +The indexer operates through five distinct phases, each designed to be interruptible and resumable: ### Phase 1: Discovery -The discovery phase walks your filesystem to build a list of all files and directories. This phase is optimized for speed, collecting just enough information to plan the work ahead: +Discovery walks the filesystem using parallel workers with a work-stealing model. On systems with 8+ cores, multiple threads scan directories concurrently, communicating via channels to maximize disk throughput. -```rust -// Discovery maintains a queue of directories to process -pub struct DiscoveryPhase { - dirs_to_walk: VecDeque, - seen_paths: HashSet, // Cycle detection -} -``` +Discovered entries are filtered through `IndexerRuler`, which applies toggleable system rules (like `NO_HIDDEN`, `NO_DEV_DIRS`) and dynamically loaded `.gitignore` patterns when inside a Git repository. -The phase uses a breadth-first traversal to ensure shallow directories are processed first, providing quicker initial results. Progress is measured by directories discovered versus total estimated. +Progress is measured by directories discovered. Entries are collected into batches of 1,000 items before moving to processing. ### Phase 2: Processing -Processing creates or updates database entries for each discovered item. This is where Spacedrive builds its understanding of your file structure: +Processing converts discovered entries into database records. Entries are sorted by depth (parents before children) to maintain referential integrity during batch insertion. -```rust -// Batch processing for efficiency -const BATCH_SIZE: usize = 1000; +**Change Detection** runs during this phase. The `ChangeDetector` loads existing database entries for the indexing path, then compares against filesystem state to identify: -// Process entries in parent-first order -let sorted_batch = batch.sort_by_depth(); -persistence.process_batch(sorted_batch, &mut entry_cache)?; -``` +- **New**: Paths not in database +- **Modified**: Size or mtime differs +- **Moved**: Same inode at different path +- **Deleted**: In database but missing from filesystem -The system uses materialized paths instead of parent IDs, making queries faster and eliminating complex recursive lookups. Each entry stores its full path prefix, enabling instant directory listings. +Changes are processed in batch transactions. Each batch inserts closure table rows, updates the directory paths cache, and syncs entries across devices. + +**Ephemeral UUID Preservation** happens here. When a browsed folder is promoted to a managed location, UUIDs assigned during ephemeral indexing are preserved (`state.ephemeral_uuids`). 
This prevents orphaning user metadata like tags and notes attached during browsing sessions. + +The processing phase validates that the indexing path stays within location boundaries, preventing catastrophic cross-location deletion if watcher routing bugs send events for the wrong path. ### Phase 3: Aggregation -Aggregation calculates sizes and counts for directories by traversing the tree bottom-up. This phase provides the statistics you see in the UI: +Aggregation walks the entry tree bottom-up, computing directory statistics: -- Total size including subdirectories -- Direct child count -- Recursive file count -- Aggregate content types +- `aggregate_size`: Total bytes including subdirectories +- `child_count`: Direct children only +- `file_count`: Recursive file count + +These aggregates are stored in the entry table and enable instant directory size display without traversing descendants. ### Phase 4: Content Identification -The final phase generates content-addressed storage (CAS) identifiers and performs deep file analysis: +Content identification generates BLAKE3 hashes for files, linking entries to `content_identity` records for deduplication. + +Content identities use deterministic v5 UUIDs (namespace hash of `content_hash + library_id`) so different devices can independently identify identical files and merge metadata without coordination. This enables offline duplicate detection across library peers. + +**Sync Order**: Content identities must be synced before entries to avoid foreign key violations on receiving devices. The job system enforces this ordering. + +For new content, file type identification runs via `FileTypeRegistry` to populate `kind_id` and `mime_type_id` fields. + +### Phase 5: Finalizing + +Finalizing handles post-processing tasks like directory aggregation updates and potential processor dispatch (thumbnail generation for Deep Mode). + +## Change Detection System + +The indexing system includes both batch and real-time change detection: + +### Batch Change Detection + +`ChangeDetector` compares database state against filesystem during indexer job scans: ```rust -// Sampled hashing for large files -let cas_id = cas_generator - .generate_cas_id(path, file_size) - .await?; +let mut detector = ChangeDetector::new(); +detector.load_existing_entries(ctx, location_id, indexing_path).await?; -// Link to content identity for deduplication -content_processor.link_or_create(entry_id, cas_id)?; +for entry in discovered_entries { + if let Some(change) = detector.check_path(&path, &metadata, inode) { + // Process New, Modified, or Moved change + } +} + +let deleted = detector.find_deleted(&seen_paths); ``` -This phase enables deduplication, content-based search, and file tracking across renames. +The detector tracks paths by inode to identify moves. On Unix systems, inodes provide stable file identity across renames. Windows falls back to path-only matching since file indices are unstable across reboots. 
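+
+To make the move heuristic concrete, here is a minimal, hypothetical sketch of the inode-keyed comparison (simplified types and names; the real `ChangeDetector` also consults mtime and handles deletions separately via `find_deleted`):
+
+```rust
+use std::collections::HashMap;
+use std::path::{Path, PathBuf};
+
+/// Illustrative stand-in for one row of the loaded database snapshot.
+struct Known {
+    size: u64,
+}
+
+enum Change {
+    New,
+    Modified,
+    Moved { from: PathBuf },
+}
+
+/// Classify one discovered path against a snapshot keyed by path and by inode.
+fn classify(
+    by_path: &HashMap<PathBuf, Known>,
+    by_inode: &HashMap<u64, PathBuf>,
+    path: &Path,
+    size: u64,
+    inode: u64,
+) -> Option<Change> {
+    match by_path.get(path) {
+        // Same path, same size: treat as unchanged (the real check also compares mtime).
+        Some(known) if known.size == size => None,
+        // Same path, different size: modified in place.
+        Some(_) => Some(Change::Modified),
+        // Unknown path: a known inode elsewhere means a move, not a delete + create.
+        None => match by_inode.get(&inode) {
+            Some(old) => Some(Change::Moved { from: old.clone() }),
+            None => Some(Change::New),
+        },
+    }
+}
+```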
+ +### Real-Time Change Detection + +Both `PersistentWriter` and `EphemeralWriter` implement the `ChangeHandler` trait, which defines the interface for responding to filesystem watcher events: + +```rust +pub trait ChangeHandler { + async fn find_by_path(&self, path: &Path) -> Result>; + async fn create(&mut self, metadata: &DirEntry, parent_path: &Path) -> Result; + async fn update(&mut self, entry: &EntryRef, metadata: &DirEntry) -> Result<()>; + async fn move_entry(&mut self, entry: &EntryRef, old_path: &Path, new_path: &Path) -> Result<()>; + async fn delete(&mut self, entry: &EntryRef) -> Result<()>; +} +``` + +The watcher routes events to the appropriate handler based on whether the path belongs to a persistent location (`PersistentWriter` → database) or ephemeral session (`EphemeralWriter` → memory). ## Indexing Modes and Scopes @@ -88,29 +184,21 @@ The system provides flexible configuration through modes and scopes: ### Index Modes -**Shallow Mode** extracts only filesystem metadata (name, size, dates). Completes in under 500ms for typical directories. Perfect for responsive UI navigation. +**Shallow Mode** extracts only filesystem metadata (name, size, dates). Completes in under 500ms for typical directories. -**Content Mode** adds cryptographic hashing to identify files by content. Enables deduplication and content tracking. Moderate performance impact. +**Content Mode** adds BLAKE3 hashing to identify files by content. Enables deduplication and content tracking. -**Deep Mode** performs full analysis including thumbnails and media metadata extraction. Best for photo and video libraries. +**Deep Mode** performs full analysis including file type identification and metadata extraction. Triggers thumbnail generation for images and videos. ### Index Scopes -**Current Scope** indexes only the immediate directory contents: +**Current Scope** indexes only immediate directory contents. Used for responsive UI navigation. -```rust -IndexerJobConfig::ui_navigation(location_id, path) -``` - -**Recursive Scope** indexes the entire directory tree: - -```rust -IndexerJobConfig::new(location_id, path, IndexMode::Deep) -``` +**Recursive Scope** indexes the entire directory tree. Used for full location indexing. ## Persistence and Ephemeral Indexing -One of Spacedrive's key innovations is supporting both persistent and ephemeral indexing modes. +Spacedrive supports both persistent and ephemeral indexing modes: ### Persistent Indexing @@ -123,27 +211,63 @@ Persistent indexing stores all data in the database permanently. This is the def ### Ephemeral Indexing -Ephemeral indexing keeps data in memory only, perfect for browsing external drives: +Ephemeral indexing keeps data in memory only, perfect for browsing external drives without permanent storage. -```rust -let config = IndexerJobConfig::ephemeral_browse( - usb_path, - IndexScope::Current -); -``` +The ephemeral system uses highly memory-optimized structures: -The ephemeral index uses an LRU cache with automatic cleanup: +**NodeArena**: Slab allocator for `FileNode` entries with pointer-sized entry IDs. Provides contiguous memory layout for cache efficiency. -- No database writes -- Session-based lifetime -- Memory-efficient storage -- Automatic expiration +**NameCache**: Global string interning pool. One copy of "index.js" serves thousands of node_modules files. + +**NameRegistry**: BTreeMap for fast name-based lookups without full-text indexing overhead. + +Memory usage is around 50 bytes per entry vs 200+ bytes with naive `HashMap` approach. 
This 4-6x reduction enables browsing hundreds of thousands of files without database overhead. + +Multiple directory trees can coexist in the same `EphemeralIndex` (browsing both `/mnt/nas` and `/media/usb` simultaneously), sharing the string interning pool for maximum deduplication. + +The `EphemeralIndexCache` tracks which paths have been indexed, are currently being indexed, or are registered for filesystem watching. When a watched path receives filesystem events, `EphemeralWriter` updates the in-memory index in real-time. Ephemeral mode lets you explore USB drives or network shares without permanently adding them to your library. +## Indexer Rules + +The `IndexerRuler` applies filtering rules during discovery to skip unwanted files: + +**System Rules** are toggleable patterns like: +- `NO_HIDDEN`: Skip dotfiles (`.git`, `.DS_Store`) +- `NO_DEV_DIRS`: Skip `node_modules`, `target`, `dist` +- `NO_SYSTEM`: Skip OS folders (`System32`, `Windows`) + +**Git Integration**: When indexing inside a Git repository, rules are dynamically loaded from `.gitignore` files. This automatically excludes build artifacts and local configuration. + +Rules return a `RulerDecision` (Accept/Reject) for each path during discovery, preventing unwanted entries from ever reaching the processing phase. + +## Index Integrity Verification + +The `IndexVerifyAction` checks integrity by running a fresh ephemeral scan and comparing metadata against the existing persistent index: + +```rust +let verify = IndexVerifyAction::from_input(IndexVerifyInput { path }).await?; +let output = verify.execute(library, context).await?; + +// output.report contains: +// - missing_from_index: Files on disk but not in database +// - stale_in_index: Entries in database but missing from filesystem +// - metadata_mismatches: Size, mtime, or inode differences +``` + +The verification system detects: +- **MissingFromIndex**: Files created outside Spacedrive +- **StaleInIndex**: Deleted files not yet purged from database +- **SizeMismatch**: Files modified externally +- **ModifiedTimeMismatch**: Timestamp drift (with 1-second tolerance) +- **InodeMismatch**: File replacement or filesystem corruption + +Verification runs as a library action and returns a detailed `IntegrityReport` with per-file diagnostics. + ## Job System Integration The indexing system leverages Spacedrive's job infrastructure for reliability and monitoring. @@ -159,8 +283,8 @@ pub struct IndexerState { dirs_to_walk: VecDeque, entry_batches: Vec>, entry_id_cache: HashMap, + ephemeral_uuids: HashMap, stats: IndexerStats, - // ... checkpoint data } ``` @@ -172,25 +296,17 @@ Real-time progress flows through multiple channels: ```rust pub struct IndexerProgress { - phase: String, - items_done: u64, - total_items: u64, - bytes_per_second: f64, - eta_seconds: Option, + pub phase: IndexPhase, + pub total_found: IndexerStats, + pub processing_rate: f32, + pub estimated_remaining: Option, } ``` -Progress updates are: - -- Sent to UI via channels -- Persisted to database -- Available through job queries -- Used for time estimates +Progress updates are sent to the UI via channels, persisted to the database, and available through job queries for time estimates. 
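+
+As an illustration of how those time estimates can be derived (a hypothetical helper, not the core's exact code; it reads `processing_rate` as items per second, matching the `IndexerProgress` sketch above):
+
+```rust
+use std::time::Duration;
+
+/// Rough ETA from the remaining item count and the observed throughput.
+/// Returns None before a meaningful rate exists, early in a scan.
+fn estimate_remaining(items_remaining: u64, processing_rate: f32) -> Option<Duration> {
+    if processing_rate <= f32::EPSILON {
+        return None; // avoid division by zero while the rate is still warming up
+    }
+    Some(Duration::from_secs_f32(items_remaining as f32 / processing_rate))
+}
+```
+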
### Error Handling -The job system provides structured error handling: - **Non-critical errors** are accumulated but don't stop indexing: - Permission denied on individual files @@ -203,47 +319,6 @@ The job system provides structured error handling: - Filesystem unmounted - Out of disk space -## Database Schema - -The indexer populates several key tables designed for query performance. - -### Entries Table - -The core table uses materialized paths for efficient queries: - -```sql -CREATE TABLE entries ( - id INTEGER PRIMARY KEY, - uuid UUID UNIQUE, - location_id INTEGER, - relative_path TEXT, -- Parent path (materialized) - name TEXT, -- Without extension - extension TEXT, - kind INTEGER, -- 0=File, 1=Directory - size BIGINT, - inode BIGINT, -- Change detection - content_id INTEGER -); - --- Key indexes for performance -CREATE INDEX idx_entries_location_path - ON entries(location_id, relative_path); -``` - -### Content Identities Table - -Enables deduplication across your library: - -```sql -CREATE TABLE content_identities ( - id INTEGER PRIMARY KEY, - cas_id TEXT UNIQUE, - kind_id INTEGER, - total_size BIGINT, - entry_count INTEGER -); -``` - ## Performance Characteristics Indexing performance varies by mode and scope: @@ -259,32 +334,12 @@ Indexing performance varies by mode and scope: **Batch Processing**: Groups operations into transactions of 1,000 items, reducing database overhead by 30x. -**Parallel I/O**: Content identification runs on multiple threads, saturating disk bandwidth on fast storage. +**Parallel Discovery**: Work-stealing model with atomic counters for directory traversal, using half of available CPU cores by default. -**Smart Caching**: The entry ID cache eliminates redundant parent lookups, critical for deep directory trees. +**Entry ID Cache**: Eliminates redundant parent lookups during hierarchy construction, critical for deep directory trees. **Checkpoint Strategy**: Checkpoints occur every 5,000 items or 30 seconds, balancing durability with performance. -## Change Detection - -The indexer efficiently detects changes without full rescans: - -```rust -// Platform-specific change detection -#[cfg(unix)] -let file_id = metadata.ino(); // inode - -#[cfg(windows)] -let file_id = get_file_index(path)?; // File index -``` - -Detection capabilities: - -- New files: Appear with unknown inodes -- Modified files: Same inode, different size/mtime -- Moved files: Known inode at new path -- Deleted files: Missing from filesystem walk - ## Usage Examples ### Quick UI Navigation @@ -310,7 +365,7 @@ let job = IndexerJob::new(config); ### Full Library Location -Comprehensive indexing with all features: +Full indexing with content identification: ```rust let config = IndexerJobConfig::new( @@ -318,8 +373,6 @@ let config = IndexerJobConfig::new( path, IndexMode::Deep ); -config.with_checkpointing(true) - .with_filters(indexer_rules); ``` ## CLI Commands @@ -342,9 +395,9 @@ spacedrive job monitor # Watch progress ### Common Issues -**Slow Indexing**: Check for large node_modules or build directories. Use `.spacedriveignore` files to exclude them. +**Slow Indexing**: Check for large `node_modules` or build directories. System rules automatically skip common patterns, or use `.gitignore` to exclude project-specific artifacts. -**High Memory Usage**: Reduce batch size or avoid ephemeral mode for very large directories. +**High Memory Usage**: Reduce batch size for directories over 1M files. Ephemeral mode uses around 50 bytes per entry, so 100K files requires roughly 5MB. 
**Resume Not Working**: Ensure the jobs database isn't corrupted. Check logs for serialization errors. @@ -364,7 +417,7 @@ spacedrive job info --detailed ## Platform Notes -**Windows**: Uses file indices for change detection. Supports long paths transparently. Network drives may require polling. +**Windows**: Uses file indices for change detection where available, falling back to path-only matching. Supports long paths transparently. Network drives may require polling. **macOS**: Leverages FSEvents and native inodes. Integrates with Time Machine exclusions. APFS provides efficient cloning. @@ -372,15 +425,15 @@ spacedrive job info --detailed ## Best Practices -1. **Start shallow** for new locations to verify configuration -2. **Use filters** to exclude build artifacts and caches -3. **Monitor progress** through the job system instead of polling -4. **Schedule deep scans** during low-usage periods -5. **Enable checkpointing** for locations over 100K files +1. **Start shallow** for new locations to verify configuration before deep scans +2. **Use Git repositories** to automatically inherit `.gitignore` exclusions +3. **Monitor progress** through the job system instead of polling the database +4. **Schedule deep scans** during low-usage periods for large photo/video libraries +5. **Enable checkpointing** for locations over 100K files to survive interruptions Always let indexing jobs complete or pause them properly. Force-killing can - corrupt the job state. + corrupt the job state and require reindexing from scratch. ## Related Documentation From 57209a808529f67c9b0cc64f450ee0f9edc7ca82 Mon Sep 17 00:00:00 2001 From: Jamie Pine Date: Mon, 8 Dec 2025 17:13:52 -0800 Subject: [PATCH 20/20] Rename DB writer to DatabaseStorage - Rename indexing backend: DBWriter to DatabaseStorage - Replace EphemeralWriter with MemoryAdapter across watcher and ephemeral components - Update module paths and imports in core indexing code, job, and persistence layers to use DatabaseStorage and MemoryAdapter - Update docs to reflect new names - (DatabaseStorage, MemoryAdapter) --- core/src/domain/file.rs | 2 +- core/src/infra/db/entities/entry.rs | 2 +- core/src/infra/db/entities/location.rs | 2 +- core/src/location/manager.rs | 2 +- .../ops/indexing/change_detection/detector.rs | 4 +- .../ops/indexing/change_detection/handler.rs | 4 +- core/src/ops/indexing/change_detection/mod.rs | 8 ++-- .../indexing/change_detection/persistent.rs | 48 +++++++++---------- .../{db_writer.rs => database_storage.rs} | 22 ++++----- core/src/ops/indexing/ephemeral/cache.rs | 4 +- core/src/ops/indexing/ephemeral/index.rs | 2 +- core/src/ops/indexing/ephemeral/mod.rs | 2 +- core/src/ops/indexing/ephemeral/responder.rs | 6 +-- core/src/ops/indexing/ephemeral/writer.rs | 24 +++++----- core/src/ops/indexing/job.rs | 4 +- core/src/ops/indexing/mod.rs | 8 ++-- core/src/ops/indexing/persistence.rs | 24 +++++----- core/src/ops/indexing/phases/content.rs | 4 +- core/src/ops/indexing/phases/discovery.rs | 2 +- core/src/ops/indexing/phases/processing.rs | 8 ++-- core/src/ops/indexing/processor.rs | 4 +- core/src/ops/indexing/responder.rs | 11 +++-- core/src/ops/indexing/verify/action.rs | 6 +-- docs/core/indexing.mdx | 10 ++-- 24 files changed, 107 insertions(+), 106 deletions(-) rename core/src/ops/indexing/{db_writer.rs => database_storage.rs} (98%) diff --git a/core/src/domain/file.rs b/core/src/domain/file.rs index 6b640acc8..fcfb68894 100644 --- a/core/src/domain/file.rs +++ b/core/src/domain/file.rs @@ -425,7 +425,7 @@ impl File { /// This is 
used for ephemeral indexing where files are discovered but not persisted to the database. pub fn from_ephemeral( id: Uuid, - metadata: &crate::ops::indexing::db_writer::EntryMetadata, + metadata: &crate::ops::indexing::database_storage::EntryMetadata, sd_path: SdPath, ) -> Self { let is_local = sd_path.is_local(); diff --git a/core/src/infra/db/entities/entry.rs b/core/src/infra/db/entities/entry.rs index 9461a458e..caa050d11 100644 --- a/core/src/infra/db/entities/entry.rs +++ b/core/src/infra/db/entities/entry.rs @@ -336,7 +336,7 @@ impl crate::infra::sync::Syncable for Model { // Use delete_subtree_internal to cascade delete entire subtree // This avoids creating tombstones (we're applying a tombstone) - crate::ops::indexing::DBWriter::delete_subtree(entry.id, db).await?; + crate::ops::indexing::DatabaseStorage::delete_subtree(entry.id, db).await?; Ok(()) } diff --git a/core/src/infra/db/entities/location.rs b/core/src/infra/db/entities/location.rs index dea005c93..ac59e2051 100644 --- a/core/src/infra/db/entities/location.rs +++ b/core/src/infra/db/entities/location.rs @@ -330,7 +330,7 @@ impl Syncable for Model { // Delete root entry tree first if it exists // Use delete_subtree_internal to avoid creating tombstones (we're applying a tombstone) if let Some(entry_id) = location.entry_id { - crate::ops::indexing::DBWriter::delete_subtree(entry_id, db).await?; + crate::ops::indexing::DatabaseStorage::delete_subtree(entry_id, db).await?; } // Delete location record diff --git a/core/src/location/manager.rs b/core/src/location/manager.rs index bcfcb82b7..0272db671 100644 --- a/core/src/location/manager.rs +++ b/core/src/location/manager.rs @@ -501,7 +501,7 @@ impl LocationManager { // Delete the root entry tree first if it exists // Use delete_subtree_internal to avoid creating entry tombstones (we'll tombstone the location instead) if let Some(entry_id) = location.entry_id { - crate::ops::indexing::DBWriter::delete_subtree(entry_id, library.db().conn()) + crate::ops::indexing::DatabaseStorage::delete_subtree(entry_id, library.db().conn()) .await .map_err(|e| LocationError::Other(format!("Failed to delete entry tree: {}", e)))?; } diff --git a/core/src/ops/indexing/change_detection/detector.rs b/core/src/ops/indexing/change_detection/detector.rs index b5a5d8b10..2f40439a2 100644 --- a/core/src/ops/indexing/change_detection/detector.rs +++ b/core/src/ops/indexing/change_detection/detector.rs @@ -65,7 +65,7 @@ impl ChangeDetector { ) -> Result<(), crate::infra::job::prelude::JobError> { use crate::infra::db::entities; use crate::infra::job::prelude::JobError; - use crate::ops::indexing::change_detection::PersistentWriterAdapter; + use crate::ops::indexing::change_detection::DatabaseAdapterForJob; use crate::ops::indexing::persistence::IndexPersistence; use sea_orm::{ColumnTrait, EntityTrait, QueryFilter}; @@ -77,7 +77,7 @@ impl ChangeDetector { // Create a persistent writer adapter to leverage the unified query logic let persistence = - PersistentWriterAdapter::new(ctx, location_record.uuid, location_record.entry_id); + DatabaseAdapterForJob::new(ctx, location_record.uuid, location_record.entry_id); // Use the scoped query method let existing_entries = persistence.get_existing_entries(indexing_path).await?; diff --git a/core/src/ops/indexing/change_detection/handler.rs b/core/src/ops/indexing/change_detection/handler.rs index 23c1411fc..87db2dd52 100644 --- a/core/src/ops/indexing/change_detection/handler.rs +++ b/core/src/ops/indexing/change_detection/handler.rs @@ -175,9 +175,9 @@ pub 
async fn build_dir_entry( path: &Path, backend: Option<&Arc>, ) -> Result { - use crate::ops::indexing::db_writer::DBWriter; + use crate::ops::indexing::database_storage::DatabaseStorage; - let meta = DBWriter::extract_metadata(path, backend).await?; + let meta = DatabaseStorage::extract_metadata(path, backend).await?; Ok(DirEntry { path: meta.path, kind: meta.kind, diff --git a/core/src/ops/indexing/change_detection/mod.rs b/core/src/ops/indexing/change_detection/mod.rs index caaad6874..fa74e2231 100644 --- a/core/src/ops/indexing/change_detection/mod.rs +++ b/core/src/ops/indexing/change_detection/mod.rs @@ -6,8 +6,8 @@ //! move detection, so a file moved while the indexer is running behaves //! identically to one moved while the watcher is active. //! -//! Changes route to either `PersistentWriter` (database writes for -//! managed locations) or `EphemeralWriter` (in-memory updates for browsing +//! Changes route to either `DatabaseAdapter` (database writes for +//! managed locations) or `MemoryAdapter` (in-memory updates for browsing //! sessions). This split keeps browsed directories responsive without //! polluting the database with temporary entries. @@ -18,8 +18,8 @@ pub mod types; pub use detector::ChangeDetector; pub use handler::{ - apply_batch, build_dir_entry, handle_create, handle_modify, handle_remove, handle_rename, + apply_batch, build_dir_entry, handle_create, handle_modify, handle_rename, handle_remove, path_exists_safe, should_filter_path, ChangeHandler, }; -pub use persistent::{PersistentWriter, PersistentWriterAdapter}; +pub use persistent::{DatabaseAdapter, DatabaseAdapterForJob}; pub use types::{Change, ChangeConfig, ChangeMetadata, ChangeType, EntryRef}; diff --git a/core/src/ops/indexing/change_detection/persistent.rs b/core/src/ops/indexing/change_detection/persistent.rs index c381907f5..aa24b5de0 100644 --- a/core/src/ops/indexing/change_detection/persistent.rs +++ b/core/src/ops/indexing/change_detection/persistent.rs @@ -1,8 +1,8 @@ -//! Unified persistent (database-backed) writer for both watcher and indexer pipelines. +//! Unified database adapter for both watcher and indexer pipelines. //! -//! This module provides `PersistentWriter`, which implements both `ChangeHandler` +//! This module provides `DatabaseAdapter`, which implements both `ChangeHandler` //! (for the watcher pipeline) and `IndexPersistence` (for the indexer job). -//! Both pipelines share the same database write logic through `DBWriter`, +//! Both pipelines share the same database write logic through `DatabaseStorage`, //! eliminating code duplication. use super::handler::ChangeHandler; @@ -27,7 +27,7 @@ use uuid::Uuid; /// - Closure table management /// - Directory path tracking /// - Entry ID caching for hierarchy construction -pub struct PersistentWriter { +pub struct DatabaseAdapter { context: Arc, library_id: Uuid, location_id: Uuid, @@ -37,7 +37,7 @@ pub struct PersistentWriter { entry_id_cache: HashMap, } -impl PersistentWriter { +impl DatabaseAdapter { pub async fn new( context: Arc, library_id: Uuid, @@ -144,7 +144,7 @@ impl PersistentWriter { } #[async_trait::async_trait] -impl ChangeHandler for PersistentWriter { +impl ChangeHandler for DatabaseAdapter { async fn find_by_path(&self, path: &Path) -> Result> { let entry_id = match self.resolve_entry_id(path).await? 
{ Some(id) => id, @@ -205,7 +205,7 @@ impl ChangeHandler for PersistentWriter { async fn create(&mut self, metadata: &DirEntry, parent_path: &Path) -> Result { use crate::domain::addressing::SdPath; - use crate::ops::indexing::db_writer::DBWriter; + use crate::ops::indexing::database_storage::DatabaseStorage; use crate::ops::indexing::state::IndexerState; let mut state = IndexerState::new(&SdPath::local(&metadata.path)); @@ -216,7 +216,7 @@ impl ChangeHandler for PersistentWriter { state .entry_id_cache .insert(parent_path.to_path_buf(), parent_id); - } else if let Ok(Some(parent_id)) = DBWriter::resolve_parent_id(&self.db, parent_path).await + } else if let Ok(Some(parent_id)) = DatabaseStorage::resolve_parent_id(&self.db, parent_path).await { // Cache the parent ID for future lookups state @@ -226,7 +226,7 @@ impl ChangeHandler for PersistentWriter { .insert(parent_path.to_path_buf(), parent_id); } - let entry_id = DBWriter::create_entry( + let entry_id = DatabaseStorage::create_entry( &mut state, &self.db, library.as_deref(), @@ -253,9 +253,9 @@ impl ChangeHandler for PersistentWriter { } async fn update(&mut self, entry: &EntryRef, metadata: &DirEntry) -> Result<()> { - use crate::ops::indexing::db_writer::DBWriter; + use crate::ops::indexing::database_storage::DatabaseStorage; - DBWriter::update_entry(&self.db, entry.id, metadata) + DatabaseStorage::update_entry(&self.db, entry.id, metadata) .await .map_err(|e| anyhow::anyhow!("Failed to update entry: {}", e))?; @@ -270,7 +270,7 @@ impl ChangeHandler for PersistentWriter { new_parent_path: &Path, ) -> Result<()> { use crate::domain::addressing::SdPath; - use crate::ops::indexing::db_writer::DBWriter; + use crate::ops::indexing::database_storage::DatabaseStorage; use crate::ops::indexing::state::IndexerState; let mut state = IndexerState::new(&SdPath::local(old_path)); @@ -281,7 +281,7 @@ impl ChangeHandler for PersistentWriter { .entry_id_cache .insert(new_parent_path.to_path_buf(), parent_id); } else if let Ok(Some(parent_id)) = - DBWriter::resolve_parent_id(&self.db, new_parent_path).await + DatabaseStorage::resolve_parent_id(&self.db, new_parent_path).await { state .entry_id_cache @@ -289,7 +289,7 @@ impl ChangeHandler for PersistentWriter { self.entry_id_cache .insert(new_parent_path.to_path_buf(), parent_id); } - DBWriter::move_entry( + DatabaseStorage::move_entry( &mut state, &self.db, entry.id, @@ -658,13 +658,13 @@ impl ChangeHandler for PersistentWriter { /// The job system expects an `IndexPersistence` trait, but works with `JobContext` /// instead of `CoreContext`. This adapter wraps `PersistentWriter` and delegates /// storage operations to `DBWriter`, ensuring both pipelines use identical logic. 
-pub struct PersistentWriterAdapter<'a> { +pub struct DatabaseAdapterForJob<'a> { ctx: &'a JobContext<'a>, library_id: Uuid, location_root_entry_id: Option, } -impl<'a> PersistentWriterAdapter<'a> { +impl<'a> DatabaseAdapterForJob<'a> { pub fn new( ctx: &'a JobContext<'a>, library_id: Uuid, @@ -679,7 +679,7 @@ impl<'a> PersistentWriterAdapter<'a> { } #[async_trait::async_trait] -impl<'a> IndexPersistence for PersistentWriterAdapter<'a> { +impl<'a> IndexPersistence for DatabaseAdapterForJob<'a> { async fn store_entry( &self, entry: &DirEntry, @@ -687,7 +687,7 @@ impl<'a> IndexPersistence for PersistentWriterAdapter<'a> { location_root_path: &Path, ) -> JobResult { use crate::domain::addressing::SdPath; - use crate::ops::indexing::db_writer::DBWriter; + use crate::ops::indexing::database_storage::DatabaseStorage; use crate::ops::indexing::state::IndexerState; let mut state = IndexerState::new(&SdPath::local(&entry.path)); @@ -698,7 +698,7 @@ impl<'a> IndexPersistence for PersistentWriterAdapter<'a> { if let Some(parent_path) = entry.path.parent() { if !state.entry_id_cache.contains_key(parent_path) { if let Ok(Some(parent_id)) = - DBWriter::resolve_parent_id(self.ctx.library_db(), parent_path).await + DatabaseStorage::resolve_parent_id(self.ctx.library_db(), parent_path).await { state .entry_id_cache @@ -707,7 +707,7 @@ impl<'a> IndexPersistence for PersistentWriterAdapter<'a> { } } - let entry_id = DBWriter::create_entry( + let entry_id = DatabaseStorage::create_entry( &mut state, self.ctx.library_db(), Some(self.ctx.library()), @@ -726,9 +726,9 @@ impl<'a> IndexPersistence for PersistentWriterAdapter<'a> { path: &Path, cas_id: String, ) -> JobResult<()> { - use crate::ops::indexing::db_writer::DBWriter; + use crate::ops::indexing::database_storage::DatabaseStorage; - DBWriter::link_to_content_identity( + DatabaseStorage::link_to_content_identity( self.ctx.library_db(), entry_id, path, @@ -827,9 +827,9 @@ impl<'a> IndexPersistence for PersistentWriterAdapter<'a> { } async fn update_entry(&self, entry_id: i32, entry: &DirEntry) -> JobResult<()> { - use crate::ops::indexing::db_writer::DBWriter; + use crate::ops::indexing::database_storage::DatabaseStorage; - DBWriter::update_entry(self.ctx.library_db(), entry_id, entry).await + DatabaseStorage::update_entry(self.ctx.library_db(), entry_id, entry).await } fn is_persistent(&self) -> bool { diff --git a/core/src/ops/indexing/db_writer.rs b/core/src/ops/indexing/database_storage.rs similarity index 98% rename from core/src/ops/indexing/db_writer.rs rename to core/src/ops/indexing/database_storage.rs index 83d481af8..cc3d3fb24 100644 --- a/core/src/ops/indexing/db_writer.rs +++ b/core/src/ops/indexing/database_storage.rs @@ -1,6 +1,6 @@ -//! # Core Database Writer for Indexing +//! # Core Database Storage for Indexing //! -//! `core::ops::indexing::db_writer` provides the foundational database operations layer +//! `core::ops::indexing::database_storage` provides the foundational database operations layer //! for the indexing system. All database writes (creates, updates, moves, deletes) flow //! through this module, ensuring consistency across both watcher and job pipelines. //! @@ -24,10 +24,10 @@ //! //! ## Example //! ```rust,no_run -//! use spacedrive_core::ops::indexing::{DBWriter, state::DirEntry}; +//! use spacedrive_core::ops::indexing::{DatabaseStorage, state::DirEntry}; //! //! let entry = DirEntry { /* ... */ }; -//! let entry_id = DBWriter::create_entry( +//! let entry_id = DatabaseStorage::create_entry( //! &mut state, //! &ctx, //! 
&entry, @@ -114,12 +114,12 @@ impl From for EntryMetadata { /// Core database operations for the indexing system. /// -/// DBWriter provides the foundational layer for all database writes during indexing. -/// Both the watcher pipeline (`PersistentWriter`) and job pipeline (`PersistentWriterAdapter`) -/// delegate to these methods, ensuring consistent database operations. All methods come in -/// both standalone (creates own transaction) and `_in_conn` variants (uses existing transaction) +/// DatabaseStorage provides the foundational layer for all database writes during indexing. +/// Both the watcher pipeline (`DatabaseAdapter`) and job pipeline use these methods, +/// ensuring consistent database operations. All methods come in both standalone +/// (creates own transaction) and `_in_conn` variants (uses existing transaction) /// for flexible batch operations. -pub struct DBWriter; +pub struct DatabaseStorage; /// Result of linking an entry to its content identity. /// @@ -133,7 +133,7 @@ pub struct ContentLinkResult { pub is_new_content: bool, } -impl DBWriter { +impl DatabaseStorage { /// Get platform-specific inode #[cfg(unix)] pub fn get_inode(metadata: &std::fs::Metadata) -> Option { @@ -1034,7 +1034,7 @@ impl DBWriter { /// - Database cleanup operations /// /// For watcher-triggered deletions that need sync/events, use - /// `PersistentWriter::delete()` instead. + /// `DatabaseAdapter::delete()` instead. pub async fn delete_subtree( entry_id: i32, db: &sea_orm::DatabaseConnection, diff --git a/core/src/ops/indexing/ephemeral/cache.rs b/core/src/ops/indexing/ephemeral/cache.rs index 2de7997b1..f0b3d58af 100644 --- a/core/src/ops/indexing/ephemeral/cache.rs +++ b/core/src/ops/indexing/ephemeral/cache.rs @@ -4,7 +4,7 @@ //! directories share one arena and string pool, keeping memory at ~50 bytes per //! entry regardless of how many paths the user navigates. The cache tracks which //! paths are indexed (queryable), in-progress (being scanned), or watched -//! (receiving live filesystem updates via `EphemeralWriter`). +//! (receiving live filesystem updates via `MemoryAdapter`). use super::EphemeralIndex; use parking_lot::RwLock; @@ -159,7 +159,7 @@ impl EphemeralIndexCache { /// Register a path for filesystem watching. /// /// When registered, the watcher service will monitor this path for changes - /// and update the ephemeral index via `EphemeralWriter`. The path + /// and update the ephemeral index via `MemoryAdapter`. The path /// must already be indexed. 
 	pub fn register_for_watching(&self, path: PathBuf) -> bool {
 		let indexed = self.indexed_paths.read();
diff --git a/core/src/ops/indexing/ephemeral/index.rs b/core/src/ops/indexing/ephemeral/index.rs
index 40dc68c21..528d4e64b 100644
--- a/core/src/ops/indexing/ephemeral/index.rs
+++ b/core/src/ops/indexing/ephemeral/index.rs
@@ -17,7 +17,7 @@
 use crate::domain::ContentKind;
 use crate::filetype::FileTypeRegistry;
-use crate::ops::indexing::db_writer::EntryMetadata;
+use crate::ops::indexing::database_storage::EntryMetadata;
 use crate::ops::indexing::state::{EntryKind, IndexerStats};
 
 use super::types::{FileNode, FileType, MaybeEntryId, NameRef, NodeState, PackedMetadata};
diff --git a/core/src/ops/indexing/ephemeral/mod.rs b/core/src/ops/indexing/ephemeral/mod.rs
index da8c7576a..e2ec21e94 100644
--- a/core/src/ops/indexing/ephemeral/mod.rs
+++ b/core/src/ops/indexing/ephemeral/mod.rs
@@ -53,4 +53,4 @@ pub use index::{EphemeralIndex, EphemeralIndexStats};
 pub use name::NameCache;
 pub use registry::NameRegistry;
 pub use types::{EntryId, FileNode, FileType, MaybeEntryId, NameRef, NodeState, PackedMetadata};
-pub use writer::EphemeralWriter;
+pub use writer::MemoryAdapter;
diff --git a/core/src/ops/indexing/ephemeral/responder.rs b/core/src/ops/indexing/ephemeral/responder.rs
index 33f1104a4..504dc6078 100644
--- a/core/src/ops/indexing/ephemeral/responder.rs
+++ b/core/src/ops/indexing/ephemeral/responder.rs
@@ -23,7 +23,7 @@ use anyhow::Result;
 use std::path::{Path, PathBuf};
 use std::sync::Arc;
 
-use super::EphemeralWriter;
+use super::MemoryAdapter;
 
 /// Check if a path falls under an ephemeral watched directory.
 ///
@@ -54,7 +54,7 @@ pub fn find_ephemeral_root_for_events(
 /// Process a batch of filesystem events against the ephemeral index.
 ///
-/// Creates an `EphemeralWriter` and processes the events using shared
+/// Creates a `MemoryAdapter` and processes the events using shared
 /// handler logic. The ephemeral index is updated in-place and ResourceChanged
 /// events are emitted for UI updates.
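 ///
 /// A hedged sketch of the setup this function performs, condensed from the
 /// body below (event routing and error handling elided):
 ///
 /// ```rust,ignore
 /// let index = context.ephemeral_cache().get_global_index();
 /// let event_bus = context.events.clone();
 /// let mut writer = MemoryAdapter::new(index, event_bus, root_path.to_path_buf());
 /// // `writer` then applies each event through the shared ChangeHandler logic.
 /// ```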
 pub async fn apply_batch(
@@ -70,7 +70,7 @@
 	let index = context.ephemeral_cache().get_global_index();
 	let event_bus = context.events.clone();
 
-	let mut writer = EphemeralWriter::new(index, event_bus, root_path.to_path_buf());
+	let mut writer = MemoryAdapter::new(index, event_bus, root_path.to_path_buf());
 
 	let config = ChangeConfig {
 		rule_toggles,
diff --git a/core/src/ops/indexing/ephemeral/writer.rs b/core/src/ops/indexing/ephemeral/writer.rs
index ebc65ca91..58e9fe785 100644
--- a/core/src/ops/indexing/ephemeral/writer.rs
+++ b/core/src/ops/indexing/ephemeral/writer.rs
@@ -9,7 +9,7 @@ use crate::infra::event::EventBus;
 use crate::infra::job::prelude::{JobError, JobResult};
 use crate::ops::indexing::change_detection::handler::{build_dir_entry, ChangeHandler};
 use crate::ops::indexing::change_detection::types::{ChangeType, EntryRef};
-use crate::ops::indexing::db_writer::EntryMetadata;
+use crate::ops::indexing::database_storage::EntryMetadata;
 use crate::ops::indexing::persistence::IndexPersistence;
 use crate::ops::indexing::state::{DirEntry, EntryKind};
 
@@ -32,14 +32,14 @@ use uuid::Uuid;
 /// - UUID generation and tracking
 /// - Event emission for UI updates
 /// - Entry ID generation
-pub struct EphemeralWriter {
+pub struct MemoryAdapter {
 	index: Arc<RwLock<EphemeralIndex>>,
 	event_bus: Arc<EventBus>,
 	root_path: PathBuf,
 	next_id: AtomicI32,
 }
 
-impl EphemeralWriter {
+impl MemoryAdapter {
 	pub fn new(
 		index: Arc<RwLock<EphemeralIndex>>,
 		event_bus: Arc<EventBus>,
@@ -119,7 +119,7 @@
 }
 
 #[async_trait::async_trait]
-impl ChangeHandler for EphemeralWriter {
+impl ChangeHandler for MemoryAdapter {
 	async fn find_by_path(&self, path: &Path) -> Result<Option<EntryRef>> {
 		let index = self.index.read().await;
 
@@ -237,7 +237,7 @@
 	}
 
 	async fn handle_new_directory(&self, path: &Path) -> Result<()> {
-		use crate::ops::indexing::db_writer::DBWriter;
+		use crate::ops::indexing::database_storage::DatabaseStorage;
 
 		let mut entries = match tokio::fs::read_dir(path).await {
 			Ok(e) => e,
@@ -272,7 +272,7 @@
 			modified: metadata.modified().ok(),
 			accessed: metadata.accessed().ok(),
 			created: metadata.created().ok(),
-			inode: DBWriter::get_inode(&metadata),
+			inode: DatabaseStorage::get_inode(&metadata),
 			permissions: None,
 			is_hidden: entry_path
 				.file_name()
@@ -291,16 +291,16 @@
 }
 
 #[async_trait::async_trait]
-impl IndexPersistence for EphemeralWriter {
+impl IndexPersistence for MemoryAdapter {
 	async fn store_entry(
 		&self,
 		entry: &DirEntry,
 		_location_id: Option<i32>,
 		_location_root_path: &Path,
 	) -> JobResult<i32> {
-		use crate::ops::indexing::db_writer::DBWriter;
+		use crate::ops::indexing::database_storage::DatabaseStorage;
 
-		let metadata = DBWriter::extract_metadata(&entry.path, None)
+		let metadata = DatabaseStorage::extract_metadata(&entry.path, None)
 			.await
 			.map_err(|e| JobError::execution(format!("Failed to extract metadata: {}", e)))?;
 
@@ -378,7 +378,7 @@
 		let event_bus = Arc::new(EventBus::new(1024));
 
 		let mut writer =
-			EphemeralWriter::new(index.clone(), event_bus, temp_dir.path().to_path_buf());
+			MemoryAdapter::new(index.clone(), event_bus, temp_dir.path().to_path_buf());
 
 		let dir_entry = DirEntry {
 			path: test_file.clone(),
@@ -415,7 +415,7 @@
 		));
 		let event_bus = Arc::new(EventBus::new(1024));
 
-		let writer = EphemeralWriter::new(index.clone(), event_bus, temp_dir.path().to_path_buf());
+		let writer = MemoryAdapter::new(index.clone(), event_bus, temp_dir.path().to_path_buf());
 
 		let dir_entry = DirEntry {
 			path: test_file.clone(),
@@ -450,7 +450,7 @@
 		let event_bus = Arc::new(EventBus::new(1024));
 		let mut subscriber = event_bus.subscribe();
 
-		let writer = EphemeralWriter::new(index.clone(), event_bus, temp_dir.path().to_path_buf());
+		let writer = MemoryAdapter::new(index.clone(), event_bus, temp_dir.path().to_path_buf());
 
 		let dir_entry = DirEntry {
 			path: test_file.clone(),
diff --git a/core/src/ops/indexing/job.rs b/core/src/ops/indexing/job.rs
index 02c46b8ab..db2d88dcf 100644
--- a/core/src/ops/indexing/job.rs
+++ b/core/src/ops/indexing/job.rs
@@ -613,7 +613,7 @@ impl IndexerJob {
 		ctx: &JobContext<'_>,
 		root_path: &std::path::Path,
 	) -> JobResult<()> {
-		use super::db_writer::DBWriter;
+		use super::database_storage::DatabaseStorage;
 		use super::state::{DirEntry, EntryKind};
 		use tokio::fs;
 
@@ -645,7 +645,7 @@ impl IndexerJob {
 			kind: entry_kind,
 			size: metadata.len(),
 			modified: metadata.modified().ok(),
-			inode: DBWriter::get_inode(&metadata),
+			inode: DatabaseStorage::get_inode(&metadata),
 		};
 
 		state.pending_entries.push(dir_entry);
diff --git a/core/src/ops/indexing/mod.rs b/core/src/ops/indexing/mod.rs
index c1e4a53c2..20d904dfe 100644
--- a/core/src/ops/indexing/mod.rs
+++ b/core/src/ops/indexing/mod.rs
@@ -22,7 +22,7 @@
 pub mod action;
 pub mod change_detection;
-pub mod db_writer;
+pub mod database_storage;
 pub mod ephemeral;
 pub mod hierarchy;
 pub mod input;
@@ -41,10 +41,10 @@
 pub use action::IndexingAction;
 pub use change_detection::{
 	apply_batch as apply_change_batch, Change, ChangeConfig, ChangeDetector, ChangeHandler,
-	ChangeType, EntryRef, PersistentWriter, PersistentWriterAdapter,
+	ChangeType, DatabaseAdapter, DatabaseAdapterForJob, EntryRef,
 };
-pub use db_writer::{DBWriter, EntryMetadata};
-pub use ephemeral::{EphemeralIndex, EphemeralIndexCache, EphemeralIndexStats, EphemeralWriter};
+pub use database_storage::{DatabaseStorage, EntryMetadata};
+pub use ephemeral::{EphemeralIndex, EphemeralIndexCache, EphemeralIndexStats, MemoryAdapter};
 pub use hierarchy::HierarchyQuery;
 pub use input::IndexInput;
 pub use job::{IndexMode, IndexScope, IndexerJob, IndexerJobConfig, IndexerOutput};
diff --git a/core/src/ops/indexing/persistence.rs b/core/src/ops/indexing/persistence.rs
index c0e56b0fe..ca481a900 100644
--- a/core/src/ops/indexing/persistence.rs
+++ b/core/src/ops/indexing/persistence.rs
@@ -5,10 +5,10 @@
 //! This abstraction allows the same indexing pipeline to work for both managed
 //! locations (database-backed) and ephemeral browsing (memory-only).
 //!
-//! For ephemeral storage, use `EphemeralWriter` from `crate::ops::indexing::ephemeral`
+//! For ephemeral storage, use `MemoryAdapter` from `crate::ops::indexing::ephemeral`
 //! which implements both `IndexPersistence` and `ChangeHandler`.
 //!
-//! For persistent storage, use `PersistentWriterAdapter` from `crate::ops::indexing::change_detection`
-//! which implements `IndexPersistence` and delegates to `DBWriter` for database writes.
+//! For persistent storage, use `DatabaseAdapterForJob` from `crate::ops::indexing::change_detection`
+//! which implements `IndexPersistence` and delegates to `DatabaseStorage` for database writes.
 
 use crate::infra::job::prelude::{JobError, JobResult};
@@ -21,8 +21,8 @@ use super::{ephemeral::EphemeralIndex, state::DirEntry};
 
 /// Unified storage interface for persistent and ephemeral indexing.
 ///
-/// Implementations handle either database writes (`PersistentWriterAdapter`) or
-/// in-memory storage (`EphemeralWriter`). The indexing pipeline calls
+/// Implementations handle either database writes (`DatabaseAdapterForJob`) or
+/// in-memory storage (`MemoryAdapter`). The indexing pipeline calls
 /// these methods without knowing which backend is active.
 #[async_trait::async_trait]
 pub trait IndexPersistence: Send + Sync {
@@ -76,7 +76,7 @@ pub trait IndexPersistence: Send + Sync {
 pub struct PersistenceFactory;
 
 impl PersistenceFactory {
-	/// Create a database persistence instance using the unified PersistentWriterAdapter.
+	/// Create a database persistence instance using the unified DatabaseAdapterForJob.
 	///
-	/// This delegates to `DBWriter` for all database operations, ensuring
+	/// This delegates to `DatabaseStorage` for all database operations, ensuring
 	/// consistency between the watcher and indexer pipelines.
@@ -85,27 +85,27 @@
 		library_id: uuid::Uuid,
 		location_root_entry_id: Option<i32>,
 	) -> Box<dyn IndexPersistence> {
-		use crate::ops::indexing::change_detection::PersistentWriterAdapter;
+		use crate::ops::indexing::change_detection::DatabaseAdapterForJob;
 
-		Box::new(PersistentWriterAdapter::new(
+		Box::new(DatabaseAdapterForJob::new(
 			ctx,
 			library_id,
 			location_root_entry_id,
 		))
 	}
 
-	/// Create an ephemeral persistence instance using the unified EphemeralWriter.
+	/// Create an ephemeral persistence instance using the unified MemoryAdapter.
 	pub fn ephemeral(
 		index: std::sync::Arc<tokio::sync::RwLock<EphemeralIndex>>,
 		event_bus: Option<std::sync::Arc<crate::infra::event::EventBus>>,
 		root_path: PathBuf,
 	) -> Box<dyn IndexPersistence> {
-		use super::ephemeral::EphemeralWriter;
+		use super::ephemeral::MemoryAdapter;
 
 		let event_bus = event_bus
 			.unwrap_or_else(|| std::sync::Arc::new(crate::infra::event::EventBus::new(1024)));
 
-		Box::new(EphemeralWriter::new(index, event_bus, root_path))
+		Box::new(MemoryAdapter::new(index, event_bus, root_path))
 	}
 }
 
@@ -113,7 +113,7 @@
 mod tests {
 	use super::*;
 	use crate::infra::event::Event;
-	use crate::ops::indexing::ephemeral::EphemeralWriter;
+	use crate::ops::indexing::ephemeral::MemoryAdapter;
 	use crate::ops::indexing::state::{DirEntry, EntryKind};
 	use std::sync::Arc;
 	use tempfile::TempDir;
@@ -175,7 +175,7 @@
 		));
 		let event_bus = Arc::new(crate::infra::event::EventBus::new(1024));
 
-		let writer = EphemeralWriter::new(index.clone(), event_bus, temp_dir.path().to_path_buf());
+		let writer = MemoryAdapter::new(index.clone(), event_bus, temp_dir.path().to_path_buf());
 
 		let dir_entry = DirEntry {
 			path: test_file.clone(),
diff --git a/core/src/ops/indexing/phases/content.rs b/core/src/ops/indexing/phases/content.rs
index da1de2ceb..1919faac5 100644
--- a/core/src/ops/indexing/phases/content.rs
+++ b/core/src/ops/indexing/phases/content.rs
@@ -11,7 +11,7 @@ use crate::{
 	infra::job::generic_progress::ToGenericProgress,
 	infra::job::prelude::{JobContext, JobError, Progress},
 	ops::indexing::{
-		db_writer::DBWriter,
+		database_storage::DatabaseStorage,
 		processor::{ContentHashProcessor, ProcessorEntry},
 		state::{EntryKind, IndexError, IndexPhase, IndexerProgress, IndexerState},
 	},
@@ -127,7 +127,7 @@ pub async fn run_content_phase(
 		match hash_result {
 			Ok(content_hash) => {
-				match DBWriter::link_to_content_identity(
+				match DatabaseStorage::link_to_content_identity(
 					ctx.library_db(),
 					entry_id,
 					&path,
diff --git a/core/src/ops/indexing/phases/discovery.rs b/core/src/ops/indexing/phases/discovery.rs
index c27461152..36a89911c 100644
--- a/core/src/ops/indexing/phases/discovery.rs
+++ b/core/src/ops/indexing/phases/discovery.rs
@@ -9,7 +9,7 @@ use crate::{
 	infra::job::generic_progress::ToGenericProgress,
 	infra::job::prelude::{JobContext, JobError, Progress},
 	ops::indexing::{
-		db_writer::DBWriter,
+		database_storage::DatabaseStorage,
 		rules::{build_default_ruler, RuleToggles, RulerDecision},
 		state::{DirEntry, EntryKind, IndexError, IndexPhase, IndexerProgress, IndexerState},
}, diff --git a/core/src/ops/indexing/phases/processing.rs b/core/src/ops/indexing/phases/processing.rs index aace72618..fa3940055 100644 --- a/core/src/ops/indexing/phases/processing.rs +++ b/core/src/ops/indexing/phases/processing.rs @@ -14,7 +14,7 @@ use crate::{ }, ops::indexing::{ change_detection::{Change, ChangeDetector}, - db_writer::DBWriter, + database_storage::DatabaseStorage, state::{DirEntry, EntryKind, IndexError, IndexPhase, IndexerProgress, IndexerState}, IndexMode, }, @@ -282,7 +282,7 @@ pub async fn run_processing_phase( match change { Some(Change::New(_)) => { - match DBWriter::create_entry_in_conn( + match DatabaseStorage::create_entry_in_conn( state, &entry, device_id, @@ -331,7 +331,7 @@ pub async fn run_processing_phase( } Some(Change::Modified { entry_id, .. }) => { - match DBWriter::update_entry_in_conn(entry_id, &entry, &txn).await { + match DatabaseStorage::update_entry_in_conn(entry_id, &entry, &txn).await { Ok(()) => { ctx.log(format!( "Updated entry {}: {}", @@ -366,7 +366,7 @@ pub async fn run_processing_phase( old_path.display(), new_path.display() )); - match DBWriter::simple_move_entry_in_conn( + match DatabaseStorage::simple_move_entry_in_conn( state, entry_id, &old_path, &new_path, &txn, ) .await diff --git a/core/src/ops/indexing/processor.rs b/core/src/ops/indexing/processor.rs index d50c58912..69e625917 100644 --- a/core/src/ops/indexing/processor.rs +++ b/core/src/ops/indexing/processor.rs @@ -5,7 +5,7 @@ //! happen in a single transaction. This ensures entries either have valid content_id references //! or remain unlinked if processing fails. -use super::{db_writer::DBWriter, state::EntryKind}; +use super::{database_storage::DatabaseStorage, state::EntryKind}; use crate::domain::content_identity::ContentHashGenerator; use anyhow::Result; use sea_orm::DatabaseConnection; @@ -150,7 +150,7 @@ impl ContentHashProcessor { let content_hash = ContentHashGenerator::generate_content_hash(&entry.path).await?; debug!("✓ Generated content hash: {}", content_hash); - DBWriter::link_to_content_identity( + DatabaseStorage::link_to_content_identity( db, entry.id, &entry.path, diff --git a/core/src/ops/indexing/responder.rs b/core/src/ops/indexing/responder.rs index e6fc84346..d92682ebe 100644 --- a/core/src/ops/indexing/responder.rs +++ b/core/src/ops/indexing/responder.rs @@ -1,12 +1,13 @@ //! Persistent location responder. //! -//! Thin adapter over `PersistentWriter` that translates raw filesystem +//! Thin adapter over `DatabaseAdapter` that translates raw filesystem //! events into database mutations. The watcher calls `apply_batch` with events; //! this module delegates to the unified change handling infrastructure. use crate::context::CoreContext; + use crate::infra::event::FsRawEventKind; -use crate::ops::indexing::change_detection::{self, ChangeConfig, PersistentWriter}; +use crate::ops::indexing::change_detection::{self, ChangeConfig, DatabaseAdapter}; use crate::ops::indexing::rules::RuleToggles; use anyhow::Result; use std::path::Path; @@ -15,7 +16,7 @@ use uuid::Uuid; /// Translates a single filesystem event into database mutations. /// -/// Creates a `PersistentWriter` and delegates to the unified change +/// Creates a `DatabaseAdapter` and delegates to the unified change /// handling infrastructure in `change_detection`. pub async fn apply( context: &Arc, @@ -40,7 +41,7 @@ pub async fn apply( /// Processes multiple filesystem events as a batch. 
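 ///
 /// A hedged call-shape sketch (the `DatabaseAdapter::new` argument list is
 /// abbreviated; see the body below and the delegation notes that follow):
 ///
 /// ```rust,ignore
 /// let mut handler = DatabaseAdapter::new(context.clone(), library_id, location_id, /* … */);
 /// // Deduplication and ordering (removes, renames, creates, modifies)
 /// // happen inside the unified change_detection batch handler.
 /// change_detection::apply_batch(/* events, config, &mut handler, … */).await?;
 /// ```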
/// -/// Creates a `PersistentWriter` and delegates to the unified +/// Creates a `DatabaseAdapter` and delegates to the unified /// `change_detection::apply_batch` which handles deduplication, ordering, /// and correct processing sequence (removes, renames, creates, modifies). pub async fn apply_batch( @@ -62,7 +63,7 @@ pub async fn apply_batch( location_id ); - let mut handler = PersistentWriter::new( + let mut handler = DatabaseAdapter::new( context.clone(), library_id, location_id, diff --git a/core/src/ops/indexing/verify/action.rs b/core/src/ops/indexing/verify/action.rs index 876dc72a5..c764abb70 100644 --- a/core/src/ops/indexing/verify/action.rs +++ b/core/src/ops/indexing/verify/action.rs @@ -9,7 +9,7 @@ use crate::{ db::entities, }, ops::indexing::{ - db_writer::DBWriter, + database_storage::DatabaseStorage, ephemeral::EphemeralIndex, job::{IndexMode, IndexPersistence, IndexScope, IndexerJob, IndexerJobConfig}, path_resolver::PathResolver, @@ -97,7 +97,7 @@ impl IndexVerifyAction { library: &Arc, context: &Arc, path: &Path, - ) -> Result, ActionError> { + ) -> Result, ActionError> { use tokio::sync::RwLock; tracing::debug!("Running ephemeral indexer job on {}", path.display()); @@ -403,7 +403,7 @@ impl IndexVerifyAction { /// Compare ephemeral index with database entries async fn compare_indexes( &self, - fs_entries: HashMap, + fs_entries: HashMap, mut db_entries: HashMap, root_path: &Path, ) -> Result { diff --git a/docs/core/indexing.mdx b/docs/core/indexing.mdx index 2b27aea95..ccdd5148b 100644 --- a/docs/core/indexing.mdx +++ b/docs/core/indexing.mdx @@ -13,11 +13,11 @@ The indexing system consists of specialized components working together: **IndexerState** preserves all necessary information to resume indexing from any interruption point. This includes the current phase, directories to process, accumulated statistics, and ephemeral UUID mappings for preserving user metadata across browsing-to-persistent transitions. -**DBWriter** provides the low-level database CRUD layer. All database operations (create, update, move, delete) flow through this module for consistency. +**DatabaseStorage** provides the low-level database CRUD layer. All database operations (create, update, move, delete) flow through this module for consistency. -**PersistentWriter** implements both `ChangeHandler` (for filesystem watcher events) and `IndexPersistence` (for indexer job batches). Both pipelines use the same code to write entries to the database via `DBWriter`. +**DatabaseAdapter** implements both `ChangeHandler` (for filesystem watcher events) and `IndexPersistence` (for indexer job batches). Both pipelines use the same code to write entries to the database via `DatabaseStorage`. -**EphemeralWriter** implements both `ChangeHandler` (for filesystem watcher events) and `IndexPersistence` (for indexer job batches). Both pipelines use the same code to write entries to the in-memory `EphemeralIndex`. +**MemoryAdapter** implements both `ChangeHandler` (for filesystem watcher events) and `IndexPersistence` (for indexer job batches). Both pipelines use the same code to write entries to the in-memory `EphemeralIndex`. This dual-implementation architecture unifies watcher and job pipelines, eliminating code duplication between real-time filesystem monitoring and batch indexing operations. @@ -164,7 +164,7 @@ The detector tracks paths by inode to identify moves. 
On Unix systems, inodes pr ### Real-Time Change Detection -Both `PersistentWriter` and `EphemeralWriter` implement the `ChangeHandler` trait, which defines the interface for responding to filesystem watcher events: +Both `DatabaseAdapter` and `MemoryAdapter` implement the `ChangeHandler` trait, which defines the interface for responding to filesystem watcher events: ```rust pub trait ChangeHandler { @@ -176,7 +176,7 @@ pub trait ChangeHandler { } ``` -The watcher routes events to the appropriate handler based on whether the path belongs to a persistent location (`PersistentWriter` → database) or ephemeral session (`EphemeralWriter` → memory). +The watcher routes events to the appropriate handler based on whether the path belongs to a persistent location (`DatabaseAdapter` → database) or ephemeral session (`MemoryAdapter` → memory). ## Indexing Modes and Scopes