Merge main into cursor/unified-keybind-system-design-bdd9

Resolved merge conflict in packages/interface/src/index.tsx by keeping both
the existing context menu exports from main and the new keybind-related
exports from this branch.
This commit is contained in:
Cursor Agent
2025-12-23 09:06:27 +00:00
439 changed files with 26588 additions and 11053 deletions

View File

@@ -2,6 +2,7 @@
[env]
PROTOC = { force = true, value = "{{{protoc}}}" }
FFMPEG_DIR = { force = true, value = "{{{nativeDeps}}}" }
CPATH = { force = true, value = "{{{nativeDeps}}}/include" }
{{#isLinux}}
ORT_LIB_LOCATION = { force = true, value = "{{{nativeDeps}}}/lib" }
{{/isLinux}}

BIN
.github/actions/publish-artifacts/dist/index.js generated vendored Normal file
View File

Binary file not shown.

View File

@@ -17,9 +17,9 @@ runs:
steps:
- name: Install Rust
id: toolchain
uses: IronCoreLabs/rust-toolchain@v1
uses: dtolnay/rust-toolchain@stable
with:
target: ${{ inputs.target }}
targets: ${{ inputs.target }}
components: clippy, rustfmt
- name: Cache Rust Dependencies

View File

@@ -11,7 +11,8 @@ env:
jobs:
test:
runs-on: ubuntu-latest
runs-on: self-hosted
if: github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository
steps:
- uses: actions/checkout@v4
@@ -39,21 +40,21 @@ jobs:
path: target
key: ${{ runner.os }}-cargo-build-target-${{ hashFiles('**/Cargo.lock') }}
- name: Install system dependencies
run: |
sudo apt-get update
sudo apt-get install -y \
libgtk-3-dev \
libwebkit2gtk-4.1-dev \
libayatana-appindicator3-dev \
librsvg2-dev \
patchelf
- name: Setup native dependencies
run: cargo run -p xtask -- setup
- name: Build core
run: cargo build -p sd-core --verbose
- name: Run indexing test
run: cargo test --test indexing_test --test-threads=1 -- --nocapture
- name: Run all tests
run: |
cargo test -p sd-core --lib -- --test-threads=1 --nocapture
cargo test -p sd-core --test indexing_test -- --test-threads=1 --nocapture
cargo test -p sd-core --test indexing_rules_test -- --test-threads=1 --nocapture
cargo test -p sd-core --test indexing_responder_reindex_test -- --test-threads=1 --nocapture
cargo test -p sd-core --test sync_backfill_test -- --test-threads=1 --nocapture
cargo test -p sd-core --test sync_backfill_race_test -- --test-threads=1 --nocapture
cargo test -p sd-core --test sync_event_log_test -- --test-threads=1 --nocapture
cargo test -p sd-core --test sync_metrics_test -- --test-threads=1 --nocapture
cargo test -p sd-core --test sync_realtime_test -- --test-threads=1 --nocapture
cargo test -p sd-core --test sync_setup_test -- --test-threads=1 --nocapture
cargo test -p sd-core --test file_sync_simple_test -- --test-threads=1 --nocapture
cargo test -p sd-core --test file_sync_test -- --test-threads=1 --nocapture
cargo test -p sd-core --test database_migration_test -- --test-threads=1 --nocapture

View File

@@ -19,23 +19,21 @@ jobs:
strategy:
matrix:
include:
- host: macos-latest
# macOS builds
- host: self-hosted
target: aarch64-apple-darwin
platform: macos-aarch64
# - host: macos-15-intel
# target: x86_64-apple-darwin
# platform: macos-x86_64
# # Linux builds
# - host: ubuntu-22.04
# target: x86_64-unknown-linux-gnu
# platform: linux-x86_64
# - host: ubuntu-22.04
# target: aarch64-unknown-linux-gnu
# platform: linux-aarch64
# Windows builds (uncomment when needed)
# - host: windows-latest
# target: x86_64-pc-windows-msvc
# platform: windows-x86_64
- host: macos-15-intel
target: x86_64-apple-darwin
platform: macos-x86_64
# Linux builds
- host: ubuntu-22.04
target: x86_64-unknown-linux-gnu
platform: linux-x86_64
# Windows builds
- host: windows-latest
target: x86_64-pc-windows-msvc
platform: windows-x86_64
name: CLI - ${{ matrix.platform }}
runs-on: ${{ matrix.host }}
steps:
@@ -47,6 +45,12 @@ jobs:
with:
targets: ${{ matrix.target }}
- name: Setup System and Rust
uses: ./.github/actions/setup-system
with:
token: ${{ secrets.GITHUB_TOKEN }}
target: ${{ matrix.target }}
- name: Install cross-compilation tools (Linux ARM)
if: matrix.target == 'aarch64-unknown-linux-gnu'
run: |
@@ -100,35 +104,29 @@ jobs:
strategy:
matrix:
settings:
# - host: macos-15-intel
# target: x86_64-apple-darwin
# bundles: dmg,app
# os: darwin
# arch: x86_64
- host: macos-latest
# macOS builds
- host: self-hosted
target: aarch64-apple-darwin
bundles: dmg,app
os: darwin
arch: aarch64
# - host: windows-latest
# target: x86_64-pc-windows-msvc
# bundles: msi
# os: windows
# - host: self-hosted
# target: x86_64-apple-darwin
# bundles: dmg,app
# os: darwin
# arch: x86_64
# - host: windows-latest
# target: aarch64-pc-windows-msvc
# - host: ubuntu-22.04
# target: x86_64-unknown-linux-gnu
# bundles: deb
# os: linux
# arch: x86_64
# - host: ubuntu-22.04
# target: x86_64-unknown-linux-musl
# - host: ubuntu-22.04
# target: aarch64-unknown-linux-gnu
# bundles: deb
# - host: ubuntu-22.04
# target: aarch64-unknown-linux-musl
# Windows builds
- host: windows-latest
target: x86_64-pc-windows-msvc
bundles: msi
os: windows
arch: x86_64
# Linux builds
- host: ubuntu-22.04
target: x86_64-unknown-linux-gnu
bundles: deb
os: linux
arch: x86_64
name: Desktop - Main ${{ matrix.settings.target }}
runs-on: ${{ matrix.settings.host }}
steps:
@@ -153,6 +151,11 @@ jobs:
- name: Checkout repository
uses: actions/checkout@v4
- name: Setup Rust
uses: dtolnay/rust-toolchain@stable
with:
targets: ${{ matrix.settings.target }}
- name: Install Apple API key
if: ${{ runner.os == 'macOS' }}
run: |
@@ -177,6 +180,9 @@ jobs:
token: ${{ secrets.GITHUB_TOKEN }}
target: ${{ matrix.settings.target }}
- name: Install target
run: rustup target add ${{ matrix.settings.target }}
- name: Setup Bun and dependencies
uses: ./.github/actions/setup-bun
with:
@@ -185,7 +191,7 @@ jobs:
- name: Build
working-directory: apps/tauri
run: |
bun tauri build --ci -v --target ${{ matrix.settings.target }} --bundles ${{ matrix.settings.bundles }}
bun tauri build --ci -vv --target ${{ matrix.settings.target }} --bundles ${{ matrix.settings.bundles }}
env:
TAURI_SIGNING_PRIVATE_KEY: ${{ secrets.TAURI_PRIVATE_KEY }}
TAURI_SIGNING_PRIVATE_KEY_PASSWORD: ${{ secrets.TAURI_KEY_PASSWORD }}
@@ -197,8 +203,10 @@ jobs:
APPLE_API_ISSUER: ${{ secrets.APPLE_API_ISSUER }}
APPLE_API_KEY: ${{ secrets.APPLE_API_KEY }}
SENTRY_AUTH_TOKEN: ${{ secrets.SENTRY_AUTH_TOKEN }}
CFLAGS: "-march=armv8.2-a+crypto"
CXXFLAGS: "-march=armv8.2-a+crypto"
CFLAGS: ${{ matrix.settings.arch == 'aarch64' && '-march=armv8.2-a+crypto' || '' }}
CXXFLAGS: ${{ matrix.settings.arch == 'aarch64' && '-march=armv8.2-a+crypto' || '' }}
CPATH: ${{ github.workspace }}/apps/.deps/include
C_INCLUDE_PATH: ${{ github.workspace }}/apps/.deps/include
- name: Package frontend
if: ${{ runner.os == 'Linux' }}
@@ -214,10 +222,14 @@ jobs:
target: ${{ matrix.settings.target }}
profile: release
- name: Cleanup keychain
if: always() && runner.os == 'macOS'
run: security delete-keychain signing_temp.keychain || true
# Create unified release with CLI and Desktop artifacts
release:
if: startsWith(github.ref, 'refs/tags/')
runs-on: ubuntu-latest
runs-on: self-hosted
name: Create Release
needs: [cli-build, desktop-main]
permissions:

3
.gitignore vendored
View File

@@ -1,6 +1,7 @@
# Created by https://www.toptal.com/developers/gitignore/api/macos,windows,linux,rust,node,react,turbo,vercel,nextjs,storybookjs
# Edit at https://www.toptal.com/developers/gitignore?templates=macos,windows,linux,rust,node,react,turbo,vercel,nextjs,storybookjs
TODO
### Linux ###
*~
@@ -479,7 +480,7 @@ apps/ios/sd-ios-core/sd_ios_core.xcframework/**
whitepaper/*.log
# GitHub Actions build artifacts
.github/actions/*/dist/
# Note: dist/ folders in .github/actions/ must be committed for custom JS actions
test_data

View File

@@ -52,7 +52,7 @@ When updating a task's status, edit the YAML front matter:
id: TASK-000
title: Task Title
status: Done # Changed from "To Do" or "In Progress"
assignee: james
assignee: jamiepine
priority: High
tags: [core, feature]
last_updated: 2025-10-14 # Update this date

View File

@@ -1,8 +1,8 @@
---
id: ACT-000
title: "Epic: Transactional Action System"
title: "Transactional Action System"
status: Done
assignee: james
assignee: jamiepine
priority: High
tags: [epic, core, actions]
whitepaper: Section 4.4

View File

@@ -2,7 +2,7 @@
id: ACT-001
title: Action Manager and Handler Registry
status: Done
assignee: james
assignee: jamiepine
parent: ACT-000
priority: High
tags: [core, actions]

View File

@@ -1,8 +1,8 @@
---
id: AI-000
title: "Epic: AI & Intelligence"
title: "Local AI & Intelligence"
status: To Do
assignee: james
assignee: jamiepine
priority: High
tags: [epic, ai, agent]
whitepaper: Section 4.6

View File

@@ -1,10 +1,10 @@
---
id: AI-001
title: Develop AI Agent for Proactive Assistance
title: AI Agent for Proactive Assistance
status: To Do
assignee: james
assignee: jamiepine
parent: AI-000
priority: High
priority: Medium
tags: [ai, agent, core]
whitepaper: Section 4.6
---

View File

@@ -2,7 +2,7 @@
id: AI-002
title: "Create Fine-Tuning Dataset for AI Agent"
status: To Do
assignee: james
assignee: jamiepine
parent: AI-000
priority: Medium
tags:

View File

@@ -2,10 +2,9 @@
id: CLI-000
title: "Epic: Command-Line Interface"
status: Done
assignee: james
assignee: jamiepine
priority: High
tags: [epic, cli]
whitepaper: "N/A"
last_updated: 2025-12-02
---

View File

@@ -2,7 +2,7 @@
id: CLOUD-000
title: "Epic: Cloud as a Peer"
status: To Do
assignee: james
assignee: jamiepine
priority: High
tags: [epic, cloud, networking, infrastructure]
whitepaper: Section 5

View File

@@ -2,7 +2,7 @@
id: CLOUD-001
title: Design Managed Cloud Core Infrastructure
status: To Do
assignee: james
assignee: jamiepine
parent: CLOUD-000
priority: High
tags: [cloud, infrastructure, design, kubernetes]

View File

@@ -2,7 +2,7 @@
id: CLOUD-002
title: Asynchronous Relay Server
status: To Do
assignee: james
assignee: jamiepine
parent: CLOUD-000
priority: High
tags: [cloud, networking, relay, sharing]

View File

@@ -2,7 +2,7 @@
id: CLOUD-003
title: Cloud Storage Provider as a Volume
status: In Progress
assignee: james
assignee: jamiepine
parent: CLOUD-000
priority: High
tags: [cloud, storage, volume, s3]

View File

@@ -1,8 +1,8 @@
---
id: CORE-000
title: "Epic: VDFS Core Architecture"
title: "VDFS Core Architecture"
status: Done
assignee: james
assignee: jamiepine
priority: High
tags: [epic, core, vdfs]
whitepaper: Section 4.1

View File

@@ -2,7 +2,7 @@
id: CORE-001
title: Entry-Centric Data Model
status: Done
assignee: james
assignee: jamiepine
parent: CORE-000
priority: High
tags: [core, vdfs, database, model]

View File

@@ -2,7 +2,7 @@
id: CORE-002
title: Universal SdPath Addressing
status: Done
assignee: james
assignee: jamiepine
parent: CORE-000
priority: High
tags: [core, vdfs, addressing]

View File

@@ -2,7 +2,7 @@
id: CORE-003
title: Content Identity System for Deduplication
status: Done
assignee: james
assignee: jamiepine
parent: CORE-000
priority: High
tags: [core, vdfs, deduplication, hashing]

View File

@@ -2,7 +2,7 @@
id: CORE-004
title: Hierarchical Indexing with Closure Table
status: Done
assignee: james
assignee: jamiepine
parent: CORE-000
priority: High
tags: [core, vdfs, database, performance]

View File

@@ -2,7 +2,7 @@
id: CORE-005
title: Advanced File Type System
status: Done
assignee: james
assignee: jamiepine
parent: CORE-000
priority: High
tags: [core, vdfs, file-types]

View File

@@ -2,7 +2,7 @@
id: CORE-006
title: Semantic Tagging Architecture
status: Done
assignee: james
assignee: jamiepine
parent: CORE-000
priority: Medium
tags: [core, vdfs, tagging, metadata]

View File

@@ -1,8 +1,8 @@
---
id: CORE-008
title: Virtual Sidecar System (VSS)
status: In Progress
assignee: james
status: Done
assignee: jamiepine
parent: CORE-000
priority: High
tags: [core, vdfs, sidecars, derivatives, addressing]
@@ -47,18 +47,21 @@ Implement the Virtual Sidecar System (VSS) for managing derivative data—thumbn
## Implementation Steps
### Phase 1: SdPath Integration
- [ ] Add `SdPath::Sidecar { content_id, kind, variant, format }` enum variant
- [ ] Implement `sidecar://` URI parsing
- [ ] Add display formatting for sidecar URIs
- [ ] Write unit tests for parsing/display
### Phase 2: Resolution
- [ ] Implement `resolve_sidecar()` in SdPathResolver
- [ ] Add resolution mode support (blocking, async, fetch-only)
- [ ] Integrate with existing SidecarManager
- [ ] Handle pending/missing sidecars gracefully
### Phase 3: Operations
- [ ] Add sidecar support to ReadAction
- [ ] Add sidecar support to FileCopyAction
- [ ] Implement restricted DeleteAction for sidecars
@@ -66,6 +69,7 @@ Implement the Virtual Sidecar System (VSS) for managing derivative data—thumbn
- [ ] Add operation validation (prevent move/rename)
### Phase 4: Job System
- [ ] Implement ThumbnailGenerationJob
- [ ] Implement OcrExtractionJob
- [ ] Implement TranscriptGenerationJob
@@ -73,12 +77,14 @@ Implement the Virtual Sidecar System (VSS) for managing derivative data—thumbn
- [ ] Implement job dispatch in SidecarManager
### Phase 5: Cross-Device Sync
- [ ] Implement availability digest exchange
- [ ] Implement sidecar transfer protocol
- [ ] Add sync scheduler for periodic updates
- [ ] Implement prefetch policies
### Phase 6: CLI & SDK
- [ ] Add `sd sidecars` command family
- [ ] Implement sidecar glob patterns
- [ ] Add SDK APIs for extensions
@@ -87,6 +93,7 @@ Implement the Virtual Sidecar System (VSS) for managing derivative data—thumbn
## Acceptance Criteria
### Core Functionality
- [ ] Thumbnails auto-generated for images during indexing
- [ ] OCR text extracted from documents automatically
- [ ] Sidecars addressable via `sidecar://` URIs
@@ -94,18 +101,21 @@ Implement the Virtual Sidecar System (VSS) for managing derivative data—thumbn
- [ ] Can list all sidecars for a content item
### Cross-Device
- [ ] Devices exchange sidecar availability information
- [ ] Missing sidecars can be fetched from remote devices
- [ ] Sidecar transfers reuse P2P file transfer infrastructure
- [ ] Availability tracking stays current across library
### Integration
- [ ] Extensions can read/write sidecars via SDK
- [ ] CLI supports sidecar operations
- [ ] Actions support sidecar paths
- [ ] Resolver handles all sidecar resolution modes
### Quality
- [ ] Deterministic paths work without DB queries
- [ ] Idempotent generation (checks before regenerating)
- [ ] Reference sidecars can be converted to managed
@@ -116,6 +126,7 @@ Implement the Virtual Sidecar System (VSS) for managing derivative data—thumbn
Primary spec: `workbench/core/storage/VIRTUAL_SIDECAR_SYSTEM_V2.md` (Nov 2025)
Supporting docs:
- `workbench/core/storage/VIRTUAL_SIDECAR_SYSTEM.md` (Original spec)
- `workbench/core/storage/REFERENCE_SIDECARS.md` (Reference pattern)
- `workbench/core/storage/SIDECAR_SCALING_DESIGN.md` (Future scaling)

View File

@@ -2,9 +2,9 @@
id: CORE-009
title: User-Managed Collections
status: To Do
assignee: james
assignee: jamiepine
parent: CORE-000
priority: Medium
priority: Low
tags: [core, vdfs, collections, organization]
whitepaper: Section 4.1
---

View File

@@ -2,7 +2,7 @@
id: CORE-010
title: File Ingestion Workflow
status: To Do
assignee: james
assignee: jamiepine
parent: CORE-000
priority: High
tags: [core, vdfs, ingestion, workflow]

View File

@@ -1,8 +1,9 @@
---
id: CORE-011
title: Unified Resource Event System
status: To Do
assignee: james
status: Done
parent: CORE-000
assignee: jamiepine
priority: High
tags: [core, events, architecture, refactor]
---

View File

@@ -2,7 +2,7 @@
id: CORE-012
title: Resource Type Registry (Swift)
status: To Do
assignee: james
assignee: jamiepine
parent: CORE-011
priority: High
tags: [client, swift, codegen, cache]

View File

@@ -2,7 +2,7 @@
id: CORE-013
title: Resource Type Registry (TypeScript)
status: To Do
assignee: james
assignee: jamiepine
parent: CORE-011
priority: High
tags: [client, typescript, codegen, cache]

View File

@@ -1,77 +0,0 @@
---
id: CORE-014
title: Specta Codegen for Resource Events
status: To Do
assignee: james
parent: CORE-011
priority: High
tags: [codegen, specta, typescript, swift]
depends_on: [CORE-011]
---
## Description
Extend the existing specta codegen system to auto-generate resource type registries for TypeScript and Swift. This ensures client-side type registries stay in sync with Rust domain models.
## Implementation Steps
1. Update `xtask/src/specta_gen.rs` to collect all `Identifiable` types
2. Generate TypeScript `resourceTypeMap` with all resource types
3. Generate Swift `ResourceTypeRegistry+Generated.swift` with registrations
4. Add build verification that all Identifiable types are registered
5. Update CI to regenerate on every commit
6. Document regeneration process for developers
## Generated Output
### TypeScript
```typescript
// packages/client/src/bindings/resourceRegistry.ts
export const resourceTypeMap = {
file: File,
album: Album,
tag: Tag,
location: Location,
device: Device,
volume: Volume,
content_identity: ContentIdentity,
// ... all Identifiable types
} as const;
```
### Swift (Future)
```swift
// SpacedriveCore/Generated/ResourceTypeRegistry+Generated.swift
extension ResourceTypeRegistry {
static func registerAllTypes() {
register(File.self)
register(Album.self)
register(Tag.self)
// ... all Identifiable types
}
}
```
## Technical Details
- Location: `xtask/src/specta_gen.rs`
- Trait marker: Check for `impl Identifiable`
- Output: `packages/client/src/bindings/resourceRegistry.ts`
- Build step: `cargo xtask specta-gen`
- CI: Auto-run on pre-commit or CI build
## Acceptance Criteria
- [ ] Specta codegen extended for resource types
- [ ] TypeScript resourceTypeMap auto-generated
- [ ] Build verification ensures all types registered
- [ ] CI/CD regenerates on every commit
- [ ] Developer documentation updated
- [ ] Diff checking prevents manual edits
## References
- `docs/core/events.md` lines 391-434
- Existing: `xtask/src/specta_gen.rs`

View File

@@ -1,68 +0,0 @@
---
id: CORE-015
title: Normalized Client Cache (Swift)
status: To Do
assignee: james
priority: High
tags: [client, swift, cache, performance]
depends_on: [CORE-012]
---
## Description
Implement the normalized client cache for iOS/macOS apps. Provides instant UI updates, offline support, and massive bandwidth savings by normalizing all resources by ID and updating atomically when events arrive.
## Implementation Steps
1. Create `NormalizedCache` actor with two-level structure:
- Level 1: Entity store (normalized by ID)
- Level 2: Query index (maps queries to entity IDs)
2. Implement `updateEntity<T>()` - updates entity and notifies observers
3. Implement `query<T>()` - caches queries and results
4. Implement `deleteEntity()` - removes entity and updates indices
5. Implement `invalidateQueriesForResource()` - bulk operation handling
6. Add LRU eviction (max 10K entities)
7. Add SQLite persistence for offline support
8. Create `EventCacheUpdater` for event integration
## Cache Architecture
```
┌─────────────────────────────────────────┐
│ Entity Store (Level 1) │
│ "file:uuid-1" → File { ... } │
│ "album:uuid-2" → Album { ... } │
└─────────────────────────────────────────┘
│ Atomic updates
┌─────────────────────────────────────────┐
│ Query Index (Level 2) │
│ "search:photos" → ["file:uuid-1", ...] │
│ "albums.list" → ["album:uuid-2"] │
└─────────────────────────────────────────┘
```
## Technical Details
- Location: `packages/client-swift/Sources/SpacedriveCore/Cache/NormalizedCache.swift`
- Actor for thread-safety
- Max entities: 10,000 (configurable)
- TTL: 5 minutes default (query-specific)
- Persistence: SQLite in app cache directory
## Acceptance Criteria
- [ ] NormalizedCache actor implemented
- [ ] Entity store with LRU eviction
- [ ] Query index with TTL
- [ ] SQLite persistence
- [ ] EventCacheUpdater integration
- [ ] ObservableObject wrapper for SwiftUI
- [ ] Memory stays under 15MB with 10K entities
- [ ] Unit tests for cache operations
- [ ] Integration tests with events
## References
- `docs/core/normalized_cache.md` - Complete specification

View File

@@ -1,77 +0,0 @@
---
id: CORE-016
title: Normalized Client Cache (TypeScript)
status: To Do
assignee: james
priority: High
tags: [client, typescript, react, cache, performance]
depends_on: [CORE-013]
---
## Description
Implement the normalized client cache for web/desktop (Tauri) apps. Same architecture as the Swift version, but with React integration via hooks.
## Implementation Steps
1. Create `NormalizedCache` class with entity store + query index
2. Implement `updateEntity()` with subscription notifications
3. Implement `query()` with caching
4. Implement `deleteEntity()` and query invalidation
5. Add LRU eviction
6. Add IndexedDB persistence for offline support
7. Create `useCachedQuery` React hook
8. Create `EventCacheUpdater` for event integration
## React Integration
```typescript
function useCachedQuery<T>(
method: string,
input: any,
): { data: T[] | null; loading: boolean; error: Error | null } {
const cache = useContext(CacheContext);
const [data, setData] = useState<T[] | null>(null);
useEffect(() => {
const queryKey = cache.generateQueryKey(method, input);
// Subscribe to cache changes
const unsubscribe = cache.subscribe(queryKey, () => {
const result = cache.getQueryResult<T>(queryKey);
setData(result);
});
// Initial fetch
cache.query<T>(method, input).then(setData);
return unsubscribe;
}, [method, JSON.stringify(input)]);
return { data, loading: data === null, error: null };
}
```
## Technical Details
- Location: `packages/client/src/core/NormalizedCache.ts`
- React hook: `packages/client/src/hooks/useCachedQuery.ts`
- Max entities: 10,000
- TTL: 5 minutes default
- Persistence: IndexedDB
## Acceptance Criteria
- [ ] NormalizedCache class implemented
- [ ] Entity store with LRU eviction
- [ ] Query index with TTL
- [ ] IndexedDB persistence
- [ ] useCachedQuery hook
- [ ] EventCacheUpdater integration
- [ ] Memory stays under 15MB
- [ ] Unit tests for cache operations
- [ ] Integration tests with React components
## References
- `docs/core/normalized_cache.md` lines 188-279

View File

@@ -1,70 +0,0 @@
---
id: CORE-017
title: Optimistic Updates for Client Cache
status: To Do
assignee: james
parent: CORE-015
priority: Medium
tags: [client, cache, ux, optimistic]
depends_on: [CORE-015, CORE-016]
---
## Description
Implement optimistic updates in the normalized cache, allowing instant UI feedback before server confirmation. If the action fails, the update is rolled back automatically.
## Implementation Steps
1. Add `optimisticUpdates` map to cache (pending_id → resource)
2. Implement `updateOptimistically()` - applies change immediately
3. Implement `commitOptimisticUpdate()` - replaces with confirmed data
4. Implement `rollbackOptimisticUpdate()` - reverts on error
5. Integrate with action execution flow
6. Add visual indicators for pending changes (optional)
## Flow Example
```typescript
// 1. Optimistic update (instant UI)
const pendingId = uuid();
await cache.updateOptimistically(pendingId, {
id: albumId,
name: newName,
...optimisticAlbum,
});
try {
// 2. Send action to server
const confirmed = await client.action("albums.rename", {
id: albumId,
name: newName,
});
// 3. Commit (replace optimistic with confirmed)
await cache.commitOptimisticUpdate(pendingId, confirmed);
} catch (error) {
// 4. Rollback on error
await cache.rollbackOptimisticUpdate(pendingId);
throw error;
}
```
## Technical Details
- Optimistic updates stored separately from confirmed entities
- UI sees merged view (optimistic + confirmed)
- Pending changes visually indicated (future)
- Automatic rollback on action failure
## Acceptance Criteria
- [ ] Optimistic update API implemented
- [ ] UI updates instantly before server response
- [ ] Rollback works on errors
- [ ] No flickering during commit
- [ ] Unit tests for optimistic flow
- [ ] Integration tests validate error scenarios
## References
- `docs/core/normalized_cache.md` lines 685-741

View File

@@ -2,7 +2,7 @@
id: DEV-000
title: "Epic: Development & Validation Framework"
status: Done
assignee: james
assignee: jamiepine
priority: Medium
tags: [epic, core, testing, dev-infra]
whitepaper: Section 6.3

View File

@@ -2,7 +2,7 @@
id: DEV-001
title: Develop Multi-Process Test Framework
status: Done
assignee: james
assignee: jamiepine
parent: DEV-000
priority: High
tags: [testing, dev-infra, networking]

View File

@@ -2,7 +2,7 @@
id: FILE-000
title: "Epic: File Operations"
status: Done
assignee: james
assignee: jamiepine
priority: High
tags: [epic, core, file-ops]
whitepaper: Section 4

View File

@@ -2,7 +2,7 @@
id: FILE-001
title: File Copy Job with Strategy Pattern
status: Done
assignee: james
assignee: jamiepine
parent: FILE-000
priority: High
tags: [core, jobs, file-ops, vdfs]

View File

@@ -2,7 +2,7 @@
id: FILE-002
title: File Deletion Job
status: Done
assignee: james
assignee: jamiepine
parent: FILE-000
priority: High
tags: [core, jobs, file-ops]

View File

@@ -2,7 +2,7 @@
id: FILE-003
title: Cloud Volume File Operations
status: To Do
assignee: james
assignee: jamiepine
parent: FILE-000
priority: High
tags: [core, file-ops, cloud, jobs]

View File

@@ -1,8 +1,8 @@
---
id: FSYNC-000
title: File Sync System (Epic)
title: File Sync
status: In Progress
assignee: james
assignee: jamiepine
parent: null
priority: High
tags: [sync, service, epic, index-driven]

View File

@@ -2,7 +2,7 @@
id: FSYNC-001
title: DeleteJob Strategy Pattern & Remote Deletion
status: Done
assignee: james
assignee: jamiepine
parent: FSYNC-000
priority: High
tags: [delete, strategy, remote, networking]

View File

@@ -2,7 +2,7 @@
id: FSYNC-002
title: Database Schema & Entities
status: Done
assignee: james
assignee: jamiepine
parent: FSYNC-000
priority: High
tags: [database, schema, migration, entities]

View File

@@ -2,7 +2,7 @@
id: FSYNC-003
title: FileSyncService Core Implementation
status: To Do
assignee: james
assignee: jamiepine
parent: FSYNC-000
priority: High
tags: [service, core, orchestration, resolver]

View File

@@ -2,7 +2,7 @@
id: FSYNC-004
title: Service Integration & API
status: To Do
assignee: james
assignee: jamiepine
parent: FSYNC-000
priority: Medium
tags: [api, integration, routes, events]

View File

@@ -2,7 +2,7 @@
id: FSYNC-005
title: Advanced Features (Scheduling, Progress, Conflicts)
status: To Do
assignee: james
assignee: jamiepine
parent: FSYNC-000
priority: Medium
tags: [scheduler, progress, conflicts, polish]

View File

@@ -1,13 +1,31 @@
---
id: INDEX-000
title: "Epic: Indexing & File Management Engine"
title: "Epic: Hybrid Indexing Engine"
status: Done
assignee: james
assignee: jamiepine
priority: High
tags: [epic, core, indexing]
whitepaper: Section 4.3
last_updated: 2025-12-16
---
## Description
This epic encompasses the system responsible for discovering, processing, and managing user data. It includes the multi-phase indexing pipeline, real-time file system monitoring, and the intelligent volume management system that understands the physical characteristics of storage.
The hybrid indexing engine is Spacedrive's core filesystem discovery and processing system. It layers an ultra-fast, in-memory ephemeral index over a robust SQLite-backed persistent index, enabling instant browsing of unmanaged locations (like a file manager) while seamlessly upgrading paths to managed libraries (like a DAM) without UI flicker.
## Architecture
- **Ephemeral Layer**: Memory-resident index for instant browsing of external drives and unmanaged paths
- **Persistent Layer**: SQLite-backed index with full change tracking, sync, and content analysis
- **Five-Phase Pipeline**: Discovery → Processing → Aggregation → Content Identification → Finalizing
- **Change Detection**: Dual-mode system with batch ChangeDetector and real-time ChangeHandler trait
- **Database Architecture**: Closure tables for O(1) hierarchy queries and directory path caching
## Key Features
- Instant browsing of millions of files in RAM (~50 bytes per entry)
- Seamless promotion from ephemeral to persistent with UUID preservation
- Multi-phase indexing with resumable jobs
- Real-time filesystem watching via unified ChangeHandler
- Intelligent rules engine with .gitignore integration
- Index verification and integrity checking

View File

@@ -0,0 +1,127 @@
---
id: INDEX-001
title: Hybrid Indexing Architecture (Ephemeral + Persistent)
status: Done
assignee: jamiepine
parent: INDEX-000
priority: High
tags: [indexing, architecture, ephemeral, persistent]
whitepaper: Section 4.3.1
last_updated: 2025-12-16
---
## Description
Implement the dual-layer indexing architecture that enables Spacedrive to act as both a fast file explorer (ephemeral mode) and a managed library system (persistent mode). This architecture allows instant browsing of unmanaged locations while seamlessly upgrading them to fully-indexed locations without UI disruption.
## Architecture
### Ephemeral Layer ("File Manager" Mode)
The ephemeral layer provides instant filesystem browsing without database writes:
- **Memory-Resident**: All data lives in RAM via `EphemeralIndex`
- **Highly Optimized**: NodeArena slab allocator + NameCache string interning (~50 bytes/entry)
- **Massive Scale**: Can index millions of files in memory
- **Zero Database I/O**: Bypasses SQLite entirely
- **Real-Time Updates**: Filesystem events update in-memory structures via `MemoryAdapter`
### Persistent Layer ("Library" Mode)
The persistent layer provides full database-backed indexing with sync and content analysis:
- **SQLite-Backed**: All entries stored in database with closure tables
- **Cross-Device Sync**: Changes propagate via library sync protocol
- **Content Analysis**: BLAKE3 hashing, file type detection, metadata extraction
- **Change Tracking**: Full history via sync log
- **Real-Time Updates**: Filesystem events update database via `DatabaseAdapter`
### Seamless State Promotion
The critical innovation is UUID preservation during ephemeral-to-persistent transitions:
1. User browses external drive in ephemeral mode (UUIDs assigned in RAM)
2. User adds location to library
3. System detects existing ephemeral index for that path
4. Indexer carries over ephemeral UUIDs into database (`state.ephemeral_uuids`)
5. UI remains stable (selections, active tabs, view state preserved)
6. Indexer proceeds from Phase 2 (Processing) onward
## Implementation Files
### Ephemeral Layer
- `core/src/ops/indexing/ephemeral/mod.rs` - Module definitions
- `core/src/ops/indexing/ephemeral/index.rs` - EphemeralIndex main structure
- `core/src/ops/indexing/ephemeral/cache.rs` - EphemeralIndexCache for tracking indexed paths
- `core/src/ops/indexing/ephemeral/arena.rs` - NodeArena slab allocator
- `core/src/ops/indexing/ephemeral/name.rs` - NameCache string interning
- `core/src/ops/indexing/ephemeral/registry.rs` - NameRegistry for name-based lookups
- `core/src/ops/indexing/ephemeral/writer.rs` - MemoryAdapter for writing to ephemeral index
- `core/src/ops/indexing/ephemeral/responder.rs` - Filesystem event handling
- `core/src/ops/indexing/ephemeral/types.rs` - FileNode and related types
### Persistent Layer
- `core/src/ops/indexing/database_storage.rs` - DatabaseStorage low-level CRUD operations
- `core/src/ops/indexing/persistence.rs` - DatabaseAdapter for IndexPersistence trait
- `core/src/ops/indexing/handlers/persistent.rs` - DatabaseAdapter for ChangeHandler trait
### Integration
- `core/src/ops/indexing/state.rs` - IndexerState with `ephemeral_uuids` field
- `core/src/ops/indexing/job.rs` - IndexerJob orchestration
- `core/src/ops/indexing/input.rs` - IndexerJobConfig with ephemeral/persistent modes
## Acceptance Criteria
- [x] EphemeralIndex can index directories entirely in RAM
- [x] NameCache interns duplicate filenames (e.g., "index.js" stored once)
- [x] NodeArena uses 32-bit entry IDs instead of 64-bit pointers
- [x] Memory usage is ~50 bytes per file entry
- [x] MemoryAdapter implements ChangeHandler for real-time ephemeral updates
- [x] DatabaseAdapter implements both IndexPersistence and ChangeHandler
- [x] Ephemeral-to-persistent promotion preserves UUIDs via IndexerState
- [x] UI doesn't flicker or reset state during promotion
- [x] EphemeralIndexCache tracks which paths are indexed/watching
- [x] Multiple directory trees can coexist in the same EphemeralIndex
- [x] Filesystem events route to correct adapter (ephemeral vs persistent)
## Testing
### Manual Testing
```bash
# Test ephemeral browsing
spacedrive index browse /media/usb --ephemeral
# Verify in-memory only (no database writes)
spacedrive db query "SELECT COUNT(*) FROM entry WHERE name LIKE '%usb%'"
# Should return 0
# Add location while browsing (test promotion)
spacedrive location add /media/usb
# Verify UUIDs preserved (no UI flicker)
```
### Integration Tests
Located in `core/tests/indexing/`:
- `test_ephemeral_indexing` - Memory-only indexing
- `test_ephemeral_to_persistent_promotion` - UUID preservation
- `test_ephemeral_memory_usage` - Verify ~50 bytes/entry
- `test_ephemeral_string_interning` - NameCache deduplication
## Performance Characteristics
| Mode | Storage | Throughput | Memory/File | Sync | Survives Restart |
|------|---------|------------|-------------|------|------------------|
| Ephemeral | RAM | ~50K files/sec | ~50 bytes | No | No |
| Persistent | SQLite | ~10K files/sec | ~200 bytes | Yes | Yes |
## Related Tasks
- INDEX-002 - Five-Phase Indexing Pipeline
- INDEX-006 - Data Structures & Memory Optimizations
- INDEX-004 - Change Detection System (ChangeHandler trait)

View File

@@ -1,26 +0,0 @@
---
id: INDEX-001
title: Location Watcher Service
status: Done
assignee: james
parent: INDEX-000
priority: High
tags: [indexing, watcher, real-time]
whitepaper: Section 4.3.3
last_updated: 2025-12-02
---
## Description
The `LocationWatcher` service, which provides real-time monitoring of filesystem events within indexed locations, is implemented. This is crucial for keeping the VDFS index up-to-date without requiring frequent, expensive rescans.
## Implementation Notes
- A cross-platform file system watching library (`notify`) is integrated.
- The `LocationWatcher` service monitors multiple locations simultaneously.
- The service translates raw filesystem events into VDFS-specific events (e.g., `FileCreated`, `FileModified`, `FileDeleted`).
- The service dispatches these events to an `EventBus` for other services to consume.
## Acceptance Criteria
- [x] The `LocationWatcher` can be started and stopped gracefully.
- [x] The service correctly detects file creation, modification, and deletion events.
- [x] The service dispatches VDFS-specific events to the `EventBus`.

View File

@@ -0,0 +1,170 @@
---
id: INDEX-002
title: Five-Phase Indexing Pipeline
status: Done
assignee: jamiepine
parent: INDEX-000
priority: High
tags: [indexing, pipeline, phases]
whitepaper: Section 4.3.2
last_updated: 2025-12-16
---
## Description
Implement the multi-phase indexing pipeline that breaks filesystem discovery and processing into atomic, resumable stages. The ephemeral engine runs only Phase 1 (Discovery), while the persistent engine runs all five phases with full database writes and content analysis.
## Phase Architecture
### Phase 1: Discovery
**Used by: Ephemeral & Persistent**
Parallel filesystem walk optimized for raw speed:
- **Work-Stealing Parallelism**: Multiple threads scan concurrently, communicating via channels
- **Rules Engine Integration**: IndexerRuler filters at discovery edge (`.git`, `node_modules`, `.gitignore`)
- **Lightweight Output**: Stream of `DirEntry` objects
- **Progress Tracking**: Measured by directories discovered
- **Batching**: Collects 1,000 entries before moving to processing
**Implementation**: `core/src/ops/indexing/phases/discovery.rs`
### Phase 2: Processing
**Used by: Persistent Only**
Converts discovered entries into database records:
- **Topology Sorting**: Entries sorted by depth (parents before children)
- **Batch Transactions**: 1,000 items per transaction to minimize SQLite locking
- **Change Detection**: ChangeDetector compares filesystem vs database (New/Modified/Moved/Deleted)
- **UUID Preservation**: Carries over ephemeral UUIDs via `state.ephemeral_uuids`
- **Boundary Validation**: Ensures indexing path stays within location boundaries
- **Closure Table Updates**: Inserts ancestor-descendant pairs for hierarchy
- **Directory Path Cache**: Updates `directory_paths` table for O(1) lookups
**Implementation**: `core/src/ops/indexing/phases/processing.rs`
### Phase 3: Aggregation
**Used by: Persistent Only**
Bottom-up recursive statistics calculation:
- **Closure Table Queries**: O(1) descendant lookups
- **Leaf-to-Root Traversal**: Calculates sizes from deepest directories upward
- **Aggregates Stored**:
- `aggregate_size` - Total bytes including subdirectories
- `child_count` - Direct children only
- `file_count` - Recursive file count
Enables instant "True Size" sorting without traversing descendants.
**Implementation**: `core/src/ops/indexing/phases/aggregation.rs`
### Phase 4: Content Identification
**Used by: Persistent Only**
Content addressable storage via BLAKE3 hashing:
- **BLAKE3 Hashing**: Generates content hashes for deduplication
- **Globally Deterministic UUIDs**: v5 UUIDs from content hash (offline duplicate detection)
- **Sync Ordering**: Content identities synced before entries (foreign key safety)
- **File Type Detection**: FileTypeRegistry populates `kind_id` and `mime_type_id`
- **Link to Content Records**: Entries reference shared `content_identity` table
**Implementation**: `core/src/ops/indexing/phases/content.rs`
### Phase 5: Finalizing
**Used by: Persistent Only**
Post-processing and processor dispatch:
- **Directory Aggregation Updates**: Final aggregate calculations
- **Processor Dispatch**: Triggers thumbnail generation for Deep Mode
- **Cleanup**: Marks indexing as complete
**Implementation**: Handled in `core/src/ops/indexing/job.rs`
## Implementation Files
### Phase Implementations
- `core/src/ops/indexing/phases/discovery.rs` - Phase 1
- `core/src/ops/indexing/phases/processing.rs` - Phase 2
- `core/src/ops/indexing/phases/aggregation.rs` - Phase 3
- `core/src/ops/indexing/phases/content.rs` - Phase 4
- `core/src/ops/indexing/phases/mod.rs` - Phase enum and orchestration
### Orchestration
- `core/src/ops/indexing/job.rs` - IndexerJob runs phases sequentially
- `core/src/ops/indexing/state.rs` - IndexerState tracks current phase and progress
- `core/src/ops/indexing/progress.rs` - Progress reporting per phase
## Acceptance Criteria
- [x] Phase 1 (Discovery) runs in both ephemeral and persistent modes
- [x] Phases 2-5 only run for persistent indexing
- [x] Each phase is resumable (state preserved in IndexerState)
- [x] Discovery uses work-stealing parallelism (8+ threads on capable systems)
- [x] Processing sorts entries by depth (parents before children)
- [x] Processing batches database writes (1,000 items/transaction)
- [x] ChangeDetector detects New/Modified/Moved/Deleted during processing
- [x] Aggregation uses closure table for O(1) descendant queries
- [x] Content phase generates BLAKE3 hashes
- [x] Content phase creates globally deterministic v5 UUIDs
- [x] FileTypeRegistry identifies file types during content phase
- [x] Progress tracking works for all phases
- [x] Job can pause/resume at any phase boundary
- [x] Ephemeral UUID preservation works in Phase 2
## Indexing Modes
The pipeline supports three depth modes:
| Mode | Phases Run | Speed | Use Case |
|------|-----------|-------|----------|
| Shallow | 1, 2, 3 | Fast | UI navigation, quick scan |
| Content | 1, 2, 3, 4 | Medium | Normal indexing with dedup |
| Deep | 1, 2, 3, 4, 5 | Slow | Media libraries with thumbnails |
## Indexing Scopes
| Scope | Behavior | Use Case |
|-------|----------|----------|
| Current | Index immediate directory only | Responsive UI navigation |
| Recursive | Index entire tree | Full location indexing |
## Performance Characteristics
| Configuration | Performance | Notes |
|--------------|-------------|-------|
| Current + Shallow | <500ms | No subdirectories |
| Recursive + Shallow | ~10K files/sec | Metadata only |
| Recursive + Content | ~1K files/sec | With BLAKE3 hashing |
| Recursive + Deep | ~100 files/sec | Full analysis + thumbnails |
## Resumability
Each phase stores sufficient state in `IndexerState` to resume:
```rust
pub struct IndexerState {
pub phase: Phase,
pub dirs_to_walk: VecDeque<PathBuf>,
pub entry_batches: Vec<Vec<DirEntry>>,
pub entry_id_cache: HashMap<PathBuf, i32>,
pub ephemeral_uuids: HashMap<PathBuf, Uuid>,
pub stats: IndexerStats,
}
```
When interrupted:
1. State serialized to jobs database (MessagePack)
2. On resume, job loads state and continues from saved phase
3. No work is lost
## Related Tasks
- INDEX-001 - Hybrid Architecture (defines ephemeral vs persistent)
- INDEX-003 - Database Architecture (closure tables used in Phase 3)
- INDEX-004 - Change Detection (ChangeDetector used in Phase 2)
- INDEX-005 - Indexer Rules (filters in Phase 1)
- JOB-000 - Job System (provides resumability infrastructure)

View File

@@ -1,27 +0,0 @@
---
id: INDEX-002
title: Stale File Detection Algorithm
status: To Do
assignee: james
parent: INDEX-000
priority: High
tags: [indexing, stale-detection, offline-recovery]
whitepaper: Section 4.3.4
---
## Description
Implement the algorithm for detecting stale files after the application has been offline. This is a critical part of the indexing process, ensuring that changes made while the application was not running are correctly detected and reconciled.
## Implementation Steps
1. Design the algorithm for stale file detection, likely using a combination of inode numbers, modification times, and file sizes.
2. Implement the algorithm as part of the `IndexerJob`'s startup process.
3. The algorithm should be able to handle edge cases like file renames and moves.
4. The algorithm should be efficient and not significantly slow down the application's startup time.
## Acceptance Criteria
- [ ] The system can correctly detect files that were modified or deleted while the application was offline.
- [ ] The system can correctly detect files that were moved or renamed while the application was offline.
- [ ] The stale file detection process is efficient and does not block the application for an unreasonable amount of time.

View File

@@ -0,0 +1,244 @@
---
id: INDEX-003
title: Database Architecture (Closure Table & Directory Paths Cache)
status: Done
assignee: jamiepine
parent: INDEX-000
priority: High
tags: [indexing, database, closure-table, performance]
whitepaper: Section 4.3.5
last_updated: 2025-12-16
related_tasks: [CORE-004]
---
## Description
Implement the specialized database schema optimizations that enable fast hierarchy queries and path lookups. Instead of recursive queries, use precomputed closure tables for O(1) "find all descendants" operations and a directory paths cache for instant absolute path resolution.
## Architecture
### Closure Table
The `entry_closure` table stores all transitive ancestor-descendant relationships with precomputed depths:
```sql
CREATE TABLE entry_closure (
ancestor_id INTEGER,
descendant_id INTEGER,
depth INTEGER,
PRIMARY KEY (ancestor_id, descendant_id)
);
```
#### Example Hierarchy
For `/home/user/docs/report.pdf`:
```
home/ (id=1)
└─ user/ (id=2)
└─ docs/ (id=3)
└─ report.pdf (id=4)
```
#### Closure Table Entries
```sql
-- Self-references (depth 0)
(1, 1, 0) -- home → home
(2, 2, 0) -- user → user
(3, 3, 0) -- docs → docs
(4, 4, 0) -- report.pdf → report.pdf
-- Direct relationships (depth 1)
(1, 2, 1) -- home → user
(2, 3, 1) -- user → docs
(3, 4, 1) -- docs → report.pdf
-- Transitive relationships
(1, 3, 2) -- home → docs
(2, 4, 2) -- user → report.pdf
(1, 4, 3) -- home → report.pdf
```
#### Query Benefits
```sql
-- Find all descendants of "home" (O(1) regardless of depth)
SELECT descendant_id, depth
FROM entry_closure
WHERE ancestor_id = 1 AND depth > 0;
-- Find all ancestors of "report.pdf"
SELECT ancestor_id, depth
FROM entry_closure
WHERE descendant_id = 4 AND depth > 0;
-- Find direct children only
SELECT descendant_id
FROM entry_closure
WHERE ancestor_id = 1 AND depth = 1;
```
#### Move Operations
When moving a subtree, rebuild the closures for the entire moved branch:
```rust
// Moving /home/user/docs to /home/archive/docs
// Affects thousands of rows for large directories
async fn rebuild_closure_for_subtree(entry_id: i32, db: &DatabaseConnection) -> Result<()> {
// 1. Delete old closures for moved subtree
// 2. Recompute closures based on new parent_id
// 3. Insert new closure rows
}
```
**Cost**: O(N²) worst-case for deeply nested trees, but acceptable for typical hierarchies.
### Directory Paths Cache
The `directory_paths` table caches full absolute paths for O(1) lookups:
```sql
CREATE TABLE directory_paths (
entry_id INTEGER PRIMARY KEY,
path TEXT UNIQUE
);
```
#### Example Entries
```sql
INSERT INTO directory_paths VALUES
(1, '/home'),
(2, '/home/user'),
(3, '/home/user/docs');
```
#### Benefits
- **O(1) Path Resolution**: No recursive parent traversal needed
- **Instant Child Path Construction**: `parent_path + "/" + child_name`
- **Fast Path-Based Queries**: Direct lookup by full path
#### Maintenance
- **Create**: Insert on directory creation
- **Move**: Update path and all descendant paths
- **Delete**: Remove on directory deletion
### Entries Table
Core filesystem metadata storage:
```sql
CREATE TABLE entry (
id INTEGER PRIMARY KEY,
uuid UUID UNIQUE,
parent_id INTEGER,
name TEXT,
extension TEXT,
kind INTEGER,
size BIGINT,
inode BIGINT,
content_id INTEGER,
aggregate_size BIGINT, -- Calculated in Phase 3
child_count INTEGER, -- Calculated in Phase 3
file_count INTEGER -- Calculated in Phase 3
);
```
## Implementation Files
### Closure Table Management
- `core/src/ops/indexing/hierarchy.rs` - Closure table insert/update/delete operations
- `core/src/ops/indexing/database_storage.rs` - Low-level CRUD with closure updates
### Directory Path Caching
- `core/src/ops/indexing/path_resolver.rs` - Path resolution and caching
- `core/src/ops/indexing/database_storage.rs` - Directory path cache updates
### Database Operations
- `core/src/ops/indexing/database_storage.rs` - DatabaseStorage with closure integration
- `core/src/ops/indexing/phases/processing.rs` - Closure creation during Phase 2
- `core/src/ops/indexing/phases/aggregation.rs` - Closure queries for aggregation
## Acceptance Criteria
- [x] Closure table stores all ancestor-descendant pairs
- [x] Self-references included (depth 0)
- [x] Depth correctly calculated for all relationships
- [x] Find descendants query is O(1) regardless of nesting depth
- [x] Find ancestors query is O(1)
- [x] Move operations correctly rebuild closures for moved subtree
- [x] Directory paths cache stores full absolute paths
- [x] Path lookups are O(1) (no recursive traversal)
- [x] Moving directories updates descendant paths in cache
- [x] Deleting directories removes from cache
- [x] Aggregates (aggregate_size, child_count, file_count) calculated via closure table
- [x] Phase 2 creates closure entries for new files
- [x] Phase 3 uses closure table for bottom-up aggregation
## Performance Impact
| Operation | Without Closure Table | With Closure Table |
|-----------|---------------------|-------------------|
| Find all descendants | O(N) recursive | O(1) single query |
| Calculate directory size | O(N) traversal | O(1) precomputed |
| Find ancestors | O(depth) | O(1) single query |
| Move directory | O(1) update | O(subtree) rebuild |
**Trade-off**: Storage cost (N² worst-case) for query speed (O(1) reads).
## Storage Cost
For a typical hierarchy:
- **Flat directory (100 files)**: 100 self-references + 100 parent-child rows = 200 closure rows (plus one self-reference for the directory itself)
- **Deep nesting (10 levels, 10 items/level)**: ~5,000 closure rows
- **Pathological (1 file, 1000 levels deep)**: ~500,000 closure rows
In practice, filesystem hierarchies are relatively balanced, keeping storage overhead reasonable.
## Testing
### Manual Testing
```bash
# Index a deep directory
spacedrive index location ~/Documents --mode shallow
# Check closure table populated
spacedrive db query "SELECT COUNT(*) FROM entry_closure"
# Verify O(1) descendant query
spacedrive db query "
SELECT COUNT(*)
FROM entry_closure
WHERE ancestor_id = (SELECT id FROM entry WHERE name = 'Documents')
"
# Test move operation
mv ~/Documents/Work ~/Documents/Archive/Work
# Verify closures rebuilt correctly
spacedrive db query "
SELECT * FROM entry_closure
WHERE descendant_id = (SELECT id FROM entry WHERE name = 'Work')
"
```
### Integration Tests
Located in `core/tests/indexing/`:
- `test_closure_table_creation` - Verify closures created during indexing
- `test_closure_table_queries` - Test O(1) descendant queries
- `test_move_rebuilds_closures` - Verify move updates closures
- `test_directory_path_cache` - Test O(1) path lookups
- `test_aggregation_uses_closures` - Verify Phase 3 uses closure table
## Related Tasks
- INDEX-002 - Five-Phase Pipeline (Phase 2 creates closures, Phase 3 uses them)
- INDEX-004 - Change Detection (Move detection triggers closure rebuild)
- CORE-004 - Closure Table (base implementation)

View File

@@ -1,396 +0,0 @@
---
id: INDEX-003
title: Fix Watcher Device Ownership Violation (CRITICAL)
status: To Do
assignee: james
priority: Critical
tags: [watcher, sync, bug, security]
last_updated: 2025-10-23
related_tasks: [INDEX-001, LSYNC-010]
---
# Fix Watcher Device Ownership Violation (CRITICAL)
## Problem Statement
**CRITICAL BUG**: The location watcher violates device ownership by watching and modifying locations owned by other devices.
### Bug Scenario
1. Device A creates location `/Users/jamespine/Desktop` → `device_id = A`
2. Location syncs to Device B's database
3. Device B also has `/Users/jamespine/Desktop` on its local filesystem
4. Device B's watcher **incorrectly** starts watching Device A's location
5. When files change on Device B's desktop, the watcher triggers the indexer
6. Device B modifies Device A's entries → **OWNERSHIP VIOLATION**
### Current Code (Bug)
`core/src/service/watcher/mod.rs:~493`
```rust
// Load all locations from database (NO DEVICE CHECK!)
let locations = entities::location::Entity::find()
.all(db) // <-- Gets ALL devices' locations
.await?;
for location in locations {
let path = PathResolver::get_full_path(db, entry_id).await?;
// If path exists locally, start watching
if path.exists() {
self.add_location(watched_location).await?; // BUG!
}
}
```
## Impact
**Severity**: CRITICAL - Data corruption / sync integrity violation
**Consequences**:
- Device B can modify Device A's location metadata
- Entries get incorrectly attributed to wrong devices
- Sync state becomes corrupted
- Cannot determine authoritative source of changes
- Breaks the fundamental device-owned sync model
**Reproduction**:
1. Have two devices with same username (common: `/Users/jamespine/`)
2. Add Desktop location on Device A
3. Sync to Device B
4. Create a file on Device B's desktop
5. **Bug**: Device B's watcher modifies Device A's location
## Root Cause
**TWO separate bugs:**
### Bug 1: Watcher loads all devices' locations
The watcher's `load_locations_from_database()` method (line ~493) queries **all locations** without filtering by device ownership:
```rust
let locations = entities::location::Entity::find()
.all(db) // Gets locations from ALL devices
.await?;
```
It then checks if the path exists locally and starts watching if it does, **regardless of which device owns the location**.
### Bug 2: Responder looks up parents by path without location scoping
The responder's `create_entry()` looks up parent entries by path string:
```rust
// entry.rs:234-236
entities::directory_paths::Entity::find()
.filter(entities::directory_paths::Column::Path.eq(&parent_path_str))
.one(ctx.library_db())
```
If Device A and Device B both have `/Users/jamespine/Desktop` locations, this query could return **EITHER** device's entry (whichever is first in the table).
**Impact**: Even with Bug 1 fixed, if `location_id` isn't used to scope parent lookup, entries could be created under the wrong location's tree.
## Solution
### Fix 1: Filter Locations by Device (DONE ✅)
**File**: `core/src/service/watcher/mod.rs:487-520`
Only watch locations owned by the current device:
```rust
// Get current device UUID
let current_device_uuid = crate::device::get_current_device_id();
// Get device's integer ID
let current_device = device::Entity::find()
.filter(device::Column::Uuid.eq(current_device_uuid))
.one(db)
.await?;
// Filter locations by current device
let locations = location::Entity::find()
.filter(location::Column::DeviceId.eq(current_device.id))
.all(db)
.await?;
```
### Fix 2: Add Safety Check in add_location() (DONE ✅)
**File**: `core/src/service/watcher/mod.rs:356-388`
Add runtime ownership validation before watching:
```rust
pub async fn add_location(&self, location: WatchedLocation) -> Result<()> {
// Verify this device owns the location
let location_record = entities::location::Entity::find()
.filter(entities::location::Column::Uuid.eq(location.id))
.one(db)
.await?
.ok_or_else(|| anyhow!("Location not found"))?;
let current_device_id = self.context.device().id();
if location_record.device_id != current_device_id {
return Err(anyhow!(
"Cannot watch location {} owned by device {} (current device: {})",
location.id,
location_record.device_id,
current_device_id
));
}
// ... rest of add_location logic
}
```
### Fix 3: Add Integration Test
```rust
#[tokio::test]
async fn test_watcher_respects_device_ownership() {
let (device_a, device_b) = setup_paired_devices().await;
// Device A creates location
let location_a = create_location(device_a, "/Users/test/Desktop").await;
// Sync to device B
wait_for_sync().await;
// Device B should NOT watch device A's location
let watched = device_b.watcher().get_watched_locations().await;
assert!(!watched.iter().any(|l| l.id == location_a.uuid));
// Device B creates its own location with same path
let location_b = create_location(device_b, "/Users/test/Desktop").await;
// Device B SHOULD watch its own location
let watched = device_b.watcher().get_watched_locations().await;
assert!(watched.iter().any(|l| l.id == location_b.uuid));
}
```
## Implementation Plan
### Phase 1: Watcher Device Filtering (DONE ✅)
**Files**: `core/src/service/watcher/mod.rs`
- [x] Filter locations by device_id in `load_locations_from_database()`
- [x] Add runtime ownership check in `add_location()`
- [x] Add necessary SeaORM imports
### Phase 2: Scope Responder Path Lookups (TODO - CRITICAL)
**Files**: `core/src/ops/indexing/responder.rs`
Required changes:
1. Thread `location_id` through all handlers:
- `handle_modify(ctx, path, location_id, ...)`
- `handle_remove(ctx, path, location_id, ...)`
- `handle_rename(ctx, from, to, location_id, ...)`
2. Look up location root entry ID at start of each handler:
```rust
let location_record = location::Entity::find()
.filter(location::Column::Uuid.eq(location_id))
.one(db).await?;
let location_root_entry_id = location_record.entry_id.unwrap();
```
3. Update resolve functions to accept and use `location_root_entry_id`:
- `resolve_directory_entry_id(ctx, path, location_root_entry_id)`
- `resolve_file_entry_id(ctx, path, location_root_entry_id)`
4. Add `entry_closure` JOIN to scope queries:
```sql
SELECT dp.entry_id
FROM directory_paths dp
INNER JOIN entry_closure ec ON ec.descendant_id = dp.entry_id
WHERE dp.path = ? AND ec.ancestor_id = ?
```
5. Update `entry.rs` parent lookup (line 234) with same pattern
**Estimated time**: 2-3 hours
### Phase 3: Testing (1-2 hours)
**File**: `core/tests/indexing_multi_device_test.rs` (new)
```rust
#[tokio::test]
async fn test_responder_scopes_to_correct_location() {
// Setup: Two devices, both with /Users/test/Desktop
let (device_a, device_b) = setup_paired_devices().await;
let location_a = create_location(device_a, "/Users/test/Desktop").await;
let location_b = create_location(device_b, "/Users/test/Desktop").await;
// Both create test.txt on their respective desktops
create_file(device_a, "/Users/test/Desktop/test.txt").await;
create_file(device_b, "/Users/test/Desktop/test.txt").await;
// Verify each device's watcher only modified its own location's entries
let entries_a = get_entries_for_location(location_a.id).await;
let entries_b = get_entries_for_location(location_b.id).await;
assert_eq!(entries_a.len(), 2); // Desktop + test.txt
assert_eq!(entries_b.len(), 2); // Desktop + test.txt
// Verify no cross-contamination
assert!(entries_a.iter().all(|e| is_descendant_of(e.id, location_a.entry_id)));
assert!(entries_b.iter().all(|e| is_descendant_of(e.id, location_b.entry_id)));
}
```
### Phase 4: Audit (30 minutes)
Check for similar path-based queries elsewhere:
- `PathResolver::get_full_path()` - Does it need scoping?
- File operations (copy/move/delete) - Do they scope by location?
- Any other `directory_paths WHERE path = ?` queries
## Acceptance Criteria
### Phase 1 (Watcher) - COMPLETED ✅
- [x] Watcher filters locations by current device ID
- [x] `add_location()` validates device ownership
- [x] Builds successfully
- [x] Device A's location not watched on Device B
### Phase 2 (Responder) - TODO
- [ ] All handlers receive `location_id` parameter
- [ ] `resolve_directory_entry_id()` scoped by location using `entry_closure` JOIN
- [ ] `resolve_file_entry_id()` scoped by location using `entry_closure` JOIN
- [ ] `entry.rs` parent lookup scoped by location
- [ ] Integration test passes for multi-device same-path scenario
- [ ] Existing tests still pass
- [ ] Both devices can have same path without cross-contamination
## Testing Strategy
### Manual Test
```bash
# On Device A
sd location add "/Users/jamespine/Desktop"
# On Device B (after sync)
sd sync wait # Wait for location to sync
# Verify Device B is NOT watching Device A's location
sd watcher status
# Should show 0 watched locations (or only Device B's own locations)
# Create a file on Device B's desktop
touch "/Users/jamespine/Desktop/test.txt"
# Wait a few seconds for watcher
sleep 5
# Query Device A's location entries from Device B
sd --instance jam query entries --location <device-a-location-id>
# Should NOT include test.txt (Device B didn't modify Device A's location)
```
### Integration Test
Run test suite with device ownership checks enabled:
```bash
cargo test --lib watcher::test_watcher_respects_device_ownership
```
## Migration Notes
**Breaking Change**: If any users have been affected by this bug, their databases may contain corrupted entries where Device B modified Device A's location.
**Cleanup Strategy** (optional future work):
1. Query for entries where `location.device_id != device_that_created_entry`
2. Mark these as "orphaned" or "corrupted"
3. Allow user to reassign to correct device or delete
### Fix 4: Scope Responder Path Lookups by Location (TODO - CRITICAL)
**Problem**: The responder's resolve functions query entries by path alone, without location scoping:
**Vulnerable functions** (`core/src/ops/indexing/responder.rs`):
- `resolve_directory_entry_id()` (line 397) - Used by modify/remove/rename
- `resolve_file_entry_id()` (line 415) - Used by modify/remove/rename
- Parent lookup in `entry.rs` (line 234) - Used by create
**Current behavior (BROKEN)**:
```rust
// Queries by path only - could match ANY device's entry!
directory_paths::Entity::find()
.filter(directory_paths::Column::Path.eq(path_str))
.one(db)
```
**Correct approach (like the indexer)**:
```rust
// Scope to location's entry tree using entry_closure
async fn resolve_directory_entry_id_scoped(
ctx: &impl IndexingCtx,
abs_path: &Path,
location_root_entry_id: i32, // <-- Add this
) -> Result<Option<i32>> {
let path_str = abs_path.to_string_lossy().to_string();
// Query directory_paths and JOIN with entry_closure to scope by location
let result = ctx.library_db()
.query_one(Statement::from_sql_and_values(
DbBackend::Sqlite,
r#"
SELECT dp.entry_id
FROM directory_paths dp
INNER JOIN entry_closure ec ON ec.descendant_id = dp.entry_id
WHERE dp.path = ?
AND ec.ancestor_id = ?
"#,
vec![path_str.into(), location_root_entry_id.into()],
))
.await?;
Ok(result.map(|row| row.try_get::<i32>("", "entry_id").ok()).flatten())
}
```
**Implementation plan**:
1. Thread `location_id` through all responder handlers
2. Look up `location_root_entry_id` once at start of each handler
3. Pass to all resolve functions
4. Add JOIN with `entry_closure` to scope queries
5. Apply same pattern to `entry.rs` parent lookup
**Why this is better than cache**:
- Works across all operations (create/modify/remove/rename)
- Survives state resets and restarts
- Database-backed correctness (not in-memory heuristic)
- Matches proven indexer pattern
- Prevents cross-device contamination definitively
**Status**: TODO - requires refactoring responder signatures
## Comparison: Indexer vs Responder
| Aspect | Indexer | Responder (Current) | Responder (After Fix) |
|--------|---------|---------------------|----------------------|
| Has location_id? | Yes | Yes (but unused) | Yes (used) |
| Scoping method | `entry_closure` JOIN | None | `entry_closure` JOIN |
| Cache seeding | Yes (line 61-63) | Yes (my fix) | Yes (keep as optimization) |
| Path queries scoped? | Yes | No | Yes |
| Safe for multi-device? | Yes | No | Yes |
## Related Issues
- Entry device ownership filtering during sync (separate concern)
- Sync integrity validation
- Location transfer ownership on volume move
## References
- [Location Watcher Service](../../core/src/service/watcher/mod.rs)
- [LSYNC-010](./LSYNC-010-sync-service.md) - Device-owned sync model
- [INDEX-001](./INDEX-001-location-watcher-service.md) - Watcher architecture

View File

@@ -0,0 +1,256 @@
---
id: INDEX-004
title: Change Detection System (Batch + Real-Time)
status: Done
assignee: jamiepine
parent: INDEX-000
priority: High
tags: [indexing, change-detection, watcher, stale-detection]
whitepaper: Section 4.3.3
last_updated: 2025-12-16
---
## Description
Implement the dual-mode change detection system that keeps the index synchronized with filesystem state. Batch change detection runs during indexer jobs to detect offline changes (stale file detection), while real-time change detection processes filesystem watcher events as they occur.
## Architecture
### Batch Change Detection (ChangeDetector)
The `ChangeDetector` compares database state against filesystem during indexer scans:
```rust
pub struct ChangeDetector {
// Maps inode → EntryRecord for existing entries
inode_map: HashMap<u64, EntryRecord>,
// Maps path → EntryRecord for path-only matching (Windows fallback)
path_map: HashMap<PathBuf, EntryRecord>,
// Tracks which entries we've seen this scan
seen_entries: HashSet<i32>,
}
```
#### Detection Process
1. **Load Existing Entries**: Query database for all entries under indexing path
2. **Build Lookup Maps**: Create inode and path maps for fast comparisons
3. **Compare**: For each discovered filesystem entry, check against maps
4. **Classify Changes**:
- **New**: Path not in database
- **Modified**: Size or mtime differs
- **Moved**: Same inode at different path (Unix only)
- **Deleted**: In database but missing from filesystem
5. **Batch Process**: Execute changes in transactions
#### Inode Tracking
- **Unix**: Use stable inodes for move detection
- **Windows**: Fall back to path-only matching (file indices unstable across reboots)
```rust
impl ChangeDetector {
async fn check_path(
&self,
path: &Path,
metadata: &Metadata,
inode: Option<u64>,
) -> Option<Change> {
if let Some(inode) = inode {
// Unix: Check inode first (detects moves)
if let Some(existing) = self.inode_map.get(&inode) {
if existing.path != path {
return Some(Change::Moved { old: existing.path, new: path });
}
if existing.size != metadata.len() || existing.mtime != metadata.modified() {
return Some(Change::Modified { path });
}
return None; // Unchanged
}
}
// Not found by inode, check path
if let Some(existing) = self.path_map.get(path) {
if existing.size != metadata.len() || existing.mtime != metadata.modified() {
return Some(Change::Modified { path });
}
return None; // Unchanged
}
// Not in database
Some(Change::New { path })
}
fn find_deleted(&self) -> Vec<Change> {
self.path_map
.keys()
.filter(|path| !self.seen_entries.contains(&self.path_map[path].id))
.map(|path| Change::Deleted { path })
.collect()
}
}
```
### Real-Time Change Detection (ChangeHandler Trait)
The `ChangeHandler` trait defines the interface for responding to filesystem events:
```rust
pub trait ChangeHandler {
async fn find_by_path(&self, path: &Path) -> Result<Option<EntryRef>>;
async fn create(&mut self, metadata: &DirEntry, parent_path: &Path) -> Result<EntryRef>;
async fn update(&mut self, entry: &EntryRef, metadata: &DirEntry) -> Result<()>;
async fn move_entry(&mut self, entry: &EntryRef, old_path: &Path, new_path: &Path) -> Result<()>;
async fn delete(&mut self, entry: &EntryRef) -> Result<()>;
}
```
#### Implementations
**DatabaseAdapter** (Persistent):
- Writes to SQLite database
- Updates closure tables on moves
- Updates directory paths cache
- Creates sync operations for cross-device propagation
**MemoryAdapter** (Ephemeral):
- Updates EphemeralIndex in-memory structures
- Updates NameRegistry for name-based lookups
- No database I/O
#### Event Routing
The filesystem watcher routes events to the appropriate handler:
```rust
/// Routes one filesystem event to the adapter responsible for the event's path.
///
/// The ephemeral cache is consulted first; the persistent location lookup only
/// runs when no ephemeral index covers the path. An event that matches neither
/// is ignored and `Ok(())` is returned.
// NOTE(review): `db` is used below but never bound in this snippet — presumably a
// database handle reachable from `self`; confirm against the real implementation.
async fn handle_filesystem_event(&self, event: Event) -> Result<()> {
let path = event.path();
// Determine if this path belongs to ephemeral or persistent index
if let Some(ephemeral_index) = self.ephemeral_cache.get_index_for_path(path).await {
// Route to MemoryAdapter
let mut adapter = MemoryAdapter::new(ephemeral_index);
adapter.handle_change(event).await?;
} else if let Some(location) = self.find_location_for_path(path, db).await? {
// Route to DatabaseAdapter
let mut adapter = DatabaseAdapter::new(db, location.id);
adapter.handle_change(event).await?;
}
Ok(())
}
```
## Implementation Files
### Batch Change Detection
- `core/src/ops/indexing/change_detection/detector.rs` - ChangeDetector implementation
- `core/src/ops/indexing/change_detection/types.rs` - Change enum (New/Modified/Moved/Deleted)
- `core/src/ops/indexing/phases/processing.rs` - Integration into Phase 2
### Real-Time Change Detection
- `core/src/ops/indexing/change_detection/handler.rs` - ChangeHandler trait definition
- `core/src/ops/indexing/change_detection/persistent.rs` - DatabaseAdapter implementation
- `core/src/ops/indexing/handlers/persistent.rs` - DatabaseAdapter for ChangeHandler
- `core/src/ops/indexing/handlers/ephemeral.rs` - MemoryAdapter for ChangeHandler
- `core/src/ops/indexing/handlers/mod.rs` - Handler module exports
### Database Operations
- `core/src/ops/indexing/database_storage.rs` - Low-level CRUD used by DatabaseAdapter
- `core/src/ops/indexing/ephemeral/writer.rs` - In-memory operations used by MemoryAdapter
## Acceptance Criteria
### Batch Change Detection
- [x] ChangeDetector loads existing entries from database
- [x] Inode-based move detection works on Unix systems
- [x] Path-based fallback works on Windows
- [x] Detects New files (not in database)
- [x] Detects Modified files (size or mtime changed)
- [x] Detects Moved files (same inode, different path)
- [x] Detects Deleted files (in database, missing from filesystem)
- [x] Changes processed in batch transactions (1,000 items)
- [x] Integrated into Phase 2 (Processing)
### Real-Time Change Detection
- [x] ChangeHandler trait defines standard interface
- [x] DatabaseAdapter implements ChangeHandler for persistent storage
- [x] MemoryAdapter implements ChangeHandler for ephemeral storage
- [x] Filesystem events route to correct adapter (ephemeral vs persistent)
- [x] Create events insert new entries
- [x] Modify events update size/mtime
- [x] Move events update parent_id and rebuild closures
- [x] Delete events remove entries and closures
- [x] Directory path cache updated on create/move/delete
### Stale File Detection (Offline Changes)
**Note**: Automated stale detection on app startup is tracked separately in INDEX-009. The ChangeDetector provides the foundation but automatic reconciliation is not yet fully implemented.
## Platform-Specific Behavior
| Platform | Inode Support | Move Detection | Path Stability |
|----------|--------------|----------------|----------------|
| macOS | Yes (FSEvents) | Via inode | Stable |
| Linux | Yes | Via inode | Stable |
| Windows | Limited | Via path only | Unstable across reboots |
## Performance Characteristics
### Batch Change Detection
- **Load existing entries**: O(N) where N = entries in location
- **Build lookup maps**: O(N) hash map construction
- **Check each file**: O(1) hash lookup
- **Find deleted**: O(N) iteration
- **Total**: ~O(N) where N = files in location
### Real-Time Change Detection
- **Event routing**: O(1) hash lookup
- **Database write**: O(log N) SQLite insert
- **Closure update (move)**: O(subtree size)
- **Total per event**: ~O(1) to O(subtree) depending on operation
## Testing
### Manual Testing
```bash
# Test batch change detection (stale detection)
# 1. Index a directory
spacedrive index location ~/Documents --mode shallow
# 2. Stop Spacedrive
spacedrive stop
# 3. Make changes while offline
touch ~/Documents/new_file.txt
echo "modified" >> ~/Documents/existing.txt
mv ~/Documents/old.txt ~/Documents/renamed.txt
rm ~/Documents/deleted.txt
# 4. Restart and verify detection
spacedrive start
spacedrive index location ~/Documents --mode shallow
# Should detect: 1 new, 1 modified, 1 moved, 1 deleted
```
### Integration Tests
Located in `core/tests/indexing/`:
- `test_change_detector_new_files` - Detect new files
- `test_change_detector_modified_files` - Detect size/mtime changes
- `test_change_detector_moved_files_unix` - Detect moves via inode
- `test_change_detector_deleted_files` - Detect deleted files
- `test_change_handler_create` - Real-time create events
- `test_change_handler_modify` - Real-time modify events
- `test_change_handler_move` - Real-time move events
- `test_change_handler_delete` - Real-time delete events
- `test_stale_detection_after_offline` - Offline change detection
## Related Tasks
- INDEX-001 - Hybrid Architecture (defines DatabaseAdapter vs MemoryAdapter)
- INDEX-002 - Five-Phase Pipeline (Phase 2 uses ChangeDetector)
- INDEX-003 - Database Architecture (move operations rebuild closures)
- INDEX-009 - Stale File Detection (automated offline change reconciliation)

View File

@@ -0,0 +1,262 @@
---
id: INDEX-005
title: Indexer Rules Engine
status: Done
assignee: jamiepine
parent: INDEX-000
priority: Medium
tags: [indexing, rules, filtering, gitignore]
whitepaper: Section 4.3.6
last_updated: 2025-12-16
---
## Description
Implement the filtering rules system that allows selective indexing by skipping unwanted files at discovery time. The system supports toggleable system rules (hidden files, dev directories, OS folders) and dynamic .gitignore integration for Git repositories.
## Architecture
### IndexerRuler
The `IndexerRuler` applies rules during Phase 1 (Discovery) to filter files before they enter the processing pipeline:
```rust
/// Filter state consulted for every path during discovery.
///
/// Holds the three rule sources that `check_path` evaluates in order:
/// system rules, `.gitignore` patterns, then custom rules.
pub struct IndexerRuler {
// Toggleable system rules
enabled_rules: HashSet<SystemRule>,
// .gitignore patterns (loaded dynamically)
// `None` until `load_gitignore` finds a `.gitignore` file.
gitignore: Option<Gitignore>,
// Custom user rules
// Evaluated in vector order; the first matching rule decides.
custom_rules: Vec<Rule>,
}
/// Outcome of evaluating a path against the indexer rules.
///
/// `PartialEq`/`Eq` are derived because decisions are compared with `==`
/// (e.g. `check_path(..) == RulerDecision::Reject`). `Clone`/`Copy` are derived
/// because a rule's stored decision is returned by value from behind a shared
/// reference.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RulerDecision {
    Accept, // Include in index
    Reject, // Skip this file
}
```
### System Rules
Predefined patterns that can be toggled on/off:
| Rule | Pattern | Example Matches |
|------|---------|----------------|
| `NO_HIDDEN` | Files starting with `.` | `.git`, `.DS_Store`, `.env` |
| `NO_DEV_DIRS` | Common dev folders | `node_modules`, `target`, `dist`, `build` |
| `NO_SYSTEM` | OS system folders | `System32`, `Windows`, `/proc`, `/sys` |
| `NO_TEMP` | Temporary files | `*.tmp`, `*.temp`, `~*` |
| `NO_CACHE` | Cache directories | `.cache`, `__pycache__`, `.pytest_cache` |
### Git Integration
When indexing inside a Git repository, the ruler automatically loads `.gitignore`:
```rust
impl IndexerRuler {
    /// Loads `<repo_root>/.gitignore` into the ruler when that file exists.
    ///
    /// A missing file is not an error: the ruler is left unchanged and `Ok(())`
    /// is returned. Parse failures are propagated to the caller.
    pub fn load_gitignore(&mut self, repo_root: &Path) -> Result<()> {
        let gitignore_path = repo_root.join(".gitignore");
        if !gitignore_path.exists() {
            return Ok(());
        }
        let patterns = parse_gitignore(&gitignore_path)?;
        self.gitignore = Some(Gitignore::new(patterns));
        Ok(())
    }

    /// Decides whether `path` should be indexed.
    ///
    /// Evaluation order: system rules, then `.gitignore` patterns, then custom
    /// rules. The first two can only reject; among custom rules the first one
    /// that matches supplies the decision. With no match the path is accepted.
    pub fn check_path(&self, path: &Path, is_dir: bool) -> RulerDecision {
        // 1. System rules
        if self.check_system_rules(path, is_dir) == RulerDecision::Reject {
            return RulerDecision::Reject;
        }

        // 2. .gitignore patterns (only when a file was loaded)
        let ignored_by_git = self
            .gitignore
            .as_ref()
            .map_or(false, |patterns| patterns.matches(path, is_dir));
        if ignored_by_git {
            return RulerDecision::Reject;
        }

        // 3. Custom rules: first match wins, otherwise accept
        self.custom_rules
            .iter()
            .find(|rule| rule.matches(path, is_dir))
            .map(|rule| rule.decision)
            .unwrap_or(RulerDecision::Accept)
    }
}
```
### Discovery Integration
Rules are applied at the edge of discovery:
```rust
// In Phase 1 (Discovery)
// Walks one directory and keeps only the entries the ruler does not reject.
for entry in read_dir(path)? {
let entry = entry?;
// Shadows the directory `path` with the child entry's path for the rest of the iteration.
let path = entry.path();
// Apply rules BEFORE queuing for processing
// NOTE(review): std::fs::DirEntry exposes file_type()/metadata() rather than is_dir();
// presumably `entry` is a project wrapper type — confirm.
if ruler.check_path(&path, entry.is_dir()) == RulerDecision::Reject {
continue; // Skip this file entirely
}
// File passed rules, add to processing queue
discovered_entries.push(entry);
}
```
This prevents unwanted files from ever reaching Phase 2, saving significant processing time.
## Implementation Files
### Core Rules Engine
- `core/src/ops/indexing/rules.rs` - IndexerRuler, SystemRule, RulerDecision
### Discovery Integration
- `core/src/ops/indexing/phases/discovery.rs` - Rules applied during filesystem walk
### Configuration
- `core/src/ops/indexing/input.rs` - IndexerJobConfig with enabled_rules field
## Acceptance Criteria
- [x] IndexerRuler can be configured with system rules
- [x] NO_HIDDEN rule skips files starting with `.`
- [x] NO_DEV_DIRS rule skips node_modules, target, dist, etc.
- [x] NO_SYSTEM rule skips OS folders (System32, /proc, /sys)
- [x] NO_TEMP rule skips temporary files
- [x] NO_CACHE rule skips cache directories
- [x] Rules can be toggled on/off per location
- [x] .gitignore patterns loaded automatically when inside Git repo
- [x] .gitignore patterns correctly match paths
- [x] Rules applied during Phase 1 (Discovery)
- [x] Rejected files never enter processing queue
- [x] Custom user rules supported
- [x] Rule decisions logged for debugging
## Rule Precedence
Rules are evaluated in order of specificity:
1. **System rules** (if enabled)
2. **.gitignore patterns** (if in Git repo)
3. **Custom user rules**
4. **Default: Accept**
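System rules and .gitignore patterns can only reject: the first rejection wins and no further rules are checked. Among custom rules, the first rule that matches the path decides the outcome (accept or reject).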
## Performance Impact
Applying rules at discovery edge provides significant speedup:
| Scenario | Without Rules | With Rules | Speedup |
|----------|--------------|-----------|---------|
| Node.js project (500K files) | 50 seconds | 8 seconds | 6.25x |
| Rust project (target/ dir) | 20 seconds | 3 seconds | 6.67x |
| Home directory (hidden files) | 100 seconds | 60 seconds | 1.67x |
By rejecting files at discovery, we avoid:
- Database queries in Phase 2
- Closure table lookups
- Metadata processing
- Memory allocation
## Configuration Examples
### CLI
```bash
# Skip all hidden files and dev directories
spacedrive index location ~/Projects \
--skip-hidden \
--skip-dev-dirs
# Use .gitignore patterns
spacedrive index location ~/code/my-app \
--use-gitignore
# Custom rule
spacedrive index location ~/Documents \
--exclude "*.tmp" \
--exclude "~*"
```
### Config File
```toml
[location."~/Projects"]
rules = ["NO_HIDDEN", "NO_DEV_DIRS"]
use_gitignore = true
[location."~/Documents"]
custom_rules = [
{ pattern = "*.tmp", decision = "Reject" },
{ pattern = "~*", decision = "Reject" }
]
```
## Gitignore Pattern Support
Supported .gitignore syntax:
- [x] Basic wildcards (`*.log`, `temp*`)
- [x] Directory-only patterns (`build/`)
- [x] Negation (`!important.log`)
- [x] Character classes (`[abc].txt`)
- [x] Double-asterisk (`**/node_modules`)
- [x] Comments (`# ignore this`)
- [x] Blank lines
## Testing
### Manual Testing
```bash
# Create test directory with common patterns
mkdir -p ~/test-rules
cd ~/test-rules
touch .hidden visible.txt
mkdir -p node_modules/.cache
echo "*.tmp" > .gitignore
touch test.tmp test.txt
# Index with rules
spacedrive index location ~/test-rules \
--skip-hidden \
--skip-dev-dirs \
--use-gitignore
# Verify filtered correctly
spacedrive db query "SELECT name FROM entry WHERE parent_id IN (
SELECT id FROM entry WHERE name = 'test-rules'
)"
# Should only see: visible.txt, test.txt
# Should NOT see: .hidden, .gitignore (starts with '.', so --skip-hidden filters it too),
#                 node_modules, .cache, test.tmp
```
### Integration Tests
Located in `core/tests/indexing/`:
- `test_ruler_no_hidden` - Verify hidden files skipped
- `test_ruler_no_dev_dirs` - Verify dev directories skipped
- `test_ruler_gitignore` - Verify .gitignore patterns respected
- `test_ruler_precedence` - Verify rule evaluation order
- `test_ruler_custom_rules` - Verify custom user rules work
## Future Enhancements
- **Per-file-type rules**: Skip by extension or MIME type
- **Size-based rules**: Skip files over certain size
- **Date-based rules**: Skip files older than X days
- **Allowlist mode**: Only index matching patterns
- **Rule templates**: Predefined rule sets for common use cases
- **Rule sync**: Share rules across devices
## Related Tasks
- INDEX-002 - Five-Phase Pipeline (Phase 1 applies rules)
- CORE-005 - File Type System (could be used for type-based rules)

View File

@@ -0,0 +1,336 @@
---
id: INDEX-006
title: Data Structures & Memory Optimizations
status: Done
assignee: jamiepine
parent: INDEX-000
priority: High
tags: [indexing, performance, memory, optimization]
whitepaper: Section 4.3.7
last_updated: 2025-12-16
---
## Description
Implement specialized data structures that enable efficient in-memory indexing with minimal memory overhead. The ephemeral layer uses NodeArena (slab allocator), NameCache (string interning), and NameRegistry (fast name lookups) to achieve ~50 bytes per file entry - a 4-6x reduction over naive approaches.
## Architecture
### NodeArena (Slab Allocator)
Instead of storing `HashMap<PathBuf, FileNode>` with 64-bit pointers, the arena uses a contiguous memory slab with 32-bit integer IDs:
```rust
/// Slab of `FileNode`s addressed by 32-bit ids instead of pointers.
pub struct NodeArena {
// Contiguous slab of FileNode entries
nodes: Vec<FileNode>,
// Free list for reusing deleted slots
free_list: Vec<NodeId>,
}
/// Index of a node inside `NodeArena::nodes`.
pub type NodeId = u32; // 32-bit instead of 64-bit pointer
/// Per-file record stored in the arena. The byte figures are nominal per-field sizes.
// NOTE(review): `u32` has no niche, so `size_of::<Option<NodeId>>()` is 8 rather than 5,
// and the `u64` fields force 8-byte alignment; the in-memory size is therefore likely
// 64 bytes rather than ~54 — confirm with `std::mem::size_of::<FileNode>()`.
pub struct FileNode {
pub id: NodeId, // 4 bytes
pub parent_id: Option<NodeId>, // 5 bytes (4 + 1 tag)
pub name_id: NameId, // 4 bytes (index into NameCache)
pub kind: FileKind, // 1 byte
pub size: u64, // 8 bytes
pub modified: u64, // 8 bytes (timestamp)
pub inode: u64, // 8 bytes
pub uuid: Uuid, // 16 bytes
// Total: ~54 bytes per node
}
impl NodeArena {
    /// Stores `node` and returns the id of the slot it now occupies.
    ///
    /// A previously freed slot is overwritten when one is available; otherwise
    /// the node is appended to the end of the slab.
    pub fn alloc(&mut self, node: FileNode) -> NodeId {
        match self.free_list.pop() {
            // Recycle a freed slot
            Some(recycled) => {
                self.nodes[recycled as usize] = node;
                recycled
            }
            // Grow the slab
            None => {
                let fresh = self.nodes.len() as NodeId;
                self.nodes.push(node);
                fresh
            }
        }
    }

    /// Returns the node stored at `id`, or `None` when `id` is past the end of the slab.
    ///
    /// Freed slots are not cleared, so a freed id still yields its old node
    /// until the slot is reused by `alloc`.
    pub fn get(&self, id: NodeId) -> Option<&FileNode> {
        let slot = id as usize;
        self.nodes.get(slot)
    }

    /// Marks `id` as reusable by queueing it on the free list.
    pub fn free(&mut self, id: NodeId) {
        self.free_list.push(id);
    }
}
```
**Benefits**:
- **Reduced pointer size**: 32-bit vs 64-bit (50% reduction)
- **Cache locality**: Contiguous memory layout
- **Reuse deleted slots**: Free list prevents fragmentation
- **Simplified serialization**: Just save `Vec<FileNode>`
### NameCache (String Interning)
Filenames repeat frequently in filesystems. The NameCache stores each unique name once and references it by ID:
```rust
/// String-interning table: each distinct name is stored once and referred to
/// by a compact `NameId`.
///
/// `Default` yields an empty cache; the fields are private, so this is how a
/// cache is constructed.
#[derive(Debug, Default)]
pub struct NameCache {
    // Stores unique strings; a `NameId` is an index into this vector
    names: Vec<Arc<str>>,
    // Maps string → NameId for deduplication
    lookup: HashMap<Arc<str>, NameId>,
}

/// Index of an interned name inside `NameCache`.
pub type NameId = u32;

impl NameCache {
    /// Returns the id for `name`, inserting it first if it has not been seen.
    ///
    /// Ids are handed out sequentially starting at 0, and interning the same
    /// string again returns the same id.
    pub fn intern(&mut self, name: &str) -> NameId {
        if let Some(&id) = self.lookup.get(name) {
            return id; // Already interned
        }
        let id = self.names.len() as NameId;
        // One shared allocation backs both the id → name vector and the
        // name → id map.
        let arc_name: Arc<str> = Arc::from(name);
        self.names.push(arc_name.clone());
        self.lookup.insert(arc_name, id);
        id
    }

    /// Resolves `id` back to its string, or `None` if no such id was issued.
    pub fn get(&self, id: NameId) -> Option<&str> {
        self.names.get(id as usize).map(|s| s.as_ref())
    }
}
```
**Example Deduplication**:
```
Filesystem:
/app/node_modules/package1/index.js
/app/node_modules/package2/index.js
/app/node_modules/package3/index.js
...1000 packages
Without interning:
"index.js" stored 1000 times = 1000 * 8 bytes (string) = 8 KB
With interning:
"index.js" stored 1 time = 8 bytes
1000 references = 1000 * 4 bytes (NameId) = 4 KB
Total: 4.008 KB (50% reduction)
Common names like ".git", ".DS_Store", "README.md", "package.json" deduplicate heavily.
```
### NameRegistry (Name-Based Lookups)
The `NameRegistry` enables fast "find files by name" queries without full-text indexing:
```rust
/// Reverse index from an interned name to every node that carries it.
pub struct NameRegistry {
    // Maps name_id → Vec<NodeId> (all files with this name)
    entries: BTreeMap<NameId, Vec<NodeId>>,
}

impl NameRegistry {
    /// Records that `node_id` is named `name_id`, creating the bucket on first use.
    pub fn insert(&mut self, name_id: NameId, node_id: NodeId) {
        let bucket = self.entries.entry(name_id).or_default();
        bucket.push(node_id);
    }

    /// Returns every node registered under `name_id`, or an empty slice when
    /// the name has no registrations.
    pub fn find_by_name(&self, name_id: NameId) -> &[NodeId] {
        match self.entries.get(&name_id) {
            Some(ids) => ids.as_slice(),
            None => &[],
        }
    }
}
```
**Use Case**:
```rust
// Find all "README.md" files in ephemeral index
// `intern` returns the existing id when the name is already cached and inserts
// it otherwise, so a name no file carries simply yields an empty slice.
let readme_name_id = name_cache.intern("README.md");
let readme_nodes = registry.find_by_name(readme_name_id);
```
### Directory Path Caching (Persistent)
For the database layer, the `directory_paths` table caches full paths for O(1) lookups:
```sql
-- Cache of fully resolved directory paths, one row per entry.
CREATE TABLE directory_paths (
-- Key of the cached entry; at most one cached path per entry.
entry_id INTEGER PRIMARY KEY,
-- Full path text; UNIQUE, so no two entries can share a cached path.
path TEXT UNIQUE
);
```
This eliminates recursive parent traversal when building file paths.
## Implementation Files
### Ephemeral Data Structures
- `core/src/ops/indexing/ephemeral/arena.rs` - NodeArena slab allocator
- `core/src/ops/indexing/ephemeral/name.rs` - NameCache string interning
- `core/src/ops/indexing/ephemeral/registry.rs` - NameRegistry name-based lookups
- `core/src/ops/indexing/ephemeral/types.rs` - FileNode and related types
### Ephemeral Index
- `core/src/ops/indexing/ephemeral/index.rs` - EphemeralIndex using above structures
### Persistent Optimizations
- `core/src/ops/indexing/path_resolver.rs` - Path resolution with caching
- `core/src/ops/indexing/hierarchy.rs` - Closure table for O(1) hierarchy queries
## Memory Benchmark
| Approach | Bytes/Entry | 100K Files | 1M Files |
|----------|------------|-----------|----------|
| Naive (`HashMap<PathBuf, Entry>`) | ~250 bytes | 25 MB | 250 MB |
| With String Interning | ~150 bytes | 15 MB | 150 MB |
| **NodeArena + NameCache** | **~50 bytes** | **5 MB** | **50 MB** |
**Deduplication Impact**:
In typical filesystems with repeated names:
- **Before**: 250 bytes/entry * 100K = 25 MB
- **After**: 50 bytes/entry * 100K = 5 MB
- **Reduction**: 5x
## Acceptance Criteria
### NodeArena
- [x] Allocates FileNode entries in contiguous memory
- [x] Uses 32-bit NodeId instead of 64-bit pointers
- [x] Supports free list for deleted slots
- [x] get() is O(1) array indexing
- [x] Memory footprint ~54 bytes per node
### NameCache
- [x] Interns unique strings (stores each name once)
- [x] Returns NameId for deduplicated storage
- [x] intern() deduplicates automatically
- [x] get() retrieves string from NameId
- [x] Multiple directory trees share same cache
### NameRegistry
- [x] Maps name_id → `Vec<NodeId>`
- [x] Enables fast "find by name" queries
- [x] BTreeMap for sorted iteration
- [x] Supports multiple files with same name
### Integration
- [x] EphemeralIndex uses NodeArena for storage
- [x] EphemeralIndex uses NameCache for string interning
- [x] EphemeralIndex uses NameRegistry for name lookups
- [x] Multiple paths can share same EphemeralIndex
- [x] Memory usage is ~50 bytes per file entry
- [x] String deduplication works (common names stored once)
## Performance Characteristics
| Operation | Complexity | Notes |
|-----------|-----------|-------|
| Allocate node | O(1) | Vec push or free list pop |
| Get node | O(1) | Array indexing by NodeId |
| Free node | O(1) | Push to free list |
| Intern name | O(1) avg | HashMap lookup + Vec push |
| Get name | O(1) | Array indexing by NameId |
| Find by name | O(log n) | BTreeMap lookup (n = number of distinct names) |
## Testing
### Manual Testing
```bash
# Index large directory in ephemeral mode
spacedrive index browse /usr --ephemeral
# Check memory usage
ps aux | grep spacedrive
# For 500K files, should use ~25 MB RAM for index
# (50 bytes/entry * 500K = 25 MB)
```
### Integration Tests
Located in `core/tests/indexing/`:
- `test_node_arena_allocation` - Verify NodeArena works
- `test_node_arena_free_list` - Test slot reuse
- `test_name_cache_deduplication` - Verify string interning
- `test_name_registry_lookup` - Test name-based queries
- `test_ephemeral_memory_usage` - Benchmark memory per file
### Memory Usage Test
```rust
/// Intended to check that the ephemeral index stays near ~50 bytes per entry.
// NOTE(review): `std::mem::size_of_val(&vec)` returns the size of the `Vec` handle
// itself (pointer + length + capacity), not of the heap buffer it owns, so `total`
// stays a few dozen bytes and the assertion below passes regardless of entry count.
// To measure real usage, multiply `capacity()` by `size_of::<FileNode>()` and add the
// interned string bytes; the 6 MB threshold should be re-validated once that is done.
#[test]
fn test_memory_per_entry() {
let mut index = EphemeralIndex::new();
// Index 100K files
for i in 0..100_000 {
index.insert(format!("/test/file_{}.txt", i));
}
// Measure memory usage
let arena_size = std::mem::size_of_val(&index.arena.nodes);
let name_cache_size = std::mem::size_of_val(&index.name_cache.names);
let total = arena_size + name_cache_size;
// Should be ~5 MB for 100K files (50 bytes/entry)
assert!(total < 6_000_000);
println!("Memory per entry: {} bytes", total / 100_000);
}
```
## Comparison: Naive vs Optimized
### Naive Approach
```rust
// 250+ bytes per entry
// Illustrative baseline: every entry owns its full path, its name and its children,
// and the map stores a second copy of the path as its key.
// NOTE(review): the per-field figures below sum to ~169 bytes (~201 with the map
// overhead), not ~282; presumably the total also counts the duplicated `PathBuf` key
// and allocator overhead — confirm or adjust the figure.
struct Entry {
path: PathBuf, // ~64 bytes (heap allocation)
name: String, // ~24 bytes (heap allocation)
parent: Option<Box<Entry>>, // 8 bytes pointer
kind: FileKind, // 1 byte
size: u64, // 8 bytes
modified: SystemTime, // 16 bytes
inode: u64, // 8 bytes
uuid: Uuid, // 16 bytes
children: Vec<Entry>, // 24 bytes Vec
}
let mut index: HashMap<PathBuf, Entry> = HashMap::new();
// HashMap overhead: ~32 bytes per entry
// Total: ~282 bytes per entry
```
### Optimized Approach
```rust
// ~50 bytes per entry
// Arena-backed record: parent and name are 32-bit ids rather than owned values.
// NOTE(review): the figures below are nominal field sizes. `Option<u32>` occupies
// 8 bytes (no niche) and the `u64` fields force 8-byte alignment, so
// `size_of::<FileNode>()` is likely 64 — confirm.
struct FileNode {
id: NodeId, // 4 bytes
parent_id: Option<NodeId>, // 5 bytes
name_id: NameId, // 4 bytes (deduplicated)
kind: FileKind, // 1 byte
size: u64, // 8 bytes
modified: u64, // 8 bytes
inode: u64, // 8 bytes
uuid: Uuid, // 16 bytes
}
// Total: ~54 bytes per entry
// No HashMap overhead (arena indexed by NodeId)
```
## Future Enhancements
- **Port to Persistent Layer**: Apply name pooling to SQLite schema for database size reduction
- **Compression**: Use zstd compression for name cache serialization
- **Memory Mapping**: Map arena to disk for persistent ephemeral indexes
- **Tiered Storage**: Hot nodes in RAM, cold nodes on disk
## Related Tasks
- INDEX-001 - Hybrid Architecture (ephemeral layer uses these structures)
- INDEX-003 - Database Architecture (persistent layer could benefit from name pooling)

View File

@@ -0,0 +1,384 @@
---
id: INDEX-007
title: Index Verification System
status: Done
assignee: jamiepine
parent: INDEX-000
priority: Medium
tags: [indexing, verification, integrity, diagnostics]
whitepaper: Section 4.3.8
last_updated: 2025-12-16
---
## Description
Implement the index integrity verification system that detects discrepancies between filesystem state and database records. The system runs a fresh ephemeral scan and compares metadata against the persistent index to identify missing, stale, or mismatched entries.
## Architecture
### IndexVerifyAction
The verification action runs as a library action (not a job) for fast diagnostics:
```rust
/// Library action that verifies the index for one path.
pub struct IndexVerifyAction {
    path: PathBuf,
}

/// Result returned by `IndexVerifyAction`.
pub struct IndexVerifyOutput {
    pub report: IntegrityReport,
}

/// Full set of discrepancies found between the filesystem and the database.
///
/// `Default` produces an empty report with zeroed summary counters; the
/// comparison routine starts from `IntegrityReport::default()` and fills it in.
#[derive(Default)]
pub struct IntegrityReport {
    pub missing_from_index: Vec<MissingFile>,
    pub stale_in_index: Vec<StaleFile>,
    pub metadata_mismatches: Vec<MetadataMismatch>,
    pub summary: Summary,
}

/// A file present on disk with no corresponding database entry.
pub struct MissingFile {
    pub path: PathBuf,
    pub size: u64,
    pub modified: SystemTime,
}

/// A database entry whose file was not found on disk.
pub struct StaleFile {
    pub path: PathBuf,
    pub entry_id: i32,
    pub last_indexed: SystemTime,
}

/// One metadata field that differs between the database and the filesystem.
pub struct MetadataMismatch {
    pub path: PathBuf,
    pub entry_id: i32,
    pub issue: MismatchKind,
}

/// Which field differs; `db` is the stored value and `fs` the value observed on disk.
pub enum MismatchKind {
    SizeMismatch { db: u64, fs: u64 },
    ModifiedTimeMismatch { db: SystemTime, fs: SystemTime },
    InodeMismatch { db: u64, fs: u64 },
}

/// Aggregate counters for a report.
#[derive(Default)]
pub struct Summary {
    pub total_files_in_db: usize,
    pub total_files_on_fs: usize,
    pub missing_count: usize,
    pub stale_count: usize,
    pub mismatch_count: usize,
}
```
### Verification Process
1. **Run Ephemeral Scan**: Index the path in memory (Phase 1 only)
2. **Load Database Entries**: Query existing entries for the same path
3. **Compare**: For each filesystem entry, check against database:
- **MissingFromIndex**: File exists on disk but not in database
- **StaleInIndex**: Entry in database but file missing from filesystem
- **SizeMismatch**: Size differs between database and filesystem
- **ModifiedTimeMismatch**: Mtime differs (with 1-second tolerance)
- **InodeMismatch**: Inode changed (file replacement or corruption)
4. **Generate Report**: Detailed diagnostics with per-file breakdowns
### Comparison Logic
```rust
/// Compares a fresh ephemeral scan against the database entries for the same path
/// and returns the discrepancies.
///
/// Pass 1 walks the scan: a path found in both sources is checked for size, mtime
/// and inode differences, and a path absent from the database is recorded as missing.
/// Pass 2 walks the database: a path absent from the scan is recorded as stale.
/// The summary counters are filled in last from the collected vectors.
async fn compare_entries(
ephemeral_index: &EphemeralIndex,
db_entries: &HashMap<PathBuf, EntryRecord>,
) -> IntegrityReport {
let mut report = IntegrityReport::default();
// Check each filesystem file against database
for (path, ephemeral_node) in ephemeral_index.iter() {
if let Some(db_entry) = db_entries.get(path) {
// File exists in both, check metadata
// The three checks are independent, so one file can contribute up to three
// mismatch records; `mismatch_count` therefore counts records, not files.
if ephemeral_node.size != db_entry.size {
report.metadata_mismatches.push(MetadataMismatch {
path: path.clone(),
entry_id: db_entry.id,
issue: MismatchKind::SizeMismatch {
db: db_entry.size,
fs: ephemeral_node.size,
},
});
}
// Allow 1-second tolerance for mtime (filesystem precision varies)
// NOTE(review): `abs_diff` exists on integer types and returns an integer, while
// the comparison below expects a `Duration` and `MismatchKind` stores `SystemTime`;
// `SystemTime` itself has no `abs_diff`. Confirm which timestamp type is used here.
let time_diff = ephemeral_node.modified.abs_diff(db_entry.modified);
if time_diff > Duration::from_secs(1) {
report.metadata_mismatches.push(MetadataMismatch {
path: path.clone(),
entry_id: db_entry.id,
issue: MismatchKind::ModifiedTimeMismatch {
db: db_entry.modified,
fs: ephemeral_node.modified,
},
});
}
if ephemeral_node.inode != db_entry.inode {
report.metadata_mismatches.push(MetadataMismatch {
path: path.clone(),
entry_id: db_entry.id,
issue: MismatchKind::InodeMismatch {
db: db_entry.inode,
fs: ephemeral_node.inode,
},
});
}
} else {
// File on disk but not in database
report.missing_from_index.push(MissingFile {
path: path.clone(),
size: ephemeral_node.size,
modified: ephemeral_node.modified,
});
}
}
// Check for stale database entries (not on disk)
for (path, db_entry) in db_entries.iter() {
if !ephemeral_index.contains(path) {
report.stale_in_index.push(StaleFile {
path: path.clone(),
entry_id: db_entry.id,
last_indexed: db_entry.indexed_at,
});
}
}
// Totals are taken from the inputs; the issue counts from the vectors built above.
report.summary = Summary {
total_files_in_db: db_entries.len(),
total_files_on_fs: ephemeral_index.len(),
missing_count: report.missing_from_index.len(),
stale_count: report.stale_in_index.len(),
mismatch_count: report.metadata_mismatches.len(),
};
report
}
```
## Implementation Files
### Verification Action
- `core/src/ops/indexing/verify/action.rs` - IndexVerifyAction implementation
- `core/src/ops/indexing/verify/input.rs` - IndexVerifyInput
- `core/src/ops/indexing/verify/output.rs` - IndexVerifyOutput and IntegrityReport
- `core/src/ops/indexing/verify/mod.rs` - Module exports
### Integration
- `core/src/ops/indexing/action.rs` - Action registration
- `core/src/ops/mod.rs` - Action exports
## Acceptance Criteria
- [x] IndexVerifyAction runs fresh ephemeral scan of path
- [x] Action loads existing database entries for comparison
- [x] MissingFromIndex detects files on disk but not in database
- [x] StaleInIndex detects entries in database but missing from filesystem
- [x] SizeMismatch detects size differences
- [x] ModifiedTimeMismatch detects mtime differences (1-second tolerance)
- [x] InodeMismatch detects inode changes
- [x] Report includes summary statistics
- [x] Report provides per-file diagnostics
- [x] Verification runs as library action (not job)
- [x] Fast execution (ephemeral scan only, no database writes)
- [x] CLI command exposes verification
## Use Cases
### Post-Offline Detection
After app has been offline, verify index integrity:
```bash
spacedrive verify ~/Documents
```
**Expected Issues**:
- Files created externally → MissingFromIndex
- Files deleted externally → StaleInIndex
- Files modified externally → SizeMismatch or ModifiedTimeMismatch
### Debugging Watcher Issues
If real-time updates seem broken, verify state:
```bash
spacedrive verify /media/usb
```
**Expected Issues**:
- Missed create events → MissingFromIndex
- Missed delete events → StaleInIndex
- Missed modify events → MetadataMismatch
### Pre-Migration Validation
Before migrating to new library version, verify current state:
```bash
spacedrive verify --all-locations
```
Ensures clean state before schema migrations.
## CLI Integration
```bash
# Verify specific path
spacedrive verify ~/Documents
# Verify all locations
spacedrive verify --all-locations
# Verify with detailed output
spacedrive verify ~/Pictures --verbose
# Output JSON for scripting
spacedrive verify ~/Videos --json > report.json
```
## Output Format
### Console Output
```
Index Verification Report
=========================
Path: /Users/jamie/Documents
Scanned: 15,234 files
Database: 15,180 entries
Issues Found:
-------------
Missing from index: 54 files
Stale in index: 12 entries
Metadata mismatches: 8 files
Details:
--------
Missing from index:
/Users/jamie/Documents/new_file.txt (created 2025-10-14)
/Users/jamie/Documents/another.pdf (created 2025-10-14)
...
Stale in index:
/Users/jamie/Documents/deleted.txt (last seen 2025-10-01)
/Users/jamie/Documents/old.doc (last seen 2025-09-15)
...
Metadata mismatches:
/Users/jamie/Documents/modified.txt
- Size: DB=1024, FS=2048
/Users/jamie/Documents/touched.pdf
- Modified: DB=2025-10-01 12:00:00, FS=2025-10-14 14:30:00
...
Recommendation: Run reindex to fix issues
```
### JSON Output
```json
{
"path": "/Users/jamie/Documents",
"summary": {
"total_files_in_db": 15180,
"total_files_on_fs": 15234,
"missing_count": 54,
"stale_count": 12,
"mismatch_count": 8
},
"missing_from_index": [
{
"path": "/Users/jamie/Documents/new_file.txt",
"size": 2048,
"modified": "2025-10-14T10:30:00Z"
}
],
"stale_in_index": [
{
"path": "/Users/jamie/Documents/deleted.txt",
"entry_id": 12345,
"last_indexed": "2025-10-01T08:00:00Z"
}
],
"metadata_mismatches": [
{
"path": "/Users/jamie/Documents/modified.txt",
"entry_id": 12346,
"issue": {
"kind": "SizeMismatch",
"db": 1024,
"fs": 2048
}
}
]
}
```
## Performance Characteristics
| Location Size | Verification Time | Notes |
|--------------|------------------|-------|
| 1K files | <1 second | Ephemeral scan + comparison |
| 10K files | 2-5 seconds | Depends on disk speed |
| 100K files | 20-50 seconds | Mostly filesystem traversal |
| 1M files | 3-5 minutes | Batched comparison |
**Bottleneck**: Filesystem traversal (Phase 1 discovery), not comparison.
## Testing
### Manual Testing
```bash
# Create test location with known state
mkdir -p ~/test-verify
cd ~/test-verify
touch file1.txt file2.txt file3.txt
# Index it
spacedrive index location ~/test-verify --mode shallow
# Make external changes
touch external_new.txt
rm file2.txt
echo "modified" >> file3.txt
# Verify (should detect issues)
spacedrive verify ~/test-verify
# Expected output:
# - Missing: external_new.txt
# - Stale: file2.txt
# - Mismatch: file3.txt (size/mtime changed)
```
### Integration Tests
Located in `core/tests/indexing/`:
- `test_verify_missing_from_index` - Detect new files
- `test_verify_stale_in_index` - Detect deleted files
- `test_verify_size_mismatch` - Detect size changes
- `test_verify_mtime_mismatch` - Detect mtime changes
- `test_verify_inode_mismatch` - Detect file replacement
- `test_verify_clean_index` - No issues when in sync
## Future Enhancements
- **Auto-Fix Mode**: `--fix` flag to automatically reindex mismatched files
- **Incremental Verification**: Only verify changed directories (via mtime)
- **Scheduled Verification**: Periodic background integrity checks
- **Notification**: Alert user when issues exceed threshold
- **Metrics**: Track verification results over time
## Related Tasks
- INDEX-001 - Hybrid Architecture (uses ephemeral scan for verification)
- INDEX-004 - Change Detection (verification detects missed changes)
- INDEX-002 - Five-Phase Pipeline (verification uses Phase 1 only)

View File

@@ -1,12 +1,13 @@
---
id: INDEX-004
title: Nested Locations Support (Entry Reuse Architecture)
id: INDEX-008
title: Nested Locations Support
status: To Do
assignee: james
assignee: jamiepine
priority: Medium
tags: [indexing, locations, architecture, sync]
last_updated: 2025-10-23
related_tasks: [INDEX-001, INDEX-003, CORE-001, LSYNC-010]
last_updated: 2025-12-16
parent: INDEX-000
related_tasks: [CORE-001, LSYNC-010, LOC-000]
---
# Nested Locations Support (Entry Reuse Architecture)
@@ -107,6 +108,7 @@ Sync consistent (one UUID per file)
### Location Semantics
Each location defines:
- **Root entry**: Which node in the tree this location starts from
- **Index mode**: How deeply to process files (Shallow/Content/Deep)
- **Watching**: Whether to monitor changes in real-time
@@ -121,6 +123,7 @@ Multiple locations can reference overlapping subtrees with different behaviors.
**File**: `core/src/location/manager.rs:100-122`
**Current**:
```rust
// Always creates new entry
let entry_model = entry::ActiveModel {
@@ -131,6 +134,7 @@ let entry_record = entry_model.insert(&txn).await?;
```
**Needed**:
```rust
// Check if entry already exists at this path
let existing_entry = directory_paths::Entity::find()
@@ -174,6 +178,7 @@ let location_model = location::ActiveModel {
**File**: `core/src/location/manager.rs:~180`
**Current**:
```rust
// Always spawns indexer job
let job = IndexerJob::from_location(location_id, sd_path, mode);
@@ -181,6 +186,7 @@ library.jobs().dispatch(job).await?;
```
**Needed**:
```rust
// Check if this entry is already indexed
let entry = entry::Entity::find_by_id(entry_id)
@@ -216,6 +222,7 @@ if entry.indexed_at.is_some() {
**Options**:
**Option A: All watchers trigger (simple but wasteful)**
```rust
// Both Location A and B get notified for /Documents/Work/test.txt
// Both call responder
@@ -223,6 +230,7 @@ if entry.indexed_at.is_some() {
```
**Option B: Innermost location wins (efficient)**
```rust
// In the watcher event dispatch or routing:
async fn find_deepest_watching_location(
@@ -302,6 +310,7 @@ async fn is_path_in_entry_tree(
**Problem**: Deleting Location A shouldn't delete entries used by Location B
**Solution**:
```rust
async fn delete_location(&self, location_id: Uuid, db: &DatabaseConnection) -> Result<()> {
let location = location::Entity::find()
@@ -354,17 +363,20 @@ async fn delete_location(&self, location_id: Uuid, db: &DatabaseConnection) -> R
**Challenge**: How to sync nested locations across devices?
**Scenario**:
- Device A has Location A (`/Documents`) and Location B (`/Documents/Work`)
- Device C connects and syncs
**Current sync** (no nesting support):
- Location A syncs → creates entries 1-5
- Location B syncs → creates duplicate entries 100-102
- Location B syncs → creates duplicate entries 100-102
**With nesting support**:
- Location A syncs → creates entries 1-5
- Location B syncs → just creates location record pointing to existing entry 2
- No entry duplication
- Location A syncs → creates entries 1-5
- Location B syncs → just creates location record pointing to existing entry 2
- No entry duplication
**Implementation**: Location sync already uses `entry_id` reference, so this works automatically! Just need to ensure receiving device doesn't re-create entries.
@@ -388,6 +400,7 @@ Device A creates Location B (/Documents/Work)
**Implication**: Nested locations must be on the same device as their parent location's device.
**Validation needed**:
```rust
// When creating nested location, verify it's under a location on THIS device
if let Some(parent_location) = find_parent_location(&path, db).await? {
@@ -406,9 +419,11 @@ if let Some(parent_location) = find_parent_location(&path, db).await? {
### Phase 1: Entry Reuse (2-3 days)
**Files**:
- `core/src/location/manager.rs`
**Tasks**:
1. Modify `add_location()` to check for existing entries at path
2. Reuse entry if found, create if not
3. Add validation to prevent cross-device nesting
@@ -418,9 +433,11 @@ if let Some(parent_location) = find_parent_location(&path, db).await? {
### Phase 2: Skip Redundant Indexing (1 day)
**Files**:
- `core/src/location/manager.rs`
**Tasks**:
1. Check if entry is already indexed before spawning job
2. Consider index_mode differences (might need re-index)
3. Add logic to determine if re-indexing needed
@@ -428,10 +445,12 @@ if let Some(parent_location) = find_parent_location(&path, db).await? {
### Phase 3: Watcher Precedence (2 days)
**Files**:
- `core/src/service/watcher/mod.rs`
- `core/src/service/watcher/worker.rs`
**Tasks**:
1. Implement `find_deepest_watching_location()` helper
2. Route events to innermost location only
3. Handle edge cases (multiple watchers at same depth)
@@ -440,9 +459,11 @@ if let Some(parent_location) = find_parent_location(&path, db).await? {
### Phase 4: Location Deletion Safety (1 day)
**Files**:
- `core/src/ops/locations/delete/action.rs` (or manager)
**Tasks**:
1. Check for other location references before deleting entries
2. Preserve shared entry trees
3. Only delete location record if entries are shared
@@ -451,9 +472,11 @@ if let Some(parent_location) = find_parent_location(&path, db).await? {
### Phase 5: Sync Validation (1 day)
**Files**:
- `core/src/infra/db/entities/location.rs`
**Tasks**:
1. Ensure location sync doesn't duplicate entries
2. Validate nested location references exist on receiving device
3. Handle case where parent location hasn't synced yet (defer)
@@ -586,11 +609,13 @@ async fn test_cannot_nest_across_devices() {
### Edge Case 1: Parent Location Deleted, Nested Remains
**Scenario**:
- Location A (`/Documents`) deleted
- Location B (`/Documents/Work`) still exists
- Entry 2 (Work) now has orphan parent or needs reparenting
**Solution**:
```rust
// When deleting Location A:
// - Keep entry tree intact (Location B references it)
@@ -601,6 +626,7 @@ async fn test_cannot_nest_across_devices() {
```
**Alternative**: Prevent deleting parent locations if nested locations exist:
```rust
// Check for child locations before allowing deletion
let child_locations = find_locations_under_entry_subtree(entry_id, db).await?;
@@ -615,18 +641,21 @@ if !child_locations.is_empty() {
### Edge Case 2: Moving Nested Location
**Scenario**:
```bash
# Move Work directory to Personal
mv /Documents/Work /Documents/Personal/Work
```
**Current behavior**:
- Location A's watcher detects rename
- Updates entry 2's parent from entry 1 to entry 3 (Personal)
- Location B's `entry_id` still points to entry 2
- Location B's path is now wrong
- Location B's `entry_id` still points to entry 2
- Location B's path is now wrong
**Solution**: Update location path when root entry moves:
```rust
// After moving entry via responder:
// Check if any locations reference this entry
@@ -651,11 +680,13 @@ for location in locations_using_entry {
### Edge Case 3: Index Mode Conflicts
**Scenario**:
- Location A (`/Documents`) has `mode: Shallow`
- Location B (`/Documents/Work`) has `mode: Deep`
- Which mode applies to `/Documents/Work/test.pdf`?
**Solution**: Innermost location's mode wins:
```rust
// When indexing or processing:
fn get_effective_index_mode(path: &Path, db: &DatabaseConnection) -> IndexMode {
@@ -675,6 +706,7 @@ fn get_effective_index_mode(path: &Path, db: &DatabaseConnection) -> IndexMode {
**Problem**: Location B references entry 2, but what if Location A hasn't synced yet?
**Current sync order** (from docs):
1. Shared resources (tags, etc.)
2. Devices
3. Locations
@@ -682,11 +714,13 @@ fn get_effective_index_mode(path: &Path, db: &DatabaseConnection) -> IndexMode {
5. Entries
**With nesting**:
- Location B syncs → `entry_id: 2`
- Entry 2 might not exist yet on receiving device!
- Foreign key constraint violation
- Foreign key constraint violation
**Solution**: Defer nested location sync until parent location syncs:
```rust
// In location::Model::apply_state_change()
if let Some(entry_id) = location_data.entry_id {
@@ -750,6 +784,7 @@ The flexibility is already built in!
**Backwards compatibility**: Yes - existing non-nested locations continue to work
**Rollout**:
1. Implement entry reuse in location creation (Phase 1)
2. Test with simple 1-level nesting
3. Add watcher precedence (Phase 3)
@@ -760,11 +795,13 @@ The flexibility is already built in!
## Performance Considerations
**Benefits**:
- Reduced storage (no duplicate entries)
- Faster indexing (skip already-indexed paths)
- Less sync traffic (entries synced once)
**Costs**:
- Checking for existing entries on location creation (+1 query)
- Watcher precedence logic (path comparison overhead)
- Location deletion checks (query for other location references)
@@ -774,6 +811,7 @@ The flexibility is already built in!
## UI/UX Implications
**Location list view**:
```
Documents (/Users/jamespine/Documents)
└─ Work (/Users/jamespine/Documents/Work) [nested]
@@ -782,6 +820,7 @@ Photos (/Users/jamespine/Pictures)
```
**Considerations**:
- Show nesting visually in UI
- Warn before deleting parent location
- Indicate which location is actively watching a path
@@ -792,17 +831,19 @@ Photos (/Users/jamespine/Pictures)
- [Location Watcher Service](../../core/src/service/watcher/mod.rs)
- [Location Manager](../../core/src/location/manager.rs)
- [Entry-Centric Model](./CORE-001-entry-centric-model.md)
- [INDEX-003](./INDEX-003-watcher-device-ownership-violation.md) - Related device ownership work
- [Change Detection System](./INDEX-004-change-detection-system.md) - Related watcher work
## Implementation Files
**Modified files**:
- `core/src/location/manager.rs`
- `core/src/service/watcher/mod.rs`
- `core/src/service/watcher/worker.rs`
- `core/src/ops/locations/delete/action.rs`
**New files**:
- `core/tests/nested_locations_test.rs`
- `core/src/location/nesting.rs` (helper functions)

View File

@@ -0,0 +1,374 @@
---
id: INDEX-009
title: Stale File Detection Algorithm
status: To Do
assignee: jamiepine
parent: INDEX-000
priority: High
tags: [indexing, stale-detection, offline-recovery, sync]
whitepaper: Section 4.3.4
last_updated: 2025-12-16
related_tasks: [INDEX-004, LSYNC-020]
---
## Description
Implement the algorithm for detecting stale files after the application has been offline or when the watcher service was not running. This ensures that changes made while Spacedrive was not actively monitoring are correctly detected and reconciled when the app restarts or when manual verification is triggered.
## Problem Statement
The real-time change detection system (ChangeHandler trait) only captures events while Spacedrive is running and actively watching locations. When the app is:
- Stopped/offline
- Crashed unexpectedly
- Watcher paused or disabled
- Running on a different device
...filesystem changes go unobserved until the location is next reconciled. Stale detection fills this gap by:
1. **Detecting offline modifications** - Files changed while app wasn't running
2. **Detecting offline deletions** - Files removed while app wasn't running
3. **Detecting offline moves** - Files renamed/moved while app wasn't running
4. **Detecting missed watcher events** - Edge cases where watcher failed to fire
## Current Implementation Status
The ChangeDetector in INDEX-004 provides the foundation for stale detection, but automated offline detection is not fully implemented:
- [x] **Manual verification** - `IndexVerifyAction` can detect discrepancies on-demand
- [x] **Batch change detection** - ChangeDetector compares filesystem vs database during reindex
- [ ] **Automatic startup detection** - App doesn't automatically check for stale files on launch
- [ ] **Last-seen timestamps** - No tracking of when watcher was last active per location
- [ ] **Smart rescanning** - No heuristics to determine which paths need stale detection
- [ ] **Background reconciliation** - No automated background stale file cleanup
## Proposed Architecture
### Watcher Lifecycle Tracking
Track when each location was last successfully watched:
```sql
CREATE TABLE location_watcher_state (
location_id INTEGER PRIMARY KEY,
last_watch_start TIMESTAMP,
last_watch_stop TIMESTAMP,
last_successful_event TIMESTAMP,
watch_interrupted BOOLEAN
);
```
### Startup Stale Detection
On app startup, automatically trigger stale detection for locations that were:
1. **Watched during last session** - Check if any changes occurred while offline
2. **Interrupted** - Watcher crashed or was force-stopped
3. **Offline for longer than a threshold** - Heuristic cutoff for automatic scanning (1 hour in the sketch below)
```rust
async fn detect_stale_on_startup(library: &Library) -> Result<()> {
let locations = load_watched_locations(&library.db).await?;
for location in locations {
let watcher_state = get_watcher_state(location.id, &library.db).await?;
// Check if location needs stale detection
if should_run_stale_detection(&watcher_state) {
info!("Running stale detection for location {}", location.name);
// Spawn background stale detection job
let job = StaleDetectionJob::new(location.id);
library.jobs().dispatch(job).await?;
}
}
Ok(())
}
fn should_run_stale_detection(state: &WatcherState) -> bool {
// Always run if interrupted
if state.watch_interrupted {
return true;
}
// Run if offline for more than 1 hour
let offline_duration = Utc::now() - state.last_watch_stop;
if offline_duration > Duration::hours(1) {
return true;
}
// Run if no successful events in last session (watcher might have failed silently)
if state.last_successful_event < state.last_watch_start {
return true;
}
false
}
```
### Stale Detection Job
Similar to IndexVerifyAction but runs automatically:
```rust
pub struct StaleDetectionJob {
location_id: i32,
}
impl Job for StaleDetectionJob {
async fn execute(&self, ctx: &JobContext) -> Result<()> {
// 1. Run ephemeral scan of location
let ephemeral_index = self.scan_location(ctx).await?;
// 2. Load database entries
let db_entries = self.load_db_entries(ctx).await?;
// 3. Compare and detect changes
let changes = ChangeDetector::compare(&ephemeral_index, &db_entries);
// 4. Apply changes to database
for change in changes {
match change {
Change::New(path) => self.create_entry(path, ctx).await?,
Change::Modified(path) => self.update_entry(path, ctx).await?,
Change::Moved { old, new } => self.move_entry(old, new, ctx).await?,
Change::Deleted(path) => self.delete_entry(path, ctx).await?,
}
}
// 5. Update watcher state
self.mark_location_reconciled(ctx).await?;
Ok(())
}
}
```
### Inode-Based Move Detection (Critical for Offline Changes)
When app is offline, files can be moved/renamed. On restart, detect these via inode matching:
```rust
async fn detect_moves(
ephemeral_entries: &HashMap<PathBuf, FileNode>,
db_entries: &HashMap<PathBuf, EntryRecord>,
) -> Vec<MoveOperation> {
let mut moves = Vec::new();
// Build inode → db_entry map
let mut inode_map: HashMap<u64, &EntryRecord> = HashMap::new();
for entry in db_entries.values() {
if let Some(inode) = entry.inode {
inode_map.insert(inode, entry);
}
}
// Check each filesystem entry
for (fs_path, fs_node) in ephemeral_entries {
if let Some(inode) = fs_node.inode {
// File exists in DB with same inode but different path?
if let Some(db_entry) = inode_map.get(&inode) {
if db_entry.path != *fs_path {
moves.push(MoveOperation {
entry_id: db_entry.id,
old_path: db_entry.path.clone(),
new_path: fs_path.clone(),
inode,
});
}
}
}
}
moves
}
```
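**Critical**: Inode matching only works on Unix systems; Windows requires a fallback to path-only matching. Inode numbers are unique only within a single filesystem and can be reused after a file is deleted, so a matching inode should be treated as a move only when the old path no longer exists on disk.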
## Implementation Plan
### Phase 1: Watcher State Tracking
**Files**:
- `core/src/infra/db/migrations/` - Add `location_watcher_state` table
- `core/src/service/watcher/mod.rs` - Update watcher start/stop to record timestamps
- `core/src/service/watcher/worker.rs` - Update last_successful_event on each event
**Tasks**:
1. Add database schema for watcher lifecycle tracking
2. Record watcher start/stop times per location
3. Update timestamp on each successful event
4. Mark interrupted flag on unexpected shutdown
### Phase 2: Startup Stale Detection
**Files**:
- `core/src/library/mod.rs` - Hook startup stale detection
- `core/src/ops/indexing/stale.rs` - New module for stale detection logic
**Tasks**:
1. Implement `detect_stale_on_startup()` function
2. Check watcher state for each location
3. Spawn StaleDetectionJob for locations needing reconciliation
4. Don't block app startup (run in background)
### Phase 3: StaleDetectionJob Implementation
**Files**:
- `core/src/ops/indexing/jobs/stale_detection.rs` - New job type
**Tasks**:
1. Create StaleDetectionJob similar to IndexVerifyAction
2. Run ephemeral scan + database comparison
3. Apply changes via DatabaseAdapter
4. Update watcher state on completion
5. Report results to user (notification or log)
### Phase 4: Inode-Based Move Detection
**Files**:
- `core/src/ops/indexing/change_detection/detector.rs` - Enhance with move detection
**Tasks**:
1. Build inode → entry map from database
2. Compare filesystem inodes against database
3. Detect same inode at different path
4. Handle Windows fallback (no stable inodes)
### Phase 5: UI Integration
**Files**:
- `packages/interface/src/` - Notification UI for stale detection results
**Tasks**:
1. Show notification when stale files detected
2. Display count of changes found (new/modified/deleted)
3. Allow user to review changes before applying
4. Add setting to enable/disable automatic stale detection
## Acceptance Criteria
- [ ] Watcher state tracked in database (start/stop/last_event timestamps)
- [ ] App startup triggers stale detection for offline locations
- [ ] StaleDetectionJob runs in background without blocking startup
- [ ] Detects new files created while offline
- [ ] Detects modified files (size/mtime changed while offline)
- [ ] Detects deleted files (removed while offline)
- [ ] Detects moved files via inode matching (Unix systems)
- [ ] Windows fallback works (path-only matching)
- [ ] User notified when stale files found and reconciled
- [ ] Settings allow disabling automatic stale detection
- [ ] Manual stale detection still available via IndexVerifyAction
- [ ] Skips stale detection when the watcher stopped cleanly and the app was offline for less than the threshold (1 hour)
- [ ] Handles edge case: location on external drive that was unmounted
## Edge Cases
### External Drive Unmounted While Offline
**Scenario**: USB drive was ejected while app offline
**Behavior**:
- On startup, drive is not mounted
- Stale detection should skip (don't mark files as deleted)
- Wait for drive to be mounted before reconciling
**Solution**:
```rust
// Check if location path is accessible before stale detection
if !location_path.exists() {
info!("Location {} not accessible, skipping stale detection", location.name);
return Ok(());
}
```
### Very Long Offline Period
**Scenario**: App offline for weeks, thousands of changes
**Behavior**:
- Don't block startup with massive scan
- Run stale detection in low-priority background job
- Show progress in UI
### Multiple Devices with Same Location
**Scenario**: Device A and Device B both have `/shared` mounted. Device A was offline.
**Behavior**:
- Device A's stale detection might conflict with Device B's changes
- Need to coordinate via library sync
- Device B's changes should have higher authority (it was online)
**Related**: LSYNC-020 (Device-Owned Deletion Sync)
## Testing
### Manual Testing
```bash
# 1. Start Spacedrive and add location
spacedrive start
spacedrive location add ~/Documents
# 2. Verify watcher active
spacedrive location info ~/Documents | grep "watcher: active"
# 3. Stop Spacedrive
spacedrive stop
# 4. Make changes while offline
touch ~/Documents/new_file.txt
echo "modified" >> ~/Documents/existing.txt
rm ~/Documents/old.txt
# 5. Restart Spacedrive
spacedrive start
# 6. Verify stale detection ran
spacedrive job list | grep StaleDetection
# 7. Check changes applied
spacedrive db query "SELECT * FROM entry WHERE name = 'new_file.txt'"
```
### Integration Tests
Located in `core/tests/indexing/`:
- `test_stale_detection_on_startup` - Verify automatic startup detection
- `test_watcher_state_tracking` - Verify timestamps recorded
- `test_stale_detection_skips_if_recent` - Don't run if just stopped
- `test_stale_detection_detects_offline_changes` - Full offline change cycle
- `test_stale_detection_inode_moves` - Move detection via inodes
## Performance Considerations
### Startup Impact
- Stale detection should NOT block app startup
- Run in low-priority background thread
- User can interact with app while detection runs
- Show progress in notification/status bar
### Large Locations
For locations with 1M+ files:
- Stale detection could take 5-10 minutes
- Don't run automatically if the location has more than 500K files
- Prompt user instead: "Location ~/Photos has been offline. Run stale detection?"
### Frequency Tuning
- **< 1 hour offline**: Skip (watcher state is fresh)
- **1-24 hours offline**: Run automatically
- **> 24 hours offline**: Prompt user before running
- **> 1 week offline**: Always prompt (likely external drive)
## Related Tasks
- INDEX-004 - Change Detection System (provides ChangeDetector foundation)
- INDEX-007 - Index Verification System (provides manual verification)
- LSYNC-020 - Device-Owned Deletion Sync (conflict resolution for multi-device)
- LOC-000 - Location Operations (watcher lifecycle)

View File

@@ -2,7 +2,7 @@
id: JOB-000
title: "Epic: Durable Job System"
status: Done
assignee: james
assignee: jamiepine
priority: High
tags: [epic, core, jobs]
whitepaper: Section 4.4

View File

@@ -2,7 +2,7 @@
id: JOB-001
title: Job Manager for Task Scheduling
status: Done
assignee: james
assignee: jamiepine
parent: JOB-000
priority: High
tags: [core, jobs]

View File

@@ -2,7 +2,7 @@
id: JOB-002
title: Job-Specific File Logging
status: Done
assignee: james
assignee: jamiepine
parent: JOB-000
priority: Medium
tags: [core, jobs, logging]

View File

@@ -2,7 +2,7 @@
id: JOB-003
title: Parallel Task Execution from Jobs
status: To Do
assignee: james
assignee: jamiepine
parent: JOB-000
priority: High
tags: [jobs, task-system, performance, parallelism]

View File

@@ -1,8 +1,8 @@
---
id: LOC-000
title: "Epic: Location Operations"
title: Location Operations
status: Done
assignee: james
assignee: jamiepine
priority: High
tags: [epic, core, locations]
whitepaper: Section 4.3.3

View File

@@ -2,7 +2,7 @@
id: LOC-001
title: Location Management Actions
status: Done
assignee: james
assignee: jamiepine
parent: LOC-000
priority: High
tags: [core, actions, locations, indexing]

View File

@@ -2,7 +2,7 @@
id: LOC-005
title: "Virtual Locations via Pure Hierarchical Model"
status: To Do
assignee: james
assignee: jamiepine
parent: LOC-000
priority: High
tags: [core, vdfs, database, refactor]

View File

@@ -1,8 +1,8 @@
---
id: LSYNC-000
title: "Epic: Library-based Synchronization (Leaderless)"
title: "Library Sync"
status: Done
assignee: james
assignee: jamiepine
priority: High
tags: [epic, sync, networking, library-sync, leaderless]
whitepaper: Section 4.5.1
@@ -13,6 +13,7 @@ last_updated: 2025-12-02
## Description
Implement library metadata synchronization using a **leaderless hybrid model**:
- **State-based sync** for device-owned data (locations, entries, volumes)
- **Log-based sync with HLC** for shared resources (tags, albums, metadata)
@@ -30,17 +31,20 @@ See `core/src/infra/sync/NEW_SYNC.md` for complete rationale.
## Current Status
**Completed (Phase 1)**:
- NET-001: Iroh P2P stack ✅
- NET-002: Device pairing protocol ✅
- LSYNC-003: Library sync setup ✅
**Completed (Phase 2)** - Oct 9, 2025:
- LSYNC-006: TransactionManager ✅
- LSYNC-007: Syncable trait ✅
- LSYNC-009: HLC implementation ✅
- LSYNC-013: Hybrid protocol handler ✅
**Completed (Phase 3)** - Oct 15, 2025:
- LSYNC-010: Peer sync service ✅
- LSYNC-011: Conflict resolution (HLC ordering) ✅
- LSYNC-002: Metadata sync ✅
@@ -49,50 +53,59 @@ See `core/src/infra/sync/NEW_SYNC.md` for complete rationale.
- Shared: Tag ✅, Collection ✅, ContentIdentity ✅, UserMetadata ✅
**Upcoming (Phase 4)**:
- Enhanced integration testing for all 8 models
- Backfill optimization for new devices joining
- Retry queue for failed sync operations
- Performance optimization and monitoring
**Cancelled/Obsolete**:
- ~~LSYNC-008: Central sync log~~ (replaced with per-device shared_changes)
- ~~Leader election~~ (no leader needed)
## Subtasks
### Phase 1: Foundation ✅
- LSYNC-001: Protocol design
- LSYNC-003: Sync setup
### Phase 2: Core Infrastructure (Revised)
- LSYNC-006: TransactionManager (no leader checks)
- LSYNC-007: Syncable trait (device ownership)
- LSYNC-009: HLC implementation
### Phase 3: Sync Services
- LSYNC-013: Hybrid protocol handler
- LSYNC-010: Peer sync service
- LSYNC-011: Conflict resolution
### Phase 4: Application
- LSYNC-002: Metadata sync (tags/albums)
- Entry sync optimization
## Architecture Summary
**Device-Owned Data** (no log, state-based):
- Locations, Entries, Volumes, Devices
- Each device broadcasts its own state
- Peers apply (no conflicts possible)
- Efficient: just timestamp-based delta sync
**Shared Resources** (small log, HLC-based):
- Tags, Collections, ContentIdentity, UserMetadata
- Each device logs its shared changes
- Broadcast with HLC for ordering
- Peers ACK → aggressive pruning → log stays tiny
**Benefits**:
- No leader bottleneck
- Works fully offline
- Simpler (~800 lines less code)
@@ -102,10 +115,12 @@ See `core/src/infra/sync/NEW_SYNC.md` for complete rationale.
## Implementation Summary (Oct 2025)
**Total Models Syncing**: 8
- 4 device-owned (state-based)
- 4 shared (HLC log-based)
**Infrastructure Complete**:
- Syncable trait with FK mapping
- PeerLog (sync.db per device)
- HLC implementation
@@ -114,6 +129,7 @@ See `core/src/infra/sync/NEW_SYNC.md` for complete rationale.
- Integration tests (10 passing)
**Key Features**:
- HLC conflict resolution prevents stale overwrites
- Deterministic UUIDs for ContentIdentity enable dedup
- Per-device sync.db stays small via ACK pruning

View File

@@ -2,7 +2,7 @@
id: LSYNC-001
title: Design Library Sync Protocol (Leaderless)
status: Done
assignee: james
assignee: jamiepine
parent: LSYNC-000
priority: High
tags: [sync, networking, protocol, design, leaderless]

View File

@@ -1,8 +1,8 @@
---
id: LSYNC-002
title: Shared Metadata Sync (Albums, Tags) with HLC
title: Shared Sync with HLC
status: Done
assignee: james
assignee: jamiepine
parent: LSYNC-000
priority: High
tags: [sync, metadata, albums, tags, hlc, shared-resources]
@@ -13,7 +13,7 @@ last_updated: 2025-10-15
## Description
Implement synchronization for truly shared resources (Albums, Tags) using the HLC-based log model. These resources can be modified by any device and need conflict resolution.
Implement synchronization for truly shared resources (ContentIdentity, Tags) using the HLC-based log model. These resources can be modified by any device and need conflict resolution.
**Architecture**: Log-based sync with Hybrid Logical Clocks for ordering.

View File

@@ -2,7 +2,7 @@
id: LSYNC-003
title: Library Sync Setup (Device Registration & Discovery)
status: Done
assignee: james
assignee: jamiepine
parent: LSYNC-000
priority: High
tags: [sync, networking, library-setup, device-pairing]

View File

@@ -1,8 +1,8 @@
---
id: LSYNC-006
title: TransactionManager Core (Leaderless)
title: Transaction Manager Core
status: Done
assignee: james
assignee: jamiepine
parent: LSYNC-000
priority: Critical
tags: [sync, database, transaction, architecture, leaderless]
@@ -123,12 +123,14 @@ Successfully implemented in `core/src/infra/sync/transaction.rs`:
## Migration from Leader Model
**Remove**:
- `next_sequence()` method (replaced with HLC)
- `is_leader()` checks
- Sequence number tracking
- Leader-specific logic
**Add**:
- HLC generator integration
- Strategy selection (device-owned vs shared)
- State broadcast for device-owned

View File

@@ -1,8 +1,8 @@
---
id: LSYNC-007
title: Syncable Trait (Device Ownership Aware)
title: Syncable Trait
status: Done
assignee: james
assignee: jamiepine
parent: LSYNC-000
priority: High
tags: [sync, trait, codegen, macro]

View File

@@ -2,7 +2,7 @@
id: LSYNC-008
title: Sync Log Schema (Per-Device, HLC-Based)
status: Done
assignee: james
assignee: jamiepine
parent: LSYNC-000
priority: High
tags: [sync, database, schema, migration, hlc]

View File

@@ -2,7 +2,7 @@
id: LSYNC-009
title: Hybrid Logical Clock (HLC) Implementation
status: Done
assignee: james
assignee: jamiepine
parent: LSYNC-000
priority: High
tags: [sync, hlc, distributed-systems, leaderless]

View File

@@ -2,7 +2,7 @@
id: LSYNC-010
title: Peer Sync Service (Leaderless)
status: Done
assignee: james
assignee: jamiepine
parent: LSYNC-000
priority: High
tags: [sync, replication, service, peer-to-peer, leaderless]

View File

@@ -2,7 +2,7 @@
id: LSYNC-011
title: Conflict Resolution (HLC-Based)
status: Done
assignee: james
assignee: jamiepine
parent: LSYNC-000
priority: Medium
tags: [sync, conflict-resolution, hlc, merge]

View File

@@ -2,7 +2,7 @@
id: LSYNC-012
title: Bulk Entry Sync Optimization (State-Based)
status: Done
assignee: james
assignee: jamiepine
parent: LSYNC-000
priority: High
tags: [sync, indexing, bulk, performance, state-based]

View File

@@ -2,7 +2,7 @@
id: LSYNC-013
title: Hybrid Sync Protocol Handler (State + Log Based)
status: Done
assignee: james
assignee: jamiepine
parent: LSYNC-000
priority: High
tags: [sync, networking, protocol, peer-to-peer, leaderless]

View File

@@ -2,8 +2,9 @@
id: LSYNC-020
title: Device-Owned Deletion Sync via Cascading Tombstones
status: Done
assignee: james
assignee: jamiepine
priority: High
parent: LSYNC-000
tags: [sync, core, bug-fix, vdfs]
last_updated: 2025-12-02
related_tasks: []
@@ -78,7 +79,7 @@ Device B: Looks up folder by UUID, calls delete_subtree()
Device B: Cascade deletes all 10,000 children automatically
Result: VDFS consistency restored
Result: VDFS consistency restored
```
## Technical Design
@@ -746,7 +747,7 @@ Device A: Delete /Photos (parent folder)
Device B: Receives tombstone for /Photos
→ Calls delete_subtree()
→ File already gone (no-op, idempotent)
→ Successfully deletes /Photos
→ Successfully deletes /Photos
```
**Verdict:** Safe! `delete_subtree()` handles missing children gracefully.
@@ -765,7 +766,7 @@ Later deletes /Photos (parent):
Receiving device processes all 4 tombstones:
- Deletes individual files first
- Then deletes /Photos (cascade to already-deleted children is no-op)
- Correct final state
- Correct final state
```
**Verdict:** Order-independent, idempotent.
@@ -788,7 +789,7 @@ Device B receives tombstone:
- Calls delete_subtree() on /Photos entry
- Cascades to /Subfolder
- file1.jpg, file2.jpg never made it to B anyway
- Correct state
- Correct state
```
**Verdict:** Safe! Can only delete what exists locally.
@@ -873,14 +874,14 @@ pub async fn catch_up_from_peer(
A comprehensive audit of the sync codebase confirmed the design is sound with minor additions needed.
### Protocol Compatibility
### Protocol Compatibility
- StateResponse uses serde JSON serialization (backward compatible)
- Adding `deleted_uuids: Vec<Uuid>` as optional field is safe
- Old clients will ignore unknown fields gracefully
- No breaking changes to existing messages
### Registry Pattern Compatibility
### Registry Pattern Compatibility
- Registry uses function pointers (easy to add deletion dispatch)
- Can add `StateDeleteFn` type alongside `StateApplyFn`
@@ -909,7 +910,7 @@ A comprehensive audit of the sync codebase confirmed the design is sound with mi
**Additional:** For entries, also check if parent is tombstoned (prevents orphaned children).
### Watermark Infrastructure
### Watermark Infrastructure
- `devices.last_state_watermark` already tracks device-owned sync progress
- Can reuse for tombstone acknowledgment (no new table needed)

View File

@@ -2,8 +2,9 @@
id: LSYNC-021
title: Unified Sync Configuration System
status: Done
assignee: james
assignee: jamiepine
priority: Medium
parent: LSYNC-000
tags: [sync, core, config]
last_updated: 2025-12-02
related_tasks: [LSYNC-020]
@@ -34,6 +35,7 @@ Duration::days(30)
```
**Problems:**
- No single source of truth
- Can't adjust sync behavior without code changes
- Different defaults across files
@@ -709,20 +711,14 @@ export SD_PRUNING_STRATEGY=conservative
## Files Requiring Modification
**New Files (3):**
1. `core/src/infra/sync/config.rs` - Configuration types
2. `core/migrations/mXXXXXXXXX_add_sync_config.rs` - Database schema
3. `apps/cli/src/domains/sync/config.rs` - CLI commands
**Modified Files (6):**
4. `core/src/infra/sync/mod.rs` - Export config types
5. `core/src/service/sync/mod.rs` - Accept and use config
6. `core/src/service/sync/backfill.rs` - Replace constants with config
7. `core/src/service/sync/peer.rs` - Replace constants with config
8. `core/src/library/mod.rs` - Add config load/save methods
9. `apps/cli/src/domains/sync/mod.rs` - Add config subcommand
**Modified Files (6):**

4. `core/src/infra/sync/mod.rs` - Export config types
5. `core/src/service/sync/mod.rs` - Accept and use config
6. `core/src/service/sync/backfill.rs` - Replace constants with config
7. `core/src/service/sync/peer.rs` - Replace constants with config
8. `core/src/library/mod.rs` - Add config load/save methods
9. `apps/cli/src/domains/sync/mod.rs` - Add config subcommand
**Documentation (1):**
10. `docs/core/library-sync.mdx` - Add configuration section
**Documentation (1):**

10. `docs/core/library-sync.mdx` - Add configuration section
**Total: 10 files**
@@ -753,6 +749,7 @@ export SD_PRUNING_STRATEGY=conservative
---
**Next Steps:**
1. Review unified sync config design
2. Implement Phase 1 (config structure)
3. Integrate into sync service (Phase 2)

View File

@@ -2,8 +2,9 @@
id: LSYNC-022
title: Sync Metrics and Observability System
status: Done
assignee: james
assignee: jamiepine
priority: High
parent: LSYNC-000
tags: [sync, metrics, observability, monitoring]
last_updated: 2025-12-02
related_tasks: [LSYNC-010, LSYNC-021]
@@ -216,6 +217,7 @@ struct ErrorEvent {
### Phase 1: Core Infrastructure (2-3 days)
**Files to create:**
- `core/src/service/sync/metrics/mod.rs` - Main module
- `core/src/service/sync/metrics/collector.rs` - Central collector
- `core/src/service/sync/metrics/types.rs` - Metric types
@@ -223,6 +225,7 @@ struct ErrorEvent {
- `core/src/service/sync/metrics/history.rs` - Time-series storage
**Tasks:**
1. Define all metric types with atomic counters
2. Implement `SyncMetricsCollector` with thread-safe access
3. Create snapshot/export functionality
@@ -231,12 +234,14 @@ struct ErrorEvent {
### Phase 2: Integration (2-3 days)
**Files to modify:**
- `core/src/service/sync/peer.rs` - Add metrics recording
- `core/src/service/sync/backfill.rs` - Track backfill metrics
- `core/src/service/sync/state.rs` - Track state transitions
- `core/src/service/network/protocol/sync/handler.rs` - Track message handling
**Tasks:**
1. Add metrics recording to all sync operations
2. Record state transitions
3. Track latency for key operations
@@ -245,9 +250,11 @@ struct ErrorEvent {
### Phase 3: CLI Interface (1-2 days)
**Files to create:**
- `crates/cli/src/commands/sync/metrics.rs` - CLI command
**Command structure:**
```bash
# Get current metrics snapshot
sd sync metrics
@@ -275,6 +282,7 @@ sd sync metrics --errors # Recent errors only
```
**Output format:**
```
Sync Metrics (Library: My Library)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
@@ -359,10 +367,12 @@ Errors (Last hour)
### Phase 4: API Integration (1 day)
**Files to create:**
- `core/src/ops/sync/get_metrics/mod.rs` - Query for metrics
- `core/src/ops/sync/get_metrics/action.rs` - Action implementation
**Query implementation:**
```rust
// Define the query
pub struct GetSyncMetrics;
@@ -409,6 +419,7 @@ let metrics = dispatcher
**Event emission:**
Emit events on metric updates for UI real-time display via the existing event bus:
```rust
event_bus.emit(Event::SyncMetricsUpdated {
library_id,
@@ -452,12 +463,14 @@ ON sync_metrics_snapshots(library_id, timestamp);
## Testing Strategy
### Unit Tests
- Test atomic counter thread-safety
- Test histogram calculations
- Test ring buffer overflow behavior
- Test snapshot serialization
### Integration Tests
```rust
#[tokio::test]
async fn test_sync_metrics_tracking() {
@@ -479,6 +492,7 @@ async fn test_sync_metrics_tracking() {
```
### Performance Tests
- Measure overhead of metrics collection
- Verify zero-cost when disabled
- Test with high sync volume (1M+ operations)
@@ -510,6 +524,7 @@ async fn test_sync_metrics_tracking() {
## Implementation Files
**New files:**
- `core/src/service/sync/metrics/mod.rs`
- `core/src/service/sync/metrics/collector.rs`
- `core/src/service/sync/metrics/types.rs`
@@ -518,6 +533,7 @@ async fn test_sync_metrics_tracking() {
- `crates/cli/src/commands/sync/metrics.rs`
**Modified files:**
- `core/src/service/sync/peer.rs`
- `core/src/service/sync/backfill.rs`
- `core/src/service/sync/state.rs`

View File

@@ -2,14 +2,15 @@
id: LSYNC-023
title: Rebuild Closure Tables After Sync
status: Done
assignee: james
priority: Critical
assignee: jamiepine
priority: High
parent: LSYNC-000
tags: [sync, database, bug, closure-table]
last_updated: 2025-10-23
related_tasks: [LSYNC-010, INDEX-003, CORE-004]
---
# Rebuild Closure Tables After Sync (CRITICAL)
# Rebuild Closure Tables After Sync
## Problem Statement

View File

@@ -2,7 +2,7 @@
id: NET-000
title: "Epic: Networking & Synchronization"
status: Done
assignee: james
assignee: jamiepine
priority: High
tags: [epic, core, networking]
whitepaper: Section 4.5

View File

@@ -2,7 +2,7 @@
id: NET-001
title: Unified P2P Stack with Iroh
status: Done
assignee: james
assignee: jamiepine
parent: NET-000
priority: High
tags: [networking, iroh, p2p]

View File

@@ -2,7 +2,7 @@
id: NET-002
title: Secure Device Pairing Protocol
status: Done
assignee: james
assignee: jamiepine
parent: NET-000
priority: High
tags: [networking, security, pairing]

View File

@@ -2,7 +2,7 @@
id: NET-003
title: Spacedrop Protocol
status: To Do
assignee: james
assignee: jamiepine
parent: NET-000
priority: High
tags: [networking, spacedrop, sharing, p2p]

View File

@@ -2,7 +2,7 @@
id: PLUG-000
title: "Epic: WASM Extension System"
status: In Progress
assignee: james
assignee: jamiepine
priority: High
tags: [epic, plugins, wasm, extensibility, extensions]
whitepaper: Section 6.7

View File

@@ -1,8 +1,8 @@
---
id: PLUG-001
title: Integrate WASM Runtime
status: In Progress
assignee: james
status: Done
assignee: jamiepine
parent: PLUG-000
priority: High
tags: [plugins, wasm, runtime, wasmer]

View File

@@ -2,7 +2,7 @@
id: PLUG-002
title: Define and Implement VDFS Plugin API Bridge
status: In Progress
assignee: james
assignee: jamiepine
parent: PLUG-000
priority: High
tags: [plugins, wasm, api, vdfs, wire]

View File

@@ -2,9 +2,9 @@
id: PLUG-003
title: Develop Production Extension (Photos or Email)
status: To Do
assignee: james
assignee: jamiepine
parent: PLUG-000
priority: High
priority: Medium
tags: [plugins, wasm, extension, production]
whitepaper: Section 6.8
last_updated: 2025-10-14

View File

@@ -1,8 +1,8 @@
---
id: RES-000
title: "Epic: Resource Management & Mobile"
title: "Resource Management & Mobile"
status: To Do
assignee: james
assignee: jamiepine
priority: Medium
tags: [epic, core, performance, mobile]
whitepaper: Section 7

View File

@@ -2,7 +2,7 @@
id: RES-001
title: Adaptive Resource Throttling
status: To Do
assignee: james
assignee: jamiepine
parent: RES-000
priority: Medium
tags: [performance, mobile, core]

View File

@@ -1,8 +1,8 @@
---
id: SEARCH-000
title: "Epic: Temporal-Semantic Search"
status: In Progress
assignee: james
title: "Search"
status: To Do
assignee: jamiepine
priority: High
tags: [epic, search, ai, fts]
whitepaper: Section 4.7

View File

@@ -2,7 +2,7 @@
id: SEARCH-001
title: Asynchronous SearchJob
status: To Do
assignee: james
assignee: jamiepine
parent: SEARCH-000
priority: High
tags: [search, jobs, async]

View File

@@ -2,7 +2,7 @@
id: SEARCH-002
title: Two-Stage FTS5 + Semantic Re-ranking
status: To Do
assignee: james
assignee: jamiepine
parent: SEARCH-000
priority: High
tags: [search, fts, semantic-search, ai]

View File

@@ -2,7 +2,7 @@
id: SEARCH-003
title: Unified Vector Repositories
status: To Do
assignee: james
assignee: jamiepine
parent: SEARCH-000
priority: High
tags: [search, vector-search, ai, repositories]

View File

@@ -1,9 +1,9 @@
---
id: SEC-000
title: Security & Privacy Epic
title: Security & Privacy
status: In Progress
assignee:
parent:
assignee: jamiepine
parent: null
priority: High
tags: [security, epic]
whitepaper: null

View File

@@ -2,7 +2,7 @@
id: SEC-002
title: SQLCipher for At-Rest Library Encryption
status: To Do
assignee: james
assignee: jamiepine
parent: SEC-000
priority: High
tags: [security, database, core, encryption]

View File

@@ -2,9 +2,9 @@
id: SEC-004
title: Role-Based Access Control (RBAC) System
status: To Do
assignee: james
assignee: jamiepine
parent: SEC-000
priority: High
priority: Low
tags: [security, enterprise, collaboration]
whitepaper: Section 4.4.6
---

View File

@@ -1,8 +1,8 @@
---
id: SEC-005
title: Secure Credential Vault
status: To Do
assignee: james
status: Done
assignee: jamiepine
parent: SEC-000
priority: High
tags: [security, credentials, vault, cloud]

Some files were not shown because too many files have changed in this diff Show More