mirror of
https://github.com/spacedriveapp/spacedrive.git
synced 2026-04-17 21:17:08 -04:00
225 lines
7.7 KiB
Python
225 lines
7.7 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Slack Export adapter for Spacedrive.
|
|
|
|
Reads a Slack workspace export (the unzipped JSON directory structure).
|
|
Export format: channels.json, users.json, and one folder per channel
|
|
containing daily JSON files (YYYY-MM-DD.json).
|
|
|
|
This is a full-scan adapter — no incremental cursor since exports are
|
|
point-in-time snapshots. Re-exporting and re-syncing will upsert all data.
|
|
"""
|
|
|
|
import json
|
|
import sys
|
|
import os
|
|
import glob
|
|
from datetime import datetime, timezone
|
|
|
|
|
|
def log(level: str, message: str):
    """Print a structured log record to stdout as one JSON line.

    The record is ``{"log": level, "message": message}``. Stdout is flushed
    after every record so the consumer of this process sees it immediately.
    """
    record = {"log": level, "message": message}
    line = json.dumps(record)
    print(line, flush=True)
|
|
|
|
|
|
def emit(operation: dict):
    """Print one sync operation to stdout as a single JSON line.

    ``operation`` is serialized as-is with ``json.dumps`` and stdout is
    flushed right away so operations are delivered one at a time.
    """
    serialized = json.dumps(operation)
    print(serialized, flush=True)
|
|
|
|
|
|
def ts_to_iso(ts: str) -> str:
    """Convert a Slack timestamp to an ISO 8601 string in UTC.

    Args:
        ts: Unix time in seconds as a string (e.g. ``'1706123456.789012'``).
            Numeric values are accepted as well since they go through
            ``float()``.

    Returns:
        The timezone-aware ISO 8601 representation, or ``""`` when ``ts`` is
        empty/falsy or cannot be converted. This function never raises for
        bad input so a single malformed message cannot abort a sync.
    """
    if not ts:
        return ""
    try:
        seconds = float(ts)
        return datetime.fromtimestamp(seconds, tz=timezone.utc).isoformat()
    except (ValueError, OverflowError, OSError, TypeError):
        # ValueError: non-numeric text, NaN, or year out of range.
        # OverflowError: infinity or values too large for the platform time_t.
        # OSError: platform-level conversion failure.
        # TypeError: ts is of a type float() cannot handle.
        return ""
|
|
|
|
|
|
def load_json(path: str):
    """Load and parse a UTF-8 JSON file.

    Args:
        path: Filesystem path of the JSON file.

    Returns:
        The decoded JSON value, or ``None`` if the file cannot be opened,
        is not valid UTF-8, or does not contain valid JSON. Failures are
        reported through ``log`` at ``warn`` level rather than raised.
    """
    try:
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)
    except (ValueError, OSError) as e:
        # ValueError covers both json.JSONDecodeError (malformed JSON) and
        # UnicodeDecodeError (bytes that are not valid UTF-8); OSError covers
        # missing or unreadable files.
        log("warn", f"Failed to read {path}: {e}")
        return None
|
|
|
|
|
|
# Message subtypes that record channel housekeeping rather than user content.
_SKIPPED_SUBTYPES = (
    "channel_join", "channel_leave", "channel_topic",
    "channel_purpose", "channel_name", "channel_archive",
    "channel_unarchive", "pinned_item", "unpinned_item",
)

# Index files that may list conversations; each one is optional.
_CHANNEL_INDEX_FILES = ("channels.json", "groups.json", "mpims.json", "dms.json")


def _text_value(field) -> str:
    """Return the text of a purpose/topic entry given as a dict or a scalar."""
    if isinstance(field, dict):
        field = field.get("value")
    return "" if field is None else str(field)


def _load_users(export_path: str) -> dict:
    """Build a user ID -> display name map from users.json (empty if unusable)."""
    users_data = load_json(os.path.join(export_path, "users.json"))
    if not users_data or not isinstance(users_data, list):
        log("warn", "No users.json found, author names will be user IDs")
        return {}

    users_map = {}
    for user in users_data:
        if not isinstance(user, dict):
            continue
        uid = user.get("id") or ""
        profile = user.get("profile")
        if not isinstance(profile, dict):
            profile = {}
        # First non-empty candidate wins; the ID itself is the last resort.
        display = (
            profile.get("display_name")
            or profile.get("real_name")
            or user.get("real_name")
            or user.get("name")
            or uid
        )
        users_map[uid] = str(display)
    return users_map


def _load_channels(export_path: str) -> list:
    """Collect the channel dicts from every index file present in the export."""
    channels = []
    for filename in _CHANNEL_INDEX_FILES:
        index_path = os.path.join(export_path, filename)
        # Skip absent index files quietly instead of logging a read failure.
        if not os.path.isfile(index_path):
            continue
        data = load_json(index_path)
        if isinstance(data, list):
            channels.extend(ch for ch in data if isinstance(ch, dict))
    return channels


def _emit_channels(channels: list) -> dict:
    """Emit a channel upsert per entry and return a folder name -> channel ID map."""
    folder_to_id = {}
    for ch in channels:
        ch_id = ch.get("id") or ""
        ch_name = ch.get("name") or ch_id
        members = ch.get("members", [])

        # Only name -> ID is recorded. Keeping ID -> name in the same dict
        # would make a folder named by channel ID resolve to the channel name.
        folder_to_id[ch_name] = ch_id

        emit({
            "upsert": "channel",
            "external_id": ch_id,
            "fields": {
                "name": ch_name,
                "purpose": _text_value(ch.get("purpose", {}))[:5000],
                "topic": _text_value(ch.get("topic", {}))[:500],
                "is_archived": ch.get("is_archived", False),
                "member_count": len(members) if isinstance(members, list) else 0,
            }
        })
    return folder_to_id


def _format_reactions(reactions) -> str:
    """Render a reactions list as ':name: count' items joined by ', '."""
    if not isinstance(reactions, list):
        return ""
    parts = []
    for reaction in reactions:
        if not isinstance(reaction, dict):
            continue
        name = reaction.get("name", "")
        if name:
            parts.append(f":{name}: {reaction.get('count', 0)}")
    return ", ".join(parts)


def _iter_messages(export_path: str, folder_to_id: dict):
    """Lazily yield (channel_id, message dict) for every message in the export."""
    for entry in sorted(os.listdir(export_path)):
        channel_dir = os.path.join(export_path, entry)
        if entry.startswith(".") or not os.path.isdir(channel_dir):
            continue

        # Folders are normally named after the channel; otherwise assume the
        # folder name is the channel ID itself.
        ch_id = folder_to_id.get(entry) or entry

        # Escape the directory part so glob metacharacters in the export path
        # are matched literally. Sorting the daily file names keeps them in
        # name order.
        pattern = os.path.join(glob.escape(channel_dir), "*.json")
        for day_file in sorted(glob.glob(pattern)):
            messages = load_json(day_file)
            if not isinstance(messages, list):
                continue
            for msg in messages:
                if isinstance(msg, dict):
                    yield ch_id, msg


def main():
    """Run a full scan of a Slack export and emit upsert operations.

    Reads a JSON document from stdin whose ``config`` object provides:

    * ``export_path`` (required): directory of the unzipped Slack export.
    * ``max_messages`` (optional): stop after this many messages have been
      emitted; ``0`` or a negative value means no limit.

    Emits one ``channel`` upsert per conversation found in the index files,
    then one ``message`` upsert per message in the per-channel daily files,
    skipping housekeeping subtypes and entries without a ``ts``.

    Exits with status 2 when the input or configuration is invalid, the
    export directory does not exist, or no channel data is found.
    """
    # Function-scope import keeps this block self-contained.
    import re

    try:
        input_data = json.loads(sys.stdin.read())
    except json.JSONDecodeError as e:
        log("error", f"Invalid input JSON: {e}")
        sys.exit(2)

    if not isinstance(input_data, dict):
        log("error", "Invalid input JSON: expected an object")
        sys.exit(2)

    config = input_data.get("config") or {}
    if not isinstance(config, dict):
        log("error", "Invalid config: expected an object")
        sys.exit(2)

    export_path = config.get("export_path", "")
    if not export_path or not isinstance(export_path, str):
        log("error", "Missing required config: export_path")
        sys.exit(2)

    export_path = os.path.expanduser(export_path)
    if not os.path.isdir(export_path):
        log("error", f"Export directory not found: {export_path}")
        sys.exit(2)

    try:
        max_messages = int(config.get("max_messages") or 0)
    except (TypeError, ValueError):
        log("error", "Invalid config: max_messages must be an integer")
        sys.exit(2)

    # ── Load users for display name lookup ───────────────────────────────
    users_map = _load_users(export_path)

    # ── Load channels ────────────────────────────────────────────────────
    channels = _load_channels(export_path)
    if not channels:
        log("error", "No channel data found in export (expected channels.json)")
        sys.exit(2)

    folder_to_id = _emit_channels(channels)
    log("info", f"Loaded {len(channels)} channels")

    # ── Load messages from channel folders ───────────────────────────────
    # Expand user mentions (<@U1234> -> @display_name) in a single pass per
    # message instead of one str.replace per known user.
    mention_re = re.compile(r"<@([^<>]+)>")

    def expand_mention(match):
        uid = match.group(1)
        # Mentions of unknown users are left exactly as written.
        return f"@{users_map[uid]}" if uid in users_map else match.group(0)

    msg_count = 0
    for ch_id, msg in _iter_messages(export_path, folder_to_id):
        ts = msg.get("ts", "")
        if not ts:
            continue

        # Skip subtypes that aren't real messages
        if msg.get("subtype", "") in _SKIPPED_SUBTYPES:
            continue

        # Fall through null/empty values so the author is always a string.
        user_id = msg.get("user") or msg.get("bot_id") or "unknown"
        author = str(users_map.get(user_id, user_id))

        text = msg.get("text") or ""
        if not isinstance(text, str):
            text = str(text)
        if "<@" in text:
            text = mention_re.sub(expand_mention, text)

        emit({
            "upsert": "message",
            "external_id": f"{ch_id}_{ts}",
            "fields": {
                "text": text[:50000],
                "author": author[:200],
                "timestamp": ts_to_iso(ts),
                "thread_ts": msg.get("thread_ts", ""),
                "reply_count": msg.get("reply_count", 0),
                "reactions": _format_reactions(msg.get("reactions", []))[:1000],
                "channel_id": ch_id,
            }
        })

        msg_count += 1
        # The generator is lazy, so stopping here also stops reading files.
        if max_messages > 0 and msg_count >= max_messages:
            break

    log("info", f"Synced {msg_count} messages across {len(channels)} channels")
|
|
|
|
|
|
# Script entry point: run the adapter only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|