Files
spacedrive/adapters/github/sync.py
Jamie Pine c02e3404b1 data
2026-03-26 12:19:56 -07:00

276 lines
9.4 KiB
Python

#!/usr/bin/env python3
"""
GitHub adapter for Spacedrive.
Indexes issues, pull requests, and comments from GitHub repositories
using the REST API. Supports incremental sync via `updated_at` cursor.
Requires a personal access token with `repo` scope (or fine-grained
token with Issues and Pull Requests read permissions).
"""
import json
import sys
import urllib.request
import urllib.error
from datetime import datetime, timezone
API_BASE = "https://api.github.com"
def log(level: str, message: str):
print(json.dumps({"log": level, "message": message}), flush=True)
def emit(operation: dict):
print(json.dumps(operation), flush=True)
def api_get(path: str, token: str, params: dict = None) -> list:
"""Make a GET request to GitHub API. Handles pagination, returns all results."""
results = []
url = f"{API_BASE}{path}"
if params:
query_parts = []
for k, v in params.items():
query_parts.append(f"{k}={v}")
url += "?" + "&".join(query_parts)
page = 1
while True:
separator = "&" if "?" in url else "?"
page_url = f"{url}{separator}page={page}&per_page=100"
req = urllib.request.Request(page_url)
req.add_header("Authorization", f"Bearer {token}")
req.add_header("Accept", "application/vnd.github+json")
req.add_header("X-GitHub-Api-Version", "2022-11-28")
req.add_header("User-Agent", "spacedrive-adapter/0.1")
try:
with urllib.request.urlopen(req) as resp:
data = json.loads(resp.read().decode())
except urllib.error.HTTPError as e:
if e.code == 403:
# Rate limited — stop pagination
log("warn", f"GitHub API rate limited on {path}")
break
elif e.code == 404:
log("warn", f"Not found: {path}")
return []
else:
raise
if not isinstance(data, list):
# Single object response (e.g., /repos/owner/repo)
return [data]
results.extend(data)
if len(data) < 100:
break
page += 1
# Safety limit
if page > 50:
log("warn", f"Pagination limit reached for {path}")
break
return results
def parse_iso(dt_str: str) -> str:
"""Normalize GitHub's ISO 8601 timestamps to UTC."""
if not dt_str:
return ""
try:
# GitHub returns "2025-01-15T10:30:00Z" format
s = dt_str.replace("Z", "+00:00")
dt = datetime.fromisoformat(s)
return dt.astimezone(timezone.utc).isoformat()
except (ValueError, TypeError):
return dt_str
def main():
try:
input_data = json.loads(sys.stdin.read())
except json.JSONDecodeError as e:
log("error", f"Invalid input JSON: {e}")
sys.exit(2)
config = input_data.get("config", {})
cursor = input_data.get("cursor")
token = config.get("token", "")
if not token:
log("error", "Missing required config: token")
sys.exit(2)
repos_str = config.get("repos", "")
include_prs = config.get("include_prs", True)
if isinstance(include_prs, str):
include_prs = include_prs.lower() in ("true", "1", "yes")
max_items = int(config.get("max_items", 500))
# ── Determine repos to index ─────────────────────────────────────────
if repos_str:
repo_list = [r.strip() for r in repos_str.split(",") if r.strip()]
else:
# Fetch all repos the user has access to
log("info", "No repos specified, fetching all accessible repos")
try:
all_repos = api_get("/user/repos", token, {"sort": "updated", "type": "all"})
repo_list = [r["full_name"] for r in all_repos if r.get("full_name")]
except Exception as e:
log("error", f"Failed to fetch repos: {e}")
sys.exit(1)
log("info", f"Indexing {len(repo_list)} repositories")
max_updated = cursor or ""
total_issues = 0
total_comments = 0
for repo_full_name in repo_list:
# ── Upsert repository ────────────────────────────────────────────
try:
repo_data = api_get(f"/repos/{repo_full_name}", token)
if not repo_data:
log("warn", f"Could not fetch repo: {repo_full_name}")
continue
repo = repo_data[0]
except Exception as e:
log("warn", f"Failed to fetch repo {repo_full_name}: {e}")
continue
repo_id = str(repo.get("id", repo_full_name))
emit({
"upsert": "repository",
"external_id": repo_id,
"fields": {
"name": repo.get("name", ""),
"full_name": repo.get("full_name", repo_full_name),
"description": (repo.get("description") or "")[:5000],
"language": repo.get("language") or "",
"stars": repo.get("stargazers_count", 0),
"url": repo.get("html_url", ""),
}
})
# ── Fetch issues (and PRs if enabled) ────────────────────────────
params = {
"state": "all",
"sort": "updated",
"direction": "desc",
}
if cursor:
params["since"] = cursor
try:
issues = api_get(f"/repos/{repo_full_name}/issues", token, params)
except Exception as e:
log("warn", f"Failed to fetch issues for {repo_full_name}: {e}")
continue
repo_issue_count = 0
for item in issues:
if repo_issue_count >= max_items:
break
is_pr = "pull_request" in item
if is_pr and not include_prs:
continue
issue_number = item.get("number", 0)
issue_id = f"{repo_full_name}#{issue_number}"
title = item.get("title", "")
body = item.get("body") or ""
author = item.get("user", {}).get("login", "") if item.get("user") else ""
state = item.get("state", "")
url = item.get("html_url", "")
created_at = parse_iso(item.get("created_at", ""))
updated_at = parse_iso(item.get("updated_at", ""))
closed_at = parse_iso(item.get("closed_at", ""))
comments_count = item.get("comments", 0)
# Labels
labels = []
for label in item.get("labels", []):
if isinstance(label, dict):
labels.append(label.get("name", ""))
elif isinstance(label, str):
labels.append(label)
labels_str = ", ".join(labels)
emit({
"upsert": "issue",
"external_id": issue_id,
"fields": {
"title": title[:500],
"body": body[:50000],
"author": author,
"state": state,
"number": issue_number,
"url": url,
"is_pr": is_pr,
"labels": labels_str[:1000],
"comments_count": comments_count,
"created_at": created_at,
"updated_at": updated_at,
"closed_at": closed_at,
"repository_id": repo_id,
}
})
total_issues += 1
repo_issue_count += 1
# Track latest update for cursor
raw_updated = item.get("updated_at", "")
if raw_updated > max_updated:
max_updated = raw_updated
# ── Fetch comments for this issue ────────────────────────────
if comments_count > 0:
try:
comments = api_get(
f"/repos/{repo_full_name}/issues/{issue_number}/comments",
token
)
for comment in comments:
comment_id = str(comment.get("id", ""))
if not comment_id:
continue
emit({
"upsert": "comment",
"external_id": comment_id,
"fields": {
"body": (comment.get("body") or "")[:50000],
"author": comment.get("user", {}).get("login", "") if comment.get("user") else "",
"created_at": parse_iso(comment.get("created_at", "")),
"url": comment.get("html_url", ""),
"issue_id": issue_id,
}
})
total_comments += 1
except Exception as e:
log("warn", f"Failed to fetch comments for {issue_id}: {e}")
log("info", f"{repo_full_name}: {repo_issue_count} issues/PRs")
# Emit cursor
if max_updated:
emit({"cursor": max_updated})
log("info", f"Synced {total_issues} issues/PRs and {total_comments} comments from {len(repo_list)} repos")
if __name__ == "__main__":
main()