From 5738e4dd008512f97496996c844885ca1380a3d8 Mon Sep 17 00:00:00 2001 From: rambros Date: Tue, 3 Mar 2026 23:40:40 +0530 Subject: [PATCH] fix message migration --- BACKUP.md | 331 +++++++++++++++++++++++++++++++++ src/stoat/migrate_message.py | 16 +- src/stoat/roles_permissions.py | 5 + src/ui/shuttle_ops.py | 5 + 4 files changed, 355 insertions(+), 2 deletions(-) create mode 100644 BACKUP.md diff --git a/BACKUP.md b/BACKUP.md new file mode 100644 index 0000000..1d5b00c --- /dev/null +++ b/BACKUP.md @@ -0,0 +1,331 @@ +# Discord Reaper: Backup System Technical Specification + +This document provides a deep-dive into the architecture, data lifecycle, and resilience strategies of the Discord Reaper backup system. + +## 1. Architectural Overview + +The backup system is built on a decoupled architecture that separates the API communication layer from the business logic and I/O operations. + +- **`DiscordReader` (API Provider)**: A high-level wrapper around the `discord.py` library. It handles authentication, rate limiting, and provides an asynchronous interface for fetching guild data, message history, and binary assets. It focuses on *fetching* rather than *processing*. +- **`DiscordExporter` (Orchestration & Serialization)**: The core engine that defines the export lifecycle. It consumes data from the `Reader`, transforms it into standardized schemas, and manages local filesystem operations. + +### Component Interaction Diagram + +```mermaid +graph TD + A[UI / CLI] --> B[DiscordExporter] + B --> C[DiscordReader] + C --> D[Discord API] + B --> E[Local Filesystem] + B --> F[User Cache Object] +``` + +--- + +## 2. Data Lifecycle & Serialization + +### 2.1 Incremental Synchronization Algorithm +To achieve idempotency and efficiency, the system implements an incremental sync strategy using Discord's snowflake IDs. + +1. **State Loading**: The `Exporter` reads the existing `{channel_id}.json` (if present). +2. 
**Snowflake Extraction**: It extracts the `lastMessageID` from the metadata. +3. **Filtered Fetch**: It calls `fetch_message_history(after_id=last_id)`. +4. **In-Memory Merge**: New messages are appended to the existing list. +5. **Atomic Write**: The updated JSON (including the new `lastMessageID`) is written back to disk, so that the next run fetches only the delta from the API. + +### 2.2 User Profile Deduplication (`user_info.json`) +The system avoids redundant storage of user metadata (usernames, roles, colors) by using a global `user_cache` map. +- **Key**: `userID` (Snowflake). +- **Policy**: Users are added to the cache only on their first appearance in any channel's history. +- **Avatar Persistence**: User avatars are stored in a centralized `user_avatars/` directory and referenced by relative paths in the JSON schemas. + +--- + +## 3. Special Channel Type Specifications + +### 3.1 Forum Channels & Threads +Forums present a hierarchical challenge where the "starter message" and the "conversation" exist in separate contexts. + +- **Forum Index (`{channel_id}.json`)**: Contains an enriched list of "starter messages" representing each thread. These entries include thread titles, applied tags, and total attachment stats (summed from the entire thread). +- **Thread Persistence**: + - **Regular Threads**: `message_backup/threads/{thread_id}.json` + - **Forum Threads**: `message_backup/{forum_id}/{thread_id}.json` +- **Starter Identification**: The system uses `thread.history(limit=1, after=snowflake(thread_id - 1))` to reliably capture the first post even if it has been edited or pinned. + +--- + +## 4. Resilience & Error Handling + +### 4.1 Permission Resilience (403 Forbidden) +The system is designed to "fail-soft" when encountering restricted content: +- **Server Level**: If the bot lacks `view_channel` or `read_message_history` globally, the backup aborts with a clear error. 
+- **Channel Level**: If a specific channel is restricted, the error is logged, and the system proceeds to the next channel to ensure a partial backup is still completed. +- **Asset Level**: If an emoji or sticker cannot be downloaded due to permissions, the metadata is preserved with a `null` local path. + +### 4.2 Lottie Sticker Workaround +Discord's Lottie stickers (format: 3) are not supported by standard `discord.py` save methods. The system implements a bypass: +1. Extracts the internal `aiohttp` session from the client: `client.http._HTTPClient__session`. +2. Performs a direct `GET` request to the sticker URL. +3. Streams the raw byte data directly to a `.json` file locally. + +--- + +## 5. Technical Schemas + +### 5.1 Message Object (`_format_message`) +The internal representation of a message focuses on portability: + +| Field | Type | Description | +| :--- | :--- | :--- | +| `messageID` | `String` | Original Discord Snowflake | +| `type` | `String` | Normalized type (Text, ThreadStarter, Forward, etc.) 
| +| `timestamp` | `ISO8601` | Created date/time | +| `isPinned` | `Boolean` | Pin status | +| `content` | `String` | Raw markdown content (or snapshot content for forwards) | +| `userID` | `String` | Reference to `user_info.json` | +| `attachments`| `Array` | List of local file references and metadata | +| `embeds` | `Array` | Raw Discord-formatted embed objects | +| `stickers` | `Array` | List of Message Sticker objects (see below) | +| `reactions` | `Array` | List of Reaction objects | + +#### Message Sticker Object +| Field | Type | Description | +| :--- | :--- | :--- | +| `id` | `String` | Sticker Snowflake ID | +| `name` | `String` | Sticker name | +| `format` | `String` | File format (PNG, APNG, LOTTIE, GIF) | +| `localPath` | `String` | Relative path to local file in `{channel_id}/` | + +#### Reaction Object +| Field | Type | Description | +| :--- | :--- | :--- | +| `emoji` | `String` | String representation (`unicode` or `name:id`) | +| `count` | `Integer` | Total count of this reaction | + +### 5.2 Asset Naming Logic +To prevent filename collisions (e.g., multiple files named `image.png`), the system uses a suffixing strategy: +`{filename_stem}-{snowflake_last_5}.{ext}` + +Example: `sunset-54321.png` + +### 5.3 `server_profile.json` Specification +| Field | Type | Description | +| :--- | :--- | :--- | +| `name` | `String` | Original Discord guild name | +| `id` | `String` | Guild Snowflake ID | +| `icon` | `String` | Relative path to local guild icon in `server_media/` | +| `banner` | `String` | Relative path to local guild banner in `server_media/` | +| `last_backup` | `ISO8601` | Timestamp of the last successful backup run | +| `ignore_channels` | `Array` | List of channel Snowflakes explicitly excluded from backup | + +### 5.4 `server_roles.json` Specification (Array of objects) +| Field | Type | Description | +| :--- | :--- | :--- | +| `id` | `String` | Role Snowflake ID | +| `name` | `String` | Role name | +| `color` | `String` | Hex-string 
representation of role color (e.g. `"#ffffff"`) | +| `position` | `Integer` | Vertical position in the hierarchy (0 is bottom) | +| `permissions`| `Integer` | Bitwise integer representing the role's Discord permissions | +| `hoist` | `Boolean` | Whether the role is displayed separately in the sidebar | +| `mentionable`| `Boolean` | Whether the role can be mentioned | + +### 5.5 `server_assets.json` Specification +Contains two primary arrays: `emojis` and `stickers`. +#### Emoji Object +| Field | Type | Description | +| :--- | :--- | :--- | +| `id` | `String` | Emoji Snowflake ID | +| `name` | `String` | Emoji name (without colons) | +| `animated` | `Boolean` | True if the emoji is a GIF | +| `filename` | `String` | Filename within `server_media/` | + +#### Sticker Object +| Field | Type | Description | +| :--- | :--- | :--- | +| `id` | `String` | Sticker Snowflake ID | +| `name` | `String` | Sticker name | +| `filename` | `String` | Filename within `server_media/` | + +### 5.6 `server_structure.json` Specification (Array of Category objects) +#### Category Object +| Field | Type | Description | +| :--- | :--- | :--- | +| `type` | `String` | Always `"category"` | +| `id` | `String` | Category Snowflake ID (or `"uncategorized"`) | +| `name` | `String` | Category name | +| `position` | `Integer` | Vertical position in hierarchy | +| `channels` | `Array` | List of Channel objects (see below) | + +#### Channel Object +| Field | Type | Description | +| :--- | :--- | :--- | +| `id` | `String` | Channel Snowflake ID | +| `name` | `String` | Channel name | +| `type` | `String` | "text", "voice", "forum", "news", or "thread" | +| `position` | `Integer` | Vertical position within the category | +| `topic` | `String` | Channel description/topic (null if empty) | +| `nsfw` | `Boolean` | True if marked Restricted/NSFW | +| `available_tags` | `Array` | List of Forum Tag objects (see below) | + +#### Forum Tag Object +| Field | Type | Description | +| :--- | :--- | :--- | +| `id` 
| `String` | Tag Snowflake ID | +| `name` | `String` | Tag display name | +| `moderated` | `Boolean` | True if restricted to moderators | +| `emoji_id` | `String` | ID of the tag's emoji (null if unicode/none) | +| `emoji_name` | `String` | Name of the tag's emoji | + +### 5.7 `user_info.json` Specification (Array of User objects) +| Field | Type | Description | +| :--- | :--- | :--- | +| `userID` | `String` | User Snowflake ID | +| `username` | `String` | Current global username | +| `userNickname`| `String` | Server-specific nickname (display name) | +| `userColor` | `String` | Role-derived color for the user | +| `userIsBot` | `Boolean` | True if the account is a bot | +| `userRoles` | `Array` | List of role snippets (name, id, color, position) | +| `userAvatar` | `String` | Relative path to local avatar in `user_avatars/` | + +### 5.8 Channel History JSON Specification +File: `message_backup/{channel_id}.json` + +This file contains the full history of a channel along with synchronization metadata. + +| Field | Type | Description | +| :--- | :--- | :--- | +| `channelName` | `String` | Human-readable name of the channel | +| `channelID` | `String` | Channel Snowflake ID | +| `channelType` | `String` | "Text", "Thread", "News", or "Forum" | +| `messageCount` | `Integer` | Total number of messages stored in the `messages` array | +| `threadCount` | `Integer` | (If Parent) Count of threads associated with this channel | +| `lastMessageID`| `String` | ID of the most recent message (used for incremental sync) | +| `totalAttachmentSizeBytes`| `Integer`| Summed size of all attachments for this channel | +| `numberOfAttachments` | `Integer`| Total count of attachments | +| `lastBackup` | `ISO8601` | Timestamp of last message fetch | +| `messages` | `Array` | The message objects (see Section 5.1) | +| `parentID` | `String` | (If Thread) Snowflake of the parent channel | + +--- + +## 7. 
Backup Reader Implementation Guide + +This section is a technical manual for developers building third-party tools (viewers, search engines, or analytics) to consume Discord Reaper backups. + +### 7.1 Entry Point Discovery +A reader should start by identifying the backup root directory (prefixed with `DISCORD_BACKUP-`). + +1. **Parse `server_profile.json`**: Extract the server name, ID, and assets (icon/banner). +2. **Load `server_structure.json`**: This defines the navigation tree for your UI. + - Iterate through categories. + - Map channels to their respective types (text, voice, forum). + - Store the `position` to preserve the original visual order. + +### 7.2 Relational Data Mapping +The backup data is normalized to minimize duplication. A reader must implement the following resolve logic: + +- **User Resolution**: When parsing a message in `{channel_id}.json`, the `userID` must be cross-referenced against the `userID` keys in `message_backup/user_info.json`. +- **Role Resolution**: Take the `id` of each role snippet in the user object's `userRoles` array and resolve it against the role metadata in `server_roles.json` to get colors and names. +- **Static Asset Resolution**: + - **Server Assets**: Prepend `server_media/` to filenames found in `server_assets.json`. + - **User Avatars**: Resolve `userAvatar` paths found in `user_info.json` (pointing to `user_avatars/`). + +### 7.3 Message Rendering Logic +When rendering the `messages` array from a channel JSON: + +| Feature | Reader Implementation Logic | +| :--- | :--- | +| **Markdown** | Content is raw Discord markdown. Use a library like `markdown-it` with discord-specific plugins. | +| **Attachments** | Resolve `url` field (`{channel_id}/{filename}`) relative to the `message_backup/` directory. | +| **Emojis/Stickers** | If a message contains custom emojis/stickers, resolve their metadata via `server_assets.json`. | +| **Replies** | Use the `reference` object to find the target `messageId`. 
Note: The target might be in the same file or a different channel/thread. | + +### 7.4 Thread & Forum Reconstruction +Reconstructing the hierarchy requires specific pointer logic: + +1. **Forums**: + - Read `message_backup/{forum_id}.json`. + - Each message in this file is a `Thread_starter_message`. + - The `messageID` of the starter message *is usually* the same as the `thread_id`. + - To load the full thread, open `message_backup/{forum_id}/{thread_id}.json`. +2. **Regular Threads**: + - Discoverable via the `parentID` field in any message or by scanning `message_backup/threads/`. + - Match the `thread.id` in a `ThreadStarter` message to the respective JSON in the `threads/` folder. + +--- + +## 8. Discord.py Model Hydration Guide + +If you are building a `discord.py` API-compatible wrapper to read these backups directly into familiar Discord objects, here is the explicit property mapping from the schema to the standard `discord.py` object attributes. + +### 8.1 Base Server (Guild) +File: `server_profile.json` & `server_roles.json` & `server_structure.json` +- **`discord.Guild`**: + - `id`: Cast `id` (str) to `int`. + - `name`: Mapped directly from `name`. + - `icon` / `banner`: Represented as `discord.Asset` objects. Use the local file paths from `icon` / `banner` as the asset URL/filepath. + - `roles`: Hydrated from `server_roles.json`. + - `channels` / `categories`: Hydrated from `server_structure.json`. + +### 8.2 Roles (`discord.Role`) +File: `server_roles.json` +- `id`: Cast `id` to `int`. +- `name`: Mapped directly. +- `color`: Parse the hex string to `discord.Color(value)`. +- `position`: Mapped directly. +- `permissions`: Initialize `discord.Permissions(value=int(permissions))`. +- `hoist`: Mapped directly to boolean. +- `mentionable`: Mapped directly to boolean. + +### 8.3 Users & Members (`discord.Member` / `discord.User`) +File: `message_backup/user_info.json` +- `id`: Cast `userID` to `int`. +- `name`: Mapped from `username`. 
+- `display_name`: Mapped from `userNickname`. +- `bot`: Mapped from `userIsBot`. +- `color`: Parse `userColor` string to `discord.Color`. +- `roles`: List of hydrated `discord.Role` objects via matching `id`s from the `userRoles` array. +- `avatar`: Mocked `discord.Asset` using the `userAvatar` local path. + +### 8.4 Channels (`discord.TextChannel`, `discord.CategoryChannel`, `discord.ForumChannel`) +File: `server_structure.json` +- Iterate over the top-level array (Categories): + - **`discord.CategoryChannel`**: + - `id`: Cast `id` to `int`. + - `name`: Mapped directly. + - `position`: Mapped directly. +- Iterate over the nested `channels` array: + - **`discord.abc.GuildChannel` classes**: + - `id`: Cast `id` to `int`. + - `name`: Mapped directly. + - `position`: Mapped directly. + - `type`: Match the `type` string back to the `discord.ChannelType` enum. + - `category_id`: Inherited from the parent category block. + - `topic`: Mapped directly (if applicable). + - `nsfw`: Mapped directly to boolean. + +### 8.5 Messages (`discord.Message`) +File: `message_backup/{channel_id}.json` (Iterating the `messages` array) +- `id`: Cast `messageID` to `int`. +- `type`: Map the string `type` (e.g., "Default", "Reply") to `discord.MessageType`. +- `created_at`: Parse `timestamp` (ISO-8601 string) into a timezone-aware `datetime` object. +- `pinned`: Mapped from `isPinned`. +- `content`: Mapped from `content`. +- `author`: Resolve the `userID` against the loaded `discord.Member` mocks. +- `embeds`: Instantiate using `discord.Embed.from_dict(embed_dict)` directly on the elements of the `embeds` array. +- **Reference (Replies)**: + - If `reference` exists, hydrate a `discord.MessageReference`. + - `message_id`: Cast `reference.messageId` to `int`. + - `channel_id`: Cast `reference.channelId` to `int`. + +### 8.6 Attachments (`discord.Attachment`) +Nested within Message objects. +- `id`: Cast `id` to `int`. +- `filename`: Mapped from `fileName`. 
+- `size`: Mapped from `fileSizeBytes`. +- `url` / `proxy_url`: Point to the local relative path (`{channel_id}/{resolved_filename}`). + +### 8.7 Reactions (`discord.Reaction` & `discord.PartialEmoji`) +Nested within Message objects. +- `count`: Mapped from `count`. +- `emoji`: Iterate the `emoji` string. If custom (contains a `:`), split it to mock a `discord.PartialEmoji(name=..., id=...)`. Otherwise, mock standard unicode strings. diff --git a/src/stoat/migrate_message.py b/src/stoat/migrate_message.py index 42550db..2e19c21 100644 --- a/src/stoat/migrate_message.py +++ b/src/stoat/migrate_message.py @@ -68,7 +68,13 @@ async def analyze_migration(context: MigrationContext, source_channel_id: int, a """ Scans channel history to count messages, threads, and attachments. """ - stats = {"messages": 0, "threads": 0, "attachments": 0} + stats = { + "messages": 0, + "threads": 0, + "attachments": 0, + "first_message_url": "", + "last_message_url": "" + } async for msg in context.discord_reader.fetch_message_history(source_channel_id, after_id=after_message_id): if not context.is_running: @@ -97,7 +103,13 @@ async def analyze_migration(context: MigrationContext, source_channel_id: int, a async def migrate_messages(context: MigrationContext, source_channel_id: int, target_channel_id: str, after_message_id: int | None = None, progress_callback: Callable[[Dict[str, Any]], Awaitable[None]] | None = None) -> Dict[str, Any]: """Migrate messages for a specific channel using Stoat masquerade for author impersonation.""" - stats = {"messages": 0, "threads": 0, "attachments": 0} + stats = { + "messages": 0, + "threads": 0, + "attachments": 0, + "first_message_url": "", + "last_message_url": "" + } logger.info(f"Starting message migration: Discord #{source_channel_id} -> Stoat #{target_channel_id}") if after_message_id: diff --git a/src/stoat/roles_permissions.py b/src/stoat/roles_permissions.py index 4152358..9ea23cc 100644 --- a/src/stoat/roles_permissions.py +++ 
b/src/stoat/roles_permissions.py @@ -181,6 +181,11 @@ async def migrate_roles(context: MigrationContext, progress_callback: Callable[[ except Exception as e: logger.error(f"Failed to sync default permissions: {e}") + # 2. Fetch and filter roles + roles = await context.discord_reader.get_roles() + if not force: + roles = [r for r in roles if not context.state.get_target_role_id(str(r.id))] + total = len(roles) cloned_role_names = [] diff --git a/src/ui/shuttle_ops.py b/src/ui/shuttle_ops.py index e4fdbc3..399f828 100644 --- a/src/ui/shuttle_ops.py +++ b/src/ui/shuttle_ops.py @@ -9,6 +9,7 @@ import logging import re import time import aiohttp +import traceback from pathlib import Path from textual.app import ComposeResult @@ -39,6 +40,8 @@ import src.stoat.migrate_message as stoat_migrate global_rate_limit_msg = "" global_rate_limit_expires = 0.0 +logger = logging.getLogger(__name__) + class RateLimitHandler(logging.Handler): """Intercepts library logs to capture rate-limit messages.""" @@ -412,6 +415,7 @@ class ShuttlePane(Container): report = self._format_clone_report(results) modal.write(report) except Exception as e: + logger.error(f"Batch Cloning Error: {e}\n{traceback.format_exc()}") modal.write(f"[bold red]Error: {e}[/bold red]") modal.phase_report("Batch Operation", "error") finally: @@ -464,6 +468,7 @@ class ShuttlePane(Container): report = self._format_sync_report(results) modal.write(report) except Exception as e: + logger.error(f"Batch Sync Error: {e}\n{traceback.format_exc()}") modal.write(f"[bold red]Error: {e}[/bold red]") modal.phase_report("Batch Operation", "error") finally: