diff --git a/.env.example b/.env.example index bf9ea11..2694189 100644 --- a/.env.example +++ b/.env.example @@ -41,10 +41,7 @@ NEO4J_USERNAME=neo4j NEO4J_PASSWORD=27192e6432564f4788d55c15131bd5ac OPENAI_API_KEY= - MAGIC_LINK_SECRET=27192e6432564f4788d55c15131bd5ac - - NEO4J_AUTH=neo4j/27192e6432564f4788d55c15131bd5ac OLLAMA_URL=http://ollama:11434 diff --git a/.github/workflows/build-docker-image.yml b/.github/workflows/build-docker-image.yml index 5f53f5b..b784c09 100644 --- a/.github/workflows/build-docker-image.yml +++ b/.github/workflows/build-docker-image.yml @@ -7,32 +7,6 @@ on: workflow_dispatch: jobs: - build-init: - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v2 - with: - ref: main - - - name: Set up QEMU - uses: docker/setup-qemu-action@v1 - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v1 - - - name: Login to Docker Registry - run: echo "${{ secrets.DOCKER_PASSWORD }}" | docker login -u "${{ secrets.DOCKER_USERNAME }}" --password-stdin - - - name: Build and Push Frontend Docker Image - uses: docker/build-push-action@v2 - with: - context: . - file: ./apps/init/Dockerfile - platforms: linux/amd64,linux/arm64 - push: true - tags: redplanethq/init:${{ github.ref_name }} - build-webapp: runs-on: ubuntu-latest diff --git a/apps/init/.gitignore b/apps/init/.gitignore deleted file mode 100644 index 3814592..0000000 --- a/apps/init/.gitignore +++ /dev/null @@ -1,51 +0,0 @@ -# See https://help.github.com/articles/ignoring-files/ for more about ignoring files. - -# Dependencies -node_modules -.pnp -.pnp.js - -# Local env files -.env -.env.local -.env.development.local -.env.test.local -.env.production.local - -# Testing -coverage - -# Turbo -.turbo - -# Vercel -.vercel - -# Build Outputs -.next/ -out/ -build -dist -.tshy/ -.tshy-build/ - -# Debug -npm-debug.log* -yarn-debug.log* -yarn-error.log* - -# Misc -.DS_Store -*.pem - -docker-compose.dev.yaml - -clickhouse/ -.vscode/ -registry/ - -.cursor -CLAUDE.md - -.claude - diff --git a/apps/init/Dockerfile b/apps/init/Dockerfile deleted file mode 100644 index 46ec445..0000000 --- a/apps/init/Dockerfile +++ /dev/null @@ -1,70 +0,0 @@ -ARG NODE_IMAGE=node:20.11.1-bullseye-slim@sha256:5a5a92b3a8d392691c983719dbdc65d9f30085d6dcd65376e7a32e6fe9bf4cbe - -FROM ${NODE_IMAGE} AS pruner - -WORKDIR /core - -COPY --chown=node:node . . -RUN npx -q turbo@2.5.3 prune --scope=@redplanethq/init --docker -RUN find . -name "node_modules" -type d -prune -exec rm -rf '{}' + - -# Base strategy to have layer caching -FROM ${NODE_IMAGE} AS base -RUN apt-get update && apt-get install -y openssl dumb-init postgresql-client -WORKDIR /core -COPY --chown=node:node .gitignore .gitignore -COPY --from=pruner --chown=node:node /core/out/json/ . -COPY --from=pruner --chown=node:node /core/out/pnpm-lock.yaml ./pnpm-lock.yaml -COPY --from=pruner --chown=node:node /core/out/pnpm-workspace.yaml ./pnpm-workspace.yaml - -## Dev deps -FROM base AS dev-deps -WORKDIR /core -# Corepack is used to install pnpm -RUN corepack enable -ENV NODE_ENV development -RUN pnpm install --ignore-scripts --no-frozen-lockfile - -## Production deps -FROM base AS production-deps -WORKDIR /core -# Corepack is used to install pnpm -RUN corepack enable -ENV NODE_ENV production -RUN pnpm install --prod --no-frozen-lockfile - -## Builder (builds the init CLI) -FROM base AS builder -WORKDIR /core -# Corepack is used to install pnpm -RUN corepack enable - -COPY --from=pruner --chown=node:node /core/out/full/ . -COPY --from=dev-deps --chown=node:node /core/ . 
-COPY --chown=node:node turbo.json turbo.json -COPY --chown=node:node .configs/tsconfig.base.json .configs/tsconfig.base.json -RUN pnpm run build --filter=@redplanethq/init... - -# Runner -FROM ${NODE_IMAGE} AS runner -RUN apt-get update && apt-get install -y openssl postgresql-client ca-certificates -WORKDIR /core -RUN corepack enable -ENV NODE_ENV production - -COPY --from=base /usr/bin/dumb-init /usr/bin/dumb-init -COPY --from=pruner --chown=node:node /core/out/full/ . -COPY --from=production-deps --chown=node:node /core . -COPY --from=builder --chown=node:node /core/apps/init/dist ./apps/init/dist - -# Copy the trigger dump file -COPY --chown=node:node apps/init/trigger.dump ./apps/init/trigger.dump - -# Copy and set up entrypoint script -COPY --chown=node:node apps/init/entrypoint.sh ./apps/init/entrypoint.sh -RUN chmod +x ./apps/init/entrypoint.sh - -USER node -WORKDIR /core/apps/init -ENTRYPOINT ["dumb-init", "--"] -CMD ["./entrypoint.sh"] \ No newline at end of file diff --git a/apps/init/README.md b/apps/init/README.md deleted file mode 100644 index 9d6d85d..0000000 --- a/apps/init/README.md +++ /dev/null @@ -1,197 +0,0 @@ -# Core CLI - -🧠 **CORE - Contextual Observation & Recall Engine** - -A Command-Line Interface for setting up and managing the Core development environment. - -## Installation - -```bash -npm install -g @redplanethq/core -``` - -## Commands - -### `core init` - -**One-time setup command** - Initializes the Core development environment with full configuration. - -### `core start` - -**Daily usage command** - Starts all Core services (Docker containers). - -### `core stop` - -**Daily usage command** - Stops all Core services (Docker containers). - -## Getting Started - -### Prerequisites - -- **Node.js** (v18.20.0 or higher) -- **Docker** and **Docker Compose** -- **Git** -- **pnpm** package manager - -### Initial Setup - -1. **Clone the Core repository:** - ```bash - git clone https://github.com/redplanethq/core.git - cd core - ``` - -2. **Run the initialization command:** - ```bash - core init - ``` - -3. **The CLI will guide you through the complete setup process:** - -#### Step 1: Prerequisites Check -- The CLI shows a checklist of required tools -- Confirms you're in the Core repository directory -- Exits with instructions if prerequisites aren't met - -#### Step 2: Environment Configuration - -- Copies `.env.example` to `.env` in the root directory -- Copies `trigger/.env.example` to `trigger/.env` -- Skips copying if `.env` files already exist - -#### Step 3: Docker Services Startup - -- Starts main Core services: `docker compose up -d` -- Starts Trigger.dev services: `docker compose up -d` (in trigger/ directory) -- Shows real-time output with progress indicators - -#### Step 4: Database Health Check - -- Verifies PostgreSQL is running on `localhost:5432` -- Retries for up to 60 seconds if needed - -#### Step 5: Trigger.dev Setup (Interactive) - -- **If Trigger.dev is not configured:** - - 1. Prompts you to open http://localhost:8030 - 2. Asks you to login to Trigger.dev - 3. Guides you to create an organization and project - 4. Collects your Project ID and Secret Key - 5. Updates `.env` with your Trigger.dev configuration - 6. 
Restarts Core services with new configuration - -- **If Trigger.dev is already configured:** - - Skips setup and shows "Configuration already exists" message - -#### Step 6: Docker Registry Login - -- Displays docker login command with credentials from `.env` -- Waits for you to complete the login process - -#### Step 7: Trigger.dev Task Deployment - -- Automatically runs: `npx trigger.dev@v4-beta login -a http://localhost:8030` -- Deploys tasks with: `pnpm trigger:deploy` -- Shows manual deployment instructions if automatic deployment fails - -#### Step 8: Setup Complete! - -- Confirms all services are running -- Shows service URLs and connection information - -## Daily Usage - -After initial setup, use these commands for daily development: - -### Start Services - -```bash -core start -``` - -Starts all Docker containers for Core development. - -### Stop Services - -```bash -core stop -``` - -Stops all Docker containers. - -## Service URLs - -After setup, these services will be available: - -- **Core Application**: http://localhost:3033 -- **Trigger.dev**: http://localhost:8030 -- **PostgreSQL**: localhost:5432 - -## Troubleshooting - -### Repository Not Found - -If you run commands outside the Core repository: - -- The CLI will ask you to confirm you're in the Core repository -- If not, it provides instructions to clone the repository -- Navigate to the Core repository directory before running commands again - -### Docker Issues - -- Ensure Docker is running -- Check Docker Compose is installed -- Verify you have sufficient system resources - -### Trigger.dev Setup Issues - -- Check container logs: `docker logs trigger-webapp --tail 50` -- Ensure you can access http://localhost:8030 -- Verify your network allows connections to localhost - -### Environment Variables - -The CLI automatically manages these environment variables: - -- `TRIGGER_PROJECT_ID` - Your Trigger.dev project ID -- `TRIGGER_SECRET_KEY` - Your Trigger.dev secret key -- Docker registry credentials for deployment - -### Manual Trigger.dev Deployment - -If automatic deployment fails, run manually: - -```bash -npx trigger.dev@v4-beta login -a http://localhost:8030 -pnpm trigger:deploy -``` - -## Development Workflow - -1. **First time setup:** `core init` -2. **Daily development:** - - `core start` - Start your development environment - - Do your development work - - `core stop` - Stop services when done - -## Support - -For issues and questions: - -- Check the main Core repository: https://github.com/redplanethq/core -- Review Docker container logs for troubleshooting -- Ensure all prerequisites are properly installed - -## Features - -- šŸš€ **One-command setup** - Complete environment initialization -- šŸ”„ **Smart configuration** - Skips already configured components -- šŸ“± **Real-time feedback** - Live progress indicators and output -- 🐳 **Docker integration** - Full container lifecycle management -- šŸ”§ **Interactive setup** - Guided configuration process -- šŸŽÆ **Error handling** - Graceful failure with recovery instructions - ---- - -**Happy coding with Core!** šŸŽ‰ diff --git a/apps/init/entrypoint.sh b/apps/init/entrypoint.sh deleted file mode 100644 index 86b64f0..0000000 --- a/apps/init/entrypoint.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/sh - -# Exit on any error -set -e - -echo "Starting init CLI..." - -# Wait for database to be ready -echo "Waiting for database connection..." 
-until pg_isready -h "${DB_HOST:-localhost}" -p "${DB_PORT:-5432}" -U "${POSTGRES_USER:-docker}"; do - echo "Database is unavailable - sleeping" - sleep 2 -done - -echo "Database is ready!" - -# Run the init command -echo "Running init command..." -node ./dist/esm/index.js init - -echo "Init completed successfully!" -exit 0 \ No newline at end of file diff --git a/apps/init/package.json b/apps/init/package.json deleted file mode 100644 index b7c2101..0000000 --- a/apps/init/package.json +++ /dev/null @@ -1,145 +0,0 @@ -{ - "name": "@redplanethq/init", - "version": "0.1.0", - "description": "A init service to create trigger instance", - "type": "module", - "license": "MIT", - "repository": { - "type": "git", - "url": "https://github.com/redplanethq/core", - "directory": "apps/init" - }, - "publishConfig": { - "access": "public" - }, - "keywords": [ - "typescript" - ], - "files": [ - "dist", - "trigger.dump" - ], - "bin": { - "core": "./dist/esm/index.js" - }, - "tshy": { - "selfLink": false, - "main": false, - "module": false, - "dialects": [ - "esm" - ], - "project": "./tsconfig.json", - "exclude": [ - "**/*.test.ts" - ], - "exports": { - "./package.json": "./package.json", - ".": "./src/index.ts" - } - }, - "devDependencies": { - "@epic-web/test-server": "^0.1.0", - "@types/gradient-string": "^1.1.2", - "@types/ini": "^4.1.1", - "@types/object-hash": "3.0.6", - "@types/polka": "^0.5.7", - "@types/react": "^18.2.48", - "@types/resolve": "^1.20.6", - "@types/rimraf": "^4.0.5", - "@types/semver": "^7.5.0", - "@types/source-map-support": "0.5.10", - "@types/ws": "^8.5.3", - "cpy-cli": "^5.0.0", - "execa": "^8.0.1", - "find-up": "^7.0.0", - "rimraf": "^5.0.7", - "ts-essentials": "10.0.1", - "tshy": "^3.0.2", - "tsx": "4.17.0" - }, - "scripts": { - "clean": "rimraf dist .tshy .tshy-build .turbo", - "typecheck": "tsc -p tsconfig.src.json --noEmit", - "build": "tshy", - "test": "vitest", - "test:e2e": "vitest --run -c ./e2e/vitest.config.ts" - }, - "dependencies": { - "@clack/prompts": "^0.10.0", - "@depot/cli": "0.0.1-cli.2.80.0", - "@opentelemetry/api": "1.9.0", - "@opentelemetry/api-logs": "0.52.1", - "@opentelemetry/exporter-logs-otlp-http": "0.52.1", - "@opentelemetry/exporter-trace-otlp-http": "0.52.1", - "@opentelemetry/instrumentation": "0.52.1", - "@opentelemetry/instrumentation-fetch": "0.52.1", - "@opentelemetry/resources": "1.25.1", - "@opentelemetry/sdk-logs": "0.52.1", - "@opentelemetry/sdk-node": "0.52.1", - "@opentelemetry/sdk-trace-base": "1.25.1", - "@opentelemetry/sdk-trace-node": "1.25.1", - "@opentelemetry/semantic-conventions": "1.25.1", - "ansi-escapes": "^7.0.0", - "braces": "^3.0.3", - "c12": "^1.11.1", - "chalk": "^5.2.0", - "chokidar": "^3.6.0", - "cli-table3": "^0.6.3", - "commander": "^9.4.1", - "defu": "^6.1.4", - "dotenv": "^16.4.5", - "dotenv-expand": "^12.0.2", - "esbuild": "^0.23.0", - "eventsource": "^3.0.2", - "evt": "^2.4.13", - "fast-npm-meta": "^0.2.2", - "git-last-commit": "^1.0.1", - "gradient-string": "^2.0.2", - "has-flag": "^5.0.1", - "import-in-the-middle": "1.11.0", - "import-meta-resolve": "^4.1.0", - "ini": "^5.0.0", - "jsonc-parser": "3.2.1", - "magicast": "^0.3.4", - "minimatch": "^10.0.1", - "mlly": "^1.7.1", - "nypm": "^0.5.4", - "nanoid": "3.3.8", - "object-hash": "^3.0.0", - "open": "^10.0.3", - "knex": "3.1.0", - "p-limit": "^6.2.0", - "p-retry": "^6.1.0", - "partysocket": "^1.0.2", - "pkg-types": "^1.1.3", - "polka": "^0.5.2", - "pg": "8.16.3", - "resolve": "^1.22.8", - "semver": "^7.5.0", - "signal-exit": "^4.1.0", - "source-map-support": 
"0.5.21", - "std-env": "^3.7.0", - "supports-color": "^10.0.0", - "tiny-invariant": "^1.2.0", - "tinyexec": "^0.3.1", - "tinyglobby": "^0.2.10", - "uuid": "11.1.0", - "ws": "^8.18.0", - "xdg-app-paths": "^8.3.0", - "zod": "3.23.8", - "zod-validation-error": "^1.5.0" - }, - "engines": { - "node": ">=18.20.0" - }, - "exports": { - "./package.json": "./package.json", - ".": { - "import": { - "types": "./dist/esm/index.d.ts", - "default": "./dist/esm/index.js" - } - } - } -} diff --git a/apps/init/src/cli/index.ts b/apps/init/src/cli/index.ts deleted file mode 100644 index e20545c..0000000 --- a/apps/init/src/cli/index.ts +++ /dev/null @@ -1,14 +0,0 @@ -import { Command } from "commander"; -import { initCommand } from "../commands/init.js"; -import { VERSION } from "./version.js"; - -const program = new Command(); - -program.name("core").description("Core CLI - A Command-Line Interface for Core").version(VERSION); - -program - .command("init") - .description("Initialize Core development environment (run once)") - .action(initCommand); - -program.parse(process.argv); diff --git a/apps/init/src/cli/version.ts b/apps/init/src/cli/version.ts deleted file mode 100644 index 2985a76..0000000 --- a/apps/init/src/cli/version.ts +++ /dev/null @@ -1,3 +0,0 @@ -import { env } from "../utils/env.js"; - -export const VERSION = env.VERSION; diff --git a/apps/init/src/commands/init.ts b/apps/init/src/commands/init.ts deleted file mode 100644 index 83ad673..0000000 --- a/apps/init/src/commands/init.ts +++ /dev/null @@ -1,36 +0,0 @@ -import { intro, outro, note } from "@clack/prompts"; -import { printCoreBrainLogo } from "../utils/ascii.js"; -import { initTriggerDatabase, updateWorkerImage } from "../utils/trigger.js"; - -export async function initCommand() { - // Display the CORE brain logo - printCoreBrainLogo(); - - intro("šŸš€ Core Development Environment Setup"); - - try { - await initTriggerDatabase(); - await updateWorkerImage(); - - note( - [ - "Your services will start running:", - "", - "• Core Application: http://localhost:3033", - "• Trigger.dev: http://localhost:8030", - "• PostgreSQL: localhost:5432", - "", - "You can now start developing with Core!", - "", - "ā„¹ļø When logging in to the Core Application, you can find the login URL in the Docker container logs:", - " docker logs core-app --tail 50", - ].join("\n"), - "šŸš€ Services Running" - ); - outro("šŸŽ‰ Setup Complete!"); - process.exit(0); - } catch (error: any) { - outro(`āŒ Setup failed: ${error.message}`); - process.exit(1); - } -} diff --git a/apps/init/src/index.ts b/apps/init/src/index.ts deleted file mode 100644 index 44007a1..0000000 --- a/apps/init/src/index.ts +++ /dev/null @@ -1,3 +0,0 @@ -#!/usr/bin/env node - -import "./cli/index.js"; diff --git a/apps/init/src/utils/ascii.ts b/apps/init/src/utils/ascii.ts deleted file mode 100644 index 5df2765..0000000 --- a/apps/init/src/utils/ascii.ts +++ /dev/null @@ -1,29 +0,0 @@ -import chalk from "chalk"; -import { VERSION } from "../cli/version.js"; - -export function printCoreBrainLogo(): void { - const brain = ` - ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā•— ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā•— ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā•— ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā•— - ā–ˆā–ˆā•”ā•ā•ā•ā•ā•ā–ˆā–ˆā•”ā•ā•ā•ā–ˆā–ˆā•—ā–ˆā–ˆā•”ā•ā•ā–ˆā–ˆā•—ā–ˆā–ˆā•”ā•ā•ā•ā•ā• - ā–ˆā–ˆā•‘ ā–ˆā–ˆā•‘ ā–ˆā–ˆā•‘ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā•”ā•ā–ˆā–ˆā–ˆā–ˆā–ˆā•— - ā–ˆā–ˆā•‘ ā–ˆā–ˆā•‘ ā–ˆā–ˆā•‘ā–ˆā–ˆā•”ā•ā•ā–ˆā–ˆā•—ā–ˆā–ˆā•”ā•ā•ā• - ā•šā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā•—ā•šā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā•”ā•ā–ˆā–ˆā•‘ ā–ˆā–ˆā•‘ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā•— - ā•šā•ā•ā•ā•ā•ā• 
ā•šā•ā•ā•ā•ā•ā• ā•šā•ā• ā•šā•ā•ā•šā•ā•ā•ā•ā•ā•ā• - - o o o - o o---o---o o - o---o o o---o---o - o o---o---o---o o - o---o o o---o---o - o o---o---o o - o o o - - `; - - console.log(chalk.cyan(brain)); - console.log( - chalk.bold.white( - ` 🧠 CORE - Contextual Observation & Recall Engine ${VERSION ? chalk.gray(`(${VERSION})`) : ""}\n` - ) - ); -} diff --git a/apps/init/src/utils/env.ts b/apps/init/src/utils/env.ts deleted file mode 100644 index 458ec7b..0000000 --- a/apps/init/src/utils/env.ts +++ /dev/null @@ -1,24 +0,0 @@ -import { z } from "zod"; - -const EnvironmentSchema = z.object({ - // Version - VERSION: z.string().default("0.1.24"), - - // Database - DB_HOST: z.string().default("localhost"), - DB_PORT: z.string().default("5432"), - TRIGGER_DB: z.string().default("trigger"), - POSTGRES_USER: z.string().default("docker"), - POSTGRES_PASSWORD: z.string().default("docker"), - - // Trigger database - TRIGGER_TASKS_IMAGE: z.string().default("redplanethq/proj_core:latest"), - - // Node environment - NODE_ENV: z - .union([z.literal("development"), z.literal("production"), z.literal("test")]) - .default("development"), -}); - -export type Environment = z.infer; -export const env = EnvironmentSchema.parse(process.env); diff --git a/apps/init/src/utils/trigger.ts b/apps/init/src/utils/trigger.ts deleted file mode 100644 index b8bd389..0000000 --- a/apps/init/src/utils/trigger.ts +++ /dev/null @@ -1,182 +0,0 @@ -import Knex from "knex"; -import path from "path"; -import { fileURLToPath } from "url"; -import { env } from "./env.js"; -import { spinner, note, log } from "@clack/prompts"; - -const __filename = fileURLToPath(import.meta.url); -const __dirname = path.dirname(__filename); - -/** - * Returns a PostgreSQL database URL for the given database name. - * Throws if required environment variables are missing. - */ -export function getDatabaseUrl(dbName: string): string { - const { POSTGRES_USER, POSTGRES_PASSWORD, DB_HOST, DB_PORT } = env; - - if (!POSTGRES_USER || !POSTGRES_PASSWORD || !DB_HOST || !DB_PORT || !dbName) { - throw new Error( - "One or more required environment variables are missing: POSTGRES_USER, POSTGRES_PASSWORD, DB_HOST, DB_PORT, dbName" - ); - } - - return `postgresql://${POSTGRES_USER}:${POSTGRES_PASSWORD}@${DB_HOST}:${DB_PORT}/${dbName}`; -} - -/** - * Checks if the database specified by TRIGGER_DB exists, and creates it if it does not. - * Returns { exists: boolean, created: boolean } - exists indicates success, created indicates if database was newly created. - */ -export async function ensureDatabaseExists(): Promise<{ exists: boolean; created: boolean }> { - const { TRIGGER_DB } = env; - - if (!TRIGGER_DB) { - throw new Error("TRIGGER_DB environment variable is missing"); - } - - // Build a connection string to the default 'postgres' database - const adminDbUrl = getDatabaseUrl("postgres"); - - // Create a Knex instance for the admin connection - const adminKnex = Knex({ - client: "pg", - connection: adminDbUrl, - }); - - const s = spinner(); - s.start("Checking for Trigger.dev database..."); - - try { - // Check if the database exists - const result = await adminKnex.select(1).from("pg_database").where("datname", TRIGGER_DB); - - if (result.length === 0) { - s.message("Database not found. 
Creating..."); - // Database does not exist, create it - await adminKnex.raw(`CREATE DATABASE "${TRIGGER_DB}"`); - s.stop("Database created."); - return { exists: true, created: true }; - } else { - s.stop("Database exists."); - return { exists: true, created: false }; - } - } catch (err) { - s.stop("Failed to ensure database exists."); - log.warning("Failed to ensure database exists: " + (err as Error).message); - return { exists: false, created: false }; - } finally { - await adminKnex.destroy(); - } -} - -// Main initialization function -export async function initTriggerDatabase() { - const { TRIGGER_DB } = env; - - if (!TRIGGER_DB) { - throw new Error("TRIGGER_DB environment variable is missing"); - } - - // Ensure the database exists - const { exists, created } = await ensureDatabaseExists(); - if (!exists) { - throw new Error("Failed to create or verify database exists"); - } - - // Only run pg_restore if the database was newly created - if (!created) { - note("Database already exists, skipping restore from trigger.dump"); - return; - } - - // Run pg_restore with the trigger.dump file - const dumpFilePath = path.join(__dirname, "../../../trigger.dump"); - const connectionString = getDatabaseUrl(TRIGGER_DB); - - const s = spinner(); - s.start("Restoring database from trigger.dump..."); - - try { - // Use execSync and capture stdout/stderr, send to spinner.log - const { spawn } = await import("child_process"); - await new Promise((resolve, reject) => { - const child = spawn( - "pg_restore", - ["--verbose", "--no-acl", "--no-owner", "-d", connectionString, dumpFilePath], - { stdio: ["ignore", "pipe", "pipe"] } - ); - - child.stdout.on("data", (data) => { - s.message(data.toString()); - }); - - child.stderr.on("data", (data) => { - s.message(data.toString()); - }); - - child.on("close", (code) => { - if (code === 0) { - s.stop("Database restored successfully from trigger.dump"); - resolve(); - } else { - s.stop("Failed to restore database."); - log.warning(`Failed to restore database: pg_restore exited with code ${code}`); - reject(new Error(`Database restore failed: pg_restore exited with code ${code}`)); - } - }); - - child.on("error", (err) => { - s.stop("Failed to restore database."); - log.warning("Failed to restore database: " + err.message); - reject(new Error(`Database restore failed: ${err.message}`)); - }); - }); - } catch (error: any) { - s.stop("Failed to restore database."); - log.warning("Failed to restore database: " + error.message); - throw new Error(`Database restore failed: ${error.message}`); - } -} - -export async function updateWorkerImage() { - const { TRIGGER_DB, TRIGGER_TASKS_IMAGE } = env; - - if (!TRIGGER_DB) { - throw new Error("TRIGGER_DB environment variable is missing"); - } - - const connectionString = getDatabaseUrl(TRIGGER_DB); - - const knex = Knex({ - client: "pg", - connection: connectionString, - }); - - const s = spinner(); - s.start("Updating worker image reference..."); - - try { - // Get the first record from WorkerDeployment table - const firstWorkerDeployment = await knex("WorkerDeployment").select("id").first(); - - if (!firstWorkerDeployment) { - s.stop("No WorkerDeployment records found, skipping image update"); - note("No WorkerDeployment records found, skipping image update"); - return; - } - - // Update the imageReference column with the TRIGGER_TASKS_IMAGE value - await knex("WorkerDeployment").where("id", firstWorkerDeployment.id).update({ - imageReference: TRIGGER_TASKS_IMAGE, - updatedAt: new Date(), - }); - - s.stop(`Successfully 
updated worker image reference to: ${TRIGGER_TASKS_IMAGE}`); - } catch (error: any) { - s.stop("Failed to update worker image."); - log.warning("Failed to update worker image: " + error.message); - throw new Error(`Worker image update failed: ${error.message}`); - } finally { - await knex.destroy(); - } -} diff --git a/apps/init/trigger.dump b/apps/init/trigger.dump deleted file mode 100644 index 81a09bc..0000000 Binary files a/apps/init/trigger.dump and /dev/null differ diff --git a/apps/init/tsconfig.json b/apps/init/tsconfig.json deleted file mode 100644 index b01fe1d..0000000 --- a/apps/init/tsconfig.json +++ /dev/null @@ -1,40 +0,0 @@ -{ - "include": ["./src/**/*.ts"], - "exclude": ["./src/**/*.test.ts"], - "compilerOptions": { - "target": "es2022", - "lib": ["ES2022", "DOM", "DOM.Iterable", "DOM.AsyncIterable"], - "module": "NodeNext", - "moduleResolution": "NodeNext", - "moduleDetection": "force", - "verbatimModuleSyntax": false, - "jsx": "react", - - "strict": true, - "alwaysStrict": true, - "strictPropertyInitialization": true, - "skipLibCheck": true, - "forceConsistentCasingInFileNames": true, - "noUnusedLocals": false, - "noUnusedParameters": false, - "noImplicitAny": true, - "noImplicitReturns": true, - "noImplicitThis": true, - - "noFallthroughCasesInSwitch": true, - "resolveJsonModule": true, - - "removeComments": false, - "esModuleInterop": true, - "emitDecoratorMetadata": false, - "experimentalDecorators": false, - "downlevelIteration": true, - "isolatedModules": true, - "noUncheckedIndexedAccess": true, - - "pretty": true, - "isolatedDeclarations": false, - "composite": true, - "sourceMap": true - } -} diff --git a/apps/init/vite.config.ts b/apps/init/vite.config.ts deleted file mode 100644 index aba1cd0..0000000 --- a/apps/init/vite.config.ts +++ /dev/null @@ -1,8 +0,0 @@ -import { configDefaults, defineConfig } from "vitest/config"; - -export default defineConfig({ - test: { - globals: true, - exclude: [...configDefaults.exclude, "e2e/**/*"], - }, -}); diff --git a/apps/webapp/app/bert/README.md b/apps/webapp/app/bert/README.md new file mode 100644 index 0000000..f44116b --- /dev/null +++ b/apps/webapp/app/bert/README.md @@ -0,0 +1,299 @@ +# BERT Topic Modeling CLI for Echo Episodes + +This CLI tool performs topic modeling on Echo episodes using BERTopic. It connects to Neo4j, retrieves episodes with their pre-computed embeddings for a given user, and discovers thematic clusters using HDBSCAN clustering. + +## Features + +- Connects to Neo4j database to fetch episodes +- Uses pre-computed embeddings (no need to regenerate them) +- Performs semantic topic clustering with BERTopic +- Displays topics with: + - Top keywords per topic + - Episode count per topic + - Sample episodes for each topic +- Configurable minimum topic size +- Environment variable support for easy configuration + +## Prerequisites + +- Python 3.8+ +- Access to Neo4j database with episodes stored +- Pre-computed embeddings stored in Neo4j (in `contentEmbedding` field) + +## Installation + +1. Navigate to the bert directory: + +```bash +cd apps/webapp/app/bert +``` + +2. Install dependencies: + +```bash +pip install -r requirements.txt +``` + +## Configuration + +The CLI can read Neo4j connection details from: + +1. **Environment variables** (recommended) - Create a `.env` file or export: + + ```bash + export NEO4J_URI=bolt://localhost:7687 + export NEO4J_USERNAME=neo4j + export NEO4J_PASSWORD=your_password + ``` + +2. **Command-line options** - Pass credentials directly as flags + +3. 
**From project root** - The tool automatically loads `.env` from the project root

## Usage

### Basic Usage

Using environment variables (most common):

```bash
python main.py user-123
```

### Advanced Options

```bash
python main.py USER_ID [OPTIONS]
```

**Options:**

- `--min-topic-size INTEGER`: Minimum number of episodes per topic (default: 10)
- `--nr-topics INTEGER`: Target number of topics for reduction (optional)
- `--propose-spaces`: Generate space proposals using OpenAI (requires OPENAI_API_KEY)
- `--openai-api-key TEXT`: OpenAI API key for space proposals (or use OPENAI_API_KEY env var)
- `--json`: Output only final results in JSON format (suppresses all other output)
- `--neo4j-uri TEXT`: Neo4j connection URI (default: bolt://localhost:7687)
- `--neo4j-username TEXT`: Neo4j username (default: neo4j)
- `--neo4j-password TEXT`: Neo4j password (required)

### Examples

1. **Basic usage with environment variables:**

   ```bash
   python main.py user-123
   ```

2. **Custom minimum topic size:**

   ```bash
   python main.py user-123 --min-topic-size 10
   ```

3. **Explicit credentials:**

   ```bash
   python main.py user-123 \
     --neo4j-uri bolt://neo4j:7687 \
     --neo4j-username neo4j \
     --neo4j-password mypassword
   ```

4. **Using Docker compose Neo4j:**

   ```bash
   python main.py user-123 \
     --neo4j-uri bolt://localhost:7687 \
     --neo4j-password 27192e6432564f4788d55c15131bd5ac
   ```

5. **With space proposals:**

   ```bash
   python main.py user-123 --propose-spaces
   ```

6. **JSON output mode (for programmatic use):**

   ```bash
   python main.py user-123 --json
   ```

7. **JSON output with space proposals:**

   ```bash
   python main.py user-123 --propose-spaces --json
   ```

### Get Help

```bash
python main.py --help
```

## Output Formats

### Human-Readable Output (Default)

The CLI outputs:

```
================================================================================
BERT TOPIC MODELING FOR ECHO EPISODES
================================================================================
User ID: user-123
Min Topic Size: 20
================================================================================

✓ Connected to Neo4j at bolt://localhost:7687
✓ Fetched 150 episodes with embeddings

🔍 Running BERTopic analysis (min_topic_size=20)...
✓ Topic modeling complete

================================================================================
TOPIC MODELING RESULTS
================================================================================
Total Topics Found: 5
Total Episodes: 150
================================================================================

────────────────────────────────────────────────────────────────────────────────
Topic 0: 45 episodes
────────────────────────────────────────────────────────────────────────────────
Keywords: authentication, login, user, security, session, password, token, oauth, jwt, credentials

Sample Episodes (showing up to 3):
  1. [uuid-123]
     Discussing authentication flow for the new user login system...

  2. [uuid-456]
     Implementing OAuth2 with JWT tokens for secure sessions...

  3. [uuid-789]
     Password reset functionality with email verification...

────────────────────────────────────────────────────────────────────────────────
Topic 1: 32 episodes
────────────────────────────────────────────────────────────────────────────────
Keywords: database, neo4j, query, graph, cypher, nodes, relationships, index, performance, optimization

Sample Episodes (showing up to 3):
  ...

Topic -1 (Outliers): 8 episodes

================================================================================
✓ Analysis complete!
================================================================================

✓ Neo4j connection closed
```

### JSON Output Mode (--json flag)

When using the `--json` flag, the tool outputs only a clean JSON object with no debug logs:

```json
{
  "topics": {
    "0": {
      "keywords": ["authentication", "login", "user", "security", "session"],
      "episodeIds": ["uuid-123", "uuid-456", "uuid-789"]
    },
    "1": {
      "keywords": ["database", "neo4j", "query", "graph", "cypher"],
      "episodeIds": ["uuid-abc", "uuid-def"]
    }
  },
  "spaces": [
    {
      "name": "User Authentication",
      "intent": "Episodes about user authentication, login systems, and security belong in this space.",
      "confidence": 85,
      "topics": [0, 3],
      "estimatedEpisodes": 120
    }
  ]
}
```

**JSON Output Structure:**

- `topics`: Dictionary of topic IDs with keywords and episode UUIDs
- `spaces`: Array of space proposals (only if `--propose-spaces` is used)
  - `name`: Space name (2-5 words)
  - `intent`: Classification intent (1-2 sentences)
  - `confidence`: Confidence score (0-100)
  - `topics`: Source topic IDs that form this space
  - `estimatedEpisodes`: Estimated number of episodes in this space

**Use Cases for JSON Mode:**

- Programmatic consumption by other tools
- Piping output to jq or other JSON processors
- Integration with CI/CD pipelines
- Automated space creation workflows

## How It Works

1. **Connection**: Establishes connection to Neo4j database
2. **Data Fetching**: Queries all episodes for the given userId that have:
   - Non-null `contentEmbedding` field
   - Non-empty content
3. **Topic Modeling**: Runs BERTopic with:
   - Pre-computed embeddings (no re-embedding needed)
   - HDBSCAN clustering (automatic cluster discovery)
   - Keyword extraction via c-TF-IDF
4. 
**Results**: Displays topics with keywords and sample episodes + +## Neo4j Query + +The tool uses this Cypher query to fetch episodes: + +```cypher +MATCH (e:Episode {userId: $userId}) +WHERE e.contentEmbedding IS NOT NULL + AND size(e.contentEmbedding) > 0 + AND e.content IS NOT NULL + AND e.content <> '' +RETURN e.uuid as uuid, + e.content as content, + e.contentEmbedding as embedding, + e.createdAt as createdAt +ORDER BY e.createdAt DESC +``` + +## Tuning Parameters + +- **`--min-topic-size`**: + - Smaller values (5-10): More granular topics, may include noise + - Larger values (20-30): Broader topics, more coherent but fewer clusters + - Recommended: Start with 20 and adjust based on your data + +## Troubleshooting + +### No episodes found + +- Verify the userId exists in Neo4j +- Check that episodes have `contentEmbedding` populated +- Ensure episodes have non-empty `content` field + +### Connection errors + +- Verify Neo4j is running: `docker ps | grep neo4j` +- Check URI format: should be `bolt://host:port` +- Verify credentials are correct + +### Too few/many topics + +- Adjust `--min-topic-size` parameter +- Need more topics: decrease the value (e.g., `--min-topic-size 10`) +- Need fewer topics: increase the value (e.g., `--min-topic-size 30`) + +## Dependencies + +- `bertopic>=0.16.0` - Topic modeling +- `neo4j>=5.14.0` - Neo4j Python driver +- `click>=8.1.0` - CLI framework +- `numpy>=1.24.0` - Numerical operations +- `python-dotenv>=1.0.0` - Environment variable loading + +## License + +Part of the Echo project. diff --git a/apps/webapp/app/bert/main.py b/apps/webapp/app/bert/main.py new file mode 100644 index 0000000..e4fc22a --- /dev/null +++ b/apps/webapp/app/bert/main.py @@ -0,0 +1,384 @@ +#!/usr/bin/env python3 +""" +BERT Topic Modeling CLI for Echo Episodes + +This CLI tool connects to Neo4j, retrieves episodes with their embeddings for a given userId, +and performs topic modeling using BERTopic to discover thematic clusters. +""" + +import os +import sys +import json +from typing import List, Tuple, Dict, Any +import click +import numpy as np +from neo4j import GraphDatabase +from bertopic import BERTopic +from bertopic.vectorizers import ClassTfidfTransformer +from dotenv import load_dotenv +from sklearn.feature_extraction.text import CountVectorizer +from umap import UMAP +from hdbscan import HDBSCAN + + +class Neo4jConnection: + """Manages Neo4j database connection.""" + + def __init__(self, uri: str, username: str, password: str, quiet: bool = False): + """Initialize Neo4j connection. + + Args: + uri: Neo4j connection URI (e.g., bolt://localhost:7687) + username: Neo4j username + password: Neo4j password + quiet: If True, suppress output messages + """ + self.quiet = quiet + try: + self.driver = GraphDatabase.driver(uri, auth=(username, password)) + # Verify connection + self.driver.verify_connectivity() + if not quiet: + click.echo(f"āœ“ Connected to Neo4j at {uri}") + except Exception as e: + if not quiet: + click.echo(f"āœ— Failed to connect to Neo4j: {e}", err=True) + sys.exit(1) + + def close(self): + """Close the Neo4j connection.""" + if self.driver: + self.driver.close() + if not self.quiet: + click.echo("āœ“ Neo4j connection closed") + + def get_episodes_with_embeddings(self, user_id: str) -> Tuple[List[str], List[str], np.ndarray]: + """Fetch all episodes with their embeddings for a given user. 
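+        Only episodes with a non-null, non-empty contentEmbedding and non-empty
+        content are returned (filtered in the Cypher query below), so every row
+        can be fed straight to BERTopic without further cleaning.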
+ + Args: + user_id: The user ID to fetch episodes for + + Returns: + Tuple of (episode_uuids, episode_contents, embeddings_array) + """ + query = """ + MATCH (e:Episode {userId: $userId}) + WHERE e.contentEmbedding IS NOT NULL + AND size(e.contentEmbedding) > 0 + AND e.content IS NOT NULL + AND e.content <> '' + RETURN e.uuid as uuid, + e.content as content, + e.contentEmbedding as embedding, + e.createdAt as createdAt + ORDER BY e.createdAt DESC + """ + + with self.driver.session() as session: + result = session.run(query, userId=user_id) + records = list(result) + + if not records: + if not self.quiet: + click.echo(f"āœ— No episodes found for userId: {user_id}", err=True) + sys.exit(1) + + uuids = [] + contents = [] + embeddings = [] + + for record in records: + uuids.append(record["uuid"]) + contents.append(record["content"]) + embeddings.append(record["embedding"]) + + embeddings_array = np.array(embeddings, dtype=np.float32) + + if not self.quiet: + click.echo(f"āœ“ Fetched {len(contents)} episodes with embeddings") + return uuids, contents, embeddings_array + + +def run_bertopic_analysis( + contents: List[str], + embeddings: np.ndarray, + min_topic_size: int = 20, + nr_topics: int = None, + quiet: bool = False +) -> Tuple[BERTopic, List[int], List[float]]: + """Run BERTopic clustering on episode contents with improved configuration. + + Args: + contents: List of episode content strings + embeddings: Pre-computed embeddings for the episodes + min_topic_size: Minimum number of documents per topic + nr_topics: Target number of topics (optional, for topic reduction) + quiet: If True, suppress output messages + + Returns: + Tuple of (bertopic_model, topic_assignments, probabilities) + """ + if not quiet: + click.echo(f"\nšŸ” Running BERTopic analysis (min_topic_size={min_topic_size})...") + + # Step 1: Configure UMAP for dimensionality reduction + # More aggressive reduction helps find distinct clusters + umap_model = UMAP( + n_neighbors=15, # Balance between local/global structure + n_components=5, # Reduce to 5 dimensions + min_dist=0.0, # Allow tight clusters + metric='cosine', # Use cosine similarity + random_state=42 + ) + + # Step 2: Configure HDBSCAN for clustering + # Tuned to find more granular topics + hdbscan_model = HDBSCAN( + min_cluster_size=min_topic_size, # Minimum episodes per topic + min_samples=5, # More sensitive to local density + metric='euclidean', + cluster_selection_method='eom', # Excess of mass method + prediction_data=True + ) + + # Step 3: Configure vectorizer with stopword removal + # Remove common English stopwords that pollute topic keywords + vectorizer_model = CountVectorizer( + stop_words='english', # Remove common English words + min_df=2, # Word must appear in at least 2 docs + max_df=0.95, # Ignore words in >95% of docs + ngram_range=(1, 2) # Include unigrams and bigrams + ) + + # Step 4: Configure c-TF-IDF with better parameters + ctfidf_model = ClassTfidfTransformer( + reduce_frequent_words=True, # Further reduce common words + bm25_weighting=True # Use BM25 for better keyword extraction + ) + + # Step 5: Initialize BERTopic with all custom components + model = BERTopic( + embedding_model=None, # Use pre-computed embeddings + umap_model=umap_model, + hdbscan_model=hdbscan_model, + vectorizer_model=vectorizer_model, + ctfidf_model=ctfidf_model, + top_n_words=15, # More keywords per topic + nr_topics=nr_topics, # Optional topic reduction + calculate_probabilities=True, + verbose=(not quiet) + ) + + # Fit the model with pre-computed embeddings + 
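+    # Passing precomputed embeddings here makes BERTopic skip its own embedding
+    # step; the array must be row-aligned with `contents` and shaped
+    # (n_docs, dim), e.g. (150, 1536) if the upstream model emits 1536-dim vectors.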
topics, probs = model.fit_transform(contents, embeddings=embeddings) + + # Get topic count + unique_topics = len(set(topics)) - (1 if -1 in topics else 0) + if not quiet: + click.echo(f"āœ“ Topic modeling complete - Found {unique_topics} topics") + + return model, topics, probs + + +def print_topic_results( + model: BERTopic, + topics: List[int], + uuids: List[str], + contents: List[str] +): + """Print formatted topic results. + + Args: + model: Fitted BERTopic model + topics: Topic assignments for each episode + uuids: Episode UUIDs + contents: Episode contents + """ + # Get topic info + topic_info = model.get_topic_info() + num_topics = len(topic_info) - 1 # Exclude outlier topic (-1) + + click.echo(f"\n{'='*80}") + click.echo(f"TOPIC MODELING RESULTS") + click.echo(f"{'='*80}") + click.echo(f"Total Topics Found: {num_topics}") + click.echo(f"Total Episodes: {len(contents)}") + click.echo(f"{'='*80}\n") + + # Print each topic + for idx, row in topic_info.iterrows(): + topic_id = row['Topic'] + count = row['Count'] + + # Skip outlier topic + if topic_id == -1: + click.echo(f"Topic -1 (Outliers): {count} episodes\n") + continue + + # Get top words for this topic + topic_words = model.get_topic(topic_id) + + click.echo(f"{'─'*80}") + click.echo(f"Topic {topic_id}: {count} episodes") + click.echo(f"{'─'*80}") + + # Print top keywords + if topic_words: + keywords = [word for word, score in topic_words[:10]] + click.echo(f"Keywords: {', '.join(keywords)}") + + # Print sample episodes + topic_episodes = [(uuid, content) for uuid, content, topic + in zip(uuids, contents, topics) if topic == topic_id] + + click.echo(f"\nSample Episodes (showing up to 3):") + for i, (uuid, content) in enumerate(topic_episodes[:3]): + # Truncate content for display + truncated = content[:200] + "..." if len(content) > 200 else content + click.echo(f" {i+1}. [{uuid}]") + click.echo(f" {truncated}\n") + + click.echo() + + +def build_json_output( + model: BERTopic, + topics: List[int], + uuids: List[str] +) -> Dict[str, Any]: + """Build JSON output structure. 
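+    Topic ids arrive from BERTopic as ints; json.dumps later coerces the dict
+    keys to strings, which is why the README's JSON example shows "0"/"1" keys.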
+ + Args: + model: Fitted BERTopic model + topics: Topic assignments for each episode + uuids: Episode UUIDs + + Returns: + Dictionary with topics data + """ + # Build topics dictionary + topics_dict = {} + topic_info = model.get_topic_info() + + for idx, row in topic_info.iterrows(): + topic_id = row['Topic'] + + # Skip outlier topic + if topic_id == -1: + continue + + # Get keywords + topic_words = model.get_topic(topic_id) + keywords = [word for word, score in topic_words[:10]] if topic_words else [] + + # Get episode IDs for this topic + episode_ids = [uuid for uuid, topic in zip(uuids, topics) if topic == topic_id] + + topics_dict[topic_id] = { + "keywords": keywords, + "episodeIds": episode_ids + } + + return {"topics": topics_dict} + + +@click.command() +@click.argument('user_id', type=str) +@click.option( + '--min-topic-size', + default=10, + type=int, + help='Minimum number of episodes per topic (default: 10, lower = more granular topics)' +) +@click.option( + '--nr-topics', + default=None, + type=int, + help='Target number of topics for reduction (optional, e.g., 20 for ~20 topics)' +) +@click.option( + '--neo4j-uri', + envvar='NEO4J_URI', + default='bolt://localhost:7687', + help='Neo4j connection URI (default: bolt://localhost:7687)' +) +@click.option( + '--neo4j-username', + envvar='NEO4J_USERNAME', + default='neo4j', + help='Neo4j username (default: neo4j)' +) +@click.option( + '--neo4j-password', + envvar='NEO4J_PASSWORD', + required=True, + help='Neo4j password (required, can use NEO4J_PASSWORD env var)' +) +@click.option( + '--json', + 'json_output', + is_flag=True, + default=False, + help='Output only final results in JSON format (suppresses all other output)' +) +def main(user_id: str, min_topic_size: int, nr_topics: int, neo4j_uri: str, neo4j_username: str, neo4j_password: str, json_output: bool): + """ + Run BERTopic analysis on episodes for a given USER_ID. + + This tool connects to Neo4j, retrieves all episodes with embeddings for the specified user, + and performs topic modeling to discover thematic clusters. 
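+    Connection settings are resolved from CLI flags first, then from the
+    NEO4J_* environment variables (a .env file is loaded via python-dotenv
+    at startup).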
+ + Examples: + + # Using environment variables from .env file + python main.py user-123 + + # With custom min topic size + python main.py user-123 --min-topic-size 10 + + # With explicit Neo4j credentials + python main.py user-123 --neo4j-uri bolt://localhost:7687 --neo4j-password mypassword + """ + # Print header only if not in JSON mode + if not json_output: + click.echo(f"\n{'='*80}") + click.echo("BERT TOPIC MODELING FOR ECHO EPISODES") + click.echo(f"{'='*80}") + click.echo(f"User ID: {user_id}") + click.echo(f"Min Topic Size: {min_topic_size}") + if nr_topics: + click.echo(f"Target Topics: ~{nr_topics}") + click.echo(f"{'='*80}\n") + + # Connect to Neo4j (quiet mode if JSON output) + neo4j_conn = Neo4jConnection(neo4j_uri, neo4j_username, neo4j_password, quiet=json_output) + + try: + # Fetch episodes with embeddings + uuids, contents, embeddings = neo4j_conn.get_episodes_with_embeddings(user_id) + + # Run BERTopic analysis + model, topics, probs = run_bertopic_analysis(contents, embeddings, min_topic_size, nr_topics, quiet=json_output) + + # Output results + if json_output: + # JSON output mode - only print JSON + output = build_json_output(model, topics, uuids) + click.echo(json.dumps(output, indent=2)) + else: + # Normal output mode - print formatted results + print_topic_results(model, topics, uuids, contents) + + click.echo(f"{'='*80}") + click.echo("āœ“ Analysis complete!") + click.echo(f"{'='*80}\n") + + finally: + # Always close connection + neo4j_conn.close() + + +if __name__ == '__main__': + # Load environment variables from .env file if present + load_dotenv() + main() diff --git a/apps/webapp/app/bert/requirements.txt b/apps/webapp/app/bert/requirements.txt new file mode 100644 index 0000000..3b9f751 --- /dev/null +++ b/apps/webapp/app/bert/requirements.txt @@ -0,0 +1,8 @@ +bertopic>=0.16.0 +neo4j>=5.14.0 +click>=8.1.0 +numpy>=1.24.0 +python-dotenv>=1.0.0 +scikit-learn>=1.3.0 +umap-learn>=0.5.4 +hdbscan>=0.8.33 diff --git a/apps/webapp/app/bullmq/queues/index.ts b/apps/webapp/app/bullmq/queues/index.ts index ebefc4b..7356c96 100644 --- a/apps/webapp/app/bullmq/queues/index.ts +++ b/apps/webapp/app/bullmq/queues/index.ts @@ -92,3 +92,26 @@ export const sessionCompactionQueue = new Queue("session-compaction-queue", { }, }, }); + +/** + * BERT topic analysis queue + * Handles CPU-intensive topic modeling on user episodes + */ +export const bertTopicQueue = new Queue("bert-topic-queue", { + connection: getRedisConnection(), + defaultJobOptions: { + attempts: 2, // Only 2 attempts due to long runtime + backoff: { + type: "exponential", + delay: 5000, + }, + timeout: 300000, // 5 minute timeout + removeOnComplete: { + age: 7200, // Keep completed jobs for 2 hours + count: 100, + }, + removeOnFail: { + age: 172800, // Keep failed jobs for 48 hours (for debugging) + }, + }, +}); diff --git a/apps/webapp/app/bullmq/workers/index.ts b/apps/webapp/app/bullmq/workers/index.ts index e2d930d..ca8eb96 100644 --- a/apps/webapp/app/bullmq/workers/index.ts +++ b/apps/webapp/app/bullmq/workers/index.ts @@ -23,10 +23,15 @@ import { processSessionCompaction, type SessionCompactionPayload, } from "~/jobs/session/session-compaction.logic"; +import { + processTopicAnalysis, + type TopicAnalysisPayload, +} from "~/jobs/bert/topic-analysis.logic"; import { enqueueIngestEpisode, enqueueSpaceAssignment, enqueueSessionCompaction, + enqueueBertTopicAnalysis, } from "~/lib/queue-adapter.server"; import { logger } from "~/services/logger.service"; @@ -47,6 +52,7 @@ export const ingestWorker = new 
Worker(
       // Callbacks to enqueue follow-up jobs
       enqueueSpaceAssignment,
       enqueueSessionCompaction,
+      enqueueBertTopicAnalysis,
     );
   },
   {
@@ -108,6 +114,22 @@ export const sessionCompactionWorker = new Worker(
   },
 );
 
+/**
+ * BERT topic analysis worker
+ * Handles CPU-intensive topic modeling
+ */
+export const bertTopicWorker = new Worker(
+  "bert-topic-queue",
+  async (job) => {
+    const payload = job.data as TopicAnalysisPayload;
+    return await processTopicAnalysis(payload);
+  },
+  {
+    connection: getRedisConnection(),
+    concurrency: 2, // Process up to 2 analyses in parallel (CPU-intensive)
+  },
+);
+
 /**
  * Graceful shutdown handler
  */
@@ -116,8 +138,9 @@ export async function closeAllWorkers(): Promise<void> {
     ingestWorker.close(),
     documentIngestWorker.close(),
     conversationTitleWorker.close(),
     sessionCompactionWorker.close(),
+    bertTopicWorker.close(),
   ]);
   logger.log("All BullMQ workers closed");
 }
diff --git a/apps/webapp/app/jobs/bert/topic-analysis.logic.ts b/apps/webapp/app/jobs/bert/topic-analysis.logic.ts
new file mode 100644
index 0000000..43900b5
--- /dev/null
+++ b/apps/webapp/app/jobs/bert/topic-analysis.logic.ts
@@ -0,0 +1,116 @@
+import { exec } from "child_process";
+import { promisify } from "util";
+
+const execAsync = promisify(exec);
+
+export interface TopicAnalysisPayload {
+  userId: string;
+  minTopicSize?: number;
+  nrTopics?: number;
+}
+
+export interface TopicAnalysisResult {
+  topics: {
+    [topicId: string]: {
+      keywords: string[];
+      episodeIds: string[];
+    };
+  };
+}
+
+/**
+ * Process BERT topic analysis on a user's episodes
+ * This is the common logic shared between Trigger.dev and BullMQ
+ *
+ * NOTE: This function does NOT update workspace.metadata.lastTopicAnalysisAt
+ * That should be done by the caller BEFORE enqueueing this job to prevent
+ * duplicate analyses from race conditions.
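+ *
+ * The analysis shells out to the Python CLI and parses its stdout as JSON,
+ * so the script's --json mode (which suppresses all other output) is required.
+ *
+ * @example
+ * // Hypothetical direct call; in practice this runs inside a queue worker:
+ * // const result = await processTopicAnalysis({ userId: "user-123", minTopicSize: 10 });
+ * // console.log(`${Object.keys(result.topics).length} topics found`);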
+ */
+export async function processTopicAnalysis(
+  payload: TopicAnalysisPayload
+): Promise<TopicAnalysisResult> {
+  const { userId, minTopicSize = 10, nrTopics } = payload;
+
+  console.log(`[BERT Topic Analysis] Starting analysis for user: ${userId}`);
+  console.log(
+    `[BERT Topic Analysis] Parameters: minTopicSize=${minTopicSize}, nrTopics=${nrTopics || "auto"}`
+  );
+
+  // Build the command
+  let command = `python3 /core/apps/webapp/app/bert/main.py ${userId} --json`;
+
+  if (minTopicSize) {
+    command += ` --min-topic-size ${minTopicSize}`;
+  }
+
+  if (nrTopics) {
+    command += ` --nr-topics ${nrTopics}`;
+  }
+
+  console.log(`[BERT Topic Analysis] Executing: ${command}`);
+
+  try {
+    const startTime = Date.now();
+
+    // Execute the Python script with a 5-minute timeout
+    const { stdout, stderr } = await execAsync(command, {
+      timeout: 300000, // 5 minutes
+      maxBuffer: 10 * 1024 * 1024, // 10MB buffer for large outputs
+    });
+
+    const duration = Date.now() - startTime;
+    console.log(`[BERT Topic Analysis] Completed in ${duration}ms`);
+
+    if (stderr) {
+      console.warn(`[BERT Topic Analysis] Warnings:`, stderr);
+    }
+
+    // Parse the JSON output
+    const result: TopicAnalysisResult = JSON.parse(stdout);
+
+    // Log summary
+    const topicCount = Object.keys(result.topics).length;
+    const totalEpisodes = Object.values(result.topics).reduce(
+      (sum, topic) => sum + topic.episodeIds.length,
+      0
+    );
+
+    console.log(
+      `[BERT Topic Analysis] Found ${topicCount} topics covering ${totalEpisodes} episodes`
+    );
+
+    return result;
+  } catch (error) {
+    console.error(`[BERT Topic Analysis] Error:`, error);
+
+    if (error instanceof Error) {
+      // Check for timeout (execAsync sets killed=true when it kills the child)
+      if ((error as Error & { killed?: boolean }).killed || error.message.includes("ETIMEDOUT")) {
+        throw new Error(
+          `Topic analysis timed out after 5 minutes. User may have too many episodes.`
+        );
+      }
+
+      // Check for Python errors
+      if (error.message.includes("python3: not found")) {
+        throw new Error(
+          `Python 3 is not installed or not available in PATH.`
+        );
+      }
+
+      // Check for Neo4j connection errors
+      if (error.message.includes("Failed to connect to Neo4j")) {
+        throw new Error(
+          `Could not connect to Neo4j. 
Check NEO4J_URI and credentials.` + ); + } + + // Check for no episodes + if (error.message.includes("No episodes found")) { + throw new Error(`No episodes found for userId: ${userId}`); + } + } + + throw error; + } +} diff --git a/apps/webapp/app/jobs/ingest/ingest-episode.logic.ts b/apps/webapp/app/jobs/ingest/ingest-episode.logic.ts index e1b515a..57af32b 100644 --- a/apps/webapp/app/jobs/ingest/ingest-episode.logic.ts +++ b/apps/webapp/app/jobs/ingest/ingest-episode.logic.ts @@ -7,6 +7,10 @@ import { prisma } from "~/trigger/utils/prisma"; import { EpisodeType } from "@core/types"; import { deductCredits, hasCredits } from "~/trigger/utils/utils"; import { assignEpisodesToSpace } from "~/services/graphModels/space"; +import { + shouldTriggerTopicAnalysis, + updateLastTopicAnalysisTime, +} from "~/services/bertTopicAnalysis.server"; export const IngestBodyRequest = z.object({ episodeBody: z.string(), @@ -55,6 +59,11 @@ export async function processEpisodeIngestion( sessionId: string; source: string; }) => Promise, + enqueueBertTopicAnalysis?: (params: { + userId: string; + minTopicSize?: number; + nrTopics?: number; + }) => Promise, ): Promise { try { logger.log(`Processing job for user ${payload.userId}`); @@ -250,6 +259,43 @@ export async function processEpisodeIngestion( }); } + // Auto-trigger BERT topic analysis if threshold met (20+ new episodes) + try { + if ( + currentStatus === IngestionStatus.COMPLETED && + enqueueBertTopicAnalysis + ) { + const shouldTrigger = await shouldTriggerTopicAnalysis( + payload.userId, + payload.workspaceId, + ); + + if (shouldTrigger) { + logger.info( + `Triggering BERT topic analysis after reaching 20+ new episodes`, + { + userId: payload.userId, + workspaceId: payload.workspaceId, + }, + ); + + await enqueueBertTopicAnalysis({ + userId: payload.userId, + minTopicSize: 10, + }); + + // Update the last analysis timestamp + await updateLastTopicAnalysisTime(payload.workspaceId); + } + } + } catch (topicAnalysisError) { + // Don't fail the ingestion if topic analysis fails + logger.warn(`Failed to trigger topic analysis after ingestion:`, { + error: topicAnalysisError, + userId: payload.userId, + }); + } + return { success: true, episodeDetails }; } catch (err: any) { await prisma.ingestionQueue.update({ diff --git a/apps/webapp/app/lib/queue-adapter.server.ts b/apps/webapp/app/lib/queue-adapter.server.ts index af9281d..38bac92 100644 --- a/apps/webapp/app/lib/queue-adapter.server.ts +++ b/apps/webapp/app/lib/queue-adapter.server.ts @@ -163,6 +163,39 @@ export async function enqueueSpaceAssignment( } } +/** + * Enqueue BERT topic analysis job + */ +export async function enqueueBertTopicAnalysis(payload: { + userId: string; + minTopicSize?: number; + nrTopics?: number; +}): Promise<{ id?: string }> { + const provider = env.QUEUE_PROVIDER as QueueProvider; + + if (provider === "trigger") { + const { bertTopicAnalysisTask } = await import( + "~/trigger/bert/topic-analysis" + ); + const handler = await bertTopicAnalysisTask.trigger(payload, { + queue: "bert-topic-analysis", + concurrencyKey: payload.userId, + tags: [payload.userId, "bert-analysis"], + }); + return { id: handler.id }; + } else { + // BullMQ + const { bertTopicQueue } = await import("~/bullmq/queues"); + const job = await bertTopicQueue.add("topic-analysis", payload, { + jobId: `bert-${payload.userId}-${Date.now()}`, + attempts: 2, // Only 2 attempts for expensive operations + backoff: { type: "exponential", delay: 5000 }, + timeout: 300000, // 5 minute timeout + }); + return { id: job.id 
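+      // Note: Date.now() in the jobId above makes every enqueue unique, so
+      // BullMQ's duplicate-jobId protection never dedupes these jobs; gating
+      // is handled upstream by shouldTriggerTopicAnalysis and the
+      // lastTopicAnalysisAt timestamp.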
}; + } +} + export const isTriggerDeployment = () => { return env.QUEUE_PROVIDER === "trigger"; }; diff --git a/apps/webapp/app/services/bertTopicAnalysis.server.ts b/apps/webapp/app/services/bertTopicAnalysis.server.ts new file mode 100644 index 0000000..27b0dcf --- /dev/null +++ b/apps/webapp/app/services/bertTopicAnalysis.server.ts @@ -0,0 +1,107 @@ +import { prisma } from "~/trigger/utils/prisma"; +import { logger } from "~/services/logger.service"; +import { runQuery } from "~/lib/neo4j.server"; + +interface WorkspaceMetadata { + lastTopicAnalysisAt?: string; + [key: string]: any; +} + +/** + * Check if we should trigger a BERT topic analysis for this workspace + * Criteria: 20+ new episodes since last analysis (or no previous analysis) + */ +export async function shouldTriggerTopicAnalysis( + userId: string, + workspaceId: string, +): Promise { + try { + // Get workspace metadata + const workspace = await prisma.workspace.findUnique({ + where: { id: workspaceId }, + select: { metadata: true }, + }); + + if (!workspace) { + logger.warn(`Workspace not found: ${workspaceId}`); + return false; + } + + const metadata = (workspace.metadata || {}) as WorkspaceMetadata; + const lastAnalysisAt = metadata.lastTopicAnalysisAt; + + // Count episodes since last analysis + const query = lastAnalysisAt + ? ` + MATCH (e:Episode {userId: $userId}) + WHERE e.createdAt > datetime($lastAnalysisAt) + RETURN count(e) as newEpisodeCount + ` + : ` + MATCH (e:Episode {userId: $userId}) + RETURN count(e) as totalEpisodeCount + `; + + const result = await runQuery(query, { + userId, + lastAnalysisAt, + }); + + const episodeCount = lastAnalysisAt + ? result[0]?.get("newEpisodeCount")?.toNumber() || 0 + : result[0]?.get("totalEpisodeCount")?.toNumber() || 0; + + logger.info( + `[Topic Analysis Check] User: ${userId}, New episodes: ${episodeCount}, Last analysis: ${lastAnalysisAt || "never"}`, + ); + + // Trigger if 20+ new episodes + return episodeCount >= 20; + } catch (error) { + logger.error( + `[Topic Analysis Check] Error checking episode count:`, + error, + ); + return false; + } +} + +/** + * Update workspace metadata with last topic analysis timestamp + */ +export async function updateLastTopicAnalysisTime( + workspaceId: string, +): Promise { + try { + const workspace = await prisma.workspace.findUnique({ + where: { id: workspaceId }, + select: { metadata: true }, + }); + + if (!workspace) { + logger.warn(`Workspace not found: ${workspaceId}`); + return; + } + + const metadata = (workspace.metadata || {}) as WorkspaceMetadata; + + await prisma.workspace.update({ + where: { id: workspaceId }, + data: { + metadata: { + ...metadata, + lastTopicAnalysisAt: new Date().toISOString(), + }, + }, + }); + + logger.info( + `[Topic Analysis] Updated last analysis timestamp for workspace: ${workspaceId}`, + ); + } catch (error) { + logger.error( + `[Topic Analysis] Error updating last analysis timestamp:`, + error, + ); + } +} diff --git a/apps/webapp/app/trigger/bert/topic-analysis.ts b/apps/webapp/app/trigger/bert/topic-analysis.ts new file mode 100644 index 0000000..7473e32 --- /dev/null +++ b/apps/webapp/app/trigger/bert/topic-analysis.ts @@ -0,0 +1,21 @@ +import { task } from "@trigger.dev/sdk/v3"; +import { + processTopicAnalysis, + type TopicAnalysisPayload, +} from "~/jobs/bert/topic-analysis.logic"; + +/** + * Trigger.dev task for BERT topic analysis + * + * This is a thin wrapper around the common logic in jobs/bert/topic-analysis.logic.ts + */ +export const bertTopicAnalysisTask = task({ + id: 
"bert-topic-analysis", + queue: { + name: "bert-topic-analysis", + concurrencyLimit: 3, // Max 3 parallel analyses to avoid CPU overload + }, + run: async (payload: TopicAnalysisPayload) => { + return await processTopicAnalysis(payload); + }, +}); diff --git a/apps/webapp/app/trigger/ingest/ingest.ts b/apps/webapp/app/trigger/ingest/ingest.ts index 4a3cd02..a65f672 100644 --- a/apps/webapp/app/trigger/ingest/ingest.ts +++ b/apps/webapp/app/trigger/ingest/ingest.ts @@ -6,6 +6,7 @@ import { } from "~/jobs/ingest/ingest-episode.logic"; import { triggerSpaceAssignment } from "../spaces/space-assignment"; import { triggerSessionCompaction } from "../session/session-compaction"; +import { bertTopicAnalysisTask } from "../bert/topic-analysis"; const ingestionQueue = queue({ name: "ingestion-queue", @@ -32,6 +33,14 @@ export const ingestTask = task({ async (params) => { await triggerSessionCompaction(params); }, + // Callback for BERT topic analysis + async (params) => { + await bertTopicAnalysisTask.trigger(params, { + queue: "bert-topic-analysis", + concurrencyKey: params.userId, + tags: [params.userId, "bert-analysis"], + }); + }, ); }, }); diff --git a/apps/webapp/app/trigger/spaces/space-pattern.ts b/apps/webapp/app/trigger/spaces/space-pattern.ts deleted file mode 100644 index 89a6263..0000000 --- a/apps/webapp/app/trigger/spaces/space-pattern.ts +++ /dev/null @@ -1,557 +0,0 @@ -import { task } from "@trigger.dev/sdk/v3"; -import { logger } from "~/services/logger.service"; -import { makeModelCall } from "~/lib/model.server"; -import { runQuery } from "~/lib/neo4j.server"; -import type { CoreMessage } from "ai"; -import { z } from "zod"; -import { - EXPLICIT_PATTERN_TYPES, - IMPLICIT_PATTERN_TYPES, - type SpacePattern, - type PatternDetectionResult, -} from "@core/types"; -import { createSpacePattern, getSpace } from "../utils/space-utils"; - -interface SpacePatternPayload { - userId: string; - workspaceId: string; - spaceId: string; - triggerSource?: - | "summary_complete" - | "manual" - | "assignment" - | "scheduled" - | "new_space" - | "growth_threshold" - | "ingestion_complete"; -} - -interface SpaceStatementData { - uuid: string; - fact: string; - subject: string; - predicate: string; - object: string; - createdAt: Date; - validAt: Date; - content?: string; // For implicit pattern analysis -} - -interface SpaceThemeData { - themes: string[]; - summary: string; -} - -// Zod schemas for LLM response validation -const ExplicitPatternSchema = z.object({ - name: z.string(), - type: z.string(), - summary: z.string(), - evidence: z.array(z.string()), - confidence: z.number().min(0).max(1), -}); - -const ImplicitPatternSchema = z.object({ - name: z.string(), - type: z.string(), - summary: z.string(), - evidence: z.array(z.string()), - confidence: z.number().min(0).max(1), -}); - -const PatternAnalysisSchema = z.object({ - explicitPatterns: z.array(ExplicitPatternSchema), - implicitPatterns: z.array(ImplicitPatternSchema), -}); - -const CONFIG = { - minStatementsForPatterns: 5, - maxPatternsPerSpace: 20, - minPatternConfidence: 0.85, -}; - -export const spacePatternTask = task({ - id: "space-pattern", - run: async (payload: SpacePatternPayload) => { - const { userId, workspaceId, spaceId, triggerSource = "manual" } = payload; - - logger.info(`Starting space pattern detection`, { - userId, - workspaceId, - spaceId, - triggerSource, - }); - - try { - // Get space data and check if it has enough content - const space = await getSpaceForPatternAnalysis(spaceId); - if (!space) { - return { - success: 
-          spaceId,
-          error: "Space not found or insufficient data",
-        };
-      }
-
-      // Get statements for pattern analysis
-      const statements = await getSpaceStatementsForPatterns(spaceId, userId);
-
-      if (statements.length < CONFIG.minStatementsForPatterns) {
-        logger.info(
-          `Space ${spaceId} has insufficient statements (${statements.length}) for pattern detection`,
-        );
-        return {
-          success: true,
-          spaceId,
-          triggerSource,
-          patterns: {
-            explicitPatterns: [],
-            implicitPatterns: [],
-            totalPatternsFound: 0,
-          },
-        };
-      }
-
-      // Detect patterns
-      const patternResult = await detectSpacePatterns(space, statements);
-
-      if (patternResult) {
-        // Store patterns
-        await storePatterns(
-          patternResult.explicitPatterns,
-          patternResult.implicitPatterns,
-          spaceId,
-        );
-
-        logger.info(`Generated patterns for space ${spaceId}`, {
-          explicitPatterns: patternResult.explicitPatterns.length,
-          implicitPatterns: patternResult.implicitPatterns.length,
-          totalPatterns: patternResult.totalPatternsFound,
-          triggerSource,
-        });
-
-        return {
-          success: true,
-          spaceId,
-          triggerSource,
-          patterns: {
-            explicitPatterns: patternResult.explicitPatterns.length,
-            implicitPatterns: patternResult.implicitPatterns.length,
-            totalPatternsFound: patternResult.totalPatternsFound,
-          },
-        };
-      } else {
-        logger.warn(`Failed to detect patterns for space ${spaceId}`);
-        return {
-          success: false,
-          spaceId,
-          triggerSource,
-          error: "Failed to detect patterns",
-        };
-      }
-    } catch (error) {
-      logger.error(
-        `Error in space pattern detection for space ${spaceId}:`,
-        error as Record<string, any>,
-      );
-      throw error;
-    }
-  },
-});
-
-async function getSpaceForPatternAnalysis(
-  spaceId: string,
-): Promise<SpaceThemeData | null> {
-  try {
-    const space = await getSpace(spaceId);
-
-    if (!space || !space.themes || space.themes.length === 0) {
-      logger.warn(
-        `Space ${spaceId} not found or has no themes for pattern analysis`,
-      );
-      return null;
-    }
-
-    return {
-      themes: space.themes,
-      summary: space.summary || "",
-    };
-  } catch (error) {
-    logger.error(
-      `Error getting space for pattern analysis:`,
-      error as Record<string, any>,
-    );
-    return null;
-  }
-}
-
-async function getSpaceStatementsForPatterns(
-  spaceId: string,
-  userId: string,
-): Promise<SpaceStatementData[]> {
-  const query = `
-    MATCH (s:Statement)
-    WHERE s.userId = $userId
-      AND s.spaceIds IS NOT NULL
-      AND $spaceId IN s.spaceIds
-      AND s.invalidAt IS NULL
-    MATCH (s)-[:HAS_SUBJECT]->(subj:Entity)
-    MATCH (s)-[:HAS_PREDICATE]->(pred:Entity)
-    MATCH (s)-[:HAS_OBJECT]->(obj:Entity)
-    RETURN s, subj.name as subject, pred.name as predicate, obj.name as object
-    ORDER BY s.createdAt DESC
-  `;
-
-  const result = await runQuery(query, {
-    spaceId,
-    userId,
-  });
-
-  return result.map((record) => {
-    const statement = record.get("s").properties;
-    return {
-      uuid: statement.uuid,
-      fact: statement.fact,
-      subject: record.get("subject"),
-      predicate: record.get("predicate"),
-      object: record.get("object"),
-      createdAt: new Date(statement.createdAt),
-      validAt: new Date(statement.validAt),
-      content: statement.fact, // Use fact as content for implicit analysis
-    };
-  });
-}
-
-async function detectSpacePatterns(
-  space: SpaceThemeData,
-  statements: SpaceStatementData[],
-): Promise<PatternDetectionResult | null> {
-  try {
-    // Extract explicit patterns from themes
-    const explicitPatterns = await extractExplicitPatterns(
-      space.themes,
-      space.summary,
-      statements,
-    );
-
-    // Extract implicit patterns from statement analysis
-    const implicitPatterns = await extractImplicitPatterns(statements);
-
-    return {
-      explicitPatterns,
-      implicitPatterns,
-      totalPatternsFound: explicitPatterns.length + implicitPatterns.length,
-      processingStats: {
-        statementsAnalyzed: statements.length,
-        themesProcessed: space.themes.length,
-        implicitPatternsExtracted: implicitPatterns.length,
-      },
-    };
-  } catch (error) {
-    logger.error(
-      "Error detecting space patterns:",
-      error as Record<string, any>,
-    );
-    return null;
-  }
-}
-
-async function extractExplicitPatterns(
-  themes: string[],
-  summary: string,
-  statements: SpaceStatementData[],
-): Promise<Omit<SpacePattern, "id" | "createdAt" | "updatedAt" | "spaceId">[]> {
-  if (themes.length === 0) return [];
-
-  const prompt = createExplicitPatternPrompt(themes, summary, statements);
-
-  // Pattern extraction requires HIGH complexity (insight synthesis, pattern recognition)
-  let responseText = "";
-  await makeModelCall(false, prompt, (text: string) => {
-    responseText = text;
-  }, undefined, 'high');
-
-  const patterns = parseExplicitPatternResponse(responseText);
-
-  return patterns.map((pattern) => ({
-    name: pattern.name || `${pattern.type} pattern`,
-    source: "explicit" as const,
-    type: pattern.type,
-    summary: pattern.summary,
-    evidence: pattern.evidence,
-    confidence: pattern.confidence,
-    userConfirmed: "pending" as const,
-  }));
-}
-
-async function extractImplicitPatterns(
-  statements: SpaceStatementData[],
-): Promise<Omit<SpacePattern, "id" | "createdAt" | "updatedAt" | "spaceId">[]> {
-  if (statements.length < CONFIG.minStatementsForPatterns) return [];
-
-  const prompt = createImplicitPatternPrompt(statements);
-
-  // Implicit pattern discovery requires HIGH complexity (pattern recognition from statements)
-  let responseText = "";
-  await makeModelCall(false, prompt, (text: string) => {
-    responseText = text;
-  }, undefined, 'high');
-
-  const patterns = parseImplicitPatternResponse(responseText);
-
-  return patterns.map((pattern) => ({
-    name: pattern.name || `${pattern.type} pattern`,
-    source: "implicit" as const,
-    type: pattern.type,
-    summary: pattern.summary,
-    evidence: pattern.evidence,
-    confidence: pattern.confidence,
-    userConfirmed: "pending" as const,
-  }));
-}
-
-function createExplicitPatternPrompt(
-  themes: string[],
-  summary: string,
-  statements: SpaceStatementData[],
-): CoreMessage[] {
-  const statementsText = statements
-    .map((stmt) => `[${stmt.uuid}] ${stmt.fact}`)
-    .join("\n");
-
-  const explicitTypes = Object.values(EXPLICIT_PATTERN_TYPES).join('", "');
-
-  return [
-    {
-      role: "system",
-      content: `You are an expert at extracting structured patterns from themes and supporting evidence.
-
-Your task is to convert high-level themes into explicit patterns with supporting statement evidence.
-
-INSTRUCTIONS:
-1. For each theme, create a pattern that explains what it reveals about the user
-2. Give each pattern a short, descriptive name (2-4 words)
-3. Find supporting statement IDs that provide evidence for each pattern
-4. Assess confidence based on evidence strength and theme clarity
-5. Use appropriate pattern types from these guidelines: "${explicitTypes}"
-   - "theme": High-level thematic content areas
-   - "topic": Specific subject matter or topics of interest
-   - "domain": Knowledge or work domains the user operates in
-   - "interest_area": Areas of personal interest or hobby
-6. You may suggest new pattern types if none of the guidelines fit well
-
-RESPONSE FORMAT:
-Provide your response inside <output> tags with valid JSON.
-
-<output>
-{
-  "explicitPatterns": [
-    {
-      "name": "Short descriptive name for the pattern",
-      "type": "theme",
-      "summary": "Description of what this pattern reveals about the user",
-      "evidence": ["statement_id_1", "statement_id_2"],
-      "confidence": 0.85
-    }
-  ]
-}
-</output>`,
-    },
-    {
-      role: "user",
-      content: `THEMES TO ANALYZE:
-${themes.map((theme, i) => `${i + 1}. ${theme}`).join("\n")}
-
-SPACE SUMMARY:
-${summary}
-
-SUPPORTING STATEMENTS:
-${statementsText}
-
-Please extract explicit patterns from these themes and map them to supporting statement evidence.`,
-    },
-  ];
-}
-
-function createImplicitPatternPrompt(
-  statements: SpaceStatementData[],
-): CoreMessage[] {
-  const statementsText = statements
-    .map(
-      (stmt) =>
-        `[${stmt.uuid}] ${stmt.fact} (${stmt.subject} → ${stmt.predicate} → ${stmt.object})`,
-    )
-    .join("\n");
-
-  const implicitTypes = Object.values(IMPLICIT_PATTERN_TYPES).join('", "');
-
-  return [
-    {
-      role: "system",
-      content: `You are an expert at discovering implicit behavioral patterns from statement analysis.
-
-Your task is to identify hidden patterns in user behavior, preferences, and habits from statement content.
-
-INSTRUCTIONS:
-1. Analyze statement content for behavioral patterns, not explicit topics
-2. Give each pattern a short, descriptive name (2-4 words)
-3. Look for recurring behaviors, preferences, and working styles
-4. Identify how the user approaches tasks, makes decisions, and interacts
-5. Use appropriate pattern types from these guidelines: "${implicitTypes}"
-   - "preference": Personal preferences and choices
-   - "habit": Recurring behaviors and routines
-   - "workflow": Work and process patterns
-   - "communication_style": How user communicates and expresses ideas
-   - "decision_pattern": Decision-making approaches and criteria
-   - "temporal_pattern": Time-based behavioral patterns
-   - "behavioral_pattern": General behavioral tendencies
-   - "learning_style": How user learns and processes information
-   - "collaboration_style": How user works with others
-6. You may suggest new pattern types if none of the guidelines fit well
-7. Focus on what the statements reveal about how the user thinks, works, or behaves
-
-RESPONSE FORMAT:
-Provide your response inside <output> tags with valid JSON.
-
-<output>
-{
-  "implicitPatterns": [
-    {
-      "name": "Short descriptive name for the pattern",
-      "type": "preference",
-      "summary": "Description of what this behavioral pattern reveals",
-      "evidence": ["statement_id_1", "statement_id_2"],
-      "confidence": 0.75
-    }
-  ]
-}
-</output>`,
-    },
-    {
-      role: "user",
-      content: `STATEMENTS TO ANALYZE FOR IMPLICIT PATTERNS:
-${statementsText}
-
-Please identify implicit behavioral patterns, preferences, and habits from these statements.`,
-    },
-  ];
-}
-
-function parseExplicitPatternResponse(response: string): Array<{
-  name: string;
-  type: string;
-  summary: string;
-  evidence: string[];
-  confidence: number;
-}> {
-  try {
-    const outputMatch = response.match(/<output>([\s\S]*?)<\/output>/);
-    if (!outputMatch) {
-      logger.warn("No <output> tags found in explicit pattern response");
-      return [];
-    }
-
-    const parsed = JSON.parse(outputMatch[1].trim());
-    const validationResult = z
-      .object({
-        explicitPatterns: z.array(ExplicitPatternSchema),
-      })
-      .safeParse(parsed);
-
-    if (!validationResult.success) {
-      logger.warn("Invalid explicit pattern response format:", {
-        error: validationResult.error,
-      });
-      return [];
-    }
-
-    return validationResult.data.explicitPatterns.filter(
-      (p) =>
-        p.confidence >= CONFIG.minPatternConfidence && p.evidence.length >= 3, // Ensure at least 3 evidence statements
-    );
-  } catch (error) {
-    logger.error(
-      "Error parsing explicit pattern response:",
-      error as Record<string, any>,
-    );
-    return [];
-  }
-}
-
-function parseImplicitPatternResponse(response: string): Array<{
-  name: string;
-  type: string;
-  summary: string;
-  evidence: string[];
-  confidence: number;
-}> {
-  try {
-    const outputMatch = response.match(/<output>([\s\S]*?)<\/output>/);
-    if (!outputMatch) {
-      logger.warn("No <output> tags found in implicit pattern response");
-      return [];
-    }
-
-    const parsed = JSON.parse(outputMatch[1].trim());
-    const validationResult = z
-      .object({
-        implicitPatterns: z.array(ImplicitPatternSchema),
-      })
-      .safeParse(parsed);
-
-    if (!validationResult.success) {
-      logger.warn("Invalid implicit pattern response format:", {
-        error: validationResult.error,
-      });
-      return [];
-    }
-
-    return validationResult.data.implicitPatterns.filter(
-      (p) =>
-        p.confidence >= CONFIG.minPatternConfidence && p.evidence.length >= 3, // Ensure at least 3 evidence statements
-    );
-  } catch (error) {
-    logger.error(
-      "Error parsing implicit pattern response:",
-      error as Record<string, any>,
-    );
-    return [];
-  }
-}
-
-async function storePatterns(
-  explicitPatterns: Omit<
-    SpacePattern,
-    "id" | "createdAt" | "updatedAt" | "spaceId"
-  >[],
-  implicitPatterns: Omit<
-    SpacePattern,
-    "id" | "createdAt" | "updatedAt" | "spaceId"
-  >[],
-  spaceId: string,
-): Promise<void> {
-  try {
-    const allPatterns = [...explicitPatterns, ...implicitPatterns];
-
-    if (allPatterns.length === 0) return;
-
-    // Store in PostgreSQL
-    await createSpacePattern(spaceId, allPatterns);
-
-    logger.info(`Stored ${allPatterns.length} patterns`, {
-      explicit: explicitPatterns.length,
-      implicit: implicitPatterns.length,
-    });
-  } catch (error) {
-    logger.error("Error storing patterns:", error as Record<string, any>);
-    throw error;
-  }
-}
-
-// Helper function to trigger the task
-export async function triggerSpacePattern(payload: SpacePatternPayload) {
-  return await spacePatternTask.trigger(payload, {
-    concurrencyKey: `space-pattern-${payload.spaceId}`, // Prevent parallel runs for the same space
-    tags: [payload.userId, payload.spaceId, payload.triggerSource || "manual"],
-  });
-}
diff --git a/apps/webapp/app/utils/mcp/memory.ts b/apps/webapp/app/utils/mcp/memory.ts
index 9bc5792..45a4f79 100644
--- a/apps/webapp/app/utils/mcp/memory.ts
+++ b/apps/webapp/app/utils/mcp/memory.ts
@@ -19,24 +19,24 @@ const SearchParamsSchema = {
     description:
       "Search query optimized for knowledge graph retrieval. Choose the right query structure based on your search intent:\n\n" +
       "1. **Entity-Centric Queries** (Best for graph search):\n" +
-      "   - ✅ GOOD: \"User's preferences for code style and formatting\"\n" +
-      "   - ✅ GOOD: \"Project authentication implementation decisions\"\n" +
-      "   - ❌ BAD: \"user code style\"\n" +
+      '   - ✅ GOOD: "User\'s preferences for code style and formatting"\n' +
+      '   - ✅ GOOD: "Project authentication implementation decisions"\n' +
+      '   - ❌ BAD: "user code style"\n' +
       "   - Format: [Person/Project] + [relationship/attribute] + [context]\n\n" +
       "2. **Multi-Entity Relationship Queries** (Excellent for episode graph):\n" +
-      "   - ✅ GOOD: \"User and team discussions about API design patterns\"\n" +
-      "   - ✅ GOOD: \"relationship between database schema and performance optimization\"\n" +
-      "   - ❌ BAD: \"user team api design\"\n" +
+      '   - ✅ GOOD: "User and team discussions about API design patterns"\n' +
+      '   - ✅ GOOD: "relationship between database schema and performance optimization"\n' +
+      '   - ❌ BAD: "user team api design"\n' +
       "   - Format: [Entity1] + [relationship type] + [Entity2] + [context]\n\n" +
       "3. **Semantic Question Queries** (Good for vector search):\n" +
-      "   - ✅ GOOD: \"What causes authentication errors in production? What are the security requirements?\"\n" +
-      "   - ✅ GOOD: \"How does caching improve API response times compared to direct database queries?\"\n" +
-      "   - ❌ BAD: \"auth errors production\"\n" +
+      '   - ✅ GOOD: "What causes authentication errors in production? What are the security requirements?"\n' +
+      '   - ✅ GOOD: "How does caching improve API response times compared to direct database queries?"\n' +
+      '   - ❌ BAD: "auth errors production"\n' +
       "   - Format: Complete natural questions with full context\n\n" +
       "4. **Concept Exploration Queries** (Good for BFS traversal):\n" +
-      "   - ✅ GOOD: \"concepts and ideas related to database indexing and query optimization\"\n" +
-      "   - ✅ GOOD: \"topics connected to user authentication and session management\"\n" +
-      "   - ❌ BAD: \"database indexing concepts\"\n" +
+      '   - ✅ GOOD: "concepts and ideas related to database indexing and query optimization"\n' +
+      '   - ✅ GOOD: "topics connected to user authentication and session management"\n' +
+      '   - ❌ BAD: "database indexing concepts"\n' +
       "   - Format: [concept] + related/connected + [domain/context]\n\n" +
       "Avoid keyword soup queries - use complete phrases with proper context for best results.",
   },
@@ -462,7 +462,7 @@ async function handleGetSpace(args: any) {
   const spaceDetails = {
     id: space.id,
     name: space.name,
-    description: space.description,
+    summary: space.summary,
   };
 
   return {
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 8833971..9022175 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -55,7 +55,16 @@ RUN pnpm run build --filter=webapp...
 
 # Runner
 FROM ${NODE_IMAGE} AS runner
-RUN apt-get update && apt-get install -y openssl netcat-openbsd ca-certificates
+RUN apt-get update && apt-get install -y \
+    openssl \
+    netcat-openbsd \
+    ca-certificates \
+    python3 \
+    python3-pip \
+    python3-venv \
+    gcc \
+    g++ \
+    && rm -rf /var/lib/apt/lists/*
 WORKDIR /core
 RUN corepack enable
 ENV NODE_ENV production
@@ -69,6 +78,13 @@ COPY --from=builder --chown=node:node /core/apps/webapp/build ./apps/webapp/buil
 COPY --from=builder --chown=node:node /core/apps/webapp/public ./apps/webapp/public
 COPY --from=builder --chown=node:node /core/scripts ./scripts
 
+# Install BERT Python dependencies
+COPY --chown=node:node apps/webapp/app/bert/requirements.txt ./apps/webapp/app/bert/requirements.txt
+RUN pip3 install --no-cache-dir -r ./apps/webapp/app/bert/requirements.txt
+
+# Copy BERT scripts
+COPY --chown=node:node apps/webapp/app/bert/main.py ./apps/webapp/app/bert/main.py
+
 EXPOSE 3000
 
 USER node
diff --git a/packages/database/prisma/migrations/20251029102022_add_metadata_to_workspace/migration.sql b/packages/database/prisma/migrations/20251029102022_add_metadata_to_workspace/migration.sql
new file mode 100644
index 0000000..fbff632
--- /dev/null
+++ b/packages/database/prisma/migrations/20251029102022_add_metadata_to_workspace/migration.sql
@@ -0,0 +1,2 @@
+-- AlterTable
+ALTER TABLE "Workspace" ADD COLUMN "metadata" JSONB NOT NULL DEFAULT '{}';
diff --git a/packages/database/prisma/schema.prisma b/packages/database/prisma/schema.prisma
index e6f67e0..ae728fa 100644
--- a/packages/database/prisma/schema.prisma
+++ b/packages/database/prisma/schema.prisma
@@ -694,6 +694,8 @@ model Workspace {
   slug String @unique
   icon String?
 
+  metadata Json @default("{}")
+
   integrations String[]
 
   userId String? @unique
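The BullMQ branch in queue-adapter.server.ts above is producer-side only: it adds "topic-analysis" jobs to bertTopicQueue, but no worker appears in this diff. A minimal consumer sketch, assuming the queue was constructed under the name "bert-topic-analysis" and that a shared Redis connection is exported from "~/bullmq/connection" (both assumptions; only processTopicAnalysis and TopicAnalysisPayload are confirmed by the diff):

import { Worker } from "bullmq";
import {
  processTopicAnalysis,
  type TopicAnalysisPayload,
} from "~/jobs/bert/topic-analysis.logic";
import { connection } from "~/bullmq/connection"; // assumed shared Redis connection

// The first argument must match the name bertTopicQueue was created with.
export const bertTopicWorker = new Worker<TopicAnalysisPayload>(
  "bert-topic-analysis",
  async (job) => processTopicAnalysis(job.data),
  {
    connection,
    concurrency: 3, // mirrors the Trigger.dev concurrencyLimit for the same task
  },
);

bertTopicWorker.on("failed", (job, err) => {
  // Fires once both attempts (attempts: 2, exponential backoff) are exhausted.
  console.error(`bert topic analysis job ${job?.id} failed`, err);
});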
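The Dockerfile changes install the BERT Python requirements and copy apps/webapp/app/bert/main.py into the runtime image, but the Node-to-Python bridge lives in topic-analysis.logic.ts and is not part of this diff. One plausible shape for that bridge, assuming main.py reads a JSON payload on stdin and writes JSON topics to stdout (purely an illustrative assumption; the script's real interface is not shown here):

import { spawn } from "node:child_process";

// Illustrative result shape; the actual output of main.py is not shown in this diff.
interface BertTopicResult {
  topics: Array<{ id: number; keywords: string[] }>;
}

export function runBertTopicScript(payload: {
  userId: string;
  minTopicSize?: number;
  nrTopics?: number;
}): Promise<BertTopicResult> {
  return new Promise((resolve, reject) => {
    const proc = spawn("python3", ["apps/webapp/app/bert/main.py"]);
    let stdout = "";
    let stderr = "";
    proc.stdout.on("data", (chunk) => (stdout += chunk));
    proc.stderr.on("data", (chunk) => (stderr += chunk));
    proc.on("error", reject); // spawn failure, e.g. python3 missing
    proc.on("close", (code) => {
      if (code === 0) {
        try {
          resolve(JSON.parse(stdout));
        } catch (err) {
          reject(err); // script succeeded but printed non-JSON output
        }
      } else {
        reject(new Error(`main.py exited with code ${code}: ${stderr}`));
      }
    });
    proc.stdin.write(JSON.stringify(payload));
    proc.stdin.end();
  });
}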