mirror of https://github.com/eliasstepanik/core.git (synced 2026-01-11 17:18:28 +00:00)

feat: space v3

This commit is contained in:
parent c5407be54d
commit 6d850f3d89
@@ -41,10 +41,7 @@ NEO4J_USERNAME=neo4j
NEO4J_PASSWORD=27192e6432564f4788d55c15131bd5ac
OPENAI_API_KEY=
MAGIC_LINK_SECRET=27192e6432564f4788d55c15131bd5ac
NEO4J_AUTH=neo4j/27192e6432564f4788d55c15131bd5ac
OLLAMA_URL=http://ollama:11434
26 .github/workflows/build-docker-image.yml (vendored)
@@ -7,32 +7,6 @@ on:
  workflow_dispatch:

jobs:
  build-init:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v2
        with:
          ref: main

      - name: Set up QEMU
        uses: docker/setup-qemu-action@v1

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v1

      - name: Login to Docker Registry
        run: echo "${{ secrets.DOCKER_PASSWORD }}" | docker login -u "${{ secrets.DOCKER_USERNAME }}" --password-stdin

      - name: Build and Push Frontend Docker Image
        uses: docker/build-push-action@v2
        with:
          context: .
          file: ./apps/init/Dockerfile
          platforms: linux/amd64,linux/arm64
          push: true
          tags: redplanethq/init:${{ github.ref_name }}

  build-webapp:
    runs-on: ubuntu-latest
51 apps/init/.gitignore (vendored)
@@ -1,51 +0,0 @@
# See https://help.github.com/articles/ignoring-files/ for more about ignoring files.

# Dependencies
node_modules
.pnp
.pnp.js

# Local env files
.env
.env.local
.env.development.local
.env.test.local
.env.production.local

# Testing
coverage

# Turbo
.turbo

# Vercel
.vercel

# Build Outputs
.next/
out/
build
dist
.tshy/
.tshy-build/

# Debug
npm-debug.log*
yarn-debug.log*
yarn-error.log*

# Misc
.DS_Store
*.pem

docker-compose.dev.yaml

clickhouse/
.vscode/
registry/

.cursor
CLAUDE.md

.claude
@@ -1,70 +0,0 @@
ARG NODE_IMAGE=node:20.11.1-bullseye-slim@sha256:5a5a92b3a8d392691c983719dbdc65d9f30085d6dcd65376e7a32e6fe9bf4cbe

FROM ${NODE_IMAGE} AS pruner

WORKDIR /core

COPY --chown=node:node . .
RUN npx -q turbo@2.5.3 prune --scope=@redplanethq/init --docker
RUN find . -name "node_modules" -type d -prune -exec rm -rf '{}' +

# Base strategy to have layer caching
FROM ${NODE_IMAGE} AS base
RUN apt-get update && apt-get install -y openssl dumb-init postgresql-client
WORKDIR /core
COPY --chown=node:node .gitignore .gitignore
COPY --from=pruner --chown=node:node /core/out/json/ .
COPY --from=pruner --chown=node:node /core/out/pnpm-lock.yaml ./pnpm-lock.yaml
COPY --from=pruner --chown=node:node /core/out/pnpm-workspace.yaml ./pnpm-workspace.yaml

## Dev deps
FROM base AS dev-deps
WORKDIR /core
# Corepack is used to install pnpm
RUN corepack enable
ENV NODE_ENV development
RUN pnpm install --ignore-scripts --no-frozen-lockfile

## Production deps
FROM base AS production-deps
WORKDIR /core
# Corepack is used to install pnpm
RUN corepack enable
ENV NODE_ENV production
RUN pnpm install --prod --no-frozen-lockfile

## Builder (builds the init CLI)
FROM base AS builder
WORKDIR /core
# Corepack is used to install pnpm
RUN corepack enable

COPY --from=pruner --chown=node:node /core/out/full/ .
COPY --from=dev-deps --chown=node:node /core/ .
COPY --chown=node:node turbo.json turbo.json
COPY --chown=node:node .configs/tsconfig.base.json .configs/tsconfig.base.json
RUN pnpm run build --filter=@redplanethq/init...

# Runner
FROM ${NODE_IMAGE} AS runner
RUN apt-get update && apt-get install -y openssl postgresql-client ca-certificates
WORKDIR /core
RUN corepack enable
ENV NODE_ENV production

COPY --from=base /usr/bin/dumb-init /usr/bin/dumb-init
COPY --from=pruner --chown=node:node /core/out/full/ .
COPY --from=production-deps --chown=node:node /core .
COPY --from=builder --chown=node:node /core/apps/init/dist ./apps/init/dist

# Copy the trigger dump file
COPY --chown=node:node apps/init/trigger.dump ./apps/init/trigger.dump

# Copy and set up entrypoint script
COPY --chown=node:node apps/init/entrypoint.sh ./apps/init/entrypoint.sh
RUN chmod +x ./apps/init/entrypoint.sh

USER node
WORKDIR /core/apps/init
ENTRYPOINT ["dumb-init", "--"]
CMD ["./entrypoint.sh"]
@@ -1,197 +0,0 @@
# Core CLI

🧠 **CORE - Contextual Observation & Recall Engine**

A Command-Line Interface for setting up and managing the Core development environment.

## Installation

```bash
npm install -g @redplanethq/core
```

## Commands

### `core init`

**One-time setup command** - Initializes the Core development environment with full configuration.

### `core start`

**Daily usage command** - Starts all Core services (Docker containers).

### `core stop`

**Daily usage command** - Stops all Core services (Docker containers).

## Getting Started

### Prerequisites

- **Node.js** (v18.20.0 or higher)
- **Docker** and **Docker Compose**
- **Git**
- **pnpm** package manager

### Initial Setup

1. **Clone the Core repository:**

   ```bash
   git clone https://github.com/redplanethq/core.git
   cd core
   ```

2. **Run the initialization command:**

   ```bash
   core init
   ```

3. **The CLI will guide you through the complete setup process:**

#### Step 1: Prerequisites Check

- The CLI shows a checklist of required tools
- Confirms you're in the Core repository directory
- Exits with instructions if prerequisites aren't met

#### Step 2: Environment Configuration

- Copies `.env.example` to `.env` in the root directory
- Copies `trigger/.env.example` to `trigger/.env`
- Skips copying if `.env` files already exist

#### Step 3: Docker Services Startup

- Starts main Core services: `docker compose up -d`
- Starts Trigger.dev services: `docker compose up -d` (in trigger/ directory)
- Shows real-time output with progress indicators

#### Step 4: Database Health Check

- Verifies PostgreSQL is running on `localhost:5432`
- Retries for up to 60 seconds if needed

#### Step 5: Trigger.dev Setup (Interactive)

- **If Trigger.dev is not configured:**

  1. Prompts you to open http://localhost:8030
  2. Asks you to login to Trigger.dev
  3. Guides you to create an organization and project
  4. Collects your Project ID and Secret Key
  5. Updates `.env` with your Trigger.dev configuration
  6. Restarts Core services with new configuration

- **If Trigger.dev is already configured:**
  - Skips setup and shows "Configuration already exists" message

#### Step 6: Docker Registry Login

- Displays docker login command with credentials from `.env`
- Waits for you to complete the login process

#### Step 7: Trigger.dev Task Deployment

- Automatically runs: `npx trigger.dev@v4-beta login -a http://localhost:8030`
- Deploys tasks with: `pnpm trigger:deploy`
- Shows manual deployment instructions if automatic deployment fails

#### Step 8: Setup Complete!

- Confirms all services are running
- Shows service URLs and connection information

## Daily Usage

After initial setup, use these commands for daily development:

### Start Services

```bash
core start
```

Starts all Docker containers for Core development.

### Stop Services

```bash
core stop
```

Stops all Docker containers.

## Service URLs

After setup, these services will be available:

- **Core Application**: http://localhost:3033
- **Trigger.dev**: http://localhost:8030
- **PostgreSQL**: localhost:5432

## Troubleshooting

### Repository Not Found

If you run commands outside the Core repository:

- The CLI will ask you to confirm you're in the Core repository
- If not, it provides instructions to clone the repository
- Navigate to the Core repository directory before running commands again

### Docker Issues

- Ensure Docker is running
- Check Docker Compose is installed
- Verify you have sufficient system resources

### Trigger.dev Setup Issues

- Check container logs: `docker logs trigger-webapp --tail 50`
- Ensure you can access http://localhost:8030
- Verify your network allows connections to localhost

### Environment Variables

The CLI automatically manages these environment variables:

- `TRIGGER_PROJECT_ID` - Your Trigger.dev project ID
- `TRIGGER_SECRET_KEY` - Your Trigger.dev secret key
- Docker registry credentials for deployment

### Manual Trigger.dev Deployment

If automatic deployment fails, run manually:

```bash
npx trigger.dev@v4-beta login -a http://localhost:8030
pnpm trigger:deploy
```

## Development Workflow

1. **First time setup:** `core init`
2. **Daily development:**
   - `core start` - Start your development environment
   - Do your development work
   - `core stop` - Stop services when done

## Support

For issues and questions:

- Check the main Core repository: https://github.com/redplanethq/core
- Review Docker container logs for troubleshooting
- Ensure all prerequisites are properly installed

## Features

- 🚀 **One-command setup** - Complete environment initialization
- 🔄 **Smart configuration** - Skips already configured components
- 📱 **Real-time feedback** - Live progress indicators and output
- 🐳 **Docker integration** - Full container lifecycle management
- 🔧 **Interactive setup** - Guided configuration process
- 🎯 **Error handling** - Graceful failure with recovery instructions

---

**Happy coding with Core!** 🎉
@@ -1,22 +0,0 @@
#!/bin/sh

# Exit on any error
set -e

echo "Starting init CLI..."

# Wait for database to be ready
echo "Waiting for database connection..."
until pg_isready -h "${DB_HOST:-localhost}" -p "${DB_PORT:-5432}" -U "${POSTGRES_USER:-docker}"; do
  echo "Database is unavailable - sleeping"
  sleep 2
done

echo "Database is ready!"

# Run the init command
echo "Running init command..."
node ./dist/esm/index.js init

echo "Init completed successfully!"
exit 0
@@ -1,145 +0,0 @@
|
||||
{
|
||||
"name": "@redplanethq/init",
|
||||
"version": "0.1.0",
|
||||
"description": "A init service to create trigger instance",
|
||||
"type": "module",
|
||||
"license": "MIT",
|
||||
"repository": {
|
||||
"type": "git",
|
||||
"url": "https://github.com/redplanethq/core",
|
||||
"directory": "apps/init"
|
||||
},
|
||||
"publishConfig": {
|
||||
"access": "public"
|
||||
},
|
||||
"keywords": [
|
||||
"typescript"
|
||||
],
|
||||
"files": [
|
||||
"dist",
|
||||
"trigger.dump"
|
||||
],
|
||||
"bin": {
|
||||
"core": "./dist/esm/index.js"
|
||||
},
|
||||
"tshy": {
|
||||
"selfLink": false,
|
||||
"main": false,
|
||||
"module": false,
|
||||
"dialects": [
|
||||
"esm"
|
||||
],
|
||||
"project": "./tsconfig.json",
|
||||
"exclude": [
|
||||
"**/*.test.ts"
|
||||
],
|
||||
"exports": {
|
||||
"./package.json": "./package.json",
|
||||
".": "./src/index.ts"
|
||||
}
|
||||
},
|
||||
"devDependencies": {
|
||||
"@epic-web/test-server": "^0.1.0",
|
||||
"@types/gradient-string": "^1.1.2",
|
||||
"@types/ini": "^4.1.1",
|
||||
"@types/object-hash": "3.0.6",
|
||||
"@types/polka": "^0.5.7",
|
||||
"@types/react": "^18.2.48",
|
||||
"@types/resolve": "^1.20.6",
|
||||
"@types/rimraf": "^4.0.5",
|
||||
"@types/semver": "^7.5.0",
|
||||
"@types/source-map-support": "0.5.10",
|
||||
"@types/ws": "^8.5.3",
|
||||
"cpy-cli": "^5.0.0",
|
||||
"execa": "^8.0.1",
|
||||
"find-up": "^7.0.0",
|
||||
"rimraf": "^5.0.7",
|
||||
"ts-essentials": "10.0.1",
|
||||
"tshy": "^3.0.2",
|
||||
"tsx": "4.17.0"
|
||||
},
|
||||
"scripts": {
|
||||
"clean": "rimraf dist .tshy .tshy-build .turbo",
|
||||
"typecheck": "tsc -p tsconfig.src.json --noEmit",
|
||||
"build": "tshy",
|
||||
"test": "vitest",
|
||||
"test:e2e": "vitest --run -c ./e2e/vitest.config.ts"
|
||||
},
|
||||
"dependencies": {
|
||||
"@clack/prompts": "^0.10.0",
|
||||
"@depot/cli": "0.0.1-cli.2.80.0",
|
||||
"@opentelemetry/api": "1.9.0",
|
||||
"@opentelemetry/api-logs": "0.52.1",
|
||||
"@opentelemetry/exporter-logs-otlp-http": "0.52.1",
|
||||
"@opentelemetry/exporter-trace-otlp-http": "0.52.1",
|
||||
"@opentelemetry/instrumentation": "0.52.1",
|
||||
"@opentelemetry/instrumentation-fetch": "0.52.1",
|
||||
"@opentelemetry/resources": "1.25.1",
|
||||
"@opentelemetry/sdk-logs": "0.52.1",
|
||||
"@opentelemetry/sdk-node": "0.52.1",
|
||||
"@opentelemetry/sdk-trace-base": "1.25.1",
|
||||
"@opentelemetry/sdk-trace-node": "1.25.1",
|
||||
"@opentelemetry/semantic-conventions": "1.25.1",
|
||||
"ansi-escapes": "^7.0.0",
|
||||
"braces": "^3.0.3",
|
||||
"c12": "^1.11.1",
|
||||
"chalk": "^5.2.0",
|
||||
"chokidar": "^3.6.0",
|
||||
"cli-table3": "^0.6.3",
|
||||
"commander": "^9.4.1",
|
||||
"defu": "^6.1.4",
|
||||
"dotenv": "^16.4.5",
|
||||
"dotenv-expand": "^12.0.2",
|
||||
"esbuild": "^0.23.0",
|
||||
"eventsource": "^3.0.2",
|
||||
"evt": "^2.4.13",
|
||||
"fast-npm-meta": "^0.2.2",
|
||||
"git-last-commit": "^1.0.1",
|
||||
"gradient-string": "^2.0.2",
|
||||
"has-flag": "^5.0.1",
|
||||
"import-in-the-middle": "1.11.0",
|
||||
"import-meta-resolve": "^4.1.0",
|
||||
"ini": "^5.0.0",
|
||||
"jsonc-parser": "3.2.1",
|
||||
"magicast": "^0.3.4",
|
||||
"minimatch": "^10.0.1",
|
||||
"mlly": "^1.7.1",
|
||||
"nypm": "^0.5.4",
|
||||
"nanoid": "3.3.8",
|
||||
"object-hash": "^3.0.0",
|
||||
"open": "^10.0.3",
|
||||
"knex": "3.1.0",
|
||||
"p-limit": "^6.2.0",
|
||||
"p-retry": "^6.1.0",
|
||||
"partysocket": "^1.0.2",
|
||||
"pkg-types": "^1.1.3",
|
||||
"polka": "^0.5.2",
|
||||
"pg": "8.16.3",
|
||||
"resolve": "^1.22.8",
|
||||
"semver": "^7.5.0",
|
||||
"signal-exit": "^4.1.0",
|
||||
"source-map-support": "0.5.21",
|
||||
"std-env": "^3.7.0",
|
||||
"supports-color": "^10.0.0",
|
||||
"tiny-invariant": "^1.2.0",
|
||||
"tinyexec": "^0.3.1",
|
||||
"tinyglobby": "^0.2.10",
|
||||
"uuid": "11.1.0",
|
||||
"ws": "^8.18.0",
|
||||
"xdg-app-paths": "^8.3.0",
|
||||
"zod": "3.23.8",
|
||||
"zod-validation-error": "^1.5.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=18.20.0"
|
||||
},
|
||||
"exports": {
|
||||
"./package.json": "./package.json",
|
||||
".": {
|
||||
"import": {
|
||||
"types": "./dist/esm/index.d.ts",
|
||||
"default": "./dist/esm/index.js"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,14 +0,0 @@
import { Command } from "commander";
import { initCommand } from "../commands/init.js";
import { VERSION } from "./version.js";

const program = new Command();

program.name("core").description("Core CLI - A Command-Line Interface for Core").version(VERSION);

program
  .command("init")
  .description("Initialize Core development environment (run once)")
  .action(initCommand);

program.parse(process.argv);
@@ -1,3 +0,0 @@
import { env } from "../utils/env.js";

export const VERSION = env.VERSION;
@@ -1,36 +0,0 @@
import { intro, outro, note } from "@clack/prompts";
import { printCoreBrainLogo } from "../utils/ascii.js";
import { initTriggerDatabase, updateWorkerImage } from "../utils/trigger.js";

export async function initCommand() {
  // Display the CORE brain logo
  printCoreBrainLogo();

  intro("🚀 Core Development Environment Setup");

  try {
    await initTriggerDatabase();
    await updateWorkerImage();

    note(
      [
        "Your services will start running:",
        "",
        "• Core Application: http://localhost:3033",
        "• Trigger.dev: http://localhost:8030",
        "• PostgreSQL: localhost:5432",
        "",
        "You can now start developing with Core!",
        "",
        "ℹ️ When logging in to the Core Application, you can find the login URL in the Docker container logs:",
        "   docker logs core-app --tail 50",
      ].join("\n"),
      "🚀 Services Running"
    );
    outro("🎉 Setup Complete!");
    process.exit(0);
  } catch (error: any) {
    outro(`❌ Setup failed: ${error.message}`);
    process.exit(1);
  }
}
@@ -1,3 +0,0 @@
#!/usr/bin/env node

import "./cli/index.js";
@@ -1,29 +0,0 @@
import chalk from "chalk";
import { VERSION } from "../cli/version.js";

export function printCoreBrainLogo(): void {
  const brain = `
 ██████╗ ██████╗ ██████╗ ███████╗
██╔════╝██╔═══██╗██╔══██╗██╔════╝
██║     ██║   ██║██████╔╝█████╗
██║     ██║   ██║██╔══██╗██╔══╝
╚██████╗╚██████╔╝██║  ██║███████╗
 ╚═════╝ ╚═════╝ ╚═╝  ╚═╝╚══════╝

        o   o   o
     o  o---o---o  o
   o---o   o   o---o---o
  o   o---o---o---o   o
   o---o   o   o---o---o
     o  o---o---o  o
        o   o   o

`;

  console.log(chalk.cyan(brain));
  console.log(
    chalk.bold.white(
      `  🧠 CORE - Contextual Observation & Recall Engine ${VERSION ? chalk.gray(`(${VERSION})`) : ""}\n`
    )
  );
}
@@ -1,24 +0,0 @@
import { z } from "zod";

const EnvironmentSchema = z.object({
  // Version
  VERSION: z.string().default("0.1.24"),

  // Database
  DB_HOST: z.string().default("localhost"),
  DB_PORT: z.string().default("5432"),
  TRIGGER_DB: z.string().default("trigger"),
  POSTGRES_USER: z.string().default("docker"),
  POSTGRES_PASSWORD: z.string().default("docker"),

  // Trigger database
  TRIGGER_TASKS_IMAGE: z.string().default("redplanethq/proj_core:latest"),

  // Node environment
  NODE_ENV: z
    .union([z.literal("development"), z.literal("production"), z.literal("test")])
    .default("development"),
});

export type Environment = z.infer<typeof EnvironmentSchema>;
export const env = EnvironmentSchema.parse(process.env);
@@ -1,182 +0,0 @@
|
||||
import Knex from "knex";
|
||||
import path from "path";
|
||||
import { fileURLToPath } from "url";
|
||||
import { env } from "./env.js";
|
||||
import { spinner, note, log } from "@clack/prompts";
|
||||
|
||||
const __filename = fileURLToPath(import.meta.url);
|
||||
const __dirname = path.dirname(__filename);
|
||||
|
||||
/**
|
||||
* Returns a PostgreSQL database URL for the given database name.
|
||||
* Throws if required environment variables are missing.
|
||||
*/
|
||||
export function getDatabaseUrl(dbName: string): string {
|
||||
const { POSTGRES_USER, POSTGRES_PASSWORD, DB_HOST, DB_PORT } = env;
|
||||
|
||||
if (!POSTGRES_USER || !POSTGRES_PASSWORD || !DB_HOST || !DB_PORT || !dbName) {
|
||||
throw new Error(
|
||||
"One or more required environment variables are missing: POSTGRES_USER, POSTGRES_PASSWORD, DB_HOST, DB_PORT, dbName"
|
||||
);
|
||||
}
|
||||
|
||||
return `postgresql://${POSTGRES_USER}:${POSTGRES_PASSWORD}@${DB_HOST}:${DB_PORT}/${dbName}`;
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if the database specified by TRIGGER_DB exists, and creates it if it does not.
|
||||
* Returns { exists: boolean, created: boolean } - exists indicates success, created indicates if database was newly created.
|
||||
*/
|
||||
export async function ensureDatabaseExists(): Promise<{ exists: boolean; created: boolean }> {
|
||||
const { TRIGGER_DB } = env;
|
||||
|
||||
if (!TRIGGER_DB) {
|
||||
throw new Error("TRIGGER_DB environment variable is missing");
|
||||
}
|
||||
|
||||
// Build a connection string to the default 'postgres' database
|
||||
const adminDbUrl = getDatabaseUrl("postgres");
|
||||
|
||||
// Create a Knex instance for the admin connection
|
||||
const adminKnex = Knex({
|
||||
client: "pg",
|
||||
connection: adminDbUrl,
|
||||
});
|
||||
|
||||
const s = spinner();
|
||||
s.start("Checking for Trigger.dev database...");
|
||||
|
||||
try {
|
||||
// Check if the database exists
|
||||
const result = await adminKnex.select(1).from("pg_database").where("datname", TRIGGER_DB);
|
||||
|
||||
if (result.length === 0) {
|
||||
s.message("Database not found. Creating...");
|
||||
// Database does not exist, create it
|
||||
await adminKnex.raw(`CREATE DATABASE "${TRIGGER_DB}"`);
|
||||
s.stop("Database created.");
|
||||
return { exists: true, created: true };
|
||||
} else {
|
||||
s.stop("Database exists.");
|
||||
return { exists: true, created: false };
|
||||
}
|
||||
} catch (err) {
|
||||
s.stop("Failed to ensure database exists.");
|
||||
log.warning("Failed to ensure database exists: " + (err as Error).message);
|
||||
return { exists: false, created: false };
|
||||
} finally {
|
||||
await adminKnex.destroy();
|
||||
}
|
||||
}
|
||||
|
||||
// Main initialization function
|
||||
export async function initTriggerDatabase() {
|
||||
const { TRIGGER_DB } = env;
|
||||
|
||||
if (!TRIGGER_DB) {
|
||||
throw new Error("TRIGGER_DB environment variable is missing");
|
||||
}
|
||||
|
||||
// Ensure the database exists
|
||||
const { exists, created } = await ensureDatabaseExists();
|
||||
if (!exists) {
|
||||
throw new Error("Failed to create or verify database exists");
|
||||
}
|
||||
|
||||
// Only run pg_restore if the database was newly created
|
||||
if (!created) {
|
||||
note("Database already exists, skipping restore from trigger.dump");
|
||||
return;
|
||||
}
|
||||
|
||||
// Run pg_restore with the trigger.dump file
|
||||
const dumpFilePath = path.join(__dirname, "../../../trigger.dump");
|
||||
const connectionString = getDatabaseUrl(TRIGGER_DB);
|
||||
|
||||
const s = spinner();
|
||||
s.start("Restoring database from trigger.dump...");
|
||||
|
||||
try {
|
||||
// Use execSync and capture stdout/stderr, send to spinner.log
|
||||
const { spawn } = await import("child_process");
|
||||
await new Promise<void>((resolve, reject) => {
|
||||
const child = spawn(
|
||||
"pg_restore",
|
||||
["--verbose", "--no-acl", "--no-owner", "-d", connectionString, dumpFilePath],
|
||||
{ stdio: ["ignore", "pipe", "pipe"] }
|
||||
);
|
||||
|
||||
child.stdout.on("data", (data) => {
|
||||
s.message(data.toString());
|
||||
});
|
||||
|
||||
child.stderr.on("data", (data) => {
|
||||
s.message(data.toString());
|
||||
});
|
||||
|
||||
child.on("close", (code) => {
|
||||
if (code === 0) {
|
||||
s.stop("Database restored successfully from trigger.dump");
|
||||
resolve();
|
||||
} else {
|
||||
s.stop("Failed to restore database.");
|
||||
log.warning(`Failed to restore database: pg_restore exited with code ${code}`);
|
||||
reject(new Error(`Database restore failed: pg_restore exited with code ${code}`));
|
||||
}
|
||||
});
|
||||
|
||||
child.on("error", (err) => {
|
||||
s.stop("Failed to restore database.");
|
||||
log.warning("Failed to restore database: " + err.message);
|
||||
reject(new Error(`Database restore failed: ${err.message}`));
|
||||
});
|
||||
});
|
||||
} catch (error: any) {
|
||||
s.stop("Failed to restore database.");
|
||||
log.warning("Failed to restore database: " + error.message);
|
||||
throw new Error(`Database restore failed: ${error.message}`);
|
||||
}
|
||||
}
|
||||
|
||||
export async function updateWorkerImage() {
|
||||
const { TRIGGER_DB, TRIGGER_TASKS_IMAGE } = env;
|
||||
|
||||
if (!TRIGGER_DB) {
|
||||
throw new Error("TRIGGER_DB environment variable is missing");
|
||||
}
|
||||
|
||||
const connectionString = getDatabaseUrl(TRIGGER_DB);
|
||||
|
||||
const knex = Knex({
|
||||
client: "pg",
|
||||
connection: connectionString,
|
||||
});
|
||||
|
||||
const s = spinner();
|
||||
s.start("Updating worker image reference...");
|
||||
|
||||
try {
|
||||
// Get the first record from WorkerDeployment table
|
||||
const firstWorkerDeployment = await knex("WorkerDeployment").select("id").first();
|
||||
|
||||
if (!firstWorkerDeployment) {
|
||||
s.stop("No WorkerDeployment records found, skipping image update");
|
||||
note("No WorkerDeployment records found, skipping image update");
|
||||
return;
|
||||
}
|
||||
|
||||
// Update the imageReference column with the TRIGGER_TASKS_IMAGE value
|
||||
await knex("WorkerDeployment").where("id", firstWorkerDeployment.id).update({
|
||||
imageReference: TRIGGER_TASKS_IMAGE,
|
||||
updatedAt: new Date(),
|
||||
});
|
||||
|
||||
s.stop(`Successfully updated worker image reference to: ${TRIGGER_TASKS_IMAGE}`);
|
||||
} catch (error: any) {
|
||||
s.stop("Failed to update worker image.");
|
||||
log.warning("Failed to update worker image: " + error.message);
|
||||
throw new Error(`Worker image update failed: ${error.message}`);
|
||||
} finally {
|
||||
await knex.destroy();
|
||||
}
|
||||
}
|
||||
Binary file not shown.
@@ -1,40 +0,0 @@
{
  "include": ["./src/**/*.ts"],
  "exclude": ["./src/**/*.test.ts"],
  "compilerOptions": {
    "target": "es2022",
    "lib": ["ES2022", "DOM", "DOM.Iterable", "DOM.AsyncIterable"],
    "module": "NodeNext",
    "moduleResolution": "NodeNext",
    "moduleDetection": "force",
    "verbatimModuleSyntax": false,
    "jsx": "react",

    "strict": true,
    "alwaysStrict": true,
    "strictPropertyInitialization": true,
    "skipLibCheck": true,
    "forceConsistentCasingInFileNames": true,
    "noUnusedLocals": false,
    "noUnusedParameters": false,
    "noImplicitAny": true,
    "noImplicitReturns": true,
    "noImplicitThis": true,

    "noFallthroughCasesInSwitch": true,
    "resolveJsonModule": true,

    "removeComments": false,
    "esModuleInterop": true,
    "emitDecoratorMetadata": false,
    "experimentalDecorators": false,
    "downlevelIteration": true,
    "isolatedModules": true,
    "noUncheckedIndexedAccess": true,

    "pretty": true,
    "isolatedDeclarations": false,
    "composite": true,
    "sourceMap": true
  }
}
@@ -1,8 +0,0 @@
import { configDefaults, defineConfig } from "vitest/config";

export default defineConfig({
  test: {
    globals: true,
    exclude: [...configDefaults.exclude, "e2e/**/*"],
  },
});
299 apps/webapp/app/bert/README.md (new file)
@@ -0,0 +1,299 @@
# BERT Topic Modeling CLI for Echo Episodes

This CLI tool performs topic modeling on Echo episodes using BERTopic. It connects to Neo4j, retrieves episodes with their pre-computed embeddings for a given user, and discovers thematic clusters using HDBSCAN clustering.

## Features

- Connects to Neo4j database to fetch episodes
- Uses pre-computed embeddings (no need to regenerate them)
- Performs semantic topic clustering with BERTopic
- Displays topics with:
  - Top keywords per topic
  - Episode count per topic
  - Sample episodes for each topic
- Configurable minimum topic size
- Environment variable support for easy configuration

## Prerequisites

- Python 3.8+
- Access to Neo4j database with episodes stored
- Pre-computed embeddings stored in Neo4j (in `contentEmbedding` field)

## Installation

1. Navigate to the bert directory:

   ```bash
   cd apps/webapp/app/bert
   ```

2. Install dependencies:

   ```bash
   pip install -r requirements.txt
   ```

## Configuration

The CLI can read Neo4j connection details from:

1. **Environment variables** (recommended) - Create a `.env` file or export:

   ```bash
   export NEO4J_URI=bolt://localhost:7687
   export NEO4J_USERNAME=neo4j
   export NEO4J_PASSWORD=your_password
   ```

2. **Command-line options** - Pass credentials directly as flags

3. **From project root** - The tool automatically loads `.env` from the project root

## Usage

### Basic Usage

Using environment variables (most common):

```bash
python main.py <user_id>
```

### Advanced Options

```bash
python main.py <user_id> [OPTIONS]
```

**Options:**

- `--min-topic-size INTEGER`: Minimum number of episodes per topic (default: 10)
- `--nr-topics INTEGER`: Target number of topics for reduction (optional)
- `--propose-spaces`: Generate space proposals using OpenAI (requires OPENAI_API_KEY)
- `--openai-api-key TEXT`: OpenAI API key for space proposals (or use OPENAI_API_KEY env var)
- `--json`: Output only final results in JSON format (suppresses all other output)
- `--neo4j-uri TEXT`: Neo4j connection URI (default: bolt://localhost:7687)
- `--neo4j-username TEXT`: Neo4j username (default: neo4j)
- `--neo4j-password TEXT`: Neo4j password (required)

### Examples

1. **Basic usage with environment variables:**

   ```bash
   python main.py user-123
   ```

2. **Custom minimum topic size:**

   ```bash
   python main.py user-123 --min-topic-size 10
   ```

3. **Explicit credentials:**

   ```bash
   python main.py user-123 \
     --neo4j-uri bolt://neo4j:7687 \
     --neo4j-username neo4j \
     --neo4j-password mypassword
   ```

4. **Using Docker compose Neo4j:**

   ```bash
   python main.py user-123 \
     --neo4j-uri bolt://localhost:7687 \
     --neo4j-password 27192e6432564f4788d55c15131bd5ac
   ```

5. **With space proposals:**

   ```bash
   python main.py user-123 --propose-spaces
   ```

6. **JSON output mode (for programmatic use):**

   ```bash
   python main.py user-123 --json
   ```

7. **JSON output with space proposals:**

   ```bash
   python main.py user-123 --propose-spaces --json
   ```

### Get Help

```bash
python main.py --help
```

## Output Formats

### Human-Readable Output (Default)

The CLI outputs:

```
================================================================================
BERT TOPIC MODELING FOR ECHO EPISODES
================================================================================
User ID: user-123
Min Topic Size: 20
================================================================================

✓ Connected to Neo4j at bolt://localhost:7687
✓ Fetched 150 episodes with embeddings

🔍 Running BERTopic analysis (min_topic_size=20)...
✓ Topic modeling complete

================================================================================
TOPIC MODELING RESULTS
================================================================================
Total Topics Found: 5
Total Episodes: 150
================================================================================

────────────────────────────────────────────────────────────────────────────────
Topic 0: 45 episodes
────────────────────────────────────────────────────────────────────────────────
Keywords: authentication, login, user, security, session, password, token, oauth, jwt, credentials

Sample Episodes (showing up to 3):
  1. [uuid-123]
     Discussing authentication flow for the new user login system...

  2. [uuid-456]
     Implementing OAuth2 with JWT tokens for secure sessions...

  3. [uuid-789]
     Password reset functionality with email verification...

────────────────────────────────────────────────────────────────────────────────
Topic 1: 32 episodes
────────────────────────────────────────────────────────────────────────────────
Keywords: database, neo4j, query, graph, cypher, nodes, relationships, index, performance, optimization

Sample Episodes (showing up to 3):
...

Topic -1 (Outliers): 8 episodes

================================================================================
✓ Analysis complete!
================================================================================

✓ Neo4j connection closed
```

### JSON Output Mode (--json flag)

When using the `--json` flag, the tool outputs only a clean JSON object with no debug logs:

```json
{
  "topics": {
    "0": {
      "keywords": ["authentication", "login", "user", "security", "session"],
      "episodeIds": ["uuid-123", "uuid-456", "uuid-789"]
    },
    "1": {
      "keywords": ["database", "neo4j", "query", "graph", "cypher"],
      "episodeIds": ["uuid-abc", "uuid-def"]
    }
  },
  "spaces": [
    {
      "name": "User Authentication",
      "intent": "Episodes about user authentication, login systems, and security belong in this space.",
      "confidence": 85,
      "topics": [0, 3],
      "estimatedEpisodes": 120
    }
  ]
}
```

**JSON Output Structure:**

- `topics`: Dictionary of topic IDs with keywords and episode UUIDs
- `spaces`: Array of space proposals (only if `--propose-spaces` is used)
  - `name`: Space name (2-5 words)
  - `intent`: Classification intent (1-2 sentences)
  - `confidence`: Confidence score (0-100)
  - `topics`: Source topic IDs that form this space
  - `estimatedEpisodes`: Estimated number of episodes in this space

**Use Cases for JSON Mode:**

- Programmatic consumption by other tools (see the sketch after this list)
- Piping output to jq or other JSON processors
- Integration with CI/CD pipelines
- Automated space creation workflows
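A minimal sketch of programmatic consumption, assuming the script is invoked from this directory with the `NEO4J_*` variables already exported; the `user-123` ID and the helper name are illustrative:

```python
import json
import subprocess


def run_topic_analysis(user_id: str, min_topic_size: int = 10) -> dict:
    """Run the CLI in --json mode and parse its output."""
    completed = subprocess.run(
        ["python", "main.py", user_id, "--json", "--min-topic-size", str(min_topic_size)],
        capture_output=True,
        text=True,
        check=True,  # raise CalledProcessError if the CLI exits non-zero
    )
    return json.loads(completed.stdout)


if __name__ == "__main__":
    analysis = run_topic_analysis("user-123")  # illustrative user ID
    for topic_id, topic in analysis["topics"].items():
        print(topic_id, topic["keywords"][:5], len(topic["episodeIds"]))
```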
## How It Works

1. **Connection**: Establishes connection to Neo4j database
2. **Data Fetching**: Queries all episodes for the given userId that have:
   - Non-null `contentEmbedding` field
   - Non-empty content
3. **Topic Modeling** (see the sketch below): Runs BERTopic with:
   - Pre-computed embeddings (no re-embedding needed)
   - HDBSCAN clustering (automatic cluster discovery)
   - Keyword extraction via c-TF-IDF
4. **Results**: Displays topics with keywords and sample episodes
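The clustering step can be reproduced in isolation. The sketch below mirrors the UMAP/HDBSCAN configuration used by `main.py` in this directory, trimmed to the essentials (the vectorizer and c-TF-IDF tuning are omitted), and assumes `contents` and `embeddings` have already been fetched with the query shown in the next section:

```python
from typing import List, Tuple

import numpy as np
from bertopic import BERTopic
from hdbscan import HDBSCAN
from umap import UMAP


def cluster_episodes(
    contents: List[str], embeddings: np.ndarray, min_topic_size: int = 10
) -> Tuple[BERTopic, List[int]]:
    """Cluster episodes using their stored embeddings (no re-encoding)."""
    umap_model = UMAP(
        n_neighbors=15, n_components=5, min_dist=0.0, metric="cosine", random_state=42
    )
    hdbscan_model = HDBSCAN(
        min_cluster_size=min_topic_size, min_samples=5, metric="euclidean", prediction_data=True
    )
    model = BERTopic(
        embedding_model=None,  # pre-computed embeddings are passed to fit_transform
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        calculate_probabilities=True,
    )
    topics, _probs = model.fit_transform(contents, embeddings=embeddings)
    return model, topics
```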
## Neo4j Query

The tool uses this Cypher query to fetch episodes:

```cypher
MATCH (e:Episode {userId: $userId})
WHERE e.contentEmbedding IS NOT NULL
  AND size(e.contentEmbedding) > 0
  AND e.content IS NOT NULL
  AND e.content <> ''
RETURN e.uuid as uuid,
       e.content as content,
       e.contentEmbedding as embedding,
       e.createdAt as createdAt
ORDER BY e.createdAt DESC
```

## Tuning Parameters

- **`--min-topic-size`**:
  - Smaller values (5-10): More granular topics, may include noise
  - Larger values (20-30): Broader topics, more coherent but fewer clusters
  - Recommended: Start with 20 and adjust based on your data

## Troubleshooting

### No episodes found

- Verify the userId exists in Neo4j
- Check that episodes have `contentEmbedding` populated
- Ensure episodes have non-empty `content` field

### Connection errors

- Verify Neo4j is running: `docker ps | grep neo4j`
- Check URI format: should be `bolt://host:port`
- Verify credentials are correct

### Too few/many topics

- Adjust `--min-topic-size` parameter
- Need more topics: decrease the value (e.g., `--min-topic-size 10`)
- Need fewer topics: increase the value (e.g., `--min-topic-size 30`)

## Dependencies

- `bertopic>=0.16.0` - Topic modeling
- `neo4j>=5.14.0` - Neo4j Python driver
- `click>=8.1.0` - CLI framework
- `numpy>=1.24.0` - Numerical operations
- `python-dotenv>=1.0.0` - Environment variable loading

## License

Part of the Echo project.
384 apps/webapp/app/bert/main.py (new file)
@@ -0,0 +1,384 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
BERT Topic Modeling CLI for Echo Episodes
|
||||
|
||||
This CLI tool connects to Neo4j, retrieves episodes with their embeddings for a given userId,
|
||||
and performs topic modeling using BERTopic to discover thematic clusters.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
from typing import List, Tuple, Dict, Any
|
||||
import click
|
||||
import numpy as np
|
||||
from neo4j import GraphDatabase
|
||||
from bertopic import BERTopic
|
||||
from bertopic.vectorizers import ClassTfidfTransformer
|
||||
from dotenv import load_dotenv
|
||||
from sklearn.feature_extraction.text import CountVectorizer
|
||||
from umap import UMAP
|
||||
from hdbscan import HDBSCAN
|
||||
|
||||
|
||||
class Neo4jConnection:
|
||||
"""Manages Neo4j database connection."""
|
||||
|
||||
def __init__(self, uri: str, username: str, password: str, quiet: bool = False):
|
||||
"""Initialize Neo4j connection.
|
||||
|
||||
Args:
|
||||
uri: Neo4j connection URI (e.g., bolt://localhost:7687)
|
||||
username: Neo4j username
|
||||
password: Neo4j password
|
||||
quiet: If True, suppress output messages
|
||||
"""
|
||||
self.quiet = quiet
|
||||
try:
|
||||
self.driver = GraphDatabase.driver(uri, auth=(username, password))
|
||||
# Verify connection
|
||||
self.driver.verify_connectivity()
|
||||
if not quiet:
|
||||
click.echo(f"✓ Connected to Neo4j at {uri}")
|
||||
except Exception as e:
|
||||
if not quiet:
|
||||
click.echo(f"✗ Failed to connect to Neo4j: {e}", err=True)
|
||||
sys.exit(1)
|
||||
|
||||
def close(self):
|
||||
"""Close the Neo4j connection."""
|
||||
if self.driver:
|
||||
self.driver.close()
|
||||
if not self.quiet:
|
||||
click.echo("✓ Neo4j connection closed")
|
||||
|
||||
def get_episodes_with_embeddings(self, user_id: str) -> Tuple[List[str], List[str], np.ndarray]:
|
||||
"""Fetch all episodes with their embeddings for a given user.
|
||||
|
||||
Args:
|
||||
user_id: The user ID to fetch episodes for
|
||||
|
||||
Returns:
|
||||
Tuple of (episode_uuids, episode_contents, embeddings_array)
|
||||
"""
|
||||
query = """
|
||||
MATCH (e:Episode {userId: $userId})
|
||||
WHERE e.contentEmbedding IS NOT NULL
|
||||
AND size(e.contentEmbedding) > 0
|
||||
AND e.content IS NOT NULL
|
||||
AND e.content <> ''
|
||||
RETURN e.uuid as uuid,
|
||||
e.content as content,
|
||||
e.contentEmbedding as embedding,
|
||||
e.createdAt as createdAt
|
||||
ORDER BY e.createdAt DESC
|
||||
"""
|
||||
|
||||
with self.driver.session() as session:
|
||||
result = session.run(query, userId=user_id)
|
||||
records = list(result)
|
||||
|
||||
if not records:
|
||||
if not self.quiet:
|
||||
click.echo(f"✗ No episodes found for userId: {user_id}", err=True)
|
||||
sys.exit(1)
|
||||
|
||||
uuids = []
|
||||
contents = []
|
||||
embeddings = []
|
||||
|
||||
for record in records:
|
||||
uuids.append(record["uuid"])
|
||||
contents.append(record["content"])
|
||||
embeddings.append(record["embedding"])
|
||||
|
||||
embeddings_array = np.array(embeddings, dtype=np.float32)
|
||||
|
||||
if not self.quiet:
|
||||
click.echo(f"✓ Fetched {len(contents)} episodes with embeddings")
|
||||
return uuids, contents, embeddings_array
|
||||
|
||||
|
||||
def run_bertopic_analysis(
|
||||
contents: List[str],
|
||||
embeddings: np.ndarray,
|
||||
min_topic_size: int = 20,
|
||||
nr_topics: int = None,
|
||||
quiet: bool = False
|
||||
) -> Tuple[BERTopic, List[int], List[float]]:
|
||||
"""Run BERTopic clustering on episode contents with improved configuration.
|
||||
|
||||
Args:
|
||||
contents: List of episode content strings
|
||||
embeddings: Pre-computed embeddings for the episodes
|
||||
min_topic_size: Minimum number of documents per topic
|
||||
nr_topics: Target number of topics (optional, for topic reduction)
|
||||
quiet: If True, suppress output messages
|
||||
|
||||
Returns:
|
||||
Tuple of (bertopic_model, topic_assignments, probabilities)
|
||||
"""
|
||||
if not quiet:
|
||||
click.echo(f"\n🔍 Running BERTopic analysis (min_topic_size={min_topic_size})...")
|
||||
|
||||
# Step 1: Configure UMAP for dimensionality reduction
|
||||
# More aggressive reduction helps find distinct clusters
|
||||
umap_model = UMAP(
|
||||
n_neighbors=15, # Balance between local/global structure
|
||||
n_components=5, # Reduce to 5 dimensions
|
||||
min_dist=0.0, # Allow tight clusters
|
||||
metric='cosine', # Use cosine similarity
|
||||
random_state=42
|
||||
)
|
||||
|
||||
# Step 2: Configure HDBSCAN for clustering
|
||||
# Tuned to find more granular topics
|
||||
hdbscan_model = HDBSCAN(
|
||||
min_cluster_size=min_topic_size, # Minimum episodes per topic
|
||||
min_samples=5, # More sensitive to local density
|
||||
metric='euclidean',
|
||||
cluster_selection_method='eom', # Excess of mass method
|
||||
prediction_data=True
|
||||
)
|
||||
|
||||
# Step 3: Configure vectorizer with stopword removal
|
||||
# Remove common English stopwords that pollute topic keywords
|
||||
vectorizer_model = CountVectorizer(
|
||||
stop_words='english', # Remove common English words
|
||||
min_df=2, # Word must appear in at least 2 docs
|
||||
max_df=0.95, # Ignore words in >95% of docs
|
||||
ngram_range=(1, 2) # Include unigrams and bigrams
|
||||
)
|
||||
|
||||
# Step 4: Configure c-TF-IDF with better parameters
|
||||
ctfidf_model = ClassTfidfTransformer(
|
||||
reduce_frequent_words=True, # Further reduce common words
|
||||
bm25_weighting=True # Use BM25 for better keyword extraction
|
||||
)
|
||||
|
||||
# Step 5: Initialize BERTopic with all custom components
|
||||
model = BERTopic(
|
||||
embedding_model=None, # Use pre-computed embeddings
|
||||
umap_model=umap_model,
|
||||
hdbscan_model=hdbscan_model,
|
||||
vectorizer_model=vectorizer_model,
|
||||
ctfidf_model=ctfidf_model,
|
||||
top_n_words=15, # More keywords per topic
|
||||
nr_topics=nr_topics, # Optional topic reduction
|
||||
calculate_probabilities=True,
|
||||
verbose=(not quiet)
|
||||
)
|
||||
|
||||
# Fit the model with pre-computed embeddings
|
||||
topics, probs = model.fit_transform(contents, embeddings=embeddings)
|
||||
|
||||
# Get topic count
|
||||
unique_topics = len(set(topics)) - (1 if -1 in topics else 0)
|
||||
if not quiet:
|
||||
click.echo(f"✓ Topic modeling complete - Found {unique_topics} topics")
|
||||
|
||||
return model, topics, probs
|
||||
|
||||
|
||||
def print_topic_results(
|
||||
model: BERTopic,
|
||||
topics: List[int],
|
||||
uuids: List[str],
|
||||
contents: List[str]
|
||||
):
|
||||
"""Print formatted topic results.
|
||||
|
||||
Args:
|
||||
model: Fitted BERTopic model
|
||||
topics: Topic assignments for each episode
|
||||
uuids: Episode UUIDs
|
||||
contents: Episode contents
|
||||
"""
|
||||
# Get topic info
|
||||
topic_info = model.get_topic_info()
|
||||
num_topics = len(topic_info) - 1 # Exclude outlier topic (-1)
|
||||
|
||||
click.echo(f"\n{'='*80}")
|
||||
click.echo(f"TOPIC MODELING RESULTS")
|
||||
click.echo(f"{'='*80}")
|
||||
click.echo(f"Total Topics Found: {num_topics}")
|
||||
click.echo(f"Total Episodes: {len(contents)}")
|
||||
click.echo(f"{'='*80}\n")
|
||||
|
||||
# Print each topic
|
||||
for idx, row in topic_info.iterrows():
|
||||
topic_id = row['Topic']
|
||||
count = row['Count']
|
||||
|
||||
# Skip outlier topic
|
||||
if topic_id == -1:
|
||||
click.echo(f"Topic -1 (Outliers): {count} episodes\n")
|
||||
continue
|
||||
|
||||
# Get top words for this topic
|
||||
topic_words = model.get_topic(topic_id)
|
||||
|
||||
click.echo(f"{'─'*80}")
|
||||
click.echo(f"Topic {topic_id}: {count} episodes")
|
||||
click.echo(f"{'─'*80}")
|
||||
|
||||
# Print top keywords
|
||||
if topic_words:
|
||||
keywords = [word for word, score in topic_words[:10]]
|
||||
click.echo(f"Keywords: {', '.join(keywords)}")
|
||||
|
||||
# Print sample episodes
|
||||
topic_episodes = [(uuid, content) for uuid, content, topic
|
||||
in zip(uuids, contents, topics) if topic == topic_id]
|
||||
|
||||
click.echo(f"\nSample Episodes (showing up to 3):")
|
||||
for i, (uuid, content) in enumerate(topic_episodes[:3]):
|
||||
# Truncate content for display
|
||||
truncated = content[:200] + "..." if len(content) > 200 else content
|
||||
click.echo(f" {i+1}. [{uuid}]")
|
||||
click.echo(f" {truncated}\n")
|
||||
|
||||
click.echo()
|
||||
|
||||
|
||||
def build_json_output(
|
||||
model: BERTopic,
|
||||
topics: List[int],
|
||||
uuids: List[str]
|
||||
) -> Dict[str, Any]:
|
||||
"""Build JSON output structure.
|
||||
|
||||
Args:
|
||||
model: Fitted BERTopic model
|
||||
topics: Topic assignments for each episode
|
||||
uuids: Episode UUIDs
|
||||
|
||||
Returns:
|
||||
Dictionary with topics data
|
||||
"""
|
||||
# Build topics dictionary
|
||||
topics_dict = {}
|
||||
topic_info = model.get_topic_info()
|
||||
|
||||
for idx, row in topic_info.iterrows():
|
||||
topic_id = row['Topic']
|
||||
|
||||
# Skip outlier topic
|
||||
if topic_id == -1:
|
||||
continue
|
||||
|
||||
# Get keywords
|
||||
topic_words = model.get_topic(topic_id)
|
||||
keywords = [word for word, score in topic_words[:10]] if topic_words else []
|
||||
|
||||
# Get episode IDs for this topic
|
||||
episode_ids = [uuid for uuid, topic in zip(uuids, topics) if topic == topic_id]
|
||||
|
||||
topics_dict[topic_id] = {
|
||||
"keywords": keywords,
|
||||
"episodeIds": episode_ids
|
||||
}
|
||||
|
||||
return {"topics": topics_dict}
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.argument('user_id', type=str)
|
||||
@click.option(
|
||||
'--min-topic-size',
|
||||
default=10,
|
||||
type=int,
|
||||
help='Minimum number of episodes per topic (default: 10, lower = more granular topics)'
|
||||
)
|
||||
@click.option(
|
||||
'--nr-topics',
|
||||
default=None,
|
||||
type=int,
|
||||
help='Target number of topics for reduction (optional, e.g., 20 for ~20 topics)'
|
||||
)
|
||||
@click.option(
|
||||
'--neo4j-uri',
|
||||
envvar='NEO4J_URI',
|
||||
default='bolt://localhost:7687',
|
||||
help='Neo4j connection URI (default: bolt://localhost:7687)'
|
||||
)
|
||||
@click.option(
|
||||
'--neo4j-username',
|
||||
envvar='NEO4J_USERNAME',
|
||||
default='neo4j',
|
||||
help='Neo4j username (default: neo4j)'
|
||||
)
|
||||
@click.option(
|
||||
'--neo4j-password',
|
||||
envvar='NEO4J_PASSWORD',
|
||||
required=True,
|
||||
help='Neo4j password (required, can use NEO4J_PASSWORD env var)'
|
||||
)
|
||||
@click.option(
|
||||
'--json',
|
||||
'json_output',
|
||||
is_flag=True,
|
||||
default=False,
|
||||
help='Output only final results in JSON format (suppresses all other output)'
|
||||
)
|
||||
def main(user_id: str, min_topic_size: int, nr_topics: int, neo4j_uri: str, neo4j_username: str, neo4j_password: str, json_output: bool):
|
||||
"""
|
||||
Run BERTopic analysis on episodes for a given USER_ID.
|
||||
|
||||
This tool connects to Neo4j, retrieves all episodes with embeddings for the specified user,
|
||||
and performs topic modeling to discover thematic clusters.
|
||||
|
||||
Examples:
|
||||
|
||||
# Using environment variables from .env file
|
||||
python main.py user-123
|
||||
|
||||
# With custom min topic size
|
||||
python main.py user-123 --min-topic-size 10
|
||||
|
||||
# With explicit Neo4j credentials
|
||||
python main.py user-123 --neo4j-uri bolt://localhost:7687 --neo4j-password mypassword
|
||||
"""
|
||||
# Print header only if not in JSON mode
|
||||
if not json_output:
|
||||
click.echo(f"\n{'='*80}")
|
||||
click.echo("BERT TOPIC MODELING FOR ECHO EPISODES")
|
||||
click.echo(f"{'='*80}")
|
||||
click.echo(f"User ID: {user_id}")
|
||||
click.echo(f"Min Topic Size: {min_topic_size}")
|
||||
if nr_topics:
|
||||
click.echo(f"Target Topics: ~{nr_topics}")
|
||||
click.echo(f"{'='*80}\n")
|
||||
|
||||
# Connect to Neo4j (quiet mode if JSON output)
|
||||
neo4j_conn = Neo4jConnection(neo4j_uri, neo4j_username, neo4j_password, quiet=json_output)
|
||||
|
||||
try:
|
||||
# Fetch episodes with embeddings
|
||||
uuids, contents, embeddings = neo4j_conn.get_episodes_with_embeddings(user_id)
|
||||
|
||||
# Run BERTopic analysis
|
||||
model, topics, probs = run_bertopic_analysis(contents, embeddings, min_topic_size, nr_topics, quiet=json_output)
|
||||
|
||||
# Output results
|
||||
if json_output:
|
||||
# JSON output mode - only print JSON
|
||||
output = build_json_output(model, topics, uuids)
|
||||
click.echo(json.dumps(output, indent=2))
|
||||
else:
|
||||
# Normal output mode - print formatted results
|
||||
print_topic_results(model, topics, uuids, contents)
|
||||
|
||||
click.echo(f"{'='*80}")
|
||||
click.echo("✓ Analysis complete!")
|
||||
click.echo(f"{'='*80}\n")
|
||||
|
||||
finally:
|
||||
# Always close connection
|
||||
neo4j_conn.close()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
# Load environment variables from .env file if present
|
||||
load_dotenv()
|
||||
main()
|
||||
8 apps/webapp/app/bert/requirements.txt (new file)
@@ -0,0 +1,8 @@
bertopic>=0.16.0
neo4j>=5.14.0
click>=8.1.0
numpy>=1.24.0
python-dotenv>=1.0.0
scikit-learn>=1.3.0
umap-learn>=0.5.4
hdbscan>=0.8.33
@@ -92,3 +92,26 @@ export const sessionCompactionQueue = new Queue("session-compaction-queue", {
    },
  },
});

/**
 * BERT topic analysis queue
 * Handles CPU-intensive topic modeling on user episodes
 */
export const bertTopicQueue = new Queue("bert-topic-queue", {
  connection: getRedisConnection(),
  defaultJobOptions: {
    attempts: 2, // Only 2 attempts due to long runtime
    backoff: {
      type: "exponential",
      delay: 5000,
    },
    timeout: 300000, // 5 minute timeout
    removeOnComplete: {
      age: 7200, // Keep completed jobs for 2 hours
      count: 100,
    },
    removeOnFail: {
      age: 172800, // Keep failed jobs for 48 hours (for debugging)
    },
  },
});
@@ -23,10 +23,15 @@ import {
  processSessionCompaction,
  type SessionCompactionPayload,
} from "~/jobs/session/session-compaction.logic";
import {
  processTopicAnalysis,
  type TopicAnalysisPayload,
} from "~/jobs/bert/topic-analysis.logic";
import {
  enqueueIngestEpisode,
  enqueueSpaceAssignment,
  enqueueSessionCompaction,
  enqueueBertTopicAnalysis,
} from "~/lib/queue-adapter.server";
import { logger } from "~/services/logger.service";

@@ -47,6 +52,7 @@ export const ingestWorker = new Worker(
      // Callbacks to enqueue follow-up jobs
      enqueueSpaceAssignment,
      enqueueSessionCompaction,
      enqueueBertTopicAnalysis,
    );
  },
  {
@@ -108,6 +114,22 @@ export const sessionCompactionWorker = new Worker(
  },
);

/**
 * BERT topic analysis worker
 * Handles CPU-intensive topic modeling
 */
export const bertTopicWorker = new Worker(
  "bert-topic-queue",
  async (job) => {
    const payload = job.data as TopicAnalysisPayload;
    return await processTopicAnalysis(payload);
  },
  {
    connection: getRedisConnection(),
    concurrency: 2, // Process up to 2 analyses in parallel (CPU-intensive)
  },
);

/**
 * Graceful shutdown handler
 */
@@ -116,8 +138,8 @@ export async function closeAllWorkers(): Promise<void> {
    ingestWorker.close(),
    documentIngestWorker.close(),
    conversationTitleWorker.close(),

    sessionCompactionWorker.close(),
    bertTopicWorker.close(),
  ]);
  logger.log("All BullMQ workers closed");
}
116 apps/webapp/app/jobs/bert/topic-analysis.logic.ts (new file)
@@ -0,0 +1,116 @@
import { exec } from "child_process";
import { promisify } from "util";

const execAsync = promisify(exec);

export interface TopicAnalysisPayload {
  userId: string;
  minTopicSize?: number;
  nrTopics?: number;
}

export interface TopicAnalysisResult {
  topics: {
    [topicId: string]: {
      keywords: string[];
      episodeIds: string[];
    };
  };
}

/**
 * Process BERT topic analysis on user's episodes
 * This is the common logic shared between Trigger.dev and BullMQ
 *
 * NOTE: This function does NOT update workspace.metadata.lastTopicAnalysisAt
 * That should be done by the caller BEFORE enqueueing this job to prevent
 * duplicate analyses from race conditions.
 */
export async function processTopicAnalysis(
  payload: TopicAnalysisPayload
): Promise<TopicAnalysisResult> {
  const { userId, minTopicSize = 10, nrTopics } = payload;

  console.log(`[BERT Topic Analysis] Starting analysis for user: ${userId}`);
  console.log(
    `[BERT Topic Analysis] Parameters: minTopicSize=${minTopicSize}, nrTopics=${nrTopics || "auto"}`
  );

  // Build the command
  let command = `python3 /core/apps/webapp/app/bert/main.py ${userId} --json`;

  if (minTopicSize) {
    command += ` --min-topic-size ${minTopicSize}`;
  }

  if (nrTopics) {
    command += ` --nr-topics ${nrTopics}`;
  }

  console.log(`[BERT Topic Analysis] Executing: ${command}`);

  try {
    const startTime = Date.now();

    // Execute the Python script with a 5-minute timeout
    const { stdout, stderr } = await execAsync(command, {
      timeout: 300000, // 5 minutes
      maxBuffer: 10 * 1024 * 1024, // 10MB buffer for large outputs
    });

    const duration = Date.now() - startTime;
    console.log(`[BERT Topic Analysis] Completed in ${duration}ms`);

    if (stderr) {
      console.warn(`[BERT Topic Analysis] Warnings:`, stderr);
    }

    // Parse the JSON output
    const result: TopicAnalysisResult = JSON.parse(stdout);

    // Log summary
    const topicCount = Object.keys(result.topics).length;
    const totalEpisodes = Object.values(result.topics).reduce(
      (sum, topic) => sum + topic.episodeIds.length,
      0
    );

    console.log(
      `[BERT Topic Analysis] Found ${topicCount} topics covering ${totalEpisodes} episodes`
    );

    return result;
  } catch (error) {
    console.error(`[BERT Topic Analysis] Error:`, error);

    if (error instanceof Error) {
      // Check for timeout
      if (error.message.includes("ETIMEDOUT")) {
        throw new Error(
          `Topic analysis timed out after 5 minutes. User may have too many episodes.`
        );
      }

      // Check for Python errors
      if (error.message.includes("python3: not found")) {
        throw new Error(
          `Python 3 is not installed or not available in PATH.`
        );
      }

      // Check for Neo4j connection errors
      if (error.message.includes("Failed to connect to Neo4j")) {
        throw new Error(
          `Could not connect to Neo4j. Check NEO4J_URI and credentials.`
        );
      }

      // Check for no episodes
      if (error.message.includes("No episodes found")) {
        throw new Error(`No episodes found for userId: ${userId}`);
      }
    }

    throw error;
  }
}
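The function above shells out to main.py and trusts whatever the script prints on stdout to be a single JSON object. A hedged sketch of that contract and of a direct call (the user id and sample values are illustrative; the Python script itself is not shown in this diff):

import {
  processTopicAnalysis,
  type TopicAnalysisResult,
} from "~/jobs/bert/topic-analysis.logic";

// Shape the parser expects on stdout from `python3 main.py <userId> --json`:
const exampleStdout: TopicAnalysisResult = {
  topics: {
    "0": { keywords: ["auth", "session", "jwt"], episodeIds: ["ep_1", "ep_2"] },
    "1": { keywords: ["docker", "build", "ci"], episodeIds: ["ep_3"] },
  },
};
console.log("example payload:", JSON.stringify(exampleStdout));

// Direct invocation, e.g. from a one-off maintenance script:
const result = await processTopicAnalysis({ userId: "user_123", minTopicSize: 10 });
console.log(`${Object.keys(result.topics).length} topics found`);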
@@ -7,6 +7,10 @@ import { prisma } from "~/trigger/utils/prisma";
import { EpisodeType } from "@core/types";
import { deductCredits, hasCredits } from "~/trigger/utils/utils";
import { assignEpisodesToSpace } from "~/services/graphModels/space";
import {
  shouldTriggerTopicAnalysis,
  updateLastTopicAnalysisTime,
} from "~/services/bertTopicAnalysis.server";

export const IngestBodyRequest = z.object({
  episodeBody: z.string(),
@@ -55,6 +59,11 @@ export async function processEpisodeIngestion(
    sessionId: string;
    source: string;
  }) => Promise<any>,
  enqueueBertTopicAnalysis?: (params: {
    userId: string;
    minTopicSize?: number;
    nrTopics?: number;
  }) => Promise<any>,
): Promise<IngestEpisodeResult> {
  try {
    logger.log(`Processing job for user ${payload.userId}`);
@@ -250,6 +259,43 @@ export async function processEpisodeIngestion(
      });
    }

    // Auto-trigger BERT topic analysis if threshold met (20+ new episodes)
    try {
      if (
        currentStatus === IngestionStatus.COMPLETED &&
        enqueueBertTopicAnalysis
      ) {
        const shouldTrigger = await shouldTriggerTopicAnalysis(
          payload.userId,
          payload.workspaceId,
        );

        if (shouldTrigger) {
          logger.info(
            `Triggering BERT topic analysis after reaching 20+ new episodes`,
            {
              userId: payload.userId,
              workspaceId: payload.workspaceId,
            },
          );

          await enqueueBertTopicAnalysis({
            userId: payload.userId,
            minTopicSize: 10,
          });

          // Update the last analysis timestamp
          await updateLastTopicAnalysisTime(payload.workspaceId);
        }
      }
    } catch (topicAnalysisError) {
      // Don't fail the ingestion if topic analysis fails
      logger.warn(`Failed to trigger topic analysis after ingestion:`, {
        error: topicAnalysisError,
        userId: payload.userId,
      });
    }

    return { success: true, episodeDetails };
  } catch (err: any) {
    await prisma.ingestionQueue.update({
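The BullMQ worker that calls processEpisodeIngestion is not shown in this hunk; the new optional parameter only requires a callback with the shape below. A sketch under that assumption (the module path and names are illustrative):

import { enqueueBertTopicAnalysis } from "~/services/queue.server"; // assumed path of the enqueue helper added below

// Any function with this shape satisfies the new optional parameter;
// the BullMQ ingest worker can simply forward to the shared enqueue helper.
type BertTopicCallback = (params: {
  userId: string;
  minTopicSize?: number;
  nrTopics?: number;
}) => Promise<any>;

const bertTopicCallback: BertTopicCallback = (params) =>
  enqueueBertTopicAnalysis(params);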
@@ -163,6 +163,39 @@ export async function enqueueSpaceAssignment(
  }
}

/**
 * Enqueue BERT topic analysis job
 */
export async function enqueueBertTopicAnalysis(payload: {
  userId: string;
  minTopicSize?: number;
  nrTopics?: number;
}): Promise<{ id?: string }> {
  const provider = env.QUEUE_PROVIDER as QueueProvider;

  if (provider === "trigger") {
    const { bertTopicAnalysisTask } = await import(
      "~/trigger/bert/topic-analysis"
    );
    const handler = await bertTopicAnalysisTask.trigger(payload, {
      queue: "bert-topic-analysis",
      concurrencyKey: payload.userId,
      tags: [payload.userId, "bert-analysis"],
    });
    return { id: handler.id };
  } else {
    // BullMQ
    const { bertTopicQueue } = await import("~/bullmq/queues");
    const job = await bertTopicQueue.add("topic-analysis", payload, {
      jobId: `bert-${payload.userId}-${Date.now()}`,
      attempts: 2, // Only 2 attempts for expensive operations
      backoff: { type: "exponential", delay: 5000 },
      timeout: 300000, // 5 minute timeout
    });
    return { id: job.id };
  }
}

export const isTriggerDeployment = () => {
  return env.QUEUE_PROVIDER === "trigger";
};
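A usage sketch for the helper above, e.g. from an admin route or maintenance script (the user id is illustrative, and the import path assumes the helper lives in ~/services/queue.server):

import { enqueueBertTopicAnalysis } from "~/services/queue.server"; // assumed module path

// The provider branch above decides whether this lands on Trigger.dev
// or on the BullMQ "bert-topic-queue".
const { id } = await enqueueBertTopicAnalysis({
  userId: "user_123", // illustrative
  minTopicSize: 10,
});
console.log(`Queued BERT topic analysis job: ${id ?? "(no id returned)"}`);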
107 apps/webapp/app/services/bertTopicAnalysis.server.ts Normal file
@@ -0,0 +1,107 @@
import { prisma } from "~/trigger/utils/prisma";
import { logger } from "~/services/logger.service";
import { runQuery } from "~/lib/neo4j.server";

interface WorkspaceMetadata {
  lastTopicAnalysisAt?: string;
  [key: string]: any;
}

/**
 * Check if we should trigger a BERT topic analysis for this workspace
 * Criteria: 20+ new episodes since last analysis (or no previous analysis)
 */
export async function shouldTriggerTopicAnalysis(
  userId: string,
  workspaceId: string,
): Promise<boolean> {
  try {
    // Get workspace metadata
    const workspace = await prisma.workspace.findUnique({
      where: { id: workspaceId },
      select: { metadata: true },
    });

    if (!workspace) {
      logger.warn(`Workspace not found: ${workspaceId}`);
      return false;
    }

    const metadata = (workspace.metadata || {}) as WorkspaceMetadata;
    const lastAnalysisAt = metadata.lastTopicAnalysisAt;

    // Count episodes since last analysis
    const query = lastAnalysisAt
      ? `
        MATCH (e:Episode {userId: $userId})
        WHERE e.createdAt > datetime($lastAnalysisAt)
        RETURN count(e) as newEpisodeCount
      `
      : `
        MATCH (e:Episode {userId: $userId})
        RETURN count(e) as totalEpisodeCount
      `;

    const result = await runQuery(query, {
      userId,
      lastAnalysisAt,
    });

    const episodeCount = lastAnalysisAt
      ? result[0]?.get("newEpisodeCount")?.toNumber() || 0
      : result[0]?.get("totalEpisodeCount")?.toNumber() || 0;

    logger.info(
      `[Topic Analysis Check] User: ${userId}, New episodes: ${episodeCount}, Last analysis: ${lastAnalysisAt || "never"}`,
    );

    // Trigger if 20+ new episodes
    return episodeCount >= 20;
  } catch (error) {
    logger.error(
      `[Topic Analysis Check] Error checking episode count:`,
      error,
    );
    return false;
  }
}

/**
 * Update workspace metadata with last topic analysis timestamp
 */
export async function updateLastTopicAnalysisTime(
  workspaceId: string,
): Promise<void> {
  try {
    const workspace = await prisma.workspace.findUnique({
      where: { id: workspaceId },
      select: { metadata: true },
    });

    if (!workspace) {
      logger.warn(`Workspace not found: ${workspaceId}`);
      return;
    }

    const metadata = (workspace.metadata || {}) as WorkspaceMetadata;

    await prisma.workspace.update({
      where: { id: workspaceId },
      data: {
        metadata: {
          ...metadata,
          lastTopicAnalysisAt: new Date().toISOString(),
        },
      },
    });

    logger.info(
      `[Topic Analysis] Updated last analysis timestamp for workspace: ${workspaceId}`,
    );
  } catch (error) {
    logger.error(
      `[Topic Analysis] Error updating last analysis timestamp:`,
      error,
    );
  }
}
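Together with the enqueue helper, the two functions above form a simple threshold gate. A sketch of the intended call ordering (the ids are illustrative, and the enqueue import path is assumed):

import {
  shouldTriggerTopicAnalysis,
  updateLastTopicAnalysisTime,
} from "~/services/bertTopicAnalysis.server";
import { enqueueBertTopicAnalysis } from "~/services/queue.server"; // assumed path

const userId = "user_123";    // illustrative
const workspaceId = "ws_456"; // illustrative

// With no lastTopicAnalysisAt in Workspace.metadata, every episode counts;
// afterwards only episodes newer than that timestamp count toward the 20-episode threshold.
if (await shouldTriggerTopicAnalysis(userId, workspaceId)) {
  await enqueueBertTopicAnalysis({ userId, minTopicSize: 10 });
  await updateLastTopicAnalysisTime(workspaceId); // same ordering as processEpisodeIngestion above
}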
21 apps/webapp/app/trigger/bert/topic-analysis.ts Normal file
@@ -0,0 +1,21 @@
import { task } from "@trigger.dev/sdk/v3";
import {
  processTopicAnalysis,
  type TopicAnalysisPayload,
} from "~/jobs/bert/topic-analysis.logic";

/**
 * Trigger.dev task for BERT topic analysis
 *
 * This is a thin wrapper around the common logic in jobs/bert/topic-analysis.logic.ts
 */
export const bertTopicAnalysisTask = task({
  id: "bert-topic-analysis",
  queue: {
    name: "bert-topic-analysis",
    concurrencyLimit: 3, // Max 3 parallel analyses to avoid CPU overload
  },
  run: async (payload: TopicAnalysisPayload) => {
    return await processTopicAnalysis(payload);
  },
});
@@ -6,6 +6,7 @@ import {
} from "~/jobs/ingest/ingest-episode.logic";
import { triggerSpaceAssignment } from "../spaces/space-assignment";
import { triggerSessionCompaction } from "../session/session-compaction";
import { bertTopicAnalysisTask } from "../bert/topic-analysis";

const ingestionQueue = queue({
  name: "ingestion-queue",
@@ -32,6 +33,14 @@ export const ingestTask = task({
      async (params) => {
        await triggerSessionCompaction(params);
      },
      // Callback for BERT topic analysis
      async (params) => {
        await bertTopicAnalysisTask.trigger(params, {
          queue: "bert-topic-analysis",
          concurrencyKey: params.userId,
          tags: [params.userId, "bert-analysis"],
        });
      },
    );
  },
});
@@ -1,557 +0,0 @@
import { task } from "@trigger.dev/sdk/v3";
import { logger } from "~/services/logger.service";
import { makeModelCall } from "~/lib/model.server";
import { runQuery } from "~/lib/neo4j.server";
import type { CoreMessage } from "ai";
import { z } from "zod";
import {
  EXPLICIT_PATTERN_TYPES,
  IMPLICIT_PATTERN_TYPES,
  type SpacePattern,
  type PatternDetectionResult,
} from "@core/types";
import { createSpacePattern, getSpace } from "../utils/space-utils";

interface SpacePatternPayload {
  userId: string;
  workspaceId: string;
  spaceId: string;
  triggerSource?:
    | "summary_complete"
    | "manual"
    | "assignment"
    | "scheduled"
    | "new_space"
    | "growth_threshold"
    | "ingestion_complete";
}

interface SpaceStatementData {
  uuid: string;
  fact: string;
  subject: string;
  predicate: string;
  object: string;
  createdAt: Date;
  validAt: Date;
  content?: string; // For implicit pattern analysis
}

interface SpaceThemeData {
  themes: string[];
  summary: string;
}

// Zod schemas for LLM response validation
const ExplicitPatternSchema = z.object({
  name: z.string(),
  type: z.string(),
  summary: z.string(),
  evidence: z.array(z.string()),
  confidence: z.number().min(0).max(1),
});

const ImplicitPatternSchema = z.object({
  name: z.string(),
  type: z.string(),
  summary: z.string(),
  evidence: z.array(z.string()),
  confidence: z.number().min(0).max(1),
});

const PatternAnalysisSchema = z.object({
  explicitPatterns: z.array(ExplicitPatternSchema),
  implicitPatterns: z.array(ImplicitPatternSchema),
});

const CONFIG = {
  minStatementsForPatterns: 5,
  maxPatternsPerSpace: 20,
  minPatternConfidence: 0.85,
};

export const spacePatternTask = task({
  id: "space-pattern",
  run: async (payload: SpacePatternPayload) => {
    const { userId, workspaceId, spaceId, triggerSource = "manual" } = payload;

    logger.info(`Starting space pattern detection`, {
      userId,
      workspaceId,
      spaceId,
      triggerSource,
    });

    try {
      // Get space data and check if it has enough content
      const space = await getSpaceForPatternAnalysis(spaceId);
      if (!space) {
        return {
          success: false,
          spaceId,
          error: "Space not found or insufficient data",
        };
      }

      // Get statements for pattern analysis
      const statements = await getSpaceStatementsForPatterns(spaceId, userId);

      if (statements.length < CONFIG.minStatementsForPatterns) {
        logger.info(
          `Space ${spaceId} has insufficient statements (${statements.length}) for pattern detection`,
        );
        return {
          success: true,
          spaceId,
          triggerSource,
          patterns: {
            explicitPatterns: [],
            implicitPatterns: [],
            totalPatternsFound: 0,
          },
        };
      }

      // Detect patterns
      const patternResult = await detectSpacePatterns(space, statements);

      if (patternResult) {
        // Store patterns
        await storePatterns(
          patternResult.explicitPatterns,
          patternResult.implicitPatterns,
          spaceId,
        );

        logger.info(`Generated patterns for space ${spaceId}`, {
          explicitPatterns: patternResult.explicitPatterns.length,
          implicitPatterns: patternResult.implicitPatterns.length,
          totalPatterns: patternResult.totalPatternsFound,
          triggerSource,
        });

        return {
          success: true,
          spaceId,
          triggerSource,
          patterns: {
            explicitPatterns: patternResult.explicitPatterns.length,
            implicitPatterns: patternResult.implicitPatterns.length,
            totalPatternsFound: patternResult.totalPatternsFound,
          },
        };
      } else {
        logger.warn(`Failed to detect patterns for space ${spaceId}`);
        return {
          success: false,
          spaceId,
          triggerSource,
          error: "Failed to detect patterns",
        };
      }
    } catch (error) {
      logger.error(
        `Error in space pattern detection for space ${spaceId}:`,
        error as Record<string, unknown>,
      );
      throw error;
    }
  },
});
async function getSpaceForPatternAnalysis(
  spaceId: string,
): Promise<SpaceThemeData | null> {
  try {
    const space = await getSpace(spaceId);

    if (!space || !space.themes || space.themes.length === 0) {
      logger.warn(
        `Space ${spaceId} not found or has no themes for pattern analysis`,
      );
      return null;
    }

    return {
      themes: space.themes,
      summary: space.summary || "",
    };
  } catch (error) {
    logger.error(
      `Error getting space for pattern analysis:`,
      error as Record<string, unknown>,
    );
    return null;
  }
}

async function getSpaceStatementsForPatterns(
  spaceId: string,
  userId: string,
): Promise<SpaceStatementData[]> {
  const query = `
    MATCH (s:Statement)
    WHERE s.userId = $userId
      AND s.spaceIds IS NOT NULL
      AND $spaceId IN s.spaceIds
      AND s.invalidAt IS NULL
    MATCH (s)-[:HAS_SUBJECT]->(subj:Entity)
    MATCH (s)-[:HAS_PREDICATE]->(pred:Entity)
    MATCH (s)-[:HAS_OBJECT]->(obj:Entity)
    RETURN s, subj.name as subject, pred.name as predicate, obj.name as object
    ORDER BY s.createdAt DESC
  `;

  const result = await runQuery(query, {
    spaceId,
    userId,
  });

  return result.map((record) => {
    const statement = record.get("s").properties;
    return {
      uuid: statement.uuid,
      fact: statement.fact,
      subject: record.get("subject"),
      predicate: record.get("predicate"),
      object: record.get("object"),
      createdAt: new Date(statement.createdAt),
      validAt: new Date(statement.validAt),
      content: statement.fact, // Use fact as content for implicit analysis
    };
  });
}
async function detectSpacePatterns(
  space: SpaceThemeData,
  statements: SpaceStatementData[],
): Promise<PatternDetectionResult | null> {
  try {
    // Extract explicit patterns from themes
    const explicitPatterns = await extractExplicitPatterns(
      space.themes,
      space.summary,
      statements,
    );

    // Extract implicit patterns from statement analysis
    const implicitPatterns = await extractImplicitPatterns(statements);

    return {
      explicitPatterns,
      implicitPatterns,
      totalPatternsFound: explicitPatterns.length + implicitPatterns.length,
      processingStats: {
        statementsAnalyzed: statements.length,
        themesProcessed: space.themes.length,
        implicitPatternsExtracted: implicitPatterns.length,
      },
    };
  } catch (error) {
    logger.error(
      "Error detecting space patterns:",
      error as Record<string, unknown>,
    );
    return null;
  }
}

async function extractExplicitPatterns(
  themes: string[],
  summary: string,
  statements: SpaceStatementData[],
): Promise<Omit<SpacePattern, "id" | "createdAt" | "updatedAt" | "spaceId">[]> {
  if (themes.length === 0) return [];

  const prompt = createExplicitPatternPrompt(themes, summary, statements);

  // Pattern extraction requires HIGH complexity (insight synthesis, pattern recognition)
  let responseText = "";
  await makeModelCall(false, prompt, (text: string) => {
    responseText = text;
  }, undefined, 'high');

  const patterns = parseExplicitPatternResponse(responseText);

  return patterns.map((pattern) => ({
    name: pattern.name || `${pattern.type} pattern`,
    source: "explicit" as const,
    type: pattern.type,
    summary: pattern.summary,
    evidence: pattern.evidence,
    confidence: pattern.confidence,
    userConfirmed: "pending" as const,
  }));
}

async function extractImplicitPatterns(
  statements: SpaceStatementData[],
): Promise<Omit<SpacePattern, "id" | "createdAt" | "updatedAt" | "spaceId">[]> {
  if (statements.length < CONFIG.minStatementsForPatterns) return [];

  const prompt = createImplicitPatternPrompt(statements);

  // Implicit pattern discovery requires HIGH complexity (pattern recognition from statements)
  let responseText = "";
  await makeModelCall(false, prompt, (text: string) => {
    responseText = text;
  }, undefined, 'high');

  const patterns = parseImplicitPatternResponse(responseText);

  return patterns.map((pattern) => ({
    name: pattern.name || `${pattern.type} pattern`,
    source: "implicit" as const,
    type: pattern.type,
    summary: pattern.summary,
    evidence: pattern.evidence,
    confidence: pattern.confidence,
    userConfirmed: "pending" as const,
  }));
}
function createExplicitPatternPrompt(
  themes: string[],
  summary: string,
  statements: SpaceStatementData[],
): CoreMessage[] {
  const statementsText = statements
    .map((stmt) => `[${stmt.uuid}] ${stmt.fact}`)
    .join("\n");

  const explicitTypes = Object.values(EXPLICIT_PATTERN_TYPES).join('", "');

  return [
    {
      role: "system",
      content: `You are an expert at extracting structured patterns from themes and supporting evidence.

Your task is to convert high-level themes into explicit patterns with supporting statement evidence.

INSTRUCTIONS:
1. For each theme, create a pattern that explains what it reveals about the user
2. Give each pattern a short, descriptive name (2-4 words)
3. Find supporting statement IDs that provide evidence for each pattern
4. Assess confidence based on evidence strength and theme clarity
5. Use appropriate pattern types from these guidelines: "${explicitTypes}"
   - "theme": High-level thematic content areas
   - "topic": Specific subject matter or topics of interest
   - "domain": Knowledge or work domains the user operates in
   - "interest_area": Areas of personal interest or hobby
6. You may suggest new pattern types if none of the guidelines fit well

RESPONSE FORMAT:
Provide your response inside <output></output> tags with valid JSON.

<output>
{
  "explicitPatterns": [
    {
      "name": "Short descriptive name for the pattern",
      "type": "theme",
      "summary": "Description of what this pattern reveals about the user",
      "evidence": ["statement_id_1", "statement_id_2"],
      "confidence": 0.85
    }
  ]
}
</output>`,
    },
    {
      role: "user",
      content: `THEMES TO ANALYZE:
${themes.map((theme, i) => `${i + 1}. ${theme}`).join("\n")}

SPACE SUMMARY:
${summary}

SUPPORTING STATEMENTS:
${statementsText}

Please extract explicit patterns from these themes and map them to supporting statement evidence.`,
    },
  ];
}

function createImplicitPatternPrompt(
  statements: SpaceStatementData[],
): CoreMessage[] {
  const statementsText = statements
    .map(
      (stmt) =>
        `[${stmt.uuid}] ${stmt.fact} (${stmt.subject} → ${stmt.predicate} → ${stmt.object})`,
    )
    .join("\n");

  const implicitTypes = Object.values(IMPLICIT_PATTERN_TYPES).join('", "');

  return [
    {
      role: "system",
      content: `You are an expert at discovering implicit behavioral patterns from statement analysis.

Your task is to identify hidden patterns in user behavior, preferences, and habits from statement content.

INSTRUCTIONS:
1. Analyze statement content for behavioral patterns, not explicit topics
2. Give each pattern a short, descriptive name (2-4 words)
3. Look for recurring behaviors, preferences, and working styles
4. Identify how the user approaches tasks, makes decisions, and interacts
5. Use appropriate pattern types from these guidelines: "${implicitTypes}"
   - "preference": Personal preferences and choices
   - "habit": Recurring behaviors and routines
   - "workflow": Work and process patterns
   - "communication_style": How user communicates and expresses ideas
   - "decision_pattern": Decision-making approaches and criteria
   - "temporal_pattern": Time-based behavioral patterns
   - "behavioral_pattern": General behavioral tendencies
   - "learning_style": How user learns and processes information
   - "collaboration_style": How user works with others
6. You may suggest new pattern types if none of the guidelines fit well
7. Focus on what the statements reveal about how the user thinks, works, or behaves

RESPONSE FORMAT:
Provide your response inside <output></output> tags with valid JSON.

<output>
{
  "implicitPatterns": [
    {
      "name": "Short descriptive name for the pattern",
      "type": "preference",
      "summary": "Description of what this behavioral pattern reveals",
      "evidence": ["statement_id_1", "statement_id_2"],
      "confidence": 0.75
    }
  ]
}
</output>`,
    },
    {
      role: "user",
      content: `STATEMENTS TO ANALYZE FOR IMPLICIT PATTERNS:
${statementsText}

Please identify implicit behavioral patterns, preferences, and habits from these statements.`,
    },
  ];
}
function parseExplicitPatternResponse(response: string): Array<{
  name: string;
  type: string;
  summary: string;
  evidence: string[];
  confidence: number;
}> {
  try {
    const outputMatch = response.match(/<output>([\s\S]*?)<\/output>/);
    if (!outputMatch) {
      logger.warn("No <output> tags found in explicit pattern response");
      return [];
    }

    const parsed = JSON.parse(outputMatch[1].trim());
    const validationResult = z
      .object({
        explicitPatterns: z.array(ExplicitPatternSchema),
      })
      .safeParse(parsed);

    if (!validationResult.success) {
      logger.warn("Invalid explicit pattern response format:", {
        error: validationResult.error,
      });
      return [];
    }

    return validationResult.data.explicitPatterns.filter(
      (p) =>
        p.confidence >= CONFIG.minPatternConfidence && p.evidence.length >= 3, // Ensure at least 3 evidence statements
    );
  } catch (error) {
    logger.error(
      "Error parsing explicit pattern response:",
      error as Record<string, unknown>,
    );
    return [];
  }
}

function parseImplicitPatternResponse(response: string): Array<{
  name: string;
  type: string;
  summary: string;
  evidence: string[];
  confidence: number;
}> {
  try {
    const outputMatch = response.match(/<output>([\s\S]*?)<\/output>/);
    if (!outputMatch) {
      logger.warn("No <output> tags found in implicit pattern response");
      return [];
    }

    const parsed = JSON.parse(outputMatch[1].trim());
    const validationResult = z
      .object({
        implicitPatterns: z.array(ImplicitPatternSchema),
      })
      .safeParse(parsed);

    if (!validationResult.success) {
      logger.warn("Invalid implicit pattern response format:", {
        error: validationResult.error,
      });
      return [];
    }

    return validationResult.data.implicitPatterns.filter(
      (p) =>
        p.confidence >= CONFIG.minPatternConfidence && p.evidence.length >= 3, // Ensure at least 3 evidence statements
    );
  } catch (error) {
    logger.error(
      "Error parsing implicit pattern response:",
      error as Record<string, unknown>,
    );
    return [];
  }
}
async function storePatterns(
  explicitPatterns: Omit<
    SpacePattern,
    "id" | "createdAt" | "updatedAt" | "spaceId"
  >[],
  implicitPatterns: Omit<
    SpacePattern,
    "id" | "createdAt" | "updatedAt" | "spaceId"
  >[],
  spaceId: string,
): Promise<void> {
  try {
    const allPatterns = [...explicitPatterns, ...implicitPatterns];

    if (allPatterns.length === 0) return;

    // Store in PostgreSQL
    await createSpacePattern(spaceId, allPatterns);

    logger.info(`Stored ${allPatterns.length} patterns`, {
      explicit: explicitPatterns.length,
      implicit: implicitPatterns.length,
    });
  } catch (error) {
    logger.error("Error storing patterns:", error as Record<string, unknown>);
    throw error;
  }
}

// Helper function to trigger the task
export async function triggerSpacePattern(payload: SpacePatternPayload) {
  return await spacePatternTask.trigger(payload, {
    concurrencyKey: `space-pattern-${payload.spaceId}`, // Prevent parallel runs for the same space
    tags: [payload.userId, payload.spaceId, payload.triggerSource || "manual"],
  });
}
@@ -19,24 +19,24 @@ const SearchParamsSchema = {
    description:
      "Search query optimized for knowledge graph retrieval. Choose the right query structure based on your search intent:\n\n" +
      "1. **Entity-Centric Queries** (Best for graph search):\n" +
      " - ✅ GOOD: \"User's preferences for code style and formatting\"\n" +
      " - ✅ GOOD: \"Project authentication implementation decisions\"\n" +
      " - ❌ BAD: \"user code style\"\n" +
      ' - ✅ GOOD: "User\'s preferences for code style and formatting"\n' +
      ' - ✅ GOOD: "Project authentication implementation decisions"\n' +
      ' - ❌ BAD: "user code style"\n' +
      " - Format: [Person/Project] + [relationship/attribute] + [context]\n\n" +
      "2. **Multi-Entity Relationship Queries** (Excellent for episode graph):\n" +
      " - ✅ GOOD: \"User and team discussions about API design patterns\"\n" +
      " - ✅ GOOD: \"relationship between database schema and performance optimization\"\n" +
      " - ❌ BAD: \"user team api design\"\n" +
      ' - ✅ GOOD: "User and team discussions about API design patterns"\n' +
      ' - ✅ GOOD: "relationship between database schema and performance optimization"\n' +
      ' - ❌ BAD: "user team api design"\n' +
      " - Format: [Entity1] + [relationship type] + [Entity2] + [context]\n\n" +
      "3. **Semantic Question Queries** (Good for vector search):\n" +
      " - ✅ GOOD: \"What causes authentication errors in production? What are the security requirements?\"\n" +
      " - ✅ GOOD: \"How does caching improve API response times compared to direct database queries?\"\n" +
      " - ❌ BAD: \"auth errors production\"\n" +
      ' - ✅ GOOD: "What causes authentication errors in production? What are the security requirements?"\n' +
      ' - ✅ GOOD: "How does caching improve API response times compared to direct database queries?"\n' +
      ' - ❌ BAD: "auth errors production"\n' +
      " - Format: Complete natural questions with full context\n\n" +
      "4. **Concept Exploration Queries** (Good for BFS traversal):\n" +
      " - ✅ GOOD: \"concepts and ideas related to database indexing and query optimization\"\n" +
      " - ✅ GOOD: \"topics connected to user authentication and session management\"\n" +
      " - ❌ BAD: \"database indexing concepts\"\n" +
      ' - ✅ GOOD: "concepts and ideas related to database indexing and query optimization"\n' +
      ' - ✅ GOOD: "topics connected to user authentication and session management"\n' +
      ' - ❌ BAD: "database indexing concepts"\n' +
      " - Format: [concept] + related/connected + [domain/context]\n\n" +
      "Avoid keyword soup queries - use complete phrases with proper context for best results.",
  },
@@ -462,7 +462,7 @@ async function handleGetSpace(args: any) {
    const spaceDetails = {
      id: space.id,
      name: space.name,
      description: space.description,
      summary: space.summary,
    };

    return {
@@ -55,7 +55,16 @@ RUN pnpm run build --filter=webapp...

# Runner
FROM ${NODE_IMAGE} AS runner
RUN apt-get update && apt-get install -y openssl netcat-openbsd ca-certificates
RUN apt-get update && apt-get install -y \
    openssl \
    netcat-openbsd \
    ca-certificates \
    python3 \
    python3-pip \
    python3-venv \
    gcc \
    g++ \
    && rm -rf /var/lib/apt/lists/*
WORKDIR /core
RUN corepack enable
ENV NODE_ENV production
@@ -69,6 +78,13 @@ COPY --from=builder --chown=node:node /core/apps/webapp/build ./apps/webapp/buil
COPY --from=builder --chown=node:node /core/apps/webapp/public ./apps/webapp/public
COPY --from=builder --chown=node:node /core/scripts ./scripts

# Install BERT Python dependencies
COPY --chown=node:node apps/webapp/app/bert/requirements.txt ./apps/webapp/app/bert/requirements.txt
RUN pip3 install --no-cache-dir -r ./apps/webapp/app/bert/requirements.txt

# Copy BERT scripts
COPY --chown=node:node apps/webapp/app/bert/main.py ./apps/webapp/app/bert/main.py

EXPOSE 3000

USER node
@@ -0,0 +1,2 @@
-- AlterTable
ALTER TABLE "Workspace" ADD COLUMN "metadata" JSONB NOT NULL DEFAULT '{}';
@@ -694,6 +694,8 @@ model Workspace {
  slug String @unique
  icon String?

  metadata Json @default("{}")

  integrations String[]

  userId String? @unique
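A minimal sketch of reading the new column through Prisma once the client is regenerated against this schema (the slug is illustrative); existing rows pick up '{}' from the migration default:

import { PrismaClient, Prisma } from "@prisma/client";

const prisma = new PrismaClient();

const workspace = await prisma.workspace.findUniqueOrThrow({
  where: { slug: "example-workspace" }, // illustrative unique slug
  select: { metadata: true },
});

// metadata comes back as a plain JSON value; narrow it before use.
const metadata = (workspace.metadata ?? {}) as Prisma.JsonObject;
console.log(metadata["lastTopicAnalysisAt"] ?? "never analysed");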