diff --git a/.gitignore b/.gitignore index 2ea9e1a..eb6ca66 100644 --- a/.gitignore +++ b/.gitignore @@ -166,3 +166,69 @@ cython_debug/ #.idea/ links.csv *.xcuserstate +node_modules/ +.node_modules/ +built/* +tests/cases/rwc/* +tests/cases/perf/* +!tests/cases/webharness/compilerToString.js +test-args.txt +~*.docx +\#*\# +.\#* +tests/baselines/local/* +tests/baselines/local.old/* +tests/services/baselines/local/* +tests/baselines/prototyping/local/* +tests/baselines/rwc/* +tests/baselines/reference/projectOutput/* +tests/baselines/local/projectOutput/* +tests/baselines/reference/testresults.tap +tests/baselines/symlinks/* +tests/services/baselines/prototyping/local/* +tests/services/browser/typescriptServices.js +src/harness/*.js +src/compiler/diagnosticInformationMap.generated.ts +src/compiler/diagnosticMessages.generated.json +src/parser/diagnosticInformationMap.generated.ts +src/parser/diagnosticMessages.generated.json +rwc-report.html +*.swp +build.json +*.actual +tests/webTestServer.js +tests/webTestServer.js.map +tests/webhost/*.d.ts +tests/webhost/webtsc.js +tests/cases/**/*.js +tests/cases/**/*.js.map +*.config +scripts/eslint/built/ +scripts/debug.bat +scripts/run.bat +scripts/**/*.js +scripts/**/*.js.map +coverage/ +internal/ +**/.DS_Store +.settings +**/.vs +**/.vscode/* +!**/.vscode/tasks.json +!**/.vscode/settings.template.json +!**/.vscode/launch.template.json +!**/.vscode/extensions.json +!tests/cases/projects/projectOption/**/node_modules +!tests/cases/projects/NodeModulesSearch/**/* +!tests/baselines/reference/project/nodeModules*/**/* +.idea +yarn.lock +yarn-error.log +.parallelperf.* +tests/baselines/reference/dt +.failed-tests +TEST-results.xml +package-lock.json +.eslintcache +*v8.log +/lib/ \ No newline at end of file diff --git a/RealJeff/Jeff.xcodeproj/project.xcworkspace/xcuserdata/brianprzezdziecki.xcuserdatad/UserInterfaceState.xcuserstate 
b/RealJeff/Jeff.xcodeproj/project.xcworkspace/xcuserdata/brianprzezdziecki.xcuserdatad/UserInterfaceState.xcuserstate index 2c0162f..b81b8c2 100644 Binary files a/RealJeff/Jeff.xcodeproj/project.xcworkspace/xcuserdata/brianprzezdziecki.xcuserdatad/UserInterfaceState.xcuserstate and b/RealJeff/Jeff.xcodeproj/project.xcworkspace/xcuserdata/brianprzezdziecki.xcuserdatad/UserInterfaceState.xcuserstate differ diff --git a/server/.gitignore b/server/.gitignore new file mode 100644 index 0000000..11ddd8d --- /dev/null +++ b/server/.gitignore @@ -0,0 +1,3 @@ +node_modules +# Keep environment variables out of version control +.env diff --git a/server/package.json b/server/package.json new file mode 100644 index 0000000..0a17611 --- /dev/null +++ b/server/package.json @@ -0,0 +1,34 @@ +{ + "name": "server", + "version": "1.0.0", + "main": "index.js", + "license": "MIT", + "devDependencies": { + "@types/express": "^4.17.21", + "@types/marked": "^6.0.0", + "ts-node": "^10.9.2", + "typescript": "^5.5.4" + }, + "dependencies": { + "@prisma/client": "^5.19.1", + "axios": "^1.7.7", + "dotenv": "^16.4.5", + "express": "^4.19.2", + "google-auth-library": "^9.14.1", + "googleapis": "^144.0.0", + "gpt-3-encoder": "^1.1.4", + "inversify": "^6.0.2", + "marked": "^14.1.2", + "openai": "^4.58.1", + "pdf-lib": "^1.17.1", + "pdf2pic": "^3.1.3", + "pgvector": "^0.2.0", + "prisma": "^5.19.1", + "tesseract.js": "^5.1.1" + }, + "scripts": { + "start": "ts-node src/server.ts", + "generate": "yarn prisma generate", + "ingest": "ts-node src/scripts/ingest.ts" + } +} diff --git a/server/prisma/migrations/20240909154453_init/migration.sql b/server/prisma/migrations/20240909154453_init/migration.sql new file mode 100644 index 0000000..1128782 --- /dev/null +++ b/server/prisma/migrations/20240909154453_init/migration.sql @@ -0,0 +1,63 @@ +-- CreateExtension +CREATE EXTENSION IF NOT EXISTS "vector"; + +-- CreateEnum +CREATE TYPE "ContentType" AS ENUM ('EMAIL', 'FILE', 'LINK'); + +-- CreateTable +CREATE 
TABLE "Email" ( + "id" UUID NOT NULL, + "sender" TEXT NOT NULL, + "subject" TEXT NOT NULL, + "body" TEXT NOT NULL, + "messageId" TEXT NOT NULL, + + CONSTRAINT "Email_pkey" PRIMARY KEY ("id") +); + +-- CreateTable +CREATE TABLE "File" ( + "id" UUID NOT NULL, + "name" TEXT NOT NULL, + "path" TEXT NOT NULL, + "content" TEXT NOT NULL, + "contentHash" VARCHAR(64) NOT NULL, + + CONSTRAINT "File_pkey" PRIMARY KEY ("id") +); + +-- CreateTable +CREATE TABLE "Link" ( + "id" UUID NOT NULL, + "url" TEXT NOT NULL, + "title" TEXT NOT NULL, + "content" TEXT NOT NULL, + + CONSTRAINT "Link_pkey" PRIMARY KEY ("id") +); + +-- CreateTable ("embedding" is the pgvector column that embeddingDao.ts inserts into and orders by through raw SQL) +CREATE TABLE "Embedding" ( + "id" UUID NOT NULL, + "contentType" "ContentType" NOT NULL, + "emailId" UUID, + "fileId" UUID, + "linkId" UUID, + "embedding" vector(3072), + CONSTRAINT "Embedding_pkey" PRIMARY KEY ("id") +); + +-- CreateIndex +CREATE UNIQUE INDEX "Email_messageId_key" ON "Email"("messageId"); + +-- CreateIndex +CREATE INDEX "File_contentHash_idx" ON "File"("contentHash"); + +-- AddForeignKey +ALTER TABLE "Embedding" ADD CONSTRAINT "Embedding_fileId_fkey" FOREIGN KEY ("fileId") REFERENCES "File"("id") ON DELETE CASCADE ON UPDATE CASCADE; + +-- AddForeignKey +ALTER TABLE "Embedding" ADD CONSTRAINT "Embedding_emailId_fkey" FOREIGN KEY ("emailId") REFERENCES "Email"("id") ON DELETE CASCADE ON UPDATE CASCADE; + +-- AddForeignKey +ALTER TABLE "Embedding" ADD CONSTRAINT "Embedding_linkId_fkey" FOREIGN KEY ("linkId") REFERENCES "Link"("id") ON DELETE CASCADE ON UPDATE CASCADE; diff --git a/server/prisma/migrations/migration_lock.toml b/server/prisma/migrations/migration_lock.toml new file mode 100644 index 0000000..fbffa92 --- /dev/null +++ b/server/prisma/migrations/migration_lock.toml @@ -0,0 +1,3 @@ +# Please do not edit this file manually +# It should be added in your version-control system (i.e.
Git) +provider = "postgresql" \ No newline at end of file diff --git a/server/prisma/schema.prisma b/server/prisma/schema.prisma new file mode 100644 index 0000000..dd8c331 --- /dev/null +++ b/server/prisma/schema.prisma @@ -0,0 +1,63 @@ +// This is your Prisma schema file, +// learn more about it in the docs: https://pris.ly/d/prisma-schema + +// Looking for ways to speed up your queries, or scale easily with your serverless or edge functions? +// Try Prisma Accelerate: https://pris.ly/cli/accelerate-init + +generator client { + provider = "prisma-client-js" + previewFeatures = ["postgresqlExtensions"] // Enable the postgresqlExtensions. Currently in preview +} + +datasource db { + provider = "postgresql" + url = env("DATABASE_URL") + extensions = [vector] +} + +enum ContentType { + EMAIL + FILE + LINK +} + +model Email { + id String @id @default(uuid()) @db.Uuid + sender String + subject String + body String + messageId String @unique + embeddings Embedding[] +} + +model File { + id String @id @default(uuid()) @db.Uuid + name String + path String + content String + contentHash String @db.VarChar(64) + embeddings Embedding[] + + @@index([contentHash]) +} + +model Link { + id String @id @default(uuid()) @db.Uuid + url String + title String + content String + embeddings Embedding[] +} + +model Embedding { + id String @id @default(uuid()) @db.Uuid + contentType ContentType + emailId String? @db.Uuid + fileId String? @db.Uuid + linkId String? @db.Uuid + // pgvector column (3072 dimensions, the default output size of text-embedding-3-large); declared Unsupported so Prisma Migrate keeps it, and only touched through raw SQL + embedding Unsupported("vector(3072)")? + file File? @relation(fields: [fileId], references: [id], onDelete: Cascade) + email Email? @relation(fields: [emailId], references: [id], onDelete: Cascade) + link Link? 
@relation(fields: [linkId], references: [id], onDelete: Cascade) +} diff --git a/server/src/dao/emailDao.ts b/server/src/dao/emailDao.ts new file mode 100644 index 0000000..8baeacd --- /dev/null +++ b/server/src/dao/emailDao.ts @@ -0,0 +1,22 @@ +import { PrismaClient, Prisma } from '@prisma/client'; +import { container } from '../utils/container'; + +export async function createEmail(data: Prisma.EmailCreateInput) { + const prisma = container.get(PrismaClient); + return prisma.email.create({ data }); +} + +export async function getEmail(id: string) { + const prisma = container.get(PrismaClient); + return prisma.email.findUnique({ where: { id } }); +} + +export async function updateEmail(id: string, data: Prisma.EmailUpdateInput) { + const prisma = container.get(PrismaClient); + return prisma.email.update({ where: { id }, data }); +} + +export async function deleteEmail(id: string) { + const prisma = container.get(PrismaClient); + return prisma.email.delete({ where: { id } }); +} diff --git a/server/src/dao/embeddingDao.ts b/server/src/dao/embeddingDao.ts new file mode 100644 index 0000000..36b98cd --- /dev/null +++ b/server/src/dao/embeddingDao.ts @@ -0,0 +1,60 @@ +import { PrismaClient, ContentType } from '@prisma/client'; +import { container } from '../utils/container'; +import pgvector from 'pgvector'; + +// Prisma doesn't support pgvector types, so we need to use raw SQL and make our own create input types +type EmbeddingCreateInput = { + embedding: number[]; + contentType: ContentType; + emailId?: string; + fileId?: string; + linkId?: string; +}; + +// Inserts one "Embedding" row via raw SQL and resolves to $executeRaw's result. The enum and id values are bound as text parameters, which Postgres will not coerce implicitly, hence the explicit casts; ids that were not supplied are written as NULL. +export async function createEmbedding(data: EmbeddingCreateInput) { + const prisma = container.get(PrismaClient); + const embedding = pgvector.toSql(data.embedding); + return prisma.$executeRaw` + INSERT INTO "Embedding" (id, embedding, "contentType", "emailId", "fileId", "linkId") + VALUES (gen_random_uuid(), ${embedding}::vector, ${data.contentType}::"ContentType", ${data.emailId ?? null}::uuid, ${data.fileId ?? null}::uuid, ${data.linkId ?? null}::uuid) + `; +} +
+export type SimilaritySearchResult = { + content_type: ContentType; + source: string; + title: string; + content: string; + distance: number; +}; + +export async function performSimilaritySearch(queryEmbedding: number[], limit: number): Promise { + const prisma = container.get(PrismaClient); + const embedding = pgvector.toSql(queryEmbedding); + const results = await prisma.$queryRaw` + SELECT + e."contentType" as content_type, + COALESCE(em."messageId", f.path, l.url) AS source, + COALESCE(em.subject, f.name, l.title) AS title, + COALESCE(em.body, f.content, l.content) AS content, + e.embedding <-> ${embedding}::vector AS distance + FROM + "Embedding" e + LEFT JOIN "Email" em ON e."emailId" = em.id + LEFT JOIN "File" f ON e."fileId" = f.id + LEFT JOIN "Link" l ON e."linkId" = l.id + ORDER BY distance + LIMIT ${limit} + `; + + return results; +} + +export async function clearAllTables() { + const prisma = container.get(PrismaClient); + await prisma.embedding.deleteMany(); + await prisma.email.deleteMany(); + await prisma.file.deleteMany(); + await prisma.link.deleteMany(); +} \ No newline at end of file diff --git a/server/src/dao/fileDao.ts b/server/src/dao/fileDao.ts new file mode 100644 index 0000000..7e63503 --- /dev/null +++ b/server/src/dao/fileDao.ts @@ -0,0 +1,22 @@ +import { PrismaClient, Prisma } from '@prisma/client'; +import { container } from '../utils/container'; + +export async function createFile(data: Prisma.FileCreateInput) { + const prisma = container.get(PrismaClient); + return prisma.file.create({ data }); +} + +export async function getFile(id: string) { + const prisma = container.get(PrismaClient); + return prisma.file.findUnique({ where: { id } }); +} + +export async function updateFile(id: string, data: Prisma.FileUpdateInput) { + const prisma = container.get(PrismaClient); + return prisma.file.update({ where: { id }, data }); +} + +export async function deleteFile(id: string) { + const prisma = container.get(PrismaClient); + return 
prisma.file.delete({ where: { id } }); +} diff --git a/server/src/dao/linkDao.ts b/server/src/dao/linkDao.ts new file mode 100644 index 0000000..3aa9a75 --- /dev/null +++ b/server/src/dao/linkDao.ts @@ -0,0 +1,22 @@ +import { PrismaClient, Prisma } from '@prisma/client'; +import { container } from '../utils/container'; + +export async function createLink(data: Prisma.LinkCreateInput) { + const prisma = container.get(PrismaClient); + return prisma.link.create({ data }); +} + +export async function getLink(id: string) { + const prisma = container.get(PrismaClient); + return prisma.link.findUnique({ where: { id } }); +} + +export async function updateLink(id: string, data: Prisma.LinkUpdateInput) { + const prisma = container.get(PrismaClient); + return prisma.link.update({ where: { id }, data }); +} + +export async function deleteLink(id: string) { + const prisma = container.get(PrismaClient); + return prisma.link.delete({ where: { id } }); +} diff --git a/server/src/scripts/ingest.ts b/server/src/scripts/ingest.ts new file mode 100644 index 0000000..0a9db5b --- /dev/null +++ b/server/src/scripts/ingest.ts @@ -0,0 +1,94 @@ +import { container } from '../utils/container'; +import { PrismaClient } from '@prisma/client'; +import { gmailClient } from '../services/clients/gmail'; +import { processAndStoreEmail } from '../services/dataLoaders/email'; +import { processFiles } from '../services/dataLoaders/files'; +import { getContentsForUrl } from '../services/clients/exa'; +import { createLink } from '../dao/linkDao'; +import dotenv from 'dotenv'; +import fs from 'fs'; +import path from 'path'; + +dotenv.config(); + +const FOLDER_PATH = process.env.FOLDER_PATH || 'jeff_storage'; + +async function ingestRecentEmails(days: number = 7) { + const prisma = container.get(PrismaClient); + + try { + const recentEmailThreads = await gmailClient.readEmails(days); + + for (const thread of Object.values(recentEmailThreads)) { + for (const email of thread) { + try { + const storedEmail 
= await processAndStoreEmail(email); + console.log(`Stored email: ${storedEmail.id} - ${storedEmail.subject}`); + } catch (error: any) { + if (error.code === 'P2002') { + console.log(`Email already exists: ${email.subject}`); + } else { + console.error(`Error processing email: ${error}`); + } + } + } + } + } catch (error) { + console.error('Error during email ingestion:', error); + } + + console.log("Email ingestion complete."); +} + +async function ingestFiles(folderPath: string) { + try { + await processFiles(folderPath); + } catch (error) { + console.error('Error during file ingestion:', error); + } +} + +async function ingestBrowserHistory() { + try { + console.log("Getting history"); + const historyFilePath = path.join(FOLDER_PATH, 'browser_history.json'); + const historyData = fs.readFileSync(historyFilePath, 'utf8'); + const historyEntries = JSON.parse(historyData); + + for (const entry of historyEntries) { + try { + const content = await getContentsForUrl(entry.url); + await createLink({ + url: entry.url, + title: entry.title, + content: content, + }); + console.log(`Stored link: ${entry.url}`); + } catch (error) { + console.error(`Error processing link ${entry.url}:`, error); + } + } + } catch (error) { + console.error('Error during history ingestion:', error); + } +} + +async function main() { + const prisma = container.get(PrismaClient); + + // Ingest emails + await ingestRecentEmails(); + + // Ingest files + await ingestFiles(FOLDER_PATH); + + // Ingest browser history + await ingestBrowserHistory(); + + await prisma.$disconnect(); +} + +main().catch((error) => { + console.error('Error in main ingest process:', error); + process.exit(1); +}); \ No newline at end of file diff --git a/server/src/server.ts b/server/src/server.ts new file mode 100644 index 0000000..a816a3b --- /dev/null +++ b/server/src/server.ts @@ -0,0 +1,76 @@ +import express, { Request, Response } from 'express'; +import { performSimilaritySearch } from './dao/embeddingDao'; +import { 
generateAnswerSummary } from './services/openai'; +import dotenv from 'dotenv'; +import { getEmbedding } from './services/embeddings/embed'; + +dotenv.config(); + +const app = express(); +app.use(express.json()); + + +enum ItemType { + FILE = "FILE", + URL = "URL" +} + +interface SearchResult { + type: ItemType; + source: string; + title: string; + distance: number; +} + +interface SearchResponse { + results: SearchResult[]; + answer_summary: string | null; + error?: string; +} + +interface SearchRequest { + query: string; + limit?: number; +} + +app.post('/search', async (req: Request<{}, {}, SearchRequest>, res: Response) => { + const { query, limit = 5 } = req.body; + console.log(`Search endpoint accessed with query: ${query}`); + + try { + const queryEmbedding = await getEmbedding(query); + const results = await performSimilaritySearch(queryEmbedding, limit); + + let summaryContextString = ""; + const searchResults: SearchResult[] = results.map(result => { + summaryContextString += "\n\n" + result.content; + let itemType = result.content_type === 'FILE' ? 
ItemType.FILE : ItemType.URL; + let source = result.source; + + if (result.content_type === 'EMAIL') { + source = `https://mail.google.com/mail/u/0/?tab=rm&ogbl#inbox/${source}`; + } + + return { + type: itemType, + source: source, + title: result.title, + distance: result.distance + }; + }); + + const answerSummary = await generateAnswerSummary(query, summaryContextString); + + console.log(`Search results: ${JSON.stringify(searchResults)}`); + res.json({ results: searchResults, answer_summary: answerSummary }); + } catch (error) { + console.error('Error during search:', error); + res.status(500).json({ results: [], answer_summary: null, error: 'An error occurred during the search' }); + } +}); + +const PORT = process.env.PORT || 8000; + +app.listen(PORT, () => { + console.log(`Server is running on http://localhost:${PORT}`); +}); diff --git a/server/src/services/clients/exa.ts b/server/src/services/clients/exa.ts new file mode 100644 index 0000000..c5104d7 --- /dev/null +++ b/server/src/services/clients/exa.ts @@ -0,0 +1,37 @@ +import axios from 'axios'; +import dotenv from 'dotenv'; + +dotenv.config(); + +const EXA_API_URL = 'https://api.exa.ai/contents'; +const EXA_API_KEY = process.env.EXA_KEY; + +interface ExaResponse { + results: Array<{ + text: string; + }>; +} + +export async function getContentsForUrl(webUrl: string): Promise { + try { + const response = await axios.post( + EXA_API_URL, + { + ids: [webUrl], + text: { includeHtmlTags: false }, + }, + { + headers: { + 'accept': 'application/json', + 'content-type': 'application/json', + 'x-api-key': EXA_API_KEY, + }, + } + ); + + return response.data.results[0].text; + } catch (error) { + console.error('Error fetching content from Exa:', error); + throw error; + } +} diff --git a/server/src/services/clients/gmail.ts b/server/src/services/clients/gmail.ts new file mode 100644 index 0000000..a2ced8d --- /dev/null +++ b/server/src/services/clients/gmail.ts @@ -0,0 +1,98 @@ +import { google } from 'googleapis'; 
+import { OAuth2Client } from 'google-auth-library'; +import dotenv from 'dotenv'; + +dotenv.config(); + +const SCOPES = ['https://www.googleapis.com/auth/gmail.readonly']; +const TOKEN_PATH = 'token.json'; + +interface EmailThread { + [threadId: string]: Array<{ + sender: string; + subject: string; + body: string; + messageId: string; + }>; +} + +class GmailClient { + private auth: OAuth2Client; + + constructor() { + this.auth = new google.auth.OAuth2( + process.env.GMAIL_CLIENT_ID, + process.env.GMAIL_CLIENT_SECRET, + 'urn:ietf:wg:oauth:2.0:oob' + ); + + this.auth.setCredentials({ + refresh_token: process.env.GMAIL_REFRESH_TOKEN, + }); + } + + async readEmails(days: number = 7): Promise { + const gmail = google.gmail({ version: 'v1', auth: this.auth }); + const date = new Date(); + date.setDate(date.getDate() - days); + const query = `after:${date.getFullYear()}/${date.getMonth() + 1}/${date.getDate()}`; + + try { + const res = await gmail.users.messages.list({ + userId: 'me', + q: query, + }); + + const messages = res.data.messages || []; + const emailThreads: EmailThread = {}; + + for (const message of messages) { + const fullMessage = await gmail.users.messages.get({ + userId: 'me', + id: message.id!, + }); + + const threadId = fullMessage.data.threadId!; + const headers = fullMessage.data.payload?.headers; + const sender = headers?.find(h => h.name === 'From')?.value || ''; + const subject = headers?.find(h => h.name === 'Subject')?.value || ''; + const messageId = headers?.find(h => h.name === 'Message-ID')?.value || ''; + const body = this.getEmailBody(fullMessage.data); + + if (!emailThreads[threadId]) { + emailThreads[threadId] = []; + } + + emailThreads[threadId].push({ + sender, + subject, + body, + messageId, + }); + } + + return Object.values(emailThreads); + } catch (error) { + console.error('The API returned an error:', error); + throw error; + } + } + + private getEmailBody(message: any): string { + if (message.payload?.body?.data) { + return 
Buffer.from(message.payload.body.data, 'base64').toString(); + } + + if (message.payload?.parts) { + for (const part of message.payload.parts) { + if (part.mimeType === 'text/plain' && part.body?.data) { + return Buffer.from(part.body.data, 'base64').toString(); + } + } + } + + return ''; + } +} + +export const gmailClient = new GmailClient(); diff --git a/server/src/services/clients/openai.ts b/server/src/services/clients/openai.ts new file mode 100644 index 0000000..e69de29 diff --git a/server/src/services/dataLoaders/email.ts b/server/src/services/dataLoaders/email.ts new file mode 100644 index 0000000..0303991 --- /dev/null +++ b/server/src/services/dataLoaders/email.ts @@ -0,0 +1,22 @@ +import { createEmail } from '../../dao/emailDao'; +import { createEmbedding } from '../../dao/embeddingDao'; +import { getEmbedding } from '../embeddings/embed'; + +export async function processAndStoreEmail(email: any) { + const storedEmail = await createEmail({ + sender: email.sender, + subject: email.subject, + body: email.body, + messageId: email.messageId, + }); + + const embedding = await getEmbedding(email.body); + + await createEmbedding({ + embedding, + contentType: 'EMAIL', + emailId: storedEmail.id, + }); + + return storedEmail; +} diff --git a/server/src/services/dataLoaders/files.ts b/server/src/services/dataLoaders/files.ts new file mode 100644 index 0000000..36055ac --- /dev/null +++ b/server/src/services/dataLoaders/files.ts @@ -0,0 +1,123 @@ +import fs from 'fs'; +import path from 'path'; +import crypto from 'crypto'; +import { marked } from 'marked'; +import { PDFDocument } from 'pdf-lib'; +import { createWorker } from 'tesseract.js'; +import { container } from '../../utils/container'; +import { PrismaClient } from '@prisma/client'; +import { getEmbedding, chunkContent } from '../embeddings/embed'; +import { createFile } from '../../dao/fileDao'; +import { createEmbedding } from '../../dao/embeddingDao'; +import { fromPath } from 'pdf2pic'; + +const 
FOLDER_PATH = process.env.FOLDER_PATH || 'jeff_storage'; + +async function getFileContents(filePath: string): Promise { + const ext = path.extname(filePath).toLowerCase(); + + if (ext === '.md' || ext === '.txt') { + const content = fs.readFileSync(filePath, 'utf-8'); + return ext === '.md' ? marked(content) : content; + } else if (ext === '.pdf') { + return ocrPdf(filePath); + } else { + throw new Error(`Unsupported file type: ${ext}`); + } +} + +// Rasterises every page of the PDF with pdf2pic, OCRs each page image with tesseract.js and returns the concatenated text with NUL characters removed. The worker is terminated even when a page fails. +async function ocrPdf(filePath: string): Promise { + const options = { + density: 300, + saveFilename: "temp_page", + savePath: "./temp", + format: "png", + width: 2480, + height: 3508 + }; + const convert = fromPath(filePath, options); + const pdfDoc = await PDFDocument.load(fs.readFileSync(filePath)); + const worker = await createWorker(); // created only after the PDF has loaded, so a bad file cannot leak a worker + let text = ''; + try { + for (let i = 0; i < pdfDoc.getPageCount(); i++) { + const result = await convert(i + 1, { responseType: 'base64' }); // pdf2pic v3 writes an image file and returns no base64 unless it is requested + const base64 = (result as any).base64; + const { data: { text: pageText } } = await worker.recognize(`data:image/png;base64,${base64}`); + text += pageText + '\n'; + } + } finally { + await worker.terminate(); + } + return text.replace(/\0/g, ''); +} + +function getContentHash(content: string): string { + return crypto.createHash('sha256').update(content).digest('hex'); +} + +export async function processFiles(folderPath: string): Promise { + const prisma = container.get(PrismaClient); + + const processFile = async (filePath: string) => { + const ext = path.extname(filePath).toLowerCase(); + + if (['.md', '.txt', '.pdf'].includes(ext)) { + try { + const content = await getFileContents(filePath); + const chunks = chunkContent(content); + + for (let i = 0; i < chunks.length; i++) { + const chunk = chunks[i]; + const contentHash = getContentHash(chunk); + + const existingFile = await prisma.file.findFirst({ + where: { contentHash } + }); + + if (existingFile) continue; + + const embedding = await getEmbedding(chunk); + + const fileData = { + name: `${path.basename(filePath)}_chunk_${i + 
1}`, + path: filePath, + content: chunk, + contentHash: contentHash, + }; + + const storedFile = await createFile(fileData); + await createEmbedding({ + embedding, + contentType: 'FILE', + fileId: storedFile.id, + }); + + console.log(`Stored file chunk: ${storedFile.id} - ${storedFile.name}`); + } + } catch (error) { + console.error(`Error processing file ${filePath}:`, error); + } + } + }; + + const processDirectory = async (dir: string) => { + const entries = fs.readdirSync(dir, { withFileTypes: true }); + + for (const entry of entries) { + const fullPath = path.join(dir, entry.name); + if (entry.isDirectory()) { + await processDirectory(fullPath); + } else { + await processFile(fullPath); + } + } + }; + + await processDirectory(folderPath); +} + +if (require.main === module) { + processFiles(FOLDER_PATH).catch(console.error); +} diff --git a/server/src/services/embeddings/embed.ts b/server/src/services/embeddings/embed.ts new file mode 100644 index 0000000..9386a01 --- /dev/null +++ b/server/src/services/embeddings/embed.ts @@ -0,0 +1,57 @@ +import OpenAI from 'openai'; +import dotenv from 'dotenv'; +import { encode } from 'gpt-3-encoder'; +import { container } from '../../utils/container'; + +dotenv.config(); + +export function numTokensFromString(string: string, modelName: string = "text-embedding-3-large"): number { + return encode(string).length; +} + +/** + * Splits `content` on newlines and groups the lines into chunks whose summed per-line token counts stay within `maxTokens`. + * Lines are never split, so a single line larger than `maxTokens` still becomes one oversized chunk. + * Each chunk is trimmed; empty or whitespace-only chunks are never returned (empty content yields []). + */ +export function chunkContent(content: string, maxTokens: number = 5000): string[] { + const chunks: string[] = []; + let currentChunk = ""; + let currentTokens = 0; + + for (const line of content.split('\n')) { + const lineTokens = numTokensFromString(line); + if (currentTokens + lineTokens > maxTokens) { + // Flush what has been collected so far, skipping the empty chunk that would arise when the very first line already overflows. + const finished = currentChunk.trim(); + if (finished) { + chunks.push(finished); + } + currentChunk = ""; + currentTokens = 0; + } + currentChunk += line + '\n'; // always keep the line break, so the first line of a new chunk is not glued to the next one + currentTokens += lineTokens; + } + + const tail = currentChunk.trim(); + if (tail) { + chunks.push(tail); + } + + return chunks; +} + +export async function getEmbedding(text: string): 
Promise { + try { + const openai = container.get(OpenAI); + const response = await openai.embeddings.create({ + model: "text-embedding-3-large", + input: text, + }); + return response.data[0].embedding + } catch (error) { + console.error('Error getting embedding:', error); + throw error; + } +} diff --git a/server/src/services/openai.ts b/server/src/services/openai.ts new file mode 100644 index 0000000..7c81c7a --- /dev/null +++ b/server/src/services/openai.ts @@ -0,0 +1,21 @@ +import OpenAI from 'openai'; +import { container } from '../utils/container'; + +export async function generateAnswerSummary(question: string, content: string, model: string = 'gpt-4'): Promise { + const openai = container.get(OpenAI); + + const messages = [ + { + role: "user" as const, + content: `Answer the following question: ${question}. Only use information from the following text and provide as brief of an answer as possible: \n ${content}` + } + ]; + + const response = await openai.chat.completions.create({ + model: model, + messages: messages, + temperature: 0.0 + }); + + return response.choices[0].message.content || ''; +} \ No newline at end of file diff --git a/server/src/utils/container.ts b/server/src/utils/container.ts new file mode 100644 index 0000000..2ca5162 --- /dev/null +++ b/server/src/utils/container.ts @@ -0,0 +1,33 @@ +import { Container, ContainerModule, interfaces } from 'inversify' +import dotenv from 'dotenv' +import { PrismaClient } from '@prisma/client' +import OpenAI from 'openai' + +dotenv.config(); + +export const container = new Container({ + autoBindInjectable: true, + skipBaseClassChecks: true, + defaultScope: 'Singleton', +}) + +export const TYPES = { + Logger: Symbol.for('Logger'), + LoggerFactory: Symbol.for('LoggerFactory'), +} + +export const coreBindingsModule = new ContainerModule((bind: interfaces.Bind) => { + + const prismaClient = new PrismaClient({ + log: [ + { emit: 'event', level: 'info' }, + { emit: 'event', level: 'warn' }, + { emit: 
'event', level: 'error' }, + ], + }) + bind(PrismaClient).toConstantValue(prismaClient) + + bind(OpenAI).toConstantValue(new OpenAI({ apiKey: process.env.OPENAI_API_KEY })) +}) + +container.load(coreBindingsModule) diff --git a/server/tsconfig.json b/server/tsconfig.json new file mode 100644 index 0000000..73d3514 --- /dev/null +++ b/server/tsconfig.json @@ -0,0 +1,110 @@ +{ + "exclude": ["node_modules"], + "compilerOptions": { + "baseUrl": "./src", + /* Visit https://aka.ms/tsconfig to read more about this file */ + + /* Projects */ + // "incremental": true, /* Save .tsbuildinfo files to allow for incremental compilation of projects. */ + // "composite": true, /* Enable constraints that allow a TypeScript project to be used with project references. */ + // "tsBuildInfoFile": "./.tsbuildinfo", /* Specify the path to .tsbuildinfo incremental compilation file. */ + // "disableSourceOfProjectReferenceRedirect": true, /* Disable preferring source files instead of declaration files when referencing composite projects. */ + // "disableSolutionSearching": true, /* Opt a project out of multi-project reference checking when editing. */ + // "disableReferencedProjectLoad": true, /* Reduce the number of projects loaded automatically by TypeScript. */ + + /* Language and Environment */ + "target": "es2016", /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */ + // "lib": [], /* Specify a set of bundled library declaration files that describe the target runtime environment. */ + // "jsx": "preserve", /* Specify what JSX code is generated. */ + // "experimentalDecorators": true, /* Enable experimental support for legacy experimental decorators. */ + // "emitDecoratorMetadata": true, /* Emit design-type metadata for decorated declarations in source files. */ + // "jsxFactory": "", /* Specify the JSX factory function used when targeting React JSX emit, e.g. 'React.createElement' or 'h'. 
*/ + // "jsxFragmentFactory": "", /* Specify the JSX Fragment reference used for fragments when targeting React JSX emit e.g. 'React.Fragment' or 'Fragment'. */ + // "jsxImportSource": "", /* Specify module specifier used to import the JSX factory functions when using 'jsx: react-jsx*'. */ + // "reactNamespace": "", /* Specify the object invoked for 'createElement'. This only applies when targeting 'react' JSX emit. */ + // "noLib": true, /* Disable including any library files, including the default lib.d.ts. */ + // "useDefineForClassFields": true, /* Emit ECMAScript-standard-compliant class fields. */ + // "moduleDetection": "auto", /* Control what method is used to detect module-format JS files. */ + + /* Modules */ + "module": "commonjs", /* Specify what module code is generated. */ + // "rootDir": "./", /* Specify the root folder within your source files. */ + // "moduleResolution": "node10", /* Specify how TypeScript looks up a file from a given module specifier. */ + // "baseUrl": "./", /* Specify the base directory to resolve non-relative module names. */ + // "paths": {}, /* Specify a set of entries that re-map imports to additional lookup locations. */ + // "rootDirs": [], /* Allow multiple folders to be treated as one when resolving modules. */ + // "typeRoots": [], /* Specify multiple folders that act like './node_modules/@types'. */ + // "types": [], /* Specify type package names to be included without being referenced in a source file. */ + // "allowUmdGlobalAccess": true, /* Allow accessing UMD globals from modules. */ + // "moduleSuffixes": [], /* List of file name suffixes to search when resolving a module. */ + // "allowImportingTsExtensions": true, /* Allow imports to include TypeScript file extensions. Requires '--moduleResolution bundler' and either '--noEmit' or '--emitDeclarationOnly' to be set. */ + // "resolvePackageJsonExports": true, /* Use the package.json 'exports' field when resolving package imports. 
*/ + // "resolvePackageJsonImports": true, /* Use the package.json 'imports' field when resolving imports. */ + // "customConditions": [], /* Conditions to set in addition to the resolver-specific defaults when resolving imports. */ + // "resolveJsonModule": true, /* Enable importing .json files. */ + // "allowArbitraryExtensions": true, /* Enable importing files with any extension, provided a declaration file is present. */ + // "noResolve": true, /* Disallow 'import's, 'require's or '<reference>'s from expanding the number of files TypeScript should add to a project. */ + + /* JavaScript Support */ + // "allowJs": true, /* Allow JavaScript files to be a part of your program. Use the 'checkJS' option to get errors from these files. */ + // "checkJs": true, /* Enable error reporting in type-checked JavaScript files. */ + // "maxNodeModuleJsDepth": 1, /* Specify the maximum folder depth used for checking JavaScript files from 'node_modules'. Only applicable with 'allowJs'. */ + + /* Emit */ + // "declaration": true, /* Generate .d.ts files from TypeScript and JavaScript files in your project. */ + // "declarationMap": true, /* Create sourcemaps for d.ts files. */ + // "emitDeclarationOnly": true, /* Only output d.ts files and not JavaScript files. */ + // "sourceMap": true, /* Create source map files for emitted JavaScript files. */ + // "inlineSourceMap": true, /* Include sourcemap files inside the emitted JavaScript. */ + // "outFile": "./", /* Specify a file that bundles all outputs into one JavaScript file. If 'declaration' is true, also designates a file that bundles all .d.ts output. */ + // "outDir": "./", /* Specify an output folder for all emitted files. */ + // "removeComments": true, /* Disable emitting comments. */ + // "noEmit": true, /* Disable emitting files from a compilation. */ + // "importHelpers": true, /* Allow importing helper functions from tslib once per project, instead of including them per-file.
*/ + // "downlevelIteration": true, /* Emit more compliant, but verbose and less performant JavaScript for iteration. */ + // "sourceRoot": "", /* Specify the root path for debuggers to find the reference source code. */ + // "mapRoot": "", /* Specify the location where debugger should locate map files instead of generated locations. */ + // "inlineSources": true, /* Include source code in the sourcemaps inside the emitted JavaScript. */ + // "emitBOM": true, /* Emit a UTF-8 Byte Order Mark (BOM) in the beginning of output files. */ + // "newLine": "crlf", /* Set the newline character for emitting files. */ + // "stripInternal": true, /* Disable emitting declarations that have '@internal' in their JSDoc comments. */ + // "noEmitHelpers": true, /* Disable generating custom helper functions like '__extends' in compiled output. */ + // "noEmitOnError": true, /* Disable emitting files if any type checking errors are reported. */ + // "preserveConstEnums": true, /* Disable erasing 'const enum' declarations in generated code. */ + // "declarationDir": "./", /* Specify the output directory for generated declaration files. */ + + /* Interop Constraints */ + // "isolatedModules": true, /* Ensure that each file can be safely transpiled without relying on other imports. */ + // "verbatimModuleSyntax": true, /* Do not transform or elide any imports or exports not marked as type-only, ensuring they are written in the output file's format based on the 'module' setting. */ + // "isolatedDeclarations": true, /* Require sufficient annotation on exports so other tools can trivially generate declaration files. */ + // "allowSyntheticDefaultImports": true, /* Allow 'import x from y' when a module doesn't have a default export. */ + "esModuleInterop": true, /* Emit additional JavaScript to ease support for importing CommonJS modules. This enables 'allowSyntheticDefaultImports' for type compatibility. */ + // "preserveSymlinks": true, /* Disable resolving symlinks to their realpath. 
This correlates to the same flag in node. */ + "forceConsistentCasingInFileNames": true, /* Ensure that casing is correct in imports. */ + + /* Type Checking */ + "strict": true, /* Enable all strict type-checking options. */ + // "noImplicitAny": true, /* Enable error reporting for expressions and declarations with an implied 'any' type. */ + // "strictNullChecks": true, /* When type checking, take into account 'null' and 'undefined'. */ + // "strictFunctionTypes": true, /* When assigning functions, check to ensure parameters and the return values are subtype-compatible. */ + // "strictBindCallApply": true, /* Check that the arguments for 'bind', 'call', and 'apply' methods match the original function. */ + // "strictPropertyInitialization": true, /* Check for class properties that are declared but not set in the constructor. */ + // "noImplicitThis": true, /* Enable error reporting when 'this' is given the type 'any'. */ + // "useUnknownInCatchVariables": true, /* Default catch clause variables as 'unknown' instead of 'any'. */ + // "alwaysStrict": true, /* Ensure 'use strict' is always emitted. */ + // "noUnusedLocals": true, /* Enable error reporting when local variables aren't read. */ + // "noUnusedParameters": true, /* Raise an error when a function parameter isn't read. */ + // "exactOptionalPropertyTypes": true, /* Interpret optional property types as written, rather than adding 'undefined'. */ + // "noImplicitReturns": true, /* Enable error reporting for codepaths that do not explicitly return in a function. */ + // "noFallthroughCasesInSwitch": true, /* Enable error reporting for fallthrough cases in switch statements. */ + // "noUncheckedIndexedAccess": true, /* Add 'undefined' to a type when accessed using an index. */ + // "noImplicitOverride": true, /* Ensure overriding members in derived classes are marked with an override modifier. 
*/ + // "noPropertyAccessFromIndexSignature": true, /* Enforces using indexed accessors for keys declared using an indexed type. */ + // "allowUnusedLabels": true, /* Disable error reporting for unused labels. */ + // "allowUnreachableCode": true, /* Disable error reporting for unreachable code. */ + + /* Completeness */ + // "skipDefaultLibCheck": true, /* Skip type checking .d.ts files that are included with TypeScript. */ + "skipLibCheck": true /* Skip type checking all .d.ts files. */ + } +} diff --git a/server/apis/exa_client.py b/server2/apis/exa_client.py similarity index 100% rename from server/apis/exa_client.py rename to server2/apis/exa_client.py diff --git a/server/apis/gmail_client.py b/server2/apis/gmail_client.py similarity index 100% rename from server/apis/gmail_client.py rename to server2/apis/gmail_client.py diff --git a/server/apis/openai_client.py b/server2/apis/openai_client.py similarity index 100% rename from server/apis/openai_client.py rename to server2/apis/openai_client.py diff --git a/server/app.py b/server2/app.py similarity index 100% rename from server/app.py rename to server2/app.py diff --git a/server/constants.py b/server2/constants.py similarity index 100% rename from server/constants.py rename to server2/constants.py diff --git a/server/data_loaders/files.py b/server2/data_loaders/files.py similarity index 100% rename from server/data_loaders/files.py rename to server2/data_loaders/files.py diff --git a/server/data_loaders/gmail.py b/server2/data_loaders/gmail.py similarity index 100% rename from server/data_loaders/gmail.py rename to server2/data_loaders/gmail.py diff --git a/server/data_loaders/history.py b/server2/data_loaders/history.py similarity index 100% rename from server/data_loaders/history.py rename to server2/data_loaders/history.py diff --git a/server/database/db.py b/server2/database/db.py similarity index 100% rename from server/database/db.py rename to server2/database/db.py diff --git 
a/server/database/queries.py b/server2/database/queries.py similarity index 100% rename from server/database/queries.py rename to server2/database/queries.py diff --git a/server/database/tables.py b/server2/database/tables.py similarity index 100% rename from server/database/tables.py rename to server2/database/tables.py diff --git a/server/embeddings/embed.py b/server2/embeddings/embed.py similarity index 100% rename from server/embeddings/embed.py rename to server2/embeddings/embed.py diff --git a/server/types/email.py b/server2/types/email.py similarity index 100% rename from server/types/email.py rename to server2/types/email.py