
Commit 20399cf

integrated Hugging Face dataset
1 parent d7ef0af commit 20399cf

File tree

11 files changed: +1893 −173 lines changed


.github/workflows/changelog.yml

Lines changed: 0 additions & 75 deletions
This file was deleted.

.github/workflows/release.yml

Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
+name: Release
+
+on:
+  push:
+    branches:
+      - main
+
+jobs:
+  release:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: write
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Bump version and push tag
+        id: tag_version
+        uses: mathieudutour/github-tag-action@v6.2
+        with:
+          github_token: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Create a GitHub release
+        uses: actions/create-release@v1
+        with:
+          tag_name: ${{ steps.tag_version.outputs.new_tag }}
+          release_name: "Release ${{ steps.tag_version.outputs.new_tag }}"
+          body: ${{ steps.tag_version.outputs.changelog }}
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

.gitignore

Lines changed: 2 additions & 0 deletions
@@ -36,3 +36,5 @@ yarn-error.log*
 # typescript
 *.tsbuildinfo
 next-env.d.ts
+
+gcp.json

app/api/chat/route.ts

Lines changed: 21 additions & 5 deletions
@@ -3,6 +3,7 @@ import { getModelClient } from '@/lib/models'
 import { LLMModel, LLMModelConfig } from '@/lib/models'
 import { toPrompt } from '@/lib/prompt'
 import ratelimit from '@/lib/ratelimit'
+import { retrieveRelevantCode } from '@/lib/retrieval'
 import { fragmentSchema as schema } from '@/lib/schema'
 import { Templates } from '@/lib/templates'
 import { streamObject, LanguageModel, CoreMessage } from 'ai'
@@ -53,19 +54,34 @@ export async function POST(req: Request) {
   }

   console.log('userID', userID)
-  console.log('teamID', teamID)
-  // console.log('template', template)
+  console.log('template', template)
   console.log('model', model)
-  // console.log('config', config)
+  console.log('config', config)

   const { model: modelNameString, apiKey: modelApiKey, ...modelParams } = config
   const modelClient = getModelClient(model, config)

   try {
+    const lastUserMessage = messages[messages.length - 1]?.content;
+    let retrievedContext = '';
+    if (typeof lastUserMessage === 'string') {
+      const relevantCode = await retrieveRelevantCode(lastUserMessage);
+      if (relevantCode.length > 0) {
+        retrievedContext = `
+---
+Here is some relevant code from our knowledge base that might help:
+${relevantCode.join('\n---\n')}
+---
+`;
+      }
+    }
+
+    const systemPrompt = toPrompt(template) + retrievedContext;
+
     const stream = await streamObject({
       model: modelClient as LanguageModel,
       schema,
-      system: toPrompt(template),
+      system: systemPrompt,
       messages,
       maxRetries: 0, // do not retry on errors
       ...modelParams,
@@ -116,4 +132,4 @@ export async function POST(req: Request) {
     },
   )
 }
-}
+}

app/api/import-dataset/route.ts

Lines changed: 102 additions & 0 deletions
@@ -0,0 +1,102 @@
+import { NextResponse } from 'next/server';
+import { pipeline } from '@xenova/transformers';
+import { createClient } from '@supabase/supabase-js';
+
+// Configuration
+const DATASET = 'bigcode/the-stack';
+const EMBEDDING_MODEL = 'Xenova/all-MiniLM-L6-v2';
+const HUGGING_FACE_API_URL = 'https://huggingface.co/api/datasets';
+
+// Initialize Supabase client with the service role key for admin access
+const supabase = createClient(
+  process.env.NEXT_PUBLIC_SUPABASE_URL!,
+  process.env.SUPABASE_SERVICE_ROLE_KEY!
+);
+
+// Function to process and embed a single file's content
+async function processAndEmbedFile(content: string, embeddingPipeline: any) {
+  // 1. Chunk the content
+  const chunks = content.split('\n').filter(line => line.trim().length > 10);
+
+  if (chunks.length === 0) {
+    return;
+  }
+
+  // 2. Generate embeddings for each chunk
+  const embeddings = await embeddingPipeline(chunks, {
+    pooling: 'mean',
+    normalize: true,
+  });
+
+  // 3. Prepare data for Supabase
+  const dataToInsert = chunks.map((chunk, i) => ({
+    content: chunk,
+    embedding: Array.from(embeddings.data.slice(i * 384, (i + 1) * 384)),
+  }));
+
+  // 4. Upsert into Supabase
+  const { error } = await supabase.from('code_embeddings').insert(dataToInsert);
+  if (error) {
+    console.error('Supabase insert error:', error);
+  }
+}
+
+// The main background job logic
+async function importDataset(subset: string) {
+  try {
+    console.log(`Starting dataset import for subset: ${subset}...`);
+    const embeddingPipeline = await pipeline('feature-extraction', EMBEDDING_MODEL);
+
+    // 1. Fetch the list of files in the dataset subset
+    const repoInfoUrl = `${HUGGING_FACE_API_URL}/${DATASET}/tree/main/data/${subset}`;
+    const repoInfoResponse = await fetch(repoInfoUrl);
+    if (!repoInfoResponse.ok) {
+      throw new Error(`Failed to fetch repo info: ${repoInfoResponse.statusText}`);
+    }
+    const files: { path: string; type: string }[] = await repoInfoResponse.json();
+
+    // 2. Process each file
+    for (const file of files) {
+      if (file.type === 'file') {
+        console.log(`Processing file: ${file.path}`);
+
+        // Construct the direct download URL
+        const downloadUrl = `https://huggingface.co/datasets/${DATASET}/resolve/main/${file.path}`;
+        const fileResponse = await fetch(downloadUrl);
+
+        if (fileResponse.ok) {
+          const content = await fileResponse.text();
+          await processAndEmbedFile(content, embeddingPipeline);
+        } else {
+          console.warn(`Could not download file: ${file.path}`);
+        }
+      }
+    }
+
+    console.log(`Dataset import for subset: ${subset} completed.`);
+  } catch (error) {
+    console.error(`Error during dataset import for subset: ${subset}`, error);
+  }
+}
+
+// The API Route Handler
+export async function POST(request: Request) {
+  try {
+    const { subset } = await request.json();
+
+    if (!subset) {
+      return NextResponse.json({ error: 'Missing "subset" parameter (e.g., "python")' }, { status: 400 });
+    }
+
+    // Trigger the background job without waiting for it to complete
+    importDataset(subset).catch(console.error);
+
+    return NextResponse.json(
+      { message: `Started importing the "${subset}" subset from "${DATASET}". This will take a while.` },
+      { status: 202 }
+    );
+  } catch (error: any) {
+    console.error('API Error:', error);
+    return NextResponse.json({ error: 'Internal Server Error', detail: error.message }, { status: 500 });
+  }
+}
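
For reference, here is a minimal sketch of how a client could kick off an import against this route. The /api/import-dataset path follows from the App Router file location above; the "python" subset name and the localhost origin are only assumptions for illustration.

// Trigger a background import of the "python" subset of bigcode/the-stack.
// Assumes the app is running locally on port 3000; adjust the origin as needed.
async function startImport(subset: string) {
  const res = await fetch('http://localhost:3000/api/import-dataset', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ subset }),
  });
  // The route responds with 202 immediately; the import continues in the background.
  console.log(res.status, await res.json());
}

startImport('python').catch(console.error);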

lib/retrieval.ts

Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
+import { createClient } from '@supabase/supabase-js';
+import { pipeline } from '@xenova/transformers';
+
+// Configuration
+const EMBEDDING_MODEL = 'Xenova/all-MiniLM-L6-v2';
+
+// Initialize Supabase client
+const supabase = createClient(
+  process.env.NEXT_PUBLIC_SUPABASE_URL!,
+  process.env.SUPABASE_SERVICE_ROLE_KEY!
+);
+
+// Singleton pattern for the embedding pipeline
+let embeddingPipeline: any = null;
+async function getEmbeddingPipeline() {
+  if (!embeddingPipeline) {
+    embeddingPipeline = await pipeline('feature-extraction', EMBEDDING_MODEL);
+  }
+  return embeddingPipeline;
+}
+
+// Function to retrieve relevant code snippets
+export async function retrieveRelevantCode(query: string, match_threshold = 0.75, match_count = 5) {
+  try {
+    const pipeline = await getEmbeddingPipeline();
+
+    // 1. Generate an embedding for the user's query
+    const queryEmbedding = await pipeline(query, {
+      pooling: 'mean',
+      normalize: true,
+    });
+
+    // 2. Query Supabase for similar embeddings
+    const { data, error } = await supabase.rpc('match_code_embeddings', {
+      query_embedding: Array.from(queryEmbedding.data),
+      match_threshold,
+      match_count,
+    });
+
+    if (error) {
+      console.error('Supabase RPC error:', error);
+      return [];
+    }
+
+    return data.map((item: any) => item.content);
+  } catch (error) {
+    console.error('Error during code retrieval:', error);
+    return [];
+  }
+}
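
Note that the retrieval path assumes a code_embeddings table and a match_code_embeddings RPC already exist on the Supabase side; neither is created in this commit. A rough TypeScript sketch of the contract the code above relies on, with hypothetical type names:

// Hypothetical types describing the shapes this commit reads and writes.
// The actual Postgres/pgvector objects are not part of this diff.
interface CodeEmbeddingRow {
  content: string;     // one chunk of source text (see processAndEmbedFile)
  embedding: number[]; // 384 values, the output dimension of all-MiniLM-L6-v2
}

interface MatchCodeEmbeddingsArgs {
  query_embedding: number[]; // embedding of the user's query
  match_threshold: number;   // minimum similarity (0.75 by default above)
  match_count: number;       // maximum number of rows to return (5 by default above)
}

// retrieveRelevantCode() only uses the `content` field of each returned row.
type MatchCodeEmbeddingsResult = Pick<CodeEmbeddingRow, 'content'>[];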

lib/sql/create_user_team_on_signup.sql

Lines changed: 0 additions & 24 deletions
This file was deleted.

memory-bank/raw_reflection_log.md

Lines changed: 0 additions & 21 deletions
This file was deleted.

next.config.mjs

Lines changed: 6 additions & 1 deletion
@@ -1,4 +1,9 @@
 /** @type {import('next').NextConfig} */
-const nextConfig = {}
+const nextConfig = {
+  webpack: (config) => {
+    config.externals.push('onnxruntime-node')
+    return config
+  },
+}

 export default nextConfig
