diff --git a/.github/workflows/jsr-publish.yml b/.github/workflows/jsr-publish.yml index 578212b..c6a7fc8 100644 --- a/.github/workflows/jsr-publish.yml +++ b/.github/workflows/jsr-publish.yml @@ -22,4 +22,5 @@ jobs: run: | bun install bun run build + bunx kksh@latest verify --publish bunx jsr publish --allow-slow-types diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..b943dbc --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "deno.enable": true +} \ No newline at end of file diff --git a/README.md b/README.md index dc8623c..62eeafc 100644 --- a/README.md +++ b/README.md @@ -3,3 +3,29 @@ RAG means Retrieval-Augmented Generation. This extension is a local RAG app, that allows you to index a local directory of files and search them using a LLM model. + +If you don't know what RAG is, see [Wikipedia: RAG](https://en.wikipedia.org/wiki/Retrieval-augmented_generation) + +Basically, this extension allows you to index local files and directories and search them using an LLM. + +For now, only text files and PDF files are supported. + +The following file extensions are supported for `Add Files`: + +- `.txt` +- `.pdf` +- `.md` +- `.mdx` + +`.pdf` is not supported yet for `Add Directory`. + +> [!CAUTION] +> If you want other file extensions to be supported, please open an issue in the repository. +> I will add options to let users add dynamic file extensions if there are people using this extension. + +This is to prevent indexing other files you may not want to index, like lock files. 
+ +## Sample Images + +![](https://i.imgur.com/SMwsks7.png) +![](https://i.imgur.com/KPkwhMN.png) diff --git a/bun.lockb b/bun.lockb index 72d7a94..0ac6552 100755 Binary files a/bun.lockb and b/bun.lockb differ diff --git a/deno-src/bucket.ts b/deno-src/bucket.ts index 6039bf6..9f3031c 100644 --- a/deno-src/bucket.ts +++ b/deno-src/bucket.ts @@ -1,17 +1,22 @@ import { FaissStore } from '@langchain/community/vectorstores/faiss'; -import { OpenAIEmbeddings } from '@langchain/openai'; +import { ChatOpenAI, OpenAIEmbeddings } from '@langchain/openai'; import * as v from 'valibot'; import * as path from 'jsr:@std/path'; import { existsSync, readdirSync } from 'node:fs'; import { Document } from '@langchain/core/documents'; import { RecursiveCharacterTextSplitter } from '@langchain/textsplitters'; import { DirectoryLoader } from 'langchain/document_loaders/fs/directory'; -import { JSONLoader, JSONLinesLoader } from 'langchain/document_loaders/fs/json'; import { TextLoader } from 'langchain/document_loaders/fs/text'; import { computeSha256FromText } from './crypto.ts'; import { PDFLoader } from '@langchain/community/document_loaders/fs/pdf'; +import { DenoAPI } from '../src/api.types.ts'; +import { txtExts } from './constants.ts'; +import { AIMessageChunk } from '@langchain/core/messages'; export const embeddings = new OpenAIEmbeddings({ + // configuration: { + // baseURL: 'https://api.deepseek.com' + // }, model: 'text-embedding-3-large' }); @@ -27,8 +32,8 @@ export async function getDocsFromDirectory(directoryPath: string): Promise new JSONLoader(path, '/texts'), - '.jsonl': (path) => new JSONLinesLoader(path, '/html'), + // '.json': (path) => new JSONLoader(path, '/texts'), + // '.jsonl': (path) => new JSONLinesLoader(path, '/html'), '.txt': (path) => new TextLoader(path), '.md': (path) => new TextLoader(path), '.mdx': (path) => new TextLoader(path) @@ -38,23 +43,22 @@ export async function getDocsFromDirectory(directoryPath: string): Promise = new Set(); - 
constructor( - readonly bucketDir: string, - readonly bucketName: string - ) { + async init(bucketDir: string, bucketName: string) { + this.bucketDir = bucketDir; + this.bucketName = bucketName; this.bucketPath = path.join(this.bucketDir, this.bucketName); this.faissStorePath = path.join(this.bucketPath, 'faiss-store'); this.metadataPath = path.join(this.bucketPath, 'metadata.json'); - } - async init() { if (!existsSync(this.bucketPath)) { Deno.mkdirSync(this.bucketPath, { recursive: true }); } @@ -69,9 +73,6 @@ export class Bucket { } this.updateMetadata(); this._vectorStore = await this.getVectorStore(); - // if (this._vectorStore) { - // await this._vectorStore.save(this.faissStorePath); - // } } updateMetadata() { @@ -160,7 +161,7 @@ export class Bucket { this.updateSha256(docs); console.error('Updated sha256', this.filesSha256.size); // await this.addDocuments(fileteredDocs); - return this.vectorStore.addDocuments(fileteredDocs).catch((err) => { + await this.vectorStore.addDocuments(fileteredDocs).catch((err) => { console.error('Error adding documents', err); }); } @@ -184,4 +185,70 @@ export class Bucket { this.updateSha256(docs); await this.addDocuments(fileteredDocs); } + + async retrieve(query: string) { + const retriever = this.vectorStore.asRetriever(); + const docs = await retriever.invoke(query); + const docsText = docs.map((d) => d.pageContent).join(''); + return docsText; + } + + async query(question: string) { + const docsText = await this.retrieve(question); + const systemPrompt = `You are an assistant for question-answering tasks. +Use the following pieces of retrieved context to answer the question. +If you don't know the answer, just say that you don't know. +Use three sentences maximum and keep the answer concise. 
+Context: {context}:`; + + // Populate the system prompt with the retrieved context + const systemPromptFmt = systemPrompt.replace('{context}', docsText); + + // Create a model + const model = new ChatOpenAI({ + model: 'gpt-4o', + temperature: 0 + }); + + // Generate a response + const ans: AIMessageChunk = await model.invoke([ + { + role: 'system', + content: systemPromptFmt + }, + { + role: 'user', + content: question + } + ]); + return ans.content.toString(); + } + + async indexFiles(files: string[]) { + console.error('Indexing files', files); + for (const file of files) { + if (!existsSync(file)) { + throw new Error(`File ${file} does not exist`); + } + // check if file is directory + const stats = Deno.statSync(file); + console.error('Indexing file', file, 'stats.isFile', stats.isFile); + if (stats.isFile) { + const ext = path.extname(file); + if (txtExts.includes(ext)) { + console.error('Adding text file 1', file); + await this.addTextFile(file); + console.error('Finished adding text file', file); + } else if (ext === '.pdf') { + console.error('Adding pdf file', file); + await this.addPDF(file); + } else { + throw new Error(`Unsupported file type: ${ext}`); + } + } else { + console.error('Adding directory', file); + await this.addDirectory(file); + } + } + } } diff --git a/deno-src/dev.ts b/deno-src/dev.ts deleted file mode 100644 index f9c92d6..0000000 --- a/deno-src/dev.ts +++ /dev/null @@ -1,77 +0,0 @@ -// import { FaissStore } from '@langchain/community/vectorstores/faiss'; -// import { Bucket, embeddings, getDocsFromDirectory } from './bucket.ts'; -// import { RecursiveCharacterTextSplitter } from '@langchain/textsplitters'; -// import { DirectoryLoader } from 'langchain/document_loaders/fs/directory'; -// import { JSONLoader, JSONLinesLoader } from 'langchain/document_loaders/fs/json'; -// import { TextLoader } from 'langchain/document_loaders/fs/text'; -// import { OpenAIEmbeddings } from '@langchain/openai'; -import { existsSync } from 'node:fs'; -// 
import path from 'path'; -import { txtExts } from './constants.ts'; -import { Bucket } from './bucket.ts'; -import path from 'node:path'; - -async function indexFiles(bucketName: string, files: string[]): Promise { - const bucket = new Bucket('./store', bucketName); - // const bucket = new Bucket(extensionSupportPath, bucketName); - console.error('bucket path', bucket.bucketPath); - console.error('files', files); - await bucket.init(); - for (const file of files) { - if (!existsSync(file)) { - throw new Error(`File ${file} does not exist`); - } - console.error('file', file); - // check if file is directory - const stats = Deno.statSync(file); - if (stats.isFile) { - const ext = path.extname(file); - if (txtExts.includes(ext)) { - console.error('Adding text file', file); - await bucket.addTextFile(file); - console.error('Finished adding text file', file); - } else if (ext === '.pdf') { - console.error('Adding pdf file', file); - await bucket.addPDF(file); - } else if (stats.isDirectory) { - console.error('Adding directory', file); - await bucket.addDirectory(file); - } else { - throw new Error(`Unsupported file type: ${ext}`); - } - } - } - await bucket.save(); -} -indexFiles('Kunkun Docs', ['/Users/hk/Dev/kunkun-docs/src/content/docs/developer/DX.mdx']); - -// const bucket = new Bucket( -// '/Users/hk/Dev/kunkun-extension-repos/kunkun-ext-rag/extensions_support', -// 'Kunkun Docs' -// ); -// await bucket.init(); -// const files = ['/Users/hk/Dev/kunkun-docs/src/content/docs/developer/manifest.mdx']; -// for (const file of files) { -// if (!existsSync(file)) { -// throw new Error(`File ${file} does not exist`); -// } -// console.error('file', file); -// // check if file is directory -// const stats = Deno.statSync(file); -// if (stats.isFile) { -// const ext = path.extname(file); -// if (txtExts.includes(ext)) { -// console.error('Adding text file', file); -// await bucket.addTextFile(file); -// } else if (ext === '.pdf') { -// console.error('Adding pdf file', 
file); -// await bucket.addPDF(file); -// } else if (stats.isDirectory) { -// console.error('Adding directory', file); -// await bucket.addDirectory(file); -// } else { -// throw new Error(`Unsupported file type: ${ext}`); -// } -// } -// } -// await bucket.save(); diff --git a/deno-src/index.ts b/deno-src/index.ts index f67477f..da58f1e 100644 --- a/deno-src/index.ts +++ b/deno-src/index.ts @@ -1,48 +1,5 @@ import { expose } from '@kunkun/api/runtime/deno'; import type { DenoAPI } from '../src/api.types.ts'; import { Bucket } from './bucket.ts'; -import { existsSync } from 'node:fs'; -import path from 'node:path'; -import { txtExts } from './constants.ts'; -export const extensionSupportPath = Deno.env.get('EXTENSION_SUPPORT'); -if (!extensionSupportPath) { - throw new Error('EXTENSION_SUPPORT is not set'); -} - -expose({ - async indexFiles(bucketName: string, files: string[]): Promise { - const cwd = Deno.cwd(); - console.error('cwd', cwd); - const bucket = new Bucket(extensionSupportPath, bucketName); - // const bucket = new Bucket(extensionSupportPath, bucketName); - console.error('bucket path', bucket.bucketPath); - console.error('files', files); - await bucket.init(); - for (const file of files) { - if (!existsSync(file)) { - throw new Error(`File ${file} does not exist`); - } - console.error('file', file); - // check if file is directory - const stats = Deno.statSync(file); - if (stats.isFile) { - const ext = path.extname(file); - if (txtExts.includes(ext)) { - console.error('Adding text file', file); - await bucket.addTextFile(file); - console.error('Finished adding text file', file); - } else if (ext === '.pdf') { - console.error('Adding pdf file', file); - await bucket.addPDF(file); - } else if (stats.isDirectory) { - console.error('Adding directory', file); - await bucket.addDirectory(file); - } else { - throw new Error(`Unsupported file type: ${ext}`); - } - } - } - await bucket.save(); - } -} satisfies DenoAPI); +expose(new Bucket() satisfies DenoAPI); 
diff --git a/deno-src/main.ts b/deno-src/main.ts index 6544331..09e5269 100644 --- a/deno-src/main.ts +++ b/deno-src/main.ts @@ -60,60 +60,60 @@ async function deleteDocuments(vectorStore: FaissStore, ids: string[]) { const vectorStore = await getVectorStore(); -// const llm = new ChatOpenAI({ -// model: "gpt-4o-mini", -// temperature: 0, -// }); +const llm = new ChatOpenAI({ + model: "gpt-4o-mini", + temperature: 0, +}); -// // Define prompt for question-answering -// const promptTemplate = await pull("rlm/rag-prompt"); +// Define prompt for question-answering +const promptTemplate = await pull("rlm/rag-prompt"); -// // Define state for application -// const InputStateAnnotation = Annotation.Root({ -// question: Annotation, -// }); +// Define state for application +const InputStateAnnotation = Annotation.Root({ + question: Annotation, +}); -// const StateAnnotation = Annotation.Root({ -// question: Annotation, -// context: Annotation, -// answer: Annotation, -// }); +const StateAnnotation = Annotation.Root({ + question: Annotation, + context: Annotation, + answer: Annotation, +}); -// // Define application steps -// const retrieve = async (state: typeof InputStateAnnotation.State) => { -// const retrievedDocs = await vectorStore.similaritySearch(state.question); -// return { context: retrievedDocs }; -// }; +// Define application steps +const retrieve = async (state: typeof InputStateAnnotation.State) => { + const retrievedDocs = await vectorStore.similaritySearch(state.question); + return { context: retrievedDocs }; +}; -// const generate = async (state: typeof StateAnnotation.State) => { -// const docsContent = state.context.map((doc) => doc.pageContent).join("\n"); -// const messages = await promptTemplate.invoke({ -// question: state.question, -// context: docsContent, -// }); -// const response = await llm.invoke(messages); -// return { answer: response.content }; -// }; +const generate = async (state: typeof StateAnnotation.State) => { + const docsContent = 
state.context.map((doc) => doc.pageContent).join("\n"); + const messages = await promptTemplate.invoke({ + question: state.question, + context: docsContent, + }); + const response = await llm.invoke(messages); + return { answer: response.content }; +}; -// // Compile application and test -// const graph = new StateGraph(StateAnnotation) -// .addNode("retrieve", retrieve) -// .addNode("generate", generate) -// .addEdge("__start__", "retrieve") -// .addEdge("retrieve", "generate") -// .addEdge("generate", "__end__") -// .compile(); +// Compile application and test +const graph = new StateGraph(StateAnnotation) + .addNode("retrieve", retrieve) + .addNode("generate", generate) + .addEdge("__start__", "retrieve") + .addEdge("retrieve", "generate") + .addEdge("generate", "__end__") + .compile(); -// let inputs = { question: "What is Task Decomposition?" }; +let inputs = { question: "What is Task Decomposition?" }; -// while (true) { -// const question = prompt("Enter your question (or 'exit' to quit): "); -// if (!question || question.toLowerCase() === "exit") { -// break; -// } +while (true) { + const question = prompt("Enter your question (or 'exit' to quit): "); + if (!question || question.toLowerCase() === "exit") { + break; + } -// const result = await graph.invoke({ question }); -// console.log("\nAnswer:"); -// console.log(result.answer); -// console.log("\n-------------------\n"); -// } + const result = await graph.invoke({ question }); + console.log("\nAnswer:"); + console.log(result.answer); + console.log("\n-------------------\n"); +} diff --git a/jsr.json b/jsr.json index 861de4f..6b4a582 100644 --- a/jsr.json +++ b/jsr.json @@ -1,19 +1,22 @@ { "name": "@kunkun/kunkun-ext-rag", - "version": "0.0.4", + "version": "0.0.5", "license": "MIT", "exports": "./mod.ts", "publish": { "include": ["mod.ts", "deno-src", "build", "LICENSE", "README.md", "package.json"] }, "imports": { + "@kunkun/api": "jsr:@kunkun/api@^0.0.52", "@langchain/community": 
"npm:@langchain/community@^0.3.22", "@langchain/core": "npm:@langchain/core@^0.3.27", "@langchain/langgraph": "npm:@langchain/langgraph@^0.2.38", "@langchain/openai": "npm:@langchain/openai@^0.3.16", "@langchain/textsplitters": "npm:@langchain/textsplitters@^0.1.0", + "pdf-parse": "npm:pdf-parse@^1.1.1", "@std/assert": "jsr:@std/assert@1", + "valibot": "jsr:@valibot/valibot@^0.42.1", "faiss-node": "npm:faiss-node@^0.5.1", - "langchain": "npm:langchain@^0.3.9" + "langchain": "npm:langchain@^0.3.12" } } diff --git a/package.json b/package.json index 36dc5a9..6a2679a 100644 --- a/package.json +++ b/package.json @@ -3,7 +3,7 @@ "license": "MIT", "name": "kunkun-ext-rag", "draft": true, - "version": "0.0.4", + "version": "0.0.5", "private": true, "kunkun": { "name": "RAG", @@ -62,14 +62,25 @@ ] }, "shell:stdin-write", - "shell:kill" + "shell:kill", + { + "permission": "open:url", + "allow": [ + { + "url": "https://en.wikipedia.org/wiki/Retrieval-augmented_generation" + }, + { + "url": "https://github.com/kunkunsh/kunkun-ext-rag" + } + ] + } ], "customUiCmds": [ { "main": "/", "dist": "build", "devMain": "http://localhost:5173", - "name": "RAG", + "name": "Local RAG", "cmds": [] } ], @@ -88,9 +99,11 @@ "@iconify/svelte": "^4.2.0", "@kksh/api": "^0.0.55", "@kksh/svelte5": "0.1.15", + "@langchain/openai": "^0.4.2", "clsx": "^2.1.1", "lucide-svelte": "^0.474.0", "mode-watcher": "^0.5.1", + "svelte-markdown": "^0.4.1", "sveltekit-superforms": "^2.23.1", "tailwind-merge": "^2.6.0", "tailwind-variants": "^0.3.1", diff --git a/src/api.types.ts b/src/api.types.ts index 489212f..b6fa940 100644 --- a/src/api.types.ts +++ b/src/api.types.ts @@ -1,3 +1,10 @@ export interface DenoAPI { - indexFiles(bucketName: string, files: string[]): Promise; + init(bucketDir: string, bucketName: string): Promise; + addTextFile(filePath: string): Promise; + addPDF(filePath: string): Promise; + addDirectory(dir: string): Promise; + indexFiles(files: string[]): Promise; + save(): Promise; + 
retrieve(query: string): Promise; + query(query: string): Promise; } diff --git a/src/lib/components/DatabaseList.svelte b/src/lib/components/DatabaseList.svelte index 4beecba..a60669a 100644 --- a/src/lib/components/DatabaseList.svelte +++ b/src/lib/components/DatabaseList.svelte @@ -11,8 +11,9 @@ {dbInfo.name} AI Provider: {dbInfo.ai} - + - + + + {/snippet} diff --git a/src/lib/components/TauriLink.svelte b/src/lib/components/TauriLink.svelte new file mode 100644 index 0000000..cea6197 --- /dev/null +++ b/src/lib/components/TauriLink.svelte @@ -0,0 +1,16 @@ + + + { + e.preventDefault(); + open.url(href); + }} +> + {@render children()} + diff --git a/src/lib/components/app-sidebar.svelte b/src/lib/components/app-sidebar.svelte index 43b752c..61cd550 100644 --- a/src/lib/components/app-sidebar.svelte +++ b/src/lib/components/app-sidebar.svelte @@ -1,8 +1,7 @@ diff --git a/src/lib/deno.ts b/src/lib/deno.ts index 6ae6d6c..2720108 100644 --- a/src/lib/deno.ts +++ b/src/lib/deno.ts @@ -1,7 +1,7 @@ import { fs, shell, path, toast } from '@kksh/api/ui/iframe'; -import type { DenoAPI } from '../api.types'; +import type { DenoAPI } from '../api.types.ts'; -export async function getRpcAPI(env: { OPENAI_API_KEY: string; EXTENSION_SUPPORT: string }) { +export async function getRpcAPI(env: { OPENAI_API_KEY: string }) { await installDenoDeps().catch((err) => { return toast.error(`Failed to install deno dependencies; ${err.message}`); }); @@ -13,7 +13,7 @@ export async function getRpcAPI(env: { OPENAI_API_KEY: string; EXTENSION_SUPPORT { cwd, // allowAllEnv: true, - allowEnv: ['OPENAI_API_KEY', 'EXTENSION_SUPPORT', 'CWD'], + allowEnv: ['OPENAI_API_KEY', 'CWD'], allowWrite: ['$EXTENSION_SUPPORT'], allowAllRead: true, // allowAllWrite: true, diff --git a/src/routes/about/+page.svelte b/src/routes/about/+page.svelte index f514f24..6f46b1b 100644 --- a/src/routes/about/+page.svelte +++ b/src/routes/about/+page.svelte @@ -1,15 +1,30 @@ - - - About Page - - - - - - - +
+

About Page

+ Source Code: + + https://github.com/kunkunsh/kunkun-ext-rag + . +
+
+

+ Kunkun RAG Extension is a local RAG app that allows you to index a local directory of files and + search them using an LLM.

+

+ If you don't know what RAG is, you can read more about it at + + https://en.wikipedia.org/wiki/Retrieval-augmented_generation + .

+

You can add files to a database. Currently only text files are supported.

+

+ Text files with the extensions .txt, .md, and .mdx are supported. If + you need other file types, send a feature request to the repo.

+
diff --git a/src/routes/database/[id]/+page.svelte b/src/routes/database/[id]/+page.svelte index f97801a..939fa43 100644 --- a/src/routes/database/[id]/+page.svelte +++ b/src/routes/database/[id]/+page.svelte @@ -1,43 +1,51 @@
-

Manage Database

- - +

+ Manage Database + + + + + + Pick the files or directories you want to index into vector database. Then you can use the + database to answer questions. + + +

+
+ + +
+
{ + cancel(); + ans = ''; + loading = true; + if (query.length === 0) { + toast.error('Question is required'); + return; + } + ans = (await rpc?.api.query(query)) ?? ''; + query = ''; + loading = false; + }} + > +
+ + +
+ {#if loading} +
+ +
+ {:else} +
+ +
+ {/if} +
diff --git a/src/routes/database/[id]/+page.ts b/src/routes/database/[id]/+page.ts index 7184a07..f0f42f2 100644 --- a/src/routes/database/[id]/+page.ts +++ b/src/routes/database/[id]/+page.ts @@ -1,5 +1,7 @@ import type { PageLoad } from './$types'; +export const prerender = false; + export const load: PageLoad = ({ params: { id } }) => { return { id: parseInt(id) }; }; diff --git a/svelte.config.js b/svelte.config.js index 03d229d..e625f86 100644 --- a/svelte.config.js +++ b/svelte.config.js @@ -11,7 +11,9 @@ const config = { // adapter-auto only supports some environments, see https://kit.svelte.dev/docs/adapter-auto for a list. // If your environment is not supported, or you settled on a specific environment, switch out the adapter. // See https://kit.svelte.dev/docs/adapters for more information about adapters. - adapter: adapter({}), + adapter: adapter({ + fallback: '400.html' + }), alias: { '@/*': './src/lib/*' }