mirror of
https://github.com/kunkunsh/kunkun-ext-rag.git
synced 2025-04-04 02:16:41 +00:00
255 lines
7.9 KiB
TypeScript
255 lines
7.9 KiB
TypeScript
import { FaissStore } from '@langchain/community/vectorstores/faiss';
|
|
import { ChatOpenAI, OpenAIEmbeddings } from '@langchain/openai';
|
|
import * as v from 'valibot';
|
|
import * as path from 'jsr:@std/path';
|
|
import { existsSync, readdirSync } from 'node:fs';
|
|
import { Document } from '@langchain/core/documents';
|
|
import { RecursiveCharacterTextSplitter } from '@langchain/textsplitters';
|
|
import { DirectoryLoader } from 'langchain/document_loaders/fs/directory';
|
|
import { TextLoader } from 'langchain/document_loaders/fs/text';
|
|
import { computeSha256FromText } from './crypto.ts';
|
|
import { PDFLoader } from '@langchain/community/document_loaders/fs/pdf';
|
|
import { DenoAPI } from '../src/api.types.ts';
|
|
import { txtExts } from './constants.ts';
|
|
import { AIMessageChunk } from '@langchain/core/messages';
|
|
|
|
export const embeddings = new OpenAIEmbeddings({
|
|
// configuration: {
|
|
// baseURL: 'https://api.deepseek.com'
|
|
// },
|
|
model: 'text-embedding-3-large'
|
|
});
|
|
|
|
export const MetadataSchema = v.object({
|
|
filesSha256: v.array(v.string())
|
|
});
|
|
export type Metadata = v.InferOutput<typeof MetadataSchema>;
|
|
|
|
export async function getDocsFromDirectory(directoryPath: string): Promise<Document[]> {
|
|
const splitter = new RecursiveCharacterTextSplitter({
|
|
chunkSize: 1000,
|
|
chunkOverlap: 200
|
|
});
|
|
|
|
const loader = new DirectoryLoader(directoryPath, {
|
|
// '.json': (path) => new JSONLoader(path, '/texts'),
|
|
// '.jsonl': (path) => new JSONLinesLoader(path, '/html'),
|
|
'.txt': (path) => new TextLoader(path),
|
|
'.md': (path) => new TextLoader(path),
|
|
'.mdx': (path) => new TextLoader(path)
|
|
});
|
|
const docs = await loader.load();
|
|
const allSplits = await splitter.splitDocuments(docs);
|
|
return allSplits;
|
|
}
|
|
|
|
export class Bucket implements DenoAPI {
|
|
bucketPath: string = '';
|
|
faissStorePath: string = '';
|
|
metadataPath: string = '';
|
|
bucketDir: string = '';
|
|
bucketName: string = '';
|
|
private _vectorStore: FaissStore | null = null;
|
|
filesSha256: Set<string> = new Set();
|
|
|
|
async init(bucketDir: string, bucketName: string) {
|
|
this.bucketDir = bucketDir;
|
|
this.bucketName = bucketName;
|
|
this.bucketPath = path.join(this.bucketDir, this.bucketName);
|
|
this.faissStorePath = path.join(this.bucketPath, 'faiss-store');
|
|
this.metadataPath = path.join(this.bucketPath, 'metadata.json');
|
|
|
|
if (!existsSync(this.bucketPath)) {
|
|
Deno.mkdirSync(this.bucketPath, { recursive: true });
|
|
}
|
|
if (existsSync(this.metadataPath)) {
|
|
const metadata = JSON.parse(Deno.readTextFileSync(this.metadataPath));
|
|
const parsedMetadata = v.safeParse(MetadataSchema, metadata);
|
|
if (parsedMetadata.success) {
|
|
this.filesSha256 = new Set(parsedMetadata.output.filesSha256);
|
|
} else {
|
|
throw new Error('Invalid metadata');
|
|
}
|
|
}
|
|
this.updateMetadata();
|
|
this._vectorStore = await this.getVectorStore();
|
|
}
|
|
|
|
updateMetadata() {
|
|
const metadata: Metadata = {
|
|
filesSha256: Array.from(this.filesSha256)
|
|
};
|
|
Deno.writeTextFileSync(this.metadataPath, JSON.stringify(metadata));
|
|
}
|
|
|
|
async getVectorStore() {
|
|
if (
|
|
existsSync(this.faissStorePath) &&
|
|
existsSync(path.join(this.faissStorePath, 'docstore.json'))
|
|
) {
|
|
const vectorStore = await FaissStore.load(this.faissStorePath, embeddings);
|
|
return vectorStore;
|
|
}
|
|
// const vectorStore = await FaissStore.fromDocuments(docs, embeddings);
|
|
const vectorStore = new FaissStore(embeddings, {});
|
|
// await vectorStore.save(this.faissStorePath);
|
|
return vectorStore;
|
|
}
|
|
|
|
get vectorStore() {
|
|
if (this._vectorStore === null) {
|
|
throw new Error('Vector store not initialized');
|
|
}
|
|
return this._vectorStore;
|
|
}
|
|
|
|
private async addDocuments(documents: Document[]) {
|
|
await this.vectorStore.addDocuments(documents);
|
|
// await this.vectorStore.save(this.faissStorePath);
|
|
}
|
|
|
|
private fillSha256(documents: Document[]) {
|
|
for (const doc of documents) {
|
|
const sha256 = computeSha256FromText(doc.pageContent);
|
|
doc.metadata.sha256 = sha256;
|
|
}
|
|
}
|
|
|
|
private getFilteredDocs(documents: Document[]) {
|
|
return documents.filter((doc) => !this.filesSha256.has(doc.metadata.sha256));
|
|
}
|
|
|
|
private updateSha256(docs: Document[]) {
|
|
for (const doc of docs) {
|
|
this.filesSha256.add(doc.metadata.sha256 as string);
|
|
}
|
|
this.updateMetadata();
|
|
}
|
|
|
|
async addDirectory(directoryPath: string) {
|
|
if (!existsSync(directoryPath)) {
|
|
throw new Error('Directory does not exist');
|
|
}
|
|
// check if path is a directory or file
|
|
const stats = Deno.statSync(directoryPath);
|
|
if (stats.isFile) {
|
|
throw new Error('Path is a file');
|
|
}
|
|
|
|
const docs = await getDocsFromDirectory(directoryPath);
|
|
this.fillSha256(docs);
|
|
const fileteredDocs = this.getFilteredDocs(docs);
|
|
for (const doc of fileteredDocs) {
|
|
this.filesSha256.add(doc.metadata.sha256 as string);
|
|
}
|
|
this.updateMetadata();
|
|
const splitter = new RecursiveCharacterTextSplitter({
|
|
chunkSize: 1000,
|
|
chunkOverlap: 200
|
|
});
|
|
const allSplits = await splitter.splitDocuments(fileteredDocs);
|
|
await this.addDocuments(allSplits);
|
|
}
|
|
|
|
async addTextFile(filePath: string) {
|
|
const loader = new TextLoader(filePath);
|
|
const docs = await loader.load();
|
|
console.error('Loaded docs', docs.length);
|
|
this.fillSha256(docs);
|
|
const fileteredDocs = this.getFilteredDocs(docs);
|
|
console.error('Filtered docs', fileteredDocs.length);
|
|
this.updateSha256(docs);
|
|
console.error('Updated sha256', this.filesSha256.size);
|
|
// await this.addDocuments(fileteredDocs);
|
|
await this.vectorStore.addDocuments(fileteredDocs).catch((err) => {
|
|
console.error('Error adding documents', err);
|
|
});
|
|
}
|
|
|
|
async save() {
|
|
console.error('Save Bucket', this.vectorStore.docstore._docs.size);
|
|
if (existsSync(this.faissStorePath) && readdirSync(this.faissStorePath).length === 0) {
|
|
Deno.removeSync(this.bucketPath, { recursive: true });
|
|
}
|
|
if (this.vectorStore.docstore._docs.size === 0) {
|
|
throw new Error('No documents to save');
|
|
}
|
|
await this.vectorStore.save(this.faissStorePath);
|
|
}
|
|
|
|
async addPDF(filePath: string) {
|
|
const loader = new PDFLoader(filePath);
|
|
const docs = await loader.load();
|
|
this.fillSha256(docs);
|
|
const fileteredDocs = this.getFilteredDocs(docs);
|
|
this.updateSha256(docs);
|
|
await this.addDocuments(fileteredDocs);
|
|
}
|
|
|
|
async retrieve(query: string) {
|
|
const retriever = this.vectorStore.asRetriever();
|
|
const docs = await retriever.invoke(query);
|
|
const docsText = docs.map((d) => d.pageContent).join('');
|
|
return docsText;
|
|
}
|
|
|
|
async query(question: string) {
|
|
const docsText = await this.retrieve(question);
|
|
const systemPrompt = `You are an assistant for question-answering tasks.
|
|
Use the following pieces of retrieved context to answer the question.
|
|
If you don't know the answer, just say that you don't know.
|
|
Use three sentences maximum and keep the answer concise.
|
|
Context: {context}:`;
|
|
|
|
// Populate the system prompt with the retrieved context
|
|
const systemPromptFmt = systemPrompt.replace('{context}', docsText);
|
|
|
|
// Create a model
|
|
const model = new ChatOpenAI({
|
|
model: 'gpt-4o',
|
|
temperature: 0
|
|
});
|
|
|
|
// Generate a response
|
|
const ans: AIMessageChunk = await model.invoke([
|
|
{
|
|
role: 'system',
|
|
content: systemPromptFmt
|
|
},
|
|
{
|
|
role: 'user',
|
|
content: question
|
|
}
|
|
]);
|
|
return ans.content.toString();
|
|
}
|
|
|
|
async indexFiles(files: string[]) {
|
|
console.error('Indexing files', files);
|
|
for (const file of files) {
|
|
if (!existsSync(file)) {
|
|
throw new Error(`File ${file} does not exist`);
|
|
}
|
|
// check if file is directory
|
|
const stats = Deno.statSync(file);
|
|
console.error('Indexing file', file, 'stats.isFile', stats.isFile);
|
|
if (stats.isFile) {
|
|
const ext = path.extname(file);
|
|
if (txtExts.includes(ext)) {
|
|
console.error('Adding text file 1', file);
|
|
await this.addTextFile(file);
|
|
console.error('Finished adding text file', file);
|
|
} else if (ext === '.pdf') {
|
|
console.error('Adding pdf file', file);
|
|
await this.addPDF(file);
|
|
} else {
|
|
throw new Error(`Unsupported file type: ${ext}`);
|
|
}
|
|
} else {
|
|
console.error('Adding directory', file);
|
|
await this.addDirectory(file);
|
|
}
|
|
}
|
|
}
|
|
}
|