diff --git a/bun.lockb b/bun.lockb index 15224d0..8342534 100644 Binary files a/bun.lockb and b/bun.lockb differ diff --git a/package.json b/package.json index 1988dfa..2d445ec 100644 --- a/package.json +++ b/package.json @@ -29,6 +29,7 @@ "antd": "^5.13.3", "axios": "^1.6.7", "cheerio": "^1.0.0-rc.12", + "d3-dsv": "2", "dayjs": "^1.11.10", "html-to-text": "^9.0.5", "i18next": "^23.10.1", @@ -55,6 +56,7 @@ "devDependencies": { "@plasmohq/prettier-plugin-sort-imports": "4.0.1", "@types/chrome": "0.0.259", + "@types/d3-dsv": "^2.0.1", "@types/html-to-text": "^9.0.4", "@types/node": "20.11.9", "@types/pubsub-js": "^1.8.6", diff --git a/src/components/Option/Knowledge/SelectedKnwledge.tsx b/src/components/Option/Knowledge/SelectedKnwledge.tsx index c217465..b7eaa2d 100644 --- a/src/components/Option/Knowledge/SelectedKnwledge.tsx +++ b/src/components/Option/Knowledge/SelectedKnwledge.tsx @@ -12,9 +12,9 @@ export const SelectedKnowledge = () => { {"/"} -
+
- + {knowledge.title} diff --git a/src/libs/process-knowledge.ts b/src/libs/process-knowledge.ts index d98cf17..eeff415 100644 --- a/src/libs/process-knowledge.ts +++ b/src/libs/process-knowledge.ts @@ -7,6 +7,8 @@ import { import { OllamaEmbeddings } from "@langchain/community/embeddings/ollama" import { RecursiveCharacterTextSplitter } from "langchain/text_splitter" import { PageAssistVectorStore } from "./PageAssistVectorStore" +import { PageAssisCSVUrlLoader } from "@/loader/csv" +import { PageAssisTXTUrlLoader } from "@/loader/txt" export const processKnowledge = async (msg: any, id: string): Promise => { console.log(`Processing knowledge with id: ${id}`) @@ -38,6 +40,34 @@ export const processKnowledge = async (msg: any, id: string): Promise => { }) let docs = await loader.load() const chunks = await textSplitter.splitDocuments(docs) + await PageAssistVectorStore.fromDocuments(chunks, ollamaEmbedding, { + knownledge_id: knowledge.id, + file_id: doc.source_id + }) + } else if (doc.type === "csv" || doc.type === "text/csv") { + const loader = new PageAssisCSVUrlLoader({ + name: doc.filename, + url: doc.content, + options: {} + }) + + let docs = await loader.load() + + const chunks = await textSplitter.splitDocuments(docs) + await PageAssistVectorStore.fromDocuments(chunks, ollamaEmbedding, { + knownledge_id: knowledge.id, + file_id: doc.source_id + }) + } else if (doc.type === "txt" || doc.type === "text/plain") { + const loader = new PageAssisTXTUrlLoader({ + name: doc.filename, + url: doc.content + }) + + let docs = await loader.load() + + const chunks = await textSplitter.splitDocuments(docs) + await PageAssistVectorStore.fromDocuments(chunks, ollamaEmbedding, { knownledge_id: knowledge.id, file_id: doc.source_id diff --git a/src/loader/csv.ts b/src/loader/csv.ts new file mode 100644 index 0000000..b93f1e1 --- /dev/null +++ b/src/loader/csv.ts @@ -0,0 +1,84 @@ +import { dsvFormat } from "d3-dsv" + +import { BaseDocumentLoader } from 
"langchain/document_loaders/base" +import { Document } from "@langchain/core/documents" +export interface WebLoaderParams { + url: string + name: string + options: { + column?: string + separator?: string + } +} + +export class PageAssisCSVUrlLoader + extends BaseDocumentLoader + implements WebLoaderParams +{ + pdf: { content: string; page: number }[] + url: string + name: string + options: { column?: string; separator?: string } + + constructor({ url, name, options }: WebLoaderParams) { + super() + this.url = url + this.name = name + this.options = options ?? {} + } + + public async parse(raw: string): Promise { + const { column, separator = "," } = this.options + const psv = dsvFormat(separator) + + let parsed = psv.parseRows(raw.trim()) + + if (column !== undefined) { + if (!parsed[0].includes(column)) { + throw new Error(`ColumnNotFoundError: Column ${column} not found`) + } + + const columnIndex = parsed[0].indexOf(column) + return parsed.map((row) => row[columnIndex]!) + } + + const headers = parsed[0] + parsed = parsed.slice(1) + + return parsed.map((row) => + row.map((value, index) => `${headers[index]}: ${value}`).join("\n") + ) + } + async load(): Promise>[]> { + const res = await fetch(this.url) + + if (!res.ok) { + throw new Error(`Failed to fetch ${this.url}`) + } + + const raw = await res.text() + + const parsed = await this.parse(raw) + let metadata = { source: this.name, type: "csv" } + parsed.forEach((pageContent, i) => { + if (typeof pageContent !== "string") { + throw new Error( + `Expected string, at position ${i} got ${typeof pageContent}` + ) + } + }) + return parsed.map( + (pageContent, i) => + new Document({ + pageContent, + metadata: + parsed.length === 1 + ? 
metadata + : { + ...metadata, + line: i + 1 + } + }) + ) + } +} diff --git a/src/loader/txt.ts b/src/loader/txt.ts new file mode 100644 index 0000000..4b92453 --- /dev/null +++ b/src/loader/txt.ts @@ -0,0 +1,57 @@ +import { BaseDocumentLoader } from "langchain/document_loaders/base" +import { Document } from "@langchain/core/documents" +export interface WebLoaderParams { + url: string + name: string +} + +export class PageAssisTXTUrlLoader + extends BaseDocumentLoader + implements WebLoaderParams +{ + pdf: { content: string; page: number }[] + url: string + name: string + + constructor({ url, name }: WebLoaderParams) { + super() + this.url = url + this.name = name + } + + public async parse(raw: string): Promise { + return [raw] + } + async load(): Promise>[]> { + const res = await fetch(this.url) + + if (!res.ok) { + throw new Error(`Failed to fetch ${this.url}`) + } + + const raw = await res.text() + + const parsed = await this.parse(raw) + let metadata = { source: this.name, type: "txt" } + parsed.forEach((pageContent, i) => { + if (typeof pageContent !== "string") { + throw new Error( + `Expected string, at position ${i} got ${typeof pageContent}` + ) + } + }) + return parsed.map( + (pageContent, i) => + new Document({ + pageContent, + metadata: + parsed.length === 1 + ? metadata + : { + ...metadata, + line: i + 1 + } + }) + ) + } +}