-
+
{knowledge.title}
diff --git a/src/libs/process-knowledge.ts b/src/libs/process-knowledge.ts
index d98cf17..eeff415 100644
--- a/src/libs/process-knowledge.ts
+++ b/src/libs/process-knowledge.ts
@@ -7,6 +7,8 @@ import {
import { OllamaEmbeddings } from "@langchain/community/embeddings/ollama"
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter"
import { PageAssistVectorStore } from "./PageAssistVectorStore"
+import { PageAssisCSVUrlLoader } from "@/loader/csv"
+import { PageAssisTXTUrlLoader } from "@/loader/txt"
export const processKnowledge = async (msg: any, id: string): Promise => {
console.log(`Processing knowledge with id: ${id}`)
@@ -38,6 +40,34 @@ export const processKnowledge = async (msg: any, id: string): Promise => {
})
let docs = await loader.load()
const chunks = await textSplitter.splitDocuments(docs)
+ await PageAssistVectorStore.fromDocuments(chunks, ollamaEmbedding, {
+ knownledge_id: knowledge.id,
+ file_id: doc.source_id
+ })
+ } else if (doc.type === "csv" || doc.type === "text/csv") {
+ const loader = new PageAssisCSVUrlLoader({
+ name: doc.filename,
+ url: doc.content,
+ options: {}
+ })
+
+ let docs = await loader.load()
+
+ const chunks = await textSplitter.splitDocuments(docs)
+ await PageAssistVectorStore.fromDocuments(chunks, ollamaEmbedding, {
+ knownledge_id: knowledge.id,
+ file_id: doc.source_id
+ })
+ } else if (doc.type === "txt" || doc.type === "text/plain") {
+ const loader = new PageAssisTXTUrlLoader({
+ name: doc.filename,
+ url: doc.content
+ })
+
+ let docs = await loader.load()
+
+ const chunks = await textSplitter.splitDocuments(docs)
+
await PageAssistVectorStore.fromDocuments(chunks, ollamaEmbedding, {
knownledge_id: knowledge.id,
file_id: doc.source_id
diff --git a/src/loader/csv.ts b/src/loader/csv.ts
new file mode 100644
index 0000000..b93f1e1
--- /dev/null
+++ b/src/loader/csv.ts
@@ -0,0 +1,110 @@
+import { dsvFormat } from "d3-dsv"
+
+import { BaseDocumentLoader } from "langchain/document_loaders/base"
+import { Document } from "@langchain/core/documents"
+export interface WebLoaderParams {
+  url: string
+  name: string
+  options: {
+    column?: string
+    separator?: string
+  }
+}
+
+/**
+ * Loads a CSV file from a URL and produces one Document per data row.
+ *
+ * Without `options.column` each row is rendered as "header: value" lines;
+ * with it, each row contributes only the value of that column.
+ */
+export class PageAssisCSVUrlLoader
+  extends BaseDocumentLoader
+  implements WebLoaderParams
+{
+  // NOTE(review): never assigned or read in this class; kept so the class
+  // shape stays unchanged.
+  pdf: { content: string; page: number }[]
+  url: string
+  name: string
+  options: { column?: string; separator?: string }
+
+  /**
+   * @param params.url Address the CSV text is fetched from.
+   * @param params.name Stored as `metadata.source` on every document.
+   * @param params.options Optional column filter and field separator.
+   */
+  constructor({ url, name, options }: WebLoaderParams) {
+    super()
+    this.url = url
+    this.name = name
+    // Keep the caller's options; `?? {}` covers untyped callers that omit them.
+    this.options = options ?? {}
+  }
+
+  /**
+   * Splits raw CSV text into one string per data row.
+   *
+   * @param raw CSV text whose first row is treated as the header.
+   * @returns One entry per data row; the header row is never included.
+   * @throws Error when `options.column` is set but missing from the header.
+   */
+  public async parse(raw: string): Promise<string[]> {
+    const { column, separator = "," } = this.options
+    const [headers, ...records] = dsvFormat(separator).parseRows(raw.trim())
+
+    // Empty input has no header row, so there is nothing to emit.
+    if (headers === undefined) {
+      return []
+    }
+
+    if (column !== undefined) {
+      const columnIndex = headers.indexOf(column)
+      if (columnIndex === -1) {
+        throw new Error(`ColumnNotFoundError: Column ${column} not found`)
+      }
+      // Only data rows are mapped, so the column name itself is not emitted.
+      return records.map((row) => row[columnIndex]!)
+    }
+
+    return records.map((row) =>
+      row.map((value, index) => `${headers[index]}: ${value}`).join("\n")
+    )
+  }
+
+  /**
+   * Fetches the CSV and wraps each parsed row in a Document.
+   *
+   * @returns Documents tagged with `source` and `type: "csv"`, plus a 1-based
+   *   `line` when more than one row was parsed.
+   * @throws Error when the response is not OK or a parsed entry is not a string.
+   */
+  async load(): Promise<Document<Record<string, any>>[]> {
+    const res = await fetch(this.url)
+
+    if (!res.ok) {
+      throw new Error(`Failed to fetch ${this.url}`)
+    }
+
+    const raw = await res.text()
+    const parsed = await this.parse(raw)
+    const metadata = { source: this.name, type: "csv" }
+
+    // Validate every entry before building any Document.
+    parsed.forEach((pageContent, i) => {
+      if (typeof pageContent !== "string") {
+        throw new Error(
+          `Expected string, at position ${i} got ${typeof pageContent}`
+        )
+      }
+    })
+
+    return parsed.map(
+      (pageContent, i) =>
+        new Document({
+          pageContent,
+          metadata:
+            parsed.length === 1 ? metadata : { ...metadata, line: i + 1 }
+        })
+    )
+  }
+}
diff --git a/src/loader/txt.ts b/src/loader/txt.ts
new file mode 100644
index 0000000..4b92453
--- /dev/null
+++ b/src/loader/txt.ts
@@ -0,0 +1,77 @@
+import { BaseDocumentLoader } from "langchain/document_loaders/base"
+import { Document } from "@langchain/core/documents"
+export interface WebLoaderParams {
+  url: string
+  name: string
+}
+
+/**
+ * Loads a plain-text file from a URL and wraps its content in a Document.
+ */
+export class PageAssisTXTUrlLoader
+  extends BaseDocumentLoader
+  implements WebLoaderParams
+{
+  // NOTE(review): never assigned or read in this class; kept so the class
+  // shape stays unchanged.
+  pdf: { content: string; page: number }[]
+  url: string
+  name: string
+
+  /**
+   * @param params.url Address the text is fetched from.
+   * @param params.name Stored as `metadata.source` on every document.
+   */
+  constructor({ url, name }: WebLoaderParams) {
+    super()
+    this.url = url
+    this.name = name
+  }
+
+  /**
+   * Returns the whole text as a single entry; no splitting happens here.
+   *
+   * @param raw Full text of the fetched file.
+   */
+  public async parse(raw: string): Promise<string[]> {
+    return [raw]
+  }
+
+  /**
+   * Fetches the text and wraps each parsed entry in a Document.
+   *
+   * @returns Documents tagged with `source` and `type: "txt"`, plus a 1-based
+   *   `line` when more than one entry was parsed.
+   * @throws Error when the response is not OK or a parsed entry is not a string.
+   */
+  async load(): Promise<Document<Record<string, any>>[]> {
+    const res = await fetch(this.url)
+
+    if (!res.ok) {
+      throw new Error(`Failed to fetch ${this.url}`)
+    }
+
+    const raw = await res.text()
+    const parsed = await this.parse(raw)
+    // This loader handles plain text, so label the documents "txt", not "csv".
+    const metadata = { source: this.name, type: "txt" }
+
+    // Validate every entry before building any Document.
+    parsed.forEach((pageContent, i) => {
+      if (typeof pageContent !== "string") {
+        throw new Error(
+          `Expected string, at position ${i} got ${typeof pageContent}`
+        )
+      }
+    })
+
+    return parsed.map(
+      (pageContent, i) =>
+        new Document({
+          pageContent,
+          metadata:
+            parsed.length === 1 ? metadata : { ...metadata, line: i + 1 }
+        })
+    )
+  }
+}