Add d3-dsv and @types/d3-dsv dependencies

parent 82abbf5bad
commit 0de5ea0b04
package.json
@@ -29,6 +29,7 @@
     "antd": "^5.13.3",
     "axios": "^1.6.7",
     "cheerio": "^1.0.0-rc.12",
+    "d3-dsv": "2",
     "dayjs": "^1.11.10",
     "html-to-text": "^9.0.5",
     "i18next": "^23.10.1",
@@ -55,6 +56,7 @@
   "devDependencies": {
     "@plasmohq/prettier-plugin-sort-imports": "4.0.1",
     "@types/chrome": "0.0.259",
+    "@types/d3-dsv": "^3.0.7",
     "@types/html-to-text": "^9.0.4",
     "@types/node": "20.11.9",
     "@types/pubsub-js": "^1.8.6",
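The new runtime dependency is d3-dsv's delimiter-aware parser, which the CSV loader added below builds on. A minimal sketch of the API in use (the sample data here is illustrative, not from this commit):

```ts
import { dsvFormat } from "d3-dsv"

// dsvFormat(delimiter) returns a parser for that single-character delimiter.
const csv = dsvFormat(",")

// parseRows keeps the raw grid as string[][], header row included.
const rows = csv.parseRows("name,age\nAda,36\nGrace,45")
// rows[0] -> ["name", "age"], rows[1] -> ["Ada", "36"]

// parse maps each data row to an object keyed by the header row.
const objects = csv.parse("name,age\nAda,36\nGrace,45")
// objects[0] -> { name: "Ada", age: "36" }
```

@types/d3-dsv supplies the TypeScript typings for these calls.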
@@ -12,9 +12,9 @@ export const SelectedKnowledge = () => {
       <span className="text-lg font-thin text-zinc-300 dark:text-zinc-600">
         {"/"}
       </span>
-      <div className="border flex justify-between items-center rounded-md p-1 gap-2 bg-gray-100 dark:bg-gray-800 dark:border-gray-700">
+      <div className="border flex justify-between items-center rounded-full px-2 py-1 gap-2 bg-gray-100 dark:bg-slate-800 dark:border-slate-700">
         <div className="inline-flex items-center gap-2">
-          <Blocks className="h-6 w-6 text-gray-400" />
+          <Blocks className="h-5 w-5 text-gray-400" />
           <span className="text-xs font-semibold dark:text-gray-100">
             {knowledge.title}
           </span>
@@ -7,6 +7,8 @@ import {
 import { OllamaEmbeddings } from "@langchain/community/embeddings/ollama"
 import { RecursiveCharacterTextSplitter } from "langchain/text_splitter"
 import { PageAssistVectorStore } from "./PageAssistVectorStore"
+import { PageAssisCSVUrlLoader } from "@/loader/csv"
+import { PageAssisTXTUrlLoader } from "@/loader/txt"

 export const processKnowledge = async (msg: any, id: string): Promise<void> => {
   console.log(`Processing knowledge with id: ${id}`)
@@ -38,6 +40,34 @@ export const processKnowledge = async (msg: any, id: string): Promise<void> => {
         })
         let docs = await loader.load()
         const chunks = await textSplitter.splitDocuments(docs)
+        await PageAssistVectorStore.fromDocuments(chunks, ollamaEmbedding, {
+          knownledge_id: knowledge.id,
+          file_id: doc.source_id
+        })
+      } else if (doc.type === "csv" || doc.type === "text/csv") {
+        const loader = new PageAssisCSVUrlLoader({
+          name: doc.filename,
+          url: doc.content,
+          options: {}
+        })
+
+        let docs = await loader.load()
+
+        const chunks = await textSplitter.splitDocuments(docs)
+        await PageAssistVectorStore.fromDocuments(chunks, ollamaEmbedding, {
+          knownledge_id: knowledge.id,
+          file_id: doc.source_id
+        })
+      } else if (doc.type === "txt" || doc.type === "text/plain") {
+        const loader = new PageAssisTXTUrlLoader({
+          name: doc.filename,
+          url: doc.content
+        })
+
+        let docs = await loader.load()
+
+        const chunks = await textSplitter.splitDocuments(docs)
+
         await PageAssistVectorStore.fromDocuments(chunks, ollamaEmbedding, {
           knownledge_id: knowledge.id,
           file_id: doc.source_id
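Both new branches feed the same splitter-and-store pipeline as the existing PDF path: load documents, split them, then persist the chunks with the knowledge and file ids. A small sketch of that middle step in isolation (the splitter settings and sample document are illustrative; the real textSplitter is configured elsewhere in this file and is not shown in the hunk):

```ts
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter"
import { Document } from "@langchain/core/documents"

// Illustrative splitter config; the actual values live outside this hunk.
const textSplitter = new RecursiveCharacterTextSplitter({
  chunkSize: 1000,
  chunkOverlap: 200
})

const docs = [
  new Document({
    pageContent: "name: Ada\nage: 36",
    metadata: { source: "people.csv", type: "csv", line: 1 }
  })
]

// splitDocuments preserves each document's metadata on every chunk it produces.
const chunks = await textSplitter.splitDocuments(docs)
```

Each chunk is then handed to PageAssistVectorStore.fromDocuments together with the knownledge_id / file_id pair, exactly as in the PDF branch.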
src/loader/csv.ts (new file, 84 lines)
@@ -0,0 +1,84 @@
import { dsvFormat } from "d3-dsv"

import { BaseDocumentLoader } from "langchain/document_loaders/base"
import { Document } from "@langchain/core/documents"
export interface WebLoaderParams {
  url: string
  name: string
  options: {
    column?: string
    separator?: string
  }
}

export class PageAssisCSVUrlLoader
  extends BaseDocumentLoader
  implements WebLoaderParams
{
  pdf: { content: string; page: number }[]
  url: string
  name: string
  options: { column?: string; separator?: string }

  constructor({ url, name }: WebLoaderParams) {
    super()
    this.url = url
    this.name = name
    this.options = {}
  }

  public async parse(raw: string): Promise<string[]> {
    const { column, separator = "," } = this.options
    const psv = dsvFormat(separator)

    let parsed = psv.parseRows(raw.trim())

    if (column !== undefined) {
      if (!parsed[0].includes(column)) {
        throw new Error(`ColumnNotFoundError: Column ${column} not found`)
      }

      const columnIndex = parsed[0].indexOf(column)
      return parsed.map((row) => row[columnIndex]!)
    }

    const headers = parsed[0]
    parsed = parsed.slice(1)

    return parsed.map((row) =>
      row.map((value, index) => `${headers[index]}: ${value}`).join("\n")
    )
  }
  async load(): Promise<Document<Record<string, any>>[]> {
    const res = await fetch(this.url)

    if (!res.ok) {
      throw new Error(`Failed to fetch ${this.url}`)
    }

    const raw = await res.text()

    const parsed = await this.parse(raw)
    let metadata = { source: this.name, type: "csv" }
    parsed.forEach((pageContent, i) => {
      if (typeof pageContent !== "string") {
        throw new Error(
          `Expected string, at position ${i} got ${typeof pageContent}`
        )
      }
    })
    return parsed.map(
      (pageContent, i) =>
        new Document({
          pageContent,
          metadata:
            parsed.length === 1
              ? metadata
              : {
                  ...metadata,
                  line: i + 1
                }
        })
    )
  }
}
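A minimal usage sketch of the loader above (the URL and file name are placeholders): when no column option is set, every data row becomes one Document whose pageContent lists header: value pairs, with a 1-based line number in the metadata once there is more than one row.

```ts
import { PageAssisCSVUrlLoader } from "@/loader/csv"

const loader = new PageAssisCSVUrlLoader({
  name: "people.csv",                    // placeholder file name
  url: "https://example.com/people.csv", // placeholder URL
  options: {}
})

// For a response body of "name,age\nAda,36\nGrace,45", load() yields two Documents:
//   docs[0].pageContent === "name: Ada\nage: 36"
//   docs[0].metadata    === { source: "people.csv", type: "csv", line: 1 }
const docs = await loader.load()
```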
src/loader/txt.ts (new file, 57 lines)
@@ -0,0 +1,57 @@
import { BaseDocumentLoader } from "langchain/document_loaders/base"
import { Document } from "@langchain/core/documents"
export interface WebLoaderParams {
  url: string
  name: string
}

export class PageAssisTXTUrlLoader
  extends BaseDocumentLoader
  implements WebLoaderParams
{
  pdf: { content: string; page: number }[]
  url: string
  name: string

  constructor({ url, name }: WebLoaderParams) {
    super()
    this.url = url
    this.name = name
  }

  public async parse(raw: string): Promise<string[]> {
    return [raw]
  }
  async load(): Promise<Document<Record<string, any>>[]> {
    const res = await fetch(this.url)

    if (!res.ok) {
      throw new Error(`Failed to fetch ${this.url}`)
    }

    const raw = await res.text()

    const parsed = await this.parse(raw)
    let metadata = { source: this.name, type: "csv" }
    parsed.forEach((pageContent, i) => {
      if (typeof pageContent !== "string") {
        throw new Error(
          `Expected string, at position ${i} got ${typeof pageContent}`
        )
      }
    })
    return parsed.map(
      (pageContent, i) =>
        new Document({
          pageContent,
          metadata:
            parsed.length === 1
              ? metadata
              : {
                  ...metadata,
                  line: i + 1
                }
        })
    )
  }
}
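A matching sketch for the plain-text loader (placeholder URL and file name): parse wraps the whole response in a single string, so load() returns exactly one Document; note that, as written, its metadata type is labelled "csv".

```ts
import { PageAssisTXTUrlLoader } from "@/loader/txt"

const loader = new PageAssisTXTUrlLoader({
  name: "notes.txt",                   // placeholder file name
  url: "https://example.com/notes.txt" // placeholder URL
})

// One Document containing the full file; since parsed.length === 1,
// metadata is just { source: "notes.txt", type: "csv" } with no line number.
const docs = await loader.load()
```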