Add d3-dsv and @types/d3-dsv dependencies
This commit is contained in:
		
							parent
							
								
									82abbf5bad
								
							
						
					
					
						commit
						0de5ea0b04
					
				| @ -29,6 +29,7 @@ | |||||||
|     "antd": "^5.13.3", |     "antd": "^5.13.3", | ||||||
|     "axios": "^1.6.7", |     "axios": "^1.6.7", | ||||||
|     "cheerio": "^1.0.0-rc.12", |     "cheerio": "^1.0.0-rc.12", | ||||||
|  |     "d3-dsv": "2", | ||||||
|     "dayjs": "^1.11.10", |     "dayjs": "^1.11.10", | ||||||
|     "html-to-text": "^9.0.5", |     "html-to-text": "^9.0.5", | ||||||
|     "i18next": "^23.10.1", |     "i18next": "^23.10.1", | ||||||
| @ -55,6 +56,7 @@ | |||||||
|   "devDependencies": { |   "devDependencies": { | ||||||
|     "@plasmohq/prettier-plugin-sort-imports": "4.0.1", |     "@plasmohq/prettier-plugin-sort-imports": "4.0.1", | ||||||
|     "@types/chrome": "0.0.259", |     "@types/chrome": "0.0.259", | ||||||
|  |     "@types/d3-dsv": "^3.0.7", | ||||||
|     "@types/html-to-text": "^9.0.4", |     "@types/html-to-text": "^9.0.4", | ||||||
|     "@types/node": "20.11.9", |     "@types/node": "20.11.9", | ||||||
|     "@types/pubsub-js": "^1.8.6", |     "@types/pubsub-js": "^1.8.6", | ||||||
|  | |||||||
| @ -12,9 +12,9 @@ export const SelectedKnowledge = () => { | |||||||
|       <span className="text-lg font-thin text-zinc-300 dark:text-zinc-600"> |       <span className="text-lg font-thin text-zinc-300 dark:text-zinc-600"> | ||||||
|         {"/"} |         {"/"} | ||||||
|       </span> |       </span> | ||||||
|       <div className="border flex justify-between items-center rounded-md p-1 gap-2 bg-gray-100 dark:bg-gray-800 dark:border-gray-700"> |       <div className="border flex justify-between items-center rounded-full px-2 py-1 gap-2 bg-gray-100 dark:bg-slate-800 dark:border-slate-700"> | ||||||
|         <div className="inline-flex items-center gap-2"> |         <div className="inline-flex items-center gap-2"> | ||||||
|           <Blocks className="h-6 w-6 text-gray-400" /> |           <Blocks className="h-5 w-5 text-gray-400" /> | ||||||
|           <span className="text-xs font-semibold dark:text-gray-100"> |           <span className="text-xs font-semibold dark:text-gray-100"> | ||||||
|             {knowledge.title} |             {knowledge.title} | ||||||
|           </span> |           </span> | ||||||
|  | |||||||
| @ -7,6 +7,8 @@ import { | |||||||
| import { OllamaEmbeddings } from "@langchain/community/embeddings/ollama" | import { OllamaEmbeddings } from "@langchain/community/embeddings/ollama" | ||||||
| import { RecursiveCharacterTextSplitter } from "langchain/text_splitter" | import { RecursiveCharacterTextSplitter } from "langchain/text_splitter" | ||||||
| import { PageAssistVectorStore } from "./PageAssistVectorStore" | import { PageAssistVectorStore } from "./PageAssistVectorStore" | ||||||
|  | import { PageAssisCSVUrlLoader } from "@/loader/csv" | ||||||
|  | import { PageAssisTXTUrlLoader } from "@/loader/txt" | ||||||
| 
 | 
 | ||||||
| export const processKnowledge = async (msg: any, id: string): Promise<void> => { | export const processKnowledge = async (msg: any, id: string): Promise<void> => { | ||||||
|   console.log(`Processing knowledge with id: ${id}`) |   console.log(`Processing knowledge with id: ${id}`) | ||||||
| @ -38,6 +40,34 @@ export const processKnowledge = async (msg: any, id: string): Promise<void> => { | |||||||
|         }) |         }) | ||||||
|         let docs = await loader.load() |         let docs = await loader.load() | ||||||
|         const chunks = await textSplitter.splitDocuments(docs) |         const chunks = await textSplitter.splitDocuments(docs) | ||||||
|  |         await PageAssistVectorStore.fromDocuments(chunks, ollamaEmbedding, { | ||||||
|  |           knownledge_id: knowledge.id, | ||||||
|  |           file_id: doc.source_id | ||||||
|  |         }) | ||||||
|  |       } else if (doc.type === "csv" || doc.type === "text/csv") { | ||||||
|  |         const loader = new PageAssisCSVUrlLoader({ | ||||||
|  |           name: doc.filename, | ||||||
|  |           url: doc.content, | ||||||
|  |           options: {} | ||||||
|  |         }) | ||||||
|  | 
 | ||||||
|  |         let docs = await loader.load() | ||||||
|  | 
 | ||||||
|  |         const chunks = await textSplitter.splitDocuments(docs) | ||||||
|  |         await PageAssistVectorStore.fromDocuments(chunks, ollamaEmbedding, { | ||||||
|  |           knownledge_id: knowledge.id, | ||||||
|  |           file_id: doc.source_id | ||||||
|  |         }) | ||||||
|  |       } else if (doc.type === "txt" || doc.type === "text/plain") { | ||||||
|  |         const loader = new PageAssisTXTUrlLoader({ | ||||||
|  |           name: doc.filename, | ||||||
|  |           url: doc.content | ||||||
|  |         }) | ||||||
|  | 
 | ||||||
|  |         let docs = await loader.load() | ||||||
|  | 
 | ||||||
|  |         const chunks = await textSplitter.splitDocuments(docs) | ||||||
|  | 
 | ||||||
|         await PageAssistVectorStore.fromDocuments(chunks, ollamaEmbedding, { |         await PageAssistVectorStore.fromDocuments(chunks, ollamaEmbedding, { | ||||||
|           knownledge_id: knowledge.id, |           knownledge_id: knowledge.id, | ||||||
|           file_id: doc.source_id |           file_id: doc.source_id | ||||||
|  | |||||||
							
								
								
									
										84
									
								
								src/loader/csv.ts
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										84
									
								
								src/loader/csv.ts
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,84 @@ | |||||||
|  | import { dsvFormat } from "d3-dsv" | ||||||
|  | 
 | ||||||
|  | import { BaseDocumentLoader } from "langchain/document_loaders/base" | ||||||
|  | import { Document } from "@langchain/core/documents" | ||||||
/** Constructor arguments accepted by {@link PageAssisCSVUrlLoader}. */
export interface WebLoaderParams {
  /** Location the CSV text is fetched from. */
  url: string
  /** Label stored as `metadata.source` on every produced document. */
  name: string
  options: {
    /** Header name of the only column to extract; when omitted, every column is emitted. */
    column?: string
    /** Field delimiter passed to d3-dsv; `parse` falls back to "," when omitted. */
    separator?: string
  }
}

/**
 * Document loader that fetches CSV text from a URL and produces one
 * `Document` per data row.
 *
 * The first row of the file is always treated as the header row.
 */
export class PageAssisCSVUrlLoader
  extends BaseDocumentLoader
  implements WebLoaderParams
{
  // NOTE(review): never assigned or read by this class; looks like a leftover
  // from another loader. Kept so the public shape is unchanged - TODO confirm
  // and remove.
  pdf: { content: string; page: number }[]
  url: string
  name: string
  options: { column?: string; separator?: string }

  /**
   * @param params.url URL the CSV is fetched from.
   * @param params.name Source label recorded in document metadata.
   * @param params.options Column / separator settings used by `parse`.
   */
  constructor({ url, name, options }: WebLoaderParams) {
    super()
    this.url = url
    this.name = name
    // Previously the caller's options were discarded and replaced by `{}`,
    // which made `column` and `separator` impossible to use. The fallback
    // keeps untyped callers that omit `options` working.
    this.options = options ?? {}
  }

  /**
   * Converts raw CSV text into one string per data row.
   *
   * - With `options.column` set: the value of that column for each data row
   *   (a row too short to contain the column yields an empty string).
   * - Otherwise: every cell rendered as `header: value`, joined by newlines.
   *
   * The header row itself is never returned, and empty input yields `[]`.
   *
   * @param raw CSV text; surrounding whitespace is trimmed before parsing.
   * @returns One string per data row.
   * @throws Error when `options.column` is set but is not a header name.
   */
  public async parse(raw: string): Promise<string[]> {
    const { column, separator = "," } = this.options
    // Defaulting `headers` to [] means an empty file no longer crashes on
    // `parsed[0]` being undefined.
    const [headers = [], ...rows] = dsvFormat(separator).parseRows(raw.trim())

    if (column !== undefined) {
      const columnIndex = headers.indexOf(column)
      if (columnIndex === -1) {
        throw new Error(`ColumnNotFoundError: Column ${column} not found`)
      }
      // Only data rows are mapped; the header row used to leak into the
      // output as if it were a record.
      return rows.map((row) => row[columnIndex] ?? "")
    }

    return rows.map((row) =>
      row.map((value, index) => `${headers[index]}: ${value}`).join("\n")
    )
  }

  /**
   * Fetches `this.url`, parses the body as CSV and wraps each parsed row in a
   * `Document`.
   *
   * Metadata is `{ source: this.name, type: "csv" }`; when more than one row
   * is produced, each document additionally carries a 1-based `line` number.
   *
   * @returns The documents built from the parsed rows.
   * @throws Error when the response is not OK, or when `parse` yields a
   *   non-string entry.
   */
  async load(): Promise<Document<Record<string, any>>[]> {
    const res = await fetch(this.url)

    if (!res.ok) {
      throw new Error(`Failed to fetch ${this.url}`)
    }

    const raw = await res.text()
    const parsed = await this.parse(raw)
    const metadata = { source: this.name, type: "csv" }

    // `parse` is public and can be overridden, so validate its output before
    // any Document is constructed.
    parsed.forEach((pageContent, i) => {
      if (typeof pageContent !== "string") {
        throw new Error(
          `Expected string, at position ${i} got ${typeof pageContent}`
        )
      }
    })

    return parsed.map(
      (pageContent, i) =>
        new Document({
          pageContent,
          metadata:
            parsed.length === 1
              ? metadata
              : {
                  ...metadata,
                  line: i + 1
                }
        })
    )
  }
}
							
								
								
									
										57
									
								
								src/loader/txt.ts
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										57
									
								
								src/loader/txt.ts
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,57 @@ | |||||||
|  | import { BaseDocumentLoader } from "langchain/document_loaders/base" | ||||||
|  | import { Document } from "@langchain/core/documents" | ||||||
/** Constructor arguments accepted by {@link PageAssisTXTUrlLoader}. */
export interface WebLoaderParams {
  /** Location the text is fetched from. */
  url: string
  /** Label stored as `metadata.source` on every produced document. */
  name: string
}

/**
 * Document loader that fetches plain text from a URL and wraps it in
 * `Document` objects.
 */
export class PageAssisTXTUrlLoader
  extends BaseDocumentLoader
  implements WebLoaderParams
{
  // NOTE(review): never assigned or read by this class; looks like a leftover
  // from another loader. Kept so the public shape is unchanged - TODO confirm
  // and remove.
  pdf: { content: string; page: number }[]
  url: string
  name: string

  /**
   * @param params.url URL the text is fetched from.
   * @param params.name Source label recorded in document metadata.
   */
  constructor({ url, name }: WebLoaderParams) {
    super()
    this.url = url
    this.name = name
  }

  /**
   * Splits the raw text into page contents. The whole text is returned as a
   * single entry.
   *
   * @param raw Text fetched from `this.url`.
   * @returns A one-element array containing `raw` unchanged.
   */
  public async parse(raw: string): Promise<string[]> {
    return [raw]
  }

  /**
   * Fetches `this.url` and wraps each entry returned by `parse` in a
   * `Document`.
   *
   * Metadata is `{ source: this.name, type: "txt" }`; when `parse` yields more
   * than one entry, each document additionally carries a 1-based `line`
   * number.
   *
   * @returns The documents built from the fetched text.
   * @throws Error when the response is not OK, or when `parse` yields a
   *   non-string entry.
   */
  async load(): Promise<Document<Record<string, any>>[]> {
    const res = await fetch(this.url)

    if (!res.ok) {
      throw new Error(`Failed to fetch ${this.url}`)
    }

    const raw = await res.text()
    const parsed = await this.parse(raw)
    // This was "csv", copied over from the CSV loader, which mislabelled every
    // plain-text document.
    const metadata = { source: this.name, type: "txt" }

    // `parse` is public and can be overridden, so validate its output before
    // any Document is constructed.
    parsed.forEach((pageContent, i) => {
      if (typeof pageContent !== "string") {
        throw new Error(
          `Expected string, at position ${i} got ${typeof pageContent}`
        )
      }
    })

    return parsed.map(
      (pageContent, i) =>
        new Document({
          pageContent,
          metadata:
            parsed.length === 1
              ? metadata
              : {
                  ...metadata,
                  line: i + 1
                }
        })
    )
  }
}
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user