diff --git a/src/assets/locale/ar/settings.json b/src/assets/locale/ar/settings.json index 6de45fd..23e964d 100644 --- a/src/assets/locale/ar/settings.json +++ b/src/assets/locale/ar/settings.json @@ -334,6 +334,14 @@ "label": "عدد المستندات المسترجعة", "placeholder": "أدخل عدد المستندات المسترجعة", "required": "الرجاء إدخال عدد المستندات المسترجعة" + }, + "splittingSeparator": { + "label": "الفاصل", + "placeholder": "أدخل الفاصل (مثال: \\n\\n)", + "required": "الرجاء إدخال الفاصل" + }, + "splittingStrategy": { + "label": "مقسم النص" } }, "prompt": { @@ -355,4 +363,5 @@ }, "chromeAiSettings": { "title": "إعدادات Chrome AI" - }} + } +} diff --git a/src/assets/locale/da/settings.json b/src/assets/locale/da/settings.json index 1712a45..7efe9eb 100644 --- a/src/assets/locale/da/settings.json +++ b/src/assets/locale/da/settings.json @@ -331,6 +331,14 @@ "label": "Antal Hentede Dokumenter", "placeholder": "Indtast Number of Retrieved Documents", "required": "Venligst indtast the number of retrieved documents" + }, + "splittingSeparator": { + "label": "Separator", + "placeholder": "Indtast Separator (f.eks. \\n\\n)", + "required": "Indtast venligst en separator" + }, + "splittingStrategy": { + "label": "Tekst Splitter" } }, "prompt": { diff --git a/src/assets/locale/de/settings.json b/src/assets/locale/de/settings.json index 581cc28..9d5822b 100644 --- a/src/assets/locale/de/settings.json +++ b/src/assets/locale/de/settings.json @@ -331,6 +331,14 @@ "label": "Anzahl der abgerufenen Dokumente", "placeholder": "Anzahl der abgerufenen Dokumente eingeben", "required": "Bitte geben Sie die Anzahl der abgerufenen Dokumente ein" + }, + "splittingSeparator": { + "label": "Separator", + "placeholder": "Separator eingeben (z.B. \\n\\n)", + "required": "Bitte geben Sie einen Separator ein" + }, + "splittingStrategy": { + "label": "Text-Splitter" } }, "prompt": { diff --git a/src/assets/locale/en/settings.json b/src/assets/locale/en/settings.json index 3f6af27..2d7cdcd 100644 --- a/src/assets/locale/en/settings.json +++ b/src/assets/locale/en/settings.json @@ -72,7 +72,7 @@ } }, "braveApi": { - "label": "Brave API Key", + "label": "Brave API Key", "placeholder": "Enter your Brave API key" }, "googleDomain": { @@ -337,6 +337,14 @@ "label": "Number of Retrieved Documents", "placeholder": "Enter Number of Retrieved Documents", "required": "Please enter the number of retrieved documents" + }, + "splittingSeparator": { + "label": "Separator", + "placeholder": "Enter Separator (e.g., \\n\\n)", + "required": "Please enter a separator" + }, + "splittingStrategy": { + "label": "Text Splitter" } }, "prompt": { diff --git a/src/assets/locale/es/settings.json b/src/assets/locale/es/settings.json index 3809ff1..41a24cb 100644 --- a/src/assets/locale/es/settings.json +++ b/src/assets/locale/es/settings.json @@ -331,6 +331,14 @@ "label": "Número de Documentos Recuperados", "placeholder": "Ingrese el Número de Documentos Recuperados", "required": "Por favor, ingrese el número de documentos recuperados" + }, + "splittingSeparator": { + "label": "Separador", + "placeholder": "Ingrese el separador (ej., \\n\\n)", + "required": "Por favor, ingrese un separador" + }, + "splittingStrategy": { + "label": "Divisor de Texto" } }, "prompt": { diff --git a/src/assets/locale/fa/settings.json b/src/assets/locale/fa/settings.json index bb037ff..48aaf5d 100644 --- a/src/assets/locale/fa/settings.json +++ b/src/assets/locale/fa/settings.json @@ -327,6 +327,14 @@ "label": "تعداد اسناد بازیابی شده", "placeholder": "تعداد اسناد بازیابی شده را وارد کنید", "required": "لطفاً تعداد اسناد بازیابی شده را وارد کنید" + }, + "splittingSeparator": { + "label": "جداکننده", + "placeholder": "جداکننده را وارد کنید (مثلاً \\n\\n)", + "required": "لطفاً یک جداکننده وارد کنید" + }, + "splittingStrategy": { + "label": "تقسیم‌کننده متن" } }, "prompt": { diff --git a/src/assets/locale/fr/settings.json b/src/assets/locale/fr/settings.json index 93ba685..dbd7fad 100644 --- a/src/assets/locale/fr/settings.json +++ b/src/assets/locale/fr/settings.json @@ -331,6 +331,14 @@ "label": "Nombre de documents récupérés", "placeholder": "Entrez le nombre de documents récupérés", "required": "Veuillez saisir le nombre de documents récupérés" + }, + "splittingSeparator": { + "label": "Séparateur", + "placeholder": "Entrez le séparateur (par exemple, \\n\\n)", + "required": "Veuillez saisir un séparateur" + }, + "splittingStrategy": { + "label": "Diviseur de texte" } }, "prompt": { diff --git a/src/assets/locale/it/settings.json b/src/assets/locale/it/settings.json index d7d7007..3b61c5c 100644 --- a/src/assets/locale/it/settings.json +++ b/src/assets/locale/it/settings.json @@ -331,6 +331,14 @@ "label": "Numero di Documenti Recuperati", "placeholder": "Inserisci il Numero di Documenti Recuperati", "required": "Inserisci il numero di documenti recuperati" + }, + "splittingSeparator": { + "label": "Separatore", + "placeholder": "Inserisci il Separatore (es. \\n\\n)", + "required": "Inserisci un separatore" + }, + "splittingStrategy": { + "label": "Divisore di Testo" } }, "prompt": { diff --git a/src/assets/locale/ja-JP/settings.json b/src/assets/locale/ja-JP/settings.json index 62363a5..239ffa4 100644 --- a/src/assets/locale/ja-JP/settings.json +++ b/src/assets/locale/ja-JP/settings.json @@ -334,6 +334,14 @@ "label": "取得ドキュメント数", "placeholder": "取得ドキュメント数を入力", "required": "取得ドキュメント数を入力してください" + }, + "splittingSeparator": { + "label": "セパレーター", + "placeholder": "セパレーターを入力(例:\\n\\n)", + "required": "セパレーターを入力してください" + }, + "splittingStrategy": { + "label": "テキスト分割方式" } }, "prompt": { diff --git a/src/assets/locale/ko/settings.json b/src/assets/locale/ko/settings.json index f4f019a..9728e98 100644 --- a/src/assets/locale/ko/settings.json +++ b/src/assets/locale/ko/settings.json @@ -334,6 +334,14 @@ "label": "검색 문서 수", "placeholder": "검색 문서 수 입력", "required": "검색 문서 수를 입력해주세요" + }, + "splittingSeparator": { + "label": "구분자", + "placeholder": "구분자 입력 (예: \\n\\n)", + "required": "구분자를 입력해주세요" + }, + "splittingStrategy": { + "label": "텍스트 분할기" } }, "prompt": { diff --git a/src/assets/locale/ml/settings.json b/src/assets/locale/ml/settings.json index e18ef5e..fab6e6f 100644 --- a/src/assets/locale/ml/settings.json +++ b/src/assets/locale/ml/settings.json @@ -334,6 +334,14 @@ "label": "വീണ്ടെടുത്ത രേഖകളുടെ എണ്ണം", "placeholder": "വീണ്ടെടുത്ത രേഖകളുടെ എണ്ണം നൽകുക", "required": "ദയവായി വീണ്ടെടുത്ത രേഖകളുടെ എണ്ണം നൽകുക" + }, + "splittingSeparator": { + "label": "വിഭജന ചിഹ്നം", + "placeholder": "വിഭജന ചിഹ്നം നൽകുക (ഉദാ: \\n\\n)", + "required": "ദയവായി ഒരു വിഭജന ചിഹ്നം നൽകുക" + }, + "splittingStrategy": { + "label": "ടെക്സ്റ്റ് സ്പ്ലിറ്റർ" } }, "prompt": { diff --git a/src/assets/locale/no/settings.json b/src/assets/locale/no/settings.json index cd96712..40dece1 100644 --- a/src/assets/locale/no/settings.json +++ b/src/assets/locale/no/settings.json @@ -331,6 +331,14 @@ "label": "Antall hentede dokumenter", "placeholder": "Skriv inn antall hentede dokumenter", "required": "Vennligst skriv inn antall hentede dokumenter" + }, + "splittingSeparator": { + "label": "Separator", + "placeholder": "Skriv inn separator (f.eks. \\n\\n)", + "required": "Vennligst skriv inn en separator" + }, + "splittingStrategy": { + "label": "Tekstdeler" } }, "prompt": { diff --git a/src/assets/locale/pt-BR/settings.json b/src/assets/locale/pt-BR/settings.json index 6dbd407..e296201 100644 --- a/src/assets/locale/pt-BR/settings.json +++ b/src/assets/locale/pt-BR/settings.json @@ -331,6 +331,14 @@ "label": "Número de Documentos Recuperados", "placeholder": "Digite o Número de Documentos Recuperados", "required": "Por favor, insira o número de documentos recuperados" + }, + "splittingSeparator": { + "label": "Separador", + "placeholder": "Digite o Separador (ex: \\n\\n)", + "required": "Por favor, insira um separador" + }, + "splittingStrategy": { + "label": "Divisor de Texto" } }, "prompt": { diff --git a/src/assets/locale/ru/settings.json b/src/assets/locale/ru/settings.json index c71c037..1f985ae 100644 --- a/src/assets/locale/ru/settings.json +++ b/src/assets/locale/ru/settings.json @@ -333,6 +333,14 @@ "label": "Количество извлеченных документов", "placeholder": "Введите количество извлеченных документов", "required": "Пожалуйста, введите количество извлеченных документов" + }, + "splittingSeparator": { + "label": "Разделитель", + "placeholder": "Введите разделитель (например, \\n\\n)", + "required": "Пожалуйста, введите разделитель" + }, + "splittingStrategy": { + "label": "Разделитель текста" } }, "prompt": { diff --git a/src/assets/locale/sv/settings.json b/src/assets/locale/sv/settings.json index 712e9a3..aa049c6 100644 --- a/src/assets/locale/sv/settings.json +++ b/src/assets/locale/sv/settings.json @@ -331,6 +331,14 @@ "label": "Antal hämtade dokument", "placeholder": "Ange antal hämtade dokument", "required": "Vänligen ange antal hämtade dokument" + }, + "splittingSeparator": { + "label": "Separator", + "placeholder": "Ange separator (t.ex. \\n\\n)", + "required": "Vänligen ange en separator" + }, + "splittingStrategy": { + "label": "Textdelare" } }, "prompt": { diff --git a/src/assets/locale/uk/settings.json b/src/assets/locale/uk/settings.json index 7462317..34752c9 100644 --- a/src/assets/locale/uk/settings.json +++ b/src/assets/locale/uk/settings.json @@ -331,6 +331,14 @@ "label": "Кількість отриманих документів", "placeholder": "Ввести кількість отриманих документів", "required": "Будь ласка, введіть кількість документів" + }, + "splittingSeparator": { + "label": "Роздільник", + "placeholder": "Введіть роздільник (напр., \\n\\n)", + "required": "Будь ласка, введіть роздільник" + }, + "splittingStrategy": { + "label": "Розділювач тексту" } }, "prompt": { diff --git a/src/assets/locale/zh/settings.json b/src/assets/locale/zh/settings.json index 2557c44..1a66351 100644 --- a/src/assets/locale/zh/settings.json +++ b/src/assets/locale/zh/settings.json @@ -336,6 +336,14 @@ "label": "检索文档数量", "placeholder": "输入检索文档数量", "required": "请输入检索文档数量" + }, + "splittingSeparator": { + "label": "分隔符", + "placeholder": "输入分隔符(例如:\\n\\n)", + "required": "请输入分隔符" + }, + "splittingStrategy": { + "label": "文本分割器" } }, "prompt": { diff --git a/src/components/Option/Settings/rag.tsx b/src/components/Option/Settings/rag.tsx index 8cc6bbf..534b9be 100644 --- a/src/components/Option/Settings/rag.tsx +++ b/src/components/Option/Settings/rag.tsx @@ -1,10 +1,12 @@ import { useMutation, useQuery, useQueryClient } from "@tanstack/react-query" -import { Form, InputNumber, Select, Skeleton } from "antd" +import { Form, Input, InputNumber, Select, Skeleton } from "antd" import { SaveButton } from "~/components/Common/SaveButton" import { defaultEmbeddingChunkOverlap, defaultEmbeddingChunkSize, defaultEmbeddingModelForRag, + defaultSplittingStrategy, + defaultSsplttingSeparator, getEmbeddingModels, saveForRag } from "~/services/ollama" @@ -16,7 +18,8 @@ import { ProviderIcons } from "@/components/Common/ProviderIcon" export const RagSettings = () => { const { t } = useTranslation("settings") - + const [form] = Form.useForm() + const splittingStrategy = Form.useWatch("splittingStrategy", form) const queryClient = useQueryClient() const { data: ollamaInfo, status } = useQuery({ @@ -28,14 +31,18 @@ export const RagSettings = () => { chunkSize, defaultEM, totalFilePerKB, - noOfRetrievedDocs + noOfRetrievedDocs, + splittingStrategy, + splittingSeparator ] = await Promise.all([ getEmbeddingModels({ returnEmpty: true }), defaultEmbeddingChunkOverlap(), defaultEmbeddingChunkSize(), defaultEmbeddingModelForRag(), getTotalFilePerKB(), - getNoOfRetrievedDocs() + getNoOfRetrievedDocs(), + defaultSplittingStrategy(), + defaultSsplttingSeparator() ]) return { models: allModels, @@ -43,7 +50,9 @@ export const RagSettings = () => { chunkSize, defaultEM, totalFilePerKB, - noOfRetrievedDocs + noOfRetrievedDocs, + splittingStrategy, + splittingSeparator } } }) @@ -55,13 +64,17 @@ export const RagSettings = () => { overlap: number totalFilePerKB: number noOfRetrievedDocs: number + strategy: string + separator: string }) => { await saveForRag( data.model, data.chunkSize, data.overlap, data.totalFilePerKB, - data.noOfRetrievedDocs + data.noOfRetrievedDocs, + data.strategy, + data.separator ) return true }, @@ -85,6 +98,7 @@ export const RagSettings = () => {
{ saveRAG({ @@ -92,7 +106,9 @@ export const RagSettings = () => { chunkSize: data.chunkSize, overlap: data.chunkOverlap, totalFilePerKB: data.totalFilePerKB, - noOfRetrievedDocs: data.noOfRetrievedDocs + noOfRetrievedDocs: data.noOfRetrievedDocs, + separator: data.splittingSeparator, + strategy: data.splittingStrategy }) }} initialValues={{ @@ -100,7 +116,9 @@ export const RagSettings = () => { chunkOverlap: ollamaInfo?.chunkOverlap, defaultEM: ollamaInfo?.defaultEM, totalFilePerKB: ollamaInfo?.totalFilePerKB, - noOfRetrievedDocs: ollamaInfo?.noOfRetrievedDocs + noOfRetrievedDocs: ollamaInfo?.noOfRetrievedDocs, + splittingStrategy: ollamaInfo?.splittingStrategy, + splittingSeparator: ollamaInfo?.splittingSeparator }}> { /> + + + + )} + => { console.log(`Processing knowledge with id: ${id}`) @@ -32,12 +27,8 @@ export const processKnowledge = async (msg: any, id: string): Promise => { baseUrl: cleanUrl(ollamaUrl), model: knowledge.embedding_model }) - const chunkSize = await defaultEmbeddingChunkSize() - const chunkOverlap = await defaultEmbeddingChunkOverlap() - const textSplitter = new RecursiveCharacterTextSplitter({ - chunkSize, - chunkOverlap - }) + + const textSplitter = await getPageAssistTextSplitter() for (const doc of knowledge.source) { if (doc.type === "pdf" || doc.type === "application/pdf") { @@ -65,13 +56,15 @@ export const processKnowledge = async (msg: any, id: string): Promise => { knownledge_id: knowledge.id, file_id: doc.source_id }) - } else if (doc.type === "docx" || doc.type === "application/vnd.openxmlformats-officedocument.wordprocessingml.document") { + } else if ( + doc.type === "docx" || + doc.type === + "application/vnd.openxmlformats-officedocument.wordprocessingml.document" + ) { try { const loader = new PageAssistDocxLoader({ fileName: doc.filename, - buffer: await toArrayBufferFromBase64( - doc.content - ) + buffer: await toArrayBufferFromBase64(doc.content) }) let docs = await loader.load() diff --git a/src/services/ollama.ts b/src/services/ollama.ts index e19d704..f5e3f71 100644 --- a/src/services/ollama.ts +++ b/src/services/ollama.ts @@ -8,6 +8,9 @@ import { ollamaFormatAllCustomModels } from "@/db/models" const storage = new Storage() +const storage2 = new Storage({ + area: "local" +}) const DEFAULT_OLLAMA_URL = "http://127.0.0.1:11434" const DEFAULT_ASK_FOR_MODEL_SELECTION_EVERY_TIME = true @@ -310,6 +313,22 @@ export const defaultEmbeddingChunkSize = async () => { return parseInt(embeddingChunkSize) } +export const defaultSplittingStrategy = async () => { + const splittingStrategy = await storage.get("defaultSplittingStrategy") + if (!splittingStrategy || splittingStrategy.length === 0) { + return "RecursiveCharacterTextSplitter" + } + return splittingStrategy +} + +export const defaultSsplttingSeparator = async () => { + const splittingSeparator = await storage.get("defaultSplittingSeparator") + if (!splittingSeparator || splittingSeparator.length === 0) { + return "\\n\\n" + } + return splittingSeparator +} + export const defaultEmbeddingChunkOverlap = async () => { const embeddingChunkOverlap = await storage.get( "defaultEmbeddingChunkOverlap" @@ -320,6 +339,14 @@ export const defaultEmbeddingChunkOverlap = async () => { return parseInt(embeddingChunkOverlap) } +export const setDefaultSplittingStrategy = async (strategy: string) => { + await storage.set("defaultSplittingStrategy", strategy) +} + +export const setDefaultSplittingSeparator = async (separator: string) => { + await storage.set("defaultSplittingSeparator", separator) +} + export const setDefaultEmbeddingModelForRag = async (model: string) => { await storage.set("defaultEmbeddingModel", model) } @@ -337,7 +364,9 @@ export const saveForRag = async ( chunkSize: number, overlap: number, totalFilePerKB: number, - noOfRetrievedDocs?: number + noOfRetrievedDocs?: number, + strategy?: string, + separator?: string ) => { await setDefaultEmbeddingModelForRag(model) await setDefaultEmbeddingChunkSize(chunkSize) @@ -346,6 +375,12 @@ export const saveForRag = async ( if (noOfRetrievedDocs) { await setNoOfRetrievedDocs(noOfRetrievedDocs) } + if (strategy) { + await setDefaultSplittingStrategy(strategy) + } + if (separator) { + await setDefaultSplittingSeparator(separator) + } } export const getWebSearchPrompt = async () => { diff --git a/src/utils/memory-embeddings.ts b/src/utils/memory-embeddings.ts index 9cb1b82..1128cf4 100644 --- a/src/utils/memory-embeddings.ts +++ b/src/utils/memory-embeddings.ts @@ -1,12 +1,8 @@ import { PageAssistHtmlLoader } from "~/loader/html" -import { RecursiveCharacterTextSplitter } from "langchain/text_splitter" -import { - defaultEmbeddingChunkOverlap, - defaultEmbeddingChunkSize -} from "@/services/ollama" import { PageAssistPDFLoader } from "@/loader/pdf" import { PAMemoryVectorStore } from "@/libs/PAMemoryVectorStore" +import { getPageAssistTextSplitter } from "./text-splitter" export const getLoader = ({ html, @@ -54,12 +50,7 @@ export const memoryEmbedding = async ({ setIsEmbedding(true) const loader = getLoader({ html, pdf, type, url }) const docs = await loader.load() - const chunkSize = await defaultEmbeddingChunkSize() - const chunkOverlap = await defaultEmbeddingChunkOverlap() - const textSplitter = new RecursiveCharacterTextSplitter({ - chunkSize, - chunkOverlap - }) + const textSplitter = await getPageAssistTextSplitter() const chunks = await textSplitter.splitDocuments(docs) diff --git a/src/utils/text-splitter.ts b/src/utils/text-splitter.ts new file mode 100644 index 0000000..67a0d7d --- /dev/null +++ b/src/utils/text-splitter.ts @@ -0,0 +1,37 @@ +import { + RecursiveCharacterTextSplitter, + CharacterTextSplitter +} from "langchain/text_splitter" + +import { + defaultEmbeddingChunkOverlap, + defaultEmbeddingChunkSize, + defaultSsplttingSeparator, + defaultSplittingStrategy +} from "@/services/ollama" + +export const getPageAssistTextSplitter = async () => { + const chunkSize = await defaultEmbeddingChunkSize() + const chunkOverlap = await defaultEmbeddingChunkOverlap() + const splittingStrategy = await defaultSplittingStrategy() + + switch (splittingStrategy) { + case "CharacterTextSplitter": + console.log("Using CharacterTextSplitter") + const splittingSeparator = await defaultSsplttingSeparator() + const processedSeparator = splittingSeparator + .replace(/\\n/g, "\n") + .replace(/\\t/g, "\t") + .replace(/\\r/g, "\r") + return new CharacterTextSplitter({ + chunkSize, + chunkOverlap, + separator: processedSeparator + }) + default: + return new RecursiveCharacterTextSplitter({ + chunkSize, + chunkOverlap + }) + } +} diff --git a/src/web/search-engines/brave-api.ts b/src/web/search-engines/brave-api.ts index 5e13312..b95c37c 100644 --- a/src/web/search-engines/brave-api.ts +++ b/src/web/search-engines/brave-api.ts @@ -2,15 +2,13 @@ import { cleanUrl } from "~/libs/clean-url" import { getIsSimpleInternetSearch, totalSearchResults, getBraveApiKey } from "@/services/search" import { pageAssistEmbeddingModel } from "@/models/embedding" import type { Document } from "@langchain/core/documents" -import { RecursiveCharacterTextSplitter } from "langchain/text_splitter" import { MemoryVectorStore } from "langchain/vectorstores/memory" import { PageAssistHtmlLoader } from "~/loader/html" import { - defaultEmbeddingChunkOverlap, - defaultEmbeddingChunkSize, defaultEmbeddingModelForRag, getOllamaURL } from "~/services/ollama" +import { getPageAssistTextSplitter } from "@/utils/text-splitter" interface BraveAPIResult { title: string @@ -70,12 +68,7 @@ export const braveAPISearch = async (query: string) => { baseUrl: cleanUrl(ollamaUrl) }) - const chunkSize = await defaultEmbeddingChunkSize() - const chunkOverlap = await defaultEmbeddingChunkOverlap() - const textSplitter = new RecursiveCharacterTextSplitter({ - chunkSize, - chunkOverlap - }) + const textSplitter = await getPageAssistTextSplitter() const chunks = await textSplitter.splitDocuments(docs) const store = new MemoryVectorStore(ollamaEmbedding) diff --git a/src/web/search-engines/brave.ts b/src/web/search-engines/brave.ts index b795d8b..71b7670 100644 --- a/src/web/search-engines/brave.ts +++ b/src/web/search-engines/brave.ts @@ -3,8 +3,6 @@ import { urlRewriteRuntime } from "@/libs/runtime" import { PageAssistHtmlLoader } from "@/loader/html" import { pageAssistEmbeddingModel } from "@/models/embedding" import { - defaultEmbeddingChunkOverlap, - defaultEmbeddingChunkSize, defaultEmbeddingModelForRag, getOllamaURL } from "@/services/ollama" @@ -12,10 +10,10 @@ import { getIsSimpleInternetSearch, totalSearchResults } from "@/services/search" +import { getPageAssistTextSplitter } from "@/utils/text-splitter" import type { Document } from "@langchain/core/documents" import * as cheerio from "cheerio" -import { RecursiveCharacterTextSplitter } from "langchain/text_splitter" import { MemoryVectorStore } from "langchain/vectorstores/memory" export const localBraveSearch = async (query: string) => { @@ -87,12 +85,8 @@ export const webBraveSearch = async (query: string) => { baseUrl: cleanUrl(ollamaUrl) }) - const chunkSize = await defaultEmbeddingChunkSize() - const chunkOverlap = await defaultEmbeddingChunkOverlap() - const textSplitter = new RecursiveCharacterTextSplitter({ - chunkSize, - chunkOverlap - }) + + const textSplitter = await getPageAssistTextSplitter(); const chunks = await textSplitter.splitDocuments(docs) diff --git a/src/web/search-engines/duckduckgo.ts b/src/web/search-engines/duckduckgo.ts index e368500..9552b9d 100644 --- a/src/web/search-engines/duckduckgo.ts +++ b/src/web/search-engines/duckduckgo.ts @@ -3,8 +3,6 @@ import { urlRewriteRuntime } from "@/libs/runtime" import { PageAssistHtmlLoader } from "@/loader/html" import { pageAssistEmbeddingModel } from "@/models/embedding" import { - defaultEmbeddingChunkOverlap, - defaultEmbeddingChunkSize, defaultEmbeddingModelForRag, getOllamaURL } from "@/services/ollama" @@ -12,9 +10,9 @@ import { getIsSimpleInternetSearch, totalSearchResults } from "@/services/search" +import { getPageAssistTextSplitter } from "@/utils/text-splitter" import type { Document } from "@langchain/core/documents" import * as cheerio from "cheerio" -import { RecursiveCharacterTextSplitter } from "langchain/text_splitter" import { MemoryVectorStore } from "langchain/vectorstores/memory" export const localDuckDuckGoSearch = async (query: string) => { @@ -90,12 +88,7 @@ export const webDuckDuckGoSearch = async (query: string) => { baseUrl: cleanUrl(ollamaUrl) }) - const chunkSize = await defaultEmbeddingChunkSize() - const chunkOverlap = await defaultEmbeddingChunkOverlap() - const textSplitter = new RecursiveCharacterTextSplitter({ - chunkSize, - chunkOverlap - }) + const textSplitter = await getPageAssistTextSplitter() const chunks = await textSplitter.splitDocuments(docs) diff --git a/src/web/search-engines/google.ts b/src/web/search-engines/google.ts index 8c0a92d..94dd3c4 100644 --- a/src/web/search-engines/google.ts +++ b/src/web/search-engines/google.ts @@ -4,15 +4,13 @@ import { getIsSimpleInternetSearch, totalSearchResults } from "@/services/search" +import { getPageAssistTextSplitter } from "@/utils/text-splitter" import type { Document } from "@langchain/core/documents" -import { RecursiveCharacterTextSplitter } from "langchain/text_splitter" import { MemoryVectorStore } from "langchain/vectorstores/memory" import { cleanUrl } from "~/libs/clean-url" import { urlRewriteRuntime } from "~/libs/runtime" import { PageAssistHtmlLoader } from "~/loader/html" import { - defaultEmbeddingChunkOverlap, - defaultEmbeddingChunkSize, defaultEmbeddingModelForRag, getOllamaURL } from "~/services/ollama" @@ -91,13 +89,9 @@ export const webGoogleSearch = async (query: string) => { baseUrl: cleanUrl(ollamaUrl) }) - const chunkSize = await defaultEmbeddingChunkSize() - const chunkOverlap = await defaultEmbeddingChunkOverlap() - const textSplitter = new RecursiveCharacterTextSplitter({ - chunkSize, - chunkOverlap - }) - + + const textSplitter = await getPageAssistTextSplitter() + const chunks = await textSplitter.splitDocuments(docs) const store = new MemoryVectorStore(ollamaEmbedding) diff --git a/src/web/search-engines/searxng.ts b/src/web/search-engines/searxng.ts index 0dc2e64..d3277bf 100644 --- a/src/web/search-engines/searxng.ts +++ b/src/web/search-engines/searxng.ts @@ -3,15 +3,13 @@ import { cleanUrl } from "~/libs/clean-url" import { getSearxngURL, isSearxngJSONMode, getIsSimpleInternetSearch, totalSearchResults } from "@/services/search" import { pageAssistEmbeddingModel } from "@/models/embedding" import type { Document } from "@langchain/core/documents" -import { RecursiveCharacterTextSplitter } from "langchain/text_splitter" import { MemoryVectorStore } from "langchain/vectorstores/memory" import { PageAssistHtmlLoader } from "~/loader/html" import { - defaultEmbeddingChunkOverlap, - defaultEmbeddingChunkSize, defaultEmbeddingModelForRag, getOllamaURL } from "~/services/ollama" +import { getPageAssistTextSplitter } from "@/utils/text-splitter" interface SearxNGJSONResult { title: string @@ -73,13 +71,9 @@ export const searxngSearch = async (query: string) => { baseUrl: cleanUrl(ollamaUrl) }) - const chunkSize = await defaultEmbeddingChunkSize() - const chunkOverlap = await defaultEmbeddingChunkOverlap() - const textSplitter = new RecursiveCharacterTextSplitter({ - chunkSize, - chunkOverlap - }) + const textSplitter = await getPageAssistTextSplitter(); + const chunks = await textSplitter.splitDocuments(docs) const store = new MemoryVectorStore(ollamaEmbedding) await store.addDocuments(chunks) diff --git a/src/web/search-engines/sogou.ts b/src/web/search-engines/sogou.ts index d1a6090..7bc0126 100644 --- a/src/web/search-engines/sogou.ts +++ b/src/web/search-engines/sogou.ts @@ -3,8 +3,6 @@ import { urlRewriteRuntime } from "@/libs/runtime" import { PageAssistHtmlLoader } from "@/loader/html" import { pageAssistEmbeddingModel } from "@/models/embedding" import { - defaultEmbeddingChunkOverlap, - defaultEmbeddingChunkSize, defaultEmbeddingModelForRag, getOllamaURL } from "@/services/ollama" @@ -12,9 +10,9 @@ import { getIsSimpleInternetSearch, totalSearchResults } from "@/services/search" +import { getPageAssistTextSplitter } from "@/utils/text-splitter" import type { Document } from "@langchain/core/documents" import * as cheerio from "cheerio" -import { RecursiveCharacterTextSplitter } from "langchain/text_splitter" import { MemoryVectorStore } from "langchain/vectorstores/memory" const getCorrectTargeUrl = async (url: string) => { if (!url) return "" @@ -104,12 +102,7 @@ export const webSogouSearch = async (query: string) => { baseUrl: cleanUrl(ollamaUrl) }) - const chunkSize = await defaultEmbeddingChunkSize() - const chunkOverlap = await defaultEmbeddingChunkOverlap() - const textSplitter = new RecursiveCharacterTextSplitter({ - chunkSize, - chunkOverlap - }) + const textSplitter = await getPageAssistTextSplitter() const chunks = await textSplitter.splitDocuments(docs) diff --git a/src/web/website/index.ts b/src/web/website/index.ts index 817fdb7..d817160 100644 --- a/src/web/website/index.ts +++ b/src/web/website/index.ts @@ -1,8 +1,9 @@ import { cleanUrl } from "@/libs/clean-url" import { PageAssistHtmlLoader } from "@/loader/html" import { pageAssistEmbeddingModel } from "@/models/embedding" -import { defaultEmbeddingChunkOverlap, defaultEmbeddingChunkSize, defaultEmbeddingModelForRag, getOllamaURL } from "@/services/ollama" -import { RecursiveCharacterTextSplitter } from "langchain/text_splitter" +import { defaultEmbeddingModelForRag, getOllamaURL } from "@/services/ollama" +import { getPageAssistTextSplitter } from "@/utils/text-splitter" + import { MemoryVectorStore } from "langchain/vectorstores/memory" export const processSingleWebsite = async (url: string, query: string) => { @@ -20,12 +21,8 @@ export const processSingleWebsite = async (url: string, query: string) => { baseUrl: cleanUrl(ollamaUrl) }) - const chunkSize = await defaultEmbeddingChunkSize() - const chunkOverlap = await defaultEmbeddingChunkOverlap() - const textSplitter = new RecursiveCharacterTextSplitter({ - chunkSize, - chunkOverlap - }) + + const textSplitter = await getPageAssistTextSplitter() const chunks = await textSplitter.splitDocuments(docs)