feat: Add text splitting configuration options
This commit is contained in:
		
							parent
							
								
									1d9d704c76
								
							
						
					
					
						commit
						0af69a3be8
					
				| @ -334,6 +334,14 @@ | |||||||
|         "label": "عدد المستندات المسترجعة", |         "label": "عدد المستندات المسترجعة", | ||||||
|         "placeholder": "أدخل عدد المستندات المسترجعة", |         "placeholder": "أدخل عدد المستندات المسترجعة", | ||||||
|         "required": "الرجاء إدخال عدد المستندات المسترجعة" |         "required": "الرجاء إدخال عدد المستندات المسترجعة" | ||||||
|  |       }, | ||||||
|  |       "splittingSeparator": { | ||||||
|  |         "label": "الفاصل", | ||||||
|  |         "placeholder": "أدخل الفاصل (مثال: \\n\\n)", | ||||||
|  |         "required": "الرجاء إدخال الفاصل" | ||||||
|  |       }, | ||||||
|  |       "splittingStrategy": { | ||||||
|  |         "label": "مقسم النص" | ||||||
|       } |       } | ||||||
|     }, |     }, | ||||||
|     "prompt": { |     "prompt": { | ||||||
| @ -355,4 +363,5 @@ | |||||||
|   }, |   }, | ||||||
|   "chromeAiSettings": { |   "chromeAiSettings": { | ||||||
|     "title": "إعدادات Chrome AI" |     "title": "إعدادات Chrome AI" | ||||||
|   }} |   } | ||||||
|  | } | ||||||
|  | |||||||
| @ -331,6 +331,14 @@ | |||||||
|         "label": "Antal Hentede Dokumenter", |         "label": "Antal Hentede Dokumenter", | ||||||
|         "placeholder": "Indtast Number of Retrieved Documents", |         "placeholder": "Indtast Number of Retrieved Documents", | ||||||
|         "required": "Venligst indtast the number of retrieved documents" |         "required": "Venligst indtast the number of retrieved documents" | ||||||
|  |       }, | ||||||
|  |       "splittingSeparator": { | ||||||
|  |         "label": "Separator", | ||||||
|  |         "placeholder": "Indtast Separator (f.eks. \\n\\n)", | ||||||
|  |         "required": "Indtast venligst en separator" | ||||||
|  |       }, | ||||||
|  |       "splittingStrategy": { | ||||||
|  |         "label": "Tekst Splitter" | ||||||
|       } |       } | ||||||
|     }, |     }, | ||||||
|     "prompt": { |     "prompt": { | ||||||
|  | |||||||
| @ -331,6 +331,14 @@ | |||||||
|         "label": "Anzahl der abgerufenen Dokumente", |         "label": "Anzahl der abgerufenen Dokumente", | ||||||
|         "placeholder": "Anzahl der abgerufenen Dokumente eingeben", |         "placeholder": "Anzahl der abgerufenen Dokumente eingeben", | ||||||
|         "required": "Bitte geben Sie die Anzahl der abgerufenen Dokumente ein" |         "required": "Bitte geben Sie die Anzahl der abgerufenen Dokumente ein" | ||||||
|  |       }, | ||||||
|  |       "splittingSeparator": { | ||||||
|  |         "label": "Separator", | ||||||
|  |         "placeholder": "Separator eingeben (z.B. \\n\\n)", | ||||||
|  |         "required": "Bitte geben Sie einen Separator ein" | ||||||
|  |       }, | ||||||
|  |       "splittingStrategy": { | ||||||
|  |         "label": "Text-Splitter" | ||||||
|       } |       } | ||||||
|     }, |     }, | ||||||
|     "prompt": { |     "prompt": { | ||||||
|  | |||||||
| @ -72,7 +72,7 @@ | |||||||
|         } |         } | ||||||
|       }, |       }, | ||||||
|       "braveApi": { |       "braveApi": { | ||||||
|         "label":  "Brave API Key", |         "label": "Brave API Key", | ||||||
|         "placeholder": "Enter your Brave API key" |         "placeholder": "Enter your Brave API key" | ||||||
|       }, |       }, | ||||||
|       "googleDomain": { |       "googleDomain": { | ||||||
| @ -337,6 +337,14 @@ | |||||||
|         "label": "Number of Retrieved Documents", |         "label": "Number of Retrieved Documents", | ||||||
|         "placeholder": "Enter Number of Retrieved Documents", |         "placeholder": "Enter Number of Retrieved Documents", | ||||||
|         "required": "Please enter the number of retrieved documents" |         "required": "Please enter the number of retrieved documents" | ||||||
|  |       }, | ||||||
|  |       "splittingSeparator": { | ||||||
|  |         "label": "Separator", | ||||||
|  |         "placeholder": "Enter Separator (e.g., \\n\\n)", | ||||||
|  |         "required": "Please enter a separator" | ||||||
|  |       }, | ||||||
|  |       "splittingStrategy": { | ||||||
|  |         "label": "Text Splitter" | ||||||
|       } |       } | ||||||
|     }, |     }, | ||||||
|     "prompt": { |     "prompt": { | ||||||
|  | |||||||
| @ -331,6 +331,14 @@ | |||||||
|         "label": "Número de Documentos Recuperados", |         "label": "Número de Documentos Recuperados", | ||||||
|         "placeholder": "Ingrese el Número de Documentos Recuperados", |         "placeholder": "Ingrese el Número de Documentos Recuperados", | ||||||
|         "required": "Por favor, ingrese el número de documentos recuperados" |         "required": "Por favor, ingrese el número de documentos recuperados" | ||||||
|  |       }, | ||||||
|  |       "splittingSeparator": { | ||||||
|  |         "label": "Separador", | ||||||
|  |         "placeholder": "Ingrese el separador (ej., \\n\\n)", | ||||||
|  |         "required": "Por favor, ingrese un separador" | ||||||
|  |       }, | ||||||
|  |       "splittingStrategy": { | ||||||
|  |         "label": "Divisor de Texto" | ||||||
|       } |       } | ||||||
|     }, |     }, | ||||||
|     "prompt": { |     "prompt": { | ||||||
|  | |||||||
| @ -327,6 +327,14 @@ | |||||||
|         "label": "تعداد اسناد بازیابی شده", |         "label": "تعداد اسناد بازیابی شده", | ||||||
|         "placeholder": "تعداد اسناد بازیابی شده را وارد کنید", |         "placeholder": "تعداد اسناد بازیابی شده را وارد کنید", | ||||||
|         "required": "لطفاً تعداد اسناد بازیابی شده را وارد کنید" |         "required": "لطفاً تعداد اسناد بازیابی شده را وارد کنید" | ||||||
|  |       }, | ||||||
|  |       "splittingSeparator": { | ||||||
|  |         "label": "جداکننده", | ||||||
|  |         "placeholder": "جداکننده را وارد کنید (مثلاً \\n\\n)", | ||||||
|  |         "required": "لطفاً یک جداکننده وارد کنید" | ||||||
|  |       }, | ||||||
|  |       "splittingStrategy": { | ||||||
|  |         "label": "تقسیم‌کننده متن" | ||||||
|       } |       } | ||||||
|     }, |     }, | ||||||
|     "prompt": { |     "prompt": { | ||||||
|  | |||||||
| @ -331,6 +331,14 @@ | |||||||
|         "label": "Nombre de documents récupérés", |         "label": "Nombre de documents récupérés", | ||||||
|         "placeholder": "Entrez le nombre de documents récupérés", |         "placeholder": "Entrez le nombre de documents récupérés", | ||||||
|         "required": "Veuillez saisir le nombre de documents récupérés" |         "required": "Veuillez saisir le nombre de documents récupérés" | ||||||
|  |       }, | ||||||
|  |       "splittingSeparator": { | ||||||
|  |         "label": "Séparateur", | ||||||
|  |         "placeholder": "Entrez le séparateur (par exemple, \\n\\n)", | ||||||
|  |         "required": "Veuillez saisir un séparateur" | ||||||
|  |       }, | ||||||
|  |       "splittingStrategy": { | ||||||
|  |         "label": "Diviseur de texte" | ||||||
|       } |       } | ||||||
|     }, |     }, | ||||||
|     "prompt": { |     "prompt": { | ||||||
|  | |||||||
| @ -331,6 +331,14 @@ | |||||||
|         "label": "Numero di Documenti Recuperati", |         "label": "Numero di Documenti Recuperati", | ||||||
|         "placeholder": "Inserisci il Numero di Documenti Recuperati", |         "placeholder": "Inserisci il Numero di Documenti Recuperati", | ||||||
|         "required": "Inserisci il numero di documenti recuperati" |         "required": "Inserisci il numero di documenti recuperati" | ||||||
|  |       }, | ||||||
|  |       "splittingSeparator": { | ||||||
|  |         "label": "Separatore", | ||||||
|  |         "placeholder": "Inserisci il Separatore (es. \\n\\n)", | ||||||
|  |         "required": "Inserisci un separatore" | ||||||
|  |       }, | ||||||
|  |       "splittingStrategy": { | ||||||
|  |         "label": "Divisore di Testo" | ||||||
|       } |       } | ||||||
|     }, |     }, | ||||||
|     "prompt": { |     "prompt": { | ||||||
|  | |||||||
| @ -334,6 +334,14 @@ | |||||||
|         "label": "取得ドキュメント数", |         "label": "取得ドキュメント数", | ||||||
|         "placeholder": "取得ドキュメント数を入力", |         "placeholder": "取得ドキュメント数を入力", | ||||||
|         "required": "取得ドキュメント数を入力してください" |         "required": "取得ドキュメント数を入力してください" | ||||||
|  |       }, | ||||||
|  |       "splittingSeparator": { | ||||||
|  |         "label": "セパレーター", | ||||||
|  |         "placeholder": "セパレーターを入力(例:\\n\\n)", | ||||||
|  |         "required": "セパレーターを入力してください" | ||||||
|  |       }, | ||||||
|  |       "splittingStrategy": { | ||||||
|  |         "label": "テキスト分割方式" | ||||||
|       } |       } | ||||||
|     }, |     }, | ||||||
|     "prompt": { |     "prompt": { | ||||||
|  | |||||||
| @ -334,6 +334,14 @@ | |||||||
|         "label": "검색 문서 수", |         "label": "검색 문서 수", | ||||||
|         "placeholder": "검색 문서 수 입력", |         "placeholder": "검색 문서 수 입력", | ||||||
|         "required": "검색 문서 수를 입력해주세요" |         "required": "검색 문서 수를 입력해주세요" | ||||||
|  |       }, | ||||||
|  |       "splittingSeparator": { | ||||||
|  |         "label": "구분자", | ||||||
|  |         "placeholder": "구분자 입력 (예: \\n\\n)", | ||||||
|  |         "required": "구분자를 입력해주세요" | ||||||
|  |       }, | ||||||
|  |       "splittingStrategy": { | ||||||
|  |         "label": "텍스트 분할기" | ||||||
|       } |       } | ||||||
|     }, |     }, | ||||||
|     "prompt": { |     "prompt": { | ||||||
|  | |||||||
| @ -334,6 +334,14 @@ | |||||||
|         "label": "വീണ്ടെടുത്ത രേഖകളുടെ എണ്ണം", |         "label": "വീണ്ടെടുത്ത രേഖകളുടെ എണ്ണം", | ||||||
|         "placeholder": "വീണ്ടെടുത്ത രേഖകളുടെ എണ്ണം നൽകുക", |         "placeholder": "വീണ്ടെടുത്ത രേഖകളുടെ എണ്ണം നൽകുക", | ||||||
|         "required": "ദയവായി വീണ്ടെടുത്ത രേഖകളുടെ എണ്ണം നൽകുക" |         "required": "ദയവായി വീണ്ടെടുത്ത രേഖകളുടെ എണ്ണം നൽകുക" | ||||||
|  |       }, | ||||||
|  |       "splittingSeparator": { | ||||||
|  |         "label": "വിഭജന ചിഹ്നം", | ||||||
|  |         "placeholder": "വിഭജന ചിഹ്നം നൽകുക (ഉദാ: \\n\\n)", | ||||||
|  |         "required": "ദയവായി ഒരു വിഭജന ചിഹ്നം നൽകുക" | ||||||
|  |       }, | ||||||
|  |       "splittingStrategy": { | ||||||
|  |         "label": "ടെക്സ്റ്റ് സ്പ്ലിറ്റർ" | ||||||
|       } |       } | ||||||
|     }, |     }, | ||||||
|     "prompt": { |     "prompt": { | ||||||
|  | |||||||
| @ -331,6 +331,14 @@ | |||||||
|         "label": "Antall hentede dokumenter", |         "label": "Antall hentede dokumenter", | ||||||
|         "placeholder": "Skriv inn antall hentede dokumenter", |         "placeholder": "Skriv inn antall hentede dokumenter", | ||||||
|         "required": "Vennligst skriv inn antall hentede dokumenter" |         "required": "Vennligst skriv inn antall hentede dokumenter" | ||||||
|  |       }, | ||||||
|  |       "splittingSeparator": { | ||||||
|  |         "label": "Separator", | ||||||
|  |         "placeholder": "Skriv inn separator (f.eks. \\n\\n)", | ||||||
|  |         "required": "Vennligst skriv inn en separator" | ||||||
|  |       }, | ||||||
|  |       "splittingStrategy": { | ||||||
|  |         "label": "Tekstdeler" | ||||||
|       } |       } | ||||||
|     }, |     }, | ||||||
|     "prompt": { |     "prompt": { | ||||||
|  | |||||||
| @ -331,6 +331,14 @@ | |||||||
|         "label": "Número de Documentos Recuperados", |         "label": "Número de Documentos Recuperados", | ||||||
|         "placeholder": "Digite o Número de Documentos Recuperados", |         "placeholder": "Digite o Número de Documentos Recuperados", | ||||||
|         "required": "Por favor, insira o número de documentos recuperados" |         "required": "Por favor, insira o número de documentos recuperados" | ||||||
|  |       }, | ||||||
|  |       "splittingSeparator": { | ||||||
|  |         "label": "Separador", | ||||||
|  |         "placeholder": "Digite o Separador (ex: \\n\\n)", | ||||||
|  |         "required": "Por favor, insira um separador" | ||||||
|  |       }, | ||||||
|  |       "splittingStrategy": { | ||||||
|  |         "label": "Divisor de Texto" | ||||||
|       } |       } | ||||||
|     }, |     }, | ||||||
|     "prompt": { |     "prompt": { | ||||||
|  | |||||||
| @ -333,6 +333,14 @@ | |||||||
|         "label": "Количество извлеченных документов", |         "label": "Количество извлеченных документов", | ||||||
|         "placeholder": "Введите количество извлеченных документов", |         "placeholder": "Введите количество извлеченных документов", | ||||||
|         "required": "Пожалуйста, введите количество извлеченных документов" |         "required": "Пожалуйста, введите количество извлеченных документов" | ||||||
|  |       }, | ||||||
|  |       "splittingSeparator": { | ||||||
|  |         "label": "Разделитель", | ||||||
|  |         "placeholder": "Введите разделитель (например, \\n\\n)", | ||||||
|  |         "required": "Пожалуйста, введите разделитель" | ||||||
|  |       }, | ||||||
|  |       "splittingStrategy": { | ||||||
|  |         "label": "Разделитель текста" | ||||||
|       } |       } | ||||||
|     }, |     }, | ||||||
|     "prompt": { |     "prompt": { | ||||||
|  | |||||||
| @ -331,6 +331,14 @@ | |||||||
|         "label": "Antal hämtade dokument", |         "label": "Antal hämtade dokument", | ||||||
|         "placeholder": "Ange antal hämtade dokument", |         "placeholder": "Ange antal hämtade dokument", | ||||||
|         "required": "Vänligen ange antal hämtade dokument" |         "required": "Vänligen ange antal hämtade dokument" | ||||||
|  |       }, | ||||||
|  |       "splittingSeparator": { | ||||||
|  |         "label": "Separator", | ||||||
|  |         "placeholder": "Ange separator (t.ex. \\n\\n)", | ||||||
|  |         "required": "Vänligen ange en separator" | ||||||
|  |       }, | ||||||
|  |       "splittingStrategy": { | ||||||
|  |         "label": "Textdelare" | ||||||
|       } |       } | ||||||
|     }, |     }, | ||||||
|     "prompt": { |     "prompt": { | ||||||
|  | |||||||
| @ -331,6 +331,14 @@ | |||||||
|         "label": "Кількість отриманих документів", |         "label": "Кількість отриманих документів", | ||||||
|         "placeholder": "Ввести кількість отриманих документів", |         "placeholder": "Ввести кількість отриманих документів", | ||||||
|         "required": "Будь ласка, введіть кількість документів" |         "required": "Будь ласка, введіть кількість документів" | ||||||
|  |       }, | ||||||
|  |       "splittingSeparator": { | ||||||
|  |         "label": "Роздільник", | ||||||
|  |         "placeholder": "Введіть роздільник (напр., \\n\\n)", | ||||||
|  |         "required": "Будь ласка, введіть роздільник" | ||||||
|  |       }, | ||||||
|  |       "splittingStrategy": { | ||||||
|  |         "label": "Розділювач тексту" | ||||||
|       } |       } | ||||||
|     }, |     }, | ||||||
|     "prompt": { |     "prompt": { | ||||||
|  | |||||||
| @ -336,6 +336,14 @@ | |||||||
|         "label": "检索文档数量", |         "label": "检索文档数量", | ||||||
|         "placeholder": "输入检索文档数量", |         "placeholder": "输入检索文档数量", | ||||||
|         "required": "请输入检索文档数量" |         "required": "请输入检索文档数量" | ||||||
|  |       }, | ||||||
|  |       "splittingSeparator": { | ||||||
|  |         "label": "分隔符", | ||||||
|  |         "placeholder": "输入分隔符(例如:\\n\\n)", | ||||||
|  |         "required": "请输入分隔符" | ||||||
|  |       }, | ||||||
|  |       "splittingStrategy": { | ||||||
|  |         "label": "文本分割器" | ||||||
|       } |       } | ||||||
|     }, |     }, | ||||||
|     "prompt": { |     "prompt": { | ||||||
|  | |||||||
| @ -1,10 +1,12 @@ | |||||||
| import { useMutation, useQuery, useQueryClient } from "@tanstack/react-query" | import { useMutation, useQuery, useQueryClient } from "@tanstack/react-query" | ||||||
| import { Form, InputNumber, Select, Skeleton } from "antd" | import { Form, Input, InputNumber, Select, Skeleton } from "antd" | ||||||
| import { SaveButton } from "~/components/Common/SaveButton" | import { SaveButton } from "~/components/Common/SaveButton" | ||||||
| import { | import { | ||||||
|   defaultEmbeddingChunkOverlap, |   defaultEmbeddingChunkOverlap, | ||||||
|   defaultEmbeddingChunkSize, |   defaultEmbeddingChunkSize, | ||||||
|   defaultEmbeddingModelForRag, |   defaultEmbeddingModelForRag, | ||||||
|  |   defaultSplittingStrategy, | ||||||
|  |   defaultSsplttingSeparator, | ||||||
|   getEmbeddingModels, |   getEmbeddingModels, | ||||||
|   saveForRag |   saveForRag | ||||||
| } from "~/services/ollama" | } from "~/services/ollama" | ||||||
| @ -16,7 +18,8 @@ import { ProviderIcons } from "@/components/Common/ProviderIcon" | |||||||
| 
 | 
 | ||||||
| export const RagSettings = () => { | export const RagSettings = () => { | ||||||
|   const { t } = useTranslation("settings") |   const { t } = useTranslation("settings") | ||||||
| 
 |   const [form] = Form.useForm() | ||||||
|  |   const splittingStrategy = Form.useWatch("splittingStrategy", form) | ||||||
|   const queryClient = useQueryClient() |   const queryClient = useQueryClient() | ||||||
| 
 | 
 | ||||||
|   const { data: ollamaInfo, status } = useQuery({ |   const { data: ollamaInfo, status } = useQuery({ | ||||||
| @ -28,14 +31,18 @@ export const RagSettings = () => { | |||||||
|         chunkSize, |         chunkSize, | ||||||
|         defaultEM, |         defaultEM, | ||||||
|         totalFilePerKB, |         totalFilePerKB, | ||||||
|         noOfRetrievedDocs |         noOfRetrievedDocs, | ||||||
|  |         splittingStrategy, | ||||||
|  |         splittingSeparator | ||||||
|       ] = await Promise.all([ |       ] = await Promise.all([ | ||||||
|         getEmbeddingModels({ returnEmpty: true }), |         getEmbeddingModels({ returnEmpty: true }), | ||||||
|         defaultEmbeddingChunkOverlap(), |         defaultEmbeddingChunkOverlap(), | ||||||
|         defaultEmbeddingChunkSize(), |         defaultEmbeddingChunkSize(), | ||||||
|         defaultEmbeddingModelForRag(), |         defaultEmbeddingModelForRag(), | ||||||
|         getTotalFilePerKB(), |         getTotalFilePerKB(), | ||||||
|         getNoOfRetrievedDocs() |         getNoOfRetrievedDocs(), | ||||||
|  |         defaultSplittingStrategy(), | ||||||
|  |         defaultSsplttingSeparator() | ||||||
|       ]) |       ]) | ||||||
|       return { |       return { | ||||||
|         models: allModels, |         models: allModels, | ||||||
| @ -43,7 +50,9 @@ export const RagSettings = () => { | |||||||
|         chunkSize, |         chunkSize, | ||||||
|         defaultEM, |         defaultEM, | ||||||
|         totalFilePerKB, |         totalFilePerKB, | ||||||
|         noOfRetrievedDocs |         noOfRetrievedDocs, | ||||||
|  |         splittingStrategy, | ||||||
|  |         splittingSeparator | ||||||
|       } |       } | ||||||
|     } |     } | ||||||
|   }) |   }) | ||||||
| @ -55,13 +64,17 @@ export const RagSettings = () => { | |||||||
|       overlap: number |       overlap: number | ||||||
|       totalFilePerKB: number |       totalFilePerKB: number | ||||||
|       noOfRetrievedDocs: number |       noOfRetrievedDocs: number | ||||||
|  |       strategy: string | ||||||
|  |       separator: string | ||||||
|     }) => { |     }) => { | ||||||
|       await saveForRag( |       await saveForRag( | ||||||
|         data.model, |         data.model, | ||||||
|         data.chunkSize, |         data.chunkSize, | ||||||
|         data.overlap, |         data.overlap, | ||||||
|         data.totalFilePerKB, |         data.totalFilePerKB, | ||||||
|         data.noOfRetrievedDocs |         data.noOfRetrievedDocs, | ||||||
|  |         data.strategy, | ||||||
|  |         data.separator | ||||||
|       ) |       ) | ||||||
|       return true |       return true | ||||||
|     }, |     }, | ||||||
| @ -85,6 +98,7 @@ export const RagSettings = () => { | |||||||
|               <div className="border border-b border-gray-200 dark:border-gray-600 mt-3 mb-6"></div> |               <div className="border border-b border-gray-200 dark:border-gray-600 mt-3 mb-6"></div> | ||||||
|             </div> |             </div> | ||||||
|             <Form |             <Form | ||||||
|  |               form={form} | ||||||
|               layout="vertical" |               layout="vertical" | ||||||
|               onFinish={(data) => { |               onFinish={(data) => { | ||||||
|                 saveRAG({ |                 saveRAG({ | ||||||
| @ -92,7 +106,9 @@ export const RagSettings = () => { | |||||||
|                   chunkSize: data.chunkSize, |                   chunkSize: data.chunkSize, | ||||||
|                   overlap: data.chunkOverlap, |                   overlap: data.chunkOverlap, | ||||||
|                   totalFilePerKB: data.totalFilePerKB, |                   totalFilePerKB: data.totalFilePerKB, | ||||||
|                   noOfRetrievedDocs: data.noOfRetrievedDocs |                   noOfRetrievedDocs: data.noOfRetrievedDocs, | ||||||
|  |                   separator: data.splittingSeparator, | ||||||
|  |                   strategy: data.splittingStrategy | ||||||
|                 }) |                 }) | ||||||
|               }} |               }} | ||||||
|               initialValues={{ |               initialValues={{ | ||||||
| @ -100,7 +116,9 @@ export const RagSettings = () => { | |||||||
|                 chunkOverlap: ollamaInfo?.chunkOverlap, |                 chunkOverlap: ollamaInfo?.chunkOverlap, | ||||||
|                 defaultEM: ollamaInfo?.defaultEM, |                 defaultEM: ollamaInfo?.defaultEM, | ||||||
|                 totalFilePerKB: ollamaInfo?.totalFilePerKB, |                 totalFilePerKB: ollamaInfo?.totalFilePerKB, | ||||||
|                 noOfRetrievedDocs: ollamaInfo?.noOfRetrievedDocs |                 noOfRetrievedDocs: ollamaInfo?.noOfRetrievedDocs, | ||||||
|  |                 splittingStrategy: ollamaInfo?.splittingStrategy, | ||||||
|  |                 splittingSeparator: ollamaInfo?.splittingSeparator | ||||||
|               }}> |               }}> | ||||||
|               <Form.Item |               <Form.Item | ||||||
|                 name="defaultEM" |                 name="defaultEM" | ||||||
| @ -140,6 +158,50 @@ export const RagSettings = () => { | |||||||
|                 /> |                 /> | ||||||
|               </Form.Item> |               </Form.Item> | ||||||
| 
 | 
 | ||||||
|  |               <Form.Item | ||||||
|  |                 name="splittingStrategy" | ||||||
|  |                 label={t("rag.ragSettings.splittingStrategy.label")} | ||||||
|  |                 rules={[ | ||||||
|  |                   { | ||||||
|  |                     required: true, | ||||||
|  |                     message: t("rag.ragSettings.model.required") | ||||||
|  |                   } | ||||||
|  |                 ]}> | ||||||
|  |                 <Select | ||||||
|  |                   size="large" | ||||||
|  |                   showSearch | ||||||
|  |                   style={{ width: "100%" }} | ||||||
|  |                   className="mt-4" | ||||||
|  |                   options={[ | ||||||
|  |                     "RecursiveCharacterTextSplitter", | ||||||
|  |                     "CharacterTextSplitter" | ||||||
|  |                   ].map((e) => ({ | ||||||
|  |                     label: e, | ||||||
|  |                     value: e | ||||||
|  |                   }))} | ||||||
|  |                 /> | ||||||
|  |               </Form.Item> | ||||||
|  | 
 | ||||||
|  |               {splittingStrategy !== "RecursiveCharacterTextSplitter" && ( | ||||||
|  |                 <Form.Item | ||||||
|  |                   name="splittingSeparator" | ||||||
|  |                   label={t("rag.ragSettings.splittingSeparator.label")} | ||||||
|  |                   rules={[ | ||||||
|  |                     { | ||||||
|  |                       required: true, | ||||||
|  |                       message: t("rag.ragSettings.splittingSeparator.required") | ||||||
|  |                     } | ||||||
|  |                   ]}> | ||||||
|  |                   <Input | ||||||
|  |                     size="large" | ||||||
|  |                     style={{ width: "100%" }} | ||||||
|  |                     placeholder={t( | ||||||
|  |                       "rag.ragSettings.splittingSeparator.placeholder" | ||||||
|  |                     )} | ||||||
|  |                   /> | ||||||
|  |                 </Form.Item> | ||||||
|  |               )} | ||||||
|  | 
 | ||||||
|               <Form.Item |               <Form.Item | ||||||
|                 name="chunkSize" |                 name="chunkSize" | ||||||
|                 label={t("rag.ragSettings.chunkSize.label")} |                 label={t("rag.ragSettings.chunkSize.label")} | ||||||
|  | |||||||
| @ -1,11 +1,6 @@ | |||||||
| import { getKnowledgeById, updateKnowledgeStatus } from "@/db/knowledge" | import { getKnowledgeById, updateKnowledgeStatus } from "@/db/knowledge" | ||||||
| import { PageAssistPDFUrlLoader } from "@/loader/pdf-url" | import { PageAssistPDFUrlLoader } from "@/loader/pdf-url" | ||||||
| import { | import { getOllamaURL } from "@/services/ollama" | ||||||
|   defaultEmbeddingChunkOverlap, |  | ||||||
|   defaultEmbeddingChunkSize, |  | ||||||
|   getOllamaURL |  | ||||||
| } from "@/services/ollama" |  | ||||||
| import { RecursiveCharacterTextSplitter } from "langchain/text_splitter" |  | ||||||
| import { PageAssistVectorStore } from "./PageAssistVectorStore" | import { PageAssistVectorStore } from "./PageAssistVectorStore" | ||||||
| import { PageAssisCSVUrlLoader } from "@/loader/csv" | import { PageAssisCSVUrlLoader } from "@/loader/csv" | ||||||
| import { PageAssisTXTUrlLoader } from "@/loader/txt" | import { PageAssisTXTUrlLoader } from "@/loader/txt" | ||||||
| @ -13,7 +8,7 @@ import { PageAssistDocxLoader } from "@/loader/docx" | |||||||
| import { cleanUrl } from "./clean-url" | import { cleanUrl } from "./clean-url" | ||||||
| import { sendEmbeddingCompleteNotification } from "./send-notification" | import { sendEmbeddingCompleteNotification } from "./send-notification" | ||||||
| import { pageAssistEmbeddingModel } from "@/models/embedding" | import { pageAssistEmbeddingModel } from "@/models/embedding" | ||||||
| 
 | import { getPageAssistTextSplitter } from "@/utils/text-splitter" | ||||||
| 
 | 
 | ||||||
| export const processKnowledge = async (msg: any, id: string): Promise<void> => { | export const processKnowledge = async (msg: any, id: string): Promise<void> => { | ||||||
|   console.log(`Processing knowledge with id: ${id}`) |   console.log(`Processing knowledge with id: ${id}`) | ||||||
| @ -32,12 +27,8 @@ export const processKnowledge = async (msg: any, id: string): Promise<void> => { | |||||||
|       baseUrl: cleanUrl(ollamaUrl), |       baseUrl: cleanUrl(ollamaUrl), | ||||||
|       model: knowledge.embedding_model |       model: knowledge.embedding_model | ||||||
|     }) |     }) | ||||||
|     const chunkSize = await defaultEmbeddingChunkSize() | 
 | ||||||
|     const chunkOverlap = await defaultEmbeddingChunkOverlap() |     const textSplitter = await getPageAssistTextSplitter() | ||||||
|     const textSplitter = new RecursiveCharacterTextSplitter({ |  | ||||||
|       chunkSize, |  | ||||||
|       chunkOverlap |  | ||||||
|     }) |  | ||||||
| 
 | 
 | ||||||
|     for (const doc of knowledge.source) { |     for (const doc of knowledge.source) { | ||||||
|       if (doc.type === "pdf" || doc.type === "application/pdf") { |       if (doc.type === "pdf" || doc.type === "application/pdf") { | ||||||
| @ -65,13 +56,15 @@ export const processKnowledge = async (msg: any, id: string): Promise<void> => { | |||||||
|           knownledge_id: knowledge.id, |           knownledge_id: knowledge.id, | ||||||
|           file_id: doc.source_id |           file_id: doc.source_id | ||||||
|         }) |         }) | ||||||
|       } else if (doc.type === "docx" || doc.type === "application/vnd.openxmlformats-officedocument.wordprocessingml.document") { |       } else if ( | ||||||
|  |         doc.type === "docx" || | ||||||
|  |         doc.type === | ||||||
|  |           "application/vnd.openxmlformats-officedocument.wordprocessingml.document" | ||||||
|  |       ) { | ||||||
|         try { |         try { | ||||||
|           const loader = new PageAssistDocxLoader({ |           const loader = new PageAssistDocxLoader({ | ||||||
|             fileName: doc.filename, |             fileName: doc.filename, | ||||||
|             buffer: await toArrayBufferFromBase64( |             buffer: await toArrayBufferFromBase64(doc.content) | ||||||
|               doc.content |  | ||||||
|             ) |  | ||||||
|           }) |           }) | ||||||
| 
 | 
 | ||||||
|           let docs = await loader.load() |           let docs = await loader.load() | ||||||
|  | |||||||
| @ -8,6 +8,9 @@ import { ollamaFormatAllCustomModels } from "@/db/models" | |||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| const storage = new Storage() | const storage = new Storage() | ||||||
|  | const storage2 = new Storage({ | ||||||
|  |   area: "local" | ||||||
|  | }) | ||||||
| 
 | 
 | ||||||
| const DEFAULT_OLLAMA_URL = "http://127.0.0.1:11434" | const DEFAULT_OLLAMA_URL = "http://127.0.0.1:11434" | ||||||
| const DEFAULT_ASK_FOR_MODEL_SELECTION_EVERY_TIME = true | const DEFAULT_ASK_FOR_MODEL_SELECTION_EVERY_TIME = true | ||||||
| @ -310,6 +313,22 @@ export const defaultEmbeddingChunkSize = async () => { | |||||||
|   return parseInt(embeddingChunkSize) |   return parseInt(embeddingChunkSize) | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | export const defaultSplittingStrategy = async () => { | ||||||
|  |   const splittingStrategy = await storage.get("defaultSplittingStrategy") | ||||||
|  |   if (!splittingStrategy || splittingStrategy.length === 0) { | ||||||
|  |     return "RecursiveCharacterTextSplitter" | ||||||
|  |   } | ||||||
|  |   return splittingStrategy | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | export const defaultSsplttingSeparator = async () => { | ||||||
|  |   const splittingSeparator = await storage.get("defaultSplittingSeparator") | ||||||
|  |   if (!splittingSeparator || splittingSeparator.length === 0) { | ||||||
|  |     return "\\n\\n" | ||||||
|  |   } | ||||||
|  |   return splittingSeparator | ||||||
|  | } | ||||||
|  | 
 | ||||||
| export const defaultEmbeddingChunkOverlap = async () => { | export const defaultEmbeddingChunkOverlap = async () => { | ||||||
|   const embeddingChunkOverlap = await storage.get( |   const embeddingChunkOverlap = await storage.get( | ||||||
|     "defaultEmbeddingChunkOverlap" |     "defaultEmbeddingChunkOverlap" | ||||||
| @ -320,6 +339,14 @@ export const defaultEmbeddingChunkOverlap = async () => { | |||||||
|   return parseInt(embeddingChunkOverlap) |   return parseInt(embeddingChunkOverlap) | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | export const setDefaultSplittingStrategy = async (strategy: string) => { | ||||||
|  |   await storage.set("defaultSplittingStrategy", strategy) | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | export const setDefaultSplittingSeparator = async (separator: string) => { | ||||||
|  |   await storage.set("defaultSplittingSeparator", separator) | ||||||
|  | } | ||||||
|  | 
 | ||||||
| export const setDefaultEmbeddingModelForRag = async (model: string) => { | export const setDefaultEmbeddingModelForRag = async (model: string) => { | ||||||
|   await storage.set("defaultEmbeddingModel", model) |   await storage.set("defaultEmbeddingModel", model) | ||||||
| } | } | ||||||
| @ -337,7 +364,9 @@ export const saveForRag = async ( | |||||||
|   chunkSize: number, |   chunkSize: number, | ||||||
|   overlap: number, |   overlap: number, | ||||||
|   totalFilePerKB: number, |   totalFilePerKB: number, | ||||||
|   noOfRetrievedDocs?: number |   noOfRetrievedDocs?: number, | ||||||
|  |   strategy?: string, | ||||||
|  |   separator?: string | ||||||
| ) => { | ) => { | ||||||
|   await setDefaultEmbeddingModelForRag(model) |   await setDefaultEmbeddingModelForRag(model) | ||||||
|   await setDefaultEmbeddingChunkSize(chunkSize) |   await setDefaultEmbeddingChunkSize(chunkSize) | ||||||
| @ -346,6 +375,12 @@ export const saveForRag = async ( | |||||||
|   if (noOfRetrievedDocs) { |   if (noOfRetrievedDocs) { | ||||||
|     await setNoOfRetrievedDocs(noOfRetrievedDocs) |     await setNoOfRetrievedDocs(noOfRetrievedDocs) | ||||||
|   } |   } | ||||||
|  |   if (strategy) { | ||||||
|  |     await setDefaultSplittingStrategy(strategy) | ||||||
|  |   } | ||||||
|  |   if (separator) { | ||||||
|  |     await setDefaultSplittingSeparator(separator) | ||||||
|  |   } | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| export const getWebSearchPrompt = async () => { | export const getWebSearchPrompt = async () => { | ||||||
|  | |||||||
| @ -1,12 +1,8 @@ | |||||||
| import { PageAssistHtmlLoader } from "~/loader/html" | import { PageAssistHtmlLoader } from "~/loader/html" | ||||||
| import { RecursiveCharacterTextSplitter } from "langchain/text_splitter" |  | ||||||
| 
 | 
 | ||||||
| import { |  | ||||||
|   defaultEmbeddingChunkOverlap, |  | ||||||
|   defaultEmbeddingChunkSize |  | ||||||
| } from "@/services/ollama" |  | ||||||
| import { PageAssistPDFLoader } from "@/loader/pdf" | import { PageAssistPDFLoader } from "@/loader/pdf" | ||||||
| import { PAMemoryVectorStore } from "@/libs/PAMemoryVectorStore" | import { PAMemoryVectorStore } from "@/libs/PAMemoryVectorStore" | ||||||
|  | import { getPageAssistTextSplitter } from "./text-splitter" | ||||||
| 
 | 
 | ||||||
| export const getLoader = ({ | export const getLoader = ({ | ||||||
|   html, |   html, | ||||||
| @ -54,12 +50,7 @@ export const memoryEmbedding = async ({ | |||||||
|   setIsEmbedding(true) |   setIsEmbedding(true) | ||||||
|   const loader = getLoader({ html, pdf, type, url }) |   const loader = getLoader({ html, pdf, type, url }) | ||||||
|   const docs = await loader.load() |   const docs = await loader.load() | ||||||
|   const chunkSize = await defaultEmbeddingChunkSize() |   const textSplitter = await getPageAssistTextSplitter() | ||||||
|   const chunkOverlap = await defaultEmbeddingChunkOverlap() |  | ||||||
|   const textSplitter = new RecursiveCharacterTextSplitter({ |  | ||||||
|     chunkSize, |  | ||||||
|     chunkOverlap |  | ||||||
|   }) |  | ||||||
| 
 | 
 | ||||||
|   const chunks = await textSplitter.splitDocuments(docs) |   const chunks = await textSplitter.splitDocuments(docs) | ||||||
| 
 | 
 | ||||||
|  | |||||||
							
								
								
									
										37
									
								
								src/utils/text-splitter.ts
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										37
									
								
								src/utils/text-splitter.ts
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,37 @@ | |||||||
|  | import { | ||||||
|  |   RecursiveCharacterTextSplitter, | ||||||
|  |   CharacterTextSplitter | ||||||
|  | } from "langchain/text_splitter" | ||||||
|  | 
 | ||||||
|  | import { | ||||||
|  |   defaultEmbeddingChunkOverlap, | ||||||
|  |   defaultEmbeddingChunkSize, | ||||||
|  |   defaultSsplttingSeparator, | ||||||
|  |   defaultSplittingStrategy | ||||||
|  | } from "@/services/ollama" | ||||||
|  | 
 | ||||||
|  | export const getPageAssistTextSplitter = async () => { | ||||||
|  |   const chunkSize = await defaultEmbeddingChunkSize() | ||||||
|  |   const chunkOverlap = await defaultEmbeddingChunkOverlap() | ||||||
|  |   const splittingStrategy = await defaultSplittingStrategy() | ||||||
|  | 
 | ||||||
|  |   switch (splittingStrategy) { | ||||||
|  |     case "CharacterTextSplitter": | ||||||
|  |       console.log("Using CharacterTextSplitter") | ||||||
|  |       const splittingSeparator = await defaultSsplttingSeparator() | ||||||
|  |       const processedSeparator = splittingSeparator | ||||||
|  |         .replace(/\\n/g, "\n") | ||||||
|  |         .replace(/\\t/g, "\t") | ||||||
|  |         .replace(/\\r/g, "\r") | ||||||
|  |       return new CharacterTextSplitter({ | ||||||
|  |         chunkSize, | ||||||
|  |         chunkOverlap, | ||||||
|  |         separator: processedSeparator | ||||||
|  |       }) | ||||||
|  |     default: | ||||||
|  |       return new RecursiveCharacterTextSplitter({ | ||||||
|  |         chunkSize, | ||||||
|  |         chunkOverlap | ||||||
|  |       }) | ||||||
|  |   } | ||||||
|  | } | ||||||
| @ -2,15 +2,13 @@ import { cleanUrl } from "~/libs/clean-url" | |||||||
| import { getIsSimpleInternetSearch, totalSearchResults, getBraveApiKey } from "@/services/search" | import { getIsSimpleInternetSearch, totalSearchResults, getBraveApiKey } from "@/services/search" | ||||||
| import { pageAssistEmbeddingModel } from "@/models/embedding" | import { pageAssistEmbeddingModel } from "@/models/embedding" | ||||||
| import type { Document } from "@langchain/core/documents" | import type { Document } from "@langchain/core/documents" | ||||||
| import { RecursiveCharacterTextSplitter } from "langchain/text_splitter" |  | ||||||
| import { MemoryVectorStore } from "langchain/vectorstores/memory" | import { MemoryVectorStore } from "langchain/vectorstores/memory" | ||||||
| import { PageAssistHtmlLoader } from "~/loader/html" | import { PageAssistHtmlLoader } from "~/loader/html" | ||||||
| import { | import { | ||||||
|     defaultEmbeddingChunkOverlap, |  | ||||||
|     defaultEmbeddingChunkSize, |  | ||||||
|     defaultEmbeddingModelForRag, |     defaultEmbeddingModelForRag, | ||||||
|     getOllamaURL |     getOllamaURL | ||||||
| } from "~/services/ollama" | } from "~/services/ollama" | ||||||
|  | import { getPageAssistTextSplitter } from "@/utils/text-splitter" | ||||||
| 
 | 
 | ||||||
| interface BraveAPIResult { | interface BraveAPIResult { | ||||||
|     title: string |     title: string | ||||||
| @ -70,12 +68,7 @@ export const braveAPISearch = async (query: string) => { | |||||||
|         baseUrl: cleanUrl(ollamaUrl) |         baseUrl: cleanUrl(ollamaUrl) | ||||||
|     }) |     }) | ||||||
| 
 | 
 | ||||||
|     const chunkSize = await defaultEmbeddingChunkSize() |     const textSplitter = await getPageAssistTextSplitter() | ||||||
|     const chunkOverlap = await defaultEmbeddingChunkOverlap() |  | ||||||
|     const textSplitter = new RecursiveCharacterTextSplitter({ |  | ||||||
|         chunkSize, |  | ||||||
|         chunkOverlap |  | ||||||
|     }) |  | ||||||
| 
 | 
 | ||||||
|     const chunks = await textSplitter.splitDocuments(docs) |     const chunks = await textSplitter.splitDocuments(docs) | ||||||
|     const store = new MemoryVectorStore(ollamaEmbedding) |     const store = new MemoryVectorStore(ollamaEmbedding) | ||||||
|  | |||||||
| @ -3,8 +3,6 @@ import { urlRewriteRuntime } from "@/libs/runtime" | |||||||
| import { PageAssistHtmlLoader } from "@/loader/html" | import { PageAssistHtmlLoader } from "@/loader/html" | ||||||
| import { pageAssistEmbeddingModel } from "@/models/embedding" | import { pageAssistEmbeddingModel } from "@/models/embedding" | ||||||
| import { | import { | ||||||
|     defaultEmbeddingChunkOverlap, |  | ||||||
|     defaultEmbeddingChunkSize, |  | ||||||
|     defaultEmbeddingModelForRag, |     defaultEmbeddingModelForRag, | ||||||
|     getOllamaURL |     getOllamaURL | ||||||
| } from "@/services/ollama" | } from "@/services/ollama" | ||||||
| @ -12,10 +10,10 @@ import { | |||||||
|     getIsSimpleInternetSearch, |     getIsSimpleInternetSearch, | ||||||
|     totalSearchResults |     totalSearchResults | ||||||
| } from "@/services/search" | } from "@/services/search" | ||||||
|  | import { getPageAssistTextSplitter } from "@/utils/text-splitter" | ||||||
| 
 | 
 | ||||||
| import type { Document } from "@langchain/core/documents" | import type { Document } from "@langchain/core/documents" | ||||||
| import * as cheerio from "cheerio" | import * as cheerio from "cheerio" | ||||||
| import { RecursiveCharacterTextSplitter } from "langchain/text_splitter" |  | ||||||
| import { MemoryVectorStore } from "langchain/vectorstores/memory" | import { MemoryVectorStore } from "langchain/vectorstores/memory" | ||||||
| 
 | 
 | ||||||
| export const localBraveSearch = async (query: string) => { | export const localBraveSearch = async (query: string) => { | ||||||
| @ -87,12 +85,8 @@ export const webBraveSearch = async (query: string) => { | |||||||
|         baseUrl: cleanUrl(ollamaUrl) |         baseUrl: cleanUrl(ollamaUrl) | ||||||
|     }) |     }) | ||||||
| 
 | 
 | ||||||
|     const chunkSize = await defaultEmbeddingChunkSize() |   | ||||||
|     const chunkOverlap = await defaultEmbeddingChunkOverlap() |     const textSplitter = await getPageAssistTextSplitter(); | ||||||
|     const textSplitter = new RecursiveCharacterTextSplitter({ |  | ||||||
|         chunkSize, |  | ||||||
|         chunkOverlap |  | ||||||
|     }) |  | ||||||
| 
 | 
 | ||||||
|     const chunks = await textSplitter.splitDocuments(docs) |     const chunks = await textSplitter.splitDocuments(docs) | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -3,8 +3,6 @@ import { urlRewriteRuntime } from "@/libs/runtime" | |||||||
| import { PageAssistHtmlLoader } from "@/loader/html" | import { PageAssistHtmlLoader } from "@/loader/html" | ||||||
| import { pageAssistEmbeddingModel } from "@/models/embedding" | import { pageAssistEmbeddingModel } from "@/models/embedding" | ||||||
| import { | import { | ||||||
|   defaultEmbeddingChunkOverlap, |  | ||||||
|   defaultEmbeddingChunkSize, |  | ||||||
|   defaultEmbeddingModelForRag, |   defaultEmbeddingModelForRag, | ||||||
|   getOllamaURL |   getOllamaURL | ||||||
| } from "@/services/ollama" | } from "@/services/ollama" | ||||||
| @ -12,9 +10,9 @@ import { | |||||||
|   getIsSimpleInternetSearch, |   getIsSimpleInternetSearch, | ||||||
|   totalSearchResults |   totalSearchResults | ||||||
| } from "@/services/search" | } from "@/services/search" | ||||||
|  | import { getPageAssistTextSplitter } from "@/utils/text-splitter" | ||||||
| import type { Document } from "@langchain/core/documents" | import type { Document } from "@langchain/core/documents" | ||||||
| import * as cheerio from "cheerio" | import * as cheerio from "cheerio" | ||||||
| import { RecursiveCharacterTextSplitter } from "langchain/text_splitter" |  | ||||||
| import { MemoryVectorStore } from "langchain/vectorstores/memory" | import { MemoryVectorStore } from "langchain/vectorstores/memory" | ||||||
| 
 | 
 | ||||||
| export const localDuckDuckGoSearch = async (query: string) => { | export const localDuckDuckGoSearch = async (query: string) => { | ||||||
| @ -90,12 +88,7 @@ export const webDuckDuckGoSearch = async (query: string) => { | |||||||
|     baseUrl: cleanUrl(ollamaUrl) |     baseUrl: cleanUrl(ollamaUrl) | ||||||
|   }) |   }) | ||||||
| 
 | 
 | ||||||
|   const chunkSize = await defaultEmbeddingChunkSize() |   const textSplitter = await getPageAssistTextSplitter() | ||||||
|   const chunkOverlap = await defaultEmbeddingChunkOverlap() |  | ||||||
|   const textSplitter = new RecursiveCharacterTextSplitter({ |  | ||||||
|     chunkSize, |  | ||||||
|     chunkOverlap |  | ||||||
|   }) |  | ||||||
| 
 | 
 | ||||||
|   const chunks = await textSplitter.splitDocuments(docs) |   const chunks = await textSplitter.splitDocuments(docs) | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -4,15 +4,13 @@ import { | |||||||
|   getIsSimpleInternetSearch, |   getIsSimpleInternetSearch, | ||||||
|   totalSearchResults |   totalSearchResults | ||||||
| } from "@/services/search" | } from "@/services/search" | ||||||
|  | import { getPageAssistTextSplitter } from "@/utils/text-splitter" | ||||||
| import type { Document } from "@langchain/core/documents" | import type { Document } from "@langchain/core/documents" | ||||||
| import { RecursiveCharacterTextSplitter } from "langchain/text_splitter" |  | ||||||
| import { MemoryVectorStore } from "langchain/vectorstores/memory" | import { MemoryVectorStore } from "langchain/vectorstores/memory" | ||||||
| import { cleanUrl } from "~/libs/clean-url" | import { cleanUrl } from "~/libs/clean-url" | ||||||
| import { urlRewriteRuntime } from "~/libs/runtime" | import { urlRewriteRuntime } from "~/libs/runtime" | ||||||
| import { PageAssistHtmlLoader } from "~/loader/html" | import { PageAssistHtmlLoader } from "~/loader/html" | ||||||
| import { | import { | ||||||
|   defaultEmbeddingChunkOverlap, |  | ||||||
|   defaultEmbeddingChunkSize, |  | ||||||
|   defaultEmbeddingModelForRag, |   defaultEmbeddingModelForRag, | ||||||
|   getOllamaURL |   getOllamaURL | ||||||
| } from "~/services/ollama" | } from "~/services/ollama" | ||||||
| @ -91,12 +89,8 @@ export const webGoogleSearch = async (query: string) => { | |||||||
|     baseUrl: cleanUrl(ollamaUrl) |     baseUrl: cleanUrl(ollamaUrl) | ||||||
|   }) |   }) | ||||||
| 
 | 
 | ||||||
|   const chunkSize = await defaultEmbeddingChunkSize() |    | ||||||
|   const chunkOverlap = await defaultEmbeddingChunkOverlap() |   const textSplitter = await getPageAssistTextSplitter() | ||||||
|   const textSplitter = new RecursiveCharacterTextSplitter({ |  | ||||||
|     chunkSize, |  | ||||||
|     chunkOverlap |  | ||||||
|   }) |  | ||||||
|    |    | ||||||
|   const chunks = await textSplitter.splitDocuments(docs) |   const chunks = await textSplitter.splitDocuments(docs) | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -3,15 +3,13 @@ import { cleanUrl } from "~/libs/clean-url" | |||||||
| import { getSearxngURL, isSearxngJSONMode, getIsSimpleInternetSearch, totalSearchResults } from "@/services/search" | import { getSearxngURL, isSearxngJSONMode, getIsSimpleInternetSearch, totalSearchResults } from "@/services/search" | ||||||
| import { pageAssistEmbeddingModel } from "@/models/embedding" | import { pageAssistEmbeddingModel } from "@/models/embedding" | ||||||
| import type { Document } from "@langchain/core/documents" | import type { Document } from "@langchain/core/documents" | ||||||
| import { RecursiveCharacterTextSplitter } from "langchain/text_splitter" |  | ||||||
| import { MemoryVectorStore } from "langchain/vectorstores/memory" | import { MemoryVectorStore } from "langchain/vectorstores/memory" | ||||||
| import { PageAssistHtmlLoader } from "~/loader/html" | import { PageAssistHtmlLoader } from "~/loader/html" | ||||||
| import { | import { | ||||||
|   defaultEmbeddingChunkOverlap, |  | ||||||
|   defaultEmbeddingChunkSize, |  | ||||||
|   defaultEmbeddingModelForRag, |   defaultEmbeddingModelForRag, | ||||||
|   getOllamaURL |   getOllamaURL | ||||||
| } from "~/services/ollama" | } from "~/services/ollama" | ||||||
|  | import { getPageAssistTextSplitter } from "@/utils/text-splitter" | ||||||
| 
 | 
 | ||||||
| interface SearxNGJSONResult { | interface SearxNGJSONResult { | ||||||
|   title: string |   title: string | ||||||
| @ -73,12 +71,8 @@ export const searxngSearch = async (query: string) => { | |||||||
|     baseUrl: cleanUrl(ollamaUrl) |     baseUrl: cleanUrl(ollamaUrl) | ||||||
|   }) |   }) | ||||||
| 
 | 
 | ||||||
|   const chunkSize = await defaultEmbeddingChunkSize() | 
 | ||||||
|   const chunkOverlap = await defaultEmbeddingChunkOverlap() |   const textSplitter = await getPageAssistTextSplitter(); | ||||||
|   const textSplitter = new RecursiveCharacterTextSplitter({ |  | ||||||
|     chunkSize, |  | ||||||
|     chunkOverlap |  | ||||||
|   }) |  | ||||||
|    |    | ||||||
|   const chunks = await textSplitter.splitDocuments(docs) |   const chunks = await textSplitter.splitDocuments(docs) | ||||||
|   const store = new MemoryVectorStore(ollamaEmbedding) |   const store = new MemoryVectorStore(ollamaEmbedding) | ||||||
|  | |||||||
| @ -3,8 +3,6 @@ import { urlRewriteRuntime } from "@/libs/runtime" | |||||||
| import { PageAssistHtmlLoader } from "@/loader/html" | import { PageAssistHtmlLoader } from "@/loader/html" | ||||||
| import { pageAssistEmbeddingModel } from "@/models/embedding" | import { pageAssistEmbeddingModel } from "@/models/embedding" | ||||||
| import { | import { | ||||||
|   defaultEmbeddingChunkOverlap, |  | ||||||
|   defaultEmbeddingChunkSize, |  | ||||||
|   defaultEmbeddingModelForRag, |   defaultEmbeddingModelForRag, | ||||||
|   getOllamaURL |   getOllamaURL | ||||||
| } from "@/services/ollama" | } from "@/services/ollama" | ||||||
| @ -12,9 +10,9 @@ import { | |||||||
|   getIsSimpleInternetSearch, |   getIsSimpleInternetSearch, | ||||||
|   totalSearchResults |   totalSearchResults | ||||||
| } from "@/services/search" | } from "@/services/search" | ||||||
|  | import { getPageAssistTextSplitter } from "@/utils/text-splitter" | ||||||
| import type { Document } from "@langchain/core/documents" | import type { Document } from "@langchain/core/documents" | ||||||
| import * as cheerio from "cheerio" | import * as cheerio from "cheerio" | ||||||
| import { RecursiveCharacterTextSplitter } from "langchain/text_splitter" |  | ||||||
| import { MemoryVectorStore } from "langchain/vectorstores/memory" | import { MemoryVectorStore } from "langchain/vectorstores/memory" | ||||||
| const getCorrectTargeUrl = async (url: string) => { | const getCorrectTargeUrl = async (url: string) => { | ||||||
|   if (!url) return "" |   if (!url) return "" | ||||||
| @ -104,12 +102,7 @@ export const webSogouSearch = async (query: string) => { | |||||||
|     baseUrl: cleanUrl(ollamaUrl) |     baseUrl: cleanUrl(ollamaUrl) | ||||||
|   }) |   }) | ||||||
| 
 | 
 | ||||||
|   const chunkSize = await defaultEmbeddingChunkSize() |   const textSplitter = await getPageAssistTextSplitter() | ||||||
|   const chunkOverlap = await defaultEmbeddingChunkOverlap() |  | ||||||
|   const textSplitter = new RecursiveCharacterTextSplitter({ |  | ||||||
|     chunkSize, |  | ||||||
|     chunkOverlap |  | ||||||
|   }) |  | ||||||
| 
 | 
 | ||||||
|   const chunks = await textSplitter.splitDocuments(docs) |   const chunks = await textSplitter.splitDocuments(docs) | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -1,8 +1,9 @@ | |||||||
| import { cleanUrl } from "@/libs/clean-url" | import { cleanUrl } from "@/libs/clean-url" | ||||||
| import { PageAssistHtmlLoader } from "@/loader/html" | import { PageAssistHtmlLoader } from "@/loader/html" | ||||||
| import { pageAssistEmbeddingModel } from "@/models/embedding" | import { pageAssistEmbeddingModel } from "@/models/embedding" | ||||||
| import { defaultEmbeddingChunkOverlap, defaultEmbeddingChunkSize, defaultEmbeddingModelForRag, getOllamaURL } from "@/services/ollama" | import { defaultEmbeddingModelForRag, getOllamaURL } from "@/services/ollama" | ||||||
| import { RecursiveCharacterTextSplitter } from "langchain/text_splitter" | import { getPageAssistTextSplitter } from "@/utils/text-splitter" | ||||||
|  | 
 | ||||||
| import { MemoryVectorStore } from "langchain/vectorstores/memory" | import { MemoryVectorStore } from "langchain/vectorstores/memory" | ||||||
| 
 | 
 | ||||||
| export const processSingleWebsite = async (url: string, query: string) => { | export const processSingleWebsite = async (url: string, query: string) => { | ||||||
| @ -20,12 +21,8 @@ export const processSingleWebsite = async (url: string, query: string) => { | |||||||
|         baseUrl: cleanUrl(ollamaUrl) |         baseUrl: cleanUrl(ollamaUrl) | ||||||
|     }) |     }) | ||||||
| 
 | 
 | ||||||
|     const chunkSize = await defaultEmbeddingChunkSize() | 
 | ||||||
|     const chunkOverlap = await defaultEmbeddingChunkOverlap() |     const textSplitter = await getPageAssistTextSplitter() | ||||||
|     const textSplitter = new RecursiveCharacterTextSplitter({ |  | ||||||
|         chunkSize, |  | ||||||
|         chunkOverlap |  | ||||||
|     }) |  | ||||||
| 
 | 
 | ||||||
|     const chunks = await textSplitter.splitDocuments(docs) |     const chunks = await textSplitter.splitDocuments(docs) | ||||||
| 
 | 
 | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user