diff --git a/pom.xml b/pom.xml
index ba6add4137..5f859b0fd1 100644
--- a/pom.xml
+++ b/pom.xml
@@ -23,7 +23,7 @@
- yudao-module-ai
+
diff --git a/yudao-module-ai/src/main/java/cn/iocoder/yudao/module/ai/enums/AiDocumentSplitStrategyEnum.java b/yudao-module-ai/src/main/java/cn/iocoder/yudao/module/ai/enums/AiDocumentSplitStrategyEnum.java
index 2c9f657579..f0a9cd21d6 100644
--- a/yudao-module-ai/src/main/java/cn/iocoder/yudao/module/ai/enums/AiDocumentSplitStrategyEnum.java
+++ b/yudao-module-ai/src/main/java/cn/iocoder/yudao/module/ai/enums/AiDocumentSplitStrategyEnum.java
@@ -50,16 +50,4 @@ public enum AiDocumentSplitStrategyEnum {
*/
private final String name;
- /**
- * 根据代码获取枚举
- */
- public static AiDocumentSplitStrategyEnum fromCode(String code) {
- for (AiDocumentSplitStrategyEnum strategy : values()) {
- if (strategy.getCode().equals(code)) {
- return strategy;
- }
- }
- return AUTO; // 默认返回自动识别
- }
-
}
diff --git a/yudao-module-ai/src/main/java/cn/iocoder/yudao/module/ai/service/knowledge/AiKnowledgeSegmentServiceImpl.java b/yudao-module-ai/src/main/java/cn/iocoder/yudao/module/ai/service/knowledge/AiKnowledgeSegmentServiceImpl.java
index 51a1ce94d5..9d64fcce9f 100644
--- a/yudao-module-ai/src/main/java/cn/iocoder/yudao/module/ai/service/knowledge/AiKnowledgeSegmentServiceImpl.java
+++ b/yudao-module-ai/src/main/java/cn/iocoder/yudao/module/ai/service/knowledge/AiKnowledgeSegmentServiceImpl.java
@@ -107,11 +107,8 @@ public class AiKnowledgeSegmentServiceImpl implements AiKnowledgeSegmentService
if (StrUtil.isEmpty(segment.getText())) {
return null;
}
- return new AiKnowledgeSegmentDO()
- .setKnowledgeId(documentDO.getKnowledgeId())
- .setDocumentId(documentId)
- .setContent(segment.getText())
- .setContentLength(segment.getText().length())
+ return new AiKnowledgeSegmentDO().setKnowledgeId(documentDO.getKnowledgeId()).setDocumentId(documentId)
+ .setContent(segment.getText()).setContentLength(segment.getText().length())
.setVectorId(AiKnowledgeSegmentDO.VECTOR_ID_EMPTY)
.setTokens(tokenCountEstimator.estimate(segment.getText()))
.setStatus(CommonStatusEnum.ENABLE.getStatus());
@@ -302,13 +299,12 @@ public class AiKnowledgeSegmentServiceImpl implements AiKnowledgeSegmentService
// 1. 读取 URL 内容
String content = knowledgeDocumentService.readUrl(url);
- // 2. 自动检测文档类型并选择策略
+ // 2.1 自动检测文档类型并选择策略
AiDocumentSplitStrategyEnum strategy = detectDocumentStrategy(content, url);
-
- // 3. 文档切片
+ // 2.2 文档切片
List documentSegments = splitContentByStrategy(content, segmentMaxTokens, strategy, url);
- // 4. 转换为段落对象
+ // 3. 转换为段落对象
return convertList(documentSegments, segment -> {
if (StrUtil.isEmpty(segment.getText())) {
return null;
@@ -352,6 +348,7 @@ public class AiKnowledgeSegmentServiceImpl implements AiKnowledgeSegmentService
* @param url 文档 URL(用于自动检测文件类型)
* @return 切片后的文档列表
*/
+ @SuppressWarnings("EnhancedSwitchMigration")
private List splitContentByStrategy(String content, Integer segmentMaxTokens,
AiDocumentSplitStrategyEnum strategy, String url) {
// 自动检测策略
@@ -359,7 +356,7 @@ public class AiKnowledgeSegmentServiceImpl implements AiKnowledgeSegmentService
strategy = detectDocumentStrategy(content, url);
log.info("[splitContentByStrategy][自动检测到文档策略: {}]", strategy.getName());
}
-
+ // 根据策略切分
TextSplitter textSplitter;
switch (strategy) {
case MARKDOWN_QA:
@@ -376,7 +373,7 @@ public class AiKnowledgeSegmentServiceImpl implements AiKnowledgeSegmentService
textSplitter = buildTokenTextSplitter(segmentMaxTokens);
break;
}
-
+ // 执行切分
return textSplitter.apply(Collections.singletonList(new Document(content)));
}
@@ -391,17 +388,14 @@ public class AiKnowledgeSegmentServiceImpl implements AiKnowledgeSegmentService
if (StrUtil.isEmpty(content)) {
return AiDocumentSplitStrategyEnum.TOKEN;
}
-
// 1. 检测 Markdown QA 格式
if (isMarkdownQaFormat(content, url)) {
return AiDocumentSplitStrategyEnum.MARKDOWN_QA;
}
-
// 2. 检测普通 Markdown 文档
if (isMarkdownDocument(url)) {
return AiDocumentSplitStrategyEnum.SEMANTIC;
}
-
// 3. 默认使用语义切分(比 Token 切分更智能)
return AiDocumentSplitStrategyEnum.SEMANTIC;
}
@@ -421,16 +415,14 @@ public class AiKnowledgeSegmentServiceImpl implements AiKnowledgeSegmentService
.filter(line -> line.trim().startsWith("## "))
.count();
- // 至少包含 2 个二级标题才认为是 QA 格式
+ // 要求一:至少包含 2 个二级标题才认为是 QA 格式
if (h2Count < 2) {
return false;
}
- // 检查标题占比(QA 文档标题行数相对较多)
+ // 要求二:检查标题占比(QA 文档标题行数相对较多),如果二级标题占比超过 10%,认为是 QA 格式
long totalLines = content.lines().count();
double h2Ratio = (double) h2Count / totalLines;
-
- // 如果二级标题占比超过 10%,认为是 QA 格式
return h2Ratio > 0.1;
}
@@ -438,7 +430,7 @@ public class AiKnowledgeSegmentServiceImpl implements AiKnowledgeSegmentService
* 检测是否为 Markdown 文档
*/
private boolean isMarkdownDocument(String url) {
- return StrUtil.isNotEmpty(url) && url.toLowerCase().endsWith(".md");
+ return StrUtil.endWithAnyIgnoreCase(url, ".md", ".markdown");
}
/**
diff --git a/yudao-module-ai/src/main/java/cn/iocoder/yudao/module/ai/service/knowledge/splitter/MarkdownQaSplitter.java b/yudao-module-ai/src/main/java/cn/iocoder/yudao/module/ai/service/knowledge/splitter/MarkdownQaSplitter.java
index 2957f4140e..1fbf4f2429 100644
--- a/yudao-module-ai/src/main/java/cn/iocoder/yudao/module/ai/service/knowledge/splitter/MarkdownQaSplitter.java
+++ b/yudao-module-ai/src/main/java/cn/iocoder/yudao/module/ai/service/knowledge/splitter/MarkdownQaSplitter.java
@@ -1,6 +1,8 @@
package cn.iocoder.yudao.module.ai.service.knowledge.splitter;
+import cn.hutool.core.collection.CollUtil;
import cn.hutool.core.util.StrUtil;
+import lombok.AllArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.ai.transformer.splitter.TextSplitter;
@@ -24,6 +26,7 @@ import java.util.regex.Pattern;
* @author runzhen
*/
@Slf4j
+@SuppressWarnings("SizeReplaceableByIsEmpty")
public class MarkdownQaSplitter extends TextSplitter {
/**
@@ -62,41 +65,38 @@ public class MarkdownQaSplitter extends TextSplitter {
return Collections.emptyList();
}
- List result = new ArrayList<>();
-
// 解析 QA 对
List qaPairs = parseQaPairs(text);
-
- if (qaPairs.isEmpty()) {
+ if (CollUtil.isEmpty(qaPairs)) {
// 如果没有识别到 QA 格式,按段落切分
return fallbackSplit(text);
}
// 处理每个 QA 对
+ List result = new ArrayList<>();
for (QaPair qaPair : qaPairs) {
result.addAll(splitQaPair(qaPair));
}
-
return result;
}
/**
* 解析 Markdown QA 对
+ *
+ * @param content 文本内容
+ * @return QA 对列表
*/
private List parseQaPairs(String content) {
+ // 找到所有二级标题位置
List qaPairs = new ArrayList<>();
- Matcher matcher = H2_PATTERN.matcher(content);
-
List headingPositions = new ArrayList<>();
List questions = new ArrayList<>();
-
- // 找到所有二级标题位置
+ Matcher matcher = H2_PATTERN.matcher(content);
while (matcher.find()) {
headingPositions.add(matcher.start());
questions.add(matcher.group(1).trim());
}
-
- if (headingPositions.isEmpty()) {
+ if (CollUtil.isEmpty(headingPositions)) {
return qaPairs;
}
@@ -106,55 +106,51 @@ public class MarkdownQaSplitter extends TextSplitter {
int end = (i + 1 < headingPositions.size())
? headingPositions.get(i + 1)
: content.length();
-
String qaText = content.substring(start, end).trim();
String question = questions.get(i);
-
// 提取答案部分(去掉问题标题)
String answer = qaText.substring(qaText.indexOf('\n') + 1).trim();
-
qaPairs.add(new QaPair(question, answer, qaText));
}
-
return qaPairs;
}
/**
* 切分单个 QA 对
+ *
+ * @param qaPair QA 对
+ * @return 切分后的文本片段列表
*/
private List splitQaPair(QaPair qaPair) {
+ // 如果整个 QA 对不超过限制,保持完整
List chunks = new ArrayList<>();
-
String fullQa = qaPair.fullText;
int qaTokens = tokenEstimator.estimate(fullQa);
-
- // 如果整个 QA 对不超过限制,保持完整
if (qaTokens <= chunkSize) {
chunks.add(fullQa);
return chunks;
}
// 长答案需要切分
- log.debug("QA 对超过 Token 限制 ({} > {}),开始智能切分: {}",
- qaTokens, chunkSize, qaPair.question);
-
+ log.debug("QA 对超过 Token 限制 ({} > {}),开始智能切分: {}", qaTokens, chunkSize, qaPair.question);
List answerChunks = splitLongAnswer(qaPair.answer, qaPair.question);
-
for (String answerChunk : answerChunks) {
// 每个片段都包含完整问题
String chunkText = "## " + qaPair.question + "\n" + answerChunk;
chunks.add(chunkText);
}
-
return chunks;
}
/**
* 切分长答案
+ *
+ * @param answer 答案文本
+ * @param question 问题文本
+ * @return 切分后的答案片段列表
*/
private List splitLongAnswer(String answer, String question) {
List chunks = new ArrayList<>();
-
// 预留问题的 Token 空间
String questionHeader = "## " + question + "\n";
int questionTokens = tokenEstimator.estimate(questionHeader);
@@ -162,17 +158,13 @@ public class MarkdownQaSplitter extends TextSplitter {
// 先按段落切分
String[] paragraphs = answer.split(PARAGRAPH_SEPARATOR);
-
StringBuilder currentChunk = new StringBuilder();
int currentTokens = 0;
-
for (String paragraph : paragraphs) {
if (StrUtil.isEmpty(paragraph)) {
continue;
}
-
int paragraphTokens = tokenEstimator.estimate(paragraph);
-
// 如果单个段落就超过限制,需要按句子切分
if (paragraphTokens > availableTokens) {
// 先保存当前块
@@ -181,19 +173,105 @@ public class MarkdownQaSplitter extends TextSplitter {
currentChunk = new StringBuilder();
currentTokens = 0;
}
-
// 按句子切分长段落
chunks.addAll(splitLongParagraph(paragraph, availableTokens));
continue;
}
-
// 如果加上这个段落会超过限制
if (currentTokens + paragraphTokens > availableTokens && currentChunk.length() > 0) {
chunks.add(currentChunk.toString().trim());
currentChunk = new StringBuilder();
currentTokens = 0;
}
+ if (currentChunk.length() > 0) {
+ currentChunk.append("\n\n");
+ }
+ // 添加段落
+ currentChunk.append(paragraph);
+ currentTokens += paragraphTokens;
+ }
+ // 添加最后一块
+ if (currentChunk.length() > 0) {
+ chunks.add(currentChunk.toString().trim());
+ }
+ return CollUtil.isEmpty(chunks) ? Collections.singletonList(answer) : chunks;
+ }
+
+ /**
+ * 切分长段落(按句子)
+ *
+ * @param paragraph 段落文本
+ * @param availableTokens 可用的 Token 数
+ * @return 切分后的文本片段列表
+ */
+ private List splitLongParagraph(String paragraph, int availableTokens) {
+ // 按句子切分
+ List chunks = new ArrayList<>();
+ String[] sentences = SENTENCE_PATTERN.split(paragraph);
+
+ // 按句子累积切分
+ StringBuilder currentChunk = new StringBuilder();
+ int currentTokens = 0;
+ for (String sentence : sentences) {
+ if (StrUtil.isEmpty(sentence)) {
+ continue;
+ }
+ int sentenceTokens = tokenEstimator.estimate(sentence);
+ // 如果单个句子就超过限制,强制切分
+ if (sentenceTokens > availableTokens) {
+ if (currentChunk.length() > 0) {
+ chunks.add(currentChunk.toString().trim());
+ currentChunk = new StringBuilder();
+ currentTokens = 0;
+ }
+ chunks.add(sentence.trim());
+ continue;
+ }
+ // 如果加上这个句子会超过限制
+ if (currentTokens + sentenceTokens > availableTokens && currentChunk.length() > 0) {
+ chunks.add(currentChunk.toString().trim());
+ currentChunk = new StringBuilder();
+ currentTokens = 0;
+ }
+ // 添加句子
+ currentChunk.append(sentence);
+ currentTokens += sentenceTokens;
+ }
+
+ // 添加最后一块
+ if (currentChunk.length() > 0) {
+ chunks.add(currentChunk.toString().trim());
+ }
+ return chunks.isEmpty() ? Collections.singletonList(paragraph) : chunks;
+ }
+
+ /**
+ * 降级切分策略(当未识别到 QA 格式时)
+ *
+ * @param content 文本内容
+ * @return 切分后的文本片段列表
+ */
+ private List fallbackSplit(String content) {
+ // 按段落切分
+ List chunks = new ArrayList<>();
+ String[] paragraphs = content.split(PARAGRAPH_SEPARATOR);
+
+ // 按段落累积切分
+ StringBuilder currentChunk = new StringBuilder();
+ int currentTokens = 0;
+ for (String paragraph : paragraphs) {
+ if (StrUtil.isEmpty(paragraph)) {
+ continue;
+ }
+ int paragraphTokens = tokenEstimator.estimate(paragraph);
+ // 如果加上这个段落会超过限制
+ if (currentTokens + paragraphTokens > chunkSize && currentChunk.length() > 0) {
+ chunks.add(currentChunk.toString().trim());
+ currentChunk = new StringBuilder();
+ currentTokens = 0;
+ }
+ // 添加段落
if (currentChunk.length() > 0) {
currentChunk.append("\n\n");
}
@@ -205,112 +283,28 @@ public class MarkdownQaSplitter extends TextSplitter {
if (currentChunk.length() > 0) {
chunks.add(currentChunk.toString().trim());
}
-
- return chunks.isEmpty() ? Collections.singletonList(answer) : chunks;
- }
-
- /**
- * 切分长段落(按句子)
- */
- private List splitLongParagraph(String paragraph, int availableTokens) {
- List chunks = new ArrayList<>();
- String[] sentences = SENTENCE_PATTERN.split(paragraph);
-
- StringBuilder currentChunk = new StringBuilder();
- int currentTokens = 0;
-
- for (String sentence : sentences) {
- if (StrUtil.isEmpty(sentence)) {
- continue;
- }
-
- int sentenceTokens = tokenEstimator.estimate(sentence);
-
- // 如果单个句子就超过限制,强制切分
- if (sentenceTokens > availableTokens) {
- if (currentChunk.length() > 0) {
- chunks.add(currentChunk.toString().trim());
- currentChunk = new StringBuilder();
- currentTokens = 0;
- }
- chunks.add(sentence.trim());
- continue;
- }
-
- if (currentTokens + sentenceTokens > availableTokens && currentChunk.length() > 0) {
- chunks.add(currentChunk.toString().trim());
- currentChunk = new StringBuilder();
- currentTokens = 0;
- }
-
- currentChunk.append(sentence);
- currentTokens += sentenceTokens;
- }
-
- if (currentChunk.length() > 0) {
- chunks.add(currentChunk.toString().trim());
- }
-
- return chunks.isEmpty() ? Collections.singletonList(paragraph) : chunks;
- }
-
- /**
- * 降级切分策略(当未识别到 QA 格式时)
- */
- private List fallbackSplit(String content) {
- List chunks = new ArrayList<>();
- String[] paragraphs = content.split(PARAGRAPH_SEPARATOR);
-
- StringBuilder currentChunk = new StringBuilder();
- int currentTokens = 0;
-
- for (String paragraph : paragraphs) {
- if (StrUtil.isEmpty(paragraph)) {
- continue;
- }
-
- int paragraphTokens = tokenEstimator.estimate(paragraph);
-
- if (currentTokens + paragraphTokens > chunkSize && currentChunk.length() > 0) {
- chunks.add(currentChunk.toString().trim());
- currentChunk = new StringBuilder();
- currentTokens = 0;
- }
-
- if (currentChunk.length() > 0) {
- currentChunk.append("\n\n");
- }
- currentChunk.append(paragraph);
- currentTokens += paragraphTokens;
- }
-
- if (currentChunk.length() > 0) {
- chunks.add(currentChunk.toString().trim());
- }
-
return chunks.isEmpty() ? Collections.singletonList(content) : chunks;
}
/**
* QA 对数据结构
*/
+ @AllArgsConstructor
private static class QaPair {
+
String question;
String answer;
String fullText;
- QaPair(String question, String answer, String fullText) {
- this.question = question;
- this.answer = answer;
- this.fullText = fullText;
- }
}
/**
* Token 估算器接口
*/
public interface TokenEstimator {
+
int estimate(String text);
+
}
/**
@@ -319,6 +313,7 @@ public class MarkdownQaSplitter extends TextSplitter {
* 英文:1 单词 ≈ 1.3 Token
*/
private static class SimpleTokenEstimator implements TokenEstimator {
+
@Override
public int estimate(String text) {
if (StrUtil.isEmpty(text)) {
@@ -327,14 +322,12 @@ public class MarkdownQaSplitter extends TextSplitter {
int chineseChars = 0;
int englishWords = 0;
-
// 简单统计中英文
for (char c : text.toCharArray()) {
if (c >= 0x4E00 && c <= 0x9FA5) {
chineseChars++;
}
}
-
// 英文单词估算
String[] words = text.split("\\s+");
for (String word : words) {
@@ -342,8 +335,8 @@ public class MarkdownQaSplitter extends TextSplitter {
englishWords++;
}
}
-
return chineseChars + (int) (englishWords * 1.3);
}
}
+
}
diff --git a/yudao-module-ai/src/main/java/cn/iocoder/yudao/module/ai/service/knowledge/splitter/SemanticTextSplitter.java b/yudao-module-ai/src/main/java/cn/iocoder/yudao/module/ai/service/knowledge/splitter/SemanticTextSplitter.java
index 64160a41a0..4c7112e9ad 100644
--- a/yudao-module-ai/src/main/java/cn/iocoder/yudao/module/ai/service/knowledge/splitter/SemanticTextSplitter.java
+++ b/yudao-module-ai/src/main/java/cn/iocoder/yudao/module/ai/service/knowledge/splitter/SemanticTextSplitter.java
@@ -8,6 +8,7 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
+import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
@@ -72,12 +73,14 @@ public class SemanticTextSplitter extends TextSplitter {
if (StrUtil.isEmpty(text)) {
return Collections.emptyList();
}
-
return splitTextRecursive(text);
}
/**
* 切分文本(递归策略)
+ *
+ * @param text 待切分文本
+ * @return 切分后的文本块列表
*/
private List splitTextRecursive(String text) {
List chunks = new ArrayList<>();
@@ -92,7 +95,6 @@ public class SemanticTextSplitter extends TextSplitter {
// 尝试按不同分隔符切分
List splits = null;
String usedSeparator = null;
-
for (String separator : PARAGRAPH_SEPARATORS) {
if (text.contains(separator)) {
splits = Arrays.asList(text.split(Pattern.quote(separator)));
@@ -109,18 +111,20 @@ public class SemanticTextSplitter extends TextSplitter {
// 合并小片段
chunks = mergeSplits(splits, usedSeparator);
-
return chunks;
}
/**
* 按句子切分
+ *
+ * @param text 待切分文本
+ * @return 句子列表
*/
private List splitBySentences(String text) {
+ // 使用正则表达式匹配句子结束位置
List sentences = new ArrayList<>();
int lastEnd = 0;
-
- java.util.regex.Matcher matcher = SENTENCE_END_PATTERN.matcher(text);
+ Matcher matcher = SENTENCE_END_PATTERN.matcher(text);
while (matcher.find()) {
String sentence = text.substring(lastEnd, matcher.end()).trim();
if (StrUtil.isNotEmpty(sentence)) {
@@ -136,12 +140,15 @@ public class SemanticTextSplitter extends TextSplitter {
sentences.add(remaining);
}
}
-
return sentences.isEmpty() ? Collections.singletonList(text) : sentences;
}
/**
* 合并切分后的小片段
+ *
+ * @param splits 切分后的片段列表
+ * @param separator 片段间的分隔符
+ * @return 合并后的文本块列表
*/
private List mergeSplits(List splits, String separator) {
List chunks = new ArrayList<>();
@@ -152,9 +159,7 @@ public class SemanticTextSplitter extends TextSplitter {
if (StrUtil.isEmpty(split)) {
continue;
}
-
int splitTokens = tokenEstimator.estimate(split);
-
// 如果单个片段就超过限制,进一步递归切分
if (splitTokens > chunkSize) {
// 先保存当前累积的块
@@ -164,7 +169,6 @@ public class SemanticTextSplitter extends TextSplitter {
currentChunks.clear();
currentLength = 0;
}
-
// 递归切分大片段
if (!separator.isEmpty()) {
// 如果是段落分隔符,尝试按句子切分
@@ -175,10 +179,8 @@ public class SemanticTextSplitter extends TextSplitter {
}
continue;
}
-
// 计算加上分隔符的 Token 数
int separatorTokens = StrUtil.isEmpty(separator) ? 0 : tokenEstimator.estimate(separator);
-
// 如果加上这个片段会超过限制
if (!currentChunks.isEmpty() && currentLength + splitTokens + separatorTokens > chunkSize) {
// 保存当前块
@@ -189,7 +191,7 @@ public class SemanticTextSplitter extends TextSplitter {
currentChunks = getOverlappingChunks(currentChunks, separator);
currentLength = estimateTokens(currentChunks, separator);
}
-
+ // 添加当前片段
currentChunks.add(split);
currentLength += splitTokens + separatorTokens;
}
@@ -199,39 +201,43 @@ public class SemanticTextSplitter extends TextSplitter {
String chunkText = String.join(separator, currentChunks);
chunks.add(chunkText.trim());
}
-
return chunks;
}
/**
* 获取重叠的片段(用于保持上下文)
+ *
+ * @param chunks 当前片段列表
+ * @param separator 片段间的分隔符
+ * @return 重叠的片段列表
*/
private List getOverlappingChunks(List chunks, String separator) {
if (chunkOverlap == 0 || chunks.isEmpty()) {
return new ArrayList<>();
}
+ // 从后往前取片段,直到达到重叠大小
List overlapping = new ArrayList<>();
int tokens = 0;
-
- // 从后往前取片段,直到达到重叠大小
for (int i = chunks.size() - 1; i >= 0; i--) {
String chunk = chunks.get(i);
int chunkTokens = tokenEstimator.estimate(chunk);
-
if (tokens + chunkTokens > chunkOverlap) {
break;
}
-
+ // 添加到重叠列表前端
overlapping.add(0, chunk);
tokens += chunkTokens + (StrUtil.isEmpty(separator) ? 0 : tokenEstimator.estimate(separator));
}
-
return overlapping;
}
/**
* 估算片段列表的总 Token 数
+ *
+ * @param chunks 片段列表
+ * @param separator 片段间的分隔符
+ * @return 总 Token 数
*/
private int estimateTokens(List chunks, String separator) {
int total = 0;
@@ -246,17 +252,18 @@ public class SemanticTextSplitter extends TextSplitter {
/**
* 强制切分长文本(当语义切分失败时)
+ *
+ * @param text 待切分文本
+ * @return 切分后的文本块列表
*/
private List forceSplitLongText(String text) {
List chunks = new ArrayList<>();
int charsPerChunk = (int) (chunkSize * 0.8); // 保守估计
-
for (int i = 0; i < text.length(); i += charsPerChunk) {
int end = Math.min(i + charsPerChunk, text.length());
String chunk = text.substring(i, end);
chunks.add(chunk.trim());
}
-
log.warn("文本过长,已强制按字符切分,可能影响语义完整性");
return chunks;
}
@@ -265,6 +272,7 @@ public class SemanticTextSplitter extends TextSplitter {
* 简单的 Token 估算器实现
*/
private static class SimpleTokenEstimator implements MarkdownQaSplitter.TokenEstimator {
+
@Override
public int estimate(String text) {
if (StrUtil.isEmpty(text)) {
@@ -273,21 +281,21 @@ public class SemanticTextSplitter extends TextSplitter {
int chineseChars = 0;
int englishWords = 0;
-
+ // 简单统计中英文
for (char c : text.toCharArray()) {
if (c >= 0x4E00 && c <= 0x9FA5) {
chineseChars++;
}
}
-
+ // 英文单词估算
String[] words = text.split("\\s+");
for (String word : words) {
if (word.matches(".*[a-zA-Z].*")) {
englishWords++;
}
}
-
return chineseChars + (int) (englishWords * 1.3);
}
}
+
}