diff --git a/yudao-module-ai/src/main/java/cn/iocoder/yudao/module/ai/enums/AiDocumentSplitStrategyEnum.java b/yudao-module-ai/src/main/java/cn/iocoder/yudao/module/ai/enums/AiDocumentSplitStrategyEnum.java deleted file mode 100644 index f0a9cd21d6..0000000000 --- a/yudao-module-ai/src/main/java/cn/iocoder/yudao/module/ai/enums/AiDocumentSplitStrategyEnum.java +++ /dev/null @@ -1,53 +0,0 @@ -package cn.iocoder.yudao.module.ai.enums; - -import lombok.AllArgsConstructor; -import lombok.Getter; - -/** - * Enum of the document split strategies of the AI knowledge base - * - * @author runzhen - */ -@AllArgsConstructor -@Getter -public enum AiDocumentSplitStrategyEnum { - - /** - * Detect the document type automatically and pick the best split strategy - */ - AUTO("auto", "自动识别"), - - /** - * Mechanical split by token count (the default strategy) - */ - TOKEN("token", "Token 切分"), - - /** - * Split by paragraph (paragraphs are separated by a double line break) - */ - PARAGRAPH("paragraph", "段落切分"), - - /** - * Splitter dedicated to the Markdown QA format - * Treats level-2 headings as questions and keeps each question/answer pair intact - * Long answers are split while the question is kept as context - */ - MARKDOWN_QA("markdown_qa", "Markdown QA 切分"), - - /** - * Semantic split that keeps sentences intact - * Cuts at paragraph and sentence boundaries to avoid truncation - */ - SEMANTIC("semantic", "语义切分"); - - /** - * Strategy code - */ - private final String code; - - /** - * Strategy name - */ - private final String name; - -} diff --git a/yudao-module-ai/src/main/java/cn/iocoder/yudao/module/ai/service/knowledge/splitter/MarkdownQaSplitter.java b/yudao-module-ai/src/main/java/cn/iocoder/yudao/module/ai/service/knowledge/splitter/MarkdownQaSplitter.java deleted file mode 100644 index 1fbf4f2429..0000000000 --- a/yudao-module-ai/src/main/java/cn/iocoder/yudao/module/ai/service/knowledge/splitter/MarkdownQaSplitter.java +++ /dev/null @@ -1,342 +0,0 @@ -package cn.iocoder.yudao.module.ai.service.knowledge.splitter; - -import cn.hutool.core.collection.CollUtil; -import cn.hutool.core.util.StrUtil; -import lombok.AllArgsConstructor; -import lombok.extern.slf4j.Slf4j; -import org.springframework.ai.transformer.splitter.TextSplitter; - -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import java.util.regex.Matcher; -import 
java.util.regex.Pattern; - -/** - * Markdown QA 格式专用切片器 - * - *

<p>功能特点:</p> - * <ul> - *   <li>识别二级标题作为问题,保持问答对完整性</li> - *   <li>长答案智能切分但保留问题作为上下文</li> - * </ul>

- * - * @author runzhen - */ -@Slf4j -@SuppressWarnings("SizeReplaceableByIsEmpty") -public class MarkdownQaSplitter extends TextSplitter { - - /** - * 二级标题正则:匹配 "## " 开头的行 - */ - private static final Pattern H2_PATTERN = Pattern.compile("^##\\s+(.+)$", Pattern.MULTILINE); - - /** - * 段落分隔符:双换行 - */ - private static final String PARAGRAPH_SEPARATOR = "\n\n"; - - /** - * 句子分隔符 - */ - private static final Pattern SENTENCE_PATTERN = Pattern.compile("[。!?.!?]\\s*"); - - /** - * 分段的最大 Token 数 - */ - private final int chunkSize; - - /** - * Token 估算器(简单实现:中文按字符数,英文按单词数的 1.3 倍) - */ - private final TokenEstimator tokenEstimator; - - public MarkdownQaSplitter(int chunkSize) { - this.chunkSize = chunkSize; - this.tokenEstimator = new SimpleTokenEstimator(); - } - - @Override - protected List splitText(String text) { - if (StrUtil.isEmpty(text)) { - return Collections.emptyList(); - } - - // 解析 QA 对 - List qaPairs = parseQaPairs(text); - if (CollUtil.isEmpty(qaPairs)) { - // 如果没有识别到 QA 格式,按段落切分 - return fallbackSplit(text); - } - - // 处理每个 QA 对 - List result = new ArrayList<>(); - for (QaPair qaPair : qaPairs) { - result.addAll(splitQaPair(qaPair)); - } - return result; - } - - /** - * 解析 Markdown QA 对 - * - * @param content 文本内容 - * @return QA 对列表 - */ - private List parseQaPairs(String content) { - // 找到所有二级标题位置 - List qaPairs = new ArrayList<>(); - List headingPositions = new ArrayList<>(); - List questions = new ArrayList<>(); - Matcher matcher = H2_PATTERN.matcher(content); - while (matcher.find()) { - headingPositions.add(matcher.start()); - questions.add(matcher.group(1).trim()); - } - if (CollUtil.isEmpty(headingPositions)) { - return qaPairs; - } - - // 提取每个 QA 对 - for (int i = 0; i < headingPositions.size(); i++) { - int start = headingPositions.get(i); - int end = (i + 1 < headingPositions.size()) - ? 
headingPositions.get(i + 1) - : content.length(); - String qaText = content.substring(start, end).trim(); - String question = questions.get(i); - // 提取答案部分(去掉问题标题) - String answer = qaText.substring(qaText.indexOf('\n') + 1).trim(); - qaPairs.add(new QaPair(question, answer, qaText)); - } - return qaPairs; - } - - /** - * 切分单个 QA 对 - * - * @param qaPair QA 对 - * @return 切分后的文本片段列表 - */ - private List splitQaPair(QaPair qaPair) { - // 如果整个 QA 对不超过限制,保持完整 - List chunks = new ArrayList<>(); - String fullQa = qaPair.fullText; - int qaTokens = tokenEstimator.estimate(fullQa); - if (qaTokens <= chunkSize) { - chunks.add(fullQa); - return chunks; - } - - // 长答案需要切分 - log.debug("QA 对超过 Token 限制 ({} > {}),开始智能切分: {}", qaTokens, chunkSize, qaPair.question); - List answerChunks = splitLongAnswer(qaPair.answer, qaPair.question); - for (String answerChunk : answerChunks) { - // 每个片段都包含完整问题 - String chunkText = "## " + qaPair.question + "\n" + answerChunk; - chunks.add(chunkText); - } - return chunks; - } - - /** - * 切分长答案 - * - * @param answer 答案文本 - * @param question 问题文本 - * @return 切分后的答案片段列表 - */ - private List splitLongAnswer(String answer, String question) { - List chunks = new ArrayList<>(); - // 预留问题的 Token 空间 - String questionHeader = "## " + question + "\n"; - int questionTokens = tokenEstimator.estimate(questionHeader); - int availableTokens = chunkSize - questionTokens - 10; // 预留 10 个 Token 的缓冲 - - // 先按段落切分 - String[] paragraphs = answer.split(PARAGRAPH_SEPARATOR); - StringBuilder currentChunk = new StringBuilder(); - int currentTokens = 0; - for (String paragraph : paragraphs) { - if (StrUtil.isEmpty(paragraph)) { - continue; - } - int paragraphTokens = tokenEstimator.estimate(paragraph); - // 如果单个段落就超过限制,需要按句子切分 - if (paragraphTokens > availableTokens) { - // 先保存当前块 - if (currentChunk.length() > 0) { - chunks.add(currentChunk.toString().trim()); - currentChunk = new StringBuilder(); - currentTokens = 0; - } - // 按句子切分长段落 - 
chunks.addAll(splitLongParagraph(paragraph, availableTokens)); - continue; - } - // 如果加上这个段落会超过限制 - if (currentTokens + paragraphTokens > availableTokens && currentChunk.length() > 0) { - chunks.add(currentChunk.toString().trim()); - currentChunk = new StringBuilder(); - currentTokens = 0; - } - if (currentChunk.length() > 0) { - currentChunk.append("\n\n"); - } - // 添加段落 - currentChunk.append(paragraph); - currentTokens += paragraphTokens; - } - - // 添加最后一块 - if (currentChunk.length() > 0) { - chunks.add(currentChunk.toString().trim()); - } - return CollUtil.isEmpty(chunks) ? Collections.singletonList(answer) : chunks; - } - - /** - * 切分长段落(按句子) - * - * @param paragraph 段落文本 - * @param availableTokens 可用的 Token 数 - * @return 切分后的文本片段列表 - */ - private List splitLongParagraph(String paragraph, int availableTokens) { - // 按句子切分 - List chunks = new ArrayList<>(); - String[] sentences = SENTENCE_PATTERN.split(paragraph); - - // 按句子累积切分 - StringBuilder currentChunk = new StringBuilder(); - int currentTokens = 0; - for (String sentence : sentences) { - if (StrUtil.isEmpty(sentence)) { - continue; - } - int sentenceTokens = tokenEstimator.estimate(sentence); - // 如果单个句子就超过限制,强制切分 - if (sentenceTokens > availableTokens) { - if (currentChunk.length() > 0) { - chunks.add(currentChunk.toString().trim()); - currentChunk = new StringBuilder(); - currentTokens = 0; - } - chunks.add(sentence.trim()); - continue; - } - // 如果加上这个句子会超过限制 - if (currentTokens + sentenceTokens > availableTokens && currentChunk.length() > 0) { - chunks.add(currentChunk.toString().trim()); - currentChunk = new StringBuilder(); - currentTokens = 0; - } - // 添加句子 - currentChunk.append(sentence); - currentTokens += sentenceTokens; - } - - // 添加最后一块 - if (currentChunk.length() > 0) { - chunks.add(currentChunk.toString().trim()); - } - return chunks.isEmpty() ? 
Collections.singletonList(paragraph) : chunks; - } - - /** - * 降级切分策略(当未识别到 QA 格式时) - * - * @param content 文本内容 - * @return 切分后的文本片段列表 - */ - private List fallbackSplit(String content) { - // 按段落切分 - List chunks = new ArrayList<>(); - String[] paragraphs = content.split(PARAGRAPH_SEPARATOR); - - // 按段落累积切分 - StringBuilder currentChunk = new StringBuilder(); - int currentTokens = 0; - for (String paragraph : paragraphs) { - if (StrUtil.isEmpty(paragraph)) { - continue; - } - int paragraphTokens = tokenEstimator.estimate(paragraph); - // 如果加上这个段落会超过限制 - if (currentTokens + paragraphTokens > chunkSize && currentChunk.length() > 0) { - chunks.add(currentChunk.toString().trim()); - currentChunk = new StringBuilder(); - currentTokens = 0; - } - // 添加段落 - if (currentChunk.length() > 0) { - currentChunk.append("\n\n"); - } - currentChunk.append(paragraph); - currentTokens += paragraphTokens; - } - - // 添加最后一块 - if (currentChunk.length() > 0) { - chunks.add(currentChunk.toString().trim()); - } - return chunks.isEmpty() ? 
Collections.singletonList(content) : chunks; - } - - /** - * QA 对数据结构 - */ - @AllArgsConstructor - private static class QaPair { - - String question; - String answer; - String fullText; - - } - - /** - * Token 估算器接口 - */ - public interface TokenEstimator { - - int estimate(String text); - - } - - /** - * 简单的 Token 估算器实现 - * 中文:1 字符 ≈ 1 Token - * 英文:1 单词 ≈ 1.3 Token - */ - private static class SimpleTokenEstimator implements TokenEstimator { - - @Override - public int estimate(String text) { - if (StrUtil.isEmpty(text)) { - return 0; - } - - int chineseChars = 0; - int englishWords = 0; - // 简单统计中英文 - for (char c : text.toCharArray()) { - if (c >= 0x4E00 && c <= 0x9FA5) { - chineseChars++; - } - } - // 英文单词估算 - String[] words = text.split("\\s+"); - for (String word : words) { - if (word.matches(".*[a-zA-Z].*")) { - englishWords++; - } - } - return chineseChars + (int) (englishWords * 1.3); - } - } - -} diff --git a/yudao-module-ai/src/main/java/cn/iocoder/yudao/module/ai/service/knowledge/splitter/SemanticTextSplitter.java b/yudao-module-ai/src/main/java/cn/iocoder/yudao/module/ai/service/knowledge/splitter/SemanticTextSplitter.java deleted file mode 100644 index 4c7112e9ad..0000000000 --- a/yudao-module-ai/src/main/java/cn/iocoder/yudao/module/ai/service/knowledge/splitter/SemanticTextSplitter.java +++ /dev/null @@ -1,301 +0,0 @@ -package cn.iocoder.yudao.module.ai.service.knowledge.splitter; - -import cn.hutool.core.util.StrUtil; -import lombok.extern.slf4j.Slf4j; -import org.springframework.ai.transformer.splitter.TextSplitter; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -/** - * 语义化文本切片器 - * - *

<p>功能特点:</p> - * <ul> - *   <li>优先在段落边界(双换行)处切分</li> - *   <li>其次在句子边界(句号、问号、感叹号)处切分</li> - *   <li>避免在句子中间截断,保持语义完整性</li> - *   <li>支持中英文标点符号识别</li> - * </ul>
- * - * @author runzhen - */ -@Slf4j -public class SemanticTextSplitter extends TextSplitter { - - /** - * 分段的最大 Token 数 - */ - private final int chunkSize; - - /** - * 段落重叠大小(用于保持上下文连贯性) - */ - private final int chunkOverlap; - - /** - * 段落分隔符(按优先级排序) - */ - private static final List PARAGRAPH_SEPARATORS = Arrays.asList( - "\n\n\n", // 三个换行 - "\n\n", // 双换行 - "\n" // 单换行 - ); - - /** - * 句子结束标记(中英文标点) - */ - private static final Pattern SENTENCE_END_PATTERN = Pattern.compile( - "[。!?.!?]+[\\s\"'))】\\]]*" - ); - - /** - * Token 估算器 - */ - private final MarkdownQaSplitter.TokenEstimator tokenEstimator; - - public SemanticTextSplitter(int chunkSize, int chunkOverlap) { - this.chunkSize = chunkSize; - this.chunkOverlap = Math.min(chunkOverlap, chunkSize / 2); // 重叠不超过一半 - this.tokenEstimator = new SimpleTokenEstimator(); - } - - public SemanticTextSplitter(int chunkSize) { - this(chunkSize, 50); // 默认重叠 50 个 Token - } - - @Override - protected List splitText(String text) { - if (StrUtil.isEmpty(text)) { - return Collections.emptyList(); - } - return splitTextRecursive(text); - } - - /** - * 切分文本(递归策略) - * - * @param text 待切分文本 - * @return 切分后的文本块列表 - */ - private List splitTextRecursive(String text) { - List chunks = new ArrayList<>(); - - // 如果文本不超过限制,直接返回 - int textTokens = tokenEstimator.estimate(text); - if (textTokens <= chunkSize) { - chunks.add(text.trim()); - return chunks; - } - - // 尝试按不同分隔符切分 - List splits = null; - String usedSeparator = null; - for (String separator : PARAGRAPH_SEPARATORS) { - if (text.contains(separator)) { - splits = Arrays.asList(text.split(Pattern.quote(separator))); - usedSeparator = separator; - break; - } - } - - // 如果没有找到段落分隔符,按句子切分 - if (splits == null || splits.size() == 1) { - splits = splitBySentences(text); - usedSeparator = ""; // 句子切分不需要分隔符 - } - - // 合并小片段 - chunks = mergeSplits(splits, usedSeparator); - return chunks; - } - - /** - * 按句子切分 - * - * @param text 待切分文本 - * @return 句子列表 - */ - private List 
splitBySentences(String text) { - // 使用正则表达式匹配句子结束位置 - List sentences = new ArrayList<>(); - int lastEnd = 0; - Matcher matcher = SENTENCE_END_PATTERN.matcher(text); - while (matcher.find()) { - String sentence = text.substring(lastEnd, matcher.end()).trim(); - if (StrUtil.isNotEmpty(sentence)) { - sentences.add(sentence); - } - lastEnd = matcher.end(); - } - - // 添加剩余部分 - if (lastEnd < text.length()) { - String remaining = text.substring(lastEnd).trim(); - if (StrUtil.isNotEmpty(remaining)) { - sentences.add(remaining); - } - } - return sentences.isEmpty() ? Collections.singletonList(text) : sentences; - } - - /** - * 合并切分后的小片段 - * - * @param splits 切分后的片段列表 - * @param separator 片段间的分隔符 - * @return 合并后的文本块列表 - */ - private List mergeSplits(List splits, String separator) { - List chunks = new ArrayList<>(); - List currentChunks = new ArrayList<>(); - int currentLength = 0; - - for (String split : splits) { - if (StrUtil.isEmpty(split)) { - continue; - } - int splitTokens = tokenEstimator.estimate(split); - // 如果单个片段就超过限制,进一步递归切分 - if (splitTokens > chunkSize) { - // 先保存当前累积的块 - if (!currentChunks.isEmpty()) { - String chunkText = String.join(separator, currentChunks); - chunks.add(chunkText.trim()); - currentChunks.clear(); - currentLength = 0; - } - // 递归切分大片段 - if (!separator.isEmpty()) { - // 如果是段落分隔符,尝试按句子切分 - chunks.addAll(splitTextRecursive(split)); - } else { - // 如果已经是句子级别,强制按字符切分 - chunks.addAll(forceSplitLongText(split)); - } - continue; - } - // 计算加上分隔符的 Token 数 - int separatorTokens = StrUtil.isEmpty(separator) ? 
0 : tokenEstimator.estimate(separator); - // 如果加上这个片段会超过限制 - if (!currentChunks.isEmpty() && currentLength + splitTokens + separatorTokens > chunkSize) { - // 保存当前块 - String chunkText = String.join(separator, currentChunks); - chunks.add(chunkText.trim()); - - // 处理重叠:保留最后几个片段 - currentChunks = getOverlappingChunks(currentChunks, separator); - currentLength = estimateTokens(currentChunks, separator); - } - // 添加当前片段 - currentChunks.add(split); - currentLength += splitTokens + separatorTokens; - } - - // 添加最后一块 - if (!currentChunks.isEmpty()) { - String chunkText = String.join(separator, currentChunks); - chunks.add(chunkText.trim()); - } - return chunks; - } - - /** - * 获取重叠的片段(用于保持上下文) - * - * @param chunks 当前片段列表 - * @param separator 片段间的分隔符 - * @return 重叠的片段列表 - */ - private List getOverlappingChunks(List chunks, String separator) { - if (chunkOverlap == 0 || chunks.isEmpty()) { - return new ArrayList<>(); - } - - // 从后往前取片段,直到达到重叠大小 - List overlapping = new ArrayList<>(); - int tokens = 0; - for (int i = chunks.size() - 1; i >= 0; i--) { - String chunk = chunks.get(i); - int chunkTokens = tokenEstimator.estimate(chunk); - if (tokens + chunkTokens > chunkOverlap) { - break; - } - // 添加到重叠列表前端 - overlapping.add(0, chunk); - tokens += chunkTokens + (StrUtil.isEmpty(separator) ? 
0 : tokenEstimator.estimate(separator)); - } - return overlapping; - } - - /** - * 估算片段列表的总 Token 数 - * - * @param chunks 片段列表 - * @param separator 片段间的分隔符 - * @return 总 Token 数 - */ - private int estimateTokens(List chunks, String separator) { - int total = 0; - for (int i = 0; i < chunks.size(); i++) { - total += tokenEstimator.estimate(chunks.get(i)); - if (i < chunks.size() - 1 && StrUtil.isNotEmpty(separator)) { - total += tokenEstimator.estimate(separator); - } - } - return total; - } - - /** - * 强制切分长文本(当语义切分失败时) - * - * @param text 待切分文本 - * @return 切分后的文本块列表 - */ - private List forceSplitLongText(String text) { - List chunks = new ArrayList<>(); - int charsPerChunk = (int) (chunkSize * 0.8); // 保守估计 - for (int i = 0; i < text.length(); i += charsPerChunk) { - int end = Math.min(i + charsPerChunk, text.length()); - String chunk = text.substring(i, end); - chunks.add(chunk.trim()); - } - log.warn("文本过长,已强制按字符切分,可能影响语义完整性"); - return chunks; - } - - /** - * 简单的 Token 估算器实现 - */ - private static class SimpleTokenEstimator implements MarkdownQaSplitter.TokenEstimator { - - @Override - public int estimate(String text) { - if (StrUtil.isEmpty(text)) { - return 0; - } - - int chineseChars = 0; - int englishWords = 0; - // 简单统计中英文 - for (char c : text.toCharArray()) { - if (c >= 0x4E00 && c <= 0x9FA5) { - chineseChars++; - } - } - // 英文单词估算 - String[] words = text.split("\\s+"); - for (String word : words) { - if (word.matches(".*[a-zA-Z].*")) { - englishWords++; - } - } - return chineseChars + (int) (englishWords * 1.3); - } - } - -}