From 61e2e62326423da968e61c45a4ba23fdc1827d04 Mon Sep 17 00:00:00 2001 From: Mile Shi Date: Fri, 2 May 2025 16:49:56 +0800 Subject: [PATCH] [fix] issue #105 tagging not works when enable ignore --- .../result/WordResultHandlerWordTags.java | 18 ++++++-- .../sensitive/word/bugs/b105/Bug105Test.java | 45 +++++++++++++++++++ 2 files changed, 60 insertions(+), 3 deletions(-) create mode 100644 src/test/java/com/github/houbb/sensitive/word/bugs/b105/Bug105Test.java diff --git a/src/main/java/com/github/houbb/sensitive/word/support/result/WordResultHandlerWordTags.java b/src/main/java/com/github/houbb/sensitive/word/support/result/WordResultHandlerWordTags.java index cab8a62..3893f55 100644 --- a/src/main/java/com/github/houbb/sensitive/word/support/result/WordResultHandlerWordTags.java +++ b/src/main/java/com/github/houbb/sensitive/word/support/result/WordResultHandlerWordTags.java @@ -19,13 +19,25 @@ public class WordResultHandlerWordTags extends AbstractWordResultHandler wordTags = InnerWordTagUtils.tags(word, wordContext); + + // If character ignoring is enabled, clean the word before looking up its tags + StringBuilder wordForTagLookup = new StringBuilder(); + char[] chars = word.toCharArray(); + for (int i = 0; i < word.length(); i++) { + // Keep the character if it is not ignored + // TODO: innerContext is passed as null here; is that reasonable? 
+ if (!wordContext.charIgnore().ignore(i, chars, null)) { + wordForTagLookup.append(chars[i]); + } + } + // Get tags (look them up using the cleaned word) + Set wordTags = InnerWordTagUtils.tags(wordForTagLookup.toString(), wordContext); dto.setTags(wordTags); + return dto; } diff --git a/src/test/java/com/github/houbb/sensitive/word/bugs/b105/Bug105Test.java b/src/test/java/com/github/houbb/sensitive/word/bugs/b105/Bug105Test.java new file mode 100644 index 0000000..913d2ec --- /dev/null +++ b/src/test/java/com/github/houbb/sensitive/word/bugs/b105/Bug105Test.java @@ -0,0 +1,45 @@ +package com.github.houbb.sensitive.word.bugs.b105; + +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; + +import org.junit.Assert; +import org.junit.Test; + +import com.github.houbb.sensitive.word.bs.SensitiveWordBs; +import com.github.houbb.sensitive.word.support.ignore.SensitiveWordCharIgnores; +import com.github.houbb.sensitive.word.support.result.WordResultHandlers; +import com.github.houbb.sensitive.word.support.result.WordTagsDto; +import com.github.houbb.sensitive.word.support.tag.WordTags; + +/** + * Test for Issue #105: tag lookup when the text contains noise characters + * Verifies that sensitive-word tags are still found correctly when character ignoring is enabled + */ +public class Bug105Test { + + @Test + public void testNoiseCharacterInTaggedWords() { + // Configure an instance with both character ignoring and tags enabled + SensitiveWordBs ignoreAndTagWordBs = SensitiveWordBs.newInstance() + .charIgnore(SensitiveWordCharIgnores.specialChars()) // Enable character ignoring + .wordTag(WordTags.map(Collections.singletonMap("毛主席", + new HashSet<>(Arrays.asList("政治", "领导人"))))) + .init(); + + // Text containing a sensitive word interleaved with noise characters + final String noisyText = "你好毛---主---席"; + + // Exercise the instance with both character ignoring and tags enabled (fails before the fix) + List fixedWord = ignoreAndTagWordBs.findAll(noisyText, WordResultHandlers.wordTags()); + Assert.assertEquals(1, fixedWord.size()); + Assert.assertEquals("毛---主---席", fixedWord.get(0).getWord()); + Assert.assertNotNull("标签不应为空", fixedWord.get(0).getTags()); + Assert.assertTrue("应包含'政治'标签", fixedWord.get(0).getTags().contains("政治")); 
Assert.assertTrue("应包含'领导人'标签", fixedWord.get(0).getTags().contains("领导人")); + + System.out.println("Fixed result: " + fixedWord); + } +}