diff --git a/CHANGE_LOG.md b/CHANGE_LOG.md index 7353a3b..7ffad8d 100644 --- a/CHANGE_LOG.md +++ b/CHANGE_LOG.md @@ -414,9 +414,8 @@ | 1 | A | wordCheck 策略支持用户自定义 | 2025-2-17 12:06:45 | https://github.com/houbb/sensitive-word/issues/101 | | 2 | A | wordCheckUrlNoPrefix | 2025-2-17 12:06:45 | https://github.com/houbb/sensitive-word/issues/101 | -# release_0.25.0 +# release_0.25.1 -| 序号 | 变更类型 | 说明 | 时间 | 备注 | -|:---|:-----|----------------------|:-------------------|:-----| -| 1 | A | wordCheck 策略支持用户自定义 | 2025-2-17 12:06:45 | https://github.com/houbb/sensitive-word/issues/101 | -| 2 | A | wordCheckUrlNoPrefix | 2025-2-17 12:06:45 | https://github.com/houbb/sensitive-word/issues/101 | +| 序号 | 变更类型 | 说明 | 时间 | 备注 | +|:---|:-----|------------------------------|:------------------|:---------------------------------------------------| +| 1 | A | 修正 tags 匹配问题,黑名单命中时返回对应的黑名单词 | 2025-5-2 20:25:04 | https://github.com/houbb/sensitive-word/issues/105 | diff --git a/README.md b/README.md index 27cbc28..e65a173 100644 --- a/README.md +++ b/README.md @@ -96,7 +96,7 @@ v0.24.0 开始内置支持对敏感词的分类细化,不过工作量比较大 com.github.houbb sensitive-word - 0.25.0 + 0.25.1 ``` @@ -1289,6 +1289,14 @@ FormatCombine/CheckCombine/AllowDenyCombine 组合策略,允许用户自定义 [v0.21.0 敏感词新特性之白名单支持单个编辑,修正白名单包含黑名单时的问题](https://houbb.github.io/2020/01/07/sensitive-word-12-v0.21.0-allow-word-edit) +[v0.23.0 敏感词结果条件拓展,内置支持链式+单词标签](https://houbb.github.io/2020/01/07/sensitive-word-13-v0.23.0-result-condition-enhance) + +[v0.24.0 新特性支持标签分类,内置实现多种策略](https://houbb.github.io/2020/01/07/sensitive-word-13-v0.24.0-word-tag-impl) + +[v0.25.0 新特性之 wordCheck 策略支持用户自定义](https://houbb.github.io/2020/01/07/sensitive-word-14-v0.25.0-url-define) + +[v0.25.1 新特性之返回匹配词,修正 tags 标签](https://houbb.github.io/2020/01/07/sensitive-word-14-v0.25.1-tags-match) + ![wechat](https://img-blog.csdnimg.cn/63926529df364f09bcb203a8a9016854.png) # NLP 开源矩阵 diff --git a/pom.xml b/pom.xml index 8e459a0..f453ffa 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 
com.github.houbb sensitive-word - 0.25.0 + 0.25.1 diff --git a/release.bat b/release.bat index a87d5d3..145eb7d 100644 --- a/release.bat +++ b/release.bat @@ -10,7 +10,7 @@ ECHO "============================= RELEASE START..." :: 版本号信息(需要手动指定) :::: 旧版本名称 -SET version=0.25.0 +SET version=0.25.1 :::: 新版本名称 SET newVersion=0.26.0 :::: 组织名称 diff --git a/src/main/java/com/github/houbb/sensitive/word/api/IWordResult.java b/src/main/java/com/github/houbb/sensitive/word/api/IWordResult.java index ceaaf47..cfb1918 100644 --- a/src/main/java/com/github/houbb/sensitive/word/api/IWordResult.java +++ b/src/main/java/com/github/houbb/sensitive/word/api/IWordResult.java @@ -28,4 +28,11 @@ public interface IWordResult { */ String type(); + /** + * 实际匹配的单词,方便统一的标签等处理,实际问题排查等 + * @return 结果 + * @since 0.25.1 + */ + String word(); + } diff --git a/src/main/java/com/github/houbb/sensitive/word/core/SensitiveWord.java b/src/main/java/com/github/houbb/sensitive/word/core/SensitiveWord.java index 20f90b3..8147d34 100644 --- a/src/main/java/com/github/houbb/sensitive/word/core/SensitiveWord.java +++ b/src/main/java/com/github/houbb/sensitive/word/core/SensitiveWord.java @@ -6,6 +6,7 @@ import com.github.houbb.sensitive.word.api.*; import com.github.houbb.sensitive.word.api.context.InnerSensitiveWordContext; import com.github.houbb.sensitive.word.constant.enums.WordValidModeEnum; import com.github.houbb.sensitive.word.support.check.WordCheckResult; +import com.github.houbb.sensitive.word.support.result.WordLengthResult; import com.github.houbb.sensitive.word.support.result.WordResult; import com.github.houbb.sensitive.word.utils.InnerWordFormatUtils; @@ -80,13 +81,16 @@ public class SensitiveWord extends AbstractSensitiveWord { // 命中 - int wordLength = checkResult.wordLengthResult().wordDenyLen(); + final WordLengthResult wordLengthResult = checkResult.wordLengthResult(); + int wordLength = wordLengthResult.wordDenyLen(); if (wordLength > 0) { // 保存敏感词 WordResult wordResult = 
WordResult.newInstance() .startIndex(i) .endIndex(i+wordLength) - .type(checkResult.type()); + .type(checkResult.type()) + .word(wordLengthResult.wordDeny()); + //v0.13.0 添加判断 if(wordResultCondition.match(wordResult, text, modeEnum, context)) { resultList.add(wordResult); diff --git a/src/main/java/com/github/houbb/sensitive/word/support/check/AbstractWordCheck.java b/src/main/java/com/github/houbb/sensitive/word/support/check/AbstractWordCheck.java index 32f6ea6..4d67497 100644 --- a/src/main/java/com/github/houbb/sensitive/word/support/check/AbstractWordCheck.java +++ b/src/main/java/com/github/houbb/sensitive/word/support/check/AbstractWordCheck.java @@ -59,7 +59,8 @@ public abstract class AbstractWordCheck implements IWordCheck { return WordCheckResult.newInstance() .wordLengthResult(wordLengthResult) .type(getType()) - .checkClass(clazz); + .checkClass(clazz) + ; } } diff --git a/src/main/java/com/github/houbb/sensitive/word/support/result/WordLengthResult.java b/src/main/java/com/github/houbb/sensitive/word/support/result/WordLengthResult.java index d1888bd..9bc5663 100644 --- a/src/main/java/com/github/houbb/sensitive/word/support/result/WordLengthResult.java +++ b/src/main/java/com/github/houbb/sensitive/word/support/result/WordLengthResult.java @@ -15,6 +15,18 @@ public class WordLengthResult { */ private int wordDenyLen; + /** + * 黑名单匹配词 + * @since 0.25.1 + */ + private String wordDeny; + + /** + * 白名单实际匹配值 + * @since 0.25.1 + */ + private String wordAllow; + public static WordLengthResult newInstance() { return new WordLengthResult(); } @@ -37,11 +49,31 @@ public class WordLengthResult { return this; } + public String wordDeny() { + return wordDeny; + } + + public WordLengthResult wordDeny(String wordDeny) { + this.wordDeny = wordDeny; + return this; + } + + public String wordAllow() { + return wordAllow; + } + + public WordLengthResult wordAllow(String wordAllow) { + this.wordAllow = wordAllow; + return this; + } + @Override public String toString() { 
return "WordLengthResult{" + "wordAllowLen=" + wordAllowLen + ", wordDenyLen=" + wordDenyLen + + ", wordDeny='" + wordDeny + '\'' + + ", wordAllow='" + wordAllow + '\'' + '}'; } diff --git a/src/main/java/com/github/houbb/sensitive/word/support/result/WordResult.java b/src/main/java/com/github/houbb/sensitive/word/support/result/WordResult.java index fa6e018..b1919ed 100644 --- a/src/main/java/com/github/houbb/sensitive/word/support/result/WordResult.java +++ b/src/main/java/com/github/houbb/sensitive/word/support/result/WordResult.java @@ -18,6 +18,12 @@ public class WordResult implements IWordResult { */ private String type; + /** + * 单词匹配 + * @since 0.25.1 + */ + private String word; + private WordResult(){} public static WordResult newInstance() { @@ -54,12 +60,23 @@ public class WordResult implements IWordResult { return this; } + @Override + public String word() { + return word; + } + + public WordResult word(String word) { + this.word = word; + return this; + } + @Override public String toString() { return "WordResult{" + "startIndex=" + startIndex + ", endIndex=" + endIndex + ", type='" + type + '\'' + + ", word='" + word + '\'' + '}'; } diff --git a/src/main/java/com/github/houbb/sensitive/word/support/result/WordResultHandlerWordTags.java b/src/main/java/com/github/houbb/sensitive/word/support/result/WordResultHandlerWordTags.java index cab8a62..5e79d64 100644 --- a/src/main/java/com/github/houbb/sensitive/word/support/result/WordResultHandlerWordTags.java +++ b/src/main/java/com/github/houbb/sensitive/word/support/result/WordResultHandlerWordTags.java @@ -1,5 +1,6 @@ package com.github.houbb.sensitive.word.support.result; +import com.github.houbb.heaven.util.util.CollectionUtil; import com.github.houbb.sensitive.word.api.IWordContext; import com.github.houbb.sensitive.word.api.IWordResult; import com.github.houbb.sensitive.word.utils.InnerWordCharUtils; @@ -17,15 +18,22 @@ public class WordResultHandlerWordTags extends AbstractWordResultHandler wordTags 
= InnerWordTagUtils.tags(word, wordContext); + + // 如果为空,则尝试使用命中的敏感词匹配 v0.25.1 bug105 + if(CollectionUtil.isEmpty(wordTags)) { + wordTags = InnerWordTagUtils.tags(wordResult.word(), wordContext); + } + + dto.setWord(word); dto.setTags(wordTags); + return dto; } diff --git a/src/main/java/com/github/houbb/sensitive/word/utils/InnerWordTagUtils.java b/src/main/java/com/github/houbb/sensitive/word/utils/InnerWordTagUtils.java index 3401e03..b2d617f 100644 --- a/src/main/java/com/github/houbb/sensitive/word/utils/InnerWordTagUtils.java +++ b/src/main/java/com/github/houbb/sensitive/word/utils/InnerWordTagUtils.java @@ -26,7 +26,7 @@ public class InnerWordTagUtils { public static Set tags(final String word, final IWordContext wordContext) { if(StringUtil.isEmpty(word)) { - return Collections.emptySet(); + return null; } final IWordTag wordTag = wordContext.wordTag(); diff --git a/src/test/java/com/github/houbb/sensitive/word/bugs/b105/Bug105Test.java b/src/test/java/com/github/houbb/sensitive/word/bugs/b105/Bug105Test.java new file mode 100644 index 0000000..271fd95 --- /dev/null +++ b/src/test/java/com/github/houbb/sensitive/word/bugs/b105/Bug105Test.java @@ -0,0 +1,88 @@ +package com.github.houbb.sensitive.word.bugs.b105; + +import java.util.*; + +import com.github.houbb.heaven.util.util.CollectionUtil; +import org.junit.Assert; +import org.junit.Test; + +import com.github.houbb.sensitive.word.bs.SensitiveWordBs; +import com.github.houbb.sensitive.word.support.ignore.SensitiveWordCharIgnores; +import com.github.houbb.sensitive.word.support.result.WordResultHandlers; +import com.github.houbb.sensitive.word.support.result.WordTagsDto; +import com.github.houbb.sensitive.word.support.tag.WordTags; + +/** + * 测试Issue #105: 处理带有噪音字符时的标签查找 + * 验证在启用字符忽略功能时,敏感词标签仍能被正确查找 + * + * src/main/java/com/github/houbb/sensitive/word/support/result/WordResultHandlerWordTags.java + * @since 0.25.1 + */ +public class Bug105Test { + + @Test + public void testNoiseCharacterInTaggedWords() { 
+ Map> newHashMap = new HashMap<>(); + newHashMap.put("毛主席", new HashSet<>(Arrays.asList("政治", "领导人"))); + + // 配置同时启用字符忽略和标签的实例 + SensitiveWordBs ignoreAndTagWordBs = SensitiveWordBs.newInstance() + .charIgnore(SensitiveWordCharIgnores.specialChars()) // 启用字符忽略 + .wordTag(WordTags.map(newHashMap)) + .init(); + + // 包含噪音字符的敏感词文本 + final String noisyText = "你好毛---主---席"; + + // 测试同时启用字符忽略和标签的实例(修复前会失败) + List fixedWord = ignoreAndTagWordBs.findAll(noisyText, WordResultHandlers.wordTags()); + Assert.assertEquals(1, fixedWord.size()); + Assert.assertEquals("毛---主---席", fixedWord.get(0).getWord()); + Assert.assertNotNull("标签不应为空", fixedWord.get(0).getTags()); + Assert.assertTrue("应包含'政治'标签", fixedWord.get(0).getTags().contains("政治")); + Assert.assertTrue("应包含'领导人'标签", fixedWord.get(0).getTags().contains("领导人")); + } + + @Test + public void testNoiseCharacterInTaggedWords2() { + Map> newHashMap = new HashMap<>(); + newHashMap.put("毛主席", new HashSet<>(Arrays.asList("政治", "领导人"))); + newHashMap.put("毛---主---席", new HashSet<>(Arrays.asList("政治", "领导人", "自定义的"))); + + // 配置同时启用字符忽略和标签的实例 + SensitiveWordBs ignoreAndTagWordBs = SensitiveWordBs.newInstance() + .charIgnore(SensitiveWordCharIgnores.specialChars()) // 启用字符忽略 + .wordTag(WordTags.map(newHashMap)) + .init(); + + // 包含噪音字符的敏感词文本 + final String noisyText = "你好毛---主---席"; + + // 测试同时启用字符忽略和标签的实例(修复前会失败) + List fixedWord = ignoreAndTagWordBs.findAll(noisyText, WordResultHandlers.wordTags()); + Assert.assertEquals(1, fixedWord.size()); + Assert.assertEquals("[政治, 自定义的, 领导人]", fixedWord.get(0).getTags().toString()); + } + + @Test + public void testNoiseCharacterInTaggedWords3() { + Map> newHashMap = new HashMap<>(); + newHashMap.put("毛xxx主xxxx席", new HashSet<>(Arrays.asList("政治", "领导人", "自定义的"))); + + // 配置同时启用字符忽略和标签的实例 + SensitiveWordBs ignoreAndTagWordBs = SensitiveWordBs.newInstance() + .charIgnore(SensitiveWordCharIgnores.specialChars()) // 启用字符忽略 + .wordTag(WordTags.map(newHashMap)) + .init(); + + // 包含噪音字符的敏感词文本 + 
final String noisyText = "你好毛---主---席"; + + // 测试同时启用字符忽略和标签的实例(修复前会失败) + List fixedWord = ignoreAndTagWordBs.findAll(noisyText, WordResultHandlers.wordTags()); + Assert.assertEquals(1, fixedWord.size()); + Assert.assertTrue(CollectionUtil.isEmpty(fixedWord.get(0).getTags())); + } + +} diff --git a/src/test/java/com/github/houbb/sensitive/word/core/SensitiveWordHelperTest.java b/src/test/java/com/github/houbb/sensitive/word/core/SensitiveWordHelperTest.java index 71640fe..9d80501 100644 --- a/src/test/java/com/github/houbb/sensitive/word/core/SensitiveWordHelperTest.java +++ b/src/test/java/com/github/houbb/sensitive/word/core/SensitiveWordHelperTest.java @@ -62,7 +62,7 @@ public class SensitiveWordHelperTest { final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。"; List wordList = SensitiveWordHelper.findAll(text, WordResultHandlers.raw()); - Assert.assertEquals("[WordResult{startIndex=0, endIndex=4, type='WORD'}, WordResult{startIndex=9, endIndex=12, type='WORD'}, WordResult{startIndex=18, endIndex=21, type='WORD'}]", wordList.toString()); + Assert.assertEquals("[WordResult{startIndex=0, endIndex=4, type='WORD', word='5星红旗'}, WordResult{startIndex=9, endIndex=12, type='WORD', word='毛主席'}, WordResult{startIndex=18, endIndex=21, type='WORD', word='天安门'}]", wordList.toString()); } @@ -99,7 +99,7 @@ public class SensitiveWordHelperTest { final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。"; IWordResult word = SensitiveWordHelper.findFirst(text, WordResultHandlers.raw()); - Assert.assertEquals("WordResult{startIndex=0, endIndex=4, type='WORD'}", word.toString()); + Assert.assertEquals("WordResult{startIndex=0, endIndex=4, type='WORD', word='5星红旗'}", word.toString()); } /** diff --git a/src/test/java/com/github/houbb/sensitive/word/support/handler/WordResultHandlerTest.java b/src/test/java/com/github/houbb/sensitive/word/support/handler/WordResultHandlerTest.java index 9525a4d..38db75a 100644 --- 
a/src/test/java/com/github/houbb/sensitive/word/support/handler/WordResultHandlerTest.java +++ b/src/test/java/com/github/houbb/sensitive/word/support/handler/WordResultHandlerTest.java @@ -10,7 +10,7 @@ import org.junit.Assert; import org.junit.Ignore; import org.junit.Test; -import java.util.List; +import java.util.*; /** * @since 0.12.0 @@ -27,7 +27,7 @@ public class WordResultHandlerTest { Assert.assertEquals("[五星红旗, 毛主席, 天安门]", wordList2.toString()); List wordList3 = SensitiveWordHelper.findAll(text, WordResultHandlers.raw()); - Assert.assertEquals("[WordResult{startIndex=0, endIndex=4, type='WORD'}, WordResult{startIndex=9, endIndex=12, type='WORD'}, WordResult{startIndex=18, endIndex=21, type='WORD'}]", wordList3.toString()); + Assert.assertEquals("[WordResult{startIndex=0, endIndex=4, type='WORD', word='5星红旗'}, WordResult{startIndex=9, endIndex=12, type='WORD', word='毛主席'}, WordResult{startIndex=18, endIndex=21, type='WORD', word='天安门'}]", wordList3.toString()); } @Test @@ -35,20 +35,24 @@ public class WordResultHandlerTest { final String text = "骂人:你他妈; 邮箱:123@qq.com; mobile: 13088889999; 网址:https://www.baidu.com"; List wordList3 = SensitiveWordHelper .findAll(text, WordResultHandlers.raw()); - Assert.assertEquals("[WordResult{startIndex=3, endIndex=6, type='WORD'}]", wordList3.toString()); + Assert.assertEquals("[WordResult{startIndex=3, endIndex=6, type='WORD', word='你他妈'}]", wordList3.toString()); } @Test - @Ignore public void wordTagsTest() { final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。"; // 默认敏感词标签为空 List wordList1 = SensitiveWordHelper.findAll(text, WordResultHandlers.wordTags()); - Assert.assertEquals("[WordTagsDto{word='五星红旗', tags=[]}, WordTagsDto{word='毛主席', tags=[]}, WordTagsDto{word='天安门', tags=[]}]", wordList1.toString()); + Assert.assertEquals("[WordTagsDto{word='五星红旗', tags=null}, WordTagsDto{word='毛主席', tags=[0]}, WordTagsDto{word='天安门', tags=null}]", wordList1.toString()); + + Map> wordMap = new HashMap<>(); + wordMap.put("五星红旗", new 
HashSet<>(Arrays.asList("政治", "国家"))); + wordMap.put("毛主席", new HashSet<>(Arrays.asList("政治", "伟人", "国家"))); + wordMap.put("天安门", new HashSet<>(Arrays.asList("政治", "国家", "地址"))); List wordList2 = SensitiveWordBs.newInstance() - .wordTag(WordTags.file("D:\\github\\sensitive-word\\src\\test\\resources\\dict_tag_test.txt")) + .wordTag(WordTags.map(wordMap)) .init() .findAll(text, WordResultHandlers.wordTags()); Assert.assertEquals("[WordTagsDto{word='五星红旗', tags=[政治, 国家]}, WordTagsDto{word='毛主席', tags=[政治, 伟人, 国家]}, WordTagsDto{word='天安门', tags=[政治, 国家, 地址]}]", wordList2.toString());