diff --git a/CHANGE_LOG.md b/CHANGE_LOG.md index 467fc53..ed66493 100644 --- a/CHANGE_LOG.md +++ b/CHANGE_LOG.md @@ -217,4 +217,11 @@ | 序号 | 变更类型 | 说明 | 时间 | 备注 | |:---|:-----|----------------------|:--------------------|:------| -| 1 | A | 添加忽略字符接口,便于跳过一些干扰的字符 | 2023-12-08 23:51:58 | | \ No newline at end of file +| 1 | A | 添加忽略字符接口,便于跳过一些干扰的字符 | 2023-12-08 23:51:58 | | + +# release_0.12.0 + +| 序号 | 变更类型 | 说明 | 时间 | 备注 | +|:---|:-----|----------------------------------------------|:--------------------|:------| +| 1 | A | 添加 wordTags 标签结果处理类 | 2023-12-18 23:51:58 | | +| 2 | A | 添加 AbstractWordResultHandler 处理类,便于后续拓展和统一管理 | 2023-12-18 23:51:58 | | \ No newline at end of file diff --git a/README.md b/README.md index 3c5f224..6e14a4b 100644 --- a/README.md +++ b/README.md @@ -62,7 +62,7 @@ 这两个资料阅读可在下方文章获取: -> [v0.11.0-敏感词新特性](https://mp.weixin.qq.com/s/m40ZnR6YF6WgPrArUSZ_0g) +> [v0.11.0-敏感词新特性及对应标签文件](https://mp.weixin.qq.com/s/m40ZnR6YF6WgPrArUSZ_0g) # 快速开始 @@ -78,7 +78,7 @@ com.github.houbb sensitive-word - 0.11.0 + 0.12.0 ``` @@ -98,23 +98,7 @@ | findFirst(String, IWordResultHandler) | IWordResultHandler 结果处理类 | 字符串 | 返回字符串中第一个敏感词 | | tags(String) | 获取敏感词的标签 | 敏感词字符串 | 返回敏感词的标签列表 | -## IWordResultHandler 结果处理类 -IWordResultHandler 可以对敏感词的结果进行处理,允许用户自定义。 - -内置实现见 `WordResultHandlers` 工具类: - -- WordResultHandlers.word() - -只保留敏感词单词本身。 - -- WordResultHandlers.raw() - -保留敏感词相关信息,包含敏感词的开始和结束下标。 - -## 使用实例 - -所有测试案例参见 [SensitiveWordHelperTest](https://github.com/houbb/sensitive-word/blob/master/src/test/java/com/github/houbb/sensitive/word/core/SensitiveWordHelperTest.java) ### 判断是否包含敏感词 @@ -243,6 +227,60 @@ public class MyWordReplace implements IWordReplace { 我们针对其中的部分词做固定映射处理,其他的默认转换为 `*`。 +## IWordResultHandler 结果处理类 + +IWordResultHandler 可以对敏感词的结果进行处理,允许用户自定义。 + +内置实现见 `WordResultHandlers` 工具类: + +- WordResultHandlers.word() + +只保留敏感词单词本身。 + +- WordResultHandlers.raw() + +保留敏感词相关信息,包含敏感词的开始和结束下标。 + +- WordResultHandlers.wordTags() + +同时保留单词,和对应的词标签信息。 + +### 使用实例 + +所有测试案例参见 [SensitiveWordHelperTest](https://github.com/houbb/sensitive-word/blob/master/src/test/java/com/github/houbb/sensitive/word/core/SensitiveWordHelperTest.java) + +1)基本例子 + +```java +final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。"; + +List wordList = SensitiveWordHelper.findAll(text); +Assert.assertEquals("[五星红旗, 毛主席, 天安门]", wordList.toString()); +List wordList2 = SensitiveWordHelper.findAll(text, WordResultHandlers.word()); +Assert.assertEquals("[五星红旗, 毛主席, 天安门]", wordList2.toString()); + +List wordList3 = SensitiveWordHelper.findAll(text, WordResultHandlers.raw()); +Assert.assertEquals("[WordResult{startIndex=0, endIndex=4}, WordResult{startIndex=9, endIndex=12}, WordResult{startIndex=18, endIndex=21}]", wordList3.toString()); +``` + +2) wordTags 例子 + +我们在 `dict_tag_test.txt` 文件中指定对应词的标签信息。 + +```java +final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。"; + +// 默认敏感词标签为空 +List wordList1 = SensitiveWordHelper.findAll(text, WordResultHandlers.wordTags()); +Assert.assertEquals("[WordTagsDto{word='五星红旗', tags=[]}, WordTagsDto{word='毛主席', tags=[]}, WordTagsDto{word='天安门', tags=[]}]", wordList1.toString()); + +List wordList2 = SensitiveWordBs.newInstance() + .wordTag(WordTags.file("dict_tag_test.txt")) + .init() + .findAll(text, WordResultHandlers.wordTags()); +Assert.assertEquals("[WordTagsDto{word='五星红旗', tags=[政治, 国家]}, WordTagsDto{word='毛主席', tags=[政治, 伟人, 国家]}, WordTagsDto{word='天安门', tags=[政治, 国家, 地址]}]", wordList2.toString()); +``` + # 更多特性 后续的诸多特性,主要是针对各种针对各种情况的处理,尽可能的提升敏感词命中率。 @@ -793,9 +831,7 @@ remove、add、edit? - [x] 敏感词标签接口支持 -- [ ] 敏感词处理时标签支持 - -TODO: 比较耗时间。 +- [x] 敏感词处理时标签支持 - [x] wordData 的内存占用对比 + 优化 @@ -807,8 +843,6 @@ FormatCombine/CheckCombine/AllowDenyCombine 组合策略,允许用户自定义 - [ ] 添加 ThreadLocal 等性能优化 - - # 拓展阅读 [敏感词工具实现思路](https://houbb.github.io/2020/01/07/sensitive-word) @@ -819,6 +853,8 @@ FormatCombine/CheckCombine/AllowDenyCombine 组合策略,允许用户自定义 [java 如何实现开箱即用的敏感词控台服务?](https://mp.weixin.qq.com/s/rQo75cfMU_OEbTJa0JGMGg) +[v0.11.0-敏感词新特性及对应标签文件](https://mp.weixin.qq.com/s/m40ZnR6YF6WgPrArUSZ_0g) + ![wechat](https://img-blog.csdnimg.cn/63926529df364f09bcb203a8a9016854.png) # NLP 开源矩阵 diff --git a/release.bat b/release.bat index 0460ad3..161d6e5 100644 --- a/release.bat +++ b/release.bat @@ -10,9 +10,9 @@ ECHO "============================= RELEASE START..." :: 版本号信息(需要手动指定) :::: 旧版本名称 -SET version=0.11.0 +SET version=0.12.0 :::: 新版本名称 -SET newVersion=0.12.0 +SET newVersion=0.13.0 :::: 组织名称 SET groupName=com.github.houbb :::: 项目名称 diff --git a/src/main/java/com/github/houbb/sensitive/word/support/result/AbstractWordResultHandler.java b/src/main/java/com/github/houbb/sensitive/word/support/result/AbstractWordResultHandler.java new file mode 100644 index 0000000..345fb67 --- /dev/null +++ b/src/main/java/com/github/houbb/sensitive/word/support/result/AbstractWordResultHandler.java @@ -0,0 +1,26 @@ +package com.github.houbb.sensitive.word.support.result; + +import com.github.houbb.sensitive.word.api.IWordContext; +import com.github.houbb.sensitive.word.api.IWordResult; +import com.github.houbb.sensitive.word.api.IWordResultHandler; + +/** + * 抽象的处理结果 + * + * @since 0.12.0 + * @param 泛型 + */ +public abstract class AbstractWordResultHandler implements IWordResultHandler { + + protected abstract R doHandle(IWordResult wordResult, IWordContext wordContext, String originalText); + + @Override + public R handle(IWordResult wordResult, IWordContext wordContext, String originalText) { + if(wordResult == null) { + return null; + } + + return doHandle(wordResult, wordContext, originalText); + } + +} diff --git a/src/main/java/com/github/houbb/sensitive/word/support/result/WordResultHandlerRaw.java b/src/main/java/com/github/houbb/sensitive/word/support/result/WordResultHandlerRaw.java index a1ddb2d..1e373ab 100644 --- a/src/main/java/com/github/houbb/sensitive/word/support/result/WordResultHandlerRaw.java +++ b/src/main/java/com/github/houbb/sensitive/word/support/result/WordResultHandlerRaw.java @@ -3,7 +3,6 @@ package com.github.houbb.sensitive.word.support.result; import com.github.houbb.heaven.annotation.ThreadSafe; import com.github.houbb.sensitive.word.api.IWordContext; import com.github.houbb.sensitive.word.api.IWordResult; -import com.github.houbb.sensitive.word.api.IWordResultHandler; /** * 不做任何处理 @@ -11,7 +10,7 @@ import com.github.houbb.sensitive.word.api.IWordResultHandler; * @since 0.1.0 */ @ThreadSafe -public class WordResultHandlerRaw implements IWordResultHandler { +public class WordResultHandlerRaw extends AbstractWordResultHandler { /** * @since 0.3.0 @@ -23,7 +22,7 @@ public class WordResultHandlerRaw implements IWordResultHandler { } @Override - public IWordResult handle(IWordResult wordResult, IWordContext wordContext, String originalText) { + protected IWordResult doHandle(IWordResult wordResult, IWordContext wordContext, String originalText) { return wordResult; } diff --git a/src/main/java/com/github/houbb/sensitive/word/support/result/WordResultHandlerWord.java b/src/main/java/com/github/houbb/sensitive/word/support/result/WordResultHandlerWord.java index 6748946..af79edb 100644 --- a/src/main/java/com/github/houbb/sensitive/word/support/result/WordResultHandlerWord.java +++ b/src/main/java/com/github/houbb/sensitive/word/support/result/WordResultHandlerWord.java @@ -3,7 +3,6 @@ package com.github.houbb.sensitive.word.support.result; import com.github.houbb.heaven.annotation.ThreadSafe; import com.github.houbb.sensitive.word.api.IWordContext; import com.github.houbb.sensitive.word.api.IWordResult; -import com.github.houbb.sensitive.word.api.IWordResultHandler; import com.github.houbb.sensitive.word.utils.InnerWordCharUtils; /** @@ -13,7 +12,7 @@ import com.github.houbb.sensitive.word.utils.InnerWordCharUtils; * @since 0.1.0 */ @ThreadSafe -public class WordResultHandlerWord implements IWordResultHandler { +public class WordResultHandlerWord extends AbstractWordResultHandler { /** * @since 0.3.0 @@ -25,11 +24,7 @@ public class WordResultHandlerWord implements IWordResultHandler { } @Override - public String handle(IWordResult wordResult, IWordContext wordContext, String originalText) { - if(wordResult == null) { - return null; - } - + protected String doHandle(IWordResult wordResult, IWordContext wordContext, String originalText) { // 截取 return InnerWordCharUtils.getString(originalText.toCharArray(), wordResult); } diff --git a/src/main/java/com/github/houbb/sensitive/word/support/result/WordResultHandlerWordTags.java b/src/main/java/com/github/houbb/sensitive/word/support/result/WordResultHandlerWordTags.java new file mode 100644 index 0000000..7d1f4f2 --- /dev/null +++ b/src/main/java/com/github/houbb/sensitive/word/support/result/WordResultHandlerWordTags.java @@ -0,0 +1,31 @@ +package com.github.houbb.sensitive.word.support.result; + +import com.github.houbb.sensitive.word.api.IWordContext; +import com.github.houbb.sensitive.word.api.IWordResult; +import com.github.houbb.sensitive.word.utils.InnerWordCharUtils; + +import java.util.Set; + +/** + * 单词+对应的标签信息 + * + * @author binbin.hou + * @since 0.12.0 + */ +public class WordResultHandlerWordTags extends AbstractWordResultHandler { + + @Override + protected WordTagsDto doHandle(IWordResult wordResult, IWordContext wordContext, String originalText) { + // 截取 + String word = InnerWordCharUtils.getString(originalText.toCharArray(), wordResult); + // 标签 + + WordTagsDto dto = new WordTagsDto(); + dto.setWord(word); + // 获取 tags + Set wordTags = wordContext.wordTag().getTag(word); + dto.setTags(wordTags); + return dto; + } + +} diff --git a/src/main/java/com/github/houbb/sensitive/word/support/result/WordResultHandlers.java b/src/main/java/com/github/houbb/sensitive/word/support/result/WordResultHandlers.java index b77e6b7..07b1726 100644 --- a/src/main/java/com/github/houbb/sensitive/word/support/result/WordResultHandlers.java +++ b/src/main/java/com/github/houbb/sensitive/word/support/result/WordResultHandlers.java @@ -30,4 +30,13 @@ public final class WordResultHandlers { return WordResultHandlerWord.getInstance(); } + /** + * 单词+标签的处理结果 + * @return 单词+标签的处理结果 + * @since 0.12.0 + */ + public static IWordResultHandler wordTags() { + return new WordResultHandlerWordTags(); + } + } diff --git a/src/main/java/com/github/houbb/sensitive/word/support/result/WordTagsDto.java b/src/main/java/com/github/houbb/sensitive/word/support/result/WordTagsDto.java new file mode 100644 index 0000000..8031aa3 --- /dev/null +++ b/src/main/java/com/github/houbb/sensitive/word/support/result/WordTagsDto.java @@ -0,0 +1,39 @@ +package com.github.houbb.sensitive.word.support.result; + +import java.io.Serializable; +import java.util.Set; + +/** + * @since 0.12.0 + */ +public class WordTagsDto implements Serializable { + + private String word; + + private Set tags; + + public String getWord() { + return word; + } + + public void setWord(String word) { + this.word = word; + } + + public Set getTags() { + return tags; + } + + public void setTags(Set tags) { + this.tags = tags; + } + + @Override + public String toString() { + return "WordTagsDto{" + + "word='" + word + '\'' + + ", tags=" + tags + + '}'; + } + +} diff --git a/src/test/java/com/github/houbb/sensitive/word/support/handler/WordResultHandlerTest.java b/src/test/java/com/github/houbb/sensitive/word/support/handler/WordResultHandlerTest.java new file mode 100644 index 0000000..dd0784f --- /dev/null +++ b/src/test/java/com/github/houbb/sensitive/word/support/handler/WordResultHandlerTest.java @@ -0,0 +1,49 @@ +package com.github.houbb.sensitive.word.support.handler; + +import com.github.houbb.sensitive.word.api.IWordResult; +import com.github.houbb.sensitive.word.bs.SensitiveWordBs; +import com.github.houbb.sensitive.word.core.SensitiveWordHelper; +import com.github.houbb.sensitive.word.support.result.WordResultHandlers; +import com.github.houbb.sensitive.word.support.result.WordTagsDto; +import com.github.houbb.sensitive.word.support.tag.WordTags; +import org.junit.Assert; +import org.junit.Ignore; +import org.junit.Test; + +import java.util.List; + +/** + * @since 0.12.0 + */ +public class WordResultHandlerTest { + + @Test + public void findAllWordTest() { + final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。"; + + List wordList = SensitiveWordHelper.findAll(text); + Assert.assertEquals("[五星红旗, 毛主席, 天安门]", wordList.toString()); + List wordList2 = SensitiveWordHelper.findAll(text, WordResultHandlers.word()); + Assert.assertEquals("[五星红旗, 毛主席, 天安门]", wordList2.toString()); + + List wordList3 = SensitiveWordHelper.findAll(text, WordResultHandlers.raw()); + Assert.assertEquals("[WordResult{startIndex=0, endIndex=4}, WordResult{startIndex=9, endIndex=12}, WordResult{startIndex=18, endIndex=21}]", wordList3.toString()); + } + + @Test + @Ignore + public void wordTagsTest() { + final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。"; + + // 默认敏感词标签为空 + List wordList1 = SensitiveWordHelper.findAll(text, WordResultHandlers.wordTags()); + Assert.assertEquals("[WordTagsDto{word='五星红旗', tags=[]}, WordTagsDto{word='毛主席', tags=[]}, WordTagsDto{word='天安门', tags=[]}]", wordList1.toString()); + + List wordList2 = SensitiveWordBs.newInstance() + .wordTag(WordTags.file("D:\\github\\sensitive-word\\src\\test\\resources\\dict_tag_test.txt")) + .init() + .findAll(text, WordResultHandlers.wordTags()); + Assert.assertEquals("[WordTagsDto{word='五星红旗', tags=[政治, 国家]}, WordTagsDto{word='毛主席', tags=[政治, 伟人, 国家]}, WordTagsDto{word='天安门', tags=[政治, 国家, 地址]}]", wordList2.toString()); + } + +} diff --git a/src/test/java/com/github/houbb/sensitive/word/support/package-info.java b/src/test/java/com/github/houbb/sensitive/word/support/package-info.java new file mode 100644 index 0000000..2f0e44d --- /dev/null +++ b/src/test/java/com/github/houbb/sensitive/word/support/package-info.java @@ -0,0 +1 @@ +package com.github.houbb.sensitive.word.support; \ No newline at end of file diff --git a/src/test/resources/dict_tag_test.txt b/src/test/resources/dict_tag_test.txt index 51d66ca..02889a7 100644 --- a/src/test/resources/dict_tag_test.txt +++ b/src/test/resources/dict_tag_test.txt @@ -1 +1,3 @@ -五星红旗 政治,国家 \ No newline at end of file +五星红旗 政治,国家 +毛主席 政治,国家,伟人 +天安门 政治,国家,地址 \ No newline at end of file