diff --git a/CHANGE_LOG.md b/CHANGE_LOG.md index 672dff3..cded43a 100644 --- a/CHANGE_LOG.md +++ b/CHANGE_LOG.md @@ -206,3 +206,9 @@ | 序号 | 变更类型 | 说明 | 时间 | 备注 | |:---|:-----|-----------------|:--------------------|:------| | 1 | O | 移除单个汉字+部分常用词的脏词 | 2023-11-17 23:51:58 | 降低误判率 | + +# release_0.10.0 + +| 序号 | 变更类型 | 说明 | 时间 | 备注 | +|:---|:-----|------------------|:--------------------|:------| +| 1 | A | 添加脏词的标签接口,便于后续拓展 | 2023-12-05 23:51:58 | | diff --git a/README.md b/README.md index 316d362..8550bbe 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,8 @@ - [支持数据的数据动态更新(用户自定义),实时生效](https://github.com/houbb/sensitive-word#%E5%8A%A8%E6%80%81%E5%8A%A0%E8%BD%BD%E7%94%A8%E6%88%B7%E8%87%AA%E5%AE%9A%E4%B9%89) +- [支持敏感词的标签接口](https://github.com/houbb/sensitive-word#%E6%95%8F%E6%84%9F%E8%AF%8D%E6%A0%87%E7%AD%BE) + ## 变更日志 [CHANGE_LOG.md](https://github.com/houbb/sensitive-word/blob/master/CHANGE_LOG.md) @@ -58,7 +60,7 @@ com.github.houbb sensitive-word - 0.9.0 + 0.10.0 ``` @@ -66,16 +68,17 @@ `SensitiveWordHelper` 作为敏感词的工具类,核心方法如下: -| 方法 | 参数 | 返回值| 说明 | -|:---|:---|:---|:---| -| contains(String) | 待验证的字符串 | 布尔值 | 验证字符串是否包含敏感词 | -| replace(String, ISensitiveWordReplace) | 使用指定的替换策略替换敏感词 | 字符串 | 返回脱敏后的字符串 | -| replace(String, char) | 使用指定的 char 替换敏感词 | 字符串 | 返回脱敏后的字符串 | -| replace(String) | 使用 `*` 替换敏感词 | 字符串 | 返回脱敏后的字符串 | -| findAll(String) | 待验证的字符串 | 字符串列表 | 返回字符串中所有敏感词 | -| findFirst(String) | 待验证的字符串 | 字符串 | 返回字符串中第一个敏感词 | -| findAll(String, IWordResultHandler) | IWordResultHandler 结果处理类 | 字符串列表 | 返回字符串中所有敏感词 | -| findFirst(String, IWordResultHandler) | IWordResultHandler 结果处理类 | 字符串 | 返回字符串中第一个敏感词 | +| 方法 | 参数 | 返回值 | 说明 | +|:---------------------------------------|:-------------------------|:-------|:-------------| +| contains(String) | 待验证的字符串 | 布尔值 | 验证字符串是否包含敏感词 | +| replace(String, ISensitiveWordReplace) | 使用指定的替换策略替换敏感词 | 字符串 | 返回脱敏后的字符串 | +| replace(String, char) | 使用指定的 char 替换敏感词 | 字符串 | 返回脱敏后的字符串 | +| replace(String) | 使用 `*` 替换敏感词 | 字符串 | 返回脱敏后的字符串 | +| findAll(String) | 待验证的字符串 | 字符串列表 | 返回字符串中所有敏感词 | +| findFirst(String) | 待验证的字符串 | 
字符串 | 返回字符串中第一个敏感词 | +| findAll(String, IWordResultHandler) | IWordResultHandler 结果处理类 | 字符串列表 | 返回字符串中所有敏感词 | +| findFirst(String, IWordResultHandler) | IWordResultHandler 结果处理类 | 字符串 | 返回字符串中第一个敏感词 | +| tags(String) | 敏感词 | 字符串集合 | 返回敏感词的标签列表 | ## IWordResultHandler 结果处理类 @@ -387,6 +390,70 @@ Assert.assertTrue(wordBs.contains(text)); | 10 | enableWordCheck | 是否启用敏感单词检测 | true | | 11 | numCheckLen | 数字检测,自定义指定长度。 | 8 | +# 敏感词标签 + +## 说明 + +有时候我们希望对敏感词加一个分类标签:比如社情、暴/力等等。 + +这样后续可以按照标签等进行更多特性操作,比如只处理某一类的标签。 + +支持版本:v0.10.0 + +## 入门例子 + +### 接口 + +这里只是一个抽象的接口,用户可以自行定义实现。比如从数据库查询等。 + +```java +public interface IWordTag { + + /** + * 查询标签列表 + * @param word 脏词 + * @return 结果 + */ + Set<String> getTag(String word); + +} +``` + +### 配置文件 + +我们可以自定义 dict 标签文件,通过 WordTags.file() 创建一个 WordTag 实现。 + +- dict_tag_test.txt + +``` +五星红旗 政治,国家 +``` + +格式如下: + +``` +敏感词 tag1,tag2 +``` + +### 实现 + +具体的效果如下,在引导类设置一下即可。 + +默认的 wordTag 是空的。 + +```java +String filePath = "dict_tag_test.txt"; +IWordTag wordTag = WordTags.file(filePath); + +SensitiveWordBs sensitiveWordBs = SensitiveWordBs.newInstance() + .wordTag(wordTag) + .init(); + +Assert.assertEquals("[政治, 国家]", sensitiveWordBs.tags("五星红旗").toString()); +``` + +后续会考虑引入一个内置的标签文件策略。 + # 动态加载(用户自定义) ## 情景说明 @@ -667,9 +734,11 @@ ps: 不同环境会有差异,但是比例基本稳定。 remove、add、edit? -- [ ] 敏感词标签支持 + 分级支持 +- [x] 敏感词标签接口支持 -比较耗时间。 +- [ ] 敏感词处理时标签支持 + +TODO: 比较耗时间。 - [x] wordData 的内存占用对比 + 优化 diff --git a/pom.xml b/pom.xml index 8f8838b..d324ece 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ com.github.houbb sensitive-word - 0.9.0 + 0.10.0 diff --git a/release.bat b/release.bat index af69ea8..3373766 100644 --- a/release.bat +++ b/release.bat @@ -10,9 +10,9 @@ ECHO "============================= RELEASE START..." 
:: 版本号信息(需要手动指定) :::: 旧版本名称 -SET version=0.9.0 +SET version=0.10.0 :::: 新版本名称 -SET newVersion=0.10.0 +SET newVersion=0.11.0 :::: 组织名称 SET groupName=com.github.houbb :::: 项目名称
diff --git a/src/main/java/com/github/houbb/sensitive/word/api/IWordContext.java b/src/main/java/com/github/houbb/sensitive/word/api/IWordContext.java
index bbd8b75..44315cf 100644
--- a/src/main/java/com/github/houbb/sensitive/word/api/IWordContext.java
+++ b/src/main/java/com/github/houbb/sensitive/word/api/IWordContext.java
@@ -231,4 +231,21 @@ public interface IWordContext {
      */
     IWordContext wordData(IWordData wordMap);
 
+    /**
+     * Returns the word tag provider.
+     *
+     * @return the word tag provider
+     * @since 0.10.0
+     */
+    IWordTag wordTag();
+
+    /**
+     * Sets the word tag provider.
+     *
+     * @param wordTag the word tag provider
+     * @return this
+     * @since 0.10.0
+     */
+    IWordContext wordTag(IWordTag wordTag);
+
 }
diff --git a/src/main/java/com/github/houbb/sensitive/word/api/IWordTag.java b/src/main/java/com/github/houbb/sensitive/word/api/IWordTag.java
new file mode 100644
index 0000000..2072ed2
--- /dev/null
+++ b/src/main/java/com/github/houbb/sensitive/word/api/IWordTag.java
@@ -0,0 +1,21 @@
+package com.github.houbb.sensitive.word.api;
+
+import java.util.Set;
+
+/**
+ * Looks up the tags attached to a sensitive word, so that words can be classified.
+ *
+ * @author dh
+ * @since 0.10.0
+ */
+public interface IWordTag {
+
+    /**
+     * Queries the tags of a word.
+     *
+     * @param word the sensitive word
+     * @return the tags of the word
+     */
+    Set<String> getTag(String word);
+
+}
diff --git a/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java b/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java index 29bd564..cf20c9c 100644 --- a/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java +++ b/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java @@ -2,6 +2,7 @@ package com.github.houbb.sensitive.word.bs; import com.github.houbb.heaven.support.handler.IHandler; import com.github.houbb.heaven.util.common.ArgUtil; +import com.github.houbb.heaven.util.lang.StringUtil; import 
com.github.houbb.heaven.util.util.CollectionUtil; import com.github.houbb.sensitive.word.api.*; import com.github.houbb.sensitive.word.api.combine.IWordAllowDenyCombine; @@ -16,9 +17,12 @@ import com.github.houbb.sensitive.word.support.data.WordDatas; import com.github.houbb.sensitive.word.support.deny.WordDenys; import com.github.houbb.sensitive.word.support.replace.WordReplaces; import com.github.houbb.sensitive.word.support.result.WordResultHandlers; +import com.github.houbb.sensitive.word.support.tag.WordTags; import java.util.Collection; +import java.util.Collections; import java.util.List; +import java.util.Set; /** * 敏感词引导类 @@ -146,6 +150,12 @@ public class SensitiveWordBs { */ private IWordAllowDenyCombine wordAllowDenyCombine = WordAllowDenyCombines.defaults(); + /** + * 单词标签 + * @since 0.10.0 + */ + private IWordTag wordTag = WordTags.none(); + /** * 新建验证实例 *

@@ -214,10 +224,25 @@ public class SensitiveWordBs {
         context.sensitiveCheckNumLen(numCheckLen);
         context.wordReplace(wordReplace);
         context.wordData(wordData);
+        context.wordTag(wordTag);
 
         return context;
     }
 
+    /**
+     * Sets the word tag provider used by {@link #tags(String)}.
+     *
+     * @param wordTag the word tag provider, must not be null
+     * @return this
+     * @since 0.10.0
+     */
+    public SensitiveWordBs wordTag(IWordTag wordTag) {
+        ArgUtil.notNull(wordTag, "wordTag");
+
+        this.wordTag = wordTag;
+        return this;
+    }
+
     public SensitiveWordBs wordCheckCombine(IWordCheckCombine wordCheckCombine) {
         ArgUtil.notNull(wordCheckCombine, "wordCheckCombine");
 
@@ -509,6 +534,27 @@ public class SensitiveWordBs {
         return sensitiveWord.replace(target, context);
     }
 
+    /**
+     * Returns the tags of a sensitive word.
+     *
+     * @param word the sensitive word
+     * @return the tags of the word, or an empty set when the word is empty or has no tags
+     * @since 0.10.0
+     */
+    public Set<String> tags(final String word) {
+        if (StringUtil.isEmpty(word)) {
+            return Collections.emptySet();
+        }
+
+        // TODO: should the word be normalized (formatted) before the lookup?
+        final Set<String> tags = wordTag.getTag(word);
+        if (tags == null) {
+            // A custom IWordTag may return null for unknown words; keep the result null-free.
+            return Collections.emptySet();
+        }
+        return tags;
+    }
+
     //------------------------------------------------------------------------------------ 公开方法 END
 
 }
diff --git a/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordContext.java b/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordContext.java
index 59bc673..a42730c 100644
--- a/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordContext.java
+++ b/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordContext.java
@@ -100,6 +100,13 @@ public class SensitiveWordContext implements IWordContext {
      */
     private IWordData wordData;
 
+    /**
+     * Word tag provider.
+     *
+     * @since 0.10.0
+     */
+    private IWordTag wordTag;
+
     public IWordData wordData() {
         return wordData;
     }
@@ -273,4 +280,16 @@ public class SensitiveWordContext implements IWordContext {
         this.wordFormat = wordFormat;
         return this;
     }
+
+    @Override
+    public IWordTag wordTag() {
+        return wordTag;
+    }
+
+    @Override
+    public SensitiveWordContext wordTag(IWordTag wordTag) {
+        this.wordTag = wordTag;
+        return this;
+    }
+
 }
diff --git a/src/main/java/com/github/houbb/sensitive/word/support/tag/AbstractWordTag.java b/src/main/java/com/github/houbb/sensitive/word/support/tag/AbstractWordTag.java new file mode 100644 index 
0000000..abc3f51
--- /dev/null
+++ b/src/main/java/com/github/houbb/sensitive/word/support/tag/AbstractWordTag.java
@@ -0,0 +1,39 @@
+package com.github.houbb.sensitive.word.support.tag;
+
+import com.github.houbb.heaven.util.lang.StringUtil;
+import com.github.houbb.sensitive.word.api.IWordTag;
+
+import java.util.Collections;
+import java.util.Set;
+
+/**
+ * Base class for word tag lookups: it short-circuits empty words and turns
+ * {@code null} results into an empty set, so {@link #getTag(String)} never returns {@code null}.
+ *
+ * @since 0.10.0
+ */
+public abstract class AbstractWordTag implements IWordTag {
+
+    /**
+     * Looks up the tags of a non-empty word.
+     *
+     * @param word a non-empty word
+     * @return the tags of the word; {@code null} or empty when the word has no tags
+     */
+    protected abstract Set<String> doGetTag(String word);
+
+    @Override
+    public Set<String> getTag(String word) {
+        if (StringUtil.isEmpty(word)) {
+            return Collections.emptySet();
+        }
+
+        final Set<String> tags = doGetTag(word);
+        if (tags == null) {
+            // Unknown words have no tags; never leak null to callers.
+            return Collections.emptySet();
+        }
+        return tags;
+    }
+
+}
diff --git a/src/main/java/com/github/houbb/sensitive/word/support/tag/FileWordTag.java b/src/main/java/com/github/houbb/sensitive/word/support/tag/FileWordTag.java
new file mode 100644
index 0000000..50fae6e
--- /dev/null
+++ b/src/main/java/com/github/houbb/sensitive/word/support/tag/FileWordTag.java
@@ -0,0 +1,127 @@
+package com.github.houbb.sensitive.word.support.tag;
+
+import com.github.houbb.heaven.util.common.ArgUtil;
+import com.github.houbb.heaven.util.io.FileUtil;
+import com.github.houbb.heaven.util.lang.StringUtil;
+import com.github.houbb.heaven.util.util.CollectionUtil;
+
+import java.util.*;
+import java.util.regex.Pattern;
+
+/**
+ * Word tags loaded from a plain text file.
+ *
+ * <p>Each line has the form {@code word tag1,tag2}: the word, the word separator and
+ * the tags joined by the tag separator. Both separators are matched literally.</p>
+ *
+ * @since 0.10.0
+ */
+public class FileWordTag extends AbstractWordTag {
+
+    /**
+     * Path of the tag file.
+     */
+    protected final String filePath;
+    /**
+     * Separator between a word and its tags.
+     */
+    protected final String wordSplit;
+    /**
+     * Separator between two tags.
+     */
+    protected final String tagSplit;
+
+    /**
+     * Tags by word, filled while the instance is constructed.
+     */
+    protected Map<String, Set<String>> wordTagMap = new HashMap<>();
+
+    /**
+     * Creates a tag lookup that uses a space after the word and a comma between tags.
+     *
+     * @param filePath path of the tag file
+     */
+    public FileWordTag(String filePath) {
+        this(filePath, " ", ",");
+    }
+
+    /**
+     * Creates a tag lookup with custom separators and loads the file immediately.
+     *
+     * @param filePath  path of the tag file
+     * @param wordSplit separator between a word and its tags
+     * @param tagSplit  separator between two tags
+     */
+    public FileWordTag(String filePath, String wordSplit, String tagSplit) {
+        ArgUtil.notEmpty(filePath, "filePath");
+        ArgUtil.notEmpty(wordSplit, "wordSplit");
+        ArgUtil.notEmpty(tagSplit, "tagSplit");
+
+        this.wordSplit = wordSplit;
+        this.tagSplit = tagSplit;
+        this.filePath = filePath;
+
+        this.initWordTagMap();
+    }
+
+    /**
+     * Reads the tag file and registers every non-empty line.
+     */
+    protected synchronized void initWordTagMap() {
+        List<String> lines = FileUtil.readAllLines(filePath);
+        if (CollectionUtil.isEmpty(lines)) {
+            return;
+        }
+
+        for (String line : lines) {
+            if (StringUtil.isEmpty(line)) {
+                continue;
+            }
+
+            handleInitLine(line);
+        }
+    }
+
+    /**
+     * Parses one {@code word<wordSplit>tag1<tagSplit>tag2} line and stores its tags.
+     * Lines without a word, a separator or any non-blank tag are skipped.
+     *
+     * @param line a non-empty line of the tag file
+     */
+    protected synchronized void handleInitLine(String line) {
+        // Split on the first separator only: the separator is plain text (not a
+        // regular expression) and everything after it belongs to the tag list.
+        final int splitIndex = line.indexOf(wordSplit);
+        if (splitIndex <= 0) {
+            return;
+        }
+
+        final String word = line.substring(0, splitIndex).trim();
+        final String tagText = line.substring(splitIndex + wordSplit.length());
+
+        // Keep the tags in file order and drop blank entries.
+        final Set<String> tagSet = new LinkedHashSet<>();
+        for (String tag : tagText.split(Pattern.quote(tagSplit))) {
+            final String trimmed = tag.trim();
+            if (!trimmed.isEmpty()) {
+                tagSet.add(trimmed);
+            }
+        }
+        if (word.isEmpty() || tagSet.isEmpty()) {
+            return;
+        }
+
+        wordTagMap.put(word, Collections.unmodifiableSet(tagSet));
+    }
+
+    @Override
+    protected Set<String> doGetTag(String word) {
+        final Set<String> tags = wordTagMap.get(word);
+        if (tags == null) {
+            // Words that are not listed in the file have no tags.
+            return Collections.emptySet();
+        }
+        return tags;
+    }
+
+}
diff --git a/src/main/java/com/github/houbb/sensitive/word/support/tag/NoneWordTag.java b/src/main/java/com/github/houbb/sensitive/word/support/tag/NoneWordTag.java
new file mode 100644
index 0000000..af6083a
--- /dev/null
+++ b/src/main/java/com/github/houbb/sensitive/word/support/tag/NoneWordTag.java
@@ -0,0 +1,18 @@
+package com.github.houbb.sensitive.word.support.tag;
+
+import java.util.Collections;
+import java.util.Set;
+
+/**
+ * Word tag lookup that never reports any tag.
+ *
+ * @since 0.10.0
+ */
+public class NoneWordTag extends AbstractWordTag {
+
+    @Override
+    protected Set<String> doGetTag(String word) {
+        return Collections.emptySet();
+    }
+
+}
diff --git a/src/main/java/com/github/houbb/sensitive/word/support/tag/WordTags.java b/src/main/java/com/github/houbb/sensitive/word/support/tag/WordTags.java
new file mode 100644
index 0000000..0279196
--- /dev/null
+++ b/src/main/java/com/github/houbb/sensitive/word/support/tag/WordTags.java
@@ -0,0 +1,34 @@
+package com.github.houbb.sensitive.word.support.tag;
+
+import com.github.houbb.sensitive.word.api.IWordTag;
+
+/**
+ * Factory methods for the built-in {@link IWordTag} implementations.
+ *
+ * @since 0.10.0
+ */
+public final class WordTags {
+
+    private WordTags() {
+    }
+
+    /**
+     * Returns a lookup that has no tags for any word.
+     *
+     * @return the empty lookup
+     */
+    public static IWordTag none() {
+        return new NoneWordTag();
+    }
+
+    /**
+     * Returns a lookup backed by a {@code word tag1,tag2} text file.
+     *
+     * @param filePath path of the tag file
+     * @return the file based lookup
+     */
+    public static IWordTag file(String filePath) {
+        return new FileWordTag(filePath);
+    }
+
+}
diff --git a/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsTagTest.java b/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsTagTest.java
new file mode 100644
index 0000000..dfef147
--- /dev/null
+++ b/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsTagTest.java
@@ -0,0 +1,36 @@
+package com.github.houbb.sensitive.word.bs;
+
+import com.github.houbb.sensitive.word.api.IWordTag;
+import com.github.houbb.sensitive.word.support.tag.WordTags;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Tests for the word tag support of {@link SensitiveWordBs}.
+ *
+ * <p> project: sensitive-word-SensitiveWordBsTest </p>
+ * <p> create on 2020/1/7 23:43 </p>
+ *
+ * @author Administrator
+ * @since 0.10.0
+ */
+public class SensitiveWordBsTagTest {
+
+    /**
+     * The tags of a word listed in the tag file are returned in file order.
+     */
+    @Test
+    public void tagsTest() {
+        // Relative to the project root, so the test does not depend on a local checkout path.
+        String filePath = "src/test/resources/dict_tag_test.txt";
+
+        IWordTag wordTag = WordTags.file(filePath);
+
+        SensitiveWordBs sensitiveWordBs = SensitiveWordBs.newInstance()
+                .wordTag(wordTag)
+                .init();
+
+        Assert.assertEquals("[政治, 国家]", sensitiveWordBs.tags("五星红旗").toString());
+    }
+
+}
diff --git a/src/test/resources/dict_tag_test.txt b/src/test/resources/dict_tag_test.txt
new file mode 100644
index 0000000..51d66ca
--- /dev/null
+++ b/src/test/resources/dict_tag_test.txt
@@ -0,0 +1 @@
+五星红旗 政治,国家
\ No newline at end of file