diff --git a/README.md b/README.md index d65c2fe..9b819fa 100644 --- a/README.md +++ b/README.md @@ -36,6 +36,8 @@ - 支持英文常见形式的互换 +- 支持用户自定义敏感词和白名单 + ## 变更日志 [CHANGE_LOG.md](https://github.com/houbb/sensitive-word/blob/master/doc/CHANGE_LOG.md) @@ -54,7 +56,7 @@ com.github.houbb sensitive-word - 0.0.7 + 0.0.8 ``` @@ -178,14 +180,36 @@ List wordList = SensitiveWordBs.newInstance().findAll(text); Assert.assertEquals("[ⒻⒻⒻfⓤuⓤ⒰cⓒ⒦]", wordList.toString()); ``` +# 用户自定义 + +## 敏感词和白名单 + +直接在 resource 目录下新建文件,每一行对应一个敏感词。 + +`sensitive_word_deny.txt` 代表用户自定义敏感词文件。 + +`sensitive_word_allow.txt` 代表用户自定义白名单文件。 + +## 测试 + +我们在敏感词文件中加入一行,内容为 `自定义敏感词`,同时在白名单文件中加入一行, +内容为 `gender` 作为用户不认为是敏感词的信息。 + +- 测试代码 + +```java +final String text = "gender 我们认为应该通过,自定义敏感词我们认为应该拒绝。"; + +List wordList = SensitiveWordBs.newInstance().findAll(text); +Assert.assertEquals("[自定义敏感词]", wordList.toString()); +``` + # 后期 road-map - 停顿词 - 拼音互换 -- 用户自定义敏感词和白名单 - - 文字镜像翻转 - 敏感词标签支持 diff --git a/doc/CHANGE_LOG.md b/doc/CHANGE_LOG.md index 34e900c..667c253 100644 --- a/doc/CHANGE_LOG.md +++ b/doc/CHANGE_LOG.md @@ -61,4 +61,10 @@ | 序号 | 变更类型 | 说明 | 时间 | 备注 | |:---|:---|:---|:---|:--| -| 1 | A | 添加忽略重复词支持 | 2020-1-10 09:34:35 | | \ No newline at end of file +| 1 | A | 添加忽略重复词支持 | 2020-1-10 09:34:35 | | + +# release_0.0.8 + +| 序号 | 变更类型 | 说明 | 时间 | 备注 | +|:---|:---|:---|:---|:--| +| 1 | A | 添加用户自定义敏感词和白名单 | 2020-1-10 09:34:35 | | \ No newline at end of file diff --git a/pom.xml b/pom.xml index 708f0c3..9497c34 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ com.github.houbb sensitive-word - 0.0.8-SNAPSHOT + 0.0.8 diff --git a/release.bat b/release.bat index 2f3a21d..210e5d7 100644 --- a/release.bat +++ b/release.bat @@ -10,9 +10,9 @@ ECHO "============================= RELEASE START..." 
:: 版本号信息(需要手动指定) :::: 旧版本名称 -SET version=0.0.7 +SET version=0.0.8 :::: 新版本名称 -SET newVersion=0.0.8 +SET newVersion=0.0.9 :::: 组织名称 SET groupName=com.github.houbb :::: 项目名称 diff --git a/src/main/java/com/github/houbb/sensitive/word/constant/AppConst.java b/src/main/java/com/github/houbb/sensitive/word/constant/AppConst.java index 14090b9..1b239bb 100644 --- a/src/main/java/com/github/houbb/sensitive/word/constant/AppConst.java +++ b/src/main/java/com/github/houbb/sensitive/word/constant/AppConst.java @@ -31,4 +31,16 @@ public final class AppConst { */ public static final int DICT_EN_SIZE = 12; + /** + * 拒绝的词语 + * @since 0.0.8 + */ + public static final String SENSITIVE_WORD_DENY_PATH = "/sensitive_word_deny.txt"; + + /** + * 用户允许的词语 + * @since 0.0.8 + */ + public static final String SENSITIVE_WORD_ALLOW_PATH = "/sensitive_word_allow.txt"; + } diff --git a/src/main/java/com/github/houbb/sensitive/word/model/CheckSensitiveWordResult.java b/src/main/java/com/github/houbb/sensitive/word/model/CheckSensitiveWordResult.java deleted file mode 100644 index 81cc793..0000000 --- a/src/main/java/com/github/houbb/sensitive/word/model/CheckSensitiveWordResult.java +++ /dev/null @@ -1,69 +0,0 @@ -package com.github.houbb.sensitive.word.model; - -/** - * 检测敏感词结果 - * - * TODO: 这里需要结合 KMP 和 暴力匹配算法。 - * - * 暂时不使用,后期会使用到。 - * @author binbin.hou - * @since 0.0.2 - */ -@Deprecated -public class CheckSensitiveWordResult { - - /** - * 是否匹配到了敏感词 - * @since 0.0.2 - */ - private boolean hasMatch; - - /** - * 敏感词长度 - * @since 0.0.2 - */ - private int sensitiveWordSize; - - /** - * 普通单词的长度 - * @since 0.0.2 - */ - private int commonWordSize; - - public boolean hasMatch() { - return hasMatch; - } - - public CheckSensitiveWordResult hasMatch(boolean hasMatch) { - this.hasMatch = hasMatch; - return this; - } - - public int sentiveWordSize() { - return sensitiveWordSize; - } - - public CheckSensitiveWordResult sentiveWordSize(int sensitiveWordSize) { - this.sensitiveWordSize = sensitiveWordSize; - 
return this; - } - - public int commonWordSize() { - return commonWordSize; - } - - public CheckSensitiveWordResult commonWordSize(int commonWordSize) { - this.commonWordSize = commonWordSize; - return this; - } - - @Override - public String toString() { - return "CheckSensitiveWordResult{" + - "hasMatch=" + hasMatch + - ", sensitiveWordSize=" + sensitiveWordSize + - ", commonWordSize=" + commonWordSize + - '}'; - } - -} diff --git a/src/main/java/com/github/houbb/sensitive/word/support/data/SensitiveWordData.java b/src/main/java/com/github/houbb/sensitive/word/support/data/SensitiveWordData.java index 9be0a55..0216109 100644 --- a/src/main/java/com/github/houbb/sensitive/word/support/data/SensitiveWordData.java +++ b/src/main/java/com/github/houbb/sensitive/word/support/data/SensitiveWordData.java @@ -3,6 +3,7 @@ package com.github.houbb.sensitive.word.support.data; import com.github.houbb.heaven.annotation.ThreadSafe; import com.github.houbb.heaven.util.guava.Guavas; import com.github.houbb.heaven.util.io.StreamUtil; +import com.github.houbb.heaven.util.util.CollectionUtil; import com.github.houbb.sensitive.word.api.IWordData; import com.github.houbb.sensitive.word.constant.AppConst; @@ -30,6 +31,15 @@ public class SensitiveWordData implements IWordData { defaultLines = Guavas.newArrayList(AppConst.DICT_SIZE+AppConst.DICT_EN_SIZE); defaultLines = StreamUtil.readAllLines("/dict.txt"); defaultLines.addAll(StreamUtil.readAllLines("/dict_en.txt")); + + // 用户自定义 + List denyList = StreamUtil.readAllLines("/sensitive_word_deny.txt"); + defaultLines.addAll(denyList); + + // 移除白名单词语 + List allowList = StreamUtil.readAllLines("/sensitive_word_allow.txt"); + defaultLines = CollectionUtil.difference(defaultLines, allowList); + long end = System.currentTimeMillis(); System.out.println("Sensitive data loaded!, cost time: " + (end - start) + "ms"); } diff --git a/src/main/resources/sensitive_word_allow.txt b/src/main/resources/sensitive_word_allow.txt new file mode 100644 index 
0000000..e69de29 diff --git a/src/main/resources/sensitive_word_deny.txt b/src/main/resources/sensitive_word_deny.txt new file mode 100644 index 0000000..e69de29 diff --git a/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsUserDefineTest.java b/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsUserDefineTest.java new file mode 100644 index 0000000..4978c34 --- /dev/null +++ b/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsUserDefineTest.java @@ -0,0 +1,29 @@ +package com.github.houbb.sensitive.word.bs; + +import org.junit.Assert; +import org.junit.Test; + +import java.util.List; + +/** + *

project: sensitive-word-SensitiveWordBsUserDefineTest

+ *

create on 2020/1/7 23:43

+ * + * @author Administrator + * @since 0.0.8 + */ +public class SensitiveWordBsUserDefineTest { + + /** + * 自定义允许和拒绝的文件 + * @since 0.0.8 + */ + @Test + public void allowAndDenyTest() { + final String text = "gender 我们认为应该通过,自定义敏感词我们认为应该拒绝。"; + + List wordList = SensitiveWordBs.newInstance().findAll(text); + Assert.assertEquals("[自定义敏感词]", wordList.toString()); + } + +} diff --git a/src/test/resources/sensitive_word_allow.txt b/src/test/resources/sensitive_word_allow.txt new file mode 100644 index 0000000..92e3f55 --- /dev/null +++ b/src/test/resources/sensitive_word_allow.txt @@ -0,0 +1 @@ +gender \ No newline at end of file diff --git a/src/test/resources/sensitive_word_deny.txt b/src/test/resources/sensitive_word_deny.txt new file mode 100644 index 0000000..07c3ecc --- /dev/null +++ b/src/test/resources/sensitive_word_deny.txt @@ -0,0 +1 @@ +自定义敏感词 \ No newline at end of file