diff --git a/CHANGE_LOG.md b/CHANGE_LOG.md index ba3fa56..f045cfc 100644 --- a/CHANGE_LOG.md +++ b/CHANGE_LOG.md @@ -126,3 +126,10 @@ | 序号 | 变更类型 | 说明 | 时间 | 备注 | |:---|:---|:---|:---|:--| | 1 | F | 自定义敏感词 allow/deny 进行格式化处理 | 2021-12-11 23:51:58 | | + +# release_0.2.0 + +| 序号 | 变更类型 | 说明 | 时间 | 备注 | +|:---|:---|:---|:---|:--| +| 1 | A | 允许用户自定义替换策略 | 2022-01-15 23:51:58 | | +| 2 | U | 升级二方数据库依赖 | 2022-01-15 23:51:58 | | diff --git a/README.md b/README.md index c792156..1d8746e 100644 --- a/README.md +++ b/README.md @@ -44,9 +44,9 @@ [CHANGE_LOG.md](https://github.com/houbb/sensitive-word/blob/master/doc/CHANGE_LOG.md) -v0.1.1 变更: +v0.2.0 变更: -- 敏感词自定义 Allow/Deny 进行格式化处理 +- 支持用户自定义替换策略 # 快速开始 @@ -62,7 +62,7 @@ v0.1.1 变更: com.github.houbb sensitive-word - 0.1.1 + 0.2.0 ``` @@ -73,6 +73,7 @@ v0.1.1 变更: | 方法 | 参数 | 返回值| 说明 | |:---|:---|:---|:---| | contains(String) | 待验证的字符串 | 布尔值 | 验证字符串是否包含敏感词 | +| replace(String, ISensitiveWordReplace) | 使用指定的替换策略替换敏感词 | 字符串 | 返回脱敏后的字符串 | | replace(String, char) | 使用指定的 char 替换敏感词 | 字符串 | 返回脱敏后的字符串 | | replace(String) | 使用 `*` 替换敏感词 | 字符串 | 返回脱敏后的字符串 | | findAll(String) | 待验证的字符串 | 字符串列表 | 返回字符串中所有敏感词 | @@ -170,6 +171,58 @@ String result = SensitiveWordHelper.replace(text, '0'); Assert.assertEquals("0000迎风飘扬,000的画像屹立在000前。", result); ``` +### 自定义替换策略 + +V0.2.0 支持该特性。 + +场景说明:有时候我们希望不同的敏感词有不同的替换结果。比如【游戏】替换为【电子竞技】,【失业】替换为【灵活就业】。 + +诚然,提前使用字符串的正则替换也可以,不过性能一般。 + +使用例子: + +```java +/** + * 自定替换策略 + * @since 0.2.0 + */ +@Test +public void defineReplaceTest() { + final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。"; + + ISensitiveWordReplace replace = new MySensitiveWordReplace(); + String result = SensitiveWordHelper.replace(text, replace); + + Assert.assertEquals("国家旗帜迎风飘扬,教员的画像屹立在***前。", result); +} +``` + +其中 `MySensitiveWordReplace` 是我们自定义的替换策略,实现如下: + +```java +public class MySensitiveWordReplace implements ISensitiveWordReplace { + + @Override + public String replace(ISensitiveWordReplaceContext context) { + String sensitiveWord = context.sensitiveWord(); + // 自定义不同的敏感词替换策略,可以从数据库等地方读取 + if("五星红旗".equals(sensitiveWord)) { + return "国家旗帜"; + } + if("毛主席".equals(sensitiveWord)) { + return "教员"; + } + + // 其他默认使用 * 代替 + int wordLength = context.wordLength(); + return CharUtil.repeat('*', wordLength); + } + +} +``` + +我们针对其中的部分词做固定映射处理,其他的默认转换为 `*`。 + # 更多特性 后续的诸多特性,主要是针对各种针对各种情况的处理,尽可能的提升敏感词命中率。 @@ -530,8 +583,6 @@ public class SensitiveWordService { # 后期 road-map -- 停顿词 - - 同音字处理 - 形近字处理 @@ -542,7 +593,7 @@ public class SensitiveWordService { - 敏感词标签支持 -- 邮箱后缀检测 +- [ ] DFA 数据结构的另一种实现 # 拓展阅读 @@ -552,4 +603,16 @@ public class SensitiveWordService { [敏感词库优化流程](https://houbb.github.io/2020/01/07/sensitive-word-slim) -[停止词的思考记录](https://houbb.github.io/2020/01/07/sensitive-word-stopword) +[java 如何实现开箱即用的敏感词控台服务?](https://mp.weixin.qq.com/s/rQo75cfMU_OEbTJa0JGMGg) + +![WECHAT](WECHAT.png) + +# 相关开源库 + +[heaven 基础工具包](https://github.com/houbb/heaven) + +[opencc4j 繁简体转换](https://github.com/houbb/opencc4j) + +[pinyin 拼音工具](https://github.com/houbb/pinyin) + +[nlp-hanzi-similar 汉字相似度工具](https://github.com/houbb/nlp-hanzi-similar) diff --git a/WECHAT.png b/WECHAT.png new file mode 100644 index 0000000..ff0aedb Binary files /dev/null and b/WECHAT.png differ diff --git a/pom.xml b/pom.xml index 2faadb4..66eb642 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ com.github.houbb sensitive-word - 0.1.1 + 0.2.0 @@ -25,8 +25,8 @@ 1.7 - 0.1.148 - 1.7.1 + 0.1.154 + 1.7.2 4.13.1 @@ -104,7 +104,7 @@ ${project.compiler.level} ${project.compiler.level} ${project.build.sourceEncoding} - -proc:none + @@ -255,4 +255,4 @@ - \ No newline at end of file + diff --git a/release.bat b/release.bat index fb5391f..463f219 100644 --- a/release.bat +++ b/release.bat @@ -10,9 +10,9 @@ ECHO "============================= RELEASE START..." :: 版本号信息(需要手动指定) :::: 旧版本名称 -SET version=0.1.1 +SET version=0.2.0 :::: 新版本名称 -SET newVersion=0.1.2 +SET newVersion=0.2.1 :::: 组织名称 SET groupName=com.github.houbb :::: 项目名称 diff --git a/src/main/java/com/github/houbb/sensitive/word/api/ISensitiveWordReplace.java b/src/main/java/com/github/houbb/sensitive/word/api/ISensitiveWordReplace.java new file mode 100644 index 0000000..429e435 --- /dev/null +++ b/src/main/java/com/github/houbb/sensitive/word/api/ISensitiveWordReplace.java @@ -0,0 +1,19 @@ +package com.github.houbb.sensitive.word.api; + +/** + * 敏感词替换策略 + * + * @author binbin.hou + * @since 0.2.0 + */ +public interface ISensitiveWordReplace { + + /** + * 替换 + * @param context 上下文 + * @return 结果 + * @since 0.2.0 + */ + String replace(ISensitiveWordReplaceContext context); + +} diff --git a/src/main/java/com/github/houbb/sensitive/word/api/ISensitiveWordReplaceContext.java b/src/main/java/com/github/houbb/sensitive/word/api/ISensitiveWordReplaceContext.java new file mode 100644 index 0000000..fce6da0 --- /dev/null +++ b/src/main/java/com/github/houbb/sensitive/word/api/ISensitiveWordReplaceContext.java @@ -0,0 +1,25 @@ +package com.github.houbb.sensitive.word.api; + +/** + * 敏感词替换策略上下文 + * + * @author binbin.hou + * @since 0.2.0 + */ +public interface ISensitiveWordReplaceContext { + + /** + * 敏感词 + * @return 敏感词 + * @since 0.2.0 + */ + String sensitiveWord(); + + /** + * 单词长度 + * @return 单词长度 + * @since 0.2.0 + */ + int wordLength(); + +} diff --git a/src/main/java/com/github/houbb/sensitive/word/api/IWordMap.java b/src/main/java/com/github/houbb/sensitive/word/api/IWordMap.java index 9bf2140..238c15d 100644 --- a/src/main/java/com/github/houbb/sensitive/word/api/IWordMap.java +++ b/src/main/java/com/github/houbb/sensitive/word/api/IWordMap.java @@ -59,12 +59,13 @@ public interface IWordMap extends ISensitiveCheck { * ps: 这里可以添加优化。 * * @param target 目标字符串 - * @param replaceChar 替换为的 char + * @param replace 替换策略 * @param context 上下文 * @return 替换后结果 * @since 0.0.2 */ - String replace(final String target, final char replaceChar, + String replace(final String target, + final ISensitiveWordReplace replace, final IWordContext context); } diff --git a/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java b/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java index 4d7c627..33cb4be 100644 --- a/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java +++ b/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java @@ -8,6 +8,7 @@ import com.github.houbb.sensitive.word.api.*; import com.github.houbb.sensitive.word.support.allow.WordAllows; import com.github.houbb.sensitive.word.support.deny.WordDenys; import com.github.houbb.sensitive.word.support.map.SensitiveWordMap; +import com.github.houbb.sensitive.word.support.replace.SensitiveWordReplaceChar; import com.github.houbb.sensitive.word.support.result.WordResultHandlers; import com.github.houbb.sensitive.word.utils.InnerFormatUtils; @@ -393,9 +394,23 @@ public class SensitiveWordBs { * @since 0.0.2 */ public String replace(final String target, final char replaceChar) { + ISensitiveWordReplace replace = new SensitiveWordReplaceChar(replaceChar); + + return replace(target, replace); + } + + /** + * 替换所有内容 + * + * @param target 目标字符串 + * @param replace 替换策略 + * @return 替换后结果 + * @since 0.2.0 + */ + public String replace(final String target, final ISensitiveWordReplace replace) { statusCheck(); - return sensitiveWordMap.replace(target, replaceChar, context); + return sensitiveWordMap.replace(target, replace, context); } /** diff --git a/src/main/java/com/github/houbb/sensitive/word/core/SensitiveWordHelper.java b/src/main/java/com/github/houbb/sensitive/word/core/SensitiveWordHelper.java index c435efe..2fe076b 100644 --- a/src/main/java/com/github/houbb/sensitive/word/core/SensitiveWordHelper.java +++ b/src/main/java/com/github/houbb/sensitive/word/core/SensitiveWordHelper.java @@ -1,5 +1,6 @@ package com.github.houbb.sensitive.word.core; +import com.github.houbb.sensitive.word.api.ISensitiveWordReplace; import com.github.houbb.sensitive.word.api.IWordResultHandler; import com.github.houbb.sensitive.word.bs.SensitiveWordBs; @@ -59,6 +60,18 @@ public final class SensitiveWordHelper { return WORD_BS.findFirst(target); } + /** + * 替换所有内容 + * + * @param target 目标字符串 + * @param replace 替换策略 + * @return 替换后结果 + * @since 0.2.0 + */ + public static String replace(final String target, final ISensitiveWordReplace replace) { + return WORD_BS.replace(target, replace); + } + /** * 替换所有内容 * diff --git a/src/main/java/com/github/houbb/sensitive/word/support/map/SensitiveWordMap.java b/src/main/java/com/github/houbb/sensitive/word/support/map/SensitiveWordMap.java index 683ee10..18e34da 100644 --- a/src/main/java/com/github/houbb/sensitive/word/support/map/SensitiveWordMap.java +++ b/src/main/java/com/github/houbb/sensitive/word/support/map/SensitiveWordMap.java @@ -9,14 +9,13 @@ import com.github.houbb.heaven.util.lang.ObjectUtil; import com.github.houbb.heaven.util.lang.StringUtil; import com.github.houbb.heaven.util.util.CollectionUtil; import com.github.houbb.heaven.util.util.MapUtil; -import com.github.houbb.sensitive.word.api.IWordContext; -import com.github.houbb.sensitive.word.api.IWordMap; -import com.github.houbb.sensitive.word.api.IWordResult; +import com.github.houbb.sensitive.word.api.*; import com.github.houbb.sensitive.word.constant.AppConst; import com.github.houbb.sensitive.word.constant.enums.ValidModeEnum; import com.github.houbb.sensitive.word.support.check.SensitiveCheckResult; import com.github.houbb.sensitive.word.support.check.impl.SensitiveCheckChain; import com.github.houbb.sensitive.word.support.check.impl.SensitiveCheckUrl; +import com.github.houbb.sensitive.word.support.replace.SensitiveWordReplaceContext; import com.github.houbb.sensitive.word.support.result.WordResult; import java.util.Collection; @@ -156,12 +155,12 @@ public class SensitiveWordMap implements IWordMap { } @Override - public String replace(String target, char replaceChar, final IWordContext context) { + public String replace(String target, final ISensitiveWordReplace replace, final IWordContext context) { if(StringUtil.isEmpty(target)) { return target; } - return this.replaceSensitiveWord(target, replaceChar, context); + return this.replaceSensitiveWord(target, replace, context); } /** @@ -213,11 +212,13 @@ public class SensitiveWordMap implements IWordMap { /** * 直接替换敏感词,返回替换后的结果 * @param target 文本信息 + * @param replace 替换策略 + * @param context 上下文 * @return 脱敏后的字符串 * @since 0.0.2 */ private String replaceSensitiveWord(final String target, - final char replaceChar, + final ISensitiveWordReplace replace, final IWordContext context) { if(StringUtil.isEmpty(target)) { return target; @@ -241,7 +242,12 @@ public class SensitiveWordMap implements IWordMap { // 直接使用原始内容,避免 markdown 图片转换失败 resultBuilder.append(string); } else { - String replaceStr = CharUtil.repeat(replaceChar, wordLength); + // 创建上下文 + ISensitiveWordReplaceContext replaceContext = SensitiveWordReplaceContext.newInstance() + .sensitiveWord(string) + .wordLength(wordLength); + String replaceStr = replace.replace(replaceContext); + resultBuilder.append(replaceStr); } diff --git a/src/main/java/com/github/houbb/sensitive/word/support/replace/SensitiveWordReplaceChar.java b/src/main/java/com/github/houbb/sensitive/word/support/replace/SensitiveWordReplaceChar.java new file mode 100644 index 0000000..4893eee --- /dev/null +++ b/src/main/java/com/github/houbb/sensitive/word/support/replace/SensitiveWordReplaceChar.java @@ -0,0 +1,29 @@ +package com.github.houbb.sensitive.word.support.replace; + +import com.github.houbb.heaven.annotation.ThreadSafe; +import com.github.houbb.heaven.util.lang.CharUtil; +import com.github.houbb.sensitive.word.api.ISensitiveWordReplace; +import com.github.houbb.sensitive.word.api.ISensitiveWordReplaceContext; + +/** + * 指定字符的替换策略 + * @author binbin.hou + * @since 0.2.0 + */ +@ThreadSafe +public class SensitiveWordReplaceChar implements ISensitiveWordReplace { + + private final char replaceChar; + + public SensitiveWordReplaceChar(char replaceChar) { + this.replaceChar = replaceChar; + } + + @Override + public String replace(ISensitiveWordReplaceContext context) { + int wordLength = context.wordLength(); + + return CharUtil.repeat(replaceChar, wordLength); + } + +} diff --git a/src/main/java/com/github/houbb/sensitive/word/support/replace/SensitiveWordReplaceContext.java b/src/main/java/com/github/houbb/sensitive/word/support/replace/SensitiveWordReplaceContext.java new file mode 100644 index 0000000..31c67b1 --- /dev/null +++ b/src/main/java/com/github/houbb/sensitive/word/support/replace/SensitiveWordReplaceContext.java @@ -0,0 +1,57 @@ +package com.github.houbb.sensitive.word.support.replace; + +import com.github.houbb.sensitive.word.api.ISensitiveWordReplaceContext; + +/** + * 敏感词替换上下文 + * + * @author binbin.hou + * @since 0.2.0 + */ +public class SensitiveWordReplaceContext implements ISensitiveWordReplaceContext { + + public static SensitiveWordReplaceContext newInstance() { + return new SensitiveWordReplaceContext(); + } + + /** + * 敏感词 + * @since 0.2.0 + */ + private String sensitiveWord; + + /** + * 单词长度 + * @since 0.2.0 + */ + private int wordLength; + + @Override + public String sensitiveWord() { + return sensitiveWord; + } + + public SensitiveWordReplaceContext sensitiveWord(String sensitiveWord) { + this.sensitiveWord = sensitiveWord; + return this; + } + + @Override + public int wordLength() { + return wordLength; + } + + public SensitiveWordReplaceContext wordLength(int wordLength) { + this.wordLength = wordLength; + return this; + } + + @Override + public String toString() { + return "SensitiveWordReplaceContext{" + + "sensitiveWord='" + sensitiveWord + '\'' + + ", wordLength=" + wordLength + + '}'; + } + +} diff --git a/src/test/java/com/github/houbb/sensitive/word/core/SensitiveWordHelperTest.java b/src/test/java/com/github/houbb/sensitive/word/core/SensitiveWordHelperTest.java index 978b009..9d82823 100644 --- a/src/test/java/com/github/houbb/sensitive/word/core/SensitiveWordHelperTest.java +++ b/src/test/java/com/github/houbb/sensitive/word/core/SensitiveWordHelperTest.java @@ -1,6 +1,8 @@ package com.github.houbb.sensitive.word.core; +import com.github.houbb.sensitive.word.api.ISensitiveWordReplace; import com.github.houbb.sensitive.word.api.IWordResult; +import com.github.houbb.sensitive.word.replace.MySensitiveWordReplace; import com.github.houbb.sensitive.word.support.result.WordResultHandlers; import org.junit.Assert; import org.junit.Test; @@ -148,4 +150,18 @@ public class SensitiveWordHelperTest { Assert.assertEquals("fuck", word); } + /** + * 自定替换策略 + * @since 0.2.0 + */ + @Test + public void defineReplaceTest() { + final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。"; + + ISensitiveWordReplace replace = new MySensitiveWordReplace(); + String result = SensitiveWordHelper.replace(text, replace); + + Assert.assertEquals("国家旗帜迎风飘扬,教员的画像屹立在***前。", result); + } + } diff --git a/src/test/java/com/github/houbb/sensitive/word/replace/MySensitiveWordReplace.java b/src/test/java/com/github/houbb/sensitive/word/replace/MySensitiveWordReplace.java new file mode 100644 index 0000000..32d6a8e --- /dev/null +++ b/src/test/java/com/github/houbb/sensitive/word/replace/MySensitiveWordReplace.java @@ -0,0 +1,31 @@ +package com.github.houbb.sensitive.word.replace; + +import com.github.houbb.heaven.util.lang.CharUtil; +import com.github.houbb.sensitive.word.api.ISensitiveWordReplace; +import com.github.houbb.sensitive.word.api.ISensitiveWordReplaceContext; + +/** + * 自定义敏感词替换策略 + * + * @author binbin.hou + * @since 0.2.0 + */ +public class MySensitiveWordReplace implements ISensitiveWordReplace { + + @Override + public String replace(ISensitiveWordReplaceContext context) { + String sensitiveWord = context.sensitiveWord(); + // 自定义不同的敏感词替换策略,可以从数据库等地方读取 + if("五星红旗".equals(sensitiveWord)) { + return "国家旗帜"; + } + if("毛主席".equals(sensitiveWord)) { + return "教员"; + } + + // 其他默认使用 * 代替 + int wordLength = context.wordLength(); + return CharUtil.repeat('*', wordLength); + } + +}