From fa9348d55d983c1dab0e8dca3c2c95b3cfbe0e10 Mon Sep 17 00:00:00 2001 From: "binbin.hou" <1060732496@qq.com> Date: Thu, 9 Jan 2020 13:34:43 +0800 Subject: [PATCH] [Feature] add for new --- README.md | 88 +++++++++++++++---- .../roadmap/v004-实现标点英文全角半角转换.md | 11 ++- doc/issues/roadmap/v008-拼音的处理.md | 8 +- pom.xml | 4 + .../sensitive/word/api/IWordContext.java | 39 ++++++++ .../houbb/sensitive/word/api/IWordMap.java | 17 ++-- .../sensitive/word/bs/SensitiveWordBs.java | 71 ++++++++++----- .../word/bs/SensitiveWordContext.java | 88 +++++++++++++++++++ .../word/constant/enums/ValidModeEnum.java | 2 +- .../word/model/CheckSensitiveWordResult.java | 4 +- .../word/support/data/SensitiveWordData.java | 2 +- .../word/support/map/SensitiveWordMap.java | 72 ++++++++++----- .../word/bs/SensitiveWordBsTest.java | 34 +++++-- 13 files changed, 363 insertions(+), 77 deletions(-) create mode 100644 src/main/java/com/github/houbb/sensitive/word/api/IWordContext.java create mode 100644 src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordContext.java diff --git a/README.md b/README.md index cf61dad..92e98ff 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # sensitive-word -[sensitive-word](https://github.com/houbb/sensitive-word) 基于 DFA 算法实现的敏感词工具。 +[sensitive-word](https://github.com/houbb/sensitive-word) 基于 DFA 算法实现的高性能敏感词工具。 [![Maven Central](https://maven-badges.herokuapp.com/maven-central/com.github.houbb/sensitive-word/badge.svg)](http://mvnrepository.com/artifact/com.github.houbb/sensitive-word) @@ -10,21 +10,25 @@ 实现一款好用敏感词工具。 -基于 DFA 算法实现,目前敏感词库内容收录 18W+ 感觉过于臃肿。 +基于 DFA 算法实现,目前敏感词库内容收录 6W+(源文件 18W+,经过一次删减)。 -后期将进行相关优化,降低字典的数量。 +后期将进行持续优化和补充敏感词库,并进一步提升算法的性能。 -希望可以细化敏感词的分类,感觉工作量比较大,暂时没有太好的思路。 +希望可以细化敏感词的分类,感觉工作量比较大,暂时没有进行。 -## 后期目标 +## 特性 -- 持续扩容对应的敏感词(如合法的数据抓取) +- 6W+ 词库,且不断优化更新 -- 添加英文大小写忽略,全角半角忽略 +- 基于 DFA 算法,性能很好 -- 中文添加拼音相关转换,添加繁简体转换忽略 +- 基于 fluent-api 实现,优雅方便 -- 允许用户自定义敏感词和白名单 +- 支持敏感词的判断、返回、脱敏等常见操作 + +- 支持全角半角互换 + +- 支持英文大小写互换 # 快速开始 @@ -40,10 +44,22 @@ com.github.houbb sensitive-word - 0.0.3 + 0.0.4 ``` +## api 概览 + +`SensitiveWordBs` 作为敏感词的引导类,核心方法如下: + +| 方法 | 参数 | 返回值| 说明 | +|:---|:---|:---|:---| +| newInstance() | 无 | 引导类 | 初始化引导类 | +| contains(String) | 待验证的字符串 | 布尔值 | 验证字符串是否包含敏感词 | +| findAll(String) | 待验证的字符串 | 字符串列表 | 返回字符串中所有敏感词 | +| replace(String, char) | 使用指定的 char 替换敏感词 | 字符串 | 返回脱敏后的字符串 | +| replace(String) | 使用 `*` 替换敏感词 | 字符串 | 返回脱敏后的字符串 | + ## 使用实例 所有测试案例参见 [SensitiveWordBsTest](https://github.com/houbb/sensitive-word/blob/master/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsTest.java) @@ -53,7 +69,7 @@ ```java final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。"; -Assert.assertTrue(SensitiveWordBs.getInstance().contains(text)); +Assert.assertTrue(SensitiveWordBs.newInstance().contains(text)); ``` ### 返回第一个敏感词 @@ -61,7 +77,7 @@ Assert.assertTrue(SensitiveWordBs.getInstance().contains(text)); ```java final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。"; -String word = SensitiveWordBs.getInstance().findFirst(text); +String word = SensitiveWordBs.newInstance().findFirst(text); Assert.assertEquals("五星红旗", word); ``` @@ -70,7 +86,7 @@ Assert.assertEquals("五星红旗", word); ```java final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。"; -List wordList = SensitiveWordBs.getInstance().findAll(text); +List wordList = SensitiveWordBs.newInstance().findAll(text); Assert.assertEquals("[五星红旗, 毛主席, 天安门]", wordList.toString()); ``` @@ -78,7 +94,7 @@ Assert.assertEquals("[五星红旗, 毛主席, 天安门]", wordList.toString()) ```java final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。"; -String result = SensitiveWordBs.getInstance().replace(text); +String result = SensitiveWordBs.newInstance().replace(text); Assert.assertEquals("****迎风飘扬,***的画像屹立在***前。", result); ``` @@ -86,6 +102,46 @@ Assert.assertEquals("****迎风飘扬,***的画像屹立在***前。", result) ```java final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。"; -String result = SensitiveWordBs.getInstance().replace(text, '0'); +String result = SensitiveWordBs.newInstance().replace(text, '0'); Assert.assertEquals("0000迎风飘扬,000的画像屹立在000前。", result); -``` \ No newline at end of file +``` + +# 更多特性 + +后续的诸多特性,主要是针对各种针对各种情况的处理,尽可能的提升敏感词命中率。 + +这是一场漫长的攻防之战。 + +## 忽略大小写 + +```java +final String text = "fuCK the bad words."; + +String word = SensitiveWordBs.newInstance().findFirst(text); +Assert.assertEquals("fuCK", word); +``` + +## 忽略半角圆角 + +```java +final String text = "fuck the bad words."; + +String word = SensitiveWordBs.newInstance().findFirst(text); +Assert.assertEquals("fuck", word); +``` + +# 后期 road-map + +- 繁简体互换 + +- 重复词 + +- 停顿词 + +- 拼音互换 + +- 用户自定义敏感词和白名单 + +- 文字镜像翻转 + +- 敏感词标签支持 \ No newline at end of file diff --git a/doc/issues/roadmap/v004-实现标点英文全角半角转换.md b/doc/issues/roadmap/v004-实现标点英文全角半角转换.md index 821168c..f17dab8 100644 --- a/doc/issues/roadmap/v004-实现标点英文全角半角转换.md +++ b/doc/issues/roadmap/v004-实现标点英文全角半角转换.md @@ -1,3 +1,12 @@ # 字符 -全部使用小写+半角的形式匹配。 \ No newline at end of file +全部使用小写+半角的形式匹配。 + +## 忽略大小写 + +if(Character.isLetter) { + ignoreCase=true + ignoreWidth=true +} + + diff --git a/doc/issues/roadmap/v008-拼音的处理.md b/doc/issues/roadmap/v008-拼音的处理.md index e5f8387..b230c2f 100644 --- a/doc/issues/roadmap/v008-拼音的处理.md +++ b/doc/issues/roadmap/v008-拼音的处理.md @@ -8,4 +8,10 @@ (2)数字 -对于数字,除却象形,最常用的就是谐音。 \ No newline at end of file +对于数字,除却象形,最常用的就是谐音。 + +## 不可变性 + +这个涉及到拼音的 DFA 树构建,可能需要 wordMap 提供一个添加的接口。 + +这个需要在初始化的时候,直接指定。而且不可变化。 \ No newline at end of file diff --git a/pom.xml b/pom.xml index be2130c..da40bb5 100644 --- a/pom.xml +++ b/pom.xml @@ -52,6 +52,10 @@ com.github.houbb heaven + + com.huaban + jieba-analysis + diff --git a/src/main/java/com/github/houbb/sensitive/word/api/IWordContext.java b/src/main/java/com/github/houbb/sensitive/word/api/IWordContext.java new file mode 100644 index 0000000..d0c706f --- /dev/null +++ b/src/main/java/com/github/houbb/sensitive/word/api/IWordContext.java @@ -0,0 +1,39 @@ +package com.github.houbb.sensitive.word.api; + +/** + * @author binbin.hou + * @since 0.0.4 + */ +public interface IWordContext { + + /** + * 是否忽略大小写 + * @return 是否 + * @since 0.0.4 + */ + boolean ignoreCase(); + + /** + * 是否忽略半角圆角 + * @return 是否 + * @since 0.0.4 + */ + boolean ignoreWidth(); + + /** + * 设置是否忽略大小写 + * @param ignoreCase 是否忽略大小写 + * @return this + * @since 0.0.4 + */ + IWordContext ignoreCase(boolean ignoreCase); + + /** + * 设置是否忽略半角圆角 + * @param ignoreWidth 是否忽略半角圆角 + * @return this + * @since 0.0.4 + */ + IWordContext ignoreWidth(boolean ignoreWidth); + +} diff --git a/src/main/java/com/github/houbb/sensitive/word/api/IWordMap.java b/src/main/java/com/github/houbb/sensitive/word/api/IWordMap.java index 39a6849..d431ec6 100644 --- a/src/main/java/com/github/houbb/sensitive/word/api/IWordMap.java +++ b/src/main/java/com/github/houbb/sensitive/word/api/IWordMap.java @@ -1,6 +1,5 @@ package com.github.houbb.sensitive.word.api; -import com.github.houbb.heaven.util.lang.StringUtil; import com.github.houbb.sensitive.word.constant.enums.ValidModeEnum; import java.util.Collection; @@ -24,28 +23,34 @@ public interface IWordMap { /** * 是否包含敏感词 * @param string 字符串 + * @param context 上下文 * @return 是否包含 * @since 0.0.1 * @see ValidModeEnum#FAIL_FAST 建议使用快速返回模式 */ - boolean contains(final String string); + boolean contains(final String string, + final IWordContext context); /** * 返回所有对应的敏感词 * @param string 原始字符串 + * @param context 上下文 * @return 结果 * @since 0.0.1 * @see ValidModeEnum#FAIL_OVER 建议使用全部检测返回模式 */ - List findAll(final String string); + List findAll(final String string, + final IWordContext context); /** * 返回第一个对应的敏感词 * @param string 原始字符串 + * @param context 上下文 * @return 结果 * @since 0.0.1 */ - String findFirst(final String string); + String findFirst(final String string, + final IWordContext context); /** * 替换所有敏感词内容 @@ -54,9 +59,11 @@ public interface IWordMap { * * @param target 目标字符串 * @param replaceChar 替换为的 char + * @param context 上下文 * @return 替换后结果 * @since 0.0.2 */ - String replace(final String target, final char replaceChar); + String replace(final String target, final char replaceChar, + final IWordContext context); } diff --git a/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java b/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java index 95fec06..52cebde 100644 --- a/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java +++ b/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java @@ -1,7 +1,7 @@ package com.github.houbb.sensitive.word.bs; import com.github.houbb.heaven.constant.CharConst; -import com.github.houbb.heaven.support.instance.impl.Instances; +import com.github.houbb.sensitive.word.api.IWordContext; import com.github.houbb.sensitive.word.api.IWordData; import com.github.houbb.sensitive.word.api.IWordMap; import com.github.houbb.sensitive.word.support.data.SensitiveWordData; @@ -22,41 +22,68 @@ public class SensitiveWordBs { */ private SensitiveWordBs(){} - /** - * 敏感数据信息 - * @since 0.0.1 - */ - private IWordData sensitiveWordData = Instances.singleton(SensitiveWordData.class); - /** * 敏感词 map * @since 0.0.1 */ - private IWordMap sensitiveWordMap = Instances.singleton(SensitiveWordMap.class); + private static volatile IWordMap sensitiveWordMap; /** - * 获取单例信息 - * @since 0.0.1 + * 默认的执行上下文 + * @since 0.0.4 */ - private static final SensitiveWordBs INSTANCE; + private volatile IWordContext context; - static { - synchronized (SensitiveWordBs.class) { - INSTANCE = new SensitiveWordBs(); - List lines = INSTANCE.sensitiveWordData.getWordData(); - INSTANCE.sensitiveWordMap.initWordMap(lines); + /** + * DCL 初始化 wordMap 信息 + * @return 初始化后的结果 + * @since 0.0.4 + */ + private static IWordMap initWordMap() { + if(sensitiveWordMap == null) { + synchronized (IWordMap.class) { + if(sensitiveWordMap == null) { + // 加载配置信息 + IWordData wordData = new SensitiveWordData(); + List lines = wordData.getWordData(); + + // 初始化 DFA 信息 + sensitiveWordMap = new SensitiveWordMap(); + sensitiveWordMap.initWordMap(lines); + } + } } + + return sensitiveWordMap; } /** * 新建验证实例 + * + * double-lock * @return this * @since 0.0.1 */ - public static SensitiveWordBs getInstance() { - return INSTANCE; + public static SensitiveWordBs newInstance() { + initWordMap(); + + SensitiveWordBs bs = new SensitiveWordBs(); + bs.context = buildDefaultContext(); + return bs; } + /** + * 构建默认的上下文 + * @return 结果 + * @since 0.0.4 + */ + private static IWordContext buildDefaultContext() { + IWordContext wordContext = SensitiveWordContext.newInstance(); + wordContext.ignoreCase(true); + wordContext.ignoreWidth(true); + + return wordContext; + } /** * 是否包含敏感词 * @param target 目标字符串 @@ -64,7 +91,7 @@ public class SensitiveWordBs { * @since 0.0.1 */ public boolean contains(final String target) { - return this.sensitiveWordMap.contains(target); + return sensitiveWordMap.contains(target, context); } /** @@ -76,7 +103,7 @@ public class SensitiveWordBs { * @since 0.0.1 */ public List findAll(final String target) { - return this.sensitiveWordMap.findAll(target); + return sensitiveWordMap.findAll(target, context); } /** @@ -87,7 +114,7 @@ public class SensitiveWordBs { * @since 0.0.1 */ public String findFirst(final String target) { - return this.sensitiveWordMap.findFirst(target); + return sensitiveWordMap.findFirst(target, context); } /** @@ -98,7 +125,7 @@ public class SensitiveWordBs { * @since 0.0.2 */ public String replace(final String target, final char replaceChar) { - return this.sensitiveWordMap.replace(target, replaceChar); + return sensitiveWordMap.replace(target, replaceChar, context); } /** diff --git a/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordContext.java b/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordContext.java new file mode 100644 index 0000000..29adaff --- /dev/null +++ b/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordContext.java @@ -0,0 +1,88 @@ +package com.github.houbb.sensitive.word.bs; + +import com.github.houbb.sensitive.word.api.IWordContext; + +/** + * 上下文 + * @author binbin.hou + * @since 0.0.4 + */ +public class SensitiveWordContext implements IWordContext { + + /** + * 忽略大小写 + * @since 0.0.4 + */ + private boolean ignoreCase; + + /** + * 忽略半角全角 + * @since 0.0.4 + */ + private boolean ignoreWidth; + + /** + * 私有化构造器 + * @since 0.0.4 + */ + private SensitiveWordContext() { + } + + /** + * 新建一个对象实例 + * @return 对象实例 + * @since 0.0.4 + */ + public static SensitiveWordContext newInstance() { + return new SensitiveWordContext(); + } + + @Override + public boolean ignoreCase() { + return ignoreCase; + } + + @Override + public SensitiveWordContext ignoreCase(boolean ignoreCase) { + this.ignoreCase = ignoreCase; + return this; + } + + @Override + public boolean ignoreWidth() { + return ignoreWidth; + } + + @Override + public SensitiveWordContext ignoreWidth(boolean ignoreWidth) { + this.ignoreWidth = ignoreWidth; + return this; + } + + private static class ContextHolder { + private static final SensitiveWordContext INSTANCE = new SensitiveWordContext(); + + static { + INSTANCE.ignoreCase(true); + INSTANCE.ignoreWidth(true); + } + } + + /** + * 默认配置 + * @return 结果 + * @since 0.0.4 + */ + private static SensitiveWordContext defaultContext() { + return ContextHolder.INSTANCE; + } + + @Override + public String toString() { + return "SensitiveWordContext{" + + "ignoreCase=" + ignoreCase + + ", ignoreWidth=" + ignoreWidth + + '}'; + } + +} diff --git a/src/main/java/com/github/houbb/sensitive/word/constant/enums/ValidModeEnum.java b/src/main/java/com/github/houbb/sensitive/word/constant/enums/ValidModeEnum.java index e90b2fc..0c0cdfd 100644 --- a/src/main/java/com/github/houbb/sensitive/word/constant/enums/ValidModeEnum.java +++ b/src/main/java/com/github/houbb/sensitive/word/constant/enums/ValidModeEnum.java @@ -5,7 +5,7 @@ package com.github.houbb.sensitive.word.constant.enums; *

create on 2020/1/7 22:46

* * @author Administrator - * @since 1.0.0 + * @since 0.0.1 */ public enum ValidModeEnum { diff --git a/src/main/java/com/github/houbb/sensitive/word/model/CheckSensitiveWordResult.java b/src/main/java/com/github/houbb/sensitive/word/model/CheckSensitiveWordResult.java index 9c4d030..81cc793 100644 --- a/src/main/java/com/github/houbb/sensitive/word/model/CheckSensitiveWordResult.java +++ b/src/main/java/com/github/houbb/sensitive/word/model/CheckSensitiveWordResult.java @@ -43,8 +43,8 @@ public class CheckSensitiveWordResult { return sensitiveWordSize; } - public CheckSensitiveWordResult sentiveWordSize(int sentiveWordSize) { - this.sensitiveWordSize = sentiveWordSize; + public CheckSensitiveWordResult sentiveWordSize(int sensitiveWordSize) { + this.sensitiveWordSize = sensitiveWordSize; return this; } diff --git a/src/main/java/com/github/houbb/sensitive/word/support/data/SensitiveWordData.java b/src/main/java/com/github/houbb/sensitive/word/support/data/SensitiveWordData.java index b551ad9..9be0a55 100644 --- a/src/main/java/com/github/houbb/sensitive/word/support/data/SensitiveWordData.java +++ b/src/main/java/com/github/houbb/sensitive/word/support/data/SensitiveWordData.java @@ -31,7 +31,7 @@ public class SensitiveWordData implements IWordData { defaultLines = StreamUtil.readAllLines("/dict.txt"); defaultLines.addAll(StreamUtil.readAllLines("/dict_en.txt")); long end = System.currentTimeMillis(); - System.out.println("Sensitive data loaded!, cost time: " + (end - start) + " ms"); + System.out.println("Sensitive data loaded!, cost time: " + (end - start) + "ms"); } } diff --git a/src/main/java/com/github/houbb/sensitive/word/support/map/SensitiveWordMap.java b/src/main/java/com/github/houbb/sensitive/word/support/map/SensitiveWordMap.java index 4b774c6..62b608e 100644 --- a/src/main/java/com/github/houbb/sensitive/word/support/map/SensitiveWordMap.java +++ b/src/main/java/com/github/houbb/sensitive/word/support/map/SensitiveWordMap.java @@ -7,6 +7,7 @@ import com.github.houbb.heaven.util.lang.ObjectUtil; import com.github.houbb.heaven.util.lang.StringUtil; import com.github.houbb.heaven.util.util.CollectionUtil; import com.github.houbb.heaven.util.util.MapUtil; +import com.github.houbb.sensitive.word.api.IWordContext; import com.github.houbb.sensitive.word.api.IWordMap; import com.github.houbb.sensitive.word.constant.AppConst; import com.github.houbb.sensitive.word.constant.enums.ValidModeEnum; @@ -30,7 +31,7 @@ public class SensitiveWordMap implements IWordMap { * * @since 0.0.1 */ - private static Map sensitiveWordMap; + private Map innerWordMap; /** * 读取敏感词库,将敏感词放入HashSet中,构建一个DFA算法模型: @@ -46,13 +47,13 @@ public class SensitiveWordMap implements IWordMap { @SuppressWarnings("unchecked") public void initWordMap(Collection collection) { // 避免重复加载 - if (MapUtil.isNotEmpty(sensitiveWordMap)) { + if (MapUtil.isNotEmpty(innerWordMap)) { return; } long startTime = System.currentTimeMillis(); // 避免扩容带来的消耗 - sensitiveWordMap = new HashMap(collection.size()); + innerWordMap = new HashMap(collection.size()); for (String key : collection) { if (StringUtil.isEmpty(key)) { @@ -64,7 +65,7 @@ public class SensitiveWordMap implements IWordMap { final int size = chars.length; // 每一个新词的循环,直接将结果设置为当前 map,所有变化都会体现在结果的 map 中 - Map currentMap = sensitiveWordMap; + Map currentMap = innerWordMap; for (int i = 0; i < size; i++) { // 截取敏感词当中的字,在敏感词库中字为HashMap对象的Key键值 @@ -78,7 +79,7 @@ public class SensitiveWordMap implements IWordMap { currentMap = (Map) wordMap; } else { //不存在则,则构建一个新的map,同时将isEnd设置为0,因为他不是最后一 - Map newWordMap = new HashMap<>(); + Map newWordMap = new HashMap<>(8); newWordMap.put(AppConst.IS_END, false); // 将新的节点放入当前 map 中 @@ -96,7 +97,7 @@ public class SensitiveWordMap implements IWordMap { } long endTime = System.currentTimeMillis(); - System.out.println("Init sensitive word map end! Cost time " + (endTime - startTime) + "ms"); + System.out.println("Init sensitive word map end! Cost time: " + (endTime - startTime) + "ms"); } /** @@ -109,13 +110,13 @@ public class SensitiveWordMap implements IWordMap { * @since 0.0.1 */ @Override - public boolean contains(String string) { + public boolean contains(String string, final IWordContext context) { if (StringUtil.isEmpty(string)) { return false; } for (int i = 0; i < string.length(); i++) { - int checkResult = checkSensitiveWord(string, i, ValidModeEnum.FAIL_FAST); + int checkResult = checkSensitiveWord(string, i, ValidModeEnum.FAIL_FAST, context); // 快速返回 if (checkResult > 0) { return true; @@ -134,13 +135,13 @@ public class SensitiveWordMap implements IWordMap { * @since 0.0.1 */ @Override - public List findAll(String string) { - return getSensitiveWords(string, ValidModeEnum.FAIL_OVER); + public List findAll(String string, final IWordContext context) { + return getSensitiveWords(string, ValidModeEnum.FAIL_OVER, context); } @Override - public String findFirst(String string) { - List stringList = getSensitiveWords(string, ValidModeEnum.FAIL_FAST); + public String findFirst(String string, final IWordContext context) { + List stringList = getSensitiveWords(string, ValidModeEnum.FAIL_FAST, context); if (CollectionUtil.isEmpty(stringList)) { return null; @@ -150,12 +151,12 @@ public class SensitiveWordMap implements IWordMap { } @Override - public String replace(String target, char replaceChar) { + public String replace(String target, char replaceChar, final IWordContext context) { if(StringUtil.isEmpty(target)) { return target; } - return this.replaceSensitiveWord(target, ValidModeEnum.FAIL_OVER, replaceChar); + return this.replaceSensitiveWord(target, replaceChar, context); } /** @@ -166,7 +167,8 @@ public class SensitiveWordMap implements IWordMap { * @return 结果列表 * @since 0.0.1 */ - private List getSensitiveWords(final String text, final ValidModeEnum modeEnum) { + private List getSensitiveWords(final String text, final ValidModeEnum modeEnum, + final IWordContext context) { //1. 是否存在敏感词,如果比存在,直接返回空列表 if (StringUtil.isEmpty(text)) { return Guavas.newArrayList(); @@ -174,7 +176,7 @@ public class SensitiveWordMap implements IWordMap { List resultList = Guavas.newArrayList(); for (int i = 0; i < text.length(); i++) { - int wordLength = checkSensitiveWord(text, i, ValidModeEnum.FAIL_OVER); + int wordLength = checkSensitiveWord(text, i, ValidModeEnum.FAIL_OVER, context); // 命中 if (wordLength > 0) { @@ -215,19 +217,23 @@ public class SensitiveWordMap implements IWordMap { * @param txt 文本信息 * @param beginIndex 开始下标 * @param validModeEnum 验证模式 + * @param context 执行上下文 * @return 敏感词对应的长度 * @since 0.0.1 */ private int checkSensitiveWord(final String txt, final int beginIndex, - final ValidModeEnum validModeEnum) { - Map nowMap = sensitiveWordMap; + final ValidModeEnum validModeEnum, + final IWordContext context) { + Map nowMap = innerWordMap; // 记录敏感词的长度 int lengthCount = 0; int actualLength = 0; for (int i = beginIndex; i < txt.length(); i++) { - char charKey = txt.charAt(i); + char c = txt.charAt(i); + char charKey = getActualChar(c, context); + // 判断该字是否存在于敏感词库中 // 并且将 nowMap 替换为新的 map,进入下一层的循环。 nowMap = (Map) nowMap.get(charKey); @@ -256,16 +262,36 @@ public class SensitiveWordMap implements IWordMap { return actualLength; } + /** + * 获取实际对应的符号 + * @param c 编号 + * @param context 上下文 + * @return 结果 + * @since 0.0.4 + */ + private char getActualChar(final char c, + final IWordContext context) { + char resultChar = c; + + if(context.ignoreCase()) { + resultChar = Character.toLowerCase(resultChar); + } + if(context.ignoreWidth()) { + resultChar = CharUtil.toHalfWidth(resultChar); + } + + return resultChar; + } + /** * 直接替换敏感词,返回替换后的结果 * @param target 文本信息 - * @param validModeEnum 验证模式 * @return 脱敏后的字符串 * @since 0.0.2 */ private String replaceSensitiveWord(final String target, - final ValidModeEnum validModeEnum, - final char replaceChar) { + final char replaceChar, + final IWordContext context) { if(StringUtil.isEmpty(target)) { return target; } @@ -275,7 +301,7 @@ public class SensitiveWordMap implements IWordMap { for (int i = 0; i < target.length(); i++) { char currentChar = target.charAt(i); // 内层直接从 i 开始往后遍历,这个算法的,获取第一个匹配的单词 - int wordLength = checkSensitiveWord(target, i, validModeEnum); + int wordLength = checkSensitiveWord(target, i, ValidModeEnum.FAIL_OVER, context); // 敏感词 if(wordLength > 0) { diff --git a/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsTest.java b/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsTest.java index bfc3ecd..75cea85 100644 --- a/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsTest.java +++ b/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsTest.java @@ -22,7 +22,7 @@ public class SensitiveWordBsTest { public void containsTest() { final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。"; - Assert.assertTrue(SensitiveWordBs.getInstance().contains(text)); + Assert.assertTrue(SensitiveWordBs.newInstance().contains(text)); } /** @@ -33,7 +33,7 @@ public class SensitiveWordBsTest { public void findAllTest() { final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。"; - List wordList = SensitiveWordBs.getInstance().findAll(text); + List wordList = SensitiveWordBs.newInstance().findAll(text); Assert.assertEquals("[五星红旗, 毛主席, 天安门]", wordList.toString()); } @@ -45,7 +45,7 @@ public class SensitiveWordBsTest { public void findFirstTest() { final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。"; - String word = SensitiveWordBs.getInstance().findFirst(text); + String word = SensitiveWordBs.newInstance().findFirst(text); Assert.assertEquals("五星红旗", word); } @@ -57,7 +57,7 @@ public class SensitiveWordBsTest { public void replaceTest() { final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。"; - String result = SensitiveWordBs.getInstance().replace(text); + String result = SensitiveWordBs.newInstance().replace(text); Assert.assertEquals("****迎风飘扬,***的画像屹立在***前。", result); } @@ -69,8 +69,32 @@ public class SensitiveWordBsTest { public void replaceCharTest() { final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。"; - String result = SensitiveWordBs.getInstance().replace(text, '0'); + String result = SensitiveWordBs.newInstance().replace(text, '0'); Assert.assertEquals("0000迎风飘扬,000的画像屹立在000前。", result); } + /** + * 忽略大小写 + * @since 0.0.4 + */ + @Test + public void ignoreCaseTest() { + final String text = "fuCK the bad words."; + + String word = SensitiveWordBs.newInstance().findFirst(text); + Assert.assertEquals("fuCK", word); + } + + /** + * 忽略半角圆角 + * @since 0.0.4 + */ + @Test + public void ignoreWidthTest() { + final String text = "fuck the bad words."; + + String word = SensitiveWordBs.newInstance().findFirst(text); + Assert.assertEquals("fuck", word); + } + }