diff --git a/README.md b/README.md index 8a38c5c..7b254d9 100644 --- a/README.md +++ b/README.md @@ -30,6 +30,8 @@ - 支持英文大小写互换 +- 支持数字各种形式的互换 + ## 变更日志 [CHANGE_LOG.md](https://github.com/houbb/sensitive-word/blob/master/doc/CHANGE_LOG.md) @@ -48,7 +50,7 @@ com.github.houbb sensitive-word - 0.0.4 + 0.0.5 ``` @@ -134,9 +136,18 @@ String word = SensitiveWordBs.newInstance().findFirst(text); Assert.assertEquals("fuck", word); ``` -# 后期 road-map +## 忽略数字的写法 -- 数字的转换处理 +这里实现了数字常见形式的转换。 + +```java +final String text = "这个是我的微信:9⓿二肆⁹₈③⑸⒋➃㈤㊄"; + +List wordList = SensitiveWordBs.newInstance().findAll(text); +Assert.assertEquals("[9⓿二肆⁹₈③⑸⒋➃㈤㊄]", wordList.toString()); +``` + +# 后期 road-map - 繁简体互换 diff --git a/doc/CHANGE_LOG.md b/doc/CHANGE_LOG.md index d4e2e0b..1cafb7c 100644 --- a/doc/CHANGE_LOG.md +++ b/doc/CHANGE_LOG.md @@ -41,4 +41,8 @@ | 序号 | 变更类型 | 说明 | 时间 | 备注 | |:---|:---|:---|:---|:--| -| 1 | D | 移除单个字符 `v` | 2020-1-9 09:34:35 | | \ No newline at end of file +| 1 | D | 移除单个字符 `v` | 2020-1-9 09:34:35 | | +| 2 | D | 移除单个字符 `我` | 2020-1-10 09:34:35 | | +| 3 | O | 责任链模式优化代码实现 | 2020-1-10 09:34:35 | | +| 4 | A | 支持数字格式化转换 | 2020-1-10 09:34:35 | | +| 5 | A | 支持数字敏感词验证 | 2020-1-10 09:34:35 | | \ No newline at end of file diff --git a/doc/issues/roadmap/v005-数字的转换实现.md b/doc/issues/roadmap/v005-数字的转换实现.md index e69de29..b59ece7 100644 --- a/doc/issues/roadmap/v005-数字的转换实现.md +++ b/doc/issues/roadmap/v005-数字的转换实现.md @@ -0,0 +1,7 @@ +# 转换为数字 + +所有中文/符号转换为数字。 + +# 是否为多个数字的判断 + +连续超过 6 位的数字。 \ No newline at end of file diff --git a/doc/issues/roadmap/v008-繁简体转换实现.md b/doc/issues/roadmap/v006-繁简体转换实现.md similarity index 100% rename from doc/issues/roadmap/v008-繁简体转换实现.md rename to doc/issues/roadmap/v006-繁简体转换实现.md diff --git a/doc/issues/roadmap/v006-重复词的处理.md b/doc/issues/roadmap/v007-重复词的处理.md similarity index 100% rename from doc/issues/roadmap/v006-重复词的处理.md rename to doc/issues/roadmap/v007-重复词的处理.md diff --git a/doc/issues/roadmap/v007-停顿词的处理.md 
b/doc/issues/roadmap/v012-停顿词的处理.md similarity index 100% rename from doc/issues/roadmap/v007-停顿词的处理.md rename to doc/issues/roadmap/v012-停顿词的处理.md diff --git a/doc/issues/roadmap/v013-邮箱URL的转换实现.md b/doc/issues/roadmap/v013-邮箱URL的转换实现.md new file mode 100644 index 0000000..d2f4884 --- /dev/null +++ b/doc/issues/roadmap/v013-邮箱URL的转换实现.md @@ -0,0 +1,7 @@ +# 是否为邮箱 check + +# 是否为 URL check + +可以直接开辟另一道验证方式。 + +直接 regex+全文检索实现。 \ No newline at end of file diff --git a/pom.xml b/pom.xml index e41659b..66c7e51 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ com.github.houbb sensitive-word - 0.0.5-SNAPSHOT + 0.0.5 diff --git a/release.bat b/release.bat index 0c08976..687e022 100644 --- a/release.bat +++ b/release.bat @@ -10,9 +10,9 @@ ECHO "============================= RELEASE START..." :: 版本号信息(需要手动指定) :::: 旧版本名称 -SET version=0.0.4 +SET version=0.0.5 :::: 新版本名称 -SET newVersion=0.0.5 +SET newVersion=0.0.6 :::: 组织名称 SET groupName=com.github.houbb :::: 项目名称 diff --git a/src/main/java/com/github/houbb/sensitive/word/api/ICharFormat.java b/src/main/java/com/github/houbb/sensitive/word/api/ICharFormat.java new file mode 100644 index 0000000..a6e4513 --- /dev/null +++ b/src/main/java/com/github/houbb/sensitive/word/api/ICharFormat.java @@ -0,0 +1,25 @@ +package com.github.houbb.sensitive.word.api; + +/** + * Normalises a single character before it is matched. Normalisations listed by the author: + * (1) ignore letter case + * (2) ignore full-width / half-width differences + * (3) ignore stop words (NOTE(review): no such formatter is visible in this change - confirm) + * (4) ignore number style. + * + * @author binbin.hou + * @since 0.0.5 + */ +public interface ICharFormat { + + /** + * Formats one character. + * @param original the original character + * @param context the execution context + * @return the formatted character + * @since 0.0.5 + */ + char format(final char original, + final IWordContext context); + +} diff --git a/src/main/java/com/github/houbb/sensitive/word/api/ISensitiveCheck.java b/src/main/java/com/github/houbb/sensitive/word/api/ISensitiveCheck.java new file mode 100644 index 0000000..5fddcf2 --- /dev/null +++ b/src/main/java/com/github/houbb/sensitive/word/api/ISensitiveCheck.java @@ -0,0 +1,41 @@ +package 
com.github.houbb.sensitive.word.api; + +import com.github.houbb.sensitive.word.constant.enums.ValidModeEnum; + +/** + * Detector for sensitive content. Kinds listed by the author: + * (1) dictionary words + * (2) numbers (runs of 6 or more digits) + * (3) e-mail addresses + * (4) URLs + * + * Implementations can be combined as a chain of responsibility and called in turn. + * @author binbin.hou + * @since 0.0.5 + */ +public interface ISensitiveCheck { + + /** + * Checks for sensitive content starting at {@code beginIndex}. + *

+ * (1) Returns 0 when nothing sensitive is matched. + * (2) Otherwise returns the length of the matched content. + *

+ * ps: the result could be refined later to carry: + * 1. whether sensitive content was found; + * 2. the length of the match; + * 3. the length of text walked over (to avoid rescanning when replacing). + * + * @param txt the text to scan + * @param beginIndex index at which scanning starts + * @param validModeEnum validation mode + * @param context execution context + * @return length of the sensitive content, 0 when none + * @since 0.0.5 + */ + int checkSensitive(final String txt, + final int beginIndex, + final ValidModeEnum validModeEnum, + final IWordContext context); + +} diff --git a/src/main/java/com/github/houbb/sensitive/word/api/IWordContext.java b/src/main/java/com/github/houbb/sensitive/word/api/IWordContext.java index d0c706f..faa9619 100644 --- a/src/main/java/com/github/houbb/sensitive/word/api/IWordContext.java +++ b/src/main/java/com/github/houbb/sensitive/word/api/IWordContext.java @@ -1,5 +1,7 @@ package com.github.houbb.sensitive.word.api; +import java.util.Map; + /** * @author binbin.hou * @since 0.0.4 @@ -20,6 +22,14 @@ public interface IWordContext { */ boolean ignoreWidth(); + /** + * Whether number style is ignored. + * @return {@code true} if ignored + * @since 0.0.5 + */ + boolean ignoreNumStyle(); + + /** * 设置是否忽略大小写 * @param ignoreCase 是否忽略大小写 @@ -36,4 +46,41 @@ public interface IWordContext { */ IWordContext ignoreWidth(boolean ignoreWidth); + /** + * Sets whether number style is ignored. + * @param ignoreNumStyle whether to ignore number style + * @return this + * @since 0.0.5 + */ + IWordContext ignoreNumStyle(boolean ignoreNumStyle); + + /** + * Returns the sensitive-word map. + * @return the sensitive-word map + * @since 0.0.5 + */ + Map sensitiveWordMap(); + + /** + * Sets the sensitive-word map. + * @param map the sensitive-word map + * @return this + * @since 0.0.5 + */ + IWordContext sensitiveWordMap(final Map map); + + /** + * Whether sensitive-number detection is enabled. + * @return {@code true} if enabled + * @since 0.0.5 + */ + boolean sensitiveNumCheck(); + + /** + * Enables or disables sensitive-number detection. + * @return this + * @since 0.0.5 + */ + IWordContext sensitiveNumCheck(final boolean sensitiveNumCheck); + + } diff --git a/src/main/java/com/github/houbb/sensitive/word/api/IWordMap.java b/src/main/java/com/github/houbb/sensitive/word/api/IWordMap.java index d431ec6..fa6a58f 100644 --- a/src/main/java/com/github/houbb/sensitive/word/api/IWordMap.java +++ 
b/src/main/java/com/github/houbb/sensitive/word/api/IWordMap.java @@ -10,7 +10,7 @@ import java.util.List; * @author binbin.hou * @since 0.0.1 */ -public interface IWordMap { +public interface IWordMap extends ISensitiveCheck { /** diff --git a/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java b/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java index 52cebde..1746216 100644 --- a/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java +++ b/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java @@ -79,8 +79,13 @@ public class SensitiveWordBs { */ private static IWordContext buildDefaultContext() { IWordContext wordContext = SensitiveWordContext.newInstance(); + // 格式统一化 wordContext.ignoreCase(true); wordContext.ignoreWidth(true); + wordContext.ignoreNumStyle(true); + + // 开启校验 + wordContext.sensitiveNumCheck(true); return wordContext; } diff --git a/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordContext.java b/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordContext.java index 29adaff..250f34a 100644 --- a/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordContext.java +++ b/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordContext.java @@ -2,6 +2,8 @@ package com.github.houbb.sensitive.word.bs; import com.github.houbb.sensitive.word.api.IWordContext; +import java.util.Map; + /** * 上下文 * @author binbin.hou @@ -21,6 +23,23 @@ public class SensitiveWordContext implements IWordContext { */ private boolean ignoreWidth; + /** + * 是否忽略数字格式 + * @since 0.0.5 + */ + private boolean ignoreNumStyle; + + /** + * 敏感词信息 + * @since 0.0.5 + */ + private Map sensitiveWordMap; + + /** + * 是否进行敏感数字检测 + * @since 0.0.6 + */ + private boolean sensitiveNumCheck; /** * 私有化构造器 * @since 0.0.4 @@ -59,22 +78,37 @@ public class SensitiveWordContext implements IWordContext { return this; } - private static class ContextHolder { - private static final SensitiveWordContext 
INSTANCE = new SensitiveWordContext(); - - static { - INSTANCE.ignoreCase(true); - INSTANCE.ignoreWidth(true); - } + @Override + public boolean ignoreNumStyle() { + return ignoreNumStyle; } - /** - * 默认配置 - * @return 结果 - * @since 0.0.4 - */ - private static SensitiveWordContext defaultContext() { - return ContextHolder.INSTANCE; + @Override + public SensitiveWordContext ignoreNumStyle(boolean ignoreNumStyle) { + this.ignoreNumStyle = ignoreNumStyle; + return this; + } + + @Override + public Map sensitiveWordMap() { + return sensitiveWordMap; + } + + @Override + public SensitiveWordContext sensitiveWordMap(Map sensitiveWordMap) { + this.sensitiveWordMap = sensitiveWordMap; + return this; + } + + @Override + public boolean sensitiveNumCheck() { + return sensitiveNumCheck; + } + + @Override + public SensitiveWordContext sensitiveNumCheck(boolean sensitiveNumCheck) { + this.sensitiveNumCheck = sensitiveNumCheck; + return this; } @Override @@ -82,6 +116,8 @@ public class SensitiveWordContext implements IWordContext { return "SensitiveWordContext{" + "ignoreCase=" + ignoreCase + ", ignoreWidth=" + ignoreWidth + + ", ignoreNumStyle=" + ignoreNumStyle + + ", sensitiveNumCheck=" + sensitiveNumCheck + '}'; } diff --git a/src/main/java/com/github/houbb/sensitive/word/constant/AppConst.java b/src/main/java/com/github/houbb/sensitive/word/constant/AppConst.java index 7b9ff22..1fa8a1f 100644 --- a/src/main/java/com/github/houbb/sensitive/word/constant/AppConst.java +++ b/src/main/java/com/github/houbb/sensitive/word/constant/AppConst.java @@ -23,7 +23,7 @@ public final class AppConst { * 字典的大小 * @since 0.0.1 */ - public static final int DICT_SIZE = 65711; + public static final int DICT_SIZE = 65709; /** * 英语词典的大小 diff --git a/src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveCheckChain.java b/src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveCheckChain.java new file mode 100644 index 0000000..682b872 --- /dev/null +++ 
b/src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveCheckChain.java @@ -0,0 +1,45 @@ +package com.github.houbb.sensitive.word.support.check; + +import com.github.houbb.heaven.annotation.ThreadSafe; +import com.github.houbb.heaven.support.instance.impl.Instances; +import com.github.houbb.heaven.util.guava.Guavas; +import com.github.houbb.sensitive.word.api.ISensitiveCheck; +import com.github.houbb.sensitive.word.api.IWordContext; +import com.github.houbb.sensitive.word.constant.enums.ValidModeEnum; + +import java.util.List; + +/** + * Chain of responsibility over the individual sensitive checks; the first check that reports a length above 0 wins. + * + * A common parent class could be provided here. + * @author binbin.hou + * @since 0.0.5 + */ +@ThreadSafe +public class SensitiveCheckChain implements ISensitiveCheck { + + @Override + public int checkSensitive(String txt, int beginIndex, ValidModeEnum validModeEnum, IWordContext context) { + // build the chain (rebuilt on every call) + List sensitiveChecks = Guavas.newArrayList(); + // the dictionary word check is always included, and runs first + sensitiveChecks.add(Instances.singleton(SensitiveWordCheck.class)); + if(context.sensitiveNumCheck()) { + sensitiveChecks.add(Instances.singleton(SensitiveNumCheck.class)); + } + + // call each check in turn and stop at the first hit + for(ISensitiveCheck sensitiveCheck : sensitiveChecks) { + int result = sensitiveCheck.checkSensitive(txt, beginIndex, validModeEnum, context); + + if(result > 0) { + return result; + } + } + + // no check matched: return 0 + return 0; + } + +} diff --git a/src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveNumCheck.java b/src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveNumCheck.java new file mode 100644 index 0000000..c3814b7 --- /dev/null +++ b/src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveNumCheck.java @@ -0,0 +1,67 @@ +package com.github.houbb.sensitive.word.support.check; + +import com.github.houbb.heaven.annotation.ThreadSafe; +import com.github.houbb.heaven.support.instance.impl.Instances; +import com.github.houbb.heaven.util.lang.CharUtil; +import com.github.houbb.sensitive.word.api.ISensitiveCheck; +import 
com.github.houbb.sensitive.word.api.IWordContext; +import com.github.houbb.sensitive.word.constant.enums.ValidModeEnum; +import com.github.houbb.sensitive.word.support.format.CharFormatChain; +import com.github.houbb.sensitive.word.utils.NumUtils; + +/** + * Sensitive-number check: reports a run of consecutive digits, counted after character formatting, once it reaches the threshold in isCondition. + * + * A common parent class could be provided here. + * @author binbin.hou + * @since 0.0.5 + */ +@ThreadSafe +public class SensitiveNumCheck implements ISensitiveCheck { + + @Override + public int checkSensitive(String txt, int beginIndex, ValidModeEnum validModeEnum, IWordContext context) { + // lengthCount is the running length of the digit run; actualLength is the last length accepted as a match + int lengthCount = 0; + int actualLength = 0; + + for (int i = beginIndex; i < txt.length(); i++) { + char c = txt.charAt(i); + char charKey = Instances.singleton(CharFormatChain.class).format(c, context); + + // the formatted character is a digit, + // so the run continues + if (Character.isDigit(charKey)) { + lengthCount++; + + // has the run reached the reporting threshold? + boolean isCondition = isCondition(lengthCount); + if (isCondition) { + // record the length only once the threshold is met, so shorter runs are never reported + actualLength = lengthCount; + + // FAIL_FAST stops at the first acceptable length; any other mode keeps going to find the longest run + if (ValidModeEnum.FAIL_FAST.equals(validModeEnum)) { + break; + } + } + } else { + // the first non-digit ends the run + break; + } + } + + return actualLength; + } + + /** + * Threshold condition for a digit run to count as sensitive. + * @param lengthCount current length of the run + * @return whether the run is long enough (6 or more) + * @since 0.0.5 + */ + private boolean isCondition(final int lengthCount) { + return lengthCount >= 6; + } + +} diff --git a/src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveWordCheck.java b/src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveWordCheck.java new file mode 100644 index 0000000..1a1d48c --- /dev/null +++ b/src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveWordCheck.java @@ -0,0 +1,63 @@ +package com.github.houbb.sensitive.word.support.check; + +import com.github.houbb.heaven.annotation.ThreadSafe; +import com.github.houbb.heaven.support.instance.impl.Instances; +import com.github.houbb.heaven.util.lang.CharUtil; +import com.github.houbb.heaven.util.lang.ObjectUtil; +import 
com.github.houbb.sensitive.word.api.ISensitiveCheck; +import com.github.houbb.sensitive.word.api.IWordContext; +import com.github.houbb.sensitive.word.constant.AppConst; +import com.github.houbb.sensitive.word.constant.enums.ValidModeEnum; +import com.github.houbb.sensitive.word.support.format.CharFormatChain; + +import java.util.Map; + +/** + * Dictionary word check: walks the nested map from context.sensitiveWordMap() one formatted character at a time. + * @author binbin.hou + * @since 0.0.5 + */ +@ThreadSafe +public class SensitiveWordCheck implements ISensitiveCheck { + + @Override + public int checkSensitive(String txt, int beginIndex, ValidModeEnum validModeEnum, IWordContext context) { + Map nowMap = context.sensitiveWordMap(); + + // lengthCount is the number of characters walked; actualLength is the last length that ended a complete word + int lengthCount = 0; + int actualLength = 0; + + for (int i = beginIndex; i < txt.length(); i++) { + char c = txt.charAt(i); + char charKey = Instances.singleton(CharFormatChain.class).format(c, context); + + // look the character up at the current level of the dictionary + // and replace nowMap with the child map so the next iteration goes one level deeper + nowMap = (Map) nowMap.get(charKey); + if (ObjectUtil.isNotNull(nowMap)) { + lengthCount++; + + // is this character the end of a dictionary word? if so, decide whether to keep scanning + boolean isEnd = (boolean) nowMap.get(AppConst.IS_END); + if (isEnd) { + // record the length only at a word end, to avoid reporting an incomplete match + // e.g. dictionary entries: abc and abcxyz + // otherwise the partial text abcx would also be reported as matched + actualLength = lengthCount; + + // FAIL_FAST stops at the first complete word; any other mode keeps going to find the longest match + if (ValidModeEnum.FAIL_FAST.equals(validModeEnum)) { + break; + } + } + } else { + // no child for this character: leave the loop + break; + } + } + + return actualLength; + } + +} diff --git a/src/main/java/com/github/houbb/sensitive/word/support/format/CharFormatChain.java b/src/main/java/com/github/houbb/sensitive/word/support/format/CharFormatChain.java new file mode 100644 index 0000000..31239b9 --- /dev/null +++ b/src/main/java/com/github/houbb/sensitive/word/support/format/CharFormatChain.java @@ -0,0 +1,38 @@ +package com.github.houbb.sensitive.word.support.format; + +import com.github.houbb.heaven.annotation.ThreadSafe; +import com.github.houbb.heaven.support.instance.impl.Instances; +import com.github.houbb.heaven.util.guava.Guavas; +import 
com.github.houbb.sensitive.word.api.IWordContext; +import com.github.houbb.sensitive.word.api.ICharFormat; + +import java.util.List; + +/** + * Applies the enabled character normalisations in a fixed order: case, then width, then number style. Each step is gated by its own context switch: ignoreCase(), ignoreWidth() and ignoreNumStyle(). + * @author binbin.hou + * @since 0.0.5 + */ +@ThreadSafe +public class CharFormatChain implements ICharFormat { + + /** Normalises one character; returns {@code original} unchanged when every switch in {@code context} is off. */ + @Override + public char format(char original, IWordContext context) { + char result = original; + + // Each switch is checked on its own; width and number-style folding must not be nested under ignoreCase(). + if(context.ignoreCase()) { + result = Instances.singleton(IgnoreCaseCharFormat.class).format(result, context); + } + if(context.ignoreWidth()) { + result = Instances.singleton(IgnoreWidthCharFormat.class).format(result, context); + } + if(context.ignoreNumStyle()) { + result = Instances.singleton(IgnoreNumStyleCharFormat.class).format(result, context); + } + + return result; + } + +} diff --git a/src/main/java/com/github/houbb/sensitive/word/support/format/IgnoreCaseCharFormat.java b/src/main/java/com/github/houbb/sensitive/word/support/format/IgnoreCaseCharFormat.java new file mode 100644 index 0000000..9c43d22 --- /dev/null +++ b/src/main/java/com/github/houbb/sensitive/word/support/format/IgnoreCaseCharFormat.java @@ -0,0 +1,20 @@ +package com.github.houbb.sensitive.word.support.format; + +import com.github.houbb.heaven.annotation.ThreadSafe; +import com.github.houbb.sensitive.word.api.ICharFormat; +import com.github.houbb.sensitive.word.api.IWordContext; + +/** + * Folds a character to lower case so that matching ignores letter case. + * @author binbin.hou + * @since 0.0.5 + */ +@ThreadSafe +public class IgnoreCaseCharFormat implements ICharFormat { + + @Override + public char format(char original, IWordContext context) { + return Character.toLowerCase(original); + } + +} diff --git a/src/main/java/com/github/houbb/sensitive/word/support/format/IgnoreNumStyleCharFormat.java b/src/main/java/com/github/houbb/sensitive/word/support/format/IgnoreNumStyleCharFormat.java new file mode 100644 index 0000000..2923626 --- /dev/null +++ b/src/main/java/com/github/houbb/sensitive/word/support/format/IgnoreNumStyleCharFormat.java @@ -0,0 
+1,21 @@ +package com.github.houbb.sensitive.word.support.format; + +import com.github.houbb.heaven.annotation.ThreadSafe; +import com.github.houbb.sensitive.word.api.IWordContext; +import com.github.houbb.sensitive.word.api.ICharFormat; +import com.github.houbb.sensitive.word.utils.NumUtils; + +/** + * Ignores number style: returns the character that NumUtils.getMappingChar maps the input to. + * @author binbin.hou + * @since 0.0.5 + */ +@ThreadSafe +public class IgnoreNumStyleCharFormat implements ICharFormat { + + @Override + public char format(char original, IWordContext context) { + return NumUtils.getMappingChar(original); + } + +} diff --git a/src/main/java/com/github/houbb/sensitive/word/support/format/IgnoreWidthCharFormat.java b/src/main/java/com/github/houbb/sensitive/word/support/format/IgnoreWidthCharFormat.java new file mode 100644 index 0000000..64f8f38 --- /dev/null +++ b/src/main/java/com/github/houbb/sensitive/word/support/format/IgnoreWidthCharFormat.java @@ -0,0 +1,21 @@ +package com.github.houbb.sensitive.word.support.format; + +import com.github.houbb.heaven.annotation.ThreadSafe; +import com.github.houbb.heaven.util.lang.CharUtil; +import com.github.houbb.sensitive.word.api.IWordContext; +import com.github.houbb.sensitive.word.api.ICharFormat; + +/** + * Ignores character width: converts the input through CharUtil.toHalfWidth. + * @author binbin.hou + * @since 0.0.5 + */ +@ThreadSafe +public class IgnoreWidthCharFormat implements ICharFormat { + + @Override + public char format(char original, IWordContext context) { + return CharUtil.toHalfWidth(original); + } + +} diff --git a/src/main/java/com/github/houbb/sensitive/word/support/map/SensitiveWordMap.java b/src/main/java/com/github/houbb/sensitive/word/support/map/SensitiveWordMap.java index 62b608e..c233821 100644 --- a/src/main/java/com/github/houbb/sensitive/word/support/map/SensitiveWordMap.java +++ b/src/main/java/com/github/houbb/sensitive/word/support/map/SensitiveWordMap.java @@ -1,6 +1,7 @@ package com.github.houbb.sensitive.word.support.map; import com.github.houbb.heaven.annotation.ThreadSafe; +import 
com.github.houbb.heaven.support.instance.impl.Instances; import com.github.houbb.heaven.util.guava.Guavas; import com.github.houbb.heaven.util.lang.CharUtil; import com.github.houbb.heaven.util.lang.ObjectUtil; @@ -11,6 +12,7 @@ import com.github.houbb.sensitive.word.api.IWordContext; import com.github.houbb.sensitive.word.api.IWordMap; import com.github.houbb.sensitive.word.constant.AppConst; import com.github.houbb.sensitive.word.constant.enums.ValidModeEnum; +import com.github.houbb.sensitive.word.support.check.SensitiveCheckChain; import java.util.Collection; import java.util.HashMap; @@ -116,7 +118,7 @@ public class SensitiveWordMap implements IWordMap { } for (int i = 0; i < string.length(); i++) { - int checkResult = checkSensitiveWord(string, i, ValidModeEnum.FAIL_FAST, context); + int checkResult = checkSensitive(string, i, ValidModeEnum.FAIL_FAST, context); // 快速返回 if (checkResult > 0) { return true; @@ -176,7 +178,7 @@ public class SensitiveWordMap implements IWordMap { List resultList = Guavas.newArrayList(); for (int i = 0; i < text.length(); i++) { - int wordLength = checkSensitiveWord(text, i, ValidModeEnum.FAIL_OVER, context); + int wordLength = checkSensitive(text, i, ValidModeEnum.FAIL_OVER, context); // 命中 if (wordLength > 0) { @@ -203,86 +205,6 @@ public class SensitiveWordMap implements IWordMap { return resultList; } - /** - * 检查敏感词数量 - *

- * (1)如果未命中敏感词,直接返回 0 - * (2)命中敏感词,则返回敏感词的长度。 - * - * ps: 这里结果进行优化, - * 1. 是否包含敏感词。 - * 2. 敏感词的长度 - * 3. 正常走过字段的长度(便于后期替换优化,避免不必要的循环重复) - * - * @param txt 文本信息 - * @param beginIndex 开始下标 - * @param validModeEnum 验证模式 - * @param context 执行上下文 - * @return 敏感词对应的长度 - * @since 0.0.1 - */ - private int checkSensitiveWord(final String txt, final int beginIndex, - final ValidModeEnum validModeEnum, - final IWordContext context) { - Map nowMap = innerWordMap; - - // 记录敏感词的长度 - int lengthCount = 0; - int actualLength = 0; - - for (int i = beginIndex; i < txt.length(); i++) { - char c = txt.charAt(i); - char charKey = getActualChar(c, context); - - // 判断该字是否存在于敏感词库中 - // 并且将 nowMap 替换为新的 map,进入下一层的循环。 - nowMap = (Map) nowMap.get(charKey); - if (ObjectUtil.isNotNull(nowMap)) { - lengthCount++; - - // 判断是否是敏感词的结尾字,如果是结尾字则判断是否继续检测 - boolean isEnd = (boolean) nowMap.get(AppConst.IS_END); - if (isEnd) { - // 只在匹配到结束的时候才记录长度,避免不完全匹配导致的问题。 - // eg: 敏感词 敏感词xxx - // 如果是 【敏感词x】也会被匹配。 - actualLength = lengthCount; - - // 这里确实需要一种验证模式,主要是为了最大匹配从而达到最佳匹配的效果。 - if (ValidModeEnum.FAIL_FAST.equals(validModeEnum)) { - break; - } - } - } else { - // 直接跳出循环 - break; - } - } - - return actualLength; - } - - /** - * 获取实际对应的符号 - * @param c 编号 - * @param context 上下文 - * @return 结果 - * @since 0.0.4 - */ - private char getActualChar(final char c, - final IWordContext context) { - char resultChar = c; - - if(context.ignoreCase()) { - resultChar = Character.toLowerCase(resultChar); - } - if(context.ignoreWidth()) { - resultChar = CharUtil.toHalfWidth(resultChar); - } - - return resultChar; - } - /** * 直接替换敏感词,返回替换后的结果 * @param target 文本信息 @@ -301,7 +223,7 @@ public class SensitiveWordMap implements IWordMap { for (int i = 0; i < target.length(); i++) { char currentChar = target.charAt(i); // 内层直接从 i 开始往后遍历,这个算法的,获取第一个匹配的单词 - int wordLength = checkSensitiveWord(target, i, ValidModeEnum.FAIL_OVER, context); + int wordLength = checkSensitive(target, i, ValidModeEnum.FAIL_OVER, context); // 敏感词 
if(wordLength > 0) { @@ -319,4 +241,14 @@ public class SensitiveWordMap implements IWordMap { return resultBuilder.toString(); } + @Override + public int checkSensitive(String txt, int beginIndex, ValidModeEnum validModeEnum, IWordContext context) { + // 默认执行敏感词操作 + context.sensitiveWordMap(innerWordMap); + + // 责任链模式调用 + return Instances.singleton(SensitiveCheckChain.class) + .checkSensitive(txt, beginIndex, validModeEnum, context); + } + } diff --git a/src/main/java/com/github/houbb/sensitive/word/utils/NumUtils.java b/src/main/java/com/github/houbb/sensitive/word/utils/NumUtils.java index daa525f..486bbe5 100644 --- a/src/main/java/com/github/houbb/sensitive/word/utils/NumUtils.java +++ b/src/main/java/com/github/houbb/sensitive/word/utils/NumUtils.java @@ -3,6 +3,8 @@ package com.github.houbb.sensitive.word.utils; import com.github.houbb.heaven.util.guava.Guavas; import com.github.houbb.heaven.util.lang.ObjectUtil; import com.github.houbb.heaven.util.lang.StringUtil; +import com.github.houbb.sensitive.word.api.IWordContext; +import com.github.houbb.sensitive.word.constant.enums.ValidModeEnum; import java.util.Map; @@ -51,11 +53,6 @@ public final class NumUtils { "123456789" + "123456789" + "123456789" + - "123456789" + - "123456789" + - "123456789" + - "123456789" + - "123456789" + "123456789"; /** @@ -104,4 +101,28 @@ public final class NumUtils { return stringBuilder.toString(); } + /** + * 检查敏感词数量 + *

+ * (1)如果未命中敏感词,直接返回 0 + * (2)命中敏感词,则返回敏感词的长度。 + * + * ps: 这里结果进行优化, + * 1. 是否包含敏感词。 + * 2. 敏感词的长度 + * 3. 正常走过字段的长度(便于后期替换优化,避免不必要的循环重复) + * + * @param txt 文本信息 + * @param beginIndex 开始下标 + * @param validModeEnum 验证模式 + * @param context 执行上下文 + * @return 敏感数字对应的长度 + * @since 0.0.5 + */ + private int getSensitiveNumber(final String txt, final int beginIndex, + final ValidModeEnum validModeEnum, + final IWordContext context) { + return 0; + } + } diff --git a/src/main/resources/dict.txt b/src/main/resources/dict.txt index 8130303..75c7377 100644 --- a/src/main/resources/dict.txt +++ b/src/main/resources/dict.txt @@ -32719,7 +32719,6 @@ z以留吧以其以武 成都美女上门qq100996803 成都锦天 成龙记 -我 我qiuqiu446巴636巴8 我qq前五位27279接后接61388共十位 我xx你 diff --git a/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsNumTest.java b/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsNumTest.java new file mode 100644 index 0000000..552593f --- /dev/null +++ b/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsNumTest.java @@ -0,0 +1,41 @@ +package com.github.houbb.sensitive.word.bs; + +import org.junit.Assert; +import org.junit.Test; + +import java.util.List; + +/** + *

project: sensitive-word-SensitiveWordBsTest

+ *

create on 2020/1/7 23:43

+ * + * @author Administrator + * @since 0.0.5 + */ +public class SensitiveWordBsNumTest { + + /** + * findAll reports a plain run of digits as a sensitive word. + * @since 0.0.5 + */ + @Test + public void findAllTest() { + final String text = "这个是我的微信:9989123456"; + + List wordList = SensitiveWordBs.newInstance().findAll(text); + Assert.assertEquals("[9989123456]", wordList.toString()); + } + + /** + * findAll reports a run of digits written in mixed number styles, returned in its original spelling. + * @since 0.0.5 + */ + @Test + public void ignoreNumStyleTest() { + final String text = "这个是我的微信:9⓿二肆⁹₈③⑸⒋➃㈤㊄"; + + List wordList = SensitiveWordBs.newInstance().findAll(text); + Assert.assertEquals("[9⓿二肆⁹₈③⑸⒋➃㈤㊄]", wordList.toString()); + } + +}