diff --git a/doc/CHANGE_LOG.md b/doc/CHANGE_LOG.md index b1c0892..a4c1aca 100644 --- a/doc/CHANGE_LOG.md +++ b/doc/CHANGE_LOG.md @@ -87,3 +87,9 @@ |:---|:---|:---|:---|:--| | 1 | A | 添加对于数字过滤的可配置型 | 2020-1-14 22:48:12 | | | 2 | A | 添加部分敏感词 | 2020-1-14 22:48:12 | | + +# release_0.0.12 + +| 序号 | 变更类型 | 说明 | 时间 | 备注 | +|:---|:---|:---|:---|:--| +| 1 | A | 添加对于网址的过滤 | 2020-1-16 20:51:58 | | diff --git a/doc/issues/roadmap/v011-邮箱检测实现.md b/doc/issues/roadmap/v011-邮箱网址Regex检测实现.md similarity index 70% rename from doc/issues/roadmap/v011-邮箱检测实现.md rename to doc/issues/roadmap/v011-邮箱网址Regex检测实现.md index f4849ce..39f4736 100644 --- a/doc/issues/roadmap/v011-邮箱检测实现.md +++ b/doc/issues/roadmap/v011-邮箱网址Regex检测实现.md @@ -1,6 +1,8 @@ # 是否为邮箱 check +暂时先使用基本的正则表达式, + ================== 网址等等 @@ -13,6 +15,10 @@ Image-URL 检测,避免替换错误。 针对不同的信息脱敏,则需要知道对应的检测代码是什么。 +jpg +png +jpeg +gif ## 是否脱敏的配置 @@ -26,4 +32,11 @@ Image-URL 检测,避免替换错误。 可以直接开辟另一道验证方式。 -直接 regex+全文检索实现。 \ No newline at end of file +直接 regex+全文检索实现。 + +# 前提 + +首先实现 Regex + +这里也可以支持 allow_regex/deny_regex + diff --git a/pom.xml b/pom.xml index 358b419..e9e767d 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ com.github.houbb sensitive-word - 0.0.12-SNAPSHOT + 0.0.12 @@ -25,7 +25,7 @@ 1.7 - 0.1.72 + 0.1.73 1.2.0 @@ -37,13 +37,6 @@ - - com.github.houbb - heaven - ${heaven.version} - true - - com.github.houbb opencc4j @@ -59,6 +52,12 @@ + + + com.github.houbb + heaven + ${heaven.version} + diff --git a/release.bat b/release.bat index 164e8c4..b0cb2a0 100644 --- a/release.bat +++ b/release.bat @@ -10,9 +10,9 @@ ECHO "============================= RELEASE START..." :: 版本号信息(需要手动指定) :::: 旧版本名称 -SET version=0.0.11 +SET version=0.0.12 :::: 新版本名称 -SET newVersion=0.0.12 +SET newVersion=0.0.13 :::: 组织名称 SET groupName=com.github.houbb :::: 项目名称 diff --git a/src/main/java/com/github/houbb/sensitive/word/api/IWordContext.java b/src/main/java/com/github/houbb/sensitive/word/api/IWordContext.java index ec1f268..12b219d 100644 --- a/src/main/java/com/github/houbb/sensitive/word/api/IWordContext.java +++ b/src/main/java/com/github/houbb/sensitive/word/api/IWordContext.java @@ -88,30 +88,45 @@ public interface IWordContext { * @return 数字检测 * @since 0.0.5 */ - boolean sensitiveNumCheck(); + boolean sensitiveCheckNum(); /** * 设置敏感数字检测 - * @param sensitiveNumCheck 数字格式检测 + * @param sensitiveCheckNum 数字格式检测 * @return this * @since 0.0.5 */ - IWordContext sensitiveNumCheck(final boolean sensitiveNumCheck); + IWordContext sensitiveCheckNum(final boolean sensitiveCheckNum); /** * 是否进行邮箱检测 * @return this * @since 0.0.9 */ - boolean sensitiveEmailCheck(); + boolean sensitiveCheckEmail(); /** * 设置敏感邮箱检测 - * @param sensitiveEmailCheck 是否检测 + * @param sensitiveCheckEmail 是否检测 * @return this * @since 0.0.9 */ - IWordContext sensitiveEmailCheck(final boolean sensitiveEmailCheck); + IWordContext sensitiveCheckEmail(final boolean sensitiveCheckEmail); + + /** + * 敏感链接检测 + * @return 是否启用 + * @since 0. + */ + boolean sensitiveCheckUrl(); + + /** + * 设置敏感邮箱检测 + * @param sensitiveCheckUrl 是否检测 + * @return this + * @since 0.0.9 + */ + IWordContext sensitiveCheckUrl(final boolean sensitiveCheckUrl); /** * 忽略英文的写法 diff --git a/src/main/java/com/github/houbb/sensitive/word/api/IWordMap.java b/src/main/java/com/github/houbb/sensitive/word/api/IWordMap.java index fa6a58f..2b30244 100644 --- a/src/main/java/com/github/houbb/sensitive/word/api/IWordMap.java +++ b/src/main/java/com/github/houbb/sensitive/word/api/IWordMap.java @@ -1,6 +1,7 @@ package com.github.houbb.sensitive.word.api; import com.github.houbb.sensitive.word.constant.enums.ValidModeEnum; +import com.github.houbb.sensitive.word.support.check.ISensitiveCheck; import java.util.Collection; import java.util.List; diff --git a/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java b/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java index 777d39a..f6f028d 100644 --- a/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java +++ b/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java @@ -11,6 +11,7 @@ import java.util.List; /** * 敏感词引导类 + * * @author binbin.hou * @since 0.0.1 */ @@ -18,37 +19,36 @@ public class SensitiveWordBs { /** * 私有化构造器 + * * @since 0.0.1 */ - private SensitiveWordBs(){} + private SensitiveWordBs() { + } /** * 敏感词 map + * * @since 0.0.1 */ private static volatile IWordMap sensitiveWordMap; /** * 默认的执行上下文 + * * @since 0.0.4 */ private volatile IWordContext context; - /** - * 是否启用数字校验 - * @since 0.0.11 - */ - private boolean enableNumCheck = true; - /** * DCL 初始化 wordMap 信息 + * * @return 初始化后的结果 * @since 0.0.4 */ private static IWordMap initWordMap() { - if(sensitiveWordMap == null) { + if (sensitiveWordMap == null) { synchronized (IWordMap.class) { - if(sensitiveWordMap == null) { + if (sensitiveWordMap == null) { // 加载配置信息 IWordData wordData = new SensitiveWordData(); List lines = wordData.getWordData(); @@ -65,8 +65,9 @@ public class SensitiveWordBs { /** * 新建验证实例 - * + *

* double-lock + * * @return this * @since 0.0.1 */ @@ -81,16 +82,40 @@ public class SensitiveWordBs { /** * 设置是否启动数字检测 + * * @param enableNumCheck 数字检测 * @since 0.0.11 */ public SensitiveWordBs enableNumCheck(boolean enableNumCheck) { - this.context.sensitiveNumCheck(enableNumCheck); + this.context.sensitiveCheckNum(enableNumCheck); return this; -} + } + + /** + * 设置是否启动 email 检测 + * + * @param enableEmailCheck email 检测 + * @since 0.0.11 + */ + public SensitiveWordBs enableEmailCheck(boolean enableEmailCheck) { + this.context.sensitiveCheckEmail(enableEmailCheck); + return this; + } + + /** + * 设置是否启动 url 检测 + * + * @param enableUrlCheck url 检测 + * @since 0.0.12 + */ + public SensitiveWordBs enableUrlCheck(boolean enableUrlCheck) { + this.context.sensitiveCheckUrl(enableUrlCheck); + return this; + } /** * 构建默认的上下文 + * * @return 结果 * @since 0.0.4 */ @@ -105,13 +130,16 @@ public class SensitiveWordBs { wordContext.ignoreRepeat(true); // 开启校验 - wordContext.sensitiveNumCheck(true); - wordContext.sensitiveEmailCheck(true); + wordContext.sensitiveCheckNum(true); + wordContext.sensitiveCheckEmail(true); + wordContext.sensitiveCheckUrl(true); return wordContext; } + /** * 是否包含敏感词 + * * @param target 目标字符串 * @return 是否 * @since 0.0.1 @@ -124,6 +152,7 @@ public class SensitiveWordBs { * 返回所有的敏感词 * 1. 这里是默认去重的,且是有序的。 * 2. 如果不存在,返回空列表 + * * @param target 目标字符串 * @return 敏感词列表 * @since 0.0.1 @@ -135,6 +164,7 @@ public class SensitiveWordBs { /** * 返回第一个敏感词 * (1)如果不存在,则返回 {@code null} + * * @param target 目标字符串 * @return 敏感词 * @since 0.0.1 @@ -145,7 +175,8 @@ public class SensitiveWordBs { /** * 替换所有内容 - * @param target 目标字符串 + * + * @param target 目标字符串 * @param replaceChar 替换为的 char * @return 替换后结果 * @since 0.0.2 @@ -157,12 +188,13 @@ public class SensitiveWordBs { /** * 替换所有内容 * 1. 默认使用空格替换,避免星号改变 md 的格式。 + * * @param target 目标字符串 * @return 替换后结果 * @since 0.0.2 */ public String replace(final String target) { - return this.replace(target, CharConst.BLANK); + return this.replace(target, CharConst.STAR); } } diff --git a/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordContext.java b/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordContext.java index b611f62..2a1ad9d 100644 --- a/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordContext.java +++ b/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordContext.java @@ -39,7 +39,7 @@ public class SensitiveWordContext implements IWordContext { * 是否进行敏感数字检测 * @since 0.0.6 */ - private boolean sensitiveNumCheck; + private boolean sensitiveCheckNum; /** * 是否忽略中文繁简体 @@ -63,7 +63,13 @@ public class SensitiveWordContext implements IWordContext { * 是否进行邮箱测试 * @since 0.0.9 */ - private boolean sensitiveEmailCheck; + private boolean sensitiveCheckEmail; + + /** + * 是否进行 url 测试 + * @since 0.0.12 + */ + private boolean sensitiveCheckUrl; /** * 私有化构造器 @@ -126,13 +132,13 @@ public class SensitiveWordContext implements IWordContext { } @Override - public boolean sensitiveNumCheck() { - return sensitiveNumCheck; + public boolean sensitiveCheckNum() { + return sensitiveCheckNum; } @Override - public SensitiveWordContext sensitiveNumCheck(boolean sensitiveNumCheck) { - this.sensitiveNumCheck = sensitiveNumCheck; + public SensitiveWordContext sensitiveCheckNum(boolean sensitiveCheckNum) { + this.sensitiveCheckNum = sensitiveCheckNum; return this; } @@ -170,14 +176,24 @@ public class SensitiveWordContext implements IWordContext { } @Override - public boolean sensitiveEmailCheck() { - return sensitiveEmailCheck; + public boolean sensitiveCheckEmail() { + return sensitiveCheckEmail; } @Override - public SensitiveWordContext sensitiveEmailCheck(boolean sensitiveEmailCheck) { - this.sensitiveEmailCheck = sensitiveEmailCheck; + public SensitiveWordContext sensitiveCheckEmail(boolean sensitiveCheckEmail) { + this.sensitiveCheckEmail = sensitiveCheckEmail; return this; } + @Override + public boolean sensitiveCheckUrl() { + return sensitiveCheckUrl; + } + + @Override + public SensitiveWordContext sensitiveCheckUrl(boolean sensitiveCheckUrl) { + this.sensitiveCheckUrl = sensitiveCheckUrl; + return this; + } } diff --git a/src/main/java/com/github/houbb/sensitive/word/api/ISensitiveCheck.java b/src/main/java/com/github/houbb/sensitive/word/support/check/ISensitiveCheck.java similarity index 71% rename from src/main/java/com/github/houbb/sensitive/word/api/ISensitiveCheck.java rename to src/main/java/com/github/houbb/sensitive/word/support/check/ISensitiveCheck.java index 08a3eee..a07578c 100644 --- a/src/main/java/com/github/houbb/sensitive/word/api/ISensitiveCheck.java +++ b/src/main/java/com/github/houbb/sensitive/word/support/check/ISensitiveCheck.java @@ -1,5 +1,6 @@ -package com.github.houbb.sensitive.word.api; +package com.github.houbb.sensitive.word.support.check; +import com.github.houbb.sensitive.word.api.IWordContext; import com.github.houbb.sensitive.word.constant.enums.ValidModeEnum; /** @@ -33,9 +34,9 @@ public interface ISensitiveCheck { * @return 敏感信息对应的长度 * @since 0.0.5 */ - int checkSensitive(final String txt, - final int beginIndex, - final ValidModeEnum validModeEnum, - final IWordContext context); + SensitiveCheckResult sensitiveCheck(final String txt, + final int beginIndex, + final ValidModeEnum validModeEnum, + final IWordContext context); } diff --git a/src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveCheckChain.java b/src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveCheckChain.java deleted file mode 100644 index e0c45e6..0000000 --- a/src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveCheckChain.java +++ /dev/null @@ -1,53 +0,0 @@ -package com.github.houbb.sensitive.word.support.check; - -import com.github.houbb.heaven.annotation.ThreadSafe; -import com.github.houbb.heaven.support.instance.impl.Instances; -import com.github.houbb.heaven.util.guava.Guavas; -import com.github.houbb.sensitive.word.api.ISensitiveCheck; -import com.github.houbb.sensitive.word.api.IWordContext; -import com.github.houbb.sensitive.word.constant.enums.ValidModeEnum; - -import java.util.List; - -/** - * 敏感词检测责任链模式 - * - * 这里可以提供一个公共的父类。 - * - * - * DFA 算法的优化可以参考论文: - * 【DFA 算法】各种论文。 - * - * @author binbin.hou - * @since 0.0.5 - */ -@ThreadSafe -public class SensitiveCheckChain implements ISensitiveCheck { - - @Override - public int checkSensitive(String txt, int beginIndex, ValidModeEnum validModeEnum, IWordContext context) { - // 初始化责任链 - List sensitiveChecks = Guavas.newArrayList(); - // 默认添加敏感词校验 - sensitiveChecks.add(Instances.singleton(SensitiveWordCheck.class)); - if(context.sensitiveNumCheck()) { - sensitiveChecks.add(Instances.singleton(SensitiveNumCheck.class)); - } - if(context.sensitiveEmailCheck()) { - sensitiveChecks.add(Instances.singleton(SensitiveEmailCheck.class)); - } - - // 循环调用 - for(ISensitiveCheck sensitiveCheck : sensitiveChecks) { - int result = sensitiveCheck.checkSensitive(txt, beginIndex, validModeEnum, context); - - if(result > 0) { - return result; - } - } - - // 默认返回 0 - return 0; - } - -} diff --git a/src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveCheckResult.java b/src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveCheckResult.java new file mode 100644 index 0000000..88d24fb --- /dev/null +++ b/src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveCheckResult.java @@ -0,0 +1,64 @@ +package com.github.houbb.sensitive.word.support.check; + +/** + * 敏感信息监测接口结果 + * + * 可以使用责任链的模式,循环调用。 + * @author binbin.hou + * @since 0.0.12 + */ +public class SensitiveCheckResult { + + /** + * 下标 + * @since 0.0.12 + */ + private int index; + + /** + * 检测类 + * @since 0.0.12 + */ + private Class checkClass; + + /** + * 实例化 + * @param index 返回索引 + * @param checkClass 验证类 + * @return 结果 + * @since 0.0.12 + */ + public static SensitiveCheckResult of(final int index, + final Class checkClass) { + SensitiveCheckResult result = new SensitiveCheckResult(); + result.index(index).checkClass(checkClass); + return result; + } + + public int index() { + return index; + } + + public SensitiveCheckResult index(int index) { + this.index = index; + return this; + } + + public Class checkClass() { + return checkClass; + } + + public SensitiveCheckResult checkClass(Class checkClass) { + this.checkClass = checkClass; + return this; + } + + @Override + public String toString() { + return "SensitiveCheckResult{" + + "index=" + index + + ", checkClass=" + checkClass + + '}'; + } + +} diff --git a/src/main/java/com/github/houbb/sensitive/word/support/check/impl/SensitiveCheckChain.java b/src/main/java/com/github/houbb/sensitive/word/support/check/impl/SensitiveCheckChain.java new file mode 100644 index 0000000..2d099fa --- /dev/null +++ b/src/main/java/com/github/houbb/sensitive/word/support/check/impl/SensitiveCheckChain.java @@ -0,0 +1,58 @@ +package com.github.houbb.sensitive.word.support.check.impl; + +import com.github.houbb.heaven.annotation.ThreadSafe; +import com.github.houbb.heaven.support.instance.impl.Instances; +import com.github.houbb.heaven.util.guava.Guavas; +import com.github.houbb.sensitive.word.api.IWordContext; +import com.github.houbb.sensitive.word.constant.enums.ValidModeEnum; +import com.github.houbb.sensitive.word.support.check.ISensitiveCheck; +import com.github.houbb.sensitive.word.support.check.SensitiveCheckResult; + +import java.util.List; + +/** + * 敏感词检测责任链模式 + * + * 这里可以提供一个公共的父类。 + * + * + * DFA 算法的优化可以参考论文: + * 【DFA 算法】各种论文。 + * + * @author binbin.hou + * @since 0.0.5 + */ +@ThreadSafe +public class SensitiveCheckChain implements ISensitiveCheck { + + @Override + public SensitiveCheckResult sensitiveCheck(String txt, int beginIndex, ValidModeEnum validModeEnum, IWordContext context) { + // 初始化责任链 + List sensitiveChecks = Guavas.newArrayList(); + // 默认添加敏感词校验 + sensitiveChecks.add(Instances.singleton(SensitiveCheckWord.class)); + if(context.sensitiveCheckNum()) { + sensitiveChecks.add(Instances.singleton(SensitiveCheckNum.class)); + } + if(context.sensitiveCheckEmail()) { + sensitiveChecks.add(Instances.singleton(SensitiveCheckEmail.class)); + } + if(context.sensitiveCheckUrl()) { + sensitiveChecks.add(Instances.singleton(SensitiveCheckUrl.class)); + } + + // 循环调用 + for(ISensitiveCheck sensitiveCheck : sensitiveChecks) { + SensitiveCheckResult result = sensitiveCheck.sensitiveCheck(txt, beginIndex, validModeEnum, context); + + if(result.index() > 0) { + return result; + } + } + + // 这里直接进行正则表达式相关的调用。 + // 默认返回 0 + return SensitiveCheckResult.of(0, SensitiveCheckChain.class); + } + +} diff --git a/src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveEmailCheck.java b/src/main/java/com/github/houbb/sensitive/word/support/check/impl/SensitiveCheckEmail.java similarity index 83% rename from src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveEmailCheck.java rename to src/main/java/com/github/houbb/sensitive/word/support/check/impl/SensitiveCheckEmail.java index 0325212..3718b5c 100644 --- a/src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveEmailCheck.java +++ b/src/main/java/com/github/houbb/sensitive/word/support/check/impl/SensitiveCheckEmail.java @@ -1,12 +1,13 @@ -package com.github.houbb.sensitive.word.support.check; +package com.github.houbb.sensitive.word.support.check.impl; import com.github.houbb.heaven.annotation.ThreadSafe; import com.github.houbb.heaven.support.instance.impl.Instances; import com.github.houbb.heaven.util.lang.CharUtil; import com.github.houbb.heaven.util.util.regex.RegexUtil; -import com.github.houbb.sensitive.word.api.ISensitiveCheck; import com.github.houbb.sensitive.word.api.IWordContext; import com.github.houbb.sensitive.word.constant.enums.ValidModeEnum; +import com.github.houbb.sensitive.word.support.check.ISensitiveCheck; +import com.github.houbb.sensitive.word.support.check.SensitiveCheckResult; import com.github.houbb.sensitive.word.support.format.CharFormatChain; /** @@ -25,10 +26,10 @@ import com.github.houbb.sensitive.word.support.format.CharFormatChain; * @since 0.0.9 */ @ThreadSafe -public class SensitiveEmailCheck implements ISensitiveCheck { +public class SensitiveCheckEmail implements ISensitiveCheck { @Override - public int checkSensitive(String txt, int beginIndex, ValidModeEnum validModeEnum, IWordContext context) { + public SensitiveCheckResult sensitiveCheck(String txt, int beginIndex, ValidModeEnum validModeEnum, IWordContext context) { // 记录敏感词的长度 int lengthCount = 0; int actualLength = 0; @@ -59,7 +60,7 @@ public class SensitiveEmailCheck implements ISensitiveCheck { } } - return actualLength; + return SensitiveCheckResult.of(actualLength, SensitiveCheckEmail.class); } /** diff --git a/src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveNumCheck.java b/src/main/java/com/github/houbb/sensitive/word/support/check/impl/SensitiveCheckNum.java similarity index 76% rename from src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveNumCheck.java rename to src/main/java/com/github/houbb/sensitive/word/support/check/impl/SensitiveCheckNum.java index 166cefb..74bd783 100644 --- a/src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveNumCheck.java +++ b/src/main/java/com/github/houbb/sensitive/word/support/check/impl/SensitiveCheckNum.java @@ -1,17 +1,12 @@ -package com.github.houbb.sensitive.word.support.check; +package com.github.houbb.sensitive.word.support.check.impl; import com.github.houbb.heaven.annotation.ThreadSafe; import com.github.houbb.heaven.support.instance.impl.Instances; -import com.github.houbb.heaven.util.io.FileUtil; -import com.github.houbb.heaven.util.lang.NumUtil; -import com.github.houbb.heaven.util.lang.StringUtil; -import com.github.houbb.sensitive.word.api.ISensitiveCheck; import com.github.houbb.sensitive.word.api.IWordContext; import com.github.houbb.sensitive.word.constant.enums.ValidModeEnum; +import com.github.houbb.sensitive.word.support.check.ISensitiveCheck; +import com.github.houbb.sensitive.word.support.check.SensitiveCheckResult; import com.github.houbb.sensitive.word.support.format.CharFormatChain; -import com.github.houbb.sensitive.word.support.format.IgnoreNumStyleCharFormat; - -import java.util.List; /** * 敏感词监测实现 @@ -21,10 +16,10 @@ import java.util.List; * @since 0.0.5 */ @ThreadSafe -public class SensitiveNumCheck implements ISensitiveCheck { +public class SensitiveCheckNum implements ISensitiveCheck { @Override - public int checkSensitive(String txt, int beginIndex, ValidModeEnum validModeEnum, IWordContext context) { + public SensitiveCheckResult sensitiveCheck(String txt, int beginIndex, ValidModeEnum validModeEnum, IWordContext context) { // 记录敏感词的长度 int lengthCount = 0; int actualLength = 0; @@ -55,7 +50,7 @@ public class SensitiveNumCheck implements ISensitiveCheck { } } - return actualLength; + return SensitiveCheckResult.of(actualLength, SensitiveCheckNum.class); } /** diff --git a/src/main/java/com/github/houbb/sensitive/word/support/check/impl/SensitiveCheckUrl.java b/src/main/java/com/github/houbb/sensitive/word/support/check/impl/SensitiveCheckUrl.java new file mode 100644 index 0000000..d5760fb --- /dev/null +++ b/src/main/java/com/github/houbb/sensitive/word/support/check/impl/SensitiveCheckUrl.java @@ -0,0 +1,82 @@ +package com.github.houbb.sensitive.word.support.check.impl; + +import com.github.houbb.heaven.annotation.CommonEager; +import com.github.houbb.heaven.annotation.ThreadSafe; +import com.github.houbb.heaven.support.instance.impl.Instances; +import com.github.houbb.heaven.util.lang.CharUtil; +import com.github.houbb.heaven.util.util.regex.RegexUtil; +import com.github.houbb.sensitive.word.api.IWordContext; +import com.github.houbb.sensitive.word.constant.enums.ValidModeEnum; +import com.github.houbb.sensitive.word.support.check.ISensitiveCheck; +import com.github.houbb.sensitive.word.support.check.SensitiveCheckResult; +import com.github.houbb.sensitive.word.support.format.CharFormatChain; + +/** + * URL 正则表达式检测实现。 + * + * 也可以严格的保留下来。 + * + * (1)暂时先粗略的处理 web-site + * (2)如果网址的最后为图片类型,则跳过。 + * (3)长度超过 70,直接结束。 + * + * @author binbin.hou + * @since 0.0.9 + */ +@ThreadSafe +public class SensitiveCheckUrl implements ISensitiveCheck { + + /** + * 最长的网址长度 + * @since 0.0.12 + */ + private static final int MAX_WEB_SITE_LEN = 70; + + @Override + public SensitiveCheckResult sensitiveCheck(String txt, int beginIndex, ValidModeEnum validModeEnum, IWordContext context) { + // 记录敏感词的长度 + int lengthCount = 0; + int actualLength = 0; + + StringBuilder stringBuilder = new StringBuilder(); + // 这里偷懒直接使用 String 拼接,然后结合正则表达式。 + // DFA 本质就可以做正则表达式,这样实现不免性能会差一些。 + // 后期如果有想法,对 DFA 进一步深入学习后,将进行优化。 + for(int i = beginIndex; i < txt.length(); i++) { + char currentChar = txt.charAt(i); + char mappingChar = Instances.singleton(CharFormatChain.class) + .format(currentChar, context); + + if(CharUtil.isWebSiteChar(mappingChar) + && lengthCount <= MAX_WEB_SITE_LEN) { + lengthCount++; + stringBuilder.append(currentChar); + + if(isCondition(stringBuilder.toString())) { + actualLength = lengthCount; + + // 是否遍历全部匹配的模式 + if(ValidModeEnum.FAIL_FAST.equals(validModeEnum)) { + break; + } + } + } else { + break; + } + } + + return SensitiveCheckResult.of(actualLength, SensitiveCheckUrl.class); + } + + /** + * 这里指定一个阈值条件 + * @param string 长度 + * @return 是否满足条件 + * @since 0.0.12 + */ + private boolean isCondition(final String string) { + return RegexUtil.isWebSite(string); + } + + +} diff --git a/src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveWordCheck.java b/src/main/java/com/github/houbb/sensitive/word/support/check/impl/SensitiveCheckWord.java similarity index 87% rename from src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveWordCheck.java rename to src/main/java/com/github/houbb/sensitive/word/support/check/impl/SensitiveCheckWord.java index c8206a6..9b6a364 100644 --- a/src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveWordCheck.java +++ b/src/main/java/com/github/houbb/sensitive/word/support/check/impl/SensitiveCheckWord.java @@ -1,12 +1,13 @@ -package com.github.houbb.sensitive.word.support.check; +package com.github.houbb.sensitive.word.support.check.impl; import com.github.houbb.heaven.annotation.ThreadSafe; import com.github.houbb.heaven.support.instance.impl.Instances; import com.github.houbb.heaven.util.lang.ObjectUtil; -import com.github.houbb.sensitive.word.api.ISensitiveCheck; import com.github.houbb.sensitive.word.api.IWordContext; import com.github.houbb.sensitive.word.constant.AppConst; import com.github.houbb.sensitive.word.constant.enums.ValidModeEnum; +import com.github.houbb.sensitive.word.support.check.ISensitiveCheck; +import com.github.houbb.sensitive.word.support.check.SensitiveCheckResult; import com.github.houbb.sensitive.word.support.format.CharFormatChain; import java.util.Map; @@ -17,10 +18,10 @@ import java.util.Map; * @since 0.0.5 */ @ThreadSafe -public class SensitiveWordCheck implements ISensitiveCheck { +public class SensitiveCheckWord implements ISensitiveCheck { @Override - public int checkSensitive(String txt, int beginIndex, ValidModeEnum validModeEnum, IWordContext context) { + public SensitiveCheckResult sensitiveCheck(String txt, int beginIndex, ValidModeEnum validModeEnum, IWordContext context) { Map nowMap = context.sensitiveWordMap(); // 记录敏感词的长度 @@ -53,7 +54,7 @@ public class SensitiveWordCheck implements ISensitiveCheck { } } - return actualLength; + return SensitiveCheckResult.of(actualLength, SensitiveCheckWord.class); } /** diff --git a/src/main/java/com/github/houbb/sensitive/word/support/map/SensitiveWordMap.java b/src/main/java/com/github/houbb/sensitive/word/support/map/SensitiveWordMap.java index c233821..35e5295 100644 --- a/src/main/java/com/github/houbb/sensitive/word/support/map/SensitiveWordMap.java +++ b/src/main/java/com/github/houbb/sensitive/word/support/map/SensitiveWordMap.java @@ -3,6 +3,7 @@ package com.github.houbb.sensitive.word.support.map; import com.github.houbb.heaven.annotation.ThreadSafe; import com.github.houbb.heaven.support.instance.impl.Instances; import com.github.houbb.heaven.util.guava.Guavas; +import com.github.houbb.heaven.util.io.FileUtil; import com.github.houbb.heaven.util.lang.CharUtil; import com.github.houbb.heaven.util.lang.ObjectUtil; import com.github.houbb.heaven.util.lang.StringUtil; @@ -12,7 +13,9 @@ import com.github.houbb.sensitive.word.api.IWordContext; import com.github.houbb.sensitive.word.api.IWordMap; import com.github.houbb.sensitive.word.constant.AppConst; import com.github.houbb.sensitive.word.constant.enums.ValidModeEnum; -import com.github.houbb.sensitive.word.support.check.SensitiveCheckChain; +import com.github.houbb.sensitive.word.support.check.SensitiveCheckResult; +import com.github.houbb.sensitive.word.support.check.impl.SensitiveCheckChain; +import com.github.houbb.sensitive.word.support.check.impl.SensitiveCheckUrl; import java.util.Collection; import java.util.HashMap; @@ -118,9 +121,9 @@ public class SensitiveWordMap implements IWordMap { } for (int i = 0; i < string.length(); i++) { - int checkResult = checkSensitive(string, i, ValidModeEnum.FAIL_FAST, context); + SensitiveCheckResult checkResult = sensitiveCheck(string, i, ValidModeEnum.FAIL_FAST, context); // 快速返回 - if (checkResult > 0) { + if (checkResult.index() > 0) { return true; } } @@ -178,9 +181,9 @@ public class SensitiveWordMap implements IWordMap { List resultList = Guavas.newArrayList(); for (int i = 0; i < text.length(); i++) { - int wordLength = checkSensitive(text, i, ValidModeEnum.FAIL_OVER, context); - + SensitiveCheckResult checkResult = sensitiveCheck(text, i, ValidModeEnum.FAIL_OVER, context); // 命中 + int wordLength = checkResult.index(); if (wordLength > 0) { // 保存敏感词 String sensitiveWord = text.substring(i, i + wordLength); @@ -223,12 +226,22 @@ public class SensitiveWordMap implements IWordMap { for (int i = 0; i < target.length(); i++) { char currentChar = target.charAt(i); // 内层直接从 i 开始往后遍历,这个算法的,获取第一个匹配的单词 - int wordLength = checkSensitive(target, i, ValidModeEnum.FAIL_OVER, context); + SensitiveCheckResult checkResult = sensitiveCheck(target, i, ValidModeEnum.FAIL_OVER, context); // 敏感词 + int wordLength = checkResult.index(); if(wordLength > 0) { - String replaceStr = CharUtil.repeat(replaceChar, wordLength); - resultBuilder.append(replaceStr); + // 是否执行替换 + Class checkClass = checkResult.checkClass(); + String string = target.substring(i, i+wordLength); + if(SensitiveCheckUrl.class.equals(checkClass) + && FileUtil.isImage(string)) { + // 直接使用原始内容,避免 markdown 图片转换失败 + resultBuilder.append(string); + } else { + String replaceStr = CharUtil.repeat(replaceChar, wordLength); + resultBuilder.append(replaceStr); + } // 直接跳过敏感词的长度 i += wordLength-1; @@ -242,13 +255,13 @@ public class SensitiveWordMap implements IWordMap { } @Override - public int checkSensitive(String txt, int beginIndex, ValidModeEnum validModeEnum, IWordContext context) { + public SensitiveCheckResult sensitiveCheck(String txt, int beginIndex, ValidModeEnum validModeEnum, IWordContext context) { // 默认执行敏感词操作 context.sensitiveWordMap(innerWordMap); // 责任链模式调用 return Instances.singleton(SensitiveCheckChain.class) - .checkSensitive(txt, beginIndex, validModeEnum, context); + .sensitiveCheck(txt, beginIndex, validModeEnum, context); } } diff --git a/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsChineseTest.java b/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsChineseTest.java index 1fac846..5fa0485 100644 --- a/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsChineseTest.java +++ b/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsChineseTest.java @@ -23,7 +23,7 @@ public class SensitiveWordBsChineseTest { final String text = "我爱我的祖国和五星紅旗。"; List wordList = SensitiveWordBs.newInstance().findAll(text); - Assert.assertEquals("[五星紅旗]", wordList.toString()); + Assert.assertEquals("[祖国, 五星紅旗]", wordList.toString()); } } diff --git a/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsEmailTest.java b/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsEmailTest.java index 1b9d4d8..c930774 100644 --- a/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsEmailTest.java +++ b/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsEmailTest.java @@ -35,7 +35,7 @@ public class SensitiveWordBsEmailTest { final String text = "楼主好人,邮箱 123456789@xx.com"; List wordList = SensitiveWordBs.newInstance().findAll(text); - Assert.assertEquals("[邮箱, 123456789]", wordList.toString()); + Assert.assertEquals("[邮箱, 123456789, xx.com]", wordList.toString()); } } diff --git a/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsUrlTest.java b/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsUrlTest.java new file mode 100644 index 0000000..e13a1c7 --- /dev/null +++ b/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsUrlTest.java @@ -0,0 +1,50 @@ +package com.github.houbb.sensitive.word.bs; + +import org.junit.Assert; +import org.junit.Test; + +import java.util.List; + +/** + *

project: sensitive-word-SensitiveWordBsTest

+ *

create on 2020/1/7 23:43

+ * + * @author Administrator + * @since 0.0.12 + */ +public class SensitiveWordBsUrlTest { + + /** + * 忽略中文繁简体 + * @since 0.0.12 + */ + @Test + public void commonUrlTest() { + final String text = "点击链接 www.baidu.com查看答案"; + + List wordList = SensitiveWordBs.newInstance().findAll(text); + Assert.assertEquals("[链接, www.baidu.com]", wordList.toString()); + + Assert.assertEquals("点击** *************查看答案", SensitiveWordBs + .newInstance().replace(text)); + } + + /** + * 图片测试 + * + * (1)可以检测 + * (2)默认不替换 + * + * @since 0.0.12 + */ + @Test + public void imageUrlTest() { + final String text = "双击查看大图 www.big-image.png查看"; + + List wordList = SensitiveWordBs.newInstance().findAll(text); + Assert.assertEquals("[www.big-image.png]", wordList.toString()); + + Assert.assertEquals(text, SensitiveWordBs.newInstance().replace(text)); + } + +}