From 678686df0b8038ad58499f11ac1d889d524a1eb5 Mon Sep 17 00:00:00 2001 From: "binbin.hou" Date: Thu, 8 Jun 2023 14:53:57 +0800 Subject: [PATCH] release branch 0.4.0 --- CHANGE_LOG.md | 7 ++ README.md | 11 ++- pom.xml | 2 +- release.bat | 4 +- .../word/api/ISensitiveWordReplace.java | 13 ++- .../sensitive/word/api/IWordContext.java | 37 +++---- .../houbb/sensitive/word/api/IWordMap.java | 5 +- .../sensitive/word/bs/SensitiveWordBs.java | 43 +++++--- .../word/bs/SensitiveWordContext.java | 54 +++++------ .../sensitive/word/constant/AppConst.java | 30 ++---- .../constant/enums/WordContainsTypeEnum.java | 24 +++++ .../word/core/AbstractSensitiveWord.java | 17 +--- .../sensitive/word/core/SensitiveWord.java | 1 + .../impl/AbstractConditionSensitiveCheck.java | 83 ++++++++++++++++ .../check/impl/AbstractSensitiveCheck.java | 97 ++++++++----------- .../check/impl/SensitiveCheckEmail.java | 15 ++- .../support/check/impl/SensitiveCheckNum.java | 2 +- .../support/check/impl/SensitiveCheckUrl.java | 6 +- .../check/impl/SensitiveCheckWord.java | 48 +++++++-- .../support/check/impl/SensitiveChecks.java | 12 +-- .../sensitive/word/support/map/WordMap.java | 17 +++- .../replace/SensitiveWordReplaceChar.java | 11 ++- .../sensitive/word/benchmark/BasicTest.java | 56 +++++++++++ .../sensitive/word/data/NumUtilTest.java | 2 + .../word/replace/MySensitiveWordReplace.java | 25 ++--- 25 files changed, 413 insertions(+), 209 deletions(-) create mode 100644 src/main/java/com/github/houbb/sensitive/word/constant/enums/WordContainsTypeEnum.java create mode 100644 src/main/java/com/github/houbb/sensitive/word/support/check/impl/AbstractConditionSensitiveCheck.java create mode 100644 src/test/java/com/github/houbb/sensitive/word/benchmark/BasicTest.java diff --git a/CHANGE_LOG.md b/CHANGE_LOG.md index 3eb4e09..daf1ff9 100644 --- a/CHANGE_LOG.md +++ b/CHANGE_LOG.md @@ -163,3 +163,10 @@ | 1 | O | 中文繁简体样式 | 2023-06-07 23:51:58 | 调整实现策略 | | 2 | A | 代码结构优化 | 2023-06-07 23:51:58 | 调整实现策略 | +# 
release_0.4.0 + +| 序号 | 变更类型 | 说明 | 时间 | 备注 | +|:---|:-----|:------------|:--------------------|:-------| +| 1 | O | 优化单词校验逻辑 | 2023-06-08 23:51:58 | | +| 2 | A | 新增是否单词校验的开关 | 2023-06-08 23:51:58 | | + diff --git a/README.md b/README.md index 467e9ac..10bbc17 100644 --- a/README.md +++ b/README.md @@ -366,8 +366,9 @@ Assert.assertTrue(wordBs.contains(text)); | 7 | enableNumCheck | 是否启用数字检测。 | true | | 8 | enableEmailCheck | 是有启用邮箱检测 | true | | 9 | enableUrlCheck | 是否启用链接检测 | true | -| 10 | numCheckLen | 数字检测,自定义指定长度。 | 8 | -| 11 | sensitiveWordReplace | 敏感词替换策略 | `*` 替换 | +| 10 | enableWordCheck | 是否启用敏感单词检测 | true | +| 11 | numCheckLen | 数字检测,自定义指定长度。 | 8 | +| 12 | sensitiveWordReplace | 敏感词替换策略 | `*` 替换 | # 动态加载(用户自定义) @@ -617,7 +618,11 @@ public class SensitiveWordService { # 后期 road-map -- [ ] wordMap 的抽象,便于拓展 +- [x] wordMap 的抽象,便于拓展 + +- [ ] word 的统一性能优化,移除 string 的生成 + +- [ ] word 策略的优化,统一遍历+转换 - 同音字处理 diff --git a/pom.xml b/pom.xml index 80f17ee..10ad0e8 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ com.github.houbb sensitive-word - 0.3.2 + 0.4.0 diff --git a/release.bat b/release.bat index 9d66ae4..b1af6b6 100644 --- a/release.bat +++ b/release.bat @@ -10,9 +10,9 @@ ECHO "============================= RELEASE START..." :: 版本号信息(需要手动指定) :::: 旧版本名称 -SET version=0.3.2 +SET version=0.4.0 :::: 新版本名称 -SET newVersion=0.3.3 +SET newVersion=0.5.0 :::: 组织名称 SET groupName=com.github.houbb :::: 项目名称 diff --git a/src/main/java/com/github/houbb/sensitive/word/api/ISensitiveWordReplace.java b/src/main/java/com/github/houbb/sensitive/word/api/ISensitiveWordReplace.java index 429e435..b82d31c 100644 --- a/src/main/java/com/github/houbb/sensitive/word/api/ISensitiveWordReplace.java +++ b/src/main/java/com/github/houbb/sensitive/word/api/ISensitiveWordReplace.java @@ -10,10 +10,15 @@ public interface ISensitiveWordReplace { /** * 替换 - * @param context 上下文 - * @return 结果 - * @since 0.2.0 + *

+ * 説明:废弃以前的字符串返回,减少对象创建,提升性能。 + * + * @param stringBuilder 字符串连接器 + * @param rawChars 原始字符串 + * @param wordResult 当前的敏感词结果 + * @param wordContext 上下文 + * @since 0.4.0 */ - String replace(ISensitiveWordReplaceContext context); + void replace(final StringBuilder stringBuilder, final char[] rawChars, final IWordResult wordResult, final IWordContext wordContext); } diff --git a/src/main/java/com/github/houbb/sensitive/word/api/IWordContext.java b/src/main/java/com/github/houbb/sensitive/word/api/IWordContext.java index c7189d7..ee9a597 100644 --- a/src/main/java/com/github/houbb/sensitive/word/api/IWordContext.java +++ b/src/main/java/com/github/houbb/sensitive/word/api/IWordContext.java @@ -1,10 +1,7 @@ package com.github.houbb.sensitive.word.api; -import com.github.houbb.sensitive.word.bs.SensitiveWordContext; import com.github.houbb.sensitive.word.support.check.ISensitiveCheck; -import java.util.Map; - /** * @author binbin.hou * @since 0.0.4 @@ -72,64 +69,62 @@ public interface IWordContext { IWordContext ignoreChineseStyle(final boolean ignoreChineseStyle); /** - * 获取敏感词信息 - * @return 敏感词 - * @since 0.0.5 + * 是否启用单词 + * @return 是否 */ - Map sensitiveWordMap(); + boolean enableWordCheck(); /** - * 敏感词信息 - * @param map map 信息 - * @return this - * @since 0.0.5 + * 设置是否启用单词 + * @param enableWordCheck 是否 + * @return 结果 */ - IWordContext sensitiveWordMap(final Map map); + IWordContext enableWordCheck(boolean enableWordCheck); /** * 敏感数字检测 * @return 数字检测 * @since 0.0.5 */ - boolean sensitiveCheckNum(); + boolean enableNumCheck(); /** * 设置敏感数字检测 - * @param sensitiveCheckNum 数字格式检测 + * @param enableNumCheck 数字格式检测 * @return this * @since 0.0.5 */ - IWordContext sensitiveCheckNum(final boolean sensitiveCheckNum); + IWordContext enableNumCheck(final boolean enableNumCheck); /** * 是否进行邮箱检测 * @return this * @since 0.0.9 */ - boolean sensitiveCheckEmail(); + boolean enableEmailCheck(); /** * 设置敏感邮箱检测 - * @param sensitiveCheckEmail 是否检测 + * @param enableEmailCheck 是否检测 * @return 
this * @since 0.0.9 */ - IWordContext sensitiveCheckEmail(final boolean sensitiveCheckEmail); + IWordContext enableEmailCheck(final boolean enableEmailCheck); /** * 敏感链接检测 * @return 是否启用 * @since 0. */ - boolean sensitiveCheckUrl(); + boolean enableUrlCheck(); /** * 设置敏感邮箱检测 - * @param sensitiveCheckUrl 是否检测 + * @param enableUrlCheck 是否检测 * @return this * @since 0.0.9 */ - IWordContext sensitiveCheckUrl(final boolean sensitiveCheckUrl); + IWordContext enableUrlCheck(final boolean enableUrlCheck); /** * 忽略英文的写法 diff --git a/src/main/java/com/github/houbb/sensitive/word/api/IWordMap.java b/src/main/java/com/github/houbb/sensitive/word/api/IWordMap.java index 906d045..80f8fa4 100644 --- a/src/main/java/com/github/houbb/sensitive/word/api/IWordMap.java +++ b/src/main/java/com/github/houbb/sensitive/word/api/IWordMap.java @@ -1,6 +1,7 @@ package com.github.houbb.sensitive.word.api; import com.github.houbb.sensitive.word.constant.enums.ValidModeEnum; +import com.github.houbb.sensitive.word.constant.enums.WordContainsTypeEnum; import com.github.houbb.sensitive.word.support.check.ISensitiveCheck; import java.util.Collection; @@ -29,7 +30,7 @@ public interface IWordMap { * @since 0.0.1 * @see ValidModeEnum#FAIL_FAST 建议使用快速返回模式 */ - boolean contains(final String string, - final IWordContext context); + WordContainsTypeEnum contains(final String string, + final IWordContext context); } diff --git a/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java b/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java index 77ea1de..d5696f1 100644 --- a/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java +++ b/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java @@ -65,21 +65,27 @@ public class SensitiveWordBs { /** * 启用数字检测 */ - private boolean sensitiveCheckNum = true; + private boolean enableNumCheck = true; /** * 启用邮箱检测 */ - private boolean sensitiveCheckEmail = true; + private boolean enableEmailCheck = true; /** * 
启用 URL 检测 */ - private boolean sensitiveCheckUrl = true; + private boolean enableUrlCheck = true; + + /** + * 单词校验 + * @since 0.4.0 + */ + private boolean enableWordCheck = true; // 额外配置 /** * 检测数字时的长度 */ - private int sensitiveCheckNumLen = 8; + private int numCheckLen = 8; //------------------------------------------------------------- 基本属性 END /** @@ -177,12 +183,13 @@ public class SensitiveWordBs { context.ignoreRepeat(ignoreRepeat); // 开启校验 - context.sensitiveCheckNum(sensitiveCheckNum); - context.sensitiveCheckEmail(sensitiveCheckEmail); - context.sensitiveCheckUrl(sensitiveCheckUrl); + context.enableNumCheck(enableNumCheck); + context.enableEmailCheck(enableEmailCheck); + context.enableUrlCheck(enableUrlCheck); + context.enableWordCheck(enableWordCheck); // 额外配置 - context.sensitiveCheckNumLen(sensitiveCheckNumLen); + context.sensitiveCheckNumLen(numCheckLen); context.sensitiveWordReplace(sensitiveWordReplace); context.wordMap(wordMap); @@ -247,6 +254,18 @@ public class SensitiveWordBs { return this; } + /** + * 设置是否启动敏感单词检测 + * + * @param enableWordCheck 敏感单词检测 + * @since 0.4.0 + * @return this + */ + public SensitiveWordBs enableWordCheck(boolean enableWordCheck) { + this.enableWordCheck = enableWordCheck; + return this; + } + /** * 设置是否启动数字检测 * @@ -255,7 +274,7 @@ public class SensitiveWordBs { * @return this */ public SensitiveWordBs enableNumCheck(boolean enableNumCheck) { - this.sensitiveCheckNum = enableNumCheck; + this.enableNumCheck = enableNumCheck; return this; } @@ -266,7 +285,7 @@ public class SensitiveWordBs { * @since 0.2.1 */ public SensitiveWordBs numCheckLen(int numCheckLen) { - this.sensitiveCheckNumLen = numCheckLen; + this.numCheckLen = numCheckLen; return this; } @@ -278,7 +297,7 @@ public class SensitiveWordBs { * @return this */ public SensitiveWordBs enableEmailCheck(boolean enableEmailCheck) { - this.sensitiveCheckEmail = enableEmailCheck; + this.enableEmailCheck = enableEmailCheck; return this; } @@ -290,7 +309,7 @@ public class
SensitiveWordBs { * @return this */ public SensitiveWordBs enableUrlCheck(boolean enableUrlCheck) { - this.sensitiveCheckUrl = enableUrlCheck; + this.enableUrlCheck = enableUrlCheck; return this; } diff --git a/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordContext.java b/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordContext.java index 7e039a6..c14f70f 100644 --- a/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordContext.java +++ b/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordContext.java @@ -1,10 +1,11 @@ package com.github.houbb.sensitive.word.bs; -import com.github.houbb.sensitive.word.api.*; +import com.github.houbb.sensitive.word.api.ICharFormat; +import com.github.houbb.sensitive.word.api.ISensitiveWordReplace; +import com.github.houbb.sensitive.word.api.IWordContext; +import com.github.houbb.sensitive.word.api.IWordMap; import com.github.houbb.sensitive.word.support.check.ISensitiveCheck; -import java.util.Map; - /** * 上下文 * @author binbin.hou @@ -31,17 +32,16 @@ public class SensitiveWordContext implements IWordContext { private boolean ignoreNumStyle; /** - * 敏感词信息 - * @since 0.0.5 + * 启动单词校验 + * @since 0.4.0 */ - @Deprecated - private Map sensitiveWordMap; + private boolean enableWordCheck; /** * 是否进行敏感数字检测 * @since 0.0.6 */ - private boolean sensitiveCheckNum; + private boolean enableNumCheck; /** * 是否忽略中文繁简体 @@ -65,13 +65,13 @@ public class SensitiveWordContext implements IWordContext { * 是否进行邮箱测试 * @since 0.0.9 */ - private boolean sensitiveCheckEmail; + private boolean enableEmailCheck; /** * 是否进行 url 测试 * @since 0.0.12 */ - private boolean sensitiveCheckUrl; + private boolean enableUrlCheck; /** * 敏感数字检测对应的长度限制 @@ -182,25 +182,23 @@ public class SensitiveWordContext implements IWordContext { return this; } - @Override - public Map sensitiveWordMap() { - return sensitiveWordMap; + public boolean enableWordCheck() { + return enableWordCheck; } - @Override - public SensitiveWordContext 
sensitiveWordMap(Map sensitiveWordMap) { - this.sensitiveWordMap = sensitiveWordMap; + public SensitiveWordContext enableWordCheck(boolean enableWordCheck) { + this.enableWordCheck = enableWordCheck; return this; } @Override - public boolean sensitiveCheckNum() { - return sensitiveCheckNum; + public boolean enableNumCheck() { + return enableNumCheck; } @Override - public SensitiveWordContext sensitiveCheckNum(boolean sensitiveCheckNum) { - this.sensitiveCheckNum = sensitiveCheckNum; + public SensitiveWordContext enableNumCheck(boolean enableNumCheck) { + this.enableNumCheck = enableNumCheck; return this; } @@ -238,24 +236,24 @@ public class SensitiveWordContext implements IWordContext { } @Override - public boolean sensitiveCheckEmail() { - return sensitiveCheckEmail; + public boolean enableEmailCheck() { + return enableEmailCheck; } @Override - public SensitiveWordContext sensitiveCheckEmail(boolean sensitiveCheckEmail) { - this.sensitiveCheckEmail = sensitiveCheckEmail; + public SensitiveWordContext enableEmailCheck(boolean enableEmailCheck) { + this.enableEmailCheck = enableEmailCheck; return this; } @Override - public boolean sensitiveCheckUrl() { - return sensitiveCheckUrl; + public boolean enableUrlCheck() { + return enableUrlCheck; } @Override - public SensitiveWordContext sensitiveCheckUrl(boolean sensitiveCheckUrl) { - this.sensitiveCheckUrl = sensitiveCheckUrl; + public SensitiveWordContext enableUrlCheck(boolean enableUrlCheck) { + this.enableUrlCheck = enableUrlCheck; return this; } diff --git a/src/main/java/com/github/houbb/sensitive/word/constant/AppConst.java b/src/main/java/com/github/houbb/sensitive/word/constant/AppConst.java index 0950534..4699de8 100644 --- a/src/main/java/com/github/houbb/sensitive/word/constant/AppConst.java +++ b/src/main/java/com/github/houbb/sensitive/word/constant/AppConst.java @@ -19,34 +19,16 @@ public final class AppConst { */ public static final String IS_END = "ED"; - /** - * 字典的大小 - * @since 0.0.1 - */ - public 
static final int DICT_SIZE = 65275; - - /** - * 英语词典的大小 - * @since 0.0.4 - */ - public static final int DICT_EN_SIZE = 12; - - /** - * 拒绝的词语 - * @since 0.0.8 - */ - public static final String SENSITIVE_WORD_DENY_PATH = "/sensitive_word_deny.txt"; - - /** - * 用户允许的词语 - * @since 0.0.8 - */ - public static final String SENSITIVE_WORD_ALLOW_PATH = "/sensitive_word_allow.txt"; - /** * 最长的网址长度 * @since 0.3.0 */ public static final int MAX_WEB_SITE_LEN = 70; + /** + * 最大邮箱地址 + * @since 0.4.0 + */ + public static final int MAX_EMAIL_LEN = 64; + } diff --git a/src/main/java/com/github/houbb/sensitive/word/constant/enums/WordContainsTypeEnum.java b/src/main/java/com/github/houbb/sensitive/word/constant/enums/WordContainsTypeEnum.java new file mode 100644 index 0000000..09c3c10 --- /dev/null +++ b/src/main/java/com/github/houbb/sensitive/word/constant/enums/WordContainsTypeEnum.java @@ -0,0 +1,24 @@ +package com.github.houbb.sensitive.word.constant.enums; + +/** + * 单词包含类别 + * @since 0.4.0 + */ +public enum WordContainsTypeEnum { + + /** + * 包含+前缀 + */ + CONTAINS_PREFIX, + + /** + * 包含+且是结尾 + */ + CONTAINS_END, + + /** + * 不存在 + */ + NOT_FOUND, + +} diff --git a/src/main/java/com/github/houbb/sensitive/word/core/AbstractSensitiveWord.java b/src/main/java/com/github/houbb/sensitive/word/core/AbstractSensitiveWord.java index 45656d5..3843b82 100644 --- a/src/main/java/com/github/houbb/sensitive/word/core/AbstractSensitiveWord.java +++ b/src/main/java/com/github/houbb/sensitive/word/core/AbstractSensitiveWord.java @@ -3,11 +3,9 @@ package com.github.houbb.sensitive.word.core; import com.github.houbb.heaven.util.lang.StringUtil; import com.github.houbb.heaven.util.util.CollectionUtil; import com.github.houbb.sensitive.word.api.*; -import com.github.houbb.sensitive.word.support.replace.SensitiveWordReplaceContext; import java.util.Collections; import java.util.List; -import java.util.Map; /** * 抽象实现 @@ -42,7 +40,7 @@ public abstract class AbstractSensitiveWord implements 
ISensitiveWord { // 注意边界 int startIndex = 0; - char[] chars = target.toCharArray(); + char[] rawChars = target.toCharArray(); for(IWordResult wordResult : allList) { final int itemStartIx = wordResult.startIndex(); @@ -50,24 +48,19 @@ public abstract class AbstractSensitiveWord implements ISensitiveWord { // 脱敏的左边 if(startIndex < itemStartIx) { - stringBuilder.append(chars, startIndex, itemStartIx-startIndex); + stringBuilder.append(rawChars, startIndex, itemStartIx-startIndex); } // 脱敏部分 - String word = wordResult.word(); - ISensitiveWordReplaceContext replaceContext = SensitiveWordReplaceContext.newInstance() - .sensitiveWord(word) - .wordLength(word.length()); - String replacedText = replace.replace(replaceContext); - stringBuilder.append(replacedText); + replace.replace(stringBuilder, rawChars, wordResult, context); // 更新结尾 startIndex = Math.max(startIndex, itemEndIx); } // 最后部分 - if (startIndex < chars.length) { - stringBuilder.append(chars, startIndex, chars.length-startIndex); + if (startIndex < rawChars.length) { + stringBuilder.append(rawChars, startIndex, rawChars.length-startIndex); } return stringBuilder.toString(); diff --git a/src/main/java/com/github/houbb/sensitive/word/core/SensitiveWord.java b/src/main/java/com/github/houbb/sensitive/word/core/SensitiveWord.java index 6ed9664..f8164f5 100644 --- a/src/main/java/com/github/houbb/sensitive/word/core/SensitiveWord.java +++ b/src/main/java/com/github/houbb/sensitive/word/core/SensitiveWord.java @@ -57,6 +57,7 @@ public class SensitiveWord extends AbstractSensitiveWord { int wordLength = checkResult.index(); if (wordLength > 0) { // 保存敏感词 + // TODO: 这其实是一个比较消耗的操作,后续可以考虑简化掉。 String sensitiveWord = text.substring(i, i + wordLength); // 添加去重 diff --git a/src/main/java/com/github/houbb/sensitive/word/support/check/impl/AbstractConditionSensitiveCheck.java b/src/main/java/com/github/houbb/sensitive/word/support/check/impl/AbstractConditionSensitiveCheck.java new file mode 100644 index 0000000..d4256aa --- 
/dev/null +++ b/src/main/java/com/github/houbb/sensitive/word/support/check/impl/AbstractConditionSensitiveCheck.java @@ -0,0 +1,83 @@ +package com.github.houbb.sensitive.word.support.check.impl; + +import com.github.houbb.heaven.annotation.ThreadSafe; +import com.github.houbb.sensitive.word.api.IWordContext; +import com.github.houbb.sensitive.word.constant.enums.ValidModeEnum; +import com.github.houbb.sensitive.word.support.check.ISensitiveCheck; +import com.github.houbb.sensitive.word.support.check.SensitiveCheckResult; + +/** + * 抽象实现策略 + * + * @author binbin.hou + * @since 0.3.2 + */ +@ThreadSafe +public abstract class AbstractConditionSensitiveCheck extends AbstractSensitiveCheck { + + /** + * 当前字符串是否符合规范 + * @param mappingChar 当前字符 + * @param index 下标 + * @param rawText 原始文本 + * @param context 上下文 + * @return 结果 + * @since 0.3.2 + */ + protected abstract boolean isCharCondition(char mappingChar, + int index, + String rawText, + final IWordContext context); + + /** + * 这里指定一个阈值条件 + * @param index 当前下标 + * @param rawText 原始文本 + * @param stringBuilder 缓存 + * @param context 上下文 + * @return 是否满足条件 + * @since 0.3.2 + */ + protected abstract boolean isStringCondition(int index, + String rawText, + final StringBuilder stringBuilder, + final IWordContext context); + + @Override + protected int doGetActualLength(String txt, int beginIndex, + ValidModeEnum validModeEnum, + IWordContext context) { + int actualLength = 0; + + // 采用 ThreadLocal 应该可以提升性能,减少对象的创建。 + StringBuilder stringBuilder = new StringBuilder(); + // 前一个条件 + boolean preCondition = false; + int currentIx = 0; + for(int i = beginIndex; i < txt.length(); i++) { + currentIx = i; + char currentChar = txt.charAt(i); + + // 映射处理 + char mappingChar = context.charFormat().format(currentChar, context); + + // 符合条件 + boolean currentCondition = isCharCondition(mappingChar, i, txt, context); + + //4 个场景 + if(currentCondition) { + stringBuilder.append(currentChar); + } else { + break; + } + } + + // 匹配 + 
if(isStringCondition(currentIx, txt, stringBuilder, context)) { + actualLength = stringBuilder.length(); + } + + return actualLength; + } + +} diff --git a/src/main/java/com/github/houbb/sensitive/word/support/check/impl/AbstractSensitiveCheck.java b/src/main/java/com/github/houbb/sensitive/word/support/check/impl/AbstractSensitiveCheck.java index 978d4f7..802efc5 100644 --- a/src/main/java/com/github/houbb/sensitive/word/support/check/impl/AbstractSensitiveCheck.java +++ b/src/main/java/com/github/houbb/sensitive/word/support/check/impl/AbstractSensitiveCheck.java @@ -1,6 +1,7 @@ package com.github.houbb.sensitive.word.support.check.impl; import com.github.houbb.heaven.annotation.ThreadSafe; +import com.github.houbb.heaven.util.lang.StringUtil; import com.github.houbb.sensitive.word.api.IWordContext; import com.github.houbb.sensitive.word.constant.enums.ValidModeEnum; import com.github.houbb.sensitive.word.support.check.ISensitiveCheck; @@ -10,39 +11,11 @@ import com.github.houbb.sensitive.word.support.check.SensitiveCheckResult; * 抽象实现策略 * * @author binbin.hou - * @since 0.3.2 + * @since 0.4.0 */ @ThreadSafe public abstract class AbstractSensitiveCheck implements ISensitiveCheck { - /** - * 当前字符串是否符合规范 - * @param mappingChar 当前字符 - * @param index 下标 - * @param rawText 原始文本 - * @param context 上下文 - * @return 结果 - * @since 0.3.2 - */ - protected abstract boolean isCharCondition(char mappingChar, - int index, - String rawText, - final IWordContext context); - - /** - * 这里指定一个阈值条件 - * @param index 当前下标 - * @param rawText 原始文本 - * @param stringBuilder 缓存 - * @param context 上下文 - * @return 是否满足条件 - * @since 0.3.2 - */ - protected abstract boolean isStringCondition(int index, - String rawText, - final StringBuilder stringBuilder, - final IWordContext context); - /** * 获取校验类 * @return 类 @@ -50,41 +23,49 @@ public abstract class AbstractSensitiveCheck implements ISensitiveCheck { */ protected abstract Class getSensitiveCheckClass(); + /** + * 获取确切的长度 + * @param txt 文本 + * 
@param beginIndex 开始 + * @param validModeEnum 校验枚举 + * @param context 上下文 + * @return 长度 + * @since 0.4.0 + */ + protected abstract int doGetActualLength(String txt, int beginIndex, + ValidModeEnum validModeEnum, + IWordContext context); + + /** + * 获取确切的长度 + * @param txt 文本 + * @param beginIndex 开始 + * @param validModeEnum 校验枚举 + * @param context 上下文 + * @return 长度 + * @since 0.4.0 + */ + protected int getActualLength(String txt, int beginIndex, + ValidModeEnum validModeEnum, + IWordContext context) { + if(StringUtil.isEmpty(txt)) { + return 0; + } + + return doGetActualLength(txt, beginIndex, validModeEnum, context); + } + @Override public SensitiveCheckResult sensitiveCheck(String txt, int beginIndex, ValidModeEnum validModeEnum, IWordContext context) { - // 采用 ThreadLocal 应该可以提升性能,减少对象的创建。 - StringBuilder stringBuilder = new StringBuilder(); - int actualLength = 0; - // 前一个条件 - for(int i = beginIndex; i < txt.length(); i++) { - char currentChar = txt.charAt(i); - - // 映射处理 - char mappingChar = context.charFormat().format(currentChar, context); - - // 符合条件 - boolean currentCondition = isCharCondition(mappingChar, i, txt, context); - if(currentCondition) { - stringBuilder.append(currentChar); - - // 匹配 - if(isStringCondition(i, txt, stringBuilder, context)) { - actualLength = stringBuilder.length(); - - // 是否遍历全部匹配的模式 - if(ValidModeEnum.FAIL_FAST.equals(validModeEnum)) { - break; - } - } - } else { - break; - } + Class clazz = getSensitiveCheckClass(); + if(StringUtil.isEmpty(txt)) { + return SensitiveCheckResult.of(0, clazz); } + int actualLength = getActualLength(txt, beginIndex, validModeEnum, context); - // 处理结果 - return SensitiveCheckResult.of(actualLength, getSensitiveCheckClass()); + return SensitiveCheckResult.of(actualLength, clazz); } } diff --git a/src/main/java/com/github/houbb/sensitive/word/support/check/impl/SensitiveCheckEmail.java b/src/main/java/com/github/houbb/sensitive/word/support/check/impl/SensitiveCheckEmail.java index d58bd87..03ea436 
100644 --- a/src/main/java/com/github/houbb/sensitive/word/support/check/impl/SensitiveCheckEmail.java +++ b/src/main/java/com/github/houbb/sensitive/word/support/check/impl/SensitiveCheckEmail.java @@ -4,9 +4,8 @@ import com.github.houbb.heaven.annotation.ThreadSafe; import com.github.houbb.heaven.util.lang.CharUtil; import com.github.houbb.heaven.util.util.regex.RegexUtil; import com.github.houbb.sensitive.word.api.IWordContext; -import com.github.houbb.sensitive.word.constant.enums.ValidModeEnum; +import com.github.houbb.sensitive.word.constant.AppConst; import com.github.houbb.sensitive.word.support.check.ISensitiveCheck; -import com.github.houbb.sensitive.word.support.check.SensitiveCheckResult; /** * email 正则表达式检测实现。 @@ -24,7 +23,7 @@ import com.github.houbb.sensitive.word.support.check.SensitiveCheckResult; * @since 0.0.9 */ @ThreadSafe -public class SensitiveCheckEmail extends AbstractSensitiveCheck { +public class SensitiveCheckEmail extends AbstractConditionSensitiveCheck { /** * @since 0.3.0 @@ -42,6 +41,16 @@ public class SensitiveCheckEmail extends AbstractSensitiveCheck { @Override protected boolean isStringCondition(int index, String rawText, StringBuilder stringBuilder, IWordContext context) { + int bufferLen = stringBuilder.length(); + + //x@a.cn + if(bufferLen < 6) { + return false; + } + if(bufferLen > AppConst.MAX_EMAIL_LEN) { + return false; + } + String string = stringBuilder.toString(); return RegexUtil.isEmail(string); } diff --git a/src/main/java/com/github/houbb/sensitive/word/support/check/impl/SensitiveCheckNum.java b/src/main/java/com/github/houbb/sensitive/word/support/check/impl/SensitiveCheckNum.java index 3c0eca2..ca34f8e 100644 --- a/src/main/java/com/github/houbb/sensitive/word/support/check/impl/SensitiveCheckNum.java +++ b/src/main/java/com/github/houbb/sensitive/word/support/check/impl/SensitiveCheckNum.java @@ -12,7 +12,7 @@ import com.github.houbb.sensitive.word.support.check.ISensitiveCheck; * @since 0.0.5 */ @ThreadSafe 
-public class SensitiveCheckNum extends AbstractSensitiveCheck { +public class SensitiveCheckNum extends AbstractConditionSensitiveCheck { /** * @since 0.3.0 diff --git a/src/main/java/com/github/houbb/sensitive/word/support/check/impl/SensitiveCheckUrl.java b/src/main/java/com/github/houbb/sensitive/word/support/check/impl/SensitiveCheckUrl.java index 0c85f91..528a20a 100644 --- a/src/main/java/com/github/houbb/sensitive/word/support/check/impl/SensitiveCheckUrl.java +++ b/src/main/java/com/github/houbb/sensitive/word/support/check/impl/SensitiveCheckUrl.java @@ -20,7 +20,7 @@ import com.github.houbb.sensitive.word.support.check.ISensitiveCheck; * @since 0.0.9 */ @ThreadSafe -public class SensitiveCheckUrl extends AbstractSensitiveCheck { +public class SensitiveCheckUrl extends AbstractConditionSensitiveCheck { /** * @since 0.3.0 @@ -39,6 +39,10 @@ public class SensitiveCheckUrl extends AbstractSensitiveCheck { @Override protected boolean isStringCondition(int index, String rawText, StringBuilder stringBuilder, IWordContext context) { int bufferLen = stringBuilder.length(); + //a.cn + if(bufferLen < 4) { + return false; + } if(bufferLen > AppConst.MAX_WEB_SITE_LEN) { return false; } diff --git a/src/main/java/com/github/houbb/sensitive/word/support/check/impl/SensitiveCheckWord.java b/src/main/java/com/github/houbb/sensitive/word/support/check/impl/SensitiveCheckWord.java index 18dc22d..ff4c89e 100644 --- a/src/main/java/com/github/houbb/sensitive/word/support/check/impl/SensitiveCheckWord.java +++ b/src/main/java/com/github/houbb/sensitive/word/support/check/impl/SensitiveCheckWord.java @@ -2,6 +2,9 @@ package com.github.houbb.sensitive.word.support.check.impl; import com.github.houbb.heaven.annotation.ThreadSafe; import com.github.houbb.sensitive.word.api.IWordContext; +import com.github.houbb.sensitive.word.api.IWordMap; +import com.github.houbb.sensitive.word.constant.enums.ValidModeEnum; +import 
com.github.houbb.sensitive.word.constant.enums.WordContainsTypeEnum; import com.github.houbb.sensitive.word.support.check.ISensitiveCheck; /** @@ -21,19 +24,44 @@ public class SensitiveCheckWord extends AbstractSensitiveCheck { return INSTANCE; } - @Override - protected boolean isCharCondition(char mappingChar, int index, String rawText, IWordContext context) { - return true; - } - - @Override - protected boolean isStringCondition(int index, String rawText, StringBuilder stringBuilder, IWordContext context) { - return context.wordMap().contains(stringBuilder.toString(), context); - } - @Override protected Class getSensitiveCheckClass() { return SensitiveCheckWord.class; } + @Override + protected int doGetActualLength(String txt, int beginIndex, ValidModeEnum validModeEnum, IWordContext context) { + // 采用 ThreadLocal 应该可以提升性能,减少对象的创建。 + int actualLength = 0; + final IWordMap wordMap = context.wordMap(); + + // 前一个条件 + StringBuilder stringBuilder = new StringBuilder(); + for(int i = beginIndex; i < txt.length(); i++) { + char currentChar = txt.charAt(i); + + // 映射处理 + char mappingChar = context.charFormat().format(currentChar, context); + stringBuilder.append(mappingChar); + + // 判断是否存在 + WordContainsTypeEnum wordContainsTypeEnum = wordMap.contains(stringBuilder.toString(), context); + if(WordContainsTypeEnum.CONTAINS_END.equals(wordContainsTypeEnum)) { + actualLength = stringBuilder.length(); + + // 是否遍历全部匹配的模式 + if(ValidModeEnum.FAIL_FAST.equals(validModeEnum)) { + break; + } + } + + // 如果不包含,则直接返回。后续遍历无意义 + if(WordContainsTypeEnum.NOT_FOUND.equals(wordContainsTypeEnum)) { + break; + } + } + + return actualLength; + } + } diff --git a/src/main/java/com/github/houbb/sensitive/word/support/check/impl/SensitiveChecks.java b/src/main/java/com/github/houbb/sensitive/word/support/check/impl/SensitiveChecks.java index 71614e7..2bf4a9a 100644 --- a/src/main/java/com/github/houbb/sensitive/word/support/check/impl/SensitiveChecks.java +++ 
b/src/main/java/com/github/houbb/sensitive/word/support/check/impl/SensitiveChecks.java @@ -28,16 +28,16 @@ public final class SensitiveChecks { public static ISensitiveCheck initSensitiveCheck(final IWordContext context) { List sensitiveCheckList = new ArrayList<>(); - // 默认添加敏感词校验 - sensitiveCheckList.add(SensitiveChecks.word()); - - if(context.sensitiveCheckNum()) { + if(context.enableWordCheck()) { + sensitiveCheckList.add(SensitiveChecks.word()); + } + if(context.enableNumCheck()) { sensitiveCheckList.add(SensitiveChecks.num()); } - if(context.sensitiveCheckEmail()) { + if(context.enableEmailCheck()) { sensitiveCheckList.add(SensitiveChecks.email()); } - if(context.sensitiveCheckUrl()) { + if(context.enableUrlCheck()) { sensitiveCheckList.add(SensitiveChecks.url()); } diff --git a/src/main/java/com/github/houbb/sensitive/word/support/map/WordMap.java b/src/main/java/com/github/houbb/sensitive/word/support/map/WordMap.java index 299c7ec..38de3b1 100644 --- a/src/main/java/com/github/houbb/sensitive/word/support/map/WordMap.java +++ b/src/main/java/com/github/houbb/sensitive/word/support/map/WordMap.java @@ -6,6 +6,7 @@ import com.github.houbb.heaven.util.lang.StringUtil; import com.github.houbb.sensitive.word.api.IWordContext; import com.github.houbb.sensitive.word.api.IWordMap; import com.github.houbb.sensitive.word.constant.AppConst; +import com.github.houbb.sensitive.word.constant.enums.WordContainsTypeEnum; import java.util.Collection; import java.util.HashMap; @@ -98,15 +99,15 @@ public class WordMap implements IWordMap { * @since 0.0.1 */ @Override - public boolean contains(String string, final IWordContext context) { + public WordContainsTypeEnum contains(String string, final IWordContext context) { if (StringUtil.isEmpty(string)) { - return false; + return WordContainsTypeEnum.NOT_FOUND; } return innerContainsSensitive(string, context); } - private boolean innerContainsSensitive(String txt, + private WordContainsTypeEnum innerContainsSensitive(String 
txt, IWordContext context) { // 初始化为当前的 map Map nowMap = this.innerWordMap; @@ -118,11 +119,17 @@ public class WordMap implements IWordMap { // 如果不为空,则判断是否为结尾。 if (ObjectUtil.isNull(nowMap)) { - return false; + return WordContainsTypeEnum.NOT_FOUND; } } - return isEnd(nowMap); + // 是否为结尾,便于快速失败 + boolean isEnd = isEnd(nowMap); + if(isEnd) { + return WordContainsTypeEnum.CONTAINS_END; + } + + return WordContainsTypeEnum.CONTAINS_PREFIX; } /** diff --git a/src/main/java/com/github/houbb/sensitive/word/support/replace/SensitiveWordReplaceChar.java b/src/main/java/com/github/houbb/sensitive/word/support/replace/SensitiveWordReplaceChar.java index 02de3ec..6b5cc17 100644 --- a/src/main/java/com/github/houbb/sensitive/word/support/replace/SensitiveWordReplaceChar.java +++ b/src/main/java/com/github/houbb/sensitive/word/support/replace/SensitiveWordReplaceChar.java @@ -5,6 +5,8 @@ import com.github.houbb.heaven.constant.CharConst; import com.github.houbb.heaven.util.lang.CharUtil; import com.github.houbb.sensitive.word.api.ISensitiveWordReplace; import com.github.houbb.sensitive.word.api.ISensitiveWordReplaceContext; +import com.github.houbb.sensitive.word.api.IWordContext; +import com.github.houbb.sensitive.word.api.IWordResult; /** * 指定字符的替换策略 @@ -29,10 +31,11 @@ public class SensitiveWordReplaceChar implements ISensitiveWordReplace { } @Override - public String replace(ISensitiveWordReplaceContext context) { - int wordLength = context.wordLength(); - - return CharUtil.repeat(replaceChar, wordLength); + public void replace(StringBuilder stringBuilder, final char[] rawChars, IWordResult wordResult, IWordContext wordContext) { + int wordLen = wordResult.endIndex() - wordResult.startIndex(); + for(int i = 0; i < wordLen; i++) { + stringBuilder.append(replaceChar); + } } } diff --git a/src/test/java/com/github/houbb/sensitive/word/benchmark/BasicTest.java b/src/test/java/com/github/houbb/sensitive/word/benchmark/BasicTest.java new file mode 100644 index 0000000..871efcb --- 
/dev/null +++ b/src/test/java/com/github/houbb/sensitive/word/benchmark/BasicTest.java @@ -0,0 +1,56 @@ +package com.github.houbb.sensitive.word.benchmark; + +import com.github.houbb.heaven.util.util.RandomUtil; +import com.github.houbb.sensitive.word.bs.SensitiveWordBs; +import com.github.houbb.sensitive.word.core.SensitiveWordHelper; +import org.junit.Ignore; +import org.junit.Test; + +@Ignore +public class BasicTest { + + /** + * + * + * 100*100 耗时:926ms,性能較差。 + * + * 100*100000 的字符:12942ms 第一次优化。 + */ + @Test + public void costTimeTest() { + String randomText = "你他妈的不要说脏话"+ RandomUtil.randomString("1234567890bcdefghiJKLMNOPQRSTUVWXYZ", 100) + + "我们他妈的从来不说脏说"; + + + // 1W 次 + long start = System.currentTimeMillis(); + for(int i = 0; i < 10000; i++) { + SensitiveWordHelper.findAll(randomText); + } + long end = System.currentTimeMillis(); + System.out.println("------------------ COST: " + (end-start)); + } + + /** + * + * 100*100000 的字符:12440ms + */ + @Test + public void costTimeOnlyWordTest() { + String randomText = "你他妈的不要说脏话"+ RandomUtil.randomString("1234567890bcdefghiJKLMNOPQRSTUVWXYZ", 100) + + "我们他妈的从来不说脏说"; + + // 1W 次 + long start = System.currentTimeMillis(); + SensitiveWordBs sensitiveWordBs = SensitiveWordBs.newInstance() + .enableWordCheck(false) + .init(); + + for(int i = 0; i < 10000; i++) { + sensitiveWordBs.findAll(randomText); + } + long end = System.currentTimeMillis(); + System.out.println("------------------ COST: " + (end-start)); + } + +} diff --git a/src/test/java/com/github/houbb/sensitive/word/data/NumUtilTest.java b/src/test/java/com/github/houbb/sensitive/word/data/NumUtilTest.java index 691c9c8..15ac42b 100644 --- a/src/test/java/com/github/houbb/sensitive/word/data/NumUtilTest.java +++ b/src/test/java/com/github/houbb/sensitive/word/data/NumUtilTest.java @@ -1,5 +1,6 @@ package com.github.houbb.sensitive.word.data; +import org.junit.Ignore; import org.junit.Test; import java.util.Arrays; @@ -9,6 +10,7 @@ import java.util.List; * 
@author binbin.hou * @since 0.0.11 */ +@Ignore public class NumUtilTest { @Test diff --git a/src/test/java/com/github/houbb/sensitive/word/replace/MySensitiveWordReplace.java b/src/test/java/com/github/houbb/sensitive/word/replace/MySensitiveWordReplace.java index 32d6a8e..c4f05c0 100644 --- a/src/test/java/com/github/houbb/sensitive/word/replace/MySensitiveWordReplace.java +++ b/src/test/java/com/github/houbb/sensitive/word/replace/MySensitiveWordReplace.java @@ -1,8 +1,8 @@ package com.github.houbb.sensitive.word.replace; -import com.github.houbb.heaven.util.lang.CharUtil; import com.github.houbb.sensitive.word.api.ISensitiveWordReplace; -import com.github.houbb.sensitive.word.api.ISensitiveWordReplaceContext; +import com.github.houbb.sensitive.word.api.IWordContext; +import com.github.houbb.sensitive.word.api.IWordResult; /** * 自定义敏感词替换策略 @@ -13,19 +13,20 @@ import com.github.houbb.sensitive.word.api.ISensitiveWordReplaceContext; public class MySensitiveWordReplace implements ISensitiveWordReplace { @Override - public String replace(ISensitiveWordReplaceContext context) { - String sensitiveWord = context.sensitiveWord(); + public void replace(StringBuilder stringBuilder, final char[] rawChars, IWordResult wordResult, IWordContext wordContext) { + String sensitiveWord = wordResult.word(); // 自定义不同的敏感词替换策略,可以从数据库等地方读取 if("五星红旗".equals(sensitiveWord)) { - return "国家旗帜"; + stringBuilder.append("国家旗帜"); + } else if("毛主席".equals(sensitiveWord)) { + stringBuilder.append("教员"); + } else { + // 其他默认使用 * 代替 + int wordLength = wordResult.endIndex() - wordResult.startIndex(); + for(int i = 0; i < wordLength; i++) { + stringBuilder.append('*'); + } } - if("毛主席".equals(sensitiveWord)) { - return "教员"; - } - - // 其他默认使用 * 代替 - int wordLength = context.wordLength(); - return CharUtil.repeat('*', wordLength); } }