diff --git a/README.md b/README.md index 9b819fa..9c53fdd 100644 --- a/README.md +++ b/README.md @@ -180,6 +180,15 @@ List wordList = SensitiveWordBs.newInstance().findAll(text); Assert.assertEquals("[ⒻⒻⒻfⓤuⓤ⒰cⓒ⒦]", wordList.toString()); ``` +## 邮箱检测 + +```java +final String text = "楼主好人,邮箱 sensitiveword@xx.com"; + +List wordList = SensitiveWordBs.newInstance().findAll(text); +Assert.assertEquals("[sensitiveword@xx.com]", wordList.toString()); +``` + # 用户自定义 ## 敏感词和白名单 @@ -208,12 +217,18 @@ Assert.assertEquals("[自定义敏感词]", wordList.toString()); - 停顿词 -- 拼音互换 +- 同音字处理 + +- 形近字处理 - 文字镜像翻转 +- 文字降噪处理 + - 敏感词标签支持 +- 邮箱后缀检测 + # 拓展阅读 [敏感词工具实现思路](https://houbb.github.io/2020/01/07/sensitive-word) diff --git a/doc/CHANGE_LOG.md b/doc/CHANGE_LOG.md index 667c253..6049cb0 100644 --- a/doc/CHANGE_LOG.md +++ b/doc/CHANGE_LOG.md @@ -67,4 +67,10 @@ | 序号 | 变更类型 | 说明 | 时间 | 备注 | |:---|:---|:---|:---|:--| -| 1 | A | 添加用户自定义敏感词和白名单 | 2020-1-10 09:34:35 | | \ No newline at end of file +| 1 | A | 添加用户自定义敏感词和白名单 | 2020-1-10 09:34:35 | | + +# release_0.0.9 + +| 序号 | 变更类型 | 说明 | 时间 | 备注 | +|:---|:---|:---|:---|:--| +| 1 | A | 添加邮箱检测 | 2020-1-11 09:34:35 | | \ No newline at end of file diff --git a/pom.xml b/pom.xml index 6a0f95d..e36fd65 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ com.github.houbb sensitive-word - 0.0.9-SNAPSHOT + 0.0.9 diff --git a/release.bat b/release.bat index 210e5d7..fe09a4c 100644 --- a/release.bat +++ b/release.bat @@ -10,9 +10,9 @@ ECHO "============================= RELEASE START..." :: 版本号信息(需要手动指定) :::: 旧版本名称 -SET version=0.0.8 +SET version=0.0.9 :::: 新版本名称 -SET newVersion=0.0.9 +SET newVersion=0.1.0 :::: 组织名称 SET groupName=com.github.houbb :::: 项目名称 diff --git a/src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveCheckChain.java b/src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveCheckChain.java index 48f9924..e0c45e6 100644 --- a/src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveCheckChain.java +++ b/src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveCheckChain.java @@ -38,9 +38,7 @@ public class SensitiveCheckChain implements ISensitiveCheck { } // 循环调用 - //TODO: 这里同时满足两个条件,会出现 BUG for(ISensitiveCheck sensitiveCheck : sensitiveChecks) { - System.out.println(sensitiveCheck.getClass().getSimpleName()+"check start"); int result = sensitiveCheck.checkSensitive(txt, beginIndex, validModeEnum, context); if(result > 0) { diff --git a/src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveEmailCheck.java b/src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveEmailCheck.java index 83fbf8b..0325212 100644 --- a/src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveEmailCheck.java +++ b/src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveEmailCheck.java @@ -11,6 +11,16 @@ import com.github.houbb.sensitive.word.support.format.CharFormatChain; /** * email 正则表达式检测实现。 + * + * TODO: 这里暂时不实现邮箱后缀的实现。 + * + * (1)命中结果应该有标记,属于哪一个验证模式命中 + * (2)后期优化方案可以是: + * 如果数字后面紧跟的是邮箱后缀命中,则直接连接起来 num+email-suffix; + * (3)邮箱后缀的去重 + * 邮箱后缀可以只处理为和 Num 构建,如果没有直接丢弃的模式。 + * + * 也可以严格的保留下来。 * @author binbin.hou * @since 0.0.9 */ @@ -32,13 +42,17 @@ public class SensitiveEmailCheck implements ISensitiveCheck { char mappingChar = Instances.singleton(CharFormatChain.class) .format(currentChar, context); - if(isEmailChar(mappingChar)) { + if(CharUtil.isEmilChar(mappingChar)) { lengthCount++; stringBuilder.append(currentChar); if(isCondition(stringBuilder.toString())) { actualLength = lengthCount; - break; + + // 是否遍历全部匹配的模式 + if(ValidModeEnum.FAIL_FAST.equals(validModeEnum)) { + break; + } } } else { break; @@ -58,15 +72,4 @@ public class SensitiveEmailCheck implements ISensitiveCheck { return RegexUtil.isEmail(string); } - /** - * 是否为组成 email 的字符 - * @param c 字符 - * @return 结果 - * @since 0.0.9 - */ - private boolean isEmailChar(final char c) { - return CharUtil.isDigitOrLetter(c) - || c == '.' || c == '@'; - } - } diff --git a/src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveWordCheck.java b/src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveWordCheck.java index 4236a56..c8206a6 100644 --- a/src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveWordCheck.java +++ b/src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveWordCheck.java @@ -35,9 +35,7 @@ public class SensitiveWordCheck implements ISensitiveCheck { lengthCount++; // 判断是否是敏感词的结尾字,如果是结尾字则判断是否继续检测 - System.out.println("chat is : " + i +"==="+txt.charAt(i)); - System.out.println("now map: " + nowMap.get(AppConst.IS_END)); - boolean isEnd = (boolean) nowMap.get(AppConst.IS_END); + boolean isEnd = isEnd(nowMap); if (isEnd) { // 只在匹配到结束的时候才记录长度,避免不完全匹配导致的问题。 // eg: 敏感词 敏感词xxx @@ -58,6 +56,25 @@ public class SensitiveWordCheck implements ISensitiveCheck { return actualLength; } + /** + * 判断是否结束 + * BUG-FIX: 避免出现敏感词库中没有的文字。 + * @param map map 信息 + * @return 是否结束 + * @since 0.0.9 + */ + private static boolean isEnd(final Map map) { + if(ObjectUtil.isNull(map)) { + return false; + } + + Object value = map.get(AppConst.IS_END); + if(ObjectUtil.isNull(value)) { + return false; + } + + return (boolean)value; + } /** * 获取当前的 Map * @param nowMap 原始的当前 map @@ -75,6 +92,7 @@ public class SensitiveWordCheck implements ISensitiveCheck { char mappingChar = Instances.singleton(CharFormatChain.class).format(c, context); // 这里做一次重复词的处理 + //TODO: 这里可以优化,是否获取一次。 Map currentMap = (Map) nowMap.get(mappingChar); // 启用忽略重复&当前下标不是第一个 if(context.ignoreRepeat() diff --git a/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsEmailTest.java b/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsEmailTest.java index 53edf83..429ff84 100644 --- a/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsEmailTest.java +++ b/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsEmailTest.java @@ -10,7 +10,7 @@ import java.util.List; *

create on 2020/1/7 23:43

* * @author Administrator - * @since 0.0.6 + * @since 0.0.9 */ public class SensitiveWordBsEmailTest { @@ -19,11 +19,23 @@ public class SensitiveWordBsEmailTest { * @since 0.0.9 */ @Test - public void emailTest() { - final String text = "楼主好人,邮箱 123456789@qq.com"; + public void emailEnglishTest() { + final String text = "楼主好人,邮箱 sensitiveword@xx.com"; List wordList = SensitiveWordBs.newInstance().findAll(text); - Assert.assertEquals("[五星紅旗]", wordList.toString()); + Assert.assertEquals("[sensitiveword@xx.com]", wordList.toString()); + } + + /** + * 邮箱测试 + * @since 0.0.9 + */ + @Test + public void emailNumberTest() { + final String text = "楼主好人,邮箱 123456789@xx.com"; + + List wordList = SensitiveWordBs.newInstance().findAll(text); + Assert.assertEquals("[123456789]", wordList.toString()); } }