From c9a9837cf744a3eda05cc1269ea7afe2e208a9a3 Mon Sep 17 00:00:00 2001 From: "binbin.hou" <1060732496@qq.com> Date: Fri, 10 Jan 2020 17:29:18 +0800 Subject: [PATCH] [Feature] add for new --- ...箱URL的转换实现.md => v011-邮箱检测实现.md} | 7 ++ ...1-镜像反转处理.md => v015-镜像反转处理.md} | 4 +- doc/issues/roadmap/v016-自定义降噪处理.md | 8 +++ pom.xml | 5 +- .../sensitive/word/api/IWordContext.java | 15 ++++ .../sensitive/word/bs/SensitiveWordBs.java | 1 + .../word/bs/SensitiveWordContext.java | 17 +++++ .../support/check/SensitiveCheckChain.java | 10 +++ .../support/check/SensitiveEmailCheck.java | 72 +++++++++++++++++++ .../support/check/SensitiveWordCheck.java | 2 + .../word/bs/SensitiveWordBsEmailTest.java | 29 ++++++++ 11 files changed, 167 insertions(+), 3 deletions(-) rename doc/issues/roadmap/{v013-邮箱URL的转换实现.md => v011-邮箱检测实现.md} (67%) rename doc/issues/roadmap/{v011-镜像反转处理.md => v015-镜像反转处理.md} (82%) create mode 100644 doc/issues/roadmap/v016-自定义降噪处理.md create mode 100644 src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveEmailCheck.java create mode 100644 src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsEmailTest.java diff --git a/doc/issues/roadmap/v013-邮箱URL的转换实现.md b/doc/issues/roadmap/v011-邮箱检测实现.md similarity index 67% rename from doc/issues/roadmap/v013-邮箱URL的转换实现.md rename to doc/issues/roadmap/v011-邮箱检测实现.md index d2f4884..c97ed2a 100644 --- a/doc/issues/roadmap/v013-邮箱URL的转换实现.md +++ b/doc/issues/roadmap/v011-邮箱检测实现.md @@ -1,5 +1,12 @@ # 是否为邮箱 check + +================== + +网址等等 + +URL 初期可以不做。 + # 是否为 URL check 可以直接开辟另一道验证方式。 diff --git a/doc/issues/roadmap/v011-镜像反转处理.md b/doc/issues/roadmap/v015-镜像反转处理.md similarity index 82% rename from doc/issues/roadmap/v011-镜像反转处理.md rename to doc/issues/roadmap/v015-镜像反转处理.md index 15385fd..e5a52e4 100644 --- a/doc/issues/roadmap/v011-镜像反转处理.md +++ b/doc/issues/roadmap/v015-镜像反转处理.md @@ -4,4 +4,6 @@ 你大爷 -一句话如果反转之后是敏感词,那应该就是敏感词。 \ No newline at end of file +一句话如果反转之后是敏感词,那应该就是敏感词。 + +这个不是很着急用。 \ No 
newline at end of file diff --git a/doc/issues/roadmap/v016-自定义降噪处理.md b/doc/issues/roadmap/v016-自定义降噪处理.md new file mode 100644 index 0000000..e444937 --- /dev/null +++ b/doc/issues/roadmap/v016-自定义降噪处理.md @@ -0,0 +1,8 @@ +有时候噪音是恶意插入的,程序本身难以辨认。 + +比如: + +``` +123我是噪音456我是噪音789 +```` + diff --git a/pom.xml b/pom.xml index 9497c34..6a0f95d 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ com.github.houbb sensitive-word - 0.0.8 + 0.0.9-SNAPSHOT @@ -25,8 +25,9 @@ 1.7 - 0.1.68 + 0.1.69-SNAPSHOT 1.2.0 + 4.12 diff --git a/src/main/java/com/github/houbb/sensitive/word/api/IWordContext.java b/src/main/java/com/github/houbb/sensitive/word/api/IWordContext.java index 29ff5ce..ec1f268 100644 --- a/src/main/java/com/github/houbb/sensitive/word/api/IWordContext.java +++ b/src/main/java/com/github/houbb/sensitive/word/api/IWordContext.java @@ -98,6 +98,21 @@ public interface IWordContext { */ IWordContext sensitiveNumCheck(final boolean sensitiveNumCheck); + /** + * 是否进行邮箱检测 + * @return {@code true} if email detection is enabled + * @since 0.0.9 + */ + boolean sensitiveEmailCheck(); + + /** + * 设置敏感邮箱检测 + * @param sensitiveEmailCheck 是否检测 + * @return this + * @since 0.0.9 + */ + IWordContext sensitiveEmailCheck(final boolean sensitiveEmailCheck); + /** * 忽略英文的写法 * @return 数字检测 diff --git a/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java b/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java index da2fa86..f2c5cf1 100644 --- a/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java +++ b/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java @@ -89,6 +89,7 @@ public class SensitiveWordBs { // 开启校验 wordContext.sensitiveNumCheck(true); + wordContext.sensitiveEmailCheck(true); return wordContext; } diff --git a/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordContext.java b/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordContext.java index 9cdf438..b611f62 100644 ---
a/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordContext.java +++ b/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordContext.java @@ -59,6 +59,12 @@ public class SensitiveWordContext implements IWordContext { */ private boolean ignoreRepeat; + /** + * 是否进行邮箱测试 + * @since 0.0.9 + */ + private boolean sensitiveEmailCheck; + /** * 私有化构造器 * @since 0.0.4 @@ -163,4 +169,15 @@ public class SensitiveWordContext implements IWordContext { return this; } + @Override + public boolean sensitiveEmailCheck() { + return sensitiveEmailCheck; + } + + @Override + public SensitiveWordContext sensitiveEmailCheck(boolean sensitiveEmailCheck) { + this.sensitiveEmailCheck = sensitiveEmailCheck; + return this; + } + } diff --git a/src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveCheckChain.java b/src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveCheckChain.java index 682b872..48f9924 100644 --- a/src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveCheckChain.java +++ b/src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveCheckChain.java @@ -13,6 +13,11 @@ import java.util.List; * 敏感词检测责任链模式 * * 这里可以提供一个公共的父类。 + * + * + * DFA 算法的优化可以参考论文: + * 【DFA 算法】各种论文。 + * * @author binbin.hou * @since 0.0.5 */ @@ -28,9 +33,14 @@ public class SensitiveCheckChain implements ISensitiveCheck { if(context.sensitiveNumCheck()) { sensitiveChecks.add(Instances.singleton(SensitiveNumCheck.class)); } + if(context.sensitiveEmailCheck()) { + sensitiveChecks.add(Instances.singleton(SensitiveEmailCheck.class)); + } // 循环调用 + //TODO: 这里同时满足两个条件,会出现 BUG for(ISensitiveCheck sensitiveCheck : sensitiveChecks) { + System.out.println(sensitiveCheck.getClass().getSimpleName()+"check start"); int result = sensitiveCheck.checkSensitive(txt, beginIndex, validModeEnum, context); if(result > 0) { diff --git a/src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveEmailCheck.java 
b/src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveEmailCheck.java new file mode 100644 index 0000000..83fbf8b --- /dev/null +++ b/src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveEmailCheck.java @@ -0,0 +1,72 @@ +package com.github.houbb.sensitive.word.support.check; + +import com.github.houbb.heaven.annotation.ThreadSafe; +import com.github.houbb.heaven.support.instance.impl.Instances; +import com.github.houbb.heaven.util.lang.CharUtil; +import com.github.houbb.heaven.util.util.regex.RegexUtil; +import com.github.houbb.sensitive.word.api.ISensitiveCheck; +import com.github.houbb.sensitive.word.api.IWordContext; +import com.github.houbb.sensitive.word.constant.enums.ValidModeEnum; +import com.github.houbb.sensitive.word.support.format.CharFormatChain; + +/** + * email 正则表达式检测实现。 + * @author binbin.hou + * @since 0.0.9 + */ +@ThreadSafe +public class SensitiveEmailCheck implements ISensitiveCheck { + + @Override + public int checkSensitive(String txt, int beginIndex, ValidModeEnum validModeEnum, IWordContext context) { + // 记录敏感词的长度 + int lengthCount = 0; + int actualLength = 0; + + StringBuilder stringBuilder = new StringBuilder(); + // 这里偷懒直接使用 String 拼接,然后结合正则表达式。 + // DFA 本质就可以做正则表达式,这样实现不免性能会差一些。 + // 后期如果有想法,对 DFA 进一步深入学习后,将进行优化。 + for(int i = beginIndex; i < txt.length(); i++) { + char currentChar = txt.charAt(i); + char mappingChar = Instances.singleton(CharFormatChain.class) + .format(currentChar, context); + + if(isEmailChar(mappingChar)) { + lengthCount++; + stringBuilder.append(currentChar); + + if(isCondition(stringBuilder.toString())) { + actualLength = lengthCount; + break; + } + } else { + break; + } + } + + return actualLength; + } + + /** + * 这里指定一个阈值条件 + * @param string 长度 + * @return 是否满足条件 + * @since 0.0.9 + */ + private boolean isCondition(final String string) { + return RegexUtil.isEmail(string); + } + + /** + * 是否为组成 email 的字符 + * @param c 字符 + * @return 结果 + * @since 0.0.9 + */ + private boolean 
isEmailChar(final char c) { + return CharUtil.isDigitOrLetter(c) + || c == '.' || c == '@'; + } + +} diff --git a/src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveWordCheck.java b/src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveWordCheck.java index b1f27d5..4236a56 100644 --- a/src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveWordCheck.java +++ b/src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveWordCheck.java @@ -35,6 +35,8 @@ public class SensitiveWordCheck implements ISensitiveCheck { lengthCount++; // 判断是否是敏感词的结尾字,如果是结尾字则判断是否继续检测 + System.out.println("chat is : " + i +"==="+txt.charAt(i)); + System.out.println("now map: " + nowMap.get(AppConst.IS_END)); boolean isEnd = (boolean) nowMap.get(AppConst.IS_END); if (isEnd) { // 只在匹配到结束的时候才记录长度,避免不完全匹配导致的问题。 diff --git a/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsEmailTest.java b/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsEmailTest.java new file mode 100644 index 0000000..53edf83 --- /dev/null +++ b/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsEmailTest.java @@ -0,0 +1,29 @@ +package com.github.houbb.sensitive.word.bs; + +import org.junit.Assert; +import org.junit.Test; + +import java.util.List; + +/** + *

<p> project: sensitive-word-SensitiveWordBsTest </p>

+ *

<p> create on 2020/1/7 23:43 </p>

+ * + * @author Administrator + * @since 0.0.6 + */ +public class SensitiveWordBsEmailTest { + + /** + * 邮箱测试 + * @since 0.0.9 + */ + @Test + public void emailTest() { + final String text = "楼主好人,邮箱 123456789@qq.com"; + + List wordList = SensitiveWordBs.newInstance().findAll(text); + Assert.assertEquals("[五星紅旗]", wordList.toString()); + } + +}