From c9a9837cf744a3eda05cc1269ea7afe2e208a9a3 Mon Sep 17 00:00:00 2001
From: "binbin.hou" <1060732496@qq.com>
Date: Fri, 10 Jan 2020 17:29:18 +0800
Subject: [PATCH] [Feature] add email detection check (SensitiveEmailCheck)
---
...箱URL的转换实现.md => v011-邮箱检测实现.md} | 7 ++
...1-镜像反转处理.md => v015-镜像反转处理.md} | 4 +-
doc/issues/roadmap/v016-自定义降噪处理.md | 8 +++
pom.xml | 5 +-
.../sensitive/word/api/IWordContext.java | 15 ++++
.../sensitive/word/bs/SensitiveWordBs.java | 1 +
.../word/bs/SensitiveWordContext.java | 17 +++++
.../support/check/SensitiveCheckChain.java | 10 +++
.../support/check/SensitiveEmailCheck.java | 72 +++++++++++++++++++
.../support/check/SensitiveWordCheck.java | 2 +
.../word/bs/SensitiveWordBsEmailTest.java | 29 ++++++++
11 files changed, 167 insertions(+), 3 deletions(-)
rename doc/issues/roadmap/{v013-邮箱URL的转换实现.md => v011-邮箱检测实现.md} (67%)
rename doc/issues/roadmap/{v011-镜像反转处理.md => v015-镜像反转处理.md} (82%)
create mode 100644 doc/issues/roadmap/v016-自定义降噪处理.md
create mode 100644 src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveEmailCheck.java
create mode 100644 src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsEmailTest.java
diff --git a/doc/issues/roadmap/v013-邮箱URL的转换实现.md b/doc/issues/roadmap/v011-邮箱检测实现.md
similarity index 67%
rename from doc/issues/roadmap/v013-邮箱URL的转换实现.md
rename to doc/issues/roadmap/v011-邮箱检测实现.md
index d2f4884..c97ed2a 100644
--- a/doc/issues/roadmap/v013-邮箱URL的转换实现.md
+++ b/doc/issues/roadmap/v011-邮箱检测实现.md
@@ -1,5 +1,12 @@
# 是否为邮箱 check
+
+==================
+
+网址等等
+
+URL 初期可以不做。
+
# 是否为 URL check
可以直接开辟另一道验证方式。
diff --git a/doc/issues/roadmap/v011-镜像反转处理.md b/doc/issues/roadmap/v015-镜像反转处理.md
similarity index 82%
rename from doc/issues/roadmap/v011-镜像反转处理.md
rename to doc/issues/roadmap/v015-镜像反转处理.md
index 15385fd..e5a52e4 100644
--- a/doc/issues/roadmap/v011-镜像反转处理.md
+++ b/doc/issues/roadmap/v015-镜像反转处理.md
@@ -4,4 +4,6 @@
你大爷
-一句话如果反转之后是敏感词,那应该就是敏感词。
\ No newline at end of file
+一句话如果反转之后是敏感词,那应该就是敏感词。
+
+这个不是很着急用。
\ No newline at end of file
diff --git a/doc/issues/roadmap/v016-自定义降噪处理.md b/doc/issues/roadmap/v016-自定义降噪处理.md
new file mode 100644
index 0000000..e444937
--- /dev/null
+++ b/doc/issues/roadmap/v016-自定义降噪处理.md
@@ -0,0 +1,8 @@
+有时候噪音是恶意插入的,程序本身难以辨认。
+
+比如:
+
+```
+123我是噪音456我是噪音789
+````
+
diff --git a/pom.xml b/pom.xml
index 9497c34..6a0f95d 100644
--- a/pom.xml
+++ b/pom.xml
@@ -6,7 +6,7 @@
     <groupId>com.github.houbb</groupId>
     <artifactId>sensitive-word</artifactId>
-    <version>0.0.8</version>
+    <version>0.0.9-SNAPSHOT</version>
@@ -25,8 +25,9 @@
1.7
- 0.1.68
+ 0.1.69-SNAPSHOT
1.2.0
+
4.12
diff --git a/src/main/java/com/github/houbb/sensitive/word/api/IWordContext.java b/src/main/java/com/github/houbb/sensitive/word/api/IWordContext.java
index 29ff5ce..ec1f268 100644
--- a/src/main/java/com/github/houbb/sensitive/word/api/IWordContext.java
+++ b/src/main/java/com/github/houbb/sensitive/word/api/IWordContext.java
@@ -98,6 +98,21 @@ public interface IWordContext {
*/
IWordContext sensitiveNumCheck(final boolean sensitiveNumCheck);
+ /**
+ * Whether email detection is enabled.
+ * @return {@code true} if email detection should be performed
+ * @since 0.0.9
+ */
+ boolean sensitiveEmailCheck();
+
+ /**
+ * Enables or disables email detection.
+ * @param sensitiveEmailCheck whether email detection should be performed
+ * @return this
+ * @since 0.0.9
+ */
+ IWordContext sensitiveEmailCheck(final boolean sensitiveEmailCheck);
+
/**
* 忽略英文的写法
* @return 数字检测
diff --git a/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java b/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java
index da2fa86..f2c5cf1 100644
--- a/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java
+++ b/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java
@@ -89,6 +89,7 @@ public class SensitiveWordBs {
// 开启校验
wordContext.sensitiveNumCheck(true);
+ wordContext.sensitiveEmailCheck(true);
return wordContext;
}
diff --git a/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordContext.java b/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordContext.java
index 9cdf438..b611f62 100644
--- a/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordContext.java
+++ b/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordContext.java
@@ -59,6 +59,12 @@ public class SensitiveWordContext implements IWordContext {
*/
private boolean ignoreRepeat;
+ /**
+ * Whether email detection is enabled.
+ * @since 0.0.9
+ */
+ private boolean sensitiveEmailCheck;
+
/**
* 私有化构造器
* @since 0.0.4
@@ -163,4 +169,15 @@ public class SensitiveWordContext implements IWordContext {
return this;
}
+    /**
+     * Returns the email-detection flag stored in this context.
+     * @return the current value of the email-detection flag
+     * @since 0.0.9
+     */
+    @Override
+    public boolean sensitiveEmailCheck() {
+        return this.sensitiveEmailCheck;
+    }
+
+    /**
+     * Stores the email-detection flag in this context.
+     * @param sensitiveEmailCheck the new value of the email-detection flag
+     * @return this context, to allow call chaining
+     * @since 0.0.9
+     */
+    @Override
+    public SensitiveWordContext sensitiveEmailCheck(final boolean sensitiveEmailCheck) {
+        this.sensitiveEmailCheck = sensitiveEmailCheck;
+        return this;
+    }
+
}
diff --git a/src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveCheckChain.java b/src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveCheckChain.java
index 682b872..48f9924 100644
--- a/src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveCheckChain.java
+++ b/src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveCheckChain.java
@@ -13,6 +13,11 @@ import java.util.List;
* 敏感词检测责任链模式
*
* 这里可以提供一个公共的父类。
+ *
+ *
+ * DFA 算法的优化可以参考论文:
+ * 【DFA 算法】各种论文。
+ *
* @author binbin.hou
* @since 0.0.5
*/
@@ -28,9 +33,14 @@ public class SensitiveCheckChain implements ISensitiveCheck {
if(context.sensitiveNumCheck()) {
sensitiveChecks.add(Instances.singleton(SensitiveNumCheck.class));
}
+ if(context.sensitiveEmailCheck()) {
+ sensitiveChecks.add(Instances.singleton(SensitiveEmailCheck.class));
+ }
// 循环调用
+ //TODO: 这里同时满足两个条件,会出现 BUG
for(ISensitiveCheck sensitiveCheck : sensitiveChecks) {
+ System.out.println(sensitiveCheck.getClass().getSimpleName()+"check start");
int result = sensitiveCheck.checkSensitive(txt, beginIndex, validModeEnum, context);
if(result > 0) {
diff --git a/src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveEmailCheck.java b/src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveEmailCheck.java
new file mode 100644
index 0000000..83fbf8b
--- /dev/null
+++ b/src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveEmailCheck.java
@@ -0,0 +1,72 @@
+package com.github.houbb.sensitive.word.support.check;
+
+import com.github.houbb.heaven.annotation.ThreadSafe;
+import com.github.houbb.heaven.support.instance.impl.Instances;
+import com.github.houbb.heaven.util.lang.CharUtil;
+import com.github.houbb.heaven.util.util.regex.RegexUtil;
+import com.github.houbb.sensitive.word.api.ISensitiveCheck;
+import com.github.houbb.sensitive.word.api.IWordContext;
+import com.github.houbb.sensitive.word.constant.enums.ValidModeEnum;
+import com.github.houbb.sensitive.word.support.format.CharFormatChain;
+
+/**
+ * Email detection backed by a regular expression.
+ *
+ * <p>Starting at a given index, consecutive characters that may appear in an
+ * email address are accumulated, and the accumulated text is tested with
+ * {@link RegexUtil#isEmail(String)} after every character.</p>
+ *
+ * @author binbin.hou
+ * @since 0.0.9
+ */
+@ThreadSafe
+public class SensitiveEmailCheck implements ISensitiveCheck {
+
+    /**
+     * Returns the length of the longest email address that starts exactly at
+     * {@code beginIndex}.
+     *
+     * <p>Scanning stops at the first character that cannot be part of an email
+     * (after formatting through {@link CharFormatChain}) or at the end of the text.
+     * The scan deliberately does NOT stop at the first prefix accepted by the
+     * regular expression: a shorter prefix of an address may already look like a
+     * valid address, and stopping there would report a truncated match.</p>
+     *
+     * @param txt           text to scan
+     * @param beginIndex    index of the first character to examine
+     * @param validModeEnum validation mode; not used by this check
+     * @param context       context passed to the character formatter
+     * @return length of the matched email, or 0 if none starts at {@code beginIndex}
+     */
+    @Override
+    public int checkSensitive(String txt, int beginIndex, ValidModeEnum validModeEnum, IWordContext context) {
+        // Length of the longest accumulated prefix accepted as an email so far.
+        int actualLength = 0;
+
+        // Same singleton on every iteration, so look it up once instead of per character.
+        final CharFormatChain formatChain = Instances.singleton(CharFormatChain.class);
+
+        // Shortcut: plain string accumulation combined with a regular expression.
+        // A DFA could express the same pattern and would likely be faster;
+        // this may be optimized later.
+        final StringBuilder candidate = new StringBuilder();
+        for (int i = beginIndex; i < txt.length(); i++) {
+            final char currentChar = txt.charAt(i);
+            final char mappingChar = formatChain.format(currentChar, context);
+
+            if (!isEmailChar(mappingChar)) {
+                break;
+            }
+
+            // The original (unformatted) character is what gets tested by the regex.
+            candidate.append(currentChar);
+            if (isCondition(candidate.toString())) {
+                // Keep scanning: a longer prefix may still be a valid email.
+                actualLength = candidate.length();
+            }
+        }
+
+        return actualLength;
+    }
+
+    /**
+     * Tests whether the accumulated text satisfies the match condition.
+     * @param string accumulated candidate text
+     * @return {@code true} if {@link RegexUtil#isEmail(String)} accepts the text
+     * @since 0.0.9
+     */
+    private boolean isCondition(final String string) {
+        return RegexUtil.isEmail(string);
+    }
+
+    /**
+     * Tests whether a character may be part of an email address:
+     * a digit, a letter, {@code '.'} or {@code '@'}.
+     * @param c character to test
+     * @return {@code true} if the character may appear in an email
+     * @since 0.0.9
+     */
+    private boolean isEmailChar(final char c) {
+        return CharUtil.isDigitOrLetter(c)
+                || c == '.' || c == '@';
+    }
+
+}
diff --git a/src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveWordCheck.java b/src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveWordCheck.java
index b1f27d5..4236a56 100644
--- a/src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveWordCheck.java
+++ b/src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveWordCheck.java
@@ -35,6 +35,8 @@ public class SensitiveWordCheck implements ISensitiveCheck {
lengthCount++;
// 判断是否是敏感词的结尾字,如果是结尾字则判断是否继续检测
+ System.out.println("chat is : " + i +"==="+txt.charAt(i));
+ System.out.println("now map: " + nowMap.get(AppConst.IS_END));
boolean isEnd = (boolean) nowMap.get(AppConst.IS_END);
if (isEnd) {
// 只在匹配到结束的时候才记录长度,避免不完全匹配导致的问题。
diff --git a/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsEmailTest.java b/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsEmailTest.java
new file mode 100644
index 0000000..53edf83
--- /dev/null
+++ b/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsEmailTest.java
@@ -0,0 +1,29 @@
+package com.github.houbb.sensitive.word.bs;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.util.List;
+
+/**
+ * <p> project: sensitive-word-SensitiveWordBsTest </p>
+ * <p> create on 2020/1/7 23:43 </p>
+ *
+ * @author Administrator
+ * @since 0.0.9
+ */
+public class SensitiveWordBsEmailTest {
+
+    /**
+     * Email detection test: the only expected hit in the text is the email address.
+     *
+     * NOTE(review): the check chain runs the number check before the email check
+     * (see the TODO in SensitiveCheckChain), so the digit-only local part may be
+     * claimed by the number check first; confirm once that TODO is resolved.
+     * @since 0.0.9
+     */
+    @Test
+    public void emailTest() {
+        final String text = "楼主好人,邮箱 123456789@qq.com";
+
+        List<String> wordList = SensitiveWordBs.newInstance().findAll(text);
+        Assert.assertEquals("[123456789@qq.com]", wordList.toString());
+    }
+
+}