mirror of
https://github.com/houbb/sensitive-word.git
synced 2026-03-22 08:27:36 +08:00
[Feature] add for new
This commit is contained in:
@@ -1,5 +1,12 @@
|
||||
# 是否为邮箱 check
|
||||
|
||||
|
||||
==================
|
||||
|
||||
网址等等
|
||||
|
||||
URL 初期可以不做。
|
||||
|
||||
# 是否为 URL check
|
||||
|
||||
可以直接开辟另一道验证方式。
|
||||
@@ -4,4 +4,6 @@
|
||||
|
||||
你大爷
|
||||
|
||||
一句话如果反转之后是敏感词,那应该就是敏感词。
|
||||
一句话如果反转之后是敏感词,那应该就是敏感词。
|
||||
|
||||
这个不是很着急用。
|
||||
8
doc/issues/roadmap/v016-自定义降噪处理.md
Normal file
8
doc/issues/roadmap/v016-自定义降噪处理.md
Normal file
@@ -0,0 +1,8 @@
|
||||
有时候噪音是恶意插入的,程序本身难以辨认。
|
||||
|
||||
比如:
|
||||
|
||||
```
|
||||
123我是噪音456我是噪音789
|
||||
```
|
||||
|
||||
5
pom.xml
5
pom.xml
@@ -6,7 +6,7 @@
|
||||
|
||||
<groupId>com.github.houbb</groupId>
|
||||
<artifactId>sensitive-word</artifactId>
|
||||
<version>0.0.8</version>
|
||||
<version>0.0.9-SNAPSHOT</version>
|
||||
|
||||
<properties>
|
||||
<!--============================== All Plugins START ==============================-->
|
||||
@@ -25,8 +25,9 @@
|
||||
<project.compiler.level>1.7</project.compiler.level>
|
||||
|
||||
<!--============================== INTER ==============================-->
|
||||
<heaven.version>0.1.68</heaven.version>
|
||||
<heaven.version>0.1.69-SNAPSHOT</heaven.version>
|
||||
<opencc4j.version>1.2.0</opencc4j.version>
|
||||
|
||||
<!--============================== OTHER ==============================-->
|
||||
<junit.version>4.12</junit.version>
|
||||
</properties>
|
||||
|
||||
@@ -98,6 +98,21 @@ public interface IWordContext {
|
||||
*/
|
||||
IWordContext sensitiveNumCheck(final boolean sensitiveNumCheck);
|
||||
|
||||
/**
|
||||
* Whether email detection is enabled.
|
||||
* @return {@code true} if email detection is enabled, {@code false} otherwise
|
||||
* @since 0.0.9
|
||||
*/
|
||||
boolean sensitiveEmailCheck();
|
||||
|
||||
/**
|
||||
* Enables or disables email detection.
|
||||
* @param sensitiveEmailCheck whether email addresses should be detected
|
||||
* @return this
|
||||
* @since 0.0.9
|
||||
*/
|
||||
IWordContext sensitiveEmailCheck(final boolean sensitiveEmailCheck);
|
||||
|
||||
/**
|
||||
* 忽略英文的写法
|
||||
* @return 数字检测
|
||||
|
||||
@@ -89,6 +89,7 @@ public class SensitiveWordBs {
|
||||
|
||||
// 开启校验
|
||||
wordContext.sensitiveNumCheck(true);
|
||||
wordContext.sensitiveEmailCheck(true);
|
||||
|
||||
return wordContext;
|
||||
}
|
||||
|
||||
@@ -59,6 +59,12 @@ public class SensitiveWordContext implements IWordContext {
|
||||
*/
|
||||
private boolean ignoreRepeat;
|
||||
|
||||
/**
|
||||
* Whether email detection is enabled.
|
||||
* @since 0.0.9
|
||||
*/
|
||||
private boolean sensitiveEmailCheck;
|
||||
|
||||
/**
|
||||
* 私有化构造器
|
||||
* @since 0.0.4
|
||||
@@ -163,4 +169,15 @@ public class SensitiveWordContext implements IWordContext {
|
||||
return this;
|
||||
}
|
||||
|
||||
// Returns the current value of the email-detection flag.
@Override
|
||||
public boolean sensitiveEmailCheck() {
|
||||
return sensitiveEmailCheck;
|
||||
}
|
||||
|
||||
// Stores the email-detection flag and returns this context so calls can be chained.
@Override
|
||||
public SensitiveWordContext sensitiveEmailCheck(boolean sensitiveEmailCheck) {
|
||||
this.sensitiveEmailCheck = sensitiveEmailCheck;
|
||||
return this;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -13,6 +13,11 @@ import java.util.List;
|
||||
* 敏感词检测责任链模式
|
||||
*
|
||||
* 这里可以提供一个公共的父类。
|
||||
*
|
||||
*
|
||||
* DFA 算法的优化可以参考论文:
|
||||
* 【DFA 算法】各种论文。
|
||||
*
|
||||
* @author binbin.hou
|
||||
* @since 0.0.5
|
||||
*/
|
||||
@@ -28,9 +33,14 @@ public class SensitiveCheckChain implements ISensitiveCheck {
|
||||
if(context.sensitiveNumCheck()) {
|
||||
sensitiveChecks.add(Instances.singleton(SensitiveNumCheck.class));
|
||||
}
|
||||
if(context.sensitiveEmailCheck()) {
|
||||
sensitiveChecks.add(Instances.singleton(SensitiveEmailCheck.class));
|
||||
}
|
||||
|
||||
// 循环调用
|
||||
//TODO: 这里同时满足两个条件,会出现 BUG
|
||||
for(ISensitiveCheck sensitiveCheck : sensitiveChecks) {
|
||||
System.out.println(sensitiveCheck.getClass().getSimpleName()+"check start");
|
||||
int result = sensitiveCheck.checkSensitive(txt, beginIndex, validModeEnum, context);
|
||||
|
||||
if(result > 0) {
|
||||
|
||||
@@ -0,0 +1,72 @@
|
||||
package com.github.houbb.sensitive.word.support.check;
|
||||
|
||||
import com.github.houbb.heaven.annotation.ThreadSafe;
|
||||
import com.github.houbb.heaven.support.instance.impl.Instances;
|
||||
import com.github.houbb.heaven.util.lang.CharUtil;
|
||||
import com.github.houbb.heaven.util.util.regex.RegexUtil;
|
||||
import com.github.houbb.sensitive.word.api.ISensitiveCheck;
|
||||
import com.github.houbb.sensitive.word.api.IWordContext;
|
||||
import com.github.houbb.sensitive.word.constant.enums.ValidModeEnum;
|
||||
import com.github.houbb.sensitive.word.support.format.CharFormatChain;
|
||||
|
||||
/**
|
||||
* email 正则表达式检测实现。
|
||||
* @author binbin.hou
|
||||
* @since 0.0.9
|
||||
*/
|
||||
@ThreadSafe
|
||||
public class SensitiveEmailCheck implements ISensitiveCheck {
|
||||
|
||||
@Override
|
||||
public int checkSensitive(String txt, int beginIndex, ValidModeEnum validModeEnum, IWordContext context) {
|
||||
// 记录敏感词的长度
|
||||
int lengthCount = 0;
|
||||
int actualLength = 0;
|
||||
|
||||
StringBuilder stringBuilder = new StringBuilder();
|
||||
// 这里偷懒直接使用 String 拼接,然后结合正则表达式。
|
||||
// DFA 本质就可以做正则表达式,这样实现不免性能会差一些。
|
||||
// 后期如果有想法,对 DFA 进一步深入学习后,将进行优化。
|
||||
for(int i = beginIndex; i < txt.length(); i++) {
|
||||
char currentChar = txt.charAt(i);
|
||||
char mappingChar = Instances.singleton(CharFormatChain.class)
|
||||
.format(currentChar, context);
|
||||
|
||||
if(isEmailChar(mappingChar)) {
|
||||
lengthCount++;
|
||||
stringBuilder.append(currentChar);
|
||||
|
||||
if(isCondition(stringBuilder.toString())) {
|
||||
actualLength = lengthCount;
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return actualLength;
|
||||
}
|
||||
|
||||
/**
|
||||
* 这里指定一个阈值条件
|
||||
* @param string 长度
|
||||
* @return 是否满足条件
|
||||
* @since 0.0.9
|
||||
*/
|
||||
private boolean isCondition(final String string) {
|
||||
return RegexUtil.isEmail(string);
|
||||
}
|
||||
|
||||
/**
|
||||
* 是否为组成 email 的字符
|
||||
* @param c 字符
|
||||
* @return 结果
|
||||
* @since 0.0.9
|
||||
*/
|
||||
private boolean isEmailChar(final char c) {
|
||||
return CharUtil.isDigitOrLetter(c)
|
||||
|| c == '.' || c == '@';
|
||||
}
|
||||
|
||||
}
|
||||
@@ -35,6 +35,8 @@ public class SensitiveWordCheck implements ISensitiveCheck {
|
||||
lengthCount++;
|
||||
|
||||
// 判断是否是敏感词的结尾字,如果是结尾字则判断是否继续检测
|
||||
System.out.println("chat is : " + i +"==="+txt.charAt(i));
|
||||
System.out.println("now map: " + nowMap.get(AppConst.IS_END));
|
||||
boolean isEnd = (boolean) nowMap.get(AppConst.IS_END);
|
||||
if (isEnd) {
|
||||
// 只在匹配到结束的时候才记录长度,避免不完全匹配导致的问题。
|
||||
|
||||
@@ -0,0 +1,29 @@
|
||||
package com.github.houbb.sensitive.word.bs;
|
||||
|
||||
import org.junit.Assert;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* <p> project: sensitive-word-SensitiveWordBsTest </p>
|
||||
* <p> create on 2020/1/7 23:43 </p>
|
||||
*
|
||||
* @author Administrator
|
||||
* @since 0.0.6
|
||||
*/
|
||||
public class SensitiveWordBsEmailTest {
|
||||
|
||||
/**
|
||||
* 邮箱测试
|
||||
* @since 0.0.9
|
||||
*/
|
||||
@Test
|
||||
public void emailTest() {
|
||||
final String text = "楼主好人,邮箱 123456789@qq.com";
|
||||
|
||||
List<String> wordList = SensitiveWordBs.newInstance().findAll(text);
|
||||
Assert.assertEquals("[五星紅旗]", wordList.toString());
|
||||
}
|
||||
|
||||
}
|
||||
Reference in New Issue
Block a user