mirror of
https://github.com/houbb/sensitive-word.git
synced 2026-03-22 08:27:36 +08:00
release branch 0.0.9
This commit is contained in:
17
README.md
17
README.md
@@ -180,6 +180,15 @@ List<String> wordList = SensitiveWordBs.newInstance().findAll(text);
|
||||
Assert.assertEquals("[ⒻⒻⒻfⓤuⓤ⒰cⓒ⒦]", wordList.toString());
|
||||
```
|
||||
|
||||
## 邮箱检测
|
||||
|
||||
```java
|
||||
final String text = "楼主好人,邮箱 sensitiveword@xx.com";
|
||||
|
||||
List<String> wordList = SensitiveWordBs.newInstance().findAll(text);
|
||||
Assert.assertEquals("[sensitiveword@xx.com]", wordList.toString());
|
||||
```
|
||||
|
||||
# 用户自定义
|
||||
|
||||
## 敏感词和白名单
|
||||
@@ -208,12 +217,18 @@ Assert.assertEquals("[自定义敏感词]", wordList.toString());
|
||||
|
||||
- 停顿词
|
||||
|
||||
- 拼音互换
|
||||
- 同音字处理
|
||||
|
||||
- 形近字处理
|
||||
|
||||
- 文字镜像翻转
|
||||
|
||||
- 文字降噪处理
|
||||
|
||||
- 敏感词标签支持
|
||||
|
||||
- 邮箱后缀检测
|
||||
|
||||
# 拓展阅读
|
||||
|
||||
[敏感词工具实现思路](https://houbb.github.io/2020/01/07/sensitive-word)
|
||||
|
||||
@@ -67,4 +67,10 @@
|
||||
|
||||
| 序号 | 变更类型 | 说明 | 时间 | 备注 |
|
||||
|:---|:---|:---|:---|:--|
|
||||
| 1 | A | 添加用户自定义敏感词和白名单 | 2020-1-10 09:34:35 | |
|
||||
| 1 | A | 添加用户自定义敏感词和白名单 | 2020-1-10 09:34:35 | |
|
||||
|
||||
# release_0.0.9
|
||||
|
||||
| 序号 | 变更类型 | 说明 | 时间 | 备注 |
|
||||
|:---|:---|:---|:---|:--|
|
||||
| 1 | A | 添加邮箱检测 | 2020-1-11 09:34:35 | |
|
||||
2
pom.xml
2
pom.xml
@@ -6,7 +6,7 @@
|
||||
|
||||
<groupId>com.github.houbb</groupId>
|
||||
<artifactId>sensitive-word</artifactId>
|
||||
<version>0.0.9-SNAPSHOT</version>
|
||||
<version>0.0.9</version>
|
||||
|
||||
<properties>
|
||||
<!--============================== All Plugins START ==============================-->
|
||||
|
||||
@@ -10,9 +10,9 @@ ECHO "============================= RELEASE START..."
|
||||
|
||||
:: 版本号信息(需要手动指定)
|
||||
:::: 旧版本名称
|
||||
SET version=0.0.8
|
||||
SET version=0.0.9
|
||||
:::: 新版本名称
|
||||
SET newVersion=0.0.9
|
||||
SET newVersion=0.1.0
|
||||
:::: 组织名称
|
||||
SET groupName=com.github.houbb
|
||||
:::: 项目名称
|
||||
|
||||
@@ -38,9 +38,7 @@ public class SensitiveCheckChain implements ISensitiveCheck {
|
||||
}
|
||||
|
||||
// 循环调用
|
||||
//TODO: 这里同时满足两个条件,会出现 BUG
|
||||
for(ISensitiveCheck sensitiveCheck : sensitiveChecks) {
|
||||
System.out.println(sensitiveCheck.getClass().getSimpleName()+"check start");
|
||||
int result = sensitiveCheck.checkSensitive(txt, beginIndex, validModeEnum, context);
|
||||
|
||||
if(result > 0) {
|
||||
|
||||
@@ -11,6 +11,16 @@ import com.github.houbb.sensitive.word.support.format.CharFormatChain;
|
||||
|
||||
/**
|
||||
* email 正则表达式检测实现。
|
||||
*
|
||||
* TODO: 这里暂时不实现邮箱后缀的检测。
|
||||
*
|
||||
* (1)命中结果应该有标记,属于哪一个验证模式命中
|
||||
* (2)后期优化方案可以是:
|
||||
* 如果数字后面紧跟的是邮箱后缀命中,则直接连接起来 num+email-suffix;
|
||||
* (3)邮箱后缀的去重
|
||||
* 邮箱后缀可以只处理为和 Num 构建,如果没有直接丢弃的模式。
|
||||
*
|
||||
* 也可以严格地保留下来。
|
||||
* @author binbin.hou
|
||||
* @since 0.0.9
|
||||
*/
|
||||
@@ -32,13 +42,17 @@ public class SensitiveEmailCheck implements ISensitiveCheck {
|
||||
char mappingChar = Instances.singleton(CharFormatChain.class)
|
||||
.format(currentChar, context);
|
||||
|
||||
if(isEmailChar(mappingChar)) {
|
||||
if(CharUtil.isEmilChar(mappingChar)) {
|
||||
lengthCount++;
|
||||
stringBuilder.append(currentChar);
|
||||
|
||||
if(isCondition(stringBuilder.toString())) {
|
||||
actualLength = lengthCount;
|
||||
break;
|
||||
|
||||
// 是否遍历全部匹配的模式
|
||||
if(ValidModeEnum.FAIL_FAST.equals(validModeEnum)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
break;
|
||||
@@ -58,15 +72,4 @@ public class SensitiveEmailCheck implements ISensitiveCheck {
|
||||
return RegexUtil.isEmail(string);
|
||||
}
|
||||
|
||||
/**
|
||||
* 是否为组成 email 的字符
|
||||
* @param c 字符
|
||||
* @return 结果
|
||||
* @since 0.0.9
|
||||
*/
|
||||
private boolean isEmailChar(final char c) {
|
||||
return CharUtil.isDigitOrLetter(c)
|
||||
|| c == '.' || c == '@';
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -35,9 +35,7 @@ public class SensitiveWordCheck implements ISensitiveCheck {
|
||||
lengthCount++;
|
||||
|
||||
// 判断是否是敏感词的结尾字,如果是结尾字则判断是否继续检测
|
||||
System.out.println("chat is : " + i +"==="+txt.charAt(i));
|
||||
System.out.println("now map: " + nowMap.get(AppConst.IS_END));
|
||||
boolean isEnd = (boolean) nowMap.get(AppConst.IS_END);
|
||||
boolean isEnd = isEnd(nowMap);
|
||||
if (isEnd) {
|
||||
// 只在匹配到结束的时候才记录长度,避免不完全匹配导致的问题。
|
||||
// eg: 敏感词 敏感词xxx
|
||||
@@ -58,6 +56,25 @@ public class SensitiveWordCheck implements ISensitiveCheck {
|
||||
return actualLength;
|
||||
}
|
||||
|
||||
/**
|
||||
* 判断是否结束
|
||||
* BUG-FIX: 遇到敏感词库中没有的文字时,map 或其结束标记可能为 null,此时直接返回 false,避免空指针。
|
||||
* @param map map 信息
|
||||
* @return 是否结束
|
||||
* @since 0.0.9
|
||||
*/
|
||||
private static boolean isEnd(final Map map) {
|
||||
if(ObjectUtil.isNull(map)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
Object value = map.get(AppConst.IS_END);
|
||||
if(ObjectUtil.isNull(value)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return (boolean)value;
|
||||
}
|
||||
/**
|
||||
* 获取当前的 Map
|
||||
* @param nowMap 原始的当前 map
|
||||
@@ -75,6 +92,7 @@ public class SensitiveWordCheck implements ISensitiveCheck {
|
||||
char mappingChar = Instances.singleton(CharFormatChain.class).format(c, context);
|
||||
|
||||
// 这里做一次重复词的处理
|
||||
//TODO: 这里可以优化,是否获取一次。
|
||||
Map currentMap = (Map) nowMap.get(mappingChar);
|
||||
// 启用忽略重复&当前下标不是第一个
|
||||
if(context.ignoreRepeat()
|
||||
|
||||
@@ -10,7 +10,7 @@ import java.util.List;
|
||||
* <p> create on 2020/1/7 23:43 </p>
|
||||
*
|
||||
* @author Administrator
|
||||
* @since 0.0.6
|
||||
* @since 0.0.9
|
||||
*/
|
||||
public class SensitiveWordBsEmailTest {
|
||||
|
||||
@@ -19,11 +19,23 @@ public class SensitiveWordBsEmailTest {
|
||||
* @since 0.0.9
|
||||
*/
|
||||
@Test
|
||||
public void emailTest() {
|
||||
final String text = "楼主好人,邮箱 123456789@qq.com";
|
||||
public void emailEnglishTest() {
|
||||
final String text = "楼主好人,邮箱 sensitiveword@xx.com";
|
||||
|
||||
List<String> wordList = SensitiveWordBs.newInstance().findAll(text);
|
||||
Assert.assertEquals("[五星紅旗]", wordList.toString());
|
||||
Assert.assertEquals("[sensitiveword@xx.com]", wordList.toString());
|
||||
}
|
||||
|
||||
/**
|
||||
* 邮箱测试
|
||||
* @since 0.0.9
|
||||
*/
|
||||
@Test
|
||||
public void emailNumberTest() {
|
||||
final String text = "楼主好人,邮箱 123456789@xx.com";
|
||||
|
||||
List<String> wordList = SensitiveWordBs.newInstance().findAll(text);
|
||||
Assert.assertEquals("[123456789]", wordList.toString());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user