mirror of
https://github.com/houbb/sensitive-word.git
synced 2026-03-22 08:27:36 +08:00
[Feature] add for new
This commit is contained in:
@@ -133,3 +133,10 @@
|
||||
|:---|:---|:---|:---|:--|
|
||||
| 1 | A | 允许用户自定义替换策略 | 2022-01-15 23:51:58 | |
|
||||
| 2 | U | 升级二方数据库依赖 | 2022-01-15 23:51:58 | |
|
||||
|
||||
# release_0.2.1
|
||||
|
||||
| 序号 | 变更类型 | 说明 | 时间 | 备注 |
|
||||
|:---|:---|:---|:---|:--|
|
||||
| 1 | O | 移除日志初始化的控制台日志输出 | 2023-02-17 23:51:58 | |
|
||||
| 2 | A | 支持数字检验的长度指定 | 2022-01-17 23:51:58 | |
|
||||
|
||||
52
README.md
52
README.md
@@ -46,9 +46,9 @@
|
||||
|
||||
[CHANGE_LOG.md](https://github.com/houbb/sensitive-word/blob/master/doc/CHANGE_LOG.md)
|
||||
|
||||
v0.2.0 变更:
|
||||
v0.2.1 变更:
|
||||
|
||||
- 支持用户自定义替换策略
|
||||
- 支持用户自定义数字检测的长度
|
||||
|
||||
# 快速开始
|
||||
|
||||
@@ -64,7 +64,7 @@ v0.2.0 变更:
|
||||
<dependency>
|
||||
<groupId>com.github.houbb</groupId>
|
||||
<artifactId>sensitive-word</artifactId>
|
||||
<version>0.2.0</version>
|
||||
<version>0.2.1</version>
|
||||
</dependency>
|
||||
```
|
||||
|
||||
@@ -298,6 +298,26 @@ List<String> wordList = SensitiveWordHelper.findAll(text);
|
||||
Assert.assertEquals("[sensitiveword@xx.com]", wordList.toString());
|
||||
```
|
||||
|
||||
## 连续数字检测
|
||||
|
||||
一般用于过滤手机号/QQ等广告信息。
|
||||
|
||||
V0.2.1 之后,支持通过 `numCheckLen(长度)` 自定义检测的长度。
|
||||
|
||||
```java
|
||||
final String text = "你懂得:12345678";
|
||||
|
||||
// 默认检测 8 位
|
||||
List<String> wordList = SensitiveWordBs.newInstance().findAll(text);
|
||||
Assert.assertEquals("[12345678]", wordList.toString());
|
||||
|
||||
// 指定数字的长度,避免误杀
|
||||
List<String> wordList2 = SensitiveWordBs.newInstance()
|
||||
.numCheckLen(9)
|
||||
.findAll(text);
|
||||
Assert.assertEquals("[]", wordList2.toString());
|
||||
```
|
||||
|
||||
# 特性配置
|
||||
|
||||
## 说明
|
||||
@@ -319,10 +339,11 @@ SensitiveWordBs wordBs = SensitiveWordBs.newInstance()
|
||||
.ignoreNumStyle(true)
|
||||
.ignoreChineseStyle(true)
|
||||
.ignoreEnglishStyle(true)
|
||||
.ignoreRepeat(true)
|
||||
.ignoreRepeat(false)
|
||||
.enableNumCheck(true)
|
||||
.enableEmailCheck(true)
|
||||
.enableUrlCheck(true)
|
||||
.numCheckLen(8)
|
||||
.init();
|
||||
|
||||
final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。";
|
||||
@@ -332,17 +353,18 @@ Assert.assertTrue(wordBs.contains(text));
|
||||
|
||||
其中各项配置的说明如下:
|
||||
|
||||
| 序号 | 方法 | 说明 |
|
||||
|:---|:---|:---|
|
||||
| 1 | ignoreCase | 忽略大小写 |
|
||||
| 2 | ignoreWidth | 忽略半角全角 |
|
||||
| 3 | ignoreNumStyle | 忽略数字的写法 |
|
||||
| 4 | ignoreChineseStyle | 忽略中文的书写格式 |
|
||||
| 5 | ignoreEnglishStyle | 忽略英文的书写格式 |
|
||||
| 6 | ignoreRepeat | 忽略重复词 |
|
||||
| 7 | enableNumCheck | 是否启用数字检测。默认连续 8 位数字认为是敏感词 |
|
||||
| 8 | enableEmailCheck | 是否启用邮箱检测 |
|
||||
| 9 | enableUrlCheck | 是否启用链接检测 |
|
||||
| 序号 | 方法 | 说明 |
|
||||
|:----|:---|:--------------|
|
||||
| 1 | ignoreCase | 忽略大小写 |
|
||||
| 2 | ignoreWidth | 忽略半角全角 |
|
||||
| 3 | ignoreNumStyle | 忽略数字的写法 |
|
||||
| 4 | ignoreChineseStyle | 忽略中文的书写格式 |
|
||||
| 5 | ignoreEnglishStyle | 忽略英文的书写格式 |
|
||||
| 6 | ignoreRepeat | 忽略重复词 |
|
||||
| 7 | enableNumCheck | 是否启用数字检测。 |
|
||||
| 8 | enableEmailCheck | 是否启用邮箱检测 |
|
||||
| 9 | enableUrlCheck | 是否启用链接检测 |
|
||||
| 10 | numCheckLen | 数字检测,自定义指定长度。默认连续 8 位数字认为是敏感词 |
|
||||
|
||||
# 动态加载(用户自定义)
|
||||
|
||||
|
||||
@@ -158,4 +158,19 @@ public interface IWordContext {
|
||||
*/
|
||||
IWordContext ignoreRepeat(final boolean ignoreRepeat);
|
||||
|
||||
/**
|
||||
* 敏感数字检测
|
||||
* @return 数字检测的长度
|
||||
* @since 0.2.1
|
||||
*/
|
||||
int sensitiveCheckNumLen();
|
||||
|
||||
/**
|
||||
* 设置敏感数字检测长度
|
||||
* @param sensitiveCheckNumLen 数字格式检测长度
|
||||
* @return this
|
||||
* @since 0.2.1
|
||||
*/
|
||||
IWordContext sensitiveCheckNumLen(final int sensitiveCheckNumLen);
|
||||
|
||||
}
|
||||
|
||||
@@ -190,6 +190,17 @@ public class SensitiveWordBs {
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* 检测敏感词对应的长度限制,便于用户灵活定义
|
||||
* @param numCheckLen 长度
|
||||
* @return this
|
||||
* @since 0.2.1
|
||||
*/
|
||||
public SensitiveWordBs numCheckLen(int numCheckLen) {
|
||||
this.context.sensitiveCheckNumLen(numCheckLen);
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* 设置是否启动 email 检测
|
||||
*
|
||||
@@ -301,6 +312,9 @@ public class SensitiveWordBs {
|
||||
wordContext.sensitiveCheckEmail(true);
|
||||
wordContext.sensitiveCheckUrl(true);
|
||||
|
||||
// 额外配置
|
||||
wordContext.sensitiveCheckNumLen(8);
|
||||
|
||||
return wordContext;
|
||||
}
|
||||
|
||||
|
||||
@@ -71,6 +71,12 @@ public class SensitiveWordContext implements IWordContext {
|
||||
*/
|
||||
private boolean sensitiveCheckUrl;
|
||||
|
||||
/**
|
||||
* 敏感数字检测对应的长度限制
|
||||
* @since 0.2.1
|
||||
*/
|
||||
private int sensitiveCheckNumLen;
|
||||
|
||||
/**
|
||||
* 私有化构造器
|
||||
* @since 0.0.4
|
||||
@@ -196,4 +202,16 @@ public class SensitiveWordContext implements IWordContext {
|
||||
this.sensitiveCheckUrl = sensitiveCheckUrl;
|
||||
return this;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int sensitiveCheckNumLen() {
|
||||
return sensitiveCheckNumLen;
|
||||
}
|
||||
|
||||
@Override
|
||||
public SensitiveWordContext sensitiveCheckNumLen(int sensitiveCheckNumLen) {
|
||||
this.sensitiveCheckNumLen = sensitiveCheckNumLen;
|
||||
return this;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -34,7 +34,7 @@ public class SensitiveCheckNum implements ISensitiveCheck {
|
||||
lengthCount++;
|
||||
|
||||
// 满足结束的条件
|
||||
boolean isCondition = isCondition(lengthCount);
|
||||
boolean isCondition = isCondition(lengthCount, context);
|
||||
if (isCondition) {
|
||||
// 只在匹配到结束的时候才记录长度,避免不完全匹配导致的问题。
|
||||
actualLength = lengthCount;
|
||||
@@ -57,11 +57,13 @@ public class SensitiveCheckNum implements ISensitiveCheck {
|
||||
* 这里指定一个阈值条件
|
||||
* TODO: 这里有一个问题,会把一些 url 中的数字替换掉。
|
||||
* @param lengthCount 长度
|
||||
* @param context 上下文
|
||||
* @return 是否满足条件
|
||||
* @since 0.0.5
|
||||
*/
|
||||
private boolean isCondition(final int lengthCount) {
|
||||
return lengthCount >= 8;
|
||||
protected boolean isCondition(final int lengthCount,
|
||||
final IWordContext context) {
|
||||
return lengthCount >= context.sensitiveCheckNumLen();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -41,7 +41,6 @@ public class SensitiveWordData implements IWordData {
|
||||
defaultLines = CollectionUtil.difference(defaultLines, allowList);
|
||||
|
||||
long end = System.currentTimeMillis();
|
||||
System.out.println("Sensitive data loaded!, cost time: " + (end - start) + "ms");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -101,7 +101,6 @@ public class SensitiveWordMap implements IWordMap {
|
||||
this.innerWordMap = newInnerWordMap;
|
||||
|
||||
long endTime = System.currentTimeMillis();
|
||||
System.out.println("Init sensitive word map end! Cost time: " + (endTime - startTime) + "ms");
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@@ -26,6 +26,7 @@ public class SensitiveWordBsConfigTest {
|
||||
.enableNumCheck(true)
|
||||
.enableEmailCheck(true)
|
||||
.enableUrlCheck(true)
|
||||
.numCheckLen(8)
|
||||
.init();
|
||||
|
||||
final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。";
|
||||
|
||||
@@ -0,0 +1,37 @@
|
||||
package com.github.houbb.sensitive.word.bs;
|
||||
|
||||
import org.junit.Assert;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* <p> project: sensitive-word-SensitiveWordBsNumLenTest </p>
|
||||
* <p> create on 2020/1/7 23:43 </p>
|
||||
*
|
||||
* @author Administrator
|
||||
* @since 0.2.1
|
||||
*/
|
||||
public class SensitiveWordBsNumLenTest {
|
||||
|
||||
/**
|
||||
* 返回所有敏感词
|
||||
* @since 0.2.1
|
||||
*/
|
||||
@Test
|
||||
public void findAllTest() {
|
||||
final String text = "你懂得:12345678";
|
||||
|
||||
// 默认检测 8 位
|
||||
List<String> wordList = SensitiveWordBs.newInstance().findAll(text);
|
||||
Assert.assertEquals("[12345678]", wordList.toString());
|
||||
|
||||
// 指定数字的长度,避免误杀
|
||||
List<String> wordList2 = SensitiveWordBs.newInstance()
|
||||
.numCheckLen(9)
|
||||
.findAll(text);
|
||||
Assert.assertEquals("[]", wordList2.toString());
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
Reference in New Issue
Block a user