mirror of
https://github.com/houbb/sensitive-word.git
synced 2026-03-22 08:27:36 +08:00
[Feature] add for new
This commit is contained in:
86
README.md
86
README.md
@@ -1,6 +1,6 @@
|
||||
# sensitive-word
|
||||
|
||||
[sensitive-word](https://github.com/houbb/sensitive-word) 基于 DFA 算法实现的敏感词工具。
|
||||
[sensitive-word](https://github.com/houbb/sensitive-word) 基于 DFA 算法实现的高性能敏感词工具。
|
||||
|
||||
[](http://mvnrepository.com/artifact/com.github.houbb/sensitive-word)
|
||||
|
||||
@@ -10,21 +10,25 @@
|
||||
|
||||
实现一款好用的敏感词工具。
|
||||
|
||||
基于 DFA 算法实现,目前敏感词库内容收录 18W+ 感觉过于臃肿。
|
||||
基于 DFA 算法实现,目前敏感词库内容收录 6W+(源文件 18W+,经过一次删减)。
|
||||
|
||||
后期将进行相关优化,降低字典的数量。
|
||||
后期将进行持续优化和补充敏感词库,并进一步提升算法的性能。
|
||||
|
||||
希望可以细化敏感词的分类,感觉工作量比较大,暂时没有太好的思路。
|
||||
希望可以细化敏感词的分类,感觉工作量比较大,暂时没有进行。
|
||||
|
||||
## 后期目标
|
||||
## 特性
|
||||
|
||||
- 持续扩容对应的敏感词(如合法的数据抓取)
|
||||
- 6W+ 词库,且不断优化更新
|
||||
|
||||
- 添加英文大小写忽略,全角半角忽略
|
||||
- 基于 DFA 算法,性能很好
|
||||
|
||||
- 中文添加拼音相关转换,添加繁简体转换忽略
|
||||
- 基于 fluent-api 实现,优雅方便
|
||||
|
||||
- 允许用户自定义敏感词和白名单
|
||||
- 支持敏感词的判断、返回、脱敏等常见操作
|
||||
|
||||
- 支持全角半角互换
|
||||
|
||||
- 支持英文大小写互换
|
||||
|
||||
# 快速开始
|
||||
|
||||
@@ -40,10 +44,22 @@
|
||||
<dependency>
|
||||
<groupId>com.github.houbb</groupId>
|
||||
<artifactId>sensitive-word</artifactId>
|
||||
<version>0.0.3</version>
|
||||
<version>0.0.4</version>
|
||||
</dependency>
|
||||
```
|
||||
|
||||
## api 概览
|
||||
|
||||
`SensitiveWordBs` 作为敏感词的引导类,核心方法如下:
|
||||
|
||||
| 方法 | 参数 | 返回值| 说明 |
|
||||
|:---|:---|:---|:---|
|
||||
| newInstance() | 无 | 引导类 | 初始化引导类 |
|
||||
| contains(String) | 待验证的字符串 | 布尔值 | 验证字符串是否包含敏感词 |
|
||||
| findAll(String) | 待验证的字符串 | 字符串列表 | 返回字符串中所有敏感词 |
|
||||
| replace(String, char) | 使用指定的 char 替换敏感词 | 字符串 | 返回脱敏后的字符串 |
|
||||
| replace(String) | 使用 `*` 替换敏感词 | 字符串 | 返回脱敏后的字符串 |
|
||||
|
||||
## 使用实例
|
||||
|
||||
所有测试案例参见 [SensitiveWordBsTest](https://github.com/houbb/sensitive-word/blob/master/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsTest.java)
|
||||
@@ -53,7 +69,7 @@
|
||||
```java
|
||||
final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。";
|
||||
|
||||
Assert.assertTrue(SensitiveWordBs.getInstance().contains(text));
|
||||
Assert.assertTrue(SensitiveWordBs.newInstance().contains(text));
|
||||
```
|
||||
|
||||
### 返回第一个敏感词
|
||||
@@ -61,7 +77,7 @@ Assert.assertTrue(SensitiveWordBs.getInstance().contains(text));
|
||||
```java
|
||||
final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。";
|
||||
|
||||
String word = SensitiveWordBs.getInstance().findFirst(text);
|
||||
String word = SensitiveWordBs.newInstance().findFirst(text);
|
||||
Assert.assertEquals("五星红旗", word);
|
||||
```
|
||||
|
||||
@@ -70,7 +86,7 @@ Assert.assertEquals("五星红旗", word);
|
||||
```java
|
||||
final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。";
|
||||
|
||||
List<String> wordList = SensitiveWordBs.getInstance().findAll(text);
|
||||
List<String> wordList = SensitiveWordBs.newInstance().findAll(text);
|
||||
Assert.assertEquals("[五星红旗, 毛主席, 天安门]", wordList.toString());
|
||||
```
|
||||
|
||||
@@ -78,7 +94,7 @@ Assert.assertEquals("[五星红旗, 毛主席, 天安门]", wordList.toString())
|
||||
|
||||
```java
|
||||
final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。";
|
||||
String result = SensitiveWordBs.getInstance().replace(text);
|
||||
String result = SensitiveWordBs.newInstance().replace(text);
|
||||
Assert.assertEquals("****迎风飘扬,***的画像屹立在***前。", result);
|
||||
```
|
||||
|
||||
@@ -86,6 +102,46 @@ Assert.assertEquals("****迎风飘扬,***的画像屹立在***前。", result)
|
||||
|
||||
```java
|
||||
final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。";
|
||||
String result = SensitiveWordBs.getInstance().replace(text, '0');
|
||||
String result = SensitiveWordBs.newInstance().replace(text, '0');
|
||||
Assert.assertEquals("0000迎风飘扬,000的画像屹立在000前。", result);
|
||||
```
|
||||
|
||||
# 更多特性
|
||||
|
||||
后续的诸多特性,主要是针对各种情况的处理,尽可能的提升敏感词命中率。
|
||||
|
||||
这是一场漫长的攻防之战。
|
||||
|
||||
## 忽略大小写
|
||||
|
||||
```java
|
||||
final String text = "fuCK the bad words.";
|
||||
|
||||
String word = SensitiveWordBs.newInstance().findFirst(text);
|
||||
Assert.assertEquals("fuCK", word);
|
||||
```
|
||||
|
||||
## 忽略全角半角
|
||||
|
||||
```java
|
||||
final String text = "fuck the bad words.";
|
||||
|
||||
String word = SensitiveWordBs.newInstance().findFirst(text);
|
||||
Assert.assertEquals("fuck", word);
|
||||
```
|
||||
|
||||
# 后期 road-map
|
||||
|
||||
- 繁简体互换
|
||||
|
||||
- 重复词
|
||||
|
||||
- 停顿词
|
||||
|
||||
- 拼音互换
|
||||
|
||||
- 用户自定义敏感词和白名单
|
||||
|
||||
- 文字镜像翻转
|
||||
|
||||
- 敏感词标签支持
|
||||
@@ -1,3 +1,12 @@
|
||||
# 字符
|
||||
|
||||
全部使用小写+半角的形式匹配。
|
||||
|
||||
## 忽略大小写
|
||||
|
||||
if(Character.isLetter) {
|
||||
ignoreCase=true
|
||||
ignoreWidth=true
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -9,3 +9,9 @@
|
||||
(2)数字
|
||||
|
||||
对于数字,除却象形,最常用的就是谐音。
|
||||
|
||||
## 不可变性
|
||||
|
||||
这个涉及到拼音的 DFA 树构建,可能需要 wordMap 提供一个添加的接口。
|
||||
|
||||
这个需要在初始化的时候,直接指定。而且不可变化。
|
||||
4
pom.xml
4
pom.xml
@@ -52,6 +52,10 @@
|
||||
<groupId>com.github.houbb</groupId>
|
||||
<artifactId>heaven</artifactId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<groupId>com.huaban</groupId>
|
||||
<artifactId>jieba-analysis</artifactId>
|
||||
</exclusion>
|
||||
</exclusions>
|
||||
</dependency>
|
||||
<!--============================== OTHER ==============================-->
|
||||
|
||||
@@ -0,0 +1,39 @@
|
||||
package com.github.houbb.sensitive.word.api;
|
||||
|
||||
/**
|
||||
* @author binbin.hou
|
||||
* @since 0.0.4
|
||||
*/
|
||||
public interface IWordContext {
|
||||
|
||||
/**
|
||||
* 是否忽略大小写
|
||||
* @return 是否
|
||||
* @since 0.0.4
|
||||
*/
|
||||
boolean ignoreCase();
|
||||
|
||||
/**
|
||||
* 是否忽略半角圆角
|
||||
* @return 是否
|
||||
* @since 0.0.4
|
||||
*/
|
||||
boolean ignoreWidth();
|
||||
|
||||
/**
|
||||
* 设置是否忽略大小写
|
||||
* @param ignoreCase 是否忽略大小写
|
||||
* @return this
|
||||
* @since 0.0.4
|
||||
*/
|
||||
IWordContext ignoreCase(boolean ignoreCase);
|
||||
|
||||
/**
|
||||
* 设置是否忽略半角圆角
|
||||
* @param ignoreWidth 是否忽略半角圆角
|
||||
* @return this
|
||||
* @since 0.0.4
|
||||
*/
|
||||
IWordContext ignoreWidth(boolean ignoreWidth);
|
||||
|
||||
}
|
||||
@@ -1,6 +1,5 @@
|
||||
package com.github.houbb.sensitive.word.api;
|
||||
|
||||
import com.github.houbb.heaven.util.lang.StringUtil;
|
||||
import com.github.houbb.sensitive.word.constant.enums.ValidModeEnum;
|
||||
|
||||
import java.util.Collection;
|
||||
@@ -24,28 +23,34 @@ public interface IWordMap {
|
||||
/**
|
||||
* 是否包含敏感词
|
||||
* @param string 字符串
|
||||
* @param context 上下文
|
||||
* @return 是否包含
|
||||
* @since 0.0.1
|
||||
* @see ValidModeEnum#FAIL_FAST 建议使用快速返回模式
|
||||
*/
|
||||
boolean contains(final String string);
|
||||
boolean contains(final String string,
|
||||
final IWordContext context);
|
||||
|
||||
/**
|
||||
* 返回所有对应的敏感词
|
||||
* @param string 原始字符串
|
||||
* @param context 上下文
|
||||
* @return 结果
|
||||
* @since 0.0.1
|
||||
* @see ValidModeEnum#FAIL_OVER 建议使用全部检测返回模式
|
||||
*/
|
||||
List<String> findAll(final String string);
|
||||
List<String> findAll(final String string,
|
||||
final IWordContext context);
|
||||
|
||||
/**
|
||||
* 返回第一个对应的敏感词
|
||||
* @param string 原始字符串
|
||||
* @param context 上下文
|
||||
* @return 结果
|
||||
* @since 0.0.1
|
||||
*/
|
||||
String findFirst(final String string);
|
||||
String findFirst(final String string,
|
||||
final IWordContext context);
|
||||
|
||||
/**
|
||||
* 替换所有敏感词内容
|
||||
@@ -54,9 +59,11 @@ public interface IWordMap {
|
||||
*
|
||||
* @param target 目标字符串
|
||||
* @param replaceChar 替换为的 char
|
||||
* @param context 上下文
|
||||
* @return 替换后结果
|
||||
* @since 0.0.2
|
||||
*/
|
||||
String replace(final String target, final char replaceChar);
|
||||
String replace(final String target, final char replaceChar,
|
||||
final IWordContext context);
|
||||
|
||||
}
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
package com.github.houbb.sensitive.word.bs;
|
||||
|
||||
import com.github.houbb.heaven.constant.CharConst;
|
||||
import com.github.houbb.heaven.support.instance.impl.Instances;
|
||||
import com.github.houbb.sensitive.word.api.IWordContext;
|
||||
import com.github.houbb.sensitive.word.api.IWordData;
|
||||
import com.github.houbb.sensitive.word.api.IWordMap;
|
||||
import com.github.houbb.sensitive.word.support.data.SensitiveWordData;
|
||||
@@ -22,41 +22,68 @@ public class SensitiveWordBs {
|
||||
*/
|
||||
private SensitiveWordBs(){}
|
||||
|
||||
/**
|
||||
* 敏感数据信息
|
||||
* @since 0.0.1
|
||||
*/
|
||||
private IWordData sensitiveWordData = Instances.singleton(SensitiveWordData.class);
|
||||
|
||||
/**
|
||||
* 敏感词 map
|
||||
* @since 0.0.1
|
||||
*/
|
||||
private IWordMap sensitiveWordMap = Instances.singleton(SensitiveWordMap.class);
|
||||
private static volatile IWordMap sensitiveWordMap;
|
||||
|
||||
/**
|
||||
* 获取单例信息
|
||||
* @since 0.0.1
|
||||
* 默认的执行上下文
|
||||
* @since 0.0.4
|
||||
*/
|
||||
private static final SensitiveWordBs INSTANCE;
|
||||
private volatile IWordContext context;
|
||||
|
||||
static {
|
||||
synchronized (SensitiveWordBs.class) {
|
||||
INSTANCE = new SensitiveWordBs();
|
||||
List<String> lines = INSTANCE.sensitiveWordData.getWordData();
|
||||
INSTANCE.sensitiveWordMap.initWordMap(lines);
|
||||
/**
|
||||
* DCL 初始化 wordMap 信息
|
||||
* @return 初始化后的结果
|
||||
* @since 0.0.4
|
||||
*/
|
||||
private static IWordMap initWordMap() {
|
||||
if(sensitiveWordMap == null) {
|
||||
synchronized (IWordMap.class) {
|
||||
if(sensitiveWordMap == null) {
|
||||
// 加载配置信息
|
||||
IWordData wordData = new SensitiveWordData();
|
||||
List<String> lines = wordData.getWordData();
|
||||
|
||||
// 初始化 DFA 信息
|
||||
sensitiveWordMap = new SensitiveWordMap();
|
||||
sensitiveWordMap.initWordMap(lines);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return sensitiveWordMap;
|
||||
}
|
||||
|
||||
/**
|
||||
* 新建验证实例
|
||||
*
|
||||
* double-lock
|
||||
* @return this
|
||||
* @since 0.0.1
|
||||
*/
|
||||
public static SensitiveWordBs getInstance() {
|
||||
return INSTANCE;
|
||||
public static SensitiveWordBs newInstance() {
|
||||
initWordMap();
|
||||
|
||||
SensitiveWordBs bs = new SensitiveWordBs();
|
||||
bs.context = buildDefaultContext();
|
||||
return bs;
|
||||
}
|
||||
|
||||
/**
|
||||
* 构建默认的上下文
|
||||
* @return 结果
|
||||
* @since 0.0.4
|
||||
*/
|
||||
private static IWordContext buildDefaultContext() {
|
||||
IWordContext wordContext = SensitiveWordContext.newInstance();
|
||||
wordContext.ignoreCase(true);
|
||||
wordContext.ignoreWidth(true);
|
||||
|
||||
return wordContext;
|
||||
}
|
||||
/**
|
||||
* 是否包含敏感词
|
||||
* @param target 目标字符串
|
||||
@@ -64,7 +91,7 @@ public class SensitiveWordBs {
|
||||
* @since 0.0.1
|
||||
*/
|
||||
public boolean contains(final String target) {
|
||||
return this.sensitiveWordMap.contains(target);
|
||||
return sensitiveWordMap.contains(target, context);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -76,7 +103,7 @@ public class SensitiveWordBs {
|
||||
* @since 0.0.1
|
||||
*/
|
||||
public List<String> findAll(final String target) {
|
||||
return this.sensitiveWordMap.findAll(target);
|
||||
return sensitiveWordMap.findAll(target, context);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -87,7 +114,7 @@ public class SensitiveWordBs {
|
||||
* @since 0.0.1
|
||||
*/
|
||||
public String findFirst(final String target) {
|
||||
return this.sensitiveWordMap.findFirst(target);
|
||||
return sensitiveWordMap.findFirst(target, context);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -98,7 +125,7 @@ public class SensitiveWordBs {
|
||||
* @since 0.0.2
|
||||
*/
|
||||
public String replace(final String target, final char replaceChar) {
|
||||
return this.sensitiveWordMap.replace(target, replaceChar);
|
||||
return sensitiveWordMap.replace(target, replaceChar, context);
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@@ -0,0 +1,88 @@
|
||||
package com.github.houbb.sensitive.word.bs;
|
||||
|
||||
import com.github.houbb.sensitive.word.api.IWordContext;
|
||||
|
||||
/**
|
||||
* 上下文
|
||||
* @author binbin.hou
|
||||
* @since 0.0.4
|
||||
*/
|
||||
public class SensitiveWordContext implements IWordContext {
|
||||
|
||||
/**
|
||||
* 忽略大小写
|
||||
* @since 0.0.4
|
||||
*/
|
||||
private boolean ignoreCase;
|
||||
|
||||
/**
|
||||
* 忽略半角全角
|
||||
* @since 0.0.4
|
||||
*/
|
||||
private boolean ignoreWidth;
|
||||
|
||||
/**
|
||||
* 私有化构造器
|
||||
* @since 0.0.4
|
||||
*/
|
||||
private SensitiveWordContext() {
|
||||
}
|
||||
|
||||
/**
|
||||
* 新建一个对象实例
|
||||
* @return 对象实例
|
||||
* @since 0.0.4
|
||||
*/
|
||||
public static SensitiveWordContext newInstance() {
|
||||
return new SensitiveWordContext();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean ignoreCase() {
|
||||
return ignoreCase;
|
||||
}
|
||||
|
||||
@Override
|
||||
public SensitiveWordContext ignoreCase(boolean ignoreCase) {
|
||||
this.ignoreCase = ignoreCase;
|
||||
return this;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean ignoreWidth() {
|
||||
return ignoreWidth;
|
||||
}
|
||||
|
||||
@Override
|
||||
public SensitiveWordContext ignoreWidth(boolean ignoreWidth) {
|
||||
this.ignoreWidth = ignoreWidth;
|
||||
return this;
|
||||
}
|
||||
|
||||
private static class ContextHolder {
|
||||
private static final SensitiveWordContext INSTANCE = new SensitiveWordContext();
|
||||
|
||||
static {
|
||||
INSTANCE.ignoreCase(true);
|
||||
INSTANCE.ignoreWidth(true);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 默认配置
|
||||
* @return 结果
|
||||
* @since 0.0.4
|
||||
*/
|
||||
private static SensitiveWordContext defaultContext() {
|
||||
return ContextHolder.INSTANCE;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "SensitiveWordContext{" +
|
||||
"ignoreCase=" + ignoreCase +
|
||||
", ignoreWidth=" + ignoreWidth +
|
||||
'}';
|
||||
}
|
||||
|
||||
}
|
||||
@@ -5,7 +5,7 @@ package com.github.houbb.sensitive.word.constant.enums;
|
||||
* <p> create on 2020/1/7 22:46 </p>
|
||||
*
|
||||
* @author Administrator
|
||||
* @since 1.0.0
|
||||
* @since 0.0.1
|
||||
*/
|
||||
public enum ValidModeEnum {
|
||||
|
||||
|
||||
@@ -43,8 +43,8 @@ public class CheckSensitiveWordResult {
|
||||
return sensitiveWordSize;
|
||||
}
|
||||
|
||||
public CheckSensitiveWordResult sentiveWordSize(int sentiveWordSize) {
|
||||
this.sensitiveWordSize = sentiveWordSize;
|
||||
public CheckSensitiveWordResult sentiveWordSize(int sensitiveWordSize) {
|
||||
this.sensitiveWordSize = sensitiveWordSize;
|
||||
return this;
|
||||
}
|
||||
|
||||
|
||||
@@ -31,7 +31,7 @@ public class SensitiveWordData implements IWordData {
|
||||
defaultLines = StreamUtil.readAllLines("/dict.txt");
|
||||
defaultLines.addAll(StreamUtil.readAllLines("/dict_en.txt"));
|
||||
long end = System.currentTimeMillis();
|
||||
System.out.println("Sensitive data loaded!, cost time: " + (end - start) + " ms");
|
||||
System.out.println("Sensitive data loaded!, cost time: " + (end - start) + "ms");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -7,6 +7,7 @@ import com.github.houbb.heaven.util.lang.ObjectUtil;
|
||||
import com.github.houbb.heaven.util.lang.StringUtil;
|
||||
import com.github.houbb.heaven.util.util.CollectionUtil;
|
||||
import com.github.houbb.heaven.util.util.MapUtil;
|
||||
import com.github.houbb.sensitive.word.api.IWordContext;
|
||||
import com.github.houbb.sensitive.word.api.IWordMap;
|
||||
import com.github.houbb.sensitive.word.constant.AppConst;
|
||||
import com.github.houbb.sensitive.word.constant.enums.ValidModeEnum;
|
||||
@@ -30,7 +31,7 @@ public class SensitiveWordMap implements IWordMap {
|
||||
*
|
||||
* @since 0.0.1
|
||||
*/
|
||||
private static Map sensitiveWordMap;
|
||||
private Map innerWordMap;
|
||||
|
||||
/**
|
||||
* 读取敏感词库,将敏感词放入HashSet中,构建一个DFA算法模型:
|
||||
@@ -46,13 +47,13 @@ public class SensitiveWordMap implements IWordMap {
|
||||
@SuppressWarnings("unchecked")
|
||||
public void initWordMap(Collection<String> collection) {
|
||||
// 避免重复加载
|
||||
if (MapUtil.isNotEmpty(sensitiveWordMap)) {
|
||||
if (MapUtil.isNotEmpty(innerWordMap)) {
|
||||
return;
|
||||
}
|
||||
|
||||
long startTime = System.currentTimeMillis();
|
||||
// 避免扩容带来的消耗
|
||||
sensitiveWordMap = new HashMap(collection.size());
|
||||
innerWordMap = new HashMap(collection.size());
|
||||
|
||||
for (String key : collection) {
|
||||
if (StringUtil.isEmpty(key)) {
|
||||
@@ -64,7 +65,7 @@ public class SensitiveWordMap implements IWordMap {
|
||||
final int size = chars.length;
|
||||
|
||||
// 每一个新词的循环,直接将结果设置为当前 map,所有变化都会体现在结果的 map 中
|
||||
Map currentMap = sensitiveWordMap;
|
||||
Map currentMap = innerWordMap;
|
||||
|
||||
for (int i = 0; i < size; i++) {
|
||||
// 截取敏感词当中的字,在敏感词库中字为HashMap对象的Key键值
|
||||
@@ -78,7 +79,7 @@ public class SensitiveWordMap implements IWordMap {
|
||||
currentMap = (Map) wordMap;
|
||||
} else {
|
||||
//不存在则,则构建一个新的map,同时将isEnd设置为0,因为他不是最后一
|
||||
Map<String, Boolean> newWordMap = new HashMap<>();
|
||||
Map<String, Boolean> newWordMap = new HashMap<>(8);
|
||||
newWordMap.put(AppConst.IS_END, false);
|
||||
|
||||
// 将新的节点放入当前 map 中
|
||||
@@ -96,7 +97,7 @@ public class SensitiveWordMap implements IWordMap {
|
||||
}
|
||||
|
||||
long endTime = System.currentTimeMillis();
|
||||
System.out.println("Init sensitive word map end! Cost time " + (endTime - startTime) + "ms");
|
||||
System.out.println("Init sensitive word map end! Cost time: " + (endTime - startTime) + "ms");
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -109,13 +110,13 @@ public class SensitiveWordMap implements IWordMap {
|
||||
* @since 0.0.1
|
||||
*/
|
||||
@Override
|
||||
public boolean contains(String string) {
|
||||
public boolean contains(String string, final IWordContext context) {
|
||||
if (StringUtil.isEmpty(string)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
for (int i = 0; i < string.length(); i++) {
|
||||
int checkResult = checkSensitiveWord(string, i, ValidModeEnum.FAIL_FAST);
|
||||
int checkResult = checkSensitiveWord(string, i, ValidModeEnum.FAIL_FAST, context);
|
||||
// 快速返回
|
||||
if (checkResult > 0) {
|
||||
return true;
|
||||
@@ -134,13 +135,13 @@ public class SensitiveWordMap implements IWordMap {
|
||||
* @since 0.0.1
|
||||
*/
|
||||
@Override
|
||||
public List<String> findAll(String string) {
|
||||
return getSensitiveWords(string, ValidModeEnum.FAIL_OVER);
|
||||
public List<String> findAll(String string, final IWordContext context) {
|
||||
return getSensitiveWords(string, ValidModeEnum.FAIL_OVER, context);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String findFirst(String string) {
|
||||
List<String> stringList = getSensitiveWords(string, ValidModeEnum.FAIL_FAST);
|
||||
public String findFirst(String string, final IWordContext context) {
|
||||
List<String> stringList = getSensitiveWords(string, ValidModeEnum.FAIL_FAST, context);
|
||||
|
||||
if (CollectionUtil.isEmpty(stringList)) {
|
||||
return null;
|
||||
@@ -150,12 +151,12 @@ public class SensitiveWordMap implements IWordMap {
|
||||
}
|
||||
|
||||
@Override
|
||||
public String replace(String target, char replaceChar) {
|
||||
public String replace(String target, char replaceChar, final IWordContext context) {
|
||||
if(StringUtil.isEmpty(target)) {
|
||||
return target;
|
||||
}
|
||||
|
||||
return this.replaceSensitiveWord(target, ValidModeEnum.FAIL_OVER, replaceChar);
|
||||
return this.replaceSensitiveWord(target, replaceChar, context);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -166,7 +167,8 @@ public class SensitiveWordMap implements IWordMap {
|
||||
* @return 结果列表
|
||||
* @since 0.0.1
|
||||
*/
|
||||
private List<String> getSensitiveWords(final String text, final ValidModeEnum modeEnum) {
|
||||
private List<String> getSensitiveWords(final String text, final ValidModeEnum modeEnum,
|
||||
final IWordContext context) {
|
||||
//1. 是否存在敏感词,如果比存在,直接返回空列表
|
||||
if (StringUtil.isEmpty(text)) {
|
||||
return Guavas.newArrayList();
|
||||
@@ -174,7 +176,7 @@ public class SensitiveWordMap implements IWordMap {
|
||||
|
||||
List<String> resultList = Guavas.newArrayList();
|
||||
for (int i = 0; i < text.length(); i++) {
|
||||
int wordLength = checkSensitiveWord(text, i, ValidModeEnum.FAIL_OVER);
|
||||
int wordLength = checkSensitiveWord(text, i, ValidModeEnum.FAIL_OVER, context);
|
||||
|
||||
// 命中
|
||||
if (wordLength > 0) {
|
||||
@@ -215,19 +217,23 @@ public class SensitiveWordMap implements IWordMap {
|
||||
* @param txt 文本信息
|
||||
* @param beginIndex 开始下标
|
||||
* @param validModeEnum 验证模式
|
||||
* @param context 执行上下文
|
||||
* @return 敏感词对应的长度
|
||||
* @since 0.0.1
|
||||
*/
|
||||
private int checkSensitiveWord(final String txt, final int beginIndex,
|
||||
final ValidModeEnum validModeEnum) {
|
||||
Map nowMap = sensitiveWordMap;
|
||||
final ValidModeEnum validModeEnum,
|
||||
final IWordContext context) {
|
||||
Map nowMap = innerWordMap;
|
||||
|
||||
// 记录敏感词的长度
|
||||
int lengthCount = 0;
|
||||
int actualLength = 0;
|
||||
|
||||
for (int i = beginIndex; i < txt.length(); i++) {
|
||||
char charKey = txt.charAt(i);
|
||||
char c = txt.charAt(i);
|
||||
char charKey = getActualChar(c, context);
|
||||
|
||||
// 判断该字是否存在于敏感词库中
|
||||
// 并且将 nowMap 替换为新的 map,进入下一层的循环。
|
||||
nowMap = (Map) nowMap.get(charKey);
|
||||
@@ -256,16 +262,36 @@ public class SensitiveWordMap implements IWordMap {
|
||||
return actualLength;
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取实际对应的符号
|
||||
* @param c 编号
|
||||
* @param context 上下文
|
||||
* @return 结果
|
||||
* @since 0.0.4
|
||||
*/
|
||||
private char getActualChar(final char c,
|
||||
final IWordContext context) {
|
||||
char resultChar = c;
|
||||
|
||||
if(context.ignoreCase()) {
|
||||
resultChar = Character.toLowerCase(resultChar);
|
||||
}
|
||||
if(context.ignoreWidth()) {
|
||||
resultChar = CharUtil.toHalfWidth(resultChar);
|
||||
}
|
||||
|
||||
return resultChar;
|
||||
}
|
||||
|
||||
/**
|
||||
* 直接替换敏感词,返回替换后的结果
|
||||
* @param target 文本信息
|
||||
* @param validModeEnum 验证模式
|
||||
* @return 脱敏后的字符串
|
||||
* @since 0.0.2
|
||||
*/
|
||||
private String replaceSensitiveWord(final String target,
|
||||
final ValidModeEnum validModeEnum,
|
||||
final char replaceChar) {
|
||||
final char replaceChar,
|
||||
final IWordContext context) {
|
||||
if(StringUtil.isEmpty(target)) {
|
||||
return target;
|
||||
}
|
||||
@@ -275,7 +301,7 @@ public class SensitiveWordMap implements IWordMap {
|
||||
for (int i = 0; i < target.length(); i++) {
|
||||
char currentChar = target.charAt(i);
|
||||
// 内层直接从 i 开始往后遍历,这个算法的,获取第一个匹配的单词
|
||||
int wordLength = checkSensitiveWord(target, i, validModeEnum);
|
||||
int wordLength = checkSensitiveWord(target, i, ValidModeEnum.FAIL_OVER, context);
|
||||
|
||||
// 敏感词
|
||||
if(wordLength > 0) {
|
||||
|
||||
@@ -22,7 +22,7 @@ public class SensitiveWordBsTest {
|
||||
public void containsTest() {
|
||||
final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。";
|
||||
|
||||
Assert.assertTrue(SensitiveWordBs.getInstance().contains(text));
|
||||
Assert.assertTrue(SensitiveWordBs.newInstance().contains(text));
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -33,7 +33,7 @@ public class SensitiveWordBsTest {
|
||||
public void findAllTest() {
|
||||
final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。";
|
||||
|
||||
List<String> wordList = SensitiveWordBs.getInstance().findAll(text);
|
||||
List<String> wordList = SensitiveWordBs.newInstance().findAll(text);
|
||||
Assert.assertEquals("[五星红旗, 毛主席, 天安门]", wordList.toString());
|
||||
}
|
||||
|
||||
@@ -45,7 +45,7 @@ public class SensitiveWordBsTest {
|
||||
public void findFirstTest() {
|
||||
final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。";
|
||||
|
||||
String word = SensitiveWordBs.getInstance().findFirst(text);
|
||||
String word = SensitiveWordBs.newInstance().findFirst(text);
|
||||
Assert.assertEquals("五星红旗", word);
|
||||
}
|
||||
|
||||
@@ -57,7 +57,7 @@ public class SensitiveWordBsTest {
|
||||
public void replaceTest() {
|
||||
final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。";
|
||||
|
||||
String result = SensitiveWordBs.getInstance().replace(text);
|
||||
String result = SensitiveWordBs.newInstance().replace(text);
|
||||
Assert.assertEquals("****迎风飘扬,***的画像屹立在***前。", result);
|
||||
}
|
||||
|
||||
@@ -69,8 +69,32 @@ public class SensitiveWordBsTest {
|
||||
public void replaceCharTest() {
|
||||
final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。";
|
||||
|
||||
String result = SensitiveWordBs.getInstance().replace(text, '0');
|
||||
String result = SensitiveWordBs.newInstance().replace(text, '0');
|
||||
Assert.assertEquals("0000迎风飘扬,000的画像屹立在000前。", result);
|
||||
}
|
||||
|
||||
/**
|
||||
* 忽略大小写
|
||||
* @since 0.0.4
|
||||
*/
|
||||
@Test
|
||||
public void ignoreCaseTest() {
|
||||
final String text = "fuCK the bad words.";
|
||||
|
||||
String word = SensitiveWordBs.newInstance().findFirst(text);
|
||||
Assert.assertEquals("fuCK", word);
|
||||
}
|
||||
|
||||
/**
|
||||
* 忽略半角圆角
|
||||
* @since 0.0.4
|
||||
*/
|
||||
@Test
|
||||
public void ignoreWidthTest() {
|
||||
final String text = "fuck the bad words.";
|
||||
|
||||
String word = SensitiveWordBs.newInstance().findFirst(text);
|
||||
Assert.assertEquals("fuck", word);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user