[Feature] add for new

This commit is contained in:
binbin.hou
2020-01-09 13:34:43 +08:00
parent 200a60c3ba
commit fa9348d55d
13 changed files with 363 additions and 77 deletions

View File

@@ -1,6 +1,6 @@
# sensitive-word # sensitive-word
[sensitive-word](https://github.com/houbb/sensitive-word) 基于 DFA 算法实现的敏感词工具。 [sensitive-word](https://github.com/houbb/sensitive-word) 基于 DFA 算法实现的高性能敏感词工具。
[![Maven Central](https://maven-badges.herokuapp.com/maven-central/com.github.houbb/sensitive-word/badge.svg)](http://mvnrepository.com/artifact/com.github.houbb/sensitive-word) [![Maven Central](https://maven-badges.herokuapp.com/maven-central/com.github.houbb/sensitive-word/badge.svg)](http://mvnrepository.com/artifact/com.github.houbb/sensitive-word)
@@ -10,21 +10,25 @@
实现一款好用敏感词工具。 实现一款好用敏感词工具。
基于 DFA 算法实现,目前敏感词库内容收录 18W+ 感觉过于臃肿 基于 DFA 算法实现,目前敏感词库内容收录 6W+(源文件 18W+,经过一次删减)
后期将进行相关优化,降低字典的数量 后期将进行持续优化和补充敏感词库,并进一步提升算法的性能
希望可以细化敏感词的分类,感觉工作量比较大,暂时没有太好的思路 希望可以细化敏感词的分类,感觉工作量比较大,暂时没有进行
## 后期目标 ## 特性
- 持续扩容对应的敏感词(如合法的数据抓取) - 6W+ 词库,且不断优化更新
- 添加英文大小写忽略,全角半角忽略 - 基于 DFA 算法,性能很好
- 中文添加拼音相关转换,添加繁简体转换忽略 - 基于 fluent-api 实现,优雅方便
- 允许用户自定义敏感词和白名单 - 支持敏感词的判断、返回、脱敏等常见操作
- 支持全角半角互换
- 支持英文大小写互换
# 快速开始 # 快速开始
@@ -40,10 +44,22 @@
<dependency> <dependency>
<groupId>com.github.houbb</groupId> <groupId>com.github.houbb</groupId>
<artifactId>sensitive-word</artifactId> <artifactId>sensitive-word</artifactId>
<version>0.0.3</version> <version>0.0.4</version>
</dependency> </dependency>
``` ```
## api 概览
`SensitiveWordBs` 作为敏感词的引导类,核心方法如下:
| 方法 | 参数 | 返回值| 说明 |
|:---|:---|:---|:---|
| newInstance() | 无 | 引导类 | 初始化引导类 |
| contains(String) | 待验证的字符串 | 布尔值 | 验证字符串是否包含敏感词 |
| findAll(String) | 待验证的字符串 | 字符串列表 | 返回字符串中所有敏感词 |
| replace(String, char) | 待脱敏的字符串、替换用的 char | 字符串 | 使用指定的 char 替换敏感词,返回脱敏后的字符串 |
| replace(String) | 待脱敏的字符串 | 字符串 | 使用 `*` 替换敏感词,返回脱敏后的字符串 |
## 使用实例 ## 使用实例
所有测试案例参见 [SensitiveWordBsTest](https://github.com/houbb/sensitive-word/blob/master/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsTest.java) 所有测试案例参见 [SensitiveWordBsTest](https://github.com/houbb/sensitive-word/blob/master/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsTest.java)
@@ -53,7 +69,7 @@
```java ```java
final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。"; final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。";
Assert.assertTrue(SensitiveWordBs.getInstance().contains(text)); Assert.assertTrue(SensitiveWordBs.newInstance().contains(text));
``` ```
### 返回第一个敏感词 ### 返回第一个敏感词
@@ -61,7 +77,7 @@ Assert.assertTrue(SensitiveWordBs.getInstance().contains(text));
```java ```java
final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。"; final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。";
String word = SensitiveWordBs.getInstance().findFirst(text); String word = SensitiveWordBs.newInstance().findFirst(text);
Assert.assertEquals("五星红旗", word); Assert.assertEquals("五星红旗", word);
``` ```
@@ -70,7 +86,7 @@ Assert.assertEquals("五星红旗", word);
```java ```java
final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。"; final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。";
List<String> wordList = SensitiveWordBs.getInstance().findAll(text); List<String> wordList = SensitiveWordBs.newInstance().findAll(text);
Assert.assertEquals("[五星红旗, 毛主席, 天安门]", wordList.toString()); Assert.assertEquals("[五星红旗, 毛主席, 天安门]", wordList.toString());
``` ```
@@ -78,7 +94,7 @@ Assert.assertEquals("[五星红旗, 毛主席, 天安门]", wordList.toString())
```java ```java
final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。"; final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。";
String result = SensitiveWordBs.getInstance().replace(text); String result = SensitiveWordBs.newInstance().replace(text);
Assert.assertEquals("****迎风飘扬,***的画像屹立在***前。", result); Assert.assertEquals("****迎风飘扬,***的画像屹立在***前。", result);
``` ```
@@ -86,6 +102,46 @@ Assert.assertEquals("****迎风飘扬,***的画像屹立在***前。", result)
```java ```java
final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。"; final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。";
String result = SensitiveWordBs.getInstance().replace(text, '0'); String result = SensitiveWordBs.newInstance().replace(text, '0');
Assert.assertEquals("0000迎风飘扬000的画像屹立在000前。", result); Assert.assertEquals("0000迎风飘扬000的画像屹立在000前。", result);
``` ```
# 更多特性
后续的诸多特性,主要是针对各种情况的处理,尽可能地提升敏感词命中率。
这是一场漫长的攻防之战。
## 忽略大小写
```java
final String text = "fuCK the bad words.";
String word = SensitiveWordBs.newInstance().findFirst(text);
Assert.assertEquals("fuCK", word);
```
## 忽略半角圆角
```java
final String text = " the bad words.";
String word = SensitiveWordBs.newInstance().findFirst(text);
Assert.assertEquals("", word);
```
# 后期 road-map
- 繁简体互换
- 重复词
- 停顿词
- 拼音互换
- 用户自定义敏感词和白名单
- 文字镜像翻转
- 敏感词标签支持

View File

@@ -1,3 +1,12 @@
# 字符 # 字符
全部使用小写+半角的形式匹配。 全部使用小写+半角的形式匹配。
## 忽略大小写
if(Character.isLetter) {
ignoreCase=true
ignoreWidth=true
}

View File

@@ -8,4 +8,10 @@
2数字 2数字
对于数字,除却象形,最常用的就是谐音。 对于数字,除却象形,最常用的就是谐音。
## 不可变性
这个涉及到拼音的 DFA 树构建,可能需要 wordMap 提供一个添加的接口。
这个需要在初始化的时候,直接指定。而且不可变化。

View File

@@ -52,6 +52,10 @@
<groupId>com.github.houbb</groupId> <groupId>com.github.houbb</groupId>
<artifactId>heaven</artifactId> <artifactId>heaven</artifactId>
</exclusion> </exclusion>
<exclusion>
<groupId>com.huaban</groupId>
<artifactId>jieba-analysis</artifactId>
</exclusion>
</exclusions> </exclusions>
</dependency> </dependency>
<!--============================== OTHER ==============================--> <!--============================== OTHER ==============================-->

View File

@@ -0,0 +1,39 @@
package com.github.houbb.sensitive.word.api;

/**
 * Execution context for sensitive-word matching.
 *
 * <p>Carries the matching options (case handling and half-width/full-width
 * handling) that the word map consults while scanning text. The setters are
 * fluent: they return {@code this} so calls can be chained.</p>
 *
 * @author binbin.hou
 * @since 0.0.4
 */
public interface IWordContext {

    /**
     * Whether letter case is ignored during matching.
     *
     * @return {@code true} if case differences are ignored
     * @since 0.0.4
     */
    boolean ignoreCase();

    /**
     * Whether half-width/full-width character differences are ignored
     * during matching.
     *
     * @return {@code true} if width differences are ignored
     * @since 0.0.4
     */
    boolean ignoreWidth();

    /**
     * Sets whether letter case should be ignored during matching.
     *
     * @param ignoreCase whether to ignore case differences
     * @return this context (fluent)
     * @since 0.0.4
     */
    IWordContext ignoreCase(boolean ignoreCase);

    /**
     * Sets whether half-width/full-width character differences should be
     * ignored during matching.
     *
     * @param ignoreWidth whether to ignore width differences
     * @return this context (fluent)
     * @since 0.0.4
     */
    IWordContext ignoreWidth(boolean ignoreWidth);

}

View File

@@ -1,6 +1,5 @@
package com.github.houbb.sensitive.word.api; package com.github.houbb.sensitive.word.api;
import com.github.houbb.heaven.util.lang.StringUtil;
import com.github.houbb.sensitive.word.constant.enums.ValidModeEnum; import com.github.houbb.sensitive.word.constant.enums.ValidModeEnum;
import java.util.Collection; import java.util.Collection;
@@ -24,28 +23,34 @@ public interface IWordMap {
/** /**
* 是否包含敏感词 * 是否包含敏感词
* @param string 字符串 * @param string 字符串
* @param context 上下文
* @return 是否包含 * @return 是否包含
* @since 0.0.1 * @since 0.0.1
* @see ValidModeEnum#FAIL_FAST 建议使用快速返回模式 * @see ValidModeEnum#FAIL_FAST 建议使用快速返回模式
*/ */
boolean contains(final String string); boolean contains(final String string,
final IWordContext context);
/** /**
* 返回所有对应的敏感词 * 返回所有对应的敏感词
* @param string 原始字符串 * @param string 原始字符串
* @param context 上下文
* @return 结果 * @return 结果
* @since 0.0.1 * @since 0.0.1
* @see ValidModeEnum#FAIL_OVER 建议使用全部检测返回模式 * @see ValidModeEnum#FAIL_OVER 建议使用全部检测返回模式
*/ */
List<String> findAll(final String string); List<String> findAll(final String string,
final IWordContext context);
/** /**
* 返回第一个对应的敏感词 * 返回第一个对应的敏感词
* @param string 原始字符串 * @param string 原始字符串
* @param context 上下文
* @return 结果 * @return 结果
* @since 0.0.1 * @since 0.0.1
*/ */
String findFirst(final String string); String findFirst(final String string,
final IWordContext context);
/** /**
* 替换所有敏感词内容 * 替换所有敏感词内容
@@ -54,9 +59,11 @@ public interface IWordMap {
* *
* @param target 目标字符串 * @param target 目标字符串
* @param replaceChar 替换为的 char * @param replaceChar 替换为的 char
* @param context 上下文
* @return 替换后结果 * @return 替换后结果
* @since 0.0.2 * @since 0.0.2
*/ */
String replace(final String target, final char replaceChar); String replace(final String target, final char replaceChar,
final IWordContext context);
} }

View File

@@ -1,7 +1,7 @@
package com.github.houbb.sensitive.word.bs; package com.github.houbb.sensitive.word.bs;
import com.github.houbb.heaven.constant.CharConst; import com.github.houbb.heaven.constant.CharConst;
import com.github.houbb.heaven.support.instance.impl.Instances; import com.github.houbb.sensitive.word.api.IWordContext;
import com.github.houbb.sensitive.word.api.IWordData; import com.github.houbb.sensitive.word.api.IWordData;
import com.github.houbb.sensitive.word.api.IWordMap; import com.github.houbb.sensitive.word.api.IWordMap;
import com.github.houbb.sensitive.word.support.data.SensitiveWordData; import com.github.houbb.sensitive.word.support.data.SensitiveWordData;
@@ -22,41 +22,68 @@ public class SensitiveWordBs {
*/ */
private SensitiveWordBs(){} private SensitiveWordBs(){}
/**
* 敏感数据信息
* @since 0.0.1
*/
private IWordData sensitiveWordData = Instances.singleton(SensitiveWordData.class);
/** /**
* 敏感词 map * 敏感词 map
* @since 0.0.1 * @since 0.0.1
*/ */
private IWordMap sensitiveWordMap = Instances.singleton(SensitiveWordMap.class); private static volatile IWordMap sensitiveWordMap;
/** /**
* 获取单例信息 * 默认的执行上下文
* @since 0.0.1 * @since 0.0.4
*/ */
private static final SensitiveWordBs INSTANCE; private volatile IWordContext context;
static { /**
synchronized (SensitiveWordBs.class) { * DCL 初始化 wordMap 信息
INSTANCE = new SensitiveWordBs(); * @return 初始化后的结果
List<String> lines = INSTANCE.sensitiveWordData.getWordData(); * @since 0.0.4
INSTANCE.sensitiveWordMap.initWordMap(lines); */
private static IWordMap initWordMap() {
if(sensitiveWordMap == null) {
synchronized (IWordMap.class) {
if(sensitiveWordMap == null) {
// 加载配置信息
IWordData wordData = new SensitiveWordData();
List<String> lines = wordData.getWordData();
// 初始化 DFA 信息
sensitiveWordMap = new SensitiveWordMap();
sensitiveWordMap.initWordMap(lines);
}
}
} }
return sensitiveWordMap;
} }
/** /**
* 新建验证实例 * 新建验证实例
*
* double-lock
* @return this * @return this
* @since 0.0.1 * @since 0.0.1
*/ */
public static SensitiveWordBs getInstance() { public static SensitiveWordBs newInstance() {
return INSTANCE; initWordMap();
SensitiveWordBs bs = new SensitiveWordBs();
bs.context = buildDefaultContext();
return bs;
} }
/**
* 构建默认的上下文
* @return 结果
* @since 0.0.4
*/
private static IWordContext buildDefaultContext() {
IWordContext wordContext = SensitiveWordContext.newInstance();
wordContext.ignoreCase(true);
wordContext.ignoreWidth(true);
return wordContext;
}
/** /**
* 是否包含敏感词 * 是否包含敏感词
* @param target 目标字符串 * @param target 目标字符串
@@ -64,7 +91,7 @@ public class SensitiveWordBs {
* @since 0.0.1 * @since 0.0.1
*/ */
public boolean contains(final String target) { public boolean contains(final String target) {
return this.sensitiveWordMap.contains(target); return sensitiveWordMap.contains(target, context);
} }
/** /**
@@ -76,7 +103,7 @@ public class SensitiveWordBs {
* @since 0.0.1 * @since 0.0.1
*/ */
public List<String> findAll(final String target) { public List<String> findAll(final String target) {
return this.sensitiveWordMap.findAll(target); return sensitiveWordMap.findAll(target, context);
} }
/** /**
@@ -87,7 +114,7 @@ public class SensitiveWordBs {
* @since 0.0.1 * @since 0.0.1
*/ */
public String findFirst(final String target) { public String findFirst(final String target) {
return this.sensitiveWordMap.findFirst(target); return sensitiveWordMap.findFirst(target, context);
} }
/** /**
@@ -98,7 +125,7 @@ public class SensitiveWordBs {
* @since 0.0.2 * @since 0.0.2
*/ */
public String replace(final String target, final char replaceChar) { public String replace(final String target, final char replaceChar) {
return this.sensitiveWordMap.replace(target, replaceChar); return sensitiveWordMap.replace(target, replaceChar, context);
} }
/** /**

View File

@@ -0,0 +1,88 @@
package com.github.houbb.sensitive.word.bs;

import com.github.houbb.sensitive.word.api.IWordContext;

/**
 * Default {@link IWordContext} implementation: a mutable holder for the
 * matching options (ignore case, ignore half-/full-width) with fluent setters.
 *
 * <p>Fix: removed the unused private {@code ContextHolder} nested class and
 * the unused private {@code defaultContext()} method — both were dead code
 * (never referenced, and private, so unreachable from outside this class).</p>
 *
 * <p>NOTE(review): instances are mutable and not synchronized — assumed to be
 * configured once and then only read; confirm if shared across threads.</p>
 *
 * @author binbin.hou
 * @since 0.0.4
 */
public class SensitiveWordContext implements IWordContext {

    /**
     * Whether letter case is ignored during matching.
     * @since 0.0.4
     */
    private boolean ignoreCase;

    /**
     * Whether half-width/full-width differences are ignored during matching.
     * @since 0.0.4
     */
    private boolean ignoreWidth;

    /**
     * Private constructor: instances are obtained via {@link #newInstance()}.
     * @since 0.0.4
     */
    private SensitiveWordContext() {
    }

    /**
     * Creates a new, independent context instance (static factory).
     *
     * @return a fresh context with all options off
     * @since 0.0.4
     */
    public static SensitiveWordContext newInstance() {
        return new SensitiveWordContext();
    }

    @Override
    public boolean ignoreCase() {
        return ignoreCase;
    }

    @Override
    public SensitiveWordContext ignoreCase(boolean ignoreCase) {
        this.ignoreCase = ignoreCase;
        return this;
    }

    @Override
    public boolean ignoreWidth() {
        return ignoreWidth;
    }

    @Override
    public SensitiveWordContext ignoreWidth(boolean ignoreWidth) {
        this.ignoreWidth = ignoreWidth;
        return this;
    }

    @Override
    public String toString() {
        return "SensitiveWordContext{" +
                "ignoreCase=" + ignoreCase +
                ", ignoreWidth=" + ignoreWidth +
                '}';
    }

}

View File

@@ -5,7 +5,7 @@ package com.github.houbb.sensitive.word.constant.enums;
* <p> create on 2020/1/7 22:46 </p> * <p> create on 2020/1/7 22:46 </p>
* *
* @author Administrator * @author Administrator
* @since 1.0.0 * @since 0.0.1
*/ */
public enum ValidModeEnum { public enum ValidModeEnum {

View File

@@ -43,8 +43,8 @@ public class CheckSensitiveWordResult {
return sensitiveWordSize; return sensitiveWordSize;
} }
public CheckSensitiveWordResult sentiveWordSize(int sentiveWordSize) { public CheckSensitiveWordResult sentiveWordSize(int sensitiveWordSize) {
this.sensitiveWordSize = sentiveWordSize; this.sensitiveWordSize = sensitiveWordSize;
return this; return this;
} }

View File

@@ -31,7 +31,7 @@ public class SensitiveWordData implements IWordData {
defaultLines = StreamUtil.readAllLines("/dict.txt"); defaultLines = StreamUtil.readAllLines("/dict.txt");
defaultLines.addAll(StreamUtil.readAllLines("/dict_en.txt")); defaultLines.addAll(StreamUtil.readAllLines("/dict_en.txt"));
long end = System.currentTimeMillis(); long end = System.currentTimeMillis();
System.out.println("Sensitive data loaded!, cost time: " + (end - start) + " ms"); System.out.println("Sensitive data loaded!, cost time: " + (end - start) + "ms");
} }
} }

View File

@@ -7,6 +7,7 @@ import com.github.houbb.heaven.util.lang.ObjectUtil;
import com.github.houbb.heaven.util.lang.StringUtil; import com.github.houbb.heaven.util.lang.StringUtil;
import com.github.houbb.heaven.util.util.CollectionUtil; import com.github.houbb.heaven.util.util.CollectionUtil;
import com.github.houbb.heaven.util.util.MapUtil; import com.github.houbb.heaven.util.util.MapUtil;
import com.github.houbb.sensitive.word.api.IWordContext;
import com.github.houbb.sensitive.word.api.IWordMap; import com.github.houbb.sensitive.word.api.IWordMap;
import com.github.houbb.sensitive.word.constant.AppConst; import com.github.houbb.sensitive.word.constant.AppConst;
import com.github.houbb.sensitive.word.constant.enums.ValidModeEnum; import com.github.houbb.sensitive.word.constant.enums.ValidModeEnum;
@@ -30,7 +31,7 @@ public class SensitiveWordMap implements IWordMap {
* *
* @since 0.0.1 * @since 0.0.1
*/ */
private static Map sensitiveWordMap; private Map innerWordMap;
/** /**
* 读取敏感词库将敏感词放入HashSet中构建一个DFA算法模型 * 读取敏感词库将敏感词放入HashSet中构建一个DFA算法模型
@@ -46,13 +47,13 @@ public class SensitiveWordMap implements IWordMap {
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
public void initWordMap(Collection<String> collection) { public void initWordMap(Collection<String> collection) {
// 避免重复加载 // 避免重复加载
if (MapUtil.isNotEmpty(sensitiveWordMap)) { if (MapUtil.isNotEmpty(innerWordMap)) {
return; return;
} }
long startTime = System.currentTimeMillis(); long startTime = System.currentTimeMillis();
// 避免扩容带来的消耗 // 避免扩容带来的消耗
sensitiveWordMap = new HashMap(collection.size()); innerWordMap = new HashMap(collection.size());
for (String key : collection) { for (String key : collection) {
if (StringUtil.isEmpty(key)) { if (StringUtil.isEmpty(key)) {
@@ -64,7 +65,7 @@ public class SensitiveWordMap implements IWordMap {
final int size = chars.length; final int size = chars.length;
// 每一个新词的循环,直接将结果设置为当前 map所有变化都会体现在结果的 map 中 // 每一个新词的循环,直接将结果设置为当前 map所有变化都会体现在结果的 map 中
Map currentMap = sensitiveWordMap; Map currentMap = innerWordMap;
for (int i = 0; i < size; i++) { for (int i = 0; i < size; i++) {
// 截取敏感词当中的字在敏感词库中字为HashMap对象的Key键值 // 截取敏感词当中的字在敏感词库中字为HashMap对象的Key键值
@@ -78,7 +79,7 @@ public class SensitiveWordMap implements IWordMap {
currentMap = (Map) wordMap; currentMap = (Map) wordMap;
} else { } else {
//不存在则则构建一个新的map同时将isEnd设置为0因为他不是最后一 //不存在则则构建一个新的map同时将isEnd设置为0因为他不是最后一
Map<String, Boolean> newWordMap = new HashMap<>(); Map<String, Boolean> newWordMap = new HashMap<>(8);
newWordMap.put(AppConst.IS_END, false); newWordMap.put(AppConst.IS_END, false);
// 将新的节点放入当前 map 中 // 将新的节点放入当前 map 中
@@ -96,7 +97,7 @@ public class SensitiveWordMap implements IWordMap {
} }
long endTime = System.currentTimeMillis(); long endTime = System.currentTimeMillis();
System.out.println("Init sensitive word map end! Cost time " + (endTime - startTime) + "ms"); System.out.println("Init sensitive word map end! Cost time: " + (endTime - startTime) + "ms");
} }
/** /**
@@ -109,13 +110,13 @@ public class SensitiveWordMap implements IWordMap {
* @since 0.0.1 * @since 0.0.1
*/ */
@Override @Override
public boolean contains(String string) { public boolean contains(String string, final IWordContext context) {
if (StringUtil.isEmpty(string)) { if (StringUtil.isEmpty(string)) {
return false; return false;
} }
for (int i = 0; i < string.length(); i++) { for (int i = 0; i < string.length(); i++) {
int checkResult = checkSensitiveWord(string, i, ValidModeEnum.FAIL_FAST); int checkResult = checkSensitiveWord(string, i, ValidModeEnum.FAIL_FAST, context);
// 快速返回 // 快速返回
if (checkResult > 0) { if (checkResult > 0) {
return true; return true;
@@ -134,13 +135,13 @@ public class SensitiveWordMap implements IWordMap {
* @since 0.0.1 * @since 0.0.1
*/ */
@Override @Override
public List<String> findAll(String string) { public List<String> findAll(String string, final IWordContext context) {
return getSensitiveWords(string, ValidModeEnum.FAIL_OVER); return getSensitiveWords(string, ValidModeEnum.FAIL_OVER, context);
} }
@Override @Override
public String findFirst(String string) { public String findFirst(String string, final IWordContext context) {
List<String> stringList = getSensitiveWords(string, ValidModeEnum.FAIL_FAST); List<String> stringList = getSensitiveWords(string, ValidModeEnum.FAIL_FAST, context);
if (CollectionUtil.isEmpty(stringList)) { if (CollectionUtil.isEmpty(stringList)) {
return null; return null;
@@ -150,12 +151,12 @@ public class SensitiveWordMap implements IWordMap {
} }
@Override @Override
public String replace(String target, char replaceChar) { public String replace(String target, char replaceChar, final IWordContext context) {
if(StringUtil.isEmpty(target)) { if(StringUtil.isEmpty(target)) {
return target; return target;
} }
return this.replaceSensitiveWord(target, ValidModeEnum.FAIL_OVER, replaceChar); return this.replaceSensitiveWord(target, replaceChar, context);
} }
/** /**
@@ -166,7 +167,8 @@ public class SensitiveWordMap implements IWordMap {
* @return 结果列表 * @return 结果列表
* @since 0.0.1 * @since 0.0.1
*/ */
private List<String> getSensitiveWords(final String text, final ValidModeEnum modeEnum) { private List<String> getSensitiveWords(final String text, final ValidModeEnum modeEnum,
final IWordContext context) {
//1. 是否存在敏感词,如果比存在,直接返回空列表 //1. 是否存在敏感词,如果比存在,直接返回空列表
if (StringUtil.isEmpty(text)) { if (StringUtil.isEmpty(text)) {
return Guavas.newArrayList(); return Guavas.newArrayList();
@@ -174,7 +176,7 @@ public class SensitiveWordMap implements IWordMap {
List<String> resultList = Guavas.newArrayList(); List<String> resultList = Guavas.newArrayList();
for (int i = 0; i < text.length(); i++) { for (int i = 0; i < text.length(); i++) {
int wordLength = checkSensitiveWord(text, i, ValidModeEnum.FAIL_OVER); int wordLength = checkSensitiveWord(text, i, ValidModeEnum.FAIL_OVER, context);
// 命中 // 命中
if (wordLength > 0) { if (wordLength > 0) {
@@ -215,19 +217,23 @@ public class SensitiveWordMap implements IWordMap {
* @param txt 文本信息 * @param txt 文本信息
* @param beginIndex 开始下标 * @param beginIndex 开始下标
* @param validModeEnum 验证模式 * @param validModeEnum 验证模式
* @param context 执行上下文
* @return 敏感词对应的长度 * @return 敏感词对应的长度
* @since 0.0.1 * @since 0.0.1
*/ */
private int checkSensitiveWord(final String txt, final int beginIndex, private int checkSensitiveWord(final String txt, final int beginIndex,
final ValidModeEnum validModeEnum) { final ValidModeEnum validModeEnum,
Map nowMap = sensitiveWordMap; final IWordContext context) {
Map nowMap = innerWordMap;
// 记录敏感词的长度 // 记录敏感词的长度
int lengthCount = 0; int lengthCount = 0;
int actualLength = 0; int actualLength = 0;
for (int i = beginIndex; i < txt.length(); i++) { for (int i = beginIndex; i < txt.length(); i++) {
char charKey = txt.charAt(i); char c = txt.charAt(i);
char charKey = getActualChar(c, context);
// 判断该字是否存在于敏感词库中 // 判断该字是否存在于敏感词库中
// 并且将 nowMap 替换为新的 map进入下一层的循环。 // 并且将 nowMap 替换为新的 map进入下一层的循环。
nowMap = (Map) nowMap.get(charKey); nowMap = (Map) nowMap.get(charKey);
@@ -256,16 +262,36 @@ public class SensitiveWordMap implements IWordMap {
return actualLength; return actualLength;
} }
/**
* 获取实际对应的符号
* @param c 编号
* @param context 上下文
* @return 结果
* @since 0.0.4
*/
private char getActualChar(final char c,
final IWordContext context) {
char resultChar = c;
if(context.ignoreCase()) {
resultChar = Character.toLowerCase(resultChar);
}
if(context.ignoreWidth()) {
resultChar = CharUtil.toHalfWidth(resultChar);
}
return resultChar;
}
/** /**
* 直接替换敏感词,返回替换后的结果 * 直接替换敏感词,返回替换后的结果
* @param target 文本信息 * @param target 文本信息
* @param validModeEnum 验证模式
* @return 脱敏后的字符串 * @return 脱敏后的字符串
* @since 0.0.2 * @since 0.0.2
*/ */
private String replaceSensitiveWord(final String target, private String replaceSensitiveWord(final String target,
final ValidModeEnum validModeEnum, final char replaceChar,
final char replaceChar) { final IWordContext context) {
if(StringUtil.isEmpty(target)) { if(StringUtil.isEmpty(target)) {
return target; return target;
} }
@@ -275,7 +301,7 @@ public class SensitiveWordMap implements IWordMap {
for (int i = 0; i < target.length(); i++) { for (int i = 0; i < target.length(); i++) {
char currentChar = target.charAt(i); char currentChar = target.charAt(i);
// 内层直接从 i 开始往后遍历,这个算法的,获取第一个匹配的单词 // 内层直接从 i 开始往后遍历,这个算法的,获取第一个匹配的单词
int wordLength = checkSensitiveWord(target, i, validModeEnum); int wordLength = checkSensitiveWord(target, i, ValidModeEnum.FAIL_OVER, context);
// 敏感词 // 敏感词
if(wordLength > 0) { if(wordLength > 0) {

View File

@@ -22,7 +22,7 @@ public class SensitiveWordBsTest {
public void containsTest() { public void containsTest() {
final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。"; final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。";
Assert.assertTrue(SensitiveWordBs.getInstance().contains(text)); Assert.assertTrue(SensitiveWordBs.newInstance().contains(text));
} }
/** /**
@@ -33,7 +33,7 @@ public class SensitiveWordBsTest {
public void findAllTest() { public void findAllTest() {
final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。"; final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。";
List<String> wordList = SensitiveWordBs.getInstance().findAll(text); List<String> wordList = SensitiveWordBs.newInstance().findAll(text);
Assert.assertEquals("[五星红旗, 毛主席, 天安门]", wordList.toString()); Assert.assertEquals("[五星红旗, 毛主席, 天安门]", wordList.toString());
} }
@@ -45,7 +45,7 @@ public class SensitiveWordBsTest {
public void findFirstTest() { public void findFirstTest() {
final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。"; final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。";
String word = SensitiveWordBs.getInstance().findFirst(text); String word = SensitiveWordBs.newInstance().findFirst(text);
Assert.assertEquals("五星红旗", word); Assert.assertEquals("五星红旗", word);
} }
@@ -57,7 +57,7 @@ public class SensitiveWordBsTest {
public void replaceTest() { public void replaceTest() {
final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。"; final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。";
String result = SensitiveWordBs.getInstance().replace(text); String result = SensitiveWordBs.newInstance().replace(text);
Assert.assertEquals("****迎风飘扬,***的画像屹立在***前。", result); Assert.assertEquals("****迎风飘扬,***的画像屹立在***前。", result);
} }
@@ -69,8 +69,32 @@ public class SensitiveWordBsTest {
public void replaceCharTest() { public void replaceCharTest() {
final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。"; final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。";
String result = SensitiveWordBs.getInstance().replace(text, '0'); String result = SensitiveWordBs.newInstance().replace(text, '0');
Assert.assertEquals("0000迎风飘扬000的画像屹立在000前。", result); Assert.assertEquals("0000迎风飘扬000的画像屹立在000前。", result);
} }
/**
* 忽略大小写
* @since 0.0.4
*/
@Test
public void ignoreCaseTest() {
final String text = "fuCK the bad words.";
String word = SensitiveWordBs.newInstance().findFirst(text);
Assert.assertEquals("fuCK", word);
}
/**
* 忽略半角圆角
* @since 0.0.4
*/
@Test
public void ignoreWidthTest() {
final String text = " the bad words.";
String word = SensitiveWordBs.newInstance().findFirst(text);
Assert.assertEquals("", word);
}
} }