[Feature] add for new

This commit is contained in:
binbin.hou
2020-01-09 13:34:43 +08:00
parent 200a60c3ba
commit fa9348d55d
13 changed files with 363 additions and 77 deletions

View File

@@ -1,6 +1,6 @@
# sensitive-word
[sensitive-word](https://github.com/houbb/sensitive-word) 基于 DFA 算法实现的敏感词工具。
[sensitive-word](https://github.com/houbb/sensitive-word) 基于 DFA 算法实现的高性能敏感词工具。
[![Maven Central](https://maven-badges.herokuapp.com/maven-central/com.github.houbb/sensitive-word/badge.svg)](http://mvnrepository.com/artifact/com.github.houbb/sensitive-word)
@@ -10,21 +10,25 @@
实现一款好用的敏感词工具。
基于 DFA 算法实现,目前敏感词库内容收录 18W+ 感觉过于臃肿
基于 DFA 算法实现,目前敏感词库内容收录 6W+(源文件 18W+,经过一次删减)
后期将进行相关优化,降低字典的数量
后期将进行持续优化和补充敏感词库,并进一步提升算法的性能
希望可以细化敏感词的分类,感觉工作量比较大,暂时没有太好的思路
希望可以细化敏感词的分类,感觉工作量比较大,暂时没有进行
## 后期目标
## 特性
- 持续扩容对应的敏感词(如合法的数据抓取)
- 6W+ 词库,且不断优化更新
- 添加英文大小写忽略,全角半角忽略
- 基于 DFA 算法,性能很好
- 中文添加拼音相关转换,添加繁简体转换忽略
- 基于 fluent-api 实现,优雅方便
- 允许用户自定义敏感词和白名单
- 支持敏感词的判断、返回、脱敏等常见操作
- 支持全角半角互换
- 支持英文大小写互换
# 快速开始
@@ -40,10 +44,22 @@
<dependency>
<groupId>com.github.houbb</groupId>
<artifactId>sensitive-word</artifactId>
<version>0.0.3</version>
<version>0.0.4</version>
</dependency>
```
## api 概览
`SensitiveWordBs` 作为敏感词的引导类,核心方法如下:
| 方法 | 参数 | 返回值| 说明 |
|:---|:---|:---|:---|
| newInstance() | 无 | 引导类 | 初始化引导类 |
| contains(String) | 待验证的字符串 | 布尔值 | 验证字符串是否包含敏感词 |
| findAll(String) | 待验证的字符串 | 字符串列表 | 返回字符串中所有敏感词 |
| replace(String, char) | 使用指定的 char 替换敏感词 | 字符串 | 返回脱敏后的字符串 |
| replace(String) | 使用 `*` 替换敏感词 | 字符串 | 返回脱敏后的字符串 |
## 使用实例
所有测试案例参见 [SensitiveWordBsTest](https://github.com/houbb/sensitive-word/blob/master/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsTest.java)
@@ -53,7 +69,7 @@
```java
final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。";
Assert.assertTrue(SensitiveWordBs.getInstance().contains(text));
Assert.assertTrue(SensitiveWordBs.newInstance().contains(text));
```
### 返回第一个敏感词
@@ -61,7 +77,7 @@ Assert.assertTrue(SensitiveWordBs.getInstance().contains(text));
```java
final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。";
String word = SensitiveWordBs.getInstance().findFirst(text);
String word = SensitiveWordBs.newInstance().findFirst(text);
Assert.assertEquals("五星红旗", word);
```
@@ -70,7 +86,7 @@ Assert.assertEquals("五星红旗", word);
```java
final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。";
List<String> wordList = SensitiveWordBs.getInstance().findAll(text);
List<String> wordList = SensitiveWordBs.newInstance().findAll(text);
Assert.assertEquals("[五星红旗, 毛主席, 天安门]", wordList.toString());
```
@@ -78,7 +94,7 @@ Assert.assertEquals("[五星红旗, 毛主席, 天安门]", wordList.toString())
```java
final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。";
String result = SensitiveWordBs.getInstance().replace(text);
String result = SensitiveWordBs.newInstance().replace(text);
Assert.assertEquals("****迎风飘扬,***的画像屹立在***前。", result);
```
@@ -86,6 +102,46 @@ Assert.assertEquals("****迎风飘扬,***的画像屹立在***前。", result)
```java
final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。";
String result = SensitiveWordBs.getInstance().replace(text, '0');
String result = SensitiveWordBs.newInstance().replace(text, '0');
Assert.assertEquals("0000迎风飘扬000的画像屹立在000前。", result);
```
# 更多特性
后续的诸多特性,主要是针对各种情况的处理,尽可能地提升敏感词命中率。
这是一场漫长的攻防之战。
## 忽略大小写
```java
final String text = "fuCK the bad words.";
String word = SensitiveWordBs.newInstance().findFirst(text);
Assert.assertEquals("fuCK", word);
```
## 忽略半角全角
```java
final String text = "ｆｕｃｋ the bad words.";
String word = SensitiveWordBs.newInstance().findFirst(text);
Assert.assertEquals("ｆｕｃｋ", word);
```
# 后期 road-map
- 繁简体互换
- 重复词
- 停顿词
- 拼音互换
- 用户自定义敏感词和白名单
- 文字镜像翻转
- 敏感词标签支持

View File

@@ -1,3 +1,12 @@
# 字符
全部使用小写+半角的形式匹配。
## 忽略大小写
if(Character.isLetter) {
ignoreCase=true
ignoreWidth=true
}

View File

@@ -9,3 +9,9 @@
2数字
对于数字,除却象形,最常用的就是谐音。
## 不可变性
这个涉及到拼音的 DFA 树构建,可能需要 wordMap 提供一个添加的接口。
这个需要在初始化的时候,直接指定。而且不可变化。

View File

@@ -52,6 +52,10 @@
<groupId>com.github.houbb</groupId>
<artifactId>heaven</artifactId>
</exclusion>
<exclusion>
<groupId>com.huaban</groupId>
<artifactId>jieba-analysis</artifactId>
</exclusion>
</exclusions>
</dependency>
<!--============================== OTHER ==============================-->

View File

@@ -0,0 +1,39 @@
package com.github.houbb.sensitive.word.api;
/**
 * Execution context for sensitive-word matching.
 *
 * <p>Carries the per-invocation matching options (case handling and
 * full-width/half-width handling) that the word map consults while
 * walking the DFA.</p>
 *
 * @author binbin.hou
 * @since 0.0.4
 */
public interface IWordContext {

    /**
     * Whether matching ignores letter case.
     *
     * @return true when upper/lower case differences are ignored
     * @since 0.0.4
     */
    boolean ignoreCase();

    /**
     * Whether matching ignores character width (full-width vs half-width).
     *
     * @return true when full-width/half-width differences are ignored
     * @since 0.0.4
     */
    boolean ignoreWidth();

    /**
     * Set whether matching ignores letter case.
     *
     * @param ignoreCase true to ignore case
     * @return this (fluent style)
     * @since 0.0.4
     */
    IWordContext ignoreCase(boolean ignoreCase);

    /**
     * Set whether matching ignores character width (full-width vs half-width).
     *
     * @param ignoreWidth true to ignore width
     * @return this (fluent style)
     * @since 0.0.4
     */
    IWordContext ignoreWidth(boolean ignoreWidth);
}

View File

@@ -1,6 +1,5 @@
package com.github.houbb.sensitive.word.api;
import com.github.houbb.heaven.util.lang.StringUtil;
import com.github.houbb.sensitive.word.constant.enums.ValidModeEnum;
import java.util.Collection;
@@ -24,28 +23,34 @@ public interface IWordMap {
/**
* 是否包含敏感词
* @param string 字符串
* @param context 上下文
* @return 是否包含
* @since 0.0.1
* @see ValidModeEnum#FAIL_FAST 建议使用快速返回模式
*/
boolean contains(final String string);
boolean contains(final String string,
final IWordContext context);
/**
* 返回所有对应的敏感词
* @param string 原始字符串
* @param context 上下文
* @return 结果
* @since 0.0.1
* @see ValidModeEnum#FAIL_OVER 建议使用全部检测返回模式
*/
List<String> findAll(final String string);
List<String> findAll(final String string,
final IWordContext context);
/**
* 返回第一个对应的敏感词
* @param string 原始字符串
* @param context 上下文
* @return 结果
* @since 0.0.1
*/
String findFirst(final String string);
String findFirst(final String string,
final IWordContext context);
/**
* 替换所有敏感词内容
@@ -54,9 +59,11 @@ public interface IWordMap {
*
* @param target 目标字符串
* @param replaceChar 替换为的 char
* @param context 上下文
* @return 替换后结果
* @since 0.0.2
*/
String replace(final String target, final char replaceChar);
String replace(final String target, final char replaceChar,
final IWordContext context);
}

View File

@@ -1,7 +1,7 @@
package com.github.houbb.sensitive.word.bs;
import com.github.houbb.heaven.constant.CharConst;
import com.github.houbb.heaven.support.instance.impl.Instances;
import com.github.houbb.sensitive.word.api.IWordContext;
import com.github.houbb.sensitive.word.api.IWordData;
import com.github.houbb.sensitive.word.api.IWordMap;
import com.github.houbb.sensitive.word.support.data.SensitiveWordData;
@@ -22,41 +22,68 @@ public class SensitiveWordBs {
*/
private SensitiveWordBs(){}
/**
* 敏感数据信息
* @since 0.0.1
*/
private IWordData sensitiveWordData = Instances.singleton(SensitiveWordData.class);
/**
* 敏感词 map
* @since 0.0.1
*/
private IWordMap sensitiveWordMap = Instances.singleton(SensitiveWordMap.class);
private static volatile IWordMap sensitiveWordMap;
/**
* 获取单例信息
* @since 0.0.1
* 默认的执行上下文
* @since 0.0.4
*/
private static final SensitiveWordBs INSTANCE;
private volatile IWordContext context;
static {
synchronized (SensitiveWordBs.class) {
INSTANCE = new SensitiveWordBs();
List<String> lines = INSTANCE.sensitiveWordData.getWordData();
INSTANCE.sensitiveWordMap.initWordMap(lines);
/**
* DCL 初始化 wordMap 信息
* @return 初始化后的结果
* @since 0.0.4
*/
private static IWordMap initWordMap() {
if(sensitiveWordMap == null) {
synchronized (IWordMap.class) {
if(sensitiveWordMap == null) {
// 加载配置信息
IWordData wordData = new SensitiveWordData();
List<String> lines = wordData.getWordData();
// 初始化 DFA 信息
sensitiveWordMap = new SensitiveWordMap();
sensitiveWordMap.initWordMap(lines);
}
}
}
return sensitiveWordMap;
}
/**
* 新建验证实例
*
* double-lock
* @return this
* @since 0.0.1
*/
public static SensitiveWordBs getInstance() {
return INSTANCE;
public static SensitiveWordBs newInstance() {
initWordMap();
SensitiveWordBs bs = new SensitiveWordBs();
bs.context = buildDefaultContext();
return bs;
}
/**
* 构建默认的上下文
* @return 结果
* @since 0.0.4
*/
private static IWordContext buildDefaultContext() {
IWordContext wordContext = SensitiveWordContext.newInstance();
wordContext.ignoreCase(true);
wordContext.ignoreWidth(true);
return wordContext;
}
/**
* 是否包含敏感词
* @param target 目标字符串
@@ -64,7 +91,7 @@ public class SensitiveWordBs {
* @since 0.0.1
*/
public boolean contains(final String target) {
return this.sensitiveWordMap.contains(target);
return sensitiveWordMap.contains(target, context);
}
/**
@@ -76,7 +103,7 @@ public class SensitiveWordBs {
* @since 0.0.1
*/
public List<String> findAll(final String target) {
return this.sensitiveWordMap.findAll(target);
return sensitiveWordMap.findAll(target, context);
}
/**
@@ -87,7 +114,7 @@ public class SensitiveWordBs {
* @since 0.0.1
*/
public String findFirst(final String target) {
return this.sensitiveWordMap.findFirst(target);
return sensitiveWordMap.findFirst(target, context);
}
/**
@@ -98,7 +125,7 @@ public class SensitiveWordBs {
* @since 0.0.2
*/
public String replace(final String target, final char replaceChar) {
return this.sensitiveWordMap.replace(target, replaceChar);
return sensitiveWordMap.replace(target, replaceChar, context);
}
/**

View File

@@ -0,0 +1,88 @@
package com.github.houbb.sensitive.word.bs;
import com.github.houbb.sensitive.word.api.IWordContext;
/**
 * Default {@link IWordContext} implementation.
 *
 * <p>Plain mutable holder for the matching options, configured through a
 * fluent API. Instances are created via {@link #newInstance()}.</p>
 *
 * @author binbin.hou
 * @since 0.0.4
 */
public class SensitiveWordContext implements IWordContext {

    /**
     * Ignore upper/lower case when matching.
     * @since 0.0.4
     */
    private boolean ignoreCase;

    /**
     * Ignore full-width/half-width differences when matching.
     * @since 0.0.4
     */
    private boolean ignoreWidth;

    /**
     * Private constructor — use {@link #newInstance()}.
     * @since 0.0.4
     */
    private SensitiveWordContext() {
    }

    /**
     * Create a fresh context instance.
     *
     * @return a new, unconfigured context
     * @since 0.0.4
     */
    public static SensitiveWordContext newInstance() {
        return new SensitiveWordContext();
    }

    @Override
    public boolean ignoreCase() {
        return ignoreCase;
    }

    @Override
    public SensitiveWordContext ignoreCase(boolean ignoreCase) {
        this.ignoreCase = ignoreCase;
        return this;
    }

    @Override
    public boolean ignoreWidth() {
        return ignoreWidth;
    }

    @Override
    public SensitiveWordContext ignoreWidth(boolean ignoreWidth) {
        this.ignoreWidth = ignoreWidth;
        return this;
    }

    /**
     * Lazy holder for the shared default context
     * (initialization-on-demand holder idiom).
     */
    private static class ContextHolder {
        private static final SensitiveWordContext INSTANCE =
                newInstance().ignoreCase(true).ignoreWidth(true);
    }

    /**
     * Default configuration: case and width are both ignored.
     *
     * @return the shared default context
     * @since 0.0.4
     */
    private static SensitiveWordContext defaultContext() {
        return ContextHolder.INSTANCE;
    }

    @Override
    public String toString() {
        final StringBuilder buffer = new StringBuilder("SensitiveWordContext{");
        buffer.append("ignoreCase=").append(ignoreCase);
        buffer.append(", ignoreWidth=").append(ignoreWidth);
        buffer.append('}');
        return buffer.toString();
    }
}

View File

@@ -5,7 +5,7 @@ package com.github.houbb.sensitive.word.constant.enums;
* <p> create on 2020/1/7 22:46 </p>
*
* @author Administrator
* @since 1.0.0
* @since 0.0.1
*/
public enum ValidModeEnum {

View File

@@ -43,8 +43,8 @@ public class CheckSensitiveWordResult {
return sensitiveWordSize;
}
public CheckSensitiveWordResult sentiveWordSize(int sentiveWordSize) {
this.sensitiveWordSize = sentiveWordSize;
public CheckSensitiveWordResult sentiveWordSize(int sensitiveWordSize) {
this.sensitiveWordSize = sensitiveWordSize;
return this;
}

View File

@@ -7,6 +7,7 @@ import com.github.houbb.heaven.util.lang.ObjectUtil;
import com.github.houbb.heaven.util.lang.StringUtil;
import com.github.houbb.heaven.util.util.CollectionUtil;
import com.github.houbb.heaven.util.util.MapUtil;
import com.github.houbb.sensitive.word.api.IWordContext;
import com.github.houbb.sensitive.word.api.IWordMap;
import com.github.houbb.sensitive.word.constant.AppConst;
import com.github.houbb.sensitive.word.constant.enums.ValidModeEnum;
@@ -30,7 +31,7 @@ public class SensitiveWordMap implements IWordMap {
*
* @since 0.0.1
*/
private static Map sensitiveWordMap;
private Map innerWordMap;
/**
* 读取敏感词库将敏感词放入HashSet中构建一个DFA算法模型
@@ -46,13 +47,13 @@ public class SensitiveWordMap implements IWordMap {
@SuppressWarnings("unchecked")
public void initWordMap(Collection<String> collection) {
// 避免重复加载
if (MapUtil.isNotEmpty(sensitiveWordMap)) {
if (MapUtil.isNotEmpty(innerWordMap)) {
return;
}
long startTime = System.currentTimeMillis();
// 避免扩容带来的消耗
sensitiveWordMap = new HashMap(collection.size());
innerWordMap = new HashMap(collection.size());
for (String key : collection) {
if (StringUtil.isEmpty(key)) {
@@ -64,7 +65,7 @@ public class SensitiveWordMap implements IWordMap {
final int size = chars.length;
// 每一个新词的循环,直接将结果设置为当前 map所有变化都会体现在结果的 map 中
Map currentMap = sensitiveWordMap;
Map currentMap = innerWordMap;
for (int i = 0; i < size; i++) {
// 截取敏感词当中的字在敏感词库中字为HashMap对象的Key键值
@@ -78,7 +79,7 @@ public class SensitiveWordMap implements IWordMap {
currentMap = (Map) wordMap;
} else {
//不存在则则构建一个新的map同时将isEnd设置为0因为他不是最后一
Map<String, Boolean> newWordMap = new HashMap<>();
Map<String, Boolean> newWordMap = new HashMap<>(8);
newWordMap.put(AppConst.IS_END, false);
// 将新的节点放入当前 map 中
@@ -96,7 +97,7 @@ public class SensitiveWordMap implements IWordMap {
}
long endTime = System.currentTimeMillis();
System.out.println("Init sensitive word map end! Cost time " + (endTime - startTime) + "ms");
System.out.println("Init sensitive word map end! Cost time: " + (endTime - startTime) + "ms");
}
/**
@@ -109,13 +110,13 @@ public class SensitiveWordMap implements IWordMap {
* @since 0.0.1
*/
@Override
public boolean contains(String string) {
public boolean contains(String string, final IWordContext context) {
if (StringUtil.isEmpty(string)) {
return false;
}
for (int i = 0; i < string.length(); i++) {
int checkResult = checkSensitiveWord(string, i, ValidModeEnum.FAIL_FAST);
int checkResult = checkSensitiveWord(string, i, ValidModeEnum.FAIL_FAST, context);
// 快速返回
if (checkResult > 0) {
return true;
@@ -134,13 +135,13 @@ public class SensitiveWordMap implements IWordMap {
* @since 0.0.1
*/
@Override
public List<String> findAll(String string) {
return getSensitiveWords(string, ValidModeEnum.FAIL_OVER);
public List<String> findAll(String string, final IWordContext context) {
return getSensitiveWords(string, ValidModeEnum.FAIL_OVER, context);
}
@Override
public String findFirst(String string) {
List<String> stringList = getSensitiveWords(string, ValidModeEnum.FAIL_FAST);
public String findFirst(String string, final IWordContext context) {
List<String> stringList = getSensitiveWords(string, ValidModeEnum.FAIL_FAST, context);
if (CollectionUtil.isEmpty(stringList)) {
return null;
@@ -150,12 +151,12 @@ public class SensitiveWordMap implements IWordMap {
}
@Override
public String replace(String target, char replaceChar) {
public String replace(String target, char replaceChar, final IWordContext context) {
if(StringUtil.isEmpty(target)) {
return target;
}
return this.replaceSensitiveWord(target, ValidModeEnum.FAIL_OVER, replaceChar);
return this.replaceSensitiveWord(target, replaceChar, context);
}
/**
@@ -166,7 +167,8 @@ public class SensitiveWordMap implements IWordMap {
* @return 结果列表
* @since 0.0.1
*/
private List<String> getSensitiveWords(final String text, final ValidModeEnum modeEnum) {
private List<String> getSensitiveWords(final String text, final ValidModeEnum modeEnum,
final IWordContext context) {
//1. 是否存在敏感词,如果比存在,直接返回空列表
if (StringUtil.isEmpty(text)) {
return Guavas.newArrayList();
@@ -174,7 +176,7 @@ public class SensitiveWordMap implements IWordMap {
List<String> resultList = Guavas.newArrayList();
for (int i = 0; i < text.length(); i++) {
int wordLength = checkSensitiveWord(text, i, ValidModeEnum.FAIL_OVER);
int wordLength = checkSensitiveWord(text, i, ValidModeEnum.FAIL_OVER, context);
// 命中
if (wordLength > 0) {
@@ -215,19 +217,23 @@ public class SensitiveWordMap implements IWordMap {
* @param txt 文本信息
* @param beginIndex 开始下标
* @param validModeEnum 验证模式
* @param context 执行上下文
* @return 敏感词对应的长度
* @since 0.0.1
*/
private int checkSensitiveWord(final String txt, final int beginIndex,
final ValidModeEnum validModeEnum) {
Map nowMap = sensitiveWordMap;
final ValidModeEnum validModeEnum,
final IWordContext context) {
Map nowMap = innerWordMap;
// 记录敏感词的长度
int lengthCount = 0;
int actualLength = 0;
for (int i = beginIndex; i < txt.length(); i++) {
char charKey = txt.charAt(i);
char c = txt.charAt(i);
char charKey = getActualChar(c, context);
// 判断该字是否存在于敏感词库中
// 并且将 nowMap 替换为新的 map进入下一层的循环。
nowMap = (Map) nowMap.get(charKey);
@@ -256,16 +262,36 @@ public class SensitiveWordMap implements IWordMap {
return actualLength;
}
/**
* 获取实际对应的符号
* @param c 编号
* @param context 上下文
* @return 结果
* @since 0.0.4
*/
private char getActualChar(final char c,
final IWordContext context) {
char resultChar = c;
if(context.ignoreCase()) {
resultChar = Character.toLowerCase(resultChar);
}
if(context.ignoreWidth()) {
resultChar = CharUtil.toHalfWidth(resultChar);
}
return resultChar;
}
/**
* 直接替换敏感词,返回替换后的结果
* @param target 文本信息
* @param validModeEnum 验证模式
* @return 脱敏后的字符串
* @since 0.0.2
*/
private String replaceSensitiveWord(final String target,
final ValidModeEnum validModeEnum,
final char replaceChar) {
final char replaceChar,
final IWordContext context) {
if(StringUtil.isEmpty(target)) {
return target;
}
@@ -275,7 +301,7 @@ public class SensitiveWordMap implements IWordMap {
for (int i = 0; i < target.length(); i++) {
char currentChar = target.charAt(i);
// 内层直接从 i 开始往后遍历,这个算法的,获取第一个匹配的单词
int wordLength = checkSensitiveWord(target, i, validModeEnum);
int wordLength = checkSensitiveWord(target, i, ValidModeEnum.FAIL_OVER, context);
// 敏感词
if(wordLength > 0) {

View File

@@ -22,7 +22,7 @@ public class SensitiveWordBsTest {
public void containsTest() {
final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。";
Assert.assertTrue(SensitiveWordBs.getInstance().contains(text));
Assert.assertTrue(SensitiveWordBs.newInstance().contains(text));
}
/**
@@ -33,7 +33,7 @@ public class SensitiveWordBsTest {
public void findAllTest() {
final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。";
List<String> wordList = SensitiveWordBs.getInstance().findAll(text);
List<String> wordList = SensitiveWordBs.newInstance().findAll(text);
Assert.assertEquals("[五星红旗, 毛主席, 天安门]", wordList.toString());
}
@@ -45,7 +45,7 @@ public class SensitiveWordBsTest {
public void findFirstTest() {
final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。";
String word = SensitiveWordBs.getInstance().findFirst(text);
String word = SensitiveWordBs.newInstance().findFirst(text);
Assert.assertEquals("五星红旗", word);
}
@@ -57,7 +57,7 @@ public class SensitiveWordBsTest {
public void replaceTest() {
final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。";
String result = SensitiveWordBs.getInstance().replace(text);
String result = SensitiveWordBs.newInstance().replace(text);
Assert.assertEquals("****迎风飘扬,***的画像屹立在***前。", result);
}
@@ -69,8 +69,32 @@ public class SensitiveWordBsTest {
public void replaceCharTest() {
final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。";
String result = SensitiveWordBs.getInstance().replace(text, '0');
String result = SensitiveWordBs.newInstance().replace(text, '0');
Assert.assertEquals("0000迎风飘扬000的画像屹立在000前。", result);
}
/**
* 忽略大小写
* @since 0.0.4
*/
@Test
public void ignoreCaseTest() {
final String text = "fuCK the bad words.";
String word = SensitiveWordBs.newInstance().findFirst(text);
Assert.assertEquals("fuCK", word);
}
/**
* 忽略半角圆角
* @since 0.0.4
*/
@Test
public void ignoreWidthTest() {
final String text = " the bad words.";
String word = SensitiveWordBs.newInstance().findFirst(text);
Assert.assertEquals("", word);
}
}