mirror of
https://github.com/houbb/sensitive-word.git
synced 2026-03-22 08:27:36 +08:00
release branch 0.0.8
This commit is contained in:
30
README.md
30
README.md
@@ -36,6 +36,8 @@
|
||||
|
||||
- 支持英文常见形式的互换
|
||||
|
||||
- 支持用户自定义敏感词和白名单
|
||||
|
||||
## 变更日志
|
||||
|
||||
[CHANGE_LOG.md](https://github.com/houbb/sensitive-word/blob/master/doc/CHANGE_LOG.md)
|
||||
@@ -54,7 +56,7 @@
|
||||
<dependency>
|
||||
<groupId>com.github.houbb</groupId>
|
||||
<artifactId>sensitive-word</artifactId>
|
||||
<version>0.0.7</version>
|
||||
<version>0.0.8</version>
|
||||
</dependency>
|
||||
```
|
||||
|
||||
@@ -178,14 +180,36 @@ List<String> wordList = SensitiveWordBs.newInstance().findAll(text);
|
||||
Assert.assertEquals("[ⒻⒻⒻfⓤuⓤ⒰cⓒ⒦]", wordList.toString());
|
||||
```
|
||||
|
||||
# 用户自定义
|
||||
|
||||
## 敏感词和白名单
|
||||
|
||||
直接在 resources 目录下新建文件,每一行对应一个敏感词。
|
||||
|
||||
`sensitive_word_deny.txt` 代表用户自定义敏感词文件。
|
||||
|
||||
`sensitive_word_allow.txt` 代表用户自定义白名单文件。
|
||||
|
||||
## 测试
|
||||
|
||||
我们在敏感词文件中加入一行,内容为 `自定义敏感词`,同时在白名单文件中加入一行,
|
||||
内容为 `gender` 作为用户不认为是敏感词的信息。
|
||||
|
||||
- 测试代码
|
||||
|
||||
```java
|
||||
final String text = "gender 我们认为应该通过,自定义敏感词我们认为应该拒绝。";
|
||||
|
||||
List<String> wordList = SensitiveWordBs.newInstance().findAll(text);
|
||||
Assert.assertEquals("[自定义敏感词]", wordList.toString());
|
||||
```
|
||||
|
||||
# 后期 road-map
|
||||
|
||||
- 停顿词
|
||||
|
||||
- 拼音互换
|
||||
|
||||
- 用户自定义敏感词和白名单
|
||||
|
||||
- 文字镜像翻转
|
||||
|
||||
- 敏感词标签支持
|
||||
|
||||
@@ -61,4 +61,10 @@
|
||||
|
||||
| 序号 | 变更类型 | 说明 | 时间 | 备注 |
|
||||
|:---|:---|:---|:---|:--|
|
||||
| 1 | A | 添加忽略重复词支持 | 2020-1-10 09:34:35 | |
|
||||
| 1 | A | 添加忽略重复词支持 | 2020-1-10 09:34:35 | |
|
||||
|
||||
# release_0.0.8
|
||||
|
||||
| 序号 | 变更类型 | 说明 | 时间 | 备注 |
|
||||
|:---|:---|:---|:---|:--|
|
||||
| 1 | A | 添加用户自定义敏感词和白名单 | 2020-1-10 09:34:35 | |
|
||||
2
pom.xml
2
pom.xml
@@ -6,7 +6,7 @@
|
||||
|
||||
<groupId>com.github.houbb</groupId>
|
||||
<artifactId>sensitive-word</artifactId>
|
||||
<version>0.0.8-SNAPSHOT</version>
|
||||
<version>0.0.8</version>
|
||||
|
||||
<properties>
|
||||
<!--============================== All Plugins START ==============================-->
|
||||
|
||||
@@ -10,9 +10,9 @@ ECHO "============================= RELEASE START..."
|
||||
|
||||
:: 版本号信息(需要手动指定)
|
||||
:::: 旧版本名称
|
||||
SET version=0.0.7
|
||||
SET version=0.0.8
|
||||
:::: 新版本名称
|
||||
SET newVersion=0.0.8
|
||||
SET newVersion=0.0.9
|
||||
:::: 组织名称
|
||||
SET groupName=com.github.houbb
|
||||
:::: 项目名称
|
||||
|
||||
@@ -31,4 +31,16 @@ public final class AppConst {
|
||||
*/
|
||||
public static final int DICT_EN_SIZE = 12;
|
||||
|
||||
/**
|
||||
* 拒绝的词语
|
||||
* @since 0.0.8
|
||||
*/
|
||||
public static final String SENSITIVE_WORD_DENY_PATH = "/sensitive_word_deny.txt";
|
||||
|
||||
/**
|
||||
* 用户允许的词语
|
||||
* @since 0.0.8
|
||||
*/
|
||||
public static final String SENSITIVE_WORD_ALLOW_PATH = "/sensitive_word_allow.txt";
|
||||
|
||||
}
|
||||
|
||||
@@ -1,69 +0,0 @@
|
||||
package com.github.houbb.sensitive.word.model;
|
||||
|
||||
/**
|
||||
* 检测敏感词结果
|
||||
*
|
||||
* TODO: 这里需要结合 KMP 和 暴力匹配算法。
|
||||
*
|
||||
* 暂时不使用,后期会使用到。
|
||||
* @author binbin.hou
|
||||
* @since 0.0.2
|
||||
*/
|
||||
@Deprecated
|
||||
public class CheckSensitiveWordResult {
|
||||
|
||||
/**
|
||||
* 是否匹配到了敏感词
|
||||
* @since 0.0.2
|
||||
*/
|
||||
private boolean hasMatch;
|
||||
|
||||
/**
|
||||
* 敏感词长度
|
||||
* @since 0.0.2
|
||||
*/
|
||||
private int sensitiveWordSize;
|
||||
|
||||
/**
|
||||
* 普通单词的长度
|
||||
* @since 0.0.2
|
||||
*/
|
||||
private int commonWordSize;
|
||||
|
||||
public boolean hasMatch() {
|
||||
return hasMatch;
|
||||
}
|
||||
|
||||
public CheckSensitiveWordResult hasMatch(boolean hasMatch) {
|
||||
this.hasMatch = hasMatch;
|
||||
return this;
|
||||
}
|
||||
|
||||
public int sentiveWordSize() {
|
||||
return sensitiveWordSize;
|
||||
}
|
||||
|
||||
public CheckSensitiveWordResult sentiveWordSize(int sensitiveWordSize) {
|
||||
this.sensitiveWordSize = sensitiveWordSize;
|
||||
return this;
|
||||
}
|
||||
|
||||
public int commonWordSize() {
|
||||
return commonWordSize;
|
||||
}
|
||||
|
||||
public CheckSensitiveWordResult commonWordSize(int commonWordSize) {
|
||||
this.commonWordSize = commonWordSize;
|
||||
return this;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "CheckSensitiveWordResult{" +
|
||||
"hasMatch=" + hasMatch +
|
||||
", sensitiveWordSize=" + sensitiveWordSize +
|
||||
", commonWordSize=" + commonWordSize +
|
||||
'}';
|
||||
}
|
||||
|
||||
}
|
||||
@@ -3,6 +3,7 @@ package com.github.houbb.sensitive.word.support.data;
|
||||
import com.github.houbb.heaven.annotation.ThreadSafe;
|
||||
import com.github.houbb.heaven.util.guava.Guavas;
|
||||
import com.github.houbb.heaven.util.io.StreamUtil;
|
||||
import com.github.houbb.heaven.util.util.CollectionUtil;
|
||||
import com.github.houbb.sensitive.word.api.IWordData;
|
||||
import com.github.houbb.sensitive.word.constant.AppConst;
|
||||
|
||||
@@ -30,6 +31,15 @@ public class SensitiveWordData implements IWordData {
|
||||
defaultLines = Guavas.newArrayList(AppConst.DICT_SIZE+AppConst.DICT_EN_SIZE);
|
||||
defaultLines = StreamUtil.readAllLines("/dict.txt");
|
||||
defaultLines.addAll(StreamUtil.readAllLines("/dict_en.txt"));
|
||||
|
||||
// 用户自定义
|
||||
List<String> denyList = StreamUtil.readAllLines("/sensitive_word_deny.txt");
|
||||
defaultLines.addAll(denyList);
|
||||
|
||||
// 移除白名单词语
|
||||
List<String> allowList = StreamUtil.readAllLines("/sensitive_word_allow.txt");
|
||||
defaultLines = CollectionUtil.difference(defaultLines, allowList);
|
||||
|
||||
long end = System.currentTimeMillis();
|
||||
System.out.println("Sensitive data loaded!, cost time: " + (end - start) + "ms");
|
||||
}
|
||||
|
||||
0
src/main/resources/sensitive_word_allow.txt
Normal file
0
src/main/resources/sensitive_word_allow.txt
Normal file
0
src/main/resources/sensitive_word_deny.txt
Normal file
0
src/main/resources/sensitive_word_deny.txt
Normal file
@@ -0,0 +1,29 @@
|
||||
package com.github.houbb.sensitive.word.bs;
|
||||
|
||||
import org.junit.Assert;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* <p> project: sensitive-word-SensitiveWordBsUserDefineTest </p>
|
||||
* <p> create on 2020/1/7 23:43 </p>
|
||||
*
|
||||
* @author Administrator
|
||||
* @since 0.0.8
|
||||
*/
|
||||
public class SensitiveWordBsUserDefineTest {
|
||||
|
||||
/**
|
||||
* 自定义允许和拒绝的文件
|
||||
* @since 0.0.8
|
||||
*/
|
||||
@Test
|
||||
public void allowAndDenyTest() {
|
||||
final String text = "gender 我们认为应该通过,自定义敏感词我们认为应该拒绝。";
|
||||
|
||||
List<String> wordList = SensitiveWordBs.newInstance().findAll(text);
|
||||
Assert.assertEquals("[自定义敏感词]", wordList.toString());
|
||||
}
|
||||
|
||||
}
|
||||
1
src/test/resources/sensitive_word_allow.txt
Normal file
1
src/test/resources/sensitive_word_allow.txt
Normal file
@@ -0,0 +1 @@
|
||||
gender
|
||||
1
src/test/resources/sensitive_word_deny.txt
Normal file
1
src/test/resources/sensitive_word_deny.txt
Normal file
@@ -0,0 +1 @@
|
||||
自定义敏感词
|
||||
Reference in New Issue
Block a user