mirror of
https://github.com/houbb/sensitive-word.git
synced 2026-03-22 08:27:36 +08:00
release branch 0.10.0
This commit is contained in:
@@ -206,3 +206,9 @@
|
||||
| 序号 | 变更类型 | 说明 | 时间 | 备注 |
|
||||
|:---|:-----|-----------------|:--------------------|:------|
|
||||
| 1 | O | 移除单个汉字+部分常用词的脏词 | 2023-11-17 23:51:58 | 降低误判率 |
|
||||
|
||||
# release_0.10.0
|
||||
|
||||
| 序号 | 变更类型 | 说明 | 时间 | 备注 |
|
||||
|:---|:-----|------------------|:--------------------|:------|
|
||||
| 1 | A | 添加脏词的标签接口,便于后续拓展 | 2023-12-05 23:51:58 | |
|
||||
|
||||
95
README.md
95
README.md
@@ -40,6 +40,8 @@
|
||||
|
||||
- [支持敏感词数据的动态更新(用户自定义),实时生效](https://github.com/houbb/sensitive-word#%E5%8A%A8%E6%80%81%E5%8A%A0%E8%BD%BD%E7%94%A8%E6%88%B7%E8%87%AA%E5%AE%9A%E4%B9%89)
|
||||
|
||||
- [支持敏感词的标签接口](https://github.com/houbb/sensitive-word#%E6%95%8F%E6%84%9F%E8%AF%8D%E6%A0%87%E7%AD%BE)
|
||||
|
||||
## 变更日志
|
||||
|
||||
[CHANGE_LOG.md](https://github.com/houbb/sensitive-word/blob/master/CHANGE_LOG.md)
|
||||
@@ -58,7 +60,7 @@
|
||||
<dependency>
|
||||
<groupId>com.github.houbb</groupId>
|
||||
<artifactId>sensitive-word</artifactId>
|
||||
<version>0.9.0</version>
|
||||
<version>0.10.0</version>
|
||||
</dependency>
|
||||
```
|
||||
|
||||
@@ -66,16 +68,17 @@
|
||||
|
||||
`SensitiveWordHelper` 作为敏感词的工具类,核心方法如下:
|
||||
|
||||
| 方法 | 参数 | 返回值| 说明 |
|
||||
|:---|:---|:---|:---|
|
||||
| contains(String) | 待验证的字符串 | 布尔值 | 验证字符串是否包含敏感词 |
|
||||
| replace(String, ISensitiveWordReplace) | 使用指定的替换策略替换敏感词 | 字符串 | 返回脱敏后的字符串 |
|
||||
| replace(String, char) | 使用指定的 char 替换敏感词 | 字符串 | 返回脱敏后的字符串 |
|
||||
| replace(String) | 使用 `*` 替换敏感词 | 字符串 | 返回脱敏后的字符串 |
|
||||
| findAll(String) | 待验证的字符串 | 字符串列表 | 返回字符串中所有敏感词 |
|
||||
| findFirst(String) | 待验证的字符串 | 字符串 | 返回字符串中第一个敏感词 |
|
||||
| findAll(String, IWordResultHandler) | IWordResultHandler 结果处理类 | 字符串列表 | 返回字符串中所有敏感词 |
|
||||
| findFirst(String, IWordResultHandler) | IWordResultHandler 结果处理类 | 字符串 | 返回字符串中第一个敏感词 |
|
||||
| 方法 | 参数 | 返回值 | 说明 |
|
||||
|:---------------------------------------|:-------------------------|:-------|:-------------|
|
||||
| contains(String) | 待验证的字符串 | 布尔值 | 验证字符串是否包含敏感词 |
|
||||
| replace(String, ISensitiveWordReplace) | 使用指定的替换策略替换敏感词 | 字符串 | 返回脱敏后的字符串 |
|
||||
| replace(String, char) | 使用指定的 char 替换敏感词 | 字符串 | 返回脱敏后的字符串 |
|
||||
| replace(String) | 使用 `*` 替换敏感词 | 字符串 | 返回脱敏后的字符串 |
|
||||
| findAll(String) | 待验证的字符串 | 字符串列表 | 返回字符串中所有敏感词 |
|
||||
| findFirst(String) | 待验证的字符串 | 字符串 | 返回字符串中第一个敏感词 |
|
||||
| findAll(String, IWordResultHandler) | IWordResultHandler 结果处理类 | 字符串列表 | 返回字符串中所有敏感词 |
|
||||
| findFirst(String, IWordResultHandler) | IWordResultHandler 结果处理类 | 字符串 | 返回字符串中第一个敏感词 |
|
||||
| tags(String)                           | 敏感词字符串                   | 字符串集合  | 返回敏感词的标签列表   |
|
||||
|
||||
## IWordResultHandler 结果处理类
|
||||
|
||||
@@ -387,6 +390,70 @@ Assert.assertTrue(wordBs.contains(text));
|
||||
| 10 | enableWordCheck | 是否启用敏感单词检测 | true |
|
||||
| 11 | numCheckLen | 数字检测,自定义指定长度。 | 8 |
|
||||
|
||||
# 敏感词标签
|
||||
|
||||
## 说明
|
||||
|
||||
有时候我们希望对敏感词加一个分类标签:比如社情、暴/力等等。
|
||||
|
||||
这样后续可以按照标签等进行更多特性操作,比如只处理某一类的标签。
|
||||
|
||||
支持版本:v0.10.0
|
||||
|
||||
## 入门例子
|
||||
|
||||
### 接口
|
||||
|
||||
这里只是一个抽象的接口,用户可以自行定义实现。比如从数据库查询等。
|
||||
|
||||
```java
|
||||
public interface IWordTag {
|
||||
|
||||
/**
|
||||
* 查询标签列表
|
||||
* @param word 脏词
|
||||
* @return 结果
|
||||
*/
|
||||
Set<String> getTag(String word);
|
||||
|
||||
}
|
||||
```
|
||||
|
||||
### 配置文件
|
||||
|
||||
我们可以自定义 dict 标签文件,通过 WordTags.file() 创建一个 WordTag 实现。
|
||||
|
||||
- dict_tag_test.txt
|
||||
|
||||
```
|
||||
五星红旗 政治,国家
|
||||
```
|
||||
|
||||
格式如下:
|
||||
|
||||
```
|
||||
敏感词 tag1,tag2
|
||||
```
|
||||
|
||||
### 实现
|
||||
|
||||
具体的效果如下,在引导类设置一下即可。
|
||||
|
||||
默认的 wordTag 是空的。
|
||||
|
||||
```java
|
||||
String filePath = "dict_tag_test.txt";
|
||||
IWordTag wordTag = WordTags.file(filePath);
|
||||
|
||||
SensitiveWordBs sensitiveWordBs = SensitiveWordBs.newInstance()
|
||||
.wordTag(wordTag)
|
||||
.init();
|
||||
|
||||
Assert.assertEquals("[政治, 国家]", sensitiveWordBs.tags("五星红旗").toString());
|
||||
```
|
||||
|
||||
后续会考虑引入一个内置的标签文件策略。
|
||||
|
||||
# 动态加载(用户自定义)
|
||||
|
||||
## 情景说明
|
||||
@@ -667,9 +734,11 @@ ps: 不同环境会有差异,但是比例基本稳定。
|
||||
|
||||
remove、add、edit?
|
||||
|
||||
- [ ] 敏感词标签支持 + 分级支持
|
||||
- [x] 敏感词标签接口支持
|
||||
|
||||
比较耗时间。
|
||||
- [ ] 敏感词处理时标签支持
|
||||
|
||||
TODO: 比较耗时间。
|
||||
|
||||
- [x] wordData 的内存占用对比 + 优化
|
||||
|
||||
|
||||
2
pom.xml
2
pom.xml
@@ -6,7 +6,7 @@
|
||||
|
||||
<groupId>com.github.houbb</groupId>
|
||||
<artifactId>sensitive-word</artifactId>
|
||||
<version>0.9.0</version>
|
||||
<version>0.10.0</version>
|
||||
|
||||
<properties>
|
||||
<!--============================== All Plugins START ==============================-->
|
||||
|
||||
@@ -10,9 +10,9 @@ ECHO "============================= RELEASE START..."
|
||||
|
||||
:: 版本号信息(需要手动指定)
|
||||
:::: 旧版本名称
|
||||
SET version=0.9.0
|
||||
SET version=0.10.0
|
||||
:::: 新版本名称
|
||||
SET newVersion=0.10.0
|
||||
SET newVersion=0.11.0
|
||||
:::: 组织名称
|
||||
SET groupName=com.github.houbb
|
||||
:::: 项目名称
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
package com.github.houbb.sensitive.word.api;
|
||||
|
||||
import com.github.houbb.sensitive.word.bs.SensitiveWordContext;
|
||||
|
||||
/**
|
||||
* @author binbin.hou
|
||||
* @since 0.0.4
|
||||
@@ -231,4 +233,8 @@ public interface IWordContext {
|
||||
*/
|
||||
IWordContext wordData(IWordData wordMap);
|
||||
|
||||
IWordTag wordTag();
|
||||
|
||||
SensitiveWordContext wordTag(IWordTag wordTag);
|
||||
|
||||
}
|
||||
|
||||
@@ -0,0 +1,20 @@
|
||||
package com.github.houbb.sensitive.word.api;
|
||||
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* 获取脏词的标签,便于分类
|
||||
*
|
||||
* @author dh
|
||||
* @since 0.10.0
|
||||
*/
|
||||
public interface IWordTag {

    /**
     * Looks up the list of tags for a sensitive word.
     *
     * @param word the sensitive word
     * @return the tags found for the word
     */
    Set<String> getTag(String word);

}
|
||||
@@ -2,6 +2,7 @@ package com.github.houbb.sensitive.word.bs;
|
||||
|
||||
import com.github.houbb.heaven.support.handler.IHandler;
|
||||
import com.github.houbb.heaven.util.common.ArgUtil;
|
||||
import com.github.houbb.heaven.util.lang.StringUtil;
|
||||
import com.github.houbb.heaven.util.util.CollectionUtil;
|
||||
import com.github.houbb.sensitive.word.api.*;
|
||||
import com.github.houbb.sensitive.word.api.combine.IWordAllowDenyCombine;
|
||||
@@ -16,9 +17,12 @@ import com.github.houbb.sensitive.word.support.data.WordDatas;
|
||||
import com.github.houbb.sensitive.word.support.deny.WordDenys;
|
||||
import com.github.houbb.sensitive.word.support.replace.WordReplaces;
|
||||
import com.github.houbb.sensitive.word.support.result.WordResultHandlers;
|
||||
import com.github.houbb.sensitive.word.support.tag.WordTags;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* 敏感词引导类
|
||||
@@ -146,6 +150,12 @@ public class SensitiveWordBs {
|
||||
*/
|
||||
private IWordAllowDenyCombine wordAllowDenyCombine = WordAllowDenyCombines.defaults();
|
||||
|
||||
/**
|
||||
* 单词标签
|
||||
* @since 0.10.0
|
||||
*/
|
||||
private IWordTag wordTag = WordTags.none();
|
||||
|
||||
/**
|
||||
* 新建验证实例
|
||||
* <p>
|
||||
@@ -214,10 +224,18 @@ public class SensitiveWordBs {
|
||||
context.sensitiveCheckNumLen(numCheckLen);
|
||||
context.wordReplace(wordReplace);
|
||||
context.wordData(wordData);
|
||||
context.wordTag(wordTag);
|
||||
|
||||
return context;
|
||||
}
|
||||
|
||||
public SensitiveWordBs wordTag(IWordTag wordTag) {
|
||||
ArgUtil.notNull(wordTag, "wordTag");
|
||||
|
||||
this.wordTag = wordTag;
|
||||
return this;
|
||||
}
|
||||
|
||||
public SensitiveWordBs wordCheckCombine(IWordCheckCombine wordCheckCombine) {
|
||||
ArgUtil.notNull(wordCheckCombine, "wordCheckCombine");
|
||||
|
||||
@@ -509,6 +527,22 @@ public class SensitiveWordBs {
|
||||
return sensitiveWord.replace(target, context);
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取敏感词的标签
|
||||
*
|
||||
* @param word 敏感词
|
||||
* @return 结果
|
||||
* @since 0.10.0
|
||||
*/
|
||||
public Set<String> tags(final String word) {
|
||||
if(StringUtil.isEmpty(word)) {
|
||||
return Collections.emptySet();
|
||||
}
|
||||
|
||||
// 是否需要格式化?
|
||||
return wordTag.getTag(word);
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------------ 公开方法 END
|
||||
|
||||
}
|
||||
|
||||
@@ -100,6 +100,13 @@ public class SensitiveWordContext implements IWordContext {
|
||||
*/
|
||||
private IWordData wordData;
|
||||
|
||||
/**
|
||||
* 单词标签
|
||||
*
|
||||
* @since 0.10.0
|
||||
*/
|
||||
private IWordTag wordTag;
|
||||
|
||||
public IWordData wordData() {
|
||||
return wordData;
|
||||
}
|
||||
@@ -273,4 +280,14 @@ public class SensitiveWordContext implements IWordContext {
|
||||
this.wordFormat = wordFormat;
|
||||
return this;
|
||||
}
|
||||
|
||||
public IWordTag wordTag() {
|
||||
return wordTag;
|
||||
}
|
||||
|
||||
public SensitiveWordContext wordTag(IWordTag wordTag) {
|
||||
this.wordTag = wordTag;
|
||||
return this;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -0,0 +1,33 @@
|
||||
package com.github.houbb.sensitive.word.support.tag;
|
||||
|
||||
import com.github.houbb.heaven.util.lang.StringUtil;
|
||||
import com.github.houbb.sensitive.word.api.IWordTag;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* 抽象的单词标签
|
||||
*
|
||||
* @since 0.10.0
|
||||
*/
|
||||
public abstract class AbstractWordTag implements IWordTag {
|
||||
|
||||
|
||||
/**
|
||||
* 获取标签
|
||||
* @param word 单词
|
||||
* @return 结果
|
||||
*/
|
||||
protected abstract Set<String> doGetTag(String word);
|
||||
|
||||
@Override
|
||||
public Set<String> getTag(String word) {
|
||||
if(StringUtil.isEmpty(word)) {
|
||||
return Collections.emptySet();
|
||||
}
|
||||
|
||||
return doGetTag(word);
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,89 @@
|
||||
package com.github.houbb.sensitive.word.support.tag;
|
||||
|
||||
import com.github.houbb.heaven.util.common.ArgUtil;
|
||||
import com.github.houbb.heaven.util.io.FileUtil;
|
||||
import com.github.houbb.heaven.util.lang.StringUtil;
|
||||
import com.github.houbb.heaven.util.util.CollectionUtil;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* 基于文件的标签
|
||||
*
|
||||
* word tag1,tag2
|
||||
* @since 0.10.0
|
||||
*/
|
||||
public class FileWordTag extends AbstractWordTag {
|
||||
|
||||
/**
|
||||
* 文件路径
|
||||
*/
|
||||
protected final String filePath;
|
||||
/**
|
||||
* 词和标签的分隔符
|
||||
*/
|
||||
protected final String wordSplit;
|
||||
/**
|
||||
* 标签的分隔符
|
||||
*/
|
||||
protected final String tagSplit;
|
||||
|
||||
protected Map<String, Set<String>> wordTagMap = new HashMap<>();
|
||||
|
||||
public FileWordTag(String filePath) {
|
||||
this(filePath, " ", ",");
|
||||
}
|
||||
|
||||
public FileWordTag(String filePath, String wordSplit, String tagSplit) {
|
||||
ArgUtil.notEmpty(filePath, "filePath");
|
||||
ArgUtil.notEmpty(wordSplit, "wordSplit");
|
||||
ArgUtil.notEmpty(tagSplit, "tagSplit");
|
||||
|
||||
this.wordSplit = wordSplit;
|
||||
this.tagSplit = tagSplit;
|
||||
this.filePath = filePath;
|
||||
|
||||
this.initWordTagMap();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* 初始化
|
||||
*/
|
||||
protected synchronized void initWordTagMap() {
|
||||
List<String> lines = FileUtil.readAllLines(filePath);
|
||||
if(CollectionUtil.isEmpty(lines)) {
|
||||
return;
|
||||
}
|
||||
|
||||
for(String line : lines) {
|
||||
if(StringUtil.isEmpty(line)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// 处理每一行
|
||||
handleInitLine(line);
|
||||
}
|
||||
}
|
||||
|
||||
protected synchronized void handleInitLine(String line) {
|
||||
String[] strings = line.split(wordSplit);
|
||||
if(strings.length < 2) {
|
||||
return;
|
||||
}
|
||||
|
||||
String word = strings[0];
|
||||
String tagText = strings[1];
|
||||
|
||||
|
||||
String[] tags = tagText.split(tagSplit);
|
||||
Set<String> tagSet = new HashSet<>(Arrays.asList(tags));
|
||||
wordTagMap.put(word, tagSet);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Set<String> doGetTag(String word) {
|
||||
return wordTagMap.get(word);
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,19 @@
|
||||
package com.github.houbb.sensitive.word.support.tag;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* 空标签
|
||||
*
|
||||
* word tag1,tag2
|
||||
* @since 0.10.0
|
||||
*/
|
||||
public class NoneWordTag extends AbstractWordTag {
|
||||
|
||||
@Override
|
||||
protected Set<String> doGetTag(String word) {
|
||||
return Collections.emptySet();
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,20 @@
|
||||
package com.github.houbb.sensitive.word.support.tag;
|
||||
|
||||
import com.github.houbb.sensitive.word.api.IWordTag;
|
||||
|
||||
/**
|
||||
* 单词标签
|
||||
*
|
||||
* @since 0.10.0
|
||||
*/
|
||||
public class WordTags {
|
||||
|
||||
public static IWordTag none() {
|
||||
return new NoneWordTag();
|
||||
}
|
||||
|
||||
public static IWordTag file(String filePath) {
|
||||
return new FileWordTag(filePath);
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,29 @@
|
||||
package com.github.houbb.sensitive.word.bs;
|
||||
|
||||
import com.github.houbb.sensitive.word.api.IWordTag;
|
||||
import com.github.houbb.sensitive.word.support.tag.WordTags;
|
||||
import org.junit.Assert;
|
||||
|
||||
/**
|
||||
* <p> project: sensitive-word-SensitiveWordBsTest </p>
|
||||
* <p> create on 2020/1/7 23:43 </p>
|
||||
*
|
||||
* @author Administrator
|
||||
* @since 0.10.0
|
||||
*/
|
||||
public class SensitiveWordBsTagTest {
|
||||
|
||||
public static void main(String[] args) {
|
||||
String filePath = "D:\\code\\github\\sensitive-word\\src\\test\\resources\\dict_tag_test.txt";
|
||||
|
||||
IWordTag wordTag = WordTags.file(filePath);
|
||||
|
||||
SensitiveWordBs sensitiveWordBs = SensitiveWordBs.newInstance()
|
||||
.wordTag(wordTag)
|
||||
.init()
|
||||
;
|
||||
|
||||
Assert.assertEquals("[政治, 国家]", sensitiveWordBs.tags("五星红旗").toString());;
|
||||
}
|
||||
|
||||
}
|
||||
1
src/test/resources/dict_tag_test.txt
Normal file
1
src/test/resources/dict_tag_test.txt
Normal file
@@ -0,0 +1 @@
|
||||
五星红旗 政治,国家
|
||||
Reference in New Issue
Block a user