From b700d12aa35ca357fe367d5d99eb7eb9444c896a Mon Sep 17 00:00:00 2001
From: "binbin.hou" <1060732496@qq.com>
Date: Wed, 8 Jan 2020 10:57:24 +0800
Subject: [PATCH] [Feature] add for new
---
README.md | 4 +-
doc/CHANGE_LOG.md | 9 ++
doc/issues/关联框架.md | 8 +-
pom.xml | 2 +-
.../houbb/sensitive/word/api/IWordMap.java | 12 ++
.../sensitive/word/bs/SensitiveWordBs.java | 35 +++--
.../sensitive/word/constant/AppConst.java | 11 +-
.../word/model/CheckSensitiveWordResult.java | 69 +++++++++
.../word/support/data/SensitiveWordData.java | 9 +-
.../word/support/map/SensitiveWordMap.java | 91 ++++++++---
.../sensitive/word/util/CharsetUtils.java | 140 ++++++++---------
.../sensitive/word/util/StreamUtils.java | 144 +++++++++---------
.../word/bs/SensitiveWordBsTest.java | 38 ++++-
.../sensitive/word/data/DataInitTest.java | 5 +-
.../sensitive/word/util/StreamUtilsTest.java | 22 ---
15 files changed, 392 insertions(+), 207 deletions(-)
create mode 100644 src/main/java/com/github/houbb/sensitive/word/model/CheckSensitiveWordResult.java
delete mode 100644 src/test/java/com/github/houbb/sensitive/word/util/StreamUtilsTest.java
diff --git a/README.md b/README.md
index cc3c767..d435551 100644
--- a/README.md
+++ b/README.md
@@ -40,7 +40,7 @@
com.github.houbb
sensitive-word
- 0.0.1
+ 0.0.2
```
@@ -51,7 +51,7 @@
### 判断是否包含敏感词
```java
-final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。。";
+final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。";
Assert.assertTrue(SensitiveWordBs.getInstance().contains(text));
```
diff --git a/doc/CHANGE_LOG.md b/doc/CHANGE_LOG.md
index 565693a..0bd9ef4 100644
--- a/doc/CHANGE_LOG.md
+++ b/doc/CHANGE_LOG.md
@@ -14,3 +14,12 @@
| 序号 | 变更类型 | 说明 | 时间 | 备注 |
|:---|:---|:---|:---|:--|
| 1 | A | 基本功能的实现 | 2020-1-7 21:46:32 | |
+
+# release_0.0.2
+
+| 序号 | 变更类型 | 说明 | 时间 | 备注 |
+|:---|:---|:---|:---|:--|
+| 1 | O | 优化最大长度匹配模式 | 2020-1-8 09:34:35 | |
+| 2 | A | 新增替换实现 | 2020-1-8 09:34:35 | 性能优于各种博客的直接正则替换。|
+| 3 | O | 优化公共代码到 heaven 项目 | 2020-1-8 09:34:35 | 便于后期统一维护整理。|
+| 4 | O | 初步优化 DFA 对应 map 的大小 | 2020-1-8 09:34:35 | |
\ No newline at end of file
diff --git a/doc/issues/关联框架.md b/doc/issues/关联框架.md
index 2389f0d..3fa6900 100644
--- a/doc/issues/关联框架.md
+++ b/doc/issues/关联框架.md
@@ -8,4 +8,10 @@
中文英文转换
-手写 Regex
\ No newline at end of file
+手写 Regex
+
+## 核心原理
+
+DFA 算法
+
+根据有穷状态机去处理。
\ No newline at end of file
diff --git a/pom.xml b/pom.xml
index 0324728..41daf20 100644
--- a/pom.xml
+++ b/pom.xml
@@ -25,7 +25,7 @@
1.7
- 0.1.66
+ 0.1.67-SNAPSHOT
4.12
diff --git a/src/main/java/com/github/houbb/sensitive/word/api/IWordMap.java b/src/main/java/com/github/houbb/sensitive/word/api/IWordMap.java
index e004b61..39a6849 100644
--- a/src/main/java/com/github/houbb/sensitive/word/api/IWordMap.java
+++ b/src/main/java/com/github/houbb/sensitive/word/api/IWordMap.java
@@ -47,4 +47,16 @@ public interface IWordMap {
*/
String findFirst(final String string);
+ /**
+ * 替换所有敏感词内容
+ *
+ * ps: 这里可以添加优化。
+ *
+ * @param target 目标字符串
+ * @param replaceChar 替换为的 char
+ * @return 替换后结果
+ * @since 0.0.2
+ */
+ String replace(final String target, final char replaceChar);
+
}
diff --git a/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java b/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java
index 1cba7a1..95fec06 100644
--- a/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java
+++ b/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java
@@ -1,9 +1,7 @@
package com.github.houbb.sensitive.word.bs;
+import com.github.houbb.heaven.constant.CharConst;
import com.github.houbb.heaven.support.instance.impl.Instances;
-import com.github.houbb.heaven.util.guava.Guavas;
-import com.github.houbb.heaven.util.lang.StringUtil;
-import com.github.houbb.heaven.util.util.CollectionUtil;
import com.github.houbb.sensitive.word.api.IWordData;
import com.github.houbb.sensitive.word.api.IWordMap;
import com.github.houbb.sensitive.word.support.data.SensitiveWordData;
@@ -59,16 +57,6 @@ public class SensitiveWordBs {
return INSTANCE;
}
- /**
- * 是否合法
- * @param target 目标字符串
- * @return 是否
- * @since 0.0.1
- */
- public boolean valid(final String target) {
- return !contains(target);
- }
-
/**
* 是否包含敏感词
* @param target 目标字符串
@@ -102,4 +90,25 @@ public class SensitiveWordBs {
return this.sensitiveWordMap.findFirst(target);
}
+ /**
+ * 替换所有内容
+ * @param target 目标字符串
+ * @param replaceChar 替换为的 char
+ * @return 替换后结果
+ * @since 0.0.2
+ */
+ public String replace(final String target, final char replaceChar) {
+ return this.sensitiveWordMap.replace(target, replaceChar);
+ }
+
+ /**
+ * 替换所有内容
+ * @param target 目标字符串
+ * @return 替换后结果
+ * @since 0.0.2
+ */
+ public String replace(final String target) {
+ return this.replace(target, CharConst.STAR);
+ }
+
}
diff --git a/src/main/java/com/github/houbb/sensitive/word/constant/AppConst.java b/src/main/java/com/github/houbb/sensitive/word/constant/AppConst.java
index 8d4d059..f041231 100644
--- a/src/main/java/com/github/houbb/sensitive/word/constant/AppConst.java
+++ b/src/main/java/com/github/houbb/sensitive/word/constant/AppConst.java
@@ -9,11 +9,20 @@ package com.github.houbb.sensitive.word.constant;
*/
public final class AppConst {
+ private AppConst(){}
+
/**
* 是否为结束标识
* ps: 某种角度而言,我不是很喜欢这种风格。
+ * (1)正常的 char 只会占用一个字符,这里直接给定两个字符即可,降低 Map 的容量。
* @since 0.0.1
*/
- public static final String IS_END = "isEnd";
+ public static final String IS_END = "ED";
+
+ /**
+ * 字典的大小
+ * @since 0.0.2
+ */
+ public static final int DICT_SIZE = 183836;
}
diff --git a/src/main/java/com/github/houbb/sensitive/word/model/CheckSensitiveWordResult.java b/src/main/java/com/github/houbb/sensitive/word/model/CheckSensitiveWordResult.java
new file mode 100644
index 0000000..9c4d030
--- /dev/null
+++ b/src/main/java/com/github/houbb/sensitive/word/model/CheckSensitiveWordResult.java
@@ -0,0 +1,69 @@
+package com.github.houbb.sensitive.word.model;
+
+/**
+ * 检测敏感词结果
+ *
+ * TODO: 这里需要结合 KMP 和 暴力匹配算法。
+ *
+ * 暂时不使用,后期会使用到。
+ * @author binbin.hou
+ * @since 0.0.2
+ */
+@Deprecated
+public class CheckSensitiveWordResult {
+
+ /**
+ * 是否匹配到了敏感词
+ * @since 0.0.2
+ */
+ private boolean hasMatch;
+
+ /**
+ * 敏感词长度
+ * @since 0.0.2
+ */
+ private int sensitiveWordSize;
+
+ /**
+ * 普通单词的长度
+ * @since 0.0.2
+ */
+ private int commonWordSize;
+
+ public boolean hasMatch() {
+ return hasMatch;
+ }
+
+ public CheckSensitiveWordResult hasMatch(boolean hasMatch) {
+ this.hasMatch = hasMatch;
+ return this;
+ }
+
+ public int sentiveWordSize() {
+ return sensitiveWordSize;
+ }
+
+ public CheckSensitiveWordResult sentiveWordSize(int sentiveWordSize) {
+ this.sensitiveWordSize = sentiveWordSize;
+ return this;
+ }
+
+ public int commonWordSize() {
+ return commonWordSize;
+ }
+
+ public CheckSensitiveWordResult commonWordSize(int commonWordSize) {
+ this.commonWordSize = commonWordSize;
+ return this;
+ }
+
+ @Override
+ public String toString() {
+ return "CheckSensitiveWordResult{" +
+ "hasMatch=" + hasMatch +
+ ", sensitiveWordSize=" + sensitiveWordSize +
+ ", commonWordSize=" + commonWordSize +
+ '}';
+ }
+
+}
diff --git a/src/main/java/com/github/houbb/sensitive/word/support/data/SensitiveWordData.java b/src/main/java/com/github/houbb/sensitive/word/support/data/SensitiveWordData.java
index bae6890..b35e1b7 100644
--- a/src/main/java/com/github/houbb/sensitive/word/support/data/SensitiveWordData.java
+++ b/src/main/java/com/github/houbb/sensitive/word/support/data/SensitiveWordData.java
@@ -1,10 +1,11 @@
package com.github.houbb.sensitive.word.support.data;
import com.github.houbb.heaven.annotation.ThreadSafe;
+import com.github.houbb.heaven.util.guava.Guavas;
+import com.github.houbb.heaven.util.io.StreamUtil;
import com.github.houbb.sensitive.word.api.IWordData;
-import com.github.houbb.sensitive.word.util.StreamUtils;
+import com.github.houbb.sensitive.word.constant.AppConst;
-import java.util.ArrayList;
import java.util.List;
/**
@@ -26,8 +27,8 @@ public class SensitiveWordData implements IWordData {
static {
synchronized (SensitiveWordData.class) {
long start = System.currentTimeMillis();
- defaultLines = new ArrayList<>(183836);
- defaultLines = StreamUtils.readAllLines("/dict.txt");
+ defaultLines = Guavas.newArrayList(AppConst.DICT_SIZE);
+ defaultLines = StreamUtil.readAllLines("/dict.txt");
long end = System.currentTimeMillis();
System.out.println("Sensitive data loaded!, cost time: " + (end - start) + " ms");
}
diff --git a/src/main/java/com/github/houbb/sensitive/word/support/map/SensitiveWordMap.java b/src/main/java/com/github/houbb/sensitive/word/support/map/SensitiveWordMap.java
index b440f90..870acb0 100644
--- a/src/main/java/com/github/houbb/sensitive/word/support/map/SensitiveWordMap.java
+++ b/src/main/java/com/github/houbb/sensitive/word/support/map/SensitiveWordMap.java
@@ -2,6 +2,7 @@ package com.github.houbb.sensitive.word.support.map;
import com.github.houbb.heaven.annotation.ThreadSafe;
import com.github.houbb.heaven.util.guava.Guavas;
+import com.github.houbb.heaven.util.lang.CharUtil;
import com.github.houbb.heaven.util.lang.ObjectUtil;
import com.github.houbb.heaven.util.lang.StringUtil;
import com.github.houbb.heaven.util.util.CollectionUtil;
@@ -95,7 +96,7 @@ public class SensitiveWordMap implements IWordMap {
}
long endTime = System.currentTimeMillis();
- System.out.println("Init sensitive word map end! Cost time " + (endTime-startTime) + "ms");
+ System.out.println("Init sensitive word map end! Cost time " + (endTime - startTime) + "ms");
}
/**
@@ -114,7 +115,7 @@ public class SensitiveWordMap implements IWordMap {
}
for (int i = 0; i < string.length(); i++) {
- int checkResult = checkSensitiveWord(string, i);
+ int checkResult = checkSensitiveWord(string, i, ValidModeEnum.FAIL_FAST);
// 快速返回
if (checkResult > 0) {
return true;
@@ -148,6 +149,15 @@ public class SensitiveWordMap implements IWordMap {
return stringList.get(0);
}
+ @Override
+ public String replace(String target, char replaceChar) {
+ if(StringUtil.isEmpty(target)) {
+ return target;
+ }
+
+ return this.replaceSensitiveWord(target, ValidModeEnum.FAIL_OVER, replaceChar);
+ }
+
/**
* 获取敏感词列表
*
@@ -164,15 +174,15 @@ public class SensitiveWordMap implements IWordMap {
List resultList = Guavas.newArrayList();
for (int i = 0; i < text.length(); i++) {
- int wordLength = checkSensitiveWord(text, i);
+ int wordLength = checkSensitiveWord(text, i, ValidModeEnum.FAIL_OVER);
// 命中
if (wordLength > 0) {
// 保存敏感词
- String sensitiveWord = text.substring(i, i+wordLength);
+ String sensitiveWord = text.substring(i, i + wordLength);
// 添加去重
- if(!resultList.contains(sensitiveWord)) {
+ if (!resultList.contains(sensitiveWord)) {
resultList.add(sensitiveWord);
}
@@ -183,6 +193,7 @@ public class SensitiveWordMap implements IWordMap {
// 增加 i 的步长
// 为什么要-1,因为默认就会自增1
+ // TODO: 这里可以根据字符串匹配算法优化。
i += wordLength - 1;
}
}
@@ -196,17 +207,24 @@ public class SensitiveWordMap implements IWordMap {
* (1)如果未命中敏感词,直接返回 0
* (2)命中敏感词,则返回敏感词的长度。
*
- * @param txt 文本信息
- * @param beginIndex 开始下标
+ * ps: 这里对结果进行优化,
+ * 1. 是否包含敏感词。
+ * 2. 敏感词的长度
+ * 3. 正常走过字符的长度(便于后期替换优化,避免不必要的循环重复)
+ *
+ * @param txt 文本信息
+ * @param beginIndex 开始下标
+ * @param validModeEnum 验证模式
* @return 敏感词对应的长度
* @since 0.0.1
*/
- private int checkSensitiveWord(String txt, int beginIndex) {
+ private int checkSensitiveWord(final String txt, final int beginIndex,
+ final ValidModeEnum validModeEnum) {
Map nowMap = sensitiveWordMap;
- boolean flag = false;
// 记录敏感词的长度
- int sensitiveWordLength = 0;
+ int lengthCount = 0;
+ int actualLength = 0;
for (int i = beginIndex; i < txt.length(); i++) {
char charKey = txt.charAt(i);
@@ -214,15 +232,20 @@ public class SensitiveWordMap implements IWordMap {
// 并且将 nowMap 替换为新的 map,进入下一层的循环。
nowMap = (Map) nowMap.get(charKey);
if (ObjectUtil.isNotNull(nowMap)) {
- sensitiveWordLength++;
+ lengthCount++;
// 判断是否是敏感词的结尾字,如果是结尾字则判断是否继续检测
boolean isEnd = (boolean) nowMap.get(AppConst.IS_END);
if (isEnd) {
- flag = true;
+ // 只在匹配到结束的时候才记录长度,避免不完全匹配导致的问题。
+ // eg: 敏感词 敏感词xxx
+ // 如果是 【敏感词x】也会被匹配。
+ actualLength = lengthCount;
- // 这里直接默认 fail-fast 即可。
- break;
+ // 这里确实需要一种验证模式,主要是为了最大匹配从而达到最佳匹配的效果。
+ if (ValidModeEnum.FAIL_FAST.equals(validModeEnum)) {
+ break;
+ }
}
} else {
// 直接跳出循环
@@ -230,10 +253,44 @@ public class SensitiveWordMap implements IWordMap {
}
}
- if (!flag) {
- sensitiveWordLength = 0;
+ return actualLength;
+ }
+
+ /**
+ * 直接替换敏感词,返回替换后的结果
+ * @param target 文本信息
+ * @param validModeEnum 验证模式
+ * @return 脱敏后的字符串
+ * @since 0.0.2
+ */
+ private String replaceSensitiveWord(final String target,
+ final ValidModeEnum validModeEnum,
+ final char replaceChar) {
+ if(StringUtil.isEmpty(target)) {
+ return target;
}
- return sensitiveWordLength;
+ // 用于结果构建
+ StringBuilder resultBuilder = new StringBuilder();
+
+ for (int i = 0; i < target.length(); i++) {
+ char currentChar = target.charAt(i);
+ // 内层直接从 i 开始往后遍历,获取第一个匹配的敏感词
+ int wordLength = checkSensitiveWord(target, i, validModeEnum);
+
+ // 敏感词
+ if(wordLength > 0) {
+ String replaceStr = CharUtil.repeat(replaceChar, wordLength);
+ resultBuilder.append(replaceStr);
+
+ // 直接跳过敏感词的长度
+ i += wordLength-1;
+ } else {
+ // 普通词
+ resultBuilder.append(currentChar);
+ }
+ }
+
+ return resultBuilder.toString();
}
}
diff --git a/src/main/java/com/github/houbb/sensitive/word/util/CharsetUtils.java b/src/main/java/com/github/houbb/sensitive/word/util/CharsetUtils.java
index e3d0fc3..20ad14f 100644
--- a/src/main/java/com/github/houbb/sensitive/word/util/CharsetUtils.java
+++ b/src/main/java/com/github/houbb/sensitive/word/util/CharsetUtils.java
@@ -1,70 +1,70 @@
-package com.github.houbb.sensitive.word.util;
-
-import com.github.houbb.heaven.annotation.CommonEager;
-import com.github.houbb.heaven.util.lang.StringUtil;
-
-/**
- * @author binbin.hou
- * @since 0.0.1
- */
-@CommonEager
-public class CharsetUtils {
-
- /**
- * 是否为中文字符
- * @param c char
- * @return 是否
- * @since 0.0.1
- */
- public static boolean isChinese(char c) {
- boolean result = false;
- // 汉字范围 \u4e00-\u9fa5 (中文)
- if (c >= 19968 && c <= 171941) {
- result = true;
- }
- return result;
- }
-
- /**
- * 是否包含中文
- * @param string 字符串
- * @return 是否
- * @since 0.0.1
- */
- public static boolean isContainsChinese(String string) {
- if(StringUtil.isEmpty(string)) {
- return false;
- }
-
- char[] chars = string.toCharArray();
- for(char c : chars) {
- if(isChinese(c)) {
- return true;
- }
- }
-
- return false;
- }
-
- /**
- * 是否全是中文
- * @param string 字符串
- * @return 是否
- * @since 0.0.1
- */
- public static boolean isAllChinese(String string) {
- if(StringUtil.isEmpty(string)) {
- return false;
- }
-
- char[] chars = string.toCharArray();
- for(char c : chars) {
- if(!isChinese(c)) {
- return false;
- }
- }
-
- return true;
- }
-
-}
+//package com.github.houbb.sensitive.word.util;
+//
+//import com.github.houbb.heaven.annotation.CommonEager;
+//import com.github.houbb.heaven.util.lang.StringUtil;
+//
+///**
+// * @author binbin.hou
+// * @since 0.0.1
+// */
+//@CommonEager
+//public class CharsetUtils {
+//
+// /**
+// * 是否为中文字符
+// * @param c char
+// * @return 是否
+// * @since 0.0.1
+// */
+// public static boolean isChinese(char c) {
+// boolean result = false;
+// // 汉字范围 \u4e00-\u9fa5 (中文)
+// if (c >= 19968 && c <= 171941) {
+// result = true;
+// }
+// return result;
+// }
+//
+// /**
+// * 是否包含中文
+// * @param string 字符串
+// * @return 是否
+// * @since 0.0.1
+// */
+// public static boolean isContainsChinese(String string) {
+// if(StringUtil.isEmpty(string)) {
+// return false;
+// }
+//
+// char[] chars = string.toCharArray();
+// for(char c : chars) {
+// if(isChinese(c)) {
+// return true;
+// }
+// }
+//
+// return false;
+// }
+//
+// /**
+// * 是否全是中文
+// * @param string 字符串
+// * @return 是否
+// * @since 0.0.1
+// */
+// public static boolean isAllChinese(String string) {
+// if(StringUtil.isEmpty(string)) {
+// return false;
+// }
+//
+// char[] chars = string.toCharArray();
+// for(char c : chars) {
+// if(!isChinese(c)) {
+// return false;
+// }
+// }
+//
+// return true;
+// }
+//
+//}
diff --git a/src/main/java/com/github/houbb/sensitive/word/util/StreamUtils.java b/src/main/java/com/github/houbb/sensitive/word/util/StreamUtils.java
index dbb1014..2d3b624 100644
--- a/src/main/java/com/github/houbb/sensitive/word/util/StreamUtils.java
+++ b/src/main/java/com/github/houbb/sensitive/word/util/StreamUtils.java
@@ -1,72 +1,72 @@
-package com.github.houbb.sensitive.word.util;
-
-import com.github.houbb.heaven.annotation.CommonEager;
-import com.github.houbb.heaven.constant.CharsetConst;
-import com.github.houbb.heaven.util.lang.StringUtil;
-import com.github.houbb.sensitive.word.exception.SensitiveWordException;
-
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.nio.charset.Charset;
-import java.util.*;
-
-/**
- * 流工具类
- * @author binbin.hou
- * @since 0.0.1
- */
-@CommonEager
-public final class StreamUtils {
-
- private StreamUtils(){}
-
- /**
- * 构建数据集合
- *
- * 后期考虑:是否允许用户自定义字典?
- * 目前不支持这些操作。后期如果需要,再把这些限制放开。
- * @param path 文件路径
- * @return 返回数据集合
- * @since 0.0.1
- */
- public static List readAllLines(final String path) {
- return readAllLines(path, CharsetConst.UTF8, true);
- }
-
- /**
- * 构建数据集合
- *
- * 后期考虑:是否允许用户自定义字典?
- * 目前不支持这些操作。后期如果需要,再把这些限制放开。
- * @param path 文件路径
- * @param charset 文件编码
- * @param ignoreEmpty 是否忽略空白行
- * @return 返回数据集合
- * @since 0.0.1
- */
- public static List readAllLines(final String path,
- final String charset,
- final boolean ignoreEmpty) {
- try {
- List lines = new ArrayList<>();
- InputStream is = StreamUtils.class.getResourceAsStream(path);
- BufferedReader e = new BufferedReader(new InputStreamReader(is,
- Charset.forName(charset)));
-
- while (e.ready()) {
- String entry = e.readLine();
- if (StringUtil.isEmpty(entry)
- && ignoreEmpty) {
- continue;
- }
- lines.add(entry);
- }
- return lines;
- } catch (IOException e) {
- throw new SensitiveWordException("dict init failed!", e);
- }
- }
-
-}
+//package com.github.houbb.sensitive.word.util;
+//
+//import com.github.houbb.heaven.annotation.CommonEager;
+//import com.github.houbb.heaven.constant.CharsetConst;
+//import com.github.houbb.heaven.util.lang.StringUtil;
+//import com.github.houbb.sensitive.word.exception.SensitiveWordException;
+//
+//import java.io.BufferedReader;
+//import java.io.IOException;
+//import java.io.InputStream;
+//import java.io.InputStreamReader;
+//import java.nio.charset.Charset;
+//import java.util.*;
+//
+///**
+// * 流工具类
+// * @author binbin.hou
+// * @since 0.0.1
+// */
+//@CommonEager
+//public final class StreamUtils {
+//
+// private StreamUtils(){}
+//
+// /**
+// * 构建数据集合
+// *
+// * 后期考虑:是否允许用户自定义字典?
+// * 目前不支持这些操作。后期如果需要,再把这些限制放开。
+// * @param path 文件路径
+// * @return 返回数据集合
+// * @since 0.0.1
+// */
+// public static List readAllLines(final String path) {
+// return readAllLines(path, CharsetConst.UTF8, true);
+// }
+//
+// /**
+// * 构建数据集合
+// *
+// * 后期考虑:是否允许用户自定义字典?
+// * 目前不支持这些操作。后期如果需要,再把这些限制放开。
+// * @param path 文件路径
+// * @param charset 文件编码
+// * @param ignoreEmpty 是否忽略空白行
+// * @return 返回数据集合
+// * @since 0.0.1
+// */
+// public static List readAllLines(final String path,
+// final String charset,
+// final boolean ignoreEmpty) {
+// try {
+// List lines = new ArrayList<>();
+// InputStream is = StreamUtils.class.getResourceAsStream(path);
+// BufferedReader e = new BufferedReader(new InputStreamReader(is,
+// Charset.forName(charset)));
+//
+// while (e.ready()) {
+// String entry = e.readLine();
+// if (StringUtil.isEmpty(entry)
+// && ignoreEmpty) {
+// continue;
+// }
+// lines.add(entry);
+// }
+// return lines;
+// } catch (IOException e) {
+// throw new SensitiveWordException("dict init failed!", e);
+// }
+// }
+//
+//}
diff --git a/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsTest.java b/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsTest.java
index d48d364..bfc3ecd 100644
--- a/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsTest.java
+++ b/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsTest.java
@@ -14,13 +14,21 @@ import java.util.List;
*/
public class SensitiveWordBsTest {
+ /**
+ * 是否包含
+ * @since 0.0.1
+ */
@Test
public void containsTest() {
- final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。。";
+ final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。";
Assert.assertTrue(SensitiveWordBs.getInstance().contains(text));
}
+ /**
+ * 返回所有敏感词
+ * @since 0.0.1
+ */
@Test
public void findAllTest() {
final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。";
@@ -29,6 +37,10 @@ public class SensitiveWordBsTest {
Assert.assertEquals("[五星红旗, 毛主席, 天安门]", wordList.toString());
}
+ /**
+ * 返回所有第一个匹配的敏感词
+ * @since 0.0.1
+ */
@Test
public void findFirstTest() {
final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。";
@@ -37,4 +49,28 @@ public class SensitiveWordBsTest {
Assert.assertEquals("五星红旗", word);
}
+ /**
+ * 默认的替换策略
+ * @since 0.0.2
+ */
+ @Test
+ public void replaceTest() {
+ final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。";
+
+ String result = SensitiveWordBs.getInstance().replace(text);
+ Assert.assertEquals("****迎风飘扬,***的画像屹立在***前。", result);
+ }
+
+ /**
+ * 自定义字符的替换策略
+ * @since 0.0.2
+ */
+ @Test
+ public void replaceCharTest() {
+ final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。";
+
+ String result = SensitiveWordBs.getInstance().replace(text, '0');
+ Assert.assertEquals("0000迎风飘扬,000的画像屹立在000前。", result);
+ }
+
}
diff --git a/src/test/java/com/github/houbb/sensitive/word/data/DataInitTest.java b/src/test/java/com/github/houbb/sensitive/word/data/DataInitTest.java
index 8a76d4d..fd0ab4b 100644
--- a/src/test/java/com/github/houbb/sensitive/word/data/DataInitTest.java
+++ b/src/test/java/com/github/houbb/sensitive/word/data/DataInitTest.java
@@ -3,8 +3,8 @@ package com.github.houbb.sensitive.word.data;
import com.github.houbb.heaven.support.filter.IFilter;
import com.github.houbb.heaven.util.io.FileUtil;
import com.github.houbb.heaven.util.lang.StringUtil;
+import com.github.houbb.heaven.util.util.CharsetUtil;
import com.github.houbb.heaven.util.util.CollectionUtil;
-import com.github.houbb.sensitive.word.util.CharsetUtils;
import org.junit.Ignore;
import org.junit.Test;
@@ -38,7 +38,6 @@ public class DataInitTest {
List trimLines = CollectionUtil.distinct(CollectionUtil.trimCollection(lines));
final String target = "D:\\github\\sensitive-word\\src\\main\\resources\\dict.txt";
- FileUtil.write(target, trimLines);
}
/**
@@ -65,7 +64,7 @@ public class DataInitTest {
List resultList = CollectionUtil.distinct(CollectionUtil.filterList(lines, new IFilter() {
@Override
public boolean filter(String s) {
- return CharsetUtils.isContainsChinese(s);
+ return CharsetUtil.isContainsChinese(s);
}
}));
Collections.sort(resultList);
diff --git a/src/test/java/com/github/houbb/sensitive/word/util/StreamUtilsTest.java b/src/test/java/com/github/houbb/sensitive/word/util/StreamUtilsTest.java
deleted file mode 100644
index 1b5e8d9..0000000
--- a/src/test/java/com/github/houbb/sensitive/word/util/StreamUtilsTest.java
+++ /dev/null
@@ -1,22 +0,0 @@
-package com.github.houbb.sensitive.word.util;
-
-import org.junit.Assert;
-import org.junit.Test;
-
-import java.util.List;
-
-/**
- * @author binbin.hou
- * @since 0.0.1
- */
-public class StreamUtilsTest {
-
- @Test
- public void sizeTest() {
- final String dictPath = "/dict.txt";
-
- List stringList = StreamUtils.readAllLines(dictPath);
- Assert.assertEquals(183836, stringList.size());
- }
-
-}