release branch 0.0.7

2026-03-22 08:27:36 +08:00 · 2020-01-10 15:14:25 +08:00
parent e1af586403
commit 3c41f4e60d
11 changed files with 131 additions and 29 deletions
--- a/README.md
+++ b/README.md
@@ -54,7 +54,7 @@
 <dependency>
    <groupId>com.github.houbb</groupId>
    <artifactId>sensitive-word</artifactId>
-    <version>0.0.6</version>
+    <version>0.0.7</version>
 </dependency>
 ```

@@ -169,9 +169,16 @@ List<String> wordList = SensitiveWordBs.newInstance().findAll(text);
 Assert.assertEquals("[Ⓕⓤc⒦]", wordList.toString());
 ```

-# 后期 road-map
+## 忽略重复词

- 重复词
+```java
+final String text = "ⒻⒻⒻfⓤuⓤ⒰cⓒ⒦ the bad words";
+
+List<String> wordList = SensitiveWordBs.newInstance().findAll(text);
+Assert.assertEquals("[ⒻⒻⒻfⓤuⓤ⒰cⓒ⒦]", wordList.toString());
+```
+
+# 后期 road-map

 - 停顿词

--- a/doc/CHANGE_LOG.md
+++ b/doc/CHANGE_LOG.md
@@ -55,4 +55,10 @@
 | 1 | A | 添加中文繁简体转换支持 | 2020-1-10 09:34:35 | |
 | 2 | A | 添加英文常见写法转换支持 | 2020-1-10 09:34:35 | |
 | 3 | A | 新增敏感词 `艹` | 2020-1-10 09:34:35 | |
-| 4 | D | 移除单个词 `k买仆办功务动区卖台吨天房本歌滚灾独证踢弓` | 2020-1-10 09:34:35 | |
+| 4 | D | 移除单个词 `k买仆办功务动区卖台吨天房本歌滚灾独证踢弓` | 2020-1-10 09:34:35 | |
+
+# release_0.0.7
+
+| 序号 | 变更类型 | 说明 | 时间 | 备注 |
+|:---|:---|:---|:---|:--|
+| 1 | A | 添加忽略重复词支持 | 2020-1-10 09:34:35 | |
--- a/doc/issues/roadmap/v007-重复词的处理.md
+++ b/doc/issues/roadmap/v007-重复词的处理.md
@@ -4,4 +4,10 @@ ffffuuuuccckkk

 f xxx 

-x 如果和上一个字符一样，则直接忽略。
+x 如果和上一个字符一样，则直接忽略。
+
+# 细节
+
+当开启该功能时，如果在匹配敏感词的过程中没有找到下一个字，则进行去重。
+
+即如果当前字符和上一个字符完全一样，则直接跳过。（仅在没有匹配的场景下）
--- a/doc/issues/roadmap/v012-停顿词的处理.md
+++ b/doc/issues/roadmap/v012-停顿词的处理.md
@@ -1,3 +1,9 @@
+# 标点符号
+
+无论中文英文数字，其中特殊符号一定是停顿词。
+
+可以这么粗略地认为。
+
 # 英文

 核心是英文停顿词。
--- a/pom.xml
+++ b/pom.xml
@@ -6,7 +6,7 @@

    <groupId>com.github.houbb</groupId>
    <artifactId>sensitive-word</artifactId>
-    <version>0.0.6</version>
+    <version>0.0.7</version>

    <properties>
        <!--============================== All Plugins START ==============================-->
--- a/release.bat
+++ b/release.bat
@@ -10,9 +10,9 @@ ECHO "============================= RELEASE START..."

 :: 版本号信息(需要手动指定)
 :::: 旧版本名称
-SET version=0.0.6
+SET version=0.0.7
 :::: 新版本名称
-SET newVersion=0.0.7
+SET newVersion=0.0.8
 :::: 组织名称
 SET groupName=com.github.houbb
 :::: 项目名称
--- a/src/main/java/com/github/houbb/sensitive/word/api/IWordContext.java
+++ b/src/main/java/com/github/houbb/sensitive/word/api/IWordContext.java
@@ -29,7 +29,6 @@ public interface IWordContext {
     */
    boolean ignoreNumStyle();

-
    /**
     * 设置是否忽略大小写
     * @param ignoreCase 是否忽略大小写
@@ -64,12 +63,11 @@ public interface IWordContext {
    /**
     * 设置是否忽略中文繁简体格式
     * @param ignoreChineseStyle 是否忽略
-     * @return 是否
+     * @return this
     * @since 0.0.6
     */
    IWordContext ignoreChineseStyle(final boolean ignoreChineseStyle);

-
    /**
     * 获取敏感词信息
     * @return 敏感词
@@ -95,7 +93,7 @@ public interface IWordContext {
    /**
     * 设置敏感数字检测
     * @param sensitiveNumCheck 数字格式检测
-     * @return 数字检测
+     * @return this
     * @since 0.0.5
     */
    IWordContext sensitiveNumCheck(final boolean sensitiveNumCheck);
@@ -110,9 +108,24 @@ public interface IWordContext {
    /**
     * 设置忽略英文的写法
     * @param ignoreEnglishStyle 是否忽略
-     * @return 数字检测
+     * @return this
     * @since 0.0.6
     */
    IWordContext ignoreEnglishStyle(final boolean ignoreEnglishStyle);

+    /**
+     * Whether repeated characters are ignored.
+     * @return whether they are ignored
+     * @since 0.0.7
+     */
+    boolean ignoreRepeat();
+
+    /**
+     * Sets whether repeated characters are ignored.
+     * @param ignoreRepeat whether to ignore them
+     * @return this
+     * @since 0.0.7
+     */
+    IWordContext ignoreRepeat(final boolean ignoreRepeat);
+
 }
--- a/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java
+++ b/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java
@@ -85,6 +85,7 @@ public class SensitiveWordBs {
        wordContext.ignoreNumStyle(true);
        wordContext.ignoreChineseStyle(true);
        wordContext.ignoreEnglishStyle(true);
+        wordContext.ignoreRepeat(true);

        // 开启校验
        wordContext.sensitiveNumCheck(true);
--- a/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordContext.java
+++ b/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordContext.java
@@ -53,6 +53,12 @@ public class SensitiveWordContext implements IWordContext {
     */
    private boolean ignoreEnglishStyle;

+    /**
+     * 忽略重复词
+     * @since 0.0.7
+     */
+    private boolean ignoreRepeat;
+
    /**
     * 私有化构造器
     * @since 0.0.4
@@ -147,16 +153,14 @@ public class SensitiveWordContext implements IWordContext {
    }

    @Override
-    public String toString() {
-        return "SensitiveWordContext{" +
-                "ignoreCase=" + ignoreCase +
-                ", ignoreWidth=" + ignoreWidth +
-                ", ignoreNumStyle=" + ignoreNumStyle +
-                ", sensitiveWordMap=" + sensitiveWordMap +
-                ", sensitiveNumCheck=" + sensitiveNumCheck +
-                ", ignoreChineseStyle=" + ignoreChineseStyle +
-                ", ignoreEnglishStyle=" + ignoreEnglishStyle +
-                '}';
+    public boolean ignoreRepeat() {
+        return ignoreRepeat;
+    }
+
+    @Override
+    public SensitiveWordContext ignoreRepeat(boolean ignoreRepeat) {
+        this.ignoreRepeat = ignoreRepeat;
+        return this;
    }

 }
--- a/src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveWordCheck.java
+++ b/src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveWordCheck.java
@@ -2,7 +2,6 @@ package com.github.houbb.sensitive.word.support.check;

 import com.github.houbb.heaven.annotation.ThreadSafe;
 import com.github.houbb.heaven.support.instance.impl.Instances;
-import com.github.houbb.heaven.util.lang.CharUtil;
 import com.github.houbb.heaven.util.lang.ObjectUtil;
 import com.github.houbb.sensitive.word.api.ISensitiveCheck;
 import com.github.houbb.sensitive.word.api.IWordContext;
@@ -29,12 +28,9 @@ public class SensitiveWordCheck implements ISensitiveCheck {
        int actualLength = 0;

        for (int i = beginIndex; i < txt.length(); i++) {
-            char c = txt.charAt(i);
-            char charKey = Instances.singleton(CharFormatChain.class).format(c, context);
+            // 获取当前的 map 信息
+            nowMap = getNowMap(nowMap, context, txt, i);

-            // 判断该字是否存在于敏感词库中
-            // 并且将 nowMap 替换为新的 map，进入下一层的循环。
-            nowMap = (Map) nowMap.get(charKey);
            if (ObjectUtil.isNotNull(nowMap)) {
                lengthCount++;

@@ -60,4 +56,38 @@ public class SensitiveWordCheck implements ISensitiveCheck {
        return actualLength;
    }

+    /**
+     * Resolves the map level to continue matching from for the character at {@code index}.
+     * A real child entry always wins; only when there is none, repeat-ignoring is enabled
+     * and the character equals its predecessor (after formatting) is the repeat skipped.
+     * @param nowMap map level reached so far
+     * @param context word context supplying the ignore flags
+     * @param txt text being scanned
+     * @param index position of the character to consume
+     * @return child level, {@code nowMap} for a skipped repeat, or {@code null} if no match
+     * @since 0.0.7
+     */
+    private Map getNowMap(Map nowMap,
+                          final IWordContext context,
+                          final String txt,
+                          final int index) {
+        char c = txt.charAt(index);
+        char mappingChar = Instances.singleton(CharFormatChain.class).format(c, context);
+
+        // Prefer the dictionary: words that themselves contain doubled characters must
+        // still advance, so a repeat is only considered when no child entry exists.
+        Map currentMap = (Map) nowMap.get(mappingChar);
+        if (currentMap != null || !context.ignoreRepeat() || index <= 0) {
+            return currentMap;
+        }
+
+        // NOTE(review): index - 1 may precede the caller's beginIndex — confirm intended.
+        char preChar = txt.charAt(index - 1);
+        char preMappingChar = Instances.singleton(CharFormatChain.class)
+                .format(preChar, context);
+
+        // Same character as before: stay on the current level so the repeat is skipped.
+        return preMappingChar == mappingChar ? nowMap : null;
+    }
+
 }
--- a/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsRepeatTest.java
+++ b/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsRepeatTest.java
@@ -0,0 +1,29 @@
+package com.github.houbb.sensitive.word.bs;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.util.List;
+
+/**
+ * Checks that repeated characters inside a sensitive word are ignored while matching.
+ * <p> project: sensitive-word-SensitiveWordBsRepeatTest </p>
+ * <p> create on 2020/1/7 23:43 </p>
+ * @author Administrator
+ * @since 0.0.7
+ */
+public class SensitiveWordBsRepeatTest {
+
+    /**
+     * Repeated characters are ignored; the whole obfuscated run comes back as one match.
+     * @since 0.0.7
+     */
+    @Test
+    public void ignoreChineseStyleTest() { // NOTE(review): name looks copied from the Chinese-style test
+        final String text = "ⒻⒻⒻfⓤuⓤ⒰cⓒ⒦ the bad words";
+
+        List<String> wordList = SensitiveWordBs.newInstance().findAll(text);
+        Assert.assertEquals("[ⒻⒻⒻfⓤuⓤ⒰cⓒ⒦]", wordList.toString());
+    }
+
+}