diff --git a/README.md b/README.md index 794175a..8eddaf4 100644 --- a/README.md +++ b/README.md @@ -4,4 +4,72 @@ [![Maven Central](https://maven-badges.herokuapp.com/maven-central/com.github.houbb/sensitive-word/badge.svg)](http://mvnrepository.com/artifact/com.github.houbb/sensitive-word) -[![](https://img.shields.io/badge/license-Apache2-FF0080.svg)](https://github.com/houbb/sensitive-word/blob/master/LICENSE.txt) \ No newline at end of file +[![](https://img.shields.io/badge/license-Apache2-FF0080.svg)](https://github.com/houbb/sensitive-word/blob/master/LICENSE.txt) + +## 创作目的 + +实现一款好用敏感词工具。 + +基于 DFA 算法实现,目前敏感词库内容收录 18W+ 感觉过于臃肿。 + +后期将进行相关优化,降低字典的数量。 + +希望可以细化敏感词的分类,感觉工作量比较大,暂时没有太好的思路。 + +## 后期目标 + +- 持续扩容对应的敏感词(如合法的数据抓取) + +- 添加英文大小写忽略,全角半角忽略 + +- 中文添加拼音相关转换,添加繁简体转换忽略 + +- 允许用户自定义敏感词和白名单 + +# 快速开始 + +## 准备 + +- JDK1.7+ + +- Maven 3.x+ + +## Maven 引入 + +```xml + + com.github.houbb + sensitive-word + 0.0.1 + +``` + +## 使用实例 + +所有测试案例参见 [SensitiveWordBsTest]() + +### 判断是否包含敏感词 + +```java +final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。。"; + +Assert.assertTrue(SensitiveWordBs.getInstance().contains(text)); +``` + +### 返回第一个敏感词 + +```java +final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。"; + +String word = SensitiveWordBs.getInstance().findFirst(text); +Assert.assertEquals("五星红旗", word); +``` + +### 返回所有敏感词 + +```java +final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。"; + +List wordList = SensitiveWordBs.getInstance().findAll(text); +Assert.assertEquals("[五星红旗, 毛主席, 天安门]", wordList.toString()); +``` diff --git a/doc/CHANGE_LOG.md b/doc/CHANGE_LOG.md index 0984e75..565693a 100644 --- a/doc/CHANGE_LOG.md +++ b/doc/CHANGE_LOG.md @@ -13,10 +13,4 @@ | 序号 | 变更类型 | 说明 | 时间 | 备注 | |:---|:---|:---|:---|:--| -| 1 | A | 网易 163 邮箱功能的实现 | 2019-12-31 17:12:14 | | - -# release_0.0.2 - -| 序号 | 变更类型 | 说明 | 时间 | 备注 | -|:---|:---|:---|:---|:--| -| 1 | A | 多个收件人的特性支持 | 2020-1-6 17:12:14 | | +| 1 | A | 基本功能的实现 | 2020-1-7 21:46:32 | | diff --git a/doc/issues/issues.md b/doc/issues/issues.md index 3b4eaf1..f9b6fc6 100644 --- a/doc/issues/issues.md +++ b/doc/issues/issues.md @@ -10,3 +10,6 @@ ## stop-word +## 重复词 + +ffffuuuucccckkk diff --git a/doc/issues/v2-基本敏感词的简化.md b/doc/issues/v2-基本敏感词的简化.md new file mode 100644 index 0000000..a742359 --- /dev/null +++ b/doc/issues/v2-基本敏感词的简化.md @@ -0,0 +1,264 @@ +# 简化部分信息 + +## 纯数字 + +移除 + +## 去重 + +移除大量重复的信息。 + +提取出关键的敏感词语即可。 + +## 包含 stop-word 的信息 + +移除 stop-word 之后进行相关的处理。 + +## 单个字 + +48339 === Q +83586 === q +117538 === ━ +117539 === │ +117540 === ┃ +117541 === ┄ +117542 === ┅ +117554 === ┆ +117555 === ┇ +117556 === ┈ +117557 === ┉ +117558 === ┊ +117559 === ┋ +117560 === ┌ +117561 === ┍ +117562 === ┎ +117563 === ┏ +117564 === ┐ +117565 === ┑ +117566 === ┒ +117567 === ┓ +117568 === └ +117569 === ┕ +117570 === ┖ +117571 === ┗ +117572 === ┘ +117573 === ┙ +117574 === ┚ +117575 === ┛ +117576 === ├ +117577 === ┝ +117578 === ┞ +117579 === ┟ +117580 === ┠ +117581 === ┡ +117582 === ┢ +117583 === ┣ +117584 === ┤ +117585 === ┥ +117586 === ┦ +117587 === ┧ +117588 === ┨ +117589 === ┩ +117590 === ┪ +117591 === ┫ +117592 === ┬ +117593 === ┭ +117594 === ┮ +117595 === ┯ +117596 === ┰ +117597 === ┱ +117598 === ┲ +117599 === ┳ +117600 === ┴ +117601 === ┵ +117602 === ┶ +117603 === ┷ +117604 === ┸ +117605 === ┹ +117606 === ┺ +117607 === ┻ +117609 === ┼ +117610 === ┽ +117611 === ┾ +117612 === ┿ +117613 === ╀ +117614 === ╁ +117615 === ╂ +117616 === ╃ +117617 === ╄ +117618 === ╅ +117619 === ╆ +117620 === ╇ +117621 === ╈ +117622 === ╉ +117623 === ╊ +117624 === ╋ +117846 === ㄖ +121501 === 买 +121979 === 乳 +123013 === 仆 +133622 === 功 +133786 === 動 +133790 === 務 +134011 === 區 +134255 === 卐 +134287 === 卖 +134910 === 卵 +135512 === 口 +136392 === 吊 +136576 === 吨 +137367 === 喷 +137479 === 嘸 +139926 === 奸 +140085 === 妈 +140126 === 妓 +140373 === 姘 +140397 === 姦 +140409 === 姩 +140464 === 娘 +140498 === 娼 +140503 === 婊 +140519 === 婬 +140562 === 媽 +140585 === 嫖 +140668 === 孕 +141291 === 寇 +141668 === 射 +142550 === 尻 +142603 === 尿 +142620 === 屄 +142639 === 屌 +142650 === 屍 +142653 === 屎 +142665 === 屙 +143107 === 巯 +143346 === 干 +143535 === 幹 +143735 === 床 +144165 === 弓 +144386 === 弩 +144931 === 忍 +145146 === 性 +145905 === 慰 +145913 === 慾 +146837 === 戳 +146919 === 房 +147574 === 扣 +149446 === 抠 +149774 === 抽 +150089 === 挂 +150244 === 捻 +150260 === 掛 +150296 === 掯 +151938 === 插 +152406 === 操 +153468 === 日 +154328 === 曰 +154902 === 本 +155789 === 枪 +156187 === 槍 +156578 === 歌 +156780 === 死 +158105 === 氟 +158172 === 氯 +158265 === 氰 +158565 === 汞 +159598 === 洱 +159944 === 淪 +159948 === 淫 +161116 === 滚 +161125 === 滛 +161669 === 灾 +161676 === 炮 +161774 === 烂 +161845 === 烯 +161856 === 烷 +162055 === 爛 +162196 === 爽 +162941 === 獨 +162985 === 獸 +163396 === 甙 +163934 === 畜 +165856 === 眯 +165880 === 睾 +165889 === 瞳 +166039 === 砒 +166049 === 砜 +166086 === 砷 +166097 === 础 +166234 === 硼 +166254 === 碡 +166265 === 碱 +166275 === 碼 +166290 === 磷 +166298 === 磺 +166876 === 穴 +167390 === 糞 +167499 === 統 +167536 === 綸 +167961 === 罂 +168722 === 羟 +168800 === 羰 +169070 === 耣 +169444 === 肏 +169474 === 肛 +169508 === 肝 +169679 === 肼 +169680 === 肾 +169725 === 胂 +169729 === 胍 +169883 === 胺 +169907 === 脬 +169939 === 腈 +170004 === 膦 +170283 === 臺 +170406 === 色 +171007 === 苄 +171216 === 茎 +171229 === 草 +171395 === 萋 +171473 === 葵 +171614 === 蔻 +172474 === 裸 +172599 === 褻 +172877 === 証 +174115 === 賤 +174531 === 贱 +174972 === 踢 +174984 === 蹣 +175044 === 躶 +175063 === 輪 +175475 === 轮 +175543 === 辦 +176368 === 逼 +176679 === 酐 +176733 === 酮 +176734 === 酯 +176735 === 酰 +176767 === 醚 +176768 === 醛 +177126 === 鈤 +177295 === 鎷 +177321 === 钒 +177332 === 钠 +177487 === 铀 +177569 === 铊 +179476 === 锇 +179520 === 镉 +179521 === 镍 +179803 === 阴 +180109 === 陰 +180173 === 隂 +180292 === 雞 +180594 === 靠 +181185 === 騒 +181190 === 騷 +181303 === 驽 +181352 === 骚 +182246 === 鯫 +182247 === 鰢 +182306 === 鸠 +182308 === 鸡 +182405 === 鸨 +183438 === B +183491 === b diff --git a/doc/issues/v2-基本敏感词的标签.md b/doc/issues/v5-基本敏感词的标签.md similarity index 100% rename from doc/issues/v2-基本敏感词的标签.md rename to doc/issues/v5-基本敏感词的标签.md diff --git a/pom.xml b/pom.xml index fe7808f..8b341ca 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ com.github.houbb sensitive-word - 0.0.1-SNAPSHOT + 0.0.1 @@ -141,8 +141,8 @@ - email - The most elegant email tool for java. + sensitive-word + The sensitive word tool for java with DFA. org.sonatype.oss @@ -157,8 +157,8 @@ - https://github.com/houbb/email - https://github.com/houbb/email.git + https://github.com/houbb/sensitive-word + https://github.com/houbb/sensitive-word.git https://houbb.github.io/ diff --git a/release.bat b/release.bat index 89ec420..1fcd753 100644 --- a/release.bat +++ b/release.bat @@ -10,13 +10,13 @@ ECHO "============================= RELEASE START..." :: 版本号信息(需要手动指定) :::: 旧版本名称 -SET version=0.0.2 +SET version=0.0.1 :::: 新版本名称 -SET newVersion=0.0.3 +SET newVersion=0.0.2 :::: 组织名称 SET groupName=com.github.houbb :::: 项目名称 -SET projectName=email +SET projectName=sensitive-word :: release 项目版本 :::: snapshot 版本号 diff --git a/src/main/java/com/github/houbb/sensitive/word/api/IWordMap.java b/src/main/java/com/github/houbb/sensitive/word/api/IWordMap.java index 885a869..e004b61 100644 --- a/src/main/java/com/github/houbb/sensitive/word/api/IWordMap.java +++ b/src/main/java/com/github/houbb/sensitive/word/api/IWordMap.java @@ -1,9 +1,10 @@ package com.github.houbb.sensitive.word.api; -import com.github.houbb.sensitive.word.model.WordMapEntry; +import com.github.houbb.heaven.util.lang.StringUtil; +import com.github.houbb.sensitive.word.constant.enums.ValidModeEnum; import java.util.Collection; -import java.util.Map; +import java.util.List; /** * 敏感词 map @@ -12,12 +13,38 @@ import java.util.Map; */ public interface IWordMap { + /** - * 获取单词 map - * @param collection 集合 - * @return 敏感词 map + * 初始化敏感词 map + * @param collection 集合信息 * @since 0.0.1 */ - Map getWordMap(final Collection collection); + void initWordMap(Collection collection); + + /** + * 是否包含敏感词 + * @param string 字符串 + * @return 是否包含 + * @since 0.0.1 + * @see ValidModeEnum#FAIL_FAST 建议使用快速返回模式 + */ + boolean contains(final String string); + + /** + * 返回所有对应的敏感词 + * @param string 原始字符串 + * @return 结果 + * @since 0.0.1 + * @see ValidModeEnum#FAIL_OVER 建议使用全部检测返回模式 + */ + List findAll(final String string); + + /** + * 返回第一个对应的敏感词 + * @param string 原始字符串 + * @return 结果 + * @since 0.0.1 + */ + String findFirst(final String string); } diff --git a/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java b/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java index 34d6f66..f3f7cbb 100644 --- a/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java +++ b/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java @@ -5,7 +5,9 @@ import com.github.houbb.heaven.util.guava.Guavas; import com.github.houbb.heaven.util.lang.StringUtil; import com.github.houbb.heaven.util.util.CollectionUtil; import com.github.houbb.sensitive.word.api.IWordData; +import com.github.houbb.sensitive.word.api.IWordMap; import com.github.houbb.sensitive.word.support.data.SensitiveWordData; +import com.github.houbb.sensitive.word.support.map.SensitiveWordMap; import java.util.List; @@ -22,14 +24,6 @@ public class SensitiveWordBs { */ private SensitiveWordBs(){} - /** - * 待验证字符串信息 - * ps: 可以添加多个辅助类 xxxStringProvider - * 如 FileXXX - * @since 0.0.1 - */ - private volatile String target; - /** * 敏感数据信息 * @since 0.0.1 @@ -37,82 +31,73 @@ public class SensitiveWordBs { private IWordData sensitiveWordData = Instances.singleton(SensitiveWordData.class); /** - * 新建验证实例 - * @param string 字符串 - * @return this + * 敏感词 map * @since 0.0.1 */ - public static SensitiveWordBs newInstance(final String string) { - SensitiveWordBs instance = new SensitiveWordBs(); - instance.target = string; - return instance; + private IWordMap sensitiveWordMap = Instances.singleton(SensitiveWordMap.class); + + /** + * 获取单例信息 + * @since 0.0.1 + */ + private static final SensitiveWordBs INSTANCE; + + static { + synchronized (SensitiveWordBs.class) { + INSTANCE = new SensitiveWordBs(); + List lines = INSTANCE.sensitiveWordData.getWordData(); + INSTANCE.sensitiveWordMap.initWordMap(lines); + } } /** - * 指定目标字符串信息 - * @param string 字符串 + * 新建验证实例 * @return this * @since 0.0.1 */ - public SensitiveWordBs target(final String string) { - this.target = string; - return this; + public static SensitiveWordBs getInstance() { + return INSTANCE; } /** * 是否合法 + * @param target 目标字符串 * @return 是否 * @since 0.0.1 - * @see #contains() 是否包含 */ - public boolean valid() { - return !contains(); + public boolean valid(final String target) { + return !contains(target); } /** * 是否包含敏感词 + * @param target 目标字符串 * @return 是否 * @since 0.0.1 - * @see #findAll() 列表不为空即可 */ - public boolean contains() { - return CollectionUtil.isNotEmpty(findAll()); + public boolean contains(final String target) { + return this.sensitiveWordMap.contains(target); } /** * 返回所有的敏感词 - * 1. 这里是默认去重的。 + * 1. 这里是默认去重的,且是有序的。 + * 2. 如果不存在,返回空列表 * @return 敏感词列表 * @since 0.0.1 */ - public List findAll() { - if(StringUtil.isEmpty(target)) { - return Guavas.newArrayList(); - } - - // 分词 - return null; + public List findAll(final String target) { + return this.sensitiveWordMap.findAll(target); } /** - * 执行过滤 - * 1. 使用默认策略 - * 2. 默认策略就是直接移除。 - * @return 过滤后的结果 + * 返回第一个敏感词 + * (1)如果不存在,则返回 {@code null} + * @return 敏感词 * @since 0.0.1 */ - private String filter() { - return filter(StringUtil.EMPTY); - } - - /** - * 指定过滤的字符,执行过滤 - * 1. filter 只是一种特殊的字符串替换策略。 - * @return 过滤后的结果 - * @since 0.0.1 - */ - private String filter(final String filter) { - return ""; + public String findFirst(final String target) { + return this.sensitiveWordMap.findFirst(target); } } diff --git a/src/main/java/com/github/houbb/sensitive/word/constant/AppConst.java b/src/main/java/com/github/houbb/sensitive/word/constant/AppConst.java new file mode 100644 index 0000000..8d4d059 --- /dev/null +++ b/src/main/java/com/github/houbb/sensitive/word/constant/AppConst.java @@ -0,0 +1,19 @@ +package com.github.houbb.sensitive.word.constant; + +/** + *

project: sensitive-word-AppConst

+ *

create on 2020/1/7 23:39

+ * + * @author Administrator + * @since 0.0.1 + */ +public final class AppConst { + + /** + * 是否为结束标识 + * ps: 某种角度而言,我不是很喜欢这种风格。 + * @since 0.0.1 + */ + public static final String IS_END = "isEnd"; + +} diff --git a/src/main/java/com/github/houbb/sensitive/word/constant/enums/ValidModeEnum.java b/src/main/java/com/github/houbb/sensitive/word/constant/enums/ValidModeEnum.java new file mode 100644 index 0000000..e90b2fc --- /dev/null +++ b/src/main/java/com/github/houbb/sensitive/word/constant/enums/ValidModeEnum.java @@ -0,0 +1,23 @@ +package com.github.houbb.sensitive.word.constant.enums; + +/** + *

project: sensitive-word-ValidModeEnum

+ *

create on 2020/1/7 22:46

+ * + * @author Administrator + * @since 1.0.0 + */ +public enum ValidModeEnum { + + /** + * 快速失败 + * @since 0.0.1 + */ + FAIL_FAST, + + /** + * 全部遍历 + * @since 0.0.1 + */ + FAIL_OVER +} diff --git a/src/main/java/com/github/houbb/sensitive/word/constant/package-info.java b/src/main/java/com/github/houbb/sensitive/word/constant/package-info.java new file mode 100644 index 0000000..1b6d452 --- /dev/null +++ b/src/main/java/com/github/houbb/sensitive/word/constant/package-info.java @@ -0,0 +1,8 @@ +/** + *

project: sensitive-word-package-info

+ *

create on 2020/1/7 22:46

+ * + * @author Administrator + * @since 1.0.0 + */ +package com.github.houbb.sensitive.word.constant; \ No newline at end of file diff --git a/src/main/java/com/github/houbb/sensitive/word/model/WordMapEntry.java b/src/main/java/com/github/houbb/sensitive/word/model/WordMapEntry.java deleted file mode 100644 index a8e60bd..0000000 --- a/src/main/java/com/github/houbb/sensitive/word/model/WordMapEntry.java +++ /dev/null @@ -1,58 +0,0 @@ -package com.github.houbb.sensitive.word.model; - -import java.util.List; - -/** - * 所有的敏感词,第一个字都是 key - * - * @author binbin.hou - * @since 0.0.1 - */ -public class WordMapEntry { - - /** - * 单个单词 - * @since 0.0.1 - */ - private String word; - - /** - * 是否为结束 - * @since 0.0.1 - */ - private boolean isEnd; - - /** - * 下一层的信息列表 - * @since 0.0.1 - */ - private List nextEntryList; - - public String word() { - return word; - } - - public WordMapEntry word(String word) { - this.word = word; - return this; - } - - public boolean end() { - return isEnd; - } - - public WordMapEntry end(boolean end) { - isEnd = end; - return this; - } - - public List nextEntryList() { - return nextEntryList; - } - - public WordMapEntry nextEntryList(List nextEntryList) { - this.nextEntryList = nextEntryList; - return this; - } - -} diff --git a/src/main/java/com/github/houbb/sensitive/word/support/data/SensitiveWordData.java b/src/main/java/com/github/houbb/sensitive/word/support/data/SensitiveWordData.java index b10a555..38ec961 100644 --- a/src/main/java/com/github/houbb/sensitive/word/support/data/SensitiveWordData.java +++ b/src/main/java/com/github/houbb/sensitive/word/support/data/SensitiveWordData.java @@ -9,6 +9,7 @@ import java.util.List; /** * 数据加载使用单例的模式,只需要加载一次即可。 + * * @author binbin.hou * @since 0.0.1 */ @@ -17,18 +18,22 @@ public class SensitiveWordData implements IWordData { /** * 默认的内置行 + * * @since 0.0.1 */ private static List defaultLines; static { - long start = System.currentTimeMillis(); - defaultLines = new ArrayList<>(183837); - defaultLines = StreamUtils.readAllLines("/dict.txt"); - long end = System.currentTimeMillis(); - System.out.println("Sensitive data loaded!, cost time: " + (end-start) + " ms"); + synchronized (SensitiveWordData.class) { + long start = System.currentTimeMillis(); + defaultLines = new ArrayList<>(183837); + defaultLines = StreamUtils.readAllLines("/dict.txt"); + long end = System.currentTimeMillis(); + System.out.println("Sensitive data loaded!, cost time: " + (end - start) + " ms"); + } } + @Override public List getWordData() { return defaultLines; diff --git a/src/main/java/com/github/houbb/sensitive/word/support/map/SensitiveWordMap.java b/src/main/java/com/github/houbb/sensitive/word/support/map/SensitiveWordMap.java index 0b43335..b440f90 100644 --- a/src/main/java/com/github/houbb/sensitive/word/support/map/SensitiveWordMap.java +++ b/src/main/java/com/github/houbb/sensitive/word/support/map/SensitiveWordMap.java @@ -1,12 +1,18 @@ package com.github.houbb.sensitive.word.support.map; import com.github.houbb.heaven.annotation.ThreadSafe; +import com.github.houbb.heaven.util.guava.Guavas; import com.github.houbb.heaven.util.lang.ObjectUtil; +import com.github.houbb.heaven.util.lang.StringUtil; +import com.github.houbb.heaven.util.util.CollectionUtil; +import com.github.houbb.heaven.util.util.MapUtil; import com.github.houbb.sensitive.word.api.IWordMap; -import com.github.houbb.sensitive.word.model.WordMapEntry; +import com.github.houbb.sensitive.word.constant.AppConst; +import com.github.houbb.sensitive.word.constant.enums.ValidModeEnum; import java.util.Collection; import java.util.HashMap; +import java.util.List; import java.util.Map; /** @@ -18,89 +24,216 @@ import java.util.Map; @ThreadSafe public class SensitiveWordMap implements IWordMap { + /** + * 脱敏单词 map + * + * @since 0.0.1 + */ + private static Map sensitiveWordMap; /** * 读取敏感词库,将敏感词放入HashSet中,构建一个DFA算法模型: * - *
-     * 中 = {
-     * isEnd = 0
-     * 国 = {
-     * isEnd = 1
-     * 人 = {isEnd = 0
-     * 民 = {isEnd = 1}
-     * }
-     * 男  = {
-     * isEnd = 0
-     * 人 = {
-     * isEnd = 1
-     * }
-     * }
-     * }
-     * }
-     *
-     * 五 = {
-     * isEnd = 0
-     * 星 = {
-     * isEnd = 0
-     * 红 = {
-     * isEnd = 0
-     * 旗 = {
-     * isEnd = 1
-     * }
-     * }
-     * }
-     * }
-     * 
- * - * key: 对应的中文 - * value: 是否为结束。 - * - * 日本人,日本鬼子为例 - * - * 1、在hashMap中查询“日”看其是否在hashMap中存在,如果不存在,则证明已“日”开头的敏感词还不存在,则我们直接构建这样的一棵树。跳至3。 - * 2、如果在hashMap中查找到了,表明存在以“日”开头的敏感词,设置hashMap = hashMap.get("日"),跳至1,依次匹配“本”、“人”。 - * 3、判断该字是否为该词中的最后一个字。若是表示敏感词结束,设置标志位isEnd = 1,否则设置标志位isEnd = 0; - * * @param collection 敏感词库集合 * @since 0.0.1 - * + *

* 使用对象代码 map 的这种一直递归。 - * + * 参考资料:https://www.cnblogs.com/AlanLee/p/5329555.html + * https://blog.csdn.net/chenssy/article/details/26961957 */ @Override - public Map getWordMap(Collection collection) { - Map resultMap = new HashMap<>(collection.size()); + @SuppressWarnings("unchecked") + public void initWordMap(Collection collection) { + // 避免重复加载 + if (MapUtil.isNotEmpty(sensitiveWordMap)) { + return; + } + + long startTime = System.currentTimeMillis(); + // 避免扩容带来的消耗 + sensitiveWordMap = new HashMap(collection.size()); for (String key : collection) { + if (StringUtil.isEmpty(key)) { + continue; + } + + // 用来按照相应的格式保存敏感词库数据 char[] chars = key.toCharArray(); final int size = chars.length; - for (int i = 0; i < size; i++) { - String charStr = String.valueOf(chars[i]); + // 每一个新词的循环,直接将结果设置为当前 map,所有变化都会体现在结果的 map 中 + Map currentMap = sensitiveWordMap; - // 直接获取对应的 map - WordMapEntry wordMapEntry = resultMap.get(charStr); + for (int i = 0; i < size; i++) { + // 截取敏感词当中的字,在敏感词库中字为HashMap对象的Key键值 + char charKey = chars[i]; + // 如果集合存在 + Object wordMap = currentMap.get(charKey); // 如果集合存在 - if(ObjectUtil.isNotNull(wordMapEntry)) { - + if (ObjectUtil.isNotNull(wordMap)) { + // 直接将获取到的 map 当前当前 map 进行继续的操作 + currentMap = (Map) wordMap; } else { -// // 如果集合不存在,直接新建一个 map -// wordMap = new HashMap<>(size); -// // 判断是否为最后一个,如果是则设置为1 -// boolean isEnd = i == size - 1; -// // 设置最后的结果 -// wordMap.put(charStr, isEnd); + //不存在则,则构建一个新的map,同时将isEnd设置为0,因为他不是最后一 + Map newWordMap = new HashMap<>(); + newWordMap.put(AppConst.IS_END, false); + + // 将新的节点放入当前 map 中 + currentMap.put(charKey, newWordMap); + + // 将新节点设置为当前节点,方便下一次节点的循环。 + currentMap = newWordMap; + } + + // 判断是否为最后一个,添加是否结束的标识。 + if (i == size - 1) { + currentMap.put(AppConst.IS_END, true); } } - } - return resultMap; + + long endTime = System.currentTimeMillis(); + System.out.println("Init sensitive word map end! Cost time " + (endTime-startTime) + "ms"); } - public static void main(String[] args) { - System.out.println("s".toCharArray()[0]+""); + /** + * 是否包含 + * (1)直接遍历所有 + * (2)如果遇到,则直接返回 true + * + * @param string 字符串 + * @return 是否包含 + * @since 0.0.1 + */ + @Override + public boolean contains(String string) { + if (StringUtil.isEmpty(string)) { + return false; + } + + for (int i = 0; i < string.length(); i++) { + int checkResult = checkSensitiveWord(string, i); + // 快速返回 + if (checkResult > 0) { + return true; + } + } + return false; + } + + /** + * 返回所有对应的敏感词 + * (1)结果是有序的 + * (2)结果是默认去重的 + * + * @param string 原始字符串 + * @return 结果 + * @since 0.0.1 + */ + @Override + public List findAll(String string) { + return getSensitiveWords(string, ValidModeEnum.FAIL_OVER); + } + + @Override + public String findFirst(String string) { + List stringList = getSensitiveWords(string, ValidModeEnum.FAIL_FAST); + + if (CollectionUtil.isEmpty(stringList)) { + return null; + } + + return stringList.get(0); + } + + /** + * 获取敏感词列表 + * + * @param text 文本 + * @param modeEnum 模式 + * @return 结果列表 + * @since 0.0.1 + */ + private List getSensitiveWords(final String text, final ValidModeEnum modeEnum) { + //1. 是否存在敏感词,如果比存在,直接返回空列表 + if (StringUtil.isEmpty(text)) { + return Guavas.newArrayList(); + } + + List resultList = Guavas.newArrayList(); + for (int i = 0; i < text.length(); i++) { + int wordLength = checkSensitiveWord(text, i); + + // 命中 + if (wordLength > 0) { + // 保存敏感词 + String sensitiveWord = text.substring(i, i+wordLength); + + // 添加去重 + if(!resultList.contains(sensitiveWord)) { + resultList.add(sensitiveWord); + } + + // 快速返回 + if (ValidModeEnum.FAIL_FAST.equals(modeEnum)) { + break; + } + + // 增加 i 的步长 + // 为什么要-1,因为默认就会自增1 + i += wordLength - 1; + } + } + + return resultList; + } + + /** + * 检查敏感词数量 + *

+ * (1)如果未命中敏感词,直接返回 0 + * (2)命中敏感词,则返回敏感词的长度。 + * + * @param txt 文本信息 + * @param beginIndex 开始下标 + * @return 敏感词对应的长度 + * @since 0.0.1 + */ + private int checkSensitiveWord(String txt, int beginIndex) { + Map nowMap = sensitiveWordMap; + + boolean flag = false; + // 记录敏感词的长度 + int sensitiveWordLength = 0; + + for (int i = beginIndex; i < txt.length(); i++) { + char charKey = txt.charAt(i); + // 判断该字是否存在于敏感词库中 + // 并且将 nowMap 替换为新的 map,进入下一层的循环。 + nowMap = (Map) nowMap.get(charKey); + if (ObjectUtil.isNotNull(nowMap)) { + sensitiveWordLength++; + + // 判断是否是敏感词的结尾字,如果是结尾字则判断是否继续检测 + boolean isEnd = (boolean) nowMap.get(AppConst.IS_END); + if (isEnd) { + flag = true; + + // 这里直接默认 fail-fast 即可。 + break; + } + } else { + // 直接跳出循环 + break; + } + } + + if (!flag) { + sensitiveWordLength = 0; + } + return sensitiveWordLength; } } diff --git a/src/main/resources/dict.txt b/src/main/resources/dict.txt index 767eb1d..5a344e9 100644 --- a/src/main/resources/dict.txt +++ b/src/main/resources/dict.txt @@ -164386,7 +164386,6 @@ z以留吧以其以武 百花故事 百花盛放 百行教师贱 -的 的同修 的妹 子 都 很 急 约 的阿斗 diff --git a/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsTest.java b/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsTest.java new file mode 100644 index 0000000..d48d364 --- /dev/null +++ b/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsTest.java @@ -0,0 +1,40 @@ +package com.github.houbb.sensitive.word.bs; + +import org.junit.Assert; +import org.junit.Test; + +import java.util.List; + +/** + *

project: sensitive-word-SensitiveWordBsTest

+ *

create on 2020/1/7 23:43

+ * + * @author Administrator + * @since 0.0.1 + */ +public class SensitiveWordBsTest { + + @Test + public void containsTest() { + final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。。"; + + Assert.assertTrue(SensitiveWordBs.getInstance().contains(text)); + } + + @Test + public void findAllTest() { + final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。"; + + List wordList = SensitiveWordBs.getInstance().findAll(text); + Assert.assertEquals("[五星红旗, 毛主席, 天安门]", wordList.toString()); + } + + @Test + public void findFirstTest() { + final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。"; + + String word = SensitiveWordBs.getInstance().findFirst(text); + Assert.assertEquals("五星红旗", word); + } + +} diff --git a/src/test/java/com/github/houbb/sensitive/word/data/DataInitTest.java b/src/test/java/com/github/houbb/sensitive/word/data/DataInitTest.java index 410c970..8a76d4d 100644 --- a/src/test/java/com/github/houbb/sensitive/word/data/DataInitTest.java +++ b/src/test/java/com/github/houbb/sensitive/word/data/DataInitTest.java @@ -102,4 +102,18 @@ public class DataInitTest { FileUtil.write(target, disctinct); } + @Test + @Ignore + public void oneWordTest() { + final String source = "D:\\_github\\sensitive-word\\src\\main\\resources\\dict.txt"; + + List lines = FileUtil.readAllLines(source); + for(int i = 0; i < lines.size(); i++) { + String line = lines.get(i); + if(line.trim().length() == 1) { + System.out.println(i + " === " + line); + } + } + } + }