From 579a98c6a7463fbafd21583970612b548124d508 Mon Sep 17 00:00:00 2001 From: "binbin.hou" <1060732496@qq.com> Date: Fri, 10 Jan 2020 14:21:16 +0800 Subject: [PATCH] release branch 0.0.6 --- README.md | 28 ++++++++++++-- doc/CHANGE_LOG.md | 11 +++++- doc/issues/roadmap/v006-繁简体转换实现.md | 5 ++- ...008-拼音的处理.md => v014-声近字的处理.md} | 8 +++- doc/issues/roadmap/v014-形近字的处理.md | 13 +++++++ pom.xml | 3 +- release.bat | 4 +- .../sensitive/word/api/IWordContext.java | 31 +++++++++++++++ .../sensitive/word/bs/SensitiveWordBs.java | 2 + .../word/bs/SensitiveWordContext.java | 38 +++++++++++++++++++ .../sensitive/word/constant/AppConst.java | 2 +- .../word/support/format/CharFormatChain.java | 7 +++- .../format/IgnoreChineseStyleFormat.java | 24 ++++++++++++ .../format/IgnoreEnglishStyleFormat.java | 21 ++++++++++ src/main/resources/dict.txt | 22 +---------- .../word/bs/SensitiveWordBsChineseTest.java | 29 ++++++++++++++ .../word/bs/SensitiveWordBsEnglishTest.java | 29 ++++++++++++++ .../houbb/sensitive/word/data/DataUtil.java | 15 ++++++++ 18 files changed, 258 insertions(+), 34 deletions(-) rename doc/issues/roadmap/{v008-拼音的处理.md => v014-声近字的处理.md} (76%) create mode 100644 doc/issues/roadmap/v014-形近字的处理.md create mode 100644 src/main/java/com/github/houbb/sensitive/word/support/format/IgnoreChineseStyleFormat.java create mode 100644 src/main/java/com/github/houbb/sensitive/word/support/format/IgnoreEnglishStyleFormat.java create mode 100644 src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsChineseTest.java create mode 100644 src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsEnglishTest.java diff --git a/README.md b/README.md index 7b254d9..3903c95 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,11 @@ - 支持英文大小写互换 -- 支持数字各种形式的互换 +- 支持数字常见形式的互换 + +- 支持中文繁简体互换 + +- 支持英文常见形式的互换 ## 变更日志 @@ -50,7 +54,7 @@ com.github.houbb sensitive-word - 0.0.5 + 0.0.6 ``` @@ -147,9 +151,25 @@ List wordList = SensitiveWordBs.newInstance().findAll(text); 
Assert.assertEquals("[9⓿二肆⁹₈③⑸⒋➃㈤㊄]", wordList.toString()); ``` -# 后期 road-map +## 忽略繁简体 -- 繁简体互换 +```java +final String text = "我爱我的祖国和五星紅旗。"; + +List wordList = SensitiveWordBs.newInstance().findAll(text); +Assert.assertEquals("[五星紅旗]", wordList.toString()); +``` + +## 忽略英文的书写格式 + +```java +final String text = "Ⓕⓤc⒦ the bad words"; + +List wordList = SensitiveWordBs.newInstance().findAll(text); +Assert.assertEquals("[Ⓕⓤc⒦]", wordList.toString()); +``` + +# 后期 road-map - 重复词 diff --git a/doc/CHANGE_LOG.md b/doc/CHANGE_LOG.md index eae8699..1dbe0e9 100644 --- a/doc/CHANGE_LOG.md +++ b/doc/CHANGE_LOG.md @@ -46,4 +46,13 @@ | 3 | O | 责任链模式优化代码实现 | 2020-1-10 09:34:35 | | | 4 | A | 支持数字格式化转换 | 2020-1-10 09:34:35 | | | 5 | A | 支持数字敏感词验证 | 2020-1-10 09:34:35 | | -| 6 | O | 优化所有写法的数字为阿拉伯写法 | 2020-1-10 09:34:35 | | \ No newline at end of file +| 6 | O | 优化所有写法的数字为阿拉伯写法 | 2020-1-10 09:34:35 | | + +# release_0.0.6 + +| 序号 | 变更类型 | 说明 | 时间 | 备注 | +|:---|:---|:---|:---|:--| +| 1 | A | 添加中文繁简体转换支持 | 2020-1-10 09:34:35 | | +| 2 | A | 添加英文常见写法转换支持 | 2020-1-10 09:34:35 | | +| 3 | A | 新增敏感词 `艹` | 2020-1-10 09:34:35 | | +| 4 | D | 移除单个词 `k买仆办功务动区卖台吨天房本歌滚灾独证踢弓` | 2020-1-10 09:34:35 | | \ No newline at end of file diff --git a/doc/issues/roadmap/v006-繁简体转换实现.md b/doc/issues/roadmap/v006-繁简体转换实现.md index 7e07ff4..df2bdec 100644 --- a/doc/issues/roadmap/v006-繁简体转换实现.md +++ b/doc/issues/roadmap/v006-繁简体转换实现.md @@ -1,3 +1,6 @@ # 在遍历的时候 -如果是中文,则直接进行替换。 \ No newline at end of file +如果是中文,则直接进行替换。 + +# 忽略英文的写法样式 + diff --git a/doc/issues/roadmap/v008-拼音的处理.md b/doc/issues/roadmap/v014-声近字的处理.md similarity index 76% rename from doc/issues/roadmap/v008-拼音的处理.md rename to doc/issues/roadmap/v014-声近字的处理.md index 6cfb9d9..c62a44f 100644 --- a/doc/issues/roadmap/v008-拼音的处理.md +++ b/doc/issues/roadmap/v014-声近字的处理.md @@ -24,4 +24,10 @@ 现在的转化为数字的,也生成一份拼音。 -然后将二者进行合并。 \ No newline at end of file +然后将二者进行合并。 + +## 拼音的处理 + +拼音的处理只是形声字。 + +还可以有象形字,所以第一份包含中文写法的字段很重要。 \ No newline at end of file diff --git 
a/doc/issues/roadmap/v014-形近字的处理.md b/doc/issues/roadmap/v014-形近字的处理.md new file mode 100644 index 0000000..3ce6449 --- /dev/null +++ b/doc/issues/roadmap/v014-形近字的处理.md @@ -0,0 +1,13 @@ +# 形近字 + +比如:王 玉 这种。 + +这种相对而言比较难,需要有一张完整的近似表。 + +# 组合字 + +甚至包含偏旁部首: + +如 `法`==》【氵去】【水去】等等。 + +这种可以通过原来的字直接进行拆分。 \ No newline at end of file diff --git a/pom.xml b/pom.xml index 66c7e51..c2ec8d8 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ com.github.houbb sensitive-word - 0.0.5 + 0.0.6 @@ -46,7 +46,6 @@ com.github.houbb opencc4j ${opencc4j.version} - true com.github.houbb diff --git a/release.bat b/release.bat index 687e022..6209cd6 100644 --- a/release.bat +++ b/release.bat @@ -10,9 +10,9 @@ ECHO "============================= RELEASE START..." :: 版本号信息(需要手动指定) :::: 旧版本名称 -SET version=0.0.5 +SET version=0.0.6 :::: 新版本名称 -SET newVersion=0.0.6 +SET newVersion=0.0.7 :::: 组织名称 SET groupName=com.github.houbb :::: 项目名称 diff --git a/src/main/java/com/github/houbb/sensitive/word/api/IWordContext.java b/src/main/java/com/github/houbb/sensitive/word/api/IWordContext.java index faa9619..d352d16 100644 --- a/src/main/java/com/github/houbb/sensitive/word/api/IWordContext.java +++ b/src/main/java/com/github/houbb/sensitive/word/api/IWordContext.java @@ -54,6 +54,22 @@ public interface IWordContext { */ IWordContext ignoreNumStyle(boolean ignoreNumStyle); + /** + * 忽略中文繁简体格式 + * @return 是否 + * @since 0.0.6 + */ + boolean ignoreChineseStyle(); + + /** + * 设置是否忽略中文繁简体格式 + * @param ignoreChineseStyle 是否忽略 + * @return this + * @since 0.0.6 + */ + IWordContext ignoreChineseStyle(final boolean ignoreChineseStyle); + + /** * 获取敏感词信息 * @return 敏感词 @@ -78,9 +94,24 @@ public interface IWordContext { /** * 设置敏感数字检测 + * @param sensitiveNumCheck 数字格式检测 * @return 数字检测 * @since 0.0.5 */ IWordContext sensitiveNumCheck(final boolean sensitiveNumCheck); + /** + * 忽略英文的写法 + * @return 是否忽略英文的写法 + * @since 0.0.6 + */ + boolean ignoreEnglishStyle(); + + /** + * 设置忽略英文的写法 + * @return this + * @since 0.0.6 + */ +
IWordContext ignoreEnglishStyle(final boolean ignoreEnglishStyle); + } diff --git a/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java b/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java index 1746216..1669604 100644 --- a/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java +++ b/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java @@ -83,6 +83,8 @@ public class SensitiveWordBs { wordContext.ignoreCase(true); wordContext.ignoreWidth(true); wordContext.ignoreNumStyle(true); + wordContext.ignoreChineseStyle(true); + wordContext.ignoreEnglishStyle(true); // 开启校验 wordContext.sensitiveNumCheck(true); diff --git a/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordContext.java b/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordContext.java index 250f34a..232709b 100644 --- a/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordContext.java +++ b/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordContext.java @@ -40,6 +40,19 @@ public class SensitiveWordContext implements IWordContext { * @since 0.0.6 */ private boolean sensitiveNumCheck; + + /** + * 是否忽略中文繁简体 + * @since 0.0.6 + */ + private boolean ignoreChineseStyle; + + /** + * 是否忽略英文的写法 + * @since 0.0.6 + */ + private boolean ignoreEnglishStyle; + /** * 私有化构造器 * @since 0.0.4 @@ -111,13 +124,38 @@ public class SensitiveWordContext implements IWordContext { return this; } + @Override + public boolean ignoreChineseStyle() { + return ignoreChineseStyle; + } + + @Override + public SensitiveWordContext ignoreChineseStyle(boolean ignoreChineseStyle) { + this.ignoreChineseStyle = ignoreChineseStyle; + return this; + } + + @Override + public boolean ignoreEnglishStyle() { + return ignoreEnglishStyle; + } + + @Override + public SensitiveWordContext ignoreEnglishStyle(boolean ignoreEnglishStyle) { + this.ignoreEnglishStyle = ignoreEnglishStyle; + return this; + } + @Override public String toString() { return 
"SensitiveWordContext{" + "ignoreCase=" + ignoreCase + ", ignoreWidth=" + ignoreWidth + ", ignoreNumStyle=" + ignoreNumStyle + + ", sensitiveWordMap=" + sensitiveWordMap + ", sensitiveNumCheck=" + sensitiveNumCheck + + ", ignoreChineseStyle=" + ignoreChineseStyle + + ", ignoreEnglishStyle=" + ignoreEnglishStyle + '}'; } diff --git a/src/main/java/com/github/houbb/sensitive/word/constant/AppConst.java b/src/main/java/com/github/houbb/sensitive/word/constant/AppConst.java index 0ab3f60..14090b9 100644 --- a/src/main/java/com/github/houbb/sensitive/word/constant/AppConst.java +++ b/src/main/java/com/github/houbb/sensitive/word/constant/AppConst.java @@ -23,7 +23,7 @@ public final class AppConst { * 字典的大小 * @since 0.0.1 */ - public static final int DICT_SIZE = 65295; + public static final int DICT_SIZE = 65275; /** * 英语词典的大小 diff --git a/src/main/java/com/github/houbb/sensitive/word/support/format/CharFormatChain.java b/src/main/java/com/github/houbb/sensitive/word/support/format/CharFormatChain.java index 029473e..0b72d8f 100644 --- a/src/main/java/com/github/houbb/sensitive/word/support/format/CharFormatChain.java +++ b/src/main/java/com/github/houbb/sensitive/word/support/format/CharFormatChain.java @@ -21,9 +21,11 @@ public class CharFormatChain implements ICharFormat { char result = original; List charFormats = Guavas.newArrayList(); + if(context.ignoreEnglishStyle()) { + charFormats.add(Instances.singleton(IgnoreEnglishStyleFormat.class)); + } if(context.ignoreCase()) { charFormats.add(Instances.singleton(IgnoreCaseCharFormat.class)); - } if(context.ignoreWidth()) { charFormats.add(Instances.singleton(IgnoreWidthCharFormat.class)); @@ -31,6 +33,9 @@ public class CharFormatChain implements ICharFormat { if(context.ignoreNumStyle()) { charFormats.add(Instances.singleton(IgnoreNumStyleCharFormat.class)); } + if(context.ignoreChineseStyle()) { + charFormats.add(Instances.singleton(IgnoreChineseStyleFormat.class)); + } // 循环执行 for(ICharFormat charFormat : charFormats) 
{ diff --git a/src/main/java/com/github/houbb/sensitive/word/support/format/IgnoreChineseStyleFormat.java b/src/main/java/com/github/houbb/sensitive/word/support/format/IgnoreChineseStyleFormat.java new file mode 100644 index 0000000..60efcba --- /dev/null +++ b/src/main/java/com/github/houbb/sensitive/word/support/format/IgnoreChineseStyleFormat.java @@ -0,0 +1,24 @@ +package com.github.houbb.sensitive.word.support.format; + +import com.github.houbb.heaven.annotation.ThreadSafe; +import com.github.houbb.opencc4j.core.impl.ZhConvertBootstrap; +import com.github.houbb.opencc4j.support.segment.impl.CharSegment; +import com.github.houbb.sensitive.word.api.ICharFormat; +import com.github.houbb.sensitive.word.api.IWordContext; + +/** + * 忽略中文繁简体 + * @author binbin.hou + * @since 0.0.6 + */ +@ThreadSafe +public class IgnoreChineseStyleFormat implements ICharFormat { + + @Override + public char format(char original, IWordContext context) { + String string = String.valueOf(original); + String simple = ZhConvertBootstrap.newInstance(new CharSegment()).toSimple(string); + return simple.charAt(0); + } + +} diff --git a/src/main/java/com/github/houbb/sensitive/word/support/format/IgnoreEnglishStyleFormat.java b/src/main/java/com/github/houbb/sensitive/word/support/format/IgnoreEnglishStyleFormat.java new file mode 100644 index 0000000..e132254 --- /dev/null +++ b/src/main/java/com/github/houbb/sensitive/word/support/format/IgnoreEnglishStyleFormat.java @@ -0,0 +1,21 @@ +package com.github.houbb.sensitive.word.support.format; + +import com.github.houbb.heaven.annotation.ThreadSafe; +import com.github.houbb.sensitive.word.api.ICharFormat; +import com.github.houbb.sensitive.word.api.IWordContext; +import com.github.houbb.sensitive.word.utils.CharUtils; + +/** + * 忽略英文的各种格式 + * @author binbin.hou + * @since 0.0.6 + */ +@ThreadSafe +public class IgnoreEnglishStyleFormat implements ICharFormat { + + @Override + public char format(char original, IWordContext context) { + return
CharUtils.getMappingChar(original); + } + +} diff --git a/src/main/resources/dict.txt b/src/main/resources/dict.txt index 7967e15..09d6466 100644 --- a/src/main/resources/dict.txt +++ b/src/main/resources/dict.txt @@ -3315,7 +3315,6 @@ jz女模 j巴 j总病危 j糸己元 -k k2精炼粉批发代理 k3精炼粉批发代理 k456⒓2 @@ -10785,7 +10784,6 @@ z以留吧以其以武 书办理 书记 书记群p艳照 -买 买1送1 买2送1 买54手枪 @@ -11939,7 +11937,6 @@ z以留吧以其以武 仁科百华bt种子 仁科百华种子 仁青加 -仆 仆不怕饮 仆街 仇保兴 @@ -20979,7 +20976,6 @@ z以留吧以其以武 力满库 力霸 力骗中央 -办 办1个会繁荣1座城市 办46级证 办仿真2代身份证 @@ -21436,7 +21432,6 @@ z以留吧以其以武 办高仿证件 办高仿证件qq 办高利贷 -功 功劳不亚于杨利伟 功友 功夫online @@ -21520,14 +21515,12 @@ z以留吧以其以武 加非猫现实世界历险 加非猫现实世界历险记 加骚妹qq -务 务员答案 务员考试 劣乐 劣等人 劣等民族 劣质蜜饯加工 -动 动5感地带 动乱 动向 @@ -21783,7 +21776,6 @@ z以留吧以其以武 北野光种子 北韩 北高联 -区 区的雷人 医世无忧 医保用户数据 @@ -22059,7 +22051,6 @@ z以留吧以其以武 单身白领油城姻缘 单身白领聚会交友 单身群 -卖 卖1代个人身份证 卖1代假冒身份证 卖1代假身份证 @@ -23249,7 +23240,6 @@ z以留吧以其以武 可过机假钱qq 可随意改变称重结果 可非 -台 台du 台wan 台军 @@ -23811,7 +23801,6 @@ z以留吧以其以武 吡利啶 吡啶胺 吧灵吧灵7灵露灵 -吨 含乳 含住我的小鸡鸡 含住肉棒上下使劲吸吮起来 @@ -26113,7 +26102,6 @@ z以留吧以其以武 大龟头顶住子宫深处 大龟头顶住花心 大龟头顶紧子宫口 -天 天上人间博彩网站 天上导弹乱 天上掉下个打工妹 @@ -30327,7 +30315,6 @@ z以留吧以其以武 弄儿的后宫免费阅读 弄花香满衣 式粉推 -弓 弓nu 弓nu买卖 弓nu供应 @@ -32855,7 +32842,6 @@ z以留吧以其以武 户外砍刀销售 户外砍刀销售qq 户外露胸舔鲍 -房 房主信息出售qq 房主信息销售 房主信息销售qq @@ -39771,7 +39757,6 @@ z以留吧以其以武 末裔之殇 末裔之殇txt 末路惊情 -本 本人代购 本人何美婷想寻1位真诚男士 本人出租 @@ -41292,7 +41277,6 @@ z以留吧以其以武 款到发货 歇b 歇逼 -歌 歌功颂德 歌华 歌女也把将星扛 @@ -45304,7 +45288,6 @@ z以留吧以其以武 滕任 滕文生 滕武 -滚 滚圆大乳 滚床单 滚滚球 @@ -45802,7 +45785,6 @@ z以留吧以其以武 灼灼桃花十里取1朵放在心上足矣你能陪我会聊天吗 灼灼桃花十里取1朵放在心上足矣这是我薇信 灼热的龟头紧顶住柔嫩的菊花口 -灾 灾区 炎症治疗 炒作公司 @@ -46747,7 +46729,6 @@ z以留吧以其以武 狩猎气狗专卖qq 狩猎汽动狗出售 狩猎用弩 -独 独占princesscollection若叶 独占动画女热大6file2 独占动画潮崎高校女教师陵辱12 @@ -54645,6 +54626,7 @@ z以留吧以其以武 茶鱼 茶鱼论坛 荆棘护卫兵 +艹 草 草bbbbbbb 草你 @@ -56680,7 +56662,6 @@ z以留吧以其以武 访问youtube 访问找我 访问链接 -证 证1次性 证书 证书办 @@ -57748,7 +57729,6 @@ z以留吧以其以武 践踏我 踏莎行3军战震魔 踏莎行3军战震魔txt -踢 踢到中国脸 踢思皮这几个的首字母漆山妖这几个是数字 踢踢球 diff --git a/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsChineseTest.java 
b/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsChineseTest.java new file mode 100644 index 0000000..1fac846 --- /dev/null +++ b/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsChineseTest.java @@ -0,0 +1,29 @@ +package com.github.houbb.sensitive.word.bs; + +import org.junit.Assert; +import org.junit.Test; + +import java.util.List; + +/** + *

project: sensitive-word-SensitiveWordBsChineseTest

+ *

create on 2020/1/7 23:43

+ * + * @author Administrator + * @since 0.0.6 + */ +public class SensitiveWordBsChineseTest { + + /** + * 忽略中文繁简体 + * @since 0.0.6 + */ + @Test + public void ignoreChineseStyleTest() { + final String text = "我爱我的祖国和五星紅旗。"; + + List wordList = SensitiveWordBs.newInstance().findAll(text); + Assert.assertEquals("[五星紅旗]", wordList.toString()); + } + +} diff --git a/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsEnglishTest.java b/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsEnglishTest.java new file mode 100644 index 0000000..78ccb9a --- /dev/null +++ b/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsEnglishTest.java @@ -0,0 +1,29 @@ +package com.github.houbb.sensitive.word.bs; + +import org.junit.Assert; +import org.junit.Test; + +import java.util.List; + +/** + *

project: sensitive-word-SensitiveWordBsEnglishTest

+ *

create on 2020/1/7 23:43

+ * + * @author Administrator + * @since 0.0.6 + */ +public class SensitiveWordBsEnglishTest { + + /** + * 忽略英文写法 + * @since 0.0.6 + */ + @Test + public void ignoreEnglishStyleTest() { + final String text = "Ⓕⓤc⒦ the bad words"; + + List wordList = SensitiveWordBs.newInstance().findAll(text); + Assert.assertEquals("[Ⓕⓤc⒦]", wordList.toString()); + } + +} diff --git a/src/test/java/com/github/houbb/sensitive/word/data/DataUtil.java b/src/test/java/com/github/houbb/sensitive/word/data/DataUtil.java index e350e54..207ef05 100644 --- a/src/test/java/com/github/houbb/sensitive/word/data/DataUtil.java +++ b/src/test/java/com/github/houbb/sensitive/word/data/DataUtil.java @@ -2,6 +2,8 @@ package com.github.houbb.sensitive.word.data; import com.github.houbb.heaven.util.io.FileUtil; import com.github.houbb.heaven.util.util.CollectionUtil; +import org.junit.Ignore; +import org.junit.Test; import java.util.Collection; import java.util.Collections; @@ -33,4 +35,17 @@ public class DataUtil { return stringList; } + @Test + @Ignore + public void singleCharTest() { + final String path = "D:\\github\\sensitive-word\\src\\main\\resources\\dict.txt"; + + List stringList = FileUtil.readAllLines(path); + for(String s : stringList) { + if(s.length() == 1) { + System.out.println(s); + } + } + } + }