diff --git a/.gitignore b/.gitignore index af3d4f9..35514a8 100644 --- a/.gitignore +++ b/.gitignore @@ -31,3 +31,5 @@ target/ .DS_Store Thumbs.db + +*.jfr \ No newline at end of file diff --git a/CHANGE_LOG.md b/CHANGE_LOG.md index 38bb3e0..64c893b 100644 --- a/CHANGE_LOG.md +++ b/CHANGE_LOG.md @@ -450,3 +450,9 @@ | 序号 | 变更类型 | 说明 | 时间 | 备注 | |:---|:-----|-----------|:-------------------|:---------------------------------------------------| | 1 | F | 修正词库缺失的问题 | 2025-7-24 23:09:10 | https://github.com/houbb/sensitive-word/issues/125 | + +# release_0.28.0 + +| 序号 | 变更类型 | 说明 | 时间 | 备注 | +|:---|:-----|------------|:------------------|:---------------------------------------------------| +| 1 | O | 优化 char 映射 | 2025-9-4 16:22:24 | https://github.com/houbb/sensitive-word/issues/131 | diff --git a/README.md b/README.md index 97a297b..b49bd59 100644 --- a/README.md +++ b/README.md @@ -110,7 +110,7 @@ v0.24.0 开始内置支持对敏感词的分类细化,不过工作量比较大 com.github.houbb sensitive-word - 0.27.1 + 0.28.0 ``` diff --git a/pom.xml b/pom.xml index e975135..3f9e861 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ com.github.houbb sensitive-word - 0.27.1 + 0.28.0-SNAPSHOT @@ -22,7 +22,7 @@ UTF-8 - 1.7 + 1.8 0.13.0 diff --git a/src/main/java/com/github/houbb/sensitive/word/api/IWordContext.java b/src/main/java/com/github/houbb/sensitive/word/api/IWordContext.java index 4147aef..ddda5f8 100644 --- a/src/main/java/com/github/houbb/sensitive/word/api/IWordContext.java +++ b/src/main/java/com/github/houbb/sensitive/word/api/IWordContext.java @@ -251,6 +251,23 @@ public interface IWordContext { */ IWordFormat wordFormat(); + /** + * Sets the IWordFormatText strategy + * + * @param wordFormatText text formatting strategy + * @return the word context + * @since 0.28.0 + */ + IWordContext wordFormatText(final IWordFormatText wordFormatText); + + /** + * Returns the text formatting strategy + * + * @return the strategy + * @since 0.28.0 + */ + IWordFormatText wordFormatText(); + /** * 获取 wordMap 策略 * @return 策略 diff --git a/src/main/java/com/github/houbb/sensitive/word/api/IWordFormatText.java
b/src/main/java/com/github/houbb/sensitive/word/api/IWordFormatText.java new file mode 100644 index 0000000..6a72dda --- /dev/null +++ b/src/main/java/com/github/houbb/sensitive/word/api/IWordFormatText.java @@ -0,0 +1,23 @@ +package com.github.houbb.sensitive.word.api; + +import java.util.Map; + +/** + * Formatting strategy applied to a text as a whole + * + * @author binbin.hou + * @since 0.28.0 + */ +public interface IWordFormatText { + + /** + * Builds the char format mapping for the given text, giving implementations control over the text as a whole + * + * @param text original text + * @param context word context + * @return mapping from original char to formatted char + * @since 0.28.0 + */ + Map format(final String text, final IWordContext context); + +} diff --git a/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java b/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java index 625d15d..c2e0eb4 100644 --- a/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java +++ b/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java @@ -15,6 +15,7 @@ import com.github.houbb.sensitive.word.support.combine.check.WordCheckCombines; import com.github.houbb.sensitive.word.support.combine.format.WordFormatCombines; import com.github.houbb.sensitive.word.support.data.WordDatas; import com.github.houbb.sensitive.word.support.deny.WordDenys; +import com.github.houbb.sensitive.word.support.format.mapping.WordFormatTexts; import com.github.houbb.sensitive.word.support.ignore.SensitiveWordCharIgnores; import com.github.houbb.sensitive.word.support.replace.WordReplaces; import com.github.houbb.sensitive.word.support.result.WordResultHandlers; @@ -219,6 +220,12 @@ public class SensitiveWordBs implements ISensitiveWordDestroy { */ private IWordCheck wordCheckIpv4 = WordChecks.ipv4(); + /** + * Text formatting strategy, initialised to WordFormatTexts.defaults() + * @since 0.28.0 + */ + private IWordFormatText wordFormatText = WordFormatTexts.defaults(); + /** * 新建验证实例 *

@@ -246,6 +253,7 @@ public class SensitiveWordBs implements ISensitiveWordDestroy { final IWordFormat charFormat = wordFormatCombine.initWordFormat(context); context.wordFormat(charFormat); + // 3. 初始化对应的 Check 策略 final IWordCheck sensitiveCheck = wordCheckCombine.initWordCheck(context); context.sensitiveCheck(sensitiveCheck); @@ -285,6 +293,7 @@ public class SensitiveWordBs implements ISensitiveWordDestroy { context.ignoreEnglishStyle(ignoreEnglishStyle); context.ignoreRepeat(ignoreRepeat); context.wordFailFast(wordFailFast); + context.wordFormatText(this.wordFormatText); // 开启校验 context.enableNumCheck(enableNumCheck); @@ -450,6 +459,13 @@ public class SensitiveWordBs implements ISensitiveWordDestroy { return this; } + public SensitiveWordBs wordFormatText(IWordFormatText wordFormatText) { + ArgUtil.notNull(wordFormatText, "wordFormatText"); + + this.wordFormatText = wordFormatText; + return this; + } + //-------------------------------------------------------- 基础属性设置 /** * 是否启用 ipv4 校验 diff --git a/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordContext.java b/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordContext.java index 59cf16f..4bbe8f6 100644 --- a/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordContext.java +++ b/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordContext.java @@ -106,6 +106,12 @@ public class SensitiveWordContext implements IWordContext { */ private IWordFormat wordFormat; + /** + * Text formatting strategy + * @since 0.28.0 + */ + private IWordFormatText wordFormatText; + /** * 单词 map 信息 * @@ -379,6 +385,17 @@ public class SensitiveWordContext implements IWordContext { return this; } + @Override + public IWordContext wordFormatText(IWordFormatText wordFormatText) { + this.wordFormatText = wordFormatText; + return this; + } + + @Override + public IWordFormatText wordFormatText() { + return wordFormatText; + } + public IWordTag wordTag() { return wordTag; } diff --git
a/src/main/java/com/github/houbb/sensitive/word/core/SensitiveWord.java b/src/main/java/com/github/houbb/sensitive/word/core/SensitiveWord.java index 7af8705..944b45a 100644 --- a/src/main/java/com/github/houbb/sensitive/word/core/SensitiveWord.java +++ b/src/main/java/com/github/houbb/sensitive/word/core/SensitiveWord.java @@ -62,7 +62,7 @@ public class SensitiveWord extends AbstractSensitiveWord { //TODO: 这里拆分为2个部分,从而保障性能。但是要注意处理下标的问题。 //1. 原始的敏感词部分 //2. email/url/num 的单独一次遍历处理。 - final Map characterCharacterMap = InnerWordFormatUtils.formatCharsMapping(text, context); + final Map characterCharacterMap = context.wordFormatText().format(text, context); final InnerSensitiveWordContext checkContext = InnerSensitiveWordContext.newInstance() .originalText(text) .wordContext(context) diff --git a/src/main/java/com/github/houbb/sensitive/word/support/check/AbstractConditionWordCheck.java b/src/main/java/com/github/houbb/sensitive/word/support/check/AbstractConditionWordCheck.java index 090d03e..72b98d2 100644 --- a/src/main/java/com/github/houbb/sensitive/word/support/check/AbstractConditionWordCheck.java +++ b/src/main/java/com/github/houbb/sensitive/word/support/check/AbstractConditionWordCheck.java @@ -5,6 +5,7 @@ import com.github.houbb.sensitive.word.api.ISensitiveWordCharIgnore; import com.github.houbb.sensitive.word.api.IWordContext; import com.github.houbb.sensitive.word.api.context.InnerSensitiveWordContext; import com.github.houbb.sensitive.word.support.result.WordLengthResult; +import com.github.houbb.sensitive.word.utils.InnerWordFormatUtils; import java.util.Map; @@ -67,7 +68,7 @@ public abstract class AbstractConditionWordCheck extends AbstractWordCheck { char currentChar = txt.charAt(i); // 映射处理 - char mappingChar = formatCharMapping.get(currentChar); + char mappingChar = InnerWordFormatUtils.getMappingChar(formatCharMapping, currentChar); // 符合条件 boolean currentCondition = isCharCondition(mappingChar, i, checkContext); diff --git 
a/src/main/java/com/github/houbb/sensitive/word/support/check/WordCheckWord.java b/src/main/java/com/github/houbb/sensitive/word/support/check/WordCheckWord.java index a105fda..cbeeb29 100644 --- a/src/main/java/com/github/houbb/sensitive/word/support/check/WordCheckWord.java +++ b/src/main/java/com/github/houbb/sensitive/word/support/check/WordCheckWord.java @@ -9,6 +9,7 @@ import com.github.houbb.sensitive.word.api.context.InnerSensitiveWordContext; import com.github.houbb.sensitive.word.constant.enums.WordTypeEnum; import com.github.houbb.sensitive.word.constant.enums.WordContainsTypeEnum; import com.github.houbb.sensitive.word.support.result.WordLengthResult; +import com.github.houbb.sensitive.word.utils.InnerWordFormatUtils; import java.util.Map; @@ -57,7 +58,7 @@ public class WordCheckWord extends AbstractWordCheck { skipLen++; continue; } - char mappingChar = formatCharMapping.get(rawChars[i]); + char mappingChar = InnerWordFormatUtils.getMappingChar(formatCharMapping, rawChars[i]); stringBuilder.append(mappingChar); tempLen++; diff --git a/src/main/java/com/github/houbb/sensitive/word/support/format/mapping/AbstractWordFormatText.java b/src/main/java/com/github/houbb/sensitive/word/support/format/mapping/AbstractWordFormatText.java new file mode 100644 index 0000000..e3cfc57 --- /dev/null +++ b/src/main/java/com/github/houbb/sensitive/word/support/format/mapping/AbstractWordFormatText.java @@ -0,0 +1,29 @@ +package com.github.houbb.sensitive.word.support.format.mapping; + +import com.github.houbb.heaven.util.lang.StringUtil; +import com.github.houbb.sensitive.word.api.IWordContext; +import com.github.houbb.sensitive.word.api.IWordFormatText; + +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; + +/** + * Abstract base implementation: format() returns an empty map when StringUtil.isEmpty(text) holds, otherwise delegates to doFormat() + * @author binbin.hou + * @since 0.28.0 + */ +public abstract class AbstractWordFormatText implements IWordFormatText { + + protected abstract Map doFormat(String text, IWordContext context); + + @Override + public Map
format(String text, IWordContext context) { + if(StringUtil.isEmpty(text)) { + return Collections.emptyMap(); + } + + return doFormat(text, context); + } + +} diff --git a/src/main/java/com/github/houbb/sensitive/word/support/format/mapping/WordFormatTextDefault.java b/src/main/java/com/github/houbb/sensitive/word/support/format/mapping/WordFormatTextDefault.java new file mode 100644 index 0000000..b41b20d --- /dev/null +++ b/src/main/java/com/github/houbb/sensitive/word/support/format/mapping/WordFormatTextDefault.java @@ -0,0 +1,41 @@ +package com.github.houbb.sensitive.word.support.format.mapping; + +import com.github.houbb.sensitive.word.api.IWordContext; +import com.github.houbb.sensitive.word.api.IWordFormat; +import com.github.houbb.sensitive.word.support.check.WordCheckNone; +import com.github.houbb.sensitive.word.support.format.WordFormatNone; + +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; + +/** + * Default implementation: formats every char of the text with the context's IWordFormat and records only chars whose formatted value differs + * + * @author d + * @since 0.28.0 + */ +public class WordFormatTextDefault extends AbstractWordFormatText { + + @Override + protected Map doFormat(String text, IWordContext context) { + // char-level format strategy taken from the context + final IWordFormat wordFormat = context.wordFormat(); + // nothing to map when the strategy class is WordFormatNone (compared by class name) + if(wordFormat.getClass().getName().equals(WordFormatNone.class.getName())) { + return Collections.emptyMap(); + } + + Map map = new HashMap<>(); + for(int i = 0; i < text.length(); i++) { + char c = text.charAt(i); + char mc = wordFormat.format(c, context); + + if(c != mc) { + map.put(c, mc); + } + } + return map; + } + +} diff --git a/src/main/java/com/github/houbb/sensitive/word/support/format/mapping/WordFormatTexts.java b/src/main/java/com/github/houbb/sensitive/word/support/format/mapping/WordFormatTexts.java new file mode 100644 index 0000000..210ef57 --- /dev/null +++ b/src/main/java/com/github/houbb/sensitive/word/support/format/mapping/WordFormatTexts.java @@ -0,0 +1,18 @@ +package com.github.houbb.sensitive.word.support.format.mapping; +
+import com.github.houbb.sensitive.word.api.IWordFormatText; + +/** + * Factory for IWordFormatText strategies + * @author binbin.hou + * @since 0.28.0 + */ +public final class WordFormatTexts { + + private WordFormatTexts(){} + + public static IWordFormatText defaults() { + return new WordFormatTextDefault(); + } + +} diff --git a/src/main/java/com/github/houbb/sensitive/word/utils/InnerWordFormatUtils.java b/src/main/java/com/github/houbb/sensitive/word/utils/InnerWordFormatUtils.java index 44ce918..25cdb7c 100644 --- a/src/main/java/com/github/houbb/sensitive/word/utils/InnerWordFormatUtils.java +++ b/src/main/java/com/github/houbb/sensitive/word/utils/InnerWordFormatUtils.java @@ -46,29 +46,17 @@ public final class InnerWordFormatUtils { /** * 字符串统一的格式化处理 - * @param original 原始文本 - * @param context 上下文 + * @param map char mapping; a char with no entry maps to itself + * @param c original char * @return 结果 - * @since 0.6.0 + * @since 0.28.0 */ - public static Map formatCharsMapping(final String original, final IWordContext context) { - if(StringUtil.isEmpty(original)) { - return Collections.emptyMap(); + public static char getMappingChar(final Map map, char c) { + Character mc = map.get(c); + if(mc != null) { + return mc; } - - final int len = original.length(); - - char[] rawChars = original.toCharArray(); - Map map = new HashMap<>(rawChars.length); - - IWordFormat charFormat = context.wordFormat(); - for(int i = 0; i < len; i++) { - final char currentChar = rawChars[i]; - char formatChar = charFormat.format(currentChar, context); - map.put(currentChar, formatChar); - } - - return map; + return c; } /** diff --git a/src/test/java/com/github/houbb/sensitive/word/issues/Issue131.java b/src/test/java/com/github/houbb/sensitive/word/issues/Issue131.java new file mode 100644 index 0000000..1e4bdb1 --- /dev/null +++ b/src/test/java/com/github/houbb/sensitive/word/issues/Issue131.java @@ -0,0 +1,56 @@ +package com.github.houbb.sensitive.word.issues; + +import com.github.houbb.sensitive.word.api.IWordDeny; +import
com.github.houbb.sensitive.word.bs.SensitiveWordBs; +import com.github.houbb.sensitive.word.support.allow.WordAllows; +import com.github.houbb.sensitive.word.support.tag.WordTags; + +import java.io.IOException; +import java.util.Arrays; +import java.util.List; + +public class Issue131 { + + /** + * 慢在哪里? + * 和是否指定没关系,第一次就是慢,要 13ms,为什么? + * + * @param args + */ + public static void main(String[] args) throws IOException { + final List allWord = Arrays.asList("敏感","最强","定制", "81", "医疗器械"); + String demo1 = "产品尺寸参数§60mn§50mm§210枚/包§160枚/包§名称A4银色不干胶§规格60mm*40mm 送配套模板§规格70mm*50mm 送配套模板§数量每大张21枚一包10张总计210枚§数量每大张16枚一包10张总计160枚§适用激光打印机打印油性笔书写§95mm§100mn§55mm§100枚/包§80枚/包§名称 A4银色不干胶§规格95mm*55mm 送配套模板§规格100mm*70mm 送配套模板§数量每大张10枚一包10张总计100枚§数量 每大张8枚一包10张 总计80枚§100mm§120枚/包§140枚/包§规格80mm*50mm 送配套模板§规格100mm*40mm 送配套模板§数量每大张12枚一包10张总计120枚§数量§每大张14枚包10张总计140枚§适用 激光打印机打印油性笔书写§40mm§65mm§70mm§35mm§200枚/包§240枚/包§规格70mm*40mm送配套模板§规格§65mm*35mm 送配套模板§数量 每大张20枚一包10张总计200枚§每大张24枚包10张总计240枚§适 激光打印机打印油性笔书写§适用§激光打印机打印油性笔书写§40mn§280枚/包§360枚/包§规格50mm*40mm 送配套模板§规格40mm*30mm 送配套模板§数量每大张28枚一包10张总计280枚§数量每大张36枚一包10张总计360枚§45.7mm§38.1mm§400枚/包§650枚/包§45.7mm*25.4mm送配套模板§38.1mm*21.2mm 送配套模板§每大张40枚一包10张总计400枚§数量每大张65枚一包10张总计650枚§30mm§25mr§20mm§840枚/包§1260枚/包§规格 30mm*20mm 送配套模板§规格25mm*13mm 送配套模板§数量每张84枚包10张总计840枚§数量每大张126枚一包10张总计1260枚§46mm§意制§任§1000枚/包§定§名称定制A4内割银不胶§规格46mm*11.1mm送配套模板§任意规格定制§每大张100枚包10张总计1000枚§包10张满5包送专属模板§适激光打印机打印油性笔书写§产品实拍§8格打印实拍展示(100mm*70mm)§上海荠骞文化用品固定资产标识卡§资产编号:§规格型号:§资产名称:§使用状态:§资产类别:§资产原值§存放地点§生产厂家:§使用人§备§注:§*请爱护公司财产,不要随意撕毁此标签§16格全内容打印实拍展示§固定资产标识卡§资产名称§四层货架(平板)§资产编号§3F跑菜区§规格型号§1800×500×1500§使用部门§财务部§使用时间§2019-04-26§李强§21格手写款打印展示 (60mm*40mm)§固定资标识卡§36格打印实拍展示(40mm*30mm)§固定资产标签§名称:§编号:§部门:§40格打印实拍展示(45.7mm*25.4mm)§固定资§名称:电脑§编号:20210§部门:财务部§20210201§使用人:我最强§八:找最强§编号:20210201§65格打印实拍展示(38mm*21mm)§名称:§编号:§数量:§数量:§100格打印实拍展示(46mm*11.1mm)§客服电话:159 9569 3815§: 159 9569 3815§.§客服电话:159 9569§客服电话:1599§客服电话§服电话:159 9569 3815§话:159 9569 3815§客服电话:1599569 3815§电话:159 9569 3815§9569 3815§159 
9569 3815§客服电话:§低值易耗品标识牌(70mm*50mm)§购买日期§保管部门§责任人§生产厂家§不要随意撕毁此标牌*§*请爱护公司财产,不要随意撕导§品标识牌§低值易耗品标识牌§随意撕毁此标牌*§*请爱护公司财产,不要随意撕毁此标牌*§三人沙发§行政酒廊§2200*860*900§2018-07-23§应用范围§多用于产品信息固有资产登记航空仓库管理 医疗政府机构等§Mainly used for product information inherent assets registration, aviation warehouse management, medi§cal government institutions, etc§政府单位§企业办公§仓储行业§医疗器械§教育单位§耐用品§电子产品包装§商城卖场"; + + // 初始化敏感词库 + SensitiveWordBs sensitiveWordBs = SensitiveWordBs.newInstance() + .wordFailFast(true) + .wordAllow(WordAllows.empty()) + .wordDeny(new IWordDeny() { + @Override + public List deny() { + return allWord; + } + }) + .ignoreChineseStyle(false) + .ignoreCase(false) + .ignoreEnglishStyle(false) + .ignoreNumStyle(false) + .ignoreRepeat(false) + .ignoreWidth(false) + .wordTag(WordTags.none()) + .init(); + long time = System.currentTimeMillis(); + costTimeTest(sensitiveWordBs, demo1); + long cTime = System.currentTimeMillis() - time; + System.out.println("---DONE"+cTime); + } + + private static void costTimeTest(SensitiveWordBs sensitiveWordBs, String demo1) throws IOException { + int count = 10000; + + for (int i = 0; i < count; i++) { + List emitWord1 = sensitiveWordBs.findAll(demo1); + } + } + +}