mirror of
https://github.com/houbb/sensitive-word.git
synced 2026-03-22 00:17:35 +08:00
release branch 0.27.1
This commit is contained in:
2
.gitignore
vendored
2
.gitignore
vendored
@@ -31,3 +31,5 @@ target/
|
||||
.DS_Store
|
||||
Thumbs.db
|
||||
|
||||
|
||||
*.jfr
|
||||
@@ -450,3 +450,9 @@
|
||||
| 序号 | 变更类型 | 说明 | 时间 | 备注 |
|
||||
|:---|:-----|-----------|:-------------------|:---------------------------------------------------|
|
||||
| 1 | F | 修正词库缺失的问题 | 2025-7-24 23:09:10 | https://github.com/houbb/sensitive-word/issues/125 |
|
||||
|
||||
# release_0.28.0
|
||||
|
||||
| 序号 | 变更类型 | 说明 | 时间 | 备注 |
|
||||
|:---|:-----|------------|:------------------|:---------------------------------------------------|
|
||||
| 1 | O | 优化 char 映射 | 2025-9-4 16:22:24 | https://github.com/houbb/sensitive-word/issues/131 |
|
||||
|
||||
@@ -110,7 +110,7 @@ v0.24.0 开始内置支持对敏感词的分类细化,不过工作量比较大
|
||||
<dependency>
|
||||
<groupId>com.github.houbb</groupId>
|
||||
<artifactId>sensitive-word</artifactId>
|
||||
<version>0.27.1</version>
|
||||
<version>0.28.0</version>
|
||||
</dependency>
|
||||
```
|
||||
|
||||
|
||||
4
pom.xml
4
pom.xml
@@ -6,7 +6,7 @@
|
||||
|
||||
<groupId>com.github.houbb</groupId>
|
||||
<artifactId>sensitive-word</artifactId>
|
||||
<version>0.27.1</version>
|
||||
<version>0.28.0-SNAPSHOT</version>
|
||||
|
||||
<properties>
|
||||
<!--============================== All Plugins START ==============================-->
|
||||
@@ -22,7 +22,7 @@
|
||||
|
||||
<!--============================== MAIN ==============================-->
|
||||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||
<project.compiler.level>1.7</project.compiler.level>
|
||||
<project.compiler.level>1.8</project.compiler.level>
|
||||
|
||||
<!--============================== INTER ==============================-->
|
||||
<heaven.version>0.13.0</heaven.version>
|
||||
|
||||
@@ -251,6 +251,23 @@ public interface IWordContext {
|
||||
*/
|
||||
IWordFormat wordFormat();
|
||||
|
||||
/**
|
||||
* 设置IWordFormatText
|
||||
*
|
||||
* @param wordFormatText 字符处理
|
||||
* @return 结果
|
||||
* @since 0.28.0
|
||||
*/
|
||||
IWordContext wordFormatText(final IWordFormatText wordFormatText);
|
||||
|
||||
/**
|
||||
* 文本格式化策略
|
||||
*
|
||||
* @return 策略
|
||||
* @since 0.28.0
|
||||
*/
|
||||
IWordFormatText wordFormatText();
|
||||
|
||||
/**
|
||||
* 获取 wordMap 策略
|
||||
* @return 策略
|
||||
|
||||
@@ -0,0 +1,23 @@
|
||||
package com.github.houbb.sensitive.word.api;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* 单词整体格式化
|
||||
*
|
||||
* @author binbin.hou
|
||||
* @since 0.28.0
|
||||
*/
|
||||
public interface IWordFormatText {
|
||||
|
||||
/**
|
||||
* 针对 text 格式化映射,提升对整体的控制力
|
||||
*
|
||||
* @param text 原始 文本
|
||||
* @param context 上下文
|
||||
* @return 格式化后的 char
|
||||
* @since 0.28.0
|
||||
*/
|
||||
Map<Character, Character> format(final String text, final IWordContext context);
|
||||
|
||||
}
|
||||
@@ -15,6 +15,7 @@ import com.github.houbb.sensitive.word.support.combine.check.WordCheckCombines;
|
||||
import com.github.houbb.sensitive.word.support.combine.format.WordFormatCombines;
|
||||
import com.github.houbb.sensitive.word.support.data.WordDatas;
|
||||
import com.github.houbb.sensitive.word.support.deny.WordDenys;
|
||||
import com.github.houbb.sensitive.word.support.format.mapping.WordFormatTexts;
|
||||
import com.github.houbb.sensitive.word.support.ignore.SensitiveWordCharIgnores;
|
||||
import com.github.houbb.sensitive.word.support.replace.WordReplaces;
|
||||
import com.github.houbb.sensitive.word.support.result.WordResultHandlers;
|
||||
@@ -219,6 +220,12 @@ public class SensitiveWordBs implements ISensitiveWordDestroy {
|
||||
*/
|
||||
private IWordCheck wordCheckIpv4 = WordChecks.ipv4();
|
||||
|
||||
/**
|
||||
* 文本处理类
|
||||
* @since 0.28.0
|
||||
*/
|
||||
private IWordFormatText wordFormatText = WordFormatTexts.defaults();
|
||||
|
||||
/**
|
||||
* 新建验证实例
|
||||
* <p>
|
||||
@@ -246,6 +253,7 @@ public class SensitiveWordBs implements ISensitiveWordDestroy {
|
||||
final IWordFormat charFormat = wordFormatCombine.initWordFormat(context);
|
||||
context.wordFormat(charFormat);
|
||||
|
||||
|
||||
// 3. 初始化对应的 Check 策略
|
||||
final IWordCheck sensitiveCheck = wordCheckCombine.initWordCheck(context);
|
||||
context.sensitiveCheck(sensitiveCheck);
|
||||
@@ -285,6 +293,7 @@ public class SensitiveWordBs implements ISensitiveWordDestroy {
|
||||
context.ignoreEnglishStyle(ignoreEnglishStyle);
|
||||
context.ignoreRepeat(ignoreRepeat);
|
||||
context.wordFailFast(wordFailFast);
|
||||
context.wordFormatText(this.wordFormatText);
|
||||
|
||||
// 开启校验
|
||||
context.enableNumCheck(enableNumCheck);
|
||||
@@ -450,6 +459,13 @@ public class SensitiveWordBs implements ISensitiveWordDestroy {
|
||||
return this;
|
||||
}
|
||||
|
||||
public SensitiveWordBs wordFormatText(IWordFormatText wordFormatText) {
|
||||
ArgUtil.notNull(wordFormatText, "wordFormatText");
|
||||
|
||||
this.wordFormatText = wordFormatText;
|
||||
return this;
|
||||
}
|
||||
|
||||
//-------------------------------------------------------- 基础属性设置
|
||||
/**
|
||||
* 是否启用 ipv4 校验
|
||||
|
||||
@@ -106,6 +106,12 @@ public class SensitiveWordContext implements IWordContext {
|
||||
*/
|
||||
private IWordFormat wordFormat;
|
||||
|
||||
/**
|
||||
* 文本格式化策略
|
||||
* @since 0.28.0
|
||||
*/
|
||||
private IWordFormatText wordFormatText;
|
||||
|
||||
/**
|
||||
* 单词 map 信息
|
||||
*
|
||||
@@ -379,6 +385,17 @@ public class SensitiveWordContext implements IWordContext {
|
||||
return this;
|
||||
}
|
||||
|
||||
@Override
|
||||
public IWordContext wordFormatText(IWordFormatText wordFormatText) {
|
||||
this.wordFormatText = wordFormatText;
|
||||
return this;
|
||||
}
|
||||
|
||||
@Override
|
||||
public IWordFormatText wordFormatText() {
|
||||
return wordFormatText;
|
||||
}
|
||||
|
||||
public IWordTag wordTag() {
|
||||
return wordTag;
|
||||
}
|
||||
|
||||
@@ -62,7 +62,7 @@ public class SensitiveWord extends AbstractSensitiveWord {
|
||||
//TODO: 这里拆分为2个部分,从而保障性能。但是要注意处理下标的问题。
|
||||
//1. 原始的敏感词部分
|
||||
//2. email/url/num 的单独一次遍历处理。
|
||||
final Map<Character, Character> characterCharacterMap = InnerWordFormatUtils.formatCharsMapping(text, context);
|
||||
final Map<Character, Character> characterCharacterMap = context.wordFormatText().format(text, context);
|
||||
final InnerSensitiveWordContext checkContext = InnerSensitiveWordContext.newInstance()
|
||||
.originalText(text)
|
||||
.wordContext(context)
|
||||
|
||||
@@ -5,6 +5,7 @@ import com.github.houbb.sensitive.word.api.ISensitiveWordCharIgnore;
|
||||
import com.github.houbb.sensitive.word.api.IWordContext;
|
||||
import com.github.houbb.sensitive.word.api.context.InnerSensitiveWordContext;
|
||||
import com.github.houbb.sensitive.word.support.result.WordLengthResult;
|
||||
import com.github.houbb.sensitive.word.utils.InnerWordFormatUtils;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
@@ -67,7 +68,7 @@ public abstract class AbstractConditionWordCheck extends AbstractWordCheck {
|
||||
|
||||
char currentChar = txt.charAt(i);
|
||||
// 映射处理
|
||||
char mappingChar = formatCharMapping.get(currentChar);
|
||||
char mappingChar = InnerWordFormatUtils.getMappingChar(formatCharMapping, currentChar);
|
||||
|
||||
// 符合条件
|
||||
boolean currentCondition = isCharCondition(mappingChar, i, checkContext);
|
||||
|
||||
@@ -9,6 +9,7 @@ import com.github.houbb.sensitive.word.api.context.InnerSensitiveWordContext;
|
||||
import com.github.houbb.sensitive.word.constant.enums.WordTypeEnum;
|
||||
import com.github.houbb.sensitive.word.constant.enums.WordContainsTypeEnum;
|
||||
import com.github.houbb.sensitive.word.support.result.WordLengthResult;
|
||||
import com.github.houbb.sensitive.word.utils.InnerWordFormatUtils;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
@@ -57,7 +58,7 @@ public class WordCheckWord extends AbstractWordCheck {
|
||||
skipLen++;
|
||||
continue;
|
||||
}
|
||||
char mappingChar = formatCharMapping.get(rawChars[i]);
|
||||
char mappingChar = InnerWordFormatUtils.getMappingChar(formatCharMapping, rawChars[i]);
|
||||
stringBuilder.append(mappingChar);
|
||||
tempLen++;
|
||||
|
||||
|
||||
@@ -0,0 +1,29 @@
|
||||
package com.github.houbb.sensitive.word.support.format.mapping;
|
||||
|
||||
import com.github.houbb.heaven.util.lang.StringUtil;
|
||||
import com.github.houbb.sensitive.word.api.IWordContext;
|
||||
import com.github.houbb.sensitive.word.api.IWordFormatText;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* 抽象实现
|
||||
* @author binbin.hou
|
||||
* @since 0.28.0
|
||||
*/
|
||||
public abstract class AbstractWordFormatText implements IWordFormatText {
|
||||
|
||||
protected abstract Map<Character, Character> doFormat(String text, IWordContext context);
|
||||
|
||||
@Override
|
||||
public Map<Character, Character> format(String text, IWordContext context) {
|
||||
if(StringUtil.isEmpty(text)) {
|
||||
return Collections.emptyMap();
|
||||
}
|
||||
|
||||
return doFormat(text, context);
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,41 @@
|
||||
package com.github.houbb.sensitive.word.support.format.mapping;
|
||||
|
||||
import com.github.houbb.sensitive.word.api.IWordContext;
|
||||
import com.github.houbb.sensitive.word.api.IWordFormat;
|
||||
import com.github.houbb.sensitive.word.support.check.WordCheckNone;
|
||||
import com.github.houbb.sensitive.word.support.format.WordFormatNone;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* 默认实现
|
||||
*
|
||||
* @author d
|
||||
* @since 0.28.0
|
||||
*/
|
||||
public class WordFormatTextDefault extends AbstractWordFormatText {
|
||||
|
||||
@Override
|
||||
protected Map<Character, Character> doFormat(String text, IWordContext context) {
|
||||
// 单个字符串里信息
|
||||
final IWordFormat wordFormat = context.wordFormat();
|
||||
// 不需要处理的场景
|
||||
if(wordFormat.getClass().getName().equals(WordFormatNone.class.getName())) {
|
||||
return Collections.emptyMap();
|
||||
}
|
||||
|
||||
Map<Character, Character> map = new HashMap<>();
|
||||
for(int i = 0; i < text.length(); i++) {
|
||||
char c = text.charAt(i);
|
||||
char mc = wordFormat.format(c, context);
|
||||
|
||||
if(c != mc) {
|
||||
map.put(c, mc);
|
||||
}
|
||||
}
|
||||
return map;
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,18 @@
|
||||
package com.github.houbb.sensitive.word.support.format.mapping;
|
||||
|
||||
import com.github.houbb.sensitive.word.api.IWordFormatText;
|
||||
|
||||
/**
|
||||
* 格式化工具类
|
||||
* @author binbin.hou
|
||||
* @since 0.28.0
|
||||
*/
|
||||
public final class WordFormatTexts {
|
||||
|
||||
private WordFormatTexts(){}
|
||||
|
||||
public static IWordFormatText defaults() {
|
||||
return new WordFormatTextDefault();
|
||||
}
|
||||
|
||||
}
|
||||
@@ -46,29 +46,17 @@ public final class InnerWordFormatUtils {
|
||||
|
||||
/**
|
||||
* 字符串统一的格式化处理
|
||||
* @param original 原始文本
|
||||
* @param context 上下文
|
||||
* @param map 映射集合
|
||||
* @param c 原始
|
||||
* @return 结果
|
||||
* @since 0.6.0
|
||||
* @since 0.28.0
|
||||
*/
|
||||
public static Map<Character, Character> formatCharsMapping(final String original, final IWordContext context) {
|
||||
if(StringUtil.isEmpty(original)) {
|
||||
return Collections.emptyMap();
|
||||
public static char getMappingChar(final Map<Character, Character> map, char c) {
|
||||
Character mc = map.get(c);
|
||||
if(mc != null) {
|
||||
return mc;
|
||||
}
|
||||
|
||||
final int len = original.length();
|
||||
|
||||
char[] rawChars = original.toCharArray();
|
||||
Map<Character, Character> map = new HashMap<>(rawChars.length);
|
||||
|
||||
IWordFormat charFormat = context.wordFormat();
|
||||
for(int i = 0; i < len; i++) {
|
||||
final char currentChar = rawChars[i];
|
||||
char formatChar = charFormat.format(currentChar, context);
|
||||
map.put(currentChar, formatChar);
|
||||
}
|
||||
|
||||
return map;
|
||||
return c;
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@@ -0,0 +1,56 @@
|
||||
package com.github.houbb.sensitive.word.issues;
|
||||
|
||||
import com.github.houbb.sensitive.word.api.IWordDeny;
|
||||
import com.github.houbb.sensitive.word.bs.SensitiveWordBs;
|
||||
import com.github.houbb.sensitive.word.support.allow.WordAllows;
|
||||
import com.github.houbb.sensitive.word.support.tag.WordTags;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
public class Issue131 {
|
||||
|
||||
/**
|
||||
* 慢在哪里?
|
||||
* 和是否指定没关系,第一次就是慢,要 13ms,为什么?
|
||||
*
|
||||
* @param args
|
||||
*/
|
||||
public static void main(String[] args) throws IOException {
|
||||
final List<String> allWord = Arrays.asList("敏感","最强","定制", "81", "医疗器械");
|
||||
String demo1 = "产品尺寸参数§60mn§50mm§210枚/包§160枚/包§名称A4银色不干胶§规格60mm*40mm 送配套模板§规格70mm*50mm 送配套模板§数量每大张21枚一包10张总计210枚§数量每大张16枚一包10张总计160枚§适用激光打印机打印油性笔书写§95mm§100mn§55mm§100枚/包§80枚/包§名称 A4银色不干胶§规格95mm*55mm 送配套模板§规格100mm*70mm 送配套模板§数量每大张10枚一包10张总计100枚§数量 每大张8枚一包10张 总计80枚§100mm§120枚/包§140枚/包§规格80mm*50mm 送配套模板§规格100mm*40mm 送配套模板§数量每大张12枚一包10张总计120枚§数量§每大张14枚包10张总计140枚§适用 激光打印机打印油性笔书写§40mm§65mm§70mm§35mm§200枚/包§240枚/包§规格70mm*40mm送配套模板§规格§65mm*35mm 送配套模板§数量 每大张20枚一包10张总计200枚§每大张24枚包10张总计240枚§适 激光打印机打印油性笔书写§适用§激光打印机打印油性笔书写§40mn§280枚/包§360枚/包§规格50mm*40mm 送配套模板§规格40mm*30mm 送配套模板§数量每大张28枚一包10张总计280枚§数量每大张36枚一包10张总计360枚§45.7mm§38.1mm§400枚/包§650枚/包§45.7mm*25.4mm送配套模板§38.1mm*21.2mm 送配套模板§每大张40枚一包10张总计400枚§数量每大张65枚一包10张总计650枚§30mm§25mr§20mm§840枚/包§1260枚/包§规格 30mm*20mm 送配套模板§规格25mm*13mm 送配套模板§数量每张84枚包10张总计840枚§数量每大张126枚一包10张总计1260枚§46mm§意制§任§1000枚/包§定§名称定制A4内割银不胶§规格46mm*11.1mm送配套模板§任意规格定制§每大张100枚包10张总计1000枚§包10张满5包送专属模板§适激光打印机打印油性笔书写§产品实拍§8格打印实拍展示(100mm*70mm)§上海荠骞文化用品固定资产标识卡§资产编号:§规格型号:§资产名称:§使用状态:§资产类别:§资产原值§存放地点§生产厂家:§使用人§备§注:§*请爱护公司财产,不要随意撕毁此标签§16格全内容打印实拍展示§固定资产标识卡§资产名称§四层货架(平板)§资产编号§3F跑菜区§规格型号§1800×500×1500§使用部门§财务部§使用时间§2019-04-26§李强§21格手写款打印展示 (60mm*40mm)§固定资标识卡§36格打印实拍展示(40mm*30mm)§固定资产标签§名称:§编号:§部门:§40格打印实拍展示(45.7mm*25.4mm)§固定资§名称:电脑§编号:20210§部门:财务部§20210201§使用人:我最强§八:找最强§编号:20210201§65格打印实拍展示(38mm*21mm)§名称:§编号:§数量:§数量:§100格打印实拍展示(46mm*11.1mm)§客服电话:159 9569 3815§: 159 9569 3815§.§客服电话:159 9569§客服电话:1599§客服电话§服电话:159 9569 3815§话:159 9569 3815§客服电话:1599569 3815§电话:159 9569 3815§9569 3815§159 9569 3815§客服电话:§低值易耗品标识牌(70mm*50mm)§购买日期§保管部门§责任人§生产厂家§不要随意撕毁此标牌*§*请爱护公司财产,不要随意撕导§品标识牌§低值易耗品标识牌§随意撕毁此标牌*§*请爱护公司财产,不要随意撕毁此标牌*§三人沙发§行政酒廊§2200*860*900§2018-07-23§应用范围§多用于产品信息固有资产登记航空仓库管理 医疗政府机构等§Mainly used for product information inherent assets registration, aviation warehouse management, medi§cal government institutions, etc§政府单位§企业办公§仓储行业§医疗器械§教育单位§耐用品§电子产品包装§商城卖场";
|
||||
|
||||
// 初始化敏感词库
|
||||
SensitiveWordBs sensitiveWordBs = SensitiveWordBs.newInstance()
|
||||
.wordFailFast(true)
|
||||
.wordAllow(WordAllows.empty())
|
||||
.wordDeny(new IWordDeny() {
|
||||
@Override
|
||||
public List<String> deny() {
|
||||
return allWord;
|
||||
}
|
||||
})
|
||||
.ignoreChineseStyle(false)
|
||||
.ignoreCase(false)
|
||||
.ignoreEnglishStyle(false)
|
||||
.ignoreNumStyle(false)
|
||||
.ignoreRepeat(false)
|
||||
.ignoreWidth(false)
|
||||
.wordTag(WordTags.none())
|
||||
.init();
|
||||
long time = System.currentTimeMillis();
|
||||
costTimeTest(sensitiveWordBs, demo1);
|
||||
long cTime = System.currentTimeMillis() - time;
|
||||
System.out.println("---DONE"+cTime);
|
||||
}
|
||||
|
||||
private static void costTimeTest(SensitiveWordBs sensitiveWordBs, String demo1) throws IOException {
|
||||
int count = 10000;
|
||||
|
||||
for (int i = 0; i < count; i++) {
|
||||
List<String> emitWord1 = sensitiveWordBs.findAll(demo1);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
Reference in New Issue
Block a user