黑白名单一次遍历

This commit is contained in:
yudasen
2025-01-29 23:41:36 +08:00
parent 9ddc383621
commit 76d0e1d203
10 changed files with 142 additions and 139 deletions

View File

@@ -4,10 +4,8 @@ import com.github.houbb.heaven.util.guava.Guavas;
import com.github.houbb.heaven.util.util.CollectionUtil;
import com.github.houbb.sensitive.word.api.*;
import com.github.houbb.sensitive.word.api.context.InnerSensitiveWordContext;
import com.github.houbb.sensitive.word.constant.enums.WordTypeEnum;
import com.github.houbb.sensitive.word.constant.enums.WordValidModeEnum;
import com.github.houbb.sensitive.word.support.check.WordCheckResult;
import com.github.houbb.sensitive.word.support.check.WordCheckWordAllow;
import com.github.houbb.sensitive.word.support.result.WordResult;
import com.github.houbb.sensitive.word.utils.InnerWordFormatUtils;
@@ -71,21 +69,18 @@ public class SensitiveWord extends AbstractSensitiveWord {
.formatCharMapping(characterCharacterMap);
final IWordResultCondition wordResultCondition = context.wordResultCondition();
final IWordCheck wordCheckAllow = new WordCheckWordAllow();
for (int i = 0; i < text.length(); i++) {
// v0.21.0 白名单跳过 TODO: 感觉这种实现性能一般,考虑后续优化。
WordCheckResult wordCheckAllowResult = wordCheckAllow.sensitiveCheck(i, checkContext);
int wordLengthAllow = wordCheckAllowResult.index();
// v0.21.0 白名单跳过
WordCheckResult checkResult = sensitiveCheck.sensitiveCheck(i, checkContext);
int wordLengthAllow = checkResult.wordLengthResult().wordAllowLen();
if(wordLengthAllow > 0) {
i += wordLengthAllow-1;
continue;
}
WordCheckResult checkResult = sensitiveCheck.sensitiveCheck(i, checkContext);
// 命中
int wordLength = checkResult.index();
int wordLength = checkResult.wordLengthResult().wordDenyLen();
if (wordLength > 0) {
// 保存敏感词
WordResult wordResult = WordResult.newInstance()

View File

@@ -3,6 +3,7 @@ package com.github.houbb.sensitive.word.support.check;
import com.github.houbb.heaven.annotation.ThreadSafe;
import com.github.houbb.sensitive.word.api.IWordContext;
import com.github.houbb.sensitive.word.api.context.InnerSensitiveWordContext;
import com.github.houbb.sensitive.word.support.result.WordLengthResult;
import java.util.Map;
@@ -37,7 +38,7 @@ public abstract class AbstractConditionWordCheck extends AbstractWordCheck {
final StringBuilder stringBuilder, InnerSensitiveWordContext checkContext);
@Override
protected int getActualLength(int beginIndex, InnerSensitiveWordContext checkContext) {
protected WordLengthResult getActualLength(int beginIndex, InnerSensitiveWordContext checkContext) {
final String txt = checkContext.originalText();
final IWordContext context = checkContext.wordContext();
final Map<Character, Character> formatCharMapping = checkContext.formatCharMapping();
@@ -69,7 +70,9 @@ public abstract class AbstractConditionWordCheck extends AbstractWordCheck {
actualLength = stringBuilder.length();
}
return actualLength;
return WordLengthResult.newInstance()
.wordDenyLen(actualLength)
.wordAllowLen(0);
}
}

View File

@@ -4,6 +4,7 @@ import com.github.houbb.heaven.annotation.ThreadSafe;
import com.github.houbb.heaven.util.lang.StringUtil;
import com.github.houbb.sensitive.word.api.IWordCheck;
import com.github.houbb.sensitive.word.api.context.InnerSensitiveWordContext;
import com.github.houbb.sensitive.word.support.result.WordLengthResult;
/**
* 抽象实现策略
@@ -28,7 +29,7 @@ public abstract class AbstractWordCheck implements IWordCheck {
* @return 长度
* @since 0.4.0
*/
protected abstract int getActualLength(int beginIndex, final InnerSensitiveWordContext checkContext);
protected abstract WordLengthResult getActualLength(int beginIndex, final InnerSensitiveWordContext checkContext);
/**
* 获取类别
@@ -42,17 +43,21 @@ public abstract class AbstractWordCheck implements IWordCheck {
final InnerSensitiveWordContext checkContext) {
Class<? extends IWordCheck> clazz = getSensitiveCheckClass();
final String txt = checkContext.originalText();
WordLengthResult wordLengthResult = WordLengthResult.newInstance()
.wordAllowLen(0)
.wordDenyLen(0);
if(StringUtil.isEmpty(txt)) {
return WordCheckResult.newInstance()
.index(0)
.wordLengthResult(wordLengthResult)
.type(getType())
.checkClass(clazz);
}
int actualLength = getActualLength(beginIndex, checkContext);
wordLengthResult = getActualLength(beginIndex, checkContext);
return WordCheckResult.newInstance()
.index(actualLength)
.wordLengthResult(wordLengthResult)
.type(getType())
.checkClass(clazz);
}

View File

@@ -4,6 +4,7 @@ import com.github.houbb.heaven.support.pipeline.Pipeline;
import com.github.houbb.heaven.support.pipeline.impl.DefaultPipeline;
import com.github.houbb.sensitive.word.api.IWordCheck;
import com.github.houbb.sensitive.word.api.context.InnerSensitiveWordContext;
import com.github.houbb.sensitive.word.support.result.WordLengthResult;
import java.util.List;
@@ -34,7 +35,8 @@ public abstract class WordCheckInit implements IWordCheck {
for(IWordCheck sensitiveCheck : sensitiveChecks) {
WordCheckResult result = sensitiveCheck.sensitiveCheck(beginIndex, checkContext);
if(result.index() > 0) {
WordLengthResult wordLengthResult = result.wordLengthResult();
if(wordLengthResult.wordAllowLen() > 0 || wordLengthResult.wordDenyLen()> 0) {
return result;
}
}

View File

@@ -4,6 +4,7 @@ import com.github.houbb.heaven.annotation.ThreadSafe;
import com.github.houbb.sensitive.word.api.IWordCheck;
import com.github.houbb.sensitive.word.api.context.InnerSensitiveWordContext;
import com.github.houbb.sensitive.word.constant.enums.WordTypeEnum;
import com.github.houbb.sensitive.word.support.result.WordLengthResult;
/**
* 未匹配
@@ -28,7 +29,7 @@ public class WordCheckNone implements IWordCheck {
*/
private static final WordCheckResult NONE_RESULT = WordCheckResult.newInstance()
.type(WordTypeEnum.DEFAULTS.getCode())
.index(0)
.wordLengthResult(WordLengthResult.newInstance())
.checkClass(WordCheckNone.class);
public static WordCheckResult getNoneResult() {

View File

@@ -1,6 +1,7 @@
package com.github.houbb.sensitive.word.support.check;
import com.github.houbb.sensitive.word.api.IWordCheck;
import com.github.houbb.sensitive.word.support.result.WordLengthResult;
/**
* 敏感信息监测接口结果
@@ -12,10 +13,9 @@ import com.github.houbb.sensitive.word.api.IWordCheck;
public class WordCheckResult {
/**
* 下标
* @since 0.0.12
* 命中的黑白名单的长度对象
*/
private int index;
private WordLengthResult wordLengthResult;
/**
* 检测类
@@ -35,12 +35,12 @@ public class WordCheckResult {
return new WordCheckResult();
}
public int index() {
return index;
public WordLengthResult wordLengthResult() {
return wordLengthResult;
}
public WordCheckResult index(int index) {
this.index = index;
public WordCheckResult wordLengthResult(WordLengthResult wordLengthResult) {
this.wordLengthResult = wordLengthResult;
return this;
}
@@ -65,7 +65,7 @@ public class WordCheckResult {
@Override
public String toString() {
return "WordCheckResult{" +
"index=" + index +
"wordLengthResult=" + wordLengthResult +
", checkClass=" + checkClass +
", type='" + type + '\'' +
'}';

View File

@@ -1,6 +1,7 @@
package com.github.houbb.sensitive.word.support.check;
import com.github.houbb.heaven.annotation.ThreadSafe;
import com.github.houbb.heaven.util.lang.StringUtil;
import com.github.houbb.sensitive.word.api.ISensitiveWordCharIgnore;
import com.github.houbb.sensitive.word.api.IWordCheck;
import com.github.houbb.sensitive.word.api.IWordContext;
@@ -9,6 +10,7 @@ import com.github.houbb.sensitive.word.api.context.InnerSensitiveWordContext;
import com.github.houbb.sensitive.word.constant.enums.WordTypeEnum;
import com.github.houbb.sensitive.word.constant.enums.WordValidModeEnum;
import com.github.houbb.sensitive.word.constant.enums.WordContainsTypeEnum;
import com.github.houbb.sensitive.word.support.result.WordLengthResult;
import java.util.Map;
@@ -35,54 +37,63 @@ public class WordCheckWord extends AbstractWordCheck {
}
@Override
protected int getActualLength(int beginIndex, InnerSensitiveWordContext innerContext) {
protected WordLengthResult getActualLength(int beginIndex, InnerSensitiveWordContext innerContext) {
final String txt = innerContext.originalText();
final Map<Character, Character> formatCharMapping = innerContext.formatCharMapping();
final WordValidModeEnum wordValidModeEnum = innerContext.modeEnum();
final IWordContext context = innerContext.wordContext();
// 采用 ThreadLocal 应该可以提升性能,减少对象的创建。
int actualLength = 0;
final IWordData wordData = context.wordData();
final IWordData wordDataAllow = context.wordDataAllow();
final ISensitiveWordCharIgnore wordCharIgnore = context.charIgnore();
// 前一个条件
StringBuilder stringBuilder = new StringBuilder();
char[] rawChars = txt.toCharArray();
final ISensitiveWordCharIgnore wordCharIgnore = context.charIgnore();
int tempLen = 0;
for(int i = beginIndex; i < rawChars.length; i++) {
// 判断是否跳过?
// 避免开始的时候命中 https://github.com/houbb/sensitive-word/issues/68
if(wordCharIgnore.ignore(i, rawChars, innerContext) && tempLen != 0) {
int maxWhite = 0;
int maxBlack = 0;
boolean firstCheck = true;
WordContainsTypeEnum wordContainsTypeEnumAllow = wordDataAllow.contains(stringBuilder, innerContext);
WordContainsTypeEnum wordContainsTypeEnumDeny = wordData.contains(stringBuilder, innerContext);
for (int i = beginIndex; i < rawChars.length; i++) {
if (wordCharIgnore.ignore(i, rawChars, innerContext) && tempLen != 0) {
tempLen++;
continue;
}
// 映射处理
final char currentChar = rawChars[i];
char mappingChar = formatCharMapping.get(currentChar);
char mappingChar = formatCharMapping.get(rawChars[i]);
stringBuilder.append(mappingChar);
tempLen++;
// 判断是否存在
WordContainsTypeEnum wordContainsTypeEnum = wordData.contains(stringBuilder, innerContext);
if(WordContainsTypeEnum.CONTAINS_END.equals(wordContainsTypeEnum)) {
actualLength = tempLen;
// 是否遍历全部匹配的模式
if(WordValidModeEnum.FAIL_FAST.equals(wordValidModeEnum)) {
break;
if (firstCheck || !WordContainsTypeEnum.NOT_FOUND.equals(wordContainsTypeEnumAllow)) {
wordContainsTypeEnumAllow = wordDataAllow.contains(stringBuilder, innerContext);
if (WordContainsTypeEnum.CONTAINS_END.equals(wordContainsTypeEnumAllow)) {
maxWhite += tempLen;
wordContainsTypeEnumAllow = WordContainsTypeEnum.NOT_FOUND;
}
}
// 如果不包含,则直接返回。后续遍历无意义
if(WordContainsTypeEnum.NOT_FOUND.equals(wordContainsTypeEnum)) {
if (firstCheck || !WordContainsTypeEnum.NOT_FOUND.equals(wordContainsTypeEnumDeny)) {
wordContainsTypeEnumDeny = wordData.contains(stringBuilder, innerContext);
if (WordContainsTypeEnum.CONTAINS_END.equals(wordContainsTypeEnumDeny)) {
maxBlack += tempLen;
wordContainsTypeEnumDeny = WordContainsTypeEnum.NOT_FOUND;
}
}
firstCheck = false;
if (WordContainsTypeEnum.NOT_FOUND.equals(wordContainsTypeEnumAllow) &&
WordContainsTypeEnum.NOT_FOUND.equals(wordContainsTypeEnumDeny)) {
break;
}
}
return actualLength;
return WordLengthResult.newInstance()
.wordAllowLen(maxWhite)
.wordDenyLen(maxBlack);
}
@Override

View File

@@ -1,90 +0,0 @@
package com.github.houbb.sensitive.word.support.check;
import com.github.houbb.heaven.annotation.ThreadSafe;
import com.github.houbb.sensitive.word.api.ISensitiveWordCharIgnore;
import com.github.houbb.sensitive.word.api.IWordCheck;
import com.github.houbb.sensitive.word.api.IWordContext;
import com.github.houbb.sensitive.word.api.IWordData;
import com.github.houbb.sensitive.word.api.context.InnerSensitiveWordContext;
import com.github.houbb.sensitive.word.constant.enums.WordContainsTypeEnum;
import com.github.houbb.sensitive.word.constant.enums.WordTypeEnum;
import com.github.houbb.sensitive.word.constant.enums.WordValidModeEnum;
import java.util.Map;
/**
* 敏感词监测实现(白名单)
* @author binbin.hou
* @since 0.21.0
*/
@ThreadSafe
public class WordCheckWordAllow extends AbstractWordCheck {

    /** Shared singleton — the class is stateless, so one instance suffices. */
    private static final IWordCheck INSTANCE = new WordCheckWordAllow();

    public static IWordCheck getInstance() {
        return INSTANCE;
    }

    @Override
    protected Class<? extends IWordCheck> getSensitiveCheckClass() {
        return WordCheckWordAllow.class;
    }

    /**
     * Scans forward from {@code beginIndex} and returns the length of the
     * longest allow-list (white list) word hit at that position, or 0 when
     * nothing matched.
     *
     * @param beginIndex   start offset in the original text
     * @param innerContext shared check context (text, char mapping, mode, word data)
     * @return matched allow-word length in original characters, 0 if none
     */
    @Override
    protected int getActualLength(int beginIndex, InnerSensitiveWordContext innerContext) {
        final String text = innerContext.originalText();
        final Map<Character, Character> charMapping = innerContext.formatCharMapping();
        final WordValidModeEnum mode = innerContext.modeEnum();
        final IWordContext wordContext = innerContext.wordContext();

        final IWordData allowData = wordContext.wordDataAllow();
        final ISensitiveWordCharIgnore charIgnore = wordContext.charIgnore();

        final char[] chars = text.toCharArray();
        final StringBuilder buffer = new StringBuilder();

        int matchedLen = 0;
        int scannedLen = 0;

        for (int index = beginIndex; index < chars.length; index++) {
            // Skip ignorable chars, but never at the very start of a candidate
            // (avoids a spurious hit on position 0 — see houbb/sensitive-word#68).
            boolean skippable = charIgnore.ignore(index, chars, innerContext) && scannedLen != 0;
            if (skippable) {
                scannedLen++;
                continue;
            }

            // Normalize the char via the format mapping before matching.
            char mapped = charMapping.get(chars[index]);
            buffer.append(mapped);
            scannedLen++;

            WordContainsTypeEnum containsType = allowData.contains(buffer, innerContext);
            if (WordContainsTypeEnum.CONTAINS_END.equals(containsType)) {
                // Record the hit; in FAIL_FAST mode the first hit ends the scan.
                matchedLen = scannedLen;
                if (WordValidModeEnum.FAIL_FAST.equals(mode)) {
                    break;
                }
            }
            if (WordContainsTypeEnum.NOT_FOUND.equals(containsType)) {
                // No allow word has this prefix — further scanning is pointless.
                break;
            }
        }

        return matchedLen;
    }

    @Override
    protected String getType() {
        return WordTypeEnum.WORD.getCode();
    }
}

View File

@@ -0,0 +1,32 @@
package com.github.houbb.sensitive.word.support.result;
/**
 * Holds the matched word lengths for both the allow (white) list and the
 * deny (black) list at a given scan position, so a single traversal of the
 * text can report both outcomes at once.
 *
 * <p>Instances are created via {@link #newInstance()} and configured with
 * the fluent setters; both lengths default to 0 (no match).</p>
 *
 * @since 0.21.0
 */
public class WordLengthResult {

    /** Length of the matched allow-list (white list) word; 0 when none matched. */
    private int wordAllowLen;

    /** Length of the matched deny-list (black list) word; 0 when none matched. */
    private int wordDenyLen;

    /** Use {@link #newInstance()} instead. */
    private WordLengthResult() {
    }

    /**
     * Creates a new result with both lengths set to 0.
     *
     * @return a fresh instance
     */
    public static WordLengthResult newInstance() {
        return new WordLengthResult();
    }

    /**
     * @return matched allow-list word length, 0 if none
     */
    public int wordAllowLen() {
        return this.wordAllowLen;
    }

    /**
     * Sets the matched allow-list word length.
     *
     * @param wordAllowLen allow-word length
     * @return this, for fluent chaining
     */
    public WordLengthResult wordAllowLen(int wordAllowLen) {
        this.wordAllowLen = wordAllowLen;
        return this;
    }

    /**
     * @return matched deny-list word length, 0 if none
     */
    public int wordDenyLen() {
        return this.wordDenyLen;
    }

    /**
     * Sets the matched deny-list word length.
     *
     * @param wordDenyLen deny-word length
     * @return this, for fluent chaining
     */
    public WordLengthResult wordDenyLen(int wordDenyLen) {
        this.wordDenyLen = wordDenyLen;
        return this;
    }

    /**
     * Readable form for logs; WordCheckResult#toString embeds this object,
     * so without an override it would print an identity hash.
     */
    @Override
    public String toString() {
        return "WordLengthResult{" +
                "wordAllowLen=" + wordAllowLen +
                ", wordDenyLen=" + wordDenyLen +
                '}';
    }
}

View File

@@ -1,11 +1,16 @@
package com.github.houbb.sensitive.word.benchmark;
import com.github.houbb.heaven.util.util.RandomUtil;
import com.github.houbb.sensitive.word.api.IWordAllow;
import com.github.houbb.sensitive.word.api.IWordDeny;
import com.github.houbb.sensitive.word.bs.SensitiveWordBs;
import com.github.houbb.sensitive.word.core.SensitiveWordHelper;
import org.junit.Ignore;
import org.junit.Test;
import java.util.Collections;
import java.util.List;
@Ignore
public class BenchmarkBasicTest {
@@ -66,6 +71,45 @@ public class BenchmarkBasicTest {
System.out.println("------------------ COST: " + (end-start));
}
/**
 * Benchmark: black and white lists resolved in a single traversal.
 * Builds a 100-segment text containing an allow-listed phrase that
 * overlaps a deny-listed word, then times 10,000 findAll() calls
 * (initialization of the word engine is included in the measured time,
 * matching the other benchmarks in this class).
 */
@Test
public void costTimeOneTraceTest() {
    // Assemble the input: the allow phrase repeated with a numeric suffix.
    StringBuilder textBuilder = new StringBuilder();
    int segment = 0;
    while (segment < 100) {
        textBuilder.append("地铁口交易").append(segment);
        segment++;
    }
    final String text = textBuilder.toString();

    // 10K iterations
    final long start = System.currentTimeMillis();

    IWordDeny deny = new IWordDeny() {
        @Override
        public List<String> deny() {
            return Collections.singletonList("口交");
        }
    };
    IWordAllow allow = new IWordAllow() {
        @Override
        public List<String> allow() {
            return Collections.singletonList("地铁口交易");
        }
    };

    SensitiveWordBs bs = SensitiveWordBs.newInstance()
            .wordDeny(deny)
            .wordAllow(allow)
            .enableWordCheck(true)
            .enableNumCheck(false)
            .enableUrlCheck(false)
            .enableEmailCheck(false)
            .init();

    for (int round = 0; round < 10000; round++) {
        bs.findAll(text);
    }

    final long end = System.currentTimeMillis();
    System.out.println("------------------ COST: " + (end-start));
}
/**
*
* COST: 1540-pc