fix:优化AC自动机

This commit is contained in:
sunwenhaopro 2024-01-13 13:36:30 +08:00
parent 8e6841ba9f
commit 96a3c47a49
7 changed files with 34 additions and 47 deletions

View File

@ -1,7 +1,7 @@
package com.abin.mallchat.common.common.algorithm.sensitiveWord;
import com.abin.mallchat.common.common.algorithm.ac.ACTrie;
import com.abin.mallchat.common.common.algorithm.ac.MatchResult;
import com.abin.mallchat.common.common.algorithm.sensitiveWord.ac.ACTrie;
import com.abin.mallchat.common.common.algorithm.sensitiveWord.ac.MatchResult;
import org.HdrHistogram.ConcurrentHistogram;
import org.apache.commons.lang3.StringUtils;

View File

@ -1,7 +1,6 @@
package com.abin.mallchat.common.common.algorithm.sensitiveWord;
import com.abin.mallchat.common.common.algorithm.ac.ACTrie;
import com.abin.mallchat.common.common.algorithm.acpro.ACProTrie;
import com.abin.mallchat.common.common.algorithm.sensitiveWord.acpro.ACProTrie;
import io.micrometer.core.instrument.util.StringUtils;
import java.util.List;
@ -13,6 +12,7 @@ import java.util.Objects;
*@description: 基于ACFilter的优化增强版本
*/
public class ACProFilter implements SensitiveWordFilter{
private ACProTrie acProTrie;
@Override

View File

@ -1,4 +1,4 @@
package com.abin.mallchat.common.common.algorithm.ac;
package com.abin.mallchat.common.common.algorithm.sensitiveWord.ac;
import com.google.common.collect.Lists;

View File

@ -1,4 +1,4 @@
package com.abin.mallchat.common.common.algorithm.ac;
package com.abin.mallchat.common.common.algorithm.sensitiveWord.ac;
import com.google.common.collect.Maps;
import lombok.Getter;

View File

@ -1,4 +1,4 @@
package com.abin.mallchat.common.common.algorithm.ac;
package com.abin.mallchat.common.common.algorithm.sensitiveWord.ac;
import lombok.AllArgsConstructor;
import lombok.Getter;

View File

@ -1,4 +1,4 @@
package com.abin.mallchat.common.common.algorithm.acpro;
package com.abin.mallchat.common.common.algorithm.sensitiveWord.acpro;
import java.util.*;
@ -87,52 +87,42 @@ public class ACProTrie {
// 匹配
public String match(String matchWord)
{
Word walkNode=root;
char[] wordArray=matchWord.toCharArray();
for(int i=0;i<wordArray.length;i++)
{
// 失败回调状态
while(!walkNode.hasChild(wordArray[i]) && walkNode.failOver!=null)
{
walkNode=walkNode.failOver;
Word walkNode = root;
char[] wordArray = matchWord.toCharArray();
for (int i = 0; i < wordArray.length; i++) {
// 失败"回溯"
while (!walkNode.hasChild(wordArray[i]) && walkNode.failOver != null) {
walkNode = walkNode.failOver;
}
if(walkNode.hasChild(wordArray[i])) {
walkNode=walkNode.next.get(wordArray[i]);
if(walkNode.end){
if (walkNode.hasChild(wordArray[i])) {
walkNode = walkNode.next.get(wordArray[i]);
if (walkNode.end) {
// sentinelA和sentinelB作为哨兵节点去后面探测是否仍存在end
Word sentinelA = walkNode; // 记录当前节点
Word sentinelB = walkNode; //记录end节点
int k = i+1;
boolean flag=false;
int k = i + 1;
boolean flag = false;
//判断end是不是最终end即敏感词是否存在包含关系(abc,abcd)
while(k < wordArray.length && sentinelA.hasChild(wordArray[k])) {
while (k < wordArray.length && sentinelA.hasChild(wordArray[k])) {
sentinelA = sentinelA.next.get(wordArray[k]);
k++;
if(sentinelA.end)
{
sentinelB=sentinelA;
flag=true;
if (sentinelA.end) {
sentinelB = sentinelA;
flag = true;
}
}
// 根据结果去替换*
if(flag){
int length=sentinelB.depth;
while(length>0)
{
length--;
wordArray[i+length]=MASK;
}
// 直接跳到最后的end节点failOver
i=i+length;
walkNode = sentinelB.failOver;
}else{
int length=walkNode.depth;
while (length>0){
length--;
wordArray[i-length]=MASK;
}
walkNode = walkNode.failOver;
// 计算替换长度
int len = flag ? sentinelB.depth : walkNode.depth;
while (len > 0) {
len--;
int index = flag ? i - walkNode.depth + 1 + len : i - len;
wordArray[index] = MASK;
}
// 更新i
i += flag ? sentinelB.depth : 0;
// 更新node
walkNode = flag ? sentinelB.failOver : walkNode.failOver;
}
}
}

View File

@ -4,9 +4,6 @@ import com.abin.mallchat.common.common.algorithm.sensitiveWord.ACFilter;
import com.abin.mallchat.common.common.algorithm.sensitiveWord.ACProFilter;
import com.abin.mallchat.common.common.algorithm.sensitiveWord.DFAFilter;
import org.junit.Test;
import java.io.BufferedReader;
import java.io.FileReader;
import java.util.*;
/**
@ -50,7 +47,7 @@ public class SensitiveTest {
@Test
public void ACMulti() {
List<String> sensitiveList = Arrays.asList("白痴", "你是白痴", "白痴吗");
List<String> sensitiveList = Arrays.asList("你是白痴","你是");
ACFilter instance = new ACFilter();
instance.loadWord(sensitiveList);
System.out.println(instance.filter("你是白痴吗"));