Commit 51f558a1 authored by 刘基明

查重bug fix

parent 31064242
...@@ -18,6 +18,7 @@ import java.util.List; ...@@ -18,6 +18,7 @@ import java.util.List;
@Slf4j @Slf4j
public class ThemeTextCheckService { public class ThemeTextCheckService {
public static final int SENTENCE_MIN_LENGTH = 5;
@Resource @Resource
private ThemeCheckDuplicateMapper themeCheckDuplicateMapper; private ThemeCheckDuplicateMapper themeCheckDuplicateMapper;
...@@ -37,8 +38,9 @@ public class ThemeTextCheckService { ...@@ -37,8 +38,9 @@ public class ThemeTextCheckService {
String[] split = content.split("。"); String[] split = content.split("。");
for (int i = 0; i < split.length; i++) { for (int i = 0; i < split.length; i++) {
String trim = StringUtils.trim(split[i]); String trim = StringUtils.trim(split[i]);
insert(themeId, trim, getHash(trim), i + 1, split.length, userId, themeType); if (trim.length() > SENTENCE_MIN_LENGTH) {
insert(themeId, trim, getHash(trim), i + 1, split.length, userId, themeType);
}
} }
} catch (Exception e) { } catch (Exception e) {
log.error("文本查重insert失败,themeId:" + themeId); log.error("文本查重insert失败,themeId:" + themeId);
...@@ -75,8 +77,15 @@ public class ThemeTextCheckService { ...@@ -75,8 +77,15 @@ public class ThemeTextCheckService {
List<Integer> list = new ArrayList<>(); List<Integer> list = new ArrayList<>();
for (int i = 0; i < split.length; i++) { for (int i = 0; i < split.length; i++) {
String trim = StringUtils.trim(split[i]); String trim = StringUtils.trim(split[i]);
list.add(getHash(trim)); if (trim.length() > SENTENCE_MIN_LENGTH) {
list.add(getHash(trim));
}
} }
if (list.size() <= 2) {
return false;
}
// 一年以内 // 一年以内
LambdaQueryWrapper<ThemeCheckDuplicateEntity> w = new LambdaQueryWrapper<ThemeCheckDuplicateEntity>().in(ThemeCheckDuplicateEntity::getPartitionHash, list) LambdaQueryWrapper<ThemeCheckDuplicateEntity> w = new LambdaQueryWrapper<ThemeCheckDuplicateEntity>().in(ThemeCheckDuplicateEntity::getPartitionHash, list)
.eq(ThemeCheckDuplicateEntity::getDeleteTag, BizStatus.DeleteTag.tag_init) .eq(ThemeCheckDuplicateEntity::getDeleteTag, BizStatus.DeleteTag.tag_init)
...@@ -84,7 +93,7 @@ public class ThemeTextCheckService { ...@@ -84,7 +93,7 @@ public class ThemeTextCheckService {
.groupBy(ThemeCheckDuplicateEntity::getThemeId); .groupBy(ThemeCheckDuplicateEntity::getThemeId);
Integer check = themeCheckDuplicateMapper.check(w); Integer check = themeCheckDuplicateMapper.check(w);
// 重复率大于80% // 重复率大于80%
if (check != null && check * 10 >= split.length * 8) { if (check != null && check >= list.size() * 0.8) {
return true; return true;
} }
return false; return false;
...@@ -114,19 +123,21 @@ public class ThemeTextCheckService { ...@@ -114,19 +123,21 @@ public class ThemeTextCheckService {
String[] split = content.split("。"); String[] split = content.split("。");
for (int i = 0; i < split.length; i++) { for (int i = 0; i < split.length; i++) {
String trim = StringUtils.trim(split[i]); String trim = StringUtils.trim(split[i]);
if (trim.length() > SENTENCE_MIN_LENGTH) {
ThemeCheckDuplicateEntity build = ThemeCheckDuplicateEntity.builder()
.themeId(themeId)
.partitionText(trim)
.partitionHash(getHash(split[i]))
.partitionNum(i + 1)
.totalParts(split.length)
.userId(userId)
.themeType(themeType)
.createTime(createTime)
.updateTime(createTime)
.build();
themeCheckDuplicateMapper.insert(build);
}
ThemeCheckDuplicateEntity build = ThemeCheckDuplicateEntity.builder()
.themeId(themeId)
.partitionText(trim)
.partitionHash(getHash(split[i]))
.partitionNum(i + 1)
.totalParts(split.length)
.userId(userId)
.themeType(themeType)
.createTime(createTime)
.updateTime(createTime)
.build();
themeCheckDuplicateMapper.insert(build);
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment