package com.tanpu.community.service; import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper; import com.tanpu.common.constant.BizStatus; import com.tanpu.community.dao.entity.community.ThemeCheckDuplicateEntity; import com.tanpu.community.dao.mapper.community.ThemeCheckDuplicateMapper; import com.tanpu.community.util.TimeUtils; import lombok.extern.slf4j.Slf4j; import org.apache.commons.lang3.StringUtils; import org.springframework.stereotype.Service; import javax.annotation.Resource; import java.time.LocalDateTime; import java.util.ArrayList; import java.util.List; @Service @Slf4j public class ThemeTextCheckService { public static final int SENTENCE_MIN_LENGTH = 5; @Resource private ThemeCheckDuplicateMapper themeCheckDuplicateMapper; private final String regex = "[,。!?;;:, ]"; // 插入 public void insert(String content, String themeId, String userId, Integer themeType, String editThemeId) { if (StringUtils.isBlank(content) || content.length() < 50) { return; } try { if (StringUtils.isNotBlank(editThemeId)) { themeCheckDuplicateMapper.deleteByThemeId(editThemeId); } content = content.replaceAll(regex, "。"); String[] split = content.split("。"); for (int i = 0; i < split.length; i++) { String trim = StringUtils.trim(split[i]); if (trim.length() > SENTENCE_MIN_LENGTH) { insert(themeId, trim, getHash(trim), i + 1, split.length, userId, themeType); } } } catch (Exception e) { log.error("文本查重insert失败,themeId:" + themeId); } } // 删除 public void deleteByThemeId(String themeId) { themeCheckDuplicateMapper.deleteByThemeId(themeId); } private void insert(String themeId, String partition, Integer partitionHash, Integer num, Integer total, String userId, Integer themeType) { ThemeCheckDuplicateEntity build = ThemeCheckDuplicateEntity.builder() .themeId(themeId) .partitionText(partition) .partitionHash(partitionHash) .partitionNum(num) .totalParts(total) .userId(userId) .themeType(themeType) .build(); themeCheckDuplicateMapper.insert(build); } public boolean checkDuplicate(String content) { // 文字数小于50不查重 if (content.length() < 50) { return false; } content = content.replaceAll(regex, "。"); String[] split = content.split("。"); List<Integer> list = new ArrayList<>(); for (int i = 0; i < split.length; i++) { String trim = StringUtils.trim(split[i]); if (trim.length() > SENTENCE_MIN_LENGTH) { list.add(getHash(trim)); } } if (list.size() <= 2) { return false; } // 一年以内 LambdaQueryWrapper<ThemeCheckDuplicateEntity> w = new LambdaQueryWrapper<ThemeCheckDuplicateEntity>().in(ThemeCheckDuplicateEntity::getPartitionHash, list) .eq(ThemeCheckDuplicateEntity::getDeleteTag, BizStatus.DeleteTag.tag_init) .gt(ThemeCheckDuplicateEntity::getCreateTime, TimeUtils.getDaysBefore(360)) .groupBy(ThemeCheckDuplicateEntity::getThemeId); Integer check = themeCheckDuplicateMapper.check(w); // 重复率大于80% if (check != null && check >= list.size() * 0.8) { return true; } return false; } public static Integer getHash(String str) { return str.hashCode(); // try { // // 生成一个MD5加密计算摘要 // MessageDigest md = MessageDigest.getInstance("MD5"); // // 计算md5函数 // md.update(str.getBytes()); // // digest()最后确定返回md5 hash值,返回值为8为字符串。因为md5 hash值是16位的hex值,实际上就是8位的字符 // // BigInteger函数则将8位的字符串转换成16位hex值,用字符串来表示;得到字符串形式的hash值 // return new BigInteger(1, md.digest()).toString(16); // } catch (Exception e) { // throw new BizException("MD5加密出现错误"); // } } // 初始化 public void insertInit(String content, String themeId, String userId, LocalDateTime createTime, Integer themeType) { themeCheckDuplicateMapper.deleteByThemeId(themeId); content = content.replaceAll(regex, "。"); String[] split = content.split("。"); for (int i = 0; i < split.length; i++) { String trim = StringUtils.trim(split[i]); if (trim.length() > SENTENCE_MIN_LENGTH) { ThemeCheckDuplicateEntity build = ThemeCheckDuplicateEntity.builder() .themeId(themeId) .partitionText(trim) .partitionHash(getHash(split[i])) .partitionNum(i + 1) .totalParts(split.length) .userId(userId) .themeType(themeType) .createTime(createTime) .updateTime(createTime) .build(); themeCheckDuplicateMapper.insert(build); } } } }