当前位置：首页 > news >正文

java 文本内容相似度比对

news 2025/11/1 18:38:17

整体过程：

入参封装 -> 比对 -> 出参封装

比对过程简要描述：

1、数据清洗：解码 HTML 实体转义字符，去除 HTML 标签（富文本信息），并合并多余的空白字符；

2、分词规则：支持 HanLP、Jieba 两种适用于中文的分词器，分词后会过滤掉单字词；

3、相似度计算：分别计算余弦相似度和 Jaccard 相似度，再按权重（示例中为 0.7 和 0.3）加权求和，得到综合相似度；

工具方法如下：


import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.seg.common.Term;
import com.huaban.analysis.jieba.JiebaSegmenter;
import com.huaban.analysis.jieba.SegToken;
import com.jzai.weblab.pojo.gateway.CheckResult;
import com.jzai.weblab.pojo.gateway.Homework;import java.util.*;
import java.util.stream.Collectors;/*** 中文文本相似度计算工具类* 支持基于 HanLP 和 Jieba 的分词，以及多种相似度算法*/
public class ChineseTextSimilarity {/*** 使用 HanLP 分词*/public static List<String> hanLPSegment(String text) {if (text == null || text.trim().isEmpty()) {return new ArrayList<>();}List<Term> termList = HanLP.segment(text);return termList.stream().map(term -> term.word.trim()).filter(word -> !word.isEmpty() && word.length() > 1) // 过滤单字.collect(Collectors.toList());}/*** 使用 Jieba 分词*/public static List<String> jiebaSegment(String text) {if (text == null || text.trim().isEmpty()) {return new ArrayList<>();}JiebaSegmenter segmenter = new JiebaSegmenter();List<SegToken> tokens = segmenter.process(text, JiebaSegmenter.SegMode.SEARCH);return tokens.stream().map(token -> token.word.trim()).filter(word -> !word.isEmpty() && word.length() > 1) // 过滤单字.collect(Collectors.toList());}/*** 基于余弦相似度计算文本相似度*/public static double cosineSimilarity(List<String> words1, List<String> words2) {if (words1.isEmpty() && words2.isEmpty()) {return 1.0;}if (words1.isEmpty() || words2.isEmpty()) {return 0.0;}// 构建词汇表Set<String> vocabulary = new HashSet<>();vocabulary.addAll(words1);vocabulary.addAll(words2);// 构建词频向量Map<String, Integer> freq1 = getWordFrequency(words1);Map<String, Integer> freq2 = getWordFrequency(words2);// 计算点积double dotProduct = 0.0;for (String word : vocabulary) {int count1 = freq1.getOrDefault(word, 0);int count2 = freq2.getOrDefault(word, 0);dotProduct += count1 * count2;}// 计算模长double norm1 = calculateNorm(freq1);double norm2 = calculateNorm(freq2);if (norm1 == 0 || norm2 == 0) {return 0.0;}return dotProduct / (norm1 * norm2);}/*** 基于 Jaccard 相似度计算文本相似度*/public static double jaccardSimilarity(List<String> words1, List<String> words2) {if (words1.isEmpty() && words2.isEmpty()) {return 1.0;}if (words1.isEmpty() || words2.isEmpty()) {return 0.0;}Set<String> set1 = new HashSet<>(words1);Set<String> set2 = new HashSet<>(words2);Set<String> intersection = new HashSet<>(set1);intersection.retainAll(set2);Set<String> union = new HashSet<>(set1);union.addAll(set2);if 
(union.isEmpty()) {return 0.0;}return (double) intersection.size() / union.size();}/*** 综合相似度计算（结合多种算法）*/public static double comprehensiveSimilarity(String text1, String text2,boolean useHanLP,double cosineWeight,double jaccardWeight) {List<String> words1 = useHanLP ? hanLPSegment(text1) : jiebaSegment(text1);List<String> words2 = useHanLP ? hanLPSegment(text2) : jiebaSegment(text2);double cosineSim = cosineSimilarity(words1, words2);double jaccardSim = jaccardSimilarity(words1, words2);// 加权平均return cosineSim * cosineWeight + jaccardSim * jaccardWeight;}/*** 获取词频统计*/private static Map<String, Integer> getWordFrequency(List<String> words) {Map<String, Integer> frequency = new HashMap<>();for (String word : words) {frequency.put(word, frequency.getOrDefault(word, 0) + 1);}return frequency;}/*** 计算向量模长*/private static double calculateNorm(Map<String, Integer> frequency) {double sum = 0.0;for (int count : frequency.values()) {sum += count * count;}return Math.sqrt(sum);}/*** 格式化输出相似度结果*/public static void printSimilarityResult(String text1, String text2) {System.out.println("文本1: " + text1);System.out.println("文本2: " + text2);System.out.println();// HanLP 分词结果List<String> hanlpWords1 = hanLPSegment(text1);List<String> hanlpWords2 = hanLPSegment(text2);System.out.println("HanLP 分词结果:");System.out.println("  文本1: " + hanlpWords1);System.out.println("  文本2: " + hanlpWords2);// Jieba 分词结果List<String> jiebaWords1 = jiebaSegment(text1);List<String> jiebaWords2 = jiebaSegment(text2);System.out.println("Jieba 分词结果:");System.out.println("  文本1: " + jiebaWords1);System.out.println("  文本2: " + jiebaWords2);System.out.println();// 相似度计算结果double hanlpCosine = cosineSimilarity(hanlpWords1, hanlpWords2);double jiebaCosine = cosineSimilarity(jiebaWords1, jiebaWords2);double hanlpJaccard = jaccardSimilarity(hanlpWords1, hanlpWords2);double jiebaJaccard = jaccardSimilarity(jiebaWords1, jiebaWords2);double comprehensiveHanLP = comprehensiveSimilarity(text1, text2, true, 0.7, 
0.3);double comprehensiveJieba = comprehensiveSimilarity(text1, text2, false, 0.7, 0.3);System.out.println("相似度计算结果:");System.out.printf("  HanLP 余弦相似度: %.4f\n", hanlpCosine);System.out.printf("  Jieba 余弦相似度: %.4f\n", jiebaCosine);System.out.printf("  HanLP Jaccard相似度: %.4f\n", hanlpJaccard);System.out.printf("  Jieba Jaccard相似度: %.4f\n", jiebaJaccard);System.out.printf("  HanLP 综合相似度: %.4f\n", comprehensiveHanLP);System.out.printf("  Jieba 综合相似度: %.4f\n", comprehensiveJieba);System.out.println("=====================================");}public static double printSimilarityResultNew(String text1, String text2) {if (text1 == null || text2 == null) {return 0.0;}// 清洗文本String cleanedText1 = cleanText(text1);String cleanedText2 = cleanText(text2);// HanLP 分词结果List<String> hanlpWords1 = hanLPSegment(cleanedText1);List<String> hanlpWords2 = hanLPSegment(cleanedText2);System.out.println("HanLP 分词结果:");System.out.println("  文本1: " + hanlpWords1);System.out.println("  文本2: " + hanlpWords2);// 如果两段文本都为空，返回1.0if (hanlpWords1.isEmpty() && hanlpWords2.isEmpty()) {return 1.0;}// 如果其中一段为空，返回0.0if (hanlpWords1.isEmpty() || hanlpWords2.isEmpty()) {return 0.0;}// 相似度计算结果double comprehensiveHanLP = comprehensiveSimilarity(cleanedText1, cleanedText2, true, 0.7, 0.3);return comprehensiveHanLP;}/*** 清洗文本：去除HTML标签、转义字符等*/public static String cleanText(String text) {if (text == null || text.trim().isEmpty()) {return "";}String cleaned = text;// 1. 解码HTML实体转义字符cleaned = decodeHtmlEntities(cleaned);// 2. 去除HTML标签cleaned = removeHtmlTags(cleaned);// 3. 
去除多余的空白字符cleaned = cleanWhitespace(cleaned);return cleaned.trim();}/*** 解码HTML实体转义字符*/private static String decodeHtmlEntities(String text) {if (text == null) return "";return text.replace("&lt;", "<").replace("&gt;", ">").replace("&amp;", "&").replace("&quot;", "\"").replace("&apos;", "'").replace("&nbsp;", " ").replace("&#34;", "\"").replace("&#39;", "'").replace("&#160;", " ").replace("&ensp;", " ").replace("&emsp;", " ");}/*** 去除HTML标签*/private static String removeHtmlTags(String text) {if (text == null) return "";// 移除HTML标签 <...>String withoutTags = text.replaceAll("<[^>]+>", "");// 移除可能残留的标签属性等withoutTags = withoutTags.replaceAll("&[a-zA-Z0-9#]+;", " ");return withoutTags;}/*** 清理空白字符*/private static String cleanWhitespace(String text) {if (text == null) return "";// 将多个连续空白字符替换为单个空格return text.replaceAll("\\s+", " ");}// 测试示例public static void main(String[] args) {// 测试用例String text1 = "&lt;p&gt;成功源于天赋还是努力&lt;/p&gt;&lt;p&gt;&lt;br&gt;&lt;/p&gt;&lt;p&gt;成功，这一人类永恒的追求，究竟源于天赋还是努力？这是一个困扰无数人的问题。在我看来，天赋与努力如同鸟之双翼，缺一不可，共同推动着个人走向成功的彼岸。&lt;/p&gt;&lt;p&gt;&lt;br&gt;&lt;/p&gt;&lt;p&gt;天赋，是上天赐予的礼物，它如同种子，蕴含着生长的潜力。爱因斯坦的相对论灵感、莫扎特的音乐才华，这些天赋的闪光点为他们奠定了成功的基石。天赋让人在特定领域拥有敏锐的洞察力和快速的学习能力，省去了摸索的弯路。然而，天赋若缺乏后天的浇灌，终将枯萎。王安石笔下的方仲永，虽有神童之誉，却因后天不学，最终\"泯然众人矣\"，这警示我们天赋的局限性。&lt;/p&gt;&lt;p&gt;&lt;br&gt;&lt;/p&gt;&lt;p&gt;努力，是成功的阶梯，它如同工匠手中的刻刀，将粗糙的原石雕琢成璀璨的宝石。爱迪生发明电灯时，经历了上千次失败，却坚信\"天才是百分之一的灵感加上百分之九十九的汗水\"。马云创建阿里巴巴前，多次求职被拒，但他从未放弃，最终缔造了商业帝国。努力弥补了天赋的不足，让平凡之人也能书写非凡篇章。正如华罗庚所言：\"勤能补拙是良训，一分辛苦一分才。\"&lt;/p&gt;&lt;p&gt;&lt;br&gt;&lt;/p&gt;&lt;p&gt;然而，天赋与努力并非孤立存在，而是相互交融、彼此推动。天赋为努力指明方向，努力则让天赋绽放光彩。贝多芬在失聪后仍创作出《第九交响曲》，他的音乐天赋与不懈努力共同铸就了艺术巅峰。在当今社会，我们既不应迷信天赋而忽视努力，也不该否定天赋的价值而陷入盲目苦干。正确的态度是：发现并发挥自身天赋，同时以持之以恒的努力去打磨它。&lt;/p&gt;&lt;p&gt;&lt;br&gt;&lt;/p&gt;&lt;p&gt;成功之路，天赋为帆，努力为桨。让我们扬帆起航，在天赋的指引下，以努力为动力，驶向成功的彼岸。 &lt;/p&gt;";String text2 = "&lt;p&gt;&lt;br&gt;&lt;/p&gt;&lt;p&gt;风把柳絮吹成浅白的云\u200B&lt;/p&gt;&lt;p&gt;落在窗台，像未拆的信\u200B&lt;/p&gt;&lt;p&gt;—— 
那是春天最轻的脚印\u200B&lt;/p&gt;&lt;p&gt;妈妈的手掌抚过发顶\u200B&lt;/p&gt;&lt;p&gt;温度轻得像一片羽毛\u200B&lt;/p&gt;&lt;p&gt;却能接住我所有的委屈\u200B&lt;/p&gt;&lt;p&gt;旧书笺上的铅笔字迹\u200B&lt;/p&gt;&lt;p&gt;被时光磨得轻轻浅浅\u200B&lt;/p&gt;&lt;p&gt;却藏着童年最软的回忆\u200B&lt;/p&gt;&lt;p&gt;纸鸢在蓝天上飘呀飘\u200B&lt;/p&gt;&lt;p&gt;线轴转得轻轻\u200B&lt;/p&gt;&lt;p&gt;心却跟着它，飞向很远的风景\u200B&lt;/p&gt;&lt;p&gt;原来 “轻” 从不是单薄的意义\u200B&lt;/p&gt;&lt;p&gt;是春风拂过的温柔\u200B&lt;/p&gt;&lt;p&gt;是陪伴藏在细节里的暖意\u200B&lt;/p&gt;&lt;p&gt;是平凡日子里，悄悄发芽的欢喜\u200B&lt;br&gt;&lt;/p&gt;";printSimilarityResultNew(text1, text2);
/*        printSimilarityResult(text1, text3);printSimilarityResult(text1, text4);printSimilarityResult(text3, text4);*/}/*** 批量查重*/public static List<CheckResult> batchCheck(List<Homework> homeworks, Homework homework) {List<CheckResult> results = new ArrayList<>();for (int i = 0; i < homeworks.size(); i++) {Homework h1 = homeworks.get(i);double similarity = printSimilarityResultNew(homework.getContent(),h1.getContent());results.add(new CheckResult(homework,h1,  similarity));}// 按相似度降序排序results.sort((r1, r2) -> Double.compare(r2.getSimilarity(), r1.getSimilarity()));return results;}
}

入参格式：

import java.util.Date;

/**
 * Input record for the duplicate check: one submitted homework.
 */
public class Homework {

    private Integer userId;
    private String loginID;
    private String userName;
    private String content;
    private Date submitTime;

    public Homework() {
    }

    /** @return the submitting user's id, or null if not set */
    public Integer getUserId() {
        return userId;
    }

    /** @param userId the submitting user's id */
    public void setUserId(Integer userId) {
        this.userId = userId;
    }

    /** @return the submitting user's login id, or null if not set */
    public String getLoginID() {
        return loginID;
    }

    /** @param loginID the submitting user's login id */
    public void setLoginID(String loginID) {
        this.loginID = loginID;
    }

    /** @return the submitting user's display name, or null if not set */
    public String getUserName() {
        return userName;
    }

    /** @param userName the submitting user's display name */
    public void setUserName(String userName) {
        this.userName = userName;
    }

    /** @return the homework text, or null if not set */
    public String getContent() {
        return content;
    }

    /** @param content the homework text */
    public void setContent(String content) {
        this.content = content;
    }

    /** @return the submission time, or null if not set */
    public Date getSubmitTime() {
        return submitTime;
    }

    /** @param submitTime the submission time */
    public void setSubmitTime(Date submitTime) {
        this.submitTime = submitTime;
    }
}

出参格式：

/**
 * Output record of the duplicate check: a pair of homeworks, their similarity
 * score and the classification derived from that score.
 */
public class CheckResult {

    private Homework homework1;
    private Homework homework2;
    private double similarity;
    private boolean isSuspicious;
    private Integer resultType; // 0: normal, 1: suspicious, 2: highly suspicious

    /**
     * @param h1         first homework of the pair
     * @param h2         second homework of the pair
     * @param similarity similarity score of the pair
     */
    public CheckResult(Homework h1, Homework h2, double similarity) {
        this.homework1 = h1;
        this.homework2 = h2;
        this.similarity = similarity;
        this.isSuspicious = similarity > 0.7; // threshold can be tuned
        this.resultType = classify(similarity);
    }

    /** Maps a score to a result type: above 0.6 gives 2, above 0.3 gives 1, otherwise 0. */
    private static Integer classify(double score) {
        if (score > 0.6) {
            return 2;
        }
        if (score > 0.3) {
            return 1;
        }
        return 0;
    }

    public Homework getHomework1() {
        return homework1;
    }

    public Homework getHomework2() {
        return homework2;
    }

    public double getSimilarity() {
        return similarity;
    }

    public boolean isSuspicious() {
        return isSuspicious;
    }

    public void setResultType(Integer resultType) {
        this.resultType = resultType;
    }

    public Integer getResultType() {
        return resultType;
    }

    /** Renders the similarity as a percentage followed by both users' name and login id. */
    @Override
    public String toString() {
        double percent = similarity * 100;
        String firstName = homework1.getUserName();
        String firstLogin = homework1.getLoginID();
        String secondName = homework2.getUserName();
        String secondLogin = homework2.getLoginID();
        return String.format("相似度: %.2f%% - %s(%s) vs %s(%s)",
                percent, firstName, firstLogin, secondName, secondLogin);
    }
}

查看全文

http://www.dtcms.com/a/544962.html