当前位置: 首页 > news >正文

java 文本内容 相似度比对

整体过程:

入参封装 -> 相似度比对 -> 出参封装

比对过程简要描述:

    1、数据清洗:解码 HTML 实体转义字符,去除 HTML 富文本标签,并压缩多余的空白字符;

    2、分词规则:支持 HanLP、Jieba 两种适用于中文的分词器,分词后过滤掉单字;

    3、相似度计算:分别计算余弦相似度与 Jaccard 相似度,再按权重(示例中为 0.7 与 0.3)加权求和,得到综合相似度;

工具方法如下:


import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.seg.common.Term;
import com.huaban.analysis.jieba.JiebaSegmenter;
import com.huaban.analysis.jieba.SegToken;
import com.jzai.weblab.pojo.gateway.CheckResult;
import com.jzai.weblab.pojo.gateway.Homework;import java.util.*;
import java.util.stream.Collectors;/*** 中文文本相似度计算工具类* 支持基于 HanLP 和 Jieba 的分词,以及多种相似度算法*/
public class ChineseTextSimilarity {/*** 使用 HanLP 分词*/public static List<String> hanLPSegment(String text) {if (text == null || text.trim().isEmpty()) {return new ArrayList<>();}List<Term> termList = HanLP.segment(text);return termList.stream().map(term -> term.word.trim()).filter(word -> !word.isEmpty() && word.length() > 1) // 过滤单字.collect(Collectors.toList());}/*** 使用 Jieba 分词*/public static List<String> jiebaSegment(String text) {if (text == null || text.trim().isEmpty()) {return new ArrayList<>();}JiebaSegmenter segmenter = new JiebaSegmenter();List<SegToken> tokens = segmenter.process(text, JiebaSegmenter.SegMode.SEARCH);return tokens.stream().map(token -> token.word.trim()).filter(word -> !word.isEmpty() && word.length() > 1) // 过滤单字.collect(Collectors.toList());}/*** 基于余弦相似度计算文本相似度*/public static double cosineSimilarity(List<String> words1, List<String> words2) {if (words1.isEmpty() && words2.isEmpty()) {return 1.0;}if (words1.isEmpty() || words2.isEmpty()) {return 0.0;}// 构建词汇表Set<String> vocabulary = new HashSet<>();vocabulary.addAll(words1);vocabulary.addAll(words2);// 构建词频向量Map<String, Integer> freq1 = getWordFrequency(words1);Map<String, Integer> freq2 = getWordFrequency(words2);// 计算点积double dotProduct = 0.0;for (String word : vocabulary) {int count1 = freq1.getOrDefault(word, 0);int count2 = freq2.getOrDefault(word, 0);dotProduct += count1 * count2;}// 计算模长double norm1 = calculateNorm(freq1);double norm2 = calculateNorm(freq2);if (norm1 == 0 || norm2 == 0) {return 0.0;}return dotProduct / (norm1 * norm2);}/*** 基于 Jaccard 相似度计算文本相似度*/public static double jaccardSimilarity(List<String> words1, List<String> words2) {if (words1.isEmpty() && words2.isEmpty()) {return 1.0;}if (words1.isEmpty() || words2.isEmpty()) {return 0.0;}Set<String> set1 = new HashSet<>(words1);Set<String> set2 = new HashSet<>(words2);Set<String> intersection = new HashSet<>(set1);intersection.retainAll(set2);Set<String> union = new HashSet<>(set1);union.addAll(set2);if 
(union.isEmpty()) {return 0.0;}return (double) intersection.size() / union.size();}/*** 综合相似度计算(结合多种算法)*/public static double comprehensiveSimilarity(String text1, String text2,boolean useHanLP,double cosineWeight,double jaccardWeight) {List<String> words1 = useHanLP ? hanLPSegment(text1) : jiebaSegment(text1);List<String> words2 = useHanLP ? hanLPSegment(text2) : jiebaSegment(text2);double cosineSim = cosineSimilarity(words1, words2);double jaccardSim = jaccardSimilarity(words1, words2);// 加权平均return cosineSim * cosineWeight + jaccardSim * jaccardWeight;}/*** 获取词频统计*/private static Map<String, Integer> getWordFrequency(List<String> words) {Map<String, Integer> frequency = new HashMap<>();for (String word : words) {frequency.put(word, frequency.getOrDefault(word, 0) + 1);}return frequency;}/*** 计算向量模长*/private static double calculateNorm(Map<String, Integer> frequency) {double sum = 0.0;for (int count : frequency.values()) {sum += count * count;}return Math.sqrt(sum);}/*** 格式化输出相似度结果*/public static void printSimilarityResult(String text1, String text2) {System.out.println("文本1: " + text1);System.out.println("文本2: " + text2);System.out.println();// HanLP 分词结果List<String> hanlpWords1 = hanLPSegment(text1);List<String> hanlpWords2 = hanLPSegment(text2);System.out.println("HanLP 分词结果:");System.out.println("  文本1: " + hanlpWords1);System.out.println("  文本2: " + hanlpWords2);// Jieba 分词结果List<String> jiebaWords1 = jiebaSegment(text1);List<String> jiebaWords2 = jiebaSegment(text2);System.out.println("Jieba 分词结果:");System.out.println("  文本1: " + jiebaWords1);System.out.println("  文本2: " + jiebaWords2);System.out.println();// 相似度计算结果double hanlpCosine = cosineSimilarity(hanlpWords1, hanlpWords2);double jiebaCosine = cosineSimilarity(jiebaWords1, jiebaWords2);double hanlpJaccard = jaccardSimilarity(hanlpWords1, hanlpWords2);double jiebaJaccard = jaccardSimilarity(jiebaWords1, jiebaWords2);double comprehensiveHanLP = comprehensiveSimilarity(text1, text2, true, 0.7, 
0.3);double comprehensiveJieba = comprehensiveSimilarity(text1, text2, false, 0.7, 0.3);System.out.println("相似度计算结果:");System.out.printf("  HanLP 余弦相似度: %.4f\n", hanlpCosine);System.out.printf("  Jieba 余弦相似度: %.4f\n", jiebaCosine);System.out.printf("  HanLP Jaccard相似度: %.4f\n", hanlpJaccard);System.out.printf("  Jieba Jaccard相似度: %.4f\n", jiebaJaccard);System.out.printf("  HanLP 综合相似度: %.4f\n", comprehensiveHanLP);System.out.printf("  Jieba 综合相似度: %.4f\n", comprehensiveJieba);System.out.println("=====================================");}public static double printSimilarityResultNew(String text1, String text2) {if (text1 == null || text2 == null) {return 0.0;}// 清洗文本String cleanedText1 = cleanText(text1);String cleanedText2 = cleanText(text2);// HanLP 分词结果List<String> hanlpWords1 = hanLPSegment(cleanedText1);List<String> hanlpWords2 = hanLPSegment(cleanedText2);System.out.println("HanLP 分词结果:");System.out.println("  文本1: " + hanlpWords1);System.out.println("  文本2: " + hanlpWords2);// 如果两段文本都为空,返回1.0if (hanlpWords1.isEmpty() && hanlpWords2.isEmpty()) {return 1.0;}// 如果其中一段为空,返回0.0if (hanlpWords1.isEmpty() || hanlpWords2.isEmpty()) {return 0.0;}// 相似度计算结果double comprehensiveHanLP = comprehensiveSimilarity(cleanedText1, cleanedText2, true, 0.7, 0.3);return comprehensiveHanLP;}/*** 清洗文本:去除HTML标签、转义字符等*/public static String cleanText(String text) {if (text == null || text.trim().isEmpty()) {return "";}String cleaned = text;// 1. 解码HTML实体转义字符cleaned = decodeHtmlEntities(cleaned);// 2. 去除HTML标签cleaned = removeHtmlTags(cleaned);// 3. 
去除多余的空白字符cleaned = cleanWhitespace(cleaned);return cleaned.trim();}/*** 解码HTML实体转义字符*/private static String decodeHtmlEntities(String text) {if (text == null) return "";return text.replace("&lt;", "<").replace("&gt;", ">").replace("&amp;", "&").replace("&quot;", "\"").replace("&apos;", "'").replace("&nbsp;", " ").replace("&#34;", "\"").replace("&#39;", "'").replace("&#160;", " ").replace("&ensp;", " ").replace("&emsp;", " ");}/*** 去除HTML标签*/private static String removeHtmlTags(String text) {if (text == null) return "";// 移除HTML标签 <...>String withoutTags = text.replaceAll("<[^>]+>", "");// 移除可能残留的标签属性等withoutTags = withoutTags.replaceAll("&[a-zA-Z0-9#]+;", " ");return withoutTags;}/*** 清理空白字符*/private static String cleanWhitespace(String text) {if (text == null) return "";// 将多个连续空白字符替换为单个空格return text.replaceAll("\\s+", " ");}// 测试示例public static void main(String[] args) {// 测试用例String text1 = "&lt;p&gt;成功源于天赋还是努力&lt;/p&gt;&lt;p&gt;&lt;br&gt;&lt;/p&gt;&lt;p&gt;成功,这一人类永恒的追求,究竟源于天赋还是努力?这是一个困扰无数人的问题。在我看来,天赋与努力如同鸟之双翼,缺一不可,共同推动着个人走向成功的彼岸。&lt;/p&gt;&lt;p&gt;&lt;br&gt;&lt;/p&gt;&lt;p&gt;天赋,是上天赐予的礼物,它如同种子,蕴含着生长的潜力。爱因斯坦的相对论灵感、莫扎特的音乐才华,这些天赋的闪光点为他们奠定了成功的基石。天赋让人在特定领域拥有敏锐的洞察力和快速的学习能力,省去了摸索的弯路。然而,天赋若缺乏后天的浇灌,终将枯萎。王安石笔下的方仲永,虽有神童之誉,却因后天不学,最终\"泯然众人矣\",这警示我们天赋的局限性。&lt;/p&gt;&lt;p&gt;&lt;br&gt;&lt;/p&gt;&lt;p&gt;努力,是成功的阶梯,它如同工匠手中的刻刀,将粗糙的原石雕琢成璀璨的宝石。爱迪生发明电灯时,经历了上千次失败,却坚信\"天才是百分之一的灵感加上百分之九十九的汗水\"。马云创建阿里巴巴前,多次求职被拒,但他从未放弃,最终缔造了商业帝国。努力弥补了天赋的不足,让平凡之人也能书写非凡篇章。正如华罗庚所言:\"勤能补拙是良训,一分辛苦一分才。\"&lt;/p&gt;&lt;p&gt;&lt;br&gt;&lt;/p&gt;&lt;p&gt;然而,天赋与努力并非孤立存在,而是相互交融、彼此推动。天赋为努力指明方向,努力则让天赋绽放光彩。贝多芬在失聪后仍创作出《第九交响曲》,他的音乐天赋与不懈努力共同铸就了艺术巅峰。在当今社会,我们既不应迷信天赋而忽视努力,也不该否定天赋的价值而陷入盲目苦干。正确的态度是:发现并发挥自身天赋,同时以持之以恒的努力去打磨它。&lt;/p&gt;&lt;p&gt;&lt;br&gt;&lt;/p&gt;&lt;p&gt;成功之路,天赋为帆,努力为桨。让我们扬帆起航,在天赋的指引下,以努力为动力,驶向成功的彼岸。 &lt;/p&gt;";String text2 = "&lt;p&gt;&lt;br&gt;&lt;/p&gt;&lt;p&gt;风把柳絮吹成浅白的云\u200B&lt;/p&gt;&lt;p&gt;落在窗台,像未拆的信\u200B&lt;/p&gt;&lt;p&gt;—— 
那是春天最轻的脚印\u200B&lt;/p&gt;&lt;p&gt;妈妈的手掌抚过发顶\u200B&lt;/p&gt;&lt;p&gt;温度轻得像一片羽毛\u200B&lt;/p&gt;&lt;p&gt;却能接住我所有的委屈\u200B&lt;/p&gt;&lt;p&gt;旧书笺上的铅笔字迹\u200B&lt;/p&gt;&lt;p&gt;被时光磨得轻轻浅浅\u200B&lt;/p&gt;&lt;p&gt;却藏着童年最软的回忆\u200B&lt;/p&gt;&lt;p&gt;纸鸢在蓝天上飘呀飘\u200B&lt;/p&gt;&lt;p&gt;线轴转得轻轻\u200B&lt;/p&gt;&lt;p&gt;心却跟着它,飞向很远的风景\u200B&lt;/p&gt;&lt;p&gt;原来 “轻” 从不是单薄的意义\u200B&lt;/p&gt;&lt;p&gt;是春风拂过的温柔\u200B&lt;/p&gt;&lt;p&gt;是陪伴藏在细节里的暖意\u200B&lt;/p&gt;&lt;p&gt;是平凡日子里,悄悄发芽的欢喜\u200B&lt;br&gt;&lt;/p&gt;";printSimilarityResultNew(text1, text2);
/*        printSimilarityResult(text1, text3);printSimilarityResult(text1, text4);printSimilarityResult(text3, text4);*/}/*** 批量查重*/public static List<CheckResult> batchCheck(List<Homework> homeworks, Homework homework) {List<CheckResult> results = new ArrayList<>();for (int i = 0; i < homeworks.size(); i++) {Homework h1 = homeworks.get(i);double similarity = printSimilarityResultNew(homework.getContent(),h1.getContent());results.add(new CheckResult(homework,h1,  similarity));}// 按相似度降序排序results.sort((r1, r2) -> Double.compare(r2.getSimilarity(), r1.getSimilarity()));return results;}
}

入参格式:

import java.util.Date;

/**
 * One submitted homework: the submitting user plus the submitted content.
 *
 * <p>Plain mutable bean with a no-argument constructor; every field starts as
 * {@code null} until its setter is called.
 */
public class Homework {

    private Integer userId;
    private String loginID;
    private String userName;
    private String content;
    private Date submitTime;

    /** Creates an empty homework; populate it through the setters. */
    public Homework() {
    }

    /** @return the submitter's user id, or {@code null} if not set */
    public Integer getUserId() {
        return userId;
    }

    /** @param userId the submitter's user id */
    public void setUserId(Integer userId) {
        this.userId = userId;
    }

    /** @return the submitter's login id, or {@code null} if not set */
    public String getLoginID() {
        return loginID;
    }

    /** @param loginID the submitter's login id */
    public void setLoginID(String loginID) {
        this.loginID = loginID;
    }

    /** @return the submitter's display name, or {@code null} if not set */
    public String getUserName() {
        return userName;
    }

    /** @param userName the submitter's display name */
    public void setUserName(String userName) {
        this.userName = userName;
    }

    /** @return the homework text, or {@code null} if not set */
    public String getContent() {
        return content;
    }

    /** @param content the homework text */
    public void setContent(String content) {
        this.content = content;
    }

    /** @return the submission time, or {@code null} if not set */
    public Date getSubmitTime() {
        return submitTime;
    }

    /** @param submitTime the submission time */
    public void setSubmitTime(Date submitTime) {
        this.submitTime = submitTime;
    }
}

出参格式:

/**
 * Result of comparing two homeworks: the pair, their similarity score, and a
 * classification derived from that score.
 *
 * <p>Result types: 0 = normal, 1 = suspicious, 2 = highly suspicious.
 */
public class CheckResult {

    /** Result type for a pair that is not flagged. */
    private static final int TYPE_NORMAL = 0;

    /** Result type for a suspicious pair. */
    private static final int TYPE_SUSPICIOUS = 1;

    /** Result type for a highly suspicious pair. */
    private static final int TYPE_HIGHLY_SUSPICIOUS = 2;

    /** Scores strictly above this value are classified as suspicious. */
    private static final double SUSPICIOUS_THRESHOLD = 0.3;

    /** Scores strictly above this value are classified as highly suspicious. */
    private static final double HIGHLY_SUSPICIOUS_THRESHOLD = 0.6;

    private Homework homework1;
    private Homework homework2;
    private double similarity;
    private boolean isSuspicious;
    private Integer resultType; // 0: normal, 1: suspicious, 2: highly suspicious

    /**
     * Creates a result and classifies it from the similarity score.
     *
     * @param h1         the homework being checked
     * @param h2         the homework it was compared against
     * @param similarity similarity score, expected to lie in [0, 1]
     */
    public CheckResult(Homework h1, Homework h2, double similarity) {
        this.homework1 = h1;
        this.homework2 = h2;
        this.similarity = similarity;
        this.resultType = classify(similarity);
        // Derive the flag from the classification. It used to have its own 0.7 cut-off,
        // so a "highly suspicious" pair scoring 0.65 was reported as not suspicious.
        this.isSuspicious = this.resultType != TYPE_NORMAL;
    }

    /** Maps a similarity score to a result type; NaN is classified as normal. */
    private static int classify(double similarity) {
        if (similarity > HIGHLY_SUSPICIOUS_THRESHOLD) {
            return TYPE_HIGHLY_SUSPICIOUS;
        }
        if (similarity > SUSPICIOUS_THRESHOLD) {
            return TYPE_SUSPICIOUS;
        }
        return TYPE_NORMAL;
    }

    /** @return the homework being checked */
    public Homework getHomework1() {
        return homework1;
    }

    /** @return the homework it was compared against */
    public Homework getHomework2() {
        return homework2;
    }

    /** @return the similarity score passed to the constructor */
    public double getSimilarity() {
        return similarity;
    }

    /** @return {@code true} when the result type is suspicious or highly suspicious */
    public boolean isSuspicious() {
        return isSuspicious;
    }

    /**
     * Overrides the result type and keeps the suspicious flag consistent with it.
     *
     * @param resultType 0, 1 or 2; {@code null} clears the type and the flag
     */
    public void setResultType(Integer resultType) {
        this.resultType = resultType;
        this.isSuspicious = resultType != null && resultType != TYPE_NORMAL;
    }

    /** @return 0 (normal), 1 (suspicious) or 2 (highly suspicious); may be {@code null} after an override */
    public Integer getResultType() {
        return resultType;
    }

    /** Formats the result as a percentage plus both submitters; a missing homework prints as {@code null(null)}. */
    @Override
    public String toString() {
        return String.format("相似度: %.2f%% - %s(%s) vs %s(%s)",
                similarity * 100,
                homework1 == null ? null : homework1.getUserName(),
                homework1 == null ? null : homework1.getLoginID(),
                homework2 == null ? null : homework2.getUserName(),
                homework2 == null ? null : homework2.getLoginID());
    }
}

http://www.dtcms.com/a/544962.html

相关文章:

  • 切换jdk17
  • 定制型网站 成功案例网站建设费 税前扣除吗
  • 【SpringMVC】SpringMVC 请求与响应全解析:从 Cookie/Session 到状态码、Header 配置
  • 兰州网站建设ulezhi郑州网站建设培训短期班
  • 8.1.2 大数据方法论与实践指南-埋点实现方式分类
  • 7.1.5 大数据方法论与实践指南-日志系统+监控报警
  • Node.js Stream:深入理解与高效使用
  • 7.1.1 大数据方法论与实践指南-数仓元数据平台(数据地图)
  • 网站建设会计处理重庆网络公司产品设计
  • LeetCode 2001.可互换矩形的组数
  • 哈尔滨做网站哪家好电脑外设网站建设论文
  • 【Linux】数据链路层
  • 基于CentOS安装LNMP
  • Vue八股问题
  • 2025.10.21作业
  • SpringBoot面试题01-ApplicationContextInitializer
  • java之Future
  • Projection Error: Explanation and Causes 关于投影误差的解释与说明
  • php网站开发模板织梦移动端网站模板下载地址
  • 【JavaEE初阶】TCP的核心机制6——拥塞控制
  • LangChain4j学习一:聊天和语言模型
  • LeetCode hot100:056 合并区间:高效算法解析
  • uni-app开发入门手册
  • 做网站属于程序员吗网站搭建php源码
  • 什么是支架电容,它的原理是什么
  • 仓颉UI开发精髓:构建高复用、可组合的自定义组件
  • 校园文化宣传主题网站的建设做门户网站多少钱
  • 深入理解 Rust 的 Iterator Trait:惰性与抽象的力量
  • vs做网站怎么加文件夹商丘销售网站制作
  • 自定义ViewGroup实现要点