计算文章的相似度
1. 基于余弦相似度的文本相似度计算
java
import java.util.*;
import java.util.regex.Pattern;

/**
 * Bag-of-words text similarity based on the cosine of the angle between
 * term-frequency (TF) vectors.
 */
public class TextSimilarity {

    /**
     * Token separator: one or more non-word characters.
     *
     * <p>{@link Pattern#UNICODE_CHARACTER_CLASS} makes {@code \W} follow the Unicode
     * definition of a word character, so accented letters and CJK characters are
     * kept inside tokens instead of being treated as separators. Compiled once
     * because it is reused on every call.
     */
    private static final Pattern NON_WORD =
            Pattern.compile("\\W+", Pattern.UNICODE_CHARACTER_CLASS);

    /**
     * Vectorizes a text as a term-frequency map.
     *
     * <p>The text is lower-cased with {@link Locale#ROOT} (so the result does not
     * depend on the JVM's default locale), split on runs of non-word characters,
     * and every token longer than one {@code char} is counted.
     *
     * @param text the text to tokenize; must not be {@code null}
     * @return a new mutable map from token to its number of occurrences
     * @throws NullPointerException if {@code text} is {@code null}
     */
    public static Map<String, Integer> getWordFrequency(String text) {
        Objects.requireNonNull(text, "text");
        Map<String, Integer> frequencyMap = new HashMap<>();
        for (String word : NON_WORD.split(text.toLowerCase(Locale.ROOT))) {
            // Skip single-character tokens; this also drops the empty token that
            // split() yields when the text starts with a separator.
            if (word.length() > 1) {
                frequencyMap.merge(word, 1, Integer::sum);
            }
        }
        return frequencyMap;
    }

    /**
     * Computes the cosine similarity of the term-frequency vectors of two texts.
     *
     * @param text1 the first text; must not be {@code null}
     * @param text2 the second text; must not be {@code null}
     * @return a value in {@code [0.0, 1.0]}; {@code 0.0} when either text contains
     *         no countable token
     * @throws NullPointerException if either argument is {@code null}
     */
    public static double cosineSimilarity(String text1, String text2) {
        Map<String, Integer> freq1 = getWordFrequency(text1);
        Map<String, Integer> freq2 = getWordFrequency(text2);

        double squaredNorm1 = sumOfSquares(freq1);
        double squaredNorm2 = sumOfSquares(freq2);
        if (squaredNorm1 == 0 || squaredNorm2 == 0) {
            return 0.0;
        }

        // Only words present in both texts contribute to the dot product, so
        // walk the smaller vocabulary and look each word up in the larger one.
        Map<String, Integer> smaller;
        Map<String, Integer> larger;
        if (freq1.size() <= freq2.size()) {
            smaller = freq1;
            larger = freq2;
        } else {
            smaller = freq2;
            larger = freq1;
        }

        double dotProduct = 0;
        for (Map.Entry<String, Integer> entry : smaller.entrySet()) {
            Integer otherCount = larger.get(entry.getKey());
            if (otherCount != null) {
                // Widen before multiplying so large counts cannot overflow int.
                dotProduct += (double) entry.getValue() * otherCount;
            }
        }

        // sqrt(a * b) rather than sqrt(a) * sqrt(b): a single rounding step, so
        // identical texts score exactly 1.0. The clamp guards against residual
        // floating-point error pushing the result above 1.0.
        return Math.min(1.0, dotProduct / Math.sqrt(squaredNorm1 * squaredNorm2));
    }

    /** Returns the squared Euclidean norm of a term-frequency vector. */
    private static double sumOfSquares(Map<String, Integer> frequencies) {
        double sum = 0;
        for (int count : frequencies.values()) {
            sum += (double) count * count;
        }
        return sum;
    }
}
2. 使用Jaccard相似度
java
public class JaccardSimilarity {/***