<?xml version="1.0" encoding="UTF-8"?>
<!--
  Build descriptor for the langchain4j RAG demo.

  This POM does not inherit from spring-boot-starter-parent; Spring Boot's managed dependency
  versions are pulled in through the spring-boot-dependencies BOM import below. Everything the
  parent would normally configure (compiler level, source encoding, plugin versions) therefore
  has to be declared here explicitly.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.sws</groupId>
    <artifactId>langchain4j-rag-demo</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <name>langchain4j-rag-demo</name>
    <description>langchain4j-rag-demo</description>

    <properties>
        <java.version>17</java.version>
        <!-- java.version is only a convention of spring-boot-starter-parent; without that parent
             the compiler plugin must be told the language level directly. -->
        <maven.compiler.source>${java.version}</maven.compiler.source>
        <maven.compiler.target>${java.version}</maven.compiler.target>
        <!-- The sources contain non-ASCII string literals; pin the encoding so the build does not
             depend on the platform default charset. -->
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <langchain4j.version>1.0.0-beta1</langchain4j.version>
        <spring-boot.version>3.4.3</spring-boot.version>
    </properties>

    <dependencies>
        <!-- Version managed by the spring-boot-dependencies BOM. -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>
        <!-- langchain4j core (AiServices, document loaders/splitters, RAG retrievers). -->
        <dependency>
            <groupId>dev.langchain4j</groupId>
            <artifactId>langchain4j</artifactId>
            <version>${langchain4j.version}</version>
        </dependency>
        <!-- Chat model client for OpenAI-compatible endpoints. -->
        <dependency>
            <groupId>dev.langchain4j</groupId>
            <artifactId>langchain4j-open-ai</artifactId>
            <version>${langchain4j.version}</version>
        </dependency>
        <!-- Embedding model served by Ollama. -->
        <dependency>
            <groupId>dev.langchain4j</groupId>
            <artifactId>langchain4j-ollama</artifactId>
            <version>${langchain4j.version}</version>
        </dependency>
        <!-- Redis-backed embedding store (community module). -->
        <dependency>
            <groupId>dev.langchain4j</groupId>
            <artifactId>langchain4j-community-redis</artifactId>
            <version>${langchain4j.version}</version>
        </dependency>
        <!-- Milvus-backed embedding store. -->
        <dependency>
            <groupId>dev.langchain4j</groupId>
            <artifactId>langchain4j-milvus</artifactId>
            <version>${langchain4j.version}</version>
        </dependency>
    </dependencies>

    <dependencyManagement>
        <dependencies>
            <dependency>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-dependencies</artifactId>
                <version>${spring-boot.version}</version>
                <type>pom</type>
                <scope>import</scope>
            </dependency>
            <dependency>
                <groupId>dev.langchain4j</groupId>
                <artifactId>langchain4j-community-bom</artifactId>
                <version>${langchain4j.version}</version>
                <type>pom</type>
                <scope>import</scope>
            </dependency>
        </dependencies>
    </dependencyManagement>

    <build>
        <plugins>
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
                <!-- An imported BOM manages dependency versions only, not plugin versions, so the
                     plugin version has to be stated here. -->
                <version>${spring-boot.version}</version>
            </plugin>
        </plugins>
    </build>
</project>
package com.sws.langchain4jragdemo.controller;
import dev.langchain4j.community.store.embedding.redis.RedisEmbeddingStore;
import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.loader.FileSystemDocumentLoader;
import dev.langchain4j.data.document.splitter.DocumentByLineSplitter;
import dev.langchain4j.data.embedding.Embedding;
import dev.langchain4j.data.segment.TextSegment;
import dev.langchain4j.model.chat.ChatLanguageModel;
import dev.langchain4j.model.ollama.OllamaEmbeddingModel;
import dev.langchain4j.model.openai.OpenAiChatModel;
import dev.langchain4j.model.output.Response;
import dev.langchain4j.rag.content.Content;
import dev.langchain4j.rag.content.retriever.ContentRetriever;
import dev.langchain4j.rag.content.retriever.EmbeddingStoreContentRetriever;
import dev.langchain4j.rag.query.Query;
import dev.langchain4j.service.AiServices;
import dev.langchain4j.store.embedding.EmbeddingSearchRequest;
import dev.langchain4j.store.embedding.EmbeddingSearchResult;
import dev.langchain4j.store.embedding.EmbeddingStore;
import dev.langchain4j.store.embedding.milvus.MilvusEmbeddingStore;
import io.milvus.client.MilvusServiceClient;
import io.milvus.param.ConnectParam;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;
import java.util.List;
/**
 * Collection of stand-alone langchain4j experiments: a chat call, text embedding, embedding
 * storage/search in Redis and Milvus, document ingestion, and a retrieval-augmented chat.
 *
 * <p>Every experiment is a {@code static} method meant to be started from {@link #main(String[])}.
 * The class carries Spring MVC annotations but declares no request-handler methods.
 *
 * <p>Created 2025/4/9 19:40.
 *
 * @author qushen
 * @version 1.0
 */
@RestController
@RequestMapping("/api/ai")
public class AiController {

    // Placeholder settings for the OpenAI-compatible chat endpoint; fill in before running.
    static final String aiUrl = "xxxxxxx";
    static final String aiModel = "xxxxxxx";
    static final String apiKey = "xxxxxx";

    /**
     * Chat smoke test: sends one fixed question to the chat model and prints the reply.
     */
    public static void chatTest() {
        ChatLanguageModel chatLanguageModel = newChatModel();
        String reply = chatLanguageModel.chat("你今年多大了");
        System.out.println("Response: " + reply);
    }

    /**
     * Embedding smoke test: embeds a fixed string and prints the embedding and its vector length.
     */
    public static void EmbeddingTest() {
        Response<Embedding> response = newEmbeddingModel().embed("hello world");
        Embedding embedding = response.content();
        System.out.println("向量:" + embedding);
        System.out.println("维度:" + embedding.vector().length);
    }

    /**
     * Embedding-store round trip against Redis: embeds and stores two sample sentences, then
     * embeds a question and prints the single best match (score and text).
     */
    public static void Embeddingtext() {
        OllamaEmbeddingModel embeddingModel = newEmbeddingModel();
        RedisEmbeddingStore embeddingStore = newRedisStore();

        // Simulated inserts: each sentence is embedded and stored together with its text.
        for (String sentence : List.of("北京猿人来自北京", "蓝田猿人来自蓝田")) {
            TextSegment segment = TextSegment.from(sentence);
            embeddingStore.add(embeddingModel.embed(segment).content(), segment);
        }

        printBestMatch(embeddingStore, embeddingModel, "北京猿人来自哪里");
    }

    /**
     * Document ingestion: loads a text file from a fixed local path, splits it with
     * {@link DocumentByLineSplitter}, embeds every segment and stores the result in Milvus.
     *
     * <p>The Milvus client is closed when the method finishes, whether or not ingestion succeeded.
     */
    public static void EmbeddingDoc() {
        OllamaEmbeddingModel embeddingModel = newEmbeddingModel();
        // NOTE(review): this host differs from the placeholder host used by rag(); confirm that
        // both are meant to address the same Milvus instance.
        MilvusServiceClient milvusClient = newMilvusClient("114.215.148.75");
        try {
            EmbeddingStore<TextSegment> embeddingStore = newMilvusStore(milvusClient);
            Document document = FileSystemDocumentLoader.loadDocument("C:\\Users\\ASUS\\Desktop\\test.txt");

            // Line-based splitter. A sentence-based alternative would be
            // new DocumentBySentenceSplitter(100, 20).
            DocumentByLineSplitter splitter = new DocumentByLineSplitter(
                    20, // maximum characters per segment
                    10  // maximum overlap in characters
            );
            List<TextSegment> segments = splitter.split(document);

            // Embed all segments, then store embeddings and segments together.
            List<Embedding> embeddings = embeddingModel.embedAll(segments).content();
            embeddingStore.addAll(embeddings, segments);
            System.out.println("向量化完成");
        } finally {
            milvusClient.close();
        }
    }

    /**
     * Vector search against Redis: embeds a fixed question and prints the single best match
     * (score and text).
     */
    public static void search() {
        OllamaEmbeddingModel embeddingModel = newEmbeddingModel();
        RedisEmbeddingStore embeddingStore = newRedisStore();
        printBestMatch(embeddingStore, embeddingModel, "大数据平台包含哪些");
    }

    /**
     * Retrieval-augmented generation: wires the chat model and a Milvus-backed content retriever
     * into an {@code Assistant} AI service (declared elsewhere in the project), asks one fixed
     * question and prints the answer.
     *
     * <p>Any exception is reported on {@code System.err} with its stack trace instead of being
     * propagated. The Milvus client, if it was created, is closed before returning.
     */
    public static void rag() {
        MilvusServiceClient milvusClient = null;
        try {
            // 1. Chat model.
            ChatLanguageModel chatLanguageModel = newChatModel();

            // 2. Embedding model.
            OllamaEmbeddingModel embeddingModel = newEmbeddingModel();

            // 3. Vector store (Milvus); the dimension must match the embedding model.
            //    A Redis store (see newRedisStore()) could be substituted here.
            milvusClient = newMilvusClient("xxxxxxxx");
            EmbeddingStore<TextSegment> embeddingStore = newMilvusStore(milvusClient);

            // 4. Retriever limited to 5 results with a minimum similarity score of 0.6.
            ContentRetriever contentRetriever = EmbeddingStoreContentRetriever.builder()
                    .embeddingStore(embeddingStore)
                    .embeddingModel(embeddingModel)
                    .maxResults(5)
                    .minScore(0.6)
                    .build();

            // 5. AI service combining the chat model and the retriever.
            Assistant assistant = AiServices.builder(Assistant.class)
                    .chatLanguageModel(chatLanguageModel)
                    .contentRetriever(contentRetriever)
                    .build();

            // 6. Ask the question through the assistant and print its answer.
            System.out.println(assistant.chat("大数据平台有哪些?"));
        } catch (Exception e) {
            // Print the failure to aid debugging of this demo.
            System.err.println("执行过程中发生错误: " + e.getMessage());
            e.printStackTrace();
        } finally {
            if (milvusClient != null) {
                milvusClient.close();
            }
        }
    }

    /**
     * Entry point for running one experiment by hand. Currently runs {@link #rag()};
     * {@link #EmbeddingDoc()} is the ingestion step that can be run instead.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        rag();
    }

    /** Builds the OpenAI-compatible chat model from the class-level settings, temperature 0. */
    private static ChatLanguageModel newChatModel() {
        return OpenAiChatModel.builder()
                .baseUrl(aiUrl)
                .apiKey(apiKey)
                .modelName(aiModel)
                .temperature(0.0)
                .build();
    }

    /** Builds the Ollama embedding model (nomic-embed-text) served on localhost:11434. */
    private static OllamaEmbeddingModel newEmbeddingModel() {
        return OllamaEmbeddingModel.builder()
                .baseUrl("http://localhost:11434")
                .modelName("nomic-embed-text:latest")
                .build();
    }

    /** Builds the Redis embedding store on 127.0.0.1:6379 with 768-dimensional vectors. */
    private static RedisEmbeddingStore newRedisStore() {
        return RedisEmbeddingStore.builder()
                .host("127.0.0.1")
                .port(6379)
                .dimension(768)
                .build();
    }

    /** Opens a Milvus client to the given host on port 19530; the caller must close it. */
    private static MilvusServiceClient newMilvusClient(String host) {
        return new MilvusServiceClient(
                ConnectParam.newBuilder()
                        .withHost(host)
                        .withPort(19530)
                        .build()
        );
    }

    /** Builds the Milvus embedding store over the given client (768-dimensional vectors). */
    private static EmbeddingStore<TextSegment> newMilvusStore(MilvusServiceClient milvusClient) {
        // NOTE(review): "dafault" looks like a misspelling of "default"; it is kept as-is because
        // renaming would point at a different collection than the one already populated.
        return MilvusEmbeddingStore.builder()
                .milvusClient(milvusClient)
                .collectionName("dafault")
                .dimension(768)
                .build();
    }

    /** Embeds the question, searches the store for one result and prints its score and text. */
    private static void printBestMatch(EmbeddingStore<TextSegment> embeddingStore,
                                       OllamaEmbeddingModel embeddingModel,
                                       String question) {
        Embedding queryEmbedding = embeddingModel.embed(question).content();
        EmbeddingSearchRequest request = EmbeddingSearchRequest.builder()
                .queryEmbedding(queryEmbedding)
                .maxResults(1)
                .build();
        EmbeddingSearchResult<TextSegment> result = embeddingStore.search(request);
        result.matches().forEach(match -> {
            System.out.println(match.score());
            System.out.println(match.embedded().text());
        });
    }
}
两个技术点
文档解析器 (Document Parser)
名称 | 功能描述 | 依赖 | 适用文件类型 |
---|---|---|---|
TextDocumentParser | 解析纯文本、HTML、Markdown 等简单格式文本 | 无需额外依赖 | .txt、.html、.md |
ApachePdfBoxDocumentParser | 基于 PDFBox 解析 PDF 文档,支持提取文本、字体、图像等 | Apache PDFBox | .pdf |
ApachePoiDocumentParser | 利用 Apache POI 解析 Office 文档,提取正文、表格、元数据等 | Apache POI | .doc、.docx、.xls、.xlsx、.ppt、.pptx |
ApacheTikaDocumentParser | 基于 Apache Tika 解析多种文件格式,自动识别 MIME 类型 | Apache Tika | 多种格式(PDF、Office、HTML、XML 等) |
文档分词器 (Document Splitter)
名称 | 分词方式 | 特点 | 适用场景 |
---|---|---|---|
DocumentByCharacterSplitter | 按字符分割 | 细粒度拆分,但容易丢失词语整体语义 | 特殊符号处理、字符级特征提取 |
DocumentByRegexSplitter | 利用正则表达式分割 | 灵活自定义,适用于按自定义规则提取特定标记 | 复杂格式文本、定制化标记拆分 |
DocumentByParagraphSplitter | 按段落分割 | 保持段落语义完整,适合长文本拆分 | 文章、报告、长文档 |
DocumentByLineSplitter | 按换行符分割 | 简单高效,逐行拆分文本,无需复杂处理 | 日志文件、CSV 数据、逐行记录文本 |
DocumentByWordSplitter | 按空白符或自定义规则分割 | 常用于英文文本分割;中文分词需专业工具辅助 | 英文单词拆分 |
DocumentBySentenceSplitter | 按句子分割 | 保持句子完整语义,适合句子级语义分析 | 语义分析、机器翻译、情感分析等 |