项目三:信息抽取与图谱问答(医疗科研文献知识图谱与智能问答平台)
项目原型
🏥 医疗知识图谱智能问答系统
==================================================

[知识图谱统计]
📊 实体总数: 1,245,678 🔗 关系总数: 2,567,890
🔄 最后更新: 2024-01-15 14:30:25

[文档处理面板]
待处理医学文献: 125篇
┌─────────────────────────────────────────────────────┐
│ 最新文献: 《2型糖尿病药物治疗新进展》 │
│ 抽取状态: ✅ 完成 (实体: 23, 关系: 45) │
└─────────────────────────────────────────────────────┘

[智能问答界面]
用户问题: 糖尿病有哪些典型症状和常用药物?

🤖 系统回答:
根据医疗知识图谱,糖尿病相关信息如下:

【典型症状】
• 多饮、多尿、多食
• 体重下降
• 视力模糊
• 疲劳乏力

【常用药物】
• 二甲双胍
• 胰岛素
• 格列美脲
• 西格列汀

【相关检查】
• 血糖检测
• 糖化血红蛋白
• 口服葡萄糖耐量试验

💡 温馨提示: 以上信息仅供参考,具体诊疗请咨询专业医生。

[图谱可视化]
(显示糖尿病节点及其相关症状、药物、检查项目的网络图)

[操作选项]
1. 🔍 查看详细图谱 2. 📚 文献管理 3. ⚙️ 系统配置
4. 📊 统计分析 5. 🔄 更新图谱 6. ❓ 帮助

请输入选择 [1-6]:
配置参数
# config/kg_config.yaml
# Settings for the UIE extraction model, the Neo4j connection, graph
# construction thresholds and the question-answering service.

uie_model:
  model_path: "/models/uie-medical/"
  # Entity types the UIE model is asked to extract.
  schema:
    - "疾病"
    - "症状"
    - "药物"
    - "检查项目"
    - "治疗方法"
    - "基因"
    - "蛋白质"
    - "科室"
  batch_size: 16
  max_seq_len: 512

neo4j:
  uri: "bolt://localhost:7687"
  username: "neo4j"
  # NOTE(review): plaintext credential; override it outside version control
  # for any non-local deployment.
  password: "medical_kg_2024"
  database: "medical_knowledge"
  encrypted: false

knowledge_graph:
  entity_threshold: 0.7
  relation_threshold: 0.6
  max_entities_per_doc: 50

qa_system:
  cache_size: 1000
  timeout: 30
  max_results: 10
核心代码实现
import torch
from paddlenlp import Taskflow
from py2neo import Graph,Node,Relationship
import re
from datetime import datetime
from typing import Dict, List, Optional, Tuple, Union


class MedicalKGSystem:
    """Medical knowledge-graph pipeline.

    Extracts entities from text with a UIE model, derives relations between
    them with type/proximity rules, and stores both in Neo4j.
    """

    # Severity/qualifier words stripped from entity mentions.
    _MODIFIERS = ("急性", "慢性", "重度", "轻度", "典型")

    # Per-type synonym tables: any mention containing the key is mapped to
    # the canonical value.
    _NORMALIZATION_RULES = {
        "疾病": {"糖尿病 mellitus": "糖尿病", "DM": "糖尿病", "高血压病": "高血压"},
        "药物": {"阿司匹林片": "阿司匹林", "ASP": "阿司匹林"},
    }

    def __init__(self, neo4j_uri: str, username: str, password: str):
        """Load the UIE extraction model and connect to Neo4j.

        Args:
            neo4j_uri: Connection URI of the Neo4j server.
            username: Neo4j user name.
            password: Neo4j password.
        """
        # Entity types requested from the UIE model.
        self.schema = ["疾病", "症状", "药物", "检查项目", "治疗方法", "基因", "蛋白质", "科室"]
        self.uie = Taskflow(
            "information_extraction",
            schema=self.schema,
            model="uie-base",
            task_path="/models/uie-medical/",
            device_id=0,
        )
        self.graph = Graph(neo4j_uri, auth=(username, password))
        # Maps "<source type>-<target type>" to the relationship type stored
        # in the graph.
        self.relationships = {
            "疾病-症状": "has_symptom",
            "疾病-检查项目": "need_check",
            "疾病-药物": "treated_by",
            "疾病-治疗方法": "treated_with",
            "药物-疾病": "treats",
            "症状-疾病": "symptom_of",
            "疾病-科室": "belongs_to_department",
        }

    def extract_from_text(self, text: str) -> Dict:
        """Extract entities and relations from *text* and store them in the graph.

        Returns:
            A dict with keys ``entities``, ``relations`` and ``status``.
            ``status`` is ``"success"`` or ``"error: <message>"``; on error
            both lists are empty. Exceptions are reported through ``status``
            rather than raised.
        """
        try:
            extraction_result = self.uie(text)
            processed_entities = self._process_extraction_result(extraction_result)
            relations = self._extract_relations(text, processed_entities)
            self._update_knowledge_graph(processed_entities, relations)
            return {
                "entities": processed_entities,
                "relations": relations,
                "status": "success",
            }
        except Exception as e:  # pipeline boundary: report failure, don't raise
            return {"entities": [], "relations": [], "status": f"error: {str(e)}"}

    def _process_extraction_result(
        self, extraction_result: Union[Dict, List[Dict]]
    ) -> List[Dict]:
        """Normalise and de-duplicate raw extraction output.

        Args:
            extraction_result: Either one mapping of entity type to a list of
                mentions, or a list of such mappings. Each mention must have
                a ``text`` key and may have ``probability``.
                NOTE(review): the list form is accepted because the Taskflow
                call is presumed to return one mapping per input text;
                confirm against the installed PaddleNLP version.

        Returns:
            One dict per distinct (type, normalised name), in first-seen
            order, with keys ``name``, ``type``, ``original_text`` (first
            mention) and ``confidence`` (highest probability seen).
        """
        documents = (
            extraction_result
            if isinstance(extraction_result, list)
            else [extraction_result]
        )
        merged: Dict[Tuple[str, str], Dict] = {}
        for document in documents:
            for entity_type, mentions in (document or {}).items():
                for mention in mentions:
                    name = self._normalize_entity(mention['text'], entity_type)
                    if not name:
                        continue
                    confidence = mention.get('probability', 0.0)
                    key = (entity_type, name)
                    known = merged.get(key)
                    if known is None:
                        merged[key] = {
                            'name': name,
                            'type': entity_type,
                            'original_text': mention['text'],
                            'confidence': confidence,
                        }
                    elif confidence > known['confidence']:
                        known['confidence'] = confidence
        return list(merged.values())

    def _normalize_entity(self, entity_text: str, entity_type: str) -> str:
        """Return the canonical name for an entity mention.

        A synonym rule for *entity_type* wins if its key occurs in the text;
        otherwise the qualifier words in ``_MODIFIERS`` are removed. If that
        leaves nothing, the stripped original text is returned.
        """
        for original, normalized in self._NORMALIZATION_RULES.get(entity_type, {}).items():
            if original in entity_text:
                return normalized

        cleaned_text = entity_text
        for modifier in self._MODIFIERS:
            cleaned_text = cleaned_text.replace(modifier, "")
        return cleaned_text.strip() or entity_text.strip()

    def _extract_relations(
        self, text: str, entities: List[Dict], max_distance: int = 100
    ) -> List[Dict]:
        """Derive relations from entity types and textual proximity.

        Two entities are related when ``self.relationships`` defines a
        relation for their (source type, target type) pair and their first
        occurrences in *text* start fewer than *max_distance* characters
        apart.

        Returns:
            Dicts with keys ``source``, ``target``, ``relation``,
            ``confidence`` (fixed at 0.8), ``source_type`` and
            ``target_type``.
        """
        # First occurrence of each entity's original mention in the text.
        positions: Dict[Tuple[str, str], int] = {}
        for entity in entities:
            start_pos = text.find(entity['original_text'])
            if start_pos != -1:
                positions[(entity['type'], entity['name'])] = start_pos

        relations = []
        for i, entity1 in enumerate(entities):
            for j, entity2 in enumerate(entities):
                if i == j:
                    continue
                # Keys in self.relationships use a single hyphen.
                relation_key = f"{entity1['type']}-{entity2['type']}"
                if relation_key not in self.relationships:
                    continue
                pos1 = positions.get((entity1['type'], entity1['name']))
                pos2 = positions.get((entity2['type'], entity2['name']))
                if pos1 is None or pos2 is None:
                    continue
                # Entities that are close together are assumed to be related.
                if abs(pos1 - pos2) < max_distance:
                    relations.append({
                        'source': entity1['name'],
                        'target': entity2['name'],
                        'relation': self.relationships[relation_key],
                        'confidence': 0.8,
                        'source_type': entity1['type'],
                        'target_type': entity2['type'],
                    })
        return relations

    def _update_knowledge_graph(self, entities: List[Dict], relations: List[Dict]):
        """Write the extracted entities and relations to Neo4j in one transaction.

        Existing nodes get their ``count`` incremented, and existing
        relationships their ``weight``; missing ones are created. The
        transaction is rolled back and the exception re-raised on failure.
        """
        timestamp = datetime.now().isoformat()
        tx = self.graph.begin()
        try:
            entity_nodes = {}
            created = set()  # keys of nodes created in this transaction
            for entity in entities:
                key = (entity['type'], entity['name'])
                if key in entity_nodes:
                    # Same entity listed twice: avoid creating a duplicate node.
                    continue
                node = self.graph.nodes.match(entity['type'], name=entity['name']).first()
                if node is not None:
                    node['count'] = (node.get('count') or 0) + 1
                    node['last_updated'] = timestamp
                    # Local property changes are only persisted by a push.
                    tx.push(node)
                else:
                    node = Node(
                        entity['type'],
                        name=entity['name'],
                        count=1,
                        created_time=timestamp,
                        last_updated=timestamp,
                    )
                    tx.create(node)
                    created.add(key)
                entity_nodes[key] = node

            for relation in relations:
                source_key = (relation['source_type'], relation['source'])
                target_key = (relation['target_type'], relation['target'])
                source_node = entity_nodes.get(source_key)
                target_node = entity_nodes.get(target_key)
                if source_node is None or target_node is None:
                    continue

                existing_rel = None
                # A relationship cannot already exist if either endpoint was
                # created just now, so only look it up for older nodes.
                if source_key not in created and target_key not in created:
                    existing_rel = self.graph.relationships.match(
                        (source_node, target_node), r_type=relation['relation']
                    ).first()

                if existing_rel is None:
                    tx.create(Relationship(
                        source_node,
                        relation['relation'],
                        target_node,
                        confidence=relation['confidence'],
                        created_time=timestamp,
                    ))
                else:
                    existing_rel['weight'] = existing_rel.get('weight', 1) + 1
                    existing_rel['last_updated'] = timestamp
                    tx.push(existing_rel)
            tx.commit()
        except Exception:
            tx.rollback()
            raise


class GraphQASystem:
    """Graph question answering: turns a natural-language question into Cypher."""

    # Diseases the rule-based parser can recognise in a question.
    KNOWN_DISEASES = ("糖尿病", "高血压", "冠心病", "哮喘", "肺炎")

    def __init__(self, graph: "Graph"):
        """Store the graph handle and compile the question patterns."""
        self.graph = graph
        self.patterns = self._load_question_patterns()

    def _load_question_patterns(self) -> Dict[str, list]:
        """Build the compiled regexes used to classify questions.

        Patterns are tried in insertion order. ``drug_query`` is listed
        before ``treatment_query`` so that questions mentioning both a drug
        and a treatment are classified as drug queries.
        """
        disease = "(" + "|".join(re.escape(d) for d in self.KNOWN_DISEASES) + ")"
        raw_patterns = {
            "symptom_query": [rf"{disease}.*症状", rf"症状.*{disease}"],
            "drug_query": [rf"{disease}.*药", rf"药.*治疗.*{disease}"],
            "treatment_query": [rf"{disease}.*治疗", rf"治疗.*{disease}"],
            "department_query": [rf"{disease}.*挂.*科", r"看.*科"],
        }
        return {
            q_type: [re.compile(p) for p in patterns]
            for q_type, patterns in raw_patterns.items()
        }

    def _extract_disease(self, question: str) -> Optional[str]:
        """Return the first known disease named in *question*, or None."""
        match = re.search("|".join(re.escape(d) for d in self.KNOWN_DISEASES), question)
        return match.group(0) if match else None

    def answer_question(self, question: str) -> Dict:
        """Answer a natural-language question from the knowledge graph.

        Returns:
            A dict with keys ``question``, ``cypher_query``, ``answer``,
            ``raw_result`` and ``status`` (``"success"`` or ``"error"``).
            Query failures are reported in the dict rather than raised.
        """
        question_type = self._classify_question(question)
        cypher_query = self._generate_cypher(question, question_type)
        disease = self._extract_disease(question)
        try:
            # The disease is bound as a query parameter, never interpolated
            # into the Cypher text.
            result = self.graph.run(cypher_query, disease_name=disease).data()
            answer = self._format_answer(result, question_type)
            return {
                "question": question,
                "cypher_query": cypher_query,
                "answer": answer,
                "raw_result": result,
                "status": "success",
            }
        except Exception as e:
            return {
                "question": question,
                "cypher_query": cypher_query,
                "answer": f"查询失败: {str(e)}",
                "raw_result": [],
                "status": "error",
            }

    def _classify_question(self, question: str) -> str:
        """Return the question type, or ``"general_query"`` if no pattern matches."""
        for q_type, patterns in self.patterns.items():
            if any(pattern.search(question) for pattern in patterns):
                return q_type
        return "general_query"

    def _generate_cypher(self, question: str, question_type: str) -> str:
        """Return the Cypher template for *question_type*.

        Every template expects one query parameter, ``$disease_name``.
        Unknown types fall back to the general template. *question* is kept
        for interface compatibility; the disease itself is supplied as a
        parameter when the query is run.
        """
        cypher_templates = {
            "symptom_query": (
                "MATCH (d:疾病 {name: $disease_name})-[r:has_symptom]->(s:症状)\n"
                "RETURN d.name as disease, collect(s.name) as symptoms"
            ),
            "treatment_query": (
                "MATCH (d:疾病 {name: $disease_name})-[r:treated_with]->(t:治疗方法)\n"
                "RETURN d.name as disease, collect(t.name) as treatments"
            ),
            "drug_query": (
                "MATCH (d:疾病 {name: $disease_name})-[r:treated_by]->(m:药物)\n"
                "RETURN d.name as disease, collect(m.name) as drugs"
            ),
            "department_query": (
                "MATCH (d:疾病 {name: $disease_name})-[r:belongs_to_department]->(dept:科室)\n"
                "RETURN d.name as disease, dept.name as department"
            ),
            "general_query": (
                "MATCH (d:疾病)-[r]-(related)\n"
                "WHERE d.name CONTAINS $disease_name\n"
                "RETURN d.name as disease, type(r) as relation, "
                "collect(related.name) as related_entities\n"
                "LIMIT 10"
            ),
        }
        return cypher_templates.get(question_type, cypher_templates["general_query"])

    def _format_answer(self, result: List[Dict], question_type: str) -> str:
        """Render query rows as a natural-language answer.

        Typed queries use only the first row; the general fallback lists up
        to three rows.
        """
        if not result:
            return "抱歉,没有找到相关信息。"

        answer_templates = {
            "symptom_query": lambda r: f"{r['disease']}的常见症状包括:{', '.join(r['symptoms'])}。",
            "treatment_query": lambda r: f"{r['disease']}的治疗方法包括:{', '.join(r['treatments'])}。",
            "drug_query": lambda r: f"用于治疗{r['disease']}的药物有:{', '.join(r['drugs'])}。",
            "department_query": lambda r: f"{r['disease']}应该挂{r['department']}科室。",
        }
        template = answer_templates.get(question_type)
        if template:
            return template(result[0])

        # Generic format: at most three rows.
        answers = []
        for item in result[:3]:
            related = ", ".join(
                str(name) for name in item['related_entities'] if name is not None
            )
            answers.append(f"{item['disease']} - {item['relation']} - {related}")
        return "相关结果:\n" + "\n".join(answers)
===========================================
Java代码
系统架构
信息抽取与图谱问答系统
├── 前端界面 (Vue.js + Element Plus + ECharts)
├── RESTful API (Spring Boot)
├── 业务逻辑层
│ ├── 信息抽取服务
│ ├── 知识图谱构建服务
│ ├── 图谱问答服务
│ └── 图谱分析服务
├── 数据访问层
├── 图数据库 (Neo4j)
├── NLP处理层 (Stanford CoreNLP)
└── 缓存层 (Redis)
依赖文件
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.company</groupId>
    <artifactId>kg-qa-system</artifactId>
    <version>1.0.0</version>
    <packaging>jar</packaging>

    <name>Knowledge Graph QA System</name>
    <description>Enterprise-level information extraction and knowledge graph QA system</description>

    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.7.0</version>
        <relativePath/>
    </parent>

    <properties>
        <java.version>11</java.version>
        <!-- NOTE(review): this value is used as the neo4j-ogm-core version below;
             confirm that such a release of that artifact exists. -->
        <neo4j.version>4.4.9</neo4j.version>
        <stanford-corenlp.version>4.5.0</stanford-corenlp.version>
    </properties>

    <dependencies>
        <!-- Spring Boot Starters -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-data-jpa</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-data-neo4j</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-data-redis</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-websocket</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-validation</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-actuator</artifactId>
        </dependency>

        <!-- Database -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <scope>runtime</scope>
        </dependency>

        <!-- Neo4j -->
        <dependency>
            <groupId>org.neo4j</groupId>
            <artifactId>neo4j-ogm-core</artifactId>
            <version>${neo4j.version}</version>
        </dependency>

        <!-- NLP Processing -->
        <dependency>
            <groupId>edu.stanford.nlp</groupId>
            <artifactId>stanford-corenlp</artifactId>
            <version>${stanford-corenlp.version}</version>
        </dependency>
        <dependency>
            <groupId>edu.stanford.nlp</groupId>
            <artifactId>stanford-corenlp</artifactId>
            <version>${stanford-corenlp.version}</version>
            <classifier>models</classifier>
        </dependency>

        <!-- PDF Processing -->
        <dependency>
            <groupId>org.apache.pdfbox</groupId>
            <artifactId>pdfbox</artifactId>
            <version>2.0.27</version>
        </dependency>

        <!-- Word Processing -->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi</artifactId>
            <version>5.2.3</version>
        </dependency>
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-ooxml</artifactId>
            <version>5.2.3</version>
        </dependency>

        <!-- Utilities -->
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
        </dependency>
        <dependency>
            <groupId>com.google.guava</groupId>
            <artifactId>guava</artifactId>
            <version>31.1-jre</version>
        </dependency>
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.15.3</version>
        </dependency>
        <!-- Lombok: the entity and controller classes rely on @Data, @NoArgsConstructor,
             @EqualsAndHashCode and @Slf4j. Version is managed by the Spring Boot parent. -->
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <optional>true</optional>
        </dependency>

        <!-- JSON Processing -->
        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-databind</artifactId>
        </dependency>
        <dependency>
            <groupId>com.fasterxml.jackson.datatype</groupId>
            <artifactId>jackson-datatype-jsr310</artifactId>
        </dependency>

        <!-- Test -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.testcontainers</groupId>
            <artifactId>neo4j</artifactId>
            <version>1.17.6</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.testcontainers</groupId>
            <artifactId>junit-jupiter</artifactId>
            <version>1.17.6</version>
            <scope>test</scope>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
            </plugin>
        </plugins>
    </build>
</project>
application.yml
server:
  port: 8082
  servlet:
    context-path: /kg-qa

spring:
  datasource:
    url: jdbc:mysql://localhost:3306/kg_qa_system?useSSL=false&serverTimezone=UTC
    username: root
    password: password
    driver-class-name: com.mysql.cj.jdbc.Driver
  jpa:
    hibernate:
      ddl-auto: update
    show-sql: true
    properties:
      hibernate:
        dialect: org.hibernate.dialect.MySQL8Dialect
        format_sql: true
  # NOTE(review): nesting reconstructed from a flattened listing. Confirm that
  # the Spring Boot version in use still reads Neo4j settings from
  # spring.data.neo4j.* rather than spring.neo4j.*.
  data:
    neo4j:
      uri: bolt://localhost:7687
      username: neo4j
      password: password
      auto-index: update
  redis:
    host: localhost
    port: 6379
    password:
    database: 0
    timeout: 2000ms
    lettuce:
      pool:
        max-active: 8
        max-wait: -1ms
        max-idle: 8
        min-idle: 0

# NOTE(review): placed at top level as an application-specific key; move it
# under the prefix its consumer actually binds to.
websocket:
  allowed-origins: "*"

# Knowledge Graph Configuration
kg:
  # Information Extraction Configuration
  extraction:
    max-text-length: 10000
    batch-size: 50
    enable-relation-extraction: true
    enable-attribute-extraction: true
  # Graph Configuration
  graph:
    auto-indexing: true
    batch-insert-size: 1000
    cache-enabled: true
  # QA Configuration
  qa:
    max-path-length: 3
    timeout: 30000
    enable-fallback: true

# NLP Configuration
nlp:
  stanford:
    annotators: tokenize, ssplit, pos, lemma, ner, parse, depparse, coref
    threads: 4
    timeout: 30000

# Application Configuration
app:
  cache:
    extraction-results-ttl: 3600
    query-results-ttl: 1800
  storage:
    upload-path: ./uploads/
    processed-path: ./processed/

# Logging
logging:
  level:
    com.company.kg: DEBUG
    org.springframework.web: INFO
    org.hibernate: WARN
    org.neo4j: WARN
  file:
    name: logs/kg-qa-system.log
  pattern:
    file: "%d{yyyy-MM-dd HH:mm:ss} - %logger{36} - %msg%n"

# Management Endpoints
management:
  endpoints:
    web:
      exposure:
        include: health,info,metrics,prometheus
  endpoint:
    health:
      show-details: always
数据模型层
文本源实体
/*** 文本源实体 - 存储待处理的文本数据源*/
@Entity
@Table(name = "text_source", indexes = {@Index(name = "idx_source_type", columnList = "sourceType"),@Index(name = "idx_source_status", columnList = "status"),@Index(name = "idx_source_domain", columnList = "domain")
})
@Data
@NoArgsConstructor
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
public class TextSource {public enum SourceType {PDF, DOC, DOCX, TXT, HTML, WEB_PAGE, DATABASE, API}public enum ProcessingStatus {PENDING, PROCESSING, EXTRACTED, FAILED, DELETED}@Id@GeneratedValue(strategy = GenerationType.IDENTITY)@EqualsAndHashCode.Includeprivate Long id;@NotBlank(message = "源名称不能为空")@Column(nullable = false, length = 500)private String name;@Enumerated(EnumType.STRING)@Column(nullable = false, length = 20)private SourceType sourceType;@Column(length = 50)private String fileType;@Columnprivate Long fileSize;@Column(length = 500)private String filePath;@Column(columnDefinition = "TEXT")private String content;@Column(length = 100)private String domain = "default";@Column(length = 100)private String category;@Enumerated(EnumType.STRING)@Column(nullable = false, length = 20)private ProcessingStatus status = ProcessingStatus.PENDING;@Column(nullable = false)private Integer entityCount = 0;@Column(nullable = false)private Integer relationCount = 0;@Column(nullable = false)private Integer attributeCount = 0;@Column(length = 1000)private String processingError;@Column(length = 500)private String metadata; // JSON格式的元数据@OneToMany(mappedBy = "textSource", cascade = CascadeType.ALL, fetch = FetchType.LAZY)private List<ExtractionResult> extractionResults = new ArrayList<>();@CreationTimestamp@Column(updatable = false)private LocalDateTime createdAt;@UpdateTimestampprivate LocalDateTime updatedAt;@Versionprivate Long version;public TextSource(String name, SourceType sourceType, String content) {this.name = name;this.sourceType = sourceType;this.content = content;}public boolean isProcessed() {return ProcessingStatus.EXTRACTED.equals(this.status);}
}
抽取结果实体
/*** 抽取结果实体 - 存储信息抽取的结果*/
@Entity
@Table(name = "extraction_result", indexes = {@Index(name = "idx_result_source", columnList = "textSource_id"),@Index(name = "idx_result_created", columnList = "createdAt")
})
@Data
@NoArgsConstructor
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
public class ExtractionResult {@Id@GeneratedValue(strategy = GenerationType.IDENTITY)@EqualsAndHashCode.Includeprivate Long id;@ManyToOne(fetch = FetchType.LAZY)@JoinColumn(name = "textSource_id", nullable = false)private TextSource textSource;@Column(nullable = false)private Integer entityCount = 0;@Column(nullable = false)private Integer relationCount = 0;@Column(nullable = false)private Integer attributeCount = 0;@Column(precision = 5, scale = 4)private Double extractionConfidence;@Column(length = 50)private String extractionModel;@Column(columnDefinition = "TEXT")private String extractedEntities; // JSON格式的实体列表@Column(columnDefinition = "TEXT")private String extractedRelations; // JSON格式的关系列表@Column(columnDefinition = "TEXT")private String extractedAttributes; // JSON格式的属性列表@Column(length = 1000)private String processingLog;@Column(nullable = false)private Long processingTimeMs;@ElementCollection@CollectionTable(name = "extraction_statistics", joinColumns = @JoinColumn(name = "extraction_result_id"))@MapKeyColumn(name = "stat_key")@Column(name = "stat_value")private Map<String, String> statistics = new HashMap<>();@CreationTimestamp@Column(updatable = false)private LocalDateTime createdAt;@Versionprivate Long version;public ExtractionResult(TextSource textSource) {this.textSource = textSource;}
}
Neo4j图数据库模型
实体节点
/*** 实体节点 - 知识图谱中的实体*/
@NodeEntity
@Data
@NoArgsConstructor
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
public class EntityNode {@Id@GeneratedValue@EqualsAndHashCode.Includeprivate Long id;@Index(unique = true)@Property(name = "entity_id")private String entityId;@Property(name = "name")private String name;@Property(name = "type")private String type;@Property(name = "description")private String description;@Property(name = "domain")private String domain = "default";@Property(name = "source")private String source;@Property(name = "confidence")private Double confidence;@Property(name = "created_at")private LocalDateTime createdAt;@Property(name = "updated_at")private LocalDateTime updatedAt;@Property(name = "metadata")private String metadata; // JSON格式的元数据@Relationship(type = "HAS_ATTRIBUTE", direction = Relationship.OUTGOING)private Set<AttributeRelation> attributes = new HashSet<>();@Relationship(type = "RELATED_TO", direction = Relationship.OUTGOING)private Set<EntityRelation> relations = new HashSet<>();public EntityNode(String entityId, String name, String type) {this.entityId = entityId;this.name = name;this.type = type;this.createdAt = LocalDateTime.now();this.updatedAt = LocalDateTime.now();}public EntityNode(String entityId, String name, String type, String domain) {this(entityId, name, type);this.domain = domain;}/*** 添加属性*/public void addAttribute(String key, String value, Double confidence) {AttributeRelation attribute = new AttributeRelation(this, key, value, confidence);this.attributes.add(attribute);}/*** 添加关系*/public void addRelation(EntityNode target, String relationType, Double confidence) {EntityRelation relation = new EntityRelation(this, target, relationType, confidence);this.relations.add(relation);}/*** 获取实体显示标签*/public String getDisplayLabel() {return String.format("%s (%s)", name, type);}
}
实体关系
/*** 实体关系 - 知识图谱中实体之间的关系*/
@RelationshipEntity(type = "RELATED_TO")
@Data
@NoArgsConstructor
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
public class EntityRelation {@Id@GeneratedValue@EqualsAndHashCode.Includeprivate Long id;@StartNodeprivate EntityNode startEntity;@EndNodeprivate EntityNode endEntity;@Property(name = "relation_id")private String relationId;@Property(name = "type")private String type;@Property(name = "description")private String description;@Property(name = "confidence")private Double confidence;@Property(name = "source")private String source;@Property(name = "domain")private String domain = "default";@Property(name = "created_at")private LocalDateTime createdAt;@Property(name = "metadata")private String metadata; // JSON格式的元数据public EntityRelation(EntityNode startEntity, EntityNode endEntity, String type, Double confidence) {this.startEntity = startEntity;this.endEntity = endEntity;this.type = type;this.confidence = confidence;this.relationId = generateRelationId(startEntity, endEntity, type);this.createdAt = LocalDateTime.now();}/*** 生成关系ID*/private String generateRelationId(EntityNode start, EntityNode end, String relationType) {return String.format("%s_%s_%s_%d", start.getEntityId(), relationType, end.getEntityId(), System.currentTimeMillis());}/*** 获取关系显示标签*/public String getDisplayLabel() {return String.format("%s → %s", type, confidence != null ? String.format("(%.2f)", confidence) : "");}
}
属性关系
/*** 属性关系 - 实体与属性之间的关系*/
@RelationshipEntity(type = "HAS_ATTRIBUTE")
@Data
@NoArgsConstructor
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
public class AttributeRelation {@Id@GeneratedValue@EqualsAndHashCode.Includeprivate Long id;@StartNodeprivate EntityNode entity;@Property(name = "attribute_id")private String attributeId;@Property(name = "key")private String key;@Property(name = "value")private String value;@Property(name = "data_type")private String dataType;@Property(name = "confidence")private Double confidence;@Property(name = "source")private String source;@Property(name = "created_at")private LocalDateTime createdAt;@Property(name = "metadata")private String metadata; // JSON格式的元数据public AttributeRelation(EntityNode entity, String key, String value, Double confidence) {this.entity = entity;this.key = key;this.value = value;this.confidence = confidence;this.attributeId = generateAttributeId(entity, key);this.createdAt = LocalDateTime.now();this.dataType = inferDataType(value);}/*** 生成属性ID*/private String generateAttributeId(EntityNode entity, String key) {return String.format("%s_%s_%d", entity.getEntityId(), key, System.currentTimeMillis());}/*** 推断数据类型*/private String inferDataType(String value) {if (value == null) return "STRING";// 检查是否为数字if (value.matches("-?\\d+")) return "INTEGER";if (value.matches("-?\\d+\\.\\d+")) return "FLOAT";// 检查是否为布尔值if ("true".equalsIgnoreCase(value) || "false".equalsIgnoreCase(value)) return "BOOLEAN";// 检查是否为日期if (value.matches("\\d{4}-\\d{2}-\\d{2}")) return "DATE";if (value.matches("\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}")) return "DATETIME";return "STRING";}/*** 获取属性显示标签*/public String getDisplayLabel() {return String.format("%s: %s", key, value);}
}
Web控制器层
信息抽取API控制器
@RestController
@RequestMapping("/api/v1/extraction")
@Validated
@Slf4j
public class ExtractionController {@Autowiredprivate InformationExtractionService extractionService;@Autowiredprivate KnowledgeGraphService knowledgeGraphService;/**从文本中抽取信息*/@PostMapping("/extract-text")public ResponseEntity< ExtractionResponse> extractFromText(@Valid @RequestBody TextExtractionRequest request) {log.info("Text extraction request - Domain: {}, Length: {}", request.getDomain(), request.getText().length());try {InformationExtractionService.ExtractionData extractionData = extractionService.extractInformation(request.getText(), request.getDomain());ExtractionResponse response = new ExtractionResponse();response.setSuccess(true);response.setEntityCount(extractionData.getEntities().size());response.setRelationCount(extractionData.getRelations().size());response.setAttributeCount(extractionData.getAttributes().size());response.setExtractionData(extractionData);response.setMessage("信息抽取完成");return ResponseEntity.ok(response);} catch (Exception e) {log.error("Text extraction failed", e);return ResponseEntity.badRequest().body(ExtractionResponse.error("文本抽取失败: " + e.getMessage()));}}/*** 上传文件并抽取信息*/@PostMapping("/upload-file")public ResponseEntity<ExtractionResponse> uploadAndExtract(@RequestParam("file") MultipartFile file,@RequestParam("domain") String domain,@RequestParam(value = "category", required = false) String category) {log.info("File upload and extraction - File: {}, Domain: {}", file.getOriginalFilename(), domain);try {// 检查文件类型String contentType = file.getContentType();if (!isSupportedFileType(contentType)) {return ResponseEntity.badRequest().body(ExtractionResponse.error("不支持的文件类型: " + contentType));}// 读取文件内容String content = readFileContent(file);if (content == null || content.trim().isEmpty()) {return ResponseEntity.badRequest().body(ExtractionResponse.error("文件内容为空"));}// 创建文本源TextSource textSource = new 
TextSource(file.getOriginalFilename(),TextSource.SourceType.valueOf(getSourceType(contentType)),content);textSource.setDomain(domain);textSource.setCategory(category);textSource.setFileSize(file.getSize());textSource.setFileType(getFileExtension(file.getOriginalFilename()));// 执行信息抽取InformationExtractionService.ExtractionData extractionData = extractionService.extractInformation(content, domain);ExtractionResponse response = new ExtractionResponse();response.setSuccess(true);response.setEntityCount(extractionData.getEntities().size());response.setRelationCount(extractionData.getRelations().size());response.setAttributeCount(extractionData.getAttributes().size());response.setExtractionData(extractionData);response.setMessage("文件上传和信息抽取完成");return ResponseEntity.ok(response);} catch (Exception e) {log.error("File upload and extraction failed", e);return ResponseEntity.badRequest().body(ExtractionResponse.error("文件上传和抽取失败: " + e.getMessage()));}}/*** 批量处理文本源*/@PostMapping("/batch-process")public ResponseEntity<BatchExtractionResponse> batchProcess(@Valid @RequestBody BatchExtractionRequest request) {log.info("Batch extraction request - Count: {}, Domain: {}", request.getTexts().size(), request.getDomain());try {BatchExtractionResponse response = new BatchExtractionResponse();response.setTotalCount(request.getTexts().size());response.setSuccessCount(0);response.setFailedCount(0);for (String text : request.getTexts()) {try {InformationExtractionService.ExtractionData extractionData = extractionService.extractInformation(text, request.getDomain());ExtractionResult result = new ExtractionResult();result.setEntityCount(extractionData.getEntities().size());result.setRelationCount(extractionData.getRelations().size());result.setAttributeCount(extractionData.getAttributes().size());response.getResults().add(result);response.setSuccessCount(response.getSuccessCount() + 1);} catch (Exception e) {log.error("Batch extraction failed for one text", 
e);response.setFailedCount(response.getFailedCount() + 1);response.getErrors().add("处理失败: " + e.getMessage());}}response.setSuccess(true);response.setMessage("批量处理完成");return ResponseEntity.ok(response);} catch (Exception e) {log.error("Batch extraction failed", e);return ResponseEntity.badRequest().body(BatchExtractionResponse.error("批量处理失败: " + e.getMessage()));}}/*** 获取抽取结果统计*/@GetMapping("/statistics")public ResponseEntity<ExtractionStatisticsResponse> getExtractionStatistics(@RequestParam(value = "domain", required = false) String domain) {log.info("Getting extraction statistics - Domain: {}", domain);try {ExtractionStatisticsResponse response = knowledgeGraphService.getExtractionStatistics(domain);return ResponseEntity.ok(response);} catch (Exception e) {log.error("Failed to get extraction statistics", e);return ResponseEntity.badRequest().body(ExtractionStatisticsResponse.error("获取统计信息失败: " + e.getMessage()));}}// 工具方法private boolean isSupportedFileType(String contentType) {return contentType != null && (contentType.equals("text/plain") ||contentType.equals("application/pdf") ||contentType.equals("application/msword") ||contentType.equals("application/vnd.openxmlformats-officedocument.wordprocessingml.document"));}private String getSourceType(String contentType) {if (contentType == null) return "TXT";switch (contentType) {case "application/pdf": return "PDF";case "application/msword": return "DOC";case "application/vnd.openxmlformats-officedocument.wordprocessingml.document": return "DOCX";default: return "TXT";}}private String getFileExtension(String filename) {if (filename == null) return "";int lastDot = filename.lastIndexOf(".");return lastDot > 0 ? 
filename.substring(lastDot + 1) : "";}private String readFileContent(MultipartFile file) throws IOException {String contentType = file.getContentType();if ("text/plain".equals(contentType)) {return new String(file.getBytes(), StandardCharsets.UTF_8);} else if ("application/pdf".equals(contentType)) {// 使用PDFBox读取PDF内容try (var pdfDocument = org.apache.pdfbox.pdmodel.PDDocument.load(file.getInputStream())) {var stripper = new org.apache.pdfbox.text.PDFTextStripper();return stripper.getText(pdfDocument);}} else {// 对于其他格式,返回简单提示return "文件内容需要特殊处理: " + contentType;}}
}
知识图谱问答API控制器
@RestController
@RequestMapping("/api/v1/qa")
@Validated
@Slf4j
public class QAController {@Autowiredprivate KnowledgeGraphQAService qaService;/*** 回答自然语言问题*/@PostMapping("/answer")public ResponseEntity<QAResponse> answerQuestion(@Valid @RequestBody QARequest request) {log.info("QA request - Question: {}, Domain: {}", request.getQuestion(), request.getDomain());try {KnowledgeGraphQAService.QAAnswer answer = qaService.answerQuestion(request.getQuestion(), request.getDomain());QAResponse response = new QAResponse();response.setSuccess(answer.isSuccess());response.setQuestion(answer.getQuestion());response.setAnswer(answer.getAnswer());response.setDomain(answer.getDomain());response.setProcessingTimeMs(answer.getProcessingTimeMs());response.setResultCount(answer.getResultCount());response.setFallbackUsed(answer.isFallbackUsed());if (!answer.isSuccess()) {response.setErrorMessage(answer.getErrorMessage());}return ResponseEntity.ok(response);} catch (Exception e) {log.error("QA request failed", e);return ResponseEntity.badRequest().body(QAResponse.error("问答请求失败: " + e.getMessage()));}}/*** 解析问题(不执行查询)*/@PostMapping("/parse-question")public ResponseEntity<QuestionParseResponse> parseQuestion(@Valid @RequestBody QARequest request) {log.info("Question parse request - Question: {}", request.getQuestion());try {KnowledgeGraphQAService.QueryTemplate queryTemplate = qaService.parseQuestion(request.getQuestion(), request.getDomain());QuestionParseResponse response = new QuestionParseResponse();response.setSuccess(true);response.setOriginalQuestion(queryTemplate.getOriginalQuestion());response.setQuestionType(queryTemplate.getQuestionType());response.setDomain(queryTemplate.getDomain());response.setEntities(queryTemplate.getEntities());response.setResolvedEntities(queryTemplate.getResolvedEntities());response.setParameters(queryTemplate.getParameters());// 生成Cypher查询预览String cypherQuery = qaService.generateCypherQuery(queryTemplate);response.setCypherQuery(cypherQuery);return ResponseEntity.ok(response);} catch (Exception e) 
{log.error("Question parse failed", e);return ResponseEntity.badRequest().body(QuestionParseResponse.error("问题解析失败: " + e.getMessage()));}}/*** 执行Cypher查询*/@PostMapping("/execute-cypher")public ResponseEntity<CypherExecutionResponse> executeCypher(@Valid @RequestBody CypherExecutionRequest request) {log.info("Cypher execution request - Query: {}", request.getQuery());try {// 这里应该添加查询验证和限制,防止恶意查询if (!isSafeQuery(request.getQuery())) {return ResponseEntity.badRequest().body(CypherExecutionResponse.error("查询包含不安全操作"));}var results = qaService.executeCypherQuery(request.getQuery());CypherExecutionResponse response = new CypherExecutionResponse();response.setSuccess(true);response.setQuery(request.getQuery());response.setResults(results);response.setResultCount(results.size());return ResponseEntity.ok(response);} catch (Exception e) {log.error("Cypher execution failed", e);return ResponseEntity.badRequest().body(CypherExecutionResponse.error("Cypher查询执行失败: " + e.getMessage()));}}/*** 获取知识图谱统计信息*/@GetMapping("/graph-statistics")public ResponseEntity<GraphStatisticsResponse> getGraphStatistics(@RequestParam(value = "domain", required = false) String domain) {log.info("Getting graph statistics - Domain: {}", domain);try {GraphStatisticsResponse response = new GraphStatisticsResponse();response.setSuccess(true);response.setDomain(domain);// 获取实体统计var entityStats = qaService.findEntitiesByName(".*", domain);response.setTotalEntities(entityStats.size());// 获取实体类型分布var typeDistribution = entityStats.stream().collect(java.util.stream.Collectors.groupingBy(e -> e.getType(), java.util.stream.Collectors.counting()));response.setEntityTypeDistribution(typeDistribution);// 这里可以添加更多统计信息...return ResponseEntity.ok(response);} catch (Exception e) {log.error("Failed to get graph statistics", e);return ResponseEntity.badRequest().body(GraphStatisticsResponse.error("获取图谱统计失败: " + e.getMessage()));}}/*** 检查查询安全性*/private boolean isSafeQuery(String query) {if (query == null) return false;// 
检查是否包含危险操作String lowerQuery = query.toLowerCase();return !lowerQuery.contains("delete") && !lowerQuery.contains("remove") && !lowerQuery.contains("drop") && !lowerQuery.contains("create") &&!lowerQuery.contains("merge") &&!lowerQuery.contains("set") &&lowerQuery.contains("match");}
}
核心业务服务层
信息抽取服务
import edu.stanford.nlp.coref.CorefCoreAnnotations;
import edu.stanford.nlp.coref.data.CorefChain;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeCoreAnnotations;
import edu.stanford.nlp.util.CoreMap;
/*** 信息抽取服务 - 负责从文本中抽取实体、关系和属性*/
package com.company.kg.service;import com.company.kg.entity.ExtractionResult;
import com.company.kg.entity.TextSource;
import com.company.kg.graph.EntityNode;
import edu.stanford.nlp.coref.CorefCoreAnnotations;
import edu.stanford.nlp.coref.data.CorefChain;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeCoreAnnotations;
import edu.stanford.nlp.util.CoreMap;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Pattern;
import java.util.stream.Collectors;/*** 信息抽取服务 - 负责从文本中抽取实体、关系和属性*/
@Service
@Slf4j
public class InformationExtractionService {private final StanfordCoreNLP pipeline;private final KnowledgeGraphService knowledgeGraphService;// 实体类型映射private static final Map<String, String> ENTITY_TYPE_MAPPING = Map.of("PERSON", "Person","ORGANIZATION", "Organization","LOCATION", "Location","CITY", "Location","STATE_OR_PROVINCE", "Location","COUNTRY", "Location","NATIONALITY", "Nationality","MISC", "Miscellaneous","DATE", "Date","TIME", "Time","MONEY", "Money","PERCENT", "Percent","NUMBER", "Number");// 关系模式private static final Map<Pattern, String> RELATION_PATTERNS = Map.of(Pattern.compile("(\\w+)是(\\w+)的"), "IS_PART_OF",Pattern.compile("(\\w+)在(\\w+)工作"), "WORKS_AT",Pattern.compile("(\\w+)出生于(\\w+)"), "BORN_IN",Pattern.compile("(\\w+)创建了(\\w+)"), "FOUNDED_BY"),Pattern.compile("(\\w+)位于(\\w+)"), "LOCATED_IN",Pattern.compile("(\\w+)属于(\\w+)"), "BELONGS_TO");@Autowiredpublic InformationExtractionService(KnowledgeGraphService knowledgeGraphService) {this.knowledgeGraphService = knowledgeGraphService;// 初始化Stanford CoreNLP管道Properties props = new Properties();props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner, parse, depparse, coref");props.setProperty("coref.algorithm", "statistical");props.setProperty("threads", "4");this.pipeline = new StanfordCoreNLP(props);log.info("Stanford CoreNLP pipeline initialized successfully");}/*** 从文本源抽取信息*/public ExtractionResult extractFromTextSource(TextSource textSource) {long startTime = System.currentTimeMillis();ExtractionResult result = new ExtractionResult(textSource);try {log.info("Starting information extraction for text source: {}", textSource.getName());String text = textSource.getContent();if (text == null || text.trim().isEmpty()) {throw new IllegalArgumentException("文本内容为空");}// 执行信息抽取ExtractionData extractionData = extractInformation(text, textSource.getDomain());// 
构建抽取结果result.setEntityCount(extractionData.getEntities().size());result.setRelationCount(extractionData.getRelations().size());result.setAttributeCount(extractionData.getAttributes().size());result.setExtractionConfidence(calculateAverageConfidence(extractionData));result.setExtractionModel("Stanford-CoreNLP-4.5.0");result.setProcessingTimeMs(System.currentTimeMillis() - startTime);// 转换为JSON存储result.setExtractedEntities(convertToJson(extractionData.getEntities()));result.setExtractedRelations(convertToJson(extractionData.getRelations()));result.setExtractedAttributes(convertToJson(extractionData.getAttributes()));// 构建知识图谱buildKnowledgeGraph(extractionData, textSource.getDomain(), textSource.getName());log.info("Information extraction completed: {} entities, {} relations, {} attributes", extractionData.getEntities().size(), extractionData.getRelations().size(), extractionData.getAttributes().size());} catch (Exception e) {log.error("Information extraction failed for text source: {}", textSource.getName(), e);result.setProcessingError("抽取失败: " + e.getMessage());}return result;}/*** 从文本中抽取信息*/public ExtractionData extractInformation(String text, String domain) {ExtractionData extractionData = new ExtractionData();try {// 创建文档注解Annotation document = new Annotation(text);// 执行所有注解pipeline.annotate(document);// 处理句子List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);for (CoreMap sentence : sentences) {// 抽取实体extractEntities(sentence, extractionData, domain);// 抽取关系extractRelations(sentence, extractionData, domain);// 抽取属性extractAttributes(sentence, extractionData, domain);}// 处理共指消解resolveCoreferences(document, extractionData);// 后处理:合并重复实体,验证关系等postProcessExtraction(extractionData);} catch (Exception e) {log.error("Information extraction failed", e);throw new RuntimeException("信息抽取失败: " + e.getMessage());}return extractionData;}/*** 抽取实体*/private void extractEntities(CoreMap sentence, ExtractionData extractionData, String domain) 
{List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);StringBuilder currentEntity = new StringBuilder();String currentType = null;int startPosition = -1;for (int i = 0; i < tokens.size(); i++) {CoreLabel token = tokens.get(i);String word = token.word();String ner = token.get(CoreAnnotations.NamedEntityTagAnnotation.class);if (!"O".equals(ner)) {// 开始新实体或继续当前实体if (currentEntity.length() == 0) {startPosition = token.beginPosition();currentType = normalizeEntityType(ner);}currentEntity.append(word);// 检查下一个token是否属于同一实体if (i + 1 < tokens.size()) {CoreLabel nextToken = tokens.get(i + 1);String nextNer = nextToken.get(CoreAnnotations.NamedEntityTagAnnotation.class);if (ner.equals(nextNer)) {currentEntity.append(" ");continue;}}// 实体结束,添加到结果if (currentEntity.length() > 0 && currentType != null) {String entityName = currentEntity.toString().trim();if (isValidEntity(entityName, currentType)) {EntityInfo entity = new EntityInfo(generateEntityId(entityName, currentType),entityName,currentType,domain,calculateEntityConfidence(entityName, currentType),startPosition,token.endPosition());extractionData.addEntity(entity);}}// 重置currentEntity.setLength(0);currentType = null;startPosition = -1;}}}/*** 抽取关系*/private void extractRelations(CoreMap sentence, ExtractionData extractionData, String domain) {// 获取依存句法树SemanticGraph dependencies = sentence.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);if (dependencies == null) {return;}// 获取实体列表List<EntityInfo> sentenceEntities = extractionData.getEntities().stream().filter(e -> e.getDomain().equals(domain)).collect(Collectors.toList());// 基于依存关系抽取关系for (EntityInfo entity1 : sentenceEntities) {for (EntityInfo entity2 : sentenceEntities) {if (!entity1.equals(entity2)) {// 检查是否存在依存路径String relationType = detectRelationByDependencies(dependencies, entity1, entity2);if (relationType != null) {RelationInfo relation = new RelationInfo(generateRelationId(entity1, entity2, 
relationType),entity1.getEntityId(),entity2.getEntityId(),relationType,domain,calculateRelationConfidence(entity1, entity2, relationType));extractionData.addRelation(relation);}}}}// 基于模式匹配抽取关系extractRelationsByPatterns(sentence.toString(), extractionData, domain);}/*** 基于依存关系检测关系*/private String detectRelationByDependencies(SemanticGraph dependencies, EntityInfo entity1, EntityInfo entity2) {// 简化的依存关系分析// 实际项目中应该使用更复杂的规则String text = dependencies.toList();if (text.contains(entity1.getName()) && text.contains(entity2.getName())) {// 检查常见的依存关系模式if (text.contains("nsubj") && text.contains("dobj")) {return "SUBJECT_OBJECT";} else if (text.contains("prep") && (text.contains("in") || text.contains("at"))) {return "LOCATED_IN";} else if (text.contains("appos")) {return "ALSO_KNOWN_AS";}}return null;}/*** 基于模式匹配抽取关系*/private void extractRelationsByPatterns(String sentence, ExtractionData extractionData, String domain) {for (Map.Entry<Pattern, String> entry : RELATION_PATTERNS.entrySet()) {java.util.regex.Matcher matcher = entry.getKey().matcher(sentence);while (matcher.find()) {String entity1Name = matcher.group(1);String entity2Name = matcher.group(2);String relationType = entry.getValue();// 查找匹配的实体Optional<EntityInfo> entity1 = findEntityByName(extractionData, entity1Name);Optional<EntityInfo> entity2 = findEntityByName(extractionData, entity2Name);if (entity1.isPresent() && entity2.isPresent()) {RelationInfo relation = new RelationInfo(generateRelationId(entity1.get(), entity2.get(), relationType),entity1.get().getEntityId(),entity2.get().getEntityId(),relationType,domain,0.8 // 模式匹配的置信度);extractionData.addRelation(relation);}}}}/*** 抽取属性*/private void extractAttributes(CoreMap sentence, ExtractionData extractionData, String domain) {// 获取句法树Tree tree = sentence.get(TreeCoreAnnotations.TreeAnnotation.class);if (tree == null) {return;}// 简化的属性抽取:基于名词短语和动词短语List<EntityInfo> entities = extractionData.getEntities();for (EntityInfo entity : entities) {// 
抽取基本属性:类型、长度等AttributeInfo typeAttr = new AttributeInfo(generateAttributeId(entity, "type"),entity.getEntityId(),"type",entity.getType(),"STRING",domain,0.9);extractionData.addAttribute(typeAttr);// 基于上下文抽取其他属性extractContextualAttributes(sentence, entity, extractionData, domain);}}/*** 抽取上下文属性*/private void extractContextualAttributes(CoreMap sentence, EntityInfo entity, ExtractionData extractionData, String domain) {String sentenceText = sentence.toString();int entityPosition = sentenceText.indexOf(entity.getName());if (entityPosition >= 0) {// 在实体周围寻找可能的属性String context = getContextAround(sentenceText, entityPosition, entity.getName().length(), 50);// 抽取数字属性extractNumericAttributes(context, entity, extractionData, domain);// 抽取描述性属性extractDescriptiveAttributes(context, entity, extractionData, domain);}}/*** 抽取数字属性*/private void extractNumericAttributes(String context, EntityInfo entity, ExtractionData extractionData, String domain) {java.util.regex.Pattern numberPattern = java.util.regex.Pattern.compile("\\d+(\\.\\d+)?");java.util.regex.Matcher matcher = numberPattern.matcher(context);while (matcher.find()) {String number = matcher.group();String attributeKey = inferNumericAttributeKey(context, number, entity.getType());if (attributeKey != null) {AttributeInfo attribute = new AttributeInfo(generateAttributeId(entity, attributeKey),entity.getEntityId(),attributeKey,number,"NUMBER",domain,0.7);extractionData.addAttribute(attribute);}}}/*** 推断数字属性的键*/private String inferNumericAttributeKey(String context, String number, String entityType) {context = context.toLowerCase();if (context.contains("年龄") || context.contains("岁")) {return "age";} else if (context.contains("工资") || context.contains("薪资") || context.contains("收入")) {return "salary";} else if (context.contains("身高")) {return "height";} else if (context.contains("体重")) {return "weight";} else if (context.contains("成立于") || context.contains("建立于")) {return "founded_year";} else if (context.contains("人口")) 
{return "population";}return "numeric_value";}/*** 抽取描述性属性*/private void extractDescriptiveAttributes(String context, EntityInfo entity, ExtractionData extractionData, String domain) {// 基于实体类型和上下文的简单规则switch (entity.getType()) {case "Person":extractPersonAttributes(context, entity, extractionData, domain);break;case "Organization":extractOrganizationAttributes(context, entity, extractionData, domain);break;case "Location":extractLocationAttributes(context, entity, extractionData, domain);break;}}/*** 抽取人物属性*/private void extractPersonAttributes(String context, EntityInfo entity, ExtractionData extractionData, String domain) {if (context.contains("博士") || context.contains("教授")) {AttributeInfo attribute = new AttributeInfo(generateAttributeId(entity, "title"),entity.getEntityId(),"title","博士","STRING",domain,0.8);extractionData.addAttribute(attribute);}if (context.contains("男")) {AttributeInfo attribute = new AttributeInfo(generateAttributeId(entity, "gender"),entity.getEntityId(),"gender","男","STRING",domain,0.9);extractionData.addAttribute(attribute);} else if (context.contains("女")) {AttributeInfo attribute = new AttributeInfo(generateAttributeId(entity, "gender"),entity.getEntityId(),"gender","女","STRING",domain,0.9);extractionData.addAttribute(attribute);}}/*** 抽取组织属性*/private void extractOrganizationAttributes(String context, EntityInfo entity, ExtractionData extractionData, String domain) {if (context.contains("公司") || context.contains("企业")) {AttributeInfo attribute = new AttributeInfo(generateAttributeId(entity, "organization_type"),entity.getEntityId(),"organization_type","公司","STRING",domain,0.8);extractionData.addAttribute(attribute);}}/*** 抽取地点属性*/private void extractLocationAttributes(String context, EntityInfo entity, ExtractionData extractionData, String domain) {if (context.contains("市") || context.contains("省") || context.contains("区")) {AttributeInfo attribute = new AttributeInfo(generateAttributeId(entity, 
"location_type"),entity.getEntityId(),"location_type","行政区划","STRING",domain,0.8);extractionData.addAttribute(attribute);}}/*** 处理共指消解*/private void resolveCoreferences(Annotation document, ExtractionData extractionData) {Map<Integer, CorefChain> corefChains = document.get(CorefCoreAnnotations.CorefChainAnnotation.class);if (corefChains == null) {return;}for (CorefChain chain : corefChains.values()) {CorefChain.CorefMention representative = chain.getRepresentativeMention();if (representative != null) {String representativeText = representative.mentionSpan;// 为共指链中的其他提及创建别名关系for (CorefChain.CorefMention mention : chain.getMentions()) {if (!mention.equals(representative)) {String mentionText = mention.mentionSpan;// 查找对应的实体Optional<EntityInfo> representativeEntity = findEntityByName(extractionData, representativeText);Optional<EntityInfo> mentionEntity = findEntityByName(extractionData, mentionText);if (representativeEntity.isPresent() && mentionEntity.isPresent()) {// 创建别名关系RelationInfo aliasRelation = new RelationInfo(generateRelationId(representativeEntity.get(), mentionEntity.get(), "ALIAS"),representativeEntity.get().getEntityId(),mentionEntity.get().getEntityId(),"ALIAS",representativeEntity.get().getDomain(),0.95);extractionData.addRelation(aliasRelation);}}}}}}/*** 后处理抽取结果*/private void postProcessExtraction(ExtractionData extractionData) {// 合并重复实体(基于名称和类型的简单合并)mergeDuplicateEntities(extractionData);// 验证和清理关系validateRelations(extractionData);// 计算统计信息calculateStatistics(extractionData);}/*** 合并重复实体*/private void mergeDuplicateEntities(ExtractionData extractionData) {Map<String, EntityInfo> entityMap = new HashMap<>();List<EntityInfo> uniqueEntities = new ArrayList<>();for (EntityInfo entity : extractionData.getEntities()) {String key = entity.getName() + "|" + entity.getType();EntityInfo existing = entityMap.get(key);if (existing == null) {entityMap.put(key, entity);uniqueEntities.add(entity);} else {// 合并置信度double newConfidence = 
Math.max(existing.getConfidence(), entity.getConfidence());existing.setConfidence(newConfidence);}}extractionData.setEntities(uniqueEntities);}/*** 验证关系*/private void validateRelations(ExtractionData extractionData) {List<RelationInfo> validRelations = extractionData.getRelations().stream().filter(relation -> {// 检查关系中的实体是否存在boolean startExists = extractionData.getEntities().stream().anyMatch(e -> e.getEntityId().equals(relation.getStartEntityId()));boolean endExists = extractionData.getEntities().stream().anyMatch(e -> e.getEntityId().equals(relation.getEndEntityId()));return startExists && endExists;}).collect(Collectors.toList());extractionData.setRelations(validRelations);}/*** 计算统计信息*/private void calculateStatistics(ExtractionData extractionData) {Map<String, Long> entityTypeCount = extractionData.getEntities().stream().collect(Collectors.groupingBy(EntityInfo::getType, Collectors.counting()));Map<String, Long> relationTypeCount = extractionData.getRelations().stream().collect(Collectors.groupingBy(RelationInfo::getType, Collectors.counting()));extractionData.getStatistics().put("entity_types", String.valueOf(entityTypeCount.size()));extractionData.getStatistics().put("relation_types", String.valueOf(relationTypeCount.size()));extractionData.getStatistics().put("total_confidence", String.valueOf(extractionData.getEntities().stream().mapToDouble(EntityInfo::getConfidence).average().orElse(0.0)));}// 工具方法private String normalizeEntityType(String nerType) {return ENTITY_TYPE_MAPPING.getOrDefault(nerType, "Other");}private boolean isValidEntity(String name, String type) {return name != null && !name.trim().isEmpty() && name.length() > 1;}private double calculateEntityConfidence(String name, String type) {// 基于实体长度和类型的简单置信度计算double confidence = 0.5;if (name.length() >= 3) confidence += 0.2;if (name.length() >= 5) confidence += 0.1;if (!"Other".equals(type)) confidence += 0.2;return Math.min(confidence, 1.0);}private double calculateRelationConfidence(EntityInfo 
entity1, EntityInfo entity2, String relationType) {// 基于实体置信度和关系类型的简单置信度计算return (entity1.getConfidence() + entity2.getConfidence()) / 2 * 0.8;}private String generateEntityId(String name, String type) {return String.format("ENT_%s_%s_%d", type.toUpperCase(), name.hashCode() & 0xfffffff, System.currentTimeMillis() % 10000);}private String generateRelationId(EntityInfo entity1, EntityInfo entity2, String relationType) {return String.format("REL_%s_%s_%s_%d", entity1.getEntityId(), relationType, entity2.getEntityId(),System.currentTimeMillis() % 10000);}private String generateAttributeId(EntityInfo entity, String key) {return String.format("ATTR_%s_%s_%d", entity.getEntityId(), key,System.currentTimeMillis() % 10000);}private Optional<EntityInfo> findEntityByName(ExtractionData extractionData, String name) {return extractionData.getEntities().stream().filter(e -> e.getName().equals(name)).findFirst();}private String getContextAround(String text, int position, int length, int contextSize) {int start = Math.max(0, position - contextSize);int end = Math.min(text.length(), position + length + contextSize);return text.substring(start, end);}private double calculateAverageConfidence(ExtractionData extractionData) {double entityAvg = extractionData.getEntities().stream().mapToDouble(EntityInfo::getConfidence).average().orElse(0.0);double relationAvg = extractionData.getRelations().stream().mapToDouble(RelationInfo::getConfidence).average().orElse(0.0);return (entityAvg + relationAvg) / 2;}private String convertToJson(List<?> list) {try {com.fasterxml.jackson.databind.ObjectMapper mapper = new com.fasterxml.jackson.databind.ObjectMapper();return mapper.writeValueAsString(list);} catch (Exception e) {log.error("Failed to convert list to JSON", e);return "[]";}}/*** 构建知识图谱*/private void buildKnowledgeGraph(ExtractionData extractionData, String domain, String source) {try {log.info("Building knowledge graph for domain: {}, source: {}", domain, source);// 创建实体节点for (EntityInfo 
entityInfo : extractionData.getEntities()) {EntityNode entityNode = new EntityNode(entityInfo.getEntityId(),entityInfo.getName(),entityInfo.getType(),domain);entityNode.setConfidence(entityInfo.getConfidence());entityNode.setSource(source);knowledgeGraphService.saveEntity(entityNode);}// 创建关系for (RelationInfo relationInfo : extractionData.getRelations()) {EntityNode startEntity = knowledgeGraphService.findEntityById(relationInfo.getStartEntityId());EntityNode endEntity = knowledgeGraphService.findEntityById(relationInfo.getEndEntityId());if (startEntity != null && endEntity != null) {startEntity.addRelation(endEntity, relationInfo.getType(), relationInfo.getConfidence());knowledgeGraphService.saveEntity(startEntity);}}// 创建属性for (AttributeInfo attributeInfo : extractionData.getAttributes()) {EntityNode entity = knowledgeGraphService.findEntityById(attributeInfo.getEntityId());if (entity != null) {entity.addAttribute(attributeInfo.getKey(), attributeInfo.getValue(), attributeInfo.getConfidence());knowledgeGraphService.saveEntity(entity);}}log.info("Knowledge graph built successfully: {} entities, {} relations, {} attributes",extractionData.getEntities().size(),extractionData.getRelations().size(),extractionData.getAttributes().size());} catch (Exception e) {log.error("Failed to build knowledge graph", e);throw new RuntimeException("知识图谱构建失败: " + e.getMessage());}}/*** 抽取数据封装类*/@Datapublic static class ExtractionData {private List<EntityInfo> entities = new ArrayList<>();private List<RelationInfo> relations = new ArrayList<>();private List<AttributeInfo> attributes = new ArrayList<>();private Map<String, String> statistics = new HashMap<>();public void addEntity(EntityInfo entity) {this.entities.add(entity);}public void addRelation(RelationInfo relation) {this.relations.add(relation);}public void addAttribute(AttributeInfo attribute) {this.attributes.add(attribute);}}/*** 实体信息类*/@Datapublic static class EntityInfo {private String entityId;private String name;private 
String type;private String domain;private Double confidence;private Integer startPosition;private Integer endPosition;private Map<String, Object> metadata = new HashMap<>();public EntityInfo(String entityId, String name, String type, String domain, Double confidence, Integer startPosition, Integer endPosition) {this.entityId = entityId;this.name = name;this.type = type;this.domain = domain;this.confidence = confidence;this.startPosition = startPosition;this.endPosition = endPosition;}}/*** 关系信息类*/@Datapublic static class RelationInfo {private String relationId;private String startEntityId;private String endEntityId;private String type;private String domain;private Double confidence;private Map<String, Object> metadata = new HashMap<>();public RelationInfo(String relationId, String startEntityId, String endEntityId, String type, String domain, Double confidence) {this.relationId = relationId;this.startEntityId = startEntityId;this.endEntityId = endEntityId;this.type = type;this.domain = domain;this.confidence = confidence;}}/*** 属性信息类*/@Datapublic static class AttributeInfo {private String attributeId;private String entityId;private String key;private String value;private String dataType;private String domain;private Double confidence;private Map<String, Object> metadata = new HashMap<>();public AttributeInfo(String attributeId, String entityId, String key, String value, String dataType, String domain, Double confidence) {this.attributeId = attributeId;this.entityId = entityId;this.key = key;this.value = value;this.dataType = dataType;this.domain = domain;this.confidence = confidence;}}
}
知识图谱问答服务
package com.company.kg.service;import com.company.kg.graph.EntityNode;
import com.company.kg.graph.EntityRelation;
import lombok.extern.slf4j.Slf4j;
import org.neo4j.ogm.session.Session;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.data.neo4j.core.Neo4jTemplate;
import org.springframework.stereotype.Service;import java.util.*;
import java.util.regex.Pattern;
import java.util.stream.Collectors;/*** 知识图谱问答服务 - 提供基于知识图谱的智能问答功能*/
@Service
@Slf4j
public class KnowledgeGraphQAService {@Autowiredprivate Neo4jTemplate neo4jTemplate;@Autowiredprivate Session neo4jSession;@Value("${kg.qa.max-path-length:3}")private int maxPathLength;@Value("${kg.qa.timeout:30000}")private int timeout;@Value("${kg.qa.enable-fallback:true}")private boolean enableFallback;// 问题模式映射private static final Map<Pattern, String> QUESTION_PATTERNS = Map.ofEntries(Map.entry(Pattern.compile("(.*)是谁"), "PERSON_QUERY"),Map.entry(Pattern.compile("(.*)是什么"), "ENTITY_QUERY"),Map.entry(Pattern.compile("(.*)在哪里"), "LOCATION_QUERY"),Map.entry(Pattern.compile("(.*)的(.*)是什么"), "ATTRIBUTE_QUERY"),Map.entry(Pattern.compile("(.*)和(.*)有什么关系"), "RELATION_QUERY"),Map.entry(Pattern.compile("(.*)有哪些属性"), "PROPERTIES_QUERY"),Map.entry(Pattern.compile("(.*)属于哪个(.*)"), "CATEGORY_QUERY"),Map.entry(Pattern.compile("哪些(.*)位于(.*)"), "LOCATED_ENTITY_QUERY"),Map.entry(Pattern.compile("(.*)成立于什么时候"), "FOUNDED_QUERY"),Map.entry(Pattern.compile("(.*)的创始人是谁"), "FOUNDER_QUERY"));/*** 回答自然语言问题*/public QAAnswer answerQuestion(String question, String domain) {long startTime = System.currentTimeMillis();QAAnswer answer = new QAAnswer();answer.setQuestion(question);answer.setDomain(domain);try {log.info("Processing question: {}", question);// 1. 问题解析QueryTemplate queryTemplate = parseQuestion(question, domain);answer.setQueryTemplate(queryTemplate);// 2. 转换为Cypher查询String cypherQuery = generateCypherQuery(queryTemplate);answer.setCypherQuery(cypherQuery);// 3. 执行查询List<Map<String, Object>> results = executeCypherQuery(cypherQuery);answer.setQueryResults(results);// 4. 生成自然语言回答String naturalAnswer = generateNaturalLanguageAnswer(results, queryTemplate);answer.setAnswer(naturalAnswer);// 5. 
设置回答元数据answer.setSuccess(true);answer.setProcessingTimeMs(System.currentTimeMillis() - startTime);answer.setResultCount(results.size());log.info("Question answered successfully: {} results in {}ms", results.size(), answer.getProcessingTimeMs());} catch (Exception e) {log.error("Failed to answer question: {}", question, e);answer.setSuccess(false);answer.setErrorMessage("回答问题失败: " + e.getMessage());// 降级回答if (enableFallback) {answer.setAnswer(generateFallbackAnswer(question));answer.setFallbackUsed(true);}}return answer;}/*** 解析自然语言问题*/public QueryTemplate parseQuestion(String question, String domain) {QueryTemplate template = new QueryTemplate();template.setOriginalQuestion(question);template.setDomain(domain);// 识别问题类型for (Map.Entry<Pattern, String> entry : QUESTION_PATTERNS.entrySet()) {java.util.regex.Matcher matcher = entry.getKey().matcher(question);if (matcher.find()) {template.setQuestionType(entry.getValue());// 提取实体和参数extractEntitiesAndParameters(matcher, template, question);break;}}// 如果未匹配到已知模式,使用默认查询if (template.getQuestionType() == null) {template.setQuestionType("ENTITY_SEARCH");template.getParameters().put("search_term", question);}return template;}/*** 提取实体和参数*/private void extractEntitiesAndParameters(java.util.regex.Matcher matcher, QueryTemplate template, String question) {String questionType = template.getQuestionType();switch (questionType) {case "PERSON_QUERY":template.getEntities().add(matcher.group(1));template.getParameters().put("entity_name", matcher.group(1));template.getParameters().put("entity_type", "Person");break;case "ENTITY_QUERY":template.getEntities().add(matcher.group(1));template.getParameters().put("entity_name", matcher.group(1));break;case "LOCATION_QUERY":template.getEntities().add(matcher.group(1));template.getParameters().put("entity_name", matcher.group(1));template.getParameters().put("relation_type", "LOCATED_IN");break;case 
"ATTRIBUTE_QUERY":template.getEntities().add(matcher.group(1));template.getParameters().put("entity_name", matcher.group(1));template.getParameters().put("attribute_key", matcher.group(2));break;case "RELATION_QUERY":template.getEntities().add(matcher.group(1));template.getEntities().add(matcher.group(2));template.getParameters().put("entity1_name", matcher.group(1));template.getParameters().put("entity2_name", matcher.group(2));break;case "PROPERTIES_QUERY":template.getEntities().add(matcher.group(1));template.getParameters().put("entity_name", matcher.group(1));break;case "CATEGORY_QUERY":template.getEntities().add(matcher.group(1));template.getParameters().put("entity_name", matcher.group(1));template.getParameters().put("category_type", matcher.group(2));break;case "LOCATED_ENTITY_QUERY":template.getParameters().put("entity_type", matcher.group(1));template.getEntities().add(matcher.group(2));template.getParameters().put("location_name", matcher.group(2));break;case "FOUNDED_QUERY":template.getEntities().add(matcher.group(1));template.getParameters().put("entity_name", matcher.group(1));template.getParameters().put("attribute_key", "founded_year");break;case "FOUNDER_QUERY":template.getEntities().add(matcher.group(1));template.getParameters().put("entity_name", matcher.group(1));template.getParameters().put("relation_type", "FOUNDED_BY");break;}// 查找实体在知识图谱中的存在resolveEntitiesInGraph(template);}/*** 解析知识图谱中的实体*/private void resolveEntitiesInGraph(QueryTemplate template) {for (String entityName : template.getEntities()) {List<EntityNode> matchedEntities = findEntitiesByName(entityName, template.getDomain());if (!matchedEntities.isEmpty()) {template.getResolvedEntities().addAll(matchedEntities);}}}/*** 生成Cypher查询*/public String generateCypherQuery(QueryTemplate template) {String questionType = template.getQuestionType();switch (questionType) {case "PERSON_QUERY":return generatePersonQuery(template);case "ENTITY_QUERY":return generateEntityQuery(template);case 
"LOCATION_QUERY":return generateLocationQuery(template);case "ATTRIBUTE_QUERY":return generateAttributeQuery(template);case "RELATION_QUERY":return generateRelationQuery(template);case "PROPERTIES_QUERY":return generatePropertiesQuery(template);case "CATEGORY_QUERY":return generateCategoryQuery(template);case "LOCATED_ENTITY_QUERY":return generateLocatedEntityQuery(template);case "FOUNDED_QUERY":return generateFoundedQuery(template);case "FOUNDER_QUERY":return generateFounderQuery(template);case "ENTITY_SEARCH":return generateEntitySearchQuery(template);default:return generateDefaultQuery(template);}}/*** 生成人物查询*/private String generatePersonQuery(QueryTemplate template) {String entityName = (String) template.getParameters().get("entity_name");return "MATCH (e:EntityNode) " +"WHERE e.name =~ $name AND e.type = 'Person' AND e.domain = $domain " +"RETURN e.name AS name, e.type AS type, e.description AS description, " +" e.confidence AS confidence, e.source AS source " +"LIMIT 10";}/*** 生成实体查询*/private String generateEntityQuery(QueryTemplate template) {String entityName = (String) template.getParameters().get("entity_name");return "MATCH (e:EntityNode) " +"WHERE e.name =~ $name AND e.domain = $domain " +"RETURN e.name AS name, e.type AS type, e.description AS description, " +" e.confidence AS confidence " +"LIMIT 10";}/*** 生成位置查询*/private String generateLocationQuery(QueryTemplate template) {String entityName = (String) template.getParameters().get("entity_name");String relationType = (String) template.getParameters().get("relation_type");return "MATCH (e:EntityNode)-[r:RELATED_TO]->(loc:EntityNode) " +"WHERE e.name =~ $name AND r.type = $relationType AND e.domain = $domain " +"RETURN loc.name AS location, r.type AS relation, r.confidence AS confidence " +"LIMIT 10";}/*** 生成属性查询*/private String generateAttributeQuery(QueryTemplate template) {String entityName = (String) template.getParameters().get("entity_name");String attributeKey = (String) 
template.getParameters().get("attribute_key");return "MATCH (e:EntityNode)-[a:HAS_ATTRIBUTE]->() " +"WHERE e.name =~ $name AND a.key = $attributeKey AND e.domain = $domain " +"RETURN a.key AS attribute_key, a.value AS attribute_value, " +" a.confidence AS confidence " +"LIMIT 10";}/*** 生成关系查询*/private String generateRelationQuery(QueryTemplate template) {String entity1Name = (String) template.getParameters().get("entity1_name");String entity2Name = (String) template.getParameters().get("entity2_name");return "MATCH (e1:EntityNode)-[r:RELATED_TO]-(e2:EntityNode) " +"WHERE e1.name =~ $entity1Name AND e2.name =~ $entity2Name " +"AND e1.domain = $domain AND e2.domain = $domain " +"RETURN e1.name AS entity1, e2.name AS entity2, r.type AS relation, " +" r.confidence AS confidence " +"LIMIT 10";}/*** 生成属性列表查询*/private String generatePropertiesQuery(QueryTemplate template) {String entityName = (String) template.getParameters().get("entity_name");return "MATCH (e:EntityNode)-[a:HAS_ATTRIBUTE]->() " +"WHERE e.name =~ $name AND e.domain = $domain " +"RETURN a.key AS attribute_key, a.value AS attribute_value, " +" a.data_type AS data_type, a.confidence AS confidence " +"ORDER BY a.confidence DESC " +"LIMIT 20";}/*** 生成分类查询*/private String generateCategoryQuery(QueryTemplate template) {String entityName = (String) template.getParameters().get("entity_name");String categoryType = (String) template.getParameters().get("category_type");return "MATCH (e:EntityNode)-[r:RELATED_TO]->(c:EntityNode) " +"WHERE e.name =~ $name AND c.type = $categoryType AND e.domain = $domain " +"RETURN c.name AS category, r.type AS relation, r.confidence AS confidence " +"LIMIT 10";}/*** 生成位置实体查询*/private String generateLocatedEntityQuery(QueryTemplate template) {String entityType = (String) template.getParameters().get("entity_type");String locationName = (String) template.getParameters().get("location_name");return "MATCH (e:EntityNode)-[r:RELATED_TO]->(loc:EntityNode) " +"WHERE e.type = $entityType 
AND loc.name =~ $locationName " +"AND r.type = 'LOCATED_IN' AND e.domain = $domain " +"RETURN e.name AS entity_name, e.type AS entity_type, " +" r.confidence AS confidence " +"LIMIT 10";}/*** 生成成立时间查询*/private String generateFoundedQuery(QueryTemplate template) {String entityName = (String) template.getParameters().get("entity_name");return "MATCH (e:EntityNode)-[a:HAS_ATTRIBUTE]->() " +"WHERE e.name =~ $name AND a.key = 'founded_year' AND e.domain = $domain " +"RETURN a.value AS founded_year, a.confidence AS confidence " +"LIMIT 5";}/*** 生成创始人查询*/private String generateFounderQuery(QueryTemplate template) {String entityName = (String) template.getParameters().get("entity_name");return "MATCH (e:EntityNode)<-[r:RELATED_TO]-(founder:EntityNode) " +"WHERE e.name =~ $name AND r.type = 'FOUNDED_BY' AND e.domain = $domain " +"RETURN founder.name AS founder_name, r.confidence AS confidence " +"LIMIT 10";}/*** 生成实体搜索查询*/private String generateEntitySearchQuery(QueryTemplate template) {String searchTerm = (String) template.getParameters().get("search_term");return "MATCH (e:EntityNode) " +"WHERE e.name =~ $searchTerm OR e.description =~ $searchTerm " +"AND e.domain = $domain " +"RETURN e.name AS name, e.type AS type, e.description AS description, " +" e.confidence AS confidence " +"ORDER BY e.confidence DESC " +"LIMIT 10";}/*** 生成默认查询*/private String generateDefaultQuery(QueryTemplate template) {return "MATCH (e:EntityNode) " +"WHERE e.domain = $domain " +"RETURN e.name AS name, e.type AS type " +"LIMIT 5";}/*** 执行Cypher查询*/private List<Map<String, Object>> executeCypherQuery(String cypherQuery) {try {Map<String, Object> parameters = new HashMap<>();// 这里应该根据查询类型设置相应的参数return neo4jTemplate.query(cypherQuery, parameters).queryResults();} catch (Exception e) {log.error("Failed to execute Cypher query: {}", cypherQuery, e);throw new RuntimeException("Cypher查询执行失败: " + e.getMessage());}}/*** 生成自然语言回答*/private String generateNaturalLanguageAnswer(List<Map<String, Object>> 
results, QueryTemplate template) {if (results.isEmpty()) {return "抱歉,我没有找到相关的信息。";}String questionType = template.getQuestionType();switch (questionType) {case "PERSON_QUERY":return generatePersonAnswer(results, template);case "ENTITY_QUERY":return generateEntityAnswer(results, template);case "LOCATION_QUERY":return generateLocationAnswer(results, template);case "ATTRIBUTE_QUERY":return generateAttributeAnswer(results, template);case "RELATION_QUERY":return generateRelationAnswer(results, template);case "PROPERTIES_QUERY":return generatePropertiesAnswer(results, template);default:return generateDefaultAnswer(results, template);}}/*** 生成人物回答*/private String generatePersonAnswer(List<Map<String, Object>> results, QueryTemplate template) {StringBuilder answer = new StringBuilder();String entityName = (String) template.getParameters().get("entity_name");if (results.size() == 1) {Map<String, Object> result = results.get(0);answer.append(entityName).append("是");answer.append(result.get("type")).append("。");if (result.get("description") != null) {answer.append(" ").append(result.get("description"));}} else {answer.append("找到了多个名为").append(entityName).append("的人物:\n");for (Map<String, Object> result : results) {answer.append("• ").append(result.get("name")).append(" (").append(result.get("type")).append(")");if (result.get("description") != null) {answer.append(" - ").append(result.get("description"));}answer.append("\n");}}return answer.toString();}/*** 生成实体回答*/private String generateEntityAnswer(List<Map<String, Object>> results, QueryTemplate template) {StringBuilder answer = new StringBuilder();String entityName = (String) template.getParameters().get("entity_name");if (results.size() == 1) {Map<String, Object> result = results.get(0);answer.append(entityName).append("是");answer.append(result.get("type")).append("。");if (result.get("description") != null) {answer.append(" ").append(result.get("description"));}} else 
{answer.append("找到了多个名为").append(entityName).append("的实体:\n");for (Map<String, Object> result : results) {answer.append("• ").append(result.get("name")).append(" (").append(result.get("type")).append(")");if (result.get("description") != null) {answer.append(" - ").append(result.get("description"));}answer.append("\n");}}return answer.toString();}/*** 生成位置回答*/private String generateLocationAnswer(List<Map<String, Object>> results, QueryTemplate template) {StringBuilder answer = new StringBuilder();String entityName = (String) template.getParameters().get("entity_name");if (!results.isEmpty()) {Map<String, Object> result = results.get(0);answer.append(entityName).append("位于").append(result.get("location")).append("。");} else {answer.append("没有找到").append(entityName).append("的位置信息。");}return answer.toString();}/*** 生成属性回答*/private String generateAttributeAnswer(List<Map<String, Object>> results, QueryTemplate template) {StringBuilder answer = new StringBuilder();String entityName = (String) template.getParameters().get("entity_name");String attributeKey = (String) template.getParameters().get("attribute_key");if (!results.isEmpty()) {Map<String, Object> result = results.get(0);answer.append(entityName).append("的").append(attributeKey).append("是");answer.append(result.get("attribute_value")).append("。");} else {answer.append("没有找到").append(entityName).append("的").append(attributeKey).append("信息。");}return answer.toString();}/*** 生成关系回答*/private String generateRelationAnswer(List<Map<String, Object>> results, QueryTemplate template) {StringBuilder answer = new StringBuilder();String entity1Name = (String) template.getParameters().get("entity1_name");String entity2Name = (String) template.getParameters().get("entity2_name");if (!results.isEmpty()) {Map<String, Object> result = results.get(0);answer.append(entity1Name).append("和").append(entity2Name).append("之间存在");answer.append(result.get("relation")).append("的关系。");} else 
{answer.append("没有找到").append(entity1Name).append("和").append(entity2Name).append("之间的关系。");}return answer.toString();}/*** 生成属性列表回答*/private String generatePropertiesAnswer(List<Map<String, Object>> results, QueryTemplate template) {StringBuilder answer = new StringBuilder();String entityName = (String) template.getParameters().get("entity_name");if (!results.isEmpty()) {answer.append(entityName).append("的属性包括:\n");for (Map<String, Object> result : results) {answer.append("• ").append(result.get("attribute_key")).append(": ");answer.append(result.get("attribute_value")).append("\n");}} else {answer.append("没有找到").append(entityName).append("的属性信息。");}return answer.toString();}/*** 生成默认回答*/private String generateDefaultAnswer(List<Map<String, Object>> results, QueryTemplate template) {StringBuilder answer = new StringBuilder();answer.append("找到以下相关信息:\n");for (Map<String, Object> result : results) {answer.append("• ").append(result.get("name")).append(" (").append(result.get("type")).append(")\n");}return answer.toString();}/*** 生成降级回答*/private String generateFallbackAnswer(String question) {return "抱歉,我暂时无法回答这个问题。您可以尝试:\n" +"1. 重新表述您的问题\n" +"2. 查询特定的实体或关系\n" +"3. 检查知识图谱中是否有相关数据\n\n" +"对于给您带来的不便,我们深表歉意。";}/*** 根据名称查找实体*/public List<EntityNode> findEntitiesByName(String name, String domain) {String query = "MATCH (e:EntityNode) " +"WHERE e.name =~ $name AND e.domain = $domain " +"RETURN e";Map<String, Object> parameters = new HashMap<>();parameters.put("name", "(?i).*" + name + ".*");parameters.put("domain", domain);return neo4jTemplate.query(query, parameters).to(EntityNode.class);}/*** 查找实体之间的关系路径*/public List<Map<String, Object>> findRelationPath(String entity1Id, String entity2Id, int maxPathLength) {String query = "MATCH path = (e1:EntityNode)-[r*1.." 
+ maxPathLength + "]-(e2:EntityNode) " +"WHERE e1.entityId = $entity1Id AND e2.entityId = $entity2Id " +"RETURN path, length(path) as pathLength " +"ORDER BY pathLength " +"LIMIT 10";Map<String, Object> parameters = new HashMap<>();parameters.put("entity1Id", entity1Id);parameters.put("entity2Id", entity2Id);return neo4jTemplate.query(query, parameters).queryResults();}/*** 问答答案封装类*/@Datapublic static class QAAnswer {private boolean success;private String question;private String domain;private String answer;private QueryTemplate queryTemplate;private String cypherQuery;private List<Map<String, Object>> queryResults;private String errorMessage;private boolean fallbackUsed;private Long processingTimeMs;private Integer resultCount;public QAAnswer() {this.success = false;this.queryResults = new ArrayList<>();}}/*** 查询模板类*/@Datapublic static class QueryTemplate {private String originalQuestion;private String questionType;private String domain;private List<String> entities = new ArrayList<>();private List<EntityNode> resolvedEntities = new ArrayList<>();private Map<String, Object> parameters = new HashMap<>();public QueryTemplate() {this.parameters = new HashMap<>();this.entities = new ArrayList<>();this.resolvedEntities = new ArrayList<>();}}
}
测试用例
信息抽取服务测试
package com.company.kg.service;import com.company.kg.entity.TextSource;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.test.context.junit.jupiter.SpringExtension;import static org.junit.jupiter.api.Assertions.*;/*** 信息抽取服务测试*/
@ExtendWith(SpringExtension.class)
@SpringBootTest
class InformationExtractionServiceTest {@Autowiredprivate InformationExtractionService extractionService;@Testvoid testExtractInformation() {String text = "张三在北京的清华大学工作。李四是清华大学的教授。清华大学位于北京市。";String domain = "test";InformationExtractionService.ExtractionData result = extractionService.extractInformation(text, domain);assertNotNull(result);assertTrue(result.getEntities().size() >= 3); // 张三, 李四, 清华大学, 北京, 北京市assertTrue(result.getRelations().size() >= 2); // 工作在, 位于// 验证实体assertTrue(result.getEntities().stream().anyMatch(e -> "张三".equals(e.getName()) && "Person".equals(e.getType())));assertTrue(result.getEntities().stream().anyMatch(e -> "清华大学".equals(e.getName()) && "Organization".equals(e.getType())));}@Testvoid testEntityExtraction() {String text = "苹果公司由史蒂夫·乔布斯在1976年创立。公司总部位于加利福尼亚州。";String domain = "test";InformationExtractionService.ExtractionData result = extractionService.extractInformation(text, domain);assertNotNull(result);// 验证实体识别var entities = result.getEntities();assertTrue(entities.stream().anyMatch(e -> "苹果公司".equals(e.getName())));assertTrue(entities.stream().anyMatch(e -> "史蒂夫·乔布斯".equals(e.getName())));assertTrue(entities.stream().anyMatch(e -> "加利福尼亚州".equals(e.getName())));}@Testvoid testRelationExtraction() {String text = "马云是阿里巴巴集团的创始人。阿里巴巴集团位于杭州市。";String domain = "test";InformationExtractionService.ExtractionData result = extractionService.extractInformation(text, domain);assertNotNull(result);// 验证关系抽取var relations = result.getRelations();assertTrue(relations.size() > 0);// 应该包含创建关系和位置关系boolean hasFounderRelation = relations.stream().anyMatch(r -> "FOUNDED_BY".equals(r.getType()) || "创始人".contains(r.getType()));boolean hasLocationRelation = relations.stream().anyMatch(r -> "LOCATED_IN".equals(r.getType()) || "位于".contains(r.getType()));assertTrue(hasFounderRelation || hasLocationRelation);}@Testvoid testEmptyText() {String text = "";String domain = "test";assertThrows(IllegalArgumentException.class, () -> 
{extractionService.extractInformation(text, domain);});}@Testvoid testTextSourceExtraction() {TextSource textSource = new TextSource("测试文档", TextSource.SourceType.TXT, "百度公司由李彦宏于2000年在北京创立。");textSource.setDomain("test");var result = extractionService.extractFromTextSource(textSource);assertNotNull(result);assertTrue(result.getEntityCount() > 0);assertTrue(result.getRelationCount() > 0);assertNotNull(result.getExtractionConfidence());}
}
知识图谱问答服务测试
package com.company.kg.service;import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.test.context.junit.jupiter.SpringExtension;import static org.junit.jupiter.api.Assertions.*;/*** 知识图谱问答服务测试*/
@ExtendWith(SpringExtension.class)
@SpringBootTest
class KnowledgeGraphQAServiceTest {@Autowiredprivate KnowledgeGraphQAService qaService;@Testvoid testParseQuestion() {String question = "马云是谁?";String domain = "test";KnowledgeGraphQAService.QueryTemplate template = qaService.parseQuestion(question, domain);assertNotNull(template);assertEquals("PERSON_QUERY", template.getQuestionType());assertEquals("马云", template.getParameters().get("entity_name"));assertEquals("Person", template.getParameters().get("entity_type"));}@Testvoid testGenerateCypherQuery() {KnowledgeGraphQAService.QueryTemplate template = new KnowledgeGraphQAService.QueryTemplate();template.setQuestionType("PERSON_QUERY");template.getParameters().put("entity_name", "马云");template.setDomain("test");String cypherQuery = qaService.generateCypherQuery(template);assertNotNull(cypherQuery);assertTrue(cypherQuery.contains("MATCH"));assertTrue(cypherQuery.contains("EntityNode"));assertTrue(cypherQuery.contains("Person"));}@Testvoid testAnswerQuestion() {String question = "清华大学在哪里?";String domain = "test";KnowledgeGraphQAService.QAAnswer answer = qaService.answerQuestion(question, domain);assertNotNull(answer);// 由于测试环境可能没有数据,主要测试流程是否正常assertTrue(answer.isSuccess() || answer.isFallbackUsed());assertNotNull(answer.getAnswer());}@Testvoid testFindEntitiesByName() {String name = "测试实体";String domain = "test";var entities = qaService.findEntitiesByName(name, domain);assertNotNull(entities);// 测试环境可能没有数据,主要测试方法是否正常执行}@Testvoid testQuestionTypes() {String[] questions = {"马云是谁?","阿里巴巴是什么?", "清华大学在哪里?","马云的年龄是多少?","马云和阿里巴巴有什么关系?","阿里巴巴有哪些属性?"};String domain = "test";for (String question : questions) {KnowledgeGraphQAService.QueryTemplate template = qaService.parseQuestion(question, domain);assertNotNull(template);assertNotNull(template.getQuestionType());assertFalse(template.getQuestionType().isEmpty());// 生成Cypher查询应该不抛出异常String cypherQuery = qaService.generateCypherQuery(template);assertNotNull(cypherQuery);assertFalse(cypherQuery.isEmpty());}}
}
Docker部署
Dockerfile
FROM openjdk:11-jre-slim

# Install system dependencies (curl is used by the health check below)
RUN apt-get update && apt-get install -y \
    curl \
    gnupg \
    && rm -rf /var/lib/apt/lists/*

# Create the application directory
WORKDIR /app

# Create a non-root user
RUN groupadd -r appuser && useradd -r -g appuser appuser

# Copy the application JAR
COPY target/kg-qa-system-1.0.0.jar app.jar

# Create the storage directories and hand them to the non-root user
RUN mkdir -p /app/uploads /app/processed /app/logs && \
    chown -R appuser:appuser /app

# Run as the non-root user
USER appuser

# Expose the application port
EXPOSE 8082

# Health check against the actuator endpoint
HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:8082/kg-qa/actuator/health || exit 1

# Start the application
ENTRYPOINT ["java", "-jar", "app.jar"]
docker-compose.yml
version: '3.8'

services:
  kg-qa-system:
    build: .
    ports:
      - "8082:8082"
    environment:
      - SPRING_PROFILES_ACTIVE=prod
      - SPRING_DATASOURCE_URL=jdbc:mysql://mysql:3306/kg_qa_system
      - SPRING_DATA_NEO4J_URI=bolt://neo4j:7687
      - SPRING_DATA_NEO4J_USERNAME=neo4j
      - SPRING_DATA_NEO4J_PASSWORD=password
      - SPRING_REDIS_HOST=redis
    depends_on:
      - mysql
      - redis
      - neo4j
    volumes:
      - ./uploads:/app/uploads
      - ./processed:/app/processed
      - ./logs:/app/logs
    networks:
      - kg-network

  mysql:
    image: mysql:8.0
    environment:
      - MYSQL_ROOT_PASSWORD=rootpassword
      - MYSQL_DATABASE=kg_qa_system
      - MYSQL_USER=kg_user
      - MYSQL_PASSWORD=kg_password
    volumes:
      - mysql_data:/var/lib/mysql
    networks:
      - kg-network

  redis:
    image: redis:7-alpine
    command: redis-server --appendonly yes
    volumes:
      - redis_data:/data
    networks:
      - kg-network

  neo4j:
    image: neo4j:4.4
    environment:
      - NEO4J_AUTH=neo4j/password
      # Neo4j 4.x images read NEO4JLABS_PLUGINS; NEO4J_PLUGINS is the 5.x name.
      - NEO4JLABS_PLUGINS=["apoc"]
    volumes:
      - neo4j_data:/data
      - neo4j_logs:/logs
    ports:
      - "7474:7474"
      - "7687:7687"
    networks:
      - kg-network

volumes:
  mysql_data:
  redis_data:
  neo4j_data:
  neo4j_logs:

networks:
  kg-network:
    driver: bridge
使用Docker快速启动
# Clone the project
git clone <repository-url>
cd kg-qa-system

# Build the project
mvn clean package

# Start all services
docker-compose up -d

# Follow the application logs
docker-compose logs -f kg-qa-system
手动启动
# Create the database
mysql -u root -p -e "CREATE DATABASE kg_qa_system;"

# Start Neo4j
neo4j start

# Build the project
mvn clean package

# Start the application
java -jar target/kg-qa-system-1.0.0.jar
前端界面: http://localhost:8082/kg-qa
API文档: http://localhost:8082/kg-qa/swagger-ui.html
健康检查: http://localhost:8082/kg-qa/actuator/health
Neo4j浏览器: http://localhost:7474 (用户名: neo4j, 密码: password)
上传示例数据
# Create a sample text file
echo "阿里巴巴由马云在1999年创立。公司总部位于杭州市。马云是阿里巴巴的主要创始人。" > example.txt

# Upload the file and run extraction on it
curl -X POST "http://localhost:8082/kg-qa/api/v1/extraction/upload-file" \
  -F "file=@example.txt" \
  -F "domain=default" \
  -F "category=company"
测试问答功能
# Ask a question against the "default" domain
curl -X POST "http://localhost:8082/kg-qa/api/v1/qa/answer" \
  -H "Content-Type: application/json" \
  -d '{"question": "阿里巴巴的创始人是谁?","domain": "default"}'
信息抽取
# Extract entities and relations from inline text
curl -X POST "http://localhost:8082/kg-qa/api/v1/extraction/extract-text" \
  -H "Content-Type: application/json" \
  -d '{"text": "清华大学位于北京市海淀区。清华大学是中国著名的高等学府。","domain": "education"}'
知识图谱问答
# Ask a question against the "education" domain
curl -X POST "http://localhost:8082/kg-qa/api/v1/qa/answer" \
  -H "Content-Type: application/json" \
  -d '{"question": "清华大学在哪里?","domain": "education"}'
执行Cypher查询
# Run a raw Cypher query through the API
curl -X POST "http://localhost:8082/kg-qa/api/v1/qa/execute-cypher" \
  -H "Content-Type: application/json" \
  -d '{"query": "MATCH (e:EntityNode) WHERE e.domain = \"education\" RETURN e.name, e.type LIMIT 5"}'
