Dify Parent-Child Mode Explained: How It Achieves Modularity and Efficient Collaboration
I. Parent-Child Mode Implementation Analysis
1. How does parent-child mode split text and establish parent-child relationships?
1.1 Splitting Strategy
Parent-child mode supports two parent-node modes:

PARAGRAPH mode (paragraph-level parent chunks):
```python
if rules.parent_mode == ParentMode.PARAGRAPH:
    # Split the text documents into nodes.
    splitter = self._get_splitter(
        processing_rule_mode=process_rule.get("mode"),
        max_tokens=rules.segmentation.max_tokens,
        chunk_overlap=rules.segmentation.chunk_overlap,
        separator=rules.segmentation.separator,
        embedding_model_instance=kwargs.get("embedding_model_instance"),
    )
    for document in documents:
        # document clean
        document_text = CleanProcessor.clean(document.page_content, process_rule)
        document.page_content = document_text
        # parse document to nodes
        document_nodes = splitter.split_documents([document])
        split_documents = []
        for document_node in document_nodes:
            if document_node.page_content.strip():
                doc_id = str(uuid.uuid4())
                hash = helper.generate_text_hash(document_node.page_content)
                document_node.metadata["doc_id"] = doc_id
                document_node.metadata["doc_hash"] = hash
                # delete Splitter character
                page_content = document_node.page_content
                if page_content.startswith(".") or page_content.startswith("。"):
                    page_content = page_content[1:].strip()
                else:
                    page_content = page_content
                if len(page_content) > 0:
                    document_node.page_content = page_content
                    # parse document to child nodes
                    child_nodes = self._split_child_nodes(
                        document_node, rules, process_rule.get("mode"), kwargs.get("embedding_model_instance")
                    )
                    document_node.children = child_nodes
                    split_documents.append(document_node)
        all_documents.extend(split_documents)
```
FULL_DOC mode (full-document parent chunk):
```python
elif rules.parent_mode == ParentMode.FULL_DOC:
    page_content = "\n".join([document.page_content for document in documents])
    document = Document(page_content=page_content, metadata=documents[0].metadata)
    # parse document to child nodes
    child_nodes = self._split_child_nodes(
        document, rules, process_rule.get("mode"), kwargs.get("embedding_model_instance")
    )
    document.children = child_nodes
    doc_id = str(uuid.uuid4())
    hash = helper.generate_text_hash(document.page_content)
    document.metadata["doc_id"] = doc_id
    document.metadata["doc_hash"] = hash
    all_documents.append(document)
```
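To see the practical difference between the two modes, here is a minimal, self-contained sketch with a toy splitter (not Dify's actual splitter; `toy_split` and the separators are purely illustrative):

```python
def toy_split(text: str, sep: str) -> list[str]:
    """Naive stand-in for Dify's text splitter: split on a separator, drop blanks."""
    return [part.strip() for part in text.split(sep) if part.strip()]

doc = "Paragraph one. More text.\n\nParagraph two. Even more text."

# PARAGRAPH mode: each paragraph becomes one parent chunk,
# and children are split within that paragraph.
parents_paragraph = [
    {"parent": p, "children": toy_split(p, ". ")}
    for p in toy_split(doc, "\n\n")
]

# FULL_DOC mode: the whole document is a single parent chunk,
# and children are split from the full text.
parents_full_doc = [{"parent": doc, "children": toy_split(doc, ". ")}]

print(len(parents_paragraph), len(parents_full_doc))  # -> 2 1
```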
1.2 Child Node Splitting
Child nodes are split by the `_split_child_nodes` method:
```python
def _split_child_nodes(
    self,
    document_node: Document,
    rules: Rule,
    process_rule_mode: str,
    embedding_model_instance: Optional[ModelInstance],
) -> list[ChildDocument]:
    child_splitter = self._get_splitter(
        processing_rule_mode=process_rule_mode,
        max_tokens=rules.subchunk_segmentation.max_tokens,
        chunk_overlap=rules.subchunk_segmentation.chunk_overlap,
        separator=rules.subchunk_segmentation.separator,
        embedding_model_instance=embedding_model_instance,
    )
    # parse document to child nodes
    child_nodes = []
    child_documents = child_splitter.split_documents([document_node])
    for child_document_node in child_documents:
        if child_document_node.page_content.strip():
            doc_id = str(uuid.uuid4())
            hash = helper.generate_text_hash(child_document_node.page_content)
            child_document = ChildDocument(
                page_content=child_document_node.page_content, metadata=document_node.metadata
            )
            child_document.metadata["doc_id"] = doc_id
            child_document.metadata["doc_hash"] = hash
            child_page_content = child_document.page_content
            if child_page_content.startswith(".") or child_page_content.startswith("。"):
                child_page_content = child_page_content[1:].strip()
            if len(child_page_content) > 0:
                child_document.page_content = child_page_content
                child_nodes.append(child_document)
    return child_nodes
```
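For orientation, here is a hedged sketch of the processing rule that drives the code above: `segmentation` configures the parent splitter, `subchunk_segmentation` the child splitter. The `mode` string and all numeric values are illustrative assumptions, not confirmed defaults:

```python
# Illustrative shape of a parent-child process rule (values are assumptions):
process_rule = {
    "mode": "hierarchical",  # assumed mode name for parent-child indexing
    "rules": {
        "parent_mode": "paragraph",  # or "full_doc"
        "segmentation": {            # drives the parent splitter
            "separator": "\n\n",
            "max_tokens": 500,
            "chunk_overlap": 50,
        },
        "subchunk_segmentation": {   # drives the child splitter
            "separator": "\n",
            "max_tokens": 200,
            "chunk_overlap": 20,
        },
    },
}
```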
1.3 Vector Storage
Only child nodes are stored in the vector database:
```python
def load(self, dataset: Dataset, documents: list[Document], with_keywords: bool = True, **kwargs):
    if dataset.indexing_technique == "high_quality":
        vector = Vector(dataset)
        for document in documents:
            child_documents = document.children
            if child_documents:
                formatted_child_documents = [
                    Document(**child_document.model_dump()) for child_document in child_documents
                ]
                vector.create(formatted_child_documents)
```
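Conceptually, this splits the storage in two. The following sketch uses hypothetical structures, not Dify's actual schema:

```python
# Sketch of the storage split (hypothetical structures, not Dify's schema):
parent_store = {
    # Parent segments live in the relational store (document_segments in Dify).
    "seg-A": "Full parent paragraph text ...",
}
vector_index = [
    # Only child chunks are embedded; each carries metadata pointing back
    # to its parent segment (via ChildChunk.index_node_id in Dify).
    {"doc_id": "child-1", "segment_id": "seg-A", "text": "child chunk 1 ..."},
    {"doc_id": "child-2", "segment_id": "seg-A", "text": "child chunk 2 ..."},
]
# A query is matched against child vectors only; a hit's segment_id is then
# used to pull the full parent text out of parent_store for the final context.
```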
2. How Parent-Child Mode Builds the Recalled Text at Retrieval Time, and How TopK Is Controlled
2.1 Retrieval Flow
At retrieval time, child nodes are recalled first, and the parent-child relationships are then reconstructed by the `format_retrieval_documents` method:
```python
@staticmethod
def format_retrieval_documents(documents: list[Document]) -> list[RetrievalSegments]:
    records = []
    include_segment_ids = []
    segment_child_map = {}
    for document in documents:
        document_id = document.metadata["document_id"]
        dataset_document = db.session.query(DatasetDocument).filter(DatasetDocument.id == document_id).first()
        if dataset_document and dataset_document.doc_form == IndexType.PARENT_CHILD_INDEX:
            child_index_node_id = document.metadata["doc_id"]
            result = (
                db.session.query(ChildChunk, DocumentSegment)
                .join(DocumentSegment, ChildChunk.segment_id == DocumentSegment.id)
                .filter(
                    ChildChunk.index_node_id == child_index_node_id,
                    DocumentSegment.dataset_id == dataset_document.dataset_id,
                    DocumentSegment.enabled == True,
                    DocumentSegment.status == "completed",
                )
                .first()
            )
            if result:
                child_chunk, segment = result
                if not segment:
                    continue
                if segment.id not in include_segment_ids:
                    include_segment_ids.append(segment.id)
                    child_chunk_detail = {
                        "id": child_chunk.id,
                        "content": child_chunk.content,
                        "position": child_chunk.position,
                        "score": document.metadata.get("score", 0.0),
                    }
                    map_detail = {
                        "max_score": document.metadata.get("score", 0.0),
                        "child_chunks": [child_chunk_detail],
                    }
                    segment_child_map[segment.id] = map_detail
                    record = {
                        "segment": segment,
                    }
                    records.append(record)
                else:
                    child_chunk_detail = {
                        "id": child_chunk.id,
                        "content": child_chunk.content,
                        "position": child_chunk.position,
                        "score": document.metadata.get("score", 0.0),
                    }
                    segment_child_map[segment.id]["child_chunks"].append(child_chunk_detail)
                    segment_child_map[segment.id]["max_score"] = max(
                        segment_child_map[segment.id]["max_score"], document.metadata.get("score", 0.0)
                    )
            else:
                continue
        else:
            index_node_id = document.metadata["doc_id"]
            segment = (
                db.session.query(DocumentSegment)
                .filter(
                    DocumentSegment.dataset_id == dataset_document.dataset_id,
                    DocumentSegment.enabled == True,
                    DocumentSegment.status == "completed",
                    DocumentSegment.index_node_id == index_node_id,
                )
                .first()
            )
            if not segment:
                continue
            include_segment_ids.append(segment.id)
            record = {
                "segment": segment,
                "score": document.metadata.get("score", None),
            }
            records.append(record)
    for record in records:
        if record["segment"].id in segment_child_map:
            record["child_chunks"] = segment_child_map[record["segment"].id].get("child_chunks", None)
            record["score"] = segment_child_map[record["segment"].id]["max_score"]
    return [RetrievalSegments(**record) for record in records]
```
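The method above does a lot of ORM work, but stripped to its essence the aggregation is just "group child hits by parent, keep the max score". A minimal sketch (the `ChildHit` type and field names are hypothetical, not Dify code):

```python
from dataclasses import dataclass

@dataclass
class ChildHit:
    """Hypothetical, simplified stand-in for a recalled child chunk."""
    parent_id: str   # id of the parent segment this child belongs to
    content: str
    score: float

def aggregate_by_parent(child_hits: list[ChildHit]) -> list[dict]:
    """Group recalled child chunks under their parent segment;
    the parent's score is the max score of its children."""
    grouped: dict[str, dict] = {}
    for hit in child_hits:
        entry = grouped.setdefault(hit.parent_id, {"child_chunks": [], "max_score": 0.0})
        entry["child_chunks"].append({"content": hit.content, "score": hit.score})
        entry["max_score"] = max(entry["max_score"], hit.score)
    return [
        {"segment_id": pid, "score": v["max_score"], "child_chunks": v["child_chunks"]}
        for pid, v in grouped.items()
    ]

hits = [
    ChildHit("seg-A", "child a1", 0.91),
    ChildHit("seg-B", "child b1", 0.80),
    ChildHit("seg-A", "child a2", 0.75),
]
print(aggregate_by_parent(hits))
# Two parent records: seg-A scored 0.91 (max of its children), seg-B scored 0.80.
```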
2.2 TopK Control Mechanism
TopK in the vector retrieval stage:
```python
@classmethod
def embedding_search(
    cls,
    flask_app: Flask,
    dataset_id: str,
    query: str,
    top_k: int,
    score_threshold: Optional[float],
    reranking_model: Optional[dict],
    all_documents: list,
    retrieval_method: str,
    exceptions: list,
):
    with flask_app.app_context():
        try:
            dataset = cls._get_dataset(dataset_id)
            vector = Vector(dataset=dataset)
            documents = vector.search_by_vector(
                query,
                search_type="similarity_score_threshold",
                top_k=top_k,
                score_threshold=score_threshold,
                filter={"group_id": [dataset.id]},
            )
```
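In other words, `top_k` bounds how many child vectors come back from this stage, with `score_threshold` filtering them further. A conceptual sketch, not the `Vector` class's real internals (the exact ordering of truncation versus thresholding varies by vector store):

```python
from typing import Optional

def vector_search(scored_children: list[tuple[str, float]], top_k: int,
                  score_threshold: Optional[float]) -> list[tuple[str, float]]:
    """scored_children: (child_chunk_id, similarity_score) pairs, pre-scored
    against the query. Returns at most top_k child hits above the threshold."""
    ranked = sorted(scored_children, key=lambda pair: pair[1], reverse=True)
    hits = ranked[:top_k]
    if score_threshold is not None:
        hits = [(cid, s) for cid, s in hits if s >= score_threshold]
    return hits

print(vector_search([("c1", 0.9), ("c2", 0.4), ("c3", 0.7)], top_k=2, score_threshold=0.5))
# [('c1', 0.9), ('c3', 0.7)]
```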
TopK (`top_n`) in the reranking stage, which also operates on the recalled child chunks, before they are aggregated into parent chunks:
```python
rerank_result = self.rerank_model_instance.invoke_rerank(
    query=query, docs=docs, score_threshold=score_threshold, top_n=top_n, user=user
)
rerank_documents = []
for result in rerank_result.docs:
    if score_threshold is None or result.score >= score_threshold:
        # format document
        rerank_document = Document(
            page_content=result.text,
            metadata=documents[result.index].metadata,
            provider=documents[result.index].provider,
        )
        if rerank_document.metadata is not None:
            rerank_document.metadata["score"] = result.score
            rerank_documents.append(rerank_document)
rerank_documents.sort(key=lambda x: x.metadata.get("score", 0.0), reverse=True)
return rerank_documents[:top_n] if top_n else rerank_documents
```
2.3 Building the Recalled Text
Parent-child aggregation:
- The system first recalls the child chunks
- Child chunks that belong to the same parent node are aggregated via `segment_child_map`
- The highest score among a parent's child chunks becomes that parent node's score
- The final result is a list of `RetrievalSegments` objects carrying the parent-child relationship
Text content construction:
```python
class RetrievalSegments(BaseModel):
    """Retrieval segments."""

    model_config = {"arbitrary_types_allowed": True}
    segment: DocumentSegment
    child_chunks: Optional[list[RetrievalChildChunk]] = None
    score: Optional[float] = None
```
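Downstream consumers then build the model context from the parent segments, not the child chunks. A hedged sketch of that final step (the `segment_content` key and the score-based ordering are assumptions for illustration, not Dify's actual prompt assembly):

```python
def build_context(records: list[dict]) -> str:
    """Assemble the context string from parent-level records,
    ordered by the inherited max child score (assumed ordering)."""
    ordered = sorted(records, key=lambda r: r.get("score") or 0.0, reverse=True)
    return "\n\n".join(r["segment_content"] for r in ordered)

records = [
    {"segment_content": "Parent paragraph B ...", "score": 0.80},
    {"segment_content": "Parent paragraph A ...", "score": 0.91},
]
print(build_context(records))  # A's text first, then B's
```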
3. Summary of Key Characteristics
- Layered storage: only child nodes are stored in the vector database; parent nodes exist only as a logical structure
- Smart aggregation: at retrieval time, related child nodes are automatically aggregated under their parent node
- Score inheritance: a parent node's score is the highest score among its matched child nodes
- Flexible configuration: two parent-node strategies are supported, paragraph mode and full-doc mode
- TopK control: TopK is applied in both the vector retrieval and reranking stages. `top_k` first acts on child-chunk recall/reranking; when the results are assembled, they are aggregated and deduplicated by parent chunk, and the final output is parent-level records, with no backfilling to reach `top_k`.
For example: with `top_k=10`, retrieval hits 10 child chunks, but if they belong to only 7 parent chunks, the final result is 7 records.
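A quick simulation of this example (illustrative only; segment ids and scores are made up):

```python
child_hits = [
    ("seg-1", 0.95), ("seg-2", 0.93), ("seg-1", 0.90), ("seg-3", 0.88), ("seg-4", 0.86),
    ("seg-2", 0.85), ("seg-5", 0.84), ("seg-6", 0.83), ("seg-7", 0.82), ("seg-1", 0.81),
]  # 10 recalled child chunks spread over 7 distinct parents

parents: dict[str, float] = {}
for parent_id, score in child_hits:
    # Deduplicate by parent; keep the max child score as the parent's score.
    parents[parent_id] = max(parents.get(parent_id, 0.0), score)

print(len(parents))  # 7 -> seven parent-level records are returned, not 10
```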