LangChain企业知识库权限控制方案
基于LangChain框架实现企业知识库的文档权限控制,需要从多个层面进行设计。以下是完整的解决方案:
1. 系统架构设计
from langchain.schema import Document
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from typing import List, Dict, Set
import json


class PermissionAwareKnowledgeBase:
    """Skeleton knowledge base that carries a department permission table.

    Attributes:
        vector_store: Slot for the backing vector store. It is initialised
            to ``None`` and never assigned by this class.
        department_permissions: Maps a department name to a set of
            department names.
    """

    def __init__(self):
        self.vector_store = None
        # NOTE(review): nothing in this class reads the table, so it is not
        # established here whether a key is the reading department or the
        # owning department — confirm before relying on it.
        self.department_permissions = {
            "finance": {"finance", "hr", "management"},
            "technology": {"technology", "rd", "management"},
            "hr": {"hr", "management"},
            "sales": {"sales", "marketing", "management"},
            "management": {"finance", "technology", "hr", "sales", "management"},
        }
2. 文档权限元数据设计
from pydantic import BaseModel
from datetime import datetime
from typing import Optional


class DocumentMetadata(BaseModel):
    """Schema describing the permission metadata attached to a document."""

    department: str  # department the document belongs to
    visible_departments: List[str]  # departments allowed to read the document
    security_level: str  # one of: public, internal, confidential, secret
    owner: str  # document owner
    created_time: datetime
    expires_at: Optional[datetime] = None  # no expiry unless set


class PermissionAwareDocument(Document):
    """A ``Document`` whose metadata always carries the permission fields."""

    def __init__(self, page_content: str, metadata: Optional[Dict] = None):
        """Build the document, filling in any missing permission fields.

        Args:
            page_content: Text of the document (or chunk).
            metadata: Optional metadata. Keys that are already present are
                kept as-is; missing permission keys receive the defaults
                below. The caller's dict is copied, never modified.
        """
        # Work on a copy so the defaults do not leak into the caller's dict.
        metadata = dict(metadata) if metadata else {}

        # Guarantee that the permission metadata is present.
        metadata.setdefault("department", "general")
        metadata.setdefault("visible_departments", [])
        metadata.setdefault("security_level", "internal")
        metadata.setdefault("owner", "system")
        # NOTE(review): this default is a datetime object and
        # visible_departments is a list; some vector stores presumably accept
        # only scalar metadata values — confirm against the store in use.
        metadata.setdefault("created_time", datetime.now())

        super().__init__(page_content=page_content, metadata=metadata)
3. 用户认证和权限管理
class User:
    """An authenticated user together with the departments they may access.

    Attributes:
        user_id: Identifier of the user.
        department: The user's own department.
        roles: Role names held by the user.
        permissions: Set of department names derived from ``department``
            and ``roles`` (see ``_calculate_permissions``).
    """

    # Extra department permissions granted by each role; roles that are not
    # listed here grant nothing.
    _ROLE_PERMISSIONS = {
        "manager": ("management",),
        "director": ("management",),
        "admin": ("finance", "technology", "hr", "sales", "management"),
    }

    def __init__(self, user_id: str, department: str, roles: List[str]):
        self.user_id = user_id
        self.department = department
        self.roles = roles
        self.permissions = self._calculate_permissions()

    def _calculate_permissions(self) -> Set[str]:
        """Return the user's own department plus everything their roles grant."""
        granted = {self.department}
        for role in self.roles:
            granted.update(self._ROLE_PERMISSIONS.get(role, ()))
        return granted


class AuthManager:
    """Resolves tokens to users and decides whether a user may read a document."""

    def __init__(self):
        # token -> {"user_id": ..., "department": ..., "roles": [...]};
        # consulted by _validate_token.
        self.users = {}

    def authenticate_user(self, token: str) -> User:
        """Return the ``User`` identified by ``token``.

        Raises:
            PermissionError: If the token is not known to ``_validate_token``.
        """
        user_data = self._validate_token(token)
        return User(
            user_id=user_data["user_id"],
            department=user_data["department"],
            roles=user_data["roles"],
        )

    def _validate_token(self, token: str) -> Dict:
        """Look up the user record registered for ``token`` in ``self.users``.

        This in-memory lookup is the hook where a real deployment would call
        LDAP, OAuth or a similar identity system.

        Raises:
            PermissionError: If no record is registered for ``token``.
        """
        try:
            return self.users[token]
        except KeyError:
            raise PermissionError("无效或未注册的token") from None

    def can_access_document(self, user: User, document_metadata: Dict) -> bool:
        """Return whether ``user`` may read a document with this metadata.

        Access is granted when any of the following holds:

        * the document's ``department`` is in ``user.permissions``;
        * one of its ``visible_departments`` is in ``user.permissions``;
        * its ``security_level`` is ``"public"``.

        Other security levels neither grant nor restrict access here.
        """
        if document_metadata.get("department", "") in user.permissions:
            return True

        # A stored None is treated like an empty list instead of raising.
        visible_departments = document_metadata.get("visible_departments") or []
        if any(dept in user.permissions for dept in visible_departments):
            return True

        return document_metadata.get("security_level", "internal") == "public"
4. 权限感知的向量存储
class PermissionAwareVectorStore:
    """Chroma-backed vector store that checks permissions on writes and reads."""

    def __init__(self, embedding_model, persist_directory: str = "./chroma_db"):
        """Create the Chroma store and the ``AuthManager`` used for read checks.

        Args:
            embedding_model: Embedding function handed to Chroma.
            persist_directory: Directory handed to Chroma for persistence.
        """
        self.embedding_model = embedding_model
        self.vector_store = Chroma(
            persist_directory=persist_directory,
            embedding_function=embedding_model,
        )
        self.auth_manager = AuthManager()

    def add_documents(self, documents: List[PermissionAwareDocument], user: User):
        """Add ``documents`` after checking that ``user`` may write all of them.

        Every document is validated before anything is written, so a single
        forbidden document rejects the whole batch.

        Raises:
            PermissionError: If a document's ``department`` is not in
                ``user.permissions`` and the user does not hold the
                ``admin`` role.
        """
        if not documents:
            # Nothing to validate or store; skip the call to the backend.
            return

        is_admin = "admin" in user.roles
        for doc in documents:
            doc_department = doc.metadata.get("department")
            if doc_department not in user.permissions and not is_admin:
                raise PermissionError(f"用户无权添加{doc_department}部门的文档")

        self.vector_store.add_documents(documents)

    def similarity_search(
        self, query: str, user: User, *, candidate_multiplier: int = 1, **kwargs
    ) -> List[Document]:
        """Run a similarity search and drop documents ``user`` may not read.

        Filtering happens after retrieval, so with the default settings the
        result may contain fewer than ``k`` documents.

        Args:
            query: Query text.
            user: User on whose behalf the search runs.
            candidate_multiplier: With the default ``1`` the backend is
                queried exactly as requested. With a larger value,
                ``k * candidate_multiplier`` candidates are fetched and the
                permitted ones are truncated back to ``k``, which lowers the
                chance that filtering leaves the caller short of results.
            **kwargs: Forwarded to the backend's ``similarity_search``.

        Raises:
            ValueError: If ``candidate_multiplier`` is smaller than 1.
        """
        if candidate_multiplier < 1:
            raise ValueError("candidate_multiplier must be >= 1")

        limit = None
        if candidate_multiplier > 1:
            # 4 is presumably the backend's own default for k — confirm
            # against the installed LangChain version.
            limit = kwargs.pop("k", 4)
            kwargs["k"] = limit * candidate_multiplier

        candidates = self.vector_store.similarity_search(query, **kwargs)
        permitted = [
            doc
            for doc in candidates
            if self.auth_manager.can_access_document(user, doc.metadata)
        ]
        return permitted if limit is None else permitted[:limit]

    def search_by_department(self, query: str, user: User, department: str = None, **kwargs):
        """Search with a metadata filter on the ``department`` field.

        When ``department`` is given, only that department is searched;
        otherwise the search covers every department in ``user.permissions``.
        Results are selected by the ``department`` metadata alone and are not
        passed through ``AuthManager.can_access_document``.

        Raises:
            PermissionError: If ``department`` is given and is not in
                ``user.permissions``.
        """
        if department and department not in user.permissions:
            raise PermissionError(f"无权访问{department}部门的文档")

        if department:
            filter_dict = {"department": department}
        else:
            # Restrict the search to the departments the user may read.
            filter_dict = {"department": {"$in": list(user.permissions)}}

        return self.vector_store.similarity_search(query, filter=filter_dict, **kwargs)
5. 完整的知识库实现
class EnterpriseKnowledgeBase:
    """Facade tying together embeddings, the permission-aware store and auth."""

    def __init__(self):
        self.embedding_model = OpenAIEmbeddings()
        self.vector_store = PermissionAwareVectorStore(self.embedding_model)
        # Reuse the store's AuthManager so authentication and read checks
        # go through one instance.
        self.auth_manager = self.vector_store.auth_manager

    def process_document(
        self,
        file_path: str,
        department: str,
        owner: str,
        security_level: str = "internal",
        visible_departments: List[str] = None,
        user_token: str = None,
    ):
        """Load a file, split it into chunks and attach permission metadata.

        Args:
            file_path: File to load. Paths ending in ``.pdf`` (any case) use
                ``PyPDFLoader``; every other path uses ``TextLoader``.
            department: Department that owns the document.
            owner: Document owner.
            security_level: Security level stored on every chunk.
            visible_departments: Additional departments allowed to read the
                document; ``None`` is stored as an empty list.
            user_token: Optional token of the uploading user. When given,
                the user is authenticated and the chunks are written to the
                vector store; when omitted, nothing is stored.

        Returns:
            The list of ``PermissionAwareDocument`` chunks.

        Raises:
            PermissionError: When storing is requested and the store rejects
                the user for ``department``.
        """
        from langchain.document_loaders import PyPDFLoader, TextLoader
        from langchain.text_splitter import RecursiveCharacterTextSplitter

        # Load the document; the extension check ignores case so that
        # "REPORT.PDF" is not handed to the text loader.
        if file_path.lower().endswith(".pdf"):
            loader = PyPDFLoader(file_path)
        else:
            loader = TextLoader(file_path)
        documents = loader.load()

        # Split into overlapping chunks.
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
        )
        splits = text_splitter.split_documents(documents)

        # Attach the permission metadata. All chunks of one document share a
        # single creation timestamp, and the splitter's own metadata dicts
        # are copied rather than modified.
        created_time = datetime.now().isoformat()
        permission_docs = []
        for chunk in splits:
            metadata = dict(chunk.metadata)
            metadata.update(
                {
                    "department": department,
                    "visible_departments": list(visible_departments or []),
                    "security_level": security_level,
                    "owner": owner,
                    "created_time": created_time,
                }
            )
            permission_docs.append(
                PermissionAwareDocument(
                    page_content=chunk.page_content,
                    metadata=metadata,
                )
            )

        if user_token is not None:
            uploader = self.auth_manager.authenticate_user(user_token)
            self.vector_store.add_documents(permission_docs, uploader)

        return permission_docs

    def query_knowledge_base(
        self, query: str, user_token: str, department_filter: str = None
    ) -> List[Dict]:
        """Authenticate the caller and search the knowledge base (main entry point).

        Args:
            query: Query text.
            user_token: Token identifying the calling user.
            department_filter: When set, the search is restricted to this
                department via ``search_by_department``; otherwise the
                permission-filtered ``similarity_search`` is used.

        Returns:
            One dict per hit with the keys ``content``, ``department``,
            ``security_level`` and ``source`` (``"unknown"`` when the chunk
            has no ``source`` metadata).
        """
        user = self.auth_manager.authenticate_user(user_token)

        if department_filter:
            results = self.vector_store.search_by_department(query, user, department_filter)
        else:
            results = self.vector_store.similarity_search(query, user)

        return [
            {
                "content": doc.page_content,
                "department": doc.metadata.get("department"),
                "security_level": doc.metadata.get("security_level"),
                "source": doc.metadata.get("source", "unknown"),
            }
            for doc in results
        ]
6. 使用示例
# Initialise the knowledge base
kb = EnterpriseKnowledgeBase()

# Finance document (readable by finance and management only)
finance_docs = kb.process_document(
    file_path="financial_report.pdf",
    department="finance",
    owner="finance_user",
    security_level="confidential",
    visible_departments=["management"],
)
# The call above returns the chunks; add them to the vector store explicitly
# so that the queries below have something to find.
kb.vector_store.add_documents(
    finance_docs,
    User(user_id="finance_user", department="finance", roles=[]),
)

# Technical document (readable by technology, rd and management)
# NOTE: process_document only special-cases ".pdf"; this ".docx" path goes
# through the plain-text loader.
tech_docs = kb.process_document(
    file_path="technical_spec.docx",
    department="technology",
    owner="tech_user",
    security_level="internal",
    visible_departments=["management", "rd"],
)
kb.vector_store.add_documents(
    tech_docs,
    User(user_id="tech_user", department="technology", roles=[]),
)

# User queries
# NOTE: both tokens must be resolvable by kb.auth_manager's token validation
# for the calls below to succeed.
finance_user_token = "finance_user_token"
tech_user_token = "tech_user_token"

# Query as the finance user
finance_results = kb.query_knowledge_base(
    "季度财报",
    finance_user_token,
)
print(f"财务用户看到 {len(finance_results)} 个结果")

# Same query as the technology user
tech_results = kb.query_knowledge_base(
    "季度财报",
    tech_user_token,
)
# No finance chunk may appear here; chunks the technology user is allowed to
# read can still be returned, so the count is not necessarily 0.
print(f"技术用户看到 {len(tech_results)} 个结果")
7. 高级权限特性
# Dynamic permission control
class DynamicPermissionManager:
    """Expands a user's permissions through a department hierarchy table."""

    def __init__(self):
        # department -> departments reachable from it
        self.department_hierarchy = {
            "finance": ["finance", "management"],
            "technology": ["technology", "rd", "management"],
            "hr": ["hr", "management"],
            "management": ["finance", "technology", "hr", "sales", "management"],
        }

    def get_accessible_departments(self, user: "User") -> List[str]:
        """Return every department ``user`` can access, sorted by name.

        The result contains the user's own permissions plus, for each of
        them that appears in ``department_hierarchy``, the departments listed
        there. Permissions missing from the table (for example ``sales``)
        therefore still appear in the result instead of being dropped.
        """
        accessible = set(user.permissions)
        for dept in user.permissions:
            accessible.update(self.department_hierarchy.get(dept, ()))
        # Sorted so the order does not depend on set iteration order.
        return sorted(accessible)


# Audit log
class AuditLogger:
    """Emits one audit record per document access."""

    def log_access(self, user: "User", document_id: str, action: str):
        """Print an audit record for ``action`` performed on ``document_id``.

        The record is written to standard output as ``AUDIT: `` followed by
        a JSON object with the keys ``timestamp``, ``user_id``,
        ``department``, ``document_id`` and ``action``.

        Args:
            user: Acting user; ``user_id`` and ``department`` are recorded.
            document_id: Identifier of the accessed document.
            action: Name of the action that was performed.
        """
        log_entry = {
            # Local time with its UTC offset, so the record is unambiguous
            # when logs from different hosts are compared.
            "timestamp": datetime.now().astimezone().isoformat(),
            "user_id": user.user_id,
            "department": user.department,
            "document_id": document_id,
            "action": action,
        }
        # Stand-in for a real log sink. ensure_ascii=False keeps non-ASCII
        # values readable instead of \uXXXX escapes.
        print(f"AUDIT: {json.dumps(log_entry, ensure_ascii=False)}")
关键要点
- 元数据驱动:在文档元数据中嵌入权限信息
- 查询时过滤:在向量搜索后基于权限过滤结果(过滤后返回的结果可能少于 k 条);按部门搜索时,则在检索阶段通过元数据过滤器预先限定部门
- 用户上下文:在每个请求中携带用户身份信息
- 分层权限:支持部门、角色、安全级别等多维度权限
- 审计追踪:记录所有文档访问行为
这种设计使财务文档仅对财务部门、文档 visible_departments 中列出的部门以及通过角色获得相应权限的用户(如 admin)可见,没有这些权限的技术部门用户无法检索到财务文档,同时保持了系统的灵活性和可扩展性。
