当前位置: 首页 > news >正文

LangChain企业知识库权限控制方案

基于LangChain框架实现企业知识库的文档权限控制,需要从多个层面进行设计。以下是完整的解决方案:

1. 系统架构设计

from langchain.schema import Document
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from typing import List, Dict, Set
import json


class PermissionAwareKnowledgeBase:
    """Skeleton knowledge base that carries a department permission table.

    Attributes:
        vector_store: Placeholder for the backing vector store. It is
            initialised to ``None`` and never assigned by this class.
        department_permissions: Maps a department name to a set of
            department names.
    """

    def __init__(self):
        # Only the attribute is created here; no store is opened.
        self.vector_store = None
        # NOTE(review): presumably key = department owning a document and
        # value = departments allowed to read it -- confirm the direction,
        # nothing in this class reads the table.
        self.department_permissions = {
            "finance": {"finance", "hr", "management"},
            "technology": {"technology", "rd", "management"},
            "hr": {"hr", "management"},
            "sales": {"sales", "marketing", "management"},
            "management": {"finance", "technology", "hr", "sales", "management"},
        }

2. 文档权限元数据设计

from pydantic import BaseModel
from datetime import datetime
from typing import Optional


class DocumentMetadata(BaseModel):
    """Schema describing the permission metadata attached to a document."""

    # Department that owns the document.
    department: str
    # Departments that are allowed to access the document.
    visible_departments: List[str]
    # Security level: public, internal, confidential, secret.
    security_level: str
    # Owner of the document.
    owner: str
    created_time: datetime
    # Optional expiry timestamp; not read anywhere in the visible code.
    expires_at: Optional[datetime] = None


class PermissionAwareDocument(Document):
    """Document whose metadata always contains the permission keys.

    Missing keys are filled with defaults: ``department="general"``,
    ``visible_departments=[]``, ``security_level="internal"``,
    ``owner="system"`` and ``created_time=datetime.now()``. Keys supplied by
    the caller are kept as given.
    """

    def __init__(self, page_content: str, metadata: Dict = None):
        """Build the document.

        Args:
            page_content: Text of the document (chunk).
            metadata: Optional metadata; it is copied, so the caller's dict
                is left untouched.
        """
        # Copy first: filling in defaults must not mutate the caller's dict.
        merged = dict(metadata) if metadata else {}
        # Make sure the permission metadata is present.
        merged.setdefault("department", "general")
        merged.setdefault("visible_departments", [])
        merged.setdefault("security_level", "internal")
        merged.setdefault("owner", "system")
        # NOTE(review): this default is a datetime object while other code in
        # this file stores ISO strings -- confirm which form the store expects.
        merged.setdefault("created_time", datetime.now())
        super().__init__(page_content=page_content, metadata=merged)

3. 用户认证和权限管理

class User:
    """An authenticated user and the departments they may access.

    Attributes:
        user_id: Identifier of the user.
        department: The user's own department.
        roles: Role names held by the user.
        permissions: Set of department names derived from ``department``
            and ``roles``.
    """

    def __init__(self, user_id: str, department: str, roles: List[str]):
        self.user_id = user_id
        self.department = department
        self.roles = roles
        self.permissions = self._calculate_permissions()

    def _calculate_permissions(self) -> Set[str]:
        """Compute the set of departments the user may access."""
        base_permissions = {self.department}
        # Extra departments granted by role; unknown roles grant nothing.
        role_mappings = {
            "manager": ["management"],
            "director": ["management"],
            "admin": ["finance", "technology", "hr", "sales", "management"],
        }
        for role in self.roles:
            base_permissions.update(role_mappings.get(role, ()))
        return base_permissions


class AuthManager:
    """Resolves tokens to users and decides document access.

    Attributes:
        users: Token registry mapping a token to a record with the keys
            ``user_id``, ``department`` and ``roles``.
    """

    def __init__(self):
        self.users = {}

    def _validate_token(self, token: str) -> Dict:
        """Return the user record registered for *token*.

        A real deployment would delegate to LDAP, OAuth or a similar system;
        this implementation looks the token up in ``self.users``.

        Raises:
            PermissionError: If the token is not registered.
        """
        try:
            return self.users[token]
        except KeyError:
            raise PermissionError("无效或未知的token") from None

    def authenticate_user(self, token: str) -> User:
        """Authenticate *token* and build the corresponding ``User``.

        Raises:
            PermissionError: If the token is not registered.
        """
        user_data = self._validate_token(token)
        return User(
            user_id=user_data["user_id"],
            department=user_data["department"],
            roles=user_data["roles"],
        )

    def can_access_document(self, user: User, document_metadata: Dict) -> bool:
        """Check whether *user* may access a document with this metadata.

        Access is granted when the owning department is in the user's
        permissions, when any visible department is, or when the security
        level is ``"public"``.
        """
        doc_department = document_metadata.get("department", "")
        # ``or []`` also covers an explicit None value for the key.
        visible_depts = document_metadata.get("visible_departments") or []
        security_level = document_metadata.get("security_level", "internal")

        # Owning department.
        if doc_department in user.permissions:
            return True
        # Explicit visibility list.
        if any(dept in user.permissions for dept in visible_depts):
            return True
        # Public documents are readable by everyone.
        return security_level == "public"

4. 权限感知的向量存储

class PermissionAwareVectorStore:
    """Chroma-backed vector store with permission checks on reads and writes.

    NOTE(review): documents built in this file carry a list-valued
    ``visible_departments`` metadata entry -- confirm the Chroma version in
    use accepts non-scalar metadata values.
    """

    def __init__(self, embedding_model, persist_directory: str = "./chroma_db"):
        """Open (or create) the Chroma collection under *persist_directory*."""
        self.embedding_model = embedding_model
        self.vector_store = Chroma(
            persist_directory=persist_directory,
            embedding_function=embedding_model,
        )
        self.auth_manager = AuthManager()

    def add_documents(self, documents: List[PermissionAwareDocument], user: User):
        """Add documents after verifying the user may write to each department.

        Every document is checked before anything is written, so a rejected
        batch leaves the store untouched.

        Raises:
            PermissionError: If a document's department is not in the user's
                permissions and the user does not hold the ``admin`` role.
        """
        if not documents:
            # Nothing to validate or store.
            return
        is_admin = "admin" in user.roles
        for doc in documents:
            doc_department = doc.metadata.get("department")
            if doc_department not in user.permissions and not is_admin:
                raise PermissionError(f"用户无权添加{doc_department}部门的文档")
        self.vector_store.add_documents(documents)

    def similarity_search(self, query: str, user: User, **kwargs) -> List[Document]:
        """Similarity search restricted to documents the user may access.

        The search runs first and inaccessible hits are dropped afterwards,
        so fewer results than requested may be returned.
        """
        all_results = self.vector_store.similarity_search(query, **kwargs)
        return [
            doc
            for doc in all_results
            if self.auth_manager.can_access_document(user, doc.metadata)
        ]

    def search_by_department(self, query: str, user: User, department: str = None, **kwargs):
        """Similarity search filtered by department through a metadata filter.

        Args:
            query: Query text.
            user: The requesting user.
            department: Department to search; when omitted, all departments
                in the user's permissions are searched.

        Raises:
            PermissionError: If *department* is given but not in the user's
                permissions.
        """
        if department and department not in user.permissions:
            raise PermissionError(f"无权访问{department}部门的文档")

        if department:
            filter_dict = {"department": department}
        else:
            # Restrict the search to departments the user is entitled to.
            filter_dict = {"department": {"$in": list(user.permissions)}}
        return self.vector_store.similarity_search(query, filter=filter_dict, **kwargs)

5. 完整的知识库实现

class EnterpriseKnowledgeBase:
    """Facade combining embeddings, the permission-aware store and auth."""

    def __init__(self):
        self.embedding_model = OpenAIEmbeddings()
        self.vector_store = PermissionAwareVectorStore(self.embedding_model)
        self.auth_manager = AuthManager()

    def process_document(
        self,
        file_path: str,
        department: str,
        owner: str,
        security_level: str = "internal",
        visible_departments: List[str] = None,
        uploader: "User" = None,
    ):
        """Load a file, split it into chunks and attach permission metadata.

        Args:
            file_path: File to load; ``.pdf`` files (any letter case) use the
                PDF loader, everything else the text loader.
            department: Department owning the document.
            owner: Owner of the document.
            security_level: Security level stored on every chunk.
            visible_departments: Additional departments allowed to read it.
            uploader: Optional user on whose behalf the chunks are stored.
                When omitted, the chunks are only returned, not stored.

        Returns:
            The list of permission-aware document chunks.

        Raises:
            PermissionError: Propagated from the store when *uploader* may
                not add documents for *department*.
        """
        from langchain.document_loaders import PyPDFLoader, TextLoader
        from langchain.text_splitter import RecursiveCharacterTextSplitter

        # Load the document.
        if file_path.lower().endswith(".pdf"):
            loader = PyPDFLoader(file_path)
        else:
            loader = TextLoader(file_path)
        documents = loader.load()

        # Split into overlapping chunks.
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
        )
        splits = text_splitter.split_documents(documents)

        # All chunks of one file share a single creation timestamp.
        created_time = datetime.now().isoformat()
        permission_docs = []
        for doc in splits:
            # Copy so the splitter's own documents are not modified, and give
            # each chunk its own visibility list.
            metadata = dict(doc.metadata)
            metadata.update({
                "department": department,
                "visible_departments": list(visible_departments or []),
                "security_level": security_level,
                "owner": owner,
                "created_time": created_time,
            })
            permission_docs.append(
                PermissionAwareDocument(page_content=doc.page_content, metadata=metadata)
            )

        if uploader is not None:
            self.vector_store.add_documents(permission_docs, uploader)
        return permission_docs

    def query_knowledge_base(self, query: str, user_token: str, department_filter: str = None) -> List[Dict]:
        """Query the knowledge base (main entry point).

        Authenticates *user_token*, runs a department-filtered search when
        *department_filter* is given and a permission-filtered similarity
        search otherwise, and returns one dict per hit with the keys
        ``content``, ``department``, ``security_level`` and ``source``.
        """
        user = self.auth_manager.authenticate_user(user_token)

        if department_filter:
            results = self.vector_store.search_by_department(query, user, department_filter)
        else:
            results = self.vector_store.similarity_search(query, user)

        return [
            {
                "content": doc.page_content,
                "department": doc.metadata.get("department"),
                "security_level": doc.metadata.get("security_level"),
                "source": doc.metadata.get("source", "unknown"),
            }
            for doc in results
        ]

6. 使用示例

# Initialise the knowledge base.
kb = EnterpriseKnowledgeBase()

# Build the finance document chunks (meant for finance and management only).
finance_docs = kb.process_document(
    file_path="financial_report.pdf",
    department="finance",
    owner="finance_user",
    security_level="confidential",
    visible_departments=["management"],
)
# The call above returns the chunks without indexing them, so add them to the
# store explicitly on behalf of a finance user.
kb.vector_store.add_documents(finance_docs, User("finance_user", "finance", []))

# Build the technical document chunks (meant for technology, rd and management).
# NOTE(review): a .docx path is routed to the plain-text loader -- confirm
# that loader can read this file.
tech_docs = kb.process_document(
    file_path="technical_spec.docx",
    department="technology",
    owner="tech_user",
    security_level="internal",
    visible_departments=["management", "rd"],
)
kb.vector_store.add_documents(tech_docs, User("tech_user", "technology", []))

# Tokens used for the queries below.
finance_user_token = "finance_user_token"
tech_user_token = "tech_user_token"

# Query as the finance user.
finance_results = kb.query_knowledge_base("季度财报", finance_user_token)
print(f"财务用户看到 {len(finance_results)} 个结果")

# Same query as the technology user.
tech_results = kb.query_knowledge_base("季度财报", tech_user_token)
print(f"技术用户看到 {len(tech_results)} 个结果")  # expected 0: no permission

7. 高级权限特性

# Dynamic permission control
class DynamicPermissionManager:
    """Expands a user's permissions through a department hierarchy table.

    Attributes:
        department_hierarchy: Maps a department name to a list of department
            names that are added when a user holds that department.
    """

    def __init__(self):
        # NOTE(review): there is no "sales" entry here -- confirm whether
        # that omission is intentional.
        self.department_hierarchy = {
            "finance": ["finance", "management"],
            "technology": ["technology", "rd", "management"],
            "hr": ["hr", "management"],
            "management": ["finance", "technology", "hr", "sales", "management"],
        }

    def get_accessible_departments(self, user: "User") -> List[str]:
        """Return every department the user can access, sorted by name.

        The result always contains the user's own permissions; departments
        without a hierarchy entry contribute only themselves.
        """
        # Start from the user's permissions so a department missing from the
        # hierarchy table is not silently dropped.
        accessible = set(user.permissions)
        for dept in user.permissions:
            accessible.update(self.department_hierarchy.get(dept, ()))
        # Sorted for a deterministic order (set iteration order is arbitrary).
        return sorted(accessible)


# Audit logging
class AuditLogger:
    """Writes audit records for document access."""

    def log_access(self, user: "User", document_id: str, action: str):
        """Record one access event.

        Args:
            user: The acting user; ``user_id`` and ``department`` are logged.
            document_id: Identifier of the document that was accessed.
            action: Name of the action performed.
        """
        log_entry = {
            "timestamp": datetime.now().isoformat(),
            "user_id": user.user_id,
            "department": user.department,
            "document_id": document_id,
            "action": action,
        }
        # Stand-in for a real log sink. ensure_ascii=False keeps non-ASCII
        # values (e.g. Chinese names) readable instead of \uXXXX escapes.
        print(f"AUDIT: {json.dumps(log_entry, ensure_ascii=False)}")

关键要点

  1. 元数据驱动:在文档元数据中嵌入权限信息
  2. 查询时过滤:在向量搜索后基于权限过滤结果(注意:后置过滤可能使返回条数少于请求的数量;按部门检索时则通过元数据过滤器在检索阶段直接过滤)
  3. 用户上下文:在每个请求中携带用户身份信息
  4. 分层权限:支持部门、角色、安全级别等多维度权限
  5. 审计追踪:记录所有文档访问行为

这种设计确保了:只要财务文档未标记为 public,且未把技术部门列入可见部门,技术部门的普通成员就无法看到它;同时保持了系统的灵活性和可扩展性。

http://www.dtcms.com/a/554045.html

相关文章:

  • 网站建设主要推广方式wordpress 登录 404
  • 4-Azido-L-phenylalanine,CAS号:33173-53-4,分子结构特点
  • 网站建设及解决方案在网站开发中如何设置用户登录
  • Android 12 模块编译的常用命令小结(更新中)
  • 如何使用 Python 转换 Excel 工作表到 PDF 文档
  • 网站建设伍金手指下拉2公众号图片到wordpress
  • 亚远景-在开发中的 “功能安全(ISO 26262)” 与 “网络安全(ISO/SAE 21434)”关联实践
  • wordpress关闭谷歌北京百度seo代理
  • Python中如何安全地存储和验证密码
  • fixed-bug:JPA 关联关系的对象序列化循环引用问题
  • Nginx入门基础-访问配置
  • 装饰网站建设辽宁省建设工程信息网官网新网站
  • 【LeetCode热题100(56/100)】组合总和
  • 什么是离线语音识别芯片(离线语音识别芯片有哪些优点)
  • 代前导页的网站定制网站建设案例课堂
  • 四川住房城乡建设厅官方网站天津制作企业网站的
  • 图的邻接矩阵实现以及遍历
  • 伟淼科技发布11月营销前瞻:解码 “温暖狂欢感恩” 增长公式
  • 前端学习css
  • 外链推广网站都有哪些网站首页分辨率
  • 网站查询域名解析用extjs做的网站
  • 【计算机网络】NAT技术深度解析:从原理到NAPT实现的工作机制
  • 外设模块学习(10)——红外避障模块(STM32)
  • 60 d3.js 不能正确展示节点连线, 以及一个基础的demo
  • 优质网站建设是哪家北京论坛网站建设
  • 黑马商城day8-ES01
  • 创意网站建设公司阿里云宝塔面板一键安装wordpress
  • F043 vue+flask天气预测可视化系统大数据(浅色版)+机器学习+管理端+爬虫+超酷界面+顶级可视化水平
  • 电脑无法识别WiFi 7路由器的解决方式
  • 海南网站推广微商目前十大火爆产品