Python: Searching the Web via the Serper API and Consolidating the Results with a Large Model
1. Serper API
Website:
https://www.searchapi.io/
(Note: the account check below uses the searchapi.io endpoint; the search code in Section 2 calls Serper's google.serper.dev endpoint, whose own dashboard is at https://serper.dev.)
Sign up to obtain an API key, then verify it:
$ curl https://www.searchapi.io/api/v1/me -H "Accept: application/json" -H "Authorization: Bearer your_api"
{"account": {"current_month_usage": 0,"monthly_allowance": 0,"remaining_credits": 97},"api_usage": {"searches_this_hour": 0,"hourly_rate_limit": 200000}
The key is valid.
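The same check can be run from Python; a minimal sketch using the requests library, against the same searchapi.io endpoint shown above (the SEARCHAPI_API_KEY environment variable name is a hypothetical choice for this example):

import os
import requests

# Hypothetical env var name; falls back to the placeholder from the curl example
api_key = os.getenv("SEARCHAPI_API_KEY", "your_api")
resp = requests.get(
    "https://www.searchapi.io/api/v1/me",
    headers={"Accept": "application/json", "Authorization": f"Bearer {api_key}"},
    timeout=10,
)
resp.raise_for_status()
print(resp.json())  # account usage and rate-limit info, as in the curl output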
2. Code
Set the key in ~/.bashrc:
# Serper API
export SERPER_API_KEY=xxxx
Then reload the shell configuration:
$ source ~/.bashrc
Check that it is set:
$ echo $SERPER_API_KEY
d89bcd2dae4xxxx
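The script reads this variable with os.getenv, so it can also be checked from Python directly:

$ python -c 'import os; print(os.getenv("SERPER_API_KEY"))'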
The search function POSTs the query to Serper's /search endpoint:
import http.client
import json
import os

def search_web(query: str) -> str:
    """Run a web search through the Serper API.

    Args:
        query (str): Search query string.

    Returns:
        str: Search results as a JSON string.

    Raises:
        ValueError: If the SERPER_API_KEY environment variable is not set.
        http.client.HTTPException: If an HTTP error occurs.
    """
    api_key = os.getenv("SERPER_API_KEY")
    if not api_key:
        raise ValueError("Please set the SERPER_API_KEY environment variable")
    conn = None
    try:
        conn = http.client.HTTPSConnection("google.serper.dev")
        payload = json.dumps({
            "q": query,
            "gl": "cn",            # region: China
            "hl": "zh-cn",         # interface language: Simplified Chinese
            "autocorrect": False,
            "tbs": "qdr:m",        # restrict results to the past month
            "num": 5
        })
        headers = {
            'X-API-KEY': api_key,
            'Content-Type': 'application/json'
        }
        conn.request("POST", "/search", payload, headers)
        res = conn.getresponse()
        # Check the HTTP status code
        if res.status == 403:
            raise http.client.HTTPException("Invalid API key or quota exhausted")
        elif res.status != 200:
            raise http.client.HTTPException(f"Request failed with status code {res.status}")
        data = res.read()
        return data.decode("utf-8")
    except http.client.HTTPException as e:
        print(f"Request failed: {e}")
        if "Invalid API key" in str(e):
            print("Check that the API key is correct, or ask the provider about your quota")
        raise
    except Exception as e:
        print(f"Unexpected error: {e}")
        raise
    finally:
        if conn:
            conn.close()
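To try the function on its own, the returned JSON string can be parsed and the organic hits printed; a minimal sketch (the query string is just an example):

result = search_web("OpenAI GPT-5")
for hit in json.loads(result).get("organic", []):
    # Each organic hit carries at least a title and a link; date may be missing
    print(hit.get("date", "?"), hit["title"], hit["link"])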
3. Full Code
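The script below needs a few third-party packages; a likely install command (newspaper3k is the usual PyPI distribution of the newspaper package used here, and the openai package must be a 1.x release for the OpenAI client class):

$ pip install pandas newspaper3k openai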
import json
import os
import re
import http.client
import pandas as pd
from datetime import datetime, timedelta
from newspaper import Article, ArticleException
from typing import List, Dict, Any
from openai import OpenAI, OpenAIError

# Read the DeepSeek API key from the environment
api_key = os.getenv("DEEPSEEK_API_KEY")
if not api_key:
    raise ValueError("Please set the DEEPSEEK_API_KEY environment variable")

# Initialize the OpenAI client (SiliconFlow serves DeepSeek through an OpenAI-compatible API)
client = OpenAI(
    api_key=api_key,
    base_url="https://api.siliconflow.cn/v1",  # base URL of the DeepSeek-compatible API
)


def search_web(query: str) -> str:
    """Run a web search through the Serper API.

    Args:
        query (str): Search query string.

    Returns:
        str: Search results as a JSON string.

    Raises:
        ValueError: If the SERPER_API_KEY environment variable is not set.
        http.client.HTTPException: If an HTTP error occurs.
    """
    api_key = os.getenv("SERPER_API_KEY")
    if not api_key:
        raise ValueError("Please set the SERPER_API_KEY environment variable")
    conn = None
    try:
        conn = http.client.HTTPSConnection("google.serper.dev")
        payload = json.dumps({
            "q": query,
            "gl": "cn",            # region: China
            "hl": "zh-cn",         # interface language: Simplified Chinese
            "autocorrect": False,
            "tbs": "qdr:m",        # restrict results to the past month
            "num": 5
        })
        headers = {
            'X-API-KEY': api_key,
            'Content-Type': 'application/json'
        }
        conn.request("POST", "/search", payload, headers)
        res = conn.getresponse()
        # Check the HTTP status code
        if res.status == 403:
            raise http.client.HTTPException("Invalid API key or quota exhausted")
        elif res.status != 200:
            raise http.client.HTTPException(f"Request failed with status code {res.status}")
        data = res.read()
        return data.decode("utf-8")
    except http.client.HTTPException as e:
        print(f"Request failed: {e}")
        if "Invalid API key" in str(e):
            print("Check that the API key is correct, or ask the provider about your quota")
        raise
    except Exception as e:
        print(f"Unexpected error: {e}")
        raise
    finally:
        if conn:
            conn.close()


def parse_search_data(json_str):
    # Parse the JSON string
    data = json.loads(json_str)
    base_date = datetime.now()
    # Make sure the expected result lists exist
    if "organic" not in data:
        data["organic"] = []
    if "topStories" not in data:
        data["topStories"] = []
    # Normalize dates; with hl=zh-cn Serper returns them in Chinese formats
    for result in data["organic"] + data["topStories"]:
        # Make sure there is a date field
        if "date" not in result:
            result["date"] = "unknown date"
            continue
        # Strip spaces from dates such as "1 天前" ("1 day ago")
        date_str = result["date"].replace(" ", "")
        if "小时前" in date_str:  # "X hours ago"
            hours_ago = int(date_str.split("小时")[0].strip())
            result["date"] = (base_date - timedelta(hours=hours_ago)).strftime("%Y-%m-%d")
        elif "分钟前" in date_str:  # "X minutes ago"
            minutes_ago = int(date_str.split("分钟")[0].strip())
            result["date"] = (base_date - timedelta(minutes=minutes_ago)).strftime("%Y-%m-%d")
        elif "天前" in date_str:  # "X days ago"
            days_ago = int(date_str.split("天")[0].strip())
            result["date"] = (base_date - timedelta(days=days_ago)).strftime("%Y-%m-%d")
        elif "周前" in date_str:  # "X weeks ago"
            weeks_ago = int(date_str.split("周")[0].strip())
            result["date"] = (base_date - timedelta(weeks=weeks_ago)).strftime("%Y-%m-%d")
        elif "年" in date_str and "月" in date_str and "日" in date_str:
            # Full Chinese dates such as "2025年5月28日"
            clean_date = date_str.replace("年", "-").replace("月", "-").replace("日", "")
            try:
                date_obj = datetime.strptime(clean_date, "%Y-%m-%d")
                result["date"] = date_obj.strftime("%Y-%m-%d")
            except ValueError:
                result["date"] = date_str  # keep the original format
        elif re.match(r"\d{4}-\d{2}-\d{2}", date_str):
            result["date"] = date_str
        else:
            # Any other unrecognized date format
            result["date"] = "unknown date"
    # Convert to DataFrames
    organic_df = pd.json_normalize(data, "organic")
    organic_df.drop('position', axis=1, inplace=True, errors='ignore')
    top_stories_df = pd.json_normalize(data, "topStories")
    # Add missing columns so both DataFrames share the same structure
    for df in [organic_df, top_stories_df]:
        if 'source' not in df.columns:
            df['source'] = 'unknown source'
        if 'snippet' not in df.columns:
            df['snippet'] = ''
    combined_df = pd.concat([organic_df, top_stories_df], ignore_index=True)
    return combined_df


def clean_search_data(df):
    # Drop duplicate results
    df = df.drop_duplicates(subset=["link"]).copy()
    # Normalize the date column
    df["date"] = pd.to_datetime(df["date"], errors="coerce")
    # Extract the domain as the source
    df["source_domain"] = df["link"].apply(
        lambda x: re.search(r"https?://([^/]+)", x).group(1)
        if re.search(r"https?://([^/]+)", x) else "unknown domain"
    )
    return df


def extract_article_content(url: str, timeout: int = 60) -> str:
    """Extract the main content from a web page URL.

    Args:
        url (str): Page URL.
        timeout (int): Request timeout in seconds.

    Returns:
        str: Extracted page text (truncated to the first 2000 characters).
    """
    try:
        # Use a custom User-Agent so the request looks like a regular browser
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
        }
        article = Article(url, headers=headers, request_timeout=timeout)
        article.download()
        article.parse()
        # Return the first 2000 characters to bound the prompt length
        return article.text[:2000]
    except ArticleException as e:
        print(f"Content extraction failed: {url} - {str(e)}")
        return f"Content extraction failed: {str(e)}"
    except Exception as e:
        print(f"Error while processing URL: {url} - {str(e)}")
        return f"Processing error: {str(e)}"


def summarize_with_llm(articles: List[Dict[str, Any]], query: str) -> str:
    """Consolidate the search results with a large model (DeepSeek-V3).

    Args:
        articles (List[Dict]): Article records with title, link, content, etc.
        query (str): The original search query.

    Returns:
        str: The consolidated summary generated by the model.
    """
    # The prompt is written in Chinese because the search runs with hl=zh-cn;
    # it asks the model to consolidate the results and answer in Chinese.
    prompt = f"请根据以下搜索结果,整合关于'{query}'的信息。请使用中文回答,保持专业且全面。\n\n"
    for i, article in enumerate(articles, 1):
        # Fall back to defaults so a missing field cannot raise KeyError
        title = article.get('title', 'untitled')
        source = article.get('source_domain', 'unknown source')
        content = article.get('content', 'no content')
        prompt += f"结果 {i}:\n标题: {title}\n来源: {source}\n"
        prompt += f"内容摘要: {content[:500]}...\n\n"
    prompt += "请总结关键信息,并标注信息来源。避免重复,突出最重要的发现。"
    try:
        response = client.chat.completions.create(
            model="Pro/deepseek-ai/DeepSeek-V3",
            messages=[
                {"role": "system", "content": "你是一位专业的搜索引擎结果分析师,擅长整合多来源信息。"},
                {"role": "user", "content": prompt}
            ],
            temperature=0.3,
            max_tokens=2000
        )
        return response.choices[0].message.content.strip()
    except OpenAIError as e:
        print(f"LLM API error: {str(e)}")
        return f"Summary generation failed: {str(e)}"
    except Exception as e:
        print(f"LLM processing error: {str(e)}")
        import traceback
        traceback.print_exc()
        return f"Processing error: {str(e)}"


def process_search_results(df: pd.DataFrame, query: str, top_n: int = 3) -> Dict[str, Any]:
    """Process the search results: extract page content and generate a summary.

    Args:
        df (pd.DataFrame): Search results DataFrame.
        query (str): The original search query.
        top_n (int): Number of top results to process.

    Returns:
        dict: The raw results, the processed top articles, and the LLM summary.
    """
    # Pick the N most recent results
    sorted_df = df.sort_values(by='date', ascending=False)
    top_results = sorted_df.head(top_n).copy()
    print(f"Extracting content from {top_n} pages...")
    # Extract the page content
    top_results["content"] = top_results["link"].apply(
        lambda url: extract_article_content(url)
    )
    # Prepare the records for the LLM
    articles_for_llm = top_results.to_dict(orient="records")
    print("Consolidating results with the LLM...")
    # Generate the summary with the LLM
    summary = summarize_with_llm(articles_for_llm, query)
    return {
        "raw_results": df,
        "top_articles": top_results,
        "llm_summary": summary
    }


if __name__ == "__main__":
    try:
        # Run a test query ("latest developments in OpenAI large models")
        search_query = "OpenAI大模型最新进展"
        result = search_web(search_query)
        # Parse and clean the search results
        df = parse_search_data(result)
        df = clean_search_data(df)
        print("\nCleaned search results:")
        print(df)
        processed = process_search_results(df, search_query, top_n=3)
        print("\n===== LLM summary =====")
        print(processed["llm_summary"])
    except ValueError as e:
        print(f"Configuration error: {e}")
    except http.client.HTTPException as e:
        print(f"Network request error: {e}")
    except Exception as e:
        print(f"Unexpected error: {e}")
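The date normalization in parse_search_data can be sanity-checked in isolation by feeding it a small synthetic response; a minimal sketch with made-up titles and links, run after the definitions above:

sample = json.dumps({
    "organic": [
        # Relative Chinese date, as Serper returns with hl=zh-cn
        {"position": 1, "title": "t1", "link": "https://example.com/a", "date": "3 天前"},
        # Full Chinese date
        {"position": 2, "title": "t2", "link": "https://example.com/b", "date": "2025年5月28日"},
    ],
    "topStories": [],
})
print(parse_search_data(sample)[["title", "date"]])  # both dates come out as YYYY-MM-DD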
Output:

Cleaned search results:
title link ... source source_domain
0 隆重推出面向开发人员的GPT-5 - OpenAI https://openai.com/zh-Hans-CN/index/introducin... ... unknown source openai.com
1 OpenAI推出最新人工智能模型GPT-5 - 新华网 http://www.news.cn/tech/20250808/67d497c93d8b4... ... unknown source www.news.cn
2 OpenAI最强模型GPT-5来了!免费可用,Altman高呼迈向AGI一大步 https://wallstreetcn.com/articles/3752897 ... unknown source wallstreetcn.com
3 GPT-5发布,这一次OpenAI没有颠覆式创新|新京报专栏 https://www.bjnews.com.cn/detail/1754634733168... ... unknown source www.bjnews.com.cn
4 GPT-5登场!OpenAI奥特曼:幻觉大幅降低,已从大学生变博士级专家 https://m.thepaper.cn/newsDetail_forward_31335762 ... unknown source m.thepaper.cn
5 OpenAI推出最新人工智能模型GPT-5 https://finance.sina.com.cn/stock/t/2025-08-08... ... 新浪财经 finance.sina.com.cn
6 隆重推出 gpt-oss https://openai.com/zh-Hans-CN/index/introducin... ... OpenAI openai.com
7 OpenAI、谷歌等深夜更新多款模型,展示开源、智能体、世界模型进展 https://www.yicai.com/news/102760136.html ... 第一财经 www.yicai.com

[8 rows x 6 columns]
Extracting content from 3 pages...
Content extraction failed: https://openai.com/zh-Hans-CN/index/introducing-gpt-5-for-developers/ - Article `download()` failed with 403 Client Error: Forbidden for url: https://openai.com/zh-Hans-CN/index/introducing-gpt-5-for-developers/ on URL https://openai.com/zh-Hans-CN/index/introducing-gpt-5-for-developers/
Consolidating results with the LLM...

===== LLM summary =====
Based on the search results, here is a consolidated view of the latest developments around OpenAI's large models:

1. **GPT-5 officially released**
   - OpenAI has launched its next-generation model, GPT-5 (sources: OpenAI website, Xinhua, Wallstreetcn).
   - Although the official site did not yield detailed technical documentation (403 access restriction), multiple sources confirm the model is open to developers.

2. **Core breakthroughs and positioning**
   - OpenAI CEO Sam Altman called it "a major step toward AGI (artificial general intelligence)" (source: Wallstreetcn), suggesting significant gains in multimodal understanding, complex reasoning, or autonomous task execution.

3. **Commercialization and openness strategy**
   - A free usage option is offered (source: Wallstreetcn), possibly continuing the "free base features plus paid premium services" model.
   - The explicit focus on the developer ecosystem (source: OpenAI website headline) may include API improvements or support for customized development.

4. **Industry impact**
   - Coverage by China's official Xinhua news agency reflects the model's global influence and technical recognition.
   - Wallstreetcn's "strongest model" framing likely refers to performance advantages on benchmarks or in real applications.

Note: since access to technical details on the OpenAI website was restricted, specifics such as parameter count and training data should be sought later through official channels or authoritative technical media.