当前位置：首页 > news >正文

改写爬虫， unsplash 图片爬虫 (网站改动了，重写爬虫)

news 2025/9/2 3:59:31

最后根据我最后的修改代码。帮我继续写：1. 这个代码，完成了哪些功能。
2. 分享给其他用户，如何使用。按照这2部分来写。

好的，我们来针对你最终完善的那个结合了 Selenium 和多线程 requests 下载的 Unsplash 爬虫脚本，撰写一份功能介绍和使用指南。

完整代码

import concurrent.futures
import json
import os
import pickle
import random
import time
from urllib.parse import quote_plus, urlparse  # urlparse: used to derive file names from image URLs

import requests
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
# --- Configuration ---
# Site root; visited first so cookies can be attached to the unsplash.com domain.
UNSPLASH_BASE_URL = "https://unsplash.com/"
# Internal search endpoint; {per_page}, {page} and {query} are filled in per request.
UNSPLASH_API_URL_TEMPLATE = (
    "https://unsplash.com/napi/search/photos"
    "?orientation=landscape&plus=none&per_page={per_page}&page={page}&query={query}"
)

# Cookie export (JSON) that the scraper reads, and the pickle cache it writes.
COOKIES_JSON_FILE = "add_cookies_from_EditThisCookie.json"
COOKIES_PICKLE_FILE = "unsplash_cookies.pkl"

# Output roots; a per-keyword sub-folder is created under each of them.
BASE_IMAGE_DOWNLOAD_DIR = "images"
API_RESPONSES_DIR = "api_responses"

IMAGES_PER_PAGE = 20        # value sent as per_page to the search endpoint
MAX_DOWNLOAD_THREADS = 10   # worker threads used for image downloads

# Shared Chrome options: headless, no GPU, fixed window size.
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--window-size=1920x1080")
chrome_options.add_experimental_option('excludeSwitches', ['enable-logging'])


class UnsplashScraperCombined:
    """Fetch Unsplash search results through Selenium and download the images.

    One instance handles one search keyword. A headless Chrome session loaded
    with the user's exported cookies requests the search API pages; the image
    URLs found there are then downloaded concurrently with ``requests``.
    """

    def __init__(self, search_keyword: str, num_pages_to_scrape: int):
        """Start the WebDriver and create the per-keyword output folders.

        Args:
            search_keyword: Search phrase sent to the Unsplash search endpoint.
            num_pages_to_scrape: Number of API result pages to request.

        Raises:
            Exception: Whatever the WebDriver start-up raised (re-raised after logging).
        """
        self.task_start_time = time.time()  # start of the per-keyword timer
        self.search_keyword_original = search_keyword
        # Anything that is not alphanumeric, space, '_' or '-' becomes '_' in folder names.
        self.safe_search_keyword_for_foldername = "".join(
            c if c.isalnum() or c in " _-" else "_" for c in search_keyword
        )
        self.search_keyword_encoded = quote_plus(search_keyword)
        self.num_pages_to_scrape = num_pages_to_scrape

        self.current_image_download_dir = os.path.join(
            BASE_IMAGE_DOWNLOAD_DIR, self.safe_search_keyword_for_foldername
        )
        self.current_api_response_dir = os.path.join(
            API_RESPONSES_DIR, self.safe_search_keyword_for_foldername
        )

        print(f"🚀 正在为关键词 '{self.search_keyword_original}' 初始化 Unsplash 组合爬虫...")
        try:
            service = Service(ChromeDriverManager().install())
            self.driver = webdriver.Chrome(service=service, options=chrome_options)
            print("✅ WebDriver (无头模式) 初始化成功。")
        except Exception as e:
            print(f"🚨 WebDriver 初始化错误: {e}")
            raise

        if not os.path.exists(self.current_image_download_dir):
            # exist_ok guards against the folder appearing between the check and the call.
            os.makedirs(self.current_image_download_dir, exist_ok=True)
            print(f"✅ 已创建图片下载目录: '{self.current_image_download_dir}'")
        if not os.path.exists(self.current_api_response_dir):
            os.makedirs(self.current_api_response_dir, exist_ok=True)
            print(f"✅ 已创建 API 响应保存目录: '{self.current_api_response_dir}'")

    def _convert_cookie_for_selenium(self, cookie_from_json):
        """Map one exported cookie dict to the dict passed to ``driver.add_cookie``.

        ``name`` and ``value`` are required; ``path``, ``domain`` (only when it
        contains "unsplash.com"), ``secure`` and ``expirationDate`` (stored as
        an integer ``expiry``) are copied when present.
        """
        cookie_for_selenium = {
            'name': cookie_from_json['name'],
            'value': cookie_from_json['value'],
        }
        if 'path' in cookie_from_json:
            cookie_for_selenium['path'] = cookie_from_json['path']
        if 'domain' in cookie_from_json and "unsplash.com" in cookie_from_json['domain']:
            cookie_for_selenium['domain'] = cookie_from_json['domain']
        if 'secure' in cookie_from_json:
            cookie_for_selenium['secure'] = cookie_from_json['secure']
        if cookie_from_json.get('expirationDate'):
            try:
                cookie_for_selenium['expiry'] = int(cookie_from_json['expirationDate'])
            except (TypeError, ValueError):
                print(f"⚠️ 警告: 无法转换 expirationDate '{cookie_from_json['expirationDate']}' (cookie: '{cookie_from_json['name']}').")
        return cookie_for_selenium

    def _process_cookies_from_json_to_pickle(self) -> bool:
        """Convert the exported JSON cookies and cache them in the pickle file.

        Returns:
            True when the pickle file was written, False when the JSON file is
            missing or cannot be decoded.
        """
        print(f"🍪 正在从 '{COOKIES_JSON_FILE}' 处理 Cookies...")
        if not os.path.exists(COOKIES_JSON_FILE):
            print(f"🚨 错误: Cookie 文件 '{COOKIES_JSON_FILE}' 未找到！")
            return False

        with open(COOKIES_JSON_FILE, 'r', encoding='utf-8') as f:
            try:
                raw_cookies = json.load(f)
            except json.JSONDecodeError as e:
                print(f"🚨 从 '{COOKIES_JSON_FILE}' 解码 JSON 时出错: {e}")
                return False

        # Only dict entries whose domain mentions unsplash.com are kept.
        processed_cookies_for_selenium = [
            self._convert_cookie_for_selenium(c)
            for c in raw_cookies
            if isinstance(c, dict) and 'unsplash.com' in c.get('domain', '')
        ]
        with open(COOKIES_PICKLE_FILE, 'wb') as f:
            pickle.dump(processed_cookies_for_selenium, f)
        print(f"✅ Cookies 处理完成并已保存到 '{COOKIES_PICKLE_FILE}'。")
        return True

    def _load_cookies_to_browser(self) -> bool:
        """Open the Unsplash home page, add the cached cookies, then refresh.

        Returns:
            False when the pickle cache is missing and cannot be rebuilt from
            the JSON export; True otherwise (individual cookies that the
            browser rejects are logged and skipped).
        """
        print("🚗 正在导航到 Unsplash 以设置 Cookies...")
        self.driver.get(UNSPLASH_BASE_URL)
        time.sleep(random.uniform(1, 2))

        if not os.path.exists(COOKIES_PICKLE_FILE):
            if not self._process_cookies_from_json_to_pickle():
                return False

        print(f"🍪 正在从 '{COOKIES_PICKLE_FILE}' 加载 Cookies 到浏览器...")
        # NOTE(security): pickle.load executes arbitrary code from a tampered file.
        # The cache is written by this class; do not point it at untrusted data.
        with open(COOKIES_PICKLE_FILE, 'rb') as f:
            cookies = pickle.load(f)
        if not cookies:
            print("⚠️ 警告: Pickle 文件中没有找到适用于 Unsplash 的 Cookies。")

        for cookie in cookies:
            try:
                self.driver.add_cookie(cookie)
            except Exception as e:
                print(f"⚠️ 警告: 无法添加 Cookie: {cookie.get('name', 'N/A')}。错误: {e}")

        print("🔄 正在使用新 Cookies 刷新页面...")
        self.driver.refresh()
        time.sleep(random.uniform(2, 3))
        print("✅ Cookies 加载完毕且页面已刷新。")
        return True

    def _fetch_api_page_data_with_selenium(self, page_num):
        """Load one search API page in the browser, save its raw JSON, return it parsed.

        Args:
            page_num: 1-based page number substituted into the API URL.

        Returns:
            The decoded JSON object, or None when the ``<pre>`` element is
            missing, the text is not valid JSON, or any other error occurs.
        """
        current_api_url = UNSPLASH_API_URL_TEMPLATE.format(
            per_page=IMAGES_PER_PAGE,
            page=page_num,
            query=self.search_keyword_encoded,
        )
        print(f"  📡 (Selenium) 导航至 API URL (第 {page_num} 页): {current_api_url[:100]}...")
        self.driver.get(current_api_url)
        time.sleep(random.uniform(0.5, 1.5))

        json_text = None  # stays None until the <pre> text has been read
        try:
            # The JSON body is read from the page's <pre> element.
            json_text = self.driver.find_element(By.TAG_NAME, "pre").text
            api_response_data_for_validation = json.loads(json_text)

            api_json_filepath = os.path.join(
                self.current_api_response_dir, f"api_page_{page_num}.json"
            )
            try:
                with open(api_json_filepath, 'w', encoding='utf-8') as f_api:
                    f_api.write(json_text)
                print(f"    💾 API 原始 JSON (第 {page_num} 页) 已保存至: '{api_json_filepath}'")
            except Exception as e_save:
                # A failed archive write must not discard the data already parsed.
                print(f"    🚨 保存 API JSON (第 {page_num} 页) 失败: {e_save}")
            return api_response_data_for_validation
        except NoSuchElementException:
            print(f"  🚨 (Selenium) 错误: 在 API 页面 {current_api_url} 未找到 <pre> 标签。")
            print(f"     页面源码 (前 200 字符): {self.driver.page_source[:200]}")
        except json.JSONDecodeError as e:
            print(f"  🚨 (Selenium) 错误: 无法从 API 页面 {current_api_url} 解码 JSON。错误: {e}")
            print(f"     获取到的内容 (前 200 字符): {json_text[:200] if json_text is not None else 'N/A'}")
        except Exception as e:
            print(f"  🚨 (Selenium) 第 {page_num} 页发生未知错误: {e}")
        return None

    def _download_single_image(self, photo_detail, session_cookies, session_ua) -> bool:
        """Download one image with requests (runs in a worker thread).

        The file name is the first URL path segment when it looks like
        ``photo-...``; otherwise the photo id; otherwise a random placeholder.

        Args:
            photo_detail: Dict with the keys ``'id'`` and ``'url'``.
            session_cookies: Cookie name/value mapping sent with the request.
            session_ua: User-Agent string; a generic one is used when empty.

        Returns:
            True when the file already exists or was downloaded completely,
            False otherwise.
        """
        photo_id = photo_detail.get('id')  # fallback file name
        image_url = photo_detail.get('url')
        if not image_url:
            print(f"    ⚠️ (下载线程) 图片数据中缺少 URL: {photo_detail}")
            return False

        try:
            # The first path segment of the URL is taken as the slug candidate.
            path_segments = urlparse(image_url).path.strip('/').split('/')
            slug_candidate = path_segments[0] if path_segments else ""
            if slug_candidate.startswith("photo-") and len(slug_candidate) > 6:
                filename_base = slug_candidate
            elif photo_id:
                filename_base = photo_id
            else:
                print(f"    ⚠️ (下载线程) 无法为 URL '{image_url[:70]}' 确定有效的文件名基础。将使用随机名。")
                filename_base = f"unknown_{random.randint(1000, 9999)}"
        except Exception as e_parse:
            print(f"    ⚠️ (下载线程) 解析 URL '{image_url[:70]}' 以提取文件名时出错: {e_parse}。将使用 photo_id。")
            filename_base = photo_id if photo_id else f"error_{random.randint(1000, 9999)}"

        filename = f"{filename_base}.jpg"
        filepath = os.path.join(self.current_image_download_dir, filename)
        if os.path.exists(filepath):
            return True

        headers = {
            'User-Agent': session_ua if session_ua else "Mozilla/5.0",
            'Referer': UNSPLASH_BASE_URL,
        }
        # Stream into a ".part" file and rename on success, so an interrupted
        # transfer never leaves a truncated .jpg that a later run would accept
        # as already downloaded.
        temp_filepath = f"{filepath}.part"
        try:
            with requests.get(image_url, headers=headers, cookies=session_cookies,
                              stream=True, timeout=45) as img_response:
                img_response.raise_for_status()
                with open(temp_filepath, 'wb') as f_img:
                    for chunk in img_response.iter_content(chunk_size=81920):
                        f_img.write(chunk)
            os.replace(temp_filepath, filepath)
            return True
        except requests.exceptions.HTTPError as e:
            print(f"    🚨 (下载线程) HTTP 错误 {e.response.status_code} 下载 {filename}: {image_url[:70]}")
        except requests.exceptions.RequestException as e:
            print(f"    🚨 (下载线程) 请求错误下载 {filename}: {e} URL: {image_url[:70]}")
        except Exception as e:
            print(f"    🚨 (下载线程) 未知错误下载 {filename}: {e} URL: {image_url[:70]}")

        # Best-effort removal of the partial file; the failure was reported above.
        if os.path.exists(temp_filepath):
            try:
                os.remove(temp_filepath)
            except OSError:
                pass
        return False

    def scrape_and_download(self):
        """Run the whole job: cookies, API pages, threaded downloads, summary.

        The browser is closed when this method exits, whether it finishes,
        returns early, or raises.
        """
        try:
            if not self._process_cookies_from_json_to_pickle():
                return
            if not self._load_cookies_to_browser():
                print("🚨 加载 Cookies 到浏览器失败。正在中止。")
                return

            print(f"\n📸 (Selenium) 开始为关键词 '{self.search_keyword_original}' 获取图片信息...")
            all_photos_to_download = []
            seen_photo_ids = set()  # a photo listed on several pages is queued once
            for page_num in range(1, self.num_pages_to_scrape + 1):
                api_data = self._fetch_api_page_data_with_selenium(page_num)
                if not api_data:
                    print(f"  第 {page_num} 页:未能获取或解析API数据。")
                    continue

                photos_on_page = api_data.get('results', [])
                if photos_on_page:
                    print(f"  第 {page_num} 页: 从 API 获取到 {len(photos_on_page)} 张图片信息。")
                    for photo_data in photos_on_page:
                        photo_id = photo_data.get('id')
                        raw_url = photo_data.get('urls', {}).get('raw')
                        if photo_id and raw_url and photo_id not in seen_photo_ids:
                            seen_photo_ids.add(photo_id)
                            all_photos_to_download.append({'id': photo_id, 'url': raw_url})
                    continue

                total_available = api_data.get('total', 0)
                print(f"  第 {page_num} 页: API 响应中 'results' 为空。")
                if page_num * IMAGES_PER_PAGE > total_available and total_available > 0:
                    print(f"    已到达可用结果的末尾 (总计: {total_available})。停止此关键词的 API 获取。")
                    break
                if total_available == 0 and page_num == 1:
                    print("    API 返回该关键词无任何图片。")
                    break

            if not all_photos_to_download:
                print(f"\n🤷 未能为关键词 '{self.search_keyword_original}' 获取到任何图片链接。")
                return

            print(f"\n📥 准备使用 {MAX_DOWNLOAD_THREADS} 个线程下载 {len(all_photos_to_download)} 张图片...")
            # Reuse the browser session's cookies and user agent for the downloads.
            current_selenium_cookies = {c['name']: c['value'] for c in self.driver.get_cookies()}
            try:
                current_selenium_ua = self.driver.execute_script("return navigator.userAgent;")
            except Exception:
                current_selenium_ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36"

            successful_downloads = 0
            failed_downloads = 0
            with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_DOWNLOAD_THREADS) as executor:
                future_to_photo = {
                    executor.submit(
                        self._download_single_image,
                        photo_detail,
                        current_selenium_cookies,
                        current_selenium_ua,
                    ): photo_detail
                    for photo_detail in all_photos_to_download
                }
                total_futures = len(future_to_photo)
                # Report roughly every 5% and always on the last item.
                progress_step = total_futures // 20 or 1
                for processed_count, future in enumerate(
                        concurrent.futures.as_completed(future_to_photo), start=1):
                    try:
                        if future.result():
                            successful_downloads += 1
                        else:
                            failed_downloads += 1
                    except Exception as exc:
                        print(f'  线程下载中产生异常: {exc}')
                        failed_downloads += 1
                    if processed_count == total_futures or processed_count % progress_step == 0:
                        print(f"    下载进度: {processed_count}/{total_futures} (成功: {successful_downloads}, 失败: {failed_downloads})")

            print(f"\n👍 多线程下载完成。成功/已存在: {successful_downloads} 张, 失败: {failed_downloads} 张。")
            task_duration = time.time() - self.task_start_time
            print(f"⏱️  关键词 '{self.search_keyword_original}' 任务耗时: {task_duration:.2f} 秒。")
        finally:
            self.close_browser()

    def close_browser(self):
        """Quit the WebDriver if one is open; calling it again is a no-op."""
        driver = getattr(self, 'driver', None)
        if not driver:
            return
        print("👋 正在关闭浏览器...")
        try:
            driver.quit()
            print("✅ 浏览器已关闭。")
        except Exception as e:
            print(f"🚨 关闭浏览器时发生错误: {e}")
        finally:
            # Drop the reference so a second call does not quit the driver again.
            self.driver = None
if __name__ == "__main__":
    # Entry point: run one scraper per configured task, sequentially, with a
    # short random pause between tasks, and report the total run time.
    script_total_start_time = time.time()

    # Previous batch (Germany), each as "<city> skyline cityscape", 10 pages:
    # Berlin, Munich, Frankfurt, Hamburg, Leipzig, Heidelberg.

    # Current batch (France).
    tasks = [
        {"keyword": "France Marseille skyline cityscape", "pages": 10},
        {"keyword": "France Lyon skyline cityscape", "pages": 10},
        {"keyword": "France Toulouse skyline cityscape", "pages": 10},
        {"keyword": "France Nice skyline cityscape", "pages": 10},
        {"keyword": "France Nantes skyline cityscape", "pages": 10},
        {"keyword": "France Bordeaux monuments cityscape", "pages": 10},
    ]
    # TODO: add other European cities (Spain, ...).

    if not os.path.exists(COOKIES_JSON_FILE):
        # Without the exported cookies no task can run, so none is started.
        print(f"🚨 严重错误: '{COOKIES_JSON_FILE}' 文件未找到！请先导出 Cookies。")
        print(f"   请从 Unsplash 网站 (例如 unsplash.com) 导出 cookies 并保存为 '{COOKIES_JSON_FILE}'")
    else:
        if not os.path.exists(API_RESPONSES_DIR):
            os.makedirs(API_RESPONSES_DIR)
            print(f"✅ 已创建总 API 响应目录: '{API_RESPONSES_DIR}'")

        total_tasks = len(tasks)
        for i, task_info in enumerate(tasks):
            search_query = task_info["keyword"]
            num_pages = task_info["pages"]
            print(f"\n===== [任务 {i + 1}/{total_tasks}] 开始处理: 关键词='{search_query}', 页数={num_pages} =====")

            scraper_instance = None
            try:
                scraper_instance = UnsplashScraperCombined(
                    search_keyword=search_query,
                    num_pages_to_scrape=num_pages,
                )
                scraper_instance.scrape_and_download()
            except Exception as e_main_task:
                # One failing keyword must not stop the remaining tasks.
                print(f"🚨 处理关键词 '{search_query}' 时发生主错误: {e_main_task}")
                if scraper_instance and hasattr(scraper_instance, 'driver') and scraper_instance.driver:
                    scraper_instance.close_browser()

            print(f"===== [任务 {i + 1}/{total_tasks}] 完成: 关键词='{search_query}' =====\n")
            if i < total_tasks - 1:
                # Pause between tasks to keep the request rate low.
                sleep_duration = random.uniform(3, 7)
                print(f"😴 任务间休眠 {sleep_duration:.1f} 秒...")
                time.sleep(sleep_duration)

    script_total_end_time = time.time()
    script_total_duration = script_total_end_time - script_total_start_time
    print(f"🏁 所有任务处理完毕！总耗时: {script_total_duration:.2f} 秒 ({script_total_duration / 60:.2f} 分钟)。")

Unsplash 高清美图批量下载器：我的私人定制版“风景收割机”

在经历了与 Unsplash 服务器反爬机制的一番“亲切交流”后，我终于炼成了这款“Unsplash 高清美图批量下载器”！它可能不是最高效的，但绝对是我目前用着最顺手、也最有故事的一款工具。下面，就让我来给你详细介绍一下它的“十八般武艺”以及如何让它为你服务。

一、它能帮你做什么？（功能清单）

这款小工具虽然代码量不大，但五脏俱全，主要能帮你实现以下功能：

精准关键词搜索与多页爬取：
- 你可以指定任意你感兴趣的搜索关键词（比如 “法国里昂城市风光”, “日本北海道雪景”）。
- 可以设定要为每个关键词爬取多少页的图片信息（每页通常包含20张图片）。
模拟浏览器高级“潜入”：
- 核心武器是 Selenium！它会启动一个真实的 Chrome 浏览器（在无头模式下静默运行，不打扰你）。
- 最关键的是，它能加载你手动从浏览器导出的 Unsplash 网站 Cookies。这意味着，它能完美模拟你已登录或已配置偏好的浏览器会话，轻松绕过那些让普通 requests 脚本头疼的“身份验证”和反爬检查。
智能图片信息获取与保存：
- Selenium 会带着你的“身份凭证”（Cookies）去访问 Unsplash 内部的图片搜索 API。
- 获取到的每一页 API 响应（原始的 JSON 数据）都会被完整地保存到本地的 api_responses/[你的搜索关键词]/api_page_X.json 文件中。这就像是把每次“侦察”到的情报都存档了，方便后续查阅或重新处理。
高清原图下载与命名优化：
- 从 API 数据中提取出图片的高清原图下载链接（通常是 raw 链接）。
- 图片文件名会尝试从图片 URL 中提取类似 photo-xxxxxxxxx.jpg 这样更具辨识度的部分，如果提取失败，则会使用图片自身的唯一 ID 作为文件名，确保每个文件都有名有姓。
分类整理，井井有条：
- 所有下载的图片都会自动保存在以搜索关键词命名的子文件夹下（位于 images/ 目录中）。比如，搜索 “京都红叶” 的图片，会存放在 images/京都红叶/ 文件夹里，找图一目了然。
多线程加速下载：
- 图片下载部分采用了多线程技术（默认配置了10个线程），并发下载，大大缩短了等待时间，让美图更快到碗里来！
任务化与自动化：
- 你可以轻松地在脚本中配置一个任务列表，一次性安排多个关键词的爬取任务。脚本会按顺序逐个完成。
过程反馈与计时：
- 脚本在运行过程中会打印详细的日志信息，让你清楚地了解当前的进度（例如，正在处理哪个关键词、第几页、下载进度等）。
- 它还会为每个关键词任务以及整个脚本的运行计时，让你对效率有个直观的感受。

简单来说，就是你提供关键词和页数，再“喂”给它一些你浏览器里的 Unsplash “小饼干”，它就能勤勤恳恳地帮你把对应的高清大图搬运到你的硬盘里，并且整理得妥妥当当。

二、如何驾驭这台“风景收割机”？（使用指南）

想让这台“收割机”为你所用？很简单，只需几步：

准备“燃料”—— 安装必要的库：
打开你的终端或命令行，确保安装了以下 Python 库：
```
pip install selenium requests webdriver-manager
```
(如果你已经安装过，可以跳过此步。)
获取“通行证”—— 导出 Unsplash Cookies:
- 关键步骤！ 打开你的 Chrome 浏览器（或其他支持导出 Cookies 插件的浏览器）。
- 访问 https://unsplash.com/。如果你有 Unsplash 账号并且通常是登录状态，请先登录。进行一些浏览或搜索操作，确保 Cookies 是活跃的。
- 安装一个浏览器插件，比如 “EditThisCookie”。
- 使用该插件，导出 unsplash.com 域下的所有 Cookies。通常会导出一个 JSON 格式的文件。
- 将这个导出的 JSON 文件重命名为 add_cookies_from_EditThisCookie.json （脚本中是这样配置的），并将其放置在与你保存的 Python 爬虫脚本相同的目录下。

配置你的“狩猎”目标：
打开 Python 爬虫脚本（就是你保存的那份 .py 文件），找到末尾的 if __name__ == "__main__": 部分。你会看到一个类似这样的 tasks 列表：

if __name__ == "__main__":
    script_total_start_time = time.time()

    # --- User configuration ---
    tasks = [
        # Example: search "France Marseille skyline cityscape" and fetch 10 pages.
        {"keyword": "France Marseille skyline cityscape", "pages": 10},
        {"keyword": "France Lyon skyline cityscape", "pages": 10},
        # ... add more tasks in the same format ...
    ]
    # ... rest of the script ...

修改或添加任务: 根据你的喜好，修改 keyword（你想搜索的图片主题）和 pages（你想为这个主题爬取多少页）。

启动“收割机”！：
一切准备就绪后，在你的终端或命令行中，切换到脚本所在的目录，然后运行：
```
python your_script_name.py 
```
(将 your_script_name.py 替换为你实际保存的脚本文件名。)
坐和放宽，欣赏成果：
- 脚本会自动启动一个无头 Chrome 浏览器（你看不到界面的那种），加载 Cookies，然后开始工作。
- 你会在控制台看到它打印的进度信息。
- 爬取到的 API JSON 数据会保存在运行脚本时所在目录下的 api_responses/[你的搜索关键词]/ 文件夹中。
- 下载的图片会保存在 images/[你的搜索关键词]/ 文件夹中。
- 耐心等待所有任务完成，脚本会告诉你总共耗时多久。

一些小贴士和注意事项：

Cookies 的时效性: 浏览器 Cookies 是有有效期的，而且某些操作（如退出登录）可能会使它们失效。如果脚本突然无法正常工作（例如，API 获取失败，或者图片下载出现403），首先要考虑的就是 Cookies 是否过期了。这时，你需要重新从浏览器导出最新的 Cookies，并替换掉旧的 add_cookies_from_EditThisCookie.json 文件。
网络环境: 确保你的网络连接稳定。如果你的网络环境需要代理才能访问 Unsplash，你需要确保 Selenium 和 requests 都正确配置了代理：当前脚本中 requests 部分并未显式配置代理，它默认会读取 HTTP_PROXY / HTTPS_PROXY 等环境变量（没有设置时则不使用代理）；Selenium 启动的 Chrome 则需要另行配置，例如通过 --proxy-server 启动参数。
ChromeDriver: 脚本使用了 webdriver-manager 来自动下载和管理 ChromeDriver，这通常能解决大部分驱动版本不匹配的问题。如果遇到 WebDriver 相关的错误，检查你的 Chrome 浏览器是否为最新版本。
尊重网站规则: 虽然我们用了些“小技巧”，但还是要注意爬取频率，避免对 Unsplash 服务器造成过大负担。任务间的休眠有助于此。Unsplash 也有官方的开发者 API，如果你的需求更偏向于应用集成，那会是更规范的选择。

好了，指南就到这里！希望这款小工具能帮你轻松收集到心仪的高清美图，让你的桌面和图库都充满惊喜！如果在使用过程中遇到什么问题，或者有什么好点子，欢迎随时交流！

请帮我写一篇文章来介绍这个项目。 从最初的要求到最后实现的效果。 要求1. 篇幅不要太长，语言幽默有趣， 平易近人， 有吸引力。 2. 重点介绍的是起因，即，需求和起因增加篇幅，其他地方篇幅减少。3. 用中文写。单独新建一个文件。4. 请帮我分条理, 看起来很清晰。

好的，没问题！这真是一段有趣的“折腾”之旅，从一个简单的想法到最终实现一个功能完善的爬虫，确实值得记录一下。

下面是一篇关于这个 Unsplash 图片爬虫项目的文章，希望能符合你的要求：

我的 Unsplash 爬虫“奇遇记”：从一个API到满硬盘的美图

嘿，老兄！今天想跟你聊聊我最近捣鼓的一个小项目——一个 Unsplash 图片爬虫。你可能会说：“爬虫？听起来好高大上，是不是特复杂？”别急，且听我慢慢道来，这趟旅程可比你想的要“接地气”多了，中间还充满了各种啼笑皆非的“斗智斗勇”。

第一章：最初的“小目标”与美丽的“误会”

故事的起源，往往简单得可爱。

一切都源于一个朴素的愿望：我想从 Unsplash 上下载一些特定主题的高清壁纸，比如“柏林城市风光”、“法国马赛街景”之类的。手动一张张右键保存？太慢了，程序员的“懒惰”可是第一生产力！

于是，我寻寻觅觅，发现了一个看起来像是“藏宝图入口”的 API 地址：

https://unsplash.com/napi/search/photos?orientation=landscape&page=2&per_page=20&plus=none&query=sweden

这玩意儿，就像一个会自动贩卖图片的机器，你告诉它关键词（比如 query=sweden），它就吐出20张横向图片的信息，里面包含了图片的下载链接和独一无二的 ID。我的小目标就是：写个 Python 脚本，自动请求这个 API，然后把图片哗啦啦地下载到本地 images/ 文件夹，文件名就用图片的 ID。

美丽的“误会”——差点请出“屠龙刀”

一开始，我寻思着：“爬虫嘛，是不是得上 Scrapy 这把‘屠龙刀’？” 毕竟名声在外，功能强大。于是，我还煞有介事地构思了如何用 Scrapy 的 Item, Pipeline, Spider 来组织我的代码，甚至连启动命令都想好了：

import shlex
from scrapy import cmdline
# ...此处省略了N行关于Scrapy的宏伟蓝图...

结果呢？刚起了个头，我就发现，对于我这个“只想从一个固定 API 地址下载几页图片”的小需求来说，Scrapy 简直就是“杀鸡用牛刀”，不，是“大炮打蚊子”！太复杂了，光是理解那些概念就够我喝一壶的。于是，我果断放弃了这个“美丽的误会”，决定返璞归真。

第二章：“简单点，请求的方式简单点”—— `requests` 的闪亮登场与“403大魔王”

回到梦开始的地方：requests 库

“既然只是请求一个 URL，那 Python 的 requests 库不就挺好？” 我一拍大腿，对啊！简单直接，几行代码就能搞定网络请求。于是，关键词固定为 sweden，先爬1页试试水。

“当头一棒”—— 403 Forbidden

代码写好，信心满满地运行，结果……啪！一个鲜红的 403 Client Error: Forbidden 甩在了我的脸上。服务器冷冰冰地告诉我：“禁止访问！” 我纳闷了，这 API 地址在浏览器里打开得好好的，怎么用脚本就不行了呢？响应内容空空如也，连个解释的机会都不给。

第三章：与“403大魔王”的拉锯战

这下可把我给整不会了。接下来就是一段漫长而曲折的“Debug 血泪史”：

伪装！我是浏览器！
我想，肯定是服务器看我像个脚本，不给面子。于是，我祭出了爬虫工程师的必备技能——伪造 Headers！User-Agent, Accept-Language, Referer……能从浏览器开发者工具里抄的全抄上，把自己打扮成一个“根正苗红”的 Chrome 浏览器。结果呢？依然 403！Varnish 服务器（从响应头里看到的）像个铁面无私的门卫，根本不吃我这套。
Cookie！神秘的“小饼干”
我注意到浏览器访问时，请求头里带着一大串 Cookie。这玩意儿就像网站给你的“通行证”。于是，我把浏览器里的 Cookie 复制粘贴到脚本里。心情激动地再次运行……还是 403！难道这“小饼干”是“一次性”的？或者它和我脚本的“气质”不符？
代理！换个“马甲”试试？
会不会是我的 IP 地址上了“黑名单”？我尝试加上了本地代理。结果……你懂的，依旧是熟悉的 403。

这段时间，我就像一个侦探，对着请求头、响应头翻来覆去地研究，尝试各种组合，但“403大魔王”就是不肯放行。我甚至开始怀疑人生，是不是 Unsplash 的程序员在我电脑对面，边喝咖啡边嘲笑我：“小样儿，还想爬我？”

第四章：救星登场！Selenium 大法好！

就在我快要放弃，准备回去一张张右键保存的时候，一道灵光闪过：“如果 requests 模拟得不像，那我直接用一个真正的浏览器来访问不就行了？”

于是，Selenium 这位“浏览器操控大师”闪亮登场！

我的策略是：

曲线救国：用 Selenium 打开浏览器。
手动喂 Cookie：把我从浏览器（配合 EditThisCookie 插件）导出的、针对 Unsplash 网站的 完整 Cookie 信息，通过 Selenium 加载到它的浏览器实例中。
让 Selenium 访问 API：因为 Selenium 控制的浏览器现在有了“合法身份”（即有效的 Cookies），它再去访问那个 napi 的 API 地址。
提取 JSON，然后快速下载：Selenium 成功获取到 API 返回的 JSON 数据后，这个“慢吞吞”的大家伙的任务就完成了。后续解析 JSON 和下载图片，还是交给轻巧高效的 requests 库，并且使用多线程加速下载。

效果拔群！

当我把这一套流程跑起来后，奇迹发生了！Selenium 带着我的“皇家曲奇”，成功地从 API 获取到了 JSON 数据！那一刻，我感觉整个世界都亮了！后续的图片下载也顺理成章。

第五章：精益求精，打造称手“工具”

解决了核心的“身份认证”问题后，我又对脚本进行了一些优化和功能增强：

参数化配置：搜索关键词和要爬取的页数，都可以方便地在脚本末尾配置，想爬什么城市的风景，想爬多少页，改改参数就行。
图片分类存储：下载的图片会按照搜索关键词自动创建子文件夹进行存放，比如 images/柏林城市风光/，井井有条。
API 响应也存一份：Selenium 获取到的原始 API JSON 数据，也会按照关键词和页码保存下来，方便以后查阅或重新处理，不用每次都启动 Selenium。
无头模式运行：Selenium 可以在“无头模式”下静默运行，不弹出浏览器窗口，更像一个后台脚本。
多线程下载加速：图片下载使用了多线程，比如同时开10个线程去下载，大大提高了效率。
任务计时：为每个关键词的爬取任务和整个脚本的运行都加上了计时，看看成果，也方便优化。
文件名优化：图片文件名从最初的图片ID，改为了从图片URL中提取的、更具可读性的 photo-xxxx.jpg 格式。