scrapy-redis project: scraping book information from a website
Goal
Website: All products | Books to Scrape - Sandbox (http://books.toscrape.com)
Requirements
- Extract the title (title), price (price), category (category), description (description), UPC code (upc), page URL (url), and image URL (img_url).
- Handle pagination where it exists.
- Save the results to a CSV file.
Steps
- Create the Scrapy project.
- Modify the required settings in settings.py.
- Define the item fields in items.py based on the requirements.
- Inspect the target pages and analyze their structure.
- Get the category list from the home page and extract each category's name and link.
- Join each extracted relative URL with the base URL and send a request to the category page.
- Collect the detail-page link of every book on the category page; if the page holds a full 20 items, follow the next-page link.
- Open each book's detail page and extract the fields listed above.
- Update settings.py to switch to Redis.
- Change the spider's base class to RedisSpider.
- Open a terminal and run redis-cli.
- Push the initial URL into the Redis list (books:start_urls), as shown in the redis-cli sketch after this list:
  lpush books:start_urls http://books.toscrape.com
- Open several terminals and run scrapy crawl books to start the distributed crawl.
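A minimal redis-cli session for seeding and checking the crawl might look like the sketch below. The key names (books:requests, books:dupefilter, books:items) are the scrapy-redis defaults for a spider named books, and the host/port are assumed to be the local defaults from settings.py.

# Seed the start URL (same command as above, run from the shell instead of the interactive prompt)
redis-cli lpush books:start_urls http://books.toscrape.com
# Confirm the key holds the URL before any spider consumes it
redis-cli llen books:start_urls
# While the spiders run, scrapy-redis typically creates these keys for a spider named "books"
redis-cli keys 'books:*'
# Peek at one item serialized by RedisPipeline
redis-cli lrange books:items 0 0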
Code
settings.py
BOT_NAME = "scrapy_day04"
SPIDER_MODULES = ["scrapy_day04.spiders"]
NEWSPIDER_MODULE = "scrapy_day04.spiders"
ADDONS = {}
# Enable the Redis scheduler
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Enable the Redis duplicate filter
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Keep the request queue and fingerprint set in Redis when the spider closes (allows pause/resume)
SCHEDULER_PERSIST = True
# Scheduler queue class (priority queue)
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.SpiderPriorityQueue'
# Redis connection settings (change as needed)
REDIS_HOST = 'localhost'  # use 'redis' if Redis runs in a Docker container
REDIS_PORT = 6379  # port number
# REDIS_PASSWORD = 'your password'
# Item pipeline (stores items in Redis); items could also be written to a MySQL database instead
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 300,
}
# Number of concurrent requests
CONCURRENT_REQUESTS = 32
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36 Edg/140.0.0.0"
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Concurrency and throttling settings
#CONCURRENT_REQUESTS = 16
CONCURRENT_REQUESTS_PER_DOMAIN = 1
DOWNLOAD_DELAY = 1
# Set settings whose default value is deprecated to a future-proof value
FEED_EXPORT_ENCODING = "utf-8"
# CSV export configuration
FEEDS = {
    '图书.csv': {
        'format': 'csv',
        'encoding': 'utf8',
        'fields': ['title', 'price', 'description', 'upc', 'url', 'img_url', 'category'],
        'overwrite': True,
    },
}
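The pipeline comment above notes that items could also be stored in a MySQL database. A minimal pipeline sketch for that, assuming the pymysql package, a reachable MySQL server, and a books table whose columns match the item fields (the connection parameters, database name, and table name here are placeholders, not part of the project):

import pymysql

class MySQLPipeline:
    """Illustrative alternative to RedisPipeline: write each item to MySQL."""

    def open_spider(self, spider):
        # Connection parameters are placeholders; adjust to your environment.
        self.conn = pymysql.connect(host='localhost', user='root',
                                    password='your_password', database='books_db',
                                    charset='utf8mb4')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        sql = ("INSERT INTO books (title, price, category, description, upc, url, img_url) "
               "VALUES (%s, %s, %s, %s, %s, %s, %s)")
        self.cursor.execute(sql, (item['title'], item['price'], item['category'],
                                  item['description'], item['upc'], item['url'],
                                  item['img_url']))
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()

To use it, you would register it in ITEM_PIPELINES (for example 'scrapy_day04.pipelines.MySQLPipeline': 400), alongside or instead of RedisPipeline.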
items.py
import scrapy
class ScrapyDay04Item(scrapy.Item):
    # define the fields for your item here like:
    # title
    title = scrapy.Field()
    # price
    price = scrapy.Field()
    # category
    category = scrapy.Field()
    # description
    description = scrapy.Field()
    # UPC code
    upc = scrapy.Field()
    # page URL
    url = scrapy.Field()
    # image URL
    img_url = scrapy.Field()
books.py
import scrapy
from urllib.parse import urljoin
from ..items import ScrapyDay04Item
from scrapy_redis.spiders import RedisSpider
class BooksSpider(RedisSpider):
    name = "books"
    allowed_domains = ["books.toscrape.com"]
    # RedisSpider pulls its start URLs from this Redis key (seeded with lpush),
    # so the hard-coded start_urls list is no longer used.
    redis_key = "books:start_urls"
    # start_urls = ["https://books.toscrape.com"]
    def parse(self, response):
        # Get the category list from the home page
        category_list = response.xpath("//*/div[@class='side_categories']/ul/li/ul/li")
        print('Number of categories found:', len(category_list))
        for category_item in category_list:
            # Category name
            category_item_title = category_item.xpath("./a/text()").extract_first().strip()
            # Relative URL of the category page
            category_item_url = category_item.xpath("./a/@href").extract_first().strip()
            # Join the relative URL with the base URL and request the category page
            category_item_url = urljoin(response.url, category_item_url)
            print(f'title:{category_item_title}--------url:{category_item_url}')
            yield scrapy.Request(url=category_item_url,
                                 callback=self.parse_category_page,
                                 meta={"category": category_item_title})
    # Parse a category page
    def parse_category_page(self, response):
        print(f"Parsing category: {response.meta['category']}")
        # Collect the detail-page links of all books on this page
        books_url_list = response.xpath("//*/article[@class='product_pod']/h3/a/@href").extract()
        # Number of books on the current page
        books_url_list_len = len(books_url_list)
        for books_url_item in books_url_list:
            # Join the relative URL with the base URL and request the detail page
            book_url = urljoin(response.url, books_url_item)
            print(f'Requesting detail page: {book_url}')
            yield scrapy.Request(url=book_url,
                                 callback=self.parse_books,
                                 meta={"category": response.meta['category']})
        # A full page holds 20 books, so there may be a next page to follow
        next_url = response.xpath("//*/li[@class='next']/a/@href").extract_first()
        if books_url_list_len >= 20 and next_url:
            next_url = urljoin(response.url, next_url.strip())
            print('Next page:', next_url)
            yield scrapy.Request(url=next_url,
                                 callback=self.parse_category_page,
                                 meta={"category": response.meta['category']})
        else:
            print('Already on the last page')
    # Extract the book fields
    def parse_books(self, response):
        # Title
        book_title = response.xpath("//*/div[@class='col-sm-6 product_main']/h1/text()").extract_first()
        book_title = book_title.strip() if book_title else "unknown title"
        # Price
        book_price = response.xpath("//*/div[@class='col-sm-6 product_main']/p/text()").extract_first()
        book_price = book_price.strip() if book_price else "unknown price"
        # Description
        book_description = response.xpath("//*/article[@class='product_page']/p/text()").extract_first()
        book_description = book_description.strip() if book_description else "no description"
        # UPC code, with a None check:
        # following-sibling selects all siblings after the current node,
        # and following-sibling::td narrows that to the td siblings
        book_upc = response.xpath('//th[text()="UPC"]/following-sibling::td/text()').extract_first()
        book_upc = book_upc.strip() if book_upc else "unknown UPC"
        # Page URL
        book_url = response.url
        # Image URL
        book_img_url = response.xpath("//*/div[@class='item active']/img/@src").extract_first()
        if book_img_url:
            book_img_url = urljoin(response.url, book_img_url.strip())
        else:
            book_img_url = "unknown image URL"
        # Build the item and yield it
        item = ScrapyDay04Item()
        item['title'] = book_title
        item['price'] = book_price
        item['description'] = book_description
        item['upc'] = book_upc
        item['url'] = book_url
        item['img_url'] = book_img_url
        item['category'] = response.meta['category']
        yield item
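With RedisPipeline enabled, scraped items are serialized to JSON and pushed to a Redis list, named <spider>:items by default, i.e. books:items here. A small sketch, assuming the redis-py package and the local Redis instance configured in settings.py, for reading them back:

import json

import redis

# Connection parameters mirror REDIS_HOST/REDIS_PORT in settings.py; adjust as needed.
r = redis.Redis(host='localhost', port=6379, decode_responses=True)

# books:items is the default list RedisPipeline pushes serialized items to.
for raw in r.lrange('books:items', 0, -1):
    book = json.loads(raw)
    print(book['title'], book['price'], book['category'])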