Python Web Scraping Case Studies (updated from time to time)

Contents

  • I. Web Scrapers — DrissionPage Module
    • 1. Boss 直聘 (scroll pagination)
    • 2. 51job / 前程无忧 (click pagination)
    • 3. Zhaopin / 智联招聘 (click pagination)
    • 4. Liepin / 猎聘网 (click pagination)
  • II. Web Scrapers — Requests Module
    • 1. Dewu / 得物 (reverse-engineered JS signing)
    • 2. Xianyu / 闲鱼 (MD5 signing)


I. Web Scrapers — DrissionPage Module

DrissionPage module documentation: [DrissionPage official site]
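
All four job-board scrapers below share one packet-sniffing pattern: start a network listener, load the page so the site fires its own XHR requests, wait for the matching packet, and read the parsed JSON from its response body. A minimal sketch of that loop, assuming the same DrissionPage listener API the scripts below rely on (the URL fragment and page address are placeholders):

# Minimal packet-listening sketch; 'some/api/path' and the page URL are placeholders.
from DrissionPage import ChromiumPage

page = ChromiumPage()                   # launch or attach to a Chromium browser
page.listen.start('some/api/path')      # capture requests whose URL contains this fragment
page.get('https://example.com/search')  # load the page; the site issues the XHR itself
packet = page.listen.wait(timeout=5)    # block until a matching packet is captured
if packet:
    print(packet.response.body)         # the JSON body arrives as a Python dict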

1. Boss 直聘 (scroll pagination)

Python code:

# Import the automation module
from DrissionPage import ChromiumPage
import time
import csv


def deal_with(data_r, cd):
    # Pull the job list out of the response dict
    data_list = data_r['zpData']['jobList']
    # Iterate over the list elements
    for index in data_list:
        # Split the salary string into range and pay system
        salary_list = index['salaryDesc'].split('·')
        salary = salary_list[0]
        if len(salary_list) == 2:
            salary_system = salary_list[1]
        else:
            salary_system = '12薪'
        # Collect the fields of interest into a dict
        temporarily_dict = {
            '公司名称': index['brandName'],
            '公司行业': index['brandIndustry'],
            '公司规模': index['brandScaleName'],
            '融资阶段': index['brandStageName'],
            '工作区域': index['cityName'] + ' ' + index['areaDistrict'] + ' ' + index['businessDistrict'],
            '学历要求': index['jobDegree'],
            '工作经验': index['jobExperience'],
            '职位名称': index['jobName'],
            '薪资待遇': salary,
            '薪资制度': salary_system,
            '沟通职员': index['bossTitle'] + '-' + index['bossName'],
            '所需技能': ' '.join(index['skills']),
            '公司福利': ' '.join(index['welfareList']),
        }
        cd.writerow(temporarily_dict)


def main():
    # Open the browser (instantiate the browser object)
    google = ChromiumPage()
    # Listen for the data packet (adjust to your target)
    google.listen.start(r"wapi/zpgeek/search/joblist.json")
    # Visit the target page (adjust to your target)
    google.get(r"https://www.zhipin.com/web/geek/jobs?city=101280100&query=%E9%A1%B9%E7%9B%AE%E5%8A%A9%E7%90%86")
    # Create the output file object
    f = open('boss_project_assistant.csv', mode='a', encoding='utf-8-sig', newline='')
    # Dict-based CSV writer
    cd = csv.DictWriter(f, fieldnames=['公司名称', '公司行业', '公司规模', '融资阶段', '工作区域', '学历要求',
                                       '工作经验', '职位名称', '薪资待遇', '薪资制度', '沟通职员', '所需技能',
                                       '公司福利'])
    cd.writeheader()
    num = 50
    for page in range(1, num + 1):
        print(f'Processing page {page}…')
        # Wait for the data packet to load
        try:
            data_load = google.listen.wait(timeout=2)
        except TimeoutError:
            print('Timed out')
            exit(0)
        else:
            if data_load:
                # Get the response body (a dict)
                data_response = data_load.response.body
                # Process the data
                deal_with(data_response, cd)
                if page < num:
                    # Scroll to the bottom of the page
                    google.scroll.to_bottom()
                    time.sleep(1)
            else:
                print('No more information!')
                exit(1)
    # Close the CSV file once the loop finishes
    f.close()


if __name__ == '__main__':
    main()

Run output:

2. 51job / 前程无忧 (click pagination)

Python code:

# Import the automation module
from DrissionPage import ChromiumPage
import csv


def deal_with(data_r, cd):
    # Pull the job list out of the response dict
    data_list = data_r['resultbody']['job']['items']
    # Iterate over the list elements
    for index in data_list:
        # Parse the salary
        salary_list = index['provideSalaryString'].split('·')
        salary = salary_list[0]
        salary_system = '12薪'
        if len(salary_list) == 2:
            salary_system = salary_list[1]
        # Parse the district
        district_string = '未知'
        if 'districtString' in index['jobAreaLevelDetail']:
            district_string = index['jobAreaLevelDetail']['districtString']
        # Parse the company industry
        company_type = index['companyIndustryType1Str']
        if 'companyIndustryType2Str' in index and index['companyIndustryType2Str'] != index['companyIndustryType1Str']:
            company_type = index['companyIndustryType1Str'] + ';' + index['companyIndustryType2Str']
        # Parse the HR status fields
        hr_labels, hr_active_status_green, hr_info = '未知', '未知', '未知'
        if 'hrLabels' in index and index['hrLabels'] != []:
            hr_labels = index['hrLabels'][0]
        if 'hrActiveStatusGreen' in index:
            hr_active_status_green = index['hrActiveStatusGreen']
        if 'hrPosition' in index and 'hrName' in index:
            hr_info = index['hrPosition'] + '-' + index['hrName']
        # Collect the fields of interest into a dict
        temporarily_dict = {
            '公司名称': index['fullCompanyName'],
            '公司性质': index['companyTypeString'],
            '公司领域': company_type,
            '公司规模': index['companySizeString'],
            '职位名称': index['jobName'],
            '优先专业': index['major1Str'] + ' ' + index['major2Str'],
            '所在省份': index['jobAreaLevelDetail']['provinceString'],
            '所在城市': index['jobAreaLevelDetail']['cityString'],
            '所在地区': district_string,
            '薪资范围': salary,
            '薪资制度': salary_system,
            '工作形式': index['termStr'],
            '所需学历': index['degreeString'],
            '所需经验': index['workYearString'],
            '沟通HR': hr_info,
            '处理速度': hr_labels,
            '在线时间': hr_active_status_green,
            '投递频率': index['applyTimeText'],
            '公司详情页': index['companyHref'],
            '其他标签': ','.join(index['jobTags'])
        }
        cd.writerow(temporarily_dict)


def main():
    # Open the browser (instantiate the browser object)
    google = ChromiumPage()
    # Listen for the data packet
    google.listen.start(r"api/job/search-pc")
    # Visit the target page
    google.get(r"https://we.51job.com/pc/search?jobArea=260200&keyword=Python&searchType=2&keywordType=")
    # Create the output file object
    f = open('51job_artificial_intelligence.csv', mode='a', encoding='utf-8-sig', newline='')
    # Dict-based CSV writer
    cd = csv.DictWriter(f, fieldnames=['公司名称', '公司性质', '公司领域', '公司规模',
                                       '职位名称', '优先专业', '所在省份', '所在城市',
                                       '所在地区', '薪资范围', '薪资制度', '工作形式',
                                       '所需学历', '所需经验', '沟通HR', '处理速度',
                                       '在线时间', '投递频率', '公司详情页', '其他标签'])
    cd.writeheader()
    num = 10
    for page in range(1, num + 1):
        if page == 1:
            # Scroll to the bottom of the page
            google.scroll.to_bottom()
            # Locate the next-page button and click it
            button = google.ele('css:.el-icon-arrow-right')
            button.run_js('this.click();')
            google.scroll.to_bottom()
            # Pause listening and clear the captured queue
            google.listen.pause(clear=True)
            # Resume the paused listener
            google.listen.resume()
            # Locate the previous-page button and click it
            button = google.ele('css:.el-icon-arrow-left')
            button.run_js('this.click();')
        # Wait for the data packet to load
        try:
            data_load = google.listen.wait(timeout=2)
        except TimeoutError:
            print('Timed out')
            exit(0)
        else:
            if data_load:
                print(f'Processing page {page}…')
                # Get the response body (a dict)
                data_response = data_load.response.body
                # Process the data
                deal_with(data_response, cd)
                if page < num:
                    # Scroll to the bottom of the page
                    google.scroll.to_bottom()
                    # Locate the next-page button and click it
                    button = google.ele('css:.el-icon-arrow-right')
                    button.run_js('this.click();')
            else:
                print('No more information!')
                exit(1)
    # Close the CSV file once the loop finishes
    f.close()


if __name__ == '__main__':
    main()

dp.ele() → locates an element with a selector taken from the browser's Elements panel, where dp is the browser object and ele is short for element.
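
For reference, ele() accepts several locator syntaxes; a minimal sampler, assuming a page object dp (the selectors themselves are illustrative, not taken from any of the sites above):

button = dp.ele('css:.el-icon-arrow-right')  # CSS selector, as used in the script above
box = dp.ele('#search-box')                  # id shorthand
row = dp.ele('xpath://div[@class="row"]')    # XPath expression
nxt = dp('下一页')                            # calling the page object matches by element text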

Run output:

3. Zhaopin / 智联招聘 (click pagination)

Python code:

# Import the automation module
from DrissionPage import ChromiumPage
import csv


def deal_with(data_r, cd):
    # Pull the job list out of the response dict
    data_list = data_r['data']['list']
    # Iterate over the list elements
    for index in data_list:
        # Parse the skills
        skill_result = ''
        for skill_dictionary in index['skillLabel']:
            for key, value in skill_dictionary.items():
                if key == 'value':
                    skill_result += value + ' '
        # Parse the salary
        salary_list = index['salary60'].split('·')
        salary = salary_list[0]
        salary_system = '12薪'
        if len(salary_list) == 2:
            salary_system = salary_list[1]
        # Parse the welfare tags
        welfare_str = ' '.join(index['welfareTagList'])
        if 'jobKnowledgeWelfareFeatures' in index and len(index['jobKnowledgeWelfareFeatures']) > len(index['welfareTagList']):
            welfare_str = ' '.join(index['jobKnowledgeWelfareFeatures'])
        # Parse the HR reply speed
        hr_processing_speed = '未知'
        if 'hrStateInfo' in index and len(index['hrStateInfo']) > 0:
            hr_processing_speed = index['hrStateInfo']
        # Collect the fields of interest into a dict
        temporarily_dict = {
            '公司名称': index['companyName'],
            '公司性质': index['property'],
            '公司领域': index['industryName'],
            '公司规模': index['companySize'],
            '职位名称': index['name'],
            '所在城市': index['workCity'],
            '所在地区': index['cityDistrict'],
            '所在街道': index['streetName'],
            '公司源址': index['jobRootOrgInfo']['cityName'],
            '薪资范围': salary,
            '薪资制度': salary_system,
            '工作形式': index['workType'],
            '所需学历': index['education'],
            '所需经验': index['workingExp'],
            '所需技能': skill_result,
            '沟通HR': index['staffCard']['hrJob'] + '-' + index['staffCard']['staffName'],
            '处理速度': hr_processing_speed,
            '在线时间': index['staffCard']['hrOnlineState'],
            '公司详情页': index['companyUrl'],
            '职位详情页': index['positionUrl'],
            '其他福利': welfare_str
        }
        cd.writerow(temporarily_dict)


def main():
    # Open the browser (instantiate the browser object)
    google = ChromiumPage()
    # Listen for the data packet
    google.listen.start(r"c/i/search/positions")
    # Visit the target page
    google.get(r"https://www.zhaopin.com/sou/jl765/kw01800U80EG06G03F01N0/p2?kt=3")
    # Create the output file object
    f = open('zhaopin_python.csv', mode='a', encoding='utf-8-sig', newline='')
    # Dict-based CSV writer
    cd = csv.DictWriter(f, fieldnames=['公司名称', '公司性质', '公司领域', '公司规模', '职位名称',
                                       '所在城市', '所在地区', '所在街道', '公司源址', '薪资范围',
                                       '薪资制度', '工作形式', '所需学历', '所需经验', '所需技能',
                                       '沟通HR', '处理速度', '在线时间', '公司详情页', '职位详情页', '其他福利'])
    cd.writeheader()
    num = 10
    for page in range(1, num + 1):
        if page == 1:
            # Pause listening and clear the captured queue
            google.listen.pause(clear=True)
            # Resume the paused listener
            google.listen.resume()
            # Locate the previous-page button and click it
            button = google.ele('css:.soupager a:first-of-type')
            # Scroll the button into view
            google.scroll.to_see(button)
            button.run_js('this.click();')
        # Wait for the data packet to load
        try:
            data_load = google.listen.wait(timeout=2)
        except TimeoutError:
            print('Timed out')
            exit(0)
        else:
            if data_load:
                print(f'Processing page {page}…')
                # Get the response body (a dict)
                data_response = data_load.response.body
                # Process the data
                deal_with(data_response, cd)
                if page < num:
                    # Locate the next-page button and click it
                    button = google.ele('css:.soupager a:last-of-type')
                    # Scroll the button into view
                    google.scroll.to_see(button)
                    button.run_js('this.click();')
            else:
                print('No more information!')
                exit(1)
    # Close the CSV file once the loop finishes
    f.close()


if __name__ == '__main__':
    main()
  • css:.soupager a:first-of-type → css:.soupager targets the tag whose class name is soupager; a:first-of-type selects the first a tag inside it.
  • css:.soupager a:last-of-type → css:.soupager targets the tag whose class name is soupager; a:last-of-type selects the last a tag inside it.
  • a:nth-of-type(even) selects the a tags in even positions; a:nth-of-type(odd) selects the ones in odd positions. A short demo of these selectors follows this list.
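
A sketch of how those structural selectors carve up a pager, against hypothetical markup (the link texts are invented for illustration):

# Hypothetical pager markup:
#   <div class="soupager"><a>上一页</a><a>1</a><a>2</a><a>下一页</a></div>
prev_btn = google.ele('css:.soupager a:first-of-type')   # first <a>: 上一页
next_btn = google.ele('css:.soupager a:last-of-type')    # last <a>: 下一页
page_one = google.ele('css:.soupager a:nth-of-type(2)')  # second <a>: the "1" link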

Run output:

4. Liepin / 猎聘网 (click pagination)

Python code:

# Import the automation module
from DrissionPage import ChromiumPage
import csv


def deal_with(data_r, cd):
    # Pull the job list out of the response dict
    data_list = data_r['data']['soJobForms']
    # Iterate over the list elements
    for index in data_list:
        # Parse the salary
        salary_list = index['salary'].split('·')
        salary_value = salary_list[0]
        salary_system = '12薪'
        if len(salary_list) == 2:
            salary_system = salary_list[1]
        # Parse the skills and welfare tags
        skill_value, welfare_value = '', ''
        if 'jobLabels' in index and 'sellingPointList' in index:
            skill_list = [item for item in index['jobLabels'] if item not in index['sellingPointList']]
            if len(skill_list) >= 1:
                skill_value = ';'.join(skill_list)
            if len(index['sellingPointList']) >= 1:
                welfare_value = ';'.join(index['sellingPointList'])
        elif 'jobLabels' in index:
            if len(index['jobLabels']) >= 1:
                skill_value = ';'.join(index['jobLabels'])
        elif 'sellingPointList' in index:
            if len(index['sellingPointList']) >= 1:
                welfare_value = ';'.join(index['sellingPointList'])
        # Parse the contact HR
        hr = index['recruiterName']
        if 'recruiterTitle' in index:
            hr = index['recruiterTitle'] + '-' + index['recruiterName']
        # Parse the company scale
        scale_value = '未知'
        if 'compScale' in index:
            scale_value = index['compScale']
        # Collect the fields of interest into a dict
        temporarily_dict = {
            '公司名称': index['company'],
            '公司领域': index['industry'],
            '公司规模': scale_value,
            '职位名称': index['title'],
            '所在地址': index['dq'],
            '薪资范围': salary_value,
            '薪资制度': salary_system,
            '所需学历': index['requireEduLevel'],
            '所需技能': skill_value,
            '所需经验': index['requireWorkYears'],
            '沟通HR': hr,
            '发布时间': index['date'],
            '公司福利': welfare_value
        }
        cd.writerow(temporarily_dict)


def main():
    # Open the browser (instantiate the browser object)
    google = ChromiumPage()
    # Listen for the data packet
    google.listen.start(r"api/com.liepin.searchfront4c.h5-search-job")
    # Visit the target page
    google.get(r"https://m.liepin.com/zhaopin/?dqs=170020&keyword=Python")
    # Create the output file object
    f = open('liepin_python.csv', mode='a', encoding='utf-8-sig', newline='')
    # Dict-based CSV writer
    cd = csv.DictWriter(f, fieldnames=['公司名称', '公司领域', '公司规模', '职位名称', '所在地址',
                                       '薪资范围', '薪资制度', '所需学历', '所需技能', '所需经验',
                                       '沟通HR', '发布时间', '公司福利'])
    cd.writeheader()
    num = 10
    for page in range(1, num + 1):
        # Wait for the data packet to load
        try:
            data_load = google.listen.wait(timeout=2)
        except TimeoutError:
            print('Timed out')
            exit(0)
        else:
            if data_load:
                print(f'Processing page {page}…')
                # Get the response body (a dict)
                data_response = data_load.response.body
                # Process the data
                deal_with(data_response, cd)
                if page < num:
                    # Scroll to the bottom of the page
                    google.scroll.to_bottom()
                    # Locate the next-page button by its text and click it
                    button = google('下一页')
                    button.run_js('this.click();')
            else:
                print('No more information!')
                exit(1)
    # Close the CSV file once the loop finishes
    f.close()


if __name__ == '__main__':
    main()

Run output:

II. Web Scrapers — Requests Module

1. Dewu / 得物 (reverse-engineered JS signing)

js_file.js module code: [the JS module used when scraping Dewu]
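
The sign parameter is produced by calling a function named c inside that JS file through execjs. A toy demonstration of the bridge, with an invented JS body standing in for the real signing routine:

# Toy execjs bridge; only the call shape matches js_file.js, the JS body is invented.
import execjs

js_source = '''
function c(params) {
    // the real js_file.js derives a signature from the payload;
    // this stub just joins the sorted keys so the interface is visible
    return Object.keys(params).sort().join(',');
}
'''
js_code = execjs.compile(js_source)                       # compile once, reuse for many calls
print(js_code.call('c', {'pageNum': 1, 'pageSize': 24}))  # prints: pageNum,pageSize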

Python code:

import requests
import pandas as pd
# Import the module that compiles JS code
import execjs
# ------------------------------------------------
import openpyxl
from openpyxl.drawing.image import Image as xlImage
from openpyxl.utils import get_column_letter
from PIL import Image
from io import BytesIO


def get_data_xlsx(js_path, save_path):
    # Request headers
    request_header = {
        'accept': '*/*',
        'accept-encoding': 'gzip, deflate, br, zstd',
        'accept-language': 'zh-CN,zh;q=0.9',
        'connection': 'keep-alive',
        'content-length': '124',
        'content-type': 'application/json',
        'cookie': '...',
        'host': 'app.dewu.com',
        'ltk': '...',
        'origin': 'https://www.dewu.com',
        'referer': 'https://www.dewu.com/',
        'sec-ch-ua': '"Google Chrome";v="137", "Chromium";v="137", "Not/A)Brand";v="24"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-site',
        'sessionid': '...',
        'shumeiid': '...',
        'sk': '',
        'traceparent': '...',
        'user-agent': '...'
    }
    # Request URL
    request_url = r'https://app.dewu.com/api/v1/h5/commodity-pick-interfaces/pc/pick-rule-result/feeds/info'
    # Request payload
    request_parameters = {
        'filterUnbid': True,
        'pageNum': 1,  # page number
        'pageSize': 24,
        'pickRuleId': 644443,  # category ID
        'showCspu': True
    }
    # Compile the JS code
    js_code = execjs.compile(open(js_path, encoding='utf-8').read())
    # Compute the sign parameter
    sign_data = js_code.call('c', request_parameters)  # e.g. 0e5d10fb111f2afef6ac0a1776187e23
    # Add sign to the request payload
    request_parameters['sign'] = sign_data
    print('Data is being requested and processed…')
    # Request the data
    response = requests.post(url=request_url, json=request_parameters, headers=request_header)
    # Parse the JSON response
    data_json = response.json()
    # Create an empty list
    dewu_info = []
    # Extract the product list
    info_list = data_json['data']['list']
    for index in info_list:
        info_dict = {
            '标题': index['title'],
            '价格': index['price'] / 100,
            '图片网址': index['logoUrl']
        }
        # Append the row
        dewu_info.append(info_dict)
    # Convert to a DataFrame
    df = pd.DataFrame(dewu_info)
    # Export to an Excel spreadsheet
    df.to_excel(save_path, index=False)
    print(f'The data is already saved in {save_path}')


def download_image(url):
    rg_url = requests.get(url)
    # Check the response status code
    if rg_url.status_code == 200:
        # Create an image object
        image = Image.open(BytesIO(rg_url.content))
        # Normalize the image mode
        if image.mode != 'RGB':
            image = image.convert('RGB')
        # Resize the image
        return image.resize((150, 96))
    else:
        raise Exception(f"Unable to download images, status codes: {rg_url.status_code}")


def link_to_png(source_path, destination_path):
    # Load the Excel file
    wb = openpyxl.load_workbook(source_path)
    # Default to the first sheet
    sheet = wb.active
    # Adjust row heights and the column width
    for row in range(2, sheet.max_row + 1):
        sheet.row_dimensions[row].height = 75
    sheet.column_dimensions['C'].width = 20
    # Read each link, download the image, and insert it in place
    for row in range(2, sheet.max_row + 1):
        # Links start at row 2; column C (column = 3) holds the link, so read that cell
        link = sheet.cell(row=row, column=3).value
        # Clear the cell content
        sheet.cell(row=row, column=3).value = None
        # Skip empty links
        if link:
            # Send an HTTP request to download the image
            try:
                resized_image = download_image(link)
            except OSError:
                print(f"Failed to download image {link}")
                continue
            else:
                # Insert the resized image into the worksheet
                img_bytes = BytesIO()
                resized_image.save(img_bytes, format='PNG')  # save the image into memory
                img = xlImage(img_bytes)
                sheet.add_image(img, f'{get_column_letter(3)}{row}')  # place it at the target cell
    wb.save(destination_path)  # required
    wb.close()  # required


if __name__ == '__main__':
    j_path = './js_file.js'
    s_path = './dewu_link.xlsx'
    # Fetch the data and save it as an Excel file
    get_data_xlsx(j_path, s_path)
    d_path = './dewu_png.xlsx'
    print('Excel file is being processed…')
    link_to_png(s_path, d_path)
    print(f'The data is already saved in {d_path}')

Run output:

2. Xianyu / 闲鱼 (MD5 signing)

Python code:

# Import the HTTP request module
import requests
import csv
# Import the hashing module
import hashlib
import time


def get_sign(page):
    d_token = '...'  # d_token expires over time; fill in your own
    j = int(time.time() * 1000)
    h = '34839810'
    c_data = ('{"pageNumber": %d, '
              '"keyword": "python爬虫书籍", '
              '"fromFilter": false, '
              '"rowsPerPage": 30, '
              '"sortValue": "", '
              '"sortField": "", '
              '"customDistance": "", '
              '"gps": "", '
              '"propValueStr": {}, '
              '"customGps": "", '
              '"searchReqFromPage": "pcSearch", '
              '"extraFilterValue": "{}", '
              '"userPositionJson": "{}"}') % page
    result_str = d_token + "&" + str(j) + "&" + h + "&" + c_data
    # Create an MD5 hash object
    md_str = hashlib.md5()
    # Feed in the string to be hashed
    md_str.update(result_str.encode('utf-8'))
    # Produce the hex digest
    sign = md_str.hexdigest()
    return sign, j, c_data


def get_data_csv(file_path, head_name):
    # Imitate a browser (request headers)
    request_header = {
        'Referer': 'https://www.goofish.com/',
        # The cookie carries user identity and is often used to check login state
        # (present whether or not you are logged in); it expires, so fill in your own
        'Cookie': '...',
        # user-agent identifies the browser / device
        'User-Agent': '...'
    }
    # Request URL
    request_url = r'https://h5api.m.goofish.com/h5/mtop.taobao.idlemtopsearch.pc.search/1.0/'
    # Create the output file object
    f = open(file_path, mode='a', encoding='utf-8-sig', newline='')
    # Dict-based CSV writer
    cd = csv.DictWriter(f, fieldnames=head_name)
    cd.writeheader()
    # Loop over the pages
    num = 10
    for i in range(1, num + 1):
        print(f'Collecting page {i}…')
        # Compute the sign parameter, timestamp, and form data
        sign, j_time, c_data = get_sign(i)
        # Query parameters
        query_parameters = {
            'jsv': '2.7.2',
            'appKey': '34839810',
            't': str(j_time),
            'sign': sign,
            'v': '1.0',
            'type': 'originaljson',
            'accountSite': 'xianyu',
            'dataType': 'json',
            'timeout': '20000',
            'api': 'mtop.taobao.idlemtopsearch.pc.search',
            'sessionOption': 'AutoLoginOnly',
            'spm_cnt': 'a21ybx.search.0.0',
            'spm_pre': 'a21ybx.home.searchSuggest.1.4c053da6IXTxSx',
            'log_id': '4c053da6IXTxSx'
        }
        # Form data
        form_data = {"data": c_data}
        # Send the request
        response = requests.post(url=request_url, params=query_parameters, data=form_data, headers=request_header)
        # Parse the JSON response into a dict
        data_json = response.json()
        # Extract the list of product entries
        info_list = data_json['data']['resultList']
        # Iterate over the list elements
        for index in info_list:
            # Parse the seller's nickname
            nick_name = '未知'
            if 'userNickName' in index['data']['item']['main']['exContent']:
                nick_name = index['data']['item']['main']['exContent']['userNickName']
            # Assemble the price
            price_list = index['data']['item']['main']['exContent']['price']
            price = ''
            for p in price_list:
                price += p['text']
            # Build the detail-page link
            item_id = index['data']['item']['main']['exContent']['itemId']
            link = f'https://www.goofish.com/item?id={item_id}'
            temporarily_dict = {
                '标题': index['data']['item']['main']['exContent']['title'],
                '地区': index['data']['item']['main']['exContent']['area'],
                '售价': price,
                '用户名': nick_name,
                '详情页链接': link
            }
            cd.writerow(temporarily_dict)
    f.close()


if __name__ == '__main__':
    f_path = './fish_python.csv'
    h_name = ['标题', '地区', '售价', '用户名', '详情页链接']
    get_data_csv(f_path, h_name)
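
The signing scheme itself is a plain MD5 digest over token & timestamp & appKey & form data, so the recipe can be checked in isolation; a standalone sketch with dummy values:

# Standalone check of the sign recipe from get_sign(); every value here is a dummy.
import hashlib

token = 'dummy_token'        # the real d_token is time-limited (see get_sign above)
timestamp = '1700000000000'  # millisecond timestamp, int(time.time() * 1000)
app_key = '34839810'
data = '{"pageNumber": 1}'   # the JSON string sent as form data
raw = '&'.join([token, timestamp, app_key, data])
print(hashlib.md5(raw.encode('utf-8')).hexdigest())  # 32-char hex digest used as "sign"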

Run output:
