获取某厂招聘岗位信息
今天分享一个爬虫案例:爬取某厂招聘岗位信息数据。通过这个程序可以学习 pymysql 的使用:在 PyCharm 中运行程序获取数据,并将数据导入 MySQL 数据库中。
1 导入必要的包
import requests
import pymysql
2 主体代码
class Baidu(object):
    """Scrape job postings from a recruiting search endpoint into MySQL.

    Despite its name, the class posts to ``talent.alibaba.com``. Each posting's
    work locations, name and requirement text are written to the table named
    by ``TABLE_NAME``.
    """

    # Single source of truth for the table name, shared by create_table() and
    # save_data() so the two statements can never target different tables.
    TABLE_NAME = "ali_quarter_bill"
    # Number of postings requested per page.
    PAGE_SIZE = 19
    # Seconds to wait on the HTTP server before giving up on a request.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        """Open the MySQL connection and prepare the HTTP request settings."""
        # NOTE(review): credentials are hard-coded; move them to configuration
        # before using this outside a local experiment.
        self.db = pymysql.connect(host="127.0.0.1", user="root", password="88888888", db="test_db")
        self.cursor = self.db.cursor()
        self.url = 'https://talent.alibaba.com/position/search'
        self.headers = {
            # Placeholder: replace with a real cookie string before running.
            'cookie': '自己的cookie',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/547.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/547.36'
        }
        # NOTE(review): hard-coded CSRF token; presumably it has to match the
        # session the cookie belongs to -- verify.
        self.params = {
            "_csrf": "09d5fe8f-08a2-4d3c-a43f"
        }

    def get_data(self, page):
        """Request one page of search results and return the decoded JSON.

        Args:
            page: Page index sent to the server as ``pageIndex``.

        Returns:
            The JSON-decoded response body.

        Raises:
            requests.RequestException: On timeout, connection failure, or a
                non-2xx HTTP status.
        """
        data = {
            "channel": "group_official_site",
            "language": "zh",
            "batchId": "",
            "categories": "",
            "deptCodes": [],
            "key": "",
            "pageIndex": page,
            "pageSize": self.PAGE_SIZE,
            "regions": "",
            "subCategories": "",
        }
        response = requests.post(
            url=self.url,
            params=self.params,
            headers=self.headers,
            json=data,
            # Without a timeout a stalled server would hang the crawl forever.
            timeout=self.REQUEST_TIMEOUT,
        )
        # Fail loudly on an HTTP error instead of trying to parse an error page.
        response.raise_for_status()
        return response.json()

    def parse_data(self, response):
        """Extract the stored fields from one response and save each posting.

        Args:
            response: Decoded JSON of a search response; postings are read
                from ``response["content"]["datas"]``. A missing or null
                ``content``/``datas`` is treated as an empty page.
        """
        content = response.get("content") or {}
        data_list = content.get("datas") or []
        for node in data_list:
            # A null/missing location list becomes an empty string rather than
            # making str.join raise TypeError.
            workLocations = ','.join(node.get('workLocations') or [])
            name = node['name']
            # The requirement column is nullable, so a missing value is stored
            # as NULL.
            requirement = node.get('requirement')
            self.save_data(workLocations, name, requirement)

    def create_table(self):
        """Create the destination table if it does not exist yet.

        Failures are reported on stdout and not re-raised.
        """
        sql = f'''
        CREATE TABLE IF NOT EXISTS {self.TABLE_NAME}(
        id int primary key auto_increment not null,
        workLocations VARCHAR(255) NOT NULL,
        name VARCHAR(255) NOT NULL,
        requirement TEXT)
        '''
        try:
            self.cursor.execute(sql)
            print("CREATE TABLE SUCCESS.")
        except Exception as ex:
            print(f"CREATE TABLE FAILED,CASE:{ex}")

    def save_data(self, workLocations, name, requirement):
        """Insert one posting and commit; roll back and report on failure.

        Args:
            workLocations: Comma-joined work locations.
            name: Posting name.
            requirement: Requirement text, or None.
        """
        # The id column is omitted so AUTO_INCREMENT assigns it; the values
        # themselves are passed as bound parameters, never interpolated.
        sql = (
            f'INSERT INTO {self.TABLE_NAME}(workLocations, name, requirement) '
            'values(%s, %s, %s)'
        )
        try:
            self.cursor.execute(sql, (workLocations, name, requirement))
            # Commit per row so earlier rows survive a later failure.
            self.db.commit()
            print('数据插入成功...')
        except Exception as e:
            print(f'数据插入失败: {e}')
            # Undo the failed statement so the connection stays usable.
            self.db.rollback()

    def run(self, first_page=1, last_page=18):
        """Create the table, crawl the requested pages, then close the DB.

        Args:
            first_page: First page index to fetch (inclusive).
            last_page: Last page index to fetch (inclusive).
        """
        try:
            self.create_table()
            for page in range(first_page, last_page + 1):
                self.parse_data(self.get_data(page))
        finally:
            # Close the connection even when a request or parse step raises.
            self.db.close()
if __name__ == '__main__':
    # Script entry point: constructing the scraper opens the database
    # connection; run() then performs the crawl.
    scraper = Baidu()
    scraper.run()
结果: