爬虫逆向--Day16Day17--核心逆向案例3(拦截器关键字、路径关键字、请求堆栈、连续请求)
一、入口定位
入口定位-- 关键字搜索-- 方法关键字--最简单,最高效的 排第一-- encrypt 加密-- decrypt 解密-- JSON.stringify 给一个JS对象做Json字符串处理的把一个对象转换为Json字符串JSON.stringify({a:'1',b:"2"})'{"a":"1","b":"2"}'-- JSON.parse 把一个JS字符串转换为JS的对象JSON.parse('{"a":"1","b":"2"}'){a: '1', b: '2'}备注:当我们要搜索的key放到headers中时,我们可以搜索headers;如果搜索的key放到了请求体中,我们就需要搜索JSON.stringify,因为结构化数据没办法传输,只能传输字符串形式的。当我们发送完请求,服务器给我们返回数据以后,基本上返回的数据都是Json字符串,所以我们就需要把Json字符串转换为结构化对象,就需要用到JSON.parse
-- key关键字--最高频,用的最多 排第二 例如:portal_sign-- 拦截器关键字--比较有利的补充 排第三interceptors.request.use(func)interceptors.response.use(func)t.headers["p"+"o"+"r"+"t"+"a"+"l" +"-" +"s" +"i" +"g" +"n"] = f.getxxx(e),-- headers关键字 --偶尔会用 排第四-- 路径关键字 --偶尔会用 排第四请求入口定位,与响应无关-- 请求堆栈请求入口定位,与响应无关
二、拦截器关键字
因为很多接口可能都需要相同的加密/解密操作,所以前端开发人员就把相同功能的代码抽取出来,集中放到了拦截器中,从而降低了代码冗余。
很多中小网站都不会针对每个接口写一个独立的加密 / 解密代码,通常都会让很多接口使用相同的加密 / 解密方式,并且为了减少代码冗余,实现代码的高内聚、低耦合,都会把共用的代码放到拦截器中。在拦截器关键字搜索中,我们通常在加密的情况下搜索【interceptors.request.use】,在解密的情况下搜索【interceptors.response.use】,所以通过搜索得出下图
三、路径关键字
3.1、根据路径关键字定位参数
正则找单词边界进行定位
3.2、一层一层,深入定位请求头
四、请求堆栈
补充内容:断点扩展
普通断点:我们平时使用的断点,只要是走到这里就卡住,只要执行到就卡住
条件断点:点击修改断点,修改为条件断点
日志断点
XHR断点
4.1、条件断点:当在什么时间卡住
4.2、XHR断点,等价于条件断点中的includes 建议用这个
五、Day13&Day14作业案例二
破解网站:https://www.swhysc.com/swhysc/news/company
5.1、JS文件:01 swhy.js
const cryptoJs = require("crypto-js");

/**
 * Decrypt a response body with AES in ECB mode and PKCS#7 padding.
 *
 * The key is the fixed string "rewin-swhysc1234", parsed as UTF-8 bytes.
 * ECB mode takes no IV, so the key alone is enough.
 *
 * @param {string} e - Ciphertext, handed unchanged to CryptoJS.AES.decrypt.
 * @returns {string} The decrypted bytes decoded as a UTF-8 string.
 */
function decrypt_data(e) {
    const key = cryptoJs.enc.Utf8.parse("rewin-swhysc1234");
    const decrypted = cryptoJs.AES.decrypt(e, key, {
        mode: cryptoJs.mode.ECB,
        padding: cryptoJs.pad.Pkcs7,
    });
    return cryptoJs.enc.Utf8.stringify(decrypted).toString();
}

// Smoke test
/*
 Step 1: after copying the JS code out of the site, run it once as a test.
 Step 2: it will fail, reporting that `u.a` cannot be found.
 Step 3: npm-install the third-party library crypto-js into the matching
         folder, import it, then use Ctrl+R (find and replace) to swap every
         `u.a` for the imported `cryptoJs`.
 Note: ECB mode needs no IV, only a key. The key should be fixed; paging
       through the site several times confirms the cipher is not CBC.
*/
// Declared explicitly: a bare `data = ""` creates an implicit global and
// throws a ReferenceError in strict mode.
const data = "";
console.log(decrypt_data(data));
5.2、Python文件:02 swhy.py
import execjs
import requests

# Cookies captured from a browser session on www.swhysc.com.
cookies = {
    'Hm_lvt_553ce4fa7b2bd3ea6d85c1fb6b901c6c': '1755603688',
    'HMACCOUNT': '1D88C5C5B0786DD8',
    'zh_choose': 's',
    'sajssdk_2015_cross_new_user': '1',
    'sensorsdata2015jssdkcross': '%7B%22distinct_id%22%3A%22198c2226013cf0-0c2b796ee5f188-26001151-2073600-198c2226014af4%22%2C%22first_id%22%3A%22%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfY29va2llX2lkIjoiMTk4YzIyMjYwMTNjZjAtMGMyYjc5NmVlNWYxODgtMjYwMDExNTEtMjA3MzYwMC0xOThjMjIyNjAxNGFmNCJ9%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%22%2C%22value%22%3A%22%22%7D%2C%22%24device_id%22%3A%22198c2226013cf0-0c2b796ee5f188-26001151-2073600-198c2226014af4%22%7D',
    'Hm_lpvt_553ce4fa7b2bd3ea6d85c1fb6b901c6c': '1755603826',
}

# Request headers copied from the browser. The Cookie header is not repeated
# here because the same values are sent through the `cookies` argument.
headers = {
    'Accept': 'application/json, text/plain, */*',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'keep-alive',
    'Referer': 'https://www.swhysc.com/swhysc/news/company',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36',
    'Xdemeter': '{"DeviceType":"PW"}',
    'sec-ch-ua': '"Chromium";v="128", "Not;A=Brand";v="24", "Google Chrome";v="128"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}

# Query string of the list endpoint; `pageNum` selects the page.
params = {
    'topFlag': '3',
    'pageSize': '10',
    'status': '2',
    'pageNum': '2',
    'channelId': '00010002000100030001',
}

# 1. The base request code was generated with https://curlconverter.com/.
# A timeout is set because requests waits indefinitely without one.
response = requests.get(
    'https://www.swhysc.com/swhy/service/wscms/v1/cms/infobaselist',
    params=params,
    cookies=cookies,
    headers=headers,
    timeout=10,
)
# Stop on an HTTP error instead of passing an error page to the decryptor.
response.raise_for_status()

# 2. Print the response to check whether it is still encrypted.
# print(response.text)  # returns the encrypted payload, so the request works

# Read the whole JS source file.
with open("01 swhy.js", encoding="utf-8") as f:
    js_code = f.read()

# Compile the JS source into an execjs context.
js_compile = execjs.compile(js_code)

# Call decrypt_data from the compiled JS on the encrypted response body.
data = js_compile.call("decrypt_data", response.text)
print(data)
六、案例三:清华大学排名
地址链接:https://www.shanghairanking.cn/institution/tsinghua-university 爬取数据地址链接:https://www.shanghairanking.cn/api/v2010/univ_comm/univ/tsinghua-university
现在已经把加密的authorization获取到了,所以我们就需要在Python代码中调用该JS代码,把生成的加密authorization放到headers中即可
代码升级:可以获取到任意学校的数据
并且时间戳也完全符合JS代码中的逻辑进行替换
6.1、JS代码文件:04 清华大学排名.js
const cryptoJs = require("crypto-js");

// Prefix stripped from the request path before it is signed.
const v = "/api";

// Masked code points; m() XORs them with a caller-supplied key to get a string.
const f = {
    arr0: [161, 65, 7, 6, 94, 210, 25, 42, 44, 89, 27, 57, 139, 56, 189, 28, 73, 107, 165, 33, 137, 63, 177, 185, 161, 91, 82, 130, 147, 159, 62, 45, 62, 141, 0, 60],
};

/**
 * XOR every entry of f.arr0 with the entry of `t` at the mirrored index
 * (first with last, second with second-to-last, ...) and build a string
 * from the resulting code points.
 *
 * @param {number[]} t - Key array; read at indices 0 .. f.arr0.length - 1.
 * @returns {string} String built from the XOR-ed code points.
 */
const m = function (t) {
    const size = f.arr0.length;
    const codePoints = f.arr0.map((value, i) => value ^ t[size - 1 - i]);
    return String.fromCodePoint(...codePoints);
};

/**
 * Build the Base64 authorization token for one request.
 *
 * The token is Base64("3:" + SHA256("3#" + secret + "#" + METHOD + " " +
 * path + " " + query + "#" + time) + ":" + time), where `secret` is the
 * string decoded by m() and `time` is `o` plus the current local clock.
 *
 * @param {string} t - HTTP method; upper-cased before signing.
 * @param {string} e - Request path; `base` and then "/api" are removed from it.
 * @param {Object} n - Query parameters; serialized as sorted `key=value`
 *     pairs joined by "&". A non-object yields an empty string.
 * @param {number} o - Offset in milliseconds added to the local clock.
 * @param {string} base - Prefix removed from `e` (first occurrence only).
 * @returns {string} Base64-encoded token.
 */
const x = function (t, e, n, o, base) {
    const secret = m([10, 52, 187, 12, 28, 14, 168, 164, 183, 51, 56, 145, 148, 134, 12, 190, 64, 136, 88, 112, 36, 137, 21, 191, 13, 42, 96, 1, 78, 46, 183, 111, 55, 49, 118, 151]);

    const query = (function (params) {
        if (!(params && params instanceof Object)) {
            return "";
        }
        return Object.keys(params)
            .sort()
            .map((key) => key + "=" + params[key])
            .join("&");
    })(n);

    const path = e.replace(base, "").replace("/api", "");
    const requestLine = t.toUpperCase() + " " + path + " " + query;
    const timestamp = o + new Date().getTime();
    const payload = "3#" + secret + "#" + requestLine + "#" + timestamp;
    const token = "3:" + cryptoJs.SHA256(payload) + ":" + timestamp;

    return cryptoJs.enc.Base64.stringify(cryptoJs.enc.Utf8.parse(token));
};

/**
 * Compute the `authorization` header value for a school's detail endpoint.
 *
 * @param {number} timer - Reference timestamp in milliseconds. Its offset
 *     from the local clock is passed to x(), so the signed time follows
 *     `timer` rather than the local clock.
 * @param {string} school - School slug appended to
 *     "/api/v2010/univ_comm/univ/", e.g. "tsinghua-university".
 * @returns {string} Base64-encoded authorization token.
 */
function get_authorization(timer, school) {
    const method = "get";
    // The signature depends on the school-specific path.
    const url = "/api/v2010/univ_comm/univ/" + school;
    const query = {};
    const offset = timer - new Date().getTime();
    return x(method, url, query, offset, v);
}

// Smoke test with real arguments. A bare get_authorization() call would sign
// the path ".../undefined" with a NaN timestamp.
console.log(get_authorization(Date.now(), "tsinghua-university"));
6.2、Python代码文件:04 清华大学排名.py
import requests
import execjs

# Cookies captured from a browser session on www.shanghairanking.cn.
cookies = {
    'Hm_lvt_af1fda4748dacbd3ee2e3a69c3496570': '1755675277',
    'HMACCOUNT': '0AEF3215315FCD60',
    '_clck': '1tsp5b3%5E2%5Efym%5E0%5E2058',
    'Hm_lpvt_af1fda4748dacbd3ee2e3a69c3496570': '1755675292',
    '_clsk': 'f3dj7u%5E1755675292420%5E2%5E1%5Ej.clarity.ms%2Fcollect',
}

# Base request headers copied from the browser. The `authorization` header is
# computed per request in main(); cookies are sent via the `cookies` argument.
headers = {
    'accept': 'application/json, text/plain, */*',
    'accept-language': 'zh-CN,zh;q=0.9',
    'priority': 'u=1, i',
    'referer': 'https://www.shanghairanking.cn/institution/tsinghua-university',
    'sec-ch-ua': '"Chromium";v="128", "Not;A=Brand";v="24", "Google Chrome";v="128"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36',
}


def get_timestamp():
    """Fetch the server timestamp used to sign the next request.

    Returns:
        The value of the ``data`` field of the JSON response, or ``None``
        if the field is absent.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    # (1) Ask the server for its current timestamp.
    response = requests.get(
        'https://www.shanghairanking.cn/api/pub/v1/ms',
        cookies=cookies,
        headers=headers,
        timeout=10,
    )
    # Stop on an HTTP error rather than signing with a missing timestamp.
    response.raise_for_status()
    print("time:::", response.text)  # time::: {"code":200,"msg":"success","data":1755684227463}
    return response.json().get("data")  # return the `data` field of the response


def main():
    """Fetch and print the detail data of one school.

    Gets the server timestamp, computes the ``authorization`` header by
    calling ``get_authorization`` in the JS file, then requests the
    school's endpoint and prints the response body.
    """
    # Slug of the school to fetch.
    school = "university-of-jinan"

    # Request the server timestamp.
    timer = get_timestamp()

    # Read the JS source; the `with` block closes the file handle.
    with open("04 清华大学排名.js", encoding="utf-8") as js_file:
        js_code = js_file.read()

    # Compute the encrypted authorization value via the JS code.
    authorization = execjs.compile(js_code).call("get_authorization", timer, school)
    print("authorization:::", authorization)  # obtained successfully

    # Use a per-request copy so the module-level headers stay unmodified.
    request_headers = {**headers, "authorization": authorization}

    response = requests.get(
        f'https://www.shanghairanking.cn/api/v2010/univ_comm/univ/{school}',
        cookies=cookies,
        headers=request_headers,
        timeout=10,
    )
    print(response.text)


main()