欢迎来到尧图网

客户服务 关于我们

您的位置:首页 > 新闻 > 社会 > 爬虫案例学习6

爬虫案例学习6

2024/12/21 22:56:13 来源:https://blog.csdn.net/weixin_47617631/article/details/144550941  浏览:    关键词:爬虫案例学习6

获取淘宝商品数据2024-12-18

参考学习:
大佬博客
视频教程
通过搜索发现,数据是通过发送请求过来的,不是静态存在源代码的
在这里插入图片描述
所以我们需要请求这个接口获取数据:比如标题,价格,图片等信息
https://h5api.m.taobao.com/h5/mtop.relationrecommend.wirelessrecommend.recommend/2.0/
但是我们直接发请求,携带上参数,无法获取到数据,会返回非法请求的字样。
因为有个参数sign是加密的,我们需要逆向

逆向参数获取sign

sign参数:貌似是一些参数经过哈希加密算法之后生成的32位小写加密参数。
具体的需要查看对应的js
点击main.js
在这里插入图片描述
在这里插入图片描述
搜索 sign: 相关的代码,并进行分析
在这里插入图片描述
eT = eE(em.token + "&" + eC + "&" + eS + "&" + ep.data)这一行就是生成sign
在这里插入图片描述
点击断点,可以查看变量的值
不过这里推荐打断点的时机,先鼠标滚动到下面的页码处,再接着打断点,点击下一页,此时进入js源码的参数才是正确的。
如果不这样做,鼠标滚轮下滑也进入了断点,ep.data的值不是我们需要的,需要放行很多次。
在这里插入图片描述
在这里插入图片描述
切换到控制台,输出这些值,等一下在python代码中需要使用,这里先记录一下

eE(em.token + "&" + eC + "&" + eS + "&" + ep.data) # 返回值是sign的值
em.token
eC 
eS 
ep.data

在这里插入图片描述

获得的sign为 8a3593958c55ff4115e359745dc9a665,是由0-9、a-f组成的32位字符串,即MD5加密后的结果
我们需要在代码中生成sign

构建字符串MD5加密

# Build the string to hash: em.token + "&" + eC + "&" + eS + "&" + ep.data
# eC is the timestamp.
def getSign(eC):
    # First half of the function: only the inputs of the signature are set
    # up here; the hashing step is added in the following snippet.
    em = 'cbee62bc9b064d508514dd6eb1c6cebd'  # em holds the token
    eS = '12574478'
    # signParam is the "params" field of ep.data
    signParam = {
        "device": "HMA-AL00",
        "isBeta": "false",
        "grayHair": "false",
        "from": "nt_history",
        "brand": "HUAWEI",
        "info": "wifi",
        "index": "4",
        "rainbow": "",
        "schemaType": "auction",
        "elderHome": "false",
        "isEnterSrpSearch": "true",
        "newSearch": "false",
        "network": "wifi",
        "subtype": "",
        "hasPreposeFilter": "false",
        "prepositionVersion": "v2",
        "client_os": "Android",
        "gpsEnabled": "false",
        "searchDoorFrom": "srp",
        "debug_rerankNewOpenCard": "false",
        "homePageVersion": "v7",
        "searchElderHomeOpen": "false",
        "search_action": "initiative",
        "sugg": "_4_1",
        "sversion": "13.6",
        "style": "list",
        "ttid": "600000@taobao_pc_10.7.0",
        "needTabs": "true",
        "areaCode": "CN",
        "vm": "nw",
        "countryNum": "156",
        "m": "pc",
        "page": 2,
        "n": 48,
        "q": "%E8%A3%A4%E5%AD%90",
        "qSource": "url",
        "pageSource": "",
        "tab": "all",
        "pageSize": "48",
        "totalPage": "100",
        "totalResults": "137306",
        "sourceS": "0",
        "sort": "_coefp",
        "bcoffset": "-13",
        "ntoffset": "13",
        "filterTag": "",
        "service": "",
        "prop": "",
        "loc": "",
        "start_price": None,
        "end_price": None,
        "startPrice": None,
        "endPrice": None,
        "categoryp": "",
        "ha3Kvpairs": None,
        "couponFilter": 0,
        "myCNA": "4PjnHzPgIA0CARsm5jekDfQ+",
    }

在这里插入图片描述
json在线格式化
在这里插入图片描述
复制到python的函数的signParam字典中,将null值修改为None
接着继续完善getSign函数的MD5加密工作
import hashlib

 n = json.dumps(signParam)# print(json.dumps(json.dumps(signParam)))data = {"appId": "34385","params": n}# print(data)n_data = json.dumps(data).replace(" ", "")eC= "1734492057250" # 时间戳str = em + "&" + eC + "&" + eS + "&" + json.dumps(data).replace(" ","")# print(str)MD5 = hashlib.md5()MD5.update(str.encode("utf-8"))sign = MD5.hexdigest()return sign,n_data

调用函数,获取签名sign。上面的时间戳我是写死的静态值,可以删除,改为动态生成,
等一下在完整源码中会修改为动态的当前时间戳

# Current time in milliseconds, rendered as a decimal string.
date_time = "%d" % int(time.time() * 1000)
# getSign returns the signature together with the serialized payload.
sign, n = getSign(date_time)
# Example value of sign:
# f94586b665e0d865a20aa6d3acf708f3

有了sign,就可以发起请求,获取数据了,直接上完整源码
请求数据所在的api接口
https://h5api.m.taobao.com/h5/mtop.relationrecommend.wirelessrecommend.recommend/2.0/

完整源码

# 可以运行版
# 获取淘宝数据:https://s.taobao.com/
# 搜索键盘相关数据,会自动拦截登录页面(所以需要cookie)
import csv
import hashlib
import json
import re
import time
from pprint import pprint

import requests
# API endpoint that the search page requests its product data from.
url = (
    "https://h5api.m.taobao.com/h5/"
    "mtop.relationrecommend.wirelessrecommend.recommend/2.0/"
)

# Request headers; the "cookie" value is a placeholder ("your own cookie")
# that has to be replaced before running.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/131.0.0.0 Safari/537.36"
    ),
    "Referer": "https://s.taobao.com/",
    "cookie": "自己的cookie",
}

# Response observed when the request is rejected:
#   mtopjsonp6({"api":"mtop.relationrecommend.wirelessrecommend.recommend","data":{},"ret":["FAIL_SYS_ILLEGAL_ACCESS::非法请求"]})
# The sign parameter changes on every request, so no data comes back unless
# it is regenerated (the sign parameter has to be reverse-engineered).
# JavaScript original: eE(em.token + "&" + eC + "&" + eS + "&" + ep.data)
def getSign(eC, token='db1e1adce046132af55f1e37728ca39b', app_key='12574478'):
    """Return ``(sign, data)`` for one request.

    ``sign`` is the MD5 hex digest of ``token & eC & app_key & data`` and
    ``data`` is the compact JSON payload that was hashed.

    Args:
        eC: Timestamp as a string (it is concatenated into the hashed text).
        token: The ``em.token`` value; defaults to the value captured in the
            walkthrough.
        app_key: The ``eS`` value; defaults to the value captured in the
            walkthrough.

    Returns:
        A ``(sign, n_data)`` tuple of two strings.
    """
    signParam = {
        "device": "HMA-AL00",
        "isBeta": "false",
        "grayHair": "false",
        "from": "nt_history",
        "brand": "HUAWEI",
        "info": "wifi",
        "index": "4",
        "rainbow": "",
        "schemaType": "auction",
        "elderHome": "false",
        "isEnterSrpSearch": "true",
        "newSearch": "false",
        "network": "wifi",
        "subtype": "",
        "hasPreposeFilter": "false",
        "prepositionVersion": "v2",
        "client_os": "Android",
        "gpsEnabled": "false",
        "searchDoorFrom": "srp",
        "debug_rerankNewOpenCard": "false",
        "homePageVersion": "v7",
        "searchElderHomeOpen": "false",
        "search_action": "initiative",
        "sugg": "_4_1",
        "sversion": "13.6",
        "style": "list",
        "ttid": "600000@taobao_pc_10.7.0",
        "needTabs": "true",
        "areaCode": "CN",
        "vm": "nw",
        "countryNum": "156",
        "m": "pc",
        "page": 1,
        "n": 48,
        "q": "%E8%A3%A4%E5%AD%90",
        "qSource": "url",
        "pageSource": "",
        "tab": "all",
        "pageSize": "48",
        "totalPage": "100",
        "totalResults": "5000",
        "sourceS": "48",
        "sort": "_coefp",
        "bcoffset": "-26",
        "ntoffset": "0",
        "filterTag": "",
        "service": "",
        "prop": "",
        "loc": "",
        "start_price": None,
        "end_price": None,
        "startPrice": None,
        "endPrice": None,
        "categoryp": "",
        "ha3Kvpairs": None,
        "couponFilter": 0,
        "myCNA": "4PjnHzPgIA0CARsm5jekDfQ+",
    }
    # Compact separators yield the same text as dumping with the defaults and
    # deleting every space, but leave spaces inside values untouched.
    compact = (",", ":")
    n_data = json.dumps(
        {"appId": "34385", "params": json.dumps(signParam, separators=compact)},
        separators=compact,
    )
    raw = "&".join((token, eC, app_key, n_data))
    sign = hashlib.md5(raw.encode("utf-8")).hexdigest()
    return sign, n_data


# Millisecond timestamp; the same value is sent as the "t" request parameter.
date_time = str(int(time.time() * 1000))
def _extract_jsonp_payload(text):
    """Return the JSON text wrapped by a ``mtopjsonpN(...)`` callback."""
    # Only the callback's own closing parenthesis is dropped; any ")" inside
    # the JSON (for example in a product title) is kept.
    match = re.search(r'mtopjsonp\d+\((.*)\)', text, re.S)
    if match is None:
        raise ValueError("response is not a mtopjsonp callback: %r" % text[:200])
    return match.group(1)


def _strip_highlight(title):
    """Remove the highlight markup that wraps the matched keyword in a title."""
    return title.replace('<span class=H>', '').replace('</span>', '')


sign, n = getSign(eC=date_time)
print(sign)
params = {
    'jsv': '2.7.4',
    'appKey': '12574478',
    't': date_time,
    'sign': sign,
    'api': 'mtop.relationrecommend.wirelessrecommend.recommend',
    'v': '2.0',
    'timeout': '10000',
    'type': 'jsonp',
    'dataType': 'jsonp',
    'callback': 'mtopjsonp6',
    'data': n,
}
# A client-side timeout keeps the script from hanging on a stalled connection.
resp = requests.get(url, params=params, headers=headers, timeout=10)
html = resp.text

# Unwrap the JSONP callback and parse the payload.
info = _extract_jsonp_payload(html)
jsonData = json.loads(info)
items = (jsonData.get('data') or {}).get('itemsArray')
if items is None:
    # A rejected request comes back with an empty "data" object and the
    # reason in "ret"; report that instead of failing with a bare KeyError.
    raise RuntimeError("no itemsArray in response, ret=%r" % (jsonData.get('ret'),))

# Write one CSV row per item.
with open('taobao.csv', mode="w", newline='', encoding="utf-8") as f:
    writer = csv.writer(f)
    # Header row
    head = ['标题', '图片链接', '价格', '地区', '销量', '店铺']
    writer.writerow(head)
    for item in items:
        dit = {
            'title': _strip_highlight(item['title']),
            'img': item['pic_path'],
            'price': item['price'],
            'procity': item['procity'],
            'realSales': item['realSales'],
            'shopName': item['nick'],
        }
        writer.writerow(dit.values())
        print(dit)

在这里插入图片描述
在这里插入图片描述

注:需要获取其他数据

修改源码几个参数
url所在浏览器位置
在这里插入图片描述
改Referer和cookie
在这里插入图片描述

重写getSign函数的em值,eS值,signParam值
data中的appId也要修改
修改真正数据接口的参数:params
在这里插入图片描述
最后运行代码,即可获取数据
在这里插入图片描述

版权声明:

本网仅为发布的内容提供存储空间,不对发表、转载的内容提供任何形式的保证。凡本网注明“来源:XXX网络”的作品,均转载自其它媒体,著作权归作者所有,商业转载请联系作者获得授权,非商业转载请注明出处。

我们尊重并感谢每一位作者,均已注明文章来源和作者。如因作品内容、版权或其它问题,请及时与我们联系,联系邮箱:809451989@qq.com,投稿邮箱:809451989@qq.com