淘宝商品信息定向爬虫
先贴代码,后看解析:
#淘宝商品信息定向爬虫
import requests
import re
#获得页面
def getHTMLText(url):
    """Fetch ``url`` with browser-like headers and return the page text.

    The request carries a fixed set of headers (including a logged-in
    session cookie) and query parameters that were captured from a real
    browser session.

    Args:
        url: Absolute URL of the page to download.

    Returns:
        The decoded response body, or an empty string if the request
        failed (connection error, timeout, or a non-2xx status code).
    """
    # NOTE(security): the 'cookie' value below is a captured login session.
    # It expires and identifies a specific account -- replace it with your
    # own captured value and do not publish it.
    headers = {
        'authority': 'i.taobao.com',
        'pragma': 'no-cache',
        'cache-control': 'no-cache',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36',
        'sec-fetch-user': '?1',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-site',
        'sec-fetch-mode': 'navigate',
        'referer': 'https://login.taobao.com/member/login.jhtml?from=taobaoindex&f=top&style=&sub=true&redirect_url=https%3A%2F%2Fi.taobao.com%2Fmy_taobao.htm%3Fspm%3Da21bo.2017.201864-1.1.2e8e11d9vo2Ast%26ad_id%3D%26am_id%3D%26cm_id%3D%26pm_id%3D1501036000a02c5c3739',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cookie': 't=6a14d922a5ccdf410cf61b8a7d676cff; cna=dGmbFhCoQXcCAXAgYmoPXQLr; lgc=%5Cu8FC7%5Cu5BA2ljd; tracknick=%5Cu8FC7%5Cu5BA2ljd; tg=0; mt=ci=2_1; thw=cn; v=0; cookie2=110991b8eefb85d5f8c765d284e787cd; _tb_token_=e3eaee3da6fe4; _samesite_flag_=true; dnk=%5Cu8FC7%5Cu5BA2ljd; enc=1B3F381ziwGsMftxhP8Fe6csX9dar6plw4P71IaQJ%2BwoL4FaQtO2LaG5xMP476EEK8tcF%2FsHesoU4T4SqIqCRw%3D%3D; hng=CN%7Czh-CN%7CCNY%7C156; SL_GWPT_Show_Hide_tmp=1; SL_wptGlobTipTmp=1; _mw_us_time_=1579680174094; unb=2567573738; uc1=lng=zh_CN&cookie16=W5iHLLyFPlMGbLDwA%2BdvAGZqLg%3D%3D&existShop=false&cookie14=UoTblAGu7t2ZtA%3D%3D&pas=0&cookie21=Vq8l%2BKCLjhS4UhJVbhgU&tag=8&cookie15=U%2BGCWk%2F75gdr5Q%3D%3D; uc3=nk2=2nLNM%2FFIuw%3D%3D&vt3=F8dBxdrOu2PtC2x4eRY%3D&lg2=UtASsssmOIJ0bQ%3D%3D&id2=UU20sr67TFYA3A%3D%3D; csg=806ca7d3; cookie17=UU20sr67TFYA3A%3D%3D; skt=aaab3aae58e61948; existShop=MTU3OTY4MDE5Nw%3D%3D; uc4=nk4=0%402EiIau6LxxDAk%2BCx7wx2HFFW&id4=0%40U2%2Fz99G3B5JVS0pXyBxPeNKUyd5l; _cc_=WqG3DMC9EA%3D%3D; _l_g_=Ug%3D%3D; sg=d86; _nk_=%5Cu8FC7%5Cu5BA2ljd; cookie1=B0avTxixm8wz3r%2FxbcUWzku9pi7fZ0YpXHusB8qx7OA%3D; isg=BCcnChNwNRZEd7GUQmPrlk1KtlvxrPuOzgwha_mUQ7bd6EeqAXyL3mXqDuj2G9MG; l=cBQO09_mQaXsIxtsBOCanurza77OSIRYYuPzaNbMi_5dE6Ts_e7Oocn__F96VjWd9NLB43ral1J9-etkZaA4-aR8E5vP.',
    }
    params = (
        ('spm', 'a21bo.2017.201864-1.1.2e8e11d9vo2Ast'),
        ('ad_id', ''),
        ('am_id', ''),
        ('cm_id', ''),
        ('pm_id', '1501036000a02c5c3739'),
        ('nekot', 'uf2/zWxqZA==1579680197588'),
    )
    try:
        r = requests.get(url, timeout=30, headers=headers, params=params)
        r.raise_for_status()  # raises HTTPError when the status is not 2xx
    except requests.RequestException:
        # Only network/HTTP failures are treated as "no page"; anything else
        # (KeyboardInterrupt, programming errors) is allowed to propagate.
        return ""
    # Let requests guess the charset from the body rather than trusting
    # the response headers.
    r.encoding = r.apparent_encoding
    return r.text
#关键:解析每一个获得的页面
def parsePage(ilt, html):
    """Extract price/title pairs from a search-result page into ``ilt``.

    The page embeds its item data as JSON text; this scans for the
    ``"view_price"`` and ``"raw_title"`` fields and pairs them up in the
    order they appear.

    Args:
        ilt: Result list, mutated in place. One ``[price, title]`` entry
            (both strings) is appended per item found.
        html: Page source as returned by ``getHTMLText``. An empty or
            non-string value yields no items.

    Returns:
        None.
    """
    import json  # local import so this block does not depend on file-level imports

    if not isinstance(html, str) or not html:
        return

    # Capture groups pull out just the values, so there is no need to
    # split on ':' (which broke on titles containing a colon) or to
    # eval() text that came from the network.
    prices = re.findall(r'"view_price":"([\d.]*)"', html)
    # The title pattern tolerates backslash escapes such as \" inside the value.
    raw_titles = re.findall(r'"raw_title":"((?:[^"\\]|\\.)*)"', html)

    # zip() stops at the shorter list, so a page with mismatched counts
    # still yields every complete pair instead of raising IndexError.
    for price, raw_title in zip(prices, raw_titles):
        try:
            # Decode JSON string escapes (\uXXXX, \", \\ ...) safely.
            title = json.loads('"' + raw_title + '"')
        except ValueError:
            # Not a valid JSON string body; keep the text as scraped.
            title = raw_title
        ilt.append([price, title])
#输出商品信息
def printGoodlist(ilt):
    """Print the collected items as a tab-separated table.

    A header row is printed first, followed by one row per entry with a
    1-based running number, the entry's first element (price) and its
    second element (title).

    Args:
        ilt: Sequence of entries, each indexable at positions 0 and 1.
    """
    row_format = "{:4}\t{:8}\t{:16}"
    header = row_format.format("序号", "价格", "商品名称")
    print(header)
    for number, entry in enumerate(ilt, start=1):
        row = row_format.format(number, entry[0], entry[1])
        print(row)
def main(goods='书包', depth=2):
    """Crawl Taobao search results for a keyword and print them as a table.

    Args:
        goods: Search keyword. Defaults to '书包'.
        depth: Number of result pages to fetch. Defaults to 2.
    """
    from urllib.parse import quote  # local import keeps this block self-contained

    page_size = 44  # the 's' offset advances by 44 for each result page

    # Percent-encode the keyword so characters such as '&' or '#' cannot
    # corrupt the query string.
    start_url = 'https://s.taobao.com/search?q=' + quote(goods)
    infoList = []
    for page in range(depth):
        # The trailing 's' parameter selects the page via its item offset.
        url = start_url + '&s=' + str(page_size * page)
        try:
            html = getHTMLText(url)
            parsePage(infoList, html)
        except Exception:
            # Best effort: a failing page must not abort the remaining
            # pages. Ctrl-C and SystemExit still propagate.
            continue
    printGoodlist(infoList)
# Start the crawl only when this file is run as a script, so that importing
# the module does not fire network requests.
if __name__ == "__main__":
    main()
代码解析:
由于淘宝具有反爬虫机制,所以在爬取商品信息时需要注意以下几点:
1.首先打开淘宝网页,然后登录你的淘宝账号
2.按下F12查看控制台(我使用的是Chrome浏览器),点击Network->All,刷新页面,点击下面Name列的第一行,右键Copy->Copy as cURL(bash)
3.进入以下网站:https://curl.trillworks.com/,将刚才copy的内容粘贴到左边的框,在下面的按钮中选中python,右边的框中就会出现你需要的headers和params,用它们替换上文代码中相应的部分即可。