#2020/8/20
# 淘宝商品比价
import requests
import re
def getHtmlText(url, referer="", cookie="", timeout=10):
    """Fetch ``url`` with browser-like headers and return the decoded body.

    The published snippet had its ``referer`` and ``cookie`` header values
    removed, which left the header dict syntactically invalid. They are now
    optional parameters: pass the values from your own browser session, or
    leave them empty to send the request without those two headers.

    Args:
        url: Address to request.
        referer: Value for the ``referer`` header; not sent when empty.
        cookie: Value for the ``cookie`` header; not sent when empty.
        timeout: Seconds handed to ``requests.get`` so a stalled connection
            cannot block forever.

    Returns:
        The response text, decoded with the encoding ``requests`` detects
        from the content, or ``""`` when the request fails or the server
        answers with an error status.
    """
    header = {
        'authority': 's.taobao.com',
        'cache-control': 'max-age=0',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36',
        'sec-fetch-dest': 'document',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '?1',
        'accept-language': 'zh-CN,zh;q=0.9',
    }
    # Only send the session-specific headers when the caller supplied them.
    if referer:
        header['referer'] = referer
    if cookie:
        header['cookie'] = cookie

    try:
        r = requests.get(url, headers=header, timeout=timeout)
        r.raise_for_status()
        # Use the encoding guessed from the body rather than the HTTP header.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        print("爬取失败")
        return ""
def parsePage(ilist, html):
    """Extract ``[title, price]`` pairs from page text and append them to ``ilist``.

    Prices are taken from ``"view_price":"<digits>.<digits>"`` fragments and
    titles from ``"raw_title":"<text>"`` fragments; the n-th title is paired
    with the n-th price. The number of prices found is printed.

    Args:
        ilist: List that receives one ``[title, price]`` entry per item,
            with ``price`` converted to ``float``.
        html: Page source to scan.

    Returns:
        None. ``ilist`` is modified in place.
    """
    try:
        prices = re.findall(r'"view_price":"(\d+\.\d*)"', html)
        titles = re.findall(r'"raw_title":"(.*?)"', html)
    except TypeError:
        # html was not text (for example None), so there is nothing to scan.
        print("解析出错")
        return

    print(len(prices))
    # float() instead of eval(): the text comes from a remote page and must
    # never be executed as code.
    for title, price in zip(titles, prices):
        ilist.append([title, float(price)])

    # Fewer titles than prices means some prices could not be paired.
    if len(titles) < len(prices):
        print("解析出错")
def printGoodsList(ilist, num, path=r"E:\Jobs\nlpproject\project1\马毛价格.txt"):
    """Print the first ``num`` goods as a table and write the same rows to a file.

    Args:
        ilist: Sequence of rows where ``row[0]`` is the title and ``row[1]``
            is the price.
        num: Maximum number of rows to output; values below 1 output only
            the header.
        path: Destination text file. Defaults to the location the script
            originally hard-coded; the file is overwritten.

    Returns:
        None.

    Raises:
        OSError: If ``path`` cannot be opened for writing.
    """
    separator = "====================================================================================================="
    tplt = "{0:<3}\t{1:<50}\t{2:>8}"

    print(separator)
    # The context manager closes the file even if a row fails to format;
    # explicit UTF-8 keeps the Chinese header writable on any platform.
    with open(path, "w", encoding="utf-8") as f:
        header = tplt.format("序号", "商品名称", "价格")
        f.write(header + "\n")
        print(header)
        for count, g in enumerate(ilist, start=1):
            if count > num:
                break  # nothing past the limit is ever output
            row = tplt.format(count, g[0], g[1])
            f.write(row + "\n")
            print(row)
    print(separator)
def main():
    """Scrape Taobao search results for one keyword and print/save the prices.

    Builds one search URL per result page, downloads it with
    ``getHtmlText``, collects ``[title, price]`` rows with ``parsePage`` and
    finally hands everything to ``printGoodsList``.
    """
    goods = "马毛"  # search keyword
    depth = 1  # number of result pages to fetch
    num = 5000  # maximum number of rows to print and save

    start_url = "https://s.taobao.com/search?spm=a230r.1.1998181369.d4919860.41105c12njESrB&q=" + goods
    # Extra query parameters must be joined with '&'; the previous '$' made
    # them part of the search keyword instead of separate parameters.
    extra_query = "&imgfile=&js=1&style=grid&initiative_id=staobaoz_20200820&ie=utf8&tab=mall&cps=yes&cat=50039094"

    infoList = []
    for i in range(depth):
        # 's' is the result offset; the step of 44 is taken from the earlier
        # URL variants of this script (presumably items per page - confirm).
        url = start_url + extra_query + "&s=" + str(44 * i)
        try:
            html = getHtmlText(url)
            parsePage(infoList, html)
        except Exception as err:
            # Keep going with the next page, but say what went wrong.
            print("第{}页处理失败: {}".format(i + 1, err))
            continue
    printGoodsList(infoList, num)
# Run the scraper only when this file is executed as a script, so importing
# the module does not trigger network requests or file writes.
if __name__ == "__main__":
    main()
# Crawler 1
# (Blog page footer: latest recommended article published 2022-11-01 19:26:14)