# 参考博客: https://blog.csdn.net/qq_26373925/article/details/101185979
import re
import csv
import requests
import time
class shopping_list():
    """Scrape Taobao search results for a keyword and save them to a CSV file.

    Each result page is fetched from ``SEARCH_URL``, the fields listed in
    ``KEYS`` are pulled out of the embedded JSON-like text with regular
    expressions, and one CSV row is written per item.

    Args:
        searchkey: Search keyword sent as the ``q`` query parameter.
        pageTotal: Number of result pages to fetch, starting at page 1.
    """

    SEARCH_URL = 'https://s.taobao.com/search'
    # Step applied to the ``s`` (offset) query parameter for each page.
    PAGE_SIZE = 44
    # Seconds to wait before every request.
    REQUEST_DELAY = 2
    # Seconds before a stalled request is abandoned instead of hanging forever.
    REQUEST_TIMEOUT = 10
    # Field names: used both as the CSV header and as the keys to extract.
    # Sample of the page source these are matched against:
    # "raw_title":"...","view_price":"9.90","view_fee":"0.00",
    # "item_loc":"...","view_sales":"...","comment_count":"",
    # "user_id":"4157761164","nick":"..."
    KEYS = ('raw_title', 'view_price', 'item_loc', 'view_sales', 'comment_count', 'nick')
    # NOTE(review): this is a hard-coded browser session cookie committed to
    # source; it is a credential and has probably expired. Consider loading it
    # from configuration instead.
    HEADERS = {
        'cookie': 'thw=cn; t=8615e43948e4f325a452fcfc75658295; enc=5aX3Fxp7gnjdnzYhyZgW5C8bsEB9iy%2Fm9FINNSxeLkSyzHw%2BW%2FQ75wyPT1B2d%2FuBnBL1M3%2Fesyz5gCF5olGU%2Fw%3D%3D; hng=CN%7Czh-CN%7CCNY%7C156; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0%26__ll%3D-1%26_ato%3D0; cna=pZzuFYcYfncCATo5S54w4cVz; miid=1475301187983023342; lgc=%5Cu8D60%5Cu6211%5Cu4E00%5Cu573A%5Cu7E41%5Cu534E%5Cu68A6; tracknick=%5Cu8D60%5Cu6211%5Cu4E00%5Cu573A%5Cu7E41%5Cu534E%5Cu68A6; _cc_=UtASsssmfA%3D%3D; tg=0; mt=ci=104_1; uc3=nk2=ty9RApAMboOj14GWEeg%3D&id2=UUGk2KGOvdSPAg%3D%3D&lg2=UtASsssmOIJ0bQ%3D%3D&vt3=F8dByuK6XCEdAazEr0o%3D; uc4=nk4=0%40tVx%2FqaVOleOvXlf%2BkJkZlRjcY1N55lu2bw%3D%3D&id4=0%40U2OT6E4rYowWmY6LDe8ez%2F%2FRBzTs; _m_h5_tk=abaf9db87e4b021b982ac24aa4a8008f_1569383153151; _m_h5_tk_enc=afefc5a7ad9c15ba310176c664e01f64; v=0; cookie2=10078b0643deba39a84cfd017ed07cdf; _tb_token_=5373b37ee10de; uc1=cookie14=UoTaEcfKm1vnRA%3D%3D; l=cBg9SfccqHIxBPbLBOCanurza77OSIRYYuPzaNbMi_5QV6T_-u_Ok6j8tF96VjWdOcTB4tm2-gv9-etkZwMy6uBKpdx1.; isg=BMPDNpg7mqWqPlYynWUrL58qUodtOFd6f31HOvWgHyKZtOPWfQjnyqEmLgRfD69y',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
    }

    def __init__(self, searchkey, pageTotal):
        self.searchkey = searchkey
        self.startPage = 1  # first page to fetch
        self.pageTotal = pageTotal  # number of pages to fetch
        self.pagecount = 0  # running count of rows written
        self.nowTime = time.strftime('%Y-%m-%d_%H-%M-%S', time.localtime())
        self.startTime = time.time()
        self.savefile = r'E:/{}购物清单_{}.csv'.format(self.searchkey, self.nowTime)

    def _extract_rows(self, html):
        """Return one list of field values per item found in *html*.

        Values are matched per key, in document order. Empty values are
        matched too (``[^"]*``) so that an item with e.g. an empty
        ``comment_count`` keeps its slot and later items do not shift into
        the wrong row. The number of rows is the number of matches for the
        first key; an empty or missing value is written as ``'null'``.
        """
        columns = [
            re.findall(r'"{}":"([^"]*)"'.format(key), html)
            for key in self.KEYS
        ]
        rows = []
        for index in range(len(columns[0])):
            rows.append([
                column[index] if index < len(column) and column[index] else 'null'
                for column in columns
            ])
        return rows

    def run(self):
        """Fetch every page, write the extracted rows to ``self.savefile``.

        The file is written through ``csv.writer`` so values containing
        commas or quotes are escaped, and encoded as UTF-8 with a BOM so the
        result does not depend on the platform's default codec.

        Raises:
            OSError: If the output file cannot be opened or written.
            requests.RequestException: If a page request fails or times out.
        """
        params = {'q': self.searchkey, 'ie': 'utf8'}  # q: the search keyword
        with open(self.savefile, 'w', encoding='utf-8-sig', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(self.KEYS)
            for page in range(self.startPage, self.pageTotal + 1):
                params['s'] = str((page - 1) * self.PAGE_SIZE)
                time.sleep(self.REQUEST_DELAY)
                response = requests.get(
                    self.SEARCH_URL,
                    params=params,
                    headers=self.HEADERS,
                    timeout=self.REQUEST_TIMEOUT,
                )
                response.encoding = 'utf8'
                rows = self._extract_rows(response.text)
                print('page{}: 正在写入数据...'.format(page))
                for number, row in enumerate(rows, start=1):
                    print('\r正在写入第{}条..'.format(number))
                    writer.writerow(row)
                    self.pagecount += 1
                print('page{}: 写入完成'.format(page))
        print('\n任务完成!! 页面总数: {} | 写入数据: {}条 | 用时: {:.2f}s'.format(self.pageTotal, self.pagecount, time.time()-self.startTime))
if __name__ == "__main__":
    # Scrape each (keyword, page count) job in order; a scraper is only
    # created once the previous one has finished running.
    for keyword, pages in (('零食', 3), ('女外套', 2)):
        shopping_list(keyword, pages).run()
# 运行结果: