背景:
1、平时都是挨个点进去逐个收集,效率太低;
2、整体不复杂,分析页面后发现详情页的内容不是通过json接口返回的,而是以类似json的字符串形式内嵌在网页源码中,所以用到了正则提取;
上代码
import requests
from lxml import etree
import json
import re
import openpyxl
def createSheet():
    """Create a fresh workbook and write the header row to its active sheet.

    Returns:
        A ``(workbook, sheet)`` tuple: the new ``openpyxl.Workbook`` and its
        active worksheet, to which the column titles have been appended.
    """
    book = openpyxl.Workbook()
    first_sheet = book.active
    # Column titles: search category, company name, title, product name, price.
    first_sheet.append(['搜索品类', '公司名称', '标题', '产品名', '价格'])
    return book, first_sheet
def getSearchList(keyword, i):
    """Fetch the search result list for ``keyword`` and return its i-th offer.

    The search endpoint is requested on every call; only the entry at index
    ``i`` of ``module.offer.list`` in the JSON response is used.

    Args:
        keyword: Search term. It is percent-encoded before being inserted
            into the query string so that characters such as ``&``, ``#`` or
            spaces cannot corrupt the URL.
        i: Zero-based index of the offer to return.

    Returns:
        A ``(company, title, eurl)`` tuple read from the offer's ``company``,
        ``simpleSubject`` and ``eurl`` fields.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
        ValueError: if the response body is not valid JSON.
        KeyError: if the response lacks the expected keys.
        IndexError: if the offer list has no entry at index ``i``.
    """
    from urllib.parse import quote

    # NOTE(review): beginpage, salt and sign are hard-coded; presumably they
    # were copied from a browser session — confirm they stay valid for other
    # keywords.
    url = (
        'https://p4psearch.1688.com/hamlet/async/v1.json'
        '?beginpage=1&asyncreq=2&keywords=&keyword=' + quote(keyword, safe='')
        + '&sortType=&descendOrder=&province=&city=&priceStart=&priceEnd=&dis='
        '&ptid=hrb1413b6dddb01d'
        '&exp=pcSemWwClick%3AB%3Bqztf%3AD%3Blantu%3AA'
        '&cosite=&salt=17223176815639'
        '&sign=55ade859ecfeab5dce6b7396f3ddeef2'
        '&hmaTid=3&hmaQuery=graphDataQuery&pidClass=pc_list_336'
        '&cpx=cpc%2Cnature&api=pcSearch'
    )
    # Without a timeout a stalled connection would block the script forever.
    response = requests.get(url=url, timeout=10)
    response.raise_for_status()
    data_list = json.loads(response.text)
    # Resolve the nested path once instead of repeating it for every field.
    offer = data_list['module']['offer']['list'][i]
    return offer['company'], offer['simpleSubject'], offer['eurl']
def getProductDetail(eurl):
    """Download a product detail page and extract SKU names and prices.

    The page text is searched for the first fragment starting at
    ``"skuInfoMap`` and ending at the next ``}}``. Every ``{...}`` fragment
    inside it is treated as one product: it is split on commas, and the text
    after the first colon of the second and third pieces is taken as the name
    and the price respectively.
    NOTE(review): this relies on the field order used by the page (name
    second, price third) — confirm against a current detail page.

    Args:
        eurl: URL of the detail page.

    Returns:
        A ``(names, prices)`` tuple of two equally long lists of strings.
        No unquoting is performed, so any quote characters present in the
        page text are kept. Both lists are empty when the page contains no
        ``skuInfoMap`` fragment.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
    """
    names = []
    prices = []
    # Without a timeout a stalled connection would block the script forever.
    response = requests.get(url=eurl, timeout=10)
    response.raise_for_status()
    res = response.text
    # 1 - Locate the text that follows "skuInfoMap" in the page source.
    sku_blocks = re.findall(r'"skuInfoMap.*?}}', res)
    if not sku_blocks:
        # No SKU map on this page: report "no products" instead of crashing.
        return names, prices
    # 2 - Each {...} fragment inside that text describes one product.
    for item in re.findall('{.*?}', sku_blocks[0]):
        fields = item.split(',')
        if len(fields) < 3:
            # Fragment too short to hold both a name and a price field.
            continue
        # partition keeps everything after the first colon, so a value that
        # itself contains ':' is not truncated.
        _, name_sep, name = fields[1].partition(':')
        _, price_sep, price = fields[2].partition(':')
        if not (name_sep and price_sep):
            # One of the pieces is not a key:value pair; skip the fragment.
            continue
        print(name, price)
        names.append(name)
        prices.append(price)
    return names, prices
def dataToExcel(keyword, company, title, names, prices, ws):
    """Append one row per product name to the worksheet ``ws``.

    Every row has the form ``[keyword, company, title, name, price]``, where
    the price is read from ``prices`` at the same index as the name has in
    ``names``. All rows are assembled first and only then appended, in order.
    """
    common_cells = [keyword, company, title]
    rows = [
        common_cells + [product, prices[idx]]
        for idx, product in enumerate(names)
    ]
    for row in rows:
        ws.append(row)
# 1 - Create the Excel workbook that collects the results.
wb, ws = createSheet()
# 2 - Keyword to search for, and how many of the leading search results to process.
keyword = '麻辣烫'
total = 10
try:
    # 3 - For each result index, look up the company, title and detail-page URL.
    for i in range(total):
        company, title, eurl = getSearchList(keyword, i)
        # 4 - Open the detail page and collect the product names and prices.
        names, prices = getProductDetail(eurl)
        # 5 - Append the collected rows to the worksheet.
        dataToExcel(keyword, company, title, names, prices, ws)
finally:
    # 6 - Save the workbook even if an iteration raised, so the rows gathered
    # so far are not lost; the exception still propagates afterwards.
    wb.save('1688-' + keyword + '-前' + str(total) + '的推荐产品信息汇总.xlsx')
最后,使用以上代码时只需要修改keyword和total两个变量的值,也就是你要搜索的关键词和要获取的商品个数。