最近好多小伙伴私信我要代码,因为是去年完成的,某通更改了部分结构,导致代码失效,最近花了一点时间,更新了代码!
回顾前文,已经说明某通的数据是js渲染过的,但是也存放在网页源码中,转换成了json格式,我们需要通过正则来提取。
三种思路:
1)selenium;对速度没要求,想方便一点,推荐,直接使用xpath提取数据,但是速度较慢,不适用于爬取量过大
2)requests;速度快,爬取量大,就是提取数据的时候比较麻烦,要分析结构
3)逆向就不推荐了,需要懂js,难度较高,有想法可以试着本地加载js渲染数据
- 读取链接
def read_url():
    """Read the crawl targets from url.txt.

    Returns a list of dicts, one per line of the file:
    ``{"total": <line count>, "index": <0-based line number>, "value": <raw line>}``.
    The raw line keeps its trailing newline on purpose — get_html() strips it.
    """
    with open('url.txt', 'r', encoding='UTF-8') as f:
        lines = f.readlines()
    total = len(lines)
    # enumerate() instead of the unidiomatic range(len(...)) index loop
    return [{"total": total, "index": i, "value": line}
            for i, line in enumerate(lines)]
- 请求网页(requests)
def get_html(addr_map):
    """Fetch one product page with requests.

    addr_map: one entry produced by read_url() (keys: "total", "index", "value").
    Returns the decoded page HTML as a str, or None if the request fails.
    NOTE(review): the Cookie below is a captured session and will expire —
    refresh it from the browser when requests start failing.
    """
    headers = {
        # random User-Agent per request to look less bot-like
        'User-agent': UserAgent().random,
        'Cookie':'ali_apache_id=33.50.162.23.1638768571497.305192.2; xman_f=PB+/WUW+7p99qf93WsvlgzlVbukHrCZcEDj0sZEPgwZaYUqchboZlsPaw28L1b4ZgCluYmNEUX+eN/ReyOUk4d7TWLmDrhCtnEkjUv0CdIfEgNim3RJvLg==; cna=VGE0GluNuAICAXd7IplItu6D; _fbp=fb.1.1646474001849.1574633962; _gcl_au=1.1.1066639008.1646474002; xman_t=rVenJ2rmNOEV5R56UUvR/uNqG9rNpJ/eM/3yVCjaGSBHv8IwjUXCEjC13GevhGtJ; account_v=1; XSRF-TOKEN=d842951d-f332-4e0a-9e67-531672d62101; acs_usuc_t=x_csrf=lfwhorq7aehg&acs_rt=f45123692e274fb6941134efaaa6864d; ali_apache_track=; ali_apache_tracktmp=; _bl_uid=LmlCF2XL0jd8vzedF1zCeg719I0s; intl_locale=fr_FR; aep_usuc_f=site=fra&province=&city=&c_tp=EUR&ups_d=1|1|1|1&ups_u_t=1665296464087&region=FR&b_locale=fr_FR&ae_u_p_s=2; _mle_tmp_enc0=Ey%2Fp8LswzxA3J47VsqxI%2B7%2FX5zfHsbjG5rixBvMobRtIo20TFSI87DZCIt0bQG0CUAftZb8p2osmppBPU3AhKboEPcPfFZVi%2BhQ7q37GJt%2BWgxK6Ou7Rrz6B8s3VCnmV; xman_us_f=x_locale=fr_FR&x_l=0&x_c_chg=0&x_as_i=%7B%22cookieCacheEffectTime%22%3A1650335012234%2C%22isCookieCache%22%3A%22Y%22%2C%22ms%22%3A%220%22%7D&acs_rt=6e6397a6dc0845089758e2830f6b7560; xlly_s=1; _gid=GA1.2.211531649.1650875286; _m_h5_tk=5e0f15c71147ed011c5c8eb8fedc8623_1650879697974; _m_h5_tk_enc=e664897138fa3b7cc4a1fa57d3ef8029; aep_history=keywords%5E%0Akeywords%09%0A%0Aproduct_selloffer%5E%0Aproduct_selloffer%0932952011209%094001354377511%0932952011209%091005004111444866%091005004146540365%091005002059594364%0933013223561%091005001843605941; JSESSIONID=145A322AFD9AB8BE0DA2F3A2343347E8; intl_common_forever=IRRci8dLj0TLqATBPMRTXPN3MdTD6O+7Heuhnug0no3deUKGitDlAQ==; tfstk=cOeRBO9CYZblFJiYb7C0YkHsTP2dZkbKl3gHpJ3xmYvfHqvdiXygB-NNNDgZyiC..; l=eBQNuxwRgnUoc8SQmOfwourza77OSIRAguPzaNbMiOCPOS1p55HPB6qrTNY9C3GVh62HR3Jfz-p8BeYBqI0tiitwgdTndvMmn; isg=BBQUx7m0CqFCt52mwVgJqgbm5VKGbThX1vbzWK71oB8imbTj1n0I58obmZEBYXCv; _gat=1; _ga_VED1YSGNC7=GS1.1.1650875286.36.1.1650878519.0; _ga=GA1.1.2143085281.1646474002'
    }
    try:
        # strip() handles the last line of url.txt even when it has no
        # trailing newline — the old regex "(.*?)\n" raised IndexError there
        address = addr_map['value'].strip()
        # timeout so one stalled request cannot hang the whole crawl
        res = requests.get(address, headers=headers, timeout=30).content.decode('utf8')
        print('已抓取到' + address)
        return res
    except Exception as e:
        # best-effort: log and return None; process_htmls() skips None pages
        print(e)
        return None
- 提取数据(正则表达式)
def process_htmls(htmls):
    """Extract product data from fetched AliExpress page sources.

    htmls: iterable of page sources as returned by get_html(); None entries
    (failed requests) are skipped. Returns a list of dicts, one per SKU
    variant, with the Chinese keys the CSV writer expects
    (产品标题, 标准描述, 主图, 评分, 评论数, 订单, 产品链接, 价格, SKU).
    """
    result_list = []
    for source in htmls:
        if not source:
            continue  # get_html() returned None for this URL
        if 'Page Not Found - Aliexpress.com' in source:
            continue  # dead link
        # the JS-rendered data blob sits on one "data: {...}" line
        state = re.findall('data: (.*)', source)
        if not state:
            continue  # layout changed / no data blob: skip instead of crashing
        # drop backslashes and quotes so the field regexes below stay simple
        states = re.sub(r'[\\"]', "", state[0])
        # --- SKU property section: variant names and their images ---
        sku_props = re.findall('skuPropertyValues:(.*)', states)
        prop_names = re.findall('propertyValueDefinitionName:(.*?),', sku_props[0])
        img_paths = re.findall('skuPropertyImagePath:(.*?),', sku_props[0])
        imgs = [dict(name=i, img=j) for i, j in zip(prop_names, img_paths)]
        # --- price section: one (skuAttr, amount) pair per SKU ---
        sku_value = re.findall('skuPriceList:(.*)', states)
        sku_attrs = re.findall('skuAttr:(.*?),', sku_value[0])
        amount = re.findall('skuActivityAmount:(.*?)}', sku_value[0])
        sku_st = set(i + '+' + a for i, a in zip(sku_attrs, amount))
        # --- page-level fields ---
        rating_value = re.findall("averageStar:(.+),aver", states)[0]   # rating
        rating_num = re.findall("totalValidNum:(.+),trial", states)[0]  # review count
        order_count = re.findall("tradeCount:(.+),trade", states)[0]    # orders
        title = re.findall("subject:(.*?),ti", states)[0]               # title
        product_id = re.findall('productId:(.+),root', states)[0]       # product ID
        url = 'https://fr.aliexpress.com/item/' + product_id + '.html'
        attr_names = re.findall('attrName:(.*?),', states)
        attr_values = re.findall('attrValue:(.*?),', states)
        pro_description = ','.join(
            i + ':' + j for i, j in zip(attr_names, attr_values))
        for ss in sku_st:
            sku = re.findall(r'#(.*?)\+', ss)[0]        # variant name
            price = re.findall('value:(.*)', ss)[0]     # price
            # default to '' — the old code left img unbound (or carrying the
            # previous iteration's value) when no property name matched
            img = ''
            for entry in imgs:
                if sku in entry['name']:
                    img = entry['img']  # last matching property wins, as before
            result_list.append({
                '产品标题': title,
                '标准描述': pro_description,
                '主图': img,
                '评分': rating_value,
                '评论数': rating_num,
                '订单': order_count,
                '产品链接': url,
                '价格': price,
                'SKU': sku,
            })
    return result_list
- 保存数据(csv)
# Persist the scraped rows to test.csv, writing a header row first.
# Column order must match the keys produced by process_htmls().
columns = ['产品标题', '标准描述', '主图', '评分', '评论数', '订单', '产品链接', '价格', 'SKU']
with open('test.csv', 'wt', encoding='utf-8', newline='') as output:
    writer = csv.DictWriter(output, fieldnames=columns)
    writer.writeheader()
    writer.writerows(results)
成果:
selenium就不讲解了,比较简单,xpath可以直接调用,没啥难点。有任何不懂可以私信。
完整项目已上传到资源,有需要的朋友可自行下载👇👇👇
各位的支持和认可就是我最大的动力!