项目分析
- 确定获取数据(‘标题’, ‘来源’, ‘价格’)
- 明确采集的目标地址:wayfair
- 保存数据的格式csv
- 开发环境python3.7,Windows10
- 开发工具pycharm
- 所需工具包requests,csv,lxml
项目解析
-
数据抓包
-
xpath提取数据
-
使用xpath语法提取数据时,结果有时会和网页源代码有出入
原因:
- 网页渲染数据和源代码不一致
- 数据是动态提交
-
只能获取12个数据
-
将网址直接修改为每页展示12条数据的形式(别问为什么不直接解析,问就是懒)
-
https://www.wayfair.com/furniture/sb0/sofas-c413892.html?itemsperpage=12&curpage=2
源码展示
python入门学习资料获取,学习沟通裙:731685275
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author : BaiChuan
# @File : 数据采集.py
import requests
from lxml import etree
import csv
# Listing page to scrape; the query string asks for 12 items per page, page 2.
url = 'https://www.wayfair.com/furniture/sb0/sofas-c413892.html?itemsperpage=12&curpage=2'

# Request headers sent with the page request.
# NOTE(review): the cookie looks like a captured browser session value and has
# presumably expired -- refresh it before running.
headers = {
    'cookie': 'CSNUtId=23e17d3a-6076-9566-352a-432dc55ee602; ExCSNUtId=23e17d3a-6076-9566-352a-432dc55ee602; vid=23e17d3a-6076-9566-352a-432dc55ee602; SFSID=404051738ada9a0c684f43245f4fe7a7; canary=0; WFDC=DSM; serverUAInfo=%7B%22browser%22%3A%22Google%20Chrome%22%2C%22browserVersion%22%3A89.04389114%2C%22OS%22%3A%22Windows%22%2C%22OSVersion%22%3A%2210%22%2C%22isMobile%22%3Afalse%2C%22isTablet%22%3Afalse%2C%22isTouch%22%3Afalse%7D; _pxhd=a8eb65dfb04a827ef94744fbffb32d7a6a5a3cccdd64e37013f92f53bcf05891:7f4eb201-9cf0-11eb-ae26-fb9eaee6d2f3; CSNPersist=page_of_visit%3D2; CSN=g_countryCode%3DUS%26g_zip%3D67346%26CLVW%3D305; categoryId=45974; _pxvid=7f4eb201-9cf0-11eb-ae26-fb9eaee6d2f3; _px3=44cd92501c3ef2b4b422580a232bac226cf7091bcdae52768f4b376813d5e5af:vZGlSrQi/BrNr6MyNkDZs2IfFM/CGOu6gtEq5rbzbh3x3hUV2vZNni3S2gvgdYWXEXiP8rjmEClCs/+W6+Icyg==:1000:JfNA/ZCmtKEcqsxF6ni+pRxODERn3B94d8U+2l+m+kn5GXRAKADx6e751NnLATlskTLYCfL5msgiM3tNzJlJ+djvUaga/E3RMuDgpXHDacIQYyM+CH0VLIRRn7gkxECcCvyk1tZBAErHvSMwLthzHlSeRXHLtoG8inJTFZamyTM=',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36',
    'referer': 'https://www.wayfair.com/furniture/cat/furniture-c45974.html',
    'upgrade-insecure-requests': '1',
}

# CSV column names (title, source, price); also used as the row dict keys.
FIELDNAMES = ['标题', '来源', '价格']

# XPath selecting the text container of each product card in the grid.
ITEM_XPATH = '//div[@data-hb-id="Grid.Item"]/div/div/div/a/div[2]'

# Seconds to wait for the server before giving up instead of hanging forever.
REQUEST_TIMEOUT = 30


def first_text(values):
    """Return the first item of *values*, or '' when the list is empty."""
    return values[0] if values else ''


def format_price(parts):
    """Build the price string from the text nodes of the price element.

    Exactly two text nodes are concatenated; any other non-empty result is
    rendered as 'from ' plus the first node. An empty list yields '' rather
    than raising IndexError.
    """
    if not parts:
        return ''
    if len(parts) == 2:
        return parts[0] + parts[1]
    return 'from ' + parts[0]


def parse_item(div):
    """Extract one CSV row dict from a product-card element.

    *div* must expose an ``xpath`` method (an lxml element). Fields whose
    node is missing from the card are written as empty strings.
    """
    title = first_text(div.xpath('.//h2/text()'))
    brand = first_text(div.xpath('./div/p/text()'))
    source = "by" + brand if brand else ''
    # Evaluate the price XPath once and reuse the result.
    price = format_price(div.xpath('./div[3]//span[1]/text()'))
    return {'标题': title, '来源': source, '价格': price}


def write_rows(f, rows):
    """Append *rows* (dicts keyed by FIELDNAMES) to the open text file *f*.

    The header row is written only when the file is still empty, so repeated
    runs in append mode do not duplicate it.
    """
    writer = csv.DictWriter(f, fieldnames=FIELDNAMES)
    f.seek(0, 2)  # whence=2: jump to end of file so tell() reports its size
    if f.tell() == 0:
        writer.writeheader()
    writer.writerows(rows)


def main():
    """Fetch the listing page, parse every product card, append to sofa.csv."""
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    # Fail loudly on an error status instead of parsing an error page
    # into zero rows.
    response.raise_for_status()
    tree = etree.HTML(response.text.encode())
    items = tree.xpath(ITEM_XPATH)
    with open('sofa.csv', 'a', newline="", encoding='utf-8') as f:
        write_rows(f, (parse_item(div) for div in items))


if __name__ == '__main__':
    main()