scrapy爬虫万变不离其宗,就几个步骤:
1、分析数据
2、创建爬虫框架
3、编写相应的代码
分析数据
今天要爬取的数据是“京东女装”商品的价格信息与评价数;由于小伙伴对女装特别感兴趣,就准备爬取前后一星期的价格与评论数来进行数据分析。话不多说上代码…额,搞错了,先分析数据:
1、对京东女装列表进行分析:
第一页:https://search.jd.com/Search?keyword=%E5%A5%B3%E8%A3%85&suggest=1.his.0.0&wq=%E5%A5%B3%E8%A3%85&page=1&s=1141&click=0
第二页:https://search.jd.com/Search?keyword=%E5%A5%B3%E8%A3%85&suggest=1.his.0.0&wq=%E5%A5%B3%E8%A3%85&page=2&s=1141&click=0
因为京东商品的第一页首先显示30条数据,往下拉再动态加载30条数据,所以第一页到第二页实际的page参数是1、3、5…形式的
上面是我分析后按1、2、3…递增的page页数,显示的是没有下拉时的页面,直接获取当前页的30条数据即可
2、获取女装列表的半个超链接:
检查元素查找相应的超链接:
3、获取单个商品超链接,部分代码如下:
4、进入详情页获取详细信息:
5、将数据传递到items:
6、将传递的字典转换成json文件:(最好在pipelines中进行转换)
7、注意:爬取的数据为空时,要设置相应的请求头:
8、因为转换后的json格式信息没有包含当天价格与评价数信息,所以再写一个python爬虫爬取当天价格与评论数;因为价格与评论数是动态变化的,那么要爬取动态的数据就要重新检查元素,选择相应的方法了
当检查网络数据时会发现点商品评价时:
点击进入会发现当前商品的所有评价数都以json格式显示:
此时分析当前连接:
https://sclub.jd.com/comment/skuProductPageComments.action?productId=5552018&score=0&sortType=5&page=0&pageSize=10 ;发现productId为商品的编号,那么只需改动商品编号就能找到对应的商品评价数信息
同理商品的价格参照:https://blog.csdn.net/wuhui2100/article/details/107124276获取
那么接下来
创建项目:
1、创建相应的爬虫项目名
代码部分
woman.py
# -*- coding: utf-8 -*-
import scrapy
from urllib.parse import quote
class WomanSpider(scrapy.Spider):
    """Crawl JD.com women's-clothing search results and scrape product attributes.

    Flow: ``parse`` builds the search-result page URLs, ``parse_woman_list``
    follows every product link found on a result page, and ``parse_woman_li``
    reads the attribute list of a product detail page into a dict item.

    The number of result pages may be supplied as a spider argument
    (``scrapy crawl woman -a pages=3``). When it is absent the spider asks for
    it on stdin, exactly as before.
    """

    name = 'woman'
    # Domain names only: an entry carrying a path and query string can never
    # match the host of a request.
    allowed_domains = ['jd.com']
    start_urls = ['https://search.jd.com/Search?keyword=女装&wq=女装&page=1']

    # Search term; it is percent-encoded before being placed in a URL.
    search_keyword = "女装"
    list_url_template = "https://search.jd.com/Search?keyword={kw}&wq={kw}&page={page}"
    # Attribute labels on the detail page that are copied into the item.
    attribute_names = frozenset([
        '商品名称', '商品编号', '商品毛重', '货号', '材质',
        '风格', '流行元素', '适用年龄', '上市时间', '图案',
    ])

    def parse(self, response):
        """Yield one request per search-result page.

        The body of ``response`` (the start URL) is not inspected; the method
        only decides how many result pages to request.

        :param response: response of the start URL (unused).
        :returns: generator of ``scrapy.Request`` objects for pages 1..N.
        """
        self.logger.info("开始爬虫")
        requested = getattr(self, "pages", None)
        if requested is None:
            # NOTE: input() blocks the whole crawler until it is answered;
            # pass ``-a pages=N`` for unattended runs.
            requested = input("请输入要爬取的页数:")
        try:
            page_count = int(requested)
        except (TypeError, ValueError):
            self.logger.error("Invalid page count %r; nothing to crawl", requested)
            return

        # Non-ASCII keywords must be percent-encoded inside the URL.
        encoded = quote(self.search_keyword, encoding="utf-8")
        for page in range(1, page_count + 1):
            href = self.list_url_template.format(kw=encoded, page=page)
            self.logger.debug("list page: %s", href)
            # dont_filter=True keeps the request from being dropped by
            # Scrapy's request filtering (page 1 equals the start URL).
            yield scrapy.Request(
                url=href,
                callback=self.parse_woman_list,
                dont_filter=True,
            )

    def parse_woman_list(self, response):
        """Follow every product link found on a search-result page.

        :param response: response of a search-result page.
        :returns: generator of ``scrapy.Request`` objects for detail pages.
        """
        self.logger.info("进入京东女装列表页")
        for product_list in response.xpath("//*[@id='J_goodsList']/ul"):
            hrefs = product_list.xpath(".//li/div/div[4]/a/@href").extract()
            self.logger.debug("found %d product links", len(hrefs))
            for href in hrefs:
                # urljoin() completes protocol-relative links ("//item...")
                # and leaves links that are already absolute untouched.
                detail_url = response.urljoin(href)
                self.logger.debug("detail page: %s", detail_url)
                yield scrapy.Request(
                    url=detail_url,
                    callback=self.parse_woman_li,
                    dont_filter=True,
                )

    def parse_woman_li(self, response):
        """Extract the "label:value" attribute entries of a detail page.

        Only labels listed in ``attribute_names`` are kept. As before, every
        ``:`` is removed from the stored value.

        :param response: response of a product detail page.
        :returns: generator yielding at most one dict item; nothing is
            yielded when no known attribute was found.
        """
        self.logger.info("进入详情页")
        item = {}
        attribute_lists = response.xpath("//*[@id='detail']/div[2]/div[1]/div[1]/ul[2]")
        for attribute_list in attribute_lists:
            for text in attribute_list.xpath(".//li/text()").extract():
                label, colon, rest = text.partition(":")
                # Entries without a colon, or with an unknown label, carry
                # nothing this spider collects.
                if not colon or label not in self.attribute_names:
                    continue
                item[label] = rest.replace(":", "")
                self.logger.debug("%s===: %s", label, item[label])
        if item:
            yield item
woman_detail.py
# -*- coding: utf-8 -*-
import scrapy
import json
import requests
import datetime
import copy
from urllib.parse import quote
from urllib import request
from urllib import parse
class WomanSpider(scrapy.Spider):
    """Enrich previously scraped products with review statistics and price.

    ``parse`` reads the product list from ``woman1.json`` (an object whose
    ``"woman"`` key holds a list of product dicts), ``parse_womans`` adds the
    review summary counters and ``parse_womans_message`` adds the price and
    the crawl date before the item is emitted.
    """

    name = 'woman_detail'
    # Domain names only; the review and price endpoints live on these hosts.
    allowed_domains = ['jd.com', '3.cn']
    start_urls = ['https://search.jd.com/Search?keyword=女装&wq=女装&page=1']

    comment_url_template = (
        "https://sclub.jd.com/comment/skuProductPageComments.action"
        "?productId={sku}&score=0&sortType=5&page=0&pageSize=10"
    )
    price_url_template = "https://p.3.cn/prices/mgets?skuIds=J_{sku}"

    # Attributes copied from the JSON file into every item.
    copied_fields = (
        '商品名称', '商品编号', '商品毛重', '货号', '材质',
        '风格', '流行元素', '适用年龄', '上市时间', '图案',
    )
    # (item field, key inside "productCommentSummary") pairs.
    summary_fields = (
        ("总评价数", "commentCount"),
        ("好评度", "goodRateShow"),
        ("好评数", "goodCount"),
        ("默认好评数", "defaultGoodCount"),
        ("中评数", "generalCount"),
        ("差评数", "poorCount"),
        ("视频晒单", "videoCount"),
    )

    def parse(self, response):
        """Yield one review-summary request per product in ``woman1.json``.

        :param response: response of the start URL (unused).
        :returns: generator of ``scrapy.Request`` objects; each carries its
            own item dict in ``meta["item"]``.
        """
        # Read everything up front so the file is closed before any request
        # is yielded.
        with open('.//woman1.json', 'r', encoding='utf-8') as load_f:
            raw = load_f.read()
        if not raw.strip():
            self.logger.warning("这个文件中的json数据为空")
            return

        products = json.loads(raw)["woman"]
        self.logger.debug("这是文件中的json数据: %s", products)
        for product in products:
            # A fresh dict per product: values must never carry over from
            # the previous product when an attribute is missing.
            item = {
                field: product[field]
                for field in self.copied_fields
                if field in product
            }
            sku = str(item.get("商品编号", "")).strip()
            if not sku:
                self.logger.warning("Skipping product without 商品编号: %r", product)
                continue
            yield scrapy.Request(
                url=self.comment_url_template.format(sku=sku),
                callback=self.parse_womans,
                meta={"item": item},
                dont_filter=True,
            )

    def parse_womans(self, response):
        """Add the review counters to the item and request the price.

        :param response: JSON response of the review endpoint; the item is
            taken from ``response.meta["item"]``.
        :returns: generator yielding the price request, or nothing when the
            reply cannot be parsed.
        """
        item = response.meta["item"]
        try:
            summary = json.loads(response.text)["productCommentSummary"]
            stats = {field: summary[key] for field, key in self.summary_fields}
        except (ValueError, KeyError, TypeError):
            # Empty body, non-JSON reply or an unexpected payload shape.
            self.logger.warning(
                "No usable review summary for %s; item skipped",
                item.get("商品编号"),
            )
            return
        item.update(stats)
        self.logger.debug("review summary: %s", stats)

        sku = str(item["商品编号"]).strip()
        yield scrapy.Request(
            url=self.price_url_template.format(sku=sku),
            callback=self.parse_womans_message,
            meta={"item": item},
            dont_filter=True,
        )

    def parse_womans_message(self, response):
        """Add the price and the crawl date, then emit the finished item.

        :param response: JSON response of the price endpoint (a list of
            entries with a ``"p"`` key in the replies this code handles).
        :returns: generator yielding the completed item dict; ``商品价格`` is
            ``None`` when no price could be read.
        """
        item = response.meta["item"]
        try:
            price_list = json.loads(response.text)
        except ValueError:
            price_list = []
        self.logger.debug("price_list: %s", price_list)

        prices = []
        if isinstance(price_list, list):
            prices = [
                entry["p"]
                for entry in price_list
                if isinstance(entry, dict) and "p" in entry
            ]
        # As before, the last entry wins when several are returned.
        item["商品价格"] = prices[-1] if prices else None
        item["爬取时间"] = datetime.datetime.now().strftime('%Y-%m-%d')
        yield item
items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class JdwomanItem(scrapy.Item):
    """Declares every field collected for one JD women's-clothing product."""

    # Attributes read from the product detail page.
    商品名称 = scrapy.Field()
    商品编号 = scrapy.Field()
    商品毛重 = scrapy.Field()
    货号 = scrapy.Field()
    材质 = scrapy.Field()
    风格 = scrapy.Field()
    流行元素 = scrapy.Field()
    适用年龄 = scrapy.Field()
    上市时间 = scrapy.Field()
    图案 = scrapy.Field()

    # Review statistics taken from the comment summary.
    总评价数 = scrapy.Field()
    好评度 = scrapy.Field()
    好评数 = scrapy.Field()
    默认好评数 = scrapy.Field()
    中评数 = scrapy.Field()
    差评数 = scrapy.Field()
    视频晒单 = scrapy.Field()

    # Price and the date on which it was crawled.
    商品价格 = scrapy.Field()
    爬取时间 = scrapy.Field()
pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.exporters import JsonLinesItemExporter
from openpyxl import Workbook
class JdwomanPipeline(object):
    """Write every item to ``woman.json`` (JSON lines) and ``woman_1.xlsx``.

    Items may carry only part of the columns (the ``woman`` spider yields the
    detail-page attributes without review or price data); missing columns are
    written as empty cells instead of raising ``KeyError``.
    """

    # Column order of the spreadsheet; also used as the header row.
    FIELDS = [
        '商品名称', '商品编号', '商品毛重', '货号', '材质', '风格', '流行元素',
        '适用年龄', '上市时间', '图案', '总评价数', '好评度', '好评数',
        '默认好评数', '中评数', '差评数', '视频晒单', '商品价格', '爬取时间',
    ]

    def __init__(self):
        """Open the JSON-lines output and prepare the workbook with its header."""
        # JSON-lines export.
        self.book_fp = open('woman.json', 'wb')
        self.book_exporter = JsonLinesItemExporter(self.book_fp, ensure_ascii=False)
        # Excel export.
        self.wb = Workbook()
        self.ws = self.wb.active
        self.ws.append(list(self.FIELDS))

    def process_item(self, item, spider):
        """Export one item to both outputs.

        :param item: mapping of field name to value; absent fields are allowed.
        :param spider: spider that produced the item (unused).
        :returns: the unchanged ``item``, so later pipelines still receive it.
        """
        self.book_exporter.export_item(item)
        # .get() yields None for absent fields, which becomes an empty cell.
        self.ws.append([item.get(field) for field in self.FIELDS])
        return item

    def close_spider(self, spider):
        """Save the workbook once and close the JSON file.

        The workbook is written here rather than after every item, so the
        file is not rewritten for each row.
        """
        try:
            self.wb.save('woman_1.xlsx')
        finally:
            self.book_fp.close()
middlewares.py
from scrapy import signals
import random
class UserAgentDownloadMiddleware(object):
    """Downloader middleware that gives each request a random User-Agent."""

    # Pool of browser identification strings to choose from.
    USER_AGENTS = [
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
        "Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16",
        "Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14",
        "Mozilla/5.0 (Windows NT 6.0; rv:2.0) Gecko/20100101 Firefox/4.0 Opera 12.14",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0) Opera 12.14",
        "Opera/12.80 (Windows NT 5.1; U; en) Presto/2.10.289 Version/12.02",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1",
        "Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:33.0) Gecko/20100101 Firefox/33.0",
        "Mozilla/5.0 (X11; Linux i586; rv:31.0) Gecko/20100101 Firefox/31.0",
    ]

    def process_request(self, request, spider):
        """Overwrite the request's ``User-Agent`` header with a random pool entry.

        :param request: outgoing request whose ``headers`` mapping is modified.
        :param spider: spider issuing the request (unused).
        :returns: ``None``.
        """
        request.headers['User-Agent'] = random.choice(self.USER_AGENTS)
settings.py
# -*- coding: utf-8 -*-
# Scrapy settings for jdWoman project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# Project identity and the package in which spiders are looked up / created.
BOT_NAME = 'jdWoman'
SPIDER_MODULES = ['jdWoman.spiders']
NEWSPIDER_MODULE = 'jdWoman.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'jdWoman (+http://www.yourdomain.com)'
# Obey robots.txt rules
# NOTE(review): robots.txt checking is switched off here; confirm that this is
# intended and acceptable for the target site.
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# A delay of 3 is configured between requests to the same website.
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
# NOTE(review): 'Accept-Language' is 'en' although the crawled site is Chinese;
# confirm that this is the intended value.
DEFAULT_REQUEST_HEADERS = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en',
}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'jdWoman.middlewares.JdwomanSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# Only the project's random User-Agent middleware is enabled; the generated
# JdwomanDownloaderMiddleware entry stays commented out.
DOWNLOADER_MIDDLEWARES = {
# 'jdWoman.middlewares.JdwomanDownloaderMiddleware': 543,
'jdWoman.middlewares.UserAgentDownloadMiddleware': 10,
}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# Scraped items are handed to JdwomanPipeline (order value 300).
ITEM_PIPELINES = {
'jdWoman.pipelines.JdwomanPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
main.py启动类(启动woman.py)
from scrapy.cmdline import execute
# Start the "woman" spider with the same arguments as `scrapy crawl woman`.
execute("scrapy crawl woman".split())
main1.py 启动类(启动woman_detail.py)
from scrapy.cmdline import execute
# Start the "woman_detail" spider with the same arguments as
# `scrapy crawl woman_detail`.
execute("scrapy crawl woman_detail".split())
最累的时候家人是你最好的归宿!