这一篇使用scrapy爬虫框架实现亚马逊商品评论的抓取。
1、创建一个爬虫项目:
scrapy startproject mySpiderTest
2、在items.py中定义数据item:
import scrapy
from scrapy.item import Field, Item
class ItcastItem(Item):
    """Scrapy item declaring three fields: ``name``, ``title`` and ``info``."""

    # The fields must be indented inside the class body; at column 0 they
    # would be plain module-level names and the class itself would be empty.
    name = Field()
    title = Field()
    info = Field()
class AmazonReviewItem(Item):
    """Review information collected for a product."""

    user_id = Field()
    user_name = Field()
    data_asin = Field()
    name = Field()  # product name
    review_title = Field()
    review_star_rating = Field()  # rating
    review_date = Field()  # date
    review_info = Field()
class AmazonGoodsItem(Item):
    """Product (goods) information."""

    # Declared with the same ``Item`` / ``Field`` names as the other item
    # classes in this module; they are the same objects as ``scrapy.Item``
    # and ``scrapy.Field``.
    # collection = 'amazon'  # data table name (left disabled, as in the original)
    s_href = Field()  # sub-category URL
    data_asin = Field()  # product number
    name = Field()  # product name
    goods_url = Field()  # product URL
    brand = Field()  # product brand
    price = Field()  # product price
    freight = Field()  # shipping cost
3、在spiders目录下创建爬虫amazon_review.py:
# -*- coding: utf-8 -*-
import scrapy
from urllib import parse as url_parse
from mySpiderTest.items import AmazonGoodsItem, AmazonReviewItem
import re
from copy import deepcopy
# 爬取亚马逊评论信息
# 通过搜索关键字查询出来的列表,如k=phone
class AmazonReviewSpider(scrapy.Spider):
name = 'amazon_review'
allowed_domains = ['www.amazon.com']
# start_urls = ['https://www.amazon.com/s?k=phone&ref=nb_sb_noss']
def __init__(self, category=None, *args, **kwargs):
super(AmazonReviewSpider, self).__init__(*args, **kwargs)
self.start_urls = []
if category is not None:
keys = category.split(",")
for key