目录
(一)目录结构
(二)yiyao.py中的内容
代码关键位置都有注释,请仔细看
# -*- coding: utf-8 -*-
import scrapy
from lxml import etree
from ..items import YaoItem
class YiyaoSpider(scrapy.Spider):
    """Spider that crawls medicine listings from 111.com.cn category pages 1-50.

    Each parsed product yields a ``YaoItem`` which Scrapy routes to the
    pipeline configured in ``settings.ITEM_PIPELINES``
    (``'crawl.pipelines.YaoPipeline': 300``).
    """
    name = 'yiyao'
    allowed_domains = ['111.com.cn']
    # Every URL listed here is queued as an initial request by Scrapy.
    # A comprehension (instead of a class-body for-loop) avoids leaking
    # `page`/`url` as class attributes.
    start_urls = [
        'https://www.111.com.cn/categories/953710-j%s.html' % page
        for page in range(1, 51)
    ]

    @staticmethod
    def _joined_text(selector, query):
        """Run `query` on `selector`, strip each text fragment, join into one string."""
        return ''.join(part.strip() for part in selector.xpath(query).extract())

    # Scrapy calls parse() with the response of each start_urls request.
    def parse(self, response):
        """Parse one category page; yield one populated YaoItem per product div."""
        li_list = response.xpath('//ul[@class="itemSearchList"]/li')
        print(len(li_list))
        for li in li_list:
            for div in li.xpath('./div'):
                # Instantiate the YaoItem class declared in items.py.
                item = YaoItem()
                # Price
                price = self._joined_text(div, './p[1]/span/text()')
                print(price)
                # Description
                description = self._joined_text(div, './p[2]//text()')
                print(description)
                # Store name
                store = self._joined_text(div, './div[1]/span[2]/text()')
                print(store)
                # Comment count; an empty string means no comments -> 0,
                # matching the original's fallback value.
                comment = self._joined_text(div, './div[1]//em/text()') or 0
                print(comment)
                item['price'] = price
                item['description'] = description
                item['store'] = store
                item['comment'] = comment
                # yield hands the populated YaoItem to the item pipeline
                # configured via settings.ITEM_PIPELINES.
                yield item
最后的yield会走settings里的
# ITEM_PIPELINES = {
# # 'crawl.pipelines.CrawlPipeline': 300,
# 'crawl.pipelines.YaoPipeline': 300,
# }
从而指向pipelines.py里的YaoPipeline类
(三)settings的相关配置
(四)pipelines.py相关设置
代码:
import json
import pymongo
class YaoPipeline(object):
    """Scrapy item pipeline that persists each crawled item into MongoDB."""

    def __init__(self):
        # Connect to the local MongoDB server (default port 27017).
        self.client = pymongo.MongoClient('localhost')
        # Database 'yiyao' and collection 'yao_detail' are created lazily
        # by MongoDB on the first write.
        self.db = self.client['yiyao']
        self.table = self.db['yao_detail']

    def process_item(self, item, spider):
        """Store one item as a document and pass it down the pipeline chain."""
        # Collection.insert() was deprecated and removed in PyMongo 4.x;
        # insert_one() is the supported single-document API.
        self.table.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        # Scrapy calls this hook when the spider finishes; release the
        # MongoDB connection instead of leaking it.
        self.client.close()
(五)运行
main.py右键运行即可开始爬取数据与写入
(六)界面化工具查看保存到MongoDB的内容
1、打开Robo 3T - 1.2
点击File,点击connect
点击create:
设置后点击save
其中yiyao是数据库名,来源
yao_detail是表,来源