Spider file: lianjia.py
# -*- coding: utf-8 -*-
import scrapy

from pachong6.items import Pachong6Item


class LianjiaSpider(scrapy.Spider):
    name = 'lianjia'
    allowed_domains = ['m.lianjia.com']
    # Pages 1-3 of the Beijing agent listing on the mobile site
    start_urls = ['https://m.lianjia.com/bj/jingjiren/ao22pg' + str(x) for x in range(1, 4)]

    def parse(self, response):
        # Each agent card on the listing page
        agentlist = response.xpath('//*[@class="jingjiren-list__agent-item"]')
        for agent in agentlist:
            item = Pachong6Item()
            item['name'] = agent.xpath('div/div/div[2]/div[1]/span/text()').extract_first()
            item['region'] = agent.xpath('div/div/div[2]/p/text()').extract_first()
            item['tran_num'] = agent.xpath('div/div/div[2]/div[3]/div[1]/span/text()').extract_first()
            # print("Agent name:", item['name'])
            # print("Agent's area:", item['region'])
            # print("Agent's historical transaction count:", item['tran_num'])
            yield item
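The spider imports Pachong6Item from pachong6/items.py, which the original post does not show. A minimal sketch, assuming the item declares only the three fields the spider actually sets:

# -*- coding: utf-8 -*-
# pachong6/items.py -- hypothetical reconstruction, not from the original post
import scrapy


class Pachong6Item(scrapy.Item):
    name = scrapy.Field()      # agent name
    region = scrapy.Field()    # area the agent is responsible for
    tran_num = scrapy.Field()  # historical transaction count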
Storing the data in MongoDB:
# -*- coding: utf-8 -*-
from pymongo import MongoClient


class Pachong6Pipeline(object):
    # Connect to MongoDB and select the database and collection in
    # open_spider; this could also be done in __init__.
    def open_spider(self, spider):
        dataIp = '127.0.0.1'
        dataPort = 27017
        dbName = 'lianjia_db'
        sheetName = 'collections_db'
        # Get the database connection
        self.client = MongoClient(dataIp, dataPort)
        # Select the database
        self.db = self.client[dbName]
        # Select the collection
        self.collection = self.db[sheetName]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # Convert the Item to a dict, then insert it
        self.collection.insert_one(dict(item))
        # Return the item so any later pipelines can still process it
        return item
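The pipeline only runs if it is enabled in the project's settings.py. Assuming the default Scrapy project layout, the usual entry looks like this (300 is just the conventional priority value; lower numbers run first):

# pachong6/settings.py
ITEM_PIPELINES = {
    'pachong6.pipelines.Pachong6Pipeline': 300,
}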
Result: after running scrapy crawl lianjia, the agent records are stored in the collections_db collection of the lianjia_db database.
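The stored documents can be checked directly with pymongo. A quick sketch, assuming mongod is running locally on the default port:

from pymongo import MongoClient

client = MongoClient('127.0.0.1', 27017)
collection = client['lianjia_db']['collections_db']
print(collection.count_documents({}))   # number of agents stored
for doc in collection.find().limit(3):  # peek at the first few documents
    print(doc)
client.close()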