spider
import time
import json
import scrapy
from inline_requests import inline_requests
from fake_useragent import UserAgent
from lxml import etree
from shell.items import ShellItem
from shell.dataprocess import get_ymd_time
class ShellscrpySpider(scrapy.Spider):
    name = 'Allcity'
    allowed_domains = ['xa.ke.com']  # must match the city subdomain in start_urls
    start_urls = ['https://xa.ke.com/xiaoqu/']  # earlier targets: https://www.ke.com/city, https://bj.ke.com/ershoufang/
    headers = {
        "User-Agent": UserAgent().random,  # random User-Agent for each run
        "Cookie": '<fill in your own>',
        "Referer": "https://xa.ke.com/xiaoqu/weiyang/",
    }
    # Constant fields attached to every item scraped in this run
    classItems = {
        # "dbName": "esfcommunity", "collectionName": "beke",
        "province": "陕西省", "city": "西安市", "spiderDate": get_ymd_time()
    }
    def start_requests(self):  # issue the initial request for the xiaoqu index page
        yield scrapy.Request(url=self.start_urls[0], headers=self.headers, dont_filter=True)
    @inline_requests
    def parse(self, response):
        # Collect the district links from the city's xiaoqu index page
        quurl = response.xpath('//div[@data-role="ershoufang"]/div/a/@href').extract()
        listquurl = ["".join(["https://xa.ke.com", i]) for i in quurl]
        print(len(listquurl))
        for url in listquurl:
            res1 = yield scrapy.Request(url=url, headers=self.headers, dont_filter=True)
            # Second-level page: the pager's page-data attribute holds JSON like
            # '{"totalPage":30,"curPage":1}', which gives the page count
            page_data = res1.xpath('//div[@class="page-box fr"]/div/@page-data').get()
            total_page = json.loads(page_data)["totalPage"]
            list2 = ["".join([url, "pg", str(j)]) for j in range(1, total_page + 1)]
            time.sleep(2)  # throttle between districts
            for page_url in list2:
                print(page_url)
                res3 = yield scrapy.Request(url=page_url, headers=self.headers, dont_filter=True)
                for rx in res3.xpath('//ul[@class="listContent"]/li'):
                    # Build a fresh item per listing; reusing one mutable item
                    # across yields would overwrite earlier results
                    items = ShellItem()
                    items.update(self.classItems)
                    rx1 = etree.HTML(rx.get())
                    items["projectName"] = rx1.xpath('//div[1]/div[1]/a[@class="maidian-detail"]/text()')[0]  # community name
                    try:
                        items["referencePrice"] = rx1.xpath('//div[2]/div[1]/div[@class="totalPrice"]/span/text()')[0]  # price
                    except IndexError:
                        items["referencePrice"] = "--"
                    items["region"] = rx1.xpath('//div[1]/div[3]/a[position()<2]/text()')[0] + "区"
                    # priceDesc reads like "3月" (month of the quote); zero-pad
                    # so October onwards doesn't become "2022-010"
                    month = rx1.xpath('//div[2]/div[1]/div[@class="priceDesc"]/text()')[0].split("月")[0]
                    items["updateDate"] = "2022-" + month.zfill(2)
                    yield items
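The spider imports ShellItem from shell.items and get_ymd_time from shell.dataprocess, neither of which is shown above. A minimal sketch of both, assuming the item only needs the fields the spider actually assigns and that get_ymd_time returns the crawl date as a "YYYY-MM-DD" string:

# shell/items.py -- sketch; field set inferred from the spider above
import scrapy

class ShellItem(scrapy.Item):
    province = scrapy.Field()
    city = scrapy.Field()
    spiderDate = scrapy.Field()
    projectName = scrapy.Field()
    referencePrice = scrapy.Field()
    region = scrapy.Field()
    updateDate = scrapy.Field()

# shell/dataprocess.py -- sketch; assumed to return e.g. "2022-03-01"
import time

def get_ymd_time():
    return time.strftime("%Y-%m-%d", time.localtime())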
pipeline
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import pymongo
class ShellPipeline:
    def __init__(self, mongo_url_bk, mongo_db_bk, mongo_coll_bk, mongo_account_number, mongo_password):
        self.mongo_url_bk = mongo_url_bk
        self.mongo_db_bk = mongo_db_bk
        self.mongo_coll_bk = mongo_coll_bk
        self.mongo_account_number = mongo_account_number
        self.mongo_password = mongo_password
    @classmethod
    def from_crawler(cls, crawler):
        """
        Scrapy calls this hook with access to the crawler; pull the
        database URI and names from settings.py here.
        """
        return cls(
            mongo_url_bk=crawler.settings.get('MONGO_URL_BK'),
            mongo_db_bk=crawler.settings.get('MONGO_DB_BK'),
            mongo_coll_bk=crawler.settings.get('MONGO_COLL'),
            mongo_account_number=crawler.settings.get('MONGO_ACCOUNT_NUMBER'),
            mongo_password=crawler.settings.get('MONGO_PASSWORD'),
        )
    def open_spider(self, spider):
        """
        Called once when the spider opens: connect to MongoDB.
        """
        self.client_bk = pymongo.MongoClient(self.mongo_url_bk)  # connect
        self.db = self.client_bk[self.mongo_db_bk]  # select the database
        # self.db.authenticate(self.mongo_account_number, self.mongo_password)
        self.coll = self.db[self.mongo_coll_bk]  # select the collection
    def close_spider(self, spider):
        """
        Called once when the spider closes: close the connection.
        """
        self.client_bk.close()
    def process_item(self, items, spider):
        post_item = ItemAdapter(items).asdict()  # convert the item to a plain dict
        self.coll.insert_one(post_item)  # insert one document into the collection
        return items
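from_crawler above reads five keys from settings.py. A sketch of the corresponding entries: the URI is a local-default placeholder, the database and collection names are borrowed from the commented dbName/collectionName hint in the spider, and the ITEM_PIPELINES path assumes the class lives in the project's standard pipelines.py.

# settings.py -- sketch; replace placeholder values with your own
MONGO_URL_BK = 'mongodb://localhost:27017'
MONGO_DB_BK = 'esfcommunity'
MONGO_COLL = 'beke'
MONGO_ACCOUNT_NUMBER = ''  # fill in if your MongoDB requires auth
MONGO_PASSWORD = ''
ITEM_PIPELINES = {
    'shell.pipelines.ShellPipeline': 300,
}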
import into MongoDB
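Once the crawl finishes, a quick pymongo check confirms the documents landed in the collection. A sketch reusing the placeholder URI and names from the settings above:

import pymongo

client = pymongo.MongoClient('mongodb://localhost:27017')
coll = client['esfcommunity']['beke']
print(coll.count_documents({}))  # total listings stored
print(coll.find_one())           # inspect one scraped record
client.close()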