03-02 Chengdu Second-Hand Housing: The Crawler Part

1. The spider file

import scrapy
from LianJia.items import LianjiaItem


class LianJiaSpider(scrapy.Spider):
    name = 'LianJia'
    # allowed_domains = ['www.xxx.com']
    # starting URL
    # start_urls = ['https://cd.lianjia.com/ershoufang/pg1/']
    # Lianjia's URL slugs for the districts of Chengdu
    district_list = ['jinjiang', 'qingyang', 'wuhou', 'gaoxin7', 'chenghua', 'jinniu',
                     'tianfuxinqu', 'gaoxinxi1', 'shuangliu', 'wenjiang', 'pidou',
                     'longquanyi', 'xindou', 'tianfuxinqunanqu', 'qingbaijiang',
                     'doujiangyan', 'pengzhou', 'jianyang', 'xinjin', 'chongzhou1',
                     'dayi', 'jintang', 'pujiang', 'qionglai']
    # URL template: filled in with a district slug and a page number
    url = 'https://cd.lianjia.com/ershoufang/{}/pg{}/'

    def start_requests(self):
        # crawl the first two listing pages of every district
        for district in self.district_list:
            for i in range(1, 3):
                url = self.url.format(district, i)
                yield scrapy.Request(url=url, callback=self.parse_detail_url)

    # parse a house's detail page and extract its attributes
    def parse_detail_page(self, response):
        item = LianjiaItem()
        try:
            item['title'] = response.xpath('.//div[@class="title"]/h1/text()').extract_first()
            item['total_price'] = response.xpath('/html/body/div[5]/div[2]/div[3]/span//text()').extract_first()
            item['price'] = response.xpath('/html/body/div[5]/div[2]/div[3]/div[1]/div[1]/span//text()').extract_first()
            item['build_time'] = response.xpath('/html/body/div[5]/div[2]/div[4]/div[3]/div[2]/text()').extract_first()
            item['community_name'] = response.xpath('/html/body/div[5]/div[2]/div[5]/div[1]/a[1]/text()').extract_first()
            item['district'] = response.xpath('/html/body/div[5]/div[2]/div[5]/div[2]/span[2]//text()').extract_first()

            item['number'] = response.xpath('/html/body/div[5]/div[2]/div[5]/div[4]/span[2]/text()').extract_first()

            # "basic info" list: the value is the second text node of each <li>
            item['house_type'] = response.xpath('.//div[@class="content"]/ul/li[1]//text()')[1].extract()
            item['floor'] = response.xpath('.//div[@class="content"]/ul/li[2]//text()')[1].extract()
            item['area'] = response.xpath('.//div[@class="content"]/ul/li[3]//text()')[1].extract()
            item['structure'] = response.xpath('.//div[@class="content"]/ul/li[4]//text()')[1].extract()
            item['inside_space'] = response.xpath('.//div[@class="content"]/ul/li[5]//text()')[1].extract()
            item['building_type'] = response.xpath('.//div[@class="content"]/ul/li[6]//text()')[1].extract()
            item['orientation'] = response.xpath('.//div[@class="content"]/ul/li[7]//text()')[1].extract()

            item['decoration'] = response.xpath('.//div[@class="content"]/ul/li[9]//text()')[1].extract()
            item['tihubi'] = response.xpath('.//div[@class="content"]/ul/li[10]//text()')[1].extract()
            item['lift'] = response.xpath('.//div[@class="content"]/ul/li[11]//text()')[1].extract()
            item['listing_time'] = response.xpath('.//div[@class="transaction"]/div/ul/li[1]/a/text()').extract_first()
            item['owner'] = response.xpath('.//div[@class="transaction"]/div/ul/li[2]/a/text()').extract_first()
            item['last_deal'] = response.xpath('.//div[@class="transaction"]/div/ul/li[3]/a/text()').extract_first()
            item['house_use'] = response.xpath('.//div[@class="transaction"]/div/ul/li[4]/a/text()').extract_first()
            item['house_age'] = response.xpath('.//div[@class="transaction"]/div/ul/li[5]/a/text()').extract_first()
            item['mortgage_details'] = response.xpath('.//div[@class="transaction"]/div/ul/li[7]/span[2]/text()').extract_first()
        except Exception as e:
            # the absolute XPaths are brittle; log the failure instead of a bare print
            self.logger.error('failed to parse %s: %s', response.url, e)

        # yield whatever was extracted (possibly partial) to the item pipeline
        yield item

    # parse a listing page and extract each house's detail-page URL
    def parse_detail_url(self, response):
        detail_url_list = response.xpath('//a[@class="noresultRecommend img LOGCLICKDATA"]/@href').extract()
        for url in detail_url_list:
            yield scrapy.Request(url=url, callback=self.parse_detail_page)
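
Besides running "scrapy crawl LianJia" from the command line, the spider can also be launched from a plain Python script via Scrapy's CrawlerProcess API. A minimal sketch, assuming a hypothetical run.py placed in the project root (next to scrapy.cfg) so the project settings can be discovered:

# run.py -- hypothetical launcher script, placed next to scrapy.cfg
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

if __name__ == '__main__':
    process = CrawlerProcess(get_project_settings())  # loads LianJia/settings.py
    process.crawl('LianJia')  # the spider's name attribute
    process.start()           # blocks until the crawl finishes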

2. The settings file

BOT_NAME = 'LianJia'

SPIDER_MODULES = ['LianJia.spiders']
NEWSPIDER_MODULE = 'LianJia.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False  # do not obey the robots.txt protocol
LOG_LEVEL = 'ERROR'     # only log errors

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 100

DOWNLOAD_TIMEOUT = 3   # give up on any response slower than 3 seconds
RETRY_ENABLED = False  # do not retry failed requests

ITEM_PIPELINES = {
   'LianJia.pipelines.LianjiaPipeline': 300,
}
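
With a 3-second timeout, retries disabled, and 100 concurrent requests, this configuration favors speed over completeness: any slow response is silently dropped. A gentler alternative sketch using Scrapy's built-in AutoThrottle extension (illustrative values, not tuned for Lianjia):

# alternative throttling sketch -- illustrative values
AUTOTHROTTLE_ENABLED = True    # adapt delays to observed latencies
AUTOTHROTTLE_START_DELAY = 1
AUTOTHROTTLE_MAX_DELAY = 10
DOWNLOAD_DELAY = 0.5           # base delay between requests
RETRY_ENABLED = True
RETRY_TIMES = 2                # retry transient failures up to twice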

3. The items file


import scrapy


class LianjiaItem(scrapy.Item):
    # one Field per key assigned in the spider; an undeclared key raises KeyError
    title = scrapy.Field()
    total_price = scrapy.Field()
    price = scrapy.Field()
    build_time = scrapy.Field()
    community_name = scrapy.Field()

    district = scrapy.Field()
    number = scrapy.Field()
    house_type = scrapy.Field()
    floor = scrapy.Field()
    area = scrapy.Field()

    structure = scrapy.Field()
    inside_space = scrapy.Field()
    building_type = scrapy.Field()
    orientation = scrapy.Field()

    decoration = scrapy.Field()
    tihubi = scrapy.Field()  # elevator-to-household ratio (梯户比)
    lift = scrapy.Field()
    listing_time = scrapy.Field()
    owner = scrapy.Field()
    last_deal = scrapy.Field()

    house_use = scrapy.Field()
    house_age = scrapy.Field()
    mortgage_details = scrapy.Field()
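
Scrapy items reject keys that were not declared, which is why title, area, inside_space, owner, and last_deal all need their own Field above to match the spider. A quick hypothetical check:

item = LianjiaItem()
item['title'] = 'some listing'  # fine: declared above
# item['garage'] = 'yes'        # would raise KeyError: field not declared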

4. The pipelines file

import pymysql

class LianjiaPipeline(object):
    conn = None
    cursor = None

    def open_spider(self, spider):
        # opened once when the spider starts
        self.conn = pymysql.Connect(host='localhost',
                                    port=3306,
                                    user='root',
                                    password='123456',
                                    database='chengdu',
                                    charset='utf8mb4')  # handle Chinese text
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # parameterized INSERT: the driver escapes the values
        columns = ', '.join(item.keys())
        placeholders = ', '.join(['%s'] * len(item))
        sql = 'INSERT INTO lianjia ({}) VALUES ({})'.format(columns, placeholders)
        self.cursor.execute(sql, list(item.values()))
        self.conn.commit()
        return item

    def close_spider(self, spider):
        # closed once when the spider finishes
        self.cursor.close()
        self.conn.close()
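
The pipeline assumes a lianjia table already exists in the chengdu database. A hypothetical one-off helper to create it, storing every field as free text (the VARCHAR columns are an assumption; refine the types as needed):

# create_table.py -- hypothetical one-off helper
import pymysql

FIELDS = ['title', 'total_price', 'price', 'build_time', 'community_name',
          'district', 'number', 'house_type', 'floor', 'area', 'structure',
          'inside_space', 'building_type', 'orientation', 'decoration',
          'tihubi', 'lift', 'listing_time', 'owner', 'last_deal',
          'house_use', 'house_age', 'mortgage_details']

conn = pymysql.Connect(host='localhost', port=3306, user='root',
                       password='123456', database='chengdu', charset='utf8mb4')
columns = ', '.join('{} VARCHAR(255)'.format(f) for f in FIELDS)
with conn.cursor() as cursor:
    cursor.execute('CREATE TABLE IF NOT EXISTS lianjia ({})'.format(columns))
conn.commit()
conn.close()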