Scraping the Lianjia housing site with the Scrapy framework

Commands to create the project:
First open a folder and open it in PyCharm, then create the project:
scrapy startproject lianjiaspiders
Change into it with: cd lianjiaspiders
Inside lianjiaspiders, generate the spider:
scrapy genspider lianjia https://www.lianjia.com/city/
After the commands complete, the new project is in place.
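For reference, scrapy startproject and scrapy genspider produce the standard Scrapy layout, sketched below (the names follow the project and spider names used above):

lianjiaspiders/
├── scrapy.cfg
└── lianjiaspiders/
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        ├── __init__.py
        └── lianjia.py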

The spider code is written in lianjia.py inside the spiders folder:

# -*- coding: utf-8 -*-

import scrapy
import re
import requests
from lianjiaspiders.items import LianjiaspiderItem
from fake_useragent import UserAgent


class LianjiaSpider(scrapy.Spider):
    name = 'lianjia'
    # allowed_domains = ['bj.lianjia.com']
    # Start URL: the page that lists every city
    start_urls = ["https://www.lianjia.com/city/"]

    def parse(self, response):
        # Grab the URL of every city from the city list page
        city_url_list = response.xpath("//div[@class='city_list']//ul/li/a/@href").extract()
        # Append "zufang" (rentals) to build each city's rental listing URL
        for url in city_url_list:
            city_url = url + "zufang"
            yield scrapy.Request(url=city_url, callback=self.business_parse)

    def business_parse(self, response):

        # Get the list of business-district URLs (skip the first "all" entry)
        business_url_list = response.xpath("//ul[@data-target='area']/li[position()>1]/a/@href").extract()
        # print(business_url_list)
        for url in business_url_list:
            # urljoin keeps each request on its own city's domain
            business_url = response.urljoin(url)
            # print(business_url)
            yield scrapy.Request(url=business_url, callback=self.parse_page_url, meta={"data": business_url})

    def parse_page_url(self, response):
        # print(response.url)

        # Narrow the scope: one div per listing card
        div_list = response.xpath("//div[@class='content__list']/div")
        for div in div_list:
            # Image: swap the thumbnail size for a larger one
            pic = div.xpath(".//img/@data-src").extract()[0]
            pic = pic.replace("250x182", "2500x1800")
            # print(pic)

            # Title
            title = div.xpath(".//p[@class='content__list--item--title twoline']/a/text()").extract()[0].strip()
            # print(title)
            # print(title)

            # District
            city_area = div.xpath(".//p[@class='content__list--item--des']/a[1]/text()").extract()[0]

            # Business district
            business_circle = div.xpath(".//p[@class='content__list--item--des']/a[2]/text()").extract()[0]
            # print(city_area, business_circle)

            # Floor area
            area = div.xpath(".//p[@class='content__list--item--des']//text()[4]").extract()
            area = area[0].strip() if area else ""  # guard against missing values
            # print(area)

            # Orientation
            toward = div.xpath(".//p[@class='content__list--item--des']//text()[5]").extract()[0].strip()
            # print(toward)

            # Room layout string, e.g. "2室1厅1卫"
            fang_info = div.xpath(".//p[@class='content__list--item--des']//text()[6]").extract()[0].strip()
            # print(fang_info)
            room = re.findall(r"(\d+)室", fang_info)    # bedrooms
            hall = re.findall(r"(\d+)厅", fang_info)    # living rooms
            toilet = re.findall(r"(\d+)卫", fang_info)  # bathrooms

            # Guard against missing values
            room = int(room[0]) if room else 0
            hall = int(hall[0]) if hall else 0
            toilet = int(toilet[0]) if toilet else 0
            # print(room, hall, toilet)

            # Publish date
            publish_date = div.xpath(".//p[@class='content__list--item--time oneline']/text()").extract()[0]
            # print(publish_date)

            # Tags
            sign_list = div.xpath(".//p[@class='content__list--item--bottom oneline']/i/text()").extract()
            # print(sign_list)
            # Join the tag list into a single string
            sign = "-".join(sign_list)
            # print(sign)

            # Price
            price = div.xpath(".//em/text()").extract()[0]
            # print(price)

            # Detail-page URL
            detail_url = div.xpath(".//p[@class='content__list--item--title twoline']/a/@href").extract()[0]
            # Build the absolute URL on the current city's domain
            detail_url = response.urljoin(detail_url)

            # Instantiate the item class and store every field in it
            item = LianjiaspiderItem()

            item["room"] = room
            item["hall"] = hall
            item["toilet"] = toilet
            item["pic"] = pic
            item["title"] = title
            item["city_area"] = city_area
            item["business_circle"] = business_circle
            item["area"] = area
            item["toward"] = toward
            item["sign"] = sign
            item["price"] = price
            item["publish_date"] = publish_date
            item["detail_url"] = detail_url
            # print(item)

            # Follow the detail page to grab the remaining fields
            yield scrapy.Request(url=detail_url, meta={"data": item}, callback=self.parse_detail, dont_filter=True)

        # Pagination: read the total page count, then request pages 2 through max_page
        # (the first page is the business_url itself, which we already parsed)
        max_page = response.xpath("//div[@class='content__pg']/@data-totalpage").extract()
        max_page = int(max_page[0]) if max_page else 1
        # print(max_page)
        business_url = response.meta["data"]
        for page in range(2, max_page + 1):
            # Build the full URL of each following page
            page_url = business_url + "pg{}/".format(page)
            # print(page_url)
            # Feed each page back into this same method (the dupefilter drops repeats)
            yield scrapy.Request(url=page_url, callback=self.parse_page_url,
                                 meta={"data": business_url})

    def parse_detail(self, response):

        # Retrieve the item passed along in meta
        item = response.meta["data"]

        # Floor information
        floor = response.xpath("//ul/li[@class='fl oneline'][8]/text()").extract()
        floor = floor[0] if floor else ""
        # The phone number sits behind an AJAX interface, so we first need the
        # ucid and the house_code
        # 1. Get the ucid (wrapped in a helper function)
        ucid = self.get_ucid(response)
        # 2. Get the house_code from the detail-page URL
        house_code = re.findall(r"zufang/(.*?)\.html", response.url)[0]
        # Build the full broker-interface URL on the same host as the detail page
        agent_url = response.urljoin(
            "/zufang/aj/house/brokers?house_codes={}&position=bottom&ucid={}".format(house_code, ucid))

        try:
            # Fetch the phone number from the JSON interface
            headers = {"User-Agent": UserAgent().random}
            json_data = requests.get(agent_url, headers=headers).json()
            # print(json_data)
            phone = json_data.get("data")[house_code][house_code].get("tp_number")
            # print(phone)
        except Exception as e:
            print(e)
            phone = ''

        # Store the phone number and floor in the item
        item["phone"] = phone
        item["floor"] = floor
        # print(item)
        yield item

    # Extract the agent ucid from the page, retrying up to 3 times on failure
    def get_ucid(self, response, count_ucid=1):
        try:
            ucid = response.xpath("//span[@class='contact__im im__online']/@data-info").extract()
            ucid = ucid[0] if ucid else ""
            # print(ucid)
            return ucid
        except Exception as e:
            print(e)
            if count_ucid >= 3:
                return ""
            return self.get_ucid(response, count_ucid + 1)

The code in items.py. This file acts as the field definition: the field names used everywhere else must match exactly what is declared here.

import scrapy


class LianjiaspiderItem(scrapy.Item):
    # define the fields for your item here like:
    room = scrapy.Field()              # bedrooms
    hall = scrapy.Field()              # living rooms
    toilet = scrapy.Field()            # bathrooms
    pic = scrapy.Field()               # image URL
    title = scrapy.Field()             # listing title
    city_area = scrapy.Field()         # district
    business_circle = scrapy.Field()   # business district
    area = scrapy.Field()              # floor area
    toward = scrapy.Field()            # orientation
    sign = scrapy.Field()              # tags
    price = scrapy.Field()             # price
    detail_url = scrapy.Field()        # detail-page URL
    publish_date = scrapy.Field()      # publish date
    floor = scrapy.Field()             # floor
    phone = scrapy.Field()             # agent phone number
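Because a Scrapy Item only accepts keys that were declared as fields, a misspelled key fails immediately, which is the quickest way to verify the naming rule above. A minimal check:

item = LianjiaspiderItem()
item["room"] = 2     # fine: "room" is declared above
item["rooms"] = 2    # KeyError: LianjiaspiderItem does not support field: rooms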

What needs to be done in the configuration (settings.py):
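A sketch of the settings this crawl typically needs (the values are assumptions, adjust to taste; the pipeline path assumes the project name used above):

# settings.py (sketch)
ROBOTSTXT_OBEY = False      # Lianjia's robots.txt would otherwise block most requests
DOWNLOAD_DELAY = 1          # optional: slow down to reduce the chance of a ban
DEFAULT_REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",  # any real browser UA
}
ITEM_PIPELINES = {
    "lianjiaspiders.pipelines.LianjiaspiderPipeline": 300,
}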
The code for the pipeline in pipelines.py, including writing to the database:

import pymysql


class LianjiaspiderPipeline(object):
    def __init__(self):
        self.count = 1
        self.conn_mysql()

    def conn_mysql(self):
        # Connect to the local MySQL database
        self.conn = pymysql.connect(host="127.0.0.1", user="root", password='123',
                                    database="0218", charset="utf8")
        # Cursor object for executing SQL
        self.cur = self.conn.cursor()

    def process_item(self, item, spider):
        # print(item)
        # Pull the data out of the item
        room = item["room"]
        hall = item["hall"]
        toilet = item["toilet"]
        pic = item["pic"]
        title = item["title"]
        city_area = item["city_area"]
        business_circle = item["business_circle"]
        area = item["area"]
        toward = item["toward"]
        sign = item["sign"]
        price = item["price"]
        detail_url = item["detail_url"]
        floor = item["floor"]
        phone = item["phone"]
        publish_date = item["publish_date"]

        sql = 'insert into lianjia_copy(room,hall,toilet,pic,title,city_area,business_circle,area,toward,publish_date,sign,price,detail_url,floor,phone) VALUES({},{},{},"{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}")'.\
            format(room, hall, toilet, pic, title, city_area, business_circle, area, toward, publish_date, sign, price, detail_url, floor, phone)

        # Uncomment to actually write the row to the database:
        # self.cur.execute(sql)
        # self.conn.commit()
        print(self.count, sql)
        self.count += 1

        return item
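The lianjia_copy table itself is never shown. Below is a minimal sketch of a matching schema (column types are guesses), together with a parameterized INSERT, which avoids the quoting problems str.format runs into when a title or tag contains quotes:

# Sketch: create a table whose columns match the item fields (types are assumptions)
self.cur.execute("""
    CREATE TABLE IF NOT EXISTS lianjia_copy (
        id INT PRIMARY KEY AUTO_INCREMENT,
        room INT, hall INT, toilet INT,
        pic VARCHAR(512), title VARCHAR(255),
        city_area VARCHAR(64), business_circle VARCHAR(64),
        area VARCHAR(32), toward VARCHAR(32),
        publish_date VARCHAR(32), sign VARCHAR(255),
        price VARCHAR(32), detail_url VARCHAR(512),
        floor VARCHAR(64), phone VARCHAR(32)
    ) DEFAULT CHARSET = utf8
""")

# Sketch: let pymysql do the escaping instead of building SQL with format()
sql = ("insert into lianjia_copy(room,hall,toilet,pic,title,city_area,business_circle,"
       "area,toward,publish_date,sign,price,detail_url,floor,phone) "
       "VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
self.cur.execute(sql, (room, hall, toilet, pic, title, city_area, business_circle,
                       area, toward, publish_date, sign, price, detail_url, floor, phone))
self.conn.commit()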

The command to run the crawl, placed in a main.py (the name must match name = 'lianjia' in the spider class):

from scrapy import cmdline
# cmdline.execute("scrapy crawl lianjia".split())
cmdline.execute("scrapy crawl lianjia --nolog".split())