No ill intent here; this is purely for learning.
Strategy:
1. Incrementally crawl the second-hand housing transaction data. The site exposes at most 3,000 records, so a direct incremental crawl once a day is enough.
2. For the historical data there are several approaches, and mine is not the optimal one: first crawl every residential community from Anjuke into the database (Anjuke's anti-scraping is fairly strong, so that crawl is incremental as well), then query Lianjia for the transaction listings of each community.
3. Better strategies exist, but since this is just practice I never polished it. Of the 50,000-odd records in total, about 40,000 were captured.
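The scripts below read and write two MySQL tables. Here is a minimal sketch of a schema that would satisfy their INSERT statements; the column types, lengths, and the setup script name are my assumptions, not the original definitions:

# setup_tables.py -- hypothetical helper; column types are inferred from the
# INSERT statements below, not taken from the author's schema
from utils.common import Mc

DDL = [
    """create table if not exists lianjia_ershoufang_xian (
        id int auto_increment primary key,
        house_id bigint unique,      -- Lianjia listing id, used for deduplication
        name varchar(64),            -- community name
        house_type varchar(32),      -- unit type
        house_size float,            -- floor area in square meters
        money_all int,               -- total price
        money_every int,             -- unit price
        success_data datetime,       -- deal date
        img varchar(255),            -- thumbnail url
        link varchar(255)            -- listing url
    )""",
    """create table if not exists xian_home (
        id int auto_increment primary key,
        home varchar(64),            -- community name, used for deduplication
        position varchar(64),        -- district / address
        money_every int              -- average listed price
    )""",
]

if __name__ == '__main__':
    mc = Mc()
    for ddl in DDL:
        mc.update(ddl)  # Mc.update executes and commits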
Code:
Incremental crawler:
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from utils.common import WebRequests, Mc
from scrapy import Selector
import re
import datetime
from urllib import parse
import time

# Incrementally crawl all Lianjia transaction listings
class Lianjia:
    def __init__(self):
        self.web_requests = WebRequests()
        self.mc = Mc()

    def run(self):
        url_one = "https://xa.lianjia.com/chengjiao/pg1/"
        response = self.web_requests.get(url_one)
        selector = Selector(text=response.text)
        url = selector.xpath("//div[@class='page-box house-lst-page-box']/@page-url").extract_first()
        page = selector.xpath("//div[@class='page-box house-lst-page-box']/@page-data").extract_first()
        page_dic = eval(page)  # page-data is a dict literal, e.g. {"totalPage":100,"curPage":1}
        total_page = page_dic.get('totalPage')
        curPage = page_dic.get('curPage')
        while curPage <= total_page:
            time.sleep(1)
            next_url = parse.urljoin(response.url, url.format(page=str(curPage)))
            print('===url:{}'.format(next_url))
            r = self.web_requests.get(next_url)
            selector = Selector(text=r.text)
            ul = selector.xpath("//ul[@class='listContent']/li")
            for li in ul:
                # Community name, unit type, floor area
                title = li.xpath('.//div[@class="info"]/div[@class="title"]/a/text()').extract_first()
                a = li.xpath('.//div[@class="info"]/div[@class="title"]/a/@href').extract_first()
                house_id = int(re.match(r'.*?(\d+).*', a).group(1))
                # Orientation
                position = li.xpath('.//div[@class="info"]/div[@class="address"]/div[@class="houseInfo"]/text()').extract_first()
                money_all = int(li.xpath('.//div[@class="totalPrice"]/span/text()').extract_first())  # total price
                money_every = int(li.xpath('.//div[@class="unitPrice"]/span/text()').extract_first())  # unit price
                success_data = li.xpath('.//div[@class="dealDate"]/text()').extract_first()  # deal date
                success_data = datetime.datetime.strptime(success_data, '%Y.%m.%d')
                link = li.xpath('.//div[@class="info"]/div[@class="title"]/a/@href').extract_first()  # listing link
                try:
                    name, house_type, size = title.split(' ')
                except Exception as e:
                    print('====error,title:{}'.format(title))
                    continue
                img = li.xpath('.//a/img/@src').extract_first()
                house_size = float(size.replace('平米', ''))
                sql = 'select * from lianjia_ershoufang_xian where house_id={}'.format(house_id)
                r = self.mc.query(sql)
                if not r:  # unseen house_id: this check is what makes the crawl incremental
                    sql = "insert into lianjia_ershoufang_xian (house_id,name,house_type,house_size,money_all,money_every,success_data,img,link) values ({},'{}','{}',{},{},{},'{}','{}','{}')".format(
                        house_id, name, house_type, house_size, money_all, money_every, success_data, img, link)
                    print(sql)
                    self.mc.insert(sql)
            curPage += 1

if __name__ == '__main__':
    Lianjia().run()
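Since the site only exposes roughly 3,000 recent deals, the script above is meant to run once a day. A minimal sketch of a scheduler, assuming the class above lives in a hypothetical module named incremental; an equivalent cron entry works just as well:

# run_daily.py -- a minimal sketch; "incremental" is a hypothetical module name
# for the script above. A cron entry like "0 3 * * * python incremental.py"
# achieves the same without a long-running process.
import time
from incremental import Lianjia

while True:
    Lianjia().run()           # only unseen house_ids get inserted
    time.sleep(24 * 60 * 60)  # wake up once a day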
Crawling all communities from Anjuke:
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import re
import time
from scrapy import Selector
from utils.common import WebRequests, Mc

# Crawl every residential community from Anjuke
class Home:
    def __init__(self):
        self.web_requests = WebRequests()
        self.mc = Mc()

    def run(self):
        url_one = "https://xa.anjuke.com/community/"
        response = self.web_requests.get(url_one)
        selector = Selector(text=response.text)
        urls = selector.xpath("//div[@class='div-border items-list']//div[1]/span[2]/a/@href").extract()
        positions = []
        for url in urls[15:]:  # the leading links are not district links
            position = re.match(r'https://xa.anjuke.com/community/(.*)/', url).group(1)
            positions.append(position)
        print(positions)
        anjuke_url = 'https://xa.anjuke.com/community/'
        for position in positions[1:]:
            url = anjuke_url + position + '/p{}'
            response = self.web_requests.get(url.format(1))
            selector = Selector(text=response.text)
            counts = selector.xpath("//div[@class='sortby']/span/em[2]/text()").extract()
            if counts and int(counts[0]) == 0:
                continue
            try:
                # 30 communities per page; round up so the last partial page is kept
                page_count = int(counts[0]) // 30 + 1
            except Exception as e:
                print(e)
                print(counts)
                continue
            for page in range(1, page_count + 1):
                print('====position:{},page:{}'.format(position, page))
                time.sleep(1)
                response = self.web_requests.get(url.format(page))
                selector = Selector(text=response.text)
                homes = selector.xpath("//div[@class='list-content']/div")
                for item in homes[1:]:  # the first div is not a community entry
                    home = item.xpath('.//div[@class="li-info"]/h3/a/text()').extract_first()
                    home = home.replace(' ', '').replace('\n', '')
                    quyu = item.xpath('.//div[@class="li-info"]/address/text()').extract_first().replace(' ', '').replace('\n', '')
                    price = item.xpath('.//div[@class="li-side"]/p/strong/text()').extract_first().replace('\n', '')
                    sql = "select * from xian_home where home='{}'".format(home)
                    r = self.mc.query(sql)
                    if not r:
                        sql = "insert into xian_home (home,position,money_every) values ('{}','{}',{})".format(home, quyu, price)
                        self.mc.insert(sql)
                    else:
                        sql = "update xian_home set money_every={} where home='{}'".format(price, home)
                        self.mc.update(sql)  # refresh the stored price for known communities

if __name__ == '__main__':
    Home().run()
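The select-then-insert-or-update pattern above costs two round trips per community. Assuming a unique key on the home column (which the original schema may not define), MySQL can collapse it into a single statement; a minimal sketch of a drop-in replacement for the if/else block inside run():

# upsert variant -- assumes xian_home has a unique key on home
sql = ("insert into xian_home (home,position,money_every) values ('{}','{}',{}) "
       "on duplicate key update money_every={}").format(home, quyu, price, price)
self.mc.insert(sql)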
Using the Anjuke community data to crawl all transaction listings for each community on Lianjia:
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from utils.common import WebRequests, Mc
from scrapy import Selector
import re
import datetime
import time

# For every community in the database, query all its transaction listings on Lianjia
class Lianjia:
    def __init__(self):
        self.web_requests = WebRequests()
        self.mc = Mc()

    def get_home(self):
        # Fetch every community from the database
        sql = 'select home from xian_home'
        homes = self.mc.query(sql)
        return homes

    def run(self):
        homes = self.get_home()
        for idx, home in enumerate(homes):
            url_first = 'http://xa.lianjia.com/chengjiao/pg1rs{}'.format(home[0])
            response = self.web_requests.get(url_first)
            selector = Selector(text=response.text)
            count = selector.xpath('//div[@class="total fl"]/span/text()').extract_first()
            if count:
                count = int(count.replace(' ', ''))
                pages = int(count / 30) + 1
            else:
                continue
            if pages > 50:  # skip suspiciously large result sets
                continue
            for page in range(1, pages + 1):
                time.sleep(1)
                url = 'http://xa.lianjia.com/chengjiao/pg{}rs{}/'.format(page, home[0])
                response = self.web_requests.get(url)
                selector = Selector(text=response.text)
                items = selector.xpath("//ul[@class='listContent']/li")
                for li in items:
                    try:
                        title = li.xpath('.//div[@class="info"]/div[@class="title"]/a/text()').extract_first()
                        if '车位' in title:  # skip parking-space listings
                            continue
                        a = li.xpath('.//div[@class="info"]/div[@class="title"]/a/@href').extract_first()
                        house_id = int(re.match(r'.*?(\d+).*', a).group(1))
                        # Orientation
                        position = li.xpath('.//div[@class="info"]/div[@class="address"]/div[@class="houseInfo"]/text()').extract_first()
                        money_all = int(li.xpath('.//div[@class="totalPrice"]/span/text()').extract_first())  # total price
                        money_every = int(li.xpath('.//div[@class="unitPrice"]/span/text()').extract_first())  # unit price
                        success_data = li.xpath('.//div[@class="dealDate"]/text()').extract_first()  # deal date
                        success_data = datetime.datetime.strptime(success_data, '%Y.%m.%d')
                        link = li.xpath('.//div[@class="info"]/div[@class="title"]/a/@href').extract_first()  # listing link
                        try:
                            name, house_type, size = title.split(' ')
                        except Exception as e:
                            print('====error,title:{}'.format(title))
                            continue
                        img = li.xpath('.//a/img/@src').extract_first()
                        try:
                            house_size = float(size.replace('平米', ''))
                        except Exception as e:
                            print('====error,title:{}'.format(title))
                            continue
                        sql = 'select * from lianjia_ershoufang_xian where house_id={}'.format(house_id)
                        r = self.mc.query(sql)
                        if not r:
                            sql = "insert into lianjia_ershoufang_xian (house_id,name,house_type,house_size,money_all,money_every,success_data,img,link) values ({},'{}','{}',{},{},{},'{}','{}','{}')".format(
                                house_id, name, house_type, house_size, money_all, money_every, success_data, img, link)
                            print(sql)
                            self.mc.insert(sql)
                    except Exception as e:
                        continue

if __name__ == '__main__':
    Lianjia().run()
Utility model (database credentials redacted):
import pymysql
import sys

class Mc:
    '''
    Mc: wraps a few common MySQL operations.
    query(sql): SELECT; returns a list of row tuples (empty list if no rows)
    insert(sql)/update(sql): INSERT/UPDATE/DELETE; prints the error on failure
    Usage:
    mc = Mc()
    sql = "SELECT * FROM `biaotiku`"
    data = mc.query(sql)
    for i in data:
        print(i)
    sql = "INSERT INTO `biaotiku` (`id`, `text`, `beizhu`) VALUES (NULL, 'test', '123')"
    mc.insert(sql)
    '''

    def __init__(self, db_host="xxx.xxx.xxx.xxx", username="xxx", pw="xxx", dbname="spider"):
        self.db_host = db_host
        self.username = username
        self.pw = pw
        self.dbname = dbname
        self.db = pymysql.connect(host=self.db_host, user=self.username,
                                  password=self.pw, database=self.dbname)
        self.cursor = self.db.cursor()

    def query(self, sql):
        self.cursor.execute(sql)
        r = self.cursor.fetchall()
        if r:
            return list(r)
        else:
            return []

    def update(self, sql):
        try:
            self.cursor.execute(sql)
            self.db.commit()
        except:
            print(sys.exc_info())

    def insert(self, sql):
        try:
            self.cursor.execute(sql)
            self.db.commit()
        except:
            print(sys.exc_info())

    def __del__(self):
        self.db.close()

if __name__ == '__main__':
    r = Mc().query('select * from proxy_ip where id=3;')
    if r:
        print(r)
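Every script here builds SQL with str.format, which breaks as soon as a community name contains a quote and is unsafe in general. pymysql's cursor.execute accepts a separate args tuple and escapes it, so a parameterized variant of the two methods is straightforward; a minimal sketch:

    # drop-in replacements for Mc.query / Mc.insert using parameterized SQL
    def query(self, sql, args=None):
        self.cursor.execute(sql, args)
        return list(self.cursor.fetchall())

    def insert(self, sql, args=None):
        try:
            self.cursor.execute(sql, args)
            self.db.commit()
        except:
            print(sys.exc_info())

Usage: mc.query('select * from lianjia_ershoufang_xian where house_id=%s', (house_id,))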
Utility common:
import random
import time

import requests

from utils.model import Mc

class WebRequests:
    def __init__(self):
        self.ips = []
        sql = 'select ip from proxy_ip where is_delete=0;'
        rows = Mc().query(sql)
        for ip in rows:
            self.ips.append(ip[0])

    @property
    def user_agent(self):
        """
        Return a User-Agent at random.
        """
        from fake_useragent import UserAgent
        ua = UserAgent()
        return ua.random

    @property
    def header(self):
        """
        Basic headers.
        """
        return {'User-Agent': self.user_agent,
                'Accept': '*/*',
                'Connection': 'keep-alive',
                'Accept-Language': 'zh-CN,zh;q=0.8'}

    @property
    def proxy(self):
        return random.choice(self.ips)

    def get(self, url, header=None, retry_time=1, retry_interval=5, timeout=10, *args, **kwargs):
        """
        GET with retries.
        :param url: target url
        :param header: extra headers merged into the defaults
        :param retry_time: seconds to wait between retries
        :param retry_interval: maximum number of retries
        :param timeout: network timeout
        """
        headers = self.header
        if header and isinstance(header, dict):
            headers.update(header)
        proxies = {"http": "http://" + str(self.proxy)}
        i = 0
        while True:
            try:
                # r = requests.get(url, headers=headers, proxies=proxies, timeout=timeout)
                r = requests.get(url, headers=headers, timeout=timeout)
                return r
            except Exception as e:
                i += 1
                print('====request failed, retrying in {}s (attempt {})'.format(retry_time, i))
                time.sleep(retry_time)
                if i == retry_interval:
                    print('====request failed, please check: {}'.format(url))
                    raise  # give up after the last retry instead of looping forever
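The proxies line inside get() is commented out, so all requests go out directly; for Anjuke's stronger anti-scraping the pool can be re-enabled. A minimal sketch, assuming the proxy_ip table stores rows as "host:port":

# inside WebRequests.get() -- assumes rows in proxy_ip look like "1.2.3.4:8080";
# the same HTTP proxy is reused for https traffic here
proxies = {"http": "http://" + str(self.proxy),
           "https": "http://" + str(self.proxy)}
r = requests.get(url, headers=headers, proxies=proxies, timeout=timeout)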