话不多说直接上代码
这是将数据导入到MySQL数据库中的代码
import mysql.connector
class HandleMySQL(object):
    """Thin wrapper around a MySQL connection for storing scraped listings.

    Prerequisites (create these before running):
      * a database named ``crawl_beike`` (default; now parameterizable),
      * a table ``house_info`` with five columns matching the insert order:
        title, price, area, layout, district.
    """

    def __init__(self, host='localhost', port=3306, user='root',
                 password='123456', database='crawl_beike'):
        # Connection settings are keyword parameters (defaults preserve the
        # original hard-coded values) so the class works against any server
        # without editing the source. Adjust user/password to your machine.
        self.connector = mysql.connector.connect(
            host=host, port=port, user=user,
            password=password, database=database)
        self.cursor = self.connector.cursor()

    def insert_one(self, info):
        """Insert one listing and commit immediately.

        ``info`` must contain the keys '标题', '价格', '面积', '布局', '地区'
        (title, price, area, layout, district), in the table's column order.
        """
        sql = "INSERT INTO house_info VALUES (%s,%s,%s,%s,%s)"
        data = (info['标题'], info['价格'], info['面积'], info['布局'], info['地区'])
        # Parameterized execute: the connector escapes values, so scraped
        # text cannot inject SQL.
        self.cursor.execute(sql, data)
        self.connector.commit()

    def close_connector(self):
        """Release the cursor, then the underlying connection."""
        self.cursor.close()
        self.connector.close()
# 实例化
mysql = HandleMySQL()
以下是爬虫代码
import requests
import re
from lxml import etree
import multiprocessing
from concurrent.futures import ThreadPoolExecutor
# 导入上面写的存储到MySQL的模块(文件目录命名为crawl_beike,写的存储的py文件命名为handle_MySQL)
from crawl_beike.handle_MySQL import mysql
# 定义一个类
class CrawlBeike(object):
    """Crawler for Shenzhen rental listings on https://sz.zu.ke.com.

    Pipeline: city page -> district URLs -> sub-area URLs -> per-area page
    URLs (discovered with a thread pool, I/O bound) -> per-page scraping and
    MySQL inserts (a process pool of workers).
    """

    def __init__(self):
        # First request target: the city-wide listing page, used to discover
        # the district URLs (Luohu, Futian, Longgang, Nanshan, ...).
        self.first_url = 'https://sz.zu.ke.com/zufang/'
        # Sub-area URLs gathered from each district page (e.g. Nanshan ->
        # Keji Yuan, Gaoxin Qu, Xili, ...); each ends in 'pg' so that a page
        # number can simply be appended.
        self.area_url = []
        # Desktop User-Agent so the site serves the full page layout.
        self.header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3902.4 Safari/537.36'
        }
        # Fully built page URLs ('<area>pg<N>') that actually contain results.
        self.page_url = []

    def request(self, url):
        """GET *url* with the crawler's headers and return the body text."""
        response = requests.get(url=url, headers=self.header)
        return response.text

    def get_area_url(self):
        """Populate ``self.area_url`` with every sub-area listing URL."""
        response = self.request(self.first_url)
        html = etree.HTML(response)
        # District (level-2) links off the city page.
        district_url = []
        all_district = html.xpath('//ul[@data-target="area"]/li[@class="filter__item--level2 "]/a/@href')
        for district in all_district:
            district_url.append('https://sz.zu.ke.com' + district)
        # Level-3 (sub-area) links inside each district page.
        for url in district_url:
            response = self.request(url)
            html = etree.HTML(response)
            all_area = html.xpath('//li[@class="filter__item--level3 "]/a/@href')
            for area in all_area:
                # Trailing 'pg' lets get_page_url append page numbers directly.
                self.area_url.append('https://sz.zu.ke.com' + area + 'pg')

    def get_page_url(self, url):
        """Append every non-empty page URL under *url* to ``self.page_url``.

        Walks pg1, pg2, ... until the page's result counter reads '0'.
        """
        page = 1
        while True:
            p_url = url + str(page)
            response = self.request(p_url)
            html = etree.HTML(response)
            # BUGFIX: guard the xpath result — on a page without the counter
            # element, the original `[0]` raised IndexError and silently
            # killed the thread-pool worker.
            nums = html.xpath('//span[@class="content__title--hl"]/text()')
            if not nums or nums[0] == '0':
                break
            print(p_url)
            self.page_url.append(p_url)
            page += 1

    def get_info(self, url):
        """Scrape one listing page and store each house via ``mysql``."""
        response = self.request(url)
        html = etree.HTML(response)
        house_info = []
        all_div = html.xpath('//div[@class="content__list--item"]')
        # Captures "<area>㎡ ... N室N厅N卫" from the description text blob.
        pattern = r'([0-9-]*?㎡)[\d\D]*?(\d室\d厅\d卫)'
        for item in all_div:
            try:
                info = {}
                info['标题'] = str(item.xpath('.//p[@class="content__list--item--title twoline"]/a/text()')[0].strip())
                info['价格'] = str(item.xpath('.//em/text()')[0])
                detail = str(''.join(item.xpath('.//p[@class="content__list--item--des"]/text()')))
                # Run the regex once and reuse the match (original searched twice).
                match = re.search(pattern, detail)
                info['面积'] = str(match.group(1))
                info['布局'] = str(match.group(2))
                info['地区'] = str(html.xpath('//li[@class="filter__item--level3 strong"]/a/text()')[0])
                house_info.append(info)
                mysql.insert_one(info)
            except Exception as e:
                # Best-effort per listing: log and skip malformed items.
                print(e)
        print(house_info)

    def run(self):
        """Run the whole pipeline: discover pages, then scrape them."""
        self.get_area_url()
        # Threads for page discovery (network-bound, GIL released on I/O).
        executor = ThreadPoolExecutor()
        executor.map(self.get_page_url, self.area_url)
        executor.shutdown()
        # Processes for page scraping and DB inserts.
        pool = multiprocessing.Pool(20)
        for url in self.page_url:
            pool.apply_async(self.get_info, args=(url,))
        pool.close()
        pool.join()
        # BUGFIX: close the MySQL connection only AFTER all workers finish;
        # the original closed it before pool.close()/pool.join(), tearing the
        # connection down while inserts could still be in flight.
        mysql.close_connector()
if __name__ == '__main__':
    # Script entry point: build the crawler and launch the full pipeline.
    CrawlBeike().run()
这是爬取的数据
导入到数据库的效果图