BIG+碧家国际社区集中式公寓项目爬取_碧家国际社区公寓项目能做吗-CSDN博客

本文链接：https://blog.csdn.net/zengbowengood/article/details/100923251

BIG+碧家国际社区集中式公寓项目爬取

背景
代码
代码解读
结果截图
免责声明

背景

研究组对集中式公寓项目的数据需求源源不断，这次需要的是 BIG+ 碧家国际社区集中式公寓在全国各城市的项目名称、起始租金、剩余房源数以及详细地址。话不多说，代码见下。

代码

# -*- coding: utf-8 -*-
"""
project_name:big+
@author: 帅帅de三叔
Created on Tue Sep 17 09:21:15 2019
"""
import requests #导入网页请求模块
from bs4 import BeautifulSoup #导入网页解析模块
import re #导入正则模块

import pymysql #导入数据框接口模块
# Connect to the local MySQL database "big".
# Keyword arguments are required: PyMySQL 1.0+ no longer accepts positional
# arguments in connect(). "utf8mb4" is the canonical MySQL charset spelling.
# NOTE(review): credentials are hard-coded; consider reading them from config.
db = pymysql.connect(
    host="localhost",
    user="root",
    password="123456",
    database="big",
    charset="utf8mb4",
)
cursor = db.cursor()  # cursor shared by the table setup and by spider()
# Drop any table left from a previous run so each run rewrites the data.
cursor.execute("drop table if exists  big_")
c_sql="""create table big_(
        city varchar(6),
        project_name varchar(20),
        start_price float(10),
        remaining_room_num int(4),
        address varchar(40))Engine=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=UTF8MB4"""
cursor.execute(c_sql)  # create table big_

# Request headers sent with every page fetch (desktop Chrome User-Agent).
header = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/76.0.3809.132 Safari/537.36"
    ),
}


def spider(url, city):
    """Scrape one city's project listing page and store each project in big_.

    Args:
        url: Listing page URL to request.
        city: City name written into the ``city`` column of every row
            produced from this page.

    For every ``dl`` entry inside the ``div.list`` container, the project
    name, starting price, remaining room count and address are extracted,
    printed, and inserted through the module-level ``cursor``. Each insert is
    committed immediately, so rows written before a later failure are kept.
    If the page has no ``div.list`` container, a message is printed and
    nothing is inserted.
    """
    response = requests.get(url, headers=header, timeout=30)
    # "UTF8MB4" is a MySQL charset name, not a Python codec; use the real
    # codec name so the body is decoded explicitly as UTF-8.
    response.encoding = "utf-8"
    soup = BeautifulSoup(response.text, "lxml")

    container = soup.find("div", class_="list")
    if container is None:
        # No listing on this page: skip it instead of failing on None.
        print("未找到项目列表：", url)
        return

    # Loop invariants: the digit pattern (raw string avoids the invalid
    # escape warning) and the parameterized insert statement.
    digits = re.compile(r"\d+")
    insert_factor = (
        "insert into big_(city,project_name,start_price,remaining_room_num,address)"
        "values(%s,%s,%s,%s,%s)"
    )

    for project in container.find_all("dl"):
        # Name, remaining-room text and address all live in the same div.
        info = project.find("div", class_="detail-item3 f-ib")
        project_name = info.find("h2").get_text()
        price_box = project.find("div", class_="detail-item4 f-fr f-tac")
        start_price = price_box.find("p").find("b").get_text().replace("￥", "")
        # The first run of digits in the <s> text is the remaining room count.
        remaining_room_num = int(digits.search(info.find("s").get_text()).group(0))
        address = info.find("p").get_text()
        print(city, project_name, start_price, remaining_room_num, address)
        insert_data = [city, project_name, start_price, remaining_room_num, address]
        cursor.execute(insert_factor, insert_data)
        db.commit()

if __name__ == "__main__":
    # (URL abbreviation, city name stored in the city column) for each city.
    # NOTE(review): "zq" appears twice (for Zhaoqing and for Chongqing), so the
    # same URL is requested twice and its projects are stored under both city
    # names - confirm the correct abbreviations against the site.
    cities = [
        ("bj", "北京市"),
        ("tj", "天津市"),
        ("sh", "上海市"),
        ("hz", "杭州市"),
        ("xm", "厦门市"),
        ("wh", "武汉市"),
        ("cs", "长沙市"),
        ("gz", "广州市"),
        ("sz", "深圳市"),
        ("zq", "肇庆市"),
        ("dg", "东莞市"),
        ("zq", "重庆市"),
    ]
    for abbr, name in cities:
        # The abbreviation is used for both the path segment and the page name.
        page_url = "http://www.bgy-bigplus.com/{}/{}-all.html".format(abbr, abbr)
        print("起始网址：", page_url)
        spider(page_url, name)

代码解读

在这里插入图片描述
由于只需爬取项目的概要信息，不需要进入项目内部爬取具体房源，因此设计时只需请求各城市的起始网页，获取各项目的列表信息；而且各城市的项目数不多，没有翻页操作。最后把构造的城市中文名列表按顺序填入项目名称前面的城市字段。需要注意的是，原网站上肇庆市和重庆市的简称都是 zq，没有区分开，爬取时会默认进入重庆市的页面，导致肇庆市的项目漏爬（代码中标为"肇庆市"的那次请求实际抓到的是重庆市的项目），这部分需要人工核对并补上。写入数据库的截图如下。