# 1.导入所需的模块,采用Bs4解析方式 from bs4 import BeautifulSoup import requests import time import random import csv import pandas as pd
In [30]:
# 3. Request wrapper: send a GET with a desktop-browser User-Agent header
#    so the site does not reject the crawler outright.
def getHtml(url, timeout=10):
    """Download *url* and return the ``requests.Response`` object.

    Parameters
    ----------
    url : str
        Page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).
        Added because ``requests.get`` without a timeout can block
        forever and hang the whole crawl on one stalled connection.
    """
    # Syntax: requests.get(url, headers={key: value})
    res = requests.get(
        url,
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/103.0.5060.114 Safari/537.36 '
                          'Edg/103.0.1264.49'
        },
        timeout=timeout)
    return res
In [31]:
# 4. Persistence helpers: create the listings CSV with its header row,
#    append scraped rows, and mirror the CSV into an Excel workbook.
def createCsv():
    """Create (or truncate) the listings CSV and write the header row.

    Uses a ``with`` block so the handle is always closed; the original
    opened the file and never closed it (resource leak).
    """
    with open("一线城市二手房信息表.csv", 'w', encoding='utf-8', newline='') as file:
        csv_head = csv.writer(file)
        csv_head.writerow(['城市', '房产id', '房产名称', '房产信息', '居室类型', '小区位置',
                           '面积(平米)', '朝向', '装修', '楼层', '建造年份',
                           '单价(元/平)', '总价(万元)', '关注人数'])


def saveToCsv(HOUSE):
    """Append every row list in *HOUSE* to the listings CSV.

    HOUSE : list of row lists matching the header written by createCsv().
    """
    with open("一线城市二手房信息表.csv", 'a', encoding='utf-8', newline='') as f:
        csv.writer(f).writerows(HOUSE)


# saveToExcel: copy the accumulated CSV data into an .xlsx workbook.
def saveToExcel():
    """Re-read the listings CSV with pandas and write it out as Excel."""
    file = pd.read_csv("一线城市二手房信息表.csv")
    file.to_excel("一线城市二手房信息表.xlsx")
In [32]:
# 5. Page scraping: walk every result page of every city, parse each
#    listing card, and persist the accumulated rows.

def _text_of(each, cls):
    """Return the text of the first ``<div class=cls>`` inside *each*,
    or '' when that node is missing (listing cards are not always complete)."""
    try:
        return each.find('div', class_=cls).text
    except AttributeError:
        return ''


def _house_part(each, idx):
    """Return field *idx* of the '|'-separated houseInfo line, stripped.

    Returns '' when the houseInfo node is missing OR when it has fewer
    than idx+1 fields — the original only caught AttributeError, so a
    short houseInfo line crashed the whole crawl with IndexError.
    """
    parts = _text_of(each, 'houseInfo').split('|')
    return parts[idx].strip() if idx < len(parts) else ''


def _parse_listing(city, each):
    """Extract one CSV row (14 columns, matching createCsv's header)
    from a single listing card *each*; missing fields become ''."""
    # Listing id: last path segment of the detail-page link, minus '.html'.
    try:
        ID = each.find('a').attrs['href'].split('/')[-1].replace('.html', '')
    except AttributeError:
        ID = ''
    name = _text_of(each, 'title')
    info = _text_of(each, 'houseInfo')
    room = _house_part(each, 0)
    # Estate name: text before the first '-' of the positionInfo line.
    position = _text_of(each, 'positionInfo').split('-')[0].strip()
    area = _house_part(each, 1).replace('平米', '')
    direction = _house_part(each, 2)
    fitment = _house_part(each, 3)
    floor = _house_part(each, 4)
    year = _house_part(each, 5)
    unitprice = _text_of(each, 'unitPrice').replace(',', '').replace('元/平', '')
    totalprice = _text_of(each, 'totalPrice totalPrice2').replace('万', '').strip()
    follow = _text_of(each, 'followInfo').split('/')[0].replace('人关注', '').strip()
    return [city, ID, name, info, room, position, area, direction,
            fitment, floor, year, unitprice, totalprice, follow]


def getOnePage(list1, list_city):
    """Scrape one Lianjia second-hand-listings page per index per city.

    Parameters
    ----------
    list1 : iterable of int
        0-based page indices; page i maps to URL suffix ``pg{i+1}``.
    list_city : iterable of str
        Lianjia subdomain codes, e.g. 'sh', 'bj', 'gz', 'sz'.

    All parsed rows are appended to the CSV via saveToCsv() and then
    mirrored to Excel via saveToExcel().
    """
    HOUSE = []
    for city in list_city:
        for page in list1:
            # Random sub-second pause between requests to stay polite
            # and reduce the chance of being rate-limited.
            time.sleep(random.random())
            url = f'https://{city}.lianjia.com/ershoufang/pg{page + 1}/'
            res = getHtml(url)
            soup = BeautifulSoup(res.text, 'html.parser')
            parent = soup.find('ul', class_='sellListContent')
            if parent is None:
                # Anti-bot/captcha page or empty result list: skip this
                # page instead of crashing with AttributeError.
                continue
            for each in parent.find_all('div', class_='info clear'):
                HOUSE.append(_parse_listing(city, each))
    saveToCsv(HOUSE)
    saveToExcel()
In [33]:
# Page indices 0..99 to crawl per city, and the four first-tier
# city subdomain codes (Shanghai, Beijing, Guangzhou, Shenzhen).
list1 = [page for page in range(100)]
list_city = ['sh', 'bj', 'gz', 'sz']
In [34]:
# 6. Entry point: crawl the data (with anti-bot pauses), store it in the
#    CSV, and report the elapsed time.
# perf_counter is a monotonic clock, so the measured duration cannot be
# skewed by wall-clock adjustments the way time.time() can.
start = time.perf_counter()
createCsv()
getOnePage(list1, list_city)
print("抓取结束")
end = time.perf_counter()
print('抓取时长:%s Seconds' % (end - start))