# 1.导入所需的模块,采用Bs4解析方式 from bs4 import BeautifulSoup import requests import time import random import csv import pandas as pd
In [30]:
# 3. Request wrapper: send a GET with a desktop-browser User-Agent header
#    so the site does not reject the crawler outright.
def getHtml(url, timeout=10):
    """Download *url* and return the ``requests.Response`` object.

    Parameters
    ----------
    url : str
        Page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).
        Added because ``requests.get`` without a timeout can block
        forever and hang the whole crawl on one stalled connection.
    """
    # Syntax: requests.get(url, headers={key: value})
    res = requests.get(
        url,
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/103.0.5060.114 Safari/537.36 '
                          'Edg/103.0.1264.49'
        },
        timeout=timeout)
    return res
In [31]:
# 4. Persistence helpers: create the listings CSV with its header row,
#    append scraped rows, and mirror the CSV into an Excel workbook.
def createCsv():
    """Create (or truncate) the listings CSV and write the header row.

    Uses a ``with`` block so the handle is always closed; the original
    opened the file and never closed it (resource leak).
    """
    with open("一线城市二手房信息表.csv", 'w', encoding='utf-8', newline='') as file:
        csv_head = csv.writer(file)
        csv_head.writerow(['城市', '房产id', '房产名称', '房产信息', '居室类型', '小区位置',
                           '面积(平米)', '朝向', '装修', '楼层', '建造年份',
                           '单价(元/平)', '总价(万元)', '关注人数'])


def saveToCsv(HOUSE):
    """Append every row list in *HOUSE* to the listings CSV.

    HOUSE : list of row lists matching the header written by createCsv().
    """
    with open("一线城市二手房信息表.csv", 'a', encoding='utf-8', newline='') as f:
        csv.writer(f).writerows(HOUSE)


# saveToExcel: copy the accumulated CSV data into an .xlsx workbook.
def saveToExcel():
    """Re-read the listings CSV with pandas and write it out as Excel."""
    file = pd.read_csv("一线城市二手房信息表.csv")
    file.to_excel("一线城市二手房信息表.xlsx")
In [32]:
# 5. Page scraping: walk every result page of every city, parse each
#    listing card, and persist the accumulated rows.

def _text_of(each, cls):
    """Return the text of the first ``<div class=cls>`` inside *each*,
    or '' when that node is missing (listing cards are not always complete)."""
    try:
        return each.find('div', class_=cls).text
    except AttributeError:
        return ''


def _house_part(each, idx):
    """Return field *idx* of the '|'-separated houseInfo line, stripped.

    Returns '' when the houseInfo node is missing OR when it has fewer
    than idx+1 fields — the original only caught AttributeError, so a
    short houseInfo line crashed the whole crawl with IndexError.
    """
    parts = _text_of(each, 'houseInfo').split('|')
    return parts[idx].strip() if idx < len(parts) else ''


def _parse_listing(city, each):
    """Extract one CSV row (14 columns, matching createCsv's header)
    from a single listing card *each*; missing fields become ''."""
    # Listing id: last path segment of the detail-page link, minus '.html'.
    try:
        ID = each.find('a').attrs['href'].split('/')[-1].replace('.html', '')
    except AttributeError:
        ID = ''
    name = _text_of(each, 'title')
    info = _text_of(each, 'houseInfo')
    room = _house_part(each, 0)
    # Estate name: text before the first '-' of the positionInfo line.
    position = _text_of(each, 'positionInfo').split('-')[0].strip()
    area = _house_part(each, 1).replace('平米', '')
    direction = _house_part(each, 2)
    fitment = _house_part(each, 3)
    floor = _house_part(each, 4)
    year = _house_part(each, 5)
    unitprice = _text_of(each, 'unitPrice').replace(',', '').replace('元/平', '')
    totalprice = _text_of(each, 'totalPrice totalPrice2').replace('万', '').strip()
    follow = _text_of(each, 'followInfo').split('/')[0].replace('人关注', '').strip()
    return [city, ID, name, info, room, position, area, direction,
            fitment, floor, year, unitprice, totalprice, follow]


def getOnePage(list1, list_city):
    """Scrape one Lianjia second-hand-listings page per index per city.

    Parameters
    ----------
    list1 : iterable of int
        0-based page indices; page i maps to URL suffix ``pg{i+1}``.
    list_city : iterable of str
        Lianjia subdomain codes, e.g. 'sh', 'bj', 'gz', 'sz'.

    All parsed rows are appended to the CSV via saveToCsv() and then
    mirrored to Excel via saveToExcel().
    """
    HOUSE = []
    for city in list_city:
        for page in list1:
            # Random sub-second pause between requests to stay polite
            # and reduce the chance of being rate-limited.
            time.sleep(random.random())
            url = f'https://{city}.lianjia.com/ershoufang/pg{page + 1}/'
            res = getHtml(url)
            soup = BeautifulSoup(res.text, 'html.parser')
            parent = soup.find('ul', class_='sellListContent')
            if parent is None:
                # Anti-bot/captcha page or empty result list: skip this
                # page instead of crashing with AttributeError.
                continue
            for each in parent.find_all('div', class_='info clear'):
                HOUSE.append(_parse_listing(city, each))
    saveToCsv(HOUSE)
    saveToExcel()
In [33]:
# Page indices 0..99 to crawl per city, and the four first-tier
# city subdomain codes (Shanghai, Beijing, Guangzhou, Shenzhen).
list1 = [page for page in range(100)]
list_city = ['sh', 'bj', 'gz', 'sz']
In [34]:
# 6. Entry point: crawl the data (with anti-bot pauses), store it in the
#    CSV, and report the elapsed time.
# perf_counter is a monotonic clock, so the measured duration cannot be
# skewed by wall-clock adjustments the way time.time() can.
start = time.perf_counter()
createCsv()
getOnePage(list1, list_city)
print("抓取结束")
end = time.perf_counter()
print('抓取时长:%s Seconds' % (end - start))