# 房天下 上海 新房 信息采集 (Fang.com Shanghai new-home listing scraper)
# http://newhouse.sh.fang.com/house/s/澜庭
# http://newhouse.sh.fang.com/house/s/b91/?ctm=1.sh.xf_search.page.1
import requests
from bs4 import BeautifulSoup
import re
# import csv
import pandas as pd
import datetime
import time
# 房天下 上海 新房 信息采集
# http://newhouse.sh.fang.com/house/s/澜庭
# Project home-page URLs collected from the search-result pages.
pages = set()
# HTTP request headers sent with every request (an old IE user-agent string).
Hostreferer = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)'
}
# Regex used to pick project home-page links out of a search-result page.
# Written as a raw string so that `\.` is a regex escape rather than an
# invalid Python string escape.
# NOTE(review): `[^www]` is a character class (exactly one character that is
# not "w"), not "does not start with www". It is kept unchanged so the set of
# matching links stays the same.
results = re.compile(r"http://[^www][A-Za-z0-9]{9,50}\.fang\.com[^*]$")
def getHouseLink(page):
    """Collect project links from one search-result page.

    Downloads the search-result page whose URL is built from ``page``,
    finds every ``<a target="_blank">`` whose ``href`` matches the
    module-level ``results`` pattern, prints each link not seen before and
    adds it to the module-level ``pages`` set.

    Args:
        page: Page number inserted into the search URL after ``b9``.

    Returns:
        The shared ``pages`` set, including links from earlier calls.
    """
    url = "http://newhouse.sh.fang.com/house/s/b9{}/?ctm=1.sh.xf_search.page.1".format(page)
    response = requests.get(url, headers=Hostreferer)
    soup = BeautifulSoup(response.text, "html.parser")
    anchors = soup.find_all("a", {"target": "_blank", "href": results})
    for anchor in anchors:
        if 'href' not in anchor.attrs:
            continue
        href = anchor.attrs['href']
        if href in pages:
            continue
        print(href)
        pages.add(href)
    return pages
# "More details" page URLs of the listings whose detail page was scraped
# (filled in by getHouseInfoLink).
HousesInfoLinks = set()
def getHouseInfoLink(houselinks):
    """Visit each project home page and scrape its detail page.

    For every URL in ``houselinks`` the home page is downloaded and the
    navigation box (``div#orginalNaviBox``) is searched for the detail,
    floor-plan and review links. When all three are present,
    ``getHouseInfo`` is called with them; if it returns a non-None value
    the detail link is recorded in the module-level ``HousesInfoLinks`` set.

    Args:
        houselinks: Iterable of project home-page URLs.

    Returns:
        The shared ``HousesInfoLinks`` set.
    """
    for link in houselinks:
        response = requests.get(link, headers=Hostreferer)
        bsObj = BeautifulSoup(response.text, "html.parser")
        link_vas = bsObj.find("div", {"id": "orginalNaviBox", "class": "navleft tf"})
        # Pages without the navigation box used to crash on `link_vas.find`;
        # treat them like pages whose individual links are missing.
        if link_vas is None:
            print("Title could not be found")
            continue

        # Detail link
        info_link_temp = link_vas.find("a", {"id": "xfptxq_B03_08", "target": "_self"})
        # Floor-plan link
        houseTypeLink = link_vas.find("a", {"id": "xfptxq_B03_10", "target": "_self"})
        # Review link
        dianping_links = link_vas.find("a", {"id": "xfptxq_B03_17", "target": "_self"})
        if info_link_temp is None or houseTypeLink is None or dianping_links is None:
            print("Title could not be found")
            continue

        infolink = info_link_temp.get('href')
        typelink = houseTypeLink.get('href')
        dianping_link = dianping_links.get('href')
        print(infolink+typelink)
        # NOTE(review): this compares a detail link against the set of home-page
        # links (`pages`); de-duplicating against HousesInfoLinks may have been
        # intended -- confirm before changing.
        if infolink in pages:
            continue
        print(infolink)
        # The project home-page URL itself is passed as the "name link".
        temps = getHouseInfo(link, infolink, typelink, dianping_link)
        if temps is None:
            print("------------------")
        else:
            HousesInfoLinks.add(infolink)
    return HousesInfoLinks
# "More details" page scraping.
# dict = {}
# Parse a listing's detail page to collect its basic info, sales info,
# community planning and price info. Example detail page:
# http://yuhushangyuanxh021.fang.com/house/1210125570/housedetail.htm
# Containers that receive the scraped values.
# info_str = " "
# Column names used when the collected rows are turned into a DataFrame; the
# leading entries follow the order in which getHouseInfo appends its values.
titles = ['品名','户型','价格','用户点评', '品名链接','详情链接','户型链接','点评链接','看房团链接','物业类别', '项目特色', '建筑类别', '装修状况', '产权年限', '环线位置', '销售状态', '楼盘优惠', '开盘时间', '交房时间', '售楼地址', '咨询电话', '占地面积', '建筑面积', '容积率', '绿化率', '停车位', '楼栋总数', '总户数', '物业公司', '物业费']
# One list of values per scraped listing; getHouseInfo appends to it.
info_lists = list()
def _clean_cell(node):
    """Return ``str(node)`` with HTML tags, tabs and newlines removed."""
    return re.sub('[\t\n]', "", re.sub(r'<[^>]+>', "", str(node)))


def getHouseInfo(house_name_link,link,typelink,dianping_link):
    """Scrape one listing's detail page into a row of ``info_lists``.

    The row is built in this order: name, floor-plan summary (from
    ``huxingcaiji``), price, user rating, the four links passed in, the
    viewing-tour links, then one value per ``list-left``/``list-right`` pair
    found in the ``ul.list.clearfix`` and ``ul.clearfix.list`` sections.
    The number of trailing values therefore depends on the page.

    Args:
        house_name_link: Project home-page URL (stored as-is).
        link: Detail-page URL; this is the page that is downloaded.
        typelink: Floor-plan page URL, passed to ``huxingcaiji``.
        dianping_link: Review page URL (stored as-is).

    Returns:
        The module-level ``info_lists`` with the new row appended, or
        ``None`` when the name, price or rating element is missing from the
        page (nothing is appended in that case).
    """
    # Detail page download; the page is decoded as gb18030.
    response = requests.get(link, headers=Hostreferer)
    response.encoding = 'gb18030'
    bsObj = BeautifulSoup(response.text, "html.parser")

    # `find` returns None for a missing element and the rating text may not
    # contain the expected separator. Report failure through the return value
    # (the caller checks for None) instead of aborting the whole crawl.
    try:
        title = bsObj.find("a",{"class":"ts_linear","id":""}).get_text()
        price = bsObj.find("div",{"class":"main-info-price"}).em.get_text().strip()
        comment_text = bsObj.find("div",{"class":"main-info-comment"}).get_text()
        pingjias = comment_text.replace("\n", "").replace("\t", "").split(" ")[1].split("[")[0]
    except (AttributeError, IndexError):
        print("House info could not be parsed: " + str(link))
        return None

    # Floor-plan summary comes from a separate page.
    str_Type = huxingcaiji(typelink)

    info_list = [
        title,
        str_Type,
        price,
        pingjias,
        house_name_link,
        link,
        typelink,
        dianping_link,
    ]

    # Viewing-tour links, each wrapped as "|href|". `find_all` never returns
    # None, so the container lookup is the one that has to be checked.
    look_link = ""
    hot_box = bsObj.find("div",{"id":"sjina_C13_08","class":"contentHot"})
    if hot_box is None:
        print("look Link is None")
    else:
        for item in hot_box.find_all("a",{"class":"btn-sign"}):
            href = item.get('href')
            # Anchors without an href cannot be concatenated; skip them.
            if href is not None:
                look_link = look_link + "|" + href + "|"
    info_list.append(look_link)

    # Basic information: keep only the right-hand (value) cell of each pair.
    for section in bsObj.find_all("ul",{"class":"list clearfix"}):
        labels = section.find_all("div",{"class":"list-left"})
        cells = section.find_all("div",{"class":"list-right"})
        # zip with the labels so that only cells that have a label are kept.
        for _label, cell in zip(labels, cells):
            info_list.append(_clean_cell(cell).replace(" ", "").replace("普通住宅:", ""))

    # Community planning: same layout, empty values become "-".
    for section in bsObj.find_all("ul",{"class":"clearfix list"}):
        labels = section.find_all("div",{"class":"list-left"})
        cells = section.find_all("div",{"class":"list-right"})
        for _label, cell in zip(labels, cells):
            value = _clean_cell(cell).replace(" ", "").replace("\xa0", "")
            info_list.append(value or "-")

    info_lists.append(info_list)
    return info_lists
# print(info_2)
def huxingcaiji(typelink):
    """Summarise the floor plans listed on a floor-plan page.

    Downloads ``typelink`` (decoded as gb18030) and, inside every
    ``<p class="tiaojian">``, pairs each ``span.fl`` with the ``span.fr`` at
    the same position.

    Args:
        typelink: URL of the floor-plan page.

    Returns:
        A string made of ``"<fl text> <fr text>|"`` for every pair, in page
        order; an empty string when no pair is found.
    """
    response = requests.get(typelink, headers=Hostreferer)
    response.encoding = 'gb18030'
    soup = BeautifulSoup(response.text, "html.parser")

    pieces = []
    for paragraph in soup.find_all("p", {"class": "tiaojian"}):
        names = paragraph.find_all("span", {"class": "fl"})
        details = paragraph.find_all("span", {"class": "fr"})
        pieces.extend(
            name.get_text() + " " + detail.get_text() + "|"
            for name, detail in zip(names, details)
        )
    return "".join(pieces)
if __name__ == '__main__':
    # Remember when the run started so the elapsed time can be reported.
    start = datetime.datetime.now()
    print(start)
    fileName = "D:/Fang_house_info_type_url.csv"

    # Step 1: collect project links (range(1, 2) means page 1 only).
    for page_no in range(1, 2):
        getHouseLink(page_no)
    print(len(pages))

    # Step 2: scrape the detail page of every collected project.
    urlSet = getHouseInfoLink(pages)
    print(len(urlSet))

    # Step 3: dump the collected rows to a CSV file.
    print("写入cvs格式文件")
    print(info_lists)
    frame = pd.DataFrame(columns=titles, data=info_lists)
    frame.to_csv(fileName)

    # Report the finish time and the elapsed seconds.
    end = datetime.datetime.now()
    print(end)
    print("写入完成")
    elapsed = end - start
    useSeconds = elapsed.total_seconds()
    print(useSeconds)