"""
爬取贝壳找房的房源
"""
import json
import requests
from bs4 import BeautifulSoup
import pandas as pd
def get_data(keyword, proxy_ip='114.100.0.229:9999'):
    """
    Fetch second-hand housing listings from bj.ke.com for *keyword*.

    Scrapes the search-result page, then each listing's detail page.
    Side effects: caches the raw listing HTML to 'beike.txt' and the
    parsed results (JSON) to 'fangyuan.txt' in the current directory.

    :param keyword: district/keyword appended to the search URL, e.g. '朝阳'
    :param proxy_ip: "host:port" of the HTTP proxy to route requests through
                     (previously hard-coded; default preserves old behavior)
    :return: dict mapping listing title -> dict of its attributes
    """
    proxy = {"http": proxy_ip}
    url = 'https://bj.ke.com/ershoufang/rs{}'.format(keyword)
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36"}
    # Timeout added so a dead proxy can't hang the script forever.
    res = requests.get(url, headers=headers, proxies=proxy, timeout=10)
    # Cache the raw listing page for offline debugging.
    with open('beike.txt', 'w', encoding='utf-8') as f:
        f.write(res.text)
    # Explicit parser so results don't depend on which parsers are installed.
    soup = BeautifulSoup(res.text, 'html.parser')
    item_list = soup.find_all('div', class_='title')
    house_info_list = soup.find_all('div', class_='houseInfo')
    total_price_list = soup.find_all('div', class_='totalPrice')
    results = {}
    for i in range(len(house_info_list)):
        title = item_list[i].a.text.strip()
        results.setdefault(title, {})
        # houseInfo text looks like "floor | year | layout | area | direction | ..."
        house_info = [part.replace('\n', '').replace(' ', '')
                      for part in house_info_list[i].text.split('|')]
        results[title]['floor'] = house_info[0]
        results[title]['year'] = house_info[1]
        results[title]['jiegou'] = house_info[2]
        results[title]['area'] = house_info[3]
        results[title]['direction'] = house_info[4]
        price = total_price_list[i].text.strip().split('\n')[0].replace('\n', '').replace(' ', '')
        results[title]['price'] = price
        detail_url = item_list[i].a.attrs['href']
        # Send the same UA on the detail request; the original omitted headers
        # here, which the site is likely to reject.
        detail_info = requests.get(detail_url, headers=headers, timeout=10)
        detail_soup = BeautifulSoup(detail_info.text, 'html.parser')
        detail_list = detail_soup.find_all('div', class_='content')
        unit_price = detail_soup.find('div', class_='unitPrice').span.text
        # Key name 'unti_price' (sic) kept for backward compatibility with
        # any downstream consumer of the cached JSON.
        results[title]['unti_price'] = unit_price + '元/平米'
        results[title]['detail_url'] = detail_url
        for each in detail_list:
            try:
                # NOTE: only the first <li> of each content block is inspected,
                # matching the original logic.
                label = each.ul.li.span.text
                if label == '梯户比例':
                    results[title]['tiHuBi'] = each.ul.li.text
                if label == '建筑结构':
                    results[title]['jiegou'] = each.ul.li.text
                if label == '建筑类型':
                    results[title]['leixing'] = each.ul.li.text
            except AttributeError:
                # Block lacks the expected <ul><li><span> structure; skip it.
                pass
    # BUG FIX: the original called f.read(json.dumps(results)) on a handle
    # opened for writing — nothing was ever persisted (and it raises).
    with open('fangyuan.txt', 'w', encoding='utf-8') as f:
        f.write(json.dumps(results, ensure_ascii=False))
    return results
def filter_data():
    """
    Load the scraped results cached in 'fangyuan.txt' into a DataFrame.

    The frame has one column per listing title and one row per attribute
    (floor, year, area, price, ...), matching the dict-of-dicts layout
    written by get_data.

    :return: pandas.DataFrame of the cached listings
    :raises FileNotFoundError: if get_data has not been run yet
    """
    with open('fangyuan.txt', 'r', encoding='utf-8') as f:
        # json.load reads the handle directly; no need for loads(f.read()).
        results = json.load(f)
    # BUG FIX: the original discarded the DataFrame, printed a leftover
    # debug string ('yingkun'), and returned None. Return the frame so
    # callers can actually filter it.
    return pd.DataFrame(results)
if __name__ == '__main__':
    keyword = '朝阳'
    # BUG FIX: `keyword` was defined but get_data was never called, so
    # 'fangyuan.txt' never existed and filter_data() crashed with
    # FileNotFoundError on a fresh run. Scrape first, then load.
    get_data(keyword)
    filter_data()