海量房源筛选

最新推荐文章于 2024-05-30 08:16:36 发布

木木的学习之路

最新推荐文章于 2024-05-30 08:16:36 发布

阅读量425

点赞数

分类专栏： C++ python

本文链接：https://blog.csdn.net/weixin_39437164/article/details/83416083

版权

C++ 同时被 2 个专栏收录

16 篇文章 0 订阅

订阅专栏

python

11 篇文章 0 订阅

订阅专栏

抓取原始数据

数据源：链家
地址：西安
语言：python
准备工作：申请百度地图开发者密钥（ak），用于调用百度 API 查询位置
代码如下，拷贝自别处，稍作修改：

from bs4 import BeautifulSoup
import requests
import csv
import re
def getlocation(name):
    """Look up the coordinates of *name* through the Baidu geocoding web API.

    Args:
        name: Address or place name to geocode.

    Returns:
        The string ``"lng,lat"`` built from the text of the ``lng`` and
        ``lat`` elements of the response, or ``None`` when either element
        is missing from the response.
    """
    bdurl='http://api.map.baidu.com/geocoder/v2/'
    params={
        # Passed through ``params`` so requests percent-encodes the address
        # (names may contain spaces, '&', '#', ...).
        'address':name,
        # The response is parsed by tag name below, so ask for XML explicitly.
        # The original concatenation produced ``output=tjson`` (stray 't')
        # plus a JSONP callback, neither of which matches that parsing.
        # NOTE(review): confirm 'xml' against the Baidu geocoder documentation.
        'output':'xml',
        'ak':'15528112125',  # put the key you applied for here
    }
    # A timeout keeps one stalled request from blocking the whole run.
    res=requests.get(bdurl,params=params,timeout=10)
    # Name the parser so the result does not depend on which parsers are installed.
    s=BeautifulSoup(res.text,'html.parser')
    lng=s.find('lng')
    lat=s.find('lat')
    if lng is None or lat is None:
        return None
    return lng.get_text()+','+lat.get_text()

# Scrape second-hand listings for Xi'an from Lianjia, then write one CSV row
# per listing (name, coordinates, layout, area, price, follower count).
url='https://xa.lianjia.com/ershoufang/pg'
# Request headers that imitate a desktop browser.
heade={'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
# NOTE(review): numbering starts at 0 -- confirm that 'pg0' is not a duplicate of 'pg1'.
page=list(range(0,500,1))
p=[]   # price text of every listing
hi=[]  # raw house-info text of every listing
fi=[]  # raw follow-info text of every listing
for i in page:  # visit the result pages one by one
    # Actually send the browser-like headers (they used to be built but never
    # passed on) and bound the wait so a stalled request cannot hang the run.
    response=requests.get(url+str(i),headers=heade,timeout=10)
    # Name the parser so the result does not depend on which parsers are installed.
    soup=BeautifulSoup(response.text,'html.parser')
    prices=soup.find_all('div',class_='priceInfo')
    hs=soup.find_all('div',class_='houseInfo')
    followInfo=soup.find_all('div',class_='followInfo')
    # Append in lockstep so index k of p, hi and fi always refers to the same
    # listing, even when a page yields different counts for the three selectors.
    for price,h,f in zip(prices,hs,followInfo):
        p.append(price.span.string)
        hi.append(h.get_text())
        fi.append(f.get_text())
    print(i)


def _first_match(pattern,text):
    """Return the first match of *pattern* in *text*, or '' when nothing matches."""
    found=re.findall(pattern,text)
    return found[0] if found else ''


n=0
num=len(p)
headers = ['name', 'loc', 'style', 'size', 'price', 'foc']
# The file is created in the current directory; ``with`` closes it even if a
# row fails half-way through.
with open('data_LianJia.csv', 'w', newline='') as file:
    writers = csv.DictWriter(file, headers)
    writers.writeheader()
    for n,(info,price,follow) in enumerate(zip(hi,p,fi),start=1):
        name=info.split('|')[0]
        house = {
            'name': name,
            'loc': getlocation(name),  # None is written as an empty cell
            # Layout, area and follower count are pulled out with regular
            # expressions; a missing match becomes an empty cell.
            'style': _first_match(r'\s\d.\d.\s', info),
            'size': _first_match(r'\s\d+\.?\d+', info),
            'price': price,
            'foc': _first_match(r'^\d+', follow),
        }
        writers.writerow(house)
        print(n)

分析数据

从链家得到的是所有待售房源的信息，而用户感兴趣的只是其中一部分，因此需要适当添加筛选条件，让数据更有效。初步读到的数据不含均价，因此在新建文件中将该数据计算后添加。

语言：C++
筛选条件：面积、单价、总价

// Split one CSV record into its fields. Commas inside a double-quoted field
// are kept as data, and a doubled quote ("") inside a quoted field yields a
// single quote character. Carriage returns outside quotes are dropped so that
// CRLF line endings do not leak into the last field.
static std::vector<std::string> splitCsvLine(const std::string& line)
{
	std::vector<std::string> fields;
	std::string field;
	bool quoted = false;

	for (std::string::size_type i = 0; i < line.size(); ++i)
	{
		const char c = line[i];
		if (quoted)
		{
			if (c != '"')
			{
				field += c;
			}
			else if (i + 1 < line.size() && line[i + 1] == '"')
			{
				field += '"';
				++i;
			}
			else
			{
				quoted = false;
			}
		}
		else if (c == '"')
		{
			quoted = true;
		}
		else if (c == ',')
		{
			fields.push_back(field);
			field.clear();
		}
		else if (c != '\r')
		{
			field += c;
		}
	}
	fields.push_back(field);
	return fields;
}

// Quote a value for CSV output when it contains a comma, a quote or a newline.
static std::string csvField(const std::string& value)
{
	if (value.find_first_of(",\"\n") == std::string::npos)
		return value;

	std::string quoted = "\"";
	for (std::string::size_type i = 0; i < value.size(); ++i)
	{
		if (value[i] == '"')
			quoted += '"';
		quoted += value[i];
	}
	quoted += '"';
	return quoted;
}

// Parse a decimal number, skipping leading whitespace. Returns false instead
// of throwing when the text does not start with a number.
static bool parseNumber(const std::string& text, double& value)
{
	std::istringstream in(text);
	in >> value;
	return !in.fail();
}

// Read the scraped CSV, compute the mean price (total price / area) of every
// row, and copy the rows that pass the filter to a new CSV file.
// Input columns used: [0..4]; column 3 is parsed as the area and column 4 as
// the total price. Output columns: the first five input columns followed by
// "meanPrice"; any further input columns are not copied.
int main()
{
	const std::string inpath = "D:\\Cat_Dog_CNN\\data_LianJia.csv";
	const std::string outpath = "D:\\Cat_Dog_CNN\\data_LianJia_out.csv";
	int status = 0;

	std::ifstream fin(inpath);   // input stream
	std::ofstream file(outpath); // output stream

	if (!fin || !file)
	{
		std::cerr << "cannot open " << (!fin ? inpath : outpath) << std::endl;
		status = 1;
	}
	else
	{
		std::string line;
		int lines = 0;

		while (std::getline(fin, line)) // one record per line, until end of file
		{
			lines++;

			const std::vector<std::string> fields = splitCsvLine(line);
			if (fields.size() < 5)
				continue; // malformed record: the columns used below are missing

			if (lines == 1)
			{
				// Header row: keep the first five titles and add the new column.
				file << csvField(fields[0]) << "," << csvField(fields[1]) << ","
					<< csvField(fields[2]) << "," << csvField(fields[3]) << ","
					<< csvField(fields[4]) << "," << "meanPrice" << std::endl;
				continue;
			}

			// Parse as floating point so fractional areas and prices are not
			// truncated; skip rows whose numbers are missing or whose area is
			// not positive (it is the divisor below).
			double size = 0.0;
			double total = 0.0;
			if (!parseNumber(fields[3], size) || !parseNumber(fields[4], total) || size <= 0.0)
				continue;

			const double meanprice = total / size;

			// Filter: mean price, area and total price must all be in range.
			if (meanprice < 1.3 && size > 80 && total < 130)
			{
				std::cout << fields[0] << "...";
				std::cout << fields[1] << "...";
				std::cout << fields[2] << "...";
				std::cout << fields[3] << "...";
				std::cout << fields[4] << std::endl;

				// Same six columns as the header, without a trailing comma.
				file << csvField(fields[0]) << "," << csvField(fields[1]) << ","
					<< csvField(fields[2]) << "," << csvField(fields[3]) << ","
					<< csvField(fields[4]) << "," << std::to_string(meanprice) << std::endl;
			}
		}
	}

	fin.close();
	file.close();
	system("pause");
	return status;
}

OK，根据实际情况调整筛选条件，即可得到最有效的数据。

木木的学习之路

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
2
评论
海量房源筛选

抓取原始数据数据源：链家地址：西安语言：python准备工作：申请开发者代码如下，拷贝自别处，稍作修改：from bs4 import BeautifulSoupimport requestsimport csvimport redef getlocation(name):#调用百度API查询位置 bdurl='http://api.map.baidu.com/geoc...
复制链接

扫一扫

专栏目录