一.知识准备
我们需要学习以下 Python 库:用于爬虫获取数据的 requests 库、用于读取 csv 文件的 pandas 库、用于解析并截取网页内容的 BeautifulSoup 库、用于将数据保存到 csv 文件的 csv 库、科学计算库 numpy,以及数据可视化库 matplotlib。关于各个库的具体用法,可以在下面评论留言。
import requests
import pandas as pd
from bs4 import BeautifulSoup
import csv
import numpy as np
from matplotlib.pyplot import MultipleLocator
import matplotlib.pyplot as plt
二.代码更新(加入更多数据集,爬取多个网站数据,进行可视化显示)
"""""
作者:cpz
目的:盐城房价分析
版本:2.0
时间:30/07/2019
"""""
import requests
import pandas as pd
from bs4 import BeautifulSoup
import csv
import numpy as np
from matplotlib.pyplot import MultipleLocator
import matplotlib.pyplot as plt
# Configure matplotlib for Chinese text: SimHei is used as the sans-serif font
# so the Chinese title and tick labels can be drawn, and the Unicode minus
# sign is disabled so negative numbers use a plain ASCII hyphen instead.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})
def get_building_price(create_utl_list):
    """Scrape building names and listed prices from each listing page.

    Every URL is fetched and parsed; within the ``div.lp-list`` container,
    each ``li.list-item.clearfix`` entry contributes the first line of its
    ``a.tit`` text as the name and the text of ``div.other.fr > em.arial``
    as the price.

    Args:
        create_utl_list: Iterable of listing-page URLs to fetch.

    Returns:
        A ``(build_name_list, build_price_list)`` tuple of two parallel
        lists of strings. Prices are returned exactly as they appear on the
        page (they are not converted to numbers here).

    Raises:
        requests.RequestException: If a page cannot be fetched or the server
            answers with an HTTP error status.
    """
    build_name_list = []
    build_price_list = []
    for url in create_utl_list:
        print(url)
        response = requests.get(url, timeout=30)
        # Fail loudly on 4xx/5xx instead of parsing an error page as listings.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'lxml')

        container = soup.find('div', {'class': 'lp-list'})
        if container is None:
            # Page has no listing container (e.g. past the last page): skip it
            # rather than raising IndexError.
            print('no listing container found, skipping: ' + url)
            continue

        for item in container.find_all('li', {'class': 'list-item clearfix'}):
            name_tag = item.find('a', {'class': 'tit'})
            price_box = item.find('div', {'class': 'other fr'})
            price_tag = None
            if price_box is not None:
                price_tag = price_box.find('em', {'class': 'arial'})
            # Skip malformed entries so the two lists always stay aligned.
            if name_tag is None or price_tag is None:
                continue
            # Only the first line of the link text is the building name.
            build_name_list.append(name_tag.text.split('\n')[0])
            build_price_list.append(price_tag.text)
    return build_name_list, build_price_list
def get_clean_data(build_name_list, build_price_list):
    """Filter out unpriced buildings and save the rest to a CSV file.

    The names and prices are paired up, the pairs are printed, and every
    pair whose price is an integer is written to
    ``yancheng_build_price.csv`` (UTF-8, header ``BuildName,Price``) in the
    current working directory. Entries priced '待定' ("to be determined") or
    with any other non-integer price text are skipped.

    Args:
        build_name_list: Building names.
        build_price_list: Price strings, parallel to ``build_name_list``.
            If the lists differ in length, the extra items are ignored.

    Returns:
        A ``(buildName, buildPrice)`` tuple: the kept names and their prices
        converted to ``int``, in the original order.
    """
    build_price__name_int_list = list(zip(build_name_list, build_price_list))
    print(build_price__name_int_list)

    header = ['BuildName', 'Price']
    buildName = []
    buildPrice = []
    with open('yancheng_build_price.csv', 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        for build_name, raw_price in build_price__name_int_list:
            if raw_price == '待定':
                continue
            try:
                build_price = int(raw_price)
            except (TypeError, ValueError):
                # Any other non-numeric price text cannot be plotted; skip it
                # instead of aborting the whole run.
                continue
            buildName.append(build_name)
            buildPrice.append(build_price)
            writer.writerow([build_name, build_price])
    return buildName, buildPrice
def Create_url(base_url='http://danke00.com/xinfang/1261', page_count=5):
    """Build the list of listing-page URLs to scrape.

    The first page is ``base_url`` itself; page ``k`` (for ``k >= 2``) is
    ``base_url`` followed by ``-no-no-no-no-no-no-no-no-no/k``.

    Args:
        base_url: URL of the first listing page.
        page_count: Total number of pages to generate, including the first.

    Returns:
        A list of ``page_count`` URL strings in page order; empty when
        ``page_count`` is less than 1.
    """
    if page_count < 1:
        return []
    paged_prefix = base_url + '-no-no-no-no-no-no-no-no-no/'
    url_list = [base_url]
    url_list.extend(paged_prefix + str(page) for page in range(2, page_count + 1))
    return url_list
def main():
    """Scrape the listings, save the priced ones to CSV and plot them.

    Builds the page URLs, scrapes names and prices, filters and saves them
    via ``get_clean_data``, then reloads the CSV with pandas and draws a red
    line chart of price per building with each point labelled by its price.
    Returns early without plotting when no priced listing was found.
    """
    cerate_url = Create_url()
    build_name_list, build_price_list = get_building_price(cerate_url)
    buildName, buildPrice = get_clean_data(build_name_list, build_price_list)
    if not buildName:
        # An empty frame has nothing numeric to plot and would make
        # DataFrame.plot raise.
        print('No priced listings were scraped; nothing to plot.')
        return

    build_name_price_list = pd.read_csv('yancheng_build_price.csv')
    # info() prints its summary itself and returns None, so it is not
    # wrapped in print().
    build_name_price_list.info()
    build_name_price_list.plot(
        kind='line',
        x='BuildName',
        y='Price',
        title='盐城盐都区各在售小区房价曲线图',
        figsize=(60, 5),
        color='red',
    )

    # One tick per building, labelled with the building name.
    tickpoints = np.arange(len(buildName))
    plt.xticks(tickpoints, buildName)
    # Annotate every data point with its price just above the line.
    for position, price in zip(tickpoints, buildPrice):
        plt.text(position, price, price, ha='center', va='bottom', fontsize=10)
    plt.show()
# Run the scrape-and-plot pipeline only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()
四.总结
不多说了,该换电脑了:才爬了五六个网站,电脑就卡爆了,更不用说建立训练集了。