本案例对旅游景点的热度、点评数量、排行等进行了爬取;后期数据处理部分还有提高空间,请读者自行编写。
"""
Created on Wed Apr 3 17:48:21 2019
@author: iHJX_Alienware
"""
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Progress message printed after the scraping/data libraries have been imported.
print('导入模块')
# NOTE(review): duplicates the matplotlib import a few lines earlier; harmless.
import matplotlib.pyplot as plt
# NOTE(review): seaborn is not referenced in the visible part of this script.
import seaborn as sns
# Use SimHei as the sans-serif font so Chinese labels render in plots
# (assumes the SimHei font is installed on the machine).
plt.rcParams['font.sans-serif'] = ['SimHei']
# Draw minus signs with the ASCII hyphen instead of the Unicode minus,
# a glyph that fonts such as SimHei may not provide.
plt.rcParams['axes.unicode_minus'] = False
def get_urls(n, base_url='https://travel.qunar.com/p-cs300153-rizhao-jingdian-1-'):
    """Build the URLs of the first *n* attraction-list pages.

    Args:
        n: Number of pages to generate; pages are numbered from 1.
        base_url: URL prefix to which the page number is appended. The
            default is the prefix this script was written for.

    Returns:
        A list of ``n`` URL strings (empty when ``n`` is 0 or negative).
    """
    return [base_url + str(page) for page in range(1, n + 1)]
def get_informations(u):
    """Download one attraction-list page and extract its entries.

    Args:
        u: URL of the list page to fetch.

    Returns:
        A list of dicts, one per ``<li>`` of the ``list_item clrfix`` list,
        with the keys ``lat``, ``lng``, ``景点名称``, ``攻略提到数量``,
        ``点评数量``, ``景点排名`` and ``星级``. Every value is the raw
        string taken from the page. An empty list is returned when the page
        does not contain that list.

    Raises:
        requests.RequestException: On network failure, timeout, or an HTTP
            error status.
    """
    # A timeout keeps the crawl from hanging forever on a stalled connection.
    response = requests.get(u, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')

    listing = soup.find('ul', class_="list_item clrfix")
    if listing is None:
        # The page has no attraction list, so there is nothing to extract.
        return []

    records = []
    for item in listing.find_all('li'):
        # The star rating is taken from the inline style of the inner
        # <span>: the text after the first ':' is kept.
        star_style = item.find('span', class_="total_star").find('span')['style']
        records.append({
            'lat': item['data-lat'],
            'lng': item['data-lng'],
            '景点名称': item.find('span', class_="cn_tit").text,
            '攻略提到数量': item.find('div', class_="strategy_sum").text,
            '点评数量': item.find('div', class_="comment_sum").text,
            '景点排名': item.find('span', class_="ranking_sum").text,
            '星级': star_style.split(':')[1],
        })
    return records
def normalization(dfi, col):
    """Add a min-max normalized copy of a column to the frame, in place.

    The result is stored in a new column named ``col + "_nor"``.

    Args:
        dfi: DataFrame to modify.
        col: Name of the numeric column to normalize.

    Returns:
        None. ``dfi`` is mutated.
    """
    col_min = dfi[col].min()
    span = dfi[col].max() - col_min
    if span == 0:
        # Every value is identical: avoid 0/0 (NaN) and map them all to 0.
        dfi[col + "_nor"] = 0.0
    else:
        dfi[col + "_nor"] = (dfi[col] - col_min) / span
if __name__ == '__main__':
    # Crawl the first 20 list pages and pool every attraction record.
    # Collecting plain dicts and building the frame once avoids the
    # quadratic cost of calling pd.concat inside the loop.
    records = []
    for u in get_urls(20):
        records.extend(get_informations(u))
    df = pd.DataFrame(records)
    if df.empty:
        raise SystemExit('No attractions were scraped; nothing to export.')

    # Use the builtin float/int: the np.float / np.int aliases were removed
    # in NumPy 1.24.
    df['lng'] = df['lng'].astype(float)
    df['lat'] = df['lat'].astype(float)
    df['点评数量'] = df['点评数量'].astype(int)
    df['攻略提到数量'] = df['攻略提到数量'].astype(int)
    df['星级'] = df['星级'].str.replace('%', '', regex=False).astype(float)

    # Keep only the text after '第'; entries without a ranking become 0.
    # The result is assigned back rather than filled in place, because
    # in-place fillna on a selected column is unreliable under pandas
    # Copy-on-Write.
    df['景点排名'] = df['景点排名'].str.split('第').str[1].fillna(0)

    normalization(df, '点评数量')
    df.to_excel("./result.xlsx")
效果图。如果有什么地方看不懂,可以留言给我。