前言:
本次数据集使用的是一次比赛的用户购买预测数据集,其中包括了用户的城市信息,其中要求对用户城市分布做可视化分析,所以在此要爬取各个城市的信息,使用的是百度的API来爬取(高德的API获取的字段信息没有百度的多,Google的API Key申请有点麻烦,所以使用百度API)
正文:
首先导入数据集:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# All competition files live in one directory; build each path from a shared prefix.
_DATA_DIR = 'C:/Users/admin/Desktop/datafc/sampleB/data/'
train1 = pd.read_csv(_DATA_DIR + 'user_info.csv')   # per-user attributes, incl. city_num
train3 = pd.read_csv(_DATA_DIR + 'visit_info.csv')  # visit records
train2 = pd.read_csv(_DATA_DIR + 'login_day.csv')   # login-day records
train4 = pd.read_csv(_DATA_DIR + 'result.csv')      # labels / results
对数据集进行简单的处理,如果想浏览更详细的处理内容,可以看我的另一篇文章:
# Clean the user table, then extract one city entry per remaining user.
train1['city_num'] = train1['city_num'].replace('error', '')  # 'error' marks bad city values
# Forward-fill missing values from the previous row.
# NOTE(review): the '' written above is an empty string, not NaN, so ffill
# does NOT touch those cells — confirm that is intended.
train1 = train1.fillna(method="ffill", axis=0)
# Drop the surplus rows at the tail of the table (labels 135617..135967).
train1_d = train1.drop(labels=range(135617, 135967), axis=0)
train1_d = train1_d.drop(labels=135967, axis=0)
train1_d = train1_d.fillna(method="ffill", axis=0)
# tolist() replaces the original row-by-row positional-append loop
# (same order, same contents, one C-level pass).
address = train1_d['city_num'].tolist()
定义处理重复城市名的方法:
def del_repeatnum(addre):
    """Return the elements of *addre* with duplicates removed, first-seen order kept.

    Args:
        addre: iterable of hashable city names.

    Returns:
        list: unique elements in order of first appearance.
    """
    seen = set()   # O(1) membership test; the original scanned a list (O(n) per item)
    unique = []
    for item in addre:
        if item not in seen:
            seen.add(item)
            unique.append(item)
    # The original also print()-ed every input element; that debug noise is removed.
    return unique
正向地理编码,爬取城市的经纬度:
import json#正向爬取经纬度
from urllib.request import urlopen, quote
import requests,csv
def getlnglat(address):
    """Forward-geocode a city name with the Baidu Geocoding v3 API.

    Args:
        address: city name (may contain non-ASCII characters).

    Returns:
        dict: parsed JSON response; coordinates live at
        result['result']['location'] ({'lng': ..., 'lat': ...}).
    """
    encoded = quote(address)  # percent-encode so non-ASCII names survive the URL
    uri = ('http://api.map.baidu.com/geocoding/v3/'
           '?address=' + encoded
           + '&output=' + 'json'
           + '&ak=' + '你申请的百度AK'
           + '&city=' + encoded)
    response = urlopen(uri)
    body = response.read().decode()  # response bytes -> str
    return json.loads(body)         # parse the JSON payload
# Geocode every city name and persist (city, lng, lat) rows.
# `with` guarantees the file is closed even if an API call raises — the
# original leaked the handle on any exception between open() and close().
with open('city12.csv', 'w', encoding='utf-8', newline='') as f:
    csv_writer = csv.writer(f)
    csv_writer.writerow(["city", "lng", "lat"])
    # NOTE(review): address2 is not defined in this snippet; presumably
    # address2 = del_repeatnum(address) — confirm against the full post.
    for town in address2:
        # One API request per city; the original called getlnglat() twice
        # (once for lng, once for lat), doubling traffic for the same data.
        loc = getlnglat(town)['result']['location']
        csv_writer.writerow([town, loc['lng'], loc['lat']])
读取刚刚爬取的城市经纬度数据集:
# Coordinates scraped in the previous step: one row per city (city, lng, lat).
city=pd.read_csv('./city12.csv')
逆向地理编码获取城市对应的省份:
import json#逆向获取省份
from urllib.request import urlopen, quote
import requests,csv
def getProvince(lng, lat):
    """Reverse-geocode a (lng, lat) pair via the Baidu Reverse Geocoding v3 API.

    Args:
        lng: longitude of the point.
        lat: latitude of the point.

    Returns:
        dict: parsed JSON response; the province name lives at
        result['result']['addressComponent']['province'].
    """
    url = 'http://api.map.baidu.com/reverse_geocoding/v3/'
    output = 'json'
    ak = '你申请的百度AK'  # your Baidu API key (AK)
    # BUG FIX: the coordinates in city12.csv come from Baidu's forward geocoder,
    # which returns bd09ll (Baidu) coordinates by default — declaring them as
    # 'wgs84ll' (the original) shifted every reverse lookup by a few hundred metres.
    coordtype = 'bd09ll'
    uri = (url + '?' + 'ak=' + ak + '&output=' + output
           + '&coordtype=' + coordtype
           + '&location=' + str(lat) + ',' + str(lng))  # Baidu expects "lat,lng"
    req = urlopen(uri)
    res = req.read().decode("utf-8")  # response bytes -> str
    temp = json.loads(res)            # parse the JSON payload
    return temp
将爬取的内容保存起来:
# Reverse-geocode each city's coordinates and persist (province, city) rows.
# `with` guarantees the file is closed (and flushed) even if a lookup raises —
# the original leaked the handle on any exception between open() and close().
with open('province2.csv', 'w', encoding='utf-8', newline='') as f:
    csv_writer = csv.writer(f)
    csv_writer.writerow(["province", "city"])
    for i in range(len(city)):
        resp = getProvince(city.lng[i], city.lat[i])
        prov = resp['result']['addressComponent']['province']
        csv_writer.writerow([prov, city['city'][i]])
统计各个城市出现的频率,将其用来代表用户量:
# Aggregate user counts per province for the visualisation.
province = pd.read_csv('./province2.csv')

# User count for each city: occurrences of the city in the raw address list.
# value_counts() is one O(n) pass; the original called address.count(i) per
# city, i.e. O(n) per city / O(n*m) overall.
_city_counts = pd.Series(address).value_counts()
frency = [int(_city_counts.get(c, 0)) for c in city['city']]

# One (province, user-count) pair per city; province2.csv rows were written in
# the same order as city12.csv, so the two tables align positionally.
pairs = [[p, n] for p, n in zip(province['province'], frency)]

# Collect the user counts of every city located in Guangdong province.
# BUG FIX: the original wrote `list[col][1]`, subscripting the *builtin* list
# type, which cannot run; it almost certainly meant these (province, count)
# pairs — confirm against the author's intent.
num_feature6 = [pair[1] for pair in pairs if pair[0] == '广东省']
count = 0
for n in num_feature6:
    print(count)  # running total before adding the next city (kept from original)
    count = n + count

# Persist the per-city (province, count) table as CSV.
# BUG FIX: the original passed `data=list` (again the builtin type) to DataFrame.
name = ['province', 'data']
test = pd.DataFrame(columns=name, data=pairs)
print(test)
test.to_csv('./testcsv.csv', encoding='UTF-8')