随着共享单车的不断增加以及其重要性,很多人开始通过共享单车大数据对城市生活进行研究与分析。前段时间做ofo数据分析时一直被数据所困扰,通过学习与借鉴其他学者的研究,利用python对ofo单车数据进行爬取。
相对于其他爬取程序,该程序有以下特点:
- 本程序利用python2.7进行编写,考虑到arcpy模块使用版本问题;
- 由于爬取的是高德地图上面的ofo定位数据,所以又将火星坐标系下的定位数据转换为WGS84坐标系下的定位数据;
- 同时将转换后的数据导出为shp点数据,方便操作与研究。
首先呢,该ofo定位数据爬取程序是通过登录ofo网页爬取数据的,登录地址:https://common.ofo.so/newdist/?Journey。进行爬取前必须获取认证信息token,获取方式如下图所示:
在下面的程序代码中用的是我的token,为了长久使用,大家可以自己登录获取token。
同时该程序是通过定义矩形区域进行爬取的,所以要事先查询要爬取区域的左上角经纬度与右下角经纬度,调用start函数时填写该数据。
详细代码如下:
#!coding=utf-8
from __future__ import division
import requests
import datetime
import threading
import json
import os
import pandas as pd
import numpy as np
import time
import sqlite3
import math
import arcpy
from requests.packages.urllib3.exceptions import InsecureRequestWarning
from requests_toolbelt.multipart.encoder import MultipartEncoder
from concurrent.futures import ThreadPoolExecutor
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
class Crawler:
def __init__(self):
self.start_time = datetime.datetime.now()
self.db_name = "file:database?mode=memory&cache=shared"
self.save_path = "./data/" + datetime.datetime.now().strftime("%Y%m%d")
self.file_name = datetime.datetime.now().strftime("%Y%m%d_%H%M") + "_ofo"
self.lock = threading.Lock()
self.total = 0
self.done = 0
self.bikes_count = 0
self.x_pi = 3.14159265358979324 * 3000.0 / 180.0
self.pi = 3.1415926535897932384626 # π
self.a = 6378245.0 # 长半轴
self.ee = 0.00669342162296594323 # 偏心率平方
self.message = os.path.isdir(self.save_path)
def get_nearby_bikes(self, args):
try:
url = "https://san.ofo.so/ofo/Api/nearbyofoCar"
headers = {
'charset': "utf-8",
'Accept': '*/*',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN',
'Content-Length': '516',
'Content-Type': 'multipart/form-data; boundary=----ofo-boundary-MC4wOTk1ODUy',
'Host': 'san.ofo.so',
'Origin': 'https://common.ofo.so',
'Referer': 'https://common.ofo.so/newdist/',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393'
}
self.request(headers, args, url)
except Exception as ex:
print(ex)
def request(self, headers, args, url):
multipart_encoder = MultipartEncoder(
fields={
"token": "e923c290-0d27-11e7-b9d2-b5e857d3318f",
"source": "0",
"source-version": "9999",
# "lat": "36.103235",
"lat": str(args[0]),
# "lng":"103.709681"
"lng": str(args[1])
# file为路径
},
boundary='----ofo-boundary-MC4wOTk1ODUy'
)
response = requests.request(
"POST", url, headers=headers,
timeout=30,
verify=False,
data=multipart_encoder
)
with self.lock:
with self.connect_db() as c:
try:
decoded = json.loads(response.text)['values']['info']['cars']
self.done += 1
for x in decoded:
self.bikes_count += 1
at = self.gcj02_to_wgs84_lat(x['lng'], x['lat'])#将火星坐标转换为WGS84--纬度
on = self.gcj02_to_wgs84_lng(x['lng'], x['lat'])#将火星坐标转换为WGS84--经度
c.execute("INSERT OR IGNORE INTO ofo VALUES (%d,'%s',%f,%f)" % (
int(time.time()) * 1000, x['carno'], at, on))
timespent = datetime.datetime.now() - self.start_time
percent = self.done / self.total
total = timespent.total_seconds() / percent
print("位置 %s, 未去重单车数量 %s, 进度 %0.2f%%, 速度 %0.2f个/分钟, 总时间 %s, 剩余时间 %s" % (
args, self.bikes_count, percent * 100, self.done / timespent.total_seconds() * 60, total,
total - timespent.total_seconds()))
except Exception as ex:
print(ex)
def connect_db(self):
return sqlite3.connect(self.db_name)
def generate_create_table_sql(self, brand):
return '''CREATE TABLE {0}
(
"bikeId" VARCHAR(12),
lat DOUBLE,
lng DOUBLE,
CONSTRAINT "{0}_bikeId_lat_lon_pk"
PRIMARY KEY (bikeId, lat, lng)
);'''.format(brand)
#创建shp点数据
def CreateFeaturclass(self, savepath, featurename, spatial):
if arcpy.Exists(savepath + '\\' + featurename + '.shp') == False:
arcpy.CreateFeatureclass_management(savepath, featurename, 'POINT', '', '', '', spatial)
else:
pass
#添加字段
def AddField(self, savepath, featurename):
arcpy.AddField_management(savepath + '\\' + featurename + '.shp', 'bikeid', 'TEXT')
arcpy.AddField_management(savepath + '\\' + featurename + '.shp', 'lon', 'TEXT')
arcpy.AddField_management(savepath + '\\' + featurename + '.shp', 'lat', 'TEXT')
#遍历点并添加字段值
def InsertRow(self, savepath, featurename, data):
Insercur = arcpy.InsertCursor(savepath + '\\' + featurename + '.shp')
for value in range(1,len(data)+1):
point = arcpy.Point()
newrow = Insercur.newRow()
point.X = float(data.head(value)['lng'][value-1])
point.Y = float(data.head(value)['lat'][value-1])
newrow.setValue('Id', value)
newrow.setValue('bikeid',data.head(value)['bikeId'][value-1])
newrow.setValue('lon',data.head(value)['lng'][value-1])
newrow.setValue('lat',data.head(value)['lat'][value-1])
pointGeo = arcpy.PointGeometry(point)
newrow.shape = pointGeo
Insercur.insertRow(newrow)
def gcj02_to_wgs84_lng(self, lng1, lat1):
if self.out_of_china(lng1, lat1): # 判断是否在国内
return lng1, lat1
dlng = self.transformlng(lng1 - 105.0, lat1 - 35.0)
radlat = lat1 / 180.0 * self.pi
magic = math.sin(radlat)
magic = 1 - self.ee * magic * magic
sqrtmagic = math.sqrt(magic)
dlng = (dlng * 180.0) / (self.a / sqrtmagic * math.cos(radlat) * self.pi)
mglng = lng1 + dlng
return lng1 * 2 - mglng
def gcj02_to_wgs84_lat(self, lng1, lat1):
if self.out_of_china(lng1, lat1): # 判断是否在国内
return lng1, lat1
dlat = self.transformlat(lng1 - 105.0, lat1 - 35.0)
radlat = lat1 / 180.0 * self.pi
magic = math.sin(radlat)
magic = 1 - self.ee * magic * magic
sqrtmagic = math.sqrt(magic)
dlat = (dlat * 180.0) / ((self.a * (1 - self.ee)) / (magic * sqrtmagic) * self.pi)
mglat = lat1 + dlat
return lat1 * 2 - mglat
def transformlat(self, lng1, lat1):
ret = -100.0 + 2.0 * lng1 + 3.0 * lat1 + 0.2 * lat1 * lat1 + \
0.1 * lng1 * lat1 + 0.2 * math.sqrt(math.fabs(lng1))
ret += (20.0 * math.sin(6.0 * lng1 * self.pi) + 20.0 *
math.sin(2.0 * lng1 * self.pi)) * 2.0 / 3.0
ret += (20.0 * math.sin(lat1 * self.pi) + 40.0 *
math.sin(lat1 / 3.0 * self.pi)) * 2.0 / 3.0
ret += (160.0 * math.sin(lat1 / 12.0 * self.pi) + 320 *
math.sin(lat1 * self.pi / 30.0)) * 2.0 / 3.0
return ret
def transformlng(self, lng1, lat1):
ret = 300.0 + lng1 + 2.0 * lat1 + 0.1 * lng1 * lng1 + \
0.1 * lng1 * lat1 + 0.1 * math.sqrt(math.fabs(lng1))
ret += (20.0 * math.sin(6.0 * lng1 * self.pi) + 20.0 *
math.sin(2.0 * lng1 * self.pi)) * 2.0 / 3.0
ret += (20.0 * math.sin(lng1 * self.pi) + 40.0 *
math.sin(lng1 / 3.0 * self.pi)) * 2.0 / 3.0
ret += (150.0 * math.sin(lng1 / 12.0 * self.pi) + 300.0 *
math.sin(lng1 / 30.0 * self.pi)) * 2.0 / 3.0
return ret
def out_of_china(self, lng1, lat1):
return not (lng1 > 73.66 and lng1 < 135.05 and lat1 > 3.86 and lat1 < 53.55)
def group_data(self):
print("正在导出数据")
conn = self.connect_db()
self.export_to_shp(conn, "ofo")
def export_to_shp(self, conn, brand):
spRef = arcpy.SpatialReference(4326)
df = pd.read_sql_query("SELECT * FROM %s" % brand, conn, parse_dates=True)
self.CreateFeaturclass(self.save_path, self.file_name, spRef)
self.AddField(self.save_path, self.file_name)
self.InsertRow(self.save_path, self.file_name, df)
print(brand)
print ("去重后数量")
print (len(df))
def start(self, top_lng, top_lat, bottom_lng, bottom_lat):
while True:
self.__init__()
if self.message == False:
os.makedirs(self.save_path)#创建路径
try:
with self.connect_db() as c:
c.execute(self.generate_create_table_sql('ofo'))
except Exception as ex:
print(ex)
pass
executor = ThreadPoolExecutor(max_workers=100)
print("Start")
self.total = 0
offset = 0.002
lat_range = np.arange(float(top_lat), float(bottom_lat), -offset)
for lat in lat_range:
lng_range = np.arange(float(top_lng), float(bottom_lng), offset)
for lon in lng_range:
self.total += 1
executor.submit(self.get_nearby_bikes, (lat, lon))
executor.shutdown()
self.group_data()
#是否继续运行
always_run = False
if not always_run:
break
waittime = 1
print("等待%s分钟后继续运行" % waittime)
time.sleep(waittime * 60)
if __name__ == '__main__':
c = Crawler()
c.start(103.686592, 36.114191, 103.741781, 36.091515)#爬取范围(左上角,右下角经纬度)
print("完成")
最后将导出为shp数据在ArcGIS中展示如下图所示:
参考文献
https://github.com/SilverBooker/ofoSpider
本文仅供参考学习,有不到之处,望大家谅解