接下来进入正题,第一步获取公交线路信息,这里我使用的是84bus.net中公交线路的信息。本次实验使用的python库有requests、lxml、pandas。这里以青岛市为例,首先从http://www.84bus.net/qingdao/lines_all.html获取每条公交线路详细信息的链接。
有了每条公交线路的链接就可以通过循环遍历获取每条线路的相关信息,包含线路名称、起始站点、终点站、运营时间、发班间隔、公司、更新时间、站点数、站点信息。
有了公交线路信息,就可以通过高德地图结合开发者工具获取真实存储公交线路拐点信息的链接并获取其中存储的具体内容。这里以在高德地图中搜索青岛101路为例。链接:
https://www.amap.com/service/poiInfo?query_type=TQUERY&pagesize=20&pagenum=1&qii=true&cluster_state=5&need_utd=true&utd_sceneid=1000&div=PC1000&addr_poi_merge=true&is_classify=true&zoom=12&city=370200&geoobj=120.400212%7C36.1631%7C120.572578%7C36.315688&keywords=%E9%9D%92%E5%B2%9B101%E8%B7%AF解析url地址,根据不同的公交线路信息调整构建网站链接。知道了存储每条公交线路位置信息的真实链接,接下来就是访问链接,获取公交线路信息以及拐点位置信息。由于高德地图的反爬虫机制越发智能,为了避免出现rgv587_flag错误,建议将浏览器中的请求头全部照搬过来并且加上时间休眠,避免爬虫被系统识别。这里我设置的随机休眠时间在10-120s之间,相对较长,确保不被检测识别为爬虫(如果还会出现爬虫被识别,这里可以加入代理IP,避免IP被封)
接下来就需要对获取的内容进行解析,利用json.cn了解所需信息的存储格式和位置。在这里需要注意几个坑!!!
1. 不是所有的公交线路都有位置信息,因为我们之前获取的公交线路中有部分是支线,这些线路在高德地图中不存储位置信息,以干线信息为准。
2. 注意被识别爬虫的线路信息,一旦检测出是爬虫,程序就会返回rgv587_flag,从而导致无法获取线路信息,这里针对这类数据引入try except 错误识别机制进行识别并返回相应的标识is_fetch。
上面介绍了如何获取一条公交线路位置信息的全部过程,并将其封装为实例对象,通过构建循环遍历就可以获取某个城市全部公交线路的拐点信息以及公交线路信息。
有了公交线路的拐点位置信息,基于arcgis软件点集转线工具将各公交线拐点位置信息转换为公交线路。其中id为公交线路编号,nodeindex为不同公交线路拐点的编号顺序。
以青岛市为例,这里获取了青岛市全市的公交线路。有了公交线路静态数据信息,结合一些公交刷卡信息就可以了解一个城市公交客流的分布情况。
最后附上此次数据获取的所有源代码:代理获取、火星坐标wgs坐标转换、公交线路信息获取代码
1. 代理获取
import requests
from lxml import etree
import json

# Browser User-Agent so the proxy-list site serves the normal page.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
}


def download(url):
    """Fetch *url* and return its response text.

    Returns None (implicitly) when the request fails or the status code
    is not 200.
    """
    print('正在下载页面:{}'.format(url))
    try:
        res = requests.get(url, headers=headers)
        if res.status_code == 200:
            print('下载完成')
            return res.text
        else:
            # Funnel non-200 responses into the shared error path below.
            raise ConnectionError
    except Exception:
        print('下载页面出错:{}'.format(url))


def get_ip_list(resp):
    """Parse the proxy table HTML and return a list of {scheme: 'ip:port'} dicts."""
    html = etree.HTML(resp)
    # Table #ip_list rows: td[2]=IP, td[3]=port, td[6]=HTTP/HTTPS.
    # NOTE(review): 'not(@)' is not valid XPath and looks truncated
    # (perhaps 'not(@class)' originally) — kept verbatim; verify against
    # the live page before relying on it.
    ips = html.xpath('//table[@id="ip_list"]//tr[not(@) and (position()>1)]/td[position()=2]/text()')
    ports = html.xpath('//table[@id="ip_list"]//tr[not(@) and (position()>1)]/td[position()=3]/text()')
    schemes = html.xpath('//table[@id="ip_list"]//tr[not(@) and (position()>1)]/td[position()=6]/text()')
    # Renamed from 'type', which shadowed the builtin.
    return [{k.lower(): i + ':' + j} for i, j, k in zip(ips, ports, schemes)]


def detect(proxy):
    """Return True when *proxy* can reach icanhazip.com within 1 second."""
    https_api = "https://icanhazip.com/"
    http_api = "http://icanhazip.com/"
    try:
        # Probe with a URL whose scheme matches the proxy's own scheme;
        # timeout=1 drops unresponsive proxies quickly.
        if list(proxy.keys())[0] == 'http':
            requests.get(url=http_api, proxies=proxy, headers=headers, timeout=1)
        else:
            requests.get(url=https_api, proxies=proxy, headers=headers, timeout=1)
        return True
    except Exception:
        return False


def get_useable_proxies(proxy_list):
    """Filter *proxy_list* down to the proxies that pass detect()."""
    proxies = []
    for proxy in proxy_list:
        if detect(proxy):
            proxies.append(proxy)
            print('代理:{} --可用'.format(proxy))
        else:
            print('代理:{} --不可用'.format(proxy))
    return proxies


def main():
    """Scrape the proxy site, keep the working proxies, write proxy.json."""
    url = 'https://www.xicidaili.com/'
    res = download(url)
    proxy_list = get_ip_list(res)
    proxies = get_useable_proxies(proxy_list)
    with open('proxy.json', 'w') as f:
        json.dump(proxies, f, indent=4, ensure_ascii=False)


if __name__ == '__main__':
    main()
2. 火星坐标wgs坐标转换
import math

# GCJ-02 ("Mars" coordinates) -> WGS-84 inverse correction using the
# standard Krasovsky-ellipsoid approximation: estimate the GCJ-02 offset
# at the point, then subtract it from the input coordinates.
_A = 6378245.0                # Krasovsky ellipsoid semi-major axis (m)
_EE = 0.00669342162296594323  # Krasovsky ellipsoid first eccentricity squared
_PI = 3.14159265358979324


def _offset_lon(x, y):
    """Longitude perturbation term of the GCJ-02 obfuscation."""
    d = 300.0 + x + 2.0 * y + 0.1 * x * x + 0.1 * x * y + 0.1 * math.sqrt(abs(x))
    d += (20.0 * math.sin(6.0 * x * _PI) + 20.0 * math.sin(2.0 * x * _PI)) * 2.0 / 3.0
    d += (20.0 * math.sin(x * _PI) + 40.0 * math.sin(x / 3.0 * _PI)) * 2.0 / 3.0
    d += (150.0 * math.sin(x / 12.0 * _PI) + 300.0 * math.sin(x / 30.0 * _PI)) * 2.0 / 3.0
    return d


def _offset_lat(x, y):
    """Latitude perturbation term of the GCJ-02 obfuscation."""
    d = -100.0 + 2.0 * x + 3.0 * y + 0.2 * y * y + 0.1 * x * y + 0.2 * math.sqrt(abs(x))
    d += (20.0 * math.sin(6.0 * x * _PI) + 20.0 * math.sin(2.0 * x * _PI)) * 2.0 / 3.0
    d += (20.0 * math.sin(y * _PI) + 40.0 * math.sin(y / 3.0 * _PI)) * 2.0 / 3.0
    d += (160.0 * math.sin(y / 12.0 * _PI) + 320 * math.sin(y * _PI / 30.0)) * 2.0 / 3.0
    return d


def GCJ2WGS(lon, lat):
    """Convert GCJ-02 (AMap) coordinates to approximate WGS-84.

    Returns a (wgsLon, wgsLat) tuple of floats.
    """
    x = lon - 105.0
    y = lat - 35.0
    dLon = _offset_lon(x, y)
    dLat = _offset_lat(x, y)
    radLat = lat / 180.0 * _PI
    magic = math.sin(radLat)
    magic = 1 - _EE * magic * magic
    sqrtMagic = math.sqrt(magic)
    # Scale the raw offsets onto the ellipsoid at this latitude.
    dLat = (dLat * 180.0) / ((_A * (1 - _EE)) / (magic * sqrtMagic) * _PI)
    dLon = (dLon * 180.0) / (_A / sqrtMagic * math.cos(radLat) * _PI)
    return lon - dLon, lat - dLat
3. 公交线路信息获取代码
from urllib import request, parse
import requests
import json
from GCJ02_TO_WGS84 import GCJ2WGS
import pandas as pd
import random
import time


class BusLine_Coords_Spider():
    """Fetch one bus line's metadata and polyline vertices from AMap."""

    def __init__(self, name, city_code):
        self.name = name            # bus-line search keyword, e.g. '青岛101路'
        self.city_code = city_code  # AMap city code, e.g. '370200'

    def get_html(self, proxies):
        """Request the AMap POI search API for this line, retrying up to 20 times.

        Returns (data, flag): the raw response text (empty string on total
        failure) and whether any request succeeded.
        NOTE(review): *proxies* is accepted but never passed to requests.get
        in the original code — kept as-is so the signature and behavior are
        unchanged; wire it into requests.get(..., proxies=...) if proxying
        is actually wanted.
        """
        params = {
            'keywords': self.name,
            'zoom': '12',
            'city': self.city_code,
            'geoobj': '120.197818|36.194447|120.560407|36.283069'
        }
        # Full browser header set copied from DevTools to dodge the
        # anti-crawler check (rgv587_flag responses).
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-User': '?1',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-Mode': 'navigate',
            'If-None-Match': 'W/"4a548-FS/QFAS79ra/ddJpLkzHfCsH+As"',
            'Host': 'www.amap.com',
            'Cookie': 'acw_tc=ac11000115843437021618680e01091e8cee78128b2e2558f0483f3ba2fde7; cna=lxj2FonBGmwCAW/CwOZRoP9/; UM_distinctid=170e23e610710-00c6191594b92a-366b420b-144000-170e23e61082ed; _uab_collina=158434370470306853095316; passport_login=NDEzMDI0NjUzLGFtYXBBaktVaUl1MXAsNzU1ZmJ4c29yaG41eXN5YjZxbHB1bm8yY2VoZXJ3eWksMTU4NDgxMTMwNixOakkyTjJRd1pXWXlNamxoTm1NeU5ERmtNR1V4WW1Vd016QmhNMll4TmpFPQ%3D%3D; guid=f32b-f2fb-158a-39ef; CNZZDATA1255626299=1122227196-1584339388-%7C1584959047; x-csrf-token=8374961a7a75d559b8e34f8d2d11eae1; l=dBgRh9lcqtYgLPKBBOCNISrYS_7OSIRA_uo4MjN6i_5dZ1TsWZQOoPKYde96VjWfG78B4czYcr29-etkZQDmnddGeXE3xxDc.; isg=BICAe26KyjsuDbW2PGD2KH9BUQ5SCWTTUbf2t_oRTBsudSCfohk0Y1ZHjd21Qhyr',
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
        }
        base_url = "https://www.amap.com/service/poiInfo?query_type=TQUERY&pagesize=20&pagenum=1&qii=true&cluster_state=5&need_utd=true&utd_sceneid=1000&div=PC1000&addr_poi_merge=true&is_classify=true&"
        url = base_url + parse.urlencode(params)
        print('url:{}'.format(url))
        flag = False
        data = ''
        attempt = 1
        while not flag:
            try:
                res = requests.get(url, headers=headers)
                data = res.text
                print(data)
                flag = True
                break
            except Exception:
                print(attempt)
                attempt += 1
                if attempt > 20:
                    print('访问{}失败'.format(self.name))
                    break
        return data, flag

    def get_info(self, data):
        """Parse the API response into (info_df, loc_df, is_exist, is_fetch).

        is_fetch is False when the response is not the expected JSON shape
        (e.g. the anti-crawler rgv587_flag page); is_exist is False when the
        JSON carries no usable busline entry for this line.
        """
        df_busline_loc = pd.DataFrame()
        df_busline_info = pd.DataFrame()
        is_fetch = True
        is_exist = True
        try:
            # json.loads inside the try so a non-JSON (blocked) response
            # yields is_fetch=False instead of an uncaught exception.
            jsonData = json.loads(data)
            busline_list = jsonData["data"]["busline_list"]
            try:
                busline = busline_list[0]  # first match is the main line
                busline_info = {
                    'id': busline['id'],
                    'key_name': busline['key_name'],
                    'name': busline['name'],
                    'front_name': busline['front_name'],
                    'terminal_name': busline['terminal_name'],
                    'length': busline['length'],
                    'total_price': busline['total_price'],
                    'map_view_city': jsonData["data"]['lqii']['city_info_attr']['map_view_city'],
                }
                # 'xs'/'ys' are comma-separated GCJ-02 vertex coordinates.
                x_set = busline['xs'].split(',')
                y_set = busline['ys'].split(',')
                busline_loc = []
                for i in range(len(x_set)):
                    gcjlon, gcjlat = float(x_set[i]), float(y_set[i])
                    wgslon, wgslat = GCJ2WGS(gcjlon, gcjlat)
                    # nodeindex is the 1-based vertex order along the line.
                    busline_loc.append([busline_info['id'], i + 1, gcjlon, gcjlat,
                                        wgslon, wgslat, busline_info['map_view_city']])
                df_busline_loc = pd.DataFrame(
                    busline_loc,
                    columns=['id', 'nodeindex', 'gcjlon', 'gcjlat',
                             'wgslon', 'wgslat', 'map_view_city'],
                    index=range(1, len(busline_loc) + 1))
                df_busline_info = pd.DataFrame(busline_info, index=[1])
                print(df_busline_loc)
                print(df_busline_info)
            except Exception:
                is_exist = False
        except Exception:
            is_fetch = False
        return df_busline_info, df_busline_loc, is_exist, is_fetch


def Main(proxy_jsonfile, city_code, busname_file):
    """Crawl every bus line named in *busname_file* for city *city_code*.

    Returns (buslines_loc, buslines_info) DataFrames; also prints the
    lines that failed to open, had no geometry, or hit the anti-crawler.
    """
    with open(proxy_jsonfile, 'r') as f:
        proxies = json.load(f)
    buslines_name = list(pd.read_excel(busname_file)['线路名称'])
    false_get_list = []
    false_exist_list = []
    false_fetch_list = []
    info_frames = []
    loc_frames = []
    for index, name in enumerate(buslines_name):
        bcs = BusLine_Coords_Spider(name, city_code)
        data, is_get = bcs.get_html(proxies)
        if not is_get:
            false_get_list.append(name)
            print('线路{}网址打开失败'.format(name))
        else:
            busline_info, busline_loc, is_exist, is_fetch = bcs.get_info(data)
            if not is_exist:
                false_exist_list.append(name)
                print('线路{}没有位置信息'.format(name))
            elif not is_fetch:
                false_fetch_list.append(name)
                print('线路{}获取失败'.format(name))
                # Anti-crawler triggered: abort the whole crawl.
                break
            else:
                info_frames.append(busline_info)
                loc_frames.append(busline_loc)
                print('获取线路{}信息完成'.format(name))
        print('爬取进度:{}'.format((index + 1) / len(buslines_name)))
        # Random pause between requests to look less like a bot.
        time.sleep(random.randint(5, 20))
    # DataFrame.append was removed in pandas 2.x; concatenate collected
    # frames once (default concat keeps each frame's index, like append did).
    buslines_info = pd.concat(info_frames) if info_frames else pd.DataFrame()
    buslines_loc = pd.concat(loc_frames) if loc_frames else pd.DataFrame()
    print('网页打开错误的线路:{}'.format(false_get_list))
    print('不能存在线路位置信息:{}'.format(false_exist_list))
    print('访问过快,反爬虫机制限制的线路:{}'.format(false_fetch_list))
    return buslines_loc, buslines_info


if __name__ == '__main__':
    df_buslines_loc, df_buslines_info = Main('./proxy.json', '370200', './青岛公交线路名称.xlsx')
    df_buslines_loc.to_excel('青岛公交线网拐点信息.xlsx', index=None)
    df_buslines_info.to_excel('青岛公交线路信息.xlsx', index=None)