杭州公交总览
http://bus.hangzhou.com.cn/all_line.php
找到每一路公交车对应网址
# Collect the page id of every bus line linked from the overview page.
url = 'http://bus.hangzhou.com.cn/all_line.php'
headers = {
    # The value is the bare browser UA string; the field name must not be
    # repeated inside the value.
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/56.0.2924.87 Safari/537.36'
    ),
}
response = requests.get(url, headers=headers, timeout=5)
soup = BeautifulSoup(response.text, 'lxml')  # parse the overview page HTML
line_list = soup.find(attrs={'class': 'line_all'})
if line_list is None:
    # find() returns None when nothing matches; fail with a readable message
    # instead of an AttributeError on the next call.
    raise RuntimeError('no element with class "line_all" found at ' + url)
href = line_list.find_all('a')  # every link inside the line listing
id_ = []  # page ids, in the order the links appear
for anchor in href:
    # The first run of digits in the tag's markup is used as the page id.
    match = re.search(r'\d+', str(anchor))
    if match is None:
        continue  # no digits in this link, so there is no id to extract
    id_.append(match.group())
任选一路车进入其页面
http://bus.hangzhou.com.cn/line.php?line_id=3
# Open the detail page of the bus line at index `count`.
line_page_id = str(id_[count])
url = 'http://bus.hangzhou.com.cn/line.php?line_id=' + line_page_id
response_ = requests.get(url, headers=headers, timeout=10)
print('url:{} count:{}'.format(url, count))
soup = BeautifulSoup(response_.text, 'lxml')
# The last <strong> inside the "main_title" element carries the
# start-station / terminal-station text.
title_block = soup.find(attrs={'class': 'main_title'})
start_terminal_ = title_block.find_all('strong')[-1]
完整代码
import requests
from bs4 import BeautifulSoup
import random
import tqdm as tqdm
import pandas as pd
import numpy as np
import re
# Empty DataFrame that will hold the start station and terminal station,
# keyed by the running index stored in the 'count' column.
df = pd.DataFrame(
    columns=[
        'count',
        'start',
        'terminal',
    ],
)
# Collect the page id of every bus line linked from the overview page.
url = 'http://bus.hangzhou.com.cn/all_line.php'
headers = {
    # The value is the bare browser UA string; the field name must not be
    # repeated inside the value.
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/56.0.2924.87 Safari/537.36'
    ),
}
response = requests.get(url, headers=headers, timeout=5)
soup = BeautifulSoup(response.text, 'lxml')  # parse the overview page HTML
line_list = soup.find(attrs={'class': 'line_all'})
if line_list is None:
    # find() returns None when nothing matches; fail with a readable message
    # instead of an AttributeError on the next call.
    raise RuntimeError('no element with class "line_all" found at ' + url)
href = line_list.find_all('a')  # every link inside the line listing
id_ = []  # page ids, in the order the links appear
for anchor in href:
    # The first run of digits in the tag's markup is used as the page id.
    match = re.search(r'\d+', str(anchor))
    if match is None:
        continue  # no digits in this link, so there is no id to extract
    id_.append(match.group())
# Without rotating IPs the crawler may get banned part-way through. `count`
# is the index into id_ of the next page to fetch, so after an interruption
# set it to the last printed value and run this section again to resume.
count = 0

# Separators found between the start and terminal names, tried in this order.
# The double dash must come before the single one, which it contains.
_ROUTE_SEPARATORS = ('——', '-', '—')


def _split_route(text):
    """Split a route title into (start, terminal) on the first known separator.

    Returns (text, None) when none of the separators occurs in ``text``.
    """
    for separator in _ROUTE_SEPARATORS:
        if separator in text:
            pieces = text.split(separator)
            return pieces[0], pieces[1]
    return text, None


while count < len(id_):
    # Open the detail page of the current bus line.
    url = 'http://bus.hangzhou.com.cn/line.php?line_id=' + str(id_[count])
    response_ = requests.get(url, headers=headers, timeout=10)
    print('url:{} count:{}'.format(url, count))
    soup = BeautifulSoup(response_.text, 'lxml')

    # The last <strong> inside "main_title" carries the start/terminal text.
    title = soup.find(attrs={'class': 'main_title'})
    strongs = title.find_all('strong') if title is not None else []
    if not strongs:
        # Stop with the resume position rather than an opaque
        # AttributeError/IndexError when the expected markup is missing.
        raise RuntimeError(
            'route title not found at {}; set count = {} to resume'.format(
                url, count))
    start_terminal_ = strongs[-1]

    # Store the two halves of the title in the row for this index. When no
    # separator is present the whole text goes to 'start' and 'terminal' is
    # left unset.
    start, terminal = _split_route(start_terminal_.text)
    df.loc[count, 'count'] = count
    df.loc[count, 'start'] = start
    if terminal is not None:
        df.loc[count, 'terminal'] = terminal
    count = count + 1