Python 爬取杭州公交车始发站-终点站（正确代码，完整注释）

最新推荐文章于 2023-07-15 11:17:26 发布

诗人藏夜里

最新推荐文章于 2023-07-15 11:17:26 发布

阅读量667

点赞数

分类专栏：爬虫 文章标签：爬虫

本文链接：https://blog.csdn.net/weixin_43722026/article/details/103108537

版权

爬虫专栏收录该内容

3 篇文章 0 订阅

订阅专栏

杭州公交总览
http://bus.hangzhou.com.cn/all_line.php

在总览页面中找到每一路公交车对应的网址（每条线路的链接中都带有该线路的数字 id）
在这里插入图片描述

# Collect the numeric page id of every bus line listed on the overview page.
url = 'http://bus.hangzhou.com.cn/all_line.php'

# Send a browser-like User-Agent. The header value must be the bare UA string;
# the previous value wrongly repeated the "User-Agent:" header name inside it.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}


response = requests.get(url, headers=headers, timeout=5)
response.raise_for_status()  # fail early on an HTTP error instead of parsing an error page

soup = BeautifulSoup(response.text, 'lxml')   # parse the overview page
line_all = soup.find(attrs={'class': 'line_all'})
if line_all is None:
    raise RuntimeError("no element with class 'line_all' found at " + url)
href = line_all.find_all('a')    # the <a> tags that link to the individual bus lines
id_ = []    # page id of every bus line, kept as strings
for link in href:
    # The id is taken to be the first run of digits in the tag's HTML.
    digits = re.search(r'\d+', str(link))
    if digits is None:
        continue    # a link without any number cannot be a bus-line page; skip it
    id_.append(digits.group())

任选一路车进入其页面
http://bus.hangzhou.com.cn/line.php?line_id=3

在这里插入图片描述

# Excerpt of the per-line loop body (shown here without its enclosing loop header).
# The string below is kept verbatim; it reads "open the page of each bus line".
'''
    进入每一辆车对应网页
    '''
    # Build the page address of the current line from its id.
    url = 'http://bus.hangzhou.com.cn/line.php?line_id=' + str(id_[count])
    response_ = requests.get(url, headers=headers, timeout=10)
    # Progress output: which url is being fetched and at which index.
    print('url:{} count:{}'.format(url, count))
    soup = BeautifulSoup(response_.text, 'lxml')
    # Locate the element holding the start and terminal station names:
    # the last <strong> inside the element with class "main_title".
    start_terminal_ = soup.find(attrs={'class': 'main_title'}).find_all('strong')[-1]

完整代码

import requests
from bs4 import BeautifulSoup
import random
import tqdm as tqdm

import pandas as pd
import numpy as np
import re
# Empty result table: one row per bus line, holding the row index ('count'),
# the start station ('start') and the terminal station ('terminal').
df = pd.DataFrame(
    columns=['count', 'start', 'terminal'],
)
  
    
# Collect the numeric page id of every bus line listed on the overview page.
url = 'http://bus.hangzhou.com.cn/all_line.php'

# Send a browser-like User-Agent. The header value must be the bare UA string;
# the previous value wrongly repeated the "User-Agent:" header name inside it.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}


response = requests.get(url, headers=headers, timeout=5)
response.raise_for_status()  # fail early on an HTTP error instead of parsing an error page

soup = BeautifulSoup(response.text, 'lxml')   # parse the overview page
line_all = soup.find(attrs={'class': 'line_all'})
if line_all is None:
    raise RuntimeError("no element with class 'line_all' found at " + url)
href = line_all.find_all('a')    # the <a> tags that link to the individual bus lines
id_ = []    # page id of every bus line, kept as strings
for link in href:
    # The id is taken to be the first run of digits in the tag's HTML.
    digits = re.search(r'\d+', str(link))
    if digits is None:
        continue    # a link without any number cannot be a bus-line page; skip it
    id_.append(digits.group())

# Separators seen between the start and terminal names, in the order they are
# tried. '——' has to be tested before '—', because the latter is contained in it.
_SEPARATORS = ('——', '-', '—')


def _split_route(text):
    """Split a route title into its start and terminal station names.

    The first separator from ``_SEPARATORS`` that occurs in *text* is used;
    the first two fields of the split are returned, and any further fields
    are ignored.

    Returns:
        ``(start, terminal)``; when no separator occurs in *text* the whole
        text is returned as ``start`` and ``terminal`` is ``None``.
    """
    for separator in _SEPARATORS:
        if separator in text:
            fields = text.split(separator)
            return fields[0], fields[1]
    return text, None


# Without rotating the IP the crawler may get blocked part-way through.
# `count` is the index of the line being fetched and is printed for every
# request, so after an interruption set `count` to the last printed value
# and run this part again to resume from there.
count = 0
while count < len(id_):
    # Open the page of the current bus line.
    url = 'http://bus.hangzhou.com.cn/line.php?line_id=' + str(id_[count])
    response_ = requests.get(url, headers=headers, timeout=10)
    print('url:{} count:{}'.format(url, count))
    response_.raise_for_status()  # stop on an HTTP error rather than parse an error page
    soup = BeautifulSoup(response_.text, 'lxml')

    # The route title is the last <strong> inside the "main_title" element.
    main_title = soup.find(attrs={'class': 'main_title'})
    strong_tags = main_title.find_all('strong') if main_title is not None else []
    if not strong_tags:
        raise RuntimeError(
            'route title not found at {} (count={})'.format(url, count))
    start_terminal_ = strong_tags[-1]

    # Store the row; 'terminal' is left unset when the title has no separator.
    start, terminal = _split_route(start_terminal_.text)
    df.loc[count, 'count'] = count
    df.loc[count, 'start'] = start
    if terminal is not None:
        df.loc[count, 'terminal'] = terminal
    count = count + 1