Python 爬取杭州公交车始发站-终点站(正确代码,完整注释)

杭州公交总览
http://bus.hangzhou.com.cn/all_line.php

找到每一路公交车对应网址
在这里插入图片描述

# Collect the numeric id of every bus line from the overview page.
url = 'http://bus.hangzhou.com.cn/all_line.php'

# The header value must be the bare browser string; it must not repeat the
# "User-Agent:" field name.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}


response = requests.get(url, headers=headers, timeout=5)

soup = BeautifulSoup(response.text, 'lxml')   # parse the overview page
line_list = soup.find(attrs={'class': 'line_all'})
if line_list is None:
    # Without this check the failure would surface as an opaque
    # AttributeError on None.
    raise LookupError('no element with class "line_all" found at ' + url)
href = line_list.find_all('a')    # one <a> tag per bus line
id_ = []    # id of each line, kept as a string of digits
for link in href:
    # The first run of digits in the tag's markup is used as the line id.
    match = re.search(r'\d+', str(link))
    if match:    # links without any digits are skipped instead of crashing
        id_.append(match.group())

任选一路车进入其页面
http://bus.hangzhou.com.cn/line.php?line_id=3

在这里插入图片描述

# Excerpt from the body of the scraping loop: download the page of one bus
# line and pick out the element holding its origin/terminal text.
# The string literal below reads "open the page of each bus line".
'''
    进入每一辆车对应网页
    '''
    # `id_`, `count` and `headers` are not defined inside this excerpt.
    url = 'http://bus.hangzhou.com.cn/line.php?line_id=' + str(id_[count])
    response_ = requests.get(url, headers=headers, timeout=10)
    # Progress output: the URL just requested and the current index.
    print('url:{} count:{}'.format(url, count))
    soup = BeautifulSoup(response_.text, 'lxml')
    # Locate the origin/terminal text: the last <strong> inside the element
    # whose class is "main_title".
    start_terminal_ = soup.find(attrs={'class': 'main_title'}).find_all('strong')[-1]

完整代码

import requests
from bs4 import BeautifulSoup
import random
import tqdm as tqdm

import pandas as pd
import numpy as np
import re
# Scrape the origin and terminal stop of every Hangzhou bus line into a
# DataFrame with the columns 'count', 'start' and 'terminal'.

ALL_LINES_URL = 'http://bus.hangzhou.com.cn/all_line.php'
LINE_URL_PREFIX = 'http://bus.hangzhou.com.cn/line.php?line_id='

# Separators found between the two stop names, tried in this order. The
# double dash has to be checked before the single one, which is a substring
# of it.
ROUTE_SEPARATORS = ('——', '-', '—')

# The header value must be the bare browser string; it must not repeat the
# "User-Agent:" field name.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}


def split_route(text):
    """Split a route title into ``(start, terminal)``.

    The first separator from ``ROUTE_SEPARATORS`` that occurs in *text* is
    used; the first two fields of the split are returned. When no separator
    occurs, the whole text is returned as the start and the terminal is
    ``None``.
    """
    for separator in ROUTE_SEPARATORS:
        if separator in text:
            fields = text.split(separator)
            return fields[0], fields[1]
    return text, None


def fetch_line_ids():
    """Return the id (a string of digits) of every line on the overview page.

    Raises:
        requests.RequestException: if the overview page cannot be fetched.
        LookupError: if the page has no element with class ``line_all``.
    """
    response = requests.get(ALL_LINES_URL, headers=headers, timeout=5)
    soup = BeautifulSoup(response.text, 'lxml')
    line_list = soup.find(attrs={'class': 'line_all'})
    if line_list is None:
        raise LookupError(
            'no element with class "line_all" found at ' + ALL_LINES_URL)
    ids = []
    for link in line_list.find_all('a'):
        # The first run of digits in the tag's markup is used as the line id.
        match = re.search(r'\d+', str(link))
        if match:    # links without any digits are skipped
            ids.append(match.group())
    return ids


def fetch_route_text(url):
    """Return the origin/terminal text shown on one line's page.

    The text is taken from the last ``<strong>`` inside the element whose
    class is ``main_title``.

    Raises:
        requests.RequestException: if the page cannot be fetched.
        LookupError: if the expected elements are missing from the page.
    """
    response = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(response.text, 'lxml')
    title = soup.find(attrs={'class': 'main_title'})
    if title is None:
        raise LookupError('no element with class "main_title" at ' + url)
    strong_tags = title.find_all('strong')
    if not strong_tags:
        raise LookupError('no <strong> inside "main_title" at ' + url)
    return strong_tags[-1].text


def scrape_routes(line_ids, start_index=0):
    """Collect start/terminal stops for ``line_ids[start_index:]``.

    Rows are labelled with the position of the line in *line_ids*. If a page
    cannot be fetched or parsed (for instance because the IP got blocked),
    the loop stops, reports the index it stopped at, and the rows gathered so
    far are returned; pass that index as *start_index* to resume later.
    """
    frame = pd.DataFrame(columns=['count', 'start', 'terminal'])
    for index in range(start_index, len(line_ids)):
        url = LINE_URL_PREFIX + str(line_ids[index])
        try:
            route_text = fetch_route_text(url)
        except (requests.RequestException, LookupError) as error:
            print('stopped at count:{} ({})'.format(index, error))
            break
        print('url:{} count:{}'.format(url, index))
        start, terminal = split_route(route_text)
        frame.loc[index, 'count'] = index
        frame.loc[index, 'start'] = start
        if terminal is not None:
            # Without a separator the terminal cell is left unset.
            frame.loc[index, 'terminal'] = terminal
    return frame


if __name__ == '__main__':
    id_ = fetch_line_ids()
    # Change `count` to the index reported by a previous, interrupted run to
    # continue from there.
    count = 0
    df = scrape_routes(id_, start_index=count)

结果展示

在这里插入图片描述

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值