import requests
from lxml import etree
# Browser-like User-Agent so the target site serves normal pages to the crawler
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36',
}
# Module-level accumulator: parse_third() appends one dict per bus route,
# main() writes them all to disk at the end of the crawl
items = []
def parse_navigation():
    """Fetch the site front page and return all route-index links.

    Returns:
        list[str]: relative hrefs to the index pages — routes whose names
        start with a digit, followed by routes whose names start with a letter.
    """
    url = 'https://jimo.8684.cn/'
    # timeout so a stalled connection cannot hang the whole crawl
    r = requests.get(url=url, headers=headers, timeout=10)
    tree = etree.HTML(r.text)
    # Index links for routes whose names start with a digit
    number_href_list = tree.xpath('//div[@class="bus-layer depth w120"]/div[@class="pl10"][1]/div[@class="list"]/a/@href')
    # Index links for routes whose names start with a letter
    char_href_list = tree.xpath('//div[@class="bus-layer depth w120"]/div[@class="pl10"][2]/div[@class="list"]/a/@href')
    # Return every index link that needs to be crawled
    return number_href_list + char_href_list
def parse_third(content):
    """Parse one route-detail page and append its record to `items`.

    Args:
        content: HTML text of a single bus-route detail page.

    Side effect: appends a dict describing the route to the module-level
    `items` list.
    """
    tree = etree.HTML(content)
    # Route name, e.g. "1路"
    bus_number = tree.xpath('//div[@class="name"]/text()')[0]
    # Operating hours (list of text fragments)
    bus_time = tree.xpath('//ul[@class="bus-desc"]/li[1]/text()')
    # Fare information
    bus_price = tree.xpath('//ul[@class="bus-desc"]/li[2]/text()')
    # Up-direction total stop count
    bus_total_up = tree.xpath('//div[@class="total"]/text()')[0]
    # Up-direction stop names
    bus_line_up = tree.xpath('//div[@class="bus-lzlist mb15"][1]/ol/li/a/text()')
    # A one-way route has no second "total" element, so the [1] index
    # raises IndexError — that is the only failure expected here.
    try:
        # Down-direction total stop count
        bus_total_down = tree.xpath('//div[@class="total"]/text()')[1]
        # Down-direction stop names
        bus_line_down = tree.xpath('//div[@class="bus-lzlist mb15"][2]/ol/li/a/text()')
    except IndexError:
        bus_total_down = ''
        bus_line_down = ''
    # Collect this route's information into one record
    item = {
        '线路名': bus_number,
        '运行时间': bus_time,
        '票价信息': bus_price,
        '上行站数': bus_total_up,
        '上行路线': bus_line_up,
        '下行站数': bus_total_down,
        '下行路线': bus_line_down
    }
    items.append(item)
def parse_second_route(content):
    """Extract every route link from one index page and crawl each detail page.

    Args:
        content: HTML text of a route-index page.
    """
    tree = etree.HTML(content)
    # Relative hrefs pointing at the individual route-detail pages
    for href in tree.xpath('//div[@class="list clearfix"]/a/@href'):
        detail_url = 'https://jimo.8684.cn' + href
        response = requests.get(url=detail_url, headers=headers)
        # Hand the detail page over for field extraction
        parse_third(response.text)
def parse_second(navi_list):
    """Crawl every route-index page listed in navi_list.

    Args:
        navi_list: relative hrefs returned by parse_navigation().
    """
    for first_url in navi_list:
        first_url = 'https://jimo.8684.cn' + first_url
        print('开始爬取%s所有的公交信息' % first_url)
        r = requests.get(url=first_url, headers=headers)
        # Parse the index page to find each route's detail URL
        parse_second_route(r.text)
        print('结束爬取%s所有的公交信息' % first_url)
def main():
    """Crawl the whole site and dump every collected route record to a file."""
    navi_list = parse_navigation()
    # Crawl the second-level index pages (routes grouped by leading digit/letter)
    parse_second(navi_list)
    # `with` guarantees the file is flushed and closed even if a write raises
    with open('即墨公交.txt', 'w', encoding='utf8') as fp:
        for item in items:
            fp.write(str(item) + '\n')
if __name__ == '__main__':
    main()
# Python爬虫实例----公交线路
# 最新推荐文章于 2024-05-08 14:06:39 发布