爬虫之 requests 库爬取公交信息
爬取公交的信息
广州公交网站
https://guangzhou.8684.cn/
抓取广州市所有的公交信息
1、根据第一级页面链接获取第二级链接
右键检查元素
最初的页面的 url 是 https://guangzhou.8684.cn/
点击字母或数字进入下级页面时url的变化
所以通过正则匹配url,拼接url
2、进入第二级页面链接
同样查找规律
用xpath插件查找所需的内容
然后进入第三级链接获取最终的信息
gongjiao.py
import requests
from lxml import etree
from time import sleep
import csv
import re
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
# 1) Request module
# Level-1 page: discover the per-letter/digit listing pages.
def request_first_pages(url):
    """Fetch the index page and yield absolute URLs of the level-2 listing pages.

    The index groups bus lines under relative paths like ``/list1`` or
    ``/listB`` (a single digit or uppercase letter after ``/list``).
    Yields one absolute URL per *distinct* group.
    """
    res = requests.get(url=url, headers=headers)
    # Relative group links: "/list" followed by exactly one digit/uppercase letter.
    pat = re.compile(r'/list[0-9A-Z]')
    # findall may return the same path several times; dict.fromkeys dedupes
    # while keeping first-seen order.  Also avoids shadowing the builtin `list`
    # as the original code did.
    next_paths = dict.fromkeys(pat.findall(res.text))
    for next_path in next_paths:
        yield "https://guangzhou.8684.cn" + next_path
# Level-2 page: each listing page links to the individual bus-line pages.
def request_second_pages(pages):
    """For every level-2 listing URL, yield the absolute URL of each bus line."""
    base = "https://guangzhou.8684.cn"
    for page_url in pages:
        response = requests.get(url=page_url, headers=headers)
        # Each bus line is an <a href> inside div.stie_list
        # (the misspelled class name is the site's own markup, keep it as-is).
        doc = etree.HTML(response.text)
        hrefs = doc.xpath("//div[@class='stie_list']/a/@href")
        for href in hrefs:
            # hrefs are site-relative paths; make them absolute.
            yield base + href
# Level-3 page: fetch the final per-line detail HTML.
def request_third_pages(buslist):
    """Download each bus-line detail page and yield its raw HTML text."""
    for detail_url in buslist:
        print("正在请求:", detail_url)
        response = requests.get(url=detail_url, headers=headers)
        # Be polite to the server: pause half a second between requests.
        sleep(0.5)
        yield response.text
# 2) 解析模块
def analysis_html(html_list):
for html in html_list:
tree = etree.HTML(html)
item = {}
item["lineName"] = tree.xpath("//h1//text()")[0]
item["time"] = tree.xpath("//p[@class='bus_i_t4']/text()")[0]
item["price"] = tree.xpath("//p[@class='bus_i_t4']/text()")[1]
item["campony"] = tree.xpath("//p[@class='bus_i_t4']/a/text()")[0]
# 获取线路
lines = tree.xpath("//div[@class='bus_line_site ']")
# print(len(lines))
ls = lines[0].xpath(".//text()")
item["upline"] = [ls[i]+"_"+ls[i+1] for i in range(0,len(ls),2)]
# 判断线路是否有下行
if len(lines) > 1:
ls = lines[1].xpath(".//text()")
item["downline"] = [ls[i] + "_" + ls[i + 1] for i in range(0, len(ls), 2)]
# print(item)
yield item
# 3) Storage module
def write_to_csv(bus_infos):
    """Append each bus-info dict as one row of ``bus.csv``.

    Fixes over the original: the file handle is closed via ``with`` (it used
    to leak), output is explicit UTF-8 instead of the platform default, and
    the header row is only written when the file is empty — re-running the
    crawler no longer interleaves duplicate header rows into the data.
    """
    fieldnames = ["lineName", "time", "price", "campony", "upline", "downline"]
    with open('bus.csv', 'a+', newline='', encoding='utf-8') as fp:
        writer = csv.writer(fp)
        # 'a+' opens positioned at EOF; tell() == 0 means the file is empty.
        if fp.tell() == 0:
            writer.writerow(fieldnames)
        for bus_info in bus_infos:
            # .get() tolerates missing keys (e.g. one-way routes have no downline).
            writer.writerow([bus_info.get(key) for key in fieldnames])
if __name__ == '__main__':
    # Pipeline: index page -> letter pages -> line pages -> parsed dicts -> CSV.
    start_url = "https://guangzhou.8684.cn/"
    letter_pages = request_first_pages(start_url)
    line_pages = request_second_pages(letter_pages)
    detail_htmls = request_third_pages(line_pages)
    parsed_items = analysis_html(detail_htmls)
    write_to_csv(parsed_items)