Python crawler learning 1

This article walks through scraping real-time bus information from the Beijing 8684 bus site (https://beijing.8684.cn) with Python's urllib and BeautifulSoup: collecting the route links under the first-level list pages, parsing each route page for the bus name, type, operating hours and other details, and writing the results to a CSV file.
import urllib.request,csv
import time
from bs4 import BeautifulSoup as bs
from urllib.parse import urljoin
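These imports cover everything the script needs: urllib.request for fetching pages, BeautifulSoup for parsing HTML, urljoin for resolving relative links, csv for output and time for throttling requests. The basic fetch-and-parse pattern used throughout looks like this (a minimal sketch; the User-Agent header and the example.com URL are assumptions, not part of the original script):

import urllib.request
from bs4 import BeautifulSoup as bs

page_url='https://example.com'   # stand-in URL; the article targets https://beijing.8684.cn
# Some sites reject urllib's default client string, so a browser-like User-Agent is set here (assumption).
req=urllib.request.Request(page_url,headers={'User-Agent':'Mozilla/5.0'})
with urllib.request.urlopen(req) as resp:
    soup=bs(resp.read(),'html.parser')
print(soup.title.string if soup.title else 'no <title> found')   # quick sanity check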

Construct the URLs of the first-level list pages from the site's base URL:

url='https://beijing.8684.cn'
url_list=url+'/list%d'   # note the '/' separator; without it the two pieces run together
for k in range(1,10):    # list1 .. list9: route pages grouped by the route number's leading digit
    urls=url_list%k
    print(urls)

Result:

https://beijing.8684.cn/list1
https://beijing.8684.cn/list2
https://beijing.8684.cn/list3
https://beijing.8684.cn/list4
https://beijing.8684.cn/list5
https://beijing.8684.cn/list6
https://beijing.8684.cn/list7
https://beijing.8684.cn/list8
https://beijing.8684.cn/list9
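Joining by string concatenation only works when the '/' is there; urljoin from urllib.parse (already imported) resolves a relative path against the base URL and avoids this pitfall, which is why the functions below use it. A quick sketch:

from urllib.parse import urljoin

url='https://beijing.8684.cn'
# urljoin resolves a relative path against the base URL, with or without a leading '/'.
print(urljoin(url,'list1'))    # https://beijing.8684.cn/list1
print(urljoin(url,'/list1'))   # https://beijing.8684.cn/list1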

The get_page_url() method retrieves the real URLs of the individual bus routes:

def get_page_url(urls):
    html=urllib.request.urlopen(urls)
    soup=bs(html.read(),'html.parser')
    lu=soup.find('div',class_='list clearfix')   # container holding the route links
    hrefs_a=lu.find_all('a')
    url_real=[]
    for hrefs_code in hrefs_a:
        # resolve each relative href against the site root ('url' is the global defined below)
        url_real.append(urljoin(url,hrefs_code['href']))
    return url_real
url='https://beijing.8684.cn'
url_list=url+'/list%d'
for k in range(1,2):   # only list1 for now (routes whose number starts with 1)
    urls=url_list%k
    bus_list_url=get_page_url(urls)
    for url_cur in bus_list_url:
        print(url_cur)
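If the request fails or 8684 changes its layout, soup.find('div', class_='list clearfix') returns None and the following find_all call raises AttributeError. A more defensive variant might look like this (a sketch; the empty-list fallback and the explicit base_url parameter are additions, not part of the original):

import urllib.request
from bs4 import BeautifulSoup as bs
from urllib.parse import urljoin

def get_page_url_safe(base_url,list_page_url):
    html=urllib.request.urlopen(list_page_url)
    soup=bs(html.read(),'html.parser')
    container=soup.find('div',class_='list clearfix')
    if container is None:
        # layout changed or the page did not load as expected: return nothing
        return []
    # resolve every relative href against the site root, skipping anchors without href
    return [urljoin(base_url,a['href']) for a in container.find_all('a') if a.get('href')]

The complete script, putting the pieces together and writing each route's details to bus_info.csv: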
import urllib.request,csv
import time
from bs4 import BeautifulSoup as bs
from urllib.parse import urljoin
cs=open('bus_info.csv','w',newline='',encoding='utf-8')   # utf-8 so Chinese fields are written correctly on any platform
writer=csv.writer(cs)
def get_page_url(urls):
    html=urllib.request.urlopen(urls)
    soup=bs(html.read(),'html.parser')
    lu=soup.find('div',class_='list clearfix')
    hrefs_a=lu.find_all('a')
    url_real=[]
    for hrefs_code in hrefs_a:
        url_real.append(urljoin(url,hrefs_code['href']))
    return url_real
def get_page_info(url_to):
    html=urllib.request.urlopen(url_to)
    soup=bs(html.read(),'html.parser')
    # each field is wrapped in try/except because not every route page has every element
    try:
        # route name from the breadcrumb bar
        bus_name=soup.select('body > div.breadcrumbs.depth.mb15 > span.cr_crumbs_txt')[0].string
    except Exception:
        bus_name=None
    try:
        # route type; strip the surrounding [] from the link text
        bus_type=soup.select('body > div.layout.layout--728-250 > div.layout-left > div.bus-lzinfo.mb20 > div.info > h1 > a')[0].string.strip('[]')
    except Exception:
        bus_type=None
    try:
        # operating hours
        bus_time=soup.select('body > div.layout.layout--728-250 > div.layout-left > div.bus-lzinfo.mb20 > div.info > ul > li:nth-child(1)')[0].string
    except Exception:
        bus_time=None
    try:
        # ticket price
        ticket=soup.select('body > div.layout.layout--728-250 > div.layout-left > div.bus-lzinfo.mb20 > div.info > ul > li:nth-child(2)')[0].string
    except Exception:
        ticket=None
    try:
        # operating company (gongsi)
        gongsi=soup.select('body > div.layout.layout--728-250 > div.layout-left > div.bus-lzinfo.mb20 > div.info > ul > li:nth-child(3) > a')[0].string
    except Exception:
        gongsi=None
    try:
        # last-update time (gengxin): the text just before the WeChat promo button
        gengxin=soup.find(class_="wechat-promote tooltip-btn").previous_sibling
    except Exception:
        gengxin=None
    # print the scraped fields and write them to the CSV as one row per route
    result_list=[bus_name,bus_type,bus_time,ticket,gongsi,gengxin]
    for k in result_list:
        print(k)
    writer.writerow(result_list)
if __name__=='__main__':
    url='https://beijing.8684.cn'
    url_list=url+'/list%d'
    for k in range(1,10):   # list1 .. list9: route pages grouped by the route number's leading digit
        urls=url_list%k
        bus_list_url=get_page_url(urls)
        for url_cur in bus_list_url:
            get_page_info(url_cur)
            time.sleep(2)   # pause between requests to avoid hammering the site
    cs.close()   # flush and close the CSV once the crawl is done
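Once the run finishes, the file can be read back to spot-check the scrape (a small sketch; the column order matches result_list above):

import csv

with open('bus_info.csv',newline='',encoding='utf-8') as f:
    for row in csv.reader(f):
        # each row: bus name, type, operating hours, ticket price, company, last update
        print(row)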
 
