Python crawler learning 1

This article walks through scraping real-time bus information from the Beijing 8684 bus site (https://beijing.8684.cn) with Python's urllib and BeautifulSoup: collecting the route links under the first-level list pages, parsing each route page for the bus name, type, operating hours and other details, and writing the results to a CSV file.
import urllib.request,csv
import time
from bs4 import BeautifulSoup as bs
from urllib.parse import urljoin
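These imports cover everything the script needs: urllib.request for fetching pages, BeautifulSoup for parsing HTML, urljoin for resolving relative links, csv for output and time for throttling requests. The basic fetch-and-parse pattern used throughout looks like this (a minimal sketch; the User-Agent header and the example.com URL are assumptions, not part of the original script):

import urllib.request
from bs4 import BeautifulSoup as bs

page_url='https://example.com'   # stand-in URL; the article targets https://beijing.8684.cn
# Some sites reject urllib's default client string, so a browser-like User-Agent is set here (assumption).
req=urllib.request.Request(page_url,headers={'User-Agent':'Mozilla/5.0'})
with urllib.request.urlopen(req) as resp:
    soup=bs(resp.read(),'html.parser')
print(soup.title.string if soup.title else 'no <title> found')   # quick sanity check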

Construct the URLs of the first-level list pages from the site's base URL:

url='https://beijing.8684.cn'
url_list=url+'/list%d'   # note the '/' separator; without it the two pieces run together
for k in range(1,10):    # list1 .. list9: route pages grouped by the route number's leading digit
    urls=url_list%k
    print(urls)

Result:

https://beijing.8684.cn/list1
https://beijing.8684.cn/list2
https://beijing.8684.cn/list3
https://beijing.8684.cn/list4
https://beijing.8684.cn/list5
https://beijing.8684.cn/list6
https://beijing.8684.cn/list7
https://beijing.8684.cn/list8
https://beijing.8684.cn/list9
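Joining by string concatenation only works when the '/' is there; urljoin from urllib.parse (already imported) resolves a relative path against the base URL and avoids this pitfall, which is why the functions below use it. A quick sketch:

from urllib.parse import urljoin

url='https://beijing.8684.cn'
# urljoin resolves a relative path against the base URL, with or without a leading '/'.
print(urljoin(url,'list1'))    # https://beijing.8684.cn/list1
print(urljoin(url,'/list1'))   # https://beijing.8684.cn/list1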

The get_page_url() method retrieves the real URLs of the individual bus routes:

def get_page_url(urls):
    html=urllib.request.urlopen(urls)
    soup=bs(html.read(),'html.parser')
    lu=soup.find('div',class_='list clearfix')   # container holding the route links
    hrefs_a=lu.find_all('a')
    url_real=[]
    for hrefs_code in hrefs_a:
        # resolve each relative href against the site root ('url' is the global defined below)
        url_real.append(urljoin(url,hrefs_code['href']))
    return url_real
url='https://beijing.8684.cn'
url_list=url+'/list%d'
for k in range(1,2):   # only list1 for now (routes whose number starts with 1)
    urls=url_list%k
    bus_list_url=get_page_url(urls)
    for url_cur in bus_list_url:
        print(url_cur)
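If the request fails or 8684 changes its layout, soup.find('div', class_='list clearfix') returns None and the following find_all call raises AttributeError. A more defensive variant might look like this (a sketch; the empty-list fallback and the explicit base_url parameter are additions, not part of the original):

import urllib.request
from bs4 import BeautifulSoup as bs
from urllib.parse import urljoin

def get_page_url_safe(base_url,list_page_url):
    html=urllib.request.urlopen(list_page_url)
    soup=bs(html.read(),'html.parser')
    container=soup.find('div',class_='list clearfix')
    if container is None:
        # layout changed or the page did not load as expected: return nothing
        return []
    # resolve every relative href against the site root, skipping anchors without href
    return [urljoin(base_url,a['href']) for a in container.find_all('a') if a.get('href')]

The complete script, putting the pieces together and writing each route's details to bus_info.csv: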
import urllib.request,csv
import time
from bs4 import BeautifulSoup as bs
from urllib.parse import urljoin
cs=open('bus_info.csv','w',newline='',encoding='utf-8')   # utf-8 so Chinese fields are written correctly on any platform
writer=csv.writer(cs)
def get_page_url(urls):
    html=urllib.request.urlopen(urls)
    soup=bs(html.read(),'html.parser')
    lu=soup.find('div',class_='list clearfix')
    hrefs_a=lu.find_all('a')
    url_real=[]
    for hrefs_code in hrefs_a:
        url_real.append(urljoin(url,hrefs_code['href']))
    return url_real
def get_page_info(url_to):
    html=urllib.request.urlopen(url_to)
    soup=bs(html.read(),'html.parser')
    # each field is wrapped in try/except because not every route page has every element
    try:
        # route name from the breadcrumb bar
        bus_name=soup.select('body > div.breadcrumbs.depth.mb15 > span.cr_crumbs_txt')[0].string
    except Exception:
        bus_name=None
    try:
        # route type; strip the surrounding [] from the link text
        bus_type=soup.select('body > div.layout.layout--728-250 > div.layout-left > div.bus-lzinfo.mb20 > div.info > h1 > a')[0].string.strip('[]')
    except Exception:
        bus_type=None
    try:
        # operating hours
        bus_time=soup.select('body > div.layout.layout--728-250 > div.layout-left > div.bus-lzinfo.mb20 > div.info > ul > li:nth-child(1)')[0].string
    except Exception:
        bus_time=None
    try:
        # ticket price
        ticket=soup.select('body > div.layout.layout--728-250 > div.layout-left > div.bus-lzinfo.mb20 > div.info > ul > li:nth-child(2)')[0].string
    except Exception:
        ticket=None
    try:
        # operating company (gongsi)
        gongsi=soup.select('body > div.layout.layout--728-250 > div.layout-left > div.bus-lzinfo.mb20 > div.info > ul > li:nth-child(3) > a')[0].string
    except Exception:
        gongsi=None
    try:
        # last-update time (gengxin): the text just before the WeChat promo button
        gengxin=soup.find(class_="wechat-promote tooltip-btn").previous_sibling
    except Exception:
        gengxin=None
    # print the scraped fields and write them to the CSV as one row per route
    result_list=[bus_name,bus_type,bus_time,ticket,gongsi,gengxin]
    for k in result_list:
        print(k)
    writer.writerow(result_list)
if __name__=='__main__':
    url='https://beijing.8684.cn'
    url_list=url+'/list%d'
    for k in range(1,10):   # list1 .. list9: route pages grouped by the route number's leading digit
        urls=url_list%k
        bus_list_url=get_page_url(urls)
        for url_cur in bus_list_url:
            get_page_info(url_cur)
            time.sleep(2)   # pause between requests to avoid hammering the site
    cs.close()   # flush and close the CSV once the crawl is done
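Once the run finishes, the file can be read back to spot-check the scrape (a small sketch; the column order matches result_list above):

import csv

with open('bus_info.csv',newline='',encoding='utf-8') as f:
    for row in csv.reader(f):
        # each row: bus name, type, operating hours, ticket price, company, last update
        print(row)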
 
