import urllib.request
import csv
import time
from bs4 import BeautifulSoup as bs
from urllib.parse import urljoin
Construct a URL and get the listing-page URLs under the first-level page:
url = 'https://beijing.8684.cn'
url_list = url + '/list%d'
for k in range(1, 10):
    urls = url_list % k
    print(urls)
Result:
https://beijing.8684.cn/list1
https://beijing.8684.cn/list2
https://beijing.8684.cn/list3
https://beijing.8684.cn/list4
https://beijing.8684.cn/list5
https://beijing.8684.cn/list6
https://beijing.8684.cn/list7
https://beijing.8684.cn/list8
https://beijing.8684.cn/list9
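Because the '/' is easy to drop when the path is glued on with plain string concatenation, the same listing URLs can also be built with urljoin (already imported above). This is only a sketch of an alternative; the rest of the post keeps the '%'-formatting approach:
# Alternative sketch: let urljoin insert the slash between host and path.
from urllib.parse import urljoin

base = 'https://beijing.8684.cn'
for k in range(1, 10):
    print(urljoin(base, 'list%d' % k))   # e.g. https://beijing.8684.cn/list1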
The get_page_url() method gets the real URL of each bus route:
def get_page_url(urls):
    html = urllib.request.urlopen(urls)
    soup = bs(html.read(), 'html.parser')
    lu = soup.find('div', class_='list clearfix')   # container holding the route links
    hrefs_a = lu.find_all('a')
    url_real = []
    for hrefs_code in hrefs_a:
        url_real.append(urljoin(url, hrefs_code['href']))
    return url_real
url = 'https://beijing.8684.cn'
url_list = url + '/list%d'
for k in range(1, 2):   # just list1 (route numbers starting with 1), as a quick test
    urls = url_list % k
    bus_list_url = get_page_url(urls)
    for url_cur in bus_list_url:
        print(url_cur)
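One practical caveat not covered above: urllib.request sends a default Python User-Agent, and some sites answer it with an error or a captcha page. If urlopen() starts failing here, a hedged workaround is to wrap each URL in a Request carrying a browser-like header; the header value below is only illustrative, not something the site is known to require:
# Sketch: fetch a page with a browser-like User-Agent (value is an assumption).
import urllib.request

def open_with_ua(page_url):
    req = urllib.request.Request(
        page_url,
        headers={'User-Agent': 'Mozilla/5.0'}   # illustrative header only
    )
    return urllib.request.urlopen(req)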
import urllib.request
import csv
import time
from bs4 import BeautifulSoup as bs
from urllib.parse import urljoin

# utf-8 keeps the Chinese route names readable regardless of the OS default encoding
cs = open('bus_info.csv', 'w', newline='', encoding='utf-8')
writer = csv.writer(cs)
def get_page_url(urls):
    html = urllib.request.urlopen(urls)
    soup = bs(html.read(), 'html.parser')
    lu = soup.find('div', class_='list clearfix')   # container holding the route links
    hrefs_a = lu.find_all('a')
    url_real = []
    for hrefs_code in hrefs_a:
        url_real.append(urljoin(url, hrefs_code['href']))
    return url_real
def get_page_info(url_to):
    html = urllib.request.urlopen(url_to)
    soup = bs(html.read(), 'html.parser')
    # Each field is wrapped in try/except because some route pages are missing items.
    try:
        bus_name = soup.select('body > div.breadcrumbs.depth.mb15 > span.cr_crumbs_txt')[0].string
    except Exception:
        bus_name = None
    try:
        bus_type = soup.select('body > div.layout.layout--728-250 > div.layout-left > '
                               'div.bus-lzinfo.mb20 > div.info > h1 > a')[0].string.strip('[]')
    except Exception:
        bus_type = None
    try:
        bus_time = soup.select('body > div.layout.layout--728-250 > div.layout-left > '
                               'div.bus-lzinfo.mb20 > div.info > ul > li:nth-child(1)')[0].string
    except Exception:
        bus_time = None
    try:
        ticket = soup.select('body > div.layout.layout--728-250 > div.layout-left > '
                             'div.bus-lzinfo.mb20 > div.info > ul > li:nth-child(2)')[0].string
    except Exception:
        ticket = None
    try:
        gongsi = soup.select('body > div.layout.layout--728-250 > div.layout-left > '
                             'div.bus-lzinfo.mb20 > div.info > ul > li:nth-child(3) > a')[0].string
    except Exception:
        gongsi = None
    try:
        gengxin = soup.find(class_='wechat-promote tooltip-btn').previous_sibling
    except Exception:
        gengxin = None
    # Print and write out the information for this bus route
    result_list = [bus_name, bus_type, bus_time, ticket, gongsi, gengxin]
    for k in result_list:
        print(k)
    writer.writerow(result_list)
if __name__ == '__main__':
    url = 'https://beijing.8684.cn'
    url_list = url + '/list%d'
    for k in range(1, 10):   # list1 ... list9: route numbers starting with 1 through 9
        urls = url_list % k
        bus_list_url = get_page_url(urls)
        for url_cur in bus_list_url:
            get_page_info(url_cur)
            time.sleep(2)   # pause between detail pages to avoid hammering the site
    cs.close()   # flush and close the CSV file once the crawl is done
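Once the run finishes and the file has been closed, the rows can be spot-checked by reading them back with csv.reader. A minimal sketch, assuming bus_info.csv was written with UTF-8 as above:
# Quick check: print the first few rows back out of the CSV.
import csv

with open('bus_info.csv', newline='', encoding='utf-8') as f:
    for i, row in enumerate(csv.reader(f)):
        print(row)
        if i >= 4:   # only show the first five routes
            break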