-
- requests模块获取每一页的html文件
- 主要利用BeautifulSoup进行提取
- 需要注意:港澳台地区的html源码不规范,所以这里用到了html5lib解析器;其他地区省会与城市这里要进行区分
- 代码如下:
import requests
from bs4 import BeautifulSoup
def get_source(url):
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'}
response = requests.get(url, headers=headers)
text = response.content.decode('utf-8')
soup = BeautifulSoup(text, 'html5lib')
conMidtab = soup.find('div', class_="conMidtab")
tables = conMidtab.find_all('table')
for table in tables:
trs = table.find_all('tr')[2:]
for index,tr in enumerate(trs):
tds = tr.find_all('td')
city_td = tds[0]
if index == 0:
city_td = tds[1]
city = list(city_td.stripped_strings)[0]
temp_td = tds[-2]
temp = list(temp_td.stripped_strings)[0]
print('城市:',city, '最低气温:',temp)
def main():
url_list = [
'http://www.weather.com.cn/textFC/hb.shtml',
'http://www.weather.com.cn/textFC/db.shtml',
'http://www.weather.com.cn/textFC/hd.shtml',
'http://www.weather.com.cn/textFC/hz.shtml',
'http://www.weather.com.cn/textFC/hn.shtml',
'http://www.weather.com.cn/textFC/xb.shtml',
'http://www.weather.com.cn/textFC/xn.shtml',
'http://www.weather.com.cn/textFC/gat.shtml'
]
for url in url_list:
get_source(url)
if __name__ == '__main__':
main()