import requests
from bs4 import BeautifulSoup
from pyecharts import Bar
# Accumulator for every scraped record: one dict per city,
# shaped {"city": str, "min_temp": int}. Filled by parse_page().
ALL_DATA = []
# Browser-like User-Agent so weather.com.cn serves the normal HTML page.
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36"
}
def parse_page(url):
    """Fetch one regional forecast page and collect minimum temperatures.

    Appends one ``{"city": str, "min_temp": int}`` dict per city row to the
    module-level ``ALL_DATA`` list and echoes it to stdout.

    :param url: URL of a weather.com.cn ``textFC`` regional page.
    """
    # timeout guards against the request hanging forever on a slow server
    response = requests.get(url, headers=headers, timeout=10)
    text = response.content.decode("utf-8")
    # html5lib is more fault-tolerant than lxml for this site's sloppy markup
    soup = BeautifulSoup(text, "html5lib")
    conMidtab = soup.find("div", class_='conMidtab')
    if conMidtab is None:
        # Layout changed or the request was blocked; skip instead of crashing.
        print("no conMidtab div found on %s, skipping" % url)
        return
    for table in conMidtab.find_all("table"):
        # The first two <tr> are header rows — skip them.
        for index, tr in enumerate(table.find_all('tr')[2:]):
            tds = tr.find_all('td')
            # In the first data row of each table the province name occupies
            # td[0], so the city name sits in td[1]; afterwards it is td[0].
            city_td = tds[1] if index == 0 else tds[0]
            city = list(city_td.stripped_strings)[0]
            # The second-to-last cell holds the day's minimum temperature.
            min_temp = list(tds[-2].stripped_strings)[0]
            record = {"city": city, "min_temp": int(min_temp)}
            ALL_DATA.append(record)
            print(record)
# Keep only the first ten records for charting.
# Charting is done with pyecharts (pip install pyecharts).
data = ALL_DATA[:10]
def main():
    """Crawl every regional forecast page of weather.com.cn.

    Each region's city rows are accumulated into the module-level ALL_DATA
    by parse_page(). Entries look like {'city': '拉萨', 'min_temp': -2};
    the intended next step is to sort them by minimum temperature.
    """
    page_template = "http://www.weather.com.cn/textFC/{}.shtml"
    # hb/db/hd/hz/hn/xb/xn = the mainland regions, gat = HK/Macao/Taiwan.
    region_codes = ("hb", "db", "hd", "hz", "hn", "xb", "xn", "gat")
    for code in region_codes:
        parse_page(page_template.format(code))
    # Analysis step (not implemented here): sort ALL_DATA by 'min_temp'.
# --- Blog-post metadata pasted along with the code (kept as comments) ---
# 爬虫之中国天气网 (Web scraping: the China Weather Network)
# Latest recommended article published 2024-06-27 17:20:29.
# The article explains how to parse China Weather Network HTML pages with the
# html5lib and BeautifulSoup libraries to scrape live weather data, covering
# the crawler steps in detail: requesting the page, parsing the HTML, and
# extracting the key fields.
# Abstract generated automatically by CSDN.