Python---运用BeautifulSoup爬取中国天气网

最新推荐文章于 2024-06-05 11:29:15 发布

Joe_yoy

最新推荐文章于 2024-06-05 11:29:15 发布

阅读量1.1k

点赞数 1

分类专栏： Python 文章标签：爬虫

本文链接：https://blog.csdn.net/YOU_hunter/article/details/82993824

版权

Python 专栏收录该内容

9 篇文章 1 订阅

订阅专栏

昨天做了一个爬虫小任务，要求运用BeautifulSoup。弄得我要死了，不过今天总算解决了，虽然方法可能有点笨，但是总归是解决了。

方法并未封装。

from bs4 import BeautifulSoup
import requests

# Flat script: download one forecast page from weather.com.cn and print, for
# every table row that carries forecast cells, the daytime and night-time
# weather, wind and temperature values.
url = 'http://www.weather.com.cn/textFC/hb.shtml'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.9 Safari/537.36',
    'charset': 'utf-8',
    'Referer': 'http://www.weather.com.cn/textFC/hb.shtml'
}

# Without a timeout, requests waits indefinitely on a stalled connection.
res = requests.get(url=url, headers=headers, timeout=10)
# Force UTF-8 so res.text is not decoded with a guessed encoding.
res.encoding = 'utf-8'

soup = BeautifulSoup(res.text, 'lxml')

# Cells are located by their fixed `width` attribute, so this depends on the
# exact markup the page uses for each column.
for div in soup.select('div.conMidtab2'):
    for tr in div.select('tr'):
        # City name: the link inside the width=83 cell, when the row has one.
        city_td = tr.find('td', width='83')
        if city_td is not None and city_td.a is not None:
            print(city_td.a.string)

        # Rows without a width=89 cell carry no forecast data; skip them.
        day_weather_td = tr.find('td', width='89')
        if day_weather_td is None:
            continue

        day_wind_td = tr.find('td', width='162')
        night_wind_td = tr.find('td', width='177')

        print('上午：')
        print(day_weather_td.string)  # weather
        # contents[1] and contents[3] pick the second and fourth child nodes
        # of the wind cell (presumably the two <span> elements separated by
        # whitespace text nodes; verify against the page markup).
        print(day_wind_td.contents[1].string)  # wind
        print(day_wind_td.contents[3].string)  # wind
        print('最高温度' + tr.find('td', width='92').string)  # highest temperature
        print('晚上：')
        print(tr.find('td', width='98').string)  # weather
        print(night_wind_td.contents[1].string)  # wind
        print(night_wind_td.contents[3].string)  # wind
        print('最低温度' + tr.find('td', width='86').string)  # lowest temperature
        print('*****************')

---------------------------------------------------------------------------------------------

添加一下运用xpath爬取的方法：

import json

import requests
from lxml import etree

def get_url(url, timeout=10):
    """Yield an absolute URL for every link in the page's tab list.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Yields:
        ``'http://www.weather.com.cn'`` followed by each ``href`` found under
        the ``<ul class="lq_contentboxTab2">`` element.
    """
    # Without a timeout, requests waits indefinitely on a stalled connection.
    res = requests.get(url=url, timeout=timeout)
    page = etree.HTML(res.text)
    hrefs = page.xpath('//ul[@class="lq_contentboxTab2"]//a/@href')
    for href in hrefs:
        # The hrefs are presumably site-relative paths (TODO confirm); the
        # host is prepended unconditionally.
        yield 'http://www.weather.com.cn' + href

def get_html(url, timeout=10):
    """Download *url* and return the response body decoded as UTF-8.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        The response text.
    """
    # Without a timeout, requests waits indefinitely on a stalled connection.
    res = requests.get(url=url, timeout=timeout)
    # Force UTF-8 so res.text is not decoded with a guessed encoding.
    res.encoding = 'utf-8'
    return res.text

def parse_html(html):
    """Build one record per data row from every ``<table>`` in *html*.

    Args:
        html: HTML source text of a page.

    Returns:
        A list of dicts with the keys ``'city'``, ``'describe'``,
        ``'d_wind'``, ``'s_wind'``, ``'h_tem'``, ``'l_tem'``, ``'date'`` and
        ``'provice'`` (spelling kept because it is part of the output).

    Raises:
        IndexError: If a table or row lacks one of the expected cells.
    """
    root = etree.HTML(html)
    aList = []
    for table in root.xpath('//table'):
        # Only rows after the first three are read as data rows.
        # NOTE(review): the third row supplies the province name below and is
        # itself excluded by the [3:] slice, so any data it carries is not
        # emitted. Confirm this is intended.
        rows = table.xpath('./tr')[3:]
        # Table-level values shared by every record of this table.
        date_time = table.xpath('./tr[1]/td[3]/text()')[0].replace('白天', '')
        provice = table.xpath('./tr[3]/td[1]/a/text()')[0]

        for row in rows:
            aList.append({
                'city': row.xpath('./td[1]/a/text()')[0],
                'describe': row.xpath('./td[2]/text()')[0],
                # First and second <span> of the third cell.
                'd_wind': row.xpath('./td[3]/span[1]/text()')[0],
                's_wind': row.xpath('./td[3]/span[2]/text()')[0],
                'h_tem': row.xpath('./td[4]/text()')[0],
                'l_tem': row.xpath('./td[7]/text()')[0],
                'date': date_time,
                'provice': provice,
            })

    return aList

def go():
    """Crawl every page linked from the index, print and save each record.

    Each record returned by ``parse_html`` is printed and appended to
    ``tianqi.json`` as one JSON object per line.
    """
    index_url = 'http://www.weather.com.cn/textFC/hb.shtml'
    for page_url in get_url(index_url):
        records = parse_html(get_html(page_url))
        if not records:
            # Nothing to write; avoid touching the file for an empty page.
            continue
        # ensure_ascii=False emits raw non-ASCII text, so the file encoding
        # must be explicit instead of the platform default.
        with open('tianqi.json', 'a', encoding='utf-8') as fb:
            for record in records:
                print(record)
                json.dump(record, fb, ensure_ascii=False)
                # Separate records with a newline; back-to-back objects with
                # no delimiter cannot be parsed back.
                fb.write('\n')

# Start the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":
    go()

Joe_yoy

关注

1
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
Python---运用BeautifulSoup爬取中国天气网

昨天做了一个爬虫小任务，要求运用BeautifulSoup。弄得我要死了，不过今天总算解决了，虽然方法可能有点笨，但是总归是解决了。方法并未封装。 from bs4 import BeautifulSoup; import requests; url = 'http://www.weather.com.cn/textFC/hb.shtml'; headers = { 'Us...
复制链接

扫一扫