今天学习了使用 Python 编写爬虫程序,从中国天气网爬取杭州的天气预报。程序用到了 urllib 库和 bs4(Beautiful Soup 4)。bs4 提供了专门针对 HTML 的解析功能,比用正则表达式(re)方便许多。
# -*- coding: utf-8 -*-
import sys
# Python 2-only idiom: reload(sys) makes sys.setdefaultencoding available again
# (the interpreter removes it during startup), so the process-wide default
# string encoding can be switched to UTF-8.
# NOTE(review): presumably needed so the non-ASCII forecast text can be passed
# to the csv module without explicit encoding — confirm before removing.
reload(sys)
sys.setdefaultencoding( "utf-8" )
from bs4 import BeautifulSoup  # third-party HTML parser (Beautiful Soup 4)
import csv
import urllib
def get_html(url):
    """Download *url* and return the raw response body.

    Relies on the Python 2 ``urllib.urlopen`` API, like the rest of this
    script.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body exactly as returned by ``read()``.
    """
    response = urllib.urlopen(url)
    try:
        return response.read()
    finally:
        # Release the connection even when read() raises; the original
        # never closed the response object.
        response.close()
def _tag_string(tag):
    """Return ``tag.string``, or None when *tag* itself is None."""
    return tag.string if tag is not None else None


def _temperature_text(tag):
    """Return the text of a temperature tag with any ASCII 'C' removed.

    Returns None when the tag is missing or has no single string child, so
    the csv writer emits an empty cell instead of the parser crashing.
    """
    text = _tag_string(tag)
    if text is None:
        return None
    # Only a plain ASCII 'C' is stripped; a single-character degree sign
    # such as the one in the sample output is left untouched.
    return text.replace('C', '')


def get_data(html_text):
    """Extract the per-day forecast rows from a weather page.

    The forecast is looked up as ``<div id="7d">`` -> first ``<ul>`` ->
    every ``<li>``. For each day the ``<h1>`` supplies the date label, the
    first ``<p>`` the weather description, and the second ``<p>`` the
    temperatures (``<span>`` = highest, ``<i>`` = lowest).

    Args:
        html_text: HTML source of the forecast page.

    Returns:
        A list of ``[date, weather, highest, lowest]`` lists. ``highest``
        (or ``lowest``) is None when the page does not provide it. An empty
        list is returned when the forecast container cannot be found, and
        ``<li>`` entries without an ``<h1>`` or without two ``<p>`` elements
        are skipped rather than raising AttributeError/IndexError.
    """
    soup = BeautifulSoup(html_text, "html.parser")
    body = soup.body
    container = body.find('div', {'id': '7d'}) if body is not None else None
    day_list = container.find('ul') if container is not None else None
    if day_list is None:
        # Empty input, an error page, or a changed layout: nothing to parse.
        return []

    final = []
    for day in day_list.find_all('li'):
        heading = day.find('h1')
        paragraphs = day.find_all('p')
        if heading is None or len(paragraphs) < 2:
            # Not a forecast entry in the expected shape.
            continue
        temperatures = paragraphs[1]
        final.append([
            heading.string,
            paragraphs[0].string,
            _temperature_text(temperatures.find('span')),
            _temperature_text(temperatures.find('i')),
        ])
    return final
def write_data(data, name):
    """Append the rows in *data* to the CSV file *name*.

    Args:
        data: Iterable of rows, each row an iterable of cell values.
            None cells are written as empty fields by the csv module.
        name: Path of the CSV file; created if missing, otherwise appended to.
    """
    # The csv module emits its own '\r\n' line endings. The file must be
    # opened so the platform does not translate them again: binary mode on
    # Python 2, newline='' on Python 3. Plain 'a' produced '\r\r\n'
    # (blank-looking rows) on Windows.
    if sys.version_info[0] < 3:
        handle = open(name, 'ab')
    else:
        handle = open(name, 'a', newline='')
    with handle as f:
        csv.writer(f).writerows(data)
if __name__ == '__main__':
    # Script entry point: fetch the page, parse it, append the rows to the
    # CSV file, then echo the parsed rows to stdout.
    # NOTE(review): city code 101190401 looks like Suzhou rather than the
    # Hangzhou mentioned in the accompanying text — confirm the intended city.
    forecast_url = 'http://www.weather.com.cn/weather/101190401.shtml'
    output_csv = 'weather.csv'
    rows = get_data(get_html(forecast_url))
    write_data(rows, output_csv)
    # Parenthesised form prints the same as the bare print statement on Python 2.
    print(rows)
运行结果保存在 CSV 文件(weather.csv)中,内容如下:
28日(今天),小雨,,13℃
29日(明天),小雨转阴,15℃,12℃
30日(后天),多云,19℃,14℃
31日(周一),小雨,16℃,14℃
1日(周二),阴转多云,16℃,10℃
2日(周三),多云转晴,17℃,10℃
3日(周四),多云转晴,18℃,11℃