2.1.3 自动收集数据
import urllib2

from BeautifulSoup import BeautifulSoup

# Exploratory scrape: download one daily-history page and parse it into a
# tag tree. Single-argument print(...) is used throughout; it behaves the
# same as the Python 2 print statement.
history_url = "https://www.wunderground.com/history/airport/ZHCC/2017/9/8/DailyHistory.html"
response = urllib2.urlopen(history_url)
document = BeautifulSoup(response)

# Show the first <img> tag found on the page.
img_tags = document.findAll('img')
print(img_tags[0])

# Collect every element carrying class="wx-value" and show the whole list.
wx_values = document.findAll(attrs={"class": "wx-value"})
print(wx_values)

# Drill into the first match: the tag itself, the text of its nested <span>,
# and the text of its first child node.
leading_value = wx_values[0]
print(leading_value)
print(leading_value.span.string)
print(leading_value.contents[0].string)
import calendar

# Year whose daily history pages are scraped.
YEAR = 2016
# Common prefix of every daily-history page URL for station ZHCC.
STATION_URL = "https://www.wunderground.com/history/airport/ZHCC/"


def iter_dates(year):
    """Yield (month, day) for every calendar day of *year*.

    Month lengths come from ``calendar.monthrange``, so 29 February is
    included in leap years such as 2016 instead of being cut off by a
    hard-coded 28-day February.
    """
    for m in range(1, 13):
        last_day = calendar.monthrange(year, m)[1]
        for d in range(1, last_day + 1):
            yield m, d


def history_url(year, m, d):
    """Return the DailyHistory page URL for the given date (no zero padding)."""
    return STATION_URL + str(year) + "/" + str(m) + "/" + str(d) + "/DailyHistory.html"


def format_timestamp(year, m, d):
    """Return the date as a zero-padded YYYYMMDD string, e.g. '20160105'."""
    return "%04d%02d%02d" % (year, m, d)


def fetch_day_temp(url):
    """Return the text of the first child of the first class="wx-value" element.

    NOTE(review): presumably this is the day's temperature reading; confirm
    against the page markup. Raises IndexError if the page has no such element.
    """
    page = urllib2.urlopen(url)
    soup = BeautifulSoup(page)
    return soup.findAll(attrs={"class": "wx-value"})[0].contents[0].string


if __name__ == "__main__":
    for m, d in iter_dates(YEAR):
        # Build the padded stamp once, so the progress message is unambiguous
        # (an unpadded '2016111' could mean Jan 11 or Nov 1).
        timestamp = format_timestamp(YEAR, m, d)
        print("Getting data for " + timestamp)
        day_temp = fetch_day_temp(history_url(YEAR, m, d))
        # The explicit '\n' plus print's own newline leaves a blank line after
        # each record; kept so the output format is unchanged.
        print(timestamp + ',' + day_temp + '\n')
终端输入并运行文件
python get-weather-data.py
2.2.3 用代码来格式化
1. CSV转为XML
import csv

# Read the comma-separated weather file. `reader` stays a module-level name.
source = open('wunder-data.txt', 'r')
reader = csv.reader(source, delimiter=",")

# Emit one <observation> element per CSV row, one tag per output line:
# column 0 goes into <date>, column 1 into <temperature>.
print('<weather_data>')
for record in reader:
    print('<observation>')
    print('<date>%s</date>' % record[0])
    print('<temperature>%s</temperature>' % record[1])
    print('</observation>')
print('</weather_data>')
终端输入并运行文件
python csv2xml.py >wunder-data1.xml
def write_weather_xml(rows, out):
    """Write *rows* to *out* as a single <weather_data> XML document.

    rows: iterable of sequences; item 0 is written inside <date>, item 1
          inside <temperature>.
    out:  any object with a ``write(str)`` method (for example an open file).

    No newlines are written between tags. A row with fewer than two items
    raises IndexError.
    """
    out.write('<weather_data>')
    for row in rows:
        out.write('<observation>')
        out.write('<date>' + row[0] + '</date>')
        out.write('<temperature>' + row[1] + '</temperature>')
        out.write('</observation>')
    out.write('</weather_data>')


if __name__ == "__main__":
    # NOTE(review): `reader` is not defined in this snippet; presumably it is
    # the csv.reader over wunder-data.txt from the CSV-to-XML script. Confirm.
    # The with-block closes the output file even if a row raises mid-write.
    with open('wunder-data.xml', 'w') as f:
        write_weather_xml(reader, f)
2. XML转为CSV
from BeautifulSoup import BeautifulStoneSoup

# Load the whole XML document into memory before parsing it.
f = open('wunder-data.xml', 'r')
xml_text = f.read()
parsed = BeautifulStoneSoup(xml_text)

# Print one "date,temperature" line per <observation> element.
for obs in parsed.findAll('observation'):
    date_text = obs.date.string
    print(date_text + "," + obs.temperature.string)
终端输入并运行文件
python xml2csv.py >wunder-data1.txt
3. CSV转为JSON
import csv


def observations_json_lines(rows):
    """Return the output lines of a JSON document built from *rows*.

    rows: iterable of sequences; item 0 becomes the quoted "date" value and
          item 1 is emitted verbatim (unquoted) as the "temperature" value.

    The separator after each object is chosen by its position in the actual
    input rather than a fixed count of 365, so the result has correct commas
    for any number of rows, including zero and 366.
    """
    rows = list(rows)
    last_index = len(rows) - 1
    lines = ['{ "observations": [']
    for index, row in enumerate(rows):
        lines.append('{')
        lines.append('"date": ' + '"' + row[0] + '", ')
        lines.append('"temperature": ' + row[1])
        # Only the final object omits the trailing comma.
        lines.append(" }" if index == last_index else " },")
    lines.append("] }")
    return lines


if __name__ == "__main__":
    with open('wunder-data.txt', 'r') as source:
        reader = csv.reader(source, delimiter=",")
        for line in observations_json_lines(reader):
            print(line)
终端输入并运行文件
python csv2json.py >wunder-data1.json
4. 在循环中加入新的逻辑
import csv

reader = csv.reader(open('wunder-data.txt', 'r'), delimiter=",")

# Append a third column to every row: '1' when the integer in column 1 is
# 32 or lower, otherwise '0'.
# NOTE(review): 32 is presumably the freezing point in Fahrenheit; confirm units.
for record in reader:
    freezing_flag = '0' if int(record[1]) > 32 else '1'
    print(record[0] + "," + record[1] + "," + freezing_flag)
终端输入并运行文件
python freezingInfo.py >wunder-data-fz.txt