温度数据网址为: http://lishi.tianqi.com/
python版本为3.7.7
重点在于只使用 Python 标准库,无需额外安装第三方模块
# -*- coding: utf-8 -*-
"""
@author: CC
"""
import csv
import re
import time
import urllib
import urllib.request
#from urllib import request
"""
设置需要爬取的地区,使用该地区的拼音
设置需要爬取的年、月
"""
# Crawl configuration.
#   city   -- region to crawl, written in pinyin
#   years  -- years to crawl
#   months -- months to crawl
city = "guangyuan"
years = ["2012", "2019"]
months = ["01", "02"]
def getHtml(city, year, month):
    """Download one monthly history page from lishi.tianqi.com.

    The URL is built as ``http://lishi.tianqi.com/<city>/<year><month>.html``
    and is printed before the request is sent.

    Args:
        city: City path segment of the URL (the script uses pinyin names).
        year: Year; converted with ``str`` and placed in the URL.
        month: Month; converted with ``str`` and appended directly after
            the year (the script passes strings such as ``'01'``).

    Returns:
        The raw response body as returned by ``response.read()``.

    Raises:
        Whatever ``urllib.request.urlopen`` raises on failure (for example
        ``urllib.error.URLError``); nothing is caught here.
    """
    url = 'http://lishi.tianqi.com/' + city + '/' + str(year) + str(month) + '.html'
    print(url)
    request1 = urllib.request.Request(url)
    # A browser-like User-Agent is sent instead of urllib's default one.
    request1.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36')
    # The context manager closes the connection even if read() fails.
    with urllib.request.urlopen(request1) as response:
        return response.read()
def _parse_temp_array(text, name):
    """Return the integers of the JavaScript array assigned to ``var <name>``."""
    # Raw strings keep the regex escapes (\[ and \d) out of Python's own
    # string-escape processing.
    pattern = r'var ' + re.escape(name) + r' = \[((?:-?[0-9]{1,2},)*-?\d+)\]'
    match = re.search(pattern, text)
    if match is None:
        raise ValueError("'var %s = [...]' was not found in the page" % name)
    return [int(value) for value in match.group(1).split(',')]


def getTemp(html, year, month):
    """Extract the daily high/low temperatures embedded in a history page.

    The page must contain JavaScript assignments of the form
    ``var hightemp = [a,b,...]`` and ``var lowtemp = [a,b,...]`` holding
    comma-separated integers; the first occurrence of each is used.

    Args:
        html: Page content, either bytes (as returned by ``getHtml``) or str.
        year: Year of the page; converted with ``int``.
        month: Month of the page; converted with ``int``.

    Returns:
        A list with one ``[year, month, day, high, low]`` entry (all ints)
        per element of the high-temperature array, where ``day`` is the
        1-based position in that array.

    Raises:
        ValueError: If either array is missing from the page, if the
            low-temperature array has fewer entries than the high one, or
            if ``year``/``month`` cannot be converted to ``int``.
    """
    year = int(year)
    month = int(month)
    if isinstance(html, (bytes, bytearray)):
        # Both patterns are pure ASCII, so undecodable bytes can be
        # replaced without affecting the match.
        text = html.decode('utf-8', errors='replace')
    else:
        text = str(html)

    highs = _parse_temp_array(text, 'hightemp')
    lows = _parse_temp_array(text, 'lowtemp')
    if len(lows) < len(highs):
        raise ValueError(
            'lowtemp has %d entries but hightemp has %d' % (len(lows), len(highs))
        )

    # zip stops at the end of highs; surplus low values are ignored.
    return [
        [year, month, day, high, low]
        for day, (high, low) in enumerate(zip(highs, lows), start=1)
    ]
if __name__ == "__main__":
    # Script entry point: for every configured year/month, download the
    # history page, parse the temperatures and print them tab-separated.
    # newline='' is required by the csv module so it controls line endings
    # itself (otherwise blank lines appear between rows on Windows).
    with open(city + '.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(('time', 'high', 'low'))
        for year in years:
            for month in months:
                html = getHtml(city, year, month)
                rows = getTemp(html, year, month)
                for row in rows:
                    # Each value is followed by a tab; one line per day.
                    print("".join("%d\t" % value for value in row))
                # NOTE(review): the parsed rows are only printed; the CSV
                # receives just the header. Writing them was disabled in the
                # original, and the 3-column header does not match the
                # 5-column rows -- confirm the intended layout before
                # enabling writer.writerows(rows).
                print(year + month + ' OK!')
                # Pause between requests.
                time.sleep(2)