抓取网站上的历史温度数据

温度数据网址为:http://lishi.tianqi.com/
python版本为3.7.7
重点在于只使用 Python 标准库,不需要再安装任何第三方模块

# -*- coding: utf-8 -*-
"""
@author: CC
"""
 
import csv
import re
import time
import urllib
import urllib.request
#from urllib import request
"""
设置需要爬取的地区,使用该地区的拼音
设置需要爬取的年、月
"""
# Region to scrape, written in pinyin; it is inserted into the URL path and
# also used as the base name of the output CSV file.
city = 'guangyuan'
# Years to fetch, as strings. The list is iterated item by item, so it names
# individual years, not the two ends of a range.
years = ['2012','2019']
# Months to fetch, as two-digit strings; each is appended directly after the
# year when the page URL is built.
months = ['01','02']
 
 
def getHtml(city, year, month, timeout=30):
    """Download one monthly history page from lishi.tianqi.com.

    Args:
        city: City identifier used in the URL path (e.g. 'guangyuan').
        year: Year to fetch; converted with str() and placed in the URL.
        month: Month to fetch; converted with str() and appended directly
            after the year, so it is inserted exactly as given.
        timeout: Seconds to wait on the network before giving up. Without a
            timeout a stalled server would block the script indefinitely.

    Returns:
        The raw response body as bytes.

    Raises:
        urllib.error.URLError: If the request cannot be completed (this also
            covers HTTP error statuses via its HTTPError subclass).
    """
    url = 'http://lishi.tianqi.com/' + city + '/' + str(year) + str(month) + '.html'
    print(url)
    page_request = urllib.request.Request(url)
    # Send a browser-like User-Agent instead of urllib's default one.
    page_request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36')
    # The context manager closes the connection even if read() fails.
    with urllib.request.urlopen(page_request, timeout=timeout) as response:
        return response.read()

def _extract_series(text, name):
    """Return the integers of the JavaScript array ``var <name> = [...]`` in text."""
    # Raw strings keep the regex escapes intact; the capture group holds only
    # the comma-separated numbers, so no manual bracket stripping is needed.
    pattern = r'var ' + re.escape(name) + r' = \[((?:-?[0-9]{1,2},)*-?\d+)\]'
    match = re.search(pattern, text)
    if match is None:
        raise ValueError('no "var %s = [...]" array found in the page' % name)
    return [int(value) for value in match.group(1).split(',')]


def getTemp(html,year,month):
    """Extract the daily high/low temperatures embedded in a history page.

    The page is expected to contain two JavaScript array literals of the
    form ``var hightemp = [..]`` and ``var lowtemp = [..]``, holding one
    integer per day.

    Args:
        html: Page content as bytes (as returned by getHtml) or as str.
        year: Year of the page; converted with int().
        month: Month of the page; converted with int().

    Returns:
        A list with one ``[year, month, day, high, low]`` entry per array
        element, where day is the 1-based position in the arrays.

    Raises:
        ValueError: If either array is missing from the page, or if the two
            arrays have different lengths (the days could not be paired up
            reliably).
    """
    year = int(year)
    month = int(month)
    # Decode bytes properly instead of parsing their "b'...'" repr.
    if isinstance(html, (bytes, bytearray)):
        text = html.decode('utf-8', errors='replace')
    else:
        text = str(html)
    highs = _extract_series(text, 'hightemp')
    lows = _extract_series(text, 'lowtemp')
    if len(highs) != len(lows):
        raise ValueError(
            'hightemp has %d values but lowtemp has %d'
            % (len(highs), len(lows))
        )
    return [
        [year, month, day, high, low]
        for day, (high, low) in enumerate(zip(highs, lows), start=1)
    ]
 
if __name__ == "__main__":
    # Fetch every configured year/month page, print the daily values and
    # save them to "<city>.csv".
    # newline='' is required by the csv module so that it controls the line
    # endings itself (otherwise blank rows appear on Windows).
    with open(city + '.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        # One header column per field of the rows produced by getTemp.
        writer.writerow(('year', 'month', 'day', 'high', 'low'))
        for year in years:
            for month in months:
                html = getHtml(city, year, month)
                rows = getTemp(html, year, month)
                for row in rows:
                    # Tab-terminated fields, one day per line.
                    print(''.join('%d\t' % value for value in row))
                writer.writerows(rows)
                print(year + month + ' OK!')
                # Pause between requests to avoid hammering the site.
                time.sleep(2)
  • 1
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值