Python爬取天气信息
Python爬取天气信息
1. 导入所需的Python模块
import re
import urllib
import urllib.request
import pprint
import os as myos
import datetime
2. 主程序
# Second argument handed to get_image() below.
num = 3
# Not referenced by any code in this section.
isend = []

if __name__ == '__main__':
    # NOTE: load_page() and get_image() must already be defined when this
    # guard executes, so in a single script this section has to come after
    # their definitions.
    print(datetime.datetime.now())  # start timestamp
    # Forecast page to download.
    url = r"http://www.weather.com.cn/weather/101280301.shtml"
    print(url)
    html = load_page(url, 1)  # flag 1: body decoded to str
    # pprint.pprint(html)  # uncomment to dump the raw page while debugging
    get_image(html, num)
    print(datetime.datetime.now())  # end timestamp
3. 请求网站天气信息
def load_page(url, myflag):
    """Download *url* and return the response body.

    Args:
        url: Address to fetch; passed straight to ``urllib.request.Request``.
        myflag: ``1`` to get the body decoded to ``str`` (``bytes.decode()``
            with its default codec, UTF-8); ``2`` to get the raw ``bytes``.

    Returns:
        ``str`` when ``myflag == 1``, ``bytes`` when ``myflag == 2``.

    Raises:
        ValueError: If ``myflag`` is neither 1 nor 2.
        urllib.error.URLError: If ``urlopen`` cannot complete the request.
        UnicodeDecodeError: If ``myflag == 1`` and the body is not valid UTF-8.
    """
    # Reject unknown flags up front instead of failing on an unbound local
    # after the download has already happened.
    if myflag not in (1, 2):
        raise ValueError("myflag must be 1 (text) or 2 (bytes), got %r" % (myflag,))

    request = urllib.request.Request(url)
    # The context manager closes the connection even if read() raises.
    with urllib.request.urlopen(request) as response:
        body = response.read()

    return body.decode() if myflag == 1 else body
4. 清洗数据并保存到文件
# Compiled once at import time instead of on every loop iteration.
_HOUR3DATA_RE = re.compile(r'hour3data=[\S]*')
_ENTRY_RE = re.compile(r'\d+日.*')


def _extract_entries(html):
    """Return every forecast entry found in the page's ``hour3data`` blobs."""
    entries = []
    # repr() is kept from the original: it accepts non-str input and turns
    # line breaks into a literal backslash-n, so a match runs on until the
    # next real whitespace character.
    for blob in _HOUR3DATA_RE.findall(repr(html)):
        print(blob)
        # Mapping '"' onto '[' lets a single split() break on both delimiters.
        for piece in blob.replace('"', '[').split('['):
            found = _ENTRY_RE.search(piece)
            if found:
                entries.append(found.group(0))
    return entries


def _drop_repeated_prefix(entries):
    """Return *entries* starting at the second occurrence of its first item.

    The scraped data contains repeated content; when the first entry shows up
    again later, everything before that repeat is discarded. With no repeat
    (or no entries) the list is returned unchanged.
    """
    positions = [i for i, entry in enumerate(entries) if entry == entries[0]]
    pprint.pprint(positions)
    if len(positions) > 1:
        return entries[positions[1]:]
    return entries


def get_image(html, num, out_path='D:\\图片\\惠州天气.txt'):
    """Extract forecast entries from *html* and append them to a text file.

    Each entry is written on its own line, prefixed with the current
    timestamp and the ``----惠州天气---`` marker. Diagnostic output (raw
    blob, repeat positions, final entries and their count) goes to stdout.

    Args:
        html: Page content as returned by ``load_page``.
        num: Unused; kept so existing callers keep working.
        out_path: File to append to. Defaults to the original hard-coded
            location; its directory must already exist.

    Returns:
        None.

    Raises:
        OSError: If *out_path* cannot be opened for appending.
    """
    entries = _drop_repeated_prefix(_extract_entries(html))
    pprint.pprint(entries)
    print(len(entries))

    # Nothing to record: do not create or touch the output file.
    if not entries:
        return

    # NOTE(review): no encoding is given, so the platform default applies,
    # as in the original; switching to UTF-8 would mix encodings in a log
    # file that already exists.
    with open(out_path, 'a') as fb:
        fb.writelines(
            str(datetime.datetime.now()) + "----惠州天气---" + entry + "\n"
            for entry in entries
        )
5. 运行效果图: