import os
import re
import urllib.request
import datetime
# Base URL of the weather-history site (tianqihoubao.com); all page URLs below are built from it.
home = "http://www.tianqihoubao.com"
def down2txt(code, tit, url):
    """Download the monthly AQI table at *url* and save it as data/<code>/<tit>.txt.

    If the output file already exists and its last row is the final day of
    its month (i.e. the month is complete), the download is skipped.
    Otherwise the stale partial file is deleted and re-downloaded.

    Args:
        code: city code, used as the sub-directory name under ./data.
        tit: page title, used as the output file name (without extension).
        url: full URL of the monthly AQI page.
    """
    raw = urllib.request.urlopen(url).read()
    # The site usually serves GBK; fall back to UTF-8 only on a decode error
    # (the original bare `except:` also hid unrelated failures).
    try:
        page = raw.decode("gbk")
    except UnicodeDecodeError:
        page = raw.decode("utf-8")
    # Ensure the per-city output directory exists.
    folder = os.path.join(os.getcwd(), "data", code)
    if not os.path.exists(folder):
        os.makedirs(folder)
    file = os.path.join(folder, tit + ".txt")
    if os.path.exists(file):
        with open(file, "r") as f2:
            ls2 = f2.readlines()
        # First 10 chars of the last stored row are its date, "YYYY-MM-DD".
        s_date = ls2[-1][:10]
        t_date = datetime.datetime.strptime(s_date, "%Y-%m-%d")
        d2 = t_date + datetime.timedelta(1)
        # If the day after the last row is not the 1st, the stored month is
        # incomplete: remove the file and re-download it below.
        if d2.strftime("%d") != "01":
            os.remove(file)
        else:
            print("文件已存在:" + tit + ".txt")
            return
    # Cut out the first <table> element's inner HTML.
    i_start = page.find('<table')
    i_end = page.find('</table>')
    page = page[i_start:i_end]
    page = page[page.find(">"):]
    page = page.replace("\r\n", "")
    page = page.replace("</b>", "").replace("<b>", "")
    # Text between adjacent tags = one table cell; each row has 11 cells.
    cells = re.findall(re.compile(">(.+?)<"), page)
    with open(file, "w") as f:
        i = 0
        s = ""
        for cell in cells:
            cell = cell.strip()  # strip() (not strip(" ")) also drops tabs/NBSP-free whitespace
            if cell == "":
                continue
            s += cell + " "
            if i == 10:
                # 11th cell of the row: flush the accumulated line.
                f.write(s + "\n")
            i += 1
            if i >= 11:
                i = 0
                s = ""
def down_city(name, code):
    """Download every monthly AQI page for one city via down2txt.

    Args:
        name: human-readable city name (not used in URLs; kept for the
            caller's readability).
        code: city code used in tianqihoubao.com AQI URLs,
            e.g. "lanzhou" -> /aqi/lanzhou.html.
    """
    index_url = home + "/aqi/" + code + ".html"
    print(index_url)
    page = urllib.request.urlopen(index_url).read()
    page = page.decode("gbk")
    # Monthly archive links look like href='/aqi/<code>-YYYYMM.html'.
    pattern = re.compile("href='(/aqi/" + code + "-" + ".+?html)'")
    for link in re.findall(pattern, page):
        month_url = home + link
        # Derive the file title from the link path: "<code>-YYYYMM".
        tit = link.replace("/aqi/", "").replace(".html", "")
        print(month_url)
        down2txt(code, tit, month_url)
if __name__ == "__main__":
    # Look up city.txt for valid (name, code) argument pairs.
    down_city("兰州", "lanzhou")
    print("finished!")
# Reprinted from: https://my.oschina.net/li0544/blog/823423