import requests
# Page that this script downloads and parses.
url = 'http://i.tianqi.com/index.php?c=code&a=getcode&id=55&py=haizhuqu'

# Headers sent with every request so the client presents itself as a desktop
# browser.  The key must be spelled exactly "User-Agent": under any other
# name the server only sees an unknown header and requests falls back to its
# default python-requests User-Agent.
headers = {
    "User-Agent": r"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 BIDUBrowser/8.7 Safari/537.36",
    "Accept-Language": r"zh-CN,zh;q=0.8",
}
def getPage(url, timeout=10):
    """Download *url* and return the response body decoded as UTF-8 text.

    The request is sent with the module-level ``headers``.  When the request
    fails, the function waits five seconds and tries again, repeating until a
    response has been received.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before the attempt counts as
            failed.  Without a timeout a stalled connection would block
            forever and the retry logic would never get a chance to run.

    Returns:
        The response body as ``str``.
    """
    import time

    # Retry in a loop rather than by recursion: the recursive form dropped
    # the retried result (the caller got None) and grew the call stack by one
    # frame per failed attempt.
    while True:
        try:
            # A plain GET is enough; the headers dict is passed through as-is.
            wbDate = requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException:
            # Only network/HTTP-layer failures are retried, so Ctrl+C and
            # programming errors still propagate instead of being swallowed.
            print("获取页面失败,等待5秒后重试")
            time.sleep(5)
            continue
        wbDate.encoding = "utf-8"  # force the decoding used by .text below
        reTxt = wbDate.text  # .text is already a str, no further conversion
        print("获取页面OK")
        return reTxt
import re
# Download the page once; the extraction steps below all search `content`.
content = getPage(url)
# ------------------------------------------------------------------------ separator
# Label shown in the middle of the banner line printed below.
Aa1 =r"正则输出开始"
# Banner: 60 underscores, the label left-aligned and padded with "_" to a
# width of 40, then 10 more underscores, followed by an empty line.
print(f"{'_':_>60}{Aa1:_<40}{'_':_>10}\n")
# ------------------------------------------------------------------------ separator
# Title, approach 1: match only the text between the tags by using a
# lookbehind/lookahead pair, so group() is the title itself.
# NOTE(review): the tag names were missing from the pattern literals (markup
# stripped from the source); <title>...</title> is restored here based on the
# printed labels -- confirm against the page.
print('获取title 方法一')
title_pat = r'(?<=<title>).*?(?=</title>)'
title_ex = re.compile(title_pat, re.M | re.S)
title_obj = re.search(title_ex, content)
# re.search returns None when nothing matches; calling .group() on it would
# abort the script before the remaining sections run.
title = title_obj.group() if title_obj is not None else None
print(title)

# Title, approach 2: capture group with findall, which yields a list of
# every match (an empty list when there is none).
print('获取title 方法二')
title = re.findall(r'<title>(.*?)</title>', content)
print(title)
# Lunar-calendar date.
print('农历')
# Sample of the text on the page: 08/19 己亥年七月十九
# NOTE(review): the HTML delimiters of the original pattern were lost, so the
# pattern is anchored on the shape of the sample instead -- "MM/DD", some
# whitespace, then the rest of the text node up to the next tag.  Confirm
# against the live page.
_pat = r'(\d{1,2}/\d{1,2}\s+[^<>]+?)\s*<'
_ex = re.compile(_pat, re.M | re.S)
nongli = re.findall(_ex, content)  # list of matched date strings
print(nongli)
# Weather description.
print('天气')
# Sample of the text on the page: 雷阵雨
# The capture must have an explicit end: a lazy group placed last in a
# pattern always matches the empty string.  Capturing everything up to the
# next "<" takes the whole text node without depending on which closing tag
# follows it.
_pat = r'height: 18px;overflow: hidden;">([^<]*)'
_ex = re.compile(_pat, re.M | re.S)
tianqi = re.findall(_ex, content)  # list of captured text nodes
print(tianqi)
# Temperature range.
print('温度')
# Sample of the text on the page: 27~34
# NOTE(review): the HTML delimiters of the original pattern were lost, so the
# pattern is anchored on the shape of the sample instead -- two (possibly
# negative) integers separated by "~".  Confirm against the live page.
# The variable names title_pat / title_ex are kept from the original code.
title_pat = r'(-?\d+)\s*~\s*(-?\d+)'
title_ex = re.compile(title_pat, re.M | re.S)
wendu = re.findall(title_ex, content)  # list of (low, high) string tuples
print(wendu)
print('指数')
# r'height:36px">
晨练指数
不宜