具体代码如下:
import re
import requests
import json
def get_html(url, timeout=10):
    """Download a page and return its HTML source as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        The decoded page text when the server answers with HTTP 200,
        otherwise ``None``. A short message is printed on every failure path.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Only network/HTTP-layer problems are treated as a failed fetch;
        # KeyboardInterrupt and programming errors are no longer swallowed.
        print("获取失败!")
        return None

    if response.status_code != 200:
        print("连接异常!")
        return None

    # Decode with the charset detected from the body rather than relying on
    # the (possibly missing or wrong) charset declared in the HTTP headers.
    response.encoding = response.apparent_encoding
    return response.text
def get_result(html, limit=10):
    """Extract the hot-search ranking table from the page source.

    Four regular expressions pull, in document order, the rank number, the
    keyword text, the popularity index and the detail link of every row.
    The n-th match of each pattern is assumed to belong to the same row.

    Args:
        html: Page source as returned by ``get_html``. ``None`` or an empty
            string yields an empty result instead of raising.
        limit: Maximum number of rows to return (``None`` for no cap).

    Returns:
        A dict keyed by the rank string; each value is a dict with the keys
        ``"关键字"`` (keyword), ``"流行指数"`` (popularity index) and
        ``"链接"`` (search link).
    """
    if not html:
        # get_html returns None on failure; re.findall would raise TypeError.
        return {}

    rank_re = re.compile(r'<td class="first">.*?>(\d+)</span>', re.S)  # rank
    keyword_re = re.compile(r'<td class="keyword">.*?>(.*?)</a>', re.S)  # keyword text
    index_re = re.compile(r'<td class="last">.*?>(\d+)</span>', re.S)  # popularity index
    link_re = re.compile(r'<td class="keyword">.*?<a href="(.*?)"', re.S)  # detail link

    ranks = rank_re.findall(html)
    keywords = keyword_re.findall(html)
    indices = index_re.findall(html)
    links = link_re.findall(html)

    # zip stops at the shortest list, so a page with fewer rows than the
    # limit (or with unevenly matched columns) no longer raises IndexError.
    rows = list(zip(ranks, keywords, indices, links))[:limit]

    result = {}
    for rank, keyword, index, link in rows:
        # Rank is the outer key so an entry can be looked up by position.
        result[rank] = {
            "关键字": keyword,
            "流行指数": index,
            # Rewrite the relative detail link into a Baidu search URL.
            "链接": link.replace(
                './detail?b=1&c=513&w',
                'https://www.baidu.com/baidu?cl=3&tn=SE_baiduhomet8_jmjb7mjw&rsv_dl=fyb_top&fr=top1000&wd',
            ),
        }
    return result
def main():
    """Fetch the Baidu hot-search board and print the ranking as JSON."""
    url = 'http://top.baidu.com/buzz?b=1&fr=topindex'
    html = get_html(url)
    if html is None:
        # get_html has already printed why the download failed; parsing a
        # missing page would only raise inside get_result.
        return

    ranking = get_result(html)
    # ensure_ascii=False keeps the Chinese keys and keywords readable.
    print(json.dumps(ranking, indent=4, ensure_ascii=False))


if __name__ == '__main__':
    main()