import re
import os
import requests
# Page to scrape; the '#001' fragment is kept exactly as in the original URL.
url = 'http://www.shiren.org/xlib/lingshidao/gushi/tangdai.htm#001'

# Fetch the page ONCE and set the encoding on that same response object.
# Assigning `.encoding` on a separate, throwaway `requests.get(url)` call
# has no effect on a later request, and it downloads the page twice.
# The timeout keeps the script from hanging forever on a stalled server.
# NOTE(review): UTF-8 is what the original code intended to force; confirm
# the page is actually served as UTF-8 (older Chinese sites are often GBK).
response = requests.get(url, timeout=30)
response.encoding = 'UTF-8'
html = response.text
def get_content(html: str) -> str:
    """Extract the poem text from the raw HTML of the page.

    The first region found between ``<p><p>`` and ``<p align="center">`` is
    taken as the container. Inside it, every ``</a>X<p>Y<p>`` occurrence
    contributes its two captured pieces ``X`` and ``Y`` (presumably a poem's
    title and its body -- confirm against the live page). Each piece has
    ``<br>`` replaced by a space and is terminated by a newline.

    Args:
        html: Full HTML source of the page.

    Returns:
        All captured pieces, in document order, one per line. An empty
        string if the container holds no ``</a>X<p>Y<p>`` occurrence.

    Raises:
        IndexError: If the container region is not present in ``html``
            (same exception type the original ``findall(...)[0]`` raised).
    """
    # re.S lets '.' match newlines, so a captured span may cross line breaks.
    container = re.search(r'<p><p>(.*?)<p align="center">', html, re.S)
    if container is None:
        raise IndexError(
            'poem container (<p><p> ... <p align="center">) not found in html'
        )

    pairs = re.findall(r'</a>(.*?)<p>(.*?)<p>', container.group(1), re.S)

    # Each findall result is a 2-tuple of captured groups; flatten the tuples
    # and build the result with a single join instead of repeated `+=`.
    return "".join(
        piece.replace("<br>", " ") + "\n"
        for pair in pairs
        for piece in pair
    )
def save(article, directory='古诗', filename='古诗.txt'):
    """Write ``article`` to a UTF-8 text file and print a success message.

    The original code created the ``古诗`` directory but then wrote
    ``古诗.txt`` into the current working directory, because
    ``os.path.join`` with a single argument returns it unchanged. The file
    is now written inside the directory that is created for it.

    Args:
        article: Text to write.
        directory: Directory that will hold the file; created if missing.
        filename: Name of the file inside ``directory``.

    Returns:
        None.
    """
    os.makedirs(directory, exist_ok=True)
    target = os.path.join(directory, filename)
    # 'w' truncates: an existing file from a previous run is overwritten.
    with open(target, 'w', encoding="utf-8") as f:
        f.write(article)
    print('下载成功')
# Pull the poem text out of the downloaded page, then write it to disk.
article = get_content(html)
save(article)
# 嘿嘿嘿，又爬了一篇古诗，看来我已经会熟练使用 requests 库和 re 正则模块来进行网页数据获取了！( •̀ ω •́ )y NICE!