最近刚学了点爬虫,发现在常见的几种解析方式中,正则表达式的解析速度最快。下面是完整源码,复制粘贴即可运行(2019年8月9日测试可用)。
需要安装的库:requests
import requests
import re
def parse_page(url):
    """Fetch one listing page of gushiwen.org and print every poem on it.

    Args:
        url: Full URL of a listing page, e.g. ``.../default_1.aspx``.

    Side effects:
        Prints each poem's title, dynasty/author line and body to stdout.
    """
    headers = {
        # Pretend to be a desktop Chrome browser so the site serves the page.
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"
    }
    # A timeout keeps the scraper from hanging forever on a dead connection.
    response = requests.get(url, headers=headers, timeout=10)
    text = response.content.decode('utf-8')
    # re.DOTALL lets '.' cross newlines, since each poem spans many lines of HTML.
    titles = re.findall(r'<div\sclass="cont">.*?<b>(.*?)</b>', text, re.DOTALL)
    dynasties = re.findall(r'<p\sclass="source">.*?<a.*?>(.*?)</a>', text, re.DOTALL)
    # The author is the second <a> inside the "source" paragraph.
    authors = re.findall(r'<p\sclass="source">.*?<a.*?<a.*?>(.*?)</a>', text, re.DOTALL)
    poetries = re.findall(r'<div\sclass="contson".*?>(.*?)</div>', text, re.DOTALL)
    contents = []  # renamed from 'content' so the loop below cannot shadow it
    for poetry in poetries:
        # Drop paragraph tags and turn <br /> into real line breaks.
        poetry = re.sub('<p>|</p>', '', poetry)
        poetry = re.sub('<br />', '\n', poetry).strip()
        contents.append(poetry)
    # zip truncates to the shortest list, so a partially matched page never raises.
    for title, dynasty, author, content in zip(titles, dynasties, authors, contents):
        print("《" + title + "》")
        print(dynasty + " · " + author)
        print(content + "\n")
def main():
    """Prompt for a start/end page number and scrape each listing page in turn."""
    start = int(input("从这一页开始爬取1+:"))
    end = int(input("到这一页结束10-:"))
    base_url = "https://www.gushiwen.org/default_{}.aspx"
    for page in range(start, end + 1):
        parse_page(base_url.format(page))


if __name__ == "__main__":
    main()
运行截图: