1. 以下就是古诗文网站的爬虫代码,请看:
# encoding:utf-8
import requests
import re
import json
def parse_page(url):
    """Download one poem-listing page, print every poem title, and return the poems.

    Args:
        url: Address of the listing page to fetch.

    Returns:
        A list of dicts, one per poem, in page order. Each dict has the keys
        "标题", "朝代", "作者" and "原文", filled from the four regular
        expressions below.

    Raises:
        requests.exceptions.RequestException: if the request fails or does
            not complete within the timeout.
    """
    # 1. Request the page, sending a browser-like User-Agent header.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36"
    }
    # Without an explicit timeout, requests may wait forever on a stalled server.
    response = requests.get(url, headers=headers, timeout=10)
    text = response.text

    # 2. Parse the page. re.DOTALL lets ".*?" run across line breaks in the HTML.
    titles = re.findall(r'<div\sclass="cont">.*?<b>(.*?)</b>', text, re.DOTALL)
    # Text of the first <a> after each <p class="source">.
    dynasties = re.findall(r'<p\sclass="source">.*?<a\s.*?>(.*?)</a>', text, re.DOTALL)
    # Text of the second <a> after each <p class="source">.
    authors = re.findall(r'<p class="source">.*?<a.*?<a.*?>(.*?)</a>', text, re.DOTALL)
    raw_poems = re.findall(r'<div class="contson" id=.*?>(.*?)</div>', text, re.DOTALL)
    # Remove any remaining tags from the poem body and trim surrounding whitespace.
    poems = [re.sub("<.*?>", "", raw).strip() for raw in raw_poems]

    # zip() pairs the four lists by position and stops at the shortest one,
    # so a pattern that misses an entry shortens (and may misalign) the output.
    results = []
    for title, dynasty, author, poem in zip(titles, dynasties, authors, poems):
        result = {
            "标题": title,
            "朝代": dynasty,
            "作者": author,
            "原文": poem
        }
        # Single-argument print(...) behaves the same on Python 2 and Python 3.
        print(result["标题"])
        results.append(result)
    return results
def main(url_base="https://www.xzslx.net/gushi/", first_page=1, last_page=10):
    """Crawl a range of listing pages, printing a banner around each page's titles.

    Args:
        url_base: URL template passed through ``str.format`` with the page
            number. NOTE(review): the default contains no ``{}`` placeholder,
            so ``format`` returns it unchanged and every iteration requests
            the same URL; pass a template with a placeholder to paginate.
        first_page: First page number, inclusive.
        last_page: Last page number, inclusive.
    """
    for i in range(first_page, last_page + 1):
        url = url_base.format(i)
        # Parenthesised single-argument print works on Python 2 and Python 3.
        print(" " * 20 + "优美古诗文" + " " * 20)
        print("*" * 50)
        parse_page(url)
        print("*" * 50)
# Start the crawl only when this file is run as a script, not when imported.
if __name__ == "__main__":
    main()
2. 输出来的结果是:
C:\DDD\python22\python.exe C:/PyCharm/dytt_spider/poems.py 古诗文 ************************************************** 关山月 明月出天山,苍茫云海间。 长风几万里,吹度玉门关。 汉下白登道,胡窥青海湾。 [2] 由来征战地,不见有人还。 戍客望边邑,思归多苦颜。 高楼当此夜,叹息未应闲。 ************************************************** 古诗文 ************************************************** 陇西行四首·其二 誓扫匈奴不顾身,五千貂锦丧胡尘。 可怜无定河边骨,犹是春闺梦里人! ************************************************** 古诗文 ************************************************** 嫦娥(嫦娥应悔偷灵药) 云母屏风烛影深, 长河渐落晓星沉。 嫦娥应悔偷灵药, 碧海青天夜夜心。 **************************************************
Process finished with exit code 0
转载于:https://blog.51cto.com/3214135/2156136