爬取古诗词
闲来无事,就写了个代码爬取古诗词名句来看看。我们用到的是这个网站 链接: https://www.xungushici.com/。刚刚开始学习Python爬虫,有些不对的地方还请大佬指出。
获取内容
首先我们看一下页面内容的基本架构,分别有注释、故事、赏析等部分。
分别获取它的位置。
对其进行提取
def get_body(url):
    """Fetch one poem page and append its title, source, note, story and
    appreciation text to a file named '<title>.txt'.

    :param url: absolute URL of a single poem-detail page.
    """
    respond = requests.get(url)
    html = parsel.Selector(respond.text)
    title = html.css('body > div > div > div > div > div > h3::text').get()
    origin = html.css('body > div > div > div> div > div> p > a::text').get()
    # Some poems have no note/story section. css().get() then returns None
    # (it does not raise), so missing data is handled explicitly below
    # instead of with a bare except that would also hide real errors.
    note = html.css('body > div > div > div> div:nth-child(2) > div > p:nth-child(4)::text').get()
    stories = html.css('body > div > div > div > div:nth-child(3) > div > p::text').getall()
    story = "\n".join(stories)  # join the paragraph list into one string
    appreciates = html.css('body > div > div > div > div:nth-child(3) > div > p::text').getall()
    appreciate = '\n'.join(appreciates)
    # Save the extracted fields to a text file named after the poem title.
    # (Fixed: the filename expression must be title + '.txt', not a literal.)
    with open(title + '.txt', mode='a', encoding='utf-8') as f:
        f.write(title)
        f.write('\n')
        f.write('出自于:'+'\n'+origin+'\n')
        if note is not None:
            f.write('注释:'+'\n'+note+'\n')
        if story:
            f.write('故事:'+'\n'+story+'\n')
        f.write('赏析:'+'\n'+appreciate+'\n')
        # No f.close() needed: the with-statement closes the file.
实现所有网页提取和翻页
得到每一个诗词的位置
进行编写
def get_url(url):
    """Collect every poem link on one listing page and scrape each poem."""
    page = parsel.Selector(requests.get(url).text)
    hrefs = page.css('body > div > div > div > div > div > ul > li > a::attr(href)').getall()
    for href in hrefs:
        # The hrefs are site-relative, so prepend the site root.
        full_link = 'https://www.xungushici.com' + href
        print(full_link)
        get_body(full_link)
再分析每一页
不难得出规律
进行编写
def get_page():
    """Iterate over the listing pages and scrape every poem on each.

    Listing URLs follow the pattern .../mingjus/p<N>; pages 1..2 are fetched.
    """
    # Removed the unused 'url' local variable from the original.
    for i in range(1, 3):
        get_url('https://www.xungushici.com/mingjus/p%d' % i)
完整代码
# -*- coding = utf-8 -*-
# @Time : 2021/7/3 21:02
# @File : 尝试爬取寻古诗词网.py
# @Software : PyCharm
import requests
import parsel
def get_page():
    """Iterate over the listing pages and scrape every poem on each.

    Listing URLs follow the pattern .../mingjus/p<N>; pages 1..2 are fetched.
    """
    # Removed the unused 'url' local variable from the original.
    for i in range(1, 3):
        get_url('https://www.xungushici.com/mingjus/p%d' % i)
def get_url(url):
    """Collect every poem link on one listing page and scrape each poem."""
    response = requests.get(url)
    selector = parsel.Selector(response.text)
    for href in selector.css('body > div > div > div > div > div > ul > li > a::attr(href)').getall():
        # The hrefs are site-relative, so prepend the site root.
        poem_url = 'https://www.xungushici.com' + href
        print(poem_url)
        get_body(poem_url)
def get_body(url):
    """Fetch one poem page and append its title, source, note, story and
    appreciation text to a file named '<title>.txt'.

    :param url: absolute URL of a single poem-detail page.
    """
    respond = requests.get(url)
    html = parsel.Selector(respond.text)
    title = html.css('body > div > div > div > div > div > h3::text').get()
    origin = html.css('body > div > div > div> div > div> p > a::text').get()
    # Some poems have no note/story section. css().get() then returns None
    # (it does not raise), so missing data is handled explicitly below
    # instead of with a bare except that would also hide real errors.
    note = html.css('body > div > div > div> div:nth-child(2) > div > p:nth-child(4)::text').get()
    stories = html.css('body > div > div > div > div:nth-child(3) > div > p::text').getall()
    story = "\n".join(stories)  # join the paragraph list into one string
    appreciates = html.css('body > div > div > div > div:nth-child(3) > div > p::text').getall()
    appreciate = '\n'.join(appreciates)
    # Save the extracted fields to a text file named after the poem title.
    with open(title + '.txt', mode='a', encoding='utf-8') as f:
        f.write(title)
        f.write('\n')
        f.write('出自于:'+'\n'+origin+'\n')
        if note is not None:
            f.write('注释:'+'\n'+note+'\n')
        if story:
            f.write('故事:'+'\n'+story+'\n')
        f.write('赏析:'+'\n'+appreciate+'\n')
        # No f.close() needed: the with-statement closes the file.
if __name__ == "__main__":
    # Run the scraper only when executed as a script, not on import.
    get_page()