这一篇文章爬取我博客内容并且按照输入提示保存
有关参考
爬取文章标题:https://www.cnblogs.com/lcyzblog/p/11275188.html
源码分析:文本存放于div的class属性为postBody的标签下
from lxml import etree
import requests
# Shared HTTP session used for every request in this module, so cookies
# received on one response are sent again on later requests.
# requests.Session() is the supported constructor; the lowercase
# requests.session() factory is kept by the library only for compatibility.
get_cookie = requests.Session()
def get_titles_and_blogurl():
    """Fetch the blog index page and map each post title to its URL.

    The shared ``get_cookie`` session first requests the cnblogs home page
    (a warm-up request so the session can pick up cookies) and then requests
    the blog index page. Titles and links are read from the ``<a>`` elements
    inside ``div`` elements whose class is ``postTitle``.

    Side effects:
        Rebinds the module-level globals ``title_blog`` and ``url_blog`` to
        the extracted lists and prints the resulting dict.

    Returns:
        dict: post title -> post URL, paired by position. When two posts
        share a title, the later one overwrites the earlier one.

    Raises:
        requests.HTTPError: if the blog index answers with an error status.
        requests.Timeout: if either request exceeds the timeout.
    """
    global url_blog
    global title_blog
    url_get_cookies = 'https://www.cnblogs.com/'
    blog_url = "https://www.cnblogs.com/lcyzblog/"
    header = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'}
    # Seconds to wait for the server before giving up; without a timeout a
    # stalled connection would block the script indefinitely.
    timeout = 10

    # Warm-up request: its response body is not used, only the cookies the
    # session keeps from it.
    get_cookie.get(url=url_get_cookies, headers=header, timeout=timeout)

    # Send the same browser user-agent on the page that is actually parsed,
    # and fail loudly instead of parsing an error page as if it were the index.
    html_blog = get_cookie.get(blog_url, headers=header, timeout=timeout)
    html_blog.raise_for_status()
    myblog_html = etree.HTML(html_blog.text)

    # get_myblog_title selects the post titles, get_myblog_url the post links.
    get_myblog_title = "//div[@class='postTitle']/a/text()"
    get_myblog_url = "//div[@class='postTitle']/a/@href"
    title_blog = myblog_html.xpath(get_myblog_title)
    url_blog = myblog_html.xpath(get_myblog_url)

    # NOTE(review): zip pairs titles and links by position, which assumes each
    # link yields exactly one text node - confirm against the page markup.
    blog_main = dict(zip(title_blog, url_blog))
    print(blog_main)
    return blog_main
def _safe_filename(title):
    """Return *title* with path separators and reserved characters replaced by ``_``."""
    unsafe = '\\/:*?"<>|'
    return title.translate(str.maketrans(unsafe, "_" * len(unsafe)))


def get_main(text_note, blog_index=None):
    """Download one article, print its text and optionally save it to disk.

    Args:
        text_note: Title of the article; used as the lookup key and as the
            base name of the saved ``.txt`` file.
        blog_index: Optional dict mapping titles to URLs. When omitted, the
            module-level ``main_is_blog`` dict is used, as before.

    Returns:
        list: the text nodes found under ``div`` elements whose class is
        ``postBody``. The list is returned whether or not the article was
        saved (previously the save branch returned ``None``).

    Raises:
        KeyError: if ``text_note`` is not a key of the title -> URL dict.
        NameError: if ``blog_index`` is omitted and ``main_is_blog`` has not
            been defined at module level.
        requests.HTTPError: if the article page answers with an error status.
    """
    if blog_index is None:
        # Fall back to the dict the script entry point stores in this global.
        blog_index = main_is_blog

    # Use the title to look up the matching article URL.
    main_blog_url = blog_index[text_note]
    # Bounded wait so a stalled connection cannot hang the script, and an
    # explicit status check so an error page is not saved as the article.
    main_blog_html = get_cookie.get(main_blog_url, timeout=10)
    main_blog_html.raise_for_status()

    blog_main_html = etree.HTML(main_blog_html.text)
    main_blog_xpath = "//div[@class='postBody']//text()"
    main_blog_text = blog_main_html.xpath(main_blog_xpath)
    print(main_blog_text)

    answer = input("是否需要保存文章?(yes/no)到本地: ")
    # Tolerate surrounding whitespace and upper-case input such as "Yes ".
    if answer.strip().lower() == "yes":
        # Titles may contain characters such as "/" or ":" that are not valid
        # in file names; replace them so the file lands in the current directory.
        file_path = "./" + _safe_filename(text_note) + ".txt"
        with open(file_path, 'w', encoding='utf-8') as fp:
            fp.writelines(main_blog_text)
    return main_blog_text
if __name__ == '__main__':
    # Collect the blog's post titles and URLs. get_main reads this dict
    # through the module-level name main_is_blog, so the name must not change.
    main_is_blog = get_titles_and_blogurl()
    note = input("请输入查看的文章:")
    if note not in main_is_blog:
        # Exit with a readable message instead of a KeyError traceback when
        # the entered title does not match any scraped post.
        raise SystemExit("未找到文章: " + note)
    get_main(text_note=note)