提取网页网站-lxml结合xpath语法实例二（数据提取）

最新推荐文章于 2023-04-06 18:06:00 发布

weixin_30321449

最新推荐文章于 2023-04-06 18:06:00 发布

阅读量115

点赞数

原文链接：http://www.cnblogs.com/lcyzblog/p/11305985.html

版权

这一篇文章爬取我博客内容并且按照输入提示保存
有关参考
爬取文章标题：https://www.cnblogs.com/lcyzblog/p/11275188.html
源码分析：文章正文存放于 class 属性为 postBody 的 div 标签下

from lxml import etree
import requests

# Module-level requests session shared by the functions below; every HTTP
# request in this script goes through it so cookies set by one response are
# sent with later requests.
get_cookie = requests.session()

def get_titles_and_blogurl():
    """Fetch the blog index page and map each post title to its URL.

    The site root is requested first through the shared ``get_cookie``
    session, then the blog index page is downloaded and parsed with lxml.

    Side effects:
        Sets the module-level globals ``title_blog`` (list of title strings)
        and ``url_blog`` (list of href strings), and prints the resulting
        mapping.

    Returns:
        dict: post title -> post URL, built by pairing the two XPath result
        lists in document order.

    Raises:
        requests.HTTPError: if the blog index page responds with an error
            status.
        requests.RequestException: on connection problems or timeout.
    """
    global url_blog
    global title_blog
    url_get_cookies = 'https://www.cnblogs.com/'
    blog_url = "https://www.cnblogs.com/lcyzblog/"
    header = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'}
    # Without an explicit timeout requests waits indefinitely on a stalled
    # connection.
    request_timeout = 10
    # Warm-up request: lets the shared session collect the site's cookies.
    get_cookie.get(url=url_get_cookies, headers=header, timeout=request_timeout)
    # Send the same browser user-agent on the real request as well, not only
    # on the warm-up request.
    html_blog = get_cookie.get(blog_url, headers=header, timeout=request_timeout)
    # Fail loudly instead of parsing an error page as if it were the index.
    html_blog.raise_for_status()
    myblog_html = etree.HTML(html_blog.text)
    # XPath for the post titles (text of the link inside each postTitle div).
    get_myblog_title = "//div[@class='postTitle']/a/text()"
    # XPath for the post links (href of the same anchor).
    get_myblog_url = "//div[@class='postTitle']/a/@href"
    title_blog = myblog_html.xpath(get_myblog_title)
    url_blog = myblog_html.xpath(get_myblog_url)
    blog_main = dict(zip(title_blog, url_blog))
    print(blog_main)
    return blog_main

def get_main(text_note):
    """Download one article by title, print its text and optionally save it.

    The title is looked up in the module-level ``main_is_blog`` mapping
    (title -> URL), the page is fetched through the shared ``get_cookie``
    session, and every text node under the ``postBody`` div is collected.
    The user is then asked on stdin whether to save the text.

    Args:
        text_note: Article title; must be a key of ``main_is_blog``.

    Returns:
        The list of extracted text fragments when the user declines to save;
        ``None`` when the text was written to ``./<title>.txt``.

    Raises:
        KeyError: if ``text_note`` is not a known article title.
        requests.HTTPError: if the article page responds with an error status.
        requests.RequestException: on connection problems or timeout.
    """
    import re

    # Use the title -> URL dictionary to find the article's address.
    try:
        main_blog_url = main_is_blog[text_note]
    except KeyError:
        raise KeyError(
            "no article titled {!r}; available titles: {}".format(
                text_note, list(main_is_blog))
        ) from None

    # Bound the wait and refuse to treat an error page as article content.
    main_blog_html = get_cookie.get(main_blog_url, timeout=10)
    main_blog_html.raise_for_status()

    blog_main_html = etree.HTML(main_blog_html.text)
    main_blog_xpath = "//div[@class='postBody']//text()"
    main_blog_text = blog_main_html.xpath(main_blog_xpath)
    print(main_blog_text)
    numbers = input("是否需要保存文章?(yes/no)到本地:  ")

    # Tolerate stray whitespace and capitalisation in the answer.
    if numbers.strip().lower() == "yes":
        # Titles may contain characters that are illegal in file names
        # (path separators, colons, ...); replace them so open() cannot fail
        # or write outside the current directory.
        safe_name = re.sub(r'[\\/:*?"<>|]', "_", text_note)
        with open("./" + safe_name + ".txt", 'w', encoding='utf-8') as fp:
            fp.writelines(main_blog_text)
        return None

    return main_blog_text



if __name__ == '__main__':
    # Build the title -> URL mapping first; get_main() reads it through the
    # module-level name main_is_blog, so it must be bound at this level.
    main_is_blog = get_titles_and_blogurl()
    # Ask which article to view and hand the title straight to get_main().
    get_main(text_note=input("请输入查看的文章:"))

转载于:https://www.cnblogs.com/lcyzblog/p/11305985.html