Python爬虫_小说爬取(差差差)

最新推荐文章于 2022-04-10 15:25:16 发布

Corgy.

最新推荐文章于 2022-04-10 15:25:16 发布

阅读量1.5k

点赞数

分类专栏： python 文章标签：其他 python xpath

本文链接：https://blog.csdn.net/small_dog_/article/details/106128836

版权

python 专栏收录该内容

12 篇文章 0 订阅

订阅专栏

前提准备

安装Python以及必要的模块（requests，以及提供XPath解析功能的lxml）

新笔趣阁

流程代码（弟弟代码）

import requests
import time
import sys
from lxml import etree



# Fetch the HTML content of the page at the given URL
def get_content(url):
    """Download a page and return its body decoded as UTF-8.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response text on success, or the sentinel string " ERROR "
        when the request fails with any ordinary exception.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36',
    }
    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        r = requests.get(url=url, headers=headers, timeout=10)
        r.encoding = 'utf-8'
        content = r.text
        print(content)
        return content
    except Exception as exc:
        # Deliberately not a bare ``except``: Ctrl-C (KeyboardInterrupt) and
        # SystemExit must still be able to stop the script.
        print("Error '%s' happened on line %d" % (exc, exc.__traceback__.tb_lineno))
        return " ERROR "


# 解析得到的内容 今天主要学习方面
def get_analysis(content):
    """Extract the chapter text from a page and append it to the output file.

    Selects the text nodes that are direct children of the element whose id
    is "content", joins them with newlines and hands the result to ``save``.
    When no such text is found (for example because the download failed),
    nothing is written, so the output file does not collect blank lines.

    Args:
        content: HTML source of a chapter page, as a string.
    """
    print(type(content))
    ele = etree.HTML(content)
    # Guard in case the parser hands back None for input it cannot build a
    # tree from (e.g. blank text) instead of raising.
    result = ele.xpath("//div[@id='content']/text()") if ele is not None else []
    print(result)
    if not result:
        print("No chapter text found; nothing saved")
        return
    chapter_text = "\n".join(result)
    print(chapter_text)
    save(chapter_text)


# 写入文档
def save(finishedProduct):
    """Append text to the novel's output file, followed by a newline.

    Args:
        finishedProduct: The chapter text to append.
    """
    filename = "元尊.txt"
    # The context manager guarantees the file is flushed and closed; the
    # original ``f.close`` lacked parentheses and therefore never ran.
    with open(filename, "a", encoding='utf-8') as f:
        f.write(finishedProduct + '\n')


# 主程序
def main():
    """Fetch one fixed chapter page, save its text, and report elapsed time."""
    began = time.time()
    # The scrape is limited to this single chapter URL.
    page = get_content('https://www.xsbiquge.com/78_78513/108078.html')
    get_analysis(page)
    elapsed = time.time() - began
    print('程序用时', elapsed)

# Run the scraper only when this file is executed as a script, so that
# importing the module does not trigger a network request and a file write.
if __name__ == "__main__":
    main()