网络爬虫爬取小说

最新推荐文章于 2024-05-10 03:56:25 发布

一辈子的拥抱

最新推荐文章于 2024-05-10 03:56:25 发布

阅读量694

点赞数

分类专栏： python 文章标签： python

本文链接：https://blog.csdn.net/weixin_50719427/article/details/124145529

版权

import requests
import re, os
from lxml import etree
# Request headers passed as `headers=` by get_info; carries a desktop Chrome
# User-Agent string.
ua = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'}
# Same base address that get_info prefixes to chapter links; presumably the
# novel's chapter index page -- confirm against the caller.
url = 'https://www.bswtan.com/0/424/'
# Change the working directory at import time, so relative paths used later
# resolve under this folder.
# NOTE(review): hard-coded Windows path; os.chdir raises OSError if the
# directory does not exist on the machine running the script.
os.chdir(r"C:\Users\Administrator\Desktop")

def get_info(url):
    """Fetch a chapter index page and return its chapter links.

    The page is requested with the module-level ``ua`` headers, decoded as
    UTF-8, and every ``<dd>`` under the element with ``id="list"`` is
    inspected for a link.

    Args:
        url: Address of the chapter index page to download.

    Returns:
        A list of ``{'title': ..., 'href': ...}`` dicts in page order, where
        ``title`` is the first text node of the link and ``href`` is the
        link target as an absolute URL. ``<dd>`` elements that carry no
        link text or no ``href`` are skipped.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urljoin

    r = requests.get(url, headers=ua)
    # Force UTF-8 instead of relying on the encoding requests guesses.
    r.encoding = 'utf-8'
    html = etree.HTML(r.text)

    get_info_list = []
    for dd in html.xpath('//*[@id="list"]/dl/dd'):
        titles = dd.xpath('a/text()')
        hrefs = dd.xpath('a/@href')
        if not titles or not hrefs:
            # No usable link in this entry; indexing [0] would raise.
            continue
        get_info_list.append({
            'title': titles[0],
            # Resolve against the URL the response actually came from, so
            # relative, root-relative and absolute hrefs all yield a valid
            # address (plain string concatenation only handled the first).
            'href': urljoin(r.url, hrefs[0]),
        })
    return get_info_list

def get_content(ge