# A simple Python crawler that downloads a novel from the "笔趣阁" (Biquge) site.
import os
from urllib.parse import urljoin

import requests
from lxml import etree
def title_info(url, headers):
    """Fetch the novel's index page and pass its chapter links to get_info.

    Args:
        url: URL of the index (table of contents) page.
        headers: HTTP headers sent with the request.

    Returns:
        None. All downloading is done by ``get_info``.
    """
    # A timeout keeps an unresponsive server from hanging the script forever.
    response = requests.get(url=url, headers=headers, timeout=30)
    tree = etree.HTML(response.text)

    # Text of every <a> that carries a style attribute; get_info only uses
    # the number of entries as its starting chapter number.
    title = tree.xpath('//a[@style]/text()')
    # href of every link found inside a <dd> element.
    href = tree.xpath('//dd/a/@href')

    get_info(title, href, url)
def get_info(title, href, url, request_headers=None):
    """Download each chapter page and save its text to a numbered file.

    Chapters are written to ``./校花的贴身高手/第<N>章.txt``. ``N`` starts at
    ``len(title)`` and counts down by one for each chapter link visited.

    Args:
        title: Sequence of chapter titles; only its length is used.
        href: Sequence of chapter hrefs taken from the index page. The first
            occurrence of ``'/0_671/4962666.html'`` is skipped, and hrefs of
            12 characters or fewer are ignored. The sequence is not modified.
        url: URL of the index page; hrefs are resolved against it.
        request_headers: Optional HTTP headers for the chapter requests.
            When omitted, the module-level ``headers`` is used if it exists.

    Returns:
        None.
    """
    if request_headers is None:
        # The script entry point defines ``headers`` at module level; looking
        # it up this way avoids a NameError when the module is imported.
        request_headers = globals().get("headers")

    excluded_link = '/0_671/4962666.html'
    output_dir = "./校花的贴身高手"

    # Work on a copy so the caller's list is left untouched, and tolerate an
    # index page that does not contain the excluded link.
    links = list(href)
    if excluded_link in links:
        links.remove(excluded_link)

    page = len(title)
    for link in links:
        # presumably filters out short, non-chapter links — verify against the site
        if len(link) <= 12:
            continue

        chapter_url = urljoin(url, link)
        response = requests.get(
            url=chapter_url, headers=request_headers, timeout=30
        )
        tree = etree.HTML(response.text)
        article = tree.xpath('//div[@id="content"]/text()')

        # Nothing is written when the page yields no content text.
        if article:
            print("正在下载第{}章".format(page))
            os.makedirs(output_dir, exist_ok=True)
            chapter_path = "./校花的贴身高手/第{}章.txt".format(page)
            with open(chapter_path, 'w', encoding="utf-8") as f:
                # Every text fragment is written on its own line, each
                # preceded by a newline.
                f.write("".join("\n" + piece for piece in article))
            print("第{}章下载完成".format(page))

        # NOTE(review): the original indentation was lost, so the exact scope
        # of this decrement is an assumption; counting down once per visited
        # link keeps chapter numbers aligned with list position — confirm.
        page -= 1
if __name__ == '__main__':
    # Index page of the novel to download.
    url = "https://www.xs.la/0_671/"
    # ``headers`` is kept at module level because get_info reads it as a global.
    # The header name and value must not contain stray spaces; otherwise no
    # valid User-Agent header is sent.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/72.0.3626.119 Safari/537.36"
        ),
    }
    title_info(url, headers)