# 利用下班空闲时间看了三天视频(纯小白),写了个抓取小说网站的爬虫脚本,目前代码不完善,需去除章节里的 <p> 标签,百度了半天没找到解决方案,有大佬可以给完善一下,在此谢过!
from multiprocessing import get_context
from tkinter import W
from turtle import title
import requests
from bs4 import BeautifulSoup
import lxml
if __name__ == '__main__':
    # Scrape the chapter index of one novel on b.faloo.com, download every
    # chapter, and append its plain text to ./siheyuan.txt.
    url = 'https://b.faloo.com/1190629.html'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
    }
    page_text = requests.get(url=url, headers=headers).content
    soup = BeautifulSoup(page_text, 'lxml')
    # Each '.DivTd' cell on the index page holds one chapter link.
    li_list = soup.select('.DivTd')
    # 'with' guarantees the file is closed even if a later request raises.
    with open('./siheyuan.txt', 'w', encoding='utf-8') as fp:
        for div_td in li_list:
            a_tag = div_td.find('a')
            if a_tag is None or not a_tag.get('href'):
                continue  # cell without a chapter link — skip it
            title = a_tag.get_text(strip=True)
            href = a_tag['href']
            # NOTE(review): faloo hrefs look protocol-relative ('//b.faloo.com/...')
            # — confirm against the live page.
            chapter_url = 'https:' + href if href.startswith('//') else href
            detail_page = requests.get(url=chapter_url, headers=headers).content
            detail_soup = BeautifulSoup(detail_page, 'lxml')
            # NOTE(review): container class assumed from the site's layout —
            # adjust the selector if the chapter body lives elsewhere.
            content_div = detail_soup.find('div', class_='noveContent')
            if content_div is None:
                continue  # page layout changed or chapter is paywalled
            # get_text() keeps only the text nodes, which is exactly how you
            # drop the <p> wrapper tags: no tag markup survives.
            content = content_div.get_text('\n', strip=True)
            fp.write(title + '\n' + content + '\n\n')
            print(title, 'done')