import requests
from bs4 import BeautifulSoup
def content(url, timeout=10):
    """Download a page and return the text of its ``<div id="content">``.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the caller forever.

    Returns:
        The text of the ``div`` element whose ``id`` is ``content``.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
        ValueError: If the page contains no ``<div id="content">``.
    """
    # Send a browser-like User-Agent with the request.
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/103.0.5060.134 Safari/537.36 Edg/103.0.1264.77'
        )
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page.
    response.raise_for_status()
    # Force UTF-8 decoding rather than relying on the guessed encoding.
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, "lxml")
    container = soup.find("div", id="content")
    if container is None:
        # find() returns None when nothing matches; report that explicitly
        # instead of failing later with an AttributeError on `.text`.
        raise ValueError('no <div id="content"> found at ' + str(url))
    return container.text
if __name__ == '__main__':
    # Script entry point: read the index page, follow every link inside
    # <div id="list">, and append each linked page's text to one file.
    target = 'xxxxxxxxxxxx'  # Index page URL; fill in yourself (not published).
    response = requests.get(target, timeout=10)
    # Stop early on an HTTP error instead of parsing an error page.
    response.raise_for_status()
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, "lxml")
    listing = soup.find("div", id="list")
    if listing is None:
        raise SystemExit('no <div id="list"> found at ' + target)
    # Keep only anchors that actually carry an href; an <a> without one
    # would yield None and break the URL concatenation below.
    urls = [link["href"] for link in listing.find_all("a", href=True)]
    # NOTE: link titles are not used as file names here. Windows forbids the
    # characters ? * : < > \ / | in file names, so titles would need to be
    # sanitized before switching to one output file per page.
    # Open the output once in append mode rather than once per page.
    with open('xxxxxx.txt', "a", encoding='utf-8') as f:
        for href in urls:
            # Site base URL prefix; fill in yourself (not published).
            url = 'xxxxxxxxxxx' + href
            f.write(content(url))
    print("下载成功")
# Scraping a novel with Python (continued from the previous post).
# First published 2022-08-08 16:59:05.