软件:PyCharm
首先安装 requests、lxml、beautifulsoup4(导入名为 bs4)这些包:
pip install lxml
pip install beautifulsoup4
pip install requests
UA伪装
# Request headers carrying a desktop Chrome User-Agent string (UA spoofing).
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
然后用BeautifulSoup定位到标签
全部代码如下:
from time import sleep
from urllib.parse import urljoin

import lxml
import requests
from bs4 import BeautifulSoup
# Index page of the novel; chapter links found on it are resolved against it.
url = 'https://www.ibswtan.com/0/425/'
# Browser-like headers (UA spoofing) sent with every request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Host": "www.ibswtan.com",
    "Referer": "https://www.ibswtan.com/0/425/",
}


def _fetch_soup(page_url):
    """Download *page_url* and return it parsed with the lxml parser.

    The response is decoded as UTF-8. Raises ``requests.RequestException``
    on connection errors, timeouts, or an HTTP error status.
    """
    # A timeout keeps one stalled connection from hanging the whole run.
    resp = requests.get(url=page_url, headers=headers, timeout=10)
    # Fail loudly instead of parsing an error page as if it were content.
    resp.raise_for_status()
    resp.encoding = 'utf-8'
    return BeautifulSoup(resp.text, 'lxml')


soup = _fetch_soup(url)
# Each <dd> under #list > dl is expected to hold one chapter link.
list_title = soup.select('#list > dl > dd')

# The context manager closes (and flushes) the output file even if a
# chapter download raises part-way through.
with open("斗破苍穹.txt", 'w', encoding='utf-8') as fp:
    for entry in list_title:
        anchor = entry.a
        # Skip entries that carry no usable link instead of crashing.
        if anchor is None or not anchor.get('href'):
            continue
        # get_text() also works when the anchor has nested tags, where
        # .string would be None.
        title = anchor.get_text()
        # urljoin handles relative, root-relative and absolute hrefs alike.
        title_url = urljoin(url, anchor['href'])
        sleep(1)  # pause between requests to go easy on the server
        try:
            list_soup = _fetch_soup(title_url)
        except requests.RequestException as exc:
            # One bad chapter should not abort the whole download.
            print(title + " " + "爬取失败" + " " + str(exc))
            continue
        list_tag_id = list_soup.find('div', id='content')
        if list_tag_id is None:
            # Page layout differs from what is expected; nothing to write.
            print(title + " " + "爬取失败")
            continue
        neirong = list_tag_id.text
        fp.write(title + "\n" + neirong + "\n")
        print(title + " " + "爬取完成")