Scraping manga
- Sites that load everything at once: requests + bs4 is enough (see the minimal sketch below)
- Sites that load content dynamically: use requests + Selenium running headless (no visible browser window)
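A minimal sketch of the first, static case. The URL here is a placeholder, not a real target; the chapter-list id is the same one used in the full script further down.

import requests
from bs4 import BeautifulSoup

html = requests.get('https://example.com/comic/12345').text   # placeholder URL
soup = BeautifulSoup(html, 'lxml')
# every chapter link is already in the server-rendered HTML, so no browser is needed
for a in soup.select('#chapter-list-1 a'):
    print(a.get('title'), a.get('href'))

The full script below handles the second case, where each page image is rendered by JavaScript: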
import requests, os, time, re
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
# headless Chrome, used for the JavaScript-rendered chapter pages
chrome_opt = Options()
chrome_opt.add_argument('--headless')
chrome_opt.add_argument('--disable-gpu')
chrome_opt.add_argument('--window-size=1366,768')
driver = webdriver.Chrome(options=chrome_opt)
# the chapter list loads in one shot, so plain requests + bs4 is enough here
manhuaWebMap = 'https://www.xxxxxxxxxxxxxxxxxxxxxxxx.com'
res = requests.get(manhuaWebMap).text
soup = BeautifulSoup(res, 'lxml')
elems = soup.find(id="chapter-list-1").find_all('li')
for i in elems:
    mainTarge = i.find('a')
    manhuaOneMap = mainTarge.get('href')      # chapter URL (relative path)
    manhuaOneName = mainTarge.get('title')    # chapter title
    print(f'Downloading chapter {manhuaOneName}')
    manhuaFolderMap = fr'G:\xxxxx\咒术回战\{manhuaOneName}'
    os.makedirs(manhuaFolderMap, exist_ok=True)
    # open the chapter with Selenium; the page counter is rendered by JavaScript
    driver.get(f'https://www.manhuabei.com/{manhuaOneMap}')
    pageInfo = driver.find_element(By.XPATH, '//*[@id="images"]/p').text
    # assume the total page count is the last number in the counter text, e.g. '共36页'
    pageNumber = int(re.findall(r'\d+', pageInfo)[-1])
    for j in range(1, pageNumber + 1):
        driver.get(f'https://www.manhuabei.com/{manhuaOneMap}?p={j}')
        manhuaPageMap = driver.find_element(By.XPATH, '//*[@id="images"]/img').get_attribute('src')
        try:
            with open(os.path.join(manhuaFolderMap, f'{j:02d}.jpg'), 'wb') as f:
                f.write(requests.get(manhuaPageMap).content)
        except Exception:
            pass              # skip pages that fail to download
        finally:
            time.sleep(1)     # be polite: pause between page requests
    print('-------------------------------------------')
    time.sleep(3)             # pause between chapters
driver.quit()
print(len(elems))             # number of chapters found