工具:python3
本文主要用python实现动漫之家的爬取
如果对模块的安装有疑问,请自行百度;如果代码格式有问题,也可以在下方回复。
与 Java 那篇一样,本文分为两部分:无框架实现和 Scrapy 框架实现。步骤和实现原理都是一样的,可以参考前篇。
代码不再详细注释
无框架
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from urllib.request import urlretrieve
from urllib.request import build_opener
from urllib.request import install_opener
import os
import time
def collect_image_entries(driver, index_url):
    """Walk every chapter linked from *index_url* and list its page images.

    Args:
        driver: A Selenium WebDriver instance used to load the pages.
        index_url: URL of the comic's chapter index page.

    Returns:
        A list of ``(title, image_url, page_label)`` tuples, one per
        ``<option>`` found in each chapter's page selector.
    """
    # NOTE(review): find_elements_by_xpath is the Selenium 3 spelling; it is
    # kept so the script still matches the Selenium version it was written
    # for. Confirm the installed version before upgrading.
    driver.get(index_url)
    chapter_links = driver.find_elements_by_xpath(
        "//div[@class='cartoon_online_border']/ul/li/a")
    # Read every href before navigating away: the link elements belong to the
    # index page and cannot be queried once another page has been loaded.
    chapter_urls = [link.get_attribute("href") for link in chapter_links]
    print(driver.title)

    entries = []
    for chapter_url in chapter_urls:
        print(chapter_url)
        driver.get(chapter_url)
        page_options = driver.find_elements_by_xpath(
            "//div[@class='btmBtnBox']/select/option")
        if not page_options:
            # No page selector on this page, so there is nothing to record.
            continue
        # The title is the same for every page of the chapter: look it up
        # once per chapter instead of once per page option.
        title = (
            driver.find_elements_by_xpath(
                "//div[@class='display_middle']/h1/a")[0].text
            + driver.find_elements_by_xpath(
                "//div[@class='display_middle']/span")[0].text
        )
        for option in page_options:
            entries.append(
                (title, 'https:' + option.get_attribute("value"), option.text))
    return entries


def image_target(root, title, img_url, page_label):
    """Return ``(folder, filename)`` for one image below *root*.

    The folder name is the part of *title* before the first ``-``, stripped
    of surrounding whitespace. The file name is *page_label* plus the text
    after the last ``.`` of *img_url*. Paths are joined with ``/``, matching
    the ``D:/manhua`` style root used by the script.
    """
    # The separator between root and folder name is required; without it the
    # images end up in a sibling directory of root instead of inside it.
    folder = root + "/" + title.split('-')[0].strip()
    extension = img_url.split('.')[-1]
    return folder, folder + "/" + page_label + "." + extension


def download_images(entries, root):
    """Download every image in *entries* into per-title folders under *root*.

    Args:
        entries: Iterable of ``(title, image_url, page_label)`` tuples.
        root: Directory that receives one sub-folder per title.

    Files that already exist on disk are skipped, so an interrupted run can
    be restarted without fetching everything again.
    """
    entries = list(entries)
    if not os.path.exists(root):
        os.makedirs(root)
    else:
        print(root+"已创建")

    # The opener does not depend on the image, so install it once up front
    # rather than rebuilding it for every download.
    opener = build_opener()
    opener.addheaders = [('User-Agent','Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1941.0 Safari/537.36'),("Referer","https://manhua.dmzj.com/")]
    install_opener(opener)

    total = len(entries)
    for done, (title, img_url, page_label) in enumerate(entries, start=1):
        folder, filename = image_target(root, title, img_url, page_label)
        if not os.path.exists(folder):
            os.makedirs(folder)
        else:
            print(folder+"已创建")

        # Check the target path itself; comparing the full path against the
        # names in the current working directory never matches.
        if not os.path.exists(filename):
            urlretrieve(url=img_url, filename=filename)
            # Pause between requests only when a request was actually made.
            time.sleep(1)
        else:
            print(filename+"已下载")
        # Both counters are ints, so format them instead of concatenating.
        print("{}/{}".format(done, total))
    print("下载完成")


def main():
    """Scrape the comic's image list with headless Chrome, then download it."""
    chrome_options = Options()
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--headless')
    abspath = os.path.abspath(r"D:\chromedriver_win32\chromedriver.exe")
    # NOTE(review): executable_path / chrome_options are the Selenium 3
    # keyword names; confirm against the installed Selenium version.
    dr = webdriver.Chrome(executable_path=abspath,chrome_options=chrome_options)
    try:
        entries = collect_image_entries(dr, "https://manhua.dmzj.com/shiling")
    finally:
        # Always shut the browser down, even when scraping fails part-way.
        dr.quit()
    download_images(entries, "D:/manhua")


if __name__ == "__main__":
    main()
Scrapy框架
下期见