dm5爬虫(selenium)

暗碳

已于 2024-04-23 21:24:48 修改

阅读量627

点赞数

文章标签：爬虫 selenium python

于 2023-05-03 18:03:54 首次发布

本文链接：https://blog.csdn.net/qq_41490154/article/details/130475425

版权

因为仅用 requests 请求获取不到图片链接，所以改用了 selenium。

在这里插入图片描述



import requests 
import os
from lxml import etree
import re
from selenium import webdriver

# Index page of the manga to download.
url = 'https://www.dm5.com/manhua-qiangweishaonv/'

# Fetch the index page. The timeout keeps the script from hanging forever on
# a stalled connection, and raise_for_status() stops on an HTTP error instead
# of silently parsing an error page.
response = requests.get(url, timeout=30)
print(response)
response.raise_for_status()
html = etree.HTML(response.text)

# These absolute, position-based XPaths depend on the exact page layout; if
# the site changes its markup they will return empty lists.
manga_title = html.xpath('/html/body/div[3]/section/div[2]/div[2]/p[1]/text()')  # manga title
chapter_title = html.xpath('/html/body/div[4]/div/div[2]/div[1]/div[2]/ul/li/a/text()')  # chapter titles
chapter_links = html.xpath('/html/body/div[4]/div/div[2]/div[1]/div[2]/ul/li/a/@href')  # chapter hrefs
chapter_img_count = html.xpath('/html/body/div[4]/div/div[2]/div[1]/div[2]/ul/li/a/span/text()')  # per-chapter image counts

# Fail with a clear message rather than an opaque IndexError on manga_title[0].
if not manga_title:
    raise RuntimeError('manga title not found at the expected XPath; the page layout may have changed')

# Keep only the digits of each count string and convert it to an int.
chapter_img_count = [int(re.sub(r'\D', '', count)) for count in chapter_img_count]
# The hrefs are concatenated onto the site origin to form full chapter URLs.
chapter_links = ['https://www.dm5.com' + link for link in chapter_links]
# Strip spaces from the manga title and from every chapter title.
manga_title = manga_title[0].replace(' ', '')
chapter_title = [title.replace(' ', '') for title in chapter_title]
# Drop titles that became empty after stripping.
# NOTE(review): only the titles are filtered; links and counts are not, so
# the three lists are assumed to stay index-aligned -- confirm against the page.
chapter_title = list(filter(None, chapter_title))

print(manga_title)
print(chapter_title)
print(chapter_links)
print(chapter_img_count)


# Download every image page of the selected chapters into
# d:/manga/<manga title>/<chapter title>/<page index>.jpg.
#
# chapter_count is the exclusive end index and x the start index into the
# chapter lists; set chapter_count to len(chapter_links) to go through the
# whole list.
chapter_count = 2
x = 1
while x < chapter_count:

    print('链接', chapter_links[x])
    print('章节名', chapter_title[x])

    print('图片数', chapter_img_count[x])
    xx = 0

    # Headers sent with each image request; the referer is the chapter page.
    header = {
        'referer': chapter_links[x],
        'sec-ch-ua': '"Chromium";v="112", "Google Chrome";v="112", "Not:A-Brand";v="99"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'image',
        'sec-fetch-mode': 'no-cors',
        'sec-fetch-site': 'cross-site',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36',
    }

    # Output directory for this chapter (same for every page of the chapter).
    chapter_dir = f'd:/manga/{manga_title}/{chapter_title[x]}'

    while xx < chapter_img_count[x]:  # one iteration per image page

        chapter_img_url = chapter_links[x] + '#ipg{}'.format(xx)  # URL of the current image page
        print(chapter_img_url)

        # A fresh browser is started per page, as before, but it is now
        # always shut down afterwards; previously every page left a Chrome
        # process running.
        # NOTE(review): a single shared driver would be faster, but the page
        # URLs differ only in their '#ipg' fragment, which presumably would
        # not trigger a reload -- confirm before changing.
        driver = webdriver.Chrome()
        try:
            driver.get(chapter_img_url)
            chapter_response = driver.page_source  # page source as seen by the browser
        finally:
            driver.quit()

        chapter_html = etree.HTML(chapter_response)
        img_url = chapter_html.xpath('/html/body/div[6]/div/img/@src')  # image URL(s) on the page
        print(img_url)

        # Timeout avoids hanging on a stalled download; raise_for_status()
        # prevents an HTTP error body from being saved as a .jpg file.
        img_response = requests.get(img_url[0], headers=header, timeout=30)
        img_response.raise_for_status()

        # Create the chapter directory if needed (no error when it exists).
        os.makedirs(chapter_dir, exist_ok=True)

        with open(f'{chapter_dir}/{xx}.jpg', 'wb') as f:
            f.write(img_response.content)
        xx += 1  # next image page

    x += 1