Scraping a Few Chinese Comics with Python

1. Preface

To be honest, this is probably the hardest scraper I have written so far. Everything I scraped before were sites serving plain static resources; this site's anti-scraping measures are admittedly still low-level, but for a beginner like me they were fairly challenging.

2. The Anti-Scraping Process

With that, we can locate the chapter list directly using XPath:

def getLinks(html):
    chapter_link = []
    chapter_title = []
    parse = parsel.Selector(html)
    links = parse.xpath('//div[@class="tab-content tab-content-selected zj_list_con autoHeight"]/ul[@class="list_con_li autoHeight"]/li/a/@href').getall()
    titles = parse.xpath('//div[@class="tab-content tab-content-selected zj_list_con autoHeight"]/ul[@class="list_con_li autoHeight"]/li/a/span[@class="list_con_zj"]/text()').getall()
    for link in links:
        chapter_link.insert(0, link)
    for title in titles:
        chapter_title.insert(0, title)
    return chapter_link, chapter_title


Just note that the site lists the chapters in descending order, so we need to reverse them as we collect them; instead of simply calling append, we prepend each item with insert(0, ...).
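Here is a tiny sketch (with made-up hrefs) of why the prepend trick works; it gives the same result as reversing the scraped list afterwards:

# Hypothetical hrefs, newest chapter first, as the site lists them
links = ["/chapter3", "/chapter2", "/chapter1"]

ordered = []
for link in links:
    ordered.insert(0, link)          # prepend, as getLinks() does

# Prepending one by one is equivalent to reversing afterwards
assert ordered == list(reversed(links)) == ["/chapter1", "/chapter2", "/chapter3"]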

3. Source Code

My code:

import requests
import parsel
import pypinyin
from bs4 import BeautifulSoup
import re
import os
import time

# Pretend to be a browser: set the request headers
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36",
}

# Fetch a page and return its decoded HTML
def askUrl(url):
    response = requests.get(url, headers=headers)
    html = response.content.decode('utf-8')
    return html

# Collect every chapter link and chapter title
def getLinks(html):
    chapter_link = []
    chapter_title = []
    parse = parsel.Selector(html)
    links = parse.xpath('//div[@class="tab-content tab-content-selected zj_list_con autoHeight"]/ul[@class="list_con_li autoHeight"]/li/a/@href').getall()
    titles = parse.xpath('//div[@class="tab-content tab-content-selected zj_list_con autoHeight"]/ul[@class="list_con_li autoHeight"]/li/a/span[@class="list_con_zj"]/text()').getall()
    # The site lists chapters newest first, so prepend to reverse the order
    for link in links:
        chapter_link.insert(0, link)
    for title in titles:
        chapter_title.insert(0, title)
    return chapter_link, chapter_title

# Rebuild every image URL of one chapter from the packed <script> data
def getImgs(link):
    pic_url = []
    response = requests.get(link, headers=headers)
    html = BeautifulSoup(response.text, 'lxml')
    script_info = html.script
    one = re.findall(r"\|(\d{4})\|", str(script_info))[0]      # 4-digit directory id
    two = re.findall(r"\|(\d{5})\|", str(script_info))[0]      # 5-digit directory id
    threes = re.findall(r'\d{13,14}', str(script_info))        # timestamp file names
    # Pad 13-digit names to 14 digits so numeric sorting puts the pages in order...
    for i, three in enumerate(threes):
        if len(three) == 13:
            threes[i] = three + '0'
    threes = sorted(threes, key=lambda x: int(x))
    # ...then strip the padding again when assembling each URL
    for three in threes:
        if three[-1] == '0':
            pic_url.append("https://images.dmzj.com/img/chapterpic/" + one + "/" + two + "/" + three[:-1] + ".jpg")
        else:
            pic_url.append("https://images.dmzj.com/img/chapterpic/" + one + "/" + two + "/" + three + ".jpg")
    return pic_url

# Download every image of one chapter
def download(url, links, dir_name):
    # The image server checks the Referer header, so send the chapter URL
    headers1 = {
        'Referer': url,
    }
    i = 1
    for link in links:
        pic_name = '%03d.jpg' % i
        new_dir_name = os.path.join(dir_name, pic_name)
        response = requests.get(link, headers=headers1)
        with open(new_dir_name, 'wb') as f:
            f.write(response.content)
        print(pic_name + "下载完成")
        i += 1

# main
def main():
    manhuas = input("请输入你要下载的漫画名:")
    dir_name = os.path.join(r'D:\漫画', manhuas)
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)
    # Convert the Chinese title to pinyin to build the info-page URL
    pinyin_list = pypinyin.pinyin(manhuas, style=pypinyin.NORMAL)
    name = ''
    for syllable in pinyin_list:
        name = name + ''.join(syllable)
    url = "https://www.dmzj.com/info/" + name + ".html"
    html = askUrl(url)
    links, names = getLinks(html)
    for i, link in enumerate(links):
        chapter_dir = os.path.join(dir_name, str(names[i]))
        if not os.path.exists(chapter_dir):
            os.makedirs(chapter_dir)
        print("开始下载:" + names[i])
        imglinks = getImgs(link)
        download(link, imglinks, chapter_dir)
        print(names[i] + "下载完毕")
        print("休息一会儿,稍后继续下载下一章")
        time.sleep(10)
        print("————————————————————————————————————————————————————————————————————————————————")
    print(manhuas + "已经完全下载完毕")

# Entry point
if __name__ == '__main__':
    main()
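One detail of main() worth calling out: the info-page URL is built by converting the Chinese title to pinyin with pypinyin. Below is a minimal standalone sketch of just that step, using a hypothetical helper name title_to_info_url; it only works for comics whose dmzj slug is plain pinyin, like 妖神记 in the second script below.

import pypinyin

def title_to_info_url(title):
    # pypinyin.pinyin with style=NORMAL returns a nested list such as [['yao'], ['shen'], ['ji']]
    syllables = pypinyin.pinyin(title, style=pypinyin.NORMAL)
    slug = ''.join(s[0] for s in syllables)
    return "https://www.dmzj.com/info/" + slug + ".html"

print(title_to_info_url("妖神记"))  # https://www.dmzj.com/info/yaoshenji.html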



The other guy's code:

import requests
import os
import re
from bs4 import BeautifulSoup
from contextlib import closing
from tqdm import tqdm
import time

# Create the save directory
save_dir = '妖神记'
if save_dir not in os.listdir('./'):
    os.mkdir(save_dir)
target_url = "https://www.dmzj.com/info/yaoshenji.html"

# Get the chapter links and chapter names, reversed into oldest-first order
r = requests.get(url=target_url)
bs = BeautifulSoup(r.text, 'lxml')
list_con_li = bs.find('ul', class_="list_con_li")
cartoon_list = list_con_li.find_all('a')
chapter_names = []
chapter_urls = []
for cartoon in cartoon_list:
    href = cartoon.get('href')
    name = cartoon.text
    chapter_names.insert(0, name)
    chapter_urls.insert(0, href)

# Download the comic chapter by chapter
for i, url in enumerate(tqdm(chapter_urls)):
    download_header = {
        'Referer': url
    }
    name = chapter_names[i]
    # Strip '.' from the chapter name so it is a valid directory name
    while '.' in name:
        name = name.replace('.', '')
    chapter_save_dir = os.path.join(save_dir, name)
    if name not in os.listdir(save_dir):
        os.mkdir(chapter_save_dir)
    r = requests.get(url=url)
    html = BeautifulSoup(r.text, 'lxml')
    script_info = html.script
    pics = re.findall(r'\d{13,14}', str(script_info))
    # Pad 13-digit image names to 14 digits so they sort into page order
    for j, pic in enumerate(pics):
        if len(pic) == 13:
            pics[j] = pic + '0'
    pics = sorted(pics, key=lambda x: int(x))
    chapterpic_hou = re.findall(r'\|(\d{5})\|', str(script_info))[0]
    chapterpic_qian = re.findall(r'\|(\d{4})\|', str(script_info))[0]
    for idx, pic in enumerate(pics):
        # The image-host URL was truncated ("c...") in the original post;
        # the full path appears in the first script above.
        if pic[-1] == '0':
            url = 'https://images.dmzj.com/img/c...' + chapterpic_qian + '/' + chapterpic_hou + '/' + pic[:-1] + '.jpg'
        else:
            url = 'https://images.dmzj.com/img/c...' + chapterpic_qian + '/' + chapterpic_hou + '/' + pic + '.jpg'
        pic_name = '%03d.jpg' % (idx + 1)
        pic_save_path = os.path.join(chapter_save_dir, pic_name)
        # Stream each image and write it to disk in 1 KB chunks
        with closing(requests.get(url, headers=download_header, stream=True)) as response:
            chunk_size = 1024
            content_size = int(response.headers['content-length'])
            if response.status_code == 200:
                with open(pic_save_path, "wb") as file:
                    for data in response.iter_content(chunk_size=chunk_size):
                        file.write(data)
            else:
                print('链接异常')
    time.sleep(10)
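The trickiest part of both scripts is rebuilding the image URLs from the numbers packed into the page's first <script> tag: a 4-digit and a 5-digit directory id plus 13-or-14-digit timestamps that serve as file names. Here is a minimal sketch of that logic with made-up sample data standing in for the real packed script, which is much longer:

import re

# Made-up stand-in for str(script_info)
script_info = "|1234|56789|14556234065472|1455623407000|14556234085471|"

qian = re.findall(r"\|(\d{4})\|", script_info)[0]   # 4-digit directory id
hou = re.findall(r"\|(\d{5})\|", script_info)[0]    # 5-digit directory id
pics = re.findall(r"\d{13,14}", script_info)        # timestamp file names

# Pad 13-digit names to 14 digits so numeric sorting puts the pages in order...
pics = [p + "0" if len(p) == 13 else p for p in pics]
pics.sort(key=int)

# ...then strip the padding again when assembling each image URL
# (like the scripts above, this assumes genuine 14-digit names never end in 0)
for p in pics:
    name = p[:-1] if p.endswith("0") else p
    print("https://images.dmzj.com/img/chapterpic/%s/%s/%s.jpg" % (qian, hou, name))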
