Python crawler + Selenium: fully automated download of JS dynamically loaded comics
I only recently started learning Python, so the code may not be very polished; please bear with me!
Before scraping, we need some preparation. Many sites run anti-crawler checks; the usual countermeasure is to send headers and use proxies, but a single fixed header still carries real risk, so we want to randomize both:
1. Fetch proxies from a domestic free proxy site:
from fake_useragent import UserAgent
import random
import requests
import re
from lxml import etree
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import os
import time
import urllib.parse
def get_ip():
    # Free proxy site; you can swap in another one, but each site's
    # markup differs, so adjust the regip pattern accordingly
    url = 'https://www.kuaidaili.com/free/inha/'
    # How many pages to fetch is up to you, but keep it small
    url_list = [url + str(i + 1) for i in range(2)]
    print(url_list)
    ip_list = []
    for url in url_list:
        html = requests.get(url=url).text
        # Raw string so the backslash escapes reach the regex engine intact
        regip = r'<td.*?>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>.*?<td.*?>(\d{1,5})</td>'
        matcher = re.compile(regip, re.S)
        ipstr = matcher.findall(html)
        # Pause between pages to lower the chance of getting banned
        time.sleep(1)
        for j in ipstr:
            ip_list.append(j[0] + ':' + j[1])
    print('Collected %d proxy IPs in total' % len(ip_list))
    print(ip_list)
    return ip_list
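If the regex feels brittle, the lxml import above can do the same job with XPath. A minimal sketch, assuming the kuaidaili table cells carry data-title="IP" and data-title="PORT" attributes (inspect the page and adjust the XPath if the markup differs):

def get_ip_xpath(url='https://www.kuaidaili.com/free/inha/'):
    # Assumption: cells look like <td data-title="IP">...</td>; adjust per site
    html = requests.get(url).text
    tree = etree.HTML(html)
    ips = tree.xpath('//td[@data-title="IP"]/text()')
    ports = tree.xpath('//td[@data-title="PORT"]/text()')
    return [ip + ':' + port for ip, port in zip(ips, ports)]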
2. Verify that the proxies actually work (not every IP is valid):
def valVer(proxys):
    badNum = 0
    goodNum = 0
    good = []
    for proxy in proxys:
        try:
            # These free proxies are plain HTTP; requests needs the scheme prefix
            proxies = {'http': 'http://' + proxy, 'https': 'http://' + proxy}
            print('Now testing IP:', proxies)
            # Test against your target site if possible; tune the timeout as needed
            response = requests.get('https://www.manhuabei.com', proxies=proxies, timeout=4)
            if response.status_code != 200:
                badNum += 1
                print(proxy, 'bad proxy')
            else:
                goodNum += 1
                # Store the bare "ip:port" string so callers can build their own dict
                good.append(proxy)
                print(proxy, 'success proxy')
        except Exception as e:
            print(e)
            badNum += 1
            continue
    print('success proxy num : ', goodNum)
    print(good)
    return good
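Checking proxies one at a time with a 4-second timeout adds up quickly. If the list grows, a thread pool can test them in parallel; a minimal sketch (check_one and valVer_parallel are hypothetical helpers repeating the same test request, not part of the original script):

from concurrent.futures import ThreadPoolExecutor

def check_one(proxy):
    # Same test as in valVer, for a single "ip:port" string
    proxies = {'http': 'http://' + proxy, 'https': 'http://' + proxy}
    try:
        r = requests.get('https://www.manhuabei.com', proxies=proxies, timeout=4)
        return proxy if r.status_code == 200 else None
    except Exception:
        return None

def valVer_parallel(proxys, workers=10):
    with ThreadPoolExecutor(max_workers=workers) as pool:
        results = list(pool.map(check_one, proxys))
    return [p for p in results if p]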
3. Build random headers
With the proxies sorted out, headers are next. Fortunately Python has the fake_useragent library, which generates random User-Agent strings for us, so combined with the proxies above we get a random proxy + headers combination:
from fake_useragent import UserAgent
import requests
import random

# UserAgent().random returns a different User-Agent string on every call
headers = {"User-Agent": UserAgent().random}
print(headers)

t_list = get_ip()
proxies_list = valVer(t_list)
print(proxies_list)
# Pick a random working proxy; prepend the scheme that requests expects
proxies = {'http': 'http://' + random.choice(proxies_list)}
headers = {"User-Agent": UserAgent().random}
url = "https://www.manhuabei.com/manhua/nvzixueyuandenansheng/"
# Request with random proxy + headers + url
response = requests.get(url, proxies=proxies, headers=headers)
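Free proxies die off constantly, so even a validated one can fail mid-run. One way to harden this pattern is a small retry helper that re-rolls the proxy and the User-Agent on every attempt (fetch and retries=3 are hypothetical choices, not part of the original script):

def fetch(url, proxies_list, retries=3):
    # Try up to `retries` times, each with a fresh random proxy + User-Agent
    for _ in range(retries):
        proxies = {'http': 'http://' + random.choice(proxies_list)}
        headers = {"User-Agent": UserAgent().random}
        try:
            response = requests.get(url, proxies=proxies, headers=headers, timeout=5)
            if response.status_code == 200:
                return response
        except Exception as e:
            print(e)
    return None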
4. The main part: grabbing the comic
You need to download the WebDriver matching your browser first; a minimal setup sketch follows below.
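For the JS-rendered pages, Selenium drives a real browser. A minimal headless Chrome sketch, assuming chromedriver is on your PATH (Selenium 4 users can pass a Service object instead; webdriver, Options and time are already imported at the top):

chrome_options = Options()
chrome_options.add_argument('--headless')      # no visible browser window
chrome_options.add_argument('--disable-gpu')
# Assumes chromedriver is on PATH; otherwise pass its location explicitly
driver = webdriver.Chrome(options=chrome_options)
driver.get('https://www.manhuabei.com')
time.sleep(2)  # crude wait for the JS to render; WebDriverWait is more robust
html = driver.page_source
driver.quit()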
def downloadimg(commicurl, main_url):
    t_list = get_ip()
    proxies_list = valVer(t_list)
    print(proxies_list)
    # Random working proxy, with the scheme prefix requests expects
    proxies = {'http': 'http://' + random.choice(proxies_list)}
    # The Referer header makes the request look like it came from the comic page
    headers = {"User-Agent": UserAgent().random,
               'Referer': commicurl}
    url = commicurl
    response = requests.get(url, proxies=proxies, headers=headers)
    # Collect the URLs of all chapters
    html = response.text
    print(html)
    # Chapter links share the comic's path prefix; escape the dot before "html"
    rexstr = commicurl.replace(main_url, '') + r'\d.*?\.html'
    step1_url = re.findall(rexstr, html)
    print(len(step1_url))
    for list_url in step1_url:
        time.sleep(2)
        step2_url = 'https://www.manhua