A script for scraping comic sites that use JavaScript-based anti-scraping measures.
See the comments for details.
# coding=utf-8
import pdfkit
import requests
from urllib.request import urlretrieve
from bs4 import BeautifulSoup
import os,time,threading
from urllib.parse import urljoin
from selenium import webdriver
from selenium.webdriver.common.by import By  # locator strategies
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
# Parse the page and collect the chapter URLs and their titles
def parse_url_to_html(url, name, istart, iend):
    heads = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
    response = requests.get(url, headers=heads)
    print(response.status_code)
    soup = BeautifulSoup(response.content, "html.parser")
    mainpages = []   # all chapter URLs
    maintitles = []  # the corresponding chapter titles
    # The first tag with class "cy_plist" contains the chapter links and titles
    tag_main = soup.find(class_="cy_plist")
    for i in tag_main.find_all("li"):
        if i.a is None:  # skip list items without a link
            continue
        mainpages.append(i.a.get('href'))
        maintitles.append(i.a.get_text())
    # Reverse the lists: the page lists chapters in reverse order
    mainpages.reverse()
    maintitles.reverse()
    print("write begin++++++++++++++++>>>>>>>>>>>>>....")
    suffix = '_' + str(istart) + '-' + str(iend)
    # What we scraped is only a set of tags; an HTML skeleton would be needed to save it:
    # htmls = "<html><head><meta charset='UTF-8'></head><body> \n"  # + str(tag_main)
    # with open(name + suffix + ".html", 'w', encoding='utf-8') as f:
    #     f.write(htmls)
    # with open("stat.pic" + suffix, 'w', encoding='utf-8') as f:
    #     f.write("stats picture info \n")
    print(mainpages)
    return mainpages, maintitles
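
# The description above mentions JS-based anti-scraping, and the Selenium imports
# (webdriver, By, EC, WebDriverWait) are not used in this section. Below is a
# minimal sketch of how a JS-rendered page could be fetched with them;
# fetch_rendered_page and the choice of waiting for an <img> tag are illustrative
# assumptions, not part of the original script.
def fetch_rendered_page(url, timeout=10):
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        # Block until the page's JS has rendered at least one <img> element
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.TAG_NAME, "img")))
        return driver.page_source
    finally:
        driver.quit()
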
# Download the images of a chapter (function continues below)
def downlaodImage(url, maintitles, chapter, istart, iend):
    heads = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}