import re
import img2pdf
from selenium import webdriver
import os
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from lxml import etree
import requests
class File_360(object):
    """Download the page images of a 360 Wenku document and merge them into a PDF.

    Constructing an instance starts a Chrome session and loads ``self.url``.
    ``main`` then asks for an output directory, saves every page image into it
    as ``<n><ext>`` and finally writes ``testpdf1.pdf`` next to the images.
    """

    # Seconds to wait for a single image download before giving up.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        """Start Chrome, open the document page and reset the page counter."""
        self.url = "https://wenku.so.com/d/6f252fb8971abfce4474d919106c4678?src=ob_zz_juhe360wenku"
        self.driver = webdriver.Chrome()
        self.driver.get(self.url)
        # Sequence number used as the file name of the next saved image.
        self.number = 1

    @staticmethod
    def _image_extension(url):
        """Return the extension of the path part of *url* (e.g. ``".jpg"``).

        Query string and fragment are ignored, so ``a.png?sign=x.y`` yields
        ``".png"`` rather than ``".y"``. Returns ``""`` when there is none.
        """
        from urllib.parse import urlsplit

        return os.path.splitext(urlsplit(url).path)[-1]

    @staticmethod
    def _numbered_files(filenames):
        """Map page number -> file name for names that start with an integer stem.

        Names whose part before the first ``.`` is not an integer (such as a
        ``testpdf1.pdf`` left over from an earlier run) are skipped instead of
        raising ``ValueError``.
        """
        numbered = {}
        for filename in filenames:
            try:
                numbered[int(filename.split('.')[0])] = filename
            except ValueError:
                continue
        return numbered

    def get_image_url(self, name):
        """Locate the page images in the rendered page and save each one.

        Args:
            name: Directory that receives the downloaded images.
        """
        from urllib.parse import urljoin

        html = etree.HTML(self.driver.page_source)
        containers = html.xpath('//*[@id="app"]/div[1]/div[2]/div[1]/div[1]/div[2]/div[2]//div')
        for container in containers:
            sources = container.xpath('./div[2]/img/@src')
            if not sources:
                # This <div> does not wrap an image; skip it.
                continue
            # Resolve relative and protocol-relative ("//host/...") sources
            # against the page URL; absolute URLs come back unchanged.
            url = urljoin(self.url, sources[0])
            title = str(self.number) + self._image_extension(url)
            self.number += 1
            self.image_url_save(url, name, title)

    def image_url_save(self, url, name, title):
        """Download *url* and write it to ``name/title``.

        The request is completed and checked before the file is opened, so a
        failed download never leaves an empty or bogus image file behind.

        Args:
            url: Absolute URL of the image.
            name: Target directory.
            title: File name to use inside *name*.

        Raises:
            requests.RequestException: On connection errors, timeouts or a
                non-success HTTP status.
        """
        resp = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        resp.raise_for_status()
        with open(os.path.join(name, title), "wb") as file:
            file.write(resp.content)
        print(f"{title}数据保存成功")

    def image_change(self, name):
        """Merge the numbered images in directory *name* into ``testpdf1.pdf``.

        Images are ordered by their numeric file name so the PDF pages keep
        the document order. Files without a numeric name are ignored.

        Args:
            name: Directory holding the downloaded images; also receives the PDF.
        """
        files = os.listdir(name)
        print(files)
        # Sort numerically so that e.g. "10.jpg" comes after "2.jpg".
        filedict = self._numbered_files(files)
        print(filedict)
        files = [os.path.join(name, filedict[i]) for i in sorted(filedict)]
        print(files)
        if not files:
            # Nothing to merge; avoid creating an empty PDF file.
            print("未找到可合并的图片")
            return
        # Convert first so a conversion failure does not leave a truncated PDF.
        pdf_bytes = img2pdf.convert(files)
        with open(os.path.join(name, 'testpdf1.pdf'), mode='wb') as f:
            f.write(pdf_bytes)

    def main(self):
        """Prompt for an output directory, download the images and build the PDF."""
        name = input("请输入你需要存储文件的名字:")
        os.makedirs(name, exist_ok=True)
        try:
            self.get_image_url(name)
        finally:
            # The browser is only needed to read the page source; always
            # release it, even when a download fails.
            self.driver.quit()
        # Pause kept from the original flow before merging.
        time.sleep(2)
        self.image_change(name)
if __name__ == '__main__':
    # Script entry point: open the document page, then download and merge it.
    File_360().main()
# Scraping 360 Wenku document data with Selenium
# (Source page note: latest recommended article published 2024-07-09 11:31:50)