import time
import requests
from bs4 import BeautifulSoup
from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
# HTTP request headers for requests.get(..., headers=hd).
# Bug fix: the original was a single "User-Agent: <value>" string, which
# requests cannot use — the `headers` argument must be a dict mapping
# header names to values, so the UA was never actually sent.
hd = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36"
    ),
}
# 根据单页文章列表网页链接 page_url 提取所有文章链接 art_url 及标题 arc_tiile
# page_url = 'https://2d.hep.com.cn/mobile/book/show/java2113?page=1' # 单页网页链接
# res_name = requests.get(page_url).text
# # print(res_name)
# soup_name = BeautifulSoup(res_name,'lxml')
# a_list = soup_name.select(".weui_cells_access a") # 找到所有符合条件的a标签,即每篇文章的链接
# for i in a_list:
# print(i.text)
# print("https://2d.hep.com.cn" + i["href"]) #1 单篇文章的链接
#
# 根据单篇文章地址 art_url 提取PDF每页图片地址 png_url 爬取框架iframe内动态加载的数据
art_url = 'https://2d.hep.com.cn/1252003/70' #1 URL of a single article; its embedded iframe/PDF pages are scraped by the (commented-out) code below
# driver = webdriver.Chrome()
# driver.get(art_url)
# iframe = driver.find_element_by_tag_name('iframe')
# driver.switch_to.frame(iframe)
# soup_iframe = BeautifulSoup(driver.page_source,'lxml') # 实例化框架iframe里的网页元素
# img_link = soup_iframe.select_one('img') # 找到单张图片链接的关键字
#
# # img_link = soup_iframe.select('img') # 找到所有文章中单张图片的链接
# # for i in img_link:
# # print(i['src'])
#
# print(img_link['src'].split('/'))
# print("https://node2d-public.hep.com.cn/" + (img_link['src'].split('/'))[1] + (img_link['src'].split('.',1))[1])
# total = soup_iframe.select_one('.totalPage').get_text().split('/')[1] # 获取本文章共有多少页数
# print(total)
# driver.close()
#
# # #根据PDF里的单张图片地址 png_url 下载图片内容,并保存
# for number in range(1,2):
# png_url = "https://node2d-public.hep.com.cn/cf78c968226721a4ec8234b8e7a7410d.pdf/cf78c968226721a4ec8234b8e7a7410d.pdf.files/" + str(number) + ".png"
#
# r = requests.get(png_url)
# page = r.content
# f = open("例1-8-1-" + str(number) +".png",'wb')
# f.write(page)
# f.close()
#
# 抓取PDF (scraped page title — not code; commented out so the file parses)
# 最新推荐文章于 2023-10-02 22:00:14 发布 (scraped blog metadata — not code)