import time
import requests
from bs4 import BeautifulSoup
from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
# HTTP request headers for requests.get(..., headers=hd).
# Bug fix: the original was a single "User-Agent: <value>" string, which
# requests cannot use — the `headers` argument must be a dict mapping
# header names to values, so the UA was never actually sent.
hd = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36"
    ),
}
# 根据单页文章列表网页链接 page_url 提取所有文章链接 art_url 及标题 arc_tiile
# page_url = 'https://2d.hep.com.cn/mobile/book/show/java2113?page=1' # 单页网页链接
# res_name = requests.get(page_url).text
# # print(res_name)
# soup_name = BeautifulSoup(res_name,'lxml')
# a_list = soup_name.select(".weui_cells_access a") # 找到所有符合条件的a标签,即每篇文章的链接
# for i in a_list:
# print(i.text)
# print("https://2d.hep.com.cn" + i["href"]) #1 单篇文章的链接
#
# 根据单篇文章地址 art_url 提取PDF每页图片地址 png_url 爬取框架iframe内动态加载的数据
art_url = 'https://2d.hep.com.cn/1252003/70' #1 URL of a single article; its embedded iframe/PDF pages are scraped by the (commented-out) code below
# driver = webdriver.Chrome()
# driver.get(art_url)
# iframe = driver.find_element_by_tag_name('iframe')
# driver.switch_to.frame(iframe)
# soup_iframe = BeautifulSoup(driver.page_source,'lxml') # 实例化框架iframe里的网页元素
# img_link = soup_iframe.select_one('img') # 找到单张图片链接的关键字
#
# # img_link = soup_iframe.select('img') # 找到所有文章中单张图片的链接
# # for i in img_link:
# # print(i['src'])
#
# print(img_link['src'].split('/'))
# print("https://node2d-public.hep.com.cn/" + (img_link['src'].split('/'))[1] + (img_link['src'].split('.',1))[1])
# total = soup_iframe.select_one('.totalPage').get_text().split('/')[1] # 获取本文章共有多少页数
# print(total)
# driver.close()
#
# # #根据PDF里的单张图片地址 png_url 下载图片内容,并保存
# for number in range(1,2):
# png_url = "https://node2d-public.hep.com.cn/cf78c968226721a4ec8234b8e7a7410d.pdf/cf78c968226721a4ec8234b8e7a7410d.pdf.files/" + str(number) + ".png"
#
# r = requests.get(png_url)
# page = r.content
# f = open("例1-8-1-" + str(number) +".png",'wb')
# f.write(page)
# f.close()
#
# 抓取PDF (scraped page title — not code; commented out so the file parses)
# 最新推荐文章于 2023-10-02 22:00:14 发布 (scraped blog metadata — not code)