# Baidu Wenku document downloader.
# (Blog preamble, translated: "Python — download from Baidu Wenku; inline code snippets shown below.")
import requests
from selenium import webdriver
from lxml import etree
import re
from selenium.webdriver.common.keys import Keys
import time
from PIL import Image
import os
from bs4 import BeautifulSoup
import bs4
from docx import Document
import sys
def getHTMLText(url):
    """Fetch *url* and return the response body as text.

    Sends a Googlebot User-Agent so Baidu Wenku serves the full page
    content. Returns '' on any network/HTTP failure instead of raising,
    which callers rely on.
    """
    header = {'User-agent': 'Googlebot'}
    try:
        r = requests.get(url, headers=header, timeout=30)
        r.raise_for_status()
        # Wenku pages are GBK-encoded; requests' default guess would mojibake them.
        r.encoding = 'gbk'
        return r.text
    except requests.RequestException:
        # Narrowed from a bare `except:` so programming errors still surface.
        return ''
def parse_type(content):
    """Extract the Wenku document type (e.g. 'doc', 'pdf', 'ppt') from raw page HTML.

    Matches the first  docType ... : ... '<type>',  occurrence in the page
    source. Raises IndexError when no match is found (main() treats any
    exception as a failed download).
    """
    # ':' , "'" and ',' are not regex metacharacters; the original's
    # backslash escapes were redundant and obscured the pattern.
    return re.findall(r"docType.*?:.*?'(.*?)',", content)[0]
def parse_txt(html):
    """Parse a txt-type Wenku page and return its text as a list of lines.

    The first element is the page title; the remainder is the document body
    pulled from the 'bd doc-reader' container, split on newlines with
    spaces and form-feed characters stripped.
    """
    soup = BeautifulSoup(html, "html.parser")
    # soup.title (or its string) can be None on a malformed page; fall back
    # to '' rather than crashing in the cleanup pass below (None.replace).
    title = soup.title.string if soup.title and soup.title.string else ''
    plist = [title]
    for div in soup.find_all('div', attrs={"class": "bd doc-reader"}):
        plist.extend(div.get_text().split('\n'))
    # Single cleanup pass instead of two list rebuilds.
    return [c.replace(' ', '').replace('\x0c', '') for c in plist]
def print_docx(plist, filename):
    """Write the lines in *plist* to <filename>.txt, then mirror that text into <filename>.docx."""
    # Context manager guarantees the .txt file is flushed and closed before
    # it is re-read below (the original used open()/close() by hand).
    with open(filename + '.txt', 'w', encoding='utf-8') as file:
        for line in plist:  # renamed: the original loop variable shadowed builtin `str`
            file.write(line)
            file.write('\n')
    with open(filename + '.txt', encoding='utf-8') as f:
        docu = Document()
        docu.add_paragraph(f.read())
        docu.save(filename + '.docx')
def parse_doc(url, folderPath):
    """Scrape a doc/pdf-type Wenku page, saving each rendered page as a JPG in *folderPath*.

    Drives Chrome through the paged reader: clicks '继续阅读' (continue
    reading) to unlock the remaining pages, then steps through the page
    numbers, pulling each page image URL out of the DOM and downloading it.
    """
    driver = webdriver.Chrome(r'./src/chromedriver.exe')
    driver.get(url)
    # Locate the 'continue reading' button:
    # <span class="moreBtn goBtn"><span>还剩35页未读,</span><span class="fc2e">继续阅读</span></span>
    button = driver.find_element_by_xpath("//*[@id='html-reader-go-more']/div[2]/div[1]/span")
    # Click via JS — a native .click() can be intercepted by overlays.
    driver.execute_script("arguments[0].click();", button)
    time.sleep(1)
    # Total page count lives in <span class="page-count">/N</span>.
    source = re.compile(r'<span class="page-count">/(.*?)</span>')
    number = int(source.findall(driver.page_source)[0])
    time.sleep(1)
    for i in range(2, number):
        # Jump to page i by typing into the reader's page-number input.
        driver.find_element_by_class_name("page-input").clear()
        driver.find_element_by_class_name("page-input").send_keys(f'{i}')
        driver.find_element_by_class_name("page-input").send_keys(Keys.ENTER)
        time.sleep(1)
        html = etree.HTML(driver.page_source)
        # Page images are carried as CSS background-image styles on reader-pic-item divs.
        links = html.xpath("//div[@class='reader-pic-item']/@style")
        part = re.compile(r'url[(](.*?)[)]')
        z = part.findall("".join(links))
        if i == 2:
            # First iteration: pages 1-3 are already rendered; grab all three.
            for m in range(3):
                pic = requests.get(z[m]).content
                # BUG FIX: save into folderPath (was hardcoded './照片/'),
                # otherwise print_pdf() finds no images in the folder main() created.
                with open(folderPath + f'/{m + 1}.jpg', 'wb') as f:
                    f.write(pic)
        else:
            # Assumes the third extracted URL is the freshly-rendered current
            # page — mirrors the original logic; TODO confirm against live DOM.
            pic = requests.get(z[2]).content
            with open(folderPath + f'/{i + 1}.jpg', 'wb') as f:
                f.write(pic)
        time.sleep(1)
    driver.quit()
def parse_other(url, folderPath):
    """Scrape a ppt-type Wenku page, saving each slide image into *folderPath*.

    Same Selenium flow as parse_doc(), but slide images are plain <img>
    elements rather than CSS background-image styles.
    """
    driver = webdriver.Chrome(r'./src/chromedriver.exe')
    driver.get(url)
    # Locate the 'continue reading' button:
    # <span class="moreBtn goBtn"><span>还剩35页未读,</span><span class="fc2e">继续阅读</span></span>
    button = driver.find_element_by_xpath("//*[@id='html-reader-go-more']/div[2]/div[1]/span")
    # Click via JS — a native .click() can be intercepted by overlays.
    driver.execute_script("arguments[0].click();", button)
    time.sleep(1)
    # Total page count: <span class="page-count">/N</span>
    source = re.compile(r'<span class="page-count">/(.*?)</span>')
    number = int(source.findall(driver.page_source)[0])
    time.sleep(1)
    # Step through pages, downloading the rendered slide images.
    for i in range(2,number):
        driver.find_element_by_class_name("page-input").clear()
        driver.find_element_by_class_name("page-input").send_keys(f'{i}')
        driver.find_element_by_class_name("page-input").send_keys(Keys.ENTER)
        time.sleep(1)
        html=etree.HTML(driver.page_source)
        # Slide images are <img> tags inside div.ppt-image-wrap
        # (cf. parse_doc, which reads "//div[@class='reader-pic-item']/@style").
        z=html.xpath('//div[@class="ppt-image-wrap"]/img/@src')
        # Save the images.
        if i == 2:
            # First iteration: slides 1-3 are already rendered; fetch all three.
            for m in range(3):
                pic = requests.get(z[m]).content
                with open(folderPath + f'/{m + 1}.jpg','wb') as f:
                    f.write(pic)
                f.close()  # redundant: the with-block already closed f
        else:
            # NOTE(review): indexes the src list by page number — presumably the
            # DOM accumulates one <img> per visited slide; confirm against the
            # live page (parse_doc uses a fixed index z[2] at this point instead).
            pic = requests.get(z[i]).content
            with open(folderPath + f'/{i + 1}.jpg','wb') as f:
                f.write(pic)
            f.close()  # redundant: the with-block already closed f
        time.sleep(1)
    driver.quit()
def print_pdf(folderPath, filename):
    """Combine the numbered JPGs in *folderPath* into a single <filename>.pdf.

    Files are sorted by their numeric stem (1.jpg, 2.jpg, ...) so pages come
    out in document order regardless of directory-listing order.
    """
    # Collect the numeric stems of all .jpg files and sort them as integers
    # (lexicographic order would put 10.jpg before 2.jpg).
    page_numbers = []
    for entry in os.listdir(folderPath):
        if 'jpg' in entry:
            page_numbers.append(int(entry.split('.')[0]))
    page_numbers.sort()
    jpgFiles = [folderPath + '/' + str(n) + '.jpg' for n in page_numbers]
    # First image is the base document; the rest are appended as extra pages.
    output = Image.open(jpgFiles[0])
    sources = []
    for file in jpgFiles[1:]:
        img = Image.open(file).convert("P")
        sources.append(img)
    # BUG FIX: the output name was the hardcoded placeholder "(unknown).pdf"
    # and the filename parameter was ignored; save to the requested name.
    output.save(filename + '.pdf', "PDF", save_all=True, append_images=sources)
from datetime import datetime
def main(url, istxt):
    """Download the Wenku document at *url*.

    istxt == "1" forces text extraction (.txt + .docx). Otherwise the
    detected document type decides the pipeline: txt -> docx; doc/pdf ->
    page images -> pdf; anything else (e.g. ppt) -> slide images -> pdf.

    Prints '1' on success and '0' on any failure — consumed by the caller
    when the script is driven via sys.argv.
    """
    try:
        # Date-stamped paths keep repeated runs from clobbering each other.
        # (The original also computed time.time()-based names and immediately
        # overwrote them; that dead code is removed.)
        ticks = datetime.now().strftime('%F')
        filepath = os.getcwd()+'./照片' + str(ticks)  # folder for scraped page images
        filename = os.getcwd()+'./爬取结果' + str(ticks)  # stem of the generated output file
        if not os.path.exists(filepath):  # create the image folder on first run
            os.mkdir(filepath)
        html = getHTMLText(url)  # fetch page source with requests
        doc_type = parse_type(html)  # 'doc', 'pdf', 'ppt', ... (renamed: was shadowing builtin `type`)
        if istxt == "1":
            # Caller asked for plain-text extraction regardless of real type.
            doc_type = 'txt'
        if doc_type == 'txt':
            plist = parse_txt(html)
            print_docx(plist, filename)
        elif doc_type in ('doc', 'pdf'):
            parse_doc(url, filepath)
            print_pdf(filepath, filename)
        else:
            parse_other(url, filepath)
            print_pdf(filepath, filename)
        print('1')
    except Exception:
        # Narrowed from a bare `except:`; failure is reported as '0' on
        # stdout rather than a traceback.
        print('0')
if __name__ == '__main__':
    # To drive from the command line instead: main(sys.argv[1], sys.argv[2])
    main(
        'https://wenku.baidu.com/view/b659fbd0e55c3b3567ec102de2bd960590c6d981.html?fr=search',
        "1",  # force plain-text extraction
    )