Python抓取某文章内容并保存为Word格式

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from lxml import etree
from docx import Document
from docx.shared import Pt
from docx.shared import Inches
from docx.oxml.ns import qn
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
import os
import re
import shutil
import requests
# Work inside the archive root; every subject later gets its own sub-folder here.
os.chdir(r'E:\文库\重要讲话数据库')

url = 'http://jhsjk.people.cn/'
browser = webdriver.Chrome()
# Maps subject title (the <img title="..."> of each button) -> subject list URL.
subs = {}
try:
    browser.get(url)
    wait = WebDriverWait(browser, 10)
    # The subject buttons sit in two bars that differ only in their class
    # attribute, so both are scraped with the same loop.
    for bar_class in ('w1000 btn2 clearfix', 'w1000 btn clearfix'):
        links = wait.until(EC.presence_of_all_elements_located(
            (By.XPATH, "//body/div[@class='%s']/a" % bar_class)))
        for link in links:
            subject = link.find_element(By.XPATH, './img').get_attribute('title')
            subs[subject] = link.get_attribute('href')
finally:
    # quit() ends the whole driver session; close() only closes the window
    # and was skipped entirely when a wait above timed out.
    browser.quit()
print(os.getcwd())

def get_detail_url():
    """Collect the article links listed on the currently opened subject page.

    Reads the module-level ``wait`` (a ``WebDriverWait`` bound to the open
    list page) and fills the module-level ``contents`` dict with
    ``{file_stem: article_url}`` entries.

    The file stem is the date text of the list item followed by the link
    title with every non-word character removed.  If that cannot be built,
    the raw text of the list item is used as the key instead.  Items whose
    link cannot be read are skipped.
    """
    items = wait.until(EC.presence_of_all_elements_located(
        (By.CSS_SELECTOR, 'body > div.w1000.d2list.clearfix > div.fr > ul > li')))
    for item in items:
        # Fallback key in case the date/title extraction below fails.
        name = item.text
        print(name)
        try:
            html = etree.HTML(item.get_attribute('outerHTML'))
            # The second text node of the <li> is taken as the date.
            date = str(html.xpath('.//text()')[1]).strip()
            title_name = item.find_element(By.XPATH, './a').text
            # Strip everything that is not a word character so the title is
            # safe to use as part of a file name.
            title_name = re.sub(r'\W+', '', title_name)
            name = date + title_name
            print(name)
        except Exception:
            # Best effort: keep the raw item text as the key.
            print('获取标题失败')
        try:
            url_detail = item.find_element(By.XPATH, './a').get_attribute('href')
        except Exception:
            print('获取链接失败')
            # Without a link there is nothing to download; skipping avoids
            # storing an undefined or stale URL from a previous item.
            continue
        print(url_detail)
        contents[name] = url_detail

def _wait_for_text(wait, css_selector, missing_message):
    """Return the text of the first element matching *css_selector*.

    Prints the text when found.  If the element does not appear before the
    wait times out, prints *missing_message* and returns ``None``.
    """
    try:
        element = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, css_selector)))
        text = element.text
    except TimeoutException:
        print(missing_message)
        return None
    print(text)
    return text


def _add_remote_picture(document, img_url):
    """Download *img_url* and embed it in *document*, 6 inches wide."""
    from io import BytesIO

    response = requests.get(img_url, timeout=30)
    response.raise_for_status()
    # add_picture accepts a file-like object, so no temporary file is needed.
    document.add_picture(BytesIO(response.content), width=Inches(6))


def _build_document(wait):
    """Build a Word document from the article page that *wait* is bound to."""
    document = Document()
    normal = document.styles['Normal']
    normal.font.name = u'仿宋_GB2312'
    normal.font.size = Pt(16)
    # The East Asian font has to be set on the underlying XML element.
    normal._element.rPr.rFonts.set(qn('w:eastAsia'), u'仿宋_GB2312')

    h1 = _wait_for_text(wait, 'body > div.d2txt.clearfix > h1', '无相关一级标题')
    if h1 is not None:
        heading = document.add_heading('', level=3)
        run = heading.add_run(h1)
        run.font.name = u'方正小标宋简体'
        run.font.size = Pt(25)
        run._element.rPr.rFonts.set(qn('w:eastAsia'), u'方正小标宋简体')
        # Alignment belongs to the paragraph; assigning it on the run only
        # created an unused attribute and left the title un-centred.
        heading.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER

    h3 = _wait_for_text(wait, 'body > div.d2txt.clearfix > h3', '无相关三级标题')
    if h3 is not None:
        document.add_heading(h3, level=1)

    h4 = _wait_for_text(
        wait, 'body > div.d2txt.clearfix > div.d2txt_1.clearfix', '无相关四级标题')
    if h4 is not None:
        document.add_heading(h4, level=2)

    try:
        paragraphs = wait.until(EC.presence_of_all_elements_located(
            (By.CSS_SELECTOR,
             'body > div.d2txt.clearfix > div.d2txt_con.clearfix p')))
    except TimeoutException:
        print('无相关正文')
        paragraphs = []
    for paragraph in paragraphs:
        # find_elements returns an empty list when the paragraph has no
        # image, so no exception handling is needed for that case.
        for img in paragraph.find_elements(By.XPATH, './img'):
            img_url = str(img.get_attribute('src') or '').strip()
            if not img_url:
                continue
            try:
                _add_remote_picture(document, img_url)
            except Exception:
                # Best effort: a broken image must not lose the article text.
                print('图片获取失败')
        body = document.add_paragraph(paragraph.text)
        body.paragraph_format.first_line_indent = Inches(0.5)

    editor = _wait_for_text(
        wait, 'body > div.d2txt.clearfix > div.editor.clearfix', '无相关编辑信息')
    if editor is not None:
        document.add_paragraph(editor)
    return document


def get_paper(contents):
    """Save every article in *contents* as ``<key>.docx`` in the current directory.

    Args:
        contents: dict mapping a file stem to the URL of the article page.

    Articles whose ``.docx`` file already exists are skipped.  Each remaining
    article is opened in its own headless Chrome session.  If a session
    cannot be started or the page cannot be loaded, processing stops for the
    remaining articles.
    """
    for key, value in contents.items():
        filename = key + '.docx'
        if os.path.exists(filename):
            print('文件已存在')
            continue

        option = webdriver.ChromeOptions()
        option.add_argument('headless')
        browser = None
        try:
            try:
                browser = webdriver.Chrome(options=option)
                browser.get(value)
            except Exception:
                print('打开浏览器超时')
                break
            document = _build_document(WebDriverWait(browser, 10))
            try:
                document.save(filename)
            except OSError:
                print('文件名称无法保存')
        finally:
            # Always end the driver session, including on the break above.
            if browser is not None:
                browser.quit()
# Overview of every subject that is about to be processed.
for key, value in subs.items():
    print(key)
    print(value)

# One folder per subject: list its articles, then save each one as .docx.
for key, value in subs.items():
    print(key)
    print(value)
    os.chdir(r'E:\文库\重要讲话数据库')
    os.makedirs(key, exist_ok=True)
    os.chdir(key)
    # get_detail_url() reads the module-level `wait` and fills the
    # module-level `contents`, so these names must stay global.
    contents = {}
    browser = webdriver.Chrome()
    try:
        browser.get(value)
        wait = WebDriverWait(browser, 10)
        get_detail_url()
    finally:
        # End the driver session even if the list page could not be read.
        browser.quit()
    get_paper(contents)

# Flatten every downloaded file into a single summary folder.
path = r'E:\文库\重要讲话数据库'
# Raw string: the backslashes are path separators, not escape sequences.
target = r'E:\文库\重要讲话数据库\汇总'
os.makedirs(target, exist_ok=True)
os.chdir(target)
for root, dirs, files in os.walk(path):
    # The summary folder lives inside `path`; prune it so the walk does not
    # visit the files that were just copied there.
    dirs[:] = [d for d in dirs if os.path.join(root, d) != target]
    for filename in files:
        file_name = os.path.join(root, filename)
        target_name = os.path.join(target, filename)
        # Never overwrite: when two subjects contain a file with the same
        # name, the first one encountered is kept.
        if not os.path.exists(target_name):
            shutil.copyfile(file_name, target_name)

  • 2
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值