from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from lxml import etree
from docx import Document
from docx.shared import Pt
from docx.shared import Inches
from docx.oxml.ns import qn
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
import os
import re
import shutil
import requests
# Switch the process working directory to the archive folder.
os.chdir(r'E:\文库\重要讲话数据库')

# Open the site's landing page in a Chrome session.
url = 'http://jhsjk.people.cn/'
browser = webdriver.Chrome()
browser.get(url)
wait = WebDriverWait(browser, 10)  # explicit waits time out after 10 seconds

# subs maps the title attribute of each link's <img> child to the link's href.
subs = {}
lis = wait.until(EC.presence_of_all_elements_located(
    (By.XPATH, "//body/div[@class='w1000 btn2 clearfix']/a")))
for li in lis:
    # find_element(By.XPATH, ...) replaces the legacy find_element_by_xpath
    # helper, which is no longer available in current Selenium releases.
    subject = li.find_element(By.XPATH, './img').get_attribute('title')
    # NOTE: this rebinds the module-level `url` on every iteration, exactly
    # as the original script did.
    url = li.get_attribute('href')
    subs[subject] = url

# Second group of links: same lookup, but for the 'btn' (not 'btn2') container.
lis = wait.until(EC.presence_of_all_elements_located(
    (By.XPATH, "//body/div[@class='w1000 btn clearfix']/a")))
for li in lis:
#print(li.find_element_by_xpath(’