# Notes:
# - selenium Options
# - selenium attribute access: get_attribute() returns an attribute value,
#   .text returns the element's (article) text
# - shape()
# - css selectors
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
import pandas as pd
options = Options()
# options.add_argument('-headless')
driver = Chrome(executable_path='./chromedriver.exe',options=options)
driver.get("http://news.hbue.edu.cn/jyyw/list1.htm")
driver.maximize_window()
def get_info(driver):
try:
News_title = driver.find_element_by_css_selector('h1.arti_title').text
except:
News_title = ""
try:
News_source = driver.find_element_by_css_selector('span.arti_ly').text
except:
News_source = ""
try:
News_pub = driver.find_element_by_css_selector('span.arti_publisher').text
except:
News_pub = ""
try:
News_date = driver.find_element_by_css_selector('span.arti_update').text
except:
News_date = ""
try:
News_content = driver.find_element_by_css_selector('div.wp_articlecontent').text
except:
News_content = ""
data = pd.DataFrame()
data['News_url'] = [driver.current_url]
data['News_title'] = [News_title]
data['News_source'] = [News_source]
data['News_pub'] = [News_pub]
data['News_date'] = [News_date]
data['News_content'] = [News_content]
return(data)
def get_list(driver):
urls = [x.get_attribute("href") for x in driver.find_elements_by_css_selector('span.Article_Title a')]
dates = [x.text for x in driver.find_elements_by_css_selector('span.Article_PublishDate')]
L = pd.DataFrame()
L['url'] = urls
L['date'] = dates
return L
# Get the next page; on the last page the "next" link's href contains
# 'javascript', which marks the end. Each page's URLs are appended to the
# accumulating DataFrame.
urls = pd.DataFrame(columns=('url','date'))
empty = False
while empty != True:
new_urls = get_list(driver)
urls = urls.append(new_urls,ignore_index = True)
if all([x.find('2020')==0 for x in new_urls['date']]):
url_next = driver.find_element_by_css_selector('a.next').get_attribute('href')
if url_next.find('javascript')==0:
empty = True
else:
driver.get(url_next)
else:
empty = True
data['News_source'] = [x.replace('来源:','') for x in data.News_source]
data['News_pub'] = [x.replace('发布者:','') for x in data.News_pub]
data['News_date'] = [x.replace('发布时间:','') for x in data.News_date]
data.to_csv('web2.csv')