用selenium和pandas获取新闻

selenium option

selenium 获取属性
获取元素的属性值用 get_attribute(),获取元素文本用 .text
pandas DataFrame 用 shape 属性查看行列数
用 CSS 选择器定位元素

from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
import pandas as pd

# Launch a Chrome session driven by the chromedriver binary next to this
# script, then open the first page of the news list.
options = Options()
# Left commented out on purpose: uncomment to run Chrome without a window.
# options.add_argument('-headless')
driver = Chrome(executable_path='./chromedriver.exe',options=options)
driver.get("http://news.hbue.edu.cn/jyyw/list1.htm")
# Maximize so the page renders its full desktop layout before scraping.
driver.maximize_window()
def get_info(driver):
    """Scrape the article page currently shown by *driver* into a 1-row DataFrame.

    Parameters
    ----------
    driver : a Selenium WebDriver positioned on a single news-article page.

    Returns
    -------
    pandas.DataFrame with one row and columns News_url, News_title,
    News_source, News_pub, News_date, News_content.  Any element that is
    missing on the page yields an empty string instead of raising, so the
    crawl keeps going on malformed pages.
    """
    def _text(selector):
        # Best-effort text extraction for one CSS selector.
        try:
            return driver.find_element_by_css_selector(selector).text
        except Exception:  # narrowed from bare except; still deliberately best-effort
            return ""

    data = pd.DataFrame()
    data['News_url'] = [driver.current_url]
    data['News_title'] = [_text('h1.arti_title')]
    data['News_source'] = [_text('span.arti_ly')]
    data['News_pub'] = [_text('span.arti_publisher')]
    data['News_date'] = [_text('span.arti_update')]
    data['News_content'] = [_text('div.wp_articlecontent')]
    return data
    def get_list(driver):
    urls = [x.get_attribute("href") for x in driver.find_elements_by_css_selector('span.Article_Title a')]
    dates = [x.text for x in driver.find_elements_by_css_selector('span.Article_PublishDate')]
    L = pd.DataFrame()
    L['url'] = urls
    L['date'] = dates
    return L

获取下一页,如果是最后一页,属性之中有javascript,用append将下一页的url加入dataframe

# Crawl the list pages: gather article urls/dates, follow the "next" link
# until the last page (whose href is a javascript: pseudo-link) or until a
# page contains a non-2020 date.
urls = pd.DataFrame(columns=('url', 'date'))
done = False
while not done:
    new_urls = get_list(driver)
    # DataFrame.append was removed in pandas 2.0; concat is the supported way.
    urls = pd.concat([urls, new_urls], ignore_index=True)
    if all(x.find('2020') == 0 for x in new_urls['date']):
        url_next = driver.find_element_by_css_selector('a.next').get_attribute('href')
        if url_next.find('javascript') == 0:
            # Last page: the "next" anchor degrades to a javascript: href.
            done = True
        else:
            driver.get(url_next)
    else:
        # A date outside 2020 appeared, so everything newer is collected.
        done = True

# Visit every collected article and build the result table.  Bug fix: the
# original script used `data` below without ever defining it (NameError) —
# this loop is the missing step that fills it via get_info().
data = pd.DataFrame()
for url in urls['url']:
    driver.get(url)
    data = pd.concat([data, get_info(driver)], ignore_index=True)

# Strip the Chinese field labels ("来源:", "发布者:", "发布时间:"),
# keeping only the values, then dump everything to CSV.
data['News_source'] = [x.replace('来源:', '') for x in data.News_source]
data['News_pub'] = [x.replace('发布者:', '') for x in data.News_pub]
data['News_date'] = [x.replace('发布时间:', '') for x in data.News_date]
data.to_csv('web2.csv')
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值