Python: Scraping all articles from e21

import pandas as pds
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

browser = webdriver.Chrome()  # launch Chrome through its WebDriver
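# Assumption, not part of the original script: Chrome could also be started
# headless so the scrape runs without opening a visible window. Sketch:
#     from selenium.webdriver.chrome.options import Options
#     opts = Options()
#     opts.add_argument("--headless")
#     browser = webdriver.Chrome(options=opts)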

def enter(url, element):
    """Open url and wait up to 3 seconds for the element located by the XPath to appear."""
    wait = WebDriverWait(browser, 3)
    try:
        browser.get(url)
        wait.until(
            EC.presence_of_element_located((By.XPATH, element)),
        )
    except TimeoutException:
        result = "At " + url + "\n" + "failed to locate " + element
        print(result)



def get_detail(element):
    """Return the visible text of the first element matching the XPath."""
    node = browser.find_element(By.XPATH, element)
    return node.text


def get_element_attribute(element, attribute):
    """Return the value of the given attribute on the first element matching the XPath."""
    node = browser.find_element(By.XPATH, element)
    return node.get_attribute(attribute)

def get_ele_num(element):
    """Return how many elements match the XPath."""
    elements = browser.find_elements(By.XPATH, element)
    return len(elements)

def get_one_url(urls, titles, num):
    """Collect the link and title of each article row on the current listing page."""
    for i in range(1, num):
        # XPath row indices are 1-based; each row's second cell holds the article link.
        element = "/html/body/table/tbody/tr/td/table[3]/tbody/tr/td[1]/table[3]/tbody/tr[" + str(i) + "]/td[2]/a"
        href = get_element_attribute(element, "href")
        urls.append(href)
        title = get_detail(element)
        titles.append(title)
    return urls, titles



def main():
    urls = []
    titles = []
    ele = '//td[@align="left"]'
    # Walk the 23 listing pages; the page index is embedded in the URL.
    for i in range(23):
        url = "http://zsxx.e21.cn/e21html/zhaosheng/listihszbkxth1p" + str(i) + ".html"
        enter(url, ele)
        num = get_ele_num(ele)
        urls, titles = get_one_url(urls, titles, num)

    # Append the collected links and titles to two CSV files.
    result = pds.DataFrame(data=urls)
    result.to_csv('C:/Users/Administrator/Desktop/urls.csv', sep=',', mode='a', header=False, index=False)
    result = pds.DataFrame(data=titles)
    result.to_csv('C:/Users/Administrator/Desktop/titles.csv', sep=',', mode='a', header=False, index=False)

    # Close the browser once all pages have been scraped.
    browser.quit()

if __name__ == "__main__":
    main()
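The links and titles end up in two separate files, so the pairing between a title and its URL is easy to lose. A minimal alternative sketch, assuming urls and titles stay index-aligned; the save_articles name and the articles.csv path are illustrative and not part of the original script:

import pandas as pds

def save_articles(urls, titles, path='C:/Users/Administrator/Desktop/articles.csv'):
    # One row per article: the title next to its link.
    df = pds.DataFrame({'title': titles, 'url': urls})
    # utf-8-sig keeps the Chinese titles readable when the file is opened in Excel.
    df.to_csv(path, sep=',', index=False, encoding='utf-8-sig')

Called as save_articles(urls, titles) at the end of main(), this would replace the two to_csv calls above.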