import pandas as pds
import urllib.request
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
browser = webdriver.Chrome()  # launch a Chrome browser session (shared by all helpers below)
def enter(url, element):
    """Navigate the shared browser to *url* and wait (up to 3 s) until the
    element located by the XPath *element* is present in the DOM.

    On timeout, prints a diagnostic message instead of raising.
    """
    waiter = WebDriverWait(browser, 3)
    try:
        browser.get(url)
        waiter.until(EC.presence_of_element_located((By.XPATH, element)))
    except TimeoutException:
        print("在" + url + '\n' + '未定位到' + element)
def get_detail(element):
    """Return the visible text of the first element matching the XPath *element*.

    Fixed: ``find_element_by_xpath`` was deprecated in Selenium 3 and removed
    in Selenium 4; use the ``find_element(By.XPATH, ...)`` API instead
    (``By`` is already imported at the top of the file).
    """
    return browser.find_element(By.XPATH, element).text
def get_element_attribute(element, attribute):
    """Return the value of *attribute* on the first element matching the
    XPath *element* (e.g. the ``href`` of a link).

    Fixed: ``find_element_by_xpath`` was removed in Selenium 4; use
    ``find_element(By.XPATH, ...)`` instead.
    """
    return browser.find_element(By.XPATH, element).get_attribute(attribute)
def get_ele_num(element):
    """Return the number of elements on the current page matching the XPath
    *element*.

    Fixed: ``find_elements_by_xpath`` was removed in Selenium 4; use
    ``find_elements(By.XPATH, ...)``.  Also counts the matches directly with
    ``len`` instead of copying every element's text into a throwaway list.
    """
    return len(browser.find_elements(By.XPATH, element))
def get_one_url(urls, titles, num):
    """Collect the link href and title of each of the *num* article rows on
    the currently loaded listing page, appending them to *urls* and *titles*.

    Returns the (mutated) ``urls, titles`` pair so callers can rebind them.

    Fixed off-by-one: XPath ``tr[...]`` indices are 1-based, so the rows are
    ``tr[1] .. tr[num]``; the original ``range(1, num)`` stopped at
    ``num - 1`` and silently dropped the last article on every page.
    """
    for row in range(1, num + 1):
        element = "/html/body/table/tbody/tr/td/table[3]/tbody/tr/td[1]/table[3]/tbody/tr[" + str(row) + "]/td[2]/a"
        urls.append(get_element_attribute(element, "href"))
        titles.append(get_detail(element))
    return urls, titles
def main():
    """Walk the 23 e21 listing pages, harvest every article link and title,
    and append them to two CSV files on the desktop.
    """
    urls = []
    titles = []
    row_xpath = '//td[@align="left"]'
    for page in range(23):
        page_url = "http://zsxx.e21.cn/e21html/zhaosheng/listihszbkxth1p" + str(page) + ".html"
        enter(page_url, row_xpath)
        count = get_ele_num('//td[@align="left"]')
        urls, titles = get_one_url(urls, titles, count)
    # mode='a' appends, so re-running the script accumulates rows in the CSVs.
    pds.DataFrame(data=urls).to_csv('C:/Users/Administrator/Desktop/urls.csv', sep=',', mode='a', header=None, index=None)
    pds.DataFrame(data=titles).to_csv('C:/Users/Administrator/Desktop/titles.csv', sep=',', mode='a', header=None, index=None)
# Script entry point: only crawl when run directly, not when imported.
if __name__ == "__main__":
    main()
# Python — fetch all e21 articles
# (blog-page residue from the original source; latest recommended article published 2023-02-06 20:59:30)