预期效果
可以看到,该会计师事务所共有17页新闻,每一页有20条新闻。如果一条一条地点开新闻,收集含有某些关键字的新闻标题、内容、地址等信息,会比较费时;使用爬虫可以较快地把它们爬取下来。
代码实现
from selenium import webdriver
import re
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import requests
from bs4 import BeautifulSoup
import re
import csv
# Shared Chrome WebDriver session; search() navigates with it and
# get_products() reads the current page source from it.
browser = webdriver.Chrome()
# Explicit wait bound to the session above, created with a timeout of 10;
# search() catches the TimeoutException raised when a wait expires.
wait=WebDriverWait(browser, 10)
def search(i, j, max_retries=3):
    """Open list page ``i``, click its ``j``-th news link and scrape the article.

    The list page is loaded, the ``j``-th ``<li>`` link is clicked once it is
    clickable, ``get_products(i, j)`` processes the page that opens, and the
    browser is finally sent back to the list page.

    Args:
        i: 1-based number of the list page.
        j: 1-based position of the news item within that page.
        max_retries: Maximum number of attempts when Selenium raises
            ``TimeoutException``. Values below 1 are treated as 1.

    Returns:
        None. If every attempt times out the item is skipped after a message
        is printed, instead of retrying without bound.
    """
    list_url = (
        'http://www.zhongzhuncpa.com/list-58956b7d6a3919c1448ca56a/page{}.shtml'
    ).format(i)
    item_selector = (
        "body > div:nth-child(2) > div > div.column.large-9.medium-12.small-12"
        " > div > ul > li:nth-child({}) > a"
    ).format(j)
    attempts = max(1, max_retries)

    # A bounded loop replaces the former unconditional recursive retry, which
    # kept recursing until RecursionError when the item never became clickable
    # (for example a list page with fewer items than expected).
    for attempt in range(1, attempts + 1):
        try:
            browser.get(list_url)
            print('打开第{}页'.format(i))
            print('进入第{}页,第{}条新闻'.format(i, j))
            submit = wait.until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, item_selector))
            )
            submit.click()
            print('打开第{}页,第{}条新闻'.format(i, j))
            get_products(i, j)
            print('返回第{}页'.format(i))
            browser.get(list_url)
            return
        except TimeoutException:
            print('第{}页,第{}条新闻超时,第{}/{}次尝试'.format(i, j, attempt, attempts))

    print('第{}页,第{}条新闻多次超时,已跳过'.format(i, j))
def get_products(i, j):
    """Save the currently opened news page to ``n1`` if its title has a keyword.

    The page source is read from the shared ``browser``. When the page title
    contains one of the keywords, one CSV row ``[title, content, url, key]`` is
    appended to the file ``n1`` for every keyword that matches.

    Args:
        i: Number of the list page the news item came from.
        j: Position of the news item within that list page.

    Returns:
        None. Pages without a title, or whose title matches no keyword, are
        ignored.
    """
    keywords = ['调研', '莅临', '访', '研讨', '邀', '到', '视察', '接见', '召开', '指导']

    html = browser.page_source
    soup = BeautifulSoup(html, 'lxml')

    # soup.title is None when the page has no <title>, and .string is None for
    # an empty or nested title; neither can be searched for keywords.
    title_tag = soup.title
    title = title_tag.string if title_tag is not None else None
    if not title:
        return

    # The keywords are plain literals, so substring membership is equivalent
    # to the regular-expression search used previously.
    matched_keys = [key for key in keywords if key in title]
    if not matched_keys:
        return

    # NOTE(review): the pattern literal was corrupted in the source (split over
    # two lines with the surrounding tags missing). "<p>(.*?)</p>" is a
    # reconstruction -- confirm against the real page markup.
    paragraphs = re.findall(r"<p>(.*?)</p>", html, re.S)
    # Each fragment is prefixed with a single space, as before.
    content = ''.join(' ' + fragment for fragment in paragraphs)

    # The "url" column holds the "<page>&<index>" pair, not a web address.
    url = '{}&{}'.format(i, j)

    # Extraction above is done once; the file is opened once for all rows.
    with open('n1', 'a', encoding='utf-8-sig', newline='') as f:
        writer = csv.writer(f)
        for key in matched_keys:
            print(title)
            print('保存第{}页,第{}条新闻,关键字是{}'.format(i, j, key))
            print(content)
            writer.writerow([title, content, url, key])
def main():
    """Create the CSV file ``n1`` with its header, then scrape every news item.

    Visits list pages 1-17 and items 1-20 on each page through ``search``.
    The shared ``browser`` session is closed when the run ends, whether it
    finishes normally or raises.
    """
    # Write and close the header before any scraping starts. Leaving this
    # handle open kept the header in its buffer, to be flushed at offset 0
    # only at exit -- on top of rows that had been appended in the meantime.
    with open('n1', 'w', encoding='utf-8-sig', newline='') as f:
        csv.writer(f).writerow(['title', 'content', 'url', 'key'])

    try:
        for i in range(1, 18):
            for j in range(1, 21):
                search(i, j)
                print('执行完i={},j={}'.format(i, j))
    finally:
        # Release the Chrome session started at import time.
        browser.quit()


if __name__ == '__main__':
    main()