Selenium爬取会计师事务所新闻信息——以中准会计师事务所为例

预期效果

可以看到,该会计师事务所共有17页新闻,每一页有20条新闻。如果一个一个点开新闻,收集具有某些关键字的新闻标题、内容、地址等信息,比较费时;使用爬虫可以较快地爬取下来。

代码实现

from selenium import webdriver

import re

from selenium.common.exceptions import TimeoutException

from selenium.webdriver.common.by import By

from selenium.webdriver.support.ui import WebDriverWait

from selenium.webdriver.support import expected_conditions as EC

import requests

from bs4 import BeautifulSoup

import re

import csv

# Shared Selenium Chrome driver used by search()/get_products() below.
# Requires a chromedriver binary on PATH.
browser = webdriver.Chrome()

# Explicit-wait helper: polls up to 10 seconds for expected conditions.
wait=WebDriverWait(browser, 10)

def search(i, j):
    """Open news-list page *i*, click its *j*-th news link, and hand the
    opened article to get_products() for keyword extraction.

    Args:
        i: 1-based page number of the news list (site has 17 pages).
        j: 1-based index of the news item on that page (20 per page).

    On a Selenium timeout the whole step is retried by recursing.
    """
    try:
        browser.get('http://www.zhongzhuncpa.com/list-58956b7d6a3919c1448ca56a/page{}.shtml'.format(i))
        print('打开第{}页'.format(i))
        print('进入第{}页,第{}条新闻'.format(i, j))
        # Wait until the j-th list item's anchor is clickable, then open it.
        submit = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "body > div:nth-child(2) > div > div.column.large-9.medium-12.small-12 > div > ul > li:nth-child({}) > a".format(j)))
        )
        submit.click()
        print('打开第{}页,第{}条新闻'.format(i,j))
        get_products(i, j)
        # Navigate back to the list page so the caller's loop can continue.
        print('返回第{}页'.format(i))
        browser.get('http://www.zhongzhuncpa.com/list-58956b7d6a3919c1448ca56a/page{}.shtml'.format(i))
    except TimeoutException:
        # NOTE(review): unbounded retry — a permanently missing element
        # recurses forever; consider a retry counter.
        return search(i, j)

def get_products(i, j):
    """Scan the currently open article for target keywords and, on a match,
    append (title, content, pseudo-url, keyword) as a CSV row to file 'n1'.

    Args:
        i: page number the article came from (used only for the row's url field).
        j: item index on that page (same).
    """
    html = browser.page_source
    soup = BeautifulSoup(html, 'lxml')
    # Article headline is taken from the <title> element.
    title = soup.title.string
    for key in ['调研', '莅临', '访', '研讨', '邀', '到', '视察','接见','召开','指导']:
        if re.search(key, title):
            print(title)
            # NOTE(review): the original pattern was garbled when the blog
            # stripped HTML tags; it most plausibly captured paragraph text.
            # Confirm against the site's markup.
            results = re.findall(r"<p>(.*?)</p>", html, re.S)
            # Join with a leading space before each fragment (matches the
            # original "content + ' ' + result" accumulation).
            content = ''.join(' ' + result for result in results)
            # No real URL is recorded; '{page}&{item}' stands in for it.
            url = '{}&{}'.format(i, j)
            print('保存第{}页,第{}条新闻,关键字是{}'.format(i,j,key))
            row = [title, content, url, key]
            # Append mode: main() already wrote the header row to 'n1'.
            with open('n1', 'a', encoding='utf-8-sig', newline='') as f:
                writer = csv.writer(f)
                print(content)
                writer.writerow(row)

def main():
    """Create CSV file 'n1' with a header row, then visit every news item
    (17 pages x 20 items) via search().
    """
    # 'with' guarantees the handle is closed (the original leaked it);
    # get_products() re-opens the file in append mode for each matching row.
    with open('n1', 'w', encoding='utf-8-sig', newline='') as f:
        writer = csv.writer(f)
        head = ['title', 'content', 'url', 'key']
        writer.writerow(head)
    for i in range(1, 18):
        for j in range(1, 21):
            search(i, j)
            print('执行完i={},j={}'.format(i,j))

# Script entry point (the blog paste had dropped the indentation of this call).
if __name__ == '__main__':
    main()

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值