# 基于selenium和bs4的chinanews新闻爬取
# 本段代码实现了对chinanews的翻页爬取,入门水平,写的有点辣鸡,但还能用。
# 本项目是针对中国新闻网的简单的爬虫实现
# Required third-party packages: selenium (3.x API), beautifulsoup4, requests, pymongo.
import re
import time as t

import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient
from selenium import webdriver


class NewsScraper:
    """Scrape chinanews.com.cn search results with Selenium and store them in MongoDB."""

    def __init__(self):
        # Headers for the per-article requests.get() calls (not for Selenium).
        self.headers = {
            'User-Agent': 'XXX',
            'Referer': 'https://sou.chinanews.com.cn/'
        }
        # NOTE(review): a positional driver path is deprecated in Selenium 4;
        # kept because this code uses the Selenium 3 find_elements_by_xpath API.
        self.driver = webdriver.Edge("./edge/msedgedriver.exe")
        self.conn = MongoClient()  # default localhost:27017
        self.db_name = 'toutiao'
        self.col_name = 'news1'
        self.database = self.conn[self.db_name]
        self.collection = self.database[self.col_name]

    def scrape_news(self, keywords, max_pages=200):
        """Search each keyword and insert one MongoDB document per result.

        Each document has keys: title, time, content, author, cata (the keyword).
        Scrapes at most ``max_pages`` result pages per keyword (default 200,
        matching the original hard-coded limit).
        """
        try:
            for keyword in keywords:
                self.driver.get('https://sou.chinanews.com.cn/search.do?q=' + keyword)
                page = 0
                # BUG FIX: the original counter `k` was never incremented, so the
                # loop could only terminate via an exception.
                while page < max_pages:
                    try:
                        results = self.driver.find_elements_by_xpath(
                            '/html/body/table/tbody/tr/td[1]/div[1]//table')
                        for result in results:
                            link = result.find_element_by_xpath(
                                ".//tbody/tr[1]/td[2]/ul/li[1]/a")
                            doc = self._parse_article(link.get_attribute("href"))
                            doc["title"] = link.text
                            doc["cata"] = keyword
                            self.collection.insert_one(doc)
                            print("插入数据成功")
                        t.sleep(2)  # be polite to the server between pages
                        # Pagination: second-to-last anchor in the pager is "next".
                        pager_links = self.driver.find_elements_by_xpath(
                            '/html/body/table/tbody/tr/td[1]/div[2]/div//a')
                        next_button = pager_links[-2]
                        if not next_button.is_enabled():
                            break  # no further pages for this keyword
                        next_button.click()
                        # BUG FIX: the original `break` right after click() exited
                        # the loop after a single page-flip; continue paging instead.
                        page += 1
                    except Exception as exc:
                        # End of results or a page-structure change: report it
                        # instead of silently swallowing (original set k = 200).
                        print(f"停止翻页: {exc!r}")
                        break
        finally:
            # BUG FIX: quit once after ALL keywords; the original quit inside the
            # keyword loop, leaving a dead driver for every keyword after the first.
            self.driver.quit()

    def _parse_article(self, url):
        """Fetch one article URL; return time/author/content with fallbacks."""
        html = requests.get(url, headers=self.headers).content.decode(
            'utf-8', errors="ignore")
        soup = BeautifulSoup(html, 'html.parser')
        time_div = soup.find('div', class_='content_left_time')
        body_div = soup.find('div', class_='left_zw')
        # Fallback values used when the expected divs are missing.
        pub_time = "0000年00月00日 00:00"
        author = "***"
        content = "暂无正文信息"
        if time_div:
            pub_time = time_div.get_text()
            # Publication timestamp like "2023年08月01日 10:23".
            match = re.search(r'(\d+年\d+月\d+日\s+\d+:\d+)', pub_time)
            if match:
                pub_time = match.group(1)
            source = time_div.find('a', class_='source')
            if source:
                author = source.get_text()
        if body_div:
            content = body_div.get_text()
        return {"time": pub_time, "content": content, "author": author}
def main():
    """Entry point: scrape a fixed list of topic keywords."""
    keywords = [
        "生态环境", "文化旅游", "亚运会", "日本核废水",
        "gpt", "成都大运会", "国际",
    ]
    NewsScraper().scrape_news(keywords)


if __name__ == "__main__":
    main()