目录
本文分享一个使用Python编写网易新闻爬虫的项目。在这个项目中,我们将使用requests库获取网页源代码,使用lxml库解析HTML,使用selenium库模拟浏览器操作,并使用multiprocessing库实现多进程以加快爬取速度。最后,我们将把爬取到的数据保存到CSV文件中。
1. 导入所需库
首先,导入所需要的库:
import requests
from lxml import etree
from selenium import webdriver
from time import sleep
import multiprocessing
import csv
2. 定义请求头
接下来,定义一个请求头,用于模拟浏览器访问网站:
# Request headers that make the crawler look like a desktop Edge browser.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0"
    ),
}
3. 获取所有板块的链接
编写一个函数get_news(headers),用于获取首页上所有板块的名称和链接:
def get_news(headers):
    """Fetch the NetEase News home page and collect its section links.

    Args:
        headers: HTTP request headers passed through to ``requests.get``.

    Returns:
        A ``(plates, plate_urls)`` tuple of two index-aligned lists: the
        text of the first link found in each matched ``<li>`` and that
        link's ``href``.
    """
    url = "https://news.163.com/"
    plates = []
    plate_urls = []
    # A timeout keeps a stalled connection from hanging the whole crawl.
    response = requests.get(url, headers=headers, timeout=10).text
    tree = etree.HTML(response)
    for li in tree.xpath('//div[@class="bd"]/div/ul/li'):
        names = li.xpath('.//a/text()')
        links = li.xpath('.//a/@href')
        # An <li> without any link is not a section entry; skipping it avoids
        # the IndexError that indexing an empty xpath result would raise.
        if not links:
            continue
        # A link with no text still counts; an empty name keeps the lists aligned.
        plates.append(names[0] if names else '')
        plate_urls.append(links[0])
    return plates, plate_urls  # all section names, all section links
4. 获取新闻标题和详情链接
编写一个函数get_news_by_selenium(plate_urls),用于获取各板块下的新闻标题和详情链接:
def get_news_by_selenium(plate_urls):
    """Open selected section pages in Edge and collect headline links.

    Args:
        plate_urls: Section URLs; only the positions listed in
            ``plate_urls_index`` are visited.

    Returns:
        A ``(titles, new_detail_urls)`` tuple of two index-aligned lists
        holding each headline's text and its detail-page URL.
    """
    # NOTE(review): passing the driver path positionally is only accepted by
    # older Selenium releases; newer ones expect a Service object. Confirm
    # against the installed Selenium version.
    edge = webdriver.Edge(r'/drive/msedgedriver.exe')
    plate_urls_index = [1, 2, 4, 5]
    titles = []
    new_detail_urls = []
    try:
        for i in plate_urls_index:
            # The home page may expose fewer sections than expected.
            if i >= len(plate_urls):
                continue
            edge.get(url=plate_urls[i])  # open the section page
            # Scroll to the bottom of the page before reading its source.
            edge.execute_script("window.scrollTo(0, document.body.scrollHeight)")
            sleep(2)  # give the page time to finish loading
            tree = etree.HTML(edge.page_source)
            for div in tree.xpath('//div[@class="ndi_main"]/div'):
                title = div.xpath('.//div[@class="news_title"]/h3/a/text()')
                new_detail_url = div.xpath('.//div[@class="news_title"]/h3/a/@href')
                # Record an item only when both parts exist, so the two
                # result lists can never fall out of step.
                if not title or not new_detail_url:
                    continue
                titles.append(title[0])
                new_detail_urls.append(new_detail_url[0])
    finally:
        edge.quit()  # always close the browser, even if scraping failed
    return titles, new_detail_urls  # all headlines, all detail-page links
5. 获取新闻详情页内容
接下来,需要编写一个函数get_news_detail_by_selenium(url),用于获取新闻详情页内容:
def get_news_detail_by_selenium(url):
    """Scrape one news detail page with Edge.

    Args:
        url: Detail-page URL to open.

    Returns:
        A ``(content_list, gentie_list, sanyu_list, rm_pl_lists)`` tuple of
        four single-element lists:

        * ``content_list[0]``: list of paragraph text fragments (empty when
          nothing matched).
        * ``gentie_list[0]``: text of the first link in the comment header,
          or ``None`` when absent.
        * ``sanyu_list[0]``: text of the second link in the comment header,
          or ``None`` when absent.
        * ``rm_pl_lists[0]``: list with one concatenated string per
          hot-comment container.
    """
    # NOTE(review): passing the driver path positionally is only accepted by
    # older Selenium releases; newer ones expect a Service object. Confirm
    # against the installed Selenium version.
    edge = webdriver.Edge(r'/drive/msedgedriver.exe')
    try:
        edge.get(url=url)  # open the detail page
        print('正在爬取' + url + '的内容')
        # Scroll to the bottom of the page before reading its source.
        edge.execute_script("window.scrollTo(0, document.body.scrollHeight)")
        sleep(10)  # give the page time to finish loading
        html_content = edge.page_source
    finally:
        # The page source is a plain string, so the browser can be released
        # before parsing, and is released even if loading failed.
        edge.quit()

    html = etree.HTML(html_content)
    if html is None:
        # Nothing parseable came back; report an empty record for this URL.
        return [[]], [None], [None], [[]]

    # xpath() yields an empty list rather than raising when nothing matches,
    # so emptiness is checked explicitly instead of catching exceptions.
    content = html.xpath('//*[@id="content"]/div[2]/p/text()')

    gentie_nodes = html.xpath('//*/div[@id="tie"]/div/div[1]/div/a[1]/text()')
    gentie = gentie_nodes[0] if gentie_nodes else None

    sanyu_nodes = html.xpath('//*/div[@id="tie"]/div/div[1]/div/a[2]/text()')
    sanyu = sanyu_nodes[0] if sanyu_nodes else None

    # Join the text fragments of each hot comment into a single string.
    rm_pl_list = [
        ''.join(div.xpath('.//div[@class="bdy-inner"]/p/a/text()'))
        for div in html.xpath('//*/div[@class="tie-hot"]/div[@class="tie-list"]/div')
    ]

    # Each value is wrapped in a one-element list: the shape callers unpack.
    return [content], [gentie], [sanyu], [rm_pl_list]
6. 多进程加速爬取
为了提高爬取速度,可以使用multiprocessing库来实现多进程爬取:
def worker(url, num_processes=10):
    """Scrape every detail page in parallel and merge the results.

    Args:
        url: Sequence of detail-page URLs.
        num_processes: Upper bound on worker processes (default 10).

    Returns:
        A ``(content_list, gentie_list, sanyu_list, rm_pl_lists)`` tuple of
        flat lists, each in the same order as ``url``.
    """
    # Nothing to do; also avoids creating a pool for no work.
    if not url:
        return [], [], [], []

    # One pool handles all URLs. The earlier per-chunk approach computed a
    # chunk size of 0 for fewer URLs than processes, which made range() raise.
    pool_size = max(1, min(num_processes, len(url)))
    with multiprocessing.Pool(processes=pool_size) as pool:
        results = pool.map(get_news_detail_by_selenium, url)

    # Each result is a 4-tuple of single-element lists; flatten per field.
    content_list = [item for result in results for item in result[0]]
    gentie_list = [item for result in results for item in result[1]]
    sanyu_list = [item for result in results for item in result[2]]
    rm_pl_lists = [item for result in results for item in result[3]]
    return content_list, gentie_list, sanyu_list, rm_pl_lists
7. 保存数据到CSV文件
最后,将爬取到的数据保存到CSV文件中:
if __name__ == '__main__':
    # Pipeline: section links -> headline links -> detail pages -> CSV.
    plates, plate_urls = get_news(headers)
    titles, new_detail_urls = get_news_by_selenium(plate_urls)
    content_list, gentie_list, sanyu_list, rm_pl_lists = worker(new_detail_urls)
    # Save the results to a CSV file.
    with open('../../网易新闻.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['标题', '链接', '内容', '跟帖数', '参与人数', '热门评论'])
        # zip() stops at the shortest list, so a length mismatch between the
        # result lists cannot raise IndexError halfway through the file.
        rows = zip(titles, new_detail_urls, content_list, gentie_list,
                   sanyu_list, rm_pl_lists)
        for title, link, content, gentie, sanyu, hot_comments in rows:
            # content is a list of text fragments; tolerate a missing one.
            writer.writerow([title, link, ''.join(content or []), gentie, sanyu,
                             hot_comments])
    print('爬取完成')