初次写这个爬虫,有借鉴别人的地方,写得不好请勿喷。
下面直接上代码:
#爬取的url,title,img,comment,source 通过字典的形式存储在列表里,需要的时候遍历提取就好
import re
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
# Search-results page to crawl. The ``keyword`` query parameter is the
# percent-encoded UTF-8 form of the search term "街拍".
start_url = "https://www.toutiao.com/search/?keyword=%E8%A1%97%E6%8B%8D"
# Scraped results: one dict per article, filled in by TouTiao.take_data_last
# with the keys "url", "title", "source", "comment" and "view_img".
# Iterate over this list to read the data back when needed.
info_list = []
def Verification_Code(driver):
    """Dismiss the captcha pop-up (if one is shown) and reload the page.

    This is best-effort: when the pop-up cannot be found or closed, a
    notice is printed and the function returns normally so the crawl can
    continue.

    Args:
        driver: Selenium WebDriver that has already loaded the page.
    """
    try:
        time.sleep(2)  # give the pop-up time to appear
        driver.switch_to.window(driver.window_handles[0])
        # Element with class "icon": presumably the pop-up's close ("X")
        # button -- verify against the live page markup.
        driver.find_element(By.CLASS_NAME, "icon").click()
        time.sleep(2)
        driver.refresh()
        time.sleep(3)
        driver.switch_to.window(driver.window_handles[0])
    except Exception:
        # `Exception` rather than a bare `except`, so Ctrl-C and
        # SystemExit are no longer swallowed here.
        print("无验证码")
class TouTiao():
    """Selenium-driven scraper for a Toutiao search-results page.

    Opens the page in Chrome, keeps scrolling until no further article
    cards are loaded, then appends one dict per result to the
    module-level ``info_list``.
    """

    # Captures the ``src`` of each preview ``<img>`` in a card's innerHTML.
    _IMG_SRC_RE = re.compile(r'<img alt="" src="(.*?)">')

    def __init__(self, url=None):
        """Start a maximized Chrome session.

        Args:
            url: Page to crawl. Defaults to the module-level ``start_url``.
        """
        self.driver = webdriver.Chrome()
        self.driver.maximize_window()
        # Implicit wait: element lookups poll for up to 10 s before failing.
        self.driver.implicitly_wait(10)
        self.url = start_url if url is None else url

    def _card_count(self):
        """Return how many article cards are currently in the page."""
        return len(self.driver.find_elements(By.CLASS_NAME, "articleCard"))

    def take_data_last(self):
        """Scrape the loaded results, append them to ``info_list``, print them.

        Each result becomes a dict with "url" and "title"; "source",
        "comment" and "view_img" are attached by position, i.e. the i-th
        source / comment / card element is assumed to belong to the i-th
        link. "view_img" is ``None`` when a card has no matching ``<img>``.
        """
        find_all = self.driver.find_elements

        # News URL and title: one new record per result link.
        records = [
            {"url": link.get_attribute("href"), "title": link.text}
            for link in find_all(By.CLASS_NAME, "link")
        ]

        # News source. zip() pairs by position and ignores surplus elements
        # instead of raising IndexError.
        for record, source in zip(records, find_all(By.CLASS_NAME, "J_source")):
            record["source"] = source.text

        # Comment count.
        for record, comment in zip(records, find_all(By.CLASS_NAME, "comment")):
            record["comment"] = comment.text

        # Preview image: the last matching <img> inside each article card.
        for record, card in zip(records, find_all(By.CLASS_NAME, "articleCard")):
            html = card.get_attribute("innerHTML") or ""
            matches = self._IMG_SRC_RE.findall(html)
            record["view_img"] = matches[-1] if matches else None

        # Records are built per call and appended afterwards, so a second
        # call cannot overwrite the entries stored by an earlier one.
        info_list.extend(records)
        print(info_list)
        print(len(info_list))

    def load_data(self):
        """Open the page, scroll until the feed stops growing, then scrape."""
        self.driver.get(self.url)
        time.sleep(1)
        # Close the captcha pop-up (if any) and refresh the page.
        Verification_Code(self.driver)
        self.driver.execute_script("window.scrollTo(0,1000);")
        time.sleep(1)
        while True:
            # Number of articles on the page before this round of scrolling.
            before_num = self._card_count()
            # Drag the scrollbar to the bottom a few times so more results
            # get loaded.
            for _ in range(3):
                self.driver.execute_script(
                    "window.scrollTo(0,document.body.scrollHeight);"
                )
                time.sleep(3)
            # No new articles appeared: everything available is loaded.
            if self._card_count() == before_num:
                break
        self.take_data_last()

    def close_browser(self):
        """Wait five seconds, then quit the browser session."""
        time.sleep(5)
        self.driver.quit()

    def main(self):
        """Run the crawl; the browser is closed even if the crawl fails."""
        try:
            self.load_data()
        finally:
            self.close_browser()
# Script entry point: crawl only when run directly, not when imported.
if __name__ == "__main__":
    TouTiao().main()
最后的效果:运行后会在控制台打印抓取到的 info_list 及其条数。由于是新手,代码都是自己写的,仅供参考;代码和功能还有许多可以改进的地方,希望大佬多多指正。