本文提供了一种用Selenium库自动爬取知乎热榜中回答数小于30的问题,并使用SMTP将内容发送到用户邮箱的方法。
// A highlighted block
var foo = 'bar';
# -*- coding: utf-8 -*-
"""
Created on Wed Oct 17 07:57:51 2018
@author: ljc
"""
import numpy as np
import re
from selenium import webdriver
import time
from selenium.common.exceptions import StaleElementReferenceException
import random
def smtp_sender(mail_content = '你好,我是来自知乎的[邓旭东HIT] ,现在在进行一项用python登录qq邮箱发邮件的测试'):
    """Send ``mail_content`` as a plain-text UTF-8 e-mail via QQ Mail's SMTP-over-SSL server.

    The server, account, recipient and subject are fixed inside the function;
    only the message body is supplied by the caller.

    Args:
        mail_content: Text used as the body of the message.

    Returns:
        None.

    Raises:
        smtplib.SMTPException: Propagated unchanged if connecting, logging in
            or sending fails. The connection is closed in every case.
    """
    from email.mime.text import MIMEText
    from email.header import Header
    from smtplib import SMTP_SSL

    # QQ Mail SMTP server.
    host_server = 'smtp.qq.com'
    # QQ number of the sending account (used as the login name).
    sender_qq = '145*1****'
    # QQ Mail authorization code (not the account password).
    # NOTE(review): credentials are hard-coded placeholders; consider loading
    # them from the environment or a config file instead of the source.
    pwd = '*******'
    # Sender address.
    sender_qq_mail = '*******0@qq.com'
    # Recipient address.
    receiver = '2*******@qq.com'
    # Subject line.
    mail_title = '热榜检测'

    # Build the message before opening the connection so the socket is held
    # only for as long as the SMTP conversation itself.
    msg = MIMEText(mail_content, "plain", 'utf-8')
    msg["Subject"] = Header(mail_title, 'utf-8')
    msg["From"] = sender_qq_mail
    msg["To"] = receiver

    # The context manager issues QUIT and closes the socket even when login()
    # or sendmail() raises; the original leaked the connection on any error.
    with SMTP_SSL(host_server) as smtp:
        # 1 turns on protocol debugging output, 0 turns it off.
        smtp.set_debuglevel(0)
        smtp.ehlo(host_server)
        smtp.login(sender_qq, pwd)
        smtp.sendmail(sender_qq_mail, receiver, msg.as_string())
def _hot_rank_xpath(rank):
    """Return the XPath selecting the hot-list index badge for a 1-based ``rank``."""
    # Ranks 1-3 carry an extra "HotList-itemIndexHot" class on their badge.
    if rank <= 3:
        css_class = 'HotList-itemIndex HotList-itemIndexHot'
    else:
        css_class = 'HotList-itemIndex'
    # Badges show the rank as two digits ("01" ... "50").
    return '//div[contains(text(), "{0}") and @class="{1}"]'.format(
        '{:02d}'.format(rank), css_class)


def gene_Hotstr(question_no=None, q_len=0, question_no_list=None, q_list_len=0):
    """Sweep the Zhihu hot list once and e-mail any questions not seen before.

    Each of the 50 ranked entries is opened in a headless Chrome session. The
    question title is recorded in ``question_no_list``; if the answer count
    parsed from the page is below 30 it is also recorded in ``question_no``.
    After the sweep, one mail is sent per list that grew beyond the length
    the caller reported (``q_len`` / ``q_list_len``).

    Args:
        question_no: Titles of low-answer questions already reported. It is
            extended in place; a new list is created when omitted.
        q_len: Length of ``question_no`` at the time of the last report.
        question_no_list: Titles of all hot-list questions already reported.
            It is extended in place; a new list is created when omitted.
        q_list_len: Length of ``question_no_list`` at the last report.

    Returns:
        A 6-tuple ``(question_no, q_len, questionset, question_no_list,
        q_list_len, questions_list_set)`` where the two lengths are the
        current list lengths and the two ``*set`` strings hold the
        newline-prefixed titles added during this sweep.
    """
    # Use fresh lists per call; mutable defaults would be shared by every
    # call that omits the argument.
    if question_no is None:
        question_no = []
    if question_no_list is None:
        question_no_list = []

    option = webdriver.ChromeOptions()
    option.add_argument("headless")
    driver = webdriver.Chrome(r"C:\Users\ljc14\Desktop\chromedriver.exe",chrome_options=option)
    questionset = ''
    questions_list_set = ''
    try:
        driver.get('https://www.zhihu.com/billboard')
        for i in range(50):
            badges = driver.find_elements_by_xpath(_hot_rank_xpath(i + 1))
            if not badges:
                continue
            try:
                # Clicking the rank badge navigates to the question page.
                webdriver.ActionChains(driver).move_to_element(badges[0]).click(badges[0]).perform()

                # Keep only the digits of the answer-count header text.
                header_texts = [x.text for x in driver.find_elements_by_class_name("List-headerText")]
                digits = re.sub(r"\D", "", str(header_texts))
                # No digits means the count could not be read; comparing ''
                # with 30 would raise TypeError, so treat it as unknown.
                answer_count = int(digits) if digits else None
                print(answer_count)

                # The stringified list is the key used for de-duplication and
                # the text that ends up in the mail body.
                title = str([x.text for x in driver.find_elements_by_class_name("QuestionHeader-title")])
                print(title)
                if title not in question_no_list:
                    question_no_list.append(title)
                    questions_list_set = questions_list_set + '\n' + title
                if answer_count is not None and answer_count < 30:
                    if title not in question_no:
                        question_no.append(title)
                        print(title)
                        questionset = questionset + '\n' + title
                driver.back()
            except StaleElementReferenceException:
                # The page changed underneath us. Stop this sweep but still
                # report what was collected, so the returned state stays
                # consistent and always has six fields for the caller.
                print(1)
                time.sleep(0.5)
                break
    finally:
        # Always shut the browser down; otherwise every sweep leaves a
        # headless Chrome process running.
        driver.quit()

    if len(question_no)>q_len:
        smtp_sender(questionset)
    if len(question_no_list)>q_list_len:
        smtp_sender(questions_list_set)
    q_len = len(question_no)
    q_list_len = len(question_no_list)
    return question_no,q_len,questionset,question_no_list,q_list_len,questions_list_set
#driver = webdriver.Chrome(r"C:\Users\ljc14\Desktop\chromedriver.exe")
#driver.get('https://www.zhihu.com/billboard')
##action3 = driver.find_elements_by_xpath('//div[contains(text(), "01") and @class="HotList-itemIndex HotList-itemIndexHot"]')
#action3 = driver.find_elements_by_xpath('//div[contains(text(), "04") and @class="HotList-itemIndex"]')
##<div class="HotList-itemIndex HotList-itemIndexHot">01</div>
#print(len(action3))
#getHot(action3)
def time_pa():
    """Run hot-list sweeps forever, pausing 0-9 seconds between them.

    The seen-question lists and their reported lengths are carried from one
    sweep to the next so that each question is mailed only once. This
    function does not return.
    """
    question_no = []
    q_len = 0
    question_no_list = []
    q_list_len = 0
    while True:
        if q_len == 0:
            print(0)
        seen_before = (q_len, q_list_len)
        # Always pass the full state. Calling gene_Hotstr() without arguments
        # while q_len is 0 would drop q_list_len and re-send the full-list
        # mail on every cycle.
        question_no, q_len, _, question_no_list, q_list_len, _ = gene_Hotstr(
            question_no, q_len, question_no_list, q_list_len)
        # Report "nothing new" only after a sweep that added no question.
        if (q_len, q_list_len) == seen_before:
            print('Nothing new, so sad')
        time.sleep(random.randint(0,9))
# Script entry point: start the polling loop, which runs inside `while True`
# and has no exit path of its own.
time_pa()