import requests
import re
import os
from browsermobproxy import Server
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from pyquery import PyQuery
from smtplib import SMTP_SSL
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from email.header import Header
class Csdn(object):
    """Scrape the CSDN blog ranking page and format it as a text report.

    Instantiating the class runs the whole pipeline immediately (see
    ``main``).  Afterwards ``page_links`` and ``page_titles`` hold the
    extracted data and ``hot`` holds the formatted report (an empty string
    when the number of links and titles did not match).
    """

    def __init__(self):
        self.page_links = []
        self.page_titles = []
        self.base_url = 'https://blog.csdn.net/rank/list'
        # Not read by any method of this class; kept as a public attribute.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/92.0.4515.159 Safari/537.36',
        }
        self.main()

    def main(self):
        """Fetch the ranking page, build the report and return it."""
        self.get_blank_links()
        self.get_hot()
        print('数据已爬取完毕,正在发送邮件...')
        return self.hot

    def get_blank_links(self):
        """Load the ranking page and fill ``page_links`` / ``page_titles``."""
        self._parse_rank_html(self._fetch_rank_html())

    def _fetch_rank_html(self):
        """Return the rendered source of ``base_url`` from headless Chrome.

        Chrome is routed through a local browsermob proxy.  Both the proxy
        server and the driver are shut down even when loading the page fails.
        """
        options = webdriver.ChromeOptions()
        options.add_argument('-headless')
        server = Server(
            os.getcwd()
            + '\\video\\browsermob-proxy-2.1.4\\bin\\browsermob-proxy.bat'
        )
        server.start()
        try:
            proxy = server.create_proxy()
            options.add_argument('--proxy-server={0}'.format(proxy.proxy))
            options.add_argument('ignore-certificate-errors')
            # ``options=`` replaces the deprecated ``chrome_options=`` keyword.
            browser = webdriver.Chrome(options=options)
            try:
                proxy.new_har("ht_list2", options={'captureContent': True})
                browser.get(self.base_url)
                return browser.page_source
            finally:
                # quit() ends the whole driver session; the previous close()
                # only closed the browser window.
                browser.quit()
        finally:
            # The proxy server was previously left running after the scrape.
            server.stop()

    def _parse_rank_html(self, html):
        """Extract article links and titles from *html* into the instance.

        Newly found entries are appended to whatever ``page_links`` and
        ``page_titles`` already contain before the lists are cleaned up.
        """
        raw_urls = re.findall(
            r'https://blog\.csdn\.net/.+?.*/article/details/\d*', html)
        anchors = re.findall(r'class=.title.>.+?.*</a>', html)

        for anchor in anchors:
            inner = re.findall(r'>.+?.*<', anchor)
            # The string form of the match list is stored, minus the outer
            # brackets, so each title keeps the quote characters of the list
            # repr (e.g. "'Some title'").
            self.page_titles.append(str(inner).strip('[]<>'))

        for raw in raw_urls:
            # The greedy pattern can span several attributes on one line, so
            # each space-separated fragment is inspected on its own.
            for fragment in raw.split(' '):
                if 'article' not in fragment:
                    continue
                found = re.search('https://.+?.*', fragment.strip('="'))
                if found is None:
                    # Fragment mentions "article" but carries no URL; this
                    # used to raise AttributeError on ``None.group``.
                    continue
                self.page_links.append(found.group(0))

        unique_links = self._strip_and_dedupe(self.page_links)
        unique_titles = self._strip_and_dedupe(self.page_titles)

        drop_markup = str.maketrans('', '', '<>"')
        self.page_titles = [t.translate(drop_markup) for t in unique_titles]
        # The last two links are always discarded.
        # NOTE(review): presumably they are not ranking entries — confirm.
        self.page_links = unique_links[:-2]

    @staticmethod
    def _strip_and_dedupe(values):
        """Strip ``<``, ``>`` and ``"`` from both ends and drop duplicates.

        Order of first occurrence is preserved.  Stripping happens before the
        duplicate check so that values differing only in that surrounding
        markup collapse into one entry.
        """
        return list(dict.fromkeys(v.strip('<>"') for v in values))

    def get_hot(self):
        """Build the report text from the collected data into ``self.hot``.

        When ``page_links`` and ``page_titles`` differ in length an error line
        is printed and ``self.hot`` is set to the empty string.
        """
        links = self.page_links
        titles = self.page_titles
        if len(links) != len(titles):
            print('error:links error')
            self.hot = ''
            return
        rows = [
            'Top' + str(rank) + title + ': ' + link + '\n'
            for rank, (title, link) in enumerate(zip(titles, links), start=1)
        ]
        self.hot = 'CSDN热榜:\n' + ''.join(rows)
class Email(object):
    """Run the CSDN scraper and mail its report through the 163.com SMTP server.

    Instantiating the class scrapes the ranking (via ``Csdn``) and sends the
    message immediately.
    """

    def __init__(self):
        self.csdn = Csdn()
        self.hot = self.csdn.hot
        self.email()

    def email(self):
        """Send ``self.hot`` as a UTF-8 plain-text mail over SMTP/SSL."""
        server = 'smtp.163.com'
        sender = 'svip123456789svip@163.com'
        # NOTE(security): a credential is committed in source.  It can be
        # overridden with the CSDN_SMTP_PASSWORD environment variable; the
        # literal fallback only keeps existing setups working and should be
        # rotated and removed.
        pwd = os.environ.get('CSDN_SMTP_PASSWORD', 'ETAVQOBPVVTYPSXZ')
        receiver = ['svip123456789svip@163.com', 'svip123456789svip@163.com']

        msg = self._build_message(sender, receiver)
        # The context manager issues QUIT and closes the socket even when
        # login or delivery raises.
        with SMTP_SSL(server) as smtp:
            smtp.login(sender, pwd)
            smtp.sendmail(sender, receiver, msg.as_string())

    def _build_message(self, sender, receiver):
        """Return the MIME message carrying ``self.hot`` as its text body.

        ``sender`` is the From address; ``receiver`` is a list of recipient
        addresses placed in the To header.
        """
        msg = MIMEMultipart()
        msg["Subject"] = Header('CSDN热榜', 'utf-8')
        msg["From"] = sender
        # Address lists in mail headers are comma-separated (RFC 5322); the
        # previous ';' separator produced a malformed To header.
        msg['To'] = ", ".join(receiver)
        msg.attach(MIMEText(str(self.hot), 'plain', 'utf-8'))
        return msg


if __name__ == '__main__':
    Email()