Multithreaded crawler for zxcs.me; a run can be interrupted and later resumed.
PS: written against Python 3.7.
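The resumability comes from a tiny checkpoint file: before each listing page is crawled, keep.log is overwritten with the current position as sort|page, and the next run fast-forwards to that position. A minimal sketch of that round trip, using the same path and field layout as the code below (the helper names are mine):

import os

CHECKPOINT = "D:\\QMDownload\\keep.log"

def save_checkpoint(sort, page):
    # overwrite, so the file always holds only the latest position
    with open(CHECKPOINT, "w") as f:
        f.write(str(sort) + "|" + str(page))

def load_checkpoint():
    # returns (sort, page), or None when starting fresh
    if not os.path.exists(CHECKPOINT):
        return None
    with open(CHECKPOINT, "r") as f:
        sort, page = f.read().split("|")
    return sort, int(page)
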
zhixuan.py
import urllib.request
import os
# from bs4 import BeautifulSoup
from pyquery import PyQuery as pq
from concurrent.futures import ThreadPoolExecutor
import myEmail

def request(url):
    url_request = urllib.request.urlopen(url)
    html_doc = url_request.read().decode('utf-8', 'ignore')
    # soup = BeautifulSoup(html_doc, 'html.parser')
    soup = pq(html_doc)
    # print(soup.html())
    return soup
    # print(soup.prettify())

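One caveat: urlopen() here sends no User-Agent and has no timeout, so the site can easily block the crawler, and a dead connection can hang a worker forever. A hedged variant reusing the imports above (the header string and the 15-second timeout are illustrative choices, not from the original script):

def request_safe(url, timeout=15):
    # pretend to be a browser; the exact UA string is arbitrary
    req = urllib.request.Request(
        url, headers={"User-Agent": "Mozilla/5.0"})
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        html_doc = resp.read().decode('utf-8', 'ignore')
    return pq(html_doc)
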
soup = request('http://www.zxcs.me/')
# common_list = iter(soup.find_all("li", class_="common"))
common_list = soup('.common').items()
dirN = ""
srcL = ""
page = 0
sort = 0
srcU = ""
DLCount = 0
levelTop = []
# directory where downloaded files are saved
pathComm = "D:\\QMDownload\\"

# download one file and append progress to the log files
def download(P_dir, file_name, url):
    file = P_dir + "\\" + file_name + ".rar"
    if not os.path.exists(P_dir):
        os.makedirs(P_dir)
    urllib.request.urlretrieve(url, file)
    global DLCount
    DLCount += 1
    print("Downloaded book #" + str(DLCount))
    if DLCount % 1000 == 0:
        myEmail.sendEmail("xxxxxxxxxx@qq.com", "Reached book #" + str(DLCount))
    path = pathComm + "zhixuan.txt"
    with open(path, "a") as f:
        print("Download link: " + file_name + "|" + url)
        f.write("Download link: " + file_name + "|" + url + "\n")
    # record the running download count
    path = pathComm + "down.log"
    with open(path, "a") as ff:
        ff.write("Downloaded book #" + str(DLCount) + "\n")

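Note that download() runs on several pool threads at once, yet DLCount += 1 and the two log appends are unsynchronized, so counts can be lost and log lines interleaved. A lock fixes both; the helper below is my addition (call it in place of the bare increment, and hold the same lock around the file writes):

import threading

count_lock = threading.Lock()  # guards DLCount and the two log files

def bump_count():
    # atomic increment; returns the new value for printing/logging
    global DLCount
    with count_lock:
        DLCount += 1
        return DLCount
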
# level 3: open a book's detail page, follow the .filetit link to the
# download page, then take the first .downfile link and fetch the file
def ParsingThird(dirN, name, linkSrc):
    soup = request(linkSrc)
    href = soup(".filetit").children().attr("href")
    soup = request(href)
    d_href = soup(".downfile").eq(0).children().attr("href")
    # print("Download link: " + name + "|" + d_href)
    download(pathComm + dirN, name, d_href)

# shared pool used by the ParsingSecondThread2 variant below
pool = ThreadPoolExecutor(max_workers=3)

def ParsingSecondThread2(pageInfo):
    # variant A: push every book onto the shared pool; this variant never
    # writes keep.log, so it cannot be resumed
    # pageInfo = [pageMax, sort, srcU, dirN, page(now)]
    pageMax, sort, srcU, dirN, page = pageInfo
    realUrl = srcU + sort + "/page/"
    global pool
    for i in range(page, int(pageMax) + 1):
        print("Downloading sort [" + str(sort) + "], page [" + str(i) + "]")
        soup = request(realUrl + str(i))
        plist = soup("#plist dt a").items()
        for st in plist:
            name = st.text()
            linkSrc = st.attr("href")
            pool.submit(ParsingThird, dirN, name, linkSrc)

def ParsingSecondThread1(pageInfo):
    # variant B (the one actually used): checkpoint the current page to
    # keep.log, then download the page's books with a short-lived pool
    # pageInfo = [pageMax, sort, srcU, dirN, page(now)]
    pageMax, sort, srcU, dirN, page = pageInfo
    realUrl = srcU + sort + "/page/"
    for i in range(page, int(pageMax) + 1):
        print("Downloading sort [" + str(sort) + "], page [" + str(i) + "]")
        with open(pathComm + "keep.log", "w") as ko:
            ko.write(str(sort) + "|" + str(i))
        soup = request(realUrl + str(i))
        plist = soup("#plist dt a").items()
        with ThreadPoolExecutor(max_workers=3) as t:
            for st in plist:
                name = st.text()
                linkSrc = st.attr("href")
                t.submit(ParsingThird, dirN, name, linkSrc)

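Variant B can get away with checkpointing once per page because a ThreadPoolExecutor used as a context manager calls shutdown(wait=True) on exit: the with block does not end until every submitted task has finished, so keep.log never gets ahead by more than the page currently in flight. A tiny demonstration of that barrier:

from concurrent.futures import ThreadPoolExecutor
import time

with ThreadPoolExecutor(max_workers=3) as t:
    for _ in range(5):
        t.submit(time.sleep, 1)
print("printed only after all five sleeps have finished")
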
def ParsingSecond(pageInfo):
    # variant C: fully serial version, kept for reference
    # pageInfo = [pageMax, sort, srcU, dirN, page(now)]
    pageMax, sort, srcU, dirN, page = pageInfo
    realUrl = srcU + sort + "/page/"
    for i in range(page, int(pageMax) + 1):
        realUrl2 = realUrl + str(i)
        print("Downloading sort [" + str(sort) + "], page [" + str(i) + "]")
        with open(pathComm + "keep.log", "w") as ko:
            ko.write(str(sort) + "|" + str(i))
        soup = request(realUrl2)
        # print(realUrl2)
        plist = soup("#plist dt a").items()
        for st in plist:
            name = st.text()
            linkSrc = st.attr("href")
            ParsingThird(dirN, name, linkSrc)

# read a category's first page and return [pageMax, sort, srcU]
def getPages(dirN, srcL):
    print(dirN, ":", srcL)
    soup = request(srcL)
    # the last pagination link points at the final page,
    # e.g. ".../sort/25/page/100" -> pageMax = "100"
    lastFullUrl = soup('#pagenavi a:last').attr("href")
    pageMax = lastFullUrl[lastFullUrl.rfind("/") + 1:]
    global sort
    sort = srcL[srcL.rfind("/") + 1:]    # category id, e.g. "25"
    srcU = srcL[0:srcL.rfind("/") + 1]   # category URL without the id
    return [pageMax, sort, srcU]

# level 1: walk the category menu (.common) and build the work list
def first():
    for element in common_list:
        cds = element
        if len(cds.children()) == 1:
            # a single child means the entry is itself one category
            dirN = cds.text().replace("·", "")
            srcL = cds('a').attr("href")
            pageInfo = getPages(dirN, srcL)
            pageInfo.append(dirN)
            pageInfo.append(1)
            levelTop.append(pageInfo)
        else:
            # otherwise the entry holds a sub-list of categories
            lis = cds('ul li').items()
            for li in lis:
                dirN = li('a').text().replace("·", "")
                srcL = li('a').attr("href")
                pageInfo = getPages(dirN, srcL)
                pageInfo.append(dirN)
                pageInfo.append(1)
                levelTop.append(pageInfo)
    reFirst(levelTop)

# resume support: when keep.log exists, drop the categories that already
# finished and start the saved category at the saved page
def reFirst(levelTop):
    kPath = pathComm + "keep.log"
    if not os.path.exists(kPath):
        print("keep.log not found; starting from scratch")
    else:
        with open(kPath, "r") as kf:
            line = kf.readline()
        sort = line[0:line.find("|")]
        page = int(line[line.find("|") + 1:])
        print(sort)
        print(page)
        # entries are [pageMax, sort, srcU, dirN, page(now)]; deleting from
        # a list while iterating over it skips entries, so fast-forward
        # with a while loop instead
        while levelTop and sort != levelTop[0][1]:
            del levelTop[0]
        if levelTop:
            levelTop[0][4] = page
    print(levelTop)
    for pageInfo in levelTop:
        ParsingSecondThread1(pageInfo)

first()
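Because keep.log is rewritten before every page, interrupting the run is just Ctrl+C, and restarting picks up from the saved page. For a quieter exit than a raw traceback, the entry point can be wrapped like this (the wrapper is my addition, not part of the original script):

if __name__ == "__main__":
    try:
        first()
    except KeyboardInterrupt:
        # keep.log already records the last page that was started
        print("Interrupted; progress is saved in keep.log")
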
myEmail.py
# smtplib performs the actual send; the email package builds the message
import smtplib
from email.mime.text import MIMEText
from email.header import Header
# from email.mime.multipart import MIMEMultipart

def sendEmail(to_addr, msg):
    # sender account: address and password; the plain login password worked
    # in testing with 163, but QQ Mail will likely require an authorization
    # code instead (not tested)
    from_addr = 'xxxxxxxxx@163.com'
    password = 'xxxxxxxx'
    # recipient
    # to_addr = "xxxxxxx@qq.com"
    # outgoing mail server
    smtp_server = "smtp.163.com"
    # body: content, format ('plain' = plain text), encoding
    msg = MIMEText(msg, 'plain', 'utf-8')
    # message headers
    msg['From'] = Header(from_addr)
    msg['To'] = Header(to_addr)
    msg['Subject'] = Header("Crawler notification", "utf-8")
    try:
        # plain, unencrypted SMTP on port 25; SMTP() connects on
        # construction, so no separate connect() call is needed
        server = smtplib.SMTP(smtp_server, 25)
        server.login(from_addr, password)
        server.sendmail(from_addr, to_addr, msg.as_string())
        server.quit()
        print("Mail sent")
    except smtplib.SMTPException:
        print("Error: failed to send mail")
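
A quick smoke test of the helper (the addresses are placeholders, as in the original). If the provider rejects plain port-25 connections, smtplib.SMTP_SSL is the drop-in encrypted alternative, assuming the server exposes SMTPS on port 465:

import myEmail

myEmail.sendEmail("xxxxxxx@qq.com", "test message from the crawler")

# encrypted variant inside sendEmail, if needed:
# server = smtplib.SMTP_SSL(smtp_server, 465)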