Setup
- IDE: VS Code
- Python: 3.9.6, 64-bit
- GUI: wxPython + wxFormBuilder
Ahem... a little demo that scrapes web novels, strictly for learning.
The main difficulties, pitfalls and the overall process were as follows.
1. Environment setup (the commands and the install paths both differ)
macOS ships Python 2.7 as python / pip; what Homebrew installs is Python 3 as python3 / pip3 (a quick way to check which interpreter you are actually on is sketched right after this list).
2. Choosing the tech stack
I first wrote this in Python about half a year ago, but running it meant opening VS Code every time, which was a hassle, so now I wanted to package it as an app or host it as a web page. I looked around for ways to embed a Python script in a web page and found nothing convincing: the reasonable option is a web front end calling an API with the script running on a backend; the other option is rewriting the crawler in Node.js, but packaging and publishing that runs into cross-origin issues and needs nginx as a reverse proxy. (Either way it is a server problem; there is no purely client-side solution.) (T^T I regret not renting a Tencent Cloud server back when I was still a student.)
3. Environment setup, again
In the end I went with Python + wxPython (GUI) and packaged it into an app with py2app.
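A quick sanity check of which Python you are actually running (just an illustration; the paths printed depend on your own install):

# run this inside the interpreter started by `python3`
import sys
print(sys.executable)  # should point at the Homebrew install, not the system Python 2.7
print(sys.version)     # should report 3.9.x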
I hit trouble importing wx (import wx); I fixed it by following the post "MAC安装 WxPython 方法" (installing wxPython on macOS).
My commands differ from that post in the following places (mainly a matter of versions).
Commands from the reference post:
Go to Python's site-packages directory:
$cd /Library/Python/2.7/site-packages/
Create a symlink:
$sudo ln -s /usr/local/Cellar/wxpython/3.0.2.0/lib/python2.7/site-packages/wx-3.0-osx_cocoa/wx wx
My version:
> ① In a terminal:
python3
import site
site.getsitepackages()
> which gives the site-packages path: /usr/local/Cellar/python@3.9/3.9.4/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages
> ② In a terminal:
brew info wxpython
> which gives the wxPython install path: /usr/local/Cellar/wxpython/4.1.1_1
> browse that folder to find the wx package: /usr/local/Cellar/wxpython/4.1.1_1/libexec/lib/python3.9/site-packages/wx
> ③ In a terminal:
cd /usr/local/Cellar/python@3.9/3.9.4/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/
> i.e. cd into the site-packages path from step ①
sudo ln -s /usr/local/Cellar/wxpython/4.1.1_1/libexec/lib/python3.9/site-packages/wx wx
> i.e. create the symlink to the wx directory from step ②
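After that, a quick way to confirm the symlink is picked up (a sketch; the exact version string depends on your install):

import wx            # should now resolve through the symlinked package
print(wx.version())  # e.g. something like "4.1.1 osx-cocoa (phoenix) ..."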
4. There was also a binding problem with the visual GUI library: natively, wxPython cannot bind a coroutine as an event handler (fetching pages is IO-heavy, so I used async/coroutines to speed things up). So I pulled in the wxasync extension package (import wxasync), following the post "wxPython 和协程" (wxPython with coroutines). A minimal sketch of the idea follows.
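The gist of wxasync is two substitutions: wxasync.WxAsyncApp in place of wx.App, and wxasync.AsyncBind in place of Bind. A stripped-down sketch, independent of the full program below (the frame and handler names here are made up for illustration):

import asyncio
import wx
import wxasync

class DemoFrame(wx.Frame):
    def __init__(self):
        super().__init__(None, title='wxasync demo')
        btn = wx.Button(self, label='run')
        # AsyncBind accepts a coroutine as the event handler
        wxasync.AsyncBind(wx.EVT_BUTTON, self.on_click, btn)

    async def on_click(self, event):
        await asyncio.sleep(1)  # stands in for the real aiohttp requests

app = wxasync.WxAsyncApp()
DemoFrame().Show()
# the wx main loop itself runs as a coroutine on the asyncio event loop
asyncio.get_event_loop().run_until_complete(app.MainLoop())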
5. Code implementation
# -*- coding: utf-8 -*-
import wx
import json
import requests
from requests.cookies import RequestsCookieJar
import re
from bs4 import BeautifulSoup
import time
import asyncio
import aiohttp
import os
import wxasync


# What downloading a book needs: the title, the chapter list, a cookie
# several URLs ->
# one URL -> several txt files
# one txt -> several chapters
def print(content, type='false'):
    # route all "print" output into the GUI text box instead of the console
    if(type == 'true'):
        frame.getSelf().textEdit.SetValue(content)
    else:
        frame.getSelf().textEdit.AppendText('\n'+content)


# Parse the book page
async def get_txt(url):
    print('网址解析中...')
    async with aiohttp.request("GET", url) as r:
        html = await r.read()
        soup = BeautifulSoup(html, 'html5lib')
        # book title
        name = soup.find('h1').getText()
        # author
        soup.find('span', class_='item red').i.decompose()
        user = soup.find('span', class_='item red').string
        # synopsis
        jianjie = soup.find_all('meta')[8]['content']
        # output file name
        txt = name+'by'+user+'.txt'
        write_jianjie(txt, jianjie)
        # chapter URLs / titles
        menuUrls = []
        menuTitles = []
        menus = soup.find_all('li')
        for menu in menus:
            menuUrls.append(url[0:20]+menu.find('a')['href'])
            menuTitles.append(menu.string)
        print("{} 章节目录解析中...".format(txt))
        return txt, menuUrls, menuTitles


# Download one chapter
async def download(url, chapter_list, key, menuTitles, txt):
    async with aiohttp.request("GET", url) as r:
        title = menuTitles[key]
        html = await r.read()
        chapter_soup = BeautifulSoup(html, 'html5lib')
        chapter_content = chapter_soup.find(
            'div', id='htmlContent').getText('\n ', '<br/><br/>')
        chapter_content = re.sub(
            r'read.*?;', '', chapter_content, count=0, flags=0)
        chapter_list[key] = "第{}章 {}\n{}\n\n\n".format(
            key+1, title, chapter_content)
        a = len(chapter_list)/len(menuTitles)*100
        print("当前下载进度{}%".format('%.02f' % a), 'true')
        # once every chapter has arrived, write the whole file
        if len(chapter_list) == len(menuTitles):
            writefile(txt, menuTitles, chapter_list)


def write_jianjie(txt, jianjie):
    file = open(path+txt, 'w').close()
    with open(path+txt, 'a') as file:
        file.write("{}\n\n\n".format(jianjie))


def writefile(txt, menuTitles, chapter_list):
    with open(path+txt, 'a') as file:
        for key in range(0, len(menuTitles)):
            file.write(chapter_list[key])
        file.close()


async def main(url):
    tasks = []
    txt, menuUrls, menuTitles = await get_txt(url)
    chapter_list = {}
    tasks = []
    for key, menuUrl in enumerate(menuUrls):
        task = asyncio.ensure_future(
            download(menuUrl, chapter_list, key, menuTitles, txt))
        tasks.append(task)
    await asyncio.gather(*tasks)
    print("{} 下载成功!".format(txt))
    print("下载目录:{}".format(path), 'true')


# https://www.8v8v.org/book/162/162046/ 沈总总在逼氪
path = '/Users/chenting/Downloads/Compressed/'


class MyFrame1(wx.Frame):

    def getSelf(self):
        return self

    def __init__(self, parent):
        wx.Frame.__init__(self, parent, id=wx.ID_ANY,
                          title=u"小说下载-第三中文网(https://www.8v8v.org/)",
                          pos=wx.DefaultPosition, size=wx.Size(600, 400),
                          style=wx.DEFAULT_FRAME_STYLE | wx.TAB_TRAVERSAL)
        self.SetSizeHints(wx.DefaultSize, wx.DefaultSize)
        bSizer1 = wx.BoxSizer(wx.VERTICAL)
        bSizer2 = wx.BoxSizer(wx.HORIZONTAL)
        self.fileName = wx.TextCtrl(
            self, wx.ID_ANY, wx.EmptyString, wx.DefaultPosition, wx.DefaultSize, 0)
        bSizer2.Add(self.fileName, 1, wx.ALL, 5)
        self.openFile = wx.Button(
            self, wx.ID_ANY, u"解析", wx.DefaultPosition, wx.DefaultSize, 0)
        bSizer2.Add(self.openFile, 0, wx.ALL, 5)
        bSizer1.Add(bSizer2, 0, wx.EXPAND, 5)
        bSizer3 = wx.BoxSizer(wx.VERTICAL)
        self.textEdit = wx.TextCtrl(
            self, wx.ID_ANY, wx.EmptyString, wx.DefaultPosition, wx.DefaultSize, 0)
        bSizer3.Add(self.textEdit, 1, wx.ALL | wx.EXPAND, 5)
        bSizer1.Add(bSizer3, 1, wx.EXPAND, 5)
        self.SetSizer(bSizer1)
        self.Layout()
        self.Centre(wx.BOTH)
        # with plain wxPython the handler would be bound in MyFrame1.__init__() like this:
        # self.openFile.Bind(wx.EVT_BUTTON, self.open)
        # bind the coroutine callback instead
        wxasync.AsyncBind(wx.EVT_BUTTON, self.open, self.openFile)
        self.fileName.SetValue('https://www.8v8v.org/book/224/224179/')

    def __del__(self):
        pass

    async def open(self, event):
        self.textEdit.SetValue('程序运行中...')  # show progress in textEdit
        # self.textEdit.AppendText('\n123')
        await main(self.fileName.GetValue())


if __name__ == '__main__':
    # app = wx.App()  # create an application object; every wxPython program needs one
    # frame = MyFrame1(None)  # create a MyFrame1 instance
    # frame.Show()  # call Show() to actually display it on screen
    # # enter the main loop, an endless loop that catches and dispatches
    # # every event during the application's lifetime
    # app.MainLoop()
    # get the app instance
    app = wxasync.WxAsyncApp()
    frame = MyFrame1(None)
    frame.Show()
    # this step matters: do not use asyncio.run(app.MainLoop()), because that
    # always creates a brand-new event loop (it would hold only this one
    # coroutine, so there would be no switching to other tasks while it waits)
    loop = asyncio.get_event_loop()
    loop.run_until_complete(app.MainLoop())
6. The result
7. For packaging I followed "Mac系统下将python程序打包成mac应用程序" (packaging a Python program into a macOS app):
pip3 install py2app
py2applet --make-setup test.py
# Uses your own environment's dependencies, so packaging is fast.
python3 setup.py py2app -A
# For other machines that lack the dependencies; bundles the libraries.
python3 setup.py py2app
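For reference, the setup.py that py2applet generates looks roughly like this (a sketch only; the exact OPTIONS vary by version, and test.py is whatever script you pointed py2applet at):

from setuptools import setup

APP = ['test.py']      # the entry script passed to py2applet
DATA_FILES = []        # extra resources to bundle into the .app
OPTIONS = {}           # py2app options, e.g. iconfile, plist

setup(
    app=APP,
    data_files=DATA_FILES,
    options={'py2app': OPTIONS},
    setup_requires=['py2app'],
)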
8. Another script, this one without a GUI. The source site is a mobile page that splits the chapter list across several pages, so getting the full list means requesting each of those pages; it also uses asyncio.
# -*-coding:utf-8 -*-
import json
import requests
from requests.cookies import RequestsCookieJar
import re
from bs4 import BeautifulSoup
import time
import asyncio
import aiohttp
import os


# What downloading a book needs: the title, the chapter list, a cookie
# several URLs ->
# one URL -> several txt files
# one txt -> several chapters
def now(): return time.time()


# Parse the book page
async def get_txt(url):
    print('网址解析中...')
    async with aiohttp.request("GET", url) as r:
        html = await r.read()
        soup = BeautifulSoup(html, 'html5lib')
        txt = soup.find('h1').string+'by'+soup.find('strong').string+'.txt'
        print(txt, '章节目录解析中...')
        # the mobile site splits the chapter list over several <option> pages
        menuPageAll = soup.find_all('option')
        menuUrls = []
        menuTitles = []
        tasks_menus = []
        for menuPage in menuPageAll:
            menuPageUrl = url[0:-8]+menuPage['value']
            menuPageTitle = menuPage.string
            # _menuUrls, _menuTitles = await get_menus(menuPageUrl, url)
            # menuUrls.extend(_menuUrls)
            # menuTitles.extend(_menuTitles)
            task = asyncio.ensure_future(
                get_menus(menuPageUrl, url, menuPageTitle))
            tasks_menus.append(task)
        resList = await asyncio.gather(*tasks_menus)
        for res in resList:
            menuUrls.extend(res[0])
            menuTitles.extend(res[1])
        print(txt, '章节目录解析完毕!')
        jianjie = soup.find_all('p', class_='p2')[1].string
        file = open(path+txt, 'w').close()
        with open(path+txt, 'a') as file:
            file.write("{}\n\n\n".format(jianjie))
        return txt, menuUrls, menuTitles


# Parse one page of the chapter list
async def get_menus(menuPageUrl, url, menuPageTitle):
    async with aiohttp.request("GET", menuPageUrl) as r:
        html = await r.read()
        soup = BeautifulSoup(html, 'html5lib')
        menuUrls = []
        menuTitles = []
        menus = soup.find_all('ul', class_="p2")[1].find_all('li')
        for menu in menus:
            me = menu.find('a')
            if(me != None):
                menuUrls.append(url[0:-8]+me['href'])
                menuTitles.append(me.string)
        print(menuPageTitle, '目录解析成功!')
        return menuUrls, menuTitles


# Download one chapter
async def download(url, chapter_list, key, menuTitles, txt):
    async with aiohttp.request("GET", url) as r:
        title = menuTitles[key]
        html = await r.read()
        chapter_soup = BeautifulSoup(html, 'html5lib')
        chapter_content = chapter_soup.find(
            'div', id='novelcontent').getText('\n ', '<p></p><br/><br/>')
        chapter_list[key] = "{}\n{}\n\n\n".format(title, chapter_content)
        print(time.strftime('%H:%M:%S ', time.localtime(
            time.time())), txt, title, '下载成功!')
        if len(chapter_list) == len(menuTitles):
            writefile(txt, menuTitles, chapter_list)


# Write the chapters to the output file
def writefile(txt, menuTitles, chapter_list):
    with open(path+txt, 'a') as file:
        for key in range(0, len(menuTitles)):
            file.write(chapter_list[key])
        file.close()


# Quick test of a single chapter
async def download_test(url):
    async with aiohttp.request("GET", url) as r:
        html = await r.read()
        chapter_soup = BeautifulSoup(html, 'html5lib')
        chapter_content = chapter_soup.find(
            'div', id='novelcontent').getText('\n ', '<p></p><br/><br/>')
        title = chapter_soup.find('h1').string
        test = "{}\n{}\n\n\n".format(title, chapter_content)
        print(test)


async def main():
    # await download_test('https://m.dawangwen.com/0/8076/1554739.html')
    tasks = []
    for url in urls:
        txt, menuUrls, menuTitles = await get_txt(url)
        chapter_list = {}
        tasks = []
        for key, menuUrl in enumerate(menuUrls):
            task = asyncio.ensure_future(
                download(menuUrl, chapter_list, key, menuTitles, txt))
            tasks.append(task)
        await asyncio.gather(*tasks)
        print(txt, '下载成功!')


urls = ['https://m.dawangwen.com/0/8052/', 'https://m.dawangwen.com/0/8216/',
        'https://m.dawangwen.com/0/7062/', 'https://m.dawangwen.com/0/7483/']
# https://m.dawangwen.com/0/8216/ 沙雕攻他重生了
# https://m.dawangwen.com/0/8052/ 你打算萌死我吗
# https://m.dawangwen.com/0/7062/ 拜拜
# https://m.dawangwen.com/0/7483/ 我终于抢救了他们的脑子
path = './小说txt/'
start = now()
# run the coroutines on a fresh asyncio event loop
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
loop.run_until_complete(main())
loop.close()
end = now()
print("本次运行总耗时:{}秒".format(round(end - start, 2)))
9. Neither of these two simple sites has any anti-scraping measures. A few sites I had scraped successfully before later added them; I'll dig into that when I have time, since it basically comes down to two things: a proxy IP pool and JS encryption/decryption. Both sites here return the HTML directly; I still prefer the kind of site where the chapter list and URLs come back from a single API call, so there is nothing to parse on the page, just request and response parameters to look at.
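For the proxy-pool half, the basic idea is just to rotate the exit IP per request. A minimal sketch, assuming a hypothetical list of proxy addresses (aiohttp takes the proxy through the proxy= argument):

import asyncio
import random
import aiohttp

# hypothetical pool; in practice these would come from a proxy provider
PROXIES = ['http://127.0.0.1:7890', 'http://127.0.0.1:7891']

async def fetch(url):
    proxy = random.choice(PROXIES)  # different exit IP per request
    async with aiohttp.ClientSession() as session:
        async with session.get(url, proxy=proxy) as r:
            return await r.read()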
10. For a better experience it would be worth splitting this into a proper front end and back end: searching and switching sources, choosing the download path, downloading only selected chapters, custom file names, or entering an email address and having the file sent straight to the inbox.