爬虫主要是bs4 beautifulsoup库 和 urllib 库
ui 是 pyqt5库
如果希望通过ui实时反馈爬取进度,必须用多线程,否则ui会假死
先说爬虫
主要是通过biqukan这个网站爬取
逻辑是:
1.先获得小说章节列表,形成一个以章节名为key、value暂时为空列表的字典,以及一个各章节网址的列表
2.通过每个章节循环获得章节内容,更新到字典中去
3.把字典输出到txt.
获取章节列表以及字典代码如下:
def getlist(url, xsname):
    """Fetch a novel's index page and collect its chapter links and titles.

    Args:
        url: Address of the novel's table-of-contents page.
        xsname: Novel title. Chapters are collected only once the heading
            "《<xsname>》正文卷" has been seen in the listing.

    Returns:
        A tuple ``(k, txt)``. ``k`` is the list of chapter ``href`` values in
        page order; ``txt`` maps each chapter title to an empty list that is
        meant to be replaced by the chapter text later. Both are empty when
        the chapter listing cannot be located on the page.
    """
    print('开始读取章节列表')
    k = []
    txt = {}
    head = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'www.biqukan.com',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'If-None-Match': "1518171634"
            }
    req = request.Request(url, headers=head)
    # Use a context manager so the HTTP response is always closed.
    with request.urlopen(req) as response:
        page = response.read().decode('gbk', 'ignore')
    soup = BeautifulSoup(page, 'lxml')

    # find() yields None when the element is absent; treat a missing listing
    # as "no chapters" instead of crashing on attribute access.
    txtlist = soup.find('div', attrs={'class': 'listmain'})
    listing = txtlist.dl if txtlist is not None else None
    children = listing.children if listing is not None else ()

    marker = "《" + xsname + "》正文卷"
    begin_flag = False
    for child in children:
        if child == '\n':
            continue
        if child.string == marker:
            begin_flag = True
        # Bare text nodes have no ``.a`` attribute; getattr avoids the
        # AttributeError they would otherwise raise.
        link = getattr(child, 'a', None)
        if begin_flag and link is not None:
            k.append(link.get('href'))
            txt[str(link.getText())] = []

    if k:
        print('章节数读取完毕! 共有: %s 章' % len(k))
    else:
        print('章节数读取失败')
    return k, txt
然后获取各章内容如下:
def gettxt(k, txt):
    """Download one chapter and store its cleaned text in ``txt``.

    Args:
        k: Chapter ``href`` (site-relative path) appended to the site root.
        txt: Dict mapping chapter titles to chapter text; updated in place.

    Returns:
        The same ``txt`` dict. It gains or overwrites the entry for this
        chapter only when both a title and non-empty text were extracted.
    """
    head = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'www.biqukan.com',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'If-None-Match': "1518171634"
            }
    html = 'http://www.biqukan.com' + k
    req = request.Request(html, headers=head)
    # Use a context manager so the HTTP response is always closed.
    with request.urlopen(req) as response:
        page = response.read().decode('gbk', 'ignore')
    soup = BeautifulSoup(page, 'lxml')

    # A missing content div means "no text" rather than a crash on None.
    body = soup.find('div', attrs={'id': 'content', 'class': 'showtxt'})
    text1 = body.getText() if body is not None else ''
    text1 = text1.replace('\xa0', ' ')
    text1 = text1.replace('请记住本书首发域名:www.biqukan.com。笔趣阁手机版阅读网址:m.biqukan.com', '')
    # The page embeds its own URL in the text; strip it.
    text1 = text1.replace(html, '')
    # Collapse all whitespace runs into single line breaks.
    text1 = '\n'.join(text1.split())

    header = soup.find('div', attrs={'class': 'content'})
    heading = header.h1 if header is not None else None
    textname = heading.string if heading is not None else None

    # The original compared list(...) against 0, which is always unequal, so
    # the failure branch could never run. Test emptiness directly instead.
    if text1 and textname:
        txt[str(textname)] = text1
        print('[爬取成功] %-5s' % str(textname))
    else:
        print('[爬取失败] %-5s' % str(textname))
    return txt
输出txt
def putouttxt(txt, xsname):
    """Append every chapter in ``txt`` to ``<xsname>.txt`` beside this script.

    Args:
        txt: Mapping of chapter title to chapter text, written in iteration
            order as title, blank line, text, blank line.
        xsname: Novel name, used as the output file's base name.
    """
    script_dir = os.path.dirname(os.path.realpath(__file__))
    # os.path.join picks the platform's separator; a literal backslash only
    # works on Windows.
    target = os.path.join(script_dir, xsname + '.txt')
    # 'a' keeps the original append semantics; ``with`` closes the file even
    # if a write raises.
    with open(target, 'a', encoding='utf-8') as outputfile:
        for key, value in txt.items():
            outputfile.write(str(key) + '\n\n')
            outputfile.write(str(value) + '\n\n')
主程序
def main():
    """Run the console workflow: prompt, crawl every chapter, write the TXT.

    Asks for the novel name and its index URL, collects the chapter list,
    downloads each chapter in order, writes the result to a text file and
    finally waits for one more line of input before returning.
    """
    divider = '------------------------------------------------------------------'

    for line in (divider,
                 '!!只能从www.biqukan.com爬取小说!!',
                 divider,
                 '请在biqukan找到想看的小说,并在下方输入小说名称,例如:天道图书馆'):
        print(line)
    xsname = input('小说名称:')
    print('请在下方输入小说主页(目录页),例如:http://www.biqukan.com/17_17957/')
    url = input('小说主页:')

    print(divider)
    print('爬虫开始干活了!')
    print(divider)
    chapter_links, txt = getlist(url, xsname)

    print(divider)
    print('开始爬取各章节内容')
    for href in chapter_links:
        txt = gettxt(href, txt)
    print('各章节内容爬取完毕')

    print(divider)
    print('开始生成TXT文件')
    putouttxt(txt, xsname)
    print('TXT文件生成完毕')

    print(divider)
    print('工作结束,好好看书吧!')
    # Block on one more line of input so the console window stays open.
    input(' ')
接下来说ui.
上面的其实已经可以用了..只是输入界面是在命令提示符.用搜狗输入法不太方便.所以想着弄个ui.
这里要用到pyqt5这个库,其中包含界面的,包含多线程的.都可以直接用.
主体窗口代码如下:
class Main(QtWidgets.QWidget):
    """Main window: novel name/URL inputs, a start button, a progress bar
    and a text pane for messages coming from the crawler thread."""

    def __init__(self, parent=None):
        super(Main, self).__init__(parent)
        self.setui()

    def setui(self):
        """Create the widgets, place them on a grid and wire the button."""
        title = QtWidgets.QLabel('小说名称')
        url = QtWidgets.QLabel('小说主页')
        output = QtWidgets.QLabel('信息输出窗口')
        time1 = QtWidgets.QLabel('进度')
        self.timebar = QtWidgets.QProgressBar()
        self.titleEdit = QtWidgets.QLineEdit()
        self.urlEdit = QtWidgets.QLineEdit()
        self.outputinfo = QtWidgets.QTextBrowser()
        qb = QtWidgets.QPushButton('gogogo')
        qb.resize(qb.sizeHint())

        grid = QtWidgets.QGridLayout()
        grid.setSpacing(15)
        grid.addWidget(title, 1, 0)
        grid.addWidget(self.titleEdit, 1, 1)
        grid.addWidget(url, 2, 0)
        grid.addWidget(self.urlEdit, 2, 1)
        grid.addWidget(time1, 3, 0)
        grid.addWidget(self.timebar, 3, 1, 1, 3)
        grid.addWidget(output, 4, 0)
        grid.addWidget(self.outputinfo, 4, 1, 5, 3)
        # The button spans the two input rows on the right-hand side.
        grid.addWidget(qb, 1, 3, 2, 1)

        self.setLayout(grid)
        self.setGeometry(300, 300, 720, 540)
        self.setWindowTitle('对对牌小说读取器')
        qb.clicked.connect(self.start_threads)

    def start_threads(self):
        """Start a crawler thread for the current inputs and hook its signals."""
        xsname = str(self.titleEdit.text())
        url = str(self.urlEdit.text())
        # Parenting the thread to this widget keeps a reference to it after
        # this method returns.
        thread = MyThread(self)
        thread.setup(xsname, url)
        thread.trigger.connect(self.update_text)
        thread.i_trigger.connect(self.update_progress)
        thread.start()

    def update_text(self, msg):
        """Append a message from the crawler thread to the output pane."""
        self.outputinfo.append(msg)

    def update_progress(self, value):
        """Show ``value`` on the progress bar.

        The signal carries a float while QProgressBar.setValue() expects an
        int, so convert explicitly instead of relying on implicit coercion.
        """
        self.timebar.setValue(int(value))

    def timerEvent(self, i):
        """Backward-compatible progress slot.

        This name is also Qt's virtual timer-event handler, so numeric values
        are treated as progress updates and anything else (a real timer
        event) is passed to the base class.
        """
        if isinstance(i, (int, float)):
            self.update_progress(i)
        else:
            super(Main, self).timerEvent(i)
第一个方法 setui 是主体窗口的各种设置
第二个方法 start_threads 用来开启爬虫线程.爬虫不能放在ui的主线程里跑,否则会阻塞导致ui假死
后面的 update_text 和 timerEvent 是主线程接收爬虫线程信号的槽函数,也就是主线程和爬虫线程之间的信号通道
然后是爬虫线程:
class MyThread(QtCore.QThread):
    """Worker thread meant to run the crawler away from the UI thread.

    Declares two signals that the main window connects to its own slots.
    """

    # Carries a text message; the main window connects it to its
    # message-output slot.
    trigger = QtCore.pyqtSignal(str)
    # Carries a float; the main window connects it to its progress-bar slot.
    i_trigger = QtCore.pyqtSignal(float)

    def __init__(self, parent=None):
        super(MyThread, self).__init__(parent)

    def setup(self, xsname, url):
        """Store the novel name and index URL on the thread object."""
        # Presumably read later by run(); the run body is not shown here.
        self.xsname = xsname
        self.url = url

    def run(self):
        # NOTE(review): the two lines below are the article's placeholder
        # prose ("crawler code goes here" / "remember to emit signals back to
        # the main thread"), not executable code. The real body is not shown.
        爬虫代码!!
        注意要反馈信号到主线程
这里有两点需要注意
1.主程序传参数到爬虫线程建立了一个单独的方法 setup.
不知道为什么如果不这样做,直接传参数到爬虫的方法会报错.
2.两个类的 __init__ 里都必须带上 parent=None 这个参数,如果没有的话会报错..不知道为什么.按道理默认应该是none
这两个错都是很奇怪的..百度说是什么显卡问题.有说空指针问题的..
错误如下:
其他就没啥了….