用python做一个爬取百度百科的应用软件

用python做一个爬取百度百科的应用软件

1.原理:
(1)首先写一个爬取百度百科的脚本
(2)利用pyqt5做一个GUI来展现爬取的内容

2.效果展示:
在这里插入图片描述
在这里插入图片描述
3.代码展示:
爬取百度百科:

import requests
from pyquery import PyQuery as pq
import pymysql
import re

class Baike_baidu:
    def __init__(self,key):
        url1="https://baike.baidu.com/item/"
        self.url = url1+key

    #获取网页的html
    def html(self):
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36'
        }
        req=requests.get(self.url,headers=headers)
        html=req.content
        html_doc=str(html,'utf-8')
        return html_doc

    #抓取百度百科概述
    def search_news_Summary(self):
        content=pq(self.html())
        result=content(".lemma-summary .para").text()
        result2 = result.replace('\xa0','')
        result3 = result2.replace(' ', '')
        return result3

    #抓取表格字条
    def search_news_form(self):
        content=pq(self.html())
        result1=content('.basic-info')
        result2=result1('.basicInfo-block').text()
        result3 = result2.replace('\n',':')
        result4 = result3.replace('\xa0','')
        return result4

    #抓取标题名称
    def search_news_title(self):
        search_news_str='<h2 class="title-text"><span class="title-prefix">.*?</span>(.*?)</h2>'
        content = pq(self.html())
        title1 = content('.para-title')
        title2 = title1('.title-text')
        str_title2 = str(title2)
        search = re.compile(search_news_str)
        result = search.findall(str_title2)
        return result

    #抓取内容:
    def search_news_content(self):
        search_news_str = '<div class="para" label-module="para">(.*)</div>'
        search_news_str1 = '<[^>]*>'
        content = pq(self.html())
        search = re.compile(search_news_str)
        str_content = str(content)
        result = search.findall(str_content)
        result_clear = result.remove(result[0])
        result_str = "".join(result)
        search2 = re.compile(search_news_str1)
        result_str2 = search2.sub('',result_str)
        return result_str2

GUI实现:(用的是PyQt5)

from PyQt5 import QtCore, QtGui, QtWidgets
from PyQt.baike.baike_baidu import Baike_baidu as bk

class Ui_Dialog(object):
    def setupUi(self, Dialog):
        Dialog.setObjectName("Dialog")
        Dialog.resize(801, 574)
        self.label = QtWidgets.QLabel(Dialog)
        self.label.setGeometry(QtCore.QRect(20, 20, 91, 16))
        self.label.setObjectName("label")
        self.groupBox = QtWidgets.QGroupBox(Dialog)
        self.groupBox.setGeometry(QtCore.QRect(10, 360, 781, 211))
        self.groupBox.setObjectName("groupBox")
        self.textBrowser_3 = QtWidgets.QTextBrowser(self.groupBox)
        self.textBrowser_3.setGeometry(QtCore.QRect(10, 21, 761, 181))
        self.textBrowser_3.setObjectName("textBrowser_3")
        self.groupBox_2 = QtWidgets.QGroupBox(Dialog)
        self.groupBox_2.setGeometry(QtCore.QRect(10, 200, 781, 161))
        self.groupBox_2.setObjectName("groupBox_2")
        self.textBrowser_2 = QtWidgets.QTextBrowser(self.groupBox_2)
        self.textBrowser_2.setGeometry(QtCore.QRect(10, 20, 761, 131))
        self.textBrowser_2.setObjectName("textBrowser_2")
        self.groupBox_3 = QtWidgets.QGroupBox(Dialog)
        self.groupBox_3.setGeometry(QtCore.QRect(9, 60, 781, 131))
        self.groupBox_3.setObjectName("groupBox_3")
        self.textBrowser = QtWidgets.QTextBrowser(self.groupBox_3)
        self.textBrowser.setGeometry(QtCore.QRect(10, 20, 761, 101))
        self.textBrowser.setObjectName("textBrowser")
        self.pushButton = QtWidgets.QPushButton(Dialog)
        self.pushButton.setGeometry(QtCore.QRect(550, 10, 93, 41))
        self.pushButton.setObjectName("pushButton")
        self.pushButton2 = QtWidgets.QPushButton(Dialog)
        self.pushButton2.setGeometry(QtCore.QRect(680, 10, 93, 41))
        self.pushButton2.setObjectName("pushButton")
        self.lineEdit = QtWidgets.QLineEdit(Dialog)
        self.lineEdit.setGeometry(QtCore.QRect(110, 10, 400, 41))
        self.lineEdit.setObjectName("lineEdit")

        self.retranslateUi(Dialog)
        self.pushButton.clicked.connect(self.btn_click)
        self.pushButton2.clicked.connect(self.btn_clear)
        QtCore.QMetaObject.connectSlotsByName(Dialog)

    def retranslateUi(self, Dialog):
        _translate = QtCore.QCoreApplication.translate
        Dialog.setWindowTitle(_translate("Dialog", "百度百科"))
        self.label.setText(_translate("Dialog", "输入关键字:"))
        self.groupBox.setTitle(_translate("Dialog", "Content"))
        self.groupBox_2.setTitle(_translate("Dialog", "Form"))
        self.groupBox_3.setTitle(_translate("Dialog", "Summary"))
        self.pushButton.setText(_translate("Dialog", "search"))
        self.pushButton2.setText(_translate("Dialog", "clear"))

    def btn_click(self):
        key=self.lineEdit.text()
        summary=bk(key).search_news_Summary()
        form=bk(key).search_news_form()
        content=bk(key).search_news_content()
        self.textBrowser.insertPlainText(summary)
        self.textBrowser_2.insertPlainText(form)
        self.textBrowser_3.insertPlainText(content)

    def btn_clear(self):
        self.textBrowser.clear()
        self.textBrowser_2.clear()
        self.textBrowser_3.clear()



if __name__ == "__main__":
    import sys
    app = QtWidgets.QApplication(sys.argv)
    widget = QtWidgets.QWidget()
    ui = Ui_Dialog()
    ui.setupUi(widget)
    widget.show()
    sys.exit(app.exec_())

4 .作品不足之处:
初学爬虫,对于爬取信息的过滤和修整能力有限,以及还没学习如何抓取和展示图片,因此,抓取的信息可能会过于冗长和不能完全还原百度百科。

  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值