校园网新闻搜索引擎

最新推荐文章于 2024-07-18 20:04:15 发布
远古时代蛋黄派
最新推荐文章于 2024-07-18 20:04:15 发布
阅读量423
点赞数 1
文章标签： mysql 搜索引擎 python 数据分析 tf-idf
本文链接：https://blog.csdn.net/qq_20543169/article/details/115253477
版权
在这里插入图片描述
import requests
from bs4 import BeautifulSoup
import re
from mysql.connector import MySQLConnection
import jieba
import math
import logging
import ast  # 字符串转列表会用到
from tqdm import tqdm, trange  # 进度条

jieba.setLogLevel(logging.INFO)  # Hide jieba's log output


# 爬虫部分
def getHtmlPage(url, timeout=10):
    """Download ``url`` and return the decoded page text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block forever on an unresponsive host.

    Returns:
        The page text decoded with the encoding detected from the body, or
        an empty string when the request fails (network error, timeout,
        invalid URL or a non-2xx status).
    """
    try:
        # Named `response` rather than `re` so the `re` module is not shadowed.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        # Trust the encoding detected from the content over the HTTP header.
        response.encoding = response.apparent_encoding
        return response.text
    except requests.RequestException:
        # Best effort: callers treat '' as "nothing to parse".
        return ''


def getNewsUrl(html):
    """Extract the full URL of every news article linked from a listing page.

    Args:
        html: HTML text of a news listing page.

    Returns:
        A list of absolute news URLs, one per ``<a>`` tag whose ``href``
        contains 32 characters followed by ``.htm``, in document order.
    """
    frontSite = r'http://news.tust.edu.cn/kdxw/'  # URL prefix of news pages
    # 32 arbitrary characters followed by a literal ".htm". The dot is escaped
    # so that it no longer matches any character.
    suffixPattern = re.compile(r'.{32}(\.htm)')

    soup = BeautifulSoup(html, 'html.parser')
    urlList = []
    for tag in soup.find_all(name='a'):
        href = tag.attrs.get("href")
        if not isinstance(href, str):
            continue  # anchor without a usable href
        match = suffixPattern.search(href)
        # Check the match before using it instead of relying on an exception.
        if match:
            urlList.append(frontSite + match.group(0))
    return urlList


def getNewsInfo(html, id, site, cursor88, dbc88):
    """Parse one news page and store it in the ``urlAndContent`` table.

    Args:
        html: HTML text of the news page.
        id: Value for the ``ID`` column (the name shadows the builtin but is
            kept because callers pass it by keyword).
        site: URL of the news page, stored in the ``site`` column.
        cursor88: Open database cursor used for the INSERT.
        dbc88: Database connection the cursor belongs to; used to commit.

    Pages from which no body text can be extracted are skipped. Database
    errors are logged and do not interrupt the crawl.
    """
    soup = BeautifulSoup(html, 'html.parser')

    # The title sits in a centred <p>; fall back to '' so that a page without
    # one no longer raises NameError when the row is built.
    titleTag = soup.find('p', align="center")
    title = titleTag.text if titleTag is not None else ''

    # Body paragraphs are <p style="text-indent:2em">. `.text` is used rather
    # than `.string` so that text inside child nodes is included.
    paragraphs = soup.find_all('p', style="text-indent:2em")
    contentStr = ''.join(str(p.text) for p in paragraphs if p.text is not None)

    if not contentStr:
        # Some pages have a different structure and yield nothing; skip them.
        return

    try:
        # Parameterized query: quotes inside the scraped text can no longer
        # break (or inject into) the statement.
        cursor88.execute(
            "INSERT INTO urlAndContent(ID,site,title,content) VALUES(%s,%s,%s,%s)",
            (id, site, title, contentStr))
        dbc88.commit()
    except Exception as exc:  # best effort: one bad page must not stop the crawl
        logging.warning("Could not store news page %s: %s", site, exc)


def _crawlListingPage(listingUrl, nextId, cursor88, dbc88):
    """Store every news article linked from one listing page; return the next free ID."""
    for newsUrl in getNewsUrl(getHtmlPage(listingUrl)):
        getNewsInfo(html=getHtmlPage(newsUrl), id=nextId, site=newsUrl,
                    cursor88=cursor88, dbc88=dbc88)
        nextId += 1
    return nextId


def spiderAndSaveModel(cursor88, dbc88):
    """Crawl the news site and save every article through ``getNewsInfo``.

    The front page has no page suffix and is crawled first; after that the
    pages ``index1.html`` .. ``index30.html`` are crawled with a progress bar.
    IDs are handed out sequentially starting at 1, one per discovered link.

    Args:
        cursor88: Open database cursor forwarded to ``getNewsInfo``.
        dbc88: Database connection forwarded to ``getNewsInfo``.
    """
    print("正在准备开始爬虫")
    url = 'http://news.tust.edu.cn/kdxw/'
    # Crawl 30 pages by default; per the original author the page structure
    # changes after roughly page 30 (about 713 news items in total).
    pageNum = 30
    newsId = 1  # next value for the database ID column

    # The first listing page has no page-number suffix.
    newsId = _crawlListingPage(url, newsId, cursor88, dbc88)

    for page in trange(1, int(pageNum) + 1,
                       bar_format='爬虫进度:{percentage:3.0f}%|{bar}|{n}/{total}[{remaining}]'):
        # Full listing URL = prefix + 'index<N>.html'.
        listingUrl = url + 'index' + str(page) + '.html'
        newsId = _crawlListingPage(listingUrl, newsId, cursor88, dbc88)


# 分词部分
def getAllInOne():
    """Return the text of all stored news joined into one punctuation-free string.

    Reads the ``content`` column of every row in ``urlAndContent``,
    concatenates the values and strips the punctuation characters listed
    below, so the result can be handed to the word segmenter.

    Returns:
        The concatenated news text without punctuation.
    """
    # NOTE(review): database credentials are hard-coded throughout this file.
    dbc = MySQLConnection(user='root', passwd='123456', db='website')
    try:
        cursor = dbc.cursor()
        try:
            # Same spelling as the CREATE/INSERT statements: table names are
            # case-sensitive on some MySQL installations.
            cursor.execute("SELECT content FROM urlAndContent")
            allContentList = [row[0] for row in cursor.fetchall()
                              if row[0] is not None]
        finally:
            cursor.close()
    finally:
        dbc.close()  # the original never released the connection

    # Characters to strip (ASCII and Chinese punctuation, plus the space).
    punctuation = r""" !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~“”？，！【】（）、。：；’‘……￥·〔〕《》"""  # punctuation set
    punc_table = str.maketrans('', '', punctuation)

    return ''.join(allContentList).translate(punc_table)


def splitAndSave(str):
    """Segment the text into words and store the distinct words in ``spliter``.

    Args:
        str: The text to segment (the name shadows the builtin but is kept
            for interface compatibility).

    Each distinct word is inserted with a sequential ID starting at 1. A word
    that cannot be inserted is logged and skipped; its ID is not reused.
    """
    print('-' * 75)
    print('已完成爬虫,正在准备获取分词')

    # A set removes duplicate words produced by the segmenter.
    words = set(jieba.cut_for_search(str))

    sql1 = """CREATE TABLE If Not Exists spliter (
       ID  INT NOT NULL,
       split TEXT

    )"""
    insertSql = "INSERT INTO spliter(ID,split) VALUES(%s,%s)"

    dbc = MySQLConnection(user="root", passwd="123456", db="website")
    try:
        cursor = dbc.cursor()
        try:
            cursor.execute(sql1)
            progress = tqdm(words, bar_format='获取词语进度:{percentage:3.0f}%|{bar}|{n}/{total}[{remaining}]')
            for idCounter, word in enumerate(progress, start=1):
                try:
                    # Parameterized so words containing quotes or backslashes
                    # are stored instead of breaking the statement.
                    cursor.execute(insertSql, (idCounter, word))
                    dbc.commit()
                except Exception as exc:  # best effort: skip the bad word
                    dbc.rollback()
                    logging.warning("Could not store word %r: %s", word, exc)
        finally:
            cursor.close()
    finally:
        dbc.close()


# 计算相关性,建立索引,进行排名,正式搜索部分
def setDaoPaiList():
    """Build the inverted index: one TF-IDF score map per word, saved in ``score``.

    For every word in ``spliter`` the number of occurrences in each news
    article is counted (TF), the inverse document frequency
    ``log(N / df)`` is computed, and the mapping
    ``{news ID: round(TF * IDF, 2)}`` is stored as text in the ``score``
    table. Words that occur in no article are not stored.

    The news and the words are read from the database once and kept in
    memory. The document count ``N`` and the number of words are taken from
    the data rather than hard-coded.
    """
    print('-' * 75)
    print('分词已经全部获取,准备计算每个词语对应的新闻得分情况')

    sql9 = """CREATE TABLE If Not Exists score (
          ID  INT NOT NULL,
          split TEXT,
          score TEXT
       )"""
    insertSql = "INSERT INTO score(ID,split,score) VALUES(%s,%s,%s)"

    dbc = MySQLConnection(user="root", passwd="123456", db="website")
    try:
        cursor = dbc.cursor()
        try:
            cursor.execute(sql9)

            # Load all news once instead of re-querying the table per word.
            cursor.execute("SELECT ID, content FROM urlAndContent")
            news = [(newsID, content or '') for newsID, content in cursor.fetchall()]

            cursor.execute("SELECT split FROM spliter")
            words = [row[0] for row in cursor.fetchall()]

            # N = number of distinct documents, consistent with df below,
            # which also counts distinct news IDs.
            n = len({newsID for newsID, _ in news})

            temp = 1  # ID of the next row written to the score table
            for word in tqdm(words, bar_format='获取得分进度:{percentage:3.0f}%|{bar}|{n}/{total}[{remaining}]'):
                if not word:
                    continue  # ''.count('') would "match" every document

                # tf = {news ID: number of occurrences of the word}
                tf = {}
                for newsID, content in news:
                    counter = content.count(word)
                    if counter > 0:
                        tf[newsID] = counter
                if not tf:
                    continue  # the word occurs nowhere; nothing to store

                # idf = log(N / df), df = number of documents containing the word
                idf = math.log(n / len(tf))

                # score = {news ID: TF-IDF rounded to two decimals}; zero
                # scores are left out.
                score = {}
                for newsID, TF in tf.items():
                    nowScore = float(format(TF * idf, '.2f'))
                    if nowScore > 0:
                        score[newsID] = nowScore

                # The dict is stored as its text form; searchAndRank parses
                # it back with ast.literal_eval.
                cursor.execute(insertSql, (temp, word, str(score)))
                dbc.commit()
                temp += 1
        finally:
            cursor.close()
    finally:
        dbc.close()

def searchAndRank(searchStr):
    """Rank news articles for a query.

    The query is segmented into tokens. For every token the stored score map
    is looked up in the ``score`` table and the per-article scores of all
    tokens are added up.

    Args:
        searchStr: The text to search for.

    Returns:
        A list of news IDs ordered from the highest to the lowest total
        score. The list is empty when nothing matches or the query is empty.
    """
    totals = {}  # news ID -> summed score over all query tokens

    dbc = MySQLConnection(user="root", passwd="123456", db="website")
    try:
        cur = dbc.cursor()
        try:
            for oneStr in jieba.cut_for_search(searchStr):
                # Let the database find the token instead of scanning the
                # whole table in Python.
                cur.execute("SELECT split, score FROM score WHERE split = %s", (oneStr,))
                for split, scoreText in cur.fetchall():
                    # SQL string comparison may be looser than Python's
                    # (case / trailing spaces); keep only exact matches.
                    if split != oneStr:
                        continue
                    try:
                        nowDic = ast.literal_eval(scoreText)  # text -> dict
                    except (ValueError, SyntaxError, TypeError):
                        continue  # unreadable score entry; ignore it
                    for newsID, value in nowDic.items():
                        totals[newsID] = totals.get(newsID, 0) + value
        finally:
            cur.close()
    finally:
        dbc.close()

    # Sort by total score, highest first.
    ranked = sorted(totals.items(), key=lambda x: x[1], reverse=True)
    return [newsID for newsID, _ in ranked]


def getFindUrl(idList):
    """Print the running number, title and link of each news ID in ``idList``.

    Args:
        idList: News IDs to display, in display order.

    The running number comes from the module-level iterator ``it`` that
    ``main`` creates, so numbering continues across result pages. IDs that
    have no row in ``urlAndContent`` are skipped.
    """
    print('\n\n以下为搜索结果')
    print('-' * 75)

    if not idList:
        print("未查询到")
        return

    dbc = MySQLConnection(user="root", passwd="123456", db="website")
    try:
        cur = dbc.cursor()
        try:
            for g in idList:
                cur.execute("SELECT site, title FROM urlAndContent WHERE ID = %s", (g,))
                # fetchall() drains the result set so the cursor can be
                # reused even if an ID occurs more than once.
                rows = cur.fetchall()
                if not rows:
                    continue  # the original crashed on a missing row
                site, title = rows[0]
                print('第%d条新闻' % next(it))  # running result number
                print('新闻标题:%s' % title)
                print('新闻链接:', site)
                print('-' * 75)
        finally:
            cur.close()
    finally:
        dbc.close()


# 正式搜索代码

def search():
    """Run one interactive search session on the console.

    Asks for a query, ranks the news with ``searchAndRank`` and prints the
    hits ten at a time through ``getFindUrl``. After each page the user
    enters 1 for the next page or 0 to quit.

    Returns:
        ``False`` when the query has no hits, otherwise ``None``.
    """
    prompt = "\n\n输入1获取下一页新闻内容,输入0退出程序\n"

    def askChoice():
        # Keep asking until the answer is exactly '1' or '0'.
        answer = input(prompt)
        while answer not in ('1', '0'):
            print("error:请输入正确的数字")
            answer = input(prompt)
        return answer

    print('\n', '-' * 75)
    print('\n欢迎使用科大新闻网中文搜索引擎\n请问您要搜索什么?')
    query = input()
    rankedIds = searchAndRank(query)  # news IDs, best match first

    hitCount = len(rankedIds)
    pagesNeeded = hitCount / 10  # may be fractional; only compared below

    print('-' * 18)
    print('共搜索到%3d条相关新闻' % hitCount)
    print('-' * 18)

    if hitCount == 0:  # nothing found: stop here
        return False

    choice = askChoice()
    pagesShown = 1
    while int(choice):
        # Show the next (up to) ten hits, then drop them from the queue.
        getFindUrl(rankedIds[:10])
        del rankedIds[:10]

        if pagesShown >= pagesNeeded:
            print("已经打印全部新闻")
            break

        choice = askChoice()
        pagesShown += 1

    print("欢迎下次使用,再见!")


# 主函数
def main():
    """Entry point: run the interactive search against the stored index.

    The crawl / segmentation / scoring steps are disabled below because they
    only need to run once to fill the database; uncomment them for a first
    run. The database connection is always released when the function ends.
    """
    import itertools  # local import: only needed for the result counter

    global it

    dbc88 = MySQLConnection(user='root', passwd='123456', db='website')
    try:
        cursor88 = dbc88.cursor()
        try:
            sql10 = """CREATE TABLE If Not Exists urlAndContent (
       ID  INT NOT NULL,
       site  TEXT,
       title text,
       content TEXT
    )"""

            # One-time index build (disabled):
            #cursor88.execute(sql10)
            #spiderAndSaveModel(cursor88, dbc88)  # crawl and store the news
            #splitAndSave(getAllInOne())  # segment and store the words
            #setDaoPaiList()  # compute and store the scores

            # Running number for printed results, read by getFindUrl.
            # Unbounded, so result sets of any size can be numbered.
            it = itertools.count(1)
            search()
        finally:
            cursor88.close()
    finally:
        dbc88.close()


try:
    main()
except Exception:
    # Still best effort (no crash), but report the failure with its traceback
    # instead of discarding it; Ctrl-C and SystemExit now propagate normally.
    logging.exception("search engine terminated with an error")
远古时代蛋黄派
关注
1
点赞
踩
2

收藏

觉得还不错? 一键收藏
2
评论
校园网新闻搜索引擎

import requests; from bs4 import BeautifulSoup; import re; from mysql.connector import MySQLConnection; import jieba; import math; import logging; import ast（字符串转列表会用到）; from tqdm import tqdm, trange（进度条）; jieba.setLogLevel(logging.INFO)（隐藏jieba的日志输出）...
复制链接

扫一扫