题目要求:
第3题 爬虫应用——校园网搜索引擎
本题旨在使用Python建立一个适合校园网使用的Web搜索引擎系统,它能在较短时间内爬取页面信息,具有有效准确的中文分词功能,实现对校园网上新闻信息的快速检索展示。
代码:
import urllib.request
import urllib.parse
from bs4 import BeautifulSoup
import re
import os
from time import sleep
import sqlite3
def creatdb():
    """Create the crawler database (爬虫.db) and its single table.

    Table ``date_pa`` holds one row per crawled article:
      title (text, primary key), ip (link path), text1 (page source).

    Improvements over the original: ``IF NOT EXISTS`` makes repeated
    calls safe (the old code raised on a second run), and try/finally
    guarantees the connection is closed even when the DDL fails.
    """
    con = sqlite3.connect("爬虫.db")  # opens or creates the db file
    try:
        con.execute(
            "create table if not exists date_pa ("
            "title text primary key,"   # article title
            "ip text not null,"         # link (URL path)
            "text1 text not null)"      # article page source / body
        )
        con.commit()
    finally:
        con.close()
def find_text(ip):
    """Fetch one article page from the campus site and parse it.

    Parameters:
        ip: URL path of the article (appended to the site root).

    Returns:
        BeautifulSoup of the page, or None when the fetch or the
        utf-8 decode fails.  The original code printed an error and
        then crashed with NameError because ``html`` was never bound;
        returning None keeps the crawl going.
    """
    url = "http://www.zueb.edu.cn" + ip
    try:
        rep = urllib.request.urlopen(url)
        html = rep.read().decode('utf-8')
    except Exception:
        print("出错了!")
        return None
    return BeautifulSoup(html, 'html.parser')
def adddate(titel, ip, text):
    """Insert one crawled article into ``date_pa``.

    Parameters:
        titel: article title (primary key — duplicates are skipped).
        ip:    link path of the article.
        text:  article page source / body.

    Uses a parameterized INSERT instead of string concatenation: the
    original was open to SQL injection and corrupted the stored body
    by rewriting every "'" into "!".  try/finally closes the
    connection even on failure (the original leaked it then).
    Prints 插入成功 on success, "1" on failure (kept for
    compatibility with the original output).
    """
    con = sqlite3.connect("爬虫.db")
    cur = con.cursor()
    try:
        cur.execute("insert into date_pa values (?,?,?)", (titel, ip, text))
        con.commit()
        print("插入成功")
    except sqlite3.Error:
        # typically a duplicate title (primary key) — keep crawling
        print("1")
    finally:
        cur.close()
        con.close()
def createdate():
    """Initialize the database: crawl the campus news index.

    Creates the database/table, then crawls the first index page and
    the 120 follow-up pages (index_1.html .. index_120.html) of
    http://www.zueb.edu.cn/site/xsgate/xsyw/, storing every article
    (title, link, page source) via adddate().

    The original duplicated ~25 lines verbatim for the first page vs.
    the paginated pages; that shared logic now lives in _crawl_page.
    """
    base = "http://www.zueb.edu.cn/site/xsgate/xsyw/index.html"
    creatdb()  # make sure the table exists before inserting
    _crawl_page(base)
    dot = base.find('.h')  # splice point for the page number suffix
    for yeshu in range(1, 121):
        _crawl_page(base[:dot] + '_' + str(yeshu) + base[dot:])


def _crawl_page(url):
    """Fetch one news-index page and store every article listed on it.

    Extracts the <a target="_blank"> links inside the
    main_box_inner_right column; for each, fetches the article page
    via find_text() and stores it with adddate().  Fetch/decode
    failures are reported and the page is skipped (the original
    crashed with NameError on an undefined ``html``).
    """
    print(url)
    try:
        html = urllib.request.urlopen(url).read().decode('utf-8')
    except Exception:
        print("出错了!")
        return
    page = BeautifulSoup(html, 'html.parser')
    # Re-parse just the right-hand column, as the original did, so the
    # link search is restricted to the article list.
    column = BeautifulSoup(
        str(page.find_all('div', attrs={'class': "main_box_inner_right"})),
        'html.parser')
    for link in column.find_all('a', {'target': '_blank'}):
        title = link['title']
        ip = link['href']
        print(title)
        print(ip)
        text = str(find_text(ip))  # article page source
        adddate(title, ip, text)   # store in the database
def find_key(key1):
    """Search stored articles whose title contains ``key1``.

    Runs ``title LIKE %key1%`` as a parameterized query (the original
    concatenated the keyword straight into the SQL — injectable) and
    prints every matching row.

    Returns:
        The list of matching rows.  The original returned None; the
        added return value is backward compatible since existing
        callers ignore it.
    """
    pattern = "%" + key1 + "%"  # '%%' in the original collapses to '%' in LIKE anyway
    con = sqlite3.connect("爬虫.db")
    cur = con.cursor()
    try:
        cur.execute("select * from date_pa where title like ?", (pattern,))
        rows = cur.fetchall()
        for row in rows:
            print(row)
        return rows
    finally:
        cur.close()
        con.close()
# Entry point: on the first run ("1") crawl the site and build the
# database; on later runs prompt for a keyword and search the database.
first_run = input("是否第一次打开本程序?( 1 or 0 ):")
if first_run == "1":
    createdate()  # crawl and populate the database
else:
    keyword = input("请输入你要搜索的关键词:")
    find_key(keyword)  # fuzzy title search
本实验的具体内容见:
Python课设实验报告.docx-讲义文档类资源-CSDN文库https://download.csdn.net/download/kilig_CSM/68857848
(本报告包含课设目的,开发环境,课设内容,需求分析,设计过程,存储方式,过程及代码,调试过程,小结,参考文献.)
注意:
需要注意的是,该代码用到了 bs4(BeautifulSoup)第三方包,没有安装过的同学需要先安装(pip install beautifulsoup4)才能导入。