题目要求:
第3题 爬虫应用——校园网搜索引擎
本题旨在使用Python建立一个适合校园网使用的Web搜索引擎系统,它能在较短时间内爬取页面信息,具有有效准确的中文分词功能,实现对校园网上新闻信息的快速检索展示。
代码:
import urllib.request
import urllib.parse
from bs4 import BeautifulSoup
import re
import os
from time import sleep
import sqlite3
def creatdb():
    """Create the crawler database (爬虫.db) and its single table.

    Table ``date_pa`` holds one row per crawled article:
      title (text, primary key), ip (link path), text1 (page source).

    Improvements over the original: ``IF NOT EXISTS`` makes repeated
    calls safe (the old code raised on a second run), and try/finally
    guarantees the connection is closed even when the DDL fails.
    """
    con = sqlite3.connect("爬虫.db")  # opens or creates the db file
    try:
        con.execute(
            "create table if not exists date_pa ("
            "title text primary key,"   # article title
            "ip text not null,"         # link (URL path)
            "text1 text not null)"      # article page source / body
        )
        con.commit()
    finally:
        con.close()
def find_text(ip):
    """Fetch one article page from the campus site and parse it.

    Parameters:
        ip: URL path of the article (appended to the site root).

    Returns:
        BeautifulSoup of the page, or None when the fetch or the
        utf-8 decode fails.  The original code printed an error and
        then crashed with NameError because ``html`` was never bound;
        returning None keeps the crawl going.
    """
    url = "http://www.zueb.edu.cn" + ip
    try:
        rep = urllib.request.urlopen(url)
        html = rep.read().decode('utf-8')
    except Exception:
        print("出错了!")
        return None
    return BeautifulSoup(html, 'html.parser')
def adddate(titel, ip, text):
    """Insert one crawled article into ``date_pa``.

    Parameters:
        titel: article title (primary key — duplicates are skipped).
        ip:    link path of the article.
        text:  article page source / body.

    Uses a parameterized INSERT instead of string concatenation: the
    original was open to SQL injection and corrupted the stored body
    by rewriting every "'" into "!".  try/finally closes the
    connection even on failure (the original leaked it then).
    Prints 插入成功 on success, "1" on failure (kept for
    compatibility with the original output).
    """
    con = sqlite3.connect("爬虫.db")
    cur = con.cursor()
    try:
        cur.execute("insert into date_pa values (?,?,?)", (titel, ip, text))
        con.commit()
        print("插入成功")
    except sqlite3.Error:
        # typically a duplicate title (primary key) — keep crawling
        print("1")
    finally:
        cur.close()
        con.close()
def createdate():
    """Initialize the database: crawl the campus news index.

    Creates the database/table, then crawls the first index page and
    the 120 follow-up pages (index_1.html .. index_120.html) of
    http://www.zueb.edu.cn/site/xsgate/xsyw/, storing every article
    (title, link, page source) via adddate().

    The original duplicated ~25 lines verbatim for the first page vs.
    the paginated pages; that shared logic now lives in _crawl_page.
    """
    base = "http://www.zueb.edu.cn/site/xsgate/xsyw/index.html"
    creatdb()  # make sure the table exists before inserting
    _crawl_page(base)
    dot = base.find('.h')  # splice point for the page number suffix
    for yeshu in range(1, 121):
        _crawl_page(base[:dot] + '_' + str(yeshu) + base[dot:])


def _crawl_page(url):
    """Fetch one news-index page and store every article listed on it.

    Extracts the <a target="_blank"> links inside the
    main_box_inner_right column; for each, fetches the article page
    via find_text() and stores it with adddate().  Fetch/decode
    failures are reported and the page is skipped (the original
    crashed with NameError on an undefined ``html``).
    """
    print(url)
    try:
        html = urllib.request.urlopen(url).read().decode('utf-8')
    except Exception:
        print("出错了!")
        return
    page = BeautifulSoup(html, 'html.parser')
    # Re-parse just the right-hand column, as the original did, so the
    # link search is restricted to the article list.
    column = BeautifulSoup(
        str(page.find_all('div', attrs={'class': "main_box_inner_right"})),
        'html.parser')
    for link in column.find_all('a', {'target': '_blank'}):
        title = link['title']
        ip = link['href']
        print(title)
        print(ip)
        text = str(find_text(ip))  # article page source
        adddate(title, ip, text)   # store in the database
def find_key(key1):
    """Search stored articles whose title contains ``key1``.

    Runs ``title LIKE %key1%`` as a parameterized query (the original
    concatenated the keyword straight into the SQL — injectable) and
    prints every matching row.

    Returns:
        The list of matching rows.  The original returned None; the
        added return value is backward compatible since existing
        callers ignore it.
    """
    pattern = "%" + key1 + "%"  # '%%' in the original collapses to '%' in LIKE anyway
    con = sqlite3.connect("爬虫.db")
    cur = con.cursor()
    try:
        cur.execute("select * from date_pa where title like ?", (pattern,))
        rows = cur.fetchall()
        for row in rows:
            print(row)
        return rows
    finally:
        cur.close()
        con.close()
# Entry point: on the first run ("1") crawl the site and build the
# database; on later runs prompt for a keyword and search the database.
first_run = input("是否第一次打开本程序?( 1 or 0 ):")
if first_run == "1":
    createdate()  # crawl and populate the database
else:
    keyword = input("请输入你要搜索的关键词:")
    find_key(keyword)  # fuzzy title search
本实验的具体内容见:
Python课设实验报告.docx-讲义文档类资源-CSDN文库https://download.csdn.net/download/kilig_CSM/68857848
(本报告包含课设目的,开发环境,课设内容,需求分析,设计过程,存储方式,过程及代码,调试过程,小结,参考文献.)
注意:
需要注意的是,该代码用到了 bs4(BeautifulSoup)第三方包,没有安装过的同学需要先安装(pip install beautifulsoup4)才能导入。