前言
本搜索引擎使用 Go 语言实现,采用 Redis 作为存储数据库,使用了 colly 爬虫框架和 gin-gonic Web 框架。
实现了包括页面抓取,建立倒排索引等功能。
注:该项目只是一个大作业,存在很多不足,仅供参考!!!!!!!
1、项目架构图
2、运行效果
demo
3、项目目录结构
MySearchEngine
- collySpider
- collySpider.go
- dict
- stop_words.utf8
- models
- page.go
- templates
- search.html
- tools
- checkUrl.go
- createStopWords.go
- getUnVistedUrl.go
- getUrlMD5.go
- init.go
- initDB.go
- invert_index.go
- savePages.go
- saveUnVisitedUrl.go
- search.go
- main.go
4、具体代码实现
该爬虫爬取域名主要为www.chinanews.com.cn、www.news.cn,可以在init.go内进行修改。
4.1 collySpider
collySpider.go
package collySpider
import (
"log"
"os"
"pkg/models"
"pkg/tools"
"regexp"
"strings"
"sync"
"github.com/gocolly/colly"
)
// mu serializes the save-page / look-up-ID / index sequence at the end of the
// body callback so the three steps for one page run as a unit.
var mu sync.Mutex

// Craw pops one URL from the unvisited queue and visits it.
//
// Links found on the page are not followed directly: every href is resolved
// against the page URL and handed to tools.Saveunvisitedurl. The page body is
// reduced to plain text, stored with tools.Savepages and indexed with
// tools.AddDocument. Errors are logged to stderr; nothing is returned.
func Craw() {
	errorLog := log.New(os.Stderr, "Error: ", log.LstdFlags)

	// Built once per call instead of once per page. The class removes digits,
	// ASCII and Greek letters, ASCII punctuation, and Unicode punctuation,
	// symbols, marks, control characters and separators.
	noise := regexp.MustCompile(`[0-9A-Za-zΑ-Ωα-ω!"#$%&'()*+,-./:;<=>?@[\\\]^_` + "`" + `{|}~\p{P}\p{S}\p{M}\p{C}\p{Z}]`)
	whitespace := strings.NewReplacer(" ", "", "\n", "", "\t", "")

	// Tried in order; the first selector yielding non-empty text wins.
	titleSelectors := []string{
		"div[class='head-line clearfix'] > h1 > span[class='title']",
		"h1[class='content_left_title']",
		"div[class='content_title']",
		"h1[class='page_title']",
	}

	cl := colly.NewCollector(
		colly.MaxDepth(6),
		colly.AllowedDomains("www.chinanews.com.cn", "www.news.cn"),
	)

	cl.OnRequest(func(r *colly.Request) {
		r.Headers.Set("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36")
	})

	cl.OnError(func(_ *colly.Response, err error) {
		errorLog.Println(err)
	})

	// Only elements that carry an href can contribute a link, so match those
	// instead of every element on the page.
	cl.OnHTML("[href]", func(el *colly.HTMLElement) {
		href := strings.TrimSpace(el.Attr("href"))
		if href == "" {
			return
		}
		// Resolve relative and protocol-relative links against the page URL;
		// passed through raw they fail the http/https check in IsValidURL
		// and are silently dropped.
		if abs := el.Request.AbsoluteURL(href); abs != "" {
			tools.Saveunvisitedurl(abs)
		}
	})

	cl.OnHTML("body", func(el *colly.HTMLElement) {
		// Per-page value: keeping it local means one page's fields can never
		// leak into another's.
		var lake models.Page

		for _, sel := range titleSelectors {
			if lake.Title = strings.TrimSpace(el.ChildText(sel)); lake.Title != "" {
				break
			}
		}

		// Strip the text of script, noscript and style elements from the
		// body text, in that order.
		text := el.Text
		for _, tag := range []string{"script", "noscript", "style"} {
			el.ForEach(tag, func(_ int, child *colly.HTMLElement) {
				if child.Text != "" {
					text = strings.ReplaceAll(text, child.Text, "")
				}
			})
		}
		text = whitespace.Replace(text)
		text = noise.ReplaceAllString(text, "")

		lake.Url = el.Request.URL.String()
		lake.Text = text
		if lake.Text == "" {
			// Nothing left after cleaning: index the title instead.
			lake.Text = lake.Title
		}

		mu.Lock()
		// Deferred so the lock is released even if indexing panics.
		defer mu.Unlock()
		tools.Savepages(lake.Url, lake.Title)
		// NOTE(review): the second result of GetDocID is discarded; its
		// definition is not visible here — confirm whether it is an error
		// that should abort indexing.
		ids, _ := tools.GetDocID(lake.Url)
		tools.AddDocument(ids, lake.Text)
	})

	// // Create a timer and run the code below after one minute.
	// timer := time.NewTimer(1 * time.Minute)
	// go func() {
	// 	<-timer.C
	// 	const concurrentRequests = 64
	// 	ch := make(chan string, concurrentRequests)
	// 	wg := sync.WaitGroup{}
	// 	for i := 0; i < concurrentRequests; i++ {
	// 		wg.Add(1)
	// 		go func() {
	// 			defer wg.Done()
	// 			for url := range ch {
	// 				err := cl.Visit(url)
	// 				if err != nil {
	// 					errorLog := log.New(os.Stderr, "Error: ", log.LstdFlags)
	// 					errorLog.Println(err)
	// 				}
	// 			}
	// 		}()
	// 	}
	// 	for {
	// 		url, err := tools.Getunvistedurl()
	// 		if err != nil {
	// 			if err.Error() == "redis: nil" {
	// 				time.Sleep(20 * time.Second) // wait 20 seconds, then retry
	// 				continue
	// 			}
	// 			log.Println(err)
	// 			return
	// 		}
	// 		ch <- url
	// 	}
	// }()

	// Code that runs before the (disabled) timer above would fire.
	rawURL, err := tools.Getunvistedurl()
	if err != nil {
		// No URL available: report the real cause instead of visiting "".
		errorLog.Println(err)
		return
	}
	if err := cl.Visit(rawURL); err != nil {
		errorLog.Println(err)
	}
}
注:多线程似乎还存在部分问题,docID存在锁不住的问题,如需使用请检查改进后使用。
4.2 dict
该目录下存放停词库文件,请自行查找加入目录.
4.3 models
page.go
package models
// Page is one crawled page: its numeric ID, source URL, extracted title and
// cleaned body text. The gorm tags mark ID as the primary key and give the
// three string columns a default of null.
type Page struct {
	ID               int    `gorm:"primaryKey"`
	Url, Title, Text string `gorm:"default:null"`
}
4.4 templates
search.html
<!DOCTYPE html>
<html>
<head>
<title>🍙搜索</title>
<style>
html, body {
height: 100%;
margin: 0;
padding: 0;
font-family: Arial, sans-serif;
overflow-x: hidden; /* Prevent horizontal scrollbar */
}
body {
overflow-y: auto; /* Allow vertical scrolling */
scroll-behavior: smooth; /* Enable smooth scrolling */
}
#particles-js {
position: fixed;
width: 100%;
height: 100%;
background-color: #2b2e4a;
z-index: -1;
}
#app {
position: relative;
z-index: 1;
max-width: 800px;
margin: 50px auto;
padding: 20px;
background: rgba(255, 255, 255, 0.9);
box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
border-radius: 10px;
}
h1 {
text-align: center;
color: #4CAF50;
font-size: 2.5em;
margin-bottom: 20px;
}
input[type="text"] {
width: calc(100% - 22px);
padding: 10px;
margin-bottom: 20px;
border: 1px solid #ddd;
border-radius: 4px;
font-size: 1em;
}
button {
display: block;
width: 100%;
padding: 10px;
background-color: #4CAF50;
color: white;
border: none;
border-radius: 4px;
cursor: pointer;
font-size: 1em;
}
button:hover {
background-color: #45a049;
}
ul {
list-style-type: none;
padding: 0;
}
li {
background: #f9f9f9;
margin: 10px 0;
padding: 20px;
border: 1px solid #ddd;
border-radius: 4px;
}
h3 {
margin: 0 0 10px;
}
a {
color: #4CAF50;
text-decoration: none;
}
a:hover {
text-decoration: underline;
}
</style>
</head>
<body>
<div id="particles-js"></div>
<div id="app">
<h1>🍙搜索</h1>
<input type="text" id="query" placeholder="原神启动!!!!!!!!!!" onkeyup="checkEnter(event)">
<button id="searchBtn" onclick="search()">启动</button>
<ul id="results">
<!-- Search results will be inserted here -->
</ul>
</div>
<script src="https://cdn.jsdelivr.net/particles.js/2.0.0/particles.min.js"></script>
<script>
// Once the DOM is ready, start particles.js on the element with id
// "particles-js" using the configuration object below.
document.addEventListener("DOMContentLoaded", function() {
particlesJS("particles-js", {
// Appearance and motion of the particles themselves.
particles: {
number: {
value: 80,
density: {
enable: true,
value_area: 800
}
},
color: {
value: "#ffffff"
},
shape: {
type: "circle",
stroke: {
width: 0,
color: "#000000"
},
polygon: {
nb_sides: 5
}
},
opacity: {
value: 0.5,
random: false,
anim: {
enable: false,
speed: 1,
opacity_min: 0.1,
sync: false
}
},
size: {
value: 3,
random: true,
anim: {
enable: false,
speed: 40,
size_min: 0.1,
sync: false
}
},
line_linked: {
enable: true,
distance: 150,
color: "#ffffff",
opacity: 0.4,
width: 1
},
move: {
enable: true,
speed: 6,
direction: "none",
random: false,
straight: false,
out_mode: "out",
bounce: false,
attract: {
enable: false,
rotateX: 600,
rotateY: 1200
}
}
},
// Mouse interaction: hover selects the "repulse" mode and click selects the
// "push" mode from the modes table further down.
interactivity: {
detect_on: "canvas",
events: {
onhover: {
enable: true,
mode: "repulse" // Interaction mode on hover
},
onclick: {
enable: true,
mode: "push"
},
resize: true
},
// Settings per mode. Only "repulse" and "push" are selected by the events
// above; "grab", "bubble" and "remove" are configured but not selected here.
modes: {
grab: {
distance: 200,
line_linked: {
opacity: 1
}
},
bubble: {
distance: 400,
size: 40,
duration: 2,
opacity: 8,
speed: 3
},
repulse: {
distance: 100, // Distance of repulsion effect
duration: 0.4
},
push: {
particles_nb: 4
},
remove: {
particles_nb: 2
}
}
},
retina_detect: true
});
});
// checkEnter is the keyup handler for the query box: pressing Enter clicks the
// search button, any other key is ignored.
function checkEnter(event) {
  if (event.key !== 'Enter') {
    return;
  }
  const searchButton = document.getElementById('searchBtn');
  searchButton.click();
}
// search posts the query box content to /search as JSON and renders the
// returned hits into the #results list, then scrolls the list into view.
//
// Hits are rendered with DOM nodes and textContent instead of an innerHTML
// template: titles and URLs come from crawled pages, so they must never be
// parsed as markup. Failures are logged to the console and leave the list
// as it was.
function search() {
  const query = document.getElementById('query').value;
  fetch('/search', {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json'
    },
    body: JSON.stringify({ query })
  })
    .then(response => {
      // A 4xx/5xx body is an error object, not a result list.
      if (!response.ok) {
        throw new Error('search request failed with status ' + response.status);
      }
      return response.json();
    })
    .then(data => {
      // An empty result may arrive as JSON null rather than []; treat
      // anything that is not an array as "no hits".
      const hits = Array.isArray(data) ? data : [];
      const resultsElement = document.getElementById('results');
      resultsElement.textContent = '';

      hits.forEach(result => {
        const title = document.createElement('h3');
        title.textContent = result.Title;

        const score = document.createElement('p');
        score.textContent = 'Relevance Score: ' + result.Score;

        const link = document.createElement('a');
        link.textContent = result.URL;
        link.target = '_blank';
        link.rel = 'noopener noreferrer';
        // Only http(s) targets become clickable; other schemes stay plain text.
        if (/^https?:\/\//i.test(String(result.URL))) {
          link.href = result.URL;
        }

        const urlLine = document.createElement('p');
        urlLine.appendChild(document.createTextNode('URL: '));
        urlLine.appendChild(link);

        const item = document.createElement('li');
        item.appendChild(title);
        item.appendChild(score);
        item.appendChild(urlLine);
        resultsElement.appendChild(item);
      });

      resultsElement.scrollIntoView({ behavior: 'smooth' });
    })
    .catch(error => console.error('Error:', error));
}
</script>
</body>
</html>
4.5 tools
checkUrl.go
package tools
import (
"fmt"
"net/url"
)
// IsValidURL reports whether rawURL parses as a request URI and uses the
// http or https scheme.
func IsValidURL(rawURL string) bool {
	parsed, err := url.ParseRequestURI(rawURL)
	if err != nil {
		return false
	}
	switch parsed.Scheme {
	case "http", "https":
		return true
	default:
		return false
	}
}
// Checkvistedurl reports whether the Redis list "visited_urls_queue" contains
// a URL whose MD5 hex digest equals urlMD5.
//
// The whole list is fetched and every element is hashed with Getmd5 for the
// comparison. If the Redis read fails, the error is printed and false is
// returned.
func Checkvistedurl(urlMD5 string) (flag bool) {
	const visitedQueueName = "visited_urls_queue"

	visited, err := rdb.LRange(ctx, visitedQueueName, 0, -1).Result()
	if err != nil {
		fmt.Println("Error while retrieving elements from Redis:", err)
		return false
	}

	for i := range visited {
		if Getmd5(visited[i]) == urlMD5 {
			return true
		}
	}
	return false
}
createStopWords.go
package tools
import (
"bufio"
"os"
"strings"
)
// Createstopwords reads the stop-word file and returns one entry per line,
// with surrounding whitespace trimmed.
//
// The file location is a fixed absolute path. If the file cannot be opened
// the problem is reported on stderr and nil is returned; if reading fails
// part-way the problem is reported and the lines read so far are returned.
func Createstopwords() []string {
	file, err := os.Open("/home/nanzi/goCode/src/MySearchEngine/dict/stop_words.utf8")
	if err != nil {
		// Written through os.Stderr directly so no extra import is needed.
		os.Stderr.WriteString("Createstopwords: " + err.Error() + "\n")
		return nil
	}
	defer file.Close()

	var stopWords []string
	scanner := bufio.NewScanner(file)
	for scanner.Scan() {
		stopWords = append(stopWords, strings.TrimSpace(scanner.Text()))
	}
	// Scan returns false both at EOF and on a read error; tell them apart.
	if err := scanner.Err(); err != nil {
		os.Stderr.WriteString("Createstopwords: " + err.Error() + "\n")
	}
	return stopWords
}
getUnVistedUrl.go
package tools
import (
"fmt"
)
// Getunvistedurl pops URLs from the Redis list "unvisited_urls_queue" until
// it finds one that Checkvistedurl does not report as visited, pushes that
// URL onto "visited_urls_queue" and returns it.
//
// URLs already marked visited are discarded. Any Redis failure, including
// an empty unvisited list, is returned as a wrapped error with an empty URL.
func Getunvistedurl() (string, error) {
	for {
		next, err := rdb.LPop(ctx, "unvisited_urls_queue").Result()
		if err != nil {
			return "", fmt.Errorf("failed to get URL from the unvisited URL queue: %w", err)
		}

		// Already crawled: drop it and try the next queue entry.
		if Checkvistedurl(Getmd5(next)) {
			continue
		}

		if err := rdb.LPush(ctx, "visited_urls_queue", next).Err(); err != nil {
			return "", fmt.Errorf("failed to add URL to the visited URL queue: %w", err)
		}
		return next, nil
	}
}
// Getunvistedurlsize returns the length of the Redis list
// "unvisited_urls_queue". A failed command is not reported; the command's
// value is returned as-is.
func Getunvistedurlsize() int {
	// Val yields the same value Result would, without the error.
	return int(rdb.LLen(ctx, "unvisited_urls_queue").Val())
}
getUrlMD5.go
package tools
import (
"crypto/md5"
"fmt"
"io"
)
// Getmd5 returns the MD5 digest of url as a lowercase hexadecimal string.
func Getmd5(url string) (urlMD5 string) {
	digest := md5.New()
	io.WriteString(digest, url)
	urlMD5 = fmt.Sprintf("%x", digest.Sum(nil))
	return urlMD5
}
init.go
package tools
// Init seeds the Redis list "unvisited_urls_queue" with the two crawl entry
// points. A failed push is not reported.
func Init() {
	const unvisitedQueueName = "unvisited_urls_queue"
	seedURLs := []string{"https://www.chinanews.com.cn/", "http://www.news.cn/"}
	_ = rdb.LPush(ctx, unvisitedQueueName, seedURLs).Err()
}
initDB.go
package tools
import (
"context"
"github.com/go-redis/redis/v8"
)
var (
	// rdb is the package-wide Redis client, created in init.
	rdb *redis.Client
	// ctx is the context passed to every Redis command in this package.
	ctx = context.Background()
)

// init connects the package-wide client to the local Redis server.
func init() {
	opts := &redis.Options{
		Addr:     "localhost:6379",
		Password: "", // no password set
		DB:       1,  // database index 1 (not Redis' default index 0)
	}
	rdb = redis.NewClient(opts)
}
invert_index.go
package tools
import (
"strings"
"sync"
"github.com/yanyiwu/gojieba"
)
// in reports whether s equals any element of list.
func in(s string, list []string) bool {
	for i := 0; i < len(list); i++ {
		if list[i] == s {
			return true
		}
	}
	return false
}
var (
	// mu is the mutex held around the per-word Redis write in AddDocument.
	mu sync.Mutex

	// InvertedIndex is an in-memory map declared for the word-level inverted
	// index.
	InvertedIndex = make(map[string][]string)

	// jiebaInstance is the shared segmenter; jiebaOnce guards its creation.
	jiebaInstance *gojieba.Jieba
	jiebaOnce     sync.Once
)

// getJiebaInstance returns the shared segmenter, creating it on first use.
func getJiebaInstance() *gojieba.Jieba {
	jiebaOnce.Do(initJieba)
	return jiebaInstance
}

// initJieba builds the shared segmenter; it is run through jiebaOnce.
func initJieba() {
	jiebaInstance = gojieba.NewJieba()
}
// AddDocument 函数用于向索引中添加文档
func AddDocument(id_cnt string, content string) {
x := getJiebaInstance()
// 将内容拆分为单词并转换为小写
words := x.CutForSearch(strings.ToLower(content), true)
stop_words := Createstopwords()
// 更新每个单词的倒排索引
for _, word := range words {
if in(word, stop_words) {
continue
}
mu.Lock() // 加锁
docID := id_cnt
err := rdb.RPush(ctx, word, docID).Err()
mu.Unlock() // 解锁
if err != nil {
panic(err)
}
}
}
savePages.go
package tools
import (
"fmt"
"net/url"
)
// IsValidURL reports whether rawURL parses as a request URI and uses the
// http or https scheme.
//
// NOTE(review): this listing under savePages.go repeats the definition shown
// under checkUrl.go; two copies in one package would not compile.
func IsValidURL(rawURL string) bool {
	parsed, err := url.ParseRequestURI(rawURL)
	return err == nil && (parsed.Scheme == "http" || parsed.Scheme == "https")
}
// Checkvistedurl reports whether the Redis list "visited_urls_queue" contains
// a URL whose MD5 hex digest equals urlMD5. A failed Redis read is printed
// and treated as "not visited".
//
// NOTE(review): this listing under savePages.go repeats the definition shown
// under checkUrl.go; two copies in one package would not compile.
func Checkvistedurl(urlMD5 string) (flag bool) {
	entries, err := rdb.LRange(ctx, "visited_urls_queue", 0, -1).Result()
	if err != nil {
		fmt.Println("Error while retrieving elements from Redis:", err)
		return flag
	}

	// Every stored URL is hashed and compared against the requested digest.
	for _, entry := range entries {
		if flag = Getmd5(entry) == urlMD5; flag {
			break
		}
	}
	return flag
}
saveUnVisitedUrl.go
package tools
import (
"fmt"
)
// Saveunvisitedurl 保存未访问的URL到Redis队列
func Saveunvisitedurl(url string) {
urlMD5 := Getmd5(url)
unvisitedQueueName := "unvisited_urls_queue"
if IsValidURL(url) {
if !Checkvistedurl(urlMD5) {
err := rdb.LPush(ctx, unvisitedQueueName, url).Err()
if err != nil {
fmt.Println("Failed to add URL to Redis queue:", err)
return
}
}
}
// fmt.Println("URL已添加到Redis队列:", urlMD5)
}
search.go
package tools
import (
"math"
"sort"
"strings"
"github.com/yanyiwu/gojieba"
)
// Document is one search hit: the document ID, the page URL and title read
// back for that ID, and the relevance score accumulated for the query.
type Document struct {
	ID, URL, Title string
	Score          float64
}
// BestMatchSearch segments query into words, looks up the document-ID list
// stored in Redis under each word, scores every listed document and returns
// at most 30 hits ordered by descending score (ties broken by ID).
//
// Documents whose URL is excluded by isExcludedURL, or whose title is empty,
// are left out. The result is never nil, so it encodes as a JSON array even
// when nothing matched. The function panics if a Redis command fails.
func BestMatchSearch(query string) []Document {
	x := gojieba.NewJieba()
	// The segmenter owns native resources; release them when the query is
	// done instead of leaking one instance per search.
	defer x.Free()

	// Lower-case the query and split it into words.
	words := x.CutForSearch(strings.ToLower(query), true)

	// postings fetches the ID list of a word from Redis once and serves the
	// later passes from memory.
	postingCache := make(map[string][]string, len(words))
	postings := func(word string) []string {
		if ids, ok := postingCache[word]; ok {
			return ids
		}
		ids, err := rdb.LRange(ctx, word, 0, -1).Result()
		if err != nil {
			panic(err)
		}
		postingCache[word] = ids
		return ids
	}

	// docPositions[word][docID] gets one entry per occurrence of docID in the
	// word's list, so its length is used as the term frequency.
	docPositions := make(map[string]map[string][]int)
	for _, word := range words {
		byDoc := docPositions[word]
		for _, docID := range postings(word) {
			if byDoc == nil {
				byDoc = make(map[string][]int)
				docPositions[word] = byDoc
			}
			byDoc[docID] = append(byDoc[docID], len(byDoc[docID]))
		}
	}

	// The key count of the Redis database stands in for the corpus size.
	totalDocuments, err := rdb.DBSize(ctx).Result()
	if err != nil {
		panic(err)
	}

	// IDF per word, plus the length total that feeds the average length.
	idfValues := make(map[string]float64, len(words))
	totalDocLength := 0
	for _, word := range words {
		docIDs := postings(word)
		idfValues[word] = math.Log(float64(totalDocuments+1) / (1 + float64(len(docIDs))))
		for _, docID := range docIDs {
			totalDocLength += len(docPositions[word][docID])
		}
	}
	avgdl := float64(totalDocLength) / float64(totalDocuments)

	// Accumulate the score of every listed document.
	scores := make(map[string]Document)
	for _, word := range words {
		for _, docID := range postings(word) {
			doc, known := scores[docID]
			if !known {
				url, title := getURLAndTitle(docID)
				if !isExcludedURL(url) && title != "" {
					doc = Document{ID: docID, URL: url, Title: title}
				}
				// Otherwise doc stays the zero value; its empty title gets it
				// filtered out below, and storing it avoids a second lookup.
			}
			doc.Score += calculateBM25Score(word, docPositions[word][docID], avgdl) * idfValues[word]
			scores[docID] = doc
		}
	}

	// Keep only documents with a title. The map key guarantees each ID
	// appears once, so no separate de-duplication pass is needed.
	results := make([]Document, 0, len(scores))
	for _, doc := range scores {
		if doc.Title != "" {
			results = append(results, doc)
		}
	}

	// Highest score first; equal scores are ordered by ID so the output does
	// not depend on map iteration order.
	sort.Slice(results, func(i, j int) bool {
		if results[i].Score != results[j].Score {
			return results[i].Score > results[j].Score
		}
		return results[i].ID < results[j].ID
	})

	return results[:min(30, len(results))]
}
// calculateBM25Score returns a BM25-style term score with k1 = 1.25 and
// b = 0.75, scaled by ln 2.
//
// len(positions) is used both as the term frequency and as the document
// length; avgdl is the average length it is normalized against. The first
// argument is ignored.
func calculateBM25Score(_ string, positions []int, avgdl float64) float64 {
	const (
		k1 = 1.25
		b  = 0.75
	)

	tf := float64(len(positions))
	lengthNorm := 1 - b + b*tf/avgdl

	return math.Log(2) * (tf * (k1 + 1)) / (tf + k1*lengthNorm)
}
// getURLAndTitle reads the first two elements of the Redis list keyed by
// docID and returns them as (url, title). It returns two empty strings when
// the list has fewer than two elements and panics if the Redis read fails.
func getURLAndTitle(docID string) (string, string) {
	head, err := rdb.LRange(ctx, docID, 0, 1).Result()
	switch {
	case err != nil:
		panic(err)
	case len(head) < 2:
		return "", ""
	}
	return head[0], head[1]
}
// min returns the smaller of a and b.
func min(a, b int) int {
	if b <= a {
		return b
	}
	return a
}
// isExcludedURL reports whether url is the news.cn front page, in its http
// or https form, which is kept out of search results.
func isExcludedURL(url string) bool {
	switch url {
	case "http://www.news.cn/", "https://www.news.cn/":
		return true
	}
	return false
}
// containsDocument reports whether docs already holds an entry with the same
// ID as doc.
func containsDocument(docs []Document, doc Document) bool {
	for i := range docs {
		if docs[i].ID == doc.ID {
			return true
		}
	}
	return false
}
4.6 主函数
main.go
注:
开始运行时将注释内容解开,等页面抓取足够后再注释起来.
package main
import (
"net/http"
"pkg/tools"
"github.com/gin-gonic/gin"
)
// main serves the search page on "/" and the JSON search endpoint on
// "/search", listening on port 50086. The crawl phase is kept commented out
// below.
func main() {
	// Crawl phase.
	// tools.Init()
	// // Remember when crawling started.
	// startTime := time.Now()
	// for tools.Getunvistedurlsize() > 0 {
	// 	collySpider.Craw()
	// 	// Stop once 2 hours have elapsed.
	// 	if time.Since(startTime) >= 2*time.Hour {
	// 		break
	// 	}
	// }

	// showSearchPage renders the "search.html" template for the root URL.
	showSearchPage := func(c *gin.Context) {
		c.HTML(http.StatusOK, "search.html", nil)
	}

	// runSearch binds the JSON payload, runs the query and returns the hits.
	runSearch := func(c *gin.Context) {
		var payload struct {
			Query string `json:"query"`
		}
		// A payload that cannot be bound is answered with 400 and the error text.
		if err := c.ShouldBindJSON(&payload); err != nil {
			c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
			return
		}
		c.JSON(http.StatusOK, tools.BestMatchSearch(payload.Query))
	}

	// Build the router and load the HTML templates from "templates".
	router := gin.Default()
	router.LoadHTMLGlob("templates/*")

	router.GET("/", showSearchPage)
	router.POST("/search", runSearch)

	// Start the server on port 50086.
	router.Run(":50086")
}