Building a Search Engine

Sharing a simple search-engine build. The idea: crawl 1,000+ webpage URLs, use TF-IDF to extract keywords and build an inverted index, then use the PageRank algorithm to build the ranking/recommendation engine.
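
The code further down never spells this structure out explicitly, so here is a minimal, self-contained sketch of a TF-IDF-weighted inverted index. The toy URLs, the example text, and all names (docs, tf, doc_freq, inverted) are made up for illustration and do not appear in the post's code.

import math
from collections import Counter, defaultdict

# Toy corpus: URL -> page text (both URLs and text are made up for illustration)
docs = {
    "https://example.com/a": "search engine index pagerank",
    "https://example.com/b": "pagerank ranks pages by link structure",
}

# Term frequency: word counts per document
tf = {url: Counter(text.split()) for url, text in docs.items()}

# Document frequency: how many documents contain each term
doc_freq = Counter()
for counts in tf.values():
    doc_freq.update(counts.keys())

# Inverted index: term -> {url: TF-IDF weight}
inverted = defaultdict(dict)
N = len(docs)
for url, counts in tf.items():
    for term, count in counts.items():
        idf = math.log(N / doc_freq[term])  # 0 for terms that appear in every document
        inverted[term][url] = count * idf

# Look up pages containing "index", highest weight first
print(sorted(inverted["index"].items(), key=lambda x: -x[1]))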

1. Crawling webpages:

This uses a simple webpage-crawling technique. I originally crawled Baidu, but Baidu has anti-scraping protection, so I switched to NetEase News instead. The scraped page text contains some odd characters, but that doesn't matter, since all that's needed are the URL links.

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import pandas as pd

# Crawl a single page and recurse into the links it contains
def scrape_webpage(url, depth, visited_urls, index):
    try:
        # Send an HTTP request to fetch the page
        response = requests.get(url)
        if response.status_code == 200:
            # Use the detected encoding so Chinese text decodes correctly
            response.encoding = response.apparent_encoding
            # Parse the HTML with BeautifulSoup
            soup = BeautifulSoup(response.text, 'html.parser')
            # Extract whatever information you need here, depending on the
            # target page's structure; this simply grabs the page title
            title = soup.title.text

            # Store the title and URL in the index
            index.append({'URL': url, 'Title': title})
            print(f"Depth {depth} - Title: {title}")

            # If the depth limit has not been reached, keep following links
            if depth < max_depth:
                links = soup.find_all('a', href=True)
                for link in links:
                    next_url = link['href']
                    # Build an absolute URL
                    if not next_url.startswith('http'):
                        next_url = urljoin(url, next_url)
                    # Skip URLs that have already been visited
                    if next_url not in visited_urls:
                        visited_urls.add(next_url)
                        # Stop once enough pages have been collected
                        if len(index) >= 10000:
                            return
                        # Recurse into the next page
                        scrape_webpage(next_url, depth + 1, visited_urls, index)
    except Exception as e:
        print(f"Error: {e}")

# Main program
if __name__ == "__main__":
    # Maximum crawl depth
    max_depth = 5
    # Starting URL for the crawl
    start_url = 'https://www.baidu.com/?tn=62095104_28_oem_dg'
    
    # A set of URLs that have already been visited
    visited_urls = set()
    visited_urls.add(start_url)
    
    # A list used as the index
    index = []
    
    # Crawl from the starting URL
    scrape_webpage(start_url, 1, visited_urls, index)
    
    # Convert the index to a DataFrame
    df = pd.DataFrame(index)
    
    # Save the DataFrame to an Excel file
    df.to_excel('webpages_baidu_10000.xlsx', index=False)
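
As noted above, Baidu blocks naive crawlers. One common mitigation, which is not part of the original script and is not guaranteed to work on any given site, is to send a browser-like User-Agent header and set a request timeout:

import requests

# A sketch only: the header string, the 10-second timeout, and the NetEase News
# front-page URL are assumptions, not values taken from the post.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
response = requests.get("https://news.163.com/", headers=headers, timeout=10)
print(response.status_code)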

2. Building the inverted index:

from bs4 import BeautifulSoup
import jieba
import networkx as nx
from collections import Counter
import math
import pandas as pd
import requests

# Load the list of crawled URLs and titles
file_path = 'webpages200.xlsx'
df = pd.read_excel(file_path)

webpages = df['URL'].tolist()

# Sanity check: print the crawled page titles
for title in df['Title'].tolist():
    print("title:", title)
    
def crawl_webpage(url):
    response = requests.get(url)
    if response.status_code == 200:
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, 'html.parser')
        # Extract the plain text of the page
        text_content = soup.get_text()
        return text_content
    return ""


# Segment Chinese text into words with jieba
def chinese_segmentation(text):
    return list(jieba.cut(text))

def build_graph(webpages, index):
    G = nx.DiGraph()

    for webpage in webpages:
        # Each webpage is a node; edges link pages that share words,
        # weighted by the summed counts of the shared words
        G.add_node(webpage)
        words = index[webpage]['words'].keys()
        for target_webpage in webpages:
            if webpage != target_webpage:
                common_words = set(words) & set(index[target_webpage]['words'].keys())
                weight_sum = sum(index[webpage]['words'][word] for word in common_words)
                if weight_sum > 0:
                    G.add_edge(webpage, target_webpage, weight=weight_sum)

    return G


def calculate_pagerank(G):
    pagerank = nx.pagerank(G)
    return pagerank

def search_engine_query(query, webpages, index, pagerank):
    query_words = chinese_segmentation(query)
    # Bipartite graph linking query words to the pages that contain them
    G_query = nx.Graph()
    G_query.add_nodes_from(query_words)

    for word in query_words:
        for webpage in webpages:
            if word in index[webpage]['words']:
                G_query.add_edge(word, webpage, weight=index[webpage]['words'][word])

    pagerank_query = nx.pagerank(G_query, weight='weight')

    # Score each page: its global PageRank plus its weighted overlap with the query
    search_results = []
    for webpage in webpages:
        common_words = set(query_words) & set(index[webpage]['words'].keys())
        weight_sum = sum(index[webpage]['words'][word] * pagerank_query.get(word, 0) for word in common_words)
        score = pagerank[webpage] + weight_sum
        search_results.append((webpage, score))

    # Highest score first
    search_results = sorted(search_results, key=lambda x: x[1], reverse=True)

    return search_results

# Build the index: URL -> page title and word counts
index = {}
for url in webpages:
    content = crawl_webpage(url)
    words = chinese_segmentation(content)
    word_counter = Counter(words)
    index[url] = {'title': df[df['URL'] == url]['Title'].values[0], 'words': word_counter}


G = build_graph(webpages, index)
pagerank = calculate_pagerank(G)
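
The script above builds the index and the page graph but never actually issues a query. Appended to the end of that script, a call might look like the following; the query string is just an example, and the scores depend entirely on which pages were crawled.

# Example query against the index and PageRank built above (illustrative only)
query = "新闻"
results = search_engine_query(query, webpages, index, pagerank)

# Print the ten best-scoring pages with their titles
for url, score in results[:10]:
    print(f"{score:.4f}  {index[url]['title']}  {url}")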

4. Demo results:
