Sharing a simple search engine build. The idea: crawl 1000+ webpage URLs, use TF-IDF to extract keywords and build an inverted index, then use the PageRank algorithm to build the ranking (recommendation) engine.
1. Crawling the web pages:
Basic web crawling. I started with Baidu, but Baidu has anti-crawling protection, so I switched to the NetEase News site. The crawled page text contains some garbled characters, but that doesn't matter, because all we need are the URL links. (A hedged sketch for making the requests less likely to be blocked follows the code block below.)
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import pandas as pd

# Crawl a page, record its title and URL, and recursively follow its links.
def scrape_webpage(url, depth, visited_urls, index):
    try:
        # Send an HTTP request for the page (with a timeout so a slow host cannot hang the crawl)
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            # Use the detected character encoding to avoid garbled text
            response.encoding = response.apparent_encoding
            # Parse the HTML with BeautifulSoup
            soup = BeautifulSoup(response.text, 'html.parser')
            # Extract whatever you need here, depending on the target page's structure;
            # for now we only grab the page title
            title = soup.title.text if soup.title else ''
            # Store the title and URL in the index
            index.append({'URL': url, 'Title': title})
            print(f"Depth {depth} - Title: {title}")
            # If the depth limit has not been reached, keep following links
            if depth < max_depth:
                links = soup.find_all('a', href=True)
                for link in links:
                    next_url = link['href']
                    # Build an absolute URL from a relative one
                    if not next_url.startswith('http'):
                        next_url = urljoin(url, next_url)
                    # Skip URLs we have already visited
                    if next_url not in visited_urls:
                        visited_urls.add(next_url)
                        # Stop once we have collected enough pages
                        if len(index) >= 10000:
                            return
                        # Recurse into the linked page
                        scrape_webpage(next_url, depth + 1, visited_urls, index)
    except Exception as e:
        print(f"Error: {e}")

# Main program
if __name__ == "__main__":
    # Maximum crawl depth (read as a module-level global inside scrape_webpage)
    max_depth = 5
    # Starting URL for the crawl
    start_url = 'https://www.baidu.com/?tn=62095104_28_oem_dg'
    # Set of URLs that have already been visited, to avoid duplicates
    visited_urls = set()
    visited_urls.add(start_url)
    # List that accumulates the index entries
    index = []
    # Crawl from the starting URL
    scrape_webpage(start_url, 1, visited_urls, index)
    # Convert the index into a DataFrame
    df = pd.DataFrame(index)
    # Save the DataFrame to an Excel file
    df.to_excel('webpages_baidu_10000.xlsx', index=False)
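Since anti-crawling protection was the reason for switching sites, here is a minimal sketch of a more polite request, assuming the target only checks the User-Agent and the request rate. The helper name polite_get, the header string, and the 1-second delay are illustrative assumptions, not part of the original crawler.

import time
import requests

# Hypothetical helper: fetch a page with a browser-like User-Agent and a small delay.
# The header value and the pause length are assumptions, adjust for the target site.
def polite_get(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
    time.sleep(1)  # throttle requests so the crawl is less likely to be blocked
    return requests.get(url, headers=headers, timeout=10)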
2. Building the inverted index:
from bs4 import BeautifulSoup
import jieba
import networkx as nx
from collections import Counter
import pandas as pd
import requests

# Load the URLs and titles collected by the crawler
file_path = 'webpages200.xlsx'
df = pd.read_excel(file_path)
webpages = df['URL'].tolist()
for title in df['Title'].tolist():
    print("title:", title)

# Fetch a page and return its visible text content
def crawl_webpage(url):
    try:
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            response.encoding = response.apparent_encoding
            soup = BeautifulSoup(response.text, 'html.parser')
            # Extract the plain text of the page
            return soup.get_text()
    except Exception as e:
        # A single unreachable URL should not abort the whole indexing run
        print(f"Error fetching {url}: {e}")
    return ""

# Segment Chinese text into words with jieba
def chinese_segmentation(text):
    return list(jieba.cut(text))

# Build a directed graph: word nodes are added per page, and an edge is drawn from one
# page to another when they share words, weighted by the source page's counts of those words
def build_graph(webpages, index):
    G = nx.DiGraph()
    for webpage in webpages:
        words = index[webpage]['words'].keys()
        G.add_nodes_from(words)
        for target_webpage in webpages:
            if webpage != target_webpage:
                common_words = set(words) & set(index[target_webpage]['words'].keys())
                weight_sum = sum(index[webpage]['words'][word] for word in common_words)
                if weight_sum > 0:
                    G.add_edge(webpage, target_webpage, weight=weight_sum)
    return G

# Compute PageRank scores over the page graph
def calculate_pagerank(G):
    pagerank = nx.pagerank(G)
    return pagerank

# Score every page for a query: a word-page graph built from the query is ranked with
# PageRank, then combined with the global PageRank of each page
def search_engine_query(query, webpages, index, pagerank):
    query_words = chinese_segmentation(query)
    G_query = nx.Graph()
    G_query.add_nodes_from(query_words)
    for word in query_words:
        for webpage in webpages:
            # Look the word up in the page's word-count table
            if word in index[webpage]['words']:
                G_query.add_edge(word, webpage, weight=index[webpage]['words'][word])
    pagerank_query = nx.pagerank(G_query, weight='weight')
    search_results = []
    for webpage in webpages:
        common_words = set(query_words) & set(index[webpage]['words'].keys())
        weight_sum = sum(index[webpage]['words'][word] * pagerank_query.get(word, 0) for word in common_words)
        # Pages that ended up isolated in the graph get a PageRank of 0
        score = pagerank.get(webpage, 0) + weight_sum
        search_results.append((webpage, score))
    search_results = sorted(search_results, key=lambda x: x[1], reverse=True)
    return search_results

# Build the inverted index: for every URL store its title and a word-count table
index = {}
for url in webpages:
    content = crawl_webpage(url)
    words = chinese_segmentation(content)
    word_counter = Counter(words)
    index[url] = {'title': df[df['URL'] == url]['Title'].values[0], 'words': word_counter}

G = build_graph(webpages, index)
pagerank = calculate_pagerank(G)
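The overview mentions TF-IDF, while the index above stores raw word counts. A minimal sketch of how TF-IDF weights could be derived from the same index structure, using one common smoothed IDF variant; the helper name compute_tfidf is illustrative and not part of the original code.

import math
from collections import Counter

# Hypothetical helper: turn the raw word counts in `index` into TF-IDF weights per page.
def compute_tfidf(webpages, index):
    n_docs = len(webpages)
    # Document frequency: in how many pages each word appears
    doc_freq = Counter()
    for url in webpages:
        doc_freq.update(index[url]['words'].keys())
    tfidf = {}
    for url in webpages:
        counts = index[url]['words']
        total = sum(counts.values()) or 1
        # tf = count / page length, idf = log(N / (1 + df))
        tfidf[url] = {
            word: (cnt / total) * math.log(n_docs / (1 + doc_freq[word]))
            for word, cnt in counts.items()
        }
    return tfidf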
4. Demo results:
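A minimal way to issue a query against the structures built above; the query string and the top-5 cutoff are just example values.

# Run a sample query and show the five highest-scoring pages
results = search_engine_query("新闻", webpages, index, pagerank)
for url, score in results[:5]:
    print(f"{score:.4f}  {index[url]['title']}  {url}")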