最终效果
一. 前言
对于自己的CSDN文章,由于只能通过标题关键字搜索,如果文章很多的话,记不清文章标题,这个时候搜索就会比较麻烦,那就自己做一个搜索功能。
二. 思路
- 把自己的CSDN文章都爬到本地
- 写个web服务器,负责执行搜索任务
- 写个静态html页面,负责输入搜索的关键字和展示搜索结果
三. 撸起袖子写代码
目录结构如下
1 python爬虫,爬自己的CSDN文章
python版本:3.8
需要安装库:requests
、BeautifulSoup
csdn_spider.py
# csdn_spider.py
import requests
from bs4 import BeautifulSoup
import base64
import os
import re
import json
from threading import Thread
import time
# Total number of article-list pages to scrape.
PAGE_CNT = 16
# Base URL of the article list; the page number is appended to it.
BLOG_URL = 'https://blog.csdn.net/linxinfa/article/list/'
# Local directory where downloaded articles are stored.
DOWNLOADD_DIR = 'articles/'
# Number of list pages fully processed (incremented by worker threads).
page_done_cnt = 0
# Running count of articles discovered so far.
article_cnt = 0
# Mapping of base64-encoded title -> article URL; None until built or loaded.
title2url = None
def get_all_aritcles(page):
    """Scrape one page of the article list and download every article on it.

    Args:
        page: 1-based index of the list page, appended to BLOG_URL.

    Side effects: saves each article via download_article(), records the
    base64-encoded title -> URL mapping in the global title2url, and bumps
    the global page_done_cnt so the main thread's wait loop can finish.
    """
    global title2url
    global page_done_cnt
    global article_cnt
    web_url = BLOG_URL + str(page)
    try:
        # Timeout so a stalled request cannot hang this worker thread forever.
        r = requests.get(web_url, timeout=30)
        html_txt = r.text
        # Strip <br> tags so each entry's text stays on one line.
        html_txt = html_txt.replace('<br>', '').replace('<br/>', '')
        soup = BeautifulSoup(html_txt, 'lxml')
        tag_main = soup.find('main')
        tag_div = tag_main.find('div', class_='article-list')
        if tag_div:
            for tag_h4 in tag_div.find_all('h4'):
                tag_a = tag_h4.find('a')
                if 'linxinfa' not in tag_a['href']:
                    continue
                url = tag_a['href']
                # tag_a.string is unusable because the <a> has child nodes;
                # extract() the children and take the text node (index 1,
                # per the observed page structure).
                parts = [s.extract() for s in tag_a]
                title = parts[1].strip()
                print('%s %s: %s' % (article_cnt, title, url))
                # base64-encode the title for use as a file name; '/' is
                # illegal in file names, so replace it with '%'.
                encode_title = base64.b64encode(title.encode('utf-8'))
                encode_title = str(encode_title, encoding="utf-8").replace('/', '%')
                download_article(encode_title, url)
                title2url[encode_title] = url
                article_cnt = article_cnt + 1
    finally:
        # Always mark the page as done, even if scraping failed, so the
        # main thread's page_done_cnt wait loop cannot hang forever.
        page_done_cnt = page_done_cnt + 1
def download_article(title, url):
    """Download one article page and save only its <article> body locally.

    Args:
        title: base64-encoded article title (already sanitized for use as
            a file name by the caller).
        url: full URL of the article page.
    """
    # Timeout so a stalled request cannot hang the calling thread.
    r = requests.get(url, timeout=30)
    # Keep only the content inside the <article class="baidu_pl"> tag.
    match = re.search('<article class="baidu_pl">((.|\n)*)</article>', r.text)
    if match:
        content_txt = match.group(1)
        # The file name is the base64-encoded title; 'with' guarantees the
        # handle is closed even if the write fails.
        with open(DOWNLOADD_DIR + title + '.html', 'w', encoding='utf-8') as fw:
            fw.write('<article class="baidu_pl">' + content_txt + '</article>')
def create_download_dir():
    """Create the article download directory if it does not already exist."""
    # exist_ok avoids the race between an existence check and mkdir.
    os.makedirs(DOWNLOADD_DIR, exist_ok=True)
def save_title2url_json():
    """Persist the global title -> URL mapping to title2url.json."""
    global title2url
    txt = json.dumps(title2url, indent=2)
    # Explicit encoding avoids depending on the platform default; 'with'
    # guarantees the file is closed.
    with open('title2url.json', 'w', encoding='utf-8') as fw:
        fw.write(txt)
def search_article(search_txt):
    """Search all downloaded articles for a pattern.

    Args:
        search_txt: text to look for. It is passed to re.search, so regex
            metacharacters in the input are interpreted as a pattern.

    Returns:
        A list of {"title": ..., "url": ...} dicts, one per article whose
        saved HTML matches the pattern.
    """
    global title2url
    target_articles = []
    if title2url is None:
        # Lazy-load the mapping produced earlier by save_title2url_json().
        with open('title2url.json', 'r') as fr:
            title2url = json.loads(fr.read())
    for root, dirs, fs in os.walk(DOWNLOADD_DIR):
        for f in fs:
            with open(os.path.join(root, f), 'r', encoding='utf-8') as fr:
                article_txt = fr.read()
            if re.search(search_txt, article_txt):
                # File names had '/' replaced by '%' (illegal in file
                # names); undo that to recover the base64 key.
                k = f.replace('.html', '').replace('%', '/')
                if k in title2url:
                    target_articles.append({
                        "title": str(base64.b64decode(k), "utf-8"),
                        "url": title2url[k],
                    })
    return target_articles
if __name__ == '__main__':
    title2url = {}
    page_done_cnt = 0
    article_cnt = 0
    create_download_dir()
    # Scrape every list page concurrently, one worker thread per page.
    for i in range(1, PAGE_CNT + 1):
        thread = Thread(target=get_all_aritcles, args=(i,))
        # thread.daemon replaces setDaemon(), which is deprecated since
        # Python 3.10; daemon threads die with the main thread.
        thread.daemon = True
        thread.start()
    # Poll until every page worker has reported completion.
    while page_done_cnt < PAGE_CNT:
        time.sleep(0.5)
    save_title2url_json()
    print("done")
执行后,把文章爬到本地的articles
目录中
注:文件名是文章标题的base64编码(其中把/替换成%,否则会提示非法文件名)
然后还会生成一份标题到链接的映射表
2. web服务器
使用python的tornado框架,负责执行搜索任务
需要安装库:tornado
csdn_articles.py
# csdn_articles.py
import tornado.ioloop
import tornado.web
import json
import os
import re
import platform
import csdn_spider
# Port the web server listens on.
PORT = 8987
class BaseHandler(tornado.web.RequestHandler):
    """Common handler: enables CORS headers and answers preflight requests."""

    def set_default_headers(self):
        # Allow cross-origin requests from any page; a specific domain
        # could be used here instead of '*'.
        self.set_header("Access-Control-Allow-Origin", "*")
        self.set_header("Access-Control-Allow-Headers", "x-requested-with")
        self.set_header('Access-Control-Allow-Methods', 'POST, GET, OPTIONS')

    def post(self):
        self.write('some post')

    def get(self):
        self.write('some get')

    def options(self):
        # Preflight response: no body, just a 204 status.
        self.set_status(204)
        self.finish()
class SearchHandler(BaseHandler):
    """Handles POST /search_article; body is JSON like {"txt": <pattern>}."""

    def post(self):
        jd = get_json_from_bytes(self.request.body)
        print('SearchHandler.post', jd)
        if jd is None:
            self.write('data error')
        else:
            # .get avoids an unhandled KeyError (HTTP 500) when the
            # client omits the 'txt' field.
            txt = jd.get('txt', '')
            target_articles = csdn_spider.search_article(txt)
            print(target_articles)
            self.write(json.dumps(target_articles))
def get_json_from_bytes(data_bytes):
    """Decode a UTF-8 bytes payload and parse it as JSON.

    Args:
        data_bytes: raw request body bytes.

    Returns:
        The parsed JSON value, or None when the payload is not valid JSON.
    """
    txt = str(data_bytes, encoding="utf8")
    # Parse once and handle failure directly, instead of validating with
    # is_json() first and then parsing the same text a second time.
    try:
        return json.loads(txt)
    except ValueError:
        print('get_json_from_bytes Error:', txt)
        return None
def is_json(txt):
    """Return True when *txt* parses as a valid JSON document."""
    try:
        json.loads(txt)
        return True
    except ValueError:
        return False
def make_app():
    """Build the Tornado application with its URL routing table."""
    routes = [
        (r'/search_article', SearchHandler),
    ]
    return tornado.web.Application(routes)
if __name__ == "__main__":
    # Work around the Python 3.8 Proactor event-loop change on Windows,
    # which breaks Tornado; force the selector event loop instead.
    if platform.system() == "Windows":
        import asyncio
        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
    app = make_app()
    app.listen(PORT)
    print('start web server successfully, port:', PORT)
    tornado.ioloop.IOLoop.current().start()
3. 静态网页
负责输入搜索的关键字和展示搜索结果
browser.html
<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>林新发CSDN博客搜索</title>
    <link href="browser.css" type="text/css" rel="stylesheet" />
</head>
<body>
    <h1>林新发CSDN博客搜索</h1>
    请输入关键字 <input id="txt" type="text" name="version" value="">
    <button onclick="OnSearchBtn()">搜索</button>
    <table cellspacing="1" width="100%">
        <thead>
            <tr>
                <th width="10%">index</th>
                <th width="50%">文章</th>
            </tr>
        </thead>
        <tbody id='article_table'>
        </tbody>
    </table>
    <!-- Script moved inside <body>; the original closed </html> before it. -->
    <script>
        var ADDRESS = 'http://127.0.0.1:8987/';

        // Return true only when str is a string containing valid JSON.
        function isJSON(str) {
            if (typeof str == 'string') {
                try {
                    JSON.parse(str);
                    return true;
                } catch (e) {
                    return false;
                }
            }
            console.log('It is not a string!')
        }

        // POST the keyword to the server and render the results table.
        function OnSearchBtn() {
            var txt = document.getElementById('txt');
            var data = { 'txt': txt.value };
            var request_url = ADDRESS + 'search_article';
            var request = new XMLHttpRequest();
            request.open('POST', request_url, true);
            // Register handlers BEFORE send() so no event can be missed.
            request.onload = function (e) {
                console.log("请求成功");
                console.log(request.response);
                if (request.status === 200) {
                    console.log(request.responseText);
                    // Clear all rows from the previous search.
                    var tbody = document.getElementById('article_table');
                    while (tbody.firstChild) {
                        tbody.removeChild(tbody.firstChild);
                    }
                    if (!isJSON(request.response)) {
                        return;
                    }
                    var jd = JSON.parse(request.response);
                    for (var i = 0; i < jd.length; i++) {
                        // One row per match: index cell + linked title cell.
                        var tr = document.createElement('tr');
                        tbody.appendChild(tr);
                        var tdIndex = document.createElement('td');
                        tdIndex.innerHTML = i + 1;
                        tr.appendChild(tdIndex);
                        var tdTitle = document.createElement('td');
                        var a = document.createElement('a');
                        a.innerHTML = jd[i]['title'];
                        a.href = jd[i]['url'];
                        // '_blank' (the original had 'blank', which names a
                        // window rather than opening a new tab each time).
                        a.target = '_blank';
                        tdTitle.appendChild(a);
                        tr.appendChild(tdTitle);
                    }
                } else {
                    console.log('search_article error, request.status: ' + request.status);
                }
            }
            request.onerror = function (e) {
                console.log('search_article error, status:' + request.status + ', statusText: ' + request.statusText);
            }
            request.send(JSON.stringify(data));
        }
    </script>
</body>
</html>
browser.css
/* Collapse cell borders and center the search-results table's content. */
table {
border-collapse: collapse;
text-align: center;
}
/* Visible cell borders; cell text is left-aligned. */
td,
th {
border: 1px solid #333;
text-align: left;
}
/* Blue header row with white text. */
thead tr {
background-color: #004B97;
color: white;
}
四. 启动web服务器,测试
执行csdn_articles.py
,启动web服务器
用浏览器打开browser.html
搜索Unity
,就可以查到含有Unity
关键字的博客文章了(文章标题或内容含有Unity关键字都会被搜出来)