1. Tooling
crawl4ai 0.6.3 handles asynchronous crawling, Squid serves as the outbound proxy, and Elasticsearch stores the data.
The AI analysis layer is deepseek-r1:8b running on a local Ollama instance.
Together these tools form a complete technical blueprint for an automated daily-briefing system.
2. System Architecture
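The data flow implied by the code in section 3, sketched end to end:

[scheduler] -> crawl4ai (async crawling, JS rendering)
                   | outbound HTTP via the Squid proxy (ACLs + rate limiting)
                   v
              extracted page content
                   | summary + keywords from Ollama (deepseek-r1:8b)
                   v
              Elasticsearch index "daily_briefing"
                   v
              daily HTML briefing rendered with Jinja2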
3. Core Code
import os
import re
import json
import asyncio
from datetime import datetime, timezone
from elasticsearch import AsyncElasticsearch
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
from crawl4ai.models import CrawlResult
from jinja2 import Template
from ollama import AsyncClient

# Environment configuration
os.environ["SQUID_PROXY"] = "http://user:pass@localhost:3128"
os.environ["ES_HOSTS"] = "http://localhost:9200"
class KnowledgePipeline:
    def __init__(self):
        self.es = AsyncElasticsearch(hosts=[os.getenv("ES_HOSTS")])
        # crawl4ai 0.6.x renders JS by default through Playwright; the proxy
        # is passed via BrowserConfig rather than the crawler constructor.
        # Per-request timeouts are set in CrawlerRunConfig below.
        self.crawler = AsyncWebCrawler(
            config=BrowserConfig(
                proxy=os.getenv("SQUID_PROXY"),
                headless=True
            )
        )
        # Knowledge-base category configuration
        self.categories = {
            "economic": {
                "urls": [
                    "https://www.imf.org",
                    "https://data.worldbank.org"
                ],
                "selector": "#main-content"  # per-site CSS selector
            },
            "technology": {
                "urls": [
                    "https://arxiv.org",
                    "https://ieeexplore.ieee.org"
                ],
                "screenshot": True
            }
        }
    async def _analyze_content(self, text: str) -> dict:
        """Summarize and tag content with the local Ollama model."""
        response = await AsyncClient().generate(
            model='deepseek-r1:8b',
            prompt=(
                "Return a JSON object with keys 'summary' (string) and "
                f"'keywords' (a list of 5 strings) for this text:\n{text[:3000]}"
            ),
            options={
                "temperature": 0.7,
                "num_predict": 300  # Ollama's name for the max-tokens limit
            }
        )
        # deepseek-r1 emits <think>...</think> reasoning; strip it before parsing
        raw = re.sub(r"<think>.*?</think>", "", response['response'], flags=re.S)
        try:
            parsed = json.loads(raw[raw.index('{'):raw.rindex('}') + 1])
            return {"summary": parsed["summary"], "keywords": parsed["keywords"][:5]}
        except (ValueError, KeyError):
            # Fallback: treat the whole completion as the summary
            return {"summary": raw.strip(), "keywords": []}
    async def _store_to_es(self, data: dict):
        """Index one document into Elasticsearch asynchronously."""
        doc = {
            "title": data["title"],
            "content": data["content"],
            "summary": data["summary"],
            "keywords": data["keywords"],
            "timestamp": datetime.now(timezone.utc),
            "category": data["category"],
            "url": data["url"],
            "screenshot": data.get("screenshot")  # used by the briefing template
        }
        await self.es.index(
            index="daily_briefing",
            document=doc
        )
    async def crawl_category(self, category: str, config: dict):
        """Crawl every URL configured for one category."""
        results = []
        run_config = CrawlerRunConfig(
            css_selector=config.get("selector", "body"),
            screenshot=config.get("screenshot", False),
            page_timeout=30_000  # milliseconds
        )
        for url in config["urls"]:
            result: CrawlResult = await self.crawler.arun(url=url, config=run_config)
            if not result.success:
                continue
            content = result.markdown.raw_markdown if result.markdown else ""
            analysis = await self._analyze_content(content)
            data = {
                "category": category,
                "url": url,
                "title": (result.metadata or {}).get("title", url),
                "content": content,
                "screenshot": result.screenshot,
                **analysis
            }
            await self._store_to_es(data)
            results.append(data)
        return results
    async def generate_daily_briefing(self):
        """Generate the daily HTML briefing."""
        # Fetch the documents indexed today (the crawl runs immediately before this)
        resp = await self.es.search(
            index="daily_briefing",
            query={
                "range": {
                    "timestamp": {"gte": "now/d"}
                }
            },
            size=100
        )
        # Render the HTML briefing
        template = """
        <html><body>
        <h1>Daily Knowledge Briefing {{ date }}</h1>
        {% for category in data %}
        <div class="category">
            <h2>{{ category.name }}</h2>
            {% for item in category.items %}
            <div class="item">
                <h3><a href="{{ item.url }}">{{ item.title }}</a></h3>
                <p>{{ item.summary }}</p>
                <div class="keywords">
                    {% for kw in item.keywords %}
                    <span class="tag">{{ kw }}</span>
                    {% endfor %}
                </div>
                {% if item.screenshot %}
                <img src="data:image/png;base64,{{ item.screenshot }}" width="300">
                {% endif %}
            </div>
            {% endfor %}
        </div>
        {% endfor %}
        </body></html>
        """
        return await self._render_template(template, {
            "date": datetime.now().strftime("%Y-%m-%d"),
            "data": self._group_by_category(resp['hits']['hits'])
        })

    async def _render_template(self, template: str, context: dict) -> str:
        """Render the briefing HTML with Jinja2."""
        return Template(template).render(**context)

    def _group_by_category(self, hits: list) -> list:
        """Group ES hits into [{'name': ..., 'items': [...]}] for the template."""
        groups: dict = {}
        for hit in hits:
            src = hit["_source"]
            groups.setdefault(src["category"], []).append(src)
        return [{"name": name, "items": items} for name, items in groups.items()]
    async def run(self):
        """Run the full pipeline."""
        # Crawl all categories concurrently; the context manager starts
        # and stops the underlying browser.
        async with self.crawler:
            tasks = [
                self.crawl_category(cat, config)
                for cat, config in self.categories.items()
            ]
            await asyncio.gather(*tasks)
        # Render and save the briefing
        html = await self.generate_daily_briefing()
        with open(f"briefing_{datetime.today().strftime('%Y%m%d')}.html", "w") as f:
            f.write(html)
        await self.es.close()
# Example run
if __name__ == "__main__":
    pipeline = KnowledgePipeline()
    asyncio.run(pipeline.run())
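To produce the briefing on a daily cadence, a cron entry is the simplest trigger; the script path below is illustrative:

# crontab -e: run the pipeline every morning at 06:00
0 6 * * * /usr/bin/python3 /opt/briefing/pipeline.py >> /var/log/briefing.log 2>&1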
4. Dynamic Loading of Briefing Detail Pages
# Real-time rendering after a user clicks a briefing item.
# Assumes Flask 2+ with the async extra installed, a module-level
# AsyncElasticsearch client `es`, and a started AsyncWebCrawler `crawler`.
import hashlib
from flask import Flask, request, render_template
from crawl4ai import CrawlerRunConfig

app = Flask(__name__)

@app.route('/detail')
async def load_detail():
    url = request.args.get('url')
    # Python's built-in hash() is salted per process, so use a stable digest
    url_hash = hashlib.sha256(url.encode()).hexdigest()
    # Check the cache first
    cached = await es.search(index="daily_news", query={"term": {"url_hash": url_hash}})
    hits = cached['hits']['hits']
    if not hits:
        # Render the detail page on demand
        result = await crawler.arun(
            url=url,
            config=CrawlerRunConfig(
                js_code=["document.querySelector('.paywall')?.remove()"],  # strip the paywall overlay
                css_selector=".main-content"
            )
        )
        await es.index(...)  # fields elided in the original sketch
        return render_template('detail.html', content=result.html)
    return render_template('detail.html', content=hits[0]['_source']['raw_html'])
5. Supporting Deployment Architecture
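A minimal single-host layout covering every service the pipeline needs; the image tags, ports, and volume names here are illustrative assumptions:

# docker-compose.yml -- illustrative single-host deployment
services:
  squid:
    image: ubuntu/squid:latest
    ports: ["3128:3128"]
    volumes: ["./squid.conf:/etc/squid/squid.conf:ro"]
  elasticsearch:
    image: docker.elastic.co/elasticsearch/elasticsearch:8.13.4
    environment:
      - discovery.type=single-node
      - xpack.security.enabled=false  # dev convenience; enable auth in production
    ports: ["9200:9200"]
  ollama:
    image: ollama/ollama:latest
    ports: ["11434:11434"]
    volumes: ["ollama_models:/root/.ollama"]
volumes:
  ollama_models: {}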
6. System Optimization Suggestions
- Proxy configuration tuning:
# Key squid.conf settings
acl knowledge_crawlers src 192.168.1.0/24
delay_pools 1
delay_class 1 1
delay_parameters 1 5000000/5000000  # values are bytes/second, so this caps at ~5 MB/s
delay_access 1 allow knowledge_crawlers
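A quick way to confirm both the proxy path and the rate cap from a crawler host (the credentials match the SQUID_PROXY value used in section 3):

# Fetch through Squid and print the observed download speed (bytes/second)
curl -x http://user:pass@localhost:3128 -o /dev/null -w '%{speed_download}\n' https://data.worldbank.org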
- Elasticsearch mapping template (the ik_max_word analyzer requires the IK analysis plugin to be installed):
{
"mappings": {
"properties": {
"content": {"type": "text", "analyzer": "ik_max_word"},
"summary": {"type": "text", "fields": {"keyword": {"type": "keyword"}}},
"keywords": {"type": "keyword"},
"category": {"type": "keyword"}
}
}
}
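The mapping has to exist before the first document is indexed; a one-shot setup script, assuming the index name from section 3:

# create_index.py -- apply the mapping once, before the first crawl
import asyncio
from elasticsearch import AsyncElasticsearch

MAPPINGS = {
    "properties": {
        "content": {"type": "text", "analyzer": "ik_max_word"},
        "summary": {"type": "text", "fields": {"keyword": {"type": "keyword"}}},
        "keywords": {"type": "keyword"},
        "category": {"type": "keyword"}
    }
}

async def main():
    es = AsyncElasticsearch("http://localhost:9200")
    if not await es.indices.exists(index="daily_briefing"):
        await es.indices.create(index="daily_briefing", mappings=MAPPINGS)
    await es.close()

asyncio.run(main())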
- Ollama service configuration:
# Start the API service; the bind address comes from the OLLAMA_HOST
# environment variable. Ollama has no built-in API-key auth, so put a
# reverse proxy in front of it if the port is exposed beyond localhost.
OLLAMA_HOST=0.0.0.0:11434 ollama serve

# Package the briefing prompt as a reusable custom model via a Modelfile
cat > Modelfile <<'EOF'
FROM deepseek-r1:8b
SYSTEM "As a knowledge-base analyst, summarize the key points in Chinese, covering: 1. core arguments, 2. data metrics, 3. industry impact."
EOF
ollama create briefing-summary -f Modelfile
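The pipeline can then target the custom model instead of the raw base model; a minimal call, with the function name being illustrative:

# Drop-in replacement for the generate() call in _analyze_content
from ollama import AsyncClient

async def summarize(text: str) -> str:
    resp = await AsyncClient().generate(model="briefing-summary", prompt=text[:3000])
    return resp["response"]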
- Content-processing pipeline: deduplicate by URL digest, strip boilerplate, summarize, then index (a sketch follows below).
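A minimal sketch of those four stages; the helper names are hypothetical, and `analyze` stands in for the _analyze_content coroutine from section 3:

# pipeline_steps.py -- dedupe -> clean -> summarize -> index
import hashlib
import re

def url_digest(url: str) -> str:
    """Stable document ID for deduplication (unlike the salted built-in hash())."""
    return hashlib.sha256(url.encode()).hexdigest()

def clean_html(html: str) -> str:
    """Crude boilerplate stripping: drop script/style blocks, tags, extra whitespace."""
    text = re.sub(r"<(script|style)[^>]*>.*?</\1>", " ", html, flags=re.S | re.I)
    text = re.sub(r"<[^>]+>", " ", text)
    return re.sub(r"\s+", " ", text).strip()

async def process(es, analyze, doc: dict):
    doc_id = url_digest(doc["url"])
    if await es.exists(index="daily_briefing", id=doc_id):
        return  # already indexed; skip re-summarizing
    doc["content"] = clean_html(doc["content"])
    doc.update(await analyze(doc["content"]))  # adds summary + keywords
    await es.index(index="daily_briefing", id=doc_id, document=doc)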
7. Extension Ideas
- Knowledge-graph construction, e.g. a keyword co-occurrence graph. The snippet below uses networkx as a runnable substitute for the original kgforge pseudocode:
# Build and draw a keyword co-occurrence graph from the crawled articles
import networkx as nx
import matplotlib.pyplot as plt
from itertools import combinations

kg = nx.Graph()
for article in data:  # `data` is the list returned by crawl_category
    for a, b in combinations(article["keywords"], 2):
        if kg.has_edge(a, b):
            kg[a][b]["weight"] += 1
        else:
            kg.add_edge(a, b, weight=1)
nx.draw_networkx(kg, with_labels=True)
plt.show()
- User customization:
  - Keyword subscriptions (a matching sketch follows below)
  - Interest-area preferences
  - Delivery-channel selection (email / DingTalk / Slack)
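Keyword subscriptions map naturally onto a filtered ES query; a sketch in which the user_prefs structure is an assumption:

# Match today's documents against one user's subscribed keywords
async def match_subscriptions(es, user_prefs: dict) -> list:
    resp = await es.search(
        index="daily_briefing",
        query={
            "bool": {
                "filter": [
                    {"terms": {"keywords": user_prefs["keywords"]}},
                    {"range": {"timestamp": {"gte": "now/d"}}}
                ]
            }
        }
    )
    return [hit["_source"] for hit in resp["hits"]["hits"]]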
8. Monitoring (Optional)
# Prometheus exporters for each component
- Elasticsearch metrics: elasticsearch_exporter
- Squid traffic: squid_exporter
- Ollama performance: Ollama does not ship a Prometheus /metrics endpoint, so wrap its HTTP API with a small custom exporter
# Key alerting rules (crawl_errors_total is a custom metric emitted by the crawler itself)
groups:
  - name: crawler
    rules:
      - alert: CrawlerErrorRate
        expr: rate(crawl_errors_total[5m]) > 0.1
      - alert: ESHighLatency
        expr: elasticsearch_indexing_latency_seconds > 2
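The crawler-side metric assumed by the first rule can be emitted with prometheus_client; everything here beyond the metric name is an illustrative assumption:

# metrics.py -- expose custom crawler metrics for Prometheus to scrape
from prometheus_client import Counter, start_http_server

CRAWL_ERRORS = Counter("crawl_errors_total", "Failed crawl attempts", ["category"])

def start_metrics_server(port: int = 8000):
    """Serve /metrics on the given port."""
    start_http_server(port)

# In crawl_category's failure branch:
#     CRAWL_ERRORS.labels(category=category).inc()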
9. Sample Output
Daily briefing view:
## 2025-05-17 Financial News Digest
1. **Central bank cuts the reserve requirement ratio by 0.5 percentage points**
   - Expected to release roughly 1.2 trillion yuan in long-term funds
   - Analysts see a boost to financing for small and micro enterprises
2. **AI chip giant unveils new product**
   - 3x compute, energy efficiency of 20 TOPS/W
   - Related stocks rallied across the board (click for details)
[View the full report](http://briefing.site/detail?date=2025-05-17)
Highlights of this design:
- Fully asynchronous I/O, suited to high-frequency crawling
- A local LLM keeps the data private
- Unified proxy management through Squid
- Flexible retrieval via Elasticsearch
- An archivable HTML report generated automatically every day
Before deploying, make sure that:
- Squid's ACL rules are configured correctly
- The Elasticsearch index template has been created
- The Ollama model (deepseek-r1:8b) has finished downloading
- Network policies between the services open the required ports