This article describes a distributed approach to crawling Sohu news:
- Distribution is coordinated through Redis
- Article body extraction is done with newspaper (newspaper3k on Python 3)
- No Scrapy; plain Python multithreading
- Storage is local files (adding Elasticsearch or MongoDB support yourself is straightforward; a MongoDB sketch appears after the code, but the focus here is plain local txt storage)
Collecting all news URLs
Appending a date to the base URL gives access to that day's full news list, roughly a thousand-odd entries, which are stored in Redis to be consumed later. See the code below for the exact URL format.
```python
def getALLURL():
    pass
```
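As a quick illustration of the format, here is a minimal sketch that builds the list URL for a single day with Python's datetime; the helper name `list_url` is made up for this sketch, and the full crawl in `getALLURL` below simply loops this over every date:

```python
from datetime import date

# Sohu's scrolling news list embeds the date as YYYYMMDD.
BASE = "http://news.sohu.com/_scroll_newslist/{:%Y%m%d}/news.inc"

def list_url(day: date) -> str:
    """Build the news-list URL for one calendar day."""
    return BASE.format(day)

print(list_url(date(2019, 1, 1)))
# http://news.sohu.com/_scroll_newslist/20190101/news.inc
```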
Consuming the URLs from multiple machines via Redis
```python
def checkdone():
    pass
```
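One caveat for multi-machine consumption: the worker below uses a get-then-set pattern (read the flag, crawl, then write "done"), so two machines scanning at the same time can both pick up the same URL. A possible fix, not part of the original code, is an atomic claim via Redis SET with the NX flag; the `:claim` key suffix here is a made-up convention for this sketch:

```python
import redis

r = redis.Redis(host="localhost", port=6379, decode_responses=True)

def try_claim(url: str) -> bool:
    """Atomically claim a URL. SET ... NX succeeds for exactly one caller,
    so only that worker goes on to crawl the page."""
    return bool(r.set(url + ":claim", "1", nx=True))
```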
```python
from newspaper import Article
import json
import threading
from concurrent.futures import ThreadPoolExecutor
import time
import redis

# Initialization: a shared Redis connection and the local output file.
pool = redis.ConnectionPool(host='localhost', port=6379, decode_responses=True)
r = redis.Redis(connection_pool=pool)
output = open("sohu50.txt", "w", encoding="utf-8")
# Serialize writes so concurrent threads do not interleave lines.
write_lock = threading.Lock()
# Fetch the day's news list from the base URL.
def getURLS(url):
    article = Article(url, language='zh')
    try:
        article.download()
    except Exception:
        return
    result = article.html
    # The endpoint returns a JS snippet, not pure JSON: drop the fixed
    # 16-character prefix and quote the bare keys so json.loads accepts it.
    result = result[16:]
    result = result.replace("category", "\"category\"").replace("item", "\"item\"")
    try:
        resultobj = json.loads(result)
    except Exception:
        return
    return resultobj
# Extract title and body with newspaper.
def getContent(url):
    article = Article(url, language='zh')
    try:
        article.download()
        article.parse()
        # article.nlp()  # optional keywords/summary; noticeably slower
    except Exception:
        return
    return article.title.strip(), article.text.strip()
# Worker for a single URL: fetch its title and body, and print the per-URL
# processing time. newspaper is fairly slow; goose is a better fit here
# (see the goose3 sketch after this listing).
def signalThread(url):
    value = r.get(url)
    if value != "done":
        start = time.time()
        result = getContent(url)
        if result is None:  # download/parse failed; leave the key for a retry
            return
        news = {"url": url}
        news["title"], news["content"] = result
        line = json.dumps(news, ensure_ascii=False)  # keep Chinese readable
        with write_lock:
            output.write(line + "\n")
        print(time.time() - start)
        r.set(url, "done")
    return
def getALLURL():
    url = "http://news.sohu.com/_scroll_newslist/20{}{}{}/news.inc"
    # Walk every date from 2009 through 2019 and queue that day's URLs.
    for i in range(9, 20):
        for j in range(1, 13):
            for d in range(1, 32):
                target = url.format(str(i).zfill(2), str(j).zfill(2), str(d).zfill(2))
                allurls = getURLS(target)
                if not allurls:
                    continue
                for item in allurls["item"]:
                    targetUrl = item[2]  # index 2 holds the article URL
                    r.set(targetUrl, 1)
def checkdone():
    # A 600-thread pool; URLs are submitted in batches of 600.
    with ThreadPoolExecutor(600) as executor:
        put = []
        for key in r.scan_iter(count=600):
            put.append(key)
            if len(put) >= 600:
                executor.map(signalThread, put)
                put = []
        if put:  # submit the final, partially filled batch
            executor.map(signalThread, put)
getALLURL()
checkdone()
output.close()  # flush any buffered lines
```
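As for the speed complaint in the worker's comment, a goose3 equivalent of getContent might look roughly like this; it is a sketch based on goose3's documented API, not code from the original project, and the function name is hypothetical:

```python
from goose3 import Goose
from goose3.text import StopWordsChinese

# Chinese stopwords are needed for sensible extraction on zh pages.
g = Goose({'stopwords_class': StopWordsChinese})

def get_content_goose(url):
    """Drop-in alternative to getContent() built on goose3."""
    try:
        article = g.extract(url=url)
    except Exception:
        return
    return article.title.strip(), article.cleaned_text.strip()
```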
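And for those who prefer MongoDB over a local txt file, as mentioned in the intro, a minimal sketch with pymongo could replace the output.write() call in signalThread; the database and collection names are made up for illustration:

```python
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017")
collection = client["sohu"]["news"]  # hypothetical db/collection names

def save_news(news: dict) -> None:
    """Insert one crawled article instead of appending it to sohu50.txt."""
    collection.insert_one(news)
```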