Code repository: https://gitee.com/hardyJia/python_scrapy_wangyi_mobile.git
Main files
-main.py
-Wangyi3GSpider.py
import json
import re
import scrapy
import time
import sys
import redis
from demjson import decode
sys.path.append('..')
from wangyi_mobile.items import BaseItem, WangyiCommentItem, CommenrItem
from wangyi_mobile.sm3Util import sm3Util
"""
爬取方式:滚动新闻-根据标题筛选-详情页-评论接口
记得改redis地址、es存储位置和es地址!
"""
class Wangyi3GSpider(scrapy.Spider):
name = '163.mobile'
base_url = 'http://3g.163.com/touch/reconstruct/article/list/{}/{}-10.html'
    # Redis connection; update the host/port to your environment.
    # decode_responses=True makes lrange return str keywords that can be matched against article titles.
    redis_conn = redis.Redis(connection_pool=redis.ConnectionPool(host='localhost', port=6379, decode_responses=True))
    redis_key = 'wangyiRoll1'
    n_collection_web = '网易'
    # keyword filter list maintained in Redis under KEYWORDS_LIST
    keywords = redis_conn.lrange("KEYWORDS_LIST", 0, -1)
    # type_list = ['BBM54PGAwangning']  # headlines only
    type_list = ['BBM54PGAwangning', 'BA8EE5GMwangning', 'BA8D4A3Rwangning']  # headlines, finance, tech
def start_requests(self):
for type in self.type_list:
            for page in range(0, 301, 10):  # offsets 0-300 in steps of 10, i.e. at most 310 articles per channel
url = self.base_url.format(type, page)
yield scrapy.Request(url, callback=self.parse, meta={'type': type})
    def parse(self, response):
        try:
            j_str = response.body.decode("utf-8")
        except UnicodeDecodeError:
            j_str = response.body.decode("gb18030")
            print("utf-8 decode failed for 163.com response, fell back to gb18030")
        # strip the JSONP wrapper (9-character callback prefix and trailing ')') before parsing
        article_list = json.loads(j_str[9:-1])
        news = article_list[response.meta['type']]
        for data in news:
            item = BaseItem()
            item['n_title'] = data['title']
            item['n_comment_num'] = data['commentCount']
            docid = data['docid']
            publish_date = data['ptime']
            item['n_publish_date'] = int(time.mktime(time.strptime(publish_date, "%Y-%m-%d %H:%M:%S")))
            item['n_description'] = data['digest']
            n_link = "https://3g.163.com/news/article/" + docid + ".html"
            item['n_link'] = n_link
            # yield scrapy.Request(item["n_link"], callback=self.parse_detail, meta={'item': item})
            # keep the article only if its title matches one of the configured keywords
            for keyword in self.keywords:
                if re.search(keyword, item["n_title"]) is not None:
                    item["n_keywords"] = keyword
                    # sadd returns 1 for an unseen link and 0 for a duplicate; only crawl new links
                    return_code = self.redis_conn.sadd(self.redis_key, item["n_link"])
                    if return_code != 0:
                        yield scrapy.Request(item["n_link"], callback=self.parse_detail, meta={'item': item})
                    break
def parse_detail(self, response):
item = response.meta['item']
content_results = response.xpath("//div[@class='content']//p")
pic_results = response.xpath("//div[@class='content']//div[@class='photo']//a")
content = []
for i in range(len(content_results)):
content.append(content_results[i].xpath("string()").extract_first())
pic = []
for i in range(len(pic_results)):
pic.append(pic_results[i].xpath("./@href").extract_first())
item['n_content'] = content + pic
item['n_collection_web'] = self.n_collection_web
item["n_id"] = sm3Util.getAuthorId(self, self.n_collection_web, item['n_link'])
item['n_crawling_time'] = int(time.time())
item['n_source'] = response.xpath("string(//meta[@property='article:author']/@content)").extract_first()
item['n_author'] = item['n_source']
item['n_classify'] = '新闻网站'
print("超链接")
print(item['n_comment_num'])
print(response.url)
doc_id=response.url.split("article/")[1][:-5]
print(doc_id)
if item['n_author'] is not None:
item['n_author_id'] = sm3Util.getAuthorId(self, self.n_collection_web, item['n_author'])
yield item
comment_url = "https://comment.api.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/" + doc_id+"/comments/newList?offset=0&limit=30&headLimit=3&tailLimit=2&ibc=newswap&showLevelThreshold=5&callback=callback_"+str(int(round(time.time() * 1000)))
# comment_url="https://comment.api.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/G9JIKK990511B8LM/comments/newList?ibc=newspc&limit=30&showLevelThreshold=72&headLimit=1&tailLimit=2&offset=0&"
if item['n_comment_num'] > 0:
commenrItem=CommenrItem()
commenrItem['doc_id']=doc_id
commenrItem['n_id']=item['n_id']
commenrItem['offset']=0
commenrItem['page_limit']=30
yield scrapy.Request(url=comment_url, callback=self.parse_comment, meta={'commenrItem': commenrItem})
    def parse_comment(self, response):
        commenrItem = response.meta['commenrItem']
        try:
            j_str = response.body.decode("gb18030")
        except UnicodeDecodeError:
            j_str = response.body.decode("utf-8")
            print("gb18030 decode failed for 163.com comments, fell back to utf-8")
        # strip the JSONP wrapper: "callback_" + 13-digit timestamp + "(" (23 chars) and the trailing ");"
        print(j_str[23:-3])
        list_json = decode(j_str[23:-3])
        commentIds = list_json["commentIds"]
        comments = list_json["comments"]
        print(commentIds)
        print(len(commentIds))
        print(len(comments))
        if len(commentIds) > 0:
            for commentsId in commentIds:
                # an entry like "id1,id2,id3" is a reply chain; the first id is the root comment
                id_chain = commentsId.split(",")
                parent_id = ''
                if len(id_chain) > 1:
                    parent_id = id_chain[0]
                for id in id_chain:
                    commentItem = WangyiCommentItem()
                    commentItem["article_id"] = commenrItem["n_id"]
                    commentItem["comment_date"] = comments[id]["createTime"]
                    commentItem["comment_id"] = id
                    commentItem["parent_id"] = parent_id
                    try:
                        comment_name = comments[id]["user"]["nickname"]
                        if comment_name is None:
                            comment_name = "火星网友"
                    except (KeyError, TypeError):
                        # fall back to NetEase's default anonymous nickname
                        comment_name = "火星网友"
                    commentItem["comment_name"] = comment_name
                    commentItem["post_id"] = sm3Util.getAuthorId(self, comments[id]["source"], comment_name)
                    commentItem["author_id"] = sm3Util.getAuthorId(self, self.n_collection_web, comment_name)
                    commentItem["level"] = comments[id]["buildLevel"]
                    commentItem["comment_content"] = comments[id]["content"]
                    commentItem["comment_love_num"] = comments[id]["vote"]
                    commentItem["comment_criticism_num"] = ""
                    print(commentItem)
                    yield commentItem
            # request the next page of comments; str() is needed because offset and page_limit are ints
            doc_id = commenrItem["doc_id"]
            page_limit = commenrItem["page_limit"]
            offset = commenrItem["offset"] + page_limit
            commenrItem['offset'] = offset
            comment_url = "https://comment.api.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/" + doc_id + "/comments/newList?offset=" + str(offset) + "&limit=" + str(page_limit) + "&headLimit=3&tailLimit=2&ibc=newswap&showLevelThreshold=5&callback=callback_" + str(int(round(time.time() * 1000)))
            yield scrapy.Request(url=comment_url, callback=self.parse_comment, meta={'commenrItem': commenrItem})
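The spider above imports BaseItem, WangyiCommentItem, CommenrItem, and sm3Util from the wangyi_mobile package, but those files are not shown here. Below is only a rough sketch reconstructed from the fields and calls used in the spider; the real files in the repository may declare more fields or use a different id scheme, and the SM3 helper is assumed to rely on the gmssl package.
# wangyi_mobile/items.py (sketch inferred from the fields assigned above)
import scrapy

class BaseItem(scrapy.Item):
    n_id = scrapy.Field()
    n_title = scrapy.Field()
    n_link = scrapy.Field()
    n_description = scrapy.Field()
    n_content = scrapy.Field()
    n_keywords = scrapy.Field()
    n_source = scrapy.Field()
    n_author = scrapy.Field()
    n_author_id = scrapy.Field()
    n_classify = scrapy.Field()
    n_publish_date = scrapy.Field()
    n_crawling_time = scrapy.Field()
    n_comment_num = scrapy.Field()
    n_collection_web = scrapy.Field()

class WangyiCommentItem(scrapy.Item):
    article_id = scrapy.Field()
    comment_id = scrapy.Field()
    parent_id = scrapy.Field()
    comment_name = scrapy.Field()
    comment_date = scrapy.Field()
    comment_content = scrapy.Field()
    comment_love_num = scrapy.Field()
    comment_criticism_num = scrapy.Field()
    post_id = scrapy.Field()
    author_id = scrapy.Field()
    level = scrapy.Field()

class CommenrItem(scrapy.Item):  # spelling kept as in the project
    doc_id = scrapy.Field()
    n_id = scrapy.Field()
    offset = scrapy.Field()
    page_limit = scrapy.Field()

# wangyi_mobile/sm3Util.py (hypothetical sketch, assuming the gmssl package)
from gmssl import sm3, func

class sm3Util:
    def getAuthorId(self, web, key):
        # called as sm3Util.getAuthorId(spider, web, key), so "self" receives the spider and is unused;
        # the "<site>_<key>" digest format is an assumption, not taken from the repository
        return sm3.sm3_hash(func.bytes_to_list((str(web) + "_" + str(key)).encode("utf-8")))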
The source above is the crawler for NetEase (网易) news.
To start the spider, just run main.py; once it is running, the crawled content is stored in an Elasticsearch (ES) database.
# main.py: launches the spider programmatically, equivalent to running "scrapy crawl 163.mobile"
from scrapy import cmdline
cmdline.execute('scrapy crawl 163.mobile'.split())
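The Elasticsearch pipeline itself is not reproduced here. As a rough idea of how the yielded items end up in ES, a minimal pipeline sketch using the official elasticsearch client might look like the following; the host, index names, and routing logic are assumptions rather than the project's actual code, and the pipeline would still have to be enabled under ITEM_PIPELINES in settings.py.
# Hypothetical wangyi_mobile/pipelines.py sketch; the repository's real pipeline may differ.
from elasticsearch import Elasticsearch
from wangyi_mobile.items import BaseItem, WangyiCommentItem

class ElasticsearchPipeline:
    def open_spider(self, spider):
        # the ES address to adjust, as noted in the spider docstring
        self.es = Elasticsearch(["http://localhost:9200"])

    def process_item(self, item, spider):
        # route articles and comments to separate indices (index names are assumptions)
        if isinstance(item, BaseItem):
            self.es.index(index="wangyi_news", id=item.get("n_id"), body=dict(item))
        elif isinstance(item, WangyiCommentItem):
            self.es.index(index="wangyi_comment", id=item.get("comment_id"), body=dict(item))
        return item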