Python: a NetEase news crawler built on the Scrapy framework, integrated with Elasticsearch and Redis

Code repository: https://gitee.com/hardyJia/python_scrapy_wangyi_mobile.git

Main files:
- main.py
- Wangyi3GSpider.py

import json
import re
import scrapy
import time
import sys
import redis
from demjson import decode
sys.path.append('..')
from wangyi_mobile.items import BaseItem, WangyiCommentItem, CommenrItem
from wangyi_mobile.sm3Util import sm3Util

"""
爬取方式:滚动新闻-根据标题筛选-详情页-评论接口
记得改redis地址、es存储位置和es地址!

"""


class Wangyi3GSpider(scrapy.Spider):
    name = '163.mobile'
    base_url = 'http://3g.163.com/touch/reconstruct/article/list/{}/{}-10.html'
    # Shared Redis client: used for URL de-duplication and for reading the keyword list.
    # decode_responses=True makes lrange/sadd return and accept str instead of bytes.
    pool = redis.ConnectionPool(host='localhost', port=6379, decode_responses=True)
    redis_conn = redis.Redis(connection_pool=pool)
    redis_key = 'wangyiRoll1'
    n_collection_web = '网易'
    # Titles are filtered against the keywords stored in the Redis list KEYWORDS_LIST
    keywords = redis_conn.lrange("KEYWORDS_LIST", 0, -1)
    type_list = ['BBM54PGAwangning', 'BA8EE5GMwangning', 'BA8D4A3Rwangning']  # headlines, finance, technology

    def start_requests(self):
        for channel in self.type_list:
            for page in range(0, 301, 10):  # each channel exposes at most 310 entries
                url = self.base_url.format(channel, page)
                yield scrapy.Request(url, callback=self.parse, meta={'type': channel})

    def parse(self, response):
        try:
            j_str = response.body.decode("utf-8")
        except UnicodeDecodeError:
            j_str = response.body.decode("gb18030")
            print("utf-8 decode failed for 163.com rolling list, falling back to gb18030")
        # The list API returns JSONP; strip the callback wrapper before parsing
        daresta = json.loads(j_str[9:-1])
        news = daresta[response.meta['type']]
        for data in news:
            item = BaseItem()
            item['n_title'] = data['title']
            item['n_comment_num'] = data['commentCount']
            docid = data['docid']
            publish_date = data['ptime']
            item['n_publish_date'] = int(time.mktime(time.strptime(publish_date, "%Y-%m-%d %H:%M:%S")))
            item['n_description'] = data['digest']
            item['n_link'] = "https://3g.163.com/news/article/" + docid + ".html"
            for keyword in self.keywords:
                if re.search(keyword, item["n_title"]) is not None:
                    item["n_keywords"] = keyword
                    # sadd returns 0 for links already in the set, so only new links are crawled
                    return_code = self.redis_conn.sadd(self.redis_key, item["n_link"])
                    if return_code != 0:
                        yield scrapy.Request(item["n_link"], callback=self.parse_detail, meta={'item': item})
                        break

    def parse_detail(self, response):
        item = response.meta['item']
        content_results = response.xpath("//div[@class='content']//p")
        pic_results = response.xpath("//div[@class='content']//div[@class='photo']//a")

        # Collect paragraph text and photo links from the article body
        content = [sel.xpath("string()").extract_first() for sel in content_results]
        pic = [sel.xpath("./@href").extract_first() for sel in pic_results]

        item['n_content'] = content + pic
        item['n_collection_web'] = self.n_collection_web
        item["n_id"] = sm3Util.getAuthorId(self, self.n_collection_web, item['n_link'])
        item['n_crawling_time'] = int(time.time())
        item['n_source'] = response.xpath("string(//meta[@property='article:author']/@content)").extract_first()
        item['n_author'] = item['n_source']
        item['n_classify'] = '新闻网站'
        print("超链接")
        print(item['n_comment_num'])
        print(response.url)
        doc_id=response.url.split("article/")[1][:-5]
        print(doc_id)

        if item['n_author'] is not None:
            item['n_author_id'] = sm3Util.getAuthorId(self, self.n_collection_web, item['n_author'])
        yield item
        comment_url = "https://comment.api.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/" + doc_id+"/comments/newList?offset=0&limit=30&headLimit=3&tailLimit=2&ibc=newswap&showLevelThreshold=5&callback=callback_"+str(int(round(time.time() * 1000)))
        # comment_url="https://comment.api.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/G9JIKK990511B8LM/comments/newList?ibc=newspc&limit=30&showLevelThreshold=72&headLimit=1&tailLimit=2&offset=0&"
        if item['n_comment_num'] > 0:
            commenrItem=CommenrItem()
            commenrItem['doc_id']=doc_id
            commenrItem['n_id']=item['n_id']
            commenrItem['offset']=0
            commenrItem['page_limit']=30
            yield scrapy.Request(url=comment_url, callback=self.parse_comment, meta={'commenrItem': commenrItem})

    def parse_comment(self, response):
        commenrItem = response.meta['commenrItem']
        try:
            j_str = response.body.decode("gb18030")
        except UnicodeDecodeError:
            j_str = response.body.decode("utf-8")
            print("gb18030 decode failed for 163.com comments, falling back to utf-8")
        # Strip the JSONP callback wrapper before parsing
        list_json = decode(j_str[23:-3])
        commentIds = list_json["commentIds"]
        comments = list_json["comments"]
        print(commentIds, len(commentIds), len(comments))
        if len(commentIds) > 0:
            for commentsId in commentIds:
                # Each entry is a comma-separated reply chain; the first id is treated as the parent
                id_chain = commentsId.split(",")
                parent_id = ''
                if len(id_chain) > 1:
                    parent_id = id_chain[0]
                for id in id_chain:
                    # print(id)
                    # print(list_json["comments"][id])
                    commentItem = WangyiCommentItem()
                    commentItem["article_id"] = commenrItem["n_id"]
                    commentItem["comment_date"] = list_json["comments"][id]["createTime"]
                    commentItem["comment_id"] = id
                    commentItem["parent_id"] = parent_id
                    try:
                        comment_name = list_json["comments"][id]["user"]["nickname"]
                        if comment_name is None:
                            comment_name = "火星网友"
                    except (KeyError, TypeError):
                        # Anonymous comments carry no nickname; fall back to the site default
                        comment_name = "火星网友"
                    commentItem["comment_name"] = comment_name
                    commentItem["post_id"] = sm3Util.getAuthorId(self, list_json["comments"][id]["source"], comment_name)
                    commentItem["author_id"] = sm3Util.getAuthorId(self, self.n_collection_web, comment_name)
                    commentItem["level"] = list_json["comments"][id]["buildLevel"]
                    commentItem["comment_content"] = list_json["comments"][id]["content"]
                    commentItem["comment_love_num"] = list_json["comments"][id]["vote"]
                    commentItem["comment_criticism_num"] = ""
                    print(commentItem)
                    yield commentItem
            # Page forward through the comment API until a page returns no ids
            doc_id = commenrItem["doc_id"]
            page_limit = commenrItem["page_limit"]
            offset = commenrItem["offset"] + page_limit
            commenrItem['offset'] = offset
            comment_url = ("https://comment.api.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/"
                           + doc_id + "/comments/newList?offset=" + str(offset) + "&limit=" + str(page_limit)
                           + "&headLimit=3&tailLimit=2&ibc=newswap&showLevelThreshold=5&callback=callback_"
                           + str(int(round(time.time() * 1000))))
            yield scrapy.Request(url=comment_url, callback=self.parse_comment, meta={'commenrItem': commenrItem})
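
The title filter only matches keywords it finds in the Redis list KEYWORDS_LIST, so that list has to be populated before the spider is started. A minimal seeding sketch, assuming a local Redis on the default port; the keywords shown are placeholders:

import redis

# Hypothetical helper: push the keywords the spider should match against news titles.
r = redis.Redis(host='localhost', port=6379, decode_responses=True)
r.delete("KEYWORDS_LIST")                 # start from a clean list
r.rpush("KEYWORDS_LIST", "财经", "科技")   # replace with your own keywords
print(r.lrange("KEYWORDS_LIST", 0, -1))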

The code above is the crawler for NetEase news. To start it, simply run main.py, which just hands control to Scrapy's command line; once running, everything the crawler collects is written to Elasticsearch.

from scrapy import cmdline
cmdline.execute('scrapy crawl 163.mobile'.split())
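
The Elasticsearch write itself happens in the project's item pipeline, which is not shown here. As a rough sketch of what such a pipeline can look like, assuming the elasticsearch-py client (7.x API), a local ES node and an illustrative index name wangyi_news:

from elasticsearch import Elasticsearch

class ESPipeline(object):
    """Hypothetical sketch: index every crawled item into Elasticsearch."""

    def open_spider(self, spider):
        # Replace with the real ES address used by the project
        self.es = Elasticsearch(["http://localhost:9200"])

    def process_item(self, item, spider):
        doc = dict(item)
        # News items carry n_id, comment items carry comment_id; use whichever is present
        doc_id = doc.get("n_id") or doc.get("comment_id")
        self.es.index(index="wangyi_news", id=doc_id, body=doc)
        return item

A pipeline like this would be enabled through ITEM_PIPELINES in the project's settings.py, which is also where the Redis and ES addresses mentioned in the spider's docstring belong.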