Use a crawler to scrape the content of WeChat official-account articles (like counts and view counts are not included).

The 安徽日报 (Anhui Daily) official account is used as the example.

The approach relies on the article-list interface of the WeChat subscription-account admin backend.
In testing, fetching fewer than about 1,000 articles did not get the account blocked, but crawling more than roughly 1,100 in one go got the account blocked for a while.
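
As a quick illustration of that mechanism, here is a minimal sketch of a single article-list request against the cgi-bin/appmsg endpoint, using the same parameters as the full spider below. The cookie and token values are placeholders you have to take from the subscription-account admin page yourself, the list_articles helper is just an illustrative name, and the 3-second pause is a cautious pacing choice given the ban threshold mentioned above.

import time
import requests

APPMSG_URL = "https://mp.weixin.qq.com/cgi-bin/appmsg"


def list_articles(cookie, token, fakeid, begin=0, count=5):
    '''Fetch one page (count items) of article metadata for an official account.'''
    headers = {"Cookie": cookie, "User-Agent": "Mozilla/5.0"}
    params = {
        "token": token,        # taken from the admin-page URL (placeholder)
        "lang": "zh_CN",
        "f": "json",
        "ajax": "1",
        "action": "list_ex",
        "begin": str(begin),   # offset into the article list, a multiple of 5
        "count": str(count),
        "query": "",
        "fakeid": fakeid,      # unique id of the account, equivalent to __biz
        "type": "9",
    }
    return requests.get(APPMSG_URL, headers=headers, params=params, timeout=30).json()


if __name__ == "__main__":
    for begin in range(0, 50, 5):          # stay far below the observed ban threshold
        page = list_articles("your-cookie", "your-token", "MzA5NTc5ODQwMA==", begin)
        for item in page.get("app_msg_list", []):
            print(item["title"], item["link"])
        time.sleep(3)                      # pause between pages

The full spider below wraps the same request in a class, adds checkpointing to an Excel file, and downloads the article bodies in threads.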

# -*- coding: utf-8 -*-
import requests
import time
import xlwt
import xlrd
import re
import os
import threading

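# Create a requests session and try to disable keep-alive; note that this session object
# is not actually used below: the spider issues plain requests.get calls instead.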
s = requests.session()
s.keep_alive = False


class AnhuiDailySpider():
    def __init__(self, KeyWords=["疫情"]):
        '''Two key parameters for the web requests'''
        self.cookies = "paste here the cookies of the WeChat subscription-account admin-page request"
        self.DownloadHeaders = {
                        "Cookie": self.cookies,
                        "User-Agent": "Mozilla/5.0 (Linux; Android 10; STK-AL00 Build/HUAWEISTK-AL00; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/78.0.3904.62 XWEB/2759 MMWEBSDK/201201 Mobile Safari/537.36 MMWEBID/1378 MicroMessenger/8.0.1.1841(0x280001B6) Process/toolsmp WeChat/arm64 Weixin NetType/WIFI Language/zh_CN ABI/arm64"
        }
        '''News counter'''
        self.NewsCount = 0
        '''Workbook where news title / link / publish time are saved'''
        self.workbook = xlwt.Workbook(encoding='utf-8')
        self.Worksheet = self.workbook.add_sheet('AnhuiDaily_articles')
        self.NewsPath = "AnhuiDaily.xls"
        '''Directory where the article bodies are saved'''
        self.NewsContentPath = "./Download/"
        '''Headers for the article-content requests'''
        self.GetContentHeaders = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36"
        }
        '''Endpoint used to fetch the article list'''
        self.wechat_url = "https://mp.weixin.qq.com/cgi-bin/appmsg"
        '''Page number to resume crawling from (checkpoint for interrupted runs)'''
        self.startPages = self.downloadPages()
        '''Keyword filter option'''
        self.KeyWords = KeyWords
        '''Strings that should be stripped out of the article text'''
        self.needClearWords = ['<!-- 注意:这个文件是一个公共文件,被很多地方引用,改动需要注意其他类型页面是否有受影响 -->', '安徽日报']
        self.createSavePath()

    def createSavePath(self):
        if not os.path.exists(self.NewsContentPath):
            os.mkdir(self.NewsContentPath)

    def downloadPages(self):
        '''Read the checkpoint file; start from page 0 if it does not exist yet'''
        if not os.path.exists("downloadPages.xls"):
            return 0
        AnhuiDaily = xlrd.open_workbook("downloadPages.xls")
        AnhuiDailyArticles = AnhuiDaily.sheets()[0]
        downloadPages = AnhuiDailyArticles.cell(0, 0).value
        return int(downloadPages)

    '''Convert a Unix timestamp into a human-readable datetime string'''
    def timeStampTrans(self, timeStamp):
        timeArray = time.localtime(timeStamp)
        otherStyleTime = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)
        return otherStyleTime

    """
    需要提交的data
    以下个别字段是否一定需要还未验证。
    注意修改yourtoken,number
    number表示从第number页开始爬取,为5的倍数,从0开始。如0、5、10……
    token可以使用Chrome自带的工具进行获取
    fakeid是公众号独一无二的一个id,等同于后面的__biz
    """
    def downloadNewsUrls(self):
        '''Check how many pages have already been downloaded, so the crawl resumes from the checkpoint'''
        for newsPage in range(self.startPages, 316):
            print("Downloading page " + str(newsPage))
            data = {
                    "token": 1130076224,
                    "lang": "zh_CN",
                    "f": "json",
                    "ajax": "1",
                    "action": "list_ex",
                    "begin": str(newsPage),
                    "count": "5",
                    "query": "",
                    "fakeid": "MzA5NTc5ODQwMA==",
                    "type": "9",
            }
            '''Fetch the page with a GET request'''
            content_json = requests.get(self.wechat_url, headers=self.DownloadHeaders, params=data).json()
            '''The response is a JSON object with the articles of this page'''
            beforeCount = self.NewsCount
            try:
                for item in content_json["app_msg_list"]:
                    NewsTitle = item['title']
                    Newslink = item['link']
                    NewsTime = self.timeStampTrans(item['create_time'])
                    SingleNews = [NewsTitle, Newslink, NewsTime]
                    for i in range(0, 3):
                        self.Worksheet.write(self.NewsCount, i, SingleNews[i])
                    self.NewsCount += 1
                print("Page " + str(newsPage) + ": " + str(self.NewsCount - beforeCount) + " articles on this page, " + str(self.NewsCount) + " in total")
                time.sleep(3)
                self.workbook.save(self.NewsPath)
            except KeyError:
                # app_msg_list is missing, which means the account has been blocked/rate-limited
                if newsPage != self.downloadPages():
                    # save the current page as the checkpoint so the crawl can resume later
                    Pages = xlwt.Workbook(encoding='utf-8')
                    PageSheet = Pages.add_sheet('downloadPages')
                    PageSheet.write(0, 0, str(newsPage))
                    Pages.save("downloadPages.xls")
                    break
                else:
                    print("Account is currently blocked")
                    break

    def downLoadArticles(self):
        for newsPage in range(0, 316):
            print("Downloading page " + str(newsPage))
            data = {
                    "token": 271655701,
                    "lang": "zh_CN",
                    "f": "json",
                    "ajax": "1",
                    "action": "list_ex",
                    "begin": str(newsPage),
                    "count": "5",
                    "query": "",
                    "fakeid": 'MzA5NTc5ODQwMA==',
                    "type": "9",
            }
            # Fetch the page with a GET request
            content_json = requests.get(self.wechat_url, headers=self.DownloadHeaders, params=data).json()
            # The response is a JSON object with the articles of this page
            for item in content_json["app_msg_list"]:
                NewsTitle = item['title']
                Newslink = item['link']
                NewsTime = self.timeStampTrans(item['create_time'])
                SingleNews = [NewsTitle, Newslink, NewsTime]
                for i in range(0, 3):
                    self.Worksheet.write(self.NewsCount + 1, i, SingleNews[i])
                self.NewsCount += 1
                print(self.NewsCount)
            time.sleep(10)
            self.workbook.save(self.NewsPath)
        print("Download finished, " + str(self.NewsCount) + " articles in total")

    '''Clean the scraped text'''
    def dataClear(self, data):
        for word in self.needClearWords:
            data = data.replace(word, '')
        return data

    def collectNewsContent(self, threads):
        totalPages = len(self.readUrl("AnhuiDaily_Part1.xls"))
        print("There are " + str(totalPages) + " articles to download")
        # pick a thread count: totalPages divided by its largest divisor below 100
        for index in range(1, 100):
            if totalPages % index == 0:
                thread_num = int(totalPages / index)
        print("Starting " + str(thread_num) + " threads")
        each = totalPages / thread_num
        i = -1
        for i in range(int(thread_num) - 1):
            startPage, endPage = int(i * each), int((i + 1) * each)
            t = threading.Thread(target=ScrapyInfo, args=[self.DownloadHeaders, startPage, endPage])
            t.start()
            threads.append(t)
            time.sleep(1)
        # the last thread takes whatever is left, up to totalPages
        t = threading.Thread(target=ScrapyInfo, args=(self.DownloadHeaders, int((i + 1) * each), totalPages))
        t.start()
        threads.append(t)
        for thread in threads:
            thread.join()

    def readUrl(self, path):
        '''Read the article links (second column) from the spreadsheet'''
        ArticlesUrls = []
        AnhuiDaily = xlrd.open_workbook(path)
        AnhuiDailyArticles = AnhuiDaily.sheets()[0]
        for newsIndex in range(AnhuiDailyArticles.nrows):
            if not AnhuiDailyArticles.cell(newsIndex, 1).value:
                break
            ArticlesUrls.append(AnhuiDailyArticles.cell(newsIndex, 1).value)
        return ArticlesUrls

    def readTitle(self, path):
        '''Read the article titles (first column) from the spreadsheet'''
        ArticlesTitle = []
        AnhuiDaily = xlrd.open_workbook(path)
        AnhuiDailyArticles = AnhuiDaily.sheets()[0]
        for newsIndex in range(AnhuiDailyArticles.nrows):
            if not AnhuiDailyArticles.cell(newsIndex, 0).value:
                break
            ArticlesTitle.append(AnhuiDailyArticles.cell(newsIndex, 0).value)
        return ArticlesTitle


def ScrapyInfo(headers, startPage, endPage):
    '''Thread worker: download, clean and save the articles with indexes in [startPage, endPage)'''
    AnhuiDailyDemo = AnhuiDailySpider()
    Urls = AnhuiDailyDemo.readUrl(path="AnhuiDaily_Part1.xls")
    Titles = AnhuiDailyDemo.readTitle(path="AnhuiDaily_Part1.xls")
    for urlIndex in range(startPage, endPage):
        try:
            res = requests.get(url=Urls[urlIndex], headers=headers, timeout=30)
        except Exception as e:
            print("******** An exception occurred ********")
            print(str(e) + " offending link: " + Urls[urlIndex])
            continue
        print(str(urlIndex) + " " + Titles[urlIndex])
        res.encoding = 'utf-8'
        NewsContent = AnhuiDailyDemo.dataClear(res.text)
        # keep only the Chinese characters of the page
        regex = re.compile(r'[\u4e00-\u9fa5]+')
        chineseResult = regex.findall(NewsContent)
        NewsContentString = "".join(chineseResult)
        with open(AnhuiDailyDemo.NewsContentPath + str(urlIndex) + ".txt", "a+", encoding="utf-8") as f:
            f.write(NewsContentString)


if __name__ == "__main__":
    '''放置线程的列表'''
    threads = []
    '''开启的线程的数量'''
    thread_nums = 400
    os.system("cls")
    KeyWords = ["疫情"]
    AnhuiDailyDemo = AnhuiDailySpider(KeyWords=KeyWords)
    AnhuiDailyDemo.collectNewsContent(threads)
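
A note on running it: collectNewsContent reads the links from AnhuiDaily_Part1.xls, so the article list has to be collected first. A possible sequence, assuming you split or rename the generated AnhuiDaily.xls into AnhuiDaily_Part1.xls yourself (the script does not do that step), would be:

    # AnhuiDailyDemo.downloadNewsUrls()            # 1. build AnhuiDaily.xls (title / link / time)
    #                                              # 2. split/rename it into AnhuiDaily_Part1.xls
    # AnhuiDailyDemo.collectNewsContent(threads)   # 3. download and clean the article bodies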