# 搜狗微信爬虫获取文章信息
#
# 版权声明:本文为博主原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
# 本文链接:https://blog.csdn.net/qq_38044574/article/details/88949986
#
# author: Voccoo
# time: 2019-4-1

"""
1.本demo只是为了爬取指定公众号或指定关键字下公众号,限定时间内所发送的文章。
  若要获取公众号信息,一并存取,请根据第一条gzhurl做以修改,或者从文章中直接获取

2.本demo只是匆忙间完成的,还有许多细节并不够完美。
  比如对返回值为空的判断等。
  若要使用请根据自己的需求加以修改

3.本次代理使用了redis来存储,这是作者自己的习惯存储方式。
  若要使用,请自行修改方法redis_proxy().

4.‘用代理,就上芝麻IP!’

5.本demo只获取到文章的名称,更多信息请自行修改获取。
"""

from fake_useragent import UserAgent
import requests, time
from scrapy import Selector
import random
import redis, json
from urllib.parse import quote


# redis为ip池
# 从redis中获取ip
#
def redis_proxy():
    """Pop one proxy IP from the redis 'ips' list and build a proxies mapping.

    Returns:
        dict: requests-style proxies mapping, e.g.
        {'http': 'https://1.2.3.4:8080', 'https': 'https://1.2.3.4:8080'}.

    NOTE(review): blpop blocks forever when the pool is empty — consider
    passing a timeout. Credentials are hard-coded; move to configuration.
    """
    redis_conn = redis.StrictRedis(host='localhost',
                                   password='Cs123456.',
                                   port=6379,
                                   db=1
                                   )
    # blpop returns a (key, value) pair; value is a JSON blob, presumably
    # of the form {"ip": "host:port"} — confirm against the pool writer.
    redis_ip = redis_conn.blpop('ips')
    ip = json.loads(redis_ip[1].decode('UTF-8'))
    address = 'https://{}'.format(ip['ip'])
    # Fix: map both schemes. The original set only 'https', so any plain-http
    # request would silently bypass the proxy. Callers that read
    # proxies['https'] are unaffected.
    proxy = {
        'http': address,
        'https': address,
    }

    return proxy


# 获取html
# 获取html (article page)
def get_html_act(url, referer):
    """Fetch an article page through rotating proxies until HTTP 200.

    :param url: article URL to fetch.
    :param referer: unused here; kept for signature parity with get_html().
    :return: scrapy Selector built from the response body.

    NOTE(review): loops forever if every proxy fails — consider a retry cap.
    """
    ua = UserAgent()
    while True:

        proxies = redis_proxy()
        try:
            headers = {
                'User-Agent': ua.random,
                'Upgrade-Insecure-Requests': '1',
            }

            session = requests.session()
            # Warm-up request so the session collects mp.weixin.qq.com cookies.
            session.get('https://mp.weixin.qq.com/',
                        headers=headers,
                        proxies=proxies,
                        timeout=3
                        )
            # Bug fix: the original fetched via requests.get(), which discarded
            # the cookies gathered by the warm-up session.get() above.
            html = session.get(url,
                               headers=headers,
                               proxies=proxies,
                               # allow_redirects=False,
                               timeout=3)

            if html.status_code == 200:
                # print(html.text)
                return Selector(text=html.text)

            else:
                print('---状态码---{}被封了---!'.format(proxies['https']))

        except requests.RequestException:
            # Timeout / connection error: drop this proxy, try the next one.
            print('-----超时抛错----')


# 获取html
# 获取html (sogou search page)
def get_html(url, referer):
    """Keep requesting *url* with freshly drawn proxies until HTTP 200.

    :param url: weixin.sogou.com URL to fetch.
    :param referer: value for the Referer header.
    :return: scrapy Selector wrapping the page HTML.
    """
    agent_pool = UserAgent()
    while True:

        proxy_map = redis_proxy()
        try:
            request_headers = {
                "Host": "weixin.sogou.com",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
                'User-Agent': agent_pool.random,
                "ContentType": "text/xml;charset=utf-8",
                'Referer': referer,
                'Upgrade-Insecure-Requests': '1',
            }

            response = requests.session().get(url,
                                              headers=request_headers,
                                              proxies=proxy_map,
                                              allow_redirects=False,
                                              timeout=3)

            if response.status_code != 200:
                # This proxy got blocked; fall through and draw another.
                print('---状态码---{}被封了---!'.format(proxy_map['https']))
            else:
                return Selector(text=response.text)

        except Exception:
            print('-----超时抛错----')


def run(gzh, start_time, endtime):
    """Crawl article titles of a 公众号 published inside a date window.

    :param gzh: official-account name, or a plain search keyword.
    :param start_time: window start date, 'YYYY-MM-DD'.
    :param endtime: window end date, 'YYYY-MM-DD'.
    :return: None; titles (or raw links on failure) are printed.

    NOTE: only the first result page of the account search is consumed;
    extend the wxid lookup yourself if keyword search needs paging.
    """
    encoded = quote(gzh)
    # Account-search page: each <li> carries the account's wxid in its
    # 'd' attribute.
    gzh_url = ('https://weixin.sogou.com/weixin?type=1&s_from=input&query={}'
               '&ie=utf8&_sug_=n&_sug_type_='.format(encoded))
    gzh_html = get_html(gzh_url, 'https://weixin.sogou.com/')
    wxid_list = gzh_html.css('ul.news-list2 li::attr(d)').extract()

    # Referer is the same for every article-listing request; build it once.
    referer = ('https://weixin.sogou.com/weixin?type=2&s_from=input&query={}'
               '&ie=utf8&_sug_=n&_sug_type_='.format(encoded))
    listing_tpl = ('https://weixin.sogou.com/weixin?type=2&ie=utf8&query={}'
                   '&tsn=5&ft={}&et={}&interation=&wxid={}&usip={}&page={}')

    for wxid in wxid_list:
        page_no = 1
        while True:
            page_url = listing_tpl.format(encoded, start_time, endtime,
                                          wxid, encoded, page_no)
            listing = get_html(page_url, referer)
            article_urls = listing.css(
                'div.news-box ul.news-list li div.txt-box h3 a::attr(data-share)'
            ).extract()

            # A full page of 10 results means another page may follow.
            has_next = len(article_urls) == 10
            if has_next:
                print('--翻页--进入第{}页--'.format(page_no + 1))
                page_no += 1

            for share_url in article_urls:
                article_html = get_html_act(share_url, '')
                title = article_html.css('#activity-name::text').extract_first()
                if title:
                    # Print the article title found on the current page.
                    print(title.strip())
                else:
                    print(share_url)

            if not has_next:
                break


if __name__ == '__main__':
    # Search window start date
    begin_date = '2019-03-01'
    # Search window end date
    finish_date = '2019-04-01'
    # Official-account name (a plain keyword also works)
    account = '痴海'

    run(account, begin_date, finish_date)

# (removed CSDN page boilerplate accidentally pasted here:
#  "展开阅读全文" / "没有更多推荐了,返回首页")