Crawling WeChat Articles via Sogou

# coding: utf8

import requests
import bs4
import json
import sys
import random


# Python 2 workaround so UTF-8 text can be written without explicit
# encoding everywhere; neither needed nor available on Python 3.
reload(sys)
sys.setdefaultencoding("utf-8")

class WXSogou:
    def __init__(self):
        self._session = requests.session()  # keeps cookies across requests
        self._html = None
        self._bs = None

    @staticmethod
    def get_reading(dict_msg, key):
        # First field of the comma-joined stats value: average reading count.
        if isinstance(dict_msg, dict):
            value = dict_msg.get(key, None)
            return None if not value else value.split(',')[0]

    @staticmethod
    def get_posting(dict_msg, key):
        # Second field of the comma-joined stats value: posts per month.
        if isinstance(dict_msg, dict):
            value = dict_msg.get(key, None)
            return None if not value else value.split(',')[1]
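
    # Hedged illustration: the "reading,posting" layout of each value is
    # inferred from the two splits above, not documented by Sogou. For a
    # hypothetical entry {'oIWsFt_abc': '8999,36'}:
    #   get_reading(msg, 'oIWsFt_abc') -> '8999'  (average reading count)
    #   get_posting(msg, 'oIWsFt_abc') -> '36'    (posts per month)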

    @staticmethod
    def get_pure_value(descendants):
        # Concatenate plain text nodes only. bs4 Comments subclass
        # NavigableString, so they must be excluded explicitly.
        r = ''
        for a in descendants:
            if not isinstance(a, bs4.element.Comment) \
               and isinstance(a, bs4.element.NavigableString):
                r += str(a)
        return r
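
    # For example, for a tag parsed from the hypothetical markup
    # '<a>Movie<!--ad--><em>Weekly</em></a>', get_pure_value(tag.descendants)
    # returns 'MovieWeekly': the HTML comment is skipped, the nested tag's
    # text is kept.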

    def parse_reading_and_posting(self):
        """
        Fetch the monthly post count and average reading count for the
        accounts on the current result page.
        """
        # The last inline <script> under div.wrapper carries the relative
        # URL of the JSON endpoint that serves the reading/posting stats.
        script = str(self._bs.find('div', attrs={'class': 'wrapper'}).find_all('script')[-1])
        account_anti_url = 'http://weixin.sogou.com' + script.split('"')[-2]
        r = self._session.get(url=account_anti_url)
        if r.status_code == 200:
            r.encoding = 'utf8'
            json_code = json.loads(r.text)
            if json_code['code'] == 'success':
                return json_code.get('msg', None)
        return None
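
    # Illustrative (assumed) shape of the stats endpoint's response,
    # inferred from the checks above; each key in 'msg' is the account id
    # carried by the result page's <li d="..."> attribute:
    #   {"code": "success", "msg": {"oIWsFt_abc": "8999,36"}}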

    def parse_data_of_html(self):
        """
        Parse and extract the account data from the current Sogou
        result page.
        """
        msg  = self.parse_reading_and_posting()
        info = []
        for li in self._bs.find('ul', attrs={'class': 'news-list2'}).find_all('li'):

            acc_name     = self.get_pure_value(li.find('p', attrs={'class': 'tit'}).a.descendants)
            perm_post    = self.get_posting(msg, str(li['d']))
            aver_reading = self.get_reading(msg, str(li['d']))

            acc_number   = str(li.find('p', attrs={'class': 'info'}).label.contents[0])
            acc_img      = str(li.find('div', attrs={'class': 'img-box'}).img['src'])
            acc_qrcode   = str(li.find('div', attrs={'class': 'ew-pop'}).find_all('img')[-2]['src'])
            acc_url      = str(li.find('div', attrs={'class': 'img-box'}).a['href'])
            acc_intro    = self.get_pure_value(li.dl.dd.descendants)

            # Most recent article; not every account has one, and the
            # markup varies, so each field is extracted defensively.
            had_gone_to_posted = len(li.find_all('dl')) > 1
            tag_dd = li.find_all('dd')

            try:
                art_brief = None if not had_gone_to_posted else self.get_pure_value(tag_dd[-1].a.descendants)
            except (AttributeError, IndexError):
                art_brief = None
            try:
                art_url = None if not had_gone_to_posted else str(tag_dd[-1].a['href'])
            except (AttributeError, IndexError, KeyError):
                art_url = None
            try:
                # The post timestamp sits inside an inline script; pull out
                # the quoted number.
                art_time = None if not had_gone_to_posted else int(tag_dd[-1].span.script.contents[0].split('\'')[-2])
            except (AttributeError, IndexError, ValueError):
                art_time = None

            info.append({
                'acc_name'    : acc_name,
                'perm_post'   : perm_post,
                'aver_reading': aver_reading,
                'acc_number'  : acc_number,
                'acc_img'     : acc_img,
                'acc_qrcode'  : acc_qrcode,
                'acc_url'     : acc_url,
                'acc_intro'   : acc_intro,
                'art_brief'   : art_brief,
                'art_url'     : art_url,
                'art_time'    : art_time
            })
        return info

    def search(self, account, page):
        # Rotate through a pool of desktop User-Agent strings so requests
        # look less uniform.
        agents = [
        "Avant Browser/1.2.789rel1 (http://www.avantbrowser.com)",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5",
        "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.9 (KHTML, like Gecko) Chrome/5.0.310.0 Safari/532.9",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.514.0 Safari/534.7",
        "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/9.0.601.0 Safari/534.14",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/10.0.601.0 Safari/534.14",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.20 (KHTML, like Gecko) Chrome/11.0.672.2 Safari/534.20",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.27 (KHTML, like Gecko) Chrome/12.0.712.0 Safari/534.27",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.24 Safari/535.1",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.120 Safari/535.2",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7",
        "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.10) Gecko/2009042316 Firefox/3.0.10",
        "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-GB; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11 (.NET CLR 3.5.30729)",
        "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 GTB5",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; tr; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8 ( .NET CLR 3.5.30729; .NET4.0E)",
        "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0a2) Gecko/20110622 Firefox/6.0a2",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0.1) Gecko/20100101 Firefox/7.0.1",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0b4pre) Gecko/20100815 Minefield/4.0b4pre",
        "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT 5.0 )",
        "Mozilla/4.0 (compatible; MSIE 5.5; Windows 98; Win 9x 4.90)",
        "Mozilla/5.0 (Windows; U; Windows XP) Gecko MultiZilla/1.6.1.0a",
        "Mozilla/2.02E (Win95; U)",
        "Mozilla/3.01Gold (Win95; I)",
        "Mozilla/4.8 [en] (Windows NT 5.1; U)",
        "Mozilla/5.0 (Windows; U; Win98; en-US; rv:1.4) Gecko Netscape/7.1 (ax)",
        ]

        headers = {
            "Host": "weixin.sogou.com",
            "User-Agent": random.choice(agents),
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
            "Cookie": "your cookie here",  # replace with your own Sogou cookie string
            "Connection": "keep-alive"
        }

        url = 'http://weixin.sogou.com/weixin' \
              '?type=1' \
              '&s_from=input' \
              '&query={account}' \
              '&ie=utf8' \
              '&_sug_=n' \
              '&page={page}'\
              '&_sug_type_='.format(account=account,page=page)

        print url

        r = self._session.get(url=url, headers=headers)
        if r.status_code == 200:
            r.encoding = 'utf8'
            self._html = r.text
            self._bs = bs4.BeautifulSoup(markup=self._html, features='html.parser')
            return self.parse_data_of_html()
        return None


if __name__ == '__main__':
    sogou = WXSogou()
    # Without logging in, Sogou returns at most 100 results, 10 per page.
    for i in range(1, 11):
        info = sogou.search('电影公众号', i)
        if not info:
            # The request failed (or we were blocked); skip this page.
            continue
        with open("weixin_article.json", "a") as f:
            for article in info:
                # One JSON object per line (JSON Lines).
                f.write(json.dumps(article, ensure_ascii=False) + "\n")
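
With one object per line, weixin_article.json is plain JSON Lines. A minimal
sketch for loading the results back (assuming the crawl above ran and the
file exists in the working directory):

# Load the crawl results back from the JSON Lines file.
import json

with open("weixin_article.json") as f:
    records = [json.loads(line) for line in f if line.strip()]

print len(records)            # number of accounts collected
print records[0]['acc_name']  # first account's name (assumes >= 1 record)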

Note: after too many requests, Sogou starts demanding a CAPTCHA.
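
One way to cope is to detect the block and back off before retrying. A hedged
sketch, assuming Sogou redirects blocked clients to an anti-spider page whose
URL contains 'antispider' (observed behavior, not a documented contract):

import time
import random

def polite_get(session, url, headers, max_retries=3):
    # Hypothetical helper: retry with growing, jittered delays whenever
    # the response lands on the assumed anti-spider page.
    for attempt in range(max_retries):
        r = session.get(url=url, headers=headers)
        if 'antispider' not in r.url:
            return r
        time.sleep((2 ** attempt) * 10 + random.uniform(0, 5))
    return None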
