使用python抓取搜狗引擎公众号文章

最新推荐文章于 2021-12-14 13:54:37 发布

machunjie2003

最新推荐文章于 2021-12-14 13:54:37 发布

阅读量297

点赞数 1

本文链接：https://blog.csdn.net/machunjie2003/article/details/82801770

版权

初学python，以写一个爬虫程序入手，记录一下学习过程。

参考：http://www.cnblogs.com/haq5201314/p/9215569.html

还有些内容忘了从哪里参考的了。

首先搭建python运行环境python3.7.0，安装集成开发环境pycharm,具体过程忘了。

由于程序是模拟谷歌浏览器运行，需要下载chromedriver.exe，下载成功后放到项目根目录

下面开始正式编写。

首先，写个数据库连接类。我用的是 SQL Server 2008 R2，dbhelper.py 文件需要导入 pymssql。

1、查询语句执行方法

def ExecQuery(self,sql,params=None):
    """
    Run a query statement and return every row it produces.

    The result is a list of tuples: each list element is one record and each
    tuple element is one field of that record.

    :param sql: the SQL text to execute.
    :param params: optional values handed to ``cursor.execute`` together with
        ``sql`` so the driver performs the substitution; when omitted the
        statement is executed exactly as given.
    :return: the rows returned by ``cursor.fetchall()``.
    """
    cur = self.__GetConnect
    try:
        if params is None:
            cur.execute(sql)
        else:
            cur.execute(sql, params)
        return cur.fetchall()
    finally:
        # The connection must be closed once the query is done -- including
        # when execute()/fetchall() raises, otherwise it would leak.
        self.conn.close()

2、非查询语句执行方法

def ExecNonQuery(self,sql,params=None):
    """
    Run a statement that returns no rows (insert/update/delete) and commit it.

    :param sql: the SQL text to execute.
    :param params: optional values handed to ``cursor.execute`` together with
        ``sql`` so the driver performs the substitution; when omitted the
        statement is executed exactly as given.
    """
    cur = self.__GetConnect
    try:
        if params is None:
            cur.execute(sql)
        else:
            cur.execute(sql, params)
        self.conn.commit()
    finally:
        # Close the connection whether or not the statement succeeded, so a
        # failing statement does not leave the connection open.
        self.conn.close()

操作数据库的类写完了，开始正式抓取数据。

先模拟打开搜狗引擎的搜索微信公众号的页面

from urllib.parse import quote

# `name` is the official-account name to search for. It is percent-encoded
# before being placed in the query string so that characters such as '&', '#'
# or spaces in the name cannot corrupt the URL.
url='http://weixin.sogou.com/weixin?type=1&s_from=input&query={}&ie=utf8&_sug_=y&_sug_type_=&w=01015002&oq=jike&ri=0&sourceid=sugg&stj=0%3B0%3B0%3B0&stj2=0&stj0=0&stj1=0&hp=36&hp1=&sut=4432&sst0=1529305369937&lkt=5%2C1529305367635%2C1529305369835'.format(quote(str(name)))
# The timeout keeps the script from waiting forever on a stalled connection.
r = requests.get(url=url, headers=headers, timeout=10)
# Each match runs from "src=" to the end of its line and carries the
# timestamp / ver / signature parameters of a result link.
rsw = re.findall('src=.*&amp;timestamp=.*&amp;ver=.*&amp;signature=.*', str(r.text))

然后从搜索到的结果中选择第一个打开，进入公众号文章列表页

# Take the first search hit, keep every shortest run that ends in "==",
# glue the runs back together and turn each ';' into '&' to obtain the
# query string of the account's profile page.
first_hit = str(rsw[0])
profile_query = "".join(re.findall('.*?==', first_hit)).replace(';', '&')
urls = 'https://mp.weixin.qq.com/profile?' + profile_query

# Load the profile page in a Chrome instance driven by selenium.
driver = webdriver.Chrome()
driver.get(urls)

由于列表页的数据都是由js动态加载出来的，如果按照常规方法取html标签中的内容是取不到值的，这时候需要直接从js脚本里获取数据，这里会用到selenium库，直接在官网下载安装就行了。

# Evaluate "return msgList" inside the loaded page and hand the value of the
# page's `msgList` JavaScript variable back to Python.
r = driver.execute_script("return msgList")

从js里取数据就上面一行代码，不得不说python太强大了……

msgList是公众号文章列表中js变量的名称，有兴趣的可以在列表页查看源代码看一下。

数据已经拿到，下面就是解析并插入到数据库了。

源码：dbhelper.py

#coding=utf-8
#!/usr/bin/env python

import pymssql


class MSSQL:
    """Small pymssql helper that opens a fresh connection for every statement.

    Each call to :meth:`ExecQuery` / :meth:`ExecNonQuery` connects, runs one
    statement and closes the connection again.
    """

    def __init__(self,host,user,pwd,db):
        """Remember the connection settings; no connection is opened here."""
        self.host = host
        self.user = user
        self.pwd = pwd
        self.db = db

    @property
    def __GetConnect(self):
        """Open a new connection, store it on ``self.conn`` and return a cursor.

        :raises NameError: if no database name was configured, or if no cursor
            could be obtained from the new connection.
        """
        if not self.db:
            # ``raise(NameError, "...")`` tries to raise a tuple, which Python 3
            # rejects with TypeError; raise the intended exception instead.
            raise NameError("没有设置数据库信息")
        self.conn = pymssql.connect(host=self.host,user=self.user,password=self.pwd,database=self.db,charset="utf8")
        cur = self.conn.cursor()
        if not cur:
            # Do not leave the just-opened connection behind on failure.
            self.conn.close()
            raise NameError("连接数据库失败")
        return cur

    def ExecQuery(self,sql,params=None):
        """Run a query and return all rows as a list of tuples.

        :param sql: the SQL text to execute.
        :param params: optional values handed to ``cursor.execute`` together
            with ``sql`` so the driver performs the substitution; when omitted
            the statement is executed exactly as given.
        :return: the rows returned by ``cursor.fetchall()``.
        """
        cur = self.__GetConnect
        try:
            if params is None:
                cur.execute(sql)
            else:
                cur.execute(sql, params)
            return cur.fetchall()
        finally:
            # The connection must be closed after the query, also on failure.
            self.conn.close()

    def ExecNonQuery(self,sql,params=None):
        """Run a statement that returns no rows and commit it.

        :param sql: the SQL text to execute.
        :param params: optional values handed to ``cursor.execute`` together
            with ``sql``; when omitted the statement is executed as given.
        """
        cur = self.__GetConnect
        try:
            if params is None:
                cur.execute(sql)
            else:
                cur.execute(sql, params)
            self.conn.commit()
        finally:
            # Close the connection whether or not the statement succeeded.
            self.conn.close()

抓取数据类：

from selenium import webdriver
import requests
import re


class crawhelper:
    """Finds a WeChat official account through Sogou and reads its article list."""

    @staticmethod
    def grab(name, timeout=10):
        """Return the ``msgList`` JavaScript value of the account's profile page.

        The account is searched on weixin.sogou.com, the first result is opened
        in Chrome through selenium, and the page-level ``msgList`` variable is
        returned as selenium converts it.

        :param name: name of the official account to search for.
        :param timeout: seconds to wait for the Sogou search request.
        :return: whatever ``return msgList`` evaluates to in the profile page.
        :raises IndexError: if the search page contains no matching result link.
        :raises SystemExit: after printing a notice when a captcha page
            (a page containing "验证码") is served.
        """
        # Imported here so the function does not depend on edits elsewhere.
        from urllib.parse import quote

        headers={}
        # Percent-encode the name so '&', '#', spaces etc. cannot break the URL.
        url='http://weixin.sogou.com/weixin?type=1&s_from=input&query={}&ie=utf8&_sug_=y&_sug_type_=&w=01015002&oq=jike&ri=0&sourceid=sugg&stj=0%3B0%3B0%3B0&stj2=0&stj0=0&stj1=0&hp=36&hp1=&sut=4432&sst0=1529305369937&lkt=5%2C1529305367635%2C1529305369835'.format(quote(str(name)))
        r = requests.get(url=url, headers=headers, timeout=timeout)
        page_text = str(r.text)
        if '验证码' in page_text:
            print('[-]发现验证码请访问URL:{}后在重新运行此脚本'.format(r.url))
            raise SystemExit

        # Each match runs from "src=" to the end of its line.
        rsw = re.findall('src=.*&amp;timestamp=.*&amp;ver=.*&amp;signature=.*', page_text)
        if not rsw:
            # Same exception type the bare rsw[0] lookup used to raise, but
            # with a message that says what actually went wrong.
            raise IndexError('[-]未在搜索结果中找到公众号:{}'.format(name))

        # Keep the first hit up to its last "==" and turn ';' into '&'.
        qd = "".join(re.findall('.*?==', str(rsw[0]))).replace(';', '&')
        urls = 'https://mp.weixin.qq.com/profile?' + qd

        driver = webdriver.Chrome()
        try:
            driver.get(urls)
            if '验证码' in str(driver.page_source):
                print('[-]发现验证码请访问URL:{}后在重新运行此脚本'.format(driver.current_url))
                raise SystemExit
            # The article list is built by JavaScript, so read the variable
            # straight from the page instead of parsing the HTML.
            return driver.execute_script("return msgList")
        finally:
            # Without this every call would leave a Chrome process running.
            driver.quit()

解析插入数据库类

#!/usr/bin/python
# -*- coding: utf-8 -*-
#从搜狗搜索公众号并抓取数据，保存到数据库(已有数据跳过)
import psp
import dbhelper
#import pymssql
import time
import datetime
#import SlopeTwo
#import numpy as np
#import pandas as pd
#from itertools import groupby
#import  requests
#import  re
from bs4 import BeautifulSoup

from urllib.request import urlopen
#import  sys
#import importlib
#importlib.reload(sys)
#codeint=sys.getdefaultencoding()
import  chardet

#获取公众号素材详细内容
# Fetch the detailed content of one official-account article.
def getContent(url, timeout=30):
    """Download an article page and return its ``rich_media_content`` divs.

    :param url: address of the article page.
    :param timeout: seconds to wait for the server before giving up.
    :return: the result of ``soup.find_all`` for ``div`` elements with class
        ``rich_media_content`` (empty when the page has none).
    :raises SystemExit: after printing a notice when the page contains
        "验证码" (a captcha page).
    """
    # The context manager closes the HTTP response even if reading or
    # decoding fails.
    with urlopen(url, timeout=timeout) as page:
        html = page.read().decode('utf-8')
    if '验证码' in html:
        print('[-]发现验证码请访问URL:{}后在重新运行此脚本'.format(url))
        raise SystemExit
    soup = BeautifulSoup(html, "html.parser")
    return soup.find_all('div', {'class': 'rich_media_content'})

def _sql_quote(value):
    """Return *value* as text with every single quote doubled.

    The database helper is called with a finished SQL string, so every value
    placed between quotes is escaped here; an apostrophe in a title or author
    name would otherwise break the statement.
    """
    return str(value).replace("'", "''")


def _read_article(info):
    """Collect the fields of one article entry and download its body.

    :param info: one article dict of the crawler payload; the keys "author",
        "content_url", "cover" and "title" are read.
    :return: dict with "author", "url", "content", "cover" and "title".
    :raises KeyError: if one of the keys is missing.
    :raises IndexError: if the article page has no content element.
    """
    author = info["author"]
    url = ("https://mp.weixin.qq.com" + info["content_url"]).replace('&amp;', '&')
    content = str(getContent(url)[0])
    cover = info["cover"].replace('&amp;', '&')
    title = info["title"]
    return {"author": author, "url": url, "content": content, "cover": cover, "title": title}


def _store_article(ms, account_id, account_name, article, index, date_time, create_time):
    """Insert *article* into ``news`` unless the account already has that title.

    :param ms: database helper providing ``ExecQuery`` / ``ExecNonQuery``.
    :param account_id: id of the account the article belongs to.
    :param account_name: account name, used only for the progress line.
    :param article: dict produced by ``_read_article``.
    :param index: value stored in ``index_id`` (1 for the lead article).
    :param date_time: publish time text, stored as update_time and public_time.
    :param create_time: creation time text for the row.
    """
    print(account_name+":"+date_time+":第"+str(index)+"条")
    existing = ms.ExecQuery(
        "select * from news where title='%s' and account_id='%s'"
        % (_sql_quote(article["title"]), _sql_quote(account_id))
    )
    if existing:
        # Already stored for this account -> skip.
        return
    columns = "update_time,author,thumb_url,title,url,account_id,index_id,create_time,status,content,public_time"
    status = 1
    values = (
        date_time, article["author"], article["cover"], article["title"],
        article["url"], account_id, index, create_time, status,
        article["content"], date_time,
    )
    quoted = ",".join("'%s'" % _sql_quote(v) for v in values)
    ms.ExecNonQuery("insert into news(%s) values(%s)" % (columns, quoted))


if __name__=='__main__':
    try:
        # NOTE(review): connection settings are hard-coded here; consider
        # moving them to configuration.
        ms = dbhelper.MSSQL(host="111.111.222.222",user="sa",pwd="123456",db="Test")
        # Accounts with status=2 (described by the author as the normally
        # connected accounts).
        accounts = ms.ExecQuery('select id,name from account where status=2')
        for account_id, name in accounts:
            # Article list of this account (per the author's note: roughly the
            # latest 10 days -- not verifiable from this script).
            messages = psp.crawhelper.grab(name)["list"]
            for item in messages:
                try:
                    info = item["app_msg_ext_info"]
                    article = _read_article(info)
                    sub_articles = info["multi_app_msg_item_list"]
                    # Publish time with minute precision.
                    date_time = time.strftime("%Y-%m-%d %H:%M", time.localtime(item["comm_msg_info"]["datetime"]))
                    index = 1
                    # Drop the last three characters of the timestamp text.
                    create_time = str(datetime.datetime.now())[0:-3]
                    _store_article(ms, account_id, name, article, index, date_time, create_time)
                    for sub_info in sub_articles:
                        try:
                            sub_article = _read_article(sub_info)
                            # Only advance the position once the sub-article
                            # could actually be read.
                            index += 1
                            _store_article(ms, account_id, name, sub_article, index, date_time, create_time)
                        except Exception as m:
                            # Best effort: report and move on to the next one.
                            print(str(m))
                            continue
                except Exception as n:
                    # A broken message must not stop the remaining ones.
                    print(str(n))
                    continue
        print('完成')
    except Exception as e:
        print(str(e))