网络爬虫-微信公众号-近期文章-MySQL数据库

通过搜狗微信搜索爬取到的文章网址都是临时链接。为了让保存下来的链接长期有效,我调用了“微信临时链接转永久链接”的接口,把临时链接转换为永久链接后再写入数据库。

import re
import json
import time
import pymysql
import requests
from bs4 import BeautifulSoup
from ShowapiRequest import ShowapiRequest

# Keyword arguments for pymysql.connect(). The host, user, password and
# database name are blank in this listing and must be filled in before use.
# DictCursor makes every fetched row a dict keyed by column name.
config = dict(
    host='.com',
    port=3306,
    user='',
    password='',
    db='',
    charset='utf8mb4',
    cursorclass=pymysql.cursors.DictCursor,
)

# Profile-page URLs of the official accounts to crawl, in processing order.
# The list is empty in this listing; the position of a URL in the list decides
# which category ids its articles are stored under.
_URL = []

# The profile page embeds its recent-article list as a JSON object whose text
# ends with "}}]}"; this pattern captures that whole object.
MSG_LIST_PATTERN = re.compile(r'{(.+)}}]}')

# Shape of the permanent article link inside the ShowAPI response body.
PERMANENT_LINK_PATTERN = re.compile(
    r'http://mp.weixin.qq.com/s(.+)#wechat_redirect')

# ShowAPI "temporary link -> permanent link" endpoint and credentials.
# NOTE(review): the original script passed placeholder values on the first
# attempt and these values on retries; one set is now used for every attempt.
# Replace the app id / secret with your own before running.
SHOWAPI_URL = "http://route.showapi.com/1456-1"
SHOWAPI_APP_ID = "70333"
SHOWAPI_SECRET = "00"

# NOTE(review): gmt_create / gmt_modified are hard-coded timestamps, kept
# as-is so stored data does not change; consider NOW() instead.
INSERT_SQL = (
    "INSERT INTO entrant_page_sum "
    "(pageid, image, abstracts, address, title, category1_id, category2_id, "
    "category3_id, weight, gmt_create, gmt_modified) "
    "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, '0', "
    "'2018-07-20 19:27:01', '2018-07-20 19:27:03')"
)

RETRY_DELAY_SECONDS = 3


def category_ids(num):
    """Return ``(category1_id, category2_id)`` for the account at index *num*.

    Accounts 0-6 share category ('6', '29').  Accounts from 7 on belong to
    category1 '3' with consecutive category2 ids starting at 15, except that
    account 9 shares id '16' with account 8, so later ids are shifted by one.

    The original kept a running counter that was decremented once per
    *inserted article* of account 9, which made the ids of all later accounts
    depend on how many rows happened to be inserted.  Deriving the id from
    the index alone makes the mapping deterministic.
    """
    if num <= 6:
        return '6', '29'
    if num < 9:
        return '3', str(num + 8)
    if num == 9:
        return '3', '16'
    return '3', str(num + 7)


def extract_article_list(page_text):
    """Return the ``list`` array of the JSON article data embedded in a page.

    Returns an empty list when the page contains no such JSON object, instead
    of failing with ``AttributeError`` on a missing regex match.
    """
    match = MSG_LIST_PATTERN.search(page_text)
    if match is None:
        return []
    return json.loads(match.group())['list']


def fetch_profile_page(url):
    """Download *url* until it contains a ``.profile_nickname`` element.

    Returns ``(response, soup)``.  ``select()`` returns a list (never
    ``None``), so the retry condition tests for an empty result; the original
    ``== None`` comparison could never trigger a retry.
    """
    while True:
        response = requests.get(url)
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'html.parser')
        if soup.select('.profile_nickname'):
            return response, soup
        print('_Soup Error!')
        time.sleep(RETRY_DELAY_SECONDS)


def to_permanent_link(temp_url):
    """Convert a temporary article URL to its permanent form via ShowAPI.

    Retries every few seconds until the response contains a permanent link.
    """
    while True:
        request = ShowapiRequest(SHOWAPI_URL, SHOWAPI_APP_ID, SHOWAPI_SECRET)
        request.addBodyPara("url", temp_url)
        match = PERMANENT_LINK_PATTERN.search(request.post().text)
        if match is not None:
            return match.group()
        print('_Sql_Address Error!')
        time.sleep(RETRY_DELAY_SECONDS)


def main():
    """Crawl every account in ``_URL`` and insert unseen articles into MySQL.

    An article counts as already stored when its digest equals an
    ``abstracts`` value in ``entrant_page_sum``.  New rows get
    ``pageid`` / ``category3_id`` equal to the running row count.
    """
    connection = pymysql.connect(**config)
    try:
        with connection.cursor() as cursor:
            # execute() returns the number of selected rows; it seeds the
            # running id used for pageid and category3_id.
            row_count = cursor.execute(
                "SELECT abstracts FROM entrant_page_sum")
            # fetchall() yields one dict per row (DictCursor), so collect the
            # column values into a set for O(1) duplicate checks.
            known_abstracts = {row['abstracts'] for row in cursor.fetchall()}

            for num, url in enumerate(_URL):
                print('{}-'.format(num + 1), end='')
                response, soup = fetch_profile_page(url)
                account_name = soup.select('.profile_nickname')[0].text.strip()
                print('文章来源:{}'.format(account_name))

                category1_id, category2_id = category_ids(num)
                articles = extract_article_list(response.text)
                if not articles:
                    print('No article list found, skipping.')

                for entry in articles:
                    info = entry['app_msg_ext_info']
                    print(info['cover'])
                    if info['digest'] in known_abstracts:
                        print('have')
                        continue

                    row_count += 1
                    temp_url = 'https://mp.weixin.qq.com{}'.format(
                        info['content_url']).replace('amp;', '')
                    address = to_permanent_link(temp_url)

                    cursor.execute(INSERT_SQL, (
                        row_count, info['cover'], info['digest'], address,
                        info['title'], category1_id, category2_id, row_count,
                    ))
                    connection.commit()
                    # Remember the digest so the same article is not inserted
                    # twice within one run.
                    known_abstracts.add(info['digest'])

                    print('    {}、{} OK!'.format(row_count, info['title']))
                    time.sleep(1)
    finally:
        # Close the connection even when a request or query fails.
        connection.close()


if __name__ == "__main__":
    main()

# Disabled debugging output. It is wrapped in a bare string literal so it is
# never executed; when enabled inside the article loop it would print the
# title, digest, temporary link and cover-image URL of one article entry.
'''
print('    标题名:{}'.format(_jd['list'][i]['app_msg_ext_info']['title']))
print('    说明:{}'.format(_jd['list'][i]['app_msg_ext_info']['digest']))
print('    标题临时链接:https://mp.weixin.qq.com{}'.format(_jd['list'][i]['app_msg_ext_info']['content_url']).replace('amp;', ''))
print('    图片链接:{}'.format(_jd['list'][i]['app_msg_ext_info']['cover']))
'''

登录数据库测试:

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值