通过搜狗微信爬取到的网址都是临时链接,为了保证链接长期有效,我采用了将微信临时链接转换为永久链接的接口。
import re
import json
import time
import pymysql
import requests
from bs4 import BeautifulSoup
from ShowapiRequest import ShowapiRequest
# Keyword arguments for pymysql.connect(); the script unpacks this mapping
# with ** when it opens the connection. Host, user, password and database
# name are blank placeholders in this listing.
config = dict(
    host='.com',
    port=3306,
    user='',
    password='',
    db='',
    charset='utf8mb4',
    # DictCursor makes fetched rows dicts keyed by column name.
    cursorclass=pymysql.cursors.DictCursor,
)
_URL = [
]
# Open the database connection described by `config` and get a cursor on it.
connection = pymysql.connect(**config)
cursor = connection.cursor()

# Read the abstract of every row currently stored in entrant_page_sum.
# The query text is bound to `_select_abstracts` instead of `sum` so the
# built-in sum() is not shadowed at module level.
_select_abstracts = "SELECT abstracts FROM entrant_page_sum"
# execute() returns the number of rows produced by the SELECT; the crawl loop
# increments this counter and uses it as the pageid of each inserted row.
_sql_sum = cursor.execute(_select_abstracts)
# With DictCursor every row is a dict, e.g. {'abstracts': ...}.
_result = cursor.fetchall()
def _fetch_profile(url):
    """Download *url* and return ``(response, soup)``.

    The response is decoded as UTF-8 and parsed with BeautifulSoup's
    ``html.parser`` backend.
    """
    response = requests.get(url)
    response.encoding = 'utf-8'
    return response, BeautifulSoup(response.text, 'html.parser')


def _find_permanent_link(api_url, app_id, secret, temp_url):
    """POST *temp_url* through ShowapiRequest and look for the converted link.

    Returns the ``re`` match for ``http://mp.weixin.qq.com/s...#wechat_redirect``
    in the reply text, or ``None`` when the reply does not contain one.
    """
    request = ShowapiRequest(api_url, app_id, secret)
    request.addBodyPara("url", temp_url)
    reply = request.post()
    return re.search('http://mp.weixin.qq.com/s(.+)#wechat_redirect', reply.text)


# Offset used to derive category2_id for the pages at index 7 and above.
cate_id = 0

# Abstracts that were in the table when the script started. fetchall()
# returns a list of row dicts, so the rows are collected into a set for the
# duplicate check instead of indexing the list with a string key (which
# raises TypeError).
_known_abstracts = {row['abstracts'] for row in _result}

# Create a new record
# NOTE(review): gmt_create / gmt_modified are hard-coded timestamps.
sql = "INSERT INTO entrant_page_sum (pageid, image, abstracts, address, title, category1_id, category2_id, category3_id, weight, gmt_create, gmt_modified) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, '0', '2018-07-20 19:27:01', '2018-07-20 19:27:03')"

try:
    for num, _page_url in enumerate(_URL):
        print('{}-'.format(num + 1), end='')
        _wechats, _soup = _fetch_profile(_page_url)
        # select() returns a (possibly empty) list, never None, so emptiness
        # is the condition that means "page not served properly; retry".
        while not _soup.select('.profile_nickname'):
            time.sleep(3)
            print('_Soup Error!')
            _wechats, _soup = _fetch_profile(_page_url)
        _wechats_name = _soup.select('.profile_nickname')[0].text.strip()
        print('文章来源:{}'.format(_wechats_name))
        # Not used further in the visible code; kept as in the original.
        _wechats_value = _soup.select('.profile_desc_value')[0].text.strip()

        # The article list is embedded in the page source as a JSON object.
        _search = re.search('{(.+)}}]}', _wechats.text).group()
        _jd = json.loads(_search)

        for _article in _jd['list']:
            _info = _article['app_msg_ext_info']
            print(_info['cover'])

            # Skip articles whose digest is already stored in the table.
            _sql_abstracts = _info['digest']
            if _sql_abstracts in _known_abstracts:
                print('have')
                continue

            _sql_sum += 1
            wechat_address = 'https://mp.weixin.qq.com{}'.format(_info['content_url']).replace('amp;', '')

            # NOTE(review): the first attempt and the retries pass different
            # ShowapiRequest arguments (placeholders vs. a concrete endpoint);
            # both are reproduced unchanged -- confirm which is intended.
            _sql_address = _find_permanent_link("url", "id", "serect", wechat_address)
            while _sql_address is None:
                time.sleep(3)
                print('_Sql_Address Error!')
                _sql_address = _find_permanent_link(
                    "http://route.showapi.com/1456-1", "70333", "00", wechat_address)
            _sql_address = _sql_address.group()

            _sql_title = _info['title']
            _sql_image = _info['cover']

            if num <= 6:
                # 0~6
                _category1, _category2 = '6', '29'
            elif num != 9:
                # 7~21
                _category1, _category2 = '3', str(cate_id + 8)
            else:
                # NOTE(review): this decrement runs once per inserted article
                # of page 9, not once per page -- confirm that is intended.
                cate_id -= 1
                _category1, _category2 = '3', '16'

            # _sql_sum is written to both pageid and category3_id.
            cursor.execute(sql, (_sql_sum, _sql_image, _sql_abstracts, _sql_address,
                                 _sql_title, _category1, _category2, _sql_sum))
            print(' {}、{} OK!'.format(_sql_sum, _sql_title))
            connection.commit()
            time.sleep(1)
        cate_id += 1
finally:
    # Release the connection even when a request or query raises.
    connection.close()
# Leftover debugging snippet stored as a bare string literal: the print calls
# inside it are never run. They show an article's title, digest, temporary
# link and cover image URL.
'''
print(' 标题名:{}'.format(_jd['list'][i]['app_msg_ext_info']['title']))
print(' 说明:{}'.format(_jd['list'][i]['app_msg_ext_info']['digest']))
print(' 标题临时链接:https://mp.weixin.qq.com{}'.format(_jd['list'][i]['app_msg_ext_info']['content_url']).replace('amp;', ''))
print(' 图片链接:{}'.format(_jd['list'][i]['app_msg_ext_info']['cover']))
'''
登录数据库测试: