def job():
    """Fetch the latest complaint posts and push unseen ones to a WeChat robot.

    Steps performed on every call:

    1. Request one page of the merchant post ranking from ts.21cn.com and
       parse the JSON body.
    2. Keep at most the first ten entries of ``postList`` and collect their
       ``ctime``, ``id``, ``picture``, ``title``, ``shortTopic`` and
       ``shuqiu`` fields plus a detail-page URL built from the ``id``.
    3. Convert ``ctime`` from a Unix timestamp to a naive UTC ``datetime``.
    4. Send every post whose ``id`` is not already present in the results
       workbook to the webhook as a single-article "news" message.
    5. Append those posts to the workbook, sorted by ``ctime``.

    Returns:
        None. The function works purely through side effects (HTTP calls
        and rewriting the results workbook).

    Raises:
        urllib.error.URLError: If the ranking API cannot be reached.
        ValueError: If the API response is not valid JSON.
        requests.RequestException: If posting to the webhook fails at the
            transport level (HTTP status codes are not inspected).
    """
    import json
    from datetime import datetime, timezone
    from urllib import request

    import pandas as pd

    result_file = '聚投诉爬虫结果.xlsx'
    max_posts = 10      # the original script only ever looked at ten posts
    http_timeout = 30   # seconds; keeps a stalled connection from hanging the job

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
    }
    jts_url = "https://ts.21cn.com/front/api/ranking/merchantPostList.do?pageNo=1&merchantId=7037&listType=1&offset=b911ff99c1c81c99"
    jts_req = request.Request(url=jts_url, headers=headers)
    # The context manager closes the connection even if reading fails.
    with request.urlopen(jts_req, timeout=http_timeout) as jts_respon:
        jts_html = jts_respon.read().decode('utf-8', 'ignore')

    # SECURITY: this used to be eval(jts_html), which executes whatever the
    # remote server returns and also breaks on the JSON literals
    # true/false/null. json.loads only parses data.
    jts_dict = json.loads(jts_html)

    # Tolerate a missing/empty 'postList' and pages with fewer than ten posts
    # instead of failing with TypeError/IndexError.
    posts = (jts_dict.get('postList') or [])[:max_posts]
    if not posts:
        return

    columns = ['ctime', 'id', 'picture', 'title', 'shortTopic', 'shuqiu', 'tail_url']
    records = [
        {
            'ctime': post.get('ctime'),
            'id': post.get('id'),
            'picture': post.get('picture'),
            'title': post.get('title'),
            'shortTopic': post.get('shortTopic'),
            'shuqiu': post.get('shuqiu'),
            'tail_url': "https://ts.21cn.com/tousu/show/id/" + str(post.get('id')),
        }
        for post in posts
    ]
    output2 = pd.DataFrame(records, columns=columns)

    def stamp_to_datetime(stamp):
        """Convert a Unix timestamp (e.g. 1539100800) to a naive UTC datetime.

        :param stamp: seconds since the epoch.
        :return: ``datetime`` without tzinfo, truncated to whole seconds
            (e.g. 2018-10-09 16:00:00).
        """
        # Equivalent to the deprecated datetime.utcfromtimestamp(); tzinfo is
        # dropped so the value can be written to Excel, and microseconds are
        # dropped to match the old strftime/strptime round-trip.
        aware = datetime.fromtimestamp(stamp, tz=timezone.utc)
        return aware.replace(tzinfo=None, microsecond=0)

    output2['ctime'] = output2['ctime'].apply(stamp_to_datetime)

    def output_wx(title, shortTopic, tail_url, picture):
        """Post one complaint to the WeChat robot as a single-article news message.

        :param title: article title.
        :param shortTopic: article description.
        :param tail_url: link opened when the article is clicked.
        :param picture: URL of the article picture.
        """
        import requests

        header = {"Content-Type": "application/json", "Charset": "utf-8"}
        url = "微信机器人地址"
        message = {
            "msgtype": "news",
            "news": {
                "articles": [
                    {
                        "title": title,
                        "description": shortTopic,
                        "url": tail_url,
                        "picurl": picture,
                    }
                ]
            },
        }
        # NOTE(review): the response is not inspected, as in the original;
        # consider checking the webhook's reply if delivery must be guaranteed.
        requests.post(url, data=json.dumps(message), headers=header,
                      timeout=http_timeout)

    # 'encoding' is not a read_excel parameter on current pandas (it raises
    # TypeError) and never affected .xlsx workbooks, so it is not passed.
    try:
        remain_jts = pd.read_excel(result_file)
    except FileNotFoundError:
        # First run: no history yet, so every fetched post counts as new.
        remain_jts = pd.DataFrame(columns=columns)

    unseen = ~output2['id'].isin(remain_jts['id'])
    input_jts = output2[unseen].reset_index(drop=True)
    if input_jts.empty:
        # Nothing new to announce or record.
        return

    for row in input_jts.itertuples(index=False):
        output_wx(row.title, row.shortTopic, row.tail_url, row.picture)

    # Avoid concatenating with an empty frame (dtype inference for that case
    # is deprecated in recent pandas).
    if remain_jts.empty:
        merged = input_jts
    else:
        merged = pd.concat([remain_jts, input_jts], ignore_index=True)
    merged = merged.sort_values('ctime')
    merged.to_excel(result_file, index=False)