# -*- coding: utf-8 -*-
"""
Created on Tue May 19 16:03:44 2020
@author: weiping
"""
import pandas as pd
import requests
import urllib, sys
from urllib import request as req
import ssl,base64
import time
from lxml import etree
# Wrapper around the Baidu OCR (image text recognition) API
def image_dist(file):
    """Run Baidu general OCR on a local image and return its first text line.

    The image is read from disk, base64-encoded and POSTed as a form field to
    the ``ocr/v1/general`` endpoint.

    Args:
        file: Path of the image file to upload.

    Returns:
        The ``words`` value of the first entry in the response's
        ``words_result`` list, or ``None`` when the response body is empty.

    Raises:
        OSError: If the file cannot be read or the HTTP request fails
            (``urllib.error.URLError`` is an ``OSError`` subclass).
        ValueError: If the response body is not valid JSON
            (``json.JSONDecodeError``).
        KeyError, IndexError: If the response carries no ``words_result``
            entry.
    """
    import json  # local import keeps this function self-contained

    # Pause before every call; presumably to stay under the API rate limit.
    time.sleep(1)
    # NOTE(review): hard-coded credential, valid for 30 days only; it should
    # be loaded from configuration instead of living in the source.
    access_token = '24.4b919381b150522b4c62e47426ea0ffc.2592000.1592464956.282335-15270291'
    url = 'https://aip.baidubce.com/rest/2.0/ocr/v1/general?access_token=' + access_token

    # Read the image in binary mode; ``with`` closes the handle even when
    # reading fails.
    with open(file, 'rb') as f:
        # Request parameter "image": the base64-encoded image bytes.
        img = base64.b64encode(f.read())

    params = {"image": img}
    # The url-encoded form body is pure ASCII, so the codec only converts
    # str -> bytes.
    par = urllib.parse.urlencode(params).encode('ascii')
    request = req.Request(url, par)
    request.add_header('Content-Type', 'application/x-www-form-urlencoded')

    with req.urlopen(request) as response:
        content = response.read()

    if not content:
        return None

    # Parse the body as JSON data. It must never be passed to eval(): that
    # would execute whatever the remote end sent, and it also breaks on the
    # JSON literals true/false/null.
    result = json.loads(content.decode())
    return result['words_result'][0]['words']
# HTTP request headers: identify as a legacy desktop browser.
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = dict([('User-Agent', user_agent)])

# Column accumulators filled by the scraping loop, one entry per listed user.
name, gender = [], []
fans_num, zans_num, videos_num = [], [], []
# Scrape the first 10 pages
# Directory that receives the downloaded metric images (loop-invariant).
file = 'D:\\python\\抖音kol爬取\\photo\\'

for j in range(1, 11):
    ur = 'https://kolranking.com/home?s=&category=&ot=DESC&order=follower_count&page=' + str(j)
    # The response gets its own name so it is never confused with the
    # per-row image index used further down.
    resp = requests.get(ur, headers=headers)
    html = etree.HTML(resp.text)

    # Names found on this page only; they drive the per-row image lookup.
    name2 = []
    names = html.xpath('//table[@class="table user-list"]//tr/td/a')
    for i in names:
        name.append(i.text)
        name2.append(i.text)

    genders = html.xpath('//table[@class="table user-list"]//tr/td[@class="align-center"]')
    for ii in genders:
        gender.append(ii.text)

    # Image URLs found in the right-aligned cells. They are consumed three
    # per row, in the order fans, likes, videos -- assumed from how they are
    # indexed here; verify against the page markup.
    html_data = html.xpath('//table[@class="table user-list"]//tr/td[@class="align-right"]//@src')
    for idx, name_i in enumerate(name2):
        base = 3 * idx
        fans = html_data[base]
        zans = html_data[base + 1]
        videos = html_data[base + 2]

        path_fans = file + str(name_i) + 'fans.png'
        path_zans = file + str(name_i) + 'zans.png'
        path_videos = file + str(name_i) + 'videos.png'

        # Download the images first and OCR the local copies, because the
        # OCR endpoint could not recognise these images when given the URL.
        req.urlretrieve(fans, path_fans)
        req.urlretrieve(zans, path_zans)
        req.urlretrieve(videos, path_videos)

        # Every name must contribute exactly one entry to each metric list,
        # otherwise the columns drift out of step with `name`. A failed OCR
        # call therefore records a placeholder and moves on to the next
        # metric instead of skipping the rest of the row.
        for column, image_path in (
            (fans_num, path_fans),
            (zans_num, path_zans),
            (videos_num, path_videos),
        ):
            try:
                column.append(image_dist(image_path))
            except Exception:
                # Best effort: keep scraping, but let Ctrl-C / SystemExit
                # propagate (a bare ``except`` would swallow them).
                column.append('-')
# Wrap every accumulated column in a Series named after its output column.
name_n, gerder_n, fans_n, zans_n, videos_n = (
    pd.Series(values, name=label)
    for values, label in (
        (name, 'name'),
        (gender, 'gender'),
        (fans_num, 'fans_num'),
        (zans_num, 'zans_num'),
        (videos_num, 'videos_num'),
    )
)

# Place the five Series side by side as the columns of the result table.
dy_kol = pd.concat(
    [name_n, gerder_n, fans_n, zans_n, videos_n],
    axis=1,
)