Python web scraping + image recognition (API call)

The script below scrapes the Douyin KOL ranking pages on kolranking.com. The follower, like, and video counts are rendered as images, so each count image is downloaded locally, sent to the Baidu OCR general text recognition API, and the recognized numbers are assembled into a pandas DataFrame.

# -*- coding: utf-8 -*-
"""
Created on Tue May 19 16:03:44 2020

@author: weiping
"""
import pandas as pd
import requests
import urllib.parse
from urllib import request as req
import base64, json
import time
from lxml import etree
# Define the call to the Baidu OCR (general text recognition) API
def image_dist(file):
    time.sleep(1)
    access_token = '24.4b919381b150522b4c62e47426ea0ffc.2592000.1592464956.282335-15270291' # valid for 30 days
    url = 'https://aip.baidubce.com/rest/2.0/ocr/v1/general?access_token=' + access_token
    # Open the image file in binary mode and base64-encode it (the API's "image" parameter)
    with open(file, 'rb') as f:
        img = base64.b64encode(f.read())
    params = {"image": img}
    par = urllib.parse.urlencode(params).encode('utf-8')
    request = req.Request(url, par)
    request.add_header('Content-Type', 'application/x-www-form-urlencoded')
    response = req.urlopen(request)
    content = response.read()
    if content:
        # The response is JSON; return the first recognized line of text
        return json.loads(content.decode())['words_result'][0]['words']
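The access_token above expires after 30 days. Below is a minimal sketch of refreshing it through Baidu's OAuth token endpoint; api_key and secret_key are placeholders for the credentials of your own application in the Baidu AI console:

def get_access_token(api_key, secret_key):
    # Exchange the application's API Key / Secret Key (placeholders here) for a fresh access_token
    resp = requests.get(
        'https://aip.baidubce.com/oauth/2.0/token',
        params={'grant_type': 'client_credentials',
                'client_id': api_key,
                'client_secret': secret_key})
    return resp.json()['access_token']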

  
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)' 
headers = {'User-Agent': user_agent}  # request headers

name = []
gender =[]
fans_num = []
zans_num = []
videos_num = []
# Scrape the first 10 pages of the ranking
for j in range(1,11):

    ur = 'https://kolranking.com/home?s=&category=&ot=DESC&order=follower_count&page=' + str(j) 
    t = requests.get(ur, headers=headers) 

    html = etree.HTML(t.text)
    name2=[]
    names = html.xpath('//table[@class="table user-list"]//tr/td/a')
    for i in names:
        name.append(i.text)
        name2.append(i.text)
    
    genders = html.xpath('//table[@class="table user-list"]//tr/td[@class="align-center"]')
    for ii in genders:
        gender.append(ii.text)
    

    # Each row exposes three counts rendered as images, in order: fans, likes, videos
    html_data = html.xpath('//table[@class="table user-list"]//tr/td[@class="align-right"]//@src')
    for t, name_i in enumerate(name2):
        fans = html_data[3*t]
        zans = html_data[3*t + 1]
        videos = html_data[3*t + 2]
        file = 'D:\\python\\抖音kol爬取\\photo\\' 
        path_fans = file + str(name_i) + 'fans.png'
        path_zans = file + str(name_i) + 'zans.png'
        path_videos = file + str(name_i) + 'videos.png'
        # Download each image locally, then upload and OCR it (the API cannot recognize the image from its URL directly)
        req.urlretrieve(fans,path_fans)
        req.urlretrieve(zans,path_zans)
        req.urlretrieve(videos,path_videos)
        # OCR each image; on failure record '-' so the three lists stay aligned with the names
        try:
            fans_num.append(image_dist(path_fans))
        except Exception:
            fans_num.append('-')
        try:
            zans_num.append(image_dist(path_zans))
        except Exception:
            zans_num.append('-')
        try:
            videos_num.append(image_dist(path_videos))
        except Exception:
            videos_num.append('-')


name_n = pd.Series(name, name='name')
gender_n = pd.Series(gender, name='gender')
fans_n = pd.Series(fans_num, name='fans_num')
zans_n = pd.Series(zans_num, name='zans_num')
videos_n = pd.Series(videos_num, name='videos_num')

dy_kol = pd.concat([name_n, gender_n, fans_n, zans_n, videos_n], axis=1)
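To keep the result, the DataFrame can be written out, for example (the output path below is just an illustration, reusing the image folder above):

# utf-8-sig keeps the Chinese names readable when the CSV is opened in Excel
dy_kol.to_csv('D:\\python\\抖音kol爬取\\dy_kol.csv', index=False, encoding='utf-8-sig')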

 
