python爬虫+图像识别（接口调用）

最新推荐文章于 2023-05-06 16:43:19 发布

cutwind

最新推荐文章于 2023-05-06 16:43:19 发布

阅读量728

点赞数

分类专栏： python 文章标签： python 图像识别

本文链接：https://blog.csdn.net/cutwind/article/details/106256573

版权

python 专栏收录该内容

23 篇文章 0 订阅

订阅专栏

# -*- coding: utf-8 -*-
"""
Created on Tue May 19 16:03:44 2020

@author: weiping
"""
import pandas as pd
import requests
import urllib, sys 
from urllib import request as req
import ssl,base64
import time
from lxml import etree
#定义百度api图像识别接口
def image_dist(file):
    """Send a local image to the Baidu general OCR endpoint and return its first text.

    Args:
        file: Path of the image file to recognize; it is read in binary mode.

    Returns:
        The ``words`` value of the first entry of ``words_result`` in the
        JSON reply, or ``None`` when the reply body is empty.

    Raises:
        OSError: The image file cannot be read, or the HTTP request fails
            (``urllib.error.URLError`` is an ``OSError`` subclass).
        ValueError: The reply body is not valid JSON.
        KeyError, IndexError: The reply has no ``words_result`` entry
            (NOTE(review): presumably an API error reply or an image with
            no recognized text - confirm against the API documentation).
    """
    # Function-scope import so this block does not depend on the file's import list.
    import json

    # Pause before every request (presumably to stay under the API rate limit).
    time.sleep(1)
    access_token = '24.4b919381b150522b4c62e47426ea0ffc.2592000.1592464956.282335-15270291'  # valid for 30 days
    url = 'https://aip.baidubce.com/rest/2.0/ocr/v1/general?access_token=' + access_token

    # The "image" form field carries the base64-encoded file content.
    with open(file, 'rb') as f:
        img = base64.b64encode(f.read())

    params = {"image": img}
    par = urllib.parse.urlencode(params).encode('gbk')
    request = req.Request(url, par)
    request.add_header('Content-Type', 'application/x-www-form-urlencoded')

    # Close the HTTP response deterministically, even if reading fails.
    with req.urlopen(request) as response:
        content = response.read()

    if not content:
        return None

    # Parse the reply as JSON. The previous eval() executed whatever text the
    # remote server sent back and failed on the JSON literals true/false/null.
    return json.loads(content.decode())['words_result'][0]['words']

  
# HTTP headers sent with every page request: only a User-Agent string is set.
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {}
headers['User-Agent'] = user_agent

# One independent list per output column; the scraping loop appends to them.
name, gender, fans_num, zans_num, videos_num = ([] for _ in range(5))
#爬前10页
# Scrape ranking pages 1 through 10.
for j in range(1, 11):

    ur = 'https://kolranking.com/home?s=&category=&ot=DESC&order=follower_count&page=' + str(j)
    page = requests.get(ur, headers=headers)

    html = etree.HTML(page.text)

    # name2 holds only this page's names; it drives the per-row image loop
    # below, while `name` accumulates across all pages.
    name2 = []
    names = html.xpath('//table[@class="table user-list"]//tr/td/a')
    for i in names:
        name.append(i.text)
        name2.append(i.text)

    genders = html.xpath('//table[@class="table user-list"]//tr/td[@class="align-center"]')
    for ii in genders:
        gender.append(ii.text)

    # Image URLs from the right-aligned cells. They are consumed three per
    # name, in the order fans / zans / videos
    # (NOTE(review): assumes the page renders exactly three images per row - confirm).
    html_data = html.xpath('//table[@class="table user-list"]//tr/td[@class="align-right"]//@src')
    for row, name_i in enumerate(name2):
        first = 3 * row
        fans = html_data[first]
        zans = html_data[first + 1]
        videos = html_data[first + 2]
        file = 'D:\\python\\抖音kol爬取\\photo\\'
        path_fans = file + str(name_i) + 'fans.png'
        path_zans = file + str(name_i) + 'zans.png'
        path_videos = file + str(name_i) + 'videos.png'
        # Download each image to disk and OCR the local copy (the original
        # author notes the OCR interface could not recognize the image URLs).
        req.urlretrieve(fans, path_fans)
        req.urlretrieve(zans, path_zans)
        req.urlretrieve(videos, path_videos)

        # Every row must add exactly one entry to each of the three lists,
        # otherwise the columns drift out of step with `name`. A failed
        # recognition is recorded as '-' and the remaining metrics of the
        # row are still processed (the earlier `continue` skipped them).
        for image_path, column in (
            (path_fans, fans_num),
            (path_zans, zans_num),
            (path_videos, videos_num),
        ):
            try:
                column.append(image_dist(image_path))
            except Exception:
                # Exception rather than a bare except, so Ctrl-C still stops the run.
                column.append('-')


# Wrap each collected list in a Series named after its output column.
name_n, gerder_n, fans_n, zans_n, videos_n = (
    pd.Series(values, name=label)
    for label, values in (
        ('name', name),
        ('gender', gender),
        ('fans_num', fans_num),
        ('zans_num', zans_num),
        ('videos_num', videos_num),
    )
)

# Place the five Series side by side (axis=1) to form the result table.
columns = [name_n, gerder_n, fans_n, zans_n, videos_n]
dy_kol = pd.concat(columns, axis=1)

cutwind

关注

0
点赞
踩
2

收藏

觉得还不错? 一键收藏
0
评论
python爬虫+图像识别（接口调用）

# -*- coding: utf-8 -*-"""Created on Tue May 19 16:03:44 2020@author: weiping"""import pandas as pdimport requestsimport urllib, sys from urllib import request as reqimport ssl,base64import timefrom lxml import etree#定义百度api图像识别接口def image_d.
复制链接

扫一扫

专栏目录