爬取贴吧和 TapTap 上的图片，使用 AI 识别《剑与远征》关卡并对图片重命名

最新推荐文章于 2024-05-10 10:51:51 发布

saberpan

最新推荐文章于 2024-05-10 10:51:51 发布

阅读量591

点赞数

分类专栏： python 爬虫

本文链接：https://blog.csdn.net/saberpan/article/details/105995643

版权

python 同时被 2 个专栏收录

20 篇文章 1 订阅

订阅专栏

爬虫

4 篇文章 0 订阅

订阅专栏

# encoding:utf-8
import base64
import requests
import os
import re
from lxml import html

# Browser-like request headers; SpiderTieba.analysis sends them with each page fetch.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/69.0.3497.100 Safari/537.36"
    ),
}
class SpiderTieba():
    """Download the images posted in a Tieba thread, page by page."""

    # Seconds to wait on each HTTP request so a stalled server cannot hang the crawl.
    REQUEST_TIMEOUT = 10

    def analysis(self, url, xpath, timeout=REQUEST_TIMEOUT):
        """Fetch a page and evaluate an XPath expression against it.

        Args:
            url: Address of the page to download.
            xpath: XPath expression evaluated on the parsed page.
            timeout: Seconds to wait for the HTTP response.

        Returns:
            The result of evaluating ``xpath`` on the parsed document.
        """
        page_source = requests.get(url, headers=headers, timeout=timeout).text
        document = html.etree.HTML(page_source)
        return document.xpath(xpath)

    def takephoto(self, url, dirname,
                  xpath="//li[@class='l_reply_num'][1]/span[2]/text()",
                  base_dir='D:/'):
        """Save every ``BDE_Image`` image of a thread into ``base_dir``/``dirname``.

        Args:
            url: Thread address; ``?pn=<page>`` is appended for each page.
            dirname: Name of the folder that receives the images.
            xpath: Expression whose matches hold the total page count.
            base_dir: Parent directory of ``dirname`` (defaults to the D: drive).
        """
        page_counts = self.analysis(url, xpath)
        # Index 1 is used when it exists (as before); fewer matches no longer
        # raise IndexError, and no match at all is treated as a single page.
        if len(page_counts) > 1:
            total_pages = int(page_counts[1])
        elif page_counts:
            total_pages = int(page_counts[0])
        else:
            total_pages = 1

        dirpath = os.path.join(base_dir, dirname)
        for page_no in range(1, total_pages + 1):
            print(f'爬取第{page_no}页的图片')
            page_url = f'{url}?pn={page_no}'
            image_urls = self.analysis(page_url, "//img[@class='BDE_Image']/@src")
            for image_url in image_urls:
                image = requests.get(image_url, timeout=self.REQUEST_TIMEOUT).content
                print(image_url)
                # Created lazily so a thread without images leaves no empty folder.
                os.makedirs(dirpath, exist_ok=True)
                # The last 14 characters of the URL serve as the file name
                # (presumably the tail of the image id plus extension - TODO confirm).
                imagename = image_url[-14:]
                print(f'正在下载{imagename}')
                # "with" closes the file even when the write fails.
                with open(os.path.join(dirpath, imagename), 'wb') as image_file:
                    image_file.write(image)

class SpiderTaptap():
    """Download the images embedded in a TapTap post."""

    def takephoto(self, url, dirname, base_dir='D:\\'):
        """Save every ``bbcode-img`` image of a post into ``base_dir``/``dirname``.

        Args:
            url: Address of the post to scrape.
            dirname: Name of the folder that receives the images.
            base_dir: Parent directory of ``dirname`` (defaults to the D: drive).
        """
        # The full-size address is read from each image's data-origin-url attribute.
        imageurl = SpiderTieba().analysis(url, '//img[@class="bbcode-img"]/@data-origin-url')
        imgpath = os.path.join(base_dir, dirname)
        for link in imageurl:
            # A timeout keeps one stalled download from hanging the whole run.
            image = requests.get(link, timeout=10).content
            # The last 11 characters of the URL serve as the file name
            # (presumably a short id plus extension - TODO confirm).
            imagename = link[-11:]
            # Created lazily so a post without images leaves no empty folder.
            if not os.path.isdir(imgpath):
                os.makedirs(imgpath)
                print('创建路径完毕')
            # "with" closes the file even when the write fails.
            with open(os.path.join(imgpath, imagename), 'wb') as imgfile:
                imgfile.write(image)
            print('已完成' + imagename)

# client_id is the API Key (AK) and client_secret is the Secret Key (SK)
# obtained from the official site.
def get_token(host=''):
    """Request an access token and return it.

    Args:
        host: Full URL of the token request. The author removed the real
            address to protect personal credentials, so the default is
            empty; pass your own URL (presumably one carrying the
            client_id/client_secret mentioned above - TODO confirm).

    Returns:
        The ``access_token`` field of the JSON response.

    Raises:
        ValueError: If ``host`` is empty.
    """
    if not host:
        # Fail early with an actionable message instead of an opaque
        # URL error from deep inside requests.
        raise ValueError('host 为空：请填入获取 access_token 的完整请求地址')
    response = requests.get(host, timeout=10)
    rsjson = response.json()
    return rsjson['access_token']

def get_imagename(imagepath):
    """Run OCR on an image and derive a stage name from the recognized text.

    The image is base64-encoded and posted to the ``webimage`` OCR endpoint.
    The first occurrence of ``关卡`` followed by text up to the next single
    quote is searched for in the string form of the JSON reply.

    Args:
        imagepath: Path of the image file to recognize.

    Returns:
        ``'关卡'`` plus the matched text on success; ``False`` when the HTTP
        request fails, the reply is not JSON, or no match is found.
    """
    request_url = "https://aip.baidubce.com/rest/2.0/ocr/v1/webimage"
    # Read in binary mode; "with" guarantees the file is closed again.
    with open(imagepath, 'rb') as image_file:
        img = base64.b64encode(image_file.read())

    params = {"image": img}
    access_token = get_token()
    request_url = request_url + "?access_token=" + access_token
    # Named differently from the module-level ``headers`` to avoid shadowing it.
    form_headers = {'content-type': 'application/x-www-form-urlencoded'}
    response = requests.post(request_url, data=params, headers=form_headers, timeout=30)
    if not response:
        # An error status used to fall through and return None, which the
        # caller would turn into the file name "None.jpg".
        print('识别失败')
        return False
    try:
        original = response.json()
        filename = '关卡' + re.findall(r"关卡(.+?)'", str(original))[0]
    except (ValueError, IndexError):
        # ValueError: body is not JSON; IndexError: pattern not found.
        print('识别失败')
        return False
    print('识别成功' + filename)
    return filename

def get_finalimg(dirname, base_dir="D:\\"):
    """Rename each image in a folder after its recognized stage name.

    Every regular file in ``base_dir``/``dirname`` is passed to
    ``get_imagename``. Files whose recognition returns ``False`` are deleted;
    recognized files are renamed to ``<name>.jpg``. Errors for a single file
    are printed and do not stop the loop.

    Args:
        dirname: Name of the folder holding the downloaded images.
        base_dir: Parent directory of ``dirname`` (defaults to the D: drive).
    """
    dirpath = os.path.join(base_dir, dirname)
    for entry in os.listdir(dirpath):
        imgpath = os.path.join(dirpath, entry)
        # Subdirectories cannot be recognized as images; leave them alone.
        if not os.path.isfile(imgpath):
            continue
        try:
            print('图片地址' + imgpath)
            imgname = get_imagename(imgpath)
            if imgname is False:
                os.remove(imgpath)
                print('已删除' + imgpath)
            elif not imgname:
                # No usable result (None or empty): keep the file rather than
                # renaming it to "None.jpg" or deleting it.
                print('未获得识别结果，保留' + imgpath)
            else:
                newname = os.path.join(dirpath, str(imgname) + '.jpg')
                print('新地址' + newname)
                os.rename(imgpath, newname)
                print('重命名成功')
        except Exception as e:
            # Best effort: report the problem and continue with the next file.
            print(e)
            
# url = '' 贴吧或tap地址
# dirname = '' 文件夹名字
# SpiderTieba().takephoto(url=url,dirname=dirname)
# SpiderTaptap().takephoto(url=url,dirname=dirname)
# get_finalimg(dirname=dirname)