import requests
from pyquery import PyQuery as pq


def decode_gb2312(data):
    # Split the input line, build the search keyword (fields 1 and 2) and the
    # type (field 3), then percent-encode both as GB2312 bytes.
    data = data.split()
    key_word = data[1] + data[2]
    type = data[3]
    file_name1 = key_word + type
    type = type.encode('gb2312').hex()
    key_word = key_word.encode('gb2312').hex()
    print(type)
    print(key_word)

    # Insert a '%' before every byte (two hex digits) of the type.
    i = 0
    type2 = ""
    type = type.upper()
    while i < len(type):
        if i % 2 == 0:
            type2 = type2 + '%'
        type2 = type2 + type[i]
        i = i + 1
    print(type2)

    # Do the same for the keyword.
    i = 0
    key_word2 = ""
    key_word = key_word.upper()
    while i < len(key_word):
        if i % 2 == 0:
            key_word2 = key_word2 + '%'
        key_word2 = key_word2 + key_word[i]
        i = i + 1
    print(key_word2)
    return key_word2, type2, file_name1
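# Note: the manual percent-encoding above can also be done with the standard
# library. A minimal sketch (not part of the original script), assuming the
# keywords are representable in GB2312:
#
#     from urllib.parse import quote
#     quote('中文', encoding='gb2312')   # -> '%D6%D0%CE%C4'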
def one():
    with open(r"D:\image_keyword", encoding="utf-8-sig") as file:
        datas = file.readlines()
        for data in datas:
            key_word2, type2, file_name1 = decode_gb2312(data)
            url = 'http://www.chinawestagr.com/bch/searchResult.aspx?type=' + type2 + '&context=' + key_word2
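            # The resulting URL has the form (keyword and type are GB2312 percent-encoded):
            # http://www.chinawestagr.com/bch/searchResult.aspx?type=%XX%XX&context=%XX%XX%XX%XX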
            print(url)
            result = requests.get(url)
            result = result.text
            # print(result)
            html = pq(result)
            # Read the page title and the first link on the search-result page.
            # print(html.title)
            # attrs = html.title.attrs
            # print(attrs)
            print(html('title'))
            print(html('a').attr('href'))
            a = html('a').attr('href')
            if a is not None and 'CropContent' in a:
                href = 'http://www.chinawestagr.com/bch/' + a
                result = requests.get(href)
                result = result.text
                # print(result)
                html = pq(result)
                imgs = html('img')
                print(imgs)
                i = 0
                while i < len(imgs):
                    img = imgs[i].attrib['src']
                    if 'UploadFiles' in img:
                        img_url = 'http://www.chinawestagr.com/bch/' + img
                        try:
                            pic = requests.get(img_url, timeout=5)  # 5-second timeout
                        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
                            print('could not download this image')
                            i = i + 1
                            continue
                        file_name = "D:/spider/citrus_spider/spiderfiles/grapimages/" + file_name1 + str(i) + ".jpg"  # build the image file name
                        print(file_name)
                        # save the image locally
                        with open(file_name, 'wb') as fp:
                            fp.write(pic.content)  # write the image bytes
                    i = i + 1
if __name__ == '__main__':
    one()
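# The keyword file D:\image_keyword is assumed to hold one whitespace-separated
# record per line; decode_gb2312() concatenates fields 1 and 2 into the search
# keyword and uses field 3 as the type, i.e. a line of the (hypothetical) form:
#
#     <id> <keyword-part-1> <keyword-part-2> <type>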