import requests
from pyquery import PyQuery as pq


def decode_gb2312(data):
    # Split the input line, build the search keyword (fields 1 and 2) and the
    # type (field 3), then percent-encode both as GB2312 bytes.
    data = data.split()
    key_word = data[1] + data[2]
    type = data[3]
    file_name1 = key_word + type
    type = type.encode('gb2312').hex()
    key_word = key_word.encode('gb2312').hex()
    print(type)
    print(key_word)

    # Insert a '%' before every byte (two hex digits) of the type.
    i = 0
    type2 = ""
    type = type.upper()
    while i < len(type):
        if i % 2 == 0:
            type2 = type2 + '%'
        type2 = type2 + type[i]
        i = i + 1
    print(type2)

    # Do the same for the keyword.
    i = 0
    key_word2 = ""
    key_word = key_word.upper()
    while i < len(key_word):
        if i % 2 == 0:
            key_word2 = key_word2 + '%'
        key_word2 = key_word2 + key_word[i]
        i = i + 1
    print(key_word2)
    return key_word2, type2, file_name1
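# Note: the manual percent-encoding above can also be done with the standard
# library. A minimal sketch (not part of the original script), assuming the
# keywords are representable in GB2312:
#
#     from urllib.parse import quote
#     quote('中文', encoding='gb2312')   # -> '%D6%D0%CE%C4'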
def one():
    with open(r"D:\image_keyword", encoding="utf-8-sig") as file:
        datas = file.readlines()
        for data in datas:
            key_word2, type2, file_name1 = decode_gb2312(data)
            url = 'http://www.chinawestagr.com/bch/searchResult.aspx?type=' + type2 + '&context=' + key_word2
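            # The resulting URL has the form (keyword and type are GB2312 percent-encoded):
            # http://www.chinawestagr.com/bch/searchResult.aspx?type=%XX%XX&context=%XX%XX%XX%XX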
            print(url)
            result = requests.get(url)
            result = result.text
            # print(result)
            html = pq(result)
            # Read the page title and the first link on the search-result page.
            # print(html.title)
            # attrs = html.title.attrs
            # print(attrs)
            print(html('title'))
            print(html('a').attr('href'))
            a = html('a').attr('href')
            if a is not None and 'CropContent' in a:
                href = 'http://www.chinawestagr.com/bch/' + a
                result = requests.get(href)
                result = result.text
                # print(result)
                html = pq(result)
                imgs = html('img')
                print(imgs)
                i = 0
                while i < len(imgs):
                    img = imgs[i].attrib['src']
                    if 'UploadFiles' in img:
                        img_url = 'http://www.chinawestagr.com/bch/' + img
                        try:
                            pic = requests.get(img_url, timeout=5)  # 5-second timeout
                        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
                            print('could not download this image')
                            i = i + 1
                            continue
                        file_name = "D:/spider/citrus_spider/spiderfiles/grapimages/" + file_name1 + str(i) + ".jpg"  # build the image file name
                        print(file_name)
                        # save the image locally
                        with open(file_name, 'wb') as fp:
                            fp.write(pic.content)  # write the image bytes
                    i = i + 1
if __name__ == '__main__':
    one()
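# The keyword file D:\image_keyword is assumed to hold one whitespace-separated
# record per line; decode_gb2312() concatenates fields 1 and 2 into the search
# keyword and uses field 3 as the type, i.e. a line of the (hypothetical) form:
#
#     <id> <keyword-part-1> <keyword-part-2> <type>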