#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
import urllib.request
import re
import os
import sys
from multiprocessing.dummy import Pool
def name_read():
    """Return the keywords still to be crawled.

    Reads 'secondname.txt' (requested keywords, one per line) and
    'names.txt' (keywords already processed), both UTF-8.  Returns the
    requested keywords that are not yet in 'names.txt', de-duplicated,
    first-occurrence order preserved.
    """
    # 'with' closes the handles automatically; the original also called
    # f.close() redundantly inside the with-block.
    with open('secondname.txt', 'r', encoding='UTF-8') as f:
        input_name = f.read().split('\n')
    with open('names.txt', 'r', encoding='UTF-8') as f:
        names = f.read().split('\n')
    done = set(names)      # O(1) membership instead of scanning a list
    seen = set()           # tracks what we already emitted (dedup)
    judge = []
    for name in input_name:
        if name not in done and name not in seen:
            seen.add(name)
            judge.append(name)
    return judge
def name_write(name):
    """Append *name* (plus a newline) to names.txt in the script directory.

    Side effect kept from the original: changes the process working
    directory to sys.path[0] (the script's directory).
    """
    os.chdir(sys.path[0])
    # 'with' guarantees the handle is closed even if the write fails;
    # the original used open()/close() with no try/finally, and abused
    # writelines() for a single string.
    with open('names.txt', 'a', encoding='UTF-8') as f:
        f.write(name + '\n')
def download(download_info):
    """Download one image URL into a sequentially numbered .jpg in the cwd.

    *download_info* is a (url, file_name) tuple; file_name is ignored and
    replaced by '<N>.jpg' taken from the global counter CONSTANT.
    NOTE(review): CONSTANT is shared between Pool worker threads without a
    lock, so duplicate/raced names are possible — kept as-is for
    compatibility.  Retries up to 6 times, then gives up silently.
    """
    (url, file_name) = download_info
    global CONSTANT
    for _ in range(6):
        CONSTANT += 1
        file_name = str(CONSTANT) + ".jpg"
        try:
            with urllib.request.urlopen(url, timeout=2) as response, \
                    open(file_name, 'wb') as out_file:
                out_file.write(response.read())
            return
        # was a bare 'except:' which also swallowed KeyboardInterrupt /
        # SystemExit; narrowed to Exception.
        except Exception:
            # roll the counter back so failed attempts don't leave gaps
            CONSTANT -= 1
            #print('Download failed: %s'%(url))
def mass_download(urls, nthread):
    """Fetch every URL in *urls* concurrently with *nthread* worker threads."""
    #print('Downloading...')
    jobs = [(u, os.path.basename(u)) for u in urls]
    with Pool(nthread) as workers:
        workers.map(download, jobs)
def get_html(url_path):
    """Fetch *url_path* and return str() of the raw bytes, or None on failure.

    The page bytes are passed through str(), so the result looks like
    "b'...'" — the downstream objURL regex parsing relies on that form.
    Retries up to 5 times before giving up.
    """
    #print('Fetching html...')
    for _ in range(5):
        try:
            with urllib.request.urlopen(url_path) as resp:
                return str(resp.read())
        # was a bare 'except:'; narrowed so Ctrl-C is not swallowed here
        except Exception:
            pass  # transient network error: retry
    #print('Fetching html failed...')
    return None  # explicit, same as the original's implicit fall-through
def get_image_urls(html_content):
    """Extract every Baidu 'objURL' image link from a result-page dump."""
    print('Parsing html...')
    pattern = re.compile('objURL":"([a-z.:/_A-Z0-9]*)"')
    found = pattern.findall(html_content)
    #print('%d images found in this page'%(len(found)))
    return found
# ---- main crawl loop ------------------------------------------------------
num_image = 169   # stop after roughly this many queued images per keyword
nthread = 8       # download worker threads
names = name_read()
#ames=["王力宏","高清壁纸"]
for name in names:
    #print('down%s'%name)
    # record the keyword as processed (also chdirs to the script dir)
    name_write(name)
    # Percent-encode the UTF-8 keyword for the query string via repr(),
    # e.g. '猫' -> '%E7%8C%AB'.  NOTE(review): .upper() also uppercases any
    # ASCII letters that appear in the keyword itself.
    key_word = repr(name.encode('UTF-8')).replace('\\x', '%').upper()[2:-1]
    dest_folder = sys.path[0]+'/'+name
    if os.path.exists(dest_folder):
        print("文件夹已经存在")
        continue  # folder already exists: keyword was crawled before
    else:
        os.makedirs(dest_folder)
    os.chdir(dest_folder)   # download() writes its numbered files into cwd
    pn = 0                  # result-page index (15 results per page)
    cnt = 0                 # images queued so far for this keyword
    CONSTANT = 0            # global file-name counter read by download()
    downloaded = set()      # URLs already queued, to skip duplicates
    while cnt < num_image:
        print("Page %d:"%(pn+1))
        image_urls = []
        try:
            url = "http://images.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word=%s&pn=%d&gsm=0"%(key_word, pn*15)
            html_content = get_html(url)
            temp_urls = get_image_urls(html_content)
            # keep only URLs not seen on earlier pages of this keyword
            for i in temp_urls:
                if i not in downloaded:
                    downloaded.add(i)
                    image_urls.append(i)
            mass_download(image_urls, nthread)
        except KeyboardInterrupt:
            exit()
        except:
            # best-effort: a failed page is simply skipped
            pass
        pn += 1
        cnt += len(image_urls)
        if(pn>30):
            # hard cap: never fetch more than 31 result pages per keyword
            print("done")
            break
# (scraped blog footer removed — "python数据挖掘 / 最新推荐文章于 2023-03-14 12:41:34 发布" was page residue, not program code)