存储在本地
import os
from hashlib import md5
def download_image(url, timeout=10):
    """Download the image at *url* and pass its bytes to ``save_image``.

    Args:
        url: Address of the image to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely, which would
            stall the caller (for example a pool worker) forever.

    Returns:
        Always ``None``; the image is stored as a side effect.
    """
    print('下载图片', url)
    try:
        response = requests.get(url, timeout=timeout)
    except RequestException:
        # Timeouts are RequestException subclasses, so they land here too.
        print('请求图片出错!')
        return None
    if response.status_code == 200:
        # ``content`` is the raw bytes of the body; ``text`` would be the
        # decoded page text, which is not what an image file needs.
        save_image(response.content)
    return None
def save_image(content):
    """Save image bytes to ``<cwd>/<md5 of content>.jpg``.

    The file name is the MD5 hex digest of *content*, so identical images
    map to the same path and are written only once.

    Args:
        content: Raw image bytes.
    """
    file_path = '{0}/{1}.{2}'.format(os.getcwd(), md5(content).hexdigest(), 'jpg')
    try:
        # 'x' creates the file exclusively and fails if it already exists,
        # so the existence check and the creation are a single atomic step
        # even when several processes save the same image concurrently.
        with open(file_path, 'xb') as f:
            f.write(content)
    except FileExistsError:
        # Same content was already saved earlier; nothing to do.
        return
实验结果:
存储在MongoDB
# MongoDB connection settings: server host, database name and collection name.
MONGO_URL = 'localhost'
MONGO_DB = 'toutiao'
MONGO_TABLE = 'toutiao'
import pymongo

# connect=False postpones opening the connection until the first operation,
# so the client is not already connected when worker processes are started.
client = pymongo.MongoClient(MONGO_URL, connect=False)
# A database is selected by subscripting the client; the client object
# itself is not callable.
db = client[MONGO_DB]
def save_to_mongo(result):
    """Insert *result* into the ``MONGO_TABLE`` collection of ``db``.

    Args:
        result: The document to store (a mutable mapping such as a dict).

    Returns:
        ``True`` when the insert call returns a truthy result, otherwise
        ``False``.
    """
    # insert_one replaces Collection.insert, which was deprecated in
    # PyMongo 3 and removed in PyMongo 4.
    if db[MONGO_TABLE].insert_one(result):
        print('存储到MongoDB成功!', result)
        return True
    return False
之前都是对一个页面进行处理,现在要修改offset,对多个页面进行处理,可开启多进程
多进程处理
from multiprocess import Pool
# First and last page-group index (inclusive); each index is multiplied by 20
# in the __main__ section to obtain the offset handed to main().
GROUP_START = 1
GROUP_END = 20
# Search keyword that main() forwards to get_page_index.
KEYWORD = '街拍'
def main(offset):
    """Crawl one index page and store the detail pages it links to.

    Fetches the index page for *offset* and ``KEYWORD``, then for every URL
    yielded by ``parse_page_index`` fetches and parses the detail page and
    saves each non-empty parse result via ``save_to_mongo``.

    Args:
        offset: Value forwarded to ``get_page_index``.
    """
    index_html = get_page_index(offset, KEYWORD)
    for detail_url in parse_page_index(index_html):
        detail_html = get_page_detail(detail_url)
        if not detail_html:
            # Nothing was fetched for this URL; move on to the next one.
            continue
        parsed = parse_page_detail(detail_html, detail_url)
        if parsed:
            save_to_mongo(parsed)
if __name__ == '__main__':
    # One offset per page group: GROUP_START*20, ..., GROUP_END*20.
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    # map() blocks until every offset has been processed; the context
    # manager then shuts the worker processes down instead of leaving the
    # pool open until interpreter exit.
    with Pool() as pool:
        pool.map(main, groups)