# 爬取今日头条街拍图片 — scrape street-photo galleries from Toutiao
# coding=utf-8
import os
import re
import time
from multiprocessing.pool import Pool
import requests
from urllib.parse import urlencode
# HTTP headers sent with every request; substitute your own cookie / UA values.
headers = {
    'Cookie': '你的cookie',
    'User-Agent': '你的user-agent',
}
#得到首页面上相应详情页的url
def get_search_page(offset):
    """Fetch one page of Toutiao search results and yield gallery entries.

    Args:
        offset: pagination offset passed to the search API (multiples of 20).

    Yields:
        dict with 'title' and 'url_group' (the detail-page share URL) for
        every result item that has both fields; items missing either field
        are reported and skipped.
    """
    params = {
        'aid': '24',
        'app_name': 'web_search',
        'offset': offset,
        'format': 'json',
        'keyword': '街拍',
        'autoload': 'true',
        'count': '20',
        'en_qc': '1',
        'cur_tab': '1',
        'from': 'search_tab',
        'pd': 'synthesis',
        # Millisecond timestamp, matching what the site's own client sends.
        'timestamp': int(round(time.time() * 1000)),
    }
    base_url = 'https://www.toutiao.com/api/search/content/?' + urlencode(params)
    print(base_url)
    try:
        response1 = requests.get(base_url, headers=headers)
        if response1.status_code == 200:
            json1 = response1.json()
            # 'data' may be missing or None (e.g. when throttled); the
            # original iterated it unconditionally and could TypeError.
            for item in json1.get('data') or []:
                title = item.get('title')
                url_group = item.get('share_url')
                if title is not None and url_group is not None:
                    yield {'title': title,
                           'url_group': url_group,
                           }
                else:
                    # Non-gallery result (the original's message was dead
                    # code inside a bare except that could never fire here).
                    print('非图片类型页面')
    except requests.ConnectionError:
        return
#得到图片的集合
def get_images_group(url_group):
    """Parse a gallery detail page and yield its image URLs.

    Args:
        url_group: URL of the detail (share) page to scrape.

    Yields:
        dict with 'image_url' (full image URL, quotes stripped) and 'name'
        (the part of the URL after '/pgc-image/', used as the file stem).
    """
    print('正在进行详情页面解析')
    try:
        # requests.get must sit inside the try so connection failures are
        # actually caught (the original issued it before entering the try).
        response2 = requests.get(url_group, headers=headers)
        if response2.status_code == 200:
            # [a-zA-Z] fixes the original [a-zA-z], whose A-z range also
            # matched the punctuation characters between 'Z' and 'a'.
            content1s = re.findall(r'[a-zA-Z]+://[^\s]*"', response2.text, re.S)
            # findall returns [] on no match, never None — the original
            # `== None` check could never trigger.
            if not content1s:
                print("文章类型不对,没有找到图片集合", url_group)
                return
            for content1 in content1s:
                # Remove every quote to leave the bare URL.
                content2 = re.sub('"', '', content1)
                # File name = URL tail after the /pgc-image/ prefix.
                content3 = re.sub(r'[a-zA-Z]+://[^\s]*/pgc-image/', '', content2)
                yield {
                    'name': content3,
                    'image_url': content2,
                }
    except requests.ConnectionError:
        # Builtin ConnectionError (the original's choice) is NOT a base of
        # requests' exception, so the handler never matched.
        print('无法连接')
        return
#保存图片
def save_image(item, title, base_dir='C:/Users/Desktop/图片/爬虫/'):
    """Download one image and write it to base_dir/<title>/<name>.jpg.

    Args:
        item: dict with 'image_url' (source URL) and 'name' (file stem).
        title: gallery title, used as the sub-directory name.
        base_dir: download root; defaults to the original hard-coded path
            so existing callers are unaffected.
    """
    try:
        response = requests.get(item.get('image_url'))
        if response.status_code == 200:
            file_path1 = base_dir + title + '/'
            file_path = file_path1 + '{0}.{1}'.format(item.get('name'), 'jpg')
            if not os.path.exists(file_path):
                # exist_ok replaces the original bare try/except around
                # makedirs, which also hid unrelated OS errors.
                os.makedirs(file_path1, exist_ok=True)
                with open(file_path, 'wb') as f:
                    f.write(response.content)
                # The with-block closes the file; the original's explicit
                # f.close() inside it was redundant.
            else:
                print('已经下载', file_path)
    except requests.ConnectionError:
        print('保存图片失败')
#主函数
def main(offset):
    """Scrape one search-result page at *offset* and download its images."""
    for entry in get_search_page(offset):
        gallery_url = entry.get('url_group')
        gallery_title = entry.get('title')
        for image in get_images_group(gallery_url):
            save_image(image, gallery_title)
    print(offset)
# Page range to fetch: offsets run from GROUP_START*20 through GROUP_END*20.
GROUP_START, GROUP_END = 1, 20
if __name__ == '__main__':
    # One offset per page, 20 results each.
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    # Context manager replaces the manual close()/join(); map() blocks until
    # every worker finishes, so no work is cut short when the pool exits.
    with Pool() as pool:
        # map() hands each offset to a worker process running main().
        pool.map(main, groups)