# Imports
import os
import re
import time
from hashlib import md5  # used to name downloaded images by content hash (deduplicates)
from urllib.parse import urlencode  # FIX: urlencode lives in urllib.parse, not urllib3.request (old line raised ImportError)

import pandas as pd
import requests
from bs4 import BeautifulSoup
# Helper: sanitize a scraped title so it is a legal file/directory name.
def validatetile(title):
    """Replace characters that are invalid in file names with '_'.

    Windows forbids \\ / : * ? " < > | in file names; '@' is also replaced
    to keep the original script's behavior.

    :param title: raw title string scraped from the page
    :return: sanitized copy, safe to use as a directory/file name
    """
    # Raw string so the backslash is a literal member of the character class.
    # The original pattern '[/\:*?<>|@]' escaped ':' instead, so a literal
    # backslash was never matched and '"' was missing entirely.
    rstr = r'[/\\:*?"<>|@]'
    new_title = re.sub(rstr, '_', title)  # replace every invalid character with '_'
    return new_title
## Build the search-API URL for one page of results (params copied from a real request).
def get_page(offset):
    """Return the Toutiao search-API URL for one page of results.

    :param offset: pagination offset (multiples of 20; one page = 20 items)
    :return: full request URL with the encoded query string appended
    """
    # NOTE(fix): urlencode lives in urllib.parse, not urllib3.request; the
    # module-level import of it is broken, so bind the correct one locally.
    from urllib.parse import urlencode

    params = {
        'aid': 24,
        'app_name': 'web_search',
        'offset': offset,
        'format': 'json',
        'keyword': '街拍',  # search keyword ("street snap")
        'autoload': 'true',
        'count': '20',
        'en_qc': '1',
        'cur_tab': '1',
        'from': 'search_tab',
        'pd': 'synthesis'
    }
    baseurl = 'https://www.toutiao.com/api/search/content/?'
    return baseurl + urlencode(params)  # merge the parameters into the URL
# Crawl the photos: 5 result pages, 20 items each.
# Headers are loop-invariant, so build them once instead of per page.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.42 Safari/537.36',
    'cookie': 'tt_webid=6792047630858323463; WEATHER_CITY=%E5%8C%97%E4%BA%AC; tt_webid=6792047630858323463; csrftoken=7af9539142c93a6bcaba9d3b35a8bd82; ttcid=809a772e22e34003b83237175f254b7233; s_v_web_id=k6hj5e8s_shwSVN5U_M04k_4HHd_9dSE_z7R0OjFevTmN; __tasessionId=93nklqqhd1581435862931; msh=w4vvfHYgSfTSt6Qx30vf9JCrxE8; sso_auth_status=9c806db5b09551389e57c3071bd56559; sso_uid_tt=31be08e0839883508c42a659b9d70a1b; sso_uid_tt_ss=31be08e0839883508c42a659b9d70a1b; toutiao_sso_user=652e5ac113fcc7d25cfe796e1297552c; toutiao_sso_user_ss=652e5ac113fcc7d25cfe796e1297552c; passport_auth_status=2a09d7a752d9608c37ffccdd6a764204%2C61c2707362c84b3b03f9e62a602f1b8a; sid_guard=641538ba14145e4b9cdecfc3b64ef970%7C1581435947%7C5184000%7CSat%2C+11-Apr-2020+15%3A45%3A47+GMT; uid_tt=a4166039a89efdc3d9a694a4f5f28f61; uid_tt_ss=a4166039a89efdc3d9a694a4f5f28f61; sid_tt=641538ba14145e4b9cdecfc3b64ef970; sessionid=641538ba14145e4b9cdecfc3b64ef970; sessionid_ss=641538ba14145e4b9cdecfc3b64ef970; tt_scid=LR52omGrqyLyrrFCdx1bqs0PwKAEErU7ClvyjMbD8ZC8REtNV67EnSqc-rEaTqgx9e44'}

for i in range(5):  # five pages of results
    print('==========================================')
    print('正在爬取第{}块'.format(i))
    print('==========================================')
    offset = i * 20  # FIX: use a fresh name instead of reassigning the loop variable
    page_url = get_page(offset)
    response = requests.get(page_url, headers=headers)
    html = response.json()  # the API answers with JSON
    data = html.get('data')
    if not data:  # guard: the API sometimes returns no 'data' key at all
        continue
    for item in data:
        # Entries carrying 'cell_type' are ads/other card types, not photo sets.
        if item.get('cell_type') is not None:
            continue
        urllist = item.get('image_list')  # list of image links
        title = item.get('title')
        if not urllist or not title:  # guard: either field may be missing/None
            continue
        title = validatetile(title)  # replace characters invalid in file names
        print(title)
        # FIX: inner variable renamed so it no longer shadows the page URL.
        for img in urllist:
            # Rewrite the thumbnail URL into the full-size image URL.
            newurl = img['url'].replace('list', 'large').replace('/190x124', '')
            resp = requests.get(newurl)
            if resp.status_code != 200:  # skip failed downloads instead of saving error pages
                continue
            # One sub-folder per title under 'img'.
            img_path = 'img' + os.path.sep + title
            if not os.path.exists(img_path):
                os.makedirs(img_path)
            # Name the file by the MD5 of its bytes so duplicate images collapse.
            file_path = img_path + os.path.sep + '{file_name}.{file_suffix}'.format(
                file_name=md5(resp.content).hexdigest(), file_suffix='jpg')
            with open(file_path, 'wb') as f:
                f.write(resp.content)
    time.sleep(5)  # be polite: pause between result pages
print('爬取完成')