Crawler – Instagram
Scrapes the image results of an Instagram keyword (hashtag) search. Expect HTTP 429 (rate-limit) responses; a retry sketch follows the script.
Code
import json
import requests
import wget
import os
import time
file = 'bluehair'  # hashtag to search; also used as the output directory and URL-list filename
url = 'https://i.instagram.com/api/v1/tags/{}/sections/'.format(file)  # hashtag "sections" endpoint of the Instagram web API
if os.path.exists(file):
    print('directory already exists')
else:
    os.mkdir(file)
headers = {
    # Session cookies and tokens copied from a logged-in browser session; they expire.
    'Cookie': 'mid=YN7BqgAEAAHLlivoMtcDO6TepTgW; ig_did=8C18E318-A2B2-41CA-A791-D3245FAD2115; ig_nrcb=1; ds_user_id=48177768982; csrftoken=cIRLHdJhPz4aj05QN4bHyN4VYN780zcg; sessionid=48177768982%3A29qB31QTXDdYpW%3A7; rur="ASH\05448177768982\0541657939458:01f7b6c17a5a6492ea3574df8abb9a717c579cf954972b9e2e20e196f303db0dfe615c3e"',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
    'content-type': 'application/x-www-form-urlencoded',
    'origin': 'https://www.instagram.com',
    'referer': 'https://www.instagram.com/',
    'x-asbd-id': '437806',
    'x-csrftoken': 'cIRLHdJhPz4aj05QN4bHyN4VYN780zcg',
    'x-ig-app-id': '936619743392459',
    'x-ig-www-claim': 'hmac.AR32jHHsrQmtDBzUGuxosTdMCKjTCIqRUlRHr42U-mBrfb7s',
    'x-instagram-ajax': '88a461bc5175',
    'sec-ch-ua': '"Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
    'sec-ch-ua-mobile': '?0',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-site',
}
form = {
    'include_persistent': 0,
    # max_id is an opaque pagination cursor taken from a previous response
    'max_id': 'QVFCNEZyYmsyZVVSbWhLTHE5V181dUo2OHFtSWwydnYyQ2hhMHdWeWFDUUpNOHJPZk5IU1c1bV9tdXNOQ2hpbE5kM0c1eTJjc2dCY254cUtiMWwyWUZkOQ==',
    'page': 2,
    'surface': 'grid',
    'tab': 'recent'
}
response = requests.post(url=url, data=form, headers=headers)
print(response)
if response.status_code == 429:
    # Rate limited; Retry-After (seconds) says how long to back off.
    print('HTTP 429, retry-after:', response.headers.get('retry-after'))
    raise SystemExit('rate limited, try again later')
data = json.loads(response.text)
page = data['next_page']      # next page index
max_id = data['next_max_id']  # pagination cursor for the next request
for i in data['sections']:
    try:
        medias = i['layout_content']['medias']
        for k in medias:
            d = k['media']['image_versions2']['candidates'][0]['url']
            # save the first page's image URLs as well
            with open(file + '.txt', 'a') as f:
                f.write(d + '\n')
    except (KeyError, IndexError):
        continue
flag = True
while flag:
    new_form = {
        'include_persistent': 0,
        'max_id': max_id,  # cursor from the previous response
        'page': page,
        'surface': 'grid',
        'tab': 'recent'
    }
    print(page)
    response = requests.post(url=url, data=new_form, headers=headers)
    print(response)
    data2 = json.loads(response.text)
    for i in data2['sections']:
        try:
            medias = i['layout_content']['medias']
            for k in medias:
                d = k['media']['image_versions2']['candidates'][0]['url']
                # append each image URL to <hashtag>.txt
                with open(file + '.txt', 'a') as f:
                    f.write(d + '\n')
                # wget.download(d, out=file)  # uncomment to download the images directly
        except (KeyError, IndexError):
            continue
    # stop once the response no longer carries pagination fields
    try:
        page = data2['next_page']
        max_id = data2['next_max_id']
    except KeyError:
        flag = False
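Retrying on 429
Since the endpoint rate-limits aggressively, one option is to wrap the POST in a small retry helper that honors the Retry-After header instead of exiting. This is a minimal sketch, not an official recipe: post_with_backoff, max_retries, and default_wait are names and defaults made up here, and it assumes Retry-After is given in seconds.
import time

import requests

def post_with_backoff(url, data, headers, max_retries=3, default_wait=60):
    """POST and retry on HTTP 429, sleeping for Retry-After seconds."""
    response = None
    for attempt in range(max_retries + 1):
        response = requests.post(url=url, data=data, headers=headers)
        if response.status_code != 429:
            return response
        # Retry-After may be missing or non-numeric; fall back to a fixed
        # wait (an assumption, not documented Instagram behavior).
        try:
            wait = int(response.headers.get('retry-after', default_wait))
        except ValueError:
            wait = default_wait
        print('HTTP 429, sleeping {}s (attempt {})'.format(wait, attempt + 1))
        time.sleep(wait)
    return response

Both requests.post calls in the script above could then be swapped for post_with_backoff, so a single 429 no longer kills the run.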