# 利用Python来爬取今日头条的街拍图片 (scrape Toutiao "street snap" images with Python)
# -*- coding: utf-8 -*-
"""
Spyder Editor
This is a temporary script file.
"""
import pymongo
import json
import requests
from urllib.parse import urlencode
from requests.exceptions import RequestException
import re
from bs4 import BeautifulSoup
from hashlib import md5
import sys
sys.path.append('C:/Users/Administrator/Desktop/代码')
import mongo
import os
# Set up the MongoDB connection used by save_to_mongo()
#client=pymongo.MongoClient(MONGO_URL)
client=pymongo.MongoClient(host='localhost')
db=client['toutiao']
#db=client[MONGO_DB]
# Spoof a desktop Chrome user-agent so Toutiao serves the normal pages
headers={
'user-agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4092.1 Safari/537.36'
}
#获取网页信息
def get_page_index(key, page):
    """Fetch one page of Toutiao search results as raw JSON text.

    :param key: search keyword (e.g. '街拍')
    :param page: result offset (multiples of 20 page through results)
    :return: response body (JSON string) on HTTP 200, else None
    """
    params = {
        'offset': page,
        'format': 'json',
        'keyword': key,
        'autoload': 'true',
        'count': '20',
        'cur_tab': '1',
        'from': 'search_tab'
    }
    # URL shape: https://www.toutiao.com/search_content/?offset=60&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&cur_tab=1&from=search_tab
    url = 'https://www.toutiao.com/search_content/?' + urlencode(params)
    try:
        # timeout prevents the crawler from hanging forever on a stalled connection
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('请求索引页失败')
        return None
#解析json,获得组图链接
def parse_page_index(html):
    """Yield every gallery (article) URL found in an index-page JSON payload.

    :param html: JSON string returned by get_page_index(), or None on failure
    :yields: each item's 'article_url' string, skipping items without one

    Guarding against a falsy html is required because get_page_index()
    returns None on request failure; json.loads(None) would raise TypeError.
    """
    if not html:
        return
    data = json.loads(html)  # parse the JSON string into a dict
    if data and 'data' in data.keys():
        for item in data.get('data'):
            # only items that actually carry a gallery link
            if item.get('article_url'):
                # yield keeps this a lazy generator; return would stop at the first hit
                yield item.get('article_url')
#解析组图链接里的详细
def get_page_detail(url):
    """Fetch the HTML of a single gallery detail page.

    :param url: gallery URL produced by parse_page_index()
    :return: page HTML on HTTP 200, else None
    """
    try:
        # timeout prevents a single slow detail page from blocking the crawl
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('请求详情页失败', url)
        return None
#分析获得组图每张照片的链接
def parse_page_detail(html, url):
    """Extract the gallery title and image URLs from a detail page, and download each image.

    :param html: detail-page HTML from get_page_detail()
    :param url: the gallery's own URL (stored alongside the images)
    :return: dict {'title', 'images', 'url'} on success, else None

    The page embeds its image list as ``gallery: JSON.parse("...")`` inside a
    script tag; the quoted payload is escaped JSON, e.g.
    ``\"http:\\/\\/p3.pstatp.com\\/...\"``, so backslashes must be stripped
    before json.loads().
    """
    soup = BeautifulSoup(html, 'lxml')
    # the <title> tag carries the gallery name
    title = soup.select('title')[0].get_text()
    # \D\D matches the two non-digit chars ('(' and '"') around the payload
    image_pattern = re.compile(r'gallery: JSON\.parse\D\D(.*?)\D\D,\s', re.S)
    match = image_pattern.search(html)
    # guard BEFORE calling .group(): re.search returns None when the page
    # has no gallery (video articles etc.), and None.group() would crash
    if not match:
        return None
    # strip the escaping backslashes so the payload becomes valid JSON
    result = re.sub(r'\\', '', match.group(1))
    if result:
        data = json.loads(result)
        if data and 'sub_images' in data.keys():
            sub_images = data.get('sub_images')
            images = [item.get('url') for item in sub_images]
            # download every image in the gallery as a side effect
            for img in images:
                download_image(img)
            return {
                'title': title,
                'images': images,
                'url': url
            }
#将图片链接输入数据库
def save_to_mongo(result):
    """Insert one gallery record into the 'toutiao' collection.

    :param result: dict from parse_page_detail(), or None when parsing failed
    :return: True on successful insert, False otherwise
    """
    # parse_page_detail() returns None when a page has no gallery; don't insert it
    if result is None:
        return False
    # insert_one replaces the deprecated Collection.insert (removed in pymongo 4)
    if db['toutiao'].insert_one(result).inserted_id:
        print("储存在MONGODB成功", result)
        return True
    return False
#下载图片
def download_image(result):
    """Download a single image and hand its bytes to save_image().

    :param result: image URL
    :return: None (success and failure are both reported via print)
    """
    print("正在下载图片:", result)
    try:
        # timeout prevents one stalled image from blocking the whole gallery
        response = requests.get(result, headers=headers, timeout=10)
        if response.status_code == 200:
            # response.content is the raw image bytes
            save_image(response.content)
        return None
    except RequestException:
        print("请求图片出错", result)
        return None
#保存图片
def save_image(content):
    """Write image bytes to disk, named by their MD5 hash to avoid duplicates.

    :param content: raw image bytes (from download_image)
    """
    directory = 'D:/爬虫数据/今日头条街拍图片/'
    # create the target folder on first use; open() would fail if it is missing
    os.makedirs(directory, exist_ok=True)
    # os.path.join fixes the original '{0},{1}{2}' format, which put a literal
    # comma into the filename ('…/,<md5>.jpg')
    file_path = os.path.join(directory, '{0}.jpg'.format(md5(content).hexdigest()))
    # identical bytes hash to the same name, so an existing file means a duplicate
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)
def main(offset):
    """Crawl one index page of '街拍' results: parse, download, store.

    :param offset: result offset passed to the search API (multiples of 20)
    """
    html = get_page_index('街拍', offset)  # JSON string, or None on failure
    # without this guard, parse_page_index(None) would raise TypeError
    if not html:
        return
    for url in parse_page_index(html):
        detail = get_page_detail(url)
        if detail:
            result = parse_page_detail(detail, url)
            # parse_page_detail returns None for pages without a gallery;
            # don't push None into MongoDB
            if result:
                save_to_mongo(result)
# Script entry point: crawl the first index page (offset 0).
if __name__ == '__main__':
    main(0)