采用分析 Ajax 请求的方式抓取数据:直接请求今日头条搜索接口返回的 JSON,再解析每个图集页面并下载其中的图片
spider.py
#encoding:utf8
__author__ = 'qh'
import requests
from urllib import urlencode
from requests.exceptions import RequestException
import json
from bs4 import BeautifulSoup
import re
import pymongo
from config import *
import os
from hashlib import md5
#引入进程池
from multiprocessing import Pool
# Module-level MongoDB handles used by save_to_mongo(); MONGO_URL and MONGO_DB
# come from the star-import of config.
# NOTE(review): the client is created at import time, i.e. before the
# multiprocessing Pool in the __main__ block starts its workers. PyMongo
# documents MongoClient as not fork-safe -- confirm whether each worker
# should open its own client instead.
client=pymongo.MongoClient(MONGO_URL)
# Database handle looked up by name on the client.
db=client[MONGO_DB]
def get_page_index(offset, keyword, timeout=10):
    """Fetch one page of search results and return the raw response text.

    Builds the ``search_content`` URL from the query parameters below and
    issues a GET request for it. The endpoint is asked for ``format=json``,
    so the returned text is expected to be JSON.

    Args:
        offset: Value sent as the ``offset`` query parameter.
        keyword: Value sent as the ``keyword`` query parameter.
        timeout: Seconds to wait on the server before giving up. Without a
            timeout a stalled connection would block the calling worker
            indefinitely; a timeout surfaces as a ``RequestException`` and
            is handled like any other request failure.

    Returns:
        The response body text when the status code is 200, otherwise
        ``None`` (non-200 status, or any ``RequestException``).
    """
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': 20,
        # presumably selects the image-gallery tab -- TODO confirm
        'cur_tab': 3,
    }
    url = "http://www.toutiao.com/search_content/?" + urlencode(params)
    try:
        response = requests.get(url, timeout=timeout)
    except RequestException:
        print('请求索引页面出错')
        return None
    if response.status_code != 200:
        return None
    # Debug trace: a separator line followed by the raw payload.
    print('*' * 8)
    print(response.text)
    return response.text
def pares_page_index(html):
    """Yield the ``article_url`` of every entry in a search-result payload.

    Args:
        html: JSON text of an index page, or ``None``/empty when the index
            request failed.

    Yields:
        Each non-empty ``article_url`` found in the payload's ``data`` list.
        Nothing is yielded when the input is missing, is not valid JSON, is
        not a JSON object, or has no list under ``data``.
    """
    if not html:
        # The index fetch returns None on failure; json.loads(None) would raise.
        return
    try:
        payload = json.loads(html)
    except (TypeError, ValueError):
        # ValueError covers malformed JSON on both Python 2 and 3.
        return
    if not isinstance(payload, dict):
        return
    entries = payload.get('data')
    if not isinstance(entries, list):
        # Covers a missing key as well as an explicit null.
        return
    for item in entries:
        if not isinstance(item, dict):
            continue
        article_url = item.get('article_url')
        # Entries without an article_url cannot be fetched; skip them.
        if article_url:
            yield article_url
def get_page_detail(url, timeout=10):
    """Fetch the HTML of a single gallery page.

    Args:
        url: Address of the gallery page.
        timeout: Seconds to wait on the server before giving up, so that a
            stalled connection cannot block the calling worker forever.

    Returns:
        The response body text when the status code is 200, otherwise
        ``None`` (non-200 status, or any ``RequestException``).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except RequestException:
        # Same "<message> <url>" output as before, built as one string so the
        # call behaves identically as a statement or a function.
        print('请求详细页面出错' + ' ' + str(url))
        return None
    if response.status_code == 200:
        return response.text
    return None
def parse_page_detail(html, url):
    """Extract title and image URLs from a gallery page and download the images.

    The image list is looked up in the page source as the text following
    ``var gallery = `` up to the next ``;`` and decoded as JSON.

    Args:
        html: HTML text of the gallery page.
        url: Address of the page; copied into the result unchanged.

    Returns:
        A dict with keys ``title``, ``url`` and ``images`` (list of image
        URLs) when the page carries gallery JSON with a ``sub_images`` key;
        ``None`` when no gallery is found or its JSON cannot be decoded.
    """
    soup = BeautifulSoup(html, 'lxml')
    title_tags = soup.select('title')
    # A page without a <title> tag would otherwise raise IndexError.
    title = title_tags[0].get_text() if title_tags else ''
    print(title)

    match = re.search('var gallery = (.*?);', html, re.S)
    if not match:
        return None
    try:
        gallery = json.loads(match.group(1))
    except ValueError:
        # The non-greedy match stops at the first ';', so the captured text
        # may be truncated or simply not JSON; treat it as "no gallery".
        return None
    if not isinstance(gallery, dict) or 'sub_images' not in gallery:
        return None

    sub_images = gallery.get('sub_images') or []
    # Keep only entries that actually carry a URL; there is nothing to
    # download for the others.
    images = [
        item.get('url')
        for item in sub_images
        if isinstance(item, dict) and item.get('url')
    ]
    for image in images:
        download_image(image)
    return {
        'title': title,
        'url': url,
        'images': images,
    }
def save_to_mongo(results):
    """Insert ``results`` into the ``MONGO_TABLE`` collection of ``db``.

    Args:
        results: Document to store.

    Returns:
        ``True`` (after printing a confirmation) when the insert call returns
        a truthy value, ``False`` otherwise.
    """
    # NOTE(review): Collection.insert is deprecated in PyMongo 3 in favour of
    # insert_one -- confirm which PyMongo version this project targets.
    inserted = db[MONGO_TABLE].insert(results)
    if not inserted:
        return False
    print('存储到mongodb成功' + str(results))
    return True
def download_image(url, timeout=10):
    """Download one image and hand its bytes to ``save_image``.

    Args:
        url: Address of the image.
        timeout: Seconds to wait on the server before giving up, so that a
            stalled connection cannot block the calling worker forever.

    Returns:
        Always ``None``. The image is saved only when the status code is 200;
        request failures are reported on stdout and otherwise ignored.
    """
    print('正在下载图片 ' + str(url))
    try:
        response = requests.get(url, timeout=timeout)
    except RequestException:
        # str() matches the progress message above: concatenating the raw
        # value raises for a None url and, on Python 2, for a unicode url
        # joined to this non-ASCII byte string.
        print('请求图片出错' + str(url))
        return None
    if response.status_code == 200:
        save_image(response.content)
    return None
def save_image(content):
    """Write image bytes to ``<cwd>/<md5 hex digest of content>.jpg``.

    The file name is derived from the content's MD5 digest, so identical
    images map to the same path; an existing file is left untouched.

    Args:
        content: Raw bytes of the image.
    """
    digest = md5(content).hexdigest()
    filepath = '{0}/{1}.{2}'.format(os.getcwd(), digest, 'jpg')
    if os.path.exists(filepath):
        # Same content was already saved.
        return
    with open(filepath, 'wb') as f:
        f.write(content)
def main(offset):
    """Crawl one page of search results for ``KEYWORD``.

    Fetches the index page at ``offset``, then for every gallery URL on it
    fetches the gallery page, parses it (which also downloads its images)
    and stores the parsed record in MongoDB.

    Args:
        offset: Result offset of the index page to process.
    """
    index_html = get_page_index(offset, KEYWORD)
    if not index_html:
        # The index request failed; there is nothing to parse.
        return
    for url in pares_page_index(index_html):
        if not url:
            # Entry without an article URL: nothing to fetch.
            continue
        detail_html = get_page_detail(url)
        if not detail_html:
            continue
        results = parse_page_detail(detail_html, url)
        if results:
            save_to_mongo(results)
if __name__ == "__main__":
    # One offset per index page, distributed over a pool of worker processes.
    # The step of 20 matches the 'count' query parameter sent by
    # get_page_index.
    groups = [i * 20 for i in range(GROUP_START, GROUP_END + 1)]
    pool = Pool()
    try:
        pool.map(main, groups)
    finally:
        # Release the pool's worker processes explicitly instead of leaving
        # them to interpreter shutdown.
        pool.close()
        pool.join()
config.py
#encoding:utf8
__author__ = 'qh'
# Address passed to pymongo.MongoClient by the spider.
MONGO_URL='localhost'
# Name of the MongoDB database the spider writes to.
MONGO_DB='today'
# Name of the collection that receives one document per parsed gallery.
MONGO_TABLE='ajaxImg'
# First and last index-page numbers to crawl (inclusive); the spider
# multiplies each by 20 to obtain the request offset.
GROUP_START=1
GROUP_END=5
# Search keyword sent to the index endpoint.
KEYWORD='街拍'