Analysis:
1. Fetch the index (search results) page
2. Parse the index page
3. Parse each article page for its image data
Click through to one of the results and inspect the page source:
The image data is embedded in the page as a JSON string passed to JSON.parse: gallery:JSON.parse("{\"count\":5,\"sub_images\":
[{\"url\":\"http:\\/\\/p3.pstatp.com\\/origin\\/pgc-image\\/15324800265505b953ab972\",\"width\":640,\"url_list\":[{\"url\":\"http:\\/\\/p3.pstatp.com\\/origin\\/pgc-image\\/15324800265505b953ab972\"}, ...
Main code, with commentary:
# -*- coding: utf-8 -*-
"""
Created on Sat Jul 14 16:43:43 2018
Scrape street-style photos from Toutiao by analyzing the site's Ajax search API.
Uses the json library to parse the JSON responses and regular expressions to
extract the image data from the article pages.
@author: Administrator
"""
import requests                                  # HTTP requests
from urllib.parse import urlencode               # build the query string
from requests.exceptions import RequestException # requests exceptions
import json                                      # parse JSON data
from bs4 import BeautifulSoup                    # parse HTML documents
import re                                        # regular expressions
import hashlib                                   # hash image content for filenames
from multiprocessing import Pool                 # parallel crawling
import os
#from config import *
#import pymongo
# client = pymongo.MongoClient(MONGO_URL)
# db = client[MONGO_DB]
# 1. Fetch the index page
def get_page_index(offset, keyword):
    data = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': 3
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(data)  # encode the query parameters with urllib
    try:
        # Send a browser User-Agent so the site does not reject the request as a bot
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'}
        response = requests.get(url, headers=headers)  # fetch the page with requests
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('Failed to request the index page')
        return None
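# For reference, the URL built above looks like this (illustrative example;
# '街拍' URL-encodes to %E8%A1%97%E6%8B%8D):
#   https://www.toutiao.com/search_content/?offset=0&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&cur_tab=3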
# 2. Parse the index page (the response is JSON)
def parse_page_index(html):
    if not html:  # guard against a failed request returning None
        return
    index = json.loads(html)  # parse the JSON text into a Python dict
    if 'data' in index.keys():  # the article list lives under the 'data' key
        for item in index.get('data'):
            yield item.get('article_url')  # yield each article URL lazily
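# Usage sketch: thanks to yield, parse_page_index returns a generator, so the
# article URLs are produced one at a time instead of collected into a list:
#   for article_url in parse_page_index(get_page_index(0, '街拍')):
#       print(article_url)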
# 3. Fetch an article page (same approach as the index page)
def get_page_detail(url):
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'}
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('Failed to request the detail page')
        return None
# 4. Parse an article page: extract the gallery JSON with a regular expression
def parse_page_detail(html, url):
    soup = BeautifulSoup(html, 'lxml')  # build a soup document for easy element access
    title = soup.title.string  # extract the page title with BeautifulSoup
    print(title)
    # First attempt, which failed: capturing only what is inside the quotes
    # drops the string delimiters that json.loads needs.
    #pattern = re.compile('gallery: JSON.parse\("(.*?)"\),', re.S)
    # Correct pattern: capture the whole quoted JS string literal, quotes included.
    pattern = re.compile('gallery: JSON.parse\((.*?)\),', re.S)
    results = re.search(pattern, html)
    # group(0) is the entire match; group(1) is the first capture group
    if results:
        # Two-stage decode: group(1) is a JSON string literal such as
        # "{\"count\":5,...}". The first json.loads unwraps it into plain JSON
        # text; the second parses that text into a dict. (An earlier version
        # stripped the backslashes with re.sub and ran eval on the result,
        # which works here but is fragile and unsafe.)
        datas = json.loads(results.group(1))
        data = json.loads(datas)
        if data and 'sub_images' in data.keys():
            sub_images = data.get('sub_images')
            images = [item.get('url') for item in sub_images]
            print('images:', images)
            for image in images:
                download_images(image)
            return {
                'title': title,
                'url': url,
                'images': images
            }
    return None
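# Shape of the record returned above (values are hypothetical):
#   {'title': '...', 'url': 'https://www.toutiao.com/a123...',
#    'images': ['http://p3.pstatp.com/origin/pgc-image/...', ...]}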
# def save_to_mongo(result):
#     if db[MONGO_TABLE].insert(result):
#         print('Saved to MongoDB', result)
#         return True
#     return False
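# The MongoDB code above expects a config.py defining the constants it imports;
# a minimal sketch (constant names come from the code, values are assumptions):
#   MONGO_URL = 'localhost'
#   MONGO_DB = 'toutiao'
#   MONGO_TABLE = 'jiepai'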
# 5. Download an image
def download_images(url):
    print('Downloading', url)
    try:
        response = requests.get(url)
        if response.status_code == 200:
            # .content is the raw binary body (images are binary); .text would decode it as text
            save_images(response.content)
        return None
    except RequestException:
        print('Failed to download the image')
        return None
# 6. Save the image to disk
def save_images(content):
    # An earlier version wrote every image to one fixed path with open(path, 'x'),
    # which cannot work for more than one file. Hashing the content into the
    # filename avoids collisions and prevents saving the same image twice.
    file_path = '{0}/{1}.{2}'.format(os.getcwd(), hashlib.sha224(content).hexdigest(), 'jpg')
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)
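# Why hash the content? Identical bytes always map to the same filename, so the
# os.path.exists check above skips images that were already saved:
#   hashlib.sha224(b'same bytes').hexdigest() == hashlib.sha224(b'same bytes').hexdigest()  # True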
# 7. Append each result record to a text file as one JSON line
def write_to_file(text):
    file_path = '{0}/{1}.{2}'.format(os.getcwd(), 'imagesInfo', 'txt')
    with open(file_path, 'a', encoding='utf-8') as f:
        f.write(json.dumps(text, ensure_ascii=False) + '\n')
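# ensure_ascii=False keeps Chinese text readable in the output file:
#   json.dumps({'title': '街拍'}, ensure_ascii=False)  writes  {"title": "街拍"}
#   json.dumps({'title': '街拍'})                      writes  {"title": "\u8857\u62cd"}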
def main(offset):
    keyword = '街拍'  # search keyword: "street photography"
    html = get_page_index(offset, keyword)
    for url in parse_page_index(html):
        html2 = get_page_detail(url)
        if html2:
            result = parse_page_detail(html2, url)
            if result:
                write_to_file(result)
                #save_to_mongo(result)
if __name__ == '__main__':
    print(os.getcwd())
    # Single-process test run:
    #for offset in range(1):
    #    main(offset * 20)
    # The search API pages in steps of 20, so the offsets passed to main must be
    # multiples of 20. A process pool fetches several index pages in parallel;
    # on Windows the Pool must be created under this __main__ guard.
    pool = Pool()
    pool.map(main, [offset * 20 for offset in range(3)])