今日头条街拍爬取

本次练习使用到的知识点有
* Requests 库的使用
* BeautifulSoup 库的使用
* 正则表达式的使用
* pymongo 库的使用
1、项目流程分析
Clipboard Image.png

2、中心调度
def main(offset):
    """Crawl one search-result page: fetch the index, then every gallery on it.

    Args:
        offset: paging offset for the Toutiao search API (a multiple of 20;
            supplied as an int by the process pool).
    """
    index_data = get_page_index(offset, KEYWORDS)
    if index_data is None:
        # offset is an int: format it instead of concatenating with "+",
        # which raised TypeError in the original.
        print('offset:{} 异常 '.format(offset))
        return
    # Walk every detail-page URL found on the listing page.
    for url in parse_page_index(index_data):
        detail_data = get_page_detail(url)
        if detail_data is None:
            # Original used "%s" with str.format (never substituted) and then
            # fell through via "pass", passing None to parse_page_detail.
            print('url:{} 异常 '.format(url))
            continue
        data = parse_page_detail(detail_data, url)
        if data is None:
            continue
        save_to_mongo(data)
3、请求和解析列表页
def get_page_index(offset, keywords):
    """Request one page of Toutiao image-search results.

    Args:
        offset: paging offset.
        keywords: search keyword. The original ignored this parameter and
            read the global KEYWORDS; since every caller passes KEYWORDS,
            honoring the parameter is behavior-compatible.

    Returns:
        The response body as text on HTTP 200, otherwise None (including
        on any network error).
    """
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': keywords,
        'cur_tab': 3,
        'autoload': 'true',
        'count': 20,
    }
    try:
        response = requests.get('http://www.toutiao.com/search_content/',
                                params=params)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


def parse_page_index(text):
    """Yield every article URL found in a listing-page JSON payload.

    Entries that carry no 'article_url' (e.g. ad/image-only cards) are
    skipped; the original yielded None for them, which downstream code
    then tried to fetch.

    Args:
        text: raw JSON text of the listing response.

    Yields:
        str: each item's 'article_url' under the top-level 'data' key.
    """
    try:
        data = json.loads(text)
        if data and 'data' in data.keys():
            for item in data.get('data'):
                url = item.get('article_url')
                if url:
                    yield url
    except JSONDecodeError:
        print(' 解析异常 ')
        return
4、请求和解析详情页
def parse_page_detail(html, url):
    """Extract the gallery images from a detail page and download them.

    Args:
        html: HTML of the detail page.
        url: the page's URL (stored alongside the result).

    Returns:
        dict with 'title', 'url', 'images' keys when a gallery was found,
        otherwise None.
    """
    soup = BeautifulSoup(html, 'lxml')
    # Pages without a <title> would raise AttributeError on .string; guard it.
    title = soup.title.string if soup.title else ''
    image_pattern = re.compile(r'var gallery = (.*?);', re.S)
    result = image_pattern.search(html)
    if not result:
        return None
    try:
        data = json.loads(result.group(1))
    except JSONDecodeError:
        return None
    if data and 'sub_images' in data.keys():
        images = [item.get('url') for item in data.get('sub_images')]
        for image in images:
            download_image(image)
        return {'title': title, 'url': url, 'images': images}
    return None
5、下载图片和保存至Mongodb
def download_image(url):
    """Download one image and hand its bytes to save_image().

    Best-effort: network failures are logged and swallowed so one broken
    image does not abort the whole gallery.

    Args:
        url: absolute URL of the image.
    """
    try:
        print(' 图片 ' + url + ' 正在下载 ')
        response = requests.get(url)
        if response.status_code == 200:
            save_image(response.content)
    except RequestException:
        # The dead "pass" after this print in the original is removed.
        print(' 异常 image:' + url)


def save_image(content):
    """Write image bytes to ./images/<md5>.jpg, deduplicating by content.

    The file name is the MD5 of the bytes, so an identical image is only
    written once.

    Args:
        content: raw image bytes.
    """
    image_dir = os.path.join(os.getcwd(), 'images')
    # The original crashed with FileNotFoundError if ./images was missing.
    os.makedirs(image_dir, exist_ok=True)
    file_path = os.path.join(image_dir, '{}.jpg'.format(md5(content).hexdigest()))
    if not os.path.exists(file_path):
        # "wb" suffices; the explicit close() inside "with" was redundant.
        with open(file_path, 'wb') as file:
            file.write(content)


def save_to_mongo(data):
    """Insert one crawled record into MongoDB.

    Args:
        data: dict with 'title', 'url' and 'images' keys.

    Returns:
        True if the write was acknowledged, otherwise False.
    """
    # Collection.insert() was removed in PyMongo 4; insert_one() is the
    # supported replacement.
    result = db[MONGO_TABLE].insert_one(data)
    if result.acknowledged:
        print(' 成功保存 ' + data['title'])
        return True
    return False
6、完整代码
#!/usr/bin/python
# -*- coding: utf-8 -*-
import os
import re
import requests
import pymongo
import json
from hashlib import md5
from bs4 import BeautifulSoup
from setting import *
from requests.exceptions import RequestException
from json.decoder import JSONDecodeError
from multiprocessing import Pool

# Module-level MongoDB connection shared by save_to_mongo();
# MONGO_URL and MONGO_DB come from setting.py via "from setting import *".
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]


def get_page_index(offset, keywords):
    """Request one page of Toutiao image-search results.

    Args:
        offset: paging offset.
        keywords: search keyword. The original ignored this parameter and
            read the global KEYWORDS; since every caller passes KEYWORDS,
            honoring the parameter is behavior-compatible.

    Returns:
        The response body as text on HTTP 200, otherwise None (including
        on any network error).
    """
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': keywords,
        'cur_tab': 3,
        'autoload': 'true',
        'count': 20,
    }
    try:
        response = requests.get('http://www.toutiao.com/search_content/',
                                params=params)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


def parse_page_index(text):
    """Yield every article URL found in a listing-page JSON payload.

    Entries that carry no 'article_url' (e.g. ad/image-only cards) are
    skipped; the original yielded None for them, which downstream code
    then tried to fetch.

    Args:
        text: raw JSON text of the listing response.

    Yields:
        str: each item's 'article_url' under the top-level 'data' key.
    """
    try:
        data = json.loads(text)
        if data and 'data' in data.keys():
            for item in data.get('data'):
                url = item.get('article_url')
                if url:
                    yield url
    except JSONDecodeError:
        print(' 解析异常 ')
        return


def get_page_detail(url):
    """Fetch one detail page.

    Args:
        url: absolute URL of the article/gallery page.

    Returns:
        The page HTML on HTTP 200, otherwise None (including on any
        network error).
    """
    try:
        # In the original the requests.get() call sat OUTSIDE the try
        # block, so RequestException was never actually caught.
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


def parse_page_detail(html, url):
    """Extract the gallery images from a detail page and download them.

    Args:
        html: HTML of the detail page.
        url: the page's URL (stored alongside the result).

    Returns:
        dict with 'title', 'url', 'images' keys when a gallery was found,
        otherwise None.
    """
    soup = BeautifulSoup(html, 'lxml')
    # Pages without a <title> would raise AttributeError on .string; guard it.
    title = soup.title.string if soup.title else ''
    image_pattern = re.compile(r'var gallery = (.*?);', re.S)
    result = image_pattern.search(html)
    if not result:
        return None
    try:
        data = json.loads(result.group(1))
    except JSONDecodeError:
        return None
    if data and 'sub_images' in data.keys():
        images = [item.get('url') for item in data.get('sub_images')]
        for image in images:
            download_image(image)
        return {'title': title, 'url': url, 'images': images}
    return None


def download_image(url):
    """Download one image and hand its bytes to save_image().

    Best-effort: network failures are logged and swallowed so one broken
    image does not abort the whole gallery.

    Args:
        url: absolute URL of the image.
    """
    try:
        print(' 图片 ' + url + ' 正在下载 ')
        response = requests.get(url)
        if response.status_code == 200:
            save_image(response.content)
    except RequestException:
        # The dead "pass" after this print in the original is removed.
        print(' 异常 image:' + url)


def save_image(content):
    """Write image bytes to ./images/<md5>.jpg, deduplicating by content.

    The file name is the MD5 of the bytes, so an identical image is only
    written once.

    Args:
        content: raw image bytes.
    """
    image_dir = os.path.join(os.getcwd(), 'images')
    # The original crashed with FileNotFoundError if ./images was missing.
    os.makedirs(image_dir, exist_ok=True)
    file_path = os.path.join(image_dir, '{}.jpg'.format(md5(content).hexdigest()))
    if not os.path.exists(file_path):
        # "wb" suffices; the explicit close() inside "with" was redundant.
        with open(file_path, 'wb') as file:
            file.write(content)


def save_to_mongo(data):
    """Insert one crawled record into MongoDB.

    Args:
        data: dict with 'title', 'url' and 'images' keys.

    Returns:
        True if the write was acknowledged, otherwise False.
    """
    # Collection.insert() was removed in PyMongo 4; insert_one() is the
    # supported replacement.
    result = db[MONGO_TABLE].insert_one(data)
    if result.acknowledged:
        print(' 成功保存 ' + data['title'])
        return True
    return False


def main(offset):
    """Crawl one search-result page: fetch the index, then every gallery on it.

    Args:
        offset: paging offset for the Toutiao search API (a multiple of 20;
            supplied as an int by the process pool).
    """
    index_data = get_page_index(offset, KEYWORDS)
    if index_data is None:
        # offset is an int: format it instead of concatenating with "+",
        # which raised TypeError in the original.
        print('offset:{} 异常 '.format(offset))
        return
    # Walk every detail-page URL found on the listing page.
    for url in parse_page_index(index_data):
        detail_data = get_page_detail(url)
        if detail_data is None:
            # Original used "%s" with str.format (never substituted) and then
            # fell through via "pass", passing None to parse_page_detail.
            print('url:{} 异常 '.format(url))
            continue
        data = parse_page_detail(detail_data, url)
        if data is None:
            continue
        save_to_mongo(data)


if __name__== '__main__' :
groups = [x* 20 for x in range (GROUP_START,GROUP_END+ 1 )]
pool = Pool()
pool.map(main, groups)
7、运行结果
Clipboard Image.png


Clipboard Image.png


  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值