本次练习使用到的知识点有
* Requests 库的使用
* BeautifulSoup 库的使用
* 正则表达式的使用
* pymongo 库的使用
1、项目流程分析
2、中心调度
# Central scheduler: fetch one list page at the given offset, then crawl,
# parse and persist every article detail page it links to.
def main(offset):
    # Fetch the list (index) page for this paging offset.
    index_data = get_page_index(offset, KEYWORDS)
    if index_data is None:
        # BUG FIX: offset is an int; the original "offset:" + offset raised
        # TypeError instead of printing the error message.
        print('offset:{} 异常'.format(offset))
        return
    # Parse the list page and visit each detail-page URL it contains.
    for url in parse_page_index(index_data):
        detail_data = get_page_detail(url)
        if detail_data is None:
            # BUG FIX: '%s' is printf-style and is never substituted by
            # str.format; also the original `pass` fell through and handed
            # None to parse_page_detail — skip to the next URL instead.
            print('url:{} 异常'.format(url))
            continue
        data = parse_page_detail(detail_data, url)
        if data is None:
            continue
        save_to_mongo(data)
3、请求和解析列表页
# Request one page of the Toutiao search-result list.
def get_page_index(offset, keywords):
    """Fetch the search-result list page.

    offset: paging offset (multiples of 20).
    keywords: the search keyword string.
    Returns the raw response text, or None on a non-200 status or request error.
    """
    params = {
        'offset': offset,
        'format': 'json',
        # BUG FIX: the original ignored the `keywords` parameter and always
        # read the module-level KEYWORDS constant.
        'keyword': keywords,
        'cur_tab': 3,
        'autoload': 'true',
        'count': 20,
    }
    try:
        response = requests.get('http://www.toutiao.com/search_content/',
                                params=params)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        # Network-level failure: signal "no data" to the caller.
        return None
# Parse the JSON list page and yield the article URL of each entry.
def parse_page_index(text):
    """Generator over the 'article_url' field of every item in the
    response's 'data' list; yields nothing on malformed JSON."""
    try:
        payload = json.loads(text)
    except JSONDecodeError:
        print('解析异常')
        return
    if payload and 'data' in payload.keys():
        for entry in payload.get('data'):
            yield entry.get('article_url')
4、请求和解析详情页
# Parse an article detail page: extract the title and gallery image URLs,
# download each image, and build the record to be stored.
def parse_page_detail(html, url):
    """Return {'title', 'url', 'images'} when a gallery is present, else None.

    html: raw page HTML (may be None when the fetch failed).
    url: the page's own URL, stored alongside the parsed data.
    """
    # Robustness: get_page_detail may return None; don't crash BeautifulSoup.
    if not html:
        return None
    soup = BeautifulSoup(html, 'lxml')
    # Robustness: pages without a <title> tag made soup.title.string raise
    # AttributeError in the original.
    title = soup.title.string if soup.title else None
    # The gallery data is embedded as a JS variable in the page source.
    image_pattern = re.compile('var gallery = (.*?);', re.S)
    result = image_pattern.search(html)
    if result is None:
        return None
    try:
        data = json.loads(result.group(1))
    except JSONDecodeError:
        return None
    if data and 'sub_images' in data.keys():
        images = [item.get('url') for item in data.get('sub_images')]
        # Side effect: download every image in the gallery.
        for image in images:
            download_image(image)
        return {'title': title, 'url': url, 'images': images}
    return None
5、下载图片和保存至Mongodb
# Fetch one image by URL and hand its raw bytes to save_image.
def download_image(url):
    """Best-effort download: request errors are reported and swallowed."""
    try:
        print('图片' + url + '正在下载')
        resp = requests.get(url)
        if resp.status_code != 200:
            return
        # Persist the binary payload.
        save_image(resp.content)
    except RequestException:
        print('异常image:' + url)
# Persist raw image bytes under <cwd>/images/<md5>.jpg.
def save_image(content):
    """Write `content` to a file named by its MD5 hash (dedupes identical
    images); skips the write when the file already exists."""
    file_path = '{0}/images/{1}.{2}'.format(
        os.getcwd(), md5(content).hexdigest(), 'jpg')
    # BUG FIX: the images/ directory was never created, so the first write
    # failed with FileNotFoundError on a fresh checkout.
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    if not os.path.exists(file_path):
        # `with` closes the file; the original's explicit close() was redundant.
        with open(file_path, 'wb') as file:
            file.write(content)
# Insert one parsed article record into MongoDB.
def save_to_mongo(data):
    """Return True when the insert succeeded, False otherwise.

    Uses the module-level `db` handle and MONGO_TABLE setting.
    """
    # FIX: Collection.insert() is deprecated (removed in pymongo 4);
    # insert_one() is the supported API and its result is truthy on success.
    if db[MONGO_TABLE].insert_one(data):
        print('成功保存' + data['title'])
        return True
    return False
6、完整代码
#!/usr/bin/python
# -*- coding: utf-8 -*-
import
os
import
re
import
requests
import
pymongo
import
json
from
hashlib
import
md5
from
bs4
import
BeautifulSoup
from
setting
import
*
from
requests.exceptions
import
RequestException
from
json.decoder
import
JSONDecodeError
from
multiprocessing
import
Pool
# Module-level MongoDB handles shared by save_to_mongo.
# MONGO_URL / MONGO_DB come from the wildcard `setting` import above.
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]
# Request one page of the Toutiao search-result list.
def get_page_index(offset, keywords):
    """Fetch the search-result list page.

    offset: paging offset (multiples of 20).
    keywords: the search keyword string.
    Returns the raw response text, or None on a non-200 status or request error.
    """
    params = {
        'offset': offset,
        'format': 'json',
        # BUG FIX: the original ignored the `keywords` parameter and always
        # read the module-level KEYWORDS constant.
        'keyword': keywords,
        'cur_tab': 3,
        'autoload': 'true',
        'count': 20,
    }
    try:
        response = requests.get('http://www.toutiao.com/search_content/',
                                params=params)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        # Network-level failure: signal "no data" to the caller.
        return None
# Parse the JSON list page and yield the article URL of each entry.
def parse_page_index(text):
    """Generator over the 'article_url' field of every item in the
    response's 'data' list; yields nothing on malformed JSON."""
    try:
        payload = json.loads(text)
    except JSONDecodeError:
        print('解析异常')
        return
    if payload and 'data' in payload.keys():
        for entry in payload.get('data'):
            yield entry.get('article_url')
# Request the raw HTML of an article detail page.
def get_page_detail(url):
    """Return the response body for `url`, or None on error / non-200 status."""
    try:
        # BUG FIX: the request was issued OUTSIDE the try block, so a
        # connection failure escaped instead of being caught below.
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None
# Parse an article detail page: extract the title and gallery image URLs,
# download each image, and build the record to be stored.
def parse_page_detail(html, url):
    """Return {'title', 'url', 'images'} when a gallery is present, else None.

    html: raw page HTML (may be None when the fetch failed).
    url: the page's own URL, stored alongside the parsed data.
    """
    # Robustness: get_page_detail may return None; don't crash BeautifulSoup.
    if not html:
        return None
    soup = BeautifulSoup(html, 'lxml')
    # Robustness: pages without a <title> tag made soup.title.string raise
    # AttributeError in the original.
    title = soup.title.string if soup.title else None
    # The gallery data is embedded as a JS variable in the page source.
    image_pattern = re.compile('var gallery = (.*?);', re.S)
    result = image_pattern.search(html)
    if result is None:
        return None
    try:
        data = json.loads(result.group(1))
    except JSONDecodeError:
        return None
    if data and 'sub_images' in data.keys():
        images = [item.get('url') for item in data.get('sub_images')]
        # Side effect: download every image in the gallery.
        for image in images:
            download_image(image)
        return {'title': title, 'url': url, 'images': images}
    return None
# Fetch one image by URL and hand its raw bytes to save_image.
def download_image(url):
    """Best-effort download: request errors are reported and swallowed."""
    try:
        print('图片' + url + '正在下载')
        resp = requests.get(url)
        if resp.status_code != 200:
            return
        # Persist the binary payload.
        save_image(resp.content)
    except RequestException:
        print('异常image:' + url)
# Persist raw image bytes under <cwd>/images/<md5>.jpg.
def save_image(content):
    """Write `content` to a file named by its MD5 hash (dedupes identical
    images); skips the write when the file already exists."""
    file_path = '{0}/images/{1}.{2}'.format(
        os.getcwd(), md5(content).hexdigest(), 'jpg')
    # BUG FIX: the images/ directory was never created, so the first write
    # failed with FileNotFoundError on a fresh checkout.
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    if not os.path.exists(file_path):
        # `with` closes the file; the original's explicit close() was redundant.
        with open(file_path, 'wb') as file:
            file.write(content)
# Insert one parsed article record into MongoDB.
def save_to_mongo(data):
    """Return True when the insert succeeded, False otherwise.

    Uses the module-level `db` handle and MONGO_TABLE setting.
    """
    # FIX: Collection.insert() is deprecated (removed in pymongo 4);
    # insert_one() is the supported API and its result is truthy on success.
    if db[MONGO_TABLE].insert_one(data):
        print('成功保存' + data['title'])
        return True
    return False
# Central scheduler: fetch one list page at the given offset, then crawl,
# parse and persist every article detail page it links to.
def main(offset):
    # Fetch the list (index) page for this paging offset.
    index_data = get_page_index(offset, KEYWORDS)
    if index_data is None:
        # BUG FIX: offset is an int; the original "offset:" + offset raised
        # TypeError instead of printing the error message.
        print('offset:{} 异常'.format(offset))
        return
    # Parse the list page and visit each detail-page URL it contains.
    for url in parse_page_index(index_data):
        detail_data = get_page_detail(url)
        if detail_data is None:
            # BUG FIX: '%s' is printf-style and is never substituted by
            # str.format; also the original `pass` fell through and handed
            # None to parse_page_detail — skip to the next URL instead.
            print('url:{} 异常'.format(url))
            continue
        data = parse_page_detail(detail_data, url)
        if data is None:
            continue
        save_to_mongo(data)
if __name__ == '__main__':
    # One offset per page of 20 results, covering GROUP_START..GROUP_END.
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    pool = Pool()
    pool.map(main, groups)
    # FIX: shut the pool down cleanly once all offsets have been crawled;
    # the original leaked the worker processes.
    pool.close()
    pool.join()
7、运行结果