Scraping Toutiao Street Snap Galleries
I recently worked through some of the hands-on crawler projects taught by Cui Qingcai (崔庆才), and in this post I'll walk through the workflow for scraping Toutiao's street snap (街拍) image galleries.
-
Before starting, install the requests, BeautifulSoup (beautifulsoup4), and pymongo libraries, as well as MongoDB itself.
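The Python dependencies can be installed with pip; lxml is included here because BeautifulSoup is used with the lxml parser later in this post:
pip install requests beautifulsoup4 pymongo lxml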
-
After analyzing the Toutiao street snap gallery page, we can start crawling the gallery list. Inspecting the Ajax requests shows that the offset and count parameters control paging: each page returns 20 items, so incrementing offset by 20 fetches the next page. The code is as follows:
def get_page(offset, keyword):
    # Query parameters for the gallery search Ajax endpoint
    data = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': 3,
        'from': 'gallery'
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(data)
    try:
        req = requests.get(url)
        if req.status_code == 200:
            return req.text
        else:
            return None
    except RequestException as e:
        print('Failed to fetch the list page:', e)
        return None
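As a quick sketch of how the paging works, the first few pages for the 街拍 keyword could be fetched like this (get_page returns the raw JSON text, or None on failure):
for offset in (0, 20, 40):
    html = get_page(offset, '街拍')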
- Analyzing the returned data shows that the URL of each street snap gallery lives in the article_url field of each entry under data, which can be extracted as follows:
def parse_page(html):
    data = json.loads(html)
    if data and 'data' in data.keys():
        for item in data.get('data'):
            yield item.get('article_url')
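For reference, the part of the JSON response this generator relies on looks roughly like this (a simplified sketch inferred from the fields accessed above; the real response carries many more fields per entry):
{
    "data": [
        {"article_url": "https://www.toutiao.com/..."},
        ...
    ]
}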
- Next, each gallery's detail page can be fetched from the URLs obtained above. Here a headers parameter must be passed to get around the site's anti-scraping checks:
def get_img(imgurl):
    # Browser-like headers; without them Toutiao rejects the request
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cache-control': 'max-age=0',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'
    }
    try:
        req = requests.get(imgurl, headers=headers)
        if req.status_code == 200:
            return req.text
        else:
            return None
    except RequestException as e:
        print('Failed to fetch the detail page:', e)
        return None
- The detail page embeds the gallery data inside a JavaScript variable rather than clean HTML, so a regular expression is used to pull out the image links:
def parse_img(html, url):
    bs = BeautifulSoup(html, 'lxml')
    title = bs.select('title')[0].get_text()
    imgs_pattern = re.compile(r'.*?gallery:.*?\((.*?)\)', re.S)
    # imgs_pattern = re.compile(r'.*?gallery:.*?\("(.*?)"\)', re.S)  # alternative approach
    result = re.search(imgs_pattern, html)
    if result:
        data = eval(result.group(1))  # group(1) is the quoted string inside the parentheses; eval unwraps it into a plain string
        # data = re.sub(r'\\', '', result.group(1))  # alternative approach: strip the escaping backslashes instead
        data = json.loads(data)
        if data and 'sub_images' in data.keys():
            sub_images = data.get('sub_images')
            imgs = [item.get('url') for item in sub_images]
            return {
                'title': title,
                'url': url,
                'imgs': imgs
            }
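To see what the regular expression is matching against, the relevant fragment of the detail page source looks roughly like this (an illustrative reconstruction based on the pattern and parsing steps above, not a verbatim copy of the page):
gallery: JSON.parse("{\"sub_images\": [{\"url\": \"http://...\"}, ...]}"),
The first pattern captures everything between the parentheses, quotes included, which is why eval (or stripping the escaping backslashes, in the commented-out variant) is needed before json.loads can parse the payload.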
- With the image URLs in hand, the images can be downloaded and saved locally:
def download_img(url):
    print('Downloading:', url)
    try:
        req = requests.get(url)
        if req.status_code == 200:
            save_img(req.content)
        else:
            return None
    except RequestException as e:
        print('Failed to fetch the image:', e)
        return None

def save_img(content):
    os.makedirs('./imgs', exist_ok=True)  # make sure the target directory exists before writing
    path = './imgs/{0}.{1}'.format(md5(content).hexdigest(), 'jpg')  # name each file by its MD5 hash to avoid saving duplicates
    if not os.path.exists(path):
        with open(path, 'wb') as f:
            f.write(content)
- The gallery title, URL, and image URLs can also be saved to MongoDB using the pymongo library. First set up the MongoDB settings in a configuration file (config.py, which the spider imports below) and start the MongoDB service:
MONGO_URL = 'localhost'  # MongoDB host
MONGO_DB = 'toutiao'     # database name
MONGO_TABLE = 'toutiao'  # collection name
GROUP_START = 1          # first page group to crawl (offset = GROUP_START * 20)
GROUP_END = 20           # last page group to crawl
KEYWORDS = '街拍'         # search keyword (street snap)
- These settings can then be imported into the project to handle the MongoDB storage.
- Create the MongoDB client and database objects:
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]
- Save the data:
def save_to_mongo(result):
    if db[MONGO_TABLE].insert_one(result):  # insert() is deprecated; use insert_one() instead
        print('Saved to MongoDB!')
        return True
    return False
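Note that insert_one() returns an InsertOneResult object, which is always truthy, so the check above succeeds whenever no exception is raised. A stricter version could inspect the result's acknowledged attribute (a standard pymongo field) instead:
def save_to_mongo(result):
    res = db[MONGO_TABLE].insert_one(result)
    if res.acknowledged:
        print('Saved to MongoDB!')
        return True
    return False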
- Downloading many pages of galleries with a single thread is slow, so a process pool is used to speed up the crawl (Pool() defaults to one worker per CPU core):
groups = [i * 20 for i in range(GROUP_START, GROUP_END + 1)]
pool = Pool()
pool.map(main, groups)
pool.close()
pool.join()
- When crawling with multiple processes, a warning appears:
UserWarning: MongoClient opened before fork. Create MongoClient only after forking. See PyMongo's documentation for details: http://api.mongodb.org/python/current/faq.html#is-pymongo-fork-safe
"MongoClient opened before fork. Create MongoClient only "
- The fix is to give each process its own MongoDB connection, which is done by passing the connect parameter to MongoClient: connect=False defers the actual connection until the first operation, i.e. until after the fork.
client = pymongo.MongoClient(MONGO_URL, connect=False)
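Alternatively, PyMongo's fork-safety FAQ (linked in the warning above) recommends creating the client only after forking, for example by building it inside each worker rather than at module import time. A minimal sketch of that approach:
def get_db():
    # one client per worker process, created after the fork
    client = pymongo.MongoClient(MONGO_URL)
    return client[MONGO_DB]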
The complete implementation is as follows:
# -*- coding: UTF-8 -*-
from urllib.parse import urlencode
import requests
from requests.exceptions import RequestException
import re
from bs4 import BeautifulSoup
import json
from config import *
import pymongo
from hashlib import md5
import os
from multiprocessing import Pool

client = pymongo.MongoClient(MONGO_URL, connect=False)
db = client[MONGO_DB]

def get_page(offset, keyword):
    data = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': 3,
        'from': 'gallery'
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(data)
    try:
        req = requests.get(url)
        if req.status_code == 200:
            return req.text
        else:
            return None
    except RequestException as e:
        print('Failed to fetch the list page:', e)
        return None

def parse_page(html):
    data = json.loads(html)
    if data and 'data' in data.keys():
        for item in data.get('data'):
            yield item.get('article_url')

def get_img(imgurl):
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cache-control': 'max-age=0',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'
    }
    try:
        req = requests.get(imgurl, headers=headers)
        if req.status_code == 200:
            return req.text
        else:
            return None
    except RequestException as e:
        print('Failed to fetch the detail page:', e)
        return None

def parse_img(html, url):
    bs = BeautifulSoup(html, 'lxml')
    title = bs.select('title')[0].get_text()
    imgs_pattern = re.compile(r'.*?gallery:.*?\((.*?)\)', re.S)
    # imgs_pattern = re.compile(r'.*?gallery:.*?\("(.*?)"\)', re.S)  # alternative approach
    result = re.search(imgs_pattern, html)
    if result:
        data = eval(result.group(1))  # group(1) is the quoted string inside the parentheses; eval unwraps it
        # data = re.sub(r'\\', '', result.group(1))  # alternative: strip the escaping backslashes instead
        data = json.loads(data)
        if data and 'sub_images' in data.keys():
            sub_images = data.get('sub_images')
            imgs = [item.get('url') for item in sub_images]
            for img in imgs:
                download_img(img)
            return {
                'title': title,
                'url': url,
                'imgs': imgs
            }

def save_to_mongo(result):
    if db[MONGO_TABLE].insert_one(result):
        print('Saved to MongoDB!')
        return True
    return False

def download_img(url):
    print('Downloading:', url)
    try:
        req = requests.get(url)
        if req.status_code == 200:
            save_img(req.content)
        else:
            return None
    except RequestException as e:
        print('Failed to fetch the image:', e)
        return None

def save_img(content):
    os.makedirs('./imgs', exist_ok=True)  # make sure the target directory exists
    path = './imgs/{0}.{1}'.format(md5(content).hexdigest(), 'jpg')
    if not os.path.exists(path):
        with open(path, 'wb') as f:
            f.write(content)

def main(offset):
    html = get_page(offset, KEYWORDS)
    for url in parse_page(html):
        # article_url may be missing, so guard against empty values
        if url:
            html = get_img(url)
            if html:
                result = parse_img(html, url)
                if result:
                    save_to_mongo(result)

if __name__ == '__main__':
    groups = [i * 20 for i in range(GROUP_START, GROUP_END + 1)]
    pool = Pool()
    pool.map(main, groups)
    pool.close()
    pool.join()
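To run the spider, save the script (for example as spider.py; the name is just a convention, though the import above does expect the settings to live in config.py alongside it), start the MongoDB service, and launch it from the command line:
python spider.py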