Toutiao (今日头条) is a site that loads its page content dynamically, so crawling it directly with requests will not return the content we want. For sites like this, the usual approach is to analyze the Ajax requests the page makes and capture those responses to get the data.
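To see the problem concretely, here is a minimal check. This is only a sketch: the search URL is an assumption about how the site was laid out at the time, and the Ajax endpoint we actually use is derived below.

import requests

# Fetching the rendered search page directly: the article data is not in the
# initial HTML; it is filled in afterwards by Ajax (XHR) calls that show up
# in the browser's Network panel.
html = requests.get('http://www.toutiao.com/search/?keyword=街拍').text
print('article_url' in html)  # expect False - the results arrive via Ajax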
As usual, let's start by listing the libraries we need to import:
import json
import os
from urllib.parse import urlencode
import pymongo
import requests
from bs4 import BeautifulSoup
from requests.exceptions import ConnectionError
import re
from multiprocessing import Pool
from hashlib import md5
from json.decoder import JSONDecodeError
from config import *
By inspecting the Network panel in the browser's developer tools, we find the request parameters; we then encode them with urlencode to build the URL we want and send a request to it:
data = {
    'autoload': 'true',
    'count': 20,
    'cur_tab': 3,
    'format': 'json',
    'keyword': keyword,
    'offset': offset,
}
params = urlencode(data)
base = 'http://www.toutiao.com/search_content/'
url = base + '?' + params
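For example, with keyword set to '街拍' and offset set to 0 (sample values), and on Python 3.6+ where dict insertion order is preserved, the assembled url comes out as:

print(url)
# http://www.toutiao.com/search_content/?autoload=true&count=20&cur_tab=3&format=json&keyword=%E8%A1%97%E6%8B%8D&offset=0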
Next we parse the detail page to extract the images, which are the content we are after. Here we use a regular expression with the re module, written after studying the page source:
images_pattern = re.compile(r'gallery: JSON\.parse\("(.*?)"\)', re.S)
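To illustrate what the pattern captures, here is a small sketch. sample_html is a made-up fragment shaped like the real page (the actual markup may differ): the gallery JSON is embedded as a JavaScript-escaped string, so we strip the backslashes before decoding it.

import json
import re

images_pattern = re.compile(r'gallery: JSON\.parse\("(.*?)"\)', re.S)
# Hypothetical page fragment with the gallery JSON embedded as an escaped string
sample_html = 'gallery: JSON.parse("{\\"sub_images\\": [{\\"url\\": \\"http://p3.pstatp.com/origin/abc\\"}]}")'
match = images_pattern.search(sample_html)
if match:
    # Remove the JavaScript escaping, then decode the JSON
    data = json.loads(match.group(1).replace('\\', ''))
    print(data['sub_images'][0]['url'])  # http://p3.pstatp.com/origin/abc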
Finally, we put everything together into a complete script, which gets us the content we want. The full code is as follows:
import json
import os
from urllib.parse import urlencode
import pymongo
import requests
from bs4 import BeautifulSoup
from requests.exceptions import ConnectionError
import re
from multiprocessing import Pool
from hashlib import md5
from json.decoder import JSONDecodeError
from config import *
# MongoDB storage configuration
client = pymongo.MongoClient(MONGO_URL, connect=False)
db = client[MONGO_DB]
# Build the index-page URL and send the request
def get_page_index(offset, keyword):
    # Request parameters taken from the Network panel
    data = {
        'autoload': 'true',
        'count': 20,
        'cur_tab': 3,
        'format': 'json',
        'keyword': keyword,
        'offset': offset,
    }
    params = urlencode(data)
    base = 'http://www.toutiao.com/search_content/'
    # Join the base and the encoded params into the final URL
    url = base + '?' + params
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        print('Error occurred')
        return None
# Download a single image
def download_image(url):
    print('Downloading', url)
    try:
        response = requests.get(url)
        if response.status_code == 200:
            # Hand the raw bytes to the save method
            save_image(response.content)
        return None
    except ConnectionError:
        return None
# Save an image to disk
def save_image(content):
    # Save into the current working directory, naming the file
    # with the MD5 hash of the image content
    file_path = '{0}/{1}.{2}'.format(os.getcwd(), md5(content).hexdigest(), 'jpg')
    print(file_path)
    # Skip files we already have (deduplication)
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)
# Parse the index page, extracting article URLs from the JSON response
def parse_page_index(text):
    try:
        data = json.loads(text)
        if data and 'data' in data.keys():
            for item in data.get('data'):
                yield item.get('article_url')
    except JSONDecodeError:
        pass
# Fetch a detail page
def get_page_detail(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        print('Error occurred')
        return None
# Parse a detail page and download its images
def parse_page_detail(html, url):
    soup = BeautifulSoup(html, 'lxml')
    result = soup.select('title')
    title = result[0].get_text() if result else ''
    # Regular expression that captures the gallery JSON embedded in the page
    images_pattern = re.compile(r'gallery: JSON\.parse\("(.*?)"\)', re.S)
    result = re.search(images_pattern, html)
    if result:
        # Strip the JavaScript escaping before decoding the JSON
        data = json.loads(result.group(1).replace('\\', ''))
        if data and 'sub_images' in data.keys():
            sub_images = data.get('sub_images')
            images = [item.get('url') for item in sub_images]
            for image in images:
                download_image(image)
            return {
                'title': title,
                'url': url,
                'images': images
            }
# Save a result into the MongoDB collection
def save_to_mongo(result):
    if db[MONGO_TABLE].insert_one(result):
        print('Successfully Saved to Mongo', result)
        return True
    return False
def main(offset):
    text = get_page_index(offset, KEYWORD)
    if text is None:
        return
    urls = parse_page_index(text)
    for url in urls:
        html = get_page_detail(url)
        if html:
            result = parse_page_detail(html, url)
            if result:
                save_to_mongo(result)
if __name__ == '__main__':
    pool = Pool()
    # Offsets step by 20, matching the page size of the Ajax interface
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    pool.map(main, groups)
    pool.close()
    pool.join()
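One last note: the script pulls its settings from a config.py via from config import *. A minimal sketch of that file is below; the names are exactly the ones the script uses, but the concrete values are assumptions to adjust as needed.

# config.py
MONGO_URL = 'localhost'   # MongoDB host (assumes a local instance)
MONGO_DB = 'toutiao'      # database name (example value)
MONGO_TABLE = 'toutiao'   # collection name (example value)
KEYWORD = '街拍'           # search keyword passed to get_page_index()
GROUP_START = 1           # first offset group; each group covers 20 results
GROUP_END = 20            # last offset group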