Create a new file, config.py:
MONGO_URL = 'localhost'      # MongoDB host
MONGO_DB = 'toutiao'         # database name
MONGO_TABLE = 'toutiao'      # collection name
GROUP_START = 1              # first page group to crawl
GROUP_END = 20               # last page group to crawl
KEYWORD = '街拍'             # search keyword ("street snap")
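Before moving on to the spider itself, it can help to see what the search request built from these parameters looks like. A throwaway sketch (not part of the project files) that simply prints the URL the spider will request for the first page:

from urllib.parse import urlencode

# Same query parameters the spider sends for the first page (offset 0)
data = {
    'offset': 0,
    'format': 'json',
    'keyword': '街拍',
    'autoload': 'true',
    'count': '20',
    'cur_tab': '3',
    'from': 'gallery',
}
print('https://www.toutiao.com/search_content/?' + urlencode(data))
# urlencode percent-encodes the keyword, e.g. keyword=%E8%A1%97%E6%8B%8D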
import json
import os
import re
from hashlib import md5
from json import JSONDecodeError
from multiprocessing import Pool
from urllib.parse import urlencode

import pymongo
import requests
from bs4 import BeautifulSoup
from requests import RequestException

from config import *  # import the constants defined in config.py

client = pymongo.MongoClient(MONGO_URL, connect=False)
db = client[MONGO_DB]


# Step 1: fetch the JSON data of the index (search results) page
def get_page_index(offset, keyword):
    data = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': '3',
        'from': 'gallery'
    }
    headers = {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)'}
    url = 'https://www.toutiao.com/search_content/?' + urlencode(data)
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('Error requesting index page')
        return None


# Parse the JSON from step 1 and yield the URL of each gallery detail page
def parse_page_index(html):
    try:
        data = json.loads(html)
        if data and 'data' in data.keys():  # make sure the 'data' field exists
            for item in data.get('data'):
                yield item.get('article_url')
    except JSONDecodeError:
        pass


# Step 2: fetch the detail page for a URL obtained in step 1
def get_page_detail(url):
    headers = {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)'}
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('Error requesting detail page', url)
        return None


# Parse the detail page from step 2
def parse_page_detail(html, url):
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:59.0) Gecko/20100101 Firefox/59.0'}
    soup = BeautifulSoup(html, 'html.parser')
    title = soup.select('title')[0].get_text()
    print(title)
    images_pattern = re.compile('var gallery = (.*?);', re.S)
    result = re.search(images_pattern, html)
    if result:
        data = json.loads(result.group(1))
        if data and 'sub_images' in data.keys():
            sub_images = data.get('sub_images')
            images = ['http:' + item.get('url') for item in sub_images]
            print(images)
            for image in images:  # separate loop variable so the images list is not overwritten
                download_image(image, headers, title)
            return {
                'title': title,
                'images': images,
                'url': url,
            }


# Save the parsed result to MongoDB
def save_to_mongo(result):
    if db[MONGO_TABLE].insert_one(result):
        print('Saved to MongoDB', result)
        return True
    return False


# Download a single image
def download_image(url, headers, title):
    print('Downloading:', url)
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            save_image(response.content, title)
        return None
    except RequestException:
        print('Error requesting image', url)
        return None


def save_image(content, title):
    if not os.path.exists(title):  # create the folder if it does not exist
        os.mkdir(title)
    file_path = '{0}/{1}.{2}'.format(title, md5(content).hexdigest(), 'jpg')
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)


# offset is the paging offset to crawl; KEYWORD is the search term
def main(offset):
    html = get_page_index(offset, KEYWORD)
    for url in parse_page_index(html):
        html = get_page_detail(url)
        if html:
            result = parse_page_detail(html, url)
            if result:
                save_to_mongo(result)


if __name__ == '__main__':
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    pool = Pool()
    pool.map(main, groups)
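After a run finishes, the stored results can be inspected directly in MongoDB. A minimal sketch, assuming MongoDB is running locally and pymongo 3.7+ is installed (count_documents requires 3.7+); it is only a sanity check, not part of the spider:

import pymongo

from config import MONGO_URL, MONGO_DB, MONGO_TABLE

client = pymongo.MongoClient(MONGO_URL)
collection = client[MONGO_DB][MONGO_TABLE]

# How many galleries were stored, plus a peek at the first few documents
print(collection.count_documents({}))
for doc in collection.find().limit(3):
    print(doc['title'], len(doc['images']), doc['url'])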