代码如下: # coding=gbk import requests from requests.exceptions import RequestException from urllib.parse import urlencode import json from bs4 import BeautifulSoup import re from config import * import pymongo import os from hashlib import md5 import time from json.decoder import JSONDecodeError from multiprocessing import Pool client=pymongo.MongoClient(MONGO_URL) db=client[MONGO_DB] headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1;Win64;x64;rv:66.0)Gecko/20100101 Firefox/66.0', } def get_page_index(offset,keyword,timestamp): data={ 'aid': '24', 'app_name':'web_search', 'offset':offset, 'format':'json', 'keyword':keyword, 'autoload':'true', 'count':'20', 'en_qc':'1', 'cur_tab':'1', 'from':'search_tab', 'pd':'synthesis', 'timestamp':timestamp } url = 'https://www.toutiao.com/api/search/content/?' + urlencode(data) try: response = requests.get(url,headers=headers) if response.status_code == 200: return response.text else: return None except RequestException: return None def parse_page_index(html): try: data = json.loads(html) if data and 'data' in data.keys(): for item in data.get('data'): yield item.get('article_url') except JSONDecodeError: pass def get_page_datail(url): try: response = requests.get(url,headers=headers) if response.status_code == 200: return response.text else: return None except RequestException: print('请求详情页出错',url) return None def parse_page_detail(html,url): images_pattern = re.compile('articleInfo:.*?title: \'(.*?)\'.*?content.*?\'(.*?)\'', re.S) result = re.search(images_pattern,html) if result: title = result.group(1) url_pattern = re.compile('"(http:.*?)"') img_url = re.findall(url_pattern, str(result.group(2))) if img_url: for img in img_url: download_image(img) data={ 'title':title, 'url':url, 'image':img } return data def save_to_mongo(result): if result: if db[MONGO_TABLE].insert(result): print('存储到mongoDB成功',result) else: return True return False def download_image(url): try: response = requests.get(url,headers=headers) if 
response.status_code == 200: save_image(response.content) else: return None except RequestException: print('请求图片出现错误',url) return None def save_image(content): file_path='{0}\\imgs\\{1}.{2}'.format(os.getcwd(),md5(content).hexdigest(),'jpg') if not os.path.exists(file_path): with open(file_path,'wb')as f: f.write(content) f.close() def main(): for i in range(5): offset=i*20 keyword='街拍' t = time.time() timestamp=lambda: int(round(t * 1000)) html=get_page_index(offset,keyword,timestamp()) for url in parse_page_index(html): print(url) html=get_page_datail(url) if html: result=parse_page_detail(html,url) save_to_mongo(result) main()
注意事项:
1,请求链接末尾的 timestamp 参数必须是当前时间的毫秒级时间戳,即 int(round(time.time() * 1000))。
2,client=pymongo.MongoClient(MONGO_URL) 这里的 MongoClient 不要引用错或者写错;另外要确认自己的 mongodb 服务已经开启,关于如何确认 mongodb 服务是否开启,请看链接 https://bbs.csdn.net/topics/390156764
3,关于 images_pattern = re.compile('articleInfo:.*?title: \'(.*?)\'.*?content.*?\'(.*?)\'', re.S):这个正则表达式是否写正确,以及网页的标签属性是否已经改变,需要自己去查看网页源代码来验证一下。
4,这个程序是单线程的,原因是自己还不会多线程;我看到别人用线程池(代码中导入的 multiprocessing.Pool 实际上是进程池),但我没运行起来,很抱歉。
爬取结果: