# Source code:
import requests # 爬取页面
import logging # 输出信息
import re # 正则表达式
import pymongo # 存储数据
from pyquery import PyQuery as pq # 直接解析网页
from urllib.parse import urljoin # URL拼接
import multiprocessing
# Log INFO and above, prefixed with timestamp and level name.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s: %(message)s',
)

# Scraper settings.
# NOTE(review): presumably BASE_URL is the site root that page URLs are joined
# onto and TOTAL_PAGE is the number of list pages to crawl - confirm against
# the code that consumes them.
BASE_URL = 'https://static1.scrape.cuiqingcai.com'
TOTAL_PAGE = 10

# MongoDB settings.
MONGO_CONNECTION_STRING = 'mongodb://localhost:27017'
MONGO_DB_NAME = 'movies'  # database name
# Collection name. This constant used to read 'MOVIES' while the handle below
# was hard-coded to 'movies'. MongoDB collection names are case-sensitive, so
# the constant now carries the name that was actually in use and the script
# keeps targeting the same collection as before.
MONGO_COLLECTION_NAME = 'movies'

# Module-level MongoDB handles, built from the constants above so that the
# configuration has a single source of truth.
client = pymongo.MongoClient(MONGO_CONNECTION_STRING)
db = client[MONGO_DB_NAME]
collection = db[MONGO_COLLECTION_NAME]
def scrape_page(url):
"""列表爬取方法"""
logging.info('scraping %s...', url)
try:
response = requests.get(url)
if response.status_code == 200:
return response.text
except requests