My Python scraper keeps erroring out partway through and aborting with multiprocessing.pool.RemoteTraceback. I'm pasting the full source code below; any help would be appreciated, thanks!

Source code:

import requests  # fetch pages
import logging  # log progress and errors
import re  # regular expressions
import pymongo  # store the data
from pyquery import PyQuery as pq  # parse the HTML directly
from urllib.parse import urljoin  # join relative URLs
import multiprocessing

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s: %(message)s')

BASE_URL = 'https://static1.scrape.cuiqingcai.com'
TOTAL_PAGE = 10
MONGO_CONNECTION_STRING = 'mongodb://localhost:27017'
MONGO_DB_NAME = 'movies'  # database name
MONGO_COLLECTION_NAME = 'MOVIES'  # collection name

client = pymongo.MongoClient(MONGO_CONNECTION_STRING)
db = client['movies']
collection = db['movies']


def scrape_page(url):
    """列表爬取方法"""
    logging.info('scraping %s...', url)
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
    except requests.RequestException:
        logging.error('error occurred while scraping %s', url, exc_info=True)


def scrape_index(page):
    """列表页爬取方法"""
    index_url = f'{BASE_URL}/page/{page}'
    return scrape_page(index_url)


def parse_index(html):
    """解析列表页"""
    doc = pq(html)
    links = doc('.el-card .name')  # CSS selector
    for link in links.items():
        href = link.attr('href')
        detail_url = urljoin(BASE_URL, href)
        logging.info('get detail url %s', detail_url)
        yield detail_url


def scrape_detail(url):
    """Fetch a detail page"""
    return scrape_page(url)


def parse_detail(html):
    """Parse a detail page and extract the movie fields"""
    doc = pq(html)
    cover = doc('img.cover').attr('src')
    name = doc('a > h2').text()
    categories = [item.text() for item in doc('.categories button span').items()]
    published_at = doc('.info:contains(上映)').text()
    published_at = re.search(r'(\d{4}-\d{2}-\d{2})', published_at).group(1) \
        if published_at and re.search(r'\d{4}-\d{2}-\d{2}', published_at) else None
    drama = doc('.drama p').text()
    score = doc('p.score').text()
    score = float(score) if score else None
    return {
        'cover': cover,
        'name': name,
        'categories': categories,
        'published_at': published_at,
        'drama': drama,
        'score': score
    }


def save_data(data):
    """Upsert one movie record into MongoDB, keyed by name"""
    collection.update_one({'name': data.get('name')}, {'$set': data}, upsert=True)


def main(page):
    index_html = scrape_index(page)
    detail_urls = parse_index(index_html)
    for detail_url in detail_urls:
        detail_html = scrape_detail(detail_url)
        data = parse_detail(detail_html)
        logging.info('get detail data %s', data)
        logging.info('saving data to mongodb')
        save_data(data)
        logging.info('data saved successfully')


if __name__ == '__main__':
    pool = multiprocessing.Pool()
    pages = range(1, TOTAL_PAGE + 1)
    pool.map(main, pages)
    pool.close()
    pool.join()


It ran fine at first, but aborted before long.
The error message was:

multiprocessing.pool.RemoteTraceback: 
"""
Traceback (most recent call last):
  File "C:\Users\Administrator\AppData\Local\Programs\Python\Python38-32\lib\multiprocessing\pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
  File "C:\Users\Administrator\AppData\Local\Programs\Python\Python38-32\lib\multiprocessing\pool.py", line 48, in mapstar
    return list(map(*args))
  File "C:\Users\Administrator\PycharmProjects\20_Requests+Requery+MongoDB基本案例实战\3_多进程爬取.py", line 91, in main
    data = parse_detail(detail_html)
  File "C:\Users\Administrator\PycharmProjects\20_Requests+Requery+MongoDB基本案例实战\3_多进程爬取.py", line 56, in parse_detail
    doc = pq(html)
  File "C:\Users\Administrator\AppData\Local\Programs\Python\Python38-32\lib\site-packages\pyquery-1.4.1-py3.8.egg\pyquery\pyquery.py", line 267, in __init__
    raise TypeError(context)
TypeError: None
"""

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "C:/Users/Administrator/PycharmProjects/20_Requests+Requery+MongoDB基本案例实战/3_多进程爬取.py", line 102, in <module>
    pool.map(main, pages)
  File "C:\Users\Administrator\AppData\Local\Programs\Python\Python38-32\lib\multiprocessing\pool.py", line 364, in map
    return self._map_async(func, iterable, mapstar, chunksize).get()
  File "C:\Users\Administrator\AppData\Local\Programs\Python\Python38-32\lib\multiprocessing\pool.py", line 768, in get
    raise self._value
TypeError: None
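
Reading the traceback, the TypeError: None comes from pq(html) being called with None: scrape_page() returns None whenever the response status code is not 200 or a RequestException occurs (plausibly the server starts rejecting requests once several processes hit it concurrently), and main() passes that None straight into parse_detail(). Below is a minimal defensive sketch of main() that simply skips failed downloads; the guard clauses are an assumption about the intended behavior, not part of the original code:

def main(page):
    """Crawl one list page, skipping anything that fails to download"""
    index_html = scrape_index(page)
    if not index_html:  # scrape_page returned None: bad status or request error
        logging.warning('no HTML returned for page %s, skipping', page)
        return
    for detail_url in parse_index(index_html):
        detail_html = scrape_detail(detail_url)
        if not detail_html:  # same guard for detail pages
            logging.warning('no HTML returned for %s, skipping', detail_url)
            continue
        data = parse_detail(detail_html)
        logging.info('get detail data %s', data)
        logging.info('saving data to mongodb')
        save_data(data)
        logging.info('data saved successfully')

With guards like these, a failed download is logged and skipped instead of raising inside a worker, since any exception in a worker propagates through pool.map() and aborts the whole run.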