python出现RuntimeError错误

最新推荐文章于 2024-08-18 16:00:08 发布

舔狗一无所有

最新推荐文章于 2024-08-18 16:00:08 发布

阅读量3.9w

点赞数 4

文章标签： python runtimeerror

本文链接：https://blog.csdn.net/weixin_42099082/article/details/89365643

版权

RuntimeError: 
        An attempt has been made to start a new process before the
        current process has finished its bootstrapping phase.

        This probably means that you are not using fork to start your
        child processes and you have forgotten to use the proper idiom
        in the main module:

            if __name__ == '__main__':
                freeze_support()
                ...

        The "freeze_support()" line can be omitted if the program
        is not going to be frozen to produce an executable.

上面是出现的错误解释

下面是出现错误代码的原代码

import multiprocessing as mp
import time
from urllib.request import urlopen,urljoin
from bs4 import BeautifulSoup
import re

base_url = "https://morvanzhou.github.io/"

#crawl爬取网页
def crawl(url):
    response = urlopen(url)
    time.sleep(0.1)
    return response.read().decode()

#parse解析网页
def parse(html):
    soup = BeautifulSoup(html,'html.parser')
    urls = soup.find_all('a',{"href":re.compile('^/.+?/$')})
    title = soup.find('h1').get_text().strip()
    page_urls = set([urljoin(base_url,url['href'])for url in urls])
    url = soup.find('meta',{'property':"og:url"})['content']
    return title,page_urls,url

unseen = set([base_url])
seen = set()
restricted_crawl = True

pool = mp.Pool(4)
count, t1 = 1, time.time()
while len(unseen) != 0:                 # still get some url to visit
    if restricted_crawl and len(seen) > 20:
            break
    print('\nDistributed Crawling...')
    crawl_jobs = [pool.apply_async(crawl, args=(url,)) for url in unseen]
    htmls = [j.get() for j in crawl_jobs]      # request connection

    print('\nDistributed Parsing...')
    parse_jobs = [pool.apply_async(parse, args=(html,)) for html in htmls]
    results = [j.get() for j in parse_jobs]    # parse html

    print('\nAnalysing...')
    seen.update(unseen)         # seen the crawled
    unseen.clear()              # nothing unseen

    for title, page_urls, url in results:
        print(count, title, url)
        count += 1
        unseen.update(page_urls - seen)     # get new url to crawl
print('Total time: %.1f s' % (time.time()-t1))    # 16 s !!!

这是修改后的正确代码

import multiprocessing as mp
import time
from urllib.request import urlopen,urljoin
from bs4 import BeautifulSoup
import re

base_url = "https://morvanzhou.github.io/"

#crawl爬取网页
def crawl(url):
    response = urlopen(url)
    time.sleep(0.1)
    return response.read().decode()

#parse解析网页
def parse(html):
    soup = BeautifulSoup(html,'html.parser')
    urls = soup.find_all('a',{"href":re.compile('^/.+?/$')})
    title = soup.find('h1').get_text().strip()
    page_urls = set([urljoin(base_url,url['href'])for url in urls])
    url = soup.find('meta',{'property':"og:url"})['content']
    return title,page_urls,url

def main():
    unseen = set([base_url])
    seen = set()
    restricted_crawl = True

    pool = mp.Pool(4)
    count, t1 = 1, time.time()
    while len(unseen) != 0:                 # still get some url to visit
        if restricted_crawl and len(seen) > 20:
                break
        print('\nDistributed Crawling...')
        crawl_jobs = [pool.apply_async(crawl, args=(url,)) for url in unseen]
        htmls = [j.get() for j in crawl_jobs]      # request connection

        print('\nDistributed Parsing...')
        parse_jobs = [pool.apply_async(parse, args=(html,)) for html in htmls]
        results = [j.get() for j in parse_jobs]    # parse html

        print('\nAnalysing...')
        seen.update(unseen)         # seen the crawled
        unseen.clear()              # nothing unseen

        for title, page_urls, url in results:
            print(count, title, url)
            count += 1
            unseen.update(page_urls - seen)     # get new url to crawl
    print('Total time: %.1f s' % (time.time()-t1))    # 16 s !!!


if __name__ == '__main__':
    main()

综上可知，就是把你的运行代码整合成一个函数，然后加入