[转载] python出现RuntimeError错误，亲测有效

最新推荐文章于 2024-07-18 19:59:39 发布

ey_snail

最新推荐文章于 2024-07-18 19:59:39 发布

阅读量2k

点赞数

原文链接：https://blog.csdn.net/weixin_40757206/article/details/110765786?ops_request_misc=%25257B%252522request%25255Fid%252522%25253A%252522160920751016780299082045%252522%25252C%252522scm%252522%25253A%25252220140713.130102334.pc%25255Fall.%252522%25257D&request

版权

参考链接： Python中的NZEC错误

RuntimeError:
        An attempt has been made to start a new process before the
        current process has finished its bootstrapping phase.

        This probably means that you are not using fork to start your
        child processes and you have forgotten to use the proper idiom
        in the main module:

            if __name__ == '__main__':
                freeze_support()
                ...

        The "freeze_support()" line can be omitted if the program
        is not going to be frozen to produce an executable.

上面是出现的错误解释

下面是出现错误代码的原代码

import multiprocessing as mp
import re
import time
# urljoin is documented in urllib.parse; urllib.request only happens to
# re-export it, so import it from its real home.
from urllib.parse import urljoin
from urllib.request import urlopen

from bs4 import BeautifulSoup

# Root of the site to crawl; site-relative links found on a page are
# joined onto this URL.
base_url = "https://morvanzhou.github.io/"

# crawl: download a web page
def crawl(url):
    """Fetch *url* and return its body decoded as text.

    Args:
        url: Address of the page to download.

    Returns:
        The response body decoded with the default codec (UTF-8).

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # Close the response deterministically instead of leaking the socket.
    with urlopen(url) as response:
        body = response.read()
    # Short pause after every request so the crawl does not hammer the server.
    time.sleep(0.1)
    return body.decode()

# parse: extract data from a downloaded page
def parse(html):
    """Extract the title, the outgoing links and the page's own URL.

    Args:
        html: Markup of a page, as returned by ``crawl``.

    Returns:
        A ``(title, page_urls, url)`` tuple: the stripped text of the first
        ``<h1>``, the set of absolute URLs built from every ``<a>`` whose
        ``href`` starts and ends with ``/``, and the ``content`` of the
        ``og:url`` ``<meta>`` tag.

    Note:
        No fallback is provided for pages lacking an ``<h1>`` or an
        ``og:url`` meta tag; the lookup fails with an exception in that case.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # Only site-relative links of the form "/section/.../" are followed.
    links = soup.find_all('a', {"href": re.compile('^/.+?/$')})
    title = soup.find('h1').get_text().strip()
    page_urls = {urljoin(base_url, link['href']) for link in links}
    url = soup.find('meta', {'property': "og:url"})['content']
    return title, page_urls, url

# NOTE: this listing is the version that FAILS. The pool is created at module
# level with no `if __name__ == '__main__':` guard, so when child processes
# are not started with fork they re-import this module, reach mp.Pool() again
# and raise the RuntimeError quoted above.
unseen = set([base_url])    # URLs still waiting to be crawled
seen = set()                # URLs already crawled
restricted_crawl = True     # stop early once more than 20 pages were seen

# Worker count restored as 4: the digit was lost from the published listing
# (TODO confirm against the original article).
pool = mp.Pool(4)
count, t1 = 1, time.time()
while len(unseen) != 0:     # still get some url to visit
    if restricted_crawl and len(seen) > 20:
        break
    print('\nDistributed Crawling...')
    crawl_jobs = [pool.apply_async(crawl, args=(url,)) for url in unseen]
    htmls = [j.get() for j in crawl_jobs]       # request connection

    print('\nDistributed Parsing...')
    parse_jobs = [pool.apply_async(parse, args=(html,)) for html in htmls]
    results = [j.get() for j in parse_jobs]     # parse html

    print('\nAnalysing...')
    seen.update(unseen)     # seen the crawled
    unseen.clear()          # nothing unseen

    for title, page_urls, url in results:
        print(count, title, url)
        count += 1
        unseen.update(page_urls - seen)     # get new url to crawl
print('Total time: %.1f s' % (time.time()-t1))      # 16 s !!!

这是修改后的正确代码

import multiprocessing as mp
import re
import time
# urljoin is documented in urllib.parse; urllib.request only happens to
# re-export it, so import it from its real home.
from urllib.parse import urljoin
from urllib.request import urlopen

from bs4 import BeautifulSoup

# Root of the site to crawl; site-relative links found on a page are
# joined onto this URL.
base_url = "https://morvanzhou.github.io/"

# crawl: download a web page
def crawl(url):
    """Fetch *url* and return its body decoded as text.

    Args:
        url: Address of the page to download.

    Returns:
        The response body decoded with the default codec (UTF-8).

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # Close the response deterministically instead of leaking the socket.
    with urlopen(url) as response:
        body = response.read()
    # Short pause after every request so the crawl does not hammer the server.
    time.sleep(0.1)
    return body.decode()

# parse: extract data from a downloaded page
def parse(html):
    """Extract the title, the outgoing links and the page's own URL.

    Args:
        html: Markup of a page, as returned by ``crawl``.

    Returns:
        A ``(title, page_urls, url)`` tuple: the stripped text of the first
        ``<h1>``, the set of absolute URLs built from every ``<a>`` whose
        ``href`` starts and ends with ``/``, and the ``content`` of the
        ``og:url`` ``<meta>`` tag.

    Note:
        No fallback is provided for pages lacking an ``<h1>`` or an
        ``og:url`` meta tag; the lookup fails with an exception in that case.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # Only site-relative links of the form "/section/.../" are followed.
    links = soup.find_all('a', {"href": re.compile('^/.+?/$')})
    title = soup.find('h1').get_text().strip()
    page_urls = {urljoin(base_url, link['href']) for link in links}
    url = soup.find('meta', {'property': "og:url"})['content']
    return title, page_urls, url

def main():
    """Crawl the site breadth-first with a process pool and print each page.

    Starting from ``base_url``, every round downloads all not-yet-visited
    URLs in parallel, parses the results in parallel, prints one
    ``count title url`` line per page and queues the newly discovered links.
    The crawl stops when nothing is left to visit or, while
    ``restricted_crawl`` is true, once more than 20 pages have been seen.

    This function must only be called from under an
    ``if __name__ == '__main__':`` guard: creating the pool while the module
    is being imported by a non-fork child process raises ``RuntimeError``.
    """
    unseen = {base_url}         # URLs still waiting to be crawled
    seen = set()                # URLs already crawled
    restricted_crawl = True     # stop early once more than 20 pages were seen

    # Worker count restored as 4: the digit was lost from the published
    # listing (TODO confirm against the original article). The context
    # manager makes sure the worker processes are shut down on exit.
    with mp.Pool(4) as pool:
        count, t1 = 1, time.time()
        while unseen:           # still get some url to visit
            if restricted_crawl and len(seen) > 20:
                break
            print('\nDistributed Crawling...')
            crawl_jobs = [pool.apply_async(crawl, args=(url,)) for url in unseen]
            htmls = [job.get() for job in crawl_jobs]       # request connection

            print('\nDistributed Parsing...')
            parse_jobs = [pool.apply_async(parse, args=(html,)) for html in htmls]
            results = [job.get() for job in parse_jobs]     # parse html

            print('\nAnalysing...')
            seen.update(unseen)     # seen the crawled
            unseen.clear()          # nothing unseen

            for title, page_urls, url in results:
                print(count, title, url)
                count += 1
                unseen.update(page_urls - seen)     # get new url to crawl
        # Reported before the pool is torn down, as in the original listing.
        print('Total time: %.1f s' % (time.time()-t1))      # 16 s !!!

# Run the crawl only when this file is executed as a script. Child processes
# that re-import the module (non-fork start methods) skip this branch, which
# is what prevents the RuntimeError.
if __name__ == '__main__':
    main()

综上可知，就是把你的运行代码整合成一个函数（例如 main()），然后在文件末尾加入

if __name__ == '__main__':
    main()

这两行代码即可解决这个问题。

ey_snail

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫