打开同一个文件,相当危险,出错机率相当大,
多线程不出错,极有可能是GIL,
多进程没有锁,因此容易出错了。
url_text = codecs.open('url.txt','a','utf-8')
建议改为生产者消费都模式!
比如这样
# -*- coding: utf-8 -* -
import time
import os
import codecs
import multiprocessing
import requests
from bs4 import BeautifulSoup
baseurl = ''
baseurl1 = ''
baseurl2 = ''
pageurl = ''
searchword = ''
header = {}
def fake(url, **kwargs):
class Response(object):
pass
o = Response()
o.content = 'foo'.format(url)
return o
requests.get = fake
def Get_urls(start_page, end_page, queue):
print('run task {} ({})'.format(start_page, os.getpid()))
try:
for i in range(start_page, end_page + 1):
pageurl = baseurl1 + str(i) + baseurl2 + searchword
response = requests.get(pageurl, headers=header)
soup = BeautifulSoup(response.content, 'html.parser')
a_list = soup.find_all('a')
for a in a_list:
if a.text != ''and 'wssd_content.jsp?bookid'in a['href']:
text = a.text.strip()
url = baseurl + str(a['href'])
queue.put(text + '\t' + url + '\n')
except Exception as e:
import traceback
traceback.print_exc()
def write_file(queue):
print("start write file")
url_text = codecs.open('url.txt', 'a', 'utf-8')
while True:
line = queue.get()
if line is None:
break
print("write {}".format(line))
url_text.write(line)
url_text.close()
def Multiple_processes_test():
t1 = time.time()
manager = multiprocessing.Manager()
queue = manager.Queue()
print 'parent process {} '.format(os.getpid())
page_ranges_list = [(1, 3), (4, 6), (7, 9)]
consumer = multiprocessing.Process(target=write_file, args=(queue,))
consumer.start()
pool = multiprocessing.Pool(processes=3)
results = []
for page_range in page_ranges_list:
result = pool.apply_async(func=Get_urls,
args=(page_range[0],
page_range[1],
queue
))
results.append(result)
pool.close()
pool.join()
queue.put(None)
consumer.join()
t2 = time.time()
print '时间:', t2 - t1
if __name__ == '__main__':
Multiple_processes_test()
结果
foo /4/wssd_content.jsp?bookid
foo /5/wssd_content.jsp?bookid
foo /6/wssd_content.jsp?bookid
foo /1/wssd_content.jsp?bookid
foo /2/wssd_content.jsp?bookid
foo /3/wssd_content.jsp?bookid
foo /7/wssd_content.jsp?bookid
foo /8/wssd_content.jsp?bookid
foo /9/wssd_content.jsp?bookid