以前电脑死活跑不了多进程,重装了一下系统,居然啥都解决了,于是乎就跑了一下:
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""
@File : jianshu.py.py
@Time : 2019/9/1 20:42
@Author : Sound_of_Silence
"""
import requests
import time
import re
from lxml.html import etree
from multiprocessing import Pool
def get_url_text(url1):
    """Fetch *url1* and return its body decoded as UTF-8 text.

    Returns '' on any network/HTTP-level failure so callers can treat
    the page as simply "empty" instead of handling exceptions.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
        "upgrade-insecure-requests": '1'}
    try:
        # timeout prevents a dead connection from hanging the worker forever
        res = requests.get(url1, headers=headers, timeout=10)
        res.encoding = 'utf-8'
        return res.text
    except requests.RequestException:
        # Narrow catch: BaseException would also swallow KeyboardInterrupt
        # and SystemExit, making the script impossible to stop cleanly.
        return ''
def clean_str(string):
    """Strip ALL whitespace (spaces, tabs, newlines) from *string*.

    The original pattern r'\s|\n' was redundant: \s already matches \n.
    """
    return re.sub(r'\s', '', string)
def _parse_number(text):
    """Safely parse a scraped numeric string like '28' or '136.7'.

    Replaces eval(): never evaluate text pulled from a web page —
    a crafted page could execute arbitrary code.
    """
    cleaned = clean_str(text)
    try:
        return int(cleaned)
    except ValueError:
        return float(cleaned)


def get_info(url2):
    """Scrape one jianshu list page and print a dict per article.

    Extracts title, author, score, comment count, thumb-ups, earnings,
    absolute article URL and abstract from the page at *url2*.
    """
    html = get_url_text(url2)
    print(len(html), end=' ')
    print(url2)
    response = etree.HTML(html)
    if response is None:
        # etree.HTML('') returns None; without this guard the .xpath
        # calls below would raise an uncaught AttributeError.
        return
    try:
        titles = response.xpath('//div[@id="list-container"]//li//a[@class="title"]/text()')
        authors = response.xpath('//div[@id="list-container"]//li//a[@class="nickname"]/text()')
        # [1::2] skips the icon text node that alternates with the value
        scores = response.xpath(
            '//div[@id="list-container"]//li//span[@class="jsd-meta"]/text()')[1::2]
        comments = response.xpath(
            '//div[@id="list-container"]//li//div[@class="meta"]/a[2]/text()')[1::2]
        thumb_ups = response.xpath(
            '//div[@id="list-container"]//li//div[@class="meta"]/span[2]/text()')
        earnings = response.xpath(
            '//div[@id="list-container"]//li//div[@class="meta"]/span[3]/text()')
        urls = response.xpath('//div[@id="list-container"]//li//a[@class="title"]/@href')
        abstracts = response.xpath(
            '//div[@id="list-container"]//li//p[@class="abstract"]/text()')
        # zip truncates to the shortest list, so a missing field on one
        # article silently drops the trailing items rather than crashing.
        for title, author, score, comment, thumb_up, earning, url, abstract in zip(
                titles, authors, scores, comments, thumb_ups, earnings, urls, abstracts):
            item = dict()
            item['title'] = clean_str(title)
            item['author'] = clean_str(author)
            item['score'] = _parse_number(score)
            item['comment'] = _parse_number(comment)
            item['thumb_up'] = _parse_number(thumb_up)
            item['earning'] = _parse_number(earning)
            item['url'] = 'https://www.jianshu.com' + clean_str(url)
            item['abstract'] = clean_str(abstract)
            print(item)
    except (IndexError, ValueError) as e:
        # IndexError: xpath list shorter than expected slice;
        # ValueError: a scraped field was not numeric after all.
        print(e)
if __name__ == '__main__':
    urls = [
        'https://www.jianshu.com/c/bDHhpK?order_by=top&page={}'.format(i) for i in range(1, 11)]

    # --- pass 1: multiprocessing pool ---
    start = time.perf_counter()
    pool = Pool(8)
    for url in urls:
        pool.apply_async(get_info, args=(url,))
        time.sleep(2)  # throttle submissions to be polite to the server
    pool.close()
    pool.join()
    mid = time.perf_counter()

    # --- pass 2: sequential, for comparison ---
    for url in urls:
        get_info(url)
        time.sleep(2)
    end = time.perf_counter()

    # The original timed both passes with ONE timer, so the single number
    # it printed could not support a parallel-vs-serial comparison.
    print(f'multiprocessing: {mid - start} s')
    print(f'sequential: {end - mid} s')
因为加了sleep,单进程时40秒左右,多进程时20秒左右,效果明显!