1. Multiprocessing
As the volume of data a crawler collects grows, you have to think not only about how to store it but also about how fast the crawl runs. A serial crawler can only start the next fetch after the previous one finishes, which severely limits speed and throughput.
When a computer runs a program, it creates a process that holds the code and its state. The machine's CPU (or CPUs) executes these processes; at any given moment each CPU runs only one process, switching rapidly between them. Within a single process, execution likewise switches between threads, each thread running a different part of the program.
Python does multiprocess crawling with the multiprocessing library, using a process pool (Pool) to spread the work across workers. A multiprocess crawler is far faster than a serial one.
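To make the speedup concrete, here is a minimal sketch of serial versus pooled fetching. The fetch function here is a hypothetical stand-in for any single-page download; only Pool comes from the post's actual toolkit.

import time
from multiprocessing import Pool

def fetch(url):
    # placeholder for real download work, e.g. requests.get(url)
    time.sleep(1)
    return url

if __name__ == '__main__':
    urls = ['https://example.com/page%d' % i for i in range(8)]
    # serial: one page after another, about 8 seconds here
    start = time.time()
    for u in urls:
        fetch(u)
    print('serial:', time.time() - start)
    # pool of 4 workers: up to 4 pages in flight, about 2 seconds here
    start = time.time()
    with Pool(processes=4) as pool:
        pool.map(fetch, urls)
    print('pool:', time.time() - start)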
2. Code
# run func over list_u with a pool of num worker processes
def now_run(num, func, list_u):
    try:
        start = time.time()
        pool = Pool(processes=num)
        pool.map(func, list_u)
        end = time.time()
        print('Total time:', end - start)
    except Exception:
        pass
num: the number of worker processes, typically 4
func: the function each worker runs, e.g. the page-scraping functions from the earlier posts in this series
list_u: the list of URLs to crawl
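A typical call, using the get_info function and the topic.csv file that appear in the full code further down:

if __name__ == '__main__':
    urls_top = list(pd.read_csv('topic.csv', encoding='gbk')['url'])
    now_run(4, get_info, urls_top)   # 4 processes, each scraping a share of the topic pages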
3. Related posts
3.1 Scraping sub-topic pages
The first scraper post: collects the URLs and names of all sub-topics, at every level, under the "心理学" (psychology) topic.
URL: https://blog.csdn.net/qq_35159009/article/details/90516414
3.2 Dynamically loaded topic pages: simulating scrolling
Zhihu pages load their content dynamically; new data only renders once the browser scrolls down.
The second scraper post: uses Selenium with PhantomJS to simulate scrolling.
URL: https://blog.csdn.net/qq_35159009/article/details/90522384
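The core of that trick fits in a few lines (a sketch using the same old-style Selenium + PhantomJS stack as the full code below; the 100000 scroll offset mirrors the scroll_foot helper there):

from selenium import webdriver
import time

driver = webdriver.PhantomJS()
driver.get('https://www.zhihu.com/topic/19551432/top-answers')
for _ in range(10):
    # jump the page body to the bottom so Zhihu loads the next batch of answers
    driver.execute_script('document.body.scrollTop = 100000')
    time.sleep(2)   # give the lazily loaded content time to arrive
driver.quit()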
3.3 Scraping the content pages
The third post: scrapes each Zhihu content page's title, tags, and body text (simulating a click event to expand the full answer).
URL: https://blog.csdn.net/qq_35159009/article/details/90522910
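The click itself is a single Selenium call (a sketch; the XPaths are the ones used by get_question_info in the full code below, and the URL is a placeholder for any question URL collected earlier):

from selenium import webdriver

driver = webdriver.PhantomJS()
driver.get('https://www.zhihu.com/question/xxxxxxxx')   # placeholder question URL
# click the "read more" button so the full answer text renders before scraping
driver.find_element_by_xpath('//*[@id="root"]/div/main/div/div[1]/div[2]/div[1]/div[1]/div[2]/div/div/div/button').click()
text = driver.find_element_by_xpath('//*[@id="root"]/div/main/div/div[1]/div[2]/div[1]/div[1]/div[2]/div/div/div/span').text
driver.quit()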
4. 运行注意事项
一定保存.py文件,然后在相应文件夹下,shift+右键,选择运行命令行窗口,
输入 python 文件名称.py
这样才不会弹出弹窗,不然烦不胜烦
不要直接用编译器运行
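There is also a correctness reason to be careful about how the script starts: on Windows, multiprocessing launches each worker by re-importing the module, so the pool must only be created inside the __main__ guard (the full code below follows this pattern). A minimal illustration:

from multiprocessing import Pool

def work(x):
    return x * x

if __name__ == '__main__':
    # only the parent process runs this block; on Windows every child
    # re-imports the module, so top-level pool creation would recurse
    with Pool(processes=4) as pool:
        print(pool.map(work, range(8)))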
5. Full scraper code
from selenium import webdriver
import time
import pymongo
import pandas as pd
from pandas import Series, DataFrame
from multiprocessing import Pool
import sys
import io
# work around the Windows console encoding when printing Chinese text
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')
# MongoDB: one database, three collections
client = pymongo.MongoClient('localhost', 27017)
mydb = client['zhihu_100000']
psy = mydb['psy']       # question URLs
cont = mydb['cont']     # question title / content / labels
topic = mydb['topic']   # topic names and their URLs
url_root = ['https://www.zhihu.com/topic/19642170/top-answers',   # 心理现象
            'https://www.zhihu.com/topic/19562796/top-answers',   # 人格
            'https://www.zhihu.com/topic/19556937/top-answers',   # 情绪
            'https://www.zhihu.com/topic/19615158/top-answers',   # 心理阴影
            'https://www.zhihu.com/topic/19568143/top-answers',   # 心理健康
            'https://www.zhihu.com/topic/19554945/top-answers',   # 心理
            'https://www.zhihu.com/topic/19676436/top-answers',   # 心理分析
            'https://www.zhihu.com/topic/19551432/top-answers',   # 心理学
            'https://www.zhihu.com/topic/19566667/top-answers',   # 社会心理
            'https://www.zhihu.com/topic/19764326/top-answers'    # 心理状态
            ]
# seed the topic collection with the root topics
# (this top-level code re-runs in every spawned child process on Windows,
# which is one source of the duplicates removed later by drop_duplicates)
topic.insert_one({'key':'心理现象','url':url_root[0]})
topic.insert_one({'key':'人格','url':url_root[1]})
topic.insert_one({'key':'情绪','url':url_root[2]})
topic.insert_one({'key':'心理阴影','url':url_root[3]})
topic.insert_one({'key':'心理健康','url':url_root[4]})
topic.insert_one({'key':'心理','url':url_root[5]})
topic.insert_one({'key':'心理分析','url':url_root[6]})
topic.insert_one({'key':'心理学','url':url_root[7]})
topic.insert_one({'key':'社会心理','url':url_root[8]})
topic.insert_one({'key':'心理状态','url':url_root[9]})
# scroll the page to the bottom so dynamically loaded content renders
def scroll_foot(driver):
    if driver.name == 'chrome' or driver.name == 'phantomjs':
        js = 'var q=document.body.scrollTop=100000'
    elif driver.name == 'internet explorer':
        js = 'var q=document.documentElement.scrollTop=100000'
    else:
        return None   # unsupported browser
    return driver.execute_script(js)
# recursively collect sub-topic names and URLs, storing new ones in the topic collection
def topic_select(url):
    topics = DataFrame(list(topic.find()))['key']   # names already stored
    driver = webdriver.PhantomJS()
    driver.get(url)
    # each related-topic card link (class="TopicLink TopicTag") under the Card-section
    infos = driver.find_elements_by_xpath('//*[@class="Card-section"]/div[2]/div[2]/a')
    for info in infos:
        url_c = info.get_attribute('href')
        key = info.find_element_by_xpath('div/div/div/span').text
        if key not in str(topics):
            topic.insert_one({'key': key, 'url': url_c})
            try:
                topic_select(url_c)   # recurse into the newly found sub-topic
            except Exception:
                pass
    driver.quit()
# collect question URLs from one topic's top-answers feed
def get_info(url):
    driver = webdriver.PhantomJS()
    driver.get(url)
    # scroll repeatedly so the infinite-scroll feed keeps loading
    for i in range(500):
        scroll_foot(driver)
        time.sleep(2)
    infos = driver.find_elements_by_xpath('//*[@id="TopicMain"]/div[3]/div/div/div/div')
    for info in infos:
        try:
            # each feed card's question link sits at div/h2/div/a
            questions = info.find_elements_by_xpath('div/h2/div/a')
            for question in questions:
                question = question.get_attribute('href')   # href is already an absolute URL
                psy.insert_one({'url': question})
        except IndexError:
            pass
    driver.quit()
# scrape one question page: title, content, and topic labels
def get_question_info(url):
    driver = webdriver.PhantomJS()
    driver.get(url)
    driver.implicitly_wait(1)
    try:
        title = driver.find_element_by_xpath('//*[@id="root"]/div/main/div/div[1]/div[2]/div[1]/div[1]/h1').text
        try:
            # click the "read more" button so the full answer text renders
            driver.find_element_by_xpath('//*[@id="root"]/div/main/div/div[1]/div[2]/div[1]/div[1]/div[2]/div/div/div/button').click()
            content = driver.find_element_by_xpath('//*[@id="root"]/div/main/div/div[1]/div[2]/div[1]/div[1]/div[2]/div/div/div/span').text
        except Exception:
            try:
                # short answers have no "read more" button; grab the rich-text body directly
                content = driver.find_element_by_xpath('//*[@class="RichText ztext"]').text
            except Exception:
                content = '无'
        # the page's keyword meta tag doubles as the label string that gets stored
        f_label = driver.find_element_by_xpath('//*[@id="root"]/div/main/div/meta[3]').get_attribute('content')
        # the per-question topic tags (collected here but not stored)
        labels = driver.find_elements_by_xpath('//*[@class="Tag QuestionTopic"]')
        label_x = []
        for label in labels:
            try:
                label_x.append(label.find_element_by_xpath('span/a/div/div').text)
            except IndexError:
                pass
        cont.insert_one({'topic': title, 'content': content, 'labels': f_label})
    except Exception:
        pass
    driver.quit()
# run func over list_u with a pool of num worker processes
def now_run(num, func, list_u):
    try:
        start = time.time()
        pool = Pool(processes=num)
        pool.map(func, list_u)
        end = time.time()
        print('Total time:', end - start)
    except Exception:
        pass
# stage 1: crawl the topic tree, then deduplicate and normalise the URLs into topic.csv
def get_true_topic():
    now_run(4, topic_select, url_root)
    data = DataFrame(list(topic.find()))
    data.pop('_id')
    print(len(data))    # 917 rows before deduplication
    data2 = data.drop_duplicates()
    print(len(data2))   # 462 rows after
    urls_top = list(data2['url'])
    for i in range(len(urls_top)):
        # bare topic URLs are 36 characters long; append the top-answers path
        if len(urls_top[i]) == 36:
            urls_top[i] = urls_top[i] + '/top-answers'
    data2['url'] = urls_top
    data2.to_csv('topic.csv')
# stage 2: deduplicate the collected question URLs into url2.csv
def get_true_url():
    data = DataFrame(list(psy.find()))
    data.pop('_id')
    print(len(data))    # 11939 rows before deduplication
    data2 = data.drop_duplicates()
    print(len(data2))   # 3261 rows after
    urls_true = list(data2['url'])
    data2['url'] = urls_true
    data2.to_csv('url2.csv')
if __name__ == '__main__':
    '''
    Stages 1 and 2: run these first, then comment them back out.
    #get_true_topic()
    data = pd.read_csv('topic.csv', encoding='gbk')
    urls_top = list(data['url'])
    # collect the question URLs from every topic page
    now_run(4, get_info, urls_top)
    get_true_url()
    '''
    # stage 3: scrape the content of every question page
    data = pd.read_csv('url2.csv', encoding='gbk')
    urls_cont = list(data['url'])
    now_run(4, get_question_info, urls_cont)
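In other words, the script is meant to run in stages: first enable the block inside the triple-quoted string (and the get_true_topic() call) to build topic.csv and url2.csv, then disable it again and run the content stage as shown, so that each pass of four PhantomJS workers does only one kind of job at a time.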