前言
本周开始对知乎网站的问题内容以及问题下面的回答进行爬取,知乎网站的爬取与其他新闻网站并不太相同。首先,知乎网站需要登录后才能够爬取到所需要的内容;其次,知乎的回答与头条新闻一样,需要爬取到所有的回答,所以我们采用scrapy+selenium的方式进行爬取。
一、登录
由于知乎的登录需要密码以及验证码,所以我们采用cookie登录的方式避免验证码的干扰。
1.获取cookie
我们采用手动登录的方式来获取登录的cookie:
def login_cookie(cookie_path=r'C:\Users\往痕\Desktop\项目实训_爬虫\zhihu.txt'):
    """Log in to Zhihu by hand and save the session cookies as JSON.

    Opens a browser on the Zhihu home page, waits for the user to log in
    manually (sidestepping the captcha), then dumps ``driver.get_cookies()``
    to *cookie_path* so the crawler middleware can reuse the session.

    Args:
        cookie_path: Destination file for the JSON cookie dump. Defaults to
            the original hard-coded location, so existing callers are
            unaffected; pass your own path instead of editing the source.
    """
    driver = get_driver()
    driver.set_page_load_timeout(20)
    driver.set_script_timeout(20)
    login_url = 'https://www.zhihu.com/'
    try:
        driver.get(login_url)
        time.sleep(3)
        # Block until the user has finished logging in in the browser window.
        input("请登录后按 Enter")
        cookies = driver.get_cookies()
        with open(cookie_path, 'w') as f:
            f.write(json.dumps(cookies))
    finally:
        # Release the browser even if the user aborts at input() or the
        # page load times out (the original leaked the driver on error).
        driver.quit()
2.利用cookie登录
之后我们在middlewares.py里面进行登录:
if spider.middle_control == '登录':
LOGIN_URL = 'https://www.zhihu.com/'
spider.browser.get(LOGIN_URL)
time.sleep(2)
# 下面的文件位置需要自己改,与上面的改动一致
f = open(spider.file_path + '/zhihu.txt')
cookies = f.read()
jsonCookies = json.loads(cookies)
for co in jsonCookies:
spider.browser.add_cookie(co)
spider.browser.refresh()
time.sleep(2)
spider.browser.get(request.url)
sleep(2)
try:
for i in range(10):
spider.browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
sleep(2)
except TimeoutException as e:
print('超时')
spider.browser.execute_script('window.stop()')
return HtmlResponse(url=spider.browser.current_url, body=spider.browser.page_source,
encoding="utf-8", request=request)
二、爬虫编写
1.items.py
代码如下:
import scrapy
class SpiderZhihuItem(scrapy.Item):
    """Container for one scraped Zhihu question and its collected answers."""

    # Question title and its canonical URL.
    title = scrapy.Field()
    url = scrapy.Field()
    # Author name taken from the topic-listing entry.
    source = scrapy.Field()
    # Related topics plus the question's short description.
    detail = scrapy.Field()
    # All answers concatenated, separated by '||'.
    answer = scrapy.Field()
2.middlewares.py
代码如下:
class SpiderZhihuDownloaderMiddleware:
    """Selenium-backed downloader middleware for the Zhihu spider.

    The spider steers behaviour through ``spider.middle_control``:
      * ``'登录'``  -- inject saved cookies, fetch the page, scroll it.
      * ``'一级'`` -- fetch a topic listing page and scroll to load entries.
      * ``'二级'`` -- fetch a question page, expand all answers, then scroll.
    Every branch returns an ``HtmlResponse`` built from the rendered DOM so
    the spider's parse callbacks see the JavaScript-loaded content.
    """

    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy hook: build the middleware and subscribe to spider_opened.
        # NOTE(review): spider_opened is not defined in this snippet --
        # confirm the handler exists on the class.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        if spider.middle_control == '登录':
            LOGIN_URL = 'https://www.zhihu.com/'
            spider.browser.get(LOGIN_URL)
            time.sleep(2)
            # Cookie file written by login_cookie(); keep both locations in
            # sync. `with` fixes the file-handle leak of the original code.
            with open(spider.file_path + '/zhihu.txt') as f:
                cookies = f.read()
            jsonCookies = json.loads(cookies)
            for co in jsonCookies:
                spider.browser.add_cookie(co)
            spider.browser.refresh()
            time.sleep(2)
            spider.browser.get(request.url)
            sleep(2)
            try:
                # Scroll so lazily loaded answers render before we snapshot.
                for i in range(10):
                    spider.browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
                    sleep(2)
            except TimeoutException:
                print('超时')
                spider.browser.execute_script('window.stop()')
            return HtmlResponse(url=spider.browser.current_url, body=spider.browser.page_source,
                                encoding="utf-8", request=request)

        if spider.middle_control == '一级':
            try:
                spider.browser.get(request.url)
                # Topic listing: scroll many times to pull in more entries.
                for i in range(30):
                    spider.browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
                    sleep(2)
            except TimeoutException:
                print('超时')
                spider.browser.execute_script('window.stop()')
            return HtmlResponse(url=spider.browser.current_url, body=spider.browser.page_source,
                                encoding="utf-8", request=request)

        if spider.middle_control == '二级':
            answer_num = 1
            try:
                spider.browser.get(request.url)
            except TimeoutException:
                print('超时')
                spider.browser.execute_script('window.stop()')
            try:
                # "查看全部 N 个回答" button: read N, click to expand, and
                # scroll roughly N/5 times (each scroll loads ~5 answers),
                # capped at 30 to bound the crawl time.
                # NOTE(review): find_element_by_xpath is removed in
                # Selenium 4 -- migrate to find_element(By.XPATH, ...) when
                # upgrading.
                answer_num = spider.browser.find_element_by_xpath(
                    "//a[@class='QuestionMainAction ViewAll-QuestionMainAction']").text
                spider.browser.find_element_by_xpath(
                    "//a[@class='QuestionMainAction ViewAll-QuestionMainAction']").click()
                answer_num = int(int(answer_num.strip('查看全部 个回答')) / 5)
                sleep(1)
                if answer_num >= 30:
                    answer_num = 30
            except Exception:
                # Best effort: the button is absent when the question has
                # few answers; fall back to a single scroll pass.
                pass
            for i in range(answer_num):
                spider.browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
                sleep(2)
            return HtmlResponse(url=spider.browser.current_url, body=spider.browser.page_source,
                                encoding="utf-8", request=request)
3.myspider.py
代码如下:
import os
import pandas as pd
import scrapy
from selenium import webdriver
from ..items import SpiderZhihuItem
class MyspiderSpider(scrapy.Spider):
    """Crawl hot/top questions of one Zhihu topic and their answers.

    Drives a shared Selenium browser (rendered pages are produced by the
    downloader middleware, which reads ``middle_control`` to pick a mode).
    """

    name = 'myspider'
    # Mode flag read by the downloader middleware ('登录'/'一级'/'二级').
    middle_control = '登录'
    file_path = os.path.dirname(__file__)
    # Class-level defaults kept for interface compatibility; fresh
    # per-instance lists are created in __init__ (the original mutated
    # these shared class attributes).
    spider_title = []  # titles read from repeat.txt (already-crawled)
    url_title = []     # titles seen during the current crawl

    def __init__(self):
        super().__init__()  # Scrapy spiders must chain the base __init__.
        self.spider_title = []
        self.url_title = []
        # Disable image loading to speed up page rendering.
        chrome_options = webdriver.ChromeOptions()
        prefs = {"profile.managed_default_content_settings.images": 2}
        chrome_options.add_experimental_option("prefs", prefs)
        self.browser = webdriver.Chrome(options=chrome_options)
        self.browser.set_page_load_timeout(30)
        # Load previously crawled titles for de-duplication; `with` fixes
        # the file-handle leak of the original code.
        store_file = os.path.dirname(__file__) + '/repeat.txt'
        with open(store_file, encoding='utf-8') as f:
            self.spider_title.extend(f.read().split('||'))

    def closed(self, spider):
        print("spider closed")
        # quit() (not close()) terminates the chromedriver process as well
        # as the window; close() alone leaks the driver process.
        self.browser.quit()

    def start_requests(self):
        url = 'https://www.zhihu.com/topic/19864829/hot'
        yield scrapy.Request(url=url, callback=self.parse_1)
        # NOTE(review): this assignment runs when the second request is
        # pulled from the generator, not when the first one completes --
        # the middleware mode may change while the first fetch is queued.
        self.middle_control = '一级'
        self.repeat = ''
        url = 'https://www.zhihu.com/topic/19864829/top-answers'
        yield scrapy.Request(url=url, callback=self.parse_1)

    def parse_1(self, response):
        """Parse a topic listing page and request each question page."""
        title_list = response.xpath("//div[@class='List-item TopicFeedItem']//h2[@class='ContentItem-title']//a//text()").extract()
        url_list = response.xpath("//div[@class='List-item TopicFeedItem']//h2[@class='ContentItem-title']//a/@href").extract()
        source_list = response.xpath(
            "//div[@class='List-item TopicFeedItem']//div[@class='AuthorInfo-head']//span[@class='UserLink AuthorInfo-name']//text()").extract()
        # Switch the middleware into question-page mode for the follow-ups.
        self.middle_control = '二级'
        for title, href, source in zip(title_list, url_list, source_list):
            # hrefs are protocol-relative ('//www.zhihu.com/...') -- TODO
            # confirm; prepend the scheme to get an absolute URL.
            url = 'https:' + href
            item = SpiderZhihuItem(title=title, url=url, source=source)
            self.spider_title.append(title)
            yield scrapy.Request(url=url, meta={'item': item}, callback=self.parse_detail)
        print('二级页面爬取完毕')

    def parse_detail(self, response):
        """Parse one question page: topics, description, and all answers."""
        item = response.meta['item']
        self.spider_title.append(item['title'])
        # detail = related topics (meta keywords) + question description,
        # each fragment terminated by '||'.
        detail_parts = []
        detail_parts += response.xpath("//meta[@itemprop='keywords']/@content").extract()
        detail_parts += response.xpath("//meta[@name='keywords']/@content").extract()
        detail_parts += response.xpath("//div[@class='QuestionRichText QuestionRichText--expandable QuestionRichText--collapsed']//text()").extract()
        item['detail'] = ''.join(part + '||' for part in detail_parts)
        # Answers live under several RichText container variants; each
        # fragment is prefixed with the '||' separator.
        answer_parts = []
        answer_parts += response.xpath("//span[@class='RichText ztext CopyrightRichText-richText']//p//text()").extract()
        answer_parts += response.xpath("//div[@class='RichText ztext CopyrightRichText-richText']//p//text()").extract()
        answer_parts += response.xpath("//div[@class='RichText ztext Post-RichText']//p//text()").extract()
        item['answer'] = ''.join('||' + part for part in answer_parts)
        yield item
总结
这周进行了知乎网站的爬取,下周将进行微博数据的爬取。但是这周的知乎爬取还存在着一些问题:爬取速度不够快,存在着url不同但是问题相同的回答,所以要对内容进行去重,以及爬虫的健壮性需要优化。