首先获取问题url,如图。
发现url是a标签的href属性,所以使用css选择器即可得到。但是可以看到,这里取到的url并不是完整的url,缺少主域名,这时就要用到urllib.parse中的urljoin,将主域名与上述url拼接起来。做个测试,发现可行,如下图。
但是为了防止爬到javascript链接(例如href以"javascript:"开头的伪链接),还需要使用filter对url做过滤,只保留以https开头的url。代码如下:
在这么多url中,我只对问题url感兴趣,所以要做正则表达式的匹配提取。
还有一种情况:url在问题id之后还带有其他路径(例如 /question/xxx/answer/yyy),这种url中也要提取出question url。
首先在items.py中创建好问题item和答案item:
class ZhihuQuestionItem(scrapy.Item):
    """Scrapy item holding the fields scraped for one Zhihu question."""

    # Identification of the question and where it was fetched from.
    zhihu_id = scrapy.Field()
    url = scrapy.Field()

    # Textual payload of the question page.
    title = scrapy.Field()
    content = scrapy.Field()
    topics = scrapy.Field()

    # Counters shown on the question page.
    answer_num = scrapy.Field()
    comments_num = scrapy.Field()
    watch_user_num = scrapy.Field()
    click_num = scrapy.Field()

    # Crawl bookkeeping timestamps.
    crawl_time = scrapy.Field()
    crawl_update_time = scrapy.Field()
class ZhihuAnswerItem(scrapy.Item):
    """Scrapy item holding the fields scraped for one Zhihu answer.

    The base class must be ``scrapy.Item`` (not ``scrapy.Field``): only
    ``Item`` subclasses have their ``Field`` attributes registered as item
    fields, which is what item loaders and pipelines rely on.
    """

    # Identification of the answer and the question/author it belongs to.
    zhihu_id = scrapy.Field()
    url = scrapy.Field()
    question_id = scrapy.Field()
    author_id = scrapy.Field()

    # Body of the answer.
    content = scrapy.Field()

    # Counters attached to the answer.
    praise_num = scrapy.Field()
    comments_num = scrapy.Field()
    # NOTE(review): spelled "watcher_user_num" here but "watch_user_num" on
    # ZhihuQuestionItem; kept as-is because other code may use this name.
    watcher_user_num = scrapy.Field()
    click_num = scrapy.Field()

    # Crawl bookkeeping timestamps.
    crawl_time = scrapy.Field()
    crawl_update_time = scrapy.Field()
下面看看如何通过css选择器获取具体的字段,这里只以title的获取为例,其他都是类似的
# -*- coding: utf-8 -*-
import re
import json
import datetime
try:
import urlparse as parse
except:
from urllib import parse
import scrapy
from scrapy.loader import ItemLoader
from items import ZhihuQuestionItem, ZhihuAnswerItem
class ZhihuSpider(scrapy.Spider):
    """Spider that logs in to Zhihu through a browser and crawls questions.

    ``start_requests`` performs the login with Selenium and seeds the crawl
    with the resulting cookies; ``parse`` follows every https link it finds
    and hands question pages to ``parse_question``.
    """

    name = "zhihu"
    allowed_domains = ['www.zhihu.com']
    start_urls = ['https://www.zhihu.com/']

    # Request URL template for the first page of a question's answers:
    # {0} = question id, {1} = page size (limit), {2} = offset.
    start_answer_url = "https://www.zhihu.com/api/v4/questions/{0}/answers?sort_by=default&include=data%5B%2A%5D.is_normal%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccollapsed_counts%2Creviewing_comments_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Crelationship.is_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cupvoted_followees%3Bdata%5B%2A%5D.author.is_blocking%2Cis_blocked%2Cis_followed%2Cvoteup_count%2Cmessage_thread_token%2Cbadge%5B%3F%28type%3Dbest_answerer%29%5D.topics&limit={1}&offset={2}"

    headers = {
        "HOST": "www.zhihu.com",
        # Fixed typo: the original sent "https://www.zhizhu.com" as Referer.
        "Referer": "https://www.zhihu.com",
        'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0"
    }

    custom_settings = {
        "COOKIES_ENABLED": True
    }

    # Group 1 is the canonical question URL (anything after the id is
    # dropped), group 2 is the numeric question id. Compiled once as a raw
    # string so "\d" is not treated as an invalid string escape.
    question_url_pattern = re.compile(r"(.*zhihu.com/question/(\d+))(/|$).*")

    def parse(self, response):
        """Extract every link on the page and keep crawling.

        Links of the form ``/question/<id>`` (optionally followed by more
        path segments) are requested with ``parse_question`` as callback;
        every other https link is requested with this method as callback.
        """
        hrefs = response.css("a::attr(href)").extract()
        for href in hrefs:
            # hrefs may be relative; resolve them against the current page.
            url = parse.urljoin(response.url, href)
            # Skip anything that is not https (e.g. "javascript:" links).
            if not url.startswith("https"):
                continue

            match_obj = self.question_url_pattern.match(url)
            if match_obj:
                # Question page: download it and let parse_question extract it.
                request_url = match_obj.group(1)
                yield scrapy.Request(request_url, headers=self.headers, callback=self.parse_question)
            else:
                # Not a question page: keep following links from it.
                yield scrapy.Request(url, headers=self.headers, callback=self.parse)

    def parse_question(self, response):
        """Build a ``ZhihuQuestionItem`` from a question page.

        Yields a request for the first page of the question's answers
        (handled by ``parse_answer``) followed by the question item. Yields
        nothing when ``response.url`` is not a question URL; the original
        code raised ``UnboundLocalError`` in that case.
        """
        match_obj = self.question_url_pattern.match(response.url)
        if not match_obj:
            return
        question_id = int(match_obj.group(2))

        item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
        item_loader.add_value("url", response.url)
        item_loader.add_value("zhihu_id", question_id)

        if "QuestionHeader-title" in response.text:
            # New page layout.
            item_loader.add_css("title", "h1.QuestionHeader-title::text")
            item_loader.add_css("content", ".QuestionHeader-detail")
            item_loader.add_css("answer_num", ".List-headerText span::text")
            item_loader.add_css("comments_num", ".QuestionHeaderActions button::text")
            item_loader.add_css("watch_user_num", ".NumberBoard-itemValue::text")
            item_loader.add_css("topics", ".QuestionHeader-topics .Popover div::text")
        else:
            # Old page layout.
            item_loader.add_xpath("title",
                                  "//*[@id='zh-question-title']/h2/a/text()|//*[@id='zh-question-title']/h2/span/text()")
            item_loader.add_css("content", "#zh-question-detail")
            item_loader.add_css("answer_num", "#zh-question-answer-num::text")
            item_loader.add_css("comments_num", "#zh-question-meta-wrap a[name='addcomment']::text")
            item_loader.add_xpath("watch_user_num",
                                  "//*[@id='zh-question-side-header-wrap']/text()|//*[@class='zh-question-followers-sidebar']/div/a/strong/text()")
            item_loader.add_css("topics", ".zm-tag-editor-labels a::text")

        question_item = item_loader.load_item()

        yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0), headers=self.headers,
                             callback=self.parse_answer)
        yield question_item

    def parse_answer(self, reponse):
        """Callback for the answers API request; not implemented yet."""
        # NOTE(review): the parameter name "reponse" is a typo for "response";
        # left unchanged to keep the signature as it was.
        pass

    def start_requests(self):
        """Log in through a Selenium-driven Chrome and seed the crawl.

        Each cookie of the logged-in browser session is pickled to its own
        file, and all cookies are attached to the single initial request
        for ``start_urls[0]``.

        Returns:
            A one-element list with the initial ``scrapy.Request``.
        """
        import pickle
        import time

        from selenium import webdriver

        # NOTE(review): executable_path and find_element_by_css_selector look
        # like Selenium 3-era APIs -- confirm against the installed version.
        browser = webdriver.Chrome(executable_path="C:/Users/Fitz/Desktop/software/chromedriver.exe")
        try:
            browser.get("https://www.zhihu.com/signin")
            # SECURITY: account and password are hard-coded in source; they
            # should be moved to settings or environment variables.
            browser.find_element_by_css_selector(".SignFlow-accountInput.Input-wrapper Input").send_keys(
                "17756021040")
            browser.find_element_by_css_selector(".SignFlow-password Input").send_keys(
                "yinjun123456789")
            browser.find_element_by_css_selector(".Button.SignFlow-submitButton").click()

            # Wait a fixed 10 seconds before reading the cookies.
            time.sleep(10)
            cookies = browser.get_cookies()
            print(cookies)

            cookie_dict = {}
            for cookie in cookies:
                # Write each cookie to its own file.
                cookie_path = 'C:/Users/Fitz/Desktop/scrapy/ArticleSpider/cookies/zhihu/' + cookie['name'] + '.zhihu'
                with open(cookie_path, 'wb') as f:
                    pickle.dump(cookie, f)
                cookie_dict[cookie['name']] = cookie['value']
        finally:
            # Close the browser window even when the login flow raises.
            browser.close()

        return [scrapy.Request(url=self.start_urls[0], dont_filter=True, cookies=cookie_dict, headers=self.headers)]
打断点进行调试