爬取知乎专栏
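# 爬取知乎专栏 — crawl a Zhihu column: list its articles via the column "items" API, then fetch each article page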
class XSSpider(scrapy.Spider):
name = 'xiaoshuo'
allowed_domains = ['zhihu.com']
start_urls = ['https://www.zhihu.com/api/v4/columns/c_1059416559054893056/items?limit=10&offset=0']
def parse(self, response):
obj = json.loads(response.text)
icount=0
for p_item in obj['data']:
icount = icount + 1
p_content = get_html_of_response(p_item['url'])
p_selector = etree.HTML(p_content.text)
sid = re.sub(".+/p/","",p_item['url'])
content_json = json.loads(p_selector.xpath("//script[@id='js-initialData']/text()")[0])
txt = ""
i_title = content_json['initialState']['entities']['articles'][sid]['title']
txt =