Tips
- Reaching Zhihu's essence topics through a Baidu search skips the Zhihu login screen
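A minimal sketch of the trick, assuming only Baidu's standard site: search syntax; the helper name and the keywords are illustrative, not from the original notes:

    # Hypothetical helper: build a Baidu site-search URL whose result links
    # lead to Zhihu pages without hitting the login wall of a direct visit.
    from urllib.parse import quote_plus

    def baidu_site_search(keywords):
        return 'https://www.baidu.com/s?wd=' + quote_plus('site:zhihu.com ' + keywords)

    print(baidu_site_search('话题 精华'))  # open the printed URL in a browser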
Scraping all topic ids
- Tool: selenium
- Difficulty: scrolling down to trigger the page's dynamic loading (a more robust scroll loop is sketched after the code)
- Code:
    import re
    import time

    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options

    # URLs of the top-level topic categories (the fragment is the
    # URL-encoded Chinese category name)
    lists = [
        'https://www.zhihu.com/topics#%E7%94%9F%E6%B4%BB%E6%96%B9%E5%BC%8F',
        'https://www.zhihu.com/topics#%E7%BB%8F%E6%B5%8E%E5%AD%A6',
        'https://www.zhihu.com/topics#%E8%BF%90%E5%8A%A8',
        'https://www.zhihu.com/topics#%E4%BA%92%E8%81%94%E7%BD%91',
        'https://www.zhihu.com/topics#%E8%89%BA%E6%9C%AF',
        'https://www.zhihu.com/topics#%E9%98%85%E8%AF%BB',
        'https://www.zhihu.com/topics#%E7%BE%8E%E9%A3%9F',
        'https://www.zhihu.com/topics#%E5%8A%A8%E6%BC%AB',
        'https://www.zhihu.com/topics#%E6%B1%BD%E8%BD%A6',
        'https://www.zhihu.com/topics#%E8%B6%B3%E7%90%83',
        'https://www.zhihu.com/topics#%E6%95%99%E8%82%B2',
        'https://www.zhihu.com/topics#%E6%91%84%E5%BD%B1',
        'https://www.zhihu.com/topics#%E5%8E%86%E5%8F%B2',
        'https://www.zhihu.com/topics#%E6%96%87%E5%8C%96',
        'https://www.zhihu.com/topics#%E6%97%85%E8%A1%8C',
        'https://www.zhihu.com/topics#%E8%81%8C%E4%B8%9A%E5%8F%91%E5%B1%95',
        'https://www.zhihu.com/topics#%E9%87%91%E8%9E%8D',
        'https://www.zhihu.com/topics#%E6%B8%B8%E6%88%8F',
        'https://www.zhihu.com/topics#%E7%AF%AE%E7%90%83',
        'https://www.zhihu.com/topics#%E7%94%9F%E7%89%A9%E5%AD%A6',
        'https://www.zhihu.com/topics#%E7%89%A9%E7%90%86%E5%AD%A6',
        'https://www.zhihu.com/topics#%E5%8C%96%E5%AD%A6',
        'https://www.zhihu.com/topics#%E7%A7%91%E6%8A%80',
        'https://www.zhihu.com/topics#%E4%BD%93%E8%82%B2',
        'https://www.zhihu.com/topics#%E5%95%86%E4%B8%9A',
        'https://www.zhihu.com/topics#%E5%81%A5%E5%BA%B7',
        'https://www.zhihu.com/topics#%E5%88%9B%E4%B8%9A',
        'https://www.zhihu.com/topics#%E8%AE%BE%E8%AE%A1',
        'https://www.zhihu.com/topics#%E8%87%AA%E7%84%B6%E7%A7%91%E5%AD%A6',
        'https://www.zhihu.com/topics#%E6%B3%95%E5%BE%8B',
        'https://www.zhihu.com/topics#%E7%94%B5%E5%BD%B1',
        'https://www.zhihu.com/topics#%E9%9F%B3%E4%B9%90',
        'https://www.zhihu.com/topics#%E6%8A%95%E8%B5%84',
    ]

    # Headless browser setup (selenium 3 style API, matching the calls below)
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--no-sandbox')
    browser = webdriver.Chrome(chrome_options=chrome_options)

    # Visit each category page, scroll down to trigger the dynamic loading,
    # then collect the id of every sub-topic on the page
    for idx, url in enumerate(lists):
        print(idx)
        browser.get(url)
        time.sleep(2)
        # Dynamic loading: scroll down and click "load more" up to 10 times
        for _ in range(10):
            try:
                print('sleeping 2 seconds')
                time.sleep(2)
                browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
                more = browser.find_element_by_xpath('//*[@class="zg-btn-white zu-button-more"]')
                more.click()
            except Exception:
                break  # no "load more" button left, everything is loaded
        # Extract the sub-topic ids and append them to topic_id.txt
        print('extracting and saving')
        ids = re.findall(r'/topic/(\d+)', browser.page_source)
        with open('topic_id.txt', 'a') as tmp:
            tmp.write('\n'.join(ids) + '\n')  # trailing newline keeps the file one id per line

    print('ok')
    browser.close()
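The fixed 10-iteration loop above can stop before a long category page is fully loaded. A hedged alternative (an assumption, not part of the original script): keep scrolling until the page height stops growing.

    # Sketch: scroll until the document height stabilizes, then stop
    last_height = browser.execute_script('return document.body.scrollHeight')
    while True:
        browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
        time.sleep(2)  # give lazy-loaded content time to render
        new_height = browser.execute_script('return document.body.scrollHeight')
        if new_height == last_height:
            break  # nothing new appeared, we have reached the bottom
        last_height = new_height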
Scraping each topic's essence questions
- Difficulty: the sheer volume of requests; they need to run concurrently (see the settings sketch after the code)
- Tool: scrapy
- Code:
    import json
    import logging
    import re

    import scrapy

    from ..items import ZhihutopicItem  # assumes the standard project layout; sketched below


    class zhihuspider(scrapy.Spider):
        name = 'Zhihu_essence_Spider'
        handle_httpstatus_list = [429, 403]  # let rate-limited/forbidden responses reach parse()
        get_item = 0   # counts items scraped successfully
        drop_item = 0  # counts responses missing key fields

        def start_requests(self):
            # Load the topic ids collected by the selenium script, one per line
            with open(r'E:\hui_code\work_space\topic_id.txt', 'r') as tmp:
                data = tmp.read()
            topic_ids = re.findall(r'\d+', data)
            for topic_id in topic_ids:
                # Fetch the first four pages of the topic's essence feed;
                # the offset steps by the page size (limit=10) so no entries are skipped
                for times in range(4):
                    url = ('https://www.zhihu.com/api/v4/topics/' + topic_id +
                           '/feeds/essence?&limit=10&offset=' + str(10 * times))
                    yield scrapy.Request(url=url, callback=self.parse,
                                         meta={'topic_id': topic_id}, priority=2)

        def parse(self, response):
            try:
                text = json.loads(response.text)
                if "data" not in text and "paging" in text:
                    text = text['paging']
                for entry in text["data"]:
                    target = entry["target"]
                    # Build a fresh item per entry so already-yielded items
                    # are not mutated on the next loop iteration
                    item = ZhihutopicItem()
                    item['topic_url'] = 'https://www.zhihu.com/topic/' + response.meta['topic_id']
                    item['title'] = target["question"]["title"]
                    if re.findall('[a-zA-Z]', str(item['title'])):
                        continue  # drop any title containing English letters
                    item['comment_num'] = target["comment_count"]
                    item['zlike'] = target["voteup_count"]
                    item['create_time'] = target["created_time"]
                    item['detail_url'] = 'https://www.zhihu.com/question/' + str(target["question"]["id"])
                    item['typeid'] = 0
                    item['topic_id'] = response.meta['topic_id']
                    if item['title'] != '':
                        yield item
                        self.get_item += 1
            except Exception:
                print(response.text)  # dump the offending payload (e.g. a 429/403 body)
                self.drop_item += 1

        def close(self, spider, reason):
            logging.info('items scraped: ' + str(self.get_item))
            logging.info('items dropped for missing key fields: ' + str(self.drop_item))
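The spider references ZhihutopicItem, but the post never shows items.py. A minimal sketch reconstructed from the fields the spider fills; the comments are inferred, not from the source:

    import scrapy

    class ZhihutopicItem(scrapy.Item):
        topic_url = scrapy.Field()    # topic page URL
        topic_id = scrapy.Field()     # numeric topic id
        title = scrapy.Field()        # question title
        comment_num = scrapy.Field()  # comment count
        zlike = scrapy.Field()        # upvote count
        create_time = scrapy.Field()  # question creation timestamp
        detail_url = scrapy.Field()   # question page URL
        typeid = scrapy.Field()       # category flag, fixed to 0 here

On the volume problem: Scrapy parallelizes through its asynchronous engine rather than explicit threads, so throughput is tuned in settings.py. An illustrative sketch; the exact numbers are assumptions, not from the original project:

    # settings.py (illustrative values)
    CONCURRENT_REQUESTS = 32             # overall request parallelism
    CONCURRENT_REQUESTS_PER_DOMAIN = 16  # cap for zhihu.com
    DOWNLOAD_DELAY = 0.25                # small delay; the spider already whitelists 429
    AUTOTHROTTLE_ENABLED = True          # back off automatically when responses slow down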