Zhihu Essence Topic Scraper

Tips
  • Reaching Zhihu's essence topic pages from a Baidu search result skips Zhihu's login wall
Crawling the topic IDs
  • Tool: selenium

  • Difficulty: the topic list loads dynamically, so the page must be scrolled down and the "load more" button clicked repeatedly

  • Code:

    import re
    import time

    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.common.by import By

    # URLs of the top-level category pages
    lists = ['https://www.zhihu.com/topics#%E7%94%9F%E6%B4%BB%E6%96%B9%E5%BC%8F',
                     'https://www.zhihu.com/topics#%E7%BB%8F%E6%B5%8E%E5%AD%A6',
                     'https://www.zhihu.com/topics#%E8%BF%90%E5%8A%A8',
                     'https://www.zhihu.com/topics#%E4%BA%92%E8%81%94%E7%BD%91',
                     'https://www.zhihu.com/topics#%E8%89%BA%E6%9C%AF',
                     'https://www.zhihu.com/topics#%E9%98%85%E8%AF%BB', 'https://www.zhihu.com/topics#%E7%BE%8E%E9%A3%9F',
                     'https://www.zhihu.com/topics#%E5%8A%A8%E6%BC%AB', 'https://www.zhihu.com/topics#%E6%B1%BD%E8%BD%A6',
                     'https://www.zhihu.com/topics#%E8%B6%B3%E7%90%83', 'https://www.zhihu.com/topics#%E6%95%99%E8%82%B2',
                     'https://www.zhihu.com/topics#%E6%91%84%E5%BD%B1', 'https://www.zhihu.com/topics#%E5%8E%86%E5%8F%B2',
                     'https://www.zhihu.com/topics#%E6%96%87%E5%8C%96', 'https://www.zhihu.com/topics#%E6%97%85%E8%A1%8C',
                     'https://www.zhihu.com/topics#%E8%81%8C%E4%B8%9A%E5%8F%91%E5%B1%95',
                     'https://www.zhihu.com/topics#%E9%87%91%E8%9E%8D',
                     'https://www.zhihu.com/topics#%E6%B8%B8%E6%88%8F', 'https://www.zhihu.com/topics#%E7%AF%AE%E7%90%83',
                     'https://www.zhihu.com/topics#%E7%94%9F%E7%89%A9%E5%AD%A6',
                     'https://www.zhihu.com/topics#%E7%89%A9%E7%90%86%E5%AD%A6',
                     'https://www.zhihu.com/topics#%E5%8C%96%E5%AD%A6', 'https://www.zhihu.com/topics#%E7%A7%91%E6%8A%80',
                     'https://www.zhihu.com/topics#%E4%BD%93%E8%82%B2', 'https://www.zhihu.com/topics#%E5%95%86%E4%B8%9A',
                     'https://www.zhihu.com/topics#%E5%81%A5%E5%BA%B7', 'https://www.zhihu.com/topics#%E5%88%9B%E4%B8%9A',
                     'https://www.zhihu.com/topics#%E8%AE%BE%E8%AE%A1',
                     'https://www.zhihu.com/topics#%E8%87%AA%E7%84%B6%E7%A7%91%E5%AD%A6',
                     'https://www.zhihu.com/topics#%E6%B3%95%E5%BE%8B', 'https://www.zhihu.com/topics#%E7%94%B5%E5%BD%B1',
                     'https://www.zhihu.com/topics#%E9%9F%B3%E4%B9%90', 'https://www.zhihu.com/topics#%E6%8A%95%E8%B5%84']
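    # Each fragment after "#" is just the URL-encoded category name, e.g.
    # (illustration, not part of the original script):
    #   from urllib.parse import unquote
    #   unquote('%E7%94%9F%E6%B4%BB%E6%96%B9%E5%BC%8F')  # -> '生活方式' (lifestyle)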
    
    # Initialize a headless Chrome browser
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--no-sandbox')
    browser = webdriver.Chrome(options=chrome_options)
    
    # Visit each category page, scroll down, and collect every sub-topic id
    for index, url in enumerate(lists):
        print(index)
    
        # Open the category page
        browser.get(url)
        time.sleep(2)
    
        # Dynamic loading: scroll to the bottom and click "load more"
        # until the button disappears (at most 10 rounds)
        for _ in range(10):
            try:
                print('sleeping 2 seconds')
                time.sleep(2)
                browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
                more_btn = browser.find_element(By.XPATH, '//*[@class="zg-btn-white zu-button-more"]')
                more_btn.click()
            except Exception:  # button gone: everything is loaded
                break
        # Extract the sub-topic ids from the rendered page
        print('extracting and saving')
        topic_ids = re.findall(r'/topic/(\d+)', browser.page_source)
        with open('topic_id.txt', 'a') as tmp:
            # trailing newline keeps one id per line across appended batches
            tmp.write('\n'.join(topic_ids) + '\n')
        print('ok')
    
    browser.quit()  # quit() also shuts down the chromedriver process
    
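  • Cleanup: the same topic can appear under several categories, and each run appends to topic_id.txt, so the file may contain duplicates. A minimal dedup sketch (an addition to the original workflow; Scrapy's default duplicate filter would also drop the repeated request URLs):

    with open('topic_id.txt') as f:
        unique_ids = sorted(set(line.strip() for line in f if line.strip()))
    with open('topic_id.txt', 'w') as f:
        f.write('\n'.join(unique_ids) + '\n')
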
Crawling each topic's essence questions
  • Difficulty: the volume is large, so requests must run concurrently; Scrapy's asynchronous engine handles this (see the settings sketch after the code)

  • Tool: scrapy

  • Code

    import json
    import logging
    import re

    import scrapy

    from ..items import ZhihutopicItem  # adjust to your project's items module


    class ZhihuSpider(scrapy.Spider):
        handle_httpstatus_list = [429, 403]  # let rate-limited/forbidden responses reach parse()
        name = 'Zhihu_essence_Spider'
        get_item = 0    # count of successfully scraped items
        drop_item = 0   # count of responses dropped for missing key fields

        def start_requests(self):
            # Read the topic ids collected in the first stage
            with open(r'E:\hui_code\work_space\topic_id.txt', 'r') as tmp:
                topic_ids = [line.strip() for line in tmp if line.strip()]

            for topic in topic_ids:
                # First four pages of each topic's essence feed; limit and
                # offset must agree, or items 10-14 of every topic are skipped
                for times in range(4):
                    url = ('https://www.zhihu.com/api/v4/topics/' + topic +
                           '/feeds/essence?limit=10&offset=' + str(10 * times))
                    yield scrapy.Request(url=url, callback=self.parse,
                                         meta={'topic_id': topic}, priority=2)
    
        def parse(self, response):
            try:
                text = json.loads(response.text)
                for entry in text['data']:
                    target = entry['target']
                    if 'question' not in target:
                        continue  # skip targets that are not answers (e.g. articles)
                    item = ZhihutopicItem()  # fresh item per entry, never reused
                    item['topic_url'] = 'https://www.zhihu.com/topic/' + response.meta['topic_id']
                    item['title'] = target['question']['title']
                    if re.findall('[a-zA-Z]', str(item['title'])):
                        continue  # drop titles that contain English letters
                    item['comment_num'] = target['comment_count']
                    item['zlike'] = target['voteup_count']
                    item['create_time'] = target['created_time']
                    item['detail_url'] = 'https://www.zhihu.com/question/' + str(target['question']['id'])
                    item['typeid'] = 0
                    item['topic_id'] = response.meta['topic_id']
                    if item['title'] != '':
                        self.get_item += 1
                        yield item
            except Exception:
                # 429/403 bodies and malformed JSON end up here
                logging.warning('dropped response: %s', response.text[:200])
                self.drop_item += 1
    
        def closed(self, reason):
            # Scrapy calls closed() when the spider finishes
            logging.info('items scraped: %s', self.get_item)
            logging.info('items dropped for missing key fields: %s', self.drop_item)
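
  • ZhihutopicItem lives in the project's items.py. A minimal sketch consistent with the fields the spider fills (the per-field comments are assumptions about their meaning):

    import scrapy

    class ZhihutopicItem(scrapy.Item):
        topic_url = scrapy.Field()    # URL of the topic page
        title = scrapy.Field()        # question title
        comment_num = scrapy.Field()  # comment count
        zlike = scrapy.Field()        # upvote count
        create_time = scrapy.Field()  # creation timestamp
        detail_url = scrapy.Field()   # URL of the question page
        typeid = scrapy.Field()       # type flag, always 0 here
        topic_id = scrapy.Field()     # id of the parent topic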
    
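  • Concurrency lives in the project's settings.py, not in the spider. A sketch of the relevant knobs (the values are assumptions to tune, not the original project's settings):

    # settings.py
    CONCURRENT_REQUESTS = 16     # in-flight requests handled by Scrapy's async engine
    DOWNLOAD_DELAY = 0.5         # polite delay between requests
    AUTOTHROTTLE_ENABLED = True  # back off automatically when the site slows down

  • Run with: scrapy crawl Zhihu_essence_Spider (the spider's name attribute).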