Crawling several questions and their answers under a Zhihu topic with Python 3

Approach: use Selenium's webdriver to click, scroll, and otherwise drive the page. Once the page has fully loaded, use BeautifulSoup to find the relevant tags, collect the data into lists, and finally create an Excel workbook with xlwt and write the data into it.

Drawbacks: this approach easily runs into performance problems. A topic can have a huge number of answers, and inserting tens of thousands of rows into an Excel file starts to stall. In addition, the code does not close a page after it has finished processing it; it simply opens the next page and carries on, and only after every page has been processed are they closed one by one, from last to first, which is heavy on memory. (I later found out that webdriver offers two ways to shut down: close and quit. close only closes the current tab and does not clear the cache, while quit shuts down the whole browser. If you are crawling a lot of pages, remember to use quit.)
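For reference, a minimal sketch of the difference between the two (the URL is just a placeholder):

from selenium import webdriver

driver = webdriver.Chrome()
driver.get('https://www.zhihu.com')  # placeholder page

# close() only closes the current tab/window; the browser process and its cache stay alive
# driver.close()

# quit() shuts down the whole browser and ends the session, releasing its memory
driver.quit()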

Unsolved problem: I never managed to get the data saved into MySQL; it kept throwing all sorts of errors. The database-related code is at the end. If anyone can see what is wrong, please point it out!

The error message looks roughly like this:

pymysql.err.ProgrammingError: (1064, "You have an error in your SQL syntax; check the manual that corresponds to your MySQL server version for the right syntax to use near "inset into answerInfo(answerer,answer,upvoteCount,commentCount) values ('%s','%s','%s','%s')"""%(pymysql.escape_string(answerer),answer,voteCount,commentCount) at(xxx line 1")
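From the message, two things stand out: the SQL keyword itself appears to be misspelled ("inset" instead of "insert"), and the values are formatted into the string with % rather than passed to execute(). A rough sketch of a parameterized insert with pymysql (connection settings and sample values are placeholders; the table and column names are taken from the error message):

import pymysql

# sample values; in the real code these come from the scraping loop
answerer, answer, upvoteCount, commentCount = 'someone', 'answer text', 10, 2

conn = pymysql.connect(host='localhost', user='root', password='***', db='zhihu', charset='utf8mb4')
try:
    with conn.cursor() as cursor:
        sql = ("INSERT INTO answerInfo (answerer, answer, upvoteCount, commentCount) "
               "VALUES (%s, %s, %s, %s)")
        # the driver escapes the values itself, so no manual escaping or quoting is needed
        cursor.execute(sql, (answerer, answer, str(upvoteCount), str(commentCount)))
    conn.commit()
finally:
    conn.close()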
       

BeautifulSoup 4 documentation: https://www.crummy.com/software/BeautifulSoup/bs4/doc/index.zh.html
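The code below uses both of the documented ways of matching a tag by class (an attrs dict and the class_ keyword); a toy example:

from bs4 import BeautifulSoup

soup = BeautifulSoup('<div class="List-item"><span>hello</span></div>', 'lxml')
print(soup.find('div', {'class': 'List-item'}).get_text())  # attrs dict form
print(soup.find('div', class_='List-item').get_text())      # class_ keyword form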

Code

from bs4 import BeautifulSoup
from selenium import webdriver
import time
import json
import xlwt


class ZhiHu():
    pubDate=0
    title=''

    def __init__(self,topicURL):  # class initialization
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'}  # request header that imitates a Chrome browser
        self.topicURL = topicURL  # URL of the topic page to crawl
    def getHtml(self,url):
        driver = webdriver.Chrome()
        driver.get(url)
        # click the "view all answers" button
        driver.find_element_by_class_name('QuestionMainAction').click()
        time.sleep(3)
        # keep scrolling until the marker button at the bottom of the answer list appears;
        # the page source has to be re-parsed on every pass, otherwise the loop keeps
        # checking a stale snapshot and scrolls forever
        while True:
            bs = BeautifulSoup(driver.page_source, 'lxml')
            b = bs.find('button',{'class':'Button QuestionAnswers-answerButton Button--blue Button--spread'})
            if b is not None:
                break
            driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
            time.sleep(1)  # give the newly loaded answers a moment to render
        global pubDate,title
        title = bs.find('h1', class_='QuestionHeader-title').string
        pubDate= json.loads(bs.find('script', {'type': 'application/ld+json'}).get_text())["pubDate"][0:10]
        html = bs.find_all('div',{'class':'List-item'})
        print(title+"\t:\t此问题总共有%d条回答"%len(html))
        return html

    def downLoadToTxt(self,html,path):

        for tag in html:
            content = []
            content.append(title)
            content.append(pubDate)
            # answer text
            answer = tag.find('div', class_='RichContent-inner').find('span').get_text()
            # answerer's name
            answerer = json.loads(tag.find('div',class_='ContentItem AnswerItem')['data-zop'])["authorName"]
            content.append(answerer)
            # answer date (named answerTime so it does not shadow the imported time module)
            answerTime = tag.find('div',class_='ContentItem-time').find('span').get_text()[-10:]
            content.append(answerTime)
            # upvote count
            upvoteCount = json.loads(tag.find('div',class_='ContentItem AnswerItem')['data-za-extra-module'])["card"]["content"]["upvote_num"]
            content.append(str(upvoteCount))
            # comment count
            commentCount = json.loads(tag.find('div',class_='ContentItem AnswerItem')['data-za-extra-module'])["card"]["content"]["comment_num"]
            content.append(str(commentCount))
            content.append(answer)
            # 'a' appends to the file and creates it if it does not exist;
            # the with-block closes the file, so no explicit close() is needed
            with open(path, 'a', encoding='utf-8') as f:
                for field in content:
                    f.write(field + '\t')
                f.write('\n')
            print(answerer+'\n'+str(upvoteCount)+'\n'+str(commentCount)+'\n\n\n')
    def downLoadToExcel(self,html):
        result = []
        head = ['question','publish date','answerer','answer date','upvotes','comments','answer text']
        result.append(head)
        for tag in html:
            content = []
            content.append(title)
            content.append(pubDate)
            # answer text
            answer = tag.find('div', class_='RichContent-inner').find('span').get_text()
            # answerer's name
            answerer = json.loads(tag.find('div',class_='ContentItem AnswerItem')['data-zop'])["authorName"]
            content.append(answerer)
            # answer date (named answerTime so it does not shadow the imported time module)
            answerTime = tag.find('div', class_='ContentItem-time').find('span').get_text()[-10:]
            content.append(answerTime)
            # upvote count
            upvoteCount = json.loads(tag.find('div',class_='ContentItem AnswerItem')['data-za-extra-module'])["card"]["content"]["upvote_num"]
            content.append(str(upvoteCount))
            # comment count
            commentCount = json.loads(tag.find('div',class_='ContentItem AnswerItem')['data-za-extra-module'])["card"]["content"]["comment_num"]
            content.append(str(commentCount))
            content.append(answer)
            result.append(content)
        workbook = xlwt.Workbook(encoding='utf-8')
        booksheet = workbook.add_sheet('Sheet 1', cell_overwrite_ok=True)
        for i, row in enumerate(result):
            for j, col in enumerate(row):
                booksheet.write(i, j, col)
        workbook.save(title+'.xls')

    def getAnswerItemURLs(self):
        driver = webdriver.Chrome()
        driver.get(self.topicURL)
        time.sleep(2)
        # number of times to scroll down
        i=5
        # scroll down repeatedly
        while i>0:
            driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
            time.sleep(3)
            i=i-1
        bs = BeautifulSoup(driver.page_source, 'lxml')

        # all answer items on the topic page
        AnswerItems=bs.find_all('div',class_='ContentItem AnswerItem')

        AnswerItemURLs=[]
        preURL="https://www.zhihu.com"
        for item in AnswerItems:
            tailURL=item.find('a')['href']
            URL=preURL+tailURL
            AnswerItemURLs.append(URL)
            print(URL)
        print("总共有%d条问题!"%len(AnswerItemURLs))
        return AnswerItemURLs

    def getArticleItemURLs(self):
        driver = webdriver.Chrome()
        driver.get(self.topicURL)
        time.sleep(2)
        i=5
        # scroll down repeatedly
        while i>0:
            driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
            time.sleep(3)
            i=i-1
        bs = BeautifulSoup(driver.page_source, 'lxml')

        # all article (column post) items on the topic page
        ArticleItems = bs.find_all('div', class_='ContentItem ArticleItem')

        ArticleItemURLs=[]
        preURL="https:"
        for item in ArticleItems:
            tailURL=item.find('a')['href']
            URL=preURL+tailURL
            ArticleItemURLs.append(URL)
            print(URL)
        print("总共有%d条问题!"%len(ArticleItemURLs))
        return ArticleItemURLs

zhihu = ZhiHu("topic page URL")  # pass the URL of the Zhihu topic to crawl

AnswerItemURLs = zhihu.getAnswerItemURLs()
for url in AnswerItemURLs:
    html = zhihu.getHtml(url)
    zhihu.downLoadToExcel(html)
print("ok")

 
