Python 数据采集:百度知道

博客搬家http://tyan.io

dom解析,用的是BeautifulSoup

还有小小的bug,因为百度知道的标签太混乱了

URL 比较规整:都是 http://zhidao.baidu.com/question/ 加上问题的 id,而且 id 是连续的



def parseHtml(resultHtml, questionid):
    """Extract the question and up to four answers from a Baidu Zhidao page.

    Every lookup is tolerant of missing markup: a field whose element (or
    any of its expected ancestors) is absent is reported as ``None`` instead
    of raising ``AttributeError``.

    Args:
        resultHtml: HTML source of one question page.
        questionid: Identifier of the question; passed through unchanged.

    Returns:
        A tuple ``(questionid, questiontitle, fullquestion, asktime,
        bestanster, anster1, anster2, anster3)``. Each parsed entry is the
        text of the matching element, or ``None`` when it was not found.
    """
    # NOTE(review): no parser is named, so bs4 picks whichever is installed;
    # kept as in the original so parse results do not change.
    soup = BeautifulSoup(resultHtml)

    # The class strings below (including their odd whitespace) are kept
    # exactly as the original author wrote them.
    questiontitle = _find_text(soup, "span", "ask-title  ")
    fullquestion = _find_text(soup, "pre", "line mt-10 q-content")
    asktime = _find_text(soup, "span", "grid-r ask-time")

    # Best answer: prefer the "best" widget; only when that widget is absent
    # fall back to the "recommend" widget.
    best_widget = soup.find("div", class_="wgt-best ")
    if best_widget is not None:
        bestanster = _find_text(best_widget, "pre", "best-text mb-10")
        if bestanster is None:
            # The expanded variant is looked up inside the answer body.
            bestanster = _find_text(
                best_widget.find("div", class_="bd answer"),
                "pre",
                "best-text expand-exp mb-10",
            )
    else:
        recommend_widget = soup.find("div", class_="wgt-recommend ")
        recommend_body = None
        if recommend_widget is not None:
            recommend_body = recommend_widget.find("div", class_="bd answer")
        bestanster = _first_text(
            recommend_body,
            "pre",
            ("recommend-text mb-10", "recommend-text expand-exp mb-10"),
        )

    anster1 = anster2 = anster3 = None
    answers_widget = soup.find("div", class_="wgt-answers")
    if answers_widget is not None:
        # First answer: plain variant first, then the folded variant.
        first_div = _first_div(
            answers_widget,
            ("bd answer answer-first    ", "bd answer answer-first   answer-fold "),
        )
        anster1 = _answer_text(first_div)

        # Middle answers: folded variant first, then the plain variant.
        middle_divs = answers_widget.find_all(
            "div", class_="bd answer    answer-fold "
        )
        if not middle_divs:
            middle_divs = answers_widget.find_all("div", class_="bd answer     ")

        if middle_divs:
            anster2 = _answer_text(middle_divs[0])
            if len(middle_divs) > 1:
                anster3 = _answer_text(middle_divs[1])
            else:
                # Only one middle answer: the third one is the "last" answer.
                last_div = _first_div(
                    answers_widget,
                    (
                        "bd answer  answer-last  answer-fold ",
                        "bd answer  answer-last   ",
                    ),
                )
                anster3 = _answer_text(last_div)
        else:
            # NOTE(review): as in the original, only the folded "last" variant
            # is consulted here; the unfolded one may have been overlooked.
            anster2 = _answer_text(
                answers_widget.find(
                    "div", class_="bd answer  answer-last  answer-fold "
                )
            )

    # The parsed values, in the order the original snippet listed them.
    return (
        questionid,
        questiontitle,
        fullquestion,
        asktime,
        bestanster,
        anster1,
        anster2,
        anster3,
    )


def _find_text(node, tag, css_class):
    """Return the text of the first matching ``tag`` under ``node``, or None."""
    if node is None:
        return None
    found = node.find(tag, class_=css_class)
    return None if found is None else found.text


def _first_text(node, tag, css_classes):
    """Return the text for the first class in ``css_classes`` that matches."""
    for css_class in css_classes:
        text = _find_text(node, tag, css_class)
        if text is not None:
            return text
    return None


def _first_div(node, css_classes):
    """Return the first ``div`` under ``node`` matching any class, in order."""
    for css_class in css_classes:
        found = node.find("div", class_=css_class)
        if found is not None:
            return found
    return None


def _answer_text(answer_div):
    """Return the answer text inside ``answer_div``, or None if unavailable."""
    return _find_text(answer_div, "pre", "answer-text mb-10")





  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值