'''
大概第三次修改吧,几年没用又开始爬不了。现在检查下什么原因
过了一段时间 发现TypeError: expected string or bytes-like object
是因为提取的内容为数组,无法进行str验证find_all('a',{'class':'ti'})解决定向
创建人: caz
更新日期:20180821
已经修复可以正常爬取
略有不足的是现实有限
'''
import urllib.request
import http.client
import http.cookiejar
import bs4
import re
import time
from lxml import etree
from urllib.parse import quote
import sys
import getpass
# Regex patterns for extracting question links and page fragments.
# Fix: raw strings (the original '\?' in a plain string is a deprecated
# invalid escape) and escaped literal dots so '.' does not match any char.
pattern = re.compile(r'http://zhidao\.baidu\.com/question/(.+?\.html)\?')
pattern2 = re.compile(r'<span class="ask-title ">(.+?)</span>')
pattern3 = re.compile(r'ask-title ')
pattern4 = re.compile(r'con')
# Search keyword. Baidu Zhidao's search endpoint expects the query text
# GBK-encoded and percent-quoted (matching the ie=gbk parameter below).
wenti='医保'
s_utf=quote(wenti.encode("gbk") )
# Append-mode output file named after the keyword, written in GBK to match
# the site's encoding.
file_tieba=open('%s.txt'%wenti,'a+',encoding='gbk')
# Search-results base URL; the page offset value for "pn" is appended later
# (see the range(0, 600, 10) loop below).
main='https://zhidao.baidu.com/search?word='+s_utf+'&ie=gbk&site=-1&sites=0&date=0&pn='
print(main)
def IsRightId(id):
    """Return the list of question-id matches found in *id*, or None if none."""
    matches = pattern.findall(id)
    return matches if matches else None
def fenxi(main, ye):
    """Fetch a page and return its body decoded from GBK.

    main -- base URL (the search base, or a complete question URL).
    ye   -- page offset appended to *main*; pass None to fetch *main* as-is.

    Returns the decoded HTML text.

    Bug fixes vs. the original:
    - ye=None used to append the literal string "None" to question URLs;
    - the Request carrying the User-Agent header was built but never passed
      to urlopen, so the header was never sent.
    """
    mainurl = main if ye is None else main + str(ye)
    # Install a cookie-aware opener so session cookies persist across requests.
    cj = http.cookiejar.CookieJar()
    handler = urllib.request.HTTPCookieProcessor(cj)
    opener = urllib.request.build_opener(handler)
    urllib.request.install_opener(opener)
    request = urllib.request.Request(mainurl)
    request.add_header('User-Agent','Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36')
    # Use the Request object so the User-Agent header is actually sent.
    yemian = urllib.request.urlopen(request).read().decode('gbk')
    return yemian
# Merge a collection of strings into one string.
def dai(daiyu):
    """Concatenate an iterable of strings into a single string.

    Uses str.join instead of the original repeated '+=' loop, which is
    quadratic in the worst case; behavior is identical for string elements.
    """
    return ''.join(daiyu)
# Fetch the session cookie for the main page, then walk the paginated
# search results: each results page yields question links; each question
# page is fetched and its title / description / answers written to the file.
# Parse each results-page URL into a list of question-id fragments.
for ye in range(0,600,10):
    yemian=fenxi(main,ye)
    soup=bs4.BeautifulSoup(yemian,'html.parser')
    jie=1
    shuzu=[]
    # Question links carry class "ti"; keep only hrefs matching the
    # question-URL pattern (see IsRightId).
    for div in soup.find_all('a',{'class':'ti'}):
        div_id=div.get('href')
        if IsRightId(div_id):
            shuzu+=IsRightId(div_id)
    print(shuzu)
    for shuzi in shuzu:
        # shuzi is the "<id>.html" fragment extracted from the results page.
        wangzhi="https://zhidao.baidu.com/question/"+shuzi
        print(wangzhi)
        file_tieba.write('\r\n'+wangzhi+'\r\n')
        file2=fenxi(wangzhi,None)
        selector = etree.HTML(file2)
        # Question title; the trailing space in "ask-title " is deliberate —
        # it matches the class attribute as it appears on the page.
        title= selector.xpath('//span[@class="ask-title "]/text()')
        print(title)
        # Question description fragments.
        miaosuji= selector.xpath('//span[@class="con"]/text()')
        print(miaosuji)
        #otherji= selector.xpath('//div[@class="answer-text mb-10 line"]/span[@class="con"]/text()')
        # Best answer, reply timestamp, other answers, and the <pre> Q/A blocks.
        goodji= selector.xpath('//div[@class="best-text mb-10"]/text()')
        shijian=selector.xpath('//span[@class="wgt-replyer-all-time"]/text()')
        otherji= selector.xpath('//div[@class="answer-text mb-10 line"]/text()')
        zuiwen=selector.xpath('//pre[@class="qRA"]/text()')
        zuida=selector.xpath('//pre[@class="aRA"]/text()')
        # Join the xpath text fragments into single strings.
        miaosu=dai(miaosuji)
        good=dai(goodji)
        other=dai(otherji)
        # Write the title plus whichever of description / best answer exist.
        # NOTE(review): title[0] and shijian[0] raise IndexError when the
        # xpath finds nothing — confirm against current page layout.
        if miaosuji==[] and goodji==[]:
            file_tieba.write('\r\n\r\n'+'第'+str(jie)+'章 问:'+title[0])
        elif miaosuji==[] and goodji !=[]:
            file_tieba.write('\r\n\r\n'+'第'+str(jie)+'章 问:'+title[0]+'\r\n最佳答案:'+good+shijian[0])
        elif miaosuji !=[] and goodji ==[]:
            file_tieba.write('\r\n\r\n'+'第'+str(jie)+'章 问:'+title[0]+'\r\n问题描述:'+miaosu+'\r\n')
        else:
            file_tieba.write('\r\n\r\n'+'第'+str(jie)+'章 问:'+title[0]+'\r\n问题描述: '+miaosu+'\r\n最佳答案:'+good+shijian[0])
        file_tieba.write('答:'+other)
        jie+=1
        # Append the <pre>-block question/answer texts, one write per fragment.
        zonghe=zuiwen+zuida
        for dayin in zonghe:
            file_tieba.write('答:'+dayin)
    print('第'+str(ye)+'行结束!')
file_tieba.close()
#https://yq.aliyun.com/ziliao/125794
# 百度知道爬虫 (Baidu Zhidao crawler — pasted page-footer text, commented out
# so the file remains valid Python)
# 最新推荐文章于 2024-09-27 22:51:18 发布